Diffstat (limited to 'src/rocksdb/db')
-rw-r--r--  src/rocksdb/db/arena_wrapped_db_iter.cc | 160
-rw-r--r--  src/rocksdb/db/arena_wrapped_db_iter.h | 127
-rw-r--r--  src/rocksdb/db/blob/blob_constants.h | 16
-rw-r--r--  src/rocksdb/db/blob/blob_contents.cc | 90
-rw-r--r--  src/rocksdb/db/blob/blob_contents.h | 56
-rw-r--r--  src/rocksdb/db/blob/blob_counting_iterator.h | 146
-rw-r--r--  src/rocksdb/db/blob/blob_counting_iterator_test.cc | 327
-rw-r--r--  src/rocksdb/db/blob/blob_fetcher.cc | 34
-rw-r--r--  src/rocksdb/db/blob/blob_fetcher.h | 37
-rw-r--r--  src/rocksdb/db/blob/blob_file_addition.cc | 156
-rw-r--r--  src/rocksdb/db/blob/blob_file_addition.h | 67
-rw-r--r--  src/rocksdb/db/blob/blob_file_addition_test.cc | 211
-rw-r--r--  src/rocksdb/db/blob/blob_file_builder.cc | 446
-rw-r--r--  src/rocksdb/db/blob/blob_file_builder.h | 112
-rw-r--r--  src/rocksdb/db/blob/blob_file_builder_test.cc | 680
-rw-r--r--  src/rocksdb/db/blob/blob_file_cache.cc | 102
-rw-r--r--  src/rocksdb/db/blob/blob_file_cache.h | 52
-rw-r--r--  src/rocksdb/db/blob/blob_file_cache_test.cc | 269
-rw-r--r--  src/rocksdb/db/blob/blob_file_completion_callback.h | 101
-rw-r--r--  src/rocksdb/db/blob/blob_file_garbage.cc | 134
-rw-r--r--  src/rocksdb/db/blob/blob_file_garbage.h | 57
-rw-r--r--  src/rocksdb/db/blob/blob_file_garbage_test.cc | 174
-rw-r--r--  src/rocksdb/db/blob/blob_file_meta.cc | 62
-rw-r--r--  src/rocksdb/db/blob/blob_file_meta.h | 170
-rw-r--r--  src/rocksdb/db/blob/blob_file_reader.cc | 610
-rw-r--r--  src/rocksdb/db/blob/blob_file_reader.h | 108
-rw-r--r--  src/rocksdb/db/blob/blob_file_reader_test.cc | 1024
-rw-r--r--  src/rocksdb/db/blob/blob_garbage_meter.cc | 100
-rw-r--r--  src/rocksdb/db/blob/blob_garbage_meter.h | 102
-rw-r--r--  src/rocksdb/db/blob/blob_garbage_meter_test.cc | 197
-rw-r--r--  src/rocksdb/db/blob/blob_index.h | 187
-rw-r--r--  src/rocksdb/db/blob/blob_log_format.cc | 143
-rw-r--r--  src/rocksdb/db/blob/blob_log_format.h | 164
-rw-r--r--  src/rocksdb/db/blob/blob_log_sequential_reader.cc | 134
-rw-r--r--  src/rocksdb/db/blob/blob_log_sequential_reader.h | 83
-rw-r--r--  src/rocksdb/db/blob/blob_log_writer.cc | 178
-rw-r--r--  src/rocksdb/db/blob/blob_log_writer.h | 83
-rw-r--r--  src/rocksdb/db/blob/blob_read_request.h | 58
-rw-r--r--  src/rocksdb/db/blob/blob_source.cc | 488
-rw-r--r--  src/rocksdb/db/blob/blob_source.h | 153
-rw-r--r--  src/rocksdb/db/blob/blob_source_test.cc | 1624
-rw-r--r--  src/rocksdb/db/blob/db_blob_basic_test.cc | 1789
-rw-r--r--  src/rocksdb/db/blob/db_blob_compaction_test.cc | 913
-rw-r--r--  src/rocksdb/db/blob/db_blob_corruption_test.cc | 82
-rw-r--r--  src/rocksdb/db/blob/db_blob_index_test.cc | 602
-rw-r--r--  src/rocksdb/db/blob/prefetch_buffer_collection.cc | 21
-rw-r--r--  src/rocksdb/db/blob/prefetch_buffer_collection.h | 38
-rw-r--r--  src/rocksdb/db/builder.cc | 434
-rw-r--r--  src/rocksdb/db/builder.h | 77
-rw-r--r--  src/rocksdb/db/c.cc | 6390
-rw-r--r--  src/rocksdb/db/c_test.c | 3476
-rw-r--r--  src/rocksdb/db/column_family.cc | 1683
-rw-r--r--  src/rocksdb/db/column_family.h | 845
-rw-r--r--  src/rocksdb/db/column_family_test.cc | 3453
-rw-r--r--  src/rocksdb/db/compact_files_test.cc | 502
-rw-r--r--  src/rocksdb/db/compaction/clipping_iterator.h | 276
-rw-r--r--  src/rocksdb/db/compaction/clipping_iterator_test.cc | 259
-rw-r--r--  src/rocksdb/db/compaction/compaction.cc | 855
-rw-r--r--  src/rocksdb/db/compaction/compaction.h | 559
-rw-r--r--  src/rocksdb/db/compaction/compaction_iteration_stats.h | 49
-rw-r--r--  src/rocksdb/db/compaction/compaction_iterator.cc | 1338
-rw-r--r--  src/rocksdb/db/compaction/compaction_iterator.h | 513
-rw-r--r--  src/rocksdb/db/compaction/compaction_iterator_test.cc | 1618
-rw-r--r--  src/rocksdb/db/compaction/compaction_job.cc | 2060
-rw-r--r--  src/rocksdb/db/compaction/compaction_job.h | 500
-rw-r--r--  src/rocksdb/db/compaction/compaction_job_stats_test.cc | 975
-rw-r--r--  src/rocksdb/db/compaction/compaction_job_test.cc | 2451
-rw-r--r--  src/rocksdb/db/compaction/compaction_outputs.cc | 646
-rw-r--r--  src/rocksdb/db/compaction/compaction_outputs.h | 385
-rw-r--r--  src/rocksdb/db/compaction/compaction_picker.cc | 1234
-rw-r--r--  src/rocksdb/db/compaction/compaction_picker.h | 323
-rw-r--r--  src/rocksdb/db/compaction/compaction_picker_fifo.cc | 433
-rw-r--r--  src/rocksdb/db/compaction/compaction_picker_fifo.h | 63
-rw-r--r--  src/rocksdb/db/compaction/compaction_picker_level.cc | 841
-rw-r--r--  src/rocksdb/db/compaction/compaction_picker_level.h | 33
-rw-r--r--  src/rocksdb/db/compaction/compaction_picker_test.cc | 3964
-rw-r--r--  src/rocksdb/db/compaction/compaction_picker_universal.cc | 1450
-rw-r--r--  src/rocksdb/db/compaction/compaction_picker_universal.h | 32
-rw-r--r--  src/rocksdb/db/compaction/compaction_service_job.cc | 829
-rw-r--r--  src/rocksdb/db/compaction/compaction_service_test.cc | 966
-rw-r--r--  src/rocksdb/db/compaction/compaction_state.cc | 46
-rw-r--r--  src/rocksdb/db/compaction/compaction_state.h | 42
-rw-r--r--  src/rocksdb/db/compaction/file_pri.h | 92
-rw-r--r--  src/rocksdb/db/compaction/sst_partitioner.cc | 90
-rw-r--r--  src/rocksdb/db/compaction/subcompaction_state.cc | 106
-rw-r--r--  src/rocksdb/db/compaction/subcompaction_state.h | 214
-rw-r--r--  src/rocksdb/db/compaction/tiered_compaction_test.cc | 2028
-rw-r--r--  src/rocksdb/db/comparator_db_test.cc | 678
-rw-r--r--  src/rocksdb/db/convenience.cc | 81
-rw-r--r--  src/rocksdb/db/corruption_test.cc | 1587
-rw-r--r--  src/rocksdb/db/cuckoo_table_db_test.cc | 361
-rw-r--r--  src/rocksdb/db/db_basic_test.cc | 4643
-rw-r--r--  src/rocksdb/db/db_block_cache_test.cc | 2313
-rw-r--r--  src/rocksdb/db/db_bloom_filter_test.cc | 3498
-rw-r--r--  src/rocksdb/db/db_compaction_filter_test.cc | 1036
-rw-r--r--  src/rocksdb/db/db_compaction_test.cc | 8227
-rw-r--r--  src/rocksdb/db/db_dynamic_level_test.cc | 507
-rw-r--r--  src/rocksdb/db/db_encryption_test.cc | 130
-rw-r--r--  src/rocksdb/db/db_filesnapshot.cc | 442
-rw-r--r--  src/rocksdb/db/db_flush_test.cc | 3084
-rw-r--r--  src/rocksdb/db/db_impl/compacted_db_impl.cc | 257
-rw-r--r--  src/rocksdb/db/db_impl/compacted_db_impl.h | 154
-rw-r--r--  src/rocksdb/db/db_impl/db_impl.cc | 5918
-rw-r--r--  src/rocksdb/db/db_impl/db_impl.h | 2804
-rw-r--r--  src/rocksdb/db/db_impl/db_impl_compaction_flush.cc | 3857
-rw-r--r--  src/rocksdb/db/db_impl/db_impl_debug.cc | 312
-rw-r--r--  src/rocksdb/db/db_impl/db_impl_experimental.cc | 158
-rw-r--r--  src/rocksdb/db/db_impl/db_impl_files.cc | 1013
-rw-r--r--  src/rocksdb/db/db_impl/db_impl_open.cc | 2106
-rw-r--r--  src/rocksdb/db/db_impl/db_impl_readonly.cc | 341
-rw-r--r--  src/rocksdb/db/db_impl/db_impl_readonly.h | 170
-rw-r--r--  src/rocksdb/db/db_impl/db_impl_secondary.cc | 967
-rw-r--r--  src/rocksdb/db/db_impl/db_impl_secondary.h | 410
-rw-r--r--  src/rocksdb/db/db_impl/db_impl_write.cc | 2435
-rw-r--r--  src/rocksdb/db/db_info_dumper.cc | 147
-rw-r--r--  src/rocksdb/db/db_info_dumper.h | 15
-rw-r--r--  src/rocksdb/db/db_inplace_update_test.cc | 262
-rw-r--r--  src/rocksdb/db/db_io_failure_test.cc | 593
-rw-r--r--  src/rocksdb/db/db_iter.cc | 1708
-rw-r--r--  src/rocksdb/db/db_iter.h | 420
-rw-r--r--  src/rocksdb/db/db_iter_stress_test.cc | 658
-rw-r--r--  src/rocksdb/db/db_iter_test.cc | 3195
-rw-r--r--  src/rocksdb/db/db_iterator_test.cc | 3265
-rw-r--r--  src/rocksdb/db/db_kv_checksum_test.cc | 885
-rw-r--r--  src/rocksdb/db/db_log_iter_test.cc | 305
-rw-r--r--  src/rocksdb/db/db_logical_block_size_cache_test.cc | 521
-rw-r--r--  src/rocksdb/db/db_memtable_test.cc | 344
-rw-r--r--  src/rocksdb/db/db_merge_operand_test.cc | 448
-rw-r--r--  src/rocksdb/db/db_merge_operator_test.cc | 669
-rw-r--r--  src/rocksdb/db/db_options_test.cc | 1219
-rw-r--r--  src/rocksdb/db/db_properties_test.cc | 2206
-rw-r--r--  src/rocksdb/db/db_range_del_test.cc | 2807
-rw-r--r--  src/rocksdb/db/db_rate_limiter_test.cc | 451
-rw-r--r--  src/rocksdb/db/db_readonly_with_timestamp_test.cc | 960
-rw-r--r--  src/rocksdb/db/db_secondary_test.cc | 1693
-rw-r--r--  src/rocksdb/db/db_sst_test.cc | 1868
-rw-r--r--  src/rocksdb/db/db_statistics_test.cc | 215
-rw-r--r--  src/rocksdb/db/db_table_properties_test.cc | 625
-rw-r--r--  src/rocksdb/db/db_tailing_iter_test.cc | 604
-rw-r--r--  src/rocksdb/db/db_test.cc | 7397
-rw-r--r--  src/rocksdb/db/db_test2.cc | 7652
-rw-r--r--  src/rocksdb/db/db_test_util.cc | 1773
-rw-r--r--  src/rocksdb/db/db_test_util.h | 1402
-rw-r--r--  src/rocksdb/db/db_universal_compaction_test.cc | 2235
-rw-r--r--  src/rocksdb/db/db_wal_test.cc | 2314
-rw-r--r--  src/rocksdb/db/db_with_timestamp_basic_test.cc | 3880
-rw-r--r--  src/rocksdb/db/db_with_timestamp_compaction_test.cc | 334
-rw-r--r--  src/rocksdb/db/db_with_timestamp_test_util.cc | 96
-rw-r--r--  src/rocksdb/db/db_with_timestamp_test_util.h | 126
-rw-r--r--  src/rocksdb/db/db_write_buffer_manager_test.cc | 862
-rw-r--r--  src/rocksdb/db/db_write_test.cc | 679
-rw-r--r--  src/rocksdb/db/dbformat.cc | 188
-rw-r--r--  src/rocksdb/db/dbformat.h | 865
-rw-r--r--  src/rocksdb/db/dbformat_test.cc | 214
-rw-r--r--  src/rocksdb/db/deletefile_test.cc | 614
-rw-r--r--  src/rocksdb/db/error_handler.cc | 819
-rw-r--r--  src/rocksdb/db/error_handler.h | 124
-rw-r--r--  src/rocksdb/db/error_handler_fs_test.cc | 2875
-rw-r--r--  src/rocksdb/db/event_helpers.cc | 371
-rw-r--r--  src/rocksdb/db/event_helpers.h | 82
-rw-r--r--  src/rocksdb/db/experimental.cc | 155
-rw-r--r--  src/rocksdb/db/external_sst_file_basic_test.cc | 1997
-rw-r--r--  src/rocksdb/db/external_sst_file_ingestion_job.cc | 1020
-rw-r--r--  src/rocksdb/db/external_sst_file_ingestion_job.h | 201
-rw-r--r--  src/rocksdb/db/external_sst_file_test.cc | 2967
-rw-r--r--  src/rocksdb/db/fault_injection_test.cc | 637
-rw-r--r--  src/rocksdb/db/file_indexer.cc | 218
-rw-r--r--  src/rocksdb/db/file_indexer.h | 140
-rw-r--r--  src/rocksdb/db/file_indexer_test.cc | 352
-rw-r--r--  src/rocksdb/db/filename_test.cc | 241
-rw-r--r--  src/rocksdb/db/flush_job.cc | 1094
-rw-r--r--  src/rocksdb/db/flush_job.h | 203
-rw-r--r--  src/rocksdb/db/flush_job_test.cc | 745
-rw-r--r--  src/rocksdb/db/flush_scheduler.cc | 86
-rw-r--r--  src/rocksdb/db/flush_scheduler.h | 55
-rw-r--r--  src/rocksdb/db/forward_iterator.cc | 1062
-rw-r--r--  src/rocksdb/db/forward_iterator.h | 168
-rw-r--r--  src/rocksdb/db/forward_iterator_bench.cc | 378
-rw-r--r--  src/rocksdb/db/history_trimming_iterator.h | 91
-rw-r--r--  src/rocksdb/db/import_column_family_job.cc | 312
-rw-r--r--  src/rocksdb/db/import_column_family_job.h | 82
-rw-r--r--  src/rocksdb/db/import_column_family_test.cc | 644
-rw-r--r--  src/rocksdb/db/internal_stats.cc | 2002
-rw-r--r--  src/rocksdb/db/internal_stats.h | 996
-rw-r--r--  src/rocksdb/db/job_context.h | 238
-rw-r--r--  src/rocksdb/db/kv_checksum.h | 398
-rw-r--r--  src/rocksdb/db/listener_test.cc | 1595
-rw-r--r--  src/rocksdb/db/log_format.h | 51
-rw-r--r--  src/rocksdb/db/log_reader.cc | 854
-rw-r--r--  src/rocksdb/db/log_reader.h | 225
-rw-r--r--  src/rocksdb/db/log_test.cc | 1062
-rw-r--r--  src/rocksdb/db/log_writer.cc | 249
-rw-r--r--  src/rocksdb/db/log_writer.h | 128
-rw-r--r--  src/rocksdb/db/logs_with_prep_tracker.cc | 67
-rw-r--r--  src/rocksdb/db/logs_with_prep_tracker.h | 62
-rw-r--r--  src/rocksdb/db/lookup_key.h | 68
-rw-r--r--  src/rocksdb/db/malloc_stats.cc | 55
-rw-r--r--  src/rocksdb/db/malloc_stats.h | 24
-rw-r--r--  src/rocksdb/db/manual_compaction_test.cc | 308
-rw-r--r--  src/rocksdb/db/memtable.cc | 1675
-rw-r--r--  src/rocksdb/db/memtable.h | 664
-rw-r--r--  src/rocksdb/db/memtable_list.cc | 991
-rw-r--r--  src/rocksdb/db/memtable_list.h | 471
-rw-r--r--  src/rocksdb/db/memtable_list_test.cc | 1039
-rw-r--r--  src/rocksdb/db/merge_context.h | 147
-rw-r--r--  src/rocksdb/db/merge_helper.cc | 583
-rw-r--r--  src/rocksdb/db/merge_helper.h | 216
-rw-r--r--  src/rocksdb/db/merge_helper_test.cc | 298
-rw-r--r--  src/rocksdb/db/merge_operator.cc | 85
-rw-r--r--  src/rocksdb/db/merge_test.cc | 629
-rw-r--r--  src/rocksdb/db/obsolete_files_test.cc | 328
-rw-r--r--  src/rocksdb/db/options_file_test.cc | 120
-rw-r--r--  src/rocksdb/db/output_validator.cc | 33
-rw-r--r--  src/rocksdb/db/output_validator.h | 48
-rw-r--r--  src/rocksdb/db/perf_context_test.cc | 1010
-rw-r--r--  src/rocksdb/db/periodic_task_scheduler.cc | 113
-rw-r--r--  src/rocksdb/db/periodic_task_scheduler.h | 110
-rw-r--r--  src/rocksdb/db/periodic_task_scheduler_test.cc | 231
-rw-r--r--  src/rocksdb/db/pinned_iterators_manager.h | 92
-rw-r--r--  src/rocksdb/db/plain_table_db_test.cc | 1357
-rw-r--r--  src/rocksdb/db/post_memtable_callback.h | 25
-rw-r--r--  src/rocksdb/db/pre_release_callback.h | 37
-rw-r--r--  src/rocksdb/db/prefix_test.cc | 906
-rw-r--r--  src/rocksdb/db/range_del_aggregator.cc | 524
-rw-r--r--  src/rocksdb/db/range_del_aggregator.h | 476
-rw-r--r--  src/rocksdb/db/range_del_aggregator_bench.cc | 280
-rw-r--r--  src/rocksdb/db/range_del_aggregator_test.cc | 715
-rw-r--r--  src/rocksdb/db/range_tombstone_fragmenter.cc | 502
-rw-r--r--  src/rocksdb/db/range_tombstone_fragmenter.h | 357
-rw-r--r--  src/rocksdb/db/range_tombstone_fragmenter_test.cc | 555
-rw-r--r--  src/rocksdb/db/read_callback.h | 54
-rw-r--r--  src/rocksdb/db/repair.cc | 771
-rw-r--r--  src/rocksdb/db/repair_test.cc | 442
-rw-r--r--  src/rocksdb/db/seqno_time_test.cc | 996
-rw-r--r--  src/rocksdb/db/seqno_to_time_mapping.cc | 341
-rw-r--r--  src/rocksdb/db/seqno_to_time_mapping.h | 189
-rw-r--r--  src/rocksdb/db/snapshot_checker.h | 60
-rw-r--r--  src/rocksdb/db/snapshot_impl.cc | 25
-rw-r--r--  src/rocksdb/db/snapshot_impl.h | 239
-rw-r--r--  src/rocksdb/db/table_cache.cc | 753
-rw-r--r--  src/rocksdb/db/table_cache.h | 275
-rw-r--r--  src/rocksdb/db/table_cache_sync_and_async.h | 135
-rw-r--r--  src/rocksdb/db/table_properties_collector.cc | 74
-rw-r--r--  src/rocksdb/db/table_properties_collector.h | 175
-rw-r--r--  src/rocksdb/db/table_properties_collector_test.cc | 513
-rw-r--r--  src/rocksdb/db/transaction_log_impl.cc | 298
-rw-r--r--  src/rocksdb/db/transaction_log_impl.h | 130
-rw-r--r--  src/rocksdb/db/trim_history_scheduler.cc | 54
-rw-r--r--  src/rocksdb/db/trim_history_scheduler.h | 46
-rw-r--r--  src/rocksdb/db/version_builder.cc | 1372
-rw-r--r--  src/rocksdb/db/version_builder.h | 72
-rw-r--r--  src/rocksdb/db/version_builder_test.cc | 1695
-rw-r--r--  src/rocksdb/db/version_edit.cc | 1043
-rw-r--r--  src/rocksdb/db/version_edit.h | 669
-rw-r--r--  src/rocksdb/db/version_edit_handler.cc | 1002
-rw-r--r--  src/rocksdb/db/version_edit_handler.h | 313
-rw-r--r--  src/rocksdb/db/version_edit_test.cc | 730
-rw-r--r--  src/rocksdb/db/version_set.cc | 6903
-rw-r--r--  src/rocksdb/db/version_set.h | 1652
-rw-r--r--  src/rocksdb/db/version_set_sync_and_async.h | 151
-rw-r--r--  src/rocksdb/db/version_set_test.cc | 3587
-rw-r--r--  src/rocksdb/db/version_util.h | 71
-rw-r--r--  src/rocksdb/db/wal_edit.cc | 211
-rw-r--r--  src/rocksdb/db/wal_edit.h | 177
-rw-r--r--  src/rocksdb/db/wal_edit_test.cc | 213
-rw-r--r--  src/rocksdb/db/wal_manager.cc | 529
-rw-r--r--  src/rocksdb/db/wal_manager.h | 138
-rw-r--r--  src/rocksdb/db/wal_manager_test.cc | 346
-rw-r--r--  src/rocksdb/db/wide/db_wide_basic_test.cc | 654
-rw-r--r--  src/rocksdb/db/wide/wide_column_serialization.cc | 182
-rw-r--r--  src/rocksdb/db/wide/wide_column_serialization.h | 77
-rw-r--r--  src/rocksdb/db/wide/wide_column_serialization_test.cc | 338
-rw-r--r--  src/rocksdb/db/wide/wide_columns.cc | 22
-rw-r--r--  src/rocksdb/db/write_batch.cc | 3137
-rw-r--r--  src/rocksdb/db/write_batch_base.cc | 94
-rw-r--r--  src/rocksdb/db/write_batch_internal.h | 401
-rw-r--r--  src/rocksdb/db/write_batch_test.cc | 1114
-rw-r--r--  src/rocksdb/db/write_callback.h | 27
-rw-r--r--  src/rocksdb/db/write_callback_test.cc | 465
-rw-r--r--  src/rocksdb/db/write_controller.cc | 121
-rw-r--r--  src/rocksdb/db/write_controller.h | 148
-rw-r--r--  src/rocksdb/db/write_controller_test.cc | 248
-rw-r--r--  src/rocksdb/db/write_thread.cc | 815
-rw-r--r--  src/rocksdb/db/write_thread.h | 440
284 files changed, 233845 insertions, 0 deletions
diff --git a/src/rocksdb/db/arena_wrapped_db_iter.cc b/src/rocksdb/db/arena_wrapped_db_iter.cc
new file mode 100644
index 000000000..607403ccc
--- /dev/null
+++ b/src/rocksdb/db/arena_wrapped_db_iter.cc
@@ -0,0 +1,160 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/arena_wrapped_db_iter.h"
+
+#include "memory/arena.h"
+#include "rocksdb/env.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/options.h"
+#include "table/internal_iterator.h"
+#include "table/iterator_wrapper.h"
+#include "util/user_comparator_wrapper.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+Status ArenaWrappedDBIter::GetProperty(std::string prop_name,
+ std::string* prop) {
+ if (prop_name == "rocksdb.iterator.super-version-number") {
+ // First try to pass the value returned from inner iterator.
+ if (!db_iter_->GetProperty(prop_name, prop).ok()) {
+ *prop = std::to_string(sv_number_);
+ }
+ return Status::OK();
+ }
+ return db_iter_->GetProperty(prop_name, prop);
+}
+
+void ArenaWrappedDBIter::Init(
+ Env* env, const ReadOptions& read_options, const ImmutableOptions& ioptions,
+ const MutableCFOptions& mutable_cf_options, const Version* version,
+ const SequenceNumber& sequence, uint64_t max_sequential_skip_in_iteration,
+ uint64_t version_number, ReadCallback* read_callback, DBImpl* db_impl,
+ ColumnFamilyData* cfd, bool expose_blob_index, bool allow_refresh) {
+ auto mem = arena_.AllocateAligned(sizeof(DBIter));
+ db_iter_ =
+ new (mem) DBIter(env, read_options, ioptions, mutable_cf_options,
+ ioptions.user_comparator, /* iter */ nullptr, version,
+ sequence, true, max_sequential_skip_in_iteration,
+ read_callback, db_impl, cfd, expose_blob_index);
+ sv_number_ = version_number;
+ read_options_ = read_options;
+ allow_refresh_ = allow_refresh;
+ memtable_range_tombstone_iter_ = nullptr;
+}
+
+Status ArenaWrappedDBIter::Refresh() {
+ if (cfd_ == nullptr || db_impl_ == nullptr || !allow_refresh_) {
+ return Status::NotSupported("Creating renew iterator is not allowed.");
+ }
+ assert(db_iter_ != nullptr);
+ // TODO(yiwu): For last_seq_same_as_publish_seq_==false, this is not the
+ // correct behavior. Will be corrected automatically when we take a snapshot
+ // here for the case of WritePreparedTxnDB.
+ uint64_t cur_sv_number = cfd_->GetSuperVersionNumber();
+ TEST_SYNC_POINT("ArenaWrappedDBIter::Refresh:1");
+ TEST_SYNC_POINT("ArenaWrappedDBIter::Refresh:2");
+ auto reinit_internal_iter = [&]() {
+ Env* env = db_iter_->env();
+ db_iter_->~DBIter();
+ arena_.~Arena();
+ new (&arena_) Arena();
+
+ SuperVersion* sv = cfd_->GetReferencedSuperVersion(db_impl_);
+ SequenceNumber latest_seq = db_impl_->GetLatestSequenceNumber();
+ if (read_callback_) {
+ read_callback_->Refresh(latest_seq);
+ }
+ Init(env, read_options_, *(cfd_->ioptions()), sv->mutable_cf_options,
+ sv->current, latest_seq,
+ sv->mutable_cf_options.max_sequential_skip_in_iterations,
+ cur_sv_number, read_callback_, db_impl_, cfd_, expose_blob_index_,
+ allow_refresh_);
+
+ InternalIterator* internal_iter = db_impl_->NewInternalIterator(
+ read_options_, cfd_, sv, &arena_, latest_seq,
+ /* allow_unprepared_value */ true, /* db_iter */ this);
+ SetIterUnderDBIter(internal_iter);
+ };
+ while (true) {
+ if (sv_number_ != cur_sv_number) {
+ reinit_internal_iter();
+ break;
+ } else {
+ SequenceNumber latest_seq = db_impl_->GetLatestSequenceNumber();
+ // Refresh range-tombstones in MemTable
+ if (!read_options_.ignore_range_deletions) {
+ SuperVersion* sv = cfd_->GetThreadLocalSuperVersion(db_impl_);
+ TEST_SYNC_POINT_CALLBACK("ArenaWrappedDBIter::Refresh:SV", nullptr);
+ auto t = sv->mem->NewRangeTombstoneIterator(
+ read_options_, latest_seq, false /* immutable_memtable */);
+ if (!t || t->empty()) {
+ // If memtable_range_tombstone_iter_ points to a non-empty tombstone
+ // iterator, then it means sv->mem is not the memtable that
+ // memtable_range_tombstone_iter_ points to, so SV must have changed
+ // after the sv_number_ != cur_sv_number check above. We will fall
+ // back to re-init the InternalIterator, and the tombstone iterator
+ // will be freed during db_iter destruction there.
+ if (memtable_range_tombstone_iter_) {
+ assert(!*memtable_range_tombstone_iter_ ||
+ sv_number_ != cfd_->GetSuperVersionNumber());
+ }
+ delete t;
+ } else { // current mutable memtable has range tombstones
+ if (!memtable_range_tombstone_iter_) {
+ delete t;
+ db_impl_->ReturnAndCleanupSuperVersion(cfd_, sv);
+ // The memtable under DBIter did not have range tombstone before
+ // refresh.
+ reinit_internal_iter();
+ break;
+ } else {
+ delete *memtable_range_tombstone_iter_;
+ *memtable_range_tombstone_iter_ = new TruncatedRangeDelIterator(
+ std::unique_ptr<FragmentedRangeTombstoneIterator>(t),
+ &cfd_->internal_comparator(), nullptr, nullptr);
+ }
+ }
+ db_impl_->ReturnAndCleanupSuperVersion(cfd_, sv);
+ }
+ // Refresh latest sequence number
+ db_iter_->set_sequence(latest_seq);
+ db_iter_->set_valid(false);
+ // Check again if the latest super version number is changed
+ uint64_t latest_sv_number = cfd_->GetSuperVersionNumber();
+ if (latest_sv_number != cur_sv_number) {
+ // If the super version number is changed after refreshing,
+ // fallback to Re-Init the InternalIterator
+ cur_sv_number = latest_sv_number;
+ continue;
+ }
+ break;
+ }
+ }
+ return Status::OK();
+}
+
+ArenaWrappedDBIter* NewArenaWrappedDbIterator(
+ Env* env, const ReadOptions& read_options, const ImmutableOptions& ioptions,
+ const MutableCFOptions& mutable_cf_options, const Version* version,
+ const SequenceNumber& sequence, uint64_t max_sequential_skip_in_iterations,
+ uint64_t version_number, ReadCallback* read_callback, DBImpl* db_impl,
+ ColumnFamilyData* cfd, bool expose_blob_index, bool allow_refresh) {
+ ArenaWrappedDBIter* iter = new ArenaWrappedDBIter();
+ iter->Init(env, read_options, ioptions, mutable_cf_options, version, sequence,
+ max_sequential_skip_in_iterations, version_number, read_callback,
+ db_impl, cfd, expose_blob_index, allow_refresh);
+ if (db_impl != nullptr && cfd != nullptr && allow_refresh) {
+ iter->StoreRefreshInfo(db_impl, cfd, read_callback, expose_blob_index);
+ }
+
+ return iter;
+}
+
+} // namespace ROCKSDB_NAMESPACE
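The Refresh() implementation above is what backs the public rocksdb::Iterator::Refresh() API: if the column family's super version changed, the whole internal iterator tree is rebuilt in a fresh arena; otherwise only the sequence number and the memtable range-tombstone iterator are updated. Below is a minimal user-level sketch of that behavior using only public RocksDB headers; the scratch path is an assumption and the example is not part of this patch.

#include <cassert>
#include <iostream>
#include <string>

#include "rocksdb/db.h"
#include "rocksdb/options.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;

  rocksdb::DB* db = nullptr;
  rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/refresh_example", &db);
  assert(s.ok());

  s = db->Put(rocksdb::WriteOptions(), "k1", "v1");
  assert(s.ok());

  // The iterator is pinned to the DB state at creation time.
  rocksdb::Iterator* it = db->NewIterator(rocksdb::ReadOptions());

  s = db->Put(rocksdb::WriteOptions(), "k2", "v2");
  assert(s.ok());

  // Refresh() advances the iterator to the latest state. Internally this is
  // ArenaWrappedDBIter::Refresh(): it either bumps the sequence number or
  // rebuilds the internal iterator tree if the super version has changed.
  s = it->Refresh();
  assert(s.ok());

  std::string sv_number;
  if (it->GetProperty("rocksdb.iterator.super-version-number", &sv_number).ok()) {
    std::cout << "super version: " << sv_number << "\n";
  }

  // After Refresh() the iterator must be repositioned before use.
  for (it->SeekToFirst(); it->Valid(); it->Next()) {
    std::cout << it->key().ToString() << " -> " << it->value().ToString() << "\n";
  }
  assert(it->status().ok());

  delete it;
  delete db;
  return 0;
}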
diff --git a/src/rocksdb/db/arena_wrapped_db_iter.h b/src/rocksdb/db/arena_wrapped_db_iter.h
new file mode 100644
index 000000000..f15be306d
--- /dev/null
+++ b/src/rocksdb/db/arena_wrapped_db_iter.h
@@ -0,0 +1,127 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <stdint.h>
+
+#include <string>
+
+#include "db/db_impl/db_impl.h"
+#include "db/db_iter.h"
+#include "db/range_del_aggregator.h"
+#include "memory/arena.h"
+#include "options/cf_options.h"
+#include "rocksdb/db.h"
+#include "rocksdb/iterator.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Arena;
+class Version;
+
+// A wrapper iterator that bundles a DB iterator with the arena from which the
+// DB iterator is allocated. This class is used as the entry point of an
+// iterator hierarchy whose memory can be allocated inline. In that way,
+// accessing the iterator tree can be more cache friendly. It is also faster
+// to allocate.
+// When using the class's Iterator interface, the behavior is exactly
+// the same as the inner DBIter.
+class ArenaWrappedDBIter : public Iterator {
+ public:
+ ~ArenaWrappedDBIter() override {
+ if (db_iter_ != nullptr) {
+ db_iter_->~DBIter();
+ } else {
+ assert(false);
+ }
+ }
+
+ // Get the arena to be used to allocate memory for DBIter to be wrapped,
+ // as well as child iterators in it.
+ virtual Arena* GetArena() { return &arena_; }
+
+ const ReadOptions& GetReadOptions() { return read_options_; }
+
+ // Set the internal iterator wrapped inside the DB Iterator. Usually it is
+ // a merging iterator.
+ virtual void SetIterUnderDBIter(InternalIterator* iter) {
+ db_iter_->SetIter(iter);
+ }
+
+ void SetMemtableRangetombstoneIter(TruncatedRangeDelIterator** iter) {
+ memtable_range_tombstone_iter_ = iter;
+ }
+
+ bool Valid() const override { return db_iter_->Valid(); }
+ void SeekToFirst() override { db_iter_->SeekToFirst(); }
+ void SeekToLast() override { db_iter_->SeekToLast(); }
+  // 'target' does not contain a timestamp, even if the user timestamp feature
+  // is enabled.
+ void Seek(const Slice& target) override { db_iter_->Seek(target); }
+ void SeekForPrev(const Slice& target) override {
+ db_iter_->SeekForPrev(target);
+ }
+ void Next() override { db_iter_->Next(); }
+ void Prev() override { db_iter_->Prev(); }
+ Slice key() const override { return db_iter_->key(); }
+ Slice value() const override { return db_iter_->value(); }
+ const WideColumns& columns() const override { return db_iter_->columns(); }
+ Status status() const override { return db_iter_->status(); }
+ Slice timestamp() const override { return db_iter_->timestamp(); }
+ bool IsBlob() const { return db_iter_->IsBlob(); }
+
+ Status GetProperty(std::string prop_name, std::string* prop) override;
+
+ Status Refresh() override;
+
+ void Init(Env* env, const ReadOptions& read_options,
+ const ImmutableOptions& ioptions,
+ const MutableCFOptions& mutable_cf_options, const Version* version,
+ const SequenceNumber& sequence,
+ uint64_t max_sequential_skip_in_iterations, uint64_t version_number,
+ ReadCallback* read_callback, DBImpl* db_impl, ColumnFamilyData* cfd,
+ bool expose_blob_index, bool allow_refresh);
+
+ // Store some parameters so we can refresh the iterator at a later point
+ // with these same params
+ void StoreRefreshInfo(DBImpl* db_impl, ColumnFamilyData* cfd,
+ ReadCallback* read_callback, bool expose_blob_index) {
+ db_impl_ = db_impl;
+ cfd_ = cfd;
+ read_callback_ = read_callback;
+ expose_blob_index_ = expose_blob_index;
+ }
+
+ private:
+ DBIter* db_iter_ = nullptr;
+ Arena arena_;
+ uint64_t sv_number_;
+ ColumnFamilyData* cfd_ = nullptr;
+ DBImpl* db_impl_ = nullptr;
+ ReadOptions read_options_;
+ ReadCallback* read_callback_;
+ bool expose_blob_index_ = false;
+ bool allow_refresh_ = true;
+  // If this is nullptr, it means the mutable memtable did not contain any
+  // range tombstones when it was added under this DBIter.
+ TruncatedRangeDelIterator** memtable_range_tombstone_iter_ = nullptr;
+};
+
+// Creates an arena-wrapped DB iterator.
+// `db_impl` and `cfd` are used for renewal. If left null, renewal will not
+// be supported.
+extern ArenaWrappedDBIter* NewArenaWrappedDbIterator(
+ Env* env, const ReadOptions& read_options, const ImmutableOptions& ioptions,
+ const MutableCFOptions& mutable_cf_options, const Version* version,
+ const SequenceNumber& sequence, uint64_t max_sequential_skip_in_iterations,
+ uint64_t version_number, ReadCallback* read_callback,
+ DBImpl* db_impl = nullptr, ColumnFamilyData* cfd = nullptr,
+ bool expose_blob_index = false, bool allow_refresh = true);
+} // namespace ROCKSDB_NAMESPACE
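The header's comment motivates the arena: DBIter and its child iterators are placement-new'ed into one contiguous arena so the whole iterator tree sits close together in memory and is cheap to allocate. The toy sketch below illustrates only that allocation pattern; FixedArena and ToyIterator are hypothetical names, not RocksDB's Arena (memory/arena.h) or DBIter.

#include <cstddef>
#include <iostream>
#include <new>

// Toy bump allocator; RocksDB's real Arena adds block lists, huge pages, etc.
class FixedArena {
 public:
  explicit FixedArena(size_t capacity)
      : buf_(new char[capacity]), capacity_(capacity), used_(0) {}
  ~FixedArena() { delete[] buf_; }

  void* AllocateAligned(size_t bytes,
                        size_t alignment = alignof(std::max_align_t)) {
    size_t offset = (used_ + alignment - 1) & ~(alignment - 1);
    if (offset + bytes > capacity_) return nullptr;
    used_ = offset + bytes;
    return buf_ + offset;
  }

 private:
  char* buf_;
  size_t capacity_;
  size_t used_;
};

struct ToyIterator {
  explicit ToyIterator(int id) : id_(id) {}
  ~ToyIterator() { std::cout << "destroying iterator " << id_ << "\n"; }
  int id_;
};

int main() {
  FixedArena arena(1024);

  // Placement-new the "iterator" into the arena, mirroring
  // ArenaWrappedDBIter::Init(), which does `new (mem) DBIter(...)`.
  void* mem = arena.AllocateAligned(sizeof(ToyIterator));
  ToyIterator* it = new (mem) ToyIterator(1);

  std::cout << "iterator " << it->id_ << " lives inside the arena\n";

  // Arena-allocated objects are destroyed explicitly; the arena releases the
  // underlying memory in one shot, just like ~ArenaWrappedDBIter().
  it->~ToyIterator();
  return 0;
}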
diff --git a/src/rocksdb/db/blob/blob_constants.h b/src/rocksdb/db/blob/blob_constants.h
new file mode 100644
index 000000000..a5d09ac76
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_constants.h
@@ -0,0 +1,16 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <cstdint>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+constexpr uint64_t kInvalidBlobFileNumber = 0;
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_contents.cc b/src/rocksdb/db/blob/blob_contents.cc
new file mode 100644
index 000000000..9015609e7
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_contents.cc
@@ -0,0 +1,90 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/blob/blob_contents.h"
+
+#include <cassert>
+
+#include "cache/cache_entry_roles.h"
+#include "cache/cache_helpers.h"
+#include "port/malloc.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+std::unique_ptr<BlobContents> BlobContents::Create(
+ CacheAllocationPtr&& allocation, size_t size) {
+ return std::unique_ptr<BlobContents>(
+ new BlobContents(std::move(allocation), size));
+}
+
+size_t BlobContents::ApproximateMemoryUsage() const {
+ size_t usage = 0;
+
+ if (allocation_) {
+ MemoryAllocator* const allocator = allocation_.get_deleter().allocator;
+
+ if (allocator) {
+ usage += allocator->UsableSize(allocation_.get(), data_.size());
+ } else {
+#ifdef ROCKSDB_MALLOC_USABLE_SIZE
+ usage += malloc_usable_size(allocation_.get());
+#else
+ usage += data_.size();
+#endif
+ }
+ }
+
+#ifdef ROCKSDB_MALLOC_USABLE_SIZE
+ usage += malloc_usable_size(const_cast<BlobContents*>(this));
+#else
+ usage += sizeof(*this);
+#endif
+
+ return usage;
+}
+
+size_t BlobContents::SizeCallback(void* obj) {
+ assert(obj);
+
+ return static_cast<const BlobContents*>(obj)->size();
+}
+
+Status BlobContents::SaveToCallback(void* from_obj, size_t from_offset,
+ size_t length, void* out) {
+ assert(from_obj);
+
+ const BlobContents* buf = static_cast<const BlobContents*>(from_obj);
+ assert(buf->size() >= from_offset + length);
+
+ memcpy(out, buf->data().data() + from_offset, length);
+
+ return Status::OK();
+}
+
+Cache::CacheItemHelper* BlobContents::GetCacheItemHelper() {
+ static Cache::CacheItemHelper cache_helper(
+ &SizeCallback, &SaveToCallback,
+ GetCacheEntryDeleterForRole<BlobContents, CacheEntryRole::kBlobValue>());
+
+ return &cache_helper;
+}
+
+Status BlobContents::CreateCallback(CacheAllocationPtr&& allocation,
+ const void* buf, size_t size,
+ void** out_obj, size_t* charge) {
+ assert(allocation);
+
+ memcpy(allocation.get(), buf, size);
+
+ std::unique_ptr<BlobContents> obj = Create(std::move(allocation), size);
+ BlobContents* const contents = obj.release();
+
+ *out_obj = contents;
+ *charge = contents->ApproximateMemoryUsage();
+
+ return Status::OK();
+}
+
+} // namespace ROCKSDB_NAMESPACE
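SizeCallback, SaveToCallback and CreateCallback implement the contract the (secondary) block cache uses to move a BlobContents to and from a flat buffer: report the byte size, copy a sub-range out, and rebuild an object from a buffer while reporting its memory charge. The sketch below is a standalone illustration of that round trip with std::string standing in for the cache allocation; it does not use the RocksDB cache API.

#include <cassert>
#include <cstring>
#include <iostream>
#include <memory>
#include <string>

// Stand-in for BlobContents: owns a flat buffer of blob bytes.
struct Contents {
  std::string data;
};

// SizeCallback analog: how many bytes need to be persisted.
size_t SizeOf(const void* obj) {
  return static_cast<const Contents*>(obj)->data.size();
}

// SaveToCallback analog: copy [from_offset, from_offset + length) into `out`.
void SaveTo(const void* from_obj, size_t from_offset, size_t length, void* out) {
  const auto* c = static_cast<const Contents*>(from_obj);
  assert(c->data.size() >= from_offset + length);
  std::memcpy(out, c->data.data() + from_offset, length);
}

// CreateCallback analog: rebuild an object from a flat buffer and report its
// charge (here simply the payload size).
std::unique_ptr<Contents> CreateFrom(const void* buf, size_t size,
                                     size_t* charge) {
  auto c = std::unique_ptr<Contents>(new Contents);
  c->data.assign(static_cast<const char*>(buf), size);
  *charge = c->data.size();
  return c;
}

int main() {
  Contents original{"some blob payload"};

  // "Save" the object into a flat buffer, as the secondary cache would.
  std::string flat(SizeOf(&original), '\0');
  SaveTo(&original, 0, flat.size(), &flat[0]);

  // "Create" a fresh object back from the buffer on a cache hit.
  size_t charge = 0;
  std::unique_ptr<Contents> restored = CreateFrom(flat.data(), flat.size(), &charge);

  assert(restored->data == original.data);
  std::cout << "round-tripped " << charge << " bytes\n";
  return 0;
}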
diff --git a/src/rocksdb/db/blob/blob_contents.h b/src/rocksdb/db/blob/blob_contents.h
new file mode 100644
index 000000000..9b7c5b969
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_contents.h
@@ -0,0 +1,56 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <memory>
+
+#include "memory/memory_allocator.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/rocksdb_namespace.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// A class representing a single uncompressed value read from a blob file.
+class BlobContents {
+ public:
+ static std::unique_ptr<BlobContents> Create(CacheAllocationPtr&& allocation,
+ size_t size);
+
+ BlobContents(const BlobContents&) = delete;
+ BlobContents& operator=(const BlobContents&) = delete;
+
+ BlobContents(BlobContents&&) = default;
+ BlobContents& operator=(BlobContents&&) = default;
+
+ ~BlobContents() = default;
+
+ const Slice& data() const { return data_; }
+ size_t size() const { return data_.size(); }
+
+ size_t ApproximateMemoryUsage() const;
+
+ // Callbacks for secondary cache
+ static size_t SizeCallback(void* obj);
+
+ static Status SaveToCallback(void* from_obj, size_t from_offset,
+ size_t length, void* out);
+
+ static Cache::CacheItemHelper* GetCacheItemHelper();
+
+ static Status CreateCallback(CacheAllocationPtr&& allocation, const void* buf,
+ size_t size, void** out_obj, size_t* charge);
+
+ private:
+ BlobContents(CacheAllocationPtr&& allocation, size_t size)
+ : allocation_(std::move(allocation)), data_(allocation_.get(), size) {}
+
+ CacheAllocationPtr allocation_;
+ Slice data_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_counting_iterator.h b/src/rocksdb/db/blob/blob_counting_iterator.h
new file mode 100644
index 000000000..de549afa2
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_counting_iterator.h
@@ -0,0 +1,146 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <cassert>
+
+#include "db/blob/blob_garbage_meter.h"
+#include "rocksdb/rocksdb_namespace.h"
+#include "rocksdb/status.h"
+#include "table/internal_iterator.h"
+#include "test_util/sync_point.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// An internal iterator that passes each key-value encountered to
+// BlobGarbageMeter as inflow in order to measure the total number and size of
+// blobs in the compaction input on a per-blob file basis.
+class BlobCountingIterator : public InternalIterator {
+ public:
+ BlobCountingIterator(InternalIterator* iter,
+ BlobGarbageMeter* blob_garbage_meter)
+ : iter_(iter), blob_garbage_meter_(blob_garbage_meter) {
+ assert(iter_);
+ assert(blob_garbage_meter_);
+
+ UpdateAndCountBlobIfNeeded();
+ }
+
+ bool Valid() const override { return iter_->Valid() && status_.ok(); }
+
+ void SeekToFirst() override {
+ iter_->SeekToFirst();
+ UpdateAndCountBlobIfNeeded();
+ }
+
+ void SeekToLast() override {
+ iter_->SeekToLast();
+ UpdateAndCountBlobIfNeeded();
+ }
+
+ void Seek(const Slice& target) override {
+ iter_->Seek(target);
+ UpdateAndCountBlobIfNeeded();
+ }
+
+ void SeekForPrev(const Slice& target) override {
+ iter_->SeekForPrev(target);
+ UpdateAndCountBlobIfNeeded();
+ }
+
+ void Next() override {
+ assert(Valid());
+
+ iter_->Next();
+ UpdateAndCountBlobIfNeeded();
+ }
+
+ bool NextAndGetResult(IterateResult* result) override {
+ assert(Valid());
+
+ const bool res = iter_->NextAndGetResult(result);
+ UpdateAndCountBlobIfNeeded();
+ return res;
+ }
+
+ void Prev() override {
+ assert(Valid());
+
+ iter_->Prev();
+ UpdateAndCountBlobIfNeeded();
+ }
+
+ Slice key() const override {
+ assert(Valid());
+ return iter_->key();
+ }
+
+ Slice user_key() const override {
+ assert(Valid());
+ return iter_->user_key();
+ }
+
+ Slice value() const override {
+ assert(Valid());
+ return iter_->value();
+ }
+
+ Status status() const override { return status_; }
+
+ bool PrepareValue() override {
+ assert(Valid());
+ return iter_->PrepareValue();
+ }
+
+ bool MayBeOutOfLowerBound() override {
+ assert(Valid());
+ return iter_->MayBeOutOfLowerBound();
+ }
+
+ IterBoundCheck UpperBoundCheckResult() override {
+ assert(Valid());
+ return iter_->UpperBoundCheckResult();
+ }
+
+ void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override {
+ iter_->SetPinnedItersMgr(pinned_iters_mgr);
+ }
+
+ bool IsKeyPinned() const override {
+ assert(Valid());
+ return iter_->IsKeyPinned();
+ }
+
+ bool IsValuePinned() const override {
+ assert(Valid());
+ return iter_->IsValuePinned();
+ }
+
+ Status GetProperty(std::string prop_name, std::string* prop) override {
+ return iter_->GetProperty(prop_name, prop);
+ }
+
+ private:
+ void UpdateAndCountBlobIfNeeded() {
+ assert(!iter_->Valid() || iter_->status().ok());
+
+ if (!iter_->Valid()) {
+ status_ = iter_->status();
+ return;
+ }
+
+ TEST_SYNC_POINT(
+ "BlobCountingIterator::UpdateAndCountBlobIfNeeded:ProcessInFlow");
+
+ status_ = blob_garbage_meter_->ProcessInFlow(key(), value());
+ }
+
+ InternalIterator* iter_;
+ BlobGarbageMeter* blob_garbage_meter_;
+ Status status_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
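BlobCountingIterator is a pure forwarding wrapper: every call goes straight to the underlying InternalIterator, and every positioning call is followed by UpdateAndCountBlobIfNeeded(), which feeds the current key/value into the garbage meter. The unit test that follows exercises the real class end to end; the sketch below only distills the wrapper-plus-hook shape with hypothetical names, and unlike BlobGarbageMeter::ProcessInFlow it meters every entry rather than just blob references.

#include <cstddef>
#include <iostream>
#include <string>
#include <vector>

// Minimal iterator interface, standing in for InternalIterator.
class SimpleIterator {
 public:
  virtual ~SimpleIterator() = default;
  virtual void SeekToFirst() = 0;
  virtual void Next() = 0;
  virtual bool Valid() const = 0;
  virtual const std::string& value() const = 0;
};

class VectorIter : public SimpleIterator {
 public:
  explicit VectorIter(std::vector<std::string> v) : v_(std::move(v)) {}
  void SeekToFirst() override { i_ = 0; }
  void Next() override { ++i_; }
  bool Valid() const override { return i_ < v_.size(); }
  const std::string& value() const override { return v_[i_]; }

 private:
  std::vector<std::string> v_;
  size_t i_ = 0;
};

// Decorator: forwards every call and meters each value it passes over.
class CountingIter : public SimpleIterator {
 public:
  explicit CountingIter(SimpleIterator* base) : base_(base) {}
  void SeekToFirst() override { base_->SeekToFirst(); Count(); }
  void Next() override { base_->Next(); Count(); }
  bool Valid() const override { return base_->Valid(); }
  const std::string& value() const override { return base_->value(); }

  size_t count() const { return count_; }
  size_t bytes() const { return bytes_; }

 private:
  void Count() {
    if (!base_->Valid()) return;
    ++count_;
    bytes_ += base_->value().size();
  }

  SimpleIterator* base_;
  size_t count_ = 0;
  size_t bytes_ = 0;
};

int main() {
  VectorIter input({"blob-a", "blob-bb", "plain"});
  CountingIter counter(&input);
  for (counter.SeekToFirst(); counter.Valid(); counter.Next()) {
  }
  std::cout << counter.count() << " entries, " << counter.bytes() << " bytes\n";
  return 0;
}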
diff --git a/src/rocksdb/db/blob/blob_counting_iterator_test.cc b/src/rocksdb/db/blob/blob_counting_iterator_test.cc
new file mode 100644
index 000000000..c7bbc8f58
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_counting_iterator_test.cc
@@ -0,0 +1,327 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/blob/blob_counting_iterator.h"
+
+#include <string>
+#include <vector>
+
+#include "db/blob/blob_garbage_meter.h"
+#include "db/blob/blob_index.h"
+#include "db/blob/blob_log_format.h"
+#include "db/dbformat.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/vector_iterator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+void CheckInFlow(const BlobGarbageMeter& blob_garbage_meter,
+ uint64_t blob_file_number, uint64_t count, uint64_t bytes) {
+ const auto& flows = blob_garbage_meter.flows();
+
+ const auto it = flows.find(blob_file_number);
+ if (it == flows.end()) {
+ ASSERT_EQ(count, 0);
+ ASSERT_EQ(bytes, 0);
+ return;
+ }
+
+ const auto& in = it->second.GetInFlow();
+
+ ASSERT_EQ(in.GetCount(), count);
+ ASSERT_EQ(in.GetBytes(), bytes);
+}
+
+TEST(BlobCountingIteratorTest, CountBlobs) {
+ // Note: the input consists of three key-values: two are blob references to
+ // different blob files, while the third one is a plain value.
+ constexpr char user_key0[] = "key0";
+ constexpr char user_key1[] = "key1";
+ constexpr char user_key2[] = "key2";
+
+ const std::vector<std::string> keys{
+ test::KeyStr(user_key0, 1, kTypeBlobIndex),
+ test::KeyStr(user_key1, 2, kTypeBlobIndex),
+ test::KeyStr(user_key2, 3, kTypeValue)};
+
+ constexpr uint64_t first_blob_file_number = 4;
+ constexpr uint64_t first_offset = 1000;
+ constexpr uint64_t first_size = 2000;
+
+ std::string first_blob_index;
+ BlobIndex::EncodeBlob(&first_blob_index, first_blob_file_number, first_offset,
+ first_size, kNoCompression);
+
+ constexpr uint64_t second_blob_file_number = 6;
+ constexpr uint64_t second_offset = 2000;
+ constexpr uint64_t second_size = 4000;
+
+ std::string second_blob_index;
+ BlobIndex::EncodeBlob(&second_blob_index, second_blob_file_number,
+ second_offset, second_size, kNoCompression);
+
+ const std::vector<std::string> values{first_blob_index, second_blob_index,
+ "raw_value"};
+
+ assert(keys.size() == values.size());
+
+ VectorIterator input(keys, values);
+ BlobGarbageMeter blob_garbage_meter;
+
+ BlobCountingIterator blob_counter(&input, &blob_garbage_meter);
+
+ constexpr uint64_t first_expected_bytes =
+ first_size +
+ BlobLogRecord::CalculateAdjustmentForRecordHeader(sizeof(user_key0) - 1);
+ constexpr uint64_t second_expected_bytes =
+ second_size +
+ BlobLogRecord::CalculateAdjustmentForRecordHeader(sizeof(user_key1) - 1);
+
+ // Call SeekToFirst and iterate forward
+ blob_counter.SeekToFirst();
+ ASSERT_TRUE(blob_counter.Valid());
+ ASSERT_OK(blob_counter.status());
+ ASSERT_EQ(blob_counter.key(), keys[0]);
+ ASSERT_EQ(blob_counter.user_key(), user_key0);
+ ASSERT_EQ(blob_counter.value(), values[0]);
+ CheckInFlow(blob_garbage_meter, first_blob_file_number, 1,
+ first_expected_bytes);
+ CheckInFlow(blob_garbage_meter, second_blob_file_number, 0, 0);
+
+ blob_counter.Next();
+ ASSERT_TRUE(blob_counter.Valid());
+ ASSERT_OK(blob_counter.status());
+ ASSERT_EQ(blob_counter.key(), keys[1]);
+ ASSERT_EQ(blob_counter.user_key(), user_key1);
+ ASSERT_EQ(blob_counter.value(), values[1]);
+ CheckInFlow(blob_garbage_meter, first_blob_file_number, 1,
+ first_expected_bytes);
+ CheckInFlow(blob_garbage_meter, second_blob_file_number, 1,
+ second_expected_bytes);
+
+ blob_counter.Next();
+ ASSERT_TRUE(blob_counter.Valid());
+ ASSERT_OK(blob_counter.status());
+ ASSERT_EQ(blob_counter.key(), keys[2]);
+ ASSERT_EQ(blob_counter.user_key(), user_key2);
+ ASSERT_EQ(blob_counter.value(), values[2]);
+ CheckInFlow(blob_garbage_meter, first_blob_file_number, 1,
+ first_expected_bytes);
+ CheckInFlow(blob_garbage_meter, second_blob_file_number, 1,
+ second_expected_bytes);
+
+ blob_counter.Next();
+ ASSERT_FALSE(blob_counter.Valid());
+ ASSERT_OK(blob_counter.status());
+ CheckInFlow(blob_garbage_meter, first_blob_file_number, 1,
+ first_expected_bytes);
+ CheckInFlow(blob_garbage_meter, second_blob_file_number, 1,
+ second_expected_bytes);
+
+ // Do it again using NextAndGetResult
+ blob_counter.SeekToFirst();
+ ASSERT_TRUE(blob_counter.Valid());
+ ASSERT_OK(blob_counter.status());
+ ASSERT_EQ(blob_counter.key(), keys[0]);
+ ASSERT_EQ(blob_counter.user_key(), user_key0);
+ ASSERT_EQ(blob_counter.value(), values[0]);
+ CheckInFlow(blob_garbage_meter, first_blob_file_number, 2,
+ 2 * first_expected_bytes);
+ CheckInFlow(blob_garbage_meter, second_blob_file_number, 1,
+ second_expected_bytes);
+
+ {
+ IterateResult result;
+ ASSERT_TRUE(blob_counter.NextAndGetResult(&result));
+ ASSERT_EQ(result.key, keys[1]);
+ ASSERT_EQ(blob_counter.user_key(), user_key1);
+ ASSERT_TRUE(blob_counter.Valid());
+ ASSERT_OK(blob_counter.status());
+ ASSERT_EQ(blob_counter.key(), keys[1]);
+ ASSERT_EQ(blob_counter.value(), values[1]);
+ CheckInFlow(blob_garbage_meter, first_blob_file_number, 2,
+ 2 * first_expected_bytes);
+ CheckInFlow(blob_garbage_meter, second_blob_file_number, 2,
+ 2 * second_expected_bytes);
+ }
+
+ {
+ IterateResult result;
+ ASSERT_TRUE(blob_counter.NextAndGetResult(&result));
+ ASSERT_EQ(result.key, keys[2]);
+ ASSERT_EQ(blob_counter.user_key(), user_key2);
+ ASSERT_TRUE(blob_counter.Valid());
+ ASSERT_OK(blob_counter.status());
+ ASSERT_EQ(blob_counter.key(), keys[2]);
+ ASSERT_EQ(blob_counter.value(), values[2]);
+ CheckInFlow(blob_garbage_meter, first_blob_file_number, 2,
+ 2 * first_expected_bytes);
+ CheckInFlow(blob_garbage_meter, second_blob_file_number, 2,
+ 2 * second_expected_bytes);
+ }
+
+ {
+ IterateResult result;
+ ASSERT_FALSE(blob_counter.NextAndGetResult(&result));
+ ASSERT_FALSE(blob_counter.Valid());
+ ASSERT_OK(blob_counter.status());
+ CheckInFlow(blob_garbage_meter, first_blob_file_number, 2,
+ 2 * first_expected_bytes);
+ CheckInFlow(blob_garbage_meter, second_blob_file_number, 2,
+ 2 * second_expected_bytes);
+ }
+
+ // Call SeekToLast and iterate backward
+ blob_counter.SeekToLast();
+ ASSERT_TRUE(blob_counter.Valid());
+ ASSERT_OK(blob_counter.status());
+ ASSERT_EQ(blob_counter.key(), keys[2]);
+ ASSERT_EQ(blob_counter.user_key(), user_key2);
+ ASSERT_EQ(blob_counter.value(), values[2]);
+ CheckInFlow(blob_garbage_meter, first_blob_file_number, 2,
+ 2 * first_expected_bytes);
+ CheckInFlow(blob_garbage_meter, second_blob_file_number, 2,
+ 2 * second_expected_bytes);
+
+ blob_counter.Prev();
+ ASSERT_TRUE(blob_counter.Valid());
+ ASSERT_OK(blob_counter.status());
+ ASSERT_EQ(blob_counter.key(), keys[1]);
+ ASSERT_EQ(blob_counter.user_key(), user_key1);
+ ASSERT_EQ(blob_counter.value(), values[1]);
+ CheckInFlow(blob_garbage_meter, first_blob_file_number, 2,
+ 2 * first_expected_bytes);
+ CheckInFlow(blob_garbage_meter, second_blob_file_number, 3,
+ 3 * second_expected_bytes);
+
+ blob_counter.Prev();
+ ASSERT_TRUE(blob_counter.Valid());
+ ASSERT_OK(blob_counter.status());
+ ASSERT_EQ(blob_counter.key(), keys[0]);
+ ASSERT_EQ(blob_counter.user_key(), user_key0);
+ ASSERT_EQ(blob_counter.value(), values[0]);
+ CheckInFlow(blob_garbage_meter, first_blob_file_number, 3,
+ 3 * first_expected_bytes);
+ CheckInFlow(blob_garbage_meter, second_blob_file_number, 3,
+ 3 * second_expected_bytes);
+
+ blob_counter.Prev();
+ ASSERT_FALSE(blob_counter.Valid());
+ ASSERT_OK(blob_counter.status());
+ CheckInFlow(blob_garbage_meter, first_blob_file_number, 3,
+ 3 * first_expected_bytes);
+ CheckInFlow(blob_garbage_meter, second_blob_file_number, 3,
+ 3 * second_expected_bytes);
+
+ // Call Seek for all keys (plus one that's greater than all of them)
+ blob_counter.Seek(keys[0]);
+ ASSERT_TRUE(blob_counter.Valid());
+ ASSERT_OK(blob_counter.status());
+ ASSERT_EQ(blob_counter.key(), keys[0]);
+ ASSERT_EQ(blob_counter.user_key(), user_key0);
+ ASSERT_EQ(blob_counter.value(), values[0]);
+ CheckInFlow(blob_garbage_meter, first_blob_file_number, 4,
+ 4 * first_expected_bytes);
+ CheckInFlow(blob_garbage_meter, second_blob_file_number, 3,
+ 3 * second_expected_bytes);
+
+ blob_counter.Seek(keys[1]);
+ ASSERT_TRUE(blob_counter.Valid());
+ ASSERT_OK(blob_counter.status());
+ ASSERT_EQ(blob_counter.key(), keys[1]);
+ ASSERT_EQ(blob_counter.user_key(), user_key1);
+ ASSERT_EQ(blob_counter.value(), values[1]);
+ CheckInFlow(blob_garbage_meter, first_blob_file_number, 4,
+ 4 * first_expected_bytes);
+ CheckInFlow(blob_garbage_meter, second_blob_file_number, 4,
+ 4 * second_expected_bytes);
+
+ blob_counter.Seek(keys[2]);
+ ASSERT_TRUE(blob_counter.Valid());
+ ASSERT_OK(blob_counter.status());
+ ASSERT_EQ(blob_counter.key(), keys[2]);
+ ASSERT_EQ(blob_counter.user_key(), user_key2);
+ ASSERT_EQ(blob_counter.value(), values[2]);
+ CheckInFlow(blob_garbage_meter, first_blob_file_number, 4,
+ 4 * first_expected_bytes);
+ CheckInFlow(blob_garbage_meter, second_blob_file_number, 4,
+ 4 * second_expected_bytes);
+
+ blob_counter.Seek("zzz");
+ ASSERT_FALSE(blob_counter.Valid());
+ ASSERT_OK(blob_counter.status());
+ CheckInFlow(blob_garbage_meter, first_blob_file_number, 4,
+ 4 * first_expected_bytes);
+ CheckInFlow(blob_garbage_meter, second_blob_file_number, 4,
+ 4 * second_expected_bytes);
+
+ // Call SeekForPrev for all keys (plus one that's less than all of them)
+ blob_counter.SeekForPrev("aaa");
+ ASSERT_FALSE(blob_counter.Valid());
+ ASSERT_OK(blob_counter.status());
+ CheckInFlow(blob_garbage_meter, first_blob_file_number, 4,
+ 4 * first_expected_bytes);
+ CheckInFlow(blob_garbage_meter, second_blob_file_number, 4,
+ 4 * second_expected_bytes);
+
+ blob_counter.SeekForPrev(keys[0]);
+ ASSERT_TRUE(blob_counter.Valid());
+ ASSERT_OK(blob_counter.status());
+ ASSERT_EQ(blob_counter.key(), keys[0]);
+ ASSERT_EQ(blob_counter.user_key(), user_key0);
+ ASSERT_EQ(blob_counter.value(), values[0]);
+ CheckInFlow(blob_garbage_meter, first_blob_file_number, 5,
+ 5 * first_expected_bytes);
+ CheckInFlow(blob_garbage_meter, second_blob_file_number, 4,
+ 4 * second_expected_bytes);
+
+ blob_counter.SeekForPrev(keys[1]);
+ ASSERT_TRUE(blob_counter.Valid());
+ ASSERT_OK(blob_counter.status());
+ ASSERT_EQ(blob_counter.key(), keys[1]);
+ ASSERT_EQ(blob_counter.user_key(), user_key1);
+ ASSERT_EQ(blob_counter.value(), values[1]);
+ CheckInFlow(blob_garbage_meter, first_blob_file_number, 5,
+ 5 * first_expected_bytes);
+ CheckInFlow(blob_garbage_meter, second_blob_file_number, 5,
+ 5 * second_expected_bytes);
+
+ blob_counter.SeekForPrev(keys[2]);
+ ASSERT_TRUE(blob_counter.Valid());
+ ASSERT_OK(blob_counter.status());
+ ASSERT_EQ(blob_counter.key(), keys[2]);
+ ASSERT_EQ(blob_counter.user_key(), user_key2);
+ ASSERT_EQ(blob_counter.value(), values[2]);
+ CheckInFlow(blob_garbage_meter, first_blob_file_number, 5,
+ 5 * first_expected_bytes);
+ CheckInFlow(blob_garbage_meter, second_blob_file_number, 5,
+ 5 * second_expected_bytes);
+}
+
+TEST(BlobCountingIteratorTest, CorruptBlobIndex) {
+ const std::vector<std::string> keys{
+ test::KeyStr("user_key", 1, kTypeBlobIndex)};
+ const std::vector<std::string> values{"i_am_not_a_blob_index"};
+
+ assert(keys.size() == values.size());
+
+ VectorIterator input(keys, values);
+ BlobGarbageMeter blob_garbage_meter;
+
+ BlobCountingIterator blob_counter(&input, &blob_garbage_meter);
+
+ blob_counter.SeekToFirst();
+ ASSERT_FALSE(blob_counter.Valid());
+ ASSERT_NOK(blob_counter.status());
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/blob/blob_fetcher.cc b/src/rocksdb/db/blob/blob_fetcher.cc
new file mode 100644
index 000000000..124429f93
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_fetcher.cc
@@ -0,0 +1,34 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/blob/blob_fetcher.h"
+
+#include "db/version_set.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+Status BlobFetcher::FetchBlob(const Slice& user_key,
+ const Slice& blob_index_slice,
+ FilePrefetchBuffer* prefetch_buffer,
+ PinnableSlice* blob_value,
+ uint64_t* bytes_read) const {
+ assert(version_);
+
+ return version_->GetBlob(read_options_, user_key, blob_index_slice,
+ prefetch_buffer, blob_value, bytes_read);
+}
+
+Status BlobFetcher::FetchBlob(const Slice& user_key,
+ const BlobIndex& blob_index,
+ FilePrefetchBuffer* prefetch_buffer,
+ PinnableSlice* blob_value,
+ uint64_t* bytes_read) const {
+ assert(version_);
+
+ return version_->GetBlob(read_options_, user_key, blob_index, prefetch_buffer,
+ blob_value, bytes_read);
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_fetcher.h b/src/rocksdb/db/blob/blob_fetcher.h
new file mode 100644
index 000000000..8aeaf965d
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_fetcher.h
@@ -0,0 +1,37 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include "rocksdb/options.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Version;
+class Slice;
+class FilePrefetchBuffer;
+class PinnableSlice;
+class BlobIndex;
+
+// A thin wrapper around the blob retrieval functionality of Version.
+class BlobFetcher {
+ public:
+ BlobFetcher(const Version* version, const ReadOptions& read_options)
+ : version_(version), read_options_(read_options) {}
+
+ Status FetchBlob(const Slice& user_key, const Slice& blob_index_slice,
+ FilePrefetchBuffer* prefetch_buffer,
+ PinnableSlice* blob_value, uint64_t* bytes_read) const;
+
+ Status FetchBlob(const Slice& user_key, const BlobIndex& blob_index,
+ FilePrefetchBuffer* prefetch_buffer,
+ PinnableSlice* blob_value, uint64_t* bytes_read) const;
+
+ private:
+ const Version* version_;
+ ReadOptions read_options_;
+};
+} // namespace ROCKSDB_NAMESPACE
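BlobFetcher only comes into play when values actually live in blob files, i.e. when integrated BlobDB is enabled on the column family. The minimal sketch below shows how such values are written and read back through the public API (reads resolve the blob reference on demand, which is the path BlobFetcher wraps for compaction and Get); the scratch path is an assumption and the example is not part of this patch.

#include <cassert>
#include <iostream>
#include <string>

#include "rocksdb/db.h"
#include "rocksdb/options.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;
  // Store values in blob files instead of inline in the SSTs.
  options.enable_blob_files = true;
  options.min_blob_size = 0;  // treat every value as a blob for the example

  rocksdb::DB* db = nullptr;
  rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/blob_example", &db);
  assert(s.ok());

  s = db->Put(rocksdb::WriteOptions(), "key",
              "a value that ends up in a blob file");
  assert(s.ok());
  s = db->Flush(rocksdb::FlushOptions());  // write out the SST and blob file
  assert(s.ok());

  std::string value;
  s = db->Get(rocksdb::ReadOptions(), "key", &value);
  assert(s.ok());
  std::cout << value << "\n";

  delete db;
  return 0;
}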
diff --git a/src/rocksdb/db/blob/blob_file_addition.cc b/src/rocksdb/db/blob/blob_file_addition.cc
new file mode 100644
index 000000000..71b1bb7fc
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_file_addition.cc
@@ -0,0 +1,156 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/blob/blob_file_addition.h"
+
+#include <ostream>
+#include <sstream>
+
+#include "logging/event_logger.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+#include "test_util/sync_point.h"
+#include "util/coding.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Tags for custom fields. Note that these get persisted in the manifest,
+// so existing tags should not be modified.
+enum BlobFileAddition::CustomFieldTags : uint32_t {
+ kEndMarker,
+
+ // Add forward compatible fields here
+
+ /////////////////////////////////////////////////////////////////////
+
+ kForwardIncompatibleMask = 1 << 6,
+
+ // Add forward incompatible fields here
+};
+
+void BlobFileAddition::EncodeTo(std::string* output) const {
+ PutVarint64(output, blob_file_number_);
+ PutVarint64(output, total_blob_count_);
+ PutVarint64(output, total_blob_bytes_);
+ PutLengthPrefixedSlice(output, checksum_method_);
+ PutLengthPrefixedSlice(output, checksum_value_);
+
+ // Encode any custom fields here. The format to use is a Varint32 tag (see
+ // CustomFieldTags above) followed by a length prefixed slice. Unknown custom
+ // fields will be ignored during decoding unless they're in the forward
+ // incompatible range.
+
+ TEST_SYNC_POINT_CALLBACK("BlobFileAddition::EncodeTo::CustomFields", output);
+
+ PutVarint32(output, kEndMarker);
+}
+
+Status BlobFileAddition::DecodeFrom(Slice* input) {
+ constexpr char class_name[] = "BlobFileAddition";
+
+ if (!GetVarint64(input, &blob_file_number_)) {
+ return Status::Corruption(class_name, "Error decoding blob file number");
+ }
+
+ if (!GetVarint64(input, &total_blob_count_)) {
+ return Status::Corruption(class_name, "Error decoding total blob count");
+ }
+
+ if (!GetVarint64(input, &total_blob_bytes_)) {
+ return Status::Corruption(class_name, "Error decoding total blob bytes");
+ }
+
+ Slice checksum_method;
+ if (!GetLengthPrefixedSlice(input, &checksum_method)) {
+ return Status::Corruption(class_name, "Error decoding checksum method");
+ }
+ checksum_method_ = checksum_method.ToString();
+
+ Slice checksum_value;
+ if (!GetLengthPrefixedSlice(input, &checksum_value)) {
+ return Status::Corruption(class_name, "Error decoding checksum value");
+ }
+ checksum_value_ = checksum_value.ToString();
+
+ while (true) {
+ uint32_t custom_field_tag = 0;
+ if (!GetVarint32(input, &custom_field_tag)) {
+ return Status::Corruption(class_name, "Error decoding custom field tag");
+ }
+
+ if (custom_field_tag == kEndMarker) {
+ break;
+ }
+
+ if (custom_field_tag & kForwardIncompatibleMask) {
+ return Status::Corruption(
+ class_name, "Forward incompatible custom field encountered");
+ }
+
+ Slice custom_field_value;
+ if (!GetLengthPrefixedSlice(input, &custom_field_value)) {
+ return Status::Corruption(class_name,
+ "Error decoding custom field value");
+ }
+ }
+
+ return Status::OK();
+}
+
+std::string BlobFileAddition::DebugString() const {
+ std::ostringstream oss;
+
+ oss << *this;
+
+ return oss.str();
+}
+
+std::string BlobFileAddition::DebugJSON() const {
+ JSONWriter jw;
+
+ jw << *this;
+
+ jw.EndObject();
+
+ return jw.Get();
+}
+
+bool operator==(const BlobFileAddition& lhs, const BlobFileAddition& rhs) {
+ return lhs.GetBlobFileNumber() == rhs.GetBlobFileNumber() &&
+ lhs.GetTotalBlobCount() == rhs.GetTotalBlobCount() &&
+ lhs.GetTotalBlobBytes() == rhs.GetTotalBlobBytes() &&
+ lhs.GetChecksumMethod() == rhs.GetChecksumMethod() &&
+ lhs.GetChecksumValue() == rhs.GetChecksumValue();
+}
+
+bool operator!=(const BlobFileAddition& lhs, const BlobFileAddition& rhs) {
+ return !(lhs == rhs);
+}
+
+std::ostream& operator<<(std::ostream& os,
+ const BlobFileAddition& blob_file_addition) {
+ os << "blob_file_number: " << blob_file_addition.GetBlobFileNumber()
+ << " total_blob_count: " << blob_file_addition.GetTotalBlobCount()
+ << " total_blob_bytes: " << blob_file_addition.GetTotalBlobBytes()
+ << " checksum_method: " << blob_file_addition.GetChecksumMethod()
+ << " checksum_value: "
+ << Slice(blob_file_addition.GetChecksumValue()).ToString(/* hex */ true);
+
+ return os;
+}
+
+JSONWriter& operator<<(JSONWriter& jw,
+ const BlobFileAddition& blob_file_addition) {
+ jw << "BlobFileNumber" << blob_file_addition.GetBlobFileNumber()
+ << "TotalBlobCount" << blob_file_addition.GetTotalBlobCount()
+ << "TotalBlobBytes" << blob_file_addition.GetTotalBlobBytes()
+ << "ChecksumMethod" << blob_file_addition.GetChecksumMethod()
+ << "ChecksumValue"
+ << Slice(blob_file_addition.GetChecksumValue()).ToString(/* hex */ true);
+
+ return jw;
+}
+
+} // namespace ROCKSDB_NAMESPACE
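The manifest record produced by EncodeTo() is a sequence of varint64 fields, two length-prefixed strings, and a varint32 custom-field section terminated by kEndMarker, with bit 6 of a custom tag marking it forward-incompatible. The sketch below reproduces that wire format with hand-rolled varint helpers; these are illustrative re-implementations, not RocksDB's util/coding.h, and the example record values are arbitrary.

#include <cstdint>
#include <iostream>
#include <string>

// LEB128-style varint: 7 payload bits per byte, MSB set on continuation bytes.
void PutVarint64(std::string* dst, uint64_t v) {
  while (v >= 0x80) {
    dst->push_back(static_cast<char>((v & 0x7f) | 0x80));
    v >>= 7;
  }
  dst->push_back(static_cast<char>(v));
}

bool GetVarint64(const std::string& src, size_t* pos, uint64_t* v) {
  uint64_t result = 0;
  for (int shift = 0; shift <= 63 && *pos < src.size(); shift += 7) {
    uint8_t byte = static_cast<uint8_t>(src[(*pos)++]);
    result |= static_cast<uint64_t>(byte & 0x7f) << shift;
    if ((byte & 0x80) == 0) {
      *v = result;
      return true;
    }
  }
  return false;
}

void PutLengthPrefixed(std::string* dst, const std::string& s) {
  PutVarint64(dst, s.size());
  dst->append(s);
}

int main() {
  constexpr uint32_t kEndMarker = 0;
  constexpr uint32_t kForwardIncompatibleMask = 1 << 6;

  // Encode a BlobFileAddition-like record: file number, blob count, blob
  // bytes, checksum method/value, then an empty custom-field section.
  std::string record;
  PutVarint64(&record, 123);      // blob_file_number
  PutVarint64(&record, 2);        // total_blob_count
  PutVarint64(&record, 123456);   // total_blob_bytes
  PutLengthPrefixed(&record, "SHA1");
  PutLengthPrefixed(&record, "0123456789abcdef0123");
  PutVarint64(&record, kEndMarker);

  // Decode the three numeric fields back.
  size_t pos = 0;
  uint64_t file_number = 0, count = 0, bytes = 0;
  bool ok = GetVarint64(record, &pos, &file_number) &&
            GetVarint64(record, &pos, &count) &&
            GetVarint64(record, &pos, &bytes);

  std::cout << (ok ? "decoded " : "failed ") << file_number << " " << count
            << " " << bytes << "\n";
  std::cout << "tag 70 forward-incompatible? "
            << ((70u & kForwardIncompatibleMask) != 0) << "\n";
  return 0;
}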
diff --git a/src/rocksdb/db/blob/blob_file_addition.h b/src/rocksdb/db/blob/blob_file_addition.h
new file mode 100644
index 000000000..43b1a0bcb
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_file_addition.h
@@ -0,0 +1,67 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <cassert>
+#include <cstdint>
+#include <iosfwd>
+#include <string>
+
+#include "db/blob/blob_constants.h"
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class JSONWriter;
+class Slice;
+class Status;
+
+class BlobFileAddition {
+ public:
+ BlobFileAddition() = default;
+
+ BlobFileAddition(uint64_t blob_file_number, uint64_t total_blob_count,
+ uint64_t total_blob_bytes, std::string checksum_method,
+ std::string checksum_value)
+ : blob_file_number_(blob_file_number),
+ total_blob_count_(total_blob_count),
+ total_blob_bytes_(total_blob_bytes),
+ checksum_method_(std::move(checksum_method)),
+ checksum_value_(std::move(checksum_value)) {
+ assert(checksum_method_.empty() == checksum_value_.empty());
+ }
+
+ uint64_t GetBlobFileNumber() const { return blob_file_number_; }
+ uint64_t GetTotalBlobCount() const { return total_blob_count_; }
+ uint64_t GetTotalBlobBytes() const { return total_blob_bytes_; }
+ const std::string& GetChecksumMethod() const { return checksum_method_; }
+ const std::string& GetChecksumValue() const { return checksum_value_; }
+
+ void EncodeTo(std::string* output) const;
+ Status DecodeFrom(Slice* input);
+
+ std::string DebugString() const;
+ std::string DebugJSON() const;
+
+ private:
+ enum CustomFieldTags : uint32_t;
+
+ uint64_t blob_file_number_ = kInvalidBlobFileNumber;
+ uint64_t total_blob_count_ = 0;
+ uint64_t total_blob_bytes_ = 0;
+ std::string checksum_method_;
+ std::string checksum_value_;
+};
+
+bool operator==(const BlobFileAddition& lhs, const BlobFileAddition& rhs);
+bool operator!=(const BlobFileAddition& lhs, const BlobFileAddition& rhs);
+
+std::ostream& operator<<(std::ostream& os,
+ const BlobFileAddition& blob_file_addition);
+JSONWriter& operator<<(JSONWriter& jw,
+ const BlobFileAddition& blob_file_addition);
+
+} // namespace ROCKSDB_NAMESPACE
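
Note (not part of the patch): the class above is essentially a value type. EncodeTo()/DecodeFrom() round-trip it through a plain string, and operator== compares all five fields, which is exactly how the tests in the next file exercise it. A minimal standalone round-trip sketch against the declarations above:

#include <cassert>
#include <string>

#include "db/blob/blob_file_addition.h"
#include "rocksdb/slice.h"
#include "rocksdb/status.h"

using ROCKSDB_NAMESPACE::BlobFileAddition;
using ROCKSDB_NAMESPACE::Slice;
using ROCKSDB_NAMESPACE::Status;

// Encode a BlobFileAddition and decode it back, as the tests below do.
Status RoundTripExample() {
  const BlobFileAddition original(/* blob_file_number */ 42,
                                  /* total_blob_count */ 1000,
                                  /* total_blob_bytes */ 1 << 20,
                                  /* checksum_method */ "CRC32",
                                  /* checksum_value */ "\x01\x02\x03\x04");

  std::string encoded;
  original.EncodeTo(&encoded);

  BlobFileAddition decoded;
  Slice input(encoded);
  const Status s = decoded.DecodeFrom(&input);
  if (s.ok()) {
    assert(decoded == original);  // operator== compares all five fields
  }
  return s;
}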
diff --git a/src/rocksdb/db/blob/blob_file_addition_test.cc b/src/rocksdb/db/blob/blob_file_addition_test.cc
new file mode 100644
index 000000000..64cb0a9d6
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_file_addition_test.cc
@@ -0,0 +1,211 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/blob/blob_file_addition.h"
+
+#include <cstdint>
+#include <cstring>
+#include <string>
+
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "util/coding.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class BlobFileAdditionTest : public testing::Test {
+ public:
+ static void TestEncodeDecode(const BlobFileAddition& blob_file_addition) {
+ std::string encoded;
+ blob_file_addition.EncodeTo(&encoded);
+
+ BlobFileAddition decoded;
+ Slice input(encoded);
+ ASSERT_OK(decoded.DecodeFrom(&input));
+
+ ASSERT_EQ(blob_file_addition, decoded);
+ }
+};
+
+TEST_F(BlobFileAdditionTest, Empty) {
+ BlobFileAddition blob_file_addition;
+
+ ASSERT_EQ(blob_file_addition.GetBlobFileNumber(), kInvalidBlobFileNumber);
+ ASSERT_EQ(blob_file_addition.GetTotalBlobCount(), 0);
+ ASSERT_EQ(blob_file_addition.GetTotalBlobBytes(), 0);
+ ASSERT_TRUE(blob_file_addition.GetChecksumMethod().empty());
+ ASSERT_TRUE(blob_file_addition.GetChecksumValue().empty());
+
+ TestEncodeDecode(blob_file_addition);
+}
+
+TEST_F(BlobFileAdditionTest, NonEmpty) {
+ constexpr uint64_t blob_file_number = 123;
+ constexpr uint64_t total_blob_count = 2;
+ constexpr uint64_t total_blob_bytes = 123456;
+ const std::string checksum_method("SHA1");
+ const std::string checksum_value(
+ "\xbd\xb7\xf3\x4a\x59\xdf\xa1\x59\x2c\xe7\xf5\x2e\x99\xf9\x8c\x57\x0c\x52"
+ "\x5c\xbd");
+
+ BlobFileAddition blob_file_addition(blob_file_number, total_blob_count,
+ total_blob_bytes, checksum_method,
+ checksum_value);
+
+ ASSERT_EQ(blob_file_addition.GetBlobFileNumber(), blob_file_number);
+ ASSERT_EQ(blob_file_addition.GetTotalBlobCount(), total_blob_count);
+ ASSERT_EQ(blob_file_addition.GetTotalBlobBytes(), total_blob_bytes);
+ ASSERT_EQ(blob_file_addition.GetChecksumMethod(), checksum_method);
+ ASSERT_EQ(blob_file_addition.GetChecksumValue(), checksum_value);
+
+ TestEncodeDecode(blob_file_addition);
+}
+
+TEST_F(BlobFileAdditionTest, DecodeErrors) {
+ std::string str;
+ Slice slice(str);
+
+ BlobFileAddition blob_file_addition;
+
+ {
+ const Status s = blob_file_addition.DecodeFrom(&slice);
+ ASSERT_TRUE(s.IsCorruption());
+ ASSERT_TRUE(std::strstr(s.getState(), "blob file number"));
+ }
+
+ constexpr uint64_t blob_file_number = 123;
+ PutVarint64(&str, blob_file_number);
+ slice = str;
+
+ {
+ const Status s = blob_file_addition.DecodeFrom(&slice);
+ ASSERT_TRUE(s.IsCorruption());
+ ASSERT_TRUE(std::strstr(s.getState(), "total blob count"));
+ }
+
+ constexpr uint64_t total_blob_count = 4567;
+ PutVarint64(&str, total_blob_count);
+ slice = str;
+
+ {
+ const Status s = blob_file_addition.DecodeFrom(&slice);
+ ASSERT_TRUE(s.IsCorruption());
+ ASSERT_TRUE(std::strstr(s.getState(), "total blob bytes"));
+ }
+
+ constexpr uint64_t total_blob_bytes = 12345678;
+ PutVarint64(&str, total_blob_bytes);
+ slice = str;
+
+ {
+ const Status s = blob_file_addition.DecodeFrom(&slice);
+ ASSERT_TRUE(s.IsCorruption());
+ ASSERT_TRUE(std::strstr(s.getState(), "checksum method"));
+ }
+
+ constexpr char checksum_method[] = "SHA1";
+ PutLengthPrefixedSlice(&str, checksum_method);
+ slice = str;
+
+ {
+ const Status s = blob_file_addition.DecodeFrom(&slice);
+ ASSERT_TRUE(s.IsCorruption());
+ ASSERT_TRUE(std::strstr(s.getState(), "checksum value"));
+ }
+
+ constexpr char checksum_value[] =
+ "\xbd\xb7\xf3\x4a\x59\xdf\xa1\x59\x2c\xe7\xf5\x2e\x99\xf9\x8c\x57\x0c\x52"
+ "\x5c\xbd";
+ PutLengthPrefixedSlice(&str, checksum_value);
+ slice = str;
+
+ {
+ const Status s = blob_file_addition.DecodeFrom(&slice);
+ ASSERT_TRUE(s.IsCorruption());
+ ASSERT_TRUE(std::strstr(s.getState(), "custom field tag"));
+ }
+
+ constexpr uint32_t custom_tag = 2;
+ PutVarint32(&str, custom_tag);
+ slice = str;
+
+ {
+ const Status s = blob_file_addition.DecodeFrom(&slice);
+ ASSERT_TRUE(s.IsCorruption());
+ ASSERT_TRUE(std::strstr(s.getState(), "custom field value"));
+ }
+}
+
+TEST_F(BlobFileAdditionTest, ForwardCompatibleCustomField) {
+ SyncPoint::GetInstance()->SetCallBack(
+ "BlobFileAddition::EncodeTo::CustomFields", [&](void* arg) {
+ std::string* output = static_cast<std::string*>(arg);
+
+ constexpr uint32_t forward_compatible_tag = 2;
+ PutVarint32(output, forward_compatible_tag);
+
+ PutLengthPrefixedSlice(output, "deadbeef");
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ constexpr uint64_t blob_file_number = 678;
+ constexpr uint64_t total_blob_count = 9999;
+ constexpr uint64_t total_blob_bytes = 100000000;
+ const std::string checksum_method("CRC32");
+ const std::string checksum_value("\x3d\x87\xff\x57");
+
+ BlobFileAddition blob_file_addition(blob_file_number, total_blob_count,
+ total_blob_bytes, checksum_method,
+ checksum_value);
+
+ TestEncodeDecode(blob_file_addition);
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_F(BlobFileAdditionTest, ForwardIncompatibleCustomField) {
+ SyncPoint::GetInstance()->SetCallBack(
+ "BlobFileAddition::EncodeTo::CustomFields", [&](void* arg) {
+ std::string* output = static_cast<std::string*>(arg);
+
+ constexpr uint32_t forward_incompatible_tag = (1 << 6) + 1;
+ PutVarint32(output, forward_incompatible_tag);
+
+ PutLengthPrefixedSlice(output, "foobar");
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ constexpr uint64_t blob_file_number = 456;
+ constexpr uint64_t total_blob_count = 100;
+ constexpr uint64_t total_blob_bytes = 2000000;
+ const std::string checksum_method("CRC32B");
+ const std::string checksum_value("\x6d\xbd\xf2\x3a");
+
+ BlobFileAddition blob_file_addition(blob_file_number, total_blob_count,
+ total_blob_bytes, checksum_method,
+ checksum_value);
+
+ std::string encoded;
+ blob_file_addition.EncodeTo(&encoded);
+
+ BlobFileAddition decoded_blob_file_addition;
+ Slice input(encoded);
+ const Status s = decoded_blob_file_addition.DecodeFrom(&input);
+
+ ASSERT_TRUE(s.IsCorruption());
+ ASSERT_TRUE(std::strstr(s.getState(), "Forward incompatible"));
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/blob/blob_file_builder.cc b/src/rocksdb/db/blob/blob_file_builder.cc
new file mode 100644
index 000000000..5e0e7f6cb
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_file_builder.cc
@@ -0,0 +1,446 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/blob/blob_file_builder.h"
+
+#include <cassert>
+
+#include "db/blob/blob_contents.h"
+#include "db/blob/blob_file_addition.h"
+#include "db/blob/blob_file_completion_callback.h"
+#include "db/blob/blob_index.h"
+#include "db/blob/blob_log_format.h"
+#include "db/blob/blob_log_writer.h"
+#include "db/event_helpers.h"
+#include "db/version_set.h"
+#include "file/filename.h"
+#include "file/read_write_util.h"
+#include "file/writable_file_writer.h"
+#include "logging/logging.h"
+#include "options/cf_options.h"
+#include "options/options_helper.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+#include "test_util/sync_point.h"
+#include "trace_replay/io_tracer.h"
+#include "util/compression.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+BlobFileBuilder::BlobFileBuilder(
+ VersionSet* versions, FileSystem* fs,
+ const ImmutableOptions* immutable_options,
+ const MutableCFOptions* mutable_cf_options, const FileOptions* file_options,
+ std::string db_id, std::string db_session_id, int job_id,
+ uint32_t column_family_id, const std::string& column_family_name,
+ Env::IOPriority io_priority, Env::WriteLifeTimeHint write_hint,
+ const std::shared_ptr<IOTracer>& io_tracer,
+ BlobFileCompletionCallback* blob_callback,
+ BlobFileCreationReason creation_reason,
+ std::vector<std::string>* blob_file_paths,
+ std::vector<BlobFileAddition>* blob_file_additions)
+ : BlobFileBuilder([versions]() { return versions->NewFileNumber(); }, fs,
+ immutable_options, mutable_cf_options, file_options,
+ db_id, db_session_id, job_id, column_family_id,
+ column_family_name, io_priority, write_hint, io_tracer,
+ blob_callback, creation_reason, blob_file_paths,
+ blob_file_additions) {}
+
+BlobFileBuilder::BlobFileBuilder(
+ std::function<uint64_t()> file_number_generator, FileSystem* fs,
+ const ImmutableOptions* immutable_options,
+ const MutableCFOptions* mutable_cf_options, const FileOptions* file_options,
+ std::string db_id, std::string db_session_id, int job_id,
+ uint32_t column_family_id, const std::string& column_family_name,
+ Env::IOPriority io_priority, Env::WriteLifeTimeHint write_hint,
+ const std::shared_ptr<IOTracer>& io_tracer,
+ BlobFileCompletionCallback* blob_callback,
+ BlobFileCreationReason creation_reason,
+ std::vector<std::string>* blob_file_paths,
+ std::vector<BlobFileAddition>* blob_file_additions)
+ : file_number_generator_(std::move(file_number_generator)),
+ fs_(fs),
+ immutable_options_(immutable_options),
+ min_blob_size_(mutable_cf_options->min_blob_size),
+ blob_file_size_(mutable_cf_options->blob_file_size),
+ blob_compression_type_(mutable_cf_options->blob_compression_type),
+ prepopulate_blob_cache_(mutable_cf_options->prepopulate_blob_cache),
+ file_options_(file_options),
+ db_id_(std::move(db_id)),
+ db_session_id_(std::move(db_session_id)),
+ job_id_(job_id),
+ column_family_id_(column_family_id),
+ column_family_name_(column_family_name),
+ io_priority_(io_priority),
+ write_hint_(write_hint),
+ io_tracer_(io_tracer),
+ blob_callback_(blob_callback),
+ creation_reason_(creation_reason),
+ blob_file_paths_(blob_file_paths),
+ blob_file_additions_(blob_file_additions),
+ blob_count_(0),
+ blob_bytes_(0) {
+ assert(file_number_generator_);
+ assert(fs_);
+ assert(immutable_options_);
+ assert(file_options_);
+ assert(blob_file_paths_);
+ assert(blob_file_paths_->empty());
+ assert(blob_file_additions_);
+ assert(blob_file_additions_->empty());
+}
+
+BlobFileBuilder::~BlobFileBuilder() = default;
+
+Status BlobFileBuilder::Add(const Slice& key, const Slice& value,
+ std::string* blob_index) {
+ assert(blob_index);
+ assert(blob_index->empty());
+
+ if (value.size() < min_blob_size_) {
+ return Status::OK();
+ }
+
+ {
+ const Status s = OpenBlobFileIfNeeded();
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ Slice blob = value;
+ std::string compressed_blob;
+
+ {
+ const Status s = CompressBlobIfNeeded(&blob, &compressed_blob);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ uint64_t blob_file_number = 0;
+ uint64_t blob_offset = 0;
+
+ {
+ const Status s =
+ WriteBlobToFile(key, blob, &blob_file_number, &blob_offset);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ {
+ const Status s = CloseBlobFileIfNeeded();
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ {
+ const Status s =
+ PutBlobIntoCacheIfNeeded(value, blob_file_number, blob_offset);
+ if (!s.ok()) {
+ ROCKS_LOG_WARN(immutable_options_->info_log,
+ "Failed to pre-populate the blob into blob cache: %s",
+ s.ToString().c_str());
+ }
+ }
+
+ BlobIndex::EncodeBlob(blob_index, blob_file_number, blob_offset, blob.size(),
+ blob_compression_type_);
+
+ return Status::OK();
+}
+
+Status BlobFileBuilder::Finish() {
+ if (!IsBlobFileOpen()) {
+ return Status::OK();
+ }
+
+ return CloseBlobFile();
+}
+
+bool BlobFileBuilder::IsBlobFileOpen() const { return !!writer_; }
+
+Status BlobFileBuilder::OpenBlobFileIfNeeded() {
+ if (IsBlobFileOpen()) {
+ return Status::OK();
+ }
+
+ assert(!blob_count_);
+ assert(!blob_bytes_);
+
+ assert(file_number_generator_);
+ const uint64_t blob_file_number = file_number_generator_();
+
+ assert(immutable_options_);
+ assert(!immutable_options_->cf_paths.empty());
+ std::string blob_file_path =
+ BlobFileName(immutable_options_->cf_paths.front().path, blob_file_number);
+
+ if (blob_callback_) {
+ blob_callback_->OnBlobFileCreationStarted(
+ blob_file_path, column_family_name_, job_id_, creation_reason_);
+ }
+
+ std::unique_ptr<FSWritableFile> file;
+
+ {
+ assert(file_options_);
+ Status s = NewWritableFile(fs_, blob_file_path, &file, *file_options_);
+
+ TEST_SYNC_POINT_CALLBACK(
+ "BlobFileBuilder::OpenBlobFileIfNeeded:NewWritableFile", &s);
+
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ // Note: files get added to blob_file_paths_ right after the open, so they
+ // can be cleaned up upon failure. Contrast this with blob_file_additions_,
+ // which only contains successfully written files.
+ assert(blob_file_paths_);
+ blob_file_paths_->emplace_back(std::move(blob_file_path));
+
+ assert(file);
+ file->SetIOPriority(io_priority_);
+ file->SetWriteLifeTimeHint(write_hint_);
+ FileTypeSet tmp_set = immutable_options_->checksum_handoff_file_types;
+ Statistics* const statistics = immutable_options_->stats;
+ std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
+ std::move(file), blob_file_paths_->back(), *file_options_,
+ immutable_options_->clock, io_tracer_, statistics,
+ immutable_options_->listeners,
+ immutable_options_->file_checksum_gen_factory.get(),
+ tmp_set.Contains(FileType::kBlobFile), false));
+
+ constexpr bool do_flush = false;
+
+ std::unique_ptr<BlobLogWriter> blob_log_writer(new BlobLogWriter(
+ std::move(file_writer), immutable_options_->clock, statistics,
+ blob_file_number, immutable_options_->use_fsync, do_flush));
+
+ constexpr bool has_ttl = false;
+ constexpr ExpirationRange expiration_range;
+
+ BlobLogHeader header(column_family_id_, blob_compression_type_, has_ttl,
+ expiration_range);
+
+ {
+ Status s = blob_log_writer->WriteHeader(header);
+
+ TEST_SYNC_POINT_CALLBACK(
+ "BlobFileBuilder::OpenBlobFileIfNeeded:WriteHeader", &s);
+
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ writer_ = std::move(blob_log_writer);
+
+ assert(IsBlobFileOpen());
+
+ return Status::OK();
+}
+
+Status BlobFileBuilder::CompressBlobIfNeeded(
+ Slice* blob, std::string* compressed_blob) const {
+ assert(blob);
+ assert(compressed_blob);
+ assert(compressed_blob->empty());
+ assert(immutable_options_);
+
+ if (blob_compression_type_ == kNoCompression) {
+ return Status::OK();
+ }
+
+ CompressionOptions opts;
+ CompressionContext context(blob_compression_type_);
+ constexpr uint64_t sample_for_compression = 0;
+
+ CompressionInfo info(opts, context, CompressionDict::GetEmptyDict(),
+ blob_compression_type_, sample_for_compression);
+
+ constexpr uint32_t compression_format_version = 2;
+
+ bool success = false;
+
+ {
+ StopWatch stop_watch(immutable_options_->clock, immutable_options_->stats,
+ BLOB_DB_COMPRESSION_MICROS);
+ success =
+ CompressData(*blob, info, compression_format_version, compressed_blob);
+ }
+
+ if (!success) {
+ return Status::Corruption("Error compressing blob");
+ }
+
+ *blob = Slice(*compressed_blob);
+
+ return Status::OK();
+}
+
+Status BlobFileBuilder::WriteBlobToFile(const Slice& key, const Slice& blob,
+ uint64_t* blob_file_number,
+ uint64_t* blob_offset) {
+ assert(IsBlobFileOpen());
+ assert(blob_file_number);
+ assert(blob_offset);
+
+ uint64_t key_offset = 0;
+
+ Status s = writer_->AddRecord(key, blob, &key_offset, blob_offset);
+
+ TEST_SYNC_POINT_CALLBACK("BlobFileBuilder::WriteBlobToFile:AddRecord", &s);
+
+ if (!s.ok()) {
+ return s;
+ }
+
+ *blob_file_number = writer_->get_log_number();
+
+ ++blob_count_;
+ blob_bytes_ += BlobLogRecord::kHeaderSize + key.size() + blob.size();
+
+ return Status::OK();
+}
+
+Status BlobFileBuilder::CloseBlobFile() {
+ assert(IsBlobFileOpen());
+
+ BlobLogFooter footer;
+ footer.blob_count = blob_count_;
+
+ std::string checksum_method;
+ std::string checksum_value;
+
+ Status s = writer_->AppendFooter(footer, &checksum_method, &checksum_value);
+
+ TEST_SYNC_POINT_CALLBACK("BlobFileBuilder::WriteBlobToFile:AppendFooter", &s);
+
+ if (!s.ok()) {
+ return s;
+ }
+
+ const uint64_t blob_file_number = writer_->get_log_number();
+
+ if (blob_callback_) {
+ s = blob_callback_->OnBlobFileCompleted(
+ blob_file_paths_->back(), column_family_name_, job_id_,
+ blob_file_number, creation_reason_, s, checksum_value, checksum_method,
+ blob_count_, blob_bytes_);
+ }
+
+ assert(blob_file_additions_);
+ blob_file_additions_->emplace_back(blob_file_number, blob_count_, blob_bytes_,
+ std::move(checksum_method),
+ std::move(checksum_value));
+
+ assert(immutable_options_);
+ ROCKS_LOG_INFO(immutable_options_->logger,
+ "[%s] [JOB %d] Generated blob file #%" PRIu64 ": %" PRIu64
+ " total blobs, %" PRIu64 " total bytes",
+ column_family_name_.c_str(), job_id_, blob_file_number,
+ blob_count_, blob_bytes_);
+
+ writer_.reset();
+ blob_count_ = 0;
+ blob_bytes_ = 0;
+
+ return s;
+}
+
+Status BlobFileBuilder::CloseBlobFileIfNeeded() {
+ assert(IsBlobFileOpen());
+
+ const WritableFileWriter* const file_writer = writer_->file();
+ assert(file_writer);
+
+ if (file_writer->GetFileSize() < blob_file_size_) {
+ return Status::OK();
+ }
+
+ return CloseBlobFile();
+}
+
+void BlobFileBuilder::Abandon(const Status& s) {
+ if (!IsBlobFileOpen()) {
+ return;
+ }
+ if (blob_callback_) {
+ // BlobFileBuilder::Abandon() is called because an error occurred while
+ // writing to the blob file, so the error returned by the callback below
+ // can be ignored.
+ blob_callback_
+ ->OnBlobFileCompleted(blob_file_paths_->back(), column_family_name_,
+ job_id_, writer_->get_log_number(),
+ creation_reason_, s, "", "", blob_count_,
+ blob_bytes_)
+ .PermitUncheckedError();
+ }
+
+ writer_.reset();
+ blob_count_ = 0;
+ blob_bytes_ = 0;
+}
+
+Status BlobFileBuilder::PutBlobIntoCacheIfNeeded(const Slice& blob,
+ uint64_t blob_file_number,
+ uint64_t blob_offset) const {
+ Status s = Status::OK();
+
+ auto blob_cache = immutable_options_->blob_cache;
+ auto statistics = immutable_options_->statistics.get();
+ bool warm_cache =
+ prepopulate_blob_cache_ == PrepopulateBlobCache::kFlushOnly &&
+ creation_reason_ == BlobFileCreationReason::kFlush;
+
+ if (blob_cache && warm_cache) {
+ const OffsetableCacheKey base_cache_key(db_id_, db_session_id_,
+ blob_file_number);
+ const CacheKey cache_key = base_cache_key.WithOffset(blob_offset);
+ const Slice key = cache_key.AsSlice();
+
+ const Cache::Priority priority = Cache::Priority::BOTTOM;
+
+ // Objects to be put into the cache have to be heap-allocated and
+ // self-contained, i.e. own their contents. The Cache has to be able to
+ // take unique ownership of them.
+ CacheAllocationPtr allocation =
+ AllocateBlock(blob.size(), blob_cache->memory_allocator());
+ memcpy(allocation.get(), blob.data(), blob.size());
+ std::unique_ptr<BlobContents> buf =
+ BlobContents::Create(std::move(allocation), blob.size());
+
+ Cache::CacheItemHelper* const cache_item_helper =
+ BlobContents::GetCacheItemHelper();
+ assert(cache_item_helper);
+
+ if (immutable_options_->lowest_used_cache_tier ==
+ CacheTier::kNonVolatileBlockTier) {
+ s = blob_cache->Insert(key, buf.get(), cache_item_helper,
+ buf->ApproximateMemoryUsage(),
+ nullptr /* cache_handle */, priority);
+ } else {
+ s = blob_cache->Insert(key, buf.get(), buf->ApproximateMemoryUsage(),
+ cache_item_helper->del_cb,
+ nullptr /* cache_handle */, priority);
+ }
+
+ if (s.ok()) {
+ RecordTick(statistics, BLOB_DB_CACHE_ADD);
+ RecordTick(statistics, BLOB_DB_CACHE_BYTES_WRITE, buf->size());
+ buf.release();
+ } else {
+ RecordTick(statistics, BLOB_DB_CACHE_ADD_FAILURES);
+ }
+ }
+
+ return s;
+}
+
+} // namespace ROCKSDB_NAMESPACE
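
Note (not part of the patch): the blob_index string returned by Add() is a serialized BlobIndex pointing at the record just written (blob file number, offset, on-disk value size, and compression type). A short illustrative decode sketch; the BlobIndex accessors used here are the same ones exercised by blob_file_builder_test.cc below, not an API introduced in this hunk:

#include <cassert>
#include <cstdint>
#include <string>

#include "db/blob/blob_index.h"
#include "rocksdb/status.h"

using ROCKSDB_NAMESPACE::BlobIndex;
using ROCKSDB_NAMESPACE::Status;

// Decode the reference produced by BlobFileBuilder::Add() to find out where
// the (possibly compressed) blob ended up.
Status InspectBlobReference(const std::string& blob_index_str) {
  BlobIndex index;
  const Status s = index.DecodeFrom(blob_index_str);
  if (!s.ok()) {
    return s;
  }
  assert(!index.IsInlined());  // the builder only writes file-based references
  assert(!index.HasTTL());
  const uint64_t file_number = index.file_number();  // blob file to open
  const uint64_t offset = index.offset();            // record offset in file
  const uint64_t size = index.size();                // stored (compressed) size
  (void)file_number;
  (void)offset;
  (void)size;
  return Status::OK();
}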
diff --git a/src/rocksdb/db/blob/blob_file_builder.h b/src/rocksdb/db/blob/blob_file_builder.h
new file mode 100644
index 000000000..8e7aab502
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_file_builder.h
@@ -0,0 +1,112 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include <cinttypes>
+#include <functional>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "rocksdb/advanced_options.h"
+#include "rocksdb/compression_type.h"
+#include "rocksdb/env.h"
+#include "rocksdb/rocksdb_namespace.h"
+#include "rocksdb/types.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class VersionSet;
+class FileSystem;
+class SystemClock;
+struct ImmutableOptions;
+struct MutableCFOptions;
+struct FileOptions;
+class BlobFileAddition;
+class Status;
+class Slice;
+class BlobLogWriter;
+class IOTracer;
+class BlobFileCompletionCallback;
+
+class BlobFileBuilder {
+ public:
+ BlobFileBuilder(VersionSet* versions, FileSystem* fs,
+ const ImmutableOptions* immutable_options,
+ const MutableCFOptions* mutable_cf_options,
+ const FileOptions* file_options, std::string db_id,
+ std::string db_session_id, int job_id,
+ uint32_t column_family_id,
+ const std::string& column_family_name,
+ Env::IOPriority io_priority,
+ Env::WriteLifeTimeHint write_hint,
+ const std::shared_ptr<IOTracer>& io_tracer,
+ BlobFileCompletionCallback* blob_callback,
+ BlobFileCreationReason creation_reason,
+ std::vector<std::string>* blob_file_paths,
+ std::vector<BlobFileAddition>* blob_file_additions);
+
+ BlobFileBuilder(std::function<uint64_t()> file_number_generator,
+ FileSystem* fs, const ImmutableOptions* immutable_options,
+ const MutableCFOptions* mutable_cf_options,
+ const FileOptions* file_options, std::string db_id,
+ std::string db_session_id, int job_id,
+ uint32_t column_family_id,
+ const std::string& column_family_name,
+ Env::IOPriority io_priority,
+ Env::WriteLifeTimeHint write_hint,
+ const std::shared_ptr<IOTracer>& io_tracer,
+ BlobFileCompletionCallback* blob_callback,
+ BlobFileCreationReason creation_reason,
+ std::vector<std::string>* blob_file_paths,
+ std::vector<BlobFileAddition>* blob_file_additions);
+
+ BlobFileBuilder(const BlobFileBuilder&) = delete;
+ BlobFileBuilder& operator=(const BlobFileBuilder&) = delete;
+
+ ~BlobFileBuilder();
+
+ Status Add(const Slice& key, const Slice& value, std::string* blob_index);
+ Status Finish();
+ void Abandon(const Status& s);
+
+ private:
+ bool IsBlobFileOpen() const;
+ Status OpenBlobFileIfNeeded();
+ Status CompressBlobIfNeeded(Slice* blob, std::string* compressed_blob) const;
+ Status WriteBlobToFile(const Slice& key, const Slice& blob,
+ uint64_t* blob_file_number, uint64_t* blob_offset);
+ Status CloseBlobFile();
+ Status CloseBlobFileIfNeeded();
+
+ Status PutBlobIntoCacheIfNeeded(const Slice& blob, uint64_t blob_file_number,
+ uint64_t blob_offset) const;
+
+ std::function<uint64_t()> file_number_generator_;
+ FileSystem* fs_;
+ const ImmutableOptions* immutable_options_;
+ uint64_t min_blob_size_;
+ uint64_t blob_file_size_;
+ CompressionType blob_compression_type_;
+ PrepopulateBlobCache prepopulate_blob_cache_;
+ const FileOptions* file_options_;
+ const std::string db_id_;
+ const std::string db_session_id_;
+ int job_id_;
+ uint32_t column_family_id_;
+ std::string column_family_name_;
+ Env::IOPriority io_priority_;
+ Env::WriteLifeTimeHint write_hint_;
+ std::shared_ptr<IOTracer> io_tracer_;
+ BlobFileCompletionCallback* blob_callback_;
+ BlobFileCreationReason creation_reason_;
+ std::vector<std::string>* blob_file_paths_;
+ std::vector<BlobFileAddition>* blob_file_additions_;
+ std::unique_ptr<BlobLogWriter> writer_;
+ uint64_t blob_count_;
+ uint64_t blob_bytes_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
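
Note (not part of the patch): for orientation, a hypothetical end-to-end use of the second constructor above (the one taking a file-number generator), closely following the tests in blob_file_builder_test.cc below; the option wiring, column family ID/name, and priorities are placeholder values, not anything mandated by this header:

#include <cstdint>
#include <string>
#include <vector>

#include "db/blob/blob_file_addition.h"
#include "db/blob/blob_file_builder.h"
#include "options/cf_options.h"
#include "rocksdb/env.h"
#include "rocksdb/file_system.h"
#include "rocksdb/status.h"

using namespace ROCKSDB_NAMESPACE;

// Feed one value through the builder; large values come back as blob
// references in `blob_index`, values below min_blob_size leave it empty.
Status WriteOneValue(FileSystem* fs, const ImmutableOptions& ioptions,
                     const MutableCFOptions& moptions,
                     const FileOptions& foptions) {
  uint64_t next_file_number = 1;
  std::vector<std::string> blob_file_paths;
  std::vector<BlobFileAddition> blob_file_additions;

  BlobFileBuilder builder(
      [&next_file_number]() { return next_file_number++; }, fs, &ioptions,
      &moptions, &foptions, /* db_id */ "", /* db_session_id */ "",
      /* job_id */ 1, /* column_family_id */ 0,
      /* column_family_name */ "default", Env::IO_HIGH, Env::WLTH_MEDIUM,
      /* io_tracer */ nullptr, /* blob_callback */ nullptr,
      BlobFileCreationReason::kFlush, &blob_file_paths, &blob_file_additions);

  std::string blob_index;
  Status s = builder.Add("key", std::string(1024, 'v'), &blob_index);
  if (!s.ok()) {
    builder.Abandon(s);
    return s;
  }

  s = builder.Finish();
  // On success, blob_file_paths/blob_file_additions describe the new file(s).
  return s;
}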
diff --git a/src/rocksdb/db/blob/blob_file_builder_test.cc b/src/rocksdb/db/blob/blob_file_builder_test.cc
new file mode 100644
index 000000000..3a0feee45
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_file_builder_test.cc
@@ -0,0 +1,680 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/blob/blob_file_builder.h"
+
+#include <cassert>
+#include <cinttypes>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "db/blob/blob_file_addition.h"
+#include "db/blob/blob_index.h"
+#include "db/blob/blob_log_format.h"
+#include "db/blob/blob_log_sequential_reader.h"
+#include "env/mock_env.h"
+#include "file/filename.h"
+#include "file/random_access_file_reader.h"
+#include "options/cf_options.h"
+#include "rocksdb/env.h"
+#include "rocksdb/file_checksum.h"
+#include "rocksdb/options.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "util/compression.h"
+#include "utilities/fault_injection_env.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class TestFileNumberGenerator {
+ public:
+ uint64_t operator()() { return ++next_file_number_; }
+
+ private:
+ uint64_t next_file_number_ = 1;
+};
+
+class BlobFileBuilderTest : public testing::Test {
+ protected:
+ BlobFileBuilderTest() {
+ mock_env_.reset(MockEnv::Create(Env::Default()));
+ fs_ = mock_env_->GetFileSystem().get();
+ clock_ = mock_env_->GetSystemClock().get();
+ }
+
+ void VerifyBlobFile(uint64_t blob_file_number,
+ const std::string& blob_file_path,
+ uint32_t column_family_id,
+ CompressionType blob_compression_type,
+ const std::vector<std::pair<std::string, std::string>>&
+ expected_key_value_pairs,
+ const std::vector<std::string>& blob_indexes) {
+ assert(expected_key_value_pairs.size() == blob_indexes.size());
+
+ std::unique_ptr<FSRandomAccessFile> file;
+ constexpr IODebugContext* dbg = nullptr;
+ ASSERT_OK(
+ fs_->NewRandomAccessFile(blob_file_path, file_options_, &file, dbg));
+
+ std::unique_ptr<RandomAccessFileReader> file_reader(
+ new RandomAccessFileReader(std::move(file), blob_file_path, clock_));
+
+ constexpr Statistics* statistics = nullptr;
+ BlobLogSequentialReader blob_log_reader(std::move(file_reader), clock_,
+ statistics);
+
+ BlobLogHeader header;
+ ASSERT_OK(blob_log_reader.ReadHeader(&header));
+ ASSERT_EQ(header.version, kVersion1);
+ ASSERT_EQ(header.column_family_id, column_family_id);
+ ASSERT_EQ(header.compression, blob_compression_type);
+ ASSERT_FALSE(header.has_ttl);
+ ASSERT_EQ(header.expiration_range, ExpirationRange());
+
+ for (size_t i = 0; i < expected_key_value_pairs.size(); ++i) {
+ BlobLogRecord record;
+ uint64_t blob_offset = 0;
+
+ ASSERT_OK(blob_log_reader.ReadRecord(
+ &record, BlobLogSequentialReader::kReadHeaderKeyBlob, &blob_offset));
+
+ // Check the contents of the blob file
+ const auto& expected_key_value = expected_key_value_pairs[i];
+ const auto& key = expected_key_value.first;
+ const auto& value = expected_key_value.second;
+
+ ASSERT_EQ(record.key_size, key.size());
+ ASSERT_EQ(record.value_size, value.size());
+ ASSERT_EQ(record.expiration, 0);
+ ASSERT_EQ(record.key, key);
+ ASSERT_EQ(record.value, value);
+
+ // Make sure the blob reference returned by the builder points to the
+ // right place
+ BlobIndex blob_index;
+ ASSERT_OK(blob_index.DecodeFrom(blob_indexes[i]));
+ ASSERT_FALSE(blob_index.IsInlined());
+ ASSERT_FALSE(blob_index.HasTTL());
+ ASSERT_EQ(blob_index.file_number(), blob_file_number);
+ ASSERT_EQ(blob_index.offset(), blob_offset);
+ ASSERT_EQ(blob_index.size(), value.size());
+ }
+
+ BlobLogFooter footer;
+ ASSERT_OK(blob_log_reader.ReadFooter(&footer));
+ ASSERT_EQ(footer.blob_count, expected_key_value_pairs.size());
+ ASSERT_EQ(footer.expiration_range, ExpirationRange());
+ }
+
+ std::unique_ptr<Env> mock_env_;
+ FileSystem* fs_;
+ SystemClock* clock_;
+ FileOptions file_options_;
+};
+
+TEST_F(BlobFileBuilderTest, BuildAndCheckOneFile) {
+ // Build a single blob file
+ constexpr size_t number_of_blobs = 10;
+ constexpr size_t key_size = 1;
+ constexpr size_t value_size = 4;
+ constexpr size_t value_offset = 1234;
+
+ Options options;
+ options.cf_paths.emplace_back(
+ test::PerThreadDBPath(mock_env_.get(),
+ "BlobFileBuilderTest_BuildAndCheckOneFile"),
+ 0);
+ options.enable_blob_files = true;
+ options.env = mock_env_.get();
+
+ ImmutableOptions immutable_options(options);
+ MutableCFOptions mutable_cf_options(options);
+
+ constexpr int job_id = 1;
+ constexpr uint32_t column_family_id = 123;
+ constexpr char column_family_name[] = "foobar";
+ constexpr Env::IOPriority io_priority = Env::IO_HIGH;
+ constexpr Env::WriteLifeTimeHint write_hint = Env::WLTH_MEDIUM;
+
+ std::vector<std::string> blob_file_paths;
+ std::vector<BlobFileAddition> blob_file_additions;
+
+ BlobFileBuilder builder(
+ TestFileNumberGenerator(), fs_, &immutable_options, &mutable_cf_options,
+ &file_options_, "" /*db_id*/, "" /*db_session_id*/, job_id,
+ column_family_id, column_family_name, io_priority, write_hint,
+ nullptr /*IOTracer*/, nullptr /*BlobFileCompletionCallback*/,
+ BlobFileCreationReason::kFlush, &blob_file_paths, &blob_file_additions);
+
+ std::vector<std::pair<std::string, std::string>> expected_key_value_pairs(
+ number_of_blobs);
+ std::vector<std::string> blob_indexes(number_of_blobs);
+
+ for (size_t i = 0; i < number_of_blobs; ++i) {
+ auto& expected_key_value = expected_key_value_pairs[i];
+
+ auto& key = expected_key_value.first;
+ key = std::to_string(i);
+ assert(key.size() == key_size);
+
+ auto& value = expected_key_value.second;
+ value = std::to_string(i + value_offset);
+ assert(value.size() == value_size);
+
+ auto& blob_index = blob_indexes[i];
+
+ ASSERT_OK(builder.Add(key, value, &blob_index));
+ ASSERT_FALSE(blob_index.empty());
+ }
+
+ ASSERT_OK(builder.Finish());
+
+ // Check the metadata generated
+ constexpr uint64_t blob_file_number = 2;
+
+ ASSERT_EQ(blob_file_paths.size(), 1);
+
+ const std::string& blob_file_path = blob_file_paths[0];
+
+ ASSERT_EQ(
+ blob_file_path,
+ BlobFileName(immutable_options.cf_paths.front().path, blob_file_number));
+
+ ASSERT_EQ(blob_file_additions.size(), 1);
+
+ const auto& blob_file_addition = blob_file_additions[0];
+
+ ASSERT_EQ(blob_file_addition.GetBlobFileNumber(), blob_file_number);
+ ASSERT_EQ(blob_file_addition.GetTotalBlobCount(), number_of_blobs);
+ ASSERT_EQ(
+ blob_file_addition.GetTotalBlobBytes(),
+ number_of_blobs * (BlobLogRecord::kHeaderSize + key_size + value_size));
+
+ // Verify the contents of the new blob file as well as the blob references
+ VerifyBlobFile(blob_file_number, blob_file_path, column_family_id,
+ kNoCompression, expected_key_value_pairs, blob_indexes);
+}
+
+TEST_F(BlobFileBuilderTest, BuildAndCheckMultipleFiles) {
+ // Build multiple blob files: file size limit is set to the size of a single
+ // value, so each blob ends up in a file of its own
+ constexpr size_t number_of_blobs = 10;
+ constexpr size_t key_size = 1;
+ constexpr size_t value_size = 10;
+ constexpr size_t value_offset = 1234567890;
+
+ Options options;
+ options.cf_paths.emplace_back(
+ test::PerThreadDBPath(mock_env_.get(),
+ "BlobFileBuilderTest_BuildAndCheckMultipleFiles"),
+ 0);
+ options.enable_blob_files = true;
+ options.blob_file_size = value_size;
+ options.env = mock_env_.get();
+
+ ImmutableOptions immutable_options(options);
+ MutableCFOptions mutable_cf_options(options);
+
+ constexpr int job_id = 1;
+ constexpr uint32_t column_family_id = 123;
+ constexpr char column_family_name[] = "foobar";
+ constexpr Env::IOPriority io_priority = Env::IO_HIGH;
+ constexpr Env::WriteLifeTimeHint write_hint = Env::WLTH_MEDIUM;
+
+ std::vector<std::string> blob_file_paths;
+ std::vector<BlobFileAddition> blob_file_additions;
+
+ BlobFileBuilder builder(
+ TestFileNumberGenerator(), fs_, &immutable_options, &mutable_cf_options,
+ &file_options_, "" /*db_id*/, "" /*db_session_id*/, job_id,
+ column_family_id, column_family_name, io_priority, write_hint,
+ nullptr /*IOTracer*/, nullptr /*BlobFileCompletionCallback*/,
+ BlobFileCreationReason::kFlush, &blob_file_paths, &blob_file_additions);
+
+ std::vector<std::pair<std::string, std::string>> expected_key_value_pairs(
+ number_of_blobs);
+ std::vector<std::string> blob_indexes(number_of_blobs);
+
+ for (size_t i = 0; i < number_of_blobs; ++i) {
+ auto& expected_key_value = expected_key_value_pairs[i];
+
+ auto& key = expected_key_value.first;
+ key = std::to_string(i);
+ assert(key.size() == key_size);
+
+ auto& value = expected_key_value.second;
+ value = std::to_string(i + value_offset);
+ assert(value.size() == value_size);
+
+ auto& blob_index = blob_indexes[i];
+
+ ASSERT_OK(builder.Add(key, value, &blob_index));
+ ASSERT_FALSE(blob_index.empty());
+ }
+
+ ASSERT_OK(builder.Finish());
+
+ // Check the metadata generated
+ ASSERT_EQ(blob_file_paths.size(), number_of_blobs);
+ ASSERT_EQ(blob_file_additions.size(), number_of_blobs);
+
+ for (size_t i = 0; i < number_of_blobs; ++i) {
+ const uint64_t blob_file_number = i + 2;
+
+ ASSERT_EQ(blob_file_paths[i],
+ BlobFileName(immutable_options.cf_paths.front().path,
+ blob_file_number));
+
+ const auto& blob_file_addition = blob_file_additions[i];
+
+ ASSERT_EQ(blob_file_addition.GetBlobFileNumber(), blob_file_number);
+ ASSERT_EQ(blob_file_addition.GetTotalBlobCount(), 1);
+ ASSERT_EQ(blob_file_addition.GetTotalBlobBytes(),
+ BlobLogRecord::kHeaderSize + key_size + value_size);
+ }
+
+ // Verify the contents of the new blob files as well as the blob references
+ for (size_t i = 0; i < number_of_blobs; ++i) {
+ std::vector<std::pair<std::string, std::string>> expected_key_value_pair{
+ expected_key_value_pairs[i]};
+ std::vector<std::string> blob_index{blob_indexes[i]};
+
+ VerifyBlobFile(i + 2, blob_file_paths[i], column_family_id, kNoCompression,
+ expected_key_value_pair, blob_index);
+ }
+}
+
+TEST_F(BlobFileBuilderTest, InlinedValues) {
+ // All values are below the min_blob_size threshold; no blob files get written
+ constexpr size_t number_of_blobs = 10;
+ constexpr size_t key_size = 1;
+ constexpr size_t value_size = 10;
+ constexpr size_t value_offset = 1234567890;
+
+ Options options;
+ options.cf_paths.emplace_back(
+ test::PerThreadDBPath(mock_env_.get(),
+ "BlobFileBuilderTest_InlinedValues"),
+ 0);
+ options.enable_blob_files = true;
+ options.min_blob_size = 1024;
+ options.env = mock_env_.get();
+
+ ImmutableOptions immutable_options(options);
+ MutableCFOptions mutable_cf_options(options);
+
+ constexpr int job_id = 1;
+ constexpr uint32_t column_family_id = 123;
+ constexpr char column_family_name[] = "foobar";
+ constexpr Env::IOPriority io_priority = Env::IO_HIGH;
+ constexpr Env::WriteLifeTimeHint write_hint = Env::WLTH_MEDIUM;
+
+ std::vector<std::string> blob_file_paths;
+ std::vector<BlobFileAddition> blob_file_additions;
+
+ BlobFileBuilder builder(
+ TestFileNumberGenerator(), fs_, &immutable_options, &mutable_cf_options,
+ &file_options_, "" /*db_id*/, "" /*db_session_id*/, job_id,
+ column_family_id, column_family_name, io_priority, write_hint,
+ nullptr /*IOTracer*/, nullptr /*BlobFileCompletionCallback*/,
+ BlobFileCreationReason::kFlush, &blob_file_paths, &blob_file_additions);
+
+ for (size_t i = 0; i < number_of_blobs; ++i) {
+ const std::string key = std::to_string(i);
+ assert(key.size() == key_size);
+
+ const std::string value = std::to_string(i + value_offset);
+ assert(value.size() == value_size);
+
+ std::string blob_index;
+ ASSERT_OK(builder.Add(key, value, &blob_index));
+ ASSERT_TRUE(blob_index.empty());
+ }
+
+ ASSERT_OK(builder.Finish());
+
+ // Check the metadata generated
+ ASSERT_TRUE(blob_file_paths.empty());
+ ASSERT_TRUE(blob_file_additions.empty());
+}
+
+TEST_F(BlobFileBuilderTest, Compression) {
+ // Build a blob file with a compressed blob
+ if (!Snappy_Supported()) {
+ return;
+ }
+
+ constexpr size_t key_size = 1;
+ constexpr size_t value_size = 100;
+
+ Options options;
+ options.cf_paths.emplace_back(
+ test::PerThreadDBPath(mock_env_.get(), "BlobFileBuilderTest_Compression"),
+ 0);
+ options.enable_blob_files = true;
+ options.blob_compression_type = kSnappyCompression;
+ options.env = mock_env_.get();
+
+ ImmutableOptions immutable_options(options);
+ MutableCFOptions mutable_cf_options(options);
+
+ constexpr int job_id = 1;
+ constexpr uint32_t column_family_id = 123;
+ constexpr char column_family_name[] = "foobar";
+ constexpr Env::IOPriority io_priority = Env::IO_HIGH;
+ constexpr Env::WriteLifeTimeHint write_hint = Env::WLTH_MEDIUM;
+
+ std::vector<std::string> blob_file_paths;
+ std::vector<BlobFileAddition> blob_file_additions;
+
+ BlobFileBuilder builder(
+ TestFileNumberGenerator(), fs_, &immutable_options, &mutable_cf_options,
+ &file_options_, "" /*db_id*/, "" /*db_session_id*/, job_id,
+ column_family_id, column_family_name, io_priority, write_hint,
+ nullptr /*IOTracer*/, nullptr /*BlobFileCompletionCallback*/,
+ BlobFileCreationReason::kFlush, &blob_file_paths, &blob_file_additions);
+
+ const std::string key("1");
+ const std::string uncompressed_value(value_size, 'x');
+
+ std::string blob_index;
+
+ ASSERT_OK(builder.Add(key, uncompressed_value, &blob_index));
+ ASSERT_FALSE(blob_index.empty());
+
+ ASSERT_OK(builder.Finish());
+
+ // Check the metadata generated
+ constexpr uint64_t blob_file_number = 2;
+
+ ASSERT_EQ(blob_file_paths.size(), 1);
+
+ const std::string& blob_file_path = blob_file_paths[0];
+
+ ASSERT_EQ(
+ blob_file_path,
+ BlobFileName(immutable_options.cf_paths.front().path, blob_file_number));
+
+ ASSERT_EQ(blob_file_additions.size(), 1);
+
+ const auto& blob_file_addition = blob_file_additions[0];
+
+ ASSERT_EQ(blob_file_addition.GetBlobFileNumber(), blob_file_number);
+ ASSERT_EQ(blob_file_addition.GetTotalBlobCount(), 1);
+
+ CompressionOptions opts;
+ CompressionContext context(kSnappyCompression);
+ constexpr uint64_t sample_for_compression = 0;
+
+ CompressionInfo info(opts, context, CompressionDict::GetEmptyDict(),
+ kSnappyCompression, sample_for_compression);
+
+ std::string compressed_value;
+ ASSERT_TRUE(Snappy_Compress(info, uncompressed_value.data(),
+ uncompressed_value.size(), &compressed_value));
+
+ ASSERT_EQ(blob_file_addition.GetTotalBlobBytes(),
+ BlobLogRecord::kHeaderSize + key_size + compressed_value.size());
+
+ // Verify the contents of the new blob file as well as the blob reference
+ std::vector<std::pair<std::string, std::string>> expected_key_value_pairs{
+ {key, compressed_value}};
+ std::vector<std::string> blob_indexes{blob_index};
+
+ VerifyBlobFile(blob_file_number, blob_file_path, column_family_id,
+ kSnappyCompression, expected_key_value_pairs, blob_indexes);
+}
+
+TEST_F(BlobFileBuilderTest, CompressionError) {
+ // Simulate an error during compression
+ if (!Snappy_Supported()) {
+ return;
+ }
+
+ Options options;
+ options.cf_paths.emplace_back(
+ test::PerThreadDBPath(mock_env_.get(),
+ "BlobFileBuilderTest_CompressionError"),
+ 0);
+ options.enable_blob_files = true;
+ options.blob_compression_type = kSnappyCompression;
+ options.env = mock_env_.get();
+ ImmutableOptions immutable_options(options);
+ MutableCFOptions mutable_cf_options(options);
+
+ constexpr int job_id = 1;
+ constexpr uint32_t column_family_id = 123;
+ constexpr char column_family_name[] = "foobar";
+ constexpr Env::IOPriority io_priority = Env::IO_HIGH;
+ constexpr Env::WriteLifeTimeHint write_hint = Env::WLTH_MEDIUM;
+
+ std::vector<std::string> blob_file_paths;
+ std::vector<BlobFileAddition> blob_file_additions;
+
+ BlobFileBuilder builder(
+ TestFileNumberGenerator(), fs_, &immutable_options, &mutable_cf_options,
+ &file_options_, "" /*db_id*/, "" /*db_session_id*/, job_id,
+ column_family_id, column_family_name, io_priority, write_hint,
+ nullptr /*IOTracer*/, nullptr /*BlobFileCompletionCallback*/,
+ BlobFileCreationReason::kFlush, &blob_file_paths, &blob_file_additions);
+
+ SyncPoint::GetInstance()->SetCallBack("CompressData:TamperWithReturnValue",
+ [](void* arg) {
+ bool* ret = static_cast<bool*>(arg);
+ *ret = false;
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ constexpr char key[] = "1";
+ constexpr char value[] = "deadbeef";
+
+ std::string blob_index;
+
+ ASSERT_TRUE(builder.Add(key, value, &blob_index).IsCorruption());
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+
+ constexpr uint64_t blob_file_number = 2;
+
+ ASSERT_EQ(blob_file_paths.size(), 1);
+ ASSERT_EQ(
+ blob_file_paths[0],
+ BlobFileName(immutable_options.cf_paths.front().path, blob_file_number));
+
+ ASSERT_TRUE(blob_file_additions.empty());
+}
+
+TEST_F(BlobFileBuilderTest, Checksum) {
+ // Build a blob file with checksum
+
+ class DummyFileChecksumGenerator : public FileChecksumGenerator {
+ public:
+ void Update(const char* /* data */, size_t /* n */) override {}
+
+ void Finalize() override {}
+
+ std::string GetChecksum() const override { return std::string("dummy"); }
+
+ const char* Name() const override { return "DummyFileChecksum"; }
+ };
+
+ class DummyFileChecksumGenFactory : public FileChecksumGenFactory {
+ public:
+ std::unique_ptr<FileChecksumGenerator> CreateFileChecksumGenerator(
+ const FileChecksumGenContext& /* context */) override {
+ return std::unique_ptr<FileChecksumGenerator>(
+ new DummyFileChecksumGenerator);
+ }
+
+ const char* Name() const override { return "DummyFileChecksumGenFactory"; }
+ };
+
+ Options options;
+ options.cf_paths.emplace_back(
+ test::PerThreadDBPath(mock_env_.get(), "BlobFileBuilderTest_Checksum"),
+ 0);
+ options.enable_blob_files = true;
+ options.file_checksum_gen_factory =
+ std::make_shared<DummyFileChecksumGenFactory>();
+ options.env = mock_env_.get();
+
+ ImmutableOptions immutable_options(options);
+ MutableCFOptions mutable_cf_options(options);
+
+ constexpr int job_id = 1;
+ constexpr uint32_t column_family_id = 123;
+ constexpr char column_family_name[] = "foobar";
+ constexpr Env::IOPriority io_priority = Env::IO_HIGH;
+ constexpr Env::WriteLifeTimeHint write_hint = Env::WLTH_MEDIUM;
+
+ std::vector<std::string> blob_file_paths;
+ std::vector<BlobFileAddition> blob_file_additions;
+
+ BlobFileBuilder builder(
+ TestFileNumberGenerator(), fs_, &immutable_options, &mutable_cf_options,
+ &file_options_, "" /*db_id*/, "" /*db_session_id*/, job_id,
+ column_family_id, column_family_name, io_priority, write_hint,
+ nullptr /*IOTracer*/, nullptr /*BlobFileCompletionCallback*/,
+ BlobFileCreationReason::kFlush, &blob_file_paths, &blob_file_additions);
+
+ const std::string key("1");
+ const std::string value("deadbeef");
+
+ std::string blob_index;
+
+ ASSERT_OK(builder.Add(key, value, &blob_index));
+ ASSERT_FALSE(blob_index.empty());
+
+ ASSERT_OK(builder.Finish());
+
+ // Check the metadata generated
+ constexpr uint64_t blob_file_number = 2;
+
+ ASSERT_EQ(blob_file_paths.size(), 1);
+
+ const std::string& blob_file_path = blob_file_paths[0];
+
+ ASSERT_EQ(
+ blob_file_path,
+ BlobFileName(immutable_options.cf_paths.front().path, blob_file_number));
+
+ ASSERT_EQ(blob_file_additions.size(), 1);
+
+ const auto& blob_file_addition = blob_file_additions[0];
+
+ ASSERT_EQ(blob_file_addition.GetBlobFileNumber(), blob_file_number);
+ ASSERT_EQ(blob_file_addition.GetTotalBlobCount(), 1);
+ ASSERT_EQ(blob_file_addition.GetTotalBlobBytes(),
+ BlobLogRecord::kHeaderSize + key.size() + value.size());
+ ASSERT_EQ(blob_file_addition.GetChecksumMethod(), "DummyFileChecksum");
+ ASSERT_EQ(blob_file_addition.GetChecksumValue(), "dummy");
+
+ // Verify the contents of the new blob file as well as the blob reference
+ std::vector<std::pair<std::string, std::string>> expected_key_value_pairs{
+ {key, value}};
+ std::vector<std::string> blob_indexes{blob_index};
+
+ VerifyBlobFile(blob_file_number, blob_file_path, column_family_id,
+ kNoCompression, expected_key_value_pairs, blob_indexes);
+}
+
+class BlobFileBuilderIOErrorTest
+ : public testing::Test,
+ public testing::WithParamInterface<std::string> {
+ protected:
+ BlobFileBuilderIOErrorTest() : sync_point_(GetParam()) {
+ mock_env_.reset(MockEnv::Create(Env::Default()));
+ fs_ = mock_env_->GetFileSystem().get();
+ }
+
+ std::unique_ptr<Env> mock_env_;
+ FileSystem* fs_;
+ FileOptions file_options_;
+ std::string sync_point_;
+};
+
+INSTANTIATE_TEST_CASE_P(
+ BlobFileBuilderTest, BlobFileBuilderIOErrorTest,
+ ::testing::ValuesIn(std::vector<std::string>{
+ "BlobFileBuilder::OpenBlobFileIfNeeded:NewWritableFile",
+ "BlobFileBuilder::OpenBlobFileIfNeeded:WriteHeader",
+ "BlobFileBuilder::WriteBlobToFile:AddRecord",
+ "BlobFileBuilder::WriteBlobToFile:AppendFooter"}));
+
+TEST_P(BlobFileBuilderIOErrorTest, IOError) {
+ // Simulate an I/O error during the specified step of Add()
+ // Note: blob_file_size will be set to value_size in order for the first blob
+ // to trigger close
+ constexpr size_t value_size = 8;
+
+ Options options;
+ options.cf_paths.emplace_back(
+ test::PerThreadDBPath(mock_env_.get(),
+ "BlobFileBuilderIOErrorTest_IOError"),
+ 0);
+ options.enable_blob_files = true;
+ options.blob_file_size = value_size;
+ options.env = mock_env_.get();
+
+ ImmutableOptions immutable_options(options);
+ MutableCFOptions mutable_cf_options(options);
+
+ constexpr int job_id = 1;
+ constexpr uint32_t column_family_id = 123;
+ constexpr char column_family_name[] = "foobar";
+ constexpr Env::IOPriority io_priority = Env::IO_HIGH;
+ constexpr Env::WriteLifeTimeHint write_hint = Env::WLTH_MEDIUM;
+
+ std::vector<std::string> blob_file_paths;
+ std::vector<BlobFileAddition> blob_file_additions;
+
+ BlobFileBuilder builder(
+ TestFileNumberGenerator(), fs_, &immutable_options, &mutable_cf_options,
+ &file_options_, "" /*db_id*/, "" /*db_session_id*/, job_id,
+ column_family_id, column_family_name, io_priority, write_hint,
+ nullptr /*IOTracer*/, nullptr /*BlobFileCompletionCallback*/,
+ BlobFileCreationReason::kFlush, &blob_file_paths, &blob_file_additions);
+
+ SyncPoint::GetInstance()->SetCallBack(sync_point_, [this](void* arg) {
+ Status* const s = static_cast<Status*>(arg);
+ assert(s);
+
+ (*s) = Status::IOError(sync_point_);
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ constexpr char key[] = "1";
+ constexpr char value[] = "deadbeef";
+
+ std::string blob_index;
+
+ ASSERT_TRUE(builder.Add(key, value, &blob_index).IsIOError());
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+
+ if (sync_point_ == "BlobFileBuilder::OpenBlobFileIfNeeded:NewWritableFile") {
+ ASSERT_TRUE(blob_file_paths.empty());
+ } else {
+ constexpr uint64_t blob_file_number = 2;
+
+ ASSERT_EQ(blob_file_paths.size(), 1);
+ ASSERT_EQ(blob_file_paths[0],
+ BlobFileName(immutable_options.cf_paths.front().path,
+ blob_file_number));
+ }
+
+ ASSERT_TRUE(blob_file_additions.empty());
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/blob/blob_file_cache.cc b/src/rocksdb/db/blob/blob_file_cache.cc
new file mode 100644
index 000000000..1a6cdf688
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_file_cache.cc
@@ -0,0 +1,102 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/blob/blob_file_cache.h"
+
+#include <cassert>
+#include <memory>
+
+#include "db/blob/blob_file_reader.h"
+#include "options/cf_options.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/slice.h"
+#include "test_util/sync_point.h"
+#include "trace_replay/io_tracer.h"
+#include "util/hash.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+BlobFileCache::BlobFileCache(Cache* cache,
+ const ImmutableOptions* immutable_options,
+ const FileOptions* file_options,
+ uint32_t column_family_id,
+ HistogramImpl* blob_file_read_hist,
+ const std::shared_ptr<IOTracer>& io_tracer)
+ : cache_(cache),
+ mutex_(kNumberOfMutexStripes, kGetSliceNPHash64UnseededFnPtr),
+ immutable_options_(immutable_options),
+ file_options_(file_options),
+ column_family_id_(column_family_id),
+ blob_file_read_hist_(blob_file_read_hist),
+ io_tracer_(io_tracer) {
+ assert(cache_);
+ assert(immutable_options_);
+ assert(file_options_);
+}
+
+Status BlobFileCache::GetBlobFileReader(
+ uint64_t blob_file_number,
+ CacheHandleGuard<BlobFileReader>* blob_file_reader) {
+ assert(blob_file_reader);
+ assert(blob_file_reader->IsEmpty());
+
+ const Slice key = GetSlice(&blob_file_number);
+
+ assert(cache_);
+
+ Cache::Handle* handle = cache_->Lookup(key);
+ if (handle) {
+ *blob_file_reader = CacheHandleGuard<BlobFileReader>(cache_, handle);
+ return Status::OK();
+ }
+
+ TEST_SYNC_POINT("BlobFileCache::GetBlobFileReader:DoubleCheck");
+
+ // Check again while holding mutex
+ MutexLock lock(mutex_.get(key));
+
+ handle = cache_->Lookup(key);
+ if (handle) {
+ *blob_file_reader = CacheHandleGuard<BlobFileReader>(cache_, handle);
+ return Status::OK();
+ }
+
+ assert(immutable_options_);
+ Statistics* const statistics = immutable_options_->stats;
+
+ RecordTick(statistics, NO_FILE_OPENS);
+
+ std::unique_ptr<BlobFileReader> reader;
+
+ {
+ assert(file_options_);
+ const Status s = BlobFileReader::Create(
+ *immutable_options_, *file_options_, column_family_id_,
+ blob_file_read_hist_, blob_file_number, io_tracer_, &reader);
+ if (!s.ok()) {
+ RecordTick(statistics, NO_FILE_ERRORS);
+ return s;
+ }
+ }
+
+ {
+ constexpr size_t charge = 1;
+
+ const Status s = cache_->Insert(key, reader.get(), charge,
+ &DeleteCacheEntry<BlobFileReader>, &handle);
+ if (!s.ok()) {
+ RecordTick(statistics, NO_FILE_ERRORS);
+ return s;
+ }
+ }
+
+ reader.release();
+
+ *blob_file_reader = CacheHandleGuard<BlobFileReader>(cache_, handle);
+
+ return Status::OK();
+}
+
+} // namespace ROCKSDB_NAMESPACE
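
Note (not part of the patch): GetBlobFileReader() above is a double-checked lookup. The Cache itself is thread-safe, so the first Lookup() takes no extra lock; the striped mutex only serializes the slow path, so a given blob file is opened by at most one thread while the others wait and then hit the cache. A rough, self-contained illustration of the same shape using toy types (plain std::mutex and std::unordered_map, not RocksDB's Cache or Striped<port::Mutex>):

#include <array>
#include <cstdint>
#include <functional>
#include <memory>
#include <mutex>
#include <unordered_map>

struct ToyReader {
  uint64_t file_number;
};

class ToyReaderCache {
 public:
  explicit ToyReaderCache(
      std::function<std::shared_ptr<ToyReader>(uint64_t)> open)
      : open_(std::move(open)) {}

  std::shared_ptr<ToyReader> Get(uint64_t file_number) {
    if (auto reader = Lookup(file_number)) {
      return reader;  // fast path: cache hit, no stripe lock taken
    }
    // Slow path: serialize per file so only one thread opens it.
    std::lock_guard<std::mutex> stripe_guard(stripes_[file_number % kStripes]);
    if (auto reader = Lookup(file_number)) {
      return reader;  // another thread opened it while we waited
    }
    std::shared_ptr<ToyReader> reader = open_(file_number);
    std::lock_guard<std::mutex> cache_guard(cache_mutex_);
    cache_[file_number] = reader;
    return reader;
  }

 private:
  std::shared_ptr<ToyReader> Lookup(uint64_t file_number) {
    std::lock_guard<std::mutex> cache_guard(cache_mutex_);
    auto it = cache_.find(file_number);
    if (it == cache_.end()) {
      return nullptr;
    }
    return it->second;
  }

  static constexpr size_t kStripes = 8;
  std::function<std::shared_ptr<ToyReader>(uint64_t)> open_;
  std::mutex cache_mutex_;
  std::unordered_map<uint64_t, std::shared_ptr<ToyReader>> cache_;
  std::array<std::mutex, kStripes> stripes_;
};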
diff --git a/src/rocksdb/db/blob/blob_file_cache.h b/src/rocksdb/db/blob/blob_file_cache.h
new file mode 100644
index 000000000..8eec05f18
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_file_cache.h
@@ -0,0 +1,52 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <cinttypes>
+
+#include "cache/cache_helpers.h"
+#include "rocksdb/rocksdb_namespace.h"
+#include "util/mutexlock.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Cache;
+struct ImmutableOptions;
+struct FileOptions;
+class HistogramImpl;
+class Status;
+class BlobFileReader;
+class Slice;
+class IOTracer;
+
+class BlobFileCache {
+ public:
+ BlobFileCache(Cache* cache, const ImmutableOptions* immutable_options,
+ const FileOptions* file_options, uint32_t column_family_id,
+ HistogramImpl* blob_file_read_hist,
+ const std::shared_ptr<IOTracer>& io_tracer);
+
+ BlobFileCache(const BlobFileCache&) = delete;
+ BlobFileCache& operator=(const BlobFileCache&) = delete;
+
+ Status GetBlobFileReader(uint64_t blob_file_number,
+ CacheHandleGuard<BlobFileReader>* blob_file_reader);
+
+ private:
+ Cache* cache_;
+ // Note: mutex_ below is used to guard against multiple threads racing to open
+ // the same file.
+ Striped<port::Mutex, Slice> mutex_;
+ const ImmutableOptions* immutable_options_;
+ const FileOptions* file_options_;
+ uint32_t column_family_id_;
+ HistogramImpl* blob_file_read_hist_;
+ std::shared_ptr<IOTracer> io_tracer_;
+
+ static constexpr size_t kNumberOfMutexStripes = 1 << 7;
+};
+
+} // namespace ROCKSDB_NAMESPACE
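
Note (not part of the patch): a call-site sketch for the header above, mirroring the cache test that follows. The CacheHandleGuard keeps the reader pinned in the backing cache for as long as the guard is alive; once it goes out of scope, the entry becomes evictable again. (Illustrative only; the actual blob read path lives in the BlobFileReader code included by blob_file_cache.cc.)

#include <cassert>
#include <cstdint>

#include "cache/cache_helpers.h"
#include "db/blob/blob_file_cache.h"
#include "db/blob/blob_file_reader.h"
#include "rocksdb/status.h"

using ROCKSDB_NAMESPACE::BlobFileCache;
using ROCKSDB_NAMESPACE::BlobFileReader;
using ROCKSDB_NAMESPACE::CacheHandleGuard;
using ROCKSDB_NAMESPACE::Status;

// Open (or fetch from cache) the reader for one blob file and hold it only
// for the scope of the guard.
Status PinReaderBriefly(BlobFileCache& blob_file_cache,
                        uint64_t blob_file_number) {
  CacheHandleGuard<BlobFileReader> reader;
  const Status s =
      blob_file_cache.GetBlobFileReader(blob_file_number, &reader);
  if (!s.ok()) {
    return s;  // e.g. IOError if the blob file does not exist
  }
  assert(reader.GetValue() != nullptr);
  // reader.GetValue() exposes the actual BlobFileReader used to fetch blobs.
  return Status::OK();
}  // guard destroyed here; the cached reader is unpinned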
diff --git a/src/rocksdb/db/blob/blob_file_cache_test.cc b/src/rocksdb/db/blob/blob_file_cache_test.cc
new file mode 100644
index 000000000..d3a61b3c5
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_file_cache_test.cc
@@ -0,0 +1,269 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/blob/blob_file_cache.h"
+
+#include <cassert>
+#include <string>
+
+#include "db/blob/blob_log_format.h"
+#include "db/blob/blob_log_writer.h"
+#include "env/mock_env.h"
+#include "file/filename.h"
+#include "file/read_write_util.h"
+#include "file/writable_file_writer.h"
+#include "options/cf_options.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/env.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/options.h"
+#include "rocksdb/statistics.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+
+// Creates a test blob file with a single blob in it.
+void WriteBlobFile(uint32_t column_family_id,
+ const ImmutableOptions& immutable_options,
+ uint64_t blob_file_number) {
+ assert(!immutable_options.cf_paths.empty());
+
+ const std::string blob_file_path =
+ BlobFileName(immutable_options.cf_paths.front().path, blob_file_number);
+
+ std::unique_ptr<FSWritableFile> file;
+ ASSERT_OK(NewWritableFile(immutable_options.fs.get(), blob_file_path, &file,
+ FileOptions()));
+
+ std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
+ std::move(file), blob_file_path, FileOptions(), immutable_options.clock));
+
+ constexpr Statistics* statistics = nullptr;
+ constexpr bool use_fsync = false;
+ constexpr bool do_flush = false;
+
+ BlobLogWriter blob_log_writer(std::move(file_writer), immutable_options.clock,
+ statistics, blob_file_number, use_fsync,
+ do_flush);
+
+ constexpr bool has_ttl = false;
+ constexpr ExpirationRange expiration_range;
+
+ BlobLogHeader header(column_family_id, kNoCompression, has_ttl,
+ expiration_range);
+
+ ASSERT_OK(blob_log_writer.WriteHeader(header));
+
+ constexpr char key[] = "key";
+ constexpr char blob[] = "blob";
+
+ std::string compressed_blob;
+
+ uint64_t key_offset = 0;
+ uint64_t blob_offset = 0;
+
+ ASSERT_OK(blob_log_writer.AddRecord(key, blob, &key_offset, &blob_offset));
+
+ BlobLogFooter footer;
+ footer.blob_count = 1;
+ footer.expiration_range = expiration_range;
+
+ std::string checksum_method;
+ std::string checksum_value;
+
+ ASSERT_OK(
+ blob_log_writer.AppendFooter(footer, &checksum_method, &checksum_value));
+}
+
+} // anonymous namespace
+
+class BlobFileCacheTest : public testing::Test {
+ protected:
+ BlobFileCacheTest() { mock_env_.reset(MockEnv::Create(Env::Default())); }
+
+ std::unique_ptr<Env> mock_env_;
+};
+
+TEST_F(BlobFileCacheTest, GetBlobFileReader) {
+ Options options;
+ options.env = mock_env_.get();
+ options.statistics = CreateDBStatistics();
+ options.cf_paths.emplace_back(
+ test::PerThreadDBPath(mock_env_.get(),
+ "BlobFileCacheTest_GetBlobFileReader"),
+ 0);
+ options.enable_blob_files = true;
+
+ constexpr uint32_t column_family_id = 1;
+ ImmutableOptions immutable_options(options);
+ constexpr uint64_t blob_file_number = 123;
+
+ WriteBlobFile(column_family_id, immutable_options, blob_file_number);
+
+ constexpr size_t capacity = 10;
+ std::shared_ptr<Cache> backing_cache = NewLRUCache(capacity);
+
+ FileOptions file_options;
+ constexpr HistogramImpl* blob_file_read_hist = nullptr;
+
+ BlobFileCache blob_file_cache(backing_cache.get(), &immutable_options,
+ &file_options, column_family_id,
+ blob_file_read_hist, nullptr /*IOTracer*/);
+
+ // First try: reader should be opened and put in cache
+ CacheHandleGuard<BlobFileReader> first;
+
+ ASSERT_OK(blob_file_cache.GetBlobFileReader(blob_file_number, &first));
+ ASSERT_NE(first.GetValue(), nullptr);
+ ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_OPENS), 1);
+ ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_ERRORS), 0);
+
+ // Second try: reader should be served from cache
+ CacheHandleGuard<BlobFileReader> second;
+
+ ASSERT_OK(blob_file_cache.GetBlobFileReader(blob_file_number, &second));
+ ASSERT_NE(second.GetValue(), nullptr);
+ ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_OPENS), 1);
+ ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_ERRORS), 0);
+
+ ASSERT_EQ(first.GetValue(), second.GetValue());
+}
+
+TEST_F(BlobFileCacheTest, GetBlobFileReader_Race) {
+ Options options;
+ options.env = mock_env_.get();
+ options.statistics = CreateDBStatistics();
+ options.cf_paths.emplace_back(
+ test::PerThreadDBPath(mock_env_.get(),
+ "BlobFileCacheTest_GetBlobFileReader_Race"),
+ 0);
+ options.enable_blob_files = true;
+
+ constexpr uint32_t column_family_id = 1;
+ ImmutableOptions immutable_options(options);
+ constexpr uint64_t blob_file_number = 123;
+
+ WriteBlobFile(column_family_id, immutable_options, blob_file_number);
+
+ constexpr size_t capacity = 10;
+ std::shared_ptr<Cache> backing_cache = NewLRUCache(capacity);
+
+ FileOptions file_options;
+ constexpr HistogramImpl* blob_file_read_hist = nullptr;
+
+ BlobFileCache blob_file_cache(backing_cache.get(), &immutable_options,
+ &file_options, column_family_id,
+ blob_file_read_hist, nullptr /*IOTracer*/);
+
+ CacheHandleGuard<BlobFileReader> first;
+ CacheHandleGuard<BlobFileReader> second;
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "BlobFileCache::GetBlobFileReader:DoubleCheck", [&](void* /* arg */) {
+ // Disabling sync points to prevent infinite recursion
+ SyncPoint::GetInstance()->DisableProcessing();
+
+ ASSERT_OK(blob_file_cache.GetBlobFileReader(blob_file_number, &second));
+ ASSERT_NE(second.GetValue(), nullptr);
+ ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_OPENS), 1);
+ ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_ERRORS), 0);
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(blob_file_cache.GetBlobFileReader(blob_file_number, &first));
+ ASSERT_NE(first.GetValue(), nullptr);
+ ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_OPENS), 1);
+ ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_ERRORS), 0);
+
+ ASSERT_EQ(first.GetValue(), second.GetValue());
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_F(BlobFileCacheTest, GetBlobFileReader_IOError) {
+ Options options;
+ options.env = mock_env_.get();
+ options.statistics = CreateDBStatistics();
+ options.cf_paths.emplace_back(
+ test::PerThreadDBPath(mock_env_.get(),
+ "BlobFileCacheTest_GetBlobFileReader_IOError"),
+ 0);
+ options.enable_blob_files = true;
+
+ constexpr size_t capacity = 10;
+ std::shared_ptr<Cache> backing_cache = NewLRUCache(capacity);
+
+ ImmutableOptions immutable_options(options);
+ FileOptions file_options;
+ constexpr uint32_t column_family_id = 1;
+ constexpr HistogramImpl* blob_file_read_hist = nullptr;
+
+ BlobFileCache blob_file_cache(backing_cache.get(), &immutable_options,
+ &file_options, column_family_id,
+ blob_file_read_hist, nullptr /*IOTracer*/);
+
+  // Note: no blob file exists with the number below
+ constexpr uint64_t blob_file_number = 123;
+
+ CacheHandleGuard<BlobFileReader> reader;
+
+ ASSERT_TRUE(
+ blob_file_cache.GetBlobFileReader(blob_file_number, &reader).IsIOError());
+ ASSERT_EQ(reader.GetValue(), nullptr);
+ ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_OPENS), 1);
+ ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_ERRORS), 1);
+}
+
+TEST_F(BlobFileCacheTest, GetBlobFileReader_CacheFull) {
+ Options options;
+ options.env = mock_env_.get();
+ options.statistics = CreateDBStatistics();
+ options.cf_paths.emplace_back(
+ test::PerThreadDBPath(mock_env_.get(),
+ "BlobFileCacheTest_GetBlobFileReader_CacheFull"),
+ 0);
+ options.enable_blob_files = true;
+
+ constexpr uint32_t column_family_id = 1;
+ ImmutableOptions immutable_options(options);
+ constexpr uint64_t blob_file_number = 123;
+
+ WriteBlobFile(column_family_id, immutable_options, blob_file_number);
+
+ constexpr size_t capacity = 0;
+ constexpr int num_shard_bits = -1; // determined automatically
+ constexpr bool strict_capacity_limit = true;
+ std::shared_ptr<Cache> backing_cache =
+ NewLRUCache(capacity, num_shard_bits, strict_capacity_limit);
+
+ FileOptions file_options;
+ constexpr HistogramImpl* blob_file_read_hist = nullptr;
+
+ BlobFileCache blob_file_cache(backing_cache.get(), &immutable_options,
+ &file_options, column_family_id,
+ blob_file_read_hist, nullptr /*IOTracer*/);
+
+  // Insertion into the cache should fail since it has zero capacity and
+  // strict_capacity_limit is set
+ CacheHandleGuard<BlobFileReader> reader;
+
+ ASSERT_TRUE(blob_file_cache.GetBlobFileReader(blob_file_number, &reader)
+ .IsMemoryLimit());
+ ASSERT_EQ(reader.GetValue(), nullptr);
+ ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_OPENS), 1);
+ ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_ERRORS), 1);
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/blob/blob_file_completion_callback.h b/src/rocksdb/db/blob/blob_file_completion_callback.h
new file mode 100644
index 000000000..ffe65a0ff
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_file_completion_callback.h
@@ -0,0 +1,101 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+
+#include "db/error_handler.h"
+#include "db/event_helpers.h"
+#include "file/sst_file_manager_impl.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class BlobFileCompletionCallback {
+ public:
+ BlobFileCompletionCallback(
+ SstFileManager* sst_file_manager, InstrumentedMutex* mutex,
+ ErrorHandler* error_handler, EventLogger* event_logger,
+ const std::vector<std::shared_ptr<EventListener>>& listeners,
+ const std::string& dbname)
+ : event_logger_(event_logger), listeners_(listeners), dbname_(dbname) {
+#ifndef ROCKSDB_LITE
+ sst_file_manager_ = sst_file_manager;
+ mutex_ = mutex;
+ error_handler_ = error_handler;
+#else
+ (void)sst_file_manager;
+ (void)mutex;
+ (void)error_handler;
+#endif // ROCKSDB_LITE
+ }
+
+ void OnBlobFileCreationStarted(const std::string& file_name,
+ const std::string& column_family_name,
+ int job_id,
+ BlobFileCreationReason creation_reason) {
+#ifndef ROCKSDB_LITE
+ // Notify the listeners.
+ EventHelpers::NotifyBlobFileCreationStarted(listeners_, dbname_,
+ column_family_name, file_name,
+ job_id, creation_reason);
+#else
+ (void)file_name;
+ (void)column_family_name;
+ (void)job_id;
+ (void)creation_reason;
+#endif
+ }
+
+ Status OnBlobFileCompleted(const std::string& file_name,
+ const std::string& column_family_name, int job_id,
+ uint64_t file_number,
+ BlobFileCreationReason creation_reason,
+ const Status& report_status,
+ const std::string& checksum_value,
+ const std::string& checksum_method,
+ uint64_t blob_count, uint64_t blob_bytes) {
+ Status s;
+
+#ifndef ROCKSDB_LITE
+ auto sfm = static_cast<SstFileManagerImpl*>(sst_file_manager_);
+ if (sfm) {
+ // Report new blob files to SstFileManagerImpl
+ s = sfm->OnAddFile(file_name);
+ if (sfm->IsMaxAllowedSpaceReached()) {
+ s = Status::SpaceLimit("Max allowed space was reached");
+ TEST_SYNC_POINT(
+ "BlobFileCompletionCallback::CallBack::MaxAllowedSpaceReached");
+ InstrumentedMutexLock l(mutex_);
+ error_handler_->SetBGError(s, BackgroundErrorReason::kFlush);
+ }
+ }
+#endif // !ROCKSDB_LITE
+
+ // Notify the listeners.
+ EventHelpers::LogAndNotifyBlobFileCreationFinished(
+ event_logger_, listeners_, dbname_, column_family_name, file_name,
+ job_id, file_number, creation_reason,
+ (!report_status.ok() ? report_status : s),
+ (checksum_value.empty() ? kUnknownFileChecksum : checksum_value),
+ (checksum_method.empty() ? kUnknownFileChecksumFuncName
+ : checksum_method),
+ blob_count, blob_bytes);
+ return s;
+ }
+
+ private:
+#ifndef ROCKSDB_LITE
+ SstFileManager* sst_file_manager_;
+ InstrumentedMutex* mutex_;
+ ErrorHandler* error_handler_;
+#endif // ROCKSDB_LITE
+ EventLogger* event_logger_;
+ std::vector<std::shared_ptr<EventListener>> listeners_;
+ std::string dbname_;
+};
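+
+// Editorial note: illustrative usage sketch only; it is not part of the
+// original change. The DB layer typically owns one callback per column family
+// and invokes it around blob file creation. The surrounding objects
+// (sst_file_manager, db_mutex, error_handler, event_logger, listeners, dbname,
+// file/cf names and counters) are placeholders assumed to exist in the caller.
+//
+//   BlobFileCompletionCallback callback(sst_file_manager, &db_mutex,
+//                                       &error_handler, &event_logger,
+//                                       listeners, dbname);
+//
+//   callback.OnBlobFileCreationStarted(file_name, cf_name, job_id,
+//                                      BlobFileCreationReason::kFlush);
+//   // ... write the blob file ...
+//   const Status s = callback.OnBlobFileCompleted(
+//       file_name, cf_name, job_id, file_number,
+//       BlobFileCreationReason::kFlush, Status::OK(), checksum_value,
+//       checksum_method, blob_count, blob_bytes);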
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_file_garbage.cc b/src/rocksdb/db/blob/blob_file_garbage.cc
new file mode 100644
index 000000000..52c336f49
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_file_garbage.cc
@@ -0,0 +1,134 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/blob/blob_file_garbage.h"
+
+#include <ostream>
+#include <sstream>
+
+#include "logging/event_logger.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+#include "test_util/sync_point.h"
+#include "util/coding.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Tags for custom fields. Note that these get persisted in the manifest,
+// so existing tags should not be modified.
+enum BlobFileGarbage::CustomFieldTags : uint32_t {
+ kEndMarker,
+
+ // Add forward compatible fields here
+
+ /////////////////////////////////////////////////////////////////////
+
+ kForwardIncompatibleMask = 1 << 6,
+
+ // Add forward incompatible fields here
+};
+
+void BlobFileGarbage::EncodeTo(std::string* output) const {
+ PutVarint64(output, blob_file_number_);
+ PutVarint64(output, garbage_blob_count_);
+ PutVarint64(output, garbage_blob_bytes_);
+
+ // Encode any custom fields here. The format to use is a Varint32 tag (see
+  // CustomFieldTags above) followed by a length-prefixed slice. Unknown custom
+ // fields will be ignored during decoding unless they're in the forward
+ // incompatible range.
+
+ TEST_SYNC_POINT_CALLBACK("BlobFileGarbage::EncodeTo::CustomFields", output);
+
+ PutVarint32(output, kEndMarker);
+}
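+
+// Editorial note: illustrative sketch of the custom field wire format described
+// above; it is not part of the original change. A forward compatible field is a
+// Varint32 tag below kForwardIncompatibleMask followed by a length-prefixed
+// slice, emitted before the end marker. With a hypothetical tag 2:
+//
+//   std::string encoded;
+//   PutVarint64(&encoded, blob_file_number);
+//   PutVarint64(&encoded, garbage_blob_count);
+//   PutVarint64(&encoded, garbage_blob_bytes);
+//   PutVarint32(&encoded, 2);                     // custom field tag
+//   PutLengthPrefixedSlice(&encoded, "payload");  // custom field value
+//   PutVarint32(&encoded, 0);                     // kEndMarker
+//
+// DecodeFrom below skips the unknown tag 2 because it is outside the
+// forward incompatible range (1 << 6 and above).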
+
+Status BlobFileGarbage::DecodeFrom(Slice* input) {
+ constexpr char class_name[] = "BlobFileGarbage";
+
+ if (!GetVarint64(input, &blob_file_number_)) {
+ return Status::Corruption(class_name, "Error decoding blob file number");
+ }
+
+ if (!GetVarint64(input, &garbage_blob_count_)) {
+ return Status::Corruption(class_name, "Error decoding garbage blob count");
+ }
+
+ if (!GetVarint64(input, &garbage_blob_bytes_)) {
+ return Status::Corruption(class_name, "Error decoding garbage blob bytes");
+ }
+
+ while (true) {
+ uint32_t custom_field_tag = 0;
+ if (!GetVarint32(input, &custom_field_tag)) {
+ return Status::Corruption(class_name, "Error decoding custom field tag");
+ }
+
+ if (custom_field_tag == kEndMarker) {
+ break;
+ }
+
+ if (custom_field_tag & kForwardIncompatibleMask) {
+ return Status::Corruption(
+ class_name, "Forward incompatible custom field encountered");
+ }
+
+ Slice custom_field_value;
+ if (!GetLengthPrefixedSlice(input, &custom_field_value)) {
+ return Status::Corruption(class_name,
+ "Error decoding custom field value");
+ }
+ }
+
+ return Status::OK();
+}
+
+std::string BlobFileGarbage::DebugString() const {
+ std::ostringstream oss;
+
+ oss << *this;
+
+ return oss.str();
+}
+
+std::string BlobFileGarbage::DebugJSON() const {
+ JSONWriter jw;
+
+ jw << *this;
+
+ jw.EndObject();
+
+ return jw.Get();
+}
+
+bool operator==(const BlobFileGarbage& lhs, const BlobFileGarbage& rhs) {
+ return lhs.GetBlobFileNumber() == rhs.GetBlobFileNumber() &&
+ lhs.GetGarbageBlobCount() == rhs.GetGarbageBlobCount() &&
+ lhs.GetGarbageBlobBytes() == rhs.GetGarbageBlobBytes();
+}
+
+bool operator!=(const BlobFileGarbage& lhs, const BlobFileGarbage& rhs) {
+ return !(lhs == rhs);
+}
+
+std::ostream& operator<<(std::ostream& os,
+ const BlobFileGarbage& blob_file_garbage) {
+ os << "blob_file_number: " << blob_file_garbage.GetBlobFileNumber()
+ << " garbage_blob_count: " << blob_file_garbage.GetGarbageBlobCount()
+ << " garbage_blob_bytes: " << blob_file_garbage.GetGarbageBlobBytes();
+
+ return os;
+}
+
+JSONWriter& operator<<(JSONWriter& jw,
+ const BlobFileGarbage& blob_file_garbage) {
+ jw << "BlobFileNumber" << blob_file_garbage.GetBlobFileNumber()
+ << "GarbageBlobCount" << blob_file_garbage.GetGarbageBlobCount()
+ << "GarbageBlobBytes" << blob_file_garbage.GetGarbageBlobBytes();
+
+ return jw;
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_file_garbage.h b/src/rocksdb/db/blob/blob_file_garbage.h
new file mode 100644
index 000000000..6dc14ddca
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_file_garbage.h
@@ -0,0 +1,57 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <cstdint>
+#include <iosfwd>
+#include <string>
+
+#include "db/blob/blob_constants.h"
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class JSONWriter;
+class Slice;
+class Status;
+
+class BlobFileGarbage {
+ public:
+ BlobFileGarbage() = default;
+
+ BlobFileGarbage(uint64_t blob_file_number, uint64_t garbage_blob_count,
+ uint64_t garbage_blob_bytes)
+ : blob_file_number_(blob_file_number),
+ garbage_blob_count_(garbage_blob_count),
+ garbage_blob_bytes_(garbage_blob_bytes) {}
+
+ uint64_t GetBlobFileNumber() const { return blob_file_number_; }
+ uint64_t GetGarbageBlobCount() const { return garbage_blob_count_; }
+ uint64_t GetGarbageBlobBytes() const { return garbage_blob_bytes_; }
+
+ void EncodeTo(std::string* output) const;
+ Status DecodeFrom(Slice* input);
+
+ std::string DebugString() const;
+ std::string DebugJSON() const;
+
+ private:
+ enum CustomFieldTags : uint32_t;
+
+ uint64_t blob_file_number_ = kInvalidBlobFileNumber;
+ uint64_t garbage_blob_count_ = 0;
+ uint64_t garbage_blob_bytes_ = 0;
+};
+
+bool operator==(const BlobFileGarbage& lhs, const BlobFileGarbage& rhs);
+bool operator!=(const BlobFileGarbage& lhs, const BlobFileGarbage& rhs);
+
+std::ostream& operator<<(std::ostream& os,
+ const BlobFileGarbage& blob_file_garbage);
+JSONWriter& operator<<(JSONWriter& jw,
+ const BlobFileGarbage& blob_file_garbage);
+
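+// Editorial note: minimal round-trip sketch for the interface above; it is not
+// part of the original change, and the values are arbitrary. These records are
+// persisted in the manifest, so the encoding must stay stable.
+//
+//   BlobFileGarbage garbage(/* blob_file_number */ 123,
+//                           /* garbage_blob_count */ 1,
+//                           /* garbage_blob_bytes */ 9876);
+//
+//   std::string encoded;
+//   garbage.EncodeTo(&encoded);
+//
+//   BlobFileGarbage decoded;
+//   Slice input(encoded);
+//   const Status s = decoded.DecodeFrom(&input);
+//   assert(s.ok() && decoded == garbage);
+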
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_file_garbage_test.cc b/src/rocksdb/db/blob/blob_file_garbage_test.cc
new file mode 100644
index 000000000..292a8b38a
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_file_garbage_test.cc
@@ -0,0 +1,174 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/blob/blob_file_garbage.h"
+
+#include <cstdint>
+#include <cstring>
+#include <string>
+
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "util/coding.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class BlobFileGarbageTest : public testing::Test {
+ public:
+ static void TestEncodeDecode(const BlobFileGarbage& blob_file_garbage) {
+ std::string encoded;
+ blob_file_garbage.EncodeTo(&encoded);
+
+ BlobFileGarbage decoded;
+ Slice input(encoded);
+ ASSERT_OK(decoded.DecodeFrom(&input));
+
+ ASSERT_EQ(blob_file_garbage, decoded);
+ }
+};
+
+TEST_F(BlobFileGarbageTest, Empty) {
+ BlobFileGarbage blob_file_garbage;
+
+ ASSERT_EQ(blob_file_garbage.GetBlobFileNumber(), kInvalidBlobFileNumber);
+ ASSERT_EQ(blob_file_garbage.GetGarbageBlobCount(), 0);
+ ASSERT_EQ(blob_file_garbage.GetGarbageBlobBytes(), 0);
+
+ TestEncodeDecode(blob_file_garbage);
+}
+
+TEST_F(BlobFileGarbageTest, NonEmpty) {
+ constexpr uint64_t blob_file_number = 123;
+ constexpr uint64_t garbage_blob_count = 1;
+ constexpr uint64_t garbage_blob_bytes = 9876;
+
+ BlobFileGarbage blob_file_garbage(blob_file_number, garbage_blob_count,
+ garbage_blob_bytes);
+
+ ASSERT_EQ(blob_file_garbage.GetBlobFileNumber(), blob_file_number);
+ ASSERT_EQ(blob_file_garbage.GetGarbageBlobCount(), garbage_blob_count);
+ ASSERT_EQ(blob_file_garbage.GetGarbageBlobBytes(), garbage_blob_bytes);
+
+ TestEncodeDecode(blob_file_garbage);
+}
+
+TEST_F(BlobFileGarbageTest, DecodeErrors) {
+ std::string str;
+ Slice slice(str);
+
+ BlobFileGarbage blob_file_garbage;
+
+ {
+ const Status s = blob_file_garbage.DecodeFrom(&slice);
+ ASSERT_TRUE(s.IsCorruption());
+ ASSERT_TRUE(std::strstr(s.getState(), "blob file number"));
+ }
+
+ constexpr uint64_t blob_file_number = 123;
+ PutVarint64(&str, blob_file_number);
+ slice = str;
+
+ {
+ const Status s = blob_file_garbage.DecodeFrom(&slice);
+ ASSERT_TRUE(s.IsCorruption());
+ ASSERT_TRUE(std::strstr(s.getState(), "garbage blob count"));
+ }
+
+ constexpr uint64_t garbage_blob_count = 4567;
+ PutVarint64(&str, garbage_blob_count);
+ slice = str;
+
+ {
+ const Status s = blob_file_garbage.DecodeFrom(&slice);
+ ASSERT_TRUE(s.IsCorruption());
+ ASSERT_TRUE(std::strstr(s.getState(), "garbage blob bytes"));
+ }
+
+ constexpr uint64_t garbage_blob_bytes = 12345678;
+ PutVarint64(&str, garbage_blob_bytes);
+ slice = str;
+
+ {
+ const Status s = blob_file_garbage.DecodeFrom(&slice);
+ ASSERT_TRUE(s.IsCorruption());
+ ASSERT_TRUE(std::strstr(s.getState(), "custom field tag"));
+ }
+
+ constexpr uint32_t custom_tag = 2;
+ PutVarint32(&str, custom_tag);
+ slice = str;
+
+ {
+ const Status s = blob_file_garbage.DecodeFrom(&slice);
+ ASSERT_TRUE(s.IsCorruption());
+ ASSERT_TRUE(std::strstr(s.getState(), "custom field value"));
+ }
+}
+
+TEST_F(BlobFileGarbageTest, ForwardCompatibleCustomField) {
+ SyncPoint::GetInstance()->SetCallBack(
+ "BlobFileGarbage::EncodeTo::CustomFields", [&](void* arg) {
+ std::string* output = static_cast<std::string*>(arg);
+
+ constexpr uint32_t forward_compatible_tag = 2;
+ PutVarint32(output, forward_compatible_tag);
+
+ PutLengthPrefixedSlice(output, "deadbeef");
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ constexpr uint64_t blob_file_number = 678;
+ constexpr uint64_t garbage_blob_count = 9999;
+ constexpr uint64_t garbage_blob_bytes = 100000000;
+
+ BlobFileGarbage blob_file_garbage(blob_file_number, garbage_blob_count,
+ garbage_blob_bytes);
+
+ TestEncodeDecode(blob_file_garbage);
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_F(BlobFileGarbageTest, ForwardIncompatibleCustomField) {
+ SyncPoint::GetInstance()->SetCallBack(
+ "BlobFileGarbage::EncodeTo::CustomFields", [&](void* arg) {
+ std::string* output = static_cast<std::string*>(arg);
+
+ constexpr uint32_t forward_incompatible_tag = (1 << 6) + 1;
+ PutVarint32(output, forward_incompatible_tag);
+
+ PutLengthPrefixedSlice(output, "foobar");
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ constexpr uint64_t blob_file_number = 456;
+ constexpr uint64_t garbage_blob_count = 100;
+ constexpr uint64_t garbage_blob_bytes = 2000000;
+
+ BlobFileGarbage blob_file_garbage(blob_file_number, garbage_blob_count,
+ garbage_blob_bytes);
+
+ std::string encoded;
+ blob_file_garbage.EncodeTo(&encoded);
+
+ BlobFileGarbage decoded_blob_file_addition;
+ Slice input(encoded);
+ const Status s = decoded_blob_file_addition.DecodeFrom(&input);
+
+ ASSERT_TRUE(s.IsCorruption());
+ ASSERT_TRUE(std::strstr(s.getState(), "Forward incompatible"));
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/blob/blob_file_meta.cc b/src/rocksdb/db/blob/blob_file_meta.cc
new file mode 100644
index 000000000..4913137e5
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_file_meta.cc
@@ -0,0 +1,62 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/blob/blob_file_meta.h"
+
+#include <ostream>
+#include <sstream>
+
+#include "db/blob/blob_log_format.h"
+#include "rocksdb/slice.h"
+
+namespace ROCKSDB_NAMESPACE {
+uint64_t SharedBlobFileMetaData::GetBlobFileSize() const {
+ return BlobLogHeader::kSize + total_blob_bytes_ + BlobLogFooter::kSize;
+}
+
+std::string SharedBlobFileMetaData::DebugString() const {
+ std::ostringstream oss;
+ oss << (*this);
+
+ return oss.str();
+}
+
+std::ostream& operator<<(std::ostream& os,
+ const SharedBlobFileMetaData& shared_meta) {
+ os << "blob_file_number: " << shared_meta.GetBlobFileNumber()
+ << " total_blob_count: " << shared_meta.GetTotalBlobCount()
+ << " total_blob_bytes: " << shared_meta.GetTotalBlobBytes()
+ << " checksum_method: " << shared_meta.GetChecksumMethod()
+ << " checksum_value: "
+ << Slice(shared_meta.GetChecksumValue()).ToString(/* hex */ true);
+
+ return os;
+}
+
+std::string BlobFileMetaData::DebugString() const {
+ std::ostringstream oss;
+ oss << (*this);
+
+ return oss.str();
+}
+
+std::ostream& operator<<(std::ostream& os, const BlobFileMetaData& meta) {
+ const auto& shared_meta = meta.GetSharedMeta();
+ assert(shared_meta);
+ os << (*shared_meta);
+
+ os << " linked_ssts: {";
+ for (uint64_t file_number : meta.GetLinkedSsts()) {
+ os << ' ' << file_number;
+ }
+ os << " }";
+
+ os << " garbage_blob_count: " << meta.GetGarbageBlobCount()
+ << " garbage_blob_bytes: " << meta.GetGarbageBlobBytes();
+
+ return os;
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_file_meta.h b/src/rocksdb/db/blob/blob_file_meta.h
new file mode 100644
index 000000000..d7c8a1243
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_file_meta.h
@@ -0,0 +1,170 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <cassert>
+#include <iosfwd>
+#include <memory>
+#include <string>
+#include <unordered_set>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// SharedBlobFileMetaData represents the immutable part of blob files' metadata,
+// like the blob file number, total number and size of blobs, or checksum
+// method and value. There is supposed to be one object of this class per blob
+// file (shared across all versions that include the blob file in question);
+// hence, the type is neither copyable nor movable. A blob file can be marked
+// obsolete when the corresponding SharedBlobFileMetaData object is destroyed.
+
+class SharedBlobFileMetaData {
+ public:
+ static std::shared_ptr<SharedBlobFileMetaData> Create(
+ uint64_t blob_file_number, uint64_t total_blob_count,
+ uint64_t total_blob_bytes, std::string checksum_method,
+ std::string checksum_value) {
+ return std::shared_ptr<SharedBlobFileMetaData>(new SharedBlobFileMetaData(
+ blob_file_number, total_blob_count, total_blob_bytes,
+ std::move(checksum_method), std::move(checksum_value)));
+ }
+
+ template <typename Deleter>
+ static std::shared_ptr<SharedBlobFileMetaData> Create(
+ uint64_t blob_file_number, uint64_t total_blob_count,
+ uint64_t total_blob_bytes, std::string checksum_method,
+ std::string checksum_value, Deleter deleter) {
+ return std::shared_ptr<SharedBlobFileMetaData>(
+ new SharedBlobFileMetaData(blob_file_number, total_blob_count,
+ total_blob_bytes, std::move(checksum_method),
+ std::move(checksum_value)),
+ deleter);
+ }
+
+ SharedBlobFileMetaData(const SharedBlobFileMetaData&) = delete;
+ SharedBlobFileMetaData& operator=(const SharedBlobFileMetaData&) = delete;
+
+ SharedBlobFileMetaData(SharedBlobFileMetaData&&) = delete;
+ SharedBlobFileMetaData& operator=(SharedBlobFileMetaData&&) = delete;
+
+ uint64_t GetBlobFileSize() const;
+ uint64_t GetBlobFileNumber() const { return blob_file_number_; }
+ uint64_t GetTotalBlobCount() const { return total_blob_count_; }
+ uint64_t GetTotalBlobBytes() const { return total_blob_bytes_; }
+ const std::string& GetChecksumMethod() const { return checksum_method_; }
+ const std::string& GetChecksumValue() const { return checksum_value_; }
+
+ std::string DebugString() const;
+
+ private:
+ SharedBlobFileMetaData(uint64_t blob_file_number, uint64_t total_blob_count,
+ uint64_t total_blob_bytes, std::string checksum_method,
+ std::string checksum_value)
+ : blob_file_number_(blob_file_number),
+ total_blob_count_(total_blob_count),
+ total_blob_bytes_(total_blob_bytes),
+ checksum_method_(std::move(checksum_method)),
+ checksum_value_(std::move(checksum_value)) {
+ assert(checksum_method_.empty() == checksum_value_.empty());
+ }
+
+ uint64_t blob_file_number_;
+ uint64_t total_blob_count_;
+ uint64_t total_blob_bytes_;
+ std::string checksum_method_;
+ std::string checksum_value_;
+};
+
+std::ostream& operator<<(std::ostream& os,
+ const SharedBlobFileMetaData& shared_meta);
+
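+// Editorial note: illustrative sketch only; it is not part of the original
+// change. A custom deleter can be passed to Create() so that the blob file is
+// marked obsolete once the last version referencing it is gone;
+// MarkBlobFileObsolete is a placeholder for that cleanup.
+//
+//   auto shared_meta = SharedBlobFileMetaData::Create(
+//       /* blob_file_number */ 123, /* total_blob_count */ 10,
+//       /* total_blob_bytes */ 4096, /* checksum_method */ std::string(),
+//       /* checksum_value */ std::string(),
+//       [](SharedBlobFileMetaData* meta) {
+//         MarkBlobFileObsolete(meta->GetBlobFileNumber());
+//         delete meta;
+//       });
+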
+// BlobFileMetaData contains the part of the metadata for blob files that can
+// vary across versions, like the amount of garbage in the blob file. In
+// addition, BlobFileMetaData objects point to and share the ownership of the
+// SharedBlobFileMetaData object for the corresponding blob file. Similarly to
+// SharedBlobFileMetaData, BlobFileMetaData objects are neither copyable nor
+// movable. They are meant to be jointly owned by the versions in which the
+// blob file has the same (immutable *and* mutable) state.
+
+class BlobFileMetaData {
+ public:
+ using LinkedSsts = std::unordered_set<uint64_t>;
+
+ static std::shared_ptr<BlobFileMetaData> Create(
+ std::shared_ptr<SharedBlobFileMetaData> shared_meta,
+ LinkedSsts linked_ssts, uint64_t garbage_blob_count,
+ uint64_t garbage_blob_bytes) {
+ return std::shared_ptr<BlobFileMetaData>(
+ new BlobFileMetaData(std::move(shared_meta), std::move(linked_ssts),
+ garbage_blob_count, garbage_blob_bytes));
+ }
+
+ BlobFileMetaData(const BlobFileMetaData&) = delete;
+ BlobFileMetaData& operator=(const BlobFileMetaData&) = delete;
+
+ BlobFileMetaData(BlobFileMetaData&&) = delete;
+ BlobFileMetaData& operator=(BlobFileMetaData&&) = delete;
+
+ const std::shared_ptr<SharedBlobFileMetaData>& GetSharedMeta() const {
+ return shared_meta_;
+ }
+
+ uint64_t GetBlobFileSize() const {
+ assert(shared_meta_);
+ return shared_meta_->GetBlobFileSize();
+ }
+
+ uint64_t GetBlobFileNumber() const {
+ assert(shared_meta_);
+ return shared_meta_->GetBlobFileNumber();
+ }
+ uint64_t GetTotalBlobCount() const {
+ assert(shared_meta_);
+ return shared_meta_->GetTotalBlobCount();
+ }
+ uint64_t GetTotalBlobBytes() const {
+ assert(shared_meta_);
+ return shared_meta_->GetTotalBlobBytes();
+ }
+ const std::string& GetChecksumMethod() const {
+ assert(shared_meta_);
+ return shared_meta_->GetChecksumMethod();
+ }
+ const std::string& GetChecksumValue() const {
+ assert(shared_meta_);
+ return shared_meta_->GetChecksumValue();
+ }
+
+ const LinkedSsts& GetLinkedSsts() const { return linked_ssts_; }
+
+ uint64_t GetGarbageBlobCount() const { return garbage_blob_count_; }
+ uint64_t GetGarbageBlobBytes() const { return garbage_blob_bytes_; }
+
+ std::string DebugString() const;
+
+ private:
+ BlobFileMetaData(std::shared_ptr<SharedBlobFileMetaData> shared_meta,
+ LinkedSsts linked_ssts, uint64_t garbage_blob_count,
+ uint64_t garbage_blob_bytes)
+ : shared_meta_(std::move(shared_meta)),
+ linked_ssts_(std::move(linked_ssts)),
+ garbage_blob_count_(garbage_blob_count),
+ garbage_blob_bytes_(garbage_blob_bytes) {
+ assert(shared_meta_);
+ assert(garbage_blob_count_ <= shared_meta_->GetTotalBlobCount());
+ assert(garbage_blob_bytes_ <= shared_meta_->GetTotalBlobBytes());
+ }
+
+ std::shared_ptr<SharedBlobFileMetaData> shared_meta_;
+ LinkedSsts linked_ssts_;
+ uint64_t garbage_blob_count_;
+ uint64_t garbage_blob_bytes_;
+};
+
+std::ostream& operator<<(std::ostream& os, const BlobFileMetaData& meta);
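+
+// Editorial note: illustrative sketch only (not part of the original change),
+// building on the SharedBlobFileMetaData example above. A version-specific
+// BlobFileMetaData shares ownership of the immutable metadata and records the
+// SSTs currently linked to the blob file plus the garbage accumulated so far.
+//
+//   BlobFileMetaData::LinkedSsts linked_ssts{7, 8};
+//   auto meta = BlobFileMetaData::Create(shared_meta, std::move(linked_ssts),
+//                                        /* garbage_blob_count */ 2,
+//                                        /* garbage_blob_bytes */ 512);
+//   assert(meta->GetBlobFileNumber() == 123);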
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_file_reader.cc b/src/rocksdb/db/blob/blob_file_reader.cc
new file mode 100644
index 000000000..a4eabb605
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_file_reader.cc
@@ -0,0 +1,610 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/blob/blob_file_reader.h"
+
+#include <cassert>
+#include <string>
+
+#include "db/blob/blob_contents.h"
+#include "db/blob/blob_log_format.h"
+#include "file/file_prefetch_buffer.h"
+#include "file/filename.h"
+#include "monitoring/statistics.h"
+#include "options/cf_options.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+#include "table/multiget_context.h"
+#include "test_util/sync_point.h"
+#include "util/compression.h"
+#include "util/crc32c.h"
+#include "util/stop_watch.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+Status BlobFileReader::Create(
+ const ImmutableOptions& immutable_options, const FileOptions& file_options,
+ uint32_t column_family_id, HistogramImpl* blob_file_read_hist,
+ uint64_t blob_file_number, const std::shared_ptr<IOTracer>& io_tracer,
+ std::unique_ptr<BlobFileReader>* blob_file_reader) {
+ assert(blob_file_reader);
+ assert(!*blob_file_reader);
+
+ uint64_t file_size = 0;
+ std::unique_ptr<RandomAccessFileReader> file_reader;
+
+ {
+ const Status s =
+ OpenFile(immutable_options, file_options, blob_file_read_hist,
+ blob_file_number, io_tracer, &file_size, &file_reader);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ assert(file_reader);
+
+ Statistics* const statistics = immutable_options.stats;
+
+ CompressionType compression_type = kNoCompression;
+
+ {
+ const Status s = ReadHeader(file_reader.get(), column_family_id, statistics,
+ &compression_type);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ {
+ const Status s = ReadFooter(file_reader.get(), file_size, statistics);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ blob_file_reader->reset(
+ new BlobFileReader(std::move(file_reader), file_size, compression_type,
+ immutable_options.clock, statistics));
+
+ return Status::OK();
+}
+
+Status BlobFileReader::OpenFile(
+ const ImmutableOptions& immutable_options, const FileOptions& file_opts,
+ HistogramImpl* blob_file_read_hist, uint64_t blob_file_number,
+ const std::shared_ptr<IOTracer>& io_tracer, uint64_t* file_size,
+ std::unique_ptr<RandomAccessFileReader>* file_reader) {
+ assert(file_size);
+ assert(file_reader);
+
+ const auto& cf_paths = immutable_options.cf_paths;
+ assert(!cf_paths.empty());
+
+ const std::string blob_file_path =
+ BlobFileName(cf_paths.front().path, blob_file_number);
+
+ FileSystem* const fs = immutable_options.fs.get();
+ assert(fs);
+
+ constexpr IODebugContext* dbg = nullptr;
+
+ {
+ TEST_SYNC_POINT("BlobFileReader::OpenFile:GetFileSize");
+
+ const Status s =
+ fs->GetFileSize(blob_file_path, IOOptions(), file_size, dbg);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ if (*file_size < BlobLogHeader::kSize + BlobLogFooter::kSize) {
+ return Status::Corruption("Malformed blob file");
+ }
+
+ std::unique_ptr<FSRandomAccessFile> file;
+
+ {
+ TEST_SYNC_POINT("BlobFileReader::OpenFile:NewRandomAccessFile");
+
+ const Status s =
+ fs->NewRandomAccessFile(blob_file_path, file_opts, &file, dbg);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ assert(file);
+
+ if (immutable_options.advise_random_on_open) {
+ file->Hint(FSRandomAccessFile::kRandom);
+ }
+
+ file_reader->reset(new RandomAccessFileReader(
+ std::move(file), blob_file_path, immutable_options.clock, io_tracer,
+ immutable_options.stats, BLOB_DB_BLOB_FILE_READ_MICROS,
+ blob_file_read_hist, immutable_options.rate_limiter.get(),
+ immutable_options.listeners));
+
+ return Status::OK();
+}
+
+Status BlobFileReader::ReadHeader(const RandomAccessFileReader* file_reader,
+ uint32_t column_family_id,
+ Statistics* statistics,
+ CompressionType* compression_type) {
+ assert(file_reader);
+ assert(compression_type);
+
+ Slice header_slice;
+ Buffer buf;
+ AlignedBuf aligned_buf;
+
+ {
+ TEST_SYNC_POINT("BlobFileReader::ReadHeader:ReadFromFile");
+
+ constexpr uint64_t read_offset = 0;
+ constexpr size_t read_size = BlobLogHeader::kSize;
+
+ // TODO: rate limit reading headers from blob files.
+ const Status s = ReadFromFile(file_reader, read_offset, read_size,
+ statistics, &header_slice, &buf, &aligned_buf,
+ Env::IO_TOTAL /* rate_limiter_priority */);
+ if (!s.ok()) {
+ return s;
+ }
+
+ TEST_SYNC_POINT_CALLBACK("BlobFileReader::ReadHeader:TamperWithResult",
+ &header_slice);
+ }
+
+ BlobLogHeader header;
+
+ {
+ const Status s = header.DecodeFrom(header_slice);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ constexpr ExpirationRange no_expiration_range;
+
+ if (header.has_ttl || header.expiration_range != no_expiration_range) {
+ return Status::Corruption("Unexpected TTL blob file");
+ }
+
+ if (header.column_family_id != column_family_id) {
+ return Status::Corruption("Column family ID mismatch");
+ }
+
+ *compression_type = header.compression;
+
+ return Status::OK();
+}
+
+Status BlobFileReader::ReadFooter(const RandomAccessFileReader* file_reader,
+ uint64_t file_size, Statistics* statistics) {
+ assert(file_size >= BlobLogHeader::kSize + BlobLogFooter::kSize);
+ assert(file_reader);
+
+ Slice footer_slice;
+ Buffer buf;
+ AlignedBuf aligned_buf;
+
+ {
+ TEST_SYNC_POINT("BlobFileReader::ReadFooter:ReadFromFile");
+
+ const uint64_t read_offset = file_size - BlobLogFooter::kSize;
+ constexpr size_t read_size = BlobLogFooter::kSize;
+
+ // TODO: rate limit reading footers from blob files.
+ const Status s = ReadFromFile(file_reader, read_offset, read_size,
+ statistics, &footer_slice, &buf, &aligned_buf,
+ Env::IO_TOTAL /* rate_limiter_priority */);
+ if (!s.ok()) {
+ return s;
+ }
+
+ TEST_SYNC_POINT_CALLBACK("BlobFileReader::ReadFooter:TamperWithResult",
+ &footer_slice);
+ }
+
+ BlobLogFooter footer;
+
+ {
+ const Status s = footer.DecodeFrom(footer_slice);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ constexpr ExpirationRange no_expiration_range;
+
+ if (footer.expiration_range != no_expiration_range) {
+ return Status::Corruption("Unexpected TTL blob file");
+ }
+
+ return Status::OK();
+}
+
+Status BlobFileReader::ReadFromFile(const RandomAccessFileReader* file_reader,
+ uint64_t read_offset, size_t read_size,
+ Statistics* statistics, Slice* slice,
+ Buffer* buf, AlignedBuf* aligned_buf,
+ Env::IOPriority rate_limiter_priority) {
+ assert(slice);
+ assert(buf);
+ assert(aligned_buf);
+
+ assert(file_reader);
+
+ RecordTick(statistics, BLOB_DB_BLOB_FILE_BYTES_READ, read_size);
+
+ Status s;
+
+ if (file_reader->use_direct_io()) {
+ constexpr char* scratch = nullptr;
+
+ s = file_reader->Read(IOOptions(), read_offset, read_size, slice, scratch,
+ aligned_buf, rate_limiter_priority);
+ } else {
+ buf->reset(new char[read_size]);
+ constexpr AlignedBuf* aligned_scratch = nullptr;
+
+ s = file_reader->Read(IOOptions(), read_offset, read_size, slice,
+ buf->get(), aligned_scratch, rate_limiter_priority);
+ }
+
+ if (!s.ok()) {
+ return s;
+ }
+
+ if (slice->size() != read_size) {
+ return Status::Corruption("Failed to read data from blob file");
+ }
+
+ return Status::OK();
+}
+
+BlobFileReader::BlobFileReader(
+ std::unique_ptr<RandomAccessFileReader>&& file_reader, uint64_t file_size,
+ CompressionType compression_type, SystemClock* clock,
+ Statistics* statistics)
+ : file_reader_(std::move(file_reader)),
+ file_size_(file_size),
+ compression_type_(compression_type),
+ clock_(clock),
+ statistics_(statistics) {
+ assert(file_reader_);
+}
+
+BlobFileReader::~BlobFileReader() = default;
+
+Status BlobFileReader::GetBlob(
+ const ReadOptions& read_options, const Slice& user_key, uint64_t offset,
+ uint64_t value_size, CompressionType compression_type,
+ FilePrefetchBuffer* prefetch_buffer, MemoryAllocator* allocator,
+ std::unique_ptr<BlobContents>* result, uint64_t* bytes_read) const {
+ assert(result);
+
+ const uint64_t key_size = user_key.size();
+
+ if (!IsValidBlobOffset(offset, key_size, value_size, file_size_)) {
+ return Status::Corruption("Invalid blob offset");
+ }
+
+ if (compression_type != compression_type_) {
+ return Status::Corruption("Compression type mismatch when reading blob");
+ }
+
+  // Note: if verify_checksums is set, we read the entire blob record so that
+  // the checksum can be verified; otherwise, we only read the blob itself.
+  // Since the offset in BlobIndex actually points to the blob value, we need
+  // to make an adjustment in the former case.
+ const uint64_t adjustment =
+ read_options.verify_checksums
+ ? BlobLogRecord::CalculateAdjustmentForRecordHeader(key_size)
+ : 0;
+ assert(offset >= adjustment);
+
+ const uint64_t record_offset = offset - adjustment;
+ const uint64_t record_size = value_size + adjustment;
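+
+  // Editorial note (illustrative): assuming the adjustment equals the record
+  // header size plus key_size, a read of a 10-byte blob with a 4-byte key and
+  // verify_checksums enabled fetches kHeaderSize + 4 + 10 bytes starting
+  // kHeaderSize + 4 bytes before the blob value; without checksum verification
+  // it fetches exactly 10 bytes at `offset`.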
+
+ Slice record_slice;
+ Buffer buf;
+ AlignedBuf aligned_buf;
+
+ bool prefetched = false;
+
+ if (prefetch_buffer) {
+ Status s;
+ constexpr bool for_compaction = true;
+
+ prefetched = prefetch_buffer->TryReadFromCache(
+ IOOptions(), file_reader_.get(), record_offset,
+ static_cast<size_t>(record_size), &record_slice, &s,
+ read_options.rate_limiter_priority, for_compaction);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ if (!prefetched) {
+ TEST_SYNC_POINT("BlobFileReader::GetBlob:ReadFromFile");
+ PERF_COUNTER_ADD(blob_read_count, 1);
+ PERF_COUNTER_ADD(blob_read_byte, record_size);
+ PERF_TIMER_GUARD(blob_read_time);
+ const Status s = ReadFromFile(file_reader_.get(), record_offset,
+ static_cast<size_t>(record_size), statistics_,
+ &record_slice, &buf, &aligned_buf,
+ read_options.rate_limiter_priority);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ TEST_SYNC_POINT_CALLBACK("BlobFileReader::GetBlob:TamperWithResult",
+ &record_slice);
+
+ if (read_options.verify_checksums) {
+ const Status s = VerifyBlob(record_slice, user_key, value_size);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ const Slice value_slice(record_slice.data() + adjustment, value_size);
+
+ {
+ const Status s = UncompressBlobIfNeeded(
+ value_slice, compression_type, allocator, clock_, statistics_, result);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ if (bytes_read) {
+ *bytes_read = record_size;
+ }
+
+ return Status::OK();
+}
+
+void BlobFileReader::MultiGetBlob(
+ const ReadOptions& read_options, MemoryAllocator* allocator,
+ autovector<std::pair<BlobReadRequest*, std::unique_ptr<BlobContents>>>&
+ blob_reqs,
+ uint64_t* bytes_read) const {
+ const size_t num_blobs = blob_reqs.size();
+ assert(num_blobs > 0);
+ assert(num_blobs <= MultiGetContext::MAX_BATCH_SIZE);
+
+#ifndef NDEBUG
+ for (size_t i = 0; i < num_blobs - 1; ++i) {
+ assert(blob_reqs[i].first->offset <= blob_reqs[i + 1].first->offset);
+ }
+#endif // !NDEBUG
+
+ std::vector<FSReadRequest> read_reqs;
+ autovector<uint64_t> adjustments;
+ uint64_t total_len = 0;
+ read_reqs.reserve(num_blobs);
+ for (size_t i = 0; i < num_blobs; ++i) {
+ BlobReadRequest* const req = blob_reqs[i].first;
+ assert(req);
+ assert(req->user_key);
+ assert(req->status);
+
+ const size_t key_size = req->user_key->size();
+ const uint64_t offset = req->offset;
+ const uint64_t value_size = req->len;
+
+ if (!IsValidBlobOffset(offset, key_size, value_size, file_size_)) {
+ *req->status = Status::Corruption("Invalid blob offset");
+ continue;
+ }
+ if (req->compression != compression_type_) {
+ *req->status =
+ Status::Corruption("Compression type mismatch when reading a blob");
+ continue;
+ }
+
+ const uint64_t adjustment =
+ read_options.verify_checksums
+ ? BlobLogRecord::CalculateAdjustmentForRecordHeader(key_size)
+ : 0;
+ assert(req->offset >= adjustment);
+ adjustments.push_back(adjustment);
+
+ FSReadRequest read_req = {};
+ read_req.offset = req->offset - adjustment;
+ read_req.len = req->len + adjustment;
+ read_reqs.emplace_back(read_req);
+ total_len += read_req.len;
+ }
+
+ RecordTick(statistics_, BLOB_DB_BLOB_FILE_BYTES_READ, total_len);
+
+ Buffer buf;
+ AlignedBuf aligned_buf;
+
+ Status s;
+ bool direct_io = file_reader_->use_direct_io();
+ if (direct_io) {
+ for (size_t i = 0; i < read_reqs.size(); ++i) {
+ read_reqs[i].scratch = nullptr;
+ }
+ } else {
+ buf.reset(new char[total_len]);
+ std::ptrdiff_t pos = 0;
+ for (size_t i = 0; i < read_reqs.size(); ++i) {
+ read_reqs[i].scratch = buf.get() + pos;
+ pos += read_reqs[i].len;
+ }
+ }
+ TEST_SYNC_POINT("BlobFileReader::MultiGetBlob:ReadFromFile");
+ PERF_COUNTER_ADD(blob_read_count, num_blobs);
+ PERF_COUNTER_ADD(blob_read_byte, total_len);
+ s = file_reader_->MultiRead(IOOptions(), read_reqs.data(), read_reqs.size(),
+ direct_io ? &aligned_buf : nullptr,
+ read_options.rate_limiter_priority);
+ if (!s.ok()) {
+ for (auto& req : read_reqs) {
+ req.status.PermitUncheckedError();
+ }
+ for (auto& blob_req : blob_reqs) {
+ BlobReadRequest* const req = blob_req.first;
+ assert(req);
+ assert(req->status);
+
+ if (!req->status->IsCorruption()) {
+ // Avoid overwriting corruption status.
+ *req->status = s;
+ }
+ }
+ return;
+ }
+
+ assert(s.ok());
+
+ uint64_t total_bytes = 0;
+ for (size_t i = 0, j = 0; i < num_blobs; ++i) {
+ BlobReadRequest* const req = blob_reqs[i].first;
+ assert(req);
+ assert(req->user_key);
+ assert(req->status);
+
+ if (!req->status->ok()) {
+ continue;
+ }
+
+ assert(j < read_reqs.size());
+ auto& read_req = read_reqs[j++];
+ const auto& record_slice = read_req.result;
+ if (read_req.status.ok() && record_slice.size() != read_req.len) {
+ read_req.status =
+ IOStatus::Corruption("Failed to read data from blob file");
+ }
+
+ *req->status = read_req.status;
+ if (!req->status->ok()) {
+ continue;
+ }
+
+ // Verify checksums if enabled
+ if (read_options.verify_checksums) {
+ *req->status = VerifyBlob(record_slice, *req->user_key, req->len);
+ if (!req->status->ok()) {
+ continue;
+ }
+ }
+
+ // Uncompress blob if needed
+ Slice value_slice(record_slice.data() + adjustments[i], req->len);
+ *req->status =
+ UncompressBlobIfNeeded(value_slice, compression_type_, allocator,
+ clock_, statistics_, &blob_reqs[i].second);
+ if (req->status->ok()) {
+ total_bytes += record_slice.size();
+ }
+ }
+
+ if (bytes_read) {
+ *bytes_read = total_bytes;
+ }
+}
+
+Status BlobFileReader::VerifyBlob(const Slice& record_slice,
+ const Slice& user_key, uint64_t value_size) {
+ PERF_TIMER_GUARD(blob_checksum_time);
+
+ BlobLogRecord record;
+
+ const Slice header_slice(record_slice.data(), BlobLogRecord::kHeaderSize);
+
+ {
+ const Status s = record.DecodeHeaderFrom(header_slice);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ if (record.key_size != user_key.size()) {
+ return Status::Corruption("Key size mismatch when reading blob");
+ }
+
+ if (record.value_size != value_size) {
+ return Status::Corruption("Value size mismatch when reading blob");
+ }
+
+ record.key =
+ Slice(record_slice.data() + BlobLogRecord::kHeaderSize, record.key_size);
+ if (record.key != user_key) {
+ return Status::Corruption("Key mismatch when reading blob");
+ }
+
+ record.value = Slice(record.key.data() + record.key_size, value_size);
+
+ {
+ TEST_SYNC_POINT_CALLBACK("BlobFileReader::VerifyBlob:CheckBlobCRC",
+ &record);
+
+ const Status s = record.CheckBlobCRC();
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ return Status::OK();
+}
+
+Status BlobFileReader::UncompressBlobIfNeeded(
+ const Slice& value_slice, CompressionType compression_type,
+ MemoryAllocator* allocator, SystemClock* clock, Statistics* statistics,
+ std::unique_ptr<BlobContents>* result) {
+ assert(result);
+
+ if (compression_type == kNoCompression) {
+ CacheAllocationPtr allocation =
+ AllocateBlock(value_slice.size(), allocator);
+ memcpy(allocation.get(), value_slice.data(), value_slice.size());
+
+ *result = BlobContents::Create(std::move(allocation), value_slice.size());
+
+ return Status::OK();
+ }
+
+ UncompressionContext context(compression_type);
+ UncompressionInfo info(context, UncompressionDict::GetEmptyDict(),
+ compression_type);
+
+ size_t uncompressed_size = 0;
+ constexpr uint32_t compression_format_version = 2;
+
+ CacheAllocationPtr output;
+
+ {
+ PERF_TIMER_GUARD(blob_decompress_time);
+ StopWatch stop_watch(clock, statistics, BLOB_DB_DECOMPRESSION_MICROS);
+ output = UncompressData(info, value_slice.data(), value_slice.size(),
+ &uncompressed_size, compression_format_version,
+ allocator);
+ }
+
+ TEST_SYNC_POINT_CALLBACK(
+ "BlobFileReader::UncompressBlobIfNeeded:TamperWithResult", &output);
+
+ if (!output) {
+ return Status::Corruption("Unable to uncompress blob");
+ }
+
+ *result = BlobContents::Create(std::move(output), uncompressed_size);
+
+ return Status::OK();
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_file_reader.h b/src/rocksdb/db/blob/blob_file_reader.h
new file mode 100644
index 000000000..75b756da1
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_file_reader.h
@@ -0,0 +1,108 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <cinttypes>
+#include <memory>
+
+#include "db/blob/blob_read_request.h"
+#include "file/random_access_file_reader.h"
+#include "rocksdb/compression_type.h"
+#include "rocksdb/rocksdb_namespace.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Status;
+struct ImmutableOptions;
+struct FileOptions;
+class HistogramImpl;
+struct ReadOptions;
+class Slice;
+class FilePrefetchBuffer;
+class BlobContents;
+class Statistics;
+
+class BlobFileReader {
+ public:
+ static Status Create(const ImmutableOptions& immutable_options,
+ const FileOptions& file_options,
+ uint32_t column_family_id,
+ HistogramImpl* blob_file_read_hist,
+ uint64_t blob_file_number,
+ const std::shared_ptr<IOTracer>& io_tracer,
+ std::unique_ptr<BlobFileReader>* reader);
+
+ BlobFileReader(const BlobFileReader&) = delete;
+ BlobFileReader& operator=(const BlobFileReader&) = delete;
+
+ ~BlobFileReader();
+
+ Status GetBlob(const ReadOptions& read_options, const Slice& user_key,
+ uint64_t offset, uint64_t value_size,
+ CompressionType compression_type,
+ FilePrefetchBuffer* prefetch_buffer,
+ MemoryAllocator* allocator,
+ std::unique_ptr<BlobContents>* result,
+ uint64_t* bytes_read) const;
+
+  // Offsets must be sorted in ascending order by the caller.
+ void MultiGetBlob(
+ const ReadOptions& read_options, MemoryAllocator* allocator,
+ autovector<std::pair<BlobReadRequest*, std::unique_ptr<BlobContents>>>&
+ blob_reqs,
+ uint64_t* bytes_read) const;
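+  //
+  // Illustrative caller-side sketch (editorial addition, not part of the
+  // original change). The keys and statuses are owned by the caller and must
+  // outlive the call; key0/key1, offset0/offset1 and size0/size1 are
+  // placeholders with offset0 <= offset1.
+  //
+  //   std::array<Status, 2> statuses;
+  //   std::array<BlobReadRequest, 2> requests;
+  //   requests[0] = BlobReadRequest(key0, offset0, size0, compression,
+  //                                 nullptr, &statuses[0]);
+  //   requests[1] = BlobReadRequest(key1, offset1, size1, compression,
+  //                                 nullptr, &statuses[1]);
+  //
+  //   autovector<std::pair<BlobReadRequest*, std::unique_ptr<BlobContents>>>
+  //       blob_reqs;
+  //   blob_reqs.emplace_back(&requests[0], std::unique_ptr<BlobContents>());
+  //   blob_reqs.emplace_back(&requests[1], std::unique_ptr<BlobContents>());
+  //
+  //   uint64_t bytes_read = 0;
+  //   reader->MultiGetBlob(read_options, allocator, blob_reqs, &bytes_read);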
+
+ CompressionType GetCompressionType() const { return compression_type_; }
+
+ uint64_t GetFileSize() const { return file_size_; }
+
+ private:
+ BlobFileReader(std::unique_ptr<RandomAccessFileReader>&& file_reader,
+ uint64_t file_size, CompressionType compression_type,
+ SystemClock* clock, Statistics* statistics);
+
+ static Status OpenFile(const ImmutableOptions& immutable_options,
+ const FileOptions& file_opts,
+ HistogramImpl* blob_file_read_hist,
+ uint64_t blob_file_number,
+ const std::shared_ptr<IOTracer>& io_tracer,
+ uint64_t* file_size,
+ std::unique_ptr<RandomAccessFileReader>* file_reader);
+
+ static Status ReadHeader(const RandomAccessFileReader* file_reader,
+ uint32_t column_family_id, Statistics* statistics,
+ CompressionType* compression_type);
+
+ static Status ReadFooter(const RandomAccessFileReader* file_reader,
+ uint64_t file_size, Statistics* statistics);
+
+ using Buffer = std::unique_ptr<char[]>;
+
+ static Status ReadFromFile(const RandomAccessFileReader* file_reader,
+ uint64_t read_offset, size_t read_size,
+ Statistics* statistics, Slice* slice, Buffer* buf,
+ AlignedBuf* aligned_buf,
+ Env::IOPriority rate_limiter_priority);
+
+ static Status VerifyBlob(const Slice& record_slice, const Slice& user_key,
+ uint64_t value_size);
+
+ static Status UncompressBlobIfNeeded(const Slice& value_slice,
+ CompressionType compression_type,
+ MemoryAllocator* allocator,
+ SystemClock* clock,
+ Statistics* statistics,
+ std::unique_ptr<BlobContents>* result);
+
+ std::unique_ptr<RandomAccessFileReader> file_reader_;
+ uint64_t file_size_;
+ CompressionType compression_type_;
+ SystemClock* clock_;
+ Statistics* statistics_;
+};
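+
+// Editorial note: end-to-end sketch (not part of the original change) showing
+// how a reader is typically created and a single blob fetched; error handling
+// is omitted, and immutable_options, column_family_id, blob_file_number,
+// user_key, blob_offset and blob_size are placeholders.
+//
+//   std::unique_ptr<BlobFileReader> reader;
+//   Status s = BlobFileReader::Create(
+//       immutable_options, FileOptions(), column_family_id,
+//       /* blob_file_read_hist */ nullptr, blob_file_number,
+//       /* io_tracer */ nullptr, &reader);
+//
+//   std::unique_ptr<BlobContents> value;
+//   uint64_t bytes_read = 0;
+//   s = reader->GetBlob(ReadOptions(), user_key, blob_offset, blob_size,
+//                       reader->GetCompressionType(),
+//                       /* prefetch_buffer */ nullptr,
+//                       /* allocator */ nullptr, &value, &bytes_read);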
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_file_reader_test.cc b/src/rocksdb/db/blob/blob_file_reader_test.cc
new file mode 100644
index 000000000..03458e2b5
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_file_reader_test.cc
@@ -0,0 +1,1024 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/blob/blob_file_reader.h"
+
+#include <cassert>
+#include <string>
+
+#include "db/blob/blob_contents.h"
+#include "db/blob/blob_log_format.h"
+#include "db/blob/blob_log_writer.h"
+#include "env/mock_env.h"
+#include "file/filename.h"
+#include "file/read_write_util.h"
+#include "file/writable_file_writer.h"
+#include "options/cf_options.h"
+#include "rocksdb/env.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/options.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "util/compression.h"
+#include "utilities/fault_injection_env.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+
+// Creates a test blob file with `num` blobs in it.
+void WriteBlobFile(const ImmutableOptions& immutable_options,
+ uint32_t column_family_id, bool has_ttl,
+ const ExpirationRange& expiration_range_header,
+ const ExpirationRange& expiration_range_footer,
+ uint64_t blob_file_number, const std::vector<Slice>& keys,
+ const std::vector<Slice>& blobs, CompressionType compression,
+ std::vector<uint64_t>& blob_offsets,
+ std::vector<uint64_t>& blob_sizes) {
+ assert(!immutable_options.cf_paths.empty());
+ size_t num = keys.size();
+ assert(num == blobs.size());
+ assert(num == blob_offsets.size());
+ assert(num == blob_sizes.size());
+
+ const std::string blob_file_path =
+ BlobFileName(immutable_options.cf_paths.front().path, blob_file_number);
+ std::unique_ptr<FSWritableFile> file;
+ ASSERT_OK(NewWritableFile(immutable_options.fs.get(), blob_file_path, &file,
+ FileOptions()));
+
+ std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
+ std::move(file), blob_file_path, FileOptions(), immutable_options.clock));
+
+ constexpr Statistics* statistics = nullptr;
+ constexpr bool use_fsync = false;
+ constexpr bool do_flush = false;
+
+ BlobLogWriter blob_log_writer(std::move(file_writer), immutable_options.clock,
+ statistics, blob_file_number, use_fsync,
+ do_flush);
+
+ BlobLogHeader header(column_family_id, compression, has_ttl,
+ expiration_range_header);
+
+ ASSERT_OK(blob_log_writer.WriteHeader(header));
+
+ std::vector<std::string> compressed_blobs(num);
+ std::vector<Slice> blobs_to_write(num);
+ if (kNoCompression == compression) {
+ for (size_t i = 0; i < num; ++i) {
+ blobs_to_write[i] = blobs[i];
+ blob_sizes[i] = blobs[i].size();
+ }
+ } else {
+ CompressionOptions opts;
+ CompressionContext context(compression);
+ constexpr uint64_t sample_for_compression = 0;
+ CompressionInfo info(opts, context, CompressionDict::GetEmptyDict(),
+ compression, sample_for_compression);
+
+ constexpr uint32_t compression_format_version = 2;
+
+ for (size_t i = 0; i < num; ++i) {
+ ASSERT_TRUE(CompressData(blobs[i], info, compression_format_version,
+ &compressed_blobs[i]));
+ blobs_to_write[i] = compressed_blobs[i];
+ blob_sizes[i] = compressed_blobs[i].size();
+ }
+ }
+
+ for (size_t i = 0; i < num; ++i) {
+ uint64_t key_offset = 0;
+ ASSERT_OK(blob_log_writer.AddRecord(keys[i], blobs_to_write[i], &key_offset,
+ &blob_offsets[i]));
+ }
+
+ BlobLogFooter footer;
+ footer.blob_count = num;
+ footer.expiration_range = expiration_range_footer;
+
+ std::string checksum_method;
+ std::string checksum_value;
+ ASSERT_OK(
+ blob_log_writer.AppendFooter(footer, &checksum_method, &checksum_value));
+}
+
+// Creates a test blob file with a single blob in it. Note: this overload
+// makes it possible to exercise corner cases by letting the caller specify
+// the contents of the blob file's header and footer fields.
+void WriteBlobFile(const ImmutableOptions& immutable_options,
+ uint32_t column_family_id, bool has_ttl,
+ const ExpirationRange& expiration_range_header,
+ const ExpirationRange& expiration_range_footer,
+ uint64_t blob_file_number, const Slice& key,
+ const Slice& blob, CompressionType compression,
+ uint64_t* blob_offset, uint64_t* blob_size) {
+ std::vector<Slice> keys{key};
+ std::vector<Slice> blobs{blob};
+ std::vector<uint64_t> blob_offsets{0};
+ std::vector<uint64_t> blob_sizes{0};
+ WriteBlobFile(immutable_options, column_family_id, has_ttl,
+ expiration_range_header, expiration_range_footer,
+ blob_file_number, keys, blobs, compression, blob_offsets,
+ blob_sizes);
+ if (blob_offset) {
+ *blob_offset = blob_offsets[0];
+ }
+ if (blob_size) {
+ *blob_size = blob_sizes[0];
+ }
+}
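+
+// Editorial note: illustrative call of the single-blob overload above; it is
+// not part of the original change, and the literal arguments are arbitrary.
+// The returned offset/size can then be fed to BlobFileReader::GetBlob.
+//
+//   uint64_t blob_offset = 0;
+//   uint64_t blob_size = 0;
+//   WriteBlobFile(immutable_options, /* column_family_id */ 1,
+//                 /* has_ttl */ false, ExpirationRange(), ExpirationRange(),
+//                 /* blob_file_number */ 1, "key", "blob", kNoCompression,
+//                 &blob_offset, &blob_size);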
+
+} // anonymous namespace
+
+class BlobFileReaderTest : public testing::Test {
+ protected:
+ BlobFileReaderTest() { mock_env_.reset(MockEnv::Create(Env::Default())); }
+ std::unique_ptr<Env> mock_env_;
+};
+
+TEST_F(BlobFileReaderTest, CreateReaderAndGetBlob) {
+ Options options;
+ options.env = mock_env_.get();
+ options.cf_paths.emplace_back(
+ test::PerThreadDBPath(mock_env_.get(),
+ "BlobFileReaderTest_CreateReaderAndGetBlob"),
+ 0);
+ options.enable_blob_files = true;
+
+ ImmutableOptions immutable_options(options);
+
+ constexpr uint32_t column_family_id = 1;
+ constexpr bool has_ttl = false;
+ constexpr ExpirationRange expiration_range;
+ constexpr uint64_t blob_file_number = 1;
+ constexpr size_t num_blobs = 3;
+ const std::vector<std::string> key_strs = {"key1", "key2", "key3"};
+ const std::vector<std::string> blob_strs = {"blob1", "blob2", "blob3"};
+
+ const std::vector<Slice> keys = {key_strs[0], key_strs[1], key_strs[2]};
+ const std::vector<Slice> blobs = {blob_strs[0], blob_strs[1], blob_strs[2]};
+
+ std::vector<uint64_t> blob_offsets(keys.size());
+ std::vector<uint64_t> blob_sizes(keys.size());
+
+ WriteBlobFile(immutable_options, column_family_id, has_ttl, expiration_range,
+ expiration_range, blob_file_number, keys, blobs, kNoCompression,
+ blob_offsets, blob_sizes);
+
+ constexpr HistogramImpl* blob_file_read_hist = nullptr;
+
+ std::unique_ptr<BlobFileReader> reader;
+
+ ASSERT_OK(BlobFileReader::Create(
+ immutable_options, FileOptions(), column_family_id, blob_file_read_hist,
+ blob_file_number, nullptr /*IOTracer*/, &reader));
+
+ // Make sure the blob can be retrieved with and without checksum verification
+ ReadOptions read_options;
+ read_options.verify_checksums = false;
+
+ constexpr FilePrefetchBuffer* prefetch_buffer = nullptr;
+ constexpr MemoryAllocator* allocator = nullptr;
+
+ {
+ std::unique_ptr<BlobContents> value;
+ uint64_t bytes_read = 0;
+
+ ASSERT_OK(reader->GetBlob(read_options, keys[0], blob_offsets[0],
+ blob_sizes[0], kNoCompression, prefetch_buffer,
+ allocator, &value, &bytes_read));
+ ASSERT_NE(value, nullptr);
+ ASSERT_EQ(value->data(), blobs[0]);
+ ASSERT_EQ(bytes_read, blob_sizes[0]);
+
+ // MultiGetBlob
+ bytes_read = 0;
+ size_t total_size = 0;
+
+ std::array<Status, num_blobs> statuses_buf;
+ std::array<BlobReadRequest, num_blobs> requests_buf;
+ autovector<std::pair<BlobReadRequest*, std::unique_ptr<BlobContents>>>
+ blob_reqs;
+
+ for (size_t i = 0; i < num_blobs; ++i) {
+ requests_buf[i] =
+ BlobReadRequest(keys[i], blob_offsets[i], blob_sizes[i],
+ kNoCompression, nullptr, &statuses_buf[i]);
+ blob_reqs.emplace_back(&requests_buf[i], std::unique_ptr<BlobContents>());
+ }
+
+ reader->MultiGetBlob(read_options, allocator, blob_reqs, &bytes_read);
+
+ for (size_t i = 0; i < num_blobs; ++i) {
+ const auto& result = blob_reqs[i].second;
+
+ ASSERT_OK(statuses_buf[i]);
+ ASSERT_NE(result, nullptr);
+ ASSERT_EQ(result->data(), blobs[i]);
+ total_size += blob_sizes[i];
+ }
+ ASSERT_EQ(bytes_read, total_size);
+ }
+
+ read_options.verify_checksums = true;
+
+ {
+ std::unique_ptr<BlobContents> value;
+ uint64_t bytes_read = 0;
+
+ ASSERT_OK(reader->GetBlob(read_options, keys[1], blob_offsets[1],
+ blob_sizes[1], kNoCompression, prefetch_buffer,
+ allocator, &value, &bytes_read));
+ ASSERT_NE(value, nullptr);
+ ASSERT_EQ(value->data(), blobs[1]);
+
+ const uint64_t key_size = keys[1].size();
+ ASSERT_EQ(bytes_read,
+ BlobLogRecord::CalculateAdjustmentForRecordHeader(key_size) +
+ blob_sizes[1]);
+ }
+
+ // Invalid offset (too close to start of file)
+ {
+ std::unique_ptr<BlobContents> value;
+ uint64_t bytes_read = 0;
+
+ ASSERT_TRUE(reader
+ ->GetBlob(read_options, keys[0], blob_offsets[0] - 1,
+ blob_sizes[0], kNoCompression, prefetch_buffer,
+ allocator, &value, &bytes_read)
+ .IsCorruption());
+ ASSERT_EQ(value, nullptr);
+ ASSERT_EQ(bytes_read, 0);
+ }
+
+ // Invalid offset (too close to end of file)
+ {
+ std::unique_ptr<BlobContents> value;
+ uint64_t bytes_read = 0;
+
+ ASSERT_TRUE(reader
+ ->GetBlob(read_options, keys[2], blob_offsets[2] + 1,
+ blob_sizes[2], kNoCompression, prefetch_buffer,
+ allocator, &value, &bytes_read)
+ .IsCorruption());
+ ASSERT_EQ(value, nullptr);
+ ASSERT_EQ(bytes_read, 0);
+ }
+
+ // Incorrect compression type
+ {
+ std::unique_ptr<BlobContents> value;
+ uint64_t bytes_read = 0;
+
+ ASSERT_TRUE(reader
+ ->GetBlob(read_options, keys[0], blob_offsets[0],
+ blob_sizes[0], kZSTD, prefetch_buffer, allocator,
+ &value, &bytes_read)
+ .IsCorruption());
+ ASSERT_EQ(value, nullptr);
+ ASSERT_EQ(bytes_read, 0);
+ }
+
+ // Incorrect key size
+ {
+ constexpr char shorter_key[] = "k";
+ std::unique_ptr<BlobContents> value;
+ uint64_t bytes_read = 0;
+
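+    // The offset below is adjusted to compensate for the shorter key, so the
+    // record header is still read from its actual location; the reader then
+    // detects the key size mismatch and reports corruption.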
+ ASSERT_TRUE(reader
+ ->GetBlob(read_options, shorter_key,
+ blob_offsets[0] -
+ (keys[0].size() - sizeof(shorter_key) + 1),
+ blob_sizes[0], kNoCompression, prefetch_buffer,
+ allocator, &value, &bytes_read)
+ .IsCorruption());
+ ASSERT_EQ(value, nullptr);
+ ASSERT_EQ(bytes_read, 0);
+
+ // MultiGetBlob
+ autovector<std::reference_wrapper<const Slice>> key_refs;
+ for (const auto& key_ref : keys) {
+ key_refs.emplace_back(std::cref(key_ref));
+ }
+ Slice shorter_key_slice(shorter_key, sizeof(shorter_key) - 1);
+ key_refs[1] = std::cref(shorter_key_slice);
+
+ autovector<uint64_t> offsets{
+ blob_offsets[0],
+ blob_offsets[1] - (keys[1].size() - key_refs[1].get().size()),
+ blob_offsets[2]};
+
+ std::array<Status, num_blobs> statuses_buf;
+ std::array<BlobReadRequest, num_blobs> requests_buf;
+ autovector<std::pair<BlobReadRequest*, std::unique_ptr<BlobContents>>>
+ blob_reqs;
+
+ for (size_t i = 0; i < num_blobs; ++i) {
+ requests_buf[i] =
+ BlobReadRequest(key_refs[i], offsets[i], blob_sizes[i],
+ kNoCompression, nullptr, &statuses_buf[i]);
+ blob_reqs.emplace_back(&requests_buf[i], std::unique_ptr<BlobContents>());
+ }
+
+ reader->MultiGetBlob(read_options, allocator, blob_reqs, &bytes_read);
+
+ for (size_t i = 0; i < num_blobs; ++i) {
+ if (i == 1) {
+ ASSERT_TRUE(statuses_buf[i].IsCorruption());
+ } else {
+ ASSERT_OK(statuses_buf[i]);
+ }
+ }
+ }
+
+ // Incorrect key
+ {
+ constexpr char incorrect_key[] = "foo1";
+ std::unique_ptr<BlobContents> value;
+ uint64_t bytes_read = 0;
+
+ ASSERT_TRUE(reader
+ ->GetBlob(read_options, incorrect_key, blob_offsets[0],
+ blob_sizes[0], kNoCompression, prefetch_buffer,
+ allocator, &value, &bytes_read)
+ .IsCorruption());
+ ASSERT_EQ(value, nullptr);
+ ASSERT_EQ(bytes_read, 0);
+
+ // MultiGetBlob
+ autovector<std::reference_wrapper<const Slice>> key_refs;
+ for (const auto& key_ref : keys) {
+ key_refs.emplace_back(std::cref(key_ref));
+ }
+ Slice wrong_key_slice(incorrect_key, sizeof(incorrect_key) - 1);
+ key_refs[2] = std::cref(wrong_key_slice);
+
+ std::array<Status, num_blobs> statuses_buf;
+ std::array<BlobReadRequest, num_blobs> requests_buf;
+ autovector<std::pair<BlobReadRequest*, std::unique_ptr<BlobContents>>>
+ blob_reqs;
+
+ for (size_t i = 0; i < num_blobs; ++i) {
+ requests_buf[i] =
+ BlobReadRequest(key_refs[i], blob_offsets[i], blob_sizes[i],
+ kNoCompression, nullptr, &statuses_buf[i]);
+ blob_reqs.emplace_back(&requests_buf[i], std::unique_ptr<BlobContents>());
+ }
+
+ reader->MultiGetBlob(read_options, allocator, blob_reqs, &bytes_read);
+
+ for (size_t i = 0; i < num_blobs; ++i) {
+ if (i == num_blobs - 1) {
+ ASSERT_TRUE(statuses_buf[i].IsCorruption());
+ } else {
+ ASSERT_OK(statuses_buf[i]);
+ }
+ }
+ }
+
+ // Incorrect value size
+ {
+ std::unique_ptr<BlobContents> value;
+ uint64_t bytes_read = 0;
+
+ ASSERT_TRUE(reader
+ ->GetBlob(read_options, keys[1], blob_offsets[1],
+ blob_sizes[1] + 1, kNoCompression,
+ prefetch_buffer, allocator, &value, &bytes_read)
+ .IsCorruption());
+ ASSERT_EQ(value, nullptr);
+ ASSERT_EQ(bytes_read, 0);
+
+ // MultiGetBlob
+ autovector<std::reference_wrapper<const Slice>> key_refs;
+ for (const auto& key_ref : keys) {
+ key_refs.emplace_back(std::cref(key_ref));
+ }
+
+ std::array<Status, num_blobs> statuses_buf;
+ std::array<BlobReadRequest, num_blobs> requests_buf;
+
+ requests_buf[0] =
+ BlobReadRequest(key_refs[0], blob_offsets[0], blob_sizes[0],
+ kNoCompression, nullptr, &statuses_buf[0]);
+ requests_buf[1] =
+ BlobReadRequest(key_refs[1], blob_offsets[1], blob_sizes[1] + 1,
+ kNoCompression, nullptr, &statuses_buf[1]);
+ requests_buf[2] =
+ BlobReadRequest(key_refs[2], blob_offsets[2], blob_sizes[2],
+ kNoCompression, nullptr, &statuses_buf[2]);
+
+ autovector<std::pair<BlobReadRequest*, std::unique_ptr<BlobContents>>>
+ blob_reqs;
+
+ for (size_t i = 0; i < num_blobs; ++i) {
+ blob_reqs.emplace_back(&requests_buf[i], std::unique_ptr<BlobContents>());
+ }
+
+ reader->MultiGetBlob(read_options, allocator, blob_reqs, &bytes_read);
+
+ for (size_t i = 0; i < num_blobs; ++i) {
+ if (i != 1) {
+ ASSERT_OK(statuses_buf[i]);
+ } else {
+ ASSERT_TRUE(statuses_buf[i].IsCorruption());
+ }
+ }
+ }
+}
+
+TEST_F(BlobFileReaderTest, Malformed) {
+ // Write a blob file consisting of nothing but a header, and make sure we
+ // detect the error when we open it for reading
+
+ Options options;
+ options.env = mock_env_.get();
+ options.cf_paths.emplace_back(
+ test::PerThreadDBPath(mock_env_.get(), "BlobFileReaderTest_Malformed"),
+ 0);
+ options.enable_blob_files = true;
+
+ ImmutableOptions immutable_options(options);
+
+ constexpr uint32_t column_family_id = 1;
+ constexpr uint64_t blob_file_number = 1;
+
+ {
+ constexpr bool has_ttl = false;
+ constexpr ExpirationRange expiration_range;
+
+ const std::string blob_file_path =
+ BlobFileName(immutable_options.cf_paths.front().path, blob_file_number);
+
+ std::unique_ptr<FSWritableFile> file;
+ ASSERT_OK(NewWritableFile(immutable_options.fs.get(), blob_file_path, &file,
+ FileOptions()));
+
+ std::unique_ptr<WritableFileWriter> file_writer(
+ new WritableFileWriter(std::move(file), blob_file_path, FileOptions(),
+ immutable_options.clock));
+
+ constexpr Statistics* statistics = nullptr;
+ constexpr bool use_fsync = false;
+ constexpr bool do_flush = false;
+
+ BlobLogWriter blob_log_writer(std::move(file_writer),
+ immutable_options.clock, statistics,
+ blob_file_number, use_fsync, do_flush);
+
+ BlobLogHeader header(column_family_id, kNoCompression, has_ttl,
+ expiration_range);
+
+ ASSERT_OK(blob_log_writer.WriteHeader(header));
+ }
+
+ constexpr HistogramImpl* blob_file_read_hist = nullptr;
+
+ std::unique_ptr<BlobFileReader> reader;
+
+ ASSERT_TRUE(BlobFileReader::Create(immutable_options, FileOptions(),
+ column_family_id, blob_file_read_hist,
+ blob_file_number, nullptr /*IOTracer*/,
+ &reader)
+ .IsCorruption());
+}
+
+TEST_F(BlobFileReaderTest, TTL) {
+ Options options;
+ options.env = mock_env_.get();
+ options.cf_paths.emplace_back(
+ test::PerThreadDBPath(mock_env_.get(), "BlobFileReaderTest_TTL"), 0);
+ options.enable_blob_files = true;
+
+ ImmutableOptions immutable_options(options);
+
+ constexpr uint32_t column_family_id = 1;
+ constexpr bool has_ttl = true;
+ constexpr ExpirationRange expiration_range;
+ constexpr uint64_t blob_file_number = 1;
+ constexpr char key[] = "key";
+ constexpr char blob[] = "blob";
+
+ uint64_t blob_offset = 0;
+ uint64_t blob_size = 0;
+
+ WriteBlobFile(immutable_options, column_family_id, has_ttl, expiration_range,
+ expiration_range, blob_file_number, key, blob, kNoCompression,
+ &blob_offset, &blob_size);
+
+ constexpr HistogramImpl* blob_file_read_hist = nullptr;
+
+ std::unique_ptr<BlobFileReader> reader;
+
+ ASSERT_TRUE(BlobFileReader::Create(immutable_options, FileOptions(),
+ column_family_id, blob_file_read_hist,
+ blob_file_number, nullptr /*IOTracer*/,
+ &reader)
+ .IsCorruption());
+}
+
+TEST_F(BlobFileReaderTest, ExpirationRangeInHeader) {
+ Options options;
+ options.env = mock_env_.get();
+ options.cf_paths.emplace_back(
+ test::PerThreadDBPath(mock_env_.get(),
+ "BlobFileReaderTest_ExpirationRangeInHeader"),
+ 0);
+ options.enable_blob_files = true;
+
+ ImmutableOptions immutable_options(options);
+
+ constexpr uint32_t column_family_id = 1;
+ constexpr bool has_ttl = false;
+ const ExpirationRange expiration_range_header(
+ 1, 2); // can be made constexpr when we adopt C++14
+ constexpr ExpirationRange expiration_range_footer;
+ constexpr uint64_t blob_file_number = 1;
+ constexpr char key[] = "key";
+ constexpr char blob[] = "blob";
+
+ uint64_t blob_offset = 0;
+ uint64_t blob_size = 0;
+
+ WriteBlobFile(immutable_options, column_family_id, has_ttl,
+ expiration_range_header, expiration_range_footer,
+ blob_file_number, key, blob, kNoCompression, &blob_offset,
+ &blob_size);
+
+ constexpr HistogramImpl* blob_file_read_hist = nullptr;
+
+ std::unique_ptr<BlobFileReader> reader;
+
+ ASSERT_TRUE(BlobFileReader::Create(immutable_options, FileOptions(),
+ column_family_id, blob_file_read_hist,
+ blob_file_number, nullptr /*IOTracer*/,
+ &reader)
+ .IsCorruption());
+}
+
+TEST_F(BlobFileReaderTest, ExpirationRangeInFooter) {
+ Options options;
+ options.env = mock_env_.get();
+ options.cf_paths.emplace_back(
+ test::PerThreadDBPath(mock_env_.get(),
+ "BlobFileReaderTest_ExpirationRangeInFooter"),
+ 0);
+ options.enable_blob_files = true;
+
+ ImmutableOptions immutable_options(options);
+
+ constexpr uint32_t column_family_id = 1;
+ constexpr bool has_ttl = false;
+ constexpr ExpirationRange expiration_range_header;
+ const ExpirationRange expiration_range_footer(
+ 1, 2); // can be made constexpr when we adopt C++14
+ constexpr uint64_t blob_file_number = 1;
+ constexpr char key[] = "key";
+ constexpr char blob[] = "blob";
+
+ uint64_t blob_offset = 0;
+ uint64_t blob_size = 0;
+
+ WriteBlobFile(immutable_options, column_family_id, has_ttl,
+ expiration_range_header, expiration_range_footer,
+ blob_file_number, key, blob, kNoCompression, &blob_offset,
+ &blob_size);
+
+ constexpr HistogramImpl* blob_file_read_hist = nullptr;
+
+ std::unique_ptr<BlobFileReader> reader;
+
+ ASSERT_TRUE(BlobFileReader::Create(immutable_options, FileOptions(),
+ column_family_id, blob_file_read_hist,
+ blob_file_number, nullptr /*IOTracer*/,
+ &reader)
+ .IsCorruption());
+}
+
+TEST_F(BlobFileReaderTest, IncorrectColumnFamily) {
+ Options options;
+ options.env = mock_env_.get();
+ options.cf_paths.emplace_back(
+ test::PerThreadDBPath(mock_env_.get(),
+ "BlobFileReaderTest_IncorrectColumnFamily"),
+ 0);
+ options.enable_blob_files = true;
+
+ ImmutableOptions immutable_options(options);
+
+ constexpr uint32_t column_family_id = 1;
+ constexpr bool has_ttl = false;
+ constexpr ExpirationRange expiration_range;
+ constexpr uint64_t blob_file_number = 1;
+ constexpr char key[] = "key";
+ constexpr char blob[] = "blob";
+
+ uint64_t blob_offset = 0;
+ uint64_t blob_size = 0;
+
+ WriteBlobFile(immutable_options, column_family_id, has_ttl, expiration_range,
+ expiration_range, blob_file_number, key, blob, kNoCompression,
+ &blob_offset, &blob_size);
+
+ constexpr HistogramImpl* blob_file_read_hist = nullptr;
+
+ std::unique_ptr<BlobFileReader> reader;
+
+ constexpr uint32_t incorrect_column_family_id = 2;
+
+ ASSERT_TRUE(BlobFileReader::Create(immutable_options, FileOptions(),
+ incorrect_column_family_id,
+ blob_file_read_hist, blob_file_number,
+ nullptr /*IOTracer*/, &reader)
+ .IsCorruption());
+}
+
+TEST_F(BlobFileReaderTest, BlobCRCError) {
+ Options options;
+ options.env = mock_env_.get();
+ options.cf_paths.emplace_back(
+ test::PerThreadDBPath(mock_env_.get(), "BlobFileReaderTest_BlobCRCError"),
+ 0);
+ options.enable_blob_files = true;
+
+ ImmutableOptions immutable_options(options);
+
+ constexpr uint32_t column_family_id = 1;
+ constexpr bool has_ttl = false;
+ constexpr ExpirationRange expiration_range;
+ constexpr uint64_t blob_file_number = 1;
+ constexpr char key[] = "key";
+ constexpr char blob[] = "blob";
+
+ uint64_t blob_offset = 0;
+ uint64_t blob_size = 0;
+
+ WriteBlobFile(immutable_options, column_family_id, has_ttl, expiration_range,
+ expiration_range, blob_file_number, key, blob, kNoCompression,
+ &blob_offset, &blob_size);
+
+ constexpr HistogramImpl* blob_file_read_hist = nullptr;
+
+ std::unique_ptr<BlobFileReader> reader;
+
+ ASSERT_OK(BlobFileReader::Create(
+ immutable_options, FileOptions(), column_family_id, blob_file_read_hist,
+ blob_file_number, nullptr /*IOTracer*/, &reader));
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "BlobFileReader::VerifyBlob:CheckBlobCRC", [](void* arg) {
+ BlobLogRecord* const record = static_cast<BlobLogRecord*>(arg);
+ assert(record);
+
+ record->blob_crc = 0xfaceb00c;
+ });
+
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ constexpr FilePrefetchBuffer* prefetch_buffer = nullptr;
+ constexpr MemoryAllocator* allocator = nullptr;
+
+ std::unique_ptr<BlobContents> value;
+ uint64_t bytes_read = 0;
+
+ ASSERT_TRUE(reader
+ ->GetBlob(ReadOptions(), key, blob_offset, blob_size,
+ kNoCompression, prefetch_buffer, allocator, &value,
+ &bytes_read)
+ .IsCorruption());
+ ASSERT_EQ(value, nullptr);
+ ASSERT_EQ(bytes_read, 0);
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_F(BlobFileReaderTest, Compression) {
+ if (!Snappy_Supported()) {
+ return;
+ }
+
+ Options options;
+ options.env = mock_env_.get();
+ options.cf_paths.emplace_back(
+ test::PerThreadDBPath(mock_env_.get(), "BlobFileReaderTest_Compression"),
+ 0);
+ options.enable_blob_files = true;
+
+ ImmutableOptions immutable_options(options);
+
+ constexpr uint32_t column_family_id = 1;
+ constexpr bool has_ttl = false;
+ constexpr ExpirationRange expiration_range;
+ constexpr uint64_t blob_file_number = 1;
+ constexpr char key[] = "key";
+ constexpr char blob[] = "blob";
+
+ uint64_t blob_offset = 0;
+ uint64_t blob_size = 0;
+
+ WriteBlobFile(immutable_options, column_family_id, has_ttl, expiration_range,
+ expiration_range, blob_file_number, key, blob,
+ kSnappyCompression, &blob_offset, &blob_size);
+
+ constexpr HistogramImpl* blob_file_read_hist = nullptr;
+
+ std::unique_ptr<BlobFileReader> reader;
+
+ ASSERT_OK(BlobFileReader::Create(
+ immutable_options, FileOptions(), column_family_id, blob_file_read_hist,
+ blob_file_number, nullptr /*IOTracer*/, &reader));
+
+ // Make sure the blob can be retrieved with and without checksum verification
+ ReadOptions read_options;
+ read_options.verify_checksums = false;
+
+ constexpr FilePrefetchBuffer* prefetch_buffer = nullptr;
+ constexpr MemoryAllocator* allocator = nullptr;
+
+ {
+ std::unique_ptr<BlobContents> value;
+ uint64_t bytes_read = 0;
+
+ ASSERT_OK(reader->GetBlob(read_options, key, blob_offset, blob_size,
+ kSnappyCompression, prefetch_buffer, allocator,
+ &value, &bytes_read));
+ ASSERT_NE(value, nullptr);
+ ASSERT_EQ(value->data(), blob);
+ ASSERT_EQ(bytes_read, blob_size);
+ }
+
+ read_options.verify_checksums = true;
+
+ {
+ std::unique_ptr<BlobContents> value;
+ uint64_t bytes_read = 0;
+
+ ASSERT_OK(reader->GetBlob(read_options, key, blob_offset, blob_size,
+ kSnappyCompression, prefetch_buffer, allocator,
+ &value, &bytes_read));
+ ASSERT_NE(value, nullptr);
+ ASSERT_EQ(value->data(), blob);
+
+ constexpr uint64_t key_size = sizeof(key) - 1;
+ ASSERT_EQ(bytes_read,
+ BlobLogRecord::CalculateAdjustmentForRecordHeader(key_size) +
+ blob_size);
+ }
+}
+
+TEST_F(BlobFileReaderTest, UncompressionError) {
+ if (!Snappy_Supported()) {
+ return;
+ }
+
+ Options options;
+ options.env = mock_env_.get();
+ options.cf_paths.emplace_back(
+ test::PerThreadDBPath(mock_env_.get(),
+ "BlobFileReaderTest_UncompressionError"),
+ 0);
+ options.enable_blob_files = true;
+
+ ImmutableOptions immutable_options(options);
+
+ constexpr uint32_t column_family_id = 1;
+ constexpr bool has_ttl = false;
+ constexpr ExpirationRange expiration_range;
+ constexpr uint64_t blob_file_number = 1;
+ constexpr char key[] = "key";
+ constexpr char blob[] = "blob";
+
+ uint64_t blob_offset = 0;
+ uint64_t blob_size = 0;
+
+ WriteBlobFile(immutable_options, column_family_id, has_ttl, expiration_range,
+ expiration_range, blob_file_number, key, blob,
+ kSnappyCompression, &blob_offset, &blob_size);
+
+ constexpr HistogramImpl* blob_file_read_hist = nullptr;
+
+ std::unique_ptr<BlobFileReader> reader;
+
+ ASSERT_OK(BlobFileReader::Create(
+ immutable_options, FileOptions(), column_family_id, blob_file_read_hist,
+ blob_file_number, nullptr /*IOTracer*/, &reader));
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "BlobFileReader::UncompressBlobIfNeeded:TamperWithResult", [](void* arg) {
+ CacheAllocationPtr* const output =
+ static_cast<CacheAllocationPtr*>(arg);
+ assert(output);
+
+ output->reset();
+ });
+
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ constexpr FilePrefetchBuffer* prefetch_buffer = nullptr;
+ constexpr MemoryAllocator* allocator = nullptr;
+
+ std::unique_ptr<BlobContents> value;
+ uint64_t bytes_read = 0;
+
+ ASSERT_TRUE(reader
+ ->GetBlob(ReadOptions(), key, blob_offset, blob_size,
+ kSnappyCompression, prefetch_buffer, allocator,
+ &value, &bytes_read)
+ .IsCorruption());
+ ASSERT_EQ(value, nullptr);
+ ASSERT_EQ(bytes_read, 0);
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+class BlobFileReaderIOErrorTest
+ : public testing::Test,
+ public testing::WithParamInterface<std::string> {
+ protected:
+ BlobFileReaderIOErrorTest() : sync_point_(GetParam()) {
+ mock_env_.reset(MockEnv::Create(Env::Default()));
+ fault_injection_env_.reset(new FaultInjectionTestEnv(mock_env_.get()));
+ }
+
+ std::unique_ptr<Env> mock_env_;
+ std::unique_ptr<FaultInjectionTestEnv> fault_injection_env_;
+ std::string sync_point_;
+};
+
+INSTANTIATE_TEST_CASE_P(BlobFileReaderTest, BlobFileReaderIOErrorTest,
+ ::testing::ValuesIn(std::vector<std::string>{
+ "BlobFileReader::OpenFile:GetFileSize",
+ "BlobFileReader::OpenFile:NewRandomAccessFile",
+ "BlobFileReader::ReadHeader:ReadFromFile",
+ "BlobFileReader::ReadFooter:ReadFromFile",
+ "BlobFileReader::GetBlob:ReadFromFile"}));
+
+TEST_P(BlobFileReaderIOErrorTest, IOError) {
+ // Simulates an I/O error during the specified step
+
+ Options options;
+ options.env = fault_injection_env_.get();
+ options.cf_paths.emplace_back(
+ test::PerThreadDBPath(fault_injection_env_.get(),
+ "BlobFileReaderIOErrorTest_IOError"),
+ 0);
+ options.enable_blob_files = true;
+
+ ImmutableOptions immutable_options(options);
+
+ constexpr uint32_t column_family_id = 1;
+ constexpr bool has_ttl = false;
+ constexpr ExpirationRange expiration_range;
+ constexpr uint64_t blob_file_number = 1;
+ constexpr char key[] = "key";
+ constexpr char blob[] = "blob";
+
+ uint64_t blob_offset = 0;
+ uint64_t blob_size = 0;
+
+ WriteBlobFile(immutable_options, column_family_id, has_ttl, expiration_range,
+ expiration_range, blob_file_number, key, blob, kNoCompression,
+ &blob_offset, &blob_size);
+
+ SyncPoint::GetInstance()->SetCallBack(sync_point_, [this](void* /* arg */) {
+ fault_injection_env_->SetFilesystemActive(false,
+ Status::IOError(sync_point_));
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ constexpr HistogramImpl* blob_file_read_hist = nullptr;
+
+ std::unique_ptr<BlobFileReader> reader;
+
+ const Status s = BlobFileReader::Create(
+ immutable_options, FileOptions(), column_family_id, blob_file_read_hist,
+ blob_file_number, nullptr /*IOTracer*/, &reader);
+
+ const bool fail_during_create =
+ (sync_point_ != "BlobFileReader::GetBlob:ReadFromFile");
+
+ if (fail_during_create) {
+ ASSERT_TRUE(s.IsIOError());
+ } else {
+ ASSERT_OK(s);
+
+ constexpr FilePrefetchBuffer* prefetch_buffer = nullptr;
+ constexpr MemoryAllocator* allocator = nullptr;
+
+ std::unique_ptr<BlobContents> value;
+ uint64_t bytes_read = 0;
+
+ ASSERT_TRUE(reader
+ ->GetBlob(ReadOptions(), key, blob_offset, blob_size,
+ kNoCompression, prefetch_buffer, allocator,
+ &value, &bytes_read)
+ .IsIOError());
+ ASSERT_EQ(value, nullptr);
+ ASSERT_EQ(bytes_read, 0);
+ }
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+class BlobFileReaderDecodingErrorTest
+ : public testing::Test,
+ public testing::WithParamInterface<std::string> {
+ protected:
+ BlobFileReaderDecodingErrorTest() : sync_point_(GetParam()) {
+ mock_env_.reset(MockEnv::Create(Env::Default()));
+ }
+
+ std::unique_ptr<Env> mock_env_;
+ std::string sync_point_;
+};
+
+INSTANTIATE_TEST_CASE_P(BlobFileReaderTest, BlobFileReaderDecodingErrorTest,
+ ::testing::ValuesIn(std::vector<std::string>{
+ "BlobFileReader::ReadHeader:TamperWithResult",
+ "BlobFileReader::ReadFooter:TamperWithResult",
+ "BlobFileReader::GetBlob:TamperWithResult"}));
+
+TEST_P(BlobFileReaderDecodingErrorTest, DecodingError) {
+ Options options;
+ options.env = mock_env_.get();
+ options.cf_paths.emplace_back(
+ test::PerThreadDBPath(mock_env_.get(),
+ "BlobFileReaderDecodingErrorTest_DecodingError"),
+ 0);
+ options.enable_blob_files = true;
+
+ ImmutableOptions immutable_options(options);
+
+ constexpr uint32_t column_family_id = 1;
+ constexpr bool has_ttl = false;
+ constexpr ExpirationRange expiration_range;
+ constexpr uint64_t blob_file_number = 1;
+ constexpr char key[] = "key";
+ constexpr char blob[] = "blob";
+
+ uint64_t blob_offset = 0;
+ uint64_t blob_size = 0;
+
+ WriteBlobFile(immutable_options, column_family_id, has_ttl, expiration_range,
+ expiration_range, blob_file_number, key, blob, kNoCompression,
+ &blob_offset, &blob_size);
+
+ SyncPoint::GetInstance()->SetCallBack(sync_point_, [](void* arg) {
+ Slice* const slice = static_cast<Slice*>(arg);
+ assert(slice);
+ assert(!slice->empty());
+
+ slice->remove_prefix(1);
+ });
+
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ constexpr HistogramImpl* blob_file_read_hist = nullptr;
+
+ std::unique_ptr<BlobFileReader> reader;
+
+ const Status s = BlobFileReader::Create(
+ immutable_options, FileOptions(), column_family_id, blob_file_read_hist,
+ blob_file_number, nullptr /*IOTracer*/, &reader);
+
+ const bool fail_during_create =
+ sync_point_ != "BlobFileReader::GetBlob:TamperWithResult";
+
+ if (fail_during_create) {
+ ASSERT_TRUE(s.IsCorruption());
+ } else {
+ ASSERT_OK(s);
+
+ constexpr FilePrefetchBuffer* prefetch_buffer = nullptr;
+ constexpr MemoryAllocator* allocator = nullptr;
+
+ std::unique_ptr<BlobContents> value;
+ uint64_t bytes_read = 0;
+
+ ASSERT_TRUE(reader
+ ->GetBlob(ReadOptions(), key, blob_offset, blob_size,
+ kNoCompression, prefetch_buffer, allocator,
+ &value, &bytes_read)
+ .IsCorruption());
+ ASSERT_EQ(value, nullptr);
+ ASSERT_EQ(bytes_read, 0);
+ }
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/blob/blob_garbage_meter.cc b/src/rocksdb/db/blob/blob_garbage_meter.cc
new file mode 100644
index 000000000..d328d7ff4
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_garbage_meter.cc
@@ -0,0 +1,100 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/blob/blob_garbage_meter.h"
+
+#include "db/blob/blob_index.h"
+#include "db/blob/blob_log_format.h"
+#include "db/dbformat.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+Status BlobGarbageMeter::ProcessInFlow(const Slice& key, const Slice& value) {
+ uint64_t blob_file_number = kInvalidBlobFileNumber;
+ uint64_t bytes = 0;
+
+ const Status s = Parse(key, value, &blob_file_number, &bytes);
+ if (!s.ok()) {
+ return s;
+ }
+
+ if (blob_file_number == kInvalidBlobFileNumber) {
+ return Status::OK();
+ }
+
+ flows_[blob_file_number].AddInFlow(bytes);
+
+ return Status::OK();
+}
+
+Status BlobGarbageMeter::ProcessOutFlow(const Slice& key, const Slice& value) {
+ uint64_t blob_file_number = kInvalidBlobFileNumber;
+ uint64_t bytes = 0;
+
+ const Status s = Parse(key, value, &blob_file_number, &bytes);
+ if (!s.ok()) {
+ return s;
+ }
+
+ if (blob_file_number == kInvalidBlobFileNumber) {
+ return Status::OK();
+ }
+
+  // Note: in order to measure the amount of additional garbage, we only need
+  // to track the outflow for preexisting files, i.e. those that also had
+  // inflow. (Newly written files would only have outflow.)
+ auto it = flows_.find(blob_file_number);
+ if (it == flows_.end()) {
+ return Status::OK();
+ }
+
+ it->second.AddOutFlow(bytes);
+
+ return Status::OK();
+}
+
+Status BlobGarbageMeter::Parse(const Slice& key, const Slice& value,
+ uint64_t* blob_file_number, uint64_t* bytes) {
+ assert(blob_file_number);
+ assert(*blob_file_number == kInvalidBlobFileNumber);
+ assert(bytes);
+ assert(*bytes == 0);
+
+ ParsedInternalKey ikey;
+
+ {
+ constexpr bool log_err_key = false;
+ const Status s = ParseInternalKey(key, &ikey, log_err_key);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ if (ikey.type != kTypeBlobIndex) {
+ return Status::OK();
+ }
+
+ BlobIndex blob_index;
+
+ {
+ const Status s = blob_index.DecodeFrom(value);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ if (blob_index.IsInlined() || blob_index.HasTTL()) {
+ return Status::Corruption("Unexpected TTL/inlined blob index");
+ }
+
+ *blob_file_number = blob_index.file_number();
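+  // Charge the blob's full on-disk footprint: the blob itself plus the record
+  // header and key that precede it in the blob file.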
+ *bytes =
+ blob_index.size() +
+ BlobLogRecord::CalculateAdjustmentForRecordHeader(ikey.user_key.size());
+
+ return Status::OK();
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_garbage_meter.h b/src/rocksdb/db/blob/blob_garbage_meter.h
new file mode 100644
index 000000000..a6c04b0b2
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_garbage_meter.h
@@ -0,0 +1,102 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <cassert>
+#include <cstdint>
+#include <unordered_map>
+
+#include "db/blob/blob_constants.h"
+#include "rocksdb/rocksdb_namespace.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Slice;
+
+// A class that can be used to compute the amount of additional garbage
+// generated by a compaction. It parses the keys and blob references in the
+// input and output of a compaction, and aggregates the "inflow" and "outflow"
+// on a per-blob file basis. The amount of additional garbage for any given blob
+// file can then be computed by subtracting the outflow from the inflow.
+class BlobGarbageMeter {
+ public:
+ // A class to store the number and total size of blobs on a per-blob file
+ // basis.
+ class BlobStats {
+ public:
+ void Add(uint64_t bytes) {
+ ++count_;
+ bytes_ += bytes;
+ }
+ void Add(uint64_t count, uint64_t bytes) {
+ count_ += count;
+ bytes_ += bytes;
+ }
+
+ uint64_t GetCount() const { return count_; }
+ uint64_t GetBytes() const { return bytes_; }
+
+ private:
+ uint64_t count_ = 0;
+ uint64_t bytes_ = 0;
+ };
+
+ // A class to keep track of the "inflow" and the "outflow" and to compute the
+ // amount of additional garbage for a given blob file.
+ class BlobInOutFlow {
+ public:
+ void AddInFlow(uint64_t bytes) {
+ in_flow_.Add(bytes);
+ assert(IsValid());
+ }
+ void AddOutFlow(uint64_t bytes) {
+ out_flow_.Add(bytes);
+ assert(IsValid());
+ }
+
+ const BlobStats& GetInFlow() const { return in_flow_; }
+ const BlobStats& GetOutFlow() const { return out_flow_; }
+
+ bool IsValid() const {
+ return in_flow_.GetCount() >= out_flow_.GetCount() &&
+ in_flow_.GetBytes() >= out_flow_.GetBytes();
+ }
+ bool HasGarbage() const {
+ assert(IsValid());
+ return in_flow_.GetCount() > out_flow_.GetCount();
+ }
+ uint64_t GetGarbageCount() const {
+ assert(IsValid());
+ assert(HasGarbage());
+ return in_flow_.GetCount() - out_flow_.GetCount();
+ }
+ uint64_t GetGarbageBytes() const {
+ assert(IsValid());
+ assert(HasGarbage());
+ return in_flow_.GetBytes() - out_flow_.GetBytes();
+ }
+
+ private:
+ BlobStats in_flow_;
+ BlobStats out_flow_;
+ };
+
+ Status ProcessInFlow(const Slice& key, const Slice& value);
+ Status ProcessOutFlow(const Slice& key, const Slice& value);
+
+ const std::unordered_map<uint64_t, BlobInOutFlow>& flows() const {
+ return flows_;
+ }
+
+ private:
+ static Status Parse(const Slice& key, const Slice& value,
+ uint64_t* blob_file_number, uint64_t* bytes);
+
+ std::unordered_map<uint64_t, BlobInOutFlow> flows_;
+};
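+
+// Illustrative usage sketch (not prescriptive; where ProcessInFlow and
+// ProcessOutFlow are called is up to the caller, e.g. a compaction):
+//
+//   BlobGarbageMeter meter;
+//   // ... call meter.ProcessInFlow(key, value) for every input key/value and
+//   // meter.ProcessOutFlow(key, value) for every output key/value ...
+//   for (const auto& pair : meter.flows()) {
+//     const BlobGarbageMeter::BlobInOutFlow& flow = pair.second;
+//     if (flow.HasGarbage()) {
+//       // pair.first is the blob file number; GetGarbageCount() and
+//       // GetGarbageBytes() give the additional garbage for that file.
+//     }
+//   }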
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_garbage_meter_test.cc b/src/rocksdb/db/blob/blob_garbage_meter_test.cc
new file mode 100644
index 000000000..ba53f06f1
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_garbage_meter_test.cc
@@ -0,0 +1,197 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/blob/blob_garbage_meter.h"
+
+#include <string>
+#include <vector>
+
+#include "db/blob/blob_index.h"
+#include "db/blob/blob_log_format.h"
+#include "db/dbformat.h"
+#include "test_util/testharness.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+TEST(BlobGarbageMeterTest, MeasureGarbage) {
+ BlobGarbageMeter blob_garbage_meter;
+
+ struct BlobDescriptor {
+ std::string user_key;
+ uint64_t blob_file_number;
+ uint64_t offset;
+ uint64_t size;
+ CompressionType compression_type;
+ bool has_in_flow;
+ bool has_out_flow;
+
+ uint64_t GetExpectedBytes() const {
+ return size +
+ BlobLogRecord::CalculateAdjustmentForRecordHeader(user_key.size());
+ }
+ };
+
+ // Note: blob file 4 has the same inflow and outflow and hence no additional
+ // garbage. Blob file 5 has less outflow than inflow and thus it does have
+ // additional garbage. Blob file 6 is a newly written file (i.e. no inflow,
+ // only outflow) and is thus not tracked by the meter.
+ std::vector<BlobDescriptor> blobs{
+ {"key", 4, 1234, 555, kLZ4Compression, true, true},
+ {"other_key", 4, 6789, 101010, kLZ4Compression, true, true},
+ {"yet_another_key", 5, 22222, 3456, kLZ4Compression, true, true},
+ {"foo_key", 5, 77777, 8888, kLZ4Compression, true, true},
+ {"bar_key", 5, 999999, 1212, kLZ4Compression, true, false},
+ {"baz_key", 5, 1234567, 890, kLZ4Compression, true, false},
+ {"new_key", 6, 7777, 9999, kNoCompression, false, true}};
+
+ for (const auto& blob : blobs) {
+ constexpr SequenceNumber seq = 123;
+ const InternalKey key(blob.user_key, seq, kTypeBlobIndex);
+ const Slice key_slice = key.Encode();
+
+ std::string value;
+ BlobIndex::EncodeBlob(&value, blob.blob_file_number, blob.offset, blob.size,
+ blob.compression_type);
+ const Slice value_slice(value);
+
+ if (blob.has_in_flow) {
+ ASSERT_OK(blob_garbage_meter.ProcessInFlow(key_slice, value_slice));
+ }
+ if (blob.has_out_flow) {
+ ASSERT_OK(blob_garbage_meter.ProcessOutFlow(key_slice, value_slice));
+ }
+ }
+
+ const auto& flows = blob_garbage_meter.flows();
+ ASSERT_EQ(flows.size(), 2);
+
+ {
+ const auto it = flows.find(4);
+ ASSERT_NE(it, flows.end());
+
+ const auto& flow = it->second;
+
+ constexpr uint64_t expected_count = 2;
+ const uint64_t expected_bytes =
+ blobs[0].GetExpectedBytes() + blobs[1].GetExpectedBytes();
+
+ const auto& in = flow.GetInFlow();
+ ASSERT_EQ(in.GetCount(), expected_count);
+ ASSERT_EQ(in.GetBytes(), expected_bytes);
+
+ const auto& out = flow.GetOutFlow();
+ ASSERT_EQ(out.GetCount(), expected_count);
+ ASSERT_EQ(out.GetBytes(), expected_bytes);
+
+ ASSERT_TRUE(flow.IsValid());
+ ASSERT_FALSE(flow.HasGarbage());
+ }
+
+ {
+ const auto it = flows.find(5);
+ ASSERT_NE(it, flows.end());
+
+ const auto& flow = it->second;
+
+ const auto& in = flow.GetInFlow();
+
+ constexpr uint64_t expected_in_count = 4;
+ const uint64_t expected_in_bytes =
+ blobs[2].GetExpectedBytes() + blobs[3].GetExpectedBytes() +
+ blobs[4].GetExpectedBytes() + blobs[5].GetExpectedBytes();
+
+ ASSERT_EQ(in.GetCount(), expected_in_count);
+ ASSERT_EQ(in.GetBytes(), expected_in_bytes);
+
+ const auto& out = flow.GetOutFlow();
+
+ constexpr uint64_t expected_out_count = 2;
+ const uint64_t expected_out_bytes =
+ blobs[2].GetExpectedBytes() + blobs[3].GetExpectedBytes();
+
+ ASSERT_EQ(out.GetCount(), expected_out_count);
+ ASSERT_EQ(out.GetBytes(), expected_out_bytes);
+
+ ASSERT_TRUE(flow.IsValid());
+ ASSERT_TRUE(flow.HasGarbage());
+ ASSERT_EQ(flow.GetGarbageCount(), expected_in_count - expected_out_count);
+ ASSERT_EQ(flow.GetGarbageBytes(), expected_in_bytes - expected_out_bytes);
+ }
+}
+
+TEST(BlobGarbageMeterTest, PlainValue) {
+ constexpr char user_key[] = "user_key";
+ constexpr SequenceNumber seq = 123;
+
+ const InternalKey key(user_key, seq, kTypeValue);
+ const Slice key_slice = key.Encode();
+
+ constexpr char value[] = "value";
+ const Slice value_slice(value);
+
+ BlobGarbageMeter blob_garbage_meter;
+
+ ASSERT_OK(blob_garbage_meter.ProcessInFlow(key_slice, value_slice));
+ ASSERT_OK(blob_garbage_meter.ProcessOutFlow(key_slice, value_slice));
+ ASSERT_TRUE(blob_garbage_meter.flows().empty());
+}
+
+TEST(BlobGarbageMeterTest, CorruptInternalKey) {
+ constexpr char corrupt_key[] = "i_am_corrupt";
+ const Slice key_slice(corrupt_key);
+
+ constexpr char value[] = "value";
+ const Slice value_slice(value);
+
+ BlobGarbageMeter blob_garbage_meter;
+
+ ASSERT_NOK(blob_garbage_meter.ProcessInFlow(key_slice, value_slice));
+ ASSERT_NOK(blob_garbage_meter.ProcessOutFlow(key_slice, value_slice));
+}
+
+TEST(BlobGarbageMeterTest, CorruptBlobIndex) {
+ constexpr char user_key[] = "user_key";
+ constexpr SequenceNumber seq = 123;
+
+ const InternalKey key(user_key, seq, kTypeBlobIndex);
+ const Slice key_slice = key.Encode();
+
+ constexpr char value[] = "i_am_not_a_blob_index";
+ const Slice value_slice(value);
+
+ BlobGarbageMeter blob_garbage_meter;
+
+ ASSERT_NOK(blob_garbage_meter.ProcessInFlow(key_slice, value_slice));
+ ASSERT_NOK(blob_garbage_meter.ProcessOutFlow(key_slice, value_slice));
+}
+
+TEST(BlobGarbageMeterTest, InlinedTTLBlobIndex) {
+ constexpr char user_key[] = "user_key";
+ constexpr SequenceNumber seq = 123;
+
+ const InternalKey key(user_key, seq, kTypeBlobIndex);
+ const Slice key_slice = key.Encode();
+
+ constexpr uint64_t expiration = 1234567890;
+ constexpr char inlined_value[] = "inlined";
+
+ std::string value;
+ BlobIndex::EncodeInlinedTTL(&value, expiration, inlined_value);
+
+ const Slice value_slice(value);
+
+ BlobGarbageMeter blob_garbage_meter;
+
+ ASSERT_NOK(blob_garbage_meter.ProcessInFlow(key_slice, value_slice));
+ ASSERT_NOK(blob_garbage_meter.ProcessOutFlow(key_slice, value_slice));
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/blob/blob_index.h b/src/rocksdb/db/blob/blob_index.h
new file mode 100644
index 000000000..e9944d784
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_index.h
@@ -0,0 +1,187 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include <sstream>
+#include <string>
+
+#include "rocksdb/compression_type.h"
+#include "util/coding.h"
+#include "util/compression.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// BlobIndex is a pointer to a blob together with the blob's metadata. The
+// index is stored in the base DB as ValueType::kTypeBlobIndex.
+// There are three types of blob index:
+//
+// kInlinedTTL:
+// +------+------------+---------------+
+// | type | expiration | value |
+// +------+------------+---------------+
+// | char | varint64 | variable size |
+// +------+------------+---------------+
+//
+// kBlob:
+// +------+-------------+----------+----------+-------------+
+// | type | file number | offset | size | compression |
+// +------+-------------+----------+----------+-------------+
+// | char | varint64 | varint64 | varint64 | char |
+// +------+-------------+----------+----------+-------------+
+//
+// kBlobTTL:
+// +------+------------+-------------+----------+----------+-------------+
+// | type | expiration | file number | offset | size | compression |
+// +------+------------+-------------+----------+----------+-------------+
+// | char | varint64 | varint64 | varint64 | varint64 | char |
+// +------+------------+-------------+----------+----------+-------------+
+//
+// There isn't a kInlined (without TTL) type since we can store it as a plain
+// value (i.e. ValueType::kTypeValue).
+class BlobIndex {
+ public:
+ enum class Type : unsigned char {
+ kInlinedTTL = 0,
+ kBlob = 1,
+ kBlobTTL = 2,
+ kUnknown = 3,
+ };
+
+ BlobIndex() : type_(Type::kUnknown) {}
+
+ BlobIndex(const BlobIndex&) = default;
+ BlobIndex& operator=(const BlobIndex&) = default;
+
+ bool IsInlined() const { return type_ == Type::kInlinedTTL; }
+
+ bool HasTTL() const {
+ return type_ == Type::kInlinedTTL || type_ == Type::kBlobTTL;
+ }
+
+ uint64_t expiration() const {
+ assert(HasTTL());
+ return expiration_;
+ }
+
+ const Slice& value() const {
+ assert(IsInlined());
+ return value_;
+ }
+
+ uint64_t file_number() const {
+ assert(!IsInlined());
+ return file_number_;
+ }
+
+ uint64_t offset() const {
+ assert(!IsInlined());
+ return offset_;
+ }
+
+ uint64_t size() const {
+ assert(!IsInlined());
+ return size_;
+ }
+
+ CompressionType compression() const {
+ assert(!IsInlined());
+ return compression_;
+ }
+
+ Status DecodeFrom(Slice slice) {
+ const char* kErrorMessage = "Error while decoding blob index";
+ assert(slice.size() > 0);
+ type_ = static_cast<Type>(*slice.data());
+ if (type_ >= Type::kUnknown) {
+ return Status::Corruption(kErrorMessage,
+ "Unknown blob index type: " +
+ std::to_string(static_cast<char>(type_)));
+ }
+ slice = Slice(slice.data() + 1, slice.size() - 1);
+ if (HasTTL()) {
+ if (!GetVarint64(&slice, &expiration_)) {
+ return Status::Corruption(kErrorMessage, "Corrupted expiration");
+ }
+ }
+ if (IsInlined()) {
+ value_ = slice;
+ } else {
+ if (GetVarint64(&slice, &file_number_) && GetVarint64(&slice, &offset_) &&
+ GetVarint64(&slice, &size_) && slice.size() == 1) {
+ compression_ = static_cast<CompressionType>(*slice.data());
+ } else {
+ return Status::Corruption(kErrorMessage, "Corrupted blob offset");
+ }
+ }
+ return Status::OK();
+ }
+
+ std::string DebugString(bool output_hex) const {
+ std::ostringstream oss;
+
+ if (IsInlined()) {
+ oss << "[inlined blob] value:" << value_.ToString(output_hex);
+ } else {
+ oss << "[blob ref] file:" << file_number_ << " offset:" << offset_
+ << " size:" << size_
+ << " compression: " << CompressionTypeToString(compression_);
+ }
+
+ if (HasTTL()) {
+ oss << " exp:" << expiration_;
+ }
+
+ return oss.str();
+ }
+
+ static void EncodeInlinedTTL(std::string* dst, uint64_t expiration,
+ const Slice& value) {
+ assert(dst != nullptr);
+ dst->clear();
+ dst->reserve(1 + kMaxVarint64Length + value.size());
+ dst->push_back(static_cast<char>(Type::kInlinedTTL));
+ PutVarint64(dst, expiration);
+ dst->append(value.data(), value.size());
+ }
+
+ static void EncodeBlob(std::string* dst, uint64_t file_number,
+ uint64_t offset, uint64_t size,
+ CompressionType compression) {
+ assert(dst != nullptr);
+ dst->clear();
+ dst->reserve(kMaxVarint64Length * 3 + 2);
+ dst->push_back(static_cast<char>(Type::kBlob));
+ PutVarint64(dst, file_number);
+ PutVarint64(dst, offset);
+ PutVarint64(dst, size);
+ dst->push_back(static_cast<char>(compression));
+ }
+
+ static void EncodeBlobTTL(std::string* dst, uint64_t expiration,
+ uint64_t file_number, uint64_t offset,
+ uint64_t size, CompressionType compression) {
+ assert(dst != nullptr);
+ dst->clear();
+ dst->reserve(kMaxVarint64Length * 4 + 2);
+ dst->push_back(static_cast<char>(Type::kBlobTTL));
+ PutVarint64(dst, expiration);
+ PutVarint64(dst, file_number);
+ PutVarint64(dst, offset);
+ PutVarint64(dst, size);
+ dst->push_back(static_cast<char>(compression));
+ }
+
+ private:
+ Type type_ = Type::kUnknown;
+ uint64_t expiration_ = 0;
+ Slice value_;
+ uint64_t file_number_ = 0;
+ uint64_t offset_ = 0;
+ uint64_t size_ = 0;
+ CompressionType compression_ = kNoCompression;
+};
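+
+// Illustrative round-trip sketch (the numeric values are arbitrary examples):
+//
+//   std::string encoded;
+//   BlobIndex::EncodeBlob(&encoded, /* file_number */ 4, /* offset */ 1234,
+//                         /* size */ 555, kLZ4Compression);
+//
+//   BlobIndex decoded;
+//   const Status s = decoded.DecodeFrom(encoded);
+//   assert(s.ok());
+//   assert(!decoded.IsInlined() && !decoded.HasTTL());
+//   assert(decoded.file_number() == 4 && decoded.offset() == 1234 &&
+//          decoded.size() == 555);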
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_log_format.cc b/src/rocksdb/db/blob/blob_log_format.cc
new file mode 100644
index 000000000..8e26281e3
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_log_format.cc
@@ -0,0 +1,143 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+
+#include "db/blob/blob_log_format.h"
+
+#include "util/coding.h"
+#include "util/crc32c.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+void BlobLogHeader::EncodeTo(std::string* dst) {
+ assert(dst != nullptr);
+ dst->clear();
+ dst->reserve(BlobLogHeader::kSize);
+ PutFixed32(dst, kMagicNumber);
+ PutFixed32(dst, version);
+ PutFixed32(dst, column_family_id);
+ unsigned char flags = (has_ttl ? 1 : 0);
+ dst->push_back(flags);
+ dst->push_back(compression);
+ PutFixed64(dst, expiration_range.first);
+ PutFixed64(dst, expiration_range.second);
+}
+
+Status BlobLogHeader::DecodeFrom(Slice src) {
+ const char* kErrorMessage = "Error while decoding blob log header";
+ if (src.size() != BlobLogHeader::kSize) {
+ return Status::Corruption(kErrorMessage,
+ "Unexpected blob file header size");
+ }
+ uint32_t magic_number;
+ unsigned char flags;
+ if (!GetFixed32(&src, &magic_number) || !GetFixed32(&src, &version) ||
+ !GetFixed32(&src, &column_family_id)) {
+ return Status::Corruption(
+ kErrorMessage,
+ "Error decoding magic number, version and column family id");
+ }
+ if (magic_number != kMagicNumber) {
+ return Status::Corruption(kErrorMessage, "Magic number mismatch");
+ }
+ if (version != kVersion1) {
+ return Status::Corruption(kErrorMessage, "Unknown header version");
+ }
+ flags = src.data()[0];
+ compression = static_cast<CompressionType>(src.data()[1]);
+ has_ttl = (flags & 1) == 1;
+ src.remove_prefix(2);
+ if (!GetFixed64(&src, &expiration_range.first) ||
+ !GetFixed64(&src, &expiration_range.second)) {
+ return Status::Corruption(kErrorMessage, "Error decoding expiration range");
+ }
+ return Status::OK();
+}
+
+void BlobLogFooter::EncodeTo(std::string* dst) {
+ assert(dst != nullptr);
+ dst->clear();
+ dst->reserve(BlobLogFooter::kSize);
+ PutFixed32(dst, kMagicNumber);
+ PutFixed64(dst, blob_count);
+ PutFixed64(dst, expiration_range.first);
+ PutFixed64(dst, expiration_range.second);
+ crc = crc32c::Value(dst->c_str(), dst->size());
+ crc = crc32c::Mask(crc);
+ PutFixed32(dst, crc);
+}
+
+Status BlobLogFooter::DecodeFrom(Slice src) {
+ const char* kErrorMessage = "Error while decoding blob log footer";
+ if (src.size() != BlobLogFooter::kSize) {
+ return Status::Corruption(kErrorMessage,
+ "Unexpected blob file footer size");
+ }
+ uint32_t src_crc = 0;
+ src_crc = crc32c::Value(src.data(), BlobLogFooter::kSize - sizeof(uint32_t));
+ src_crc = crc32c::Mask(src_crc);
+ uint32_t magic_number = 0;
+ if (!GetFixed32(&src, &magic_number) || !GetFixed64(&src, &blob_count) ||
+ !GetFixed64(&src, &expiration_range.first) ||
+ !GetFixed64(&src, &expiration_range.second) || !GetFixed32(&src, &crc)) {
+ return Status::Corruption(kErrorMessage, "Error decoding content");
+ }
+ if (magic_number != kMagicNumber) {
+ return Status::Corruption(kErrorMessage, "Magic number mismatch");
+ }
+ if (src_crc != crc) {
+ return Status::Corruption(kErrorMessage, "CRC mismatch");
+ }
+ return Status::OK();
+}
+
+void BlobLogRecord::EncodeHeaderTo(std::string* dst) {
+ assert(dst != nullptr);
+ dst->clear();
+ dst->reserve(BlobLogRecord::kHeaderSize + key.size() + value.size());
+ PutFixed64(dst, key.size());
+ PutFixed64(dst, value.size());
+ PutFixed64(dst, expiration);
+ header_crc = crc32c::Value(dst->c_str(), dst->size());
+ header_crc = crc32c::Mask(header_crc);
+ PutFixed32(dst, header_crc);
+ blob_crc = crc32c::Value(key.data(), key.size());
+ blob_crc = crc32c::Extend(blob_crc, value.data(), value.size());
+ blob_crc = crc32c::Mask(blob_crc);
+ PutFixed32(dst, blob_crc);
+}
+
+Status BlobLogRecord::DecodeHeaderFrom(Slice src) {
+ const char* kErrorMessage = "Error while decoding blob record";
+ if (src.size() != BlobLogRecord::kHeaderSize) {
+ return Status::Corruption(kErrorMessage,
+ "Unexpected blob record header size");
+ }
+ uint32_t src_crc = 0;
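+  // The header CRC covers only the key size, value size and expiration fields,
+  // i.e. everything except the two trailing CRC fields (2 * 4 = 8 bytes).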
+ src_crc = crc32c::Value(src.data(), BlobLogRecord::kHeaderSize - 8);
+ src_crc = crc32c::Mask(src_crc);
+ if (!GetFixed64(&src, &key_size) || !GetFixed64(&src, &value_size) ||
+ !GetFixed64(&src, &expiration) || !GetFixed32(&src, &header_crc) ||
+ !GetFixed32(&src, &blob_crc)) {
+ return Status::Corruption(kErrorMessage, "Error decoding content");
+ }
+ if (src_crc != header_crc) {
+ return Status::Corruption(kErrorMessage, "Header CRC mismatch");
+ }
+ return Status::OK();
+}
+
+Status BlobLogRecord::CheckBlobCRC() const {
+ uint32_t expected_crc = 0;
+ expected_crc = crc32c::Value(key.data(), key.size());
+ expected_crc = crc32c::Extend(expected_crc, value.data(), value.size());
+ expected_crc = crc32c::Mask(expected_crc);
+ if (expected_crc != blob_crc) {
+ return Status::Corruption("Blob CRC mismatch");
+ }
+ return Status::OK();
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_log_format.h b/src/rocksdb/db/blob/blob_log_format.h
new file mode 100644
index 000000000..607db2367
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_log_format.h
@@ -0,0 +1,164 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Log format information shared by reader and writer.
+
+#pragma once
+
+#include <memory>
+#include <utility>
+
+#include "rocksdb/options.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+#include "rocksdb/types.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+constexpr uint32_t kMagicNumber = 2395959; // 0x00248f37
+constexpr uint32_t kVersion1 = 1;
+
+using ExpirationRange = std::pair<uint64_t, uint64_t>;
+
+// clang-format off
+
+// Format of blob log file header (30 bytes):
+//
+// +--------------+---------+---------+-------+-------------+-------------------+
+// | magic number | version | cf id | flags | compression | expiration range |
+// +--------------+---------+---------+-------+-------------+-------------------+
+// | Fixed32 | Fixed32 | Fixed32 | char | char | Fixed64 Fixed64 |
+// +--------------+---------+---------+-------+-------------+-------------------+
+//
+// List of flags:
+// has_ttl: Whether the file contains TTL data.
+//
+// The expiration range in the header is a rough range based on
+// blob_db_options.ttl_range_secs.
+
+// clang-format on
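+//
+// (Sanity check: the fixed fields add up to 4 + 4 + 4 + 1 + 1 + 8 + 8 = 30
+// bytes, matching BlobLogHeader::kSize below.)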
+
+struct BlobLogHeader {
+ static constexpr size_t kSize = 30;
+
+ BlobLogHeader() = default;
+ BlobLogHeader(uint32_t _column_family_id, CompressionType _compression,
+ bool _has_ttl, const ExpirationRange& _expiration_range)
+ : column_family_id(_column_family_id),
+ compression(_compression),
+ has_ttl(_has_ttl),
+ expiration_range(_expiration_range) {}
+
+ uint32_t version = kVersion1;
+ uint32_t column_family_id = 0;
+ CompressionType compression = kNoCompression;
+ bool has_ttl = false;
+ ExpirationRange expiration_range;
+
+ void EncodeTo(std::string* dst);
+
+ Status DecodeFrom(Slice slice);
+};
+
+// clang-format off
+
+// Format of blob log file footer (32 bytes):
+//
+// +--------------+------------+-------------------+------------+
+// | magic number | blob count | expiration range | footer CRC |
+// +--------------+------------+-------------------+------------+
+// | Fixed32 | Fixed64 | Fixed64 + Fixed64 | Fixed32 |
+// +--------------+------------+-------------------+------------+
+//
+// The footer is only present when the blob file has been properly closed.
+//
+// Unlike the same field in the file header, the expiration range in the footer
+// is the exact range spanning the smallest and largest expirations of the data
+// in this file.
+
+// clang-format on
+
+struct BlobLogFooter {
+ static constexpr size_t kSize = 32;
+
+ uint64_t blob_count = 0;
+ ExpirationRange expiration_range = std::make_pair(0, 0);
+ uint32_t crc = 0;
+
+ void EncodeTo(std::string* dst);
+
+ Status DecodeFrom(Slice slice);
+};
+
+// clang-format off
+
+// Blob record format (32 bytes header + key + value):
+//
+// +------------+--------------+------------+------------+----------+---------+-----------+
+// | key length | value length | expiration | header CRC | blob CRC | key | value |
+// +------------+--------------+------------+------------+----------+---------+-----------+
+// | Fixed64 | Fixed64 | Fixed64 | Fixed32 | Fixed32 | key len | value len |
+// +------------+--------------+------------+------------+----------+---------+-----------+
+//
+// If the file has has_ttl = false, the expiration field is always 0 and the
+// blob has no expiration.
+//
+// Also note that if compression is used, the value field holds the compressed
+// value, and the value length is the length of the compressed value.
+//
+// Header CRC is the checksum of (key_len + val_len + expiration), while
+// blob CRC is the checksum of (key + value).
+//
+// We could use variable length encoding (Varint64) to save more space, but it
+// would make the reader more complicated.
+
+// clang-format on
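+//
+// (Sanity check: the fixed header fields add up to 8 + 8 + 8 + 4 + 4 = 32
+// bytes, matching BlobLogRecord::kHeaderSize below; key and value follow the
+// header.)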
+
+struct BlobLogRecord {
+  // The header includes the fields up to and including the blob CRC.
+ static constexpr size_t kHeaderSize = 32;
+
+ // Note that the offset field of BlobIndex actually points to the blob value
+ // as opposed to the start of the blob record. The following method can
+ // be used to calculate the adjustment needed to read the blob record header.
+ static constexpr uint64_t CalculateAdjustmentForRecordHeader(
+ uint64_t key_size) {
+ return key_size + kHeaderSize;
+ }
+
+ uint64_t key_size = 0;
+ uint64_t value_size = 0;
+ uint64_t expiration = 0;
+ uint32_t header_crc = 0;
+ uint32_t blob_crc = 0;
+ Slice key;
+ Slice value;
+ std::unique_ptr<char[]> key_buf;
+ std::unique_ptr<char[]> value_buf;
+
+ uint64_t record_size() const { return kHeaderSize + key_size + value_size; }
+
+ void EncodeHeaderTo(std::string* dst);
+
+ Status DecodeHeaderFrom(Slice src);
+
+ Status CheckBlobCRC() const;
+};
+
+// Checks whether a blob offset is potentially valid or not.
+inline bool IsValidBlobOffset(uint64_t value_offset, uint64_t key_size,
+ uint64_t value_size, uint64_t file_size) {
+ if (value_offset <
+ BlobLogHeader::kSize + BlobLogRecord::kHeaderSize + key_size) {
+ return false;
+ }
+
+ if (value_offset + value_size + BlobLogFooter::kSize > file_size) {
+ return false;
+ }
+
+ return true;
+}
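+
+// For example, with the fixed sizes above (30-byte file header, 32-byte record
+// header, 32-byte file footer), a blob with a 3-byte key cannot start before
+// offset 30 + 32 + 3 = 65, and there must be at least 32 bytes left after the
+// blob for the footer.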
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_log_sequential_reader.cc b/src/rocksdb/db/blob/blob_log_sequential_reader.cc
new file mode 100644
index 000000000..778725189
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_log_sequential_reader.cc
@@ -0,0 +1,134 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+
+#include "db/blob/blob_log_sequential_reader.h"
+
+#include "file/random_access_file_reader.h"
+#include "monitoring/statistics.h"
+#include "util/stop_watch.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+BlobLogSequentialReader::BlobLogSequentialReader(
+ std::unique_ptr<RandomAccessFileReader>&& file_reader, SystemClock* clock,
+ Statistics* statistics)
+ : file_(std::move(file_reader)),
+ clock_(clock),
+ statistics_(statistics),
+ next_byte_(0) {}
+
+BlobLogSequentialReader::~BlobLogSequentialReader() = default;
+
+Status BlobLogSequentialReader::ReadSlice(uint64_t size, Slice* slice,
+ char* buf) {
+ assert(slice);
+ assert(file_);
+
+ StopWatch read_sw(clock_, statistics_, BLOB_DB_BLOB_FILE_READ_MICROS);
+ // TODO: rate limit `BlobLogSequentialReader` reads (it appears unused?)
+ Status s =
+ file_->Read(IOOptions(), next_byte_, static_cast<size_t>(size), slice,
+ buf, nullptr, Env::IO_TOTAL /* rate_limiter_priority */);
+ next_byte_ += size;
+ if (!s.ok()) {
+ return s;
+ }
+ RecordTick(statistics_, BLOB_DB_BLOB_FILE_BYTES_READ, slice->size());
+ if (slice->size() != size) {
+ return Status::Corruption("EOF reached while reading record");
+ }
+ return s;
+}
+
+Status BlobLogSequentialReader::ReadHeader(BlobLogHeader* header) {
+ assert(header);
+ assert(next_byte_ == 0);
+
+ static_assert(BlobLogHeader::kSize <= sizeof(header_buf_),
+ "Buffer is smaller than BlobLogHeader::kSize");
+
+ Status s = ReadSlice(BlobLogHeader::kSize, &buffer_, header_buf_);
+ if (!s.ok()) {
+ return s;
+ }
+
+ if (buffer_.size() != BlobLogHeader::kSize) {
+ return Status::Corruption("EOF reached before file header");
+ }
+
+ return header->DecodeFrom(buffer_);
+}
+
+Status BlobLogSequentialReader::ReadRecord(BlobLogRecord* record,
+ ReadLevel level,
+ uint64_t* blob_offset) {
+ assert(record);
+ static_assert(BlobLogRecord::kHeaderSize <= sizeof(header_buf_),
+ "Buffer is smaller than BlobLogRecord::kHeaderSize");
+
+ Status s = ReadSlice(BlobLogRecord::kHeaderSize, &buffer_, header_buf_);
+ if (!s.ok()) {
+ return s;
+ }
+ if (buffer_.size() != BlobLogRecord::kHeaderSize) {
+ return Status::Corruption("EOF reached before record header");
+ }
+
+ s = record->DecodeHeaderFrom(buffer_);
+ if (!s.ok()) {
+ return s;
+ }
+
+ uint64_t kb_size = record->key_size + record->value_size;
+ if (blob_offset != nullptr) {
+ *blob_offset = next_byte_ + record->key_size;
+ }
+
+ switch (level) {
+ case kReadHeader:
+ next_byte_ += kb_size;
+ break;
+
+ case kReadHeaderKey:
+ record->key_buf.reset(new char[record->key_size]);
+ s = ReadSlice(record->key_size, &record->key, record->key_buf.get());
+ next_byte_ += record->value_size;
+ break;
+
+ case kReadHeaderKeyBlob:
+ record->key_buf.reset(new char[record->key_size]);
+ s = ReadSlice(record->key_size, &record->key, record->key_buf.get());
+ if (s.ok()) {
+ record->value_buf.reset(new char[record->value_size]);
+ s = ReadSlice(record->value_size, &record->value,
+ record->value_buf.get());
+ }
+ if (s.ok()) {
+ s = record->CheckBlobCRC();
+ }
+ break;
+ }
+ return s;
+}
+
+Status BlobLogSequentialReader::ReadFooter(BlobLogFooter* footer) {
+ assert(footer);
+ static_assert(BlobLogFooter::kSize <= sizeof(header_buf_),
+ "Buffer is smaller than BlobLogFooter::kSize");
+
+ Status s = ReadSlice(BlobLogFooter::kSize, &buffer_, header_buf_);
+ if (!s.ok()) {
+ return s;
+ }
+
+ if (buffer_.size() != BlobLogFooter::kSize) {
+ return Status::Corruption("EOF reached before file footer");
+ }
+
+ return footer->DecodeFrom(buffer_);
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_log_sequential_reader.h b/src/rocksdb/db/blob/blob_log_sequential_reader.h
new file mode 100644
index 000000000..98afa8518
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_log_sequential_reader.h
@@ -0,0 +1,83 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#pragma once
+
+#include <memory>
+
+#include "db/blob/blob_log_format.h"
+#include "rocksdb/slice.h"
+
+#define MAX_HEADER_SIZE(a, b, c) (a > b ? (a > c ? a : c) : (b > c ? b : c))
+
+namespace ROCKSDB_NAMESPACE {
+
+class RandomAccessFileReader;
+class Env;
+class Statistics;
+class Status;
+class SystemClock;
+
+/**
+ * BlobLogSequentialReader is a general purpose log stream reader
+ * implementation. The actual job of reading from the device is implemented by
+ * the RandomAccessFileReader interface.
+ *
+ * Please see BlobLogWriter for details on the file and record layout.
+ */
+
+class BlobLogSequentialReader {
+ public:
+ enum ReadLevel {
+ kReadHeader,
+ kReadHeaderKey,
+ kReadHeaderKeyBlob,
+ };
+
+ // Create a reader that will return log records from "*file_reader".
+ BlobLogSequentialReader(std::unique_ptr<RandomAccessFileReader>&& file_reader,
+ SystemClock* clock, Statistics* statistics);
+
+ // No copying allowed
+ BlobLogSequentialReader(const BlobLogSequentialReader&) = delete;
+ BlobLogSequentialReader& operator=(const BlobLogSequentialReader&) = delete;
+
+ ~BlobLogSequentialReader();
+
+ Status ReadHeader(BlobLogHeader* header);
+
+  // Read the next record into *record. Returns OK if the record was read
+  // successfully, and a non-OK status (e.g. Corruption upon hitting the end of
+  // the input) otherwise. The contents filled in *record will only be valid
+  // until the next mutating operation on this reader.
+  // If blob_offset is non-null, the offset of the blob is returned through it.
+ Status ReadRecord(BlobLogRecord* record, ReadLevel level = kReadHeader,
+ uint64_t* blob_offset = nullptr);
+
+ Status ReadFooter(BlobLogFooter* footer);
+
+ void ResetNextByte() { next_byte_ = 0; }
+
+ uint64_t GetNextByte() const { return next_byte_; }
+
+ private:
+ Status ReadSlice(uint64_t size, Slice* slice, char* buf);
+
+ const std::unique_ptr<RandomAccessFileReader> file_;
+ SystemClock* clock_;
+
+ Statistics* statistics_;
+
+ Slice buffer_;
+ char header_buf_[MAX_HEADER_SIZE(BlobLogHeader::kSize, BlobLogFooter::kSize,
+ BlobLogRecord::kHeaderSize)];
+
+ // which byte to read next
+ uint64_t next_byte_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
+
+#undef MAX_HEADER_SIZE
\ No newline at end of file
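A minimal usage sketch for the reader declared above, assuming the caller has already constructed a BlobLogSequentialReader over a blob file and knows the record count (for example, from the blob file's metadata). DumpBlobFile and num_records are illustrative names, not part of this patch.

Status DumpBlobFile(BlobLogSequentialReader* reader, uint64_t num_records) {
  BlobLogHeader header;
  Status s = reader->ReadHeader(&header);

  for (uint64_t i = 0; s.ok() && i < num_records; ++i) {
    BlobLogRecord record;
    uint64_t blob_offset = 0;
    // kReadHeaderKeyBlob reads the record header, the key and the blob
    // itself, and verifies the blob checksum.
    s = reader->ReadRecord(&record,
                           BlobLogSequentialReader::kReadHeaderKeyBlob,
                           &blob_offset);
  }

  if (s.ok()) {
    BlobLogFooter footer;
    s = reader->ReadFooter(&footer);
  }
  return s;
}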
diff --git a/src/rocksdb/db/blob/blob_log_writer.cc b/src/rocksdb/db/blob/blob_log_writer.cc
new file mode 100644
index 000000000..9dbac7f25
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_log_writer.cc
@@ -0,0 +1,178 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/blob/blob_log_writer.h"
+
+#include <cstdint>
+#include <string>
+
+#include "db/blob/blob_log_format.h"
+#include "file/writable_file_writer.h"
+#include "monitoring/statistics.h"
+#include "rocksdb/system_clock.h"
+#include "test_util/sync_point.h"
+#include "util/coding.h"
+#include "util/stop_watch.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+BlobLogWriter::BlobLogWriter(std::unique_ptr<WritableFileWriter>&& dest,
+ SystemClock* clock, Statistics* statistics,
+ uint64_t log_number, bool use_fs, bool do_flush,
+ uint64_t boffset)
+ : dest_(std::move(dest)),
+ clock_(clock),
+ statistics_(statistics),
+ log_number_(log_number),
+ block_offset_(boffset),
+ use_fsync_(use_fs),
+ do_flush_(do_flush),
+ last_elem_type_(kEtNone) {}
+
+BlobLogWriter::~BlobLogWriter() = default;
+
+Status BlobLogWriter::Sync() {
+ TEST_SYNC_POINT("BlobLogWriter::Sync");
+
+ StopWatch sync_sw(clock_, statistics_, BLOB_DB_BLOB_FILE_SYNC_MICROS);
+ Status s = dest_->Sync(use_fsync_);
+ RecordTick(statistics_, BLOB_DB_BLOB_FILE_SYNCED);
+ return s;
+}
+
+Status BlobLogWriter::WriteHeader(BlobLogHeader& header) {
+ assert(block_offset_ == 0);
+ assert(last_elem_type_ == kEtNone);
+ std::string str;
+ header.EncodeTo(&str);
+
+ Status s = dest_->Append(Slice(str));
+ if (s.ok()) {
+ block_offset_ += str.size();
+ if (do_flush_) {
+ s = dest_->Flush();
+ }
+ }
+ last_elem_type_ = kEtFileHdr;
+ RecordTick(statistics_, BLOB_DB_BLOB_FILE_BYTES_WRITTEN,
+ BlobLogHeader::kSize);
+ return s;
+}
+
+Status BlobLogWriter::AppendFooter(BlobLogFooter& footer,
+ std::string* checksum_method,
+ std::string* checksum_value) {
+ assert(block_offset_ != 0);
+ assert(last_elem_type_ == kEtFileHdr || last_elem_type_ == kEtRecord);
+
+ std::string str;
+ footer.EncodeTo(&str);
+
+ Status s;
+ if (dest_->seen_error()) {
+ s.PermitUncheckedError();
+ return Status::IOError("Seen Error. Skip closing.");
+ } else {
+ s = dest_->Append(Slice(str));
+ if (s.ok()) {
+ block_offset_ += str.size();
+
+ s = Sync();
+
+ if (s.ok()) {
+ s = dest_->Close();
+
+ if (s.ok()) {
+ assert(!!checksum_method == !!checksum_value);
+
+ if (checksum_method) {
+ assert(checksum_method->empty());
+
+ std::string method = dest_->GetFileChecksumFuncName();
+ if (method != kUnknownFileChecksumFuncName) {
+ *checksum_method = std::move(method);
+ }
+ }
+ if (checksum_value) {
+ assert(checksum_value->empty());
+
+ std::string value = dest_->GetFileChecksum();
+ if (value != kUnknownFileChecksum) {
+ *checksum_value = std::move(value);
+ }
+ }
+ }
+ }
+ }
+
+ dest_.reset();
+ }
+
+ last_elem_type_ = kEtFileFooter;
+ RecordTick(statistics_, BLOB_DB_BLOB_FILE_BYTES_WRITTEN,
+ BlobLogFooter::kSize);
+ return s;
+}
+
+Status BlobLogWriter::AddRecord(const Slice& key, const Slice& val,
+ uint64_t expiration, uint64_t* key_offset,
+ uint64_t* blob_offset) {
+ assert(block_offset_ != 0);
+ assert(last_elem_type_ == kEtFileHdr || last_elem_type_ == kEtRecord);
+
+ std::string buf;
+ ConstructBlobHeader(&buf, key, val, expiration);
+
+ Status s = EmitPhysicalRecord(buf, key, val, key_offset, blob_offset);
+ return s;
+}
+
+Status BlobLogWriter::AddRecord(const Slice& key, const Slice& val,
+ uint64_t* key_offset, uint64_t* blob_offset) {
+ assert(block_offset_ != 0);
+ assert(last_elem_type_ == kEtFileHdr || last_elem_type_ == kEtRecord);
+
+ std::string buf;
+ ConstructBlobHeader(&buf, key, val, 0);
+
+ Status s = EmitPhysicalRecord(buf, key, val, key_offset, blob_offset);
+ return s;
+}
+
+void BlobLogWriter::ConstructBlobHeader(std::string* buf, const Slice& key,
+ const Slice& val, uint64_t expiration) {
+ BlobLogRecord record;
+ record.key = key;
+ record.value = val;
+ record.expiration = expiration;
+ record.EncodeHeaderTo(buf);
+}
+
+Status BlobLogWriter::EmitPhysicalRecord(const std::string& headerbuf,
+ const Slice& key, const Slice& val,
+ uint64_t* key_offset,
+ uint64_t* blob_offset) {
+ StopWatch write_sw(clock_, statistics_, BLOB_DB_BLOB_FILE_WRITE_MICROS);
+ Status s = dest_->Append(Slice(headerbuf));
+ if (s.ok()) {
+ s = dest_->Append(key);
+ }
+ if (s.ok()) {
+ s = dest_->Append(val);
+ }
+ if (do_flush_ && s.ok()) {
+ s = dest_->Flush();
+ }
+
+ *key_offset = block_offset_ + BlobLogRecord::kHeaderSize;
+ *blob_offset = *key_offset + key.size();
+ block_offset_ = *blob_offset + val.size();
+ last_elem_type_ = kEtRecord;
+ RecordTick(statistics_, BLOB_DB_BLOB_FILE_BYTES_WRITTEN,
+ BlobLogRecord::kHeaderSize + key.size() + val.size());
+ return s;
+}
+
+} // namespace ROCKSDB_NAMESPACE
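The last lines of EmitPhysicalRecord above encode the physical record layout: the key starts right after the fixed-size record header, and the blob right after the key. A standalone sketch of that arithmetic; ComputeRecordOffsets is an illustrative name, and record_header_size stands in for BlobLogRecord::kHeaderSize from blob_log_format.h.

#include <cstddef>
#include <cstdint>

struct RecordOffsets {
  uint64_t key_offset;
  uint64_t blob_offset;
  uint64_t next_record_offset;
};

RecordOffsets ComputeRecordOffsets(uint64_t record_start, size_t key_size,
                                   size_t value_size,
                                   uint64_t record_header_size) {
  RecordOffsets offsets;
  // The record header is written first, then the key, then the blob value.
  offsets.key_offset = record_start + record_header_size;
  offsets.blob_offset = offsets.key_offset + key_size;
  offsets.next_record_offset = offsets.blob_offset + value_size;
  return offsets;
}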
diff --git a/src/rocksdb/db/blob/blob_log_writer.h b/src/rocksdb/db/blob/blob_log_writer.h
new file mode 100644
index 000000000..c1f9f31ad
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_log_writer.h
@@ -0,0 +1,83 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <string>
+
+#include "db/blob/blob_log_format.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/status.h"
+#include "rocksdb/types.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class WritableFileWriter;
+class SystemClock;
+/**
+ * BlobLogWriter is the blob log stream writer. It provides an append-only
+ * abstraction for writing blob data.
+ *
+ * Look at blob_log_format.h to see the details of the record formats.
+ */
+
+class BlobLogWriter {
+ public:
+ // Create a writer that will append data to "*dest".
+ // "*dest" must be initially empty.
+ // "*dest" must remain live while this BlobLogWriter is in use.
+ BlobLogWriter(std::unique_ptr<WritableFileWriter>&& dest, SystemClock* clock,
+ Statistics* statistics, uint64_t log_number, bool use_fsync,
+ bool do_flush, uint64_t boffset = 0);
+ // No copying allowed
+ BlobLogWriter(const BlobLogWriter&) = delete;
+ BlobLogWriter& operator=(const BlobLogWriter&) = delete;
+
+ ~BlobLogWriter();
+
+ static void ConstructBlobHeader(std::string* buf, const Slice& key,
+ const Slice& val, uint64_t expiration);
+
+ Status AddRecord(const Slice& key, const Slice& val, uint64_t* key_offset,
+ uint64_t* blob_offset);
+
+ Status AddRecord(const Slice& key, const Slice& val, uint64_t expiration,
+ uint64_t* key_offset, uint64_t* blob_offset);
+
+ Status EmitPhysicalRecord(const std::string& headerbuf, const Slice& key,
+ const Slice& val, uint64_t* key_offset,
+ uint64_t* blob_offset);
+
+ Status AppendFooter(BlobLogFooter& footer, std::string* checksum_method,
+ std::string* checksum_value);
+
+ Status WriteHeader(BlobLogHeader& header);
+
+ WritableFileWriter* file() { return dest_.get(); }
+
+ const WritableFileWriter* file() const { return dest_.get(); }
+
+ uint64_t get_log_number() const { return log_number_; }
+
+ Status Sync();
+
+ private:
+ std::unique_ptr<WritableFileWriter> dest_;
+ SystemClock* clock_;
+ Statistics* statistics_;
+ uint64_t log_number_;
+ uint64_t block_offset_; // Current offset in block
+ bool use_fsync_;
+ bool do_flush_;
+
+ public:
+ enum ElemType { kEtNone, kEtFileHdr, kEtRecord, kEtFileFooter };
+ ElemType last_elem_type_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
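A condensed usage sketch of the writer declared above; the test helper WriteBlobFile in blob_source_test.cc later in this patch follows the same header/records/footer sequence. WriteSmallBlobFile is an illustrative name, and the WritableFileWriter, clock and key/value pairs are assumed to be set up by the caller with the usual headers available.

Status WriteSmallBlobFile(std::unique_ptr<WritableFileWriter>&& file,
                          SystemClock* clock, uint64_t log_number,
                          const std::vector<std::pair<Slice, Slice>>& kvs) {
  BlobLogWriter writer(std::move(file), clock, /* statistics */ nullptr,
                       log_number, /* use_fsync */ false, /* do_flush */ true);

  BlobLogHeader header(/* column_family_id */ 0, kNoCompression,
                       /* has_ttl */ false, ExpirationRange());
  Status s = writer.WriteHeader(header);

  for (const auto& kv : kvs) {
    if (!s.ok()) {
      break;
    }
    uint64_t key_offset = 0;
    uint64_t blob_offset = 0;
    s = writer.AddRecord(kv.first, kv.second, &key_offset, &blob_offset);
  }

  if (s.ok()) {
    BlobLogFooter footer;
    footer.blob_count = kvs.size();
    // AppendFooter also syncs and closes the file, and reports the file
    // checksum if one was computed.
    std::string checksum_method;
    std::string checksum_value;
    s = writer.AppendFooter(footer, &checksum_method, &checksum_value);
  }
  return s;
}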
diff --git a/src/rocksdb/db/blob/blob_read_request.h b/src/rocksdb/db/blob/blob_read_request.h
new file mode 100644
index 000000000..f9668ca2e
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_read_request.h
@@ -0,0 +1,58 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <cinttypes>
+
+#include "rocksdb/compression_type.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// A blob read request structure for use in BlobSource::MultiGetBlob and
+// BlobFileReader::MultiGetBlob.
+struct BlobReadRequest {
+ // User key to lookup the paired blob
+ const Slice* user_key = nullptr;
+
+ // File offset in bytes
+ uint64_t offset = 0;
+
+ // Length to read in bytes
+ size_t len = 0;
+
+ // Blob compression type
+ CompressionType compression = kNoCompression;
+
+ // Output parameter set by MultiGetBlob() to point to the data buffer, and
+ // the number of valid bytes
+ PinnableSlice* result = nullptr;
+
+ // Status of read
+ Status* status = nullptr;
+
+ BlobReadRequest(const Slice& _user_key, uint64_t _offset, size_t _len,
+ CompressionType _compression, PinnableSlice* _result,
+ Status* _status)
+ : user_key(&_user_key),
+ offset(_offset),
+ len(_len),
+ compression(_compression),
+ result(_result),
+ status(_status) {}
+
+ BlobReadRequest() = default;
+ BlobReadRequest(const BlobReadRequest& other) = default;
+ BlobReadRequest& operator=(const BlobReadRequest& other) = default;
+};
+
+using BlobFileReadRequests =
+ std::tuple<uint64_t /* file_number */, uint64_t /* file_size */,
+ autovector<BlobReadRequest>>;
+
+} // namespace ROCKSDB_NAMESPACE
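A small sketch of how callers assemble these types for BlobSource::MultiGetBlob: per-blob requests go into an autovector, which is then bundled with the owning file's number and size into a BlobFileReadRequests tuple. MakeSingleBlobBatch is an illustrative name; the output PinnableSlice and Status must outlive the call.

void MakeSingleBlobBatch(const Slice& user_key, uint64_t file_number,
                         uint64_t file_size, uint64_t offset, size_t len,
                         PinnableSlice* result, Status* status,
                         autovector<BlobFileReadRequests>* batches) {
  autovector<BlobReadRequest> reqs;
  reqs.emplace_back(user_key, offset, len, kNoCompression, result, status);

  // BlobFileReadRequests is the (file_number, file_size, requests) tuple
  // defined above.
  batches->emplace_back(file_number, file_size, reqs);
}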
diff --git a/src/rocksdb/db/blob/blob_source.cc b/src/rocksdb/db/blob/blob_source.cc
new file mode 100644
index 000000000..bfade2507
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_source.cc
@@ -0,0 +1,488 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/blob/blob_source.h"
+
+#include <cassert>
+#include <string>
+
+#include "cache/cache_reservation_manager.h"
+#include "cache/charged_cache.h"
+#include "db/blob/blob_contents.h"
+#include "db/blob/blob_file_reader.h"
+#include "db/blob/blob_log_format.h"
+#include "monitoring/statistics.h"
+#include "options/cf_options.h"
+#include "table/get_context.h"
+#include "table/multiget_context.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+BlobSource::BlobSource(const ImmutableOptions* immutable_options,
+ const std::string& db_id,
+ const std::string& db_session_id,
+ BlobFileCache* blob_file_cache)
+ : db_id_(db_id),
+ db_session_id_(db_session_id),
+ statistics_(immutable_options->statistics.get()),
+ blob_file_cache_(blob_file_cache),
+ blob_cache_(immutable_options->blob_cache),
+ lowest_used_cache_tier_(immutable_options->lowest_used_cache_tier) {
+#ifndef ROCKSDB_LITE
+ auto bbto =
+ immutable_options->table_factory->GetOptions<BlockBasedTableOptions>();
+ if (bbto &&
+ bbto->cache_usage_options.options_overrides.at(CacheEntryRole::kBlobCache)
+ .charged == CacheEntryRoleOptions::Decision::kEnabled) {
+ blob_cache_ = std::make_shared<ChargedCache>(immutable_options->blob_cache,
+ bbto->block_cache);
+ }
+#endif // ROCKSDB_LITE
+}
+
+BlobSource::~BlobSource() = default;
+
+Status BlobSource::GetBlobFromCache(
+ const Slice& cache_key, CacheHandleGuard<BlobContents>* cached_blob) const {
+ assert(blob_cache_);
+ assert(!cache_key.empty());
+ assert(cached_blob);
+ assert(cached_blob->IsEmpty());
+
+ Cache::Handle* cache_handle = nullptr;
+ cache_handle = GetEntryFromCache(cache_key);
+ if (cache_handle != nullptr) {
+ *cached_blob =
+ CacheHandleGuard<BlobContents>(blob_cache_.get(), cache_handle);
+
+ assert(cached_blob->GetValue());
+
+ PERF_COUNTER_ADD(blob_cache_hit_count, 1);
+ RecordTick(statistics_, BLOB_DB_CACHE_HIT);
+ RecordTick(statistics_, BLOB_DB_CACHE_BYTES_READ,
+ cached_blob->GetValue()->size());
+
+ return Status::OK();
+ }
+
+ RecordTick(statistics_, BLOB_DB_CACHE_MISS);
+
+ return Status::NotFound("Blob not found in cache");
+}
+
+Status BlobSource::PutBlobIntoCache(
+ const Slice& cache_key, std::unique_ptr<BlobContents>* blob,
+ CacheHandleGuard<BlobContents>* cached_blob) const {
+ assert(blob_cache_);
+ assert(!cache_key.empty());
+ assert(blob);
+ assert(*blob);
+ assert(cached_blob);
+ assert(cached_blob->IsEmpty());
+
+ Cache::Handle* cache_handle = nullptr;
+ const Status s = InsertEntryIntoCache(cache_key, blob->get(),
+ (*blob)->ApproximateMemoryUsage(),
+ &cache_handle, Cache::Priority::BOTTOM);
+ if (s.ok()) {
+ blob->release();
+
+ assert(cache_handle != nullptr);
+ *cached_blob =
+ CacheHandleGuard<BlobContents>(blob_cache_.get(), cache_handle);
+
+ assert(cached_blob->GetValue());
+
+ RecordTick(statistics_, BLOB_DB_CACHE_ADD);
+ RecordTick(statistics_, BLOB_DB_CACHE_BYTES_WRITE,
+ cached_blob->GetValue()->size());
+
+ } else {
+ RecordTick(statistics_, BLOB_DB_CACHE_ADD_FAILURES);
+ }
+
+ return s;
+}
+
+Cache::Handle* BlobSource::GetEntryFromCache(const Slice& key) const {
+ Cache::Handle* cache_handle = nullptr;
+
+ if (lowest_used_cache_tier_ == CacheTier::kNonVolatileBlockTier) {
+ Cache::CreateCallback create_cb =
+ [allocator = blob_cache_->memory_allocator()](
+ const void* buf, size_t size, void** out_obj,
+ size_t* charge) -> Status {
+ return BlobContents::CreateCallback(AllocateBlock(size, allocator), buf,
+ size, out_obj, charge);
+ };
+
+ cache_handle = blob_cache_->Lookup(key, BlobContents::GetCacheItemHelper(),
+ create_cb, Cache::Priority::BOTTOM,
+ true /* wait_for_cache */, statistics_);
+ } else {
+ cache_handle = blob_cache_->Lookup(key, statistics_);
+ }
+
+ return cache_handle;
+}
+
+void BlobSource::PinCachedBlob(CacheHandleGuard<BlobContents>* cached_blob,
+ PinnableSlice* value) {
+ assert(cached_blob);
+ assert(cached_blob->GetValue());
+ assert(value);
+
+ // To avoid copying the cached blob into the buffer provided by the
+ // application, we can simply transfer ownership of the cache handle to
+ // the target PinnableSlice. This has the potential to save a lot of
+ // CPU, especially with large blob values.
+
+ value->Reset();
+
+ constexpr Cleanable* cleanable = nullptr;
+ value->PinSlice(cached_blob->GetValue()->data(), cleanable);
+
+ cached_blob->TransferTo(value);
+}
+
+void BlobSource::PinOwnedBlob(std::unique_ptr<BlobContents>* owned_blob,
+ PinnableSlice* value) {
+ assert(owned_blob);
+ assert(*owned_blob);
+ assert(value);
+
+ BlobContents* const blob = owned_blob->release();
+ assert(blob);
+
+ value->Reset();
+ value->PinSlice(
+ blob->data(),
+ [](void* arg1, void* /* arg2 */) {
+ delete static_cast<BlobContents*>(arg1);
+ },
+ blob, nullptr);
+}
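The same ownership-transfer idea used by PinOwnedBlob works for any heap-allocated buffer: instead of copying bytes into the PinnableSlice, hand it a pointer plus a cleanup callback that frees the buffer when the slice is reset or destroyed. A hypothetical sketch; PinOwnedString is not part of this patch.

void PinOwnedString(std::unique_ptr<std::string>* owned,
                    PinnableSlice* value) {
  assert(owned);
  assert(*owned);
  assert(value);

  // Take ownership away from the unique_ptr; the cleanup callback below is
  // now responsible for deleting the string.
  std::string* const str = owned->release();

  value->Reset();
  value->PinSlice(
      Slice(*str),
      [](void* arg1, void* /* arg2 */) {
        delete static_cast<std::string*>(arg1);
      },
      str, nullptr);
}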
+
+Status BlobSource::InsertEntryIntoCache(const Slice& key, BlobContents* value,
+ size_t charge,
+ Cache::Handle** cache_handle,
+ Cache::Priority priority) const {
+ Status s;
+
+ Cache::CacheItemHelper* const cache_item_helper =
+ BlobContents::GetCacheItemHelper();
+ assert(cache_item_helper);
+
+ if (lowest_used_cache_tier_ == CacheTier::kNonVolatileBlockTier) {
+ s = blob_cache_->Insert(key, value, cache_item_helper, charge, cache_handle,
+ priority);
+ } else {
+ s = blob_cache_->Insert(key, value, charge, cache_item_helper->del_cb,
+ cache_handle, priority);
+ }
+
+ return s;
+}
+
+Status BlobSource::GetBlob(const ReadOptions& read_options,
+ const Slice& user_key, uint64_t file_number,
+ uint64_t offset, uint64_t file_size,
+ uint64_t value_size,
+ CompressionType compression_type,
+ FilePrefetchBuffer* prefetch_buffer,
+ PinnableSlice* value, uint64_t* bytes_read) {
+ assert(value);
+
+ Status s;
+
+ const CacheKey cache_key = GetCacheKey(file_number, file_size, offset);
+
+ CacheHandleGuard<BlobContents> blob_handle;
+
+ // First, try to get the blob from the cache
+ //
+ // If blob cache is enabled, we'll try to read from it.
+ if (blob_cache_) {
+ Slice key = cache_key.AsSlice();
+ s = GetBlobFromCache(key, &blob_handle);
+ if (s.ok()) {
+ PinCachedBlob(&blob_handle, value);
+
+ // For consistency, the size of on-disk (possibly compressed) blob record
+ // is assigned to bytes_read.
+ uint64_t adjustment =
+ read_options.verify_checksums
+ ? BlobLogRecord::CalculateAdjustmentForRecordHeader(
+ user_key.size())
+ : 0;
+ assert(offset >= adjustment);
+
+ uint64_t record_size = value_size + adjustment;
+ if (bytes_read) {
+ *bytes_read = record_size;
+ }
+ return s;
+ }
+ }
+
+ assert(blob_handle.IsEmpty());
+
+ const bool no_io = read_options.read_tier == kBlockCacheTier;
+ if (no_io) {
+ s = Status::Incomplete("Cannot read blob(s): no disk I/O allowed");
+ return s;
+ }
+
+ // Can't find the blob from the cache. Since I/O is allowed, read from the
+ // file.
+ std::unique_ptr<BlobContents> blob_contents;
+
+ {
+ CacheHandleGuard<BlobFileReader> blob_file_reader;
+ s = blob_file_cache_->GetBlobFileReader(file_number, &blob_file_reader);
+ if (!s.ok()) {
+ return s;
+ }
+
+ assert(blob_file_reader.GetValue());
+
+ if (compression_type != blob_file_reader.GetValue()->GetCompressionType()) {
+ return Status::Corruption("Compression type mismatch when reading blob");
+ }
+
+ MemoryAllocator* const allocator = (blob_cache_ && read_options.fill_cache)
+ ? blob_cache_->memory_allocator()
+ : nullptr;
+
+ uint64_t read_size = 0;
+ s = blob_file_reader.GetValue()->GetBlob(
+ read_options, user_key, offset, value_size, compression_type,
+ prefetch_buffer, allocator, &blob_contents, &read_size);
+ if (!s.ok()) {
+ return s;
+ }
+ if (bytes_read) {
+ *bytes_read = read_size;
+ }
+ }
+
+ if (blob_cache_ && read_options.fill_cache) {
+ // If filling cache is allowed and a cache is configured, try to put the
+ // blob to the cache.
+ Slice key = cache_key.AsSlice();
+ s = PutBlobIntoCache(key, &blob_contents, &blob_handle);
+ if (!s.ok()) {
+ return s;
+ }
+
+ PinCachedBlob(&blob_handle, value);
+ } else {
+ PinOwnedBlob(&blob_contents, value);
+ }
+
+ assert(s.ok());
+ return s;
+}
+
+void BlobSource::MultiGetBlob(const ReadOptions& read_options,
+ autovector<BlobFileReadRequests>& blob_reqs,
+ uint64_t* bytes_read) {
+ assert(blob_reqs.size() > 0);
+
+ uint64_t total_bytes_read = 0;
+ uint64_t bytes_read_in_file = 0;
+
+ for (auto& [file_number, file_size, blob_reqs_in_file] : blob_reqs) {
+ // sort blob_reqs_in_file by file offset.
+ std::sort(
+ blob_reqs_in_file.begin(), blob_reqs_in_file.end(),
+ [](const BlobReadRequest& lhs, const BlobReadRequest& rhs) -> bool {
+ return lhs.offset < rhs.offset;
+ });
+
+ MultiGetBlobFromOneFile(read_options, file_number, file_size,
+ blob_reqs_in_file, &bytes_read_in_file);
+
+ total_bytes_read += bytes_read_in_file;
+ }
+
+ if (bytes_read) {
+ *bytes_read = total_bytes_read;
+ }
+}
+
+void BlobSource::MultiGetBlobFromOneFile(const ReadOptions& read_options,
+ uint64_t file_number,
+ uint64_t /*file_size*/,
+ autovector<BlobReadRequest>& blob_reqs,
+ uint64_t* bytes_read) {
+ const size_t num_blobs = blob_reqs.size();
+ assert(num_blobs > 0);
+ assert(num_blobs <= MultiGetContext::MAX_BATCH_SIZE);
+
+#ifndef NDEBUG
+ for (size_t i = 0; i < num_blobs - 1; ++i) {
+ assert(blob_reqs[i].offset <= blob_reqs[i + 1].offset);
+ }
+#endif // !NDEBUG
+
+ using Mask = uint64_t;
+ Mask cache_hit_mask = 0;
+
+ uint64_t total_bytes = 0;
+ const OffsetableCacheKey base_cache_key(db_id_, db_session_id_, file_number);
+
+ if (blob_cache_) {
+ size_t cached_blob_count = 0;
+ for (size_t i = 0; i < num_blobs; ++i) {
+ auto& req = blob_reqs[i];
+
+ CacheHandleGuard<BlobContents> blob_handle;
+ const CacheKey cache_key = base_cache_key.WithOffset(req.offset);
+ const Slice key = cache_key.AsSlice();
+
+ const Status s = GetBlobFromCache(key, &blob_handle);
+
+ if (s.ok()) {
+ assert(req.status);
+ *req.status = s;
+
+ PinCachedBlob(&blob_handle, req.result);
+
+ // Update the counter for the number of valid blobs read from the cache.
+ ++cached_blob_count;
+
+ // For consistency, the size of each on-disk (possibly compressed) blob
+ // record is accumulated to total_bytes.
+ uint64_t adjustment =
+ read_options.verify_checksums
+ ? BlobLogRecord::CalculateAdjustmentForRecordHeader(
+ req.user_key->size())
+ : 0;
+ assert(req.offset >= adjustment);
+ total_bytes += req.len + adjustment;
+ cache_hit_mask |= (Mask{1} << i); // cache hit
+ }
+ }
+
+ // All blobs were read from the cache.
+ if (cached_blob_count == num_blobs) {
+ if (bytes_read) {
+ *bytes_read = total_bytes;
+ }
+ return;
+ }
+ }
+
+ const bool no_io = read_options.read_tier == kBlockCacheTier;
+ if (no_io) {
+ for (size_t i = 0; i < num_blobs; ++i) {
+ if (!(cache_hit_mask & (Mask{1} << i))) {
+ BlobReadRequest& req = blob_reqs[i];
+ assert(req.status);
+
+ *req.status =
+ Status::Incomplete("Cannot read blob(s): no disk I/O allowed");
+ }
+ }
+ return;
+ }
+
+ {
+ // Find the rest of blobs from the file since I/O is allowed.
+ autovector<std::pair<BlobReadRequest*, std::unique_ptr<BlobContents>>>
+ _blob_reqs;
+ uint64_t _bytes_read = 0;
+
+ for (size_t i = 0; i < num_blobs; ++i) {
+ if (!(cache_hit_mask & (Mask{1} << i))) {
+ _blob_reqs.emplace_back(&blob_reqs[i], std::unique_ptr<BlobContents>());
+ }
+ }
+
+ CacheHandleGuard<BlobFileReader> blob_file_reader;
+ Status s =
+ blob_file_cache_->GetBlobFileReader(file_number, &blob_file_reader);
+ if (!s.ok()) {
+ for (size_t i = 0; i < _blob_reqs.size(); ++i) {
+ BlobReadRequest* const req = _blob_reqs[i].first;
+ assert(req);
+ assert(req->status);
+
+ *req->status = s;
+ }
+ return;
+ }
+
+ assert(blob_file_reader.GetValue());
+
+ MemoryAllocator* const allocator = (blob_cache_ && read_options.fill_cache)
+ ? blob_cache_->memory_allocator()
+ : nullptr;
+
+ blob_file_reader.GetValue()->MultiGetBlob(read_options, allocator,
+ _blob_reqs, &_bytes_read);
+
+ if (blob_cache_ && read_options.fill_cache) {
+ // If filling cache is allowed and a cache is configured, try to put
+ // the blob(s) to the cache.
+ for (auto& [req, blob_contents] : _blob_reqs) {
+ assert(req);
+
+ if (req->status->ok()) {
+ CacheHandleGuard<BlobContents> blob_handle;
+ const CacheKey cache_key = base_cache_key.WithOffset(req->offset);
+ const Slice key = cache_key.AsSlice();
+ s = PutBlobIntoCache(key, &blob_contents, &blob_handle);
+ if (!s.ok()) {
+ *req->status = s;
+ } else {
+ PinCachedBlob(&blob_handle, req->result);
+ }
+ }
+ }
+ } else {
+ for (auto& [req, blob_contents] : _blob_reqs) {
+ assert(req);
+
+ if (req->status->ok()) {
+ PinOwnedBlob(&blob_contents, req->result);
+ }
+ }
+ }
+
+ total_bytes += _bytes_read;
+ if (bytes_read) {
+ *bytes_read = total_bytes;
+ }
+ }
+}
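The function above remembers which requests were served from the cache in a 64-bit mask, one bit per request (the assert on MultiGetContext::MAX_BATCH_SIZE keeps the batch small enough for this to be safe), so the follow-up I/O pass only visits requests whose bit is still clear. A standalone illustration with the hypothetical helper MissedIndexes, assuming at most 64 entries.

#include <cstdint>
#include <vector>

std::vector<size_t> MissedIndexes(const std::vector<bool>& hits) {
  // Assumes hits.size() <= 64 so every index fits in the mask.
  using Mask = uint64_t;
  Mask hit_mask = 0;
  for (size_t i = 0; i < hits.size(); ++i) {
    if (hits[i]) {
      hit_mask |= (Mask{1} << i);
    }
  }

  std::vector<size_t> misses;
  for (size_t i = 0; i < hits.size(); ++i) {
    if (!(hit_mask & (Mask{1} << i))) {
      misses.push_back(i);
    }
  }
  return misses;
}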
+
+bool BlobSource::TEST_BlobInCache(uint64_t file_number, uint64_t file_size,
+ uint64_t offset, size_t* charge) const {
+ const CacheKey cache_key = GetCacheKey(file_number, file_size, offset);
+ const Slice key = cache_key.AsSlice();
+
+ CacheHandleGuard<BlobContents> blob_handle;
+ const Status s = GetBlobFromCache(key, &blob_handle);
+
+ if (s.ok() && blob_handle.GetValue() != nullptr) {
+ if (charge) {
+ const Cache* const cache = blob_handle.GetCache();
+ assert(cache);
+
+ Cache::Handle* const handle = blob_handle.GetCacheHandle();
+ assert(handle);
+
+ *charge = cache->GetUsage(handle);
+ }
+
+ return true;
+ }
+
+ return false;
+}
+
+} // namespace ROCKSDB_NAMESPACE
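For intuition on how cached blobs are identified: each cache entry is keyed by a per-file base key (derived from the DB id, session id and blob file number; see GetCacheKey in blob_source.h) combined with the blob's offset within that file. A deliberately simplified analogue that keys a plain std::map by the (file_number, offset) pair; SimpleBlobCacheKey is a made-up name, not the real key format.

#include <cstdint>
#include <map>
#include <string>
#include <utility>

using SimpleBlobCacheKey =
    std::pair<uint64_t /* file_number */, uint64_t /* offset */>;

int main() {
  std::map<SimpleBlobCacheKey, std::string> cache;
  cache[{1 /* file_number */, 30 /* offset */}] = "blob0";

  // The same (file, offset) pair hits; any other pair misses.
  const bool hit = cache.count({1, 30}) == 1;
  const bool miss = cache.count({1, 31}) == 0;
  return (hit && miss) ? 0 : 1;
}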
diff --git a/src/rocksdb/db/blob/blob_source.h b/src/rocksdb/db/blob/blob_source.h
new file mode 100644
index 000000000..2ed296eeb
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_source.h
@@ -0,0 +1,153 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <cinttypes>
+#include <memory>
+
+#include "cache/cache_helpers.h"
+#include "cache/cache_key.h"
+#include "db/blob/blob_file_cache.h"
+#include "db/blob/blob_read_request.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/rocksdb_namespace.h"
+#include "table/block_based/cachable_entry.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+struct ImmutableOptions;
+class Status;
+class FilePrefetchBuffer;
+class Slice;
+class BlobContents;
+
+// BlobSource is a class that provides universal access to blobs, regardless of
+// whether they are in the blob cache, secondary cache, or (remote) storage.
+// Depending on user settings, it fetches blobs from the multi-tier cache and
+// storage with minimal cost.
+class BlobSource {
+ public:
+ BlobSource(const ImmutableOptions* immutable_options,
+ const std::string& db_id, const std::string& db_session_id,
+ BlobFileCache* blob_file_cache);
+
+ BlobSource(const BlobSource&) = delete;
+ BlobSource& operator=(const BlobSource&) = delete;
+
+ ~BlobSource();
+
+ // Read a blob from the underlying cache or one blob file.
+ //
+ // If successful, returns ok and sets "*value" to the newly retrieved
+ // uncompressed blob. If there was an error while fetching the blob, sets
+ // "*value" to empty and returns a non-ok status.
+ //
+ // Note: For consistency, whether the blob is found in the cache or on disk,
+ // sets "*bytes_read" to the size of on-disk (possibly compressed) blob
+ // record.
+ Status GetBlob(const ReadOptions& read_options, const Slice& user_key,
+ uint64_t file_number, uint64_t offset, uint64_t file_size,
+ uint64_t value_size, CompressionType compression_type,
+ FilePrefetchBuffer* prefetch_buffer, PinnableSlice* value,
+ uint64_t* bytes_read);
+
+ // Read multiple blobs from the underlying cache or blob file(s).
+ //
+ // If successful, returns ok and sets "result" in the elements of "blob_reqs"
+ // to the newly retrieved uncompressed blobs. If there was an error while
+ // fetching one of the blobs, sets its "result" to empty and sets its
+ // corresponding "status" to a non-ok status.
+ //
+ // Note:
+ // - The main difference between this function and MultiGetBlobFromOneFile is
+ // that this function can read multiple blobs from multiple blob files.
+ //
+ // - For consistency, whether the blob is found in the cache or on disk, sets
+ // "*bytes_read" to the total size of on-disk (possibly compressed) blob
+ // records.
+ void MultiGetBlob(const ReadOptions& read_options,
+ autovector<BlobFileReadRequests>& blob_reqs,
+ uint64_t* bytes_read);
+
+ // Read multiple blobs from the underlying cache or one blob file.
+ //
+ // If successful, returns ok and sets "result" in the elements of "blob_reqs"
+ // to the newly retrieved uncompressed blobs. If there was an error while
+ // fetching one of the blobs, sets its "result" to empty and sets its
+ // corresponding "status" to a non-ok status.
+ //
+ // Note:
+ // - The main difference between this function and MultiGetBlob is that this
+ // function is only used for the case where the demanded blobs are stored in
+ // one blob file. MultiGetBlob will call this function multiple times if the
+ // demanded blobs are stored in multiple blob files.
+ //
+ // - For consistency, whether the blob is found in the cache or on disk, sets
+ // "*bytes_read" to the total size of on-disk (possibly compressed) blob
+ // records.
+ void MultiGetBlobFromOneFile(const ReadOptions& read_options,
+ uint64_t file_number, uint64_t file_size,
+ autovector<BlobReadRequest>& blob_reqs,
+ uint64_t* bytes_read);
+
+ inline Status GetBlobFileReader(
+ uint64_t blob_file_number,
+ CacheHandleGuard<BlobFileReader>* blob_file_reader) {
+ return blob_file_cache_->GetBlobFileReader(blob_file_number,
+ blob_file_reader);
+ }
+
+ inline Cache* GetBlobCache() const { return blob_cache_.get(); }
+
+ bool TEST_BlobInCache(uint64_t file_number, uint64_t file_size,
+ uint64_t offset, size_t* charge = nullptr) const;
+
+ private:
+ Status GetBlobFromCache(const Slice& cache_key,
+ CacheHandleGuard<BlobContents>* cached_blob) const;
+
+ Status PutBlobIntoCache(const Slice& cache_key,
+ std::unique_ptr<BlobContents>* blob,
+ CacheHandleGuard<BlobContents>* cached_blob) const;
+
+ static void PinCachedBlob(CacheHandleGuard<BlobContents>* cached_blob,
+ PinnableSlice* value);
+
+ static void PinOwnedBlob(std::unique_ptr<BlobContents>* owned_blob,
+ PinnableSlice* value);
+
+ Cache::Handle* GetEntryFromCache(const Slice& key) const;
+
+ Status InsertEntryIntoCache(const Slice& key, BlobContents* value,
+ size_t charge, Cache::Handle** cache_handle,
+ Cache::Priority priority) const;
+
+ inline CacheKey GetCacheKey(uint64_t file_number, uint64_t /*file_size*/,
+ uint64_t offset) const {
+ OffsetableCacheKey base_cache_key(db_id_, db_session_id_, file_number);
+ return base_cache_key.WithOffset(offset);
+ }
+
+ const std::string& db_id_;
+ const std::string& db_session_id_;
+
+ Statistics* statistics_;
+
+ // A cache to store blob file readers.
+ BlobFileCache* blob_file_cache_;
+
+ // A cache to store uncompressed blobs.
+ std::shared_ptr<Cache> blob_cache_;
+
+ // Controls how the cache tiers will be used. Currently, RocksDB supports the
+ // block/blob cache (volatile tier) and the secondary cache (this tier isn't
+ // strictly speaking a non-volatile tier since the compressed cache in this
+ // tier is in volatile memory).
+ const CacheTier lowest_used_cache_tier_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
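An illustrative caller-side sketch for the GetBlob declaration above: the file number, offset, sizes and compression type normally come from the blob index stored alongside the key. ReadOneBlob is a hypothetical helper name, not part of this patch.

Status ReadOneBlob(BlobSource* blob_source, const ReadOptions& read_options,
                   const Slice& user_key, uint64_t file_number,
                   uint64_t offset, uint64_t file_size, uint64_t value_size,
                   CompressionType compression_type, PinnableSlice* value) {
  uint64_t bytes_read = 0;
  // On success, *value holds the uncompressed blob and bytes_read the size of
  // the on-disk record.
  return blob_source->GetBlob(read_options, user_key, file_number, offset,
                              file_size, value_size, compression_type,
                              /* prefetch_buffer */ nullptr, value,
                              &bytes_read);
}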
diff --git a/src/rocksdb/db/blob/blob_source_test.cc b/src/rocksdb/db/blob/blob_source_test.cc
new file mode 100644
index 000000000..a85ed8646
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_source_test.cc
@@ -0,0 +1,1624 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/blob/blob_source.h"
+
+#include <cassert>
+#include <cstdint>
+#include <cstdio>
+#include <memory>
+#include <string>
+
+#include "cache/charged_cache.h"
+#include "cache/compressed_secondary_cache.h"
+#include "db/blob/blob_contents.h"
+#include "db/blob/blob_file_cache.h"
+#include "db/blob/blob_file_reader.h"
+#include "db/blob/blob_log_format.h"
+#include "db/blob/blob_log_writer.h"
+#include "db/db_test_util.h"
+#include "file/filename.h"
+#include "file/read_write_util.h"
+#include "options/cf_options.h"
+#include "rocksdb/options.h"
+#include "util/compression.h"
+#include "util/random.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+
+// Creates a test blob file with `num` blobs in it.
+void WriteBlobFile(const ImmutableOptions& immutable_options,
+ uint32_t column_family_id, bool has_ttl,
+ const ExpirationRange& expiration_range_header,
+ const ExpirationRange& expiration_range_footer,
+ uint64_t blob_file_number, const std::vector<Slice>& keys,
+ const std::vector<Slice>& blobs, CompressionType compression,
+ std::vector<uint64_t>& blob_offsets,
+ std::vector<uint64_t>& blob_sizes) {
+ assert(!immutable_options.cf_paths.empty());
+ size_t num = keys.size();
+ assert(num == blobs.size());
+ assert(num == blob_offsets.size());
+ assert(num == blob_sizes.size());
+
+ const std::string blob_file_path =
+ BlobFileName(immutable_options.cf_paths.front().path, blob_file_number);
+ std::unique_ptr<FSWritableFile> file;
+ ASSERT_OK(NewWritableFile(immutable_options.fs.get(), blob_file_path, &file,
+ FileOptions()));
+
+ std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
+ std::move(file), blob_file_path, FileOptions(), immutable_options.clock));
+
+ constexpr Statistics* statistics = nullptr;
+ constexpr bool use_fsync = false;
+ constexpr bool do_flush = false;
+
+ BlobLogWriter blob_log_writer(std::move(file_writer), immutable_options.clock,
+ statistics, blob_file_number, use_fsync,
+ do_flush);
+
+ BlobLogHeader header(column_family_id, compression, has_ttl,
+ expiration_range_header);
+
+ ASSERT_OK(blob_log_writer.WriteHeader(header));
+
+ std::vector<std::string> compressed_blobs(num);
+ std::vector<Slice> blobs_to_write(num);
+ if (kNoCompression == compression) {
+ for (size_t i = 0; i < num; ++i) {
+ blobs_to_write[i] = blobs[i];
+ blob_sizes[i] = blobs[i].size();
+ }
+ } else {
+ CompressionOptions opts;
+ CompressionContext context(compression);
+ constexpr uint64_t sample_for_compression = 0;
+ CompressionInfo info(opts, context, CompressionDict::GetEmptyDict(),
+ compression, sample_for_compression);
+
+ constexpr uint32_t compression_format_version = 2;
+
+ for (size_t i = 0; i < num; ++i) {
+ ASSERT_TRUE(CompressData(blobs[i], info, compression_format_version,
+ &compressed_blobs[i]));
+ blobs_to_write[i] = compressed_blobs[i];
+ blob_sizes[i] = compressed_blobs[i].size();
+ }
+ }
+
+ for (size_t i = 0; i < num; ++i) {
+ uint64_t key_offset = 0;
+ ASSERT_OK(blob_log_writer.AddRecord(keys[i], blobs_to_write[i], &key_offset,
+ &blob_offsets[i]));
+ }
+
+ BlobLogFooter footer;
+ footer.blob_count = num;
+ footer.expiration_range = expiration_range_footer;
+
+ std::string checksum_method;
+ std::string checksum_value;
+ ASSERT_OK(
+ blob_log_writer.AppendFooter(footer, &checksum_method, &checksum_value));
+}
+
+} // anonymous namespace
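The tests below derive the expected physical file size directly from the on-disk layout: a fixed-size header, one record header plus key plus blob per pair, and a fixed-size footer. A small helper capturing the same arithmetic; ExpectedBlobFileSize is an illustrative name only.

uint64_t ExpectedBlobFileSize(const std::vector<Slice>& keys,
                              const std::vector<Slice>& blobs) {
  assert(keys.size() == blobs.size());
  uint64_t file_size = BlobLogHeader::kSize;
  for (size_t i = 0; i < keys.size(); ++i) {
    // Each record is a fixed-size record header followed by the key and the
    // (possibly compressed) blob.
    file_size += BlobLogRecord::kHeaderSize + keys[i].size() + blobs[i].size();
  }
  return file_size + BlobLogFooter::kSize;
}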
+
+class BlobSourceTest : public DBTestBase {
+ public:
+ explicit BlobSourceTest()
+ : DBTestBase("blob_source_test", /*env_do_fsync=*/true) {
+ options_.env = env_;
+ options_.enable_blob_files = true;
+ options_.create_if_missing = true;
+
+ LRUCacheOptions co;
+ co.capacity = 8 << 20;
+ co.num_shard_bits = 2;
+ co.metadata_charge_policy = kDontChargeCacheMetadata;
+ co.high_pri_pool_ratio = 0.2;
+ co.low_pri_pool_ratio = 0.2;
+ options_.blob_cache = NewLRUCache(co);
+ options_.lowest_used_cache_tier = CacheTier::kVolatileTier;
+
+ assert(db_->GetDbIdentity(db_id_).ok());
+ assert(db_->GetDbSessionId(db_session_id_).ok());
+ }
+
+ Options options_;
+ std::string db_id_;
+ std::string db_session_id_;
+};
+
+TEST_F(BlobSourceTest, GetBlobsFromCache) {
+ options_.cf_paths.emplace_back(
+ test::PerThreadDBPath(env_, "BlobSourceTest_GetBlobsFromCache"), 0);
+
+ options_.statistics = CreateDBStatistics();
+ Statistics* statistics = options_.statistics.get();
+ assert(statistics);
+
+ DestroyAndReopen(options_);
+
+ ImmutableOptions immutable_options(options_);
+
+ constexpr uint32_t column_family_id = 1;
+ constexpr bool has_ttl = false;
+ constexpr ExpirationRange expiration_range;
+ constexpr uint64_t blob_file_number = 1;
+ constexpr size_t num_blobs = 16;
+
+ std::vector<std::string> key_strs;
+ std::vector<std::string> blob_strs;
+
+ for (size_t i = 0; i < num_blobs; ++i) {
+ key_strs.push_back("key" + std::to_string(i));
+ blob_strs.push_back("blob" + std::to_string(i));
+ }
+
+ std::vector<Slice> keys;
+ std::vector<Slice> blobs;
+
+ uint64_t file_size = BlobLogHeader::kSize;
+ for (size_t i = 0; i < num_blobs; ++i) {
+ keys.push_back({key_strs[i]});
+ blobs.push_back({blob_strs[i]});
+ file_size += BlobLogRecord::kHeaderSize + keys[i].size() + blobs[i].size();
+ }
+ file_size += BlobLogFooter::kSize;
+
+ std::vector<uint64_t> blob_offsets(keys.size());
+ std::vector<uint64_t> blob_sizes(keys.size());
+
+ WriteBlobFile(immutable_options, column_family_id, has_ttl, expiration_range,
+ expiration_range, blob_file_number, keys, blobs, kNoCompression,
+ blob_offsets, blob_sizes);
+
+ constexpr size_t capacity = 1024;
+ std::shared_ptr<Cache> backing_cache =
+ NewLRUCache(capacity); // Blob file cache
+
+ FileOptions file_options;
+ constexpr HistogramImpl* blob_file_read_hist = nullptr;
+
+ std::unique_ptr<BlobFileCache> blob_file_cache =
+ std::make_unique<BlobFileCache>(
+ backing_cache.get(), &immutable_options, &file_options,
+ column_family_id, blob_file_read_hist, nullptr /*IOTracer*/);
+
+ BlobSource blob_source(&immutable_options, db_id_, db_session_id_,
+ blob_file_cache.get());
+
+ ReadOptions read_options;
+ read_options.verify_checksums = true;
+
+ constexpr FilePrefetchBuffer* prefetch_buffer = nullptr;
+
+ {
+ // GetBlob
+ std::vector<PinnableSlice> values(keys.size());
+ uint64_t bytes_read = 0;
+ uint64_t blob_bytes = 0;
+ uint64_t total_bytes = 0;
+
+ read_options.fill_cache = false;
+ get_perf_context()->Reset();
+
+ for (size_t i = 0; i < num_blobs; ++i) {
+ ASSERT_FALSE(blob_source.TEST_BlobInCache(blob_file_number, file_size,
+ blob_offsets[i]));
+
+ ASSERT_OK(blob_source.GetBlob(read_options, keys[i], blob_file_number,
+ blob_offsets[i], file_size, blob_sizes[i],
+ kNoCompression, prefetch_buffer, &values[i],
+ &bytes_read));
+ ASSERT_EQ(values[i], blobs[i]);
+ ASSERT_TRUE(values[i].IsPinned());
+ ASSERT_EQ(bytes_read,
+ BlobLogRecord::kHeaderSize + keys[i].size() + blob_sizes[i]);
+
+ ASSERT_FALSE(blob_source.TEST_BlobInCache(blob_file_number, file_size,
+ blob_offsets[i]));
+ total_bytes += bytes_read;
+ }
+
+ // Retrieved the blob cache num_blobs * 3 times via TEST_BlobInCache,
+ // GetBlob, and TEST_BlobInCache.
+ ASSERT_EQ((int)get_perf_context()->blob_cache_hit_count, 0);
+ ASSERT_EQ((int)get_perf_context()->blob_read_count, num_blobs);
+ ASSERT_EQ((int)get_perf_context()->blob_read_byte, total_bytes);
+ ASSERT_GE((int)get_perf_context()->blob_checksum_time, 0);
+ ASSERT_EQ((int)get_perf_context()->blob_decompress_time, 0);
+
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_MISS), num_blobs * 3);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_HIT), 0);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_ADD), 0);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_READ), 0);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_WRITE), 0);
+
+ read_options.fill_cache = true;
+ blob_bytes = 0;
+ total_bytes = 0;
+ get_perf_context()->Reset();
+ statistics->Reset().PermitUncheckedError();
+
+ for (size_t i = 0; i < num_blobs; ++i) {
+ ASSERT_FALSE(blob_source.TEST_BlobInCache(blob_file_number, file_size,
+ blob_offsets[i]));
+
+ ASSERT_OK(blob_source.GetBlob(read_options, keys[i], blob_file_number,
+ blob_offsets[i], file_size, blob_sizes[i],
+ kNoCompression, prefetch_buffer, &values[i],
+ &bytes_read));
+ ASSERT_EQ(values[i], blobs[i]);
+ ASSERT_TRUE(values[i].IsPinned());
+ ASSERT_EQ(bytes_read,
+ BlobLogRecord::kHeaderSize + keys[i].size() + blob_sizes[i]);
+
+ blob_bytes += blob_sizes[i];
+ total_bytes += bytes_read;
+ ASSERT_EQ((int)get_perf_context()->blob_cache_hit_count, i);
+ ASSERT_EQ((int)get_perf_context()->blob_read_count, i + 1);
+ ASSERT_EQ((int)get_perf_context()->blob_read_byte, total_bytes);
+
+ ASSERT_TRUE(blob_source.TEST_BlobInCache(blob_file_number, file_size,
+ blob_offsets[i]));
+
+ ASSERT_EQ((int)get_perf_context()->blob_cache_hit_count, i + 1);
+ ASSERT_EQ((int)get_perf_context()->blob_read_count, i + 1);
+ ASSERT_EQ((int)get_perf_context()->blob_read_byte, total_bytes);
+ }
+
+ ASSERT_EQ((int)get_perf_context()->blob_cache_hit_count, num_blobs);
+ ASSERT_EQ((int)get_perf_context()->blob_read_count, num_blobs);
+ ASSERT_EQ((int)get_perf_context()->blob_read_byte, total_bytes);
+
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_MISS), num_blobs * 2);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_HIT), num_blobs);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_ADD), num_blobs);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_READ), blob_bytes);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_WRITE),
+ blob_bytes);
+
+ read_options.fill_cache = true;
+ total_bytes = 0;
+ blob_bytes = 0;
+ get_perf_context()->Reset();
+ statistics->Reset().PermitUncheckedError();
+
+ for (size_t i = 0; i < num_blobs; ++i) {
+ ASSERT_TRUE(blob_source.TEST_BlobInCache(blob_file_number, file_size,
+ blob_offsets[i]));
+
+ ASSERT_OK(blob_source.GetBlob(read_options, keys[i], blob_file_number,
+ blob_offsets[i], file_size, blob_sizes[i],
+ kNoCompression, prefetch_buffer, &values[i],
+ &bytes_read));
+ ASSERT_EQ(values[i], blobs[i]);
+ ASSERT_TRUE(values[i].IsPinned());
+ ASSERT_EQ(bytes_read,
+ BlobLogRecord::kHeaderSize + keys[i].size() + blob_sizes[i]);
+
+ ASSERT_TRUE(blob_source.TEST_BlobInCache(blob_file_number, file_size,
+ blob_offsets[i]));
+ total_bytes += bytes_read; // on-disk blob record size
+ blob_bytes += blob_sizes[i]; // cached blob value size
+ }
+
+ // Retrieved the blob cache num_blobs * 3 times via TEST_BlobInCache,
+ // GetBlob, and TEST_BlobInCache.
+ ASSERT_EQ((int)get_perf_context()->blob_cache_hit_count, num_blobs * 3);
+ ASSERT_EQ((int)get_perf_context()->blob_read_count, 0); // without i/o
+ ASSERT_EQ((int)get_perf_context()->blob_read_byte, 0); // without i/o
+
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_MISS), 0);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_HIT), num_blobs * 3);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_ADD), 0);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_READ),
+ blob_bytes * 3);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_WRITE), 0);
+
+ // Cache-only GetBlob
+ read_options.read_tier = ReadTier::kBlockCacheTier;
+ total_bytes = 0;
+ blob_bytes = 0;
+ get_perf_context()->Reset();
+ statistics->Reset().PermitUncheckedError();
+
+ for (size_t i = 0; i < num_blobs; ++i) {
+ ASSERT_TRUE(blob_source.TEST_BlobInCache(blob_file_number, file_size,
+ blob_offsets[i]));
+
+ ASSERT_OK(blob_source.GetBlob(read_options, keys[i], blob_file_number,
+ blob_offsets[i], file_size, blob_sizes[i],
+ kNoCompression, prefetch_buffer, &values[i],
+ &bytes_read));
+ ASSERT_EQ(values[i], blobs[i]);
+ ASSERT_TRUE(values[i].IsPinned());
+ ASSERT_EQ(bytes_read,
+ BlobLogRecord::kHeaderSize + keys[i].size() + blob_sizes[i]);
+
+ ASSERT_TRUE(blob_source.TEST_BlobInCache(blob_file_number, file_size,
+ blob_offsets[i]));
+ total_bytes += bytes_read;
+ blob_bytes += blob_sizes[i];
+ }
+
+ // Retrieved the blob cache num_blobs * 3 times via TEST_BlobInCache,
+ // GetBlob, and TEST_BlobInCache.
+ ASSERT_EQ((int)get_perf_context()->blob_cache_hit_count, num_blobs * 3);
+ ASSERT_EQ((int)get_perf_context()->blob_read_count, 0); // without i/o
+ ASSERT_EQ((int)get_perf_context()->blob_read_byte, 0); // without i/o
+
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_MISS), 0);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_HIT), num_blobs * 3);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_ADD), 0);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_READ),
+ blob_bytes * 3);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_WRITE), 0);
+ }
+
+ options_.blob_cache->EraseUnRefEntries();
+
+ {
+ // Cache-only GetBlob
+ std::vector<PinnableSlice> values(keys.size());
+ uint64_t bytes_read = 0;
+
+ read_options.read_tier = ReadTier::kBlockCacheTier;
+ read_options.fill_cache = true;
+ get_perf_context()->Reset();
+ statistics->Reset().PermitUncheckedError();
+
+ for (size_t i = 0; i < num_blobs; ++i) {
+ ASSERT_FALSE(blob_source.TEST_BlobInCache(blob_file_number, file_size,
+ blob_offsets[i]));
+
+ ASSERT_TRUE(blob_source
+ .GetBlob(read_options, keys[i], blob_file_number,
+ blob_offsets[i], file_size, blob_sizes[i],
+ kNoCompression, prefetch_buffer, &values[i],
+ &bytes_read)
+ .IsIncomplete());
+ ASSERT_TRUE(values[i].empty());
+ ASSERT_FALSE(values[i].IsPinned());
+ ASSERT_EQ(bytes_read, 0);
+
+ ASSERT_FALSE(blob_source.TEST_BlobInCache(blob_file_number, file_size,
+ blob_offsets[i]));
+ }
+
+ // Retrieved the blob cache num_blobs * 3 times via TEST_BlobInCache,
+ // GetBlob, and TEST_BlobInCache.
+ ASSERT_EQ((int)get_perf_context()->blob_cache_hit_count, 0);
+ ASSERT_EQ((int)get_perf_context()->blob_read_count, 0);
+ ASSERT_EQ((int)get_perf_context()->blob_read_byte, 0);
+
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_MISS), num_blobs * 3);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_HIT), 0);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_ADD), 0);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_READ), 0);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_WRITE), 0);
+ }
+
+ {
+ // GetBlob from non-existing file
+ std::vector<PinnableSlice> values(keys.size());
+ uint64_t bytes_read = 0;
+ uint64_t file_number = 100; // non-existing file
+
+ read_options.read_tier = ReadTier::kReadAllTier;
+ read_options.fill_cache = true;
+ get_perf_context()->Reset();
+ statistics->Reset().PermitUncheckedError();
+
+ for (size_t i = 0; i < num_blobs; ++i) {
+ ASSERT_FALSE(blob_source.TEST_BlobInCache(file_number, file_size,
+ blob_offsets[i]));
+
+ ASSERT_TRUE(blob_source
+ .GetBlob(read_options, keys[i], file_number,
+ blob_offsets[i], file_size, blob_sizes[i],
+ kNoCompression, prefetch_buffer, &values[i],
+ &bytes_read)
+ .IsIOError());
+ ASSERT_TRUE(values[i].empty());
+ ASSERT_FALSE(values[i].IsPinned());
+ ASSERT_EQ(bytes_read, 0);
+
+ ASSERT_FALSE(blob_source.TEST_BlobInCache(file_number, file_size,
+ blob_offsets[i]));
+ }
+
+ // Retrieved the blob cache num_blobs * 3 times via TEST_BlobInCache,
+ // GetBlob, and TEST_BlobInCache.
+ ASSERT_EQ((int)get_perf_context()->blob_cache_hit_count, 0);
+ ASSERT_EQ((int)get_perf_context()->blob_read_count, 0);
+ ASSERT_EQ((int)get_perf_context()->blob_read_byte, 0);
+
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_MISS), num_blobs * 3);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_HIT), 0);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_ADD), 0);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_READ), 0);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_WRITE), 0);
+ }
+}
+
+TEST_F(BlobSourceTest, GetCompressedBlobs) {
+ if (!Snappy_Supported()) {
+ return;
+ }
+
+ const CompressionType compression = kSnappyCompression;
+
+ options_.cf_paths.emplace_back(
+ test::PerThreadDBPath(env_, "BlobSourceTest_GetCompressedBlobs"), 0);
+
+ DestroyAndReopen(options_);
+
+ ImmutableOptions immutable_options(options_);
+
+ constexpr uint32_t column_family_id = 1;
+ constexpr bool has_ttl = false;
+ constexpr ExpirationRange expiration_range;
+ constexpr size_t num_blobs = 256;
+
+ std::vector<std::string> key_strs;
+ std::vector<std::string> blob_strs;
+
+ for (size_t i = 0; i < num_blobs; ++i) {
+ key_strs.push_back("key" + std::to_string(i));
+ blob_strs.push_back("blob" + std::to_string(i));
+ }
+
+ std::vector<Slice> keys;
+ std::vector<Slice> blobs;
+
+ for (size_t i = 0; i < num_blobs; ++i) {
+ keys.push_back({key_strs[i]});
+ blobs.push_back({blob_strs[i]});
+ }
+
+ std::vector<uint64_t> blob_offsets(keys.size());
+ std::vector<uint64_t> blob_sizes(keys.size());
+
+ constexpr size_t capacity = 1024;
+ auto backing_cache = NewLRUCache(capacity); // Blob file cache
+
+ FileOptions file_options;
+ std::unique_ptr<BlobFileCache> blob_file_cache =
+ std::make_unique<BlobFileCache>(
+ backing_cache.get(), &immutable_options, &file_options,
+ column_family_id, nullptr /*HistogramImpl*/, nullptr /*IOTracer*/);
+
+ BlobSource blob_source(&immutable_options, db_id_, db_session_id_,
+ blob_file_cache.get());
+
+ ReadOptions read_options;
+ read_options.verify_checksums = true;
+
+ uint64_t bytes_read = 0;
+ std::vector<PinnableSlice> values(keys.size());
+
+ {
+ // Snappy Compression
+ const uint64_t file_number = 1;
+
+ read_options.read_tier = ReadTier::kReadAllTier;
+
+ WriteBlobFile(immutable_options, column_family_id, has_ttl,
+ expiration_range, expiration_range, file_number, keys, blobs,
+ compression, blob_offsets, blob_sizes);
+
+ CacheHandleGuard<BlobFileReader> blob_file_reader;
+ ASSERT_OK(blob_source.GetBlobFileReader(file_number, &blob_file_reader));
+ ASSERT_NE(blob_file_reader.GetValue(), nullptr);
+
+ const uint64_t file_size = blob_file_reader.GetValue()->GetFileSize();
+ ASSERT_EQ(blob_file_reader.GetValue()->GetCompressionType(), compression);
+
+ for (size_t i = 0; i < num_blobs; ++i) {
+ ASSERT_NE(blobs[i].size() /*uncompressed size*/,
+ blob_sizes[i] /*compressed size*/);
+ }
+
+ read_options.fill_cache = true;
+ read_options.read_tier = ReadTier::kReadAllTier;
+ get_perf_context()->Reset();
+
+ for (size_t i = 0; i < num_blobs; ++i) {
+ ASSERT_FALSE(blob_source.TEST_BlobInCache(file_number, file_size,
+ blob_offsets[i]));
+ ASSERT_OK(blob_source.GetBlob(read_options, keys[i], file_number,
+ blob_offsets[i], file_size, blob_sizes[i],
+ compression, nullptr /*prefetch_buffer*/,
+ &values[i], &bytes_read));
+ ASSERT_EQ(values[i], blobs[i] /*uncompressed blob*/);
+ ASSERT_NE(values[i].size(), blob_sizes[i] /*compressed size*/);
+ ASSERT_EQ(bytes_read,
+ BlobLogRecord::kHeaderSize + keys[i].size() + blob_sizes[i]);
+
+ ASSERT_TRUE(blob_source.TEST_BlobInCache(file_number, file_size,
+ blob_offsets[i]));
+ }
+
+ ASSERT_GE((int)get_perf_context()->blob_decompress_time, 0);
+
+ read_options.read_tier = ReadTier::kBlockCacheTier;
+ get_perf_context()->Reset();
+
+ for (size_t i = 0; i < num_blobs; ++i) {
+ ASSERT_TRUE(blob_source.TEST_BlobInCache(file_number, file_size,
+ blob_offsets[i]));
+
+ // Compressed blob size is passed in GetBlob
+ ASSERT_OK(blob_source.GetBlob(read_options, keys[i], file_number,
+ blob_offsets[i], file_size, blob_sizes[i],
+ compression, nullptr /*prefetch_buffer*/,
+ &values[i], &bytes_read));
+ ASSERT_EQ(values[i], blobs[i] /*uncompressed blob*/);
+ ASSERT_NE(values[i].size(), blob_sizes[i] /*compressed size*/);
+ ASSERT_EQ(bytes_read,
+ BlobLogRecord::kHeaderSize + keys[i].size() + blob_sizes[i]);
+
+ ASSERT_TRUE(blob_source.TEST_BlobInCache(file_number, file_size,
+ blob_offsets[i]));
+ }
+
+ ASSERT_EQ((int)get_perf_context()->blob_decompress_time, 0);
+ }
+}
+
+TEST_F(BlobSourceTest, MultiGetBlobsFromMultiFiles) {
+ options_.cf_paths.emplace_back(
+ test::PerThreadDBPath(env_, "BlobSourceTest_MultiGetBlobsFromMultiFiles"),
+ 0);
+
+ options_.statistics = CreateDBStatistics();
+ Statistics* statistics = options_.statistics.get();
+ assert(statistics);
+
+ DestroyAndReopen(options_);
+
+ ImmutableOptions immutable_options(options_);
+
+ constexpr uint32_t column_family_id = 1;
+ constexpr bool has_ttl = false;
+ constexpr ExpirationRange expiration_range;
+ constexpr uint64_t blob_files = 2;
+ constexpr size_t num_blobs = 32;
+
+ std::vector<std::string> key_strs;
+ std::vector<std::string> blob_strs;
+
+ for (size_t i = 0; i < num_blobs; ++i) {
+ key_strs.push_back("key" + std::to_string(i));
+ blob_strs.push_back("blob" + std::to_string(i));
+ }
+
+ std::vector<Slice> keys;
+ std::vector<Slice> blobs;
+
+ uint64_t file_size = BlobLogHeader::kSize;
+ uint64_t blob_value_bytes = 0;
+ for (size_t i = 0; i < num_blobs; ++i) {
+ keys.push_back({key_strs[i]});
+ blobs.push_back({blob_strs[i]});
+ blob_value_bytes += blobs[i].size();
+ file_size += BlobLogRecord::kHeaderSize + keys[i].size() + blobs[i].size();
+ }
+ file_size += BlobLogFooter::kSize;
+ const uint64_t blob_records_bytes =
+ file_size - BlobLogHeader::kSize - BlobLogFooter::kSize;
+
+ std::vector<uint64_t> blob_offsets(keys.size());
+ std::vector<uint64_t> blob_sizes(keys.size());
+
+ {
+ // Write key/blob pairs to multiple blob files.
+ for (size_t i = 0; i < blob_files; ++i) {
+ const uint64_t file_number = i + 1;
+ WriteBlobFile(immutable_options, column_family_id, has_ttl,
+ expiration_range, expiration_range, file_number, keys,
+ blobs, kNoCompression, blob_offsets, blob_sizes);
+ }
+ }
+
+ constexpr size_t capacity = 10;
+ std::shared_ptr<Cache> backing_cache =
+ NewLRUCache(capacity); // Blob file cache
+
+ FileOptions file_options;
+ constexpr HistogramImpl* blob_file_read_hist = nullptr;
+
+ std::unique_ptr<BlobFileCache> blob_file_cache =
+ std::make_unique<BlobFileCache>(
+ backing_cache.get(), &immutable_options, &file_options,
+ column_family_id, blob_file_read_hist, nullptr /*IOTracer*/);
+
+ BlobSource blob_source(&immutable_options, db_id_, db_session_id_,
+ blob_file_cache.get());
+
+ ReadOptions read_options;
+ read_options.verify_checksums = true;
+
+ uint64_t bytes_read = 0;
+
+ {
+ // MultiGetBlob
+ read_options.fill_cache = true;
+ read_options.read_tier = ReadTier::kReadAllTier;
+
+ autovector<BlobFileReadRequests> blob_reqs;
+ std::array<autovector<BlobReadRequest>, blob_files> blob_reqs_in_file;
+ std::array<PinnableSlice, num_blobs * blob_files> value_buf;
+ std::array<Status, num_blobs * blob_files> statuses_buf;
+
+ for (size_t i = 0; i < blob_files; ++i) {
+ const uint64_t file_number = i + 1;
+ for (size_t j = 0; j < num_blobs; ++j) {
+ blob_reqs_in_file[i].emplace_back(
+ keys[j], blob_offsets[j], blob_sizes[j], kNoCompression,
+ &value_buf[i * num_blobs + j], &statuses_buf[i * num_blobs + j]);
+ }
+ blob_reqs.emplace_back(file_number, file_size, blob_reqs_in_file[i]);
+ }
+
+ get_perf_context()->Reset();
+ statistics->Reset().PermitUncheckedError();
+
+ blob_source.MultiGetBlob(read_options, blob_reqs, &bytes_read);
+
+ for (size_t i = 0; i < blob_files; ++i) {
+ const uint64_t file_number = i + 1;
+ for (size_t j = 0; j < num_blobs; ++j) {
+ ASSERT_OK(statuses_buf[i * num_blobs + j]);
+ ASSERT_EQ(value_buf[i * num_blobs + j], blobs[j]);
+ ASSERT_TRUE(blob_source.TEST_BlobInCache(file_number, file_size,
+ blob_offsets[j]));
+ }
+ }
+
+ // Retrieved all blobs from 2 blob files twice via MultiGetBlob and
+ // TEST_BlobInCache.
+ ASSERT_EQ((int)get_perf_context()->blob_cache_hit_count,
+ num_blobs * blob_files);
+ ASSERT_EQ((int)get_perf_context()->blob_read_count,
+ num_blobs * blob_files); // blocking i/o
+ ASSERT_EQ((int)get_perf_context()->blob_read_byte,
+ blob_records_bytes * blob_files); // blocking i/o
+ ASSERT_GE((int)get_perf_context()->blob_checksum_time, 0);
+ ASSERT_EQ((int)get_perf_context()->blob_decompress_time, 0);
+
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_MISS),
+ num_blobs * blob_files); // MultiGetBlob
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_HIT),
+ num_blobs * blob_files); // TEST_BlobInCache
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_ADD),
+ num_blobs * blob_files); // MultiGetBlob
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_READ),
+ blob_value_bytes * blob_files); // TEST_BlobInCache
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_WRITE),
+ blob_value_bytes * blob_files); // MultiGetBlob
+
+ get_perf_context()->Reset();
+ statistics->Reset().PermitUncheckedError();
+
+ autovector<BlobReadRequest> fake_blob_reqs_in_file;
+ std::array<PinnableSlice, num_blobs> fake_value_buf;
+ std::array<Status, num_blobs> fake_statuses_buf;
+
+ const uint64_t fake_file_number = 100;
+ for (size_t i = 0; i < num_blobs; ++i) {
+ fake_blob_reqs_in_file.emplace_back(
+ keys[i], blob_offsets[i], blob_sizes[i], kNoCompression,
+ &fake_value_buf[i], &fake_statuses_buf[i]);
+ }
+
+ // Add a fake multi-get blob request.
+ blob_reqs.emplace_back(fake_file_number, file_size, fake_blob_reqs_in_file);
+
+ blob_source.MultiGetBlob(read_options, blob_reqs, &bytes_read);
+
+ // Check the real blob read requests.
+ for (size_t i = 0; i < blob_files; ++i) {
+ const uint64_t file_number = i + 1;
+ for (size_t j = 0; j < num_blobs; ++j) {
+ ASSERT_OK(statuses_buf[i * num_blobs + j]);
+ ASSERT_EQ(value_buf[i * num_blobs + j], blobs[j]);
+ ASSERT_TRUE(blob_source.TEST_BlobInCache(file_number, file_size,
+ blob_offsets[j]));
+ }
+ }
+
+ // Check the fake blob request.
+ for (size_t i = 0; i < num_blobs; ++i) {
+ ASSERT_TRUE(fake_statuses_buf[i].IsIOError());
+ ASSERT_TRUE(fake_value_buf[i].empty());
+ ASSERT_FALSE(blob_source.TEST_BlobInCache(fake_file_number, file_size,
+ blob_offsets[i]));
+ }
+
+ // Retrieved all blobs from 3 blob files (including the fake one) twice
+ // via MultiGetBlob and TEST_BlobInCache.
+ ASSERT_EQ((int)get_perf_context()->blob_cache_hit_count,
+ num_blobs * blob_files * 2);
+ ASSERT_EQ((int)get_perf_context()->blob_read_count,
+ 0); // blocking i/o
+ ASSERT_EQ((int)get_perf_context()->blob_read_byte,
+ 0); // blocking i/o
+ ASSERT_GE((int)get_perf_context()->blob_checksum_time, 0);
+ ASSERT_EQ((int)get_perf_context()->blob_decompress_time, 0);
+
+ // Fake blob requests: MultiGetBlob and TEST_BlobInCache
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_MISS), num_blobs * 2);
+ // Real blob requests: MultiGetBlob and TEST_BlobInCache
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_HIT),
+ num_blobs * blob_files * 2);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_ADD), 0);
+ // Real blob requests: MultiGetBlob and TEST_BlobInCache
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_READ),
+ blob_value_bytes * blob_files * 2);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_WRITE), 0);
+ }
+}
+
+TEST_F(BlobSourceTest, MultiGetBlobsFromCache) {
+ options_.cf_paths.emplace_back(
+ test::PerThreadDBPath(env_, "BlobSourceTest_MultiGetBlobsFromCache"), 0);
+
+ options_.statistics = CreateDBStatistics();
+ Statistics* statistics = options_.statistics.get();
+ assert(statistics);
+
+ DestroyAndReopen(options_);
+
+ ImmutableOptions immutable_options(options_);
+
+ constexpr uint32_t column_family_id = 1;
+ constexpr bool has_ttl = false;
+ constexpr ExpirationRange expiration_range;
+ constexpr uint64_t blob_file_number = 1;
+ constexpr size_t num_blobs = 16;
+
+ std::vector<std::string> key_strs;
+ std::vector<std::string> blob_strs;
+
+ for (size_t i = 0; i < num_blobs; ++i) {
+ key_strs.push_back("key" + std::to_string(i));
+ blob_strs.push_back("blob" + std::to_string(i));
+ }
+
+ std::vector<Slice> keys;
+ std::vector<Slice> blobs;
+
+ uint64_t file_size = BlobLogHeader::kSize;
+ for (size_t i = 0; i < num_blobs; ++i) {
+ keys.push_back({key_strs[i]});
+ blobs.push_back({blob_strs[i]});
+ file_size += BlobLogRecord::kHeaderSize + keys[i].size() + blobs[i].size();
+ }
+ file_size += BlobLogFooter::kSize;
+
+ std::vector<uint64_t> blob_offsets(keys.size());
+ std::vector<uint64_t> blob_sizes(keys.size());
+
+ WriteBlobFile(immutable_options, column_family_id, has_ttl, expiration_range,
+ expiration_range, blob_file_number, keys, blobs, kNoCompression,
+ blob_offsets, blob_sizes);
+
+ constexpr size_t capacity = 10;
+ std::shared_ptr<Cache> backing_cache =
+ NewLRUCache(capacity); // Blob file cache
+
+ FileOptions file_options;
+ constexpr HistogramImpl* blob_file_read_hist = nullptr;
+
+ std::unique_ptr<BlobFileCache> blob_file_cache =
+ std::make_unique<BlobFileCache>(
+ backing_cache.get(), &immutable_options, &file_options,
+ column_family_id, blob_file_read_hist, nullptr /*IOTracer*/);
+
+ BlobSource blob_source(&immutable_options, db_id_, db_session_id_,
+ blob_file_cache.get());
+
+ ReadOptions read_options;
+ read_options.verify_checksums = true;
+
+ constexpr FilePrefetchBuffer* prefetch_buffer = nullptr;
+
+ {
+ // MultiGetBlobFromOneFile
+ uint64_t bytes_read = 0;
+ std::array<Status, num_blobs> statuses_buf;
+ std::array<PinnableSlice, num_blobs> value_buf;
+ autovector<BlobReadRequest> blob_reqs;
+
+ for (size_t i = 0; i < num_blobs; i += 2) { // even index
+ blob_reqs.emplace_back(keys[i], blob_offsets[i], blob_sizes[i],
+ kNoCompression, &value_buf[i], &statuses_buf[i]);
+ ASSERT_FALSE(blob_source.TEST_BlobInCache(blob_file_number, file_size,
+ blob_offsets[i]));
+ }
+
+ read_options.fill_cache = true;
+ read_options.read_tier = ReadTier::kReadAllTier;
+ get_perf_context()->Reset();
+ statistics->Reset().PermitUncheckedError();
+
+ // Get half of blobs
+ blob_source.MultiGetBlobFromOneFile(read_options, blob_file_number,
+ file_size, blob_reqs, &bytes_read);
+
+ uint64_t fs_read_bytes = 0;
+ uint64_t ca_read_bytes = 0;
+ for (size_t i = 0; i < num_blobs; ++i) {
+ if (i % 2 == 0) {
+ ASSERT_OK(statuses_buf[i]);
+ ASSERT_EQ(value_buf[i], blobs[i]);
+ ASSERT_TRUE(value_buf[i].IsPinned());
+ fs_read_bytes +=
+ blob_sizes[i] + keys[i].size() + BlobLogRecord::kHeaderSize;
+ ASSERT_TRUE(blob_source.TEST_BlobInCache(blob_file_number, file_size,
+ blob_offsets[i]));
+ ca_read_bytes += blob_sizes[i];
+ } else {
+ statuses_buf[i].PermitUncheckedError();
+ ASSERT_TRUE(value_buf[i].empty());
+ ASSERT_FALSE(value_buf[i].IsPinned());
+ ASSERT_FALSE(blob_source.TEST_BlobInCache(blob_file_number, file_size,
+ blob_offsets[i]));
+ }
+ }
+
+ constexpr int num_even_blobs = num_blobs / 2;
+ ASSERT_EQ((int)get_perf_context()->blob_cache_hit_count, num_even_blobs);
+ ASSERT_EQ((int)get_perf_context()->blob_read_count,
+ num_even_blobs); // blocking i/o
+ ASSERT_EQ((int)get_perf_context()->blob_read_byte,
+ fs_read_bytes); // blocking i/o
+ ASSERT_GE((int)get_perf_context()->blob_checksum_time, 0);
+ ASSERT_EQ((int)get_perf_context()->blob_decompress_time, 0);
+
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_MISS), num_blobs);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_HIT), num_even_blobs);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_ADD), num_even_blobs);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_READ),
+ ca_read_bytes);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_WRITE),
+ ca_read_bytes);
+
+ // Get the rest of blobs
+ for (size_t i = 1; i < num_blobs; i += 2) { // odd index
+ ASSERT_FALSE(blob_source.TEST_BlobInCache(blob_file_number, file_size,
+ blob_offsets[i]));
+
+ ASSERT_OK(blob_source.GetBlob(read_options, keys[i], blob_file_number,
+ blob_offsets[i], file_size, blob_sizes[i],
+ kNoCompression, prefetch_buffer,
+ &value_buf[i], &bytes_read));
+ ASSERT_EQ(value_buf[i], blobs[i]);
+ ASSERT_TRUE(value_buf[i].IsPinned());
+ ASSERT_EQ(bytes_read,
+ BlobLogRecord::kHeaderSize + keys[i].size() + blob_sizes[i]);
+
+ ASSERT_TRUE(blob_source.TEST_BlobInCache(blob_file_number, file_size,
+ blob_offsets[i]));
+ }
+
+ // Cache-only MultiGetBlobFromOneFile
+ read_options.read_tier = ReadTier::kBlockCacheTier;
+ get_perf_context()->Reset();
+ statistics->Reset().PermitUncheckedError();
+
+ blob_reqs.clear();
+ for (size_t i = 0; i < num_blobs; ++i) {
+ blob_reqs.emplace_back(keys[i], blob_offsets[i], blob_sizes[i],
+ kNoCompression, &value_buf[i], &statuses_buf[i]);
+ }
+
+ blob_source.MultiGetBlobFromOneFile(read_options, blob_file_number,
+ file_size, blob_reqs, &bytes_read);
+
+ uint64_t blob_bytes = 0;
+ for (size_t i = 0; i < num_blobs; ++i) {
+ ASSERT_OK(statuses_buf[i]);
+ ASSERT_EQ(value_buf[i], blobs[i]);
+ ASSERT_TRUE(value_buf[i].IsPinned());
+ ASSERT_TRUE(blob_source.TEST_BlobInCache(blob_file_number, file_size,
+ blob_offsets[i]));
+ blob_bytes += blob_sizes[i];
+ }
+
+    // Accessed the blob cache num_blobs * 2 times in total via
+    // MultiGetBlobFromOneFile and TEST_BlobInCache.
+ ASSERT_EQ((int)get_perf_context()->blob_cache_hit_count, num_blobs * 2);
+ ASSERT_EQ((int)get_perf_context()->blob_read_count, 0); // blocking i/o
+ ASSERT_EQ((int)get_perf_context()->blob_read_byte, 0); // blocking i/o
+ ASSERT_GE((int)get_perf_context()->blob_checksum_time, 0);
+ ASSERT_EQ((int)get_perf_context()->blob_decompress_time, 0);
+
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_MISS), 0);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_HIT), num_blobs * 2);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_ADD), 0);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_READ),
+ blob_bytes * 2);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_WRITE), 0);
+ }
+
+ options_.blob_cache->EraseUnRefEntries();
+
+ {
+ // Cache-only MultiGetBlobFromOneFile
+ uint64_t bytes_read = 0;
+ read_options.read_tier = ReadTier::kBlockCacheTier;
+
+ std::array<Status, num_blobs> statuses_buf;
+ std::array<PinnableSlice, num_blobs> value_buf;
+ autovector<BlobReadRequest> blob_reqs;
+
+ for (size_t i = 0; i < num_blobs; i++) {
+ blob_reqs.emplace_back(keys[i], blob_offsets[i], blob_sizes[i],
+ kNoCompression, &value_buf[i], &statuses_buf[i]);
+ ASSERT_FALSE(blob_source.TEST_BlobInCache(blob_file_number, file_size,
+ blob_offsets[i]));
+ }
+
+ get_perf_context()->Reset();
+ statistics->Reset().PermitUncheckedError();
+
+ blob_source.MultiGetBlobFromOneFile(read_options, blob_file_number,
+ file_size, blob_reqs, &bytes_read);
+
+ for (size_t i = 0; i < num_blobs; ++i) {
+ ASSERT_TRUE(statuses_buf[i].IsIncomplete());
+ ASSERT_TRUE(value_buf[i].empty());
+ ASSERT_FALSE(value_buf[i].IsPinned());
+ ASSERT_FALSE(blob_source.TEST_BlobInCache(blob_file_number, file_size,
+ blob_offsets[i]));
+ }
+
+ ASSERT_EQ((int)get_perf_context()->blob_cache_hit_count, 0);
+ ASSERT_EQ((int)get_perf_context()->blob_read_count, 0); // blocking i/o
+ ASSERT_EQ((int)get_perf_context()->blob_read_byte, 0); // blocking i/o
+ ASSERT_EQ((int)get_perf_context()->blob_checksum_time, 0);
+ ASSERT_EQ((int)get_perf_context()->blob_decompress_time, 0);
+
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_MISS), num_blobs * 2);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_HIT), 0);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_ADD), 0);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_READ), 0);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_WRITE), 0);
+ }
+
+ {
+ // MultiGetBlobFromOneFile from non-existing file
+ uint64_t bytes_read = 0;
+ uint64_t non_existing_file_number = 100;
+ read_options.read_tier = ReadTier::kReadAllTier;
+
+ std::array<Status, num_blobs> statuses_buf;
+ std::array<PinnableSlice, num_blobs> value_buf;
+ autovector<BlobReadRequest> blob_reqs;
+
+ for (size_t i = 0; i < num_blobs; i++) {
+ blob_reqs.emplace_back(keys[i], blob_offsets[i], blob_sizes[i],
+ kNoCompression, &value_buf[i], &statuses_buf[i]);
+ ASSERT_FALSE(blob_source.TEST_BlobInCache(non_existing_file_number,
+ file_size, blob_offsets[i]));
+ }
+
+ get_perf_context()->Reset();
+ statistics->Reset().PermitUncheckedError();
+
+ blob_source.MultiGetBlobFromOneFile(read_options, non_existing_file_number,
+ file_size, blob_reqs, &bytes_read);
+
+ for (size_t i = 0; i < num_blobs; ++i) {
+ ASSERT_TRUE(statuses_buf[i].IsIOError());
+ ASSERT_TRUE(value_buf[i].empty());
+ ASSERT_FALSE(value_buf[i].IsPinned());
+ ASSERT_FALSE(blob_source.TEST_BlobInCache(non_existing_file_number,
+ file_size, blob_offsets[i]));
+ }
+
+ ASSERT_EQ((int)get_perf_context()->blob_cache_hit_count, 0);
+ ASSERT_EQ((int)get_perf_context()->blob_read_count, 0); // blocking i/o
+ ASSERT_EQ((int)get_perf_context()->blob_read_byte, 0); // blocking i/o
+ ASSERT_EQ((int)get_perf_context()->blob_checksum_time, 0);
+ ASSERT_EQ((int)get_perf_context()->blob_decompress_time, 0);
+
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_MISS), num_blobs * 2);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_HIT), 0);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_ADD), 0);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_READ), 0);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_WRITE), 0);
+ }
+}
+
+class BlobSecondaryCacheTest : public DBTestBase {
+ public:
+ explicit BlobSecondaryCacheTest()
+ : DBTestBase("blob_secondary_cache_test", /*env_do_fsync=*/true) {
+ options_.env = env_;
+ options_.enable_blob_files = true;
+ options_.create_if_missing = true;
+
+    // Use a small primary cache capacity so that entries get evicted, which
+    // lets us test that the secondary cache is used properly.
+ lru_cache_opts_.capacity = 1024;
+ lru_cache_opts_.num_shard_bits = 0;
+ lru_cache_opts_.strict_capacity_limit = true;
+ lru_cache_opts_.metadata_charge_policy = kDontChargeCacheMetadata;
+ lru_cache_opts_.high_pri_pool_ratio = 0.2;
+ lru_cache_opts_.low_pri_pool_ratio = 0.2;
+
+ secondary_cache_opts_.capacity = 8 << 20; // 8 MB
+ secondary_cache_opts_.num_shard_bits = 0;
+ secondary_cache_opts_.metadata_charge_policy =
+ kDefaultCacheMetadataChargePolicy;
+
+ // Read blobs from the secondary cache if they are not in the primary cache
+ options_.lowest_used_cache_tier = CacheTier::kNonVolatileBlockTier;
+
+ assert(db_->GetDbIdentity(db_id_).ok());
+ assert(db_->GetDbSessionId(db_session_id_).ok());
+ }
+
+ Options options_;
+
+ LRUCacheOptions lru_cache_opts_;
+ CompressedSecondaryCacheOptions secondary_cache_opts_;
+
+ std::string db_id_;
+ std::string db_session_id_;
+};
+
+TEST_F(BlobSecondaryCacheTest, GetBlobsFromSecondaryCache) {
+ if (!Snappy_Supported()) {
+ return;
+ }
+
+ secondary_cache_opts_.compression_type = kSnappyCompression;
+ lru_cache_opts_.secondary_cache =
+ NewCompressedSecondaryCache(secondary_cache_opts_);
+ options_.blob_cache = NewLRUCache(lru_cache_opts_);
+
+ options_.cf_paths.emplace_back(
+ test::PerThreadDBPath(
+ env_, "BlobSecondaryCacheTest_GetBlobsFromSecondaryCache"),
+ 0);
+
+ options_.statistics = CreateDBStatistics();
+ Statistics* statistics = options_.statistics.get();
+ assert(statistics);
+
+ DestroyAndReopen(options_);
+
+ ImmutableOptions immutable_options(options_);
+
+ constexpr uint32_t column_family_id = 1;
+ constexpr bool has_ttl = false;
+ constexpr ExpirationRange expiration_range;
+ constexpr uint64_t file_number = 1;
+
+ Random rnd(301);
+
+ std::vector<std::string> key_strs{"key0", "key1"};
+ std::vector<std::string> blob_strs{rnd.RandomString(512),
+ rnd.RandomString(768)};
+
+ std::vector<Slice> keys{key_strs[0], key_strs[1]};
+ std::vector<Slice> blobs{blob_strs[0], blob_strs[1]};
+
+ std::vector<uint64_t> blob_offsets(keys.size());
+ std::vector<uint64_t> blob_sizes(keys.size());
+
+ WriteBlobFile(immutable_options, column_family_id, has_ttl, expiration_range,
+ expiration_range, file_number, keys, blobs, kNoCompression,
+ blob_offsets, blob_sizes);
+
+ constexpr size_t capacity = 1024;
+ std::shared_ptr<Cache> backing_cache = NewLRUCache(capacity);
+
+ FileOptions file_options;
+ constexpr HistogramImpl* blob_file_read_hist = nullptr;
+
+ std::unique_ptr<BlobFileCache> blob_file_cache(new BlobFileCache(
+ backing_cache.get(), &immutable_options, &file_options, column_family_id,
+ blob_file_read_hist, nullptr /*IOTracer*/));
+
+ BlobSource blob_source(&immutable_options, db_id_, db_session_id_,
+ blob_file_cache.get());
+
+ CacheHandleGuard<BlobFileReader> file_reader;
+ ASSERT_OK(blob_source.GetBlobFileReader(file_number, &file_reader));
+ ASSERT_NE(file_reader.GetValue(), nullptr);
+ const uint64_t file_size = file_reader.GetValue()->GetFileSize();
+ ASSERT_EQ(file_reader.GetValue()->GetCompressionType(), kNoCompression);
+
+ ReadOptions read_options;
+ read_options.verify_checksums = true;
+
+ auto blob_cache = options_.blob_cache;
+ auto secondary_cache = lru_cache_opts_.secondary_cache;
+
+ Cache::CreateCallback create_cb = [](const void* buf, size_t size,
+ void** out_obj,
+ size_t* charge) -> Status {
+ CacheAllocationPtr allocation(new char[size]);
+
+ return BlobContents::CreateCallback(std::move(allocation), buf, size,
+ out_obj, charge);
+ };
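+  // Note on the create callback above: on a secondary cache hit, the
+  // compressed secondary cache hands back the stored bytes and relies on
+  // this callback to allocate a buffer and rebuild the in-memory
+  // BlobContents object (and its cache charge) for the caller.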
+
+ {
+ // GetBlob
+ std::vector<PinnableSlice> values(keys.size());
+
+ read_options.fill_cache = true;
+ get_perf_context()->Reset();
+
+ // key0 should be filled to the primary cache from the blob file.
+ ASSERT_OK(blob_source.GetBlob(read_options, keys[0], file_number,
+ blob_offsets[0], file_size, blob_sizes[0],
+ kNoCompression, nullptr /* prefetch_buffer */,
+ &values[0], nullptr /* bytes_read */));
+ // Release cache handle
+ values[0].Reset();
+
+ // key0 should be evicted and key0's dummy item is inserted into secondary
+ // cache. key1 should be filled to the primary cache from the blob file.
+ ASSERT_OK(blob_source.GetBlob(read_options, keys[1], file_number,
+ blob_offsets[1], file_size, blob_sizes[1],
+ kNoCompression, nullptr /* prefetch_buffer */,
+ &values[1], nullptr /* bytes_read */));
+
+ // Release cache handle
+ values[1].Reset();
+
+ // key0 should be filled to the primary cache from the blob file. key1
+ // should be evicted and key1's dummy item is inserted into secondary cache.
+ ASSERT_OK(blob_source.GetBlob(read_options, keys[0], file_number,
+ blob_offsets[0], file_size, blob_sizes[0],
+ kNoCompression, nullptr /* prefetch_buffer */,
+ &values[0], nullptr /* bytes_read */));
+ ASSERT_EQ(values[0], blobs[0]);
+ ASSERT_TRUE(
+ blob_source.TEST_BlobInCache(file_number, file_size, blob_offsets[0]));
+
+ // Release cache handle
+ values[0].Reset();
+
+ // key0 should be evicted and is inserted into secondary cache.
+ // key1 should be filled to the primary cache from the blob file.
+ ASSERT_OK(blob_source.GetBlob(read_options, keys[1], file_number,
+ blob_offsets[1], file_size, blob_sizes[1],
+ kNoCompression, nullptr /* prefetch_buffer */,
+ &values[1], nullptr /* bytes_read */));
+ ASSERT_EQ(values[1], blobs[1]);
+ ASSERT_TRUE(
+ blob_source.TEST_BlobInCache(file_number, file_size, blob_offsets[1]));
+
+ // Release cache handle
+ values[1].Reset();
+
+ OffsetableCacheKey base_cache_key(db_id_, db_session_id_, file_number);
+
+ // blob_cache here only looks at the primary cache since we didn't provide
+ // the cache item helper for the secondary cache. However, since key0 is
+ // demoted to the secondary cache, we shouldn't be able to find it in the
+ // primary cache.
+ {
+ CacheKey cache_key = base_cache_key.WithOffset(blob_offsets[0]);
+ const Slice key0 = cache_key.AsSlice();
+ auto handle0 = blob_cache->Lookup(key0, statistics);
+ ASSERT_EQ(handle0, nullptr);
+
+ // key0's item should be in the secondary cache.
+ bool is_in_sec_cache = false;
+ auto sec_handle0 =
+ secondary_cache->Lookup(key0, create_cb, true,
+ /*advise_erase=*/true, is_in_sec_cache);
+ ASSERT_FALSE(is_in_sec_cache);
+ ASSERT_NE(sec_handle0, nullptr);
+ ASSERT_TRUE(sec_handle0->IsReady());
+ auto value = static_cast<BlobContents*>(sec_handle0->Value());
+ ASSERT_NE(value, nullptr);
+ ASSERT_EQ(value->data(), blobs[0]);
+ delete value;
+
+      // key0 doesn't exist in the blob cache, although key0's dummy
+      // item exists in the secondary cache.
+ ASSERT_FALSE(blob_source.TEST_BlobInCache(file_number, file_size,
+ blob_offsets[0]));
+ }
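+
+    // A small illustrative check of the sizing assumption behind the
+    // eviction behavior above: with strict_capacity_limit and a 1 KiB
+    // primary cache, the two blobs (512 and 768 bytes) cannot both be
+    // resident at once, so each GetBlob necessarily demotes the previously
+    // cached blob.
+    ASSERT_TRUE(lru_cache_opts_.strict_capacity_limit);
+    ASSERT_GT(blobs[0].size() + blobs[1].size(), lru_cache_opts_.capacity);
+    ASSERT_LT(blobs[0].size() + blobs[1].size(),
+              2 * lru_cache_opts_.capacity);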
+
+    // key1 should exist in the primary cache. key1's dummy item exists
+    // in the secondary cache.
+ {
+ CacheKey cache_key = base_cache_key.WithOffset(blob_offsets[1]);
+ const Slice key1 = cache_key.AsSlice();
+ auto handle1 = blob_cache->Lookup(key1, statistics);
+ ASSERT_NE(handle1, nullptr);
+ blob_cache->Release(handle1);
+
+ bool is_in_sec_cache = false;
+ auto sec_handle1 =
+ secondary_cache->Lookup(key1, create_cb, true,
+ /*advise_erase=*/true, is_in_sec_cache);
+ ASSERT_FALSE(is_in_sec_cache);
+ ASSERT_EQ(sec_handle1, nullptr);
+
+ ASSERT_TRUE(blob_source.TEST_BlobInCache(file_number, file_size,
+ blob_offsets[1]));
+ }
+
+ {
+ // fetch key0 from the blob file to the primary cache.
+ // key1 is evicted and inserted into the secondary cache.
+ ASSERT_OK(blob_source.GetBlob(
+ read_options, keys[0], file_number, blob_offsets[0], file_size,
+ blob_sizes[0], kNoCompression, nullptr /* prefetch_buffer */,
+ &values[0], nullptr /* bytes_read */));
+ ASSERT_EQ(values[0], blobs[0]);
+
+ // Release cache handle
+ values[0].Reset();
+
+ // key0 should be in the primary cache.
+ CacheKey cache_key0 = base_cache_key.WithOffset(blob_offsets[0]);
+ const Slice key0 = cache_key0.AsSlice();
+ auto handle0 = blob_cache->Lookup(key0, statistics);
+ ASSERT_NE(handle0, nullptr);
+ auto value = static_cast<BlobContents*>(blob_cache->Value(handle0));
+ ASSERT_NE(value, nullptr);
+ ASSERT_EQ(value->data(), blobs[0]);
+ blob_cache->Release(handle0);
+
+ // key1 is not in the primary cache and is in the secondary cache.
+ CacheKey cache_key1 = base_cache_key.WithOffset(blob_offsets[1]);
+ const Slice key1 = cache_key1.AsSlice();
+ auto handle1 = blob_cache->Lookup(key1, statistics);
+ ASSERT_EQ(handle1, nullptr);
+
+ // erase key0 from the primary cache.
+ blob_cache->Erase(key0);
+ handle0 = blob_cache->Lookup(key0, statistics);
+ ASSERT_EQ(handle0, nullptr);
+
+      // key1's promotion should succeed because the primary cache is empty.
+      // We didn't call the secondary cache's Lookup() here, because it would
+      // remove the key without being able to promote it to the primary cache.
+      // Instead, we use the end-to-end blob source API to read key1. Inside
+      // TEST_BlobInCache, key1's dummy item is inserted into the primary
+      // cache and a standalone handle is checked by GetValue().
+ ASSERT_TRUE(blob_source.TEST_BlobInCache(file_number, file_size,
+ blob_offsets[1]));
+
+ // key1's dummy handle is in the primary cache and key1's item is still
+ // in the secondary cache. So, the primary cache's Lookup() without
+ // secondary cache support cannot see it. (NOTE: The dummy handle used
+ // to be a leaky abstraction but not anymore.)
+ handle1 = blob_cache->Lookup(key1, statistics);
+ ASSERT_EQ(handle1, nullptr);
+
+ // But after another access, it is promoted to primary cache
+ ASSERT_TRUE(blob_source.TEST_BlobInCache(file_number, file_size,
+ blob_offsets[1]));
+
+ // And Lookup() can find it (without secondary cache support)
+ handle1 = blob_cache->Lookup(key1, statistics);
+ ASSERT_NE(handle1, nullptr);
+ ASSERT_NE(blob_cache->Value(handle1), nullptr);
+ blob_cache->Release(handle1);
+ }
+ }
+}
+
+class BlobSourceCacheReservationTest : public DBTestBase {
+ public:
+ explicit BlobSourceCacheReservationTest()
+ : DBTestBase("blob_source_cache_reservation_test",
+ /*env_do_fsync=*/true) {
+ options_.env = env_;
+ options_.enable_blob_files = true;
+ options_.create_if_missing = true;
+
+ LRUCacheOptions co;
+ co.capacity = kCacheCapacity;
+ co.num_shard_bits = kNumShardBits;
+ co.metadata_charge_policy = kDontChargeCacheMetadata;
+
+ co.high_pri_pool_ratio = 0.0;
+ co.low_pri_pool_ratio = 0.0;
+ std::shared_ptr<Cache> blob_cache = NewLRUCache(co);
+
+ co.high_pri_pool_ratio = 0.5;
+ co.low_pri_pool_ratio = 0.5;
+ std::shared_ptr<Cache> block_cache = NewLRUCache(co);
+
+ options_.blob_cache = blob_cache;
+ options_.lowest_used_cache_tier = CacheTier::kVolatileTier;
+
+ BlockBasedTableOptions block_based_options;
+ block_based_options.no_block_cache = false;
+ block_based_options.block_cache = block_cache;
+ block_based_options.cache_usage_options.options_overrides.insert(
+ {CacheEntryRole::kBlobCache,
+ {/* charged = */ CacheEntryRoleOptions::Decision::kEnabled}});
+ options_.table_factory.reset(
+ NewBlockBasedTableFactory(block_based_options));
+
+ assert(db_->GetDbIdentity(db_id_).ok());
+ assert(db_->GetDbSessionId(db_session_id_).ok());
+ }
+
+ void GenerateKeysAndBlobs() {
+ for (size_t i = 0; i < kNumBlobs; ++i) {
+ key_strs_.push_back("key" + std::to_string(i));
+ blob_strs_.push_back("blob" + std::to_string(i));
+ }
+
+ blob_file_size_ = BlobLogHeader::kSize;
+ for (size_t i = 0; i < kNumBlobs; ++i) {
+ keys_.push_back({key_strs_[i]});
+ blobs_.push_back({blob_strs_[i]});
+ blob_file_size_ +=
+ BlobLogRecord::kHeaderSize + keys_[i].size() + blobs_[i].size();
+ }
+ blob_file_size_ += BlobLogFooter::kSize;
+ }
+
+ static constexpr std::size_t kSizeDummyEntry = CacheReservationManagerImpl<
+ CacheEntryRole::kBlobCache>::GetDummyEntrySize();
+ static constexpr std::size_t kCacheCapacity = 1 * kSizeDummyEntry;
+ static constexpr int kNumShardBits = 0; // 2^0 shard
+
+ static constexpr uint32_t kColumnFamilyId = 1;
+ static constexpr bool kHasTTL = false;
+ static constexpr uint64_t kBlobFileNumber = 1;
+ static constexpr size_t kNumBlobs = 16;
+
+ std::vector<Slice> keys_;
+ std::vector<Slice> blobs_;
+ std::vector<std::string> key_strs_;
+ std::vector<std::string> blob_strs_;
+ uint64_t blob_file_size_;
+
+ Options options_;
+ std::string db_id_;
+ std::string db_session_id_;
+};
+
+#ifndef ROCKSDB_LITE
+TEST_F(BlobSourceCacheReservationTest, SimpleCacheReservation) {
+ options_.cf_paths.emplace_back(
+ test::PerThreadDBPath(
+ env_, "BlobSourceCacheReservationTest_SimpleCacheReservation"),
+ 0);
+
+ GenerateKeysAndBlobs();
+
+ DestroyAndReopen(options_);
+
+ ImmutableOptions immutable_options(options_);
+
+ constexpr ExpirationRange expiration_range;
+
+ std::vector<uint64_t> blob_offsets(keys_.size());
+ std::vector<uint64_t> blob_sizes(keys_.size());
+
+ WriteBlobFile(immutable_options, kColumnFamilyId, kHasTTL, expiration_range,
+ expiration_range, kBlobFileNumber, keys_, blobs_,
+ kNoCompression, blob_offsets, blob_sizes);
+
+ constexpr size_t capacity = 10;
+ std::shared_ptr<Cache> backing_cache = NewLRUCache(capacity);
+
+ FileOptions file_options;
+ constexpr HistogramImpl* blob_file_read_hist = nullptr;
+
+ std::unique_ptr<BlobFileCache> blob_file_cache =
+ std::make_unique<BlobFileCache>(
+ backing_cache.get(), &immutable_options, &file_options,
+ kColumnFamilyId, blob_file_read_hist, nullptr /*IOTracer*/);
+
+ BlobSource blob_source(&immutable_options, db_id_, db_session_id_,
+ blob_file_cache.get());
+
+ ConcurrentCacheReservationManager* cache_res_mgr =
+ static_cast<ChargedCache*>(blob_source.GetBlobCache())
+ ->TEST_GetCacheReservationManager();
+ ASSERT_NE(cache_res_mgr, nullptr);
+
+ ReadOptions read_options;
+ read_options.verify_checksums = true;
+
+ {
+ read_options.fill_cache = false;
+
+ std::vector<PinnableSlice> values(keys_.size());
+
+ for (size_t i = 0; i < kNumBlobs; ++i) {
+ ASSERT_OK(blob_source.GetBlob(
+ read_options, keys_[i], kBlobFileNumber, blob_offsets[i],
+ blob_file_size_, blob_sizes[i], kNoCompression,
+ nullptr /* prefetch_buffer */, &values[i], nullptr /* bytes_read */));
+ ASSERT_EQ(cache_res_mgr->GetTotalReservedCacheSize(), 0);
+ ASSERT_EQ(cache_res_mgr->GetTotalMemoryUsed(), 0);
+ }
+ }
+
+ {
+ read_options.fill_cache = true;
+
+ std::vector<PinnableSlice> values(keys_.size());
+
+    // kNumBlobs is 16 and the blobs are tiny, so the total blob cache usage
+    // stays below a single dummy entry. Therefore, the cache reservation
+    // manager only reserves one dummy entry here; the arithmetic is
+    // sanity-checked right after this block.
+ uint64_t blob_bytes = 0;
+ for (size_t i = 0; i < kNumBlobs; ++i) {
+ ASSERT_OK(blob_source.GetBlob(
+ read_options, keys_[i], kBlobFileNumber, blob_offsets[i],
+ blob_file_size_, blob_sizes[i], kNoCompression,
+ nullptr /* prefetch_buffer */, &values[i], nullptr /* bytes_read */));
+
+ size_t charge = 0;
+ ASSERT_TRUE(blob_source.TEST_BlobInCache(kBlobFileNumber, blob_file_size_,
+ blob_offsets[i], &charge));
+
+ blob_bytes += charge;
+ ASSERT_EQ(cache_res_mgr->GetTotalReservedCacheSize(), kSizeDummyEntry);
+ ASSERT_EQ(cache_res_mgr->GetTotalMemoryUsed(), blob_bytes);
+ ASSERT_EQ(cache_res_mgr->GetTotalMemoryUsed(),
+ options_.blob_cache->GetUsage());
+ }
+ }
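+
+  // A brief sketch of the reservation arithmetic exercised above: the
+  // reservation is made in whole dummy-entry units, so right after the fills
+  // above, the reserved size is a multiple of kSizeDummyEntry and covers the
+  // memory actually used by the cached blobs.
+  ASSERT_EQ(cache_res_mgr->GetTotalReservedCacheSize() % kSizeDummyEntry,
+            std::size_t{0});
+  ASSERT_LE(cache_res_mgr->GetTotalMemoryUsed(),
+            cache_res_mgr->GetTotalReservedCacheSize());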
+
+ {
+ OffsetableCacheKey base_cache_key(db_id_, db_session_id_, kBlobFileNumber);
+ size_t blob_bytes = options_.blob_cache->GetUsage();
+
+ for (size_t i = 0; i < kNumBlobs; ++i) {
+ size_t charge = 0;
+ ASSERT_TRUE(blob_source.TEST_BlobInCache(kBlobFileNumber, blob_file_size_,
+ blob_offsets[i], &charge));
+
+ CacheKey cache_key = base_cache_key.WithOffset(blob_offsets[i]);
+      // We don't call options_.blob_cache->Erase() here; the cache wrapper's
+      // (i.e. ChargedCache's) Erase() method must be called instead so that
+      // the tracked cache usage is updated after the entry is erased.
+ blob_source.GetBlobCache()->Erase(cache_key.AsSlice());
+ if (i == kNumBlobs - 1) {
+ // All the blobs got removed from the cache. cache_res_mgr should not
+ // reserve any space for them.
+ ASSERT_EQ(cache_res_mgr->GetTotalReservedCacheSize(), 0);
+ } else {
+ ASSERT_EQ(cache_res_mgr->GetTotalReservedCacheSize(), kSizeDummyEntry);
+ }
+ blob_bytes -= charge;
+ ASSERT_EQ(cache_res_mgr->GetTotalMemoryUsed(), blob_bytes);
+ ASSERT_EQ(cache_res_mgr->GetTotalMemoryUsed(),
+ options_.blob_cache->GetUsage());
+ }
+ }
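+
+  // End-state sketch of the invariant above: once every blob has been erased
+  // through the ChargedCache wrapper, both the reservation and the tracked
+  // memory usage are back to zero.
+  ASSERT_EQ(cache_res_mgr->GetTotalReservedCacheSize(), std::size_t{0});
+  ASSERT_EQ(cache_res_mgr->GetTotalMemoryUsed(), std::size_t{0});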
+}
+
+TEST_F(BlobSourceCacheReservationTest, IncreaseCacheReservationOnFullCache) {
+ options_.cf_paths.emplace_back(
+ test::PerThreadDBPath(
+ env_,
+ "BlobSourceCacheReservationTest_IncreaseCacheReservationOnFullCache"),
+ 0);
+
+ GenerateKeysAndBlobs();
+
+ DestroyAndReopen(options_);
+
+ ImmutableOptions immutable_options(options_);
+ constexpr size_t blob_size = kSizeDummyEntry / (kNumBlobs / 2);
+ for (size_t i = 0; i < kNumBlobs; ++i) {
+ blob_file_size_ -= blobs_[i].size(); // old blob size
+ blob_strs_[i].resize(blob_size, '@');
+ blobs_[i] = Slice(blob_strs_[i]);
+ blob_file_size_ += blobs_[i].size(); // new blob size
+ }
+
+ std::vector<uint64_t> blob_offsets(keys_.size());
+ std::vector<uint64_t> blob_sizes(keys_.size());
+
+ constexpr ExpirationRange expiration_range;
+ WriteBlobFile(immutable_options, kColumnFamilyId, kHasTTL, expiration_range,
+ expiration_range, kBlobFileNumber, keys_, blobs_,
+ kNoCompression, blob_offsets, blob_sizes);
+
+ constexpr size_t capacity = 10;
+ std::shared_ptr<Cache> backing_cache = NewLRUCache(capacity);
+
+ FileOptions file_options;
+ constexpr HistogramImpl* blob_file_read_hist = nullptr;
+
+ std::unique_ptr<BlobFileCache> blob_file_cache =
+ std::make_unique<BlobFileCache>(
+ backing_cache.get(), &immutable_options, &file_options,
+ kColumnFamilyId, blob_file_read_hist, nullptr /*IOTracer*/);
+
+ BlobSource blob_source(&immutable_options, db_id_, db_session_id_,
+ blob_file_cache.get());
+
+ ConcurrentCacheReservationManager* cache_res_mgr =
+ static_cast<ChargedCache*>(blob_source.GetBlobCache())
+ ->TEST_GetCacheReservationManager();
+ ASSERT_NE(cache_res_mgr, nullptr);
+
+ ReadOptions read_options;
+ read_options.verify_checksums = true;
+
+ {
+ read_options.fill_cache = false;
+
+ std::vector<PinnableSlice> values(keys_.size());
+
+ for (size_t i = 0; i < kNumBlobs; ++i) {
+ ASSERT_OK(blob_source.GetBlob(
+ read_options, keys_[i], kBlobFileNumber, blob_offsets[i],
+ blob_file_size_, blob_sizes[i], kNoCompression,
+ nullptr /* prefetch_buffer */, &values[i], nullptr /* bytes_read */));
+ ASSERT_EQ(cache_res_mgr->GetTotalReservedCacheSize(), 0);
+ ASSERT_EQ(cache_res_mgr->GetTotalMemoryUsed(), 0);
+ }
+ }
+
+ {
+ read_options.fill_cache = true;
+
+ std::vector<PinnableSlice> values(keys_.size());
+
+    // Since we resized each blob to be kSizeDummyEntry / (kNumBlobs / 2), we
+    // can't fit all the blobs in the cache at the same time, which means we
+    // should observe cache evictions once we reach the cache's capacity.
+    // Due to the overhead of the cache and the BlobContents objects, as well
+    // as jemalloc bin sizes, this happens after inserting seven blobs. The
+    // sizing arithmetic is spelled out in the sketch right after this block.
+ uint64_t blob_bytes = 0;
+ for (size_t i = 0; i < kNumBlobs; ++i) {
+ ASSERT_OK(blob_source.GetBlob(
+ read_options, keys_[i], kBlobFileNumber, blob_offsets[i],
+ blob_file_size_, blob_sizes[i], kNoCompression,
+ nullptr /* prefetch_buffer */, &values[i], nullptr /* bytes_read */));
+
+ // Release cache handle
+ values[i].Reset();
+
+ if (i < kNumBlobs / 2 - 1) {
+ size_t charge = 0;
+ ASSERT_TRUE(blob_source.TEST_BlobInCache(
+ kBlobFileNumber, blob_file_size_, blob_offsets[i], &charge));
+
+ blob_bytes += charge;
+ }
+
+ ASSERT_EQ(cache_res_mgr->GetTotalReservedCacheSize(), kSizeDummyEntry);
+ ASSERT_EQ(cache_res_mgr->GetTotalMemoryUsed(), blob_bytes);
+ ASSERT_EQ(cache_res_mgr->GetTotalMemoryUsed(),
+ options_.blob_cache->GetUsage());
+ }
+ }
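+
+  // The sizing assumption behind the eviction behavior above, spelled out as
+  // a compile-time sketch: half of the resized blobs already account for
+  // (essentially) the entire cache capacity, i.e. kCacheCapacity ==
+  // 1 * kSizeDummyEntry, so later inserts can only succeed by evicting
+  // earlier entries.
+  static_assert((kNumBlobs / 2) * blob_size <= kCacheCapacity &&
+                    (kNumBlobs / 2) * (blob_size + 1) > kCacheCapacity,
+                "half of the resized blobs essentially fill the cache");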
+}
+#endif // ROCKSDB_LITE
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/blob/db_blob_basic_test.cc b/src/rocksdb/db/blob/db_blob_basic_test.cc
new file mode 100644
index 000000000..e6832a2ae
--- /dev/null
+++ b/src/rocksdb/db/blob/db_blob_basic_test.cc
@@ -0,0 +1,1789 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include <array>
+#include <sstream>
+#include <string>
+
+#include "cache/compressed_secondary_cache.h"
+#include "db/blob/blob_index.h"
+#include "db/blob/blob_log_format.h"
+#include "db/db_test_util.h"
+#include "port/stack_trace.h"
+#include "test_util/sync_point.h"
+#include "utilities/fault_injection_env.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBBlobBasicTest : public DBTestBase {
+ protected:
+ DBBlobBasicTest()
+ : DBTestBase("db_blob_basic_test", /* env_do_fsync */ false) {}
+};
+
+TEST_F(DBBlobBasicTest, GetBlob) {
+ Options options = GetDefaultOptions();
+ options.enable_blob_files = true;
+ options.min_blob_size = 0;
+
+ Reopen(options);
+
+ constexpr char key[] = "key";
+ constexpr char blob_value[] = "blob_value";
+
+ ASSERT_OK(Put(key, blob_value));
+
+ ASSERT_OK(Flush());
+
+ ASSERT_EQ(Get(key), blob_value);
+
+ // Try again with no I/O allowed. The table and the necessary blocks should
+ // already be in their respective caches; however, the blob itself can only be
+ // read from the blob file, so the read should return Incomplete.
+ ReadOptions read_options;
+ read_options.read_tier = kBlockCacheTier;
+
+ PinnableSlice result;
+ ASSERT_TRUE(db_->Get(read_options, db_->DefaultColumnFamily(), key, &result)
+ .IsIncomplete());
+}
+
+TEST_F(DBBlobBasicTest, GetBlobFromCache) {
+ Options options = GetDefaultOptions();
+
+ LRUCacheOptions co;
+ co.capacity = 2 << 20; // 2MB
+ co.num_shard_bits = 2;
+ co.metadata_charge_policy = kDontChargeCacheMetadata;
+ auto backing_cache = NewLRUCache(co);
+
+ options.enable_blob_files = true;
+ options.blob_cache = backing_cache;
+
+ BlockBasedTableOptions block_based_options;
+ block_based_options.no_block_cache = false;
+ block_based_options.block_cache = backing_cache;
+ block_based_options.cache_index_and_filter_blocks = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(block_based_options));
+
+ Reopen(options);
+
+ constexpr char key[] = "key";
+ constexpr char blob_value[] = "blob_value";
+
+ ASSERT_OK(Put(key, blob_value));
+
+ ASSERT_OK(Flush());
+
+ ReadOptions read_options;
+
+ read_options.fill_cache = false;
+
+ {
+ PinnableSlice result;
+
+ read_options.read_tier = kReadAllTier;
+ ASSERT_OK(db_->Get(read_options, db_->DefaultColumnFamily(), key, &result));
+ ASSERT_EQ(result, blob_value);
+
+ result.Reset();
+ read_options.read_tier = kBlockCacheTier;
+
+ // Try again with no I/O allowed. Since we didn't re-fill the cache, the
+ // blob itself can only be read from the blob file, so the read should
+ // return Incomplete.
+ ASSERT_TRUE(db_->Get(read_options, db_->DefaultColumnFamily(), key, &result)
+ .IsIncomplete());
+ ASSERT_TRUE(result.empty());
+ }
+
+ read_options.fill_cache = true;
+
+ {
+ PinnableSlice result;
+
+ read_options.read_tier = kReadAllTier;
+ ASSERT_OK(db_->Get(read_options, db_->DefaultColumnFamily(), key, &result));
+ ASSERT_EQ(result, blob_value);
+
+ result.Reset();
+ read_options.read_tier = kBlockCacheTier;
+
+ // Try again with no I/O allowed. The table and the necessary blocks/blobs
+ // should already be in their respective caches.
+ ASSERT_OK(db_->Get(read_options, db_->DefaultColumnFamily(), key, &result));
+ ASSERT_EQ(result, blob_value);
+ }
+}
+
+TEST_F(DBBlobBasicTest, IterateBlobsFromCache) {
+ Options options = GetDefaultOptions();
+
+ LRUCacheOptions co;
+ co.capacity = 2 << 20; // 2MB
+ co.num_shard_bits = 2;
+ co.metadata_charge_policy = kDontChargeCacheMetadata;
+ auto backing_cache = NewLRUCache(co);
+
+ options.enable_blob_files = true;
+ options.blob_cache = backing_cache;
+
+ BlockBasedTableOptions block_based_options;
+ block_based_options.no_block_cache = false;
+ block_based_options.block_cache = backing_cache;
+ block_based_options.cache_index_and_filter_blocks = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(block_based_options));
+
+ options.statistics = CreateDBStatistics();
+
+ Reopen(options);
+
+ int num_blobs = 5;
+ std::vector<std::string> keys;
+ std::vector<std::string> blobs;
+
+ for (int i = 0; i < num_blobs; ++i) {
+ keys.push_back("key" + std::to_string(i));
+ blobs.push_back("blob" + std::to_string(i));
+ ASSERT_OK(Put(keys[i], blobs[i]));
+ }
+ ASSERT_OK(Flush());
+
+ ReadOptions read_options;
+
+ {
+ read_options.fill_cache = false;
+ read_options.read_tier = kReadAllTier;
+
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ ASSERT_OK(iter->status());
+
+ int i = 0;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ ASSERT_OK(iter->status());
+ ASSERT_EQ(iter->key().ToString(), keys[i]);
+ ASSERT_EQ(iter->value().ToString(), blobs[i]);
+ ++i;
+ }
+ ASSERT_EQ(i, num_blobs);
+ ASSERT_EQ(options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_ADD), 0);
+ }
+
+ {
+ read_options.fill_cache = false;
+ read_options.read_tier = kBlockCacheTier;
+
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ ASSERT_OK(iter->status());
+
+ // Try again with no I/O allowed. Since we didn't re-fill the cache,
+ // the blob itself can only be read from the blob file, so iter->Valid()
+ // should be false.
+ iter->SeekToFirst();
+ ASSERT_NOK(iter->status());
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_EQ(options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_ADD), 0);
+ }
+
+ {
+ read_options.fill_cache = true;
+ read_options.read_tier = kReadAllTier;
+
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ ASSERT_OK(iter->status());
+
+ // Read blobs from the file and refill the cache.
+ int i = 0;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ ASSERT_OK(iter->status());
+ ASSERT_EQ(iter->key().ToString(), keys[i]);
+ ASSERT_EQ(iter->value().ToString(), blobs[i]);
+ ++i;
+ }
+ ASSERT_EQ(i, num_blobs);
+ ASSERT_EQ(options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_ADD),
+ num_blobs);
+ }
+
+ {
+ read_options.fill_cache = false;
+ read_options.read_tier = kBlockCacheTier;
+
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ ASSERT_OK(iter->status());
+
+ // Try again with no I/O allowed. The table and the necessary blocks/blobs
+ // should already be in their respective caches.
+ int i = 0;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ ASSERT_OK(iter->status());
+ ASSERT_EQ(iter->key().ToString(), keys[i]);
+ ASSERT_EQ(iter->value().ToString(), blobs[i]);
+ ++i;
+ }
+ ASSERT_EQ(i, num_blobs);
+ ASSERT_EQ(options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_ADD), 0);
+ }
+}
+
+TEST_F(DBBlobBasicTest, IterateBlobsFromCachePinning) {
+ constexpr size_t min_blob_size = 6;
+
+ Options options = GetDefaultOptions();
+
+ LRUCacheOptions cache_options;
+ cache_options.capacity = 2048;
+ cache_options.num_shard_bits = 0;
+ cache_options.metadata_charge_policy = kDontChargeCacheMetadata;
+
+ options.blob_cache = NewLRUCache(cache_options);
+ options.enable_blob_files = true;
+ options.min_blob_size = min_blob_size;
+
+ Reopen(options);
+
+ // Put then iterate over three key-values. The second value is below the size
+ // limit and is thus stored inline; the other two are stored separately as
+ // blobs. We expect to have something pinned in the cache iff we are
+ // positioned on a blob.
+
+ constexpr char first_key[] = "first_key";
+ constexpr char first_value[] = "long_value";
+ static_assert(sizeof(first_value) - 1 >= min_blob_size,
+ "first_value too short to be stored as blob");
+
+ ASSERT_OK(Put(first_key, first_value));
+
+ constexpr char second_key[] = "second_key";
+ constexpr char second_value[] = "short";
+ static_assert(sizeof(second_value) - 1 < min_blob_size,
+ "second_value too long to be inlined");
+
+ ASSERT_OK(Put(second_key, second_value));
+
+ constexpr char third_key[] = "third_key";
+ constexpr char third_value[] = "other_long_value";
+ static_assert(sizeof(third_value) - 1 >= min_blob_size,
+ "third_value too short to be stored as blob");
+
+ ASSERT_OK(Put(third_key, third_value));
+
+ ASSERT_OK(Flush());
+
+ {
+ ReadOptions read_options;
+ read_options.fill_cache = true;
+
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+
+ iter->SeekToFirst();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ(iter->key(), first_key);
+ ASSERT_EQ(iter->value(), first_value);
+
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ(iter->key(), second_key);
+ ASSERT_EQ(iter->value(), second_value);
+
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ(iter->key(), third_key);
+ ASSERT_EQ(iter->value(), third_value);
+
+ iter->Next();
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_OK(iter->status());
+ }
+
+ {
+ ReadOptions read_options;
+ read_options.fill_cache = false;
+ read_options.read_tier = kBlockCacheTier;
+
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+
+ iter->SeekToFirst();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ(iter->key(), first_key);
+ ASSERT_EQ(iter->value(), first_value);
+ ASSERT_GT(options.blob_cache->GetPinnedUsage(), 0);
+
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ(iter->key(), second_key);
+ ASSERT_EQ(iter->value(), second_value);
+ ASSERT_EQ(options.blob_cache->GetPinnedUsage(), 0);
+
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ(iter->key(), third_key);
+ ASSERT_EQ(iter->value(), third_value);
+ ASSERT_GT(options.blob_cache->GetPinnedUsage(), 0);
+
+ iter->Next();
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ(options.blob_cache->GetPinnedUsage(), 0);
+ }
+
+ {
+ ReadOptions read_options;
+ read_options.fill_cache = false;
+ read_options.read_tier = kBlockCacheTier;
+
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+
+ iter->SeekToLast();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ(iter->key(), third_key);
+ ASSERT_EQ(iter->value(), third_value);
+ ASSERT_GT(options.blob_cache->GetPinnedUsage(), 0);
+
+ iter->Prev();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ(iter->key(), second_key);
+ ASSERT_EQ(iter->value(), second_value);
+ ASSERT_EQ(options.blob_cache->GetPinnedUsage(), 0);
+
+ iter->Prev();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ(iter->key(), first_key);
+ ASSERT_EQ(iter->value(), first_value);
+ ASSERT_GT(options.blob_cache->GetPinnedUsage(), 0);
+
+ iter->Prev();
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ(options.blob_cache->GetPinnedUsage(), 0);
+ }
+}
+
+TEST_F(DBBlobBasicTest, MultiGetBlobs) {
+ constexpr size_t min_blob_size = 6;
+
+ Options options = GetDefaultOptions();
+ options.enable_blob_files = true;
+ options.min_blob_size = min_blob_size;
+
+ Reopen(options);
+
+ // Put then retrieve three key-values. The first value is below the size limit
+ // and is thus stored inline; the other two are stored separately as blobs.
+ constexpr size_t num_keys = 3;
+
+ constexpr char first_key[] = "first_key";
+ constexpr char first_value[] = "short";
+ static_assert(sizeof(first_value) - 1 < min_blob_size,
+ "first_value too long to be inlined");
+
+ ASSERT_OK(Put(first_key, first_value));
+
+ constexpr char second_key[] = "second_key";
+ constexpr char second_value[] = "long_value";
+ static_assert(sizeof(second_value) - 1 >= min_blob_size,
+ "second_value too short to be stored as blob");
+
+ ASSERT_OK(Put(second_key, second_value));
+
+ constexpr char third_key[] = "third_key";
+ constexpr char third_value[] = "other_long_value";
+ static_assert(sizeof(third_value) - 1 >= min_blob_size,
+ "third_value too short to be stored as blob");
+
+ ASSERT_OK(Put(third_key, third_value));
+
+ ASSERT_OK(Flush());
+
+ ReadOptions read_options;
+
+ std::array<Slice, num_keys> keys{{first_key, second_key, third_key}};
+
+ {
+ std::array<PinnableSlice, num_keys> values;
+ std::array<Status, num_keys> statuses;
+
+ db_->MultiGet(read_options, db_->DefaultColumnFamily(), num_keys, &keys[0],
+ &values[0], &statuses[0]);
+
+ ASSERT_OK(statuses[0]);
+ ASSERT_EQ(values[0], first_value);
+
+ ASSERT_OK(statuses[1]);
+ ASSERT_EQ(values[1], second_value);
+
+ ASSERT_OK(statuses[2]);
+ ASSERT_EQ(values[2], third_value);
+ }
+
+ // Try again with no I/O allowed. The table and the necessary blocks should
+ // already be in their respective caches. The first (inlined) value should be
+ // successfully read; however, the two blob values could only be read from the
+ // blob file, so for those the read should return Incomplete.
+ read_options.read_tier = kBlockCacheTier;
+
+ {
+ std::array<PinnableSlice, num_keys> values;
+ std::array<Status, num_keys> statuses;
+
+ db_->MultiGet(read_options, db_->DefaultColumnFamily(), num_keys, &keys[0],
+ &values[0], &statuses[0]);
+
+ ASSERT_OK(statuses[0]);
+ ASSERT_EQ(values[0], first_value);
+
+ ASSERT_TRUE(statuses[1].IsIncomplete());
+
+ ASSERT_TRUE(statuses[2].IsIncomplete());
+ }
+}
+
+TEST_F(DBBlobBasicTest, MultiGetBlobsFromCache) {
+ Options options = GetDefaultOptions();
+
+ LRUCacheOptions co;
+ co.capacity = 2 << 20; // 2MB
+ co.num_shard_bits = 2;
+ co.metadata_charge_policy = kDontChargeCacheMetadata;
+ auto backing_cache = NewLRUCache(co);
+
+ constexpr size_t min_blob_size = 6;
+ options.min_blob_size = min_blob_size;
+ options.create_if_missing = true;
+ options.enable_blob_files = true;
+ options.blob_cache = backing_cache;
+
+ BlockBasedTableOptions block_based_options;
+ block_based_options.no_block_cache = false;
+ block_based_options.block_cache = backing_cache;
+ block_based_options.cache_index_and_filter_blocks = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(block_based_options));
+
+ DestroyAndReopen(options);
+
+ // Put then retrieve three key-values. The first value is below the size limit
+ // and is thus stored inline; the other two are stored separately as blobs.
+ constexpr size_t num_keys = 3;
+
+ constexpr char first_key[] = "first_key";
+ constexpr char first_value[] = "short";
+ static_assert(sizeof(first_value) - 1 < min_blob_size,
+ "first_value too long to be inlined");
+
+ ASSERT_OK(Put(first_key, first_value));
+
+ constexpr char second_key[] = "second_key";
+ constexpr char second_value[] = "long_value";
+ static_assert(sizeof(second_value) - 1 >= min_blob_size,
+ "second_value too short to be stored as blob");
+
+ ASSERT_OK(Put(second_key, second_value));
+
+ constexpr char third_key[] = "third_key";
+ constexpr char third_value[] = "other_long_value";
+ static_assert(sizeof(third_value) - 1 >= min_blob_size,
+ "third_value too short to be stored as blob");
+
+ ASSERT_OK(Put(third_key, third_value));
+
+ ASSERT_OK(Flush());
+
+ ReadOptions read_options;
+ read_options.fill_cache = false;
+
+ std::array<Slice, num_keys> keys{{first_key, second_key, third_key}};
+
+ {
+ std::array<PinnableSlice, num_keys> values;
+ std::array<Status, num_keys> statuses;
+
+ db_->MultiGet(read_options, db_->DefaultColumnFamily(), num_keys, &keys[0],
+ &values[0], &statuses[0]);
+
+ ASSERT_OK(statuses[0]);
+ ASSERT_EQ(values[0], first_value);
+
+ ASSERT_OK(statuses[1]);
+ ASSERT_EQ(values[1], second_value);
+
+ ASSERT_OK(statuses[2]);
+ ASSERT_EQ(values[2], third_value);
+ }
+
+ // Try again with no I/O allowed. The first (inlined) value should be
+ // successfully read; however, the two blob values could only be read from the
+ // blob file, so for those the read should return Incomplete.
+ read_options.read_tier = kBlockCacheTier;
+
+ {
+ std::array<PinnableSlice, num_keys> values;
+ std::array<Status, num_keys> statuses;
+
+ db_->MultiGet(read_options, db_->DefaultColumnFamily(), num_keys, &keys[0],
+ &values[0], &statuses[0]);
+
+ ASSERT_OK(statuses[0]);
+ ASSERT_EQ(values[0], first_value);
+
+ ASSERT_TRUE(statuses[1].IsIncomplete());
+
+ ASSERT_TRUE(statuses[2].IsIncomplete());
+ }
+
+ // Fill the cache when reading blobs from the blob file.
+ read_options.read_tier = kReadAllTier;
+ read_options.fill_cache = true;
+
+ {
+ std::array<PinnableSlice, num_keys> values;
+ std::array<Status, num_keys> statuses;
+
+ db_->MultiGet(read_options, db_->DefaultColumnFamily(), num_keys, &keys[0],
+ &values[0], &statuses[0]);
+
+ ASSERT_OK(statuses[0]);
+ ASSERT_EQ(values[0], first_value);
+
+ ASSERT_OK(statuses[1]);
+ ASSERT_EQ(values[1], second_value);
+
+ ASSERT_OK(statuses[2]);
+ ASSERT_EQ(values[2], third_value);
+ }
+
+ // Try again with no I/O allowed. All blobs should be successfully read from
+ // the cache.
+ read_options.read_tier = kBlockCacheTier;
+
+ {
+ std::array<PinnableSlice, num_keys> values;
+ std::array<Status, num_keys> statuses;
+
+ db_->MultiGet(read_options, db_->DefaultColumnFamily(), num_keys, &keys[0],
+ &values[0], &statuses[0]);
+
+ ASSERT_OK(statuses[0]);
+ ASSERT_EQ(values[0], first_value);
+
+ ASSERT_OK(statuses[1]);
+ ASSERT_EQ(values[1], second_value);
+
+ ASSERT_OK(statuses[2]);
+ ASSERT_EQ(values[2], third_value);
+ }
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBBlobBasicTest, MultiGetWithDirectIO) {
+ Options options = GetDefaultOptions();
+
+ // First, create an external SST file ["b"].
+ const std::string file_path = dbname_ + "/test.sst";
+ {
+ SstFileWriter sst_file_writer(EnvOptions(), GetDefaultOptions());
+ Status s = sst_file_writer.Open(file_path);
+ ASSERT_OK(s);
+ ASSERT_OK(sst_file_writer.Put("b", "b_value"));
+ ASSERT_OK(sst_file_writer.Finish());
+ }
+
+ options.enable_blob_files = true;
+ options.min_blob_size = 1000;
+ options.use_direct_reads = true;
+ options.allow_ingest_behind = true;
+
+  // Open the DB with a fixed-prefix SST partitioner so that compaction cuts a
+  // new table file whenever the 1-byte prefix of the key changes.
+ constexpr size_t key_len = 1;
+ options.sst_partitioner_factory =
+ NewSstPartitionerFixedPrefixFactory(key_len);
+
+ Status s = TryReopen(options);
+ if (s.IsInvalidArgument()) {
+ ROCKSDB_GTEST_SKIP("This test requires direct IO support");
+ return;
+ }
+ ASSERT_OK(s);
+
+ constexpr size_t num_keys = 3;
+ constexpr size_t blob_size = 3000;
+
+ constexpr char first_key[] = "a";
+ const std::string first_blob(blob_size, 'a');
+ ASSERT_OK(Put(first_key, first_blob));
+
+ constexpr char second_key[] = "b";
+ const std::string second_blob(2 * blob_size, 'b');
+ ASSERT_OK(Put(second_key, second_blob));
+
+ constexpr char third_key[] = "d";
+ const std::string third_blob(blob_size, 'd');
+ ASSERT_OK(Put(third_key, third_blob));
+
+  // first_blob, second_blob and third_blob are stored in the same blob file.
+ // SST Blob file
+ // L0 ["a", "b", "d"] |'aaaa', 'bbbb', 'dddd'|
+ // | | | ^ ^ ^
+ // | | | | | |
+ // | | +---------|-------|--------+
+ // | +-----------------|-------+
+ // +-------------------------+
+ ASSERT_OK(Flush());
+
+ constexpr char fourth_key[] = "c";
+ const std::string fourth_blob(blob_size, 'c');
+ ASSERT_OK(Put(fourth_key, fourth_blob));
+  // fourth_blob is stored in a separate blob file.
+ // SST Blob file SST Blob file
+ // L0 ["a", "b", "d"] |'aaaa', 'bbbb', 'dddd'| ["c"] |'cccc'|
+ // | | | ^ ^ ^ | ^
+ // | | | | | | | |
+ // | | +---------|-------|--------+ +-------+
+ // | +-----------------|-------+
+ // +-------------------------+
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr,
+ /*end=*/nullptr));
+
+ // Due to the above sst partitioner, we get 4 L1 files. The blob files are
+ // unchanged.
+ // |'aaaa', 'bbbb', 'dddd'| |'cccc'|
+ // ^ ^ ^ ^
+ // | | | |
+ // L0 | | | |
+ // L1 ["a"] ["b"] ["c"] | | ["d"] |
+ // | | | | | |
+ // | | +---------|-------|---------------+
+ // | +-----------------|-------+
+ // +-------------------------+
+ ASSERT_EQ(4, NumTableFilesAtLevel(/*level=*/1));
+
+ {
+ // Ingest the external SST file into bottommost level.
+ std::vector<std::string> ext_files{file_path};
+ IngestExternalFileOptions opts;
+ opts.ingest_behind = true;
+ ASSERT_OK(
+ db_->IngestExternalFile(db_->DefaultColumnFamily(), ext_files, opts));
+ }
+
+ // Now the database becomes as follows.
+ // |'aaaa', 'bbbb', 'dddd'| |'cccc'|
+ // ^ ^ ^ ^
+ // | | | |
+ // L0 | | | |
+ // L1 ["a"] ["b"] ["c"] | | ["d"] |
+ // | | | | | |
+ // | | +---------|-------|---------------+
+ // | +-----------------|-------+
+ // +-------------------------+
+ //
+ // L6 ["b"]
+
+ {
+ // Compact ["b"] to bottommost level.
+ Slice begin = Slice(second_key);
+ Slice end = Slice(second_key);
+ CompactRangeOptions cro;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+ ASSERT_OK(db_->CompactRange(cro, &begin, &end));
+ }
+
+ // |'aaaa', 'bbbb', 'dddd'| |'cccc'|
+ // ^ ^ ^ ^
+ // | | | |
+ // L0 | | | |
+ // L1 ["a"] ["c"] | | ["d"] |
+ // | | | | |
+ // | +---------|-------|---------------+
+ // | +-----------------|-------+
+ // +-------|-----------------+
+ // |
+ // L6 ["b"]
+ ASSERT_EQ(3, NumTableFilesAtLevel(/*level=*/1));
+ ASSERT_EQ(1, NumTableFilesAtLevel(/*level=*/6));
+
+ bool called = false;
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->SetCallBack(
+ "RandomAccessFileReader::MultiRead:AlignedReqs", [&](void* arg) {
+ auto* aligned_reqs = static_cast<std::vector<FSReadRequest>*>(arg);
+ assert(aligned_reqs);
+ ASSERT_EQ(1, aligned_reqs->size());
+ called = true;
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ std::array<Slice, num_keys> keys{{first_key, third_key, second_key}};
+
+ {
+ std::array<PinnableSlice, num_keys> values;
+ std::array<Status, num_keys> statuses;
+
+    // MultiGet(), when constructing the KeyContexts, will process the keys in
+    // the following order: a, d, b. The reason is that ["a"] and ["d"] are in
+    // L1, while ["b"] resides in L6.
+    // Consequently, the original FSReadRequest list prepared by
+    // Version::MultiGetBlob() will be for "a", "d" and "b", and it is
+    // unsorted, as follows:
+ //
+ // ["a", offset=30, len=3033],
+ // ["d", offset=9096, len=3033],
+ // ["b", offset=3063, len=6033]
+ //
+ // If we do not sort them before calling MultiRead() in DirectIO, then the
+ // underlying IO merging logic will yield two requests.
+ //
+ // [offset=0, len=4096] (for "a")
+ // [offset=0, len=12288] (result of merging the request for "d" and "b")
+ //
+ // We need to sort them in Version::MultiGetBlob() so that the underlying
+ // IO merging logic in DirectIO mode works as expected. The correct
+ // behavior will be one aligned request:
+ //
+ // [offset=0, len=12288]
+
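+    // A quick sketch of the arithmetic behind the single aligned request
+    // (illustrative only): sorted by offset, the three blob records form one
+    // contiguous span starting at BlobLogHeader::kSize, small enough to be
+    // served by a single 12288-byte (3 x 4 KiB) aligned read.
+    {
+      const uint64_t record_a =
+          BlobLogRecord::kHeaderSize + key_len + blob_size;
+      const uint64_t record_b =
+          BlobLogRecord::kHeaderSize + key_len + 2 * blob_size;
+      const uint64_t span_end = BlobLogHeader::kSize + 2 * record_a + record_b;
+      ASSERT_LE(span_end, uint64_t{3} * 4096);
+    }
+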
+ db_->MultiGet(ReadOptions(), db_->DefaultColumnFamily(), num_keys, &keys[0],
+ &values[0], &statuses[0]);
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+
+ ASSERT_TRUE(called);
+
+ ASSERT_OK(statuses[0]);
+ ASSERT_EQ(values[0], first_blob);
+
+ ASSERT_OK(statuses[1]);
+ ASSERT_EQ(values[1], third_blob);
+
+ ASSERT_OK(statuses[2]);
+ ASSERT_EQ(values[2], second_blob);
+ }
+}
+#endif // !ROCKSDB_LITE
+
+TEST_F(DBBlobBasicTest, MultiGetBlobsFromMultipleFiles) {
+ Options options = GetDefaultOptions();
+
+ LRUCacheOptions co;
+ co.capacity = 2 << 20; // 2MB
+ co.num_shard_bits = 2;
+ co.metadata_charge_policy = kDontChargeCacheMetadata;
+ auto backing_cache = NewLRUCache(co);
+
+ options.min_blob_size = 0;
+ options.create_if_missing = true;
+ options.enable_blob_files = true;
+ options.blob_cache = backing_cache;
+
+ BlockBasedTableOptions block_based_options;
+ block_based_options.no_block_cache = false;
+ block_based_options.block_cache = backing_cache;
+ block_based_options.cache_index_and_filter_blocks = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(block_based_options));
+
+ Reopen(options);
+
+ constexpr size_t kNumBlobFiles = 3;
+ constexpr size_t kNumBlobsPerFile = 3;
+ constexpr size_t kNumKeys = kNumBlobsPerFile * kNumBlobFiles;
+
+ std::vector<std::string> key_strs;
+ std::vector<std::string> value_strs;
+ for (size_t i = 0; i < kNumBlobFiles; ++i) {
+ for (size_t j = 0; j < kNumBlobsPerFile; ++j) {
+ std::string key = "key" + std::to_string(i) + "_" + std::to_string(j);
+ std::string value =
+ "value_as_blob" + std::to_string(i) + "_" + std::to_string(j);
+ ASSERT_OK(Put(key, value));
+ key_strs.push_back(key);
+ value_strs.push_back(value);
+ }
+ ASSERT_OK(Flush());
+ }
+ assert(key_strs.size() == kNumKeys);
+ std::array<Slice, kNumKeys> keys;
+ for (size_t i = 0; i < keys.size(); ++i) {
+ keys[i] = key_strs[i];
+ }
+
+ ReadOptions read_options;
+ read_options.read_tier = kReadAllTier;
+ read_options.fill_cache = false;
+
+ {
+ std::array<PinnableSlice, kNumKeys> values;
+ std::array<Status, kNumKeys> statuses;
+ db_->MultiGet(read_options, db_->DefaultColumnFamily(), kNumKeys, &keys[0],
+ &values[0], &statuses[0]);
+
+ for (size_t i = 0; i < kNumKeys; ++i) {
+ ASSERT_OK(statuses[i]);
+ ASSERT_EQ(value_strs[i], values[i]);
+ }
+ }
+
+ read_options.read_tier = kBlockCacheTier;
+
+ {
+ std::array<PinnableSlice, kNumKeys> values;
+ std::array<Status, kNumKeys> statuses;
+ db_->MultiGet(read_options, db_->DefaultColumnFamily(), kNumKeys, &keys[0],
+ &values[0], &statuses[0]);
+
+ for (size_t i = 0; i < kNumKeys; ++i) {
+ ASSERT_TRUE(statuses[i].IsIncomplete());
+ ASSERT_TRUE(values[i].empty());
+ }
+ }
+
+ read_options.read_tier = kReadAllTier;
+ read_options.fill_cache = true;
+
+ {
+ std::array<PinnableSlice, kNumKeys> values;
+ std::array<Status, kNumKeys> statuses;
+ db_->MultiGet(read_options, db_->DefaultColumnFamily(), kNumKeys, &keys[0],
+ &values[0], &statuses[0]);
+
+ for (size_t i = 0; i < kNumKeys; ++i) {
+ ASSERT_OK(statuses[i]);
+ ASSERT_EQ(value_strs[i], values[i]);
+ }
+ }
+
+ read_options.read_tier = kBlockCacheTier;
+
+ {
+ std::array<PinnableSlice, kNumKeys> values;
+ std::array<Status, kNumKeys> statuses;
+ db_->MultiGet(read_options, db_->DefaultColumnFamily(), kNumKeys, &keys[0],
+ &values[0], &statuses[0]);
+
+ for (size_t i = 0; i < kNumKeys; ++i) {
+ ASSERT_OK(statuses[i]);
+ ASSERT_EQ(value_strs[i], values[i]);
+ }
+ }
+}
+
+TEST_F(DBBlobBasicTest, GetBlob_CorruptIndex) {
+ Options options = GetDefaultOptions();
+ options.enable_blob_files = true;
+ options.min_blob_size = 0;
+
+ Reopen(options);
+
+ constexpr char key[] = "key";
+ constexpr char blob[] = "blob";
+
+ ASSERT_OK(Put(key, blob));
+ ASSERT_OK(Flush());
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "Version::Get::TamperWithBlobIndex", [](void* arg) {
+ Slice* const blob_index = static_cast<Slice*>(arg);
+ assert(blob_index);
+ assert(!blob_index->empty());
+ blob_index->remove_prefix(1);
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ PinnableSlice result;
+ ASSERT_TRUE(db_->Get(ReadOptions(), db_->DefaultColumnFamily(), key, &result)
+ .IsCorruption());
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_F(DBBlobBasicTest, MultiGetBlob_CorruptIndex) {
+ Options options = GetDefaultOptions();
+ options.enable_blob_files = true;
+ options.min_blob_size = 0;
+ options.create_if_missing = true;
+
+ DestroyAndReopen(options);
+
+ constexpr size_t kNumOfKeys = 3;
+ std::array<std::string, kNumOfKeys> key_strs;
+ std::array<std::string, kNumOfKeys> value_strs;
+ std::array<Slice, kNumOfKeys + 1> keys;
+ for (size_t i = 0; i < kNumOfKeys; ++i) {
+ key_strs[i] = "foo" + std::to_string(i);
+ value_strs[i] = "blob_value" + std::to_string(i);
+ ASSERT_OK(Put(key_strs[i], value_strs[i]));
+ keys[i] = key_strs[i];
+ }
+
+ constexpr char key[] = "key";
+ constexpr char blob[] = "blob";
+ ASSERT_OK(Put(key, blob));
+ keys[kNumOfKeys] = key;
+
+ ASSERT_OK(Flush());
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "Version::MultiGet::TamperWithBlobIndex", [&key](void* arg) {
+ KeyContext* const key_context = static_cast<KeyContext*>(arg);
+ assert(key_context);
+ assert(key_context->key);
+
+ if (*(key_context->key) == key) {
+ Slice* const blob_index = key_context->value;
+ assert(blob_index);
+ assert(!blob_index->empty());
+ blob_index->remove_prefix(1);
+ }
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ std::array<PinnableSlice, kNumOfKeys + 1> values;
+ std::array<Status, kNumOfKeys + 1> statuses;
+ db_->MultiGet(ReadOptions(), dbfull()->DefaultColumnFamily(), kNumOfKeys + 1,
+ keys.data(), values.data(), statuses.data(),
+ /*sorted_input=*/false);
+ for (size_t i = 0; i < kNumOfKeys + 1; ++i) {
+ if (i != kNumOfKeys) {
+ ASSERT_OK(statuses[i]);
+ ASSERT_EQ("blob_value" + std::to_string(i), values[i]);
+ } else {
+ ASSERT_TRUE(statuses[i].IsCorruption());
+ }
+ }
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_F(DBBlobBasicTest, MultiGetBlob_ExceedSoftLimit) {
+ Options options = GetDefaultOptions();
+ options.enable_blob_files = true;
+ options.min_blob_size = 0;
+
+ Reopen(options);
+
+ constexpr size_t kNumOfKeys = 3;
+ std::array<std::string, kNumOfKeys> key_bufs;
+ std::array<std::string, kNumOfKeys> value_bufs;
+ std::array<Slice, kNumOfKeys> keys;
+ for (size_t i = 0; i < kNumOfKeys; ++i) {
+ key_bufs[i] = "foo" + std::to_string(i);
+ value_bufs[i] = "blob_value" + std::to_string(i);
+ ASSERT_OK(Put(key_bufs[i], value_bufs[i]));
+ keys[i] = key_bufs[i];
+ }
+ ASSERT_OK(Flush());
+
+ std::array<PinnableSlice, kNumOfKeys> values;
+ std::array<Status, kNumOfKeys> statuses;
+ ReadOptions read_opts;
+ read_opts.value_size_soft_limit = 1;
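+ // Every blob exceeds the one-byte soft limit, so all lookups should abort.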
+ db_->MultiGet(read_opts, dbfull()->DefaultColumnFamily(), kNumOfKeys,
+ keys.data(), values.data(), statuses.data(),
+ /*sorted_input=*/true);
+ for (const auto& s : statuses) {
+ ASSERT_TRUE(s.IsAborted());
+ }
+}
+
+TEST_F(DBBlobBasicTest, GetBlob_InlinedTTLIndex) {
+ constexpr uint64_t min_blob_size = 10;
+
+ Options options = GetDefaultOptions();
+ options.enable_blob_files = true;
+ options.min_blob_size = min_blob_size;
+
+ Reopen(options);
+
+ constexpr char key[] = "key";
+ constexpr char blob[] = "short";
+ static_assert(sizeof(blob) - 1 < min_blob_size,
+ "Blob too long to be inlined");
+
+ // Fake an inlined TTL blob index.
+ std::string blob_index;
+
+ constexpr uint64_t expiration = 1234567890;
+
+ BlobIndex::EncodeInlinedTTL(&blob_index, expiration, blob);
+
+ WriteBatch batch;
+ ASSERT_OK(WriteBatchInternal::PutBlobIndex(&batch, 0, key, blob_index));
+ ASSERT_OK(db_->Write(WriteOptions(), &batch));
+
+ ASSERT_OK(Flush());
+
+ PinnableSlice result;
+ ASSERT_TRUE(db_->Get(ReadOptions(), db_->DefaultColumnFamily(), key, &result)
+ .IsCorruption());
+}
+
+TEST_F(DBBlobBasicTest, GetBlob_IndexWithInvalidFileNumber) {
+ Options options = GetDefaultOptions();
+ options.enable_blob_files = true;
+ options.min_blob_size = 0;
+
+ Reopen(options);
+
+ constexpr char key[] = "key";
+
+ // Fake a blob index referencing a non-existent blob file.
+ std::string blob_index;
+
+ constexpr uint64_t blob_file_number = 1000;
+ constexpr uint64_t offset = 1234;
+ constexpr uint64_t size = 5678;
+
+ BlobIndex::EncodeBlob(&blob_index, blob_file_number, offset, size,
+ kNoCompression);
+
+ WriteBatch batch;
+ ASSERT_OK(WriteBatchInternal::PutBlobIndex(&batch, 0, key, blob_index));
+ ASSERT_OK(db_->Write(WriteOptions(), &batch));
+
+ ASSERT_OK(Flush());
+
+ PinnableSlice result;
+ ASSERT_TRUE(db_->Get(ReadOptions(), db_->DefaultColumnFamily(), key, &result)
+ .IsCorruption());
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBBlobBasicTest, GenerateIOTracing) {
+ Options options = GetDefaultOptions();
+ options.enable_blob_files = true;
+ options.min_blob_size = 0;
+ std::string trace_file = dbname_ + "/io_trace_file";
+
+ Reopen(options);
+ {
+ // Create IO trace file
+ std::unique_ptr<TraceWriter> trace_writer;
+ ASSERT_OK(
+ NewFileTraceWriter(env_, EnvOptions(), trace_file, &trace_writer));
+ ASSERT_OK(db_->StartIOTrace(TraceOptions(), std::move(trace_writer)));
+
+ constexpr char key[] = "key";
+ constexpr char blob_value[] = "blob_value";
+
+ ASSERT_OK(Put(key, blob_value));
+ ASSERT_OK(Flush());
+ ASSERT_EQ(Get(key), blob_value);
+
+ ASSERT_OK(db_->EndIOTrace());
+ ASSERT_OK(env_->FileExists(trace_file));
+ }
+ {
+ // Parse trace file to check file operations related to blob files are
+ // recorded.
+ std::unique_ptr<TraceReader> trace_reader;
+ ASSERT_OK(
+ NewFileTraceReader(env_, EnvOptions(), trace_file, &trace_reader));
+ IOTraceReader reader(std::move(trace_reader));
+
+ IOTraceHeader header;
+ ASSERT_OK(reader.ReadHeader(&header));
+ ASSERT_EQ(kMajorVersion, static_cast<int>(header.rocksdb_major_version));
+ ASSERT_EQ(kMinorVersion, static_cast<int>(header.rocksdb_minor_version));
+
+ // Read records.
+ int blob_files_op_count = 0;
+ Status status;
+ while (true) {
+ IOTraceRecord record;
+ status = reader.ReadIOOp(&record);
+ if (!status.ok()) {
+ break;
+ }
+ if (record.file_name.find("blob") != std::string::npos) {
+ blob_files_op_count++;
+ }
+ }
+ // Assuming blob files will have Append, Close and then Read operations.
+ ASSERT_GT(blob_files_op_count, 2);
+ }
+}
+#endif // !ROCKSDB_LITE
+
+TEST_F(DBBlobBasicTest, BestEffortsRecovery_MissingNewestBlobFile) {
+ Options options = GetDefaultOptions();
+ options.enable_blob_files = true;
+ options.min_blob_size = 0;
+ options.create_if_missing = true;
+ Reopen(options);
+
+ ASSERT_OK(dbfull()->DisableFileDeletions());
+ constexpr int kNumTableFiles = 2;
+ for (int i = 0; i < kNumTableFiles; ++i) {
+ for (char ch = 'a'; ch != 'c'; ++ch) {
+ std::string key(1, ch);
+ ASSERT_OK(Put(key, "value" + std::to_string(i)));
+ }
+ ASSERT_OK(Flush());
+ }
+
+ Close();
+
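+ // Locate and delete the newest blob file; best-efforts recovery should
+ // then fall back to the last version that does not reference it.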
+ std::vector<std::string> files;
+ ASSERT_OK(env_->GetChildren(dbname_, &files));
+ std::string blob_file_path;
+ uint64_t max_blob_file_num = kInvalidBlobFileNumber;
+ for (const auto& fname : files) {
+ uint64_t file_num = 0;
+ FileType type;
+ if (ParseFileName(fname, &file_num, /*info_log_name_prefix=*/"", &type) &&
+ type == kBlobFile) {
+ if (file_num > max_blob_file_num) {
+ max_blob_file_num = file_num;
+ blob_file_path = dbname_ + "/" + fname;
+ }
+ }
+ }
+ ASSERT_OK(env_->DeleteFile(blob_file_path));
+
+ options.best_efforts_recovery = true;
+ Reopen(options);
+ std::string value;
+ ASSERT_OK(db_->Get(ReadOptions(), "a", &value));
+ ASSERT_EQ("value" + std::to_string(kNumTableFiles - 2), value);
+}
+
+TEST_F(DBBlobBasicTest, GetMergeBlobWithPut) {
+ Options options = GetDefaultOptions();
+ options.merge_operator = MergeOperators::CreateStringAppendOperator();
+ options.enable_blob_files = true;
+ options.min_blob_size = 0;
+
+ Reopen(options);
+
+ ASSERT_OK(Put("Key1", "v1"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Merge("Key1", "v2"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Merge("Key1", "v3"));
+ ASSERT_OK(Flush());
+
+ std::string value;
+ ASSERT_OK(db_->Get(ReadOptions(), "Key1", &value));
+ ASSERT_EQ(Get("Key1"), "v1,v2,v3");
+}
+
+TEST_F(DBBlobBasicTest, MultiGetMergeBlobWithPut) {
+ constexpr size_t num_keys = 3;
+
+ Options options = GetDefaultOptions();
+ options.merge_operator = MergeOperators::CreateStringAppendOperator();
+ options.enable_blob_files = true;
+ options.min_blob_size = 0;
+
+ Reopen(options);
+
+ ASSERT_OK(Put("Key0", "v0_0"));
+ ASSERT_OK(Put("Key1", "v1_0"));
+ ASSERT_OK(Put("Key2", "v2_0"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Merge("Key0", "v0_1"));
+ ASSERT_OK(Merge("Key1", "v1_1"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Merge("Key0", "v0_2"));
+ ASSERT_OK(Flush());
+
+ std::array<Slice, num_keys> keys{{"Key0", "Key1", "Key2"}};
+ std::array<PinnableSlice, num_keys> values;
+ std::array<Status, num_keys> statuses;
+
+ db_->MultiGet(ReadOptions(), db_->DefaultColumnFamily(), num_keys, &keys[0],
+ &values[0], &statuses[0]);
+
+ ASSERT_OK(statuses[0]);
+ ASSERT_EQ(values[0], "v0_0,v0_1,v0_2");
+
+ ASSERT_OK(statuses[1]);
+ ASSERT_EQ(values[1], "v1_0,v1_1");
+
+ ASSERT_OK(statuses[2]);
+ ASSERT_EQ(values[2], "v2_0");
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBBlobBasicTest, Properties) {
+ Options options = GetDefaultOptions();
+ options.enable_blob_files = true;
+ options.min_blob_size = 0;
+
+ Reopen(options);
+
+ constexpr char key1[] = "key1";
+ constexpr size_t key1_size = sizeof(key1) - 1;
+
+ constexpr char key2[] = "key2";
+ constexpr size_t key2_size = sizeof(key2) - 1;
+
+ constexpr char key3[] = "key3";
+ constexpr size_t key3_size = sizeof(key3) - 1;
+
+ constexpr char blob[] = "00000000000000";
+ constexpr size_t blob_size = sizeof(blob) - 1;
+
+ constexpr char longer_blob[] = "00000000000000000000";
+ constexpr size_t longer_blob_size = sizeof(longer_blob) - 1;
+
+ ASSERT_OK(Put(key1, blob));
+ ASSERT_OK(Put(key2, longer_blob));
+ ASSERT_OK(Flush());
+
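+ // Expected blob file size: file header, one record per key/value pair
+ // (record header adjustment plus blob payload), and the file footer.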
+ constexpr size_t first_blob_file_expected_size =
+ BlobLogHeader::kSize +
+ BlobLogRecord::CalculateAdjustmentForRecordHeader(key1_size) + blob_size +
+ BlobLogRecord::CalculateAdjustmentForRecordHeader(key2_size) +
+ longer_blob_size + BlobLogFooter::kSize;
+
+ ASSERT_OK(Put(key3, blob));
+ ASSERT_OK(Flush());
+
+ constexpr size_t second_blob_file_expected_size =
+ BlobLogHeader::kSize +
+ BlobLogRecord::CalculateAdjustmentForRecordHeader(key3_size) + blob_size +
+ BlobLogFooter::kSize;
+
+ constexpr size_t total_expected_size =
+ first_blob_file_expected_size + second_blob_file_expected_size;
+
+ // Number of blob files
+ uint64_t num_blob_files = 0;
+ ASSERT_TRUE(
+ db_->GetIntProperty(DB::Properties::kNumBlobFiles, &num_blob_files));
+ ASSERT_EQ(num_blob_files, 2);
+
+ // Total size of live blob files
+ uint64_t live_blob_file_size = 0;
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kLiveBlobFileSize,
+ &live_blob_file_size));
+ ASSERT_EQ(live_blob_file_size, total_expected_size);
+
+ // Total amount of garbage in live blob files
+ {
+ uint64_t live_blob_file_garbage_size = 0;
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kLiveBlobFileGarbageSize,
+ &live_blob_file_garbage_size));
+ ASSERT_EQ(live_blob_file_garbage_size, 0);
+ }
+
+ // Total size of all blob files across all versions
+ // Note: this should be the same as above since we only have one
+ // version at this point.
+ uint64_t total_blob_file_size = 0;
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kTotalBlobFileSize,
+ &total_blob_file_size));
+ ASSERT_EQ(total_blob_file_size, total_expected_size);
+
+ // Delete key2 to create some garbage
+ ASSERT_OK(Delete(key2));
+ ASSERT_OK(Flush());
+
+ constexpr Slice* begin = nullptr;
+ constexpr Slice* end = nullptr;
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), begin, end));
+
+ constexpr size_t expected_garbage_size =
+ BlobLogRecord::CalculateAdjustmentForRecordHeader(key2_size) +
+ longer_blob_size;
+
+ constexpr double expected_space_amp =
+ static_cast<double>(total_expected_size) /
+ (total_expected_size - expected_garbage_size);
+
+ // Blob file stats
+ std::string blob_stats;
+ ASSERT_TRUE(db_->GetProperty(DB::Properties::kBlobStats, &blob_stats));
+
+ std::ostringstream oss;
+ oss << "Number of blob files: 2\nTotal size of blob files: "
+ << total_expected_size
+ << "\nTotal size of garbage in blob files: " << expected_garbage_size
+ << "\nBlob file space amplification: " << expected_space_amp << '\n';
+
+ ASSERT_EQ(blob_stats, oss.str());
+
+ // Total amount of garbage in live blob files
+ {
+ uint64_t live_blob_file_garbage_size = 0;
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kLiveBlobFileGarbageSize,
+ &live_blob_file_garbage_size));
+ ASSERT_EQ(live_blob_file_garbage_size, expected_garbage_size);
+ }
+}
+
+TEST_F(DBBlobBasicTest, PropertiesMultiVersion) {
+ Options options = GetDefaultOptions();
+ options.enable_blob_files = true;
+ options.min_blob_size = 0;
+
+ Reopen(options);
+
+ constexpr char key1[] = "key1";
+ constexpr char key2[] = "key2";
+ constexpr char key3[] = "key3";
+
+ constexpr size_t key_size = sizeof(key1) - 1;
+ static_assert(sizeof(key2) - 1 == key_size, "unexpected size: key2");
+ static_assert(sizeof(key3) - 1 == key_size, "unexpected size: key3");
+
+ constexpr char blob[] = "0000000000";
+ constexpr size_t blob_size = sizeof(blob) - 1;
+
+ ASSERT_OK(Put(key1, blob));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put(key2, blob));
+ ASSERT_OK(Flush());
+
+ // Create an iterator to keep the current version alive
+ std::unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions()));
+ ASSERT_OK(iter->status());
+
+ // Note: the Delete and subsequent compaction results in the first blob file
+ // not making it to the final version. (It is still part of the previous
+ // version kept alive by the iterator though.) On the other hand, the Put
+ // results in a third blob file.
+ ASSERT_OK(Delete(key1));
+ ASSERT_OK(Put(key3, blob));
+ ASSERT_OK(Flush());
+
+ constexpr Slice* begin = nullptr;
+ constexpr Slice* end = nullptr;
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), begin, end));
+
+ // Total size of all blob files across all versions: between the two versions,
+ // we should have three blob files of the same size with one blob each.
+ // The version kept alive by the iterator contains the first and the second
+ // blob file, while the final version contains the second and the third blob
+ // file. (The second blob file is thus shared by the two versions but should
+ // be counted only once.)
+ uint64_t total_blob_file_size = 0;
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kTotalBlobFileSize,
+ &total_blob_file_size));
+ ASSERT_EQ(total_blob_file_size,
+ 3 * (BlobLogHeader::kSize +
+ BlobLogRecord::CalculateAdjustmentForRecordHeader(key_size) +
+ blob_size + BlobLogFooter::kSize));
+}
+#endif // !ROCKSDB_LITE
+
+class DBBlobBasicIOErrorTest : public DBBlobBasicTest,
+ public testing::WithParamInterface<std::string> {
+ protected:
+ DBBlobBasicIOErrorTest() : sync_point_(GetParam()) {
+ fault_injection_env_.reset(new FaultInjectionTestEnv(env_));
+ }
+ ~DBBlobBasicIOErrorTest() { Close(); }
+
+ std::unique_ptr<FaultInjectionTestEnv> fault_injection_env_;
+ std::string sync_point_;
+};
+
+class DBBlobBasicIOErrorMultiGetTest : public DBBlobBasicIOErrorTest {
+ public:
+ DBBlobBasicIOErrorMultiGetTest() : DBBlobBasicIOErrorTest() {}
+};
+
+INSTANTIATE_TEST_CASE_P(DBBlobBasicTest, DBBlobBasicIOErrorTest,
+ ::testing::ValuesIn(std::vector<std::string>{
+ "BlobFileReader::OpenFile:NewRandomAccessFile",
+ "BlobFileReader::GetBlob:ReadFromFile"}));
+
+INSTANTIATE_TEST_CASE_P(DBBlobBasicTest, DBBlobBasicIOErrorMultiGetTest,
+ ::testing::ValuesIn(std::vector<std::string>{
+ "BlobFileReader::OpenFile:NewRandomAccessFile",
+ "BlobFileReader::MultiGetBlob:ReadFromFile"}));
+
+TEST_P(DBBlobBasicIOErrorTest, GetBlob_IOError) {
+ Options options;
+ options.env = fault_injection_env_.get();
+ options.enable_blob_files = true;
+ options.min_blob_size = 0;
+
+ Reopen(options);
+
+ constexpr char key[] = "key";
+ constexpr char blob_value[] = "blob_value";
+
+ ASSERT_OK(Put(key, blob_value));
+
+ ASSERT_OK(Flush());
+
+ SyncPoint::GetInstance()->SetCallBack(sync_point_, [this](void* /* arg */) {
+ fault_injection_env_->SetFilesystemActive(false,
+ Status::IOError(sync_point_));
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ PinnableSlice result;
+ ASSERT_TRUE(db_->Get(ReadOptions(), db_->DefaultColumnFamily(), key, &result)
+ .IsIOError());
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_P(DBBlobBasicIOErrorMultiGetTest, MultiGetBlobs_IOError) {
+ Options options = GetDefaultOptions();
+ options.env = fault_injection_env_.get();
+ options.enable_blob_files = true;
+ options.min_blob_size = 0;
+
+ Reopen(options);
+
+ constexpr size_t num_keys = 2;
+
+ constexpr char first_key[] = "first_key";
+ constexpr char first_value[] = "first_value";
+
+ ASSERT_OK(Put(first_key, first_value));
+
+ constexpr char second_key[] = "second_key";
+ constexpr char second_value[] = "second_value";
+
+ ASSERT_OK(Put(second_key, second_value));
+
+ ASSERT_OK(Flush());
+
+ std::array<Slice, num_keys> keys{{first_key, second_key}};
+ std::array<PinnableSlice, num_keys> values;
+ std::array<Status, num_keys> statuses;
+
+ SyncPoint::GetInstance()->SetCallBack(sync_point_, [this](void* /* arg */) {
+ fault_injection_env_->SetFilesystemActive(false,
+ Status::IOError(sync_point_));
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ db_->MultiGet(ReadOptions(), db_->DefaultColumnFamily(), num_keys, &keys[0],
+ &values[0], &statuses[0]);
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+
+ ASSERT_TRUE(statuses[0].IsIOError());
+ ASSERT_TRUE(statuses[1].IsIOError());
+}
+
+TEST_P(DBBlobBasicIOErrorMultiGetTest, MultipleBlobFiles) {
+ Options options = GetDefaultOptions();
+ options.env = fault_injection_env_.get();
+ options.enable_blob_files = true;
+ options.min_blob_size = 0;
+
+ Reopen(options);
+
+ constexpr size_t num_keys = 2;
+
+ constexpr char key1[] = "key1";
+ constexpr char value1[] = "blob1";
+
+ ASSERT_OK(Put(key1, value1));
+ ASSERT_OK(Flush());
+
+ constexpr char key2[] = "key2";
+ constexpr char value2[] = "blob2";
+
+ ASSERT_OK(Put(key2, value2));
+ ASSERT_OK(Flush());
+
+ std::array<Slice, num_keys> keys{{key1, key2}};
+ std::array<PinnableSlice, num_keys> values;
+ std::array<Status, num_keys> statuses;
+
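+ // Let the read from the first blob file succeed, then inject an I/O error
+ // when the second blob file is accessed.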
+ bool first_blob_file = true;
+ SyncPoint::GetInstance()->SetCallBack(
+ sync_point_, [&first_blob_file, this](void* /* arg */) {
+ if (first_blob_file) {
+ first_blob_file = false;
+ return;
+ }
+ fault_injection_env_->SetFilesystemActive(false,
+ Status::IOError(sync_point_));
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ db_->MultiGet(ReadOptions(), db_->DefaultColumnFamily(), num_keys,
+ keys.data(), values.data(), statuses.data());
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ ASSERT_OK(statuses[0]);
+ ASSERT_EQ(value1, values[0]);
+ ASSERT_TRUE(statuses[1].IsIOError());
+}
+
+namespace {
+
+class ReadBlobCompactionFilter : public CompactionFilter {
+ public:
+ ReadBlobCompactionFilter() = default;
+ const char* Name() const override {
+ return "rocksdb.compaction.filter.read.blob";
+ }
+ CompactionFilter::Decision FilterV2(
+ int /*level*/, const Slice& /*key*/, ValueType value_type,
+ const Slice& existing_value, std::string* new_value,
+ std::string* /*skip_until*/) const override {
+ if (value_type != CompactionFilter::ValueType::kValue) {
+ return CompactionFilter::Decision::kKeep;
+ }
+ assert(new_value);
+ new_value->assign(existing_value.data(), existing_value.size());
+ return CompactionFilter::Decision::kChangeValue;
+ }
+};
+
+} // anonymous namespace
+
+TEST_P(DBBlobBasicIOErrorTest, CompactionFilterReadBlob_IOError) {
+ Options options = GetDefaultOptions();
+ options.env = fault_injection_env_.get();
+ options.enable_blob_files = true;
+ options.min_blob_size = 0;
+ options.create_if_missing = true;
+ std::unique_ptr<CompactionFilter> compaction_filter_guard(
+ new ReadBlobCompactionFilter);
+ options.compaction_filter = compaction_filter_guard.get();
+
+ DestroyAndReopen(options);
+ constexpr char key[] = "foo";
+ constexpr char blob_value[] = "foo_blob_value";
+ ASSERT_OK(Put(key, blob_value));
+ ASSERT_OK(Flush());
+
+ SyncPoint::GetInstance()->SetCallBack(sync_point_, [this](void* /* arg */) {
+ fault_injection_env_->SetFilesystemActive(false,
+ Status::IOError(sync_point_));
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_TRUE(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr,
+ /*end=*/nullptr)
+ .IsIOError());
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_F(DBBlobBasicTest, WarmCacheWithBlobsDuringFlush) {
+ Options options = GetDefaultOptions();
+
+ LRUCacheOptions co;
+ co.capacity = 1 << 25;
+ co.num_shard_bits = 2;
+ co.metadata_charge_policy = kDontChargeCacheMetadata;
+ auto backing_cache = NewLRUCache(co);
+
+ options.blob_cache = backing_cache;
+
+ BlockBasedTableOptions block_based_options;
+ block_based_options.no_block_cache = false;
+ block_based_options.block_cache = backing_cache;
+ block_based_options.cache_index_and_filter_blocks = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(block_based_options));
+
+ options.enable_blob_files = true;
+ options.create_if_missing = true;
+ options.disable_auto_compactions = true;
+ options.enable_blob_garbage_collection = true;
+ options.blob_garbage_collection_age_cutoff = 1.0;
+ options.prepopulate_blob_cache = PrepopulateBlobCache::kFlushOnly;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+
+ DestroyAndReopen(options);
+
+ constexpr size_t kNumBlobs = 10;
+ constexpr size_t kValueSize = 100;
+
+ std::string value(kValueSize, 'a');
+
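+ // With kFlushOnly prepopulation, each flush should insert the two newly
+ // written blobs into the cache, so the Gets below are all cache hits.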
+ for (size_t i = 1; i <= kNumBlobs; i++) {
+ ASSERT_OK(Put(std::to_string(i), value));
+ ASSERT_OK(Put(std::to_string(i + kNumBlobs), value)); // Add some overlap
+ ASSERT_OK(Flush());
+ ASSERT_EQ(i * 2, options.statistics->getTickerCount(BLOB_DB_CACHE_ADD));
+ ASSERT_EQ(value, Get(std::to_string(i)));
+ ASSERT_EQ(value, Get(std::to_string(i + kNumBlobs)));
+ ASSERT_EQ(0, options.statistics->getTickerCount(BLOB_DB_CACHE_MISS));
+ ASSERT_EQ(i * 2, options.statistics->getTickerCount(BLOB_DB_CACHE_HIT));
+ }
+
+ // Verify compaction not counted
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr,
+ /*end=*/nullptr));
+ EXPECT_EQ(kNumBlobs * 2,
+ options.statistics->getTickerCount(BLOB_DB_CACHE_ADD));
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBBlobBasicTest, DynamicallyWarmCacheDuringFlush) {
+ Options options = GetDefaultOptions();
+
+ LRUCacheOptions co;
+ co.capacity = 1 << 25;
+ co.num_shard_bits = 2;
+ co.metadata_charge_policy = kDontChargeCacheMetadata;
+ auto backing_cache = NewLRUCache(co);
+
+ options.blob_cache = backing_cache;
+
+ BlockBasedTableOptions block_based_options;
+ block_based_options.no_block_cache = false;
+ block_based_options.block_cache = backing_cache;
+ block_based_options.cache_index_and_filter_blocks = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(block_based_options));
+
+ options.enable_blob_files = true;
+ options.create_if_missing = true;
+ options.disable_auto_compactions = true;
+ options.enable_blob_garbage_collection = true;
+ options.blob_garbage_collection_age_cutoff = 1.0;
+ options.prepopulate_blob_cache = PrepopulateBlobCache::kFlushOnly;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+
+ DestroyAndReopen(options);
+
+ constexpr size_t kNumBlobs = 10;
+ constexpr size_t kValueSize = 100;
+
+ std::string value(kValueSize, 'a');
+
+ for (size_t i = 1; i <= 5; i++) {
+ ASSERT_OK(Put(std::to_string(i), value));
+ ASSERT_OK(Put(std::to_string(i + kNumBlobs), value)); // Add some overlap
+ ASSERT_OK(Flush());
+ ASSERT_EQ(2, options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_ADD));
+
+ ASSERT_EQ(value, Get(std::to_string(i)));
+ ASSERT_EQ(value, Get(std::to_string(i + kNumBlobs)));
+ ASSERT_EQ(0, options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_ADD));
+ ASSERT_EQ(0,
+ options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_MISS));
+ ASSERT_EQ(2, options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_HIT));
+ }
+
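+ // Dynamically disable prepopulation; subsequent flushes should no longer
+ // warm the cache, so the first read of each blob is a miss.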
+ ASSERT_OK(dbfull()->SetOptions({{"prepopulate_blob_cache", "kDisable"}}));
+
+ for (size_t i = 6; i <= kNumBlobs; i++) {
+ ASSERT_OK(Put(std::to_string(i), value));
+ ASSERT_OK(Put(std::to_string(i + kNumBlobs), value)); // Add some overlap
+ ASSERT_OK(Flush());
+ ASSERT_EQ(0, options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_ADD));
+
+ ASSERT_EQ(value, Get(std::to_string(i)));
+ ASSERT_EQ(value, Get(std::to_string(i + kNumBlobs)));
+ ASSERT_EQ(2, options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_ADD));
+ ASSERT_EQ(2,
+ options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_MISS));
+ ASSERT_EQ(0, options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_HIT));
+ }
+
+ // Verify compaction not counted
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr,
+ /*end=*/nullptr));
+ EXPECT_EQ(0, options.statistics->getTickerCount(BLOB_DB_CACHE_ADD));
+}
+#endif // !ROCKSDB_LITE
+
+TEST_F(DBBlobBasicTest, WarmCacheWithBlobsSecondary) {
+ CompressedSecondaryCacheOptions secondary_cache_opts;
+ secondary_cache_opts.capacity = 1 << 20;
+ secondary_cache_opts.num_shard_bits = 0;
+ secondary_cache_opts.metadata_charge_policy = kDontChargeCacheMetadata;
+ secondary_cache_opts.compression_type = kNoCompression;
+
+ LRUCacheOptions primary_cache_opts;
+ primary_cache_opts.capacity = 1024;
+ primary_cache_opts.num_shard_bits = 0;
+ primary_cache_opts.metadata_charge_policy = kDontChargeCacheMetadata;
+ primary_cache_opts.secondary_cache =
+ NewCompressedSecondaryCache(secondary_cache_opts);
+
+ Options options = GetDefaultOptions();
+ options.create_if_missing = true;
+ options.statistics = CreateDBStatistics();
+ options.enable_blob_files = true;
+ options.blob_cache = NewLRUCache(primary_cache_opts);
+ options.prepopulate_blob_cache = PrepopulateBlobCache::kFlushOnly;
+
+ DestroyAndReopen(options);
+
+ // Note: only one of the two blobs fits in the primary cache at any given time.
+ constexpr char first_key[] = "foo";
+ constexpr size_t first_blob_size = 512;
+ const std::string first_blob(first_blob_size, 'a');
+
+ constexpr char second_key[] = "bar";
+ constexpr size_t second_blob_size = 768;
+ const std::string second_blob(second_blob_size, 'b');
+
+ // First blob is inserted into primary cache during flush.
+ ASSERT_OK(Put(first_key, first_blob));
+ ASSERT_OK(Flush());
+ ASSERT_EQ(options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_ADD), 1);
+
+ // Second blob is inserted into primary cache during flush,
+ // First blob is evicted but only a dummy handle is inserted into secondary
+ // cache.
+ ASSERT_OK(Put(second_key, second_blob));
+ ASSERT_OK(Flush());
+ ASSERT_EQ(options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_ADD), 1);
+
+ // First blob is inserted into primary cache.
+ // Second blob is evicted but only a dummy handle is inserted into secondary
+ // cache.
+ ASSERT_EQ(Get(first_key), first_blob);
+ ASSERT_EQ(options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_MISS), 1);
+ ASSERT_EQ(options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_HIT), 0);
+ ASSERT_EQ(options.statistics->getAndResetTickerCount(SECONDARY_CACHE_HITS),
+ 0);
+ // Second blob is inserted into primary cache,
+ // First blob is evicted and is inserted into secondary cache.
+ ASSERT_EQ(Get(second_key), second_blob);
+ ASSERT_EQ(options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_MISS), 1);
+ ASSERT_EQ(options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_HIT), 0);
+ ASSERT_EQ(options.statistics->getAndResetTickerCount(SECONDARY_CACHE_HITS),
+ 0);
+
+ // First blob's dummy item is inserted into primary cache b/c of lookup.
+ // Second blob is still in primary cache.
+ ASSERT_EQ(Get(first_key), first_blob);
+ ASSERT_EQ(options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_MISS), 0);
+ ASSERT_EQ(options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_HIT), 1);
+ ASSERT_EQ(options.statistics->getAndResetTickerCount(SECONDARY_CACHE_HITS),
+ 1);
+
+ // First blob's item is inserted into primary cache b/c of lookup.
+ // Second blob is evicted and inserted into secondary cache.
+ ASSERT_EQ(Get(first_key), first_blob);
+ ASSERT_EQ(options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_MISS), 0);
+ ASSERT_EQ(options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_HIT), 1);
+ ASSERT_EQ(options.statistics->getAndResetTickerCount(SECONDARY_CACHE_HITS),
+ 1);
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ RegisterCustomObjects(argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/blob/db_blob_compaction_test.cc b/src/rocksdb/db/blob/db_blob_compaction_test.cc
new file mode 100644
index 000000000..f3fe3c03b
--- /dev/null
+++ b/src/rocksdb/db/blob/db_blob_compaction_test.cc
@@ -0,0 +1,913 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/blob/blob_index.h"
+#include "db/blob/blob_log_format.h"
+#include "db/db_test_util.h"
+#include "port/stack_trace.h"
+#include "test_util/sync_point.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBBlobCompactionTest : public DBTestBase {
+ public:
+ explicit DBBlobCompactionTest()
+ : DBTestBase("db_blob_compaction_test", /*env_do_fsync=*/false) {}
+
+#ifndef ROCKSDB_LITE
+ const std::vector<InternalStats::CompactionStats>& GetCompactionStats() {
+ VersionSet* const versions = dbfull()->GetVersionSet();
+ assert(versions);
+ assert(versions->GetColumnFamilySet());
+
+ ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault();
+ assert(cfd);
+
+ const InternalStats* const internal_stats = cfd->internal_stats();
+ assert(internal_stats);
+
+ return internal_stats->TEST_GetCompactionStats();
+ }
+#endif // ROCKSDB_LITE
+};
+
+namespace {
+
+class FilterByKeyLength : public CompactionFilter {
+ public:
+ explicit FilterByKeyLength(size_t len) : length_threshold_(len) {}
+ const char* Name() const override {
+ return "rocksdb.compaction.filter.by.key.length";
+ }
+ CompactionFilter::Decision FilterBlobByKey(
+ int /*level*/, const Slice& key, std::string* /*new_value*/,
+ std::string* /*skip_until*/) const override {
+ if (key.size() < length_threshold_) {
+ return CompactionFilter::Decision::kRemove;
+ }
+ return CompactionFilter::Decision::kKeep;
+ }
+
+ private:
+ size_t length_threshold_;
+};
+
+class FilterByValueLength : public CompactionFilter {
+ public:
+ explicit FilterByValueLength(size_t len) : length_threshold_(len) {}
+ const char* Name() const override {
+ return "rocksdb.compaction.filter.by.value.length";
+ }
+ CompactionFilter::Decision FilterV2(
+ int /*level*/, const Slice& /*key*/, ValueType /*value_type*/,
+ const Slice& existing_value, std::string* /*new_value*/,
+ std::string* /*skip_until*/) const override {
+ if (existing_value.size() < length_threshold_) {
+ return CompactionFilter::Decision::kRemove;
+ }
+ return CompactionFilter::Decision::kKeep;
+ }
+
+ private:
+ size_t length_threshold_;
+};
+
+class BadBlobCompactionFilter : public CompactionFilter {
+ public:
+ explicit BadBlobCompactionFilter(std::string prefix,
+ CompactionFilter::Decision filter_by_key,
+ CompactionFilter::Decision filter_v2)
+ : prefix_(std::move(prefix)),
+ filter_blob_by_key_(filter_by_key),
+ filter_v2_(filter_v2) {}
+ const char* Name() const override { return "rocksdb.compaction.filter.bad"; }
+ CompactionFilter::Decision FilterBlobByKey(
+ int /*level*/, const Slice& key, std::string* /*new_value*/,
+ std::string* /*skip_until*/) const override {
+ if (key.size() >= prefix_.size() &&
+ 0 == strncmp(prefix_.data(), key.data(), prefix_.size())) {
+ return CompactionFilter::Decision::kUndetermined;
+ }
+ return filter_blob_by_key_;
+ }
+ CompactionFilter::Decision FilterV2(
+ int /*level*/, const Slice& /*key*/, ValueType /*value_type*/,
+ const Slice& /*existing_value*/, std::string* /*new_value*/,
+ std::string* /*skip_until*/) const override {
+ return filter_v2_;
+ }
+
+ private:
+ const std::string prefix_;
+ const CompactionFilter::Decision filter_blob_by_key_;
+ const CompactionFilter::Decision filter_v2_;
+};
+
+class ValueBlindWriteFilter : public CompactionFilter {
+ public:
+ explicit ValueBlindWriteFilter(std::string new_val)
+ : new_value_(std::move(new_val)) {}
+ const char* Name() const override {
+ return "rocksdb.compaction.filter.blind.write";
+ }
+ CompactionFilter::Decision FilterBlobByKey(
+ int level, const Slice& key, std::string* new_value,
+ std::string* skip_until) const override;
+
+ private:
+ const std::string new_value_;
+};
+
+CompactionFilter::Decision ValueBlindWriteFilter::FilterBlobByKey(
+ int /*level*/, const Slice& /*key*/, std::string* new_value,
+ std::string* /*skip_until*/) const {
+ assert(new_value);
+ new_value->assign(new_value_);
+ return CompactionFilter::Decision::kChangeValue;
+}
+
+class ValueMutationFilter : public CompactionFilter {
+ public:
+ explicit ValueMutationFilter(std::string padding)
+ : padding_(std::move(padding)) {}
+ const char* Name() const override {
+ return "rocksdb.compaction.filter.value.mutation";
+ }
+ CompactionFilter::Decision FilterV2(int level, const Slice& key,
+ ValueType value_type,
+ const Slice& existing_value,
+ std::string* new_value,
+ std::string* skip_until) const override;
+
+ private:
+ const std::string padding_;
+};
+
+CompactionFilter::Decision ValueMutationFilter::FilterV2(
+ int /*level*/, const Slice& /*key*/, ValueType value_type,
+ const Slice& existing_value, std::string* new_value,
+ std::string* /*skip_until*/) const {
+ assert(CompactionFilter::ValueType::kBlobIndex != value_type);
+ if (CompactionFilter::ValueType::kValue != value_type) {
+ return CompactionFilter::Decision::kKeep;
+ }
+ assert(new_value);
+ new_value->assign(existing_value.data(), existing_value.size());
+ new_value->append(padding_);
+ return CompactionFilter::Decision::kChangeValue;
+}
+
+class AlwaysKeepFilter : public CompactionFilter {
+ public:
+ explicit AlwaysKeepFilter() = default;
+ const char* Name() const override {
+ return "rocksdb.compaction.filter.always.keep";
+ }
+ CompactionFilter::Decision FilterV2(
+ int /*level*/, const Slice& /*key*/, ValueType /*value_type*/,
+ const Slice& /*existing_value*/, std::string* /*new_value*/,
+ std::string* /*skip_until*/) const override {
+ return CompactionFilter::Decision::kKeep;
+ }
+};
+
+class SkipUntilFilter : public CompactionFilter {
+ public:
+ explicit SkipUntilFilter(std::string skip_until)
+ : skip_until_(std::move(skip_until)) {}
+
+ const char* Name() const override {
+ return "rocksdb.compaction.filter.skip.until";
+ }
+
+ CompactionFilter::Decision FilterV2(int /* level */, const Slice& /* key */,
+ ValueType /* value_type */,
+ const Slice& /* existing_value */,
+ std::string* /* new_value */,
+ std::string* skip_until) const override {
+ assert(skip_until);
+ *skip_until = skip_until_;
+
+ return CompactionFilter::Decision::kRemoveAndSkipUntil;
+ }
+
+ private:
+ std::string skip_until_;
+};
+
+} // anonymous namespace
+
+class DBBlobBadCompactionFilterTest
+ : public DBBlobCompactionTest,
+ public testing::WithParamInterface<
+ std::tuple<std::string, CompactionFilter::Decision,
+ CompactionFilter::Decision>> {
+ public:
+ explicit DBBlobBadCompactionFilterTest()
+ : compaction_filter_guard_(new BadBlobCompactionFilter(
+ std::get<0>(GetParam()), std::get<1>(GetParam()),
+ std::get<2>(GetParam()))) {}
+
+ protected:
+ std::unique_ptr<CompactionFilter> compaction_filter_guard_;
+};
+
+INSTANTIATE_TEST_CASE_P(
+ BadCompactionFilter, DBBlobBadCompactionFilterTest,
+ testing::Combine(
+ testing::Values("a"),
+ testing::Values(CompactionFilter::Decision::kChangeBlobIndex,
+ CompactionFilter::Decision::kIOError),
+ testing::Values(CompactionFilter::Decision::kUndetermined,
+ CompactionFilter::Decision::kChangeBlobIndex,
+ CompactionFilter::Decision::kIOError)));
+
+TEST_F(DBBlobCompactionTest, FilterByKeyLength) {
+ Options options = GetDefaultOptions();
+ options.enable_blob_files = true;
+ options.min_blob_size = 0;
+ options.create_if_missing = true;
+ constexpr size_t kKeyLength = 2;
+ std::unique_ptr<CompactionFilter> compaction_filter_guard(
+ new FilterByKeyLength(kKeyLength));
+ options.compaction_filter = compaction_filter_guard.get();
+
+ constexpr char short_key[] = "a";
+ constexpr char long_key[] = "abc";
+ constexpr char blob_value[] = "value";
+
+ DestroyAndReopen(options);
+ ASSERT_OK(Put(short_key, blob_value));
+ ASSERT_OK(Put(long_key, blob_value));
+ ASSERT_OK(Flush());
+ CompactRangeOptions cro;
+ ASSERT_OK(db_->CompactRange(cro, /*begin=*/nullptr, /*end=*/nullptr));
+ std::string value;
+ ASSERT_TRUE(db_->Get(ReadOptions(), short_key, &value).IsNotFound());
+ value.clear();
+ ASSERT_OK(db_->Get(ReadOptions(), long_key, &value));
+ ASSERT_EQ("value", value);
+
+#ifndef ROCKSDB_LITE
+ const auto& compaction_stats = GetCompactionStats();
+ ASSERT_GE(compaction_stats.size(), 2);
+
+ // Filter decides between kKeep and kRemove solely based on key;
+ // this involves neither reading nor writing blobs
+ ASSERT_EQ(compaction_stats[1].bytes_read_blob, 0);
+ ASSERT_EQ(compaction_stats[1].bytes_written_blob, 0);
+#endif // ROCKSDB_LITE
+
+ Close();
+}
+
+TEST_F(DBBlobCompactionTest, FilterByValueLength) {
+ Options options = GetDefaultOptions();
+ options.enable_blob_files = true;
+ options.min_blob_size = 5;
+ options.create_if_missing = true;
+ constexpr size_t kValueLength = 5;
+ std::unique_ptr<CompactionFilter> compaction_filter_guard(
+ new FilterByValueLength(kValueLength));
+ options.compaction_filter = compaction_filter_guard.get();
+
+ const std::vector<std::string> short_value_keys = {"a", "e", "j"};
+ constexpr char short_value[] = "val";
+ const std::vector<std::string> long_value_keys = {"b", "f", "k"};
+ constexpr char long_value[] = "valuevalue";
+
+ DestroyAndReopen(options);
+ for (size_t i = 0; i < short_value_keys.size(); ++i) {
+ ASSERT_OK(Put(short_value_keys[i], short_value));
+ }
+ for (size_t i = 0; i < long_value_keys.size(); ++i) {
+ ASSERT_OK(Put(long_value_keys[i], long_value));
+ }
+ ASSERT_OK(Flush());
+ CompactRangeOptions cro;
+ ASSERT_OK(db_->CompactRange(cro, /*begin=*/nullptr, /*end=*/nullptr));
+ std::string value;
+ for (size_t i = 0; i < short_value_keys.size(); ++i) {
+ ASSERT_TRUE(
+ db_->Get(ReadOptions(), short_value_keys[i], &value).IsNotFound());
+ value.clear();
+ }
+ for (size_t i = 0; i < long_value_keys.size(); ++i) {
+ ASSERT_OK(db_->Get(ReadOptions(), long_value_keys[i], &value));
+ ASSERT_EQ(long_value, value);
+ }
+
+#ifndef ROCKSDB_LITE
+ const auto& compaction_stats = GetCompactionStats();
+ ASSERT_GE(compaction_stats.size(), 2);
+
+ // Filter decides between kKeep and kRemove based on value;
+ // this involves reading but not writing blobs
+ ASSERT_GT(compaction_stats[1].bytes_read_blob, 0);
+ ASSERT_EQ(compaction_stats[1].bytes_written_blob, 0);
+#endif // ROCKSDB_LITE
+
+ Close();
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBBlobCompactionTest, BlobCompactWithStartingLevel) {
+ Options options = GetDefaultOptions();
+
+ options.enable_blob_files = true;
+ options.min_blob_size = 1000;
+ options.blob_file_starting_level = 5;
+ options.create_if_missing = true;
+
+ // Open DB with fixed-prefix sst-partitioner so that compaction will cut
+ // new table file when encountering a new key whose 1-byte prefix changes.
+ constexpr size_t key_len = 1;
+ options.sst_partitioner_factory =
+ NewSstPartitionerFixedPrefixFactory(key_len);
+
+ ASSERT_OK(TryReopen(options));
+
+ constexpr size_t blob_size = 3000;
+
+ constexpr char first_key[] = "a";
+ const std::string first_blob(blob_size, 'a');
+ ASSERT_OK(Put(first_key, first_blob));
+
+ constexpr char second_key[] = "b";
+ const std::string second_blob(2 * blob_size, 'b');
+ ASSERT_OK(Put(second_key, second_blob));
+
+ constexpr char third_key[] = "d";
+ const std::string third_blob(blob_size, 'd');
+ ASSERT_OK(Put(third_key, third_blob));
+
+ ASSERT_OK(Flush());
+
+ constexpr char fourth_key[] = "c";
+ const std::string fourth_blob(blob_size, 'c');
+ ASSERT_OK(Put(fourth_key, fourth_blob));
+
+ ASSERT_OK(Flush());
+
+ ASSERT_EQ(0, GetBlobFileNumbers().size());
+ ASSERT_EQ(2, NumTableFilesAtLevel(/*level=*/0));
+ ASSERT_EQ(0, NumTableFilesAtLevel(/*level=*/1));
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr,
+ /*end=*/nullptr));
+
+ // No blob file should be created since blob_file_starting_level is 5.
+ ASSERT_EQ(0, GetBlobFileNumbers().size());
+ ASSERT_EQ(0, NumTableFilesAtLevel(/*level=*/0));
+ ASSERT_EQ(4, NumTableFilesAtLevel(/*level=*/1));
+
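+ // Lower blob_file_starting_level to 1 and repeat; since the compaction's
+ // output level now reaches the starting level, a blob file is written.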
+ {
+ options.blob_file_starting_level = 1;
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put(first_key, first_blob));
+ ASSERT_OK(Put(second_key, second_blob));
+ ASSERT_OK(Put(third_key, third_blob));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put(fourth_key, fourth_blob));
+ ASSERT_OK(Flush());
+
+ ASSERT_EQ(0, GetBlobFileNumbers().size());
+ ASSERT_EQ(2, NumTableFilesAtLevel(/*level=*/0));
+ ASSERT_EQ(0, NumTableFilesAtLevel(/*level=*/1));
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr,
+ /*end=*/nullptr));
+ // The compaction's output level equals blob_file_starting_level.
+ ASSERT_EQ(1, GetBlobFileNumbers().size());
+ ASSERT_EQ(0, NumTableFilesAtLevel(/*level=*/0));
+ ASSERT_EQ(4, NumTableFilesAtLevel(/*level=*/1));
+ }
+
+ Close();
+}
+#endif
+
+TEST_F(DBBlobCompactionTest, BlindWriteFilter) {
+ Options options = GetDefaultOptions();
+ options.enable_blob_files = true;
+ options.min_blob_size = 0;
+ options.create_if_missing = true;
+ constexpr char new_blob_value[] = "new_blob_value";
+ std::unique_ptr<CompactionFilter> compaction_filter_guard(
+ new ValueBlindWriteFilter(new_blob_value));
+ options.compaction_filter = compaction_filter_guard.get();
+ DestroyAndReopen(options);
+ const std::vector<std::string> keys = {"a", "b", "c"};
+ const std::vector<std::string> values = {"a_value", "b_value", "c_value"};
+ assert(keys.size() == values.size());
+ for (size_t i = 0; i < keys.size(); ++i) {
+ ASSERT_OK(Put(keys[i], values[i]));
+ }
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr,
+ /*end=*/nullptr));
+ for (const auto& key : keys) {
+ ASSERT_EQ(new_blob_value, Get(key));
+ }
+
+#ifndef ROCKSDB_LITE
+ const auto& compaction_stats = GetCompactionStats();
+ ASSERT_GE(compaction_stats.size(), 2);
+
+ // Filter unconditionally changes value in FilterBlobByKey;
+ // this involves writing but not reading blobs
+ ASSERT_EQ(compaction_stats[1].bytes_read_blob, 0);
+ ASSERT_GT(compaction_stats[1].bytes_written_blob, 0);
+#endif // ROCKSDB_LITE
+
+ Close();
+}
+
+TEST_F(DBBlobCompactionTest, SkipUntilFilter) {
+ Options options = GetDefaultOptions();
+ options.enable_blob_files = true;
+
+ std::unique_ptr<CompactionFilter> compaction_filter_guard(
+ new SkipUntilFilter("z"));
+ options.compaction_filter = compaction_filter_guard.get();
+
+ Reopen(options);
+
+ const std::vector<std::string> keys{"a", "b", "c"};
+ const std::vector<std::string> values{"a_value", "b_value", "c_value"};
+ assert(keys.size() == values.size());
+
+ for (size_t i = 0; i < keys.size(); ++i) {
+ ASSERT_OK(Put(keys[i], values[i]));
+ }
+
+ ASSERT_OK(Flush());
+
+ int process_in_flow_called = 0;
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "BlobCountingIterator::UpdateAndCountBlobIfNeeded:ProcessInFlow",
+ [&process_in_flow_called](void* /* arg */) { ++process_in_flow_called; });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), /* begin */ nullptr,
+ /* end */ nullptr));
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+
+ for (const auto& key : keys) {
+ ASSERT_EQ(Get(key), "NOT_FOUND");
+ }
+
+ // Make sure SkipUntil was performed using iteration rather than Seek
+ ASSERT_EQ(process_in_flow_called, keys.size());
+
+ Close();
+}
+
+TEST_P(DBBlobBadCompactionFilterTest, BadDecisionFromCompactionFilter) {
+ Options options = GetDefaultOptions();
+ options.enable_blob_files = true;
+ options.min_blob_size = 0;
+ options.create_if_missing = true;
+ options.compaction_filter = compaction_filter_guard_.get();
+ DestroyAndReopen(options);
+ ASSERT_OK(Put("b", "value"));
+ ASSERT_OK(Flush());
+ ASSERT_TRUE(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr,
+ /*end=*/nullptr)
+ .IsNotSupported());
+ Close();
+
+ DestroyAndReopen(options);
+ std::string key(std::get<0>(GetParam()));
+ ASSERT_OK(Put(key, "value"));
+ ASSERT_OK(Flush());
+ ASSERT_TRUE(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr,
+ /*end=*/nullptr)
+ .IsNotSupported());
+ Close();
+}
+
+TEST_F(DBBlobCompactionTest, CompactionFilter_InlinedTTLIndex) {
+ Options options = GetDefaultOptions();
+ options.create_if_missing = true;
+ options.enable_blob_files = true;
+ options.min_blob_size = 0;
+ std::unique_ptr<CompactionFilter> compaction_filter_guard(
+ new ValueMutationFilter(""));
+ options.compaction_filter = compaction_filter_guard.get();
+ DestroyAndReopen(options);
+ constexpr char key[] = "key";
+ constexpr char blob[] = "blob";
+ // Fake an inlined TTL blob index.
+ std::string blob_index;
+ constexpr uint64_t expiration = 1234567890;
+ BlobIndex::EncodeInlinedTTL(&blob_index, expiration, blob);
+ WriteBatch batch;
+ ASSERT_OK(WriteBatchInternal::PutBlobIndex(&batch, 0, key, blob_index));
+ ASSERT_OK(db_->Write(WriteOptions(), &batch));
+ ASSERT_OK(Flush());
+ ASSERT_TRUE(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr,
+ /*end=*/nullptr)
+ .IsCorruption());
+ Close();
+}
+
+TEST_F(DBBlobCompactionTest, CompactionFilter) {
+ Options options = GetDefaultOptions();
+ options.create_if_missing = true;
+ options.enable_blob_files = true;
+ options.min_blob_size = 0;
+ constexpr char padding[] = "_delta";
+ std::unique_ptr<CompactionFilter> compaction_filter_guard(
+ new ValueMutationFilter(padding));
+ options.compaction_filter = compaction_filter_guard.get();
+ DestroyAndReopen(options);
+ const std::vector<std::pair<std::string, std::string>> kvs = {
+ {"a", "a_value"}, {"b", "b_value"}, {"c", "c_value"}};
+ for (const auto& kv : kvs) {
+ ASSERT_OK(Put(kv.first, kv.second));
+ }
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr,
+ /*end=*/nullptr));
+ for (const auto& kv : kvs) {
+ ASSERT_EQ(kv.second + std::string(padding), Get(kv.first));
+ }
+
+#ifndef ROCKSDB_LITE
+ const auto& compaction_stats = GetCompactionStats();
+ ASSERT_GE(compaction_stats.size(), 2);
+
+ // Filter changes the value using the previous value in FilterV2;
+ // this involves reading and writing blobs
+ ASSERT_GT(compaction_stats[1].bytes_read_blob, 0);
+ ASSERT_GT(compaction_stats[1].bytes_written_blob, 0);
+#endif // ROCKSDB_LITE
+
+ Close();
+}
+
+TEST_F(DBBlobCompactionTest, CorruptedBlobIndex) {
+ Options options = GetDefaultOptions();
+ options.create_if_missing = true;
+ options.enable_blob_files = true;
+ options.min_blob_size = 0;
+ std::unique_ptr<CompactionFilter> compaction_filter_guard(
+ new ValueMutationFilter(""));
+ options.compaction_filter = compaction_filter_guard.get();
+ DestroyAndReopen(options);
+
+ constexpr char key[] = "key";
+ constexpr char blob[] = "blob";
+
+ ASSERT_OK(Put(key, blob));
+ ASSERT_OK(Flush());
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "CompactionIterator::InvokeFilterIfNeeded::TamperWithBlobIndex",
+ [](void* arg) {
+ Slice* const blob_index = static_cast<Slice*>(arg);
+ assert(blob_index);
+ assert(!blob_index->empty());
+ blob_index->remove_prefix(1);
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_TRUE(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr,
+ /*end=*/nullptr)
+ .IsCorruption());
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+
+ Close();
+}
+
+TEST_F(DBBlobCompactionTest, CompactionFilterReadBlobAndKeep) {
+ Options options = GetDefaultOptions();
+ options.create_if_missing = true;
+ options.enable_blob_files = true;
+ options.min_blob_size = 0;
+ std::unique_ptr<CompactionFilter> compaction_filter_guard(
+ new AlwaysKeepFilter());
+ options.compaction_filter = compaction_filter_guard.get();
+ DestroyAndReopen(options);
+ ASSERT_OK(Put("foo", "foo_value"));
+ ASSERT_OK(Flush());
+ std::vector<uint64_t> blob_files = GetBlobFileNumbers();
+ ASSERT_EQ(1, blob_files.size());
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr,
+ /*end=*/nullptr));
+ ASSERT_EQ(blob_files, GetBlobFileNumbers());
+
+#ifndef ROCKSDB_LITE
+ const auto& compaction_stats = GetCompactionStats();
+ ASSERT_GE(compaction_stats.size(), 2);
+
+ // Filter decides to keep the existing value in FilterV2;
+ // this involves reading but not writing blobs
+ ASSERT_GT(compaction_stats[1].bytes_read_blob, 0);
+ ASSERT_EQ(compaction_stats[1].bytes_written_blob, 0);
+#endif // ROCKSDB_LITE
+
+ Close();
+}
+
+TEST_F(DBBlobCompactionTest, TrackGarbage) {
+ Options options = GetDefaultOptions();
+ options.enable_blob_files = true;
+
+ Reopen(options);
+
+ // First table+blob file pair: 4 blobs with different keys
+ constexpr char first_key[] = "first_key";
+ constexpr char first_value[] = "first_value";
+ constexpr char second_key[] = "second_key";
+ constexpr char second_value[] = "second_value";
+ constexpr char third_key[] = "third_key";
+ constexpr char third_value[] = "third_value";
+ constexpr char fourth_key[] = "fourth_key";
+ constexpr char fourth_value[] = "fourth_value";
+
+ ASSERT_OK(Put(first_key, first_value));
+ ASSERT_OK(Put(second_key, second_value));
+ ASSERT_OK(Put(third_key, third_value));
+ ASSERT_OK(Put(fourth_key, fourth_value));
+ ASSERT_OK(Flush());
+
+ // Second table+blob file pair: overwrite 2 existing keys
+ constexpr char new_first_value[] = "new_first_value";
+ constexpr char new_second_value[] = "new_second_value";
+
+ ASSERT_OK(Put(first_key, new_first_value));
+ ASSERT_OK(Put(second_key, new_second_value));
+ ASSERT_OK(Flush());
+
+ // Compact them together. The first blob file should have 2 garbage blobs
+ // corresponding to the 2 overwritten keys.
+ constexpr Slice* begin = nullptr;
+ constexpr Slice* end = nullptr;
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), begin, end));
+
+ VersionSet* const versions = dbfull()->GetVersionSet();
+ assert(versions);
+ assert(versions->GetColumnFamilySet());
+
+ ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault();
+ assert(cfd);
+
+ Version* const current = cfd->current();
+ assert(current);
+
+ const VersionStorageInfo* const storage_info = current->storage_info();
+ assert(storage_info);
+
+ const auto& blob_files = storage_info->GetBlobFiles();
+ ASSERT_EQ(blob_files.size(), 2);
+
+ {
+ const auto& meta = blob_files.front();
+ assert(meta);
+
+ constexpr uint64_t first_expected_bytes =
+ sizeof(first_value) - 1 +
+ BlobLogRecord::CalculateAdjustmentForRecordHeader(sizeof(first_key) -
+ 1);
+ constexpr uint64_t second_expected_bytes =
+ sizeof(second_value) - 1 +
+ BlobLogRecord::CalculateAdjustmentForRecordHeader(sizeof(second_key) -
+ 1);
+ constexpr uint64_t third_expected_bytes =
+ sizeof(third_value) - 1 +
+ BlobLogRecord::CalculateAdjustmentForRecordHeader(sizeof(third_key) -
+ 1);
+ constexpr uint64_t fourth_expected_bytes =
+ sizeof(fourth_value) - 1 +
+ BlobLogRecord::CalculateAdjustmentForRecordHeader(sizeof(fourth_key) -
+ 1);
+
+ ASSERT_EQ(meta->GetTotalBlobCount(), 4);
+ ASSERT_EQ(meta->GetTotalBlobBytes(),
+ first_expected_bytes + second_expected_bytes +
+ third_expected_bytes + fourth_expected_bytes);
+ ASSERT_EQ(meta->GetGarbageBlobCount(), 2);
+ ASSERT_EQ(meta->GetGarbageBlobBytes(),
+ first_expected_bytes + second_expected_bytes);
+ }
+
+ {
+ const auto& meta = blob_files.back();
+ assert(meta);
+
+ constexpr uint64_t new_first_expected_bytes =
+ sizeof(new_first_value) - 1 +
+ BlobLogRecord::CalculateAdjustmentForRecordHeader(sizeof(first_key) -
+ 1);
+ constexpr uint64_t new_second_expected_bytes =
+ sizeof(new_second_value) - 1 +
+ BlobLogRecord::CalculateAdjustmentForRecordHeader(sizeof(second_key) -
+ 1);
+
+ ASSERT_EQ(meta->GetTotalBlobCount(), 2);
+ ASSERT_EQ(meta->GetTotalBlobBytes(),
+ new_first_expected_bytes + new_second_expected_bytes);
+ ASSERT_EQ(meta->GetGarbageBlobCount(), 0);
+ ASSERT_EQ(meta->GetGarbageBlobBytes(), 0);
+ }
+}
+
+TEST_F(DBBlobCompactionTest, MergeBlobWithBase) {
+ Options options = GetDefaultOptions();
+ options.enable_blob_files = true;
+ options.min_blob_size = 0;
+ options.merge_operator = MergeOperators::CreateStringAppendOperator();
+ options.disable_auto_compactions = true;
+
+ Reopen(options);
+ ASSERT_OK(Put("Key1", "v1_1"));
+ ASSERT_OK(Put("Key2", "v2_1"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Merge("Key1", "v1_2"));
+ ASSERT_OK(Merge("Key2", "v2_2"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Merge("Key1", "v1_3"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr,
+ /*end=*/nullptr));
+ ASSERT_EQ(Get("Key1"), "v1_1,v1_2,v1_3");
+ ASSERT_EQ(Get("Key2"), "v2_1,v2_2");
+ Close();
+}
+
+TEST_F(DBBlobCompactionTest, CompactionReadaheadGarbageCollection) {
+ Options options = GetDefaultOptions();
+ options.enable_blob_files = true;
+ options.min_blob_size = 0;
+ options.enable_blob_garbage_collection = true;
+ options.blob_garbage_collection_age_cutoff = 1.0;
+ options.blob_compaction_readahead_size = 1 << 10;
+ options.disable_auto_compactions = true;
+
+ Reopen(options);
+
+ ASSERT_OK(Put("key", "lime"));
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put("key", "pie"));
+ ASSERT_OK(Put("foo", "baz"));
+ ASSERT_OK(Flush());
+
+ size_t num_non_prefetch_reads = 0;
+ SyncPoint::GetInstance()->SetCallBack(
+ "BlobFileReader::GetBlob:ReadFromFile",
+ [&num_non_prefetch_reads](void* /* arg */) { ++num_non_prefetch_reads; });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ constexpr Slice* begin = nullptr;
+ constexpr Slice* end = nullptr;
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), begin, end));
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+
+ ASSERT_EQ(Get("key"), "pie");
+ ASSERT_EQ(Get("foo"), "baz");
+ ASSERT_EQ(num_non_prefetch_reads, 0);
+
+ Close();
+}
+
+TEST_F(DBBlobCompactionTest, CompactionReadaheadFilter) {
+ Options options = GetDefaultOptions();
+
+ std::unique_ptr<CompactionFilter> compaction_filter_guard(
+ new ValueMutationFilter("pie"));
+
+ options.compaction_filter = compaction_filter_guard.get();
+ options.enable_blob_files = true;
+ options.min_blob_size = 0;
+ options.blob_compaction_readahead_size = 1 << 10;
+ options.disable_auto_compactions = true;
+
+ Reopen(options);
+
+ ASSERT_OK(Put("key", "lime"));
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Flush());
+
+ size_t num_non_prefetch_reads = 0;
+ SyncPoint::GetInstance()->SetCallBack(
+ "BlobFileReader::GetBlob:ReadFromFile",
+ [&num_non_prefetch_reads](void* /* arg */) { ++num_non_prefetch_reads; });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ constexpr Slice* begin = nullptr;
+ constexpr Slice* end = nullptr;
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), begin, end));
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+
+ ASSERT_EQ(Get("key"), "limepie");
+ ASSERT_EQ(Get("foo"), "barpie");
+ ASSERT_EQ(num_non_prefetch_reads, 0);
+
+ Close();
+}
+
+TEST_F(DBBlobCompactionTest, CompactionReadaheadMerge) {
+ Options options = GetDefaultOptions();
+ options.enable_blob_files = true;
+ options.min_blob_size = 0;
+ options.blob_compaction_readahead_size = 1 << 10;
+ options.merge_operator = MergeOperators::CreateStringAppendOperator();
+ options.disable_auto_compactions = true;
+
+ Reopen(options);
+
+ ASSERT_OK(Put("key", "lime"));
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Merge("key", "pie"));
+ ASSERT_OK(Merge("foo", "baz"));
+ ASSERT_OK(Flush());
+
+ size_t num_non_prefetch_reads = 0;
+ SyncPoint::GetInstance()->SetCallBack(
+ "BlobFileReader::GetBlob:ReadFromFile",
+ [&num_non_prefetch_reads](void* /* arg */) { ++num_non_prefetch_reads; });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ constexpr Slice* begin = nullptr;
+ constexpr Slice* end = nullptr;
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), begin, end));
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+
+ ASSERT_EQ(Get("key"), "lime,pie");
+ ASSERT_EQ(Get("foo"), "bar,baz");
+ ASSERT_EQ(num_non_prefetch_reads, 0);
+
+ Close();
+}
+
+TEST_F(DBBlobCompactionTest, CompactionDoNotFillCache) {
+ Options options = GetDefaultOptions();
+
+ options.enable_blob_files = true;
+ options.min_blob_size = 0;
+ options.enable_blob_garbage_collection = true;
+ options.blob_garbage_collection_age_cutoff = 1.0;
+ options.disable_auto_compactions = true;
+ options.statistics = CreateDBStatistics();
+
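+  // Use a dedicated blob cache so the test can verify below that compaction
+  // does not insert anything into it (see the BLOB_DB_CACHE_ADD check).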
+ LRUCacheOptions cache_options;
+ cache_options.capacity = 1 << 20;
+ cache_options.metadata_charge_policy = kDontChargeCacheMetadata;
+
+ options.blob_cache = NewLRUCache(cache_options);
+
+ Reopen(options);
+
+ ASSERT_OK(Put("key", "lime"));
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put("key", "pie"));
+ ASSERT_OK(Put("foo", "baz"));
+ ASSERT_OK(Flush());
+
+ constexpr Slice* begin = nullptr;
+ constexpr Slice* end = nullptr;
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), begin, end));
+
+ ASSERT_EQ(options.statistics->getTickerCount(BLOB_DB_CACHE_ADD), 0);
+
+ Close();
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ RegisterCustomObjects(argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/blob/db_blob_corruption_test.cc b/src/rocksdb/db/blob/db_blob_corruption_test.cc
new file mode 100644
index 000000000..7ac7ce3fc
--- /dev/null
+++ b/src/rocksdb/db/blob/db_blob_corruption_test.cc
@@ -0,0 +1,82 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/db_test_util.h"
+#include "port/stack_trace.h"
+#include "test_util/sync_point.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBBlobCorruptionTest : public DBTestBase {
+ protected:
+ DBBlobCorruptionTest()
+ : DBTestBase("db_blob_corruption_test", /* env_do_fsync */ false) {}
+
+ void Corrupt(FileType filetype, int offset, int bytes_to_corrupt) {
+ // Pick file to corrupt
+ std::vector<std::string> filenames;
+ ASSERT_OK(env_->GetChildren(dbname_, &filenames));
+ uint64_t number;
+ FileType type;
+ std::string fname;
+ uint64_t picked_number = kInvalidBlobFileNumber;
+ for (size_t i = 0; i < filenames.size(); i++) {
+ if (ParseFileName(filenames[i], &number, &type) && type == filetype &&
+ number > picked_number) { // Pick latest file
+ fname = dbname_ + "/" + filenames[i];
+ picked_number = number;
+ }
+ }
+ ASSERT_TRUE(!fname.empty()) << filetype;
+ ASSERT_OK(test::CorruptFile(env_, fname, offset, bytes_to_corrupt));
+ }
+};
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBBlobCorruptionTest, VerifyWholeBlobFileChecksum) {
+ Options options = GetDefaultOptions();
+ options.enable_blob_files = true;
+ options.min_blob_size = 0;
+ options.create_if_missing = true;
+ options.file_checksum_gen_factory =
+ ROCKSDB_NAMESPACE::GetFileChecksumGenCrc32cFactory();
+ Reopen(options);
+
+ ASSERT_OK(Put(Slice("key_1"), Slice("blob_value_1")));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put(Slice("key_2"), Slice("blob_value_2")));
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->VerifyFileChecksums(ReadOptions()));
+ Close();
+
+ Corrupt(kBlobFile, 0, 2);
+
+ ASSERT_OK(TryReopen(options));
+
+ int count{0};
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::VerifyFullFileChecksum:mismatch", [&](void* arg) {
+ const Status* s = static_cast<Status*>(arg);
+ ASSERT_NE(s, nullptr);
+ ++count;
+ ASSERT_NOK(*s);
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_TRUE(db_->VerifyFileChecksums(ReadOptions()).IsCorruption());
+ ASSERT_EQ(1, count);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+#endif // !ROCKSDB_LITE
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ RegisterCustomObjects(argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/blob/db_blob_index_test.cc b/src/rocksdb/db/blob/db_blob_index_test.cc
new file mode 100644
index 000000000..64c550894
--- /dev/null
+++ b/src/rocksdb/db/blob/db_blob_index_test.cc
@@ -0,0 +1,602 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <functional>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "db/arena_wrapped_db_iter.h"
+#include "db/blob/blob_index.h"
+#include "db/column_family.h"
+#include "db/db_iter.h"
+#include "db/db_test_util.h"
+#include "db/dbformat.h"
+#include "db/write_batch_internal.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "util/string_util.h"
+#include "utilities/merge_operators.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// kTypeBlobIndex is a value type used only by BlobDB. The base rocksdb
+// should accept this value type on writes, and report the value as not
+// supported on reads unless the caller explicitly requests it. The base
+// rocksdb does not understand the format of the actual blob index (the
+// value).
+class DBBlobIndexTest : public DBTestBase {
+ public:
+ enum Tier {
+ kMemtable = 0,
+ kImmutableMemtables = 1,
+ kL0SstFile = 2,
+ kLnSstFile = 3,
+ };
+ const std::vector<Tier> kAllTiers = {Tier::kMemtable,
+ Tier::kImmutableMemtables,
+ Tier::kL0SstFile, Tier::kLnSstFile};
+
+ DBBlobIndexTest() : DBTestBase("db_blob_index_test", /*env_do_fsync=*/true) {}
+
+ ColumnFamilyHandle* cfh() { return dbfull()->DefaultColumnFamily(); }
+
+ ColumnFamilyData* cfd() {
+ return static_cast_with_check<ColumnFamilyHandleImpl>(cfh())->cfd();
+ }
+
+ Status PutBlobIndex(WriteBatch* batch, const Slice& key,
+ const Slice& blob_index) {
+ return WriteBatchInternal::PutBlobIndex(batch, cfd()->GetID(), key,
+ blob_index);
+ }
+
+ Status Write(WriteBatch* batch) {
+ return dbfull()->Write(WriteOptions(), batch);
+ }
+
+ std::string GetImpl(const Slice& key, bool* is_blob_index = nullptr,
+ const Snapshot* snapshot = nullptr) {
+ ReadOptions read_options;
+ read_options.snapshot = snapshot;
+ PinnableSlice value;
+ DBImpl::GetImplOptions get_impl_options;
+ get_impl_options.column_family = cfh();
+ get_impl_options.value = &value;
+ get_impl_options.is_blob_index = is_blob_index;
+ auto s = dbfull()->GetImpl(read_options, key, get_impl_options);
+ if (s.IsNotFound()) {
+ return "NOT_FOUND";
+ }
+ if (s.IsCorruption()) {
+ return "CORRUPTION";
+ }
+ if (s.IsNotSupported()) {
+ return "NOT_SUPPORTED";
+ }
+ if (!s.ok()) {
+ return s.ToString();
+ }
+ return value.ToString();
+ }
+
+ std::string GetBlobIndex(const Slice& key,
+ const Snapshot* snapshot = nullptr) {
+ bool is_blob_index = false;
+ std::string value = GetImpl(key, &is_blob_index, snapshot);
+ if (!is_blob_index) {
+ return "NOT_BLOB";
+ }
+ return value;
+ }
+
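+  // Returns an iterator that exposes raw blob indexes (expose_blob_index)
+  // instead of resolving them.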
+ ArenaWrappedDBIter* GetBlobIterator() {
+ return dbfull()->NewIteratorImpl(
+ ReadOptions(), cfd(), dbfull()->GetLatestSequenceNumber(),
+ nullptr /*read_callback*/, true /*expose_blob_index*/);
+ }
+
+ Options GetTestOptions() {
+ Options options;
+ options.env = CurrentOptions().env;
+ options.create_if_missing = true;
+ options.num_levels = 2;
+ options.disable_auto_compactions = true;
+ // Disable auto flushes.
+ options.max_write_buffer_number = 10;
+ options.min_write_buffer_number_to_merge = 10;
+ options.merge_operator = MergeOperators::CreateStringAppendOperator();
+ return options;
+ }
+
+ void MoveDataTo(Tier tier) {
+ switch (tier) {
+ case Tier::kMemtable:
+ break;
+ case Tier::kImmutableMemtables:
+ ASSERT_OK(dbfull()->TEST_SwitchMemtable());
+ break;
+ case Tier::kL0SstFile:
+ ASSERT_OK(Flush());
+ break;
+ case Tier::kLnSstFile:
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("a", "dummy"));
+ ASSERT_OK(Put("z", "dummy"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(
+ dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+#ifndef ROCKSDB_LITE
+ ASSERT_EQ("0,1", FilesPerLevel());
+#endif // !ROCKSDB_LITE
+ break;
+ }
+ }
+};
+
+// Note: the following test case pertains to the StackableDB-based BlobDB
+// implementation. We should be able to write kTypeBlobIndex to memtables and
+// SST files.
+TEST_F(DBBlobIndexTest, Write) {
+ for (auto tier : kAllTiers) {
+ DestroyAndReopen(GetTestOptions());
+
+ std::vector<std::pair<std::string, std::string>> key_values;
+
+ constexpr size_t num_key_values = 5;
+
+ key_values.reserve(num_key_values);
+
+ for (size_t i = 1; i <= num_key_values; ++i) {
+ std::string key = "key" + std::to_string(i);
+
+ std::string blob_index;
+ BlobIndex::EncodeInlinedTTL(&blob_index, /* expiration */ 9876543210,
+ "blob" + std::to_string(i));
+
+ key_values.emplace_back(std::move(key), std::move(blob_index));
+ }
+
+ for (const auto& key_value : key_values) {
+ WriteBatch batch;
+ ASSERT_OK(PutBlobIndex(&batch, key_value.first, key_value.second));
+ ASSERT_OK(Write(&batch));
+ }
+
+ MoveDataTo(tier);
+
+ for (const auto& key_value : key_values) {
+ ASSERT_EQ(GetBlobIndex(key_value.first), key_value.second);
+ }
+ }
+}
+
+// Note: the following test case pertains to the StackableDB-based BlobDB
+// implementation. Get should be able to return the blob index if
+// is_blob_index is provided; otherwise it should return Status::NotSupported
+// (when reading from the memtable) or Status::Corruption (when reading from
+// an SST file). Reading from an SST file returns Corruption because we cannot
+// differentiate between an application accidentally opening the base DB of a
+// stacked BlobDB and actual corruption when using the integrated BlobDB.
+TEST_F(DBBlobIndexTest, Get) {
+ std::string blob_index;
+ BlobIndex::EncodeInlinedTTL(&blob_index, /* expiration */ 9876543210, "blob");
+
+ for (auto tier : kAllTiers) {
+ DestroyAndReopen(GetTestOptions());
+
+ WriteBatch batch;
+ ASSERT_OK(batch.Put("key", "value"));
+ ASSERT_OK(PutBlobIndex(&batch, "blob_key", blob_index));
+ ASSERT_OK(Write(&batch));
+
+ MoveDataTo(tier);
+
+ // Verify normal value
+ bool is_blob_index = false;
+ PinnableSlice value;
+ ASSERT_EQ("value", Get("key"));
+ ASSERT_EQ("value", GetImpl("key"));
+ ASSERT_EQ("value", GetImpl("key", &is_blob_index));
+ ASSERT_FALSE(is_blob_index);
+
+ // Verify blob index
+ if (tier <= kImmutableMemtables) {
+ ASSERT_TRUE(Get("blob_key", &value).IsNotSupported());
+ ASSERT_EQ("NOT_SUPPORTED", GetImpl("blob_key"));
+ } else {
+ ASSERT_TRUE(Get("blob_key", &value).IsCorruption());
+ ASSERT_EQ("CORRUPTION", GetImpl("blob_key"));
+ }
+ ASSERT_EQ(blob_index, GetImpl("blob_key", &is_blob_index));
+ ASSERT_TRUE(is_blob_index);
+ }
+}
+
+// Note: the following test case pertains to the StackableDB-based BlobDB
+// implementation. Get should NOT return Status::NotSupported or
+// Status::Corruption once the blob index has been overwritten with a normal
+// value. See the test case above for more details.
+TEST_F(DBBlobIndexTest, Updated) {
+ std::string blob_index;
+ BlobIndex::EncodeInlinedTTL(&blob_index, /* expiration */ 9876543210, "blob");
+
+ for (auto tier : kAllTiers) {
+ DestroyAndReopen(GetTestOptions());
+ WriteBatch batch;
+ for (int i = 0; i < 10; i++) {
+ ASSERT_OK(PutBlobIndex(&batch, "key" + std::to_string(i), blob_index));
+ }
+ ASSERT_OK(Write(&batch));
+    // Prevent the blob values from being purged.
+ const Snapshot* snapshot = dbfull()->GetSnapshot();
+ ASSERT_OK(Put("key1", "new_value"));
+ ASSERT_OK(Merge("key2", "a"));
+ ASSERT_OK(Merge("key2", "b"));
+ ASSERT_OK(Merge("key2", "c"));
+ ASSERT_OK(Delete("key3"));
+ ASSERT_OK(SingleDelete("key4"));
+ ASSERT_OK(Delete("key5"));
+ ASSERT_OK(Merge("key5", "a"));
+ ASSERT_OK(Merge("key5", "b"));
+ ASSERT_OK(Merge("key5", "c"));
+ ASSERT_OK(dbfull()->DeleteRange(WriteOptions(), cfh(), "key6", "key9"));
+ MoveDataTo(tier);
+ for (int i = 0; i < 10; i++) {
+ ASSERT_EQ(blob_index, GetBlobIndex("key" + std::to_string(i), snapshot));
+ }
+ ASSERT_EQ("new_value", Get("key1"));
+ if (tier <= kImmutableMemtables) {
+ ASSERT_EQ("NOT_SUPPORTED", GetImpl("key2"));
+ } else {
+ ASSERT_EQ("CORRUPTION", GetImpl("key2"));
+ }
+ ASSERT_EQ("NOT_FOUND", Get("key3"));
+ ASSERT_EQ("NOT_FOUND", Get("key4"));
+ ASSERT_EQ("a,b,c", GetImpl("key5"));
+ for (int i = 6; i < 9; i++) {
+ ASSERT_EQ("NOT_FOUND", Get("key" + std::to_string(i)));
+ }
+ ASSERT_EQ(blob_index, GetBlobIndex("key9"));
+ dbfull()->ReleaseSnapshot(snapshot);
+ }
+}
+
+// Note: the following test case pertains to the StackableDB-based BlobDB
+// implementation. When a blob iterator is used, it should set the
+// expose_blob_index flag for the underlying DBIter, and retrieve/return the
+// corresponding blob value. If a regular DBIter is created (i.e.
+// expose_blob_index is not set), it should return Status::Corruption.
+TEST_F(DBBlobIndexTest, Iterate) {
+ const std::vector<std::vector<ValueType>> data = {
+ /*00*/ {kTypeValue},
+ /*01*/ {kTypeBlobIndex},
+ /*02*/ {kTypeValue},
+ /*03*/ {kTypeBlobIndex, kTypeValue},
+ /*04*/ {kTypeValue},
+ /*05*/ {kTypeValue, kTypeBlobIndex},
+ /*06*/ {kTypeValue},
+ /*07*/ {kTypeDeletion, kTypeBlobIndex},
+ /*08*/ {kTypeValue},
+ /*09*/ {kTypeSingleDeletion, kTypeBlobIndex},
+ /*10*/ {kTypeValue},
+ /*11*/ {kTypeMerge, kTypeMerge, kTypeMerge, kTypeBlobIndex},
+ /*12*/ {kTypeValue},
+ /*13*/
+ {kTypeMerge, kTypeMerge, kTypeMerge, kTypeDeletion, kTypeBlobIndex},
+ /*14*/ {kTypeValue},
+ /*15*/ {kTypeBlobIndex},
+ /*16*/ {kTypeValue},
+ };
+
+ auto get_key = [](int index) {
+ char buf[20];
+ snprintf(buf, sizeof(buf), "%02d", index);
+ return "key" + std::string(buf);
+ };
+
+ auto get_value = [&](int index, int version) {
+ return get_key(index) + "_value" + std::to_string(version);
+ };
+
+ auto check_iterator = [&](Iterator* iterator, Status::Code expected_status,
+ const Slice& expected_value) {
+ ASSERT_EQ(expected_status, iterator->status().code());
+ if (expected_status == Status::kOk) {
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_EQ(expected_value, iterator->value());
+ } else {
+ ASSERT_FALSE(iterator->Valid());
+ }
+ };
+
+ auto create_normal_iterator = [&]() -> Iterator* {
+ return dbfull()->NewIterator(ReadOptions());
+ };
+
+ auto create_blob_iterator = [&]() -> Iterator* { return GetBlobIterator(); };
+
+ auto check_is_blob = [&](bool is_blob) {
+ return [is_blob](Iterator* iterator) {
+ ASSERT_EQ(is_blob,
+ reinterpret_cast<ArenaWrappedDBIter*>(iterator)->IsBlob());
+ };
+ };
+
+ auto verify = [&](int index, Status::Code expected_status,
+ const Slice& forward_value, const Slice& backward_value,
+ std::function<Iterator*()> create_iterator,
+ std::function<void(Iterator*)> extra_check = nullptr) {
+ // Seek
+ auto* iterator = create_iterator();
+ ASSERT_OK(iterator->status());
+ ASSERT_OK(iterator->Refresh());
+ iterator->Seek(get_key(index));
+ check_iterator(iterator, expected_status, forward_value);
+ if (extra_check) {
+ extra_check(iterator);
+ }
+ delete iterator;
+
+ // Next
+ iterator = create_iterator();
+ ASSERT_OK(iterator->Refresh());
+ iterator->Seek(get_key(index - 1));
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_OK(iterator->status());
+ iterator->Next();
+ check_iterator(iterator, expected_status, forward_value);
+ if (extra_check) {
+ extra_check(iterator);
+ }
+ delete iterator;
+
+ // SeekForPrev
+ iterator = create_iterator();
+ ASSERT_OK(iterator->status());
+ ASSERT_OK(iterator->Refresh());
+ iterator->SeekForPrev(get_key(index));
+ check_iterator(iterator, expected_status, backward_value);
+ if (extra_check) {
+ extra_check(iterator);
+ }
+ delete iterator;
+
+ // Prev
+ iterator = create_iterator();
+ iterator->Seek(get_key(index + 1));
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_OK(iterator->status());
+ iterator->Prev();
+ check_iterator(iterator, expected_status, backward_value);
+ if (extra_check) {
+ extra_check(iterator);
+ }
+ delete iterator;
+ };
+
+ for (auto tier : {Tier::kMemtable} /*kAllTiers*/) {
+    // Prevent the values from being purged.
+ std::vector<const Snapshot*> snapshots;
+ DestroyAndReopen(GetTestOptions());
+
+ // fill data
+ for (int i = 0; i < static_cast<int>(data.size()); i++) {
+ for (int j = static_cast<int>(data[i].size()) - 1; j >= 0; j--) {
+ std::string key = get_key(i);
+ std::string value = get_value(i, j);
+ WriteBatch batch;
+ switch (data[i][j]) {
+ case kTypeValue:
+ ASSERT_OK(Put(key, value));
+ break;
+ case kTypeDeletion:
+ ASSERT_OK(Delete(key));
+ break;
+ case kTypeSingleDeletion:
+ ASSERT_OK(SingleDelete(key));
+ break;
+ case kTypeMerge:
+ ASSERT_OK(Merge(key, value));
+ break;
+ case kTypeBlobIndex:
+ ASSERT_OK(PutBlobIndex(&batch, key, value));
+ ASSERT_OK(Write(&batch));
+ break;
+ default:
+ FAIL();
+      }
+ }
+ snapshots.push_back(dbfull()->GetSnapshot());
+ }
+ ASSERT_OK(
+ dbfull()->DeleteRange(WriteOptions(), cfh(), get_key(15), get_key(16)));
+ snapshots.push_back(dbfull()->GetSnapshot());
+ MoveDataTo(tier);
+
+ // Normal iterator
+ verify(1, Status::kCorruption, "", "", create_normal_iterator);
+ verify(3, Status::kCorruption, "", "", create_normal_iterator);
+ verify(5, Status::kOk, get_value(5, 0), get_value(5, 0),
+ create_normal_iterator);
+ verify(7, Status::kOk, get_value(8, 0), get_value(6, 0),
+ create_normal_iterator);
+ verify(9, Status::kOk, get_value(10, 0), get_value(8, 0),
+ create_normal_iterator);
+ verify(11, Status::kCorruption, "", "", create_normal_iterator);
+ verify(13, Status::kOk,
+ get_value(13, 2) + "," + get_value(13, 1) + "," + get_value(13, 0),
+ get_value(13, 2) + "," + get_value(13, 1) + "," + get_value(13, 0),
+ create_normal_iterator);
+ verify(15, Status::kOk, get_value(16, 0), get_value(14, 0),
+ create_normal_iterator);
+
+ // Iterator with blob support
+ verify(1, Status::kOk, get_value(1, 0), get_value(1, 0),
+ create_blob_iterator, check_is_blob(true));
+ verify(3, Status::kOk, get_value(3, 0), get_value(3, 0),
+ create_blob_iterator, check_is_blob(true));
+ verify(5, Status::kOk, get_value(5, 0), get_value(5, 0),
+ create_blob_iterator, check_is_blob(false));
+ verify(7, Status::kOk, get_value(8, 0), get_value(6, 0),
+ create_blob_iterator, check_is_blob(false));
+ verify(9, Status::kOk, get_value(10, 0), get_value(8, 0),
+ create_blob_iterator, check_is_blob(false));
+ if (tier <= kImmutableMemtables) {
+ verify(11, Status::kNotSupported, "", "", create_blob_iterator);
+ } else {
+ verify(11, Status::kCorruption, "", "", create_blob_iterator);
+ }
+ verify(13, Status::kOk,
+ get_value(13, 2) + "," + get_value(13, 1) + "," + get_value(13, 0),
+ get_value(13, 2) + "," + get_value(13, 1) + "," + get_value(13, 0),
+ create_blob_iterator, check_is_blob(false));
+ verify(15, Status::kOk, get_value(16, 0), get_value(14, 0),
+ create_blob_iterator, check_is_blob(false));
+
+#ifndef ROCKSDB_LITE
+ // Iterator with blob support and using seek.
+ ASSERT_OK(dbfull()->SetOptions(
+ cfh(), {{"max_sequential_skip_in_iterations", "0"}}));
+ verify(1, Status::kOk, get_value(1, 0), get_value(1, 0),
+ create_blob_iterator, check_is_blob(true));
+ verify(3, Status::kOk, get_value(3, 0), get_value(3, 0),
+ create_blob_iterator, check_is_blob(true));
+ verify(5, Status::kOk, get_value(5, 0), get_value(5, 0),
+ create_blob_iterator, check_is_blob(false));
+ verify(7, Status::kOk, get_value(8, 0), get_value(6, 0),
+ create_blob_iterator, check_is_blob(false));
+ verify(9, Status::kOk, get_value(10, 0), get_value(8, 0),
+ create_blob_iterator, check_is_blob(false));
+ if (tier <= kImmutableMemtables) {
+ verify(11, Status::kNotSupported, "", "", create_blob_iterator);
+ } else {
+ verify(11, Status::kCorruption, "", "", create_blob_iterator);
+ }
+ verify(13, Status::kOk,
+ get_value(13, 2) + "," + get_value(13, 1) + "," + get_value(13, 0),
+ get_value(13, 2) + "," + get_value(13, 1) + "," + get_value(13, 0),
+ create_blob_iterator, check_is_blob(false));
+ verify(15, Status::kOk, get_value(16, 0), get_value(14, 0),
+ create_blob_iterator, check_is_blob(false));
+#endif // !ROCKSDB_LITE
+
+ for (auto* snapshot : snapshots) {
+ dbfull()->ReleaseSnapshot(snapshot);
+ }
+ }
+}
+
+TEST_F(DBBlobIndexTest, IntegratedBlobIterate) {
+ const std::vector<std::vector<std::string>> data = {
+ /*00*/ {"Put"},
+ /*01*/ {"Put", "Merge", "Merge", "Merge"},
+ /*02*/ {"Put"}};
+
+ auto get_key = [](size_t index) { return ("key" + std::to_string(index)); };
+
+ auto get_value = [&](size_t index, size_t version) {
+ return get_key(index) + "_value" + std::to_string(version);
+ };
+
+ auto check_iterator = [&](Iterator* iterator, Status expected_status,
+ const Slice& expected_value) {
+ ASSERT_EQ(expected_status, iterator->status());
+ if (expected_status.ok()) {
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_EQ(expected_value, iterator->value());
+ } else {
+ ASSERT_FALSE(iterator->Valid());
+ }
+ };
+
+ auto verify = [&](size_t index, Status expected_status,
+ const Slice& expected_value) {
+ // Seek
+ {
+ Iterator* iterator = db_->NewIterator(ReadOptions());
+ std::unique_ptr<Iterator> iterator_guard(iterator);
+ ASSERT_OK(iterator->status());
+ ASSERT_OK(iterator->Refresh());
+ iterator->Seek(get_key(index));
+ check_iterator(iterator, expected_status, expected_value);
+ }
+ // Next
+ {
+ Iterator* iterator = db_->NewIterator(ReadOptions());
+ std::unique_ptr<Iterator> iterator_guard(iterator);
+ ASSERT_OK(iterator->Refresh());
+ iterator->Seek(get_key(index - 1));
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_OK(iterator->status());
+ iterator->Next();
+ check_iterator(iterator, expected_status, expected_value);
+ }
+ // SeekForPrev
+ {
+ Iterator* iterator = db_->NewIterator(ReadOptions());
+ std::unique_ptr<Iterator> iterator_guard(iterator);
+ ASSERT_OK(iterator->status());
+ ASSERT_OK(iterator->Refresh());
+ iterator->SeekForPrev(get_key(index));
+ check_iterator(iterator, expected_status, expected_value);
+ }
+ // Prev
+ {
+ Iterator* iterator = db_->NewIterator(ReadOptions());
+ std::unique_ptr<Iterator> iterator_guard(iterator);
+ iterator->Seek(get_key(index + 1));
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_OK(iterator->status());
+ iterator->Prev();
+ check_iterator(iterator, expected_status, expected_value);
+ }
+ };
+
+ Options options = GetTestOptions();
+ options.enable_blob_files = true;
+ options.min_blob_size = 0;
+
+ DestroyAndReopen(options);
+
+ // fill data
+ for (size_t i = 0; i < data.size(); i++) {
+ for (size_t j = 0; j < data[i].size(); j++) {
+ std::string key = get_key(i);
+ std::string value = get_value(i, j);
+ if (data[i][j] == "Put") {
+ ASSERT_OK(Put(key, value));
+ ASSERT_OK(Flush());
+ } else if (data[i][j] == "Merge") {
+ ASSERT_OK(Merge(key, value));
+ ASSERT_OK(Flush());
+ }
+ }
+ }
+
+ std::string expected_value = get_value(1, 0) + "," + get_value(1, 1) + "," +
+ get_value(1, 2) + "," + get_value(1, 3);
+ Status expected_status;
+ verify(1, expected_status, expected_value);
+
+#ifndef ROCKSDB_LITE
+ // Test DBIter::FindValueForCurrentKeyUsingSeek flow.
+ ASSERT_OK(dbfull()->SetOptions(cfh(),
+ {{"max_sequential_skip_in_iterations", "0"}}));
+ verify(1, expected_status, expected_value);
+#endif // !ROCKSDB_LITE
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ RegisterCustomObjects(argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/blob/prefetch_buffer_collection.cc b/src/rocksdb/db/blob/prefetch_buffer_collection.cc
new file mode 100644
index 000000000..079576f51
--- /dev/null
+++ b/src/rocksdb/db/blob/prefetch_buffer_collection.cc
@@ -0,0 +1,21 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/blob/prefetch_buffer_collection.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+FilePrefetchBuffer* PrefetchBufferCollection::GetOrCreatePrefetchBuffer(
+ uint64_t file_number) {
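+  // unordered_map::operator[] default-constructs a null entry on first
+  // access, so the buffer for a given file is created lazily exactly once.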
+ auto& prefetch_buffer = prefetch_buffers_[file_number];
+ if (!prefetch_buffer) {
+ prefetch_buffer.reset(
+ new FilePrefetchBuffer(readahead_size_, readahead_size_));
+ }
+
+ return prefetch_buffer.get();
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/prefetch_buffer_collection.h b/src/rocksdb/db/blob/prefetch_buffer_collection.h
new file mode 100644
index 000000000..b973eddc0
--- /dev/null
+++ b/src/rocksdb/db/blob/prefetch_buffer_collection.h
@@ -0,0 +1,38 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <cassert>
+#include <cstdint>
+#include <memory>
+#include <unordered_map>
+
+#include "file/file_prefetch_buffer.h"
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// A class that owns a collection of FilePrefetchBuffers using the file number
+// as key. Used for implementing compaction readahead for blob files. Designed
+// to be accessed by a single thread only: every (sub)compaction needs its own
+// buffers since they are guaranteed to read different blobs from different
+// positions even when reading the same file.
+class PrefetchBufferCollection {
+ public:
+ explicit PrefetchBufferCollection(uint64_t readahead_size)
+ : readahead_size_(readahead_size) {
+ assert(readahead_size_ > 0);
+ }
+
+ FilePrefetchBuffer* GetOrCreatePrefetchBuffer(uint64_t file_number);
+
+ private:
+ uint64_t readahead_size_;
+ std::unordered_map<uint64_t, std::unique_ptr<FilePrefetchBuffer>>
+ prefetch_buffers_; // maps file number to prefetch buffer
+};
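+
+// A minimal usage sketch, assuming one collection per (sub)compaction; the
+// helper name below is hypothetical and exists only to illustrate the call
+// pattern.
+inline FilePrefetchBuffer* ExampleGetBlobPrefetchBuffer(
+    PrefetchBufferCollection* buffers, uint64_t blob_file_number) {
+  assert(buffers);
+  // The first call for a given blob file number creates the buffer; later
+  // calls return the same instance.
+  return buffers->GetOrCreatePrefetchBuffer(blob_file_number);
+}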
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/builder.cc b/src/rocksdb/db/builder.cc
new file mode 100644
index 000000000..9283ffd64
--- /dev/null
+++ b/src/rocksdb/db/builder.cc
@@ -0,0 +1,434 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/builder.h"
+
+#include <algorithm>
+#include <deque>
+#include <vector>
+
+#include "db/blob/blob_file_builder.h"
+#include "db/compaction/compaction_iterator.h"
+#include "db/event_helpers.h"
+#include "db/internal_stats.h"
+#include "db/merge_helper.h"
+#include "db/output_validator.h"
+#include "db/range_del_aggregator.h"
+#include "db/table_cache.h"
+#include "db/version_edit.h"
+#include "file/file_util.h"
+#include "file/filename.h"
+#include "file/read_write_util.h"
+#include "file/writable_file_writer.h"
+#include "monitoring/iostats_context_imp.h"
+#include "monitoring/thread_status_util.h"
+#include "options/options_helper.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/options.h"
+#include "rocksdb/table.h"
+#include "table/block_based/block_based_table_builder.h"
+#include "table/format.h"
+#include "table/internal_iterator.h"
+#include "table/unique_id_impl.h"
+#include "test_util/sync_point.h"
+#include "util/stop_watch.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class TableFactory;
+
+TableBuilder* NewTableBuilder(const TableBuilderOptions& tboptions,
+ WritableFileWriter* file) {
+ assert((tboptions.column_family_id ==
+ TablePropertiesCollectorFactory::Context::kUnknownColumnFamily) ==
+ tboptions.column_family_name.empty());
+ return tboptions.ioptions.table_factory->NewTableBuilder(tboptions, file);
+}
+
+Status BuildTable(
+ const std::string& dbname, VersionSet* versions,
+ const ImmutableDBOptions& db_options, const TableBuilderOptions& tboptions,
+ const FileOptions& file_options, TableCache* table_cache,
+ InternalIterator* iter,
+ std::vector<std::unique_ptr<FragmentedRangeTombstoneIterator>>
+ range_del_iters,
+ FileMetaData* meta, std::vector<BlobFileAddition>* blob_file_additions,
+ std::vector<SequenceNumber> snapshots,
+ SequenceNumber earliest_write_conflict_snapshot,
+ SequenceNumber job_snapshot, SnapshotChecker* snapshot_checker,
+ bool paranoid_file_checks, InternalStats* internal_stats,
+ IOStatus* io_status, const std::shared_ptr<IOTracer>& io_tracer,
+ BlobFileCreationReason blob_creation_reason,
+ const SeqnoToTimeMapping& seqno_to_time_mapping, EventLogger* event_logger,
+ int job_id, const Env::IOPriority io_priority,
+ TableProperties* table_properties, Env::WriteLifeTimeHint write_hint,
+ const std::string* full_history_ts_low,
+ BlobFileCompletionCallback* blob_callback, uint64_t* num_input_entries,
+ uint64_t* memtable_payload_bytes, uint64_t* memtable_garbage_bytes) {
+ assert((tboptions.column_family_id ==
+ TablePropertiesCollectorFactory::Context::kUnknownColumnFamily) ==
+ tboptions.column_family_name.empty());
+ auto& mutable_cf_options = tboptions.moptions;
+ auto& ioptions = tboptions.ioptions;
+  // Granularity (in bytes) at which flush IO stats are reported.
+ const size_t kReportFlushIOStatsEvery = 1048576;
+ OutputValidator output_validator(
+ tboptions.internal_comparator,
+ /*enable_order_check=*/
+ mutable_cf_options.check_flush_compaction_key_order,
+ /*enable_hash=*/paranoid_file_checks);
+ Status s;
+ meta->fd.file_size = 0;
+ iter->SeekToFirst();
+ std::unique_ptr<CompactionRangeDelAggregator> range_del_agg(
+ new CompactionRangeDelAggregator(&tboptions.internal_comparator,
+ snapshots, full_history_ts_low));
+ uint64_t num_unfragmented_tombstones = 0;
+ uint64_t total_tombstone_payload_bytes = 0;
+ for (auto& range_del_iter : range_del_iters) {
+ num_unfragmented_tombstones +=
+ range_del_iter->num_unfragmented_tombstones();
+ total_tombstone_payload_bytes +=
+ range_del_iter->total_tombstone_payload_bytes();
+ range_del_agg->AddTombstones(std::move(range_del_iter));
+ }
+
+ std::string fname = TableFileName(ioptions.cf_paths, meta->fd.GetNumber(),
+ meta->fd.GetPathId());
+ std::vector<std::string> blob_file_paths;
+ std::string file_checksum = kUnknownFileChecksum;
+ std::string file_checksum_func_name = kUnknownFileChecksumFuncName;
+#ifndef ROCKSDB_LITE
+ EventHelpers::NotifyTableFileCreationStarted(ioptions.listeners, dbname,
+ tboptions.column_family_name,
+ fname, job_id, tboptions.reason);
+#endif // !ROCKSDB_LITE
+ Env* env = db_options.env;
+ assert(env);
+ FileSystem* fs = db_options.fs.get();
+ assert(fs);
+
+ TableProperties tp;
+ bool table_file_created = false;
+ if (iter->Valid() || !range_del_agg->IsEmpty()) {
+ std::unique_ptr<CompactionFilter> compaction_filter;
+ if (ioptions.compaction_filter_factory != nullptr &&
+ ioptions.compaction_filter_factory->ShouldFilterTableFileCreation(
+ tboptions.reason)) {
+ CompactionFilter::Context context;
+ context.is_full_compaction = false;
+ context.is_manual_compaction = false;
+ context.column_family_id = tboptions.column_family_id;
+ context.reason = tboptions.reason;
+ compaction_filter =
+ ioptions.compaction_filter_factory->CreateCompactionFilter(context);
+ if (compaction_filter != nullptr &&
+ !compaction_filter->IgnoreSnapshots()) {
+ s.PermitUncheckedError();
+ return Status::NotSupported(
+ "CompactionFilter::IgnoreSnapshots() = false is not supported "
+ "anymore.");
+ }
+ }
+
+ TableBuilder* builder;
+ std::unique_ptr<WritableFileWriter> file_writer;
+ {
+ std::unique_ptr<FSWritableFile> file;
+#ifndef NDEBUG
+ bool use_direct_writes = file_options.use_direct_writes;
+ TEST_SYNC_POINT_CALLBACK("BuildTable:create_file", &use_direct_writes);
+#endif // !NDEBUG
+ IOStatus io_s = NewWritableFile(fs, fname, &file, file_options);
+ assert(s.ok());
+ s = io_s;
+ if (io_status->ok()) {
+ *io_status = io_s;
+ }
+ if (!s.ok()) {
+ EventHelpers::LogAndNotifyTableFileCreationFinished(
+ event_logger, ioptions.listeners, dbname,
+ tboptions.column_family_name, fname, job_id, meta->fd,
+ kInvalidBlobFileNumber, tp, tboptions.reason, s, file_checksum,
+ file_checksum_func_name);
+ return s;
+ }
+
+ table_file_created = true;
+ FileTypeSet tmp_set = ioptions.checksum_handoff_file_types;
+ file->SetIOPriority(io_priority);
+ file->SetWriteLifeTimeHint(write_hint);
+ file_writer.reset(new WritableFileWriter(
+ std::move(file), fname, file_options, ioptions.clock, io_tracer,
+ ioptions.stats, ioptions.listeners,
+ ioptions.file_checksum_gen_factory.get(),
+ tmp_set.Contains(FileType::kTableFile), false));
+
+ builder = NewTableBuilder(tboptions, file_writer.get());
+ }
+
+ MergeHelper merge(
+ env, tboptions.internal_comparator.user_comparator(),
+ ioptions.merge_operator.get(), compaction_filter.get(), ioptions.logger,
+ true /* internal key corruption is not ok */,
+ snapshots.empty() ? 0 : snapshots.back(), snapshot_checker);
+
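+    // Write values to blob files only if blob files are enabled, the output
+    // level is at or above blob_file_starting_level, and the caller supplied
+    // a vector to record the new blob file additions.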
+ std::unique_ptr<BlobFileBuilder> blob_file_builder(
+ (mutable_cf_options.enable_blob_files &&
+ tboptions.level_at_creation >=
+ mutable_cf_options.blob_file_starting_level &&
+ blob_file_additions)
+ ? new BlobFileBuilder(
+ versions, fs, &ioptions, &mutable_cf_options, &file_options,
+ tboptions.db_id, tboptions.db_session_id, job_id,
+ tboptions.column_family_id, tboptions.column_family_name,
+ io_priority, write_hint, io_tracer, blob_callback,
+ blob_creation_reason, &blob_file_paths, blob_file_additions)
+ : nullptr);
+
+ const std::atomic<bool> kManualCompactionCanceledFalse{false};
+ CompactionIterator c_iter(
+ iter, tboptions.internal_comparator.user_comparator(), &merge,
+ kMaxSequenceNumber, &snapshots, earliest_write_conflict_snapshot,
+ job_snapshot, snapshot_checker, env,
+ ShouldReportDetailedTime(env, ioptions.stats),
+ true /* internal key corruption is not ok */, range_del_agg.get(),
+ blob_file_builder.get(), ioptions.allow_data_in_errors,
+ ioptions.enforce_single_del_contracts,
+ /*manual_compaction_canceled=*/kManualCompactionCanceledFalse,
+ /*compaction=*/nullptr, compaction_filter.get(),
+ /*shutting_down=*/nullptr, db_options.info_log, full_history_ts_low);
+
+ c_iter.SeekToFirst();
+ for (; c_iter.Valid(); c_iter.Next()) {
+ const Slice& key = c_iter.key();
+ const Slice& value = c_iter.value();
+ const ParsedInternalKey& ikey = c_iter.ikey();
+      // Generate a rolling 64-bit hash of the key and value.
+      // Note: here "key" is the internal key, which combines the user key,
+      // the sequence number and the value type.
+ s = output_validator.Add(key, value);
+ if (!s.ok()) {
+ break;
+ }
+ builder->Add(key, value);
+
+ s = meta->UpdateBoundaries(key, value, ikey.sequence, ikey.type);
+ if (!s.ok()) {
+ break;
+ }
+
+ // TODO(noetzli): Update stats after flush, too.
+ if (io_priority == Env::IO_HIGH &&
+ IOSTATS(bytes_written) >= kReportFlushIOStatsEvery) {
+ ThreadStatusUtil::SetThreadOperationProperty(
+ ThreadStatus::FLUSH_BYTES_WRITTEN, IOSTATS(bytes_written));
+ }
+ }
+ if (!s.ok()) {
+ c_iter.status().PermitUncheckedError();
+ } else if (!c_iter.status().ok()) {
+ s = c_iter.status();
+ }
+
+ if (s.ok()) {
+ auto range_del_it = range_del_agg->NewIterator();
+ for (range_del_it->SeekToFirst(); range_del_it->Valid();
+ range_del_it->Next()) {
+ auto tombstone = range_del_it->Tombstone();
+ auto kv = tombstone.Serialize();
+ builder->Add(kv.first.Encode(), kv.second);
+ meta->UpdateBoundariesForRange(kv.first, tombstone.SerializeEndKey(),
+ tombstone.seq_,
+ tboptions.internal_comparator);
+ }
+ }
+
+ TEST_SYNC_POINT("BuildTable:BeforeFinishBuildTable");
+ const bool empty = builder->IsEmpty();
+ if (num_input_entries != nullptr) {
+ *num_input_entries =
+ c_iter.num_input_entry_scanned() + num_unfragmented_tombstones;
+ }
+ if (!s.ok() || empty) {
+ builder->Abandon();
+ } else {
+ std::string seqno_time_mapping_str;
+ seqno_to_time_mapping.Encode(
+ seqno_time_mapping_str, meta->fd.smallest_seqno,
+ meta->fd.largest_seqno, meta->file_creation_time);
+ builder->SetSeqnoTimeTableProperties(
+ seqno_time_mapping_str,
+ ioptions.compaction_style == CompactionStyle::kCompactionStyleFIFO
+ ? meta->file_creation_time
+ : meta->oldest_ancester_time);
+ s = builder->Finish();
+ }
+ if (io_status->ok()) {
+ *io_status = builder->io_status();
+ }
+
+ if (s.ok() && !empty) {
+ uint64_t file_size = builder->FileSize();
+ meta->fd.file_size = file_size;
+ meta->marked_for_compaction = builder->NeedCompact();
+ assert(meta->fd.GetFileSize() > 0);
+ tp = builder
+ ->GetTableProperties(); // refresh now that builder is finished
+ if (memtable_payload_bytes != nullptr &&
+ memtable_garbage_bytes != nullptr) {
+ const CompactionIterationStats& ci_stats = c_iter.iter_stats();
+ uint64_t total_payload_bytes = ci_stats.total_input_raw_key_bytes +
+ ci_stats.total_input_raw_value_bytes +
+ total_tombstone_payload_bytes;
+ uint64_t total_payload_bytes_written =
+ (tp.raw_key_size + tp.raw_value_size);
+ // Prevent underflow, which may still happen at this point
+ // since we only support inserts, deletes, and deleteRanges.
+ if (total_payload_bytes_written <= total_payload_bytes) {
+ *memtable_payload_bytes = total_payload_bytes;
+ *memtable_garbage_bytes =
+ total_payload_bytes - total_payload_bytes_written;
+ } else {
+ *memtable_payload_bytes = 0;
+ *memtable_garbage_bytes = 0;
+ }
+ }
+ if (table_properties) {
+ *table_properties = tp;
+ }
+ }
+ delete builder;
+
+ // Finish and check for file errors
+ TEST_SYNC_POINT("BuildTable:BeforeSyncTable");
+ if (s.ok() && !empty) {
+ StopWatch sw(ioptions.clock, ioptions.stats, TABLE_SYNC_MICROS);
+ *io_status = file_writer->Sync(ioptions.use_fsync);
+ }
+ TEST_SYNC_POINT("BuildTable:BeforeCloseTableFile");
+ if (s.ok() && io_status->ok() && !empty) {
+ *io_status = file_writer->Close();
+ }
+ if (s.ok() && io_status->ok() && !empty) {
+ // Add the checksum information to file metadata.
+ meta->file_checksum = file_writer->GetFileChecksum();
+ meta->file_checksum_func_name = file_writer->GetFileChecksumFuncName();
+ file_checksum = meta->file_checksum;
+ file_checksum_func_name = meta->file_checksum_func_name;
+ // Set unique_id only if db_id and db_session_id exist
+ if (!tboptions.db_id.empty() && !tboptions.db_session_id.empty()) {
+ if (!GetSstInternalUniqueId(tboptions.db_id, tboptions.db_session_id,
+ meta->fd.GetNumber(), &(meta->unique_id))
+ .ok()) {
+        // If we failed to get a unique id, just set it to null.
+ meta->unique_id = kNullUniqueId64x2;
+ }
+ }
+ }
+
+ if (s.ok()) {
+ s = *io_status;
+ }
+
+ if (blob_file_builder) {
+ if (s.ok()) {
+ s = blob_file_builder->Finish();
+ } else {
+ blob_file_builder->Abandon(s);
+ }
+ blob_file_builder.reset();
+ }
+
+  // TODO: also check the IO status when creating the Iterator.
+
+ TEST_SYNC_POINT("BuildTable:BeforeOutputValidation");
+ if (s.ok() && !empty) {
+    // Verify that the table is usable.
+    // We set for_compaction to false and do not use
+    // OptimizeForCompactionTableRead here because this is a special case run
+    // right after the table is built. Regardless of whether
+    // use_direct_io_for_flush_and_compaction is set, the goal is to cache the
+    // table here for subsequent user reads.
+ ReadOptions read_options;
+ std::unique_ptr<InternalIterator> it(table_cache->NewIterator(
+ read_options, file_options, tboptions.internal_comparator, *meta,
+ nullptr /* range_del_agg */, mutable_cf_options.prefix_extractor,
+ nullptr,
+ (internal_stats == nullptr) ? nullptr
+ : internal_stats->GetFileReadHist(0),
+ TableReaderCaller::kFlush, /*arena=*/nullptr,
+ /*skip_filter=*/false, tboptions.level_at_creation,
+ MaxFileSizeForL0MetaPin(mutable_cf_options),
+ /*smallest_compaction_key=*/nullptr,
+ /*largest_compaction_key*/ nullptr,
+ /*allow_unprepared_value*/ false));
+ s = it->status();
+ if (s.ok() && paranoid_file_checks) {
+ OutputValidator file_validator(tboptions.internal_comparator,
+ /*enable_order_check=*/true,
+ /*enable_hash=*/true);
+ for (it->SeekToFirst(); it->Valid(); it->Next()) {
+        // Generate a rolling 64-bit hash of the key and value.
+ file_validator.Add(it->key(), it->value()).PermitUncheckedError();
+ }
+ s = it->status();
+ if (s.ok() && !output_validator.CompareValidator(file_validator)) {
+ s = Status::Corruption("Paranoid checksums do not match");
+ }
+ }
+ }
+ }
+
+ // Check for input iterator errors
+ if (!iter->status().ok()) {
+ s = iter->status();
+ }
+
+ if (!s.ok() || meta->fd.GetFileSize() == 0) {
+ TEST_SYNC_POINT("BuildTable:BeforeDeleteFile");
+
+ constexpr IODebugContext* dbg = nullptr;
+
+ if (table_file_created) {
+ Status ignored = fs->DeleteFile(fname, IOOptions(), dbg);
+ ignored.PermitUncheckedError();
+ }
+
+ assert(blob_file_additions || blob_file_paths.empty());
+
+ if (blob_file_additions) {
+ for (const std::string& blob_file_path : blob_file_paths) {
+ Status ignored = DeleteDBFile(&db_options, blob_file_path, dbname,
+ /*force_bg=*/false, /*force_fg=*/false);
+ ignored.PermitUncheckedError();
+ TEST_SYNC_POINT("BuildTable::AfterDeleteFile");
+ }
+ }
+ }
+
+ Status status_for_listener = s;
+ if (meta->fd.GetFileSize() == 0) {
+ fname = "(nil)";
+ if (s.ok()) {
+ status_for_listener = Status::Aborted("Empty SST file not kept");
+ }
+ }
+ // Output to event logger and fire events.
+ EventHelpers::LogAndNotifyTableFileCreationFinished(
+ event_logger, ioptions.listeners, dbname, tboptions.column_family_name,
+ fname, job_id, meta->fd, meta->oldest_blob_file_number, tp,
+ tboptions.reason, status_for_listener, file_checksum,
+ file_checksum_func_name);
+
+ return s;
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/builder.h b/src/rocksdb/db/builder.h
new file mode 100644
index 000000000..a028fd2ba
--- /dev/null
+++ b/src/rocksdb/db/builder.h
@@ -0,0 +1,77 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "db/range_tombstone_fragmenter.h"
+#include "db/seqno_to_time_mapping.h"
+#include "db/table_properties_collector.h"
+#include "logging/event_logger.h"
+#include "options/cf_options.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/env.h"
+#include "rocksdb/listener.h"
+#include "rocksdb/options.h"
+#include "rocksdb/status.h"
+#include "rocksdb/table_properties.h"
+#include "rocksdb/types.h"
+#include "table/scoped_arena_iterator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+struct FileMetaData;
+
+class VersionSet;
+class BlobFileAddition;
+class SnapshotChecker;
+class TableCache;
+class TableBuilder;
+class WritableFileWriter;
+class InternalStats;
+class BlobFileCompletionCallback;
+
+// Convenience function for NewTableBuilder on the embedded table_factory.
+TableBuilder* NewTableBuilder(const TableBuilderOptions& tboptions,
+ WritableFileWriter* file);
+
+// Build a Table file from the contents of *iter. The generated file
+// will be named according to the number specified in meta. On success, the
+// rest of *meta will be filled with metadata about the generated table.
+// If no data is present in *iter, meta->file_size will be set to
+// zero, and no Table file will be produced.
+//
+// @param column_family_name Name of the column family that is also identified
+// by column_family_id, or empty string if unknown.
+extern Status BuildTable(
+ const std::string& dbname, VersionSet* versions,
+ const ImmutableDBOptions& db_options, const TableBuilderOptions& tboptions,
+ const FileOptions& file_options, TableCache* table_cache,
+ InternalIterator* iter,
+ std::vector<std::unique_ptr<FragmentedRangeTombstoneIterator>>
+ range_del_iters,
+ FileMetaData* meta, std::vector<BlobFileAddition>* blob_file_additions,
+ std::vector<SequenceNumber> snapshots,
+ SequenceNumber earliest_write_conflict_snapshot,
+ SequenceNumber job_snapshot, SnapshotChecker* snapshot_checker,
+ bool paranoid_file_checks, InternalStats* internal_stats,
+ IOStatus* io_status, const std::shared_ptr<IOTracer>& io_tracer,
+ BlobFileCreationReason blob_creation_reason,
+ const SeqnoToTimeMapping& seqno_to_time_mapping,
+ EventLogger* event_logger = nullptr, int job_id = 0,
+ const Env::IOPriority io_priority = Env::IO_HIGH,
+ TableProperties* table_properties = nullptr,
+ Env::WriteLifeTimeHint write_hint = Env::WLTH_NOT_SET,
+ const std::string* full_history_ts_low = nullptr,
+ BlobFileCompletionCallback* blob_callback = nullptr,
+ uint64_t* num_input_entries = nullptr,
+ uint64_t* memtable_payload_bytes = nullptr,
+ uint64_t* memtable_garbage_bytes = nullptr);
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/c.cc b/src/rocksdb/db/c.cc
new file mode 100644
index 000000000..a7e4360c6
--- /dev/null
+++ b/src/rocksdb/db/c.cc
@@ -0,0 +1,6390 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef ROCKSDB_LITE
+
+#include "rocksdb/c.h"
+
+#include <cstdlib>
+#include <map>
+#include <unordered_set>
+#include <vector>
+
+#include "port/port.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/experimental.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/options.h"
+#include "rocksdb/perf_context.h"
+#include "rocksdb/rate_limiter.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/status.h"
+#include "rocksdb/table.h"
+#include "rocksdb/universal_compaction.h"
+#include "rocksdb/utilities/backup_engine.h"
+#include "rocksdb/utilities/checkpoint.h"
+#include "rocksdb/utilities/db_ttl.h"
+#include "rocksdb/utilities/memory_util.h"
+#include "rocksdb/utilities/optimistic_transaction_db.h"
+#include "rocksdb/utilities/options_util.h"
+#include "rocksdb/utilities/table_properties_collectors.h"
+#include "rocksdb/utilities/transaction.h"
+#include "rocksdb/utilities/transaction_db.h"
+#include "rocksdb/utilities/write_batch_with_index.h"
+#include "rocksdb/write_batch.h"
+#include "utilities/merge_operators.h"
+
+using ROCKSDB_NAMESPACE::BackupEngine;
+using ROCKSDB_NAMESPACE::BackupEngineOptions;
+using ROCKSDB_NAMESPACE::BackupID;
+using ROCKSDB_NAMESPACE::BackupInfo;
+using ROCKSDB_NAMESPACE::BatchResult;
+using ROCKSDB_NAMESPACE::BlockBasedTableOptions;
+using ROCKSDB_NAMESPACE::BottommostLevelCompaction;
+using ROCKSDB_NAMESPACE::BytewiseComparator;
+using ROCKSDB_NAMESPACE::Cache;
+using ROCKSDB_NAMESPACE::Checkpoint;
+using ROCKSDB_NAMESPACE::ColumnFamilyDescriptor;
+using ROCKSDB_NAMESPACE::ColumnFamilyHandle;
+using ROCKSDB_NAMESPACE::ColumnFamilyMetaData;
+using ROCKSDB_NAMESPACE::ColumnFamilyOptions;
+using ROCKSDB_NAMESPACE::CompactionFilter;
+using ROCKSDB_NAMESPACE::CompactionFilterFactory;
+using ROCKSDB_NAMESPACE::CompactionOptionsFIFO;
+using ROCKSDB_NAMESPACE::CompactRangeOptions;
+using ROCKSDB_NAMESPACE::Comparator;
+using ROCKSDB_NAMESPACE::CompressionType;
+using ROCKSDB_NAMESPACE::CuckooTableOptions;
+using ROCKSDB_NAMESPACE::DB;
+using ROCKSDB_NAMESPACE::DBOptions;
+using ROCKSDB_NAMESPACE::DbPath;
+using ROCKSDB_NAMESPACE::Env;
+using ROCKSDB_NAMESPACE::EnvOptions;
+using ROCKSDB_NAMESPACE::FileLock;
+using ROCKSDB_NAMESPACE::FilterPolicy;
+using ROCKSDB_NAMESPACE::FlushOptions;
+using ROCKSDB_NAMESPACE::InfoLogLevel;
+using ROCKSDB_NAMESPACE::IngestExternalFileOptions;
+using ROCKSDB_NAMESPACE::Iterator;
+using ROCKSDB_NAMESPACE::LevelMetaData;
+using ROCKSDB_NAMESPACE::LiveFileMetaData;
+using ROCKSDB_NAMESPACE::Logger;
+using ROCKSDB_NAMESPACE::LRUCacheOptions;
+using ROCKSDB_NAMESPACE::MemoryAllocator;
+using ROCKSDB_NAMESPACE::MemoryUtil;
+using ROCKSDB_NAMESPACE::MergeOperator;
+using ROCKSDB_NAMESPACE::NewBloomFilterPolicy;
+using ROCKSDB_NAMESPACE::NewCompactOnDeletionCollectorFactory;
+using ROCKSDB_NAMESPACE::NewGenericRateLimiter;
+using ROCKSDB_NAMESPACE::NewLRUCache;
+using ROCKSDB_NAMESPACE::NewRibbonFilterPolicy;
+using ROCKSDB_NAMESPACE::OptimisticTransactionDB;
+using ROCKSDB_NAMESPACE::OptimisticTransactionOptions;
+using ROCKSDB_NAMESPACE::Options;
+using ROCKSDB_NAMESPACE::PerfContext;
+using ROCKSDB_NAMESPACE::PerfLevel;
+using ROCKSDB_NAMESPACE::PinnableSlice;
+using ROCKSDB_NAMESPACE::PrepopulateBlobCache;
+using ROCKSDB_NAMESPACE::RandomAccessFile;
+using ROCKSDB_NAMESPACE::Range;
+using ROCKSDB_NAMESPACE::RateLimiter;
+using ROCKSDB_NAMESPACE::ReadOptions;
+using ROCKSDB_NAMESPACE::RestoreOptions;
+using ROCKSDB_NAMESPACE::SequentialFile;
+using ROCKSDB_NAMESPACE::Slice;
+using ROCKSDB_NAMESPACE::SliceParts;
+using ROCKSDB_NAMESPACE::SliceTransform;
+using ROCKSDB_NAMESPACE::Snapshot;
+using ROCKSDB_NAMESPACE::SstFileMetaData;
+using ROCKSDB_NAMESPACE::SstFileWriter;
+using ROCKSDB_NAMESPACE::Status;
+using ROCKSDB_NAMESPACE::TablePropertiesCollectorFactory;
+using ROCKSDB_NAMESPACE::Transaction;
+using ROCKSDB_NAMESPACE::TransactionDB;
+using ROCKSDB_NAMESPACE::TransactionDBOptions;
+using ROCKSDB_NAMESPACE::TransactionLogIterator;
+using ROCKSDB_NAMESPACE::TransactionOptions;
+using ROCKSDB_NAMESPACE::WALRecoveryMode;
+using ROCKSDB_NAMESPACE::WritableFile;
+using ROCKSDB_NAMESPACE::WriteBatch;
+using ROCKSDB_NAMESPACE::WriteBatchWithIndex;
+using ROCKSDB_NAMESPACE::WriteOptions;
+
+using std::unordered_set;
+using std::vector;
+
+extern "C" {
+
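+// Each opaque C handle below is a thin struct that wraps the corresponding
+// C++ object (or a smart pointer to it) in a single "rep" member.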
+struct rocksdb_t {
+ DB* rep;
+};
+struct rocksdb_backup_engine_t {
+ BackupEngine* rep;
+};
+struct rocksdb_backup_engine_info_t {
+ std::vector<BackupInfo> rep;
+};
+struct rocksdb_restore_options_t {
+ RestoreOptions rep;
+};
+struct rocksdb_iterator_t {
+ Iterator* rep;
+};
+struct rocksdb_writebatch_t {
+ WriteBatch rep;
+};
+struct rocksdb_writebatch_wi_t {
+ WriteBatchWithIndex* rep;
+};
+struct rocksdb_snapshot_t {
+ const Snapshot* rep;
+};
+struct rocksdb_flushoptions_t {
+ FlushOptions rep;
+};
+struct rocksdb_fifo_compaction_options_t {
+ CompactionOptionsFIFO rep;
+};
+struct rocksdb_readoptions_t {
+ ReadOptions rep;
+  // Backing storage for the Slice pointers held inside ReadOptions; these
+  // must remain valid for as long as the ReadOptions is in use.
+ Slice upper_bound;
+ Slice lower_bound;
+ Slice timestamp;
+ Slice iter_start_ts;
+};
+struct rocksdb_writeoptions_t {
+ WriteOptions rep;
+};
+struct rocksdb_options_t {
+ Options rep;
+};
+struct rocksdb_compactoptions_t {
+ CompactRangeOptions rep;
+ Slice full_history_ts_low;
+};
+struct rocksdb_block_based_table_options_t {
+ BlockBasedTableOptions rep;
+};
+struct rocksdb_cuckoo_table_options_t {
+ CuckooTableOptions rep;
+};
+struct rocksdb_seqfile_t {
+ SequentialFile* rep;
+};
+struct rocksdb_randomfile_t {
+ RandomAccessFile* rep;
+};
+struct rocksdb_writablefile_t {
+ WritableFile* rep;
+};
+struct rocksdb_wal_iterator_t {
+ TransactionLogIterator* rep;
+};
+struct rocksdb_wal_readoptions_t {
+ TransactionLogIterator::ReadOptions rep;
+};
+struct rocksdb_filelock_t {
+ FileLock* rep;
+};
+struct rocksdb_logger_t {
+ std::shared_ptr<Logger> rep;
+};
+struct rocksdb_lru_cache_options_t {
+ LRUCacheOptions rep;
+};
+struct rocksdb_memory_allocator_t {
+ std::shared_ptr<MemoryAllocator> rep;
+};
+struct rocksdb_cache_t {
+ std::shared_ptr<Cache> rep;
+};
+struct rocksdb_livefiles_t {
+ std::vector<LiveFileMetaData> rep;
+};
+struct rocksdb_column_family_handle_t {
+ ColumnFamilyHandle* rep;
+};
+struct rocksdb_column_family_metadata_t {
+ ColumnFamilyMetaData rep;
+};
+struct rocksdb_level_metadata_t {
+ const LevelMetaData* rep;
+};
+struct rocksdb_sst_file_metadata_t {
+ const SstFileMetaData* rep;
+};
+struct rocksdb_envoptions_t {
+ EnvOptions rep;
+};
+struct rocksdb_ingestexternalfileoptions_t {
+ IngestExternalFileOptions rep;
+};
+struct rocksdb_sstfilewriter_t {
+ SstFileWriter* rep;
+};
+struct rocksdb_ratelimiter_t {
+ std::shared_ptr<RateLimiter> rep;
+};
+struct rocksdb_perfcontext_t {
+ PerfContext* rep;
+};
+struct rocksdb_pinnableslice_t {
+ PinnableSlice rep;
+};
+struct rocksdb_transactiondb_options_t {
+ TransactionDBOptions rep;
+};
+struct rocksdb_transactiondb_t {
+ TransactionDB* rep;
+};
+struct rocksdb_transaction_options_t {
+ TransactionOptions rep;
+};
+struct rocksdb_transaction_t {
+ Transaction* rep;
+};
+struct rocksdb_backup_engine_options_t {
+ BackupEngineOptions rep;
+};
+struct rocksdb_checkpoint_t {
+ Checkpoint* rep;
+};
+struct rocksdb_optimistictransactiondb_t {
+ OptimisticTransactionDB* rep;
+};
+struct rocksdb_optimistictransaction_options_t {
+ OptimisticTransactionOptions rep;
+};
+
+struct rocksdb_compactionfiltercontext_t {
+ CompactionFilter::Context rep;
+};
+
+struct rocksdb_compactionfilter_t : public CompactionFilter {
+ void* state_;
+ void (*destructor_)(void*);
+ unsigned char (*filter_)(void*, int level, const char* key, size_t key_length,
+ const char* existing_value, size_t value_length,
+ char** new_value, size_t* new_value_length,
+ unsigned char* value_changed);
+ const char* (*name_)(void*);
+ unsigned char ignore_snapshots_;
+
+ ~rocksdb_compactionfilter_t() override { (*destructor_)(state_); }
+
+ bool Filter(int level, const Slice& key, const Slice& existing_value,
+ std::string* new_value, bool* value_changed) const override {
+ char* c_new_value = nullptr;
+ size_t new_value_length = 0;
+ unsigned char c_value_changed = 0;
+ unsigned char result =
+ (*filter_)(state_, level, key.data(), key.size(), existing_value.data(),
+ existing_value.size(), &c_new_value, &new_value_length,
+ &c_value_changed);
+ if (c_value_changed) {
+ new_value->assign(c_new_value, new_value_length);
+ *value_changed = true;
+ }
+ return result;
+ }
+
+ const char* Name() const override { return (*name_)(state_); }
+
+ bool IgnoreSnapshots() const override { return ignore_snapshots_; }
+};
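+
+// A minimal sketch of the C callbacks the wrapper above expects; the function
+// names here are hypothetical and purely illustrative. A real client would
+// pass callbacks like these to the C API's compaction filter constructor
+// (e.g. rocksdb_compactionfilter_create) together with an opaque state
+// pointer.
+unsigned char example_keep_all_filter(void* /*state*/, int /*level*/,
+                                      const char* /*key*/,
+                                      size_t /*key_length*/,
+                                      const char* /*existing_value*/,
+                                      size_t /*value_length*/,
+                                      char** /*new_value*/,
+                                      size_t* /*new_value_length*/,
+                                      unsigned char* value_changed) {
+  *value_changed = 0;  // keep the existing value unchanged
+  return 0;            // 0 means "do not drop this key during compaction"
+}
+const char* example_keep_all_filter_name(void* /*state*/) {
+  return "example_keep_all_filter";
+}
+void example_keep_all_filter_destructor(void* /*state*/) {}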
+
+struct rocksdb_compactionfilterfactory_t : public CompactionFilterFactory {
+ void* state_;
+ void (*destructor_)(void*);
+ rocksdb_compactionfilter_t* (*create_compaction_filter_)(
+ void*, rocksdb_compactionfiltercontext_t* context);
+ const char* (*name_)(void*);
+
+ ~rocksdb_compactionfilterfactory_t() override { (*destructor_)(state_); }
+
+ std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+ const CompactionFilter::Context& context) override {
+ rocksdb_compactionfiltercontext_t ccontext;
+ ccontext.rep = context;
+ CompactionFilter* cf = (*create_compaction_filter_)(state_, &ccontext);
+ return std::unique_ptr<CompactionFilter>(cf);
+ }
+
+ const char* Name() const override { return (*name_)(state_); }
+};
+
+struct rocksdb_comparator_t : public Comparator {
+ void* state_;
+ void (*destructor_)(void*);
+ int (*compare_)(void*, const char* a, size_t alen, const char* b,
+ size_t blen);
+ const char* (*name_)(void*);
+ int (*compare_ts_)(void*, const char* a_ts, size_t a_tslen, const char* b_ts,
+ size_t b_tslen);
+ int (*compare_without_ts_)(void*, const char* a, size_t alen,
+ unsigned char a_has_ts, const char* b, size_t blen,
+ unsigned char b_has_ts);
+
+ rocksdb_comparator_t() : Comparator() {}
+
+ rocksdb_comparator_t(size_t ts_size) : Comparator(ts_size) {}
+
+ ~rocksdb_comparator_t() override { (*destructor_)(state_); }
+
+ int Compare(const Slice& a, const Slice& b) const override {
+ return (*compare_)(state_, a.data(), a.size(), b.data(), b.size());
+ }
+
+ int CompareTimestamp(const Slice& a_ts, const Slice& b_ts) const override {
+ if (compare_ts_ == nullptr) {
+ return 0;
+ }
+ return (*compare_ts_)(state_, a_ts.data(), a_ts.size(), b_ts.data(),
+ b_ts.size());
+ }
+
+ int CompareWithoutTimestamp(const Slice& a, bool a_has_ts, const Slice& b,
+ bool b_has_ts) const override {
+ if (compare_without_ts_ == nullptr) {
+ return Compare(a, b);
+ }
+ return (*compare_without_ts_)(state_, a.data(), a.size(), a_has_ts,
+ b.data(), b.size(), b_has_ts);
+ }
+
+ const char* Name() const override { return (*name_)(state_); }
+
+ // No-ops since the C binding does not support key shortening methods.
+ void FindShortestSeparator(std::string*, const Slice&) const override {}
+ void FindShortSuccessor(std::string* /*key*/) const override {}
+};
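+
+// A minimal sketch of the compare/name callbacks this wrapper forwards to.
+// The rocksdb_comparator_create() constructor is declared in the public C
+// header; the timestamp-aware callbacks may be left null, since the
+// overrides above fall back gracefully when they are unset:
+//
+//   static int bytewise_compare(void* /*state*/, const char* a, size_t alen,
+//                               const char* b, size_t blen) {
+//     size_t n = alen < blen ? alen : blen;
+//     int r = memcmp(a, b, n);
+//     if (r == 0) r = (alen < blen) ? -1 : (alen > blen) ? 1 : 0;
+//     return r;
+//   }
+//   static const char* cmp_name(void* /*state*/) { return "bytewise-sketch"; }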
+
+struct rocksdb_filterpolicy_t : public FilterPolicy {
+ void* state_;
+ void (*destructor_)(void*);
+ const char* (*name_)(void*);
+
+ ~rocksdb_filterpolicy_t() override { (*destructor_)(state_); }
+
+ const char* Name() const override { return (*name_)(state_); }
+};
+
+struct rocksdb_mergeoperator_t : public MergeOperator {
+ void* state_;
+ void (*destructor_)(void*);
+ const char* (*name_)(void*);
+ char* (*full_merge_)(void*, const char* key, size_t key_length,
+ const char* existing_value, size_t existing_value_length,
+ const char* const* operands_list,
+ const size_t* operands_list_length, int num_operands,
+ unsigned char* success, size_t* new_value_length);
+ char* (*partial_merge_)(void*, const char* key, size_t key_length,
+ const char* const* operands_list,
+ const size_t* operands_list_length, int num_operands,
+ unsigned char* success, size_t* new_value_length);
+ void (*delete_value_)(void*, const char* value, size_t value_length);
+
+ ~rocksdb_mergeoperator_t() override { (*destructor_)(state_); }
+
+ const char* Name() const override { return (*name_)(state_); }
+
+ bool FullMergeV2(const MergeOperationInput& merge_in,
+ MergeOperationOutput* merge_out) const override {
+ size_t n = merge_in.operand_list.size();
+ std::vector<const char*> operand_pointers(n);
+ std::vector<size_t> operand_sizes(n);
+ for (size_t i = 0; i < n; i++) {
+ Slice operand(merge_in.operand_list[i]);
+ operand_pointers[i] = operand.data();
+ operand_sizes[i] = operand.size();
+ }
+
+ const char* existing_value_data = nullptr;
+ size_t existing_value_len = 0;
+ if (merge_in.existing_value != nullptr) {
+ existing_value_data = merge_in.existing_value->data();
+ existing_value_len = merge_in.existing_value->size();
+ }
+
+ unsigned char success;
+ size_t new_value_len;
+ char* tmp_new_value = (*full_merge_)(
+ state_, merge_in.key.data(), merge_in.key.size(), existing_value_data,
+ existing_value_len, &operand_pointers[0], &operand_sizes[0],
+ static_cast<int>(n), &success, &new_value_len);
+ merge_out->new_value.assign(tmp_new_value, new_value_len);
+
+ if (delete_value_ != nullptr) {
+ (*delete_value_)(state_, tmp_new_value, new_value_len);
+ } else {
+ free(tmp_new_value);
+ }
+
+ return success;
+ }
+
+ bool PartialMergeMulti(const Slice& key,
+ const std::deque<Slice>& operand_list,
+ std::string* new_value,
+ Logger* /*logger*/) const override {
+ size_t operand_count = operand_list.size();
+ std::vector<const char*> operand_pointers(operand_count);
+ std::vector<size_t> operand_sizes(operand_count);
+ for (size_t i = 0; i < operand_count; ++i) {
+ Slice operand(operand_list[i]);
+ operand_pointers[i] = operand.data();
+ operand_sizes[i] = operand.size();
+ }
+
+ unsigned char success;
+ size_t new_value_len;
+ char* tmp_new_value = (*partial_merge_)(
+ state_, key.data(), key.size(), &operand_pointers[0], &operand_sizes[0],
+ static_cast<int>(operand_count), &success, &new_value_len);
+ new_value->assign(tmp_new_value, new_value_len);
+
+ if (delete_value_ != nullptr) {
+ (*delete_value_)(state_, tmp_new_value, new_value_len);
+ } else {
+ free(tmp_new_value);
+ }
+
+ return success;
+ }
+};
+
+struct rocksdb_dbpath_t {
+ DbPath rep;
+};
+
+struct rocksdb_env_t {
+ Env* rep;
+ bool is_default;
+};
+
+struct rocksdb_slicetransform_t : public SliceTransform {
+ void* state_;
+ void (*destructor_)(void*);
+ const char* (*name_)(void*);
+ char* (*transform_)(void*, const char* key, size_t length,
+ size_t* dst_length);
+ unsigned char (*in_domain_)(void*, const char* key, size_t length);
+ unsigned char (*in_range_)(void*, const char* key, size_t length);
+
+ ~rocksdb_slicetransform_t() override { (*destructor_)(state_); }
+
+ const char* Name() const override { return (*name_)(state_); }
+
+ Slice Transform(const Slice& src) const override {
+ size_t len;
+ char* dst = (*transform_)(state_, src.data(), src.size(), &len);
+ return Slice(dst, len);
+ }
+
+ bool InDomain(const Slice& src) const override {
+ return (*in_domain_)(state_, src.data(), src.size());
+ }
+
+ bool InRange(const Slice& src) const override {
+ return (*in_range_)(state_, src.data(), src.size());
+ }
+};
+
+struct rocksdb_universal_compaction_options_t {
+ ROCKSDB_NAMESPACE::CompactionOptionsUniversal* rep;
+};
+
+// Store a non-OK Status as a heap-allocated (strdup'd) error string in
+// *errptr, replacing any previous message; returns true iff an error was
+// recorded.
+static bool SaveError(char** errptr, const Status& s) {
+ assert(errptr != nullptr);
+ if (s.ok()) {
+ return false;
+ } else if (*errptr == nullptr) {
+ *errptr = strdup(s.ToString().c_str());
+ } else {
+ // TODO(sanjay): Merge with existing error?
+ // This is a bug if *errptr is not created by malloc()
+ free(*errptr);
+ *errptr = strdup(s.ToString().c_str());
+ }
+ return true;
+}
+
+// Copy a std::string into a malloc'd buffer for return through the C API.
+// The result is NOT NUL-terminated; callers receive the length separately
+// and must release the buffer with free().
+static char* CopyString(const std::string& str) {
+ char* result = reinterpret_cast<char*>(malloc(sizeof(char) * str.size()));
+ memcpy(result, str.data(), sizeof(char) * str.size());
+ return result;
+}
+
+rocksdb_t* rocksdb_open(const rocksdb_options_t* options, const char* name,
+ char** errptr) {
+ DB* db;
+ if (SaveError(errptr, DB::Open(options->rep, std::string(name), &db))) {
+ return nullptr;
+ }
+ rocksdb_t* result = new rocksdb_t;
+ result->rep = db;
+ return result;
+}
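+
+// A hedged usage sketch for the open/put/get/close entry points defined in
+// this file. rocksdb_options_create(), rocksdb_writeoptions_create() and
+// rocksdb_readoptions_create() belong to the same C API but are defined
+// elsewhere; error strings and returned values are heap-allocated here and
+// must be released with free():
+//
+//   char* err = NULL;
+//   rocksdb_options_t* opts = rocksdb_options_create();
+//   rocksdb_options_set_create_if_missing(opts, 1);
+//   rocksdb_t* db = rocksdb_open(opts, "/tmp/testdb", &err);
+//   if (err == NULL) {
+//     rocksdb_writeoptions_t* wo = rocksdb_writeoptions_create();
+//     rocksdb_put(db, wo, "key", 3, "value", 5, &err);
+//     rocksdb_readoptions_t* ro = rocksdb_readoptions_create();
+//     size_t vlen = 0;
+//     char* val = rocksdb_get(db, ro, "key", 3, &vlen, &err);
+//     free(val);  // malloc'd by CopyString(); not NUL-terminated
+//     rocksdb_readoptions_destroy(ro);
+//     rocksdb_writeoptions_destroy(wo);
+//     rocksdb_close(db);
+//   }
+//   free(err);
+//   rocksdb_options_destroy(opts);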
+
+rocksdb_t* rocksdb_open_with_ttl(const rocksdb_options_t* options,
+ const char* name, int ttl, char** errptr) {
+ ROCKSDB_NAMESPACE::DBWithTTL* db;
+ if (SaveError(errptr, ROCKSDB_NAMESPACE::DBWithTTL::Open(
+ options->rep, std::string(name), &db, ttl))) {
+ return nullptr;
+ }
+ rocksdb_t* result = new rocksdb_t;
+ result->rep = db;
+ return result;
+}
+
+rocksdb_t* rocksdb_open_for_read_only(const rocksdb_options_t* options,
+ const char* name,
+ unsigned char error_if_wal_file_exists,
+ char** errptr) {
+ DB* db;
+ if (SaveError(errptr, DB::OpenForReadOnly(options->rep, std::string(name),
+ &db, error_if_wal_file_exists))) {
+ return nullptr;
+ }
+ rocksdb_t* result = new rocksdb_t;
+ result->rep = db;
+ return result;
+}
+
+rocksdb_t* rocksdb_open_as_secondary(const rocksdb_options_t* options,
+ const char* name,
+ const char* secondary_path,
+ char** errptr) {
+ DB* db;
+ if (SaveError(errptr,
+ DB::OpenAsSecondary(options->rep, std::string(name),
+ std::string(secondary_path), &db))) {
+ return nullptr;
+ }
+ rocksdb_t* result = new rocksdb_t;
+ result->rep = db;
+ return result;
+}
+
+rocksdb_backup_engine_t* rocksdb_backup_engine_open(
+ const rocksdb_options_t* options, const char* path, char** errptr) {
+ BackupEngine* be;
+ if (SaveError(errptr, BackupEngine::Open(
+ options->rep.env,
+ BackupEngineOptions(path, nullptr, true,
+ options->rep.info_log.get()),
+ &be))) {
+ return nullptr;
+ }
+ rocksdb_backup_engine_t* result = new rocksdb_backup_engine_t;
+ result->rep = be;
+ return result;
+}
+
+rocksdb_backup_engine_t* rocksdb_backup_engine_open_opts(
+ const rocksdb_backup_engine_options_t* options, rocksdb_env_t* env,
+ char** errptr) {
+ BackupEngine* be;
+ if (SaveError(errptr, BackupEngine::Open(options->rep, env->rep, &be))) {
+ return nullptr;
+ }
+ rocksdb_backup_engine_t* result = new rocksdb_backup_engine_t;
+ result->rep = be;
+ return result;
+}
+
+void rocksdb_backup_engine_create_new_backup(rocksdb_backup_engine_t* be,
+ rocksdb_t* db, char** errptr) {
+ SaveError(errptr, be->rep->CreateNewBackup(db->rep));
+}
+
+void rocksdb_backup_engine_create_new_backup_flush(
+ rocksdb_backup_engine_t* be, rocksdb_t* db,
+ unsigned char flush_before_backup, char** errptr) {
+ SaveError(errptr, be->rep->CreateNewBackup(db->rep, flush_before_backup));
+}
+
+void rocksdb_backup_engine_purge_old_backups(rocksdb_backup_engine_t* be,
+ uint32_t num_backups_to_keep,
+ char** errptr) {
+ SaveError(errptr, be->rep->PurgeOldBackups(num_backups_to_keep));
+}
+
+rocksdb_restore_options_t* rocksdb_restore_options_create() {
+ return new rocksdb_restore_options_t;
+}
+
+void rocksdb_restore_options_destroy(rocksdb_restore_options_t* opt) {
+ delete opt;
+}
+
+void rocksdb_restore_options_set_keep_log_files(rocksdb_restore_options_t* opt,
+ int v) {
+ opt->rep.keep_log_files = v;
+}
+
+void rocksdb_backup_engine_verify_backup(rocksdb_backup_engine_t* be,
+ uint32_t backup_id, char** errptr) {
+ SaveError(errptr, be->rep->VerifyBackup(static_cast<BackupID>(backup_id)));
+}
+
+void rocksdb_backup_engine_restore_db_from_latest_backup(
+ rocksdb_backup_engine_t* be, const char* db_dir, const char* wal_dir,
+ const rocksdb_restore_options_t* restore_options, char** errptr) {
+ SaveError(errptr, be->rep->RestoreDBFromLatestBackup(std::string(db_dir),
+ std::string(wal_dir),
+ restore_options->rep));
+}
+
+void rocksdb_backup_engine_restore_db_from_backup(
+ rocksdb_backup_engine_t* be, const char* db_dir, const char* wal_dir,
+ const rocksdb_restore_options_t* restore_options, const uint32_t backup_id,
+ char** errptr) {
+ SaveError(errptr, be->rep->RestoreDBFromBackup(backup_id, std::string(db_dir),
+ std::string(wal_dir),
+ restore_options->rep));
+}
+
+const rocksdb_backup_engine_info_t* rocksdb_backup_engine_get_backup_info(
+ rocksdb_backup_engine_t* be) {
+ rocksdb_backup_engine_info_t* result = new rocksdb_backup_engine_info_t;
+ be->rep->GetBackupInfo(&result->rep);
+ return result;
+}
+
+int rocksdb_backup_engine_info_count(const rocksdb_backup_engine_info_t* info) {
+ return static_cast<int>(info->rep.size());
+}
+
+int64_t rocksdb_backup_engine_info_timestamp(
+ const rocksdb_backup_engine_info_t* info, int index) {
+ return info->rep[index].timestamp;
+}
+
+uint32_t rocksdb_backup_engine_info_backup_id(
+ const rocksdb_backup_engine_info_t* info, int index) {
+ return info->rep[index].backup_id;
+}
+
+uint64_t rocksdb_backup_engine_info_size(
+ const rocksdb_backup_engine_info_t* info, int index) {
+ return info->rep[index].size;
+}
+
+uint32_t rocksdb_backup_engine_info_number_files(
+ const rocksdb_backup_engine_info_t* info, int index) {
+ return info->rep[index].number_files;
+}
+
+void rocksdb_backup_engine_info_destroy(
+ const rocksdb_backup_engine_info_t* info) {
+ delete info;
+}
+
+void rocksdb_backup_engine_close(rocksdb_backup_engine_t* be) {
+ delete be->rep;
+ delete be;
+}
+
+rocksdb_backup_engine_options_t* rocksdb_backup_engine_options_create(
+ const char* backup_dir) {
+ return new rocksdb_backup_engine_options_t{
+ BackupEngineOptions(std::string(backup_dir))};
+}
+
+void rocksdb_backup_engine_options_set_backup_dir(
+ rocksdb_backup_engine_options_t* options, const char* backup_dir) {
+ options->rep.backup_dir = std::string(backup_dir);
+}
+
+void rocksdb_backup_engine_options_set_env(
+ rocksdb_backup_engine_options_t* options, rocksdb_env_t* env) {
+ options->rep.backup_env = (env ? env->rep : nullptr);
+}
+
+void rocksdb_backup_engine_options_set_share_table_files(
+ rocksdb_backup_engine_options_t* options, unsigned char val) {
+ options->rep.share_table_files = val;
+}
+
+unsigned char rocksdb_backup_engine_options_get_share_table_files(
+ rocksdb_backup_engine_options_t* options) {
+ return options->rep.share_table_files;
+}
+
+void rocksdb_backup_engine_options_set_sync(
+ rocksdb_backup_engine_options_t* options, unsigned char val) {
+ options->rep.sync = val;
+}
+
+unsigned char rocksdb_backup_engine_options_get_sync(
+ rocksdb_backup_engine_options_t* options) {
+ return options->rep.sync;
+}
+
+void rocksdb_backup_engine_options_set_destroy_old_data(
+ rocksdb_backup_engine_options_t* options, unsigned char val) {
+ options->rep.destroy_old_data = val;
+}
+
+unsigned char rocksdb_backup_engine_options_get_destroy_old_data(
+ rocksdb_backup_engine_options_t* options) {
+ return options->rep.destroy_old_data;
+}
+
+void rocksdb_backup_engine_options_set_backup_log_files(
+ rocksdb_backup_engine_options_t* options, unsigned char val) {
+ options->rep.backup_log_files = val;
+}
+
+unsigned char rocksdb_backup_engine_options_get_backup_log_files(
+ rocksdb_backup_engine_options_t* options) {
+ return options->rep.backup_log_files;
+}
+
+void rocksdb_backup_engine_options_set_backup_rate_limit(
+ rocksdb_backup_engine_options_t* options, uint64_t limit) {
+ options->rep.backup_rate_limit = limit;
+}
+
+uint64_t rocksdb_backup_engine_options_get_backup_rate_limit(
+ rocksdb_backup_engine_options_t* options) {
+ return options->rep.backup_rate_limit;
+}
+
+void rocksdb_backup_engine_options_set_restore_rate_limit(
+ rocksdb_backup_engine_options_t* options, uint64_t limit) {
+ options->rep.restore_rate_limit = limit;
+}
+
+uint64_t rocksdb_backup_engine_options_get_restore_rate_limit(
+ rocksdb_backup_engine_options_t* options) {
+ return options->rep.restore_rate_limit;
+}
+
+void rocksdb_backup_engine_options_set_max_background_operations(
+ rocksdb_backup_engine_options_t* options, int val) {
+ options->rep.max_background_operations = val;
+}
+
+int rocksdb_backup_engine_options_get_max_background_operations(
+ rocksdb_backup_engine_options_t* options) {
+ return options->rep.max_background_operations;
+}
+
+void rocksdb_backup_engine_options_set_callback_trigger_interval_size(
+ rocksdb_backup_engine_options_t* options, uint64_t size) {
+ options->rep.callback_trigger_interval_size = size;
+}
+
+uint64_t rocksdb_backup_engine_options_get_callback_trigger_interval_size(
+ rocksdb_backup_engine_options_t* options) {
+ return options->rep.callback_trigger_interval_size;
+}
+
+void rocksdb_backup_engine_options_set_max_valid_backups_to_open(
+ rocksdb_backup_engine_options_t* options, int val) {
+ options->rep.max_valid_backups_to_open = val;
+}
+
+int rocksdb_backup_engine_options_get_max_valid_backups_to_open(
+ rocksdb_backup_engine_options_t* options) {
+ return options->rep.max_valid_backups_to_open;
+}
+
+void rocksdb_backup_engine_options_set_share_files_with_checksum_naming(
+ rocksdb_backup_engine_options_t* options, int val) {
+ options->rep.share_files_with_checksum_naming =
+ static_cast<BackupEngineOptions::ShareFilesNaming>(val);
+}
+
+int rocksdb_backup_engine_options_get_share_files_with_checksum_naming(
+ rocksdb_backup_engine_options_t* options) {
+ return static_cast<int>(options->rep.share_files_with_checksum_naming);
+}
+
+void rocksdb_backup_engine_options_destroy(
+ rocksdb_backup_engine_options_t* options) {
+ delete options;
+}
+
+rocksdb_checkpoint_t* rocksdb_checkpoint_object_create(rocksdb_t* db,
+ char** errptr) {
+ Checkpoint* checkpoint;
+ if (SaveError(errptr, Checkpoint::Create(db->rep, &checkpoint))) {
+ return nullptr;
+ }
+ rocksdb_checkpoint_t* result = new rocksdb_checkpoint_t;
+ result->rep = checkpoint;
+ return result;
+}
+
+void rocksdb_checkpoint_create(rocksdb_checkpoint_t* checkpoint,
+ const char* checkpoint_dir,
+ uint64_t log_size_for_flush, char** errptr) {
+ SaveError(errptr, checkpoint->rep->CreateCheckpoint(
+ std::string(checkpoint_dir), log_size_for_flush));
+}
+
+void rocksdb_checkpoint_object_destroy(rocksdb_checkpoint_t* checkpoint) {
+ delete checkpoint->rep;
+ delete checkpoint;
+}
+
+void rocksdb_close(rocksdb_t* db) {
+ delete db->rep;
+ delete db;
+}
+
+void rocksdb_options_set_uint64add_merge_operator(rocksdb_options_t* opt) {
+ opt->rep.merge_operator =
+ ROCKSDB_NAMESPACE::MergeOperators::CreateUInt64AddOperator();
+}
+
+rocksdb_t* rocksdb_open_and_trim_history(
+ const rocksdb_options_t* db_options, const char* name,
+ int num_column_families, const char* const* column_family_names,
+ const rocksdb_options_t* const* column_family_options,
+ rocksdb_column_family_handle_t** column_family_handles, char* trim_ts,
+ size_t trim_tslen, char** errptr) {
+ std::vector<ColumnFamilyDescriptor> column_families;
+ for (int i = 0; i < num_column_families; i++) {
+ column_families.push_back(ColumnFamilyDescriptor(
+ std::string(column_family_names[i]),
+ ColumnFamilyOptions(column_family_options[i]->rep)));
+ }
+
+ std::string trim_ts_(trim_ts, trim_tslen);
+
+ DB* db;
+ std::vector<ColumnFamilyHandle*> handles;
+ if (SaveError(errptr, DB::OpenAndTrimHistory(
+ DBOptions(db_options->rep), std::string(name),
+ column_families, &handles, &db, trim_ts_))) {
+ return nullptr;
+ }
+
+ for (size_t i = 0; i < handles.size(); i++) {
+ rocksdb_column_family_handle_t* c_handle =
+ new rocksdb_column_family_handle_t;
+ c_handle->rep = handles[i];
+ column_family_handles[i] = c_handle;
+ }
+ rocksdb_t* result = new rocksdb_t;
+ result->rep = db;
+ return result;
+}
+
+rocksdb_t* rocksdb_open_column_families(
+ const rocksdb_options_t* db_options, const char* name,
+ int num_column_families, const char* const* column_family_names,
+ const rocksdb_options_t* const* column_family_options,
+ rocksdb_column_family_handle_t** column_family_handles, char** errptr) {
+ std::vector<ColumnFamilyDescriptor> column_families;
+ for (int i = 0; i < num_column_families; i++) {
+ column_families.push_back(ColumnFamilyDescriptor(
+ std::string(column_family_names[i]),
+ ColumnFamilyOptions(column_family_options[i]->rep)));
+ }
+
+ DB* db;
+ std::vector<ColumnFamilyHandle*> handles;
+ if (SaveError(errptr, DB::Open(DBOptions(db_options->rep), std::string(name),
+ column_families, &handles, &db))) {
+ return nullptr;
+ }
+
+ for (size_t i = 0; i < handles.size(); i++) {
+ rocksdb_column_family_handle_t* c_handle =
+ new rocksdb_column_family_handle_t;
+ c_handle->rep = handles[i];
+ column_family_handles[i] = c_handle;
+ }
+ rocksdb_t* result = new rocksdb_t;
+ result->rep = db;
+ return result;
+}
+
+rocksdb_t* rocksdb_open_column_families_with_ttl(
+ const rocksdb_options_t* db_options, const char* name,
+ int num_column_families, const char* const* column_family_names,
+ const rocksdb_options_t* const* column_family_options,
+ rocksdb_column_family_handle_t** column_family_handles, const int* ttls,
+ char** errptr) {
+ std::vector<int32_t> ttls_vec;
+ std::vector<ColumnFamilyDescriptor> column_families;
+ for (int i = 0; i < num_column_families; i++) {
+ ttls_vec.push_back(ttls[i]);
+
+ column_families.push_back(ColumnFamilyDescriptor(
+ std::string(column_family_names[i]),
+ ColumnFamilyOptions(column_family_options[i]->rep)));
+ }
+
+ ROCKSDB_NAMESPACE::DBWithTTL* db;
+ std::vector<ColumnFamilyHandle*> handles;
+ if (SaveError(errptr, ROCKSDB_NAMESPACE::DBWithTTL::Open(
+ DBOptions(db_options->rep), std::string(name),
+ column_families, &handles, &db, ttls_vec))) {
+ return nullptr;
+ }
+
+ for (size_t i = 0; i < handles.size(); i++) {
+ rocksdb_column_family_handle_t* c_handle =
+ new rocksdb_column_family_handle_t;
+ c_handle->rep = handles[i];
+ column_family_handles[i] = c_handle;
+ }
+ rocksdb_t* result = new rocksdb_t;
+ result->rep = db;
+ return result;
+}
+
+rocksdb_t* rocksdb_open_for_read_only_column_families(
+ const rocksdb_options_t* db_options, const char* name,
+ int num_column_families, const char* const* column_family_names,
+ const rocksdb_options_t* const* column_family_options,
+ rocksdb_column_family_handle_t** column_family_handles,
+ unsigned char error_if_wal_file_exists, char** errptr) {
+ std::vector<ColumnFamilyDescriptor> column_families;
+ for (int i = 0; i < num_column_families; i++) {
+ column_families.push_back(ColumnFamilyDescriptor(
+ std::string(column_family_names[i]),
+ ColumnFamilyOptions(column_family_options[i]->rep)));
+ }
+
+ DB* db;
+ std::vector<ColumnFamilyHandle*> handles;
+ if (SaveError(errptr,
+ DB::OpenForReadOnly(DBOptions(db_options->rep),
+ std::string(name), column_families,
+ &handles, &db, error_if_wal_file_exists))) {
+ return nullptr;
+ }
+
+ for (size_t i = 0; i < handles.size(); i++) {
+ rocksdb_column_family_handle_t* c_handle =
+ new rocksdb_column_family_handle_t;
+ c_handle->rep = handles[i];
+ column_family_handles[i] = c_handle;
+ }
+ rocksdb_t* result = new rocksdb_t;
+ result->rep = db;
+ return result;
+}
+
+rocksdb_t* rocksdb_open_as_secondary_column_families(
+ const rocksdb_options_t* db_options, const char* name,
+ const char* secondary_path, int num_column_families,
+ const char* const* column_family_names,
+ const rocksdb_options_t* const* column_family_options,
+ rocksdb_column_family_handle_t** column_family_handles, char** errptr) {
+ std::vector<ColumnFamilyDescriptor> column_families;
+ for (int i = 0; i != num_column_families; ++i) {
+ column_families.emplace_back(
+ std::string(column_family_names[i]),
+ ColumnFamilyOptions(column_family_options[i]->rep));
+ }
+ DB* db;
+ std::vector<ColumnFamilyHandle*> handles;
+ if (SaveError(errptr, DB::OpenAsSecondary(DBOptions(db_options->rep),
+ std::string(name),
+ std::string(secondary_path),
+ column_families, &handles, &db))) {
+ return nullptr;
+ }
+ for (size_t i = 0; i != handles.size(); ++i) {
+ rocksdb_column_family_handle_t* c_handle =
+ new rocksdb_column_family_handle_t;
+ c_handle->rep = handles[i];
+ column_family_handles[i] = c_handle;
+ }
+ rocksdb_t* result = new rocksdb_t;
+ result->rep = db;
+ return result;
+}
+
+char** rocksdb_list_column_families(const rocksdb_options_t* options,
+ const char* name, size_t* lencfs,
+ char** errptr) {
+ std::vector<std::string> fams;
+ SaveError(errptr, DB::ListColumnFamilies(DBOptions(options->rep),
+ std::string(name), &fams));
+
+ *lencfs = fams.size();
+ char** column_families =
+ static_cast<char**>(malloc(sizeof(char*) * fams.size()));
+ for (size_t i = 0; i < fams.size(); i++) {
+ column_families[i] = strdup(fams[i].c_str());
+ }
+ return column_families;
+}
+
+void rocksdb_list_column_families_destroy(char** list, size_t len) {
+ for (size_t i = 0; i < len; ++i) {
+ free(list[i]);
+ }
+ free(list);
+}
+
+rocksdb_column_family_handle_t* rocksdb_create_column_family(
+ rocksdb_t* db, const rocksdb_options_t* column_family_options,
+ const char* column_family_name, char** errptr) {
+ rocksdb_column_family_handle_t* handle = new rocksdb_column_family_handle_t;
+ SaveError(errptr, db->rep->CreateColumnFamily(
+ ColumnFamilyOptions(column_family_options->rep),
+ std::string(column_family_name), &(handle->rep)));
+ return handle;
+}
+
+rocksdb_column_family_handle_t* rocksdb_create_column_family_with_ttl(
+ rocksdb_t* db, const rocksdb_options_t* column_family_options,
+ const char* column_family_name, int ttl, char** errptr) {
+ ROCKSDB_NAMESPACE::DBWithTTL* db_with_ttl =
+ static_cast<ROCKSDB_NAMESPACE::DBWithTTL*>(db->rep);
+ rocksdb_column_family_handle_t* handle = new rocksdb_column_family_handle_t;
+ SaveError(errptr, db_with_ttl->CreateColumnFamilyWithTtl(
+ ColumnFamilyOptions(column_family_options->rep),
+ std::string(column_family_name), &(handle->rep), ttl));
+ return handle;
+}
+
+void rocksdb_drop_column_family(rocksdb_t* db,
+ rocksdb_column_family_handle_t* handle,
+ char** errptr) {
+ SaveError(errptr, db->rep->DropColumnFamily(handle->rep));
+}
+
+uint32_t rocksdb_column_family_handle_get_id(
+ rocksdb_column_family_handle_t* handle) {
+ return handle->rep->GetID();
+}
+
+char* rocksdb_column_family_handle_get_name(
+ rocksdb_column_family_handle_t* handle, size_t* name_len) {
+ auto name = handle->rep->GetName();
+ *name_len = name.size();
+ return CopyString(name);
+}
+
+void rocksdb_column_family_handle_destroy(
+ rocksdb_column_family_handle_t* handle) {
+ delete handle->rep;
+ delete handle;
+}
+
+void rocksdb_put(rocksdb_t* db, const rocksdb_writeoptions_t* options,
+ const char* key, size_t keylen, const char* val, size_t vallen,
+ char** errptr) {
+ SaveError(errptr,
+ db->rep->Put(options->rep, Slice(key, keylen), Slice(val, vallen)));
+}
+
+void rocksdb_put_cf(rocksdb_t* db, const rocksdb_writeoptions_t* options,
+ rocksdb_column_family_handle_t* column_family,
+ const char* key, size_t keylen, const char* val,
+ size_t vallen, char** errptr) {
+ SaveError(errptr, db->rep->Put(options->rep, column_family->rep,
+ Slice(key, keylen), Slice(val, vallen)));
+}
+
+void rocksdb_put_with_ts(rocksdb_t* db, const rocksdb_writeoptions_t* options,
+ const char* key, size_t keylen, const char* ts,
+ size_t tslen, const char* val, size_t vallen,
+ char** errptr) {
+ SaveError(errptr, db->rep->Put(options->rep, Slice(key, keylen),
+ Slice(ts, tslen), Slice(val, vallen)));
+}
+
+void rocksdb_put_cf_with_ts(rocksdb_t* db,
+ const rocksdb_writeoptions_t* options,
+ rocksdb_column_family_handle_t* column_family,
+ const char* key, size_t keylen, const char* ts,
+ size_t tslen, const char* val, size_t vallen,
+ char** errptr) {
+ SaveError(errptr,
+ db->rep->Put(options->rep, column_family->rep, Slice(key, keylen),
+ Slice(ts, tslen), Slice(val, vallen)));
+}
+
+void rocksdb_delete(rocksdb_t* db, const rocksdb_writeoptions_t* options,
+ const char* key, size_t keylen, char** errptr) {
+ SaveError(errptr, db->rep->Delete(options->rep, Slice(key, keylen)));
+}
+
+void rocksdb_delete_cf(rocksdb_t* db, const rocksdb_writeoptions_t* options,
+ rocksdb_column_family_handle_t* column_family,
+ const char* key, size_t keylen, char** errptr) {
+ SaveError(errptr, db->rep->Delete(options->rep, column_family->rep,
+ Slice(key, keylen)));
+}
+
+void rocksdb_delete_with_ts(rocksdb_t* db,
+ const rocksdb_writeoptions_t* options,
+ const char* key, size_t keylen, const char* ts,
+ size_t tslen, char** errptr) {
+ SaveError(errptr, db->rep->Delete(options->rep, Slice(key, keylen),
+ Slice(ts, tslen)));
+}
+
+void rocksdb_delete_cf_with_ts(rocksdb_t* db,
+ const rocksdb_writeoptions_t* options,
+ rocksdb_column_family_handle_t* column_family,
+ const char* key, size_t keylen, const char* ts,
+ size_t tslen, char** errptr) {
+ SaveError(errptr, db->rep->Delete(options->rep, column_family->rep,
+ Slice(key, keylen), Slice(ts, tslen)));
+}
+
+void rocksdb_singledelete(rocksdb_t* db, const rocksdb_writeoptions_t* options,
+ const char* key, size_t keylen, char** errptr) {
+ SaveError(errptr, db->rep->SingleDelete(options->rep, Slice(key, keylen)));
+}
+
+void rocksdb_singledelete_cf(rocksdb_t* db,
+ const rocksdb_writeoptions_t* options,
+ rocksdb_column_family_handle_t* column_family,
+ const char* key, size_t keylen, char** errptr) {
+ SaveError(errptr, db->rep->SingleDelete(options->rep, column_family->rep,
+ Slice(key, keylen)));
+}
+
+void rocksdb_singledelete_with_ts(rocksdb_t* db,
+ const rocksdb_writeoptions_t* options,
+ const char* key, size_t keylen,
+ const char* ts, size_t tslen, char** errptr) {
+ SaveError(errptr, db->rep->SingleDelete(options->rep, Slice(key, keylen),
+ Slice(ts, tslen)));
+}
+
+void rocksdb_singledelete_cf_with_ts(
+ rocksdb_t* db, const rocksdb_writeoptions_t* options,
+ rocksdb_column_family_handle_t* column_family, const char* key,
+ size_t keylen, const char* ts, size_t tslen, char** errptr) {
+ SaveError(errptr,
+ db->rep->SingleDelete(options->rep, column_family->rep,
+ Slice(key, keylen), Slice(ts, tslen)));
+}
+
+void rocksdb_increase_full_history_ts_low(
+ rocksdb_t* db, rocksdb_column_family_handle_t* column_family,
+ const char* ts_low, size_t ts_lowlen, char** errptr) {
+ std::string ts(ts_low, ts_lowlen);
+ SaveError(errptr, db->rep->IncreaseFullHistoryTsLow(column_family->rep, ts));
+}
+
+char* rocksdb_get_full_history_ts_low(
+ rocksdb_t* db, rocksdb_column_family_handle_t* column_family,
+ size_t* ts_len, char** errptr) {
+ char* result = nullptr;
+ std::string tmp;
+ Status s = db->rep->GetFullHistoryTsLow(column_family->rep, &tmp);
+ if (s.ok()) {
+ *ts_len = tmp.size();
+ result = CopyString(tmp);
+ } else {
+ *ts_len = 0;
+ SaveError(errptr, s);
+ }
+ return result;
+}
+
+void rocksdb_delete_range_cf(rocksdb_t* db,
+ const rocksdb_writeoptions_t* options,
+ rocksdb_column_family_handle_t* column_family,
+ const char* start_key, size_t start_key_len,
+ const char* end_key, size_t end_key_len,
+ char** errptr) {
+ SaveError(errptr, db->rep->DeleteRange(options->rep, column_family->rep,
+ Slice(start_key, start_key_len),
+ Slice(end_key, end_key_len)));
+}
+
+void rocksdb_merge(rocksdb_t* db, const rocksdb_writeoptions_t* options,
+ const char* key, size_t keylen, const char* val,
+ size_t vallen, char** errptr) {
+ SaveError(errptr, db->rep->Merge(options->rep, Slice(key, keylen),
+ Slice(val, vallen)));
+}
+
+void rocksdb_merge_cf(rocksdb_t* db, const rocksdb_writeoptions_t* options,
+ rocksdb_column_family_handle_t* column_family,
+ const char* key, size_t keylen, const char* val,
+ size_t vallen, char** errptr) {
+ SaveError(errptr, db->rep->Merge(options->rep, column_family->rep,
+ Slice(key, keylen), Slice(val, vallen)));
+}
+
+void rocksdb_write(rocksdb_t* db, const rocksdb_writeoptions_t* options,
+ rocksdb_writebatch_t* batch, char** errptr) {
+ SaveError(errptr, db->rep->Write(options->rep, &batch->rep));
+}
+
+char* rocksdb_get(rocksdb_t* db, const rocksdb_readoptions_t* options,
+ const char* key, size_t keylen, size_t* vallen,
+ char** errptr) {
+ char* result = nullptr;
+ std::string tmp;
+ Status s = db->rep->Get(options->rep, Slice(key, keylen), &tmp);
+ if (s.ok()) {
+ *vallen = tmp.size();
+ result = CopyString(tmp);
+ } else {
+ *vallen = 0;
+ if (!s.IsNotFound()) {
+ SaveError(errptr, s);
+ }
+ }
+ return result;
+}
+
+char* rocksdb_get_cf(rocksdb_t* db, const rocksdb_readoptions_t* options,
+ rocksdb_column_family_handle_t* column_family,
+ const char* key, size_t keylen, size_t* vallen,
+ char** errptr) {
+ char* result = nullptr;
+ std::string tmp;
+ Status s =
+ db->rep->Get(options->rep, column_family->rep, Slice(key, keylen), &tmp);
+ if (s.ok()) {
+ *vallen = tmp.size();
+ result = CopyString(tmp);
+ } else {
+ *vallen = 0;
+ if (!s.IsNotFound()) {
+ SaveError(errptr, s);
+ }
+ }
+ return result;
+}
+
+char* rocksdb_get_with_ts(rocksdb_t* db, const rocksdb_readoptions_t* options,
+ const char* key, size_t keylen, size_t* vallen,
+ char** ts, size_t* tslen, char** errptr) {
+ char* result = nullptr;
+ std::string tmp_val;
+ std::string tmp_ts;
+ Status s = db->rep->Get(options->rep, Slice(key, keylen), &tmp_val, &tmp_ts);
+ if (s.ok()) {
+ *vallen = tmp_val.size();
+ result = CopyString(tmp_val);
+ *tslen = tmp_ts.size();
+ *ts = CopyString(tmp_ts);
+ } else {
+ *vallen = 0;
+ *tslen = 0;
+ if (!s.IsNotFound()) {
+ SaveError(errptr, s);
+ }
+ }
+ return result;
+}
+
+char* rocksdb_get_cf_with_ts(rocksdb_t* db,
+ const rocksdb_readoptions_t* options,
+ rocksdb_column_family_handle_t* column_family,
+ const char* key, size_t keylen, size_t* vallen,
+ char** ts, size_t* tslen, char** errptr) {
+ char* result = nullptr;
+ std::string tmp;
+ std::string tmp_ts;
+ Status s = db->rep->Get(options->rep, column_family->rep, Slice(key, keylen),
+ &tmp, &tmp_ts);
+ if (s.ok()) {
+ *vallen = tmp.size();
+ result = CopyString(tmp);
+ *tslen = tmp_ts.size();
+ *ts = CopyString(tmp_ts);
+ } else {
+ *vallen = 0;
+ *tslen = 0;
+ if (!s.IsNotFound()) {
+ SaveError(errptr, s);
+ }
+ }
+ return result;
+}
+
+void rocksdb_multi_get(rocksdb_t* db, const rocksdb_readoptions_t* options,
+ size_t num_keys, const char* const* keys_list,
+ const size_t* keys_list_sizes, char** values_list,
+ size_t* values_list_sizes, char** errs) {
+ std::vector<Slice> keys(num_keys);
+ for (size_t i = 0; i < num_keys; i++) {
+ keys[i] = Slice(keys_list[i], keys_list_sizes[i]);
+ }
+ std::vector<std::string> values(num_keys);
+ std::vector<Status> statuses = db->rep->MultiGet(options->rep, keys, &values);
+ for (size_t i = 0; i < num_keys; i++) {
+ if (statuses[i].ok()) {
+ values_list[i] = CopyString(values[i]);
+ values_list_sizes[i] = values[i].size();
+ errs[i] = nullptr;
+ } else {
+ values_list[i] = nullptr;
+ values_list_sizes[i] = 0;
+ if (!statuses[i].IsNotFound()) {
+ errs[i] = strdup(statuses[i].ToString().c_str());
+ } else {
+ errs[i] = nullptr;
+ }
+ }
+ }
+}
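+
+// Caller-side sketch for consuming rocksdb_multi_get() results: every
+// non-null entry in values_list and errs is heap-allocated above (via
+// CopyString()/strdup()) and must be freed by the caller; a null value with
+// a null error simply means the key was not found:
+//
+//   for (size_t i = 0; i < num_keys; i++) {
+//     if (errs[i] != NULL) {
+//       /* per-key failure */
+//       free(errs[i]);
+//     } else if (values_list[i] != NULL) {
+//       /* values_list_sizes[i] bytes, not NUL-terminated */
+//       free(values_list[i]);
+//     }
+//   }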
+
+void rocksdb_multi_get_with_ts(rocksdb_t* db,
+ const rocksdb_readoptions_t* options,
+ size_t num_keys, const char* const* keys_list,
+ const size_t* keys_list_sizes,
+ char** values_list, size_t* values_list_sizes,
+ char** timestamp_list,
+ size_t* timestamp_list_sizes, char** errs) {
+ std::vector<Slice> keys(num_keys);
+ for (size_t i = 0; i < num_keys; i++) {
+ keys[i] = Slice(keys_list[i], keys_list_sizes[i]);
+ }
+ std::vector<std::string> values(num_keys);
+ std::vector<std::string> timestamps(num_keys);
+ std::vector<Status> statuses =
+ db->rep->MultiGet(options->rep, keys, &values, &timestamps);
+ for (size_t i = 0; i < num_keys; i++) {
+ if (statuses[i].ok()) {
+ values_list[i] = CopyString(values[i]);
+ values_list_sizes[i] = values[i].size();
+ timestamp_list[i] = CopyString(timestamps[i]);
+ timestamp_list_sizes[i] = timestamps[i].size();
+ errs[i] = nullptr;
+ } else {
+ values_list[i] = nullptr;
+ values_list_sizes[i] = 0;
+ timestamp_list[i] = nullptr;
+ timestamp_list_sizes[i] = 0;
+ if (!statuses[i].IsNotFound()) {
+ errs[i] = strdup(statuses[i].ToString().c_str());
+ } else {
+ errs[i] = nullptr;
+ }
+ }
+ }
+}
+
+void rocksdb_multi_get_cf(
+ rocksdb_t* db, const rocksdb_readoptions_t* options,
+ const rocksdb_column_family_handle_t* const* column_families,
+ size_t num_keys, const char* const* keys_list,
+ const size_t* keys_list_sizes, char** values_list,
+ size_t* values_list_sizes, char** errs) {
+ std::vector<Slice> keys(num_keys);
+ std::vector<ColumnFamilyHandle*> cfs(num_keys);
+ for (size_t i = 0; i < num_keys; i++) {
+ keys[i] = Slice(keys_list[i], keys_list_sizes[i]);
+ cfs[i] = column_families[i]->rep;
+ }
+ std::vector<std::string> values(num_keys);
+ std::vector<Status> statuses =
+ db->rep->MultiGet(options->rep, cfs, keys, &values);
+ for (size_t i = 0; i < num_keys; i++) {
+ if (statuses[i].ok()) {
+ values_list[i] = CopyString(values[i]);
+ values_list_sizes[i] = values[i].size();
+ errs[i] = nullptr;
+ } else {
+ values_list[i] = nullptr;
+ values_list_sizes[i] = 0;
+ if (!statuses[i].IsNotFound()) {
+ errs[i] = strdup(statuses[i].ToString().c_str());
+ } else {
+ errs[i] = nullptr;
+ }
+ }
+ }
+}
+
+void rocksdb_multi_get_cf_with_ts(
+ rocksdb_t* db, const rocksdb_readoptions_t* options,
+ const rocksdb_column_family_handle_t* const* column_families,
+ size_t num_keys, const char* const* keys_list,
+ const size_t* keys_list_sizes, char** values_list,
+ size_t* values_list_sizes, char** timestamps_list,
+ size_t* timestamps_list_sizes, char** errs) {
+ std::vector<Slice> keys(num_keys);
+ std::vector<ColumnFamilyHandle*> cfs(num_keys);
+ for (size_t i = 0; i < num_keys; i++) {
+ keys[i] = Slice(keys_list[i], keys_list_sizes[i]);
+ cfs[i] = column_families[i]->rep;
+ }
+ std::vector<std::string> values(num_keys);
+ std::vector<std::string> timestamps(num_keys);
+ std::vector<Status> statuses =
+ db->rep->MultiGet(options->rep, cfs, keys, &values, &timestamps);
+ for (size_t i = 0; i < num_keys; i++) {
+ if (statuses[i].ok()) {
+ values_list[i] = CopyString(values[i]);
+ values_list_sizes[i] = values[i].size();
+ timestamps_list[i] = CopyString(timestamps[i]);
+ timestamps_list_sizes[i] = timestamps[i].size();
+ errs[i] = nullptr;
+ } else {
+ values_list[i] = nullptr;
+ values_list_sizes[i] = 0;
+ timestamps_list[i] = nullptr;
+ timestamps_list_sizes[i] = 0;
+ if (!statuses[i].IsNotFound()) {
+ errs[i] = strdup(statuses[i].ToString().c_str());
+ } else {
+ errs[i] = nullptr;
+ }
+ }
+ }
+}
+
+void rocksdb_batched_multi_get_cf(rocksdb_t* db,
+ const rocksdb_readoptions_t* options,
+ rocksdb_column_family_handle_t* column_family,
+ size_t num_keys, const char* const* keys_list,
+ const size_t* keys_list_sizes,
+ rocksdb_pinnableslice_t** values, char** errs,
+ const bool sorted_input) {
+ Slice* key_slices = new Slice[num_keys];
+ PinnableSlice* value_slices = new PinnableSlice[num_keys];
+ Status* statuses = new Status[num_keys];
+ for (size_t i = 0; i < num_keys; ++i) {
+ key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
+ }
+
+ db->rep->MultiGet(options->rep, column_family->rep, num_keys, key_slices,
+ value_slices, statuses, sorted_input);
+
+ for (size_t i = 0; i < num_keys; ++i) {
+ if (statuses[i].ok()) {
+ values[i] = new (rocksdb_pinnableslice_t);
+ values[i]->rep = std::move(value_slices[i]);
+ errs[i] = nullptr;
+ } else {
+ values[i] = nullptr;
+ if (!statuses[i].IsNotFound()) {
+ errs[i] = strdup(statuses[i].ToString().c_str());
+ } else {
+ errs[i] = nullptr;
+ }
+ }
+ }
+
+ delete[] key_slices;
+ delete[] value_slices;
+ delete[] statuses;
+}
+
+unsigned char rocksdb_key_may_exist(rocksdb_t* db,
+ const rocksdb_readoptions_t* options,
+ const char* key, size_t key_len,
+ char** value, size_t* val_len,
+ const char* timestamp, size_t timestamp_len,
+ unsigned char* value_found) {
+ std::string tmp;
+ std::string time;
+ if (timestamp) {
+ time.assign(timestamp, timestamp_len);
+ }
+ bool found = false;
+ const bool result = db->rep->KeyMayExist(options->rep, Slice(key, key_len),
+ &tmp, timestamp ? &time : nullptr,
+ value_found ? &found : nullptr);
+ if (value_found) {
+ *value_found = found;
+ if (found) {
+ *val_len = tmp.size();
+ *value = CopyString(tmp);
+ }
+ }
+ return result;
+}
+
+unsigned char rocksdb_key_may_exist_cf(
+ rocksdb_t* db, const rocksdb_readoptions_t* options,
+ rocksdb_column_family_handle_t* column_family, const char* key,
+ size_t key_len, char** value, size_t* val_len, const char* timestamp,
+ size_t timestamp_len, unsigned char* value_found) {
+ std::string tmp;
+ std::string time;
+ if (timestamp) {
+ time.assign(timestamp, timestamp_len);
+ }
+ bool found = false;
+ const bool result = db->rep->KeyMayExist(
+ options->rep, column_family->rep, Slice(key, key_len), &tmp,
+ timestamp ? &time : nullptr, value_found ? &found : nullptr);
+ if (value_found) {
+ *value_found = found;
+ if (found) {
+ *val_len = tmp.size();
+ *value = CopyString(tmp);
+ }
+ }
+ return result;
+}
+
+rocksdb_iterator_t* rocksdb_create_iterator(
+ rocksdb_t* db, const rocksdb_readoptions_t* options) {
+ rocksdb_iterator_t* result = new rocksdb_iterator_t;
+ result->rep = db->rep->NewIterator(options->rep);
+ return result;
+}
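+
+// A hedged iteration sketch using the iterator entry points defined later in
+// this file (rocksdb_iter_seek_to_first/valid/next/key/value/destroy). Here
+// db is assumed to be an open rocksdb_t* and ro a rocksdb_readoptions_t*;
+// the key/value pointers borrow from the iterator and are only valid until
+// the next move or destroy:
+//
+//   rocksdb_iterator_t* it = rocksdb_create_iterator(db, ro);
+//   for (rocksdb_iter_seek_to_first(it); rocksdb_iter_valid(it);
+//        rocksdb_iter_next(it)) {
+//     size_t klen, vlen;
+//     const char* k = rocksdb_iter_key(it, &klen);
+//     const char* v = rocksdb_iter_value(it, &vlen);
+//     /* use k[0..klen) and v[0..vlen) */
+//   }
+//   char* err = NULL;
+//   rocksdb_iter_get_error(it, &err);
+//   free(err);
+//   rocksdb_iter_destroy(it);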
+
+rocksdb_wal_iterator_t* rocksdb_get_updates_since(
+ rocksdb_t* db, uint64_t seq_number,
+ const rocksdb_wal_readoptions_t* options, char** errptr) {
+ std::unique_ptr<TransactionLogIterator> iter;
+ TransactionLogIterator::ReadOptions ro;
+ if (options != nullptr) {
+ ro = options->rep;
+ }
+ if (SaveError(errptr, db->rep->GetUpdatesSince(seq_number, &iter, ro))) {
+ return nullptr;
+ }
+ rocksdb_wal_iterator_t* result = new rocksdb_wal_iterator_t;
+ result->rep = iter.release();
+ return result;
+}
+
+void rocksdb_wal_iter_next(rocksdb_wal_iterator_t* iter) { iter->rep->Next(); }
+
+unsigned char rocksdb_wal_iter_valid(const rocksdb_wal_iterator_t* iter) {
+ return iter->rep->Valid();
+}
+
+void rocksdb_wal_iter_status(const rocksdb_wal_iterator_t* iter,
+ char** errptr) {
+ SaveError(errptr, iter->rep->status());
+}
+
+void rocksdb_wal_iter_destroy(const rocksdb_wal_iterator_t* iter) {
+ delete iter->rep;
+ delete iter;
+}
+
+rocksdb_writebatch_t* rocksdb_wal_iter_get_batch(
+ const rocksdb_wal_iterator_t* iter, uint64_t* seq) {
+ rocksdb_writebatch_t* result = rocksdb_writebatch_create();
+ BatchResult wal_batch = iter->rep->GetBatch();
+ result->rep = std::move(*wal_batch.writeBatchPtr);
+ if (seq != nullptr) {
+ *seq = wal_batch.sequence;
+ }
+ return result;
+}
+
+uint64_t rocksdb_get_latest_sequence_number(rocksdb_t* db) {
+ return db->rep->GetLatestSequenceNumber();
+}
+
+rocksdb_iterator_t* rocksdb_create_iterator_cf(
+ rocksdb_t* db, const rocksdb_readoptions_t* options,
+ rocksdb_column_family_handle_t* column_family) {
+ rocksdb_iterator_t* result = new rocksdb_iterator_t;
+ result->rep = db->rep->NewIterator(options->rep, column_family->rep);
+ return result;
+}
+
+void rocksdb_create_iterators(rocksdb_t* db, rocksdb_readoptions_t* opts,
+ rocksdb_column_family_handle_t** column_families,
+ rocksdb_iterator_t** iterators, size_t size,
+ char** errptr) {
+ std::vector<ColumnFamilyHandle*> column_families_vec;
+ for (size_t i = 0; i < size; i++) {
+ column_families_vec.push_back(column_families[i]->rep);
+ }
+
+ std::vector<Iterator*> res;
+ Status status = db->rep->NewIterators(opts->rep, column_families_vec, &res);
+ assert(res.size() == size);
+ if (SaveError(errptr, status)) {
+ return;
+ }
+
+ for (size_t i = 0; i < size; i++) {
+ iterators[i] = new rocksdb_iterator_t;
+ iterators[i]->rep = res[i];
+ }
+}
+
+const rocksdb_snapshot_t* rocksdb_create_snapshot(rocksdb_t* db) {
+ rocksdb_snapshot_t* result = new rocksdb_snapshot_t;
+ result->rep = db->rep->GetSnapshot();
+ return result;
+}
+
+void rocksdb_release_snapshot(rocksdb_t* db,
+ const rocksdb_snapshot_t* snapshot) {
+ db->rep->ReleaseSnapshot(snapshot->rep);
+ delete snapshot;
+}
+
+char* rocksdb_property_value(rocksdb_t* db, const char* propname) {
+ std::string tmp;
+ if (db->rep->GetProperty(Slice(propname), &tmp)) {
+ // We use strdup() since we expect human readable output.
+ return strdup(tmp.c_str());
+ } else {
+ return nullptr;
+ }
+}
+
+int rocksdb_property_int(rocksdb_t* db, const char* propname,
+ uint64_t* out_val) {
+ if (db->rep->GetIntProperty(Slice(propname), out_val)) {
+ return 0;
+ } else {
+ return -1;
+ }
+}
+
+int rocksdb_property_int_cf(rocksdb_t* db,
+ rocksdb_column_family_handle_t* column_family,
+ const char* propname, uint64_t* out_val) {
+ if (db->rep->GetIntProperty(column_family->rep, Slice(propname), out_val)) {
+ return 0;
+ } else {
+ return -1;
+ }
+}
+
+char* rocksdb_property_value_cf(rocksdb_t* db,
+ rocksdb_column_family_handle_t* column_family,
+ const char* propname) {
+ std::string tmp;
+ if (db->rep->GetProperty(column_family->rep, Slice(propname), &tmp)) {
+ // We use strdup() since we expect human readable output.
+ return strdup(tmp.c_str());
+ } else {
+ return nullptr;
+ }
+}
+
+void rocksdb_approximate_sizes(rocksdb_t* db, int num_ranges,
+ const char* const* range_start_key,
+ const size_t* range_start_key_len,
+ const char* const* range_limit_key,
+ const size_t* range_limit_key_len,
+ uint64_t* sizes, char** errptr) {
+ Range* ranges = new Range[num_ranges];
+ for (int i = 0; i < num_ranges; i++) {
+ ranges[i].start = Slice(range_start_key[i], range_start_key_len[i]);
+ ranges[i].limit = Slice(range_limit_key[i], range_limit_key_len[i]);
+ }
+ Status s = db->rep->GetApproximateSizes(ranges, num_ranges, sizes);
+ if (!s.ok()) {
+ SaveError(errptr, s);
+ }
+ delete[] ranges;
+}
+
+void rocksdb_approximate_sizes_cf(
+ rocksdb_t* db, rocksdb_column_family_handle_t* column_family,
+ int num_ranges, const char* const* range_start_key,
+ const size_t* range_start_key_len, const char* const* range_limit_key,
+ const size_t* range_limit_key_len, uint64_t* sizes, char** errptr) {
+ Range* ranges = new Range[num_ranges];
+ for (int i = 0; i < num_ranges; i++) {
+ ranges[i].start = Slice(range_start_key[i], range_start_key_len[i]);
+ ranges[i].limit = Slice(range_limit_key[i], range_limit_key_len[i]);
+ }
+ Status s = db->rep->GetApproximateSizes(column_family->rep, ranges,
+ num_ranges, sizes);
+ if (!s.ok()) {
+ SaveError(errptr, s);
+ }
+ delete[] ranges;
+}
+
+void rocksdb_delete_file(rocksdb_t* db, const char* name) {
+ db->rep->DeleteFile(name);
+}
+
+const rocksdb_livefiles_t* rocksdb_livefiles(rocksdb_t* db) {
+ rocksdb_livefiles_t* result = new rocksdb_livefiles_t;
+ db->rep->GetLiveFilesMetaData(&result->rep);
+ return result;
+}
+
+void rocksdb_compact_range(rocksdb_t* db, const char* start_key,
+ size_t start_key_len, const char* limit_key,
+ size_t limit_key_len) {
+ Slice a, b;
+ db->rep->CompactRange(
+ CompactRangeOptions(),
+ // Pass nullptr Slice if corresponding "const char*" is nullptr
+ (start_key ? (a = Slice(start_key, start_key_len), &a) : nullptr),
+ (limit_key ? (b = Slice(limit_key, limit_key_len), &b) : nullptr));
+}
+
+void rocksdb_compact_range_cf(rocksdb_t* db,
+ rocksdb_column_family_handle_t* column_family,
+ const char* start_key, size_t start_key_len,
+ const char* limit_key, size_t limit_key_len) {
+ Slice a, b;
+ db->rep->CompactRange(
+ CompactRangeOptions(), column_family->rep,
+ // Pass nullptr Slice if corresponding "const char*" is nullptr
+ (start_key ? (a = Slice(start_key, start_key_len), &a) : nullptr),
+ (limit_key ? (b = Slice(limit_key, limit_key_len), &b) : nullptr));
+}
+
+void rocksdb_suggest_compact_range(rocksdb_t* db, const char* start_key,
+ size_t start_key_len, const char* limit_key,
+ size_t limit_key_len, char** errptr) {
+ Slice a, b;
+ Status s = ROCKSDB_NAMESPACE::experimental::SuggestCompactRange(
+ db->rep,
+ (start_key ? (a = Slice(start_key, start_key_len), &a) : nullptr),
+ (limit_key ? (b = Slice(limit_key, limit_key_len), &b) : nullptr));
+ SaveError(errptr, s);
+}
+
+void rocksdb_suggest_compact_range_cf(
+ rocksdb_t* db, rocksdb_column_family_handle_t* column_family,
+ const char* start_key, size_t start_key_len, const char* limit_key,
+ size_t limit_key_len, char** errptr) {
+ Slice a, b;
+ Status s = db->rep->SuggestCompactRange(
+ column_family->rep,
+ (start_key ? (a = Slice(start_key, start_key_len), &a) : nullptr),
+ (limit_key ? (b = Slice(limit_key, limit_key_len), &b) : nullptr));
+ SaveError(errptr, s);
+}
+
+void rocksdb_compact_range_opt(rocksdb_t* db, rocksdb_compactoptions_t* opt,
+ const char* start_key, size_t start_key_len,
+ const char* limit_key, size_t limit_key_len) {
+ Slice a, b;
+ db->rep->CompactRange(
+ opt->rep,
+ // Pass nullptr Slice if corresponding "const char*" is nullptr
+ (start_key ? (a = Slice(start_key, start_key_len), &a) : nullptr),
+ (limit_key ? (b = Slice(limit_key, limit_key_len), &b) : nullptr));
+}
+
+void rocksdb_compact_range_cf_opt(rocksdb_t* db,
+ rocksdb_column_family_handle_t* column_family,
+ rocksdb_compactoptions_t* opt,
+ const char* start_key, size_t start_key_len,
+ const char* limit_key, size_t limit_key_len) {
+ Slice a, b;
+ db->rep->CompactRange(
+ opt->rep, column_family->rep,
+ // Pass nullptr Slice if corresponding "const char*" is nullptr
+ (start_key ? (a = Slice(start_key, start_key_len), &a) : nullptr),
+ (limit_key ? (b = Slice(limit_key, limit_key_len), &b) : nullptr));
+}
+
+void rocksdb_flush(rocksdb_t* db, const rocksdb_flushoptions_t* options,
+ char** errptr) {
+ SaveError(errptr, db->rep->Flush(options->rep));
+}
+
+void rocksdb_flush_cf(rocksdb_t* db, const rocksdb_flushoptions_t* options,
+ rocksdb_column_family_handle_t* column_family,
+ char** errptr) {
+ SaveError(errptr, db->rep->Flush(options->rep, column_family->rep));
+}
+
+void rocksdb_flush_wal(rocksdb_t* db, unsigned char sync, char** errptr) {
+ SaveError(errptr, db->rep->FlushWAL(sync));
+}
+
+void rocksdb_disable_file_deletions(rocksdb_t* db, char** errptr) {
+ SaveError(errptr, db->rep->DisableFileDeletions());
+}
+
+void rocksdb_enable_file_deletions(rocksdb_t* db, unsigned char force,
+ char** errptr) {
+ SaveError(errptr, db->rep->EnableFileDeletions(force));
+}
+
+void rocksdb_destroy_db(const rocksdb_options_t* options, const char* name,
+ char** errptr) {
+ SaveError(errptr, DestroyDB(name, options->rep));
+}
+
+void rocksdb_repair_db(const rocksdb_options_t* options, const char* name,
+ char** errptr) {
+ SaveError(errptr, RepairDB(name, options->rep));
+}
+
+void rocksdb_iter_destroy(rocksdb_iterator_t* iter) {
+ delete iter->rep;
+ delete iter;
+}
+
+unsigned char rocksdb_iter_valid(const rocksdb_iterator_t* iter) {
+ return iter->rep->Valid();
+}
+
+void rocksdb_iter_seek_to_first(rocksdb_iterator_t* iter) {
+ iter->rep->SeekToFirst();
+}
+
+void rocksdb_iter_seek_to_last(rocksdb_iterator_t* iter) {
+ iter->rep->SeekToLast();
+}
+
+void rocksdb_iter_seek(rocksdb_iterator_t* iter, const char* k, size_t klen) {
+ iter->rep->Seek(Slice(k, klen));
+}
+
+void rocksdb_iter_seek_for_prev(rocksdb_iterator_t* iter, const char* k,
+ size_t klen) {
+ iter->rep->SeekForPrev(Slice(k, klen));
+}
+
+void rocksdb_iter_next(rocksdb_iterator_t* iter) { iter->rep->Next(); }
+
+void rocksdb_iter_prev(rocksdb_iterator_t* iter) { iter->rep->Prev(); }
+
+const char* rocksdb_iter_key(const rocksdb_iterator_t* iter, size_t* klen) {
+ Slice s = iter->rep->key();
+ *klen = s.size();
+ return s.data();
+}
+
+const char* rocksdb_iter_value(const rocksdb_iterator_t* iter, size_t* vlen) {
+ Slice s = iter->rep->value();
+ *vlen = s.size();
+ return s.data();
+}
+
+const char* rocksdb_iter_timestamp(const rocksdb_iterator_t* iter,
+ size_t* tslen) {
+ Slice s = iter->rep->timestamp();
+ *tslen = s.size();
+ return s.data();
+}
+
+void rocksdb_iter_get_error(const rocksdb_iterator_t* iter, char** errptr) {
+ SaveError(errptr, iter->rep->status());
+}
+
+rocksdb_writebatch_t* rocksdb_writebatch_create() {
+ return new rocksdb_writebatch_t;
+}
+
+rocksdb_writebatch_t* rocksdb_writebatch_create_from(const char* rep,
+ size_t size) {
+ rocksdb_writebatch_t* b = new rocksdb_writebatch_t;
+ b->rep = WriteBatch(std::string(rep, size));
+ return b;
+}
+
+void rocksdb_writebatch_destroy(rocksdb_writebatch_t* b) { delete b; }
+
+void rocksdb_writebatch_clear(rocksdb_writebatch_t* b) { b->rep.Clear(); }
+
+int rocksdb_writebatch_count(rocksdb_writebatch_t* b) { return b->rep.Count(); }
+
+void rocksdb_writebatch_put(rocksdb_writebatch_t* b, const char* key,
+ size_t klen, const char* val, size_t vlen) {
+ b->rep.Put(Slice(key, klen), Slice(val, vlen));
+}
+
+void rocksdb_writebatch_put_cf(rocksdb_writebatch_t* b,
+ rocksdb_column_family_handle_t* column_family,
+ const char* key, size_t klen, const char* val,
+ size_t vlen) {
+ b->rep.Put(column_family->rep, Slice(key, klen), Slice(val, vlen));
+}
+
+void rocksdb_writebatch_put_cf_with_ts(
+ rocksdb_writebatch_t* b, rocksdb_column_family_handle_t* column_family,
+ const char* key, size_t klen, const char* ts, size_t tslen, const char* val,
+ size_t vlen) {
+ b->rep.Put(column_family->rep, Slice(key, klen), Slice(ts, tslen),
+ Slice(val, vlen));
+}
+
+void rocksdb_writebatch_putv(rocksdb_writebatch_t* b, int num_keys,
+ const char* const* keys_list,
+ const size_t* keys_list_sizes, int num_values,
+ const char* const* values_list,
+ const size_t* values_list_sizes) {
+ std::vector<Slice> key_slices(num_keys);
+ for (int i = 0; i < num_keys; i++) {
+ key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
+ }
+ std::vector<Slice> value_slices(num_values);
+ for (int i = 0; i < num_values; i++) {
+ value_slices[i] = Slice(values_list[i], values_list_sizes[i]);
+ }
+ b->rep.Put(SliceParts(key_slices.data(), num_keys),
+ SliceParts(value_slices.data(), num_values));
+}
+
+void rocksdb_writebatch_putv_cf(rocksdb_writebatch_t* b,
+ rocksdb_column_family_handle_t* column_family,
+ int num_keys, const char* const* keys_list,
+ const size_t* keys_list_sizes, int num_values,
+ const char* const* values_list,
+ const size_t* values_list_sizes) {
+ std::vector<Slice> key_slices(num_keys);
+ for (int i = 0; i < num_keys; i++) {
+ key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
+ }
+ std::vector<Slice> value_slices(num_values);
+ for (int i = 0; i < num_values; i++) {
+ value_slices[i] = Slice(values_list[i], values_list_sizes[i]);
+ }
+ b->rep.Put(column_family->rep, SliceParts(key_slices.data(), num_keys),
+ SliceParts(value_slices.data(), num_values));
+}
+
+void rocksdb_writebatch_merge(rocksdb_writebatch_t* b, const char* key,
+ size_t klen, const char* val, size_t vlen) {
+ b->rep.Merge(Slice(key, klen), Slice(val, vlen));
+}
+
+void rocksdb_writebatch_merge_cf(rocksdb_writebatch_t* b,
+ rocksdb_column_family_handle_t* column_family,
+ const char* key, size_t klen, const char* val,
+ size_t vlen) {
+ b->rep.Merge(column_family->rep, Slice(key, klen), Slice(val, vlen));
+}
+
+void rocksdb_writebatch_mergev(rocksdb_writebatch_t* b, int num_keys,
+ const char* const* keys_list,
+ const size_t* keys_list_sizes, int num_values,
+ const char* const* values_list,
+ const size_t* values_list_sizes) {
+ std::vector<Slice> key_slices(num_keys);
+ for (int i = 0; i < num_keys; i++) {
+ key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
+ }
+ std::vector<Slice> value_slices(num_values);
+ for (int i = 0; i < num_values; i++) {
+ value_slices[i] = Slice(values_list[i], values_list_sizes[i]);
+ }
+ b->rep.Merge(SliceParts(key_slices.data(), num_keys),
+ SliceParts(value_slices.data(), num_values));
+}
+
+void rocksdb_writebatch_mergev_cf(rocksdb_writebatch_t* b,
+ rocksdb_column_family_handle_t* column_family,
+ int num_keys, const char* const* keys_list,
+ const size_t* keys_list_sizes, int num_values,
+ const char* const* values_list,
+ const size_t* values_list_sizes) {
+ std::vector<Slice> key_slices(num_keys);
+ for (int i = 0; i < num_keys; i++) {
+ key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
+ }
+ std::vector<Slice> value_slices(num_values);
+ for (int i = 0; i < num_values; i++) {
+ value_slices[i] = Slice(values_list[i], values_list_sizes[i]);
+ }
+ b->rep.Merge(column_family->rep, SliceParts(key_slices.data(), num_keys),
+ SliceParts(value_slices.data(), num_values));
+}
+
+void rocksdb_writebatch_delete(rocksdb_writebatch_t* b, const char* key,
+ size_t klen) {
+ b->rep.Delete(Slice(key, klen));
+}
+
+void rocksdb_writebatch_singledelete(rocksdb_writebatch_t* b, const char* key,
+ size_t klen) {
+ b->rep.SingleDelete(Slice(key, klen));
+}
+
+void rocksdb_writebatch_delete_cf(rocksdb_writebatch_t* b,
+ rocksdb_column_family_handle_t* column_family,
+ const char* key, size_t klen) {
+ b->rep.Delete(column_family->rep, Slice(key, klen));
+}
+
+void rocksdb_writebatch_delete_cf_with_ts(
+ rocksdb_writebatch_t* b, rocksdb_column_family_handle_t* column_family,
+ const char* key, size_t klen, const char* ts, size_t tslen) {
+ b->rep.Delete(column_family->rep, Slice(key, klen), Slice(ts, tslen));
+}
+
+void rocksdb_writebatch_singledelete_cf(
+ rocksdb_writebatch_t* b, rocksdb_column_family_handle_t* column_family,
+ const char* key, size_t klen) {
+ b->rep.SingleDelete(column_family->rep, Slice(key, klen));
+}
+
+void rocksdb_writebatch_singledelete_cf_with_ts(
+ rocksdb_writebatch_t* b, rocksdb_column_family_handle_t* column_family,
+ const char* key, size_t klen, const char* ts, size_t tslen) {
+ b->rep.SingleDelete(column_family->rep, Slice(key, klen), Slice(ts, tslen));
+}
+
+void rocksdb_writebatch_deletev(rocksdb_writebatch_t* b, int num_keys,
+ const char* const* keys_list,
+ const size_t* keys_list_sizes) {
+ std::vector<Slice> key_slices(num_keys);
+ for (int i = 0; i < num_keys; i++) {
+ key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
+ }
+ b->rep.Delete(SliceParts(key_slices.data(), num_keys));
+}
+
+void rocksdb_writebatch_deletev_cf(
+ rocksdb_writebatch_t* b, rocksdb_column_family_handle_t* column_family,
+ int num_keys, const char* const* keys_list, const size_t* keys_list_sizes) {
+ std::vector<Slice> key_slices(num_keys);
+ for (int i = 0; i < num_keys; i++) {
+ key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
+ }
+ b->rep.Delete(column_family->rep, SliceParts(key_slices.data(), num_keys));
+}
+
+void rocksdb_writebatch_delete_range(rocksdb_writebatch_t* b,
+ const char* start_key,
+ size_t start_key_len, const char* end_key,
+ size_t end_key_len) {
+ b->rep.DeleteRange(Slice(start_key, start_key_len),
+ Slice(end_key, end_key_len));
+}
+
+void rocksdb_writebatch_delete_range_cf(
+ rocksdb_writebatch_t* b, rocksdb_column_family_handle_t* column_family,
+ const char* start_key, size_t start_key_len, const char* end_key,
+ size_t end_key_len) {
+ b->rep.DeleteRange(column_family->rep, Slice(start_key, start_key_len),
+ Slice(end_key, end_key_len));
+}
+
+void rocksdb_writebatch_delete_rangev(rocksdb_writebatch_t* b, int num_keys,
+ const char* const* start_keys_list,
+ const size_t* start_keys_list_sizes,
+ const char* const* end_keys_list,
+ const size_t* end_keys_list_sizes) {
+ std::vector<Slice> start_key_slices(num_keys);
+ std::vector<Slice> end_key_slices(num_keys);
+ for (int i = 0; i < num_keys; i++) {
+ start_key_slices[i] = Slice(start_keys_list[i], start_keys_list_sizes[i]);
+ end_key_slices[i] = Slice(end_keys_list[i], end_keys_list_sizes[i]);
+ }
+ b->rep.DeleteRange(SliceParts(start_key_slices.data(), num_keys),
+ SliceParts(end_key_slices.data(), num_keys));
+}
+
+void rocksdb_writebatch_delete_rangev_cf(
+ rocksdb_writebatch_t* b, rocksdb_column_family_handle_t* column_family,
+ int num_keys, const char* const* start_keys_list,
+ const size_t* start_keys_list_sizes, const char* const* end_keys_list,
+ const size_t* end_keys_list_sizes) {
+ std::vector<Slice> start_key_slices(num_keys);
+ std::vector<Slice> end_key_slices(num_keys);
+ for (int i = 0; i < num_keys; i++) {
+ start_key_slices[i] = Slice(start_keys_list[i], start_keys_list_sizes[i]);
+ end_key_slices[i] = Slice(end_keys_list[i], end_keys_list_sizes[i]);
+ }
+ b->rep.DeleteRange(column_family->rep,
+ SliceParts(start_key_slices.data(), num_keys),
+ SliceParts(end_key_slices.data(), num_keys));
+}
+
+void rocksdb_writebatch_put_log_data(rocksdb_writebatch_t* b, const char* blob,
+ size_t len) {
+ b->rep.PutLogData(Slice(blob, len));
+}
+
+class H : public WriteBatch::Handler {
+ public:
+ void* state_;
+ void (*put_)(void*, const char* k, size_t klen, const char* v, size_t vlen);
+ void (*deleted_)(void*, const char* k, size_t klen);
+ void Put(const Slice& key, const Slice& value) override {
+ (*put_)(state_, key.data(), key.size(), value.data(), value.size());
+ }
+ void Delete(const Slice& key) override {
+ (*deleted_)(state_, key.data(), key.size());
+ }
+};
+
+void rocksdb_writebatch_iterate(rocksdb_writebatch_t* b, void* state,
+ void (*put)(void*, const char* k, size_t klen,
+ const char* v, size_t vlen),
+ void (*deleted)(void*, const char* k,
+ size_t klen)) {
+ H handler;
+ handler.state_ = state;
+ handler.put_ = put;
+ handler.deleted_ = deleted;
+ b->rep.Iterate(&handler);
+}
+
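+// The handler above surfaces only Put and Delete records to the two C
+// callbacks; other record types are left to WriteBatch::Handler's default
+// behavior. Minimal usage sketch with hypothetical callback names (not part
+// of this file), assuming an existing batch handle `wb`:
+//
+//   static void on_put(void* state, const char* k, size_t klen, const char* v,
+//                      size_t vlen) { /* ... */ }
+//   static void on_delete(void* state, const char* k, size_t klen) { /* ... */ }
+//
+//   rocksdb_writebatch_iterate(wb, /*state=*/NULL, on_put, on_delete);
+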
+const char* rocksdb_writebatch_data(rocksdb_writebatch_t* b, size_t* size) {
+ *size = b->rep.GetDataSize();
+ return b->rep.Data().c_str();
+}
+
+void rocksdb_writebatch_set_save_point(rocksdb_writebatch_t* b) {
+ b->rep.SetSavePoint();
+}
+
+void rocksdb_writebatch_rollback_to_save_point(rocksdb_writebatch_t* b,
+ char** errptr) {
+ SaveError(errptr, b->rep.RollbackToSavePoint());
+}
+
+void rocksdb_writebatch_pop_save_point(rocksdb_writebatch_t* b, char** errptr) {
+ SaveError(errptr, b->rep.PopSavePoint());
+}
+
+rocksdb_writebatch_wi_t* rocksdb_writebatch_wi_create(
+ size_t reserved_bytes, unsigned char overwrite_key) {
+ rocksdb_writebatch_wi_t* b = new rocksdb_writebatch_wi_t;
+ b->rep = new WriteBatchWithIndex(BytewiseComparator(), reserved_bytes,
+ overwrite_key);
+ return b;
+}
+
+void rocksdb_writebatch_wi_destroy(rocksdb_writebatch_wi_t* b) {
+ if (b->rep) {
+ delete b->rep;
+ }
+ delete b;
+}
+
+void rocksdb_writebatch_wi_clear(rocksdb_writebatch_wi_t* b) {
+ b->rep->Clear();
+}
+
+int rocksdb_writebatch_wi_count(rocksdb_writebatch_wi_t* b) {
+ return b->rep->GetWriteBatch()->Count();
+}
+
+void rocksdb_writebatch_wi_put(rocksdb_writebatch_wi_t* b, const char* key,
+ size_t klen, const char* val, size_t vlen) {
+ b->rep->Put(Slice(key, klen), Slice(val, vlen));
+}
+
+void rocksdb_writebatch_wi_put_cf(rocksdb_writebatch_wi_t* b,
+ rocksdb_column_family_handle_t* column_family,
+ const char* key, size_t klen, const char* val,
+ size_t vlen) {
+ b->rep->Put(column_family->rep, Slice(key, klen), Slice(val, vlen));
+}
+
+void rocksdb_writebatch_wi_putv(rocksdb_writebatch_wi_t* b, int num_keys,
+ const char* const* keys_list,
+ const size_t* keys_list_sizes, int num_values,
+ const char* const* values_list,
+ const size_t* values_list_sizes) {
+ std::vector<Slice> key_slices(num_keys);
+ for (int i = 0; i < num_keys; i++) {
+ key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
+ }
+ std::vector<Slice> value_slices(num_values);
+ for (int i = 0; i < num_values; i++) {
+ value_slices[i] = Slice(values_list[i], values_list_sizes[i]);
+ }
+ b->rep->Put(SliceParts(key_slices.data(), num_keys),
+ SliceParts(value_slices.data(), num_values));
+}
+
+void rocksdb_writebatch_wi_putv_cf(
+ rocksdb_writebatch_wi_t* b, rocksdb_column_family_handle_t* column_family,
+ int num_keys, const char* const* keys_list, const size_t* keys_list_sizes,
+ int num_values, const char* const* values_list,
+ const size_t* values_list_sizes) {
+ std::vector<Slice> key_slices(num_keys);
+ for (int i = 0; i < num_keys; i++) {
+ key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
+ }
+ std::vector<Slice> value_slices(num_values);
+ for (int i = 0; i < num_values; i++) {
+ value_slices[i] = Slice(values_list[i], values_list_sizes[i]);
+ }
+ b->rep->Put(column_family->rep, SliceParts(key_slices.data(), num_keys),
+ SliceParts(value_slices.data(), num_values));
+}
+
+void rocksdb_writebatch_wi_merge(rocksdb_writebatch_wi_t* b, const char* key,
+ size_t klen, const char* val, size_t vlen) {
+ b->rep->Merge(Slice(key, klen), Slice(val, vlen));
+}
+
+void rocksdb_writebatch_wi_merge_cf(
+ rocksdb_writebatch_wi_t* b, rocksdb_column_family_handle_t* column_family,
+ const char* key, size_t klen, const char* val, size_t vlen) {
+ b->rep->Merge(column_family->rep, Slice(key, klen), Slice(val, vlen));
+}
+
+void rocksdb_writebatch_wi_mergev(rocksdb_writebatch_wi_t* b, int num_keys,
+ const char* const* keys_list,
+ const size_t* keys_list_sizes, int num_values,
+ const char* const* values_list,
+ const size_t* values_list_sizes) {
+ std::vector<Slice> key_slices(num_keys);
+ for (int i = 0; i < num_keys; i++) {
+ key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
+ }
+ std::vector<Slice> value_slices(num_values);
+ for (int i = 0; i < num_values; i++) {
+ value_slices[i] = Slice(values_list[i], values_list_sizes[i]);
+ }
+ b->rep->Merge(SliceParts(key_slices.data(), num_keys),
+ SliceParts(value_slices.data(), num_values));
+}
+
+void rocksdb_writebatch_wi_mergev_cf(
+ rocksdb_writebatch_wi_t* b, rocksdb_column_family_handle_t* column_family,
+ int num_keys, const char* const* keys_list, const size_t* keys_list_sizes,
+ int num_values, const char* const* values_list,
+ const size_t* values_list_sizes) {
+ std::vector<Slice> key_slices(num_keys);
+ for (int i = 0; i < num_keys; i++) {
+ key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
+ }
+ std::vector<Slice> value_slices(num_values);
+ for (int i = 0; i < num_values; i++) {
+ value_slices[i] = Slice(values_list[i], values_list_sizes[i]);
+ }
+ b->rep->Merge(column_family->rep, SliceParts(key_slices.data(), num_keys),
+ SliceParts(value_slices.data(), num_values));
+}
+
+void rocksdb_writebatch_wi_delete(rocksdb_writebatch_wi_t* b, const char* key,
+ size_t klen) {
+ b->rep->Delete(Slice(key, klen));
+}
+
+void rocksdb_writebatch_wi_singledelete(rocksdb_writebatch_wi_t* b,
+ const char* key, size_t klen) {
+ b->rep->SingleDelete(Slice(key, klen));
+}
+
+void rocksdb_writebatch_wi_delete_cf(
+ rocksdb_writebatch_wi_t* b, rocksdb_column_family_handle_t* column_family,
+ const char* key, size_t klen) {
+ b->rep->Delete(column_family->rep, Slice(key, klen));
+}
+
+void rocksdb_writebatch_wi_singledelete_cf(
+ rocksdb_writebatch_wi_t* b, rocksdb_column_family_handle_t* column_family,
+ const char* key, size_t klen) {
+ b->rep->SingleDelete(column_family->rep, Slice(key, klen));
+}
+
+void rocksdb_writebatch_wi_deletev(rocksdb_writebatch_wi_t* b, int num_keys,
+ const char* const* keys_list,
+ const size_t* keys_list_sizes) {
+ std::vector<Slice> key_slices(num_keys);
+ for (int i = 0; i < num_keys; i++) {
+ key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
+ }
+ b->rep->Delete(SliceParts(key_slices.data(), num_keys));
+}
+
+void rocksdb_writebatch_wi_deletev_cf(
+ rocksdb_writebatch_wi_t* b, rocksdb_column_family_handle_t* column_family,
+ int num_keys, const char* const* keys_list, const size_t* keys_list_sizes) {
+ std::vector<Slice> key_slices(num_keys);
+ for (int i = 0; i < num_keys; i++) {
+ key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
+ }
+ b->rep->Delete(column_family->rep, SliceParts(key_slices.data(), num_keys));
+}
+
+void rocksdb_writebatch_wi_delete_range(rocksdb_writebatch_wi_t* b,
+ const char* start_key,
+ size_t start_key_len,
+ const char* end_key,
+ size_t end_key_len) {
+ b->rep->DeleteRange(Slice(start_key, start_key_len),
+ Slice(end_key, end_key_len));
+}
+
+void rocksdb_writebatch_wi_delete_range_cf(
+ rocksdb_writebatch_wi_t* b, rocksdb_column_family_handle_t* column_family,
+ const char* start_key, size_t start_key_len, const char* end_key,
+ size_t end_key_len) {
+ b->rep->DeleteRange(column_family->rep, Slice(start_key, start_key_len),
+ Slice(end_key, end_key_len));
+}
+
+void rocksdb_writebatch_wi_delete_rangev(rocksdb_writebatch_wi_t* b,
+ int num_keys,
+ const char* const* start_keys_list,
+ const size_t* start_keys_list_sizes,
+ const char* const* end_keys_list,
+ const size_t* end_keys_list_sizes) {
+ std::vector<Slice> start_key_slices(num_keys);
+ std::vector<Slice> end_key_slices(num_keys);
+ for (int i = 0; i < num_keys; i++) {
+ start_key_slices[i] = Slice(start_keys_list[i], start_keys_list_sizes[i]);
+ end_key_slices[i] = Slice(end_keys_list[i], end_keys_list_sizes[i]);
+ }
+ b->rep->DeleteRange(SliceParts(start_key_slices.data(), num_keys),
+ SliceParts(end_key_slices.data(), num_keys));
+}
+
+void rocksdb_writebatch_wi_delete_rangev_cf(
+ rocksdb_writebatch_wi_t* b, rocksdb_column_family_handle_t* column_family,
+ int num_keys, const char* const* start_keys_list,
+ const size_t* start_keys_list_sizes, const char* const* end_keys_list,
+ const size_t* end_keys_list_sizes) {
+ std::vector<Slice> start_key_slices(num_keys);
+ std::vector<Slice> end_key_slices(num_keys);
+ for (int i = 0; i < num_keys; i++) {
+ start_key_slices[i] = Slice(start_keys_list[i], start_keys_list_sizes[i]);
+ end_key_slices[i] = Slice(end_keys_list[i], end_keys_list_sizes[i]);
+ }
+ b->rep->DeleteRange(column_family->rep,
+ SliceParts(start_key_slices.data(), num_keys),
+ SliceParts(end_key_slices.data(), num_keys));
+}
+
+void rocksdb_writebatch_wi_put_log_data(rocksdb_writebatch_wi_t* b,
+ const char* blob, size_t len) {
+ b->rep->PutLogData(Slice(blob, len));
+}
+
+void rocksdb_writebatch_wi_iterate(
+ rocksdb_writebatch_wi_t* b, void* state,
+ void (*put)(void*, const char* k, size_t klen, const char* v, size_t vlen),
+ void (*deleted)(void*, const char* k, size_t klen)) {
+ H handler;
+ handler.state_ = state;
+ handler.put_ = put;
+ handler.deleted_ = deleted;
+ b->rep->GetWriteBatch()->Iterate(&handler);
+}
+
+const char* rocksdb_writebatch_wi_data(rocksdb_writebatch_wi_t* b,
+ size_t* size) {
+ WriteBatch* wb = b->rep->GetWriteBatch();
+ *size = wb->GetDataSize();
+ return wb->Data().c_str();
+}
+
+void rocksdb_writebatch_wi_set_save_point(rocksdb_writebatch_wi_t* b) {
+ b->rep->SetSavePoint();
+}
+
+void rocksdb_writebatch_wi_rollback_to_save_point(rocksdb_writebatch_wi_t* b,
+ char** errptr) {
+ SaveError(errptr, b->rep->RollbackToSavePoint());
+}
+
+rocksdb_iterator_t* rocksdb_writebatch_wi_create_iterator_with_base(
+ rocksdb_writebatch_wi_t* wbwi, rocksdb_iterator_t* base_iterator) {
+ rocksdb_iterator_t* result = new rocksdb_iterator_t;
+ result->rep = wbwi->rep->NewIteratorWithBase(base_iterator->rep);
+ delete base_iterator;
+ return result;
+}
+
+rocksdb_iterator_t* rocksdb_writebatch_wi_create_iterator_with_base_cf(
+ rocksdb_writebatch_wi_t* wbwi, rocksdb_iterator_t* base_iterator,
+ rocksdb_column_family_handle_t* column_family) {
+ rocksdb_iterator_t* result = new rocksdb_iterator_t;
+ result->rep =
+ wbwi->rep->NewIteratorWithBase(column_family->rep, base_iterator->rep);
+ delete base_iterator;
+ return result;
+}
+
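+// Ownership note for the two functions above: `delete base_iterator` frees
+// only the C wrapper struct; the underlying Iterator is adopted by the merged
+// iterator returned from NewIteratorWithBase. Callers hand the base iterator
+// over and must not use or destroy it afterwards. Illustrative sketch
+// (handles assumed, not part of this file):
+//
+//   rocksdb_iterator_t* base = rocksdb_create_iterator(db, roptions);
+//   rocksdb_iterator_t* it =
+//       rocksdb_writebatch_wi_create_iterator_with_base(wbwi, base);
+//   /* iterate over and destroy `it` only; `base` must not be touched */
+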
+char* rocksdb_writebatch_wi_get_from_batch(rocksdb_writebatch_wi_t* wbwi,
+ const rocksdb_options_t* options,
+ const char* key, size_t keylen,
+ size_t* vallen, char** errptr) {
+ char* result = nullptr;
+ std::string tmp;
+ Status s = wbwi->rep->GetFromBatch(options->rep, Slice(key, keylen), &tmp);
+ if (s.ok()) {
+ *vallen = tmp.size();
+ result = CopyString(tmp);
+ } else {
+ *vallen = 0;
+ if (!s.IsNotFound()) {
+ SaveError(errptr, s);
+ }
+ }
+ return result;
+}
+
+char* rocksdb_writebatch_wi_get_from_batch_cf(
+ rocksdb_writebatch_wi_t* wbwi, const rocksdb_options_t* options,
+ rocksdb_column_family_handle_t* column_family, const char* key,
+ size_t keylen, size_t* vallen, char** errptr) {
+ char* result = nullptr;
+ std::string tmp;
+ Status s = wbwi->rep->GetFromBatch(column_family->rep, options->rep,
+ Slice(key, keylen), &tmp);
+ if (s.ok()) {
+ *vallen = tmp.size();
+ result = CopyString(tmp);
+ } else {
+ *vallen = 0;
+ if (!s.IsNotFound()) {
+ SaveError(errptr, s);
+ }
+ }
+ return result;
+}
+
+char* rocksdb_writebatch_wi_get_from_batch_and_db(
+ rocksdb_writebatch_wi_t* wbwi, rocksdb_t* db,
+ const rocksdb_readoptions_t* options, const char* key, size_t keylen,
+ size_t* vallen, char** errptr) {
+ char* result = nullptr;
+ std::string tmp;
+ Status s = wbwi->rep->GetFromBatchAndDB(db->rep, options->rep,
+ Slice(key, keylen), &tmp);
+ if (s.ok()) {
+ *vallen = tmp.size();
+ result = CopyString(tmp);
+ } else {
+ *vallen = 0;
+ if (!s.IsNotFound()) {
+ SaveError(errptr, s);
+ }
+ }
+ return result;
+}
+
+char* rocksdb_writebatch_wi_get_from_batch_and_db_cf(
+ rocksdb_writebatch_wi_t* wbwi, rocksdb_t* db,
+ const rocksdb_readoptions_t* options,
+ rocksdb_column_family_handle_t* column_family, const char* key,
+ size_t keylen, size_t* vallen, char** errptr) {
+ char* result = nullptr;
+ std::string tmp;
+ Status s = wbwi->rep->GetFromBatchAndDB(
+ db->rep, options->rep, column_family->rep, Slice(key, keylen), &tmp);
+ if (s.ok()) {
+ *vallen = tmp.size();
+ result = CopyString(tmp);
+ } else {
+ *vallen = 0;
+ if (!s.IsNotFound()) {
+ SaveError(errptr, s);
+ }
+ }
+ return result;
+}
+
+void rocksdb_write_writebatch_wi(rocksdb_t* db,
+ const rocksdb_writeoptions_t* options,
+ rocksdb_writebatch_wi_t* wbwi, char** errptr) {
+ WriteBatch* wb = wbwi->rep->GetWriteBatch();
+ SaveError(errptr, db->rep->Write(options->rep, wb));
+}
+
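+// Minimal read-your-writes sketch for the WriteBatchWithIndex wrappers above,
+// assuming open `db`, `woptions` and `roptions` handles (illustrative only;
+// error handling elided, rocksdb_free is assumed to be defined elsewhere in
+// this file):
+//
+//   char* err = NULL;
+//   rocksdb_writebatch_wi_t* wbwi = rocksdb_writebatch_wi_create(0, 1);
+//   rocksdb_writebatch_wi_put(wbwi, "k", 1, "v", 1);
+//   size_t vlen = 0;
+//   char* val = rocksdb_writebatch_wi_get_from_batch_and_db(
+//       wbwi, db, roptions, "k", 1, &vlen, &err);  /* sees the pending put */
+//   rocksdb_free(val);
+//   rocksdb_write_writebatch_wi(db, woptions, wbwi, &err);  /* commit */
+//   rocksdb_writebatch_wi_destroy(wbwi);
+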
+void rocksdb_load_latest_options(
+ const char* db_path, rocksdb_env_t* env, bool ignore_unknown_options,
+ rocksdb_cache_t* cache, rocksdb_options_t** db_options,
+ size_t* num_column_families, char*** list_column_family_names,
+ rocksdb_options_t*** list_column_family_options, char** errptr) {
+ DBOptions db_opt;
+ std::vector<ColumnFamilyDescriptor> cf_descs;
+ Status s = LoadLatestOptions(std::string(db_path), env->rep, &db_opt,
+ &cf_descs, ignore_unknown_options, &cache->rep);
+ if (s.ok()) {
+ char** cf_names = (char**)malloc(cf_descs.size() * sizeof(char*));
+ rocksdb_options_t** cf_options = (rocksdb_options_t**)malloc(
+ cf_descs.size() * sizeof(rocksdb_options_t*));
+ for (size_t i = 0; i < cf_descs.size(); ++i) {
+ cf_names[i] = strdup(cf_descs[i].name.c_str());
+ cf_options[i] = new rocksdb_options_t{
+ Options(DBOptions(), std::move(cf_descs[i].options))};
+ }
+ *num_column_families = cf_descs.size();
+ *db_options = new rocksdb_options_t{
+ Options(std::move(db_opt), ColumnFamilyOptions())};
+ *list_column_family_names = cf_names;
+ *list_column_family_options = cf_options;
+ } else {
+ *num_column_families = 0;
+ *db_options = nullptr;
+ *list_column_family_names = nullptr;
+ *list_column_family_options = nullptr;
+ SaveError(errptr, s);
+ }
+}
+
+void rocksdb_load_latest_options_destroy(
+ rocksdb_options_t* db_options, char** list_column_family_names,
+ rocksdb_options_t** list_column_family_options, size_t len) {
+ rocksdb_options_destroy(db_options);
+ if (list_column_family_names) {
+ for (size_t i = 0; i < len; ++i) {
+ free(list_column_family_names[i]);
+ }
+ free(list_column_family_names);
+ }
+ if (list_column_family_options) {
+ for (size_t i = 0; i < len; ++i) {
+ rocksdb_options_destroy(list_column_family_options[i]);
+ }
+ free(list_column_family_options);
+ }
+}
+
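+// The loader above heap-allocates every output (strdup'd names plus one
+// rocksdb_options_t per column family and one for the DB options), so the
+// companion destroy function must be called with the same length to release
+// them. Illustrative sketch, assuming existing `env` and `cache` handles:
+//
+//   char* err = NULL;
+//   rocksdb_options_t* db_opts = NULL;
+//   size_t cf_count = 0;
+//   char** cf_names = NULL;
+//   rocksdb_options_t** cf_opts = NULL;
+//   rocksdb_load_latest_options("/path/to/db", env, 0, cache, &db_opts,
+//                               &cf_count, &cf_names, &cf_opts, &err);
+//   /* ... open the DB with the loaded descriptors ... */
+//   rocksdb_load_latest_options_destroy(db_opts, cf_names, cf_opts, cf_count);
+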
+rocksdb_block_based_table_options_t* rocksdb_block_based_options_create() {
+ return new rocksdb_block_based_table_options_t;
+}
+
+void rocksdb_block_based_options_destroy(
+ rocksdb_block_based_table_options_t* options) {
+ delete options;
+}
+
+void rocksdb_block_based_options_set_checksum(
+ rocksdb_block_based_table_options_t* opt, char v) {
+ opt->rep.checksum = static_cast<ROCKSDB_NAMESPACE::ChecksumType>(v);
+}
+
+void rocksdb_block_based_options_set_block_size(
+ rocksdb_block_based_table_options_t* options, size_t block_size) {
+ options->rep.block_size = block_size;
+}
+
+void rocksdb_block_based_options_set_block_size_deviation(
+ rocksdb_block_based_table_options_t* options, int block_size_deviation) {
+ options->rep.block_size_deviation = block_size_deviation;
+}
+
+void rocksdb_block_based_options_set_block_restart_interval(
+ rocksdb_block_based_table_options_t* options, int block_restart_interval) {
+ options->rep.block_restart_interval = block_restart_interval;
+}
+
+void rocksdb_block_based_options_set_index_block_restart_interval(
+ rocksdb_block_based_table_options_t* options,
+ int index_block_restart_interval) {
+ options->rep.index_block_restart_interval = index_block_restart_interval;
+}
+
+void rocksdb_block_based_options_set_metadata_block_size(
+ rocksdb_block_based_table_options_t* options,
+ uint64_t metadata_block_size) {
+ options->rep.metadata_block_size = metadata_block_size;
+}
+
+void rocksdb_block_based_options_set_partition_filters(
+ rocksdb_block_based_table_options_t* options,
+ unsigned char partition_filters) {
+ options->rep.partition_filters = partition_filters;
+}
+
+void rocksdb_block_based_options_set_use_delta_encoding(
+ rocksdb_block_based_table_options_t* options,
+ unsigned char use_delta_encoding) {
+ options->rep.use_delta_encoding = use_delta_encoding;
+}
+
+void rocksdb_block_based_options_set_filter_policy(
+ rocksdb_block_based_table_options_t* options,
+ rocksdb_filterpolicy_t* filter_policy) {
+ options->rep.filter_policy.reset(filter_policy);
+}
+
+void rocksdb_block_based_options_set_no_block_cache(
+ rocksdb_block_based_table_options_t* options,
+ unsigned char no_block_cache) {
+ options->rep.no_block_cache = no_block_cache;
+}
+
+void rocksdb_block_based_options_set_block_cache(
+ rocksdb_block_based_table_options_t* options,
+ rocksdb_cache_t* block_cache) {
+ if (block_cache) {
+ options->rep.block_cache = block_cache->rep;
+ }
+}
+
+void rocksdb_block_based_options_set_block_cache_compressed(
+ rocksdb_block_based_table_options_t* options,
+ rocksdb_cache_t* block_cache_compressed) {
+ if (block_cache_compressed) {
+ options->rep.block_cache_compressed = block_cache_compressed->rep;
+ }
+}
+
+void rocksdb_block_based_options_set_whole_key_filtering(
+ rocksdb_block_based_table_options_t* options, unsigned char v) {
+ options->rep.whole_key_filtering = v;
+}
+
+void rocksdb_block_based_options_set_format_version(
+ rocksdb_block_based_table_options_t* options, int v) {
+ options->rep.format_version = v;
+}
+
+void rocksdb_block_based_options_set_index_type(
+ rocksdb_block_based_table_options_t* options, int v) {
+ options->rep.index_type = static_cast<BlockBasedTableOptions::IndexType>(v);
+}
+
+void rocksdb_block_based_options_set_data_block_index_type(
+ rocksdb_block_based_table_options_t* options, int v) {
+ options->rep.data_block_index_type =
+ static_cast<BlockBasedTableOptions::DataBlockIndexType>(v);
+}
+
+void rocksdb_block_based_options_set_data_block_hash_ratio(
+ rocksdb_block_based_table_options_t* options, double v) {
+ options->rep.data_block_hash_table_util_ratio = v;
+}
+
+void rocksdb_block_based_options_set_cache_index_and_filter_blocks(
+ rocksdb_block_based_table_options_t* options, unsigned char v) {
+ options->rep.cache_index_and_filter_blocks = v;
+}
+
+void rocksdb_block_based_options_set_cache_index_and_filter_blocks_with_high_priority(
+ rocksdb_block_based_table_options_t* options, unsigned char v) {
+ options->rep.cache_index_and_filter_blocks_with_high_priority = v;
+}
+
+void rocksdb_block_based_options_set_pin_l0_filter_and_index_blocks_in_cache(
+ rocksdb_block_based_table_options_t* options, unsigned char v) {
+ options->rep.pin_l0_filter_and_index_blocks_in_cache = v;
+}
+
+void rocksdb_block_based_options_set_pin_top_level_index_and_filter(
+ rocksdb_block_based_table_options_t* options, unsigned char v) {
+ options->rep.pin_top_level_index_and_filter = v;
+}
+
+void rocksdb_options_set_block_based_table_factory(
+ rocksdb_options_t* opt,
+ rocksdb_block_based_table_options_t* table_options) {
+ if (table_options) {
+ opt->rep.table_factory.reset(
+ ROCKSDB_NAMESPACE::NewBlockBasedTableFactory(table_options->rep));
+ }
+}
+
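+// Typical wiring for the block-based table options above: the block cache is
+// shared (the caller keeps its rocksdb_cache_t handle), the filter policy is
+// adopted by the table options via reset(), and
+// rocksdb_options_set_block_based_table_factory copies the settings into a
+// new factory, so the table-options object may be destroyed afterwards.
+// Illustrative sketch, assuming an existing rocksdb_options_t* `opts`:
+//
+//   rocksdb_block_based_table_options_t* bbto =
+//       rocksdb_block_based_options_create();
+//   rocksdb_cache_t* cache = rocksdb_cache_create_lru(64 << 20);
+//   rocksdb_block_based_options_set_block_cache(bbto, cache);
+//   rocksdb_block_based_options_set_filter_policy(
+//       bbto, rocksdb_filterpolicy_create_bloom(10));
+//   rocksdb_options_set_block_based_table_factory(opts, bbto);
+//   rocksdb_block_based_options_destroy(bbto);
+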
+rocksdb_cuckoo_table_options_t* rocksdb_cuckoo_options_create() {
+ return new rocksdb_cuckoo_table_options_t;
+}
+
+void rocksdb_cuckoo_options_destroy(rocksdb_cuckoo_table_options_t* options) {
+ delete options;
+}
+
+void rocksdb_cuckoo_options_set_hash_ratio(
+ rocksdb_cuckoo_table_options_t* options, double v) {
+ options->rep.hash_table_ratio = v;
+}
+
+void rocksdb_cuckoo_options_set_max_search_depth(
+ rocksdb_cuckoo_table_options_t* options, uint32_t v) {
+ options->rep.max_search_depth = v;
+}
+
+void rocksdb_cuckoo_options_set_cuckoo_block_size(
+ rocksdb_cuckoo_table_options_t* options, uint32_t v) {
+ options->rep.cuckoo_block_size = v;
+}
+
+void rocksdb_cuckoo_options_set_identity_as_first_hash(
+ rocksdb_cuckoo_table_options_t* options, unsigned char v) {
+ options->rep.identity_as_first_hash = v;
+}
+
+void rocksdb_cuckoo_options_set_use_module_hash(
+ rocksdb_cuckoo_table_options_t* options, unsigned char v) {
+ options->rep.use_module_hash = v;
+}
+
+void rocksdb_options_set_cuckoo_table_factory(
+ rocksdb_options_t* opt, rocksdb_cuckoo_table_options_t* table_options) {
+ if (table_options) {
+ opt->rep.table_factory.reset(
+ ROCKSDB_NAMESPACE::NewCuckooTableFactory(table_options->rep));
+ }
+}
+
+void rocksdb_set_options(rocksdb_t* db, int count, const char* const keys[],
+ const char* const values[], char** errptr) {
+ std::unordered_map<std::string, std::string> options_map;
+ for (int i = 0; i < count; i++) options_map[keys[i]] = values[i];
+ SaveError(errptr, db->rep->SetOptions(options_map));
+}
+
+void rocksdb_set_options_cf(rocksdb_t* db,
+ rocksdb_column_family_handle_t* handle, int count,
+ const char* const keys[],
+ const char* const values[], char** errptr) {
+ std::unordered_map<std::string, std::string> options_map;
+ for (int i = 0; i < count; i++) options_map[keys[i]] = values[i];
+ SaveError(errptr, db->rep->SetOptions(handle->rep, options_map));
+}
+
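+// The two SetOptions wrappers above take parallel key/value string arrays,
+// build an options map, and forward it to DB::SetOptions. Illustrative sketch
+// for changing mutable options at runtime, assuming an open `db`:
+//
+//   const char* keys[] = {"disable_auto_compactions", "write_buffer_size"};
+//   const char* vals[] = {"true", "67108864"};
+//   char* err = NULL;
+//   rocksdb_set_options(db, 2, keys, vals, &err);
+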
+rocksdb_options_t* rocksdb_options_create() { return new rocksdb_options_t; }
+
+void rocksdb_options_destroy(rocksdb_options_t* options) { delete options; }
+
+rocksdb_options_t* rocksdb_options_create_copy(rocksdb_options_t* options) {
+ return new rocksdb_options_t(*options);
+}
+
+void rocksdb_options_increase_parallelism(rocksdb_options_t* opt,
+ int total_threads) {
+ opt->rep.IncreaseParallelism(total_threads);
+}
+
+void rocksdb_options_optimize_for_point_lookup(rocksdb_options_t* opt,
+ uint64_t block_cache_size_mb) {
+ opt->rep.OptimizeForPointLookup(block_cache_size_mb);
+}
+
+void rocksdb_options_optimize_level_style_compaction(
+ rocksdb_options_t* opt, uint64_t memtable_memory_budget) {
+ opt->rep.OptimizeLevelStyleCompaction(memtable_memory_budget);
+}
+
+void rocksdb_options_optimize_universal_style_compaction(
+ rocksdb_options_t* opt, uint64_t memtable_memory_budget) {
+ opt->rep.OptimizeUniversalStyleCompaction(memtable_memory_budget);
+}
+
+void rocksdb_options_set_allow_ingest_behind(rocksdb_options_t* opt,
+ unsigned char v) {
+ opt->rep.allow_ingest_behind = v;
+}
+
+unsigned char rocksdb_options_get_allow_ingest_behind(rocksdb_options_t* opt) {
+ return opt->rep.allow_ingest_behind;
+}
+
+void rocksdb_options_set_compaction_filter(rocksdb_options_t* opt,
+ rocksdb_compactionfilter_t* filter) {
+ opt->rep.compaction_filter = filter;
+}
+
+void rocksdb_options_set_compaction_filter_factory(
+ rocksdb_options_t* opt, rocksdb_compactionfilterfactory_t* factory) {
+ opt->rep.compaction_filter_factory =
+ std::shared_ptr<CompactionFilterFactory>(factory);
+}
+
+void rocksdb_options_compaction_readahead_size(rocksdb_options_t* opt,
+ size_t s) {
+ opt->rep.compaction_readahead_size = s;
+}
+
+size_t rocksdb_options_get_compaction_readahead_size(rocksdb_options_t* opt) {
+ return opt->rep.compaction_readahead_size;
+}
+
+void rocksdb_options_set_comparator(rocksdb_options_t* opt,
+ rocksdb_comparator_t* cmp) {
+ opt->rep.comparator = cmp;
+}
+
+void rocksdb_options_set_merge_operator(
+ rocksdb_options_t* opt, rocksdb_mergeoperator_t* merge_operator) {
+ opt->rep.merge_operator = std::shared_ptr<MergeOperator>(merge_operator);
+}
+
+void rocksdb_options_set_create_if_missing(rocksdb_options_t* opt,
+ unsigned char v) {
+ opt->rep.create_if_missing = v;
+}
+
+unsigned char rocksdb_options_get_create_if_missing(rocksdb_options_t* opt) {
+ return opt->rep.create_if_missing;
+}
+
+void rocksdb_options_set_create_missing_column_families(rocksdb_options_t* opt,
+ unsigned char v) {
+ opt->rep.create_missing_column_families = v;
+}
+
+unsigned char rocksdb_options_get_create_missing_column_families(
+ rocksdb_options_t* opt) {
+ return opt->rep.create_missing_column_families;
+}
+
+void rocksdb_options_set_error_if_exists(rocksdb_options_t* opt,
+ unsigned char v) {
+ opt->rep.error_if_exists = v;
+}
+
+unsigned char rocksdb_options_get_error_if_exists(rocksdb_options_t* opt) {
+ return opt->rep.error_if_exists;
+}
+
+void rocksdb_options_set_paranoid_checks(rocksdb_options_t* opt,
+ unsigned char v) {
+ opt->rep.paranoid_checks = v;
+}
+
+unsigned char rocksdb_options_get_paranoid_checks(rocksdb_options_t* opt) {
+ return opt->rep.paranoid_checks;
+}
+
+void rocksdb_options_set_db_paths(rocksdb_options_t* opt,
+ const rocksdb_dbpath_t** dbpath_values,
+ size_t num_paths) {
+ std::vector<DbPath> db_paths(num_paths);
+ for (size_t i = 0; i < num_paths; ++i) {
+ db_paths[i] = dbpath_values[i]->rep;
+ }
+ opt->rep.db_paths = db_paths;
+}
+
+void rocksdb_options_set_env(rocksdb_options_t* opt, rocksdb_env_t* env) {
+ opt->rep.env = (env ? env->rep : nullptr);
+}
+
+void rocksdb_options_set_info_log(rocksdb_options_t* opt, rocksdb_logger_t* l) {
+ if (l) {
+ opt->rep.info_log = l->rep;
+ }
+}
+
+void rocksdb_options_set_info_log_level(rocksdb_options_t* opt, int v) {
+ opt->rep.info_log_level = static_cast<InfoLogLevel>(v);
+}
+
+int rocksdb_options_get_info_log_level(rocksdb_options_t* opt) {
+ return static_cast<int>(opt->rep.info_log_level);
+}
+
+void rocksdb_options_set_db_write_buffer_size(rocksdb_options_t* opt,
+ size_t s) {
+ opt->rep.db_write_buffer_size = s;
+}
+
+size_t rocksdb_options_get_db_write_buffer_size(rocksdb_options_t* opt) {
+ return opt->rep.db_write_buffer_size;
+}
+
+void rocksdb_options_set_write_buffer_size(rocksdb_options_t* opt, size_t s) {
+ opt->rep.write_buffer_size = s;
+}
+
+size_t rocksdb_options_get_write_buffer_size(rocksdb_options_t* opt) {
+ return opt->rep.write_buffer_size;
+}
+
+void rocksdb_options_set_max_open_files(rocksdb_options_t* opt, int n) {
+ opt->rep.max_open_files = n;
+}
+
+int rocksdb_options_get_max_open_files(rocksdb_options_t* opt) {
+ return opt->rep.max_open_files;
+}
+
+void rocksdb_options_set_max_file_opening_threads(rocksdb_options_t* opt,
+ int n) {
+ opt->rep.max_file_opening_threads = n;
+}
+
+int rocksdb_options_get_max_file_opening_threads(rocksdb_options_t* opt) {
+ return opt->rep.max_file_opening_threads;
+}
+
+void rocksdb_options_set_max_total_wal_size(rocksdb_options_t* opt,
+ uint64_t n) {
+ opt->rep.max_total_wal_size = n;
+}
+
+uint64_t rocksdb_options_get_max_total_wal_size(rocksdb_options_t* opt) {
+ return opt->rep.max_total_wal_size;
+}
+
+void rocksdb_options_set_target_file_size_base(rocksdb_options_t* opt,
+ uint64_t n) {
+ opt->rep.target_file_size_base = n;
+}
+
+uint64_t rocksdb_options_get_target_file_size_base(rocksdb_options_t* opt) {
+ return opt->rep.target_file_size_base;
+}
+
+void rocksdb_options_set_target_file_size_multiplier(rocksdb_options_t* opt,
+ int n) {
+ opt->rep.target_file_size_multiplier = n;
+}
+
+int rocksdb_options_get_target_file_size_multiplier(rocksdb_options_t* opt) {
+ return opt->rep.target_file_size_multiplier;
+}
+
+void rocksdb_options_set_max_bytes_for_level_base(rocksdb_options_t* opt,
+ uint64_t n) {
+ opt->rep.max_bytes_for_level_base = n;
+}
+
+uint64_t rocksdb_options_get_max_bytes_for_level_base(rocksdb_options_t* opt) {
+ return opt->rep.max_bytes_for_level_base;
+}
+
+void rocksdb_options_set_level_compaction_dynamic_level_bytes(
+ rocksdb_options_t* opt, unsigned char v) {
+ opt->rep.level_compaction_dynamic_level_bytes = v;
+}
+
+unsigned char rocksdb_options_get_level_compaction_dynamic_level_bytes(
+ rocksdb_options_t* opt) {
+ return opt->rep.level_compaction_dynamic_level_bytes;
+}
+
+void rocksdb_options_set_max_bytes_for_level_multiplier(rocksdb_options_t* opt,
+ double n) {
+ opt->rep.max_bytes_for_level_multiplier = n;
+}
+
+double rocksdb_options_get_max_bytes_for_level_multiplier(
+ rocksdb_options_t* opt) {
+ return opt->rep.max_bytes_for_level_multiplier;
+}
+
+void rocksdb_options_set_max_compaction_bytes(rocksdb_options_t* opt,
+ uint64_t n) {
+ opt->rep.max_compaction_bytes = n;
+}
+
+uint64_t rocksdb_options_get_max_compaction_bytes(rocksdb_options_t* opt) {
+ return opt->rep.max_compaction_bytes;
+}
+
+void rocksdb_options_set_max_bytes_for_level_multiplier_additional(
+ rocksdb_options_t* opt, int* level_values, size_t num_levels) {
+ opt->rep.max_bytes_for_level_multiplier_additional.resize(num_levels);
+ for (size_t i = 0; i < num_levels; ++i) {
+ opt->rep.max_bytes_for_level_multiplier_additional[i] = level_values[i];
+ }
+}
+
+void rocksdb_options_enable_statistics(rocksdb_options_t* opt) {
+ opt->rep.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+}
+
+void rocksdb_options_set_skip_stats_update_on_db_open(rocksdb_options_t* opt,
+ unsigned char val) {
+ opt->rep.skip_stats_update_on_db_open = val;
+}
+
+unsigned char rocksdb_options_get_skip_stats_update_on_db_open(
+ rocksdb_options_t* opt) {
+ return opt->rep.skip_stats_update_on_db_open;
+}
+
+void rocksdb_options_set_skip_checking_sst_file_sizes_on_db_open(
+ rocksdb_options_t* opt, unsigned char val) {
+ opt->rep.skip_checking_sst_file_sizes_on_db_open = val;
+}
+
+unsigned char rocksdb_options_get_skip_checking_sst_file_sizes_on_db_open(
+ rocksdb_options_t* opt) {
+ return opt->rep.skip_checking_sst_file_sizes_on_db_open;
+}
+
+/* Blob Options Settings */
+void rocksdb_options_set_enable_blob_files(rocksdb_options_t* opt,
+ unsigned char val) {
+ opt->rep.enable_blob_files = val;
+}
+
+unsigned char rocksdb_options_get_enable_blob_files(rocksdb_options_t* opt) {
+ return opt->rep.enable_blob_files;
+}
+
+void rocksdb_options_set_min_blob_size(rocksdb_options_t* opt, uint64_t val) {
+ opt->rep.min_blob_size = val;
+}
+
+uint64_t rocksdb_options_get_min_blob_size(rocksdb_options_t* opt) {
+ return opt->rep.min_blob_size;
+}
+
+void rocksdb_options_set_blob_file_size(rocksdb_options_t* opt, uint64_t val) {
+ opt->rep.blob_file_size = val;
+}
+
+uint64_t rocksdb_options_get_blob_file_size(rocksdb_options_t* opt) {
+ return opt->rep.blob_file_size;
+}
+
+void rocksdb_options_set_blob_compression_type(rocksdb_options_t* opt,
+ int val) {
+ opt->rep.blob_compression_type = static_cast<CompressionType>(val);
+}
+
+int rocksdb_options_get_blob_compression_type(rocksdb_options_t* opt) {
+ return opt->rep.blob_compression_type;
+}
+
+void rocksdb_options_set_enable_blob_gc(rocksdb_options_t* opt,
+ unsigned char val) {
+ opt->rep.enable_blob_garbage_collection = val;
+}
+
+unsigned char rocksdb_options_get_enable_blob_gc(rocksdb_options_t* opt) {
+ return opt->rep.enable_blob_garbage_collection;
+}
+
+void rocksdb_options_set_blob_gc_age_cutoff(rocksdb_options_t* opt,
+ double val) {
+ opt->rep.blob_garbage_collection_age_cutoff = val;
+}
+
+double rocksdb_options_get_blob_gc_age_cutoff(rocksdb_options_t* opt) {
+ return opt->rep.blob_garbage_collection_age_cutoff;
+}
+
+void rocksdb_options_set_blob_gc_force_threshold(rocksdb_options_t* opt,
+ double val) {
+ opt->rep.blob_garbage_collection_force_threshold = val;
+}
+
+double rocksdb_options_get_blob_gc_force_threshold(rocksdb_options_t* opt) {
+ return opt->rep.blob_garbage_collection_force_threshold;
+}
+
+void rocksdb_options_set_blob_compaction_readahead_size(rocksdb_options_t* opt,
+ uint64_t val) {
+ opt->rep.blob_compaction_readahead_size = val;
+}
+
+uint64_t rocksdb_options_get_blob_compaction_readahead_size(
+ rocksdb_options_t* opt) {
+ return opt->rep.blob_compaction_readahead_size;
+}
+
+void rocksdb_options_set_blob_file_starting_level(rocksdb_options_t* opt,
+ int val) {
+ opt->rep.blob_file_starting_level = val;
+}
+
+int rocksdb_options_get_blob_file_starting_level(rocksdb_options_t* opt) {
+ return opt->rep.blob_file_starting_level;
+}
+
+void rocksdb_options_set_blob_cache(rocksdb_options_t* opt,
+ rocksdb_cache_t* blob_cache) {
+ opt->rep.blob_cache = blob_cache->rep;
+}
+
+void rocksdb_options_set_prepopulate_blob_cache(rocksdb_options_t* opt, int t) {
+ opt->rep.prepopulate_blob_cache = static_cast<PrepopulateBlobCache>(t);
+}
+
+int rocksdb_options_get_prepopulate_blob_cache(rocksdb_options_t* opt) {
+ return static_cast<int>(opt->rep.prepopulate_blob_cache);
+}
+
+void rocksdb_options_set_num_levels(rocksdb_options_t* opt, int n) {
+ opt->rep.num_levels = n;
+}
+
+int rocksdb_options_get_num_levels(rocksdb_options_t* opt) {
+ return opt->rep.num_levels;
+}
+
+void rocksdb_options_set_level0_file_num_compaction_trigger(
+ rocksdb_options_t* opt, int n) {
+ opt->rep.level0_file_num_compaction_trigger = n;
+}
+
+int rocksdb_options_get_level0_file_num_compaction_trigger(
+ rocksdb_options_t* opt) {
+ return opt->rep.level0_file_num_compaction_trigger;
+}
+
+void rocksdb_options_set_level0_slowdown_writes_trigger(rocksdb_options_t* opt,
+ int n) {
+ opt->rep.level0_slowdown_writes_trigger = n;
+}
+
+int rocksdb_options_get_level0_slowdown_writes_trigger(rocksdb_options_t* opt) {
+ return opt->rep.level0_slowdown_writes_trigger;
+}
+
+void rocksdb_options_set_level0_stop_writes_trigger(rocksdb_options_t* opt,
+ int n) {
+ opt->rep.level0_stop_writes_trigger = n;
+}
+
+int rocksdb_options_get_level0_stop_writes_trigger(rocksdb_options_t* opt) {
+ return opt->rep.level0_stop_writes_trigger;
+}
+
+void rocksdb_options_set_wal_recovery_mode(rocksdb_options_t* opt, int mode) {
+ opt->rep.wal_recovery_mode = static_cast<WALRecoveryMode>(mode);
+}
+
+int rocksdb_options_get_wal_recovery_mode(rocksdb_options_t* opt) {
+ return static_cast<int>(opt->rep.wal_recovery_mode);
+}
+
+void rocksdb_options_set_compression(rocksdb_options_t* opt, int t) {
+ opt->rep.compression = static_cast<CompressionType>(t);
+}
+
+int rocksdb_options_get_compression(rocksdb_options_t* opt) {
+ return opt->rep.compression;
+}
+
+void rocksdb_options_set_bottommost_compression(rocksdb_options_t* opt, int t) {
+ opt->rep.bottommost_compression = static_cast<CompressionType>(t);
+}
+
+int rocksdb_options_get_bottommost_compression(rocksdb_options_t* opt) {
+ return opt->rep.bottommost_compression;
+}
+
+void rocksdb_options_set_compression_per_level(rocksdb_options_t* opt,
+ const int* level_values,
+ size_t num_levels) {
+ opt->rep.compression_per_level.resize(num_levels);
+ for (size_t i = 0; i < num_levels; ++i) {
+ opt->rep.compression_per_level[i] =
+ static_cast<CompressionType>(level_values[i]);
+ }
+}
+
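+// compression_per_level takes one CompressionType value per level, indexed
+// from L0 upward. Illustrative sketch, assuming an existing `opts` handle and
+// the rocksdb_*_compression constants from the public C header:
+//
+//   const int per_level[] = {rocksdb_no_compression, rocksdb_no_compression,
+//                            rocksdb_lz4_compression,
+//                            rocksdb_zstd_compression};
+//   rocksdb_options_set_compression_per_level(opts, per_level, 4);
+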
+void rocksdb_options_set_bottommost_compression_options(rocksdb_options_t* opt,
+ int w_bits, int level,
+ int strategy,
+ int max_dict_bytes,
+ unsigned char enabled) {
+ opt->rep.bottommost_compression_opts.window_bits = w_bits;
+ opt->rep.bottommost_compression_opts.level = level;
+ opt->rep.bottommost_compression_opts.strategy = strategy;
+ opt->rep.bottommost_compression_opts.max_dict_bytes = max_dict_bytes;
+ opt->rep.bottommost_compression_opts.enabled = enabled;
+}
+
+void rocksdb_options_set_bottommost_compression_options_zstd_max_train_bytes(
+ rocksdb_options_t* opt, int zstd_max_train_bytes, unsigned char enabled) {
+ opt->rep.bottommost_compression_opts.zstd_max_train_bytes =
+ zstd_max_train_bytes;
+ opt->rep.bottommost_compression_opts.enabled = enabled;
+}
+
+void rocksdb_options_set_bottommost_compression_options_use_zstd_dict_trainer(
+ rocksdb_options_t* opt, unsigned char use_zstd_dict_trainer,
+ unsigned char enabled) {
+ opt->rep.bottommost_compression_opts.use_zstd_dict_trainer =
+ use_zstd_dict_trainer;
+ opt->rep.bottommost_compression_opts.enabled = enabled;
+}
+
+unsigned char
+rocksdb_options_get_bottommost_compression_options_use_zstd_dict_trainer(
+ rocksdb_options_t* opt) {
+ return opt->rep.bottommost_compression_opts.use_zstd_dict_trainer;
+}
+
+void rocksdb_options_set_bottommost_compression_options_max_dict_buffer_bytes(
+ rocksdb_options_t* opt, uint64_t max_dict_buffer_bytes,
+ unsigned char enabled) {
+ opt->rep.bottommost_compression_opts.max_dict_buffer_bytes =
+ max_dict_buffer_bytes;
+ opt->rep.bottommost_compression_opts.enabled = enabled;
+}
+
+void rocksdb_options_set_compression_options(rocksdb_options_t* opt, int w_bits,
+ int level, int strategy,
+ int max_dict_bytes) {
+ opt->rep.compression_opts.window_bits = w_bits;
+ opt->rep.compression_opts.level = level;
+ opt->rep.compression_opts.strategy = strategy;
+ opt->rep.compression_opts.max_dict_bytes = max_dict_bytes;
+}
+
+void rocksdb_options_set_compression_options_zstd_max_train_bytes(
+ rocksdb_options_t* opt, int zstd_max_train_bytes) {
+ opt->rep.compression_opts.zstd_max_train_bytes = zstd_max_train_bytes;
+}
+
+int rocksdb_options_get_compression_options_zstd_max_train_bytes(
+ rocksdb_options_t* opt) {
+ return opt->rep.compression_opts.zstd_max_train_bytes;
+}
+
+void rocksdb_options_set_compression_options_use_zstd_dict_trainer(
+ rocksdb_options_t* opt, unsigned char use_zstd_dict_trainer) {
+ opt->rep.compression_opts.use_zstd_dict_trainer = use_zstd_dict_trainer;
+}
+
+unsigned char rocksdb_options_get_compression_options_use_zstd_dict_trainer(
+ rocksdb_options_t* opt) {
+ return opt->rep.compression_opts.use_zstd_dict_trainer;
+}
+
+void rocksdb_options_set_compression_options_parallel_threads(
+ rocksdb_options_t* opt, int value) {
+ opt->rep.compression_opts.parallel_threads = value;
+}
+
+int rocksdb_options_get_compression_options_parallel_threads(
+ rocksdb_options_t* opt) {
+ return opt->rep.compression_opts.parallel_threads;
+}
+
+void rocksdb_options_set_compression_options_max_dict_buffer_bytes(
+ rocksdb_options_t* opt, uint64_t max_dict_buffer_bytes) {
+ opt->rep.compression_opts.max_dict_buffer_bytes = max_dict_buffer_bytes;
+}
+
+uint64_t rocksdb_options_get_compression_options_max_dict_buffer_bytes(
+ rocksdb_options_t* opt) {
+ return opt->rep.compression_opts.max_dict_buffer_bytes;
+}
+
+void rocksdb_options_set_prefix_extractor(
+ rocksdb_options_t* opt, rocksdb_slicetransform_t* prefix_extractor) {
+ opt->rep.prefix_extractor.reset(prefix_extractor);
+}
+
+void rocksdb_options_set_use_fsync(rocksdb_options_t* opt, int use_fsync) {
+ opt->rep.use_fsync = use_fsync;
+}
+
+int rocksdb_options_get_use_fsync(rocksdb_options_t* opt) {
+ return opt->rep.use_fsync;
+}
+
+void rocksdb_options_set_db_log_dir(rocksdb_options_t* opt,
+ const char* db_log_dir) {
+ opt->rep.db_log_dir = db_log_dir;
+}
+
+void rocksdb_options_set_wal_dir(rocksdb_options_t* opt, const char* v) {
+ opt->rep.wal_dir = v;
+}
+
+void rocksdb_options_set_WAL_ttl_seconds(rocksdb_options_t* opt, uint64_t ttl) {
+ opt->rep.WAL_ttl_seconds = ttl;
+}
+
+uint64_t rocksdb_options_get_WAL_ttl_seconds(rocksdb_options_t* opt) {
+ return opt->rep.WAL_ttl_seconds;
+}
+
+void rocksdb_options_set_WAL_size_limit_MB(rocksdb_options_t* opt,
+ uint64_t limit) {
+ opt->rep.WAL_size_limit_MB = limit;
+}
+
+uint64_t rocksdb_options_get_WAL_size_limit_MB(rocksdb_options_t* opt) {
+ return opt->rep.WAL_size_limit_MB;
+}
+
+void rocksdb_options_set_manifest_preallocation_size(rocksdb_options_t* opt,
+ size_t v) {
+ opt->rep.manifest_preallocation_size = v;
+}
+
+size_t rocksdb_options_get_manifest_preallocation_size(rocksdb_options_t* opt) {
+ return opt->rep.manifest_preallocation_size;
+}
+
+void rocksdb_options_set_use_direct_reads(rocksdb_options_t* opt,
+ unsigned char v) {
+ opt->rep.use_direct_reads = v;
+}
+
+unsigned char rocksdb_options_get_use_direct_reads(rocksdb_options_t* opt) {
+ return opt->rep.use_direct_reads;
+}
+
+void rocksdb_options_set_use_direct_io_for_flush_and_compaction(
+ rocksdb_options_t* opt, unsigned char v) {
+ opt->rep.use_direct_io_for_flush_and_compaction = v;
+}
+
+unsigned char rocksdb_options_get_use_direct_io_for_flush_and_compaction(
+ rocksdb_options_t* opt) {
+ return opt->rep.use_direct_io_for_flush_and_compaction;
+}
+
+void rocksdb_options_set_allow_mmap_reads(rocksdb_options_t* opt,
+ unsigned char v) {
+ opt->rep.allow_mmap_reads = v;
+}
+
+unsigned char rocksdb_options_get_allow_mmap_reads(rocksdb_options_t* opt) {
+ return opt->rep.allow_mmap_reads;
+}
+
+void rocksdb_options_set_allow_mmap_writes(rocksdb_options_t* opt,
+ unsigned char v) {
+ opt->rep.allow_mmap_writes = v;
+}
+
+unsigned char rocksdb_options_get_allow_mmap_writes(rocksdb_options_t* opt) {
+ return opt->rep.allow_mmap_writes;
+}
+
+void rocksdb_options_set_is_fd_close_on_exec(rocksdb_options_t* opt,
+ unsigned char v) {
+ opt->rep.is_fd_close_on_exec = v;
+}
+
+unsigned char rocksdb_options_get_is_fd_close_on_exec(rocksdb_options_t* opt) {
+ return opt->rep.is_fd_close_on_exec;
+}
+
+void rocksdb_options_set_stats_dump_period_sec(rocksdb_options_t* opt,
+ unsigned int v) {
+ opt->rep.stats_dump_period_sec = v;
+}
+
+unsigned int rocksdb_options_get_stats_dump_period_sec(rocksdb_options_t* opt) {
+ return opt->rep.stats_dump_period_sec;
+}
+
+void rocksdb_options_set_stats_persist_period_sec(rocksdb_options_t* opt,
+ unsigned int v) {
+ opt->rep.stats_persist_period_sec = v;
+}
+
+unsigned int rocksdb_options_get_stats_persist_period_sec(
+ rocksdb_options_t* opt) {
+ return opt->rep.stats_persist_period_sec;
+}
+
+void rocksdb_options_set_advise_random_on_open(rocksdb_options_t* opt,
+ unsigned char v) {
+ opt->rep.advise_random_on_open = v;
+}
+
+unsigned char rocksdb_options_get_advise_random_on_open(
+ rocksdb_options_t* opt) {
+ return opt->rep.advise_random_on_open;
+}
+
+void rocksdb_options_set_access_hint_on_compaction_start(rocksdb_options_t* opt,
+ int v) {
+ switch (v) {
+ case 0:
+ opt->rep.access_hint_on_compaction_start =
+ ROCKSDB_NAMESPACE::Options::NONE;
+ break;
+ case 1:
+ opt->rep.access_hint_on_compaction_start =
+ ROCKSDB_NAMESPACE::Options::NORMAL;
+ break;
+ case 2:
+ opt->rep.access_hint_on_compaction_start =
+ ROCKSDB_NAMESPACE::Options::SEQUENTIAL;
+ break;
+ case 3:
+ opt->rep.access_hint_on_compaction_start =
+ ROCKSDB_NAMESPACE::Options::WILLNEED;
+ break;
+ default:
+ assert(0);
+ }
+}
+
+int rocksdb_options_get_access_hint_on_compaction_start(
+ rocksdb_options_t* opt) {
+ return opt->rep.access_hint_on_compaction_start;
+}
+
+void rocksdb_options_set_use_adaptive_mutex(rocksdb_options_t* opt,
+ unsigned char v) {
+ opt->rep.use_adaptive_mutex = v;
+}
+
+unsigned char rocksdb_options_get_use_adaptive_mutex(rocksdb_options_t* opt) {
+ return opt->rep.use_adaptive_mutex;
+}
+
+void rocksdb_options_set_wal_bytes_per_sync(rocksdb_options_t* opt,
+ uint64_t v) {
+ opt->rep.wal_bytes_per_sync = v;
+}
+
+uint64_t rocksdb_options_get_wal_bytes_per_sync(rocksdb_options_t* opt) {
+ return opt->rep.wal_bytes_per_sync;
+}
+
+void rocksdb_options_set_bytes_per_sync(rocksdb_options_t* opt, uint64_t v) {
+ opt->rep.bytes_per_sync = v;
+}
+
+uint64_t rocksdb_options_get_bytes_per_sync(rocksdb_options_t* opt) {
+ return opt->rep.bytes_per_sync;
+}
+
+void rocksdb_options_set_writable_file_max_buffer_size(rocksdb_options_t* opt,
+ uint64_t v) {
+ opt->rep.writable_file_max_buffer_size = static_cast<size_t>(v);
+}
+
+uint64_t rocksdb_options_get_writable_file_max_buffer_size(
+ rocksdb_options_t* opt) {
+ return opt->rep.writable_file_max_buffer_size;
+}
+
+void rocksdb_options_set_allow_concurrent_memtable_write(rocksdb_options_t* opt,
+ unsigned char v) {
+ opt->rep.allow_concurrent_memtable_write = v;
+}
+
+unsigned char rocksdb_options_get_allow_concurrent_memtable_write(
+ rocksdb_options_t* opt) {
+ return opt->rep.allow_concurrent_memtable_write;
+}
+
+void rocksdb_options_set_enable_write_thread_adaptive_yield(
+ rocksdb_options_t* opt, unsigned char v) {
+ opt->rep.enable_write_thread_adaptive_yield = v;
+}
+
+unsigned char rocksdb_options_get_enable_write_thread_adaptive_yield(
+ rocksdb_options_t* opt) {
+ return opt->rep.enable_write_thread_adaptive_yield;
+}
+
+void rocksdb_options_set_max_sequential_skip_in_iterations(
+ rocksdb_options_t* opt, uint64_t v) {
+ opt->rep.max_sequential_skip_in_iterations = v;
+}
+
+uint64_t rocksdb_options_get_max_sequential_skip_in_iterations(
+ rocksdb_options_t* opt) {
+ return opt->rep.max_sequential_skip_in_iterations;
+}
+
+void rocksdb_options_set_max_write_buffer_number(rocksdb_options_t* opt,
+ int n) {
+ opt->rep.max_write_buffer_number = n;
+}
+
+int rocksdb_options_get_max_write_buffer_number(rocksdb_options_t* opt) {
+ return opt->rep.max_write_buffer_number;
+}
+
+void rocksdb_options_set_min_write_buffer_number_to_merge(
+ rocksdb_options_t* opt, int n) {
+ opt->rep.min_write_buffer_number_to_merge = n;
+}
+
+int rocksdb_options_get_min_write_buffer_number_to_merge(
+ rocksdb_options_t* opt) {
+ return opt->rep.min_write_buffer_number_to_merge;
+}
+
+void rocksdb_options_set_max_write_buffer_number_to_maintain(
+ rocksdb_options_t* opt, int n) {
+ opt->rep.max_write_buffer_number_to_maintain = n;
+}
+
+int rocksdb_options_get_max_write_buffer_number_to_maintain(
+ rocksdb_options_t* opt) {
+ return opt->rep.max_write_buffer_number_to_maintain;
+}
+
+void rocksdb_options_set_max_write_buffer_size_to_maintain(
+ rocksdb_options_t* opt, int64_t n) {
+ opt->rep.max_write_buffer_size_to_maintain = n;
+}
+
+int64_t rocksdb_options_get_max_write_buffer_size_to_maintain(
+ rocksdb_options_t* opt) {
+ return opt->rep.max_write_buffer_size_to_maintain;
+}
+
+void rocksdb_options_set_enable_pipelined_write(rocksdb_options_t* opt,
+ unsigned char v) {
+ opt->rep.enable_pipelined_write = v;
+}
+
+unsigned char rocksdb_options_get_enable_pipelined_write(
+ rocksdb_options_t* opt) {
+ return opt->rep.enable_pipelined_write;
+}
+
+void rocksdb_options_set_unordered_write(rocksdb_options_t* opt,
+ unsigned char v) {
+ opt->rep.unordered_write = v;
+}
+
+unsigned char rocksdb_options_get_unordered_write(rocksdb_options_t* opt) {
+ return opt->rep.unordered_write;
+}
+
+void rocksdb_options_set_max_subcompactions(rocksdb_options_t* opt,
+ uint32_t n) {
+ opt->rep.max_subcompactions = n;
+}
+
+uint32_t rocksdb_options_get_max_subcompactions(rocksdb_options_t* opt) {
+ return opt->rep.max_subcompactions;
+}
+
+void rocksdb_options_set_max_background_jobs(rocksdb_options_t* opt, int n) {
+ opt->rep.max_background_jobs = n;
+}
+
+int rocksdb_options_get_max_background_jobs(rocksdb_options_t* opt) {
+ return opt->rep.max_background_jobs;
+}
+
+void rocksdb_options_set_max_background_compactions(rocksdb_options_t* opt,
+ int n) {
+ opt->rep.max_background_compactions = n;
+}
+
+int rocksdb_options_get_max_background_compactions(rocksdb_options_t* opt) {
+ return opt->rep.max_background_compactions;
+}
+
+void rocksdb_options_set_max_background_flushes(rocksdb_options_t* opt, int n) {
+ opt->rep.max_background_flushes = n;
+}
+
+int rocksdb_options_get_max_background_flushes(rocksdb_options_t* opt) {
+ return opt->rep.max_background_flushes;
+}
+
+void rocksdb_options_set_experimental_mempurge_threshold(rocksdb_options_t* opt,
+ double v) {
+ opt->rep.experimental_mempurge_threshold = v;
+}
+
+double rocksdb_options_get_experimental_mempurge_threshold(
+ rocksdb_options_t* opt) {
+ return opt->rep.experimental_mempurge_threshold;
+}
+
+void rocksdb_options_set_max_log_file_size(rocksdb_options_t* opt, size_t v) {
+ opt->rep.max_log_file_size = v;
+}
+
+size_t rocksdb_options_get_max_log_file_size(rocksdb_options_t* opt) {
+ return opt->rep.max_log_file_size;
+}
+
+void rocksdb_options_set_log_file_time_to_roll(rocksdb_options_t* opt,
+ size_t v) {
+ opt->rep.log_file_time_to_roll = v;
+}
+
+size_t rocksdb_options_get_log_file_time_to_roll(rocksdb_options_t* opt) {
+ return opt->rep.log_file_time_to_roll;
+}
+
+void rocksdb_options_set_keep_log_file_num(rocksdb_options_t* opt, size_t v) {
+ opt->rep.keep_log_file_num = v;
+}
+
+size_t rocksdb_options_get_keep_log_file_num(rocksdb_options_t* opt) {
+ return opt->rep.keep_log_file_num;
+}
+
+void rocksdb_options_set_recycle_log_file_num(rocksdb_options_t* opt,
+ size_t v) {
+ opt->rep.recycle_log_file_num = v;
+}
+
+size_t rocksdb_options_get_recycle_log_file_num(rocksdb_options_t* opt) {
+ return opt->rep.recycle_log_file_num;
+}
+
+void rocksdb_options_set_soft_pending_compaction_bytes_limit(
+ rocksdb_options_t* opt, size_t v) {
+ opt->rep.soft_pending_compaction_bytes_limit = v;
+}
+
+size_t rocksdb_options_get_soft_pending_compaction_bytes_limit(
+ rocksdb_options_t* opt) {
+ return opt->rep.soft_pending_compaction_bytes_limit;
+}
+
+void rocksdb_options_set_hard_pending_compaction_bytes_limit(
+ rocksdb_options_t* opt, size_t v) {
+ opt->rep.hard_pending_compaction_bytes_limit = v;
+}
+
+size_t rocksdb_options_get_hard_pending_compaction_bytes_limit(
+ rocksdb_options_t* opt) {
+ return opt->rep.hard_pending_compaction_bytes_limit;
+}
+
+void rocksdb_options_set_max_manifest_file_size(rocksdb_options_t* opt,
+ size_t v) {
+ opt->rep.max_manifest_file_size = v;
+}
+
+size_t rocksdb_options_get_max_manifest_file_size(rocksdb_options_t* opt) {
+ return opt->rep.max_manifest_file_size;
+}
+
+void rocksdb_options_set_table_cache_numshardbits(rocksdb_options_t* opt,
+ int v) {
+ opt->rep.table_cache_numshardbits = v;
+}
+
+int rocksdb_options_get_table_cache_numshardbits(rocksdb_options_t* opt) {
+ return opt->rep.table_cache_numshardbits;
+}
+
+void rocksdb_options_set_arena_block_size(rocksdb_options_t* opt, size_t v) {
+ opt->rep.arena_block_size = v;
+}
+
+size_t rocksdb_options_get_arena_block_size(rocksdb_options_t* opt) {
+ return opt->rep.arena_block_size;
+}
+
+void rocksdb_options_set_disable_auto_compactions(rocksdb_options_t* opt,
+ int disable) {
+ opt->rep.disable_auto_compactions = disable;
+}
+
+unsigned char rocksdb_options_get_disable_auto_compactions(
+ rocksdb_options_t* opt) {
+ return opt->rep.disable_auto_compactions;
+}
+
+void rocksdb_options_set_optimize_filters_for_hits(rocksdb_options_t* opt,
+ int v) {
+ opt->rep.optimize_filters_for_hits = v;
+}
+
+unsigned char rocksdb_options_get_optimize_filters_for_hits(
+ rocksdb_options_t* opt) {
+ return opt->rep.optimize_filters_for_hits;
+}
+
+void rocksdb_options_set_delete_obsolete_files_period_micros(
+ rocksdb_options_t* opt, uint64_t v) {
+ opt->rep.delete_obsolete_files_period_micros = v;
+}
+
+uint64_t rocksdb_options_get_delete_obsolete_files_period_micros(
+ rocksdb_options_t* opt) {
+ return opt->rep.delete_obsolete_files_period_micros;
+}
+
+void rocksdb_options_prepare_for_bulk_load(rocksdb_options_t* opt) {
+ opt->rep.PrepareForBulkLoad();
+}
+
+void rocksdb_options_set_memtable_vector_rep(rocksdb_options_t* opt) {
+ opt->rep.memtable_factory.reset(new ROCKSDB_NAMESPACE::VectorRepFactory);
+}
+
+void rocksdb_options_set_memtable_prefix_bloom_size_ratio(
+ rocksdb_options_t* opt, double v) {
+ opt->rep.memtable_prefix_bloom_size_ratio = v;
+}
+
+double rocksdb_options_get_memtable_prefix_bloom_size_ratio(
+ rocksdb_options_t* opt) {
+ return opt->rep.memtable_prefix_bloom_size_ratio;
+}
+
+void rocksdb_options_set_memtable_huge_page_size(rocksdb_options_t* opt,
+ size_t v) {
+ opt->rep.memtable_huge_page_size = v;
+}
+
+size_t rocksdb_options_get_memtable_huge_page_size(rocksdb_options_t* opt) {
+ return opt->rep.memtable_huge_page_size;
+}
+
+void rocksdb_options_set_hash_skip_list_rep(rocksdb_options_t* opt,
+ size_t bucket_count,
+ int32_t skiplist_height,
+ int32_t skiplist_branching_factor) {
+ ROCKSDB_NAMESPACE::MemTableRepFactory* factory =
+ ROCKSDB_NAMESPACE::NewHashSkipListRepFactory(
+ bucket_count, skiplist_height, skiplist_branching_factor);
+ opt->rep.memtable_factory.reset(factory);
+}
+
+void rocksdb_options_set_hash_link_list_rep(rocksdb_options_t* opt,
+ size_t bucket_count) {
+ opt->rep.memtable_factory.reset(
+ ROCKSDB_NAMESPACE::NewHashLinkListRepFactory(bucket_count));
+}
+
+void rocksdb_options_set_plain_table_factory(rocksdb_options_t* opt,
+ uint32_t user_key_len,
+ int bloom_bits_per_key,
+ double hash_table_ratio,
+ size_t index_sparseness) {
+ ROCKSDB_NAMESPACE::PlainTableOptions options;
+ options.user_key_len = user_key_len;
+ options.bloom_bits_per_key = bloom_bits_per_key;
+ options.hash_table_ratio = hash_table_ratio;
+ options.index_sparseness = index_sparseness;
+
+ ROCKSDB_NAMESPACE::TableFactory* factory =
+ ROCKSDB_NAMESPACE::NewPlainTableFactory(options);
+ opt->rep.table_factory.reset(factory);
+}
+
+void rocksdb_options_set_max_successive_merges(rocksdb_options_t* opt,
+ size_t v) {
+ opt->rep.max_successive_merges = v;
+}
+
+size_t rocksdb_options_get_max_successive_merges(rocksdb_options_t* opt) {
+ return opt->rep.max_successive_merges;
+}
+
+void rocksdb_options_set_bloom_locality(rocksdb_options_t* opt, uint32_t v) {
+ opt->rep.bloom_locality = v;
+}
+
+uint32_t rocksdb_options_get_bloom_locality(rocksdb_options_t* opt) {
+ return opt->rep.bloom_locality;
+}
+
+void rocksdb_options_set_inplace_update_support(rocksdb_options_t* opt,
+ unsigned char v) {
+ opt->rep.inplace_update_support = v;
+}
+
+unsigned char rocksdb_options_get_inplace_update_support(
+ rocksdb_options_t* opt) {
+ return opt->rep.inplace_update_support;
+}
+
+void rocksdb_options_set_inplace_update_num_locks(rocksdb_options_t* opt,
+ size_t v) {
+ opt->rep.inplace_update_num_locks = v;
+}
+
+size_t rocksdb_options_get_inplace_update_num_locks(rocksdb_options_t* opt) {
+ return opt->rep.inplace_update_num_locks;
+}
+
+void rocksdb_options_set_report_bg_io_stats(rocksdb_options_t* opt, int v) {
+ opt->rep.report_bg_io_stats = v;
+}
+
+unsigned char rocksdb_options_get_report_bg_io_stats(rocksdb_options_t* opt) {
+ return opt->rep.report_bg_io_stats;
+}
+
+void rocksdb_options_set_compaction_style(rocksdb_options_t* opt, int style) {
+ opt->rep.compaction_style =
+ static_cast<ROCKSDB_NAMESPACE::CompactionStyle>(style);
+}
+
+int rocksdb_options_get_compaction_style(rocksdb_options_t* opt) {
+ return opt->rep.compaction_style;
+}
+
+void rocksdb_options_set_universal_compaction_options(
+ rocksdb_options_t* opt, rocksdb_universal_compaction_options_t* uco) {
+ opt->rep.compaction_options_universal = *(uco->rep);
+}
+
+void rocksdb_options_set_fifo_compaction_options(
+ rocksdb_options_t* opt, rocksdb_fifo_compaction_options_t* fifo) {
+ opt->rep.compaction_options_fifo = fifo->rep;
+}
+
+char* rocksdb_options_statistics_get_string(rocksdb_options_t* opt) {
+ ROCKSDB_NAMESPACE::Statistics* statistics = opt->rep.statistics.get();
+ if (statistics) {
+ return strdup(statistics->ToString().c_str());
+ }
+ return nullptr;
+}
+
+void rocksdb_options_set_ratelimiter(rocksdb_options_t* opt,
+ rocksdb_ratelimiter_t* limiter) {
+ if (limiter) {
+ opt->rep.rate_limiter = limiter->rep;
+ }
+}
+
+void rocksdb_options_set_atomic_flush(rocksdb_options_t* opt,
+ unsigned char atomic_flush) {
+ opt->rep.atomic_flush = atomic_flush;
+}
+
+unsigned char rocksdb_options_get_atomic_flush(rocksdb_options_t* opt) {
+ return opt->rep.atomic_flush;
+}
+
+void rocksdb_options_set_manual_wal_flush(rocksdb_options_t* opt,
+ unsigned char manual_wal_flush) {
+ opt->rep.manual_wal_flush = manual_wal_flush;
+}
+
+unsigned char rocksdb_options_get_manual_wal_flush(rocksdb_options_t* opt) {
+ return opt->rep.manual_wal_flush;
+}
+
+void rocksdb_options_set_wal_compression(rocksdb_options_t* opt, int val) {
+ opt->rep.wal_compression = static_cast<CompressionType>(val);
+}
+
+int rocksdb_options_get_wal_compression(rocksdb_options_t* opt) {
+ return opt->rep.wal_compression;
+}
+
+rocksdb_ratelimiter_t* rocksdb_ratelimiter_create(int64_t rate_bytes_per_sec,
+ int64_t refill_period_us,
+ int32_t fairness) {
+ rocksdb_ratelimiter_t* rate_limiter = new rocksdb_ratelimiter_t;
+ rate_limiter->rep.reset(
+ NewGenericRateLimiter(rate_bytes_per_sec, refill_period_us, fairness));
+ return rate_limiter;
+}
+
+void rocksdb_ratelimiter_destroy(rocksdb_ratelimiter_t* limiter) {
+ delete limiter;
+}
+
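+// The rate limiter is held through a shared_ptr, so the C wrapper can be
+// destroyed once it has been attached to the options. Illustrative sketch
+// (16 MB/s budget, 100 ms refill period, fairness 10), assuming an existing
+// `opts` handle:
+//
+//   rocksdb_ratelimiter_t* rl =
+//       rocksdb_ratelimiter_create(16 * 1024 * 1024, 100 * 1000, 10);
+//   rocksdb_options_set_ratelimiter(opts, rl);
+//   rocksdb_ratelimiter_destroy(rl);
+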
+void rocksdb_options_set_row_cache(rocksdb_options_t* opt,
+ rocksdb_cache_t* cache) {
+ if (cache) {
+ opt->rep.row_cache = cache->rep;
+ }
+}
+
+void rocksdb_options_add_compact_on_deletion_collector_factory(
+ rocksdb_options_t* opt, size_t window_size, size_t num_dels_trigger) {
+ std::shared_ptr<ROCKSDB_NAMESPACE::TablePropertiesCollectorFactory>
+ compact_on_del =
+ NewCompactOnDeletionCollectorFactory(window_size, num_dels_trigger);
+ opt->rep.table_properties_collector_factories.emplace_back(compact_on_del);
+}
+
+void rocksdb_set_perf_level(int v) {
+ PerfLevel level = static_cast<PerfLevel>(v);
+ SetPerfLevel(level);
+}
+
+rocksdb_perfcontext_t* rocksdb_perfcontext_create() {
+ rocksdb_perfcontext_t* context = new rocksdb_perfcontext_t;
+ context->rep = ROCKSDB_NAMESPACE::get_perf_context();
+ return context;
+}
+
+void rocksdb_perfcontext_reset(rocksdb_perfcontext_t* context) {
+ context->rep->Reset();
+}
+
+char* rocksdb_perfcontext_report(rocksdb_perfcontext_t* context,
+ unsigned char exclude_zero_counters) {
+ return strdup(context->rep->ToString(exclude_zero_counters).c_str());
+}
+
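+// Typical perf-context flow with the wrappers above: raise the perf level,
+// reset the thread-local context, run the workload, then read counters back
+// as a formatted report or per metric. Illustrative sketch; the perf-level
+// and metric constants come from the public C header, and
+// rocksdb_perfcontext_destroy / rocksdb_free are assumed to be defined
+// elsewhere in this file:
+//
+//   rocksdb_set_perf_level(rocksdb_enable_time_except_for_mutex);
+//   rocksdb_perfcontext_t* ctx = rocksdb_perfcontext_create();
+//   rocksdb_perfcontext_reset(ctx);
+//   /* ... reads and writes on this thread ... */
+//   uint64_t reads = rocksdb_perfcontext_metric(ctx, rocksdb_block_read_count);
+//   char* report = rocksdb_perfcontext_report(ctx, 1 /* exclude zeros */);
+//   rocksdb_free(report);
+//   rocksdb_perfcontext_destroy(ctx);
+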
+uint64_t rocksdb_perfcontext_metric(rocksdb_perfcontext_t* context,
+ int metric) {
+ PerfContext* rep = context->rep;
+ switch (metric) {
+ case rocksdb_user_key_comparison_count:
+ return rep->user_key_comparison_count;
+ case rocksdb_block_cache_hit_count:
+ return rep->block_cache_hit_count;
+ case rocksdb_block_read_count:
+ return rep->block_read_count;
+ case rocksdb_block_read_byte:
+ return rep->block_read_byte;
+ case rocksdb_block_read_time:
+ return rep->block_read_time;
+ case rocksdb_block_checksum_time:
+ return rep->block_checksum_time;
+ case rocksdb_block_decompress_time:
+ return rep->block_decompress_time;
+ case rocksdb_get_read_bytes:
+ return rep->get_read_bytes;
+ case rocksdb_multiget_read_bytes:
+ return rep->multiget_read_bytes;
+ case rocksdb_iter_read_bytes:
+ return rep->iter_read_bytes;
+ case rocksdb_internal_key_skipped_count:
+ return rep->internal_key_skipped_count;
+ case rocksdb_internal_delete_skipped_count:
+ return rep->internal_delete_skipped_count;
+ case rocksdb_internal_recent_skipped_count:
+ return rep->internal_recent_skipped_count;
+ case rocksdb_internal_merge_count:
+ return rep->internal_merge_count;
+ case rocksdb_get_snapshot_time:
+ return rep->get_snapshot_time;
+ case rocksdb_get_from_memtable_time:
+ return rep->get_from_memtable_time;
+ case rocksdb_get_from_memtable_count:
+ return rep->get_from_memtable_count;
+ case rocksdb_get_post_process_time:
+ return rep->get_post_process_time;
+ case rocksdb_get_from_output_files_time:
+ return rep->get_from_output_files_time;
+ case rocksdb_seek_on_memtable_time:
+ return rep->seek_on_memtable_time;
+ case rocksdb_seek_on_memtable_count:
+ return rep->seek_on_memtable_count;
+ case rocksdb_next_on_memtable_count:
+ return rep->next_on_memtable_count;
+ case rocksdb_prev_on_memtable_count:
+ return rep->prev_on_memtable_count;
+ case rocksdb_seek_child_seek_time:
+ return rep->seek_child_seek_time;
+ case rocksdb_seek_child_seek_count:
+ return rep->seek_child_seek_count;
+ case rocksdb_seek_min_heap_time:
+ return rep->seek_min_heap_time;
+ case rocksdb_seek_max_heap_time:
+ return rep->seek_max_heap_time;
+ case rocksdb_seek_internal_seek_time:
+ return rep->seek_internal_seek_time;
+ case rocksdb_find_next_user_entry_time:
+ return rep->find_next_user_entry_time;
+ case rocksdb_write_wal_time:
+ return rep->write_wal_time;
+ case rocksdb_write_memtable_time:
+ return rep->write_memtable_time;
+ case rocksdb_write_delay_time:
+ return rep->write_delay_time;
+ case rocksdb_write_pre_and_post_process_time:
+ return rep->write_pre_and_post_process_time;
+ case rocksdb_db_mutex_lock_nanos:
+ return rep->db_mutex_lock_nanos;
+ case rocksdb_db_condition_wait_nanos:
+ return rep->db_condition_wait_nanos;
+ case rocksdb_merge_operator_time_nanos:
+ return rep->merge_operator_time_nanos;
+ case rocksdb_read_index_block_nanos:
+ return rep->read_index_block_nanos;
+ case rocksdb_read_filter_block_nanos:
+ return rep->read_filter_block_nanos;
+ case rocksdb_new_table_block_iter_nanos:
+ return rep->new_table_block_iter_nanos;
+ case rocksdb_new_table_iterator_nanos:
+ return rep->new_table_iterator_nanos;
+ case rocksdb_block_seek_nanos:
+ return rep->block_seek_nanos;
+ case rocksdb_find_table_nanos:
+ return rep->find_table_nanos;
+ case rocksdb_bloom_memtable_hit_count:
+ return rep->bloom_memtable_hit_count;
+ case rocksdb_bloom_memtable_miss_count:
+ return rep->bloom_memtable_miss_count;
+ case rocksdb_bloom_sst_hit_count:
+ return rep->bloom_sst_hit_count;
+ case rocksdb_bloom_sst_miss_count:
+ return rep->bloom_sst_miss_count;
+ case rocksdb_key_lock_wait_time:
+ return rep->key_lock_wait_time;
+ case rocksdb_key_lock_wait_count:
+ return rep->key_lock_wait_count;
+ case rocksdb_env_new_sequential_file_nanos:
+ return rep->env_new_sequential_file_nanos;
+ case rocksdb_env_new_random_access_file_nanos:
+ return rep->env_new_random_access_file_nanos;
+ case rocksdb_env_new_writable_file_nanos:
+ return rep->env_new_writable_file_nanos;
+ case rocksdb_env_reuse_writable_file_nanos:
+ return rep->env_reuse_writable_file_nanos;
+ case rocksdb_env_new_random_rw_file_nanos:
+ return rep->env_new_random_rw_file_nanos;
+ case rocksdb_env_new_directory_nanos:
+ return rep->env_new_directory_nanos;
+ case rocksdb_env_file_exists_nanos:
+ return rep->env_file_exists_nanos;
+ case rocksdb_env_get_children_nanos:
+ return rep->env_get_children_nanos;
+ case rocksdb_env_get_children_file_attributes_nanos:
+ return rep->env_get_children_file_attributes_nanos;
+ case rocksdb_env_delete_file_nanos:
+ return rep->env_delete_file_nanos;
+ case rocksdb_env_create_dir_nanos:
+ return rep->env_create_dir_nanos;
+ case rocksdb_env_create_dir_if_missing_nanos:
+ return rep->env_create_dir_if_missing_nanos;
+ case rocksdb_env_delete_dir_nanos:
+ return rep->env_delete_dir_nanos;
+ case rocksdb_env_get_file_size_nanos:
+ return rep->env_get_file_size_nanos;
+ case rocksdb_env_get_file_modification_time_nanos:
+ return rep->env_get_file_modification_time_nanos;
+ case rocksdb_env_rename_file_nanos:
+ return rep->env_rename_file_nanos;
+ case rocksdb_env_link_file_nanos:
+ return rep->env_link_file_nanos;
+ case rocksdb_env_lock_file_nanos:
+ return rep->env_lock_file_nanos;
+ case rocksdb_env_unlock_file_nanos:
+ return rep->env_unlock_file_nanos;
+ case rocksdb_env_new_logger_nanos:
+ return rep->env_new_logger_nanos;
+ case rocksdb_number_async_seek:
+ return rep->number_async_seek;
+ case rocksdb_blob_cache_hit_count:
+ return rep->blob_cache_hit_count;
+ case rocksdb_blob_read_count:
+ return rep->blob_read_count;
+ case rocksdb_blob_read_byte:
+ return rep->blob_read_byte;
+ case rocksdb_blob_read_time:
+ return rep->blob_read_time;
+ case rocksdb_blob_checksum_time:
+ return rep->blob_checksum_time;
+ case rocksdb_blob_decompress_time:
+ return rep->blob_decompress_time;
+ case rocksdb_internal_range_del_reseek_count:
+ return rep->internal_range_del_reseek_count;
+ default:
+ break;
+ }
+ return 0;
+}
+
+void rocksdb_perfcontext_destroy(rocksdb_perfcontext_t* context) {
+ delete context;
+}
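+
+/*
+  Illustrative sketch of the perf-context helpers above (hypothetical `db`,
+  `roptions`, `key`/`keylen`; perf level 2 corresponds to enabling the
+  per-operation counters; error handling omitted):
+
+    rocksdb_set_perf_level(2);
+    rocksdb_perfcontext_t* ctx = rocksdb_perfcontext_create();
+    rocksdb_perfcontext_reset(ctx);
+    size_t vlen = 0;
+    char* err = NULL;
+    char* val = rocksdb_get(db, roptions, key, keylen, &vlen, &err);
+    uint64_t reads = rocksdb_perfcontext_metric(ctx, rocksdb_block_read_count);
+    char* report = rocksdb_perfcontext_report(ctx, 1);
+    free(val);
+    free(report);
+    rocksdb_perfcontext_destroy(ctx);
+*/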
+
+/*
+TODO:
+DB::OpenForReadOnly
+DB::KeyMayExist
+DB::GetOptions
+DB::GetSortedWalFiles
+DB::GetLatestSequenceNumber
+DB::GetUpdatesSince
+DB::GetDbIdentity
+DB::RunManualCompaction
+custom cache
+table_properties_collectors
+*/
+
+rocksdb_compactionfilter_t* rocksdb_compactionfilter_create(
+ void* state, void (*destructor)(void*),
+ unsigned char (*filter)(void*, int level, const char* key,
+ size_t key_length, const char* existing_value,
+ size_t value_length, char** new_value,
+ size_t* new_value_length,
+ unsigned char* value_changed),
+ const char* (*name)(void*)) {
+ rocksdb_compactionfilter_t* result = new rocksdb_compactionfilter_t;
+ result->state_ = state;
+ result->destructor_ = destructor;
+ result->filter_ = filter;
+ result->ignore_snapshots_ = true;
+ result->name_ = name;
+ return result;
+}
+
+void rocksdb_compactionfilter_set_ignore_snapshots(
+ rocksdb_compactionfilter_t* filter, unsigned char whether_ignore) {
+ filter->ignore_snapshots_ = whether_ignore;
+}
+
+void rocksdb_compactionfilter_destroy(rocksdb_compactionfilter_t* filter) {
+ delete filter;
+}
+
+unsigned char rocksdb_compactionfiltercontext_is_full_compaction(
+ rocksdb_compactionfiltercontext_t* context) {
+ return context->rep.is_full_compaction;
+}
+
+unsigned char rocksdb_compactionfiltercontext_is_manual_compaction(
+ rocksdb_compactionfiltercontext_t* context) {
+ return context->rep.is_manual_compaction;
+}
+
+rocksdb_compactionfilterfactory_t* rocksdb_compactionfilterfactory_create(
+ void* state, void (*destructor)(void*),
+ rocksdb_compactionfilter_t* (*create_compaction_filter)(
+ void*, rocksdb_compactionfiltercontext_t* context),
+ const char* (*name)(void*)) {
+ rocksdb_compactionfilterfactory_t* result =
+ new rocksdb_compactionfilterfactory_t;
+ result->state_ = state;
+ result->destructor_ = destructor;
+ result->create_compaction_filter_ = create_compaction_filter;
+ result->name_ = name;
+ return result;
+}
+
+void rocksdb_compactionfilterfactory_destroy(
+ rocksdb_compactionfilterfactory_t* factory) {
+ delete factory;
+}
+
+rocksdb_comparator_t* rocksdb_comparator_create(
+ void* state, void (*destructor)(void*),
+ int (*compare)(void*, const char* a, size_t alen, const char* b,
+ size_t blen),
+ const char* (*name)(void*)) {
+ rocksdb_comparator_t* result = new rocksdb_comparator_t;
+ result->state_ = state;
+ result->destructor_ = destructor;
+ result->compare_ = compare;
+ result->name_ = name;
+ result->compare_ts_ = nullptr;
+ result->compare_without_ts_ = nullptr;
+ return result;
+}
+
+void rocksdb_comparator_destroy(rocksdb_comparator_t* cmp) { delete cmp; }
+
+rocksdb_comparator_t* rocksdb_comparator_with_ts_create(
+ void* state, void (*destructor)(void*),
+ int (*compare)(void*, const char* a, size_t alen, const char* b,
+ size_t blen),
+ int (*compare_ts)(void*, const char* a_ts, size_t a_tslen, const char* b_ts,
+ size_t b_tslen),
+ int (*compare_without_ts)(void*, const char* a, size_t alen,
+ unsigned char a_has_ts, const char* b,
+ size_t blen, unsigned char b_has_ts),
+ const char* (*name)(void*), size_t timestamp_size) {
+ rocksdb_comparator_t* result = new rocksdb_comparator_t(timestamp_size);
+ result->state_ = state;
+ result->destructor_ = destructor;
+ result->compare_ = compare;
+ result->compare_ts_ = compare_ts;
+ result->compare_without_ts_ = compare_without_ts;
+ result->name_ = name;
+ return result;
+}
+
+void rocksdb_filterpolicy_destroy(rocksdb_filterpolicy_t* filter) {
+ delete filter;
+}
+
+rocksdb_filterpolicy_t* rocksdb_filterpolicy_create_bloom_format(
+ double bits_per_key, bool original_format) {
+ // Make a rocksdb_filterpolicy_t, but override all of its methods so
+ // they delegate to a NewBloomFilterPolicy() instead of user
+ // supplied C functions.
+ struct Wrapper : public rocksdb_filterpolicy_t {
+ const FilterPolicy* rep_;
+ ~Wrapper() override { delete rep_; }
+ const char* Name() const override { return rep_->Name(); }
+ const char* CompatibilityName() const override {
+ return rep_->CompatibilityName();
+ }
+ // No need to override GetFilterBitsBuilder if this one is overridden
+ ROCKSDB_NAMESPACE::FilterBitsBuilder* GetBuilderWithContext(
+ const ROCKSDB_NAMESPACE::FilterBuildingContext& context)
+ const override {
+ return rep_->GetBuilderWithContext(context);
+ }
+ ROCKSDB_NAMESPACE::FilterBitsReader* GetFilterBitsReader(
+ const Slice& contents) const override {
+ return rep_->GetFilterBitsReader(contents);
+ }
+ static void DoNothing(void*) {}
+ };
+ Wrapper* wrapper = new Wrapper;
+ wrapper->rep_ = NewBloomFilterPolicy(bits_per_key, original_format);
+ wrapper->state_ = nullptr;
+ wrapper->destructor_ = &Wrapper::DoNothing;
+ return wrapper;
+}
+
+rocksdb_filterpolicy_t* rocksdb_filterpolicy_create_bloom_full(
+ double bits_per_key) {
+ return rocksdb_filterpolicy_create_bloom_format(bits_per_key, false);
+}
+
+rocksdb_filterpolicy_t* rocksdb_filterpolicy_create_bloom(double bits_per_key) {
+ return rocksdb_filterpolicy_create_bloom_format(bits_per_key, true);
+}
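+
+/*
+  Illustrative sketch: attaching the bloom policy above to block-based table
+  options (hypothetical `opts`; 10.0 bits/key is only an example value). The
+  block-based options take ownership of the policy, so it should not also be
+  passed to rocksdb_filterpolicy_destroy():
+
+    rocksdb_filterpolicy_t* fp = rocksdb_filterpolicy_create_bloom(10.0);
+    rocksdb_block_based_table_options_t* bbto =
+        rocksdb_block_based_options_create();
+    rocksdb_block_based_options_set_filter_policy(bbto, fp);
+    rocksdb_options_set_block_based_table_factory(opts, bbto);
+*/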
+
+rocksdb_filterpolicy_t* rocksdb_filterpolicy_create_ribbon_format(
+ double bloom_equivalent_bits_per_key, int bloom_before_level) {
+ // Make a rocksdb_filterpolicy_t, but override all of its methods so
+ // they delegate to a NewRibbonFilterPolicy() instead of user
+ // supplied C functions.
+ struct Wrapper : public rocksdb_filterpolicy_t {
+ const FilterPolicy* rep_;
+ ~Wrapper() override { delete rep_; }
+ const char* Name() const override { return rep_->Name(); }
+ const char* CompatibilityName() const override {
+ return rep_->CompatibilityName();
+ }
+ ROCKSDB_NAMESPACE::FilterBitsBuilder* GetBuilderWithContext(
+ const ROCKSDB_NAMESPACE::FilterBuildingContext& context)
+ const override {
+ return rep_->GetBuilderWithContext(context);
+ }
+ ROCKSDB_NAMESPACE::FilterBitsReader* GetFilterBitsReader(
+ const Slice& contents) const override {
+ return rep_->GetFilterBitsReader(contents);
+ }
+ static void DoNothing(void*) {}
+ };
+ Wrapper* wrapper = new Wrapper;
+ wrapper->rep_ =
+ NewRibbonFilterPolicy(bloom_equivalent_bits_per_key, bloom_before_level);
+ wrapper->state_ = nullptr;
+ wrapper->destructor_ = &Wrapper::DoNothing;
+ return wrapper;
+}
+
+rocksdb_filterpolicy_t* rocksdb_filterpolicy_create_ribbon(
+ double bloom_equivalent_bits_per_key) {
+ return rocksdb_filterpolicy_create_ribbon_format(
+ bloom_equivalent_bits_per_key, /*bloom_before_level = disabled*/ -1);
+}
+
+rocksdb_filterpolicy_t* rocksdb_filterpolicy_create_ribbon_hybrid(
+ double bloom_equivalent_bits_per_key, int bloom_before_level) {
+ return rocksdb_filterpolicy_create_ribbon_format(
+ bloom_equivalent_bits_per_key, bloom_before_level);
+}
+
+rocksdb_mergeoperator_t* rocksdb_mergeoperator_create(
+ void* state, void (*destructor)(void*),
+ char* (*full_merge)(void*, const char* key, size_t key_length,
+ const char* existing_value,
+ size_t existing_value_length,
+ const char* const* operands_list,
+ const size_t* operands_list_length, int num_operands,
+ unsigned char* success, size_t* new_value_length),
+ char* (*partial_merge)(void*, const char* key, size_t key_length,
+ const char* const* operands_list,
+ const size_t* operands_list_length, int num_operands,
+ unsigned char* success, size_t* new_value_length),
+ void (*delete_value)(void*, const char* value, size_t value_length),
+ const char* (*name)(void*)) {
+ rocksdb_mergeoperator_t* result = new rocksdb_mergeoperator_t;
+ result->state_ = state;
+ result->destructor_ = destructor;
+ result->full_merge_ = full_merge;
+ result->partial_merge_ = partial_merge;
+ result->delete_value_ = delete_value;
+ result->name_ = name;
+ return result;
+}
+
+void rocksdb_mergeoperator_destroy(rocksdb_mergeoperator_t* merge_operator) {
+ delete merge_operator;
+}
+
+rocksdb_readoptions_t* rocksdb_readoptions_create() {
+ return new rocksdb_readoptions_t;
+}
+
+void rocksdb_readoptions_destroy(rocksdb_readoptions_t* opt) { delete opt; }
+
+void rocksdb_readoptions_set_verify_checksums(rocksdb_readoptions_t* opt,
+ unsigned char v) {
+ opt->rep.verify_checksums = v;
+}
+
+unsigned char rocksdb_readoptions_get_verify_checksums(
+ rocksdb_readoptions_t* opt) {
+ return opt->rep.verify_checksums;
+}
+
+void rocksdb_readoptions_set_fill_cache(rocksdb_readoptions_t* opt,
+ unsigned char v) {
+ opt->rep.fill_cache = v;
+}
+
+unsigned char rocksdb_readoptions_get_fill_cache(rocksdb_readoptions_t* opt) {
+ return opt->rep.fill_cache;
+}
+
+void rocksdb_readoptions_set_snapshot(rocksdb_readoptions_t* opt,
+ const rocksdb_snapshot_t* snap) {
+ opt->rep.snapshot = (snap ? snap->rep : nullptr);
+}
+
+void rocksdb_readoptions_set_iterate_upper_bound(rocksdb_readoptions_t* opt,
+ const char* key,
+ size_t keylen) {
+  if (key == nullptr) {
+    opt->upper_bound = Slice();
+    opt->rep.iterate_upper_bound = nullptr;
+  } else {
+ opt->upper_bound = Slice(key, keylen);
+ opt->rep.iterate_upper_bound = &opt->upper_bound;
+ }
+}
+
+void rocksdb_readoptions_set_iterate_lower_bound(rocksdb_readoptions_t* opt,
+ const char* key,
+ size_t keylen) {
+ if (key == nullptr) {
+ opt->lower_bound = Slice();
+ opt->rep.iterate_lower_bound = nullptr;
+ } else {
+ opt->lower_bound = Slice(key, keylen);
+ opt->rep.iterate_lower_bound = &opt->lower_bound;
+ }
+}
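+
+/*
+  The bound setters above keep a Slice that points into the caller's buffer,
+  so the `key` memory must stay valid for as long as the read options are in
+  use. Illustrative sketch (hypothetical `db` and `roptions`; every key the
+  iterator yields compares below "kez"):
+
+    const char upper[] = "kez";
+    rocksdb_readoptions_set_iterate_upper_bound(roptions, upper, 3);
+    rocksdb_iterator_t* it = rocksdb_create_iterator(db, roptions);
+    for (rocksdb_iter_seek_to_first(it); rocksdb_iter_valid(it);
+         rocksdb_iter_next(it)) {
+      size_t klen = 0;
+      const char* k = rocksdb_iter_key(it, &klen);
+    }
+    rocksdb_iter_destroy(it);
+*/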
+
+void rocksdb_readoptions_set_read_tier(rocksdb_readoptions_t* opt, int v) {
+ opt->rep.read_tier = static_cast<ROCKSDB_NAMESPACE::ReadTier>(v);
+}
+
+int rocksdb_readoptions_get_read_tier(rocksdb_readoptions_t* opt) {
+ return static_cast<int>(opt->rep.read_tier);
+}
+
+void rocksdb_readoptions_set_tailing(rocksdb_readoptions_t* opt,
+ unsigned char v) {
+ opt->rep.tailing = v;
+}
+
+unsigned char rocksdb_readoptions_get_tailing(rocksdb_readoptions_t* opt) {
+ return opt->rep.tailing;
+}
+
+void rocksdb_readoptions_set_managed(rocksdb_readoptions_t* opt,
+ unsigned char v) {
+ opt->rep.managed = v;
+}
+
+void rocksdb_readoptions_set_readahead_size(rocksdb_readoptions_t* opt,
+ size_t v) {
+ opt->rep.readahead_size = v;
+}
+
+size_t rocksdb_readoptions_get_readahead_size(rocksdb_readoptions_t* opt) {
+ return opt->rep.readahead_size;
+}
+
+void rocksdb_readoptions_set_prefix_same_as_start(rocksdb_readoptions_t* opt,
+ unsigned char v) {
+ opt->rep.prefix_same_as_start = v;
+}
+
+unsigned char rocksdb_readoptions_get_prefix_same_as_start(
+ rocksdb_readoptions_t* opt) {
+ return opt->rep.prefix_same_as_start;
+}
+
+void rocksdb_readoptions_set_pin_data(rocksdb_readoptions_t* opt,
+ unsigned char v) {
+ opt->rep.pin_data = v;
+}
+
+unsigned char rocksdb_readoptions_get_pin_data(rocksdb_readoptions_t* opt) {
+ return opt->rep.pin_data;
+}
+
+void rocksdb_readoptions_set_total_order_seek(rocksdb_readoptions_t* opt,
+ unsigned char v) {
+ opt->rep.total_order_seek = v;
+}
+
+unsigned char rocksdb_readoptions_get_total_order_seek(
+ rocksdb_readoptions_t* opt) {
+ return opt->rep.total_order_seek;
+}
+
+void rocksdb_readoptions_set_max_skippable_internal_keys(
+ rocksdb_readoptions_t* opt, uint64_t v) {
+ opt->rep.max_skippable_internal_keys = v;
+}
+
+uint64_t rocksdb_readoptions_get_max_skippable_internal_keys(
+ rocksdb_readoptions_t* opt) {
+ return opt->rep.max_skippable_internal_keys;
+}
+
+void rocksdb_readoptions_set_background_purge_on_iterator_cleanup(
+ rocksdb_readoptions_t* opt, unsigned char v) {
+ opt->rep.background_purge_on_iterator_cleanup = v;
+}
+
+unsigned char rocksdb_readoptions_get_background_purge_on_iterator_cleanup(
+ rocksdb_readoptions_t* opt) {
+ return opt->rep.background_purge_on_iterator_cleanup;
+}
+
+void rocksdb_readoptions_set_ignore_range_deletions(rocksdb_readoptions_t* opt,
+ unsigned char v) {
+ opt->rep.ignore_range_deletions = v;
+}
+
+unsigned char rocksdb_readoptions_get_ignore_range_deletions(
+ rocksdb_readoptions_t* opt) {
+ return opt->rep.ignore_range_deletions;
+}
+
+void rocksdb_readoptions_set_deadline(rocksdb_readoptions_t* opt,
+ uint64_t microseconds) {
+ opt->rep.deadline = std::chrono::microseconds(microseconds);
+}
+
+uint64_t rocksdb_readoptions_get_deadline(rocksdb_readoptions_t* opt) {
+ return opt->rep.deadline.count();
+}
+
+void rocksdb_readoptions_set_io_timeout(rocksdb_readoptions_t* opt,
+ uint64_t microseconds) {
+ opt->rep.io_timeout = std::chrono::microseconds(microseconds);
+}
+
+uint64_t rocksdb_readoptions_get_io_timeout(rocksdb_readoptions_t* opt) {
+ return opt->rep.io_timeout.count();
+}
+
+void rocksdb_readoptions_set_timestamp(rocksdb_readoptions_t* opt,
+ const char* ts, size_t tslen) {
+ if (ts == nullptr) {
+ opt->timestamp = Slice();
+ opt->rep.timestamp = nullptr;
+ } else {
+ opt->timestamp = Slice(ts, tslen);
+ opt->rep.timestamp = &opt->timestamp;
+ }
+}
+
+void rocksdb_readoptions_set_iter_start_ts(rocksdb_readoptions_t* opt,
+ const char* ts, size_t tslen) {
+ if (ts == nullptr) {
+ opt->iter_start_ts = Slice();
+ opt->rep.iter_start_ts = nullptr;
+ } else {
+ opt->iter_start_ts = Slice(ts, tslen);
+ opt->rep.iter_start_ts = &opt->iter_start_ts;
+ }
+}
+
+rocksdb_writeoptions_t* rocksdb_writeoptions_create() {
+ return new rocksdb_writeoptions_t;
+}
+
+void rocksdb_writeoptions_destroy(rocksdb_writeoptions_t* opt) { delete opt; }
+
+void rocksdb_writeoptions_set_sync(rocksdb_writeoptions_t* opt,
+ unsigned char v) {
+ opt->rep.sync = v;
+}
+
+unsigned char rocksdb_writeoptions_get_sync(rocksdb_writeoptions_t* opt) {
+ return opt->rep.sync;
+}
+
+void rocksdb_writeoptions_disable_WAL(rocksdb_writeoptions_t* opt,
+ int disable) {
+ opt->rep.disableWAL = disable;
+}
+
+unsigned char rocksdb_writeoptions_get_disable_WAL(
+ rocksdb_writeoptions_t* opt) {
+ return opt->rep.disableWAL;
+}
+
+void rocksdb_writeoptions_set_ignore_missing_column_families(
+ rocksdb_writeoptions_t* opt, unsigned char v) {
+ opt->rep.ignore_missing_column_families = v;
+}
+
+unsigned char rocksdb_writeoptions_get_ignore_missing_column_families(
+ rocksdb_writeoptions_t* opt) {
+ return opt->rep.ignore_missing_column_families;
+}
+
+void rocksdb_writeoptions_set_no_slowdown(rocksdb_writeoptions_t* opt,
+ unsigned char v) {
+ opt->rep.no_slowdown = v;
+}
+
+unsigned char rocksdb_writeoptions_get_no_slowdown(
+ rocksdb_writeoptions_t* opt) {
+ return opt->rep.no_slowdown;
+}
+
+void rocksdb_writeoptions_set_low_pri(rocksdb_writeoptions_t* opt,
+ unsigned char v) {
+ opt->rep.low_pri = v;
+}
+
+unsigned char rocksdb_writeoptions_get_low_pri(rocksdb_writeoptions_t* opt) {
+ return opt->rep.low_pri;
+}
+
+void rocksdb_writeoptions_set_memtable_insert_hint_per_batch(
+ rocksdb_writeoptions_t* opt, unsigned char v) {
+ opt->rep.memtable_insert_hint_per_batch = v;
+}
+
+unsigned char rocksdb_writeoptions_get_memtable_insert_hint_per_batch(
+ rocksdb_writeoptions_t* opt) {
+ return opt->rep.memtable_insert_hint_per_batch;
+}
+
+rocksdb_compactoptions_t* rocksdb_compactoptions_create() {
+ return new rocksdb_compactoptions_t;
+}
+
+void rocksdb_compactoptions_destroy(rocksdb_compactoptions_t* opt) {
+ delete opt;
+}
+
+void rocksdb_compactoptions_set_bottommost_level_compaction(
+ rocksdb_compactoptions_t* opt, unsigned char v) {
+ opt->rep.bottommost_level_compaction =
+ static_cast<BottommostLevelCompaction>(v);
+}
+
+unsigned char rocksdb_compactoptions_get_bottommost_level_compaction(
+ rocksdb_compactoptions_t* opt) {
+ return static_cast<unsigned char>(opt->rep.bottommost_level_compaction);
+}
+
+void rocksdb_compactoptions_set_exclusive_manual_compaction(
+ rocksdb_compactoptions_t* opt, unsigned char v) {
+ opt->rep.exclusive_manual_compaction = v;
+}
+
+unsigned char rocksdb_compactoptions_get_exclusive_manual_compaction(
+ rocksdb_compactoptions_t* opt) {
+ return opt->rep.exclusive_manual_compaction;
+}
+
+void rocksdb_compactoptions_set_change_level(rocksdb_compactoptions_t* opt,
+ unsigned char v) {
+ opt->rep.change_level = v;
+}
+
+unsigned char rocksdb_compactoptions_get_change_level(
+ rocksdb_compactoptions_t* opt) {
+ return opt->rep.change_level;
+}
+
+void rocksdb_compactoptions_set_target_level(rocksdb_compactoptions_t* opt,
+ int n) {
+ opt->rep.target_level = n;
+}
+
+int rocksdb_compactoptions_get_target_level(rocksdb_compactoptions_t* opt) {
+ return opt->rep.target_level;
+}
+
+void rocksdb_compactoptions_set_full_history_ts_low(
+ rocksdb_compactoptions_t* opt, char* ts, size_t tslen) {
+ if (ts == nullptr) {
+ opt->full_history_ts_low = Slice();
+ opt->rep.full_history_ts_low = nullptr;
+ } else {
+ opt->full_history_ts_low = Slice(ts, tslen);
+ opt->rep.full_history_ts_low = &opt->full_history_ts_low;
+ }
+}
+
+rocksdb_flushoptions_t* rocksdb_flushoptions_create() {
+ return new rocksdb_flushoptions_t;
+}
+
+void rocksdb_flushoptions_destroy(rocksdb_flushoptions_t* opt) { delete opt; }
+
+void rocksdb_flushoptions_set_wait(rocksdb_flushoptions_t* opt,
+ unsigned char v) {
+ opt->rep.wait = v;
+}
+
+unsigned char rocksdb_flushoptions_get_wait(rocksdb_flushoptions_t* opt) {
+ return opt->rep.wait;
+}
+
+rocksdb_memory_allocator_t* rocksdb_jemalloc_nodump_allocator_create(
+ char** errptr) {
+ rocksdb_memory_allocator_t* allocator = new rocksdb_memory_allocator_t;
+ ROCKSDB_NAMESPACE::JemallocAllocatorOptions options;
+ SaveError(errptr, ROCKSDB_NAMESPACE::NewJemallocNodumpAllocator(
+ options, &allocator->rep));
+ return allocator;
+}
+
+void rocksdb_memory_allocator_destroy(rocksdb_memory_allocator_t* allocator) {
+ delete allocator;
+}
+
+rocksdb_lru_cache_options_t* rocksdb_lru_cache_options_create() {
+ return new rocksdb_lru_cache_options_t;
+}
+
+void rocksdb_lru_cache_options_destroy(rocksdb_lru_cache_options_t* opt) {
+ delete opt;
+}
+
+void rocksdb_lru_cache_options_set_capacity(rocksdb_lru_cache_options_t* opt,
+ size_t capacity) {
+ opt->rep.capacity = capacity;
+}
+
+void rocksdb_lru_cache_options_set_num_shard_bits(
+ rocksdb_lru_cache_options_t* opt, int num_shard_bits) {
+ opt->rep.num_shard_bits = num_shard_bits;
+}
+
+void rocksdb_lru_cache_options_set_memory_allocator(
+ rocksdb_lru_cache_options_t* opt, rocksdb_memory_allocator_t* allocator) {
+ opt->rep.memory_allocator = allocator->rep;
+}
+
+rocksdb_cache_t* rocksdb_cache_create_lru(size_t capacity) {
+ rocksdb_cache_t* c = new rocksdb_cache_t;
+ c->rep = NewLRUCache(capacity);
+ return c;
+}
+
+rocksdb_cache_t* rocksdb_cache_create_lru_with_strict_capacity_limit(
+ size_t capacity) {
+ rocksdb_cache_t* c = new rocksdb_cache_t;
+ c->rep = NewLRUCache(capacity);
+ c->rep->SetStrictCapacityLimit(true);
+ return c;
+}
+
+rocksdb_cache_t* rocksdb_cache_create_lru_opts(
+ rocksdb_lru_cache_options_t* opt) {
+ rocksdb_cache_t* c = new rocksdb_cache_t;
+ c->rep = NewLRUCache(opt->rep);
+ return c;
+}
+
+void rocksdb_cache_destroy(rocksdb_cache_t* cache) { delete cache; }
+
+void rocksdb_cache_disown_data(rocksdb_cache_t* cache) {
+ cache->rep->DisownData();
+}
+
+void rocksdb_cache_set_capacity(rocksdb_cache_t* cache, size_t capacity) {
+ cache->rep->SetCapacity(capacity);
+}
+
+size_t rocksdb_cache_get_capacity(rocksdb_cache_t* cache) {
+ return cache->rep->GetCapacity();
+}
+
+size_t rocksdb_cache_get_usage(rocksdb_cache_t* cache) {
+ return cache->rep->GetUsage();
+}
+
+size_t rocksdb_cache_get_pinned_usage(rocksdb_cache_t* cache) {
+ return cache->rep->GetPinnedUsage();
+}
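+
+/*
+  Illustrative sketch: building a shared LRU block cache from the option
+  helpers above and wiring it into block-based table options (hypothetical
+  `opts`; 128 MB and 6 shard bits are only example values):
+
+    rocksdb_lru_cache_options_t* co = rocksdb_lru_cache_options_create();
+    rocksdb_lru_cache_options_set_capacity(co, 128 * 1024 * 1024);
+    rocksdb_lru_cache_options_set_num_shard_bits(co, 6);
+    rocksdb_cache_t* cache = rocksdb_cache_create_lru_opts(co);
+    rocksdb_lru_cache_options_destroy(co);
+
+    rocksdb_block_based_table_options_t* bbto =
+        rocksdb_block_based_options_create();
+    rocksdb_block_based_options_set_block_cache(bbto, cache);
+    rocksdb_options_set_block_based_table_factory(opts, bbto);
+*/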
+
+rocksdb_dbpath_t* rocksdb_dbpath_create(const char* path,
+ uint64_t target_size) {
+ rocksdb_dbpath_t* result = new rocksdb_dbpath_t;
+ result->rep.path = std::string(path);
+ result->rep.target_size = target_size;
+ return result;
+}
+
+void rocksdb_dbpath_destroy(rocksdb_dbpath_t* dbpath) { delete dbpath; }
+
+rocksdb_env_t* rocksdb_create_default_env() {
+ rocksdb_env_t* result = new rocksdb_env_t;
+ result->rep = Env::Default();
+ result->is_default = true;
+ return result;
+}
+
+rocksdb_env_t* rocksdb_create_mem_env() {
+ rocksdb_env_t* result = new rocksdb_env_t;
+ result->rep = ROCKSDB_NAMESPACE::NewMemEnv(Env::Default());
+ result->is_default = false;
+ return result;
+}
+
+void rocksdb_env_set_background_threads(rocksdb_env_t* env, int n) {
+ env->rep->SetBackgroundThreads(n);
+}
+
+int rocksdb_env_get_background_threads(rocksdb_env_t* env) {
+ return env->rep->GetBackgroundThreads();
+}
+
+void rocksdb_env_set_bottom_priority_background_threads(rocksdb_env_t* env,
+ int n) {
+ env->rep->SetBackgroundThreads(n, Env::BOTTOM);
+}
+
+int rocksdb_env_get_bottom_priority_background_threads(rocksdb_env_t* env) {
+ return env->rep->GetBackgroundThreads(Env::BOTTOM);
+}
+
+void rocksdb_env_set_high_priority_background_threads(rocksdb_env_t* env,
+ int n) {
+ env->rep->SetBackgroundThreads(n, Env::HIGH);
+}
+
+int rocksdb_env_get_high_priority_background_threads(rocksdb_env_t* env) {
+ return env->rep->GetBackgroundThreads(Env::HIGH);
+}
+
+void rocksdb_env_set_low_priority_background_threads(rocksdb_env_t* env,
+ int n) {
+ env->rep->SetBackgroundThreads(n, Env::LOW);
+}
+
+int rocksdb_env_get_low_priority_background_threads(rocksdb_env_t* env) {
+ return env->rep->GetBackgroundThreads(Env::LOW);
+}
+
+void rocksdb_env_join_all_threads(rocksdb_env_t* env) {
+ env->rep->WaitForJoin();
+}
+
+void rocksdb_env_lower_thread_pool_io_priority(rocksdb_env_t* env) {
+ env->rep->LowerThreadPoolIOPriority();
+}
+
+void rocksdb_env_lower_high_priority_thread_pool_io_priority(
+ rocksdb_env_t* env) {
+ env->rep->LowerThreadPoolIOPriority(Env::HIGH);
+}
+
+void rocksdb_env_lower_thread_pool_cpu_priority(rocksdb_env_t* env) {
+ env->rep->LowerThreadPoolCPUPriority();
+}
+
+void rocksdb_env_lower_high_priority_thread_pool_cpu_priority(
+ rocksdb_env_t* env) {
+ env->rep->LowerThreadPoolCPUPriority(Env::HIGH);
+}
+
+void rocksdb_env_destroy(rocksdb_env_t* env) {
+ if (!env->is_default) delete env->rep;
+ delete env;
+}
+
+rocksdb_envoptions_t* rocksdb_envoptions_create() {
+ rocksdb_envoptions_t* opt = new rocksdb_envoptions_t;
+ return opt;
+}
+
+void rocksdb_envoptions_destroy(rocksdb_envoptions_t* opt) { delete opt; }
+
+rocksdb_sstfilewriter_t* rocksdb_sstfilewriter_create(
+ const rocksdb_envoptions_t* env, const rocksdb_options_t* io_options) {
+ rocksdb_sstfilewriter_t* writer = new rocksdb_sstfilewriter_t;
+ writer->rep = new SstFileWriter(env->rep, io_options->rep);
+ return writer;
+}
+
+void rocksdb_create_dir_if_missing(rocksdb_env_t* env, const char* path,
+ char** errptr) {
+ SaveError(errptr, env->rep->CreateDirIfMissing(std::string(path)));
+}
+
+rocksdb_sstfilewriter_t* rocksdb_sstfilewriter_create_with_comparator(
+ const rocksdb_envoptions_t* env, const rocksdb_options_t* io_options,
+ const rocksdb_comparator_t* /*comparator*/) {
+ rocksdb_sstfilewriter_t* writer = new rocksdb_sstfilewriter_t;
+ writer->rep = new SstFileWriter(env->rep, io_options->rep);
+ return writer;
+}
+
+void rocksdb_sstfilewriter_open(rocksdb_sstfilewriter_t* writer,
+ const char* name, char** errptr) {
+ SaveError(errptr, writer->rep->Open(std::string(name)));
+}
+
+// Same behavior as rocksdb_sstfilewriter_put() below; both forward to
+// SstFileWriter::Put().
+void rocksdb_sstfilewriter_add(rocksdb_sstfilewriter_t* writer, const char* key,
+                               size_t keylen, const char* val, size_t vallen,
+                               char** errptr) {
+  SaveError(errptr, writer->rep->Put(Slice(key, keylen), Slice(val, vallen)));
+}
+
+void rocksdb_sstfilewriter_put(rocksdb_sstfilewriter_t* writer, const char* key,
+ size_t keylen, const char* val, size_t vallen,
+ char** errptr) {
+ SaveError(errptr, writer->rep->Put(Slice(key, keylen), Slice(val, vallen)));
+}
+
+void rocksdb_sstfilewriter_put_with_ts(rocksdb_sstfilewriter_t* writer,
+ const char* key, size_t keylen,
+ const char* ts, size_t tslen,
+ const char* val, size_t vallen,
+ char** errptr) {
+ SaveError(errptr, writer->rep->Put(Slice(key, keylen), Slice(ts, tslen),
+ Slice(val, vallen)));
+}
+
+void rocksdb_sstfilewriter_merge(rocksdb_sstfilewriter_t* writer,
+ const char* key, size_t keylen,
+ const char* val, size_t vallen,
+ char** errptr) {
+ SaveError(errptr, writer->rep->Merge(Slice(key, keylen), Slice(val, vallen)));
+}
+
+void rocksdb_sstfilewriter_delete(rocksdb_sstfilewriter_t* writer,
+ const char* key, size_t keylen,
+ char** errptr) {
+ SaveError(errptr, writer->rep->Delete(Slice(key, keylen)));
+}
+
+void rocksdb_sstfilewriter_delete_with_ts(rocksdb_sstfilewriter_t* writer,
+ const char* key, size_t keylen,
+ const char* ts, size_t tslen,
+ char** errptr) {
+ SaveError(errptr, writer->rep->Delete(Slice(key, keylen), Slice(ts, tslen)));
+}
+
+void rocksdb_sstfilewriter_delete_range(rocksdb_sstfilewriter_t* writer,
+ const char* begin_key,
+ size_t begin_keylen,
+ const char* end_key, size_t end_keylen,
+ char** errptr) {
+ SaveError(errptr, writer->rep->DeleteRange(Slice(begin_key, begin_keylen),
+ Slice(end_key, end_keylen)));
+}
+
+void rocksdb_sstfilewriter_finish(rocksdb_sstfilewriter_t* writer,
+ char** errptr) {
+ SaveError(errptr, writer->rep->Finish(nullptr));
+}
+
+void rocksdb_sstfilewriter_file_size(rocksdb_sstfilewriter_t* writer,
+ uint64_t* file_size) {
+ *file_size = writer->rep->FileSize();
+}
+
+void rocksdb_sstfilewriter_destroy(rocksdb_sstfilewriter_t* writer) {
+ delete writer->rep;
+ delete writer;
+}
+
+rocksdb_ingestexternalfileoptions_t*
+rocksdb_ingestexternalfileoptions_create() {
+ rocksdb_ingestexternalfileoptions_t* opt =
+ new rocksdb_ingestexternalfileoptions_t;
+ return opt;
+}
+
+void rocksdb_ingestexternalfileoptions_set_move_files(
+ rocksdb_ingestexternalfileoptions_t* opt, unsigned char move_files) {
+ opt->rep.move_files = move_files;
+}
+
+void rocksdb_ingestexternalfileoptions_set_snapshot_consistency(
+ rocksdb_ingestexternalfileoptions_t* opt,
+ unsigned char snapshot_consistency) {
+ opt->rep.snapshot_consistency = snapshot_consistency;
+}
+
+void rocksdb_ingestexternalfileoptions_set_allow_global_seqno(
+ rocksdb_ingestexternalfileoptions_t* opt,
+ unsigned char allow_global_seqno) {
+ opt->rep.allow_global_seqno = allow_global_seqno;
+}
+
+void rocksdb_ingestexternalfileoptions_set_allow_blocking_flush(
+ rocksdb_ingestexternalfileoptions_t* opt,
+ unsigned char allow_blocking_flush) {
+ opt->rep.allow_blocking_flush = allow_blocking_flush;
+}
+
+void rocksdb_ingestexternalfileoptions_set_ingest_behind(
+ rocksdb_ingestexternalfileoptions_t* opt, unsigned char ingest_behind) {
+ opt->rep.ingest_behind = ingest_behind;
+}
+
+void rocksdb_ingestexternalfileoptions_destroy(
+ rocksdb_ingestexternalfileoptions_t* opt) {
+ delete opt;
+}
+
+void rocksdb_ingest_external_file(
+ rocksdb_t* db, const char* const* file_list, const size_t list_len,
+ const rocksdb_ingestexternalfileoptions_t* opt, char** errptr) {
+ std::vector<std::string> files(list_len);
+ for (size_t i = 0; i < list_len; ++i) {
+ files[i] = std::string(file_list[i]);
+ }
+ SaveError(errptr, db->rep->IngestExternalFile(files, opt->rep));
+}
+
+void rocksdb_ingest_external_file_cf(
+ rocksdb_t* db, rocksdb_column_family_handle_t* handle,
+ const char* const* file_list, const size_t list_len,
+ const rocksdb_ingestexternalfileoptions_t* opt, char** errptr) {
+ std::vector<std::string> files(list_len);
+ for (size_t i = 0; i < list_len; ++i) {
+ files[i] = std::string(file_list[i]);
+ }
+ SaveError(errptr, db->rep->IngestExternalFile(handle->rep, files, opt->rep));
+}
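+
+/*
+  Illustrative end-to-end sketch for the SstFileWriter and ingestion helpers
+  above (hypothetical `db` and `opts`; "/tmp/example.sst" is only a
+  placeholder path; error handling omitted):
+
+    char* err = NULL;
+    rocksdb_envoptions_t* env_opts = rocksdb_envoptions_create();
+    rocksdb_sstfilewriter_t* w = rocksdb_sstfilewriter_create(env_opts, opts);
+    rocksdb_sstfilewriter_open(w, "/tmp/example.sst", &err);
+    rocksdb_sstfilewriter_put(w, "key1", 4, "value1", 6, &err);
+    rocksdb_sstfilewriter_finish(w, &err);
+    rocksdb_sstfilewriter_destroy(w);
+    rocksdb_envoptions_destroy(env_opts);
+
+    const char* files[1] = {"/tmp/example.sst"};
+    rocksdb_ingestexternalfileoptions_t* ifo =
+        rocksdb_ingestexternalfileoptions_create();
+    rocksdb_ingest_external_file(db, files, 1, ifo, &err);
+    rocksdb_ingestexternalfileoptions_destroy(ifo);
+*/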
+
+void rocksdb_try_catch_up_with_primary(rocksdb_t* db, char** errptr) {
+ SaveError(errptr, db->rep->TryCatchUpWithPrimary());
+}
+
+rocksdb_slicetransform_t* rocksdb_slicetransform_create(
+ void* state, void (*destructor)(void*),
+ char* (*transform)(void*, const char* key, size_t length,
+ size_t* dst_length),
+ unsigned char (*in_domain)(void*, const char* key, size_t length),
+ unsigned char (*in_range)(void*, const char* key, size_t length),
+ const char* (*name)(void*)) {
+ rocksdb_slicetransform_t* result = new rocksdb_slicetransform_t;
+ result->state_ = state;
+ result->destructor_ = destructor;
+ result->transform_ = transform;
+ result->in_domain_ = in_domain;
+ result->in_range_ = in_range;
+ result->name_ = name;
+ return result;
+}
+
+void rocksdb_slicetransform_destroy(rocksdb_slicetransform_t* st) { delete st; }
+
+struct SliceTransformWrapper : public rocksdb_slicetransform_t {
+ const SliceTransform* rep_;
+ ~SliceTransformWrapper() override { delete rep_; }
+ const char* Name() const override { return rep_->Name(); }
+ std::string GetId() const override { return rep_->GetId(); }
+ Slice Transform(const Slice& src) const override {
+ return rep_->Transform(src);
+ }
+ bool InDomain(const Slice& src) const override { return rep_->InDomain(src); }
+ bool InRange(const Slice& src) const override { return rep_->InRange(src); }
+ static void DoNothing(void*) {}
+};
+
+rocksdb_slicetransform_t* rocksdb_slicetransform_create_fixed_prefix(
+ size_t prefixLen) {
+ SliceTransformWrapper* wrapper = new SliceTransformWrapper;
+ wrapper->rep_ = ROCKSDB_NAMESPACE::NewFixedPrefixTransform(prefixLen);
+ wrapper->state_ = nullptr;
+ wrapper->destructor_ = &SliceTransformWrapper::DoNothing;
+ return wrapper;
+}
+
+rocksdb_slicetransform_t* rocksdb_slicetransform_create_noop() {
+ SliceTransformWrapper* wrapper = new SliceTransformWrapper;
+ wrapper->rep_ = ROCKSDB_NAMESPACE::NewNoopTransform();
+ wrapper->state_ = nullptr;
+ wrapper->destructor_ = &SliceTransformWrapper::DoNothing;
+ return wrapper;
+}
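+
+/*
+  Illustrative sketch: installing a fixed-prefix extractor and pairing it with
+  the prefix_same_as_start read option defined earlier (hypothetical `opts`
+  and `roptions`; a prefix length of 4 is only an example):
+
+    rocksdb_options_set_prefix_extractor(
+        opts, rocksdb_slicetransform_create_fixed_prefix(4));
+    rocksdb_readoptions_set_prefix_same_as_start(roptions, 1);
+*/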
+
+rocksdb_universal_compaction_options_t*
+rocksdb_universal_compaction_options_create() {
+ rocksdb_universal_compaction_options_t* result =
+ new rocksdb_universal_compaction_options_t;
+ result->rep = new ROCKSDB_NAMESPACE::CompactionOptionsUniversal;
+ return result;
+}
+
+void rocksdb_universal_compaction_options_set_size_ratio(
+ rocksdb_universal_compaction_options_t* uco, int ratio) {
+ uco->rep->size_ratio = ratio;
+}
+
+int rocksdb_universal_compaction_options_get_size_ratio(
+ rocksdb_universal_compaction_options_t* uco) {
+ return uco->rep->size_ratio;
+}
+
+void rocksdb_universal_compaction_options_set_min_merge_width(
+ rocksdb_universal_compaction_options_t* uco, int w) {
+ uco->rep->min_merge_width = w;
+}
+
+int rocksdb_universal_compaction_options_get_min_merge_width(
+ rocksdb_universal_compaction_options_t* uco) {
+ return uco->rep->min_merge_width;
+}
+
+void rocksdb_universal_compaction_options_set_max_merge_width(
+ rocksdb_universal_compaction_options_t* uco, int w) {
+ uco->rep->max_merge_width = w;
+}
+
+int rocksdb_universal_compaction_options_get_max_merge_width(
+ rocksdb_universal_compaction_options_t* uco) {
+ return uco->rep->max_merge_width;
+}
+
+void rocksdb_universal_compaction_options_set_max_size_amplification_percent(
+ rocksdb_universal_compaction_options_t* uco, int p) {
+ uco->rep->max_size_amplification_percent = p;
+}
+
+int rocksdb_universal_compaction_options_get_max_size_amplification_percent(
+ rocksdb_universal_compaction_options_t* uco) {
+ return uco->rep->max_size_amplification_percent;
+}
+
+void rocksdb_universal_compaction_options_set_compression_size_percent(
+ rocksdb_universal_compaction_options_t* uco, int p) {
+ uco->rep->compression_size_percent = p;
+}
+
+int rocksdb_universal_compaction_options_get_compression_size_percent(
+ rocksdb_universal_compaction_options_t* uco) {
+ return uco->rep->compression_size_percent;
+}
+
+void rocksdb_universal_compaction_options_set_stop_style(
+ rocksdb_universal_compaction_options_t* uco, int style) {
+ uco->rep->stop_style =
+ static_cast<ROCKSDB_NAMESPACE::CompactionStopStyle>(style);
+}
+
+int rocksdb_universal_compaction_options_get_stop_style(
+ rocksdb_universal_compaction_options_t* uco) {
+ return static_cast<int>(uco->rep->stop_style);
+}
+
+void rocksdb_universal_compaction_options_destroy(
+ rocksdb_universal_compaction_options_t* uco) {
+ delete uco->rep;
+ delete uco;
+}
+
+rocksdb_fifo_compaction_options_t* rocksdb_fifo_compaction_options_create() {
+ rocksdb_fifo_compaction_options_t* result =
+ new rocksdb_fifo_compaction_options_t;
+ result->rep = CompactionOptionsFIFO();
+ return result;
+}
+
+void rocksdb_fifo_compaction_options_set_max_table_files_size(
+ rocksdb_fifo_compaction_options_t* fifo_opts, uint64_t size) {
+ fifo_opts->rep.max_table_files_size = size;
+}
+
+uint64_t rocksdb_fifo_compaction_options_get_max_table_files_size(
+ rocksdb_fifo_compaction_options_t* fifo_opts) {
+ return fifo_opts->rep.max_table_files_size;
+}
+
+void rocksdb_fifo_compaction_options_destroy(
+ rocksdb_fifo_compaction_options_t* fifo_opts) {
+ delete fifo_opts;
+}
+
+void rocksdb_options_set_min_level_to_compress(rocksdb_options_t* opt,
+ int level) {
+ if (level >= 0) {
+ assert(level <= opt->rep.num_levels);
+ opt->rep.compression_per_level.resize(opt->rep.num_levels);
+ for (int i = 0; i < level; i++) {
+ opt->rep.compression_per_level[i] = ROCKSDB_NAMESPACE::kNoCompression;
+ }
+ for (int i = level; i < opt->rep.num_levels; i++) {
+ opt->rep.compression_per_level[i] = opt->rep.compression;
+ }
+ }
+}
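+
+/*
+  For example, with opt->rep.num_levels == 7 and level == 2, the call above
+  leaves levels 0 and 1 uncompressed (kNoCompression) and applies
+  opt->rep.compression to levels 2 through 6.
+*/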
+
+int rocksdb_livefiles_count(const rocksdb_livefiles_t* lf) {
+ return static_cast<int>(lf->rep.size());
+}
+
+const char* rocksdb_livefiles_column_family_name(const rocksdb_livefiles_t* lf,
+ int index) {
+ return lf->rep[index].column_family_name.c_str();
+}
+
+const char* rocksdb_livefiles_name(const rocksdb_livefiles_t* lf, int index) {
+ return lf->rep[index].name.c_str();
+}
+
+int rocksdb_livefiles_level(const rocksdb_livefiles_t* lf, int index) {
+ return lf->rep[index].level;
+}
+
+size_t rocksdb_livefiles_size(const rocksdb_livefiles_t* lf, int index) {
+ return lf->rep[index].size;
+}
+
+const char* rocksdb_livefiles_smallestkey(const rocksdb_livefiles_t* lf,
+ int index, size_t* size) {
+ *size = lf->rep[index].smallestkey.size();
+ return lf->rep[index].smallestkey.data();
+}
+
+const char* rocksdb_livefiles_largestkey(const rocksdb_livefiles_t* lf,
+ int index, size_t* size) {
+ *size = lf->rep[index].largestkey.size();
+ return lf->rep[index].largestkey.data();
+}
+
+uint64_t rocksdb_livefiles_entries(const rocksdb_livefiles_t* lf, int index) {
+ return lf->rep[index].num_entries;
+}
+
+uint64_t rocksdb_livefiles_deletions(const rocksdb_livefiles_t* lf, int index) {
+ return lf->rep[index].num_deletions;
+}
+
+void rocksdb_livefiles_destroy(const rocksdb_livefiles_t* lf) {
+ delete lf;
+}
+
+void rocksdb_get_options_from_string(const rocksdb_options_t* base_options,
+ const char* opts_str,
+ rocksdb_options_t* new_options,
+ char** errptr) {
+ SaveError(errptr,
+ GetOptionsFromString(base_options->rep, std::string(opts_str),
+ &new_options->rep));
+}
+
+void rocksdb_delete_file_in_range(rocksdb_t* db, const char* start_key,
+ size_t start_key_len, const char* limit_key,
+ size_t limit_key_len, char** errptr) {
+ Slice a, b;
+ SaveError(
+ errptr,
+ DeleteFilesInRange(
+ db->rep, db->rep->DefaultColumnFamily(),
+ (start_key ? (a = Slice(start_key, start_key_len), &a) : nullptr),
+ (limit_key ? (b = Slice(limit_key, limit_key_len), &b) : nullptr)));
+}
+
+void rocksdb_delete_file_in_range_cf(
+ rocksdb_t* db, rocksdb_column_family_handle_t* column_family,
+ const char* start_key, size_t start_key_len, const char* limit_key,
+ size_t limit_key_len, char** errptr) {
+ Slice a, b;
+ SaveError(
+ errptr,
+ DeleteFilesInRange(
+ db->rep, column_family->rep,
+ (start_key ? (a = Slice(start_key, start_key_len), &a) : nullptr),
+ (limit_key ? (b = Slice(limit_key, limit_key_len), &b) : nullptr)));
+}
+
+/* MetaData */
+
+rocksdb_column_family_metadata_t* rocksdb_get_column_family_metadata(
+ rocksdb_t* db) {
+ rocksdb_column_family_metadata_t* meta = new rocksdb_column_family_metadata_t;
+ db->rep->GetColumnFamilyMetaData(&meta->rep);
+ return meta;
+}
+
+rocksdb_column_family_metadata_t* rocksdb_get_column_family_metadata_cf(
+ rocksdb_t* db, rocksdb_column_family_handle_t* column_family) {
+ rocksdb_column_family_metadata_t* meta = new rocksdb_column_family_metadata_t;
+ db->rep->GetColumnFamilyMetaData(column_family->rep, &meta->rep);
+ return meta;
+}
+
+void rocksdb_column_family_metadata_destroy(
+ rocksdb_column_family_metadata_t* cf_meta) {
+ delete cf_meta;
+}
+
+uint64_t rocksdb_column_family_metadata_get_size(
+ rocksdb_column_family_metadata_t* cf_meta) {
+ return cf_meta->rep.size;
+}
+
+size_t rocksdb_column_family_metadata_get_file_count(
+ rocksdb_column_family_metadata_t* cf_meta) {
+ return cf_meta->rep.file_count;
+}
+
+char* rocksdb_column_family_metadata_get_name(
+ rocksdb_column_family_metadata_t* cf_meta) {
+ return strdup(cf_meta->rep.name.c_str());
+}
+
+size_t rocksdb_column_family_metadata_get_level_count(
+ rocksdb_column_family_metadata_t* cf_meta) {
+ return cf_meta->rep.levels.size();
+}
+
+rocksdb_level_metadata_t* rocksdb_column_family_metadata_get_level_metadata(
+ rocksdb_column_family_metadata_t* cf_meta, size_t i) {
+ if (i >= cf_meta->rep.levels.size()) {
+ return NULL;
+ }
+ rocksdb_level_metadata_t* level_meta =
+ (rocksdb_level_metadata_t*)malloc(sizeof(rocksdb_level_metadata_t));
+ level_meta->rep = &cf_meta->rep.levels[i];
+
+ return level_meta;
+}
+
+void rocksdb_level_metadata_destroy(rocksdb_level_metadata_t* level_meta) {
+  // Only free the wrapper itself; the parent rocksdb_column_family_metadata_t
+  // owns the LevelMetaData that rep points into.
+ free(level_meta);
+}
+
+int rocksdb_level_metadata_get_level(rocksdb_level_metadata_t* level_meta) {
+ return level_meta->rep->level;
+}
+
+uint64_t rocksdb_level_metadata_get_size(rocksdb_level_metadata_t* level_meta) {
+ return level_meta->rep->size;
+}
+
+size_t rocksdb_level_metadata_get_file_count(
+ rocksdb_level_metadata_t* level_meta) {
+ return level_meta->rep->files.size();
+}
+
+rocksdb_sst_file_metadata_t* rocksdb_level_metadata_get_sst_file_metadata(
+ rocksdb_level_metadata_t* level_meta, size_t i) {
+ if (i >= level_meta->rep->files.size()) {
+ return nullptr;
+ }
+ rocksdb_sst_file_metadata_t* file_meta =
+ (rocksdb_sst_file_metadata_t*)malloc(sizeof(rocksdb_sst_file_metadata_t));
+ file_meta->rep = &level_meta->rep->files[i];
+ return file_meta;
+}
+
+void rocksdb_sst_file_metadata_destroy(rocksdb_sst_file_metadata_t* file_meta) {
+  // Only free the wrapper itself; the parent rocksdb_level_metadata_t owns
+  // the SstFileMetaData that rep points into.
+ free(file_meta);
+}
+
+char* rocksdb_sst_file_metadata_get_relative_filename(
+ rocksdb_sst_file_metadata_t* file_meta) {
+ return strdup(file_meta->rep->relative_filename.c_str());
+}
+
+uint64_t rocksdb_sst_file_metadata_get_size(
+ rocksdb_sst_file_metadata_t* file_meta) {
+ return file_meta->rep->size;
+}
+
+char* rocksdb_sst_file_metadata_get_smallestkey(
+ rocksdb_sst_file_metadata_t* file_meta, size_t* key_len) {
+ *key_len = file_meta->rep->smallestkey.size();
+ return CopyString(file_meta->rep->smallestkey);
+}
+
+char* rocksdb_sst_file_metadata_get_largestkey(
+ rocksdb_sst_file_metadata_t* file_meta, size_t* key_len) {
+ *key_len = file_meta->rep->largestkey.size();
+ return CopyString(file_meta->rep->largestkey);
+}
+
+/* Transactions */
+
+rocksdb_transactiondb_options_t* rocksdb_transactiondb_options_create() {
+ return new rocksdb_transactiondb_options_t;
+}
+
+void rocksdb_transactiondb_options_destroy(
+ rocksdb_transactiondb_options_t* opt) {
+ delete opt;
+}
+
+void rocksdb_transactiondb_options_set_max_num_locks(
+ rocksdb_transactiondb_options_t* opt, int64_t max_num_locks) {
+ opt->rep.max_num_locks = max_num_locks;
+}
+
+void rocksdb_transactiondb_options_set_num_stripes(
+ rocksdb_transactiondb_options_t* opt, size_t num_stripes) {
+ opt->rep.num_stripes = num_stripes;
+}
+
+void rocksdb_transactiondb_options_set_transaction_lock_timeout(
+ rocksdb_transactiondb_options_t* opt, int64_t txn_lock_timeout) {
+ opt->rep.transaction_lock_timeout = txn_lock_timeout;
+}
+
+void rocksdb_transactiondb_options_set_default_lock_timeout(
+ rocksdb_transactiondb_options_t* opt, int64_t default_lock_timeout) {
+ opt->rep.default_lock_timeout = default_lock_timeout;
+}
+
+rocksdb_transaction_options_t* rocksdb_transaction_options_create() {
+ return new rocksdb_transaction_options_t;
+}
+
+void rocksdb_transaction_options_destroy(rocksdb_transaction_options_t* opt) {
+ delete opt;
+}
+
+void rocksdb_transaction_options_set_set_snapshot(
+ rocksdb_transaction_options_t* opt, unsigned char v) {
+ opt->rep.set_snapshot = v;
+}
+
+void rocksdb_transaction_options_set_deadlock_detect(
+ rocksdb_transaction_options_t* opt, unsigned char v) {
+ opt->rep.deadlock_detect = v;
+}
+
+void rocksdb_transaction_options_set_lock_timeout(
+ rocksdb_transaction_options_t* opt, int64_t lock_timeout) {
+ opt->rep.lock_timeout = lock_timeout;
+}
+
+void rocksdb_transaction_options_set_expiration(
+ rocksdb_transaction_options_t* opt, int64_t expiration) {
+ opt->rep.expiration = expiration;
+}
+
+void rocksdb_transaction_options_set_deadlock_detect_depth(
+ rocksdb_transaction_options_t* opt, int64_t depth) {
+ opt->rep.deadlock_detect_depth = depth;
+}
+
+void rocksdb_transaction_options_set_max_write_batch_size(
+ rocksdb_transaction_options_t* opt, size_t size) {
+ opt->rep.max_write_batch_size = size;
+}
+
+void rocksdb_transaction_options_set_skip_prepare(
+ rocksdb_transaction_options_t* opt, unsigned char v) {
+ opt->rep.skip_prepare = v;
+}
+
+rocksdb_optimistictransaction_options_t*
+rocksdb_optimistictransaction_options_create() {
+ return new rocksdb_optimistictransaction_options_t;
+}
+
+void rocksdb_optimistictransaction_options_destroy(
+ rocksdb_optimistictransaction_options_t* opt) {
+ delete opt;
+}
+
+void rocksdb_optimistictransaction_options_set_set_snapshot(
+ rocksdb_optimistictransaction_options_t* opt, unsigned char v) {
+ opt->rep.set_snapshot = v;
+}
+
+char* rocksdb_optimistictransactiondb_property_value(
+ rocksdb_optimistictransactiondb_t* db, const char* propname) {
+ std::string tmp;
+ if (db->rep->GetProperty(Slice(propname), &tmp)) {
+ // We use strdup() since we expect human readable output.
+ return strdup(tmp.c_str());
+ } else {
+ return nullptr;
+ }
+}
+
+int rocksdb_optimistictransactiondb_property_int(
+ rocksdb_optimistictransactiondb_t* db, const char* propname,
+ uint64_t* out_val) {
+ if (db->rep->GetIntProperty(Slice(propname), out_val)) {
+ return 0;
+ } else {
+ return -1;
+ }
+}
+
+rocksdb_column_family_handle_t* rocksdb_transactiondb_create_column_family(
+ rocksdb_transactiondb_t* txn_db,
+ const rocksdb_options_t* column_family_options,
+ const char* column_family_name, char** errptr) {
+ rocksdb_column_family_handle_t* handle = new rocksdb_column_family_handle_t;
+ SaveError(errptr, txn_db->rep->CreateColumnFamily(
+ ColumnFamilyOptions(column_family_options->rep),
+ std::string(column_family_name), &(handle->rep)));
+ return handle;
+}
+
+rocksdb_transactiondb_t* rocksdb_transactiondb_open(
+ const rocksdb_options_t* options,
+ const rocksdb_transactiondb_options_t* txn_db_options, const char* name,
+ char** errptr) {
+ TransactionDB* txn_db;
+ if (SaveError(errptr, TransactionDB::Open(options->rep, txn_db_options->rep,
+ std::string(name), &txn_db))) {
+ return nullptr;
+ }
+ rocksdb_transactiondb_t* result = new rocksdb_transactiondb_t;
+ result->rep = txn_db;
+ return result;
+}
+
+rocksdb_transactiondb_t* rocksdb_transactiondb_open_column_families(
+ const rocksdb_options_t* options,
+ const rocksdb_transactiondb_options_t* txn_db_options, const char* name,
+ int num_column_families, const char* const* column_family_names,
+ const rocksdb_options_t* const* column_family_options,
+ rocksdb_column_family_handle_t** column_family_handles, char** errptr) {
+ std::vector<ColumnFamilyDescriptor> column_families;
+ for (int i = 0; i < num_column_families; i++) {
+ column_families.push_back(ColumnFamilyDescriptor(
+ std::string(column_family_names[i]),
+ ColumnFamilyOptions(column_family_options[i]->rep)));
+ }
+
+ TransactionDB* txn_db;
+ std::vector<ColumnFamilyHandle*> handles;
+ if (SaveError(errptr, TransactionDB::Open(options->rep, txn_db_options->rep,
+ std::string(name), column_families,
+ &handles, &txn_db))) {
+ return nullptr;
+ }
+
+ for (size_t i = 0; i < handles.size(); i++) {
+ rocksdb_column_family_handle_t* c_handle =
+ new rocksdb_column_family_handle_t;
+ c_handle->rep = handles[i];
+ column_family_handles[i] = c_handle;
+ }
+ rocksdb_transactiondb_t* result = new rocksdb_transactiondb_t;
+ result->rep = txn_db;
+ return result;
+}
+
+const rocksdb_snapshot_t* rocksdb_transactiondb_create_snapshot(
+ rocksdb_transactiondb_t* txn_db) {
+ rocksdb_snapshot_t* result = new rocksdb_snapshot_t;
+ result->rep = txn_db->rep->GetSnapshot();
+ return result;
+}
+
+void rocksdb_transactiondb_release_snapshot(
+ rocksdb_transactiondb_t* txn_db, const rocksdb_snapshot_t* snapshot) {
+ txn_db->rep->ReleaseSnapshot(snapshot->rep);
+ delete snapshot;
+}
+
+char* rocksdb_transactiondb_property_value(rocksdb_transactiondb_t* db,
+ const char* propname) {
+ std::string tmp;
+ if (db->rep->GetProperty(Slice(propname), &tmp)) {
+ // We use strdup() since we expect human readable output.
+ return strdup(tmp.c_str());
+ } else {
+ return nullptr;
+ }
+}
+
+int rocksdb_transactiondb_property_int(rocksdb_transactiondb_t* db,
+ const char* propname,
+ uint64_t* out_val) {
+ if (db->rep->GetIntProperty(Slice(propname), out_val)) {
+ return 0;
+ } else {
+ return -1;
+ }
+}
+
+rocksdb_transaction_t* rocksdb_transaction_begin(
+ rocksdb_transactiondb_t* txn_db,
+ const rocksdb_writeoptions_t* write_options,
+ const rocksdb_transaction_options_t* txn_options,
+ rocksdb_transaction_t* old_txn) {
+ if (old_txn == nullptr) {
+ rocksdb_transaction_t* result = new rocksdb_transaction_t;
+ result->rep = txn_db->rep->BeginTransaction(write_options->rep,
+ txn_options->rep, nullptr);
+ return result;
+ }
+ old_txn->rep = txn_db->rep->BeginTransaction(write_options->rep,
+ txn_options->rep, old_txn->rep);
+ return old_txn;
+}
+
+rocksdb_transaction_t** rocksdb_transactiondb_get_prepared_transactions(
+ rocksdb_transactiondb_t* txn_db, size_t* cnt) {
+ std::vector<Transaction*> txns;
+ txn_db->rep->GetAllPreparedTransactions(&txns);
+ *cnt = txns.size();
+ if (txns.empty()) {
+ return nullptr;
+ } else {
+ rocksdb_transaction_t** buf = (rocksdb_transaction_t**)malloc(
+ txns.size() * sizeof(rocksdb_transaction_t*));
+ for (size_t i = 0; i < txns.size(); i++) {
+ buf[i] = new rocksdb_transaction_t;
+ buf[i]->rep = txns[i];
+ }
+ return buf;
+ }
+}
+
+void rocksdb_transaction_set_name(rocksdb_transaction_t* txn, const char* name,
+ size_t name_len, char** errptr) {
+ std::string str = std::string(name, name_len);
+ SaveError(errptr, txn->rep->SetName(str));
+}
+
+char* rocksdb_transaction_get_name(rocksdb_transaction_t* txn,
+ size_t* name_len) {
+ auto name = txn->rep->GetName();
+ *name_len = name.size();
+ return CopyString(name);
+}
+
+void rocksdb_transaction_prepare(rocksdb_transaction_t* txn, char** errptr) {
+ SaveError(errptr, txn->rep->Prepare());
+}
+
+rocksdb_writebatch_wi_t* rocksdb_transaction_get_writebatch_wi(
+ rocksdb_transaction_t* txn) {
+ rocksdb_writebatch_wi_t* wi =
+ (rocksdb_writebatch_wi_t*)malloc(sizeof(rocksdb_writebatch_wi_t));
+ wi->rep = txn->rep->GetWriteBatch();
+
+ return wi;
+}
+
+void rocksdb_transaction_rebuild_from_writebatch(
+ rocksdb_transaction_t* txn, rocksdb_writebatch_t* writebatch,
+ char** errptr) {
+ SaveError(errptr, txn->rep->RebuildFromWriteBatch(&writebatch->rep));
+}
+
+void rocksdb_transaction_rebuild_from_writebatch_wi(rocksdb_transaction_t* txn,
+ rocksdb_writebatch_wi_t* wi,
+ char** errptr) {
+ SaveError(errptr, txn->rep->RebuildFromWriteBatch(wi->rep->GetWriteBatch()));
+}
+
+void rocksdb_transaction_commit(rocksdb_transaction_t* txn, char** errptr) {
+ SaveError(errptr, txn->rep->Commit());
+}
+
+void rocksdb_transaction_rollback(rocksdb_transaction_t* txn, char** errptr) {
+ SaveError(errptr, txn->rep->Rollback());
+}
+
+void rocksdb_transaction_set_savepoint(rocksdb_transaction_t* txn) {
+ txn->rep->SetSavePoint();
+}
+
+void rocksdb_transaction_rollback_to_savepoint(rocksdb_transaction_t* txn,
+ char** errptr) {
+ SaveError(errptr, txn->rep->RollbackToSavePoint());
+}
+
+void rocksdb_transaction_destroy(rocksdb_transaction_t* txn) {
+ delete txn->rep;
+ delete txn;
+}
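+
+/*
+  Illustrative sketch of the pessimistic transaction lifecycle above
+  (hypothetical "/tmp/txndb" path and pre-built `opts`; error handling
+  omitted):
+
+    char* err = NULL;
+    rocksdb_transactiondb_options_t* tdb_opts =
+        rocksdb_transactiondb_options_create();
+    rocksdb_transactiondb_t* tdb =
+        rocksdb_transactiondb_open(opts, tdb_opts, "/tmp/txndb", &err);
+
+    rocksdb_writeoptions_t* wopts = rocksdb_writeoptions_create();
+    rocksdb_transaction_options_t* topts = rocksdb_transaction_options_create();
+    rocksdb_transaction_t* txn =
+        rocksdb_transaction_begin(tdb, wopts, topts, NULL);
+    rocksdb_transaction_put(txn, "key", 3, "value", 5, &err);
+    rocksdb_transaction_commit(txn, &err);
+    rocksdb_transaction_destroy(txn);
+*/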
+
+const rocksdb_snapshot_t* rocksdb_transaction_get_snapshot(
+ rocksdb_transaction_t* txn) {
+  // This snapshot wrapper is later released with free(), so allocate it with
+  // malloc() here to avoid an allocator mismatch.
+ rocksdb_snapshot_t* result =
+ (rocksdb_snapshot_t*)malloc(sizeof(rocksdb_snapshot_t));
+ result->rep = txn->rep->GetSnapshot();
+ return result;
+}
+
+// Read a key inside a transaction
+char* rocksdb_transaction_get(rocksdb_transaction_t* txn,
+ const rocksdb_readoptions_t* options,
+ const char* key, size_t klen, size_t* vlen,
+ char** errptr) {
+ char* result = nullptr;
+ std::string tmp;
+ Status s = txn->rep->Get(options->rep, Slice(key, klen), &tmp);
+ if (s.ok()) {
+ *vlen = tmp.size();
+ result = CopyString(tmp);
+ } else {
+ *vlen = 0;
+ if (!s.IsNotFound()) {
+ SaveError(errptr, s);
+ }
+ }
+ return result;
+}
+
+rocksdb_pinnableslice_t* rocksdb_transaction_get_pinned(
+ rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options,
+ const char* key, size_t klen, char** errptr) {
+ rocksdb_pinnableslice_t* v = new (rocksdb_pinnableslice_t);
+ Status s = txn->rep->Get(options->rep, Slice(key, klen), &v->rep);
+ if (!s.ok()) {
+ delete (v);
+ if (!s.IsNotFound()) {
+ SaveError(errptr, s);
+ }
+ return nullptr;
+ }
+ return v;
+}
+
+char* rocksdb_transaction_get_cf(rocksdb_transaction_t* txn,
+ const rocksdb_readoptions_t* options,
+ rocksdb_column_family_handle_t* column_family,
+ const char* key, size_t klen, size_t* vlen,
+ char** errptr) {
+ char* result = nullptr;
+ std::string tmp;
+ Status s =
+ txn->rep->Get(options->rep, column_family->rep, Slice(key, klen), &tmp);
+ if (s.ok()) {
+ *vlen = tmp.size();
+ result = CopyString(tmp);
+ } else {
+ *vlen = 0;
+ if (!s.IsNotFound()) {
+ SaveError(errptr, s);
+ }
+ }
+ return result;
+}
+
+rocksdb_pinnableslice_t* rocksdb_transaction_get_pinned_cf(
+ rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options,
+ rocksdb_column_family_handle_t* column_family, const char* key, size_t klen,
+ char** errptr) {
+ rocksdb_pinnableslice_t* v = new (rocksdb_pinnableslice_t);
+ Status s = txn->rep->Get(options->rep, column_family->rep, Slice(key, klen),
+ &v->rep);
+ if (!s.ok()) {
+ delete (v);
+ if (!s.IsNotFound()) {
+ SaveError(errptr, s);
+ }
+ return nullptr;
+ }
+ return v;
+}
+
+// Read a key inside a transaction
+char* rocksdb_transaction_get_for_update(rocksdb_transaction_t* txn,
+ const rocksdb_readoptions_t* options,
+ const char* key, size_t klen,
+ size_t* vlen, unsigned char exclusive,
+ char** errptr) {
+ char* result = nullptr;
+ std::string tmp;
+ Status s =
+ txn->rep->GetForUpdate(options->rep, Slice(key, klen), &tmp, exclusive);
+ if (s.ok()) {
+ *vlen = tmp.size();
+ result = CopyString(tmp);
+ } else {
+ *vlen = 0;
+ if (!s.IsNotFound()) {
+ SaveError(errptr, s);
+ }
+ }
+ return result;
+}
+
+rocksdb_pinnableslice_t* rocksdb_transaction_get_pinned_for_update(
+ rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options,
+ const char* key, size_t klen, unsigned char exclusive, char** errptr) {
+ rocksdb_pinnableslice_t* v = new (rocksdb_pinnableslice_t);
+ Status s = txn->rep->GetForUpdate(options->rep, Slice(key, klen),
+ v->rep.GetSelf(), exclusive);
+ v->rep.PinSelf();
+ if (!s.ok()) {
+ delete (v);
+ if (!s.IsNotFound()) {
+ SaveError(errptr, s);
+ }
+ return nullptr;
+ }
+ return v;
+}
+
+char* rocksdb_transaction_get_for_update_cf(
+ rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options,
+ rocksdb_column_family_handle_t* column_family, const char* key, size_t klen,
+ size_t* vlen, unsigned char exclusive, char** errptr) {
+ char* result = nullptr;
+ std::string tmp;
+ Status s = txn->rep->GetForUpdate(options->rep, column_family->rep,
+ Slice(key, klen), &tmp, exclusive);
+ if (s.ok()) {
+ *vlen = tmp.size();
+ result = CopyString(tmp);
+ } else {
+ *vlen = 0;
+ if (!s.IsNotFound()) {
+ SaveError(errptr, s);
+ }
+ }
+ return result;
+}
+
+rocksdb_pinnableslice_t* rocksdb_transaction_get_pinned_for_update_cf(
+ rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options,
+ rocksdb_column_family_handle_t* column_family, const char* key, size_t klen,
+ unsigned char exclusive, char** errptr) {
+ rocksdb_pinnableslice_t* v = new (rocksdb_pinnableslice_t);
+ Status s = txn->rep->GetForUpdate(options->rep, column_family->rep,
+ Slice(key, klen), &v->rep, exclusive);
+ if (!s.ok()) {
+ delete (v);
+ if (!s.IsNotFound()) {
+ SaveError(errptr, s);
+ }
+ return nullptr;
+ }
+ return v;
+}
+
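+// Example usage (illustrative sketch only; "counter", new_value and new_len
+// are placeholders): GetForUpdate reads a key and locks it, so the
+// read-modify-write below cannot race with other transactions. The returned
+// buffer is malloc'd and must be released with rocksdb_free(); the commit is
+// done with rocksdb_transaction_commit(), defined elsewhere in this file.
+//
+//   size_t vlen = 0;
+//   char* err = NULL;
+//   char* old = rocksdb_transaction_get_for_update(
+//       txn, ropts, "counter", 7, &vlen, 1 /* exclusive */, &err);
+//   // ... derive new_value/new_len from old ...
+//   rocksdb_transaction_put(txn, "counter", 7, new_value, new_len, &err);
+//   rocksdb_transaction_commit(txn, &err);
+//   rocksdb_free(old);
+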
+void rocksdb_transaction_multi_get(rocksdb_transaction_t* txn,
+ const rocksdb_readoptions_t* options,
+ size_t num_keys,
+ const char* const* keys_list,
+ const size_t* keys_list_sizes,
+ char** values_list,
+ size_t* values_list_sizes, char** errs) {
+ std::vector<Slice> keys(num_keys);
+ for (size_t i = 0; i < num_keys; i++) {
+ keys[i] = Slice(keys_list[i], keys_list_sizes[i]);
+ }
+ std::vector<std::string> values(num_keys);
+ std::vector<Status> statuses =
+ txn->rep->MultiGet(options->rep, keys, &values);
+ for (size_t i = 0; i < num_keys; i++) {
+ if (statuses[i].ok()) {
+ values_list[i] = CopyString(values[i]);
+ values_list_sizes[i] = values[i].size();
+ errs[i] = nullptr;
+ } else {
+ values_list[i] = nullptr;
+ values_list_sizes[i] = 0;
+ if (!statuses[i].IsNotFound()) {
+ errs[i] = strdup(statuses[i].ToString().c_str());
+ } else {
+ errs[i] = nullptr;
+ }
+ }
+ }
+}
+
+void rocksdb_transaction_multi_get_cf(
+ rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options,
+ const rocksdb_column_family_handle_t* const* column_families,
+ size_t num_keys, const char* const* keys_list,
+ const size_t* keys_list_sizes, char** values_list,
+ size_t* values_list_sizes, char** errs) {
+ std::vector<Slice> keys(num_keys);
+ std::vector<ColumnFamilyHandle*> cfs(num_keys);
+ for (size_t i = 0; i < num_keys; i++) {
+ keys[i] = Slice(keys_list[i], keys_list_sizes[i]);
+ cfs[i] = column_families[i]->rep;
+ }
+ std::vector<std::string> values(num_keys);
+ std::vector<Status> statuses =
+ txn->rep->MultiGet(options->rep, cfs, keys, &values);
+ for (size_t i = 0; i < num_keys; i++) {
+ if (statuses[i].ok()) {
+ values_list[i] = CopyString(values[i]);
+ values_list_sizes[i] = values[i].size();
+ errs[i] = nullptr;
+ } else {
+ values_list[i] = nullptr;
+ values_list_sizes[i] = 0;
+ if (!statuses[i].IsNotFound()) {
+ errs[i] = strdup(statuses[i].ToString().c_str());
+ } else {
+ errs[i] = nullptr;
+ }
+ }
+ }
+}
+
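+// Example usage (illustrative sketch only): MultiGet fills three parallel
+// output arrays. Each non-NULL value is a malloc'd copy to be released with
+// rocksdb_free(); errs[i] is NULL both on success and when the key is simply
+// not found, otherwise it is strdup'd and must be released with free().
+//
+//   const char* keys[2] = {"k1", "k2"};
+//   const size_t key_sizes[2] = {2, 2};
+//   char* values[2];
+//   size_t value_sizes[2];
+//   char* errs[2];
+//   rocksdb_transaction_multi_get(txn, ropts, 2, keys, key_sizes, values,
+//                                 value_sizes, errs);
+//   for (size_t i = 0; i < 2; i++) {
+//     if (values[i] != NULL) rocksdb_free(values[i]);
+//     if (errs[i] != NULL) free(errs[i]);
+//   }
+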
+// Read a key outside a transaction
+char* rocksdb_transactiondb_get(rocksdb_transactiondb_t* txn_db,
+ const rocksdb_readoptions_t* options,
+ const char* key, size_t klen, size_t* vlen,
+ char** errptr) {
+ char* result = nullptr;
+ std::string tmp;
+ Status s = txn_db->rep->Get(options->rep, Slice(key, klen), &tmp);
+ if (s.ok()) {
+ *vlen = tmp.size();
+ result = CopyString(tmp);
+ } else {
+ *vlen = 0;
+ if (!s.IsNotFound()) {
+ SaveError(errptr, s);
+ }
+ }
+ return result;
+}
+
+rocksdb_pinnableslice_t* rocksdb_transactiondb_get_pinned(
+ rocksdb_transactiondb_t* txn_db, const rocksdb_readoptions_t* options,
+ const char* key, size_t klen, char** errptr) {
+ rocksdb_pinnableslice_t* v = new (rocksdb_pinnableslice_t);
+ Status s = txn_db->rep->Get(options->rep, txn_db->rep->DefaultColumnFamily(),
+ Slice(key, klen), &v->rep);
+ if (!s.ok()) {
+ delete (v);
+ if (!s.IsNotFound()) {
+ SaveError(errptr, s);
+ }
+ return nullptr;
+ }
+ return v;
+}
+
+char* rocksdb_transactiondb_get_cf(
+ rocksdb_transactiondb_t* txn_db, const rocksdb_readoptions_t* options,
+ rocksdb_column_family_handle_t* column_family, const char* key,
+ size_t keylen, size_t* vallen, char** errptr) {
+ char* result = nullptr;
+ std::string tmp;
+ Status s = txn_db->rep->Get(options->rep, column_family->rep,
+ Slice(key, keylen), &tmp);
+ if (s.ok()) {
+ *vallen = tmp.size();
+ result = CopyString(tmp);
+ } else {
+ *vallen = 0;
+ if (!s.IsNotFound()) {
+ SaveError(errptr, s);
+ }
+ }
+ return result;
+}
+
+rocksdb_pinnableslice_t* rocksdb_transactiondb_get_pinned_cf(
+ rocksdb_transactiondb_t* txn_db, const rocksdb_readoptions_t* options,
+ rocksdb_column_family_handle_t* column_family, const char* key,
+ size_t keylen, char** errptr) {
+ rocksdb_pinnableslice_t* v = new (rocksdb_pinnableslice_t);
+ Status s = txn_db->rep->Get(options->rep, column_family->rep,
+ Slice(key, keylen), &v->rep);
+ if (!s.ok()) {
+ delete (v);
+ if (!s.IsNotFound()) {
+ SaveError(errptr, s);
+ }
+ return nullptr;
+ }
+ return v;
+}
+
+void rocksdb_transactiondb_multi_get(rocksdb_transactiondb_t* txn_db,
+ const rocksdb_readoptions_t* options,
+ size_t num_keys,
+ const char* const* keys_list,
+ const size_t* keys_list_sizes,
+ char** values_list,
+ size_t* values_list_sizes, char** errs) {
+ std::vector<Slice> keys(num_keys);
+ for (size_t i = 0; i < num_keys; i++) {
+ keys[i] = Slice(keys_list[i], keys_list_sizes[i]);
+ }
+ std::vector<std::string> values(num_keys);
+ std::vector<Status> statuses =
+ txn_db->rep->MultiGet(options->rep, keys, &values);
+ for (size_t i = 0; i < num_keys; i++) {
+ if (statuses[i].ok()) {
+ values_list[i] = CopyString(values[i]);
+ values_list_sizes[i] = values[i].size();
+ errs[i] = nullptr;
+ } else {
+ values_list[i] = nullptr;
+ values_list_sizes[i] = 0;
+ if (!statuses[i].IsNotFound()) {
+ errs[i] = strdup(statuses[i].ToString().c_str());
+ } else {
+ errs[i] = nullptr;
+ }
+ }
+ }
+}
+
+void rocksdb_transactiondb_multi_get_cf(
+ rocksdb_transactiondb_t* txn_db, const rocksdb_readoptions_t* options,
+ const rocksdb_column_family_handle_t* const* column_families,
+ size_t num_keys, const char* const* keys_list,
+ const size_t* keys_list_sizes, char** values_list,
+ size_t* values_list_sizes, char** errs) {
+ std::vector<Slice> keys(num_keys);
+ std::vector<ColumnFamilyHandle*> cfs(num_keys);
+ for (size_t i = 0; i < num_keys; i++) {
+ keys[i] = Slice(keys_list[i], keys_list_sizes[i]);
+ cfs[i] = column_families[i]->rep;
+ }
+ std::vector<std::string> values(num_keys);
+ std::vector<Status> statuses =
+ txn_db->rep->MultiGet(options->rep, cfs, keys, &values);
+ for (size_t i = 0; i < num_keys; i++) {
+ if (statuses[i].ok()) {
+ values_list[i] = CopyString(values[i]);
+ values_list_sizes[i] = values[i].size();
+ errs[i] = nullptr;
+ } else {
+ values_list[i] = nullptr;
+ values_list_sizes[i] = 0;
+ if (!statuses[i].IsNotFound()) {
+ errs[i] = strdup(statuses[i].ToString().c_str());
+ } else {
+ errs[i] = nullptr;
+ }
+ }
+ }
+}
+
+// Put a key inside a transaction
+void rocksdb_transaction_put(rocksdb_transaction_t* txn, const char* key,
+ size_t klen, const char* val, size_t vlen,
+ char** errptr) {
+ SaveError(errptr, txn->rep->Put(Slice(key, klen), Slice(val, vlen)));
+}
+
+void rocksdb_transaction_put_cf(rocksdb_transaction_t* txn,
+ rocksdb_column_family_handle_t* column_family,
+ const char* key, size_t klen, const char* val,
+ size_t vlen, char** errptr) {
+ SaveError(errptr, txn->rep->Put(column_family->rep, Slice(key, klen),
+ Slice(val, vlen)));
+}
+
+void rocksdb_transaction_set_commit_timestamp(rocksdb_transaction_t* txn,
+ uint64_t commit_timestamp) {
+ txn->rep->SetCommitTimestamp(commit_timestamp);
+}
+
+void rocksdb_transaction_set_read_timestamp_for_validation(
+ rocksdb_transaction_t* txn, uint64_t read_timestamp) {
+ txn->rep->SetReadTimestampForValidation(read_timestamp);
+}
+
+// Put a key outside a transaction
+void rocksdb_transactiondb_put(rocksdb_transactiondb_t* txn_db,
+ const rocksdb_writeoptions_t* options,
+ const char* key, size_t klen, const char* val,
+ size_t vlen, char** errptr) {
+ SaveError(errptr,
+ txn_db->rep->Put(options->rep, Slice(key, klen), Slice(val, vlen)));
+}
+
+void rocksdb_transactiondb_put_cf(rocksdb_transactiondb_t* txn_db,
+ const rocksdb_writeoptions_t* options,
+ rocksdb_column_family_handle_t* column_family,
+ const char* key, size_t keylen,
+ const char* val, size_t vallen,
+ char** errptr) {
+ SaveError(errptr, txn_db->rep->Put(options->rep, column_family->rep,
+ Slice(key, keylen), Slice(val, vallen)));
+}
+
+// Write batch into transaction db
+void rocksdb_transactiondb_write(rocksdb_transactiondb_t* db,
+ const rocksdb_writeoptions_t* options,
+ rocksdb_writebatch_t* batch, char** errptr) {
+ SaveError(errptr, db->rep->Write(options->rep, &batch->rep));
+}
+
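+// Example usage (illustrative sketch only): a plain write batch can be
+// applied atomically through the TransactionDB without opening a transaction.
+//
+//   char* err = NULL;
+//   rocksdb_writebatch_t* wb = rocksdb_writebatch_create();
+//   rocksdb_writebatch_put(wb, "a", 1, "1", 1);
+//   rocksdb_writebatch_put(wb, "b", 1, "2", 1);
+//   rocksdb_transactiondb_write(txn_db, woptions, wb, &err);
+//   rocksdb_writebatch_destroy(wb);
+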
+// Merge a key inside a transaction
+void rocksdb_transaction_merge(rocksdb_transaction_t* txn, const char* key,
+ size_t klen, const char* val, size_t vlen,
+ char** errptr) {
+ SaveError(errptr, txn->rep->Merge(Slice(key, klen), Slice(val, vlen)));
+}
+
+void rocksdb_transaction_merge_cf(rocksdb_transaction_t* txn,
+ rocksdb_column_family_handle_t* column_family,
+ const char* key, size_t klen, const char* val,
+ size_t vlen, char** errptr) {
+ SaveError(errptr, txn->rep->Merge(column_family->rep, Slice(key, klen),
+ Slice(val, vlen)));
+}
+
+// Merge a key outside a transaction
+void rocksdb_transactiondb_merge(rocksdb_transactiondb_t* txn_db,
+ const rocksdb_writeoptions_t* options,
+ const char* key, size_t klen, const char* val,
+ size_t vlen, char** errptr) {
+ SaveError(errptr, txn_db->rep->Merge(options->rep, Slice(key, klen),
+ Slice(val, vlen)));
+}
+
+void rocksdb_transactiondb_merge_cf(
+ rocksdb_transactiondb_t* txn_db, const rocksdb_writeoptions_t* options,
+ rocksdb_column_family_handle_t* column_family, const char* key, size_t klen,
+ const char* val, size_t vlen, char** errptr) {
+ SaveError(errptr, txn_db->rep->Merge(options->rep, column_family->rep,
+ Slice(key, klen), Slice(val, vlen)));
+}
+
+// Delete a key inside a transaction
+void rocksdb_transaction_delete(rocksdb_transaction_t* txn, const char* key,
+ size_t klen, char** errptr) {
+ SaveError(errptr, txn->rep->Delete(Slice(key, klen)));
+}
+
+void rocksdb_transaction_delete_cf(
+ rocksdb_transaction_t* txn, rocksdb_column_family_handle_t* column_family,
+ const char* key, size_t klen, char** errptr) {
+ SaveError(errptr, txn->rep->Delete(column_family->rep, Slice(key, klen)));
+}
+
+// Delete a key outside a transaction
+void rocksdb_transactiondb_delete(rocksdb_transactiondb_t* txn_db,
+ const rocksdb_writeoptions_t* options,
+ const char* key, size_t klen, char** errptr) {
+ SaveError(errptr, txn_db->rep->Delete(options->rep, Slice(key, klen)));
+}
+
+void rocksdb_transactiondb_delete_cf(
+ rocksdb_transactiondb_t* txn_db, const rocksdb_writeoptions_t* options,
+ rocksdb_column_family_handle_t* column_family, const char* key,
+ size_t keylen, char** errptr) {
+ SaveError(errptr, txn_db->rep->Delete(options->rep, column_family->rep,
+ Slice(key, keylen)));
+}
+
+// Create an iterator inside a transaction
+rocksdb_iterator_t* rocksdb_transaction_create_iterator(
+ rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options) {
+ rocksdb_iterator_t* result = new rocksdb_iterator_t;
+ result->rep = txn->rep->GetIterator(options->rep);
+ return result;
+}
+
+// Create an iterator inside a transaction with column family
+rocksdb_iterator_t* rocksdb_transaction_create_iterator_cf(
+ rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options,
+ rocksdb_column_family_handle_t* column_family) {
+ rocksdb_iterator_t* result = new rocksdb_iterator_t;
+ result->rep = txn->rep->GetIterator(options->rep, column_family->rep);
+ return result;
+}
+
+// Create an iterator outside a transaction
+rocksdb_iterator_t* rocksdb_transactiondb_create_iterator(
+ rocksdb_transactiondb_t* txn_db, const rocksdb_readoptions_t* options) {
+ rocksdb_iterator_t* result = new rocksdb_iterator_t;
+ result->rep = txn_db->rep->NewIterator(options->rep);
+ return result;
+}
+
+rocksdb_iterator_t* rocksdb_transactiondb_create_iterator_cf(
+ rocksdb_transactiondb_t* txn_db, const rocksdb_readoptions_t* options,
+ rocksdb_column_family_handle_t* column_family) {
+ rocksdb_iterator_t* result = new rocksdb_iterator_t;
+ result->rep = txn_db->rep->NewIterator(options->rep, column_family->rep);
+ return result;
+}
+
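+// Example usage (illustrative sketch only): an iterator created from a
+// transaction observes the transaction's own uncommitted writes on top of the
+// committed data, and must be destroyed before the transaction itself.
+//
+//   rocksdb_iterator_t* it = rocksdb_transaction_create_iterator(txn, ropts);
+//   for (rocksdb_iter_seek_to_first(it); rocksdb_iter_valid(it);
+//        rocksdb_iter_next(it)) {
+//     size_t klen, vlen;
+//     const char* k = rocksdb_iter_key(it, &klen);
+//     const char* v = rocksdb_iter_value(it, &vlen);
+//     // k/v remain valid only until the next call on the iterator.
+//   }
+//   rocksdb_iter_destroy(it);
+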
+void rocksdb_transactiondb_close(rocksdb_transactiondb_t* txn_db) {
+ delete txn_db->rep;
+ delete txn_db;
+}
+
+void rocksdb_transactiondb_flush_wal(rocksdb_transactiondb_t* txn_db,
+ unsigned char sync, char** errptr) {
+ SaveError(errptr, txn_db->rep->FlushWAL(sync));
+}
+
+void rocksdb_transactiondb_flush(rocksdb_transactiondb_t* txn_db,
+ const rocksdb_flushoptions_t* options,
+ char** errptr) {
+ SaveError(errptr, txn_db->rep->Flush(options->rep));
+}
+
+void rocksdb_transactiondb_flush_cf(
+ rocksdb_transactiondb_t* txn_db, const rocksdb_flushoptions_t* options,
+ rocksdb_column_family_handle_t* column_family, char** errptr) {
+ SaveError(errptr, txn_db->rep->Flush(options->rep, column_family->rep));
+}
+
+rocksdb_checkpoint_t* rocksdb_transactiondb_checkpoint_object_create(
+ rocksdb_transactiondb_t* txn_db, char** errptr) {
+ Checkpoint* checkpoint;
+ if (SaveError(errptr, Checkpoint::Create(txn_db->rep, &checkpoint))) {
+ return nullptr;
+ }
+ rocksdb_checkpoint_t* result = new rocksdb_checkpoint_t;
+ result->rep = checkpoint;
+ return result;
+}
+
+rocksdb_optimistictransactiondb_t* rocksdb_optimistictransactiondb_open(
+ const rocksdb_options_t* options, const char* name, char** errptr) {
+ OptimisticTransactionDB* otxn_db;
+ if (SaveError(errptr, OptimisticTransactionDB::Open(
+ options->rep, std::string(name), &otxn_db))) {
+ return nullptr;
+ }
+ rocksdb_optimistictransactiondb_t* result =
+ new rocksdb_optimistictransactiondb_t;
+ result->rep = otxn_db;
+ return result;
+}
+
+rocksdb_optimistictransactiondb_t*
+rocksdb_optimistictransactiondb_open_column_families(
+ const rocksdb_options_t* db_options, const char* name,
+ int num_column_families, const char* const* column_family_names,
+ const rocksdb_options_t* const* column_family_options,
+ rocksdb_column_family_handle_t** column_family_handles, char** errptr) {
+ std::vector<ColumnFamilyDescriptor> column_families;
+ for (int i = 0; i < num_column_families; i++) {
+ column_families.push_back(ColumnFamilyDescriptor(
+ std::string(column_family_names[i]),
+ ColumnFamilyOptions(column_family_options[i]->rep)));
+ }
+
+ OptimisticTransactionDB* otxn_db;
+ std::vector<ColumnFamilyHandle*> handles;
+ if (SaveError(errptr, OptimisticTransactionDB::Open(
+ DBOptions(db_options->rep), std::string(name),
+ column_families, &handles, &otxn_db))) {
+ return nullptr;
+ }
+
+ for (size_t i = 0; i < handles.size(); i++) {
+ rocksdb_column_family_handle_t* c_handle =
+ new rocksdb_column_family_handle_t;
+ c_handle->rep = handles[i];
+ column_family_handles[i] = c_handle;
+ }
+ rocksdb_optimistictransactiondb_t* result =
+ new rocksdb_optimistictransactiondb_t;
+ result->rep = otxn_db;
+ return result;
+}
+
+rocksdb_t* rocksdb_optimistictransactiondb_get_base_db(
+ rocksdb_optimistictransactiondb_t* otxn_db) {
+ DB* base_db = otxn_db->rep->GetBaseDB();
+
+ if (base_db != nullptr) {
+ rocksdb_t* result = new rocksdb_t;
+ result->rep = base_db;
+ return result;
+ }
+
+ return nullptr;
+}
+
+void rocksdb_optimistictransactiondb_close_base_db(rocksdb_t* base_db) {
+ delete base_db;
+}
+
+rocksdb_transaction_t* rocksdb_optimistictransaction_begin(
+ rocksdb_optimistictransactiondb_t* otxn_db,
+ const rocksdb_writeoptions_t* write_options,
+ const rocksdb_optimistictransaction_options_t* otxn_options,
+ rocksdb_transaction_t* old_txn) {
+ if (old_txn == nullptr) {
+ rocksdb_transaction_t* result = new rocksdb_transaction_t;
+ result->rep = otxn_db->rep->BeginTransaction(write_options->rep,
+ otxn_options->rep, nullptr);
+ return result;
+ }
+ old_txn->rep = otxn_db->rep->BeginTransaction(
+ write_options->rep, otxn_options->rep, old_txn->rep);
+ return old_txn;
+}
+
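+// Example usage (illustrative sketch only): with an optimistic transaction,
+// conflicts are detected at commit time, so the commit status must be checked
+// and the work retried on conflict. Passing the previous transaction as
+// old_txn (as above) reuses the object instead of allocating a new one.
+//
+//   char* err = NULL;
+//   rocksdb_transaction_t* txn = rocksdb_optimistictransaction_begin(
+//       otxn_db, write_options, otxn_options, NULL);
+//   rocksdb_transaction_put(txn, "k", 1, "v", 1, &err);
+//   rocksdb_transaction_commit(txn, &err);  // may fail on a write conflict
+//   rocksdb_transaction_destroy(txn);
+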
+// Write batch into OptimisticTransactionDB
+void rocksdb_optimistictransactiondb_write(
+ rocksdb_optimistictransactiondb_t* otxn_db,
+ const rocksdb_writeoptions_t* options, rocksdb_writebatch_t* batch,
+ char** errptr) {
+ SaveError(errptr, otxn_db->rep->Write(options->rep, &batch->rep));
+}
+
+void rocksdb_optimistictransactiondb_close(
+ rocksdb_optimistictransactiondb_t* otxn_db) {
+ delete otxn_db->rep;
+ delete otxn_db;
+}
+
+rocksdb_checkpoint_t* rocksdb_optimistictransactiondb_checkpoint_object_create(
+ rocksdb_optimistictransactiondb_t* otxn_db, char** errptr) {
+ Checkpoint* checkpoint;
+ if (SaveError(errptr, Checkpoint::Create(otxn_db->rep, &checkpoint))) {
+ return nullptr;
+ }
+ rocksdb_checkpoint_t* result = new rocksdb_checkpoint_t;
+ result->rep = checkpoint;
+ return result;
+}
+
+void rocksdb_free(void* ptr) { free(ptr); }
+
+rocksdb_pinnableslice_t* rocksdb_get_pinned(
+ rocksdb_t* db, const rocksdb_readoptions_t* options, const char* key,
+ size_t keylen, char** errptr) {
+ rocksdb_pinnableslice_t* v = new (rocksdb_pinnableslice_t);
+ Status s = db->rep->Get(options->rep, db->rep->DefaultColumnFamily(),
+ Slice(key, keylen), &v->rep);
+ if (!s.ok()) {
+ delete (v);
+ if (!s.IsNotFound()) {
+ SaveError(errptr, s);
+ }
+ return nullptr;
+ }
+ return v;
+}
+
+rocksdb_pinnableslice_t* rocksdb_get_pinned_cf(
+ rocksdb_t* db, const rocksdb_readoptions_t* options,
+ rocksdb_column_family_handle_t* column_family, const char* key,
+ size_t keylen, char** errptr) {
+ rocksdb_pinnableslice_t* v = new (rocksdb_pinnableslice_t);
+ Status s = db->rep->Get(options->rep, column_family->rep, Slice(key, keylen),
+ &v->rep);
+ if (!s.ok()) {
+ delete v;
+ if (!s.IsNotFound()) {
+ SaveError(errptr, s);
+ }
+ return nullptr;
+ }
+ return v;
+}
+
+void rocksdb_pinnableslice_destroy(rocksdb_pinnableslice_t* v) { delete v; }
+
+const char* rocksdb_pinnableslice_value(const rocksdb_pinnableslice_t* v,
+ size_t* vlen) {
+ if (!v) {
+ *vlen = 0;
+ return nullptr;
+ }
+
+ *vlen = v->rep.size();
+ return v->rep.data();
+}
+
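+// Example usage (illustrative sketch only): a pinned read avoids copying the
+// value out of the block cache. The pointer returned by
+// rocksdb_pinnableslice_value() stays valid only until the slice is
+// destroyed.
+//
+//   char* err = NULL;
+//   rocksdb_pinnableslice_t* p = rocksdb_get_pinned(db, ropts, "key", 3, &err);
+//   if (p != NULL) {
+//     size_t vlen;
+//     const char* v = rocksdb_pinnableslice_value(p, &vlen);
+//     // ... use v/vlen ...
+//     rocksdb_pinnableslice_destroy(p);
+//   }
+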
+// container to keep databases and caches in order to use
+// ROCKSDB_NAMESPACE::MemoryUtil
+struct rocksdb_memory_consumers_t {
+ std::vector<rocksdb_t*> dbs;
+ std::unordered_set<rocksdb_cache_t*> caches;
+};
+
+// initializes new container of memory consumers
+rocksdb_memory_consumers_t* rocksdb_memory_consumers_create() {
+ return new rocksdb_memory_consumers_t;
+}
+
+// adds a database to the container of memory consumers
+void rocksdb_memory_consumers_add_db(rocksdb_memory_consumers_t* consumers,
+ rocksdb_t* db) {
+ consumers->dbs.push_back(db);
+}
+
+// adds cache to the container of memory consumers
+void rocksdb_memory_consumers_add_cache(rocksdb_memory_consumers_t* consumers,
+ rocksdb_cache_t* cache) {
+ consumers->caches.insert(cache);
+}
+
+// deletes container with memory consumers
+void rocksdb_memory_consumers_destroy(rocksdb_memory_consumers_t* consumers) {
+ delete consumers;
+}
+
+// contains memory usage statistics provided by ROCKSDB_NAMESPACE::MemoryUtil
+struct rocksdb_memory_usage_t {
+ uint64_t mem_table_total;
+ uint64_t mem_table_unflushed;
+ uint64_t mem_table_readers_total;
+ uint64_t cache_total;
+};
+
+// estimates amount of memory occupied by consumers (dbs and caches)
+rocksdb_memory_usage_t* rocksdb_approximate_memory_usage_create(
+ rocksdb_memory_consumers_t* consumers, char** errptr) {
+ vector<DB*> dbs;
+ for (auto db : consumers->dbs) {
+ dbs.push_back(db->rep);
+ }
+
+ unordered_set<const Cache*> cache_set;
+ for (auto cache : consumers->caches) {
+ cache_set.insert(const_cast<const Cache*>(cache->rep.get()));
+ }
+
+ std::map<ROCKSDB_NAMESPACE::MemoryUtil::UsageType, uint64_t> usage_by_type;
+
+ auto status = MemoryUtil::GetApproximateMemoryUsageByType(dbs, cache_set,
+ &usage_by_type);
+ if (SaveError(errptr, status)) {
+ return nullptr;
+ }
+
+ auto result = new rocksdb_memory_usage_t;
+ result->mem_table_total = usage_by_type[MemoryUtil::kMemTableTotal];
+ result->mem_table_unflushed = usage_by_type[MemoryUtil::kMemTableUnFlushed];
+ result->mem_table_readers_total =
+ usage_by_type[MemoryUtil::kTableReadersTotal];
+ result->cache_total = usage_by_type[MemoryUtil::kCacheTotal];
+ return result;
+}
+
+uint64_t rocksdb_approximate_memory_usage_get_mem_table_total(
+ rocksdb_memory_usage_t* memory_usage) {
+ return memory_usage->mem_table_total;
+}
+
+uint64_t rocksdb_approximate_memory_usage_get_mem_table_unflushed(
+ rocksdb_memory_usage_t* memory_usage) {
+ return memory_usage->mem_table_unflushed;
+}
+
+uint64_t rocksdb_approximate_memory_usage_get_mem_table_readers_total(
+ rocksdb_memory_usage_t* memory_usage) {
+ return memory_usage->mem_table_readers_total;
+}
+
+uint64_t rocksdb_approximate_memory_usage_get_cache_total(
+ rocksdb_memory_usage_t* memory_usage) {
+ return memory_usage->cache_total;
+}
+
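+// Example usage (illustrative sketch only): register the DBs and caches of
+// interest, take a usage snapshot, read the counters, and destroy both
+// objects afterwards.
+//
+//   char* err = NULL;
+//   rocksdb_memory_consumers_t* consumers = rocksdb_memory_consumers_create();
+//   rocksdb_memory_consumers_add_db(consumers, db);
+//   rocksdb_memory_consumers_add_cache(consumers, cache);
+//   rocksdb_memory_usage_t* usage =
+//       rocksdb_approximate_memory_usage_create(consumers, &err);
+//   if (usage != NULL) {
+//     uint64_t memtables =
+//         rocksdb_approximate_memory_usage_get_mem_table_total(usage);
+//     rocksdb_approximate_memory_usage_destroy(usage);
+//   }
+//   rocksdb_memory_consumers_destroy(consumers);
+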
+void rocksdb_options_set_dump_malloc_stats(rocksdb_options_t* opt,
+ unsigned char val) {
+ opt->rep.dump_malloc_stats = val;
+}
+
+void rocksdb_options_set_memtable_whole_key_filtering(rocksdb_options_t* opt,
+ unsigned char val) {
+ opt->rep.memtable_whole_key_filtering = val;
+}
+
+void rocksdb_options_set_avoid_unnecessary_blocking_io(rocksdb_options_t* opt,
+ unsigned char val) {
+ opt->rep.avoid_unnecessary_blocking_io = val;
+}
+
+unsigned char rocksdb_options_get_avoid_unnecessary_blocking_io(
+ rocksdb_options_t* opt) {
+ return opt->rep.avoid_unnecessary_blocking_io;
+}
+
+// deletes container with memory usage estimates
+void rocksdb_approximate_memory_usage_destroy(rocksdb_memory_usage_t* usage) {
+ delete usage;
+}
+
+void rocksdb_cancel_all_background_work(rocksdb_t* db, unsigned char wait) {
+ CancelAllBackgroundWork(db->rep, wait);
+}
+
+void rocksdb_disable_manual_compaction(rocksdb_t* db) {
+ db->rep->DisableManualCompaction();
+}
+
+void rocksdb_enable_manual_compaction(rocksdb_t* db) {
+ db->rep->EnableManualCompaction();
+}
+
+} // end extern "C"
+
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/c_test.c b/src/rocksdb/db/c_test.c
new file mode 100644
index 000000000..249ab9023
--- /dev/null
+++ b/src/rocksdb/db/c_test.c
@@ -0,0 +1,3476 @@
+/* Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+ Use of this source code is governed by a BSD-style license that can be
+ found in the LICENSE file. See the AUTHORS file for names of contributors. */
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+#include <stdio.h>
+
+#ifndef ROCKSDB_LITE // Lite does not support C API
+
+#include <assert.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+
+#include "rocksdb/c.h"
+#ifndef OS_WIN
+#include <unistd.h>
+#endif
+#include <inttypes.h>
+
+// Cannot use port/port.h macros as this is a C file
+#ifdef OS_WIN
+#include <windows.h>
+
+// Combining process and thread ids is good enough for test-path uniqueness
+int geteuid() {
+ int result = 0;
+
+ result = ((int)GetCurrentProcessId() << 16);
+ result |= (int)GetCurrentThreadId();
+
+ return result;
+}
+
+#endif
+
+const char* phase = "";
+static char dbname[200];
+static char sstfilename[200];
+static char dbbackupname[200];
+static char dbcheckpointname[200];
+static char dbpathname[200];
+static char secondary_path[200];
+
+static void StartPhase(const char* name) {
+ fprintf(stderr, "=== Test %s\n", name);
+ phase = name;
+}
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable : 4996) // getenv security warning
+#endif
+static const char* GetTempDir(void) {
+ const char* ret = getenv("TEST_TMPDIR");
+ if (ret == NULL || ret[0] == '\0')
+#ifdef OS_WIN
+ ret = getenv("TEMP");
+#else
+ ret = "/tmp";
+#endif
+ return ret;
+}
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+
+#define CheckNoError(err) \
+ if ((err) != NULL) { \
+ fprintf(stderr, "%s:%d: %s: %s\n", __FILE__, __LINE__, phase, (err)); \
+ abort(); \
+ }
+
+#define CheckCondition(cond) \
+ if (!(cond)) { \
+ fprintf(stderr, "%s:%d: %s: %s\n", __FILE__, __LINE__, phase, #cond); \
+ abort(); \
+ }
+
+static void CheckEqual(const char* expected, const char* v, size_t n) {
+ if (expected == NULL && v == NULL) {
+ // ok
+ } else if (expected != NULL && v != NULL && n == strlen(expected) &&
+ memcmp(expected, v, n) == 0) {
+ // ok
+ return;
+ } else {
+ fprintf(stderr, "%s: expected '%s', got '%s'\n", phase,
+ (expected ? expected : "(null)"), (v ? v : "(null)"));
+ abort();
+ }
+}
+
+static void Free(char** ptr) {
+ if (*ptr) {
+ free(*ptr);
+ *ptr = NULL;
+ }
+}
+
+static void CheckValue(char* err, const char* expected, char** actual,
+ size_t actual_length) {
+ CheckNoError(err);
+ CheckEqual(expected, *actual, actual_length);
+ Free(actual);
+}
+
+static void CheckGet(rocksdb_t* db, const rocksdb_readoptions_t* options,
+ const char* key, const char* expected) {
+ char* err = NULL;
+ size_t val_len;
+ char* val;
+ val = rocksdb_get(db, options, key, strlen(key), &val_len, &err);
+ CheckNoError(err);
+ CheckEqual(expected, val, val_len);
+ Free(&val);
+}
+
+static void CheckGetCF(rocksdb_t* db, const rocksdb_readoptions_t* options,
+ rocksdb_column_family_handle_t* handle, const char* key,
+ const char* expected) {
+ char* err = NULL;
+ size_t val_len;
+ char* val;
+ val = rocksdb_get_cf(db, options, handle, key, strlen(key), &val_len, &err);
+ CheckNoError(err);
+ CheckEqual(expected, val, val_len);
+ Free(&val);
+}
+
+static void CheckPinGet(rocksdb_t* db, const rocksdb_readoptions_t* options,
+ const char* key, const char* expected) {
+ char* err = NULL;
+ size_t val_len;
+ const char* val;
+ rocksdb_pinnableslice_t* p;
+ p = rocksdb_get_pinned(db, options, key, strlen(key), &err);
+ CheckNoError(err);
+ val = rocksdb_pinnableslice_value(p, &val_len);
+ CheckEqual(expected, val, val_len);
+ rocksdb_pinnableslice_destroy(p);
+}
+
+static void CheckPinGetCF(rocksdb_t* db, const rocksdb_readoptions_t* options,
+ rocksdb_column_family_handle_t* handle,
+ const char* key, const char* expected) {
+ char* err = NULL;
+ size_t val_len;
+ const char* val;
+ rocksdb_pinnableslice_t* p;
+ p = rocksdb_get_pinned_cf(db, options, handle, key, strlen(key), &err);
+ CheckNoError(err);
+ val = rocksdb_pinnableslice_value(p, &val_len);
+ CheckEqual(expected, val, val_len);
+ rocksdb_pinnableslice_destroy(p);
+}
+
+static void CheckMultiGetValues(size_t num_keys, char** values,
+ size_t* values_sizes, char** errs,
+ const char** expected) {
+ for (size_t i = 0; i < num_keys; i++) {
+ CheckNoError(errs[i]);
+ CheckEqual(expected[i], values[i], values_sizes[i]);
+ Free(&values[i]);
+ }
+}
+
+static void CheckIter(rocksdb_iterator_t* iter, const char* key,
+ const char* val) {
+ size_t len;
+ const char* str;
+ str = rocksdb_iter_key(iter, &len);
+ CheckEqual(key, str, len);
+ str = rocksdb_iter_value(iter, &len);
+ CheckEqual(val, str, len);
+}
+
+// Callback from rocksdb_writebatch_iterate()
+static void CheckPut(void* ptr, const char* k, size_t klen, const char* v,
+ size_t vlen) {
+ int* state = (int*)ptr;
+ CheckCondition(*state < 2);
+ switch (*state) {
+ case 0:
+ CheckEqual("bar", k, klen);
+ CheckEqual("b", v, vlen);
+ break;
+ case 1:
+ CheckEqual("box", k, klen);
+ CheckEqual("c", v, vlen);
+ break;
+ }
+ (*state)++;
+}
+
+// Callback from rocksdb_writebatch_iterate()
+static void CheckDel(void* ptr, const char* k, size_t klen) {
+ int* state = (int*)ptr;
+ CheckCondition(*state == 2);
+ CheckEqual("bar", k, klen);
+ (*state)++;
+}
+
+static void CmpDestroy(void* arg) { (void)arg; }
+
+static int CmpCompare(void* arg, const char* a, size_t alen, const char* b,
+ size_t blen) {
+ (void)arg;
+ size_t n = (alen < blen) ? alen : blen;
+ int r = memcmp(a, b, n);
+ if (r == 0) {
+ if (alen < blen)
+ r = -1;
+ else if (alen > blen)
+ r = +1;
+ }
+ return r;
+}
+
+static const char* CmpName(void* arg) {
+ (void)arg;
+ return "foo";
+}
+
+// Custom compaction filter
+static void CFilterDestroy(void* arg) { (void)arg; }
+static const char* CFilterName(void* arg) {
+ (void)arg;
+ return "foo";
+}
+static unsigned char CFilterFilter(void* arg, int level, const char* key,
+ size_t key_length,
+ const char* existing_value,
+ size_t value_length, char** new_value,
+ size_t* new_value_length,
+ unsigned char* value_changed) {
+ (void)arg;
+ (void)level;
+ (void)existing_value;
+ (void)value_length;
+ if (key_length == 3) {
+ if (memcmp(key, "bar", key_length) == 0) {
+ return 1;
+ } else if (memcmp(key, "baz", key_length) == 0) {
+ *value_changed = 1;
+ *new_value = "newbazvalue";
+ *new_value_length = 11;
+ return 0;
+ }
+ }
+ return 0;
+}
+
+static void CFilterFactoryDestroy(void* arg) { (void)arg; }
+static const char* CFilterFactoryName(void* arg) {
+ (void)arg;
+ return "foo";
+}
+static rocksdb_compactionfilter_t* CFilterCreate(
+ void* arg, rocksdb_compactionfiltercontext_t* context) {
+ (void)arg;
+ (void)context;
+ return rocksdb_compactionfilter_create(NULL, CFilterDestroy, CFilterFilter,
+ CFilterName);
+}
+
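+// Illustrative sketch (assuming the factory constructors declared in
+// rocksdb/c.h) of how the callbacks above get wired into options: keys equal
+// to "bar" are dropped during compaction and "baz" is rewritten to
+// "newbazvalue".
+//
+//   rocksdb_compactionfilterfactory_t* factory =
+//       rocksdb_compactionfilterfactory_create(
+//           NULL, CFilterFactoryDestroy, CFilterCreate, CFilterFactoryName);
+//   rocksdb_options_set_compaction_filter_factory(options, factory);
+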
+void CheckMetaData(rocksdb_column_family_metadata_t* cf_meta,
+ const char* expected_cf_name) {
+ char* cf_name = rocksdb_column_family_metadata_get_name(cf_meta);
+ assert(strcmp(cf_name, expected_cf_name) == 0);
+ rocksdb_free(cf_name);
+
+ size_t cf_size = rocksdb_column_family_metadata_get_size(cf_meta);
+ assert(cf_size > 0);
+ size_t cf_file_count = rocksdb_column_family_metadata_get_file_count(cf_meta);
+ assert(cf_file_count > 0);
+
+ uint64_t total_level_size = 0;
+ size_t total_file_count = 0;
+ size_t level_count = rocksdb_column_family_metadata_get_level_count(cf_meta);
+ assert(level_count > 0);
+ for (size_t l = 0; l < level_count; ++l) {
+ rocksdb_level_metadata_t* level_meta =
+ rocksdb_column_family_metadata_get_level_metadata(cf_meta, l);
+ assert(level_meta);
+ assert(rocksdb_level_metadata_get_level(level_meta) >= (int)l);
+ uint64_t level_size = rocksdb_level_metadata_get_size(level_meta);
+ uint64_t file_size_in_level = 0;
+
+ size_t file_count = rocksdb_level_metadata_get_file_count(level_meta);
+ total_file_count += file_count;
+ for (size_t f = 0; f < file_count; ++f) {
+ rocksdb_sst_file_metadata_t* file_meta =
+ rocksdb_level_metadata_get_sst_file_metadata(level_meta, f);
+ assert(file_meta);
+
+ uint64_t file_size = rocksdb_sst_file_metadata_get_size(file_meta);
+ assert(file_size > 0);
+ file_size_in_level += file_size;
+
+ char* file_name =
+ rocksdb_sst_file_metadata_get_relative_filename(file_meta);
+ assert(file_name);
+ assert(strlen(file_name) > 0);
+ rocksdb_free(file_name);
+
+ size_t smallest_key_len;
+ char* smallest_key = rocksdb_sst_file_metadata_get_smallestkey(
+ file_meta, &smallest_key_len);
+ assert(smallest_key);
+ assert(smallest_key_len > 0);
+ size_t largest_key_len;
+ char* largest_key =
+ rocksdb_sst_file_metadata_get_largestkey(file_meta, &largest_key_len);
+ assert(largest_key);
+ assert(largest_key_len > 0);
+ rocksdb_free(smallest_key);
+ rocksdb_free(largest_key);
+
+ rocksdb_sst_file_metadata_destroy(file_meta);
+ }
+ assert(level_size == file_size_in_level);
+ total_level_size += level_size;
+ rocksdb_level_metadata_destroy(level_meta);
+ }
+ assert(total_file_count > 0);
+ assert(cf_size == total_level_size);
+}
+
+void GetAndCheckMetaData(rocksdb_t* db) {
+ rocksdb_column_family_metadata_t* cf_meta =
+ rocksdb_get_column_family_metadata(db);
+
+ CheckMetaData(cf_meta, "default");
+
+ rocksdb_column_family_metadata_destroy(cf_meta);
+}
+
+void GetAndCheckMetaDataCf(rocksdb_t* db,
+ rocksdb_column_family_handle_t* handle,
+ const char* cf_name) {
+ // Compact to make sure we have at least one sst file to obtain metadata.
+ rocksdb_compact_range_cf(db, handle, NULL, 0, NULL, 0);
+
+ rocksdb_column_family_metadata_t* cf_meta =
+ rocksdb_get_column_family_metadata_cf(db, handle);
+
+ CheckMetaData(cf_meta, cf_name);
+
+ rocksdb_column_family_metadata_destroy(cf_meta);
+}
+
+static rocksdb_t* CheckCompaction(rocksdb_t* db, rocksdb_options_t* options,
+ rocksdb_readoptions_t* roptions,
+ rocksdb_writeoptions_t* woptions) {
+ char* err = NULL;
+ db = rocksdb_open(options, dbname, &err);
+ CheckNoError(err);
+ rocksdb_put(db, woptions, "foo", 3, "foovalue", 8, &err);
+ CheckNoError(err);
+ CheckGet(db, roptions, "foo", "foovalue");
+ rocksdb_put(db, woptions, "bar", 3, "barvalue", 8, &err);
+ CheckNoError(err);
+ CheckGet(db, roptions, "bar", "barvalue");
+ rocksdb_put(db, woptions, "baz", 3, "bazvalue", 8, &err);
+ CheckNoError(err);
+ CheckGet(db, roptions, "baz", "bazvalue");
+
+ // Disable compaction
+ rocksdb_disable_manual_compaction(db);
+ rocksdb_compact_range(db, NULL, 0, NULL, 0);
+ // should not filter anything when disabled
+ CheckGet(db, roptions, "foo", "foovalue");
+ CheckGet(db, roptions, "bar", "barvalue");
+ CheckGet(db, roptions, "baz", "bazvalue");
+ // Reenable compaction
+ rocksdb_enable_manual_compaction(db);
+
+ // Force compaction
+ rocksdb_compact_range(db, NULL, 0, NULL, 0);
+ // should have filtered bar, but not foo
+ CheckGet(db, roptions, "foo", "foovalue");
+ CheckGet(db, roptions, "bar", NULL);
+ CheckGet(db, roptions, "baz", "newbazvalue");
+
+ rocksdb_suggest_compact_range(db, "bar", 3, "foo", 3, &err);
+ GetAndCheckMetaData(db);
+ CheckNoError(err);
+
+ return db;
+}
+
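+// Illustrative sketch of how CheckCompaction is driven (a compaction-filter
+// style phase, assuming the filter constructors declared in rocksdb/c.h): a
+// fresh DB is opened with CFilterFilter installed so the manual compaction
+// inside CheckCompaction exercises the filter.
+//
+//   rocksdb_options_t* options_with_filter = rocksdb_options_create();
+//   rocksdb_options_set_create_if_missing(options_with_filter, 1);
+//   rocksdb_compactionfilter_t* cfilter = rocksdb_compactionfilter_create(
+//       NULL, CFilterDestroy, CFilterFilter, CFilterName);
+//   rocksdb_options_set_compaction_filter(options_with_filter, cfilter);
+//   db = CheckCompaction(db, options_with_filter, roptions, woptions);
+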
+// Custom merge operator
+static void MergeOperatorDestroy(void* arg) { (void)arg; }
+static const char* MergeOperatorName(void* arg) {
+ (void)arg;
+ return "TestMergeOperator";
+}
+static char* MergeOperatorFullMerge(
+ void* arg, const char* key, size_t key_length, const char* existing_value,
+ size_t existing_value_length, const char* const* operands_list,
+ const size_t* operands_list_length, int num_operands,
+ unsigned char* success, size_t* new_value_length) {
+ (void)arg;
+ (void)key;
+ (void)key_length;
+ (void)existing_value;
+ (void)existing_value_length;
+ (void)operands_list;
+ (void)operands_list_length;
+ (void)num_operands;
+ *new_value_length = 4;
+ *success = 1;
+ char* result = malloc(4);
+ memcpy(result, "fake", 4);
+ return result;
+}
+static char* MergeOperatorPartialMerge(void* arg, const char* key,
+ size_t key_length,
+ const char* const* operands_list,
+ const size_t* operands_list_length,
+ int num_operands, unsigned char* success,
+ size_t* new_value_length) {
+ (void)arg;
+ (void)key;
+ (void)key_length;
+ (void)operands_list;
+ (void)operands_list_length;
+ (void)num_operands;
+ *new_value_length = 4;
+ *success = 1;
+ char* result = malloc(4);
+ memcpy(result, "fake", 4);
+ return result;
+}
+
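+// Illustrative sketch (assuming rocksdb_mergeoperator_create and
+// rocksdb_options_set_merge_operator as declared in rocksdb/c.h; the NULL
+// argument is the optional delete-value callback) of registering the merge
+// callbacks above:
+//
+//   rocksdb_mergeoperator_t* merge_op = rocksdb_mergeoperator_create(
+//       NULL, MergeOperatorDestroy, MergeOperatorFullMerge,
+//       MergeOperatorPartialMerge, NULL, MergeOperatorName);
+//   rocksdb_options_set_merge_operator(options, merge_op);
+//   // With these options, rocksdb_merge() on any key resolves to the literal
+//   // value "fake" produced by the callbacks above.
+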
+static void CheckTxnGet(rocksdb_transaction_t* txn,
+ const rocksdb_readoptions_t* options, const char* key,
+ const char* expected) {
+ char* err = NULL;
+ size_t val_len;
+ char* val;
+ val = rocksdb_transaction_get(txn, options, key, strlen(key), &val_len, &err);
+ CheckNoError(err);
+ CheckEqual(expected, val, val_len);
+ Free(&val);
+}
+
+static void CheckTxnGetCF(rocksdb_transaction_t* txn,
+ const rocksdb_readoptions_t* options,
+ rocksdb_column_family_handle_t* column_family,
+ const char* key, const char* expected) {
+ char* err = NULL;
+ size_t val_len;
+ char* val;
+ val = rocksdb_transaction_get_cf(txn, options, column_family, key,
+ strlen(key), &val_len, &err);
+ CheckNoError(err);
+ CheckEqual(expected, val, val_len);
+ Free(&val);
+}
+
+static void CheckTxnPinGet(rocksdb_transaction_t* txn,
+ const rocksdb_readoptions_t* options,
+ const char* key, const char* expected) {
+ rocksdb_pinnableslice_t* p = NULL;
+ const char* val = NULL;
+ char* err = NULL;
+ size_t val_len;
+ p = rocksdb_transaction_get_pinned(txn, options, key, strlen(key), &err);
+ CheckNoError(err);
+ val = rocksdb_pinnableslice_value(p, &val_len);
+ CheckEqual(expected, val, val_len);
+ rocksdb_pinnableslice_destroy(p);
+}
+
+static void CheckTxnPinGetCF(rocksdb_transaction_t* txn,
+ const rocksdb_readoptions_t* options,
+ rocksdb_column_family_handle_t* column_family,
+ const char* key, const char* expected) {
+ rocksdb_pinnableslice_t* p = NULL;
+ const char* val = NULL;
+ char* err = NULL;
+ size_t val_len;
+ p = rocksdb_transaction_get_pinned_cf(txn, options, column_family, key,
+ strlen(key), &err);
+ CheckNoError(err);
+ val = rocksdb_pinnableslice_value(p, &val_len);
+ CheckEqual(expected, val, val_len);
+ rocksdb_pinnableslice_destroy(p);
+}
+
+static void CheckTxnDBGet(rocksdb_transactiondb_t* txn_db,
+ const rocksdb_readoptions_t* options, const char* key,
+ const char* expected) {
+ char* err = NULL;
+ size_t val_len;
+ char* val;
+ val = rocksdb_transactiondb_get(txn_db, options, key, strlen(key), &val_len,
+ &err);
+ CheckNoError(err);
+ CheckEqual(expected, val, val_len);
+ Free(&val);
+}
+
+static void CheckTxnDBGetCF(rocksdb_transactiondb_t* txn_db,
+ const rocksdb_readoptions_t* options,
+ rocksdb_column_family_handle_t* column_family,
+ const char* key, const char* expected) {
+ char* err = NULL;
+ size_t val_len;
+ char* val;
+ val = rocksdb_transactiondb_get_cf(txn_db, options, column_family, key,
+ strlen(key), &val_len, &err);
+ CheckNoError(err);
+ CheckEqual(expected, val, val_len);
+ Free(&val);
+}
+
+static void CheckTxnDBPinGet(rocksdb_transactiondb_t* txn_db,
+ const rocksdb_readoptions_t* options,
+ const char* key, const char* expected) {
+ rocksdb_pinnableslice_t* p = NULL;
+ const char* val = NULL;
+ char* err = NULL;
+ size_t val_len;
+ p = rocksdb_transactiondb_get_pinned(txn_db, options, key, strlen(key), &err);
+ CheckNoError(err);
+ val = rocksdb_pinnableslice_value(p, &val_len);
+ CheckEqual(expected, val, val_len);
+ rocksdb_pinnableslice_destroy(p);
+}
+
+static void CheckTxnDBPinGetCF(rocksdb_transactiondb_t* txn_db,
+ const rocksdb_readoptions_t* options,
+ rocksdb_column_family_handle_t* column_family,
+ const char* key, const char* expected) {
+ rocksdb_pinnableslice_t* p = NULL;
+ const char* val = NULL;
+ char* err = NULL;
+ size_t val_len;
+ p = rocksdb_transactiondb_get_pinned_cf(txn_db, options, column_family, key,
+ strlen(key), &err);
+ CheckNoError(err);
+ val = rocksdb_pinnableslice_value(p, &val_len);
+ CheckEqual(expected, val, val_len);
+ rocksdb_pinnableslice_destroy(p);
+}
+
+static void LoadAndCheckLatestOptions(const char* db_name, rocksdb_env_t* env,
+ bool ignore_unknown_options,
+ rocksdb_cache_t* cache,
+ rocksdb_comparator_t* cmp,
+ const size_t expected_num_column_families,
+ const char** expected_cf_names,
+ const char* expected_open_err) {
+ rocksdb_options_t* db_options;
+ size_t num_column_families;
+ char** list_column_family_names;
+ rocksdb_options_t** list_column_family_options;
+ char* err = 0;
+
+ // load the latest rocksdb options
+ rocksdb_load_latest_options(db_name, env, ignore_unknown_options, cache,
+ &db_options, &num_column_families,
+ &list_column_family_names,
+ &list_column_family_options, &err);
+ assert(num_column_families == expected_num_column_families);
+ CheckNoError(err);
+
+ // verify the loaded options by opening the db.
+ rocksdb_options_set_error_if_exists(db_options, 0);
+
+ char** list_const_cf_names =
+ (char**)malloc(num_column_families * sizeof(char*));
+ rocksdb_options_t** list_const_cf_options = (rocksdb_options_t**)malloc(
+ num_column_families * sizeof(rocksdb_options_t*));
+ for (size_t i = 0; i < num_column_families; ++i) {
+ assert(strcmp(list_column_family_names[i], expected_cf_names[i]) == 0);
+ list_const_cf_names[i] = list_column_family_names[i];
+ if (cmp) {
+ rocksdb_options_set_comparator(list_column_family_options[i], cmp);
+ }
+ list_const_cf_options[i] = list_column_family_options[i];
+ }
+ rocksdb_column_family_handle_t** handles =
+ (rocksdb_column_family_handle_t**)malloc(
+ num_column_families * sizeof(rocksdb_column_family_handle_t*));
+
+ rocksdb_t* db = rocksdb_open_column_families(
+ db_options, db_name, (int)num_column_families,
+ (const char* const*)list_const_cf_names,
+ (const rocksdb_options_t* const*)list_const_cf_options, handles, &err);
+ if (expected_open_err == NULL) {
+ CheckNoError(err);
+ for (size_t i = 0; i < num_column_families; ++i) {
+ rocksdb_column_family_handle_destroy(handles[i]);
+ }
+ free(handles);
+ rocksdb_close(db);
+ } else {
+ assert(err != NULL);
+ assert(strcmp(err, expected_open_err) == 0);
+ free(handles);
+ free(err);
+ }
+
+ free(list_const_cf_names);
+ free(list_const_cf_options);
+ rocksdb_load_latest_options_destroy(db_options, list_column_family_names,
+ list_column_family_options,
+ num_column_families);
+}
+
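+// Illustrative sketch of a typical invocation (the column family names are
+// placeholders for whatever the test created earlier; passing NULL for cmp
+// keeps the default comparator):
+//
+//   const char* expected_cf_names[2] = {"default", "cf1"};
+//   LoadAndCheckLatestOptions(dbname, env, false, cache, NULL, 2,
+//                             expected_cf_names, NULL);
+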
+int main(int argc, char** argv) {
+ (void)argc;
+ (void)argv;
+ rocksdb_t* db;
+ rocksdb_comparator_t* cmp;
+ rocksdb_cache_t* cache;
+ rocksdb_dbpath_t* dbpath;
+ rocksdb_env_t* env;
+ rocksdb_options_t* options;
+ rocksdb_compactoptions_t* coptions;
+ rocksdb_block_based_table_options_t* table_options;
+ rocksdb_readoptions_t* roptions;
+ rocksdb_writeoptions_t* woptions;
+ rocksdb_ratelimiter_t* rate_limiter;
+ rocksdb_transactiondb_t* txn_db;
+ rocksdb_transactiondb_options_t* txn_db_options;
+ rocksdb_transaction_t* txn;
+ rocksdb_transaction_options_t* txn_options;
+ rocksdb_optimistictransactiondb_t* otxn_db;
+ rocksdb_optimistictransaction_options_t* otxn_options;
+ char* err = NULL;
+ int run = -1;
+
+ snprintf(dbname, sizeof(dbname), "%s/rocksdb_c_test-%d", GetTempDir(),
+ ((int)geteuid()));
+
+ snprintf(dbbackupname, sizeof(dbbackupname), "%s/rocksdb_c_test-%d-backup",
+ GetTempDir(), ((int)geteuid()));
+
+ snprintf(dbcheckpointname, sizeof(dbcheckpointname),
+ "%s/rocksdb_c_test-%d-checkpoint", GetTempDir(), ((int)geteuid()));
+
+ snprintf(sstfilename, sizeof(sstfilename), "%s/rocksdb_c_test-%d-sst",
+ GetTempDir(), ((int)geteuid()));
+
+ snprintf(dbpathname, sizeof(dbpathname), "%s/rocksdb_c_test-%d-dbpath",
+ GetTempDir(), ((int)geteuid()));
+
+ StartPhase("create_objects");
+ cmp = rocksdb_comparator_create(NULL, CmpDestroy, CmpCompare, CmpName);
+ dbpath = rocksdb_dbpath_create(dbpathname, 1024 * 1024);
+ env = rocksdb_create_default_env();
+
+ rocksdb_create_dir_if_missing(env, GetTempDir(), &err);
+ CheckNoError(err);
+
+ cache = rocksdb_cache_create_lru(100000);
+
+ options = rocksdb_options_create();
+ rocksdb_options_set_comparator(options, cmp);
+ rocksdb_options_set_error_if_exists(options, 1);
+ rocksdb_options_set_env(options, env);
+ rocksdb_options_set_info_log(options, NULL);
+ rocksdb_options_set_write_buffer_size(options, 100000);
+ rocksdb_options_set_paranoid_checks(options, 1);
+ rocksdb_options_set_max_open_files(options, 10);
+
+ table_options = rocksdb_block_based_options_create();
+ rocksdb_block_based_options_set_block_cache(table_options, cache);
+ rocksdb_block_based_options_set_data_block_index_type(table_options, 1);
+ rocksdb_block_based_options_set_data_block_hash_ratio(table_options, 0.75);
+ rocksdb_options_set_block_based_table_factory(options, table_options);
+
+ rocksdb_options_set_compression(options, rocksdb_no_compression);
+ rocksdb_options_set_compression_options(options, -14, -1, 0, 0);
+ int compression_levels[] = {rocksdb_no_compression, rocksdb_no_compression,
+ rocksdb_no_compression, rocksdb_no_compression};
+ rocksdb_options_set_compression_per_level(options, compression_levels, 4);
+ rate_limiter = rocksdb_ratelimiter_create(1000 * 1024 * 1024, 100 * 1000, 10);
+ rocksdb_options_set_ratelimiter(options, rate_limiter);
+ rocksdb_ratelimiter_destroy(rate_limiter);
+
+ roptions = rocksdb_readoptions_create();
+ rocksdb_readoptions_set_verify_checksums(roptions, 1);
+ rocksdb_readoptions_set_fill_cache(roptions, 1);
+
+ woptions = rocksdb_writeoptions_create();
+ rocksdb_writeoptions_set_sync(woptions, 1);
+
+ coptions = rocksdb_compactoptions_create();
+ rocksdb_compactoptions_set_exclusive_manual_compaction(coptions, 1);
+
+ rocksdb_options_add_compact_on_deletion_collector_factory(options, 10000,
+ 10001);
+
+ StartPhase("destroy");
+ rocksdb_destroy_db(options, dbname, &err);
+ Free(&err);
+
+ StartPhase("open_error");
+ rocksdb_open(options, dbname, &err);
+ CheckCondition(err != NULL);
+ Free(&err);
+
+ StartPhase("open");
+ rocksdb_options_set_create_if_missing(options, 1);
+ db = rocksdb_open(options, dbname, &err);
+ CheckNoError(err);
+ CheckGet(db, roptions, "foo", NULL);
+
+ StartPhase("put");
+ rocksdb_put(db, woptions, "foo", 3, "hello", 5, &err);
+ CheckNoError(err);
+ CheckGet(db, roptions, "foo", "hello");
+
+ StartPhase("backup_and_restore");
+ {
+ rocksdb_destroy_db(options, dbbackupname, &err);
+ CheckNoError(err);
+
+ rocksdb_backup_engine_t* be =
+ rocksdb_backup_engine_open(options, dbbackupname, &err);
+ CheckNoError(err);
+
+ rocksdb_backup_engine_create_new_backup(be, db, &err);
+ CheckNoError(err);
+
+ // need a change to trigger a new backup
+ rocksdb_delete(db, woptions, "does-not-exist", 14, &err);
+ CheckNoError(err);
+
+ rocksdb_backup_engine_create_new_backup(be, db, &err);
+ CheckNoError(err);
+
+ const rocksdb_backup_engine_info_t* bei =
+ rocksdb_backup_engine_get_backup_info(be);
+ CheckCondition(rocksdb_backup_engine_info_count(bei) > 1);
+ rocksdb_backup_engine_info_destroy(bei);
+
+ rocksdb_backup_engine_purge_old_backups(be, 1, &err);
+ CheckNoError(err);
+
+ bei = rocksdb_backup_engine_get_backup_info(be);
+ CheckCondition(rocksdb_backup_engine_info_count(bei) == 1);
+ rocksdb_backup_engine_info_destroy(bei);
+
+ rocksdb_delete(db, woptions, "foo", 3, &err);
+ CheckNoError(err);
+
+ rocksdb_close(db);
+
+ rocksdb_destroy_db(options, dbname, &err);
+ CheckNoError(err);
+
+ rocksdb_restore_options_t* restore_options =
+ rocksdb_restore_options_create();
+ rocksdb_restore_options_set_keep_log_files(restore_options, 0);
+ rocksdb_backup_engine_restore_db_from_latest_backup(be, dbname, dbname,
+ restore_options, &err);
+ CheckNoError(err);
+ rocksdb_restore_options_destroy(restore_options);
+
+ rocksdb_options_set_error_if_exists(options, 0);
+ db = rocksdb_open(options, dbname, &err);
+ CheckNoError(err);
+ rocksdb_options_set_error_if_exists(options, 1);
+
+ CheckGet(db, roptions, "foo", "hello");
+
+ rocksdb_backup_engine_close(be);
+ }
+
+ StartPhase("checkpoint");
+ {
+ rocksdb_destroy_db(options, dbcheckpointname, &err);
+ CheckNoError(err);
+
+ rocksdb_checkpoint_t* checkpoint =
+ rocksdb_checkpoint_object_create(db, &err);
+ CheckNoError(err);
+
+ rocksdb_checkpoint_create(checkpoint, dbcheckpointname, 0, &err);
+ CheckNoError(err);
+
+ // start a new database from the checkpoint
+ rocksdb_close(db);
+ rocksdb_options_set_error_if_exists(options, 0);
+ db = rocksdb_open(options, dbcheckpointname, &err);
+ CheckNoError(err);
+
+ CheckGet(db, roptions, "foo", "hello");
+
+ rocksdb_checkpoint_object_destroy(checkpoint);
+
+ rocksdb_close(db);
+ rocksdb_destroy_db(options, dbcheckpointname, &err);
+ CheckNoError(err);
+
+ db = rocksdb_open(options, dbname, &err);
+ CheckNoError(err);
+ rocksdb_options_set_error_if_exists(options, 1);
+ }
+
+ StartPhase("compactall");
+ rocksdb_compact_range(db, NULL, 0, NULL, 0);
+ CheckGet(db, roptions, "foo", "hello");
+
+ StartPhase("compactrange");
+ rocksdb_compact_range(db, "a", 1, "z", 1);
+ CheckGet(db, roptions, "foo", "hello");
+
+ StartPhase("compactallopt");
+ rocksdb_compact_range_opt(db, coptions, NULL, 0, NULL, 0);
+ CheckGet(db, roptions, "foo", "hello");
+
+ StartPhase("compactrangeopt");
+ rocksdb_compact_range_opt(db, coptions, "a", 1, "z", 1);
+ CheckGet(db, roptions, "foo", "hello");
+
+ // Simple check cache usage
+ StartPhase("cache_usage");
+ {
+ rocksdb_readoptions_set_pin_data(roptions, 1);
+ rocksdb_iterator_t* iter = rocksdb_create_iterator(db, roptions);
+ rocksdb_iter_seek(iter, "foo", 3);
+
+ size_t usage = rocksdb_cache_get_usage(cache);
+ CheckCondition(usage > 0);
+
+ size_t pin_usage = rocksdb_cache_get_pinned_usage(cache);
+ CheckCondition(pin_usage > 0);
+
+ rocksdb_iter_next(iter);
+ rocksdb_iter_destroy(iter);
+ rocksdb_readoptions_set_pin_data(roptions, 0);
+ }
+
+ StartPhase("addfile");
+ {
+ rocksdb_envoptions_t* env_opt = rocksdb_envoptions_create();
+ rocksdb_options_t* io_options = rocksdb_options_create();
+ rocksdb_sstfilewriter_t* writer =
+ rocksdb_sstfilewriter_create(env_opt, io_options);
+
+ remove(sstfilename);
+ rocksdb_sstfilewriter_open(writer, sstfilename, &err);
+ CheckNoError(err);
+ rocksdb_sstfilewriter_put(writer, "sstk1", 5, "v1", 2, &err);
+ CheckNoError(err);
+ rocksdb_sstfilewriter_put(writer, "sstk2", 5, "v2", 2, &err);
+ CheckNoError(err);
+ rocksdb_sstfilewriter_put(writer, "sstk3", 5, "v3", 2, &err);
+ CheckNoError(err);
+ rocksdb_sstfilewriter_finish(writer, &err);
+ CheckNoError(err);
+
+ rocksdb_ingestexternalfileoptions_t* ing_opt =
+ rocksdb_ingestexternalfileoptions_create();
+ const char* file_list[1] = {sstfilename};
+ rocksdb_ingest_external_file(db, file_list, 1, ing_opt, &err);
+ CheckNoError(err);
+ CheckGet(db, roptions, "sstk1", "v1");
+ CheckGet(db, roptions, "sstk2", "v2");
+ CheckGet(db, roptions, "sstk3", "v3");
+
+ remove(sstfilename);
+ rocksdb_sstfilewriter_open(writer, sstfilename, &err);
+ CheckNoError(err);
+ rocksdb_sstfilewriter_put(writer, "sstk2", 5, "v4", 2, &err);
+ CheckNoError(err);
+ rocksdb_sstfilewriter_put(writer, "sstk22", 6, "v5", 2, &err);
+ CheckNoError(err);
+ rocksdb_sstfilewriter_put(writer, "sstk3", 5, "v6", 2, &err);
+ CheckNoError(err);
+ rocksdb_sstfilewriter_finish(writer, &err);
+ CheckNoError(err);
+
+ rocksdb_ingest_external_file(db, file_list, 1, ing_opt, &err);
+ CheckNoError(err);
+ CheckGet(db, roptions, "sstk1", "v1");
+ CheckGet(db, roptions, "sstk2", "v4");
+ CheckGet(db, roptions, "sstk22", "v5");
+ CheckGet(db, roptions, "sstk3", "v6");
+
+ rocksdb_sstfilewriter_open(writer, sstfilename, &err);
+ CheckNoError(err);
+ rocksdb_sstfilewriter_put(writer, "abc1", 4, "v7", 2, &err);
+ CheckNoError(err);
+ rocksdb_sstfilewriter_put(writer, "abc2", 4, "v8", 2, &err);
+ CheckNoError(err);
+ rocksdb_sstfilewriter_put(writer, "abc3", 4, "v9", 2, &err);
+ CheckNoError(err);
+ rocksdb_sstfilewriter_put(writer, "abc4", 4, "v10", 3, &err);
+ CheckNoError(err);
+ rocksdb_sstfilewriter_delete_range(writer, "abc1", 4, "abc4", 4, &err);
+ CheckNoError(err);
+ rocksdb_sstfilewriter_finish(writer, &err);
+ CheckNoError(err);
+
+ rocksdb_ingestexternalfileoptions_destroy(ing_opt);
+ rocksdb_sstfilewriter_destroy(writer);
+ rocksdb_options_destroy(io_options);
+ rocksdb_envoptions_destroy(env_opt);
+
+ // Delete all keys we just ingested
+ rocksdb_delete(db, woptions, "sstk1", 5, &err);
+ CheckNoError(err);
+ rocksdb_delete(db, woptions, "sstk2", 5, &err);
+ CheckNoError(err);
+ rocksdb_delete(db, woptions, "sstk22", 6, &err);
+ CheckNoError(err);
+ rocksdb_delete(db, woptions, "sstk3", 5, &err);
+ CheckNoError(err);
+ }
+
+ StartPhase("writebatch");
+ {
+ rocksdb_writebatch_t* wb = rocksdb_writebatch_create();
+ rocksdb_writebatch_put(wb, "foo", 3, "a", 1);
+ rocksdb_writebatch_clear(wb);
+ rocksdb_writebatch_put(wb, "bar", 3, "b", 1);
+ rocksdb_writebatch_put(wb, "box", 3, "c", 1);
+ rocksdb_writebatch_delete(wb, "bar", 3);
+ rocksdb_write(db, woptions, wb, &err);
+ CheckNoError(err);
+ CheckGet(db, roptions, "foo", "hello");
+ CheckGet(db, roptions, "bar", NULL);
+ CheckGet(db, roptions, "box", "c");
+ int pos = 0;
+ rocksdb_writebatch_iterate(wb, &pos, CheckPut, CheckDel);
+ CheckCondition(pos == 3);
+ rocksdb_writebatch_clear(wb);
+ rocksdb_writebatch_put(wb, "bar", 3, "b", 1);
+ rocksdb_writebatch_put(wb, "bay", 3, "d", 1);
+ rocksdb_writebatch_delete_range(wb, "bar", 3, "bay", 3);
+ rocksdb_write(db, woptions, wb, &err);
+ CheckNoError(err);
+ CheckGet(db, roptions, "bar", NULL);
+ CheckGet(db, roptions, "bay", "d");
+ rocksdb_writebatch_clear(wb);
+ const char* start_list[1] = {"bay"};
+ const size_t start_sizes[1] = {3};
+ const char* end_list[1] = {"baz"};
+ const size_t end_sizes[1] = {3};
+ rocksdb_writebatch_delete_rangev(wb, 1, start_list, start_sizes, end_list,
+ end_sizes);
+ rocksdb_write(db, woptions, wb, &err);
+ CheckNoError(err);
+ CheckGet(db, roptions, "bay", NULL);
+ rocksdb_writebatch_destroy(wb);
+ }
+
+ StartPhase("writebatch_vectors");
+ {
+ rocksdb_writebatch_t* wb = rocksdb_writebatch_create();
+ const char* k_list[2] = {"z", "ap"};
+ const size_t k_sizes[2] = {1, 2};
+ const char* v_list[3] = {"x", "y", "z"};
+ const size_t v_sizes[3] = {1, 1, 1};
+ rocksdb_writebatch_putv(wb, 2, k_list, k_sizes, 3, v_list, v_sizes);
+ rocksdb_write(db, woptions, wb, &err);
+ CheckNoError(err);
+ CheckGet(db, roptions, "zap", "xyz");
+ rocksdb_writebatch_delete(wb, "zap", 3);
+ rocksdb_write(db, woptions, wb, &err);
+ CheckNoError(err);
+ CheckGet(db, roptions, "zap", NULL);
+ rocksdb_writebatch_destroy(wb);
+ }
+
+ StartPhase("writebatch_savepoint");
+ {
+ rocksdb_writebatch_t* wb = rocksdb_writebatch_create();
+ rocksdb_writebatch_set_save_point(wb);
+ rocksdb_writebatch_set_save_point(wb);
+ const char* k_list[2] = {"z", "ap"};
+ const size_t k_sizes[2] = {1, 2};
+ const char* v_list[3] = {"x", "y", "z"};
+ const size_t v_sizes[3] = {1, 1, 1};
+ rocksdb_writebatch_pop_save_point(wb, &err);
+ CheckNoError(err);
+ rocksdb_writebatch_putv(wb, 2, k_list, k_sizes, 3, v_list, v_sizes);
+ rocksdb_writebatch_rollback_to_save_point(wb, &err);
+ CheckNoError(err);
+ rocksdb_write(db, woptions, wb, &err);
+ CheckNoError(err);
+ CheckGet(db, roptions, "zap", NULL);
+ rocksdb_writebatch_destroy(wb);
+ }
+
+ StartPhase("writebatch_rep");
+ {
+ rocksdb_writebatch_t* wb1 = rocksdb_writebatch_create();
+ rocksdb_writebatch_put(wb1, "baz", 3, "d", 1);
+ rocksdb_writebatch_put(wb1, "quux", 4, "e", 1);
+ rocksdb_writebatch_delete(wb1, "quux", 4);
+ size_t repsize1 = 0;
+ const char* rep = rocksdb_writebatch_data(wb1, &repsize1);
+ rocksdb_writebatch_t* wb2 = rocksdb_writebatch_create_from(rep, repsize1);
+ CheckCondition(rocksdb_writebatch_count(wb1) ==
+ rocksdb_writebatch_count(wb2));
+ size_t repsize2 = 0;
+ CheckCondition(
+ memcmp(rep, rocksdb_writebatch_data(wb2, &repsize2), repsize1) == 0);
+ rocksdb_writebatch_destroy(wb1);
+ rocksdb_writebatch_destroy(wb2);
+ }
+
+ StartPhase("writebatch_wi");
+ {
+ rocksdb_writebatch_wi_t* wbi = rocksdb_writebatch_wi_create(0, 1);
+ rocksdb_writebatch_wi_put(wbi, "foo", 3, "a", 1);
+ rocksdb_writebatch_wi_clear(wbi);
+ rocksdb_writebatch_wi_put(wbi, "bar", 3, "b", 1);
+ rocksdb_writebatch_wi_put(wbi, "box", 3, "c", 1);
+ rocksdb_writebatch_wi_delete(wbi, "bar", 3);
+ int count = rocksdb_writebatch_wi_count(wbi);
+ CheckCondition(count == 3);
+ size_t size;
+ char* value;
+ value = rocksdb_writebatch_wi_get_from_batch(wbi, options, "box", 3, &size,
+ &err);
+ CheckValue(err, "c", &value, size);
+ value = rocksdb_writebatch_wi_get_from_batch(wbi, options, "bar", 3, &size,
+ &err);
+ CheckValue(err, NULL, &value, size);
+ value = rocksdb_writebatch_wi_get_from_batch_and_db(wbi, db, roptions,
+ "foo", 3, &size, &err);
+ CheckValue(err, "hello", &value, size);
+ value = rocksdb_writebatch_wi_get_from_batch_and_db(wbi, db, roptions,
+ "box", 3, &size, &err);
+ CheckValue(err, "c", &value, size);
+ rocksdb_write_writebatch_wi(db, woptions, wbi, &err);
+ CheckNoError(err);
+ CheckGet(db, roptions, "foo", "hello");
+ CheckGet(db, roptions, "bar", NULL);
+ CheckGet(db, roptions, "box", "c");
+ int pos = 0;
+ rocksdb_writebatch_wi_iterate(wbi, &pos, CheckPut, CheckDel);
+ CheckCondition(pos == 3);
+ rocksdb_writebatch_wi_clear(wbi);
+ rocksdb_writebatch_wi_destroy(wbi);
+ }
+
+ StartPhase("writebatch_wi_vectors");
+ {
+ rocksdb_writebatch_wi_t* wb = rocksdb_writebatch_wi_create(0, 1);
+ const char* k_list[2] = {"z", "ap"};
+ const size_t k_sizes[2] = {1, 2};
+ const char* v_list[3] = {"x", "y", "z"};
+ const size_t v_sizes[3] = {1, 1, 1};
+ rocksdb_writebatch_wi_putv(wb, 2, k_list, k_sizes, 3, v_list, v_sizes);
+ rocksdb_write_writebatch_wi(db, woptions, wb, &err);
+ CheckNoError(err);
+ CheckGet(db, roptions, "zap", "xyz");
+ rocksdb_writebatch_wi_delete(wb, "zap", 3);
+ rocksdb_write_writebatch_wi(db, woptions, wb, &err);
+ CheckNoError(err);
+ CheckGet(db, roptions, "zap", NULL);
+ rocksdb_writebatch_wi_destroy(wb);
+ }
+
+ StartPhase("writebatch_wi_savepoint");
+ {
+ rocksdb_writebatch_wi_t* wb = rocksdb_writebatch_wi_create(0, 1);
+ rocksdb_writebatch_wi_set_save_point(wb);
+ const char* k_list[2] = {"z", "ap"};
+ const size_t k_sizes[2] = {1, 2};
+ const char* v_list[3] = {"x", "y", "z"};
+ const size_t v_sizes[3] = {1, 1, 1};
+ rocksdb_writebatch_wi_putv(wb, 2, k_list, k_sizes, 3, v_list, v_sizes);
+ rocksdb_writebatch_wi_rollback_to_save_point(wb, &err);
+ CheckNoError(err);
+ rocksdb_write_writebatch_wi(db, woptions, wb, &err);
+ CheckNoError(err);
+ CheckGet(db, roptions, "zap", NULL);
+ rocksdb_writebatch_wi_destroy(wb);
+ }
+
+ StartPhase("iter");
+ {
+ rocksdb_iterator_t* iter = rocksdb_create_iterator(db, roptions);
+ CheckCondition(!rocksdb_iter_valid(iter));
+ rocksdb_iter_seek_to_first(iter);
+ CheckCondition(rocksdb_iter_valid(iter));
+ CheckIter(iter, "box", "c");
+ rocksdb_iter_next(iter);
+ CheckIter(iter, "foo", "hello");
+ rocksdb_iter_prev(iter);
+ CheckIter(iter, "box", "c");
+ rocksdb_iter_prev(iter);
+ CheckCondition(!rocksdb_iter_valid(iter));
+ rocksdb_iter_seek_to_last(iter);
+ CheckIter(iter, "foo", "hello");
+ rocksdb_iter_seek(iter, "b", 1);
+ CheckIter(iter, "box", "c");
+ rocksdb_iter_seek_for_prev(iter, "g", 1);
+ CheckIter(iter, "foo", "hello");
+ rocksdb_iter_seek_for_prev(iter, "box", 3);
+ CheckIter(iter, "box", "c");
+ rocksdb_iter_get_error(iter, &err);
+ CheckNoError(err);
+ rocksdb_iter_destroy(iter);
+ }
+
+ StartPhase("wbwi_iter");
+ {
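+ // create_iterator_with_base() merges the unwritten batch with the DB:
+ // "bar" (put in the batch) becomes visible, "foo" (deleted in the
+ // batch) is hidden, and "box" still comes from the base iterator.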
+ rocksdb_iterator_t* base_iter = rocksdb_create_iterator(db, roptions);
+ rocksdb_writebatch_wi_t* wbi = rocksdb_writebatch_wi_create(0, 1);
+ rocksdb_writebatch_wi_put(wbi, "bar", 3, "b", 1);
+ rocksdb_writebatch_wi_delete(wbi, "foo", 3);
+ rocksdb_iterator_t* iter =
+ rocksdb_writebatch_wi_create_iterator_with_base(wbi, base_iter);
+ CheckCondition(!rocksdb_iter_valid(iter));
+ rocksdb_iter_seek_to_first(iter);
+ CheckCondition(rocksdb_iter_valid(iter));
+ CheckIter(iter, "bar", "b");
+ rocksdb_iter_next(iter);
+ CheckIter(iter, "box", "c");
+ rocksdb_iter_prev(iter);
+ CheckIter(iter, "bar", "b");
+ rocksdb_iter_prev(iter);
+ CheckCondition(!rocksdb_iter_valid(iter));
+ rocksdb_iter_seek_to_last(iter);
+ CheckIter(iter, "box", "c");
+ rocksdb_iter_seek(iter, "b", 1);
+ CheckIter(iter, "bar", "b");
+ rocksdb_iter_seek_for_prev(iter, "c", 1);
+ CheckIter(iter, "box", "c");
+ rocksdb_iter_seek_for_prev(iter, "box", 3);
+ CheckIter(iter, "box", "c");
+ rocksdb_iter_get_error(iter, &err);
+ CheckNoError(err);
+ rocksdb_iter_destroy(iter);
+ rocksdb_writebatch_wi_destroy(wbi);
+ }
+
+ StartPhase("multiget");
+ {
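+ // A single call looks up several keys; missing keys come back as NULL
+ // values rather than as errors.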
+ const char* keys[3] = {"box", "foo", "notfound"};
+ const size_t keys_sizes[3] = {3, 3, 8};
+ char* vals[3];
+ size_t vals_sizes[3];
+ char* errs[3];
+ const char* expected[3] = {"c", "hello", NULL};
+ rocksdb_multi_get(db, roptions, 3, keys, keys_sizes, vals, vals_sizes,
+ errs);
+ CheckMultiGetValues(3, vals, vals_sizes, errs, expected);
+ }
+
+ StartPhase("pin_get");
+ {
+ CheckPinGet(db, roptions, "box", "c");
+ CheckPinGet(db, roptions, "foo", "hello");
+ CheckPinGet(db, roptions, "notfound", NULL);
+ }
+
+ StartPhase("approximate_sizes");
+ {
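+ // Load 20000 keys, then ask for the approximate storage size of two
+ // key ranges; both estimates should be non-zero.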
+ int i;
+ int n = 20000;
+ char keybuf[100];
+ char valbuf[100];
+ uint64_t sizes[2];
+ const char* start[2] = {"a", "k00000000000000010000"};
+ size_t start_len[2] = {1, 21};
+ const char* limit[2] = {"k00000000000000010000", "z"};
+ size_t limit_len[2] = {21, 1};
+ rocksdb_writeoptions_set_sync(woptions, 0);
+ for (i = 0; i < n; i++) {
+ snprintf(keybuf, sizeof(keybuf), "k%020d", i);
+ snprintf(valbuf, sizeof(valbuf), "v%020d", i);
+ rocksdb_put(db, woptions, keybuf, strlen(keybuf), valbuf, strlen(valbuf),
+ &err);
+ CheckNoError(err);
+ }
+ rocksdb_approximate_sizes(db, 2, start, start_len, limit, limit_len, sizes,
+ &err);
+ CheckNoError(err);
+ CheckCondition(sizes[0] > 0);
+ CheckCondition(sizes[1] > 0);
+ }
+
+ StartPhase("property");
+ {
+ char* prop = rocksdb_property_value(db, "nosuchprop");
+ CheckCondition(prop == NULL);
+ prop = rocksdb_property_value(db, "rocksdb.stats");
+ CheckCondition(prop != NULL);
+ Free(&prop);
+ }
+
+ StartPhase("snapshot");
+ {
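+ // Reads that use the snapshot still see "foo" after it has been
+ // deleted from the live DB; without the snapshot the key is gone.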
+ const rocksdb_snapshot_t* snap;
+ snap = rocksdb_create_snapshot(db);
+ rocksdb_delete(db, woptions, "foo", 3, &err);
+ CheckNoError(err);
+ rocksdb_readoptions_set_snapshot(roptions, snap);
+ CheckGet(db, roptions, "foo", "hello");
+ rocksdb_readoptions_set_snapshot(roptions, NULL);
+ CheckGet(db, roptions, "foo", NULL);
+ rocksdb_release_snapshot(db, snap);
+ }
+ StartPhase("snapshot_with_memtable_inplace_update");
+ {
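+ // Reopen with inplace_update_support enabled; updates then overwrite
+ // the memtable entry in place, which defeats snapshot isolation for
+ // that key (verified below).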
+ rocksdb_close(db);
+ const rocksdb_snapshot_t* snap = NULL;
+ const char* s_key = "foo_snap";
+ const char* value1 = "hello_s1";
+ const char* value2 = "hello_s2";
+ rocksdb_options_set_allow_concurrent_memtable_write(options, 0);
+ rocksdb_options_set_inplace_update_support(options, 1);
+ rocksdb_options_set_error_if_exists(options, 0);
+ db = rocksdb_open(options, dbname, &err);
+ CheckNoError(err);
+ rocksdb_put(db, woptions, s_key, 8, value1, 8, &err);
+ snap = rocksdb_create_snapshot(db);
+ assert(snap != NULL);
+ rocksdb_put(db, woptions, s_key, 8, value2, 8, &err);
+ CheckNoError(err);
+ rocksdb_readoptions_set_snapshot(roptions, snap);
+ CheckGet(db, roptions, "foo", NULL);
+ // Because inplace update support is enabled, the snapshot cannot
+ // preserve the old value: the read below sees value2, not value1.
+ CheckGet(db, roptions, s_key, value2);
+ // restore the data and options
+ rocksdb_delete(db, woptions, s_key, 8, &err);
+ CheckGet(db, roptions, s_key, NULL);
+ rocksdb_release_snapshot(db, snap);
+ rocksdb_readoptions_set_snapshot(roptions, NULL);
+ rocksdb_options_set_inplace_update_support(options, 0);
+ rocksdb_options_set_allow_concurrent_memtable_write(options, 1);
+ rocksdb_options_set_error_if_exists(options, 1);
+ }
+ StartPhase("repair");
+ {
+ // If we did not compact here, the lazy deletion of files
+ // (https://reviews.facebook.net/D6123) would leave obsolete files
+ // around, and the repair process would find them and put them back
+ // into the database.
+ rocksdb_compact_range(db, NULL, 0, NULL, 0);
+ rocksdb_close(db);
+ rocksdb_options_set_create_if_missing(options, 0);
+ rocksdb_options_set_error_if_exists(options, 0);
+ rocksdb_options_set_wal_recovery_mode(options, 2);
+ rocksdb_repair_db(options, dbname, &err);
+ CheckNoError(err);
+ db = rocksdb_open(options, dbname, &err);
+ CheckNoError(err);
+ CheckGet(db, roptions, "foo", NULL);
+ CheckGet(db, roptions, "bar", NULL);
+ CheckGet(db, roptions, "box", "c");
+ rocksdb_options_set_create_if_missing(options, 1);
+ rocksdb_options_set_error_if_exists(options, 1);
+ }
+
+ StartPhase("filter");
+ for (run = 1; run <= 4; run++) {
+ // run=0 uses custom filter (not currently supported)
+ // run=1 uses old block-based bloom filter
+ // run=2 uses full bloom filter
+ // run=3 uses Ribbon
+ // run=4 uses Ribbon-Bloom hybrid configuration
+ CheckNoError(err);
+ rocksdb_filterpolicy_t* policy;
+ if (run == 1) {
+ policy = rocksdb_filterpolicy_create_bloom(8.0);
+ } else if (run == 2) {
+ policy = rocksdb_filterpolicy_create_bloom_full(8.0);
+ } else if (run == 3) {
+ policy = rocksdb_filterpolicy_create_ribbon(8.0);
+ } else {
+ policy = rocksdb_filterpolicy_create_ribbon_hybrid(8.0, 1);
+ }
+ rocksdb_block_based_options_set_filter_policy(table_options, policy);
+
+ // Create new database
+ rocksdb_close(db);
+ rocksdb_destroy_db(options, dbname, &err);
+ rocksdb_options_set_block_based_table_factory(options, table_options);
+ db = rocksdb_open(options, dbname, &err);
+ CheckNoError(err);
+ rocksdb_put(db, woptions, "foo", 3, "foovalue", 8, &err);
+ CheckNoError(err);
+ rocksdb_put(db, woptions, "bar", 3, "barvalue", 8, &err);
+ CheckNoError(err);
+
+ {
+ // Add enough keys to get just one reasonably populated Bloom filter
+ const int keys_to_add = 1500;
+ int i;
+ char keybuf[100];
+ for (i = 0; i < keys_to_add; i++) {
+ snprintf(keybuf, sizeof(keybuf), "yes%020d", i);
+ rocksdb_put(db, woptions, keybuf, strlen(keybuf), "val", 3, &err);
+ CheckNoError(err);
+ }
+ }
+ rocksdb_compact_range(db, NULL, 0, NULL, 0);
+
+ CheckGet(db, roptions, "foo", "foovalue");
+ CheckGet(db, roptions, "bar", "barvalue");
+
+ {
+ // Query keys that were never added; the number of these lookups that
+ // the filter lets through (measured via the perf context) fingerprints
+ // the filter implementation.
+ rocksdb_perfcontext_t* perf = rocksdb_perfcontext_create();
+ rocksdb_perfcontext_reset(perf);
+
+ const int keys_to_query = 10000;
+ int i;
+ char keybuf[100];
+ for (i = 0; i < keys_to_query; i++) {
+ snprintf(keybuf, sizeof(keybuf), "no%020d", i);
+ CheckGet(db, roptions, keybuf, NULL);
+ }
+
+ const int hits =
+ (int)rocksdb_perfcontext_metric(perf, rocksdb_bloom_sst_hit_count);
+ if (run == 0) {
+ // The fake custom filter reports a match for half of the queries
+ // and a miss for the other half.
+ CheckCondition(hits == keys_to_query / 2);
+ } else if (run == 1 || run == 2 || run == 4) {
+ // For run == 1, block-based Bloom is no longer available in public
+ // API; attempting to enable it enables full Bloom instead.
+ //
+ // Essentially a fingerprint of full Bloom schema, format_version=5
+ CheckCondition(hits == 188);
+ } else {
+ // Essentially a fingerprint of Ribbon schema
+ CheckCondition(hits == 226);
+ }
+ CheckCondition(
+ (keys_to_query - hits) ==
+ (int)rocksdb_perfcontext_metric(perf, rocksdb_bloom_sst_miss_count));
+
+ rocksdb_perfcontext_destroy(perf);
+ }
+
+ // Reset the policy
+ rocksdb_block_based_options_set_filter_policy(table_options, NULL);
+ rocksdb_options_set_block_based_table_factory(options, table_options);
+ }
+
+ StartPhase("compaction_filter");
+ {
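+ // CheckCompaction() opens a fresh DB with these options and verifies
+ // the effect of the compaction filter on the stored keys.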
+ rocksdb_options_t* options_with_filter = rocksdb_options_create();
+ rocksdb_options_set_create_if_missing(options_with_filter, 1);
+ rocksdb_compactionfilter_t* cfilter;
+ cfilter = rocksdb_compactionfilter_create(NULL, CFilterDestroy,
+ CFilterFilter, CFilterName);
+ // Create new database
+ rocksdb_close(db);
+ rocksdb_destroy_db(options_with_filter, dbname, &err);
+ rocksdb_options_set_compaction_filter(options_with_filter, cfilter);
+ db = CheckCompaction(db, options_with_filter, roptions, woptions);
+
+ rocksdb_options_set_compaction_filter(options_with_filter, NULL);
+ rocksdb_compactionfilter_destroy(cfilter);
+ rocksdb_options_destroy(options_with_filter);
+ }
+
+ StartPhase("compaction_filter_factory");
+ {
+ rocksdb_options_t* options_with_filter_factory = rocksdb_options_create();
+ rocksdb_options_set_create_if_missing(options_with_filter_factory, 1);
+ rocksdb_compactionfilterfactory_t* factory;
+ factory = rocksdb_compactionfilterfactory_create(
+ NULL, CFilterFactoryDestroy, CFilterCreate, CFilterFactoryName);
+ // Create new database
+ rocksdb_close(db);
+ rocksdb_destroy_db(options_with_filter_factory, dbname, &err);
+ rocksdb_options_set_compaction_filter_factory(options_with_filter_factory,
+ factory);
+ db = CheckCompaction(db, options_with_filter_factory, roptions, woptions);
+
+ rocksdb_options_set_compaction_filter_factory(options_with_filter_factory,
+ NULL);
+ rocksdb_options_destroy(options_with_filter_factory);
+ }
+
+ StartPhase("merge_operator");
+ {
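+ // The test merge operator always produces "fake" here, both when
+ // merging onto an existing value and when merging onto a missing key.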
+ rocksdb_mergeoperator_t* merge_operator;
+ merge_operator = rocksdb_mergeoperator_create(
+ NULL, MergeOperatorDestroy, MergeOperatorFullMerge,
+ MergeOperatorPartialMerge, NULL, MergeOperatorName);
+ // Create new database
+ rocksdb_close(db);
+ rocksdb_destroy_db(options, dbname, &err);
+ rocksdb_options_set_merge_operator(options, merge_operator);
+ db = rocksdb_open(options, dbname, &err);
+ CheckNoError(err);
+ rocksdb_put(db, woptions, "foo", 3, "foovalue", 8, &err);
+ CheckNoError(err);
+ CheckGet(db, roptions, "foo", "foovalue");
+ rocksdb_merge(db, woptions, "foo", 3, "barvalue", 8, &err);
+ CheckNoError(err);
+ CheckGet(db, roptions, "foo", "fake");
+
+ // Merge of a non-existing value
+ rocksdb_merge(db, woptions, "bar", 3, "barvalue", 8, &err);
+ CheckNoError(err);
+ CheckGet(db, roptions, "bar", "fake");
+ }
+
+ StartPhase("columnfamilies");
+ {
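+ // Cover the column family lifecycle: create/list/drop "cf1", per-CF
+ // puts, deletes, batches, multi-get and iterators, plus option
+ // round-trips through LoadAndCheckLatestOptions().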
+ rocksdb_close(db);
+ rocksdb_destroy_db(options, dbname, &err);
+ CheckNoError(err);
+
+ rocksdb_options_t* db_options = rocksdb_options_create();
+ rocksdb_options_set_create_if_missing(db_options, 1);
+ db = rocksdb_open(db_options, dbname, &err);
+ CheckNoError(err);
+ rocksdb_close(db);
+ {
+ const char* expected_cf_names[1] = {"default"};
+ LoadAndCheckLatestOptions(dbname, env, false, cache, NULL, 1,
+ expected_cf_names, NULL);
+ }
+
+ rocksdb_options_set_create_if_missing(db_options, 0);
+ db = rocksdb_open(db_options, dbname, &err);
+ rocksdb_column_family_handle_t* cfh;
+ cfh = rocksdb_create_column_family(db, db_options, "cf1", &err);
+ rocksdb_column_family_handle_destroy(cfh);
+ CheckNoError(err);
+ rocksdb_close(db);
+
+ size_t cflen;
+ char** column_fams =
+ rocksdb_list_column_families(db_options, dbname, &cflen, &err);
+ CheckNoError(err);
+ CheckEqual("default", column_fams[0], 7);
+ CheckEqual("cf1", column_fams[1], 3);
+ CheckCondition(cflen == 2);
+ rocksdb_list_column_families_destroy(column_fams, cflen);
+
+ rocksdb_options_t* cf_options = rocksdb_options_create();
+
+ const char* cf_names[2] = {"default", "cf1"};
+ const rocksdb_options_t* cf_opts[2] = {cf_options, cf_options};
+ rocksdb_column_family_handle_t* handles[2];
+
+ LoadAndCheckLatestOptions(dbname, env, false, cache, NULL, 2, cf_names,
+ NULL);
+
+ db = rocksdb_open_column_families(db_options, dbname, 2, cf_names, cf_opts,
+ handles, &err);
+ CheckNoError(err);
+
+ rocksdb_put_cf(db, woptions, handles[1], "foo", 3, "hello", 5, &err);
+ CheckNoError(err);
+
+ rocksdb_put_cf(db, woptions, handles[1], "foobar1", 7, "hello1", 6, &err);
+ CheckNoError(err);
+ rocksdb_put_cf(db, woptions, handles[1], "foobar2", 7, "hello2", 6, &err);
+ CheckNoError(err);
+ rocksdb_put_cf(db, woptions, handles[1], "foobar3", 7, "hello3", 6, &err);
+ CheckNoError(err);
+ rocksdb_put_cf(db, woptions, handles[1], "foobar4", 7, "hello4", 6, &err);
+ CheckNoError(err);
+ rocksdb_suggest_compact_range_cf(db, handles[1], "foo", 3, "foobar9", 7,
+ &err);
+ CheckNoError(err);
+
+ rocksdb_flushoptions_t* flush_options = rocksdb_flushoptions_create();
+ rocksdb_flushoptions_set_wait(flush_options, 1);
+ rocksdb_flush_cf(db, flush_options, handles[1], &err);
+ CheckNoError(err);
+ rocksdb_flushoptions_destroy(flush_options);
+
+ CheckGetCF(db, roptions, handles[1], "foo", "hello");
+ CheckPinGetCF(db, roptions, handles[1], "foo", "hello");
+
+ rocksdb_delete_cf(db, woptions, handles[1], "foo", 3, &err);
+ CheckNoError(err);
+
+ rocksdb_delete_range_cf(db, woptions, handles[1], "foobar2", 7, "foobar4",
+ 7, &err);
+ CheckNoError(err);
+
+ CheckGetCF(db, roptions, handles[1], "foo", NULL);
+ CheckPinGetCF(db, roptions, handles[1], "foo", NULL);
+
+ rocksdb_writebatch_t* wb = rocksdb_writebatch_create();
+ rocksdb_writebatch_put_cf(wb, handles[1], "baz", 3, "a", 1);
+ rocksdb_writebatch_clear(wb);
+ rocksdb_writebatch_put_cf(wb, handles[1], "bar", 3, "b", 1);
+ rocksdb_writebatch_put_cf(wb, handles[1], "box", 3, "c", 1);
+ rocksdb_writebatch_put_cf(wb, handles[1], "buff", 4, "rocksdb", 7);
+ rocksdb_writebatch_delete_cf(wb, handles[1], "bar", 3);
+ rocksdb_write(db, woptions, wb, &err);
+ CheckNoError(err);
+ CheckGetCF(db, roptions, handles[1], "baz", NULL);
+ CheckGetCF(db, roptions, handles[1], "bar", NULL);
+ CheckGetCF(db, roptions, handles[1], "box", "c");
+ CheckGetCF(db, roptions, handles[1], "buff", "rocksdb");
+ CheckPinGetCF(db, roptions, handles[1], "baz", NULL);
+ CheckPinGetCF(db, roptions, handles[1], "bar", NULL);
+ CheckPinGetCF(db, roptions, handles[1], "box", "c");
+ CheckPinGetCF(db, roptions, handles[1], "buff", "rocksdb");
+ rocksdb_writebatch_destroy(wb);
+
+ rocksdb_flush_wal(db, 1, &err);
+ CheckNoError(err);
+
+ const char* keys[3] = {"box", "box", "barfooxx"};
+ const rocksdb_column_family_handle_t* get_handles[3] = {
+ handles[0], handles[1], handles[1]};
+ const size_t keys_sizes[3] = {3, 3, 8};
+ char* vals[3];
+ size_t vals_sizes[3];
+ char* errs[3];
+ rocksdb_multi_get_cf(db, roptions, get_handles, 3, keys, keys_sizes, vals,
+ vals_sizes, errs);
+
+ int i;
+ for (i = 0; i < 3; i++) {
+ CheckEqual(NULL, errs[i], 0);
+ switch (i) {
+ case 0:
+ CheckEqual(NULL, vals[i], vals_sizes[i]); // wrong cf
+ break;
+ case 1:
+ CheckEqual("c", vals[i], vals_sizes[i]); // bingo
+ break;
+ case 2:
+ CheckEqual(NULL, vals[i], vals_sizes[i]); // normal not found
+ break;
+ }
+ Free(&vals[i]);
+ }
+
+ {
+ const char* batched_keys[4] = {"box", "buff", "barfooxx", "box"};
+ const size_t batched_keys_sizes[4] = {3, 4, 8, 3};
+ const char* expected_value[4] = {"c", "rocksdb", NULL, "c"};
+ char* batched_errs[4];
+
+ rocksdb_pinnableslice_t* pvals[4];
+ rocksdb_batched_multi_get_cf(db, roptions, handles[1], 4, batched_keys,
+ batched_keys_sizes, pvals, batched_errs,
+ false);
+ const char* val;
+ size_t val_len;
+ for (i = 0; i < 4; ++i) {
+ val = rocksdb_pinnableslice_value(pvals[i], &val_len);
+ CheckNoError(batched_errs[i]);
+ CheckEqual(expected_value[i], val, val_len);
+ rocksdb_pinnableslice_destroy(pvals[i]);
+ }
+ }
+
+ {
+ unsigned char value_found = 0;
+
+ CheckCondition(!rocksdb_key_may_exist(db, roptions, "invalid_key", 11,
+ NULL, NULL, NULL, 0, NULL));
+ CheckCondition(!rocksdb_key_may_exist(db, roptions, "invalid_key", 11,
+ &vals[0], &vals_sizes[0], NULL, 0,
+ &value_found));
+ if (value_found) {
+ Free(&vals[0]);
+ }
+
+ CheckCondition(!rocksdb_key_may_exist_cf(db, roptions, handles[1],
+ "invalid_key", 11, NULL, NULL,
+ NULL, 0, NULL));
+ CheckCondition(!rocksdb_key_may_exist_cf(db, roptions, handles[1],
+ "invalid_key", 11, &vals[0],
+ &vals_sizes[0], NULL, 0, NULL));
+ if (value_found) {
+ Free(&vals[0]);
+ }
+ }
+
+ rocksdb_iterator_t* iter =
+ rocksdb_create_iterator_cf(db, roptions, handles[1]);
+ CheckCondition(!rocksdb_iter_valid(iter));
+ rocksdb_iter_seek_to_first(iter);
+ CheckCondition(rocksdb_iter_valid(iter));
+
+ for (i = 0; rocksdb_iter_valid(iter) != 0; rocksdb_iter_next(iter)) {
+ i++;
+ }
+ CheckCondition(i == 4);
+ rocksdb_iter_get_error(iter, &err);
+ CheckNoError(err);
+ rocksdb_iter_destroy(iter);
+
+ rocksdb_column_family_handle_t* iters_cf_handles[2] = {handles[0],
+ handles[1]};
+ rocksdb_iterator_t* iters_handles[2];
+ rocksdb_create_iterators(db, roptions, iters_cf_handles, iters_handles, 2,
+ &err);
+ CheckNoError(err);
+
+ iter = iters_handles[0];
+ CheckCondition(!rocksdb_iter_valid(iter));
+ rocksdb_iter_seek_to_first(iter);
+ CheckCondition(!rocksdb_iter_valid(iter));
+ rocksdb_iter_destroy(iter);
+
+ iter = iters_handles[1];
+ CheckCondition(!rocksdb_iter_valid(iter));
+ rocksdb_iter_seek_to_first(iter);
+ CheckCondition(rocksdb_iter_valid(iter));
+
+ for (i = 0; rocksdb_iter_valid(iter) != 0; rocksdb_iter_next(iter)) {
+ i++;
+ }
+ CheckCondition(i == 4);
+ rocksdb_iter_get_error(iter, &err);
+ CheckNoError(err);
+ rocksdb_iter_destroy(iter);
+
+ GetAndCheckMetaDataCf(db, handles[1], cf_names[1]);
+
+ rocksdb_drop_column_family(db, handles[1], &err);
+ CheckNoError(err);
+ for (i = 0; i < 2; i++) {
+ rocksdb_column_family_handle_destroy(handles[i]);
+ }
+ rocksdb_close(db);
+ {
+ // As the column family has been dropped, we expect only one column
+ // family to remain.
+ const char* expected_cf_names[1] = {"default"};
+ LoadAndCheckLatestOptions(dbname, env, false, cache, NULL, 1,
+ expected_cf_names, NULL);
+ }
+ rocksdb_destroy_db(options, dbname, &err);
+ rocksdb_options_destroy(db_options);
+ rocksdb_options_destroy(cf_options);
+ }
+
+ StartPhase("prefix");
+ {
+ // Create new database
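+ // configured with a fixed 3-byte prefix extractor, a hash-skiplist
+ // memtable and the plain table format; prefix seeks then only iterate
+ // keys sharing the seek prefix unless total_order_seek is enabled.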
+ rocksdb_options_set_allow_mmap_reads(options, 1);
+ rocksdb_options_set_prefix_extractor(
+ options, rocksdb_slicetransform_create_fixed_prefix(3));
+ rocksdb_options_set_hash_skip_list_rep(options, 5000, 4, 4);
+ rocksdb_options_set_plain_table_factory(options, 4, 10, 0.75, 16);
+ rocksdb_options_set_allow_concurrent_memtable_write(options, 0);
+
+ db = rocksdb_open(options, dbname, &err);
+ CheckNoError(err);
+
+ rocksdb_put(db, woptions, "foo1", 4, "foo", 3, &err);
+ CheckNoError(err);
+ rocksdb_put(db, woptions, "foo2", 4, "foo", 3, &err);
+ CheckNoError(err);
+ rocksdb_put(db, woptions, "foo3", 4, "foo", 3, &err);
+ CheckNoError(err);
+ rocksdb_put(db, woptions, "bar1", 4, "bar", 3, &err);
+ CheckNoError(err);
+ rocksdb_put(db, woptions, "bar2", 4, "bar", 3, &err);
+ CheckNoError(err);
+ rocksdb_put(db, woptions, "bar3", 4, "bar", 3, &err);
+ CheckNoError(err);
+
+ rocksdb_iterator_t* iter = rocksdb_create_iterator(db, roptions);
+ CheckCondition(!rocksdb_iter_valid(iter));
+
+ rocksdb_iter_seek(iter, "bar", 3);
+ rocksdb_iter_get_error(iter, &err);
+ CheckNoError(err);
+ CheckCondition(rocksdb_iter_valid(iter));
+
+ CheckIter(iter, "bar1", "bar");
+ rocksdb_iter_next(iter);
+ CheckIter(iter, "bar2", "bar");
+ rocksdb_iter_next(iter);
+ CheckIter(iter, "bar3", "bar");
+ rocksdb_iter_get_error(iter, &err);
+ CheckNoError(err);
+ rocksdb_iter_destroy(iter);
+
+ rocksdb_readoptions_set_total_order_seek(roptions, 1);
+ iter = rocksdb_create_iterator(db, roptions);
+ CheckCondition(!rocksdb_iter_valid(iter));
+
+ rocksdb_iter_seek(iter, "ba", 2);
+ rocksdb_iter_get_error(iter, &err);
+ CheckNoError(err);
+ CheckCondition(rocksdb_iter_valid(iter));
+ CheckIter(iter, "bar1", "bar");
+
+ rocksdb_iter_destroy(iter);
+ rocksdb_readoptions_set_total_order_seek(roptions, 0);
+
+ rocksdb_close(db);
+
+ {
+ const char* expected_cf_names[1] = {"default"};
+ LoadAndCheckLatestOptions(dbname, env, false, cache, NULL, 1,
+ expected_cf_names,
+ "Invalid argument: leveldb.BytewiseComparator: "
+ "does not match existing comparator foo");
+ LoadAndCheckLatestOptions(dbname, env, false, cache, cmp, 1,
+ expected_cf_names, NULL);
+ }
+ rocksdb_destroy_db(options, dbname, &err);
+ }
+
+ // Check memory usage stats
+ StartPhase("approximate_memory_usage");
+ {
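+ // Register the DB and the block cache as memory consumers and check
+ // that the memtable usage estimates do not shrink after a write.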
+ // Create database
+ db = rocksdb_open(options, dbname, &err);
+ CheckNoError(err);
+
+ rocksdb_memory_consumers_t* consumers;
+ consumers = rocksdb_memory_consumers_create();
+ rocksdb_memory_consumers_add_db(consumers, db);
+ rocksdb_memory_consumers_add_cache(consumers, cache);
+
+ // take memory usage report before write-read operation
+ rocksdb_memory_usage_t* mu1;
+ mu1 = rocksdb_approximate_memory_usage_create(consumers, &err);
+ CheckNoError(err);
+
+ // Put data (this should affect memtables)
+ rocksdb_put(db, woptions, "memory", 6, "test", 4, &err);
+ CheckNoError(err);
+ CheckGet(db, roptions, "memory", "test");
+
+ // take memory usage report after write-read operation
+ rocksdb_memory_usage_t* mu2;
+ mu2 = rocksdb_approximate_memory_usage_create(consumers, &err);
+ CheckNoError(err);
+
+ // amount of memory used within memtables should grow
+ CheckCondition(rocksdb_approximate_memory_usage_get_mem_table_total(mu2) >=
+ rocksdb_approximate_memory_usage_get_mem_table_total(mu1));
+ CheckCondition(
+ rocksdb_approximate_memory_usage_get_mem_table_unflushed(mu2) >=
+ rocksdb_approximate_memory_usage_get_mem_table_unflushed(mu1));
+
+ rocksdb_memory_consumers_destroy(consumers);
+ rocksdb_approximate_memory_usage_destroy(mu1);
+ rocksdb_approximate_memory_usage_destroy(mu2);
+ rocksdb_close(db);
+ rocksdb_destroy_db(options, dbname, &err);
+ CheckNoError(err);
+ }
+
+ StartPhase("cuckoo_options");
+ {
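+ // Only exercises the cuckoo table option setters and opening the DB
+ // with that table factory; no data is written in this phase.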
+ rocksdb_cuckoo_table_options_t* cuckoo_options;
+ cuckoo_options = rocksdb_cuckoo_options_create();
+ rocksdb_cuckoo_options_set_hash_ratio(cuckoo_options, 0.5);
+ rocksdb_cuckoo_options_set_max_search_depth(cuckoo_options, 200);
+ rocksdb_cuckoo_options_set_cuckoo_block_size(cuckoo_options, 10);
+ rocksdb_cuckoo_options_set_identity_as_first_hash(cuckoo_options, 1);
+ rocksdb_cuckoo_options_set_use_module_hash(cuckoo_options, 0);
+ rocksdb_options_set_cuckoo_table_factory(options, cuckoo_options);
+
+ db = rocksdb_open(options, dbname, &err);
+ CheckNoError(err);
+
+ rocksdb_cuckoo_options_destroy(cuckoo_options);
+ }
+
+ StartPhase("options");
+ {
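+ // Every setter below is paired with its getter; the same values are
+ // then re-checked on a copy made with rocksdb_options_create_copy(),
+ // and the copy is mutated to show it is independent of the original.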
+ rocksdb_options_t* o;
+ o = rocksdb_options_create();
+
+ // Set and check options.
+ rocksdb_options_set_allow_ingest_behind(o, 1);
+ CheckCondition(1 == rocksdb_options_get_allow_ingest_behind(o));
+
+ rocksdb_options_compaction_readahead_size(o, 10);
+ CheckCondition(10 == rocksdb_options_get_compaction_readahead_size(o));
+
+ rocksdb_options_set_create_if_missing(o, 1);
+ CheckCondition(1 == rocksdb_options_get_create_if_missing(o));
+
+ rocksdb_options_set_create_missing_column_families(o, 1);
+ CheckCondition(1 == rocksdb_options_get_create_missing_column_families(o));
+
+ rocksdb_options_set_error_if_exists(o, 1);
+ CheckCondition(1 == rocksdb_options_get_error_if_exists(o));
+
+ rocksdb_options_set_paranoid_checks(o, 1);
+ CheckCondition(1 == rocksdb_options_get_paranoid_checks(o));
+
+ rocksdb_options_set_info_log_level(o, 3);
+ CheckCondition(3 == rocksdb_options_get_info_log_level(o));
+
+ rocksdb_options_set_write_buffer_size(o, 100);
+ CheckCondition(100 == rocksdb_options_get_write_buffer_size(o));
+
+ rocksdb_options_set_db_write_buffer_size(o, 1000);
+ CheckCondition(1000 == rocksdb_options_get_db_write_buffer_size(o));
+
+ rocksdb_options_set_max_open_files(o, 21);
+ CheckCondition(21 == rocksdb_options_get_max_open_files(o));
+
+ rocksdb_options_set_max_file_opening_threads(o, 5);
+ CheckCondition(5 == rocksdb_options_get_max_file_opening_threads(o));
+
+ rocksdb_options_set_max_total_wal_size(o, 400);
+ CheckCondition(400 == rocksdb_options_get_max_total_wal_size(o));
+
+ rocksdb_options_set_num_levels(o, 7);
+ CheckCondition(7 == rocksdb_options_get_num_levels(o));
+
+ rocksdb_options_set_level0_file_num_compaction_trigger(o, 4);
+ CheckCondition(4 ==
+ rocksdb_options_get_level0_file_num_compaction_trigger(o));
+
+ rocksdb_options_set_level0_slowdown_writes_trigger(o, 6);
+ CheckCondition(6 == rocksdb_options_get_level0_slowdown_writes_trigger(o));
+
+ rocksdb_options_set_level0_stop_writes_trigger(o, 8);
+ CheckCondition(8 == rocksdb_options_get_level0_stop_writes_trigger(o));
+
+ rocksdb_options_set_target_file_size_base(o, 256);
+ CheckCondition(256 == rocksdb_options_get_target_file_size_base(o));
+
+ rocksdb_options_set_target_file_size_multiplier(o, 3);
+ CheckCondition(3 == rocksdb_options_get_target_file_size_multiplier(o));
+
+ rocksdb_options_set_max_bytes_for_level_base(o, 1024);
+ CheckCondition(1024 == rocksdb_options_get_max_bytes_for_level_base(o));
+
+ rocksdb_options_set_level_compaction_dynamic_level_bytes(o, 1);
+ CheckCondition(1 ==
+ rocksdb_options_get_level_compaction_dynamic_level_bytes(o));
+
+ rocksdb_options_set_max_bytes_for_level_multiplier(o, 2.0);
+ CheckCondition(2.0 ==
+ rocksdb_options_get_max_bytes_for_level_multiplier(o));
+
+ rocksdb_options_set_skip_stats_update_on_db_open(o, 1);
+ CheckCondition(1 == rocksdb_options_get_skip_stats_update_on_db_open(o));
+
+ rocksdb_options_set_skip_checking_sst_file_sizes_on_db_open(o, 1);
+ CheckCondition(
+ 1 == rocksdb_options_get_skip_checking_sst_file_sizes_on_db_open(o));
+
+ rocksdb_options_set_max_write_buffer_number(o, 97);
+ CheckCondition(97 == rocksdb_options_get_max_write_buffer_number(o));
+
+ rocksdb_options_set_min_write_buffer_number_to_merge(o, 23);
+ CheckCondition(23 ==
+ rocksdb_options_get_min_write_buffer_number_to_merge(o));
+
+ rocksdb_options_set_max_write_buffer_number_to_maintain(o, 64);
+ CheckCondition(64 ==
+ rocksdb_options_get_max_write_buffer_number_to_maintain(o));
+
+ rocksdb_options_set_max_write_buffer_size_to_maintain(o, 50000);
+ CheckCondition(50000 ==
+ rocksdb_options_get_max_write_buffer_size_to_maintain(o));
+
+ rocksdb_options_set_enable_pipelined_write(o, 1);
+ CheckCondition(1 == rocksdb_options_get_enable_pipelined_write(o));
+
+ rocksdb_options_set_unordered_write(o, 1);
+ CheckCondition(1 == rocksdb_options_get_unordered_write(o));
+
+ rocksdb_options_set_max_subcompactions(o, 123456);
+ CheckCondition(123456 == rocksdb_options_get_max_subcompactions(o));
+
+ rocksdb_options_set_max_background_jobs(o, 2);
+ CheckCondition(2 == rocksdb_options_get_max_background_jobs(o));
+
+ rocksdb_options_set_max_background_compactions(o, 3);
+ CheckCondition(3 == rocksdb_options_get_max_background_compactions(o));
+
+ rocksdb_options_set_max_background_flushes(o, 5);
+ CheckCondition(5 == rocksdb_options_get_max_background_flushes(o));
+
+ rocksdb_options_set_max_log_file_size(o, 6);
+ CheckCondition(6 == rocksdb_options_get_max_log_file_size(o));
+
+ rocksdb_options_set_log_file_time_to_roll(o, 7);
+ CheckCondition(7 == rocksdb_options_get_log_file_time_to_roll(o));
+
+ rocksdb_options_set_keep_log_file_num(o, 8);
+ CheckCondition(8 == rocksdb_options_get_keep_log_file_num(o));
+
+ rocksdb_options_set_recycle_log_file_num(o, 9);
+ CheckCondition(9 == rocksdb_options_get_recycle_log_file_num(o));
+
+ rocksdb_options_set_soft_pending_compaction_bytes_limit(o, 10);
+ CheckCondition(10 ==
+ rocksdb_options_get_soft_pending_compaction_bytes_limit(o));
+
+ rocksdb_options_set_hard_pending_compaction_bytes_limit(o, 11);
+ CheckCondition(11 ==
+ rocksdb_options_get_hard_pending_compaction_bytes_limit(o));
+
+ rocksdb_options_set_max_manifest_file_size(o, 12);
+ CheckCondition(12 == rocksdb_options_get_max_manifest_file_size(o));
+
+ rocksdb_options_set_table_cache_numshardbits(o, 13);
+ CheckCondition(13 == rocksdb_options_get_table_cache_numshardbits(o));
+
+ rocksdb_options_set_arena_block_size(o, 14);
+ CheckCondition(14 == rocksdb_options_get_arena_block_size(o));
+
+ rocksdb_options_set_use_fsync(o, 1);
+ CheckCondition(1 == rocksdb_options_get_use_fsync(o));
+
+ rocksdb_options_set_WAL_ttl_seconds(o, 15);
+ CheckCondition(15 == rocksdb_options_get_WAL_ttl_seconds(o));
+
+ rocksdb_options_set_WAL_size_limit_MB(o, 16);
+ CheckCondition(16 == rocksdb_options_get_WAL_size_limit_MB(o));
+
+ rocksdb_options_set_manifest_preallocation_size(o, 17);
+ CheckCondition(17 == rocksdb_options_get_manifest_preallocation_size(o));
+
+ rocksdb_options_set_allow_mmap_reads(o, 1);
+ CheckCondition(1 == rocksdb_options_get_allow_mmap_reads(o));
+
+ rocksdb_options_set_allow_mmap_writes(o, 1);
+ CheckCondition(1 == rocksdb_options_get_allow_mmap_writes(o));
+
+ rocksdb_options_set_use_direct_reads(o, 1);
+ CheckCondition(1 == rocksdb_options_get_use_direct_reads(o));
+
+ rocksdb_options_set_use_direct_io_for_flush_and_compaction(o, 1);
+ CheckCondition(
+ 1 == rocksdb_options_get_use_direct_io_for_flush_and_compaction(o));
+
+ rocksdb_options_set_is_fd_close_on_exec(o, 1);
+ CheckCondition(1 == rocksdb_options_get_is_fd_close_on_exec(o));
+
+ rocksdb_options_set_stats_dump_period_sec(o, 18);
+ CheckCondition(18 == rocksdb_options_get_stats_dump_period_sec(o));
+
+ rocksdb_options_set_stats_persist_period_sec(o, 5);
+ CheckCondition(5 == rocksdb_options_get_stats_persist_period_sec(o));
+
+ rocksdb_options_set_advise_random_on_open(o, 1);
+ CheckCondition(1 == rocksdb_options_get_advise_random_on_open(o));
+
+ rocksdb_options_set_access_hint_on_compaction_start(o, 3);
+ CheckCondition(3 == rocksdb_options_get_access_hint_on_compaction_start(o));
+
+ rocksdb_options_set_use_adaptive_mutex(o, 1);
+ CheckCondition(1 == rocksdb_options_get_use_adaptive_mutex(o));
+
+ rocksdb_options_set_bytes_per_sync(o, 19);
+ CheckCondition(19 == rocksdb_options_get_bytes_per_sync(o));
+
+ rocksdb_options_set_wal_bytes_per_sync(o, 20);
+ CheckCondition(20 == rocksdb_options_get_wal_bytes_per_sync(o));
+
+ rocksdb_options_set_writable_file_max_buffer_size(o, 21);
+ CheckCondition(21 == rocksdb_options_get_writable_file_max_buffer_size(o));
+
+ rocksdb_options_set_allow_concurrent_memtable_write(o, 1);
+ CheckCondition(1 == rocksdb_options_get_allow_concurrent_memtable_write(o));
+
+ rocksdb_options_set_enable_write_thread_adaptive_yield(o, 1);
+ CheckCondition(1 ==
+ rocksdb_options_get_enable_write_thread_adaptive_yield(o));
+
+ rocksdb_options_set_max_sequential_skip_in_iterations(o, 22);
+ CheckCondition(22 ==
+ rocksdb_options_get_max_sequential_skip_in_iterations(o));
+
+ rocksdb_options_set_disable_auto_compactions(o, 1);
+ CheckCondition(1 == rocksdb_options_get_disable_auto_compactions(o));
+
+ rocksdb_options_set_optimize_filters_for_hits(o, 1);
+ CheckCondition(1 == rocksdb_options_get_optimize_filters_for_hits(o));
+
+ rocksdb_options_set_delete_obsolete_files_period_micros(o, 23);
+ CheckCondition(23 ==
+ rocksdb_options_get_delete_obsolete_files_period_micros(o));
+
+ rocksdb_options_set_memtable_prefix_bloom_size_ratio(o, 2.0);
+ CheckCondition(2.0 ==
+ rocksdb_options_get_memtable_prefix_bloom_size_ratio(o));
+
+ rocksdb_options_set_max_compaction_bytes(o, 24);
+ CheckCondition(24 == rocksdb_options_get_max_compaction_bytes(o));
+
+ rocksdb_options_set_memtable_huge_page_size(o, 25);
+ CheckCondition(25 == rocksdb_options_get_memtable_huge_page_size(o));
+
+ rocksdb_options_set_max_successive_merges(o, 26);
+ CheckCondition(26 == rocksdb_options_get_max_successive_merges(o));
+
+ rocksdb_options_set_bloom_locality(o, 27);
+ CheckCondition(27 == rocksdb_options_get_bloom_locality(o));
+
+ rocksdb_options_set_inplace_update_support(o, 1);
+ CheckCondition(1 == rocksdb_options_get_inplace_update_support(o));
+
+ rocksdb_options_set_inplace_update_num_locks(o, 28);
+ CheckCondition(28 == rocksdb_options_get_inplace_update_num_locks(o));
+
+ rocksdb_options_set_report_bg_io_stats(o, 1);
+ CheckCondition(1 == rocksdb_options_get_report_bg_io_stats(o));
+
+ rocksdb_options_set_wal_recovery_mode(o, 2);
+ CheckCondition(2 == rocksdb_options_get_wal_recovery_mode(o));
+
+ rocksdb_options_set_compression(o, 5);
+ CheckCondition(5 == rocksdb_options_get_compression(o));
+
+ rocksdb_options_set_bottommost_compression(o, 4);
+ CheckCondition(4 == rocksdb_options_get_bottommost_compression(o));
+
+ rocksdb_options_set_compaction_style(o, 2);
+ CheckCondition(2 == rocksdb_options_get_compaction_style(o));
+
+ rocksdb_options_set_atomic_flush(o, 1);
+ CheckCondition(1 == rocksdb_options_get_atomic_flush(o));
+
+ rocksdb_options_set_manual_wal_flush(o, 1);
+ CheckCondition(1 == rocksdb_options_get_manual_wal_flush(o));
+
+ rocksdb_options_set_wal_compression(o, 1);
+ CheckCondition(1 == rocksdb_options_get_wal_compression(o));
+
+ rocksdb_options_set_experimental_mempurge_threshold(o, 29.0);
+ CheckCondition(29.0 ==
+ rocksdb_options_get_experimental_mempurge_threshold(o));
+
+ /* Blob Options */
+ rocksdb_options_set_enable_blob_files(o, 1);
+ CheckCondition(1 == rocksdb_options_get_enable_blob_files(o));
+
+ rocksdb_options_set_min_blob_size(o, 29);
+ CheckCondition(29 == rocksdb_options_get_min_blob_size(o));
+
+ rocksdb_options_set_blob_file_size(o, 30);
+ CheckCondition(30 == rocksdb_options_get_blob_file_size(o));
+
+ rocksdb_options_set_blob_compression_type(o, 4);
+ CheckCondition(4 == rocksdb_options_get_blob_compression_type(o));
+
+ rocksdb_options_set_enable_blob_gc(o, 1);
+ CheckCondition(1 == rocksdb_options_get_enable_blob_gc(o));
+
+ rocksdb_options_set_blob_gc_age_cutoff(o, 0.5);
+ CheckCondition(0.5 == rocksdb_options_get_blob_gc_age_cutoff(o));
+
+ rocksdb_options_set_blob_gc_force_threshold(o, 0.75);
+ CheckCondition(0.75 == rocksdb_options_get_blob_gc_force_threshold(o));
+
+ rocksdb_options_set_blob_compaction_readahead_size(o, 262144);
+ CheckCondition(262144 ==
+ rocksdb_options_get_blob_compaction_readahead_size(o));
+
+ rocksdb_options_set_blob_file_starting_level(o, 5);
+ CheckCondition(5 == rocksdb_options_get_blob_file_starting_level(o));
+
+ rocksdb_options_set_prepopulate_blob_cache(o, 1 /* flush only */);
+ CheckCondition(1 == rocksdb_options_get_prepopulate_blob_cache(o));
+
+ // Create a copy that should be equal to the original.
+ rocksdb_options_t* copy;
+ copy = rocksdb_options_create_copy(o);
+
+ CheckCondition(1 == rocksdb_options_get_allow_ingest_behind(copy));
+ CheckCondition(10 == rocksdb_options_get_compaction_readahead_size(copy));
+ CheckCondition(1 == rocksdb_options_get_create_if_missing(copy));
+ CheckCondition(1 ==
+ rocksdb_options_get_create_missing_column_families(copy));
+ CheckCondition(1 == rocksdb_options_get_error_if_exists(copy));
+ CheckCondition(1 == rocksdb_options_get_paranoid_checks(copy));
+ CheckCondition(3 == rocksdb_options_get_info_log_level(copy));
+ CheckCondition(100 == rocksdb_options_get_write_buffer_size(copy));
+ CheckCondition(1000 == rocksdb_options_get_db_write_buffer_size(copy));
+ CheckCondition(21 == rocksdb_options_get_max_open_files(copy));
+ CheckCondition(5 == rocksdb_options_get_max_file_opening_threads(copy));
+ CheckCondition(400 == rocksdb_options_get_max_total_wal_size(copy));
+ CheckCondition(7 == rocksdb_options_get_num_levels(copy));
+ CheckCondition(
+ 4 == rocksdb_options_get_level0_file_num_compaction_trigger(copy));
+ CheckCondition(6 ==
+ rocksdb_options_get_level0_slowdown_writes_trigger(copy));
+ CheckCondition(8 == rocksdb_options_get_level0_stop_writes_trigger(copy));
+ CheckCondition(256 == rocksdb_options_get_target_file_size_base(copy));
+ CheckCondition(3 == rocksdb_options_get_target_file_size_multiplier(copy));
+ CheckCondition(1024 == rocksdb_options_get_max_bytes_for_level_base(copy));
+ CheckCondition(
+ 1 == rocksdb_options_get_level_compaction_dynamic_level_bytes(copy));
+ CheckCondition(2.0 ==
+ rocksdb_options_get_max_bytes_for_level_multiplier(copy));
+ CheckCondition(1 == rocksdb_options_get_skip_stats_update_on_db_open(copy));
+ CheckCondition(
+ 1 == rocksdb_options_get_skip_checking_sst_file_sizes_on_db_open(copy));
+ CheckCondition(97 == rocksdb_options_get_max_write_buffer_number(copy));
+ CheckCondition(23 ==
+ rocksdb_options_get_min_write_buffer_number_to_merge(copy));
+ CheckCondition(
+ 64 == rocksdb_options_get_max_write_buffer_number_to_maintain(copy));
+ CheckCondition(50000 ==
+ rocksdb_options_get_max_write_buffer_size_to_maintain(copy));
+ CheckCondition(1 == rocksdb_options_get_enable_pipelined_write(copy));
+ CheckCondition(1 == rocksdb_options_get_unordered_write(copy));
+ CheckCondition(123456 == rocksdb_options_get_max_subcompactions(copy));
+ CheckCondition(2 == rocksdb_options_get_max_background_jobs(copy));
+ CheckCondition(3 == rocksdb_options_get_max_background_compactions(copy));
+ CheckCondition(5 == rocksdb_options_get_max_background_flushes(copy));
+ CheckCondition(6 == rocksdb_options_get_max_log_file_size(copy));
+ CheckCondition(7 == rocksdb_options_get_log_file_time_to_roll(copy));
+ CheckCondition(8 == rocksdb_options_get_keep_log_file_num(copy));
+ CheckCondition(9 == rocksdb_options_get_recycle_log_file_num(copy));
+ CheckCondition(
+ 10 == rocksdb_options_get_soft_pending_compaction_bytes_limit(copy));
+ CheckCondition(
+ 11 == rocksdb_options_get_hard_pending_compaction_bytes_limit(copy));
+ CheckCondition(12 == rocksdb_options_get_max_manifest_file_size(copy));
+ CheckCondition(13 == rocksdb_options_get_table_cache_numshardbits(copy));
+ CheckCondition(14 == rocksdb_options_get_arena_block_size(copy));
+ CheckCondition(1 == rocksdb_options_get_use_fsync(copy));
+ CheckCondition(15 == rocksdb_options_get_WAL_ttl_seconds(copy));
+ CheckCondition(16 == rocksdb_options_get_WAL_size_limit_MB(copy));
+ CheckCondition(17 == rocksdb_options_get_manifest_preallocation_size(copy));
+ CheckCondition(1 == rocksdb_options_get_allow_mmap_reads(copy));
+ CheckCondition(1 == rocksdb_options_get_allow_mmap_writes(copy));
+ CheckCondition(1 == rocksdb_options_get_use_direct_reads(copy));
+ CheckCondition(
+ 1 == rocksdb_options_get_use_direct_io_for_flush_and_compaction(copy));
+ CheckCondition(1 == rocksdb_options_get_is_fd_close_on_exec(copy));
+ CheckCondition(18 == rocksdb_options_get_stats_dump_period_sec(copy));
+ CheckCondition(5 == rocksdb_options_get_stats_persist_period_sec(copy));
+ CheckCondition(1 == rocksdb_options_get_advise_random_on_open(copy));
+ CheckCondition(3 ==
+ rocksdb_options_get_access_hint_on_compaction_start(copy));
+ CheckCondition(1 == rocksdb_options_get_use_adaptive_mutex(copy));
+ CheckCondition(19 == rocksdb_options_get_bytes_per_sync(copy));
+ CheckCondition(20 == rocksdb_options_get_wal_bytes_per_sync(copy));
+ CheckCondition(21 ==
+ rocksdb_options_get_writable_file_max_buffer_size(copy));
+ CheckCondition(1 ==
+ rocksdb_options_get_allow_concurrent_memtable_write(copy));
+ CheckCondition(
+ 1 == rocksdb_options_get_enable_write_thread_adaptive_yield(copy));
+ CheckCondition(22 ==
+ rocksdb_options_get_max_sequential_skip_in_iterations(copy));
+ CheckCondition(1 == rocksdb_options_get_disable_auto_compactions(copy));
+ CheckCondition(1 == rocksdb_options_get_optimize_filters_for_hits(copy));
+ CheckCondition(
+ 23 == rocksdb_options_get_delete_obsolete_files_period_micros(copy));
+ CheckCondition(2.0 ==
+ rocksdb_options_get_memtable_prefix_bloom_size_ratio(copy));
+ CheckCondition(24 == rocksdb_options_get_max_compaction_bytes(copy));
+ CheckCondition(25 == rocksdb_options_get_memtable_huge_page_size(copy));
+ CheckCondition(26 == rocksdb_options_get_max_successive_merges(copy));
+ CheckCondition(27 == rocksdb_options_get_bloom_locality(copy));
+ CheckCondition(1 == rocksdb_options_get_inplace_update_support(copy));
+ CheckCondition(28 == rocksdb_options_get_inplace_update_num_locks(copy));
+ CheckCondition(1 == rocksdb_options_get_report_bg_io_stats(copy));
+ CheckCondition(2 == rocksdb_options_get_wal_recovery_mode(copy));
+ CheckCondition(5 == rocksdb_options_get_compression(copy));
+ CheckCondition(4 == rocksdb_options_get_bottommost_compression(copy));
+ CheckCondition(2 == rocksdb_options_get_compaction_style(copy));
+ CheckCondition(1 == rocksdb_options_get_atomic_flush(copy));
+ CheckCondition(29.0 ==
+ rocksdb_options_get_experimental_mempurge_threshold(copy));
+
+ // Copies should be independent.
+ rocksdb_options_set_allow_ingest_behind(copy, 0);
+ CheckCondition(0 == rocksdb_options_get_allow_ingest_behind(copy));
+ CheckCondition(1 == rocksdb_options_get_allow_ingest_behind(o));
+
+ rocksdb_options_compaction_readahead_size(copy, 20);
+ CheckCondition(20 == rocksdb_options_get_compaction_readahead_size(copy));
+ CheckCondition(10 == rocksdb_options_get_compaction_readahead_size(o));
+
+ rocksdb_options_set_create_if_missing(copy, 0);
+ CheckCondition(0 == rocksdb_options_get_create_if_missing(copy));
+ CheckCondition(1 == rocksdb_options_get_create_if_missing(o));
+
+ rocksdb_options_set_create_missing_column_families(copy, 0);
+ CheckCondition(0 ==
+ rocksdb_options_get_create_missing_column_families(copy));
+ CheckCondition(1 == rocksdb_options_get_create_missing_column_families(o));
+
+ rocksdb_options_set_error_if_exists(copy, 0);
+ CheckCondition(0 == rocksdb_options_get_error_if_exists(copy));
+ CheckCondition(1 == rocksdb_options_get_error_if_exists(o));
+
+ rocksdb_options_set_paranoid_checks(copy, 0);
+ CheckCondition(0 == rocksdb_options_get_paranoid_checks(copy));
+ CheckCondition(1 == rocksdb_options_get_paranoid_checks(o));
+
+ rocksdb_options_set_info_log_level(copy, 2);
+ CheckCondition(2 == rocksdb_options_get_info_log_level(copy));
+ CheckCondition(3 == rocksdb_options_get_info_log_level(o));
+
+ rocksdb_options_set_write_buffer_size(copy, 200);
+ CheckCondition(200 == rocksdb_options_get_write_buffer_size(copy));
+ CheckCondition(100 == rocksdb_options_get_write_buffer_size(o));
+
+ rocksdb_options_set_db_write_buffer_size(copy, 2000);
+ CheckCondition(2000 == rocksdb_options_get_db_write_buffer_size(copy));
+ CheckCondition(1000 == rocksdb_options_get_db_write_buffer_size(o));
+
+ rocksdb_options_set_max_open_files(copy, 42);
+ CheckCondition(42 == rocksdb_options_get_max_open_files(copy));
+ CheckCondition(21 == rocksdb_options_get_max_open_files(o));
+
+ rocksdb_options_set_max_file_opening_threads(copy, 3);
+ CheckCondition(3 == rocksdb_options_get_max_file_opening_threads(copy));
+ CheckCondition(5 == rocksdb_options_get_max_file_opening_threads(o));
+
+ rocksdb_options_set_max_total_wal_size(copy, 4000);
+ CheckCondition(4000 == rocksdb_options_get_max_total_wal_size(copy));
+ CheckCondition(400 == rocksdb_options_get_max_total_wal_size(o));
+
+ rocksdb_options_set_num_levels(copy, 6);
+ CheckCondition(6 == rocksdb_options_get_num_levels(copy));
+ CheckCondition(7 == rocksdb_options_get_num_levels(o));
+
+ rocksdb_options_set_level0_file_num_compaction_trigger(copy, 14);
+ CheckCondition(
+ 14 == rocksdb_options_get_level0_file_num_compaction_trigger(copy));
+ CheckCondition(4 ==
+ rocksdb_options_get_level0_file_num_compaction_trigger(o));
+
+ rocksdb_options_set_level0_slowdown_writes_trigger(copy, 61);
+ CheckCondition(61 ==
+ rocksdb_options_get_level0_slowdown_writes_trigger(copy));
+ CheckCondition(6 == rocksdb_options_get_level0_slowdown_writes_trigger(o));
+
+ rocksdb_options_set_level0_stop_writes_trigger(copy, 17);
+ CheckCondition(17 == rocksdb_options_get_level0_stop_writes_trigger(copy));
+ CheckCondition(8 == rocksdb_options_get_level0_stop_writes_trigger(o));
+
+ rocksdb_options_set_target_file_size_base(copy, 128);
+ CheckCondition(128 == rocksdb_options_get_target_file_size_base(copy));
+ CheckCondition(256 == rocksdb_options_get_target_file_size_base(o));
+
+ rocksdb_options_set_target_file_size_multiplier(copy, 13);
+ CheckCondition(13 == rocksdb_options_get_target_file_size_multiplier(copy));
+ CheckCondition(3 == rocksdb_options_get_target_file_size_multiplier(o));
+
+ rocksdb_options_set_max_bytes_for_level_base(copy, 900);
+ CheckCondition(900 == rocksdb_options_get_max_bytes_for_level_base(copy));
+ CheckCondition(1024 == rocksdb_options_get_max_bytes_for_level_base(o));
+
+ rocksdb_options_set_level_compaction_dynamic_level_bytes(copy, 0);
+ CheckCondition(
+ 0 == rocksdb_options_get_level_compaction_dynamic_level_bytes(copy));
+ CheckCondition(1 ==
+ rocksdb_options_get_level_compaction_dynamic_level_bytes(o));
+
+ rocksdb_options_set_max_bytes_for_level_multiplier(copy, 8.0);
+ CheckCondition(8.0 ==
+ rocksdb_options_get_max_bytes_for_level_multiplier(copy));
+ CheckCondition(2.0 ==
+ rocksdb_options_get_max_bytes_for_level_multiplier(o));
+
+ rocksdb_options_set_skip_stats_update_on_db_open(copy, 0);
+ CheckCondition(0 == rocksdb_options_get_skip_stats_update_on_db_open(copy));
+ CheckCondition(1 == rocksdb_options_get_skip_stats_update_on_db_open(o));
+
+ rocksdb_options_set_skip_checking_sst_file_sizes_on_db_open(copy, 0);
+ CheckCondition(
+ 0 == rocksdb_options_get_skip_checking_sst_file_sizes_on_db_open(copy));
+ CheckCondition(
+ 1 == rocksdb_options_get_skip_checking_sst_file_sizes_on_db_open(o));
+
+ rocksdb_options_set_max_write_buffer_number(copy, 2000);
+ CheckCondition(2000 == rocksdb_options_get_max_write_buffer_number(copy));
+ CheckCondition(97 == rocksdb_options_get_max_write_buffer_number(o));
+
+ rocksdb_options_set_min_write_buffer_number_to_merge(copy, 146);
+ CheckCondition(146 ==
+ rocksdb_options_get_min_write_buffer_number_to_merge(copy));
+ CheckCondition(23 ==
+ rocksdb_options_get_min_write_buffer_number_to_merge(o));
+
+ rocksdb_options_set_max_write_buffer_number_to_maintain(copy, 128);
+ CheckCondition(
+ 128 == rocksdb_options_get_max_write_buffer_number_to_maintain(copy));
+ CheckCondition(64 ==
+ rocksdb_options_get_max_write_buffer_number_to_maintain(o));
+
+ rocksdb_options_set_max_write_buffer_size_to_maintain(copy, 9000);
+ CheckCondition(9000 ==
+ rocksdb_options_get_max_write_buffer_size_to_maintain(copy));
+ CheckCondition(50000 ==
+ rocksdb_options_get_max_write_buffer_size_to_maintain(o));
+
+ rocksdb_options_set_enable_pipelined_write(copy, 0);
+ CheckCondition(0 == rocksdb_options_get_enable_pipelined_write(copy));
+ CheckCondition(1 == rocksdb_options_get_enable_pipelined_write(o));
+
+ rocksdb_options_set_unordered_write(copy, 0);
+ CheckCondition(0 == rocksdb_options_get_unordered_write(copy));
+ CheckCondition(1 == rocksdb_options_get_unordered_write(o));
+
+ rocksdb_options_set_max_subcompactions(copy, 90001);
+ CheckCondition(90001 == rocksdb_options_get_max_subcompactions(copy));
+ CheckCondition(123456 == rocksdb_options_get_max_subcompactions(o));
+
+ rocksdb_options_set_max_background_jobs(copy, 12);
+ CheckCondition(12 == rocksdb_options_get_max_background_jobs(copy));
+ CheckCondition(2 == rocksdb_options_get_max_background_jobs(o));
+
+ rocksdb_options_set_max_background_compactions(copy, 13);
+ CheckCondition(13 == rocksdb_options_get_max_background_compactions(copy));
+ CheckCondition(3 == rocksdb_options_get_max_background_compactions(o));
+
+ rocksdb_options_set_max_background_flushes(copy, 15);
+ CheckCondition(15 == rocksdb_options_get_max_background_flushes(copy));
+ CheckCondition(5 == rocksdb_options_get_max_background_flushes(o));
+
+ rocksdb_options_set_max_log_file_size(copy, 16);
+ CheckCondition(16 == rocksdb_options_get_max_log_file_size(copy));
+ CheckCondition(6 == rocksdb_options_get_max_log_file_size(o));
+
+ rocksdb_options_set_log_file_time_to_roll(copy, 17);
+ CheckCondition(17 == rocksdb_options_get_log_file_time_to_roll(copy));
+ CheckCondition(7 == rocksdb_options_get_log_file_time_to_roll(o));
+
+ rocksdb_options_set_keep_log_file_num(copy, 18);
+ CheckCondition(18 == rocksdb_options_get_keep_log_file_num(copy));
+ CheckCondition(8 == rocksdb_options_get_keep_log_file_num(o));
+
+ rocksdb_options_set_recycle_log_file_num(copy, 19);
+ CheckCondition(19 == rocksdb_options_get_recycle_log_file_num(copy));
+ CheckCondition(9 == rocksdb_options_get_recycle_log_file_num(o));
+
+ rocksdb_options_set_soft_pending_compaction_bytes_limit(copy, 110);
+ CheckCondition(
+ 110 == rocksdb_options_get_soft_pending_compaction_bytes_limit(copy));
+ CheckCondition(10 ==
+ rocksdb_options_get_soft_pending_compaction_bytes_limit(o));
+
+ rocksdb_options_set_hard_pending_compaction_bytes_limit(copy, 111);
+ CheckCondition(
+ 111 == rocksdb_options_get_hard_pending_compaction_bytes_limit(copy));
+ CheckCondition(11 ==
+ rocksdb_options_get_hard_pending_compaction_bytes_limit(o));
+
+ rocksdb_options_set_max_manifest_file_size(copy, 112);
+ CheckCondition(112 == rocksdb_options_get_max_manifest_file_size(copy));
+ CheckCondition(12 == rocksdb_options_get_max_manifest_file_size(o));
+
+ rocksdb_options_set_table_cache_numshardbits(copy, 113);
+ CheckCondition(113 == rocksdb_options_get_table_cache_numshardbits(copy));
+ CheckCondition(13 == rocksdb_options_get_table_cache_numshardbits(o));
+
+ rocksdb_options_set_arena_block_size(copy, 114);
+ CheckCondition(114 == rocksdb_options_get_arena_block_size(copy));
+ CheckCondition(14 == rocksdb_options_get_arena_block_size(o));
+
+ rocksdb_options_set_use_fsync(copy, 0);
+ CheckCondition(0 == rocksdb_options_get_use_fsync(copy));
+ CheckCondition(1 == rocksdb_options_get_use_fsync(o));
+
+ rocksdb_options_set_WAL_ttl_seconds(copy, 115);
+ CheckCondition(115 == rocksdb_options_get_WAL_ttl_seconds(copy));
+ CheckCondition(15 == rocksdb_options_get_WAL_ttl_seconds(o));
+
+ rocksdb_options_set_WAL_size_limit_MB(copy, 116);
+ CheckCondition(116 == rocksdb_options_get_WAL_size_limit_MB(copy));
+ CheckCondition(16 == rocksdb_options_get_WAL_size_limit_MB(o));
+
+ rocksdb_options_set_manifest_preallocation_size(copy, 117);
+ CheckCondition(117 ==
+ rocksdb_options_get_manifest_preallocation_size(copy));
+ CheckCondition(17 == rocksdb_options_get_manifest_preallocation_size(o));
+
+ rocksdb_options_set_allow_mmap_reads(copy, 0);
+ CheckCondition(0 == rocksdb_options_get_allow_mmap_reads(copy));
+ CheckCondition(1 == rocksdb_options_get_allow_mmap_reads(o));
+
+ rocksdb_options_set_allow_mmap_writes(copy, 0);
+ CheckCondition(0 == rocksdb_options_get_allow_mmap_writes(copy));
+ CheckCondition(1 == rocksdb_options_get_allow_mmap_writes(o));
+
+ rocksdb_options_set_use_direct_reads(copy, 0);
+ CheckCondition(0 == rocksdb_options_get_use_direct_reads(copy));
+ CheckCondition(1 == rocksdb_options_get_use_direct_reads(o));
+
+ rocksdb_options_set_use_direct_io_for_flush_and_compaction(copy, 0);
+ CheckCondition(
+ 0 == rocksdb_options_get_use_direct_io_for_flush_and_compaction(copy));
+ CheckCondition(
+ 1 == rocksdb_options_get_use_direct_io_for_flush_and_compaction(o));
+
+ rocksdb_options_set_is_fd_close_on_exec(copy, 0);
+ CheckCondition(0 == rocksdb_options_get_is_fd_close_on_exec(copy));
+ CheckCondition(1 == rocksdb_options_get_is_fd_close_on_exec(o));
+
+ rocksdb_options_set_stats_dump_period_sec(copy, 218);
+ CheckCondition(218 == rocksdb_options_get_stats_dump_period_sec(copy));
+ CheckCondition(18 == rocksdb_options_get_stats_dump_period_sec(o));
+
+ rocksdb_options_set_stats_persist_period_sec(copy, 600);
+ CheckCondition(600 == rocksdb_options_get_stats_persist_period_sec(copy));
+ CheckCondition(5 == rocksdb_options_get_stats_persist_period_sec(o));
+
+ rocksdb_options_set_advise_random_on_open(copy, 0);
+ CheckCondition(0 == rocksdb_options_get_advise_random_on_open(copy));
+ CheckCondition(1 == rocksdb_options_get_advise_random_on_open(o));
+
+ rocksdb_options_set_access_hint_on_compaction_start(copy, 2);
+ CheckCondition(2 ==
+ rocksdb_options_get_access_hint_on_compaction_start(copy));
+ CheckCondition(3 == rocksdb_options_get_access_hint_on_compaction_start(o));
+
+ rocksdb_options_set_use_adaptive_mutex(copy, 0);
+ CheckCondition(0 == rocksdb_options_get_use_adaptive_mutex(copy));
+ CheckCondition(1 == rocksdb_options_get_use_adaptive_mutex(o));
+
+ rocksdb_options_set_bytes_per_sync(copy, 219);
+ CheckCondition(219 == rocksdb_options_get_bytes_per_sync(copy));
+ CheckCondition(19 == rocksdb_options_get_bytes_per_sync(o));
+
+ rocksdb_options_set_wal_bytes_per_sync(copy, 120);
+ CheckCondition(120 == rocksdb_options_get_wal_bytes_per_sync(copy));
+ CheckCondition(20 == rocksdb_options_get_wal_bytes_per_sync(o));
+
+ rocksdb_options_set_writable_file_max_buffer_size(copy, 121);
+ CheckCondition(121 ==
+ rocksdb_options_get_writable_file_max_buffer_size(copy));
+ CheckCondition(21 == rocksdb_options_get_writable_file_max_buffer_size(o));
+
+ rocksdb_options_set_allow_concurrent_memtable_write(copy, 0);
+ CheckCondition(0 ==
+ rocksdb_options_get_allow_concurrent_memtable_write(copy));
+ CheckCondition(1 == rocksdb_options_get_allow_concurrent_memtable_write(o));
+
+ rocksdb_options_set_enable_write_thread_adaptive_yield(copy, 0);
+ CheckCondition(
+ 0 == rocksdb_options_get_enable_write_thread_adaptive_yield(copy));
+ CheckCondition(1 ==
+ rocksdb_options_get_enable_write_thread_adaptive_yield(o));
+
+ rocksdb_options_set_max_sequential_skip_in_iterations(copy, 122);
+ CheckCondition(122 ==
+ rocksdb_options_get_max_sequential_skip_in_iterations(copy));
+ CheckCondition(22 ==
+ rocksdb_options_get_max_sequential_skip_in_iterations(o));
+
+ rocksdb_options_set_disable_auto_compactions(copy, 0);
+ CheckCondition(0 == rocksdb_options_get_disable_auto_compactions(copy));
+ CheckCondition(1 == rocksdb_options_get_disable_auto_compactions(o));
+
+ rocksdb_options_set_optimize_filters_for_hits(copy, 0);
+ CheckCondition(0 == rocksdb_options_get_optimize_filters_for_hits(copy));
+ CheckCondition(1 == rocksdb_options_get_optimize_filters_for_hits(o));
+
+ rocksdb_options_set_delete_obsolete_files_period_micros(copy, 123);
+ CheckCondition(
+ 123 == rocksdb_options_get_delete_obsolete_files_period_micros(copy));
+ CheckCondition(23 ==
+ rocksdb_options_get_delete_obsolete_files_period_micros(o));
+
+ rocksdb_options_set_memtable_prefix_bloom_size_ratio(copy, 4.0);
+ CheckCondition(4.0 ==
+ rocksdb_options_get_memtable_prefix_bloom_size_ratio(copy));
+ CheckCondition(2.0 ==
+ rocksdb_options_get_memtable_prefix_bloom_size_ratio(o));
+
+ rocksdb_options_set_max_compaction_bytes(copy, 124);
+ CheckCondition(124 == rocksdb_options_get_max_compaction_bytes(copy));
+ CheckCondition(24 == rocksdb_options_get_max_compaction_bytes(o));
+
+ rocksdb_options_set_memtable_huge_page_size(copy, 125);
+ CheckCondition(125 == rocksdb_options_get_memtable_huge_page_size(copy));
+ CheckCondition(25 == rocksdb_options_get_memtable_huge_page_size(o));
+
+ rocksdb_options_set_max_successive_merges(copy, 126);
+ CheckCondition(126 == rocksdb_options_get_max_successive_merges(copy));
+ CheckCondition(26 == rocksdb_options_get_max_successive_merges(o));
+
+ rocksdb_options_set_bloom_locality(copy, 127);
+ CheckCondition(127 == rocksdb_options_get_bloom_locality(copy));
+ CheckCondition(27 == rocksdb_options_get_bloom_locality(o));
+
+ rocksdb_options_set_inplace_update_support(copy, 0);
+ CheckCondition(0 == rocksdb_options_get_inplace_update_support(copy));
+ CheckCondition(1 == rocksdb_options_get_inplace_update_support(o));
+
+ rocksdb_options_set_inplace_update_num_locks(copy, 128);
+ CheckCondition(128 == rocksdb_options_get_inplace_update_num_locks(copy));
+ CheckCondition(28 == rocksdb_options_get_inplace_update_num_locks(o));
+
+ rocksdb_options_set_report_bg_io_stats(copy, 0);
+ CheckCondition(0 == rocksdb_options_get_report_bg_io_stats(copy));
+ CheckCondition(1 == rocksdb_options_get_report_bg_io_stats(o));
+
+ rocksdb_options_set_wal_recovery_mode(copy, 1);
+ CheckCondition(1 == rocksdb_options_get_wal_recovery_mode(copy));
+ CheckCondition(2 == rocksdb_options_get_wal_recovery_mode(o));
+
+ rocksdb_options_set_compression(copy, 4);
+ CheckCondition(4 == rocksdb_options_get_compression(copy));
+ CheckCondition(5 == rocksdb_options_get_compression(o));
+
+ rocksdb_options_set_bottommost_compression(copy, 3);
+ CheckCondition(3 == rocksdb_options_get_bottommost_compression(copy));
+ CheckCondition(4 == rocksdb_options_get_bottommost_compression(o));
+
+ rocksdb_options_set_compaction_style(copy, 1);
+ CheckCondition(1 == rocksdb_options_get_compaction_style(copy));
+ CheckCondition(2 == rocksdb_options_get_compaction_style(o));
+
+ rocksdb_options_set_atomic_flush(copy, 0);
+ CheckCondition(0 == rocksdb_options_get_atomic_flush(copy));
+ CheckCondition(1 == rocksdb_options_get_atomic_flush(o));
+
+ rocksdb_options_set_experimental_mempurge_threshold(copy, 229.0);
+ CheckCondition(229.0 ==
+ rocksdb_options_get_experimental_mempurge_threshold(copy));
+ CheckCondition(29.0 ==
+ rocksdb_options_get_experimental_mempurge_threshold(o));
+
+ rocksdb_options_destroy(copy);
+ rocksdb_options_destroy(o);
+ }
+
+ StartPhase("read_options");
+ {
+ rocksdb_readoptions_t* ro;
+ ro = rocksdb_readoptions_create();
+
+ rocksdb_readoptions_set_verify_checksums(ro, 1);
+ CheckCondition(1 == rocksdb_readoptions_get_verify_checksums(ro));
+
+ rocksdb_readoptions_set_fill_cache(ro, 1);
+ CheckCondition(1 == rocksdb_readoptions_get_fill_cache(ro));
+
+ rocksdb_readoptions_set_read_tier(ro, 2);
+ CheckCondition(2 == rocksdb_readoptions_get_read_tier(ro));
+
+ rocksdb_readoptions_set_tailing(ro, 1);
+ CheckCondition(1 == rocksdb_readoptions_get_tailing(ro));
+
+ rocksdb_readoptions_set_readahead_size(ro, 100);
+ CheckCondition(100 == rocksdb_readoptions_get_readahead_size(ro));
+
+ rocksdb_readoptions_set_prefix_same_as_start(ro, 1);
+ CheckCondition(1 == rocksdb_readoptions_get_prefix_same_as_start(ro));
+
+ rocksdb_readoptions_set_pin_data(ro, 1);
+ CheckCondition(1 == rocksdb_readoptions_get_pin_data(ro));
+
+ rocksdb_readoptions_set_total_order_seek(ro, 1);
+ CheckCondition(1 == rocksdb_readoptions_get_total_order_seek(ro));
+
+ rocksdb_readoptions_set_max_skippable_internal_keys(ro, 200);
+ CheckCondition(200 ==
+ rocksdb_readoptions_get_max_skippable_internal_keys(ro));
+
+ rocksdb_readoptions_set_background_purge_on_iterator_cleanup(ro, 1);
+ CheckCondition(
+ 1 == rocksdb_readoptions_get_background_purge_on_iterator_cleanup(ro));
+
+ rocksdb_readoptions_set_ignore_range_deletions(ro, 1);
+ CheckCondition(1 == rocksdb_readoptions_get_ignore_range_deletions(ro));
+
+ rocksdb_readoptions_set_deadline(ro, 300);
+ CheckCondition(300 == rocksdb_readoptions_get_deadline(ro));
+
+ rocksdb_readoptions_set_io_timeout(ro, 400);
+ CheckCondition(400 == rocksdb_readoptions_get_io_timeout(ro));
+
+ rocksdb_readoptions_destroy(ro);
+ }
+
+ StartPhase("write_options");
+ {
+ rocksdb_writeoptions_t* wo;
+ wo = rocksdb_writeoptions_create();
+
+ rocksdb_writeoptions_set_sync(wo, 1);
+ CheckCondition(1 == rocksdb_writeoptions_get_sync(wo));
+
+ rocksdb_writeoptions_disable_WAL(wo, 1);
+ CheckCondition(1 == rocksdb_writeoptions_get_disable_WAL(wo));
+
+ rocksdb_writeoptions_set_ignore_missing_column_families(wo, 1);
+ CheckCondition(1 ==
+ rocksdb_writeoptions_get_ignore_missing_column_families(wo));
+
+ rocksdb_writeoptions_set_no_slowdown(wo, 1);
+ CheckCondition(1 == rocksdb_writeoptions_get_no_slowdown(wo));
+
+ rocksdb_writeoptions_set_low_pri(wo, 1);
+ CheckCondition(1 == rocksdb_writeoptions_get_low_pri(wo));
+
+ rocksdb_writeoptions_set_memtable_insert_hint_per_batch(wo, 1);
+ CheckCondition(1 ==
+ rocksdb_writeoptions_get_memtable_insert_hint_per_batch(wo));
+
+ rocksdb_writeoptions_destroy(wo);
+ }
+
+ StartPhase("compact_options");
+ {
+ rocksdb_compactoptions_t* co;
+ co = rocksdb_compactoptions_create();
+
+ rocksdb_compactoptions_set_exclusive_manual_compaction(co, 1);
+ CheckCondition(1 ==
+ rocksdb_compactoptions_get_exclusive_manual_compaction(co));
+
+ rocksdb_compactoptions_set_bottommost_level_compaction(co, 1);
+ CheckCondition(1 ==
+ rocksdb_compactoptions_get_bottommost_level_compaction(co));
+
+ rocksdb_compactoptions_set_change_level(co, 1);
+ CheckCondition(1 == rocksdb_compactoptions_get_change_level(co));
+
+ rocksdb_compactoptions_set_target_level(co, 1);
+ CheckCondition(1 == rocksdb_compactoptions_get_target_level(co));
+
+ rocksdb_compactoptions_destroy(co);
+ }
+
+ StartPhase("flush_options");
+ {
+ rocksdb_flushoptions_t* fo;
+ fo = rocksdb_flushoptions_create();
+
+ rocksdb_flushoptions_set_wait(fo, 1);
+ CheckCondition(1 == rocksdb_flushoptions_get_wait(fo));
+
+ rocksdb_flushoptions_destroy(fo);
+ }
+
+ StartPhase("cache_options");
+ {
+ rocksdb_cache_t* co;
+ co = rocksdb_cache_create_lru(100);
+ CheckCondition(100 == rocksdb_cache_get_capacity(co));
+
+ rocksdb_cache_set_capacity(co, 200);
+ CheckCondition(200 == rocksdb_cache_get_capacity(co));
+
+ rocksdb_cache_destroy(co);
+ }
+
+ StartPhase("jemalloc_nodump_allocator");
+ {
+ rocksdb_memory_allocator_t* allocator;
+ allocator = rocksdb_jemalloc_nodump_allocator_create(&err);
+ if (err != NULL) {
+ // not supported on all platforms, allow unsupported error
+ const char* ni = "Not implemented: ";
+ size_t ni_len = strlen(ni);
+ size_t err_len = strlen(err);
+
+ CheckCondition(err_len >= ni_len);
+ CheckCondition(memcmp(ni, err, ni_len) == 0);
+ Free(&err);
+ } else {
+ rocksdb_cache_t* co;
+ rocksdb_lru_cache_options_t* copts;
+
+ copts = rocksdb_lru_cache_options_create();
+
+ rocksdb_lru_cache_options_set_capacity(copts, 100);
+ rocksdb_lru_cache_options_set_memory_allocator(copts, allocator);
+
+ co = rocksdb_cache_create_lru_opts(copts);
+ CheckCondition(100 == rocksdb_cache_get_capacity(co));
+
+ rocksdb_cache_destroy(co);
+ rocksdb_lru_cache_options_destroy(copts);
+ }
+ rocksdb_memory_allocator_destroy(allocator);
+ }
+
+ StartPhase("env");
+ {
+ rocksdb_env_t* e;
+ e = rocksdb_create_default_env();
+
+ rocksdb_env_set_background_threads(e, 10);
+ CheckCondition(10 == rocksdb_env_get_background_threads(e));
+
+ rocksdb_env_set_high_priority_background_threads(e, 20);
+ CheckCondition(20 == rocksdb_env_get_high_priority_background_threads(e));
+
+ rocksdb_env_set_low_priority_background_threads(e, 30);
+ CheckCondition(30 == rocksdb_env_get_low_priority_background_threads(e));
+
+ rocksdb_env_set_bottom_priority_background_threads(e, 40);
+ CheckCondition(40 == rocksdb_env_get_bottom_priority_background_threads(e));
+
+ rocksdb_env_destroy(e);
+ }
+
+ StartPhase("universal_compaction_options");
+ {
+ rocksdb_universal_compaction_options_t* uco;
+ uco = rocksdb_universal_compaction_options_create();
+
+ rocksdb_universal_compaction_options_set_size_ratio(uco, 5);
+ CheckCondition(5 ==
+ rocksdb_universal_compaction_options_get_size_ratio(uco));
+
+ rocksdb_universal_compaction_options_set_min_merge_width(uco, 15);
+ CheckCondition(
+ 15 == rocksdb_universal_compaction_options_get_min_merge_width(uco));
+
+ rocksdb_universal_compaction_options_set_max_merge_width(uco, 25);
+ CheckCondition(
+ 25 == rocksdb_universal_compaction_options_get_max_merge_width(uco));
+
+ rocksdb_universal_compaction_options_set_max_size_amplification_percent(uco,
+ 35);
+ CheckCondition(
+ 35 ==
+ rocksdb_universal_compaction_options_get_max_size_amplification_percent(
+ uco));
+
+ rocksdb_universal_compaction_options_set_compression_size_percent(uco, 45);
+ CheckCondition(
+ 45 ==
+ rocksdb_universal_compaction_options_get_compression_size_percent(uco));
+
+ rocksdb_universal_compaction_options_set_stop_style(uco, 1);
+ CheckCondition(1 ==
+ rocksdb_universal_compaction_options_get_stop_style(uco));
+
+ rocksdb_universal_compaction_options_destroy(uco);
+ }
+
+ StartPhase("fifo_compaction_options");
+ {
+ rocksdb_fifo_compaction_options_t* fco;
+ fco = rocksdb_fifo_compaction_options_create();
+
+ rocksdb_fifo_compaction_options_set_max_table_files_size(fco, 100000);
+ CheckCondition(
+ 100000 ==
+ rocksdb_fifo_compaction_options_get_max_table_files_size(fco));
+
+ rocksdb_fifo_compaction_options_destroy(fco);
+ }
+
+ StartPhase("backup_engine_option");
+ {
+ rocksdb_backup_engine_options_t* bdo;
+ bdo = rocksdb_backup_engine_options_create("path");
+
+ rocksdb_backup_engine_options_set_share_table_files(bdo, 1);
+ CheckCondition(1 ==
+ rocksdb_backup_engine_options_get_share_table_files(bdo));
+
+ rocksdb_backup_engine_options_set_sync(bdo, 1);
+ CheckCondition(1 == rocksdb_backup_engine_options_get_sync(bdo));
+
+ rocksdb_backup_engine_options_set_destroy_old_data(bdo, 1);
+ CheckCondition(1 ==
+ rocksdb_backup_engine_options_get_destroy_old_data(bdo));
+
+ rocksdb_backup_engine_options_set_backup_log_files(bdo, 1);
+ CheckCondition(1 ==
+ rocksdb_backup_engine_options_get_backup_log_files(bdo));
+
+ rocksdb_backup_engine_options_set_backup_rate_limit(bdo, 123);
+ CheckCondition(123 ==
+ rocksdb_backup_engine_options_get_backup_rate_limit(bdo));
+
+ rocksdb_backup_engine_options_set_restore_rate_limit(bdo, 37);
+ CheckCondition(37 ==
+ rocksdb_backup_engine_options_get_restore_rate_limit(bdo));
+
+ rocksdb_backup_engine_options_set_max_background_operations(bdo, 20);
+ CheckCondition(
+ 20 == rocksdb_backup_engine_options_get_max_background_operations(bdo));
+
+ rocksdb_backup_engine_options_set_callback_trigger_interval_size(bdo, 9000);
+ CheckCondition(
+ 9000 ==
+ rocksdb_backup_engine_options_get_callback_trigger_interval_size(bdo));
+
+ rocksdb_backup_engine_options_set_max_valid_backups_to_open(bdo, 40);
+ CheckCondition(
+ 40 == rocksdb_backup_engine_options_get_max_valid_backups_to_open(bdo));
+
+ rocksdb_backup_engine_options_set_share_files_with_checksum_naming(bdo, 2);
+ CheckCondition(
+ 2 == rocksdb_backup_engine_options_get_share_files_with_checksum_naming(
+ bdo));
+
+ rocksdb_backup_engine_options_destroy(bdo);
+ }
+
+ StartPhase("compression_options");
+ {
+ rocksdb_options_t* co;
+ co = rocksdb_options_create();
+
+ rocksdb_options_set_compression_options_zstd_max_train_bytes(co, 100);
+ CheckCondition(
+ 100 ==
+ rocksdb_options_get_compression_options_zstd_max_train_bytes(co));
+
+ rocksdb_options_set_compression_options_parallel_threads(co, 2);
+ CheckCondition(
+ 2 == rocksdb_options_get_compression_options_parallel_threads(co));
+
+ rocksdb_options_set_compression_options_max_dict_buffer_bytes(co, 200);
+ CheckCondition(
+ 200 ==
+ rocksdb_options_get_compression_options_max_dict_buffer_bytes(co));
+
+ rocksdb_options_set_compression_options_use_zstd_dict_trainer(co, 0);
+ CheckCondition(
+ 0 == rocksdb_options_get_compression_options_use_zstd_dict_trainer(co));
+ rocksdb_options_destroy(co);
+ }
+
+ StartPhase("iterate_upper_bound");
+ {
+ // Create new empty database
+ rocksdb_close(db);
+ rocksdb_destroy_db(options, dbname, &err);
+ CheckNoError(err);
+
+ rocksdb_options_set_prefix_extractor(options, NULL);
+ db = rocksdb_open(options, dbname, &err);
+ CheckNoError(err);
+
+ rocksdb_put(db, woptions, "a", 1, "0", 1, &err);
+ CheckNoError(err);
+ rocksdb_put(db, woptions, "foo", 3, "bar", 3, &err);
+ CheckNoError(err);
+ rocksdb_put(db, woptions, "foo1", 4, "bar1", 4, &err);
+ CheckNoError(err);
+ rocksdb_put(db, woptions, "g1", 2, "0", 1, &err);
+ CheckNoError(err);
+
+ // testing basic case with no iterate_upper_bound and no prefix_extractor
+ {
+ rocksdb_readoptions_set_iterate_upper_bound(roptions, NULL, 0);
+ rocksdb_iterator_t* iter = rocksdb_create_iterator(db, roptions);
+
+ rocksdb_iter_seek(iter, "foo", 3);
+ CheckCondition(rocksdb_iter_valid(iter));
+ CheckIter(iter, "foo", "bar");
+
+ rocksdb_iter_next(iter);
+ CheckCondition(rocksdb_iter_valid(iter));
+ CheckIter(iter, "foo1", "bar1");
+
+ rocksdb_iter_next(iter);
+ CheckCondition(rocksdb_iter_valid(iter));
+ CheckIter(iter, "g1", "0");
+
+ rocksdb_iter_destroy(iter);
+ }
+
+ // testing iterate_upper_bound and forward iterator
+ // to make sure it stops at bound
+ {
+ // iterate_upper_bound points beyond the last expected entry
+ rocksdb_readoptions_set_iterate_upper_bound(roptions, "foo2", 4);
+
+ rocksdb_iterator_t* iter = rocksdb_create_iterator(db, roptions);
+
+ rocksdb_iter_seek(iter, "foo", 3);
+ CheckCondition(rocksdb_iter_valid(iter));
+ CheckIter(iter, "foo", "bar");
+
+ rocksdb_iter_next(iter);
+ CheckCondition(rocksdb_iter_valid(iter));
+ CheckIter(iter, "foo1", "bar1");
+
+ rocksdb_iter_next(iter);
+ // should stop here...
+ CheckCondition(!rocksdb_iter_valid(iter));
+
+ rocksdb_iter_destroy(iter);
+ rocksdb_readoptions_set_iterate_upper_bound(roptions, NULL, 0);
+ }
+ }
+
+ StartPhase("transactions");
+ {
+ rocksdb_close(db);
+ rocksdb_destroy_db(options, dbname, &err);
+ CheckNoError(err);
+
+ // open a TransactionDB
+ txn_db_options = rocksdb_transactiondb_options_create();
+ txn_options = rocksdb_transaction_options_create();
+ rocksdb_options_set_create_if_missing(options, 1);
+ txn_db = rocksdb_transactiondb_open(options, txn_db_options, dbname, &err);
+ CheckNoError(err);
+
+ // put outside a transaction
+ rocksdb_transactiondb_put(txn_db, woptions, "foo", 3, "hello", 5, &err);
+ CheckNoError(err);
+ CheckTxnDBGet(txn_db, roptions, "foo", "hello");
+ CheckTxnDBPinGet(txn_db, roptions, "foo", "hello");
+
+ // delete from outside transaction
+ rocksdb_transactiondb_delete(txn_db, woptions, "foo", 3, &err);
+ CheckNoError(err);
+ CheckTxnDBGet(txn_db, roptions, "foo", NULL);
+ CheckTxnDBPinGet(txn_db, roptions, "foo", NULL);
+
+ // write batch into TransactionDB
+ rocksdb_writebatch_t* wb = rocksdb_writebatch_create();
+ rocksdb_writebatch_put(wb, "foo", 3, "a", 1);
+ rocksdb_writebatch_clear(wb);
+ rocksdb_writebatch_put(wb, "bar", 3, "b", 1);
+ rocksdb_writebatch_put(wb, "box", 3, "c", 1);
+ rocksdb_writebatch_delete(wb, "bar", 3);
+ rocksdb_transactiondb_write(txn_db, woptions, wb, &err);
+ rocksdb_writebatch_destroy(wb);
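+    // after the writebatch clear and the delete above, only "box" => "c" is
+    // expected to remain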
+ CheckTxnDBGet(txn_db, roptions, "box", "c");
+ CheckTxnDBPinGet(txn_db, roptions, "box", "c");
+ CheckNoError(err);
+
+ // multi get
+ {
+ const char* keys[3] = {"box", "foo", "notfound"};
+ const size_t keys_sizes[3] = {3, 3, 8};
+ char* vals[3];
+ size_t vals_sizes[3];
+ char* errs[3];
+ const char* expected[3] = {"c", NULL, NULL};
+ rocksdb_transactiondb_multi_get(txn_db, roptions, 3, keys, keys_sizes,
+ vals, vals_sizes, errs);
+ CheckMultiGetValues(3, vals, vals_sizes, errs, expected);
+ }
+
+ // begin a transaction
+ txn = rocksdb_transaction_begin(txn_db, woptions, txn_options, NULL);
+ // put
+ rocksdb_transaction_put(txn, "foo", 3, "hello", 5, &err);
+ CheckNoError(err);
+ CheckTxnGet(txn, roptions, "foo", "hello");
+ CheckTxnPinGet(txn, roptions, "foo", "hello");
+ {
+ const char* keys[3] = {"box", "foo", "notfound"};
+ const size_t keys_sizes[3] = {3, 3, 8};
+ char* vals[3];
+ size_t vals_sizes[3];
+ char* errs[3];
+ const char* expected[3] = {"c", "hello", NULL};
+ rocksdb_transaction_multi_get(txn, roptions, 3, keys, keys_sizes, vals,
+ vals_sizes, errs);
+ CheckMultiGetValues(3, vals, vals_sizes, errs, expected);
+ }
+ // delete
+ rocksdb_transaction_delete(txn, "foo", 3, &err);
+ CheckNoError(err);
+ CheckTxnGet(txn, roptions, "foo", NULL);
+ CheckTxnPinGet(txn, roptions, "foo", NULL);
+
+ rocksdb_transaction_put(txn, "foo", 3, "hello", 5, &err);
+ CheckNoError(err);
+
+ // read from outside transaction, before commit
+ CheckTxnDBGet(txn_db, roptions, "foo", NULL);
+ CheckTxnDBPinGet(txn_db, roptions, "foo", NULL);
+ {
+ const char* keys[3] = {"box", "foo", "notfound"};
+ const size_t keys_sizes[3] = {3, 3, 8};
+ char* vals[3];
+ size_t vals_sizes[3];
+ char* errs[3];
+ const char* expected[3] = {"c", NULL, NULL};
+ rocksdb_transactiondb_multi_get(txn_db, roptions, 3, keys, keys_sizes,
+ vals, vals_sizes, errs);
+ CheckMultiGetValues(3, vals, vals_sizes, errs, expected);
+ }
+
+ // commit
+ rocksdb_transaction_commit(txn, &err);
+ CheckNoError(err);
+
+ // read from outside transaction, after commit
+ CheckTxnDBGet(txn_db, roptions, "foo", "hello");
+ CheckTxnDBPinGet(txn_db, roptions, "foo", "hello");
+ {
+ const char* keys[3] = {"box", "foo", "notfound"};
+ const size_t keys_sizes[3] = {3, 3, 8};
+ char* vals[3];
+ size_t vals_sizes[3];
+ char* errs[3];
+ const char* expected[3] = {"c", "hello", NULL};
+ rocksdb_transactiondb_multi_get(txn_db, roptions, 3, keys, keys_sizes,
+ vals, vals_sizes, errs);
+ CheckMultiGetValues(3, vals, vals_sizes, errs, expected);
+ }
+
+ // reuse old transaction
+ txn = rocksdb_transaction_begin(txn_db, woptions, txn_options, txn);
+
+ // snapshot
+ const rocksdb_snapshot_t* snapshot;
+ snapshot = rocksdb_transactiondb_create_snapshot(txn_db);
+ rocksdb_readoptions_set_snapshot(roptions, snapshot);
+
+ rocksdb_transactiondb_put(txn_db, woptions, "foo", 3, "hey", 3, &err);
+ CheckNoError(err);
+
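+    // reads through the snapshot still see the pre-snapshot value "hello";
+    // once the snapshot is released below, reads observe the newer "hey"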
+ CheckTxnDBGet(txn_db, roptions, "foo", "hello");
+ CheckTxnDBPinGet(txn_db, roptions, "foo", "hello");
+ rocksdb_readoptions_set_snapshot(roptions, NULL);
+ rocksdb_transactiondb_release_snapshot(txn_db, snapshot);
+ CheckTxnDBGet(txn_db, roptions, "foo", "hey");
+ CheckTxnDBPinGet(txn_db, roptions, "foo", "hey");
+
+ // iterate
+ rocksdb_transaction_put(txn, "bar", 3, "hi", 2, &err);
+ rocksdb_iterator_t* iter =
+ rocksdb_transaction_create_iterator(txn, roptions);
+ CheckCondition(!rocksdb_iter_valid(iter));
+ rocksdb_iter_seek_to_first(iter);
+ CheckCondition(rocksdb_iter_valid(iter));
+ CheckIter(iter, "bar", "hi");
+ rocksdb_iter_get_error(iter, &err);
+ CheckNoError(err);
+ rocksdb_iter_destroy(iter);
+
+ // rollback
+ rocksdb_transaction_rollback(txn, &err);
+ CheckNoError(err);
+ CheckTxnDBGet(txn_db, roptions, "bar", NULL);
+ CheckTxnDBPinGet(txn_db, roptions, "bar", NULL);
+
+ // save point
+ rocksdb_transaction_put(txn, "foo1", 4, "hi1", 3, &err);
+ rocksdb_transaction_set_savepoint(txn);
+ CheckTxnGet(txn, roptions, "foo1", "hi1");
+ CheckTxnPinGet(txn, roptions, "foo1", "hi1");
+ rocksdb_transaction_put(txn, "foo2", 4, "hi2", 3, &err);
+ CheckTxnGet(txn, roptions, "foo2", "hi2");
+ CheckTxnPinGet(txn, roptions, "foo2", "hi2");
+
+ // rollback to savepoint
+ rocksdb_transaction_rollback_to_savepoint(txn, &err);
+ CheckNoError(err);
+ CheckTxnGet(txn, roptions, "foo2", NULL);
+ CheckTxnGet(txn, roptions, "foo1", "hi1");
+ CheckTxnPinGet(txn, roptions, "foo2", NULL);
+ CheckTxnPinGet(txn, roptions, "foo1", "hi1");
+ CheckTxnDBGet(txn_db, roptions, "foo1", NULL);
+ CheckTxnDBGet(txn_db, roptions, "foo2", NULL);
+ CheckTxnDBPinGet(txn_db, roptions, "foo1", NULL);
+ CheckTxnDBPinGet(txn_db, roptions, "foo2", NULL);
+ rocksdb_transaction_commit(txn, &err);
+ CheckNoError(err);
+ CheckTxnDBGet(txn_db, roptions, "foo1", "hi1");
+ CheckTxnDBGet(txn_db, roptions, "foo2", NULL);
+ CheckTxnDBPinGet(txn_db, roptions, "foo1", "hi1");
+ CheckTxnDBPinGet(txn_db, roptions, "foo2", NULL);
+
+ // Column families.
+ rocksdb_column_family_handle_t* cfh;
+ cfh = rocksdb_transactiondb_create_column_family(txn_db, options,
+ "txn_db_cf", &err);
+ CheckNoError(err);
+
+ rocksdb_transactiondb_put_cf(txn_db, woptions, cfh, "cf_foo", 6, "cf_hello",
+ 8, &err);
+ CheckNoError(err);
+ CheckTxnDBGetCF(txn_db, roptions, cfh, "cf_foo", "cf_hello");
+ CheckTxnDBPinGetCF(txn_db, roptions, cfh, "cf_foo", "cf_hello");
+ {
+ const rocksdb_column_family_handle_t* get_handles[2] = {cfh, cfh};
+ const char* keys[2] = {"cf_foo", "notfound"};
+ const size_t keys_sizes[2] = {6, 8};
+ char* vals[2];
+ size_t vals_sizes[2];
+ char* errs[2];
+ const char* expected[2] = {"cf_hello", NULL};
+ rocksdb_transactiondb_multi_get_cf(txn_db, roptions, get_handles, 2, keys,
+ keys_sizes, vals, vals_sizes, errs);
+ CheckMultiGetValues(2, vals, vals_sizes, errs, expected);
+ }
+
+ rocksdb_transactiondb_delete_cf(txn_db, woptions, cfh, "cf_foo", 6, &err);
+ CheckNoError(err);
+ CheckTxnDBGetCF(txn_db, roptions, cfh, "cf_foo", NULL);
+ CheckTxnDBPinGetCF(txn_db, roptions, cfh, "cf_foo", NULL);
+
+ // flush
+ rocksdb_flushoptions_t* flush_options = rocksdb_flushoptions_create();
+ rocksdb_flushoptions_set_wait(flush_options, 1);
+ rocksdb_transactiondb_flush_wal(txn_db, 1, &err);
+ CheckNoError(err);
+ rocksdb_transactiondb_flush_cf(txn_db, flush_options, cfh, &err);
+ CheckNoError(err);
+ rocksdb_transactiondb_flush(txn_db, flush_options, &err);
+ CheckNoError(err);
+ rocksdb_flushoptions_destroy(flush_options);
+
+ // close and destroy
+ rocksdb_column_family_handle_destroy(cfh);
+ rocksdb_transaction_destroy(txn);
+ rocksdb_transactiondb_close(txn_db);
+ rocksdb_destroy_db(options, dbname, &err);
+ CheckNoError(err);
+ rocksdb_transaction_options_destroy(txn_options);
+ rocksdb_transactiondb_options_destroy(txn_db_options);
+ }
+
+ StartPhase("two-phase commit");
+ {
+ // open a TransactionDB
+ txn_db_options = rocksdb_transactiondb_options_create();
+ txn_options = rocksdb_transaction_options_create();
+ rocksdb_options_set_create_if_missing(options, 1);
+ txn_db = rocksdb_transactiondb_open(options, txn_db_options, dbname, &err);
+ CheckNoError(err);
+
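+    // with skip_prepare disabled, committing without a prior prepare and
+    // preparing a transaction that has no name are both expected to fail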
+ rocksdb_transaction_options_set_skip_prepare(txn_options, 0);
+ txn = rocksdb_transaction_begin(txn_db, woptions, txn_options, NULL);
+ rocksdb_transaction_commit(txn, &err);
+ CheckCondition(err != NULL);
+ Free(&err);
+ err = NULL;
+ rocksdb_transaction_prepare(txn, &err);
+ CheckCondition(err != NULL);
+ Free(&err);
+ err = NULL;
+ rocksdb_transaction_set_name(txn, "txn1", 4, &err);
+ CheckNoError(err);
+ rocksdb_transaction_prepare(txn, &err);
+ CheckNoError(err);
+ rocksdb_transaction_commit(txn, &err);
+ CheckNoError(err);
+ rocksdb_transaction_destroy(txn);
+
+ // prepare 2 transactions and close db.
+ rocksdb_transaction_t* txn1 =
+ rocksdb_transaction_begin(txn_db, woptions, txn_options, NULL);
+ rocksdb_transaction_put(txn1, "bar1", 4, "1", 1, &err);
+ CheckNoError(err);
+ rocksdb_transaction_set_name(txn1, "txn1", 4, &err);
+ CheckNoError(err);
+ rocksdb_transaction_prepare(txn1, &err);
+ CheckNoError(err);
+ rocksdb_transaction_t* txn2 =
+ rocksdb_transaction_begin(txn_db, woptions, txn_options, NULL);
+ rocksdb_transaction_put(txn2, "bar2", 4, "2", 1, &err);
+ CheckNoError(err);
+ rocksdb_transaction_set_name(txn2, "txn2", 4, &err);
+ CheckNoError(err);
+ rocksdb_transaction_prepare(txn2, &err);
+ CheckNoError(err);
+ rocksdb_transaction_destroy(txn1);
+ rocksdb_transaction_destroy(txn2);
+ rocksdb_transactiondb_close(txn_db);
+ rocksdb_transaction_options_destroy(txn_options);
+ rocksdb_transactiondb_options_destroy(txn_db_options);
+
+    // reopen the db and recover all prepared transactions.
+ txn_db_options = rocksdb_transactiondb_options_create();
+ txn_options = rocksdb_transaction_options_create();
+ rocksdb_options_set_error_if_exists(options, 0);
+ txn_db = rocksdb_transactiondb_open(options, txn_db_options, dbname, &err);
+ CheckNoError(err);
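+    // prepared but not yet committed writes must not be visible after reopen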
+ CheckTxnDBPinGet(txn_db, roptions, "bar1", NULL);
+ CheckTxnDBPinGet(txn_db, roptions, "bar2", NULL);
+ size_t cnt;
+ rocksdb_transaction_t** txns =
+ rocksdb_transactiondb_get_prepared_transactions(txn_db, &cnt);
+ CheckCondition(cnt == 2);
+ size_t i;
+ for (i = 0; i < cnt; i++) {
+ txn = txns[i];
+ size_t name_len = 0;
+ char* name = rocksdb_transaction_get_name(txn, &name_len);
+ CheckCondition(name_len == 4);
+ if (strncmp(name, "txn1", name_len) == 0) {
+ rocksdb_transaction_commit(txn, &err);
+ } else if (strncmp(name, "txn2", name_len) == 0) {
+ rocksdb_transaction_rollback(txn, &err);
+ }
+ rocksdb_free(name);
+ CheckNoError(err);
+ rocksdb_transaction_destroy(txn);
+ }
+ rocksdb_free(txns);
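+    // txn1 was committed above, so "bar1" is visible; txn2 was rolled back,
+    // so "bar2" is not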
+ CheckTxnDBGet(txn_db, roptions, "bar1", "1");
+ CheckTxnDBGet(txn_db, roptions, "bar2", NULL);
+ rocksdb_transactiondb_put(txn_db, woptions, "bar2", 4, "2", 1, &err);
+ CheckNoError(err);
+
+ // close and destroy
+ rocksdb_transactiondb_close(txn_db);
+ rocksdb_destroy_db(options, dbname, &err);
+ CheckNoError(err);
+ rocksdb_transaction_options_destroy(txn_options);
+ rocksdb_transactiondb_options_destroy(txn_db_options);
+ }
+
+ StartPhase("optimistic_transactions");
+ {
+ rocksdb_options_t* db_options = rocksdb_options_create();
+ rocksdb_options_set_create_if_missing(db_options, 1);
+ rocksdb_options_set_allow_concurrent_memtable_write(db_options, 1);
+ otxn_db = rocksdb_optimistictransactiondb_open(db_options, dbname, &err);
+ otxn_options = rocksdb_optimistictransaction_options_create();
+ rocksdb_transaction_t* txn1 = rocksdb_optimistictransaction_begin(
+ otxn_db, woptions, otxn_options, NULL);
+ rocksdb_transaction_t* txn2 = rocksdb_optimistictransaction_begin(
+ otxn_db, woptions, otxn_options, NULL);
+ rocksdb_transaction_put(txn1, "key", 3, "value", 5, &err);
+ CheckNoError(err);
+ rocksdb_transaction_put(txn2, "key1", 4, "value1", 6, &err);
+ CheckNoError(err);
+ CheckTxnGet(txn1, roptions, "key", "value");
+ CheckTxnPinGet(txn1, roptions, "key", "value");
+ rocksdb_transaction_commit(txn1, &err);
+ CheckNoError(err);
+ rocksdb_transaction_commit(txn2, &err);
+ CheckNoError(err);
+ rocksdb_transaction_destroy(txn1);
+ rocksdb_transaction_destroy(txn2);
+
+ // Check column family
+ db = rocksdb_optimistictransactiondb_get_base_db(otxn_db);
+ rocksdb_put(db, woptions, "key", 3, "value", 5, &err);
+ CheckNoError(err);
+ rocksdb_column_family_handle_t *cfh1, *cfh2;
+ cfh1 = rocksdb_create_column_family(db, db_options, "txn_db_cf1", &err);
+ cfh2 = rocksdb_create_column_family(db, db_options, "txn_db_cf2", &err);
+ txn = rocksdb_optimistictransaction_begin(otxn_db, woptions, otxn_options,
+ NULL);
+ rocksdb_transaction_put_cf(txn, cfh1, "key_cf1", 7, "val_cf1", 7, &err);
+ CheckNoError(err);
+ rocksdb_transaction_put_cf(txn, cfh2, "key_cf2", 7, "val_cf2", 7, &err);
+ CheckNoError(err);
+ rocksdb_transaction_commit(txn, &err);
+ CheckNoError(err);
+ txn = rocksdb_optimistictransaction_begin(otxn_db, woptions, otxn_options,
+ txn);
+ CheckGetCF(db, roptions, cfh1, "key_cf1", "val_cf1");
+ CheckTxnGetCF(txn, roptions, cfh1, "key_cf1", "val_cf1");
+ CheckTxnPinGetCF(txn, roptions, cfh1, "key_cf1", "val_cf1");
+ {
+ const rocksdb_column_family_handle_t* get_handles[3] = {cfh1, cfh2, cfh2};
+ const char* keys[3] = {"key_cf1", "key_cf2", "notfound"};
+ const size_t keys_sizes[3] = {7, 7, 8};
+ char* vals[3];
+ size_t vals_sizes[3];
+ char* errs[3];
+ const char* expected[3] = {"val_cf1", "val_cf2", NULL};
+ rocksdb_transaction_multi_get_cf(txn, roptions, get_handles, 3, keys,
+ keys_sizes, vals, vals_sizes, errs);
+ CheckMultiGetValues(3, vals, vals_sizes, errs, expected);
+ }
+
+ // Check iterator with column family
+ rocksdb_transaction_put_cf(txn, cfh1, "key1_cf", 7, "val1_cf", 7, &err);
+ CheckNoError(err);
+ rocksdb_iterator_t* iter =
+ rocksdb_transaction_create_iterator_cf(txn, roptions, cfh1);
+ CheckCondition(!rocksdb_iter_valid(iter));
+ rocksdb_iter_seek_to_first(iter);
+ CheckCondition(rocksdb_iter_valid(iter));
+ CheckIter(iter, "key1_cf", "val1_cf");
+ rocksdb_iter_get_error(iter, &err);
+ CheckNoError(err);
+ rocksdb_iter_destroy(iter);
+
+ rocksdb_transaction_destroy(txn);
+ rocksdb_column_family_handle_destroy(cfh1);
+ rocksdb_column_family_handle_destroy(cfh2);
+ rocksdb_optimistictransactiondb_close_base_db(db);
+ rocksdb_optimistictransactiondb_close(otxn_db);
+
+ // Check open optimistic transaction db with column families
+ size_t cf_len;
+ char** column_fams =
+ rocksdb_list_column_families(db_options, dbname, &cf_len, &err);
+ CheckNoError(err);
+ CheckEqual("default", column_fams[0], 7);
+ CheckEqual("txn_db_cf1", column_fams[1], 10);
+ CheckEqual("txn_db_cf2", column_fams[2], 10);
+ CheckCondition(cf_len == 3);
+ rocksdb_list_column_families_destroy(column_fams, cf_len);
+
+ const char* cf_names[3] = {"default", "txn_db_cf1", "txn_db_cf2"};
+ rocksdb_options_t* cf_options = rocksdb_options_create();
+ const rocksdb_options_t* cf_opts[3] = {cf_options, cf_options, cf_options};
+
+ rocksdb_options_set_error_if_exists(cf_options, 0);
+ rocksdb_column_family_handle_t* cf_handles[3];
+ otxn_db = rocksdb_optimistictransactiondb_open_column_families(
+ db_options, dbname, 3, cf_names, cf_opts, cf_handles, &err);
+ CheckNoError(err);
+ rocksdb_transaction_t* txn_cf = rocksdb_optimistictransaction_begin(
+ otxn_db, woptions, otxn_options, NULL);
+ CheckTxnGetCF(txn_cf, roptions, cf_handles[0], "key", "value");
+ CheckTxnGetCF(txn_cf, roptions, cf_handles[1], "key_cf1", "val_cf1");
+ CheckTxnGetCF(txn_cf, roptions, cf_handles[2], "key_cf2", "val_cf2");
+ CheckTxnPinGetCF(txn_cf, roptions, cf_handles[0], "key", "value");
+ CheckTxnPinGetCF(txn_cf, roptions, cf_handles[1], "key_cf1", "val_cf1");
+ CheckTxnPinGetCF(txn_cf, roptions, cf_handles[2], "key_cf2", "val_cf2");
+ rocksdb_transaction_destroy(txn_cf);
+ rocksdb_options_destroy(cf_options);
+ rocksdb_column_family_handle_destroy(cf_handles[0]);
+ rocksdb_column_family_handle_destroy(cf_handles[1]);
+ rocksdb_column_family_handle_destroy(cf_handles[2]);
+ rocksdb_optimistictransactiondb_close(otxn_db);
+ rocksdb_destroy_db(db_options, dbname, &err);
+ rocksdb_options_destroy(db_options);
+ rocksdb_optimistictransaction_options_destroy(otxn_options);
+ CheckNoError(err);
+ }
+
+ // Simple sanity check that setting memtable rep works.
+ StartPhase("memtable_reps");
+ {
+ // Create database with vector memtable.
+ rocksdb_options_set_memtable_vector_rep(options);
+ db = rocksdb_open(options, dbname, &err);
+ CheckNoError(err);
+
+ // Create database with hash skiplist memtable.
+ rocksdb_close(db);
+ rocksdb_destroy_db(options, dbname, &err);
+ CheckNoError(err);
+
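+    // note: if no prefix extractor is configured, option sanitization may
+    // silently fall back to the default skiplist rep (see SanitizeOptions in
+    // column_family.cc)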
+ rocksdb_options_set_hash_skip_list_rep(options, 5000, 4, 4);
+ db = rocksdb_open(options, dbname, &err);
+ CheckNoError(err);
+ }
+
+ // Check that secondary instance works.
+ StartPhase("open_as_secondary");
+ {
+ rocksdb_close(db);
+ rocksdb_destroy_db(options, dbname, &err);
+
+ rocksdb_options_t* db_options = rocksdb_options_create();
+ rocksdb_options_set_create_if_missing(db_options, 1);
+ db = rocksdb_open(db_options, dbname, &err);
+ CheckNoError(err);
+ rocksdb_t* db1;
+ rocksdb_options_t* opts = rocksdb_options_create();
+ rocksdb_options_set_max_open_files(opts, -1);
+ rocksdb_options_set_create_if_missing(opts, 1);
+ snprintf(secondary_path, sizeof(secondary_path),
+ "%s/rocksdb_c_test_secondary-%d", GetTempDir(), ((int)geteuid()));
+ db1 = rocksdb_open_as_secondary(opts, dbname, secondary_path, &err);
+ CheckNoError(err);
+
+ rocksdb_writeoptions_set_sync(woptions, 0);
+ rocksdb_writeoptions_disable_WAL(woptions, 1);
+ rocksdb_put(db, woptions, "key0", 4, "value0", 6, &err);
+ CheckNoError(err);
+ rocksdb_flushoptions_t* flush_opts = rocksdb_flushoptions_create();
+ rocksdb_flushoptions_set_wait(flush_opts, 1);
+ rocksdb_flush(db, flush_opts, &err);
+ CheckNoError(err);
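+    // catching up replays the primary's MANIFEST and WAL updates so the
+    // secondary can observe the newly flushed data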
+ rocksdb_try_catch_up_with_primary(db1, &err);
+ CheckNoError(err);
+ rocksdb_readoptions_t* ropts = rocksdb_readoptions_create();
+ rocksdb_readoptions_set_verify_checksums(ropts, 1);
+ rocksdb_readoptions_set_snapshot(ropts, NULL);
+ CheckGet(db, ropts, "key0", "value0");
+ CheckGet(db1, ropts, "key0", "value0");
+
+ rocksdb_writeoptions_disable_WAL(woptions, 0);
+ rocksdb_put(db, woptions, "key1", 4, "value1", 6, &err);
+ CheckNoError(err);
+ rocksdb_try_catch_up_with_primary(db1, &err);
+ CheckNoError(err);
+ CheckGet(db1, ropts, "key0", "value0");
+ CheckGet(db1, ropts, "key1", "value1");
+
+ rocksdb_close(db1);
+ rocksdb_destroy_db(opts, secondary_path, &err);
+ CheckNoError(err);
+
+ rocksdb_options_destroy(db_options);
+ rocksdb_options_destroy(opts);
+ rocksdb_readoptions_destroy(ropts);
+ rocksdb_flushoptions_destroy(flush_opts);
+ }
+
+  // Simple sanity check that setting db_paths in the options works.
+ StartPhase("open_db_paths");
+ {
+ rocksdb_close(db);
+ rocksdb_destroy_db(options, dbname, &err);
+
+ const rocksdb_dbpath_t* paths[1] = {dbpath};
+ rocksdb_options_set_db_paths(options, paths, 1);
+ db = rocksdb_open(options, dbname, &err);
+ CheckNoError(err);
+ }
+
+ StartPhase("filter_with_prefix_seek");
+ {
+ rocksdb_close(db);
+ rocksdb_destroy_db(options, dbname, &err);
+ CheckNoError(err);
+
+ rocksdb_options_set_prefix_extractor(
+ options, rocksdb_slicetransform_create_fixed_prefix(1));
+ rocksdb_filterpolicy_t* filter_policy =
+ rocksdb_filterpolicy_create_bloom_full(8.0);
+ rocksdb_block_based_options_set_filter_policy(table_options, filter_policy);
+ rocksdb_options_set_block_based_table_factory(options, table_options);
+
+ db = rocksdb_open(options, dbname, &err);
+ CheckNoError(err);
+
+ int i;
+ for (i = 0; i < 10; ++i) {
+ char key = '0' + (char)i;
+ rocksdb_put(db, woptions, &key, 1, "", 1, &err);
+ CheckNoError(err);
+ }
+
+    // Flush to generate an L0 file so that the filter will be used later.
+ rocksdb_flushoptions_t* flush_options = rocksdb_flushoptions_create();
+ rocksdb_flushoptions_set_wait(flush_options, 1);
+ rocksdb_flush(db, flush_options, &err);
+ rocksdb_flushoptions_destroy(flush_options);
+ CheckNoError(err);
+
+ rocksdb_readoptions_t* ropts = rocksdb_readoptions_create();
+ rocksdb_iterator_t* iter = rocksdb_create_iterator(db, ropts);
+
+ rocksdb_iter_seek(iter, "0", 1);
+ int cnt = 0;
+ while (rocksdb_iter_valid(iter)) {
+ ++cnt;
+ rocksdb_iter_next(iter);
+ }
+ CheckCondition(10 == cnt);
+
+ rocksdb_iter_destroy(iter);
+ rocksdb_readoptions_destroy(ropts);
+ }
+
+ StartPhase("cancel_all_background_work");
+ rocksdb_cancel_all_background_work(db, 1);
+
+ StartPhase("cleanup");
+ rocksdb_close(db);
+ rocksdb_options_destroy(options);
+ rocksdb_block_based_options_destroy(table_options);
+ rocksdb_readoptions_destroy(roptions);
+ rocksdb_writeoptions_destroy(woptions);
+ rocksdb_compactoptions_destroy(coptions);
+ rocksdb_cache_destroy(cache);
+ rocksdb_comparator_destroy(cmp);
+ rocksdb_dbpath_destroy(dbpath);
+ rocksdb_env_destroy(env);
+
+ fprintf(stderr, "PASS\n");
+ return 0;
+}
+
+#else
+
+int main(void) {
+ fprintf(stderr, "SKIPPED\n");
+ return 0;
+}
+
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/column_family.cc b/src/rocksdb/db/column_family.cc
new file mode 100644
index 000000000..268060ddf
--- /dev/null
+++ b/src/rocksdb/db/column_family.cc
@@ -0,0 +1,1683 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/column_family.h"
+
+#include <algorithm>
+#include <cinttypes>
+#include <limits>
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include "db/blob/blob_file_cache.h"
+#include "db/blob/blob_source.h"
+#include "db/compaction/compaction_picker.h"
+#include "db/compaction/compaction_picker_fifo.h"
+#include "db/compaction/compaction_picker_level.h"
+#include "db/compaction/compaction_picker_universal.h"
+#include "db/db_impl/db_impl.h"
+#include "db/internal_stats.h"
+#include "db/job_context.h"
+#include "db/range_del_aggregator.h"
+#include "db/table_properties_collector.h"
+#include "db/version_set.h"
+#include "db/write_controller.h"
+#include "file/sst_file_manager_impl.h"
+#include "logging/logging.h"
+#include "monitoring/thread_status_util.h"
+#include "options/options_helper.h"
+#include "port/port.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/table.h"
+#include "table/merging_iterator.h"
+#include "util/autovector.h"
+#include "util/cast_util.h"
+#include "util/compression.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+ColumnFamilyHandleImpl::ColumnFamilyHandleImpl(
+ ColumnFamilyData* column_family_data, DBImpl* db, InstrumentedMutex* mutex)
+ : cfd_(column_family_data), db_(db), mutex_(mutex) {
+ if (cfd_ != nullptr) {
+ cfd_->Ref();
+ }
+}
+
+ColumnFamilyHandleImpl::~ColumnFamilyHandleImpl() {
+ if (cfd_ != nullptr) {
+#ifndef ROCKSDB_LITE
+ for (auto& listener : cfd_->ioptions()->listeners) {
+ listener->OnColumnFamilyHandleDeletionStarted(this);
+ }
+#endif // ROCKSDB_LITE
+    // Job id == 0 means that this is not our background process, but rather
+    // a user thread.
+    // Need to hold some shared pointers owned by the initial_cf_options
+    // before the final cleanup finishes.
+ ColumnFamilyOptions initial_cf_options_copy = cfd_->initial_cf_options();
+ JobContext job_context(0);
+ mutex_->Lock();
+ bool dropped = cfd_->IsDropped();
+ if (cfd_->UnrefAndTryDelete()) {
+ if (dropped) {
+ db_->FindObsoleteFiles(&job_context, false, true);
+ }
+ }
+ mutex_->Unlock();
+ if (job_context.HaveSomethingToDelete()) {
+ bool defer_purge =
+ db_->immutable_db_options().avoid_unnecessary_blocking_io;
+ db_->PurgeObsoleteFiles(job_context, defer_purge);
+ }
+ job_context.Clean();
+ }
+}
+
+uint32_t ColumnFamilyHandleImpl::GetID() const { return cfd()->GetID(); }
+
+const std::string& ColumnFamilyHandleImpl::GetName() const {
+ return cfd()->GetName();
+}
+
+Status ColumnFamilyHandleImpl::GetDescriptor(ColumnFamilyDescriptor* desc) {
+#ifndef ROCKSDB_LITE
+ // accessing mutable cf-options requires db mutex.
+ InstrumentedMutexLock l(mutex_);
+ *desc = ColumnFamilyDescriptor(cfd()->GetName(), cfd()->GetLatestCFOptions());
+ return Status::OK();
+#else
+ (void)desc;
+ return Status::NotSupported();
+#endif // !ROCKSDB_LITE
+}
+
+const Comparator* ColumnFamilyHandleImpl::GetComparator() const {
+ return cfd()->user_comparator();
+}
+
+void GetIntTblPropCollectorFactory(
+ const ImmutableCFOptions& ioptions,
+ IntTblPropCollectorFactories* int_tbl_prop_collector_factories) {
+ assert(int_tbl_prop_collector_factories);
+
+ auto& collector_factories = ioptions.table_properties_collector_factories;
+ for (size_t i = 0; i < ioptions.table_properties_collector_factories.size();
+ ++i) {
+ assert(collector_factories[i]);
+ int_tbl_prop_collector_factories->emplace_back(
+ new UserKeyTablePropertiesCollectorFactory(collector_factories[i]));
+ }
+}
+
+Status CheckCompressionSupported(const ColumnFamilyOptions& cf_options) {
+ if (!cf_options.compression_per_level.empty()) {
+ for (size_t level = 0; level < cf_options.compression_per_level.size();
+ ++level) {
+ if (!CompressionTypeSupported(cf_options.compression_per_level[level])) {
+ return Status::InvalidArgument(
+ "Compression type " +
+ CompressionTypeToString(cf_options.compression_per_level[level]) +
+ " is not linked with the binary.");
+ }
+ }
+ } else {
+ if (!CompressionTypeSupported(cf_options.compression)) {
+ return Status::InvalidArgument(
+ "Compression type " +
+ CompressionTypeToString(cf_options.compression) +
+ " is not linked with the binary.");
+ }
+ }
+ if (cf_options.compression_opts.zstd_max_train_bytes > 0) {
+ if (cf_options.compression_opts.use_zstd_dict_trainer) {
+ if (!ZSTD_TrainDictionarySupported()) {
+ return Status::InvalidArgument(
+ "zstd dictionary trainer cannot be used because ZSTD 1.1.3+ "
+ "is not linked with the binary.");
+ }
+ } else if (!ZSTD_FinalizeDictionarySupported()) {
+ return Status::InvalidArgument(
+ "zstd finalizeDictionary cannot be used because ZSTD 1.4.5+ "
+ "is not linked with the binary.");
+ }
+ if (cf_options.compression_opts.max_dict_bytes == 0) {
+ return Status::InvalidArgument(
+ "The dictionary size limit (`CompressionOptions::max_dict_bytes`) "
+ "should be nonzero if we're using zstd's dictionary generator.");
+ }
+ }
+
+ if (!CompressionTypeSupported(cf_options.blob_compression_type)) {
+ std::ostringstream oss;
+ oss << "The specified blob compression type "
+ << CompressionTypeToString(cf_options.blob_compression_type)
+ << " is not available.";
+
+ return Status::InvalidArgument(oss.str());
+ }
+
+ return Status::OK();
+}
+
+Status CheckConcurrentWritesSupported(const ColumnFamilyOptions& cf_options) {
+ if (cf_options.inplace_update_support) {
+ return Status::InvalidArgument(
+        "In-place memtable updates (inplace_update_support) are not "
+        "compatible with concurrent writes (allow_concurrent_memtable_write)");
+ }
+ if (!cf_options.memtable_factory->IsInsertConcurrentlySupported()) {
+ return Status::InvalidArgument(
+        "Memtable doesn't support concurrent writes "
+        "(allow_concurrent_memtable_write)");
+ }
+ return Status::OK();
+}
+
+Status CheckCFPathsSupported(const DBOptions& db_options,
+ const ColumnFamilyOptions& cf_options) {
+ // More than one cf_paths are supported only in universal
+ // and level compaction styles. This function also checks the case
+ // in which cf_paths is not specified, which results in db_paths
+ // being used.
+ if ((cf_options.compaction_style != kCompactionStyleUniversal) &&
+ (cf_options.compaction_style != kCompactionStyleLevel)) {
+ if (cf_options.cf_paths.size() > 1) {
+ return Status::NotSupported(
+          "More than one CF path is only supported in "
+ "universal and level compaction styles. ");
+ } else if (cf_options.cf_paths.empty() && db_options.db_paths.size() > 1) {
+ return Status::NotSupported(
+          "More than one DB path is only supported in "
+ "universal and level compaction styles. ");
+ }
+ }
+ return Status::OK();
+}
+
+namespace {
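+// Sentinel values (UINT64_MAX - 1) meaning "not explicitly set by the user";
+// SanitizeOptions() below replaces them with concrete defaults.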
+const uint64_t kDefaultTtl = 0xfffffffffffffffe;
+const uint64_t kDefaultPeriodicCompSecs = 0xfffffffffffffffe;
+} // anonymous namespace
+
+ColumnFamilyOptions SanitizeOptions(const ImmutableDBOptions& db_options,
+ const ColumnFamilyOptions& src) {
+ ColumnFamilyOptions result = src;
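+  // Clamp write_buffer_size to [64 KB, 4 GB - 1] on 32-bit builds and to
+  // [64 KB, 64 GB] on 64-bit builds.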
+ size_t clamp_max = std::conditional<
+ sizeof(size_t) == 4, std::integral_constant<size_t, 0xffffffff>,
+ std::integral_constant<uint64_t, 64ull << 30>>::type::value;
+ ClipToRange(&result.write_buffer_size, (static_cast<size_t>(64)) << 10,
+ clamp_max);
+  // if the user sets arena_block_size, we trust them to use this value.
+  // Otherwise, calculate a proper value from write_buffer_size.
+ if (result.arena_block_size <= 0) {
+ result.arena_block_size =
+ std::min(size_t{1024 * 1024}, result.write_buffer_size / 8);
+
+ // Align up to 4k
+ const size_t align = 4 * 1024;
+ result.arena_block_size =
+ ((result.arena_block_size + align - 1) / align) * align;
+ }
+ result.min_write_buffer_number_to_merge =
+ std::min(result.min_write_buffer_number_to_merge,
+ result.max_write_buffer_number - 1);
+ if (result.min_write_buffer_number_to_merge < 1) {
+ result.min_write_buffer_number_to_merge = 1;
+ }
+
+ if (db_options.atomic_flush && result.min_write_buffer_number_to_merge > 1) {
+ ROCKS_LOG_WARN(
+ db_options.logger,
+ "Currently, if atomic_flush is true, then triggering flush for any "
+ "column family internally (non-manual flush) will trigger flushing "
+ "all column families even if the number of memtables is smaller "
+        "than min_write_buffer_number_to_merge. Therefore, configuring "
+        "min_write_buffer_number_to_merge > 1 is not compatible and should "
+        "be sanitized to 1. Not doing so will lead to data loss and "
+ "inconsistent state across multiple column families when WAL is "
+ "disabled, which is a common setting for atomic flush");
+
+ result.min_write_buffer_number_to_merge = 1;
+ }
+
+ if (result.num_levels < 1) {
+ result.num_levels = 1;
+ }
+ if (result.compaction_style == kCompactionStyleLevel &&
+ result.num_levels < 2) {
+ result.num_levels = 2;
+ }
+
+ if (result.compaction_style == kCompactionStyleUniversal &&
+ db_options.allow_ingest_behind && result.num_levels < 3) {
+ result.num_levels = 3;
+ }
+
+ if (result.max_write_buffer_number < 2) {
+ result.max_write_buffer_number = 2;
+ }
+  // fall back to max_write_buffer_number_to_maintain if
+  // max_write_buffer_size_to_maintain is not set
+ if (result.max_write_buffer_size_to_maintain < 0) {
+ result.max_write_buffer_size_to_maintain =
+ result.max_write_buffer_number *
+ static_cast<int64_t>(result.write_buffer_size);
+ } else if (result.max_write_buffer_size_to_maintain == 0 &&
+ result.max_write_buffer_number_to_maintain < 0) {
+ result.max_write_buffer_number_to_maintain = result.max_write_buffer_number;
+ }
+ // bloom filter size shouldn't exceed 1/4 of memtable size.
+ if (result.memtable_prefix_bloom_size_ratio > 0.25) {
+ result.memtable_prefix_bloom_size_ratio = 0.25;
+ } else if (result.memtable_prefix_bloom_size_ratio < 0) {
+ result.memtable_prefix_bloom_size_ratio = 0;
+ }
+
+ if (!result.prefix_extractor) {
+ assert(result.memtable_factory);
+ Slice name = result.memtable_factory->Name();
+ if (name.compare("HashSkipListRepFactory") == 0 ||
+ name.compare("HashLinkListRepFactory") == 0) {
+ result.memtable_factory = std::make_shared<SkipListFactory>();
+ }
+ }
+
+ if (result.compaction_style == kCompactionStyleFIFO) {
+ // since we delete level0 files in FIFO compaction when there are too many
+ // of them, these options don't really mean anything
+ result.level0_slowdown_writes_trigger = std::numeric_limits<int>::max();
+ result.level0_stop_writes_trigger = std::numeric_limits<int>::max();
+ }
+
+ if (result.max_bytes_for_level_multiplier <= 0) {
+ result.max_bytes_for_level_multiplier = 1;
+ }
+
+ if (result.level0_file_num_compaction_trigger == 0) {
+ ROCKS_LOG_WARN(db_options.logger,
+ "level0_file_num_compaction_trigger cannot be 0");
+ result.level0_file_num_compaction_trigger = 1;
+ }
+
+ if (result.level0_stop_writes_trigger <
+ result.level0_slowdown_writes_trigger ||
+ result.level0_slowdown_writes_trigger <
+ result.level0_file_num_compaction_trigger) {
+ ROCKS_LOG_WARN(db_options.logger,
+ "This condition must be satisfied: "
+ "level0_stop_writes_trigger(%d) >= "
+ "level0_slowdown_writes_trigger(%d) >= "
+ "level0_file_num_compaction_trigger(%d)",
+ result.level0_stop_writes_trigger,
+ result.level0_slowdown_writes_trigger,
+ result.level0_file_num_compaction_trigger);
+ if (result.level0_slowdown_writes_trigger <
+ result.level0_file_num_compaction_trigger) {
+ result.level0_slowdown_writes_trigger =
+ result.level0_file_num_compaction_trigger;
+ }
+ if (result.level0_stop_writes_trigger <
+ result.level0_slowdown_writes_trigger) {
+ result.level0_stop_writes_trigger = result.level0_slowdown_writes_trigger;
+ }
+ ROCKS_LOG_WARN(db_options.logger,
+ "Adjust the value to "
+                   "level0_stop_writes_trigger(%d) "
+                   "level0_slowdown_writes_trigger(%d) "
+ "level0_file_num_compaction_trigger(%d)",
+ result.level0_stop_writes_trigger,
+ result.level0_slowdown_writes_trigger,
+ result.level0_file_num_compaction_trigger);
+ }
+
+ if (result.soft_pending_compaction_bytes_limit == 0) {
+ result.soft_pending_compaction_bytes_limit =
+ result.hard_pending_compaction_bytes_limit;
+ } else if (result.hard_pending_compaction_bytes_limit > 0 &&
+ result.soft_pending_compaction_bytes_limit >
+ result.hard_pending_compaction_bytes_limit) {
+ result.soft_pending_compaction_bytes_limit =
+ result.hard_pending_compaction_bytes_limit;
+ }
+
+#ifndef ROCKSDB_LITE
+  // When the DB is stopped, it's possible that there are some .trash files
+  // that were not deleted yet. When we open the DB, we will find these .trash
+  // files and schedule them to be deleted (or delete them immediately if
+  // SstFileManager was not used).
+ auto sfm =
+ static_cast<SstFileManagerImpl*>(db_options.sst_file_manager.get());
+ for (size_t i = 0; i < result.cf_paths.size(); i++) {
+ DeleteScheduler::CleanupDirectory(db_options.env, sfm,
+ result.cf_paths[i].path)
+ .PermitUncheckedError();
+ }
+#endif
+
+ if (result.cf_paths.empty()) {
+ result.cf_paths = db_options.db_paths;
+ }
+
+ if (result.level_compaction_dynamic_level_bytes) {
+ if (result.compaction_style != kCompactionStyleLevel) {
+ ROCKS_LOG_WARN(db_options.info_log.get(),
+                     "level_compaction_dynamic_level_bytes only makes sense "
+                     "for level-based compaction");
+ result.level_compaction_dynamic_level_bytes = false;
+ } else if (result.cf_paths.size() > 1U) {
+      // we don't yet know how to make this feature and multiple
+      // DB paths work together.
+      ROCKS_LOG_WARN(db_options.info_log.get(),
+                     "multiple cf_paths/db_paths and "
+                     "level_compaction_dynamic_level_bytes "
+                     "can't be used together");
+ result.level_compaction_dynamic_level_bytes = false;
+ }
+ }
+
+ if (result.max_compaction_bytes == 0) {
+ result.max_compaction_bytes = result.target_file_size_base * 25;
+ }
+
+ bool is_block_based_table = (result.table_factory->IsInstanceOf(
+ TableFactory::kBlockBasedTableName()));
+
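+  // 30 days, in seconds.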
+ const uint64_t kAdjustedTtl = 30 * 24 * 60 * 60;
+ if (result.ttl == kDefaultTtl) {
+ if (is_block_based_table &&
+ result.compaction_style != kCompactionStyleFIFO) {
+ result.ttl = kAdjustedTtl;
+ } else {
+ result.ttl = 0;
+ }
+ }
+
+ const uint64_t kAdjustedPeriodicCompSecs = 30 * 24 * 60 * 60;
+
+ // Turn on periodic compactions and set them to occur once every 30 days if
+ // compaction filters are used and periodic_compaction_seconds is set to the
+ // default value.
+ if (result.compaction_style != kCompactionStyleFIFO) {
+ if ((result.compaction_filter != nullptr ||
+ result.compaction_filter_factory != nullptr) &&
+ result.periodic_compaction_seconds == kDefaultPeriodicCompSecs &&
+ is_block_based_table) {
+ result.periodic_compaction_seconds = kAdjustedPeriodicCompSecs;
+ }
+ } else {
+ // result.compaction_style == kCompactionStyleFIFO
+ if (result.ttl == 0) {
+ if (is_block_based_table) {
+ if (result.periodic_compaction_seconds == kDefaultPeriodicCompSecs) {
+ result.periodic_compaction_seconds = kAdjustedPeriodicCompSecs;
+ }
+ result.ttl = result.periodic_compaction_seconds;
+ }
+ } else if (result.periodic_compaction_seconds != 0) {
+ result.ttl = std::min(result.ttl, result.periodic_compaction_seconds);
+ }
+ }
+
+  // TTL compactions work similarly to periodic compactions in universal
+  // compaction in most cases. So, if ttl is set, execute the periodic
+  // compaction codepath.
+ if (result.compaction_style == kCompactionStyleUniversal && result.ttl != 0) {
+ if (result.periodic_compaction_seconds != 0) {
+ result.periodic_compaction_seconds =
+ std::min(result.ttl, result.periodic_compaction_seconds);
+ } else {
+ result.periodic_compaction_seconds = result.ttl;
+ }
+ }
+
+ if (result.periodic_compaction_seconds == kDefaultPeriodicCompSecs) {
+ result.periodic_compaction_seconds = 0;
+ }
+
+ return result;
+}
+
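+// kSVInUse is a sentinel pointer (the address of `dummy`, never a real
+// SuperVersion) placed in the thread-local cache while a thread is using the
+// cached SuperVersion; kSVObsolete marks a cached entry as stale.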
+int SuperVersion::dummy = 0;
+void* const SuperVersion::kSVInUse = &SuperVersion::dummy;
+void* const SuperVersion::kSVObsolete = nullptr;
+
+SuperVersion::~SuperVersion() {
+ for (auto td : to_delete) {
+ delete td;
+ }
+}
+
+SuperVersion* SuperVersion::Ref() {
+ refs.fetch_add(1, std::memory_order_relaxed);
+ return this;
+}
+
+bool SuperVersion::Unref() {
+ // fetch_sub returns the previous value of ref
+ uint32_t previous_refs = refs.fetch_sub(1);
+ assert(previous_refs > 0);
+ return previous_refs == 1;
+}
+
+void SuperVersion::Cleanup() {
+ assert(refs.load(std::memory_order_relaxed) == 0);
+  // Since this SuperVersion object is being deleted,
+  // decrement the reference to the immutable MemtableList
+  // that this SV object was pointing to.
+ imm->Unref(&to_delete);
+ MemTable* m = mem->Unref();
+ if (m != nullptr) {
+ auto* memory_usage = current->cfd()->imm()->current_memory_usage();
+ assert(*memory_usage >= m->ApproximateMemoryUsage());
+ *memory_usage -= m->ApproximateMemoryUsage();
+ to_delete.push_back(m);
+ }
+ current->Unref();
+ cfd->UnrefAndTryDelete();
+}
+
+void SuperVersion::Init(ColumnFamilyData* new_cfd, MemTable* new_mem,
+ MemTableListVersion* new_imm, Version* new_current) {
+ cfd = new_cfd;
+ mem = new_mem;
+ imm = new_imm;
+ current = new_current;
+ cfd->Ref();
+ mem->Ref();
+ imm->Ref();
+ current->Ref();
+ refs.store(1, std::memory_order_relaxed);
+}
+
+namespace {
+void SuperVersionUnrefHandle(void* ptr) {
+ // UnrefHandle is called when a thread exits or a ThreadLocalPtr gets
+ // destroyed. When the former happens, the thread shouldn't see kSVInUse.
+ // When the latter happens, only super_version_ holds a reference
+ // to ColumnFamilyData, so no further queries are possible.
+ SuperVersion* sv = static_cast<SuperVersion*>(ptr);
+ bool was_last_ref __attribute__((__unused__));
+ was_last_ref = sv->Unref();
+ // Thread-local SuperVersions can't outlive ColumnFamilyData::super_version_.
+ // This is important because we can't do SuperVersion cleanup here.
+ // That would require locking DB mutex, which would deadlock because
+ // SuperVersionUnrefHandle is called with locked ThreadLocalPtr mutex.
+ assert(!was_last_ref);
+}
+} // anonymous namespace
+
+std::vector<std::string> ColumnFamilyData::GetDbPaths() const {
+ std::vector<std::string> paths;
+ paths.reserve(ioptions_.cf_paths.size());
+ for (const DbPath& db_path : ioptions_.cf_paths) {
+ paths.emplace_back(db_path.path);
+ }
+ return paths;
+}
+
+const uint32_t ColumnFamilyData::kDummyColumnFamilyDataId =
+ std::numeric_limits<uint32_t>::max();
+
+ColumnFamilyData::ColumnFamilyData(
+ uint32_t id, const std::string& name, Version* _dummy_versions,
+ Cache* _table_cache, WriteBufferManager* write_buffer_manager,
+ const ColumnFamilyOptions& cf_options, const ImmutableDBOptions& db_options,
+ const FileOptions* file_options, ColumnFamilySet* column_family_set,
+ BlockCacheTracer* const block_cache_tracer,
+ const std::shared_ptr<IOTracer>& io_tracer, const std::string& db_id,
+ const std::string& db_session_id)
+ : id_(id),
+ name_(name),
+ dummy_versions_(_dummy_versions),
+ current_(nullptr),
+ refs_(0),
+ initialized_(false),
+ dropped_(false),
+ internal_comparator_(cf_options.comparator),
+ initial_cf_options_(SanitizeOptions(db_options, cf_options)),
+ ioptions_(db_options, initial_cf_options_),
+ mutable_cf_options_(initial_cf_options_),
+ is_delete_range_supported_(
+ cf_options.table_factory->IsDeleteRangeSupported()),
+ write_buffer_manager_(write_buffer_manager),
+ mem_(nullptr),
+ imm_(ioptions_.min_write_buffer_number_to_merge,
+ ioptions_.max_write_buffer_number_to_maintain,
+ ioptions_.max_write_buffer_size_to_maintain),
+ super_version_(nullptr),
+ super_version_number_(0),
+ local_sv_(new ThreadLocalPtr(&SuperVersionUnrefHandle)),
+ next_(nullptr),
+ prev_(nullptr),
+ log_number_(0),
+ flush_reason_(FlushReason::kOthers),
+ column_family_set_(column_family_set),
+ queued_for_flush_(false),
+ queued_for_compaction_(false),
+ prev_compaction_needed_bytes_(0),
+ allow_2pc_(db_options.allow_2pc),
+ last_memtable_id_(0),
+ db_paths_registered_(false),
+ mempurge_used_(false) {
+ if (id_ != kDummyColumnFamilyDataId) {
+    // TODO(cc): RegisterDbPaths can be expensive; consider moving it
+    // outside of this constructor, which might be called with the db mutex
+    // held.
+    // TODO(cc): consider using ioptions_.fs; currently some tests rely on
+    // EnvWrapper, which is the main reason we use env here.
+ Status s = ioptions_.env->RegisterDbPaths(GetDbPaths());
+ if (s.ok()) {
+ db_paths_registered_ = true;
+ } else {
+ ROCKS_LOG_ERROR(
+ ioptions_.logger,
+ "Failed to register data paths of column family (id: %d, name: %s)",
+ id_, name_.c_str());
+ }
+ }
+ Ref();
+
+ // Convert user defined table properties collector factories to internal ones.
+ GetIntTblPropCollectorFactory(ioptions_, &int_tbl_prop_collector_factories_);
+
+ // if _dummy_versions is nullptr, then this is a dummy column family.
+ if (_dummy_versions != nullptr) {
+ internal_stats_.reset(
+ new InternalStats(ioptions_.num_levels, ioptions_.clock, this));
+ table_cache_.reset(new TableCache(ioptions_, file_options, _table_cache,
+ block_cache_tracer, io_tracer,
+ db_session_id));
+ blob_file_cache_.reset(
+ new BlobFileCache(_table_cache, ioptions(), soptions(), id_,
+ internal_stats_->GetBlobFileReadHist(), io_tracer));
+ blob_source_.reset(new BlobSource(ioptions(), db_id, db_session_id,
+ blob_file_cache_.get()));
+
+ if (ioptions_.compaction_style == kCompactionStyleLevel) {
+ compaction_picker_.reset(
+ new LevelCompactionPicker(ioptions_, &internal_comparator_));
+#ifndef ROCKSDB_LITE
+ } else if (ioptions_.compaction_style == kCompactionStyleUniversal) {
+ compaction_picker_.reset(
+ new UniversalCompactionPicker(ioptions_, &internal_comparator_));
+ } else if (ioptions_.compaction_style == kCompactionStyleFIFO) {
+ compaction_picker_.reset(
+ new FIFOCompactionPicker(ioptions_, &internal_comparator_));
+ } else if (ioptions_.compaction_style == kCompactionStyleNone) {
+ compaction_picker_.reset(
+ new NullCompactionPicker(ioptions_, &internal_comparator_));
+ ROCKS_LOG_WARN(ioptions_.logger,
+ "Column family %s does not use any background compaction. "
+ "Compactions can only be done via CompactFiles\n",
+ GetName().c_str());
+#endif // !ROCKSDB_LITE
+ } else {
+ ROCKS_LOG_ERROR(ioptions_.logger,
+ "Unable to recognize the specified compaction style %d. "
+ "Column family %s will use kCompactionStyleLevel.\n",
+ ioptions_.compaction_style, GetName().c_str());
+ compaction_picker_.reset(
+ new LevelCompactionPicker(ioptions_, &internal_comparator_));
+ }
+
+ if (column_family_set_->NumberOfColumnFamilies() < 10) {
+ ROCKS_LOG_INFO(ioptions_.logger,
+ "--------------- Options for column family [%s]:\n",
+ name.c_str());
+ initial_cf_options_.Dump(ioptions_.logger);
+ } else {
+ ROCKS_LOG_INFO(ioptions_.logger, "\t(skipping printing options)\n");
+ }
+ }
+
+ RecalculateWriteStallConditions(mutable_cf_options_);
+
+ if (cf_options.table_factory->IsInstanceOf(
+ TableFactory::kBlockBasedTableName()) &&
+ cf_options.table_factory->GetOptions<BlockBasedTableOptions>()) {
+ const BlockBasedTableOptions* bbto =
+ cf_options.table_factory->GetOptions<BlockBasedTableOptions>();
+ const auto& options_overrides = bbto->cache_usage_options.options_overrides;
+ const auto file_metadata_charged =
+ options_overrides.at(CacheEntryRole::kFileMetadata).charged;
+ if (bbto->block_cache &&
+ file_metadata_charged == CacheEntryRoleOptions::Decision::kEnabled) {
+ // TODO(hx235): Add a `ConcurrentCacheReservationManager` at DB scope
+ // responsible for reservation of `ObsoleteFileInfo` so that we can keep
+ // this `file_metadata_cache_res_mgr_` nonconcurrent
+ file_metadata_cache_res_mgr_.reset(new ConcurrentCacheReservationManager(
+ std::make_shared<
+ CacheReservationManagerImpl<CacheEntryRole::kFileMetadata>>(
+ bbto->block_cache)));
+ }
+ }
+}
+
+// DB mutex held
+ColumnFamilyData::~ColumnFamilyData() {
+ assert(refs_.load(std::memory_order_relaxed) == 0);
+ // remove from linked list
+ auto prev = prev_;
+ auto next = next_;
+ prev->next_ = next;
+ next->prev_ = prev;
+
+ if (!dropped_ && column_family_set_ != nullptr) {
+ // If it's dropped, it's already removed from column family set
+ // If column_family_set_ == nullptr, this is dummy CFD and not in
+ // ColumnFamilySet
+ column_family_set_->RemoveColumnFamily(this);
+ }
+
+ if (current_ != nullptr) {
+ current_->Unref();
+ }
+
+ // It would be wrong if this ColumnFamilyData were still in flush_queue_ or
+ // compaction_queue_ when we destroy it.
+ assert(!queued_for_flush_);
+ assert(!queued_for_compaction_);
+ assert(super_version_ == nullptr);
+
+ if (dummy_versions_ != nullptr) {
+ // List must be empty
+ assert(dummy_versions_->Next() == dummy_versions_);
+ bool deleted __attribute__((__unused__));
+ deleted = dummy_versions_->Unref();
+ assert(deleted);
+ }
+
+ if (mem_ != nullptr) {
+ delete mem_->Unref();
+ }
+ autovector<MemTable*> to_delete;
+ imm_.current()->Unref(&to_delete);
+ for (MemTable* m : to_delete) {
+ delete m;
+ }
+
+ if (db_paths_registered_) {
+ // TODO(cc): consider using ioptions_.fs; currently some tests rely on
+ // EnvWrapper, which is the main reason we use env here.
+ Status s = ioptions_.env->UnregisterDbPaths(GetDbPaths());
+ if (!s.ok()) {
+ ROCKS_LOG_ERROR(
+ ioptions_.logger,
+ "Failed to unregister data paths of column family (id: %d, name: %s)",
+ id_, name_.c_str());
+ }
+ }
+}
+
+bool ColumnFamilyData::UnrefAndTryDelete() {
+ int old_refs = refs_.fetch_sub(1);
+ assert(old_refs > 0);
+
+ if (old_refs == 1) {
+ assert(super_version_ == nullptr);
+ delete this;
+ return true;
+ }
+
+ if (old_refs == 2 && super_version_ != nullptr) {
+ // Only the super_version_ holds me
+ SuperVersion* sv = super_version_;
+ super_version_ = nullptr;
+
+ // Release SuperVersion references kept in ThreadLocalPtr.
+ local_sv_.reset();
+
+ if (sv->Unref()) {
+ // Note: sv will delete this ColumnFamilyData during Cleanup()
+ assert(sv->cfd == this);
+ sv->Cleanup();
+ delete sv;
+ return true;
+ }
+ }
+ return false;
+}
+
+void ColumnFamilyData::SetDropped() {
+ // can't drop default CF
+ assert(id_ != 0);
+ dropped_ = true;
+ write_controller_token_.reset();
+
+ // remove from column_family_set
+ column_family_set_->RemoveColumnFamily(this);
+}
+
+ColumnFamilyOptions ColumnFamilyData::GetLatestCFOptions() const {
+ return BuildColumnFamilyOptions(initial_cf_options_, mutable_cf_options_);
+}
+
+uint64_t ColumnFamilyData::OldestLogToKeep() {
+ auto current_log = GetLogNumber();
+
+ if (allow_2pc_) {
+ auto imm_prep_log = imm()->PrecomputeMinLogContainingPrepSection();
+ auto mem_prep_log = mem()->GetMinLogContainingPrepSection();
+
+ if (imm_prep_log > 0 && imm_prep_log < current_log) {
+ current_log = imm_prep_log;
+ }
+
+ if (mem_prep_log > 0 && mem_prep_log < current_log) {
+ current_log = mem_prep_log;
+ }
+ }
+
+ return current_log;
+}
+
+const double kIncSlowdownRatio = 0.8;
+const double kDecSlowdownRatio = 1 / kIncSlowdownRatio;
+const double kNearStopSlowdownRatio = 0.6;
+const double kDelayRecoverSlowdownRatio = 1.4;
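+// Illustrative example of how these ratios are applied (assuming a current
+// delayed write rate of 16 MB/s): in SetupDelay() below, compaction debt that
+// has not shrunk multiplies the rate by kIncSlowdownRatio (16 -> 12.8 MB/s),
+// a near-stop or stop condition multiplies it by kNearStopSlowdownRatio
+// (16 -> 9.6 MB/s), and paid-down debt multiplies it by kDecSlowdownRatio
+// (16 -> 20 MB/s), capped at the user-specified maximum.
+// kDelayRecoverSlowdownRatio is applied in RecalculateWriteStallConditions()
+// when the DB recovers from a delay condition (16 -> 22.4 MB/s).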
+
+namespace {
+// If penalize_stop is true, we further reduce slowdown rate.
+std::unique_ptr<WriteControllerToken> SetupDelay(
+ WriteController* write_controller, uint64_t compaction_needed_bytes,
+ uint64_t prev_compaction_need_bytes, bool penalize_stop,
+ bool auto_compactions_disabled) {
+ const uint64_t kMinWriteRate = 16 * 1024u; // Minimum write rate 16KB/s.
+
+ uint64_t max_write_rate = write_controller->max_delayed_write_rate();
+ uint64_t write_rate = write_controller->delayed_write_rate();
+
+ if (auto_compactions_disabled) {
+ // When auto compaction is disabled, always use the value the user gave.
+ write_rate = max_write_rate;
+ } else if (write_controller->NeedsDelay() && max_write_rate > kMinWriteRate) {
+ // If the user gives a rate less than kMinWriteRate, don't adjust it.
+ //
+ // If already delayed, we need to adjust based on previous compaction debt.
+ // When two or more column families require delay, we always increase or
+ // reduce the write rate based on information for one single column family.
+ // It is likely to be OK, but we can improve it if there is a problem.
+ // Ignore the compaction_needed_bytes = 0 case because
+ // compaction_needed_bytes is only available in level-based compaction.
+ //
+ // If the compaction debt stays the same as before, we also further slow
+ // down. It usually means a memtable is full. It's mainly for the case
+ // where both flush and compaction are much slower than the rate at which
+ // we insert into memtables, so we need to actively slow down before we get
+ // a feedback signal from compactions and flushes to avoid a full stop
+ // caused by hitting the max write buffer number.
+ //
+ // If the DB has just fallen into the stop condition, we need to further
+ // reduce the write rate to avoid the stop condition.
+ if (penalize_stop) {
+ // Penalize the near stop or stop condition by more aggressive slowdown.
+ // This is to provide the long term slowdown increase signal.
+ // The penalty is more than the reward of recovering to the normal
+ // condition.
+ write_rate = static_cast<uint64_t>(static_cast<double>(write_rate) *
+ kNearStopSlowdownRatio);
+ if (write_rate < kMinWriteRate) {
+ write_rate = kMinWriteRate;
+ }
+ } else if (prev_compaction_need_bytes > 0 &&
+ prev_compaction_need_bytes <= compaction_needed_bytes) {
+ write_rate = static_cast<uint64_t>(static_cast<double>(write_rate) *
+ kIncSlowdownRatio);
+ if (write_rate < kMinWriteRate) {
+ write_rate = kMinWriteRate;
+ }
+ } else if (prev_compaction_need_bytes > compaction_needed_bytes) {
+ // We are speeding up by a ratio of kDecSlowdownRatio when we have paid
+ // down compaction debt. But we'll never speed up beyond the write rate
+ // given by the user.
+ write_rate = static_cast<uint64_t>(static_cast<double>(write_rate) *
+ kDecSlowdownRatio);
+ if (write_rate > max_write_rate) {
+ write_rate = max_write_rate;
+ }
+ }
+ }
+ return write_controller->GetDelayToken(write_rate);
+}
+
+int GetL0ThresholdSpeedupCompaction(int level0_file_num_compaction_trigger,
+ int level0_slowdown_writes_trigger) {
+ // SanitizeOptions() ensures it.
+ assert(level0_file_num_compaction_trigger <= level0_slowdown_writes_trigger);
+
+ if (level0_file_num_compaction_trigger < 0) {
+ return std::numeric_limits<int>::max();
+ }
+
+ const int64_t twice_level0_trigger =
+ static_cast<int64_t>(level0_file_num_compaction_trigger) * 2;
+
+ const int64_t one_fourth_trigger_slowdown =
+ static_cast<int64_t>(level0_file_num_compaction_trigger) +
+ ((level0_slowdown_writes_trigger - level0_file_num_compaction_trigger) /
+ 4);
+
+ assert(twice_level0_trigger >= 0);
+ assert(one_fourth_trigger_slowdown >= 0);
+
+ // 1/4 of the way between the L0 compaction trigger threshold and the
+ // slowdown condition, or twice the compaction trigger, whichever is
+ // smaller.
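+ // For example, with level0_file_num_compaction_trigger = 4 and
+ // level0_slowdown_writes_trigger = 20, twice_level0_trigger is 8 and
+ // one_fourth_trigger_slowdown is 4 + (20 - 4) / 4 = 8, so compaction is
+ // sped up once L0 reaches 8 files.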
+ int64_t res = std::min(twice_level0_trigger, one_fourth_trigger_slowdown);
+ if (res >= std::numeric_limits<int32_t>::max()) {
+ return std::numeric_limits<int32_t>::max();
+ } else {
+ // res fits in int
+ return static_cast<int>(res);
+ }
+}
+} // anonymous namespace
+
+std::pair<WriteStallCondition, ColumnFamilyData::WriteStallCause>
+ColumnFamilyData::GetWriteStallConditionAndCause(
+ int num_unflushed_memtables, int num_l0_files,
+ uint64_t num_compaction_needed_bytes,
+ const MutableCFOptions& mutable_cf_options,
+ const ImmutableCFOptions& immutable_cf_options) {
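+ // Conditions are checked from most to least severe: the stop conditions
+ // (memtable count, L0 file count, pending compaction bytes) take precedence
+ // over the corresponding delay conditions, and kNormal is returned only
+ // when none of them apply.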
+ if (num_unflushed_memtables >= mutable_cf_options.max_write_buffer_number) {
+ return {WriteStallCondition::kStopped, WriteStallCause::kMemtableLimit};
+ } else if (!mutable_cf_options.disable_auto_compactions &&
+ num_l0_files >= mutable_cf_options.level0_stop_writes_trigger) {
+ return {WriteStallCondition::kStopped, WriteStallCause::kL0FileCountLimit};
+ } else if (!mutable_cf_options.disable_auto_compactions &&
+ mutable_cf_options.hard_pending_compaction_bytes_limit > 0 &&
+ num_compaction_needed_bytes >=
+ mutable_cf_options.hard_pending_compaction_bytes_limit) {
+ return {WriteStallCondition::kStopped,
+ WriteStallCause::kPendingCompactionBytes};
+ } else if (mutable_cf_options.max_write_buffer_number > 3 &&
+ num_unflushed_memtables >=
+ mutable_cf_options.max_write_buffer_number - 1 &&
+ num_unflushed_memtables - 1 >=
+ immutable_cf_options.min_write_buffer_number_to_merge) {
+ return {WriteStallCondition::kDelayed, WriteStallCause::kMemtableLimit};
+ } else if (!mutable_cf_options.disable_auto_compactions &&
+ mutable_cf_options.level0_slowdown_writes_trigger >= 0 &&
+ num_l0_files >=
+ mutable_cf_options.level0_slowdown_writes_trigger) {
+ return {WriteStallCondition::kDelayed, WriteStallCause::kL0FileCountLimit};
+ } else if (!mutable_cf_options.disable_auto_compactions &&
+ mutable_cf_options.soft_pending_compaction_bytes_limit > 0 &&
+ num_compaction_needed_bytes >=
+ mutable_cf_options.soft_pending_compaction_bytes_limit) {
+ return {WriteStallCondition::kDelayed,
+ WriteStallCause::kPendingCompactionBytes};
+ }
+ return {WriteStallCondition::kNormal, WriteStallCause::kNone};
+}
+
+WriteStallCondition ColumnFamilyData::RecalculateWriteStallConditions(
+ const MutableCFOptions& mutable_cf_options) {
+ auto write_stall_condition = WriteStallCondition::kNormal;
+ if (current_ != nullptr) {
+ auto* vstorage = current_->storage_info();
+ auto write_controller = column_family_set_->write_controller_;
+ uint64_t compaction_needed_bytes =
+ vstorage->estimated_compaction_needed_bytes();
+
+ auto write_stall_condition_and_cause = GetWriteStallConditionAndCause(
+ imm()->NumNotFlushed(), vstorage->l0_delay_trigger_count(),
+ vstorage->estimated_compaction_needed_bytes(), mutable_cf_options,
+ *ioptions());
+ write_stall_condition = write_stall_condition_and_cause.first;
+ auto write_stall_cause = write_stall_condition_and_cause.second;
+
+ bool was_stopped = write_controller->IsStopped();
+ bool needed_delay = write_controller->NeedsDelay();
+
+ if (write_stall_condition == WriteStallCondition::kStopped &&
+ write_stall_cause == WriteStallCause::kMemtableLimit) {
+ write_controller_token_ = write_controller->GetStopToken();
+ internal_stats_->AddCFStats(InternalStats::MEMTABLE_LIMIT_STOPS, 1);
+ ROCKS_LOG_WARN(
+ ioptions_.logger,
+ "[%s] Stopping writes because we have %d immutable memtables "
+ "(waiting for flush), max_write_buffer_number is set to %d",
+ name_.c_str(), imm()->NumNotFlushed(),
+ mutable_cf_options.max_write_buffer_number);
+ } else if (write_stall_condition == WriteStallCondition::kStopped &&
+ write_stall_cause == WriteStallCause::kL0FileCountLimit) {
+ write_controller_token_ = write_controller->GetStopToken();
+ internal_stats_->AddCFStats(InternalStats::L0_FILE_COUNT_LIMIT_STOPS, 1);
+ if (compaction_picker_->IsLevel0CompactionInProgress()) {
+ internal_stats_->AddCFStats(
+ InternalStats::LOCKED_L0_FILE_COUNT_LIMIT_STOPS, 1);
+ }
+ ROCKS_LOG_WARN(ioptions_.logger,
+ "[%s] Stopping writes because we have %d level-0 files",
+ name_.c_str(), vstorage->l0_delay_trigger_count());
+ } else if (write_stall_condition == WriteStallCondition::kStopped &&
+ write_stall_cause == WriteStallCause::kPendingCompactionBytes) {
+ write_controller_token_ = write_controller->GetStopToken();
+ internal_stats_->AddCFStats(
+ InternalStats::PENDING_COMPACTION_BYTES_LIMIT_STOPS, 1);
+ ROCKS_LOG_WARN(
+ ioptions_.logger,
+ "[%s] Stopping writes because of estimated pending compaction "
+ "bytes %" PRIu64,
+ name_.c_str(), compaction_needed_bytes);
+ } else if (write_stall_condition == WriteStallCondition::kDelayed &&
+ write_stall_cause == WriteStallCause::kMemtableLimit) {
+ write_controller_token_ =
+ SetupDelay(write_controller, compaction_needed_bytes,
+ prev_compaction_needed_bytes_, was_stopped,
+ mutable_cf_options.disable_auto_compactions);
+ internal_stats_->AddCFStats(InternalStats::MEMTABLE_LIMIT_SLOWDOWNS, 1);
+ ROCKS_LOG_WARN(
+ ioptions_.logger,
+ "[%s] Stalling writes because we have %d immutable memtables "
+ "(waiting for flush), max_write_buffer_number is set to %d "
+ "rate %" PRIu64,
+ name_.c_str(), imm()->NumNotFlushed(),
+ mutable_cf_options.max_write_buffer_number,
+ write_controller->delayed_write_rate());
+ } else if (write_stall_condition == WriteStallCondition::kDelayed &&
+ write_stall_cause == WriteStallCause::kL0FileCountLimit) {
+ // L0 is within two files of the stop trigger.
+ bool near_stop = vstorage->l0_delay_trigger_count() >=
+ mutable_cf_options.level0_stop_writes_trigger - 2;
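+ // For example, with level0_stop_writes_trigger = 36, near_stop becomes
+ // true once there are 34 or more L0 files.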
+ write_controller_token_ =
+ SetupDelay(write_controller, compaction_needed_bytes,
+ prev_compaction_needed_bytes_, was_stopped || near_stop,
+ mutable_cf_options.disable_auto_compactions);
+ internal_stats_->AddCFStats(InternalStats::L0_FILE_COUNT_LIMIT_SLOWDOWNS,
+ 1);
+ if (compaction_picker_->IsLevel0CompactionInProgress()) {
+ internal_stats_->AddCFStats(
+ InternalStats::LOCKED_L0_FILE_COUNT_LIMIT_SLOWDOWNS, 1);
+ }
+ ROCKS_LOG_WARN(ioptions_.logger,
+ "[%s] Stalling writes because we have %d level-0 files "
+ "rate %" PRIu64,
+ name_.c_str(), vstorage->l0_delay_trigger_count(),
+ write_controller->delayed_write_rate());
+ } else if (write_stall_condition == WriteStallCondition::kDelayed &&
+ write_stall_cause == WriteStallCause::kPendingCompactionBytes) {
+ // If the distance to the hard limit is less than 1/4 of the gap between
+ // the soft and hard pending compaction bytes limits, we consider it near
+ // stop and speed up the slowdown.
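+ // For example, with soft_pending_compaction_bytes_limit = 64 GB and
+ // hard_pending_compaction_bytes_limit = 128 GB, near_stop becomes true
+ // once compaction_needed_bytes exceeds 64 GB + 3 * (128 GB - 64 GB) / 4 =
+ // 112 GB, i.e. within 16 GB of the hard limit.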
+ bool near_stop =
+ mutable_cf_options.hard_pending_compaction_bytes_limit > 0 &&
+ (compaction_needed_bytes -
+ mutable_cf_options.soft_pending_compaction_bytes_limit) >
+ 3 *
+ (mutable_cf_options.hard_pending_compaction_bytes_limit -
+ mutable_cf_options.soft_pending_compaction_bytes_limit) /
+ 4;
+
+ write_controller_token_ =
+ SetupDelay(write_controller, compaction_needed_bytes,
+ prev_compaction_needed_bytes_, was_stopped || near_stop,
+ mutable_cf_options.disable_auto_compactions);
+ internal_stats_->AddCFStats(
+ InternalStats::PENDING_COMPACTION_BYTES_LIMIT_SLOWDOWNS, 1);
+ ROCKS_LOG_WARN(
+ ioptions_.logger,
+ "[%s] Stalling writes because of estimated pending compaction "
+ "bytes %" PRIu64 " rate %" PRIu64,
+ name_.c_str(), vstorage->estimated_compaction_needed_bytes(),
+ write_controller->delayed_write_rate());
+ } else {
+ assert(write_stall_condition == WriteStallCondition::kNormal);
+ if (vstorage->l0_delay_trigger_count() >=
+ GetL0ThresholdSpeedupCompaction(
+ mutable_cf_options.level0_file_num_compaction_trigger,
+ mutable_cf_options.level0_slowdown_writes_trigger)) {
+ write_controller_token_ =
+ write_controller->GetCompactionPressureToken();
+ ROCKS_LOG_INFO(
+ ioptions_.logger,
+ "[%s] Increasing compaction threads because we have %d level-0 "
+ "files ",
+ name_.c_str(), vstorage->l0_delay_trigger_count());
+ } else if (vstorage->estimated_compaction_needed_bytes() >=
+ mutable_cf_options.soft_pending_compaction_bytes_limit / 4) {
+ // Increase compaction threads if bytes needed for compaction exceeds
+ // 1/4 of threshold for slowing down.
+ // If soft pending compaction byte limit is not set, always speed up
+ // compaction.
+ write_controller_token_ =
+ write_controller->GetCompactionPressureToken();
+ if (mutable_cf_options.soft_pending_compaction_bytes_limit > 0) {
+ ROCKS_LOG_INFO(
+ ioptions_.logger,
+ "[%s] Increasing compaction threads because of estimated pending "
+ "compaction "
+ "bytes %" PRIu64,
+ name_.c_str(), vstorage->estimated_compaction_needed_bytes());
+ }
+ } else {
+ write_controller_token_.reset();
+ }
+ // If the DB recovers from delay conditions, we reward it by raising the
+ // delayed write rate by roughly double the slowdown ratio
+ // (kDelayRecoverSlowdownRatio). This balances the long-term slowdown
+ // increase signal.
+ if (needed_delay) {
+ uint64_t write_rate = write_controller->delayed_write_rate();
+ write_controller->set_delayed_write_rate(static_cast<uint64_t>(
+ static_cast<double>(write_rate) * kDelayRecoverSlowdownRatio));
+ // Set the low-pri limit to be 1/4 of the delayed write rate.
+ // Note we don't reset this value even after the delay condition is
+ // released. The low-pri rate will continue to apply if there is
+ // compaction pressure.
+ write_controller->low_pri_rate_limiter()->SetBytesPerSecond(write_rate /
+ 4);
+ }
+ }
+ prev_compaction_needed_bytes_ = compaction_needed_bytes;
+ }
+ return write_stall_condition;
+}
+
+const FileOptions* ColumnFamilyData::soptions() const {
+ return &(column_family_set_->file_options_);
+}
+
+void ColumnFamilyData::SetCurrent(Version* current_version) {
+ current_ = current_version;
+}
+
+uint64_t ColumnFamilyData::GetNumLiveVersions() const {
+ return VersionSet::GetNumLiveVersions(dummy_versions_);
+}
+
+uint64_t ColumnFamilyData::GetTotalSstFilesSize() const {
+ return VersionSet::GetTotalSstFilesSize(dummy_versions_);
+}
+
+uint64_t ColumnFamilyData::GetTotalBlobFileSize() const {
+ return VersionSet::GetTotalBlobFileSize(dummy_versions_);
+}
+
+uint64_t ColumnFamilyData::GetLiveSstFilesSize() const {
+ return current_->GetSstFilesSize();
+}
+
+MemTable* ColumnFamilyData::ConstructNewMemtable(
+ const MutableCFOptions& mutable_cf_options, SequenceNumber earliest_seq) {
+ return new MemTable(internal_comparator_, ioptions_, mutable_cf_options,
+ write_buffer_manager_, earliest_seq, id_);
+}
+
+void ColumnFamilyData::CreateNewMemtable(
+ const MutableCFOptions& mutable_cf_options, SequenceNumber earliest_seq) {
+ if (mem_ != nullptr) {
+ delete mem_->Unref();
+ }
+ SetMemtable(ConstructNewMemtable(mutable_cf_options, earliest_seq));
+ mem_->Ref();
+}
+
+bool ColumnFamilyData::NeedsCompaction() const {
+ return !mutable_cf_options_.disable_auto_compactions &&
+ compaction_picker_->NeedsCompaction(current_->storage_info());
+}
+
+Compaction* ColumnFamilyData::PickCompaction(
+ const MutableCFOptions& mutable_options,
+ const MutableDBOptions& mutable_db_options, LogBuffer* log_buffer) {
+ SequenceNumber earliest_mem_seqno =
+ std::min(mem_->GetEarliestSequenceNumber(),
+ imm_.current()->GetEarliestSequenceNumber(false));
+ auto* result = compaction_picker_->PickCompaction(
+ GetName(), mutable_options, mutable_db_options, current_->storage_info(),
+ log_buffer, earliest_mem_seqno);
+ if (result != nullptr) {
+ result->SetInputVersion(current_);
+ }
+ return result;
+}
+
+bool ColumnFamilyData::RangeOverlapWithCompaction(
+ const Slice& smallest_user_key, const Slice& largest_user_key,
+ int level) const {
+ return compaction_picker_->RangeOverlapWithCompaction(
+ smallest_user_key, largest_user_key, level);
+}
+
+Status ColumnFamilyData::RangesOverlapWithMemtables(
+ const autovector<Range>& ranges, SuperVersion* super_version,
+ bool allow_data_in_errors, bool* overlap) {
+ assert(overlap != nullptr);
+ *overlap = false;
+ // Create an InternalIterator over all unflushed memtables
+ Arena arena;
+ ReadOptions read_opts;
+ read_opts.total_order_seek = true;
+ MergeIteratorBuilder merge_iter_builder(&internal_comparator_, &arena);
+ merge_iter_builder.AddIterator(
+ super_version->mem->NewIterator(read_opts, &arena));
+ super_version->imm->AddIterators(read_opts, &merge_iter_builder,
+ false /* add_range_tombstone_iter */);
+ ScopedArenaIterator memtable_iter(merge_iter_builder.Finish());
+
+ auto read_seq = super_version->current->version_set()->LastSequence();
+ ReadRangeDelAggregator range_del_agg(&internal_comparator_, read_seq);
+ auto* active_range_del_iter = super_version->mem->NewRangeTombstoneIterator(
+ read_opts, read_seq, false /* immutable_memtable */);
+ range_del_agg.AddTombstones(
+ std::unique_ptr<FragmentedRangeTombstoneIterator>(active_range_del_iter));
+ Status status;
+ status = super_version->imm->AddRangeTombstoneIterators(
+ read_opts, nullptr /* arena */, &range_del_agg);
+ // AddRangeTombstoneIterators always returns Status::OK.
+ assert(status.ok());
+
+ for (size_t i = 0; i < ranges.size() && status.ok() && !*overlap; ++i) {
+ auto* vstorage = super_version->current->storage_info();
+ auto* ucmp = vstorage->InternalComparator()->user_comparator();
+ InternalKey range_start(ranges[i].start, kMaxSequenceNumber,
+ kValueTypeForSeek);
+ memtable_iter->Seek(range_start.Encode());
+ status = memtable_iter->status();
+ ParsedInternalKey seek_result;
+
+ if (status.ok() && memtable_iter->Valid()) {
+ status = ParseInternalKey(memtable_iter->key(), &seek_result,
+ allow_data_in_errors);
+ }
+
+ if (status.ok()) {
+ if (memtable_iter->Valid() &&
+ ucmp->Compare(seek_result.user_key, ranges[i].limit) <= 0) {
+ *overlap = true;
+ } else if (range_del_agg.IsRangeOverlapped(ranges[i].start,
+ ranges[i].limit)) {
+ *overlap = true;
+ }
+ }
+ }
+ return status;
+}
+
+const int ColumnFamilyData::kCompactAllLevels = -1;
+const int ColumnFamilyData::kCompactToBaseLevel = -2;
+
+Compaction* ColumnFamilyData::CompactRange(
+ const MutableCFOptions& mutable_cf_options,
+ const MutableDBOptions& mutable_db_options, int input_level,
+ int output_level, const CompactRangeOptions& compact_range_options,
+ const InternalKey* begin, const InternalKey* end,
+ InternalKey** compaction_end, bool* conflict,
+ uint64_t max_file_num_to_ignore, const std::string& trim_ts) {
+ auto* result = compaction_picker_->CompactRange(
+ GetName(), mutable_cf_options, mutable_db_options,
+ current_->storage_info(), input_level, output_level,
+ compact_range_options, begin, end, compaction_end, conflict,
+ max_file_num_to_ignore, trim_ts);
+ if (result != nullptr) {
+ result->SetInputVersion(current_);
+ }
+ return result;
+}
+
+SuperVersion* ColumnFamilyData::GetReferencedSuperVersion(DBImpl* db) {
+ SuperVersion* sv = GetThreadLocalSuperVersion(db);
+ sv->Ref();
+ if (!ReturnThreadLocalSuperVersion(sv)) {
+ // This Unref() corresponds to the Ref() in GetThreadLocalSuperVersion()
+ // when the thread-local pointer was populated. So, the Ref() earlier in
+ // this function still prevents the returned SuperVersion* from being
+ // deleted out from under the caller.
+ sv->Unref();
+ }
+ return sv;
+}
+
+SuperVersion* ColumnFamilyData::GetThreadLocalSuperVersion(DBImpl* db) {
+ // The SuperVersion is cached in thread local storage to avoid acquiring the
+ // mutex when the SuperVersion has not changed since the last use. When a new
+ // SuperVersion is installed, the compaction or flush thread cleans up the
+ // cached SuperVersion in all existing thread local storage. To avoid
+ // acquiring the mutex for this operation, we use an atomic Swap() on the
+ // thread local pointer to guarantee exclusive access. If the thread local
+ // pointer is being used while a new SuperVersion is installed, the cached
+ // SuperVersion can become stale. In that case, the background thread would
+ // have swapped in kSVObsolete. We re-check the value when returning the
+ // SuperVersion back to thread local storage, with an atomic compare-and-swap.
+ // The SuperVersion will need to be released if it is detected to be stale.
+ void* ptr = local_sv_->Swap(SuperVersion::kSVInUse);
+ // Invariant:
+ // (1) Scrape (always) installs kSVObsolete in ThreadLocal storage.
+ // (2) The Swap above (always) installs kSVInUse; ThreadLocal storage
+ // should only hold kSVInUse before the ReturnThreadLocalSuperVersion()
+ // call (if no Scrape happens).
+ assert(ptr != SuperVersion::kSVInUse);
+ SuperVersion* sv = static_cast<SuperVersion*>(ptr);
+ if (sv == SuperVersion::kSVObsolete ||
+ sv->version_number != super_version_number_.load()) {
+ RecordTick(ioptions_.stats, NUMBER_SUPERVERSION_ACQUIRES);
+ SuperVersion* sv_to_delete = nullptr;
+
+ if (sv && sv->Unref()) {
+ RecordTick(ioptions_.stats, NUMBER_SUPERVERSION_CLEANUPS);
+ db->mutex()->Lock();
+ // NOTE: underlying resources held by superversion (sst files) might
+ // not be released until the next background job.
+ sv->Cleanup();
+ if (db->immutable_db_options().avoid_unnecessary_blocking_io) {
+ db->AddSuperVersionsToFreeQueue(sv);
+ db->SchedulePurge();
+ } else {
+ sv_to_delete = sv;
+ }
+ } else {
+ db->mutex()->Lock();
+ }
+ sv = super_version_->Ref();
+ db->mutex()->Unlock();
+
+ delete sv_to_delete;
+ }
+ assert(sv != nullptr);
+ return sv;
+}
+
+bool ColumnFamilyData::ReturnThreadLocalSuperVersion(SuperVersion* sv) {
+ assert(sv != nullptr);
+ // Put the SuperVersion back
+ void* expected = SuperVersion::kSVInUse;
+ if (local_sv_->CompareAndSwap(static_cast<void*>(sv), expected)) {
+ // When we see kSVInUse in the ThreadLocal, we are sure ThreadLocal
+ // storage has not been altered and no Scrape has happened. The
+ // SuperVersion is still current.
+ return true;
+ } else {
+ // ThreadLocal scrape happened in the process of this GetImpl call (after
+ // thread local Swap() at the beginning and before CompareAndSwap()).
+ // This means the SuperVersion it holds is obsolete.
+ assert(expected == SuperVersion::kSVObsolete);
+ }
+ return false;
+}
+
+void ColumnFamilyData::InstallSuperVersion(SuperVersionContext* sv_context,
+ InstrumentedMutex* db_mutex) {
+ db_mutex->AssertHeld();
+ return InstallSuperVersion(sv_context, mutable_cf_options_);
+}
+
+void ColumnFamilyData::InstallSuperVersion(
+ SuperVersionContext* sv_context,
+ const MutableCFOptions& mutable_cf_options) {
+ SuperVersion* new_superversion = sv_context->new_superversion.release();
+ new_superversion->mutable_cf_options = mutable_cf_options;
+ new_superversion->Init(this, mem_, imm_.current(), current_);
+ SuperVersion* old_superversion = super_version_;
+ super_version_ = new_superversion;
+ ++super_version_number_;
+ super_version_->version_number = super_version_number_;
+ if (old_superversion == nullptr || old_superversion->current != current() ||
+ old_superversion->mem != mem_ ||
+ old_superversion->imm != imm_.current()) {
+ // Should not recalculate the slowdown condition if nothing has changed,
+ // since currently RecalculateWriteStallConditions() treats a recalculation
+ // as a signal that further slowing down is needed.
+ super_version_->write_stall_condition =
+ RecalculateWriteStallConditions(mutable_cf_options);
+ } else {
+ super_version_->write_stall_condition =
+ old_superversion->write_stall_condition;
+ }
+ if (old_superversion != nullptr) {
+ // Reset SuperVersions cached in thread local storage.
+ // This should be done before old_superversion->Unref(). That's to ensure
+ // that local_sv_ never holds the last reference to SuperVersion, since
+ // it has no means to safely do SuperVersion cleanup.
+ ResetThreadLocalSuperVersions();
+
+ if (old_superversion->mutable_cf_options.write_buffer_size !=
+ mutable_cf_options.write_buffer_size) {
+ mem_->UpdateWriteBufferSize(mutable_cf_options.write_buffer_size);
+ }
+ if (old_superversion->write_stall_condition !=
+ new_superversion->write_stall_condition) {
+ sv_context->PushWriteStallNotification(
+ old_superversion->write_stall_condition,
+ new_superversion->write_stall_condition, GetName(), ioptions());
+ }
+ if (old_superversion->Unref()) {
+ old_superversion->Cleanup();
+ sv_context->superversions_to_free.push_back(old_superversion);
+ }
+ }
+}
+
+void ColumnFamilyData::ResetThreadLocalSuperVersions() {
+ autovector<void*> sv_ptrs;
+ local_sv_->Scrape(&sv_ptrs, SuperVersion::kSVObsolete);
+ for (auto ptr : sv_ptrs) {
+ assert(ptr);
+ if (ptr == SuperVersion::kSVInUse) {
+ continue;
+ }
+ auto sv = static_cast<SuperVersion*>(ptr);
+ bool was_last_ref __attribute__((__unused__));
+ was_last_ref = sv->Unref();
+ // sv couldn't have been the last reference because
+ // ResetThreadLocalSuperVersions() is called before
+ // unref'ing super_version_.
+ assert(!was_last_ref);
+ }
+}
+
+Status ColumnFamilyData::ValidateOptions(
+ const DBOptions& db_options, const ColumnFamilyOptions& cf_options) {
+ Status s;
+ s = CheckCompressionSupported(cf_options);
+ if (s.ok() && db_options.allow_concurrent_memtable_write) {
+ s = CheckConcurrentWritesSupported(cf_options);
+ }
+ if (s.ok() && db_options.unordered_write &&
+ cf_options.max_successive_merges != 0) {
+ s = Status::InvalidArgument(
+ "max_successive_merges > 0 is incompatible with unordered_write");
+ }
+ if (s.ok()) {
+ s = CheckCFPathsSupported(db_options, cf_options);
+ }
+ if (!s.ok()) {
+ return s;
+ }
+
+ if (cf_options.ttl > 0 && cf_options.ttl != kDefaultTtl) {
+ if (!cf_options.table_factory->IsInstanceOf(
+ TableFactory::kBlockBasedTableName())) {
+ return Status::NotSupported(
+ "TTL is only supported in Block-Based Table format. ");
+ }
+ }
+
+ if (cf_options.periodic_compaction_seconds > 0 &&
+ cf_options.periodic_compaction_seconds != kDefaultPeriodicCompSecs) {
+ if (!cf_options.table_factory->IsInstanceOf(
+ TableFactory::kBlockBasedTableName())) {
+ return Status::NotSupported(
+ "Periodic Compaction is only supported in "
+ "Block-Based Table format. ");
+ }
+ }
+
+ if (cf_options.enable_blob_garbage_collection) {
+ if (cf_options.blob_garbage_collection_age_cutoff < 0.0 ||
+ cf_options.blob_garbage_collection_age_cutoff > 1.0) {
+ return Status::InvalidArgument(
+ "The age cutoff for blob garbage collection should be in the range "
+ "[0.0, 1.0].");
+ }
+ if (cf_options.blob_garbage_collection_force_threshold < 0.0 ||
+ cf_options.blob_garbage_collection_force_threshold > 1.0) {
+ return Status::InvalidArgument(
+ "The garbage ratio threshold for forcing blob garbage collection "
+ "should be in the range [0.0, 1.0].");
+ }
+ }
+
+ if (cf_options.compaction_style == kCompactionStyleFIFO &&
+ db_options.max_open_files != -1 && cf_options.ttl > 0) {
+ return Status::NotSupported(
+ "FIFO compaction only supported with max_open_files = -1.");
+ }
+
+ std::vector<uint32_t> supported{0, 1, 2, 4, 8};
+ if (std::find(supported.begin(), supported.end(),
+ cf_options.memtable_protection_bytes_per_key) ==
+ supported.end()) {
+ return Status::NotSupported(
+ "Memtable per key-value checksum protection only supports 0, 1, 2, 4 "
+ "or 8 bytes per key.");
+ }
+ return s;
+}
+
+#ifndef ROCKSDB_LITE
+Status ColumnFamilyData::SetOptions(
+ const DBOptions& db_opts,
+ const std::unordered_map<std::string, std::string>& options_map) {
+ ColumnFamilyOptions cf_opts =
+ BuildColumnFamilyOptions(initial_cf_options_, mutable_cf_options_);
+ ConfigOptions config_opts;
+ config_opts.mutable_options_only = true;
+ Status s = GetColumnFamilyOptionsFromMap(config_opts, cf_opts, options_map,
+ &cf_opts);
+ if (s.ok()) {
+ s = ValidateOptions(db_opts, cf_opts);
+ }
+ if (s.ok()) {
+ mutable_cf_options_ = MutableCFOptions(cf_opts);
+ mutable_cf_options_.RefreshDerivedOptions(ioptions_);
+ }
+ return s;
+}
+#endif // ROCKSDB_LITE
+
+// REQUIRES: DB mutex held
+Env::WriteLifeTimeHint ColumnFamilyData::CalculateSSTWriteHint(int level) {
+ if (initial_cf_options_.compaction_style != kCompactionStyleLevel) {
+ return Env::WLTH_NOT_SET;
+ }
+ if (level == 0) {
+ return Env::WLTH_MEDIUM;
+ }
+ int base_level = current_->storage_info()->base_level();
+
+ // L1: medium, L2: long, ...
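+ // For example, with base_level == 1: L0 and L1 get WLTH_MEDIUM, L2 gets
+ // WLTH_LONG, and L3 and beyond get WLTH_EXTREME.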
+ if (level - base_level >= 2) {
+ return Env::WLTH_EXTREME;
+ } else if (level < base_level) {
+ // There is no restriction that prevents the level passed in from being
+ // smaller than base_level.
+ return Env::WLTH_MEDIUM;
+ }
+ return static_cast<Env::WriteLifeTimeHint>(
+ level - base_level + static_cast<int>(Env::WLTH_MEDIUM));
+}
+
+Status ColumnFamilyData::AddDirectories(
+ std::map<std::string, std::shared_ptr<FSDirectory>>* created_dirs) {
+ Status s;
+ assert(created_dirs != nullptr);
+ assert(data_dirs_.empty());
+ for (auto& p : ioptions_.cf_paths) {
+ auto existing_dir = created_dirs->find(p.path);
+
+ if (existing_dir == created_dirs->end()) {
+ std::unique_ptr<FSDirectory> path_directory;
+ s = DBImpl::CreateAndNewDirectory(ioptions_.fs.get(), p.path,
+ &path_directory);
+ if (!s.ok()) {
+ return s;
+ }
+ assert(path_directory != nullptr);
+ data_dirs_.emplace_back(path_directory.release());
+ (*created_dirs)[p.path] = data_dirs_.back();
+ } else {
+ data_dirs_.emplace_back(existing_dir->second);
+ }
+ }
+ assert(data_dirs_.size() == ioptions_.cf_paths.size());
+ return s;
+}
+
+FSDirectory* ColumnFamilyData::GetDataDir(size_t path_id) const {
+ if (data_dirs_.empty()) {
+ return nullptr;
+ }
+
+ assert(path_id < data_dirs_.size());
+ return data_dirs_[path_id].get();
+}
+
+ColumnFamilySet::ColumnFamilySet(const std::string& dbname,
+ const ImmutableDBOptions* db_options,
+ const FileOptions& file_options,
+ Cache* table_cache,
+ WriteBufferManager* _write_buffer_manager,
+ WriteController* _write_controller,
+ BlockCacheTracer* const block_cache_tracer,
+ const std::shared_ptr<IOTracer>& io_tracer,
+ const std::string& db_id,
+ const std::string& db_session_id)
+ : max_column_family_(0),
+ file_options_(file_options),
+ dummy_cfd_(new ColumnFamilyData(
+ ColumnFamilyData::kDummyColumnFamilyDataId, "", nullptr, nullptr,
+ nullptr, ColumnFamilyOptions(), *db_options, &file_options_, nullptr,
+ block_cache_tracer, io_tracer, db_id, db_session_id)),
+ default_cfd_cache_(nullptr),
+ db_name_(dbname),
+ db_options_(db_options),
+ table_cache_(table_cache),
+ write_buffer_manager_(_write_buffer_manager),
+ write_controller_(_write_controller),
+ block_cache_tracer_(block_cache_tracer),
+ io_tracer_(io_tracer),
+ db_id_(db_id),
+ db_session_id_(db_session_id) {
+ // initialize linked list
+ dummy_cfd_->prev_ = dummy_cfd_;
+ dummy_cfd_->next_ = dummy_cfd_;
+}
+
+ColumnFamilySet::~ColumnFamilySet() {
+ while (column_family_data_.size() > 0) {
+ // cfd destructor will delete itself from column_family_data_
+ auto cfd = column_family_data_.begin()->second;
+ bool last_ref __attribute__((__unused__));
+ last_ref = cfd->UnrefAndTryDelete();
+ assert(last_ref);
+ }
+ bool dummy_last_ref __attribute__((__unused__));
+ dummy_last_ref = dummy_cfd_->UnrefAndTryDelete();
+ assert(dummy_last_ref);
+}
+
+ColumnFamilyData* ColumnFamilySet::GetDefault() const {
+ assert(default_cfd_cache_ != nullptr);
+ return default_cfd_cache_;
+}
+
+ColumnFamilyData* ColumnFamilySet::GetColumnFamily(uint32_t id) const {
+ auto cfd_iter = column_family_data_.find(id);
+ if (cfd_iter != column_family_data_.end()) {
+ return cfd_iter->second;
+ } else {
+ return nullptr;
+ }
+}
+
+ColumnFamilyData* ColumnFamilySet::GetColumnFamily(
+ const std::string& name) const {
+ auto cfd_iter = column_families_.find(name);
+ if (cfd_iter != column_families_.end()) {
+ auto cfd = GetColumnFamily(cfd_iter->second);
+ assert(cfd != nullptr);
+ return cfd;
+ } else {
+ return nullptr;
+ }
+}
+
+uint32_t ColumnFamilySet::GetNextColumnFamilyID() {
+ return ++max_column_family_;
+}
+
+uint32_t ColumnFamilySet::GetMaxColumnFamily() { return max_column_family_; }
+
+void ColumnFamilySet::UpdateMaxColumnFamily(uint32_t new_max_column_family) {
+ max_column_family_ = std::max(new_max_column_family, max_column_family_);
+}
+
+size_t ColumnFamilySet::NumberOfColumnFamilies() const {
+ return column_families_.size();
+}
+
+// under a DB mutex AND write thread
+ColumnFamilyData* ColumnFamilySet::CreateColumnFamily(
+ const std::string& name, uint32_t id, Version* dummy_versions,
+ const ColumnFamilyOptions& options) {
+ assert(column_families_.find(name) == column_families_.end());
+ ColumnFamilyData* new_cfd = new ColumnFamilyData(
+ id, name, dummy_versions, table_cache_, write_buffer_manager_, options,
+ *db_options_, &file_options_, this, block_cache_tracer_, io_tracer_,
+ db_id_, db_session_id_);
+ column_families_.insert({name, id});
+ column_family_data_.insert({id, new_cfd});
+ max_column_family_ = std::max(max_column_family_, id);
+ // add to linked list
+ new_cfd->next_ = dummy_cfd_;
+ auto prev = dummy_cfd_->prev_;
+ new_cfd->prev_ = prev;
+ prev->next_ = new_cfd;
+ dummy_cfd_->prev_ = new_cfd;
+ if (id == 0) {
+ default_cfd_cache_ = new_cfd;
+ }
+ return new_cfd;
+}
+
+// under a DB mutex AND from a write thread
+void ColumnFamilySet::RemoveColumnFamily(ColumnFamilyData* cfd) {
+ auto cfd_iter = column_family_data_.find(cfd->GetID());
+ assert(cfd_iter != column_family_data_.end());
+ column_family_data_.erase(cfd_iter);
+ column_families_.erase(cfd->GetName());
+}
+
+// under a DB mutex OR from a write thread
+bool ColumnFamilyMemTablesImpl::Seek(uint32_t column_family_id) {
+ if (column_family_id == 0) {
+ // optimization for common case
+ current_ = column_family_set_->GetDefault();
+ } else {
+ current_ = column_family_set_->GetColumnFamily(column_family_id);
+ }
+ handle_.SetCFD(current_);
+ return current_ != nullptr;
+}
+
+uint64_t ColumnFamilyMemTablesImpl::GetLogNumber() const {
+ assert(current_ != nullptr);
+ return current_->GetLogNumber();
+}
+
+MemTable* ColumnFamilyMemTablesImpl::GetMemTable() const {
+ assert(current_ != nullptr);
+ return current_->mem();
+}
+
+ColumnFamilyHandle* ColumnFamilyMemTablesImpl::GetColumnFamilyHandle() {
+ assert(current_ != nullptr);
+ return &handle_;
+}
+
+uint32_t GetColumnFamilyID(ColumnFamilyHandle* column_family) {
+ uint32_t column_family_id = 0;
+ if (column_family != nullptr) {
+ auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+ column_family_id = cfh->GetID();
+ }
+ return column_family_id;
+}
+
+const Comparator* GetColumnFamilyUserComparator(
+ ColumnFamilyHandle* column_family) {
+ if (column_family != nullptr) {
+ return column_family->GetComparator();
+ }
+ return nullptr;
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/column_family.h b/src/rocksdb/db/column_family.h
new file mode 100644
index 000000000..3e6d01d22
--- /dev/null
+++ b/src/rocksdb/db/column_family.h
@@ -0,0 +1,845 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <atomic>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "cache/cache_reservation_manager.h"
+#include "db/memtable_list.h"
+#include "db/table_cache.h"
+#include "db/table_properties_collector.h"
+#include "db/write_batch_internal.h"
+#include "db/write_controller.h"
+#include "options/cf_options.h"
+#include "rocksdb/compaction_job_stats.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/options.h"
+#include "trace_replay/block_cache_tracer.h"
+#include "util/hash_containers.h"
+#include "util/thread_local.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Version;
+class VersionSet;
+class VersionStorageInfo;
+class MemTable;
+class MemTableListVersion;
+class CompactionPicker;
+class Compaction;
+class InternalKey;
+class InternalStats;
+class ColumnFamilyData;
+class DBImpl;
+class LogBuffer;
+class InstrumentedMutex;
+class InstrumentedMutexLock;
+struct SuperVersionContext;
+class BlobFileCache;
+class BlobSource;
+
+extern const double kIncSlowdownRatio;
+// This file contains a list of data structures for managing column family
+// level metadata.
+//
+// The basic relationships among classes declared here are illustrated as
+// follows:
+//
+// +----------------------+ +----------------------+ +--------+
+// +---+ ColumnFamilyHandle 1 | +--+ ColumnFamilyHandle 2 | | DBImpl |
+// | +----------------------+ | +----------------------+ +----+---+
+// | +--------------------------+ |
+// | | +-----------------------------+
+// | | |
+// | | +-----------------------------v-------------------------------+
+// | | | |
+// | | | ColumnFamilySet |
+// | | | |
+// | | +-------------+--------------------------+----------------+---+
+// | | | | |
+// | +-------------------------------------+ | |
+// | | | | v
+// | +-------------v-------------+ +-----v----v---------+
+// | | | | |
+// | | ColumnFamilyData 1 | | ColumnFamilyData 2 | ......
+// | | | | |
+// +---> | | |
+// | +---------+ | |
+// | | MemTable| | |
+// | | List | | |
+// +--------+---+--+-+----+----+ +--------------------++
+// | | | |
+// | | | |
+// | | | +-----------------------+
+// | | +-----------+ |
+// v +--------+ | |
+// +--------+--------+ | | |
+// | | | | +----------v----------+
+// +---> |SuperVersion 1.a +-----------------> |
+// | +------+ | | MemTableListVersion |
+// +---+-------------+ | | | | |
+// | | | | +----+------------+---+
+// | current | | | | |
+// | +-------------+ | |mem | |
+// | | | | | |
+// +-v---v-------+ +---v--v---+ +-----v----+ +----v-----+
+// | | | | | | | |
+// | Version 1.a | | memtable | | memtable | | memtable |
+// | | | 1.a | | 1.b | | 1.c |
+// +-------------+ | | | | | |
+// +----------+ +----------+ +----------+
+//
+// DBImpl keeps a ColumnFamilySet, which references all column families by
+// pointing to the respective ColumnFamilyData object of each column family.
+// This is how DBImpl can list and operate on all the column families.
+// ColumnFamilyHandle also points to ColumnFamilyData directly, so that
+// when a user executes a query, it can directly find the memtables, Version,
+// and SuperVersion of the column family, without going through
+// ColumnFamilySet.
+//
+// ColumnFamilySet points to the latest view of the LSM-tree (list of memtables
+// and SST files) indirectly, while ongoing operations may hold references
+// to a current or an out-of-date SuperVersion, which in turn points to a
+// point-in-time view of the LSM-tree. This guarantees that the memtables and
+// SST files being operated on will not go away until the SuperVersion's
+// reference count drops to zero and it is destroyed.
+//
+// The following graph illustrates possible referencing relationships:
+//
+// Column +--------------+ current +-----------+
+// Family +---->+ +------------------->+ |
+// Data | SuperVersion +----------+ | Version A |
+// | 3 | imm | | |
+// Iter2 +----->+ | +-------v------+ +-----------+
+// +-----+--------+ | MemtableList +----------------> Empty
+// | | Version r | +-----------+
+// | +--------------+ | |
+// +------------------+ current| Version B |
+// +--------------+ | +----->+ |
+// | | | | +-----+-----+
+// Compaction +>+ SuperVersion +-------------+ ^
+// Job | 2 +------+ | |current
+// | +----+ | | mem | +------------+
+// +--------------+ | | +---------------------> |
+// | +------------------------> MemTable a |
+// | mem | | |
+// +--------------+ | | +------------+
+// | +--------------------------+
+// Iter1 +-----> SuperVersion | | +------------+
+// | 1 +------------------------------>+ |
+// | +-+ | mem | MemTable b |
+// +--------------+ | | | |
+// | | +--------------+ +-----^------+
+// | |imm | MemtableList | |
+// | +--->+ Version s +------------+
+// | +--------------+
+// | +--------------+
+// | | MemtableList |
+// +------>+ Version t +--------> Empty
+// imm +--------------+
+//
+// In this example, even though the current LSM-tree consists of Version A and
+// memtable a, which are also referenced by the latest SuperVersion
+// (SuperVersion 3), two older SuperVersions, SuperVersion2 and SuperVersion1,
+// still exist, and are referenced by a compaction job and an old iterator
+// Iter1, respectively. SuperVersion2 contains Version B, memtable a, and
+// memtable b; SuperVersion1 contains Version B and memtable b (mutable). As a
+// result, Version B and memtable b are prevented from being destroyed or
+// deleted.
+
+// ColumnFamilyHandleImpl is the class that clients use to access different
+// column families. It has a non-trivial destructor, which gets called when the
+// client is done using the column family.
+class ColumnFamilyHandleImpl : public ColumnFamilyHandle {
+ public:
+ // create while holding the mutex
+ ColumnFamilyHandleImpl(ColumnFamilyData* cfd, DBImpl* db,
+ InstrumentedMutex* mutex);
+ // destroy without mutex
+ virtual ~ColumnFamilyHandleImpl();
+ virtual ColumnFamilyData* cfd() const { return cfd_; }
+
+ virtual uint32_t GetID() const override;
+ virtual const std::string& GetName() const override;
+ virtual Status GetDescriptor(ColumnFamilyDescriptor* desc) override;
+ virtual const Comparator* GetComparator() const override;
+
+ private:
+ ColumnFamilyData* cfd_;
+ DBImpl* db_;
+ InstrumentedMutex* mutex_;
+};
+
+// Does not ref-count ColumnFamilyData.
+// We use this dummy ColumnFamilyHandleImpl because sometimes MemTableInserter
+// calls DBImpl methods. When this happens, MemTableInserter needs access to a
+// ColumnFamilyHandle (same as the client would need). In that case, we feed
+// MemTableInserter a dummy ColumnFamilyHandle and enable it to call DBImpl
+// methods.
+class ColumnFamilyHandleInternal : public ColumnFamilyHandleImpl {
+ public:
+ ColumnFamilyHandleInternal()
+ : ColumnFamilyHandleImpl(nullptr, nullptr, nullptr),
+ internal_cfd_(nullptr) {}
+
+ void SetCFD(ColumnFamilyData* _cfd) { internal_cfd_ = _cfd; }
+ virtual ColumnFamilyData* cfd() const override { return internal_cfd_; }
+
+ private:
+ ColumnFamilyData* internal_cfd_;
+};
+
+// holds references to memtable, all immutable memtables and version
+struct SuperVersion {
+ // Accessing members of this class is not thread-safe and requires external
+ // synchronization (i.e., DB mutex held or on the write thread).
+ ColumnFamilyData* cfd;
+ MemTable* mem;
+ MemTableListVersion* imm;
+ Version* current;
+ MutableCFOptions mutable_cf_options;
+ // Version number of the current SuperVersion
+ uint64_t version_number;
+ WriteStallCondition write_stall_condition;
+
+ // should be called outside the mutex
+ SuperVersion() = default;
+ ~SuperVersion();
+ SuperVersion* Ref();
+ // If Unref() returns true, Cleanup() should be called with mutex held
+ // before deleting this SuperVersion.
+ bool Unref();
+
+ // call these two methods with db mutex held
+ // Cleanup unrefs mem, imm and current. Also, it stores all memtables
+ // that need to be deleted in the to_delete vector. Unrefing those
+ // objects needs to be done under the mutex.
+ void Cleanup();
+ void Init(ColumnFamilyData* new_cfd, MemTable* new_mem,
+ MemTableListVersion* new_imm, Version* new_current);
+
+ // The value of dummy is not actually used. kSVInUse takes its address as a
+ // marker in the thread local storage to indicate that the SuperVersion is in
+ // use by a thread. This way, the value of kSVInUse is guaranteed not to
+ // conflict with any SuperVersion object address and is portable across
+ // platforms.
+ static int dummy;
+ static void* const kSVInUse;
+ static void* const kSVObsolete;
+
+ private:
+ std::atomic<uint32_t> refs;
+ // We need to_delete because during Cleanup(), imm->Unref() returns
+ // all memtables that we need to free through this vector. We then
+ // delete all those memtables outside of the mutex, during destruction.
+ autovector<MemTable*> to_delete;
+};
+
+extern Status CheckCompressionSupported(const ColumnFamilyOptions& cf_options);
+
+extern Status CheckConcurrentWritesSupported(
+ const ColumnFamilyOptions& cf_options);
+
+extern Status CheckCFPathsSupported(const DBOptions& db_options,
+ const ColumnFamilyOptions& cf_options);
+
+extern ColumnFamilyOptions SanitizeOptions(const ImmutableDBOptions& db_options,
+ const ColumnFamilyOptions& src);
+// Wrap the user-defined table properties collector factories from `cf_options`
+// into internal ones in int_tbl_prop_collector_factories. Add a system internal
+// one too.
+extern void GetIntTblPropCollectorFactory(
+ const ImmutableCFOptions& ioptions,
+ IntTblPropCollectorFactories* int_tbl_prop_collector_factories);
+
+class ColumnFamilySet;
+
+// This class keeps all the data that a column family needs.
+// Most methods require DB mutex held, unless otherwise noted
+class ColumnFamilyData {
+ public:
+ ~ColumnFamilyData();
+
+ // thread-safe
+ uint32_t GetID() const { return id_; }
+ // thread-safe
+ const std::string& GetName() const { return name_; }
+
+ // Ref() can only be called from a context where the caller can guarantee
+ // that ColumnFamilyData is alive (while holding a non-zero ref already,
+ // holding a DB mutex, or as the leader in a write batch group).
+ void Ref() { refs_.fetch_add(1); }
+
+ // UnrefAndTryDelete() decreases the reference count and frees the object if
+ // needed; it returns true if the object was freed, false otherwise.
+ // UnrefAndTryDelete() can only be called while holding a DB mutex, or during
+ // single-threaded recovery.
+ bool UnrefAndTryDelete();
+
+ // SetDropped() can only be called under the following conditions:
+ // 1) while holding a DB mutex,
+ // 2) from a single-threaded write thread, AND
+ // 3) from single-threaded VersionSet::LogAndApply()
+ // After dropping a column family, no other operation on that column family
+ // will be executed. All the files and memory will, however, be kept around
+ // until the client drops the column family handle. That way, the client can
+ // still access data from the dropped column family.
+ // A column family can be dropped and still be alive. In that state:
+ // *) Compaction and flush are not executed on the dropped column family.
+ // *) The client can continue reading from the column family. Writes will fail
+ // unless WriteOptions::ignore_missing_column_families is true.
+ // When the dropped column family is unreferenced, we:
+ // *) remove the column family from the linked list maintained by
+ // ColumnFamilySet,
+ // *) delete all memory associated with that column family, and
+ // *) delete all the files associated with that column family.
+ void SetDropped();
+ bool IsDropped() const { return dropped_.load(std::memory_order_relaxed); }
+
+ // thread-safe
+ int NumberLevels() const { return ioptions_.num_levels; }
+
+ void SetLogNumber(uint64_t log_number) { log_number_ = log_number; }
+ uint64_t GetLogNumber() const { return log_number_; }
+
+ void SetFlushReason(FlushReason flush_reason) {
+ flush_reason_ = flush_reason;
+ }
+ FlushReason GetFlushReason() const { return flush_reason_; }
+ // thread-safe
+ const FileOptions* soptions() const;
+ const ImmutableOptions* ioptions() const { return &ioptions_; }
+ // REQUIRES: DB mutex held
+ // This returns the MutableCFOptions used by the current SuperVersion.
+ // You should use this API to reference MutableCFOptions most of the time.
+ const MutableCFOptions* GetCurrentMutableCFOptions() const {
+ return &(super_version_->mutable_cf_options);
+ }
+ // REQUIRES: DB mutex held
+ // This returns the latest MutableCFOptions, which may not be in effect yet.
+ const MutableCFOptions* GetLatestMutableCFOptions() const {
+ return &mutable_cf_options_;
+ }
+
+ // REQUIRES: DB mutex held
+ // Build ColumnFamilyOptions with the immutable options and the latest
+ // mutable options.
+ ColumnFamilyOptions GetLatestCFOptions() const;
+
+ bool is_delete_range_supported() { return is_delete_range_supported_; }
+
+ // Validate CF options against DB options
+ static Status ValidateOptions(const DBOptions& db_options,
+ const ColumnFamilyOptions& cf_options);
+#ifndef ROCKSDB_LITE
+ // REQUIRES: DB mutex held
+ Status SetOptions(
+ const DBOptions& db_options,
+ const std::unordered_map<std::string, std::string>& options_map);
+#endif // ROCKSDB_LITE
+
+ InternalStats* internal_stats() { return internal_stats_.get(); }
+
+ MemTableList* imm() { return &imm_; }
+ MemTable* mem() { return mem_; }
+
+ bool IsEmpty() {
+ return mem()->GetFirstSequenceNumber() == 0 && imm()->NumNotFlushed() == 0;
+ }
+
+ Version* current() { return current_; }
+ Version* dummy_versions() { return dummy_versions_; }
+ void SetCurrent(Version* _current);
+ uint64_t GetNumLiveVersions() const; // REQUIRE: DB mutex held
+ uint64_t GetTotalSstFilesSize() const; // REQUIRE: DB mutex held
+ uint64_t GetLiveSstFilesSize() const; // REQUIRE: DB mutex held
+ uint64_t GetTotalBlobFileSize() const; // REQUIRE: DB mutex held
+ void SetMemtable(MemTable* new_mem) {
+ uint64_t memtable_id = last_memtable_id_.fetch_add(1) + 1;
+ new_mem->SetID(memtable_id);
+ mem_ = new_mem;
+ }
+
+ // calculate the oldest log needed for the durability of this column family
+ uint64_t OldestLogToKeep();
+
+ // See Memtable constructor for explanation of earliest_seq param.
+ MemTable* ConstructNewMemtable(const MutableCFOptions& mutable_cf_options,
+ SequenceNumber earliest_seq);
+ void CreateNewMemtable(const MutableCFOptions& mutable_cf_options,
+ SequenceNumber earliest_seq);
+
+ TableCache* table_cache() const { return table_cache_.get(); }
+ BlobSource* blob_source() const { return blob_source_.get(); }
+
+ // See documentation in compaction_picker.h
+ // REQUIRES: DB mutex held
+ bool NeedsCompaction() const;
+ // REQUIRES: DB mutex held
+ Compaction* PickCompaction(const MutableCFOptions& mutable_options,
+ const MutableDBOptions& mutable_db_options,
+ LogBuffer* log_buffer);
+
+ // Check if the passed range overlap with any running compactions.
+ // REQUIRES: DB mutex held
+ bool RangeOverlapWithCompaction(const Slice& smallest_user_key,
+ const Slice& largest_user_key,
+ int level) const;
+
+ // Check if the passed ranges overlap with any unflushed memtables
+ // (immutable or mutable).
+ //
+ // @param super_version A referenced SuperVersion that will be held for the
+ // duration of this function.
+ //
+ // Thread-safe
+ Status RangesOverlapWithMemtables(const autovector<Range>& ranges,
+ SuperVersion* super_version,
+ bool allow_data_in_errors, bool* overlap);
+
+ // A flag to tell a manual compaction is to compact all levels together
+ // instead of a specific level.
+ static const int kCompactAllLevels;
+ // A flag to tell a manual compaction's output is base level.
+ static const int kCompactToBaseLevel;
+ // REQUIRES: DB mutex held
+ Compaction* CompactRange(const MutableCFOptions& mutable_cf_options,
+ const MutableDBOptions& mutable_db_options,
+ int input_level, int output_level,
+ const CompactRangeOptions& compact_range_options,
+ const InternalKey* begin, const InternalKey* end,
+ InternalKey** compaction_end, bool* manual_conflict,
+ uint64_t max_file_num_to_ignore,
+ const std::string& trim_ts);
+
+ CompactionPicker* compaction_picker() { return compaction_picker_.get(); }
+ // thread-safe
+ const Comparator* user_comparator() const {
+ return internal_comparator_.user_comparator();
+ }
+ // thread-safe
+ const InternalKeyComparator& internal_comparator() const {
+ return internal_comparator_;
+ }
+
+ const IntTblPropCollectorFactories* int_tbl_prop_collector_factories() const {
+ return &int_tbl_prop_collector_factories_;
+ }
+
+ SuperVersion* GetSuperVersion() { return super_version_; }
+ // thread-safe
+ // Return an already-referenced SuperVersion to be used safely.
+ SuperVersion* GetReferencedSuperVersion(DBImpl* db);
+ // thread-safe
+ // Get the SuperVersion stored in thread local storage. If it does not exist,
+ // get a reference from the current SuperVersion.
+ SuperVersion* GetThreadLocalSuperVersion(DBImpl* db);
+ // Try to return SuperVersion back to thread local storage. Return true on
+ // success and false on failure. It fails when the thread local storage
+ // contains anything other than SuperVersion::kSVInUse flag.
+ bool ReturnThreadLocalSuperVersion(SuperVersion* sv);
+ // thread-safe
+ uint64_t GetSuperVersionNumber() const {
+ return super_version_number_.load();
+ }
+ // Installs the new SuperVersion carried by sv_context. If the previous
+ // SuperVersion's reference count drops to zero, it is queued in sv_context
+ // for deletion. sv_context carries a pre-allocated SuperVersion so that
+ // clients can allocate it outside of the mutex.
+ // IMPORTANT: Only call this from DBImpl::InstallSuperVersion()
+ void InstallSuperVersion(SuperVersionContext* sv_context,
+ const MutableCFOptions& mutable_cf_options);
+ void InstallSuperVersion(SuperVersionContext* sv_context,
+ InstrumentedMutex* db_mutex);
+
+ void ResetThreadLocalSuperVersions();
+
+ // Protected by DB mutex
+ void set_queued_for_flush(bool value) { queued_for_flush_ = value; }
+ void set_queued_for_compaction(bool value) { queued_for_compaction_ = value; }
+ bool queued_for_flush() { return queued_for_flush_; }
+ bool queued_for_compaction() { return queued_for_compaction_; }
+
+ enum class WriteStallCause {
+ kNone,
+ kMemtableLimit,
+ kL0FileCountLimit,
+ kPendingCompactionBytes,
+ };
+ static std::pair<WriteStallCondition, WriteStallCause>
+ GetWriteStallConditionAndCause(
+ int num_unflushed_memtables, int num_l0_files,
+ uint64_t num_compaction_needed_bytes,
+ const MutableCFOptions& mutable_cf_options,
+ const ImmutableCFOptions& immutable_cf_options);
+
+ // Recalculate some stall conditions, which change only during compaction,
+ // when adding a new memtable, and/or on recalculation of the compaction
+ // score.
+ WriteStallCondition RecalculateWriteStallConditions(
+ const MutableCFOptions& mutable_cf_options);
+
+ void set_initialized() { initialized_.store(true); }
+
+ bool initialized() const { return initialized_.load(); }
+
+ const ColumnFamilyOptions& initial_cf_options() {
+ return initial_cf_options_;
+ }
+
+ Env::WriteLifeTimeHint CalculateSSTWriteHint(int level);
+
+ // created_dirs remembers the directories already created, so that we don't
+ // need to repeat the same directory creation operation.
+ Status AddDirectories(
+ std::map<std::string, std::shared_ptr<FSDirectory>>* created_dirs);
+
+ FSDirectory* GetDataDir(size_t path_id) const;
+
+ // full_history_ts_low_ can only increase.
+ void SetFullHistoryTsLow(std::string ts_low) {
+ assert(!ts_low.empty());
+ const Comparator* ucmp = user_comparator();
+ assert(ucmp);
+ if (full_history_ts_low_.empty() ||
+ ucmp->CompareTimestamp(ts_low, full_history_ts_low_) > 0) {
+ full_history_ts_low_ = std::move(ts_low);
+ }
+ }
+
+ const std::string& GetFullHistoryTsLow() const {
+ return full_history_ts_low_;
+ }
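+ //
+ // Example for SetFullHistoryTsLow()/GetFullHistoryTsLow() above (a
+ // hypothetical sketch; assumes the comparator uses an 8-byte timestamp
+ // encoded with PutFixed64):
+ //
+ //   std::string ts_low;
+ //   PutFixed64(&ts_low, 100);  // encode timestamp 100
+ //   cfd->SetFullHistoryTsLow(ts_low);
+ //   assert(cfd->GetFullHistoryTsLow() == ts_low);
+ //   // Calling again with a smaller timestamp is a no-op, since
+ //   // full_history_ts_low_ can only increase.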
+
+ ThreadLocalPtr* TEST_GetLocalSV() { return local_sv_.get(); }
+ WriteBufferManager* write_buffer_mgr() { return write_buffer_manager_; }
+ std::shared_ptr<CacheReservationManager>
+ GetFileMetadataCacheReservationManager() {
+ return file_metadata_cache_res_mgr_;
+ }
+
+ SequenceNumber GetFirstMemtableSequenceNumber() const;
+
+ static const uint32_t kDummyColumnFamilyDataId;
+
+ // Keep track of whether the mempurge feature was ever used.
+ void SetMempurgeUsed() { mempurge_used_ = true; }
+ bool GetMempurgeUsed() { return mempurge_used_; }
+
+ private:
+ friend class ColumnFamilySet;
+ ColumnFamilyData(uint32_t id, const std::string& name,
+ Version* dummy_versions, Cache* table_cache,
+ WriteBufferManager* write_buffer_manager,
+ const ColumnFamilyOptions& options,
+ const ImmutableDBOptions& db_options,
+ const FileOptions* file_options,
+ ColumnFamilySet* column_family_set,
+ BlockCacheTracer* const block_cache_tracer,
+ const std::shared_ptr<IOTracer>& io_tracer,
+ const std::string& db_id, const std::string& db_session_id);
+
+ std::vector<std::string> GetDbPaths() const;
+
+ uint32_t id_;
+ const std::string name_;
+ Version* dummy_versions_; // Head of circular doubly-linked list of versions.
+ Version* current_; // == dummy_versions->prev_
+
+ std::atomic<int> refs_; // outstanding references to ColumnFamilyData
+ std::atomic<bool> initialized_;
+ std::atomic<bool> dropped_; // true if client dropped it
+
+ const InternalKeyComparator internal_comparator_;
+ IntTblPropCollectorFactories int_tbl_prop_collector_factories_;
+
+ const ColumnFamilyOptions initial_cf_options_;
+ const ImmutableOptions ioptions_;
+ MutableCFOptions mutable_cf_options_;
+
+ const bool is_delete_range_supported_;
+
+ std::unique_ptr<TableCache> table_cache_;
+ std::unique_ptr<BlobFileCache> blob_file_cache_;
+ std::unique_ptr<BlobSource> blob_source_;
+
+ std::unique_ptr<InternalStats> internal_stats_;
+
+ WriteBufferManager* write_buffer_manager_;
+
+ MemTable* mem_;
+ MemTableList imm_;
+ SuperVersion* super_version_;
+
+ // An ordinal representing the current SuperVersion. Updated by
+ // InstallSuperVersion(), i.e. incremented every time super_version_
+ // changes.
+ std::atomic<uint64_t> super_version_number_;
+
+ // Thread's local copy of SuperVersion pointer
+ // This needs to be destructed before mutex_
+ std::unique_ptr<ThreadLocalPtr> local_sv_;
+
+ // pointers for a circular linked list. we use it to support iterations over
+ // all column families that are alive (note: dropped column families can also
+ // be alive as long as client holds a reference)
+ ColumnFamilyData* next_;
+ ColumnFamilyData* prev_;
+
+ // This is the earliest log file number that contains data from this
+ // Column Family. All earlier log files must be ignored and not
+ // recovered from
+ uint64_t log_number_;
+
+ std::atomic<FlushReason> flush_reason_;
+
+ // An object that keeps all the compaction stats
+ // and picks the next compaction
+ std::unique_ptr<CompactionPicker> compaction_picker_;
+
+ ColumnFamilySet* column_family_set_;
+
+ std::unique_ptr<WriteControllerToken> write_controller_token_;
+
+ // If true --> this ColumnFamily is currently present in DBImpl::flush_queue_
+ bool queued_for_flush_;
+
+ // If true --> this ColumnFamily is currently present in
+ // DBImpl::compaction_queue_
+ bool queued_for_compaction_;
+
+ uint64_t prev_compaction_needed_bytes_;
+
+ // if the database was opened with 2pc enabled
+ bool allow_2pc_;
+
+ // Memtable id to track flush.
+ std::atomic<uint64_t> last_memtable_id_;
+
+ // Directories corresponding to cf_paths.
+ std::vector<std::shared_ptr<FSDirectory>> data_dirs_;
+
+ bool db_paths_registered_;
+
+ std::string full_history_ts_low_;
+
+ // For charging memory usage of file metadata created for newly added files to
+ // a Version associated with this CFD
+ std::shared_ptr<CacheReservationManager> file_metadata_cache_res_mgr_;
+ bool mempurge_used_;
+};
+
+// ColumnFamilySet has interesting thread-safety requirements
+// * CreateColumnFamily() or RemoveColumnFamily() -- need to be protected by DB
+// mutex AND executed in the write thread.
+// CreateColumnFamily() should ONLY be called from VersionSet::LogAndApply() AND
+// single-threaded write thread. It is also called during Recovery and in
+// DumpManifest().
+// RemoveColumnFamily() is only called from SetDropped(). DB mutex needs to be
+// held and it needs to be executed from the write thread. SetDropped() also
+// guarantees that it will be called only from single-threaded LogAndApply(),
+// but this condition is not that important.
+// * Iteration -- hold DB mutex. If you want to release the DB mutex in the
+// body of the iteration, wrap in a RefedColumnFamilySet.
+// * GetDefault() -- thread safe
+// * GetColumnFamily() -- either inside of DB mutex or from a write thread
+// * GetNextColumnFamilyID(), GetMaxColumnFamily(), UpdateMaxColumnFamily(),
+// NumberOfColumnFamilies -- inside of DB mutex
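+//
+// Iteration example (a hypothetical sketch; `column_family_set` is a
+// ColumnFamilySet* and the DB mutex is held for the whole loop):
+//
+//   for (ColumnFamilyData* cfd : *column_family_set) {
+//     if (cfd->IsDropped()) {
+//       continue;
+//     }
+//     // inspect cfd->GetName(), cfd->current(), ...
+//   }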
+class ColumnFamilySet {
+ public:
+ // ColumnFamilySet supports iteration
+ class iterator {
+ public:
+ explicit iterator(ColumnFamilyData* cfd) : current_(cfd) {}
+ // NOTE: minimum operators for for-loop iteration
+ iterator& operator++() {
+ current_ = current_->next_;
+ return *this;
+ }
+ bool operator!=(const iterator& other) const {
+ return this->current_ != other.current_;
+ }
+ ColumnFamilyData* operator*() { return current_; }
+
+ private:
+ ColumnFamilyData* current_;
+ };
+
+ ColumnFamilySet(const std::string& dbname,
+ const ImmutableDBOptions* db_options,
+ const FileOptions& file_options, Cache* table_cache,
+ WriteBufferManager* _write_buffer_manager,
+ WriteController* _write_controller,
+ BlockCacheTracer* const block_cache_tracer,
+ const std::shared_ptr<IOTracer>& io_tracer,
+ const std::string& db_id, const std::string& db_session_id);
+ ~ColumnFamilySet();
+
+ ColumnFamilyData* GetDefault() const;
+ // GetColumnFamily() calls return nullptr if column family is not found
+ ColumnFamilyData* GetColumnFamily(uint32_t id) const;
+ ColumnFamilyData* GetColumnFamily(const std::string& name) const;
+ // This call returns the next available column family ID. It guarantees that
+ // no column family with an ID greater than or equal to the returned value
+ // exists in the currently running instance or at any point in this RocksDB
+ // instance's history.
+ uint32_t GetNextColumnFamilyID();
+ uint32_t GetMaxColumnFamily();
+ void UpdateMaxColumnFamily(uint32_t new_max_column_family);
+ size_t NumberOfColumnFamilies() const;
+
+ ColumnFamilyData* CreateColumnFamily(const std::string& name, uint32_t id,
+ Version* dummy_version,
+ const ColumnFamilyOptions& options);
+
+ iterator begin() { return iterator(dummy_cfd_->next_); }
+ iterator end() { return iterator(dummy_cfd_); }
+
+ Cache* get_table_cache() { return table_cache_; }
+
+ WriteBufferManager* write_buffer_manager() { return write_buffer_manager_; }
+
+ WriteController* write_controller() { return write_controller_; }
+
+ private:
+ friend class ColumnFamilyData;
+ // helper function that gets called from cfd destructor
+ // REQUIRES: DB mutex held
+ void RemoveColumnFamily(ColumnFamilyData* cfd);
+
+ // column_families_ and column_family_data_ need to be protected:
+ // * when mutating, both conditions have to be satisfied:
+ // 1. DB mutex locked
+ // 2. thread currently in single-threaded write thread
+ // * when reading, at least one condition needs to be satisfied:
+ // 1. DB mutex locked
+ // 2. accessed from a single-threaded write thread
+ UnorderedMap<std::string, uint32_t> column_families_;
+ UnorderedMap<uint32_t, ColumnFamilyData*> column_family_data_;
+
+ uint32_t max_column_family_;
+ const FileOptions file_options_;
+
+ ColumnFamilyData* dummy_cfd_;
+ // We don't hold the refcount here, since the default column family always
+ // exists. We are also not responsible for cleaning up default_cfd_cache_.
+ // This is just a cache that makes the common case (accessing the default
+ // column family) faster.
+ ColumnFamilyData* default_cfd_cache_;
+
+ const std::string db_name_;
+ const ImmutableDBOptions* const db_options_;
+ Cache* table_cache_;
+ WriteBufferManager* write_buffer_manager_;
+ WriteController* write_controller_;
+ BlockCacheTracer* const block_cache_tracer_;
+ std::shared_ptr<IOTracer> io_tracer_;
+ const std::string& db_id_;
+ std::string db_session_id_;
+};
+
+// A wrapper for ColumnFamilySet that supports releasing DB mutex during each
+// iteration over the iterator, because the cfd is Refed and Unrefed during
+// each iteration to prevent concurrent CF drop from destroying it (until
+// Unref).
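+//
+// Hypothetical usage sketch (`mutex` is the DB mutex, already held on entry,
+// and `column_family_set` is the underlying ColumnFamilySet*):
+//
+//   for (ColumnFamilyData* cfd : RefedColumnFamilySet(column_family_set)) {
+//     mutex->Unlock();
+//     // ... slow work on cfd without the DB mutex; the iterator's reference
+//     // keeps cfd alive even if the column family is dropped concurrently ...
+//     mutex->Lock();
+//   }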
+class RefedColumnFamilySet {
+ public:
+ explicit RefedColumnFamilySet(ColumnFamilySet* cfs) : wrapped_(cfs) {}
+
+ class iterator {
+ public:
+ explicit iterator(ColumnFamilySet::iterator wrapped) : wrapped_(wrapped) {
+ MaybeRef(*wrapped_);
+ }
+ ~iterator() { MaybeUnref(*wrapped_); }
+ inline void MaybeRef(ColumnFamilyData* cfd) {
+ if (cfd->GetID() != ColumnFamilyData::kDummyColumnFamilyDataId) {
+ cfd->Ref();
+ }
+ }
+ inline void MaybeUnref(ColumnFamilyData* cfd) {
+ if (cfd->GetID() != ColumnFamilyData::kDummyColumnFamilyDataId) {
+ cfd->UnrefAndTryDelete();
+ }
+ }
+ // NOTE: minimum operators for for-loop iteration
+ inline iterator& operator++() {
+ ColumnFamilyData* old = *wrapped_;
+ ++wrapped_;
+ // Can only unref & potentially free cfd after accessing its next_
+ MaybeUnref(old);
+ MaybeRef(*wrapped_);
+ return *this;
+ }
+ inline bool operator!=(const iterator& other) const {
+ return this->wrapped_ != other.wrapped_;
+ }
+ inline ColumnFamilyData* operator*() { return *wrapped_; }
+
+ private:
+ ColumnFamilySet::iterator wrapped_;
+ };
+
+ iterator begin() { return iterator(wrapped_->begin()); }
+ iterator end() { return iterator(wrapped_->end()); }
+
+ private:
+ ColumnFamilySet* wrapped_;
+};
+
+// We use ColumnFamilyMemTablesImpl to provide WriteBatch a way to access
+// memtables of different column families (specified by ID in the write batch)
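+//
+// Hypothetical write-path sketch (`cf_mems` is the ColumnFamilyMemTablesImpl
+// owned by DBImpl; the caller is on the write thread or holds the DB mutex):
+//
+//   if (cf_mems->Seek(column_family_id)) {
+//     MemTable* mem = cf_mems->GetMemTable();
+//     // ... WriteBatch replay inserts the entry into `mem` ...
+//   } else {
+//     // Unknown column family; the write fails with InvalidArgument unless
+//     // WriteOptions::ignore_missing_column_families is set.
+//   }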
+class ColumnFamilyMemTablesImpl : public ColumnFamilyMemTables {
+ public:
+ explicit ColumnFamilyMemTablesImpl(ColumnFamilySet* column_family_set)
+ : column_family_set_(column_family_set), current_(nullptr) {}
+
+ // Constructs a ColumnFamilyMemTablesImpl equivalent to one constructed
+ // with the arguments used to construct *orig.
+ explicit ColumnFamilyMemTablesImpl(ColumnFamilyMemTablesImpl* orig)
+ : column_family_set_(orig->column_family_set_), current_(nullptr) {}
+
+ // Sets current_ to the ColumnFamilyData with the given column_family_id.
+ // Returns false if the column family doesn't exist.
+ // REQUIRES: calls through DBImpl::column_family_memtables_ must be made
+ // either under the DB mutex OR from the write thread
+ bool Seek(uint32_t column_family_id) override;
+
+ // Returns log number of the selected column family
+ // REQUIRES: under a DB mutex OR from a write thread
+ uint64_t GetLogNumber() const override;
+
+ // REQUIRES: Seek() called first
+ // REQUIRES: calls through DBImpl::column_family_memtables_ must be made
+ // either under the DB mutex OR from the write thread
+ virtual MemTable* GetMemTable() const override;
+
+ // Returns column family handle for the selected column family
+ // REQUIRES: calls through DBImpl::column_family_memtables_ must be made
+ // either under the DB mutex OR from the write thread
+ virtual ColumnFamilyHandle* GetColumnFamilyHandle() override;
+
+ // Cannot be called while another thread is calling Seek().
+ // REQUIRES: calls through DBImpl::column_family_memtables_ must be made
+ // either under the DB mutex OR from the write thread
+ virtual ColumnFamilyData* current() override { return current_; }
+
+ private:
+ ColumnFamilySet* column_family_set_;
+ ColumnFamilyData* current_;
+ ColumnFamilyHandleInternal handle_;
+};
+
+extern uint32_t GetColumnFamilyID(ColumnFamilyHandle* column_family);
+
+extern const Comparator* GetColumnFamilyUserComparator(
+ ColumnFamilyHandle* column_family);
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/column_family_test.cc b/src/rocksdb/db/column_family_test.cc
new file mode 100644
index 000000000..d33cbe50a
--- /dev/null
+++ b/src/rocksdb/db/column_family_test.cc
@@ -0,0 +1,3453 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <algorithm>
+#include <string>
+#include <thread>
+#include <vector>
+
+#include "db/db_impl/db_impl.h"
+#include "db/db_test_util.h"
+#include "options/options_parser.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/utilities/object_registry.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/coding.h"
+#include "util/string_util.h"
+#include "utilities/fault_injection_env.h"
+#include "utilities/merge_operators.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+static const int kValueSize = 1000;
+
+// counts how many operations were performed
+class EnvCounter : public SpecialEnv {
+ public:
+ explicit EnvCounter(Env* base)
+ : SpecialEnv(base), num_new_writable_file_(0) {}
+ int GetNumberOfNewWritableFileCalls() { return num_new_writable_file_; }
+ Status NewWritableFile(const std::string& f, std::unique_ptr<WritableFile>* r,
+ const EnvOptions& soptions) override {
+ ++num_new_writable_file_;
+ return EnvWrapper::NewWritableFile(f, r, soptions);
+ }
+
+ private:
+ std::atomic<int> num_new_writable_file_;
+};
+
+class ColumnFamilyTestBase : public testing::Test {
+ public:
+ explicit ColumnFamilyTestBase(uint32_t format) : rnd_(139), format_(format) {
+ Env* base_env = Env::Default();
+ EXPECT_OK(
+ test::CreateEnvFromSystem(ConfigOptions(), &base_env, &env_guard_));
+ EXPECT_NE(nullptr, base_env);
+ env_ = new EnvCounter(base_env);
+ env_->skip_fsync_ = true;
+ dbname_ = test::PerThreadDBPath("column_family_test");
+ db_options_.create_if_missing = true;
+ db_options_.fail_if_options_file_error = true;
+ db_options_.env = env_;
+ EXPECT_OK(DestroyDB(dbname_, Options(db_options_, column_family_options_)));
+ }
+
+ ~ColumnFamilyTestBase() override {
+ std::vector<ColumnFamilyDescriptor> column_families;
+ for (auto h : handles_) {
+ ColumnFamilyDescriptor cfdescriptor;
+ Status s = h->GetDescriptor(&cfdescriptor);
+#ifdef ROCKSDB_LITE
+ EXPECT_TRUE(s.IsNotSupported());
+#else
+ EXPECT_OK(s);
+#endif // ROCKSDB_LITE
+ column_families.push_back(cfdescriptor);
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ Destroy(column_families);
+ delete env_;
+ }
+
+ BlockBasedTableOptions GetBlockBasedTableOptions() {
+ BlockBasedTableOptions options;
+ options.format_version = format_;
+ return options;
+ }
+
+ // Return the value to associate with the specified key
+ Slice Value(int k, std::string* storage) {
+ if (k == 0) {
+ // Ugh. Random seed of 0 used to produce no entropy. This code
+ // preserves the implementation that was in place when all of the
+ // magic values in this file were picked.
+ *storage = std::string(kValueSize, ' ');
+ } else {
+ Random r(k);
+ *storage = r.RandomString(kValueSize);
+ }
+ return Slice(*storage);
+ }
+
+ void Build(int base, int n, int flush_every = 0) {
+ std::string key_space, value_space;
+ WriteBatch batch;
+
+ for (int i = 0; i < n; i++) {
+ if (flush_every != 0 && i != 0 && i % flush_every == 0) {
+ DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
+ ASSERT_OK(dbi->TEST_FlushMemTable());
+ }
+
+ int keyi = base + i;
+ Slice key(DBTestBase::Key(keyi));
+
+ batch.Clear();
+ batch.Put(handles_[0], key, Value(keyi, &value_space));
+ batch.Put(handles_[1], key, Value(keyi, &value_space));
+ batch.Put(handles_[2], key, Value(keyi, &value_space));
+ ASSERT_OK(db_->Write(WriteOptions(), &batch));
+ }
+ }
+
+ void CheckMissed() {
+ uint64_t next_expected = 0;
+ uint64_t missed = 0;
+ int bad_keys = 0;
+ int bad_values = 0;
+ int correct = 0;
+ std::string value_space;
+ for (int cf = 0; cf < 3; cf++) {
+ next_expected = 0;
+ Iterator* iter = db_->NewIterator(ReadOptions(false, true), handles_[cf]);
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ uint64_t key;
+ Slice in(iter->key());
+ in.remove_prefix(3);
+ if (!ConsumeDecimalNumber(&in, &key) || !in.empty() ||
+ key < next_expected) {
+ bad_keys++;
+ continue;
+ }
+ missed += (key - next_expected);
+ next_expected = key + 1;
+ if (iter->value() != Value(static_cast<int>(key), &value_space)) {
+ bad_values++;
+ } else {
+ correct++;
+ }
+ }
+ delete iter;
+ }
+
+ ASSERT_EQ(0, bad_keys);
+ ASSERT_EQ(0, bad_values);
+ ASSERT_EQ(0, missed);
+ (void)correct;
+ }
+
+ void Close() {
+ for (auto h : handles_) {
+ if (h) {
+ ASSERT_OK(db_->DestroyColumnFamilyHandle(h));
+ }
+ }
+ handles_.clear();
+ names_.clear();
+ delete db_;
+ db_ = nullptr;
+ }
+
+ Status TryOpen(std::vector<std::string> cf,
+ std::vector<ColumnFamilyOptions> options = {}) {
+ std::vector<ColumnFamilyDescriptor> column_families;
+ names_.clear();
+ for (size_t i = 0; i < cf.size(); ++i) {
+ column_families.emplace_back(
+ cf[i], options.size() == 0 ? column_family_options_ : options[i]);
+ names_.push_back(cf[i]);
+ }
+ return DB::Open(db_options_, dbname_, column_families, &handles_, &db_);
+ }
+
+ Status OpenReadOnly(std::vector<std::string> cf,
+ std::vector<ColumnFamilyOptions> options = {}) {
+ std::vector<ColumnFamilyDescriptor> column_families;
+ names_.clear();
+ for (size_t i = 0; i < cf.size(); ++i) {
+ column_families.emplace_back(
+ cf[i], options.size() == 0 ? column_family_options_ : options[i]);
+ names_.push_back(cf[i]);
+ }
+ return DB::OpenForReadOnly(db_options_, dbname_, column_families, &handles_,
+ &db_);
+ }
+
+#ifndef ROCKSDB_LITE // ReadOnlyDB is not supported
+ void AssertOpenReadOnly(std::vector<std::string> cf,
+ std::vector<ColumnFamilyOptions> options = {}) {
+ ASSERT_OK(OpenReadOnly(cf, options));
+ }
+#endif // !ROCKSDB_LITE
+
+ void Open(std::vector<std::string> cf,
+ std::vector<ColumnFamilyOptions> options = {}) {
+ ASSERT_OK(TryOpen(cf, options));
+ }
+
+ void Open() { Open({"default"}); }
+
+ DBImpl* dbfull() { return static_cast_with_check<DBImpl>(db_); }
+
+ int GetProperty(int cf, std::string property) {
+ std::string value;
+ EXPECT_TRUE(dbfull()->GetProperty(handles_[cf], property, &value));
+#ifndef CYGWIN
+ return std::stoi(value);
+#else
+ return std::strtol(value.c_str(), 0 /* off */, 10 /* base */);
+#endif
+ }
+
+ bool IsDbWriteStopped() {
+#ifndef ROCKSDB_LITE
+ uint64_t v;
+ EXPECT_TRUE(dbfull()->GetIntProperty("rocksdb.is-write-stopped", &v));
+ return (v == 1);
+#else
+ return dbfull()->TEST_write_controler().IsStopped();
+#endif // !ROCKSDB_LITE
+ }
+
+ uint64_t GetDbDelayedWriteRate() {
+#ifndef ROCKSDB_LITE
+ uint64_t v;
+ EXPECT_TRUE(
+ dbfull()->GetIntProperty("rocksdb.actual-delayed-write-rate", &v));
+ return v;
+#else
+ if (!dbfull()->TEST_write_controler().NeedsDelay()) {
+ return 0;
+ }
+ return dbfull()->TEST_write_controler().delayed_write_rate();
+#endif // !ROCKSDB_LITE
+ }
+
+ void Destroy(const std::vector<ColumnFamilyDescriptor>& column_families =
+ std::vector<ColumnFamilyDescriptor>()) {
+ Close();
+ ASSERT_OK(DestroyDB(dbname_, Options(db_options_, column_family_options_),
+ column_families));
+ }
+
+ void CreateColumnFamilies(
+ const std::vector<std::string>& cfs,
+ const std::vector<ColumnFamilyOptions> options = {}) {
+ int cfi = static_cast<int>(handles_.size());
+ handles_.resize(cfi + cfs.size());
+ names_.resize(cfi + cfs.size());
+ for (size_t i = 0; i < cfs.size(); ++i) {
+ const auto& current_cf_opt =
+ options.size() == 0 ? column_family_options_ : options[i];
+ ASSERT_OK(
+ db_->CreateColumnFamily(current_cf_opt, cfs[i], &handles_[cfi]));
+ names_[cfi] = cfs[i];
+
+#ifndef ROCKSDB_LITE // RocksDBLite does not support GetDescriptor
+ // Verify the CF options of the returned CF handle.
+ ColumnFamilyDescriptor desc;
+ ASSERT_OK(handles_[cfi]->GetDescriptor(&desc));
+ // Need to sanitize the default column family options before comparing
+ // them.
+ ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(
+ ConfigOptions(), desc.options,
+ SanitizeOptions(dbfull()->immutable_db_options(), current_cf_opt)));
+#endif // !ROCKSDB_LITE
+ cfi++;
+ }
+ }
+
+ void Reopen(const std::vector<ColumnFamilyOptions> options = {}) {
+ std::vector<std::string> names;
+ for (auto name : names_) {
+ if (name != "") {
+ names.push_back(name);
+ }
+ }
+ Close();
+ assert(options.size() == 0 || names.size() == options.size());
+ Open(names, options);
+ }
+
+ void CreateColumnFamiliesAndReopen(const std::vector<std::string>& cfs) {
+ CreateColumnFamilies(cfs);
+ Reopen();
+ }
+
+ void DropColumnFamilies(const std::vector<int>& cfs) {
+ for (auto cf : cfs) {
+ ASSERT_OK(db_->DropColumnFamily(handles_[cf]));
+ ASSERT_OK(db_->DestroyColumnFamilyHandle(handles_[cf]));
+ handles_[cf] = nullptr;
+ names_[cf] = "";
+ }
+ }
+
+ void PutRandomData(int cf, int num, int key_value_size, bool save = false) {
+ if (cf >= static_cast<int>(keys_.size())) {
+ keys_.resize(cf + 1);
+ }
+ for (int i = 0; i < num; ++i) {
+ // 10 bytes for key, rest is value
+ if (!save) {
+ ASSERT_OK(Put(cf, test::RandomKey(&rnd_, 11),
+ rnd_.RandomString(key_value_size - 10)));
+ } else {
+ std::string key = test::RandomKey(&rnd_, 11);
+ keys_[cf].insert(key);
+ ASSERT_OK(Put(cf, key, rnd_.RandomString(key_value_size - 10)));
+ }
+ }
+ ASSERT_OK(db_->FlushWAL(/*sync=*/false));
+ }
+
+#ifndef ROCKSDB_LITE // TEST functions in DB are not supported in lite
+ void WaitForFlush(int cf) {
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[cf]));
+ }
+
+ void WaitForCompaction() { ASSERT_OK(dbfull()->TEST_WaitForCompact()); }
+
+ uint64_t MaxTotalInMemoryState() {
+ return dbfull()->TEST_MaxTotalInMemoryState();
+ }
+
+ void AssertMaxTotalInMemoryState(uint64_t value) {
+ ASSERT_EQ(value, MaxTotalInMemoryState());
+ }
+#endif // !ROCKSDB_LITE
+
+ Status Put(int cf, const std::string& key, const std::string& value) {
+ return db_->Put(WriteOptions(), handles_[cf], Slice(key), Slice(value));
+ }
+ Status Merge(int cf, const std::string& key, const std::string& value) {
+ return db_->Merge(WriteOptions(), handles_[cf], Slice(key), Slice(value));
+ }
+ Status Flush(int cf) { return db_->Flush(FlushOptions(), handles_[cf]); }
+
+ std::string Get(int cf, const std::string& key) {
+ ReadOptions options;
+ options.verify_checksums = true;
+ std::string result;
+ Status s = db_->Get(options, handles_[cf], Slice(key), &result);
+ if (s.IsNotFound()) {
+ result = "NOT_FOUND";
+ } else if (!s.ok()) {
+ result = s.ToString();
+ }
+ return result;
+ }
+
+ void CompactAll(int cf) {
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), handles_[cf], nullptr,
+ nullptr));
+ }
+
+ void Compact(int cf, const Slice& start, const Slice& limit) {
+ ASSERT_OK(
+ db_->CompactRange(CompactRangeOptions(), handles_[cf], &start, &limit));
+ }
+
+ int NumTableFilesAtLevel(int level, int cf) {
+ return GetProperty(cf,
+ "rocksdb.num-files-at-level" + std::to_string(level));
+ }
+
+#ifndef ROCKSDB_LITE
+ // Return spread of files per level
+ std::string FilesPerLevel(int cf) {
+ std::string result;
+ int last_non_zero_offset = 0;
+ for (int level = 0; level < dbfull()->NumberLevels(handles_[cf]); level++) {
+ int f = NumTableFilesAtLevel(level, cf);
+ char buf[100];
+ snprintf(buf, sizeof(buf), "%s%d", (level ? "," : ""), f);
+ result += buf;
+ if (f > 0) {
+ last_non_zero_offset = static_cast<int>(result.size());
+ }
+ }
+ result.resize(last_non_zero_offset);
+ return result;
+ }
+#endif
+
+ void AssertFilesPerLevel(const std::string& value, int cf) {
+#ifndef ROCKSDB_LITE
+ ASSERT_EQ(value, FilesPerLevel(cf));
+#else
+ (void)value;
+ (void)cf;
+#endif
+ }
+
+#ifndef ROCKSDB_LITE // GetLiveFilesMetaData is not supported
+ int CountLiveFiles() {
+ std::vector<LiveFileMetaData> metadata;
+ db_->GetLiveFilesMetaData(&metadata);
+ return static_cast<int>(metadata.size());
+ }
+#endif // !ROCKSDB_LITE
+
+ void AssertCountLiveFiles(int expected_value) {
+#ifndef ROCKSDB_LITE
+ ASSERT_EQ(expected_value, CountLiveFiles());
+#else
+ (void)expected_value;
+#endif
+ }
+
+ // Do n memtable flushes, each of which produces an sstable
+ // covering the range [small,large].
+ void MakeTables(int cf, int n, const std::string& small,
+ const std::string& large) {
+ for (int i = 0; i < n; i++) {
+ ASSERT_OK(Put(cf, small, "begin"));
+ ASSERT_OK(Put(cf, large, "end"));
+ ASSERT_OK(db_->Flush(FlushOptions(), handles_[cf]));
+ }
+ }
+
+#ifndef ROCKSDB_LITE // GetSortedWalFiles is not supported
+ int CountLiveLogFiles() {
+ int micros_wait_for_log_deletion = 20000;
+ env_->SleepForMicroseconds(micros_wait_for_log_deletion);
+ int ret = 0;
+ VectorLogPtr wal_files;
+ Status s;
+ // GetSortedWalFiles is a flaky function -- it gets all the wal_dir
+ // children files and then later checks for their existence. If some of the
+ // log files don't exist anymore, it reports an error. It does all of this
+ // without the DB mutex held, so if a background process deletes a log file
+ // while the function is being executed, it returns an error. We retry the
+ // function up to 10 times to avoid this error failing the test.
+ for (int retries = 0; retries < 10; ++retries) {
+ wal_files.clear();
+ s = db_->GetSortedWalFiles(wal_files);
+ if (s.ok()) {
+ break;
+ }
+ }
+ EXPECT_OK(s);
+ for (const auto& wal : wal_files) {
+ if (wal->Type() == kAliveLogFile) {
+ ++ret;
+ }
+ }
+ return ret;
+ }
+#endif // !ROCKSDB_LITE
+
+ void AssertCountLiveLogFiles(int value) {
+#ifndef ROCKSDB_LITE // GetSortedWalFiles is not supported
+ ASSERT_EQ(value, CountLiveLogFiles());
+#else
+ (void)value;
+#endif // !ROCKSDB_LITE
+ }
+
+ void AssertNumberOfImmutableMemtables(std::vector<int> num_per_cf) {
+ assert(num_per_cf.size() == handles_.size());
+
+#ifndef ROCKSDB_LITE // GetProperty is not supported in lite
+ for (size_t i = 0; i < num_per_cf.size(); ++i) {
+ ASSERT_EQ(num_per_cf[i], GetProperty(static_cast<int>(i),
+ "rocksdb.num-immutable-mem-table"));
+ }
+#endif // !ROCKSDB_LITE
+ }
+
+ void CopyFile(const std::string& source, const std::string& destination,
+ uint64_t size = 0) {
+ const EnvOptions soptions;
+ std::unique_ptr<SequentialFile> srcfile;
+ ASSERT_OK(env_->NewSequentialFile(source, &srcfile, soptions));
+ std::unique_ptr<WritableFile> destfile;
+ ASSERT_OK(env_->NewWritableFile(destination, &destfile, soptions));
+
+ if (size == 0) {
+ // default argument means copy everything
+ ASSERT_OK(env_->GetFileSize(source, &size));
+ }
+
+ char buffer[4096];
+ Slice slice;
+ while (size > 0) {
+ uint64_t one = std::min(uint64_t(sizeof(buffer)), size);
+ ASSERT_OK(srcfile->Read(one, &slice, buffer));
+ ASSERT_OK(destfile->Append(slice));
+ size -= slice.size();
+ }
+ ASSERT_OK(destfile->Close());
+ }
+
+ int GetSstFileCount(std::string path) {
+ std::vector<std::string> files;
+ DBTestBase::GetSstFiles(env_, path, &files);
+ return static_cast<int>(files.size());
+ }
+
+ void RecalculateWriteStallConditions(
+ ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options) {
+ // add lock to avoid race condition between
+ // `RecalculateWriteStallConditions` which writes to CFStats and
+ // background `DBImpl::DumpStats()` threads which read CFStats
+ dbfull()->TEST_LockMutex();
+ cfd->RecalculateWriteStallConditions(mutable_cf_options);
+ dbfull()->TEST_UnlockMutex();
+ }
+
+ std::vector<ColumnFamilyHandle*> handles_;
+ std::vector<std::string> names_;
+ std::vector<std::set<std::string>> keys_;
+ ColumnFamilyOptions column_family_options_;
+ DBOptions db_options_;
+ std::string dbname_;
+ DB* db_ = nullptr;
+ EnvCounter* env_;
+ std::shared_ptr<Env> env_guard_;
+ Random rnd_;
+ uint32_t format_;
+};
+
+class ColumnFamilyTest
+ : public ColumnFamilyTestBase,
+ virtual public ::testing::WithParamInterface<uint32_t> {
+ public:
+ ColumnFamilyTest() : ColumnFamilyTestBase(GetParam()) {}
+};
+
+INSTANTIATE_TEST_CASE_P(FormatDef, ColumnFamilyTest,
+ testing::Values(test::kDefaultFormatVersion));
+INSTANTIATE_TEST_CASE_P(FormatLatest, ColumnFamilyTest,
+ testing::Values(kLatestFormatVersion));
+
+TEST_P(ColumnFamilyTest, DontReuseColumnFamilyID) {
+ for (int iter = 0; iter < 3; ++iter) {
+ Open();
+ CreateColumnFamilies({"one", "two", "three"});
+ for (size_t i = 0; i < handles_.size(); ++i) {
+ auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(handles_[i]);
+ ASSERT_EQ(i, cfh->GetID());
+ }
+ if (iter == 1) {
+ Reopen();
+ }
+ DropColumnFamilies({3});
+ Reopen();
+ if (iter == 2) {
+ // this tests if max_column_family is correctly persisted with
+ // WriteSnapshot()
+ Reopen();
+ }
+ CreateColumnFamilies({"three2"});
+ // ID 3 that was used for dropped column family "three" should not be
+ // reused
+ auto cfh3 = static_cast_with_check<ColumnFamilyHandleImpl>(handles_[3]);
+ ASSERT_EQ(4U, cfh3->GetID());
+ Close();
+ Destroy();
+ }
+}
+
+#ifndef ROCKSDB_LITE
+TEST_P(ColumnFamilyTest, CreateCFRaceWithGetAggProperty) {
+ Open();
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::WriteOptionsFile:1",
+ "ColumnFamilyTest.CreateCFRaceWithGetAggProperty:1"},
+ {"ColumnFamilyTest.CreateCFRaceWithGetAggProperty:2",
+ "DBImpl::WriteOptionsFile:2"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ ROCKSDB_NAMESPACE::port::Thread thread(
+ [&] { CreateColumnFamilies({"one"}); });
+
+ TEST_SYNC_POINT("ColumnFamilyTest.CreateCFRaceWithGetAggProperty:1");
+ uint64_t pv;
+ db_->GetAggregatedIntProperty(DB::Properties::kEstimateTableReadersMem, &pv);
+ TEST_SYNC_POINT("ColumnFamilyTest.CreateCFRaceWithGetAggProperty:2");
+
+ thread.join();
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+#endif // !ROCKSDB_LITE
+
+class FlushEmptyCFTestWithParam
+ : public ColumnFamilyTestBase,
+ virtual public testing::WithParamInterface<std::tuple<uint32_t, bool>> {
+ public:
+ FlushEmptyCFTestWithParam()
+ : ColumnFamilyTestBase(std::get<0>(GetParam())),
+ allow_2pc_(std::get<1>(GetParam())) {}
+
+ // Required if inheriting from testing::WithParamInterface<>
+ static void SetUpTestCase() {}
+ static void TearDownTestCase() {}
+
+ bool allow_2pc_;
+};
+
+TEST_P(FlushEmptyCFTestWithParam, FlushEmptyCFTest) {
+ std::unique_ptr<FaultInjectionTestEnv> fault_env(
+ new FaultInjectionTestEnv(env_));
+ db_options_.env = fault_env.get();
+ db_options_.allow_2pc = allow_2pc_;
+ Open();
+ CreateColumnFamilies({"one", "two"});
+ // Generate log file A.
+ ASSERT_OK(Put(1, "foo", "v1")); // seqID 1
+
+ Reopen();
+ // Log file A is not dropped after reopening because default column family's
+ // min log number is 0.
+ // It flushes to SST file X
+ ASSERT_OK(Put(1, "foo", "v1")); // seqID 2
+ ASSERT_OK(Put(1, "bar", "v2")); // seqID 3
+ // Current log file is file B now. While flushing, a new log file C is created
+ // and is set to current. Both CFs' min log number is set to file C in memory,
+ // so after flushing, file B is deleted. At the same time, the min log number
+ // of the default CF is not written to the manifest. Log file A still remains.
+ // Flushed to SST file Y.
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(Flush(0));
+ ASSERT_OK(Put(1, "bar", "v3")); // seqID 4
+ ASSERT_OK(Put(1, "foo", "v4")); // seqID 5
+ ASSERT_OK(db_->FlushWAL(/*sync=*/false));
+
+ // Preserve file system state up to here to simulate a crash condition.
+ fault_env->SetFilesystemActive(false);
+ std::vector<std::string> names;
+ for (auto name : names_) {
+ if (name != "") {
+ names.push_back(name);
+ }
+ }
+
+ Close();
+ fault_env->ResetState();
+
+ // Before opening, there are four files:
+ // Log file A contains seqID 1
+ // Log file C contains seqID 4, 5
+ // SST file X contains seqID 1
+ // SST file Y contains seqID 2, 3
+ // Min log number:
+ // default CF: 0
+ // CF one, two: C
+ // When opening the DB, all the seqID should be preserved.
+ Open(names, {});
+ ASSERT_EQ("v4", Get(1, "foo"));
+ ASSERT_EQ("v3", Get(1, "bar"));
+ Close();
+
+ db_options_.env = env_;
+}
+
+TEST_P(FlushEmptyCFTestWithParam, FlushEmptyCFTest2) {
+ std::unique_ptr<FaultInjectionTestEnv> fault_env(
+ new FaultInjectionTestEnv(env_));
+ db_options_.env = fault_env.get();
+ db_options_.allow_2pc = allow_2pc_;
+ Open();
+ CreateColumnFamilies({"one", "two"});
+ // Generate log file A.
+ ASSERT_OK(Put(1, "foo", "v1")); // seqID 1
+
+ Reopen();
+ // Log file A is not dropped after reopening because default column family's
+ // min log number is 0.
+ // It flushes to SST file X
+ ASSERT_OK(Put(1, "foo", "v1")); // seqID 2
+ ASSERT_OK(Put(1, "bar", "v2")); // seqID 3
+ // Current log file is file B now. While flushing, a new log file C is created
+ // and is set to current. Both CFs' min log number is set to file C so after
+ // flushing file B is deleted. Log file A still remains.
+ // Flushed to SST file Y.
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(Put(0, "bar", "v2")); // seqID 4
+ ASSERT_OK(Put(2, "bar", "v2")); // seqID 5
+ ASSERT_OK(Put(1, "bar", "v3")); // seqID 6
+ // Flushing all column families. This forces all CFs' min log to current. This
+ // is written to the manifest file. Log file C is cleared.
+ ASSERT_OK(Flush(0));
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(Flush(2));
+ // Write to log file D
+ ASSERT_OK(Put(1, "bar", "v4")); // seqID 7
+ ASSERT_OK(Put(1, "bar", "v5")); // seqID 8
+ ASSERT_OK(db_->FlushWAL(/*sync=*/false));
+ // Preserve file system state up to here to simulate a crash condition.
+ fault_env->SetFilesystemActive(false);
+ std::vector<std::string> names;
+ for (auto name : names_) {
+ if (name != "") {
+ names.push_back(name);
+ }
+ }
+
+ Close();
+ fault_env->ResetState();
+ // Before opening, there are two logfiles:
+ // Log file A contains seqID 1
+ // Log file D contains seqID 7, 8
+ // Min log number:
+ // default CF: D
+ // CF one, two: D
+ // When opening the DB, log file D should be replayed using the seqID
+ // specified in the file.
+ Open(names, {});
+ ASSERT_EQ("v1", Get(1, "foo"));
+ ASSERT_EQ("v5", Get(1, "bar"));
+ Close();
+
+ db_options_.env = env_;
+}
+
+INSTANTIATE_TEST_CASE_P(
+ FormatDef, FlushEmptyCFTestWithParam,
+ testing::Values(std::make_tuple(test::kDefaultFormatVersion, true),
+ std::make_tuple(test::kDefaultFormatVersion, false)));
+INSTANTIATE_TEST_CASE_P(
+ FormatLatest, FlushEmptyCFTestWithParam,
+ testing::Values(std::make_tuple(kLatestFormatVersion, true),
+ std::make_tuple(kLatestFormatVersion, false)));
+
+TEST_P(ColumnFamilyTest, AddDrop) {
+ Open();
+ CreateColumnFamilies({"one", "two", "three"});
+ ASSERT_EQ("NOT_FOUND", Get(1, "fodor"));
+ ASSERT_EQ("NOT_FOUND", Get(2, "fodor"));
+ DropColumnFamilies({2});
+ ASSERT_EQ("NOT_FOUND", Get(1, "fodor"));
+ CreateColumnFamilies({"four"});
+ ASSERT_EQ("NOT_FOUND", Get(3, "fodor"));
+ ASSERT_OK(Put(1, "fodor", "mirko"));
+ ASSERT_EQ("mirko", Get(1, "fodor"));
+ ASSERT_EQ("NOT_FOUND", Get(3, "fodor"));
+ Close();
+ ASSERT_TRUE(TryOpen({"default"}).IsInvalidArgument());
+ Open({"default", "one", "three", "four"});
+ DropColumnFamilies({1});
+ Reopen();
+ Close();
+
+ std::vector<std::string> families;
+ ASSERT_OK(DB::ListColumnFamilies(db_options_, dbname_, &families));
+ std::sort(families.begin(), families.end());
+ ASSERT_TRUE(families ==
+ std::vector<std::string>({"default", "four", "three"}));
+}
+
+TEST_P(ColumnFamilyTest, BulkAddDrop) {
+ constexpr int kNumCF = 1000;
+ ColumnFamilyOptions cf_options;
+ WriteOptions write_options;
+ Open();
+ std::vector<std::string> cf_names;
+ std::vector<ColumnFamilyHandle*> cf_handles;
+ for (int i = 1; i <= kNumCF; i++) {
+ cf_names.push_back("cf1-" + std::to_string(i));
+ }
+ ASSERT_OK(db_->CreateColumnFamilies(cf_options, cf_names, &cf_handles));
+ for (int i = 1; i <= kNumCF; i++) {
+ ASSERT_OK(db_->Put(write_options, cf_handles[i - 1], "foo", "bar"));
+ }
+ ASSERT_OK(db_->DropColumnFamilies(cf_handles));
+ std::vector<ColumnFamilyDescriptor> cf_descriptors;
+ for (auto* handle : cf_handles) {
+ delete handle;
+ }
+ cf_handles.clear();
+ for (int i = 1; i <= kNumCF; i++) {
+ cf_descriptors.emplace_back("cf2-" + std::to_string(i),
+ ColumnFamilyOptions());
+ }
+ ASSERT_OK(db_->CreateColumnFamilies(cf_descriptors, &cf_handles));
+ for (int i = 1; i <= kNumCF; i++) {
+ ASSERT_OK(db_->Put(write_options, cf_handles[i - 1], "foo", "bar"));
+ }
+ ASSERT_OK(db_->DropColumnFamilies(cf_handles));
+ for (auto* handle : cf_handles) {
+ delete handle;
+ }
+ Close();
+ std::vector<std::string> families;
+ ASSERT_OK(DB::ListColumnFamilies(db_options_, dbname_, &families));
+ std::sort(families.begin(), families.end());
+ ASSERT_TRUE(families == std::vector<std::string>({"default"}));
+}
+
+TEST_P(ColumnFamilyTest, DropTest) {
+ // first iteration - don't reopen DB before dropping
+ // second iteration - reopen DB before dropping
+ for (int iter = 0; iter < 2; ++iter) {
+ Open({"default"});
+ CreateColumnFamiliesAndReopen({"pikachu"});
+ for (int i = 0; i < 100; ++i) {
+ ASSERT_OK(Put(1, std::to_string(i), "bar" + std::to_string(i)));
+ }
+ ASSERT_OK(Flush(1));
+
+ if (iter == 1) {
+ Reopen();
+ }
+ ASSERT_EQ("bar1", Get(1, "1"));
+
+ AssertCountLiveFiles(1);
+ DropColumnFamilies({1});
+ // make sure that all files are deleted when we drop the column family
+ AssertCountLiveFiles(0);
+ Destroy();
+ }
+}
+
+TEST_P(ColumnFamilyTest, WriteBatchFailure) {
+ Open();
+ CreateColumnFamiliesAndReopen({"one", "two"});
+ WriteBatch batch;
+ ASSERT_OK(batch.Put(handles_[0], Slice("existing"), Slice("column-family")));
+ ASSERT_OK(
+ batch.Put(handles_[1], Slice("non-existing"), Slice("column-family")));
+ ASSERT_OK(db_->Write(WriteOptions(), &batch));
+ DropColumnFamilies({1});
+ WriteOptions woptions_ignore_missing_cf;
+ woptions_ignore_missing_cf.ignore_missing_column_families = true;
+ ASSERT_OK(
+ batch.Put(handles_[0], Slice("still here"), Slice("column-family")));
+ ASSERT_OK(db_->Write(woptions_ignore_missing_cf, &batch));
+ ASSERT_EQ("column-family", Get(0, "still here"));
+ Status s = db_->Write(WriteOptions(), &batch);
+ ASSERT_TRUE(s.IsInvalidArgument());
+ Close();
+}
+
+TEST_P(ColumnFamilyTest, ReadWrite) {
+ Open();
+ CreateColumnFamiliesAndReopen({"one", "two"});
+ ASSERT_OK(Put(0, "foo", "v1"));
+ ASSERT_OK(Put(0, "bar", "v2"));
+ ASSERT_OK(Put(1, "mirko", "v3"));
+ ASSERT_OK(Put(0, "foo", "v2"));
+ ASSERT_OK(Put(2, "fodor", "v5"));
+
+ for (int iter = 0; iter <= 3; ++iter) {
+ ASSERT_EQ("v2", Get(0, "foo"));
+ ASSERT_EQ("v2", Get(0, "bar"));
+ ASSERT_EQ("v3", Get(1, "mirko"));
+ ASSERT_EQ("v5", Get(2, "fodor"));
+ ASSERT_EQ("NOT_FOUND", Get(0, "fodor"));
+ ASSERT_EQ("NOT_FOUND", Get(1, "fodor"));
+ ASSERT_EQ("NOT_FOUND", Get(2, "foo"));
+ if (iter <= 1) {
+ Reopen();
+ }
+ }
+ Close();
+}
+
+TEST_P(ColumnFamilyTest, IgnoreRecoveredLog) {
+ std::string backup_logs = dbname_ + "/backup_logs";
+
+ // delete old files in backup_logs directory
+ ASSERT_OK(env_->CreateDirIfMissing(dbname_));
+ ASSERT_OK(env_->CreateDirIfMissing(backup_logs));
+ std::vector<std::string> old_files;
+ ASSERT_OK(env_->GetChildren(backup_logs, &old_files));
+ for (auto& file : old_files) {
+ ASSERT_OK(env_->DeleteFile(backup_logs + "/" + file));
+ }
+
+ column_family_options_.merge_operator =
+ MergeOperators::CreateUInt64AddOperator();
+ db_options_.wal_dir = dbname_ + "/logs";
+ Destroy();
+ Open();
+ CreateColumnFamilies({"cf1", "cf2"});
+
+ // fill up the DB
+ std::string one, two, three;
+ PutFixed64(&one, 1);
+ PutFixed64(&two, 2);
+ PutFixed64(&three, 3);
+ ASSERT_OK(Merge(0, "foo", one));
+ ASSERT_OK(Merge(1, "mirko", one));
+ ASSERT_OK(Merge(0, "foo", one));
+ ASSERT_OK(Merge(2, "bla", one));
+ ASSERT_OK(Merge(2, "fodor", one));
+ ASSERT_OK(Merge(0, "bar", one));
+ ASSERT_OK(Merge(2, "bla", one));
+ ASSERT_OK(Merge(1, "mirko", two));
+ ASSERT_OK(Merge(1, "franjo", one));
+
+ // copy the logs to backup
+ std::vector<std::string> logs;
+ ASSERT_OK(env_->GetChildren(db_options_.wal_dir, &logs));
+ for (auto& log : logs) {
+ CopyFile(db_options_.wal_dir + "/" + log, backup_logs + "/" + log);
+ }
+
+ // recover the DB
+ Close();
+
+ // 1. check consistency
+ // 2. copy the logs from backup back to WAL dir. if the recovery happens
+ // again on the same log files, this should lead to incorrect results
+ // due to applying merge operator twice
+ // 3. check consistency
+ for (int iter = 0; iter < 2; ++iter) {
+ // assert consistency
+ Open({"default", "cf1", "cf2"});
+ ASSERT_EQ(two, Get(0, "foo"));
+ ASSERT_EQ(one, Get(0, "bar"));
+ ASSERT_EQ(three, Get(1, "mirko"));
+ ASSERT_EQ(one, Get(1, "franjo"));
+ ASSERT_EQ(one, Get(2, "fodor"));
+ ASSERT_EQ(two, Get(2, "bla"));
+ Close();
+
+ if (iter == 0) {
+ // copy the logs from backup back to wal dir
+ for (auto& log : logs) {
+ CopyFile(backup_logs + "/" + log, db_options_.wal_dir + "/" + log);
+ }
+ }
+ }
+}
+
+#ifndef ROCKSDB_LITE // TEST functions used are not supported
+TEST_P(ColumnFamilyTest, FlushTest) {
+ Open();
+ CreateColumnFamiliesAndReopen({"one", "two"});
+ ASSERT_OK(Put(0, "foo", "v1"));
+ ASSERT_OK(Put(0, "bar", "v2"));
+ ASSERT_OK(Put(1, "mirko", "v3"));
+ ASSERT_OK(Put(0, "foo", "v2"));
+ ASSERT_OK(Put(2, "fodor", "v5"));
+
+ for (int j = 0; j < 2; j++) {
+ ReadOptions ro;
+ std::vector<Iterator*> iterators;
+ // Hold super version.
+ if (j == 0) {
+ ASSERT_OK(db_->NewIterators(ro, handles_, &iterators));
+ }
+
+ for (int i = 0; i < 3; ++i) {
+ uint64_t max_total_in_memory_state = MaxTotalInMemoryState();
+ ASSERT_OK(Flush(i));
+ AssertMaxTotalInMemoryState(max_total_in_memory_state);
+ }
+ ASSERT_OK(Put(1, "foofoo", "bar"));
+ ASSERT_OK(Put(0, "foofoo", "bar"));
+
+ for (auto* it : iterators) {
+ ASSERT_OK(it->status());
+ delete it;
+ }
+ }
+ Reopen();
+
+ for (int iter = 0; iter <= 2; ++iter) {
+ ASSERT_EQ("v2", Get(0, "foo"));
+ ASSERT_EQ("v2", Get(0, "bar"));
+ ASSERT_EQ("v3", Get(1, "mirko"));
+ ASSERT_EQ("v5", Get(2, "fodor"));
+ ASSERT_EQ("NOT_FOUND", Get(0, "fodor"));
+ ASSERT_EQ("NOT_FOUND", Get(1, "fodor"));
+ ASSERT_EQ("NOT_FOUND", Get(2, "foo"));
+ if (iter <= 1) {
+ Reopen();
+ }
+ }
+ Close();
+}
+
+// Makes sure that obsolete log files get deleted
+TEST_P(ColumnFamilyTest, LogDeletionTest) {
+ db_options_.max_total_wal_size = std::numeric_limits<uint64_t>::max();
+ column_family_options_.arena_block_size = 4 * 1024;
+ column_family_options_.write_buffer_size = 128000; // 128KB
+ Open();
+ CreateColumnFamilies({"one", "two", "three", "four"});
+ // Each bracket is one log file. If a number is in (), it means
+ // we don't need it anymore (it's been flushed).
+ // []
+ AssertCountLiveLogFiles(0);
+ PutRandomData(0, 1, 128);
+ // [0]
+ PutRandomData(1, 1, 128);
+ // [0, 1]
+ PutRandomData(1, 1000, 128);
+ WaitForFlush(1);
+ // [0, (1)] [1]
+ AssertCountLiveLogFiles(2);
+ PutRandomData(0, 1, 128);
+ // [0, (1)] [0, 1]
+ AssertCountLiveLogFiles(2);
+ PutRandomData(2, 1, 128);
+ // [0, (1)] [0, 1, 2]
+ PutRandomData(2, 1000, 128);
+ WaitForFlush(2);
+ // [0, (1)] [0, 1, (2)] [2]
+ AssertCountLiveLogFiles(3);
+ PutRandomData(2, 1000, 128);
+ WaitForFlush(2);
+ // [0, (1)] [0, 1, (2)] [(2)] [2]
+ AssertCountLiveLogFiles(4);
+ PutRandomData(3, 1, 128);
+ // [0, (1)] [0, 1, (2)] [(2)] [2, 3]
+ PutRandomData(1, 1, 128);
+ // [0, (1)] [0, 1, (2)] [(2)] [1, 2, 3]
+ AssertCountLiveLogFiles(4);
+ PutRandomData(1, 1000, 128);
+ WaitForFlush(1);
+ // [0, (1)] [0, (1), (2)] [(2)] [(1), 2, 3] [1]
+ AssertCountLiveLogFiles(5);
+ PutRandomData(0, 1000, 128);
+ WaitForFlush(0);
+ // [(0), (1)] [(0), (1), (2)] [(2)] [(1), 2, 3] [1, (0)] [0]
+ // delete obsolete logs -->
+ // [(1), 2, 3] [1, (0)] [0]
+ AssertCountLiveLogFiles(3);
+ PutRandomData(0, 1000, 128);
+ WaitForFlush(0);
+ // [(1), 2, 3] [1, (0)], [(0)] [0]
+ AssertCountLiveLogFiles(4);
+ PutRandomData(1, 1000, 128);
+ WaitForFlush(1);
+ // [(1), 2, 3] [(1), (0)] [(0)] [0, (1)] [1]
+ AssertCountLiveLogFiles(5);
+ PutRandomData(2, 1000, 128);
+ WaitForFlush(2);
+ // [(1), (2), 3] [(1), (0)] [(0)] [0, (1)] [1, (2)], [2]
+ AssertCountLiveLogFiles(6);
+ PutRandomData(3, 1000, 128);
+ WaitForFlush(3);
+ // [(1), (2), (3)] [(1), (0)] [(0)] [0, (1)] [1, (2)], [2, (3)] [3]
+ // delete obsolete logs -->
+ // [0, (1)] [1, (2)], [2, (3)] [3]
+ AssertCountLiveLogFiles(4);
+ Close();
+}
+#endif // !ROCKSDB_LITE
+
+TEST_P(ColumnFamilyTest, CrashAfterFlush) {
+ std::unique_ptr<FaultInjectionTestEnv> fault_env(
+ new FaultInjectionTestEnv(env_));
+ db_options_.env = fault_env.get();
+ Open();
+ CreateColumnFamilies({"one"});
+
+ WriteBatch batch;
+ ASSERT_OK(batch.Put(handles_[0], Slice("foo"), Slice("bar")));
+ ASSERT_OK(batch.Put(handles_[1], Slice("foo"), Slice("bar")));
+ ASSERT_OK(db_->Write(WriteOptions(), &batch));
+ ASSERT_OK(Flush(0));
+ fault_env->SetFilesystemActive(false);
+
+ std::vector<std::string> names;
+ for (auto name : names_) {
+ if (name != "") {
+ names.push_back(name);
+ }
+ }
+ Close();
+ ASSERT_OK(fault_env->DropUnsyncedFileData());
+ fault_env->ResetState();
+ Open(names, {});
+
+ // Write batch should be atomic.
+ ASSERT_EQ(Get(0, "foo"), Get(1, "foo"));
+
+ Close();
+ db_options_.env = env_;
+}
+
+TEST_P(ColumnFamilyTest, OpenNonexistentColumnFamily) {
+ ASSERT_OK(TryOpen({"default"}));
+ Close();
+ ASSERT_TRUE(TryOpen({"default", "dne"}).IsInvalidArgument());
+}
+
+#ifndef ROCKSDB_LITE // WaitForFlush() is not supported
+// Makes sure that obsolete log files get deleted
+TEST_P(ColumnFamilyTest, DifferentWriteBufferSizes) {
+ // disable flushing stale column families
+ db_options_.max_total_wal_size = std::numeric_limits<uint64_t>::max();
+ Open();
+ CreateColumnFamilies({"one", "two", "three"});
+ ColumnFamilyOptions default_cf, one, two, three;
+ // Set up options. All column families have max_write_buffer_number set to 10.
+ // "default" -> 100KB memtable, start flushing immediately
+ // "one" -> 200KB memtable, start flushing with two immutable memtables
+ // "two" -> 1MB memtable, start flushing with three immutable memtables
+ // "three" -> 90KB memtable, start flushing with four immutable memtables
+ default_cf.write_buffer_size = 100000;
+ default_cf.arena_block_size = 4 * 4096;
+ default_cf.max_write_buffer_number = 10;
+ default_cf.min_write_buffer_number_to_merge = 1;
+ default_cf.max_write_buffer_size_to_maintain = 0;
+ one.write_buffer_size = 200000;
+ one.arena_block_size = 4 * 4096;
+ one.max_write_buffer_number = 10;
+ one.min_write_buffer_number_to_merge = 2;
+ one.max_write_buffer_size_to_maintain =
+ static_cast<int>(one.write_buffer_size);
+ two.write_buffer_size = 1000000;
+ two.arena_block_size = 4 * 4096;
+ two.max_write_buffer_number = 10;
+ two.min_write_buffer_number_to_merge = 3;
+ two.max_write_buffer_size_to_maintain =
+ static_cast<int>(two.write_buffer_size);
+ three.write_buffer_size = 4096 * 22;
+ three.arena_block_size = 4096;
+ three.max_write_buffer_number = 10;
+ three.min_write_buffer_number_to_merge = 4;
+ three.max_write_buffer_size_to_maintain =
+ static_cast<int>(three.write_buffer_size);
+
+ Reopen({default_cf, one, two, three});
+
+ int micros_wait_for_flush = 10000;
+ PutRandomData(0, 100, 1000);
+ WaitForFlush(0);
+ AssertNumberOfImmutableMemtables({0, 0, 0, 0});
+ AssertCountLiveLogFiles(1);
+ PutRandomData(1, 200, 1000);
+ env_->SleepForMicroseconds(micros_wait_for_flush);
+ AssertNumberOfImmutableMemtables({0, 1, 0, 0});
+ AssertCountLiveLogFiles(2);
+ PutRandomData(2, 1000, 1000);
+ env_->SleepForMicroseconds(micros_wait_for_flush);
+ AssertNumberOfImmutableMemtables({0, 1, 1, 0});
+ AssertCountLiveLogFiles(3);
+ PutRandomData(2, 1000, 1000);
+ env_->SleepForMicroseconds(micros_wait_for_flush);
+ AssertNumberOfImmutableMemtables({0, 1, 2, 0});
+ AssertCountLiveLogFiles(4);
+ PutRandomData(3, 93, 990);
+ env_->SleepForMicroseconds(micros_wait_for_flush);
+ AssertNumberOfImmutableMemtables({0, 1, 2, 1});
+ AssertCountLiveLogFiles(5);
+ PutRandomData(3, 88, 990);
+ env_->SleepForMicroseconds(micros_wait_for_flush);
+ AssertNumberOfImmutableMemtables({0, 1, 2, 2});
+ AssertCountLiveLogFiles(6);
+ PutRandomData(3, 88, 990);
+ env_->SleepForMicroseconds(micros_wait_for_flush);
+ AssertNumberOfImmutableMemtables({0, 1, 2, 3});
+ AssertCountLiveLogFiles(7);
+ PutRandomData(0, 100, 1000);
+ WaitForFlush(0);
+ AssertNumberOfImmutableMemtables({0, 1, 2, 3});
+ AssertCountLiveLogFiles(8);
+ PutRandomData(2, 100, 10000);
+ WaitForFlush(2);
+ AssertNumberOfImmutableMemtables({0, 1, 0, 3});
+ AssertCountLiveLogFiles(9);
+ PutRandomData(3, 88, 990);
+ WaitForFlush(3);
+ AssertNumberOfImmutableMemtables({0, 1, 0, 0});
+ AssertCountLiveLogFiles(10);
+ PutRandomData(3, 88, 990);
+ env_->SleepForMicroseconds(micros_wait_for_flush);
+ AssertNumberOfImmutableMemtables({0, 1, 0, 1});
+ AssertCountLiveLogFiles(11);
+ PutRandomData(1, 200, 1000);
+ WaitForFlush(1);
+ AssertNumberOfImmutableMemtables({0, 0, 0, 1});
+ AssertCountLiveLogFiles(5);
+ PutRandomData(3, 88 * 3, 990);
+ WaitForFlush(3);
+ PutRandomData(3, 88 * 4, 990);
+ WaitForFlush(3);
+ AssertNumberOfImmutableMemtables({0, 0, 0, 0});
+ AssertCountLiveLogFiles(12);
+ PutRandomData(0, 100, 1000);
+ WaitForFlush(0);
+ AssertNumberOfImmutableMemtables({0, 0, 0, 0});
+ AssertCountLiveLogFiles(12);
+ PutRandomData(2, 3 * 1000, 1000);
+ WaitForFlush(2);
+ AssertNumberOfImmutableMemtables({0, 0, 0, 0});
+ AssertCountLiveLogFiles(12);
+ PutRandomData(1, 2 * 200, 1000);
+ WaitForFlush(1);
+ AssertNumberOfImmutableMemtables({0, 0, 0, 0});
+ AssertCountLiveLogFiles(7);
+ Close();
+}
+#endif // !ROCKSDB_LITE
+
+// The test is commented out because we want to test that a snapshot is
+// not created for memtables that don't support it, but there isn't a memtable
+// that doesn't support snapshots right now. If we have one later, we can
+// re-enable the test.
+//
+// #ifndef ROCKSDB_LITE // Cuckoo is not supported in lite
+// TEST_P(ColumnFamilyTest, MemtableNotSupportSnapshot) {
+// db_options_.allow_concurrent_memtable_write = false;
+// Open();
+// auto* s1 = dbfull()->GetSnapshot();
+// ASSERT_TRUE(s1 != nullptr);
+// dbfull()->ReleaseSnapshot(s1);
+
+// // Add a column family that doesn't support snapshot
+// ColumnFamilyOptions first;
+// first.memtable_factory.reset(new DummyMemtableNotSupportingSnapshot());
+// CreateColumnFamilies({"first"}, {first});
+// auto* s2 = dbfull()->GetSnapshot();
+// ASSERT_TRUE(s2 == nullptr);
+
+// // Add a column family that supports snapshot. Snapshot stays not
+// supported. ColumnFamilyOptions second; CreateColumnFamilies({"second"},
+// {second}); auto* s3 = dbfull()->GetSnapshot(); ASSERT_TRUE(s3 == nullptr);
+// Close();
+// }
+// #endif // !ROCKSDB_LITE
+
+class TestComparator : public Comparator {
+ int Compare(const ROCKSDB_NAMESPACE::Slice& /*a*/,
+ const ROCKSDB_NAMESPACE::Slice& /*b*/) const override {
+ return 0;
+ }
+ const char* Name() const override { return "Test"; }
+ void FindShortestSeparator(
+ std::string* /*start*/,
+ const ROCKSDB_NAMESPACE::Slice& /*limit*/) const override {}
+ void FindShortSuccessor(std::string* /*key*/) const override {}
+};
+
+static TestComparator third_comparator;
+static TestComparator fourth_comparator;
+
+// Test that we can retrieve the comparator from a created CF
+TEST_P(ColumnFamilyTest, GetComparator) {
+ Open();
+ // Add a column family with no comparator specified
+ CreateColumnFamilies({"first"});
+ const Comparator* comp = handles_[0]->GetComparator();
+ ASSERT_EQ(comp, BytewiseComparator());
+
+ // Add three column families - one with no comparator and two
+ // with comparators specified
+ ColumnFamilyOptions second, third, fourth;
+ second.comparator = &third_comparator;
+ third.comparator = &fourth_comparator;
+ CreateColumnFamilies({"second", "third", "fourth"}, {second, third, fourth});
+ ASSERT_EQ(handles_[1]->GetComparator(), BytewiseComparator());
+ ASSERT_EQ(handles_[2]->GetComparator(), &third_comparator);
+ ASSERT_EQ(handles_[3]->GetComparator(), &fourth_comparator);
+ Close();
+}
+
+TEST_P(ColumnFamilyTest, DifferentMergeOperators) {
+ Open();
+ CreateColumnFamilies({"first", "second"});
+ ColumnFamilyOptions default_cf, first, second;
+ first.merge_operator = MergeOperators::CreateUInt64AddOperator();
+ second.merge_operator = MergeOperators::CreateStringAppendOperator();
+ Reopen({default_cf, first, second});
+
+ std::string one, two, three;
+ PutFixed64(&one, 1);
+ PutFixed64(&two, 2);
+ PutFixed64(&three, 3);
+
+ ASSERT_OK(Put(0, "foo", two));
+ ASSERT_OK(Put(0, "foo", one));
+ ASSERT_TRUE(Merge(0, "foo", two).IsNotSupported());
+ ASSERT_EQ(Get(0, "foo"), one);
+
+ ASSERT_OK(Put(1, "foo", two));
+ ASSERT_OK(Put(1, "foo", one));
+ ASSERT_OK(Merge(1, "foo", two));
+ ASSERT_EQ(Get(1, "foo"), three);
+
+ ASSERT_OK(Put(2, "foo", two));
+ ASSERT_OK(Put(2, "foo", one));
+ ASSERT_OK(Merge(2, "foo", two));
+ ASSERT_EQ(Get(2, "foo"), one + "," + two);
+ Close();
+}
+
+#ifndef ROCKSDB_LITE // WaitForFlush() is not supported
+TEST_P(ColumnFamilyTest, DifferentCompactionStyles) {
+ Open();
+ CreateColumnFamilies({"one", "two"});
+ ColumnFamilyOptions default_cf, one, two;
+ db_options_.max_open_files = 20; // only 10 files in file cache
+
+ default_cf.compaction_style = kCompactionStyleLevel;
+ default_cf.num_levels = 3;
+ default_cf.write_buffer_size = 64 << 10; // 64KB
+ default_cf.target_file_size_base = 30 << 10;
+ default_cf.max_compaction_bytes = static_cast<uint64_t>(1) << 60;
+
+ BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+ table_options.no_block_cache = true;
+ default_cf.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ one.compaction_style = kCompactionStyleUniversal;
+
+ one.num_levels = 1;
+ // trigger compaction if there are >= 4 files
+ one.level0_file_num_compaction_trigger = 4;
+ one.write_buffer_size = 120000;
+
+ two.compaction_style = kCompactionStyleLevel;
+ two.num_levels = 4;
+ two.level0_file_num_compaction_trigger = 3;
+ two.write_buffer_size = 100000;
+
+ Reopen({default_cf, one, two});
+
+ // SETUP column family "one" -- universal style
+ for (int i = 0; i < one.level0_file_num_compaction_trigger - 1; ++i) {
+ PutRandomData(1, 10, 12000);
+ PutRandomData(1, 1, 10);
+ WaitForFlush(1);
+ AssertFilesPerLevel(std::to_string(i + 1), 1);
+ }
+
+ // SETUP column family "two" -- level style with 4 levels
+ for (int i = 0; i < two.level0_file_num_compaction_trigger - 1; ++i) {
+ PutRandomData(2, 10, 12000);
+ PutRandomData(2, 1, 10);
+ WaitForFlush(2);
+ AssertFilesPerLevel(std::to_string(i + 1), 2);
+ }
+
+ // TRIGGER compaction "one"
+ PutRandomData(1, 10, 12000);
+ PutRandomData(1, 1, 10);
+
+ // TRIGGER compaction "two"
+ PutRandomData(2, 10, 12000);
+ PutRandomData(2, 1, 10);
+
+ // WAIT for compactions
+ WaitForCompaction();
+
+ // VERIFY compaction "one"
+ AssertFilesPerLevel("1", 1);
+
+ // VERIFY compaction "two"
+ AssertFilesPerLevel("0,1", 2);
+ CompactAll(2);
+ AssertFilesPerLevel("0,1", 2);
+
+ Close();
+}
+#endif // !ROCKSDB_LITE
+
+#ifndef ROCKSDB_LITE
+// Sync points not supported in RocksDB Lite
+
+TEST_P(ColumnFamilyTest, MultipleManualCompactions) {
+ Open();
+ CreateColumnFamilies({"one", "two"});
+ ColumnFamilyOptions default_cf, one, two;
+ db_options_.max_open_files = 20; // only 10 files in file cache
+ db_options_.max_background_compactions = 3;
+
+ default_cf.compaction_style = kCompactionStyleLevel;
+ default_cf.num_levels = 3;
+ default_cf.write_buffer_size = 64 << 10; // 64KB
+ default_cf.target_file_size_base = 30 << 10;
+ default_cf.max_compaction_bytes = default_cf.target_file_size_base * 1100;
+ BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+ table_options.no_block_cache = true;
+ default_cf.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ one.compaction_style = kCompactionStyleUniversal;
+
+ one.num_levels = 1;
+ // trigger compaction if there are >= 4 files
+ one.level0_file_num_compaction_trigger = 4;
+ one.write_buffer_size = 120000;
+
+ two.compaction_style = kCompactionStyleLevel;
+ two.num_levels = 4;
+ two.level0_file_num_compaction_trigger = 3;
+ two.write_buffer_size = 100000;
+
+ Reopen({default_cf, one, two});
+
+ // SETUP column family "one" -- universal style
+ for (int i = 0; i < one.level0_file_num_compaction_trigger - 2; ++i) {
+ PutRandomData(1, 10, 12000, true);
+ PutRandomData(1, 1, 10, true);
+ WaitForFlush(1);
+ AssertFilesPerLevel(std::to_string(i + 1), 1);
+ }
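+ // Sync-point choreography: the manual compaction on CF "one" pauses in
+ // the AfterRun callback after signaling MultiManual:4 and waits at
+ // MultiManual:3. :4 releases the second thread (:1) to run a concurrent
+ // manual compaction on CF "two"; once that CompactRange returns and
+ // signals :2, both the paused compaction (:3) and the main thread (:5)
+ // continue.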
+ std::atomic_bool cf_1_1{true};
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"ColumnFamilyTest::MultiManual:4", "ColumnFamilyTest::MultiManual:1"},
+ {"ColumnFamilyTest::MultiManual:2", "ColumnFamilyTest::MultiManual:5"},
+ {"ColumnFamilyTest::MultiManual:2", "ColumnFamilyTest::MultiManual:3"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:NonTrivial:AfterRun", [&](void* /*arg*/) {
+ if (cf_1_1.exchange(false)) {
+ TEST_SYNC_POINT("ColumnFamilyTest::MultiManual:4");
+ TEST_SYNC_POINT("ColumnFamilyTest::MultiManual:3");
+ }
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ std::vector<port::Thread> threads;
+ threads.emplace_back([&] {
+ CompactRangeOptions compact_options;
+ compact_options.exclusive_manual_compaction = false;
+ ASSERT_OK(
+ db_->CompactRange(compact_options, handles_[1], nullptr, nullptr));
+ });
+
+ // SETUP column family "two" -- level style with 4 levels
+ for (int i = 0; i < two.level0_file_num_compaction_trigger - 2; ++i) {
+ PutRandomData(2, 10, 12000);
+ PutRandomData(2, 1, 10);
+ WaitForFlush(2);
+ AssertFilesPerLevel(std::to_string(i + 1), 2);
+ }
+ threads.emplace_back([&] {
+ TEST_SYNC_POINT("ColumnFamilyTest::MultiManual:1");
+ CompactRangeOptions compact_options;
+ compact_options.exclusive_manual_compaction = false;
+ ASSERT_OK(
+ db_->CompactRange(compact_options, handles_[2], nullptr, nullptr));
+ TEST_SYNC_POINT("ColumnFamilyTest::MultiManual:2");
+ });
+
+ TEST_SYNC_POINT("ColumnFamilyTest::MultiManual:5");
+ for (auto& t : threads) {
+ t.join();
+ }
+
+ // VERIFY compaction "one"
+ AssertFilesPerLevel("1", 1);
+
+ // VERIFY compaction "two"
+ AssertFilesPerLevel("0,1", 2);
+ CompactAll(2);
+ AssertFilesPerLevel("0,1", 2);
+ // Compare against saved keys
+ std::set<std::string>::iterator key_iter = keys_[1].begin();
+ while (key_iter != keys_[1].end()) {
+ ASSERT_NE("NOT_FOUND", Get(1, *key_iter));
+ key_iter++;
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+ Close();
+}
+
+TEST_P(ColumnFamilyTest, AutomaticAndManualCompactions) {
+ Open();
+ CreateColumnFamilies({"one", "two"});
+ ColumnFamilyOptions default_cf, one, two;
+ db_options_.max_open_files = 20; // only 10 files in file cache
+ db_options_.max_background_compactions = 3;
+
+ default_cf.compaction_style = kCompactionStyleLevel;
+ default_cf.num_levels = 3;
+ default_cf.write_buffer_size = 64 << 10; // 64KB
+ default_cf.target_file_size_base = 30 << 10;
+ default_cf.max_compaction_bytes = default_cf.target_file_size_base * 1100;
+ BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+ table_options.no_block_cache = true;
+ default_cf.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ one.compaction_style = kCompactionStyleUniversal;
+
+ one.num_levels = 1;
+ // trigger compaction if there are >= 4 files
+ one.level0_file_num_compaction_trigger = 4;
+ one.write_buffer_size = 120000;
+
+ two.compaction_style = kCompactionStyleLevel;
+ two.num_levels = 4;
+ two.level0_file_num_compaction_trigger = 3;
+ two.write_buffer_size = 100000;
+
+ Reopen({default_cf, one, two});
+ // make sure all background compaction jobs can be scheduled
+ auto stop_token =
+ dbfull()->TEST_write_controler().GetCompactionPressureToken();
+
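+ // Sync-point choreography: the automatic compaction on CF "one" pauses in
+ // the AfterRun callback (signals AutoManual:4, waits at :3). :4 releases
+ // the main thread (:1), which then launches a manual compaction on CF
+ // "two" in a separate thread; when that CompactRange returns and signals
+ // :2, the paused automatic compaction (:3) and the main thread (:5) resume.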
+ std::atomic_bool cf_1_1{true};
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"ColumnFamilyTest::AutoManual:4", "ColumnFamilyTest::AutoManual:1"},
+ {"ColumnFamilyTest::AutoManual:2", "ColumnFamilyTest::AutoManual:5"},
+ {"ColumnFamilyTest::AutoManual:2", "ColumnFamilyTest::AutoManual:3"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:NonTrivial:AfterRun", [&](void* /*arg*/) {
+ if (cf_1_1.exchange(false)) {
+ TEST_SYNC_POINT("ColumnFamilyTest::AutoManual:4");
+ TEST_SYNC_POINT("ColumnFamilyTest::AutoManual:3");
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ // SETUP column family "one" -- universal style
+ for (int i = 0; i < one.level0_file_num_compaction_trigger; ++i) {
+ PutRandomData(1, 10, 12000, true);
+ PutRandomData(1, 1, 10, true);
+ WaitForFlush(1);
+ AssertFilesPerLevel(std::to_string(i + 1), 1);
+ }
+
+ TEST_SYNC_POINT("ColumnFamilyTest::AutoManual:1");
+
+ // SETUP column family "two" -- level style with 4 levels
+ for (int i = 0; i < two.level0_file_num_compaction_trigger - 2; ++i) {
+ PutRandomData(2, 10, 12000);
+ PutRandomData(2, 1, 10);
+ WaitForFlush(2);
+ AssertFilesPerLevel(std::to_string(i + 1), 2);
+ }
+ ROCKSDB_NAMESPACE::port::Thread threads([&] {
+ CompactRangeOptions compact_options;
+ compact_options.exclusive_manual_compaction = false;
+ ASSERT_OK(
+ db_->CompactRange(compact_options, handles_[2], nullptr, nullptr));
+ TEST_SYNC_POINT("ColumnFamilyTest::AutoManual:2");
+ });
+
+ TEST_SYNC_POINT("ColumnFamilyTest::AutoManual:5");
+ threads.join();
+
+ // WAIT for compactions
+ WaitForCompaction();
+
+ // VERIFY compaction "one"
+ AssertFilesPerLevel("1", 1);
+
+ // VERIFY compaction "two"
+ AssertFilesPerLevel("0,1", 2);
+ CompactAll(2);
+ AssertFilesPerLevel("0,1", 2);
+ // Compare against saved keys
+ std::set<std::string>::iterator key_iter = keys_[1].begin();
+ while (key_iter != keys_[1].end()) {
+ ASSERT_NE("NOT_FOUND", Get(1, *key_iter));
+ key_iter++;
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_P(ColumnFamilyTest, ManualAndAutomaticCompactions) {
+ Open();
+ CreateColumnFamilies({"one", "two"});
+ ColumnFamilyOptions default_cf, one, two;
+ db_options_.max_open_files = 20; // only 10 files in file cache
+ db_options_.max_background_compactions = 3;
+
+ default_cf.compaction_style = kCompactionStyleLevel;
+ default_cf.num_levels = 3;
+ default_cf.write_buffer_size = 64 << 10; // 64KB
+ default_cf.target_file_size_base = 30 << 10;
+ default_cf.max_compaction_bytes = default_cf.target_file_size_base * 1100;
+ BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+ table_options.no_block_cache = true;
+ default_cf.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ one.compaction_style = kCompactionStyleUniversal;
+
+ one.num_levels = 1;
+ // trigger compaction if there are >= 4 files
+ one.level0_file_num_compaction_trigger = 4;
+ one.write_buffer_size = 120000;
+
+ two.compaction_style = kCompactionStyleLevel;
+ two.num_levels = 4;
+ two.level0_file_num_compaction_trigger = 3;
+ two.write_buffer_size = 100000;
+
+ Reopen({default_cf, one, two});
+ // make sure all background compaction jobs can be scheduled
+ auto stop_token =
+ dbfull()->TEST_write_controler().GetCompactionPressureToken();
+
+ // SETUP column family "one" -- universal style
+ for (int i = 0; i < one.level0_file_num_compaction_trigger - 2; ++i) {
+ PutRandomData(1, 10, 12000, true);
+ PutRandomData(1, 1, 10, true);
+ WaitForFlush(1);
+ AssertFilesPerLevel(std::to_string(i + 1), 1);
+ }
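+ // Sync-point choreography: the manual compaction on CF "one" pauses in the
+ // AfterRun callback (signals ManualAuto:4, waits at :3). :4 releases the
+ // main thread (:1) to fill CF "two" until an automatic compaction triggers
+ // there; once the main thread reaches :5 and that automatic compaction hits
+ // AfterRun, :2 fires and releases the paused manual compaction (:3).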
+ std::atomic_bool cf_1_1{true};
+ std::atomic_bool cf_1_2{true};
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"ColumnFamilyTest::ManualAuto:4", "ColumnFamilyTest::ManualAuto:1"},
+ {"ColumnFamilyTest::ManualAuto:5", "ColumnFamilyTest::ManualAuto:2"},
+ {"ColumnFamilyTest::ManualAuto:2", "ColumnFamilyTest::ManualAuto:3"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:NonTrivial:AfterRun", [&](void* /*arg*/) {
+ if (cf_1_1.exchange(false)) {
+ TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:4");
+ TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:3");
+ } else if (cf_1_2.exchange(false)) {
+ TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:2");
+ }
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ ROCKSDB_NAMESPACE::port::Thread threads([&] {
+ CompactRangeOptions compact_options;
+ compact_options.exclusive_manual_compaction = false;
+ ASSERT_OK(
+ db_->CompactRange(compact_options, handles_[1], nullptr, nullptr));
+ });
+
+ TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:1");
+
+ // SETUP column family "two" -- level style with 4 levels
+ for (int i = 0; i < two.level0_file_num_compaction_trigger; ++i) {
+ PutRandomData(2, 10, 12000);
+ PutRandomData(2, 1, 10);
+ WaitForFlush(2);
+ AssertFilesPerLevel(std::to_string(i + 1), 2);
+ }
+ TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:5");
+ threads.join();
+
+ // WAIT for compactions
+ WaitForCompaction();
+
+ // VERIFY compaction "one"
+ AssertFilesPerLevel("1", 1);
+
+ // VERIFY compaction "two"
+ AssertFilesPerLevel("0,1", 2);
+ CompactAll(2);
+ AssertFilesPerLevel("0,1", 2);
+ // Compare against saved keys
+ std::set<std::string>::iterator key_iter = keys_[1].begin();
+ while (key_iter != keys_[1].end()) {
+ ASSERT_NE("NOT_FOUND", Get(1, *key_iter));
+ key_iter++;
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_P(ColumnFamilyTest, SameCFManualManualCompactions) {
+ Open();
+ CreateColumnFamilies({"one"});
+ ColumnFamilyOptions default_cf, one;
+ db_options_.max_open_files = 20; // only 10 files in file cache
+ db_options_.max_background_compactions = 3;
+
+ default_cf.compaction_style = kCompactionStyleLevel;
+ default_cf.num_levels = 3;
+ default_cf.write_buffer_size = 64 << 10; // 64KB
+ default_cf.target_file_size_base = 30 << 10;
+ default_cf.max_compaction_bytes = default_cf.target_file_size_base * 1100;
+ BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+ table_options.no_block_cache = true;
+ default_cf.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ one.compaction_style = kCompactionStyleUniversal;
+
+ one.num_levels = 1;
+ // trigger compaction if there are >= 4 files
+ one.level0_file_num_compaction_trigger = 4;
+ one.write_buffer_size = 120000;
+
+ Reopen({default_cf, one});
+ // make sure all background compaction jobs can be scheduled
+ auto stop_token =
+ dbfull()->TEST_write_controler().GetCompactionPressureToken();
+
+ // SETUP column family "one" -- universal style
+ for (int i = 0; i < one.level0_file_num_compaction_trigger - 2; ++i) {
+ PutRandomData(1, 10, 12000, true);
+ PutRandomData(1, 1, 10, true);
+ WaitForFlush(1);
+ AssertFilesPerLevel(std::to_string(i + 1), 1);
+ }
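+ // Sync-point choreography: the first (exclusive) manual compaction pauses
+ // in the AfterRun callback (signals ManualManual:4, waits at :3). :4
+ // releases the main thread (:5), which adds more L0 files and starts a
+ // second, non-exclusive manual compaction on the same CF; :1 then releases
+ // the paused compaction (:3) and allows the second compaction's AfterRun
+ // callback to signal :2.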
+ std::atomic_bool cf_1_1{true};
+ std::atomic_bool cf_1_2{true};
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"ColumnFamilyTest::ManualManual:4", "ColumnFamilyTest::ManualManual:2"},
+ {"ColumnFamilyTest::ManualManual:4", "ColumnFamilyTest::ManualManual:5"},
+ {"ColumnFamilyTest::ManualManual:1", "ColumnFamilyTest::ManualManual:2"},
+ {"ColumnFamilyTest::ManualManual:1",
+ "ColumnFamilyTest::ManualManual:3"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:NonTrivial:AfterRun", [&](void* /*arg*/) {
+ if (cf_1_1.exchange(false)) {
+ TEST_SYNC_POINT("ColumnFamilyTest::ManualManual:4");
+ TEST_SYNC_POINT("ColumnFamilyTest::ManualManual:3");
+ } else if (cf_1_2.exchange(false)) {
+ TEST_SYNC_POINT("ColumnFamilyTest::ManualManual:2");
+ }
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ ROCKSDB_NAMESPACE::port::Thread threads([&] {
+ CompactRangeOptions compact_options;
+ compact_options.exclusive_manual_compaction = true;
+ ASSERT_OK(
+ db_->CompactRange(compact_options, handles_[1], nullptr, nullptr));
+ });
+
+ TEST_SYNC_POINT("ColumnFamilyTest::ManualManual:5");
+
+ WaitForFlush(1);
+
+ // Add more L0 files and force another manual compaction
+ for (int i = 0; i < one.level0_file_num_compaction_trigger - 2; ++i) {
+ PutRandomData(1, 10, 12000, true);
+ PutRandomData(1, 1, 10, true);
+ WaitForFlush(1);
+ AssertFilesPerLevel(
+ std::to_string(one.level0_file_num_compaction_trigger + i), 1);
+ }
+
+ ROCKSDB_NAMESPACE::port::Thread threads1([&] {
+ CompactRangeOptions compact_options;
+ compact_options.exclusive_manual_compaction = false;
+ ASSERT_OK(
+ db_->CompactRange(compact_options, handles_[1], nullptr, nullptr));
+ });
+
+ TEST_SYNC_POINT("ColumnFamilyTest::ManualManual:1");
+
+ threads.join();
+ threads1.join();
+ WaitForCompaction();
+ // VERIFY compaction "one"
+ ASSERT_LE(NumTableFilesAtLevel(0, 1), 2);
+
+ // Compare against saved keys
+ std::set<std::string>::iterator key_iter = keys_[1].begin();
+ while (key_iter != keys_[1].end()) {
+ ASSERT_NE("NOT_FOUND", Get(1, *key_iter));
+ key_iter++;
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_P(ColumnFamilyTest, SameCFManualAutomaticCompactions) {
+ Open();
+ CreateColumnFamilies({"one"});
+ ColumnFamilyOptions default_cf, one;
+ db_options_.max_open_files = 20; // only 10 files in file cache
+ db_options_.max_background_compactions = 3;
+
+ default_cf.compaction_style = kCompactionStyleLevel;
+ default_cf.num_levels = 3;
+ default_cf.write_buffer_size = 64 << 10; // 64KB
+ default_cf.target_file_size_base = 30 << 10;
+ default_cf.max_compaction_bytes = default_cf.target_file_size_base * 1100;
+ BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+ table_options.no_block_cache = true;
+ default_cf.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ one.compaction_style = kCompactionStyleUniversal;
+
+ one.num_levels = 1;
+ // trigger compaction if there are >= 4 files
+ one.level0_file_num_compaction_trigger = 4;
+ one.write_buffer_size = 120000;
+
+ Reopen({default_cf, one});
+ // make sure all background compaction jobs can be scheduled
+ auto stop_token =
+ dbfull()->TEST_write_controler().GetCompactionPressureToken();
+
+ // SETUP column family "one" -- universal style
+ for (int i = 0; i < one.level0_file_num_compaction_trigger - 2; ++i) {
+ PutRandomData(1, 10, 12000, true);
+ PutRandomData(1, 1, 10, true);
+ WaitForFlush(1);
+ AssertFilesPerLevel(std::to_string(i + 1), 1);
+ }
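+ // Sync-point choreography: the manual compaction pauses in the AfterRun
+ // callback (signals ManualAuto:4, waits at :3). :4 releases the main
+ // thread (:5), which piles up enough new L0 files to trigger an automatic
+ // compaction on the same CF; :1 then releases the paused manual compaction
+ // (:3) and allows the automatic compaction's AfterRun callback to signal :2.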
+ std::atomic_bool cf_1_1{true};
+ std::atomic_bool cf_1_2{true};
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"ColumnFamilyTest::ManualAuto:4", "ColumnFamilyTest::ManualAuto:2"},
+ {"ColumnFamilyTest::ManualAuto:4", "ColumnFamilyTest::ManualAuto:5"},
+ {"ColumnFamilyTest::ManualAuto:1", "ColumnFamilyTest::ManualAuto:2"},
+ {"ColumnFamilyTest::ManualAuto:1", "ColumnFamilyTest::ManualAuto:3"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:NonTrivial:AfterRun", [&](void* /*arg*/) {
+ if (cf_1_1.exchange(false)) {
+ TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:4");
+ TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:3");
+ } else if (cf_1_2.exchange(false)) {
+ TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:2");
+ }
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ ROCKSDB_NAMESPACE::port::Thread threads([&] {
+ CompactRangeOptions compact_options;
+ compact_options.exclusive_manual_compaction = false;
+ ASSERT_OK(
+ db_->CompactRange(compact_options, handles_[1], nullptr, nullptr));
+ });
+
+ TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:5");
+
+ WaitForFlush(1);
+
+ // Add more L0 files and force automatic compaction
+ for (int i = 0; i < one.level0_file_num_compaction_trigger; ++i) {
+ PutRandomData(1, 10, 12000, true);
+ PutRandomData(1, 1, 10, true);
+ WaitForFlush(1);
+ AssertFilesPerLevel(
+ std::to_string(one.level0_file_num_compaction_trigger + i), 1);
+ }
+
+ TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:1");
+
+ threads.join();
+ WaitForCompaction();
+ // VERIFY compaction "one"
+ ASSERT_LE(NumTableFilesAtLevel(0, 1), 2);
+
+ // Compare against saved keys
+ std::set<std::string>::iterator key_iter = keys_[1].begin();
+ while (key_iter != keys_[1].end()) {
+ ASSERT_NE("NOT_FOUND", Get(1, *key_iter));
+ key_iter++;
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_P(ColumnFamilyTest, SameCFManualAutomaticCompactionsLevel) {
+ Open();
+ CreateColumnFamilies({"one"});
+ ColumnFamilyOptions default_cf, one;
+ db_options_.max_open_files = 20; // only 10 files in file cache
+ db_options_.max_background_compactions = 3;
+
+ default_cf.compaction_style = kCompactionStyleLevel;
+ default_cf.num_levels = 3;
+ default_cf.write_buffer_size = 64 << 10; // 64KB
+ default_cf.target_file_size_base = 30 << 10;
+ default_cf.max_compaction_bytes = default_cf.target_file_size_base * 1100;
+ BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+ table_options.no_block_cache = true;
+ default_cf.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ one.compaction_style = kCompactionStyleLevel;
+
+ one.num_levels = 1;
+ // trigger compaction if there are >= 3 files
+ one.level0_file_num_compaction_trigger = 3;
+ one.write_buffer_size = 120000;
+
+ Reopen({default_cf, one});
+ // make sure all background compaction jobs can be scheduled
+ auto stop_token =
+ dbfull()->TEST_write_controler().GetCompactionPressureToken();
+
+ // SETUP column family "one" -- level style
+ for (int i = 0; i < one.level0_file_num_compaction_trigger - 2; ++i) {
+ PutRandomData(1, 10, 12000, true);
+ PutRandomData(1, 1, 10, true);
+ WaitForFlush(1);
+ AssertFilesPerLevel(std::to_string(i + 1), 1);
+ }
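+ // Level-style variant: the manual compaction pauses in the AfterRun
+ // callback (signals ManualAuto:4, waits at :3). :3 is released only after
+ // the automatic level compaction has been picked
+ // (LevelCompactionPicker::PickCompactionBySize:0) and the main thread has
+ // reached :1; the automatic compaction's AfterRun callback then signals :2.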
+ std::atomic_bool cf_1_1{true};
+ std::atomic_bool cf_1_2{true};
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"ColumnFamilyTest::ManualAuto:4", "ColumnFamilyTest::ManualAuto:2"},
+ {"ColumnFamilyTest::ManualAuto:4", "ColumnFamilyTest::ManualAuto:5"},
+ {"ColumnFamilyTest::ManualAuto:3", "ColumnFamilyTest::ManualAuto:2"},
+ {"LevelCompactionPicker::PickCompactionBySize:0",
+ "ColumnFamilyTest::ManualAuto:3"},
+ {"ColumnFamilyTest::ManualAuto:1", "ColumnFamilyTest::ManualAuto:3"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:NonTrivial:AfterRun", [&](void* /*arg*/) {
+ if (cf_1_1.exchange(false)) {
+ TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:4");
+ TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:3");
+ } else if (cf_1_2.exchange(false)) {
+ TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:2");
+ }
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ ROCKSDB_NAMESPACE::port::Thread threads([&] {
+ CompactRangeOptions compact_options;
+ compact_options.exclusive_manual_compaction = false;
+ ASSERT_OK(
+ db_->CompactRange(compact_options, handles_[1], nullptr, nullptr));
+ });
+
+ TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:5");
+
+ // Add more L0 files and force automatic compaction
+ for (int i = 0; i < one.level0_file_num_compaction_trigger; ++i) {
+ PutRandomData(1, 10, 12000, true);
+ PutRandomData(1, 1, 10, true);
+ WaitForFlush(1);
+ AssertFilesPerLevel(
+ std::to_string(one.level0_file_num_compaction_trigger + i), 1);
+ }
+
+ TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:1");
+
+ threads.join();
+ WaitForCompaction();
+ // VERIFY compaction "one"
+ AssertFilesPerLevel("0,1", 1);
+
+ // Compare against saved keys
+ std::set<std::string>::iterator key_iter = keys_[1].begin();
+ while (key_iter != keys_[1].end()) {
+ ASSERT_NE("NOT_FOUND", Get(1, *key_iter));
+ key_iter++;
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+// In this test, we generate enough files to trigger automatic compactions.
+// The automatic compaction waits in NonTrivial:AfterRun.
+// We then generate more files and issue a manual compaction.
+// The manual compaction hits a conflict because the automatic compaction
+// holds the files it needs. Once the conflict is hit, the automatic
+// compaction runs to completion, and then the manual compaction runs and ends.
+TEST_P(ColumnFamilyTest, SameCFAutomaticManualCompactions) {
+ Open();
+ CreateColumnFamilies({"one"});
+ ColumnFamilyOptions default_cf, one;
+ db_options_.max_open_files = 20; // only 10 files in file cache
+ db_options_.max_background_compactions = 3;
+
+ default_cf.compaction_style = kCompactionStyleLevel;
+ default_cf.num_levels = 3;
+ default_cf.write_buffer_size = 64 << 10; // 64KB
+ default_cf.target_file_size_base = 30 << 10;
+ default_cf.max_compaction_bytes = default_cf.target_file_size_base * 1100;
+ BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+ table_options.no_block_cache = true;
+ default_cf.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ one.compaction_style = kCompactionStyleUniversal;
+
+ one.num_levels = 1;
+ // trigger compaction if there are >= 4 files
+ one.level0_file_num_compaction_trigger = 4;
+ one.write_buffer_size = 120000;
+
+ Reopen({default_cf, one});
+ // make sure all background compaction jobs can be scheduled
+ auto stop_token =
+ dbfull()->TEST_write_controler().GetCompactionPressureToken();
+
+ std::atomic_bool cf_1_1{true};
+ std::atomic_bool cf_1_2{true};
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"ColumnFamilyTest::AutoManual:4", "ColumnFamilyTest::AutoManual:2"},
+ {"ColumnFamilyTest::AutoManual:4", "ColumnFamilyTest::AutoManual:5"},
+ {"CompactionPicker::CompactRange:Conflict",
+ "ColumnFamilyTest::AutoManual:3"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:NonTrivial:AfterRun", [&](void* /*arg*/) {
+ if (cf_1_1.exchange(false)) {
+ TEST_SYNC_POINT("ColumnFamilyTest::AutoManual:4");
+ TEST_SYNC_POINT("ColumnFamilyTest::AutoManual:3");
+ } else if (cf_1_2.exchange(false)) {
+ TEST_SYNC_POINT("ColumnFamilyTest::AutoManual:2");
+ }
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // SETUP column family "one" -- universal style
+ for (int i = 0; i < one.level0_file_num_compaction_trigger; ++i) {
+ PutRandomData(1, 10, 12000, true);
+ PutRandomData(1, 1, 10, true);
+ WaitForFlush(1);
+ AssertFilesPerLevel(std::to_string(i + 1), 1);
+ }
+
+ TEST_SYNC_POINT("ColumnFamilyTest::AutoManual:5");
+
+ // Add more L0 files while the automatic compaction is paused
+ for (int i = 0; i < one.level0_file_num_compaction_trigger - 2; ++i) {
+ PutRandomData(1, 10, 12000, true);
+ PutRandomData(1, 1, 10, true);
+ WaitForFlush(1);
+ }
+
+ CompactRangeOptions compact_options;
+ compact_options.exclusive_manual_compaction = false;
+ ASSERT_OK(db_->CompactRange(compact_options, handles_[1], nullptr, nullptr));
+
+ TEST_SYNC_POINT("ColumnFamilyTest::AutoManual:1");
+
+ WaitForCompaction();
+ // VERIFY compaction "one"
+ AssertFilesPerLevel("1", 1);
+ // Compare against saved keys
+ std::set<std::string>::iterator key_iter = keys_[1].begin();
+ while (key_iter != keys_[1].end()) {
+ ASSERT_NE("NOT_FOUND", Get(1, *key_iter));
+ key_iter++;
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+#endif // !ROCKSDB_LITE
+
+#ifndef ROCKSDB_LITE // Tailing iterator not supported
+namespace {
+std::string IterStatus(Iterator* iter) {
+ std::string result;
+ if (iter->Valid()) {
+ result = iter->key().ToString() + "->" + iter->value().ToString();
+ } else {
+ EXPECT_OK(iter->status());
+ result = "(invalid)";
+ }
+ return result;
+}
+} // anonymous namespace
+
+TEST_P(ColumnFamilyTest, NewIteratorsTest) {
+ // iter == 0 -- no tailing
+ // iter == 1 -- tailing
+ for (int iter = 0; iter < 2; ++iter) {
+ Open();
+ CreateColumnFamiliesAndReopen({"one", "two"});
+ ASSERT_OK(Put(0, "a", "b"));
+ ASSERT_OK(Put(1, "b", "a"));
+ ASSERT_OK(Put(2, "c", "m"));
+ ASSERT_OK(Put(2, "v", "t"));
+ std::vector<Iterator*> iterators;
+ ReadOptions options;
+ options.tailing = (iter == 1);
+ ASSERT_OK(db_->NewIterators(options, handles_, &iterators));
+
+ for (auto it : iterators) {
+ it->SeekToFirst();
+ }
+ ASSERT_EQ(IterStatus(iterators[0]), "a->b");
+ ASSERT_EQ(IterStatus(iterators[1]), "b->a");
+ ASSERT_EQ(IterStatus(iterators[2]), "c->m");
+
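+ // This write happens after the iterators were created; only a tailing
+ // iterator on CF "one" will observe it.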
+ ASSERT_OK(Put(1, "x", "x"));
+
+ for (auto it : iterators) {
+ it->Next();
+ }
+
+ ASSERT_EQ(IterStatus(iterators[0]), "(invalid)");
+ if (iter == 0) {
+ // no tailing
+ ASSERT_EQ(IterStatus(iterators[1]), "(invalid)");
+ } else {
+ // tailing
+ ASSERT_EQ(IterStatus(iterators[1]), "x->x");
+ }
+ ASSERT_EQ(IterStatus(iterators[2]), "v->t");
+
+ for (auto it : iterators) {
+ delete it;
+ }
+ Destroy();
+ }
+}
+#endif // !ROCKSDB_LITE
+
+#ifndef ROCKSDB_LITE // ReadOnlyDB is not supported
+TEST_P(ColumnFamilyTest, ReadOnlyDBTest) {
+ Open();
+ CreateColumnFamiliesAndReopen({"one", "two", "three", "four"});
+ ASSERT_OK(Put(0, "a", "b"));
+ ASSERT_OK(Put(1, "foo", "bla"));
+ ASSERT_OK(Put(2, "foo", "blabla"));
+ ASSERT_OK(Put(3, "foo", "blablabla"));
+ ASSERT_OK(Put(4, "foo", "blablablabla"));
+
+ DropColumnFamilies({2});
+ Close();
+ // open only a subset of column families
+ AssertOpenReadOnly({"default", "one", "four"});
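+ // Handle indices after the read-only open: 0 = "default", 1 = "one",
+ // 2 = "four".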
+ ASSERT_EQ("NOT_FOUND", Get(0, "foo"));
+ ASSERT_EQ("bla", Get(1, "foo"));
+ ASSERT_EQ("blablablabla", Get(2, "foo"));
+
+ // test NewIterators()
+ {
+ std::vector<Iterator*> iterators;
+ ASSERT_OK(db_->NewIterators(ReadOptions(), handles_, &iterators));
+ for (auto it : iterators) {
+ it->SeekToFirst();
+ }
+ ASSERT_EQ(IterStatus(iterators[0]), "a->b");
+ ASSERT_EQ(IterStatus(iterators[1]), "foo->bla");
+ ASSERT_EQ(IterStatus(iterators[2]), "foo->blablablabla");
+ for (auto it : iterators) {
+ it->Next();
+ }
+ ASSERT_EQ(IterStatus(iterators[0]), "(invalid)");
+ ASSERT_EQ(IterStatus(iterators[1]), "(invalid)");
+ ASSERT_EQ(IterStatus(iterators[2]), "(invalid)");
+
+ for (auto it : iterators) {
+ delete it;
+ }
+ }
+
+ Close();
+ // can't open dropped column family
+ Status s = OpenReadOnly({"default", "one", "two"});
+ ASSERT_TRUE(!s.ok());
+
+ // Can't open without specifying default column family
+ s = OpenReadOnly({"one", "four"});
+ ASSERT_TRUE(!s.ok());
+}
+#endif // !ROCKSDB_LITE
+
+#ifndef ROCKSDB_LITE // WaitForFlush() is not supported in lite
+TEST_P(ColumnFamilyTest, DontRollEmptyLogs) {
+ Open();
+ CreateColumnFamiliesAndReopen({"one", "two", "three", "four"});
+
+ for (size_t i = 0; i < handles_.size(); ++i) {
+ PutRandomData(static_cast<int>(i), 10, 100);
+ }
+ int num_writable_file_start = env_->GetNumberOfNewWritableFileCalls();
+ // this will trigger the flushes
+ for (int i = 0; i <= 4; ++i) {
+ ASSERT_OK(Flush(i));
+ }
+
+ for (int i = 0; i < 4; ++i) {
+ WaitForFlush(i);
+ }
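+ // Expect one new SST file per column family plus a single new WAL:
+ // subsequent flushes reuse the freshly rolled (still empty) log instead
+ // of rolling a new one each time.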
+ int total_new_writable_files =
+ env_->GetNumberOfNewWritableFileCalls() - num_writable_file_start;
+ ASSERT_EQ(static_cast<size_t>(total_new_writable_files), handles_.size() + 1);
+ Close();
+}
+#endif // !ROCKSDB_LITE
+
+#ifndef ROCKSDB_LITE // WaitForCompaction() is not supported in lite
+TEST_P(ColumnFamilyTest, FlushStaleColumnFamilies) {
+ Open();
+ CreateColumnFamilies({"one", "two"});
+ ColumnFamilyOptions default_cf, one, two;
+ default_cf.write_buffer_size = 100000; // small write buffer size
+ default_cf.arena_block_size = 4096;
+ default_cf.disable_auto_compactions = true;
+ one.disable_auto_compactions = true;
+ two.disable_auto_compactions = true;
+ db_options_.max_total_wal_size = 210000;
+
+ Reopen({default_cf, one, two});
+
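+ // The 10 bytes written to CF "two" keep it pinned to the oldest WAL; once
+ // total WAL size exceeds max_total_wal_size, that stale CF must be flushed
+ // so the old log can be released.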
+ PutRandomData(2, 1, 10); // 10 bytes
+ for (int i = 0; i < 2; ++i) {
+ PutRandomData(0, 100, 1000); // flush
+ WaitForFlush(0);
+
+ AssertCountLiveFiles(i + 1);
+ }
+ // Third flush. Now CF [two] should be detected as stale and flushed;
+ // column family [one] should not be flushed since it's empty.
+ PutRandomData(0, 100, 1000); // flush
+ WaitForFlush(0);
+ WaitForFlush(2);
+ // at least 3 files for the default column family, 1 file for column family
+ // [two], and zero files for column family [one], because it's empty
+ std::vector<LiveFileMetaData> metadata;
+ db_->GetLiveFilesMetaData(&metadata);
+ ASSERT_GE(metadata.size(), 4);
+ bool has_cf1_sst = false;
+ bool has_cf2_sst = false;
+ for (const auto& file : metadata) {
+ if (file.column_family_name == "one") {
+ has_cf1_sst = true;
+ } else if (file.column_family_name == "two") {
+ has_cf2_sst = true;
+ }
+ }
+ ASSERT_FALSE(has_cf1_sst);
+ ASSERT_TRUE(has_cf2_sst);
+
+ ASSERT_OK(Flush(0));
+ ASSERT_EQ(0, dbfull()->TEST_total_log_size());
+ Close();
+}
+#endif // !ROCKSDB_LITE
+
+TEST_P(ColumnFamilyTest, CreateMissingColumnFamilies) {
+ Status s = TryOpen({"one", "two"});
+ ASSERT_TRUE(!s.ok());
+ db_options_.create_missing_column_families = true;
+ s = TryOpen({"default", "one", "two"});
+ ASSERT_TRUE(s.ok());
+ Close();
+}
+
+TEST_P(ColumnFamilyTest, SanitizeOptions) {
+ DBOptions db_options;
+ for (int s = kCompactionStyleLevel; s <= kCompactionStyleUniversal; ++s) {
+ for (int l = 0; l <= 2; l++) {
+ for (int i = 1; i <= 3; i++) {
+ for (int j = 1; j <= 3; j++) {
+ for (int k = 1; k <= 3; k++) {
+ ColumnFamilyOptions original;
+ original.compaction_style = static_cast<CompactionStyle>(s);
+ original.num_levels = l;
+ original.level0_stop_writes_trigger = i;
+ original.level0_slowdown_writes_trigger = j;
+ original.level0_file_num_compaction_trigger = k;
+ original.write_buffer_size =
+ l * 4 * 1024 * 1024 + i * 1024 * 1024 + j * 1024 + k;
+
+ ColumnFamilyOptions result =
+ SanitizeOptions(ImmutableDBOptions(db_options), original);
+ ASSERT_TRUE(result.level0_stop_writes_trigger >=
+ result.level0_slowdown_writes_trigger);
+ ASSERT_TRUE(result.level0_slowdown_writes_trigger >=
+ result.level0_file_num_compaction_trigger);
+ ASSERT_TRUE(result.level0_file_num_compaction_trigger ==
+ original.level0_file_num_compaction_trigger);
+ if (s == kCompactionStyleLevel) {
+ ASSERT_GE(result.num_levels, 2);
+ } else {
+ ASSERT_GE(result.num_levels, 1);
+ if (original.num_levels >= 1) {
+ ASSERT_EQ(result.num_levels, original.num_levels);
+ }
+ }
+
+ // Make sure SanitizeOptions sets arena_block_size to 1/8 of
+ // the write_buffer_size, rounded up to a multiple of 4k and
+ // capped at 1MB.
+ size_t expected_arena_block_size =
+ l * 4 * 1024 * 1024 / 8 + i * 1024 * 1024 / 8;
+ if (j + k != 0) {
+ // not a multiple of 4k, round up 4k
+ expected_arena_block_size += 4 * 1024;
+ }
+ expected_arena_block_size =
+ std::min(size_t{1024 * 1024}, expected_arena_block_size);
+ ASSERT_EQ(expected_arena_block_size, result.arena_block_size);
+ }
+ }
+ }
+ }
+ }
+}
+
+TEST_P(ColumnFamilyTest, ReadDroppedColumnFamily) {
+ // iter 0 -- drop CF, don't reopen
+ // iter 1 -- destroy CF handle, then reopen
+ for (int iter = 0; iter < 2; ++iter) {
+ db_options_.create_missing_column_families = true;
+ db_options_.max_open_files = 20;
+ // delete obsolete files always
+ db_options_.delete_obsolete_files_period_micros = 0;
+ Open({"default", "one", "two"});
+ ColumnFamilyOptions options;
+ options.level0_file_num_compaction_trigger = 100;
+ options.level0_slowdown_writes_trigger = 200;
+ options.level0_stop_writes_trigger = 200;
+ options.write_buffer_size = 100000; // small write buffer size
+ Reopen({options, options, options});
+
+ // 1MB should create ~10 files for each CF
+ int kKeysNum = 10000;
+ PutRandomData(0, kKeysNum, 100);
+ PutRandomData(1, kKeysNum, 100);
+ PutRandomData(2, kKeysNum, 100);
+
+ {
+ std::unique_ptr<Iterator> iterator(
+ db_->NewIterator(ReadOptions(), handles_[2]));
+ iterator->SeekToFirst();
+
+ if (iter == 0) {
+ // Drop CF two
+ ASSERT_OK(db_->DropColumnFamily(handles_[2]));
+ } else {
+ // delete CF two
+ ASSERT_OK(db_->DestroyColumnFamilyHandle(handles_[2]));
+ handles_[2] = nullptr;
+ }
+ // Make sure iterator created can still be used.
+ int count = 0;
+ for (; iterator->Valid(); iterator->Next()) {
+ ASSERT_OK(iterator->status());
+ ++count;
+ }
+ ASSERT_OK(iterator->status());
+ ASSERT_EQ(count, kKeysNum);
+ }
+
+ // Add bunch more data to other CFs
+ PutRandomData(0, kKeysNum, 100);
+ PutRandomData(1, kKeysNum, 100);
+
+ if (iter == 1) {
+ Reopen();
+ }
+
+ // Since we didn't delete the CF handle, RocksDB's contract guarantees that
+ // we're still able to read the dropped CF
+ for (int i = 0; i < 3; ++i) {
+ std::unique_ptr<Iterator> iterator(
+ db_->NewIterator(ReadOptions(), handles_[i]));
+ int count = 0;
+ for (iterator->SeekToFirst(); iterator->Valid(); iterator->Next()) {
+ ASSERT_OK(iterator->status());
+ ++count;
+ }
+ ASSERT_OK(iterator->status());
+ ASSERT_EQ(count, kKeysNum * ((i == 2) ? 1 : 2));
+ }
+
+ Close();
+ Destroy();
+ }
+}
+
+TEST_P(ColumnFamilyTest, LiveIteratorWithDroppedColumnFamily) {
+ db_options_.create_missing_column_families = true;
+ db_options_.max_open_files = 20;
+ // delete obsolete files always
+ db_options_.delete_obsolete_files_period_micros = 0;
+ Open({"default", "one", "two"});
+ ColumnFamilyOptions options;
+ options.level0_file_num_compaction_trigger = 100;
+ options.level0_slowdown_writes_trigger = 200;
+ options.level0_stop_writes_trigger = 200;
+ options.write_buffer_size = 100000; // small write buffer size
+ Reopen({options, options, options});
+
+ // 1MB should create ~10 files for each CF
+ int kKeysNum = 10000;
+ PutRandomData(1, kKeysNum, 100);
+ {
+ std::unique_ptr<Iterator> iterator(
+ db_->NewIterator(ReadOptions(), handles_[1]));
+ iterator->SeekToFirst();
+
+ DropColumnFamilies({1});
+
+ // Make sure iterator created can still be used.
+ int count = 0;
+ for (; iterator->Valid(); iterator->Next()) {
+ ASSERT_OK(iterator->status());
+ ++count;
+ }
+ ASSERT_OK(iterator->status());
+ ASSERT_EQ(count, kKeysNum);
+ }
+
+ Reopen();
+ Close();
+ Destroy();
+}
+
+TEST_P(ColumnFamilyTest, FlushAndDropRaceCondition) {
+ db_options_.create_missing_column_families = true;
+ Open({"default", "one"});
+ ColumnFamilyOptions options;
+ options.level0_file_num_compaction_trigger = 100;
+ options.level0_slowdown_writes_trigger = 200;
+ options.level0_stop_writes_trigger = 200;
+ options.max_write_buffer_number = 20;
+ options.write_buffer_size = 100000; // small write buffer size
+ Reopen({options, options});
+
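+ // Interleave DropColumnFamily() with the in-flight flush: the flush job
+ // writes its L0 table and installs its results only after the drop has
+ // started, while the drop itself completes only after the flush results
+ // are installed.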
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"VersionSet::LogAndApply::ColumnFamilyDrop:0",
+ "FlushJob::WriteLevel0Table"},
+ {"VersionSet::LogAndApply::ColumnFamilyDrop:1",
+ "FlushJob::InstallResults"},
+ {"FlushJob::InstallResults",
+ "VersionSet::LogAndApply::ColumnFamilyDrop:2"}});
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ test::SleepingBackgroundTask sleeping_task;
+
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task,
+ Env::Priority::HIGH);
+ // Make sure the task is sleeping. Otherwise, it might start to execute
+ // after sleeping_task.WaitUntilDone() and cause TSAN warning.
+ sleeping_task.WaitUntilSleeping();
+
+ // 1MB should create ~10 files for each CF
+ int kKeysNum = 10000;
+ PutRandomData(1, kKeysNum, 100);
+
+ std::vector<port::Thread> threads;
+ threads.emplace_back([&] { ASSERT_OK(db_->DropColumnFamily(handles_[1])); });
+
+ sleeping_task.WakeUp();
+ sleeping_task.WaitUntilDone();
+ sleeping_task.Reset();
+ // Now we sleep again. This is just so we're certain that the flush job
+ // finished.
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task,
+ Env::Priority::HIGH);
+ // Make sure the task is sleeping. Otherwise, it might start to execute
+ // after sleeping_task.WaitUntilDone() and cause TSAN warning.
+ sleeping_task.WaitUntilSleeping();
+ sleeping_task.WakeUp();
+ sleeping_task.WaitUntilDone();
+
+ {
+ // Since we didn't delete the CF handle, RocksDB's contract guarantees that
+ // we're still able to read the dropped CF
+ std::unique_ptr<Iterator> iterator(
+ db_->NewIterator(ReadOptions(), handles_[1]));
+ int count = 0;
+ for (iterator->SeekToFirst(); iterator->Valid(); iterator->Next()) {
+ ASSERT_OK(iterator->status());
+ ++count;
+ }
+ ASSERT_OK(iterator->status());
+ ASSERT_EQ(count, kKeysNum);
+ }
+ for (auto& t : threads) {
+ t.join();
+ }
+
+ Close();
+ Destroy();
+}
+
+#ifndef ROCKSDB_LITE
+// skipped as persisting options is not supported in ROCKSDB_LITE
+namespace {
+std::atomic<int> test_stage(0);
+std::atomic<bool> ordered_by_writethread(false);
+const int kMainThreadStartPersistingOptionsFile = 1;
+const int kChildThreadFinishDroppingColumnFamily = 2;
+void DropSingleColumnFamily(ColumnFamilyTest* cf_test, int cf_id,
+ std::vector<Comparator*>* comparators) {
+ while (test_stage < kMainThreadStartPersistingOptionsFile &&
+ !ordered_by_writethread) {
+ Env::Default()->SleepForMicroseconds(100);
+ }
+ cf_test->DropColumnFamilies({cf_id});
+ if ((*comparators)[cf_id]) {
+ delete (*comparators)[cf_id];
+ (*comparators)[cf_id] = nullptr;
+ }
+ test_stage = kChildThreadFinishDroppingColumnFamily;
+}
+} // anonymous namespace
+
+TEST_P(ColumnFamilyTest, CreateAndDropRace) {
+ const int kCfCount = 5;
+ std::vector<ColumnFamilyOptions> cf_opts;
+ std::vector<Comparator*> comparators;
+ for (int i = 0; i < kCfCount; ++i) {
+ cf_opts.emplace_back();
+ comparators.push_back(new test::SimpleSuffixReverseComparator());
+ cf_opts.back().comparator = comparators.back();
+ }
+ db_options_.create_if_missing = true;
+ db_options_.create_missing_column_families = true;
+
+ auto main_thread_id = std::this_thread::get_id();
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "PersistRocksDBOptions:start", [&](void* /*arg*/) {
+ auto current_thread_id = std::this_thread::get_id();
+ // If it's the main thread hitting this sync-point, then it
+ // will be blocked until some other thread updates the test_stage.
+ if (main_thread_id == current_thread_id) {
+ test_stage = kMainThreadStartPersistingOptionsFile;
+ while (test_stage < kChildThreadFinishDroppingColumnFamily &&
+ !ordered_by_writethread) {
+ Env::Default()->SleepForMicroseconds(100);
+ }
+ }
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "WriteThread::EnterUnbatched:Wait", [&](void* /*arg*/) {
+ // This means a thread doing DropColumnFamily() is waiting for
+ // another thread to finish persisting options.
+ // In that case, we update the test_stage to unblock the main thread.
+ ordered_by_writethread = true;
+ });
+
+ // Create a database with four column families
+ Open({"default", "one", "two", "three"},
+ {cf_opts[0], cf_opts[1], cf_opts[2], cf_opts[3]});
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // Start a thread that will drop the first column family
+ // and its comparator
+ ROCKSDB_NAMESPACE::port::Thread drop_cf_thread(DropSingleColumnFamily, this,
+ 1, &comparators);
+
+ DropColumnFamilies({2});
+
+ drop_cf_thread.join();
+ Close();
+ Destroy();
+ for (auto* comparator : comparators) {
+ if (comparator) {
+ delete comparator;
+ }
+ }
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+#endif // !ROCKSDB_LITE
+
+TEST_P(ColumnFamilyTest, WriteStallSingleColumnFamily) {
+ const uint64_t kBaseRate = 800000u;
+ db_options_.delayed_write_rate = kBaseRate;
+ db_options_.max_background_compactions = 6;
+
+ Open({"default"});
+ ColumnFamilyData* cfd =
+ static_cast<ColumnFamilyHandleImpl*>(db_->DefaultColumnFamily())->cfd();
+
+ VersionStorageInfo* vstorage = cfd->current()->storage_info();
+
+ MutableCFOptions mutable_cf_options(column_family_options_);
+
+ mutable_cf_options.level0_slowdown_writes_trigger = 20;
+ mutable_cf_options.level0_stop_writes_trigger = 10000;
+ mutable_cf_options.soft_pending_compaction_bytes_limit = 200;
+ mutable_cf_options.hard_pending_compaction_bytes_limit = 2000;
+ mutable_cf_options.disable_auto_compactions = false;
+
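+ // Pending compaction bytes between the soft limit (200) and the hard limit
+ // (2000) introduce a write delay: the delayed write rate is divided by 1.25
+ // for each consecutive increase of the estimate and stepped back up as it
+ // shrinks. Crossing the hard limit stops writes entirely; dropping below
+ // the soft limit removes the delay.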
+ vstorage->TEST_set_estimated_compaction_needed_bytes(50);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay());
+
+ vstorage->TEST_set_estimated_compaction_needed_bytes(201);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_EQ(kBaseRate, GetDbDelayedWriteRate());
+ ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed());
+
+ vstorage->TEST_set_estimated_compaction_needed_bytes(400);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_EQ(kBaseRate / 1.25, GetDbDelayedWriteRate());
+ ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed());
+
+ vstorage->TEST_set_estimated_compaction_needed_bytes(500);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_EQ(kBaseRate / 1.25 / 1.25, GetDbDelayedWriteRate());
+
+ vstorage->TEST_set_estimated_compaction_needed_bytes(450);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_EQ(kBaseRate / 1.25, GetDbDelayedWriteRate());
+
+ vstorage->TEST_set_estimated_compaction_needed_bytes(205);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_EQ(kBaseRate, GetDbDelayedWriteRate());
+
+ vstorage->TEST_set_estimated_compaction_needed_bytes(202);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_EQ(kBaseRate, GetDbDelayedWriteRate());
+
+ vstorage->TEST_set_estimated_compaction_needed_bytes(201);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_EQ(kBaseRate, GetDbDelayedWriteRate());
+
+ vstorage->TEST_set_estimated_compaction_needed_bytes(198);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay());
+
+ vstorage->TEST_set_estimated_compaction_needed_bytes(399);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_EQ(kBaseRate, GetDbDelayedWriteRate());
+
+ vstorage->TEST_set_estimated_compaction_needed_bytes(599);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_EQ(kBaseRate / 1.25, GetDbDelayedWriteRate());
+
+ vstorage->TEST_set_estimated_compaction_needed_bytes(2001);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(IsDbWriteStopped());
+ ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed());
+
+ vstorage->TEST_set_estimated_compaction_needed_bytes(3001);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(IsDbWriteStopped());
+ ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay());
+
+ vstorage->TEST_set_estimated_compaction_needed_bytes(390);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_EQ(kBaseRate / 1.25, GetDbDelayedWriteRate());
+
+ vstorage->TEST_set_estimated_compaction_needed_bytes(100);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay());
+
+ vstorage->set_l0_delay_trigger_count(100);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_EQ(kBaseRate, GetDbDelayedWriteRate());
+ ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed());
+
+ vstorage->set_l0_delay_trigger_count(101);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_EQ(kBaseRate / 1.25, GetDbDelayedWriteRate());
+
+ vstorage->set_l0_delay_trigger_count(0);
+ vstorage->TEST_set_estimated_compaction_needed_bytes(300);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_EQ(kBaseRate / 1.25 / 1.25, GetDbDelayedWriteRate());
+
+ vstorage->set_l0_delay_trigger_count(101);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_EQ(kBaseRate / 1.25 / 1.25 / 1.25, GetDbDelayedWriteRate());
+
+ vstorage->TEST_set_estimated_compaction_needed_bytes(200);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_EQ(kBaseRate / 1.25 / 1.25, GetDbDelayedWriteRate());
+
+ vstorage->set_l0_delay_trigger_count(0);
+ vstorage->TEST_set_estimated_compaction_needed_bytes(0);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay());
+
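+ // With auto compactions disabled, L0 buildup and pending compaction bytes
+ // no longer produce a write delay; the controller keeps its configured
+ // delayed write rate but does not apply it.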
+ mutable_cf_options.disable_auto_compactions = true;
+ dbfull()->TEST_write_controler().set_delayed_write_rate(kBaseRate);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay());
+
+ vstorage->set_l0_delay_trigger_count(50);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_EQ(0, GetDbDelayedWriteRate());
+ ASSERT_EQ(kBaseRate, dbfull()->TEST_write_controler().delayed_write_rate());
+
+ vstorage->set_l0_delay_trigger_count(60);
+ vstorage->TEST_set_estimated_compaction_needed_bytes(300);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_EQ(0, GetDbDelayedWriteRate());
+ ASSERT_EQ(kBaseRate, dbfull()->TEST_write_controler().delayed_write_rate());
+
+ mutable_cf_options.disable_auto_compactions = false;
+ vstorage->set_l0_delay_trigger_count(70);
+ vstorage->TEST_set_estimated_compaction_needed_bytes(500);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_EQ(kBaseRate, GetDbDelayedWriteRate());
+
+ vstorage->set_l0_delay_trigger_count(71);
+ vstorage->TEST_set_estimated_compaction_needed_bytes(501);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_EQ(kBaseRate / 1.25, GetDbDelayedWriteRate());
+}
+
+TEST_P(ColumnFamilyTest, CompactionSpeedupSingleColumnFamily) {
+ db_options_.max_background_compactions = 6;
+ Open({"default"});
+ ColumnFamilyData* cfd =
+ static_cast<ColumnFamilyHandleImpl*>(db_->DefaultColumnFamily())->cfd();
+
+ VersionStorageInfo* vstorage = cfd->current()->storage_info();
+
+ MutableCFOptions mutable_cf_options(column_family_options_);
+
+ // Speed up threshold = min(4 * 2, 4 + (36 - 4)/4) = 8
+ mutable_cf_options.level0_file_num_compaction_trigger = 4;
+ mutable_cf_options.level0_slowdown_writes_trigger = 36;
+ mutable_cf_options.level0_stop_writes_trigger = 50;
+ // Speedup threshold = 200 / 4 = 50
+ mutable_cf_options.soft_pending_compaction_bytes_limit = 200;
+ mutable_cf_options.hard_pending_compaction_bytes_limit = 2000;
+
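+ // At or above either speed-up threshold from the comments above (pending
+ // compaction bytes >= 50 or L0 file count >= 8), the allowed background
+ // compactions jump from 1 to the configured maximum of 6; falling back
+ // below both thresholds drops them back to 1.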
+ vstorage->TEST_set_estimated_compaction_needed_bytes(40);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed());
+
+ vstorage->TEST_set_estimated_compaction_needed_bytes(50);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed());
+
+ vstorage->TEST_set_estimated_compaction_needed_bytes(300);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed());
+
+ vstorage->TEST_set_estimated_compaction_needed_bytes(45);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed());
+
+ vstorage->set_l0_delay_trigger_count(7);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed());
+
+ vstorage->set_l0_delay_trigger_count(9);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed());
+
+ vstorage->set_l0_delay_trigger_count(6);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed());
+
+ // Speed up threshold = min(4 * 2, 4 + (16 - 4)/4) = 7
+ mutable_cf_options.level0_file_num_compaction_trigger = 4;
+ mutable_cf_options.level0_slowdown_writes_trigger = 16;
+ mutable_cf_options.level0_stop_writes_trigger = 30;
+
+ vstorage->set_l0_delay_trigger_count(5);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed());
+
+ vstorage->set_l0_delay_trigger_count(7);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed());
+
+ vstorage->set_l0_delay_trigger_count(3);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed());
+}
+
+TEST_P(ColumnFamilyTest, WriteStallTwoColumnFamilies) {
+ const uint64_t kBaseRate = 810000u;
+ db_options_.delayed_write_rate = kBaseRate;
+ Open();
+ CreateColumnFamilies({"one"});
+ ColumnFamilyData* cfd =
+ static_cast<ColumnFamilyHandleImpl*>(db_->DefaultColumnFamily())->cfd();
+ VersionStorageInfo* vstorage = cfd->current()->storage_info();
+
+ ColumnFamilyData* cfd1 =
+ static_cast<ColumnFamilyHandleImpl*>(handles_[1])->cfd();
+ VersionStorageInfo* vstorage1 = cfd1->current()->storage_info();
+
+ MutableCFOptions mutable_cf_options(column_family_options_);
+ mutable_cf_options.level0_slowdown_writes_trigger = 20;
+ mutable_cf_options.level0_stop_writes_trigger = 10000;
+ mutable_cf_options.soft_pending_compaction_bytes_limit = 200;
+ mutable_cf_options.hard_pending_compaction_bytes_limit = 2000;
+
+ MutableCFOptions mutable_cf_options1 = mutable_cf_options;
+ mutable_cf_options1.soft_pending_compaction_bytes_limit = 500;
+
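+ // Both column families share one write controller: the delay remains in
+ // effect as long as at least one CF exceeds its soft pending-compaction
+ // limit, and the delayed write rate is tightened or relaxed as the CFs'
+ // pressure grows or shrinks.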
+ vstorage->TEST_set_estimated_compaction_needed_bytes(50);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay());
+
+ vstorage1->TEST_set_estimated_compaction_needed_bytes(201);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay());
+
+ vstorage1->TEST_set_estimated_compaction_needed_bytes(600);
+ RecalculateWriteStallConditions(cfd1, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_EQ(kBaseRate, GetDbDelayedWriteRate());
+
+ vstorage->TEST_set_estimated_compaction_needed_bytes(70);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_EQ(kBaseRate, GetDbDelayedWriteRate());
+
+ vstorage1->TEST_set_estimated_compaction_needed_bytes(800);
+ RecalculateWriteStallConditions(cfd1, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_EQ(kBaseRate / 1.25, GetDbDelayedWriteRate());
+
+ vstorage->TEST_set_estimated_compaction_needed_bytes(300);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_EQ(kBaseRate / 1.25 / 1.25, GetDbDelayedWriteRate());
+
+ vstorage1->TEST_set_estimated_compaction_needed_bytes(700);
+ RecalculateWriteStallConditions(cfd1, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_EQ(kBaseRate / 1.25, GetDbDelayedWriteRate());
+
+ vstorage->TEST_set_estimated_compaction_needed_bytes(500);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_EQ(kBaseRate / 1.25 / 1.25, GetDbDelayedWriteRate());
+
+ vstorage1->TEST_set_estimated_compaction_needed_bytes(600);
+ RecalculateWriteStallConditions(cfd1, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_EQ(kBaseRate / 1.25, GetDbDelayedWriteRate());
+}
+
+TEST_P(ColumnFamilyTest, CompactionSpeedupTwoColumnFamilies) {
+ db_options_.max_background_compactions = 6;
+ column_family_options_.soft_pending_compaction_bytes_limit = 200;
+ column_family_options_.hard_pending_compaction_bytes_limit = 2000;
+ Open();
+ CreateColumnFamilies({"one"});
+ ColumnFamilyData* cfd =
+ static_cast<ColumnFamilyHandleImpl*>(db_->DefaultColumnFamily())->cfd();
+ VersionStorageInfo* vstorage = cfd->current()->storage_info();
+
+ ColumnFamilyData* cfd1 =
+ static_cast<ColumnFamilyHandleImpl*>(handles_[1])->cfd();
+ VersionStorageInfo* vstorage1 = cfd1->current()->storage_info();
+
+ MutableCFOptions mutable_cf_options(column_family_options_);
+ // Speed up threshold = min(4 * 2, 4 + (36 - 4)/4) = 8
+ mutable_cf_options.level0_file_num_compaction_trigger = 4;
+ mutable_cf_options.level0_slowdown_writes_trigger = 36;
+ mutable_cf_options.level0_stop_writes_trigger = 30;
+ // Speedup threshold = 200 / 4 = 50
+ mutable_cf_options.soft_pending_compaction_bytes_limit = 200;
+ mutable_cf_options.hard_pending_compaction_bytes_limit = 2000;
+
+ MutableCFOptions mutable_cf_options1 = mutable_cf_options;
+ mutable_cf_options1.level0_slowdown_writes_trigger = 16;
+
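+ // Once any CF's recalculated state crosses its speed-up threshold, allowed
+ // background compactions jump from 1 to 6; they drop back to 1 only after
+ // every CF has been recalculated below its threshold.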
+ vstorage->TEST_set_estimated_compaction_needed_bytes(40);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed());
+
+ vstorage->TEST_set_estimated_compaction_needed_bytes(60);
+ RecalculateWriteStallConditions(cfd1, mutable_cf_options);
+ ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed());
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed());
+
+ vstorage1->TEST_set_estimated_compaction_needed_bytes(30);
+ RecalculateWriteStallConditions(cfd1, mutable_cf_options);
+ ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed());
+
+ vstorage1->TEST_set_estimated_compaction_needed_bytes(70);
+ RecalculateWriteStallConditions(cfd1, mutable_cf_options);
+ ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed());
+
+ vstorage->TEST_set_estimated_compaction_needed_bytes(20);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed());
+
+ vstorage1->TEST_set_estimated_compaction_needed_bytes(3);
+ RecalculateWriteStallConditions(cfd1, mutable_cf_options);
+ ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed());
+
+ vstorage->set_l0_delay_trigger_count(9);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed());
+
+ vstorage1->set_l0_delay_trigger_count(2);
+ RecalculateWriteStallConditions(cfd1, mutable_cf_options);
+ ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed());
+
+ vstorage->set_l0_delay_trigger_count(0);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed());
+}
+
+TEST_P(ColumnFamilyTest, CreateAndDestroyOptions) {
+ std::unique_ptr<ColumnFamilyOptions> cfo(new ColumnFamilyOptions());
+ ColumnFamilyHandle* cfh;
+ Open();
+ ASSERT_OK(db_->CreateColumnFamily(*(cfo.get()), "yoyo", &cfh));
+ cfo.reset();
+ ASSERT_OK(db_->Put(WriteOptions(), cfh, "foo", "bar"));
+ ASSERT_OK(db_->Flush(FlushOptions(), cfh));
+ ASSERT_OK(db_->DropColumnFamily(cfh));
+ ASSERT_OK(db_->DestroyColumnFamilyHandle(cfh));
+}
+
+TEST_P(ColumnFamilyTest, CreateDropAndDestroy) {
+ ColumnFamilyHandle* cfh;
+ Open();
+ ASSERT_OK(db_->CreateColumnFamily(ColumnFamilyOptions(), "yoyo", &cfh));
+ ASSERT_OK(db_->Put(WriteOptions(), cfh, "foo", "bar"));
+ ASSERT_OK(db_->Flush(FlushOptions(), cfh));
+ ASSERT_OK(db_->DropColumnFamily(cfh));
+ ASSERT_OK(db_->DestroyColumnFamilyHandle(cfh));
+}
+
+#ifndef ROCKSDB_LITE
+TEST_P(ColumnFamilyTest, CreateDropAndDestroyWithoutFileDeletion) {
+ ColumnFamilyHandle* cfh;
+ Open();
+ ASSERT_OK(db_->CreateColumnFamily(ColumnFamilyOptions(), "yoyo", &cfh));
+ ASSERT_OK(db_->Put(WriteOptions(), cfh, "foo", "bar"));
+ ASSERT_OK(db_->Flush(FlushOptions(), cfh));
+ ASSERT_OK(db_->DisableFileDeletions());
+ ASSERT_OK(db_->DropColumnFamily(cfh));
+ ASSERT_OK(db_->DestroyColumnFamilyHandle(cfh));
+}
+
+TEST_P(ColumnFamilyTest, FlushCloseWALFiles) {
+ SpecialEnv env(Env::Default());
+ db_options_.env = &env;
+ db_options_.max_background_flushes = 1;
+ column_family_options_.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(2));
+ Open();
+ CreateColumnFamilies({"one"});
+ ASSERT_OK(Put(1, "fodor", "mirko"));
+ ASSERT_OK(Put(0, "fodor", "mirko"));
+ ASSERT_OK(Put(1, "fodor", "mirko"));
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ {"DBImpl::BGWorkFlush:done", "FlushCloseWALFiles:0"},
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // Block flush jobs from running
+ test::SleepingBackgroundTask sleeping_task;
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task,
+ Env::Priority::HIGH);
+ // Make sure the task is sleeping. Otherwise, it might start to execute
+ // after sleeping_task.WaitUntilDone() and cause a TSAN warning.
+ sleeping_task.WaitUntilSleeping();
+
+ WriteOptions wo;
+ wo.sync = true;
+ ASSERT_OK(db_->Put(wo, handles_[1], "fodor", "mirko"));
+
+ ASSERT_EQ(2, env.num_open_wal_file_.load());
+
+ sleeping_task.WakeUp();
+ sleeping_task.WaitUntilDone();
+ TEST_SYNC_POINT("FlushCloseWALFiles:0");
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ASSERT_EQ(1, env.num_open_wal_file_.load());
+
+ Reopen();
+ ASSERT_EQ("mirko", Get(0, "fodor"));
+ ASSERT_EQ("mirko", Get(1, "fodor"));
+ db_options_.env = env_;
+ Close();
+}
+#endif // !ROCKSDB_LITE
+
+#ifndef ROCKSDB_LITE // WaitForFlush() is not supported
+TEST_P(ColumnFamilyTest, IteratorCloseWALFile1) {
+ SpecialEnv env(Env::Default());
+ db_options_.env = &env;
+ db_options_.max_background_flushes = 1;
+ column_family_options_.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(2));
+ Open();
+ CreateColumnFamilies({"one"});
+ ASSERT_OK(Put(1, "fodor", "mirko"));
+ // Create an iterator holding the current super version.
+ Iterator* it = db_->NewIterator(ReadOptions(), handles_[1]);
+ ASSERT_OK(it->status());
+ // A flush will make `it` hold the last reference of its super version.
+ ASSERT_OK(Flush(1));
+
+ ASSERT_OK(Put(1, "fodor", "mirko"));
+ ASSERT_OK(Put(0, "fodor", "mirko"));
+ ASSERT_OK(Put(1, "fodor", "mirko"));
+
+ // Flush jobs will close previous WAL files after finishing. By
+ // blocking flush jobs from running, we trigger a condition where
+ // the iterator destructor should close the WAL files.
+ test::SleepingBackgroundTask sleeping_task;
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task,
+ Env::Priority::HIGH);
+ // Make sure the task is sleeping. Otherwise, it might start to execute
+ // after sleeping_task.WaitUntilDone() and cause a TSAN warning.
+ sleeping_task.WaitUntilSleeping();
+
+ WriteOptions wo;
+ wo.sync = true;
+ ASSERT_OK(db_->Put(wo, handles_[1], "fodor", "mirko"));
+
+ ASSERT_EQ(2, env.num_open_wal_file_.load());
+ // Deleting the iterator will clear its super version, triggering
+ // closing all files
+ delete it;
+ ASSERT_EQ(1, env.num_open_wal_file_.load());
+
+ sleeping_task.WakeUp();
+ sleeping_task.WaitUntilDone();
+ WaitForFlush(1);
+
+ Reopen();
+ ASSERT_EQ("mirko", Get(0, "fodor"));
+ ASSERT_EQ("mirko", Get(1, "fodor"));
+ db_options_.env = env_;
+ Close();
+}
+
+TEST_P(ColumnFamilyTest, IteratorCloseWALFile2) {
+ SpecialEnv env(Env::Default());
+ // Allow both flush and purge jobs to be scheduled.
+ env.SetBackgroundThreads(2, Env::HIGH);
+ db_options_.env = &env;
+ db_options_.max_background_flushes = 1;
+ column_family_options_.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(2));
+ Open();
+ CreateColumnFamilies({"one"});
+ ASSERT_OK(Put(1, "fodor", "mirko"));
+ // Create an iterator holding the current super version.
+ ReadOptions ro;
+ ro.background_purge_on_iterator_cleanup = true;
+ Iterator* it = db_->NewIterator(ro, handles_[1]);
+ ASSERT_OK(it->status());
+ // A flush will make `it` hold the last reference of its super version.
+ ASSERT_OK(Flush(1));
+
+ ASSERT_OK(Put(1, "fodor", "mirko"));
+ ASSERT_OK(Put(0, "fodor", "mirko"));
+ ASSERT_OK(Put(1, "fodor", "mirko"));
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ {"ColumnFamilyTest::IteratorCloseWALFile2:0",
+ "DBImpl::BGWorkPurge:start"},
+ {"ColumnFamilyTest::IteratorCloseWALFile2:2",
+ "DBImpl::BackgroundCallFlush:start"},
+ {"DBImpl::BGWorkPurge:end", "ColumnFamilyTest::IteratorCloseWALFile2:1"},
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ WriteOptions wo;
+ wo.sync = true;
+ ASSERT_OK(db_->Put(wo, handles_[1], "fodor", "mirko"));
+
+ ASSERT_EQ(2, env.num_open_wal_file_.load());
+ // Deleting the iterator releases its super version. With background purge
+ // enabled, the obsolete WAL file is not closed until the purge job runs.
+ delete it;
+ ASSERT_EQ(2, env.num_open_wal_file_.load());
+
+ TEST_SYNC_POINT("ColumnFamilyTest::IteratorCloseWALFile2:0");
+ TEST_SYNC_POINT("ColumnFamilyTest::IteratorCloseWALFile2:1");
+ ASSERT_EQ(1, env.num_open_wal_file_.load());
+ TEST_SYNC_POINT("ColumnFamilyTest::IteratorCloseWALFile2:2");
+ WaitForFlush(1);
+ ASSERT_EQ(1, env.num_open_wal_file_.load());
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+ Reopen();
+ ASSERT_EQ("mirko", Get(0, "fodor"));
+ ASSERT_EQ("mirko", Get(1, "fodor"));
+ db_options_.env = env_;
+ Close();
+}
+#endif // !ROCKSDB_LITE
+
+#ifndef ROCKSDB_LITE // TEST functions are not supported in lite
+TEST_P(ColumnFamilyTest, ForwardIteratorCloseWALFile) {
+ SpecialEnv env(Env::Default());
+ // Allow both flush and purge jobs to be scheduled.
+ env.SetBackgroundThreads(2, Env::HIGH);
+ db_options_.env = &env;
+ db_options_.max_background_flushes = 1;
+ column_family_options_.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(3));
+ column_family_options_.level0_file_num_compaction_trigger = 2;
+ Open();
+ CreateColumnFamilies({"one"});
+ ASSERT_OK(Put(1, "fodor", "mirko"));
+ ASSERT_OK(Put(1, "fodar2", "mirko"));
+ ASSERT_OK(Flush(1));
+
+ // Create an iterator holding the current super version, as well as
+ // the SST file just flushed.
+ ReadOptions ro;
+ ro.tailing = true;
+ ro.background_purge_on_iterator_cleanup = true;
+ Iterator* it = db_->NewIterator(ro, handles_[1]);
+ // A flush will make `it` hold the last reference of its super version.
+
+ ASSERT_OK(Put(1, "fodor", "mirko"));
+ ASSERT_OK(Put(1, "fodar2", "mirko"));
+ ASSERT_OK(Flush(1));
+
+ WaitForCompaction();
+
+ ASSERT_OK(Put(1, "fodor", "mirko"));
+ ASSERT_OK(Put(1, "fodor", "mirko"));
+ ASSERT_OK(Put(0, "fodor", "mirko"));
+ ASSERT_OK(Put(1, "fodor", "mirko"));
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ {"ColumnFamilyTest::IteratorCloseWALFile2:0",
+ "DBImpl::BGWorkPurge:start"},
+ {"ColumnFamilyTest::IteratorCloseWALFile2:2",
+ "DBImpl::BackgroundCallFlush:start"},
+ {"DBImpl::BGWorkPurge:end", "ColumnFamilyTest::IteratorCloseWALFile2:1"},
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ WriteOptions wo;
+ wo.sync = true;
+ ASSERT_OK(db_->Put(wo, handles_[1], "fodor", "mirko"));
+
+ env.delete_count_.store(0);
+ ASSERT_EQ(2, env.num_open_wal_file_.load());
+ // Seeking the tailing iterator switches it to the latest super version and
+ // releases the old one. With background purge enabled, the obsolete files
+ // are not closed or deleted until the purge job runs.
+ it->Seek("");
+ ASSERT_OK(it->status());
+
+ ASSERT_EQ(2, env.num_open_wal_file_.load());
+ ASSERT_EQ(0, env.delete_count_.load());
+
+ TEST_SYNC_POINT("ColumnFamilyTest::IteratorCloseWALFile2:0");
+ TEST_SYNC_POINT("ColumnFamilyTest::IteratorCloseWALFile2:1");
+ ASSERT_EQ(1, env.num_open_wal_file_.load());
+ ASSERT_EQ(1, env.delete_count_.load());
+ TEST_SYNC_POINT("ColumnFamilyTest::IteratorCloseWALFile2:2");
+ WaitForFlush(1);
+ ASSERT_EQ(1, env.num_open_wal_file_.load());
+ ASSERT_EQ(1, env.delete_count_.load());
+
+ delete it;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+ Reopen();
+ ASSERT_EQ("mirko", Get(0, "fodor"));
+ ASSERT_EQ("mirko", Get(1, "fodor"));
+ db_options_.env = env_;
+ Close();
+}
+#endif // !ROCKSDB_LITE
+
+// Disable on Windows because SyncWAL requires env->IsSyncThreadSafe()
+// to return true, which is not the case in unbuffered mode.
+#ifndef OS_WIN
+TEST_P(ColumnFamilyTest, LogSyncConflictFlush) {
+ Open();
+ CreateColumnFamiliesAndReopen({"one", "two"});
+
+ ASSERT_OK(Put(0, "", ""));
+ ASSERT_OK(Put(1, "foo", "bar"));
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::SyncWAL:BeforeMarkLogsSynced:1",
+ "ColumnFamilyTest::LogSyncConflictFlush:1"},
+ {"ColumnFamilyTest::LogSyncConflictFlush:2",
+ "DBImpl::SyncWAL:BeforeMarkLogsSynced:2"}});
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ ROCKSDB_NAMESPACE::port::Thread thread([&] { ASSERT_OK(db_->SyncWAL()); });
+
+ TEST_SYNC_POINT("ColumnFamilyTest::LogSyncConflictFlush:1");
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(Put(1, "foo", "bar"));
+ ASSERT_OK(Flush(1));
+
+ TEST_SYNC_POINT("ColumnFamilyTest::LogSyncConflictFlush:2");
+
+ thread.join();
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ Close();
+}
+#endif
+
+// This test is placed here because the Column Family test infrastructure is
+// used to force a roll of the WAL files.
+// The basic idea is to verify that WAL truncation is detected and not
+// ignored.
+TEST_P(ColumnFamilyTest, DISABLED_LogTruncationTest) {
+ Open();
+ CreateColumnFamiliesAndReopen({"one", "two"});
+
+ Build(0, 100);
+
+ // Flush the 0th column family to force a roll of the WAL
+ ASSERT_OK(Flush(0));
+
+ // Add some more entries
+ Build(100, 100);
+
+ std::vector<std::string> filenames;
+ ASSERT_OK(env_->GetChildren(dbname_, &filenames));
+
+ // collect wal files
+ std::vector<std::string> logfs;
+ for (size_t i = 0; i < filenames.size(); i++) {
+ uint64_t number;
+ FileType type;
+ if (!(ParseFileName(filenames[i], &number, &type))) continue;
+
+ if (type != kWalFile) continue;
+
+ logfs.push_back(filenames[i]);
+ }
+
+ std::sort(logfs.begin(), logfs.end());
+ ASSERT_GE(logfs.size(), 2);
+
+ // Take the last but one file, and truncate it
+ std::string fpath = dbname_ + "/" + logfs[logfs.size() - 2];
+ std::vector<std::string> names_save = names_;
+
+ uint64_t fsize;
+ ASSERT_OK(env_->GetFileSize(fpath, &fsize));
+ ASSERT_GT(fsize, 0);
+
+ Close();
+
+ std::string backup_logs = dbname_ + "/backup_logs";
+ std::string t_fpath = backup_logs + "/" + logfs[logfs.size() - 2];
+
+ ASSERT_OK(env_->CreateDirIfMissing(backup_logs));
+ // Not sure how easy it is to make this data-driven; we would need to read
+ // back the WAL file and truncate the last 10 entries.
+ CopyFile(fpath, t_fpath, fsize - 9180);
+
+ ASSERT_OK(env_->DeleteFile(fpath));
+ ASSERT_OK(env_->RenameFile(t_fpath, fpath));
+
+ db_options_.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;
+
+ OpenReadOnly(names_save);
+
+ CheckMissed();
+
+ Close();
+
+ Open(names_save);
+
+ CheckMissed();
+
+ Close();
+
+ // cleanup
+ ASSERT_OK(env_->DeleteDir(backup_logs));
+}
+
+TEST_P(ColumnFamilyTest, DefaultCfPathsTest) {
+ Open();
+ // Leave cf_paths empty for one of the column families.
+ // Files for that column family should be generated
+ // according to db_paths.
+ ColumnFamilyOptions cf_opt1, cf_opt2;
+ cf_opt1.cf_paths.emplace_back(dbname_ + "_one_1",
+ std::numeric_limits<uint64_t>::max());
+ CreateColumnFamilies({"one", "two"}, {cf_opt1, cf_opt2});
+ Reopen({ColumnFamilyOptions(), cf_opt1, cf_opt2});
+
+ // Fill Column family 1.
+ PutRandomData(1, 100, 100);
+ ASSERT_OK(Flush(1));
+
+ ASSERT_EQ(1, GetSstFileCount(cf_opt1.cf_paths[0].path));
+ ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+ // Fill column family 2
+ PutRandomData(2, 100, 100);
+ ASSERT_OK(Flush(2));
+
+ // The SST file from column family 2 should be generated under
+ // db_paths, which is dbname_ in this case.
+ ASSERT_EQ(1, GetSstFileCount(dbname_));
+}
+
+TEST_P(ColumnFamilyTest, MultipleCFPathsTest) {
+ Open();
+ // Configure Column family specific paths.
+ ColumnFamilyOptions cf_opt1, cf_opt2;
+ cf_opt1.cf_paths.emplace_back(dbname_ + "_one_1",
+ std::numeric_limits<uint64_t>::max());
+ cf_opt2.cf_paths.emplace_back(dbname_ + "_two_1",
+ std::numeric_limits<uint64_t>::max());
+ CreateColumnFamilies({"one", "two"}, {cf_opt1, cf_opt2});
+ Reopen({ColumnFamilyOptions(), cf_opt1, cf_opt2});
+
+ PutRandomData(1, 100, 100, true /* save */);
+ ASSERT_OK(Flush(1));
+
+ // Check that files are generated in appropriate paths.
+ ASSERT_EQ(1, GetSstFileCount(cf_opt1.cf_paths[0].path));
+ ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+ PutRandomData(2, 100, 100, true /* save */);
+ ASSERT_OK(Flush(2));
+
+ ASSERT_EQ(1, GetSstFileCount(cf_opt2.cf_paths[0].path));
+ ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+ // Re-open and verify the keys.
+ Reopen({ColumnFamilyOptions(), cf_opt1, cf_opt2});
+ DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
+ for (int cf = 1; cf != 3; ++cf) {
+ ReadOptions read_options;
+ read_options.readahead_size = 0;
+ auto it = dbi->NewIterator(read_options, handles_[cf]);
+ for (it->SeekToFirst(); it->Valid(); it->Next()) {
+ ASSERT_OK(it->status());
+ Slice key(it->key());
+ ASSERT_NE(keys_[cf].end(), keys_[cf].find(key.ToString()));
+ }
+ ASSERT_OK(it->status());
+ delete it;
+
+ for (const auto& key : keys_[cf]) {
+ ASSERT_NE("NOT_FOUND", Get(cf, key));
+ }
+ }
+}
+
+TEST(ColumnFamilyTest, ValidateBlobGCCutoff) {
+ DBOptions db_options;
+
+ ColumnFamilyOptions cf_options;
+ cf_options.enable_blob_garbage_collection = true;
+
+ cf_options.blob_garbage_collection_age_cutoff = -0.5;
+ ASSERT_TRUE(ColumnFamilyData::ValidateOptions(db_options, cf_options)
+ .IsInvalidArgument());
+
+ cf_options.blob_garbage_collection_age_cutoff = 0.0;
+ ASSERT_OK(ColumnFamilyData::ValidateOptions(db_options, cf_options));
+
+ cf_options.blob_garbage_collection_age_cutoff = 0.5;
+ ASSERT_OK(ColumnFamilyData::ValidateOptions(db_options, cf_options));
+
+ cf_options.blob_garbage_collection_age_cutoff = 1.0;
+ ASSERT_OK(ColumnFamilyData::ValidateOptions(db_options, cf_options));
+
+ cf_options.blob_garbage_collection_age_cutoff = 1.5;
+ ASSERT_TRUE(ColumnFamilyData::ValidateOptions(db_options, cf_options)
+ .IsInvalidArgument());
+}
+
+TEST(ColumnFamilyTest, ValidateBlobGCForceThreshold) {
+ DBOptions db_options;
+
+ ColumnFamilyOptions cf_options;
+ cf_options.enable_blob_garbage_collection = true;
+
+ cf_options.blob_garbage_collection_force_threshold = -0.5;
+ ASSERT_TRUE(ColumnFamilyData::ValidateOptions(db_options, cf_options)
+ .IsInvalidArgument());
+
+ cf_options.blob_garbage_collection_force_threshold = 0.0;
+ ASSERT_OK(ColumnFamilyData::ValidateOptions(db_options, cf_options));
+
+ cf_options.blob_garbage_collection_force_threshold = 0.5;
+ ASSERT_OK(ColumnFamilyData::ValidateOptions(db_options, cf_options));
+
+ cf_options.blob_garbage_collection_force_threshold = 1.0;
+ ASSERT_OK(ColumnFamilyData::ValidateOptions(db_options, cf_options));
+
+ cf_options.blob_garbage_collection_force_threshold = 1.5;
+ ASSERT_TRUE(ColumnFamilyData::ValidateOptions(db_options, cf_options)
+ .IsInvalidArgument());
+}
+
+TEST(ColumnFamilyTest, ValidateMemtableKVChecksumOption) {
+ DBOptions db_options;
+
+ ColumnFamilyOptions cf_options;
+ ASSERT_OK(ColumnFamilyData::ValidateOptions(db_options, cf_options));
+
+ cf_options.memtable_protection_bytes_per_key = 5;
+ ASSERT_TRUE(ColumnFamilyData::ValidateOptions(db_options, cf_options)
+ .IsNotSupported());
+
+ cf_options.memtable_protection_bytes_per_key = 1;
+ ASSERT_OK(ColumnFamilyData::ValidateOptions(db_options, cf_options));
+
+ cf_options.memtable_protection_bytes_per_key = 16;
+ ASSERT_TRUE(ColumnFamilyData::ValidateOptions(db_options, cf_options)
+ .IsNotSupported());
+
+ cf_options.memtable_protection_bytes_per_key = 0;
+ ASSERT_OK(ColumnFamilyData::ValidateOptions(db_options, cf_options));
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ RegisterCustomObjects(argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/compact_files_test.cc b/src/rocksdb/db/compact_files_test.cc
new file mode 100644
index 000000000..ef38946f7
--- /dev/null
+++ b/src/rocksdb/db/compact_files_test.cc
@@ -0,0 +1,502 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include <mutex>
+#include <string>
+#include <thread>
+#include <vector>
+
+#include "db/db_impl/db_impl.h"
+#include "port/port.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "util/cast_util.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class CompactFilesTest : public testing::Test {
+ public:
+ CompactFilesTest() {
+ env_ = Env::Default();
+ db_name_ = test::PerThreadDBPath("compact_files_test");
+ }
+
+ std::string db_name_;
+ Env* env_;
+};
+
+// A class which remembers the name of each flushed file.
+class FlushedFileCollector : public EventListener {
+ public:
+ FlushedFileCollector() {}
+ ~FlushedFileCollector() override {}
+
+ void OnFlushCompleted(DB* /*db*/, const FlushJobInfo& info) override {
+ std::lock_guard<std::mutex> lock(mutex_);
+ flushed_files_.push_back(info.file_path);
+ }
+
+ std::vector<std::string> GetFlushedFiles() {
+ std::lock_guard<std::mutex> lock(mutex_);
+ std::vector<std::string> result;
+ for (auto fname : flushed_files_) {
+ result.push_back(fname);
+ }
+ return result;
+ }
+ void ClearFlushedFiles() {
+ std::lock_guard<std::mutex> lock(mutex_);
+ flushed_files_.clear();
+ }
+
+ private:
+ std::vector<std::string> flushed_files_;
+ std::mutex mutex_;
+};
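+
+// Typical use in the tests below: register a FlushedFileCollector via
+// options.listeners.emplace_back(collector) before opening the DB, then call
+// GetFlushedFiles() after a flush to obtain the L0 file names to pass to
+// DB::CompactFiles().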
+
+TEST_F(CompactFilesTest, L0ConflictsFiles) {
+ Options options;
+ // to trigger compaction more easily
+ const int kWriteBufferSize = 10000;
+ const int kLevel0Trigger = 2;
+ options.create_if_missing = true;
+ options.compaction_style = kCompactionStyleLevel;
+ // Small slowdown and stop triggers for experimental purposes.
+ options.level0_slowdown_writes_trigger = 20;
+ options.level0_stop_writes_trigger = 20;
+ options.write_buffer_size = kWriteBufferSize;
+ options.level0_file_num_compaction_trigger = kLevel0Trigger;
+ options.compression = kNoCompression;
+
+ DB* db = nullptr;
+ ASSERT_OK(DestroyDB(db_name_, options));
+ Status s = DB::Open(options, db_name_, &db);
+ assert(s.ok());
+ assert(db);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ {"CompactFilesImpl:0", "BackgroundCallCompaction:0"},
+ {"BackgroundCallCompaction:1", "CompactFilesImpl:1"},
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // Create a couple of files.
+ // Background compaction starts and waits in BackgroundCallCompaction:0
+ for (int i = 0; i < kLevel0Trigger * 4; ++i) {
+ ASSERT_OK(db->Put(WriteOptions(), std::to_string(i), ""));
+ ASSERT_OK(db->Put(WriteOptions(), std::to_string(100 - i), ""));
+ ASSERT_OK(db->Flush(FlushOptions()));
+ }
+
+ ROCKSDB_NAMESPACE::ColumnFamilyMetaData meta;
+ db->GetColumnFamilyMetaData(&meta);
+ std::string file1;
+ for (auto& file : meta.levels[0].files) {
+ ASSERT_EQ(0, meta.levels[0].level);
+ if (file1 == "") {
+ file1 = file.db_path + "/" + file.name;
+ } else {
+ std::string file2 = file.db_path + "/" + file.name;
+ // Another thread starts a compact files and creates an L0 compaction
+ // The background compaction then notices that there is an L0 compaction
+ // already in progress and doesn't do an L0 compaction
+ // Once the background compaction finishes, the compact files finishes
+ ASSERT_OK(db->CompactFiles(ROCKSDB_NAMESPACE::CompactionOptions(),
+ {file1, file2}, 0));
+ break;
+ }
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ delete db;
+}
+
+TEST_F(CompactFilesTest, MultipleLevel) {
+ Options options;
+ options.create_if_missing = true;
+ options.level_compaction_dynamic_level_bytes = true;
+ options.num_levels = 6;
+ // Add listener
+ FlushedFileCollector* collector = new FlushedFileCollector();
+ options.listeners.emplace_back(collector);
+
+ DB* db = nullptr;
+ ASSERT_OK(DestroyDB(db_name_, options));
+ Status s = DB::Open(options, db_name_, &db);
+ ASSERT_OK(s);
+ ASSERT_NE(db, nullptr);
+
+ // Create files in L0, L3, L4, and L5.
+ for (int i = 5; i > 2; --i) {
+ collector->ClearFlushedFiles();
+ ASSERT_OK(db->Put(WriteOptions(), std::to_string(i), ""));
+ ASSERT_OK(db->Flush(FlushOptions()));
+ // Ensure background work is fully finished including listener callbacks
+ // before accessing listener state.
+ ASSERT_OK(static_cast_with_check<DBImpl>(db)->TEST_WaitForBackgroundWork());
+ auto l0_files = collector->GetFlushedFiles();
+ ASSERT_OK(db->CompactFiles(CompactionOptions(), l0_files, i));
+
+ std::string prop;
+ ASSERT_TRUE(db->GetProperty(
+ "rocksdb.num-files-at-level" + std::to_string(i), &prop));
+ ASSERT_EQ("1", prop);
+ }
+ ASSERT_OK(db->Put(WriteOptions(), std::to_string(0), ""));
+ ASSERT_OK(db->Flush(FlushOptions()));
+
+ ColumnFamilyMetaData meta;
+ db->GetColumnFamilyMetaData(&meta);
+ // Compact files except the file in L3
+ std::vector<std::string> files;
+ for (int i = 0; i < 6; ++i) {
+ if (i == 3) continue;
+ for (auto& file : meta.levels[i].files) {
+ files.push_back(file.db_path + "/" + file.name);
+ }
+ }
+
+ SyncPoint::GetInstance()->LoadDependency({
+ {"CompactionJob::Run():Start", "CompactFilesTest.MultipleLevel:0"},
+ {"CompactFilesTest.MultipleLevel:1", "CompactFilesImpl:3"},
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ std::thread thread([&] {
+ TEST_SYNC_POINT("CompactFilesTest.MultipleLevel:0");
+ ASSERT_OK(db->Put(WriteOptions(), "bar", "v2"));
+ ASSERT_OK(db->Put(WriteOptions(), "foo", "v2"));
+ ASSERT_OK(db->Flush(FlushOptions()));
+ TEST_SYNC_POINT("CompactFilesTest.MultipleLevel:1");
+ });
+
+ // Compaction cannot move data up to a lower-numbered level: here we have an
+ // input file from level 5, so the output level has to be >= 5.
+ for (int invalid_output_level = 0; invalid_output_level < 5;
+ invalid_output_level++) {
+ s = db->CompactFiles(CompactionOptions(), files, invalid_output_level);
+ std::cout << s.ToString() << std::endl;
+ ASSERT_TRUE(s.IsInvalidArgument());
+ }
+
+ ASSERT_OK(db->CompactFiles(CompactionOptions(), files, 5));
+ SyncPoint::GetInstance()->DisableProcessing();
+ thread.join();
+
+ delete db;
+}
+
+TEST_F(CompactFilesTest, ObsoleteFiles) {
+ Options options;
+ // to trigger compaction more easily
+ const int kWriteBufferSize = 65536;
+ options.create_if_missing = true;
+ // Disable RocksDB background compaction.
+ options.compaction_style = kCompactionStyleNone;
+ options.level0_slowdown_writes_trigger = (1 << 30);
+ options.level0_stop_writes_trigger = (1 << 30);
+ options.write_buffer_size = kWriteBufferSize;
+ options.max_write_buffer_number = 2;
+ options.compression = kNoCompression;
+
+ // Add listener
+ FlushedFileCollector* collector = new FlushedFileCollector();
+ options.listeners.emplace_back(collector);
+
+ DB* db = nullptr;
+ ASSERT_OK(DestroyDB(db_name_, options));
+ Status s = DB::Open(options, db_name_, &db);
+ ASSERT_OK(s);
+ ASSERT_NE(db, nullptr);
+
+ // Create a couple of files.
+ for (int i = 1000; i < 2000; ++i) {
+ ASSERT_OK(db->Put(WriteOptions(), std::to_string(i),
+ std::string(kWriteBufferSize / 10, 'a' + (i % 26))));
+ }
+
+ auto l0_files = collector->GetFlushedFiles();
+ ASSERT_OK(db->CompactFiles(CompactionOptions(), l0_files, 1));
+ ASSERT_OK(static_cast_with_check<DBImpl>(db)->TEST_WaitForCompact());
+
+ // verify all compaction input files are deleted
+ for (auto fname : l0_files) {
+ ASSERT_EQ(Status::NotFound(), env_->FileExists(fname));
+ }
+ delete db;
+}
+
+TEST_F(CompactFilesTest, NotCutOutputOnLevel0) {
+ Options options;
+ options.create_if_missing = true;
+ // Disable RocksDB background compaction.
+ options.compaction_style = kCompactionStyleNone;
+ options.level0_slowdown_writes_trigger = 1000;
+ options.level0_stop_writes_trigger = 1000;
+ options.write_buffer_size = 65536;
+ options.max_write_buffer_number = 2;
+ options.compression = kNoCompression;
+ options.max_compaction_bytes = 5000;
+
+ // Add listener
+ FlushedFileCollector* collector = new FlushedFileCollector();
+ options.listeners.emplace_back(collector);
+
+ DB* db = nullptr;
+ ASSERT_OK(DestroyDB(db_name_, options));
+ Status s = DB::Open(options, db_name_, &db);
+ assert(s.ok());
+ assert(db);
+
+ // Create a couple of files.
+ for (int i = 0; i < 500; ++i) {
+ ASSERT_OK(db->Put(WriteOptions(), std::to_string(i),
+ std::string(1000, 'a' + (i % 26))));
+ }
+ ASSERT_OK(static_cast_with_check<DBImpl>(db)->TEST_WaitForFlushMemTable());
+ auto l0_files_1 = collector->GetFlushedFiles();
+ collector->ClearFlushedFiles();
+ for (int i = 0; i < 500; ++i) {
+ ASSERT_OK(db->Put(WriteOptions(), std::to_string(i),
+ std::string(1000, 'a' + (i % 26))));
+ }
+ ASSERT_OK(static_cast_with_check<DBImpl>(db)->TEST_WaitForFlushMemTable());
+ auto l0_files_2 = collector->GetFlushedFiles();
+ ASSERT_OK(db->CompactFiles(CompactionOptions(), l0_files_1, 0));
+ ASSERT_OK(db->CompactFiles(CompactionOptions(), l0_files_2, 0));
+ // no assertion failure
+ delete db;
+}
+
+TEST_F(CompactFilesTest, CapturingPendingFiles) {
+ Options options;
+ options.create_if_missing = true;
+ // Disable RocksDB background compaction.
+ options.compaction_style = kCompactionStyleNone;
+ // Always do full scans for obsolete files (needed to reproduce the issue).
+ options.delete_obsolete_files_period_micros = 0;
+
+ // Add listener.
+ FlushedFileCollector* collector = new FlushedFileCollector();
+ options.listeners.emplace_back(collector);
+
+ DB* db = nullptr;
+ ASSERT_OK(DestroyDB(db_name_, options));
+ Status s = DB::Open(options, db_name_, &db);
+ ASSERT_OK(s);
+ assert(db);
+
+ // Create 5 files.
+ for (int i = 0; i < 5; ++i) {
+ ASSERT_OK(db->Put(WriteOptions(), "key" + std::to_string(i), "value"));
+ ASSERT_OK(db->Flush(FlushOptions()));
+ }
+
+ // Ensure background work is fully finished including listener callbacks
+ // before accessing listener state.
+ ASSERT_OK(static_cast_with_check<DBImpl>(db)->TEST_WaitForBackgroundWork());
+ auto l0_files = collector->GetFlushedFiles();
+ EXPECT_EQ(5, l0_files.size());
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ {"CompactFilesImpl:2", "CompactFilesTest.CapturingPendingFiles:0"},
+ {"CompactFilesTest.CapturingPendingFiles:1", "CompactFilesImpl:3"},
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // Start compacting files.
+ ROCKSDB_NAMESPACE::port::Thread compaction_thread(
+ [&] { EXPECT_OK(db->CompactFiles(CompactionOptions(), l0_files, 1)); });
+
+ // In the meantime flush another file.
+ TEST_SYNC_POINT("CompactFilesTest.CapturingPendingFiles:0");
+ ASSERT_OK(db->Put(WriteOptions(), "key5", "value"));
+ ASSERT_OK(db->Flush(FlushOptions()));
+ TEST_SYNC_POINT("CompactFilesTest.CapturingPendingFiles:1");
+
+ compaction_thread.join();
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+ delete db;
+
+ // Make sure we can reopen the DB.
+ s = DB::Open(options, db_name_, &db);
+ ASSERT_OK(s);
+ assert(db);
+ delete db;
+}
+
+TEST_F(CompactFilesTest, CompactionFilterWithGetSv) {
+ class FilterWithGet : public CompactionFilter {
+ public:
+ bool Filter(int /*level*/, const Slice& /*key*/, const Slice& /*value*/,
+ std::string* /*new_value*/,
+ bool* /*value_changed*/) const override {
+ if (db_ == nullptr) {
+ return true;
+ }
+ std::string res;
+ db_->Get(ReadOptions(), "", &res);
+ return true;
+ }
+
+ void SetDB(DB* db) { db_ = db; }
+
+ const char* Name() const override { return "FilterWithGet"; }
+
+ private:
+ DB* db_;
+ };
+
+ std::shared_ptr<FilterWithGet> cf(new FilterWithGet());
+
+ Options options;
+ options.create_if_missing = true;
+ options.compaction_filter = cf.get();
+
+ DB* db = nullptr;
+ ASSERT_OK(DestroyDB(db_name_, options));
+ Status s = DB::Open(options, db_name_, &db);
+ ASSERT_OK(s);
+
+ cf->SetDB(db);
+
+ // Write one L0 file
+ ASSERT_OK(db->Put(WriteOptions(), "K1", "V1"));
+ ASSERT_OK(db->Flush(FlushOptions()));
+
+ // Compact all L0 files using CompactFiles
+ ROCKSDB_NAMESPACE::ColumnFamilyMetaData meta;
+ db->GetColumnFamilyMetaData(&meta);
+ for (auto& file : meta.levels[0].files) {
+ std::string fname = file.db_path + "/" + file.name;
+ ASSERT_OK(
+ db->CompactFiles(ROCKSDB_NAMESPACE::CompactionOptions(), {fname}, 0));
+ }
+
+ delete db;
+}
+
+TEST_F(CompactFilesTest, SentinelCompressionType) {
+ if (!Zlib_Supported()) {
+ fprintf(stderr, "zlib compression not supported, skip this test\n");
+ return;
+ }
+ if (!Snappy_Supported()) {
+ fprintf(stderr, "snappy compression not supported, skip this test\n");
+ return;
+ }
+ // Check that passing `CompressionType::kDisableCompressionOption` to
+ // `CompactFiles` causes it to use the column family compression options.
+ for (auto compaction_style : {CompactionStyle::kCompactionStyleLevel,
+ CompactionStyle::kCompactionStyleUniversal,
+ CompactionStyle::kCompactionStyleNone}) {
+ ASSERT_OK(DestroyDB(db_name_, Options()));
+ Options options;
+ options.compaction_style = compaction_style;
+ // L0: Snappy, L1: Zlib, L2: Snappy
+ options.compression_per_level = {CompressionType::kSnappyCompression,
+ CompressionType::kZlibCompression,
+ CompressionType::kSnappyCompression};
+ options.create_if_missing = true;
+ FlushedFileCollector* collector = new FlushedFileCollector();
+ options.listeners.emplace_back(collector);
+ DB* db = nullptr;
+ ASSERT_OK(DB::Open(options, db_name_, &db));
+
+ ASSERT_OK(db->Put(WriteOptions(), "key", "val"));
+ ASSERT_OK(db->Flush(FlushOptions()));
+
+ // Ensure background work is fully finished including listener callbacks
+ // before accessing listener state.
+ ASSERT_OK(static_cast_with_check<DBImpl>(db)->TEST_WaitForBackgroundWork());
+ auto l0_files = collector->GetFlushedFiles();
+ ASSERT_EQ(1, l0_files.size());
+
+ // L0->L1 compaction, so output should be Zlib-compressed
+ CompactionOptions compaction_opts;
+ compaction_opts.compression = CompressionType::kDisableCompressionOption;
+ ASSERT_OK(db->CompactFiles(compaction_opts, l0_files, 1));
+
+ ROCKSDB_NAMESPACE::TablePropertiesCollection all_tables_props;
+ ASSERT_OK(db->GetPropertiesOfAllTables(&all_tables_props));
+ for (const auto& name_and_table_props : all_tables_props) {
+ ASSERT_EQ(CompressionTypeToString(CompressionType::kZlibCompression),
+ name_and_table_props.second->compression_name);
+ }
+ delete db;
+ }
+}
+
+TEST_F(CompactFilesTest, GetCompactionJobInfo) {
+ Options options;
+ options.create_if_missing = true;
+ // Disable RocksDB background compaction.
+ options.compaction_style = kCompactionStyleNone;
+ options.level0_slowdown_writes_trigger = 1000;
+ options.level0_stop_writes_trigger = 1000;
+ options.write_buffer_size = 65536;
+ options.max_write_buffer_number = 2;
+ options.compression = kNoCompression;
+ options.max_compaction_bytes = 5000;
+
+ // Add listener
+ FlushedFileCollector* collector = new FlushedFileCollector();
+ options.listeners.emplace_back(collector);
+
+ DB* db = nullptr;
+ ASSERT_OK(DestroyDB(db_name_, options));
+ Status s = DB::Open(options, db_name_, &db);
+ ASSERT_OK(s);
+ assert(db);
+
+ // Create a couple of files.
+ for (int i = 0; i < 500; ++i) {
+ ASSERT_OK(db->Put(WriteOptions(), std::to_string(i),
+ std::string(1000, 'a' + (i % 26))));
+ }
+ ASSERT_OK(static_cast_with_check<DBImpl>(db)->TEST_WaitForFlushMemTable());
+ auto l0_files_1 = collector->GetFlushedFiles();
+ CompactionOptions co;
+ co.compression = CompressionType::kLZ4Compression;
+ CompactionJobInfo compaction_job_info{};
+ ASSERT_OK(
+ db->CompactFiles(co, l0_files_1, 0, -1, nullptr, &compaction_job_info));
+ ASSERT_EQ(compaction_job_info.base_input_level, 0);
+ ASSERT_EQ(compaction_job_info.cf_id, db->DefaultColumnFamily()->GetID());
+ ASSERT_EQ(compaction_job_info.cf_name, db->DefaultColumnFamily()->GetName());
+ ASSERT_EQ(compaction_job_info.compaction_reason,
+ CompactionReason::kManualCompaction);
+ ASSERT_EQ(compaction_job_info.compression, CompressionType::kLZ4Compression);
+ ASSERT_EQ(compaction_job_info.output_level, 0);
+ ASSERT_OK(compaction_job_info.status);
+ // no assertion failure
+ delete db;
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+ fprintf(stderr,
+ "SKIPPED as DBImpl::CompactFiles is not supported in ROCKSDB_LITE\n");
+ return 0;
+}
+
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/compaction/clipping_iterator.h b/src/rocksdb/db/compaction/clipping_iterator.h
new file mode 100644
index 000000000..1ed465c2c
--- /dev/null
+++ b/src/rocksdb/db/compaction/clipping_iterator.h
@@ -0,0 +1,276 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <cassert>
+
+#include "rocksdb/comparator.h"
+#include "table/internal_iterator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// An internal iterator that wraps another one and ensures that any keys
+// returned are strictly within a range [start, end). If the underlying
+// iterator has already performed the bounds checking, it relies on that result;
+// otherwise, it performs the necessary key comparisons itself. Both bounds
+// are optional.
+class ClippingIterator : public InternalIterator {
+ public:
+ ClippingIterator(InternalIterator* iter, const Slice* start, const Slice* end,
+ const CompareInterface* cmp)
+ : iter_(iter), start_(start), end_(end), cmp_(cmp), valid_(false) {
+ assert(iter_);
+ assert(cmp_);
+ assert(!start_ || !end_ || cmp_->Compare(*start_, *end_) <= 0);
+
+ UpdateAndEnforceBounds();
+ }
+
+ bool Valid() const override { return valid_; }
+
+ void SeekToFirst() override {
+ if (start_) {
+ iter_->Seek(*start_);
+ } else {
+ iter_->SeekToFirst();
+ }
+
+ UpdateAndEnforceUpperBound();
+ }
+
+ void SeekToLast() override {
+ if (end_) {
+ iter_->SeekForPrev(*end_);
+
+ // Upper bound is exclusive, so we need a key which is strictly smaller
+ if (iter_->Valid() && cmp_->Compare(iter_->key(), *end_) == 0) {
+ iter_->Prev();
+ }
+ } else {
+ iter_->SeekToLast();
+ }
+
+ UpdateAndEnforceLowerBound();
+ }
+
+ void Seek(const Slice& target) override {
+ if (start_ && cmp_->Compare(target, *start_) < 0) {
+ iter_->Seek(*start_);
+ UpdateAndEnforceUpperBound();
+ return;
+ }
+
+ if (end_ && cmp_->Compare(target, *end_) >= 0) {
+ valid_ = false;
+ return;
+ }
+
+ iter_->Seek(target);
+ UpdateAndEnforceUpperBound();
+ }
+
+ void SeekForPrev(const Slice& target) override {
+ if (start_ && cmp_->Compare(target, *start_) < 0) {
+ valid_ = false;
+ return;
+ }
+
+ if (end_ && cmp_->Compare(target, *end_) >= 0) {
+ iter_->SeekForPrev(*end_);
+
+ // Upper bound is exclusive, so we need a key which is strictly smaller
+ if (iter_->Valid() && cmp_->Compare(iter_->key(), *end_) == 0) {
+ iter_->Prev();
+ }
+
+ UpdateAndEnforceLowerBound();
+ return;
+ }
+
+ iter_->SeekForPrev(target);
+ UpdateAndEnforceLowerBound();
+ }
+
+ void Next() override {
+ assert(valid_);
+ iter_->Next();
+ UpdateAndEnforceUpperBound();
+ }
+
+ bool NextAndGetResult(IterateResult* result) override {
+ assert(valid_);
+ assert(result);
+
+ IterateResult res;
+ valid_ = iter_->NextAndGetResult(&res);
+
+ if (!valid_) {
+ return false;
+ }
+
+ if (end_) {
+ EnforceUpperBoundImpl(res.bound_check_result);
+
+ if (!valid_) {
+ return false;
+ }
+ }
+
+ res.bound_check_result = IterBoundCheck::kInbound;
+ *result = res;
+
+ return true;
+ }
+
+ void Prev() override {
+ assert(valid_);
+ iter_->Prev();
+ UpdateAndEnforceLowerBound();
+ }
+
+ Slice key() const override {
+ assert(valid_);
+ return iter_->key();
+ }
+
+ Slice user_key() const override {
+ assert(valid_);
+ return iter_->user_key();
+ }
+
+ Slice value() const override {
+ assert(valid_);
+ return iter_->value();
+ }
+
+ Status status() const override { return iter_->status(); }
+
+ bool PrepareValue() override {
+ assert(valid_);
+
+ if (iter_->PrepareValue()) {
+ return true;
+ }
+
+ assert(!iter_->Valid());
+ valid_ = false;
+ return false;
+ }
+
+ bool MayBeOutOfLowerBound() override {
+ assert(valid_);
+ return false;
+ }
+
+ IterBoundCheck UpperBoundCheckResult() override {
+ assert(valid_);
+ return IterBoundCheck::kInbound;
+ }
+
+ void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override {
+ iter_->SetPinnedItersMgr(pinned_iters_mgr);
+ }
+
+ bool IsKeyPinned() const override {
+ assert(valid_);
+ return iter_->IsKeyPinned();
+ }
+
+ bool IsValuePinned() const override {
+ assert(valid_);
+ return iter_->IsValuePinned();
+ }
+
+ Status GetProperty(std::string prop_name, std::string* prop) override {
+ return iter_->GetProperty(prop_name, prop);
+ }
+
+ private:
+ void UpdateValid() {
+ assert(!iter_->Valid() || iter_->status().ok());
+
+ valid_ = iter_->Valid();
+ }
+
+ void EnforceUpperBoundImpl(IterBoundCheck bound_check_result) {
+ if (bound_check_result == IterBoundCheck::kInbound) {
+ return;
+ }
+
+ if (bound_check_result == IterBoundCheck::kOutOfBound) {
+ valid_ = false;
+ return;
+ }
+
+ assert(bound_check_result == IterBoundCheck::kUnknown);
+
+ if (cmp_->Compare(key(), *end_) >= 0) {
+ valid_ = false;
+ }
+ }
+
+ void EnforceUpperBound() {
+ if (!valid_) {
+ return;
+ }
+
+ if (!end_) {
+ return;
+ }
+
+ EnforceUpperBoundImpl(iter_->UpperBoundCheckResult());
+ }
+
+ void EnforceLowerBound() {
+ if (!valid_) {
+ return;
+ }
+
+ if (!start_) {
+ return;
+ }
+
+ if (!iter_->MayBeOutOfLowerBound()) {
+ return;
+ }
+
+ if (cmp_->Compare(key(), *start_) < 0) {
+ valid_ = false;
+ }
+ }
+
+ void AssertBounds() {
+ assert(!valid_ || !start_ || cmp_->Compare(key(), *start_) >= 0);
+ assert(!valid_ || !end_ || cmp_->Compare(key(), *end_) < 0);
+ }
+
+ void UpdateAndEnforceBounds() {
+ UpdateValid();
+ EnforceUpperBound();
+ EnforceLowerBound();
+ AssertBounds();
+ }
+
+ void UpdateAndEnforceUpperBound() {
+ UpdateValid();
+ EnforceUpperBound();
+ AssertBounds();
+ }
+
+ void UpdateAndEnforceLowerBound() {
+ UpdateValid();
+ EnforceLowerBound();
+ AssertBounds();
+ }
+
+ InternalIterator* iter_;
+ const Slice* start_;
+ const Slice* end_;
+ const CompareInterface* cmp_;
+ bool valid_;
+};
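+
+// A minimal usage sketch (illustrative only; how `input` is obtained is
+// elided and assumed to be some existing internal iterator): the wrapper
+// guarantees the caller only sees keys in [start, end).
+//
+//   InternalIterator* input = ...;  // e.g. a compaction input iterator
+//   Slice start("key1");
+//   Slice end("key4");
+//   ClippingIterator clip(input, &start, &end, BytewiseComparator());
+//   for (clip.SeekToFirst(); clip.Valid(); clip.Next()) {
+//     // every clip.key() satisfies start <= key < end
+//   }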
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/clipping_iterator_test.cc b/src/rocksdb/db/compaction/clipping_iterator_test.cc
new file mode 100644
index 000000000..b2b167048
--- /dev/null
+++ b/src/rocksdb/db/compaction/clipping_iterator_test.cc
@@ -0,0 +1,259 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/compaction/clipping_iterator.h"
+
+#include <algorithm>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "db/dbformat.h"
+#include "rocksdb/comparator.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/vector_iterator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// A vector iterator which does its own bounds checking. This is for testing the
+// optimizations in the clipping iterator where we bypass the bounds checking if
+// the input iterator has already performed it.
+class BoundsCheckingVectorIterator : public VectorIterator {
+ public:
+ BoundsCheckingVectorIterator(const std::vector<std::string>& keys,
+ const std::vector<std::string>& values,
+ const Slice* start, const Slice* end,
+ const Comparator* cmp)
+ : VectorIterator(keys, values, cmp), start_(start), end_(end), cmp_(cmp) {
+ assert(cmp_);
+ }
+
+ bool NextAndGetResult(IterateResult* result) override {
+ assert(Valid());
+ assert(result);
+
+ Next();
+
+ if (!Valid()) {
+ return false;
+ }
+
+ result->key = key();
+ result->bound_check_result = UpperBoundCheckResult();
+ result->value_prepared = true;
+
+ return true;
+ }
+
+ bool MayBeOutOfLowerBound() override {
+ assert(Valid());
+
+ if (!start_) {
+ return false;
+ }
+
+ return cmp_->Compare(key(), *start_) < 0;
+ }
+
+ IterBoundCheck UpperBoundCheckResult() override {
+ assert(Valid());
+
+ if (!end_) {
+ return IterBoundCheck::kInbound;
+ }
+
+ return cmp_->Compare(key(), *end_) >= 0 ? IterBoundCheck::kOutOfBound
+ : IterBoundCheck::kInbound;
+ }
+
+ private:
+ const Slice* start_;
+ const Slice* end_;
+ const Comparator* cmp_;
+};
+
+class ClippingIteratorTest
+ : public ::testing::Test,
+ public ::testing::WithParamInterface<std::tuple<bool, size_t, size_t>> {};
+
+TEST_P(ClippingIteratorTest, Clip) {
+ const std::vector<std::string> keys{"key0", "key1", "key2", "key3", "key4",
+ "key5", "key6", "key7", "key8", "key9"};
+ const std::vector<std::string> values{
+ "unused0", "value1", "value2", "value3", "unused4",
+ "unused5", "unused6", "unused7", "unused8", "unused9"};
+
+ assert(keys.size() == values.size());
+
+ // Note: the input always contains key1, key2, and key3; however, the clipping
+ // window is based on the test parameters: its left edge is a value in the
+ // range [0, 4], and its size is a value in the range [0, 5]
+ const std::vector<std::string> input_keys{keys[1], keys[2], keys[3]};
+ const std::vector<std::string> input_values{values[1], values[2], values[3]};
+
+ const bool use_bounds_checking_vec_it = std::get<0>(GetParam());
+
+ const size_t clip_start_idx = std::get<1>(GetParam());
+ const size_t clip_window_size = std::get<2>(GetParam());
+ const size_t clip_end_idx = clip_start_idx + clip_window_size;
+
+ const Slice start(keys[clip_start_idx]);
+ const Slice end(keys[clip_end_idx]);
+
+ std::unique_ptr<InternalIterator> input(
+ use_bounds_checking_vec_it
+ ? new BoundsCheckingVectorIterator(input_keys, input_values, &start,
+ &end, BytewiseComparator())
+ : new VectorIterator(input_keys, input_values, BytewiseComparator()));
+
+ ClippingIterator clip(input.get(), &start, &end, BytewiseComparator());
+
+ // The range the clipping iterator should return values from. This is
+ // essentially the intersection of the input range [1, 4) and the clipping
+ // window [clip_start_idx, clip_end_idx)
+ const size_t data_start_idx =
+ std::max(clip_start_idx, static_cast<size_t>(1));
+ const size_t data_end_idx = std::min(clip_end_idx, static_cast<size_t>(4));
+
+ // Range is empty; all Seeks should fail
+ if (data_start_idx >= data_end_idx) {
+ clip.SeekToFirst();
+ ASSERT_FALSE(clip.Valid());
+
+ clip.SeekToLast();
+ ASSERT_FALSE(clip.Valid());
+
+ for (size_t i = 0; i < keys.size(); ++i) {
+ clip.Seek(keys[i]);
+ ASSERT_FALSE(clip.Valid());
+
+ clip.SeekForPrev(keys[i]);
+ ASSERT_FALSE(clip.Valid());
+ }
+
+ return;
+ }
+
+ // Range is non-empty; call SeekToFirst and iterate forward
+ clip.SeekToFirst();
+ ASSERT_TRUE(clip.Valid());
+ ASSERT_EQ(clip.key(), keys[data_start_idx]);
+ ASSERT_EQ(clip.value(), values[data_start_idx]);
+ ASSERT_FALSE(clip.MayBeOutOfLowerBound());
+ ASSERT_EQ(clip.UpperBoundCheckResult(), IterBoundCheck::kInbound);
+
+ for (size_t i = data_start_idx + 1; i < data_end_idx; ++i) {
+ clip.Next();
+ ASSERT_TRUE(clip.Valid());
+ ASSERT_EQ(clip.key(), keys[i]);
+ ASSERT_EQ(clip.value(), values[i]);
+ ASSERT_FALSE(clip.MayBeOutOfLowerBound());
+ ASSERT_EQ(clip.UpperBoundCheckResult(), IterBoundCheck::kInbound);
+ }
+
+ clip.Next();
+ ASSERT_FALSE(clip.Valid());
+
+ // Do it again using NextAndGetResult
+ clip.SeekToFirst();
+ ASSERT_TRUE(clip.Valid());
+ ASSERT_EQ(clip.key(), keys[data_start_idx]);
+ ASSERT_EQ(clip.value(), values[data_start_idx]);
+ ASSERT_FALSE(clip.MayBeOutOfLowerBound());
+ ASSERT_EQ(clip.UpperBoundCheckResult(), IterBoundCheck::kInbound);
+
+ for (size_t i = data_start_idx + 1; i < data_end_idx; ++i) {
+ IterateResult result;
+ ASSERT_TRUE(clip.NextAndGetResult(&result));
+ ASSERT_EQ(result.key, keys[i]);
+ ASSERT_EQ(result.bound_check_result, IterBoundCheck::kInbound);
+ ASSERT_TRUE(clip.Valid());
+ ASSERT_EQ(clip.key(), keys[i]);
+ ASSERT_EQ(clip.value(), values[i]);
+ ASSERT_FALSE(clip.MayBeOutOfLowerBound());
+ ASSERT_EQ(clip.UpperBoundCheckResult(), IterBoundCheck::kInbound);
+ }
+
+ IterateResult result;
+ ASSERT_FALSE(clip.NextAndGetResult(&result));
+ ASSERT_FALSE(clip.Valid());
+
+ // Call SeekToLast and iterate backward
+ clip.SeekToLast();
+ ASSERT_TRUE(clip.Valid());
+ ASSERT_EQ(clip.key(), keys[data_end_idx - 1]);
+ ASSERT_EQ(clip.value(), values[data_end_idx - 1]);
+ ASSERT_FALSE(clip.MayBeOutOfLowerBound());
+ ASSERT_EQ(clip.UpperBoundCheckResult(), IterBoundCheck::kInbound);
+
+ for (size_t i = data_end_idx - 2; i >= data_start_idx; --i) {
+ clip.Prev();
+ ASSERT_TRUE(clip.Valid());
+ ASSERT_EQ(clip.key(), keys[i]);
+ ASSERT_EQ(clip.value(), values[i]);
+ ASSERT_FALSE(clip.MayBeOutOfLowerBound());
+ ASSERT_EQ(clip.UpperBoundCheckResult(), IterBoundCheck::kInbound);
+ }
+
+ clip.Prev();
+ ASSERT_FALSE(clip.Valid());
+
+ // Call Seek/SeekForPrev for all keys; Seek should return the smallest key
+ // which is >= the target; SeekForPrev should return the largest key which is
+ // <= the target
+ for (size_t i = 0; i < keys.size(); ++i) {
+ clip.Seek(keys[i]);
+
+ if (i < data_start_idx) {
+ ASSERT_TRUE(clip.Valid());
+ ASSERT_EQ(clip.key(), keys[data_start_idx]);
+ ASSERT_EQ(clip.value(), values[data_start_idx]);
+ ASSERT_FALSE(clip.MayBeOutOfLowerBound());
+ ASSERT_EQ(clip.UpperBoundCheckResult(), IterBoundCheck::kInbound);
+ } else if (i < data_end_idx) {
+ ASSERT_TRUE(clip.Valid());
+ ASSERT_EQ(clip.key(), keys[i]);
+ ASSERT_EQ(clip.value(), values[i]);
+ ASSERT_FALSE(clip.MayBeOutOfLowerBound());
+ ASSERT_EQ(clip.UpperBoundCheckResult(), IterBoundCheck::kInbound);
+ } else {
+ ASSERT_FALSE(clip.Valid());
+ }
+
+ clip.SeekForPrev(keys[i]);
+
+ if (i < data_start_idx) {
+ ASSERT_FALSE(clip.Valid());
+ } else if (i < data_end_idx) {
+ ASSERT_TRUE(clip.Valid());
+ ASSERT_EQ(clip.key(), keys[i]);
+ ASSERT_EQ(clip.value(), values[i]);
+ ASSERT_FALSE(clip.MayBeOutOfLowerBound());
+ ASSERT_EQ(clip.UpperBoundCheckResult(), IterBoundCheck::kInbound);
+ } else {
+ ASSERT_TRUE(clip.Valid());
+ ASSERT_EQ(clip.key(), keys[data_end_idx - 1]);
+ ASSERT_EQ(clip.value(), values[data_end_idx - 1]);
+ ASSERT_FALSE(clip.MayBeOutOfLowerBound());
+ ASSERT_EQ(clip.UpperBoundCheckResult(), IterBoundCheck::kInbound);
+ }
+ }
+}
+
+INSTANTIATE_TEST_CASE_P(
+ ClippingIteratorTest, ClippingIteratorTest,
+ ::testing::Combine(
+ ::testing::Bool(),
+ ::testing::Range(static_cast<size_t>(0), static_cast<size_t>(5)),
+ ::testing::Range(static_cast<size_t>(0), static_cast<size_t>(6))));
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/compaction/compaction.cc b/src/rocksdb/db/compaction/compaction.cc
new file mode 100644
index 000000000..a32b529f7
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction.cc
@@ -0,0 +1,855 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/compaction/compaction.h"
+
+#include <cinttypes>
+#include <vector>
+
+#include "db/column_family.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/sst_partitioner.h"
+#include "test_util/sync_point.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+const uint64_t kRangeTombstoneSentinel =
+ PackSequenceAndType(kMaxSequenceNumber, kTypeRangeDeletion);
+
+int sstableKeyCompare(const Comparator* user_cmp, const InternalKey& a,
+ const InternalKey& b) {
+ auto c = user_cmp->CompareWithoutTimestamp(a.user_key(), b.user_key());
+ if (c != 0) {
+ return c;
+ }
+ auto a_footer = ExtractInternalKeyFooter(a.Encode());
+ auto b_footer = ExtractInternalKeyFooter(b.Encode());
+ if (a_footer == kRangeTombstoneSentinel) {
+ if (b_footer != kRangeTombstoneSentinel) {
+ return -1;
+ }
+ } else if (b_footer == kRangeTombstoneSentinel) {
+ return 1;
+ }
+ return 0;
+}
+
+int sstableKeyCompare(const Comparator* user_cmp, const InternalKey* a,
+ const InternalKey& b) {
+ if (a == nullptr) {
+ return -1;
+ }
+ return sstableKeyCompare(user_cmp, *a, b);
+}
+
+int sstableKeyCompare(const Comparator* user_cmp, const InternalKey& a,
+ const InternalKey* b) {
+ if (b == nullptr) {
+ return -1;
+ }
+ return sstableKeyCompare(user_cmp, a, *b);
+}
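+
+// Illustrative example (the keys below are made up, not from the original
+// source): with equal user keys, only the range tombstone sentinel footer
+// affects the ordering, and the sentinel sorts first.
+//
+//   InternalKey a("k", kMaxSequenceNumber, kTypeRangeDeletion);  // sentinel
+//   InternalKey b("k", 100, kTypeValue);
+//   assert(sstableKeyCompare(BytewiseComparator(), a, b) < 0);
+//   assert(sstableKeyCompare(BytewiseComparator(), b, a) > 0);
+//   assert(sstableKeyCompare(BytewiseComparator(), b, b) == 0);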
+
+uint64_t TotalFileSize(const std::vector<FileMetaData*>& files) {
+ uint64_t sum = 0;
+ for (size_t i = 0; i < files.size() && files[i]; i++) {
+ sum += files[i]->fd.GetFileSize();
+ }
+ return sum;
+}
+
+void Compaction::SetInputVersion(Version* _input_version) {
+ input_version_ = _input_version;
+ cfd_ = input_version_->cfd();
+
+ cfd_->Ref();
+ input_version_->Ref();
+ edit_.SetColumnFamily(cfd_->GetID());
+}
+
+void Compaction::GetBoundaryKeys(
+ VersionStorageInfo* vstorage,
+ const std::vector<CompactionInputFiles>& inputs, Slice* smallest_user_key,
+ Slice* largest_user_key, int exclude_level) {
+ bool initialized = false;
+ const Comparator* ucmp = vstorage->InternalComparator()->user_comparator();
+ for (size_t i = 0; i < inputs.size(); ++i) {
+ if (inputs[i].files.empty() || inputs[i].level == exclude_level) {
+ continue;
+ }
+ if (inputs[i].level == 0) {
+ // we need to consider all files on level 0
+ for (const auto* f : inputs[i].files) {
+ const Slice& start_user_key = f->smallest.user_key();
+ if (!initialized ||
+ ucmp->Compare(start_user_key, *smallest_user_key) < 0) {
+ *smallest_user_key = start_user_key;
+ }
+ const Slice& end_user_key = f->largest.user_key();
+ if (!initialized ||
+ ucmp->Compare(end_user_key, *largest_user_key) > 0) {
+ *largest_user_key = end_user_key;
+ }
+ initialized = true;
+ }
+ } else {
+ // we only need to consider the first and last file
+ const Slice& start_user_key = inputs[i].files[0]->smallest.user_key();
+ if (!initialized ||
+ ucmp->Compare(start_user_key, *smallest_user_key) < 0) {
+ *smallest_user_key = start_user_key;
+ }
+ const Slice& end_user_key = inputs[i].files.back()->largest.user_key();
+ if (!initialized || ucmp->Compare(end_user_key, *largest_user_key) > 0) {
+ *largest_user_key = end_user_key;
+ }
+ initialized = true;
+ }
+ }
+}
+
+std::vector<CompactionInputFiles> Compaction::PopulateWithAtomicBoundaries(
+ VersionStorageInfo* vstorage, std::vector<CompactionInputFiles> inputs) {
+ const Comparator* ucmp = vstorage->InternalComparator()->user_comparator();
+ for (size_t i = 0; i < inputs.size(); i++) {
+ if (inputs[i].level == 0 || inputs[i].files.empty()) {
+ continue;
+ }
+ inputs[i].atomic_compaction_unit_boundaries.reserve(inputs[i].files.size());
+ AtomicCompactionUnitBoundary cur_boundary;
+ size_t first_atomic_idx = 0;
+ auto add_unit_boundary = [&](size_t to) {
+ if (first_atomic_idx == to) return;
+ for (size_t k = first_atomic_idx; k < to; k++) {
+ inputs[i].atomic_compaction_unit_boundaries.push_back(cur_boundary);
+ }
+ first_atomic_idx = to;
+ };
+ for (size_t j = 0; j < inputs[i].files.size(); j++) {
+ const auto* f = inputs[i].files[j];
+ if (j == 0) {
+ // First file in a level.
+ cur_boundary.smallest = &f->smallest;
+ cur_boundary.largest = &f->largest;
+ } else if (sstableKeyCompare(ucmp, *cur_boundary.largest, f->smallest) ==
+ 0) {
+ // SSTs overlap but the end key of the previous file was not
+ // artificially extended by a range tombstone. Extend the current
+ // boundary.
+ cur_boundary.largest = &f->largest;
+ } else {
+ // Atomic compaction unit has ended.
+ add_unit_boundary(j);
+ cur_boundary.smallest = &f->smallest;
+ cur_boundary.largest = &f->largest;
+ }
+ }
+ add_unit_boundary(inputs[i].files.size());
+ assert(inputs[i].files.size() ==
+ inputs[i].atomic_compaction_unit_boundaries.size());
+ }
+ return inputs;
+}
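+
+// Worked example for the grouping above (the file layout is hypothetical):
+// suppose an input level holds three files whose boundary keys are
+//   F1: [a .. k@200/Put], F2: [k@100/Put .. p], F3: [q .. z].
+// F1's largest and F2's smallest tie on user key "k" and neither one is a
+// range tombstone sentinel, so sstableKeyCompare() returns 0 and {F1, F2}
+// form one atomic compaction unit, while F3 starts a new unit. If F1's
+// largest were instead the sentinel k@kMaxSequenceNumber/RangeDeletion, the
+// comparison would be non-zero and F1 and F2 would fall into separate units.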
+
+// helper function to determine if compaction is creating files at the
+// bottommost level
+bool Compaction::IsBottommostLevel(
+ int output_level, VersionStorageInfo* vstorage,
+ const std::vector<CompactionInputFiles>& inputs) {
+ int output_l0_idx;
+ if (output_level == 0) {
+ output_l0_idx = 0;
+ for (const auto* file : vstorage->LevelFiles(0)) {
+ if (inputs[0].files.back() == file) {
+ break;
+ }
+ ++output_l0_idx;
+ }
+ assert(static_cast<size_t>(output_l0_idx) < vstorage->LevelFiles(0).size());
+ } else {
+ output_l0_idx = -1;
+ }
+ Slice smallest_key, largest_key;
+ GetBoundaryKeys(vstorage, inputs, &smallest_key, &largest_key);
+ return !vstorage->RangeMightExistAfterSortedRun(smallest_key, largest_key,
+ output_level, output_l0_idx);
+}
+
+// Test function to validate IsBottommostLevel(): determines whether a
+// compaction with the given inputs and storage is bottommost.
+bool Compaction::TEST_IsBottommostLevel(
+ int output_level, VersionStorageInfo* vstorage,
+ const std::vector<CompactionInputFiles>& inputs) {
+ return IsBottommostLevel(output_level, vstorage, inputs);
+}
+
+bool Compaction::IsFullCompaction(
+ VersionStorageInfo* vstorage,
+ const std::vector<CompactionInputFiles>& inputs) {
+ size_t num_files_in_compaction = 0;
+ size_t total_num_files = 0;
+ for (int l = 0; l < vstorage->num_levels(); l++) {
+ total_num_files += vstorage->NumLevelFiles(l);
+ }
+ for (size_t i = 0; i < inputs.size(); i++) {
+ num_files_in_compaction += inputs[i].size();
+ }
+ return num_files_in_compaction == total_num_files;
+}
+
+Compaction::Compaction(
+ VersionStorageInfo* vstorage, const ImmutableOptions& _immutable_options,
+ const MutableCFOptions& _mutable_cf_options,
+ const MutableDBOptions& _mutable_db_options,
+ std::vector<CompactionInputFiles> _inputs, int _output_level,
+ uint64_t _target_file_size, uint64_t _max_compaction_bytes,
+ uint32_t _output_path_id, CompressionType _compression,
+ CompressionOptions _compression_opts, Temperature _output_temperature,
+ uint32_t _max_subcompactions, std::vector<FileMetaData*> _grandparents,
+ bool _manual_compaction, const std::string& _trim_ts, double _score,
+ bool _deletion_compaction, bool l0_files_might_overlap,
+ CompactionReason _compaction_reason,
+ BlobGarbageCollectionPolicy _blob_garbage_collection_policy,
+ double _blob_garbage_collection_age_cutoff)
+ : input_vstorage_(vstorage),
+ start_level_(_inputs[0].level),
+ output_level_(_output_level),
+ target_output_file_size_(_target_file_size),
+ max_compaction_bytes_(_max_compaction_bytes),
+ max_subcompactions_(_max_subcompactions),
+ immutable_options_(_immutable_options),
+ mutable_cf_options_(_mutable_cf_options),
+ input_version_(nullptr),
+ number_levels_(vstorage->num_levels()),
+ cfd_(nullptr),
+ output_path_id_(_output_path_id),
+ output_compression_(_compression),
+ output_compression_opts_(_compression_opts),
+ output_temperature_(_output_temperature),
+ deletion_compaction_(_deletion_compaction),
+ l0_files_might_overlap_(l0_files_might_overlap),
+ inputs_(PopulateWithAtomicBoundaries(vstorage, std::move(_inputs))),
+ grandparents_(std::move(_grandparents)),
+ score_(_score),
+ bottommost_level_(IsBottommostLevel(output_level_, vstorage, inputs_)),
+ is_full_compaction_(IsFullCompaction(vstorage, inputs_)),
+ is_manual_compaction_(_manual_compaction),
+ trim_ts_(_trim_ts),
+ is_trivial_move_(false),
+
+ compaction_reason_(_compaction_reason),
+ notify_on_compaction_completion_(false),
+ enable_blob_garbage_collection_(
+ _blob_garbage_collection_policy == BlobGarbageCollectionPolicy::kForce
+ ? true
+ : (_blob_garbage_collection_policy ==
+ BlobGarbageCollectionPolicy::kDisable
+ ? false
+ : mutable_cf_options()->enable_blob_garbage_collection)),
+ blob_garbage_collection_age_cutoff_(
+ _blob_garbage_collection_age_cutoff < 0 ||
+ _blob_garbage_collection_age_cutoff > 1
+ ? mutable_cf_options()->blob_garbage_collection_age_cutoff
+ : _blob_garbage_collection_age_cutoff),
+ penultimate_level_(EvaluatePenultimateLevel(
+ vstorage, immutable_options_, start_level_, output_level_)) {
+ MarkFilesBeingCompacted(true);
+ if (is_manual_compaction_) {
+ compaction_reason_ = CompactionReason::kManualCompaction;
+ }
+ if (max_subcompactions_ == 0) {
+ max_subcompactions_ = _mutable_db_options.max_subcompactions;
+ }
+
+ // For non-bottommost levels, compaction tries to build output files that
+ // match the target file size, but this is not guaranteed: a file can be up
+ // to 2x the target size.
+ max_output_file_size_ =
+ bottommost_level_ || grandparents_.empty() ||
+ !_immutable_options.level_compaction_dynamic_file_size
+ ? target_output_file_size_
+ : 2 * target_output_file_size_;
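+ // For example, with a 64 MB target, a non-bottommost output level with
+ // non-empty grandparents and level_compaction_dynamic_file_size enabled is
+ // capped at 2 * 64 MB = 128 MB, while a bottommost output level keeps the
+ // 64 MB cap.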
+
+#ifndef NDEBUG
+ for (size_t i = 1; i < inputs_.size(); ++i) {
+ assert(inputs_[i].level > inputs_[i - 1].level);
+ }
+#endif
+
+ // setup input_levels_
+ {
+ input_levels_.resize(num_input_levels());
+ for (size_t which = 0; which < num_input_levels(); which++) {
+ DoGenerateLevelFilesBrief(&input_levels_[which], inputs_[which].files,
+ &arena_);
+ }
+ }
+
+ GetBoundaryKeys(vstorage, inputs_, &smallest_user_key_, &largest_user_key_);
+
+ // Every compaction, regardless of its compaction reason, may respect the
+ // existing compact cursor in the output level to split output files.
+ output_split_key_ = nullptr;
+ if (immutable_options_.compaction_style == kCompactionStyleLevel &&
+ immutable_options_.compaction_pri == kRoundRobin) {
+ const InternalKey* cursor =
+ &input_vstorage_->GetCompactCursors()[output_level_];
+ if (cursor->size() != 0) {
+ const Slice& cursor_user_key = ExtractUserKey(cursor->Encode());
+ auto ucmp = vstorage->InternalComparator()->user_comparator();
+ // May split output files according to the cursor if it falls within the
+ // user-key range.
+ if (ucmp->CompareWithoutTimestamp(cursor_user_key, smallest_user_key_) >
+ 0 &&
+ ucmp->CompareWithoutTimestamp(cursor_user_key, largest_user_key_) <=
+ 0) {
+ output_split_key_ = cursor;
+ }
+ }
+ }
+
+ PopulatePenultimateLevelOutputRange();
+}
+
+void Compaction::PopulatePenultimateLevelOutputRange() {
+ if (!SupportsPerKeyPlacement()) {
+ return;
+ }
+
+ // Excluding the last level, the key range of all input levels is the safe
+ // range of keys that can be moved up.
+ int exclude_level = number_levels_ - 1;
+ penultimate_output_range_type_ = PenultimateOutputRangeType::kNonLastRange;
+
+ // For universal compaction, the penultimate_output_range can be extended to
+ // the full range if all penultimate-level files are included in the
+ // compaction (including the case where the penultimate level is empty).
+ if (immutable_options_.compaction_style == kCompactionStyleUniversal) {
+ exclude_level = kInvalidLevel;
+ std::set<uint64_t> penultimate_inputs;
+ for (const auto& input_lvl : inputs_) {
+ if (input_lvl.level == penultimate_level_) {
+ for (const auto& file : input_lvl.files) {
+ penultimate_inputs.emplace(file->fd.GetNumber());
+ }
+ }
+ }
+ auto penultimate_files = input_vstorage_->LevelFiles(penultimate_level_);
+ for (const auto& file : penultimate_files) {
+ if (penultimate_inputs.find(file->fd.GetNumber()) ==
+ penultimate_inputs.end()) {
+ exclude_level = number_levels_ - 1;
+ penultimate_output_range_type_ = PenultimateOutputRangeType::kFullRange;
+ break;
+ }
+ }
+ }
+
+ GetBoundaryKeys(input_vstorage_, inputs_,
+ &penultimate_level_smallest_user_key_,
+ &penultimate_level_largest_user_key_, exclude_level);
+
+ // If the penultimate level output range overlaps with existing files that
+ // are not part of the compaction, disable the penultimate level output by
+ // setting the range to empty. One example is a range deletion whose
+ // boundary overlaps with the next file (which is actually a false overlap).
+ // TODO: exclude such false overlaps, so they don't disable the penultimate
+ // output.
+ std::set<uint64_t> penultimate_inputs;
+ for (const auto& input_lvl : inputs_) {
+ if (input_lvl.level == penultimate_level_) {
+ for (const auto& file : input_lvl.files) {
+ penultimate_inputs.emplace(file->fd.GetNumber());
+ }
+ }
+ }
+
+ auto penultimate_files = input_vstorage_->LevelFiles(penultimate_level_);
+ for (const auto& file : penultimate_files) {
+ if (penultimate_inputs.find(file->fd.GetNumber()) ==
+ penultimate_inputs.end() &&
+ OverlapPenultimateLevelOutputRange(file->smallest.user_key(),
+ file->largest.user_key())) {
+ // Basically disable the penultimate range output; this should be rare,
+ // or a false overlap caused by a range deletion.
+ penultimate_level_smallest_user_key_ = "";
+ penultimate_level_largest_user_key_ = "";
+ penultimate_output_range_type_ = PenultimateOutputRangeType::kDisabled;
+ }
+ }
+}
+
+Compaction::~Compaction() {
+ if (input_version_ != nullptr) {
+ input_version_->Unref();
+ }
+ if (cfd_ != nullptr) {
+ cfd_->UnrefAndTryDelete();
+ }
+}
+
+bool Compaction::SupportsPerKeyPlacement() const {
+ return penultimate_level_ != kInvalidLevel;
+}
+
+int Compaction::GetPenultimateLevel() const { return penultimate_level_; }
+
+// smallest_key and largest_key include timestamps if user-defined timestamp is
+// enabled.
+bool Compaction::OverlapPenultimateLevelOutputRange(
+ const Slice& smallest_key, const Slice& largest_key) const {
+ if (!SupportsPerKeyPlacement()) {
+ return false;
+ }
+ const Comparator* ucmp =
+ input_vstorage_->InternalComparator()->user_comparator();
+
+ return ucmp->CompareWithoutTimestamp(
+ smallest_key, penultimate_level_largest_user_key_) <= 0 &&
+ ucmp->CompareWithoutTimestamp(
+ largest_key, penultimate_level_smallest_user_key_) >= 0;
+}
+
+// key includes timestamp if user-defined timestamp is enabled.
+bool Compaction::WithinPenultimateLevelOutputRange(const Slice& key) const {
+ if (!SupportsPerKeyPlacement()) {
+ return false;
+ }
+
+ if (penultimate_level_smallest_user_key_.empty() ||
+ penultimate_level_largest_user_key_.empty()) {
+ return false;
+ }
+
+ const Comparator* ucmp =
+ input_vstorage_->InternalComparator()->user_comparator();
+
+ return ucmp->CompareWithoutTimestamp(
+ key, penultimate_level_smallest_user_key_) >= 0 &&
+ ucmp->CompareWithoutTimestamp(
+ key, penultimate_level_largest_user_key_) <= 0;
+}
+
+bool Compaction::InputCompressionMatchesOutput() const {
+ int base_level = input_vstorage_->base_level();
+ bool matches =
+ (GetCompressionType(input_vstorage_, mutable_cf_options_, start_level_,
+ base_level) == output_compression_);
+ if (matches) {
+ TEST_SYNC_POINT("Compaction::InputCompressionMatchesOutput:Matches");
+ return true;
+ }
+ TEST_SYNC_POINT("Compaction::InputCompressionMatchesOutput:DidntMatch");
+ return matches;
+}
+
+bool Compaction::IsTrivialMove() const {
+ // Avoid a move if there is a lot of overlapping grandparent data.
+ // Otherwise, the move could create a parent file that will require
+ // a very expensive merge later on.
+ // If start_level_ == output_level_, the purpose is to force the compaction
+ // filter to be applied to that level, so this cannot be a trivial move.
+
+ // Check if the start level has files with overlapping ranges.
+ if (start_level_ == 0 && input_vstorage_->level0_non_overlapping() == false &&
+ l0_files_might_overlap_) {
+ // We cannot move files from L0 to L1 if the L0 files in the LSM-tree are
+ // overlapping, unless we are sure that files picked in L0 don't overlap.
+ return false;
+ }
+
+ if (is_manual_compaction_ &&
+ (immutable_options_.compaction_filter != nullptr ||
+ immutable_options_.compaction_filter_factory != nullptr)) {
+ // This is a manual compaction with a compaction filter that should be
+ // executed, so we cannot do a trivial move.
+ return false;
+ }
+
+ if (start_level_ == output_level_) {
+ // It doesn't make sense for the compaction picker to pick files just to
+ // trivially move them to the same level.
+ return false;
+ }
+
+ // Used in universal compaction, where a trivial move can be done if the
+ // input files are non-overlapping.
+ if ((mutable_cf_options_.compaction_options_universal.allow_trivial_move) &&
+ (output_level_ != 0) &&
+ (cfd_->ioptions()->compaction_style == kCompactionStyleUniversal)) {
+ return is_trivial_move_;
+ }
+
+ if (!(start_level_ != output_level_ && num_input_levels() == 1 &&
+ input(0, 0)->fd.GetPathId() == output_path_id() &&
+ InputCompressionMatchesOutput())) {
+ return false;
+ }
+
+ // assert inputs_.size() == 1
+
+ std::unique_ptr<SstPartitioner> partitioner = CreateSstPartitioner();
+
+ for (const auto& file : inputs_.front().files) {
+ std::vector<FileMetaData*> file_grand_parents;
+ if (output_level_ + 1 >= number_levels_) {
+ continue;
+ }
+ input_vstorage_->GetOverlappingInputs(output_level_ + 1, &file->smallest,
+ &file->largest, &file_grand_parents);
+ const auto compaction_size =
+ file->fd.GetFileSize() + TotalFileSize(file_grand_parents);
+ if (compaction_size > max_compaction_bytes_) {
+ return false;
+ }
+
+ if (partitioner.get() != nullptr) {
+ if (!partitioner->CanDoTrivialMove(file->smallest.user_key(),
+ file->largest.user_key())) {
+ return false;
+ }
+ }
+ }
+
+ // PerKeyPlacement compaction should never be a trivial move.
+ if (SupportsPerKeyPlacement()) {
+ return false;
+ }
+
+ return true;
+}
+
+void Compaction::AddInputDeletions(VersionEdit* out_edit) {
+ for (size_t which = 0; which < num_input_levels(); which++) {
+ for (size_t i = 0; i < inputs_[which].size(); i++) {
+ out_edit->DeleteFile(level(which), inputs_[which][i]->fd.GetNumber());
+ }
+ }
+}
+
+bool Compaction::KeyNotExistsBeyondOutputLevel(
+ const Slice& user_key, std::vector<size_t>* level_ptrs) const {
+ assert(input_version_ != nullptr);
+ assert(level_ptrs != nullptr);
+ assert(level_ptrs->size() == static_cast<size_t>(number_levels_));
+ if (bottommost_level_) {
+ return true;
+ } else if (output_level_ != 0 &&
+ cfd_->ioptions()->compaction_style == kCompactionStyleLevel) {
+ // Maybe use binary search to find right entry instead of linear search?
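+ // level_ptrs caches the scan position for each level beyond the output
+ // level; it relies on the caller passing user keys in non-decreasing order,
+ // so each per-level index only ever needs to move forward.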
+ const Comparator* user_cmp = cfd_->user_comparator();
+ for (int lvl = output_level_ + 1; lvl < number_levels_; lvl++) {
+ const std::vector<FileMetaData*>& files =
+ input_vstorage_->LevelFiles(lvl);
+ for (; level_ptrs->at(lvl) < files.size(); level_ptrs->at(lvl)++) {
+ auto* f = files[level_ptrs->at(lvl)];
+ if (user_cmp->Compare(user_key, f->largest.user_key()) <= 0) {
+ // We've advanced far enough
+ // In the presence of user-defined timestamp, we may need to handle
+ // the case in which f->smallest.user_key() (including ts) has the
+ // same user key, but the ts part is smaller. If so,
+ // Compare(user_key, f->smallest.user_key()) returns -1.
+ // That's why we need CompareWithoutTimestamp().
+ if (user_cmp->CompareWithoutTimestamp(user_key,
+ f->smallest.user_key()) >= 0) {
+ // Key falls in this file's range, so it may
+ // exist beyond output level
+ return false;
+ }
+ break;
+ }
+ }
+ }
+ return true;
+ }
+ return false;
+}
+
+// Mark (or clear) each file that is being compacted
+void Compaction::MarkFilesBeingCompacted(bool mark_as_compacted) {
+ for (size_t i = 0; i < num_input_levels(); i++) {
+ for (size_t j = 0; j < inputs_[i].size(); j++) {
+ assert(mark_as_compacted ? !inputs_[i][j]->being_compacted
+ : inputs_[i][j]->being_compacted);
+ inputs_[i][j]->being_compacted = mark_as_compacted;
+ }
+ }
+}
+
+// Sample output:
+// If compacting 3 L0 files, 2 L3 files and 1 L4 file, and outputting to L5,
+// print: "3@0 + 2@3 + 1@4 files to L5"
+const char* Compaction::InputLevelSummary(
+ InputLevelSummaryBuffer* scratch) const {
+ int len = 0;
+ bool is_first = true;
+ for (auto& input_level : inputs_) {
+ if (input_level.empty()) {
+ continue;
+ }
+ if (!is_first) {
+ len +=
+ snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len, " + ");
+ len = std::min(len, static_cast<int>(sizeof(scratch->buffer)));
+ } else {
+ is_first = false;
+ }
+ len += snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len,
+ "%" ROCKSDB_PRIszt "@%d", input_level.size(),
+ input_level.level);
+ len = std::min(len, static_cast<int>(sizeof(scratch->buffer)));
+ }
+ snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len,
+ " files to L%d", output_level());
+
+ return scratch->buffer;
+}
+
+uint64_t Compaction::CalculateTotalInputSize() const {
+ uint64_t size = 0;
+ for (auto& input_level : inputs_) {
+ for (auto f : input_level.files) {
+ size += f->fd.GetFileSize();
+ }
+ }
+ return size;
+}
+
+void Compaction::ReleaseCompactionFiles(Status status) {
+ MarkFilesBeingCompacted(false);
+ cfd_->compaction_picker()->ReleaseCompactionFiles(this, status);
+}
+
+void Compaction::ResetNextCompactionIndex() {
+ assert(input_version_ != nullptr);
+ input_vstorage_->ResetNextCompactionIndex(start_level_);
+}
+
+namespace {
+int InputSummary(const std::vector<FileMetaData*>& files, char* output,
+ int len) {
+ *output = '\0';
+ int write = 0;
+ for (size_t i = 0; i < files.size(); i++) {
+ int sz = len - write;
+ int ret;
+ char sztxt[16];
+ AppendHumanBytes(files.at(i)->fd.GetFileSize(), sztxt, 16);
+ ret = snprintf(output + write, sz, "%" PRIu64 "(%s) ",
+ files.at(i)->fd.GetNumber(), sztxt);
+ if (ret < 0 || ret >= sz) break;
+ write += ret;
+ }
+ // if files.size() is non-zero, overwrite the last space
+ return write - !!files.size();
+}
+} // namespace
+
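+// Sample output (illustrative; file sizes are formatted by AppendHumanBytes):
+//   "Base version 123 Base level 1, inputs: [27(4MB) 29(2MB)], [31(54MB)]"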
+void Compaction::Summary(char* output, int len) {
+ int write =
+ snprintf(output, len, "Base version %" PRIu64 " Base level %d, inputs: [",
+ input_version_->GetVersionNumber(), start_level_);
+ if (write < 0 || write >= len) {
+ return;
+ }
+
+ for (size_t level_iter = 0; level_iter < num_input_levels(); ++level_iter) {
+ if (level_iter > 0) {
+ write += snprintf(output + write, len - write, "], [");
+ if (write < 0 || write >= len) {
+ return;
+ }
+ }
+ write +=
+ InputSummary(inputs_[level_iter].files, output + write, len - write);
+ if (write < 0 || write >= len) {
+ return;
+ }
+ }
+
+ snprintf(output + write, len - write, "]");
+}
+
+uint64_t Compaction::OutputFilePreallocationSize() const {
+ uint64_t preallocation_size = 0;
+
+ for (const auto& level_files : inputs_) {
+ for (const auto& file : level_files.files) {
+ preallocation_size += file->fd.GetFileSize();
+ }
+ }
+
+ if (max_output_file_size_ != std::numeric_limits<uint64_t>::max() &&
+ (immutable_options_.compaction_style == kCompactionStyleLevel ||
+ output_level() > 0)) {
+ preallocation_size = std::min(max_output_file_size_, preallocation_size);
+ }
+
+ // Over-estimate slightly so we don't end up just barely crossing the
+ // threshold. There is no point in preallocating more than 1 GB.
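+ // Worked example (illustrative): with level compaction, 200 MB of total
+ // input and a 64 MB max_output_file_size_, the preallocation hint is
+ // min(1 GB, 64 MB + 64 MB / 10), i.e. roughly 70 MB.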
+ return std::min(uint64_t{1073741824},
+ preallocation_size + (preallocation_size / 10));
+}
+
+std::unique_ptr<CompactionFilter> Compaction::CreateCompactionFilter() const {
+ if (!cfd_->ioptions()->compaction_filter_factory) {
+ return nullptr;
+ }
+
+ if (!cfd_->ioptions()
+ ->compaction_filter_factory->ShouldFilterTableFileCreation(
+ TableFileCreationReason::kCompaction)) {
+ return nullptr;
+ }
+
+ CompactionFilter::Context context;
+ context.is_full_compaction = is_full_compaction_;
+ context.is_manual_compaction = is_manual_compaction_;
+ context.column_family_id = cfd_->GetID();
+ context.reason = TableFileCreationReason::kCompaction;
+ return cfd_->ioptions()->compaction_filter_factory->CreateCompactionFilter(
+ context);
+}
+
+std::unique_ptr<SstPartitioner> Compaction::CreateSstPartitioner() const {
+ if (!immutable_options_.sst_partitioner_factory) {
+ return nullptr;
+ }
+
+ SstPartitioner::Context context;
+ context.is_full_compaction = is_full_compaction_;
+ context.is_manual_compaction = is_manual_compaction_;
+ context.output_level = output_level_;
+ context.smallest_user_key = smallest_user_key_;
+ context.largest_user_key = largest_user_key_;
+ return immutable_options_.sst_partitioner_factory->CreatePartitioner(context);
+}
+
+bool Compaction::IsOutputLevelEmpty() const {
+ return inputs_.back().level != output_level_ || inputs_.back().empty();
+}
+
+bool Compaction::ShouldFormSubcompactions() const {
+ if (cfd_ == nullptr) {
+ return false;
+ }
+
+ // Round-Robin pri under leveled compaction allows subcompactions by default
+ // and the number of subcompactions can be larger than max_subcompactions_
+ if (cfd_->ioptions()->compaction_pri == kRoundRobin &&
+ cfd_->ioptions()->compaction_style == kCompactionStyleLevel) {
+ return output_level_ > 0;
+ }
+
+ if (max_subcompactions_ <= 1) {
+ return false;
+ }
+
+ if (cfd_->ioptions()->compaction_style == kCompactionStyleLevel) {
+ return (start_level_ == 0 || is_manual_compaction_) && output_level_ > 0;
+ } else if (cfd_->ioptions()->compaction_style == kCompactionStyleUniversal) {
+ return number_levels_ > 1 && output_level_ > 0;
+ } else {
+ return false;
+ }
+}
+
+bool Compaction::DoesInputReferenceBlobFiles() const {
+ assert(input_version_);
+
+ const VersionStorageInfo* storage_info = input_version_->storage_info();
+ assert(storage_info);
+
+ if (storage_info->GetBlobFiles().empty()) {
+ return false;
+ }
+
+ for (size_t i = 0; i < inputs_.size(); ++i) {
+ for (const FileMetaData* meta : inputs_[i].files) {
+ assert(meta);
+
+ if (meta->oldest_blob_file_number != kInvalidBlobFileNumber) {
+ return true;
+ }
+ }
+ }
+
+ return false;
+}
+
+uint64_t Compaction::MinInputFileOldestAncesterTime(
+ const InternalKey* start, const InternalKey* end) const {
+ uint64_t min_oldest_ancester_time = std::numeric_limits<uint64_t>::max();
+ const InternalKeyComparator& icmp =
+ column_family_data()->internal_comparator();
+ for (const auto& level_files : inputs_) {
+ for (const auto& file : level_files.files) {
+ if (start != nullptr && icmp.Compare(file->largest, *start) < 0) {
+ continue;
+ }
+ if (end != nullptr && icmp.Compare(file->smallest, *end) > 0) {
+ continue;
+ }
+ uint64_t oldest_ancester_time = file->TryGetOldestAncesterTime();
+ if (oldest_ancester_time != 0) {
+ min_oldest_ancester_time =
+ std::min(min_oldest_ancester_time, oldest_ancester_time);
+ }
+ }
+ }
+ return min_oldest_ancester_time;
+}
+
+int Compaction::EvaluatePenultimateLevel(
+ const VersionStorageInfo* vstorage,
+ const ImmutableOptions& immutable_options, const int start_level,
+ const int output_level) {
+ // TODO: currently the per_key_placement feature only supports level and
+ // universal compaction.
+ if (immutable_options.compaction_style != kCompactionStyleLevel &&
+ immutable_options.compaction_style != kCompactionStyleUniversal) {
+ return kInvalidLevel;
+ }
+ if (output_level != immutable_options.num_levels - 1) {
+ return kInvalidLevel;
+ }
+
+ int penultimate_level = output_level - 1;
+ assert(penultimate_level < immutable_options.num_levels);
+ if (penultimate_level <= 0) {
+ return kInvalidLevel;
+ }
+
+ // If the penultimate level is not within the input-level -> output-level
+ // range, check whether the penultimate output level is empty; if it is
+ // empty, it can still be locked for the penultimate output.
+ // TODO: ideally, this only needs to check whether there's a file within the
+ // compaction output key range. For simplicity, it just checks whether
+ // there's any file on the penultimate level.
+ if (start_level == immutable_options.num_levels - 1 &&
+ (immutable_options.compaction_style != kCompactionStyleUniversal ||
+ !vstorage->LevelFiles(penultimate_level).empty())) {
+ return kInvalidLevel;
+ }
+
+ bool supports_per_key_placement =
+ immutable_options.preclude_last_level_data_seconds > 0;
+
+ // It can be overridden by unit tests.
+ TEST_SYNC_POINT_CALLBACK("Compaction::SupportsPerKeyPlacement:Enabled",
+ &supports_per_key_placement);
+ if (!supports_per_key_placement) {
+ return kInvalidLevel;
+ }
+
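+ // Example: with num_levels = 7 and output_level = 6, the penultimate level
+ // is 5. It is only returned when per-key placement is enabled and, if the
+ // inputs start from the last level, only for universal compaction with an
+ // empty level 5 (checked above).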
+ return penultimate_level;
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/compaction.h b/src/rocksdb/db/compaction/compaction.h
new file mode 100644
index 000000000..21d1190ac
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction.h
@@ -0,0 +1,559 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include "db/version_set.h"
+#include "memory/arena.h"
+#include "options/cf_options.h"
+#include "rocksdb/sst_partitioner.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+// The file contains class Compaction, as well as some helper functions
+// and data structures used by the class.
+
+// Utility for comparing sstable boundary keys. Returns -1 if either a or b is
+// null which provides the property that a==null indicates a key that is less
+// than any key and b==null indicates a key that is greater than any key. Note
+// that the comparison is performed primarily on the user-key portion of the
+// key. If the user-keys compare equal, an additional test is made to sort
+// range tombstone sentinel keys before other keys with the same user-key. The
+// result is that 2 user-keys will compare equal if they differ purely on
+// their sequence number and value, but the range tombstone sentinel for that
+// user-key will compare not equal. This is necessary because the range
+// tombstone sentinel key is set as the largest key for an sstable even though
+// that key never appears in the database. We don't want adjacent sstables to
+// be considered overlapping if they are separated by the range tombstone
+// sentinel.
+int sstableKeyCompare(const Comparator* user_cmp, const InternalKey& a,
+ const InternalKey& b);
+int sstableKeyCompare(const Comparator* user_cmp, const InternalKey* a,
+ const InternalKey& b);
+int sstableKeyCompare(const Comparator* user_cmp, const InternalKey& a,
+ const InternalKey* b);
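+// Example: if file F1's largest key is the range tombstone sentinel for user
+// key "k" and file F2's smallest key is a regular entry for "k", then
+// sstableKeyCompare(ucmp, F1.largest, F2.smallest) < 0, so F1 and F2 are not
+// considered overlapping; passing nullptr for either argument likewise yields
+// a negative result, as described above.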
+
+// An AtomicCompactionUnitBoundary represents a range of keys [smallest,
+// largest] that exactly spans one or more neighbouring SSTs on the same
+// level. Every pair of SSTs in this range "overlap" (i.e., the largest
+// user key of one file is the smallest user key of the next file). These
+// boundaries are propagated down to RangeDelAggregator during compaction
+// to provide safe truncation boundaries for range tombstones.
+struct AtomicCompactionUnitBoundary {
+ const InternalKey* smallest = nullptr;
+ const InternalKey* largest = nullptr;
+};
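+// Example: files on one level with user-key ranges [a, c], [c, e] and [g, h]
+// form two atomic compaction units: [a, e] (the first two files share the
+// boundary user key "c") and [g, h]. The unit boundaries, rather than the
+// per-file boundaries, are used as truncation bounds for range tombstones.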
+
+// The structure that manages compaction input files associated
+// with the same physical level.
+struct CompactionInputFiles {
+ int level;
+ std::vector<FileMetaData*> files;
+ std::vector<AtomicCompactionUnitBoundary> atomic_compaction_unit_boundaries;
+ inline bool empty() const { return files.empty(); }
+ inline size_t size() const { return files.size(); }
+ inline void clear() { files.clear(); }
+ inline FileMetaData* operator[](size_t i) const { return files[i]; }
+};
+
+class Version;
+class ColumnFamilyData;
+class VersionStorageInfo;
+class CompactionFilter;
+
+// A Compaction encapsulates metadata about a compaction.
+class Compaction {
+ public:
+ Compaction(VersionStorageInfo* input_version,
+ const ImmutableOptions& immutable_options,
+ const MutableCFOptions& mutable_cf_options,
+ const MutableDBOptions& mutable_db_options,
+ std::vector<CompactionInputFiles> inputs, int output_level,
+ uint64_t target_file_size, uint64_t max_compaction_bytes,
+ uint32_t output_path_id, CompressionType compression,
+ CompressionOptions compression_opts,
+ Temperature output_temperature, uint32_t max_subcompactions,
+ std::vector<FileMetaData*> grandparents,
+ bool manual_compaction = false, const std::string& trim_ts = "",
+ double score = -1, bool deletion_compaction = false,
+ bool l0_files_might_overlap = true,
+ CompactionReason compaction_reason = CompactionReason::kUnknown,
+ BlobGarbageCollectionPolicy blob_garbage_collection_policy =
+ BlobGarbageCollectionPolicy::kUseDefault,
+ double blob_garbage_collection_age_cutoff = -1);
+
+ // The type of the penultimate level output range
+ enum class PenultimateOutputRangeType : int {
+ kNotSupported, // it cannot output to the penultimate level
+ kFullRange, // any data could be output to the penultimate level
+ kNonLastRange, // only the keys within non_last_level compaction inputs can
+ // be outputted to the penultimate level
+ kDisabled, // no data can be outputted to the penultimate level
+ };
+
+ // No copying allowed
+ Compaction(const Compaction&) = delete;
+ void operator=(const Compaction&) = delete;
+
+ ~Compaction();
+
+ // Returns the level associated with the specified compaction input level.
+ // If compaction_input_level is not specified, then input_level is set to 0.
+ int level(size_t compaction_input_level = 0) const {
+ return inputs_[compaction_input_level].level;
+ }
+
+ int start_level() const { return start_level_; }
+
+ // Outputs will go to this level
+ int output_level() const { return output_level_; }
+
+ // Returns the number of input levels in this compaction.
+ size_t num_input_levels() const { return inputs_.size(); }
+
+ // Return the object that holds the edits to the descriptor done
+ // by this compaction.
+ VersionEdit* edit() { return &edit_; }
+
+ // Returns the number of input files associated with the specified
+ // compaction input level.
+ // The function returns 0 when "compaction_input_level" is out of range,
+ // i.e. "compaction_input_level" >= "num_input_levels()".
+ size_t num_input_files(size_t compaction_input_level) const {
+ if (compaction_input_level < inputs_.size()) {
+ return inputs_[compaction_input_level].size();
+ }
+ return 0;
+ }
+
+ // Returns input version of the compaction
+ Version* input_version() const { return input_version_; }
+
+ // Returns the ColumnFamilyData associated with the compaction.
+ ColumnFamilyData* column_family_data() const { return cfd_; }
+
+ // Returns the file meta data of the 'i'th input file at the
+ // specified compaction input level.
+ // REQUIREMENT: "compaction_input_level" must be >= 0 and
+ // < "num_input_levels()"
+ FileMetaData* input(size_t compaction_input_level, size_t i) const {
+ assert(compaction_input_level < inputs_.size());
+ return inputs_[compaction_input_level][i];
+ }
+
+ const std::vector<AtomicCompactionUnitBoundary>* boundaries(
+ size_t compaction_input_level) const {
+ assert(compaction_input_level < inputs_.size());
+ return &inputs_[compaction_input_level].atomic_compaction_unit_boundaries;
+ }
+
+ // Returns the list of file meta data of the specified compaction
+ // input level.
+ // REQUIREMENT: "compaction_input_level" must be >= 0 and
+ // < "num_input_levels()"
+ const std::vector<FileMetaData*>* inputs(
+ size_t compaction_input_level) const {
+ assert(compaction_input_level < inputs_.size());
+ return &inputs_[compaction_input_level].files;
+ }
+
+ const std::vector<CompactionInputFiles>* inputs() { return &inputs_; }
+
+ // Returns the LevelFilesBrief of the specified compaction input level.
+ const LevelFilesBrief* input_levels(size_t compaction_input_level) const {
+ return &input_levels_[compaction_input_level];
+ }
+
+ // Maximum size of files to build during this compaction.
+ uint64_t max_output_file_size() const { return max_output_file_size_; }
+
+ // Target output file size for this compaction
+ uint64_t target_output_file_size() const { return target_output_file_size_; }
+
+ // What compression for output
+ CompressionType output_compression() const { return output_compression_; }
+
+ // What compression options for output
+ const CompressionOptions& output_compression_opts() const {
+ return output_compression_opts_;
+ }
+
+ // The id of the DB path that output files should be written to; a non-zero
+ // value indicates a secondary DB path.
+ uint32_t output_path_id() const { return output_path_id_; }
+
+ // Is this a trivial compaction that can be implemented by just
+ // moving a single input file to the next level (no merging or splitting)
+ bool IsTrivialMove() const;
+
+ // The split user key in the output level if this compaction is required to
+ // split the output files according to the existing cursor in the output
+ // level under the round-robin compaction policy. nullptr indicates that no
+ // splitting key is required.
+ const InternalKey* GetOutputSplitKey() const { return output_split_key_; }
+
+ // If true, then the compaction can be done by simply deleting input files.
+ bool deletion_compaction() const { return deletion_compaction_; }
+
+ // Add all inputs to this compaction as delete operations to *edit.
+ void AddInputDeletions(VersionEdit* edit);
+
+ // Returns true if the available information we have guarantees that
+ // the input "user_key" does not exist in any level beyond "output_level()".
+ bool KeyNotExistsBeyondOutputLevel(const Slice& user_key,
+ std::vector<size_t>* level_ptrs) const;
+
+ // Clear all files to indicate that they are not being compacted
+ // Delete this compaction from the list of running compactions.
+ //
+ // Requirement: DB mutex held
+ void ReleaseCompactionFiles(Status status);
+
+ // Returns the summary of the compaction in "output" with maximum "len"
+ // in bytes. The caller is responsible for the memory management of
+ // "output".
+ void Summary(char* output, int len);
+
+ // Return the score that was used to pick this compaction run.
+ double score() const { return score_; }
+
+ // Is this compaction creating a file in the bottom most level?
+ bool bottommost_level() const { return bottommost_level_; }
+
+ // Does the compaction compact to the last level?
+ bool is_last_level() const {
+ return output_level_ == immutable_options_.num_levels - 1;
+ }
+
+ // Does this compaction include all sst files?
+ bool is_full_compaction() const { return is_full_compaction_; }
+
+ // Was this compaction triggered manually by the client?
+ bool is_manual_compaction() const { return is_manual_compaction_; }
+
+ std::string trim_ts() const { return trim_ts_; }
+
+ // Used when the allow_trivial_move option is set for
+ // universal compaction. If all the input files are
+ // non-overlapping, the is_trivial_move_ variable
+ // is set to true; otherwise false.
+ void set_is_trivial_move(bool trivial_move) {
+ is_trivial_move_ = trivial_move;
+ }
+
+ // Used when the allow_trivial_move option is set for
+ // universal compaction. Returns true if the input files
+ // are non-overlapping and can be trivially moved.
+ bool is_trivial_move() const { return is_trivial_move_; }
+
+ // How many total levels are there?
+ int number_levels() const { return number_levels_; }
+
+ // Return the ImmutableOptions that should be used throughout the compaction
+ // procedure
+ const ImmutableOptions* immutable_options() const {
+ return &immutable_options_;
+ }
+
+ // Return the MutableCFOptions that should be used throughout the compaction
+ // procedure
+ const MutableCFOptions* mutable_cf_options() const {
+ return &mutable_cf_options_;
+ }
+
+ // Returns the size in bytes that the output file should be preallocated to.
+ // In level compaction, that is max_output_file_size_. In universal
+ // compaction, that is the sum of all input file sizes.
+ uint64_t OutputFilePreallocationSize() const;
+
+ void SetInputVersion(Version* input_version);
+
+ struct InputLevelSummaryBuffer {
+ char buffer[128];
+ };
+
+ const char* InputLevelSummary(InputLevelSummaryBuffer* scratch) const;
+
+ uint64_t CalculateTotalInputSize() const;
+
+ // In case of compaction error, reset the nextIndex that is used
+ // to pick up the next file to be compacted from files_by_size_
+ void ResetNextCompactionIndex();
+
+ // Create a CompactionFilter from compaction_filter_factory
+ std::unique_ptr<CompactionFilter> CreateCompactionFilter() const;
+
+ // Create a SstPartitioner from sst_partitioner_factory
+ std::unique_ptr<SstPartitioner> CreateSstPartitioner() const;
+
+ // Is the input level corresponding to output_level_ empty?
+ bool IsOutputLevelEmpty() const;
+
+ // Should this compaction be broken up into smaller ones run in parallel?
+ bool ShouldFormSubcompactions() const;
+
+ // Returns true iff at least one input file references a blob file.
+ //
+ // PRE: input version has been set.
+ bool DoesInputReferenceBlobFiles() const;
+
+ // Test-only wrapper around IsBottommostLevel(), which determines whether a
+ // compaction with the given inputs and storage is bottommost.
+ static bool TEST_IsBottommostLevel(
+ int output_level, VersionStorageInfo* vstorage,
+ const std::vector<CompactionInputFiles>& inputs);
+
+ TablePropertiesCollection GetOutputTableProperties() const {
+ return output_table_properties_;
+ }
+
+ void SetOutputTableProperties(TablePropertiesCollection tp) {
+ output_table_properties_ = std::move(tp);
+ }
+
+ Slice GetSmallestUserKey() const { return smallest_user_key_; }
+
+ Slice GetLargestUserKey() const { return largest_user_key_; }
+
+ Slice GetPenultimateLevelSmallestUserKey() const {
+ return penultimate_level_smallest_user_key_;
+ }
+
+ Slice GetPenultimateLevelLargestUserKey() const {
+ return penultimate_level_largest_user_key_;
+ }
+
+ PenultimateOutputRangeType GetPenultimateOutputRangeType() const {
+ return penultimate_output_range_type_;
+ }
+
+ // Return true if the compaction supports per_key_placement
+ bool SupportsPerKeyPlacement() const;
+
+ // Get the per_key_placement penultimate output level, which is
+ // `last_level - 1` if the per_key_placement feature is supported.
+ // Otherwise, returns kInvalidLevel (-1).
+ int GetPenultimateLevel() const;
+
+ // Return true if the given range overlaps with the penultimate level output
+ // range.
+ // Both smallest_key and largest_key include timestamps if user-defined
+ // timestamp is enabled.
+ bool OverlapPenultimateLevelOutputRange(const Slice& smallest_key,
+ const Slice& largest_key) const;
+
+ // Return true if the key is within the penultimate level output range for
+ // the per_key_placement feature, i.e. it is safe to place the key in the
+ // penultimate level. Different compaction strategies have different rules.
+ // If per_key_placement is not supported, this always returns false.
+ // TODO: currently it doesn't support moving data from the last level to the
+ // penultimate level.
+ // The key includes the timestamp if user-defined timestamps are enabled.
+ bool WithinPenultimateLevelOutputRange(const Slice& key) const;
+
+ CompactionReason compaction_reason() const { return compaction_reason_; }
+
+ const std::vector<FileMetaData*>& grandparents() const {
+ return grandparents_;
+ }
+
+ uint64_t max_compaction_bytes() const { return max_compaction_bytes_; }
+
+ Temperature output_temperature() const { return output_temperature_; }
+
+ uint32_t max_subcompactions() const { return max_subcompactions_; }
+
+ bool enable_blob_garbage_collection() const {
+ return enable_blob_garbage_collection_;
+ }
+
+ double blob_garbage_collection_age_cutoff() const {
+ return blob_garbage_collection_age_cutoff_;
+ }
+
+ // start and end are the sub-compaction key range; null means no boundary.
+ // They are used to filter out input files whose key range falls outside the
+ // sub-compaction when computing the minimum oldest ancestor time.
+ uint64_t MinInputFileOldestAncesterTime(const InternalKey* start,
+ const InternalKey* end) const;
+
+ // Called by DBImpl::NotifyOnCompactionCompleted to make sure number of
+ // compaction begin and compaction completion callbacks match.
+ void SetNotifyOnCompactionCompleted() {
+ notify_on_compaction_completion_ = true;
+ }
+
+ bool ShouldNotifyOnCompactionCompleted() const {
+ return notify_on_compaction_completion_;
+ }
+
+ static constexpr int kInvalidLevel = -1;
+
+ // Evaluate the penultimate output level. If the compaction supports the
+ // per_key_placement feature, this returns the penultimate level number.
+ // Otherwise, it returns kInvalidLevel (-1), which means
+ // output_to_penultimate_level is not supported.
+ // Note: even when penultimate level output is supported (the returned level
+ // != kInvalidLevel), some key ranges may be unsafe to output to the
+ // penultimate level. The safe key range is populated by
+ // `PopulatePenultimateLevelOutputRange()`, which could potentially disable
+ // all penultimate level output.
+ static int EvaluatePenultimateLevel(const VersionStorageInfo* vstorage,
+ const ImmutableOptions& immutable_options,
+ const int start_level,
+ const int output_level);
+
+ private:
+ // mark (or clear) all files that are being compacted
+ void MarkFilesBeingCompacted(bool mark_as_compacted);
+
+ // get the smallest and largest key present in files to be compacted
+ static void GetBoundaryKeys(VersionStorageInfo* vstorage,
+ const std::vector<CompactionInputFiles>& inputs,
+ Slice* smallest_key, Slice* largest_key,
+ int exclude_level = -1);
+
+ // Populate the penultimate level output range, which is used to determine
+ // whether a key is safe to output to the penultimate level (for details see
+ // `Compaction::WithinPenultimateLevelOutputRange()`).
+ void PopulatePenultimateLevelOutputRange();
+
+ // Get the atomic file boundaries for all files in the compaction. Necessary
+ // in order to avoid the scenario described in
+ // https://github.com/facebook/rocksdb/pull/4432#discussion_r221072219 and
+ // plumb down appropriate key boundaries to RangeDelAggregator during
+ // compaction.
+ static std::vector<CompactionInputFiles> PopulateWithAtomicBoundaries(
+ VersionStorageInfo* vstorage, std::vector<CompactionInputFiles> inputs);
+
+ // helper function to determine if compaction with inputs and storage is
+ // bottommost
+ static bool IsBottommostLevel(
+ int output_level, VersionStorageInfo* vstorage,
+ const std::vector<CompactionInputFiles>& inputs);
+
+ static bool IsFullCompaction(VersionStorageInfo* vstorage,
+ const std::vector<CompactionInputFiles>& inputs);
+
+ VersionStorageInfo* input_vstorage_;
+
+ const int start_level_; // the lowest level to be compacted
+ const int output_level_; // the level to which output files are stored
+ uint64_t target_output_file_size_;
+ uint64_t max_output_file_size_;
+ uint64_t max_compaction_bytes_;
+ uint32_t max_subcompactions_;
+ const ImmutableOptions immutable_options_;
+ const MutableCFOptions mutable_cf_options_;
+ Version* input_version_;
+ VersionEdit edit_;
+ const int number_levels_;
+ ColumnFamilyData* cfd_;
+ Arena arena_; // Arena used to allocate space for file_levels_
+
+ const uint32_t output_path_id_;
+ CompressionType output_compression_;
+ CompressionOptions output_compression_opts_;
+ Temperature output_temperature_;
+ // If true, then the compaction can be done by simply deleting input files.
+ const bool deletion_compaction_;
+ // should it split the output file using the compact cursor?
+ const InternalKey* output_split_key_;
+
+ // L0 files in the LSM-tree might be overlapping, but the compaction picking
+ // logic might pick a subset of files that aren't overlapping. If that is
+ // the case, this is set to false; otherwise, it is set to true.
+ bool l0_files_might_overlap_;
+
+ // Compaction input files organized by level. Constant after construction
+ const std::vector<CompactionInputFiles> inputs_;
+
+ // A copy of inputs_, organized more closely in memory
+ autovector<LevelFilesBrief, 2> input_levels_;
+
+ // State used to check for number of overlapping grandparent files
+ // (grandparent == "output_level_ + 1")
+ std::vector<FileMetaData*> grandparents_;
+ const double score_; // score that was used to pick this compaction.
+
+ // Is this compaction creating a file in the bottom most level?
+ const bool bottommost_level_;
+ // Does this compaction include all sst files?
+ const bool is_full_compaction_;
+
+ // Is this compaction requested by the client?
+ const bool is_manual_compaction_;
+
+ // The data with timestamp > trim_ts_ will be removed
+ const std::string trim_ts_;
+
+ // True if we can do a trivial move in universal multi-level
+ // compaction.
+ bool is_trivial_move_;
+
+ // Does input compression match the output compression?
+ bool InputCompressionMatchesOutput() const;
+
+ // table properties of output files
+ TablePropertiesCollection output_table_properties_;
+
+ // smallest user keys in compaction
+ // includes timestamp if user-defined timestamp is enabled.
+ Slice smallest_user_key_;
+
+ // largest user keys in compaction
+ // includes timestamp if user-defined timestamp is enabled.
+ Slice largest_user_key_;
+
+ // Reason for compaction
+ CompactionReason compaction_reason_;
+
+ // Notify on compaction completion only if listener was notified on compaction
+ // begin.
+ bool notify_on_compaction_completion_;
+
+ // Enable/disable garbage collection for blobs during compaction.
+ bool enable_blob_garbage_collection_;
+
+ // Blob garbage collection age cutoff.
+ double blob_garbage_collection_age_cutoff_;
+
+ // Only set when the per_key_placement feature is enabled; -1 (kInvalidLevel)
+ // means not supported.
+ const int penultimate_level_;
+
+ // Key range for penultimate level output
+ // includes timestamp if user-defined timestamp is enabled.
+ // penultimate_output_range_type_ shows the range type
+ Slice penultimate_level_smallest_user_key_;
+ Slice penultimate_level_largest_user_key_;
+ PenultimateOutputRangeType penultimate_output_range_type_ =
+ PenultimateOutputRangeType::kNotSupported;
+};
+
+#ifndef NDEBUG
+// Helper struct only for tests, which contains the data to decide if a key
+// should be output to the penultimate level.
+// TODO: remove this when the public feature knob is available
+struct PerKeyPlacementContext {
+ const int level;
+ const Slice key;
+ const Slice value;
+ const SequenceNumber seq_num;
+
+ bool output_to_penultimate_level;
+
+ PerKeyPlacementContext(int _level, Slice _key, Slice _value,
+ SequenceNumber _seq_num)
+ : level(_level), key(_key), value(_value), seq_num(_seq_num) {
+ output_to_penultimate_level = false;
+ }
+};
+#endif /* !NDEBUG */
+
+// Return sum of sizes of all files in `files`.
+extern uint64_t TotalFileSize(const std::vector<FileMetaData*>& files);
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/compaction_iteration_stats.h b/src/rocksdb/db/compaction/compaction_iteration_stats.h
new file mode 100644
index 000000000..1b1c28b57
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_iteration_stats.h
@@ -0,0 +1,49 @@
+// Copyright (c) 2016-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <cstdint>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+struct CompactionIterationStats {
+ // Compaction statistics
+
+ // Doesn't include records skipped because of
+ // CompactionFilter::Decision::kRemoveAndSkipUntil.
+ int64_t num_record_drop_user = 0;
+
+ int64_t num_record_drop_hidden = 0;
+ int64_t num_record_drop_obsolete = 0;
+ int64_t num_record_drop_range_del = 0;
+ int64_t num_range_del_drop_obsolete = 0;
+ // Deletions obsoleted before bottom level due to file gap optimization.
+ int64_t num_optimized_del_drop_obsolete = 0;
+ uint64_t total_filter_time = 0;
+
+ // Input statistics
+ // TODO(noetzli): The stats are incomplete. They are lacking everything
+ // consumed by MergeHelper.
+ uint64_t num_input_records = 0;
+ uint64_t num_input_deletion_records = 0;
+ uint64_t num_input_corrupt_records = 0;
+ uint64_t total_input_raw_key_bytes = 0;
+ uint64_t total_input_raw_value_bytes = 0;
+
+ // Single-Delete diagnostics for exceptional situations
+ uint64_t num_single_del_fallthru = 0;
+ uint64_t num_single_del_mismatch = 0;
+
+ // Blob related statistics
+ uint64_t num_blobs_read = 0;
+ uint64_t total_blob_bytes_read = 0;
+ uint64_t num_blobs_relocated = 0;
+ uint64_t total_blob_bytes_relocated = 0;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/compaction_iterator.cc b/src/rocksdb/db/compaction/compaction_iterator.cc
new file mode 100644
index 000000000..9f54f7813
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_iterator.cc
@@ -0,0 +1,1338 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/compaction/compaction_iterator.h"
+
+#include <iterator>
+#include <limits>
+
+#include "db/blob/blob_fetcher.h"
+#include "db/blob/blob_file_builder.h"
+#include "db/blob/blob_index.h"
+#include "db/blob/prefetch_buffer_collection.h"
+#include "db/snapshot_checker.h"
+#include "logging/logging.h"
+#include "port/likely.h"
+#include "rocksdb/listener.h"
+#include "table/internal_iterator.h"
+#include "test_util/sync_point.h"
+
+namespace ROCKSDB_NAMESPACE {
+CompactionIterator::CompactionIterator(
+ InternalIterator* input, const Comparator* cmp, MergeHelper* merge_helper,
+ SequenceNumber last_sequence, std::vector<SequenceNumber>* snapshots,
+ SequenceNumber earliest_write_conflict_snapshot,
+ SequenceNumber job_snapshot, const SnapshotChecker* snapshot_checker,
+ Env* env, bool report_detailed_time, bool expect_valid_internal_key,
+ CompactionRangeDelAggregator* range_del_agg,
+ BlobFileBuilder* blob_file_builder, bool allow_data_in_errors,
+ bool enforce_single_del_contracts,
+ const std::atomic<bool>& manual_compaction_canceled,
+ const Compaction* compaction, const CompactionFilter* compaction_filter,
+ const std::atomic<bool>* shutting_down,
+ const std::shared_ptr<Logger> info_log,
+ const std::string* full_history_ts_low,
+ const SequenceNumber preserve_time_min_seqno,
+ const SequenceNumber preclude_last_level_min_seqno)
+ : CompactionIterator(
+ input, cmp, merge_helper, last_sequence, snapshots,
+ earliest_write_conflict_snapshot, job_snapshot, snapshot_checker, env,
+ report_detailed_time, expect_valid_internal_key, range_del_agg,
+ blob_file_builder, allow_data_in_errors, enforce_single_del_contracts,
+ manual_compaction_canceled,
+ std::unique_ptr<CompactionProxy>(
+ compaction ? new RealCompaction(compaction) : nullptr),
+ compaction_filter, shutting_down, info_log, full_history_ts_low,
+ preserve_time_min_seqno, preclude_last_level_min_seqno) {}
+
+CompactionIterator::CompactionIterator(
+ InternalIterator* input, const Comparator* cmp, MergeHelper* merge_helper,
+ SequenceNumber /*last_sequence*/, std::vector<SequenceNumber>* snapshots,
+ SequenceNumber earliest_write_conflict_snapshot,
+ SequenceNumber job_snapshot, const SnapshotChecker* snapshot_checker,
+ Env* env, bool report_detailed_time, bool expect_valid_internal_key,
+ CompactionRangeDelAggregator* range_del_agg,
+ BlobFileBuilder* blob_file_builder, bool allow_data_in_errors,
+ bool enforce_single_del_contracts,
+ const std::atomic<bool>& manual_compaction_canceled,
+ std::unique_ptr<CompactionProxy> compaction,
+ const CompactionFilter* compaction_filter,
+ const std::atomic<bool>* shutting_down,
+ const std::shared_ptr<Logger> info_log,
+ const std::string* full_history_ts_low,
+ const SequenceNumber preserve_time_min_seqno,
+ const SequenceNumber preclude_last_level_min_seqno)
+ : input_(input, cmp,
+ !compaction || compaction->DoesInputReferenceBlobFiles()),
+ cmp_(cmp),
+ merge_helper_(merge_helper),
+ snapshots_(snapshots),
+ earliest_write_conflict_snapshot_(earliest_write_conflict_snapshot),
+ job_snapshot_(job_snapshot),
+ snapshot_checker_(snapshot_checker),
+ env_(env),
+ clock_(env_->GetSystemClock().get()),
+ report_detailed_time_(report_detailed_time),
+ expect_valid_internal_key_(expect_valid_internal_key),
+ range_del_agg_(range_del_agg),
+ blob_file_builder_(blob_file_builder),
+ compaction_(std::move(compaction)),
+ compaction_filter_(compaction_filter),
+ shutting_down_(shutting_down),
+ manual_compaction_canceled_(manual_compaction_canceled),
+ bottommost_level_(!compaction_ ? false
+ : compaction_->bottommost_level() &&
+ !compaction_->allow_ingest_behind()),
+ // snapshots_ cannot be nullptr, but we will assert later in the body of
+ // the constructor.
+ visible_at_tip_(snapshots_ ? snapshots_->empty() : false),
+ earliest_snapshot_(!snapshots_ || snapshots_->empty()
+ ? kMaxSequenceNumber
+ : snapshots_->at(0)),
+ info_log_(info_log),
+ allow_data_in_errors_(allow_data_in_errors),
+ enforce_single_del_contracts_(enforce_single_del_contracts),
+ timestamp_size_(cmp_ ? cmp_->timestamp_size() : 0),
+ full_history_ts_low_(full_history_ts_low),
+ current_user_key_sequence_(0),
+ current_user_key_snapshot_(0),
+ merge_out_iter_(merge_helper_),
+ blob_garbage_collection_cutoff_file_number_(
+ ComputeBlobGarbageCollectionCutoffFileNumber(compaction_.get())),
+ blob_fetcher_(CreateBlobFetcherIfNeeded(compaction_.get())),
+ prefetch_buffers_(
+ CreatePrefetchBufferCollectionIfNeeded(compaction_.get())),
+ current_key_committed_(false),
+ cmp_with_history_ts_low_(0),
+ level_(compaction_ == nullptr ? 0 : compaction_->level()),
+ preserve_time_min_seqno_(preserve_time_min_seqno),
+ preclude_last_level_min_seqno_(preclude_last_level_min_seqno) {
+ assert(snapshots_ != nullptr);
+ assert(preserve_time_min_seqno_ <= preclude_last_level_min_seqno_);
+
+ if (compaction_ != nullptr) {
+ level_ptrs_ = std::vector<size_t>(compaction_->number_levels(), 0);
+ }
+#ifndef NDEBUG
+ // findEarliestVisibleSnapshot assumes this ordering.
+ for (size_t i = 1; i < snapshots_->size(); ++i) {
+ assert(snapshots_->at(i - 1) < snapshots_->at(i));
+ }
+ assert(timestamp_size_ == 0 || !full_history_ts_low_ ||
+ timestamp_size_ == full_history_ts_low_->size());
+#endif
+ input_.SetPinnedItersMgr(&pinned_iters_mgr_);
+ TEST_SYNC_POINT_CALLBACK("CompactionIterator:AfterInit", compaction_.get());
+}
+
+CompactionIterator::~CompactionIterator() {
+ // input_ Iterator lifetime is longer than pinned_iters_mgr_ lifetime
+ input_.SetPinnedItersMgr(nullptr);
+}
+
+void CompactionIterator::ResetRecordCounts() {
+ iter_stats_.num_record_drop_user = 0;
+ iter_stats_.num_record_drop_hidden = 0;
+ iter_stats_.num_record_drop_obsolete = 0;
+ iter_stats_.num_record_drop_range_del = 0;
+ iter_stats_.num_range_del_drop_obsolete = 0;
+ iter_stats_.num_optimized_del_drop_obsolete = 0;
+}
+
+void CompactionIterator::SeekToFirst() {
+ NextFromInput();
+ PrepareOutput();
+}
+
+void CompactionIterator::Next() {
+ // If there is a merge output, return it before continuing to process the
+ // input.
+ if (merge_out_iter_.Valid()) {
+ merge_out_iter_.Next();
+
+ // Check if we returned all records of the merge output.
+ if (merge_out_iter_.Valid()) {
+ key_ = merge_out_iter_.key();
+ value_ = merge_out_iter_.value();
+ Status s = ParseInternalKey(key_, &ikey_, allow_data_in_errors_);
+ // MergeUntil stops when it encounters a corrupt key and does not
+ // include it in the result, so we expect the keys here to be valid.
+ if (!s.ok()) {
+ ROCKS_LOG_FATAL(
+ info_log_, "Invalid ikey %s in compaction. %s",
+ allow_data_in_errors_ ? key_.ToString(true).c_str() : "hidden",
+ s.getState());
+ assert(false);
+ }
+
+ // Keep current_key_ in sync.
+ if (0 == timestamp_size_) {
+ current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type);
+ } else {
+ Slice ts = ikey_.GetTimestamp(timestamp_size_);
+ current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type, &ts);
+ }
+ key_ = current_key_.GetInternalKey();
+ ikey_.user_key = current_key_.GetUserKey();
+ validity_info_.SetValid(ValidContext::kMerge1);
+ } else {
+ // We consumed all pinned merge operands, release pinned iterators
+ pinned_iters_mgr_.ReleasePinnedData();
+ // MergeHelper moves the iterator to the first record after the merged
+ // records, so even though we reached the end of the merge output, we do
+ // not want to advance the iterator.
+ NextFromInput();
+ }
+ } else {
+ // Only advance the input iterator if there is no merge output and the
+ // iterator is not already at the next record.
+ if (!at_next_) {
+ AdvanceInputIter();
+ }
+ NextFromInput();
+ }
+
+ if (Valid()) {
+ // Record that we've outputted a record for the current key.
+ has_outputted_key_ = true;
+ }
+
+ PrepareOutput();
+}
+
+bool CompactionIterator::InvokeFilterIfNeeded(bool* need_skip,
+ Slice* skip_until) {
+ // TODO: support compaction filter for wide-column entities
+ if (!compaction_filter_ ||
+ (ikey_.type != kTypeValue && ikey_.type != kTypeBlobIndex)) {
+ return true;
+ }
+ bool error = false;
+ // If the user has specified a compaction filter and the sequence
+ // number is greater than any external snapshot, then invoke the
+ // filter. If the return value of the compaction filter is true,
+ // replace the entry with a deletion marker.
+ CompactionFilter::Decision filter = CompactionFilter::Decision::kUndetermined;
+ compaction_filter_value_.clear();
+ compaction_filter_skip_until_.Clear();
+ CompactionFilter::ValueType value_type =
+ ikey_.type == kTypeValue ? CompactionFilter::ValueType::kValue
+ : CompactionFilter::ValueType::kBlobIndex;
+ // Hack: pass internal key to BlobIndexCompactionFilter since it needs
+ // to get sequence number.
+ assert(compaction_filter_);
+ Slice& filter_key =
+ (ikey_.type == kTypeValue ||
+ !compaction_filter_->IsStackedBlobDbInternalCompactionFilter())
+ ? ikey_.user_key
+ : key_;
+ {
+ StopWatchNano timer(clock_, report_detailed_time_);
+ if (kTypeBlobIndex == ikey_.type) {
+ filter = compaction_filter_->FilterBlobByKey(
+ level_, filter_key, &compaction_filter_value_,
+ compaction_filter_skip_until_.rep());
+ if (CompactionFilter::Decision::kUndetermined == filter &&
+ !compaction_filter_->IsStackedBlobDbInternalCompactionFilter()) {
+ if (compaction_ == nullptr) {
+ status_ =
+ Status::Corruption("Unexpected blob index outside of compaction");
+ validity_info_.Invalidate();
+ return false;
+ }
+
+ TEST_SYNC_POINT_CALLBACK(
+ "CompactionIterator::InvokeFilterIfNeeded::TamperWithBlobIndex",
+ &value_);
+
+ // For integrated BlobDB impl, CompactionIterator reads blob value.
+ // For Stacked BlobDB impl, the corresponding CompactionFilter's
+ // FilterV2 method should read the blob value.
+ BlobIndex blob_index;
+ Status s = blob_index.DecodeFrom(value_);
+ if (!s.ok()) {
+ status_ = s;
+ validity_info_.Invalidate();
+ return false;
+ }
+
+ FilePrefetchBuffer* prefetch_buffer =
+ prefetch_buffers_ ? prefetch_buffers_->GetOrCreatePrefetchBuffer(
+ blob_index.file_number())
+ : nullptr;
+
+ uint64_t bytes_read = 0;
+
+ assert(blob_fetcher_);
+
+ s = blob_fetcher_->FetchBlob(ikey_.user_key, blob_index,
+ prefetch_buffer, &blob_value_,
+ &bytes_read);
+ if (!s.ok()) {
+ status_ = s;
+ validity_info_.Invalidate();
+ return false;
+ }
+
+ ++iter_stats_.num_blobs_read;
+ iter_stats_.total_blob_bytes_read += bytes_read;
+
+ value_type = CompactionFilter::ValueType::kValue;
+ }
+ }
+ if (CompactionFilter::Decision::kUndetermined == filter) {
+ filter = compaction_filter_->FilterV2(
+ level_, filter_key, value_type,
+ blob_value_.empty() ? value_ : blob_value_, &compaction_filter_value_,
+ compaction_filter_skip_until_.rep());
+ }
+ iter_stats_.total_filter_time +=
+ env_ != nullptr && report_detailed_time_ ? timer.ElapsedNanos() : 0;
+ }
+
+ if (CompactionFilter::Decision::kUndetermined == filter) {
+ // Should not reach here, since FilterV2 should never return kUndetermined.
+ status_ =
+ Status::NotSupported("FilterV2() should never return kUndetermined");
+ validity_info_.Invalidate();
+ return false;
+ }
+
+ if (filter == CompactionFilter::Decision::kRemoveAndSkipUntil &&
+ cmp_->Compare(*compaction_filter_skip_until_.rep(), ikey_.user_key) <=
+ 0) {
+ // Can't skip to a key smaller than the current one.
+ // Keep the key as per FilterV2 documentation.
+ filter = CompactionFilter::Decision::kKeep;
+ }
+
+ if (filter == CompactionFilter::Decision::kRemove) {
+ // convert the current key to a delete; key_ is pointing into
+ // current_key_ at this point, so updating current_key_ updates key()
+ ikey_.type = kTypeDeletion;
+ current_key_.UpdateInternalKey(ikey_.sequence, kTypeDeletion);
+ // no value associated with delete
+ value_.clear();
+ iter_stats_.num_record_drop_user++;
+ } else if (filter == CompactionFilter::Decision::kPurge) {
+ // convert the current key to a single delete; key_ is pointing into
+ // current_key_ at this point, so updating current_key_ updates key()
+ ikey_.type = kTypeSingleDeletion;
+ current_key_.UpdateInternalKey(ikey_.sequence, kTypeSingleDeletion);
+ // no value associated with single delete
+ value_.clear();
+ iter_stats_.num_record_drop_user++;
+ } else if (filter == CompactionFilter::Decision::kChangeValue) {
+ if (ikey_.type == kTypeBlobIndex) {
+ // value transfer from blob file to inlined data
+ ikey_.type = kTypeValue;
+ current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type);
+ }
+ value_ = compaction_filter_value_;
+ } else if (filter == CompactionFilter::Decision::kRemoveAndSkipUntil) {
+ *need_skip = true;
+ compaction_filter_skip_until_.ConvertFromUserKey(kMaxSequenceNumber,
+ kValueTypeForSeek);
+ *skip_until = compaction_filter_skip_until_.Encode();
+ } else if (filter == CompactionFilter::Decision::kChangeBlobIndex) {
+ // Only the StackableDB-based BlobDB impl's compaction filter should return
+ // kChangeBlobIndex. Decision about rewriting blob and changing blob index
+ // in the integrated BlobDB impl is made in subsequent call to
+ // PrepareOutput() and its callees.
+ if (!compaction_filter_->IsStackedBlobDbInternalCompactionFilter()) {
+ status_ = Status::NotSupported(
+ "Only stacked BlobDB's internal compaction filter can return "
+ "kChangeBlobIndex.");
+ validity_info_.Invalidate();
+ return false;
+ }
+ if (ikey_.type == kTypeValue) {
+ // value transfer from inlined data to blob file
+ ikey_.type = kTypeBlobIndex;
+ current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type);
+ }
+ value_ = compaction_filter_value_;
+ } else if (filter == CompactionFilter::Decision::kIOError) {
+ if (!compaction_filter_->IsStackedBlobDbInternalCompactionFilter()) {
+ status_ = Status::NotSupported(
+ "CompactionFilter for integrated BlobDB should not return kIOError");
+ validity_info_.Invalidate();
+ return false;
+ }
+ status_ = Status::IOError("Failed to access blob during compaction filter");
+ error = true;
+ }
+ return !error;
+}
+
+void CompactionIterator::NextFromInput() {
+ at_next_ = false;
+ validity_info_.Invalidate();
+
+ while (!Valid() && input_.Valid() && !IsPausingManualCompaction() &&
+ !IsShuttingDown()) {
+ key_ = input_.key();
+ value_ = input_.value();
+ blob_value_.Reset();
+ iter_stats_.num_input_records++;
+
+ Status pik_status = ParseInternalKey(key_, &ikey_, allow_data_in_errors_);
+ if (!pik_status.ok()) {
+ iter_stats_.num_input_corrupt_records++;
+
+ // If `expect_valid_internal_key_` is false, return the corrupted key
+ // and let the caller decide what to do with it.
+ if (expect_valid_internal_key_) {
+ status_ = pik_status;
+ return;
+ }
+ key_ = current_key_.SetInternalKey(key_);
+ has_current_user_key_ = false;
+ current_user_key_sequence_ = kMaxSequenceNumber;
+ current_user_key_snapshot_ = 0;
+ validity_info_.SetValid(ValidContext::kParseKeyError);
+ break;
+ }
+ TEST_SYNC_POINT_CALLBACK("CompactionIterator:ProcessKV", &ikey_);
+
+ // Update input statistics
+ if (ikey_.type == kTypeDeletion || ikey_.type == kTypeSingleDeletion ||
+ ikey_.type == kTypeDeletionWithTimestamp) {
+ iter_stats_.num_input_deletion_records++;
+ }
+ iter_stats_.total_input_raw_key_bytes += key_.size();
+ iter_stats_.total_input_raw_value_bytes += value_.size();
+
+ // If need_skip is true, we should seek the input iterator
+ // to internal key skip_until and continue from there.
+ bool need_skip = false;
+ // Points either into compaction_filter_skip_until_ or into
+ // merge_helper_->compaction_filter_skip_until_.
+ Slice skip_until;
+
+ bool user_key_equal_without_ts = false;
+ int cmp_ts = 0;
+ if (has_current_user_key_) {
+ user_key_equal_without_ts =
+ cmp_->EqualWithoutTimestamp(ikey_.user_key, current_user_key_);
+ // if timestamp_size_ > 0, then curr_ts_ has been initialized by a
+ // previous key.
+ cmp_ts = timestamp_size_ ? cmp_->CompareTimestamp(
+ ExtractTimestampFromUserKey(
+ ikey_.user_key, timestamp_size_),
+ curr_ts_)
+ : 0;
+ }
+
+ // Check whether the user key changed. After this if statement current_key_
+ // is a copy of the current input key (maybe converted to a delete by the
+ // compaction filter). ikey_.user_key is pointing to the copy.
+ if (!has_current_user_key_ || !user_key_equal_without_ts || cmp_ts != 0) {
+ // First occurrence of this user key
+ // Copy key for output
+ key_ = current_key_.SetInternalKey(key_, &ikey_);
+
+ int prev_cmp_with_ts_low =
+ !full_history_ts_low_ ? 0
+ : curr_ts_.empty()
+ ? 0
+ : cmp_->CompareTimestamp(curr_ts_, *full_history_ts_low_);
+
+      // If timestamp_size_ > 0, then copy from ikey_ to curr_ts_ for use in
+      // the next iteration to compare with the timestamp of the next key.
+ UpdateTimestampAndCompareWithFullHistoryLow();
+
+ // If
+ // (1) !has_current_user_key_, OR
+ // (2) timestamp is disabled, OR
+ // (3) all history will be preserved, OR
+ // (4) user key (excluding timestamp) is different from previous key, OR
+ // (5) timestamp is NO older than *full_history_ts_low_, OR
+ // (6) timestamp is the largest one older than full_history_ts_low_,
+ // then current_user_key_ must be treated as a different user key.
+ // This means, if a user key (excluding ts) is the same as the previous
+ // user key, and its ts is older than *full_history_ts_low_, then we
+ // consider this key for GC, e.g. it may be dropped if certain conditions
+ // match.
+ if (!has_current_user_key_ || !timestamp_size_ || !full_history_ts_low_ ||
+ !user_key_equal_without_ts || cmp_with_history_ts_low_ >= 0 ||
+ prev_cmp_with_ts_low >= 0) {
+      // Initialize for future comparison for rule (A), etc.
+ current_user_key_sequence_ = kMaxSequenceNumber;
+ current_user_key_snapshot_ = 0;
+ has_current_user_key_ = true;
+ }
+ current_user_key_ = ikey_.user_key;
+
+ has_outputted_key_ = false;
+
+ last_key_seq_zeroed_ = false;
+
+ current_key_committed_ = KeyCommitted(ikey_.sequence);
+
+ // Apply the compaction filter to the first committed version of the user
+ // key.
+ if (current_key_committed_ &&
+ !InvokeFilterIfNeeded(&need_skip, &skip_until)) {
+ break;
+ }
+ } else {
+ // Update the current key to reflect the new sequence number/type without
+ // copying the user key.
+ // TODO(rven): Compaction filter does not process keys in this path
+ // Need to have the compaction filter process multiple versions
+ // if we have versions on both sides of a snapshot
+ current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type);
+ key_ = current_key_.GetInternalKey();
+ ikey_.user_key = current_key_.GetUserKey();
+
+      // Note that a newer version of a key is ordered before older versions.
+      // If a newer version of a key is committed, so is the older version. No
+      // need to query snapshot_checker_ in that case.
+ if (UNLIKELY(!current_key_committed_)) {
+ assert(snapshot_checker_ != nullptr);
+ current_key_committed_ = KeyCommitted(ikey_.sequence);
+ // Apply the compaction filter to the first committed version of the
+ // user key.
+ if (current_key_committed_ &&
+ !InvokeFilterIfNeeded(&need_skip, &skip_until)) {
+ break;
+ }
+ }
+ }
+
+ if (UNLIKELY(!current_key_committed_)) {
+ assert(snapshot_checker_ != nullptr);
+ validity_info_.SetValid(ValidContext::kCurrentKeyUncommitted);
+ break;
+ }
+
+    // If there are no snapshots, then this kv affects visibility at tip.
+    // Otherwise, search through all existing snapshots to find the earliest
+    // snapshot that is affected by this kv.
+ SequenceNumber last_sequence = current_user_key_sequence_;
+ current_user_key_sequence_ = ikey_.sequence;
+ SequenceNumber last_snapshot = current_user_key_snapshot_;
+ SequenceNumber prev_snapshot = 0; // 0 means no previous snapshot
+ current_user_key_snapshot_ =
+ visible_at_tip_
+ ? earliest_snapshot_
+ : findEarliestVisibleSnapshot(ikey_.sequence, &prev_snapshot);
+
+ if (need_skip) {
+ // This case is handled below.
+ } else if (clear_and_output_next_key_) {
+ // In the previous iteration we encountered a single delete that we could
+      // not compact out. We will keep this Put, but can drop its data.
+ // (See Optimization 3, below.)
+ if (ikey_.type != kTypeValue && ikey_.type != kTypeBlobIndex &&
+ ikey_.type != kTypeWideColumnEntity) {
+ ROCKS_LOG_FATAL(info_log_, "Unexpected key %s for compaction output",
+ ikey_.DebugString(allow_data_in_errors_, true).c_str());
+ assert(false);
+ }
+ if (current_user_key_snapshot_ < last_snapshot) {
+ ROCKS_LOG_FATAL(info_log_,
+ "key %s, current_user_key_snapshot_ (%" PRIu64
+ ") < last_snapshot (%" PRIu64 ")",
+ ikey_.DebugString(allow_data_in_errors_, true).c_str(),
+ current_user_key_snapshot_, last_snapshot);
+ assert(false);
+ }
+
+ if (ikey_.type == kTypeBlobIndex || ikey_.type == kTypeWideColumnEntity) {
+ ikey_.type = kTypeValue;
+ current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type);
+ }
+
+ value_.clear();
+ validity_info_.SetValid(ValidContext::kKeepSDAndClearPut);
+ clear_and_output_next_key_ = false;
+ } else if (ikey_.type == kTypeSingleDeletion) {
+ // We can compact out a SingleDelete if:
+ // 1) We encounter the corresponding PUT -OR- we know that this key
+ // doesn't appear past this output level
+ // =AND=
+ // 2) We've already returned a record in this snapshot -OR-
+      //         this key is definitely visible in
+      //         earliest_write_conflict_snapshot.
+ //
+ // A note about 2) above:
+ // we try to determine whether there is any earlier write conflict
+ // checking snapshot by calling DefinitelyInSnapshot() with seq and
+ // earliest_write_conflict_snapshot as arguments. For write-prepared
+ // and write-unprepared transactions, if earliest_write_conflict_snapshot
+ // is evicted from WritePreparedTxnDB::commit_cache, then
+ // DefinitelyInSnapshot(seq, earliest_write_conflict_snapshot) returns
+ // false, even if the seq is actually visible within
+ // earliest_write_conflict_snapshot. Consequently, CompactionIterator
+ // may try to zero out its sequence number, thus hitting assertion error
+ // in debug mode or cause incorrect DBIter return result.
+ // We observe that earliest_write_conflict_snapshot >= earliest_snapshot,
+ // and the seq zeroing logic depends on
+ // DefinitelyInSnapshot(seq, earliest_snapshot). Therefore, if we cannot
+ // determine whether seq is **definitely** in
+ // earliest_write_conflict_snapshot, then we can additionally check if
+ // seq is definitely in earliest_snapshot. If the latter holds, then the
+ // former holds too.
+ //
+ // Rule 1 is needed for SingleDelete correctness. Rule 2 is needed to
+ // allow Transactions to do write-conflict checking (if we compacted away
+ // all keys, then we wouldn't know that a write happened in this
+ // snapshot). If there is no earlier snapshot, then we know that there
+ // are no active transactions that need to know about any writes.
+ //
+ // Optimization 3:
+ // If we encounter a SingleDelete followed by a PUT and Rule 2 is NOT
+ // true, then we must output a SingleDelete. In this case, we will decide
+ // to also output the PUT. While we are compacting less by outputting the
+ // PUT now, hopefully this will lead to better compaction in the future
+      // when Rule 2 is later true (i.e., we are hoping we can later compact out
+ // both the SingleDelete and the Put, while we couldn't if we only
+ // outputted the SingleDelete now).
+ // In this case, we can save space by removing the PUT's value as it will
+ // never be read.
+ //
+ // Deletes and Merges are not supported on the same key that has a
+ // SingleDelete as it is not possible to correctly do any partial
+ // compaction of such a combination of operations. The result of mixing
+ // those operations for a given key is documented as being undefined. So
+ // we can choose how to handle such a combinations of operations. We will
+ // try to compact out as much as we can in these cases.
+ // We will report counts on these anomalous cases.
+ //
+      // Note: If timestamp is enabled, the record is eligible for deletion only
+      // if, in addition to the above conditions (Rule 1 and Rule 2),
+      // full_history_ts_low_ is specified and the key's timestamp is less than
+      // *full_history_ts_low_. If it's not eligible for deletion, we will
+      // output the SingleDelete. Likewise, Optimization 3 is applied only if
+      // full_history_ts_low_ is specified and the key's timestamp is less than
+      // *full_history_ts_low_.
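+      // Worked example for Optimization 3 (no timestamps): the input contains
+      // SD@100 followed by PUT@95 for the same user key, the earliest (and
+      // earliest write-conflict) snapshot is 90, and nothing has been output
+      // yet for this user key. Rule 2 does not hold, so the SingleDelete is
+      // output as-is and the PUT is output on the next iteration with its
+      // value cleared (via clear_and_output_next_key_).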
+
+ // The easiest way to process a SingleDelete during iteration is to peek
+ // ahead at the next key.
+ const bool is_timestamp_eligible_for_gc =
+ (timestamp_size_ == 0 ||
+ (full_history_ts_low_ && cmp_with_history_ts_low_ < 0));
+
+ ParsedInternalKey next_ikey;
+ AdvanceInputIter();
+
+ // Check whether the next key exists, is not corrupt, and is the same key
+ // as the single delete.
+ if (input_.Valid() &&
+ ParseInternalKey(input_.key(), &next_ikey, allow_data_in_errors_)
+ .ok() &&
+ cmp_->EqualWithoutTimestamp(ikey_.user_key, next_ikey.user_key)) {
+#ifndef NDEBUG
+ const Compaction* c =
+ compaction_ ? compaction_->real_compaction() : nullptr;
+#endif
+ TEST_SYNC_POINT_CALLBACK(
+ "CompactionIterator::NextFromInput:SingleDelete:1",
+ const_cast<Compaction*>(c));
+ if (last_key_seq_zeroed_) {
+ ++iter_stats_.num_record_drop_hidden;
+ ++iter_stats_.num_record_drop_obsolete;
+ assert(bottommost_level_);
+ AdvanceInputIter();
+ } else if (prev_snapshot == 0 ||
+ DefinitelyNotInSnapshot(next_ikey.sequence, prev_snapshot)) {
+ // Check whether the next key belongs to the same snapshot as the
+ // SingleDelete.
+
+ TEST_SYNC_POINT_CALLBACK(
+ "CompactionIterator::NextFromInput:SingleDelete:2", nullptr);
+ if (next_ikey.type == kTypeSingleDeletion) {
+          // We encountered two SingleDeletes for the same key in a row. This
+          // could be due to unexpected user input. If write-(un)prepared
+          // transactions are used, this could also be due to releasing an old
+ // snapshot between a Put and its matching SingleDelete.
+ // Skip the first SingleDelete and let the next iteration decide
+ // how to handle the second SingleDelete.
+
+ // First SingleDelete has been skipped since we already called
+ // input_.Next().
+ ++iter_stats_.num_record_drop_obsolete;
+ ++iter_stats_.num_single_del_mismatch;
+ } else if (next_ikey.type == kTypeDeletion) {
+ std::ostringstream oss;
+ oss << "Found SD and type: " << static_cast<int>(next_ikey.type)
+ << " on the same key, violating the contract "
+ "of SingleDelete. Check your application to make sure the "
+ "application does not mix SingleDelete and Delete for "
+ "the same key. If you are using "
+ "write-prepared/write-unprepared transactions, and use "
+ "SingleDelete to delete certain keys, then make sure "
+ "TransactionDBOptions::rollback_deletion_type_callback is "
+ "configured properly. Mixing SD and DEL can lead to "
+ "undefined behaviors";
+ ++iter_stats_.num_record_drop_obsolete;
+ ++iter_stats_.num_single_del_mismatch;
+ if (enforce_single_del_contracts_) {
+ ROCKS_LOG_ERROR(info_log_, "%s", oss.str().c_str());
+ validity_info_.Invalidate();
+ status_ = Status::Corruption(oss.str());
+ return;
+ }
+ ROCKS_LOG_WARN(info_log_, "%s", oss.str().c_str());
+ } else if (!is_timestamp_eligible_for_gc) {
+ // We cannot drop the SingleDelete as timestamp is enabled, and
+ // timestamp of this key is greater than or equal to
+ // *full_history_ts_low_. We will output the SingleDelete.
+ validity_info_.SetValid(ValidContext::kKeepTsHistory);
+ } else if (has_outputted_key_ ||
+ DefinitelyInSnapshot(ikey_.sequence,
+ earliest_write_conflict_snapshot_) ||
+ (earliest_snapshot_ < earliest_write_conflict_snapshot_ &&
+ DefinitelyInSnapshot(ikey_.sequence,
+ earliest_snapshot_))) {
+ // Found a matching value, we can drop the single delete and the
+ // value. It is safe to drop both records since we've already
+ // outputted a key in this snapshot, or there is no earlier
+ // snapshot (Rule 2 above).
+
+ // Note: it doesn't matter whether the second key is a Put or if it
+ // is an unexpected Merge or Delete. We will compact it out
+ // either way. We will maintain counts of how many mismatches
+ // happened
+ if (next_ikey.type != kTypeValue &&
+ next_ikey.type != kTypeBlobIndex &&
+ next_ikey.type != kTypeWideColumnEntity) {
+ ++iter_stats_.num_single_del_mismatch;
+ }
+
+ ++iter_stats_.num_record_drop_hidden;
+ ++iter_stats_.num_record_drop_obsolete;
+ // Already called input_.Next() once. Call it a second time to
+ // skip past the second key.
+ AdvanceInputIter();
+ } else {
+ // Found a matching value, but we cannot drop both keys since
+ // there is an earlier snapshot and we need to leave behind a record
+ // to know that a write happened in this snapshot (Rule 2 above).
+ // Clear the value and output the SingleDelete. (The value will be
+ // outputted on the next iteration.)
+
+ // Setting valid_ to true will output the current SingleDelete
+ validity_info_.SetValid(ValidContext::kKeepSDForConflictCheck);
+
+ // Set up the Put to be outputted in the next iteration.
+ // (Optimization 3).
+ clear_and_output_next_key_ = true;
+ TEST_SYNC_POINT_CALLBACK(
+ "CompactionIterator::NextFromInput:KeepSDForWW",
+ /*arg=*/nullptr);
+ }
+ } else {
+ // We hit the next snapshot without hitting a put, so the iterator
+ // returns the single delete.
+ validity_info_.SetValid(ValidContext::kKeepSDForSnapshot);
+ TEST_SYNC_POINT_CALLBACK(
+ "CompactionIterator::NextFromInput:SingleDelete:3",
+ const_cast<Compaction*>(c));
+ }
+ } else {
+ // We are at the end of the input, could not parse the next key, or hit
+ // a different key. The iterator returns the single delete if the key
+ // possibly exists beyond the current output level. We set
+ // has_current_user_key to false so that if the iterator is at the next
+ // key, we do not compare it again against the previous key at the next
+ // iteration. If the next key is corrupt, we return before the
+ // comparison, so the value of has_current_user_key does not matter.
+ has_current_user_key_ = false;
+ if (compaction_ != nullptr &&
+ DefinitelyInSnapshot(ikey_.sequence, earliest_snapshot_) &&
+ compaction_->KeyNotExistsBeyondOutputLevel(ikey_.user_key,
+ &level_ptrs_) &&
+ is_timestamp_eligible_for_gc) {
+ // Key doesn't exist outside of this range.
+ // Can compact out this SingleDelete.
+ ++iter_stats_.num_record_drop_obsolete;
+ ++iter_stats_.num_single_del_fallthru;
+ if (!bottommost_level_) {
+ ++iter_stats_.num_optimized_del_drop_obsolete;
+ }
+ } else if (last_key_seq_zeroed_) {
+ // Skip.
+ ++iter_stats_.num_record_drop_hidden;
+ ++iter_stats_.num_record_drop_obsolete;
+ assert(bottommost_level_);
+ } else {
+ // Output SingleDelete
+ validity_info_.SetValid(ValidContext::kKeepSD);
+ }
+ }
+
+ if (Valid()) {
+ at_next_ = true;
+ }
+ } else if (last_snapshot == current_user_key_snapshot_ ||
+ (last_snapshot > 0 &&
+ last_snapshot < current_user_key_snapshot_)) {
+      // If the earliest snapshot in which this key is visible is the same
+      // as the earliest visible snapshot of a previous instance of the
+      // same key, then this kv is not visible in any snapshot: it is
+      // hidden by a newer entry for the same user key.
+ //
+ // Note: Dropping this key will not affect TransactionDB write-conflict
+ // checking since there has already been a record returned for this key
+ // in this snapshot.
+ if (last_sequence < current_user_key_sequence_) {
+ ROCKS_LOG_FATAL(info_log_,
+ "key %s, last_sequence (%" PRIu64
+ ") < current_user_key_sequence_ (%" PRIu64 ")",
+ ikey_.DebugString(allow_data_in_errors_, true).c_str(),
+ last_sequence, current_user_key_sequence_);
+ assert(false);
+ }
+
+ ++iter_stats_.num_record_drop_hidden; // rule (A)
+ AdvanceInputIter();
+ } else if (compaction_ != nullptr &&
+ (ikey_.type == kTypeDeletion ||
+ (ikey_.type == kTypeDeletionWithTimestamp &&
+ cmp_with_history_ts_low_ < 0)) &&
+ DefinitelyInSnapshot(ikey_.sequence, earliest_snapshot_) &&
+ compaction_->KeyNotExistsBeyondOutputLevel(ikey_.user_key,
+ &level_ptrs_)) {
+ // TODO(noetzli): This is the only place where we use compaction_
+ // (besides the constructor). We should probably get rid of this
+ // dependency and find a way to do similar filtering during flushes.
+ //
+ // For this user key:
+ // (1) there is no data in higher levels
+ // (2) data in lower levels will have larger sequence numbers
+ // (3) data in layers that are being compacted here and have
+ // smaller sequence numbers will be dropped in the next
+ // few iterations of this loop (by rule (A) above).
+ // Therefore this deletion marker is obsolete and can be dropped.
+ //
+ // Note: Dropping this Delete will not affect TransactionDB
+ // write-conflict checking since it is earlier than any snapshot.
+ //
+      // It seems that we could also drop a deletion later than the earliest
+      // snapshot given that:
+      // (1) The deletion is earlier than earliest_write_conflict_snapshot, and
+      // (2) No value exists earlier than the deletion.
+ //
+ // Note also that a deletion marker of type kTypeDeletionWithTimestamp
+ // will be treated as a different user key unless the timestamp is older
+ // than *full_history_ts_low_.
+ ++iter_stats_.num_record_drop_obsolete;
+ if (!bottommost_level_) {
+ ++iter_stats_.num_optimized_del_drop_obsolete;
+ }
+ AdvanceInputIter();
+ } else if ((ikey_.type == kTypeDeletion ||
+ (ikey_.type == kTypeDeletionWithTimestamp &&
+ cmp_with_history_ts_low_ < 0)) &&
+ bottommost_level_) {
+      // Handle the case where we have a delete key at the bottommost level.
+      // We can skip outputting the key iff there are no subsequent puts for
+      // this key.
+ assert(!compaction_ || compaction_->KeyNotExistsBeyondOutputLevel(
+ ikey_.user_key, &level_ptrs_));
+ ParsedInternalKey next_ikey;
+ AdvanceInputIter();
+#ifndef NDEBUG
+ const Compaction* c =
+ compaction_ ? compaction_->real_compaction() : nullptr;
+#endif
+ TEST_SYNC_POINT_CALLBACK(
+ "CompactionIterator::NextFromInput:BottommostDelete:1",
+ const_cast<Compaction*>(c));
+ // Skip over all versions of this key that happen to occur in the same
+ // snapshot range as the delete.
+ //
+ // Note that a deletion marker of type kTypeDeletionWithTimestamp will be
+ // considered to have a different user key unless the timestamp is older
+ // than *full_history_ts_low_.
+ while (!IsPausingManualCompaction() && !IsShuttingDown() &&
+ input_.Valid() &&
+ (ParseInternalKey(input_.key(), &next_ikey, allow_data_in_errors_)
+ .ok()) &&
+ cmp_->EqualWithoutTimestamp(ikey_.user_key, next_ikey.user_key) &&
+ (prev_snapshot == 0 ||
+ DefinitelyNotInSnapshot(next_ikey.sequence, prev_snapshot))) {
+ AdvanceInputIter();
+ }
+      // If we find we still need to output a row with this key, we need to
+      // output the delete too.
+ if (input_.Valid() &&
+ (ParseInternalKey(input_.key(), &next_ikey, allow_data_in_errors_)
+ .ok()) &&
+ cmp_->EqualWithoutTimestamp(ikey_.user_key, next_ikey.user_key)) {
+ validity_info_.SetValid(ValidContext::kKeepDel);
+ at_next_ = true;
+ }
+ } else if (ikey_.type == kTypeMerge) {
+ if (!merge_helper_->HasOperator()) {
+ status_ = Status::InvalidArgument(
+ "merge_operator is not properly initialized.");
+ return;
+ }
+
+ pinned_iters_mgr_.StartPinning();
+
+ // We know the merge type entry is not hidden, otherwise we would
+      // have hit (A).
+      // We encapsulate the merge-related state machine in a different
+      // object to minimize changes to the existing flow.
+ Status s = merge_helper_->MergeUntil(
+ &input_, range_del_agg_, prev_snapshot, bottommost_level_,
+ allow_data_in_errors_, blob_fetcher_.get(), full_history_ts_low_,
+ prefetch_buffers_.get(), &iter_stats_);
+ merge_out_iter_.SeekToFirst();
+
+ if (!s.ok() && !s.IsMergeInProgress()) {
+ status_ = s;
+ return;
+ } else if (merge_out_iter_.Valid()) {
+ // NOTE: key, value, and ikey_ refer to old entries.
+ // These will be correctly set below.
+ key_ = merge_out_iter_.key();
+ value_ = merge_out_iter_.value();
+ pik_status = ParseInternalKey(key_, &ikey_, allow_data_in_errors_);
+        // MergeUntil stops when it encounters a corrupt key and does not
+        // include it in the result, so we expect the keys here to be valid.
+ if (!pik_status.ok()) {
+ ROCKS_LOG_FATAL(
+ info_log_, "Invalid key %s in compaction. %s",
+ allow_data_in_errors_ ? key_.ToString(true).c_str() : "hidden",
+ pik_status.getState());
+ assert(false);
+ }
+ // Keep current_key_ in sync.
+ current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type);
+ key_ = current_key_.GetInternalKey();
+ ikey_.user_key = current_key_.GetUserKey();
+ validity_info_.SetValid(ValidContext::kMerge2);
+ } else {
+        // All merge operands were filtered out. Reset the user key, since the
+        // batch consumed by the merge operator should not shadow any keys
+        // coming after the merges.
+ has_current_user_key_ = false;
+ pinned_iters_mgr_.ReleasePinnedData();
+
+ if (merge_helper_->FilteredUntil(&skip_until)) {
+ need_skip = true;
+ }
+ }
+ } else {
+ // 1. new user key -OR-
+ // 2. different snapshot stripe
+ // If user-defined timestamp is enabled, we consider keys for GC if they
+ // are below history_ts_low_. CompactionRangeDelAggregator::ShouldDelete()
+ // only considers range deletions that are at or below history_ts_low_ and
+ // trim_ts_. We drop keys here that are below history_ts_low_ and are
+ // covered by a range tombstone that is at or below history_ts_low_ and
+      // trim_ts_.
+ bool should_delete = false;
+ if (!timestamp_size_ || cmp_with_history_ts_low_ < 0) {
+ should_delete = range_del_agg_->ShouldDelete(
+ key_, RangeDelPositioningMode::kForwardTraversal);
+ }
+ if (should_delete) {
+ ++iter_stats_.num_record_drop_hidden;
+ ++iter_stats_.num_record_drop_range_del;
+ AdvanceInputIter();
+ } else {
+ validity_info_.SetValid(ValidContext::kNewUserKey);
+ }
+ }
+
+ if (need_skip) {
+ SkipUntil(skip_until);
+ }
+ }
+
+ if (!Valid() && IsShuttingDown()) {
+ status_ = Status::ShutdownInProgress();
+ }
+
+ if (IsPausingManualCompaction()) {
+ status_ = Status::Incomplete(Status::SubCode::kManualCompactionPaused);
+ }
+
+  // Propagate corruption status from the memtable iterator
+ if (!input_.Valid() && input_.status().IsCorruption()) {
+ status_ = input_.status();
+ }
+}
+
+bool CompactionIterator::ExtractLargeValueIfNeededImpl() {
+ if (!blob_file_builder_) {
+ return false;
+ }
+
+ blob_index_.clear();
+ const Status s = blob_file_builder_->Add(user_key(), value_, &blob_index_);
+
+ if (!s.ok()) {
+ status_ = s;
+ validity_info_.Invalidate();
+
+ return false;
+ }
+
+ if (blob_index_.empty()) {
+ return false;
+ }
+
+ value_ = blob_index_;
+
+ return true;
+}
+
+void CompactionIterator::ExtractLargeValueIfNeeded() {
+ assert(ikey_.type == kTypeValue);
+
+ if (!ExtractLargeValueIfNeededImpl()) {
+ return;
+ }
+
+ ikey_.type = kTypeBlobIndex;
+ current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type);
+}
+
+void CompactionIterator::GarbageCollectBlobIfNeeded() {
+ assert(ikey_.type == kTypeBlobIndex);
+
+ if (!compaction_) {
+ return;
+ }
+
+ // GC for integrated BlobDB
+ if (compaction_->enable_blob_garbage_collection()) {
+ TEST_SYNC_POINT_CALLBACK(
+ "CompactionIterator::GarbageCollectBlobIfNeeded::TamperWithBlobIndex",
+ &value_);
+
+ BlobIndex blob_index;
+
+ {
+ const Status s = blob_index.DecodeFrom(value_);
+
+ if (!s.ok()) {
+ status_ = s;
+ validity_info_.Invalidate();
+
+ return;
+ }
+ }
+
+ if (blob_index.file_number() >=
+ blob_garbage_collection_cutoff_file_number_) {
+ return;
+ }
+
+ FilePrefetchBuffer* prefetch_buffer =
+ prefetch_buffers_ ? prefetch_buffers_->GetOrCreatePrefetchBuffer(
+ blob_index.file_number())
+ : nullptr;
+
+ uint64_t bytes_read = 0;
+
+ {
+ assert(blob_fetcher_);
+
+ const Status s = blob_fetcher_->FetchBlob(
+ user_key(), blob_index, prefetch_buffer, &blob_value_, &bytes_read);
+
+ if (!s.ok()) {
+ status_ = s;
+ validity_info_.Invalidate();
+
+ return;
+ }
+ }
+
+ ++iter_stats_.num_blobs_read;
+ iter_stats_.total_blob_bytes_read += bytes_read;
+
+ ++iter_stats_.num_blobs_relocated;
+ iter_stats_.total_blob_bytes_relocated += blob_index.size();
+
+ value_ = blob_value_;
+
+ if (ExtractLargeValueIfNeededImpl()) {
+ return;
+ }
+
+ ikey_.type = kTypeValue;
+ current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type);
+
+ return;
+ }
+
+ // GC for stacked BlobDB
+ if (compaction_filter_ &&
+ compaction_filter_->IsStackedBlobDbInternalCompactionFilter()) {
+ const auto blob_decision = compaction_filter_->PrepareBlobOutput(
+ user_key(), value_, &compaction_filter_value_);
+
+ if (blob_decision == CompactionFilter::BlobDecision::kCorruption) {
+ status_ =
+ Status::Corruption("Corrupted blob reference encountered during GC");
+ validity_info_.Invalidate();
+
+ return;
+ }
+
+ if (blob_decision == CompactionFilter::BlobDecision::kIOError) {
+ status_ = Status::IOError("Could not relocate blob during GC");
+ validity_info_.Invalidate();
+
+ return;
+ }
+
+ if (blob_decision == CompactionFilter::BlobDecision::kChangeValue) {
+ value_ = compaction_filter_value_;
+
+ return;
+ }
+ }
+}
+
+void CompactionIterator::DecideOutputLevel() {
+ assert(compaction_->SupportsPerKeyPlacement());
+#ifndef NDEBUG
+ // Could be overridden by unittest
+ PerKeyPlacementContext context(level_, ikey_.user_key, value_,
+ ikey_.sequence);
+ TEST_SYNC_POINT_CALLBACK("CompactionIterator::PrepareOutput.context",
+ &context);
+ output_to_penultimate_level_ = context.output_to_penultimate_level;
+#else
+ output_to_penultimate_level_ = false;
+#endif // NDEBUG
+
+  // If the key is newer than the cutoff sequence or newer than the earliest
+  // snapshot, it should be output to the penultimate level.
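+  // For example, with preclude_last_level_min_seqno_ == 500 and
+  // earliest_snapshot_ == 800, a key at sequence 600 is routed to the
+  // penultimate level (subject to the safety check below), while a key at
+  // sequence 100 is not forced there by this rule.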
+ if (ikey_.sequence > preclude_last_level_min_seqno_ ||
+ ikey_.sequence > earliest_snapshot_) {
+ output_to_penultimate_level_ = true;
+ }
+
+ if (output_to_penultimate_level_) {
+ // If it's decided to output to the penultimate level, but unsafe to do so,
+ // still output to the last level. For example, moving the data from a lower
+ // level to a higher level outside of the higher-level input key range is
+ // considered unsafe, because the key may conflict with higher-level SSTs
+ // not from this compaction.
+ // TODO: add statistic for declined output_to_penultimate_level
+ bool safe_to_penultimate_level =
+ compaction_->WithinPenultimateLevelOutputRange(ikey_.user_key);
+ if (!safe_to_penultimate_level) {
+ output_to_penultimate_level_ = false;
+      // This can happen when `last_level_temperature` is disabled or enabled
+      // while holding a snapshot. When `last_level_temperature` is not set
+      // (== kUnknown), the data newer than any snapshot is pushed to the last
+      // level, but when the per_key_placement feature is enabled on the fly,
+      // the data newer than the snapshot has to be moved to the penultimate
+      // level, which may or may not be safe. So the user needs to make sure
+      // all snapshots are released before enabling the `last_level_temperature`
+      // feature. We will migrate the feature to `last_level_temperature` and
+      // maybe make it not dynamically changeable.
+ if (ikey_.sequence > earliest_snapshot_) {
+ status_ = Status::Corruption(
+ "Unsafe to store Seq later than snapshot in the last level if "
+ "per_key_placement is enabled");
+ }
+ }
+ }
+}
+
+void CompactionIterator::PrepareOutput() {
+ if (Valid()) {
+ if (ikey_.type == kTypeValue) {
+ ExtractLargeValueIfNeeded();
+ } else if (ikey_.type == kTypeBlobIndex) {
+ GarbageCollectBlobIfNeeded();
+ }
+
+ if (compaction_ != nullptr && compaction_->SupportsPerKeyPlacement()) {
+ DecideOutputLevel();
+ }
+
+ // Zeroing out the sequence number leads to better compression.
+ // If this is the bottommost level (no files in lower levels)
+ // and the earliest snapshot is larger than this seqno
+ // and the userkey differs from the last userkey in compaction
+ // then we can squash the seqno to zero.
+ //
+ // This is safe for TransactionDB write-conflict checking since transactions
+    // only care about sequence numbers larger than any active snapshot.
+    //
+    // Can we do the same for levels above the bottom level as long as
+    // KeyNotExistsBeyondOutputLevel() returns true?
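+    // For example, with no live snapshots (earliest_snapshot_ ==
+    // kMaxSequenceNumber), a committed PUT@42 at the bottommost level that
+    // satisfies the checks below has its sequence number rewritten to 0.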
+ if (Valid() && compaction_ != nullptr &&
+ !compaction_->allow_ingest_behind() && bottommost_level_ &&
+ DefinitelyInSnapshot(ikey_.sequence, earliest_snapshot_) &&
+ ikey_.type != kTypeMerge && current_key_committed_ &&
+ !output_to_penultimate_level_ &&
+ ikey_.sequence < preserve_time_min_seqno_) {
+ if (ikey_.type == kTypeDeletion ||
+ (ikey_.type == kTypeSingleDeletion && timestamp_size_ == 0)) {
+ ROCKS_LOG_FATAL(
+ info_log_,
+ "Unexpected key %s for seq-zero optimization. "
+ "earliest_snapshot %" PRIu64
+ ", earliest_write_conflict_snapshot %" PRIu64
+ " job_snapshot %" PRIu64
+ ". timestamp_size: %d full_history_ts_low_ %s. validity %x",
+ ikey_.DebugString(allow_data_in_errors_, true).c_str(),
+ earliest_snapshot_, earliest_write_conflict_snapshot_,
+ job_snapshot_, static_cast<int>(timestamp_size_),
+ full_history_ts_low_ != nullptr
+ ? Slice(*full_history_ts_low_).ToString(true).c_str()
+ : "null",
+ validity_info_.rep);
+ assert(false);
+ }
+ ikey_.sequence = 0;
+ last_key_seq_zeroed_ = true;
+ TEST_SYNC_POINT_CALLBACK("CompactionIterator::PrepareOutput:ZeroingSeq",
+ &ikey_);
+ if (!timestamp_size_) {
+ current_key_.UpdateInternalKey(0, ikey_.type);
+ } else if (full_history_ts_low_ && cmp_with_history_ts_low_ < 0) {
+ // We can also zero out timestamp for better compression.
+ // For the same user key (excluding timestamp), the timestamp-based
+ // history can be collapsed to save some space if the timestamp is
+ // older than *full_history_ts_low_.
+ const std::string kTsMin(timestamp_size_, static_cast<char>(0));
+ const Slice ts_slice = kTsMin;
+ ikey_.SetTimestamp(ts_slice);
+ current_key_.UpdateInternalKey(0, ikey_.type, &ts_slice);
+ }
+ }
+ }
+}
+
+inline SequenceNumber CompactionIterator::findEarliestVisibleSnapshot(
+ SequenceNumber in, SequenceNumber* prev_snapshot) {
+ assert(snapshots_->size());
+ if (snapshots_->size() == 0) {
+ ROCKS_LOG_FATAL(info_log_,
+ "No snapshot left in findEarliestVisibleSnapshot");
+ }
+ auto snapshots_iter =
+ std::lower_bound(snapshots_->begin(), snapshots_->end(), in);
+ assert(prev_snapshot != nullptr);
+ if (snapshots_iter == snapshots_->begin()) {
+ *prev_snapshot = 0;
+ } else {
+ *prev_snapshot = *std::prev(snapshots_iter);
+ if (*prev_snapshot >= in) {
+ ROCKS_LOG_FATAL(info_log_,
+ "*prev_snapshot (%" PRIu64 ") >= in (%" PRIu64
+ ") in findEarliestVisibleSnapshot",
+ *prev_snapshot, in);
+ assert(false);
+ }
+ }
+ if (snapshot_checker_ == nullptr) {
+ return snapshots_iter != snapshots_->end() ? *snapshots_iter
+ : kMaxSequenceNumber;
+ }
+ bool has_released_snapshot = !released_snapshots_.empty();
+ for (; snapshots_iter != snapshots_->end(); ++snapshots_iter) {
+ auto cur = *snapshots_iter;
+ if (in > cur) {
+ ROCKS_LOG_FATAL(info_log_,
+ "in (%" PRIu64 ") > cur (%" PRIu64
+ ") in findEarliestVisibleSnapshot",
+ in, cur);
+ assert(false);
+ }
+ // Skip if cur is in released_snapshots.
+ if (has_released_snapshot && released_snapshots_.count(cur) > 0) {
+ continue;
+ }
+ auto res = snapshot_checker_->CheckInSnapshot(in, cur);
+ if (res == SnapshotCheckerResult::kInSnapshot) {
+ return cur;
+ } else if (res == SnapshotCheckerResult::kSnapshotReleased) {
+ released_snapshots_.insert(cur);
+ }
+ *prev_snapshot = cur;
+ }
+ return kMaxSequenceNumber;
+}
+
+uint64_t CompactionIterator::ComputeBlobGarbageCollectionCutoffFileNumber(
+ const CompactionProxy* compaction) {
+ if (!compaction) {
+ return 0;
+ }
+
+ if (!compaction->enable_blob_garbage_collection()) {
+ return 0;
+ }
+
+ const Version* const version = compaction->input_version();
+ assert(version);
+
+ const VersionStorageInfo* const storage_info = version->storage_info();
+ assert(storage_info);
+
+ const auto& blob_files = storage_info->GetBlobFiles();
+
+ const size_t cutoff_index = static_cast<size_t>(
+ compaction->blob_garbage_collection_age_cutoff() * blob_files.size());
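+  // For example, with 10 blob files and an age cutoff of 0.3, cutoff_index
+  // is 3, so blobs referencing the 3 oldest files fall below the returned
+  // cutoff file number and become eligible for relocation.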
+
+ if (cutoff_index >= blob_files.size()) {
+ return std::numeric_limits<uint64_t>::max();
+ }
+
+ const auto& meta = blob_files[cutoff_index];
+ assert(meta);
+
+ return meta->GetBlobFileNumber();
+}
+
+std::unique_ptr<BlobFetcher> CompactionIterator::CreateBlobFetcherIfNeeded(
+ const CompactionProxy* compaction) {
+ if (!compaction) {
+ return nullptr;
+ }
+
+ const Version* const version = compaction->input_version();
+ if (!version) {
+ return nullptr;
+ }
+
+ ReadOptions read_options;
+ read_options.fill_cache = false;
+
+ return std::unique_ptr<BlobFetcher>(new BlobFetcher(version, read_options));
+}
+
+std::unique_ptr<PrefetchBufferCollection>
+CompactionIterator::CreatePrefetchBufferCollectionIfNeeded(
+ const CompactionProxy* compaction) {
+ if (!compaction) {
+ return nullptr;
+ }
+
+ if (!compaction->input_version()) {
+ return nullptr;
+ }
+
+ if (compaction->allow_mmap_reads()) {
+ return nullptr;
+ }
+
+ const uint64_t readahead_size = compaction->blob_compaction_readahead_size();
+ if (!readahead_size) {
+ return nullptr;
+ }
+
+ return std::unique_ptr<PrefetchBufferCollection>(
+ new PrefetchBufferCollection(readahead_size));
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/compaction_iterator.h b/src/rocksdb/db/compaction/compaction_iterator.h
new file mode 100644
index 000000000..c215d2bbb
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_iterator.h
@@ -0,0 +1,513 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include <algorithm>
+#include <cinttypes>
+#include <deque>
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+#include "db/compaction/compaction.h"
+#include "db/compaction/compaction_iteration_stats.h"
+#include "db/merge_helper.h"
+#include "db/pinned_iterators_manager.h"
+#include "db/range_del_aggregator.h"
+#include "db/snapshot_checker.h"
+#include "options/cf_options.h"
+#include "rocksdb/compaction_filter.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class BlobFileBuilder;
+class BlobFetcher;
+class PrefetchBufferCollection;
+
+// A wrapper of an internal iterator whose purpose is to count how
+// many entries there are in the iterator.
+class SequenceIterWrapper : public InternalIterator {
+ public:
+ SequenceIterWrapper(InternalIterator* iter, const Comparator* cmp,
+ bool need_count_entries)
+ : icmp_(cmp),
+ inner_iter_(iter),
+ need_count_entries_(need_count_entries) {}
+ bool Valid() const override { return inner_iter_->Valid(); }
+ Status status() const override { return inner_iter_->status(); }
+ void Next() override {
+ num_itered_++;
+ inner_iter_->Next();
+ }
+ void Seek(const Slice& target) override {
+ if (!need_count_entries_) {
+ inner_iter_->Seek(target);
+ } else {
+      // For flush cases, we need to count the total number of entries, so we
+ // do Next() rather than Seek().
+ while (inner_iter_->Valid() &&
+ icmp_.Compare(inner_iter_->key(), target) < 0) {
+ Next();
+ }
+ }
+ }
+ Slice key() const override { return inner_iter_->key(); }
+ Slice value() const override { return inner_iter_->value(); }
+
+ // Unused InternalIterator methods
+ void SeekToFirst() override { assert(false); }
+ void Prev() override { assert(false); }
+ void SeekForPrev(const Slice& /* target */) override { assert(false); }
+ void SeekToLast() override { assert(false); }
+
+ uint64_t num_itered() const { return num_itered_; }
+
+ private:
+ InternalKeyComparator icmp_;
+ InternalIterator* inner_iter_; // not owned
+ uint64_t num_itered_ = 0;
+ bool need_count_entries_;
+};
+
+class CompactionIterator {
+ public:
+ // A wrapper around Compaction. Has a much smaller interface, only what
+ // CompactionIterator uses. Tests can override it.
+ class CompactionProxy {
+ public:
+ virtual ~CompactionProxy() = default;
+
+ virtual int level() const = 0;
+
+ virtual bool KeyNotExistsBeyondOutputLevel(
+ const Slice& user_key, std::vector<size_t>* level_ptrs) const = 0;
+
+ virtual bool bottommost_level() const = 0;
+
+ virtual int number_levels() const = 0;
+
+ // Result includes timestamp if user-defined timestamp is enabled.
+ virtual Slice GetLargestUserKey() const = 0;
+
+ virtual bool allow_ingest_behind() const = 0;
+
+ virtual bool allow_mmap_reads() const = 0;
+
+ virtual bool enable_blob_garbage_collection() const = 0;
+
+ virtual double blob_garbage_collection_age_cutoff() const = 0;
+
+ virtual uint64_t blob_compaction_readahead_size() const = 0;
+
+ virtual const Version* input_version() const = 0;
+
+ virtual bool DoesInputReferenceBlobFiles() const = 0;
+
+ virtual const Compaction* real_compaction() const = 0;
+
+ virtual bool SupportsPerKeyPlacement() const = 0;
+
+ // `key` includes timestamp if user-defined timestamp is enabled.
+ virtual bool WithinPenultimateLevelOutputRange(const Slice& key) const = 0;
+ };
+
+ class RealCompaction : public CompactionProxy {
+ public:
+ explicit RealCompaction(const Compaction* compaction)
+ : compaction_(compaction) {
+ assert(compaction_);
+ assert(compaction_->immutable_options());
+ assert(compaction_->mutable_cf_options());
+ }
+
+ int level() const override { return compaction_->level(); }
+
+ bool KeyNotExistsBeyondOutputLevel(
+ const Slice& user_key, std::vector<size_t>* level_ptrs) const override {
+ return compaction_->KeyNotExistsBeyondOutputLevel(user_key, level_ptrs);
+ }
+
+ bool bottommost_level() const override {
+ return compaction_->bottommost_level();
+ }
+
+ int number_levels() const override { return compaction_->number_levels(); }
+
+ // Result includes timestamp if user-defined timestamp is enabled.
+ Slice GetLargestUserKey() const override {
+ return compaction_->GetLargestUserKey();
+ }
+
+ bool allow_ingest_behind() const override {
+ return compaction_->immutable_options()->allow_ingest_behind;
+ }
+
+ bool allow_mmap_reads() const override {
+ return compaction_->immutable_options()->allow_mmap_reads;
+ }
+
+ bool enable_blob_garbage_collection() const override {
+ return compaction_->enable_blob_garbage_collection();
+ }
+
+ double blob_garbage_collection_age_cutoff() const override {
+ return compaction_->blob_garbage_collection_age_cutoff();
+ }
+
+ uint64_t blob_compaction_readahead_size() const override {
+ return compaction_->mutable_cf_options()->blob_compaction_readahead_size;
+ }
+
+ const Version* input_version() const override {
+ return compaction_->input_version();
+ }
+
+ bool DoesInputReferenceBlobFiles() const override {
+ return compaction_->DoesInputReferenceBlobFiles();
+ }
+
+ const Compaction* real_compaction() const override { return compaction_; }
+
+ bool SupportsPerKeyPlacement() const override {
+ return compaction_->SupportsPerKeyPlacement();
+ }
+
+ // Check if key is within penultimate level output range, to see if it's
+ // safe to output to the penultimate level for per_key_placement feature.
+ // `key` includes timestamp if user-defined timestamp is enabled.
+ bool WithinPenultimateLevelOutputRange(const Slice& key) const override {
+ return compaction_->WithinPenultimateLevelOutputRange(key);
+ }
+
+ private:
+ const Compaction* compaction_;
+ };
+
+ CompactionIterator(
+ InternalIterator* input, const Comparator* cmp, MergeHelper* merge_helper,
+ SequenceNumber last_sequence, std::vector<SequenceNumber>* snapshots,
+ SequenceNumber earliest_write_conflict_snapshot,
+ SequenceNumber job_snapshot, const SnapshotChecker* snapshot_checker,
+ Env* env, bool report_detailed_time, bool expect_valid_internal_key,
+ CompactionRangeDelAggregator* range_del_agg,
+ BlobFileBuilder* blob_file_builder, bool allow_data_in_errors,
+ bool enforce_single_del_contracts,
+ const std::atomic<bool>& manual_compaction_canceled,
+ const Compaction* compaction = nullptr,
+ const CompactionFilter* compaction_filter = nullptr,
+ const std::atomic<bool>* shutting_down = nullptr,
+ const std::shared_ptr<Logger> info_log = nullptr,
+ const std::string* full_history_ts_low = nullptr,
+ const SequenceNumber preserve_time_min_seqno = kMaxSequenceNumber,
+ const SequenceNumber preclude_last_level_min_seqno = kMaxSequenceNumber);
+
+ // Constructor with custom CompactionProxy, used for tests.
+ CompactionIterator(
+ InternalIterator* input, const Comparator* cmp, MergeHelper* merge_helper,
+ SequenceNumber last_sequence, std::vector<SequenceNumber>* snapshots,
+ SequenceNumber earliest_write_conflict_snapshot,
+ SequenceNumber job_snapshot, const SnapshotChecker* snapshot_checker,
+ Env* env, bool report_detailed_time, bool expect_valid_internal_key,
+ CompactionRangeDelAggregator* range_del_agg,
+ BlobFileBuilder* blob_file_builder, bool allow_data_in_errors,
+ bool enforce_single_del_contracts,
+ const std::atomic<bool>& manual_compaction_canceled,
+ std::unique_ptr<CompactionProxy> compaction,
+ const CompactionFilter* compaction_filter = nullptr,
+ const std::atomic<bool>* shutting_down = nullptr,
+ const std::shared_ptr<Logger> info_log = nullptr,
+ const std::string* full_history_ts_low = nullptr,
+ const SequenceNumber preserve_time_min_seqno = kMaxSequenceNumber,
+ const SequenceNumber preclude_last_level_min_seqno = kMaxSequenceNumber);
+
+ ~CompactionIterator();
+
+ void ResetRecordCounts();
+
+ // Seek to the beginning of the compaction iterator output.
+ //
+ // REQUIRED: Call only once.
+ void SeekToFirst();
+
+ // Produces the next record in the compaction.
+ //
+ // REQUIRED: SeekToFirst() has been called.
+ void Next();
+
+ // Getters
+ const Slice& key() const { return key_; }
+ const Slice& value() const { return value_; }
+ const Status& status() const { return status_; }
+ const ParsedInternalKey& ikey() const { return ikey_; }
+ inline bool Valid() const { return validity_info_.IsValid(); }
+ const Slice& user_key() const { return current_user_key_; }
+ const CompactionIterationStats& iter_stats() const { return iter_stats_; }
+ uint64_t num_input_entry_scanned() const { return input_.num_itered(); }
+  // Whether the current key should be placed on the penultimate level; only
+  // valid if per_key_placement is supported.
+ bool output_to_penultimate_level() const {
+ return output_to_penultimate_level_;
+ }
+ Status InputStatus() const { return input_.status(); }
+
+ private:
+ // Processes the input stream to find the next output
+ void NextFromInput();
+
+  // Do final preparations before presenting the output to the caller.
+ void PrepareOutput();
+
+  // Decide whether the current key should be output to the last level or the
+  // penultimate level; only called for compactions that support per-key
+  // placement.
+ void DecideOutputLevel();
+
+ // Passes the output value to the blob file builder (if any), and replaces it
+ // with the corresponding blob reference if it has been actually written to a
+ // blob file (i.e. if it passed the value size check). Returns true if the
+ // value got extracted to a blob file, false otherwise.
+ bool ExtractLargeValueIfNeededImpl();
+
+ // Extracts large values as described above, and updates the internal key's
+ // type to kTypeBlobIndex if the value got extracted. Should only be called
+ // for regular values (kTypeValue).
+ void ExtractLargeValueIfNeeded();
+
+ // Relocates valid blobs residing in the oldest blob files if garbage
+ // collection is enabled. Relocated blobs are written to new blob files or
+ // inlined in the LSM tree depending on the current settings (i.e.
+ // enable_blob_files and min_blob_size). Should only be called for blob
+ // references (kTypeBlobIndex).
+ //
+ // Note: the stacked BlobDB implementation's compaction filter based GC
+ // algorithm is also called from here.
+ void GarbageCollectBlobIfNeeded();
+
+ // Invoke compaction filter if needed.
+ // Return true on success, false on failures (e.g.: kIOError).
+ bool InvokeFilterIfNeeded(bool* need_skip, Slice* skip_until);
+
+ // Given a sequence number, return the sequence number of the
+ // earliest snapshot that this sequence number is visible in.
+ // The snapshots themselves are arranged in ascending order of
+ // sequence numbers.
+ // Employ a sequential search because the total number of
+  // snapshots is typically small.
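+  // For example, with snapshots {10, 20, 30} and in == 15, this returns 20
+  // and sets *prev_snapshot to 10 (assuming there is no snapshot_checker_).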
+ inline SequenceNumber findEarliestVisibleSnapshot(
+ SequenceNumber in, SequenceNumber* prev_snapshot);
+
+ inline bool KeyCommitted(SequenceNumber sequence) {
+ return snapshot_checker_ == nullptr ||
+ snapshot_checker_->CheckInSnapshot(sequence, job_snapshot_) ==
+ SnapshotCheckerResult::kInSnapshot;
+ }
+
+ bool DefinitelyInSnapshot(SequenceNumber seq, SequenceNumber snapshot);
+
+ bool DefinitelyNotInSnapshot(SequenceNumber seq, SequenceNumber snapshot);
+
+ // Extract user-defined timestamp from user key if possible and compare it
+ // with *full_history_ts_low_ if applicable.
+ inline void UpdateTimestampAndCompareWithFullHistoryLow() {
+ if (!timestamp_size_) {
+ return;
+ }
+ Slice ts = ExtractTimestampFromUserKey(ikey_.user_key, timestamp_size_);
+ curr_ts_.assign(ts.data(), ts.size());
+ if (full_history_ts_low_) {
+ cmp_with_history_ts_low_ =
+ cmp_->CompareTimestamp(ts, *full_history_ts_low_);
+ }
+ }
+
+ static uint64_t ComputeBlobGarbageCollectionCutoffFileNumber(
+ const CompactionProxy* compaction);
+ static std::unique_ptr<BlobFetcher> CreateBlobFetcherIfNeeded(
+ const CompactionProxy* compaction);
+ static std::unique_ptr<PrefetchBufferCollection>
+ CreatePrefetchBufferCollectionIfNeeded(const CompactionProxy* compaction);
+
+ SequenceIterWrapper input_;
+ const Comparator* cmp_;
+ MergeHelper* merge_helper_;
+ const std::vector<SequenceNumber>* snapshots_;
+ // List of snapshots released during compaction.
+  // findEarliestVisibleSnapshot() finds them from the return value of
+  // snapshot_checker_, and makes sure they will not be returned as the
+  // earliest visible snapshot of an older value.
+ // See WritePreparedTransactionTest::ReleaseSnapshotDuringCompaction3.
+ std::unordered_set<SequenceNumber> released_snapshots_;
+ const SequenceNumber earliest_write_conflict_snapshot_;
+ const SequenceNumber job_snapshot_;
+ const SnapshotChecker* const snapshot_checker_;
+ Env* env_;
+ SystemClock* clock_;
+ const bool report_detailed_time_;
+ const bool expect_valid_internal_key_;
+ CompactionRangeDelAggregator* range_del_agg_;
+ BlobFileBuilder* blob_file_builder_;
+ std::unique_ptr<CompactionProxy> compaction_;
+ const CompactionFilter* compaction_filter_;
+ const std::atomic<bool>* shutting_down_;
+ const std::atomic<bool>& manual_compaction_canceled_;
+ const bool bottommost_level_;
+ const bool visible_at_tip_;
+ const SequenceNumber earliest_snapshot_;
+
+ std::shared_ptr<Logger> info_log_;
+
+ const bool allow_data_in_errors_;
+
+ const bool enforce_single_del_contracts_;
+
+ // Comes from comparator.
+ const size_t timestamp_size_;
+
+ // Lower bound timestamp to retain full history in terms of user-defined
+ // timestamp. If a key's timestamp is older than full_history_ts_low_, then
+ // the key *may* be eligible for garbage collection (GC). The skipping logic
+ // is in `NextFromInput()` and `PrepareOutput()`.
+ // If nullptr, NO GC will be performed and all history will be preserved.
+ const std::string* const full_history_ts_low_;
+
+ // State
+ //
+ enum ValidContext : uint8_t {
+ kMerge1 = 0,
+ kMerge2 = 1,
+ kParseKeyError = 2,
+ kCurrentKeyUncommitted = 3,
+ kKeepSDAndClearPut = 4,
+ kKeepTsHistory = 5,
+ kKeepSDForConflictCheck = 6,
+ kKeepSDForSnapshot = 7,
+ kKeepSD = 8,
+ kKeepDel = 9,
+ kNewUserKey = 10,
+ };
+
+ struct ValidityInfo {
+ inline bool IsValid() const { return rep & 1; }
+ ValidContext GetContext() const {
+ return static_cast<ValidContext>(rep >> 1);
+ }
+ inline void SetValid(uint8_t ctx) { rep = (ctx << 1) | 1; }
+ inline void Invalidate() { rep = 0; }
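+    // For example, SetValid(kKeepSD) stores (8 << 1) | 1 == 0x11 in rep, and
+    // GetContext() recovers kKeepSD by shifting out the low validity bit.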
+
+ uint8_t rep{0};
+ } validity_info_;
+
+ // Points to a copy of the current compaction iterator output (current_key_)
+ // if valid.
+ Slice key_;
+ // Points to the value in the underlying iterator that corresponds to the
+ // current output.
+ Slice value_;
+ // The status is OK unless compaction iterator encounters a merge operand
+ // while not having a merge operator defined.
+ Status status_;
+ // Stores the user key, sequence number and type of the current compaction
+ // iterator output (or current key in the underlying iterator during
+ // NextFromInput()).
+ ParsedInternalKey ikey_;
+ // Stores whether ikey_.user_key is valid. If set to false, the user key is
+ // not compared against the current key in the underlying iterator.
+ bool has_current_user_key_ = false;
+ // If false, the iterator holds a copy of the current compaction iterator
+ // output (or current key in the underlying iterator during NextFromInput()).
+ bool at_next_ = false;
+
+ IterKey current_key_;
+ Slice current_user_key_;
+ std::string curr_ts_;
+ SequenceNumber current_user_key_sequence_;
+ SequenceNumber current_user_key_snapshot_;
+
+ // True if the iterator has already returned a record for the current key.
+ bool has_outputted_key_ = false;
+
+  // If true, clear the value of the next key and output it without applying
+  // any compaction rules. This is used for outputting a put after a single
+  // delete.
+ bool clear_and_output_next_key_ = false;
+
+ MergeOutputIterator merge_out_iter_;
+ // PinnedIteratorsManager used to pin input_ Iterator blocks while reading
+  // merge operands and then release them after consuming them.
+ PinnedIteratorsManager pinned_iters_mgr_;
+
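+  // Blob files with a file number strictly below this cutoff are considered
+  // old enough for blob garbage collection; see
+  // ComputeBlobGarbageCollectionCutoffFileNumber().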
+ uint64_t blob_garbage_collection_cutoff_file_number_;
+
+ std::unique_ptr<BlobFetcher> blob_fetcher_;
+ std::unique_ptr<PrefetchBufferCollection> prefetch_buffers_;
+
+ std::string blob_index_;
+ PinnableSlice blob_value_;
+ std::string compaction_filter_value_;
+ InternalKey compaction_filter_skip_until_;
+ // "level_ptrs" holds indices that remember which file of an associated
+ // level we were last checking during the last call to compaction->
+ // KeyNotExistsBeyondOutputLevel(). This allows future calls to the function
+  // to pick up where it left off, since each subcompaction's key range is
+  // increasing, so a later call to the function must be looking for a key
+  // that is in or beyond the last file checked during the previous call.
+ std::vector<size_t> level_ptrs_;
+ CompactionIterationStats iter_stats_;
+
+ // Used to avoid purging uncommitted values. The application can specify
+ // uncommitted values by providing a SnapshotChecker object.
+ bool current_key_committed_;
+
+ // Saved result of ucmp->CompareTimestamp(current_ts_, *full_history_ts_low_)
+ int cmp_with_history_ts_low_;
+
+ const int level_;
+
+ // True if the previous internal key (same user key)'s sequence number has
+ // just been zeroed out during bottommost compaction.
+ bool last_key_seq_zeroed_{false};
+
+ // True if the current key should be output to the penultimate level if
+ // possible, compaction logic makes the final decision on which level to
+ // output to.
+ bool output_to_penultimate_level_{false};
+
+ // min seqno for preserving the time information.
+ const SequenceNumber preserve_time_min_seqno_ = kMaxSequenceNumber;
+
+  // Min seqno to preclude the data from the last level. If the key's seqno is
+  // larger than this, it will be output to the penultimate level.
+ const SequenceNumber preclude_last_level_min_seqno_ = kMaxSequenceNumber;
+
+ void AdvanceInputIter() { input_.Next(); }
+
+ void SkipUntil(const Slice& skip_until) { input_.Seek(skip_until); }
+
+ bool IsShuttingDown() {
+ // This is a best-effort facility, so memory_order_relaxed is sufficient.
+ return shutting_down_ && shutting_down_->load(std::memory_order_relaxed);
+ }
+
+ bool IsPausingManualCompaction() {
+ // This is a best-effort facility, so memory_order_relaxed is sufficient.
+ return manual_compaction_canceled_.load(std::memory_order_relaxed);
+ }
+};
+
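+// Note that these two checks are not exact complements when a
+// snapshot_checker_ is present: a CheckInSnapshot() result that is neither
+// kInSnapshot nor kNotInSnapshot (e.g., kSnapshotReleased) makes both
+// "definitely" predicates return false for the same (seq, snapshot) pair.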
+inline bool CompactionIterator::DefinitelyInSnapshot(SequenceNumber seq,
+ SequenceNumber snapshot) {
+ return ((seq) <= (snapshot) &&
+ (snapshot_checker_ == nullptr ||
+ LIKELY(snapshot_checker_->CheckInSnapshot((seq), (snapshot)) ==
+ SnapshotCheckerResult::kInSnapshot)));
+}
+
+inline bool CompactionIterator::DefinitelyNotInSnapshot(
+ SequenceNumber seq, SequenceNumber snapshot) {
+ return ((seq) > (snapshot) ||
+ (snapshot_checker_ != nullptr &&
+ UNLIKELY(snapshot_checker_->CheckInSnapshot((seq), (snapshot)) ==
+ SnapshotCheckerResult::kNotInSnapshot)));
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/compaction_iterator_test.cc b/src/rocksdb/db/compaction/compaction_iterator_test.cc
new file mode 100644
index 000000000..81362d792
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_iterator_test.cc
@@ -0,0 +1,1618 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/compaction/compaction_iterator.h"
+
+#include <string>
+#include <vector>
+
+#include "db/dbformat.h"
+#include "port/port.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/string_util.h"
+#include "util/vector_iterator.h"
+#include "utilities/merge_operators.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Expects no merging attempts.
+class NoMergingMergeOp : public MergeOperator {
+ public:
+ bool FullMergeV2(const MergeOperationInput& /*merge_in*/,
+ MergeOperationOutput* /*merge_out*/) const override {
+ ADD_FAILURE();
+ return false;
+ }
+ bool PartialMergeMulti(const Slice& /*key*/,
+ const std::deque<Slice>& /*operand_list*/,
+ std::string* /*new_value*/,
+ Logger* /*logger*/) const override {
+ ADD_FAILURE();
+ return false;
+ }
+ const char* Name() const override {
+ return "CompactionIteratorTest NoMergingMergeOp";
+ }
+};
+
+// Compaction filter that gets stuck when it sees a particular key,
+// then gets unstuck when told to.
+// Always returns Decision::kRemove.
+class StallingFilter : public CompactionFilter {
+ public:
+ Decision FilterV2(int /*level*/, const Slice& key, ValueType /*type*/,
+ const Slice& /*existing_value*/, std::string* /*new_value*/,
+ std::string* /*skip_until*/) const override {
+ int k = std::atoi(key.ToString().c_str());
+ last_seen.store(k);
+ while (k >= stall_at.load()) {
+ std::this_thread::yield();
+ }
+ return Decision::kRemove;
+ }
+
+ const char* Name() const override {
+ return "CompactionIteratorTest StallingFilter";
+ }
+
+ // Wait until the filter sees a key >= k and stalls at that key.
+ // If `exact`, asserts that the seen key is equal to k.
+ void WaitForStall(int k, bool exact = true) {
+ stall_at.store(k);
+ while (last_seen.load() < k) {
+ std::this_thread::yield();
+ }
+ if (exact) {
+ EXPECT_EQ(k, last_seen.load());
+ }
+ }
+
+ // Filter will stall on key >= stall_at. Advance stall_at to unstall.
+ mutable std::atomic<int> stall_at{0};
+ // Last key the filter was called with.
+ mutable std::atomic<int> last_seen{0};
+};
+
+// Compaction filter that filters out all keys.
+class FilterAllKeysCompactionFilter : public CompactionFilter {
+ public:
+ Decision FilterV2(int /*level*/, const Slice& /*key*/, ValueType /*type*/,
+ const Slice& /*existing_value*/, std::string* /*new_value*/,
+ std::string* /*skip_until*/) const override {
+ return Decision::kRemove;
+ }
+
+ const char* Name() const override { return "AllKeysCompactionFilter"; }
+};
+
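+// Forward-only iterator over an in-memory key/value vector that logs every
+// SeekToFirst()/Seek()/Next() call, so tests can assert the exact access
+// pattern the compaction iterator performs on its input.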
+class LoggingForwardVectorIterator : public VectorIterator {
+ public:
+ struct Action {
+ enum class Type {
+ SEEK_TO_FIRST,
+ SEEK,
+ NEXT,
+ };
+
+ Type type;
+ std::string arg;
+
+ explicit Action(Type _type, std::string _arg = "")
+ : type(_type), arg(_arg) {}
+
+ bool operator==(const Action& rhs) const {
+ return std::tie(type, arg) == std::tie(rhs.type, rhs.arg);
+ }
+ };
+
+ LoggingForwardVectorIterator(const std::vector<std::string>& keys,
+ const std::vector<std::string>& values)
+ : VectorIterator(keys, values) {
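+ // Start out invalid; tests position the input explicitly via SeekToFirst()
+ // or Seek().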
+ current_ = keys_.size();
+ }
+
+ void SeekToFirst() override {
+ log.emplace_back(Action::Type::SEEK_TO_FIRST);
+ VectorIterator::SeekToFirst();
+ }
+ void SeekToLast() override { assert(false); }
+
+ void Seek(const Slice& target) override {
+ log.emplace_back(Action::Type::SEEK, target.ToString());
+ VectorIterator::Seek(target);
+ }
+
+ void SeekForPrev(const Slice& /*target*/) override { assert(false); }
+
+ void Next() override {
+ assert(Valid());
+ log.emplace_back(Action::Type::NEXT);
+ VectorIterator::Next();
+ }
+ void Prev() override { assert(false); }
+
+ Slice key() const override {
+ assert(Valid());
+ return VectorIterator::key();
+ }
+ Slice value() const override {
+ assert(Valid());
+ return VectorIterator::value();
+ }
+
+ std::vector<Action> log;
+};
+
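+// Minimal CompactionProxy stub. Tests toggle the public flags below
+// (bottommost level, ingest-behind, per-key placement, ...) to emulate
+// different compaction configurations.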
+class FakeCompaction : public CompactionIterator::CompactionProxy {
+ public:
+ int level() const override { return 0; }
+
+ bool KeyNotExistsBeyondOutputLevel(
+ const Slice& /*user_key*/,
+ std::vector<size_t>* /*level_ptrs*/) const override {
+ return is_bottommost_level || key_not_exists_beyond_output_level;
+ }
+
+ bool bottommost_level() const override { return is_bottommost_level; }
+
+ int number_levels() const override { return 1; }
+
+ Slice GetLargestUserKey() const override {
+ return "\xff\xff\xff\xff\xff\xff\xff\xff\xff";
+ }
+
+ bool allow_ingest_behind() const override { return is_allow_ingest_behind; }
+
+ bool allow_mmap_reads() const override { return false; }
+
+ bool enable_blob_garbage_collection() const override { return false; }
+
+ double blob_garbage_collection_age_cutoff() const override { return 0.0; }
+
+ uint64_t blob_compaction_readahead_size() const override { return 0; }
+
+ const Version* input_version() const override { return nullptr; }
+
+ bool DoesInputReferenceBlobFiles() const override { return false; }
+
+ const Compaction* real_compaction() const override { return nullptr; }
+
+ bool SupportsPerKeyPlacement() const override {
+ return supports_per_key_placement;
+ }
+
+ bool WithinPenultimateLevelOutputRange(const Slice& key) const override {
+ return (!key.starts_with("unsafe_pb"));
+ }
+
+ bool key_not_exists_beyond_output_level = false;
+
+ bool is_bottommost_level = false;
+
+ bool is_allow_ingest_behind = false;
+
+ bool supports_per_key_placement = false;
+};
+
+// A simplified snapshot checker which assumes each snapshot has a global
+// last visible sequence.
+class TestSnapshotChecker : public SnapshotChecker {
+ public:
+ explicit TestSnapshotChecker(
+ SequenceNumber last_committed_sequence,
+ const std::unordered_map<SequenceNumber, SequenceNumber>& snapshots =
+ {{}})
+ : last_committed_sequence_(last_committed_sequence),
+ snapshots_(snapshots) {}
+
+ SnapshotCheckerResult CheckInSnapshot(
+ SequenceNumber seq, SequenceNumber snapshot_seq) const override {
+ if (snapshot_seq == kMaxSequenceNumber) {
+ return seq <= last_committed_sequence_
+ ? SnapshotCheckerResult::kInSnapshot
+ : SnapshotCheckerResult::kNotInSnapshot;
+ }
+ assert(snapshots_.count(snapshot_seq) > 0);
+ return seq <= snapshots_.at(snapshot_seq)
+ ? SnapshotCheckerResult::kInSnapshot
+ : SnapshotCheckerResult::kNotInSnapshot;
+ }
+
+ private:
+ SequenceNumber last_committed_sequence_;
+ // A map from each valid snapshot to the last sequence visible to it.
+ std::unordered_map<SequenceNumber, SequenceNumber> snapshots_;
+};
+
+// Test param:
+// bool: whether to pass snapshot_checker to compaction iterator.
+class CompactionIteratorTest : public testing::TestWithParam<bool> {
+ public:
+ CompactionIteratorTest()
+ : cmp_(BytewiseComparator()), icmp_(cmp_), snapshots_({}) {}
+
+ explicit CompactionIteratorTest(const Comparator* ucmp)
+ : cmp_(ucmp), icmp_(cmp_), snapshots_({}) {}
+
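+ // Builds the range-tombstone aggregator, optional compaction proxy and
+ // snapshot checker, the merge helper, and the CompactionIterator under test
+ // over the given keys/values and range tombstones.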
+ void InitIterators(
+ const std::vector<std::string>& ks, const std::vector<std::string>& vs,
+ const std::vector<std::string>& range_del_ks,
+ const std::vector<std::string>& range_del_vs,
+ SequenceNumber last_sequence,
+ SequenceNumber last_committed_sequence = kMaxSequenceNumber,
+ MergeOperator* merge_op = nullptr, CompactionFilter* filter = nullptr,
+ bool bottommost_level = false,
+ SequenceNumber earliest_write_conflict_snapshot = kMaxSequenceNumber,
+ bool key_not_exists_beyond_output_level = false,
+ const std::string* full_history_ts_low = nullptr) {
+ std::unique_ptr<InternalIterator> unfragmented_range_del_iter(
+ new VectorIterator(range_del_ks, range_del_vs, &icmp_));
+ auto tombstone_list = std::make_shared<FragmentedRangeTombstoneList>(
+ std::move(unfragmented_range_del_iter), icmp_);
+ std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter(
+ new FragmentedRangeTombstoneIterator(tombstone_list, icmp_,
+ kMaxSequenceNumber));
+ range_del_agg_.reset(new CompactionRangeDelAggregator(&icmp_, snapshots_));
+ range_del_agg_->AddTombstones(std::move(range_del_iter));
+
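+ // Only create a compaction proxy when the test needs compaction context:
+ // a compaction filter, a bottommost-level output, or output-level info.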
+ std::unique_ptr<CompactionIterator::CompactionProxy> compaction;
+ if (filter || bottommost_level || key_not_exists_beyond_output_level) {
+ compaction_proxy_ = new FakeCompaction();
+ compaction_proxy_->is_bottommost_level = bottommost_level;
+ compaction_proxy_->is_allow_ingest_behind = AllowIngestBehind();
+ compaction_proxy_->key_not_exists_beyond_output_level =
+ key_not_exists_beyond_output_level;
+ compaction_proxy_->supports_per_key_placement = SupportsPerKeyPlacement();
+ compaction.reset(compaction_proxy_);
+ }
+ bool use_snapshot_checker = UseSnapshotChecker() || GetParam();
+ if (use_snapshot_checker || last_committed_sequence < kMaxSequenceNumber) {
+ snapshot_checker_.reset(
+ new TestSnapshotChecker(last_committed_sequence, snapshot_map_));
+ }
+ merge_helper_.reset(
+ new MergeHelper(Env::Default(), cmp_, merge_op, filter, nullptr, false,
+ 0 /*latest_snapshot*/, snapshot_checker_.get(),
+ 0 /*level*/, nullptr /*statistics*/, &shutting_down_));
+
+ if (c_iter_) {
+ // Since iter_ is still used in ~CompactionIterator(), we call
+ // ~CompactionIterator() first.
+ c_iter_.reset();
+ }
+ iter_.reset(new LoggingForwardVectorIterator(ks, vs));
+ iter_->SeekToFirst();
+ c_iter_.reset(new CompactionIterator(
+ iter_.get(), cmp_, merge_helper_.get(), last_sequence, &snapshots_,
+ earliest_write_conflict_snapshot, kMaxSequenceNumber,
+ snapshot_checker_.get(), Env::Default(),
+ false /* report_detailed_time */, false, range_del_agg_.get(),
+ nullptr /* blob_file_builder */, true /*allow_data_in_errors*/,
+ true /*enforce_single_del_contracts*/,
+ /*manual_compaction_canceled=*/kManualCompactionCanceledFalse_,
+ std::move(compaction), filter, &shutting_down_, /*info_log=*/nullptr,
+ full_history_ts_low));
+ }
+
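+ // Registers a snapshot; for the snapshot-checker tests, `last_visible_seq`
+ // is the last sequence number visible to that snapshot.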
+ void AddSnapshot(SequenceNumber snapshot,
+ SequenceNumber last_visible_seq = kMaxSequenceNumber) {
+ snapshots_.push_back(snapshot);
+ snapshot_map_[snapshot] = last_visible_seq;
+ }
+
+ virtual bool UseSnapshotChecker() const { return false; }
+
+ virtual bool AllowIngestBehind() const { return false; }
+
+ virtual bool SupportsPerKeyPlacement() const { return false; }
+
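+ // Feeds `input_keys`/`input_values` through a fresh CompactionIterator and
+ // asserts that it emits exactly `expected_keys`/`expected_values`, in order.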
+ void RunTest(
+ const std::vector<std::string>& input_keys,
+ const std::vector<std::string>& input_values,
+ const std::vector<std::string>& expected_keys,
+ const std::vector<std::string>& expected_values,
+ SequenceNumber last_committed_seq = kMaxSequenceNumber,
+ MergeOperator* merge_operator = nullptr,
+ CompactionFilter* compaction_filter = nullptr,
+ bool bottommost_level = false,
+ SequenceNumber earliest_write_conflict_snapshot = kMaxSequenceNumber,
+ bool key_not_exists_beyond_output_level = false,
+ const std::string* full_history_ts_low = nullptr) {
+ InitIterators(input_keys, input_values, {}, {}, kMaxSequenceNumber,
+ last_committed_seq, merge_operator, compaction_filter,
+ bottommost_level, earliest_write_conflict_snapshot,
+ key_not_exists_beyond_output_level, full_history_ts_low);
+ c_iter_->SeekToFirst();
+ for (size_t i = 0; i < expected_keys.size(); i++) {
+ std::string info = "i = " + std::to_string(i);
+ ASSERT_TRUE(c_iter_->Valid()) << info;
+ ASSERT_OK(c_iter_->status()) << info;
+ ASSERT_EQ(expected_keys[i], c_iter_->key().ToString()) << info;
+ ASSERT_EQ(expected_values[i], c_iter_->value().ToString()) << info;
+ c_iter_->Next();
+ }
+ ASSERT_OK(c_iter_->status());
+ ASSERT_FALSE(c_iter_->Valid());
+ }
+
+ void ClearSnapshots() {
+ snapshots_.clear();
+ snapshot_map_.clear();
+ }
+
+ const Comparator* cmp_;
+ const InternalKeyComparator icmp_;
+ std::vector<SequenceNumber> snapshots_;
+ // A map from each valid snapshot to the last sequence visible to it.
+ std::unordered_map<SequenceNumber, SequenceNumber> snapshot_map_;
+ std::unique_ptr<MergeHelper> merge_helper_;
+ std::unique_ptr<LoggingForwardVectorIterator> iter_;
+ std::unique_ptr<CompactionIterator> c_iter_;
+ std::unique_ptr<CompactionRangeDelAggregator> range_del_agg_;
+ std::unique_ptr<SnapshotChecker> snapshot_checker_;
+ std::atomic<bool> shutting_down_{false};
+ const std::atomic<bool> kManualCompactionCanceledFalse_{false};
+ FakeCompaction* compaction_proxy_ = nullptr;
+};
+
+// It is possible that the output of the compaction iterator is empty even if
+// the input is not.
+TEST_P(CompactionIteratorTest, EmptyResult) {
+ InitIterators({test::KeyStr("a", 5, kTypeSingleDeletion),
+ test::KeyStr("a", 3, kTypeValue)},
+ {"", "val"}, {}, {}, 5);
+ c_iter_->SeekToFirst();
+ ASSERT_OK(c_iter_->status());
+ ASSERT_FALSE(c_iter_->Valid());
+}
+
+// If there is a corruption after a single deletion, the corrupted key should
+// be preserved.
+TEST_P(CompactionIteratorTest, CorruptionAfterSingleDeletion) {
+ InitIterators({test::KeyStr("a", 5, kTypeSingleDeletion),
+ test::KeyStr("a", 3, kTypeValue, true),
+ test::KeyStr("b", 10, kTypeValue)},
+ {"", "val", "val2"}, {}, {}, 10);
+ c_iter_->SeekToFirst();
+ ASSERT_TRUE(c_iter_->Valid());
+ ASSERT_EQ(test::KeyStr("a", 5, kTypeSingleDeletion),
+ c_iter_->key().ToString());
+ c_iter_->Next();
+ ASSERT_TRUE(c_iter_->Valid());
+ ASSERT_EQ(test::KeyStr("a", 3, kTypeValue, true), c_iter_->key().ToString());
+ c_iter_->Next();
+ ASSERT_TRUE(c_iter_->Valid());
+ ASSERT_EQ(test::KeyStr("b", 10, kTypeValue), c_iter_->key().ToString());
+ c_iter_->Next();
+ ASSERT_OK(c_iter_->status());
+ ASSERT_FALSE(c_iter_->Valid());
+}
+
+TEST_P(CompactionIteratorTest, SimpleRangeDeletion) {
+ InitIterators({test::KeyStr("morning", 5, kTypeValue),
+ test::KeyStr("morning", 2, kTypeValue),
+ test::KeyStr("night", 3, kTypeValue)},
+ {"zao", "zao", "wan"},
+ {test::KeyStr("ma", 4, kTypeRangeDeletion)}, {"mz"}, 5);
+ c_iter_->SeekToFirst();
+ ASSERT_TRUE(c_iter_->Valid());
+ ASSERT_EQ(test::KeyStr("morning", 5, kTypeValue), c_iter_->key().ToString());
+ c_iter_->Next();
+ ASSERT_TRUE(c_iter_->Valid());
+ ASSERT_EQ(test::KeyStr("night", 3, kTypeValue), c_iter_->key().ToString());
+ c_iter_->Next();
+ ASSERT_OK(c_iter_->status());
+ ASSERT_FALSE(c_iter_->Valid());
+}
+
+TEST_P(CompactionIteratorTest, RangeDeletionWithSnapshots) {
+ AddSnapshot(10);
+ std::vector<std::string> ks1;
+ ks1.push_back(test::KeyStr("ma", 28, kTypeRangeDeletion));
+ std::vector<std::string> vs1{"mz"};
+ std::vector<std::string> ks2{test::KeyStr("morning", 15, kTypeValue),
+ test::KeyStr("morning", 5, kTypeValue),
+ test::KeyStr("night", 40, kTypeValue),
+ test::KeyStr("night", 20, kTypeValue)};
+ std::vector<std::string> vs2{"zao 15", "zao 5", "wan 40", "wan 20"};
+ InitIterators(ks2, vs2, ks1, vs1, 40);
+ c_iter_->SeekToFirst();
+ ASSERT_TRUE(c_iter_->Valid());
+ ASSERT_EQ(test::KeyStr("morning", 5, kTypeValue), c_iter_->key().ToString());
+ c_iter_->Next();
+ ASSERT_TRUE(c_iter_->Valid());
+ ASSERT_EQ(test::KeyStr("night", 40, kTypeValue), c_iter_->key().ToString());
+ c_iter_->Next();
+ ASSERT_OK(c_iter_->status());
+ ASSERT_FALSE(c_iter_->Valid());
+}
+
+TEST_P(CompactionIteratorTest, CompactionFilterSkipUntil) {
+ class Filter : public CompactionFilter {
+ Decision FilterV2(int /*level*/, const Slice& key, ValueType t,
+ const Slice& existing_value, std::string* /*new_value*/,
+ std::string* skip_until) const override {
+ std::string k = key.ToString();
+ std::string v = existing_value.ToString();
+ // See the InitIterators() call below for the sequence of keys and their
+ // filtering decisions. Here we assert that the compaction filter is called
+ // only with the expected keys, and with the right values.
+ if (k == "a") {
+ EXPECT_EQ(ValueType::kValue, t);
+ EXPECT_EQ("av50", v);
+ return Decision::kKeep;
+ }
+ if (k == "b") {
+ EXPECT_EQ(ValueType::kValue, t);
+ EXPECT_EQ("bv60", v);
+ *skip_until = "d+";
+ return Decision::kRemoveAndSkipUntil;
+ }
+ if (k == "e") {
+ EXPECT_EQ(ValueType::kMergeOperand, t);
+ EXPECT_EQ("em71", v);
+ return Decision::kKeep;
+ }
+ if (k == "f") {
+ if (v == "fm65") {
+ EXPECT_EQ(ValueType::kMergeOperand, t);
+ *skip_until = "f";
+ } else {
+ EXPECT_EQ("fm30", v);
+ EXPECT_EQ(ValueType::kMergeOperand, t);
+ *skip_until = "g+";
+ }
+ return Decision::kRemoveAndSkipUntil;
+ }
+ if (k == "h") {
+ EXPECT_EQ(ValueType::kValue, t);
+ EXPECT_EQ("hv91", v);
+ return Decision::kKeep;
+ }
+ if (k == "i") {
+ EXPECT_EQ(ValueType::kMergeOperand, t);
+ EXPECT_EQ("im95", v);
+ *skip_until = "z";
+ return Decision::kRemoveAndSkipUntil;
+ }
+ ADD_FAILURE();
+ return Decision::kKeep;
+ }
+
+ const char* Name() const override {
+ return "CompactionIteratorTest.CompactionFilterSkipUntil::Filter";
+ }
+ };
+
+ NoMergingMergeOp merge_op;
+ Filter filter;
+ InitIterators(
+ {test::KeyStr("a", 50, kTypeValue), // keep
+ test::KeyStr("a", 45, kTypeMerge),
+ test::KeyStr("b", 60, kTypeValue), // skip to "d+"
+ test::KeyStr("b", 40, kTypeValue), test::KeyStr("c", 35, kTypeValue),
+ test::KeyStr("d", 70, kTypeMerge),
+ test::KeyStr("e", 71, kTypeMerge), // keep
+ test::KeyStr("f", 65, kTypeMerge), // skip to "f", aka keep
+ test::KeyStr("f", 30, kTypeMerge), // skip to "g+"
+ test::KeyStr("f", 25, kTypeValue), test::KeyStr("g", 90, kTypeValue),
+ test::KeyStr("h", 91, kTypeValue), // keep
+ test::KeyStr("i", 95, kTypeMerge), // skip to "z"
+ test::KeyStr("j", 99, kTypeValue)},
+ {"av50", "am45", "bv60", "bv40", "cv35", "dm70", "em71", "fm65", "fm30",
+ "fv25", "gv90", "hv91", "im95", "jv99"},
+ {}, {}, kMaxSequenceNumber, kMaxSequenceNumber, &merge_op, &filter);
+
+ // Compaction should output just "a", "e" and "h" keys.
+ c_iter_->SeekToFirst();
+ ASSERT_TRUE(c_iter_->Valid());
+ ASSERT_EQ(test::KeyStr("a", 50, kTypeValue), c_iter_->key().ToString());
+ ASSERT_EQ("av50", c_iter_->value().ToString());
+ c_iter_->Next();
+ ASSERT_TRUE(c_iter_->Valid());
+ ASSERT_EQ(test::KeyStr("e", 71, kTypeMerge), c_iter_->key().ToString());
+ ASSERT_EQ("em71", c_iter_->value().ToString());
+ c_iter_->Next();
+ ASSERT_TRUE(c_iter_->Valid());
+ ASSERT_EQ(test::KeyStr("h", 91, kTypeValue), c_iter_->key().ToString());
+ ASSERT_EQ("hv91", c_iter_->value().ToString());
+ c_iter_->Next();
+ ASSERT_OK(c_iter_->status());
+ ASSERT_FALSE(c_iter_->Valid());
+
+ // Check that the compaction iterator did the correct sequence of calls on
+ // the underlying iterator.
+ using A = LoggingForwardVectorIterator::Action;
+ using T = A::Type;
+ std::vector<A> expected_actions = {
+ A(T::SEEK_TO_FIRST),
+ A(T::NEXT),
+ A(T::NEXT),
+ A(T::SEEK, test::KeyStr("d+", kMaxSequenceNumber, kValueTypeForSeek)),
+ A(T::NEXT),
+ A(T::NEXT),
+ A(T::SEEK, test::KeyStr("g+", kMaxSequenceNumber, kValueTypeForSeek)),
+ A(T::NEXT),
+ A(T::SEEK, test::KeyStr("z", kMaxSequenceNumber, kValueTypeForSeek))};
+ ASSERT_EQ(expected_actions, iter_->log);
+}
+
+TEST_P(CompactionIteratorTest, ShuttingDownInFilter) {
+ NoMergingMergeOp merge_op;
+ StallingFilter filter;
+ InitIterators(
+ {test::KeyStr("1", 1, kTypeValue), test::KeyStr("2", 2, kTypeValue),
+ test::KeyStr("3", 3, kTypeValue), test::KeyStr("4", 4, kTypeValue)},
+ {"v1", "v2", "v3", "v4"}, {}, {}, kMaxSequenceNumber, kMaxSequenceNumber,
+ &merge_op, &filter);
+ // Don't leave tombstones (kTypeDeletion) for filtered keys.
+ compaction_proxy_->key_not_exists_beyond_output_level = true;
+
+ std::atomic<bool> seek_done{false};
+ ROCKSDB_NAMESPACE::port::Thread compaction_thread([&] {
+ c_iter_->SeekToFirst();
+ EXPECT_FALSE(c_iter_->Valid());
+ EXPECT_TRUE(c_iter_->status().IsShutdownInProgress());
+ seek_done.store(true);
+ });
+
+ // Let key 1 through.
+ filter.WaitForStall(1);
+
+ // Shutdown during compaction filter call for key 2.
+ filter.WaitForStall(2);
+ shutting_down_.store(true);
+ EXPECT_FALSE(seek_done.load());
+
+ // Unstall filter and wait for SeekToFirst() to return.
+ filter.stall_at.store(3);
+ compaction_thread.join();
+ assert(seek_done.load());
+
+ // Check that filter was never called again.
+ EXPECT_EQ(2, filter.last_seen.load());
+}
+
+// Same as ShuttingDownInFilter, but shutdown happens during filter call for
+// a merge operand, not for a value.
+TEST_P(CompactionIteratorTest, ShuttingDownInMerge) {
+ NoMergingMergeOp merge_op;
+ StallingFilter filter;
+ InitIterators(
+ {test::KeyStr("1", 1, kTypeValue), test::KeyStr("2", 2, kTypeMerge),
+ test::KeyStr("3", 3, kTypeMerge), test::KeyStr("4", 4, kTypeValue)},
+ {"v1", "v2", "v3", "v4"}, {}, {}, kMaxSequenceNumber, kMaxSequenceNumber,
+ &merge_op, &filter);
+ compaction_proxy_->key_not_exists_beyond_output_level = true;
+
+ std::atomic<bool> seek_done{false};
+ ROCKSDB_NAMESPACE::port::Thread compaction_thread([&] {
+ c_iter_->SeekToFirst();
+ ASSERT_FALSE(c_iter_->Valid());
+ ASSERT_TRUE(c_iter_->status().IsShutdownInProgress());
+ seek_done.store(true);
+ });
+
+ // Let key 1 through.
+ filter.WaitForStall(1);
+
+ // Shutdown during compaction filter call for key 2.
+ filter.WaitForStall(2);
+ shutting_down_.store(true);
+ EXPECT_FALSE(seek_done.load());
+
+ // Unstall filter and wait for SeekToFirst() to return.
+ filter.stall_at.store(3);
+ compaction_thread.join();
+ assert(seek_done.load());
+
+ // Check that filter was never called again.
+ EXPECT_EQ(2, filter.last_seen.load());
+}
+
+TEST_P(CompactionIteratorTest, SingleMergeOperand) {
+ class Filter : public CompactionFilter {
+ Decision FilterV2(int /*level*/, const Slice& key, ValueType t,
+ const Slice& existing_value, std::string* /*new_value*/,
+ std::string* /*skip_until*/) const override {
+ std::string k = key.ToString();
+ std::string v = existing_value.ToString();
+
+ // See the InitIterators() call below for the sequence of keys and their
+ // filtering decisions. Here we assert that the compaction filter is called
+ // only with the expected keys, and with the right values.
+ if (k == "a") {
+ EXPECT_EQ(ValueType::kMergeOperand, t);
+ EXPECT_EQ("av1", v);
+ return Decision::kKeep;
+ } else if (k == "b") {
+ EXPECT_EQ(ValueType::kMergeOperand, t);
+ return Decision::kKeep;
+ } else if (k == "c") {
+ return Decision::kKeep;
+ }
+
+ ADD_FAILURE();
+ return Decision::kKeep;
+ }
+
+ const char* Name() const override {
+ return "CompactionIteratorTest.SingleMergeOperand::Filter";
+ }
+ };
+
+ class SingleMergeOp : public MergeOperator {
+ public:
+ bool FullMergeV2(const MergeOperationInput& merge_in,
+ MergeOperationOutput* merge_out) const override {
+ // See InitIterators() call below for why "c" is the only key for which
+ // FullMergeV2 should be called.
+ EXPECT_EQ("c", merge_in.key.ToString());
+
+ std::string temp_value;
+ if (merge_in.existing_value != nullptr) {
+ temp_value = merge_in.existing_value->ToString();
+ }
+
+ for (auto& operand : merge_in.operand_list) {
+ temp_value.append(operand.ToString());
+ }
+ merge_out->new_value = temp_value;
+
+ return true;
+ }
+
+ bool PartialMergeMulti(const Slice& key,
+ const std::deque<Slice>& operand_list,
+ std::string* new_value,
+ Logger* /*logger*/) const override {
+ std::string string_key = key.ToString();
+ EXPECT_TRUE(string_key == "a" || string_key == "b");
+
+ if (string_key == "a") {
+ EXPECT_EQ(1, operand_list.size());
+ } else if (string_key == "b") {
+ EXPECT_EQ(2, operand_list.size());
+ }
+
+ std::string temp_value;
+ for (auto& operand : operand_list) {
+ temp_value.append(operand.ToString());
+ }
+ swap(temp_value, *new_value);
+
+ return true;
+ }
+
+ const char* Name() const override {
+ return "CompactionIteratorTest SingleMergeOp";
+ }
+
+ bool AllowSingleOperand() const override { return true; }
+ };
+
+ SingleMergeOp merge_op;
+ Filter filter;
+ InitIterators(
+ // a should invoke PartialMergeMulti with a single merge operand.
+ {test::KeyStr("a", 50, kTypeMerge),
+ // b should invoke PartialMergeMulti with two operands.
+ test::KeyStr("b", 70, kTypeMerge), test::KeyStr("b", 60, kTypeMerge),
+ // c should invoke FullMergeV2 because its merge chain hits a kTypeValue base.
+ test::KeyStr("c", 90, kTypeMerge), test::KeyStr("c", 80, kTypeValue)},
+ {"av1", "bv2", "bv1", "cv2", "cv1"}, {}, {}, kMaxSequenceNumber,
+ kMaxSequenceNumber, &merge_op, &filter);
+
+ c_iter_->SeekToFirst();
+ ASSERT_TRUE(c_iter_->Valid());
+ ASSERT_EQ(test::KeyStr("a", 50, kTypeMerge), c_iter_->key().ToString());
+ ASSERT_EQ("av1", c_iter_->value().ToString());
+ c_iter_->Next();
+ ASSERT_TRUE(c_iter_->Valid());
+ ASSERT_EQ("bv1bv2", c_iter_->value().ToString());
+ c_iter_->Next();
+ ASSERT_OK(c_iter_->status());
+ ASSERT_EQ("cv1cv2", c_iter_->value().ToString());
+}
+
+// In the bottommost level, values earlier than the earliest snapshot can be
+// output with sequence number 0.
+TEST_P(CompactionIteratorTest, ZeroOutSequenceAtBottomLevel) {
+ AddSnapshot(1);
+ RunTest({test::KeyStr("a", 1, kTypeValue), test::KeyStr("b", 2, kTypeValue)},
+ {"v1", "v2"},
+ {test::KeyStr("a", 0, kTypeValue), test::KeyStr("b", 2, kTypeValue)},
+ {"v1", "v2"}, kMaxSequenceNumber /*last_committed_seq*/,
+ nullptr /*merge_operator*/, nullptr /*compaction_filter*/,
+ true /*bottommost_level*/);
+}
+
+// In the bottommost level, deletions earlier than the earliest snapshot can be
+// removed permanently.
+TEST_P(CompactionIteratorTest, RemoveDeletionAtBottomLevel) {
+ AddSnapshot(1);
+ RunTest(
+ {test::KeyStr("a", 1, kTypeDeletion), test::KeyStr("b", 3, kTypeDeletion),
+ test::KeyStr("b", 1, kTypeValue)},
+ {"", "", ""},
+ {test::KeyStr("b", 3, kTypeDeletion), test::KeyStr("b", 0, kTypeValue)},
+ {"", ""}, kMaxSequenceNumber /*last_committed_seq*/,
+ nullptr /*merge_operator*/, nullptr /*compaction_filter*/,
+ true /*bottommost_level*/);
+}
+
+// In the bottommost level, single deletions earlier than the earliest snapshot
+// can be removed permanently.
+TEST_P(CompactionIteratorTest, RemoveSingleDeletionAtBottomLevel) {
+ AddSnapshot(1);
+ RunTest({test::KeyStr("a", 1, kTypeSingleDeletion),
+ test::KeyStr("b", 2, kTypeSingleDeletion)},
+ {"", ""}, {test::KeyStr("b", 2, kTypeSingleDeletion)}, {""},
+ kMaxSequenceNumber /*last_committed_seq*/, nullptr /*merge_operator*/,
+ nullptr /*compaction_filter*/, true /*bottommost_level*/);
+}
+
+TEST_P(CompactionIteratorTest, ConvertToPutAtBottom) {
+ std::shared_ptr<MergeOperator> merge_op =
+ MergeOperators::CreateStringAppendOperator();
+ RunTest({test::KeyStr("a", 4, kTypeMerge), test::KeyStr("a", 3, kTypeMerge),
+ test::KeyStr("a", 2, kTypeMerge), test::KeyStr("b", 1, kTypeValue)},
+ {"a4", "a3", "a2", "b1"},
+ {test::KeyStr("a", 0, kTypeValue), test::KeyStr("b", 0, kTypeValue)},
+ {"a2,a3,a4", "b1"}, kMaxSequenceNumber /*last_committed_seq*/,
+ merge_op.get(), nullptr /*compaction_filter*/,
+ true /*bottommost_level*/);
+}
+
+INSTANTIATE_TEST_CASE_P(CompactionIteratorTestInstance, CompactionIteratorTest,
+ testing::Values(true, false));
+
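+// Tests per-key placement: with SupportsPerKeyPlacement() == true, the
+// compaction iterator decides per key whether it should be output to the
+// penultimate level or to the last level.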
+class PerKeyPlacementCompIteratorTest : public CompactionIteratorTest {
+ public:
+ bool SupportsPerKeyPlacement() const override { return true; }
+};
+
+TEST_P(PerKeyPlacementCompIteratorTest, SplitLastLevelData) {
+ std::atomic_uint64_t latest_cold_seq = 0;
+
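+ // Route keys with seq num > latest_cold_seq ("hot" data) to the penultimate
+ // level via the PrepareOutput sync point.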
+ SyncPoint::GetInstance()->SetCallBack(
+ "CompactionIterator::PrepareOutput.context", [&](void* arg) {
+ auto context = static_cast<PerKeyPlacementContext*>(arg);
+ context->output_to_penultimate_level =
+ context->seq_num > latest_cold_seq;
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ latest_cold_seq = 5;
+
+ InitIterators(
+ {test::KeyStr("a", 7, kTypeValue), test::KeyStr("b", 6, kTypeValue),
+ test::KeyStr("c", 5, kTypeValue)},
+ {"vala", "valb", "valc"}, {}, {}, kMaxSequenceNumber, kMaxSequenceNumber,
+ nullptr, nullptr, true);
+ c_iter_->SeekToFirst();
+ ASSERT_TRUE(c_iter_->Valid());
+
+ // The first 2 keys are hot, so they should have
+ // `output_to_penultimate_level()==true` and their seq nums not zeroed out.
+ ASSERT_EQ(test::KeyStr("a", 7, kTypeValue), c_iter_->key().ToString());
+ ASSERT_TRUE(c_iter_->output_to_penultimate_level());
+ c_iter_->Next();
+ ASSERT_TRUE(c_iter_->Valid());
+ ASSERT_EQ(test::KeyStr("b", 6, kTypeValue), c_iter_->key().ToString());
+ ASSERT_TRUE(c_iter_->output_to_penultimate_level());
+ c_iter_->Next();
+ ASSERT_TRUE(c_iter_->Valid());
+ // `c` is cold data, so it should be output to the bottommost level.
+ ASSERT_EQ(test::KeyStr("c", 0, kTypeValue), c_iter_->key().ToString());
+ ASSERT_FALSE(c_iter_->output_to_penultimate_level());
+ c_iter_->Next();
+ ASSERT_OK(c_iter_->status());
+ ASSERT_FALSE(c_iter_->Valid());
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_P(PerKeyPlacementCompIteratorTest, SnapshotData) {
+ AddSnapshot(5);
+
+ InitIterators(
+ {test::KeyStr("a", 7, kTypeValue), test::KeyStr("b", 6, kTypeDeletion),
+ test::KeyStr("b", 5, kTypeValue)},
+ {"vala", "", "valb"}, {}, {}, kMaxSequenceNumber, kMaxSequenceNumber,
+ nullptr, nullptr, true);
+ c_iter_->SeekToFirst();
+ ASSERT_TRUE(c_iter_->Valid());
+
+ // The first key and the tombstone are within the snapshot, so they should be
+ // output to the penultimate level (and their seq nums cannot be zeroed out).
+ ASSERT_EQ(test::KeyStr("a", 7, kTypeValue), c_iter_->key().ToString());
+ ASSERT_TRUE(c_iter_->output_to_penultimate_level());
+ c_iter_->Next();
+ ASSERT_TRUE(c_iter_->Valid());
+ ASSERT_EQ(test::KeyStr("b", 6, kTypeDeletion), c_iter_->key().ToString());
+ ASSERT_TRUE(c_iter_->output_to_penultimate_level());
+ c_iter_->Next();
+ ASSERT_TRUE(c_iter_->Valid());
+ // `b` at seq 5 is not protected by the snapshot; its sequence number is
+ // zeroed out and it should be output to the bottommost level.
+ ASSERT_EQ(test::KeyStr("b", 0, kTypeValue), c_iter_->key().ToString());
+ ASSERT_FALSE(c_iter_->output_to_penultimate_level());
+ c_iter_->Next();
+ ASSERT_OK(c_iter_->status());
+ ASSERT_FALSE(c_iter_->Valid());
+}
+
+TEST_P(PerKeyPlacementCompIteratorTest, ConflictWithSnapshot) {
+ std::atomic_uint64_t latest_cold_seq = 0;
+
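+ // Same hot/cold routing as in SplitLastLevelData, driven by latest_cold_seq.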
+ SyncPoint::GetInstance()->SetCallBack(
+ "CompactionIterator::PrepareOutput.context", [&](void* arg) {
+ auto context = static_cast<PerKeyPlacementContext*>(arg);
+ context->output_to_penultimate_level =
+ context->seq_num > latest_cold_seq;
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ latest_cold_seq = 6;
+
+ AddSnapshot(5);
+
+ InitIterators({test::KeyStr("a", 7, kTypeValue),
+ test::KeyStr("unsafe_pb", 6, kTypeValue),
+ test::KeyStr("c", 5, kTypeValue)},
+ {"vala", "valb", "valc"}, {}, {}, kMaxSequenceNumber,
+ kMaxSequenceNumber, nullptr, nullptr, true);
+ c_iter_->SeekToFirst();
+ ASSERT_TRUE(c_iter_->Valid());
+
+ ASSERT_EQ(test::KeyStr("a", 7, kTypeValue), c_iter_->key().ToString());
+ ASSERT_TRUE(c_iter_->output_to_penultimate_level());
+ // The 2nd key is unsafe to output to the penultimate level, but it is within
+ // a snapshot, so the per_key_placement feature would have to output it to the
+ // penultimate level anyway, which is reported as a corruption. We should
+ // never see such a case in practice: data with a nonzero seq num (within a
+ // snapshot) always comes from a higher compaction input level, which makes it
+ // safe to output to the penultimate level.
+ c_iter_->Next();
+ ASSERT_TRUE(c_iter_->status().IsCorruption());
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+INSTANTIATE_TEST_CASE_P(PerKeyPlacementCompIteratorTest,
+ PerKeyPlacementCompIteratorTest,
+ testing::Values(true, false));
+
+// Tests how CompactionIterator works together with SnapshotChecker.
+class CompactionIteratorWithSnapshotCheckerTest
+ : public CompactionIteratorTest {
+ public:
+ bool UseSnapshotChecker() const override { return true; }
+};
+
+// Uncommitted keys (keys with seq > last_committed_seq) should be output
+// as-is, while committed versions of these keys should get compacted as usual.
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest,
+ PreserveUncommittedKeys_Value) {
+ RunTest(
+ {test::KeyStr("foo", 3, kTypeValue), test::KeyStr("foo", 2, kTypeValue),
+ test::KeyStr("foo", 1, kTypeValue)},
+ {"v3", "v2", "v1"},
+ {test::KeyStr("foo", 3, kTypeValue), test::KeyStr("foo", 2, kTypeValue)},
+ {"v3", "v2"}, 2 /*last_committed_seq*/);
+}
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest,
+ PreserveUncommittedKeys_Deletion) {
+ RunTest({test::KeyStr("foo", 2, kTypeDeletion),
+ test::KeyStr("foo", 1, kTypeValue)},
+ {"", "v1"},
+ {test::KeyStr("foo", 2, kTypeDeletion),
+ test::KeyStr("foo", 1, kTypeValue)},
+ {"", "v1"}, 1 /*last_committed_seq*/);
+}
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest,
+ PreserveUncommittedKeys_Merge) {
+ auto merge_op = MergeOperators::CreateStringAppendOperator();
+ RunTest(
+ {test::KeyStr("foo", 3, kTypeMerge), test::KeyStr("foo", 2, kTypeMerge),
+ test::KeyStr("foo", 1, kTypeValue)},
+ {"v3", "v2", "v1"},
+ {test::KeyStr("foo", 3, kTypeMerge), test::KeyStr("foo", 2, kTypeValue)},
+ {"v3", "v1,v2"}, 2 /*last_committed_seq*/, merge_op.get());
+}
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest,
+ PreserveUncommittedKeys_SingleDelete) {
+ RunTest({test::KeyStr("foo", 2, kTypeSingleDeletion),
+ test::KeyStr("foo", 1, kTypeValue)},
+ {"", "v1"},
+ {test::KeyStr("foo", 2, kTypeSingleDeletion),
+ test::KeyStr("foo", 1, kTypeValue)},
+ {"", "v1"}, 1 /*last_committed_seq*/);
+}
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest,
+ PreserveUncommittedKeys_BlobIndex) {
+ RunTest({test::KeyStr("foo", 3, kTypeBlobIndex),
+ test::KeyStr("foo", 2, kTypeBlobIndex),
+ test::KeyStr("foo", 1, kTypeBlobIndex)},
+ {"v3", "v2", "v1"},
+ {test::KeyStr("foo", 3, kTypeBlobIndex),
+ test::KeyStr("foo", 2, kTypeBlobIndex)},
+ {"v3", "v2"}, 2 /*last_committed_seq*/);
+}
+
+// Test that the compaction iterator dedups keys visible to the same snapshot.
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest, DedupSameSnapshot_Value) {
+ AddSnapshot(2, 1);
+ RunTest(
+ {test::KeyStr("foo", 4, kTypeValue), test::KeyStr("foo", 3, kTypeValue),
+ test::KeyStr("foo", 2, kTypeValue), test::KeyStr("foo", 1, kTypeValue)},
+ {"v4", "v3", "v2", "v1"},
+ {test::KeyStr("foo", 4, kTypeValue), test::KeyStr("foo", 3, kTypeValue),
+ test::KeyStr("foo", 1, kTypeValue)},
+ {"v4", "v3", "v1"}, 3 /*last_committed_seq*/);
+}
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest, DedupSameSnapshot_Deletion) {
+ AddSnapshot(2, 1);
+ RunTest(
+ {test::KeyStr("foo", 4, kTypeValue),
+ test::KeyStr("foo", 3, kTypeDeletion),
+ test::KeyStr("foo", 2, kTypeValue), test::KeyStr("foo", 1, kTypeValue)},
+ {"v4", "", "v2", "v1"},
+ {test::KeyStr("foo", 4, kTypeValue),
+ test::KeyStr("foo", 3, kTypeDeletion),
+ test::KeyStr("foo", 1, kTypeValue)},
+ {"v4", "", "v1"}, 3 /*last_committed_seq*/);
+}
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest, DedupSameSnapshot_Merge) {
+ AddSnapshot(2, 1);
+ AddSnapshot(4, 3);
+ auto merge_op = MergeOperators::CreateStringAppendOperator();
+ RunTest(
+ {test::KeyStr("foo", 5, kTypeMerge), test::KeyStr("foo", 4, kTypeMerge),
+ test::KeyStr("foo", 3, kTypeMerge), test::KeyStr("foo", 2, kTypeMerge),
+ test::KeyStr("foo", 1, kTypeValue)},
+ {"v5", "v4", "v3", "v2", "v1"},
+ {test::KeyStr("foo", 5, kTypeMerge), test::KeyStr("foo", 4, kTypeMerge),
+ test::KeyStr("foo", 3, kTypeMerge), test::KeyStr("foo", 1, kTypeValue)},
+ {"v5", "v4", "v2,v3", "v1"}, 4 /*last_committed_seq*/, merge_op.get());
+}
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest,
+ DedupSameSnapshot_SingleDeletion) {
+ AddSnapshot(2, 1);
+ RunTest(
+ {test::KeyStr("foo", 4, kTypeValue),
+ test::KeyStr("foo", 3, kTypeSingleDeletion),
+ test::KeyStr("foo", 2, kTypeValue), test::KeyStr("foo", 1, kTypeValue)},
+ {"v4", "", "v2", "v1"},
+ {test::KeyStr("foo", 4, kTypeValue), test::KeyStr("foo", 1, kTypeValue)},
+ {"v4", "v1"}, 3 /*last_committed_seq*/);
+}
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest, DedupSameSnapshot_BlobIndex) {
+ AddSnapshot(2, 1);
+ RunTest({test::KeyStr("foo", 4, kTypeBlobIndex),
+ test::KeyStr("foo", 3, kTypeBlobIndex),
+ test::KeyStr("foo", 2, kTypeBlobIndex),
+ test::KeyStr("foo", 1, kTypeBlobIndex)},
+ {"v4", "v3", "v2", "v1"},
+ {test::KeyStr("foo", 4, kTypeBlobIndex),
+ test::KeyStr("foo", 3, kTypeBlobIndex),
+ test::KeyStr("foo", 1, kTypeBlobIndex)},
+ {"v4", "v3", "v1"}, 3 /*last_committed_seq*/);
+}
+
+// At the bottommost level, sequence numbers can be zeroed out and deletions
+// can be removed, but only when they are visible to the earliest snapshot.
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest,
+ NotZeroOutSequenceIfNotVisibleToEarliestSnapshot) {
+ AddSnapshot(2, 1);
+ RunTest({test::KeyStr("a", 1, kTypeValue), test::KeyStr("b", 2, kTypeValue),
+ test::KeyStr("c", 3, kTypeValue)},
+ {"v1", "v2", "v3"},
+ {test::KeyStr("a", 0, kTypeValue), test::KeyStr("b", 2, kTypeValue),
+ test::KeyStr("c", 3, kTypeValue)},
+ {"v1", "v2", "v3"}, kMaxSequenceNumber /*last_committed_seq*/,
+ nullptr /*merge_operator*/, nullptr /*compaction_filter*/,
+ true /*bottommost_level*/);
+}
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest,
+ NotRemoveDeletionIfNotVisibleToEarliestSnapshot) {
+ AddSnapshot(2, 1);
+ RunTest(
+ {test::KeyStr("a", 1, kTypeDeletion), test::KeyStr("b", 2, kTypeDeletion),
+ test::KeyStr("c", 3, kTypeDeletion)},
+ {"", "", ""}, {}, {"", ""}, kMaxSequenceNumber /*last_committed_seq*/,
+ nullptr /*merge_operator*/, nullptr /*compaction_filter*/,
+ true /*bottommost_level*/);
+}
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest,
+ NotRemoveDeletionIfValuePresentToEarlierSnapshot) {
+ AddSnapshot(2, 1);
+ RunTest({test::KeyStr("a", 4, kTypeDeletion),
+ test::KeyStr("a", 1, kTypeValue), test::KeyStr("b", 3, kTypeValue)},
+ {"", "", ""},
+ {test::KeyStr("a", 4, kTypeDeletion),
+ test::KeyStr("a", 0, kTypeValue), test::KeyStr("b", 3, kTypeValue)},
+ {"", "", ""}, kMaxSequenceNumber /*last_committed_seq*/,
+ nullptr /*merge_operator*/, nullptr /*compaction_filter*/,
+ true /*bottommost_level*/);
+}
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest,
+ NotRemoveSingleDeletionIfNotVisibleToEarliestSnapshot) {
+ AddSnapshot(2, 1);
+ RunTest({test::KeyStr("a", 1, kTypeSingleDeletion),
+ test::KeyStr("b", 2, kTypeSingleDeletion),
+ test::KeyStr("c", 3, kTypeSingleDeletion)},
+ {"", "", ""},
+ {test::KeyStr("b", 2, kTypeSingleDeletion),
+ test::KeyStr("c", 3, kTypeSingleDeletion)},
+ {"", ""}, kMaxSequenceNumber /*last_committed_seq*/,
+ nullptr /*merge_operator*/, nullptr /*compaction_filter*/,
+ true /*bottommost_level*/);
+}
+
+// A single delete should not cancel out values that are not visible to the
+// same set of snapshots.
+TEST_F(CompactionIteratorWithSnapshotCheckerTest,
+ SingleDeleteAcrossSnapshotBoundary) {
+ AddSnapshot(2, 1);
+ RunTest({test::KeyStr("a", 2, kTypeSingleDeletion),
+ test::KeyStr("a", 1, kTypeValue)},
+ {"", "v1"},
+ {test::KeyStr("a", 2, kTypeSingleDeletion),
+ test::KeyStr("a", 1, kTypeValue)},
+ {"", "v1"}, 2 /*last_committed_seq*/);
+}
+
+// A single delete should be kept if it is not visible to the earliest write
+// conflict snapshot. If a single delete is kept for this reason, the
+// corresponding value can be trimmed to save space.
+TEST_F(CompactionIteratorWithSnapshotCheckerTest,
+ KeepSingleDeletionForWriteConflictChecking) {
+ AddSnapshot(2, 0);
+ RunTest({test::KeyStr("a", 2, kTypeSingleDeletion),
+ test::KeyStr("a", 1, kTypeValue)},
+ {"", "v1"},
+ {test::KeyStr("a", 2, kTypeSingleDeletion),
+ test::KeyStr("a", 1, kTypeValue)},
+ {"", ""}, 2 /*last_committed_seq*/, nullptr /*merge_operator*/,
+ nullptr /*compaction_filter*/, false /*bottommost_level*/,
+ 2 /*earliest_write_conflict_snapshot*/);
+}
+
+// Same as above but with a blob index. In addition to the value getting
+// trimmed, the type of the KV is changed to kTypeValue.
+TEST_F(CompactionIteratorWithSnapshotCheckerTest,
+ KeepSingleDeletionForWriteConflictChecking_BlobIndex) {
+ AddSnapshot(2, 0);
+ RunTest({test::KeyStr("a", 2, kTypeSingleDeletion),
+ test::KeyStr("a", 1, kTypeBlobIndex)},
+ {"", "fake_blob_index"},
+ {test::KeyStr("a", 2, kTypeSingleDeletion),
+ test::KeyStr("a", 1, kTypeValue)},
+ {"", ""}, 2 /*last_committed_seq*/, nullptr /*merge_operator*/,
+ nullptr /*compaction_filter*/, false /*bottommost_level*/,
+ 2 /*earliest_write_conflict_snapshot*/);
+}
+
+// Same as above but with a wide-column entity. In addition to the value getting
+// trimmed, the type of the KV is changed to kTypeValue.
+TEST_F(CompactionIteratorWithSnapshotCheckerTest,
+ KeepSingleDeletionForWriteConflictChecking_WideColumnEntity) {
+ AddSnapshot(2, 0);
+ RunTest({test::KeyStr("a", 2, kTypeSingleDeletion),
+ test::KeyStr("a", 1, kTypeWideColumnEntity)},
+ {"", "fake_entity"},
+ {test::KeyStr("a", 2, kTypeSingleDeletion),
+ test::KeyStr("a", 1, kTypeValue)},
+ {"", ""}, 2 /* last_committed_seq */, nullptr /* merge_operator */,
+ nullptr /* compaction_filter */, false /* bottommost_level */,
+ 2 /* earliest_write_conflict_snapshot */);
+}
+
+// The compaction filter should keep uncommitted keys as-is, and
+// * convert the latest value to a deletion, and/or
+// * if the latest value is a merge, apply the filter to all subsequent merges.
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest, CompactionFilter_Value) {
+ std::unique_ptr<CompactionFilter> compaction_filter(
+ new FilterAllKeysCompactionFilter());
+ RunTest(
+ {test::KeyStr("a", 2, kTypeValue), test::KeyStr("a", 1, kTypeValue),
+ test::KeyStr("b", 3, kTypeValue), test::KeyStr("c", 1, kTypeValue)},
+ {"v2", "v1", "v3", "v4"},
+ {test::KeyStr("a", 2, kTypeValue), test::KeyStr("a", 1, kTypeDeletion),
+ test::KeyStr("b", 3, kTypeValue), test::KeyStr("c", 1, kTypeDeletion)},
+ {"v2", "", "v3", ""}, 1 /*last_committed_seq*/,
+ nullptr /*merge_operator*/, compaction_filter.get());
+}
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest, CompactionFilter_Deletion) {
+ std::unique_ptr<CompactionFilter> compaction_filter(
+ new FilterAllKeysCompactionFilter());
+ RunTest(
+ {test::KeyStr("a", 2, kTypeDeletion), test::KeyStr("a", 1, kTypeValue)},
+ {"", "v1"},
+ {test::KeyStr("a", 2, kTypeDeletion),
+ test::KeyStr("a", 1, kTypeDeletion)},
+ {"", ""}, 1 /*last_committed_seq*/, nullptr /*merge_operator*/,
+ compaction_filter.get());
+}
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest,
+ CompactionFilter_PartialMerge) {
+ std::shared_ptr<MergeOperator> merge_op =
+ MergeOperators::CreateStringAppendOperator();
+ std::unique_ptr<CompactionFilter> compaction_filter(
+ new FilterAllKeysCompactionFilter());
+ RunTest({test::KeyStr("a", 3, kTypeMerge), test::KeyStr("a", 2, kTypeMerge),
+ test::KeyStr("a", 1, kTypeMerge)},
+ {"v3", "v2", "v1"}, {test::KeyStr("a", 3, kTypeMerge)}, {"v3"},
+ 2 /*last_committed_seq*/, merge_op.get(), compaction_filter.get());
+}
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest, CompactionFilter_FullMerge) {
+ std::shared_ptr<MergeOperator> merge_op =
+ MergeOperators::CreateStringAppendOperator();
+ std::unique_ptr<CompactionFilter> compaction_filter(
+ new FilterAllKeysCompactionFilter());
+ RunTest(
+ {test::KeyStr("a", 3, kTypeMerge), test::KeyStr("a", 2, kTypeMerge),
+ test::KeyStr("a", 1, kTypeValue)},
+ {"v3", "v2", "v1"},
+ {test::KeyStr("a", 3, kTypeMerge), test::KeyStr("a", 1, kTypeDeletion)},
+ {"v3", ""}, 2 /*last_committed_seq*/, merge_op.get(),
+ compaction_filter.get());
+}
+
+// Tests how CompactionIterator works together with AllowIngestBehind.
+class CompactionIteratorWithAllowIngestBehindTest
+ : public CompactionIteratorTest {
+ public:
+ bool AllowIngestBehind() const override { return true; }
+};
+
+// When allow_ingest_behind is set, the compaction iterator does not treat the
+// output as the bottommost level, since there is no guarantee that further
+// data won't be ingested underneath the compaction output in the future.
+TEST_P(CompactionIteratorWithAllowIngestBehindTest, NoConvertToPutAtBottom) {
+ std::shared_ptr<MergeOperator> merge_op =
+ MergeOperators::CreateStringAppendOperator();
+ RunTest({test::KeyStr("a", 4, kTypeMerge), test::KeyStr("a", 3, kTypeMerge),
+ test::KeyStr("a", 2, kTypeMerge), test::KeyStr("b", 1, kTypeValue)},
+ {"a4", "a3", "a2", "b1"},
+ {test::KeyStr("a", 4, kTypeMerge), test::KeyStr("b", 1, kTypeValue)},
+ {"a2,a3,a4", "b1"}, kMaxSequenceNumber /*last_committed_seq*/,
+ merge_op.get(), nullptr /*compaction_filter*/,
+ true /*bottommost_level*/);
+}
+
+TEST_P(CompactionIteratorWithAllowIngestBehindTest,
+ MergeToPutIfEncounteredPutAtBottom) {
+ std::shared_ptr<MergeOperator> merge_op =
+ MergeOperators::CreateStringAppendOperator();
+ RunTest({test::KeyStr("a", 4, kTypeMerge), test::KeyStr("a", 3, kTypeMerge),
+ test::KeyStr("a", 2, kTypeValue), test::KeyStr("b", 1, kTypeValue)},
+ {"a4", "a3", "a2", "b1"},
+ {test::KeyStr("a", 4, kTypeValue), test::KeyStr("b", 1, kTypeValue)},
+ {"a2,a3,a4", "b1"}, kMaxSequenceNumber /*last_committed_seq*/,
+ merge_op.get(), nullptr /*compaction_filter*/,
+ true /*bottommost_level*/);
+}
+
+INSTANTIATE_TEST_CASE_P(CompactionIteratorWithAllowIngestBehindTestInstance,
+ CompactionIteratorWithAllowIngestBehindTest,
+ testing::Values(true, false));
+
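+// Tests garbage collection of keys with user-defined timestamps: entries with
+// timestamps older than full_history_ts_low may have their timestamps (and
+// sequence numbers) stripped, or may be dropped entirely.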
+class CompactionIteratorTsGcTest : public CompactionIteratorTest {
+ public:
+ CompactionIteratorTsGcTest()
+ : CompactionIteratorTest(test::BytewiseComparatorWithU64TsWrapper()) {}
+};
+
+TEST_P(CompactionIteratorTsGcTest, NoKeyEligibleForGC) {
+ constexpr char user_key[][2] = {{'a', '\0'}, {'b', '\0'}};
+ const std::vector<std::string> input_keys = {
+ test::KeyStr(/*ts=*/103, user_key[0], /*seq=*/4, kTypeValue),
+ test::KeyStr(/*ts=*/102, user_key[0], /*seq=*/3,
+ kTypeDeletionWithTimestamp),
+ test::KeyStr(/*ts=*/104, user_key[1], /*seq=*/5, kTypeValue)};
+ const std::vector<std::string> input_values = {"a3", "", "b2"};
+ std::string full_history_ts_low;
+ // All keys' timestamps are newer than or equal to 102, thus none of them
+ // will be eligible for GC.
+ PutFixed64(&full_history_ts_low, 102);
+ const std::vector<std::string>& expected_keys = input_keys;
+ const std::vector<std::string>& expected_values = input_values;
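+ // Each pair is (bottommost_level, key_not_exists_beyond_output_level).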
+ const std::vector<std::pair<bool, bool>> params = {
+ {false, false}, {false, true}, {true, true}};
+ for (const std::pair<bool, bool>& param : params) {
+ const bool bottommost_level = param.first;
+ const bool key_not_exists_beyond_output_level = param.second;
+ RunTest(input_keys, input_values, expected_keys, expected_values,
+ /*last_committed_seq=*/kMaxSequenceNumber,
+ /*merge_operator=*/nullptr, /*compaction_filter=*/nullptr,
+ bottommost_level,
+ /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber,
+ key_not_exists_beyond_output_level, &full_history_ts_low);
+ }
+}
+
+TEST_P(CompactionIteratorTsGcTest, NoMergeEligibleForGc) {
+ constexpr char user_key[] = "a";
+ const std::vector<std::string> input_keys = {
+ test::KeyStr(10002, user_key, 102, kTypeMerge),
+ test::KeyStr(10001, user_key, 101, kTypeMerge),
+ test::KeyStr(10000, user_key, 100, kTypeValue)};
+ const std::vector<std::string> input_values = {"2", "1", "a0"};
+ std::shared_ptr<MergeOperator> merge_op =
+ MergeOperators::CreateStringAppendTESTOperator();
+ const std::vector<std::string>& expected_keys = input_keys;
+ const std::vector<std::string>& expected_values = input_values;
+ const std::vector<std::pair<bool, bool>> params = {
+ {false, false}, {false, true}, {true, true}};
+ for (const auto& param : params) {
+ const bool bottommost_level = param.first;
+ const bool key_not_exists_beyond_output_level = param.second;
+ RunTest(input_keys, input_values, expected_keys, expected_values,
+ /*last_committed_seq=*/kMaxSequenceNumber, merge_op.get(),
+ /*compaction_filter=*/nullptr, bottommost_level,
+ /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber,
+ key_not_exists_beyond_output_level,
+ /*full_history_ts_low=*/nullptr);
+ }
+}
+
+TEST_P(CompactionIteratorTsGcTest, AllKeysOlderThanThreshold) {
+ constexpr char user_key[][2] = {{'a', '\0'}, {'b', '\0'}};
+ const std::vector<std::string> input_keys = {
+ test::KeyStr(/*ts=*/103, user_key[0], /*seq=*/4,
+ kTypeDeletionWithTimestamp),
+ test::KeyStr(/*ts=*/102, user_key[0], /*seq=*/3, kTypeValue),
+ test::KeyStr(/*ts=*/101, user_key[0], /*seq=*/2, kTypeValue),
+ test::KeyStr(/*ts=*/104, user_key[1], /*seq=*/5, kTypeValue)};
+ const std::vector<std::string> input_values = {"", "a2", "a1", "b5"};
+ std::string full_history_ts_low;
+ PutFixed64(&full_history_ts_low, std::numeric_limits<uint64_t>::max());
+ {
+ // With a snapshot at seq 3, both the deletion marker and the key at 3 must
+ // be preserved.
+ AddSnapshot(3);
+ const std::vector<std::string> expected_keys = {
+ input_keys[0], input_keys[1], input_keys[3]};
+ const std::vector<std::string> expected_values = {"", "a2", "b5"};
+ RunTest(input_keys, input_values, expected_keys, expected_values,
+ /*last_committed_seq=*/kMaxSequenceNumber,
+ /*merge_operator=*/nullptr, /*compaction_filter=*/nullptr,
+ /*bottommost_level=*/false,
+ /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber,
+ /*key_not_exists_beyond_output_level=*/false, &full_history_ts_low);
+ ClearSnapshots();
+ }
+ {
+ // No snapshot; the deletion marker should be preserved because the user
+ // key may exist beyond the output level.
+ const std::vector<std::string> expected_keys = {input_keys[0],
+ input_keys[3]};
+ const std::vector<std::string> expected_values = {"", "b5"};
+ RunTest(input_keys, input_values, expected_keys, expected_values,
+ /*last_committed_seq=*/kMaxSequenceNumber,
+ /*merge_operator=*/nullptr, /*compaction_filter=*/nullptr,
+ /*bottommost_level=*/false,
+ /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber,
+ /*key_not_exists_beyond_output_level=*/false, &full_history_ts_low);
+ }
+ {
+ // No snapshot; the deletion marker can be dropped because the user key
+ // does not exist beyond the output level.
+ const std::vector<std::string> expected_keys = {input_keys[3]};
+ const std::vector<std::string> expected_values = {"b5"};
+ RunTest(input_keys, input_values, expected_keys, expected_values,
+ /*last_committed_seq=*/kMaxSequenceNumber,
+ /*merge_operator=*/nullptr, /*compaction_filter=*/nullptr,
+ /*bottommost_level=*/false,
+ /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber,
+ /*key_not_exists_beyond_output_level=*/true, &full_history_ts_low);
+ }
+}
+
+TEST_P(CompactionIteratorTsGcTest, SomeMergesOlderThanThreshold) {
+ constexpr char user_key[][2] = {"a", "f"};
+ const std::vector<std::string> input_keys = {
+ test::KeyStr(/*ts=*/25000, user_key[0], /*seq=*/2500, kTypeMerge),
+ test::KeyStr(/*ts=*/19000, user_key[0], /*seq=*/2300, kTypeMerge),
+ test::KeyStr(/*ts=*/18000, user_key[0], /*seq=*/1800, kTypeMerge),
+ test::KeyStr(/*ts=*/16000, user_key[0], /*seq=*/1600, kTypeValue),
+ test::KeyStr(/*ts=*/19000, user_key[1], /*seq=*/2000, kTypeMerge),
+ test::KeyStr(/*ts=*/17000, user_key[1], /*seq=*/1700, kTypeMerge),
+ test::KeyStr(/*ts=*/15000, user_key[1], /*seq=*/1600,
+ kTypeDeletionWithTimestamp)};
+ const std::vector<std::string> input_values = {"25", "19", "18", "16",
+ "19", "17", ""};
+ std::shared_ptr<MergeOperator> merge_op =
+ MergeOperators::CreateStringAppendTESTOperator();
+ std::string full_history_ts_low;
+ PutFixed64(&full_history_ts_low, 20000);
+
+ const std::vector<std::pair<bool, bool>> params = {
+ {false, false}, {false, true}, {true, true}};
+
+ {
+ AddSnapshot(1600);
+ AddSnapshot(1900);
+ const std::vector<std::string> expected_keys = {
+ test::KeyStr(/*ts=*/25000, user_key[0], /*seq=*/2500, kTypeMerge),
+ test::KeyStr(/*ts=*/19000, user_key[0], /*seq=*/2300, kTypeMerge),
+ test::KeyStr(/*ts=*/18000, user_key[0], /*seq=*/1800, kTypeMerge),
+ test::KeyStr(/*ts=*/16000, user_key[0], /*seq=*/1600, kTypeValue),
+ test::KeyStr(/*ts=*/19000, user_key[1], /*seq=*/2000, kTypeMerge),
+ test::KeyStr(/*ts=*/17000, user_key[1], /*seq=*/1700, kTypeMerge),
+ test::KeyStr(/*ts=*/15000, user_key[1], /*seq=*/1600,
+ kTypeDeletionWithTimestamp)};
+ const std::vector<std::string> expected_values = {"25", "19", "18", "16",
+ "19", "17", ""};
+ for (const auto& param : params) {
+ const bool bottommost_level = param.first;
+ const bool key_not_exists_beyond_output_level = param.second;
+ auto expected_keys_copy = expected_keys;
+ auto expected_values_copy = expected_values;
+ if (bottommost_level || key_not_exists_beyond_output_level) {
+ // The kTypeDeletionWithTimestamp entry will be dropped.
+ expected_keys_copy.pop_back();
+ expected_values_copy.pop_back();
+ if (bottommost_level) {
+ // ts and seq are zeroed out at the bottommost level
+ expected_keys_copy[3] =
+ test::KeyStr(/*ts=*/0, user_key[0], /*seq=*/0, kTypeValue);
+ }
+ }
+ RunTest(input_keys, input_values, expected_keys_copy,
+ expected_values_copy,
+ /*last_committed_seq=*/kMaxSequenceNumber, merge_op.get(),
+ /*compaction_filter=*/nullptr, bottommost_level,
+ /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber,
+ key_not_exists_beyond_output_level, &full_history_ts_low);
+ }
+ ClearSnapshots();
+ }
+
+ // No snapshots
+ {
+ const std::vector<std::string> expected_keys = {
+ test::KeyStr(/*ts=*/25000, user_key[0], /*seq=*/2500, kTypeValue),
+ test::KeyStr(/*ts=*/19000, user_key[1], /*seq=*/2000, kTypeValue)};
+ const std::vector<std::string> expected_values = {"16,18,19,25", "17,19"};
+ for (const auto& param : params) {
+ const bool bottommost_level = param.first;
+ const bool key_not_exists_beyond_output_level = param.second;
+ auto expected_keys_copy = expected_keys;
+ auto expected_values_copy = expected_values;
+ if (bottommost_level) {
+ expected_keys_copy[1] =
+ test::KeyStr(/*ts=*/0, user_key[1], /*seq=*/0, kTypeValue);
+ }
+ RunTest(input_keys, input_values, expected_keys_copy,
+ expected_values_copy,
+ /*last_committed_seq=*/kMaxSequenceNumber, merge_op.get(),
+ /*compaction_filter=*/nullptr, bottommost_level,
+ /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber,
+ key_not_exists_beyond_output_level, &full_history_ts_low);
+ }
+ }
+}
+
+TEST_P(CompactionIteratorTsGcTest, NewHidesOldSameSnapshot) {
+ constexpr char user_key[] = "a";
+ const std::vector<std::string> input_keys = {
+ test::KeyStr(/*ts=*/103, user_key, /*seq=*/4, kTypeDeletionWithTimestamp),
+ test::KeyStr(/*ts=*/102, user_key, /*seq=*/3, kTypeValue),
+ test::KeyStr(/*ts=*/101, user_key, /*seq=*/2, kTypeValue),
+ test::KeyStr(/*ts=*/100, user_key, /*seq=*/1, kTypeValue)};
+ const std::vector<std::string> input_values = {"", "a2", "a1", "a0"};
+ {
+ std::string full_history_ts_low;
+ // Keys whose timestamps are larger than or equal to 102 will be preserved.
+ PutFixed64(&full_history_ts_low, 102);
+ const std::vector<std::string> expected_keys = {
+ input_keys[0], input_keys[1], input_keys[2]};
+ const std::vector<std::string> expected_values = {"", input_values[1],
+ input_values[2]};
+ RunTest(input_keys, input_values, expected_keys, expected_values,
+ /*last_committed_seq=*/kMaxSequenceNumber,
+ /*merge_operator=*/nullptr, /*compaction_filter=*/nullptr,
+ /*bottommost_level=*/false,
+ /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber,
+ /*key_not_exists_beyond_output_level=*/false, &full_history_ts_low);
+ }
+}
+
+TEST_P(CompactionIteratorTsGcTest, DropTombstones) {
+ constexpr char user_key[] = "a";
+ const std::vector<std::string> input_keys = {
+ test::KeyStr(/*ts=*/103, user_key, /*seq=*/4, kTypeDeletionWithTimestamp),
+ test::KeyStr(/*ts=*/102, user_key, /*seq=*/3, kTypeValue),
+ test::KeyStr(/*ts=*/101, user_key, /*seq=*/2, kTypeDeletionWithTimestamp),
+ test::KeyStr(/*ts=*/100, user_key, /*seq=*/1, kTypeValue)};
+ const std::vector<std::string> input_values = {"", "a2", "", "a0"};
+ const std::vector<std::string> expected_keys = {input_keys[0], input_keys[1]};
+ const std::vector<std::string> expected_values = {"", "a2"};
+
+ // Take a snapshot at seq 2.
+ AddSnapshot(2);
+
+ {
+ // Non-bottommost level, but the key does not exist beyond the output level.
+ std::string full_history_ts_low;
+ PutFixed64(&full_history_ts_low, 102);
+ RunTest(input_keys, input_values, expected_keys, expected_values,
+ /*last_committed_seq=*/kMaxSequenceNumber,
+ /*merge_operator=*/nullptr, /*compaction_filter=*/nullptr,
+ /*bottommost_level=*/false,
+ /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber,
+ /*key_not_exists_beyond_output_level=*/true, &full_history_ts_low);
+ }
+ {
+ // Bottommost level
+ std::string full_history_ts_low;
+ PutFixed64(&full_history_ts_low, 102);
+ RunTest(input_keys, input_values, expected_keys, expected_values,
+ /*last_committed_seq=*/kMaxSequenceNumber,
+ /*merge_operator=*/nullptr, /*compaction_filter=*/nullptr,
+ /*bottommost_level=*/true,
+ /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber,
+ /*key_not_exists_beyond_output_level=*/false, &full_history_ts_low);
+ }
+}
+
+TEST_P(CompactionIteratorTsGcTest, RewriteTs) {
+ constexpr char user_key[] = "a";
+ const std::vector<std::string> input_keys = {
+ test::KeyStr(/*ts=*/103, user_key, /*seq=*/4, kTypeDeletionWithTimestamp),
+ test::KeyStr(/*ts=*/102, user_key, /*seq=*/3, kTypeValue),
+ test::KeyStr(/*ts=*/101, user_key, /*seq=*/2, kTypeDeletionWithTimestamp),
+ test::KeyStr(/*ts=*/100, user_key, /*seq=*/1, kTypeValue)};
+ const std::vector<std::string> input_values = {"", "a2", "", "a0"};
+ const std::vector<std::string> expected_keys = {
+ input_keys[0], input_keys[1], input_keys[2],
+ test::KeyStr(/*ts=*/0, user_key, /*seq=*/0, kTypeValue)};
+ const std::vector<std::string> expected_values = {"", "a2", "", "a0"};
+
+ AddSnapshot(1);
+ AddSnapshot(2);
+
+ {
+ // Bottommost level; both ts and seq need to be rewritten.
+ std::string full_history_ts_low;
+ PutFixed64(&full_history_ts_low, 102);
+ RunTest(input_keys, input_values, expected_keys, expected_values,
+ /*last_committed_seq=*/kMaxSequenceNumber,
+ /*merge_operator=*/nullptr, /*compaction_filter=*/nullptr,
+ /*bottommost_level=*/true,
+ /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber,
+ /*key_not_exists_beyond_output_level=*/true, &full_history_ts_low);
+ }
+}
+
+TEST_P(CompactionIteratorTsGcTest, SingleDeleteNoKeyEligibleForGC) {
+ constexpr char user_key[][2] = {{'a', '\0'}, {'b', '\0'}};
+ const std::vector<std::string> input_keys = {
+ test::KeyStr(/*ts=*/104, user_key[0], /*seq=*/4, kTypeSingleDeletion),
+ test::KeyStr(/*ts=*/103, user_key[0], /*seq=*/3, kTypeValue),
+ test::KeyStr(/*ts=*/102, user_key[1], /*seq=*/2, kTypeValue)};
+ const std::vector<std::string> input_values = {"", "a3", "b2"};
+ std::string full_history_ts_low;
+ // All keys' timestamps are newer than or equal to 102, thus none of them
+ // will be eligible for GC.
+ PutFixed64(&full_history_ts_low, 102);
+ const std::vector<std::string>& expected_keys = input_keys;
+ const std::vector<std::string>& expected_values = input_values;
+ const std::vector<std::pair<bool, bool>> params = {
+ {false, false}, {false, true}, {true, true}};
+ for (const std::pair<bool, bool>& param : params) {
+ const bool bottommost_level = param.first;
+ const bool key_not_exists_beyond_output_level = param.second;
+ RunTest(input_keys, input_values, expected_keys, expected_values,
+ /*last_committed_seq=*/kMaxSequenceNumber,
+ /*merge_operator=*/nullptr, /*compaction_filter=*/nullptr,
+ bottommost_level,
+ /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber,
+ key_not_exists_beyond_output_level, &full_history_ts_low);
+ }
+}
+
+TEST_P(CompactionIteratorTsGcTest, SingleDeleteDropTombstones) {
+ constexpr char user_key[] = "a";
+ const std::vector<std::string> input_keys = {
+ test::KeyStr(/*ts=*/103, user_key, /*seq=*/4, kTypeSingleDeletion),
+ test::KeyStr(/*ts=*/102, user_key, /*seq=*/3, kTypeValue),
+ test::KeyStr(/*ts=*/101, user_key, /*seq=*/2, kTypeSingleDeletion),
+ test::KeyStr(/*ts=*/100, user_key, /*seq=*/1, kTypeValue)};
+ const std::vector<std::string> input_values = {"", "a2", "", "a0"};
+ const std::vector<std::string> expected_keys = {input_keys[0], input_keys[1]};
+ const std::vector<std::string> expected_values = {"", "a2"};
+
+ // Take a snapshot at seq 2.
+ AddSnapshot(2);
+ {
+ const std::vector<std::pair<bool, bool>> params = {
+ {false, false}, {false, true}, {true, true}};
+ for (const std::pair<bool, bool>& param : params) {
+ const bool bottommost_level = param.first;
+ const bool key_not_exists_beyond_output_level = param.second;
+ std::string full_history_ts_low;
+ PutFixed64(&full_history_ts_low, 102);
+ RunTest(input_keys, input_values, expected_keys, expected_values,
+ /*last_committed_seq=*/kMaxSequenceNumber,
+ /*merge_operator=*/nullptr, /*compaction_filter=*/nullptr,
+ bottommost_level,
+ /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber,
+ key_not_exists_beyond_output_level, &full_history_ts_low);
+ }
+ }
+}
+
+TEST_P(CompactionIteratorTsGcTest, SingleDeleteAllKeysOlderThanThreshold) {
+ constexpr char user_key[][2] = {{'a', '\0'}, {'b', '\0'}};
+ const std::vector<std::string> input_keys = {
+ test::KeyStr(/*ts=*/103, user_key[0], /*seq=*/4, kTypeSingleDeletion),
+ test::KeyStr(/*ts=*/102, user_key[0], /*seq=*/3, kTypeValue),
+ test::KeyStr(/*ts=*/104, user_key[1], /*seq=*/5, kTypeValue)};
+ const std::vector<std::string> input_values = {"", "a2", "b5"};
+ std::string full_history_ts_low;
+ PutFixed64(&full_history_ts_low, std::numeric_limits<uint64_t>::max());
+ {
+ // With a snapshot at seq 3, both the deletion marker and the key at 3 must
+ // be preserved.
+ AddSnapshot(3);
+ const std::vector<std::string> expected_keys = {
+ input_keys[0], input_keys[1], input_keys[2]};
+ const std::vector<std::string> expected_values = {"", "a2", "b5"};
+ RunTest(input_keys, input_values, expected_keys, expected_values,
+ /*last_committed_seq=*/kMaxSequenceNumber,
+ /*merge_operator=*/nullptr, /*compaction_filter=*/nullptr,
+ /*bottommost_level=*/false,
+ /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber,
+ /*key_not_exists_beyond_output_level=*/false, &full_history_ts_low);
+ ClearSnapshots();
+ }
+ {
+ // No snapshot.
+ const std::vector<std::string> expected_keys = {input_keys[2]};
+ const std::vector<std::string> expected_values = {"b5"};
+ RunTest(input_keys, input_values, expected_keys, expected_values,
+ /*last_committed_seq=*/kMaxSequenceNumber,
+ /*merge_operator=*/nullptr, /*compaction_filter=*/nullptr,
+ /*bottommost_level=*/false,
+ /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber,
+ /*key_not_exists_beyond_output_level=*/false, &full_history_ts_low);
+ }
+}
+
+INSTANTIATE_TEST_CASE_P(CompactionIteratorTsGcTestInstance,
+ CompactionIteratorTsGcTest,
+ testing::Values(true, false));
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/compaction/compaction_job.cc b/src/rocksdb/db/compaction/compaction_job.cc
new file mode 100644
index 000000000..1da1bcda8
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_job.cc
@@ -0,0 +1,2060 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/compaction/compaction_job.h"
+
+#include <algorithm>
+#include <cinttypes>
+#include <memory>
+#include <optional>
+#include <set>
+#include <utility>
+#include <vector>
+
+#include "db/blob/blob_counting_iterator.h"
+#include "db/blob/blob_file_addition.h"
+#include "db/blob/blob_file_builder.h"
+#include "db/builder.h"
+#include "db/compaction/clipping_iterator.h"
+#include "db/compaction/compaction_state.h"
+#include "db/db_impl/db_impl.h"
+#include "db/dbformat.h"
+#include "db/error_handler.h"
+#include "db/event_helpers.h"
+#include "db/history_trimming_iterator.h"
+#include "db/log_writer.h"
+#include "db/merge_helper.h"
+#include "db/range_del_aggregator.h"
+#include "db/version_edit.h"
+#include "db/version_set.h"
+#include "file/filename.h"
+#include "file/read_write_util.h"
+#include "file/sst_file_manager_impl.h"
+#include "file/writable_file_writer.h"
+#include "logging/log_buffer.h"
+#include "logging/logging.h"
+#include "monitoring/iostats_context_imp.h"
+#include "monitoring/thread_status_util.h"
+#include "options/configurable_helper.h"
+#include "options/options_helper.h"
+#include "port/port.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/options.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/status.h"
+#include "rocksdb/table.h"
+#include "rocksdb/utilities/options_type.h"
+#include "table/merging_iterator.h"
+#include "table/table_builder.h"
+#include "table/unique_id_impl.h"
+#include "test_util/sync_point.h"
+#include "util/stop_watch.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+const char* GetCompactionReasonString(CompactionReason compaction_reason) {
+ switch (compaction_reason) {
+ case CompactionReason::kUnknown:
+ return "Unknown";
+ case CompactionReason::kLevelL0FilesNum:
+ return "LevelL0FilesNum";
+ case CompactionReason::kLevelMaxLevelSize:
+ return "LevelMaxLevelSize";
+ case CompactionReason::kUniversalSizeAmplification:
+ return "UniversalSizeAmplification";
+ case CompactionReason::kUniversalSizeRatio:
+ return "UniversalSizeRatio";
+ case CompactionReason::kUniversalSortedRunNum:
+ return "UniversalSortedRunNum";
+ case CompactionReason::kFIFOMaxSize:
+ return "FIFOMaxSize";
+ case CompactionReason::kFIFOReduceNumFiles:
+ return "FIFOReduceNumFiles";
+ case CompactionReason::kFIFOTtl:
+ return "FIFOTtl";
+ case CompactionReason::kManualCompaction:
+ return "ManualCompaction";
+ case CompactionReason::kFilesMarkedForCompaction:
+ return "FilesMarkedForCompaction";
+ case CompactionReason::kBottommostFiles:
+ return "BottommostFiles";
+ case CompactionReason::kTtl:
+ return "Ttl";
+ case CompactionReason::kFlush:
+ return "Flush";
+ case CompactionReason::kExternalSstIngestion:
+ return "ExternalSstIngestion";
+ case CompactionReason::kPeriodicCompaction:
+ return "PeriodicCompaction";
+ case CompactionReason::kChangeTemperature:
+ return "ChangeTemperature";
+ case CompactionReason::kForcedBlobGC:
+ return "ForcedBlobGC";
+ case CompactionReason::kRoundRobinTtl:
+ return "RoundRobinTtl";
+ case CompactionReason::kNumOfReasons:
+ // fall through
+ default:
+ assert(false);
+ return "Invalid";
+ }
+}
+
+const char* GetCompactionPenultimateOutputRangeTypeString(
+ Compaction::PenultimateOutputRangeType range_type) {
+ switch (range_type) {
+ case Compaction::PenultimateOutputRangeType::kNotSupported:
+ return "NotSupported";
+ case Compaction::PenultimateOutputRangeType::kFullRange:
+ return "FullRange";
+ case Compaction::PenultimateOutputRangeType::kNonLastRange:
+ return "NonLastRange";
+ case Compaction::PenultimateOutputRangeType::kDisabled:
+ return "Disabled";
+ default:
+ assert(false);
+ return "Invalid";
+ }
+}
+
+CompactionJob::CompactionJob(
+ int job_id, Compaction* compaction, const ImmutableDBOptions& db_options,
+ const MutableDBOptions& mutable_db_options, const FileOptions& file_options,
+ VersionSet* versions, const std::atomic<bool>* shutting_down,
+ LogBuffer* log_buffer, FSDirectory* db_directory,
+ FSDirectory* output_directory, FSDirectory* blob_output_directory,
+ Statistics* stats, InstrumentedMutex* db_mutex,
+ ErrorHandler* db_error_handler,
+ std::vector<SequenceNumber> existing_snapshots,
+ SequenceNumber earliest_write_conflict_snapshot,
+ const SnapshotChecker* snapshot_checker, JobContext* job_context,
+ std::shared_ptr<Cache> table_cache, EventLogger* event_logger,
+ bool paranoid_file_checks, bool measure_io_stats, const std::string& dbname,
+ CompactionJobStats* compaction_job_stats, Env::Priority thread_pri,
+ const std::shared_ptr<IOTracer>& io_tracer,
+ const std::atomic<bool>& manual_compaction_canceled,
+ const std::string& db_id, const std::string& db_session_id,
+ std::string full_history_ts_low, std::string trim_ts,
+ BlobFileCompletionCallback* blob_callback, int* bg_compaction_scheduled,
+ int* bg_bottom_compaction_scheduled)
+ : compact_(new CompactionState(compaction)),
+ compaction_stats_(compaction->compaction_reason(), 1),
+ db_options_(db_options),
+ mutable_db_options_copy_(mutable_db_options),
+ log_buffer_(log_buffer),
+ output_directory_(output_directory),
+ stats_(stats),
+ bottommost_level_(false),
+ write_hint_(Env::WLTH_NOT_SET),
+ compaction_job_stats_(compaction_job_stats),
+ job_id_(job_id),
+ dbname_(dbname),
+ db_id_(db_id),
+ db_session_id_(db_session_id),
+ file_options_(file_options),
+ env_(db_options.env),
+ io_tracer_(io_tracer),
+ fs_(db_options.fs, io_tracer),
+ file_options_for_read_(
+ fs_->OptimizeForCompactionTableRead(file_options, db_options_)),
+ versions_(versions),
+ shutting_down_(shutting_down),
+ manual_compaction_canceled_(manual_compaction_canceled),
+ db_directory_(db_directory),
+ blob_output_directory_(blob_output_directory),
+ db_mutex_(db_mutex),
+ db_error_handler_(db_error_handler),
+ existing_snapshots_(std::move(existing_snapshots)),
+ earliest_write_conflict_snapshot_(earliest_write_conflict_snapshot),
+ snapshot_checker_(snapshot_checker),
+ job_context_(job_context),
+ table_cache_(std::move(table_cache)),
+ event_logger_(event_logger),
+ paranoid_file_checks_(paranoid_file_checks),
+ measure_io_stats_(measure_io_stats),
+ thread_pri_(thread_pri),
+ full_history_ts_low_(std::move(full_history_ts_low)),
+ trim_ts_(std::move(trim_ts)),
+ blob_callback_(blob_callback),
+ extra_num_subcompaction_threads_reserved_(0),
+ bg_compaction_scheduled_(bg_compaction_scheduled),
+ bg_bottom_compaction_scheduled_(bg_bottom_compaction_scheduled) {
+ assert(compaction_job_stats_ != nullptr);
+ assert(log_buffer_ != nullptr);
+
+ const auto* cfd = compact_->compaction->column_family_data();
+ ThreadStatusUtil::SetColumnFamily(cfd, cfd->ioptions()->env,
+ db_options_.enable_thread_tracking);
+ ThreadStatusUtil::SetThreadOperation(ThreadStatus::OP_COMPACTION);
+ ReportStartedCompaction(compaction);
+}
+
+CompactionJob::~CompactionJob() {
+ assert(compact_ == nullptr);
+ ThreadStatusUtil::ResetThreadStatus();
+}
+
+void CompactionJob::ReportStartedCompaction(Compaction* compaction) {
+ const auto* cfd = compact_->compaction->column_family_data();
+ ThreadStatusUtil::SetColumnFamily(cfd, cfd->ioptions()->env,
+ db_options_.enable_thread_tracking);
+
+ ThreadStatusUtil::SetThreadOperationProperty(ThreadStatus::COMPACTION_JOB_ID,
+ job_id_);
+
+ ThreadStatusUtil::SetThreadOperationProperty(
+ ThreadStatus::COMPACTION_INPUT_OUTPUT_LEVEL,
+ (static_cast<uint64_t>(compact_->compaction->start_level()) << 32) +
+ compact_->compaction->output_level());
+
+ // In the current design, a CompactionJob is always created
+ // for non-trivial compaction.
+ assert(compaction->IsTrivialMove() == false ||
+ compaction->is_manual_compaction() == true);
+
+ ThreadStatusUtil::SetThreadOperationProperty(
+ ThreadStatus::COMPACTION_PROP_FLAGS,
+ compaction->is_manual_compaction() +
+ (compaction->deletion_compaction() << 1));
+
+ ThreadStatusUtil::SetThreadOperationProperty(
+ ThreadStatus::COMPACTION_TOTAL_INPUT_BYTES,
+ compaction->CalculateTotalInputSize());
+
+ IOSTATS_RESET(bytes_written);
+ IOSTATS_RESET(bytes_read);
+ ThreadStatusUtil::SetThreadOperationProperty(
+ ThreadStatus::COMPACTION_BYTES_WRITTEN, 0);
+ ThreadStatusUtil::SetThreadOperationProperty(
+ ThreadStatus::COMPACTION_BYTES_READ, 0);
+
+ // Set the thread operation after operation properties
+ // to ensure GetThreadList() can always show them all together.
+ ThreadStatusUtil::SetThreadOperation(ThreadStatus::OP_COMPACTION);
+
+ compaction_job_stats_->is_manual_compaction =
+ compaction->is_manual_compaction();
+ compaction_job_stats_->is_full_compaction = compaction->is_full_compaction();
+}
+
+void CompactionJob::Prepare() {
+ AutoThreadOperationStageUpdater stage_updater(
+ ThreadStatus::STAGE_COMPACTION_PREPARE);
+
+ // Generate file_levels_ for compaction before making Iterator
+ auto* c = compact_->compaction;
+ ColumnFamilyData* cfd = c->column_family_data();
+ assert(cfd != nullptr);
+ assert(cfd->current()->storage_info()->NumLevelFiles(
+ compact_->compaction->level()) > 0);
+
+ write_hint_ = cfd->CalculateSSTWriteHint(c->output_level());
+ bottommost_level_ = c->bottommost_level();
+
+ if (c->ShouldFormSubcompactions()) {
+ StopWatch sw(db_options_.clock, stats_, SUBCOMPACTION_SETUP_TIME);
+ GenSubcompactionBoundaries();
+ }
+ if (boundaries_.size() > 1) {
+ for (size_t i = 0; i <= boundaries_.size(); i++) {
+ compact_->sub_compact_states.emplace_back(
+ c, (i != 0) ? std::optional<Slice>(boundaries_[i - 1]) : std::nullopt,
+ (i != boundaries_.size()) ? std::optional<Slice>(boundaries_[i])
+ : std::nullopt,
+ static_cast<uint32_t>(i));
+      // Assert that the boundaries do not have the same user keys (ignoring
+      // the timestamp part).
+ assert(i == 0 || i == boundaries_.size() ||
+ cfd->user_comparator()->CompareWithoutTimestamp(
+ boundaries_[i - 1], boundaries_[i]) < 0);
+ }
+ RecordInHistogram(stats_, NUM_SUBCOMPACTIONS_SCHEDULED,
+ compact_->sub_compact_states.size());
+ } else {
+ compact_->sub_compact_states.emplace_back(c, std::nullopt, std::nullopt,
+ /*sub_job_id*/ 0);
+ }
+
+  // Collect all seqno->time information from the input files, which will be
+  // used to encode seqno->time in the output files.
+ uint64_t preserve_time_duration =
+ std::max(c->immutable_options()->preserve_internal_time_seconds,
+ c->immutable_options()->preclude_last_level_data_seconds);
+
+ if (preserve_time_duration > 0) {
+ // setup seqno_time_mapping_
+ seqno_time_mapping_.SetMaxTimeDuration(preserve_time_duration);
+ for (const auto& each_level : *c->inputs()) {
+ for (const auto& fmd : each_level.files) {
+ std::shared_ptr<const TableProperties> tp;
+ Status s = cfd->current()->GetTableProperties(&tp, fmd, nullptr);
+ if (s.ok()) {
+ seqno_time_mapping_.Add(tp->seqno_to_time_mapping)
+ .PermitUncheckedError();
+ seqno_time_mapping_.Add(fmd->fd.smallest_seqno,
+ fmd->oldest_ancester_time);
+ }
+ }
+ }
+
+ auto status = seqno_time_mapping_.Sort();
+ if (!status.ok()) {
+ ROCKS_LOG_WARN(db_options_.info_log,
+ "Invalid sequence number to time mapping: Status: %s",
+ status.ToString().c_str());
+ }
+ int64_t _current_time = 0;
+ status = db_options_.clock->GetCurrentTime(&_current_time);
+ if (!status.ok()) {
+ ROCKS_LOG_WARN(db_options_.info_log,
+ "Failed to get current time in compaction: Status: %s",
+ status.ToString().c_str());
+ // preserve all time information
+ preserve_time_min_seqno_ = 0;
+ preclude_last_level_min_seqno_ = 0;
+ } else {
+ seqno_time_mapping_.TruncateOldEntries(_current_time);
+ uint64_t preserve_time =
+ static_cast<uint64_t>(_current_time) > preserve_time_duration
+ ? _current_time - preserve_time_duration
+ : 0;
+ preserve_time_min_seqno_ =
+ seqno_time_mapping_.GetOldestSequenceNum(preserve_time);
+ if (c->immutable_options()->preclude_last_level_data_seconds > 0) {
+ uint64_t preclude_last_level_time =
+ static_cast<uint64_t>(_current_time) >
+ c->immutable_options()->preclude_last_level_data_seconds
+ ? _current_time -
+ c->immutable_options()->preclude_last_level_data_seconds
+ : 0;
+ preclude_last_level_min_seqno_ =
+ seqno_time_mapping_.GetOldestSequenceNum(preclude_last_level_time);
+ }
+ }
+ }
+}
+
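+// Returns the subcompaction limit for this job: the configured
+// max_subcompactions (at least 1) plus any extra subcompaction threads
+// already reserved for this job.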
+uint64_t CompactionJob::GetSubcompactionsLimit() {
+ return extra_num_subcompaction_threads_reserved_ +
+ std::max(
+ std::uint64_t(1),
+ static_cast<uint64_t>(compact_->compaction->max_subcompactions()));
+}
+
+void CompactionJob::AcquireSubcompactionResources(
+ int num_extra_required_subcompactions) {
+ TEST_SYNC_POINT("CompactionJob::AcquireSubcompactionResources:0");
+ TEST_SYNC_POINT("CompactionJob::AcquireSubcompactionResources:1");
+ int max_db_compactions =
+ DBImpl::GetBGJobLimits(
+ mutable_db_options_copy_.max_background_flushes,
+ mutable_db_options_copy_.max_background_compactions,
+ mutable_db_options_copy_.max_background_jobs,
+ versions_->GetColumnFamilySet()
+ ->write_controller()
+ ->NeedSpeedupCompaction())
+ .max_compactions;
+ InstrumentedMutexLock l(db_mutex_);
+  // Apply the min function first since we need to compute the number of extra
+  // subcompactions against the compaction limits, and then try to reserve
+  // threads for the extra subcompactions. The actual number of reserved
+  // threads could be less than the desired number.
+ int available_bg_compactions_against_db_limit =
+ std::max(max_db_compactions - *bg_compaction_scheduled_ -
+ *bg_bottom_compaction_scheduled_,
+ 0);
+  // Reservation only supports background threads whose priority is between
+  // BOTTOM and HIGH. The priority needs to be degraded to HIGH if the original
+  // thread_pri_ is higher than that. Similar to ReleaseThreads().
+ extra_num_subcompaction_threads_reserved_ =
+ env_->ReserveThreads(std::min(num_extra_required_subcompactions,
+ available_bg_compactions_against_db_limit),
+ std::min(thread_pri_, Env::Priority::HIGH));
+
+ // Update bg_compaction_scheduled_ or bg_bottom_compaction_scheduled_
+  // depending on whether this compaction has the bottommost priority
+ if (thread_pri_ == Env::Priority::BOTTOM) {
+ *bg_bottom_compaction_scheduled_ +=
+ extra_num_subcompaction_threads_reserved_;
+ } else {
+ *bg_compaction_scheduled_ += extra_num_subcompaction_threads_reserved_;
+ }
+}
+
+void CompactionJob::ShrinkSubcompactionResources(uint64_t num_extra_resources) {
+ // Do nothing when we have zero resources to shrink
+ if (num_extra_resources == 0) return;
+ db_mutex_->Lock();
+  // We cannot release more threads than we reserved before
+ int extra_num_subcompaction_threads_released = env_->ReleaseThreads(
+ (int)num_extra_resources, std::min(thread_pri_, Env::Priority::HIGH));
+ // Update the number of reserved threads and the number of background
+ // scheduled compactions for this compaction job
+ extra_num_subcompaction_threads_reserved_ -=
+ extra_num_subcompaction_threads_released;
+ // TODO (zichen): design a test case with new subcompaction partitioning
+ // when the number of actual partitions is less than the number of planned
+ // partitions
+ assert(extra_num_subcompaction_threads_released == (int)num_extra_resources);
+ // Update bg_compaction_scheduled_ or bg_bottom_compaction_scheduled_
+  // depending on whether this compaction has the bottommost priority
+ if (thread_pri_ == Env::Priority::BOTTOM) {
+ *bg_bottom_compaction_scheduled_ -=
+ extra_num_subcompaction_threads_released;
+ } else {
+ *bg_compaction_scheduled_ -= extra_num_subcompaction_threads_released;
+ }
+ db_mutex_->Unlock();
+ TEST_SYNC_POINT("CompactionJob::ShrinkSubcompactionResources:0");
+}
+
+void CompactionJob::ReleaseSubcompactionResources() {
+ if (extra_num_subcompaction_threads_reserved_ == 0) {
+ return;
+ }
+ {
+ InstrumentedMutexLock l(db_mutex_);
+    // The number of reserved threads becomes larger than 0 only if the
+    // compaction priority is round-robin and there are not enough
+    // subcompactions available
+
+    // The number of scheduled compactions must be at least 1 plus the number
+    // of extra subcompactions using acquired resources, since this compaction
+    // job has not finished yet
+ assert(*bg_bottom_compaction_scheduled_ >=
+ 1 + extra_num_subcompaction_threads_reserved_ ||
+ *bg_compaction_scheduled_ >=
+ 1 + extra_num_subcompaction_threads_reserved_);
+ }
+ ShrinkSubcompactionResources(extra_num_subcompaction_threads_reserved_);
+}
+
+struct RangeWithSize {
+ Range range;
+ uint64_t size;
+
+ RangeWithSize(const Slice& a, const Slice& b, uint64_t s = 0)
+ : range(a, b), size(s) {}
+};
+
+void CompactionJob::GenSubcompactionBoundaries() {
+ // The goal is to find some boundary keys so that we can evenly partition
+ // the compaction input data into max_subcompactions ranges.
+  // For every input file, we ask the TableReader to estimate 128 anchor points
+  // that evenly partition the input file into 128 ranges, along with the range
+  // sizes. This can be calculated by scanning the index blocks of the file.
+  // Once we have the anchor points for all the input files, we merge them
+  // together and try to find keys that divide the ranges evenly.
+  // For example, if we have two input files, and each returns the following
+  // ranges:
+ // File1: (a1, 1000), (b1, 1200), (c1, 1100)
+ // File2: (a2, 1100), (b2, 1000), (c2, 1000)
+  // We sort all the keys globally as follows:
+ // (a1, 1000), (a2, 1100), (b1, 1200), (b2, 1000), (c1, 1100), (c2, 1000)
+ // We calculate the total size by adding up all ranges' size, which is 6400.
+ // If we would like to partition into 2 subcompactions, the target of the
+ // range size is 3200. Based on the size, we take "b1" as the partition key
+ // since the first three ranges would hit 3200.
+ //
+ // Note that the ranges are actually overlapping. For example, in the example
+ // above, the range ending with "b1" is overlapping with the range ending with
+ // "b2". So the size 1000+1100+1200 is an underestimation of data size up to
+ // "b1". In extreme cases where we only compact N L0 files, a range can
+ // overlap with N-1 other ranges. Since we requested a relatively large number
+  // (128) of ranges from each input file, even N-way range overlap would
+  // cause only a relatively small inaccuracy.
+
+ auto* c = compact_->compaction;
+ if (c->max_subcompactions() <= 1 &&
+ !(c->immutable_options()->compaction_pri == kRoundRobin &&
+ c->immutable_options()->compaction_style == kCompactionStyleLevel)) {
+ return;
+ }
+ auto* cfd = c->column_family_data();
+ const Comparator* cfd_comparator = cfd->user_comparator();
+ const InternalKeyComparator& icomp = cfd->internal_comparator();
+
+ auto* v = compact_->compaction->input_version();
+ int base_level = v->storage_info()->base_level();
+ InstrumentedMutexUnlock unlock_guard(db_mutex_);
+
+ uint64_t total_size = 0;
+ std::vector<TableReader::Anchor> all_anchors;
+ int start_lvl = c->start_level();
+ int out_lvl = c->output_level();
+
+ for (size_t lvl_idx = 0; lvl_idx < c->num_input_levels(); lvl_idx++) {
+ int lvl = c->level(lvl_idx);
+ if (lvl >= start_lvl && lvl <= out_lvl) {
+ const LevelFilesBrief* flevel = c->input_levels(lvl_idx);
+ size_t num_files = flevel->num_files;
+
+ if (num_files == 0) {
+ continue;
+ }
+
+ for (size_t i = 0; i < num_files; i++) {
+ FileMetaData* f = flevel->files[i].file_metadata;
+ std::vector<TableReader::Anchor> my_anchors;
+ Status s = cfd->table_cache()->ApproximateKeyAnchors(
+ ReadOptions(), icomp, *f, my_anchors);
+ if (!s.ok() || my_anchors.empty()) {
+ my_anchors.emplace_back(f->largest.user_key(), f->fd.GetFileSize());
+ }
+ for (auto& ac : my_anchors) {
+          // Can be optimized to avoid this loop.
+ total_size += ac.range_size;
+ }
+
+ all_anchors.insert(all_anchors.end(), my_anchors.begin(),
+ my_anchors.end());
+ }
+ }
+ }
+  // Here we sort all the anchor points across all files and go through them
+  // in sorted order to find the partitioning boundaries.
+  // This is not the most efficient implementation. A much more efficient
+  // algorithm probably exists, but it would be more complex. If performance
+  // turns out to be a problem, we can optimize.
+ std::sort(
+ all_anchors.begin(), all_anchors.end(),
+ [cfd_comparator](TableReader::Anchor& a, TableReader::Anchor& b) -> bool {
+ return cfd_comparator->CompareWithoutTimestamp(a.user_key, b.user_key) <
+ 0;
+ });
+
+ // Remove duplicated entries from boundaries.
+ all_anchors.erase(
+ std::unique(all_anchors.begin(), all_anchors.end(),
+ [cfd_comparator](TableReader::Anchor& a,
+ TableReader::Anchor& b) -> bool {
+ return cfd_comparator->CompareWithoutTimestamp(
+ a.user_key, b.user_key) == 0;
+ }),
+ all_anchors.end());
+
+  // Get the number of planned subcompactions; this may reserve extra threads
+  // and update extra_num_subcompaction_threads_reserved_ for round-robin
+  // priority
+ uint64_t num_planned_subcompactions;
+ if (c->immutable_options()->compaction_pri == kRoundRobin &&
+ c->immutable_options()->compaction_style == kCompactionStyleLevel) {
+    // For round-robin compaction priority, we need to employ more
+    // subcompactions (which may exceed the max_subcompactions limit). The
+    // extra subcompactions will be executed using reserved threads and counted
+    // towards bg_compaction_scheduled or bg_bottom_compaction_scheduled.
+
+ // Initialized by the number of input files
+ num_planned_subcompactions = static_cast<uint64_t>(c->num_input_files(0));
+ uint64_t max_subcompactions_limit = GetSubcompactionsLimit();
+ if (max_subcompactions_limit < num_planned_subcompactions) {
+      // Assert that the two pointers are not null so that we can count the
+      // extra subcompactions against the db compaction limits
+ assert(bg_bottom_compaction_scheduled_ != nullptr);
+ assert(bg_compaction_scheduled_ != nullptr);
+      // Reserve resources when max_subcompactions is not sufficient
+ AcquireSubcompactionResources(
+ (int)(num_planned_subcompactions - max_subcompactions_limit));
+      // The subcompaction limit changes after acquiring additional resources;
+      // GetSubcompactionsLimit() needs to be called again to update the number
+      // of planned subcompactions
+ num_planned_subcompactions =
+ std::min(num_planned_subcompactions, GetSubcompactionsLimit());
+ } else {
+ num_planned_subcompactions = max_subcompactions_limit;
+ }
+ } else {
+ num_planned_subcompactions = GetSubcompactionsLimit();
+ }
+
+ TEST_SYNC_POINT_CALLBACK("CompactionJob::GenSubcompactionBoundaries:0",
+ &num_planned_subcompactions);
+ if (num_planned_subcompactions == 1) return;
+
+ // Group the ranges into subcompactions
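+  // The per-range target size is the larger of an even split of the total
+  // estimated input size and the maximum output file size for the output
+  // level, so no subcompaction targets less than one full output file.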
+ uint64_t target_range_size = std::max(
+ total_size / num_planned_subcompactions,
+ MaxFileSizeForLevel(
+ *(c->mutable_cf_options()), out_lvl,
+ c->immutable_options()->compaction_style, base_level,
+ c->immutable_options()->level_compaction_dynamic_level_bytes));
+
+ if (target_range_size >= total_size) {
+ return;
+ }
+
+ uint64_t next_threshold = target_range_size;
+ uint64_t cumulative_size = 0;
+ uint64_t num_actual_subcompactions = 1U;
+ for (TableReader::Anchor& anchor : all_anchors) {
+ cumulative_size += anchor.range_size;
+ if (cumulative_size > next_threshold) {
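+      // Crossed the current size threshold: record this anchor as a
+      // subcompaction boundary and advance the threshold by one target range.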
+ next_threshold += target_range_size;
+ num_actual_subcompactions++;
+ boundaries_.push_back(anchor.user_key);
+ }
+ if (num_actual_subcompactions == num_planned_subcompactions) {
+ break;
+ }
+ }
+ TEST_SYNC_POINT_CALLBACK("CompactionJob::GenSubcompactionBoundaries:1",
+ &num_actual_subcompactions);
+  // Shrink extra subcompaction resources when extra resources were acquired
+ ShrinkSubcompactionResources(
+ std::min((int)(num_planned_subcompactions - num_actual_subcompactions),
+ extra_num_subcompaction_threads_reserved_));
+}
+
+Status CompactionJob::Run() {
+ AutoThreadOperationStageUpdater stage_updater(
+ ThreadStatus::STAGE_COMPACTION_RUN);
+ TEST_SYNC_POINT("CompactionJob::Run():Start");
+ log_buffer_->FlushBufferToLog();
+ LogCompaction();
+
+ const size_t num_threads = compact_->sub_compact_states.size();
+ assert(num_threads > 0);
+ const uint64_t start_micros = db_options_.clock->NowMicros();
+
+ // Launch a thread for each of subcompactions 1...num_threads-1
+ std::vector<port::Thread> thread_pool;
+ thread_pool.reserve(num_threads - 1);
+ for (size_t i = 1; i < compact_->sub_compact_states.size(); i++) {
+ thread_pool.emplace_back(&CompactionJob::ProcessKeyValueCompaction, this,
+ &compact_->sub_compact_states[i]);
+ }
+
+ // Always schedule the first subcompaction (whether or not there are also
+ // others) in the current thread to be efficient with resources
+ ProcessKeyValueCompaction(&compact_->sub_compact_states[0]);
+
+ // Wait for all other threads (if there are any) to finish execution
+ for (auto& thread : thread_pool) {
+ thread.join();
+ }
+
+ compaction_stats_.SetMicros(db_options_.clock->NowMicros() - start_micros);
+
+ for (auto& state : compact_->sub_compact_states) {
+ compaction_stats_.AddCpuMicros(state.compaction_job_stats.cpu_micros);
+ state.RemoveLastEmptyOutput();
+ }
+
+ RecordTimeToHistogram(stats_, COMPACTION_TIME,
+ compaction_stats_.stats.micros);
+ RecordTimeToHistogram(stats_, COMPACTION_CPU_TIME,
+ compaction_stats_.stats.cpu_micros);
+
+ TEST_SYNC_POINT("CompactionJob::Run:BeforeVerify");
+
+ // Check if any thread encountered an error during execution
+ Status status;
+ IOStatus io_s;
+ bool wrote_new_blob_files = false;
+
+ for (const auto& state : compact_->sub_compact_states) {
+ if (!state.status.ok()) {
+ status = state.status;
+ io_s = state.io_status;
+ break;
+ }
+
+ if (state.Current().HasBlobFileAdditions()) {
+ wrote_new_blob_files = true;
+ }
+ }
+
+ if (io_status_.ok()) {
+ io_status_ = io_s;
+ }
+ if (status.ok()) {
+ constexpr IODebugContext* dbg = nullptr;
+
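+    // Fsync the output directory (and the blob directory below, if distinct)
+    // so that the newly created files are durable before results are
+    // installed.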
+ if (output_directory_) {
+ io_s = output_directory_->FsyncWithDirOptions(
+ IOOptions(), dbg,
+ DirFsyncOptions(DirFsyncOptions::FsyncReason::kNewFileSynced));
+ }
+
+ if (io_s.ok() && wrote_new_blob_files && blob_output_directory_ &&
+ blob_output_directory_ != output_directory_) {
+ io_s = blob_output_directory_->FsyncWithDirOptions(
+ IOOptions(), dbg,
+ DirFsyncOptions(DirFsyncOptions::FsyncReason::kNewFileSynced));
+ }
+ }
+ if (io_status_.ok()) {
+ io_status_ = io_s;
+ }
+ if (status.ok()) {
+ status = io_s;
+ }
+ if (status.ok()) {
+ thread_pool.clear();
+ std::vector<const CompactionOutputs::Output*> files_output;
+ for (const auto& state : compact_->sub_compact_states) {
+ for (const auto& output : state.GetOutputs()) {
+ files_output.emplace_back(&output);
+ }
+ }
+ ColumnFamilyData* cfd = compact_->compaction->column_family_data();
+ auto& prefix_extractor =
+ compact_->compaction->mutable_cf_options()->prefix_extractor;
+ std::atomic<size_t> next_file_idx(0);
+ auto verify_table = [&](Status& output_status) {
+ while (true) {
+ size_t file_idx = next_file_idx.fetch_add(1);
+ if (file_idx >= files_output.size()) {
+ break;
+ }
+        // Verify that the table is usable.
+        // We set for_compaction to false and don't call
+        // OptimizeForCompactionTableRead here because this is a special case
+        // after we finish building the table. No matter whether
+        // use_direct_io_for_flush_and_compaction is true, we regard this
+        // verification as user reads since the goal is to cache the table here
+        // for further user reads.
+ ReadOptions read_options;
+ InternalIterator* iter = cfd->table_cache()->NewIterator(
+ read_options, file_options_, cfd->internal_comparator(),
+ files_output[file_idx]->meta, /*range_del_agg=*/nullptr,
+ prefix_extractor,
+ /*table_reader_ptr=*/nullptr,
+ cfd->internal_stats()->GetFileReadHist(
+ compact_->compaction->output_level()),
+ TableReaderCaller::kCompactionRefill, /*arena=*/nullptr,
+ /*skip_filters=*/false, compact_->compaction->output_level(),
+ MaxFileSizeForL0MetaPin(
+ *compact_->compaction->mutable_cf_options()),
+ /*smallest_compaction_key=*/nullptr,
+ /*largest_compaction_key=*/nullptr,
+ /*allow_unprepared_value=*/false);
+ auto s = iter->status();
+
+ if (s.ok() && paranoid_file_checks_) {
+ OutputValidator validator(cfd->internal_comparator(),
+ /*_enable_order_check=*/true,
+ /*_enable_hash=*/true);
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ s = validator.Add(iter->key(), iter->value());
+ if (!s.ok()) {
+ break;
+ }
+ }
+ if (s.ok()) {
+ s = iter->status();
+ }
+ if (s.ok() &&
+ !validator.CompareValidator(files_output[file_idx]->validator)) {
+ s = Status::Corruption("Paranoid checksums do not match");
+ }
+ }
+
+ delete iter;
+
+ if (!s.ok()) {
+ output_status = s;
+ break;
+ }
+ }
+ };
+ for (size_t i = 1; i < compact_->sub_compact_states.size(); i++) {
+ thread_pool.emplace_back(
+ verify_table, std::ref(compact_->sub_compact_states[i].status));
+ }
+ verify_table(compact_->sub_compact_states[0].status);
+ for (auto& thread : thread_pool) {
+ thread.join();
+ }
+
+ for (const auto& state : compact_->sub_compact_states) {
+ if (!state.status.ok()) {
+ status = state.status;
+ break;
+ }
+ }
+ }
+
+ ReleaseSubcompactionResources();
+ TEST_SYNC_POINT("CompactionJob::ReleaseSubcompactionResources:0");
+ TEST_SYNC_POINT("CompactionJob::ReleaseSubcompactionResources:1");
+
+ TablePropertiesCollection tp;
+ for (const auto& state : compact_->sub_compact_states) {
+ for (const auto& output : state.GetOutputs()) {
+ auto fn =
+ TableFileName(state.compaction->immutable_options()->cf_paths,
+ output.meta.fd.GetNumber(), output.meta.fd.GetPathId());
+ tp[fn] = output.table_properties;
+ }
+ }
+ compact_->compaction->SetOutputTableProperties(std::move(tp));
+
+ // Finish up all book-keeping to unify the subcompaction results
+ compact_->AggregateCompactionStats(compaction_stats_, *compaction_job_stats_);
+ UpdateCompactionStats();
+
+ RecordCompactionIOStats();
+ LogFlush(db_options_.info_log);
+ TEST_SYNC_POINT("CompactionJob::Run():End");
+
+ compact_->status = status;
+ return status;
+}
+
+Status CompactionJob::Install(const MutableCFOptions& mutable_cf_options) {
+ assert(compact_);
+
+ AutoThreadOperationStageUpdater stage_updater(
+ ThreadStatus::STAGE_COMPACTION_INSTALL);
+ db_mutex_->AssertHeld();
+ Status status = compact_->status;
+
+ ColumnFamilyData* cfd = compact_->compaction->column_family_data();
+ assert(cfd);
+
+ int output_level = compact_->compaction->output_level();
+ cfd->internal_stats()->AddCompactionStats(output_level, thread_pri_,
+ compaction_stats_);
+
+ if (status.ok()) {
+ status = InstallCompactionResults(mutable_cf_options);
+ }
+ if (!versions_->io_status().ok()) {
+ io_status_ = versions_->io_status();
+ }
+
+ VersionStorageInfo::LevelSummaryStorage tmp;
+ auto vstorage = cfd->current()->storage_info();
+ const auto& stats = compaction_stats_.stats;
+
+ double read_write_amp = 0.0;
+ double write_amp = 0.0;
+ double bytes_read_per_sec = 0;
+ double bytes_written_per_sec = 0;
+
+ const uint64_t bytes_read_non_output_and_blob =
+ stats.bytes_read_non_output_levels + stats.bytes_read_blob;
+ const uint64_t bytes_read_all =
+ stats.bytes_read_output_level + bytes_read_non_output_and_blob;
+ const uint64_t bytes_written_all =
+ stats.bytes_written + stats.bytes_written_blob;
+
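+  // read-write-amplify = (all bytes written + all bytes read) / bytes read
+  // from non-output levels and blob files; write-amplify uses only the bytes
+  // written in the numerator.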
+ if (bytes_read_non_output_and_blob > 0) {
+ read_write_amp = (bytes_written_all + bytes_read_all) /
+ static_cast<double>(bytes_read_non_output_and_blob);
+ write_amp =
+ bytes_written_all / static_cast<double>(bytes_read_non_output_and_blob);
+ }
+ if (stats.micros > 0) {
+ bytes_read_per_sec = bytes_read_all / static_cast<double>(stats.micros);
+ bytes_written_per_sec =
+ bytes_written_all / static_cast<double>(stats.micros);
+ }
+
+ const std::string& column_family_name = cfd->GetName();
+
+ constexpr double kMB = 1048576.0;
+
+ ROCKS_LOG_BUFFER(
+ log_buffer_,
+ "[%s] compacted to: %s, MB/sec: %.1f rd, %.1f wr, level %d, "
+ "files in(%d, %d) out(%d +%d blob) "
+ "MB in(%.1f, %.1f +%.1f blob) out(%.1f +%.1f blob), "
+ "read-write-amplify(%.1f) write-amplify(%.1f) %s, records in: %" PRIu64
+ ", records dropped: %" PRIu64 " output_compression: %s\n",
+ column_family_name.c_str(), vstorage->LevelSummary(&tmp),
+ bytes_read_per_sec, bytes_written_per_sec,
+ compact_->compaction->output_level(),
+ stats.num_input_files_in_non_output_levels,
+ stats.num_input_files_in_output_level, stats.num_output_files,
+ stats.num_output_files_blob, stats.bytes_read_non_output_levels / kMB,
+ stats.bytes_read_output_level / kMB, stats.bytes_read_blob / kMB,
+ stats.bytes_written / kMB, stats.bytes_written_blob / kMB, read_write_amp,
+ write_amp, status.ToString().c_str(), stats.num_input_records,
+ stats.num_dropped_records,
+ CompressionTypeToString(compact_->compaction->output_compression())
+ .c_str());
+
+ const auto& blob_files = vstorage->GetBlobFiles();
+ if (!blob_files.empty()) {
+ assert(blob_files.front());
+ assert(blob_files.back());
+
+ ROCKS_LOG_BUFFER(
+ log_buffer_,
+ "[%s] Blob file summary: head=%" PRIu64 ", tail=%" PRIu64 "\n",
+ column_family_name.c_str(), blob_files.front()->GetBlobFileNumber(),
+ blob_files.back()->GetBlobFileNumber());
+ }
+
+ if (compaction_stats_.has_penultimate_level_output) {
+ ROCKS_LOG_BUFFER(
+ log_buffer_,
+ "[%s] has Penultimate Level output: %" PRIu64
+ ", level %d, number of files: %" PRIu64 ", number of records: %" PRIu64,
+ column_family_name.c_str(),
+ compaction_stats_.penultimate_level_stats.bytes_written,
+ compact_->compaction->GetPenultimateLevel(),
+ compaction_stats_.penultimate_level_stats.num_output_files,
+ compaction_stats_.penultimate_level_stats.num_output_records);
+ }
+
+ UpdateCompactionJobStats(stats);
+
+ auto stream = event_logger_->LogToBuffer(log_buffer_, 8192);
+ stream << "job" << job_id_ << "event"
+ << "compaction_finished"
+ << "compaction_time_micros" << stats.micros
+ << "compaction_time_cpu_micros" << stats.cpu_micros << "output_level"
+ << compact_->compaction->output_level() << "num_output_files"
+ << stats.num_output_files << "total_output_size"
+ << stats.bytes_written;
+
+ if (stats.num_output_files_blob > 0) {
+ stream << "num_blob_output_files" << stats.num_output_files_blob
+ << "total_blob_output_size" << stats.bytes_written_blob;
+ }
+
+ stream << "num_input_records" << stats.num_input_records
+ << "num_output_records" << stats.num_output_records
+ << "num_subcompactions" << compact_->sub_compact_states.size()
+ << "output_compression"
+ << CompressionTypeToString(compact_->compaction->output_compression());
+
+ stream << "num_single_delete_mismatches"
+ << compaction_job_stats_->num_single_del_mismatch;
+ stream << "num_single_delete_fallthrough"
+ << compaction_job_stats_->num_single_del_fallthru;
+
+ if (measure_io_stats_) {
+ stream << "file_write_nanos" << compaction_job_stats_->file_write_nanos;
+ stream << "file_range_sync_nanos"
+ << compaction_job_stats_->file_range_sync_nanos;
+ stream << "file_fsync_nanos" << compaction_job_stats_->file_fsync_nanos;
+ stream << "file_prepare_write_nanos"
+ << compaction_job_stats_->file_prepare_write_nanos;
+ }
+
+ stream << "lsm_state";
+ stream.StartArray();
+ for (int level = 0; level < vstorage->num_levels(); ++level) {
+ stream << vstorage->NumLevelFiles(level);
+ }
+ stream.EndArray();
+
+ if (!blob_files.empty()) {
+ assert(blob_files.front());
+ stream << "blob_file_head" << blob_files.front()->GetBlobFileNumber();
+
+ assert(blob_files.back());
+ stream << "blob_file_tail" << blob_files.back()->GetBlobFileNumber();
+ }
+
+ if (compaction_stats_.has_penultimate_level_output) {
+ InternalStats::CompactionStats& pl_stats =
+ compaction_stats_.penultimate_level_stats;
+ stream << "penultimate_level_num_output_files" << pl_stats.num_output_files;
+ stream << "penultimate_level_bytes_written" << pl_stats.bytes_written;
+ stream << "penultimate_level_num_output_records"
+ << pl_stats.num_output_records;
+ stream << "penultimate_level_num_output_files_blob"
+ << pl_stats.num_output_files_blob;
+ stream << "penultimate_level_bytes_written_blob"
+ << pl_stats.bytes_written_blob;
+ }
+
+ CleanupCompaction();
+ return status;
+}
+
+void CompactionJob::NotifyOnSubcompactionBegin(
+ SubcompactionState* sub_compact) {
+#ifndef ROCKSDB_LITE
+ Compaction* c = compact_->compaction;
+
+ if (db_options_.listeners.empty()) {
+ return;
+ }
+ if (shutting_down_->load(std::memory_order_acquire)) {
+ return;
+ }
+ if (c->is_manual_compaction() &&
+ manual_compaction_canceled_.load(std::memory_order_acquire)) {
+ return;
+ }
+
+ sub_compact->notify_on_subcompaction_completion = true;
+
+ SubcompactionJobInfo info{};
+ sub_compact->BuildSubcompactionJobInfo(info);
+ info.job_id = static_cast<int>(job_id_);
+ info.thread_id = env_->GetThreadID();
+
+ for (const auto& listener : db_options_.listeners) {
+ listener->OnSubcompactionBegin(info);
+ }
+ info.status.PermitUncheckedError();
+
+#else
+ (void)sub_compact;
+#endif // ROCKSDB_LITE
+}
+
+void CompactionJob::NotifyOnSubcompactionCompleted(
+ SubcompactionState* sub_compact) {
+#ifndef ROCKSDB_LITE
+
+ if (db_options_.listeners.empty()) {
+ return;
+ }
+ if (shutting_down_->load(std::memory_order_acquire)) {
+ return;
+ }
+
+ if (sub_compact->notify_on_subcompaction_completion == false) {
+ return;
+ }
+
+ SubcompactionJobInfo info{};
+ sub_compact->BuildSubcompactionJobInfo(info);
+ info.job_id = static_cast<int>(job_id_);
+ info.thread_id = env_->GetThreadID();
+
+ for (const auto& listener : db_options_.listeners) {
+ listener->OnSubcompactionCompleted(info);
+ }
+#else
+ (void)sub_compact;
+#endif // ROCKSDB_LITE
+}
+
+void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
+ assert(sub_compact);
+ assert(sub_compact->compaction);
+
+#ifndef ROCKSDB_LITE
+ if (db_options_.compaction_service) {
+ CompactionServiceJobStatus comp_status =
+ ProcessKeyValueCompactionWithCompactionService(sub_compact);
+ if (comp_status == CompactionServiceJobStatus::kSuccess ||
+ comp_status == CompactionServiceJobStatus::kFailure) {
+ return;
+ }
+ // fallback to local compaction
+ assert(comp_status == CompactionServiceJobStatus::kUseLocal);
+ }
+#endif // !ROCKSDB_LITE
+
+ uint64_t prev_cpu_micros = db_options_.clock->CPUMicros();
+
+ ColumnFamilyData* cfd = sub_compact->compaction->column_family_data();
+
+  // Create the compaction filter and fail the compaction if
+  // IgnoreSnapshots() == false, because that is not supported anymore
+ const CompactionFilter* compaction_filter =
+ cfd->ioptions()->compaction_filter;
+ std::unique_ptr<CompactionFilter> compaction_filter_from_factory = nullptr;
+ if (compaction_filter == nullptr) {
+ compaction_filter_from_factory =
+ sub_compact->compaction->CreateCompactionFilter();
+ compaction_filter = compaction_filter_from_factory.get();
+ }
+ if (compaction_filter != nullptr && !compaction_filter->IgnoreSnapshots()) {
+ sub_compact->status = Status::NotSupported(
+ "CompactionFilter::IgnoreSnapshots() = false is not supported "
+ "anymore.");
+ return;
+ }
+
+ NotifyOnSubcompactionBegin(sub_compact);
+
+ auto range_del_agg = std::make_unique<CompactionRangeDelAggregator>(
+ &cfd->internal_comparator(), existing_snapshots_, &full_history_ts_low_,
+ &trim_ts_);
+
+ // TODO: since we already use C++17, should use
+ // std::optional<const Slice> instead.
+ const std::optional<Slice> start = sub_compact->start;
+ const std::optional<Slice> end = sub_compact->end;
+
+ std::optional<Slice> start_without_ts;
+ std::optional<Slice> end_without_ts;
+
+ ReadOptions read_options;
+ read_options.verify_checksums = true;
+ read_options.fill_cache = false;
+ read_options.rate_limiter_priority = GetRateLimiterPriority();
+ // Compaction iterators shouldn't be confined to a single prefix.
+ // Compactions use Seek() for
+ // (a) concurrent compactions,
+ // (b) CompactionFilter::Decision::kRemoveAndSkipUntil.
+ read_options.total_order_seek = true;
+
+  // Remove the timestamps from the boundaries because the boundaries created
+  // in GenSubcompactionBoundaries don't strip away the timestamp.
+ size_t ts_sz = cfd->user_comparator()->timestamp_size();
+ if (start.has_value()) {
+ read_options.iterate_lower_bound = &start.value();
+ if (ts_sz > 0) {
+ start_without_ts = StripTimestampFromUserKey(start.value(), ts_sz);
+ read_options.iterate_lower_bound = &start_without_ts.value();
+ }
+ }
+ if (end.has_value()) {
+ read_options.iterate_upper_bound = &end.value();
+ if (ts_sz > 0) {
+ end_without_ts = StripTimestampFromUserKey(end.value(), ts_sz);
+ read_options.iterate_upper_bound = &end_without_ts.value();
+ }
+ }
+
+ // Although the v2 aggregator is what the level iterator(s) know about,
+ // the AddTombstones calls will be propagated down to the v1 aggregator.
+ std::unique_ptr<InternalIterator> raw_input(versions_->MakeInputIterator(
+ read_options, sub_compact->compaction, range_del_agg.get(),
+ file_options_for_read_, start, end));
+ InternalIterator* input = raw_input.get();
+
+ IterKey start_ikey;
+ IterKey end_ikey;
+ Slice start_slice;
+ Slice end_slice;
+
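+  // Prepare a maximum-value timestamp of the comparator's timestamp width; it
+  // is used below when forming the internal keys for the subcompaction
+  // boundaries.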
+ static constexpr char kMaxTs[] =
+ "\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff";
+ Slice ts_slice;
+ std::string max_ts;
+ if (ts_sz > 0) {
+ if (ts_sz <= strlen(kMaxTs)) {
+ ts_slice = Slice(kMaxTs, ts_sz);
+ } else {
+ max_ts = std::string(ts_sz, '\xff');
+ ts_slice = Slice(max_ts);
+ }
+ }
+
+ if (start.has_value()) {
+ start_ikey.SetInternalKey(start.value(), kMaxSequenceNumber,
+ kValueTypeForSeek);
+ if (ts_sz > 0) {
+ start_ikey.UpdateInternalKey(kMaxSequenceNumber, kValueTypeForSeek,
+ &ts_slice);
+ }
+ start_slice = start_ikey.GetInternalKey();
+ }
+ if (end.has_value()) {
+ end_ikey.SetInternalKey(end.value(), kMaxSequenceNumber, kValueTypeForSeek);
+ if (ts_sz > 0) {
+ end_ikey.UpdateInternalKey(kMaxSequenceNumber, kValueTypeForSeek,
+ &ts_slice);
+ }
+ end_slice = end_ikey.GetInternalKey();
+ }
+
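+  // Clip the input iterator to the subcompaction's boundary range so that this
+  // subcompaction only processes keys within its assigned range.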
+ std::unique_ptr<InternalIterator> clip;
+ if (start.has_value() || end.has_value()) {
+ clip = std::make_unique<ClippingIterator>(
+ raw_input.get(), start.has_value() ? &start_slice : nullptr,
+ end.has_value() ? &end_slice : nullptr, &cfd->internal_comparator());
+ input = clip.get();
+ }
+
+ std::unique_ptr<InternalIterator> blob_counter;
+
+ if (sub_compact->compaction->DoesInputReferenceBlobFiles()) {
+ BlobGarbageMeter* meter = sub_compact->Current().CreateBlobGarbageMeter();
+ blob_counter = std::make_unique<BlobCountingIterator>(input, meter);
+ input = blob_counter.get();
+ }
+
+ std::unique_ptr<InternalIterator> trim_history_iter;
+ if (ts_sz > 0 && !trim_ts_.empty()) {
+ trim_history_iter = std::make_unique<HistoryTrimmingIterator>(
+ input, cfd->user_comparator(), trim_ts_);
+ input = trim_history_iter.get();
+ }
+
+ input->SeekToFirst();
+
+ AutoThreadOperationStageUpdater stage_updater(
+ ThreadStatus::STAGE_COMPACTION_PROCESS_KV);
+
+ // I/O measurement variables
+ PerfLevel prev_perf_level = PerfLevel::kEnableTime;
+ const uint64_t kRecordStatsEvery = 1000;
+ uint64_t prev_write_nanos = 0;
+ uint64_t prev_fsync_nanos = 0;
+ uint64_t prev_range_sync_nanos = 0;
+ uint64_t prev_prepare_write_nanos = 0;
+ uint64_t prev_cpu_write_nanos = 0;
+ uint64_t prev_cpu_read_nanos = 0;
+ if (measure_io_stats_) {
+ prev_perf_level = GetPerfLevel();
+ SetPerfLevel(PerfLevel::kEnableTimeAndCPUTimeExceptForMutex);
+ prev_write_nanos = IOSTATS(write_nanos);
+ prev_fsync_nanos = IOSTATS(fsync_nanos);
+ prev_range_sync_nanos = IOSTATS(range_sync_nanos);
+ prev_prepare_write_nanos = IOSTATS(prepare_write_nanos);
+ prev_cpu_write_nanos = IOSTATS(cpu_write_nanos);
+ prev_cpu_read_nanos = IOSTATS(cpu_read_nanos);
+ }
+
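+  // MergeHelper is used by the compaction iterator to combine merge operands
+  // for the same user key using the column family's merge operator.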
+ MergeHelper merge(
+ env_, cfd->user_comparator(), cfd->ioptions()->merge_operator.get(),
+ compaction_filter, db_options_.info_log.get(),
+ false /* internal key corruption is expected */,
+ existing_snapshots_.empty() ? 0 : existing_snapshots_.back(),
+ snapshot_checker_, compact_->compaction->level(), db_options_.stats);
+
+ const MutableCFOptions* mutable_cf_options =
+ sub_compact->compaction->mutable_cf_options();
+ assert(mutable_cf_options);
+
+ std::vector<std::string> blob_file_paths;
+
+  // TODO: support output_to_penultimate_level compaction with BlobDB, which
+  // needs 2 builders, so this may need to move to `CompactionOutputs`
+ std::unique_ptr<BlobFileBuilder> blob_file_builder(
+ (mutable_cf_options->enable_blob_files &&
+ sub_compact->compaction->output_level() >=
+ mutable_cf_options->blob_file_starting_level)
+ ? new BlobFileBuilder(
+ versions_, fs_.get(),
+ sub_compact->compaction->immutable_options(),
+ mutable_cf_options, &file_options_, db_id_, db_session_id_,
+ job_id_, cfd->GetID(), cfd->GetName(), Env::IOPriority::IO_LOW,
+ write_hint_, io_tracer_, blob_callback_,
+ BlobFileCreationReason::kCompaction, &blob_file_paths,
+ sub_compact->Current().GetBlobFileAdditionsPtr())
+ : nullptr);
+
+ TEST_SYNC_POINT("CompactionJob::Run():Inprogress");
+ TEST_SYNC_POINT_CALLBACK(
+ "CompactionJob::Run():PausingManualCompaction:1",
+ reinterpret_cast<void*>(
+ const_cast<std::atomic<bool>*>(&manual_compaction_canceled_)));
+
+ const std::string* const full_history_ts_low =
+ full_history_ts_low_.empty() ? nullptr : &full_history_ts_low_;
+ const SequenceNumber job_snapshot_seq =
+ job_context_ ? job_context_->GetJobSnapshotSequence()
+ : kMaxSequenceNumber;
+
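+  // The compaction iterator wraps the merged input: it applies the compaction
+  // filter and merge operator and decides which entries can be dropped based
+  // on snapshots, the earliest write conflict snapshot, and
+  // full_history_ts_low.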
+ auto c_iter = std::make_unique<CompactionIterator>(
+ input, cfd->user_comparator(), &merge, versions_->LastSequence(),
+ &existing_snapshots_, earliest_write_conflict_snapshot_, job_snapshot_seq,
+ snapshot_checker_, env_, ShouldReportDetailedTime(env_, stats_),
+ /*expect_valid_internal_key=*/true, range_del_agg.get(),
+ blob_file_builder.get(), db_options_.allow_data_in_errors,
+ db_options_.enforce_single_del_contracts, manual_compaction_canceled_,
+ sub_compact->compaction, compaction_filter, shutting_down_,
+ db_options_.info_log, full_history_ts_low, preserve_time_min_seqno_,
+ preclude_last_level_min_seqno_);
+ c_iter->SeekToFirst();
+
+  // Assign the range delete aggregator to the target output level, which makes
+  // sure it only outputs to a single level
+ sub_compact->AssignRangeDelAggregator(std::move(range_del_agg));
+
+ const auto& c_iter_stats = c_iter->iter_stats();
+
+  // Define the open and close functions for the compaction files, which will
+  // be used to open/close output files when needed.
+ const CompactionFileOpenFunc open_file_func =
+ [this, sub_compact](CompactionOutputs& outputs) {
+ return this->OpenCompactionOutputFile(sub_compact, outputs);
+ };
+ const CompactionFileCloseFunc close_file_func =
+ [this, sub_compact](CompactionOutputs& outputs, const Status& status,
+ const Slice& next_table_min_key) {
+ return this->FinishCompactionOutputFile(status, sub_compact, outputs,
+ next_table_min_key);
+ };
+
+ Status status;
+ TEST_SYNC_POINT_CALLBACK(
+ "CompactionJob::ProcessKeyValueCompaction()::Processing",
+ reinterpret_cast<void*>(
+ const_cast<Compaction*>(sub_compact->compaction)));
+ while (status.ok() && !cfd->IsDropped() && c_iter->Valid()) {
+ // Invariant: c_iter.status() is guaranteed to be OK if c_iter->Valid()
+ // returns true.
+
+ assert(!end.has_value() || cfd->user_comparator()->Compare(
+ c_iter->user_key(), end.value()) < 0);
+
+ if (c_iter_stats.num_input_records % kRecordStatsEvery ==
+ kRecordStatsEvery - 1) {
+ RecordDroppedKeys(c_iter_stats, &sub_compact->compaction_job_stats);
+ c_iter->ResetRecordCounts();
+ RecordCompactionIOStats();
+ }
+
+    // Add the current compaction_iterator key to the target compaction output;
+    // if an output file needs to be closed or opened, `open_file_func` and
+    // `close_file_func` will be called.
+    // TODO: it would be better to have the compaction file open/close moved
+    // into `CompactionOutputs`, which has the output file information.
+ status = sub_compact->AddToOutput(*c_iter, open_file_func, close_file_func);
+ if (!status.ok()) {
+ break;
+ }
+
+ TEST_SYNC_POINT_CALLBACK(
+ "CompactionJob::Run():PausingManualCompaction:2",
+ reinterpret_cast<void*>(
+ const_cast<std::atomic<bool>*>(&manual_compaction_canceled_)));
+ c_iter->Next();
+ if (c_iter->status().IsManualCompactionPaused()) {
+ break;
+ }
+ }
+
+ sub_compact->compaction_job_stats.num_blobs_read =
+ c_iter_stats.num_blobs_read;
+ sub_compact->compaction_job_stats.total_blob_bytes_read =
+ c_iter_stats.total_blob_bytes_read;
+ sub_compact->compaction_job_stats.num_input_deletion_records =
+ c_iter_stats.num_input_deletion_records;
+ sub_compact->compaction_job_stats.num_corrupt_keys =
+ c_iter_stats.num_input_corrupt_records;
+ sub_compact->compaction_job_stats.num_single_del_fallthru =
+ c_iter_stats.num_single_del_fallthru;
+ sub_compact->compaction_job_stats.num_single_del_mismatch =
+ c_iter_stats.num_single_del_mismatch;
+ sub_compact->compaction_job_stats.total_input_raw_key_bytes +=
+ c_iter_stats.total_input_raw_key_bytes;
+ sub_compact->compaction_job_stats.total_input_raw_value_bytes +=
+ c_iter_stats.total_input_raw_value_bytes;
+
+ RecordTick(stats_, FILTER_OPERATION_TOTAL_TIME,
+ c_iter_stats.total_filter_time);
+
+ if (c_iter_stats.num_blobs_relocated > 0) {
+ RecordTick(stats_, BLOB_DB_GC_NUM_KEYS_RELOCATED,
+ c_iter_stats.num_blobs_relocated);
+ }
+ if (c_iter_stats.total_blob_bytes_relocated > 0) {
+ RecordTick(stats_, BLOB_DB_GC_BYTES_RELOCATED,
+ c_iter_stats.total_blob_bytes_relocated);
+ }
+
+ RecordDroppedKeys(c_iter_stats, &sub_compact->compaction_job_stats);
+ RecordCompactionIOStats();
+
+ if (status.ok() && cfd->IsDropped()) {
+ status =
+ Status::ColumnFamilyDropped("Column family dropped during compaction");
+ }
+ if ((status.ok() || status.IsColumnFamilyDropped()) &&
+ shutting_down_->load(std::memory_order_relaxed)) {
+ status = Status::ShutdownInProgress("Database shutdown");
+ }
+ if ((status.ok() || status.IsColumnFamilyDropped()) &&
+ (manual_compaction_canceled_.load(std::memory_order_relaxed))) {
+ status = Status::Incomplete(Status::SubCode::kManualCompactionPaused);
+ }
+ if (status.ok()) {
+ status = input->status();
+ }
+ if (status.ok()) {
+ status = c_iter->status();
+ }
+
+  // Call FinishCompactionOutputFile() even if status is not ok: it needs to
+  // close the output files. The open file function is also passed in because,
+  // if there are only range deletions and no file has been opened yet, a new
+  // output file needs to be created to store the range deletions.
+ status = sub_compact->CloseCompactionFiles(status, open_file_func,
+ close_file_func);
+
+ if (blob_file_builder) {
+ if (status.ok()) {
+ status = blob_file_builder->Finish();
+ } else {
+ blob_file_builder->Abandon(status);
+ }
+ blob_file_builder.reset();
+ sub_compact->Current().UpdateBlobStats();
+ }
+
+ sub_compact->compaction_job_stats.cpu_micros =
+ db_options_.clock->CPUMicros() - prev_cpu_micros;
+
+ if (measure_io_stats_) {
+ sub_compact->compaction_job_stats.file_write_nanos +=
+ IOSTATS(write_nanos) - prev_write_nanos;
+ sub_compact->compaction_job_stats.file_fsync_nanos +=
+ IOSTATS(fsync_nanos) - prev_fsync_nanos;
+ sub_compact->compaction_job_stats.file_range_sync_nanos +=
+ IOSTATS(range_sync_nanos) - prev_range_sync_nanos;
+ sub_compact->compaction_job_stats.file_prepare_write_nanos +=
+ IOSTATS(prepare_write_nanos) - prev_prepare_write_nanos;
+ sub_compact->compaction_job_stats.cpu_micros -=
+ (IOSTATS(cpu_write_nanos) - prev_cpu_write_nanos +
+ IOSTATS(cpu_read_nanos) - prev_cpu_read_nanos) /
+ 1000;
+ if (prev_perf_level != PerfLevel::kEnableTimeAndCPUTimeExceptForMutex) {
+ SetPerfLevel(prev_perf_level);
+ }
+ }
+#ifdef ROCKSDB_ASSERT_STATUS_CHECKED
+ if (!status.ok()) {
+ if (c_iter) {
+ c_iter->status().PermitUncheckedError();
+ }
+ if (input) {
+ input->status().PermitUncheckedError();
+ }
+ }
+#endif // ROCKSDB_ASSERT_STATUS_CHECKED
+
+ blob_counter.reset();
+ clip.reset();
+ raw_input.reset();
+ sub_compact->status = status;
+ NotifyOnSubcompactionCompleted(sub_compact);
+}
+
+uint64_t CompactionJob::GetCompactionId(SubcompactionState* sub_compact) const {
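+  // Combine the job id (upper 32 bits) with the subcompaction id (lower bits)
+  // to form an id that is unique across the subcompactions of this job.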
+ return (uint64_t)job_id_ << 32 | sub_compact->sub_job_id;
+}
+
+void CompactionJob::RecordDroppedKeys(
+ const CompactionIterationStats& c_iter_stats,
+ CompactionJobStats* compaction_job_stats) {
+ if (c_iter_stats.num_record_drop_user > 0) {
+ RecordTick(stats_, COMPACTION_KEY_DROP_USER,
+ c_iter_stats.num_record_drop_user);
+ }
+ if (c_iter_stats.num_record_drop_hidden > 0) {
+ RecordTick(stats_, COMPACTION_KEY_DROP_NEWER_ENTRY,
+ c_iter_stats.num_record_drop_hidden);
+ if (compaction_job_stats) {
+ compaction_job_stats->num_records_replaced +=
+ c_iter_stats.num_record_drop_hidden;
+ }
+ }
+ if (c_iter_stats.num_record_drop_obsolete > 0) {
+ RecordTick(stats_, COMPACTION_KEY_DROP_OBSOLETE,
+ c_iter_stats.num_record_drop_obsolete);
+ if (compaction_job_stats) {
+ compaction_job_stats->num_expired_deletion_records +=
+ c_iter_stats.num_record_drop_obsolete;
+ }
+ }
+ if (c_iter_stats.num_record_drop_range_del > 0) {
+ RecordTick(stats_, COMPACTION_KEY_DROP_RANGE_DEL,
+ c_iter_stats.num_record_drop_range_del);
+ }
+ if (c_iter_stats.num_range_del_drop_obsolete > 0) {
+ RecordTick(stats_, COMPACTION_RANGE_DEL_DROP_OBSOLETE,
+ c_iter_stats.num_range_del_drop_obsolete);
+ }
+ if (c_iter_stats.num_optimized_del_drop_obsolete > 0) {
+ RecordTick(stats_, COMPACTION_OPTIMIZED_DEL_DROP_OBSOLETE,
+ c_iter_stats.num_optimized_del_drop_obsolete);
+ }
+}
+
+Status CompactionJob::FinishCompactionOutputFile(
+ const Status& input_status, SubcompactionState* sub_compact,
+ CompactionOutputs& outputs, const Slice& next_table_min_key) {
+ AutoThreadOperationStageUpdater stage_updater(
+ ThreadStatus::STAGE_COMPACTION_SYNC_FILE);
+ assert(sub_compact != nullptr);
+ assert(outputs.HasBuilder());
+
+ FileMetaData* meta = outputs.GetMetaData();
+ uint64_t output_number = meta->fd.GetNumber();
+ assert(output_number != 0);
+
+ ColumnFamilyData* cfd = sub_compact->compaction->column_family_data();
+ std::string file_checksum = kUnknownFileChecksum;
+ std::string file_checksum_func_name = kUnknownFileChecksumFuncName;
+
+ // Check for iterator errors
+ Status s = input_status;
+
+ // Add range tombstones
+ auto earliest_snapshot = kMaxSequenceNumber;
+ if (existing_snapshots_.size() > 0) {
+ earliest_snapshot = existing_snapshots_[0];
+ }
+ if (s.ok()) {
+ CompactionIterationStats range_del_out_stats;
+ // If the compaction supports per_key_placement, only output range dels to
+ // the penultimate level.
+ // Note: Use `bottommost_level_ = true` for both bottommost and
+ // output_to_penultimate_level compaction here, as it's only used to decide
+ // whether range dels can be dropped.
+ if (outputs.HasRangeDel()) {
+ s = outputs.AddRangeDels(
+ sub_compact->start.has_value() ? &(sub_compact->start.value())
+ : nullptr,
+ sub_compact->end.has_value() ? &(sub_compact->end.value()) : nullptr,
+ range_del_out_stats, bottommost_level_, cfd->internal_comparator(),
+ earliest_snapshot, next_table_min_key, full_history_ts_low_);
+ }
+ RecordDroppedKeys(range_del_out_stats, &sub_compact->compaction_job_stats);
+ TEST_SYNC_POINT("CompactionJob::FinishCompactionOutputFile1");
+ }
+
+ const uint64_t current_entries = outputs.NumEntries();
+
+ s = outputs.Finish(s, seqno_time_mapping_);
+
+ if (s.ok()) {
+ // With an accurate smallest and largest key, we can get a slightly more
+ // accurate oldest ancester time. This makes the oldest ancester time in the
+ // manifest more accurate than the one in the table properties; it is not
+ // clear how to resolve that discrepancy.
+ if (meta->smallest.size() > 0 && meta->largest.size() > 0) {
+ uint64_t refined_oldest_ancester_time;
+ Slice new_smallest = meta->smallest.user_key();
+ Slice new_largest = meta->largest.user_key();
+ if (!new_largest.empty() && !new_smallest.empty()) {
+ refined_oldest_ancester_time =
+ sub_compact->compaction->MinInputFileOldestAncesterTime(
+ &(meta->smallest), &(meta->largest));
+ if (refined_oldest_ancester_time !=
+ std::numeric_limits<uint64_t>::max()) {
+ meta->oldest_ancester_time = refined_oldest_ancester_time;
+ }
+ }
+ }
+ }
+
+ // Finish and check for file errors
+ IOStatus io_s = outputs.WriterSyncClose(s, db_options_.clock, stats_,
+ db_options_.use_fsync);
+
+ if (s.ok() && io_s.ok()) {
+ file_checksum = meta->file_checksum;
+ file_checksum_func_name = meta->file_checksum_func_name;
+ }
+
+ if (s.ok()) {
+ s = io_s;
+ }
+ if (sub_compact->io_status.ok()) {
+ sub_compact->io_status = io_s;
+ // Since this error is really a copy of the
+ // "normal" status, it does not also need to be checked
+ sub_compact->io_status.PermitUncheckedError();
+ }
+
+ TableProperties tp;
+ if (s.ok()) {
+ tp = outputs.GetTableProperties();
+ }
+
+ if (s.ok() && current_entries == 0 && tp.num_range_deletions == 0) {
+ // If there is nothing to output, there is no need to generate an SST file.
+ // This happens when the output level is the bottom level and, at the same
+ // time, the sub_compact produced no output.
+ std::string fname =
+ TableFileName(sub_compact->compaction->immutable_options()->cf_paths,
+ meta->fd.GetNumber(), meta->fd.GetPathId());
+
+ // TODO(AR) it is not clear if there are any larger implications if
+ // DeleteFile fails here
+ Status ds = env_->DeleteFile(fname);
+ if (!ds.ok()) {
+ ROCKS_LOG_WARN(
+ db_options_.info_log,
+ "[%s] [JOB %d] Unable to remove SST file for table #%" PRIu64
+ " at bottom level%s",
+ cfd->GetName().c_str(), job_id_, output_number,
+ meta->marked_for_compaction ? " (need compaction)" : "");
+ }
+
+ // Also need to remove the file from outputs, or it will be added to the
+ // VersionEdit.
+ outputs.RemoveLastOutput();
+ meta = nullptr;
+ }
+
+ if (s.ok() && (current_entries > 0 || tp.num_range_deletions > 0)) {
+ // Output to event logger and fire events.
+ outputs.UpdateTableProperties();
+ ROCKS_LOG_INFO(db_options_.info_log,
+ "[%s] [JOB %d] Generated table #%" PRIu64 ": %" PRIu64
+ " keys, %" PRIu64 " bytes%s, temperature: %s",
+ cfd->GetName().c_str(), job_id_, output_number,
+ current_entries, meta->fd.file_size,
+ meta->marked_for_compaction ? " (need compaction)" : "",
+ temperature_to_string[meta->temperature].c_str());
+ }
+ std::string fname;
+ FileDescriptor output_fd;
+ uint64_t oldest_blob_file_number = kInvalidBlobFileNumber;
+ Status status_for_listener = s;
+ if (meta != nullptr) {
+ fname = GetTableFileName(meta->fd.GetNumber());
+ output_fd = meta->fd;
+ oldest_blob_file_number = meta->oldest_blob_file_number;
+ } else {
+ fname = "(nil)";
+ if (s.ok()) {
+ status_for_listener = Status::Aborted("Empty SST file not kept");
+ }
+ }
+ EventHelpers::LogAndNotifyTableFileCreationFinished(
+ event_logger_, cfd->ioptions()->listeners, dbname_, cfd->GetName(), fname,
+ job_id_, output_fd, oldest_blob_file_number, tp,
+ TableFileCreationReason::kCompaction, status_for_listener, file_checksum,
+ file_checksum_func_name);
+
+#ifndef ROCKSDB_LITE
+ // Report new file to SstFileManagerImpl
+ auto sfm =
+ static_cast<SstFileManagerImpl*>(db_options_.sst_file_manager.get());
+ if (sfm && meta != nullptr && meta->fd.GetPathId() == 0) {
+ Status add_s = sfm->OnAddFile(fname);
+ if (!add_s.ok() && s.ok()) {
+ s = add_s;
+ }
+ if (sfm->IsMaxAllowedSpaceReached()) {
+ // TODO(ajkr): should we return OK() if max space was reached by the final
+ // compaction output file (similarly to how flush works when full)?
+ s = Status::SpaceLimit("Max allowed space was reached");
+ TEST_SYNC_POINT(
+ "CompactionJob::FinishCompactionOutputFile:MaxAllowedSpaceReached");
+ InstrumentedMutexLock l(db_mutex_);
+ db_error_handler_->SetBGError(s, BackgroundErrorReason::kCompaction);
+ }
+ }
+#endif
+
+ outputs.ResetBuilder();
+ return s;
+}
+
+Status CompactionJob::InstallCompactionResults(
+ const MutableCFOptions& mutable_cf_options) {
+ assert(compact_);
+
+ db_mutex_->AssertHeld();
+
+ auto* compaction = compact_->compaction;
+ assert(compaction);
+
+ {
+ Compaction::InputLevelSummaryBuffer inputs_summary;
+ if (compaction_stats_.has_penultimate_level_output) {
+ ROCKS_LOG_BUFFER(
+ log_buffer_,
+ "[%s] [JOB %d] Compacted %s => output_to_penultimate_level: %" PRIu64
+ " bytes + last: %" PRIu64 " bytes. Total: %" PRIu64 " bytes",
+ compaction->column_family_data()->GetName().c_str(), job_id_,
+ compaction->InputLevelSummary(&inputs_summary),
+ compaction_stats_.penultimate_level_stats.bytes_written,
+ compaction_stats_.stats.bytes_written,
+ compaction_stats_.TotalBytesWritten());
+ } else {
+ ROCKS_LOG_BUFFER(log_buffer_,
+ "[%s] [JOB %d] Compacted %s => %" PRIu64 " bytes",
+ compaction->column_family_data()->GetName().c_str(),
+ job_id_, compaction->InputLevelSummary(&inputs_summary),
+ compaction_stats_.TotalBytesWritten());
+ }
+ }
+
+ VersionEdit* const edit = compaction->edit();
+ assert(edit);
+
+ // Add compaction inputs
+ compaction->AddInputDeletions(edit);
+
+ std::unordered_map<uint64_t, BlobGarbageMeter::BlobStats> blob_total_garbage;
+
+ for (const auto& sub_compact : compact_->sub_compact_states) {
+ sub_compact.AddOutputsEdit(edit);
+
+ for (const auto& blob : sub_compact.Current().GetBlobFileAdditions()) {
+ edit->AddBlobFile(blob);
+ }
+
+ if (sub_compact.Current().GetBlobGarbageMeter()) {
+ const auto& flows = sub_compact.Current().GetBlobGarbageMeter()->flows();
+
+ for (const auto& pair : flows) {
+ const uint64_t blob_file_number = pair.first;
+ const BlobGarbageMeter::BlobInOutFlow& flow = pair.second;
+
+ assert(flow.IsValid());
+ if (flow.HasGarbage()) {
+ blob_total_garbage[blob_file_number].Add(flow.GetGarbageCount(),
+ flow.GetGarbageBytes());
+ }
+ }
+ }
+ }
+
+ for (const auto& pair : blob_total_garbage) {
+ const uint64_t blob_file_number = pair.first;
+ const BlobGarbageMeter::BlobStats& stats = pair.second;
+
+ edit->AddBlobFileGarbage(blob_file_number, stats.GetCount(),
+ stats.GetBytes());
+ }
+
+ if ((compaction->compaction_reason() ==
+ CompactionReason::kLevelMaxLevelSize ||
+ compaction->compaction_reason() == CompactionReason::kRoundRobinTtl) &&
+ compaction->immutable_options()->compaction_pri == kRoundRobin) {
+ int start_level = compaction->start_level();
+ if (start_level > 0) {
+ auto vstorage = compaction->input_version()->storage_info();
+ edit->AddCompactCursor(start_level,
+ vstorage->GetNextCompactCursor(
+ start_level, compaction->num_input_files(0)));
+ }
+ }
+
+ return versions_->LogAndApply(compaction->column_family_data(),
+ mutable_cf_options, edit, db_mutex_,
+ db_directory_);
+}
+
+void CompactionJob::RecordCompactionIOStats() {
+ RecordTick(stats_, COMPACT_READ_BYTES, IOSTATS(bytes_read));
+ RecordTick(stats_, COMPACT_WRITE_BYTES, IOSTATS(bytes_written));
+ CompactionReason compaction_reason =
+ compact_->compaction->compaction_reason();
+ if (compaction_reason == CompactionReason::kFilesMarkedForCompaction) {
+ RecordTick(stats_, COMPACT_READ_BYTES_MARKED, IOSTATS(bytes_read));
+ RecordTick(stats_, COMPACT_WRITE_BYTES_MARKED, IOSTATS(bytes_written));
+ } else if (compaction_reason == CompactionReason::kPeriodicCompaction) {
+ RecordTick(stats_, COMPACT_READ_BYTES_PERIODIC, IOSTATS(bytes_read));
+ RecordTick(stats_, COMPACT_WRITE_BYTES_PERIODIC, IOSTATS(bytes_written));
+ } else if (compaction_reason == CompactionReason::kTtl) {
+ RecordTick(stats_, COMPACT_READ_BYTES_TTL, IOSTATS(bytes_read));
+ RecordTick(stats_, COMPACT_WRITE_BYTES_TTL, IOSTATS(bytes_written));
+ }
+ ThreadStatusUtil::IncreaseThreadOperationProperty(
+ ThreadStatus::COMPACTION_BYTES_READ, IOSTATS(bytes_read));
+ IOSTATS_RESET(bytes_read);
+ ThreadStatusUtil::IncreaseThreadOperationProperty(
+ ThreadStatus::COMPACTION_BYTES_WRITTEN, IOSTATS(bytes_written));
+ IOSTATS_RESET(bytes_written);
+}
+
+Status CompactionJob::OpenCompactionOutputFile(SubcompactionState* sub_compact,
+ CompactionOutputs& outputs) {
+ assert(sub_compact != nullptr);
+
+ // no need to lock because VersionSet::next_file_number_ is atomic
+ uint64_t file_number = versions_->NewFileNumber();
+ std::string fname = GetTableFileName(file_number);
+ // Fire events.
+ ColumnFamilyData* cfd = sub_compact->compaction->column_family_data();
+#ifndef ROCKSDB_LITE
+ EventHelpers::NotifyTableFileCreationStarted(
+ cfd->ioptions()->listeners, dbname_, cfd->GetName(), fname, job_id_,
+ TableFileCreationReason::kCompaction);
+#endif // !ROCKSDB_LITE
+ // Make the output file
+ std::unique_ptr<FSWritableFile> writable_file;
+#ifndef NDEBUG
+ bool syncpoint_arg = file_options_.use_direct_writes;
+ TEST_SYNC_POINT_CALLBACK("CompactionJob::OpenCompactionOutputFile",
+ &syncpoint_arg);
+#endif
+
+ // Pass temperature of the last level files to FileSystem.
+ FileOptions fo_copy = file_options_;
+ Temperature temperature = sub_compact->compaction->output_temperature();
+ // Only set for last-level compaction whose output is not going to the
+ // penultimate level (relevant when the preclude_last_level feature is
+ // enabled).
+ if (temperature == Temperature::kUnknown &&
+ sub_compact->compaction->is_last_level() &&
+ !sub_compact->IsCurrentPenultimateLevel()) {
+ temperature =
+ sub_compact->compaction->mutable_cf_options()->last_level_temperature;
+ }
+ fo_copy.temperature = temperature;
+
+ Status s;
+ IOStatus io_s = NewWritableFile(fs_.get(), fname, &writable_file, fo_copy);
+ s = io_s;
+ if (sub_compact->io_status.ok()) {
+ sub_compact->io_status = io_s;
+ // Since this error is really a copy of the io_s that is checked below as s,
+ // it does not also need to be checked.
+ sub_compact->io_status.PermitUncheckedError();
+ }
+ if (!s.ok()) {
+ ROCKS_LOG_ERROR(
+ db_options_.info_log,
+ "[%s] [JOB %d] OpenCompactionOutputFiles for table #%" PRIu64
+ " fails at NewWritableFile with status %s",
+ sub_compact->compaction->column_family_data()->GetName().c_str(),
+ job_id_, file_number, s.ToString().c_str());
+ LogFlush(db_options_.info_log);
+ EventHelpers::LogAndNotifyTableFileCreationFinished(
+ event_logger_, cfd->ioptions()->listeners, dbname_, cfd->GetName(),
+ fname, job_id_, FileDescriptor(), kInvalidBlobFileNumber,
+ TableProperties(), TableFileCreationReason::kCompaction, s,
+ kUnknownFileChecksum, kUnknownFileChecksumFuncName);
+ return s;
+ }
+
+ // Try to figure out the output file's oldest ancester time.
+ int64_t temp_current_time = 0;
+ auto get_time_status = db_options_.clock->GetCurrentTime(&temp_current_time);
+ // Safe to proceed even if GetCurrentTime fails. So, log and proceed.
+ if (!get_time_status.ok()) {
+ ROCKS_LOG_WARN(db_options_.info_log,
+ "Failed to get current time. Status: %s",
+ get_time_status.ToString().c_str());
+ }
+ uint64_t current_time = static_cast<uint64_t>(temp_current_time);
+ InternalKey tmp_start, tmp_end;
+ if (sub_compact->start.has_value()) {
+ tmp_start.SetMinPossibleForUserKey(sub_compact->start.value());
+ }
+ if (sub_compact->end.has_value()) {
+ tmp_end.SetMinPossibleForUserKey(sub_compact->end.value());
+ }
+ uint64_t oldest_ancester_time =
+ sub_compact->compaction->MinInputFileOldestAncesterTime(
+ sub_compact->start.has_value() ? &tmp_start : nullptr,
+ sub_compact->end.has_value() ? &tmp_end : nullptr);
+ if (oldest_ancester_time == std::numeric_limits<uint64_t>::max()) {
+ oldest_ancester_time = current_time;
+ }
+
+ // Initialize a SubcompactionState::Output and add it to sub_compact->outputs
+ {
+ FileMetaData meta;
+ meta.fd = FileDescriptor(file_number,
+ sub_compact->compaction->output_path_id(), 0);
+ meta.oldest_ancester_time = oldest_ancester_time;
+ meta.file_creation_time = current_time;
+ meta.temperature = temperature;
+ assert(!db_id_.empty());
+ assert(!db_session_id_.empty());
+ s = GetSstInternalUniqueId(db_id_, db_session_id_, meta.fd.GetNumber(),
+ &meta.unique_id);
+ if (!s.ok()) {
+ ROCKS_LOG_ERROR(db_options_.info_log,
+ "[%s] [JOB %d] file #%" PRIu64
+ " failed to generate unique id: %s.",
+ cfd->GetName().c_str(), job_id_, meta.fd.GetNumber(),
+ s.ToString().c_str());
+ return s;
+ }
+
+ outputs.AddOutput(std::move(meta), cfd->internal_comparator(),
+ sub_compact->compaction->mutable_cf_options()
+ ->check_flush_compaction_key_order,
+ paranoid_file_checks_);
+ }
+
+ writable_file->SetIOPriority(GetRateLimiterPriority());
+ writable_file->SetWriteLifeTimeHint(write_hint_);
+ FileTypeSet tmp_set = db_options_.checksum_handoff_file_types;
+ writable_file->SetPreallocationBlockSize(static_cast<size_t>(
+ sub_compact->compaction->OutputFilePreallocationSize()));
+ const auto& listeners =
+ sub_compact->compaction->immutable_options()->listeners;
+ outputs.AssignFileWriter(new WritableFileWriter(
+ std::move(writable_file), fname, fo_copy, db_options_.clock, io_tracer_,
+ db_options_.stats, listeners, db_options_.file_checksum_gen_factory.get(),
+ tmp_set.Contains(FileType::kTableFile), false));
+
+ TableBuilderOptions tboptions(
+ *cfd->ioptions(), *(sub_compact->compaction->mutable_cf_options()),
+ cfd->internal_comparator(), cfd->int_tbl_prop_collector_factories(),
+ sub_compact->compaction->output_compression(),
+ sub_compact->compaction->output_compression_opts(), cfd->GetID(),
+ cfd->GetName(), sub_compact->compaction->output_level(),
+ bottommost_level_, TableFileCreationReason::kCompaction,
+ 0 /* oldest_key_time */, current_time, db_id_, db_session_id_,
+ sub_compact->compaction->max_output_file_size(), file_number);
+
+ outputs.NewBuilder(tboptions);
+
+ LogFlush(db_options_.info_log);
+ return s;
+}
+
+void CompactionJob::CleanupCompaction() {
+ for (SubcompactionState& sub_compact : compact_->sub_compact_states) {
+ sub_compact.Cleanup(table_cache_.get());
+ }
+ delete compact_;
+ compact_ = nullptr;
+}
+
+#ifndef ROCKSDB_LITE
+namespace {
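+// CopyPrefix() copies at most `prefix_length` leading bytes of `src` into
+// `dst`; shorter inputs are copied whole. For example (illustrative only),
+// CopyPrefix("abcdef", 4, &dst) leaves dst == "abcd", while
+// CopyPrefix("ab", 4, &dst) leaves dst == "ab".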
+void CopyPrefix(const Slice& src, size_t prefix_length, std::string* dst) {
+ assert(prefix_length > 0);
+ size_t length = src.size() > prefix_length ? prefix_length : src.size();
+ dst->assign(src.data(), length);
+}
+} // namespace
+
+#endif // !ROCKSDB_LITE
+
+void CompactionJob::UpdateCompactionStats() {
+ assert(compact_);
+
+ Compaction* compaction = compact_->compaction;
+ compaction_stats_.stats.num_input_files_in_non_output_levels = 0;
+ compaction_stats_.stats.num_input_files_in_output_level = 0;
+ for (int input_level = 0;
+ input_level < static_cast<int>(compaction->num_input_levels());
+ ++input_level) {
+ if (compaction->level(input_level) != compaction->output_level()) {
+ UpdateCompactionInputStatsHelper(
+ &compaction_stats_.stats.num_input_files_in_non_output_levels,
+ &compaction_stats_.stats.bytes_read_non_output_levels, input_level);
+ } else {
+ UpdateCompactionInputStatsHelper(
+ &compaction_stats_.stats.num_input_files_in_output_level,
+ &compaction_stats_.stats.bytes_read_output_level, input_level);
+ }
+ }
+
+ assert(compaction_job_stats_);
+ compaction_stats_.stats.bytes_read_blob =
+ compaction_job_stats_->total_blob_bytes_read;
+
+ compaction_stats_.stats.num_dropped_records =
+ compaction_stats_.DroppedRecords();
+}
+
+void CompactionJob::UpdateCompactionInputStatsHelper(int* num_files,
+ uint64_t* bytes_read,
+ int input_level) {
+ const Compaction* compaction = compact_->compaction;
+ auto num_input_files = compaction->num_input_files(input_level);
+ *num_files += static_cast<int>(num_input_files);
+
+ for (size_t i = 0; i < num_input_files; ++i) {
+ const auto* file_meta = compaction->input(input_level, i);
+ *bytes_read += file_meta->fd.GetFileSize();
+ compaction_stats_.stats.num_input_records +=
+ static_cast<uint64_t>(file_meta->num_entries);
+ }
+}
+
+void CompactionJob::UpdateCompactionJobStats(
+ const InternalStats::CompactionStats& stats) const {
+#ifndef ROCKSDB_LITE
+ compaction_job_stats_->elapsed_micros = stats.micros;
+
+ // input information
+ compaction_job_stats_->total_input_bytes =
+ stats.bytes_read_non_output_levels + stats.bytes_read_output_level;
+ compaction_job_stats_->num_input_records = stats.num_input_records;
+ compaction_job_stats_->num_input_files =
+ stats.num_input_files_in_non_output_levels +
+ stats.num_input_files_in_output_level;
+ compaction_job_stats_->num_input_files_at_output_level =
+ stats.num_input_files_in_output_level;
+
+ // output information
+ compaction_job_stats_->total_output_bytes = stats.bytes_written;
+ compaction_job_stats_->total_output_bytes_blob = stats.bytes_written_blob;
+ compaction_job_stats_->num_output_records = stats.num_output_records;
+ compaction_job_stats_->num_output_files = stats.num_output_files;
+ compaction_job_stats_->num_output_files_blob = stats.num_output_files_blob;
+
+ if (stats.num_output_files > 0) {
+ CopyPrefix(compact_->SmallestUserKey(),
+ CompactionJobStats::kMaxPrefixLength,
+ &compaction_job_stats_->smallest_output_key_prefix);
+ CopyPrefix(compact_->LargestUserKey(), CompactionJobStats::kMaxPrefixLength,
+ &compaction_job_stats_->largest_output_key_prefix);
+ }
+#else
+ (void)stats;
+#endif // !ROCKSDB_LITE
+}
+
+void CompactionJob::LogCompaction() {
+ Compaction* compaction = compact_->compaction;
+ ColumnFamilyData* cfd = compaction->column_family_data();
+
+ // Let's check if anything will get logged. Don't prepare all the info if
+ // we're not logging
+ if (db_options_.info_log_level <= InfoLogLevel::INFO_LEVEL) {
+ Compaction::InputLevelSummaryBuffer inputs_summary;
+ ROCKS_LOG_INFO(
+ db_options_.info_log, "[%s] [JOB %d] Compacting %s, score %.2f",
+ cfd->GetName().c_str(), job_id_,
+ compaction->InputLevelSummary(&inputs_summary), compaction->score());
+ char scratch[2345];
+ compaction->Summary(scratch, sizeof(scratch));
+ ROCKS_LOG_INFO(db_options_.info_log, "[%s]: Compaction start summary: %s\n",
+ cfd->GetName().c_str(), scratch);
+ // build event logger report
+ auto stream = event_logger_->Log();
+ stream << "job" << job_id_ << "event"
+ << "compaction_started"
+ << "compaction_reason"
+ << GetCompactionReasonString(compaction->compaction_reason());
+ for (size_t i = 0; i < compaction->num_input_levels(); ++i) {
+ stream << ("files_L" + std::to_string(compaction->level(i)));
+ stream.StartArray();
+ for (auto f : *compaction->inputs(i)) {
+ stream << f->fd.GetNumber();
+ }
+ stream.EndArray();
+ }
+ stream << "score" << compaction->score() << "input_data_size"
+ << compaction->CalculateTotalInputSize() << "oldest_snapshot_seqno"
+ << (existing_snapshots_.empty()
+ ? int64_t{-1} // Use -1 for "none"
+ : static_cast<int64_t>(existing_snapshots_[0]));
+ if (compaction->SupportsPerKeyPlacement()) {
+ stream << "preclude_last_level_min_seqno"
+ << preclude_last_level_min_seqno_;
+ stream << "penultimate_output_level" << compaction->GetPenultimateLevel();
+ stream << "penultimate_output_range"
+ << GetCompactionPenultimateOutputRangeTypeString(
+ compaction->GetPenultimateOutputRangeType());
+
+ if (compaction->GetPenultimateOutputRangeType() ==
+ Compaction::PenultimateOutputRangeType::kDisabled) {
+ ROCKS_LOG_WARN(
+ db_options_.info_log,
+ "[%s] [JOB %d] Penultimate level output is disabled, likely "
+ "because of the range conflict in the penultimate level",
+ cfd->GetName().c_str(), job_id_);
+ }
+ }
+ }
+}
+
+std::string CompactionJob::GetTableFileName(uint64_t file_number) {
+ return TableFileName(compact_->compaction->immutable_options()->cf_paths,
+ file_number, compact_->compaction->output_path_id());
+}
+
+Env::IOPriority CompactionJob::GetRateLimiterPriority() {
+ if (versions_ && versions_->GetColumnFamilySet() &&
+ versions_->GetColumnFamilySet()->write_controller()) {
+ WriteController* write_controller =
+ versions_->GetColumnFamilySet()->write_controller();
+ if (write_controller->NeedsDelay() || write_controller->IsStopped()) {
+ return Env::IO_USER;
+ }
+ }
+
+ return Env::IO_LOW;
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/compaction_job.h b/src/rocksdb/db/compaction/compaction_job.h
new file mode 100644
index 000000000..bfbce1011
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_job.h
@@ -0,0 +1,500 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+
+#include <atomic>
+#include <deque>
+#include <functional>
+#include <limits>
+#include <set>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "db/blob/blob_file_completion_callback.h"
+#include "db/column_family.h"
+#include "db/compaction/compaction_iterator.h"
+#include "db/compaction/compaction_outputs.h"
+#include "db/flush_scheduler.h"
+#include "db/internal_stats.h"
+#include "db/job_context.h"
+#include "db/log_writer.h"
+#include "db/memtable_list.h"
+#include "db/range_del_aggregator.h"
+#include "db/seqno_to_time_mapping.h"
+#include "db/version_edit.h"
+#include "db/write_controller.h"
+#include "db/write_thread.h"
+#include "logging/event_logger.h"
+#include "options/cf_options.h"
+#include "options/db_options.h"
+#include "port/port.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/compaction_job_stats.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/transaction_log.h"
+#include "table/scoped_arena_iterator.h"
+#include "util/autovector.h"
+#include "util/stop_watch.h"
+#include "util/thread_local.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Arena;
+class CompactionState;
+class ErrorHandler;
+class MemTable;
+class SnapshotChecker;
+class SystemClock;
+class TableCache;
+class Version;
+class VersionEdit;
+class VersionSet;
+
+class SubcompactionState;
+
+// CompactionJob is responsible for executing the compaction. Each (manual or
+// automated) compaction corresponds to a CompactionJob object, and usually
+// goes through the stages of `Prepare()`->`Run()`->`Install()`. CompactionJob
+// will divide the compaction into subcompactions and execute them in parallel
+// if needed.
+//
+// CompactionJob has 2 main stats:
+// 1. CompactionJobStats compaction_job_stats_
+// CompactionJobStats is a public data structure that is part of the
+// compaction event listener, through which RocksDB shares the job stats with
+// the user. Internally it's an aggregation of all the compaction_job_stats
+// from each `SubcompactionState`:
+// +------------------------+
+// | SubcompactionState |
+// | |
+// +--------->| compaction_job_stats |
+// | | |
+// | +------------------------+
+// +------------------------+ |
+// | CompactionJob | | +------------------------+
+// | | | | SubcompactionState |
+// | compaction_job_stats +-----+ | |
+// | | +--------->| compaction_job_stats |
+// | | | | |
+// +------------------------+ | +------------------------+
+// |
+// | +------------------------+
+// | | SubcompactionState |
+// | | |
+// +--------->+ compaction_job_stats |
+// | | |
+// | +------------------------+
+// |
+// | +------------------------+
+// | | ... |
+// +--------->+ |
+// +------------------------+
+//
+// 2. CompactionStatsFull compaction_stats_
+// `CompactionStatsFull` holds internal stats about the compaction, which
+// are eventually sent to `ColumnFamilyData::internal_stats_` and used for
+// logging and public metrics.
+// Internally, it's an aggregation of stats_ from each `SubcompactionState`.
+// It has 2 parts: normal stats about the main compaction and the
+// penultimate level output stats.
+// `SubcompactionState` maintains the CompactionOutputs for the normal output
+// and, if it exists, for the penultimate level output; the per-level stats
+// are stored with the outputs.
+// +---------------------------+
+// | SubcompactionState |
+// | |
+// | +----------------------+ |
+// | | CompactionOutputs | |
+// | | (normal output) | |
+// +---->| stats_ | |
+// | | +----------------------+ |
+// | | |
+// | | +----------------------+ |
+// +--------------------------------+ | | | CompactionOutputs | |
+// | CompactionJob | | | | (penultimate_level) | |
+// | | +--------->| stats_ | |
+// | compaction_stats_ | | | | +----------------------+ |
+// | +-------------------------+ | | | | |
+// | |stats (normal) |------|----+ +---------------------------+
+// | +-------------------------+ | | |
+// | | | |
+// | +-------------------------+ | | | +---------------------------+
+// | |penultimate_level_stats +------+ | | SubcompactionState |
+// | +-------------------------+ | | | | |
+// | | | | | +----------------------+ |
+// | | | | | | CompactionOutputs | |
+// +--------------------------------+ | | | | (normal output) | |
+// | +---->| stats_ | |
+// | | +----------------------+ |
+// | | |
+// | | +----------------------+ |
+// | | | CompactionOutputs | |
+// | | | (penultimate_level) | |
+// +--------->| stats_ | |
+// | +----------------------+ |
+// | |
+// +---------------------------+
+
+class CompactionJob {
+ public:
+ CompactionJob(
+ int job_id, Compaction* compaction, const ImmutableDBOptions& db_options,
+ const MutableDBOptions& mutable_db_options,
+ const FileOptions& file_options, VersionSet* versions,
+ const std::atomic<bool>* shutting_down, LogBuffer* log_buffer,
+ FSDirectory* db_directory, FSDirectory* output_directory,
+ FSDirectory* blob_output_directory, Statistics* stats,
+ InstrumentedMutex* db_mutex, ErrorHandler* db_error_handler,
+ std::vector<SequenceNumber> existing_snapshots,
+ SequenceNumber earliest_write_conflict_snapshot,
+ const SnapshotChecker* snapshot_checker, JobContext* job_context,
+ std::shared_ptr<Cache> table_cache, EventLogger* event_logger,
+ bool paranoid_file_checks, bool measure_io_stats,
+ const std::string& dbname, CompactionJobStats* compaction_job_stats,
+ Env::Priority thread_pri, const std::shared_ptr<IOTracer>& io_tracer,
+ const std::atomic<bool>& manual_compaction_canceled,
+ const std::string& db_id = "", const std::string& db_session_id = "",
+ std::string full_history_ts_low = "", std::string trim_ts = "",
+ BlobFileCompletionCallback* blob_callback = nullptr,
+ int* bg_compaction_scheduled = nullptr,
+ int* bg_bottom_compaction_scheduled = nullptr);
+
+ virtual ~CompactionJob();
+
+ // no copy/move
+ CompactionJob(CompactionJob&& job) = delete;
+ CompactionJob(const CompactionJob& job) = delete;
+ CompactionJob& operator=(const CompactionJob& job) = delete;
+
+ // REQUIRED: mutex held
+ // Prepare for the compaction by setting up boundaries for each subcompaction
+ void Prepare();
+ // REQUIRED: mutex not held
+ // Launch threads for each subcompaction and wait for them to finish. After
+ // that, verify that the output tables are usable, and finally do the
+ // bookkeeping to unify the subcompaction results.
+ Status Run();
+
+ // REQUIRED: mutex held
+ // Add compaction input/output to the current version
+ Status Install(const MutableCFOptions& mutable_cf_options);
+
+ // Return the IO status
+ IOStatus io_status() const { return io_status_; }
+
+ protected:
+ void UpdateCompactionStats();
+ void LogCompaction();
+ virtual void RecordCompactionIOStats();
+ void CleanupCompaction();
+
+ // Call compaction filter. Then iterate through input and compact the
+ // kv-pairs
+ void ProcessKeyValueCompaction(SubcompactionState* sub_compact);
+
+ CompactionState* compact_;
+ InternalStats::CompactionStatsFull compaction_stats_;
+ const ImmutableDBOptions& db_options_;
+ const MutableDBOptions mutable_db_options_copy_;
+ LogBuffer* log_buffer_;
+ FSDirectory* output_directory_;
+ Statistics* stats_;
+ // Is this compaction creating a file in the bottom most level?
+ bool bottommost_level_;
+
+ Env::WriteLifeTimeHint write_hint_;
+
+ IOStatus io_status_;
+
+ CompactionJobStats* compaction_job_stats_;
+
+ private:
+ friend class CompactionJobTestBase;
+
+ // Generates a histogram representing potential divisions of key ranges from
+ // the input. It adds the starting and/or ending keys of certain input files
+ // to the working set and then finds the approximate size of data in between
+ // each consecutive pair of slices. Then it divides these ranges into
+ // consecutive groups such that each group has a similar size.
+ void GenSubcompactionBoundaries();
+
+ // Get the number of planned subcompactions based on max_subcompactions and
+ // extra reserved resources
+ uint64_t GetSubcompactionsLimit();
+
+ // Additional threads may be reserved, and their number is stored in
+ // extra_num_subcompaction_threads_reserved_. For now, this happens only if
+ // the compaction priority is round-robin and max_subcompactions is not
+ // sufficient (extra resources may be needed).
+ void AcquireSubcompactionResources(int num_extra_required_subcompactions);
+
+ // Additional threads may be reserved during IncreaseSubcompactionResources()
+ // if num_actual_subcompactions is less than num_planned_subcompactions.
+ // Additional threads will be released and the bg_compaction_scheduled_ or
+ // bg_bottom_compaction_scheduled_ will be updated if they are used.
+ // DB Mutex lock is required.
+ void ShrinkSubcompactionResources(uint64_t num_extra_resources);
+
+ // Release all reserved threads and update the compaction limits.
+ void ReleaseSubcompactionResources();
+
+ CompactionServiceJobStatus ProcessKeyValueCompactionWithCompactionService(
+ SubcompactionState* sub_compact);
+
+ // update the thread status for starting a compaction.
+ void ReportStartedCompaction(Compaction* compaction);
+
+ Status FinishCompactionOutputFile(const Status& input_status,
+ SubcompactionState* sub_compact,
+ CompactionOutputs& outputs,
+ const Slice& next_table_min_key);
+ Status InstallCompactionResults(const MutableCFOptions& mutable_cf_options);
+ Status OpenCompactionOutputFile(SubcompactionState* sub_compact,
+ CompactionOutputs& outputs);
+ void UpdateCompactionJobStats(
+ const InternalStats::CompactionStats& stats) const;
+ void RecordDroppedKeys(const CompactionIterationStats& c_iter_stats,
+ CompactionJobStats* compaction_job_stats = nullptr);
+
+ void UpdateCompactionInputStatsHelper(int* num_files, uint64_t* bytes_read,
+ int input_level);
+
+ void NotifyOnSubcompactionBegin(SubcompactionState* sub_compact);
+
+ void NotifyOnSubcompactionCompleted(SubcompactionState* sub_compact);
+
+ uint32_t job_id_;
+
+ // DBImpl state
+ const std::string& dbname_;
+ const std::string db_id_;
+ const std::string db_session_id_;
+ const FileOptions file_options_;
+
+ Env* env_;
+ std::shared_ptr<IOTracer> io_tracer_;
+ FileSystemPtr fs_;
+ // env_option optimized for compaction table reads
+ FileOptions file_options_for_read_;
+ VersionSet* versions_;
+ const std::atomic<bool>* shutting_down_;
+ const std::atomic<bool>& manual_compaction_canceled_;
+ FSDirectory* db_directory_;
+ FSDirectory* blob_output_directory_;
+ InstrumentedMutex* db_mutex_;
+ ErrorHandler* db_error_handler_;
+ // If there were two snapshots with seq numbers s1 and s2, with s1 < s2, and
+ // we find two instances of a key k1 that both lie entirely between s1 and
+ // s2, then the earlier version of k1 can be safely deleted because that
+ // version is not visible in any snapshot.
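+ // For example (illustrative only): with snapshots at sequence numbers 10 and
+ // 20, and versions of k1 at sequence numbers 12 and 15, the version at 12 is
+ // visible to neither snapshot (snapshot 10 predates both, snapshot 20 sees
+ // the one at 15), so it can be dropped.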
+ std::vector<SequenceNumber> existing_snapshots_;
+
+ // This is the earliest snapshot that could be used for write-conflict
+ // checking by a transaction. For any user-key newer than this snapshot, we
+ // should make sure not to remove evidence that a write occurred.
+ SequenceNumber earliest_write_conflict_snapshot_;
+
+ const SnapshotChecker* const snapshot_checker_;
+
+ JobContext* job_context_;
+
+ std::shared_ptr<Cache> table_cache_;
+
+ EventLogger* event_logger_;
+
+ bool paranoid_file_checks_;
+ bool measure_io_stats_;
+ // Stores the Slices that designate the boundaries for each subcompaction
+ std::vector<std::string> boundaries_;
+ Env::Priority thread_pri_;
+ std::string full_history_ts_low_;
+ std::string trim_ts_;
+ BlobFileCompletionCallback* blob_callback_;
+
+ uint64_t GetCompactionId(SubcompactionState* sub_compact) const;
+ // Stores the number of threads reserved in the shared env_ for extra
+ // subcompactions under the kRoundRobin compaction priority.
+ int extra_num_subcompaction_threads_reserved_;
+
+ // Stores the pointers to DBImpl's bg_compaction_scheduled_ and
+ // bg_bottom_compaction_scheduled_. The DB mutex is required when accessing
+ // or updating them.
+ int* bg_compaction_scheduled_;
+ int* bg_bottom_compaction_scheduled_;
+
+ // Stores the sequence-number-to-time mapping gathered from all input files;
+ // it also collects the smallest_seqno -> oldest_ancester_time from the SSTs.
+ SeqnoToTimeMapping seqno_time_mapping_;
+
+ // Minimal sequence number for preserving the time information. Time info
+ // older than this sequence number won't be preserved after the compaction,
+ // and if it's a bottommost compaction, the seq num will be zeroed out.
+ SequenceNumber preserve_time_min_seqno_ = kMaxSequenceNumber;
+
+ // Minimal sequence number to preclude data from the last level. If a key has
+ // a bigger (newer) sequence number than this, it will be precluded from the
+ // last level (i.e. output to the penultimate level instead).
+ SequenceNumber preclude_last_level_min_seqno_ = kMaxSequenceNumber;
+
+ // Get the table file name at the location it is being output to, which
+ // should also be under `output_directory_`.
+ virtual std::string GetTableFileName(uint64_t file_number);
+ // The rate limiter priority (io_priority) is determined dynamically here.
+ // Compaction reads and writes share the same priority, which is elevated in
+ // scenarios such as a write stall.
+ Env::IOPriority GetRateLimiterPriority();
+};
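+
+// Typical driver sequence (illustrative sketch only; the real call sites in
+// DBImpl pass many more arguments, and the names db_mutex and
+// mutable_cf_options below are placeholders):
+//
+//   CompactionJob job(/* job_id, compaction, db_options, ... */);
+//   {
+//     InstrumentedMutexLock l(db_mutex);  // Prepare() requires the DB mutex
+//     job.Prepare();
+//   }
+//   Status s = job.Run();                 // Run() executes without the mutex
+//   {
+//     InstrumentedMutexLock l(db_mutex);  // Install() requires the DB mutex
+//     if (s.ok()) {
+//       s = job.Install(mutable_cf_options);
+//     }
+//   }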
+
+// CompactionServiceInput is used to pass compaction information between two
+// db instances. It contains the information needed to do a compaction. It
+// doesn't contain the LSM tree information, which is passed through the
+// MANIFEST file.
+struct CompactionServiceInput {
+ ColumnFamilyDescriptor column_family;
+
+ DBOptions db_options;
+
+ std::vector<SequenceNumber> snapshots;
+
+ // SST files for compaction; the list should already be expanded to include
+ // all the files needed for this compaction, for both input level files and
+ // output level files.
+ std::vector<std::string> input_files;
+ int output_level;
+
+ // db_id is used to generate unique id of sst on the remote compactor
+ std::string db_id;
+
+ // information for subcompaction
+ bool has_begin = false;
+ std::string begin;
+ bool has_end = false;
+ std::string end;
+
+ // serialization interface to read and write the object
+ static Status Read(const std::string& data_str, CompactionServiceInput* obj);
+ Status Write(std::string* output);
+
+ // Initialize a dummy ColumnFamilyDescriptor
+ CompactionServiceInput() : column_family("", ColumnFamilyOptions()) {}
+
+#ifndef NDEBUG
+ bool TEST_Equals(CompactionServiceInput* other);
+ bool TEST_Equals(CompactionServiceInput* other, std::string* mismatch);
+#endif // NDEBUG
+};
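+
+// Illustrative round-trip sketch (not part of the upstream code; the local
+// variable names below are placeholders): the primary instance serializes the
+// input with Write() and the remote compactor rebuilds it with Read().
+//
+//   CompactionServiceInput input;
+//   input.output_level = 2;
+//   std::string payload;
+//   Status s = input.Write(&payload);
+//   CompactionServiceInput decoded;
+//   if (s.ok()) {
+//     s = CompactionServiceInput::Read(payload, &decoded);
+//   }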
+
+// CompactionServiceOutputFile is the metadata for the output SST file
+struct CompactionServiceOutputFile {
+ std::string file_name;
+ SequenceNumber smallest_seqno;
+ SequenceNumber largest_seqno;
+ std::string smallest_internal_key;
+ std::string largest_internal_key;
+ uint64_t oldest_ancester_time;
+ uint64_t file_creation_time;
+ uint64_t paranoid_hash;
+ bool marked_for_compaction;
+ UniqueId64x2 unique_id;
+
+ CompactionServiceOutputFile() = default;
+ CompactionServiceOutputFile(
+ const std::string& name, SequenceNumber smallest, SequenceNumber largest,
+ std::string _smallest_internal_key, std::string _largest_internal_key,
+ uint64_t _oldest_ancester_time, uint64_t _file_creation_time,
+ uint64_t _paranoid_hash, bool _marked_for_compaction,
+ UniqueId64x2 _unique_id)
+ : file_name(name),
+ smallest_seqno(smallest),
+ largest_seqno(largest),
+ smallest_internal_key(std::move(_smallest_internal_key)),
+ largest_internal_key(std::move(_largest_internal_key)),
+ oldest_ancester_time(_oldest_ancester_time),
+ file_creation_time(_file_creation_time),
+ paranoid_hash(_paranoid_hash),
+ marked_for_compaction(_marked_for_compaction),
+ unique_id(std::move(_unique_id)) {}
+};
+
+// CompactionServiceResult contains the compaction result from a different db
+// instance. With this information, the primary db instance, which has write
+// permission, is able to install the result into the DB.
+struct CompactionServiceResult {
+ Status status;
+ std::vector<CompactionServiceOutputFile> output_files;
+ int output_level;
+
+ // location of the output files
+ std::string output_path;
+
+ // some statistics about the compaction
+ uint64_t num_output_records = 0;
+ uint64_t total_bytes = 0;
+ uint64_t bytes_read = 0;
+ uint64_t bytes_written = 0;
+ CompactionJobStats stats;
+
+ // serialization interface to read and write the object
+ static Status Read(const std::string& data_str, CompactionServiceResult* obj);
+ Status Write(std::string* output);
+
+#ifndef NDEBUG
+ bool TEST_Equals(CompactionServiceResult* other);
+ bool TEST_Equals(CompactionServiceResult* other, std::string* mismatch);
+#endif // NDEBUG
+};
+
+// CompactionServiceCompactionJob is a read-only compaction job: it takes
+// input information from `compaction_service_input` and puts result
+// information in `compaction_service_result`. The SST files are generated
+// under `output_path`.
+class CompactionServiceCompactionJob : private CompactionJob {
+ public:
+ CompactionServiceCompactionJob(
+ int job_id, Compaction* compaction, const ImmutableDBOptions& db_options,
+ const MutableDBOptions& mutable_db_options,
+ const FileOptions& file_options, VersionSet* versions,
+ const std::atomic<bool>* shutting_down, LogBuffer* log_buffer,
+ FSDirectory* output_directory, Statistics* stats,
+ InstrumentedMutex* db_mutex, ErrorHandler* db_error_handler,
+ std::vector<SequenceNumber> existing_snapshots,
+ std::shared_ptr<Cache> table_cache, EventLogger* event_logger,
+ const std::string& dbname, const std::shared_ptr<IOTracer>& io_tracer,
+ const std::atomic<bool>& manual_compaction_canceled,
+ const std::string& db_id, const std::string& db_session_id,
+ std::string output_path,
+ const CompactionServiceInput& compaction_service_input,
+ CompactionServiceResult* compaction_service_result);
+
+ // Run the compaction in current thread and return the result
+ Status Run();
+
+ void CleanupCompaction();
+
+ IOStatus io_status() const { return CompactionJob::io_status(); }
+
+ protected:
+ void RecordCompactionIOStats() override;
+
+ private:
+ // Get table file name in output_path
+ std::string GetTableFileName(uint64_t file_number) override;
+ // Specifies the compaction output path; otherwise the default DB path is used
+ const std::string output_path_;
+
+ // Compaction job input
+ const CompactionServiceInput& compaction_input_;
+
+ // Compaction job result
+ CompactionServiceResult* compaction_result_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/compaction_job_stats_test.cc b/src/rocksdb/db/compaction/compaction_job_stats_test.cc
new file mode 100644
index 000000000..930270778
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_job_stats_test.cc
@@ -0,0 +1,975 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <algorithm>
+#include <cinttypes>
+#include <iostream>
+#include <mutex>
+#include <queue>
+#include <set>
+#include <thread>
+#include <unordered_set>
+#include <utility>
+
+#include "db/db_impl/db_impl.h"
+#include "db/dbformat.h"
+#include "db/job_context.h"
+#include "db/version_set.h"
+#include "db/write_batch_internal.h"
+#include "env/mock_env.h"
+#include "file/filename.h"
+#include "monitoring/statistics.h"
+#include "monitoring/thread_status_util.h"
+#include "port/stack_trace.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/experimental.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/options.h"
+#include "rocksdb/perf_context.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/table.h"
+#include "rocksdb/table_properties.h"
+#include "rocksdb/thread_status.h"
+#include "rocksdb/utilities/checkpoint.h"
+#include "rocksdb/utilities/write_batch_with_index.h"
+#include "table/block_based/block_based_table_factory.h"
+#include "table/mock_table.h"
+#include "table/plain/plain_table_factory.h"
+#include "table/scoped_arena_iterator.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/cast_util.h"
+#include "util/compression.h"
+#include "util/hash.h"
+#include "util/mutexlock.h"
+#include "util/rate_limiter.h"
+#include "util/string_util.h"
+#include "utilities/merge_operators.h"
+
+#if !defined(IOS_CROSS_COMPILE)
+#ifndef ROCKSDB_LITE
+namespace ROCKSDB_NAMESPACE {
+
+static std::string RandomString(Random* rnd, int len, double ratio) {
+ std::string r;
+ test::CompressibleString(rnd, ratio, len, &r);
+ return r;
+}
+
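+// Formats `key` as a zero-padded decimal string of width `length` (capped at
+// kBufSize), e.g. Key(42, 8) returns "00000042".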
+std::string Key(uint64_t key, int length) {
+ const int kBufSize = 1000;
+ char buf[kBufSize];
+ if (length > kBufSize) {
+ length = kBufSize;
+ }
+ snprintf(buf, kBufSize, "%0*" PRIu64, length, key);
+ return std::string(buf);
+}
+
+class CompactionJobStatsTest : public testing::Test,
+ public testing::WithParamInterface<bool> {
+ public:
+ std::string dbname_;
+ std::string alternative_wal_dir_;
+ Env* env_;
+ DB* db_;
+ std::vector<ColumnFamilyHandle*> handles_;
+ uint32_t max_subcompactions_;
+
+ Options last_options_;
+
+ CompactionJobStatsTest() : env_(Env::Default()) {
+ env_->SetBackgroundThreads(1, Env::LOW);
+ env_->SetBackgroundThreads(1, Env::HIGH);
+ dbname_ = test::PerThreadDBPath("compaction_job_stats_test");
+ alternative_wal_dir_ = dbname_ + "/wal";
+ Options options;
+ options.create_if_missing = true;
+ max_subcompactions_ = GetParam();
+ options.max_subcompactions = max_subcompactions_;
+ auto delete_options = options;
+ delete_options.wal_dir = alternative_wal_dir_;
+ EXPECT_OK(DestroyDB(dbname_, delete_options));
+ // Destroy it again in case the alternative WAL dir was not used.
+ EXPECT_OK(DestroyDB(dbname_, options));
+ db_ = nullptr;
+ Reopen(options);
+ }
+
+ ~CompactionJobStatsTest() override {
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+ Close();
+ Options options;
+ options.db_paths.emplace_back(dbname_, 0);
+ options.db_paths.emplace_back(dbname_ + "_2", 0);
+ options.db_paths.emplace_back(dbname_ + "_3", 0);
+ options.db_paths.emplace_back(dbname_ + "_4", 0);
+ EXPECT_OK(DestroyDB(dbname_, options));
+ }
+
+ // Required if inheriting from testing::WithParamInterface<>
+ static void SetUpTestCase() {}
+ static void TearDownTestCase() {}
+
+ DBImpl* dbfull() { return static_cast_with_check<DBImpl>(db_); }
+
+ void CreateColumnFamilies(const std::vector<std::string>& cfs,
+ const Options& options) {
+ ColumnFamilyOptions cf_opts(options);
+ size_t cfi = handles_.size();
+ handles_.resize(cfi + cfs.size());
+ for (auto cf : cfs) {
+ ASSERT_OK(db_->CreateColumnFamily(cf_opts, cf, &handles_[cfi++]));
+ }
+ }
+
+ void CreateAndReopenWithCF(const std::vector<std::string>& cfs,
+ const Options& options) {
+ CreateColumnFamilies(cfs, options);
+ std::vector<std::string> cfs_plus_default = cfs;
+ cfs_plus_default.insert(cfs_plus_default.begin(), kDefaultColumnFamilyName);
+ ReopenWithColumnFamilies(cfs_plus_default, options);
+ }
+
+ void ReopenWithColumnFamilies(const std::vector<std::string>& cfs,
+ const std::vector<Options>& options) {
+ ASSERT_OK(TryReopenWithColumnFamilies(cfs, options));
+ }
+
+ void ReopenWithColumnFamilies(const std::vector<std::string>& cfs,
+ const Options& options) {
+ ASSERT_OK(TryReopenWithColumnFamilies(cfs, options));
+ }
+
+ Status TryReopenWithColumnFamilies(const std::vector<std::string>& cfs,
+ const std::vector<Options>& options) {
+ Close();
+ EXPECT_EQ(cfs.size(), options.size());
+ std::vector<ColumnFamilyDescriptor> column_families;
+ for (size_t i = 0; i < cfs.size(); ++i) {
+ column_families.push_back(ColumnFamilyDescriptor(cfs[i], options[i]));
+ }
+ DBOptions db_opts = DBOptions(options[0]);
+ return DB::Open(db_opts, dbname_, column_families, &handles_, &db_);
+ }
+
+ Status TryReopenWithColumnFamilies(const std::vector<std::string>& cfs,
+ const Options& options) {
+ Close();
+ std::vector<Options> v_opts(cfs.size(), options);
+ return TryReopenWithColumnFamilies(cfs, v_opts);
+ }
+
+ void Reopen(const Options& options) { ASSERT_OK(TryReopen(options)); }
+
+ void Close() {
+ for (auto h : handles_) {
+ delete h;
+ }
+ handles_.clear();
+ delete db_;
+ db_ = nullptr;
+ }
+
+ void DestroyAndReopen(const Options& options) {
+ // Destroy using last options
+ Destroy(last_options_);
+ ASSERT_OK(TryReopen(options));
+ }
+
+ void Destroy(const Options& options) {
+ Close();
+ ASSERT_OK(DestroyDB(dbname_, options));
+ }
+
+ Status ReadOnlyReopen(const Options& options) {
+ return DB::OpenForReadOnly(options, dbname_, &db_);
+ }
+
+ Status TryReopen(const Options& options) {
+ Close();
+ last_options_ = options;
+ return DB::Open(options, dbname_, &db_);
+ }
+
+ Status Flush(int cf = 0) {
+ if (cf == 0) {
+ return db_->Flush(FlushOptions());
+ } else {
+ return db_->Flush(FlushOptions(), handles_[cf]);
+ }
+ }
+
+ Status Put(const Slice& k, const Slice& v, WriteOptions wo = WriteOptions()) {
+ return db_->Put(wo, k, v);
+ }
+
+ Status Put(int cf, const Slice& k, const Slice& v,
+ WriteOptions wo = WriteOptions()) {
+ return db_->Put(wo, handles_[cf], k, v);
+ }
+
+ Status Delete(const std::string& k) { return db_->Delete(WriteOptions(), k); }
+
+ Status Delete(int cf, const std::string& k) {
+ return db_->Delete(WriteOptions(), handles_[cf], k);
+ }
+
+ std::string Get(const std::string& k, const Snapshot* snapshot = nullptr) {
+ ReadOptions options;
+ options.verify_checksums = true;
+ options.snapshot = snapshot;
+ std::string result;
+ Status s = db_->Get(options, k, &result);
+ if (s.IsNotFound()) {
+ result = "NOT_FOUND";
+ } else if (!s.ok()) {
+ result = s.ToString();
+ }
+ return result;
+ }
+
+ std::string Get(int cf, const std::string& k,
+ const Snapshot* snapshot = nullptr) {
+ ReadOptions options;
+ options.verify_checksums = true;
+ options.snapshot = snapshot;
+ std::string result;
+ Status s = db_->Get(options, handles_[cf], k, &result);
+ if (s.IsNotFound()) {
+ result = "NOT_FOUND";
+ } else if (!s.ok()) {
+ result = s.ToString();
+ }
+ return result;
+ }
+
+ int NumTableFilesAtLevel(int level, int cf = 0) {
+ std::string property;
+ if (cf == 0) {
+ // default cfd
+ EXPECT_TRUE(db_->GetProperty(
+ "rocksdb.num-files-at-level" + std::to_string(level), &property));
+ } else {
+ EXPECT_TRUE(db_->GetProperty(
+ handles_[cf], "rocksdb.num-files-at-level" + std::to_string(level),
+ &property));
+ }
+ return atoi(property.c_str());
+ }
+
+ // Return spread of files per level
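+ // e.g. "0,2,7" means 0 files at L0, 2 at L1 and 7 at L2; trailing levels
+ // with no files are omitted.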
+ std::string FilesPerLevel(int cf = 0) {
+ int num_levels =
+ (cf == 0) ? db_->NumberLevels() : db_->NumberLevels(handles_[1]);
+ std::string result;
+ size_t last_non_zero_offset = 0;
+ for (int level = 0; level < num_levels; level++) {
+ int f = NumTableFilesAtLevel(level, cf);
+ char buf[100];
+ snprintf(buf, sizeof(buf), "%s%d", (level ? "," : ""), f);
+ result += buf;
+ if (f > 0) {
+ last_non_zero_offset = result.size();
+ }
+ }
+ result.resize(last_non_zero_offset);
+ return result;
+ }
+
+ Status Size(uint64_t* size, const Slice& start, const Slice& limit,
+ int cf = 0) {
+ Range r(start, limit);
+ if (cf == 0) {
+ return db_->GetApproximateSizes(&r, 1, size);
+ } else {
+ return db_->GetApproximateSizes(handles_[1], &r, 1, size);
+ }
+ }
+
+ void Compact(int cf, const Slice& start, const Slice& limit,
+ uint32_t target_path_id) {
+ CompactRangeOptions compact_options;
+ compact_options.target_path_id = target_path_id;
+ ASSERT_OK(db_->CompactRange(compact_options, handles_[cf], &start, &limit));
+ }
+
+ void Compact(int cf, const Slice& start, const Slice& limit) {
+ ASSERT_OK(
+ db_->CompactRange(CompactRangeOptions(), handles_[cf], &start, &limit));
+ }
+
+ void Compact(const Slice& start, const Slice& limit) {
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &start, &limit));
+ }
+
+ void TEST_Compact(int level, int cf, const Slice& start, const Slice& limit) {
+ ASSERT_OK(dbfull()->TEST_CompactRange(level, &start, &limit, handles_[cf],
+ true /* disallow trivial move */));
+ }
+
+ // Do n memtable compactions, each of which produces an sstable
+ // covering the range [small,large].
+ void MakeTables(int n, const std::string& small, const std::string& large,
+ int cf = 0) {
+ for (int i = 0; i < n; i++) {
+ ASSERT_OK(Put(cf, small, "begin"));
+ ASSERT_OK(Put(cf, large, "end"));
+ ASSERT_OK(Flush(cf));
+ }
+ }
+
+ static void SetDeletionCompactionStats(CompactionJobStats* stats,
+ uint64_t input_deletions,
+ uint64_t expired_deletions,
+ uint64_t records_replaced) {
+ stats->num_input_deletion_records = input_deletions;
+ stats->num_expired_deletion_records = expired_deletions;
+ stats->num_records_replaced = records_replaced;
+ }
+
+ void MakeTableWithKeyValues(Random* rnd, uint64_t smallest, uint64_t largest,
+ int key_size, int value_size, uint64_t interval,
+ double ratio, int cf = 0) {
+ for (auto key = smallest; key < largest; key += interval) {
+ ASSERT_OK(Put(cf, Slice(Key(key, key_size)),
+ Slice(RandomString(rnd, value_size, ratio))));
+ }
+ ASSERT_OK(Flush(cf));
+ }
+
+ // This function behaves with the implicit understanding that two
+ // rounds of keys are inserted into the database, as per the behavior
+ // of the DeletionStatsTest.
+ void SelectivelyDeleteKeys(uint64_t smallest, uint64_t largest,
+ uint64_t interval, int deletion_interval,
+ int key_size, uint64_t cutoff_key_num,
+ CompactionJobStats* stats, int cf = 0) {
+ // interval needs to be >= 2 so that we can insert deletion entries that are
+ // intended not to result in an actual key deletion, by using an offset of 1
+ // from an existing key.
+ ASSERT_GE(interval, 2);
+
+ uint64_t ctr = 1;
+ uint32_t deletions_made = 0;
+ uint32_t num_deleted = 0;
+ uint32_t num_expired = 0;
+ for (auto key = smallest; key <= largest; key += interval, ctr++) {
+ if (ctr % deletion_interval == 0) {
+ ASSERT_OK(Delete(cf, Key(key, key_size)));
+ deletions_made++;
+ num_deleted++;
+
+ if (key > cutoff_key_num) {
+ num_expired++;
+ }
+ }
+ }
+
+ // Insert some deletions for keys that don't exist that
+ // are both in and out of the key range
+ ASSERT_OK(Delete(cf, Key(smallest + 1, key_size)));
+ deletions_made++;
+
+ ASSERT_OK(Delete(cf, Key(smallest - 1, key_size)));
+ deletions_made++;
+ num_expired++;
+
+ ASSERT_OK(Delete(cf, Key(smallest - 9, key_size)));
+ deletions_made++;
+ num_expired++;
+
+ ASSERT_OK(Flush(cf));
+ SetDeletionCompactionStats(stats, deletions_made, num_expired, num_deleted);
+ }
+};
+
+// An EventListener which helps verify the compaction results in
+// test CompactionJobStatsTest.
+class CompactionJobStatsChecker : public EventListener {
+ public:
+ CompactionJobStatsChecker()
+ : compression_enabled_(false), verify_next_comp_io_stats_(false) {}
+
+ size_t NumberOfUnverifiedStats() { return expected_stats_.size(); }
+
+ void set_verify_next_comp_io_stats(bool v) { verify_next_comp_io_stats_ = v; }
+
+ // Once a compaction has completed, this function verifies the returned
+ // CompactionJobInfo against the oldest CompactionJobInfo added earlier
+ // to "expected_stats_" that has not yet been used for verification.
+ void OnCompactionCompleted(DB* /*db*/, const CompactionJobInfo& ci) override {
+ if (verify_next_comp_io_stats_) {
+ ASSERT_GT(ci.stats.file_write_nanos, 0);
+ ASSERT_GT(ci.stats.file_range_sync_nanos, 0);
+ ASSERT_GT(ci.stats.file_fsync_nanos, 0);
+ ASSERT_GT(ci.stats.file_prepare_write_nanos, 0);
+ verify_next_comp_io_stats_ = false;
+ }
+
+ std::lock_guard<std::mutex> lock(mutex_);
+ if (expected_stats_.size()) {
+ Verify(ci.stats, expected_stats_.front());
+ expected_stats_.pop();
+ }
+ }
+
+ // A helper function which verifies whether two CompactionJobStats
+ // match. The verification of all compaction stats is done with
+ // ASSERT_EQ, except for the total input / output bytes, for which we
+ // use ASSERT_GE and ASSERT_LE with a reasonable bias ---
+ // 10% in the uncompressed case and 20% when compression is used.
+ virtual void Verify(const CompactionJobStats& current_stats,
+ const CompactionJobStats& stats) {
+ // time
+ ASSERT_GT(current_stats.elapsed_micros, 0U);
+
+ ASSERT_EQ(current_stats.num_input_records, stats.num_input_records);
+ ASSERT_EQ(current_stats.num_input_files, stats.num_input_files);
+ ASSERT_EQ(current_stats.num_input_files_at_output_level,
+ stats.num_input_files_at_output_level);
+
+ ASSERT_EQ(current_stats.num_output_records, stats.num_output_records);
+ ASSERT_EQ(current_stats.num_output_files, stats.num_output_files);
+
+ ASSERT_EQ(current_stats.is_full_compaction, stats.is_full_compaction);
+ ASSERT_EQ(current_stats.is_manual_compaction, stats.is_manual_compaction);
+
+ // file size
+ double kFileSizeBias = compression_enabled_ ? 0.20 : 0.10;
+ ASSERT_GE(current_stats.total_input_bytes * (1.00 + kFileSizeBias),
+ stats.total_input_bytes);
+ ASSERT_LE(current_stats.total_input_bytes,
+ stats.total_input_bytes * (1.00 + kFileSizeBias));
+ ASSERT_GE(current_stats.total_output_bytes * (1.00 + kFileSizeBias),
+ stats.total_output_bytes);
+ ASSERT_LE(current_stats.total_output_bytes,
+ stats.total_output_bytes * (1.00 + kFileSizeBias));
+ ASSERT_EQ(current_stats.total_input_raw_key_bytes,
+ stats.total_input_raw_key_bytes);
+ ASSERT_EQ(current_stats.total_input_raw_value_bytes,
+ stats.total_input_raw_value_bytes);
+
+ ASSERT_EQ(current_stats.num_records_replaced, stats.num_records_replaced);
+
+ ASSERT_EQ(current_stats.num_corrupt_keys, stats.num_corrupt_keys);
+
+ ASSERT_EQ(std::string(current_stats.smallest_output_key_prefix),
+ std::string(stats.smallest_output_key_prefix));
+ ASSERT_EQ(std::string(current_stats.largest_output_key_prefix),
+ std::string(stats.largest_output_key_prefix));
+ }
+
+ // Adds expected compaction stats, which will be used to
+ // verify the CompactionJobStats returned by the OnCompactionCompleted()
+ // callback.
+ void AddExpectedStats(const CompactionJobStats& stats) {
+ std::lock_guard<std::mutex> lock(mutex_);
+ expected_stats_.push(stats);
+ }
+
+ void EnableCompression(bool flag) { compression_enabled_ = flag; }
+
+ bool verify_next_comp_io_stats() const { return verify_next_comp_io_stats_; }
+
+ private:
+ std::mutex mutex_;
+ std::queue<CompactionJobStats> expected_stats_;
+ bool compression_enabled_;
+ bool verify_next_comp_io_stats_;
+};
+
+// An EventListener which helps verify the compaction statistics in
+// the test DeletionStatsTest.
+class CompactionJobDeletionStatsChecker : public CompactionJobStatsChecker {
+ public:
+ // Verifies whether two CompactionJobStats match.
+ void Verify(const CompactionJobStats& current_stats,
+ const CompactionJobStats& stats) override {
+ ASSERT_EQ(current_stats.num_input_deletion_records,
+ stats.num_input_deletion_records);
+ ASSERT_EQ(current_stats.num_expired_deletion_records,
+ stats.num_expired_deletion_records);
+ ASSERT_EQ(current_stats.num_records_replaced, stats.num_records_replaced);
+
+ ASSERT_EQ(current_stats.num_corrupt_keys, stats.num_corrupt_keys);
+ }
+};
+
+namespace {
+
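+// Rough estimate of an SST file's size for num_records keys: data blocks
+// (key + compressed value + a fixed per-key overhead), a bloom filter block,
+// an index block, and a fixed-size footer. Used only to derive the expected
+// total input/output bytes, which the checker compares with a tolerance.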
+uint64_t EstimatedFileSize(uint64_t num_records, size_t key_size,
+ size_t value_size, double compression_ratio = 1.0,
+ size_t block_size = 4096,
+ int bloom_bits_per_key = 10) {
+ const size_t kPerKeyOverhead = 8;
+ const size_t kFooterSize = 512;
+
+ uint64_t data_size = static_cast<uint64_t>(
+ num_records *
+ (key_size + value_size * compression_ratio + kPerKeyOverhead));
+
+ return data_size + kFooterSize +
+ num_records * bloom_bits_per_key / 8 // filter block
+ + data_size * (key_size + 8) / block_size; // index block
+}
+
+namespace {
+
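+// Copies the first prefix_length bytes of src (or all of src, if shorter)
+// into *dst.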
+void CopyPrefix(const Slice& src, size_t prefix_length, std::string* dst) {
+ assert(prefix_length > 0);
+ size_t length = src.size() > prefix_length ? prefix_length : src.size();
+ dst->assign(src.data(), length);
+}
+
+} // namespace
+
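+// Builds the CompactionJobStats a manual compaction is expected to report,
+// using EstimatedFileSize() to approximate the total input and output bytes.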
+CompactionJobStats NewManualCompactionJobStats(
+ const std::string& smallest_key, const std::string& largest_key,
+ size_t num_input_files, size_t num_input_files_at_output_level,
+ uint64_t num_input_records, size_t key_size, size_t value_size,
+ size_t num_output_files, uint64_t num_output_records,
+ double compression_ratio, uint64_t num_records_replaced,
+ bool is_full = false, bool is_manual = true) {
+ CompactionJobStats stats;
+ stats.Reset();
+
+ stats.num_input_records = num_input_records;
+ stats.num_input_files = num_input_files;
+ stats.num_input_files_at_output_level = num_input_files_at_output_level;
+
+ stats.num_output_records = num_output_records;
+ stats.num_output_files = num_output_files;
+
+ stats.total_input_bytes =
+ EstimatedFileSize(num_input_records / num_input_files, key_size,
+ value_size, compression_ratio) *
+ num_input_files;
+ stats.total_output_bytes =
+ EstimatedFileSize(num_output_records / num_output_files, key_size,
+ value_size, compression_ratio) *
+ num_output_files;
+ stats.total_input_raw_key_bytes = num_input_records * (key_size + 8);
+ stats.total_input_raw_value_bytes = num_input_records * value_size;
+
+ stats.is_full_compaction = is_full;
+ stats.is_manual_compaction = is_manual;
+
+ stats.num_records_replaced = num_records_replaced;
+
+ CopyPrefix(smallest_key, CompactionJobStats::kMaxPrefixLength,
+ &stats.smallest_output_key_prefix);
+ CopyPrefix(largest_key, CompactionJobStats::kMaxPrefixLength,
+ &stats.largest_output_key_prefix);
+
+ return stats;
+}
+
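+// Returns the first compression type supported by this build, or
+// kNoCompression if none of the candidate libraries is available.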
+CompressionType GetAnyCompression() {
+ if (Snappy_Supported()) {
+ return kSnappyCompression;
+ } else if (Zlib_Supported()) {
+ return kZlibCompression;
+ } else if (BZip2_Supported()) {
+ return kBZip2Compression;
+ } else if (LZ4_Supported()) {
+ return kLZ4Compression;
+ } else if (XPRESS_Supported()) {
+ return kXpressCompression;
+ }
+
+ return kNoCompression;
+}
+
+} // namespace
+
+TEST_P(CompactionJobStatsTest, CompactionJobStatsTest) {
+ Random rnd(301);
+ const int kBufSize = 100;
+ char buf[kBufSize];
+ uint64_t key_base = 100000000l;
+ // Note: key_base must be a multiple of num_keys_per_L0_file
+ int num_keys_per_L0_file = 100;
+ const int kTestScale = 8;
+ const int kKeySize = 10;
+ const int kValueSize = 1000;
+ const double kCompressionRatio = 0.5;
+ double compression_ratio = 1.0;
+ uint64_t key_interval = key_base / num_keys_per_L0_file;
+
+ // Whenever a compaction completes, this listener will try to
+ // verify whether the returned CompactionJobStats matches
+ // what we expect. The expected CompactionJobStats is added
+ // via AddExpectedStats().
+ auto* stats_checker = new CompactionJobStatsChecker();
+ Options options;
+ options.listeners.emplace_back(stats_checker);
+ options.create_if_missing = true;
+ // Just enough settings to hold off auto-compaction.
+ options.level0_file_num_compaction_trigger = kTestScale + 1;
+ options.num_levels = 3;
+ options.compression = kNoCompression;
+ options.max_subcompactions = max_subcompactions_;
+ options.bytes_per_sync = 512 * 1024;
+
+ options.report_bg_io_stats = true;
+ for (int test = 0; test < 2; ++test) {
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // 1st Phase: generate "num_L0_files" L0 files.
+ int num_L0_files = 0;
+ for (uint64_t start_key = key_base; start_key <= key_base * kTestScale;
+ start_key += key_base) {
+ MakeTableWithKeyValues(&rnd, start_key, start_key + key_base - 1,
+ kKeySize, kValueSize, key_interval,
+ compression_ratio, 1);
+ snprintf(buf, kBufSize, "%d", ++num_L0_files);
+ ASSERT_EQ(std::string(buf), FilesPerLevel(1));
+ }
+ ASSERT_EQ(std::to_string(num_L0_files), FilesPerLevel(1));
+
+ // 2nd Phase: perform L0 -> L1 compaction.
+ int L0_compaction_count = 6;
+ int count = 1;
+ std::string smallest_key;
+ std::string largest_key;
+ for (uint64_t start_key = key_base;
+ start_key <= key_base * L0_compaction_count;
+ start_key += key_base, count++) {
+ smallest_key = Key(start_key, 10);
+ largest_key = Key(start_key + key_base - key_interval, 10);
+ stats_checker->AddExpectedStats(NewManualCompactionJobStats(
+ smallest_key, largest_key, 1, 0, num_keys_per_L0_file, kKeySize,
+ kValueSize, 1, num_keys_per_L0_file, compression_ratio, 0));
+ ASSERT_EQ(stats_checker->NumberOfUnverifiedStats(), 1U);
+ TEST_Compact(0, 1, smallest_key, largest_key);
+ snprintf(buf, kBufSize, "%d,%d", num_L0_files - count, count);
+ ASSERT_EQ(std::string(buf), FilesPerLevel(1));
+ }
+
+ // compact two files into one in the last L0 -> L1 compaction
+ int num_remaining_L0 = num_L0_files - L0_compaction_count;
+ smallest_key = Key(key_base * (L0_compaction_count + 1), 10);
+ largest_key = Key(key_base * (kTestScale + 1) - key_interval, 10);
+ stats_checker->AddExpectedStats(NewManualCompactionJobStats(
+ smallest_key, largest_key, num_remaining_L0, 0,
+ num_keys_per_L0_file * num_remaining_L0, kKeySize, kValueSize, 1,
+ num_keys_per_L0_file * num_remaining_L0, compression_ratio, 0));
+ ASSERT_EQ(stats_checker->NumberOfUnverifiedStats(), 1U);
+ TEST_Compact(0, 1, smallest_key, largest_key);
+
+ int num_L1_files = num_L0_files - num_remaining_L0 + 1;
+ num_L0_files = 0;
+ snprintf(buf, kBufSize, "%d,%d", num_L0_files, num_L1_files);
+ ASSERT_EQ(std::string(buf), FilesPerLevel(1));
+
+ // 3rd Phase: generate sparse L0 files (wider key-range, same num of keys)
+ int sparseness = 2;
+ for (uint64_t start_key = key_base; start_key <= key_base * kTestScale;
+ start_key += key_base * sparseness) {
+ MakeTableWithKeyValues(
+ &rnd, start_key, start_key + key_base * sparseness - 1, kKeySize,
+ kValueSize, key_base * sparseness / num_keys_per_L0_file,
+ compression_ratio, 1);
+ snprintf(buf, kBufSize, "%d,%d", ++num_L0_files, num_L1_files);
+ ASSERT_EQ(std::string(buf), FilesPerLevel(1));
+ }
+
+ // 4th Phase: perform the L0 -> L1 compaction again, expecting higher write
+ // amplification. When subcompactions are enabled, the number of output
+ // files increases by 1 because multiple threads consume the input and
+ // generate output files without coordinating to see whether the output
+ // could fit into a smaller number of files, as it does when the compaction
+ // runs sequentially.
+ int num_output_files = options.max_subcompactions > 1 ? 2 : 1;
+ for (uint64_t start_key = key_base; num_L0_files > 1;
+ start_key += key_base * sparseness) {
+ smallest_key = Key(start_key, 10);
+ largest_key = Key(start_key + key_base * sparseness - key_interval, 10);
+ stats_checker->AddExpectedStats(NewManualCompactionJobStats(
+ smallest_key, largest_key, 3, 2, num_keys_per_L0_file * 3, kKeySize,
+ kValueSize, num_output_files,
+ num_keys_per_L0_file * 2, // 1/3 of the data will be updated.
+ compression_ratio, num_keys_per_L0_file));
+ ASSERT_EQ(stats_checker->NumberOfUnverifiedStats(), 1U);
+ Compact(1, smallest_key, largest_key);
+ if (options.max_subcompactions == 1) {
+ --num_L1_files;
+ }
+ snprintf(buf, kBufSize, "%d,%d", --num_L0_files, num_L1_files);
+ ASSERT_EQ(std::string(buf), FilesPerLevel(1));
+ }
+
+ // 5th Phase: Do a full compaction, which involves two sub-compactions.
+ // Here we expect to have 1 L0 file and 4 L1 files.
+ // In the first sub-compaction, we expect an L0 compaction.
+ smallest_key = Key(key_base, 10);
+ largest_key = Key(key_base * (kTestScale + 1) - key_interval, 10);
+ stats_checker->AddExpectedStats(NewManualCompactionJobStats(
+ Key(key_base * (kTestScale + 1 - sparseness), 10), largest_key, 2, 1,
+ num_keys_per_L0_file * 3, kKeySize, kValueSize, 1,
+ num_keys_per_L0_file * 2, compression_ratio, num_keys_per_L0_file));
+ ASSERT_EQ(stats_checker->NumberOfUnverifiedStats(), 1U);
+ Compact(1, smallest_key, largest_key);
+
+ num_L1_files = options.max_subcompactions > 1 ? 7 : 4;
+ char L1_buf[4];
+ snprintf(L1_buf, sizeof(L1_buf), "0,%d", num_L1_files);
+ std::string L1_files(L1_buf);
+ ASSERT_EQ(L1_files, FilesPerLevel(1));
+ options.compression = GetAnyCompression();
+ if (options.compression == kNoCompression) {
+ break;
+ }
+ stats_checker->EnableCompression(true);
+ compression_ratio = kCompressionRatio;
+
+ for (int i = 0; i < 5; i++) {
+ ASSERT_OK(Put(1, Slice(Key(key_base + i, 10)),
+ Slice(RandomString(&rnd, 512 * 1024, 1))));
+ }
+
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(static_cast_with_check<DBImpl>(db_)->TEST_WaitForCompact());
+
+ stats_checker->set_verify_next_comp_io_stats(true);
+ std::atomic<bool> first_prepare_write(true);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "WritableFileWriter::Append:BeforePrepareWrite", [&](void* /*arg*/) {
+ if (first_prepare_write.load()) {
+ options.env->SleepForMicroseconds(3);
+ first_prepare_write.store(false);
+ }
+ });
+
+ std::atomic<bool> first_flush(true);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "WritableFileWriter::Flush:BeforeAppend", [&](void* /*arg*/) {
+ if (first_flush.load()) {
+ options.env->SleepForMicroseconds(3);
+ first_flush.store(false);
+ }
+ });
+
+ std::atomic<bool> first_sync(true);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "WritableFileWriter::SyncInternal:0", [&](void* /*arg*/) {
+ if (first_sync.load()) {
+ options.env->SleepForMicroseconds(3);
+ first_sync.store(false);
+ }
+ });
+
+ std::atomic<bool> first_range_sync(true);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "WritableFileWriter::RangeSync:0", [&](void* /*arg*/) {
+ if (first_range_sync.load()) {
+ options.env->SleepForMicroseconds(3);
+ first_range_sync.store(false);
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Compact(1, smallest_key, largest_key);
+
+ ASSERT_TRUE(!stats_checker->verify_next_comp_io_stats());
+ ASSERT_TRUE(!first_prepare_write.load());
+ ASSERT_TRUE(!first_flush.load());
+ ASSERT_TRUE(!first_sync.load());
+ ASSERT_TRUE(!first_range_sync.load());
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ }
+ ASSERT_EQ(stats_checker->NumberOfUnverifiedStats(), 0U);
+}
+
+TEST_P(CompactionJobStatsTest, DeletionStatsTest) {
+ Random rnd(301);
+ uint64_t key_base = 100000l;
+ // Note: key_base must be a multiple of num_keys_per_L0_file
+ int num_keys_per_L0_file = 20;
+ const int kTestScale = 8; // make sure this is even
+ const int kKeySize = 10;
+ const int kValueSize = 100;
+ double compression_ratio = 1.0;
+ uint64_t key_interval = key_base / num_keys_per_L0_file;
+ uint64_t largest_key_num = key_base * (kTestScale + 1) - key_interval;
+ uint64_t cutoff_key_num = key_base * (kTestScale / 2 + 1) - key_interval;
+ const std::string smallest_key = Key(key_base - 10, kKeySize);
+ const std::string largest_key = Key(largest_key_num + 10, kKeySize);
+
+ // Whenever a compaction completes, this listener will try to
+ // verify whether the returned CompactionJobStats matches
+ // what we expect.
+ auto* stats_checker = new CompactionJobDeletionStatsChecker();
+ Options options;
+ options.listeners.emplace_back(stats_checker);
+ options.create_if_missing = true;
+ options.level0_file_num_compaction_trigger = kTestScale + 1;
+ options.num_levels = 3;
+ options.compression = kNoCompression;
+ options.max_bytes_for_level_multiplier = 2;
+ options.max_subcompactions = max_subcompactions_;
+
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // Stage 1: Generate several L0 files and then send them to L2 by
+ // using CompactRangeOptions and CompactRange(). These files will
+ // have a strict subset of the keys from the full key range.
+ for (uint64_t start_key = key_base; start_key <= key_base * kTestScale / 2;
+ start_key += key_base) {
+ MakeTableWithKeyValues(&rnd, start_key, start_key + key_base - 1, kKeySize,
+ kValueSize, key_interval, compression_ratio, 1);
+ }
+
+ CompactRangeOptions cr_options;
+ cr_options.change_level = true;
+ cr_options.target_level = 2;
+ ASSERT_OK(db_->CompactRange(cr_options, handles_[1], nullptr, nullptr));
+ ASSERT_GT(NumTableFilesAtLevel(2, 1), 0);
+
+ // Stage 2: Generate files including keys from the entire key range
+ for (uint64_t start_key = key_base; start_key <= key_base * kTestScale;
+ start_key += key_base) {
+ MakeTableWithKeyValues(&rnd, start_key, start_key + key_base - 1, kKeySize,
+ kValueSize, key_interval, compression_ratio, 1);
+ }
+
+ // Send these L0 files to L1
+ TEST_Compact(0, 1, smallest_key, largest_key);
+ ASSERT_GT(NumTableFilesAtLevel(1, 1), 0);
+
+ // Add a new record and flush so that there is now an L0 file with a
+ // value too (not just the deletions from the next step).
+ ASSERT_OK(Put(1, Key(key_base - 6, kKeySize), "test"));
+ ASSERT_OK(Flush(1));
+
+ // Stage 3: Generate L0 files with some deletions so now
+ // there are files with the same key range in L0, L1, and L2
+ int deletion_interval = 3;
+ CompactionJobStats first_compaction_stats;
+ SelectivelyDeleteKeys(key_base, largest_key_num, key_interval,
+ deletion_interval, kKeySize, cutoff_key_num,
+ &first_compaction_stats, 1);
+
+ stats_checker->AddExpectedStats(first_compaction_stats);
+
+ // Stage 4: Trigger compaction and verify the stats
+ TEST_Compact(0, 1, smallest_key, largest_key);
+}
+
+namespace {
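+// Computes, for this test's settings, the number of newly flushed files that
+// the size-ratio-based universal compaction is expected to merge after
+// num_flushes flushes; 0 means no multi-file compaction is expected. The loop
+// effectively returns the lowest power-of-two bit set in num_flushes,
+// treating a result of 1 as "no compaction".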
+int GetUniversalCompactionInputUnits(uint32_t num_flushes) {
+ uint32_t compaction_input_units;
+ for (compaction_input_units = 1; num_flushes >= compaction_input_units;
+ compaction_input_units *= 2) {
+ if ((num_flushes & compaction_input_units) != 0) {
+ return compaction_input_units > 1 ? compaction_input_units : 0;
+ }
+ }
+ return 0;
+}
+} // namespace
+
+TEST_P(CompactionJobStatsTest, UniversalCompactionTest) {
+ Random rnd(301);
+ uint64_t key_base = 100000000l;
+ // Note: key_base must be a multiple of num_keys_per_table
+ int num_keys_per_table = 100;
+ const uint32_t kTestScale = 6;
+ const int kKeySize = 10;
+ const int kValueSize = 900;
+ double compression_ratio = 1.0;
+ uint64_t key_interval = key_base / num_keys_per_table;
+
+ auto* stats_checker = new CompactionJobStatsChecker();
+ Options options;
+ options.listeners.emplace_back(stats_checker);
+ options.create_if_missing = true;
+ options.num_levels = 3;
+ options.compression = kNoCompression;
+ options.level0_file_num_compaction_trigger = 2;
+ options.target_file_size_base = num_keys_per_table * 1000;
+ options.compaction_style = kCompactionStyleUniversal;
+ options.compaction_options_universal.size_ratio = 1;
+ options.compaction_options_universal.max_size_amplification_percent = 1000;
+ options.max_subcompactions = max_subcompactions_;
+
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // Generates the expected CompactionJobStats for each compaction
+ for (uint32_t num_flushes = 2; num_flushes <= kTestScale; num_flushes++) {
+ // Here we treat one newly flushed file as a unit.
+ //
+ // For example, if a newly flushed file is 100k, and a compaction has
+ // 4 input units, then this compaction inputs 400k.
+ uint32_t num_input_units = GetUniversalCompactionInputUnits(num_flushes);
+ if (num_input_units == 0) {
+ continue;
+ }
+ // A full compaction only happens when the number of flushes equals
+ // the number of compaction input runs.
+ bool is_full = num_flushes == num_input_units;
+ // The following statement determines the expected smallest key
+ // based on whether it is a full compaction.
+ uint64_t smallest_key = is_full ? key_base : key_base * (num_flushes - 1);
+
+ stats_checker->AddExpectedStats(NewManualCompactionJobStats(
+ Key(smallest_key, 10),
+ Key(smallest_key + key_base * num_input_units - key_interval, 10),
+ num_input_units, num_input_units > 2 ? num_input_units / 2 : 0,
+ num_keys_per_table * num_input_units, kKeySize, kValueSize,
+ num_input_units, num_keys_per_table * num_input_units, 1.0, 0, is_full,
+ false));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+ ASSERT_EQ(stats_checker->NumberOfUnverifiedStats(), 3U);
+
+ for (uint64_t start_key = key_base; start_key <= key_base * kTestScale;
+ start_key += key_base) {
+ MakeTableWithKeyValues(&rnd, start_key, start_key + key_base - 1, kKeySize,
+ kValueSize, key_interval, compression_ratio, 1);
+ ASSERT_OK(static_cast_with_check<DBImpl>(db_)->TEST_WaitForCompact());
+ }
+ ASSERT_EQ(stats_checker->NumberOfUnverifiedStats(), 0U);
+}
+
+INSTANTIATE_TEST_CASE_P(CompactionJobStatsTest, CompactionJobStatsTest,
+ ::testing::Values(1, 4));
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+ fprintf(stderr, "SKIPPED, not supported in ROCKSDB_LITE\n");
+ return 0;
+}
+
+#endif // !ROCKSDB_LITE
+
+#else
+
+int main(int /*argc*/, char** /*argv*/) { return 0; }
+#endif // !defined(IOS_CROSS_COMPILE)
diff --git a/src/rocksdb/db/compaction/compaction_job_test.cc b/src/rocksdb/db/compaction/compaction_job_test.cc
new file mode 100644
index 000000000..c87871100
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_job_test.cc
@@ -0,0 +1,2451 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "db/compaction/compaction_job.h"
+
+#include <algorithm>
+#include <array>
+#include <cinttypes>
+#include <map>
+#include <string>
+#include <tuple>
+
+#include "db/blob/blob_index.h"
+#include "db/column_family.h"
+#include "db/db_impl/db_impl.h"
+#include "db/error_handler.h"
+#include "db/version_set.h"
+#include "file/random_access_file_reader.h"
+#include "file/writable_file_writer.h"
+#include "options/options_helper.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/db.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/options.h"
+#include "rocksdb/write_buffer_manager.h"
+#include "table/mock_table.h"
+#include "table/unique_id_impl.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/string_util.h"
+#include "utilities/merge_operators.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+
+void VerifyInitializationOfCompactionJobStats(
+ const CompactionJobStats& compaction_job_stats) {
+#if !defined(IOS_CROSS_COMPILE)
+ ASSERT_EQ(compaction_job_stats.elapsed_micros, 0U);
+
+ ASSERT_EQ(compaction_job_stats.num_input_records, 0U);
+ ASSERT_EQ(compaction_job_stats.num_input_files, 0U);
+ ASSERT_EQ(compaction_job_stats.num_input_files_at_output_level, 0U);
+
+ ASSERT_EQ(compaction_job_stats.num_output_records, 0U);
+ ASSERT_EQ(compaction_job_stats.num_output_files, 0U);
+
+ ASSERT_EQ(compaction_job_stats.is_manual_compaction, true);
+
+ ASSERT_EQ(compaction_job_stats.total_input_bytes, 0U);
+ ASSERT_EQ(compaction_job_stats.total_output_bytes, 0U);
+
+ ASSERT_EQ(compaction_job_stats.total_input_raw_key_bytes, 0U);
+ ASSERT_EQ(compaction_job_stats.total_input_raw_value_bytes, 0U);
+
+ ASSERT_EQ(compaction_job_stats.smallest_output_key_prefix[0], 0);
+ ASSERT_EQ(compaction_job_stats.largest_output_key_prefix[0], 0);
+
+ ASSERT_EQ(compaction_job_stats.num_records_replaced, 0U);
+
+ ASSERT_EQ(compaction_job_stats.num_input_deletion_records, 0U);
+ ASSERT_EQ(compaction_job_stats.num_expired_deletion_records, 0U);
+
+ ASSERT_EQ(compaction_job_stats.num_corrupt_keys, 0U);
+#endif // !defined(IOS_CROSS_COMPILE)
+}
+
+// Mock FSWritableFile for testing io priority.
+// Only override the essential functions for testing compaction io priority.
+class MockTestWritableFile : public FSWritableFileOwnerWrapper {
+ public:
+ MockTestWritableFile(std::unique_ptr<FSWritableFile>&& file,
+ Env::IOPriority io_priority)
+ : FSWritableFileOwnerWrapper(std::move(file)),
+ write_io_priority_(io_priority) {}
+ IOStatus Append(const Slice& data, const IOOptions& options,
+ IODebugContext* dbg) override {
+ EXPECT_EQ(options.rate_limiter_priority, write_io_priority_);
+ return target()->Append(data, options, dbg);
+ }
+ IOStatus Append(const Slice& data, const IOOptions& options,
+ const DataVerificationInfo& verification_info,
+ IODebugContext* dbg) override {
+ EXPECT_EQ(options.rate_limiter_priority, write_io_priority_);
+ return target()->Append(data, options, verification_info, dbg);
+ }
+ IOStatus Close(const IOOptions& options, IODebugContext* dbg) override {
+ EXPECT_EQ(options.rate_limiter_priority, write_io_priority_);
+ return target()->Close(options, dbg);
+ }
+ IOStatus Flush(const IOOptions& options, IODebugContext* dbg) override {
+ EXPECT_EQ(options.rate_limiter_priority, write_io_priority_);
+ return target()->Flush(options, dbg);
+ }
+ IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override {
+ EXPECT_EQ(options.rate_limiter_priority, write_io_priority_);
+ return target()->Sync(options, dbg);
+ }
+ IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) override {
+ EXPECT_EQ(options.rate_limiter_priority, write_io_priority_);
+ return target()->Fsync(options, dbg);
+ }
+ uint64_t GetFileSize(const IOOptions& options, IODebugContext* dbg) override {
+ EXPECT_EQ(options.rate_limiter_priority, write_io_priority_);
+ return target()->GetFileSize(options, dbg);
+ }
+ IOStatus RangeSync(uint64_t offset, uint64_t nbytes, const IOOptions& options,
+ IODebugContext* dbg) override {
+ EXPECT_EQ(options.rate_limiter_priority, write_io_priority_);
+ return target()->RangeSync(offset, nbytes, options, dbg);
+ }
+
+ void PrepareWrite(size_t offset, size_t len, const IOOptions& options,
+ IODebugContext* dbg) override {
+ EXPECT_EQ(options.rate_limiter_priority, write_io_priority_);
+ target()->PrepareWrite(offset, len, options, dbg);
+ }
+
+ IOStatus Allocate(uint64_t offset, uint64_t len, const IOOptions& options,
+ IODebugContext* dbg) override {
+ EXPECT_EQ(options.rate_limiter_priority, write_io_priority_);
+ return target()->Allocate(offset, len, options, dbg);
+ }
+
+ private:
+ Env::IOPriority write_io_priority_;
+};
+
+// Mock FSRandomAccessFile for testing io priority.
+// Only override the essential functions for testing compaction io priority.
+class MockTestRandomAccessFile : public FSRandomAccessFileOwnerWrapper {
+ public:
+ MockTestRandomAccessFile(std::unique_ptr<FSRandomAccessFile>&& file,
+ Env::IOPriority io_priority)
+ : FSRandomAccessFileOwnerWrapper(std::move(file)),
+ read_io_priority_(io_priority) {}
+
+ IOStatus Read(uint64_t offset, size_t n, const IOOptions& options,
+ Slice* result, char* scratch,
+ IODebugContext* dbg) const override {
+ EXPECT_EQ(options.rate_limiter_priority, read_io_priority_);
+ return target()->Read(offset, n, options, result, scratch, dbg);
+ }
+ IOStatus Prefetch(uint64_t offset, size_t n, const IOOptions& options,
+ IODebugContext* dbg) override {
+ EXPECT_EQ(options.rate_limiter_priority, read_io_priority_);
+ return target()->Prefetch(offset, n, options, dbg);
+ }
+
+ private:
+ Env::IOPriority read_io_priority_;
+};
+
+// Mock FileSystem for testing io priority.
+class MockTestFileSystem : public FileSystemWrapper {
+ public:
+ explicit MockTestFileSystem(const std::shared_ptr<FileSystem>& base,
+ Env::IOPriority read_io_priority,
+ Env::IOPriority write_io_priority)
+ : FileSystemWrapper(base),
+ read_io_priority_(read_io_priority),
+ write_io_priority_(write_io_priority) {}
+
+ static const char* kClassName() { return "MockTestFileSystem"; }
+ const char* Name() const override { return kClassName(); }
+
+ IOStatus NewRandomAccessFile(const std::string& fname,
+ const FileOptions& file_opts,
+ std::unique_ptr<FSRandomAccessFile>* result,
+ IODebugContext* dbg) override {
+ IOStatus s = target()->NewRandomAccessFile(fname, file_opts, result, dbg);
+ EXPECT_OK(s);
+ result->reset(
+ new MockTestRandomAccessFile(std::move(*result), read_io_priority_));
+ return s;
+ }
+ IOStatus NewWritableFile(const std::string& fname,
+ const FileOptions& file_opts,
+ std::unique_ptr<FSWritableFile>* result,
+ IODebugContext* dbg) override {
+ IOStatus s = target()->NewWritableFile(fname, file_opts, result, dbg);
+ EXPECT_OK(s);
+ result->reset(
+ new MockTestWritableFile(std::move(*result), write_io_priority_));
+ return s;
+ }
+
+ private:
+ Env::IOPriority read_io_priority_;
+ Env::IOPriority write_io_priority_;
+};
+
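+// Selects which table implementation the tests write: in-memory mock tables
+// or real block-based tables created on disk.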
+enum TableTypeForTest : uint8_t { kMockTable = 0, kBlockBasedTable = 1 };
+
+} // namespace
+
+class CompactionJobTestBase : public testing::Test {
+ protected:
+ CompactionJobTestBase(std::string dbname, const Comparator* ucmp,
+ std::function<std::string(uint64_t)> encode_u64_ts,
+ bool test_io_priority, TableTypeForTest table_type)
+ : dbname_(std::move(dbname)),
+ ucmp_(ucmp),
+ db_options_(),
+ mutable_cf_options_(cf_options_),
+ mutable_db_options_(),
+ table_cache_(NewLRUCache(50000, 16)),
+ write_buffer_manager_(db_options_.db_write_buffer_size),
+ versions_(new VersionSet(
+ dbname_, &db_options_, env_options_, table_cache_.get(),
+ &write_buffer_manager_, &write_controller_,
+ /*block_cache_tracer=*/nullptr,
+ /*io_tracer=*/nullptr, /*db_id*/ "", /*db_session_id*/ "")),
+ shutting_down_(false),
+ mock_table_factory_(new mock::MockTableFactory()),
+ error_handler_(nullptr, db_options_, &mutex_),
+ encode_u64_ts_(std::move(encode_u64_ts)),
+ test_io_priority_(test_io_priority),
+ table_type_(table_type) {
+ Env* base_env = Env::Default();
+ EXPECT_OK(
+ test::CreateEnvFromSystem(ConfigOptions(), &base_env, &env_guard_));
+ env_ = base_env;
+ fs_ = env_->GetFileSystem();
+ // Set defaults for the tests.
+ mutable_cf_options_.target_file_size_base = 1024 * 1024;
+ mutable_cf_options_.max_compaction_bytes = 10 * 1024 * 1024;
+ }
+
+ void SetUp() override {
+ EXPECT_OK(env_->CreateDirIfMissing(dbname_));
+ db_options_.env = env_;
+ db_options_.fs = fs_;
+ db_options_.db_paths.emplace_back(dbname_,
+ std::numeric_limits<uint64_t>::max());
+ cf_options_.comparator = ucmp_;
+ if (table_type_ == TableTypeForTest::kBlockBasedTable) {
+ BlockBasedTableOptions table_options;
+ cf_options_.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ } else if (table_type_ == TableTypeForTest::kMockTable) {
+ cf_options_.table_factory = mock_table_factory_;
+ } else {
+ assert(false);
+ }
+ }
+
+ std::string GenerateFileName(uint64_t file_number) {
+ FileMetaData meta;
+ std::vector<DbPath> db_paths;
+ db_paths.emplace_back(dbname_, std::numeric_limits<uint64_t>::max());
+ meta.fd = FileDescriptor(file_number, 0, 0);
+ return TableFileName(db_paths, meta.fd.GetNumber(), meta.fd.GetPathId());
+ }
+
+ std::string KeyStr(const std::string& user_key, const SequenceNumber seq_num,
+ const ValueType t, uint64_t ts = 0) {
+ std::string user_key_with_ts = user_key + encode_u64_ts_(ts);
+ return InternalKey(user_key_with_ts, seq_num, t).Encode().ToString();
+ }
+
+ static std::string BlobStr(uint64_t blob_file_number, uint64_t offset,
+ uint64_t size) {
+ std::string blob_index;
+ BlobIndex::EncodeBlob(&blob_index, blob_file_number, offset, size,
+ kNoCompression);
+ return blob_index;
+ }
+
+ static std::string BlobStrTTL(uint64_t blob_file_number, uint64_t offset,
+ uint64_t size, uint64_t expiration) {
+ std::string blob_index;
+ BlobIndex::EncodeBlobTTL(&blob_index, expiration, blob_file_number, offset,
+ size, kNoCompression);
+ return blob_index;
+ }
+
+ static std::string BlobStrInlinedTTL(const Slice& value,
+ uint64_t expiration) {
+ std::string blob_index;
+ BlobIndex::EncodeInlinedTTL(&blob_index, expiration, value);
+ return blob_index;
+ }
+
+ // Creates a table with the specified key-value pairs.
+ void CreateTable(const std::string& table_name,
+ const mock::KVVector& contents, uint64_t& file_size) {
+ std::unique_ptr<WritableFileWriter> file_writer;
+ Status s = WritableFileWriter::Create(fs_, table_name, FileOptions(),
+ &file_writer, nullptr);
+ ASSERT_OK(s);
+ std::unique_ptr<TableBuilder> table_builder(
+ cf_options_.table_factory->NewTableBuilder(
+ TableBuilderOptions(*cfd_->ioptions(), mutable_cf_options_,
+ cfd_->internal_comparator(),
+ cfd_->int_tbl_prop_collector_factories(),
+ CompressionType::kNoCompression,
+ CompressionOptions(), 0 /* column_family_id */,
+ kDefaultColumnFamilyName, -1 /* level */),
+ file_writer.get()));
+ // Build table.
+ for (auto kv : contents) {
+ std::string key;
+ std::string value;
+ std::tie(key, value) = kv;
+ table_builder->Add(key, value);
+ }
+ ASSERT_OK(table_builder->Finish());
+ file_size = table_builder->FileSize();
+ }
+
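+ // Adds a file with the given contents at `level`: derives its key range,
+ // sequence-number bounds, and oldest blob file number from the contents,
+ // writes either a mock table or a block-based table depending on
+ // table_type_, and registers the file with the VersionSet.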
+ void AddMockFile(const mock::KVVector& contents, int level = 0) {
+ assert(contents.size() > 0);
+
+ bool first_key = true;
+ std::string smallest, largest;
+ InternalKey smallest_key, largest_key;
+ SequenceNumber smallest_seqno = kMaxSequenceNumber;
+ SequenceNumber largest_seqno = 0;
+ uint64_t oldest_blob_file_number = kInvalidBlobFileNumber;
+ for (auto kv : contents) {
+ ParsedInternalKey key;
+ std::string skey;
+ std::string value;
+ std::tie(skey, value) = kv;
+ const Status pik_status =
+ ParseInternalKey(skey, &key, true /* log_err_key */);
+
+ smallest_seqno = std::min(smallest_seqno, key.sequence);
+ largest_seqno = std::max(largest_seqno, key.sequence);
+
+ if (first_key ||
+ cfd_->user_comparator()->Compare(key.user_key, smallest) < 0) {
+ smallest.assign(key.user_key.data(), key.user_key.size());
+ smallest_key.DecodeFrom(skey);
+ }
+ if (first_key ||
+ cfd_->user_comparator()->Compare(key.user_key, largest) > 0) {
+ largest.assign(key.user_key.data(), key.user_key.size());
+ largest_key.DecodeFrom(skey);
+ }
+
+ first_key = false;
+
+ if (pik_status.ok() && key.type == kTypeBlobIndex) {
+ BlobIndex blob_index;
+ const Status s = blob_index.DecodeFrom(value);
+ if (!s.ok()) {
+ continue;
+ }
+
+ if (blob_index.IsInlined() || blob_index.HasTTL() ||
+ blob_index.file_number() == kInvalidBlobFileNumber) {
+ continue;
+ }
+
+ if (oldest_blob_file_number == kInvalidBlobFileNumber ||
+ oldest_blob_file_number > blob_index.file_number()) {
+ oldest_blob_file_number = blob_index.file_number();
+ }
+ }
+ }
+
+ uint64_t file_number = versions_->NewFileNumber();
+
+ uint64_t file_size = 0;
+ if (table_type_ == TableTypeForTest::kBlockBasedTable) {
+ CreateTable(GenerateFileName(file_number), contents, file_size);
+ } else if (table_type_ == TableTypeForTest::kMockTable) {
+ file_size = 10;
+ EXPECT_OK(mock_table_factory_->CreateMockTable(
+ env_, GenerateFileName(file_number), std::move(contents)));
+ } else {
+ assert(false);
+ }
+
+ VersionEdit edit;
+ edit.AddFile(level, file_number, 0, file_size, smallest_key, largest_key,
+ smallest_seqno, largest_seqno, false, Temperature::kUnknown,
+ oldest_blob_file_number, kUnknownOldestAncesterTime,
+ kUnknownFileCreationTime, kUnknownFileChecksum,
+ kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+
+ mutex_.Lock();
+ EXPECT_OK(
+ versions_->LogAndApply(versions_->GetColumnFamilySet()->GetDefault(),
+ mutable_cf_options_, &edit, &mutex_, nullptr));
+ mutex_.Unlock();
+ }
+
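+ // Verifies the number of output files at `output_level` against
+ // `expected_results`; for block-based tables it also re-reads each output
+ // file and compares its contents key by key, while for mock tables it
+ // checks the latest files and the first output file's oldest blob file
+ // number.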
+ void VerifyTables(int output_level,
+ const std::vector<mock::KVVector>& expected_results,
+ std::vector<uint64_t> expected_oldest_blob_file_numbers) {
+ if (expected_results.empty()) {
+ ASSERT_EQ(compaction_job_stats_.num_output_files, 0U);
+ return;
+ }
+ int expected_output_file_num = 0;
+ for (const auto& e : expected_results) {
+ if (!e.empty()) {
+ ++expected_output_file_num;
+ }
+ }
+ ASSERT_EQ(expected_output_file_num, compaction_job_stats_.num_output_files);
+ if (expected_output_file_num == 0) {
+ return;
+ }
+
+ if (expected_oldest_blob_file_numbers.empty()) {
+ expected_oldest_blob_file_numbers.resize(expected_output_file_num,
+ kInvalidBlobFileNumber);
+ }
+
+ auto cfd = versions_->GetColumnFamilySet()->GetDefault();
+ if (table_type_ == TableTypeForTest::kMockTable) {
+ ASSERT_EQ(compaction_job_stats_.num_output_files,
+ expected_results.size());
+ mock_table_factory_->AssertLatestFiles(expected_results);
+ } else {
+ assert(table_type_ == TableTypeForTest::kBlockBasedTable);
+ }
+
+ auto output_files =
+ cfd->current()->storage_info()->LevelFiles(output_level);
+ ASSERT_EQ(expected_output_file_num, output_files.size());
+
+ if (table_type_ == TableTypeForTest::kMockTable) {
+ assert(output_files.size() ==
+ static_cast<size_t>(expected_output_file_num));
+ const FileMetaData* const output_file = output_files[0];
+ ASSERT_EQ(output_file->oldest_blob_file_number,
+ expected_oldest_blob_file_numbers[0]);
+ return;
+ }
+
+ for (size_t i = 0; i < expected_results.size(); ++i) {
+ const FileMetaData* const output_file = output_files[i];
+ std::string file_name = GenerateFileName(output_file->fd.GetNumber());
+ const auto& fs = env_->GetFileSystem();
+ std::unique_ptr<RandomAccessFileReader> freader;
+ IOStatus ios = RandomAccessFileReader::Create(
+ fs, file_name, FileOptions(), &freader, nullptr);
+ ASSERT_OK(ios);
+ std::unique_ptr<TableReader> table_reader;
+ uint64_t file_size = output_file->fd.GetFileSize();
+ ReadOptions read_opts;
+ Status s = cf_options_.table_factory->NewTableReader(
+ read_opts,
+ TableReaderOptions(*cfd->ioptions(), nullptr, FileOptions(),
+ cfd_->internal_comparator()),
+ std::move(freader), file_size, &table_reader, false);
+ ASSERT_OK(s);
+ assert(table_reader);
+ std::unique_ptr<InternalIterator> iiter(
+ table_reader->NewIterator(read_opts, nullptr, nullptr, true,
+ TableReaderCaller::kUncategorized));
+ assert(iiter);
+
+ mock::KVVector from_db;
+ for (iiter->SeekToFirst(); iiter->Valid(); iiter->Next()) {
+ const Slice key = iiter->key();
+ const Slice value = iiter->value();
+ from_db.emplace_back(
+ make_pair(key.ToString(false), value.ToString(false)));
+ }
+ ASSERT_EQ(expected_results[i], from_db);
+ }
+ }
+
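+ // Advances the VersionSet's allocated, published, and last sequence
+ // numbers past `sequence_number`.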
+ void SetLastSequence(const SequenceNumber sequence_number) {
+ versions_->SetLastAllocatedSequence(sequence_number + 1);
+ versions_->SetLastPublishedSequence(sequence_number + 1);
+ versions_->SetLastSequence(sequence_number + 1);
+ }
+
+ // Returns the expected result after compaction.
+ mock::KVVector CreateTwoFiles(bool gen_corrupted_keys) {
+ stl_wrappers::KVMap expected_results;
+ constexpr int kKeysPerFile = 10000;
+ constexpr int kCorruptKeysPerFile = 200;
+ constexpr int kMatchingKeys = kKeysPerFile / 2;
+ SequenceNumber sequence_number = 0;
+
+ auto corrupt_id = [&](int id) {
+ return gen_corrupted_keys && id > 0 && id <= kCorruptKeysPerFile;
+ };
+
+ for (int i = 0; i < 2; ++i) {
+ auto contents = mock::MakeMockFile();
+ for (int k = 0; k < kKeysPerFile; ++k) {
+ auto key = std::to_string(i * kMatchingKeys + k);
+ auto value = std::to_string(i * kKeysPerFile + k);
+ InternalKey internal_key(key, ++sequence_number, kTypeValue);
+
+ // This is how the key will look once it's written to the bottommost
+ // file.
+ InternalKey bottommost_internal_key(key, 0, kTypeValue);
+
+ if (corrupt_id(k)) {
+ test::CorruptKeyType(&internal_key);
+ test::CorruptKeyType(&bottommost_internal_key);
+ }
+ contents.push_back({internal_key.Encode().ToString(), value});
+ if (i == 1 || k < kMatchingKeys || corrupt_id(k - kMatchingKeys)) {
+ expected_results.insert(
+ {bottommost_internal_key.Encode().ToString(), value});
+ }
+ }
+ mock::SortKVVector(&contents, ucmp_);
+
+ AddMockFile(contents);
+ }
+
+ SetLastSequence(sequence_number);
+
+ mock::KVVector expected_results_kvvector;
+ for (auto& kv : expected_results) {
+ expected_results_kvvector.push_back({kv.first, kv.second});
+ }
+
+ return expected_results_kvvector;
+ }
+
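+ // Destroys any existing DB under dbname_, writes a fresh MANIFEST and
+ // CURRENT file, and recovers a new VersionSet containing only the default
+ // column family.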
+ void NewDB() {
+ EXPECT_OK(DestroyDB(dbname_, Options()));
+ EXPECT_OK(env_->CreateDirIfMissing(dbname_));
+
+ std::shared_ptr<Logger> info_log;
+ DBOptions db_opts = BuildDBOptions(db_options_, mutable_db_options_);
+ Status s = CreateLoggerFromOptions(dbname_, db_opts, &info_log);
+ ASSERT_OK(s);
+ db_options_.info_log = info_log;
+
+ versions_.reset(
+ new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(),
+ &write_buffer_manager_, &write_controller_,
+ /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
+ /*db_id*/ "", /*db_session_id*/ ""));
+ compaction_job_stats_.Reset();
+ ASSERT_OK(SetIdentityFile(env_, dbname_));
+
+ VersionEdit new_db;
+ new_db.SetLogNumber(0);
+ new_db.SetNextFile(2);
+ new_db.SetLastSequence(0);
+
+ const std::string manifest = DescriptorFileName(dbname_, 1);
+ std::unique_ptr<WritableFileWriter> file_writer;
+ const auto& fs = env_->GetFileSystem();
+ s = WritableFileWriter::Create(fs, manifest,
+ fs->OptimizeForManifestWrite(env_options_),
+ &file_writer, nullptr);
+
+ ASSERT_OK(s);
+ {
+ log::Writer log(std::move(file_writer), 0, false);
+ std::string record;
+ new_db.EncodeTo(&record);
+ s = log.AddRecord(record);
+ }
+ ASSERT_OK(s);
+ // Make "CURRENT" file that points to the new manifest file.
+ s = SetCurrentFile(fs_.get(), dbname_, 1, nullptr);
+
+ ASSERT_OK(s);
+
+ cf_options_.merge_operator = merge_op_;
+ cf_options_.compaction_filter = compaction_filter_.get();
+ std::vector<ColumnFamilyDescriptor> column_families;
+ column_families.emplace_back(kDefaultColumnFamilyName, cf_options_);
+
+ ASSERT_OK(versions_->Recover(column_families, false));
+ cfd_ = versions_->GetColumnFamilySet()->GetDefault();
+ }
+
+ // input_files[i] on input_levels[i]
+ void RunLastLevelCompaction(
+ const std::vector<std::vector<FileMetaData*>>& input_files,
+ const std::vector<int> input_levels,
+ std::function<void(Compaction& comp)>&& verify_func,
+ const std::vector<SequenceNumber>& snapshots = {}) {
+ const int kLastLevel = cf_options_.num_levels - 1;
+ verify_per_key_placement_ = std::move(verify_func);
+ mock::KVVector empty_map;
+ RunCompaction(input_files, input_levels, {empty_map}, snapshots,
+ kMaxSequenceNumber, kLastLevel, false);
+ }
+
+ // input_files[i] on input_levels[i]
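+ // Builds a Compaction over the given input files, runs a CompactionJob on
+ // it, installs the result, and optionally verifies the produced tables,
+ // the expected oldest blob file numbers, the rate limiter priority, and
+ // per-key placement.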
+ void RunCompaction(
+ const std::vector<std::vector<FileMetaData*>>& input_files,
+ const std::vector<int>& input_levels,
+ const std::vector<mock::KVVector>& expected_results,
+ const std::vector<SequenceNumber>& snapshots = {},
+ SequenceNumber earliest_write_conflict_snapshot = kMaxSequenceNumber,
+ int output_level = 1, bool verify = true,
+ std::vector<uint64_t> expected_oldest_blob_file_numbers = {},
+ bool check_get_priority = false,
+ Env::IOPriority read_io_priority = Env::IO_TOTAL,
+ Env::IOPriority write_io_priority = Env::IO_TOTAL,
+ int max_subcompactions = 0) {
+ // For compaction, set fs as MockTestFileSystem to check the io_priority.
+ if (test_io_priority_) {
+ db_options_.fs.reset(
+ new MockTestFileSystem(fs_, read_io_priority, write_io_priority));
+ }
+
+ auto cfd = versions_->GetColumnFamilySet()->GetDefault();
+
+ size_t num_input_files = 0;
+ std::vector<CompactionInputFiles> compaction_input_files;
+ for (size_t i = 0; i < input_files.size(); ++i) {
+ auto level_files = input_files[i];
+ CompactionInputFiles compaction_level;
+ compaction_level.level = input_levels[i];
+ compaction_level.files.insert(compaction_level.files.end(),
+ level_files.begin(), level_files.end());
+ compaction_input_files.push_back(compaction_level);
+ num_input_files += level_files.size();
+ }
+
+ std::vector<FileMetaData*> grandparents;
+ // it should actually be the next non-empty level
+ const int kGrandparentsLevel = output_level + 1;
+ if (kGrandparentsLevel < cf_options_.num_levels) {
+ grandparents =
+ cfd_->current()->storage_info()->LevelFiles(kGrandparentsLevel);
+ }
+
+ Compaction compaction(
+ cfd->current()->storage_info(), *cfd->ioptions(),
+ *cfd->GetLatestMutableCFOptions(), mutable_db_options_,
+ compaction_input_files, output_level,
+ mutable_cf_options_.target_file_size_base,
+ mutable_cf_options_.max_compaction_bytes, 0, kNoCompression,
+ cfd->GetLatestMutableCFOptions()->compression_opts,
+ Temperature::kUnknown, max_subcompactions, grandparents, true);
+ compaction.SetInputVersion(cfd->current());
+
+ assert(db_options_.info_log);
+ LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, db_options_.info_log.get());
+ mutex_.Lock();
+ EventLogger event_logger(db_options_.info_log.get());
+ // TODO(yiwu) add a mock snapshot checker and add test for it.
+ SnapshotChecker* snapshot_checker = nullptr;
+ ASSERT_TRUE(full_history_ts_low_.empty() ||
+ ucmp_->timestamp_size() == full_history_ts_low_.size());
+ const std::atomic<bool> kManualCompactionCanceledFalse{false};
+ CompactionJob compaction_job(
+ 0, &compaction, db_options_, mutable_db_options_, env_options_,
+ versions_.get(), &shutting_down_, &log_buffer, nullptr, nullptr,
+ nullptr, nullptr, &mutex_, &error_handler_, snapshots,
+ earliest_write_conflict_snapshot, snapshot_checker, nullptr,
+ table_cache_, &event_logger, false, false, dbname_,
+ &compaction_job_stats_, Env::Priority::USER, nullptr /* IOTracer */,
+ /*manual_compaction_canceled=*/kManualCompactionCanceledFalse,
+ env_->GenerateUniqueId(), DBImpl::GenerateDbSessionId(nullptr),
+ full_history_ts_low_);
+ VerifyInitializationOfCompactionJobStats(compaction_job_stats_);
+
+ compaction_job.Prepare();
+ mutex_.Unlock();
+ Status s = compaction_job.Run();
+ ASSERT_OK(s);
+ ASSERT_OK(compaction_job.io_status());
+ mutex_.Lock();
+ ASSERT_OK(compaction_job.Install(*cfd->GetLatestMutableCFOptions()));
+ ASSERT_OK(compaction_job.io_status());
+ mutex_.Unlock();
+ log_buffer.FlushBufferToLog();
+
+ if (verify) {
+ ASSERT_GE(compaction_job_stats_.elapsed_micros, 0U);
+ ASSERT_EQ(compaction_job_stats_.num_input_files, num_input_files);
+
+ VerifyTables(output_level, expected_results,
+ expected_oldest_blob_file_numbers);
+ }
+
+ if (check_get_priority) {
+ CheckGetRateLimiterPriority(compaction_job);
+ }
+
+ if (verify_per_key_placement_) {
+ // Verify per_key_placement compaction
+ assert(compaction.SupportsPerKeyPlacement());
+ verify_per_key_placement_(compaction);
+ }
+ }
+
+ void CheckGetRateLimiterPriority(CompactionJob& compaction_job) {
+ // When the state from WriteController is normal.
+ ASSERT_EQ(compaction_job.GetRateLimiterPriority(), Env::IO_LOW);
+
+ WriteController* write_controller =
+ compaction_job.versions_->GetColumnFamilySet()->write_controller();
+
+ {
+ // When the state from WriteController is Delayed.
+ std::unique_ptr<WriteControllerToken> delay_token =
+ write_controller->GetDelayToken(1000000);
+ ASSERT_EQ(compaction_job.GetRateLimiterPriority(), Env::IO_USER);
+ }
+
+ {
+ // When the state from WriteController is Stopped.
+ std::unique_ptr<WriteControllerToken> stop_token =
+ write_controller->GetStopToken();
+ ASSERT_EQ(compaction_job.GetRateLimiterPriority(), Env::IO_USER);
+ }
+ }
+
+ std::shared_ptr<Env> env_guard_;
+ Env* env_;
+ std::shared_ptr<FileSystem> fs_;
+ std::string dbname_;
+ const Comparator* const ucmp_;
+ EnvOptions env_options_;
+ ImmutableDBOptions db_options_;
+ ColumnFamilyOptions cf_options_;
+ MutableCFOptions mutable_cf_options_;
+ MutableDBOptions mutable_db_options_;
+ std::shared_ptr<Cache> table_cache_;
+ WriteController write_controller_;
+ WriteBufferManager write_buffer_manager_;
+ std::unique_ptr<VersionSet> versions_;
+ InstrumentedMutex mutex_;
+ std::atomic<bool> shutting_down_;
+ std::shared_ptr<mock::MockTableFactory> mock_table_factory_;
+ CompactionJobStats compaction_job_stats_;
+ ColumnFamilyData* cfd_;
+ std::unique_ptr<CompactionFilter> compaction_filter_;
+ std::shared_ptr<MergeOperator> merge_op_;
+ ErrorHandler error_handler_;
+ std::string full_history_ts_low_;
+ const std::function<std::string(uint64_t)> encode_u64_ts_;
+ const bool test_io_priority_;
+ std::function<void(Compaction& comp)> verify_per_key_placement_;
+ const TableTypeForTest table_type_ = kMockTable;
+};
+
+// TODO(icanadi) Make it simpler once we mock out VersionSet
+class CompactionJobTest : public CompactionJobTestBase {
+ public:
+ CompactionJobTest()
+ : CompactionJobTestBase(
+ test::PerThreadDBPath("compaction_job_test"), BytewiseComparator(),
+ [](uint64_t /*ts*/) { return ""; }, /*test_io_priority=*/false,
+ TableTypeForTest::kMockTable) {}
+};
+
+TEST_F(CompactionJobTest, Simple) {
+ NewDB();
+
+ auto expected_results = CreateTwoFiles(false);
+ auto cfd = versions_->GetColumnFamilySet()->GetDefault();
+ constexpr int input_level = 0;
+ auto files = cfd->current()->storage_info()->LevelFiles(input_level);
+ ASSERT_EQ(2U, files.size());
+ RunCompaction({files}, {input_level}, {expected_results});
+}
+
+TEST_F(CompactionJobTest, DISABLED_SimpleCorrupted) {
+ NewDB();
+
+ auto expected_results = CreateTwoFiles(true);
+ auto cfd = versions_->GetColumnFamilySet()->GetDefault();
+ constexpr int input_level = 0;
+ auto files = cfd->current()->storage_info()->LevelFiles(input_level);
+ RunCompaction({files}, {input_level}, {expected_results});
+ ASSERT_EQ(compaction_job_stats_.num_corrupt_keys, 400U);
+}
+
+TEST_F(CompactionJobTest, SimpleDeletion) {
+ NewDB();
+
+ auto file1 = mock::MakeMockFile({{KeyStr("c", 4U, kTypeDeletion), ""},
+ {KeyStr("c", 3U, kTypeValue), "val"}});
+ AddMockFile(file1);
+
+ auto file2 = mock::MakeMockFile({{KeyStr("b", 2U, kTypeValue), "val"},
+ {KeyStr("b", 1U, kTypeValue), "val"}});
+ AddMockFile(file2);
+
+ auto expected_results =
+ mock::MakeMockFile({{KeyStr("b", 0U, kTypeValue), "val"}});
+
+ SetLastSequence(4U);
+ constexpr int input_level = 0;
+ auto files = cfd_->current()->storage_info()->LevelFiles(input_level);
+ RunCompaction({files}, {input_level}, {expected_results});
+}
+
+TEST_F(CompactionJobTest, OutputNothing) {
+ NewDB();
+
+ auto file1 = mock::MakeMockFile({{KeyStr("a", 1U, kTypeValue), "val"}});
+
+ AddMockFile(file1);
+
+ auto file2 = mock::MakeMockFile({{KeyStr("a", 2U, kTypeDeletion), ""}});
+
+ AddMockFile(file2);
+
+ auto expected_results = mock::MakeMockFile();
+
+ SetLastSequence(4U);
+
+ constexpr int input_level = 0;
+ auto files = cfd_->current()->storage_info()->LevelFiles(input_level);
+ RunCompaction({files}, {input_level}, {expected_results});
+}
+
+TEST_F(CompactionJobTest, SimpleOverwrite) {
+ NewDB();
+
+ auto file1 = mock::MakeMockFile({
+ {KeyStr("a", 3U, kTypeValue), "val2"},
+ {KeyStr("b", 4U, kTypeValue), "val3"},
+ });
+ AddMockFile(file1);
+
+ auto file2 = mock::MakeMockFile({{KeyStr("a", 1U, kTypeValue), "val"},
+ {KeyStr("b", 2U, kTypeValue), "val"}});
+ AddMockFile(file2);
+
+ auto expected_results =
+ mock::MakeMockFile({{KeyStr("a", 0U, kTypeValue), "val2"},
+ {KeyStr("b", 0U, kTypeValue), "val3"}});
+
+ SetLastSequence(4U);
+ constexpr int input_level = 0;
+ auto files = cfd_->current()->storage_info()->LevelFiles(input_level);
+ RunCompaction({files}, {input_level}, {expected_results});
+}
+
+TEST_F(CompactionJobTest, SimpleNonLastLevel) {
+ NewDB();
+
+ auto file1 = mock::MakeMockFile({
+ {KeyStr("a", 5U, kTypeValue), "val2"},
+ {KeyStr("b", 6U, kTypeValue), "val3"},
+ });
+ AddMockFile(file1);
+
+ auto file2 = mock::MakeMockFile({{KeyStr("a", 3U, kTypeValue), "val"},
+ {KeyStr("b", 4U, kTypeValue), "val"}});
+ AddMockFile(file2, 1);
+
+ auto file3 = mock::MakeMockFile({{KeyStr("a", 1U, kTypeValue), "val"},
+ {KeyStr("b", 2U, kTypeValue), "val"}});
+ AddMockFile(file3, 2);
+
+ // Because level 1 is not the last level, the sequence numbers of a and b
+ // cannot be set to 0
+ auto expected_results =
+ mock::MakeMockFile({{KeyStr("a", 5U, kTypeValue), "val2"},
+ {KeyStr("b", 6U, kTypeValue), "val3"}});
+
+ SetLastSequence(6U);
+ const std::vector<int> input_levels = {0, 1};
+ auto lvl0_files =
+ cfd_->current()->storage_info()->LevelFiles(input_levels[0]);
+ auto lvl1_files =
+ cfd_->current()->storage_info()->LevelFiles(input_levels[1]);
+ RunCompaction({lvl0_files, lvl1_files}, input_levels, {expected_results});
+}
+
+TEST_F(CompactionJobTest, SimpleMerge) {
+ merge_op_ = MergeOperators::CreateStringAppendOperator();
+ NewDB();
+
+ auto file1 = mock::MakeMockFile({
+ {KeyStr("a", 5U, kTypeMerge), "5"},
+ {KeyStr("a", 4U, kTypeMerge), "4"},
+ {KeyStr("a", 3U, kTypeValue), "3"},
+ });
+ AddMockFile(file1);
+
+ auto file2 = mock::MakeMockFile(
+ {{KeyStr("b", 2U, kTypeMerge), "2"}, {KeyStr("b", 1U, kTypeValue), "1"}});
+ AddMockFile(file2);
+
+ auto expected_results =
+ mock::MakeMockFile({{KeyStr("a", 0U, kTypeValue), "3,4,5"},
+ {KeyStr("b", 0U, kTypeValue), "1,2"}});
+
+ SetLastSequence(5U);
+ constexpr int input_level = 0;
+ auto files = cfd_->current()->storage_info()->LevelFiles(input_level);
+ RunCompaction({files}, {input_level}, {expected_results});
+}
+
+TEST_F(CompactionJobTest, NonAssocMerge) {
+ merge_op_ = MergeOperators::CreateStringAppendTESTOperator();
+ NewDB();
+
+ auto file1 = mock::MakeMockFile({
+ {KeyStr("a", 5U, kTypeMerge), "5"},
+ {KeyStr("a", 4U, kTypeMerge), "4"},
+ {KeyStr("a", 3U, kTypeMerge), "3"},
+ });
+ AddMockFile(file1);
+
+ auto file2 = mock::MakeMockFile(
+ {{KeyStr("b", 2U, kTypeMerge), "2"}, {KeyStr("b", 1U, kTypeMerge), "1"}});
+ AddMockFile(file2);
+
+ auto expected_results =
+ mock::MakeMockFile({{KeyStr("a", 0U, kTypeValue), "3,4,5"},
+ {KeyStr("b", 0U, kTypeValue), "1,2"}});
+
+ SetLastSequence(5U);
+ constexpr int input_level = 0;
+ auto files = cfd_->current()->storage_info()->LevelFiles(input_level);
+ RunCompaction({files}, {input_level}, {expected_results});
+}
+
+// Filters merge operands with value 10.
+TEST_F(CompactionJobTest, MergeOperandFilter) {
+ merge_op_ = MergeOperators::CreateUInt64AddOperator();
+ compaction_filter_.reset(new test::FilterNumber(10U));
+ NewDB();
+
+ auto file1 = mock::MakeMockFile(
+ {{KeyStr("a", 5U, kTypeMerge), test::EncodeInt(5U)},
+ {KeyStr("a", 4U, kTypeMerge), test::EncodeInt(10U)}, // Filtered
+ {KeyStr("a", 3U, kTypeMerge), test::EncodeInt(3U)}});
+ AddMockFile(file1);
+
+ auto file2 = mock::MakeMockFile({
+ {KeyStr("b", 2U, kTypeMerge), test::EncodeInt(2U)},
+ {KeyStr("b", 1U, kTypeMerge), test::EncodeInt(10U)} // Filtered
+ });
+ AddMockFile(file2);
+
+ auto expected_results =
+ mock::MakeMockFile({{KeyStr("a", 0U, kTypeValue), test::EncodeInt(8U)},
+ {KeyStr("b", 0U, kTypeValue), test::EncodeInt(2U)}});
+
+ SetLastSequence(5U);
+ constexpr int input_level = 0;
+ auto files = cfd_->current()->storage_info()->LevelFiles(input_level);
+ RunCompaction({files}, {input_level}, {expected_results});
+}
+
+TEST_F(CompactionJobTest, FilterSomeMergeOperands) {
+ merge_op_ = MergeOperators::CreateUInt64AddOperator();
+ compaction_filter_.reset(new test::FilterNumber(10U));
+ NewDB();
+
+ auto file1 = mock::MakeMockFile(
+ {{KeyStr("a", 5U, kTypeMerge), test::EncodeInt(5U)},
+ {KeyStr("a", 4U, kTypeMerge), test::EncodeInt(10U)}, // Filtered
+ {KeyStr("a", 3U, kTypeValue), test::EncodeInt(5U)},
+ {KeyStr("d", 8U, kTypeMerge), test::EncodeInt(10U)}});
+ AddMockFile(file1);
+
+ auto file2 =
+ mock::MakeMockFile({{KeyStr("b", 2U, kTypeMerge), test::EncodeInt(10U)},
+ {KeyStr("b", 1U, kTypeMerge), test::EncodeInt(10U)},
+ {KeyStr("c", 2U, kTypeMerge), test::EncodeInt(3U)},
+ {KeyStr("c", 1U, kTypeValue), test::EncodeInt(7U)},
+ {KeyStr("d", 1U, kTypeValue), test::EncodeInt(6U)}});
+ AddMockFile(file2);
+
+ auto file3 =
+ mock::MakeMockFile({{KeyStr("a", 1U, kTypeMerge), test::EncodeInt(3U)}});
+ AddMockFile(file3, 2);
+
+ auto expected_results = mock::MakeMockFile({
+ {KeyStr("a", 5U, kTypeValue), test::EncodeInt(10U)},
+ {KeyStr("c", 2U, kTypeValue), test::EncodeInt(10U)},
+ {KeyStr("d", 1U, kTypeValue), test::EncodeInt(6U)}
+ // b does not appear because the operands are filtered
+ });
+
+ SetLastSequence(5U);
+ constexpr int input_level = 0;
+ auto files = cfd_->current()->storage_info()->LevelFiles(input_level);
+ RunCompaction({files}, {input_level}, {expected_results});
+}
+
+// Test where all operands/merge results are filtered out.
+TEST_F(CompactionJobTest, FilterAllMergeOperands) {
+ merge_op_ = MergeOperators::CreateUInt64AddOperator();
+ compaction_filter_.reset(new test::FilterNumber(10U));
+ NewDB();
+
+ auto file1 =
+ mock::MakeMockFile({{KeyStr("a", 11U, kTypeMerge), test::EncodeInt(10U)},
+ {KeyStr("a", 10U, kTypeMerge), test::EncodeInt(10U)},
+ {KeyStr("a", 9U, kTypeMerge), test::EncodeInt(10U)}});
+ AddMockFile(file1);
+
+ auto file2 =
+ mock::MakeMockFile({{KeyStr("b", 8U, kTypeMerge), test::EncodeInt(10U)},
+ {KeyStr("b", 7U, kTypeMerge), test::EncodeInt(10U)},
+ {KeyStr("b", 6U, kTypeMerge), test::EncodeInt(10U)},
+ {KeyStr("b", 5U, kTypeMerge), test::EncodeInt(10U)},
+ {KeyStr("b", 4U, kTypeMerge), test::EncodeInt(10U)},
+ {KeyStr("b", 3U, kTypeMerge), test::EncodeInt(10U)},
+ {KeyStr("b", 2U, kTypeMerge), test::EncodeInt(10U)},
+ {KeyStr("c", 2U, kTypeMerge), test::EncodeInt(10U)},
+ {KeyStr("c", 1U, kTypeMerge), test::EncodeInt(10U)}});
+ AddMockFile(file2);
+
+ auto file3 =
+ mock::MakeMockFile({{KeyStr("a", 2U, kTypeMerge), test::EncodeInt(10U)},
+ {KeyStr("b", 1U, kTypeMerge), test::EncodeInt(10U)}});
+ AddMockFile(file3, 2);
+
+ SetLastSequence(11U);
+ constexpr int input_level = 0;
+ auto files = cfd_->current()->storage_info()->LevelFiles(input_level);
+
+ mock::KVVector empty_map;
+ RunCompaction({files}, {input_level}, {empty_map});
+}
+
+TEST_F(CompactionJobTest, SimpleSingleDelete) {
+ NewDB();
+
+ auto file1 = mock::MakeMockFile({
+ {KeyStr("a", 5U, kTypeDeletion), ""},
+ {KeyStr("b", 6U, kTypeSingleDeletion), ""},
+ });
+ AddMockFile(file1);
+
+ auto file2 = mock::MakeMockFile({{KeyStr("a", 3U, kTypeValue), "val"},
+ {KeyStr("b", 4U, kTypeValue), "val"}});
+ AddMockFile(file2);
+
+ auto file3 = mock::MakeMockFile({
+ {KeyStr("a", 1U, kTypeValue), "val"},
+ });
+ AddMockFile(file3, 2);
+
+ auto expected_results =
+ mock::MakeMockFile({{KeyStr("a", 5U, kTypeDeletion), ""}});
+
+ SetLastSequence(6U);
+ constexpr int input_level = 0;
+ auto files = cfd_->current()->storage_info()->LevelFiles(input_level);
+ RunCompaction({files}, {input_level}, {expected_results});
+}
+
+TEST_F(CompactionJobTest, SingleDeleteSnapshots) {
+ NewDB();
+
+ auto file1 = mock::MakeMockFile({
+ {KeyStr("A", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("a", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("b", 21U, kTypeSingleDeletion), ""},
+ {KeyStr("c", 22U, kTypeSingleDeletion), ""},
+ {KeyStr("d", 9U, kTypeSingleDeletion), ""},
+ {KeyStr("f", 21U, kTypeSingleDeletion), ""},
+ {KeyStr("j", 11U, kTypeSingleDeletion), ""},
+ {KeyStr("j", 9U, kTypeSingleDeletion), ""},
+ {KeyStr("k", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("k", 11U, kTypeSingleDeletion), ""},
+ {KeyStr("l", 3U, kTypeSingleDeletion), ""},
+ {KeyStr("l", 2U, kTypeSingleDeletion), ""},
+ });
+ AddMockFile(file1);
+
+ auto file2 = mock::MakeMockFile({
+ {KeyStr("0", 2U, kTypeSingleDeletion), ""},
+ {KeyStr("a", 11U, kTypeValue), "val1"},
+ {KeyStr("b", 11U, kTypeValue), "val2"},
+ {KeyStr("c", 21U, kTypeValue), "val3"},
+ {KeyStr("d", 8U, kTypeValue), "val4"},
+ {KeyStr("e", 2U, kTypeSingleDeletion), ""},
+ {KeyStr("f", 1U, kTypeValue), "val1"},
+ {KeyStr("g", 11U, kTypeSingleDeletion), ""},
+ {KeyStr("h", 2U, kTypeSingleDeletion), ""},
+ {KeyStr("m", 12U, kTypeValue), "val1"},
+ {KeyStr("m", 11U, kTypeSingleDeletion), ""},
+ {KeyStr("m", 8U, kTypeValue), "val2"},
+ });
+ AddMockFile(file2);
+
+ auto file3 = mock::MakeMockFile({
+ {KeyStr("A", 1U, kTypeValue), "val"},
+ {KeyStr("e", 1U, kTypeValue), "val"},
+ });
+ AddMockFile(file3, 2);
+
+ auto expected_results = mock::MakeMockFile({
+ {KeyStr("A", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("a", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("a", 11U, kTypeValue), ""},
+ {KeyStr("b", 21U, kTypeSingleDeletion), ""},
+ {KeyStr("b", 11U, kTypeValue), "val2"},
+ {KeyStr("c", 22U, kTypeSingleDeletion), ""},
+ {KeyStr("c", 21U, kTypeValue), ""},
+ {KeyStr("e", 2U, kTypeSingleDeletion), ""},
+ {KeyStr("f", 21U, kTypeSingleDeletion), ""},
+ {KeyStr("f", 1U, kTypeValue), "val1"},
+ {KeyStr("g", 11U, kTypeSingleDeletion), ""},
+ {KeyStr("j", 11U, kTypeSingleDeletion), ""},
+ {KeyStr("k", 11U, kTypeSingleDeletion), ""},
+ {KeyStr("m", 12U, kTypeValue), "val1"},
+ {KeyStr("m", 11U, kTypeSingleDeletion), ""},
+ {KeyStr("m", 8U, kTypeValue), "val2"},
+ });
+
+ SetLastSequence(22U);
+ constexpr int input_level = 0;
+ auto files = cfd_->current()->storage_info()->LevelFiles(input_level);
+ RunCompaction({files}, {input_level}, {expected_results}, {10U, 20U}, 10U);
+}
+
+TEST_F(CompactionJobTest, EarliestWriteConflictSnapshot) {
+ NewDB();
+
+ // Test multiple snapshots where the earliest snapshot is not a
+ // write-conflict snapshot.
+
+ auto file1 = mock::MakeMockFile({
+ {KeyStr("A", 24U, kTypeSingleDeletion), ""},
+ {KeyStr("A", 23U, kTypeValue), "val"},
+ {KeyStr("B", 24U, kTypeSingleDeletion), ""},
+ {KeyStr("B", 23U, kTypeValue), "val"},
+ {KeyStr("D", 24U, kTypeSingleDeletion), ""},
+ {KeyStr("G", 32U, kTypeSingleDeletion), ""},
+ {KeyStr("G", 31U, kTypeValue), "val"},
+ {KeyStr("G", 24U, kTypeSingleDeletion), ""},
+ {KeyStr("G", 23U, kTypeValue), "val2"},
+ {KeyStr("H", 31U, kTypeValue), "val"},
+ {KeyStr("H", 24U, kTypeSingleDeletion), ""},
+ {KeyStr("H", 23U, kTypeValue), "val"},
+ {KeyStr("I", 35U, kTypeSingleDeletion), ""},
+ {KeyStr("I", 34U, kTypeValue), "val2"},
+ {KeyStr("I", 33U, kTypeSingleDeletion), ""},
+ {KeyStr("I", 32U, kTypeValue), "val3"},
+ {KeyStr("I", 31U, kTypeSingleDeletion), ""},
+ {KeyStr("J", 34U, kTypeValue), "val"},
+ {KeyStr("J", 33U, kTypeSingleDeletion), ""},
+ {KeyStr("J", 25U, kTypeValue), "val2"},
+ {KeyStr("J", 24U, kTypeSingleDeletion), ""},
+ });
+ AddMockFile(file1);
+
+ auto file2 = mock::MakeMockFile({
+ {KeyStr("A", 14U, kTypeSingleDeletion), ""},
+ {KeyStr("A", 13U, kTypeValue), "val2"},
+ {KeyStr("C", 14U, kTypeSingleDeletion), ""},
+ {KeyStr("C", 13U, kTypeValue), "val"},
+ {KeyStr("E", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("F", 4U, kTypeSingleDeletion), ""},
+ {KeyStr("F", 3U, kTypeValue), "val"},
+ {KeyStr("G", 14U, kTypeSingleDeletion), ""},
+ {KeyStr("G", 13U, kTypeValue), "val3"},
+ {KeyStr("H", 14U, kTypeSingleDeletion), ""},
+ {KeyStr("H", 13U, kTypeValue), "val2"},
+ {KeyStr("I", 13U, kTypeValue), "val4"},
+ {KeyStr("I", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("I", 11U, kTypeValue), "val5"},
+ {KeyStr("J", 15U, kTypeValue), "val3"},
+ {KeyStr("J", 14U, kTypeSingleDeletion), ""},
+ });
+ AddMockFile(file2);
+
+ auto expected_results = mock::MakeMockFile({
+ {KeyStr("A", 24U, kTypeSingleDeletion), ""},
+ {KeyStr("A", 23U, kTypeValue), ""},
+ {KeyStr("B", 24U, kTypeSingleDeletion), ""},
+ {KeyStr("B", 23U, kTypeValue), ""},
+ {KeyStr("D", 24U, kTypeSingleDeletion), ""},
+ {KeyStr("E", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("G", 32U, kTypeSingleDeletion), ""},
+ {KeyStr("G", 31U, kTypeValue), ""},
+ {KeyStr("H", 31U, kTypeValue), "val"},
+ {KeyStr("I", 35U, kTypeSingleDeletion), ""},
+ {KeyStr("I", 34U, kTypeValue), ""},
+ {KeyStr("I", 31U, kTypeSingleDeletion), ""},
+ {KeyStr("I", 13U, kTypeValue), "val4"},
+ {KeyStr("J", 34U, kTypeValue), "val"},
+ {KeyStr("J", 33U, kTypeSingleDeletion), ""},
+ {KeyStr("J", 25U, kTypeValue), "val2"},
+ {KeyStr("J", 24U, kTypeSingleDeletion), ""},
+ {KeyStr("J", 15U, kTypeValue), "val3"},
+ {KeyStr("J", 14U, kTypeSingleDeletion), ""},
+ });
+
+ SetLastSequence(24U);
+ constexpr int input_level = 0;
+ auto files = cfd_->current()->storage_info()->LevelFiles(input_level);
+ RunCompaction({files}, {input_level}, {expected_results}, {10U, 20U, 30U},
+ 20U);
+}
+
+TEST_F(CompactionJobTest, SingleDeleteZeroSeq) {
+ NewDB();
+
+ auto file1 = mock::MakeMockFile({
+ {KeyStr("A", 10U, kTypeSingleDeletion), ""},
+ {KeyStr("dummy", 5U, kTypeValue), "val2"},
+ });
+ AddMockFile(file1);
+
+ auto file2 = mock::MakeMockFile({
+ {KeyStr("A", 0U, kTypeValue), "val"},
+ });
+ AddMockFile(file2);
+
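+ // With no snapshots, the single deletion of "A" at seq 10 cancels out the
+ // put of "A" at seq 0, while "dummy" is kept with its sequence number zeroed.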
+ auto expected_results = mock::MakeMockFile({
+ {KeyStr("dummy", 0U, kTypeValue), "val2"},
+ });
+
+ SetLastSequence(22U);
+ constexpr int input_level = 0;
+ auto files = cfd_->current()->storage_info()->LevelFiles(input_level);
+ RunCompaction({files}, {input_level}, {expected_results}, {});
+}
+
+TEST_F(CompactionJobTest, MultiSingleDelete) {
+ // Tests three scenarios involving multiple single delete/put pairs:
+ //
+ // A: Put Snapshot SDel Put SDel -> Put Snapshot SDel
+ // B: Snapshot Put SDel Put SDel Snapshot -> Snapshot SDel Snapshot
+ // C: SDel Put SDel Snapshot Put -> Snapshot Put
+ // D: (Put) SDel Snapshot Put SDel -> (Put) SDel Snapshot SDel
+ // E: Put SDel Snapshot Put SDel -> Snapshot SDel
+ // F: Put SDel Put SDel Snapshot -> removed
+ // G: Snapshot SDel Put SDel Put -> Snapshot Put SDel
+ // H: (Put) Put SDel Put SDel Snapshot -> removed
+ // I: (Put) Snapshot Put SDel Put SDel -> SDel
+ // J: Put Put SDel Put SDel SDel Snapshot Put Put SDel SDel Put
+ // -> Snapshot Put
+ // K: SDel SDel Put SDel Put Put Snapshot SDel Put SDel SDel Put SDel
+ // -> Snapshot Put Snapshot SDel
+ // L: SDel Put SDel Put SDel Snapshot SDel Put SDel SDel Put SDel
+ // -> Snapshot SDel Put SDel
+ // M: (Put) SDel Put SDel Put SDel Snapshot Put SDel SDel Put SDel SDel
+ // -> SDel Snapshot Put SDel
+ NewDB();
+
+ auto file1 = mock::MakeMockFile({
+ {KeyStr("A", 14U, kTypeSingleDeletion), ""},
+ {KeyStr("A", 13U, kTypeValue), "val5"},
+ {KeyStr("A", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("B", 14U, kTypeSingleDeletion), ""},
+ {KeyStr("B", 13U, kTypeValue), "val2"},
+ {KeyStr("C", 14U, kTypeValue), "val3"},
+ {KeyStr("D", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("D", 11U, kTypeValue), "val4"},
+ {KeyStr("G", 15U, kTypeValue), "val"},
+ {KeyStr("G", 14U, kTypeSingleDeletion), ""},
+ {KeyStr("G", 13U, kTypeValue), "val"},
+ {KeyStr("I", 14U, kTypeSingleDeletion), ""},
+ {KeyStr("I", 13U, kTypeValue), "val"},
+ {KeyStr("J", 15U, kTypeValue), "val"},
+ {KeyStr("J", 14U, kTypeSingleDeletion), ""},
+ {KeyStr("J", 13U, kTypeSingleDeletion), ""},
+ {KeyStr("J", 12U, kTypeValue), "val"},
+ {KeyStr("J", 11U, kTypeValue), "val"},
+ {KeyStr("K", 16U, kTypeSingleDeletion), ""},
+ {KeyStr("K", 15U, kTypeValue), "val1"},
+ {KeyStr("K", 14U, kTypeSingleDeletion), ""},
+ {KeyStr("K", 13U, kTypeSingleDeletion), ""},
+ {KeyStr("K", 12U, kTypeValue), "val2"},
+ {KeyStr("K", 11U, kTypeSingleDeletion), ""},
+ {KeyStr("L", 16U, kTypeSingleDeletion), ""},
+ {KeyStr("L", 15U, kTypeValue), "val"},
+ {KeyStr("L", 14U, kTypeSingleDeletion), ""},
+ {KeyStr("L", 13U, kTypeSingleDeletion), ""},
+ {KeyStr("L", 12U, kTypeValue), "val"},
+ {KeyStr("L", 11U, kTypeSingleDeletion), ""},
+ {KeyStr("M", 16U, kTypeSingleDeletion), ""},
+ {KeyStr("M", 15U, kTypeSingleDeletion), ""},
+ {KeyStr("M", 14U, kTypeValue), "val"},
+ {KeyStr("M", 13U, kTypeSingleDeletion), ""},
+ {KeyStr("M", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("M", 11U, kTypeValue), "val"},
+ });
+ AddMockFile(file1);
+
+ auto file2 = mock::MakeMockFile({
+ {KeyStr("A", 10U, kTypeValue), "val"},
+ {KeyStr("B", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("B", 11U, kTypeValue), "val2"},
+ {KeyStr("C", 10U, kTypeSingleDeletion), ""},
+ {KeyStr("C", 9U, kTypeValue), "val6"},
+ {KeyStr("C", 8U, kTypeSingleDeletion), ""},
+ {KeyStr("D", 10U, kTypeSingleDeletion), ""},
+ {KeyStr("E", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("E", 11U, kTypeValue), "val"},
+ {KeyStr("E", 5U, kTypeSingleDeletion), ""},
+ {KeyStr("E", 4U, kTypeValue), "val"},
+ {KeyStr("F", 6U, kTypeSingleDeletion), ""},
+ {KeyStr("F", 5U, kTypeValue), "val"},
+ {KeyStr("F", 4U, kTypeSingleDeletion), ""},
+ {KeyStr("F", 3U, kTypeValue), "val"},
+ {KeyStr("G", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("H", 6U, kTypeSingleDeletion), ""},
+ {KeyStr("H", 5U, kTypeValue), "val"},
+ {KeyStr("H", 4U, kTypeSingleDeletion), ""},
+ {KeyStr("H", 3U, kTypeValue), "val"},
+ {KeyStr("I", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("I", 11U, kTypeValue), "val"},
+ {KeyStr("J", 6U, kTypeSingleDeletion), ""},
+ {KeyStr("J", 5U, kTypeSingleDeletion), ""},
+ {KeyStr("J", 4U, kTypeValue), "val"},
+ {KeyStr("J", 3U, kTypeSingleDeletion), ""},
+ {KeyStr("J", 2U, kTypeValue), "val"},
+ {KeyStr("K", 8U, kTypeValue), "val3"},
+ {KeyStr("K", 7U, kTypeValue), "val4"},
+ {KeyStr("K", 6U, kTypeSingleDeletion), ""},
+ {KeyStr("K", 5U, kTypeValue), "val5"},
+ {KeyStr("K", 2U, kTypeSingleDeletion), ""},
+ {KeyStr("K", 1U, kTypeSingleDeletion), ""},
+ {KeyStr("L", 5U, kTypeSingleDeletion), ""},
+ {KeyStr("L", 4U, kTypeValue), "val"},
+ {KeyStr("L", 3U, kTypeSingleDeletion), ""},
+ {KeyStr("L", 2U, kTypeValue), "val"},
+ {KeyStr("L", 1U, kTypeSingleDeletion), ""},
+ {KeyStr("M", 10U, kTypeSingleDeletion), ""},
+ {KeyStr("M", 7U, kTypeValue), "val"},
+ {KeyStr("M", 5U, kTypeSingleDeletion), ""},
+ {KeyStr("M", 4U, kTypeValue), "val"},
+ {KeyStr("M", 3U, kTypeSingleDeletion), ""},
+ });
+ AddMockFile(file2);
+
+ auto file3 = mock::MakeMockFile({
+ {KeyStr("D", 1U, kTypeValue), "val"},
+ {KeyStr("H", 1U, kTypeValue), "val"},
+ {KeyStr("I", 2U, kTypeValue), "val"},
+ });
+ AddMockFile(file3, 2);
+
+ auto file4 = mock::MakeMockFile({
+ {KeyStr("M", 1U, kTypeValue), "val"},
+ });
+ AddMockFile(file4, 2);
+
+ auto expected_results =
+ mock::MakeMockFile({{KeyStr("A", 14U, kTypeSingleDeletion), ""},
+ {KeyStr("A", 13U, kTypeValue), ""},
+ {KeyStr("A", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("A", 10U, kTypeValue), "val"},
+ {KeyStr("B", 14U, kTypeSingleDeletion), ""},
+ {KeyStr("B", 13U, kTypeValue), ""},
+ {KeyStr("C", 14U, kTypeValue), "val3"},
+ {KeyStr("D", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("D", 11U, kTypeValue), ""},
+ {KeyStr("D", 10U, kTypeSingleDeletion), ""},
+ {KeyStr("E", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("E", 11U, kTypeValue), ""},
+ {KeyStr("G", 15U, kTypeValue), "val"},
+ {KeyStr("G", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("I", 14U, kTypeSingleDeletion), ""},
+ {KeyStr("I", 13U, kTypeValue), ""},
+ {KeyStr("J", 15U, kTypeValue), "val"},
+ {KeyStr("K", 16U, kTypeSingleDeletion), ""},
+ {KeyStr("K", 15U, kTypeValue), ""},
+ {KeyStr("K", 11U, kTypeSingleDeletion), ""},
+ {KeyStr("K", 8U, kTypeValue), "val3"},
+ {KeyStr("L", 16U, kTypeSingleDeletion), ""},
+ {KeyStr("L", 15U, kTypeValue), ""},
+ {KeyStr("L", 11U, kTypeSingleDeletion), ""},
+ {KeyStr("M", 15U, kTypeSingleDeletion), ""},
+ {KeyStr("M", 14U, kTypeValue), ""},
+ {KeyStr("M", 3U, kTypeSingleDeletion), ""}});
+
+ SetLastSequence(22U);
+ constexpr int input_level = 0;
+ auto files = cfd_->current()->storage_info()->LevelFiles(input_level);
+ RunCompaction({files}, {input_level}, {expected_results}, {10U}, 10U);
+}
+
+// This test documents the behavior where a corrupt key follows a deletion or a
+// single deletion and the (single) deletion gets removed while the corrupt key
+// gets written out. TODO(noetzli): We probably want a better way to treat
+// corrupt keys.
+TEST_F(CompactionJobTest, DISABLED_CorruptionAfterDeletion) {
+ NewDB();
+
+ auto file1 =
+ mock::MakeMockFile({{test::KeyStr("A", 6U, kTypeValue), "val3"},
+ {test::KeyStr("a", 5U, kTypeDeletion), ""},
+ {test::KeyStr("a", 4U, kTypeValue, true), "val"}});
+ AddMockFile(file1);
+
+ auto file2 =
+ mock::MakeMockFile({{test::KeyStr("b", 3U, kTypeSingleDeletion), ""},
+ {test::KeyStr("b", 2U, kTypeValue, true), "val"},
+ {test::KeyStr("c", 1U, kTypeValue), "val2"}});
+ AddMockFile(file2);
+
+ auto expected_results =
+ mock::MakeMockFile({{test::KeyStr("A", 0U, kTypeValue), "val3"},
+ {test::KeyStr("a", 0U, kTypeValue, true), "val"},
+ {test::KeyStr("b", 0U, kTypeValue, true), "val"},
+ {test::KeyStr("c", 0U, kTypeValue), "val2"}});
+
+ SetLastSequence(6U);
+ constexpr int input_level = 0;
+ auto files = cfd_->current()->storage_info()->LevelFiles(input_level);
+ RunCompaction({files}, {input_level}, {expected_results});
+}
+
+TEST_F(CompactionJobTest, OldestBlobFileNumber) {
+ NewDB();
+
+ // Note: blob1 is inlined TTL, so it will not be considered for the purposes
+ // of identifying the oldest referenced blob file. Similarly, blob6 will be
+ // ignored because it has TTL and hence refers to a TTL blob file.
+ const stl_wrappers::KVMap::value_type blob1(
+ KeyStr("a", 1U, kTypeBlobIndex), BlobStrInlinedTTL("foo", 1234567890ULL));
+ const stl_wrappers::KVMap::value_type blob2(KeyStr("b", 2U, kTypeBlobIndex),
+ BlobStr(59, 123456, 999));
+ const stl_wrappers::KVMap::value_type blob3(KeyStr("c", 3U, kTypeBlobIndex),
+ BlobStr(138, 1000, 1 << 8));
+ auto file1 = mock::MakeMockFile({blob1, blob2, blob3});
+ AddMockFile(file1);
+
+ const stl_wrappers::KVMap::value_type blob4(KeyStr("d", 4U, kTypeBlobIndex),
+ BlobStr(199, 3 << 10, 1 << 20));
+ const stl_wrappers::KVMap::value_type blob5(KeyStr("e", 5U, kTypeBlobIndex),
+ BlobStr(19, 6789, 333));
+ const stl_wrappers::KVMap::value_type blob6(
+ KeyStr("f", 6U, kTypeBlobIndex),
+ BlobStrTTL(5, 2048, 1 << 7, 1234567890ULL));
+ auto file2 = mock::MakeMockFile({blob4, blob5, blob6});
+ AddMockFile(file2);
+
+ const stl_wrappers::KVMap::value_type expected_blob1(
+ KeyStr("a", 0U, kTypeBlobIndex), blob1.second);
+ const stl_wrappers::KVMap::value_type expected_blob2(
+ KeyStr("b", 0U, kTypeBlobIndex), blob2.second);
+ const stl_wrappers::KVMap::value_type expected_blob3(
+ KeyStr("c", 0U, kTypeBlobIndex), blob3.second);
+ const stl_wrappers::KVMap::value_type expected_blob4(
+ KeyStr("d", 0U, kTypeBlobIndex), blob4.second);
+ const stl_wrappers::KVMap::value_type expected_blob5(
+ KeyStr("e", 0U, kTypeBlobIndex), blob5.second);
+ const stl_wrappers::KVMap::value_type expected_blob6(
+ KeyStr("f", 0U, kTypeBlobIndex), blob6.second);
+ auto expected_results =
+ mock::MakeMockFile({expected_blob1, expected_blob2, expected_blob3,
+ expected_blob4, expected_blob5, expected_blob6});
+
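+ // Among the non-TTL blob references, blob5 points to blob file 19, which is
+ // the smallest (and therefore oldest) file number, so the compaction output
+ // is expected to record 19 as its oldest referenced blob file.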
+ SetLastSequence(6U);
+ constexpr int input_level = 0;
+ auto files = cfd_->current()->storage_info()->LevelFiles(input_level);
+ RunCompaction({files}, {input_level}, {expected_results},
+ std::vector<SequenceNumber>(), kMaxSequenceNumber,
+ /* output_level */ 1, /* verify */ true,
+ /* expected_oldest_blob_file_numbers */ {19});
+}
+
+TEST_F(CompactionJobTest, VerifyPenultimateLevelOutput) {
+ cf_options_.bottommost_temperature = Temperature::kCold;
+ SyncPoint::GetInstance()->SetCallBack(
+ "Compaction::SupportsPerKeyPlacement:Enabled", [&](void* arg) {
+ auto supports_per_key_placement = static_cast<bool*>(arg);
+ *supports_per_key_placement = true;
+ });
+
+ std::atomic_uint64_t latest_cold_seq = 0;
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "CompactionIterator::PrepareOutput.context", [&](void* arg) {
+ auto context = static_cast<PerKeyPlacementContext*>(arg);
+ context->output_to_penultimate_level =
+ context->seq_num > latest_cold_seq;
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ NewDB();
+
+ // Add files on different levels that may overlap
+ auto file0_1 = mock::MakeMockFile({{KeyStr("z", 12U, kTypeValue), "val"}});
+ AddMockFile(file0_1);
+
+ auto file1_1 = mock::MakeMockFile({{KeyStr("b", 10U, kTypeValue), "val"},
+ {KeyStr("f", 11U, kTypeValue), "val"}});
+ AddMockFile(file1_1, 1);
+ auto file1_2 = mock::MakeMockFile({{KeyStr("j", 12U, kTypeValue), "val"},
+ {KeyStr("k", 13U, kTypeValue), "val"}});
+ AddMockFile(file1_2, 1);
+ auto file1_3 = mock::MakeMockFile({{KeyStr("p", 14U, kTypeValue), "val"},
+ {KeyStr("u", 15U, kTypeValue), "val"}});
+ AddMockFile(file1_3, 1);
+
+ auto file2_1 = mock::MakeMockFile({{KeyStr("f", 8U, kTypeValue), "val"},
+ {KeyStr("h", 9U, kTypeValue), "val"}});
+ AddMockFile(file2_1, 2);
+ auto file2_2 = mock::MakeMockFile({{KeyStr("m", 6U, kTypeValue), "val"},
+ {KeyStr("p", 7U, kTypeValue), "val"}});
+ AddMockFile(file2_2, 2);
+
+ auto file3_1 = mock::MakeMockFile({{KeyStr("g", 2U, kTypeValue), "val"},
+ {KeyStr("k", 3U, kTypeValue), "val"}});
+ AddMockFile(file3_1, 3);
+ auto file3_2 = mock::MakeMockFile({{KeyStr("v", 4U, kTypeValue), "val"},
+ {KeyStr("x", 5U, kTypeValue), "val"}});
+ AddMockFile(file3_2, 3);
+
+ auto cfd = versions_->GetColumnFamilySet()->GetDefault();
+ const std::vector<int> input_levels = {0, 1, 2, 3};
+ auto files0 = cfd->current()->storage_info()->LevelFiles(input_levels[0]);
+ auto files1 = cfd->current()->storage_info()->LevelFiles(input_levels[1]);
+ auto files2 = cfd->current()->storage_info()->LevelFiles(input_levels[2]);
+ auto files3 = cfd->current()->storage_info()->LevelFiles(input_levels[3]);
+
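+ // The smallest key among the compaction inputs is "b", so only "a" is
+ // expected to fall outside the penultimate-level output range.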
+ RunLastLevelCompaction(
+ {files0, files1, files2, files3}, input_levels,
+ /*verify_func=*/[&](Compaction& comp) {
+ for (char c = 'a'; c <= 'z'; c++) {
+ std::string c_str;
+ c_str = c;
+ const Slice key(c_str);
+ if (c == 'a') {
+ ASSERT_FALSE(comp.WithinPenultimateLevelOutputRange(key));
+ } else {
+ ASSERT_TRUE(comp.WithinPenultimateLevelOutputRange(key));
+ }
+ }
+ });
+}
+
+TEST_F(CompactionJobTest, NoEnforceSingleDeleteContract) {
+ db_options_.enforce_single_del_contracts = false;
+ NewDB();
+
+ auto file =
+ mock::MakeMockFile({{KeyStr("a", 4U, kTypeSingleDeletion), ""},
+ {KeyStr("a", 3U, kTypeDeletion), "dontcare"}});
+ AddMockFile(file);
+ SetLastSequence(4U);
+
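+ // With enforce_single_del_contracts disabled, a single deletion that lands
+ // on a regular deletion is not treated as a contract violation; both
+ // tombstones are simply compacted away, leaving an empty output.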
+ auto expected_results = mock::MakeMockFile();
+ constexpr int input_level = 0;
+ auto files = cfd_->current()->storage_info()->LevelFiles(input_level);
+ RunCompaction({files}, {input_level}, {expected_results});
+}
+
+TEST_F(CompactionJobTest, InputSerialization) {
+ // Setup a random CompactionServiceInput
+ CompactionServiceInput input;
+ const int kStrMaxLen = 1000;
+ Random rnd(static_cast<uint32_t>(time(nullptr)));
+ Random64 rnd64(time(nullptr));
+ input.column_family.name = rnd.RandomString(rnd.Uniform(kStrMaxLen));
+ input.column_family.options.comparator = ReverseBytewiseComparator();
+ input.column_family.options.max_bytes_for_level_base =
+ rnd64.Uniform(UINT64_MAX);
+ input.column_family.options.disable_auto_compactions = rnd.OneIn(2);
+ input.column_family.options.compression = kZSTD;
+ input.column_family.options.compression_opts.level = 4;
+ input.db_options.max_background_flushes = 10;
+ input.db_options.paranoid_checks = rnd.OneIn(2);
+ input.db_options.statistics = CreateDBStatistics();
+ input.db_options.env = env_;
+ while (!rnd.OneIn(10)) {
+ input.snapshots.emplace_back(rnd64.Uniform(UINT64_MAX));
+ }
+ while (!rnd.OneIn(10)) {
+ input.input_files.emplace_back(rnd.RandomString(
+ rnd.Uniform(kStrMaxLen - 1) +
+ 1)); // input file name should have at least one character
+ }
+ input.output_level = 4;
+ input.has_begin = rnd.OneIn(2);
+ if (input.has_begin) {
+ input.begin = rnd.RandomBinaryString(rnd.Uniform(kStrMaxLen));
+ }
+ input.has_end = rnd.OneIn(2);
+ if (input.has_end) {
+ input.end = rnd.RandomBinaryString(rnd.Uniform(kStrMaxLen));
+ }
+
+ std::string output;
+ ASSERT_OK(input.Write(&output));
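+ // The serialized form appears to start with a fixed32 data version followed
+ // by semicolon-terminated name=value fields (e.g. "output_level=4;"); the
+ // manipulations below rely on that layout.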
+
+ // Test deserialization
+ CompactionServiceInput deserialized1;
+ ASSERT_OK(CompactionServiceInput::Read(output, &deserialized1));
+ ASSERT_TRUE(deserialized1.TEST_Equals(&input));
+
+ // Test mismatch
+ deserialized1.db_options.max_background_flushes += 10;
+ std::string mismatch;
+ ASSERT_FALSE(deserialized1.TEST_Equals(&input, &mismatch));
+ ASSERT_EQ(mismatch, "db_options.max_background_flushes");
+
+ // Test unknown field
+ CompactionServiceInput deserialized2;
+ output.clear();
+ ASSERT_OK(input.Write(&output));
+ output.append("new_field=123;");
+
+ ASSERT_OK(CompactionServiceInput::Read(output, &deserialized2));
+ ASSERT_TRUE(deserialized2.TEST_Equals(&input));
+
+ // Test missing field
+ CompactionServiceInput deserialized3;
+ deserialized3.output_level = 0;
+ std::string to_remove = "output_level=4;";
+ size_t pos = output.find(to_remove);
+ ASSERT_TRUE(pos != std::string::npos);
+ output.erase(pos, to_remove.length());
+ ASSERT_OK(CompactionServiceInput::Read(output, &deserialized3));
+ mismatch.clear();
+ ASSERT_FALSE(deserialized3.TEST_Equals(&input, &mismatch));
+ ASSERT_EQ(mismatch, "output_level");
+
+ // manually set the value back, should match the original structure
+ deserialized3.output_level = 4;
+ ASSERT_TRUE(deserialized3.TEST_Equals(&input));
+
+ // Test invalid version
+ output.clear();
+ ASSERT_OK(input.Write(&output));
+
+ uint32_t data_version = DecodeFixed32(output.data());
+ const size_t kDataVersionSize = sizeof(data_version);
+ ASSERT_EQ(data_version,
+ 1U); // Update once the default data version is changed
+ char buf[kDataVersionSize];
+ EncodeFixed32(buf, data_version + 10); // make sure it's not valid
+ output.replace(0, kDataVersionSize, buf, kDataVersionSize);
+ Status s = CompactionServiceInput::Read(output, &deserialized3);
+ ASSERT_TRUE(s.IsNotSupported());
+}
+
+TEST_F(CompactionJobTest, ResultSerialization) {
+ // Setup a random CompactionServiceResult
+ CompactionServiceResult result;
+ const int kStrMaxLen = 1000;
+ Random rnd(static_cast<uint32_t>(time(nullptr)));
+ Random64 rnd64(time(nullptr));
+ std::vector<Status> status_list = {
+ Status::OK(),
+ Status::InvalidArgument("invalid option"),
+ Status::Aborted("failed to run"),
+ Status::NotSupported("not supported option"),
+ };
+ result.status =
+ status_list.at(rnd.Uniform(static_cast<int>(status_list.size())));
+ while (!rnd.OneIn(10)) {
+ UniqueId64x2 id{rnd64.Uniform(UINT64_MAX), rnd64.Uniform(UINT64_MAX)};
+ result.output_files.emplace_back(
+ rnd.RandomString(rnd.Uniform(kStrMaxLen)), rnd64.Uniform(UINT64_MAX),
+ rnd64.Uniform(UINT64_MAX),
+ rnd.RandomBinaryString(rnd.Uniform(kStrMaxLen)),
+ rnd.RandomBinaryString(rnd.Uniform(kStrMaxLen)),
+ rnd64.Uniform(UINT64_MAX), rnd64.Uniform(UINT64_MAX),
+ rnd64.Uniform(UINT64_MAX), rnd.OneIn(2), id);
+ }
+ result.output_level = rnd.Uniform(10);
+ result.output_path = rnd.RandomString(rnd.Uniform(kStrMaxLen));
+ result.num_output_records = rnd64.Uniform(UINT64_MAX);
+ result.total_bytes = rnd64.Uniform(UINT64_MAX);
+ result.bytes_read = 123;
+ result.bytes_written = rnd64.Uniform(UINT64_MAX);
+ result.stats.elapsed_micros = rnd64.Uniform(UINT64_MAX);
+ result.stats.num_output_files = rnd.Uniform(1000);
+ result.stats.is_full_compaction = rnd.OneIn(2);
+ result.stats.num_single_del_mismatch = rnd64.Uniform(UINT64_MAX);
+ result.stats.num_input_files = 9;
+
+ std::string output;
+ ASSERT_OK(result.Write(&output));
+
+ // Test deserialization
+ CompactionServiceResult deserialized1;
+ ASSERT_OK(CompactionServiceResult::Read(output, &deserialized1));
+ ASSERT_TRUE(deserialized1.TEST_Equals(&result));
+
+ // Test mismatch
+ deserialized1.stats.num_input_files += 10;
+ std::string mismatch;
+ ASSERT_FALSE(deserialized1.TEST_Equals(&result, &mismatch));
+ ASSERT_EQ(mismatch, "stats.num_input_files");
+
+ // Test unique id mismatch
+ if (!result.output_files.empty()) {
+ CompactionServiceResult deserialized_tmp;
+ ASSERT_OK(CompactionServiceResult::Read(output, &deserialized_tmp));
+ deserialized_tmp.output_files[0].unique_id[0] += 1;
+ ASSERT_FALSE(deserialized_tmp.TEST_Equals(&result, &mismatch));
+ ASSERT_EQ(mismatch, "output_files.unique_id");
+ deserialized_tmp.status.PermitUncheckedError();
+ }
+
+ // Test unknown field
+ CompactionServiceResult deserialized2;
+ output.clear();
+ ASSERT_OK(result.Write(&output));
+ output.append("new_field=123;");
+
+ ASSERT_OK(CompactionServiceResult::Read(output, &deserialized2));
+ ASSERT_TRUE(deserialized2.TEST_Equals(&result));
+
+ // Test missing field
+ CompactionServiceResult deserialized3;
+ deserialized3.bytes_read = 0;
+ std::string to_remove = "bytes_read=123;";
+ size_t pos = output.find(to_remove);
+ ASSERT_TRUE(pos != std::string::npos);
+ output.erase(pos, to_remove.length());
+ ASSERT_OK(CompactionServiceResult::Read(output, &deserialized3));
+ mismatch.clear();
+ ASSERT_FALSE(deserialized3.TEST_Equals(&result, &mismatch));
+ ASSERT_EQ(mismatch, "bytes_read");
+
+ deserialized3.bytes_read = 123;
+ ASSERT_TRUE(deserialized3.TEST_Equals(&result));
+
+ // Test invalid version
+ output.clear();
+ ASSERT_OK(result.Write(&output));
+
+ uint32_t data_version = DecodeFixed32(output.data());
+ const size_t kDataVersionSize = sizeof(data_version);
+ ASSERT_EQ(data_version,
+ 1U); // Update once the default data version is changed
+ char buf[kDataVersionSize];
+ EncodeFixed32(buf, data_version + 10); // make sure it's not valid
+ output.replace(0, kDataVersionSize, buf, kDataVersionSize);
+ Status s = CompactionServiceResult::Read(output, &deserialized3);
+ ASSERT_TRUE(s.IsNotSupported());
+ for (const auto& item : status_list) {
+ item.PermitUncheckedError();
+ }
+}
+
+class CompactionJobDynamicFileSizeTest
+ : public CompactionJobTestBase,
+ public ::testing::WithParamInterface<bool> {
+ public:
+ CompactionJobDynamicFileSizeTest()
+ : CompactionJobTestBase(
+ test::PerThreadDBPath("compaction_job_dynamic_file_size_test"),
+ BytewiseComparator(), [](uint64_t /*ts*/) { return ""; },
+ /*test_io_priority=*/false, TableTypeForTest::kMockTable) {}
+};
+
+TEST_P(CompactionJobDynamicFileSizeTest, CutForMaxCompactionBytes) {
+ // dynamic_file_size option should have no impact on cutting for max
+ // compaction bytes.
+ bool enable_dynamic_file_size = GetParam();
+ cf_options_.level_compaction_dynamic_file_size = enable_dynamic_file_size;
+
+ NewDB();
+ mutable_cf_options_.target_file_size_base = 80;
+ mutable_cf_options_.max_compaction_bytes = 21;
+
+ auto file1 = mock::MakeMockFile({
+ {KeyStr("c", 5U, kTypeValue), "val2"},
+ {KeyStr("n", 6U, kTypeValue), "val3"},
+ });
+ AddMockFile(file1);
+
+ auto file2 = mock::MakeMockFile({{KeyStr("h", 3U, kTypeValue), "val"},
+ {KeyStr("j", 4U, kTypeValue), "val"}});
+ AddMockFile(file2, 1);
+
+ // Create three L2 files, each of size 10.
+ // max_compaction_bytes 21 means the compaction output in L1 will
+ // be cut into at least two files.
+ auto file3 = mock::MakeMockFile({{KeyStr("b", 1U, kTypeValue), "val"},
+ {KeyStr("c", 1U, kTypeValue), "val"},
+ {KeyStr("c1", 1U, kTypeValue), "val"},
+ {KeyStr("c2", 1U, kTypeValue), "val"},
+ {KeyStr("c3", 1U, kTypeValue), "val"},
+ {KeyStr("c4", 1U, kTypeValue), "val"},
+ {KeyStr("d", 1U, kTypeValue), "val"},
+ {KeyStr("e", 2U, kTypeValue), "val"}});
+ AddMockFile(file3, 2);
+
+ auto file4 = mock::MakeMockFile({{KeyStr("h", 1U, kTypeValue), "val"},
+ {KeyStr("i", 1U, kTypeValue), "val"},
+ {KeyStr("i1", 1U, kTypeValue), "val"},
+ {KeyStr("i2", 1U, kTypeValue), "val"},
+ {KeyStr("i3", 1U, kTypeValue), "val"},
+ {KeyStr("i4", 1U, kTypeValue), "val"},
+ {KeyStr("j", 1U, kTypeValue), "val"},
+ {KeyStr("k", 2U, kTypeValue), "val"}});
+ AddMockFile(file4, 2);
+
+ auto file5 = mock::MakeMockFile({{KeyStr("l", 1U, kTypeValue), "val"},
+ {KeyStr("m", 1U, kTypeValue), "val"},
+ {KeyStr("m1", 1U, kTypeValue), "val"},
+ {KeyStr("m2", 1U, kTypeValue), "val"},
+ {KeyStr("m3", 1U, kTypeValue), "val"},
+ {KeyStr("m4", 1U, kTypeValue), "val"},
+ {KeyStr("n", 1U, kTypeValue), "val"},
+ {KeyStr("o", 2U, kTypeValue), "val"}});
+ AddMockFile(file5, 2);
+
+ // The expected output should be:
+ // L1: [c, h, j] [n]
+ // L2: [b ... e] [h ... k] [l ... o]
+ // It's better to have "j" in the first file, because it overlaps with the
+ // second file on L2 anyway.
+ // (Note: before this PR, the output was cut at "h" because the internal
+ // comparator considers L1 "h" with seqno 3 smaller than L2 "h" with seqno 1,
+ // even though the compaction picker treats them as overlapping.)
+
+ auto expected_file1 =
+ mock::MakeMockFile({{KeyStr("c", 5U, kTypeValue), "val2"},
+ {KeyStr("h", 3U, kTypeValue), "val"},
+ {KeyStr("j", 4U, kTypeValue), "val"}});
+ auto expected_file2 =
+ mock::MakeMockFile({{KeyStr("n", 6U, kTypeValue), "val3"}});
+
+ SetLastSequence(6U);
+
+ const std::vector<int> input_levels = {0, 1};
+ auto lvl0_files = cfd_->current()->storage_info()->LevelFiles(0);
+ auto lvl1_files = cfd_->current()->storage_info()->LevelFiles(1);
+
+ RunCompaction({lvl0_files, lvl1_files}, input_levels,
+ {expected_file1, expected_file2});
+}
+
+TEST_P(CompactionJobDynamicFileSizeTest, CutToSkipGrandparentFile) {
+ bool enable_dynamic_file_size = GetParam();
+ cf_options_.level_compaction_dynamic_file_size = enable_dynamic_file_size;
+
+ NewDB();
+ // Make sure the grandparent level file size (10) qualifies for skipping.
+ // Currently, it has to be > 1/8 of target file size.
+ mutable_cf_options_.target_file_size_base = 70;
+
+ auto file1 = mock::MakeMockFile({
+ {KeyStr("a", 5U, kTypeValue), "val2"},
+ {KeyStr("z", 6U, kTypeValue), "val3"},
+ });
+ AddMockFile(file1);
+
+ auto file2 = mock::MakeMockFile({{KeyStr("c", 3U, kTypeValue), "val"},
+ {KeyStr("x", 4U, kTypeValue), "val"}});
+ AddMockFile(file2, 1);
+
+ auto file3 = mock::MakeMockFile({{KeyStr("b", 1U, kTypeValue), "val"},
+ {KeyStr("d", 2U, kTypeValue), "val"}});
+ AddMockFile(file3, 2);
+
+ auto file4 = mock::MakeMockFile({{KeyStr("h", 1U, kTypeValue), "val"},
+ {KeyStr("i", 2U, kTypeValue), "val"}});
+ AddMockFile(file4, 2);
+
+ auto file5 = mock::MakeMockFile({{KeyStr("v", 1U, kTypeValue), "val"},
+ {KeyStr("y", 2U, kTypeValue), "val"}});
+ AddMockFile(file5, 2);
+
+ auto expected_file1 =
+ mock::MakeMockFile({{KeyStr("a", 5U, kTypeValue), "val2"},
+ {KeyStr("c", 3U, kTypeValue), "val"}});
+ auto expected_file2 =
+ mock::MakeMockFile({{KeyStr("x", 4U, kTypeValue), "val"},
+ {KeyStr("z", 6U, kTypeValue), "val3"}});
+
+ auto expected_file_disable_dynamic_file_size =
+ mock::MakeMockFile({{KeyStr("a", 5U, kTypeValue), "val2"},
+ {KeyStr("c", 3U, kTypeValue), "val"},
+ {KeyStr("x", 4U, kTypeValue), "val"},
+ {KeyStr("z", 6U, kTypeValue), "val3"}});
+
+ SetLastSequence(6U);
+ const std::vector<int> input_levels = {0, 1};
+ auto lvl0_files = cfd_->current()->storage_info()->LevelFiles(0);
+ auto lvl1_files = cfd_->current()->storage_info()->LevelFiles(1);
+ if (enable_dynamic_file_size) {
+ RunCompaction({lvl0_files, lvl1_files}, input_levels,
+ {expected_file1, expected_file2});
+ } else {
+ RunCompaction({lvl0_files, lvl1_files}, input_levels,
+ {expected_file_disable_dynamic_file_size});
+ }
+}
+
+TEST_P(CompactionJobDynamicFileSizeTest, CutToAlignGrandparentBoundary) {
+ bool enable_dynamic_file_size = GetParam();
+ cf_options_.level_compaction_dynamic_file_size = enable_dynamic_file_size;
+ NewDB();
+
+ // MockTable has 1 byte per entry by default and each file is 10 bytes.
+ // When the file size is smaller than 100, the compaction won't cut the file
+ // early to align it with its grandparent boundary.
+ const size_t kKeyValueSize = 10000;
+ mock_table_factory_->SetKeyValueSize(kKeyValueSize);
+
+ mutable_cf_options_.target_file_size_base = 10 * kKeyValueSize;
+
+ mock::KVVector file1;
+ char ch = 'd';
+ // Add value from d -> o
+ for (char i = 0; i < 12; i++) {
+ file1.emplace_back(KeyStr(std::string(1, ch + i), i + 10, kTypeValue),
+ "val" + std::to_string(i));
+ }
+
+ AddMockFile(file1);
+
+ auto file2 = mock::MakeMockFile({{KeyStr("e", 3U, kTypeValue), "val"},
+ {KeyStr("s", 4U, kTypeValue), "val"}});
+ AddMockFile(file2, 1);
+
+ // the 1st grandparent file should be skipped
+ auto file3 = mock::MakeMockFile({{KeyStr("a", 1U, kTypeValue), "val"},
+ {KeyStr("b", 2U, kTypeValue), "val"}});
+ AddMockFile(file3, 2);
+
+ auto file4 = mock::MakeMockFile({{KeyStr("c", 1U, kTypeValue), "val"},
+ {KeyStr("e", 2U, kTypeValue), "val"}});
+ AddMockFile(file4, 2);
+
+ auto file5 = mock::MakeMockFile({{KeyStr("h", 1U, kTypeValue), "val"},
+ {KeyStr("j", 2U, kTypeValue), "val"}});
+ AddMockFile(file5, 2);
+
+ auto file6 = mock::MakeMockFile({{KeyStr("k", 1U, kTypeValue), "val"},
+ {KeyStr("n", 2U, kTypeValue), "val"}});
+ AddMockFile(file6, 2);
+
+ auto file7 = mock::MakeMockFile({{KeyStr("q", 1U, kTypeValue), "val"},
+ {KeyStr("t", 2U, kTypeValue), "val"}});
+ AddMockFile(file7, 2);
+
+ // The expected outputs are:
+ // L1: [d,e,f,g,h,i,j] [k,l,m,n,o,s]
+ // L2: [a, b] [c, e] [h, j] [k, n] [q, t]
+ // The first output is cut early at "j" so that it aligns with the L2 files.
+ // If dynamic_file_size is not enabled, the output is cut based on
+ // target_file_size instead.
+ mock::KVVector expected_file1;
+ for (char i = 0; i < 7; i++) {
+ expected_file1.emplace_back(
+ KeyStr(std::string(1, ch + i), i + 10, kTypeValue),
+ "val" + std::to_string(i));
+ }
+
+ mock::KVVector expected_file2;
+ for (char i = 7; i < 12; i++) {
+ expected_file2.emplace_back(
+ KeyStr(std::string(1, ch + i), i + 10, kTypeValue),
+ "val" + std::to_string(i));
+ }
+ expected_file2.emplace_back(KeyStr("s", 4U, kTypeValue), "val");
+
+ mock::KVVector expected_file_disable_dynamic_file_size1;
+ for (char i = 0; i < 10; i++) {
+ expected_file_disable_dynamic_file_size1.emplace_back(
+ KeyStr(std::string(1, ch + i), i + 10, kTypeValue),
+ "val" + std::to_string(i));
+ }
+
+ mock::KVVector expected_file_disable_dynamic_file_size2;
+ for (char i = 10; i < 12; i++) {
+ expected_file_disable_dynamic_file_size2.emplace_back(
+ KeyStr(std::string(1, ch + i), i + 10, kTypeValue),
+ "val" + std::to_string(i));
+ }
+
+ expected_file_disable_dynamic_file_size2.emplace_back(
+ KeyStr("s", 4U, kTypeValue), "val");
+
+ SetLastSequence(22U);
+ const std::vector<int> input_levels = {0, 1};
+ auto lvl0_files = cfd_->current()->storage_info()->LevelFiles(0);
+ auto lvl1_files = cfd_->current()->storage_info()->LevelFiles(1);
+ if (enable_dynamic_file_size) {
+ RunCompaction({lvl0_files, lvl1_files}, input_levels,
+ {expected_file1, expected_file2});
+ } else {
+ RunCompaction({lvl0_files, lvl1_files}, input_levels,
+ {expected_file_disable_dynamic_file_size1,
+ expected_file_disable_dynamic_file_size2});
+ }
+}
+
+TEST_P(CompactionJobDynamicFileSizeTest, CutToAlignGrandparentBoundarySameKey) {
+ bool enable_dynamic_file_size = GetParam();
+ cf_options_.level_compaction_dynamic_file_size = enable_dynamic_file_size;
+ NewDB();
+
+ // MockTable has 1 byte per entry by default and each file is 10 bytes.
+ // When the file size is smaller than 100, the compaction won't cut the file
+ // early to align it with its grandparent boundary.
+ const size_t kKeyValueSize = 10000;
+ mock_table_factory_->SetKeyValueSize(kKeyValueSize);
+
+ mutable_cf_options_.target_file_size_base = 10 * kKeyValueSize;
+
+ mock::KVVector file1;
+ for (int i = 0; i < 7; i++) {
+ file1.emplace_back(KeyStr("a", 100 - i, kTypeValue),
+ "val" + std::to_string(100 - i));
+ }
+ file1.emplace_back(KeyStr("b", 90, kTypeValue), "valb");
+
+ AddMockFile(file1);
+
+ auto file2 = mock::MakeMockFile({{KeyStr("a", 93U, kTypeValue), "val93"},
+ {KeyStr("b", 90U, kTypeValue), "valb"}});
+ AddMockFile(file2, 1);
+
+ auto file3 = mock::MakeMockFile({{KeyStr("a", 89U, kTypeValue), "val"},
+ {KeyStr("a", 88U, kTypeValue), "val"}});
+ AddMockFile(file3, 2);
+
+ auto file4 = mock::MakeMockFile({{KeyStr("a", 87U, kTypeValue), "val"},
+ {KeyStr("a", 86U, kTypeValue), "val"}});
+ AddMockFile(file4, 2);
+
+ auto file5 = mock::MakeMockFile({{KeyStr("b", 85U, kTypeValue), "val"},
+ {KeyStr("b", 84U, kTypeValue), "val"}});
+ AddMockFile(file5, 2);
+
+ mock::KVVector expected_file1;
+ mock::KVVector expected_file_disable_dynamic_file_size;
+
+ for (int i = 0; i < 8; i++) {
+ expected_file1.emplace_back(KeyStr("a", 100 - i, kTypeValue),
+ "val" + std::to_string(100 - i));
+ expected_file_disable_dynamic_file_size.emplace_back(
+ KeyStr("a", 100 - i, kTypeValue), "val" + std::to_string(100 - i));
+ }
+
+ // Make sure `b` is cut into a separate file (i.e. the cut does not rely on
+ // the internal comparator, which would consider "b:90" (seqno 90) here
+ // smaller than "b:85" on L2).
+ auto expected_file2 =
+ mock::MakeMockFile({{KeyStr("b", 90U, kTypeValue), "valb"}});
+
+ expected_file_disable_dynamic_file_size.emplace_back(
+ KeyStr("b", 90U, kTypeValue), "valb");
+
+ SetLastSequence(122U);
+ const std::vector<int> input_levels = {0, 1};
+ auto lvl0_files = cfd_->current()->storage_info()->LevelFiles(0);
+ auto lvl1_files = cfd_->current()->storage_info()->LevelFiles(1);
+
+ // Just keep all the history
+ std::vector<SequenceNumber> snapshots;
+ for (int i = 80; i <= 100; i++) {
+ snapshots.emplace_back(i);
+ }
+ if (enable_dynamic_file_size) {
+ RunCompaction({lvl0_files, lvl1_files}, input_levels,
+ {expected_file1, expected_file2}, snapshots);
+ } else {
+ RunCompaction({lvl0_files, lvl1_files}, input_levels,
+ {expected_file_disable_dynamic_file_size}, snapshots);
+ }
+}
+
+TEST_P(CompactionJobDynamicFileSizeTest, CutForMaxCompactionBytesSameKey) {
+ // dynamic_file_size option should have no impact on cutting for max
+ // compaction bytes.
+ bool enable_dynamic_file_size = GetParam();
+ cf_options_.level_compaction_dynamic_file_size = enable_dynamic_file_size;
+
+ NewDB();
+ mutable_cf_options_.target_file_size_base = 80;
+ mutable_cf_options_.max_compaction_bytes = 20;
+
+ auto file1 = mock::MakeMockFile({{KeyStr("a", 104U, kTypeValue), "val1"},
+ {KeyStr("b", 103U, kTypeValue), "val"}});
+ AddMockFile(file1);
+
+ auto file2 = mock::MakeMockFile({{KeyStr("a", 102U, kTypeValue), "val2"},
+ {KeyStr("c", 101U, kTypeValue), "val"}});
+ AddMockFile(file2, 1);
+
+ for (int i = 0; i < 10; i++) {
+ auto file =
+ mock::MakeMockFile({{KeyStr("a", 100 - (i * 2), kTypeValue), "val"},
+ {KeyStr("a", 99 - (i * 2), kTypeValue), "val"}});
+ AddMockFile(file, 2);
+ }
+
+ for (int i = 0; i < 10; i++) {
+ auto file =
+ mock::MakeMockFile({{KeyStr("b", 80 - (i * 2), kTypeValue), "val"},
+ {KeyStr("b", 79 - (i * 2), kTypeValue), "val"}});
+ AddMockFile(file, 2);
+ }
+
+ auto file5 = mock::MakeMockFile({{KeyStr("c", 60U, kTypeValue), "valc"},
+ {KeyStr("c", 59U, kTypeValue), "valc"}});
+
+ // "a" overlaps 10 grandparent files (each of size 10), which far exceeds
+ // `max_compaction_bytes`, but make sure the two "a" entries are not
+ // separated, as splitting them won't help reduce the compaction size.
+ // Also make sure "b" and "c" are cut into separate files.
+ mock::KVVector expected_file1 =
+ mock::MakeMockFile({{KeyStr("a", 104U, kTypeValue), "val1"},
+ {KeyStr("a", 102U, kTypeValue), "val2"}});
+ mock::KVVector expected_file2 =
+ mock::MakeMockFile({{KeyStr("b", 103U, kTypeValue), "val"}});
+ mock::KVVector expected_file3 =
+ mock::MakeMockFile({{KeyStr("c", 101U, kTypeValue), "val"}});
+
+ SetLastSequence(122U);
+ const std::vector<int> input_levels = {0, 1};
+ auto lvl0_files = cfd_->current()->storage_info()->LevelFiles(0);
+ auto lvl1_files = cfd_->current()->storage_info()->LevelFiles(1);
+
+ // Just keep all the history
+ std::vector<SequenceNumber> snapshots;
+ for (int i = 80; i <= 105; i++) {
+ snapshots.emplace_back(i);
+ }
+ RunCompaction({lvl0_files, lvl1_files}, input_levels,
+ {expected_file1, expected_file2, expected_file3}, snapshots);
+}
+
+INSTANTIATE_TEST_CASE_P(CompactionJobDynamicFileSizeTest,
+ CompactionJobDynamicFileSizeTest, testing::Bool());
+
+class CompactionJobTimestampTest : public CompactionJobTestBase {
+ public:
+ CompactionJobTimestampTest()
+ : CompactionJobTestBase(test::PerThreadDBPath("compaction_job_ts_test"),
+ test::BytewiseComparatorWithU64TsWrapper(),
+ test::EncodeInt, /*test_io_priority=*/false,
+ TableTypeForTest::kMockTable) {}
+};
+
+TEST_F(CompactionJobTimestampTest, GCDisabled) {
+ NewDB();
+
+ auto file1 =
+ mock::MakeMockFile({{KeyStr("a", 10, ValueType::kTypeValue, 100), "a10"},
+ {KeyStr("a", 9, ValueType::kTypeValue, 99), "a9"},
+ {KeyStr("b", 8, ValueType::kTypeValue, 98), "b8"},
+ {KeyStr("d", 7, ValueType::kTypeValue, 97), "d7"}});
+
+ AddMockFile(file1);
+
+ auto file2 = mock::MakeMockFile(
+ {{KeyStr("b", 6, ValueType::kTypeDeletionWithTimestamp, 96), ""},
+ {KeyStr("c", 5, ValueType::kTypeDeletionWithTimestamp, 95), ""},
+ {KeyStr("c", 4, ValueType::kTypeValue, 94), "c5"},
+ {KeyStr("d", 3, ValueType::kTypeSingleDeletion, 93), ""}});
+ AddMockFile(file2);
+
+ SetLastSequence(10);
+
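+ // With no full_history_ts_low set, garbage collection of old timestamped
+ // versions is effectively disabled, so every version of every key, including
+ // the tombstones, is expected to survive the compaction.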
+ auto expected_results = mock::MakeMockFile(
+ {{KeyStr("a", 10, ValueType::kTypeValue, 100), "a10"},
+ {KeyStr("a", 9, ValueType::kTypeValue, 99), "a9"},
+ {KeyStr("b", 8, ValueType::kTypeValue, 98), "b8"},
+ {KeyStr("b", 6, ValueType::kTypeDeletionWithTimestamp, 96), ""},
+ {KeyStr("c", 5, ValueType::kTypeDeletionWithTimestamp, 95), ""},
+ {KeyStr("c", 4, ValueType::kTypeValue, 94), "c5"},
+ {KeyStr("d", 7, ValueType::kTypeValue, 97), "d7"},
+ {KeyStr("d", 3, ValueType::kTypeSingleDeletion, 93), ""}});
+ constexpr int input_level = 0;
+ const auto& files = cfd_->current()->storage_info()->LevelFiles(input_level);
+ RunCompaction({files}, {input_level}, {expected_results});
+}
+
+TEST_F(CompactionJobTimestampTest, NoKeyExpired) {
+ NewDB();
+
+ auto file1 =
+ mock::MakeMockFile({{KeyStr("a", 6, ValueType::kTypeValue, 100), "a6"},
+ {KeyStr("b", 7, ValueType::kTypeValue, 101), "b7"},
+ {KeyStr("c", 5, ValueType::kTypeValue, 99), "c5"}});
+ AddMockFile(file1);
+
+ auto file2 =
+ mock::MakeMockFile({{KeyStr("a", 4, ValueType::kTypeValue, 98), "a4"},
+ {KeyStr("c", 3, ValueType::kTypeValue, 97), "c3"}});
+ AddMockFile(file2);
+
+ SetLastSequence(101);
+
+ auto expected_results =
+ mock::MakeMockFile({{KeyStr("a", 6, ValueType::kTypeValue, 100), "a6"},
+ {KeyStr("a", 4, ValueType::kTypeValue, 98), "a4"},
+ {KeyStr("b", 7, ValueType::kTypeValue, 101), "b7"},
+ {KeyStr("c", 5, ValueType::kTypeValue, 99), "c5"},
+ {KeyStr("c", 3, ValueType::kTypeValue, 97), "c3"}});
+ constexpr int input_level = 0;
+ const auto& files = cfd_->current()->storage_info()->LevelFiles(input_level);
+
+ full_history_ts_low_ = encode_u64_ts_(0);
+ RunCompaction({files}, {input_level}, {expected_results});
+}
+
+TEST_F(CompactionJobTimestampTest, AllKeysExpired) {
+ NewDB();
+
+ auto file1 = mock::MakeMockFile(
+ {{KeyStr("a", 5, ValueType::kTypeDeletionWithTimestamp, 100), ""},
+ {KeyStr("b", 6, ValueType::kTypeSingleDeletion, 99), ""},
+ {KeyStr("c", 7, ValueType::kTypeValue, 98), "c7"}});
+ AddMockFile(file1);
+
+ auto file2 = mock::MakeMockFile(
+ {{KeyStr("a", 4, ValueType::kTypeValue, 97), "a4"},
+ {KeyStr("b", 3, ValueType::kTypeValue, 96), "b3"},
+ {KeyStr("c", 2, ValueType::kTypeDeletionWithTimestamp, 95), ""},
+ {KeyStr("c", 1, ValueType::kTypeValue, 94), "c1"}});
+ AddMockFile(file2);
+
+ SetLastSequence(7);
+
+ auto expected_results =
+ mock::MakeMockFile({{KeyStr("c", 0, ValueType::kTypeValue, 0), "c7"}});
+ constexpr int input_level = 0;
+ const auto& files = cfd_->current()->storage_info()->LevelFiles(input_level);
+
+ full_history_ts_low_ = encode_u64_ts_(std::numeric_limits<uint64_t>::max());
+ RunCompaction({files}, {input_level}, {expected_results});
+}
+
+TEST_F(CompactionJobTimestampTest, SomeKeysExpired) {
+ NewDB();
+
+ auto file1 =
+ mock::MakeMockFile({{KeyStr("a", 5, ValueType::kTypeValue, 50), "a5"},
+ {KeyStr("b", 6, ValueType::kTypeValue, 49), "b6"}});
+ AddMockFile(file1);
+
+ auto file2 = mock::MakeMockFile(
+ {{KeyStr("a", 3, ValueType::kTypeValue, 48), "a3"},
+ {KeyStr("a", 2, ValueType::kTypeValue, 46), "a2"},
+ {KeyStr("b", 4, ValueType::kTypeDeletionWithTimestamp, 47), ""}});
+ AddMockFile(file2);
+
+ SetLastSequence(6);
+
+ auto expected_results =
+ mock::MakeMockFile({{KeyStr("a", 5, ValueType::kTypeValue, 50), "a5"},
+ {KeyStr("a", 0, ValueType::kTypeValue, 0), "a3"},
+ {KeyStr("b", 6, ValueType::kTypeValue, 49), "b6"}});
+ constexpr int input_level = 0;
+ const auto& files = cfd_->current()->storage_info()->LevelFiles(input_level);
+
+ full_history_ts_low_ = encode_u64_ts_(49);
+ RunCompaction({files}, {input_level}, {expected_results});
+}
+
+class CompactionJobTimestampTestWithBbTable : public CompactionJobTestBase {
+ public:
+ // Block-based table is needed if we want to test subcompaction partitioning
+ // with anchors.
+ explicit CompactionJobTimestampTestWithBbTable()
+ : CompactionJobTestBase(
+ test::PerThreadDBPath("compaction_job_ts_bbt_test"),
+ test::BytewiseComparatorWithU64TsWrapper(), test::EncodeInt,
+ /*test_io_priority=*/false, TableTypeForTest::kBlockBasedTable) {}
+};
+
+TEST_F(CompactionJobTimestampTestWithBbTable, SubcompactionAnchorL1) {
+ cf_options_.target_file_size_base = 20;
+ mutable_cf_options_.target_file_size_base = 20;
+ NewDB();
+
+ const std::vector<std::string> keys = {
+ KeyStr("a", 20, ValueType::kTypeValue, 200),
+ KeyStr("b", 21, ValueType::kTypeValue, 210),
+ KeyStr("b", 20, ValueType::kTypeValue, 200),
+ KeyStr("b", 18, ValueType::kTypeValue, 180),
+ KeyStr("c", 17, ValueType::kTypeValue, 170),
+ KeyStr("c", 16, ValueType::kTypeValue, 160),
+ KeyStr("c", 15, ValueType::kTypeValue, 150)};
+ const std::vector<std::string> values = {"a20", "b21", "b20", "b18",
+ "c17", "c16", "c15"};
+
+ constexpr int input_level = 1;
+
+ auto file1 = mock::MakeMockFile(
+ {{keys[0], values[0]}, {keys[1], values[1]}, {keys[2], values[2]}});
+ AddMockFile(file1, input_level);
+
+ auto file2 = mock::MakeMockFile(
+ {{keys[3], values[3]}, {keys[4], values[4]}, {keys[5], values[5]}});
+ AddMockFile(file2, input_level);
+
+ auto file3 = mock::MakeMockFile({{keys[6], values[6]}});
+ AddMockFile(file3, input_level);
+
+ SetLastSequence(20);
+
+ auto output1 = mock::MakeMockFile({{keys[0], values[0]}});
+ auto output2 = mock::MakeMockFile(
+ {{keys[1], values[1]}, {keys[2], values[2]}, {keys[3], values[3]}});
+ auto output3 = mock::MakeMockFile(
+ {{keys[4], values[4]}, {keys[5], values[5]}, {keys[6], values[6]}});
+
+ auto expected_results =
+ std::vector<mock::KVVector>{output1, output2, output3};
+ const auto& files = cfd_->current()->storage_info()->LevelFiles(input_level);
+
+ constexpr int output_level = 2;
+ constexpr int max_subcompactions = 4;
+ RunCompaction({files}, {input_level}, expected_results, /*snapshots=*/{},
+ /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber,
+ output_level, /*verify=*/true, {kInvalidBlobFileNumber},
+ /*check_get_priority=*/false, Env::IO_TOTAL, Env::IO_TOTAL,
+ max_subcompactions);
+}
+
+TEST_F(CompactionJobTimestampTestWithBbTable, SubcompactionL0) {
+ cf_options_.target_file_size_base = 20;
+ mutable_cf_options_.target_file_size_base = 20;
+ NewDB();
+
+ const std::vector<std::string> keys = {
+ KeyStr("a", 20, ValueType::kTypeValue, 200),
+ KeyStr("b", 20, ValueType::kTypeValue, 200),
+ KeyStr("b", 19, ValueType::kTypeValue, 190),
+ KeyStr("b", 18, ValueType::kTypeValue, 180),
+ KeyStr("c", 17, ValueType::kTypeValue, 170),
+ KeyStr("c", 16, ValueType::kTypeValue, 160),
+ KeyStr("c", 15, ValueType::kTypeValue, 150)};
+ const std::vector<std::string> values = {"a20", "b20", "b19", "b18",
+ "c17", "c16", "c15"};
+
+ constexpr int input_level = 0;
+
+ auto file1 = mock::MakeMockFile({{keys[5], values[5]}, {keys[6], values[6]}});
+ AddMockFile(file1, input_level);
+
+ auto file2 = mock::MakeMockFile({{keys[3], values[3]}, {keys[4], values[4]}});
+ AddMockFile(file2, input_level);
+
+ auto file3 = mock::MakeMockFile(
+ {{keys[0], values[0]}, {keys[1], values[1]}, {keys[2], values[2]}});
+ AddMockFile(file3, input_level);
+
+ SetLastSequence(20);
+
+ auto output1 = mock::MakeMockFile({{keys[0], values[0]}});
+ auto output2 = mock::MakeMockFile(
+ {{keys[1], values[1]}, {keys[2], values[2]}, {keys[3], values[3]}});
+ auto output3 = mock::MakeMockFile(
+ {{keys[4], values[4]}, {keys[5], values[5]}, {keys[6], values[6]}});
+
+ auto expected_results =
+ std::vector<mock::KVVector>{output1, output2, output3};
+ const auto& files = cfd_->current()->storage_info()->LevelFiles(input_level);
+
+ constexpr int output_level = 1;
+ constexpr int max_subcompactions = 4;
+ RunCompaction({files}, {input_level}, expected_results, /*snapshots=*/{},
+ /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber,
+ output_level, /*verify=*/true, {kInvalidBlobFileNumber},
+ /*check_get_priority=*/false, Env::IO_TOTAL, Env::IO_TOTAL,
+ max_subcompactions);
+}
+
+// The io priority of the compaction reads and writes is different from that
+// of other DB reads and writes. To prepare the compaction input files, the
+// default filesystem from Env is used. To test the io priority of the
+// compaction reads and writes, db_options_.fs is set to MockTestFileSystem.
+class CompactionJobIOPriorityTest : public CompactionJobTestBase {
+ public:
+ CompactionJobIOPriorityTest()
+ : CompactionJobTestBase(
+ test::PerThreadDBPath("compaction_job_io_priority_test"),
+ BytewiseComparator(), [](uint64_t /*ts*/) { return ""; },
+ /*test_io_priority=*/true, TableTypeForTest::kBlockBasedTable) {}
+};
+
+TEST_F(CompactionJobIOPriorityTest, WriteControllerStateNormal) {
+ // When the state from WriteController is normal.
+ NewDB();
+ mock::KVVector expected_results = CreateTwoFiles(false);
+ auto cfd = versions_->GetColumnFamilySet()->GetDefault();
+ constexpr int input_level = 0;
+ auto files = cfd->current()->storage_info()->LevelFiles(input_level);
+ ASSERT_EQ(2U, files.size());
+ RunCompaction({files}, {input_level}, {expected_results}, {},
+ kMaxSequenceNumber, 1, false, {kInvalidBlobFileNumber}, false,
+ Env::IO_LOW, Env::IO_LOW);
+}
+
+TEST_F(CompactionJobIOPriorityTest, WriteControllerStateDelayed) {
+ // When the state from WriteController is Delayed.
+ NewDB();
+ mock::KVVector expected_results = CreateTwoFiles(false);
+ auto cfd = versions_->GetColumnFamilySet()->GetDefault();
+ constexpr int input_level = 0;
+ auto files = cfd->current()->storage_info()->LevelFiles(input_level);
+ ASSERT_EQ(2U, files.size());
+ {
+ std::unique_ptr<WriteControllerToken> delay_token =
+ write_controller_.GetDelayToken(1000000);
+ RunCompaction({files}, {input_level}, {expected_results}, {},
+ kMaxSequenceNumber, 1, false, {kInvalidBlobFileNumber}, false,
+ Env::IO_USER, Env::IO_USER);
+ }
+}
+
+TEST_F(CompactionJobIOPriorityTest, WriteControllerStateStalled) {
+ // When the state from WriteController is Stalled.
+ NewDB();
+ mock::KVVector expected_results = CreateTwoFiles(false);
+ auto cfd = versions_->GetColumnFamilySet()->GetDefault();
+ constexpr int input_level = 0;
+ auto files = cfd->current()->storage_info()->LevelFiles(input_level);
+ ASSERT_EQ(2U, files.size());
+ {
+ std::unique_ptr<WriteControllerToken> stop_token =
+ write_controller_.GetStopToken();
+ RunCompaction({files}, {input_level}, {expected_results}, {},
+ kMaxSequenceNumber, 1, false, {kInvalidBlobFileNumber}, false,
+ Env::IO_USER, Env::IO_USER);
+ }
+}
+
+TEST_F(CompactionJobIOPriorityTest, GetRateLimiterPriority) {
+ NewDB();
+ mock::KVVector expected_results = CreateTwoFiles(false);
+ auto cfd = versions_->GetColumnFamilySet()->GetDefault();
+ constexpr int input_level = 0;
+ auto files = cfd->current()->storage_info()->LevelFiles(input_level);
+ ASSERT_EQ(2U, files.size());
+ RunCompaction({files}, {input_level}, {expected_results}, {},
+ kMaxSequenceNumber, 1, false, {kInvalidBlobFileNumber}, true,
+ Env::IO_LOW, Env::IO_LOW);
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ RegisterCustomObjects(argc, argv);
+ return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+ fprintf(stderr,
+ "SKIPPED as CompactionJobStats is not supported in ROCKSDB_LITE\n");
+ return 0;
+}
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/db/compaction/compaction_outputs.cc b/src/rocksdb/db/compaction/compaction_outputs.cc
new file mode 100644
index 000000000..e74378e2a
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_outputs.cc
@@ -0,0 +1,646 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/compaction/compaction_outputs.h"
+
+#include "db/builder.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+void CompactionOutputs::NewBuilder(const TableBuilderOptions& tboptions) {
+ builder_.reset(NewTableBuilder(tboptions, file_writer_.get()));
+}
+
+Status CompactionOutputs::Finish(const Status& input_status,
+ const SeqnoToTimeMapping& seqno_time_mapping) {
+ FileMetaData* meta = GetMetaData();
+ assert(meta != nullptr);
+ Status s = input_status;
+ if (s.ok()) {
+ std::string seqno_time_mapping_str;
+ seqno_time_mapping.Encode(seqno_time_mapping_str, meta->fd.smallest_seqno,
+ meta->fd.largest_seqno, meta->file_creation_time);
+ builder_->SetSeqnoTimeTableProperties(seqno_time_mapping_str,
+ meta->oldest_ancester_time);
+ s = builder_->Finish();
+
+ } else {
+ builder_->Abandon();
+ }
+ Status io_s = builder_->io_status();
+ if (s.ok()) {
+ s = io_s;
+ } else {
+ io_s.PermitUncheckedError();
+ }
+ const uint64_t current_bytes = builder_->FileSize();
+ if (s.ok()) {
+ meta->fd.file_size = current_bytes;
+ meta->marked_for_compaction = builder_->NeedCompact();
+ }
+ current_output().finished = true;
+ stats_.bytes_written += current_bytes;
+ stats_.num_output_files = outputs_.size();
+
+ return s;
+}
+
+IOStatus CompactionOutputs::WriterSyncClose(const Status& input_status,
+ SystemClock* clock,
+ Statistics* statistics,
+ bool use_fsync) {
+ IOStatus io_s;
+ if (input_status.ok()) {
+ StopWatch sw(clock, statistics, COMPACTION_OUTFILE_SYNC_MICROS);
+ io_s = file_writer_->Sync(use_fsync);
+ }
+ if (input_status.ok() && io_s.ok()) {
+ io_s = file_writer_->Close();
+ }
+
+ if (input_status.ok() && io_s.ok()) {
+ FileMetaData* meta = GetMetaData();
+ meta->file_checksum = file_writer_->GetFileChecksum();
+ meta->file_checksum_func_name = file_writer_->GetFileChecksumFuncName();
+ }
+
+ file_writer_.reset();
+
+ return io_s;
+}
+
+size_t CompactionOutputs::UpdateGrandparentBoundaryInfo(
+ const Slice& internal_key) {
+ size_t curr_key_boundary_switched_num = 0;
+ const std::vector<FileMetaData*>& grandparents = compaction_->grandparents();
+
+ if (grandparents.empty()) {
+ return curr_key_boundary_switched_num;
+ }
+ assert(!internal_key.empty());
+ InternalKey ikey;
+ ikey.DecodeFrom(internal_key);
+ assert(ikey.Valid());
+
+ const Comparator* ucmp = compaction_->column_family_data()->user_comparator();
+
+ // Move the grandparent_index_ to the file containing the current user_key.
+ // If there are multiple files containing the same user_key, make sure the
+ // index points to the last file containing the key.
+ while (grandparent_index_ < grandparents.size()) {
+ if (being_grandparent_gap_) {
+ if (sstableKeyCompare(ucmp, ikey,
+ grandparents[grandparent_index_]->smallest) < 0) {
+ break;
+ }
+ if (seen_key_) {
+ curr_key_boundary_switched_num++;
+ grandparent_overlapped_bytes_ +=
+ grandparents[grandparent_index_]->fd.GetFileSize();
+ grandparent_boundary_switched_num_++;
+ }
+ being_grandparent_gap_ = false;
+ } else {
+ int cmp_result = sstableKeyCompare(
+ ucmp, ikey, grandparents[grandparent_index_]->largest);
+ // If it's the same key, make sure grandparent_index_ points to the last
+ // one.
+ if (cmp_result < 0 ||
+ (cmp_result == 0 &&
+ (grandparent_index_ == grandparents.size() - 1 ||
+ sstableKeyCompare(ucmp, ikey,
+ grandparents[grandparent_index_ + 1]->smallest) <
+ 0))) {
+ break;
+ }
+ if (seen_key_) {
+ curr_key_boundary_switched_num++;
+ grandparent_boundary_switched_num_++;
+ }
+ being_grandparent_gap_ = true;
+ grandparent_index_++;
+ }
+ }
+
+ // If the first key is in the middle of a grandparent file, add it to the
+ // overlapped bytes.
+ if (!seen_key_ && !being_grandparent_gap_) {
+ assert(grandparent_overlapped_bytes_ == 0);
+ grandparent_overlapped_bytes_ =
+ GetCurrentKeyGrandparentOverlappedBytes(internal_key);
+ }
+
+ seen_key_ = true;
+ return curr_key_boundary_switched_num;
+}
+
+uint64_t CompactionOutputs::GetCurrentKeyGrandparentOverlappedBytes(
+ const Slice& internal_key) const {
+ // no overlap with any grandparent file
+ if (being_grandparent_gap_) {
+ return 0;
+ }
+ uint64_t overlapped_bytes = 0;
+
+ const std::vector<FileMetaData*>& grandparents = compaction_->grandparents();
+ const Comparator* ucmp = compaction_->column_family_data()->user_comparator();
+ InternalKey ikey;
+ ikey.DecodeFrom(internal_key);
+#ifndef NDEBUG
+ // Make sure grandparent_index_ points to the last file containing the
+ // current key.
+ int cmp_result =
+ sstableKeyCompare(ucmp, ikey, grandparents[grandparent_index_]->largest);
+ assert(
+ cmp_result < 0 ||
+ (cmp_result == 0 &&
+ (grandparent_index_ == grandparents.size() - 1 ||
+ sstableKeyCompare(
+ ucmp, ikey, grandparents[grandparent_index_ + 1]->smallest) < 0)));
+ assert(sstableKeyCompare(ucmp, ikey,
+ grandparents[grandparent_index_]->smallest) >= 0);
+#endif
+ overlapped_bytes += grandparents[grandparent_index_]->fd.GetFileSize();
+
+ // Go backwards to find all overlapped files; one key can overlap multiple
+ // files. In the following example, if the current output key is `c` and one
+ // compaction file was cut before `c`, the current `c` can overlap with 3
+ // files:
+ // [a b] [c...
+ // [b, b] [c, c] [c, c] [c, d]
+ for (int64_t i = static_cast<int64_t>(grandparent_index_) - 1;
+ i >= 0 && sstableKeyCompare(ucmp, ikey, grandparents[i]->largest) == 0;
+ i--) {
+ overlapped_bytes += grandparents[i]->fd.GetFileSize();
+ }
+
+ return overlapped_bytes;
+}
+
+bool CompactionOutputs::ShouldStopBefore(const CompactionIterator& c_iter) {
+ assert(c_iter.Valid());
+
+ // always update grandparent information like overlapped file number, size
+ // etc.
+ const Slice& internal_key = c_iter.key();
+ const uint64_t previous_overlapped_bytes = grandparent_overlapped_bytes_;
+ size_t num_grandparent_boundaries_crossed =
+ UpdateGrandparentBoundaryInfo(internal_key);
+
+ if (!HasBuilder()) {
+ return false;
+ }
+
+ // If there's user defined partitioner, check that first
+ if (partitioner_ && partitioner_->ShouldPartition(PartitionerRequest(
+ last_key_for_partitioner_, c_iter.user_key(),
+ current_output_file_size_)) == kRequired) {
+ return true;
+ }
+
+ // files output to Level 0 won't be split
+ if (compaction_->output_level() == 0) {
+ return false;
+ }
+
+ // reach the max file size
+ if (current_output_file_size_ >= compaction_->max_output_file_size()) {
+ return true;
+ }
+
+ const InternalKeyComparator* icmp =
+ &compaction_->column_family_data()->internal_comparator();
+
+ // Check if it needs to split for RoundRobin
+ // Invalid local_output_split_key indicates that we do not need to split
+ if (local_output_split_key_ != nullptr && !is_split_) {
+ // Split occurs when the next key is larger than/equal to the cursor
+ if (icmp->Compare(internal_key, local_output_split_key_->Encode()) >= 0) {
+ is_split_ = true;
+ return true;
+ }
+ }
+
+ // only check if the current key is going to cross the grandparents file
+ // boundary (either the file beginning or ending).
+ if (num_grandparent_boundaries_crossed > 0) {
+    // Cut the file before the current key if the size of the current output
+    // file plus its overlapped grandparent files is bigger than
+    // max_compaction_bytes. This prevents a future compaction from the
+    // current output level from exceeding max_compaction_bytes.
+ if (grandparent_overlapped_bytes_ + current_output_file_size_ >
+ compaction_->max_compaction_bytes()) {
+ return true;
+ }
+
+ // Cut the file if including the key is going to add a skippable file on
+ // the grandparent level AND its size is reasonably big (1/8 of target file
+ // size). For example, if it's compacting the files L0 + L1:
+ // L0: [1, 21]
+ // L1: [3, 23]
+ // L2: [2, 4] [11, 15] [22, 24]
+ // Without this break, it will output as:
+ // L1: [1,3, 21,23]
+ // With this break, it will output as (assuming [11, 15] at L2 is bigger
+ // than 1/8 of target size):
+ // L1: [1,3] [21,23]
+ // Then for the future compactions, [11,15] won't be included.
+    // For random datasets (either evenly distributed or skewed), this
+    // condition rarely triggers, but if the user is adding 2 different
+    // datasets without any overlap, it is likely to happen.
+    // For more details, see PR #1963.
+ const size_t num_skippable_boundaries_crossed =
+ being_grandparent_gap_ ? 2 : 3;
+ if (compaction_->immutable_options()->compaction_style ==
+ kCompactionStyleLevel &&
+ compaction_->immutable_options()->level_compaction_dynamic_file_size &&
+ num_grandparent_boundaries_crossed >=
+ num_skippable_boundaries_crossed &&
+ grandparent_overlapped_bytes_ - previous_overlapped_bytes >
+ compaction_->target_output_file_size() / 8) {
+ return true;
+ }
+
+    // Pre-cut the output file if it's reaching a certain size AND it's at the
+    // boundary of a grandparent file. This can reduce the future compaction
+    // size; the cost is having smaller files.
+    // The pre-cut size threshold is based on how many grandparent boundaries
+    // it has seen before. Basically, if it has seen no boundary at all, it
+    // will pre-cut at 50% of the target file size. Every boundary it has seen
+    // increases the threshold by 5%, capped at 90%, at which point it will
+    // always cut.
+    // The idea is that if it has seen more boundaries before, it is more
+    // likely to see another boundary (a file-cutting opportunity) before
+    // reaching the target file size. Tests show this generates larger files
+    // than a static threshold like 75%, with a similar write amplification
+    // improvement.
+ if (compaction_->immutable_options()->compaction_style ==
+ kCompactionStyleLevel &&
+ compaction_->immutable_options()->level_compaction_dynamic_file_size &&
+ current_output_file_size_ >=
+ ((compaction_->target_output_file_size() + 99) / 100) *
+ (50 + std::min(grandparent_boundary_switched_num_ * 5,
+ size_t{40}))) {
+ return true;
+ }
+ }
+
+ // check ttl file boundaries if there's any
+ if (!files_to_cut_for_ttl_.empty()) {
+ if (cur_files_to_cut_for_ttl_ != -1) {
+ // Previous key is inside the range of a file
+ if (icmp->Compare(internal_key,
+ files_to_cut_for_ttl_[cur_files_to_cut_for_ttl_]
+ ->largest.Encode()) > 0) {
+ next_files_to_cut_for_ttl_ = cur_files_to_cut_for_ttl_ + 1;
+ cur_files_to_cut_for_ttl_ = -1;
+ return true;
+ }
+ } else {
+ // Look for the key position
+ while (next_files_to_cut_for_ttl_ <
+ static_cast<int>(files_to_cut_for_ttl_.size())) {
+ if (icmp->Compare(internal_key,
+ files_to_cut_for_ttl_[next_files_to_cut_for_ttl_]
+ ->smallest.Encode()) >= 0) {
+ if (icmp->Compare(internal_key,
+ files_to_cut_for_ttl_[next_files_to_cut_for_ttl_]
+ ->largest.Encode()) <= 0) {
+            // Within the current file
+ cur_files_to_cut_for_ttl_ = next_files_to_cut_for_ttl_;
+ return true;
+ }
+ // Beyond the current file
+ next_files_to_cut_for_ttl_++;
+ } else {
+          // Still falls in the gap
+ break;
+ }
+ }
+ }
+ }
+
+ return false;
+}
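+
+// Editorial note: a worked example of the dynamic pre-cut threshold used in
+// ShouldStopBefore() (a sketch under a hypothetical target_output_file_size()
+// of 64 MB, not part of this change):
+//
+//   boundaries seen    threshold pct    cut when output size >=
+//   0                  50               ~32 MB
+//   4                  70               ~44.8 MB
+//   8 or more          90 (capped)      ~57.6 MB
+//
+// i.e. threshold = ((target + 99) / 100) *
+//                  (50 + std::min(grandparent_boundary_switched_num_ * 5,
+//                                 size_t{40}))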
+
+Status CompactionOutputs::AddToOutput(
+ const CompactionIterator& c_iter,
+ const CompactionFileOpenFunc& open_file_func,
+ const CompactionFileCloseFunc& close_file_func) {
+ Status s;
+ const Slice& key = c_iter.key();
+
+ if (ShouldStopBefore(c_iter) && HasBuilder()) {
+ s = close_file_func(*this, c_iter.InputStatus(), key);
+ if (!s.ok()) {
+ return s;
+ }
+ // reset grandparent information
+ grandparent_boundary_switched_num_ = 0;
+ grandparent_overlapped_bytes_ =
+ GetCurrentKeyGrandparentOverlappedBytes(key);
+ }
+
+ // Open output file if necessary
+ if (!HasBuilder()) {
+ s = open_file_func(*this);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ assert(builder_ != nullptr);
+ const Slice& value = c_iter.value();
+ s = current_output().validator.Add(key, value);
+ if (!s.ok()) {
+ return s;
+ }
+ builder_->Add(key, value);
+
+ stats_.num_output_records++;
+ current_output_file_size_ = builder_->EstimatedFileSize();
+
+ if (blob_garbage_meter_) {
+ s = blob_garbage_meter_->ProcessOutFlow(key, value);
+ }
+
+ if (!s.ok()) {
+ return s;
+ }
+
+ const ParsedInternalKey& ikey = c_iter.ikey();
+ s = current_output().meta.UpdateBoundaries(key, value, ikey.sequence,
+ ikey.type);
+
+ if (partitioner_) {
+ last_key_for_partitioner_.assign(c_iter.user_key().data_,
+ c_iter.user_key().size_);
+ }
+
+ return s;
+}
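+
+// Editorial note: a minimal usage sketch for AddToOutput() (hypothetical
+// caller code, not part of this change; the helper names and variables below
+// are placeholders, the real wiring lives in the compaction job):
+//
+//   CompactionFileOpenFunc open_fn = [&](CompactionOutputs& co) {
+//     return OpenOutputFile(co);                       // hypothetical helper
+//   };
+//   CompactionFileCloseFunc close_fn = [&](CompactionOutputs& co,
+//                                          const Status& s,
+//                                          const Slice& next_key) {
+//     return FinishOutputFile(co, s, next_key);        // hypothetical helper
+//   };
+//   Status status;
+//   while (c_iter->Valid() && status.ok()) {
+//     status = outputs.AddToOutput(*c_iter, open_fn, close_fn);
+//     c_iter->Next();
+//   }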
+
+Status CompactionOutputs::AddRangeDels(
+ const Slice* comp_start_user_key, const Slice* comp_end_user_key,
+ CompactionIterationStats& range_del_out_stats, bool bottommost_level,
+ const InternalKeyComparator& icmp, SequenceNumber earliest_snapshot,
+ const Slice& next_table_min_key, const std::string& full_history_ts_low) {
+ assert(HasRangeDel());
+ FileMetaData& meta = current_output().meta;
+ const Comparator* ucmp = icmp.user_comparator();
+
+ Slice lower_bound_guard, upper_bound_guard;
+ std::string smallest_user_key;
+ const Slice *lower_bound, *upper_bound;
+ bool lower_bound_from_sub_compact = false;
+
+ size_t output_size = outputs_.size();
+ if (output_size == 1) {
+ // For the first output table, include range tombstones before the min
+ // key but after the subcompaction boundary.
+ lower_bound = comp_start_user_key;
+ lower_bound_from_sub_compact = true;
+ } else if (meta.smallest.size() > 0) {
+ // For subsequent output tables, only include range tombstones from min
+ // key onwards since the previous file was extended to contain range
+ // tombstones falling before min key.
+ smallest_user_key = meta.smallest.user_key().ToString(false /*hex*/);
+ lower_bound_guard = Slice(smallest_user_key);
+ lower_bound = &lower_bound_guard;
+ } else {
+ lower_bound = nullptr;
+ }
+ if (!next_table_min_key.empty()) {
+    // This may be the last file in the subcompaction in some cases, so we
+    // need to compare the end key of the subcompaction with the next file's
+    // start key. When the end key is chosen by the subcompaction, we know
+    // that it must be the biggest key in the output file. Therefore, it is
+    // safe to use the smaller of the two keys as the upper bound of the
+    // output file, ensuring that there is no overlap between different
+    // output files.
+ upper_bound_guard = ExtractUserKey(next_table_min_key);
+ if (comp_end_user_key != nullptr &&
+ ucmp->CompareWithoutTimestamp(upper_bound_guard, *comp_end_user_key) >=
+ 0) {
+ upper_bound = comp_end_user_key;
+ } else {
+ upper_bound = &upper_bound_guard;
+ }
+ } else {
+ // This is the last file in the subcompaction, so extend until the
+ // subcompaction ends.
+ upper_bound = comp_end_user_key;
+ }
+ bool has_overlapping_endpoints;
+ if (upper_bound != nullptr && meta.largest.size() > 0) {
+ has_overlapping_endpoints = ucmp->CompareWithoutTimestamp(
+ meta.largest.user_key(), *upper_bound) == 0;
+ } else {
+ has_overlapping_endpoints = false;
+ }
+
+  // The end key of the subcompaction must be bigger than or equal to the
+  // upper bound. If the end of the subcompaction is null or the upper bound
+  // is null, it means that this file is the last file in the compaction, so
+  // there will be no overlap between this file and others.
+ assert(comp_end_user_key == nullptr || upper_bound == nullptr ||
+ ucmp->CompareWithoutTimestamp(*upper_bound, *comp_end_user_key) <= 0);
+ auto it = range_del_agg_->NewIterator(lower_bound, upper_bound,
+ has_overlapping_endpoints);
+ // Position the range tombstone output iterator. There may be tombstone
+ // fragments that are entirely out of range, so make sure that we do not
+ // include those.
+ if (lower_bound != nullptr) {
+ it->Seek(*lower_bound);
+ } else {
+ it->SeekToFirst();
+ }
+ for (; it->Valid(); it->Next()) {
+ auto tombstone = it->Tombstone();
+ if (upper_bound != nullptr) {
+ int cmp =
+ ucmp->CompareWithoutTimestamp(*upper_bound, tombstone.start_key_);
+ if ((has_overlapping_endpoints && cmp < 0) ||
+ (!has_overlapping_endpoints && cmp <= 0)) {
+ // Tombstones starting after upper_bound only need to be included in
+ // the next table. If the current SST ends before upper_bound, i.e.,
+ // `has_overlapping_endpoints == false`, we can also skip over range
+ // tombstones that start exactly at upper_bound. Such range
+ // tombstones will be included in the next file and are not relevant
+ // to the point keys or endpoints of the current file.
+ break;
+ }
+ }
+
+ const size_t ts_sz = ucmp->timestamp_size();
+ // Garbage collection for range tombstones.
+ // If user-defined timestamp is enabled, range tombstones are dropped if
+ // they are at bottommost_level, below full_history_ts_low and not visible
+ // in any snapshot. trim_ts_ is passed to the constructor for
+ // range_del_agg_, and range_del_agg_ internally drops tombstones above
+ // trim_ts_.
+ if (bottommost_level && tombstone.seq_ <= earliest_snapshot &&
+ (ts_sz == 0 ||
+ (!full_history_ts_low.empty() &&
+ ucmp->CompareTimestamp(tombstone.ts_, full_history_ts_low) < 0))) {
+ // TODO(andrewkr): tombstones that span multiple output files are
+ // counted for each compaction output file, so lots of double
+ // counting.
+ range_del_out_stats.num_range_del_drop_obsolete++;
+ range_del_out_stats.num_record_drop_obsolete++;
+ continue;
+ }
+
+ auto kv = tombstone.Serialize();
+ assert(lower_bound == nullptr ||
+ ucmp->CompareWithoutTimestamp(*lower_bound, kv.second) < 0);
+ // Range tombstone is not supported by output validator yet.
+ builder_->Add(kv.first.Encode(), kv.second);
+ InternalKey smallest_candidate = std::move(kv.first);
+ if (lower_bound != nullptr &&
+ ucmp->CompareWithoutTimestamp(smallest_candidate.user_key(),
+ *lower_bound) <= 0) {
+ // Pretend the smallest key has the same user key as lower_bound
+ // (the max key in the previous table or subcompaction) in order for
+ // files to appear key-space partitioned.
+ //
+ // When lower_bound is chosen by a subcompaction, we know that
+ // subcompactions over smaller keys cannot contain any keys at
+ // lower_bound. We also know that smaller subcompactions exist,
+      // because otherwise the subcompaction would be unbounded on the left.
+ // As a result, we know that no other files on the output level will
+ // contain actual keys at lower_bound (an output file may have a
+ // largest key of lower_bound@kMaxSequenceNumber, but this only
+ // indicates a large range tombstone was truncated). Therefore, it is
+ // safe to use the tombstone's sequence number, to ensure that keys at
+ // lower_bound at lower levels are covered by truncated tombstones.
+ //
+ // If lower_bound was chosen by the smallest data key in the file,
+ // choose lowest seqnum so this file's smallest internal key comes
+ // after the previous file's largest. The fake seqnum is OK because
+ // the read path's file-picking code only considers user key.
+ if (lower_bound_from_sub_compact) {
+ if (ts_sz) {
+ assert(tombstone.ts_.size() == ts_sz);
+ smallest_candidate = InternalKey(*lower_bound, tombstone.seq_,
+ kTypeRangeDeletion, tombstone.ts_);
+ } else {
+ smallest_candidate =
+ InternalKey(*lower_bound, tombstone.seq_, kTypeRangeDeletion);
+ }
+ } else {
+ smallest_candidate = InternalKey(*lower_bound, 0, kTypeRangeDeletion);
+ }
+ }
+ InternalKey largest_candidate = tombstone.SerializeEndKey();
+ if (upper_bound != nullptr &&
+ ucmp->CompareWithoutTimestamp(*upper_bound,
+ largest_candidate.user_key()) <= 0) {
+ // Pretend the largest key has the same user key as upper_bound (the
+ // min key in the following table or subcompaction) in order for files
+ // to appear key-space partitioned.
+ //
+ // Choose highest seqnum so this file's largest internal key comes
+ // before the next file's/subcompaction's smallest. The fake seqnum is
+ // OK because the read path's file-picking code only considers the
+ // user key portion.
+ //
+ // Note Seek() also creates InternalKey with (user_key,
+ // kMaxSequenceNumber), but with kTypeDeletion (0x7) instead of
+ // kTypeRangeDeletion (0xF), so the range tombstone comes before the
+ // Seek() key in InternalKey's ordering. So Seek() will look in the
+ // next file for the user key
+ if (ts_sz) {
+ static constexpr char kTsMax[] = "\xff\xff\xff\xff\xff\xff\xff\xff\xff";
+ if (ts_sz <= strlen(kTsMax)) {
+ largest_candidate =
+ InternalKey(*upper_bound, kMaxSequenceNumber, kTypeRangeDeletion,
+ Slice(kTsMax, ts_sz));
+ } else {
+ largest_candidate =
+ InternalKey(*upper_bound, kMaxSequenceNumber, kTypeRangeDeletion,
+ std::string(ts_sz, '\xff'));
+ }
+ } else {
+ largest_candidate =
+ InternalKey(*upper_bound, kMaxSequenceNumber, kTypeRangeDeletion);
+ }
+ }
+#ifndef NDEBUG
+ SequenceNumber smallest_ikey_seqnum = kMaxSequenceNumber;
+ if (meta.smallest.size() > 0) {
+ smallest_ikey_seqnum = GetInternalKeySeqno(meta.smallest.Encode());
+ }
+#endif
+ meta.UpdateBoundariesForRange(smallest_candidate, largest_candidate,
+ tombstone.seq_, icmp);
+ // The smallest key in a file is used for range tombstone truncation, so
+ // it cannot have a seqnum of 0 (unless the smallest data key in a file
+ // has a seqnum of 0). Otherwise, the truncated tombstone may expose
+ // deleted keys at lower levels.
+ assert(smallest_ikey_seqnum == 0 ||
+ ExtractInternalKeyFooter(meta.smallest.Encode()) !=
+ PackSequenceAndType(0, kTypeRangeDeletion));
+ }
+ return Status::OK();
+}
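+
+// Editorial note: an illustrative sketch of how AddRangeDels() truncates a
+// tombstone that spans two output files (hypothetical user keys, not part of
+// this change). Suppose tombstone [c, m)@seq spans file1 (point keys up to f)
+// and file2 (point keys from g), with upper_bound = g for file1 and
+// lower_bound = g for file2:
+//
+//   file1: largest extended to (g, kMaxSequenceNumber, kTypeRangeDeletion)
+//   file2: smallest clamped to (g, seq, kTypeRangeDeletion) when g came from
+//          the subcompaction boundary, or (g, 0, kTypeRangeDeletion) when it
+//          came from the previous file's data
+//
+// so the two files still appear key-space partitioned to the read path.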
+
+void CompactionOutputs::FillFilesToCutForTtl() {
+ if (compaction_->immutable_options()->compaction_style !=
+ kCompactionStyleLevel ||
+ compaction_->immutable_options()->compaction_pri !=
+ kMinOverlappingRatio ||
+ compaction_->mutable_cf_options()->ttl == 0 ||
+ compaction_->num_input_levels() < 2 || compaction_->bottommost_level()) {
+ return;
+ }
+
+  // We define a new file as one whose oldest ancestor time is younger than
+  // 1/4 of the TTL, and an old one as older than 1/2 of the TTL.
+ int64_t temp_current_time;
+ auto get_time_status =
+ compaction_->immutable_options()->clock->GetCurrentTime(
+ &temp_current_time);
+ if (!get_time_status.ok()) {
+ return;
+ }
+
+ auto current_time = static_cast<uint64_t>(temp_current_time);
+ if (current_time < compaction_->mutable_cf_options()->ttl) {
+ return;
+ }
+
+ uint64_t old_age_thres =
+ current_time - compaction_->mutable_cf_options()->ttl / 2;
+ const std::vector<FileMetaData*>& olevel =
+ *(compaction_->inputs(compaction_->num_input_levels() - 1));
+ for (FileMetaData* file : olevel) {
+ // Worth filtering out by start and end?
+ uint64_t oldest_ancester_time = file->TryGetOldestAncesterTime();
+    // We only pick old files that are not too small, to prevent a flood
+    // of small files.
+ if (oldest_ancester_time < old_age_thres &&
+ file->fd.GetFileSize() >
+ compaction_->mutable_cf_options()->target_file_size_base / 2) {
+ files_to_cut_for_ttl_.push_back(file);
+ }
+ }
+}
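+
+// Editorial note: a small sketch of the thresholds applied in
+// FillFilesToCutForTtl() (hypothetical option values, not part of this
+// change). With ttl = 10 days and target_file_size_base = 64 MB, an output
+// level file is added to files_to_cut_for_ttl_ when
+//
+//   oldest_ancestor_time < now - 5 days   // older than ttl / 2
+//   file size            > 32 MB          // larger than target / 2
+//
+// so only reasonably large, genuinely old files introduce extra cut points.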
+
+CompactionOutputs::CompactionOutputs(const Compaction* compaction,
+ const bool is_penultimate_level)
+ : compaction_(compaction), is_penultimate_level_(is_penultimate_level) {
+ partitioner_ = compaction->output_level() == 0
+ ? nullptr
+ : compaction->CreateSstPartitioner();
+
+ if (compaction->output_level() != 0) {
+ FillFilesToCutForTtl();
+ }
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/compaction_outputs.h b/src/rocksdb/db/compaction/compaction_outputs.h
new file mode 100644
index 000000000..f40aa8215
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_outputs.h
@@ -0,0 +1,385 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include "db/blob/blob_garbage_meter.h"
+#include "db/compaction/compaction.h"
+#include "db/compaction/compaction_iterator.h"
+#include "db/internal_stats.h"
+#include "db/output_validator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class CompactionOutputs;
+using CompactionFileOpenFunc = std::function<Status(CompactionOutputs&)>;
+using CompactionFileCloseFunc =
+ std::function<Status(CompactionOutputs&, const Status&, const Slice&)>;
+
+// Files produced by a subcompaction. Most of the functions are used by
+// compaction_job's Open/Close compaction file functions.
+class CompactionOutputs {
+ public:
+ // compaction output file
+ struct Output {
+ Output(FileMetaData&& _meta, const InternalKeyComparator& _icmp,
+ bool _enable_order_check, bool _enable_hash, bool _finished,
+ uint64_t precalculated_hash)
+ : meta(std::move(_meta)),
+ validator(_icmp, _enable_order_check, _enable_hash,
+ precalculated_hash),
+ finished(_finished) {}
+ FileMetaData meta;
+ OutputValidator validator;
+ bool finished;
+ std::shared_ptr<const TableProperties> table_properties;
+ };
+
+ CompactionOutputs() = delete;
+
+ explicit CompactionOutputs(const Compaction* compaction,
+ const bool is_penultimate_level);
+
+ // Add generated output to the list
+ void AddOutput(FileMetaData&& meta, const InternalKeyComparator& icmp,
+ bool enable_order_check, bool enable_hash,
+ bool finished = false, uint64_t precalculated_hash = 0) {
+ outputs_.emplace_back(std::move(meta), icmp, enable_order_check,
+ enable_hash, finished, precalculated_hash);
+ }
+
+ // Set new table builder for the current output
+ void NewBuilder(const TableBuilderOptions& tboptions);
+
+ // Assign a new WritableFileWriter to the current output
+ void AssignFileWriter(WritableFileWriter* writer) {
+ file_writer_.reset(writer);
+ }
+
+  // TODO: Remove it when remote compaction supports tiered compaction
+ void SetTotalBytes(uint64_t bytes) { stats_.bytes_written += bytes; }
+ void SetNumOutputRecords(uint64_t num) { stats_.num_output_records = num; }
+
+ // TODO: Move the BlobDB builder into CompactionOutputs
+ const std::vector<BlobFileAddition>& GetBlobFileAdditions() const {
+ if (is_penultimate_level_) {
+ assert(blob_file_additions_.empty());
+ }
+ return blob_file_additions_;
+ }
+
+ std::vector<BlobFileAddition>* GetBlobFileAdditionsPtr() {
+ assert(!is_penultimate_level_);
+ return &blob_file_additions_;
+ }
+
+ bool HasBlobFileAdditions() const { return !blob_file_additions_.empty(); }
+
+ BlobGarbageMeter* CreateBlobGarbageMeter() {
+ assert(!is_penultimate_level_);
+ blob_garbage_meter_ = std::make_unique<BlobGarbageMeter>();
+ return blob_garbage_meter_.get();
+ }
+
+ BlobGarbageMeter* GetBlobGarbageMeter() const {
+ if (is_penultimate_level_) {
+ // blobdb doesn't support per_key_placement yet
+ assert(blob_garbage_meter_ == nullptr);
+ return nullptr;
+ }
+ return blob_garbage_meter_.get();
+ }
+
+ void UpdateBlobStats() {
+ assert(!is_penultimate_level_);
+ stats_.num_output_files_blob = blob_file_additions_.size();
+ for (const auto& blob : blob_file_additions_) {
+ stats_.bytes_written_blob += blob.GetTotalBlobBytes();
+ }
+ }
+
+ // Finish the current output file
+  Status Finish(const Status& input_status,
+ const SeqnoToTimeMapping& seqno_time_mapping);
+
+ // Update output table properties from table builder
+ void UpdateTableProperties() {
+ current_output().table_properties =
+ std::make_shared<TableProperties>(GetTableProperties());
+ }
+
+  IOStatus WriterSyncClose(const Status& input_status, SystemClock* clock,
+ Statistics* statistics, bool use_fsync);
+
+ TableProperties GetTableProperties() {
+ return builder_->GetTableProperties();
+ }
+
+ Slice SmallestUserKey() const {
+ if (!outputs_.empty() && outputs_[0].finished) {
+ return outputs_[0].meta.smallest.user_key();
+ } else {
+ return Slice{nullptr, 0};
+ }
+ }
+
+ Slice LargestUserKey() const {
+ if (!outputs_.empty() && outputs_.back().finished) {
+ return outputs_.back().meta.largest.user_key();
+ } else {
+ return Slice{nullptr, 0};
+ }
+ }
+
+  // In case the last output file is empty; an empty output file doesn't need
+  // to be kept.
+ void RemoveLastEmptyOutput() {
+ if (!outputs_.empty() && !outputs_.back().meta.fd.file_size) {
+ // An error occurred, so ignore the last output.
+ outputs_.pop_back();
+ }
+ }
+
+  // Remove the last output, e.g. when the last output has no data (no
+  // entries and no range-dels) but its file_size might not be 0, as it still
+  // has SST metadata.
+ void RemoveLastOutput() {
+ assert(!outputs_.empty());
+ outputs_.pop_back();
+ }
+
+ bool HasBuilder() const { return builder_ != nullptr; }
+
+ FileMetaData* GetMetaData() { return &current_output().meta; }
+
+ bool HasOutput() const { return !outputs_.empty(); }
+
+ uint64_t NumEntries() const { return builder_->NumEntries(); }
+
+ void ResetBuilder() {
+ builder_.reset();
+ current_output_file_size_ = 0;
+ }
+
+ // Add range-dels from the aggregator to the current output file
+ // @param comp_start_user_key and comp_end_user_key include timestamp if
+ // user-defined timestamp is enabled.
+ // @param full_history_ts_low used for range tombstone garbage collection.
+ Status AddRangeDels(const Slice* comp_start_user_key,
+ const Slice* comp_end_user_key,
+ CompactionIterationStats& range_del_out_stats,
+ bool bottommost_level, const InternalKeyComparator& icmp,
+ SequenceNumber earliest_snapshot,
+ const Slice& next_table_min_key,
+ const std::string& full_history_ts_low);
+
+  // Whether the outputs have range deletions; range deletions are also data.
+ bool HasRangeDel() const {
+ return range_del_agg_ && !range_del_agg_->IsEmpty();
+ }
+
+ private:
+ friend class SubcompactionState;
+
+ void FillFilesToCutForTtl();
+
+ void SetOutputSlitKey(const std::optional<Slice> start,
+ const std::optional<Slice> end) {
+ const InternalKeyComparator* icmp =
+ &compaction_->column_family_data()->internal_comparator();
+
+ const InternalKey* output_split_key = compaction_->GetOutputSplitKey();
+ // Invalid output_split_key indicates that we do not need to split
+ if (output_split_key != nullptr) {
+    // We may only split the output when the cursor is in the range.
+ if ((!end.has_value() ||
+ icmp->user_comparator()->Compare(
+ ExtractUserKey(output_split_key->Encode()), end.value()) < 0) &&
+ (!start.has_value() || icmp->user_comparator()->Compare(
+ ExtractUserKey(output_split_key->Encode()),
+ start.value()) > 0)) {
+ local_output_split_key_ = output_split_key;
+ }
+ }
+ }
+
+ // Returns true iff we should stop building the current output
+ // before processing the current key in compaction iterator.
+ bool ShouldStopBefore(const CompactionIterator& c_iter);
+
+ void Cleanup() {
+ if (builder_ != nullptr) {
+ // May happen if we get a shutdown call in the middle of compaction
+ builder_->Abandon();
+ builder_.reset();
+ }
+ }
+
+  // Update tracked grandparent information, such as the grandparent index,
+  // whether the key is in the gap between 2 grandparent files, the
+  // accumulated grandparent file size, etc.
+  // Returns how many boundaries are crossed by including the current key.
+ size_t UpdateGrandparentBoundaryInfo(const Slice& internal_key);
+
+  // Helper function to get the overlapped grandparent file size; it's only
+  // used for calculating the first key's overlap.
+ uint64_t GetCurrentKeyGrandparentOverlappedBytes(
+ const Slice& internal_key) const;
+
+  // Add the current key from the compaction iterator to the output file. If
+  // needed, close and open a new compaction output with the functions
+  // provided.
+ Status AddToOutput(const CompactionIterator& c_iter,
+ const CompactionFileOpenFunc& open_file_func,
+ const CompactionFileCloseFunc& close_file_func);
+
+  // Close the current output. `open_file_func` is needed to create a new
+  // file for a range-dels-only output file.
+ Status CloseOutput(const Status& curr_status,
+ const CompactionFileOpenFunc& open_file_func,
+ const CompactionFileCloseFunc& close_file_func) {
+ Status status = curr_status;
+ // handle subcompaction containing only range deletions
+ if (status.ok() && !HasBuilder() && !HasOutput() && HasRangeDel()) {
+ status = open_file_func(*this);
+ }
+ if (HasBuilder()) {
+ const Slice empty_key{};
+ Status s = close_file_func(*this, status, empty_key);
+ if (!s.ok() && status.ok()) {
+ status = s;
+ }
+ }
+
+ return status;
+ }
+
+  // This subcompaction's output could be empty if compaction was aborted
+  // before this subcompaction had a chance to generate any output files.
+  // When subcompactions are executed sequentially this is more likely,
+  // particularly for the later subcompactions. Once they are run in
+  // parallel, however, it should be much rarer.
+  // It's the caller's responsibility to make sure it's not empty.
+ Output& current_output() {
+ assert(!outputs_.empty());
+ return outputs_.back();
+ }
+
+  // Assign the range_del_agg to the target output level. There's only one
+  // range-del-aggregator per compaction's outputs; for
+  // output_to_penultimate_level compaction it is only assigned to the
+  // penultimate level.
+ void AssignRangeDelAggregator(
+ std::unique_ptr<CompactionRangeDelAggregator>&& range_del_agg) {
+ assert(range_del_agg_ == nullptr);
+ range_del_agg_ = std::move(range_del_agg);
+ }
+
+ const Compaction* compaction_;
+
+ // current output builder and writer
+ std::unique_ptr<TableBuilder> builder_;
+ std::unique_ptr<WritableFileWriter> file_writer_;
+ uint64_t current_output_file_size_ = 0;
+
+ // all the compaction outputs so far
+ std::vector<Output> outputs_;
+
+ // BlobDB info
+ std::vector<BlobFileAddition> blob_file_additions_;
+ std::unique_ptr<BlobGarbageMeter> blob_garbage_meter_;
+
+ // Basic compaction output stats for this level's outputs
+ InternalStats::CompactionOutputsStats stats_;
+
+  // Indicates whether this CompactionOutputs object is for the penultimate
+  // level; should always be false if the per_key_placement feature is not
+  // enabled.
+ const bool is_penultimate_level_;
+ std::unique_ptr<CompactionRangeDelAggregator> range_del_agg_ = nullptr;
+
+ // partitioner information
+ std::string last_key_for_partitioner_;
+ std::unique_ptr<SstPartitioner> partitioner_;
+
+  // A flag that indicates whether this subcompaction has been split by the
+  // cursor
+ bool is_split_ = false;
+
+ // We also maintain the output split key for each subcompaction to avoid
+ // repetitive comparison in ShouldStopBefore()
+ const InternalKey* local_output_split_key_ = nullptr;
+
+  // Some identified files with an old oldest-ancestor time; the range should
+  // be isolated out so that the output file(s) in that range can be merged
+  // down for TTL, clearing the timestamps for the range.
+ std::vector<FileMetaData*> files_to_cut_for_ttl_;
+ int cur_files_to_cut_for_ttl_ = -1;
+ int next_files_to_cut_for_ttl_ = 0;
+
+  // An index used to speed up ShouldStopBefore().
+ size_t grandparent_index_ = 0;
+
+  // True if the current output key is in the gap between grandparent files,
+  // i.e.:
+  // key > grandparents[grandparent_index_ - 1].largest &&
+  // key < grandparents[grandparent_index_].smallest
+ bool being_grandparent_gap_ = true;
+
+ // The number of bytes overlapping between the current output and
+ // grandparent files used in ShouldStopBefore().
+ uint64_t grandparent_overlapped_bytes_ = 0;
+
+  // A flag that indicates whether a key has been seen in ShouldStopBefore()
+ bool seen_key_ = false;
+
+  // For the current output file, how many grandparent file boundaries it has
+  // crossed; basically the number of overlapped files * 2
+ size_t grandparent_boundary_switched_num_ = 0;
+};
+
+// Helper struct to concatenate the last level and penultimate level outputs,
+// which could be replaced by std::ranges::join_view() in C++20.
+struct OutputIterator {
+ public:
+ explicit OutputIterator(const std::vector<CompactionOutputs::Output>& a,
+ const std::vector<CompactionOutputs::Output>& b)
+ : a_(a), b_(b) {
+ within_a = !a_.empty();
+ idx_ = 0;
+ }
+
+ OutputIterator begin() { return *this; }
+
+ OutputIterator end() { return *this; }
+
+ size_t size() { return a_.size() + b_.size(); }
+
+ const CompactionOutputs::Output& operator*() const {
+ return within_a ? a_[idx_] : b_[idx_];
+ }
+
+ OutputIterator& operator++() {
+ idx_++;
+ if (within_a && idx_ >= a_.size()) {
+ within_a = false;
+ idx_ = 0;
+ }
+ assert(within_a || idx_ <= b_.size());
+ return *this;
+ }
+
+ bool operator!=(const OutputIterator& /*rhs*/) const {
+ return within_a || idx_ < b_.size();
+ }
+
+ private:
+ const std::vector<CompactionOutputs::Output>& a_;
+ const std::vector<CompactionOutputs::Output>& b_;
+ bool within_a;
+ size_t idx_;
+};
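+
+// Editorial note: a minimal usage sketch for OutputIterator (hypothetical
+// variable names, not part of this change):
+//
+//   uint64_t total_bytes = 0;
+//   for (const auto& output :
+//        OutputIterator(last_level_outputs, penultimate_level_outputs)) {
+//     total_bytes += output.meta.fd.GetFileSize();
+//   }
+//
+// Iteration walks all of `a` first, then all of `b`; operator!= acts as an
+// end-of-sequence test rather than a true pairwise comparison.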
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/compaction_picker.cc b/src/rocksdb/db/compaction/compaction_picker.cc
new file mode 100644
index 000000000..abdecca9f
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_picker.cc
@@ -0,0 +1,1234 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/compaction/compaction_picker.h"
+
+#include <cinttypes>
+#include <limits>
+#include <queue>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "db/column_family.h"
+#include "file/filename.h"
+#include "logging/log_buffer.h"
+#include "logging/logging.h"
+#include "monitoring/statistics.h"
+#include "test_util/sync_point.h"
+#include "util/random.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+bool FindIntraL0Compaction(const std::vector<FileMetaData*>& level_files,
+ size_t min_files_to_compact,
+ uint64_t max_compact_bytes_per_del_file,
+ uint64_t max_compaction_bytes,
+ CompactionInputFiles* comp_inputs,
+ SequenceNumber earliest_mem_seqno) {
+  // Do not pick an ingested file when there is at least one unflushed
+  // memtable whose seqno range overlaps with the sst.
+ TEST_SYNC_POINT("FindIntraL0Compaction");
+ size_t start = 0;
+ for (; start < level_files.size(); start++) {
+ if (level_files[start]->being_compacted) {
+ return false;
+ }
+    // If there is no data in the memtable, the earliest sequence number would
+    // be the largest sequence number in the last memtable.
+    // Because all files are sorted in descending order by largest_seqno, we
+    // only need to check the first one.
+ if (level_files[start]->fd.largest_seqno <= earliest_mem_seqno) {
+ break;
+ }
+ }
+ if (start >= level_files.size()) {
+ return false;
+ }
+ size_t compact_bytes = static_cast<size_t>(level_files[start]->fd.file_size);
+ size_t compact_bytes_per_del_file = std::numeric_limits<size_t>::max();
+ // Compaction range will be [start, limit).
+ size_t limit;
+ // Pull in files until the amount of compaction work per deleted file begins
+ // increasing or maximum total compaction size is reached.
+ size_t new_compact_bytes_per_del_file = 0;
+ for (limit = start + 1; limit < level_files.size(); ++limit) {
+ compact_bytes += static_cast<size_t>(level_files[limit]->fd.file_size);
+ new_compact_bytes_per_del_file = compact_bytes / (limit - start);
+ if (level_files[limit]->being_compacted ||
+ new_compact_bytes_per_del_file > compact_bytes_per_del_file ||
+ compact_bytes > max_compaction_bytes) {
+ break;
+ }
+ compact_bytes_per_del_file = new_compact_bytes_per_del_file;
+ }
+
+ if ((limit - start) >= min_files_to_compact &&
+ compact_bytes_per_del_file < max_compact_bytes_per_del_file) {
+ assert(comp_inputs != nullptr);
+ comp_inputs->level = 0;
+ for (size_t i = start; i < limit; ++i) {
+ comp_inputs->files.push_back(level_files[i]);
+ }
+ return true;
+ }
+ return false;
+}
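+
+// Editorial note: a worked example of the greedy criterion above (hypothetical
+// L0 file sizes, not part of this change). With sizes {10, 10, 10, 100} MB and
+// start = 0:
+//
+//   limit = 1: compact_bytes = 20 MB,  20 / 1 = 20 MB per deleted file
+//   limit = 2: compact_bytes = 30 MB,  30 / 2 = 15 MB per deleted file
+//   limit = 3: compact_bytes = 130 MB, 130 / 3 > 15 MB -> cost rose, stop
+//
+// Files [0, 3) are proposed, provided at least min_files_to_compact files
+// were gathered and 15 MB stays below max_compact_bytes_per_del_file.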
+
+// Determine compression type, based on user options, level of the output
+// file and whether compression is disabled.
+// If enable_compression is false, then compression is always disabled no
+// matter what the values of the other two parameters are.
+// Otherwise, the compression type is determined based on options and level.
+CompressionType GetCompressionType(const VersionStorageInfo* vstorage,
+ const MutableCFOptions& mutable_cf_options,
+ int level, int base_level,
+ const bool enable_compression) {
+ if (!enable_compression) {
+ // disable compression
+ return kNoCompression;
+ }
+
+ // If bottommost_compression is set and we are compacting to the
+ // bottommost level then we should use it.
+ if (mutable_cf_options.bottommost_compression != kDisableCompressionOption &&
+ level >= (vstorage->num_non_empty_levels() - 1)) {
+ return mutable_cf_options.bottommost_compression;
+ }
+ // If the user has specified a different compression level for each level,
+ // then pick the compression for that level.
+ if (!mutable_cf_options.compression_per_level.empty()) {
+ assert(level == 0 || level >= base_level);
+ int idx = (level == 0) ? 0 : level - base_level + 1;
+
+ const int n =
+ static_cast<int>(mutable_cf_options.compression_per_level.size()) - 1;
+    // It is possible for level to be -1; in that case, we use level
+    // 0's compression. This occurs mostly in backwards compatibility
+    // situations when the builder doesn't know what level the file
+    // belongs to. Likewise, if level is beyond the end of the
+    // specified compression levels, use the last value.
+ return mutable_cf_options
+ .compression_per_level[std::max(0, std::min(idx, n))];
+ } else {
+ return mutable_cf_options.compression;
+ }
+}
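+
+// Editorial note: a worked example of the per-level compression index above
+// (hypothetical options, not part of this change). With compression_per_level
+// = {kNoCompression, kLZ4Compression, kZSTD} and base_level = 4:
+//
+//   level 0            -> idx 0 -> kNoCompression
+//   level 4 (base)     -> idx 1 -> kLZ4Compression
+//   level 5            -> idx 2 -> kZSTD
+//   level 6 and deeper -> idx clamped to 2 -> kZSTD
+//
+// bottommost_compression, if set, still overrides the choice for the last
+// non-empty level.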
+
+CompressionOptions GetCompressionOptions(const MutableCFOptions& cf_options,
+ const VersionStorageInfo* vstorage,
+ int level,
+ const bool enable_compression) {
+ if (!enable_compression) {
+ return cf_options.compression_opts;
+ }
+ // If bottommost_compression_opts is enabled and we are compacting to the
+ // bottommost level then we should use the specified compression options.
+ if (level >= (vstorage->num_non_empty_levels() - 1) &&
+ cf_options.bottommost_compression_opts.enabled) {
+ return cf_options.bottommost_compression_opts;
+ }
+ return cf_options.compression_opts;
+}
+
+CompactionPicker::CompactionPicker(const ImmutableOptions& ioptions,
+ const InternalKeyComparator* icmp)
+ : ioptions_(ioptions), icmp_(icmp) {}
+
+CompactionPicker::~CompactionPicker() {}
+
+// Delete this compaction from the list of running compactions.
+void CompactionPicker::ReleaseCompactionFiles(Compaction* c, Status status) {
+ UnregisterCompaction(c);
+ if (!status.ok()) {
+ c->ResetNextCompactionIndex();
+ }
+}
+
+void CompactionPicker::GetRange(const CompactionInputFiles& inputs,
+ InternalKey* smallest,
+ InternalKey* largest) const {
+ const int level = inputs.level;
+ assert(!inputs.empty());
+ smallest->Clear();
+ largest->Clear();
+
+ if (level == 0) {
+ for (size_t i = 0; i < inputs.size(); i++) {
+ FileMetaData* f = inputs[i];
+ if (i == 0) {
+ *smallest = f->smallest;
+ *largest = f->largest;
+ } else {
+ if (icmp_->Compare(f->smallest, *smallest) < 0) {
+ *smallest = f->smallest;
+ }
+ if (icmp_->Compare(f->largest, *largest) > 0) {
+ *largest = f->largest;
+ }
+ }
+ }
+ } else {
+ *smallest = inputs[0]->smallest;
+ *largest = inputs[inputs.size() - 1]->largest;
+ }
+}
+
+void CompactionPicker::GetRange(const CompactionInputFiles& inputs1,
+ const CompactionInputFiles& inputs2,
+ InternalKey* smallest,
+ InternalKey* largest) const {
+ assert(!inputs1.empty() || !inputs2.empty());
+ if (inputs1.empty()) {
+ GetRange(inputs2, smallest, largest);
+ } else if (inputs2.empty()) {
+ GetRange(inputs1, smallest, largest);
+ } else {
+ InternalKey smallest1, smallest2, largest1, largest2;
+ GetRange(inputs1, &smallest1, &largest1);
+ GetRange(inputs2, &smallest2, &largest2);
+ *smallest =
+ icmp_->Compare(smallest1, smallest2) < 0 ? smallest1 : smallest2;
+ *largest = icmp_->Compare(largest1, largest2) < 0 ? largest2 : largest1;
+ }
+}
+
+void CompactionPicker::GetRange(const std::vector<CompactionInputFiles>& inputs,
+ InternalKey* smallest, InternalKey* largest,
+ int exclude_level) const {
+ InternalKey current_smallest;
+ InternalKey current_largest;
+ bool initialized = false;
+ for (const auto& in : inputs) {
+ if (in.empty() || in.level == exclude_level) {
+ continue;
+ }
+ GetRange(in, &current_smallest, &current_largest);
+ if (!initialized) {
+ *smallest = current_smallest;
+ *largest = current_largest;
+ initialized = true;
+ } else {
+ if (icmp_->Compare(current_smallest, *smallest) < 0) {
+ *smallest = current_smallest;
+ }
+ if (icmp_->Compare(current_largest, *largest) > 0) {
+ *largest = current_largest;
+ }
+ }
+ }
+ assert(initialized);
+}
+
+bool CompactionPicker::ExpandInputsToCleanCut(const std::string& /*cf_name*/,
+ VersionStorageInfo* vstorage,
+ CompactionInputFiles* inputs,
+ InternalKey** next_smallest) {
+ // This isn't good compaction
+ assert(!inputs->empty());
+
+ const int level = inputs->level;
+ // GetOverlappingInputs will always do the right thing for level-0.
+ // So we don't need to do any expansion if level == 0.
+ if (level == 0) {
+ return true;
+ }
+
+ InternalKey smallest, largest;
+
+ // Keep expanding inputs until we are sure that there is a "clean cut"
+ // boundary between the files in input and the surrounding files.
+ // This will ensure that no parts of a key are lost during compaction.
+ int hint_index = -1;
+ size_t old_size;
+ do {
+ old_size = inputs->size();
+ GetRange(*inputs, &smallest, &largest);
+ inputs->clear();
+ vstorage->GetOverlappingInputs(level, &smallest, &largest, &inputs->files,
+ hint_index, &hint_index, true,
+ next_smallest);
+ } while (inputs->size() > old_size);
+
+ // we started off with inputs non-empty and the previous loop only grew
+ // inputs. thus, inputs should be non-empty here
+ assert(!inputs->empty());
+
+ // If, after the expansion, there are files that are already under
+ // compaction, then we must drop/cancel this compaction.
+ if (AreFilesInCompaction(inputs->files)) {
+ return false;
+ }
+ return true;
+}
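+
+// Editorial note: an illustrative sketch of the clean-cut expansion above
+// (hypothetical user keys, not part of this change). If the level contains
+// files [a, e] [e, g] [h, k] and the initial pick is [a, e], the loop grows
+// the input set to also include [e, g], because both files contain entries
+// for user key e; [h, k] stays out since the boundary between g and h is
+// already a clean cut.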
+
+bool CompactionPicker::RangeOverlapWithCompaction(
+ const Slice& smallest_user_key, const Slice& largest_user_key,
+ int level) const {
+ const Comparator* ucmp = icmp_->user_comparator();
+ for (Compaction* c : compactions_in_progress_) {
+ if (c->output_level() == level &&
+ ucmp->CompareWithoutTimestamp(smallest_user_key,
+ c->GetLargestUserKey()) <= 0 &&
+ ucmp->CompareWithoutTimestamp(largest_user_key,
+ c->GetSmallestUserKey()) >= 0) {
+ // Overlap
+ return true;
+ }
+ if (c->SupportsPerKeyPlacement()) {
+ if (c->OverlapPenultimateLevelOutputRange(smallest_user_key,
+ largest_user_key)) {
+ return true;
+ }
+ }
+ }
+ // Did not overlap with any running compaction in level `level`
+ return false;
+}
+
+bool CompactionPicker::FilesRangeOverlapWithCompaction(
+ const std::vector<CompactionInputFiles>& inputs, int level,
+ int penultimate_level) const {
+ bool is_empty = true;
+ for (auto& in : inputs) {
+ if (!in.empty()) {
+ is_empty = false;
+ break;
+ }
+ }
+ if (is_empty) {
+ // No files in inputs
+ return false;
+ }
+
+ // TODO: Intra L0 compactions can have the ranges overlapped, but the input
+ // files cannot be overlapped in the order of L0 files.
+ InternalKey smallest, largest;
+ GetRange(inputs, &smallest, &largest, Compaction::kInvalidLevel);
+ if (penultimate_level != Compaction::kInvalidLevel) {
+ if (ioptions_.compaction_style == kCompactionStyleUniversal) {
+ if (RangeOverlapWithCompaction(smallest.user_key(), largest.user_key(),
+ penultimate_level)) {
+ return true;
+ }
+ } else {
+ InternalKey penultimate_smallest, penultimate_largest;
+ GetRange(inputs, &penultimate_smallest, &penultimate_largest, level);
+ if (RangeOverlapWithCompaction(penultimate_smallest.user_key(),
+ penultimate_largest.user_key(),
+ penultimate_level)) {
+ return true;
+ }
+ }
+ }
+
+ return RangeOverlapWithCompaction(smallest.user_key(), largest.user_key(),
+ level);
+}
+
+// Returns true if any one of specified files are being compacted
+bool CompactionPicker::AreFilesInCompaction(
+ const std::vector<FileMetaData*>& files) {
+ for (size_t i = 0; i < files.size(); i++) {
+ if (files[i]->being_compacted) {
+ return true;
+ }
+ }
+ return false;
+}
+
+Compaction* CompactionPicker::CompactFiles(
+ const CompactionOptions& compact_options,
+ const std::vector<CompactionInputFiles>& input_files, int output_level,
+ VersionStorageInfo* vstorage, const MutableCFOptions& mutable_cf_options,
+ const MutableDBOptions& mutable_db_options, uint32_t output_path_id) {
+#ifndef NDEBUG
+ assert(input_files.size());
+ // This compaction output should not overlap with a running compaction as
+ // `SanitizeCompactionInputFiles` should've checked earlier and db mutex
+ // shouldn't have been released since.
+ int start_level = Compaction::kInvalidLevel;
+ for (const auto& in : input_files) {
+ // input_files should already be sorted by level
+ if (!in.empty()) {
+ start_level = in.level;
+ break;
+ }
+ }
+ assert(output_level == 0 ||
+ !FilesRangeOverlapWithCompaction(
+ input_files, output_level,
+ Compaction::EvaluatePenultimateLevel(vstorage, ioptions_,
+ start_level, output_level)));
+#endif /* !NDEBUG */
+
+ CompressionType compression_type;
+ if (compact_options.compression == kDisableCompressionOption) {
+ int base_level;
+ if (ioptions_.compaction_style == kCompactionStyleLevel) {
+ base_level = vstorage->base_level();
+ } else {
+ base_level = 1;
+ }
+ compression_type = GetCompressionType(vstorage, mutable_cf_options,
+ output_level, base_level);
+ } else {
+ // TODO(ajkr): `CompactionOptions` offers configurable `CompressionType`
+ // without configurable `CompressionOptions`, which is inconsistent.
+ compression_type = compact_options.compression;
+ }
+ auto c = new Compaction(
+ vstorage, ioptions_, mutable_cf_options, mutable_db_options, input_files,
+ output_level, compact_options.output_file_size_limit,
+ mutable_cf_options.max_compaction_bytes, output_path_id, compression_type,
+ GetCompressionOptions(mutable_cf_options, vstorage, output_level),
+ Temperature::kUnknown, compact_options.max_subcompactions,
+ /* grandparents */ {}, true);
+ RegisterCompaction(c);
+ return c;
+}
+
+Status CompactionPicker::GetCompactionInputsFromFileNumbers(
+ std::vector<CompactionInputFiles>* input_files,
+ std::unordered_set<uint64_t>* input_set, const VersionStorageInfo* vstorage,
+ const CompactionOptions& /*compact_options*/) const {
+ if (input_set->size() == 0U) {
+ return Status::InvalidArgument(
+ "Compaction must include at least one file.");
+ }
+ assert(input_files);
+
+ std::vector<CompactionInputFiles> matched_input_files;
+ matched_input_files.resize(vstorage->num_levels());
+ int first_non_empty_level = -1;
+ int last_non_empty_level = -1;
+ // TODO(yhchiang): use a lazy-initialized mapping from
+ // file_number to FileMetaData in Version.
+ for (int level = 0; level < vstorage->num_levels(); ++level) {
+ for (auto file : vstorage->LevelFiles(level)) {
+ auto iter = input_set->find(file->fd.GetNumber());
+ if (iter != input_set->end()) {
+ matched_input_files[level].files.push_back(file);
+ input_set->erase(iter);
+ last_non_empty_level = level;
+ if (first_non_empty_level == -1) {
+ first_non_empty_level = level;
+ }
+ }
+ }
+ }
+
+ if (!input_set->empty()) {
+ std::string message(
+ "Cannot find matched SST files for the following file numbers:");
+ for (auto fn : *input_set) {
+ message += " ";
+ message += std::to_string(fn);
+ }
+ return Status::InvalidArgument(message);
+ }
+
+ for (int level = first_non_empty_level; level <= last_non_empty_level;
+ ++level) {
+ matched_input_files[level].level = level;
+ input_files->emplace_back(std::move(matched_input_files[level]));
+ }
+
+ return Status::OK();
+}
+
+// Returns true if any one of the parent files are being compacted
+bool CompactionPicker::IsRangeInCompaction(VersionStorageInfo* vstorage,
+ const InternalKey* smallest,
+ const InternalKey* largest,
+ int level, int* level_index) {
+ std::vector<FileMetaData*> inputs;
+ assert(level < NumberLevels());
+
+ vstorage->GetOverlappingInputs(level, smallest, largest, &inputs,
+ level_index ? *level_index : 0, level_index);
+ return AreFilesInCompaction(inputs);
+}
+
+// Populates the set of inputs of all other levels that overlap with the
+// start level.
+// Now we assume all levels except start level and output level are empty.
+// Will also attempt to expand "start level" if that doesn't expand
+// "output level" or cause "level" to include a file for compaction that has an
+// overlapping user-key with another file.
+// REQUIRES: input_level and output_level are different
+// REQUIRES: inputs->empty() == false
+// Returns false if files on parent level are currently in compaction, which
+// means that we can't compact them
+bool CompactionPicker::SetupOtherInputs(
+ const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+ VersionStorageInfo* vstorage, CompactionInputFiles* inputs,
+ CompactionInputFiles* output_level_inputs, int* parent_index,
+ int base_index, bool only_expand_towards_right) {
+ assert(!inputs->empty());
+ assert(output_level_inputs->empty());
+ const int input_level = inputs->level;
+ const int output_level = output_level_inputs->level;
+ if (input_level == output_level) {
+ // no possibility of conflict
+ return true;
+ }
+
+ // For now, we only support merging two levels, start level and output level.
+ // We need to assert other levels are empty.
+ for (int l = input_level + 1; l < output_level; l++) {
+ assert(vstorage->NumLevelFiles(l) == 0);
+ }
+
+ InternalKey smallest, largest;
+
+ // Get the range one last time.
+ GetRange(*inputs, &smallest, &largest);
+
+  // Populate the set of next-level files (output_level_inputs) to include in
+  // the compaction.
+ vstorage->GetOverlappingInputs(output_level, &smallest, &largest,
+ &output_level_inputs->files, *parent_index,
+ parent_index);
+ if (AreFilesInCompaction(output_level_inputs->files)) {
+ return false;
+ }
+ if (!output_level_inputs->empty()) {
+ if (!ExpandInputsToCleanCut(cf_name, vstorage, output_level_inputs)) {
+ return false;
+ }
+ }
+
+ // See if we can further grow the number of inputs in "level" without
+ // changing the number of "level+1" files we pick up. We also choose NOT
+ // to expand if this would cause "level" to include some entries for some
+ // user key, while excluding other entries for the same user key. This
+ // can happen when one user key spans multiple files.
+ if (!output_level_inputs->empty()) {
+ const uint64_t limit = mutable_cf_options.max_compaction_bytes;
+ const uint64_t output_level_inputs_size =
+ TotalFileSize(output_level_inputs->files);
+ const uint64_t inputs_size = TotalFileSize(inputs->files);
+ bool expand_inputs = false;
+
+ CompactionInputFiles expanded_inputs;
+ expanded_inputs.level = input_level;
+ // Get closed interval of output level
+ InternalKey all_start, all_limit;
+ GetRange(*inputs, *output_level_inputs, &all_start, &all_limit);
+ bool try_overlapping_inputs = true;
+ if (only_expand_towards_right) {
+ // Round-robin compaction only allows expansion towards the larger side.
+ vstorage->GetOverlappingInputs(input_level, &smallest, &all_limit,
+ &expanded_inputs.files, base_index,
+ nullptr);
+ } else {
+ vstorage->GetOverlappingInputs(input_level, &all_start, &all_limit,
+ &expanded_inputs.files, base_index,
+ nullptr);
+ }
+ uint64_t expanded_inputs_size = TotalFileSize(expanded_inputs.files);
+ if (!ExpandInputsToCleanCut(cf_name, vstorage, &expanded_inputs)) {
+ try_overlapping_inputs = false;
+ }
+ if (try_overlapping_inputs && expanded_inputs.size() > inputs->size() &&
+ (mutable_cf_options.ignore_max_compaction_bytes_for_input ||
+ output_level_inputs_size + expanded_inputs_size < limit) &&
+ !AreFilesInCompaction(expanded_inputs.files)) {
+ InternalKey new_start, new_limit;
+ GetRange(expanded_inputs, &new_start, &new_limit);
+ CompactionInputFiles expanded_output_level_inputs;
+ expanded_output_level_inputs.level = output_level;
+ vstorage->GetOverlappingInputs(output_level, &new_start, &new_limit,
+ &expanded_output_level_inputs.files,
+ *parent_index, parent_index);
+ assert(!expanded_output_level_inputs.empty());
+ if (!AreFilesInCompaction(expanded_output_level_inputs.files) &&
+ ExpandInputsToCleanCut(cf_name, vstorage,
+ &expanded_output_level_inputs) &&
+ expanded_output_level_inputs.size() == output_level_inputs->size()) {
+ expand_inputs = true;
+ }
+ }
+ if (!expand_inputs) {
+ vstorage->GetCleanInputsWithinInterval(input_level, &all_start,
+ &all_limit, &expanded_inputs.files,
+ base_index, nullptr);
+ expanded_inputs_size = TotalFileSize(expanded_inputs.files);
+ if (expanded_inputs.size() > inputs->size() &&
+ (mutable_cf_options.ignore_max_compaction_bytes_for_input ||
+ output_level_inputs_size + expanded_inputs_size < limit) &&
+ !AreFilesInCompaction(expanded_inputs.files)) {
+ expand_inputs = true;
+ }
+ }
+ if (expand_inputs) {
+ ROCKS_LOG_INFO(ioptions_.logger,
+ "[%s] Expanding@%d %" ROCKSDB_PRIszt "+%" ROCKSDB_PRIszt
+ "(%" PRIu64 "+%" PRIu64 " bytes) to %" ROCKSDB_PRIszt
+ "+%" ROCKSDB_PRIszt " (%" PRIu64 "+%" PRIu64 " bytes)\n",
+ cf_name.c_str(), input_level, inputs->size(),
+ output_level_inputs->size(), inputs_size,
+ output_level_inputs_size, expanded_inputs.size(),
+ output_level_inputs->size(), expanded_inputs_size,
+ output_level_inputs_size);
+ inputs->files = expanded_inputs.files;
+ }
+ } else {
+ // Likely to be trivial move. Expand files if they are still trivial moves,
+ // but limit to mutable_cf_options.max_compaction_bytes or 8 files so that
+ // we don't create too much compaction pressure for the next level.
+ }
+ return true;
+}
+
+void CompactionPicker::GetGrandparents(
+ VersionStorageInfo* vstorage, const CompactionInputFiles& inputs,
+ const CompactionInputFiles& output_level_inputs,
+ std::vector<FileMetaData*>* grandparents) {
+ InternalKey start, limit;
+ GetRange(inputs, output_level_inputs, &start, &limit);
+ // Compute the set of grandparent files that overlap this compaction
+ // (parent == level+1; grandparent == level+2 or the first
+ // level after that has overlapping files)
+ for (int level = output_level_inputs.level + 1; level < NumberLevels();
+ level++) {
+ vstorage->GetOverlappingInputs(level, &start, &limit, grandparents);
+ if (!grandparents->empty()) {
+ break;
+ }
+ }
+}
+
+Compaction* CompactionPicker::CompactRange(
+ const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+ const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
+ int input_level, int output_level,
+ const CompactRangeOptions& compact_range_options, const InternalKey* begin,
+ const InternalKey* end, InternalKey** compaction_end, bool* manual_conflict,
+ uint64_t max_file_num_to_ignore, const std::string& trim_ts) {
+ // CompactionPickerFIFO has its own implementation of compact range
+ assert(ioptions_.compaction_style != kCompactionStyleFIFO);
+
+ if (input_level == ColumnFamilyData::kCompactAllLevels) {
+ assert(ioptions_.compaction_style == kCompactionStyleUniversal);
+
+ // Universal compaction with more than one level always compacts all the
+ // files together to the last level.
+ assert(vstorage->num_levels() > 1);
+    // DBImpl::CompactRange() sets the output level to be the last level
+ if (ioptions_.allow_ingest_behind) {
+ assert(output_level == vstorage->num_levels() - 2);
+ } else {
+ assert(output_level == vstorage->num_levels() - 1);
+ }
+    // DBImpl::RunManualCompaction will use the full range for universal
+    // compaction
+ assert(begin == nullptr);
+ assert(end == nullptr);
+ *compaction_end = nullptr;
+
+ int start_level = 0;
+ for (; start_level < vstorage->num_levels() &&
+ vstorage->NumLevelFiles(start_level) == 0;
+ start_level++) {
+ }
+ if (start_level == vstorage->num_levels()) {
+ return nullptr;
+ }
+
+ if ((start_level == 0) && (!level0_compactions_in_progress_.empty())) {
+ *manual_conflict = true;
+ // Only one level 0 compaction allowed
+ return nullptr;
+ }
+
+ std::vector<CompactionInputFiles> inputs(vstorage->num_levels() -
+ start_level);
+ for (int level = start_level; level < vstorage->num_levels(); level++) {
+ inputs[level - start_level].level = level;
+ auto& files = inputs[level - start_level].files;
+ for (FileMetaData* f : vstorage->LevelFiles(level)) {
+ files.push_back(f);
+ }
+ if (AreFilesInCompaction(files)) {
+ *manual_conflict = true;
+ return nullptr;
+ }
+ }
+
+    // 2 non-exclusive manual compactions could run at the same time producing
+    // overlapping outputs in the same level.
+ if (FilesRangeOverlapWithCompaction(
+ inputs, output_level,
+ Compaction::EvaluatePenultimateLevel(vstorage, ioptions_,
+ start_level, output_level))) {
+ // This compaction output could potentially conflict with the output
+ // of a currently running compaction, we cannot run it.
+ *manual_conflict = true;
+ return nullptr;
+ }
+
+ Compaction* c = new Compaction(
+ vstorage, ioptions_, mutable_cf_options, mutable_db_options,
+ std::move(inputs), output_level,
+ MaxFileSizeForLevel(mutable_cf_options, output_level,
+ ioptions_.compaction_style),
+ /* max_compaction_bytes */ LLONG_MAX,
+ compact_range_options.target_path_id,
+ GetCompressionType(vstorage, mutable_cf_options, output_level, 1),
+ GetCompressionOptions(mutable_cf_options, vstorage, output_level),
+ Temperature::kUnknown, compact_range_options.max_subcompactions,
+ /* grandparents */ {}, /* is manual */ true, trim_ts, /* score */ -1,
+ /* deletion_compaction */ false, /* l0_files_might_overlap */ true,
+ CompactionReason::kUnknown,
+ compact_range_options.blob_garbage_collection_policy,
+ compact_range_options.blob_garbage_collection_age_cutoff);
+
+ RegisterCompaction(c);
+ vstorage->ComputeCompactionScore(ioptions_, mutable_cf_options);
+ return c;
+ }
+
+ CompactionInputFiles inputs;
+ inputs.level = input_level;
+ bool covering_the_whole_range = true;
+
+ // All files are 'overlapping' in universal style compaction.
+ // We have to compact the entire range in one shot.
+ if (ioptions_.compaction_style == kCompactionStyleUniversal) {
+ begin = nullptr;
+ end = nullptr;
+ }
+
+ vstorage->GetOverlappingInputs(input_level, begin, end, &inputs.files);
+ if (inputs.empty()) {
+ return nullptr;
+ }
+
+ if ((input_level == 0) && (!level0_compactions_in_progress_.empty())) {
+ // Only one level 0 compaction allowed
+ TEST_SYNC_POINT("CompactionPicker::CompactRange:Conflict");
+ *manual_conflict = true;
+ return nullptr;
+ }
+
+ // Avoid compacting too much in one shot in case the range is large.
+ // But we cannot do this for level-0 since level-0 files can overlap
+ // and we must not pick one file and drop another older file if the
+ // two files overlap.
+ if (input_level > 0) {
+ const uint64_t limit = mutable_cf_options.max_compaction_bytes;
+ uint64_t input_level_total = 0;
+ int hint_index = -1;
+ InternalKey* smallest = nullptr;
+ InternalKey* largest = nullptr;
+ for (size_t i = 0; i + 1 < inputs.size(); ++i) {
+ if (!smallest) {
+ smallest = &inputs[i]->smallest;
+ }
+ largest = &inputs[i]->largest;
+
+ uint64_t input_file_size = inputs[i]->fd.GetFileSize();
+ uint64_t output_level_total = 0;
+ if (output_level < vstorage->num_non_empty_levels()) {
+ std::vector<FileMetaData*> files;
+ vstorage->GetOverlappingInputsRangeBinarySearch(
+ output_level, smallest, largest, &files, hint_index, &hint_index);
+ for (const auto& file : files) {
+ output_level_total += file->fd.GetFileSize();
+ }
+ }
+
+ input_level_total += input_file_size;
+
+ if (input_level_total + output_level_total >= limit) {
+ covering_the_whole_range = false;
+        // Still include the current file, so the compaction could be larger
+        // than max_compaction_bytes. This also makes sure the compaction can
+        // make progress even if `max_compaction_bytes` is small (e.g. smaller
+        // than an SST file).
+ inputs.files.resize(i + 1);
+ break;
+ }
+ }
+ }
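+  // Illustrative sketch of the limiting loop above, with hypothetical sizes
+  // (not part of the upstream logic): say max_compaction_bytes = 100 MB and
+  // four overlapping input files of 40, 50, 30 and 20 MB that overlap nothing
+  // at the output level. The running total is 40, then 90 (both < 100), then
+  // 120 >= 100 at the third file, so inputs.files is truncated to the first
+  // three files (the file that crosses the limit is still included) and
+  // covering_the_whole_range becomes false, so that *compaction_end can later
+  // report the next key to compact.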
+
+ assert(compact_range_options.target_path_id <
+ static_cast<uint32_t>(ioptions_.cf_paths.size()));
+
+ // for BOTTOM LEVEL compaction only, use max_file_num_to_ignore to filter out
+ // files that are created during the current compaction.
+ if (compact_range_options.bottommost_level_compaction ==
+ BottommostLevelCompaction::kForceOptimized &&
+ max_file_num_to_ignore != std::numeric_limits<uint64_t>::max()) {
+ assert(input_level == output_level);
+    // inputs_shrunk holds a contiguous subset of input files which were all
+    // created before the current manual compaction.
+ std::vector<FileMetaData*> inputs_shrunk;
+ size_t skip_input_index = inputs.size();
+ for (size_t i = 0; i < inputs.size(); ++i) {
+ if (inputs[i]->fd.GetNumber() < max_file_num_to_ignore) {
+ inputs_shrunk.push_back(inputs[i]);
+ } else if (!inputs_shrunk.empty()) {
+        // inputs[i] was created during the current manual compaction and
+        // needs to be skipped
+ skip_input_index = i;
+ break;
+ }
+ }
+ if (inputs_shrunk.empty()) {
+ return nullptr;
+ }
+ if (inputs.size() != inputs_shrunk.size()) {
+ inputs.files.swap(inputs_shrunk);
+ }
+    // set covering_the_whole_range to false if there is any file that needs to
+    // be compacted in the range of inputs[skip_input_index+1, inputs.size())
+ for (size_t i = skip_input_index + 1; i < inputs.size(); ++i) {
+ if (inputs[i]->fd.GetNumber() < max_file_num_to_ignore) {
+ covering_the_whole_range = false;
+ }
+ }
+ }
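+  // Illustrative sketch of the shrinking logic above, with hypothetical file
+  // numbers (not part of the upstream logic): if the overlapping inputs are
+  // files #10, #12, #20, #14 and max_file_num_to_ignore is 15, then #10 and
+  // #12 (created before this manual compaction) form inputs_shrunk, and the
+  // scan stops at #20, whose number is >= max_file_num_to_ignore because it
+  // was created by the current compaction itself.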
+
+ InternalKey key_storage;
+ InternalKey* next_smallest = &key_storage;
+ if (ExpandInputsToCleanCut(cf_name, vstorage, &inputs, &next_smallest) ==
+ false) {
+    // Manual compaction is now multi-threaded, so it can happen that
+    // ExpandInputsToCleanCut fails; we handle this case higher up in
+    // RunManualCompaction.
+ *manual_conflict = true;
+ return nullptr;
+ }
+
+ if (covering_the_whole_range || !next_smallest) {
+ *compaction_end = nullptr;
+ } else {
+ **compaction_end = *next_smallest;
+ }
+
+ CompactionInputFiles output_level_inputs;
+ if (output_level == ColumnFamilyData::kCompactToBaseLevel) {
+ assert(input_level == 0);
+ output_level = vstorage->base_level();
+ assert(output_level > 0);
+ }
+ output_level_inputs.level = output_level;
+ if (input_level != output_level) {
+ int parent_index = -1;
+ if (!SetupOtherInputs(cf_name, mutable_cf_options, vstorage, &inputs,
+ &output_level_inputs, &parent_index, -1)) {
+      // Manual compaction is now multi-threaded, so it can happen that
+      // SetupOtherInputs fails; we handle this case higher up in
+      // RunManualCompaction.
+ *manual_conflict = true;
+ return nullptr;
+ }
+ }
+
+ std::vector<CompactionInputFiles> compaction_inputs({inputs});
+ if (!output_level_inputs.empty()) {
+ compaction_inputs.push_back(output_level_inputs);
+ }
+ for (size_t i = 0; i < compaction_inputs.size(); i++) {
+ if (AreFilesInCompaction(compaction_inputs[i].files)) {
+ *manual_conflict = true;
+ return nullptr;
+ }
+ }
+
+  // Two non-exclusive manual compactions could run at the same time,
+  // producing overlapping outputs in the same level.
+ if (FilesRangeOverlapWithCompaction(
+ compaction_inputs, output_level,
+ Compaction::EvaluatePenultimateLevel(vstorage, ioptions_, input_level,
+ output_level))) {
+ // This compaction output could potentially conflict with the output
+ // of a currently running compaction, we cannot run it.
+ *manual_conflict = true;
+ return nullptr;
+ }
+
+ std::vector<FileMetaData*> grandparents;
+ GetGrandparents(vstorage, inputs, output_level_inputs, &grandparents);
+ Compaction* compaction = new Compaction(
+ vstorage, ioptions_, mutable_cf_options, mutable_db_options,
+ std::move(compaction_inputs), output_level,
+ MaxFileSizeForLevel(mutable_cf_options, output_level,
+ ioptions_.compaction_style, vstorage->base_level(),
+ ioptions_.level_compaction_dynamic_level_bytes),
+ mutable_cf_options.max_compaction_bytes,
+ compact_range_options.target_path_id,
+ GetCompressionType(vstorage, mutable_cf_options, output_level,
+ vstorage->base_level()),
+ GetCompressionOptions(mutable_cf_options, vstorage, output_level),
+ Temperature::kUnknown, compact_range_options.max_subcompactions,
+ std::move(grandparents), /* is manual */ true, trim_ts, /* score */ -1,
+ /* deletion_compaction */ false, /* l0_files_might_overlap */ true,
+ CompactionReason::kUnknown,
+ compact_range_options.blob_garbage_collection_policy,
+ compact_range_options.blob_garbage_collection_age_cutoff);
+
+ TEST_SYNC_POINT_CALLBACK("CompactionPicker::CompactRange:Return", compaction);
+ RegisterCompaction(compaction);
+
+  // Creating a compaction influences the compaction score because the score
+  // takes running compactions into account (by skipping files that are already
+  // being compacted). Since we just changed the compaction score, we
+  // recalculate it here.
+ vstorage->ComputeCompactionScore(ioptions_, mutable_cf_options);
+
+ return compaction;
+}
+
+#ifndef ROCKSDB_LITE
+namespace {
+// Test whether two files have overlapping key-ranges.
+bool HaveOverlappingKeyRanges(const Comparator* c, const SstFileMetaData& a,
+ const SstFileMetaData& b) {
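+  // For illustration with hypothetical user keys: the ranges [b, f] and
+  // [d, k] overlap (d <= f <= k), while [a, c] and [d, k] do not (c < d).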
+ if (c->CompareWithoutTimestamp(a.smallestkey, b.smallestkey) >= 0) {
+ if (c->CompareWithoutTimestamp(a.smallestkey, b.largestkey) <= 0) {
+ // b.smallestkey <= a.smallestkey <= b.largestkey
+ return true;
+ }
+ } else if (c->CompareWithoutTimestamp(a.largestkey, b.smallestkey) >= 0) {
+ // a.smallestkey < b.smallestkey <= a.largestkey
+ return true;
+ }
+ if (c->CompareWithoutTimestamp(a.largestkey, b.largestkey) <= 0) {
+ if (c->CompareWithoutTimestamp(a.largestkey, b.smallestkey) >= 0) {
+ // b.smallestkey <= a.largestkey <= b.largestkey
+ return true;
+ }
+ } else if (c->CompareWithoutTimestamp(a.smallestkey, b.largestkey) <= 0) {
+ // a.smallestkey <= b.largestkey < a.largestkey
+ return true;
+ }
+ return false;
+}
+} // namespace
+
+Status CompactionPicker::SanitizeCompactionInputFilesForAllLevels(
+ std::unordered_set<uint64_t>* input_files,
+ const ColumnFamilyMetaData& cf_meta, const int output_level) const {
+ auto& levels = cf_meta.levels;
+ auto comparator = icmp_->user_comparator();
+
+ // TODO(yhchiang): add is_adjustable to CompactionOptions
+
+ // the smallest and largest key of the current compaction input
+ std::string smallestkey;
+ std::string largestkey;
+ // a flag for initializing smallest and largest key
+ bool is_first = false;
+ const int kNotFound = -1;
+
+ // For each level, it does the following things:
+ // 1. Find the first and the last compaction input files
+ // in the current level.
+ // 2. Include all files between the first and the last
+ // compaction input files.
+ // 3. Update the compaction key-range.
+ // 4. For all remaining levels, include files that have
+ // overlapping key-range with the compaction key-range.
+ for (int l = 0; l <= output_level; ++l) {
+ auto& current_files = levels[l].files;
+ int first_included = static_cast<int>(current_files.size());
+ int last_included = kNotFound;
+
+ // identify the first and the last compaction input files
+ // in the current level.
+ for (size_t f = 0; f < current_files.size(); ++f) {
+ const uint64_t file_number = TableFileNameToNumber(current_files[f].name);
+ if (input_files->find(file_number) == input_files->end()) {
+ continue;
+ }
+ first_included = std::min(first_included, static_cast<int>(f));
+ last_included = std::max(last_included, static_cast<int>(f));
+ if (is_first == false) {
+ smallestkey = current_files[f].smallestkey;
+ largestkey = current_files[f].largestkey;
+ is_first = true;
+ }
+ }
+ if (last_included == kNotFound) {
+ continue;
+ }
+
+ if (l != 0) {
+ // expand the compaction input of the current level if it
+ // has overlapping key-range with other non-compaction input
+ // files in the same level.
+ while (first_included > 0) {
+ if (comparator->CompareWithoutTimestamp(
+ current_files[first_included - 1].largestkey,
+ current_files[first_included].smallestkey) < 0) {
+ break;
+ }
+ first_included--;
+ }
+
+ while (last_included < static_cast<int>(current_files.size()) - 1) {
+ if (comparator->CompareWithoutTimestamp(
+ current_files[last_included + 1].smallestkey,
+ current_files[last_included].largestkey) > 0) {
+ break;
+ }
+ last_included++;
+ }
+ } else if (output_level > 0) {
+ last_included = static_cast<int>(current_files.size() - 1);
+ }
+
+ // include all files between the first and the last compaction input files.
+ for (int f = first_included; f <= last_included; ++f) {
+ if (current_files[f].being_compacted) {
+ return Status::Aborted("Necessary compaction input file " +
+ current_files[f].name +
+ " is currently being compacted.");
+ }
+ input_files->insert(TableFileNameToNumber(current_files[f].name));
+ }
+
+ // update smallest and largest key
+ if (l == 0) {
+ for (int f = first_included; f <= last_included; ++f) {
+ if (comparator->CompareWithoutTimestamp(
+ smallestkey, current_files[f].smallestkey) > 0) {
+ smallestkey = current_files[f].smallestkey;
+ }
+ if (comparator->CompareWithoutTimestamp(
+ largestkey, current_files[f].largestkey) < 0) {
+ largestkey = current_files[f].largestkey;
+ }
+ }
+ } else {
+ if (comparator->CompareWithoutTimestamp(
+ smallestkey, current_files[first_included].smallestkey) > 0) {
+ smallestkey = current_files[first_included].smallestkey;
+ }
+ if (comparator->CompareWithoutTimestamp(
+ largestkey, current_files[last_included].largestkey) < 0) {
+ largestkey = current_files[last_included].largestkey;
+ }
+ }
+
+ SstFileMetaData aggregated_file_meta;
+ aggregated_file_meta.smallestkey = smallestkey;
+ aggregated_file_meta.largestkey = largestkey;
+
+ // For all lower levels, include all overlapping files.
+    // We need to add overlapping files from the current level too, because
+    // even if there are no input_files in level l, we would still need to add
+    // files which overlap with the range containing the input_files in levels
+    // 0 to l. Level 0 doesn't need to be handled this way because its files
+    // are sorted by time and not by key.
+ for (int m = std::max(l, 1); m <= output_level; ++m) {
+ for (auto& next_lv_file : levels[m].files) {
+ if (HaveOverlappingKeyRanges(comparator, aggregated_file_meta,
+ next_lv_file)) {
+ if (next_lv_file.being_compacted) {
+            return Status::Aborted(
+                "File " + next_lv_file.name +
+                " that has an overlapping key range with one of the "
+                "compaction input files is currently being compacted.");
+ }
+ input_files->insert(TableFileNameToNumber(next_lv_file.name));
+ }
+ }
+ }
+ }
+ if (RangeOverlapWithCompaction(smallestkey, largestkey, output_level)) {
+ return Status::Aborted(
+ "A running compaction is writing to the same output level in an "
+ "overlapping key range");
+ }
+ return Status::OK();
+}
+
+Status CompactionPicker::SanitizeCompactionInputFiles(
+ std::unordered_set<uint64_t>* input_files,
+ const ColumnFamilyMetaData& cf_meta, const int output_level) const {
+ assert(static_cast<int>(cf_meta.levels.size()) - 1 ==
+ cf_meta.levels[cf_meta.levels.size() - 1].level);
+ if (output_level >= static_cast<int>(cf_meta.levels.size())) {
+ return Status::InvalidArgument(
+ "Output level for column family " + cf_meta.name +
+        " must be between [0, " +
+ std::to_string(cf_meta.levels[cf_meta.levels.size() - 1].level) + "].");
+ }
+
+ if (output_level > MaxOutputLevel()) {
+ return Status::InvalidArgument(
+        "Exceeds the maximum output level defined by "
+ "the current compaction algorithm --- " +
+ std::to_string(MaxOutputLevel()));
+ }
+
+ if (output_level < 0) {
+ return Status::InvalidArgument("Output level cannot be negative.");
+ }
+
+ if (input_files->size() == 0) {
+ return Status::InvalidArgument(
+ "A compaction must contain at least one file.");
+ }
+
+ Status s = SanitizeCompactionInputFilesForAllLevels(input_files, cf_meta,
+ output_level);
+
+ if (!s.ok()) {
+ return s;
+ }
+
+ // for all input files, check whether the file number matches
+ // any currently-existing files.
+ for (auto file_num : *input_files) {
+ bool found = false;
+ int input_file_level = -1;
+ for (const auto& level_meta : cf_meta.levels) {
+ for (const auto& file_meta : level_meta.files) {
+ if (file_num == TableFileNameToNumber(file_meta.name)) {
+ if (file_meta.being_compacted) {
+ return Status::Aborted("Specified compaction input file " +
+ MakeTableFileName("", file_num) +
+ " is already being compacted.");
+ }
+ found = true;
+ input_file_level = level_meta.level;
+ break;
+ }
+ }
+ if (found) {
+ break;
+ }
+ }
+ if (!found) {
+ return Status::InvalidArgument(
+ "Specified compaction input file " + MakeTableFileName("", file_num) +
+ " does not exist in column family " + cf_meta.name + ".");
+ }
+ if (input_file_level > output_level) {
+ return Status::InvalidArgument(
+          "Cannot compact a file to a lower-numbered level, input file: " +
+ MakeTableFileName("", file_num) + " level " +
+ std::to_string(input_file_level) + " > output level " +
+ std::to_string(output_level));
+ }
+ }
+
+ return Status::OK();
+}
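+
+// Caller-side sketch (hypothetical code, for illustration only; the variable
+// names cf_meta, vstorage and compact_options are assumptions): a
+// DB::CompactFiles()-style caller would typically convert the user-supplied
+// file names into file numbers, sanitize them, and only then build the
+// compaction, e.g.
+//
+//   std::unordered_set<uint64_t> input_set = /* numbers from the user */;
+//   Status s = picker->SanitizeCompactionInputFiles(&input_set, cf_meta,
+//                                                   output_level);
+//   if (s.ok()) {
+//     std::vector<CompactionInputFiles> input_files;
+//     s = picker->GetCompactionInputsFromFileNumbers(
+//         &input_files, &input_set, vstorage, compact_options);
+//   }
+//   // ... then CompactFiles(compact_options, input_files, ...) without
+//   // releasing the DB mutex in between.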
+#endif // !ROCKSDB_LITE
+
+void CompactionPicker::RegisterCompaction(Compaction* c) {
+ if (c == nullptr) {
+ return;
+ }
+ assert(ioptions_.compaction_style != kCompactionStyleLevel ||
+ c->output_level() == 0 ||
+ !FilesRangeOverlapWithCompaction(*c->inputs(), c->output_level(),
+ c->GetPenultimateLevel()));
+ if (c->start_level() == 0 ||
+ ioptions_.compaction_style == kCompactionStyleUniversal) {
+ level0_compactions_in_progress_.insert(c);
+ }
+ compactions_in_progress_.insert(c);
+ TEST_SYNC_POINT_CALLBACK("CompactionPicker::RegisterCompaction:Registered",
+ c);
+}
+
+void CompactionPicker::UnregisterCompaction(Compaction* c) {
+ if (c == nullptr) {
+ return;
+ }
+ if (c->start_level() == 0 ||
+ ioptions_.compaction_style == kCompactionStyleUniversal) {
+ level0_compactions_in_progress_.erase(c);
+ }
+ compactions_in_progress_.erase(c);
+}
+
+void CompactionPicker::PickFilesMarkedForCompaction(
+ const std::string& cf_name, VersionStorageInfo* vstorage, int* start_level,
+ int* output_level, CompactionInputFiles* start_level_inputs) {
+ if (vstorage->FilesMarkedForCompaction().empty()) {
+ return;
+ }
+
+ auto continuation = [&, cf_name](std::pair<int, FileMetaData*> level_file) {
+    // If it's being compacted, there is nothing to do here.
+    // If this assert() fails, it means that some function marked some
+    // files as being_compacted but didn't call ComputeCompactionScore().
+ assert(!level_file.second->being_compacted);
+ *start_level = level_file.first;
+ *output_level =
+ (*start_level == 0) ? vstorage->base_level() : *start_level + 1;
+
+ if (*start_level == 0 && !level0_compactions_in_progress()->empty()) {
+ return false;
+ }
+
+ start_level_inputs->files = {level_file.second};
+ start_level_inputs->level = *start_level;
+ return ExpandInputsToCleanCut(cf_name, vstorage, start_level_inputs);
+ };
+
+ // take a chance on a random file first
+ Random64 rnd(/* seed */ reinterpret_cast<uint64_t>(vstorage));
+ size_t random_file_index = static_cast<size_t>(rnd.Uniform(
+ static_cast<uint64_t>(vstorage->FilesMarkedForCompaction().size())));
+ TEST_SYNC_POINT_CALLBACK("CompactionPicker::PickFilesMarkedForCompaction",
+ &random_file_index);
+
+ if (continuation(vstorage->FilesMarkedForCompaction()[random_file_index])) {
+ // found the compaction!
+ return;
+ }
+
+ for (auto& level_file : vstorage->FilesMarkedForCompaction()) {
+ if (continuation(level_file)) {
+ // found the compaction!
+ return;
+ }
+ }
+ start_level_inputs->files.clear();
+}
+
+bool CompactionPicker::GetOverlappingL0Files(
+ VersionStorageInfo* vstorage, CompactionInputFiles* start_level_inputs,
+ int output_level, int* parent_index) {
+  // Two level 0 compactions won't run at the same time, so we don't need to
+  // worry about files on level 0 being compacted.
+ assert(level0_compactions_in_progress()->empty());
+ InternalKey smallest, largest;
+ GetRange(*start_level_inputs, &smallest, &largest);
+ // Note that the next call will discard the file we placed in
+ // c->inputs_[0] earlier and replace it with an overlapping set
+ // which will include the picked file.
+ start_level_inputs->files.clear();
+ vstorage->GetOverlappingInputs(0, &smallest, &largest,
+ &(start_level_inputs->files));
+
+ // If we include more L0 files in the same compaction run it can
+ // cause the 'smallest' and 'largest' key to get extended to a
+ // larger range. So, re-invoke GetRange to get the new key range
+ GetRange(*start_level_inputs, &smallest, &largest);
+ if (IsRangeInCompaction(vstorage, &smallest, &largest, output_level,
+ parent_index)) {
+ return false;
+ }
+ assert(!start_level_inputs->files.empty());
+
+ return true;
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/compaction_picker.h b/src/rocksdb/db/compaction/compaction_picker.h
new file mode 100644
index 000000000..7739dd96b
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_picker.h
@@ -0,0 +1,323 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <memory>
+#include <set>
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+#include "db/compaction/compaction.h"
+#include "db/version_set.h"
+#include "options/cf_options.h"
+#include "rocksdb/env.h"
+#include "rocksdb/options.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// This file contains an abstract class CompactionPicker, and its two
+// subclasses LevelCompactionPicker and NullCompactionPicker, as
+// well as some helper functions used by them.
+
+class LogBuffer;
+class Compaction;
+class VersionStorageInfo;
+struct CompactionInputFiles;
+
+// An abstract class to pick compactions from an existing LSM-tree.
+//
+// Each compaction style inherits the class and implements the
+// interface to form automatic compactions. If NeedsCompaction() is true,
+// then call PickCompaction() to find what files need to be compacted
+// and where to put the output files.
+//
+// Non-virtual functions CompactRange() and CompactFiles() are used to
+// pick files to compact based on users' DB::CompactRange() and
+// DB::CompactFiles() requests, respectively. There is little
+// compaction style specific logic for them.
+class CompactionPicker {
+ public:
+ CompactionPicker(const ImmutableOptions& ioptions,
+ const InternalKeyComparator* icmp);
+ virtual ~CompactionPicker();
+
+ // Pick level and inputs for a new compaction.
+ // Returns nullptr if there is no compaction to be done.
+ // Otherwise returns a pointer to a heap-allocated object that
+ // describes the compaction. Caller should delete the result.
+ virtual Compaction* PickCompaction(
+ const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+ const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
+ LogBuffer* log_buffer,
+ SequenceNumber earliest_memtable_seqno = kMaxSequenceNumber) = 0;
+
+ // Return a compaction object for compacting the range [begin,end] in
+ // the specified level. Returns nullptr if there is nothing in that
+ // level that overlaps the specified range. Caller should delete
+ // the result.
+ //
+ // The returned Compaction might not include the whole requested range.
+ // In that case, compaction_end will be set to the next key that needs
+ // compacting. In case the compaction will compact the whole range,
+ // compaction_end will be set to nullptr.
+ // Client is responsible for compaction_end storage -- when called,
+ // *compaction_end should point to valid InternalKey!
+ virtual Compaction* CompactRange(
+ const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+ const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
+ int input_level, int output_level,
+ const CompactRangeOptions& compact_range_options,
+ const InternalKey* begin, const InternalKey* end,
+ InternalKey** compaction_end, bool* manual_conflict,
+ uint64_t max_file_num_to_ignore, const std::string& trim_ts);
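+  //
+  // A caller-side sketch of the contract above (hypothetical code, for
+  // illustration only; variable names are assumptions):
+  //
+  //   InternalKey end_storage;
+  //   InternalKey* compaction_end = &end_storage;  // caller-owned storage
+  //   bool manual_conflict = false;
+  //   Compaction* c = picker->CompactRange(
+  //       cf_name, mutable_cf_options, mutable_db_options, vstorage,
+  //       input_level, output_level, CompactRangeOptions(), begin, end,
+  //       &compaction_end, &manual_conflict,
+  //       std::numeric_limits<uint64_t>::max(), /* trim_ts */ "");
+  //   // If c != nullptr and compaction_end != nullptr, re-issue CompactRange
+  //   // starting from *compaction_end to cover the rest of the range.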
+
+ // The maximum allowed output level. Default value is NumberLevels() - 1.
+ virtual int MaxOutputLevel() const { return NumberLevels() - 1; }
+
+ virtual bool NeedsCompaction(const VersionStorageInfo* vstorage) const = 0;
+
+// Sanitize the input set of compaction input files.
+// When the input parameters do not describe a valid compaction, the
+// function will try to fix the input_files by adding necessary
+// files. If it's not possible to convert an invalid input_files
+// into a valid one by adding more files, the function will return a
+// non-ok status with a specific reason.
+#ifndef ROCKSDB_LITE
+ Status SanitizeCompactionInputFiles(std::unordered_set<uint64_t>* input_files,
+ const ColumnFamilyMetaData& cf_meta,
+ const int output_level) const;
+#endif // ROCKSDB_LITE
+
+ // Free up the files that participated in a compaction
+ //
+ // Requirement: DB mutex held
+ void ReleaseCompactionFiles(Compaction* c, Status status);
+
+  // Returns true if any one of the specified files is being compacted
+ bool AreFilesInCompaction(const std::vector<FileMetaData*>& files);
+
+ // Takes a list of CompactionInputFiles and returns a (manual) Compaction
+ // object.
+ //
+ // Caller must provide a set of input files that has been passed through
+ // `SanitizeCompactionInputFiles` earlier. The lock should not be released
+ // between that call and this one.
+ Compaction* CompactFiles(const CompactionOptions& compact_options,
+ const std::vector<CompactionInputFiles>& input_files,
+ int output_level, VersionStorageInfo* vstorage,
+ const MutableCFOptions& mutable_cf_options,
+ const MutableDBOptions& mutable_db_options,
+ uint32_t output_path_id);
+
+ // Converts a set of compaction input file numbers into
+ // a list of CompactionInputFiles.
+ Status GetCompactionInputsFromFileNumbers(
+ std::vector<CompactionInputFiles>* input_files,
+ std::unordered_set<uint64_t>* input_set,
+ const VersionStorageInfo* vstorage,
+ const CompactionOptions& compact_options) const;
+
+  // Returns true if a compaction involving level 0 is currently taking place
+ bool IsLevel0CompactionInProgress() const {
+ return !level0_compactions_in_progress_.empty();
+ }
+
+  // Return true if the passed key range overlaps with a compaction output
+  // that is currently running.
+ bool RangeOverlapWithCompaction(const Slice& smallest_user_key,
+ const Slice& largest_user_key,
+ int level) const;
+
+ // Stores the minimal range that covers all entries in inputs in
+ // *smallest, *largest.
+ // REQUIRES: inputs is not empty
+ void GetRange(const CompactionInputFiles& inputs, InternalKey* smallest,
+ InternalKey* largest) const;
+
+ // Stores the minimal range that covers all entries in inputs1 and inputs2
+ // in *smallest, *largest.
+ // REQUIRES: inputs is not empty
+ void GetRange(const CompactionInputFiles& inputs1,
+ const CompactionInputFiles& inputs2, InternalKey* smallest,
+ InternalKey* largest) const;
+
+ // Stores the minimal range that covers all entries in inputs
+ // in *smallest, *largest.
+  // REQUIRES: inputs is not empty (at least one entry has one file)
+ void GetRange(const std::vector<CompactionInputFiles>& inputs,
+ InternalKey* smallest, InternalKey* largest,
+ int exclude_level) const;
+
+ int NumberLevels() const { return ioptions_.num_levels; }
+
+  // Add more files to the inputs on "level" to make sure that
+  // no newer version of a key is compacted to "level+1" while leaving an older
+  // version in "level". Otherwise, any Get() will search "level" first,
+  // and will likely return an old/stale value for the key, since it always
+  // searches in increasing order of level to find the value. This could
+  // also scramble the order of merge operands. This function should be
+  // called any time a new Compaction is created, and its inputs_[0] is
+  // populated.
+ //
+ // Will return false if it is impossible to apply this compaction.
+ bool ExpandInputsToCleanCut(const std::string& cf_name,
+ VersionStorageInfo* vstorage,
+ CompactionInputFiles* inputs,
+ InternalKey** next_smallest = nullptr);
+
+  // Returns true if any one of the parent files is being compacted
+ bool IsRangeInCompaction(VersionStorageInfo* vstorage,
+ const InternalKey* smallest,
+ const InternalKey* largest, int level, int* index);
+
+  // Returns true if the key range that `inputs` files cover overlaps with the
+  // key range of a currently running compaction.
+ bool FilesRangeOverlapWithCompaction(
+ const std::vector<CompactionInputFiles>& inputs, int level,
+ int penultimate_level) const;
+
+ bool SetupOtherInputs(const std::string& cf_name,
+ const MutableCFOptions& mutable_cf_options,
+ VersionStorageInfo* vstorage,
+ CompactionInputFiles* inputs,
+ CompactionInputFiles* output_level_inputs,
+ int* parent_index, int base_index,
+ bool only_expand_towards_right = false);
+
+ void GetGrandparents(VersionStorageInfo* vstorage,
+ const CompactionInputFiles& inputs,
+ const CompactionInputFiles& output_level_inputs,
+ std::vector<FileMetaData*>* grandparents);
+
+ void PickFilesMarkedForCompaction(const std::string& cf_name,
+ VersionStorageInfo* vstorage,
+ int* start_level, int* output_level,
+ CompactionInputFiles* start_level_inputs);
+
+ bool GetOverlappingL0Files(VersionStorageInfo* vstorage,
+ CompactionInputFiles* start_level_inputs,
+ int output_level, int* parent_index);
+
+ // Register this compaction in the set of running compactions
+ void RegisterCompaction(Compaction* c);
+
+ // Remove this compaction from the set of running compactions
+ void UnregisterCompaction(Compaction* c);
+
+ std::set<Compaction*>* level0_compactions_in_progress() {
+ return &level0_compactions_in_progress_;
+ }
+ std::unordered_set<Compaction*>* compactions_in_progress() {
+ return &compactions_in_progress_;
+ }
+
+ const InternalKeyComparator* icmp() const { return icmp_; }
+
+ protected:
+ const ImmutableOptions& ioptions_;
+
+// A helper function to SanitizeCompactionInputFiles() that
+// sanitizes "input_files" by adding necessary files.
+#ifndef ROCKSDB_LITE
+ virtual Status SanitizeCompactionInputFilesForAllLevels(
+ std::unordered_set<uint64_t>* input_files,
+ const ColumnFamilyMetaData& cf_meta, const int output_level) const;
+#endif // ROCKSDB_LITE
+
+ // Keeps track of all compactions that are running on Level0.
+ // Protected by DB mutex
+ std::set<Compaction*> level0_compactions_in_progress_;
+
+ // Keeps track of all compactions that are running.
+ // Protected by DB mutex
+ std::unordered_set<Compaction*> compactions_in_progress_;
+
+ const InternalKeyComparator* const icmp_;
+};
+
+#ifndef ROCKSDB_LITE
+// A dummy compaction picker that never triggers any automatic
+// compaction.
+class NullCompactionPicker : public CompactionPicker {
+ public:
+ NullCompactionPicker(const ImmutableOptions& ioptions,
+ const InternalKeyComparator* icmp)
+ : CompactionPicker(ioptions, icmp) {}
+ virtual ~NullCompactionPicker() {}
+
+ // Always return "nullptr"
+ Compaction* PickCompaction(
+ const std::string& /*cf_name*/,
+ const MutableCFOptions& /*mutable_cf_options*/,
+ const MutableDBOptions& /*mutable_db_options*/,
+ VersionStorageInfo* /*vstorage*/, LogBuffer* /* log_buffer */,
+ SequenceNumber /* earliest_memtable_seqno */) override {
+ return nullptr;
+ }
+
+ // Always return "nullptr"
+ Compaction* CompactRange(const std::string& /*cf_name*/,
+ const MutableCFOptions& /*mutable_cf_options*/,
+ const MutableDBOptions& /*mutable_db_options*/,
+ VersionStorageInfo* /*vstorage*/,
+ int /*input_level*/, int /*output_level*/,
+ const CompactRangeOptions& /*compact_range_options*/,
+ const InternalKey* /*begin*/,
+ const InternalKey* /*end*/,
+ InternalKey** /*compaction_end*/,
+ bool* /*manual_conflict*/,
+ uint64_t /*max_file_num_to_ignore*/,
+ const std::string& /*trim_ts*/) override {
+ return nullptr;
+ }
+
+ // Always returns false.
+ virtual bool NeedsCompaction(
+ const VersionStorageInfo* /*vstorage*/) const override {
+ return false;
+ }
+};
+#endif // !ROCKSDB_LITE
+
+// Attempts to find an intra L0 compaction conforming to the given parameters.
+//
+// @param level_files Metadata for L0 files.
+// @param min_files_to_compact Minimum number of files required to
+// do the compaction.
+// @param max_compact_bytes_per_del_file Maximum average size in bytes per
+// file that is going to get deleted by
+// the compaction.
+// @param max_compaction_bytes Maximum total size in bytes (in terms
+// of compensated file size) for files
+// to be compacted.
+// @param [out] comp_inputs If a compaction was found, will be
+// initialized with corresponding input
+// files. Cannot be nullptr.
+//
+// @return true iff compaction was found.
+bool FindIntraL0Compaction(
+ const std::vector<FileMetaData*>& level_files, size_t min_files_to_compact,
+ uint64_t max_compact_bytes_per_del_file, uint64_t max_compaction_bytes,
+ CompactionInputFiles* comp_inputs,
+ SequenceNumber earliest_mem_seqno = kMaxSequenceNumber);
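+
+// A minimal usage sketch for FindIntraL0Compaction (hypothetical thresholds,
+// for illustration only): try to find at least 4 L0 files whose average size
+// per deleted file stays under 64 MB, capped at 256 MB of total input:
+//
+//   CompactionInputFiles comp_inputs;
+//   if (FindIntraL0Compaction(vstorage->LevelFiles(0),
+//                             /* min_files_to_compact */ 4,
+//                             /* max_compact_bytes_per_del_file */ 64 << 20,
+//                             /* max_compaction_bytes */ 256 << 20,
+//                             &comp_inputs)) {
+//     // comp_inputs.files now holds the span of L0 files to compact together.
+//   }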
+
+CompressionType GetCompressionType(const VersionStorageInfo* vstorage,
+ const MutableCFOptions& mutable_cf_options,
+ int level, int base_level,
+ const bool enable_compression = true);
+
+CompressionOptions GetCompressionOptions(
+ const MutableCFOptions& mutable_cf_options,
+ const VersionStorageInfo* vstorage, int level,
+ const bool enable_compression = true);
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/compaction_picker_fifo.cc b/src/rocksdb/db/compaction/compaction_picker_fifo.cc
new file mode 100644
index 000000000..1f875e3e1
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_picker_fifo.cc
@@ -0,0 +1,433 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/compaction/compaction_picker_fifo.h"
+#ifndef ROCKSDB_LITE
+
+#include <cinttypes>
+#include <string>
+#include <vector>
+
+#include "db/column_family.h"
+#include "logging/log_buffer.h"
+#include "logging/logging.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace {
+uint64_t GetTotalFilesSize(const std::vector<FileMetaData*>& files) {
+ uint64_t total_size = 0;
+ for (const auto& f : files) {
+ total_size += f->fd.file_size;
+ }
+ return total_size;
+}
+} // anonymous namespace
+
+bool FIFOCompactionPicker::NeedsCompaction(
+ const VersionStorageInfo* vstorage) const {
+ const int kLevel0 = 0;
+ return vstorage->CompactionScore(kLevel0) >= 1;
+}
+
+Compaction* FIFOCompactionPicker::PickTTLCompaction(
+ const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+ const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
+ LogBuffer* log_buffer) {
+ assert(mutable_cf_options.ttl > 0);
+
+ const int kLevel0 = 0;
+ const std::vector<FileMetaData*>& level_files = vstorage->LevelFiles(kLevel0);
+ uint64_t total_size = GetTotalFilesSize(level_files);
+
+ int64_t _current_time;
+ auto status = ioptions_.clock->GetCurrentTime(&_current_time);
+ if (!status.ok()) {
+ ROCKS_LOG_BUFFER(log_buffer,
+ "[%s] FIFO compaction: Couldn't get current time: %s. "
+ "Not doing compactions based on TTL. ",
+ cf_name.c_str(), status.ToString().c_str());
+ return nullptr;
+ }
+ const uint64_t current_time = static_cast<uint64_t>(_current_time);
+
+ if (!level0_compactions_in_progress_.empty()) {
+ ROCKS_LOG_BUFFER(
+ log_buffer,
+ "[%s] FIFO compaction: Already executing compaction. No need "
+ "to run parallel compactions since compactions are very fast",
+ cf_name.c_str());
+ return nullptr;
+ }
+
+ std::vector<CompactionInputFiles> inputs;
+ inputs.emplace_back();
+ inputs[0].level = 0;
+
+ // avoid underflow
+ if (current_time > mutable_cf_options.ttl) {
+ for (auto ritr = level_files.rbegin(); ritr != level_files.rend(); ++ritr) {
+ FileMetaData* f = *ritr;
+ assert(f);
+ if (f->fd.table_reader && f->fd.table_reader->GetTableProperties()) {
+ uint64_t creation_time =
+ f->fd.table_reader->GetTableProperties()->creation_time;
+ if (creation_time == 0 ||
+ creation_time >= (current_time - mutable_cf_options.ttl)) {
+ break;
+ }
+ }
+ total_size -= f->fd.file_size;
+ inputs[0].files.push_back(f);
+ }
+ }
+
+  // Return a nullptr and proceed to size-based FIFO compaction if:
+  // 1. there are no files older than ttl, OR
+  // 2. there are some files older than ttl, but deleting them will not bring
+  //    the total size below the max_table_files_size threshold.
+ if (inputs[0].files.empty() ||
+ total_size >
+ mutable_cf_options.compaction_options_fifo.max_table_files_size) {
+ return nullptr;
+ }
+
+ for (const auto& f : inputs[0].files) {
+ uint64_t creation_time = 0;
+ assert(f);
+ if (f->fd.table_reader && f->fd.table_reader->GetTableProperties()) {
+ creation_time = f->fd.table_reader->GetTableProperties()->creation_time;
+ }
+ ROCKS_LOG_BUFFER(log_buffer,
+ "[%s] FIFO compaction: picking file %" PRIu64
+ " with creation time %" PRIu64 " for deletion",
+ cf_name.c_str(), f->fd.GetNumber(), creation_time);
+ }
+
+ Compaction* c = new Compaction(
+ vstorage, ioptions_, mutable_cf_options, mutable_db_options,
+ std::move(inputs), 0, 0, 0, 0, kNoCompression,
+ mutable_cf_options.compression_opts, Temperature::kUnknown,
+ /* max_subcompactions */ 0, {}, /* is manual */ false,
+ /* trim_ts */ "", vstorage->CompactionScore(0),
+ /* is deletion compaction */ true, /* l0_files_might_overlap */ true,
+ CompactionReason::kFIFOTtl);
+ return c;
+}
+
+// The size-based compaction picker for FIFO.
+//
+// When the entire column family size exceeds max_table_files_size, FIFO will
+// try to delete the oldest sst file(s) until the resulting column family size
+// is smaller than max_table_files_size.
+//
+// This function also takes care of the case where a DB is migrating from
+// level / universal compaction to FIFO compaction. During the migration, the
+// column family will also have non-L0 files while FIFO can only create L0
+// files. In this case, this function will purge the sst files in the
+// bottommost non-empty level first, and the DB will eventually converge to
+// the regular FIFO case where there are only L0 files. Note that during the
+// migration case, the purge order will only be an approximation of "FIFO"
+// as entries inside lower-level files might sometimes be newer than some
+// entries inside upper-level files.
+Compaction* FIFOCompactionPicker::PickSizeCompaction(
+ const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+ const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
+ LogBuffer* log_buffer) {
+ // compute the total size and identify the last non-empty level
+ int last_level = 0;
+ uint64_t total_size = 0;
+ for (int level = 0; level < vstorage->num_levels(); ++level) {
+ auto level_size = GetTotalFilesSize(vstorage->LevelFiles(level));
+ total_size += level_size;
+ if (level_size > 0) {
+ last_level = level;
+ }
+ }
+ const std::vector<FileMetaData*>& last_level_files =
+ vstorage->LevelFiles(last_level);
+
+ if (last_level == 0 &&
+ total_size <=
+ mutable_cf_options.compaction_options_fifo.max_table_files_size) {
+ // total size not exceeded, try to find intra level 0 compaction if enabled
+ const std::vector<FileMetaData*>& level0_files = vstorage->LevelFiles(0);
+ if (mutable_cf_options.compaction_options_fifo.allow_compaction &&
+ level0_files.size() > 0) {
+ CompactionInputFiles comp_inputs;
+      // try to prevent the same files from being compacted multiple times,
+      // which could produce large files that may never TTL-expire. Achieve
+      // this by disallowing compactions with files larger than the memtable
+      // (inflate its size by 10% to account for uncompressed L0 files whose
+      // size may be slightly greater than the memtable size limit).
+ size_t max_compact_bytes_per_del_file =
+ static_cast<size_t>(MultiplyCheckOverflow(
+ static_cast<uint64_t>(mutable_cf_options.write_buffer_size),
+ 1.1));
+ if (FindIntraL0Compaction(
+ level0_files,
+ mutable_cf_options
+ .level0_file_num_compaction_trigger /* min_files_to_compact */
+ ,
+ max_compact_bytes_per_del_file,
+ mutable_cf_options.max_compaction_bytes, &comp_inputs)) {
+ Compaction* c = new Compaction(
+ vstorage, ioptions_, mutable_cf_options, mutable_db_options,
+ {comp_inputs}, 0, 16 * 1024 * 1024 /* output file size limit */,
+ 0 /* max compaction bytes, not applicable */,
+ 0 /* output path ID */, mutable_cf_options.compression,
+ mutable_cf_options.compression_opts, Temperature::kUnknown,
+ 0 /* max_subcompactions */, {}, /* is manual */ false,
+ /* trim_ts */ "", vstorage->CompactionScore(0),
+ /* is deletion compaction */ false,
+ /* l0_files_might_overlap */ true,
+ CompactionReason::kFIFOReduceNumFiles);
+ return c;
+ }
+ }
+
+ ROCKS_LOG_BUFFER(
+ log_buffer,
+ "[%s] FIFO compaction: nothing to do. Total size %" PRIu64
+ ", max size %" PRIu64 "\n",
+ cf_name.c_str(), total_size,
+ mutable_cf_options.compaction_options_fifo.max_table_files_size);
+ return nullptr;
+ }
+
+ if (!level0_compactions_in_progress_.empty()) {
+ ROCKS_LOG_BUFFER(
+ log_buffer,
+ "[%s] FIFO compaction: Already executing compaction. No need "
+ "to run parallel compactions since compactions are very fast",
+ cf_name.c_str());
+ return nullptr;
+ }
+
+ std::vector<CompactionInputFiles> inputs;
+ inputs.emplace_back();
+ inputs[0].level = last_level;
+
+ if (last_level == 0) {
+ // In L0, right-most files are the oldest files.
+ for (auto ritr = last_level_files.rbegin(); ritr != last_level_files.rend();
+ ++ritr) {
+ auto f = *ritr;
+ total_size -= f->fd.file_size;
+ inputs[0].files.push_back(f);
+ char tmp_fsize[16];
+ AppendHumanBytes(f->fd.GetFileSize(), tmp_fsize, sizeof(tmp_fsize));
+ ROCKS_LOG_BUFFER(log_buffer,
+ "[%s] FIFO compaction: picking file %" PRIu64
+ " with size %s for deletion",
+ cf_name.c_str(), f->fd.GetNumber(), tmp_fsize);
+ if (total_size <=
+ mutable_cf_options.compaction_options_fifo.max_table_files_size) {
+ break;
+ }
+ }
+ } else {
+    // If the last level is non-L0, we actually don't know which file is
+    // logically the oldest since the file creation time only represents
+    // when this file was compacted to this level, which is independent
+    // of when the entries in this file were first inserted.
+    //
+    // As a result, we delete files from the left instead. This means the sst
+    // file with the smallest key will be deleted first. This design decision
+    // better serves a major type of FIFO use case where smaller keys are
+    // associated with older data.
+ for (const auto& f : last_level_files) {
+ total_size -= f->fd.file_size;
+ inputs[0].files.push_back(f);
+ char tmp_fsize[16];
+ AppendHumanBytes(f->fd.GetFileSize(), tmp_fsize, sizeof(tmp_fsize));
+ ROCKS_LOG_BUFFER(log_buffer,
+ "[%s] FIFO compaction: picking file %" PRIu64
+ " with size %s for deletion",
+ cf_name.c_str(), f->fd.GetNumber(), tmp_fsize);
+ if (total_size <=
+ mutable_cf_options.compaction_options_fifo.max_table_files_size) {
+ break;
+ }
+ }
+ }
+
+ Compaction* c = new Compaction(
+ vstorage, ioptions_, mutable_cf_options, mutable_db_options,
+ std::move(inputs), last_level,
+ /* target_file_size */ 0,
+ /* max_compaction_bytes */ 0,
+ /* output_path_id */ 0, kNoCompression,
+ mutable_cf_options.compression_opts, Temperature::kUnknown,
+ /* max_subcompactions */ 0, {}, /* is manual */ false,
+ /* trim_ts */ "", vstorage->CompactionScore(0),
+ /* is deletion compaction */ true,
+ /* l0_files_might_overlap */ true, CompactionReason::kFIFOMaxSize);
+ return c;
+}
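+
+// Worked example for the size-based picker above (hypothetical numbers, not
+// taken from the code): with max_table_files_size = 100 MB and three L0 files
+// of 30, 40 and 50 MB (newest to oldest), the total of 120 MB exceeds the
+// limit, so the picker walks from the oldest side; dropping the 50 MB file
+// already brings the total down to 70 MB <= 100 MB, so only that single file
+// is picked for deletion.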
+
+Compaction* FIFOCompactionPicker::PickCompactionToWarm(
+ const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+ const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
+ LogBuffer* log_buffer) {
+ if (mutable_cf_options.compaction_options_fifo.age_for_warm == 0) {
+ return nullptr;
+ }
+
+  // PickCompactionToWarm is only triggered if there are no non-L0 files.
+ for (int level = 1; level < vstorage->num_levels(); ++level) {
+ if (GetTotalFilesSize(vstorage->LevelFiles(level)) > 0) {
+ return nullptr;
+ }
+ }
+
+ const int kLevel0 = 0;
+ const std::vector<FileMetaData*>& level_files = vstorage->LevelFiles(kLevel0);
+
+ int64_t _current_time;
+ auto status = ioptions_.clock->GetCurrentTime(&_current_time);
+ if (!status.ok()) {
+ ROCKS_LOG_BUFFER(log_buffer,
+ "[%s] FIFO compaction: Couldn't get current time: %s. "
+ "Not doing compactions based on warm threshold. ",
+ cf_name.c_str(), status.ToString().c_str());
+ return nullptr;
+ }
+ const uint64_t current_time = static_cast<uint64_t>(_current_time);
+
+ if (!level0_compactions_in_progress_.empty()) {
+ ROCKS_LOG_BUFFER(
+ log_buffer,
+ "[%s] FIFO compaction: Already executing compaction. Parallel "
+ "compactions are not supported",
+ cf_name.c_str());
+ return nullptr;
+ }
+
+ std::vector<CompactionInputFiles> inputs;
+ inputs.emplace_back();
+ inputs[0].level = 0;
+
+ // avoid underflow
+ if (current_time > mutable_cf_options.compaction_options_fifo.age_for_warm) {
+ uint64_t create_time_threshold =
+ current_time - mutable_cf_options.compaction_options_fifo.age_for_warm;
+ uint64_t compaction_size = 0;
+    // Ideally we would identify a file qualifying for the warm tier by the
+    // timestamp of its youngest entry. However, we don't have that information
+    // right now, so we infer it from the timestamp of the oldest entry of the
+    // next (slightly younger) file.
+ FileMetaData* prev_file = nullptr;
+ for (auto ritr = level_files.rbegin(); ritr != level_files.rend(); ++ritr) {
+ FileMetaData* f = *ritr;
+ assert(f);
+        // Right now this probably won't happen as we never try to schedule
+        // two compactions in parallel, so here we simply don't schedule
+        // anything.
+ return nullptr;
+ }
+ uint64_t oldest_ancester_time = f->TryGetOldestAncesterTime();
+ if (oldest_ancester_time == kUnknownOldestAncesterTime) {
+ // Older files might not have enough information. It is possible to
+ // handle these files by looking at newer files, but maintaining the
+ // logic isn't worth it.
+ break;
+ }
+ if (oldest_ancester_time > create_time_threshold) {
+ // The previous file (which has slightly older data) doesn't qualify
+ // for warm tier.
+ break;
+ }
+ if (prev_file != nullptr) {
+ compaction_size += prev_file->fd.GetFileSize();
+ if (compaction_size > mutable_cf_options.max_compaction_bytes) {
+ break;
+ }
+ inputs[0].files.push_back(prev_file);
+ ROCKS_LOG_BUFFER(log_buffer,
+ "[%s] FIFO compaction: picking file %" PRIu64
+ " with next file's oldest time %" PRIu64 " for warm",
+ cf_name.c_str(), prev_file->fd.GetNumber(),
+ oldest_ancester_time);
+ }
+ if (f->temperature == Temperature::kUnknown ||
+ f->temperature == Temperature::kHot) {
+ prev_file = f;
+ } else if (!inputs[0].files.empty()) {
+        // A warm file that is newer than the files already picked; stop here.
+ break;
+ } else {
+ assert(prev_file == nullptr);
+ }
+ }
+ }
+
+ if (inputs[0].files.empty()) {
+ return nullptr;
+ }
+
+ Compaction* c = new Compaction(
+ vstorage, ioptions_, mutable_cf_options, mutable_db_options,
+ std::move(inputs), 0, 0 /* output file size limit */,
+ 0 /* max compaction bytes, not applicable */, 0 /* output path ID */,
+ mutable_cf_options.compression, mutable_cf_options.compression_opts,
+ Temperature::kWarm,
+ /* max_subcompactions */ 0, {}, /* is manual */ false, /* trim_ts */ "",
+ vstorage->CompactionScore(0),
+ /* is deletion compaction */ false, /* l0_files_might_overlap */ true,
+ CompactionReason::kChangeTemperature);
+ return c;
+}
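+
+// Illustrative walk-through of the warm-tier inference above (hypothetical
+// files, not from the code): with L0 files F1 (oldest), F2, F3 (newest) and
+// oldest-ancestor times t1 < t2 < t3, F1 is only moved to the warm tier once
+// t2 (the oldest entry of the next-younger file F2) is older than
+// now - age_for_warm, because that is what proves every entry in F1 is old
+// enough; F2 itself stays until t3 crosses the threshold.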
+
+Compaction* FIFOCompactionPicker::PickCompaction(
+ const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+ const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
+ LogBuffer* log_buffer, SequenceNumber /*earliest_memtable_seqno*/) {
+ Compaction* c = nullptr;
+ if (mutable_cf_options.ttl > 0) {
+ c = PickTTLCompaction(cf_name, mutable_cf_options, mutable_db_options,
+ vstorage, log_buffer);
+ }
+ if (c == nullptr) {
+ c = PickSizeCompaction(cf_name, mutable_cf_options, mutable_db_options,
+ vstorage, log_buffer);
+ }
+ if (c == nullptr) {
+ c = PickCompactionToWarm(cf_name, mutable_cf_options, mutable_db_options,
+ vstorage, log_buffer);
+ }
+ RegisterCompaction(c);
+ return c;
+}
+
+Compaction* FIFOCompactionPicker::CompactRange(
+ const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+ const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
+ int input_level, int output_level,
+ const CompactRangeOptions& /*compact_range_options*/,
+ const InternalKey* /*begin*/, const InternalKey* /*end*/,
+ InternalKey** compaction_end, bool* /*manual_conflict*/,
+ uint64_t /*max_file_num_to_ignore*/, const std::string& /*trim_ts*/) {
+#ifdef NDEBUG
+ (void)input_level;
+ (void)output_level;
+#endif
+ assert(input_level == 0);
+ assert(output_level == 0);
+ *compaction_end = nullptr;
+ LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, ioptions_.logger);
+ Compaction* c = PickCompaction(cf_name, mutable_cf_options,
+ mutable_db_options, vstorage, &log_buffer);
+ log_buffer.FlushBufferToLog();
+ return c;
+}
+
+} // namespace ROCKSDB_NAMESPACE
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/compaction/compaction_picker_fifo.h b/src/rocksdb/db/compaction/compaction_picker_fifo.h
new file mode 100644
index 000000000..544259f38
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_picker_fifo.h
@@ -0,0 +1,63 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include "db/compaction/compaction_picker.h"
+
+namespace ROCKSDB_NAMESPACE {
+class FIFOCompactionPicker : public CompactionPicker {
+ public:
+ FIFOCompactionPicker(const ImmutableOptions& ioptions,
+ const InternalKeyComparator* icmp)
+ : CompactionPicker(ioptions, icmp) {}
+
+ virtual Compaction* PickCompaction(
+ const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+ const MutableDBOptions& mutable_db_options, VersionStorageInfo* version,
+ LogBuffer* log_buffer,
+ SequenceNumber earliest_memtable_seqno = kMaxSequenceNumber) override;
+
+ virtual Compaction* CompactRange(
+ const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+ const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
+ int input_level, int output_level,
+ const CompactRangeOptions& compact_range_options,
+ const InternalKey* begin, const InternalKey* end,
+ InternalKey** compaction_end, bool* manual_conflict,
+ uint64_t max_file_num_to_ignore, const std::string& trim_ts) override;
+
+ // The maximum allowed output level. Always returns 0.
+ virtual int MaxOutputLevel() const override { return 0; }
+
+ virtual bool NeedsCompaction(
+ const VersionStorageInfo* vstorage) const override;
+
+ private:
+ Compaction* PickTTLCompaction(const std::string& cf_name,
+ const MutableCFOptions& mutable_cf_options,
+ const MutableDBOptions& mutable_db_options,
+ VersionStorageInfo* version,
+ LogBuffer* log_buffer);
+
+ Compaction* PickSizeCompaction(const std::string& cf_name,
+ const MutableCFOptions& mutable_cf_options,
+ const MutableDBOptions& mutable_db_options,
+ VersionStorageInfo* version,
+ LogBuffer* log_buffer);
+
+ Compaction* PickCompactionToWarm(const std::string& cf_name,
+ const MutableCFOptions& mutable_cf_options,
+ const MutableDBOptions& mutable_db_options,
+ VersionStorageInfo* version,
+ LogBuffer* log_buffer);
+};
+} // namespace ROCKSDB_NAMESPACE
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/compaction/compaction_picker_level.cc b/src/rocksdb/db/compaction/compaction_picker_level.cc
new file mode 100644
index 000000000..b689b6add
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_picker_level.cc
@@ -0,0 +1,841 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/compaction/compaction_picker_level.h"
+
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "db/version_edit.h"
+#include "logging/log_buffer.h"
+#include "test_util/sync_point.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+bool LevelCompactionPicker::NeedsCompaction(
+ const VersionStorageInfo* vstorage) const {
+ if (!vstorage->ExpiredTtlFiles().empty()) {
+ return true;
+ }
+ if (!vstorage->FilesMarkedForPeriodicCompaction().empty()) {
+ return true;
+ }
+ if (!vstorage->BottommostFilesMarkedForCompaction().empty()) {
+ return true;
+ }
+ if (!vstorage->FilesMarkedForCompaction().empty()) {
+ return true;
+ }
+ if (!vstorage->FilesMarkedForForcedBlobGC().empty()) {
+ return true;
+ }
+ for (int i = 0; i <= vstorage->MaxInputLevel(); i++) {
+ if (vstorage->CompactionScore(i) >= 1) {
+ return true;
+ }
+ }
+ return false;
+}
+
+namespace {
+// A class to build a leveled compaction step-by-step.
+class LevelCompactionBuilder {
+ public:
+ LevelCompactionBuilder(const std::string& cf_name,
+ VersionStorageInfo* vstorage,
+ SequenceNumber earliest_mem_seqno,
+ CompactionPicker* compaction_picker,
+ LogBuffer* log_buffer,
+ const MutableCFOptions& mutable_cf_options,
+ const ImmutableOptions& ioptions,
+ const MutableDBOptions& mutable_db_options)
+ : cf_name_(cf_name),
+ vstorage_(vstorage),
+ earliest_mem_seqno_(earliest_mem_seqno),
+ compaction_picker_(compaction_picker),
+ log_buffer_(log_buffer),
+ mutable_cf_options_(mutable_cf_options),
+ ioptions_(ioptions),
+ mutable_db_options_(mutable_db_options) {}
+
+ // Pick and return a compaction.
+ Compaction* PickCompaction();
+
+  // Pick the initial files to compact to the next level (or together
+  // in an intra-L0 compaction).
+ void SetupInitialFiles();
+
+ // If the initial files are from L0 level, pick other L0
+ // files if needed.
+ bool SetupOtherL0FilesIfNeeded();
+
+ // Compaction with round-robin compaction priority allows more files to be
+ // picked to form a large compaction
+ void SetupOtherFilesWithRoundRobinExpansion();
+  // Based on the initial files, set up the other files that need to be
+  // compacted in this compaction accordingly.
+ bool SetupOtherInputsIfNeeded();
+
+ Compaction* GetCompaction();
+
+  // For the specified level, pick a file that we want to compact.
+ // Returns false if there is no file to compact.
+ // If it returns true, inputs->files.size() will be exactly one for
+ // all compaction priorities except round-robin. For round-robin,
+ // multiple consecutive files may be put into inputs->files.
+ // If level is 0 and there is already a compaction on that level, this
+ // function will return false.
+ bool PickFileToCompact();
+
+ // Return true if a L0 trivial move is picked up.
+ bool TryPickL0TrivialMove();
+
+ // For L0->L0, picks the longest span of files that aren't currently
+ // undergoing compaction for which work-per-deleted-file decreases. The span
+ // always starts from the newest L0 file.
+ //
+ // Intra-L0 compaction is independent of all other files, so it can be
+ // performed even when L0->base_level compactions are blocked.
+ //
+ // Returns true if `inputs` is populated with a span of files to be compacted;
+ // otherwise, returns false.
+ bool PickIntraL0Compaction();
+
+  // Return true if TrivialMove is extended. `start_index` is the index of
+  // the initial file picked, which should already be in `start_level_inputs_`.
+ bool TryExtendNonL0TrivialMove(int start_index);
+
+ // Picks a file from level_files to compact.
+ // level_files is a vector of (level, file metadata) in ascending order of
+ // level. If compact_to_next_level is true, compact the file to the next
+ // level, otherwise, compact to the same level as the input file.
+ void PickFileToCompact(
+ const autovector<std::pair<int, FileMetaData*>>& level_files,
+ bool compact_to_next_level);
+
+ const std::string& cf_name_;
+ VersionStorageInfo* vstorage_;
+ SequenceNumber earliest_mem_seqno_;
+ CompactionPicker* compaction_picker_;
+ LogBuffer* log_buffer_;
+ int start_level_ = -1;
+ int output_level_ = -1;
+ int parent_index_ = -1;
+ int base_index_ = -1;
+ double start_level_score_ = 0;
+ bool is_manual_ = false;
+ bool is_l0_trivial_move_ = false;
+ CompactionInputFiles start_level_inputs_;
+ std::vector<CompactionInputFiles> compaction_inputs_;
+ CompactionInputFiles output_level_inputs_;
+ std::vector<FileMetaData*> grandparents_;
+ CompactionReason compaction_reason_ = CompactionReason::kUnknown;
+
+ const MutableCFOptions& mutable_cf_options_;
+ const ImmutableOptions& ioptions_;
+ const MutableDBOptions& mutable_db_options_;
+  // Pick a path ID to place a newly generated file, given its level
+ static uint32_t GetPathId(const ImmutableCFOptions& ioptions,
+ const MutableCFOptions& mutable_cf_options,
+ int level);
+
+ static const int kMinFilesForIntraL0Compaction = 4;
+};
+
+void LevelCompactionBuilder::PickFileToCompact(
+ const autovector<std::pair<int, FileMetaData*>>& level_files,
+ bool compact_to_next_level) {
+ for (auto& level_file : level_files) {
+    // If it's being compacted, there is nothing to do here.
+    // If this assert() fails, it means that some function marked some
+    // files as being_compacted but didn't call ComputeCompactionScore().
+ assert(!level_file.second->being_compacted);
+ start_level_ = level_file.first;
+ if ((compact_to_next_level &&
+ start_level_ == vstorage_->num_non_empty_levels() - 1) ||
+ (start_level_ == 0 &&
+ !compaction_picker_->level0_compactions_in_progress()->empty())) {
+ continue;
+ }
+ if (compact_to_next_level) {
+ output_level_ =
+ (start_level_ == 0) ? vstorage_->base_level() : start_level_ + 1;
+ } else {
+ output_level_ = start_level_;
+ }
+ start_level_inputs_.files = {level_file.second};
+ start_level_inputs_.level = start_level_;
+ if (compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_,
+ &start_level_inputs_)) {
+ return;
+ }
+ }
+ start_level_inputs_.files.clear();
+}
+
+void LevelCompactionBuilder::SetupInitialFiles() {
+ // Find the compactions by size on all levels.
+ bool skipped_l0_to_base = false;
+ for (int i = 0; i < compaction_picker_->NumberLevels() - 1; i++) {
+ start_level_score_ = vstorage_->CompactionScore(i);
+ start_level_ = vstorage_->CompactionScoreLevel(i);
+ assert(i == 0 || start_level_score_ <= vstorage_->CompactionScore(i - 1));
+ if (start_level_score_ >= 1) {
+ if (skipped_l0_to_base && start_level_ == vstorage_->base_level()) {
+ // If L0->base_level compaction is pending, don't schedule further
+ // compaction from base level. Otherwise L0->base_level compaction
+ // may starve.
+ continue;
+ }
+ output_level_ =
+ (start_level_ == 0) ? vstorage_->base_level() : start_level_ + 1;
+ if (PickFileToCompact()) {
+ // found the compaction!
+ if (start_level_ == 0) {
+ // L0 score = `num L0 files` / `level0_file_num_compaction_trigger`
+ compaction_reason_ = CompactionReason::kLevelL0FilesNum;
+ } else {
+ // L1+ score = `Level files size` / `MaxBytesForLevel`
+ compaction_reason_ = CompactionReason::kLevelMaxLevelSize;
+ }
+ break;
+ } else {
+ // didn't find the compaction, clear the inputs
+ start_level_inputs_.clear();
+ if (start_level_ == 0) {
+ skipped_l0_to_base = true;
+ // L0->base_level may be blocked due to ongoing L0->base_level
+ // compactions. It may also be blocked by an ongoing compaction from
+ // base_level downwards.
+ //
+ // In these cases, to reduce L0 file count and thus reduce likelihood
+ // of write stalls, we can attempt compacting a span of files within
+ // L0.
+ if (PickIntraL0Compaction()) {
+ output_level_ = 0;
+ compaction_reason_ = CompactionReason::kLevelL0FilesNum;
+ break;
+ }
+ }
+ }
+ } else {
+ // Compaction scores are sorted in descending order, no further scores
+ // will be >= 1.
+ break;
+ }
+ }
+ if (!start_level_inputs_.empty()) {
+ return;
+ }
+
+ // if we didn't find a compaction, check if there are any files marked for
+ // compaction
+ parent_index_ = base_index_ = -1;
+
+ compaction_picker_->PickFilesMarkedForCompaction(
+ cf_name_, vstorage_, &start_level_, &output_level_, &start_level_inputs_);
+ if (!start_level_inputs_.empty()) {
+ compaction_reason_ = CompactionReason::kFilesMarkedForCompaction;
+ return;
+ }
+
+ // Bottommost Files Compaction on deleting tombstones
+ PickFileToCompact(vstorage_->BottommostFilesMarkedForCompaction(), false);
+ if (!start_level_inputs_.empty()) {
+ compaction_reason_ = CompactionReason::kBottommostFiles;
+ return;
+ }
+
+ // TTL Compaction
+ if (ioptions_.compaction_pri == kRoundRobin &&
+ !vstorage_->ExpiredTtlFiles().empty()) {
+ auto expired_files = vstorage_->ExpiredTtlFiles();
+ // the expired files list should already be sorted by level
+ start_level_ = expired_files.front().first;
+#ifndef NDEBUG
+ for (const auto& file : expired_files) {
+ assert(start_level_ <= file.first);
+ }
+#endif
+ if (start_level_ > 0) {
+ output_level_ = start_level_ + 1;
+ if (PickFileToCompact()) {
+ compaction_reason_ = CompactionReason::kRoundRobinTtl;
+ return;
+ }
+ }
+ }
+
+ PickFileToCompact(vstorage_->ExpiredTtlFiles(), true);
+ if (!start_level_inputs_.empty()) {
+ compaction_reason_ = CompactionReason::kTtl;
+ return;
+ }
+
+ // Periodic Compaction
+ PickFileToCompact(vstorage_->FilesMarkedForPeriodicCompaction(), false);
+ if (!start_level_inputs_.empty()) {
+ compaction_reason_ = CompactionReason::kPeriodicCompaction;
+ return;
+ }
+
+ // Forced blob garbage collection
+ PickFileToCompact(vstorage_->FilesMarkedForForcedBlobGC(), false);
+ if (!start_level_inputs_.empty()) {
+ compaction_reason_ = CompactionReason::kForcedBlobGC;
+ return;
+ }
+}
+
+bool LevelCompactionBuilder::SetupOtherL0FilesIfNeeded() {
+ if (start_level_ == 0 && output_level_ != 0 && !is_l0_trivial_move_) {
+ return compaction_picker_->GetOverlappingL0Files(
+ vstorage_, &start_level_inputs_, output_level_, &parent_index_);
+ }
+ return true;
+}
+
+void LevelCompactionBuilder::SetupOtherFilesWithRoundRobinExpansion() {
+ // We only expand when the start level is not L0 under round robin
+ assert(start_level_ >= 1);
+
+ // For round-robin compaction priority, we have 3 constraints when picking
+ // multiple files.
+ // Constraint 1: We can only pick consecutive files
+ // -> Constraint 1a: When a file is being compacted (or some input files
+ // are being compacted after expanding), we cannot
+ // choose it and have to stop choosing more files
+ // -> Constraint 1b: When we reach the last file (with largest keys), we
+ // cannot choose more files (the next file will be the
+ // first one)
+ // Constraint 2: We should ensure the total compaction bytes (including the
+ // overlapped files from the next level) is no more than
+ // mutable_cf_options_.max_compaction_bytes
+ // Constraint 3: We try our best to pick as many files as possible so that
+ // the post-compaction level size is less than
+ // MaxBytesForLevel(start_level_)
+ // Constraint 4: We do not expand if it is possible to apply a trivial move
+ // Constraint 5 (TODO): Try to pick minimal files to split into the target
+ // number of subcompactions
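+ // Illustrative example (not from the original comment): suppose the start
+ // level holds files F1..F5 in key order, expansion starts at F1, and F3 is
+ // being compacted. Expansion may grow the input to {F1, F2} but must stop
+ // before F3 (Constraint 1a); it also stops as soon as the picked files plus
+ // their overlapping output-level files exceed max_compaction_bytes
+ // (Constraint 2), or once enough start-level bytes have been picked to
+ // bring the level back under MaxBytesForLevel(start_level_) (Constraint 3).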
+ TEST_SYNC_POINT("LevelCompactionPicker::RoundRobin");
+
+ // Only expand the inputs when we have selected a file in start_level_inputs_
+ if (start_level_inputs_.size() == 0) return;
+
+ uint64_t start_lvl_bytes_no_compacting = 0;
+ uint64_t curr_bytes_to_compact = 0;
+ uint64_t start_lvl_max_bytes_to_compact = 0;
+ const std::vector<FileMetaData*>& level_files =
+ vstorage_->LevelFiles(start_level_);
+ // Constraint 3 (pre-calculate the ideal max bytes to compact)
+ for (auto f : level_files) {
+ if (!f->being_compacted) {
+ start_lvl_bytes_no_compacting += f->fd.GetFileSize();
+ }
+ }
+ if (start_lvl_bytes_no_compacting >
+ vstorage_->MaxBytesForLevel(start_level_)) {
+ start_lvl_max_bytes_to_compact = start_lvl_bytes_no_compacting -
+ vstorage_->MaxBytesForLevel(start_level_);
+ }
+
+ size_t start_index = vstorage_->FilesByCompactionPri(start_level_)[0];
+ InternalKey smallest, largest;
+ // Constraint 4 (No need to check again later)
+ compaction_picker_->GetRange(start_level_inputs_, &smallest, &largest);
+ CompactionInputFiles output_level_inputs;
+ output_level_inputs.level = output_level_;
+ vstorage_->GetOverlappingInputs(output_level_, &smallest, &largest,
+ &output_level_inputs.files);
+ if (output_level_inputs.empty()) {
+ if (TryExtendNonL0TrivialMove((int)start_index)) {
+ return;
+ }
+ }
+ // Constraint 3
+ if (start_level_inputs_[0]->fd.GetFileSize() >=
+ start_lvl_max_bytes_to_compact) {
+ return;
+ }
+ CompactionInputFiles tmp_start_level_inputs;
+ tmp_start_level_inputs = start_level_inputs_;
+ // TODO (zichen): Future parallel round-robin may also need to update this
+ // Constraint 1b (only expand till the end)
+ for (size_t i = start_index + 1; i < level_files.size(); i++) {
+ auto* f = level_files[i];
+ if (f->being_compacted) {
+ // Constraint 1a
+ return;
+ }
+
+ tmp_start_level_inputs.files.push_back(f);
+ if (!compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_,
+ &tmp_start_level_inputs) ||
+ compaction_picker_->FilesRangeOverlapWithCompaction(
+ {tmp_start_level_inputs}, output_level_,
+ Compaction::EvaluatePenultimateLevel(
+ vstorage_, ioptions_, start_level_, output_level_))) {
+ // Constraint 1a
+ tmp_start_level_inputs.clear();
+ return;
+ }
+
+ curr_bytes_to_compact = 0;
+ for (auto start_lvl_f : tmp_start_level_inputs.files) {
+ curr_bytes_to_compact += start_lvl_f->fd.GetFileSize();
+ }
+
+ // Check whether any output level files are locked
+ compaction_picker_->GetRange(tmp_start_level_inputs, &smallest, &largest);
+ vstorage_->GetOverlappingInputs(output_level_, &smallest, &largest,
+ &output_level_inputs.files);
+ if (!output_level_inputs.empty() &&
+ !compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_,
+ &output_level_inputs)) {
+ // Constraint 1a
+ tmp_start_level_inputs.clear();
+ return;
+ }
+
+ uint64_t start_lvl_curr_bytes_to_compact = curr_bytes_to_compact;
+ for (auto output_lvl_f : output_level_inputs.files) {
+ curr_bytes_to_compact += output_lvl_f->fd.GetFileSize();
+ }
+ if (curr_bytes_to_compact > mutable_cf_options_.max_compaction_bytes) {
+ // Constraint 2
+ tmp_start_level_inputs.clear();
+ return;
+ }
+
+ start_level_inputs_.files = tmp_start_level_inputs.files;
+ // Constraint 3
+ if (start_lvl_curr_bytes_to_compact > start_lvl_max_bytes_to_compact) {
+ return;
+ }
+ }
+}
+
+bool LevelCompactionBuilder::SetupOtherInputsIfNeeded() {
+ // Set up input files from the output level. For output to L0, we only
+ // compact spans of files that do not interact with any pending compactions,
+ // so we don't need to consider other levels.
+ if (output_level_ != 0) {
+ output_level_inputs_.level = output_level_;
+ bool round_robin_expanding =
+ ioptions_.compaction_pri == kRoundRobin &&
+ compaction_reason_ == CompactionReason::kLevelMaxLevelSize;
+ if (round_robin_expanding) {
+ SetupOtherFilesWithRoundRobinExpansion();
+ }
+ if (!is_l0_trivial_move_ &&
+ !compaction_picker_->SetupOtherInputs(
+ cf_name_, mutable_cf_options_, vstorage_, &start_level_inputs_,
+ &output_level_inputs_, &parent_index_, base_index_,
+ round_robin_expanding)) {
+ return false;
+ }
+
+ compaction_inputs_.push_back(start_level_inputs_);
+ if (!output_level_inputs_.empty()) {
+ compaction_inputs_.push_back(output_level_inputs_);
+ }
+
+ if (!is_l0_trivial_move_) {
+ // In some edge cases we could pick a compaction whose key range overlaps
+ // with another running compaction, with both of them having the same
+ // output level. This could happen if
+ // (1) we are running a non-exclusive manual compaction, or
+ // (2) AddFile ingests a new file into the LSM tree.
+ // We need to disallow this from happening.
+ if (compaction_picker_->FilesRangeOverlapWithCompaction(
+ compaction_inputs_, output_level_,
+ Compaction::EvaluatePenultimateLevel(
+ vstorage_, ioptions_, start_level_, output_level_))) {
+ // This compaction output could potentially conflict with the output
+ // of a currently running compaction, we cannot run it.
+ return false;
+ }
+ compaction_picker_->GetGrandparents(vstorage_, start_level_inputs_,
+ output_level_inputs_, &grandparents_);
+ }
+ } else {
+ compaction_inputs_.push_back(start_level_inputs_);
+ }
+ return true;
+}
+
+Compaction* LevelCompactionBuilder::PickCompaction() {
+ // Pick up the first file to start compaction. It may have been extended
+ // to a clean cut.
+ SetupInitialFiles();
+ if (start_level_inputs_.empty()) {
+ return nullptr;
+ }
+ assert(start_level_ >= 0 && output_level_ >= 0);
+
+ // If it is an L0 -> base level compaction, set up the other overlapping
+ // L0 files if needed.
+ if (!SetupOtherL0FilesIfNeeded()) {
+ return nullptr;
+ }
+
+ // Pick files in the output level and expand more files in the start level
+ // if needed.
+ if (!SetupOtherInputsIfNeeded()) {
+ return nullptr;
+ }
+
+ // Form a compaction object containing the files we picked.
+ Compaction* c = GetCompaction();
+
+ TEST_SYNC_POINT_CALLBACK("LevelCompactionPicker::PickCompaction:Return", c);
+
+ return c;
+}
+
+Compaction* LevelCompactionBuilder::GetCompaction() {
+ auto c = new Compaction(
+ vstorage_, ioptions_, mutable_cf_options_, mutable_db_options_,
+ std::move(compaction_inputs_), output_level_,
+ MaxFileSizeForLevel(mutable_cf_options_, output_level_,
+ ioptions_.compaction_style, vstorage_->base_level(),
+ ioptions_.level_compaction_dynamic_level_bytes),
+ mutable_cf_options_.max_compaction_bytes,
+ GetPathId(ioptions_, mutable_cf_options_, output_level_),
+ GetCompressionType(vstorage_, mutable_cf_options_, output_level_,
+ vstorage_->base_level()),
+ GetCompressionOptions(mutable_cf_options_, vstorage_, output_level_),
+ Temperature::kUnknown,
+ /* max_subcompactions */ 0, std::move(grandparents_), is_manual_,
+ /* trim_ts */ "", start_level_score_, false /* deletion_compaction */,
+ /* l0_files_might_overlap */ start_level_ == 0 && !is_l0_trivial_move_,
+ compaction_reason_);
+
+ // If it's level 0 compaction, make sure we don't execute any other level 0
+ // compactions in parallel
+ compaction_picker_->RegisterCompaction(c);
+
+ // Creating a compaction influences the compaction score because the score
+ // takes running compactions into account (by skipping files that are already
+ // being compacted). Since we just changed compaction score, we recalculate it
+ // here
+ vstorage_->ComputeCompactionScore(ioptions_, mutable_cf_options_);
+ return c;
+}
+
+/*
+ * Find the optimal path to place a file.
+ * Given a level, finds the path such that the data of all levels up to and
+ * including that level fits within the target sizes of the paths up to and
+ * including the returned path.
+ */
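+// Illustrative example (assumed values, not from the original source): with
+// cf_paths target sizes {300MB, unlimited}, max_bytes_for_level_base = 100MB
+// and max_bytes_for_level_multiplier = 10, L0 (estimated as 100MB) and L1
+// (100MB) fit in path 0, while L2 (~1GB) does not and is assigned path 1.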
+uint32_t LevelCompactionBuilder::GetPathId(
+ const ImmutableCFOptions& ioptions,
+ const MutableCFOptions& mutable_cf_options, int level) {
+ uint32_t p = 0;
+ assert(!ioptions.cf_paths.empty());
+
+ // size remaining in the most recent path
+ uint64_t current_path_size = ioptions.cf_paths[0].target_size;
+
+ uint64_t level_size;
+ int cur_level = 0;
+
+ // max_bytes_for_level_base denotes L1 size.
+ // We estimate L0 size to be the same as L1.
+ level_size = mutable_cf_options.max_bytes_for_level_base;
+
+ // Last path is the fallback
+ while (p < ioptions.cf_paths.size() - 1) {
+ if (level_size <= current_path_size) {
+ if (cur_level == level) {
+ // Does desired level fit in this path?
+ return p;
+ } else {
+ current_path_size -= level_size;
+ if (cur_level > 0) {
+ if (ioptions.level_compaction_dynamic_level_bytes) {
+ // Currently, level_compaction_dynamic_level_bytes is ignored when
+ // multiple db paths are specified.
+ // https://github.com/facebook/rocksdb/blob/main/db/column_family.cc.
+ // Still, adding this check to avoid accidentally using
+ // max_bytes_for_level_multiplier_additional
+ level_size = static_cast<uint64_t>(
+ level_size * mutable_cf_options.max_bytes_for_level_multiplier);
+ } else {
+ level_size = static_cast<uint64_t>(
+ level_size * mutable_cf_options.max_bytes_for_level_multiplier *
+ mutable_cf_options.MaxBytesMultiplerAdditional(cur_level));
+ }
+ }
+ cur_level++;
+ continue;
+ }
+ }
+ p++;
+ current_path_size = ioptions.cf_paths[p].target_size;
+ }
+ return p;
+}
+
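+// Tries to pick a span of the oldest L0 files whose key ranges neither
+// overlap each other nor any file in the output level, so that they can be
+// trivially moved down without being rewritten.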
+bool LevelCompactionBuilder::TryPickL0TrivialMove() {
+ if (vstorage_->base_level() <= 0) {
+ return false;
+ }
+ if (start_level_ == 0 && mutable_cf_options_.compression_per_level.empty() &&
+ !vstorage_->LevelFiles(output_level_).empty() &&
+ ioptions_.db_paths.size() <= 1) {
+ // Try to pick a trivial move from L0 to L1. We start from the oldest
+ // file and keep expanding to newer files as long as they would still
+ // form a trivial move.
+ // For now we don't support this when
+ // mutable_cf_options_.compression_per_level is set, to avoid the extra
+ // logic of determining whether L0 can be trivially moved to the next level.
+ // We also skip the case where the output level is empty, since in that
+ // case at least the oldest file would qualify for a trivial move, and
+ // that would be surprising behavior with few benefits.
+
+ // We search from the oldest file to the newest. In theory, files in the
+ // middle could form a trivial move too, but that is probably uncommon
+ // and we ignore those cases for simplicity.
+ const std::vector<FileMetaData*>& level_files =
+ vstorage_->LevelFiles(start_level_);
+
+ InternalKey my_smallest, my_largest;
+ for (auto it = level_files.rbegin(); it != level_files.rend(); ++it) {
+ CompactionInputFiles output_level_inputs;
+ output_level_inputs.level = output_level_;
+ FileMetaData* file = *it;
+ if (it == level_files.rbegin()) {
+ my_smallest = file->smallest;
+ my_largest = file->largest;
+ } else {
+ if (compaction_picker_->icmp()->Compare(file->largest, my_smallest) <
+ 0) {
+ my_smallest = file->smallest;
+ } else if (compaction_picker_->icmp()->Compare(file->smallest,
+ my_largest) > 0) {
+ my_largest = file->largest;
+ } else {
+ break;
+ }
+ }
+ vstorage_->GetOverlappingInputs(output_level_, &my_smallest, &my_largest,
+ &output_level_inputs.files);
+ if (output_level_inputs.empty()) {
+ assert(!file->being_compacted);
+ start_level_inputs_.files.push_back(file);
+ } else {
+ break;
+ }
+ }
+ }
+
+ if (!start_level_inputs_.empty()) {
+ // Sort files by key range. Not sure it's 100% necessary, but it's cleaner
+ // to always keep files sorted by key when the key ranges don't overlap.
+ std::sort(start_level_inputs_.files.begin(),
+ start_level_inputs_.files.end(),
+ [icmp = compaction_picker_->icmp()](FileMetaData* f1,
+ FileMetaData* f2) -> bool {
+ return (icmp->Compare(f1->smallest, f2->smallest) < 0);
+ });
+
+ is_l0_trivial_move_ = true;
+ return true;
+ }
+ return false;
+}
+
+bool LevelCompactionBuilder::TryExtendNonL0TrivialMove(int start_index) {
+ if (start_level_inputs_.size() == 1 &&
+ (ioptions_.db_paths.empty() || ioptions_.db_paths.size() == 1) &&
+ (mutable_cf_options_.compression_per_level.empty())) {
+ // Only the file at `start_index` has been picked, and it is likely a
+ // trivial move. Try to expand as long as it remains a trivial move, but
+ // not beyond max_compaction_bytes or 4 files, so that we don't create too
+ // much compaction pressure for the next level.
+ // Skip this if there is more than one DB path, as it would be hard to
+ // predict whether it would still be a trivial move.
+ const std::vector<FileMetaData*>& level_files =
+ vstorage_->LevelFiles(start_level_);
+ const size_t kMaxMultiTrivialMove = 4;
+ FileMetaData* initial_file = start_level_inputs_.files[0];
+ size_t total_size = initial_file->fd.GetFileSize();
+ CompactionInputFiles output_level_inputs;
+ output_level_inputs.level = output_level_;
+ for (int i = start_index + 1;
+ i < static_cast<int>(level_files.size()) &&
+ start_level_inputs_.size() < kMaxMultiTrivialMove;
+ i++) {
+ FileMetaData* next_file = level_files[i];
+ if (next_file->being_compacted) {
+ break;
+ }
+ vstorage_->GetOverlappingInputs(output_level_, &(initial_file->smallest),
+ &(next_file->largest),
+ &output_level_inputs.files);
+ if (!output_level_inputs.empty()) {
+ break;
+ }
+ if (i < static_cast<int>(level_files.size()) - 1 &&
+ compaction_picker_->icmp()
+ ->user_comparator()
+ ->CompareWithoutTimestamp(
+ next_file->largest.user_key(),
+ level_files[i + 1]->smallest.user_key()) == 0) {
+ TEST_SYNC_POINT_CALLBACK(
+ "LevelCompactionBuilder::TryExtendNonL0TrivialMove:NoCleanCut",
+ nullptr);
+ // Not a clean cut after adding the next file. Skip.
+ break;
+ }
+ total_size += next_file->fd.GetFileSize();
+ if (total_size > mutable_cf_options_.max_compaction_bytes) {
+ break;
+ }
+ start_level_inputs_.files.push_back(next_file);
+ }
+ return start_level_inputs_.size() > 1;
+ }
+ return false;
+}
+
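+// Picks the next file on start_level_ in compaction-priority order
+// (FilesByCompactionPri) that is not being compacted, expands it to a clean
+// cut, and verifies that the overlapping output-level files are not locked by
+// a pending compaction. For L0 it first tries a trivial move to the base
+// level; for other levels with no output overlap, it may try to extend the
+// pick into a multi-file trivial move.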
+bool LevelCompactionBuilder::PickFileToCompact() {
+ // Level 0 files are overlapping, so we cannot pick more than one
+ // concurrent compaction at this level. This could be made better by
+ // looking at the key ranges that are being compacted at level 0.
+ if (start_level_ == 0 &&
+ !compaction_picker_->level0_compactions_in_progress()->empty()) {
+ TEST_SYNC_POINT("LevelCompactionPicker::PickCompactionBySize:0");
+ return false;
+ }
+
+ start_level_inputs_.clear();
+ start_level_inputs_.level = start_level_;
+
+ assert(start_level_ >= 0);
+
+ if (TryPickL0TrivialMove()) {
+ return true;
+ }
+
+ const std::vector<FileMetaData*>& level_files =
+ vstorage_->LevelFiles(start_level_);
+
+ // Pick the file with the highest score in this level that is not already
+ // being compacted.
+ const std::vector<int>& file_scores =
+ vstorage_->FilesByCompactionPri(start_level_);
+
+ unsigned int cmp_idx;
+ for (cmp_idx = vstorage_->NextCompactionIndex(start_level_);
+ cmp_idx < file_scores.size(); cmp_idx++) {
+ int index = file_scores[cmp_idx];
+ auto* f = level_files[index];
+
+ // do not pick a file to compact if it is being compacted
+ // from n-1 level.
+ if (f->being_compacted) {
+ if (ioptions_.compaction_pri == kRoundRobin) {
+ // TODO(zichen): this file may be involved in a compaction from
+ // an upper level, so we cannot advance the cursor for the round-robin
+ // policy. Currently, we do not pick any file to compact in this case.
+ // We should fix this later so that a compaction is still picked while
+ // the cursor is not advanced.
+ return false;
+ }
+ continue;
+ }
+
+ start_level_inputs_.files.push_back(f);
+ if (!compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_,
+ &start_level_inputs_) ||
+ compaction_picker_->FilesRangeOverlapWithCompaction(
+ {start_level_inputs_}, output_level_,
+ Compaction::EvaluatePenultimateLevel(
+ vstorage_, ioptions_, start_level_, output_level_))) {
+ // A locked (pending compaction) input-level file was pulled in due to
+ // user-key overlap.
+ start_level_inputs_.clear();
+
+ if (ioptions_.compaction_pri == kRoundRobin) {
+ return false;
+ }
+ continue;
+ }
+
+ // Now that input level is fully expanded, we check whether any output
+ // files are locked due to pending compaction.
+ //
+ // Note we rely on ExpandInputsToCleanCut() to tell us whether any output-
+ // level files are locked, not just the extra ones pulled in for user-key
+ // overlap.
+ InternalKey smallest, largest;
+ compaction_picker_->GetRange(start_level_inputs_, &smallest, &largest);
+ CompactionInputFiles output_level_inputs;
+ output_level_inputs.level = output_level_;
+ vstorage_->GetOverlappingInputs(output_level_, &smallest, &largest,
+ &output_level_inputs.files);
+ if (output_level_inputs.empty()) {
+ if (TryExtendNonL0TrivialMove(index)) {
+ break;
+ }
+ } else {
+ if (!compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_,
+ &output_level_inputs)) {
+ start_level_inputs_.clear();
+ if (ioptions_.compaction_pri == kRoundRobin) {
+ return false;
+ }
+ continue;
+ }
+ }
+
+ base_index_ = index;
+ break;
+ }
+
+ // store where to start the iteration in the next call to PickCompaction
+ if (ioptions_.compaction_pri != kRoundRobin) {
+ vstorage_->SetNextCompactionIndex(start_level_, cmp_idx);
+ }
+ return start_level_inputs_.size() > 0;
+}
+
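+// Picks a span of L0 files, starting from the newest, for an intra-L0
+// (L0 -> L0) compaction; see the comment on the declaration above.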
+bool LevelCompactionBuilder::PickIntraL0Compaction() {
+ start_level_inputs_.clear();
+ const std::vector<FileMetaData*>& level_files =
+ vstorage_->LevelFiles(0 /* level */);
+ if (level_files.size() <
+ static_cast<size_t>(
+ mutable_cf_options_.level0_file_num_compaction_trigger + 2) ||
+ level_files[0]->being_compacted) {
+ // If L0 isn't accumulating many files beyond the regular trigger, don't
+ // resort to L0->L0 compaction yet.
+ return false;
+ }
+ return FindIntraL0Compaction(level_files, kMinFilesForIntraL0Compaction,
+ std::numeric_limits<uint64_t>::max(),
+ mutable_cf_options_.max_compaction_bytes,
+ &start_level_inputs_, earliest_mem_seqno_);
+}
+} // namespace
+
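+// Delegates leveled compaction picking to a LevelCompactionBuilder created
+// for this column family, version, and set of options.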
+Compaction* LevelCompactionPicker::PickCompaction(
+ const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+ const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
+ LogBuffer* log_buffer, SequenceNumber earliest_mem_seqno) {
+ LevelCompactionBuilder builder(cf_name, vstorage, earliest_mem_seqno, this,
+ log_buffer, mutable_cf_options, ioptions_,
+ mutable_db_options);
+ return builder.PickCompaction();
+}
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/compaction_picker_level.h b/src/rocksdb/db/compaction/compaction_picker_level.h
new file mode 100644
index 000000000..42a9b60a6
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_picker_level.h
@@ -0,0 +1,33 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include "db/compaction/compaction_picker.h"
+
+namespace ROCKSDB_NAMESPACE {
+// Picking compactions for leveled compaction. See wiki page
+// https://github.com/facebook/rocksdb/wiki/Leveled-Compaction
+// for description of Leveled compaction.
+class LevelCompactionPicker : public CompactionPicker {
+ public:
+ LevelCompactionPicker(const ImmutableOptions& ioptions,
+ const InternalKeyComparator* icmp)
+ : CompactionPicker(ioptions, icmp) {}
+ virtual Compaction* PickCompaction(
+ const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+ const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
+ LogBuffer* log_buffer,
+ SequenceNumber earliest_memtable_seqno = kMaxSequenceNumber) override;
+
+ virtual bool NeedsCompaction(
+ const VersionStorageInfo* vstorage) const override;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/compaction_picker_test.cc b/src/rocksdb/db/compaction/compaction_picker_test.cc
new file mode 100644
index 000000000..2e2e566c0
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_picker_test.cc
@@ -0,0 +1,3964 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include <limits>
+#include <string>
+#include <utility>
+
+#include "db/compaction/compaction.h"
+#include "db/compaction/compaction_picker_fifo.h"
+#include "db/compaction/compaction_picker_level.h"
+#include "db/compaction/compaction_picker_universal.h"
+#include "db/compaction/file_pri.h"
+#include "rocksdb/advanced_options.h"
+#include "table/unique_id_impl.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class CountingLogger : public Logger {
+ public:
+ using Logger::Logv;
+ void Logv(const char* /*format*/, va_list /*ap*/) override { log_count++; }
+ size_t log_count;
+};
+
+class CompactionPickerTestBase : public testing::Test {
+ public:
+ const Comparator* ucmp_;
+ InternalKeyComparator icmp_;
+ Options options_;
+ ImmutableOptions ioptions_;
+ MutableCFOptions mutable_cf_options_;
+ MutableDBOptions mutable_db_options_;
+ LevelCompactionPicker level_compaction_picker;
+ std::string cf_name_;
+ CountingLogger logger_;
+ LogBuffer log_buffer_;
+ uint32_t file_num_;
+ CompactionOptionsFIFO fifo_options_;
+ std::unique_ptr<VersionStorageInfo> vstorage_;
+ std::vector<std::unique_ptr<FileMetaData>> files_;
+ // does not own FileMetaData
+ std::unordered_map<uint32_t, std::pair<FileMetaData*, int>> file_map_;
+ // input files to the compaction process.
+ std::vector<CompactionInputFiles> input_files_;
+ int compaction_level_start_;
+
+ explicit CompactionPickerTestBase(const Comparator* _ucmp)
+ : ucmp_(_ucmp),
+ icmp_(ucmp_),
+ options_(CreateOptions(ucmp_)),
+ ioptions_(options_),
+ mutable_cf_options_(options_),
+ mutable_db_options_(),
+ level_compaction_picker(ioptions_, &icmp_),
+ cf_name_("dummy"),
+ log_buffer_(InfoLogLevel::INFO_LEVEL, &logger_),
+ file_num_(1),
+ vstorage_(nullptr) {
+ mutable_cf_options_.ttl = 0;
+ mutable_cf_options_.periodic_compaction_seconds = 0;
+ // ioptions_.compaction_pri = kMinOverlappingRatio has its own set of
+ // tests to cover.
+ ioptions_.compaction_pri = kByCompensatedSize;
+ fifo_options_.max_table_files_size = 1;
+ mutable_cf_options_.RefreshDerivedOptions(ioptions_);
+ ioptions_.cf_paths.emplace_back("dummy",
+ std::numeric_limits<uint64_t>::max());
+ }
+
+ ~CompactionPickerTestBase() override {}
+
+ void NewVersionStorage(int num_levels, CompactionStyle style) {
+ DeleteVersionStorage();
+ options_.num_levels = num_levels;
+ vstorage_.reset(new VersionStorageInfo(&icmp_, ucmp_, options_.num_levels,
+ style, nullptr, false));
+ vstorage_->PrepareForVersionAppend(ioptions_, mutable_cf_options_);
+ }
+
+ // Create a new VersionStorageInfo object so we can add more files and then
+ // merge it with the existing VersionStorageInfo.
+ void AddVersionStorage() {
+ temp_vstorage_.reset(new VersionStorageInfo(
+ &icmp_, ucmp_, options_.num_levels, ioptions_.compaction_style,
+ vstorage_.get(), false));
+ }
+
+ void DeleteVersionStorage() {
+ vstorage_.reset();
+ temp_vstorage_.reset();
+ files_.clear();
+ file_map_.clear();
+ input_files_.clear();
+ }
+
+ // REQUIRES: smallest and largest are c-style strings ending with '\0'
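+ // Example usage (as in the tests below): Add(0, 1U, "150", "200") registers
+ // file #1 covering the user-key range ["150", "200"] in level 0 with the
+ // default file size of 1.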
+ void Add(int level, uint32_t file_number, const char* smallest,
+ const char* largest, uint64_t file_size = 1, uint32_t path_id = 0,
+ SequenceNumber smallest_seq = 100, SequenceNumber largest_seq = 100,
+ size_t compensated_file_size = 0, bool marked_for_compact = false,
+ Temperature temperature = Temperature::kUnknown,
+ uint64_t oldest_ancestor_time = kUnknownOldestAncesterTime,
+ Slice ts_of_smallest = Slice(), Slice ts_of_largest = Slice()) {
+ assert(ts_of_smallest.size() == ucmp_->timestamp_size());
+ assert(ts_of_largest.size() == ucmp_->timestamp_size());
+
+ VersionStorageInfo* vstorage;
+ if (temp_vstorage_) {
+ vstorage = temp_vstorage_.get();
+ } else {
+ vstorage = vstorage_.get();
+ }
+ assert(level < vstorage->num_levels());
+ char* smallest_key_buf = nullptr;
+ char* largest_key_buf = nullptr;
+
+ if (!ts_of_smallest.empty()) {
+ smallest_key_buf = new char[strlen(smallest) + ucmp_->timestamp_size()];
+ memcpy(smallest_key_buf, smallest, strlen(smallest));
+ memcpy(smallest_key_buf + strlen(smallest), ts_of_smallest.data(),
+ ucmp_->timestamp_size());
+ largest_key_buf = new char[strlen(largest) + ucmp_->timestamp_size()];
+ memcpy(largest_key_buf, largest, strlen(largest));
+ memcpy(largest_key_buf + strlen(largest), ts_of_largest.data(),
+ ucmp_->timestamp_size());
+ }
+
+ InternalKey smallest_ikey = InternalKey(
+ smallest_key_buf ? Slice(smallest_key_buf,
+ ucmp_->timestamp_size() + strlen(smallest))
+ : smallest,
+ smallest_seq, kTypeValue);
+ InternalKey largest_ikey = InternalKey(
+ largest_key_buf
+ ? Slice(largest_key_buf, ucmp_->timestamp_size() + strlen(largest))
+ : largest,
+ largest_seq, kTypeValue);
+
+ FileMetaData* f = new FileMetaData(
+ file_number, path_id, file_size, smallest_ikey, largest_ikey,
+ smallest_seq, largest_seq, marked_for_compact, temperature,
+ kInvalidBlobFileNumber, kUnknownOldestAncesterTime,
+ kUnknownFileCreationTime, kUnknownFileChecksum,
+ kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+ f->compensated_file_size =
+ (compensated_file_size != 0) ? compensated_file_size : file_size;
+ f->oldest_ancester_time = oldest_ancestor_time;
+ vstorage->AddFile(level, f);
+ files_.emplace_back(f);
+ file_map_.insert({file_number, {f, level}});
+
+ delete[] smallest_key_buf;
+ delete[] largest_key_buf;
+ }
+
+ void SetCompactionInputFilesLevels(int level_count, int start_level) {
+ input_files_.resize(level_count);
+ for (int i = 0; i < level_count; ++i) {
+ input_files_[i].level = start_level + i;
+ }
+ compaction_level_start_ = start_level;
+ }
+
+ void AddToCompactionFiles(uint32_t file_number) {
+ auto iter = file_map_.find(file_number);
+ assert(iter != file_map_.end());
+ int level = iter->second.second;
+ assert(level < vstorage_->num_levels());
+ input_files_[level - compaction_level_start_].files.emplace_back(
+ iter->second.first);
+ }
+
+ void UpdateVersionStorageInfo() {
+ if (temp_vstorage_) {
+ VersionBuilder builder(FileOptions(), &ioptions_, nullptr,
+ vstorage_.get(), nullptr);
+ ASSERT_OK(builder.SaveTo(temp_vstorage_.get()));
+ vstorage_ = std::move(temp_vstorage_);
+ }
+ vstorage_->PrepareForVersionAppend(ioptions_, mutable_cf_options_);
+ vstorage_->ComputeCompactionScore(ioptions_, mutable_cf_options_);
+ vstorage_->SetFinalized();
+ }
+
+ private:
+ Options CreateOptions(const Comparator* ucmp) const {
+ Options opts;
+ opts.comparator = ucmp;
+ return opts;
+ }
+
+ std::unique_ptr<VersionStorageInfo> temp_vstorage_;
+};
+
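+// CompactionPickerTest exercises the pickers with the plain bytewise
+// comparator, while CompactionPickerU64TsTest uses a bytewise comparator
+// wrapped with a 64-bit timestamp suffix.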
+class CompactionPickerTest : public CompactionPickerTestBase {
+ public:
+ explicit CompactionPickerTest()
+ : CompactionPickerTestBase(BytewiseComparator()) {}
+
+ ~CompactionPickerTest() override {}
+};
+
+class CompactionPickerU64TsTest : public CompactionPickerTestBase {
+ public:
+ explicit CompactionPickerU64TsTest()
+ : CompactionPickerTestBase(test::BytewiseComparatorWithU64TsWrapper()) {}
+
+ ~CompactionPickerU64TsTest() override {}
+};
+
+TEST_F(CompactionPickerTest, Empty) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ UpdateVersionStorageInfo();
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() == nullptr);
+}
+
+TEST_F(CompactionPickerTest, Single) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+ Add(0, 1U, "p", "q");
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() == nullptr);
+}
+
+TEST_F(CompactionPickerTest, Level0Trigger) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+ Add(0, 1U, "150", "200");
+ Add(0, 2U, "200", "250");
+
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_files(0));
+ ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, Level1Trigger) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ Add(1, 66U, "150", "200", 1000000000U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ ASSERT_EQ(66U, compaction->input(0, 0)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, Level1Trigger2) {
+ mutable_cf_options_.target_file_size_base = 10000000000;
+ mutable_cf_options_.RefreshDerivedOptions(ioptions_);
+ NewVersionStorage(6, kCompactionStyleLevel);
+ Add(1, 66U, "150", "200", 1000000001U);
+ Add(1, 88U, "201", "300", 1000000000U);
+ Add(2, 6U, "150", "179", 1000000000U);
+ Add(2, 7U, "180", "220", 1000000000U);
+ Add(2, 8U, "221", "300", 1000000000U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ ASSERT_EQ(2U, compaction->num_input_files(1));
+ ASSERT_EQ(66U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(6U, compaction->input(1, 0)->fd.GetNumber());
+ ASSERT_EQ(7U, compaction->input(1, 1)->fd.GetNumber());
+ ASSERT_EQ(uint64_t{1073741824}, compaction->OutputFilePreallocationSize());
+}
+
+TEST_F(CompactionPickerTest, LevelMaxScore) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ mutable_cf_options_.target_file_size_base = 10000000;
+ mutable_cf_options_.max_bytes_for_level_base = 10 * 1024 * 1024;
+ mutable_cf_options_.RefreshDerivedOptions(ioptions_);
+ Add(0, 1U, "150", "200", 1000000U);
+ // Level 1 score 1.2
+ Add(1, 66U, "150", "200", 6000000U);
+ Add(1, 88U, "201", "300", 6000000U);
+ // Level 2 score 1.8. File 7 is the largest. Should be picked
+ Add(2, 6U, "150", "179", 60000000U);
+ Add(2, 7U, "180", "220", 60000001U);
+ Add(2, 8U, "221", "300", 60000000U);
+ // Level 3 score slightly larger than 1
+ Add(3, 26U, "150", "170", 260000000U);
+ Add(3, 27U, "171", "179", 260000000U);
+ Add(3, 28U, "191", "220", 260000000U);
+ Add(3, 29U, "221", "300", 260000000U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ ASSERT_EQ(7U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(mutable_cf_options_.target_file_size_base +
+ mutable_cf_options_.target_file_size_base / 10,
+ compaction->OutputFilePreallocationSize());
+}
+
+TEST_F(CompactionPickerTest, NeedsCompactionLevel) {
+ const int kLevels = 6;
+ const int kFileCount = 20;
+
+ for (int level = 0; level < kLevels - 1; ++level) {
+ NewVersionStorage(kLevels, kCompactionStyleLevel);
+ uint64_t file_size = vstorage_->MaxBytesForLevel(level) * 2 / kFileCount;
+ for (int file_count = 1; file_count <= kFileCount; ++file_count) {
+ // start a brand new version in each test.
+ NewVersionStorage(kLevels, kCompactionStyleLevel);
+ for (int i = 0; i < file_count; ++i) {
+ Add(level, i, std::to_string((i + 100) * 1000).c_str(),
+ std::to_string((i + 100) * 1000 + 999).c_str(), file_size, 0,
+ i * 100, i * 100 + 99);
+ }
+ UpdateVersionStorageInfo();
+ ASSERT_EQ(vstorage_->CompactionScoreLevel(0), level);
+ ASSERT_EQ(level_compaction_picker.NeedsCompaction(vstorage_.get()),
+ vstorage_->CompactionScore(0) >= 1);
+ // release the version storage
+ DeleteVersionStorage();
+ }
+ }
+}
+
+TEST_F(CompactionPickerTest, Level0TriggerDynamic) {
+ int num_levels = ioptions_.num_levels;
+ ioptions_.level_compaction_dynamic_level_bytes = true;
+ mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+ mutable_cf_options_.max_bytes_for_level_base = 200;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+ NewVersionStorage(num_levels, kCompactionStyleLevel);
+ Add(0, 1U, "150", "200");
+ Add(0, 2U, "200", "250");
+
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_files(0));
+ ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber());
+ ASSERT_EQ(1, static_cast<int>(compaction->num_input_levels()));
+ ASSERT_EQ(num_levels - 1, compaction->output_level());
+}
+
+TEST_F(CompactionPickerTest, Level0TriggerDynamic2) {
+ int num_levels = ioptions_.num_levels;
+ ioptions_.level_compaction_dynamic_level_bytes = true;
+ mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+ mutable_cf_options_.max_bytes_for_level_base = 200;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+ NewVersionStorage(num_levels, kCompactionStyleLevel);
+ Add(0, 1U, "150", "200");
+ Add(0, 2U, "200", "250");
+ Add(num_levels - 1, 3U, "200", "250", 300U);
+
+ UpdateVersionStorageInfo();
+ ASSERT_EQ(vstorage_->base_level(), num_levels - 2);
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_files(0));
+ ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber());
+ ASSERT_EQ(1, static_cast<int>(compaction->num_input_levels()));
+ ASSERT_EQ(num_levels - 2, compaction->output_level());
+}
+
+TEST_F(CompactionPickerTest, Level0TriggerDynamic3) {
+ int num_levels = ioptions_.num_levels;
+ ioptions_.level_compaction_dynamic_level_bytes = true;
+ mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+ mutable_cf_options_.max_bytes_for_level_base = 200;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+ NewVersionStorage(num_levels, kCompactionStyleLevel);
+ Add(0, 1U, "150", "200");
+ Add(0, 2U, "200", "250");
+ Add(num_levels - 1, 3U, "200", "250", 300U);
+ Add(num_levels - 1, 4U, "300", "350", 3000U);
+
+ UpdateVersionStorageInfo();
+ ASSERT_EQ(vstorage_->base_level(), num_levels - 3);
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_files(0));
+ ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber());
+ ASSERT_EQ(1, static_cast<int>(compaction->num_input_levels()));
+ ASSERT_EQ(num_levels - 3, compaction->output_level());
+}
+
+TEST_F(CompactionPickerTest, Level0TriggerDynamic4) {
+ int num_levels = ioptions_.num_levels;
+ ioptions_.level_compaction_dynamic_level_bytes = true;
+ mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+ mutable_cf_options_.max_bytes_for_level_base = 200;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+
+ NewVersionStorage(num_levels, kCompactionStyleLevel);
+ Add(0, 1U, "150", "200");
+ Add(0, 2U, "200", "250");
+ Add(num_levels - 1, 3U, "200", "250", 300U);
+ Add(num_levels - 1, 4U, "300", "350", 3000U);
+ Add(num_levels - 3, 5U, "150", "180", 3U);
+ Add(num_levels - 3, 6U, "181", "300", 3U);
+ Add(num_levels - 3, 7U, "400", "450", 3U);
+
+ UpdateVersionStorageInfo();
+ ASSERT_EQ(vstorage_->base_level(), num_levels - 3);
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_files(0));
+ ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber());
+ ASSERT_EQ(2U, compaction->num_input_files(1));
+ ASSERT_EQ(num_levels - 3, compaction->level(1));
+ ASSERT_EQ(5U, compaction->input(1, 0)->fd.GetNumber());
+ ASSERT_EQ(6U, compaction->input(1, 1)->fd.GetNumber());
+ ASSERT_EQ(2, static_cast<int>(compaction->num_input_levels()));
+ ASSERT_EQ(num_levels - 3, compaction->output_level());
+}
+
+TEST_F(CompactionPickerTest, LevelTriggerDynamic4) {
+ int num_levels = ioptions_.num_levels;
+ ioptions_.level_compaction_dynamic_level_bytes = true;
+ ioptions_.compaction_pri = kMinOverlappingRatio;
+ mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+ mutable_cf_options_.max_bytes_for_level_base = 200;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+ NewVersionStorage(num_levels, kCompactionStyleLevel);
+ Add(0, 1U, "150", "200");
+ Add(num_levels - 1, 2U, "200", "250", 300U);
+ Add(num_levels - 1, 3U, "300", "350", 3000U);
+ Add(num_levels - 1, 4U, "400", "450", 3U);
+ Add(num_levels - 2, 5U, "150", "180", 300U);
+ Add(num_levels - 2, 6U, "181", "350", 500U);
+ Add(num_levels - 2, 7U, "400", "450", 200U);
+
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ ASSERT_EQ(5U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(0, compaction->num_input_files(1));
+ ASSERT_EQ(1U, compaction->num_input_levels());
+ ASSERT_EQ(num_levels - 1, compaction->output_level());
+}
+
+// Universal and FIFO Compactions are not supported in ROCKSDB_LITE
+#ifndef ROCKSDB_LITE
+TEST_F(CompactionPickerTest, NeedsCompactionUniversal) {
+ NewVersionStorage(1, kCompactionStyleUniversal);
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+ UpdateVersionStorageInfo();
+ // must return false when there are no files.
+ ASSERT_EQ(universal_compaction_picker.NeedsCompaction(vstorage_.get()),
+ false);
+
+ // verify the trigger given different number of L0 files.
+ for (int i = 1;
+ i <= mutable_cf_options_.level0_file_num_compaction_trigger * 2; ++i) {
+ NewVersionStorage(1, kCompactionStyleUniversal);
+ Add(0, i, std::to_string((i + 100) * 1000).c_str(),
+ std::to_string((i + 100) * 1000 + 999).c_str(), 1000000, 0, i * 100,
+ i * 100 + 99);
+ UpdateVersionStorageInfo();
+ ASSERT_EQ(level_compaction_picker.NeedsCompaction(vstorage_.get()),
+ vstorage_->CompactionScore(0) >= 1);
+ }
+}
+
+TEST_F(CompactionPickerTest, CompactionUniversalIngestBehindReservedLevel) {
+ const uint64_t kFileSize = 100000;
+ NewVersionStorage(1, kCompactionStyleUniversal);
+ ioptions_.allow_ingest_behind = true;
+ ioptions_.num_levels = 3;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+ UpdateVersionStorageInfo();
+ // must return false when there are no files.
+ ASSERT_EQ(universal_compaction_picker.NeedsCompaction(vstorage_.get()),
+ false);
+
+ NewVersionStorage(3, kCompactionStyleUniversal);
+
+ Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
+ Add(0, 2U, "201", "250", kFileSize, 0, 401, 450);
+ Add(0, 4U, "260", "300", kFileSize, 0, 260, 300);
+ Add(1, 5U, "100", "151", kFileSize, 0, 200, 251);
+ Add(1, 3U, "301", "350", kFileSize, 0, 101, 150);
+ Add(2, 6U, "120", "200", kFileSize, 0, 20, 100);
+
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+
+ // output level should be the one above the bottom-most
+ ASSERT_EQ(1, compaction->output_level());
+}
+
+// Tests whether files can be trivially moved in multi-level
+// universal compaction when the allow_trivial_move option is set.
+// In this test the input files overlap, so they cannot
+// be trivially moved.
+TEST_F(CompactionPickerTest, CannotTrivialMoveUniversal) {
+ const uint64_t kFileSize = 100000;
+
+ mutable_cf_options_.compaction_options_universal.allow_trivial_move = true;
+ NewVersionStorage(1, kCompactionStyleUniversal);
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+ UpdateVersionStorageInfo();
+ // must return false when there are no files.
+ ASSERT_EQ(universal_compaction_picker.NeedsCompaction(vstorage_.get()),
+ false);
+
+ NewVersionStorage(3, kCompactionStyleUniversal);
+
+ Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
+ Add(0, 2U, "201", "250", kFileSize, 0, 401, 450);
+ Add(0, 4U, "260", "300", kFileSize, 0, 260, 300);
+ Add(1, 5U, "100", "151", kFileSize, 0, 200, 251);
+ Add(1, 3U, "301", "350", kFileSize, 0, 101, 150);
+ Add(2, 6U, "120", "200", kFileSize, 0, 20, 100);
+
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+
+ ASSERT_TRUE(!compaction->is_trivial_move());
+}
+// Tests whether files can be trivially moved in multi-level
+// universal compaction when the allow_trivial_move option is set.
+// In this test the input files don't overlap, so they should
+// be trivially moved.
+TEST_F(CompactionPickerTest, AllowsTrivialMoveUniversal) {
+ const uint64_t kFileSize = 100000;
+
+ mutable_cf_options_.compaction_options_universal.allow_trivial_move = true;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+ NewVersionStorage(3, kCompactionStyleUniversal);
+
+ Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
+ Add(0, 2U, "201", "250", kFileSize, 0, 401, 450);
+ Add(0, 4U, "260", "300", kFileSize, 0, 260, 300);
+ Add(1, 5U, "010", "080", kFileSize, 0, 200, 251);
+ Add(2, 3U, "301", "350", kFileSize, 0, 101, 150);
+
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+
+ ASSERT_TRUE(compaction->is_trivial_move());
+}
+
+TEST_F(CompactionPickerTest, UniversalPeriodicCompaction1) {
+ // The case where universal periodic compaction can be picked
+ // with some newer files being compacted.
+ const uint64_t kFileSize = 100000;
+
+ mutable_cf_options_.periodic_compaction_seconds = 1000;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+ NewVersionStorage(5, kCompactionStyleUniversal);
+
+ Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
+ Add(0, 2U, "201", "250", kFileSize, 0, 401, 450);
+ Add(0, 4U, "260", "300", kFileSize, 0, 260, 300);
+ Add(3, 5U, "010", "080", kFileSize, 0, 200, 251);
+ Add(4, 3U, "301", "350", kFileSize, 0, 101, 150);
+ Add(4, 6U, "501", "750", kFileSize, 0, 101, 150);
+
+ file_map_[2].first->being_compacted = true;
+ UpdateVersionStorageInfo();
+ vstorage_->TEST_AddFileMarkedForPeriodicCompaction(4, file_map_[3].first);
+
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+
+ ASSERT_TRUE(compaction);
+ ASSERT_EQ(4, compaction->output_level());
+ ASSERT_EQ(0, compaction->start_level());
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+}
+
+TEST_F(CompactionPickerTest, UniversalPeriodicCompaction2) {
+ // The case where universal periodic compaction does not pick a
+ // compaction of only one level if that level doesn't cover
+ // any file marked for periodic compaction.
+ const uint64_t kFileSize = 100000;
+
+ mutable_cf_options_.periodic_compaction_seconds = 1000;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+ NewVersionStorage(5, kCompactionStyleUniversal);
+
+ Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
+ Add(3, 5U, "010", "080", kFileSize, 0, 200, 251);
+ Add(4, 3U, "301", "350", kFileSize, 0, 101, 150);
+ Add(4, 6U, "501", "750", kFileSize, 0, 101, 150);
+
+ file_map_[5].first->being_compacted = true;
+ UpdateVersionStorageInfo();
+ vstorage_->TEST_AddFileMarkedForPeriodicCompaction(0, file_map_[1].first);
+
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+
+ ASSERT_FALSE(compaction);
+}
+
+TEST_F(CompactionPickerTest, UniversalPeriodicCompaction3) {
+ // The case where universal periodic compaction does not
+ // pick up only the last sorted run (an L0 file) if it isn't
+ // marked for periodic compaction.
+ const uint64_t kFileSize = 100000;
+
+ mutable_cf_options_.periodic_compaction_seconds = 1000;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+ NewVersionStorage(5, kCompactionStyleUniversal);
+
+ Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
+ Add(0, 5U, "010", "080", kFileSize, 0, 200, 251);
+ Add(0, 6U, "501", "750", kFileSize, 0, 101, 150);
+
+ file_map_[5].first->being_compacted = true;
+ UpdateVersionStorageInfo();
+ vstorage_->TEST_AddFileMarkedForPeriodicCompaction(0, file_map_[1].first);
+
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+
+ ASSERT_FALSE(compaction);
+}
+
+TEST_F(CompactionPickerTest, UniversalPeriodicCompaction4) {
+ // The case where universal periodic compaction couldn't form
+ // a compaction that includes any file marked for periodic compaction.
+ // Right now we form the compaction anyway if there is more than one
+ // sorted run. The case is included here just to validate that it
+ // doesn't crash.
+ const uint64_t kFileSize = 100000;
+
+ mutable_cf_options_.periodic_compaction_seconds = 1000;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+ NewVersionStorage(5, kCompactionStyleUniversal);
+
+ Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
+ Add(2, 2U, "010", "080", kFileSize, 0, 200, 251);
+ Add(3, 5U, "010", "080", kFileSize, 0, 200, 251);
+ Add(4, 3U, "301", "350", kFileSize, 0, 101, 150);
+ Add(4, 6U, "501", "750", kFileSize, 0, 101, 150);
+
+ file_map_[2].first->being_compacted = true;
+ UpdateVersionStorageInfo();
+ vstorage_->TEST_AddFileMarkedForPeriodicCompaction(0, file_map_[2].first);
+
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(!compaction ||
+ compaction->start_level() != compaction->output_level());
+}
+
+TEST_F(CompactionPickerTest, UniversalPeriodicCompaction5) {
+ // Test single L0 file periodic compaction triggering.
+ const uint64_t kFileSize = 100000;
+
+ mutable_cf_options_.periodic_compaction_seconds = 1000;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+ NewVersionStorage(5, kCompactionStyleUniversal);
+
+ Add(0, 6U, "150", "200", kFileSize, 0, 500, 550);
+ UpdateVersionStorageInfo();
+ vstorage_->TEST_AddFileMarkedForPeriodicCompaction(0, file_map_[6].first);
+
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction);
+ ASSERT_EQ(0, compaction->start_level());
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ ASSERT_EQ(6U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(4, compaction->output_level());
+}
+
+TEST_F(CompactionPickerTest, UniversalPeriodicCompaction6) {
+ // Test single sorted run non-L0 periodic compaction
+ const uint64_t kFileSize = 100000;
+
+ mutable_cf_options_.periodic_compaction_seconds = 1000;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+ NewVersionStorage(5, kCompactionStyleUniversal);
+
+ Add(4, 5U, "150", "200", kFileSize, 0, 500, 550);
+ Add(4, 6U, "350", "400", kFileSize, 0, 500, 550);
+ UpdateVersionStorageInfo();
+ vstorage_->TEST_AddFileMarkedForPeriodicCompaction(4, file_map_[6].first);
+
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction);
+ ASSERT_EQ(4, compaction->start_level());
+ ASSERT_EQ(2U, compaction->num_input_files(0));
+ ASSERT_EQ(5U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(6U, compaction->input(0, 1)->fd.GetNumber());
+ ASSERT_EQ(4, compaction->output_level());
+}
+
+TEST_F(CompactionPickerTest, UniversalIncrementalSpace1) {
+ const uint64_t kFileSize = 100000;
+
+ mutable_cf_options_.max_compaction_bytes = 555555;
+ mutable_cf_options_.compaction_options_universal.incremental = true;
+ mutable_cf_options_.compaction_options_universal
+ .max_size_amplification_percent = 30;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+ NewVersionStorage(5, kCompactionStyleUniversal);
+
+ Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
+ Add(2, 2U, "010", "080", kFileSize, 0, 200, 251);
+ Add(3, 5U, "310", "380", kFileSize, 0, 200, 251);
+ Add(3, 6U, "410", "880", kFileSize, 0, 200, 251);
+ Add(3, 7U, "910", "980", 1, 0, 200, 251);
+ Add(4, 10U, "201", "250", kFileSize, 0, 101, 150);
+ Add(4, 11U, "301", "350", kFileSize, 0, 101, 150);
+ Add(4, 12U, "401", "450", kFileSize, 0, 101, 150);
+ Add(4, 13U, "501", "750", kFileSize, 0, 101, 150);
+ Add(4, 14U, "801", "850", kFileSize, 0, 101, 150);
+ Add(4, 15U, "901", "950", kFileSize, 0, 101, 150);
+ // Add(4, 15U, "960", "970", kFileSize, 0, 101, 150);
+
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction);
+ ASSERT_EQ(4, compaction->output_level());
+ ASSERT_EQ(3, compaction->start_level());
+ ASSERT_EQ(2U, compaction->num_input_files(0));
+ ASSERT_EQ(5U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(6U, compaction->input(0, 1)->fd.GetNumber());
+ // ASSERT_EQ(4U, compaction->num_input_files(1));
+ ASSERT_EQ(11U, compaction->input(1, 0)->fd.GetNumber());
+ ASSERT_EQ(12U, compaction->input(1, 1)->fd.GetNumber());
+ ASSERT_EQ(13U, compaction->input(1, 2)->fd.GetNumber());
+ ASSERT_EQ(14U, compaction->input(1, 3)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, UniversalIncrementalSpace2) {
+ const uint64_t kFileSize = 100000;
+
+ mutable_cf_options_.max_compaction_bytes = 400000;
+ mutable_cf_options_.compaction_options_universal.incremental = true;
+ mutable_cf_options_.compaction_options_universal
+ .max_size_amplification_percent = 30;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+ NewVersionStorage(5, kCompactionStyleUniversal);
+
+ Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
+ Add(1, 2U, "010", "080", kFileSize, 0, 200, 251);
+ Add(2, 5U, "310", "380", kFileSize, 0, 200, 251);
+ Add(2, 6U, "410", "880", kFileSize, 0, 200, 251);
+ Add(2, 7U, "910", "980", kFileSize, 0, 200, 251);
+ Add(4, 10U, "201", "250", kFileSize, 0, 101, 150);
+ Add(4, 11U, "301", "350", kFileSize, 0, 101, 150);
+ Add(4, 12U, "401", "450", kFileSize, 0, 101, 150);
+ Add(4, 13U, "501", "750", kFileSize, 0, 101, 150);
+ Add(4, 14U, "801", "850", kFileSize, 0, 101, 150);
+ Add(4, 15U, "901", "950", kFileSize, 0, 101, 150);
+
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction);
+ ASSERT_EQ(4, compaction->output_level());
+ ASSERT_EQ(2, compaction->start_level());
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ ASSERT_EQ(7U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(1U, compaction->num_input_files(1));
+ ASSERT_EQ(15U, compaction->input(1, 0)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, UniversalIncrementalSpace3) {
+ // Test bottom-level files that fall in the gaps between two upper-level
+ // files.
+ const uint64_t kFileSize = 100000;
+
+ mutable_cf_options_.max_compaction_bytes = 300000;
+ mutable_cf_options_.compaction_options_universal.incremental = true;
+ mutable_cf_options_.compaction_options_universal
+ .max_size_amplification_percent = 30;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+ NewVersionStorage(5, kCompactionStyleUniversal);
+
+ Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
+ Add(2, 2U, "010", "080", kFileSize, 0, 200, 251);
+ Add(3, 5U, "000", "180", kFileSize, 0, 200, 251);
+ Add(3, 6U, "181", "190", kFileSize, 0, 200, 251);
+ Add(3, 7U, "710", "810", kFileSize, 0, 200, 251);
+ Add(3, 8U, "820", "830", kFileSize, 0, 200, 251);
+ Add(3, 9U, "900", "991", kFileSize, 0, 200, 251);
+ Add(4, 10U, "201", "250", kFileSize, 0, 101, 150);
+ Add(4, 11U, "301", "350", kFileSize, 0, 101, 150);
+ Add(4, 12U, "401", "450", kFileSize, 0, 101, 150);
+ Add(4, 13U, "501", "750", kFileSize, 0, 101, 150);
+ Add(4, 14U, "801", "850", kFileSize, 0, 101, 150);
+ Add(4, 15U, "901", "950", kFileSize, 0, 101, 150);
+
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction);
+ ASSERT_EQ(4, compaction->output_level());
+ ASSERT_EQ(2, compaction->start_level());
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ ASSERT_EQ(2U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(2U, compaction->num_input_files(1));
+ ASSERT_EQ(5U, compaction->input(1, 0)->fd.GetNumber());
+ ASSERT_EQ(6U, compaction->input(1, 1)->fd.GetNumber());
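+ // No bottom-level (L4) file overlaps the compacted key range, so the last
+ // input level is empty.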
+ ASSERT_EQ(0, compaction->num_input_files(2));
+}
+
+TEST_F(CompactionPickerTest, UniversalIncrementalSpace4) {
+ // Test compaction candidates always cover many files.
+ const uint64_t kFileSize = 100000;
+
+ mutable_cf_options_.max_compaction_bytes = 3200000;
+ mutable_cf_options_.compaction_options_universal.incremental = true;
+ mutable_cf_options_.compaction_options_universal
+ .max_size_amplification_percent = 30;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+ NewVersionStorage(5, kCompactionStyleUniversal);
+
+ Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
+ Add(2, 2U, "010", "080", kFileSize, 0, 200, 251);
+
+ // Generate files like the following:
+ // L3: (1101, 1180) (1201, 1280) ... (7901, 7908)
+ // L4: (1130, 1150) (1160, 1210) (1230, 1250) (1260 1310) ... (7960, 8010)
+ for (int i = 11; i < 79; i++) {
+ Add(3, 100 + i * 3, std::to_string(i * 100).c_str(),
+ std::to_string(i * 100 + 80).c_str(), kFileSize, 0, 200, 251);
+ // Add a tie breaker
+ if (i == 66) {
+ Add(3, 10000U, "6690", "6699", kFileSize, 0, 200, 251);
+ }
+
+ Add(4, 100 + i * 3 + 1, std::to_string(i * 100 + 30).c_str(),
+ std::to_string(i * 100 + 50).c_str(), kFileSize, 0, 200, 251);
+ Add(4, 100 + i * 3 + 2, std::to_string(i * 100 + 60).c_str(),
+ std::to_string(i * 100 + 110).c_str(), kFileSize, 0, 200, 251);
+ }
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction);
+ ASSERT_EQ(4, compaction->output_level());
+ ASSERT_EQ(3, compaction->start_level());
+ ASSERT_EQ(6U, compaction->num_input_files(0));
+ ASSERT_EQ(100 + 62U * 3, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(10000U, compaction->input(0, 5)->fd.GetNumber());
+ ASSERT_EQ(11, compaction->num_input_files(1));
+}
+
+TEST_F(CompactionPickerTest, UniversalIncrementalSpace5) {
+ // Test that compaction candidates always cover many files when some
+ // individual files are larger than the size threshold.
+ const uint64_t kFileSize = 100000;
+
+ mutable_cf_options_.max_compaction_bytes = 3200000;
+ mutable_cf_options_.compaction_options_universal.incremental = true;
+ mutable_cf_options_.compaction_options_universal
+ .max_size_amplification_percent = 30;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+ NewVersionStorage(5, kCompactionStyleUniversal);
+
+ Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
+ Add(2, 2U, "010", "080", kFileSize, 0, 200, 251);
+
+ // Generate files like the following:
+ // L3: (1101, 1180) (1201, 1280) ... (7901, 7908)
+ // L4: (1130, 1150) (1160, 1210) (1230, 1250) (1260 1310) ... (7960, 8010)
+ for (int i = 11; i < 70; i++) {
+ Add(3, 100 + i * 3, std::to_string(i * 100).c_str(),
+ std::to_string(i * 100 + 80).c_str(),
+ i % 10 == 9 ? kFileSize * 100 : kFileSize, 0, 200, 251);
+
+ Add(4, 100 + i * 3 + 1, std::to_string(i * 100 + 30).c_str(),
+ std::to_string(i * 100 + 50).c_str(), kFileSize, 0, 200, 251);
+ Add(4, 100 + i * 3 + 2, std::to_string(i * 100 + 60).c_str(),
+ std::to_string(i * 100 + 110).c_str(), kFileSize, 0, 200, 251);
+ }
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction);
+ ASSERT_EQ(4, compaction->output_level());
+ ASSERT_EQ(3, compaction->start_level());
+ ASSERT_EQ(6U, compaction->num_input_files(0));
+ ASSERT_EQ(100 + 14 * 3, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(100 + 19 * 3, compaction->input(0, 5)->fd.GetNumber());
+ ASSERT_EQ(13, compaction->num_input_files(1));
+}
+
+TEST_F(CompactionPickerTest, NeedsCompactionFIFO) {
+ NewVersionStorage(1, kCompactionStyleFIFO);
+ const int kFileCount =
+ mutable_cf_options_.level0_file_num_compaction_trigger * 3;
+ const uint64_t kFileSize = 100000;
+ const uint64_t kMaxSize = kFileSize * kFileCount / 2;
+
+ fifo_options_.max_table_files_size = kMaxSize;
+ mutable_cf_options_.compaction_options_fifo = fifo_options_;
+ FIFOCompactionPicker fifo_compaction_picker(ioptions_, &icmp_);
+ UpdateVersionStorageInfo();
+ // Must return false when there are no files.
+ ASSERT_EQ(fifo_compaction_picker.NeedsCompaction(vstorage_.get()), false);
+
+ // verify whether compaction is needed based on the current
+ // size of L0 files.
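+ // With max_table_files_size set to half of the total file size, the score is
+ // expected to reach 1 roughly halfway through the loop.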
+ for (int i = 1; i <= kFileCount; ++i) {
+ NewVersionStorage(1, kCompactionStyleFIFO);
+ Add(0, i, std::to_string((i + 100) * 1000).c_str(),
+ std::to_string((i + 100) * 1000 + 999).c_str(), kFileSize, 0, i * 100,
+ i * 100 + 99);
+ UpdateVersionStorageInfo();
+ ASSERT_EQ(fifo_compaction_picker.NeedsCompaction(vstorage_.get()),
+ vstorage_->CompactionScore(0) >= 1);
+ }
+}
+
+TEST_F(CompactionPickerTest, FIFOToWarm1) {
+ NewVersionStorage(1, kCompactionStyleFIFO);
+ const uint64_t kFileSize = 100000;
+ const uint64_t kMaxSize = kFileSize * 100000;
+ uint64_t kWarmThreshold = 2000;
+
+ fifo_options_.max_table_files_size = kMaxSize;
+ fifo_options_.age_for_warm = kWarmThreshold;
+ mutable_cf_options_.compaction_options_fifo = fifo_options_;
+ mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+ mutable_cf_options_.max_compaction_bytes = kFileSize * 100;
+ FIFOCompactionPicker fifo_compaction_picker(ioptions_, &icmp_);
+
+ int64_t current_time = 0;
+ ASSERT_OK(Env::Default()->GetCurrentTime(&current_time));
+ uint64_t threshold_time =
+ static_cast<uint64_t>(current_time) - kWarmThreshold;
+ Add(0, 6U, "240", "290", 2 * kFileSize, 0, 2900, 3000, 0, true,
+ Temperature::kUnknown, static_cast<uint64_t>(current_time) - 100);
+ Add(0, 5U, "240", "290", 2 * kFileSize, 0, 2700, 2800, 0, true,
+ Temperature::kUnknown, threshold_time + 100);
+ Add(0, 4U, "260", "300", 1 * kFileSize, 0, 2500, 2600, 0, true,
+ Temperature::kUnknown, threshold_time - 2000);
+ Add(0, 3U, "200", "300", 4 * kFileSize, 0, 2300, 2400, 0, true,
+ Temperature::kUnknown, threshold_time - 3000);
+ UpdateVersionStorageInfo();
+
+ ASSERT_EQ(fifo_compaction_picker.NeedsCompaction(vstorage_.get()), true);
+ std::unique_ptr<Compaction> compaction(fifo_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ ASSERT_EQ(3U, compaction->input(0, 0)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, FIFOToWarm2) {
+ NewVersionStorage(1, kCompactionStyleFIFO);
+ const uint64_t kFileSize = 100000;
+ const uint64_t kMaxSize = kFileSize * 100000;
+ uint64_t kWarmThreshold = 2000;
+
+ fifo_options_.max_table_files_size = kMaxSize;
+ fifo_options_.age_for_warm = kWarmThreshold;
+ mutable_cf_options_.compaction_options_fifo = fifo_options_;
+ mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+ mutable_cf_options_.max_compaction_bytes = kFileSize * 100;
+ FIFOCompactionPicker fifo_compaction_picker(ioptions_, &icmp_);
+
+ int64_t current_time = 0;
+ ASSERT_OK(Env::Default()->GetCurrentTime(&current_time));
+ uint64_t threshold_time =
+ static_cast<uint64_t>(current_time) - kWarmThreshold;
+ Add(0, 6U, "240", "290", 2 * kFileSize, 0, 2900, 3000, 0, true,
+ Temperature::kUnknown, static_cast<uint64_t>(current_time) - 100);
+ Add(0, 5U, "240", "290", 2 * kFileSize, 0, 2700, 2800, 0, true,
+ Temperature::kUnknown, threshold_time + 100);
+ Add(0, 4U, "260", "300", 1 * kFileSize, 0, 2500, 2600, 0, true,
+ Temperature::kUnknown, threshold_time - 2000);
+ Add(0, 3U, "200", "300", 4 * kFileSize, 0, 2300, 2400, 0, true,
+ Temperature::kUnknown, threshold_time - 3000);
+ Add(0, 2U, "200", "300", 4 * kFileSize, 0, 2100, 2200, 0, true,
+ Temperature::kUnknown, threshold_time - 4000);
+ UpdateVersionStorageInfo();
+
+ ASSERT_EQ(fifo_compaction_picker.NeedsCompaction(vstorage_.get()), true);
+ std::unique_ptr<Compaction> compaction(fifo_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_files(0));
+ ASSERT_EQ(2U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(3U, compaction->input(0, 1)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, FIFOToWarmMaxSize) {
+ NewVersionStorage(1, kCompactionStyleFIFO);
+ const uint64_t kFileSize = 100000;
+ const uint64_t kMaxSize = kFileSize * 100000;
+ uint64_t kWarmThreshold = 2000;
+
+ fifo_options_.max_table_files_size = kMaxSize;
+ fifo_options_.age_for_warm = kWarmThreshold;
+ mutable_cf_options_.compaction_options_fifo = fifo_options_;
+ mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+ mutable_cf_options_.max_compaction_bytes = kFileSize * 9;
+ FIFOCompactionPicker fifo_compaction_picker(ioptions_, &icmp_);
+
+ int64_t current_time = 0;
+ ASSERT_OK(Env::Default()->GetCurrentTime(&current_time));
+ uint64_t threshold_time =
+ static_cast<uint64_t>(current_time) - kWarmThreshold;
+ Add(0, 6U, "240", "290", 2 * kFileSize, 0, 2900, 3000, 0, true,
+ Temperature::kUnknown, static_cast<uint64_t>(current_time) - 100);
+ Add(0, 5U, "240", "290", 2 * kFileSize, 0, 2700, 2800, 0, true,
+ Temperature::kUnknown, threshold_time + 100);
+ Add(0, 4U, "260", "300", 1 * kFileSize, 0, 2500, 2600, 0, true,
+ Temperature::kUnknown, threshold_time - 2000);
+ Add(0, 3U, "200", "300", 4 * kFileSize, 0, 2300, 2400, 0, true,
+ Temperature::kUnknown, threshold_time - 3000);
+ Add(0, 2U, "200", "300", 4 * kFileSize, 0, 2100, 2200, 0, true,
+ Temperature::kUnknown, threshold_time - 4000);
+ Add(0, 1U, "200", "300", 4 * kFileSize, 0, 2000, 2100, 0, true,
+ Temperature::kUnknown, threshold_time - 5000);
+ UpdateVersionStorageInfo();
+
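+ // max_compaction_bytes is 9 * kFileSize: files 1U and 2U (4 * kFileSize each)
+ // fit, but adding 3U (another 4 * kFileSize) would exceed the limit.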
+ ASSERT_EQ(fifo_compaction_picker.NeedsCompaction(vstorage_.get()), true);
+ std::unique_ptr<Compaction> compaction(fifo_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_files(0));
+ ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, FIFOToWarmWithExistingWarm) {
+ NewVersionStorage(1, kCompactionStyleFIFO);
+ const uint64_t kFileSize = 100000;
+ const uint64_t kMaxSize = kFileSize * 100000;
+ uint64_t kWarmThreshold = 2000;
+
+ fifo_options_.max_table_files_size = kMaxSize;
+ fifo_options_.age_for_warm = kWarmThreshold;
+ mutable_cf_options_.compaction_options_fifo = fifo_options_;
+ mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+ mutable_cf_options_.max_compaction_bytes = kFileSize * 100;
+ FIFOCompactionPicker fifo_compaction_picker(ioptions_, &icmp_);
+
+ int64_t current_time = 0;
+ ASSERT_OK(Env::Default()->GetCurrentTime(&current_time));
+ uint64_t threshold_time =
+ static_cast<uint64_t>(current_time) - kWarmThreshold;
+ Add(0, 6U, "240", "290", 2 * kFileSize, 0, 2900, 3000, 0, true,
+ Temperature::kUnknown, static_cast<uint64_t>(current_time) - 100);
+ Add(0, 5U, "240", "290", 2 * kFileSize, 0, 2700, 2800, 0, true,
+ Temperature::kUnknown, threshold_time + 100);
+ Add(0, 4U, "260", "300", 1 * kFileSize, 0, 2500, 2600, 0, true,
+ Temperature::kUnknown, threshold_time - 2000);
+ Add(0, 3U, "200", "300", 4 * kFileSize, 0, 2300, 2400, 0, true,
+ Temperature::kUnknown, threshold_time - 3000);
+ Add(0, 2U, "200", "300", 4 * kFileSize, 0, 2100, 2200, 0, true,
+ Temperature::kUnknown, threshold_time - 4000);
+ Add(0, 1U, "200", "300", 4 * kFileSize, 0, 2000, 2100, 0, true,
+ Temperature::kWarm, threshold_time - 5000);
+ UpdateVersionStorageInfo();
+
+ ASSERT_EQ(fifo_compaction_picker.NeedsCompaction(vstorage_.get()), true);
+ std::unique_ptr<Compaction> compaction(fifo_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_files(0));
+ ASSERT_EQ(2U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(3U, compaction->input(0, 1)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, FIFOToWarmWithOngoing) {
+ NewVersionStorage(1, kCompactionStyleFIFO);
+ const uint64_t kFileSize = 100000;
+ const uint64_t kMaxSize = kFileSize * 100000;
+ uint64_t kWarmThreshold = 2000;
+
+ fifo_options_.max_table_files_size = kMaxSize;
+ fifo_options_.age_for_warm = kWarmThreshold;
+ mutable_cf_options_.compaction_options_fifo = fifo_options_;
+ mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+ mutable_cf_options_.max_compaction_bytes = kFileSize * 100;
+ FIFOCompactionPicker fifo_compaction_picker(ioptions_, &icmp_);
+
+ int64_t current_time = 0;
+ ASSERT_OK(Env::Default()->GetCurrentTime(&current_time));
+ uint64_t threshold_time =
+ static_cast<uint64_t>(current_time) - kWarmThreshold;
+ Add(0, 6U, "240", "290", 2 * kFileSize, 0, 2900, 3000, 0, true,
+ Temperature::kUnknown, static_cast<uint64_t>(current_time) - 100);
+ Add(0, 5U, "240", "290", 2 * kFileSize, 0, 2700, 2800, 0, true,
+ Temperature::kUnknown, threshold_time + 100);
+ Add(0, 4U, "260", "300", 1 * kFileSize, 0, 2500, 2600, 0, true,
+ Temperature::kUnknown, threshold_time - 2000);
+ Add(0, 3U, "200", "300", 4 * kFileSize, 0, 2300, 2400, 0, true,
+ Temperature::kUnknown, threshold_time - 3000);
+ Add(0, 2U, "200", "300", 4 * kFileSize, 0, 2100, 2200, 0, true,
+ Temperature::kUnknown, threshold_time - 4000);
+ Add(0, 1U, "200", "300", 4 * kFileSize, 0, 2000, 2100, 0, true,
+ Temperature::kWarm, threshold_time - 5000);
+ file_map_[2].first->being_compacted = true;
+ UpdateVersionStorageInfo();
+
+ ASSERT_EQ(fifo_compaction_picker.NeedsCompaction(vstorage_.get()), true);
+ std::unique_ptr<Compaction> compaction(fifo_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ // Stop if a file is being compacted
+ ASSERT_TRUE(compaction.get() == nullptr);
+}
+
+TEST_F(CompactionPickerTest, FIFOToWarmWithHotBetweenWarms) {
+ NewVersionStorage(1, kCompactionStyleFIFO);
+ const uint64_t kFileSize = 100000;
+ const uint64_t kMaxSize = kFileSize * 100000;
+ uint64_t kWarmThreshold = 2000;
+
+ fifo_options_.max_table_files_size = kMaxSize;
+ fifo_options_.age_for_warm = kWarmThreshold;
+ mutable_cf_options_.compaction_options_fifo = fifo_options_;
+ mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+ mutable_cf_options_.max_compaction_bytes = kFileSize * 100;
+ FIFOCompactionPicker fifo_compaction_picker(ioptions_, &icmp_);
+
+ int64_t current_time = 0;
+ ASSERT_OK(Env::Default()->GetCurrentTime(&current_time));
+ uint64_t threshold_time =
+ static_cast<uint64_t>(current_time) - kWarmThreshold;
+ Add(0, 6U, "240", "290", 2 * kFileSize, 0, 2900, 3000, 0, true,
+ Temperature::kUnknown, static_cast<uint64_t>(current_time) - 100);
+ Add(0, 5U, "240", "290", 2 * kFileSize, 0, 2700, 2800, 0, true,
+ Temperature::kUnknown, threshold_time + 100);
+ Add(0, 4U, "260", "300", 1 * kFileSize, 0, 2500, 2600, 0, true,
+ Temperature::kUnknown, threshold_time - 2000);
+ Add(0, 3U, "200", "300", 4 * kFileSize, 0, 2300, 2400, 0, true,
+ Temperature::kWarm, threshold_time - 3000);
+ Add(0, 2U, "200", "300", 4 * kFileSize, 0, 2100, 2200, 0, true,
+ Temperature::kUnknown, threshold_time - 4000);
+ Add(0, 1U, "200", "300", 4 * kFileSize, 0, 2000, 2100, 0, true,
+ Temperature::kWarm, threshold_time - 5000);
+ UpdateVersionStorageInfo();
+
+ ASSERT_EQ(fifo_compaction_picker.NeedsCompaction(vstorage_.get()), true);
+ std::unique_ptr<Compaction> compaction(fifo_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ // Only the non-warm file between the two already-warm files is picked.
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ ASSERT_EQ(2U, compaction->input(0, 0)->fd.GetNumber());
+}
+
+#endif // ROCKSDB_LITE
+
+TEST_F(CompactionPickerTest, CompactionPriMinOverlapping1) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ ioptions_.compaction_pri = kMinOverlappingRatio;
+ mutable_cf_options_.target_file_size_base = 100000000000;
+ mutable_cf_options_.target_file_size_multiplier = 10;
+ mutable_cf_options_.max_bytes_for_level_base = 10 * 1024 * 1024;
+ mutable_cf_options_.RefreshDerivedOptions(ioptions_);
+
+ Add(2, 6U, "150", "179", 50000000U);
+ Add(2, 7U, "180", "220", 50000000U);
+ Add(2, 8U, "321", "400", 50000000U); // File not overlapping
+ Add(2, 9U, "721", "800", 50000000U);
+
+ Add(3, 26U, "150", "170", 260000000U);
+ Add(3, 27U, "171", "179", 260000000U);
+ Add(3, 28U, "191", "220", 260000000U);
+ Add(3, 29U, "221", "300", 260000000U);
+ Add(3, 30U, "750", "900", 260000000U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ // Pick file 8 because it overlaps with 0 files on level 3.
+ ASSERT_EQ(8U, compaction->input(0, 0)->fd.GetNumber());
+ // Compaction input size * 1.1
+ ASSERT_GE(uint64_t{55000000}, compaction->OutputFilePreallocationSize());
+}
+
+TEST_F(CompactionPickerTest, CompactionPriMinOverlapping2) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ ioptions_.compaction_pri = kMinOverlappingRatio;
+ mutable_cf_options_.target_file_size_base = 10000000;
+ mutable_cf_options_.target_file_size_multiplier = 10;
+ mutable_cf_options_.max_bytes_for_level_base = 10 * 1024 * 1024;
+
+ Add(2, 6U, "150", "175",
+ 60000000U); // Overlaps with file 26, 27, total size 521M
+ Add(2, 7U, "176", "200", 60000000U); // Overlaps with file 27, 28, total size
+ // 520M, the smallest overlapping
+ Add(2, 8U, "201", "300",
+ 60000000U); // Overlaps with file 28, 29, total size 521M
+
+ Add(3, 25U, "100", "110", 261000000U);
+ Add(3, 26U, "150", "170", 261000000U);
+ Add(3, 27U, "171", "179", 260000000U);
+ Add(3, 28U, "191", "220", 260000000U);
+ Add(3, 29U, "221", "300", 261000000U);
+ Add(3, 30U, "321", "400", 261000000U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ // Picking file 7 because its overlapping ratio is the smallest.
+ ASSERT_EQ(7U, compaction->input(0, 0)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, CompactionPriMinOverlapping3) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ ioptions_.compaction_pri = kMinOverlappingRatio;
+ mutable_cf_options_.max_bytes_for_level_base = 10000000;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+
+ // Files 7 and 8 each overlap with a single 260MB file, but file 8 itself is
+ // larger, so its overlapping ratio is smaller and it will be picked.
+ Add(2, 6U, "150", "167", 60000000U); // Overlaps with file 26, 27
+ Add(2, 7U, "168", "169", 60000000U); // Overlaps with file 27
+ Add(2, 8U, "201", "300", 61000000U); // Overlaps with file 28, but the file
+ // itself is larger. Should be picked.
+
+ Add(3, 26U, "160", "165", 260000000U);
+ Add(3, 27U, "166", "170", 260000000U);
+ Add(3, 28U, "180", "400", 260000000U);
+ Add(3, 29U, "401", "500", 260000000U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ // Picking file 8 because its overlapping ratio is the smallest.
+ ASSERT_EQ(8U, compaction->input(0, 0)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, CompactionPriMinOverlapping4) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ ioptions_.compaction_pri = kMinOverlappingRatio;
+ mutable_cf_options_.max_bytes_for_level_base = 10000000;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+ mutable_cf_options_.ignore_max_compaction_bytes_for_input = false;
+
+ // File 6 overlaps with files 26 and 27, but its compensated file size is
+ // boosted, so its overlapping ratio is the smallest and it will be picked.
+ Add(2, 6U, "150", "167", 60000000U, 0, 100, 100, 180000000U);
+ Add(2, 7U, "168", "169", 60000000U); // Overlaps with file 27
+ Add(2, 8U, "201", "300", 61000000U); // Overlaps with file 28
+
+ Add(3, 26U, "160", "165", 60000000U);
+ // Boosted file size in output level is not considered.
+ Add(3, 27U, "166", "170", 60000000U, 0, 100, 100, 260000000U);
+ Add(3, 28U, "180", "400", 60000000U);
+ Add(3, 29U, "401", "500", 60000000U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ // Picking file 6 because its overlapping ratio, based on the compensated
+ // file size, is the smallest.
+ ASSERT_EQ(6U, compaction->input(0, 0)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, CompactionPriRoundRobin) {
+ std::vector<InternalKey> test_cursors = {InternalKey("249", 100, kTypeValue),
+ InternalKey("600", 100, kTypeValue),
+ InternalKey()};
+ std::vector<uint32_t> selected_files = {8U, 6U, 6U};
+
+ ioptions_.compaction_pri = kRoundRobin;
+ mutable_cf_options_.max_bytes_for_level_base = 12000000;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+ for (size_t i = 0; i < test_cursors.size(); i++) {
+ // start a brand new version in each test.
+ NewVersionStorage(6, kCompactionStyleLevel);
+ vstorage_->ResizeCompactCursors(6);
+ // Set the cursor
+ vstorage_->AddCursorForOneLevel(2, test_cursors[i]);
+ Add(2, 6U, "150", "199", 50000000U); // Overlap with 26U, 27U
+ Add(2, 7U, "200", "249", 50000000U); // File not overlapping
+ Add(2, 8U, "300", "600", 50000000U); // Overlap with 28U, 29U
+
+ Add(3, 26U, "130", "165", 60000000U);
+ Add(3, 27U, "166", "170", 60000000U);
+ Add(3, 28U, "270", "340", 60000000U);
+ Add(3, 29U, "401", "500", 60000000U);
+ UpdateVersionStorageInfo();
+ LevelCompactionPicker local_level_compaction_picker =
+ LevelCompactionPicker(ioptions_, &icmp_);
+ std::unique_ptr<Compaction> compaction(
+ local_level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ // Since the max bytes for level 2 is 120M, picking one file to compact
+ // makes the post-compaction level size less than 120M, so exactly one
+ // file is picked for round-robin compaction.
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ ASSERT_EQ(selected_files[i], compaction->input(0, 0)->fd.GetNumber());
+ // release the version storage
+ DeleteVersionStorage();
+ }
+}
+
+TEST_F(CompactionPickerTest, CompactionPriMultipleFilesRoundRobin1) {
+ ioptions_.compaction_pri = kRoundRobin;
+ mutable_cf_options_.max_compaction_bytes = 100000000u;
+ mutable_cf_options_.max_bytes_for_level_base = 120;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+ // start a brand new version in each test.
+ NewVersionStorage(6, kCompactionStyleLevel);
+ vstorage_->ResizeCompactCursors(6);
+ // Set the cursor (file picking should start with 7U)
+ vstorage_->AddCursorForOneLevel(2, InternalKey("199", 100, kTypeValue));
+ Add(2, 6U, "150", "199", 500U);
+ Add(2, 7U, "200", "249", 500U);
+ Add(2, 8U, "300", "600", 500U);
+ Add(2, 9U, "700", "800", 500U);
+ Add(2, 10U, "850", "950", 500U);
+
+ Add(3, 26U, "130", "165", 600U);
+ Add(3, 27U, "166", "170", 600U);
+ Add(3, 28U, "270", "340", 600U);
+ Add(3, 29U, "401", "500", 600U);
+ Add(3, 30U, "601", "800", 600U);
+ Add(3, 31U, "830", "890", 600U);
+ UpdateVersionStorageInfo();
+ LevelCompactionPicker local_level_compaction_picker =
+ LevelCompactionPicker(ioptions_, &icmp_);
+ std::unique_ptr<Compaction> compaction(
+ local_level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+
+ // The maximum compaction bytes is very large in this case, so we can ignore
+ // its constraint in this test. The maximum bytes for level 2 is 1200 bytes,
+ // and thus at least 3 files should be picked so that the bytes remaining in
+ // level 2 are less than the maximum.
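+ // e.g. level 2 holds 5 * 500 = 2500 bytes; compacting 3 files (1500 bytes)
+ // leaves 1000 bytes (< 1200), while compacting only 2 would leave 1500 bytes.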
+ ASSERT_EQ(3U, compaction->num_input_files(0));
+ ASSERT_EQ(7U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(8U, compaction->input(0, 1)->fd.GetNumber());
+ ASSERT_EQ(9U, compaction->input(0, 2)->fd.GetNumber());
+ // release the version storage
+ DeleteVersionStorage();
+}
+
+TEST_F(CompactionPickerTest, CompactionPriMultipleFilesRoundRobin2) {
+ ioptions_.compaction_pri = kRoundRobin;
+ mutable_cf_options_.max_compaction_bytes = 2500u;
+ mutable_cf_options_.max_bytes_for_level_base = 120;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+ // start a brand new version in each test.
+ NewVersionStorage(6, kCompactionStyleLevel);
+ vstorage_->ResizeCompactCursors(6);
+ // Set the cursor (file picking should start with 6U)
+ vstorage_->AddCursorForOneLevel(2, InternalKey("1000", 100, kTypeValue));
+ Add(2, 6U, "150", "199", 500U); // Overlap with 26U, 27U
+ Add(2, 7U, "200", "249", 500U); // Overlap with 27U
+ Add(2, 8U, "300", "600", 500U); // Overlap with 28U, 29U
+ Add(2, 9U, "700", "800", 500U);
+ Add(2, 10U, "850", "950", 500U);
+
+ Add(3, 26U, "130", "165", 600U);
+ Add(3, 27U, "166", "230", 600U);
+ Add(3, 28U, "270", "340", 600U);
+ Add(3, 29U, "401", "500", 600U);
+ Add(3, 30U, "601", "800", 600U);
+ Add(3, 31U, "830", "890", 600U);
+ UpdateVersionStorageInfo();
+ LevelCompactionPicker local_level_compaction_picker =
+ LevelCompactionPicker(ioptions_, &icmp_);
+ std::unique_ptr<Compaction> compaction(
+ local_level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+
+ // The maximum compaction bytes is only 2500 bytes now. Even though we would
+ // need to choose 3 files so that the post-compaction level size is less than
+ // 1200 bytes, we cannot pick 3 files to compact since the maximum compaction
+ // size is 2500. After picking files 6U and 7U, the number of compaction
+ // bytes has reached 2200, leaving no room to add another input file.
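+ // e.g. 6U and 7U plus their overlapping L3 files 26U and 27U give
+ // 500 + 500 + 600 + 600 = 2200 bytes; pulling in 8U and its overlaps
+ // (28U, 29U) would push the total past 2500.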
+ ASSERT_EQ(2U, compaction->num_input_files(0));
+ ASSERT_EQ(6U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(7U, compaction->input(0, 1)->fd.GetNumber());
+ // release the version storage
+ DeleteVersionStorage();
+}
+
+TEST_F(CompactionPickerTest, CompactionPriMultipleFilesRoundRobin3) {
+ ioptions_.compaction_pri = kRoundRobin;
+ mutable_cf_options_.max_compaction_bytes = 1000000u;
+ mutable_cf_options_.max_bytes_for_level_base = 120;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+ // start a brand new version in each test.
+ NewVersionStorage(6, kCompactionStyleLevel);
+ vstorage_->ResizeCompactCursors(6);
+ // Set the cursor (file picking should start with 9U)
+ vstorage_->AddCursorForOneLevel(2, InternalKey("700", 100, kTypeValue));
+ Add(2, 6U, "150", "199", 500U);
+ Add(2, 7U, "200", "249", 500U);
+ Add(2, 8U, "300", "600", 500U);
+ Add(2, 9U, "700", "800", 500U);
+ Add(2, 10U, "850", "950", 500U);
+
+ Add(3, 26U, "130", "165", 600U);
+ Add(3, 27U, "166", "170", 600U);
+ Add(3, 28U, "270", "340", 600U);
+ Add(3, 29U, "401", "500", 600U);
+ Add(3, 30U, "601", "800", 600U);
+ Add(3, 31U, "830", "890", 600U);
+ UpdateVersionStorageInfo();
+ LevelCompactionPicker local_level_compaction_picker =
+ LevelCompactionPicker(ioptions_, &icmp_);
+ std::unique_ptr<Compaction> compaction(
+ local_level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+
+ // Cannot pick more files since we have reached the last file in level 2.
+ ASSERT_EQ(2U, compaction->num_input_files(0));
+ ASSERT_EQ(9U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(10U, compaction->input(0, 1)->fd.GetNumber());
+ // release the version storage
+ DeleteVersionStorage();
+}
+
+TEST_F(CompactionPickerTest, CompactionPriMinOverlappingManyFiles) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ ioptions_.compaction_pri = kMinOverlappingRatio;
+ mutable_cf_options_.max_bytes_for_level_base = 15000000;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+
+ // Files 7 and 8 each overlap with a single large file, but file 8 itself is
+ // larger, so its overlapping ratio is smaller and it will be picked.
+ Add(2, 13U, "010", "011",
+ 6100U); // Overlaps with a large file. Not picked
+ Add(2, 14U, "020", "021",
+ 6100U); // Overlaps with a large file. Not picked
+ Add(2, 15U, "030", "031",
+ 6100U); // Overlaps with a large file. Not picked
+ Add(2, 16U, "040", "041",
+ 6100U); // Overlaps with a large file. Not picked
+ Add(2, 17U, "050", "051",
+ 6100U); // Overlaps with a large file. Not picked
+ Add(2, 18U, "060", "061",
+ 6100U); // Overlaps with a large file. Not picked
+ Add(2, 19U, "070", "071",
+ 6100U); // Overlaps with a large file. Not picked
+ Add(2, 20U, "080", "081",
+ 6100U); // Overlaps with a large file. Not picked
+
+ Add(2, 6U, "150", "167", 60000000U); // Overlaps with file 26, 27
+ Add(2, 7U, "168", "169", 60000000U); // Overlaps with file 27
+ Add(2, 8U, "201", "300", 61000000U); // Overlaps with file 28, but the file
+ // itself is larger. Should be picked.
+ Add(2, 9U, "610", "611",
+ 6100U); // Overlaps with a large file. Not picked
+ Add(2, 10U, "620", "621",
+ 6100U); // Overlaps with a large file. Not picked
+ Add(2, 11U, "630", "631",
+ 6100U); // Overlaps with a large file. Not picked
+ Add(2, 12U, "640", "641",
+ 6100U); // Overlaps with a large file. Not picked
+
+ Add(3, 31U, "001", "100", 260000000U);
+ Add(3, 26U, "160", "165", 260000000U);
+ Add(3, 27U, "166", "170", 260000000U);
+ Add(3, 28U, "180", "400", 260000000U);
+ Add(3, 29U, "401", "500", 260000000U);
+ Add(3, 30U, "601", "700", 260000000U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ // Picking file 8 because its overlapping ratio is the smallest.
+ ASSERT_EQ(8U, compaction->input(0, 0)->fd.GetNumber());
+}
+
+// This test exhibits the bug where we don't properly reset parent_index in
+// PickCompaction()
+TEST_F(CompactionPickerTest, ParentIndexResetBug) {
+ int num_levels = ioptions_.num_levels;
+ mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+ mutable_cf_options_.max_bytes_for_level_base = 200;
+ NewVersionStorage(num_levels, kCompactionStyleLevel);
+ Add(0, 1U, "150", "200"); // <- marked for compaction
+ Add(1, 3U, "400", "500", 600); // <- this one needs compacting
+ Add(2, 4U, "150", "200");
+ Add(2, 5U, "201", "210");
+ Add(2, 6U, "300", "310");
+ Add(2, 7U, "400", "500"); // <- being compacted
+
+ vstorage_->LevelFiles(2)[3]->being_compacted = true;
+ vstorage_->LevelFiles(0)[0]->marked_for_compaction = true;
+
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+}
+
+// This test checks ExpandWhileOverlapping() by having overlapping user keys
+// ranges (with different sequence numbers) in the input files.
+TEST_F(CompactionPickerTest, OverlappingUserKeys) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ ioptions_.compaction_pri = kByCompensatedSize;
+
+ Add(1, 1U, "100", "150", 1U);
+ // Overlapping user keys
+ Add(1, 2U, "200", "400", 1U);
+ Add(1, 3U, "400", "500", 1000000000U, 0, 0);
+ Add(2, 4U, "600", "700", 1U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(1U, compaction->num_input_levels());
+ ASSERT_EQ(2U, compaction->num_input_files(0));
+ ASSERT_EQ(2U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(3U, compaction->input(0, 1)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, OverlappingUserKeys2) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ // Overlapping user keys on same level and output level
+ Add(1, 1U, "200", "400", 1000000000U);
+ Add(1, 2U, "400", "500", 1U, 0, 0);
+ Add(2, 3U, "000", "100", 1U);
+ Add(2, 4U, "100", "600", 1U, 0, 0);
+ Add(2, 5U, "600", "700", 1U, 0, 0);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_levels());
+ ASSERT_EQ(2U, compaction->num_input_files(0));
+ ASSERT_EQ(3U, compaction->num_input_files(1));
+ ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber());
+ ASSERT_EQ(3U, compaction->input(1, 0)->fd.GetNumber());
+ ASSERT_EQ(4U, compaction->input(1, 1)->fd.GetNumber());
+ ASSERT_EQ(5U, compaction->input(1, 2)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, OverlappingUserKeys3) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ // Chain of overlapping user key ranges (forces ExpandWhileOverlapping() to
+ // expand multiple times)
+ Add(1, 1U, "100", "150", 1U);
+ Add(1, 2U, "150", "200", 1U, 0, 0);
+ Add(1, 3U, "200", "250", 1000000000U, 0, 0);
+ Add(1, 4U, "250", "300", 1U, 0, 0);
+ Add(1, 5U, "300", "350", 1U, 0, 0);
+ // Output level overlaps with the beginning and the end of the chain
+ Add(2, 6U, "050", "100", 1U);
+ Add(2, 7U, "350", "400", 1U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_levels());
+ ASSERT_EQ(5U, compaction->num_input_files(0));
+ ASSERT_EQ(2U, compaction->num_input_files(1));
+ ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber());
+ ASSERT_EQ(3U, compaction->input(0, 2)->fd.GetNumber());
+ ASSERT_EQ(4U, compaction->input(0, 3)->fd.GetNumber());
+ ASSERT_EQ(5U, compaction->input(0, 4)->fd.GetNumber());
+ ASSERT_EQ(6U, compaction->input(1, 0)->fd.GetNumber());
+ ASSERT_EQ(7U, compaction->input(1, 1)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, OverlappingUserKeys4) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ mutable_cf_options_.max_bytes_for_level_base = 1000000;
+
+ Add(1, 1U, "100", "150", 1U);
+ Add(1, 2U, "150", "199", 1U, 0, 0);
+ Add(1, 3U, "200", "250", 1100000U, 0, 0);
+ Add(1, 4U, "251", "300", 1U, 0, 0);
+ Add(1, 5U, "300", "350", 1U, 0, 0);
+
+ Add(2, 6U, "100", "115", 1U);
+ Add(2, 7U, "125", "325", 1U);
+ Add(2, 8U, "350", "400", 1U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_levels());
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ ASSERT_EQ(1U, compaction->num_input_files(1));
+ ASSERT_EQ(3U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(7U, compaction->input(1, 0)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, OverlappingUserKeys5) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ // Overlapping user keys on same level and output level
+ Add(1, 1U, "200", "400", 1000000000U);
+ Add(1, 2U, "400", "500", 1U, 0, 0);
+ Add(2, 3U, "000", "100", 1U);
+ Add(2, 4U, "100", "600", 1U, 0, 0);
+ Add(2, 5U, "600", "700", 1U, 0, 0);
+
+ vstorage_->LevelFiles(2)[2]->being_compacted = true;
+
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() == nullptr);
+}
+
+TEST_F(CompactionPickerTest, OverlappingUserKeys6) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ // Overlapping user keys on same level and output level
+ Add(1, 1U, "200", "400", 1U, 0, 0);
+ Add(1, 2U, "401", "500", 1U, 0, 0);
+ Add(2, 3U, "000", "100", 1U);
+ Add(2, 4U, "100", "300", 1U, 0, 0);
+ Add(2, 5U, "305", "450", 1U, 0, 0);
+ Add(2, 6U, "460", "600", 1U, 0, 0);
+ Add(2, 7U, "600", "700", 1U, 0, 0);
+
+ vstorage_->LevelFiles(1)[0]->marked_for_compaction = true;
+ vstorage_->LevelFiles(1)[1]->marked_for_compaction = true;
+
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_levels());
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ ASSERT_EQ(3U, compaction->num_input_files(1));
+}
+
+TEST_F(CompactionPickerTest, OverlappingUserKeys7) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ mutable_cf_options_.max_compaction_bytes = 100000000000u;
+ // Overlapping user keys on same level and output level
+ Add(1, 1U, "200", "400", 1U, 0, 0);
+ Add(1, 2U, "401", "500", 1000000000U, 0, 0);
+ Add(2, 3U, "100", "250", 1U);
+ Add(2, 4U, "300", "600", 1U, 0, 0);
+ Add(2, 5U, "600", "800", 1U, 0, 0);
+
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_levels());
+ ASSERT_GE(1U, compaction->num_input_files(0));
+ ASSERT_GE(2U, compaction->num_input_files(1));
+ // File 5 has to be included in the compaction
+ ASSERT_EQ(5U, compaction->inputs(1)->back()->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, OverlappingUserKeys8) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ mutable_cf_options_.max_compaction_bytes = 100000000000u;
+ // grow the number of inputs in "level" without
+ // changing the number of "level+1" files we pick up
+ // Expand input level as much as possible
+ // no overlapping case
+ Add(1, 1U, "101", "150", 1U);
+ Add(1, 2U, "151", "200", 1U);
+ Add(1, 3U, "201", "300", 1000000000U);
+ Add(1, 4U, "301", "400", 1U);
+ Add(1, 5U, "401", "500", 1U);
+ Add(2, 6U, "150", "200", 1U);
+ Add(2, 7U, "200", "450", 1U, 0, 0);
+ Add(2, 8U, "500", "600", 1U);
+
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_levels());
+ ASSERT_EQ(3U, compaction->num_input_files(0));
+ ASSERT_EQ(2U, compaction->num_input_files(1));
+ ASSERT_EQ(2U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(3U, compaction->input(0, 1)->fd.GetNumber());
+ ASSERT_EQ(4U, compaction->input(0, 2)->fd.GetNumber());
+ ASSERT_EQ(6U, compaction->input(1, 0)->fd.GetNumber());
+ ASSERT_EQ(7U, compaction->input(1, 1)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, OverlappingUserKeys9) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ mutable_cf_options_.max_compaction_bytes = 100000000000u;
+ // grow the number of inputs in "level" without
+ // changing the number of "level+1" files we pick up
+ // Expand input level as much as possible
+ // overlapping case
+ Add(1, 1U, "121", "150", 1U);
+ Add(1, 2U, "151", "200", 1U);
+ Add(1, 3U, "201", "300", 1000000000U);
+ Add(1, 4U, "301", "400", 1U);
+ Add(1, 5U, "401", "500", 1U);
+ Add(2, 6U, "100", "120", 1U);
+ Add(2, 7U, "150", "200", 1U);
+ Add(2, 8U, "200", "450", 1U, 0, 0);
+ Add(2, 9U, "501", "600", 1U);
+
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_levels());
+ ASSERT_EQ(5U, compaction->num_input_files(0));
+ ASSERT_EQ(2U, compaction->num_input_files(1));
+ ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber());
+ ASSERT_EQ(3U, compaction->input(0, 2)->fd.GetNumber());
+ ASSERT_EQ(4U, compaction->input(0, 3)->fd.GetNumber());
+ ASSERT_EQ(7U, compaction->input(1, 0)->fd.GetNumber());
+ ASSERT_EQ(8U, compaction->input(1, 1)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, OverlappingUserKeys10) {
+ // Locked file encountered when pulling in extra input-level files with same
+ // user keys. Verify we pick the next-best file from the same input level.
+ NewVersionStorage(6, kCompactionStyleLevel);
+ mutable_cf_options_.max_compaction_bytes = 100000000000u;
+
+ // file_number 2U is largest and thus first choice. But it overlaps with
+ // file_number 1U which is being compacted. So instead we pick the next-
+ // biggest file, 3U, which is eligible for compaction.
+ Add(1 /* level */, 1U /* file_number */, "100" /* smallest */,
+ "150" /* largest */, 1U /* file_size */);
+ file_map_[1U].first->being_compacted = true;
+ Add(1 /* level */, 2U /* file_number */, "150" /* smallest */,
+ "200" /* largest */, 1000000000U /* file_size */, 0 /* smallest_seq */,
+ 0 /* largest_seq */);
+ Add(1 /* level */, 3U /* file_number */, "201" /* smallest */,
+ "250" /* largest */, 900000000U /* file_size */);
+ Add(2 /* level */, 4U /* file_number */, "100" /* smallest */,
+ "150" /* largest */, 1U /* file_size */);
+ Add(2 /* level */, 5U /* file_number */, "151" /* smallest */,
+ "200" /* largest */, 1U /* file_size */);
+ Add(2 /* level */, 6U /* file_number */, "201" /* smallest */,
+ "250" /* largest */, 1U /* file_size */);
+
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_levels());
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ ASSERT_EQ(1U, compaction->num_input_files(1));
+ ASSERT_EQ(3U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(6U, compaction->input(1, 0)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, OverlappingUserKeys11) {
+ // Locked file encountered when pulling in extra output-level files with same
+ // user keys. Expected to skip that compaction and pick the next-best choice.
+ NewVersionStorage(6, kCompactionStyleLevel);
+ mutable_cf_options_.max_compaction_bytes = 100000000000u;
+
+ // score(L1) = 3.7
+ // score(L2) = 1.85
+ // There is no eligible file in L1 to compact since both candidates pull in
+ // file_number 5U, which overlaps with a file pending compaction (6U). The
+ // first eligible compaction is from L2->L3.
+ Add(1 /* level */, 2U /* file_number */, "151" /* smallest */,
+ "200" /* largest */, 1000000000U /* file_size */);
+ Add(1 /* level */, 3U /* file_number */, "201" /* smallest */,
+ "250" /* largest */, 1U /* file_size */);
+ Add(2 /* level */, 4U /* file_number */, "100" /* smallest */,
+ "149" /* largest */, 5000000000U /* file_size */);
+ Add(2 /* level */, 5U /* file_number */, "150" /* smallest */,
+ "201" /* largest */, 1U /* file_size */);
+ Add(2 /* level */, 6U /* file_number */, "201" /* smallest */,
+ "249" /* largest */, 1U /* file_size */, 0 /* smallest_seq */,
+ 0 /* largest_seq */);
+ file_map_[6U].first->being_compacted = true;
+ Add(3 /* level */, 7U /* file_number */, "100" /* smallest */,
+ "149" /* largest */, 1U /* file_size */);
+
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_levels());
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ ASSERT_EQ(1U, compaction->num_input_files(1));
+ ASSERT_EQ(4U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(7U, compaction->input(1, 0)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, FileTtlBooster) {
+ // Set TTL to 2048
+ // TTL boosting for all levels starts at 1024,
+ // Whole TTL range is 2048 * 31 / 32 - 1024 = 1984 - 1024 = 960.
+ // From the second-to-last level (L5) downwards, the boosting range starts at
+ // 1024 + 480 (L5), 1024 + 240 (L4), 1024 + 120 (L3).
+ // Boosting step for L3: 120 / 16 = 7.5 -> 7
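+ // e.g. for L3, a file aged 1024 + 120 + 30 is 30 past the boost start:
+ // 30 / 7 = 4 extra steps, giving a boost score of 1 + 4 = 5.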
+ //
+ const uint64_t kCurrentTime = 1000000;
+ FileMetaData meta;
+
+ {
+ FileTtlBooster booster(kCurrentTime, 2048, 7, 3);
+
+ // Not triggering if the file is younger than ttl/2
+ meta.oldest_ancester_time = kCurrentTime - 1023;
+ ASSERT_EQ(1, booster.GetBoostScore(&meta));
+ meta.oldest_ancester_time = kCurrentTime - 1024;
+ ASSERT_EQ(1, booster.GetBoostScore(&meta));
+ meta.oldest_ancester_time = kCurrentTime + 10;
+ ASSERT_EQ(1, booster.GetBoostScore(&meta));
+
+ // Within one boosting step
+ meta.oldest_ancester_time = kCurrentTime - (1024 + 120 + 6);
+ ASSERT_EQ(1, booster.GetBoostScore(&meta));
+
+ // One boosting step
+ meta.oldest_ancester_time = kCurrentTime - (1024 + 120 + 7);
+ ASSERT_EQ(2, booster.GetBoostScore(&meta));
+ meta.oldest_ancester_time = kCurrentTime - (1024 + 120 + 8);
+ ASSERT_EQ(2, booster.GetBoostScore(&meta));
+
+ // Multiple boosting steps
+ meta.oldest_ancester_time = kCurrentTime - (1024 + 120 + 30);
+ ASSERT_EQ(5, booster.GetBoostScore(&meta));
+
+ // Very high boosting steps
+ meta.oldest_ancester_time = kCurrentTime - (1024 + 120 + 700);
+ ASSERT_EQ(101, booster.GetBoostScore(&meta));
+ }
+ {
+ // Test second last level
+ FileTtlBooster booster(kCurrentTime, 2048, 7, 5);
+ meta.oldest_ancester_time = kCurrentTime - (1024 + 480);
+ ASSERT_EQ(1, booster.GetBoostScore(&meta));
+ meta.oldest_ancester_time = kCurrentTime - (1024 + 480 + 60);
+ ASSERT_EQ(3, booster.GetBoostScore(&meta));
+ }
+ {
+ // Test last level
+ FileTtlBooster booster(kCurrentTime, 2048, 7, 6);
+ meta.oldest_ancester_time = kCurrentTime - (1024 + 480);
+ ASSERT_EQ(1, booster.GetBoostScore(&meta));
+ meta.oldest_ancester_time = kCurrentTime - (1024 + 480 + 60);
+ ASSERT_EQ(1, booster.GetBoostScore(&meta));
+ meta.oldest_ancester_time = kCurrentTime - 3000;
+ ASSERT_EQ(1, booster.GetBoostScore(&meta));
+ }
+}
+
+TEST_F(CompactionPickerTest, NotScheduleL1IfL0WithHigherPri1) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+ mutable_cf_options_.max_bytes_for_level_base = 900000000U;
+
+ // 6 L0 files, score 3.
+ Add(0, 1U, "000", "400", 1U);
+ Add(0, 2U, "001", "400", 1U, 0, 0);
+ Add(0, 3U, "001", "400", 1000000000U, 0, 0);
+ Add(0, 31U, "001", "400", 1000000000U, 0, 0);
+ Add(0, 32U, "001", "400", 1000000000U, 0, 0);
+ Add(0, 33U, "001", "400", 1000000000U, 0, 0);
+
+ // L1 total size 2GB, score 2.2. If one file being compacted, score 1.1.
+ Add(1, 4U, "050", "300", 1000000000U, 0, 0);
+ file_map_[4u].first->being_compacted = true;
+ Add(1, 5U, "301", "350", 1000000000U, 0, 0);
+
+ // Output level overlaps with the beginning and the end of the chain
+ Add(2, 6U, "050", "100", 1U);
+ Add(2, 7U, "300", "400", 1U);
+
+ // No compaction should be scheduled if L0 has higher priority than L1
+ // but the L0->L1 compaction is blocked by a file in L1 being compacted.
+ UpdateVersionStorageInfo();
+ ASSERT_EQ(0, vstorage_->CompactionScoreLevel(0));
+ ASSERT_EQ(1, vstorage_->CompactionScoreLevel(1));
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() == nullptr);
+}
+
+TEST_F(CompactionPickerTest, NotScheduleL1IfL0WithHigherPri2) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+ mutable_cf_options_.max_bytes_for_level_base = 900000000U;
+
+ // 6 L0 files, score 3.
+ Add(0, 1U, "000", "400", 1U);
+ Add(0, 2U, "001", "400", 1U, 0, 0);
+ Add(0, 3U, "001", "400", 1000000000U, 0, 0);
+ Add(0, 31U, "001", "400", 1000000000U, 0, 0);
+ Add(0, 32U, "001", "400", 1000000000U, 0, 0);
+ Add(0, 33U, "001", "400", 1000000000U, 0, 0);
+
+ // L1 total size 2GB, score 2.2. If one file being compacted, score 1.1.
+ Add(1, 4U, "050", "300", 1000000000U, 0, 0);
+ Add(1, 5U, "301", "350", 1000000000U, 0, 0);
+
+ // Output level overlaps with the beginning and the end of the chain
+ Add(2, 6U, "050", "100", 1U);
+ Add(2, 7U, "300", "400", 1U);
+
+ // If no file in L1 being compacted, L0->L1 compaction will be scheduled.
+ UpdateVersionStorageInfo(); // being_compacted flag is cleared here.
+ ASSERT_EQ(0, vstorage_->CompactionScoreLevel(0));
+ ASSERT_EQ(1, vstorage_->CompactionScoreLevel(1));
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+}
+
+TEST_F(CompactionPickerTest, NotScheduleL1IfL0WithHigherPri3) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+ mutable_cf_options_.max_bytes_for_level_base = 900000000U;
+
+ // 6 L0 files, score 3.
+ Add(0, 1U, "000", "400", 1U);
+ Add(0, 2U, "001", "400", 1U, 0, 0);
+ Add(0, 3U, "001", "400", 1000000000U, 0, 0);
+ Add(0, 31U, "001", "400", 1000000000U, 0, 0);
+ Add(0, 32U, "001", "400", 1000000000U, 0, 0);
+ Add(0, 33U, "001", "400", 1000000000U, 0, 0);
+
+ // L1 score more than 6.
+ Add(1, 4U, "050", "300", 1000000000U, 0, 0);
+ file_map_[4u].first->being_compacted = true;
+ Add(1, 5U, "301", "350", 1000000000U, 0, 0);
+ Add(1, 51U, "351", "400", 6000000000U, 0, 0);
+
+ // Output level overlaps with the beginning and the end of the chain
+ Add(2, 6U, "050", "100", 1U);
+ Add(2, 7U, "300", "400", 1U);
+
+ // If the score of L1 is larger than that of L0, the L1 compaction goes
+ // through even though there is a pending L0 compaction.
+ UpdateVersionStorageInfo();
+ ASSERT_EQ(1, vstorage_->CompactionScoreLevel(0));
+ ASSERT_EQ(0, vstorage_->CompactionScoreLevel(1));
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+}
+
+TEST_F(CompactionPickerTest, EstimateCompactionBytesNeeded1) {
+ int num_levels = ioptions_.num_levels;
+ ioptions_.level_compaction_dynamic_level_bytes = false;
+ mutable_cf_options_.level0_file_num_compaction_trigger = 4;
+ mutable_cf_options_.max_bytes_for_level_base = 1000;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+ NewVersionStorage(num_levels, kCompactionStyleLevel);
+ Add(0, 1U, "150", "200", 200);
+ Add(0, 2U, "150", "200", 200);
+ Add(0, 3U, "150", "200", 200);
+ // Level 1 is over target by 200
+ Add(1, 4U, "400", "500", 600);
+ Add(1, 5U, "600", "700", 600);
+ // Level 2 is less than its target of 10000 even after adding the size of
+ // level 1.
+ // Size ratio of L2/L1 is 9600 / 1200 = 8
+ Add(2, 6U, "150", "200", 2500);
+ Add(2, 7U, "201", "210", 2000);
+ Add(2, 8U, "300", "310", 2600);
+ Add(2, 9U, "400", "500", 2500);
+ // Level 3 exceeds its target of 100,000 by 1,000.
+ Add(3, 10U, "400", "500", 101000);
+ // Level 4 exceeds target 1,000,000 by 900 after adding size from level 3
+ // Size ratio L4/L3 is 9.9
+ // After merge from L3, L4 size is 1000900
+ Add(4, 11U, "400", "500", 999900);
+ Add(5, 12U, "400", "500", 8007200);
+
+ UpdateVersionStorageInfo();
+
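+ // Expected bytes: L1->L2: 200 * (8 + 1) = 1800, L3->L4: 1000 * (9.9 + 1) =
+ // 10900, L4->L5: 900 * (8 + 1) = 8100.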
+ ASSERT_EQ(200u * 9u + 10900u + 900u * 9,
+ vstorage_->estimated_compaction_needed_bytes());
+}
+
+TEST_F(CompactionPickerTest, EstimateCompactionBytesNeeded2) {
+ int num_levels = ioptions_.num_levels;
+ ioptions_.level_compaction_dynamic_level_bytes = false;
+ mutable_cf_options_.level0_file_num_compaction_trigger = 3;
+ mutable_cf_options_.max_bytes_for_level_base = 1000;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+ NewVersionStorage(num_levels, kCompactionStyleLevel);
+ Add(0, 1U, "150", "200", 200);
+ Add(0, 2U, "150", "200", 200);
+ Add(0, 4U, "150", "200", 200);
+ Add(0, 5U, "150", "200", 200);
+ Add(0, 6U, "150", "200", 200);
+ // Level 1 size will be 1400 after merging with L0
+ Add(1, 7U, "400", "500", 200);
+ Add(1, 8U, "600", "700", 200);
+ // Level 2 is less than its target of 10000 even after adding the size of
+ // level 1.
+ Add(2, 9U, "150", "200", 9100);
+ // Level 3 over the target, but since level 4 is empty, we assume it will be
+ // a trivial move.
+ Add(3, 10U, "400", "500", 101000);
+
+ UpdateVersionStorageInfo();
+
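+ // L0->L1 merge: 1000 (L0) + 400 (L1) = 1400 bytes; L1 then exceeds its
+ // target of 1000 by 400.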
+ // estimated L1->L2 merge: 400 * (9100.0 / 1400.0 + 1.0)
+ ASSERT_EQ(1400u + 3000u, vstorage_->estimated_compaction_needed_bytes());
+}
+
+TEST_F(CompactionPickerTest, EstimateCompactionBytesNeeded3) {
+ int num_levels = ioptions_.num_levels;
+ ioptions_.level_compaction_dynamic_level_bytes = false;
+ mutable_cf_options_.level0_file_num_compaction_trigger = 3;
+ mutable_cf_options_.max_bytes_for_level_base = 1000;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+ NewVersionStorage(num_levels, kCompactionStyleLevel);
+ Add(0, 1U, "150", "200", 2000);
+ Add(0, 2U, "150", "200", 2000);
+ Add(0, 4U, "150", "200", 2000);
+ Add(0, 5U, "150", "200", 2000);
+ Add(0, 6U, "150", "200", 1000);
+ // Level 1 size will be 10000 after merging with L0
+ Add(1, 7U, "400", "500", 500);
+ Add(1, 8U, "600", "700", 500);
+
+ Add(2, 9U, "150", "200", 10000);
+
+ UpdateVersionStorageInfo();
+
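+ // Expected bytes: L0->L1 merge: 9000 + 1000 = 10000; L1 (now 10000) exceeds
+ // its target of 1000 by 9000, and with L2/L1 ratio 1 the L1->L2 estimate is
+ // 9000 * (1 + 1) = 18000.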
+ ASSERT_EQ(10000u + 18000u, vstorage_->estimated_compaction_needed_bytes());
+}
+
+TEST_F(CompactionPickerTest, EstimateCompactionBytesNeededDynamicLevel) {
+ int num_levels = ioptions_.num_levels;
+ ioptions_.level_compaction_dynamic_level_bytes = true;
+ mutable_cf_options_.level0_file_num_compaction_trigger = 3;
+ mutable_cf_options_.max_bytes_for_level_base = 1000;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+ NewVersionStorage(num_levels, kCompactionStyleLevel);
+
+ // Set the last level size to 50000.
+ // num_levels - 1 has a target of 5000.
+ // num_levels - 2 is the base level with target 1000 (rounded up to
+ // max_bytes_for_level_base).
+ Add(num_levels - 1, 10U, "400", "500", 50000);
+
+ Add(0, 1U, "150", "200", 200);
+ Add(0, 2U, "150", "200", 200);
+ Add(0, 4U, "150", "200", 200);
+ Add(0, 5U, "150", "200", 200);
+ Add(0, 6U, "150", "200", 200);
+ // num_levels - 3 is over target by 100 + 1000
+ Add(num_levels - 3, 7U, "400", "500", 550);
+ Add(num_levels - 3, 8U, "600", "700", 550);
+ // num_levels - 2 is over its target by 200, plus 1100 incoming from num_levels - 3
+ Add(num_levels - 2, 9U, "150", "200", 5200);
+
+ UpdateVersionStorageInfo();
+
+ // Merging to the second last level: (5200 / 2100 + 1) * 1100
+ // Merging to the last level: (50000 / 6300 + 1) * 1300
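+ // 2100 is the base level size after merging with L0 (1000 + 1100);
+ // the two products above evaluate to roughly 3823 and 11617.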
+ ASSERT_EQ(2100u + 3823u + 11617u,
+ vstorage_->estimated_compaction_needed_bytes());
+}
+
+TEST_F(CompactionPickerTest, IsBottommostLevelTest) {
+ // case 1: Higher levels are empty
+ NewVersionStorage(6, kCompactionStyleLevel);
+ Add(0, 1U, "a", "m");
+ Add(0, 2U, "c", "z");
+ Add(1, 3U, "d", "e");
+ Add(1, 4U, "l", "p");
+ Add(2, 5U, "g", "i");
+ Add(2, 6U, "x", "z");
+ UpdateVersionStorageInfo();
+ SetCompactionInputFilesLevels(2, 1);
+ AddToCompactionFiles(3U);
+ AddToCompactionFiles(5U);
+ bool result =
+ Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_);
+ ASSERT_TRUE(result);
+
+ // case 2: Higher levels have no overlap
+ NewVersionStorage(6, kCompactionStyleLevel);
+ Add(0, 1U, "a", "m");
+ Add(0, 2U, "c", "z");
+ Add(1, 3U, "d", "e");
+ Add(1, 4U, "l", "p");
+ Add(2, 5U, "g", "i");
+ Add(2, 6U, "x", "z");
+ Add(3, 7U, "k", "p");
+ Add(3, 8U, "t", "w");
+ Add(4, 9U, "a", "b");
+ Add(5, 10U, "c", "cc");
+ UpdateVersionStorageInfo();
+ SetCompactionInputFilesLevels(2, 1);
+ AddToCompactionFiles(3U);
+ AddToCompactionFiles(5U);
+ result = Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_);
+ ASSERT_TRUE(result);
+
+ // case 3.1: Higher levels (level 3) have overlap
+ NewVersionStorage(6, kCompactionStyleLevel);
+ Add(0, 1U, "a", "m");
+ Add(0, 2U, "c", "z");
+ Add(1, 3U, "d", "e");
+ Add(1, 4U, "l", "p");
+ Add(2, 5U, "g", "i");
+ Add(2, 6U, "x", "z");
+ Add(3, 7U, "e", "g");
+ Add(3, 8U, "h", "k");
+ Add(4, 9U, "a", "b");
+ Add(5, 10U, "c", "cc");
+ UpdateVersionStorageInfo();
+ SetCompactionInputFilesLevels(2, 1);
+ AddToCompactionFiles(3U);
+ AddToCompactionFiles(5U);
+ result = Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_);
+ ASSERT_FALSE(result);
+
+ // case 3.2: Higher levels (level 5) have overlap
+ DeleteVersionStorage();
+ NewVersionStorage(6, kCompactionStyleLevel);
+ Add(0, 1U, "a", "m");
+ Add(0, 2U, "c", "z");
+ Add(1, 3U, "d", "e");
+ Add(1, 4U, "l", "p");
+ Add(2, 5U, "g", "i");
+ Add(2, 6U, "x", "z");
+ Add(3, 7U, "j", "k");
+ Add(3, 8U, "l", "m");
+ Add(4, 9U, "a", "b");
+ Add(5, 10U, "c", "cc");
+ Add(5, 11U, "h", "k");
+ Add(5, 12U, "y", "yy");
+ Add(5, 13U, "z", "zz");
+ UpdateVersionStorageInfo();
+ SetCompactionInputFilesLevels(2, 1);
+ AddToCompactionFiles(3U);
+ AddToCompactionFiles(5U);
+ result = Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_);
+ ASSERT_FALSE(result);
+
+ // case 3.3: Higher levels (level 5) have overlap, but it's only overlapping
+ // one key ("d")
+ NewVersionStorage(6, kCompactionStyleLevel);
+ Add(0, 1U, "a", "m");
+ Add(0, 2U, "c", "z");
+ Add(1, 3U, "d", "e");
+ Add(1, 4U, "l", "p");
+ Add(2, 5U, "g", "i");
+ Add(2, 6U, "x", "z");
+ Add(3, 7U, "j", "k");
+ Add(3, 8U, "l", "m");
+ Add(4, 9U, "a", "b");
+ Add(5, 10U, "c", "cc");
+ Add(5, 11U, "ccc", "d");
+ Add(5, 12U, "y", "yy");
+ Add(5, 13U, "z", "zz");
+ UpdateVersionStorageInfo();
+ SetCompactionInputFilesLevels(2, 1);
+ AddToCompactionFiles(3U);
+ AddToCompactionFiles(5U);
+ result = Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_);
+ ASSERT_FALSE(result);
+
+ // Level 0 files overlap
+ NewVersionStorage(6, kCompactionStyleLevel);
+ Add(0, 1U, "s", "t");
+ Add(0, 2U, "a", "m");
+ Add(0, 3U, "b", "z");
+ Add(0, 4U, "e", "f");
+ Add(5, 10U, "y", "z");
+ UpdateVersionStorageInfo();
+ SetCompactionInputFilesLevels(1, 0);
+ AddToCompactionFiles(1U);
+ AddToCompactionFiles(2U);
+ AddToCompactionFiles(3U);
+ AddToCompactionFiles(4U);
+ result = Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_);
+ ASSERT_FALSE(result);
+
+ // Level 0 files don't overlap
+ NewVersionStorage(6, kCompactionStyleLevel);
+ Add(0, 1U, "s", "t");
+ Add(0, 2U, "a", "m");
+ Add(0, 3U, "b", "k");
+ Add(0, 4U, "e", "f");
+ Add(5, 10U, "y", "z");
+ UpdateVersionStorageInfo();
+ SetCompactionInputFilesLevels(1, 0);
+ AddToCompactionFiles(1U);
+ AddToCompactionFiles(2U);
+ AddToCompactionFiles(3U);
+ AddToCompactionFiles(4U);
+ result = Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_);
+ ASSERT_TRUE(result);
+
+ // Level 1 files overlap
+ NewVersionStorage(6, kCompactionStyleLevel);
+ Add(0, 1U, "s", "t");
+ Add(0, 2U, "a", "m");
+ Add(0, 3U, "b", "k");
+ Add(0, 4U, "e", "f");
+ Add(1, 5U, "a", "m");
+ Add(1, 6U, "n", "o");
+ Add(1, 7U, "w", "y");
+ Add(5, 10U, "y", "z");
+ UpdateVersionStorageInfo();
+ SetCompactionInputFilesLevels(2, 0);
+ AddToCompactionFiles(1U);
+ AddToCompactionFiles(2U);
+ AddToCompactionFiles(3U);
+ AddToCompactionFiles(4U);
+ AddToCompactionFiles(5U);
+ AddToCompactionFiles(6U);
+ AddToCompactionFiles(7U);
+ result = Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_);
+ ASSERT_FALSE(result);
+
+ DeleteVersionStorage();
+}
+
+TEST_F(CompactionPickerTest, MaxCompactionBytesHit) {
+ mutable_cf_options_.max_bytes_for_level_base = 1000000u;
+ mutable_cf_options_.max_compaction_bytes = 800000u;
+ mutable_cf_options_.ignore_max_compaction_bytes_for_input = false;
+ ioptions_.level_compaction_dynamic_level_bytes = false;
+ NewVersionStorage(6, kCompactionStyleLevel);
+ // A compaction should be triggered and pick files 2 and 5.
+ // It cannot expand to files 1 and 3, because doing so would push the
+ // compaction size over mutable_cf_options_.max_compaction_bytes.
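+ // Files 1, 2 and 3 add up to 900,001 bytes, which is over the 800,000
+ // max_compaction_bytes limit but under max_bytes_for_level_base.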
+ Add(1, 1U, "100", "150", 300000U);
+ Add(1, 2U, "151", "200", 300001U, 0, 0);
+ Add(1, 3U, "201", "250", 300000U, 0, 0);
+ Add(1, 4U, "251", "300", 300000U, 0, 0);
+ Add(2, 5U, "100", "256", 1U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_levels());
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ ASSERT_EQ(1U, compaction->num_input_files(1));
+ ASSERT_EQ(2U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(5U, compaction->input(1, 0)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, MaxCompactionBytesNotHit) {
+ mutable_cf_options_.max_bytes_for_level_base = 800000u;
+ mutable_cf_options_.max_compaction_bytes = 1000000u;
+ mutable_cf_options_.ignore_max_compaction_bytes_for_input = false;
+ ioptions_.level_compaction_dynamic_level_bytes = false;
+ NewVersionStorage(6, kCompactionStyleLevel);
+ // A compaction should be triggered and pick files 2 and 5,
+ // and it expands to include files 1 and 3 as well.
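+ // Files 1, 2 and 3 add up to 900,001 bytes, which stays within the
+ // 1,000,000 max_compaction_bytes limit.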
+ Add(1, 1U, "100", "150", 300000U);
+ Add(1, 2U, "151", "200", 300001U, 0, 0);
+ Add(1, 3U, "201", "250", 300000U, 0, 0);
+ Add(1, 4U, "251", "300", 300000U, 0, 0);
+ Add(2, 5U, "000", "251", 1U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_levels());
+ ASSERT_EQ(3U, compaction->num_input_files(0));
+ ASSERT_EQ(1U, compaction->num_input_files(1));
+ ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber());
+ ASSERT_EQ(3U, compaction->input(0, 2)->fd.GetNumber());
+ ASSERT_EQ(5U, compaction->input(1, 0)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, IsTrivialMoveOn) {
+ mutable_cf_options_.max_bytes_for_level_base = 10000u;
+ mutable_cf_options_.max_compaction_bytes = 10001u;
+ ioptions_.level_compaction_dynamic_level_bytes = false;
+ NewVersionStorage(6, kCompactionStyleLevel);
+ // A compaction should be triggered and pick file 2
+ Add(1, 1U, "100", "150", 3000U);
+ Add(1, 2U, "151", "200", 3001U);
+ Add(1, 3U, "201", "250", 3000U);
+ Add(1, 4U, "251", "300", 3000U);
+
+ Add(3, 5U, "120", "130", 7000U);
+ Add(3, 6U, "170", "180", 7000U);
+ Add(3, 7U, "220", "230", 7000U);
+ Add(3, 8U, "270", "280", 7000U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_TRUE(compaction->IsTrivialMove());
+}
+
+TEST_F(CompactionPickerTest, L0TrivialMove1) {
+ mutable_cf_options_.max_bytes_for_level_base = 10000000u;
+ mutable_cf_options_.level0_file_num_compaction_trigger = 4;
+ mutable_cf_options_.max_compaction_bytes = 10000000u;
+ ioptions_.level_compaction_dynamic_level_bytes = false;
+ NewVersionStorage(6, kCompactionStyleLevel);
+
+ Add(0, 1U, "100", "150", 3000U, 0, 710, 800);
+ Add(0, 2U, "151", "200", 3001U, 0, 610, 700);
+ Add(0, 3U, "301", "350", 3000U, 0, 510, 600);
+ Add(0, 4U, "451", "400", 3000U, 0, 410, 500);
+
+ Add(1, 5U, "120", "130", 7000U);
+ Add(1, 6U, "170", "180", 7000U);
+ Add(1, 7U, "220", "230", 7000U);
+ Add(1, 8U, "270", "280", 7000U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(1, compaction->num_input_levels());
+ ASSERT_EQ(2, compaction->num_input_files(0));
+ ASSERT_EQ(3, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(4, compaction->input(0, 1)->fd.GetNumber());
+ ASSERT_TRUE(compaction->IsTrivialMove());
+}
+
+TEST_F(CompactionPickerTest, L0TrivialMoveOneFile) {
+ mutable_cf_options_.max_bytes_for_level_base = 10000000u;
+ mutable_cf_options_.level0_file_num_compaction_trigger = 4;
+ mutable_cf_options_.max_compaction_bytes = 10000000u;
+ ioptions_.level_compaction_dynamic_level_bytes = false;
+ NewVersionStorage(6, kCompactionStyleLevel);
+
+ Add(0, 1U, "100", "150", 3000U, 0, 710, 800);
+ Add(0, 2U, "551", "600", 3001U, 0, 610, 700);
+ Add(0, 3U, "101", "150", 3000U, 0, 510, 600);
+ Add(0, 4U, "451", "400", 3000U, 0, 410, 500);
+
+ Add(1, 5U, "120", "130", 7000U);
+ Add(1, 6U, "170", "180", 7000U);
+ Add(1, 7U, "220", "230", 7000U);
+ Add(1, 8U, "270", "280", 7000U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(1, compaction->num_input_levels());
+ ASSERT_EQ(1, compaction->num_input_files(0));
+ ASSERT_EQ(4, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_TRUE(compaction->IsTrivialMove());
+}
+
+TEST_F(CompactionPickerTest, L0TrivialMoveWholeL0) {
+ mutable_cf_options_.max_bytes_for_level_base = 10000000u;
+ mutable_cf_options_.level0_file_num_compaction_trigger = 4;
+ mutable_cf_options_.max_compaction_bytes = 10000000u;
+ ioptions_.level_compaction_dynamic_level_bytes = false;
+ NewVersionStorage(6, kCompactionStyleLevel);
+
+ Add(0, 1U, "300", "350", 3000U, 0, 710, 800);
+ Add(0, 2U, "651", "600", 3001U, 0, 610, 700);
+ Add(0, 3U, "501", "550", 3000U, 0, 510, 600);
+ Add(0, 4U, "451", "400", 3000U, 0, 410, 500);
+
+ Add(1, 5U, "120", "130", 7000U);
+ Add(1, 6U, "970", "980", 7000U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(1, compaction->num_input_levels());
+ ASSERT_EQ(4, compaction->num_input_files(0));
+ ASSERT_EQ(1, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(4, compaction->input(0, 1)->fd.GetNumber());
+ ASSERT_EQ(3, compaction->input(0, 2)->fd.GetNumber());
+ ASSERT_EQ(2, compaction->input(0, 3)->fd.GetNumber());
+ ASSERT_TRUE(compaction->IsTrivialMove());
+}
+
+TEST_F(CompactionPickerTest, IsTrivialMoveOffSstPartitioned) {
+ mutable_cf_options_.max_bytes_for_level_base = 10000u;
+ mutable_cf_options_.max_compaction_bytes = 10001u;
+ ioptions_.level_compaction_dynamic_level_bytes = false;
+ ioptions_.sst_partitioner_factory = NewSstPartitionerFixedPrefixFactory(1);
+ NewVersionStorage(6, kCompactionStyleLevel);
+ // A compaction should be triggered and pick file 2
+ Add(1, 1U, "100", "150", 3000U);
+ Add(1, 2U, "151", "200", 3001U);
+ Add(1, 3U, "201", "250", 3000U);
+ Add(1, 4U, "251", "300", 3000U);
+
+ Add(3, 5U, "120", "130", 7000U);
+ Add(3, 6U, "170", "180", 7000U);
+ Add(3, 7U, "220", "230", 7000U);
+ Add(3, 8U, "270", "280", 7000U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ // No trivial move, because partitioning is applied
+ ASSERT_TRUE(!compaction->IsTrivialMove());
+}
+
+TEST_F(CompactionPickerTest, IsTrivialMoveOff) {
+ mutable_cf_options_.max_bytes_for_level_base = 1000000u;
+ mutable_cf_options_.max_compaction_bytes = 10000u;
+ ioptions_.level_compaction_dynamic_level_bytes = false;
+ NewVersionStorage(6, kCompactionStyleLevel);
+ // A compaction should be triggered and pick all files from level 1
+ Add(1, 1U, "100", "150", 300000U, 0, 0);
+ Add(1, 2U, "150", "200", 300000U, 0, 0);
+ Add(1, 3U, "200", "250", 300000U, 0, 0);
+ Add(1, 4U, "250", "300", 300000U, 0, 0);
+
+ Add(3, 5U, "120", "130", 6000U);
+ Add(3, 6U, "140", "150", 6000U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_FALSE(compaction->IsTrivialMove());
+}
+
+TEST_F(CompactionPickerTest, TrivialMoveMultipleFiles1) {
+ mutable_cf_options_.max_bytes_for_level_base = 1000u;
+ mutable_cf_options_.max_compaction_bytes = 10000001u;
+ ioptions_.level_compaction_dynamic_level_bytes = false;
+ ioptions_.compaction_pri = kMinOverlappingRatio;
+ NewVersionStorage(6, kCompactionStyleLevel);
+
+ Add(2, 1U, "100", "150", 3000U);
+ Add(2, 2U, "151", "200", 3001U);
+ Add(2, 3U, "301", "350", 3000U);
+ Add(2, 4U, "451", "400", 3000U);
+ Add(2, 5U, "551", "500", 3000U);
+ Add(2, 6U, "651", "600", 3000U);
+ Add(2, 7U, "751", "700", 3000U);
+ Add(2, 8U, "851", "900", 3000U);
+
+ Add(3, 15U, "120", "130", 700U);
+ Add(3, 16U, "170", "180", 700U);
+ Add(3, 17U, "220", "230", 700U);
+ Add(3, 18U, "870", "880", 700U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_TRUE(compaction->IsTrivialMove());
+ ASSERT_EQ(1, compaction->num_input_levels());
+ ASSERT_EQ(4, compaction->num_input_files(0));
+ ASSERT_EQ(3, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(4, compaction->input(0, 1)->fd.GetNumber());
+ ASSERT_EQ(5, compaction->input(0, 2)->fd.GetNumber());
+ ASSERT_EQ(6, compaction->input(0, 3)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, TrivialMoveMultipleFiles2) {
+ mutable_cf_options_.max_bytes_for_level_base = 1000u;
+ mutable_cf_options_.max_compaction_bytes = 10000001u;
+ ioptions_.level_compaction_dynamic_level_bytes = false;
+ ioptions_.compaction_pri = kMinOverlappingRatio;
+ NewVersionStorage(6, kCompactionStyleLevel);
+
+ Add(2, 1U, "100", "150", 3000U);
+ Add(2, 2U, "151", "160", 3001U);
+ Add(2, 3U, "161", "179", 3000U);
+ Add(2, 4U, "220", "400", 3000U);
+ Add(2, 5U, "551", "500", 3000U);
+ Add(2, 6U, "651", "600", 3000U);
+ Add(2, 7U, "751", "700", 3000U);
+ Add(2, 8U, "851", "900", 3000U);
+
+ Add(3, 15U, "120", "130", 700U);
+ Add(3, 17U, "220", "230", 700U);
+ Add(3, 18U, "870", "880", 700U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_TRUE(compaction->IsTrivialMove());
+ ASSERT_EQ(1, compaction->num_input_levels());
+ ASSERT_EQ(2, compaction->num_input_files(0));
+ ASSERT_EQ(2, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(3, compaction->input(0, 1)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, TrivialMoveMultipleFiles3) {
+ mutable_cf_options_.max_bytes_for_level_base = 1000u;
+ mutable_cf_options_.max_compaction_bytes = 10000001u;
+ ioptions_.level_compaction_dynamic_level_bytes = false;
+ ioptions_.compaction_pri = kMinOverlappingRatio;
+ NewVersionStorage(6, kCompactionStyleLevel);
+
+ // Even if consecutive files could be trivially moved, we don't pick them,
+ // because if the trivial move cannot be issued for some reason, we cannot
+ // fall back to a normal compaction.
+ Add(2, 1U, "100", "150", 3000U);
+ Add(2, 2U, "151", "160", 3001U);
+ Add(2, 5U, "551", "500", 3000U);
+ Add(2, 6U, "651", "600", 3000U);
+ Add(2, 7U, "751", "700", 3000U);
+ Add(2, 8U, "851", "900", 3000U);
+
+ Add(3, 15U, "120", "130", 700U);
+ Add(3, 17U, "220", "230", 700U);
+ Add(3, 18U, "870", "880", 700U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_TRUE(compaction->IsTrivialMove());
+ ASSERT_EQ(1, compaction->num_input_levels());
+ ASSERT_EQ(1, compaction->num_input_files(0));
+ ASSERT_EQ(2, compaction->input(0, 0)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, TrivialMoveMultipleFiles4) {
+ mutable_cf_options_.max_bytes_for_level_base = 1000u;
+ mutable_cf_options_.max_compaction_bytes = 10000001u;
+ ioptions_.level_compaction_dynamic_level_bytes = false;
+ ioptions_.compaction_pri = kMinOverlappingRatio;
+ NewVersionStorage(6, kCompactionStyleLevel);
+
+ Add(2, 1U, "100", "150", 4000U);
+ Add(2, 2U, "151", "160", 4001U);
+ Add(2, 3U, "161", "179", 4000U);
+
+ Add(3, 15U, "120", "130", 700U);
+ Add(3, 17U, "220", "230", 700U);
+ Add(3, 18U, "870", "880", 700U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_TRUE(compaction->IsTrivialMove());
+ ASSERT_EQ(1, compaction->num_input_levels());
+ ASSERT_EQ(2, compaction->num_input_files(0));
+ ASSERT_EQ(2, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(3, compaction->input(0, 1)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, TrivialMoveMultipleFiles5) {
+ mutable_cf_options_.max_bytes_for_level_base = 1000u;
+ mutable_cf_options_.max_compaction_bytes = 10000001u;
+ ioptions_.level_compaction_dynamic_level_bytes = false;
+ ioptions_.compaction_pri = kMinOverlappingRatio;
+ NewVersionStorage(6, kCompactionStyleLevel);
+
+ // File 4 and 5 aren't clean cut, so only 2 and 3 are picked.
+ Add(2, 1U, "100", "150", 4000U);
+ Add(2, 2U, "151", "160", 4001U);
+ Add(2, 3U, "161", "179", 4000U);
+ Add(2, 4U, "180", "185", 4000U);
+ Add(2, 5U, "185", "190", 4000U);
+
+ Add(3, 15U, "120", "130", 700U);
+ Add(3, 17U, "220", "230", 700U);
+ Add(3, 18U, "870", "880", 700U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_TRUE(compaction->IsTrivialMove());
+ ASSERT_EQ(1, compaction->num_input_levels());
+ ASSERT_EQ(2, compaction->num_input_files(0));
+ ASSERT_EQ(2, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(3, compaction->input(0, 1)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, TrivialMoveMultipleFiles6) {
+ mutable_cf_options_.max_bytes_for_level_base = 1000u;
+ mutable_cf_options_.max_compaction_bytes = 10000001u;
+ ioptions_.level_compaction_dynamic_level_bytes = false;
+ ioptions_.compaction_pri = kMinOverlappingRatio;
+ NewVersionStorage(6, kCompactionStyleLevel);
+
+ Add(2, 1U, "100", "150", 3000U);
+ Add(2, 2U, "151", "200", 3001U);
+ Add(2, 3U, "301", "350", 3000U);
+ Add(2, 4U, "451", "400", 3000U);
+ Add(2, 5U, "551", "500", 3000U);
+ file_map_[5U].first->being_compacted = true;
+ Add(2, 6U, "651", "600", 3000U);
+ Add(2, 7U, "751", "700", 3000U);
+ Add(2, 8U, "851", "900", 3000U);
+
+ Add(3, 15U, "120", "130", 700U);
+ Add(3, 16U, "170", "180", 700U);
+ Add(3, 17U, "220", "230", 700U);
+ Add(3, 18U, "870", "880", 700U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_TRUE(compaction->IsTrivialMove());
+ ASSERT_EQ(1, compaction->num_input_levels());
+ // Since the next file (5) is being compacted, the pick stops at files 3 and 4.
+ ASSERT_EQ(2, compaction->num_input_files(0));
+ ASSERT_EQ(3, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(4, compaction->input(0, 1)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, CacheNextCompactionIndex) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ mutable_cf_options_.max_compaction_bytes = 100000000000u;
+
+ Add(1 /* level */, 1U /* file_number */, "100" /* smallest */,
+ "149" /* largest */, 1000000000U /* file_size */);
+ file_map_[1U].first->being_compacted = true;
+ Add(1 /* level */, 2U /* file_number */, "150" /* smallest */,
+ "199" /* largest */, 900000000U /* file_size */);
+ Add(1 /* level */, 3U /* file_number */, "200" /* smallest */,
+ "249" /* largest */, 800000000U /* file_size */);
+ Add(1 /* level */, 4U /* file_number */, "250" /* smallest */,
+ "299" /* largest */, 700000000U /* file_size */);
+ Add(2 /* level */, 5U /* file_number */, "150" /* smallest */,
+ "199" /* largest */, 100U /* file_size */);
+ Add(2 /* level */, 6U /* file_number */, "200" /* smallest */,
+ "240" /* largest */, 1U /* file_size */);
+ Add(2 /* level */, 7U /* file_number */, "260" /* smallest */,
+ "270" /* largest */, 1U /* file_size */);
+ file_map_[5U].first->being_compacted = true;
+
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_levels());
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ ASSERT_EQ(1U, compaction->num_input_files(1));
+ ASSERT_EQ(3U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(2, vstorage_->NextCompactionIndex(1 /* level */));
+
+ compaction.reset(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_levels());
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ ASSERT_EQ(1U, compaction->num_input_files(1));
+ ASSERT_EQ(4U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(3, vstorage_->NextCompactionIndex(1 /* level */));
+
+ compaction.reset(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() == nullptr);
+ ASSERT_EQ(4, vstorage_->NextCompactionIndex(1 /* level */));
+}
+
+TEST_F(CompactionPickerTest, IntraL0MaxCompactionBytesNotHit) {
+ // Intra L0 compaction triggers only if there are at least
+ // level0_file_num_compaction_trigger + 2 L0 files.
+ mutable_cf_options_.level0_file_num_compaction_trigger = 3;
+ mutable_cf_options_.max_compaction_bytes = 1000000u;
+ NewVersionStorage(6, kCompactionStyleLevel);
+
+ // All 5 L0 files will be picked for intra-L0 compaction. The one L1 file
+ // spans the entire L0 key range and is marked as being compacted to avoid
+ // an L0->L1 compaction.
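+ // 5 * 200,000 = 1,000,000 bytes, which does not exceed max_compaction_bytes.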
+ Add(0, 1U, "100", "150", 200000U, 0, 100, 101);
+ Add(0, 2U, "151", "200", 200000U, 0, 102, 103);
+ Add(0, 3U, "201", "250", 200000U, 0, 104, 105);
+ Add(0, 4U, "251", "300", 200000U, 0, 106, 107);
+ Add(0, 5U, "301", "350", 200000U, 0, 108, 109);
+ Add(1, 6U, "100", "350", 200000U, 0, 110, 111);
+ vstorage_->LevelFiles(1)[0]->being_compacted = true;
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(1U, compaction->num_input_levels());
+ ASSERT_EQ(5U, compaction->num_input_files(0));
+ ASSERT_EQ(CompactionReason::kLevelL0FilesNum,
+ compaction->compaction_reason());
+ ASSERT_EQ(0, compaction->output_level());
+}
+
+TEST_F(CompactionPickerTest, IntraL0MaxCompactionBytesHit) {
+ // Intra L0 compaction triggers only if there are at least
+ // level0_file_num_compaction_trigger + 2 L0 files.
+ mutable_cf_options_.level0_file_num_compaction_trigger = 3;
+ mutable_cf_options_.max_compaction_bytes = 999999u;
+ NewVersionStorage(6, kCompactionStyleLevel);
+
+ // 4 out of 5 L0 files will be picked for intra-L0 compaction due to the
+ // max_compaction_bytes limit (the minimum number of files for triggering
+ // an intra-L0 compaction is 4). The one L1 file spans the entire L0 key
+ // range and is marked as being compacted to avoid an L0->L1 compaction.
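+ // Picking all 5 files (1,000,000 bytes) would exceed max_compaction_bytes
+ // (999,999), so only 4 files (800,000 bytes) are picked.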
+ Add(0, 1U, "100", "150", 200000U, 0, 100, 101);
+ Add(0, 2U, "151", "200", 200000U, 0, 102, 103);
+ Add(0, 3U, "201", "250", 200000U, 0, 104, 105);
+ Add(0, 4U, "251", "300", 200000U, 0, 106, 107);
+ Add(0, 5U, "301", "350", 200000U, 0, 108, 109);
+ Add(1, 6U, "100", "350", 200000U, 0, 109, 110);
+ vstorage_->LevelFiles(1)[0]->being_compacted = true;
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(1U, compaction->num_input_levels());
+ ASSERT_EQ(4U, compaction->num_input_files(0));
+ ASSERT_EQ(CompactionReason::kLevelL0FilesNum,
+ compaction->compaction_reason());
+ ASSERT_EQ(0, compaction->output_level());
+}
+
+TEST_F(CompactionPickerTest, IntraL0ForEarliestSeqno) {
+ // Intra L0 compaction triggers only if there are at least
+ // level0_file_num_compaction_trigger + 2 L0 files.
+ mutable_cf_options_.level0_file_num_compaction_trigger = 3;
+ mutable_cf_options_.max_compaction_bytes = 999999u;
+ NewVersionStorage(6, kCompactionStyleLevel);
+
+ // 4 out of 6 L0 files will be picked for intra-L0 compaction: one L0 file
+ // is skipped because it is already being compacted, and the newest L0 file
+ // is skipped because of the earliest-seqno limit. The one L1 file spans the
+ // entire L0 key range and is marked as being compacted to avoid an L0->L1
+ // compaction.
+ Add(1, 1U, "100", "350", 200000U, 0, 110, 111);
+ Add(0, 2U, "301", "350", 1U, 0, 108, 109);
+ Add(0, 3U, "251", "300", 1U, 0, 106, 107);
+ Add(0, 4U, "201", "250", 1U, 0, 104, 105);
+ Add(0, 5U, "151", "200", 1U, 0, 102, 103);
+ Add(0, 6U, "100", "150", 1U, 0, 100, 101);
+ Add(0, 7U, "100", "100", 1U, 0, 99, 100);
+ vstorage_->LevelFiles(0)[5]->being_compacted = true;
+ vstorage_->LevelFiles(1)[0]->being_compacted = true;
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_, 107));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(1U, compaction->num_input_levels());
+ ASSERT_EQ(4U, compaction->num_input_files(0));
+ ASSERT_EQ(CompactionReason::kLevelL0FilesNum,
+ compaction->compaction_reason());
+ ASSERT_EQ(0, compaction->output_level());
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(CompactionPickerTest, UniversalMarkedCompactionFullOverlap) {
+ const uint64_t kFileSize = 100000;
+
+ ioptions_.compaction_style = kCompactionStyleUniversal;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+ // This test covers the case where a "regular" universal compaction is
+ // scheduled first, followed by a delete triggered compaction. The latter
+ // should fail
+ NewVersionStorage(5, kCompactionStyleUniversal);
+
+ Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
+ Add(0, 2U, "201", "250", 2 * kFileSize, 0, 401, 450);
+ Add(0, 4U, "260", "300", 4 * kFileSize, 0, 260, 300);
+ Add(3, 5U, "010", "080", 8 * kFileSize, 0, 200, 251);
+ Add(4, 3U, "301", "350", 8 * kFileSize, 0, 101, 150);
+ Add(4, 6U, "501", "750", 8 * kFileSize, 0, 101, 150);
+
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+
+ ASSERT_TRUE(compaction);
+ // Validate that it's a compaction to reduce sorted runs
+ ASSERT_EQ(CompactionReason::kUniversalSortedRunNum,
+ compaction->compaction_reason());
+ ASSERT_EQ(0, compaction->output_level());
+ ASSERT_EQ(0, compaction->start_level());
+ ASSERT_EQ(2U, compaction->num_input_files(0));
+
+ AddVersionStorage();
+ // Simulate a flush and mark the file for compaction
+ Add(0, 7U, "150", "200", kFileSize, 0, 551, 600, 0, true);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction2(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_FALSE(compaction2);
+}
+
+TEST_F(CompactionPickerTest, UniversalMarkedCompactionFullOverlap2) {
+ const uint64_t kFileSize = 100000;
+
+ ioptions_.compaction_style = kCompactionStyleUniversal;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+ // This test covers the case where a delete triggered compaction is
+ // scheduled first, followed by a "regular" compaction. The latter
+ // should fail
+ NewVersionStorage(5, kCompactionStyleUniversal);
+
+ // Mark file number 4 for compaction
+ Add(0, 4U, "260", "300", 4 * kFileSize, 0, 260, 300, 0, true);
+ Add(3, 5U, "240", "290", 8 * kFileSize, 0, 201, 250);
+ Add(4, 3U, "301", "350", 8 * kFileSize, 0, 101, 150);
+ Add(4, 6U, "501", "750", 8 * kFileSize, 0, 101, 150);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+
+ ASSERT_TRUE(compaction);
+ // Validate that it's a delete triggered compaction
+ ASSERT_EQ(CompactionReason::kFilesMarkedForCompaction,
+ compaction->compaction_reason());
+ ASSERT_EQ(3, compaction->output_level());
+ ASSERT_EQ(0, compaction->start_level());
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ ASSERT_EQ(1U, compaction->num_input_files(1));
+
+ AddVersionStorage();
+ Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
+ Add(0, 2U, "201", "250", 2 * kFileSize, 0, 401, 450);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction2(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_FALSE(compaction2);
+}
+
+TEST_F(CompactionPickerTest, UniversalMarkedCompactionStartOutputOverlap) {
+ // The case where universal periodic compaction can be picked
+ // with some newer files being compacted.
+ const uint64_t kFileSize = 100000;
+
+ ioptions_.compaction_style = kCompactionStyleUniversal;
+
+ bool input_level_overlap = false;
+ bool output_level_overlap = false;
+ // Let's mark 2 files in 2 different levels for compaction. The
+ // compaction picker will randomly pick one, so use the sync point to
+ // ensure a deterministic order. Loop until both cases are covered
+ size_t random_index = 0;
+ SyncPoint::GetInstance()->SetCallBack(
+ "CompactionPicker::PickFilesMarkedForCompaction", [&](void* arg) {
+ size_t* index = static_cast<size_t*>(arg);
+ *index = random_index;
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ while (!input_level_overlap || !output_level_overlap) {
+ // Ensure that the L0 file gets picked first
+ random_index = !input_level_overlap ? 0 : 1;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+ NewVersionStorage(5, kCompactionStyleUniversal);
+
+ Add(0, 1U, "260", "300", 4 * kFileSize, 0, 260, 300, 0, true);
+ Add(3, 2U, "010", "020", 2 * kFileSize, 0, 201, 248);
+ Add(3, 3U, "250", "270", 2 * kFileSize, 0, 202, 249);
+ Add(3, 4U, "290", "310", 2 * kFileSize, 0, 203, 250);
+ Add(3, 5U, "310", "320", 2 * kFileSize, 0, 204, 251, 0, true);
+ Add(4, 6U, "301", "350", 8 * kFileSize, 0, 101, 150);
+ Add(4, 7U, "501", "750", 8 * kFileSize, 0, 101, 150);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+
+ ASSERT_TRUE(compaction);
+ // Validate that it's a delete triggered compaction
+ ASSERT_EQ(CompactionReason::kFilesMarkedForCompaction,
+ compaction->compaction_reason());
+ ASSERT_TRUE(compaction->start_level() == 0 ||
+ compaction->start_level() == 3);
+ if (compaction->start_level() == 0) {
+ // The L0 file was picked. The next compaction will detect an
+ // overlap on its input level
+ input_level_overlap = true;
+ ASSERT_EQ(3, compaction->output_level());
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ ASSERT_EQ(3U, compaction->num_input_files(1));
+ } else {
+ // The level 3 file was picked. The next compaction will pick
+ // the L0 file and will detect overlap when adding output
+ // level inputs
+ output_level_overlap = true;
+ ASSERT_EQ(4, compaction->output_level());
+ ASSERT_EQ(2U, compaction->num_input_files(0));
+ ASSERT_EQ(1U, compaction->num_input_files(1));
+ }
+
+ vstorage_->ComputeCompactionScore(ioptions_, mutable_cf_options_);
+ // After recomputing the compaction score, only one marked file will remain
+ random_index = 0;
+ std::unique_ptr<Compaction> compaction2(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_FALSE(compaction2);
+ DeleteVersionStorage();
+ }
+}
+
+TEST_F(CompactionPickerTest, UniversalMarkedL0NoOverlap) {
+ const uint64_t kFileSize = 100000;
+
+ ioptions_.compaction_style = kCompactionStyleUniversal;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+ // This test covers the case where a delete triggered compaction is
+ // scheduled and should result in a full compaction
+ NewVersionStorage(1, kCompactionStyleUniversal);
+
+ // Mark file number 4 for compaction
+ Add(0, 4U, "260", "300", 1 * kFileSize, 0, 260, 300, 0, true);
+ Add(0, 5U, "240", "290", 2 * kFileSize, 0, 201, 250);
+ Add(0, 3U, "301", "350", 4 * kFileSize, 0, 101, 150);
+ Add(0, 6U, "501", "750", 8 * kFileSize, 0, 50, 100);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+
+ ASSERT_TRUE(compaction);
+ // Validate that it's a delete triggered compaction
+ ASSERT_EQ(CompactionReason::kFilesMarkedForCompaction,
+ compaction->compaction_reason());
+ ASSERT_EQ(0, compaction->output_level());
+ ASSERT_EQ(0, compaction->start_level());
+ ASSERT_EQ(4U, compaction->num_input_files(0));
+ ASSERT_TRUE(file_map_[4].first->being_compacted);
+ ASSERT_TRUE(file_map_[5].first->being_compacted);
+ ASSERT_TRUE(file_map_[3].first->being_compacted);
+ ASSERT_TRUE(file_map_[6].first->being_compacted);
+}
+
+TEST_F(CompactionPickerTest, UniversalMarkedL0WithOverlap) {
+ const uint64_t kFileSize = 100000;
+
+ ioptions_.compaction_style = kCompactionStyleUniversal;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+ // This test covers the case where a file is being compacted, and a
+ // delete triggered compaction is then scheduled. The latter should stop
+ // at the first file being compacted
+ NewVersionStorage(1, kCompactionStyleUniversal);
+
+ // Mark file number 4 for compaction
+ Add(0, 4U, "260", "300", 1 * kFileSize, 0, 260, 300, 0, true);
+ Add(0, 5U, "240", "290", 2 * kFileSize, 0, 201, 250);
+ Add(0, 3U, "301", "350", 4 * kFileSize, 0, 101, 150);
+ Add(0, 6U, "501", "750", 8 * kFileSize, 0, 50, 100);
+ UpdateVersionStorageInfo();
+ file_map_[3].first->being_compacted = true;
+
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+
+ ASSERT_TRUE(compaction);
+ // Validate that it's a delete triggered compaction
+ ASSERT_EQ(CompactionReason::kFilesMarkedForCompaction,
+ compaction->compaction_reason());
+ ASSERT_EQ(0, compaction->output_level());
+ ASSERT_EQ(0, compaction->start_level());
+ ASSERT_EQ(2U, compaction->num_input_files(0));
+ ASSERT_TRUE(file_map_[4].first->being_compacted);
+ ASSERT_TRUE(file_map_[5].first->being_compacted);
+}
+
+TEST_F(CompactionPickerTest, UniversalMarkedL0Overlap2) {
+ const uint64_t kFileSize = 100000;
+
+ ioptions_.compaction_style = kCompactionStyleUniversal;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+ // This test covers the case where a delete triggered compaction is
+ // scheduled first, followed by a "regular" compaction. The latter
+ // should fail
+ NewVersionStorage(1, kCompactionStyleUniversal);
+
+ // Mark file number 5 for compaction
+ Add(0, 4U, "260", "300", 1 * kFileSize, 0, 260, 300);
+ Add(0, 5U, "240", "290", 2 * kFileSize, 0, 201, 250, 0, true);
+ Add(0, 3U, "301", "350", 4 * kFileSize, 0, 101, 150);
+ Add(0, 6U, "501", "750", 8 * kFileSize, 0, 50, 100);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+
+ ASSERT_TRUE(compaction);
+ // Validate that it's a delete triggered compaction
+ ASSERT_EQ(CompactionReason::kFilesMarkedForCompaction,
+ compaction->compaction_reason());
+ ASSERT_EQ(0, compaction->output_level());
+ ASSERT_EQ(0, compaction->start_level());
+ ASSERT_EQ(3U, compaction->num_input_files(0));
+ ASSERT_TRUE(file_map_[5].first->being_compacted);
+ ASSERT_TRUE(file_map_[3].first->being_compacted);
+ ASSERT_TRUE(file_map_[6].first->being_compacted);
+
+ AddVersionStorage();
+ Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
+ Add(0, 2U, "201", "250", kFileSize, 0, 401, 450);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction2(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction2);
+ ASSERT_EQ(3U, compaction2->num_input_files(0));
+ ASSERT_TRUE(file_map_[1].first->being_compacted);
+ ASSERT_TRUE(file_map_[2].first->being_compacted);
+ ASSERT_TRUE(file_map_[4].first->being_compacted);
+}
+
+TEST_F(CompactionPickerTest, UniversalMarkedManualCompaction) {
+ const uint64_t kFileSize = 100000;
+ const int kNumLevels = 7;
+
+ // This test makes sure the `files_marked_for_compaction_` is updated after
+ // creating manual compaction.
+ ioptions_.compaction_style = kCompactionStyleUniversal;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+ NewVersionStorage(kNumLevels, kCompactionStyleUniversal);
+
+ // Add 3 files marked for compaction
+ Add(0, 3U, "301", "350", 4 * kFileSize, 0, 101, 150, 0, true);
+ Add(0, 4U, "260", "300", 1 * kFileSize, 0, 260, 300, 0, true);
+ Add(0, 5U, "240", "290", 2 * kFileSize, 0, 201, 250, 0, true);
+ UpdateVersionStorageInfo();
+
+ // All 3 files are marked for compaction
+ ASSERT_EQ(3U, vstorage_->FilesMarkedForCompaction().size());
+
+ bool manual_conflict = false;
+ InternalKey* manual_end = nullptr;
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.CompactRange(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ ColumnFamilyData::kCompactAllLevels, 6, CompactRangeOptions(),
+ nullptr, nullptr, &manual_end, &manual_conflict,
+ std::numeric_limits<uint64_t>::max(), ""));
+
+ ASSERT_TRUE(compaction);
+
+ ASSERT_EQ(CompactionReason::kManualCompaction,
+ compaction->compaction_reason());
+ ASSERT_EQ(kNumLevels - 1, compaction->output_level());
+ ASSERT_EQ(0, compaction->start_level());
+ ASSERT_EQ(3U, compaction->num_input_files(0));
+ ASSERT_TRUE(file_map_[3].first->being_compacted);
+ ASSERT_TRUE(file_map_[4].first->being_compacted);
+ ASSERT_TRUE(file_map_[5].first->being_compacted);
+
+ // After creating the manual compaction, all files should be cleared from
+ // `FilesMarkedForCompaction`. So they won't be picked by others.
+ ASSERT_EQ(0U, vstorage_->FilesMarkedForCompaction().size());
+}
+
+TEST_F(CompactionPickerTest, UniversalSizeAmpTierCompactionNonLastLevel) {
+ // This test makes sure size amplification compaction can still be triggered
+ // if the last sorted run is not the last level.
+ const uint64_t kFileSize = 100000;
+ const int kNumLevels = 7;
+ const int kLastLevel = kNumLevels - 1;
+
+ ioptions_.compaction_style = kCompactionStyleUniversal;
+ ioptions_.preclude_last_level_data_seconds = 1000;
+ mutable_cf_options_.compaction_options_universal
+ .max_size_amplification_percent = 200;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+ NewVersionStorage(kNumLevels, kCompactionStyleUniversal);
+ Add(0, 100U, "100", "300", 1 * kFileSize);
+ Add(0, 101U, "200", "400", 1 * kFileSize);
+ Add(4, 90U, "100", "600", 4 * kFileSize);
+ Add(5, 80U, "200", "300", 2 * kFileSize);
+ UpdateVersionStorageInfo();
+
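+ // Rough size amp estimate, assuming the usual "all sorted runs except the
+ // last vs. the last sorted run" check: (1 + 1 + 4) / 2 = 300% >= 200%,
+ // so a size amp compaction is expected.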
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+
+ // Make sure it's a size amp compaction and includes all files
+ ASSERT_EQ(compaction->compaction_reason(),
+ CompactionReason::kUniversalSizeAmplification);
+ ASSERT_EQ(compaction->output_level(), kLastLevel);
+ ASSERT_EQ(compaction->input_levels(0)->num_files, 2);
+ ASSERT_EQ(compaction->input_levels(4)->num_files, 1);
+ ASSERT_EQ(compaction->input_levels(5)->num_files, 1);
+}
+
+TEST_F(CompactionPickerTest, UniversalSizeRatioTierCompactionLastLevel) {
+ // This test makes sure the size amp calculation skips the last level (L6),
+ // so size amp compaction is not triggered; instead, a size ratio compaction
+ // is triggered.
+ const uint64_t kFileSize = 100000;
+ const int kNumLevels = 7;
+ const int kLastLevel = kNumLevels - 1;
+ const int kPenultimateLevel = kLastLevel - 1;
+
+ ioptions_.compaction_style = kCompactionStyleUniversal;
+ ioptions_.preclude_last_level_data_seconds = 1000;
+ mutable_cf_options_.compaction_options_universal
+ .max_size_amplification_percent = 200;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+ NewVersionStorage(kNumLevels, kCompactionStyleUniversal);
+ Add(0, 100U, "100", "300", 1 * kFileSize);
+ Add(0, 101U, "200", "400", 1 * kFileSize);
+ Add(5, 90U, "100", "600", 4 * kFileSize);
+ Add(6, 80U, "200", "300", 2 * kFileSize);
+ UpdateVersionStorageInfo();
+
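+ // With the last level (L6) skipped, the last sorted run for the size amp
+ // check is the 4 * kFileSize file on L5: (1 + 1) / 4 = 50% < 200%, so size
+ // amp does not trigger (assuming the usual size amp estimate).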
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+
+ // Internally, size amp compaction is evaluated before size ratio compaction.
+ // Here we make sure it's a size ratio compaction instead of size amp.
+ ASSERT_EQ(compaction->compaction_reason(),
+ CompactionReason::kUniversalSizeRatio);
+ ASSERT_EQ(compaction->output_level(), kPenultimateLevel - 1);
+ ASSERT_EQ(compaction->input_levels(0)->num_files, 2);
+ ASSERT_EQ(compaction->input_levels(5)->num_files, 0);
+ ASSERT_EQ(compaction->input_levels(6)->num_files, 0);
+}
+
+TEST_F(CompactionPickerTest, UniversalSizeAmpTierCompactionNotSuport) {
+ // Tiered compaction only supports more than 2 levels (otherwise the
+ // penultimate level would be level 0, which may make things more
+ // complicated), so when there are only 2 levels, level 1 is still treated
+ // as the last level for size amp compaction.
+ const uint64_t kFileSize = 100000;
+ const int kNumLevels = 2;
+ const int kLastLevel = kNumLevels - 1;
+
+ ioptions_.compaction_style = kCompactionStyleUniversal;
+ ioptions_.preclude_last_level_data_seconds = 1000;
+ mutable_cf_options_.compaction_options_universal
+ .max_size_amplification_percent = 200;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+ NewVersionStorage(kNumLevels, kCompactionStyleUniversal);
+ Add(0, 100U, "100", "300", 1 * kFileSize);
+ Add(0, 101U, "200", "400", 1 * kFileSize);
+ Add(0, 90U, "100", "600", 4 * kFileSize);
+ Add(1, 80U, "200", "300", 2 * kFileSize);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+
+ // Size amp compaction is still triggered even when preclude_last_level is set
+ ASSERT_EQ(compaction->compaction_reason(),
+ CompactionReason::kUniversalSizeAmplification);
+ ASSERT_EQ(compaction->output_level(), kLastLevel);
+ ASSERT_EQ(compaction->input_levels(0)->num_files, 3);
+ ASSERT_EQ(compaction->input_levels(1)->num_files, 1);
+}
+
+TEST_F(CompactionPickerTest, UniversalSizeAmpTierCompactionLastLevel) {
+ // This test makes sure the size amp compaction for tiered storage can still
+ // be triggered, but only for non-last-level files.
+ const uint64_t kFileSize = 100000;
+ const int kNumLevels = 7;
+ const int kLastLevel = kNumLevels - 1;
+ const int kPenultimateLevel = kLastLevel - 1;
+
+ ioptions_.compaction_style = kCompactionStyleUniversal;
+ ioptions_.preclude_last_level_data_seconds = 1000;
+ mutable_cf_options_.compaction_options_universal
+ .max_size_amplification_percent = 200;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+ NewVersionStorage(kNumLevels, kCompactionStyleUniversal);
+ Add(0, 100U, "100", "300", 3 * kFileSize);
+ Add(0, 101U, "200", "400", 2 * kFileSize);
+ Add(5, 90U, "100", "600", 2 * kFileSize);
+ Add(6, 80U, "200", "300", 2 * kFileSize);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+
+ // It's a size amp compaction, but it doesn't include the last level file,
+ // and it outputs to the penultimate level.
+ ASSERT_EQ(compaction->compaction_reason(),
+ CompactionReason::kUniversalSizeAmplification);
+ ASSERT_EQ(compaction->output_level(), kPenultimateLevel);
+ ASSERT_EQ(compaction->input_levels(0)->num_files, 2);
+ ASSERT_EQ(compaction->input_levels(5)->num_files, 1);
+ ASSERT_EQ(compaction->input_levels(6)->num_files, 0);
+}
+
+TEST_F(CompactionPickerU64TsTest, Overlap) {
+ int num_levels = ioptions_.num_levels;
+ NewVersionStorage(num_levels, kCompactionStyleLevel);
+
+ constexpr int level = 0;
+ constexpr uint64_t file_number = 20ULL;
+ constexpr char smallest[4] = "500";
+ constexpr char largest[4] = "600";
+ constexpr uint64_t ts_of_smallest = 12345ULL;
+ constexpr uint64_t ts_of_largest = 56789ULL;
+
+ {
+ std::string ts1;
+ PutFixed64(&ts1, ts_of_smallest);
+ std::string ts2;
+ PutFixed64(&ts2, ts_of_largest);
+ Add(level, file_number, smallest, largest,
+ /*file_size=*/1U, /*path_id=*/0,
+ /*smallest_seq=*/100, /*largest_seq=*/100, /*compensated_file_size=*/0,
+ /*marked_for_compact=*/false, /*temperature=*/Temperature::kUnknown,
+ /*oldest_ancestor_time=*/kUnknownOldestAncesterTime, ts1, ts2);
+ UpdateVersionStorageInfo();
+ }
+
+ std::unordered_set<uint64_t> input{file_number};
+
+ std::vector<CompactionInputFiles> input_files;
+ ASSERT_OK(level_compaction_picker.GetCompactionInputsFromFileNumbers(
+ &input_files, &input, vstorage_.get(), CompactionOptions()));
+ std::unique_ptr<Compaction> comp1(level_compaction_picker.CompactFiles(
+ CompactionOptions(), input_files, level, vstorage_.get(),
+ mutable_cf_options_, mutable_db_options_, /*output_path_id=*/0));
+
+ {
+ // [600, ts=ts_of_largest - 1] to [600, ts=ts_of_largest - 1] is the range to check.
+ // ucmp->Compare(smallest_user_key, c->GetLargestUserKey()) > 0, but
+ // ucmp->CompareWithoutTimestamp(smallest_user_key,
+ // c->GetLargestUserKey()) == 0.
+ // Should still be considered overlapping.
+ std::string user_key_with_ts1(largest);
+ PutFixed64(&user_key_with_ts1, ts_of_largest - 1);
+ std::string user_key_with_ts2(largest);
+ PutFixed64(&user_key_with_ts2, ts_of_largest - 1);
+ ASSERT_TRUE(level_compaction_picker.RangeOverlapWithCompaction(
+ user_key_with_ts1, user_key_with_ts2, level));
+ }
+ {
+ // [500, ts=ts_of_smallest + 1] to [500, ts=ts_of_smallest + 1] is the range to check.
+ // ucmp->Compare(largest_user_key, c->GetSmallestUserKey()) < 0, but
+ // ucmp->CompareWithoutTimestamp(largest_user_key,
+ // c->GetSmallestUserKey()) == 0.
+ // Should still be considered overlapping.
+ std::string user_key_with_ts1(smallest);
+ PutFixed64(&user_key_with_ts1, ts_of_smallest + 1);
+ std::string user_key_with_ts2(smallest);
+ PutFixed64(&user_key_with_ts2, ts_of_smallest + 1);
+ ASSERT_TRUE(level_compaction_picker.RangeOverlapWithCompaction(
+ user_key_with_ts1, user_key_with_ts2, level));
+ }
+}
+
+TEST_F(CompactionPickerU64TsTest, CannotTrivialMoveUniversal) {
+ constexpr uint64_t kFileSize = 100000;
+
+ mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+ mutable_cf_options_.compaction_options_universal.allow_trivial_move = true;
+ NewVersionStorage(1, kCompactionStyleUniversal);
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+ UpdateVersionStorageInfo();
+ // Must return false when there are no files.
+ ASSERT_FALSE(universal_compaction_picker.NeedsCompaction(vstorage_.get()));
+
+ std::string ts1;
+ PutFixed64(&ts1, 9000);
+ std::string ts2;
+ PutFixed64(&ts2, 8000);
+ std::string ts3;
+ PutFixed64(&ts3, 7000);
+ std::string ts4;
+ PutFixed64(&ts4, 6000);
+
+ NewVersionStorage(3, kCompactionStyleUniversal);
+ // A compaction should be triggered and pick file 2
+ Add(1, 1U, "150", "150", kFileSize, /*path_id=*/0, /*smallest_seq=*/100,
+ /*largest_seq=*/100, /*compensated_file_size=*/kFileSize,
+ /*marked_for_compact=*/false, Temperature::kUnknown,
+ kUnknownOldestAncesterTime, ts1, ts2);
+ Add(2, 2U, "150", "150", kFileSize, /*path_id=*/0, /*smallest_seq=*/100,
+ /*largest_seq=*/100, /*compensated_file_size=*/kFileSize,
+ /*marked_for_compact=*/false, Temperature::kUnknown,
+ kUnknownOldestAncesterTime, ts3, ts4);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ assert(compaction);
+ ASSERT_TRUE(!compaction->is_trivial_move());
+}
+
+class PerKeyPlacementCompactionPickerTest
+ : public CompactionPickerTest,
+ public testing::WithParamInterface<bool> {
+ public:
+ PerKeyPlacementCompactionPickerTest() : CompactionPickerTest() {}
+
+ void SetUp() override { enable_per_key_placement_ = GetParam(); }
+
+ protected:
+ bool enable_per_key_placement_ = false;
+};
+
+TEST_P(PerKeyPlacementCompactionPickerTest, OverlapWithNormalCompaction) {
+ SyncPoint::GetInstance()->SetCallBack(
+ "Compaction::SupportsPerKeyPlacement:Enabled", [&](void* arg) {
+ auto supports_per_key_placement = static_cast<bool*>(arg);
+ *supports_per_key_placement = enable_per_key_placement_;
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ int num_levels = ioptions_.num_levels;
+ NewVersionStorage(num_levels, kCompactionStyleLevel);
+
+ Add(0, 21U, "100", "150", 60000000U);
+ Add(0, 22U, "300", "350", 60000000U);
+ Add(5, 40U, "200", "250", 60000000U);
+ Add(6, 50U, "101", "351", 60000000U);
+ UpdateVersionStorageInfo();
+
+ CompactionOptions comp_options;
+ std::unordered_set<uint64_t> input_set;
+ input_set.insert(40);
+ std::vector<CompactionInputFiles> input_files;
+ ASSERT_OK(level_compaction_picker.GetCompactionInputsFromFileNumbers(
+ &input_files, &input_set, vstorage_.get(), comp_options));
+
+ std::unique_ptr<Compaction> comp1(level_compaction_picker.CompactFiles(
+ comp_options, input_files, 5, vstorage_.get(), mutable_cf_options_,
+ mutable_db_options_, 0));
+
+ input_set.clear();
+ input_files.clear();
+ input_set.insert(21);
+ input_set.insert(22);
+ input_set.insert(50);
+ ASSERT_OK(level_compaction_picker.GetCompactionInputsFromFileNumbers(
+ &input_files, &input_set, vstorage_.get(), comp_options));
+
+ ASSERT_EQ(enable_per_key_placement_,
+ level_compaction_picker.FilesRangeOverlapWithCompaction(
+ input_files, 6,
+ Compaction::EvaluatePenultimateLevel(vstorage_.get(), ioptions_,
+ 0, 6)));
+}
+
+TEST_P(PerKeyPlacementCompactionPickerTest, NormalCompactionOverlap) {
+ SyncPoint::GetInstance()->SetCallBack(
+ "Compaction::SupportsPerKeyPlacement:Enabled", [&](void* arg) {
+ auto supports_per_key_placement = static_cast<bool*>(arg);
+ *supports_per_key_placement = enable_per_key_placement_;
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ int num_levels = ioptions_.num_levels;
+ NewVersionStorage(num_levels, kCompactionStyleLevel);
+
+ Add(0, 21U, "100", "150", 60000000U);
+ Add(0, 22U, "300", "350", 60000000U);
+ Add(4, 40U, "200", "220", 60000000U);
+ Add(4, 41U, "230", "250", 60000000U);
+ Add(6, 50U, "101", "351", 60000000U);
+ UpdateVersionStorageInfo();
+
+ CompactionOptions comp_options;
+ std::unordered_set<uint64_t> input_set;
+ input_set.insert(21);
+ input_set.insert(22);
+ input_set.insert(50);
+ std::vector<CompactionInputFiles> input_files;
+ ASSERT_OK(level_compaction_picker.GetCompactionInputsFromFileNumbers(
+ &input_files, &input_set, vstorage_.get(), comp_options));
+
+ std::unique_ptr<Compaction> comp1(level_compaction_picker.CompactFiles(
+ comp_options, input_files, 6, vstorage_.get(), mutable_cf_options_,
+ mutable_db_options_, 0));
+
+ input_set.clear();
+ input_files.clear();
+ input_set.insert(40);
+ input_set.insert(41);
+ ASSERT_OK(level_compaction_picker.GetCompactionInputsFromFileNumbers(
+ &input_files, &input_set, vstorage_.get(), comp_options));
+
+ ASSERT_EQ(enable_per_key_placement_,
+ level_compaction_picker.FilesRangeOverlapWithCompaction(
+ input_files, 5, Compaction::kInvalidLevel));
+}
+
+TEST_P(PerKeyPlacementCompactionPickerTest,
+ OverlapWithNormalCompactionUniveral) {
+ SyncPoint::GetInstance()->SetCallBack(
+ "Compaction::SupportsPerKeyPlacement:Enabled", [&](void* arg) {
+ auto supports_per_key_placement = static_cast<bool*>(arg);
+ *supports_per_key_placement = enable_per_key_placement_;
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ int num_levels = ioptions_.num_levels;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+ NewVersionStorage(num_levels, kCompactionStyleUniversal);
+
+ Add(0, 21U, "100", "150", 60000000U);
+ Add(0, 22U, "300", "350", 60000000U);
+ Add(5, 40U, "200", "250", 60000000U);
+ Add(6, 50U, "101", "351", 60000000U);
+ UpdateVersionStorageInfo();
+
+ CompactionOptions comp_options;
+ std::unordered_set<uint64_t> input_set;
+ input_set.insert(40);
+ std::vector<CompactionInputFiles> input_files;
+ ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers(
+ &input_files, &input_set, vstorage_.get(), comp_options));
+
+ std::unique_ptr<Compaction> comp1(universal_compaction_picker.CompactFiles(
+ comp_options, input_files, 5, vstorage_.get(), mutable_cf_options_,
+ mutable_db_options_, 0));
+
+ input_set.clear();
+ input_files.clear();
+ input_set.insert(21);
+ input_set.insert(22);
+ input_set.insert(50);
+ ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers(
+ &input_files, &input_set, vstorage_.get(), comp_options));
+
+ ASSERT_EQ(enable_per_key_placement_,
+ universal_compaction_picker.FilesRangeOverlapWithCompaction(
+ input_files, 6,
+ Compaction::EvaluatePenultimateLevel(vstorage_.get(), ioptions_,
+ 0, 6)));
+}
+
+TEST_P(PerKeyPlacementCompactionPickerTest, NormalCompactionOverlapUniversal) {
+ SyncPoint::GetInstance()->SetCallBack(
+ "Compaction::SupportsPerKeyPlacement:Enabled", [&](void* arg) {
+ auto supports_per_key_placement = static_cast<bool*>(arg);
+ *supports_per_key_placement = enable_per_key_placement_;
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ int num_levels = ioptions_.num_levels;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+ NewVersionStorage(num_levels, kCompactionStyleUniversal);
+
+ Add(0, 21U, "100", "150", 60000000U);
+ Add(0, 22U, "300", "350", 60000000U);
+ Add(4, 40U, "200", "220", 60000000U);
+ Add(4, 41U, "230", "250", 60000000U);
+ Add(6, 50U, "101", "351", 60000000U);
+ UpdateVersionStorageInfo();
+
+ CompactionOptions comp_options;
+ std::unordered_set<uint64_t> input_set;
+ input_set.insert(21);
+ input_set.insert(22);
+ input_set.insert(50);
+ std::vector<CompactionInputFiles> input_files;
+ ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers(
+ &input_files, &input_set, vstorage_.get(), comp_options));
+
+ std::unique_ptr<Compaction> comp1(universal_compaction_picker.CompactFiles(
+ comp_options, input_files, 6, vstorage_.get(), mutable_cf_options_,
+ mutable_db_options_, 0));
+
+ input_set.clear();
+ input_files.clear();
+ input_set.insert(40);
+ input_set.insert(41);
+ ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers(
+ &input_files, &input_set, vstorage_.get(), comp_options));
+
+ ASSERT_EQ(enable_per_key_placement_,
+ universal_compaction_picker.FilesRangeOverlapWithCompaction(
+ input_files, 5, Compaction::kInvalidLevel));
+}
+
+TEST_P(PerKeyPlacementCompactionPickerTest, PenultimateOverlapUniversal) {
+  // This test makes sure a tiered compaction locks the whole range of both
+  // the output level and the penultimate level.
+ if (enable_per_key_placement_) {
+ ioptions_.preclude_last_level_data_seconds = 10000;
+ }
+
+ int num_levels = ioptions_.num_levels;
+ ioptions_.compaction_style = kCompactionStyleUniversal;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+ NewVersionStorage(num_levels, kCompactionStyleUniversal);
+
+ // L4: [200, 220] [230, 250] [360, 380]
+ // L5:
+ // L6: [101, 351]
+ Add(4, 40U, "200", "220", 60000000U);
+ Add(4, 41U, "230", "250", 60000000U);
+ Add(4, 42U, "360", "380", 60000000U);
+ Add(6, 60U, "101", "351", 60000000U);
+ UpdateVersionStorageInfo();
+
+  // The existing compaction is the 1st L4 file + the L6 file; a compaction of
+  // the 2nd L4 file to L5 (the penultimate level) then overlaps with it when
+  // the tiered compaction feature is on.
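+  // With per-key placement enabled, comp1 (files 40 + 60) may also output to
+  // L5, and its key range [101, 351] covers file 41's range [230, 250];
+  // without the feature, comp1 writes only to L6, so compacting file 41 into
+  // L5 does not conflict.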
+ CompactionOptions comp_options;
+ std::unordered_set<uint64_t> input_set;
+ input_set.insert(40);
+ input_set.insert(60);
+ std::vector<CompactionInputFiles> input_files;
+ ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers(
+ &input_files, &input_set, vstorage_.get(), comp_options));
+
+ std::unique_ptr<Compaction> comp1(universal_compaction_picker.CompactFiles(
+ comp_options, input_files, 6, vstorage_.get(), mutable_cf_options_,
+ mutable_db_options_, 0));
+
+ input_set.clear();
+ input_files.clear();
+ input_set.insert(41);
+ ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers(
+ &input_files, &input_set, vstorage_.get(), comp_options));
+
+ ASSERT_EQ(enable_per_key_placement_,
+ universal_compaction_picker.FilesRangeOverlapWithCompaction(
+ input_files, 5, Compaction::kInvalidLevel));
+
+ // compacting the 3rd L4 file is always safe:
+ input_set.clear();
+ input_files.clear();
+ input_set.insert(42);
+ ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers(
+ &input_files, &input_set, vstorage_.get(), comp_options));
+
+ ASSERT_FALSE(universal_compaction_picker.FilesRangeOverlapWithCompaction(
+ input_files, 5, Compaction::kInvalidLevel));
+}
+
+TEST_P(PerKeyPlacementCompactionPickerTest, LastLevelOnlyOverlapUniversal) {
+ if (enable_per_key_placement_) {
+ ioptions_.preclude_last_level_data_seconds = 10000;
+ }
+
+ int num_levels = ioptions_.num_levels;
+ ioptions_.compaction_style = kCompactionStyleUniversal;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+ NewVersionStorage(num_levels, kCompactionStyleUniversal);
+
+ // L4: [200, 220] [230, 250] [360, 380]
+ // L5:
+ // L6: [101, 351]
+ Add(4, 40U, "200", "220", 60000000U);
+ Add(4, 41U, "230", "250", 60000000U);
+ Add(4, 42U, "360", "380", 60000000U);
+ Add(6, 60U, "101", "351", 60000000U);
+ UpdateVersionStorageInfo();
+
+ CompactionOptions comp_options;
+ std::unordered_set<uint64_t> input_set;
+ input_set.insert(60);
+ std::vector<CompactionInputFiles> input_files;
+ ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers(
+ &input_files, &input_set, vstorage_.get(), comp_options));
+
+ std::unique_ptr<Compaction> comp1(universal_compaction_picker.CompactFiles(
+ comp_options, input_files, 6, vstorage_.get(), mutable_cf_options_,
+ mutable_db_options_, 0));
+
+  // File 41 cannot be compacted if the preclude_last_level feature is on;
+  // otherwise, compacting file 41 is okay.
+ input_set.clear();
+ input_files.clear();
+ input_set.insert(41);
+ ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers(
+ &input_files, &input_set, vstorage_.get(), comp_options));
+
+ ASSERT_EQ(enable_per_key_placement_,
+ universal_compaction_picker.FilesRangeOverlapWithCompaction(
+ input_files, 5, Compaction::kInvalidLevel));
+
+ // compacting the 3rd L4 file is always safe:
+ input_set.clear();
+ input_files.clear();
+ input_set.insert(42);
+ ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers(
+ &input_files, &input_set, vstorage_.get(), comp_options));
+
+ ASSERT_FALSE(universal_compaction_picker.FilesRangeOverlapWithCompaction(
+ input_files, 5, Compaction::kInvalidLevel));
+}
+
+TEST_P(PerKeyPlacementCompactionPickerTest,
+ LastLevelOnlyFailPenultimateUniversal) {
+  // This tests that a last-level-only compaction is still unable to output to
+  // the penultimate level if there's already a file in the penultimate level.
+  // This should rarely happen in universal compaction, as a non-empty L5
+  // should be included in the compaction.
+ if (enable_per_key_placement_) {
+ ioptions_.preclude_last_level_data_seconds = 10000;
+ }
+
+ int num_levels = ioptions_.num_levels;
+ ioptions_.compaction_style = kCompactionStyleUniversal;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+ NewVersionStorage(num_levels, kCompactionStyleUniversal);
+
+ // L4: [200, 220]
+ // L5: [230, 250]
+ // L6: [101, 351]
+ Add(4, 40U, "200", "220", 60000000U);
+ Add(5, 50U, "230", "250", 60000000U);
+ Add(6, 60U, "101", "351", 60000000U);
+ UpdateVersionStorageInfo();
+
+ CompactionOptions comp_options;
+ std::unordered_set<uint64_t> input_set;
+ input_set.insert(60);
+ std::vector<CompactionInputFiles> input_files;
+ ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers(
+ &input_files, &input_set, vstorage_.get(), comp_options));
+
+ std::unique_ptr<Compaction> comp1(universal_compaction_picker.CompactFiles(
+ comp_options, input_files, 6, vstorage_.get(), mutable_cf_options_,
+ mutable_db_options_, 0));
+
+ ASSERT_TRUE(comp1);
+ ASSERT_EQ(comp1->GetPenultimateLevel(), Compaction::kInvalidLevel);
+
+ // As comp1 cannot be output to the penultimate level, compacting file 40 to
+ // L5 is always safe.
+ input_set.clear();
+ input_files.clear();
+ input_set.insert(40);
+ ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers(
+ &input_files, &input_set, vstorage_.get(), comp_options));
+
+ ASSERT_FALSE(universal_compaction_picker.FilesRangeOverlapWithCompaction(
+ input_files, 5, Compaction::kInvalidLevel));
+
+ std::unique_ptr<Compaction> comp2(universal_compaction_picker.CompactFiles(
+ comp_options, input_files, 5, vstorage_.get(), mutable_cf_options_,
+ mutable_db_options_, 0));
+ ASSERT_TRUE(comp2);
+ ASSERT_EQ(Compaction::kInvalidLevel, comp2->GetPenultimateLevel());
+}
+
+TEST_P(PerKeyPlacementCompactionPickerTest,
+ LastLevelOnlyConflictWithOngoingUniversal) {
+  // This tests that a last-level-only compaction is still unable to output to
+  // the penultimate level if there's already an ongoing compaction to the
+  // penultimate level.
+ if (enable_per_key_placement_) {
+ ioptions_.preclude_last_level_data_seconds = 10000;
+ }
+
+ int num_levels = ioptions_.num_levels;
+ ioptions_.compaction_style = kCompactionStyleUniversal;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+ NewVersionStorage(num_levels, kCompactionStyleUniversal);
+
+ // L4: [200, 220] [230, 250] [360, 380]
+ // L5:
+ // L6: [101, 351]
+ Add(4, 40U, "200", "220", 60000000U);
+ Add(4, 41U, "230", "250", 60000000U);
+ Add(4, 42U, "360", "380", 60000000U);
+ Add(6, 60U, "101", "351", 60000000U);
+ UpdateVersionStorageInfo();
+
+ // create an ongoing compaction to L5 (penultimate level)
+ CompactionOptions comp_options;
+ std::unordered_set<uint64_t> input_set;
+ input_set.insert(40);
+ std::vector<CompactionInputFiles> input_files;
+ ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers(
+ &input_files, &input_set, vstorage_.get(), comp_options));
+
+ std::unique_ptr<Compaction> comp1(universal_compaction_picker.CompactFiles(
+ comp_options, input_files, 5, vstorage_.get(), mutable_cf_options_,
+ mutable_db_options_, 0));
+
+ ASSERT_TRUE(comp1);
+ ASSERT_EQ(comp1->GetPenultimateLevel(), Compaction::kInvalidLevel);
+
+ input_set.clear();
+ input_files.clear();
+ input_set.insert(60);
+ ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers(
+ &input_files, &input_set, vstorage_.get(), comp_options));
+
+ ASSERT_EQ(enable_per_key_placement_,
+ universal_compaction_picker.FilesRangeOverlapWithCompaction(
+ input_files, 6,
+ Compaction::EvaluatePenultimateLevel(vstorage_.get(), ioptions_,
+ 6, 6)));
+
+ if (!enable_per_key_placement_) {
+ std::unique_ptr<Compaction> comp2(universal_compaction_picker.CompactFiles(
+ comp_options, input_files, 6, vstorage_.get(), mutable_cf_options_,
+ mutable_db_options_, 0));
+ ASSERT_TRUE(comp2);
+ ASSERT_EQ(Compaction::kInvalidLevel, comp2->GetPenultimateLevel());
+ }
+}
+
+TEST_P(PerKeyPlacementCompactionPickerTest,
+ LastLevelOnlyNoConflictWithOngoingUniversal) {
+  // This is similar to `LastLevelOnlyConflictWithOngoingUniversal`; the only
+  // change is that the ongoing compaction to L5 has no overlap with the last
+  // level compaction, so it's safe to move data from the last level to the
+  // penultimate level.
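+  // The ongoing L5 compaction only covers [360, 380], which does not
+  // intersect the last-level file's range [101, 351], so moving data up to
+  // the penultimate level remains safe.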
+ if (enable_per_key_placement_) {
+ ioptions_.preclude_last_level_data_seconds = 10000;
+ }
+
+ int num_levels = ioptions_.num_levels;
+ ioptions_.compaction_style = kCompactionStyleUniversal;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+ NewVersionStorage(num_levels, kCompactionStyleUniversal);
+
+ // L4: [200, 220] [230, 250] [360, 380]
+ // L5:
+ // L6: [101, 351]
+ Add(4, 40U, "200", "220", 60000000U);
+ Add(4, 41U, "230", "250", 60000000U);
+ Add(4, 42U, "360", "380", 60000000U);
+ Add(6, 60U, "101", "351", 60000000U);
+ UpdateVersionStorageInfo();
+
+ // create an ongoing compaction to L5 (penultimate level)
+ CompactionOptions comp_options;
+ std::unordered_set<uint64_t> input_set;
+ input_set.insert(42);
+ std::vector<CompactionInputFiles> input_files;
+ ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers(
+ &input_files, &input_set, vstorage_.get(), comp_options));
+
+ std::unique_ptr<Compaction> comp1(universal_compaction_picker.CompactFiles(
+ comp_options, input_files, 5, vstorage_.get(), mutable_cf_options_,
+ mutable_db_options_, 0));
+
+ ASSERT_TRUE(comp1);
+ ASSERT_EQ(comp1->GetPenultimateLevel(), Compaction::kInvalidLevel);
+
+ input_set.clear();
+ input_files.clear();
+ input_set.insert(60);
+ ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers(
+ &input_files, &input_set, vstorage_.get(), comp_options));
+
+ // always safe to move data up
+ ASSERT_FALSE(universal_compaction_picker.FilesRangeOverlapWithCompaction(
+ input_files, 6,
+ Compaction::EvaluatePenultimateLevel(vstorage_.get(), ioptions_, 6, 6)));
+
+ // 2 compactions can be run in parallel
+ std::unique_ptr<Compaction> comp2(universal_compaction_picker.CompactFiles(
+ comp_options, input_files, 6, vstorage_.get(), mutable_cf_options_,
+ mutable_db_options_, 0));
+ ASSERT_TRUE(comp2);
+ if (enable_per_key_placement_) {
+ ASSERT_NE(Compaction::kInvalidLevel, comp2->GetPenultimateLevel());
+ } else {
+ ASSERT_EQ(Compaction::kInvalidLevel, comp2->GetPenultimateLevel());
+ }
+}
+
+INSTANTIATE_TEST_CASE_P(PerKeyPlacementCompactionPickerTest,
+ PerKeyPlacementCompactionPickerTest, ::testing::Bool());
+
+#endif // ROCKSDB_LITE
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/compaction/compaction_picker_universal.cc b/src/rocksdb/db/compaction/compaction_picker_universal.cc
new file mode 100644
index 000000000..376e4f60f
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_picker_universal.cc
@@ -0,0 +1,1450 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/compaction/compaction_picker_universal.h"
+#ifndef ROCKSDB_LITE
+
+#include <cinttypes>
+#include <limits>
+#include <queue>
+#include <string>
+#include <utility>
+
+#include "db/column_family.h"
+#include "file/filename.h"
+#include "logging/log_buffer.h"
+#include "logging/logging.h"
+#include "monitoring/statistics.h"
+#include "test_util/sync_point.h"
+#include "util/random.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace {
+// A helper class that forms universal compactions. The class is used by
+// UniversalCompactionPicker::PickCompaction().
+// Usage: construct the class, then obtain the compaction object by calling
+// PickCompaction().
+class UniversalCompactionBuilder {
+ public:
+ UniversalCompactionBuilder(
+ const ImmutableOptions& ioptions, const InternalKeyComparator* icmp,
+ const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+ const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
+ UniversalCompactionPicker* picker, LogBuffer* log_buffer)
+ : ioptions_(ioptions),
+ icmp_(icmp),
+ cf_name_(cf_name),
+ mutable_cf_options_(mutable_cf_options),
+ mutable_db_options_(mutable_db_options),
+ vstorage_(vstorage),
+ picker_(picker),
+ log_buffer_(log_buffer) {}
+
+ // Form and return the compaction object. The caller owns return object.
+ Compaction* PickCompaction();
+
+ private:
+ struct SortedRun {
+ SortedRun(int _level, FileMetaData* _file, uint64_t _size,
+ uint64_t _compensated_file_size, bool _being_compacted)
+ : level(_level),
+ file(_file),
+ size(_size),
+ compensated_file_size(_compensated_file_size),
+ being_compacted(_being_compacted) {
+ assert(compensated_file_size > 0);
+ assert(level != 0 || file != nullptr);
+ }
+
+ void Dump(char* out_buf, size_t out_buf_size,
+ bool print_path = false) const;
+
+ // sorted_run_count is added into the string to print
+ void DumpSizeInfo(char* out_buf, size_t out_buf_size,
+ size_t sorted_run_count) const;
+
+ int level;
+    // `file` will be null for level > 0. For level = 0, the sorted run is
+    // this single file.
+    FileMetaData* file;
+    // For level > 0, `size` and `compensated_file_size` are the sums of the
+    // sizes of all files in the level. `being_compacted` should be the same
+    // for all files in a non-zero level; use the value here.
+ uint64_t size;
+ uint64_t compensated_file_size;
+ bool being_compacted;
+ };
+
+ // Pick Universal compaction to limit read amplification
+ Compaction* PickCompactionToReduceSortedRuns(
+ unsigned int ratio, unsigned int max_number_of_files_to_compact);
+
+ // Pick Universal compaction to limit space amplification.
+ Compaction* PickCompactionToReduceSizeAmp();
+
+ // Try to pick incremental compaction to reduce space amplification.
+ // It will return null if it cannot find a fanout within the threshold.
+ // Fanout is defined as
+ // total size of files to compact at output level
+ // --------------------------------------------------
+ // total size of files to compact at other levels
+ Compaction* PickIncrementalForReduceSizeAmp(double fanout_threshold);
+
+ Compaction* PickDeleteTriggeredCompaction();
+
+ // Form a compaction from the sorted run indicated by start_index to the
+ // oldest sorted run.
+ // The caller is responsible for making sure that those files are not in
+ // compaction.
+ Compaction* PickCompactionToOldest(size_t start_index,
+ CompactionReason compaction_reason);
+
+ Compaction* PickCompactionWithSortedRunRange(
+ size_t start_index, size_t end_index, CompactionReason compaction_reason);
+
+ // Try to pick periodic compaction. The caller should only call it
+ // if there is at least one file marked for periodic compaction.
+ // null will be returned if no such a compaction can be formed
+ // because some files are being compacted.
+ Compaction* PickPeriodicCompaction();
+
+  // Used in universal compaction when the allow_trivial_move option is set.
+  // Checks whether there are any overlapping files in the input. Returns
+  // true if the input files are non-overlapping.
+ bool IsInputFilesNonOverlapping(Compaction* c);
+
+ uint64_t GetMaxOverlappingBytes() const;
+
+ const ImmutableOptions& ioptions_;
+ const InternalKeyComparator* icmp_;
+ double score_;
+ std::vector<SortedRun> sorted_runs_;
+ const std::string& cf_name_;
+ const MutableCFOptions& mutable_cf_options_;
+ const MutableDBOptions& mutable_db_options_;
+ VersionStorageInfo* vstorage_;
+ UniversalCompactionPicker* picker_;
+ LogBuffer* log_buffer_;
+
+ static std::vector<SortedRun> CalculateSortedRuns(
+ const VersionStorageInfo& vstorage);
+
+ // Pick a path ID to place a newly generated file, with its estimated file
+ // size.
+ static uint32_t GetPathId(const ImmutableCFOptions& ioptions,
+ const MutableCFOptions& mutable_cf_options,
+ uint64_t file_size);
+};
+
+// Used in universal compaction when trivial move is enabled.
+// This structure is used for the construction of the min heap
+// that contains the file metadata, the level of the file,
+// and the index of the file in that level.
+
+struct InputFileInfo {
+ InputFileInfo() : f(nullptr), level(0), index(0) {}
+
+ FileMetaData* f;
+ size_t level;
+ size_t index;
+};
+
+// Used in universal compaction when trivial move is enabled.
+// This comparator is used for the construction of min heap
+// based on the smallest key of the file.
+struct SmallestKeyHeapComparator {
+ explicit SmallestKeyHeapComparator(const Comparator* ucmp) { ucmp_ = ucmp; }
+
+ bool operator()(InputFileInfo i1, InputFileInfo i2) const {
+ return (ucmp_->CompareWithoutTimestamp(i1.f->smallest.user_key(),
+ i2.f->smallest.user_key()) > 0);
+ }
+
+ private:
+ const Comparator* ucmp_;
+};
+
+using SmallestKeyHeap =
+ std::priority_queue<InputFileInfo, std::vector<InputFileInfo>,
+ SmallestKeyHeapComparator>;
+
+// This function creates the heap that is used to determine whether the files
+// are overlapping during universal compaction when the allow_trivial_move
+// option is set.
+SmallestKeyHeap create_level_heap(Compaction* c, const Comparator* ucmp) {
+ SmallestKeyHeap smallest_key_priority_q =
+ SmallestKeyHeap(SmallestKeyHeapComparator(ucmp));
+
+ InputFileInfo input_file;
+
+ for (size_t l = 0; l < c->num_input_levels(); l++) {
+ if (c->num_input_files(l) != 0) {
+ if (l == 0 && c->start_level() == 0) {
+ for (size_t i = 0; i < c->num_input_files(0); i++) {
+ input_file.f = c->input(0, i);
+ input_file.level = 0;
+ input_file.index = i;
+ smallest_key_priority_q.push(std::move(input_file));
+ }
+ } else {
+ input_file.f = c->input(l, 0);
+ input_file.level = l;
+ input_file.index = 0;
+ smallest_key_priority_q.push(std::move(input_file));
+ }
+ }
+ }
+ return smallest_key_priority_q;
+}
+
+#ifndef NDEBUG
+// smallest_seqno and largest_seqno are set iff `files` is not empty.
+void GetSmallestLargestSeqno(const std::vector<FileMetaData*>& files,
+ SequenceNumber* smallest_seqno,
+ SequenceNumber* largest_seqno) {
+ bool is_first = true;
+ for (FileMetaData* f : files) {
+ assert(f->fd.smallest_seqno <= f->fd.largest_seqno);
+ if (is_first) {
+ is_first = false;
+ *smallest_seqno = f->fd.smallest_seqno;
+ *largest_seqno = f->fd.largest_seqno;
+ } else {
+ if (f->fd.smallest_seqno < *smallest_seqno) {
+ *smallest_seqno = f->fd.smallest_seqno;
+ }
+ if (f->fd.largest_seqno > *largest_seqno) {
+ *largest_seqno = f->fd.largest_seqno;
+ }
+ }
+ }
+}
+#endif
+} // namespace
+
+// Algorithm that checks whether there are any overlapping files in the input.
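+// Files are popped from a min-heap ordered by smallest user key; each popped
+// file's smallest key is compared against the previously popped file's
+// largest key, and any case where prev.largest >= curr.smallest is reported
+// as an overlap.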
+bool UniversalCompactionBuilder::IsInputFilesNonOverlapping(Compaction* c) {
+ auto comparator = icmp_->user_comparator();
+ int first_iter = 1;
+
+ InputFileInfo prev, curr, next;
+
+ SmallestKeyHeap smallest_key_priority_q =
+ create_level_heap(c, icmp_->user_comparator());
+
+ while (!smallest_key_priority_q.empty()) {
+ curr = smallest_key_priority_q.top();
+ smallest_key_priority_q.pop();
+
+ if (first_iter) {
+ prev = curr;
+ first_iter = 0;
+ } else {
+ if (comparator->CompareWithoutTimestamp(
+ prev.f->largest.user_key(), curr.f->smallest.user_key()) >= 0) {
+ // found overlapping files, return false
+ return false;
+ }
+ assert(comparator->CompareWithoutTimestamp(
+ curr.f->largest.user_key(), prev.f->largest.user_key()) > 0);
+ prev = curr;
+ }
+
+ next.f = nullptr;
+
+ if (c->level(curr.level) != 0 &&
+ curr.index < c->num_input_files(curr.level) - 1) {
+ next.f = c->input(curr.level, curr.index + 1);
+ next.level = curr.level;
+ next.index = curr.index + 1;
+ }
+
+ if (next.f) {
+ smallest_key_priority_q.push(std::move(next));
+ }
+ }
+ return true;
+}
+
+bool UniversalCompactionPicker::NeedsCompaction(
+ const VersionStorageInfo* vstorage) const {
+ const int kLevel0 = 0;
+ if (vstorage->CompactionScore(kLevel0) >= 1) {
+ return true;
+ }
+ if (!vstorage->FilesMarkedForPeriodicCompaction().empty()) {
+ return true;
+ }
+ if (!vstorage->FilesMarkedForCompaction().empty()) {
+ return true;
+ }
+ return false;
+}
+
+Compaction* UniversalCompactionPicker::PickCompaction(
+ const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+ const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
+ LogBuffer* log_buffer, SequenceNumber /* earliest_memtable_seqno */) {
+ UniversalCompactionBuilder builder(ioptions_, icmp_, cf_name,
+ mutable_cf_options, mutable_db_options,
+ vstorage, this, log_buffer);
+ return builder.PickCompaction();
+}
+
+void UniversalCompactionBuilder::SortedRun::Dump(char* out_buf,
+ size_t out_buf_size,
+ bool print_path) const {
+ if (level == 0) {
+ assert(file != nullptr);
+ if (file->fd.GetPathId() == 0 || !print_path) {
+ snprintf(out_buf, out_buf_size, "file %" PRIu64, file->fd.GetNumber());
+ } else {
+ snprintf(out_buf, out_buf_size,
+ "file %" PRIu64
+ "(path "
+ "%" PRIu32 ")",
+ file->fd.GetNumber(), file->fd.GetPathId());
+ }
+ } else {
+ snprintf(out_buf, out_buf_size, "level %d", level);
+ }
+}
+
+void UniversalCompactionBuilder::SortedRun::DumpSizeInfo(
+ char* out_buf, size_t out_buf_size, size_t sorted_run_count) const {
+ if (level == 0) {
+ assert(file != nullptr);
+ snprintf(out_buf, out_buf_size,
+ "file %" PRIu64 "[%" ROCKSDB_PRIszt
+ "] "
+ "with size %" PRIu64 " (compensated size %" PRIu64 ")",
+ file->fd.GetNumber(), sorted_run_count, file->fd.GetFileSize(),
+ file->compensated_file_size);
+ } else {
+ snprintf(out_buf, out_buf_size,
+ "level %d[%" ROCKSDB_PRIszt
+ "] "
+ "with size %" PRIu64 " (compensated size %" PRIu64 ")",
+ level, sorted_run_count, size, compensated_file_size);
+ }
+}
+
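+// Each L0 file forms its own sorted run; each non-empty level > 0 forms a
+// single sorted run whose `size` is the sum of its file sizes. For
+// illustration, two L0 files plus non-empty L4 and L6 produce four sorted
+// runs.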
+std::vector<UniversalCompactionBuilder::SortedRun>
+UniversalCompactionBuilder::CalculateSortedRuns(
+ const VersionStorageInfo& vstorage) {
+ std::vector<UniversalCompactionBuilder::SortedRun> ret;
+ for (FileMetaData* f : vstorage.LevelFiles(0)) {
+ ret.emplace_back(0, f, f->fd.GetFileSize(), f->compensated_file_size,
+ f->being_compacted);
+ }
+ for (int level = 1; level < vstorage.num_levels(); level++) {
+ uint64_t total_compensated_size = 0U;
+ uint64_t total_size = 0U;
+ bool being_compacted = false;
+ for (FileMetaData* f : vstorage.LevelFiles(level)) {
+ total_compensated_size += f->compensated_file_size;
+ total_size += f->fd.GetFileSize();
+      // Size amp, read amp and periodic compactions always include all files
+      // for a non-zero level. However, a delete-triggered compaction and
+      // a trivial move might pick a subset of files in a sorted run. So
+      // always check all files in a sorted run and mark the entire run as
+      // being compacted if one or more files are being compacted.
+ if (f->being_compacted) {
+ being_compacted = f->being_compacted;
+ }
+ }
+ if (total_compensated_size > 0) {
+ ret.emplace_back(level, nullptr, total_size, total_compensated_size,
+ being_compacted);
+ }
+ }
+ return ret;
+}
+
+// Universal style of compaction. Pick files that are contiguous in
+// time-range to compact.
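+// Candidates are considered in the following order, as implemented below:
+// periodic compaction, size amplification, size ratio, sorted-run count, and
+// finally delete-triggered compaction.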
+Compaction* UniversalCompactionBuilder::PickCompaction() {
+ const int kLevel0 = 0;
+ score_ = vstorage_->CompactionScore(kLevel0);
+ sorted_runs_ = CalculateSortedRuns(*vstorage_);
+
+ if (sorted_runs_.size() == 0 ||
+ (vstorage_->FilesMarkedForPeriodicCompaction().empty() &&
+ vstorage_->FilesMarkedForCompaction().empty() &&
+ sorted_runs_.size() < (unsigned int)mutable_cf_options_
+ .level0_file_num_compaction_trigger)) {
+ ROCKS_LOG_BUFFER(log_buffer_, "[%s] Universal: nothing to do\n",
+ cf_name_.c_str());
+ TEST_SYNC_POINT_CALLBACK(
+ "UniversalCompactionBuilder::PickCompaction:Return", nullptr);
+ return nullptr;
+ }
+ VersionStorageInfo::LevelSummaryStorage tmp;
+ ROCKS_LOG_BUFFER_MAX_SZ(
+ log_buffer_, 3072,
+ "[%s] Universal: sorted runs: %" ROCKSDB_PRIszt " files: %s\n",
+ cf_name_.c_str(), sorted_runs_.size(), vstorage_->LevelSummary(&tmp));
+
+ Compaction* c = nullptr;
+  // Periodic compaction has higher priority than other types of compaction
+ // because it's a hard requirement.
+ if (!vstorage_->FilesMarkedForPeriodicCompaction().empty()) {
+ // Always need to do a full compaction for periodic compaction.
+ c = PickPeriodicCompaction();
+ }
+
+ // Check for size amplification.
+ if (c == nullptr &&
+ sorted_runs_.size() >=
+ static_cast<size_t>(
+ mutable_cf_options_.level0_file_num_compaction_trigger)) {
+ if ((c = PickCompactionToReduceSizeAmp()) != nullptr) {
+ ROCKS_LOG_BUFFER(log_buffer_, "[%s] Universal: compacting for size amp\n",
+ cf_name_.c_str());
+ } else {
+ // Size amplification is within limits. Try reducing read
+ // amplification while maintaining file size ratios.
+ unsigned int ratio =
+ mutable_cf_options_.compaction_options_universal.size_ratio;
+
+ if ((c = PickCompactionToReduceSortedRuns(ratio, UINT_MAX)) != nullptr) {
+ ROCKS_LOG_BUFFER(log_buffer_,
+ "[%s] Universal: compacting for size ratio\n",
+ cf_name_.c_str());
+ } else {
+        // Size amplification and file size ratios are within configured limits.
+        // If max read amplification exceeds the configured limits, then force
+        // compaction without looking at file size ratios and try to reduce
+        // the number of files to fewer than level0_file_num_compaction_trigger.
+        // This is guaranteed by NeedsCompaction().
+ assert(sorted_runs_.size() >=
+ static_cast<size_t>(
+ mutable_cf_options_.level0_file_num_compaction_trigger));
+ // Get the total number of sorted runs that are not being compacted
+ int num_sr_not_compacted = 0;
+ for (size_t i = 0; i < sorted_runs_.size(); i++) {
+ if (sorted_runs_[i].being_compacted == false) {
+ num_sr_not_compacted++;
+ }
+ }
+
+ // The number of sorted runs that are not being compacted is greater
+ // than the maximum allowed number of sorted runs
+ if (num_sr_not_compacted >
+ mutable_cf_options_.level0_file_num_compaction_trigger) {
+ unsigned int num_files =
+ num_sr_not_compacted -
+ mutable_cf_options_.level0_file_num_compaction_trigger + 1;
+ if ((c = PickCompactionToReduceSortedRuns(UINT_MAX, num_files)) !=
+ nullptr) {
+ ROCKS_LOG_BUFFER(log_buffer_,
+ "[%s] Universal: compacting for file num -- %u\n",
+ cf_name_.c_str(), num_files);
+ }
+ }
+ }
+ }
+ }
+
+ if (c == nullptr) {
+ if ((c = PickDeleteTriggeredCompaction()) != nullptr) {
+ ROCKS_LOG_BUFFER(log_buffer_,
+ "[%s] Universal: delete triggered compaction\n",
+ cf_name_.c_str());
+ }
+ }
+
+ if (c == nullptr) {
+ TEST_SYNC_POINT_CALLBACK(
+ "UniversalCompactionBuilder::PickCompaction:Return", nullptr);
+ return nullptr;
+ }
+
+ if (mutable_cf_options_.compaction_options_universal.allow_trivial_move ==
+ true &&
+ c->compaction_reason() != CompactionReason::kPeriodicCompaction) {
+ c->set_is_trivial_move(IsInputFilesNonOverlapping(c));
+ }
+
+// validate that all the chosen files of L0 are non-overlapping in time
+#ifndef NDEBUG
+ bool is_first = true;
+
+ size_t level_index = 0U;
+ if (c->start_level() == 0) {
+ for (auto f : *c->inputs(0)) {
+ assert(f->fd.smallest_seqno <= f->fd.largest_seqno);
+ if (is_first) {
+ is_first = false;
+ }
+ }
+ level_index = 1U;
+ }
+ for (; level_index < c->num_input_levels(); level_index++) {
+ if (c->num_input_files(level_index) != 0) {
+ SequenceNumber smallest_seqno = 0U;
+ SequenceNumber largest_seqno = 0U;
+ GetSmallestLargestSeqno(*(c->inputs(level_index)), &smallest_seqno,
+ &largest_seqno);
+ if (is_first) {
+ is_first = false;
+ }
+ }
+ }
+#endif
+ // update statistics
+ size_t num_files = 0;
+ for (auto& each_level : *c->inputs()) {
+ num_files += each_level.files.size();
+ }
+ RecordInHistogram(ioptions_.stats, NUM_FILES_IN_SINGLE_COMPACTION, num_files);
+
+ picker_->RegisterCompaction(c);
+ vstorage_->ComputeCompactionScore(ioptions_, mutable_cf_options_);
+
+ TEST_SYNC_POINT_CALLBACK("UniversalCompactionBuilder::PickCompaction:Return",
+ c);
+ return c;
+}
+
+uint32_t UniversalCompactionBuilder::GetPathId(
+ const ImmutableCFOptions& ioptions,
+ const MutableCFOptions& mutable_cf_options, uint64_t file_size) {
+ // Two conditions need to be satisfied:
+ // (1) the target path needs to be able to hold the file's size
+  // (2) the total size left in this and previous paths needs to be no
+  //     smaller than the expected future file size before this new file is
+  //     compacted, which is estimated based on size_ratio.
+ // For example, if now we are compacting files of size (1, 1, 2, 4, 8),
+ // we will make sure the target file, probably with size of 16, will be
+ // placed in a path so that eventually when new files are generated and
+ // compacted to (1, 1, 2, 4, 8, 16), all those files can be stored in or
+ // before the path we chose.
+ //
+ // TODO(sdong): now the case of multiple column families is not
+ // considered in this algorithm. So the target size can be violated in
+ // that case. We need to improve it.
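+  // For illustration (hypothetical sizes): with size_ratio = 0, a 16 GB
+  // output has future_size = 16 GB. A first path with target_size 40 GB is
+  // chosen (24 GB of slack > 16 GB of expected future data), while a 20 GB
+  // first path is skipped (only 4 GB of slack) and the next path is tried.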
+ uint64_t accumulated_size = 0;
+ uint64_t future_size =
+ file_size *
+ (100 - mutable_cf_options.compaction_options_universal.size_ratio) / 100;
+ uint32_t p = 0;
+ assert(!ioptions.cf_paths.empty());
+ for (; p < ioptions.cf_paths.size() - 1; p++) {
+ uint64_t target_size = ioptions.cf_paths[p].target_size;
+ if (target_size > file_size &&
+ accumulated_size + (target_size - file_size) > future_size) {
+ return p;
+ }
+ accumulated_size += target_size;
+ }
+ return p;
+}
+
+//
+// Consider compaction files based on their size differences with
+// the next file in time order.
+//
+Compaction* UniversalCompactionBuilder::PickCompactionToReduceSortedRuns(
+ unsigned int ratio, unsigned int max_number_of_files_to_compact) {
+ unsigned int min_merge_width =
+ mutable_cf_options_.compaction_options_universal.min_merge_width;
+ unsigned int max_merge_width =
+ mutable_cf_options_.compaction_options_universal.max_merge_width;
+
+ const SortedRun* sr = nullptr;
+ bool done = false;
+ size_t start_index = 0;
+ unsigned int candidate_count = 0;
+
+ unsigned int max_files_to_compact =
+ std::min(max_merge_width, max_number_of_files_to_compact);
+ min_merge_width = std::max(min_merge_width, 2U);
+
+  // The caller checks the size before executing this function. This invariant
+  // is important because otherwise we may hit an integer underflow when
+  // dealing with unsigned types.
+ assert(sorted_runs_.size() > 0);
+
+ // Considers a candidate file only if it is smaller than the
+ // total size accumulated so far.
+ for (size_t loop = 0; loop < sorted_runs_.size(); loop++) {
+ candidate_count = 0;
+
+ // Skip files that are already being compacted
+ for (sr = nullptr; loop < sorted_runs_.size(); loop++) {
+ sr = &sorted_runs_[loop];
+
+ if (!sr->being_compacted) {
+ candidate_count = 1;
+ break;
+ }
+ char file_num_buf[kFormatFileNumberBufSize];
+ sr->Dump(file_num_buf, sizeof(file_num_buf));
+ ROCKS_LOG_BUFFER(log_buffer_,
+ "[%s] Universal: %s"
+ "[%d] being compacted, skipping",
+ cf_name_.c_str(), file_num_buf, loop);
+
+ sr = nullptr;
+ }
+
+ // This file is not being compacted. Consider it as the
+ // first candidate to be compacted.
+ uint64_t candidate_size = sr != nullptr ? sr->compensated_file_size : 0;
+ if (sr != nullptr) {
+ char file_num_buf[kFormatFileNumberBufSize];
+ sr->Dump(file_num_buf, sizeof(file_num_buf), true);
+ ROCKS_LOG_BUFFER(log_buffer_,
+ "[%s] Universal: Possible candidate %s[%d].",
+ cf_name_.c_str(), file_num_buf, loop);
+ }
+
+ // Check if the succeeding files need compaction.
+ for (size_t i = loop + 1;
+ candidate_count < max_files_to_compact && i < sorted_runs_.size();
+ i++) {
+ const SortedRun* succeeding_sr = &sorted_runs_[i];
+ if (succeeding_sr->being_compacted) {
+ break;
+ }
+ // Pick files if the total/last candidate file size (increased by the
+ // specified ratio) is still larger than the next candidate file.
+ // candidate_size is the total size of files picked so far with the
+ // default kCompactionStopStyleTotalSize; with
+ // kCompactionStopStyleSimilarSize, it's simply the size of the last
+ // picked file.
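+      // For illustration, with the total-size stop style and ratio = 1 (1%):
+      // runs of size 8, 8, 8 are all picked (8 * 1.01 >= 8 and 16 * 1.01 >= 8),
+      // while a succeeding run of size 100 stops the scan, since
+      // 24 * 1.01 < 100.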
+ double sz = candidate_size * (100.0 + ratio) / 100.0;
+ if (sz < static_cast<double>(succeeding_sr->size)) {
+ break;
+ }
+ if (mutable_cf_options_.compaction_options_universal.stop_style ==
+ kCompactionStopStyleSimilarSize) {
+ // Similar-size stopping rule: also check the last picked file isn't
+ // far larger than the next candidate file.
+ sz = (succeeding_sr->size * (100.0 + ratio)) / 100.0;
+ if (sz < static_cast<double>(candidate_size)) {
+ // If the small file we've encountered begins a run of similar-size
+ // files, we'll pick them up on a future iteration of the outer
+ // loop. If it's some lonely straggler, it'll eventually get picked
+ // by the last-resort read amp strategy which disregards size ratios.
+ break;
+ }
+ candidate_size = succeeding_sr->compensated_file_size;
+ } else { // default kCompactionStopStyleTotalSize
+ candidate_size += succeeding_sr->compensated_file_size;
+ }
+ candidate_count++;
+ }
+
+ // Found a series of consecutive files that need compaction.
+ if (candidate_count >= (unsigned int)min_merge_width) {
+ start_index = loop;
+ done = true;
+ break;
+ } else {
+ for (size_t i = loop;
+ i < loop + candidate_count && i < sorted_runs_.size(); i++) {
+ const SortedRun* skipping_sr = &sorted_runs_[i];
+ char file_num_buf[256];
+ skipping_sr->DumpSizeInfo(file_num_buf, sizeof(file_num_buf), loop);
+ ROCKS_LOG_BUFFER(log_buffer_, "[%s] Universal: Skipping %s",
+ cf_name_.c_str(), file_num_buf);
+ }
+ }
+ }
+ if (!done || candidate_count <= 1) {
+ return nullptr;
+ }
+ size_t first_index_after = start_index + candidate_count;
+  // Compression for the output is disabled if the older files (those not
+  // being compacted) already make up compression_size_percent of the total
+  // data size.
+ bool enable_compression = true;
+ int ratio_to_compress =
+ mutable_cf_options_.compaction_options_universal.compression_size_percent;
+ if (ratio_to_compress >= 0) {
+ uint64_t total_size = 0;
+ for (auto& sorted_run : sorted_runs_) {
+ total_size += sorted_run.compensated_file_size;
+ }
+
+ uint64_t older_file_size = 0;
+ for (size_t i = sorted_runs_.size() - 1; i >= first_index_after; i--) {
+ older_file_size += sorted_runs_[i].size;
+ if (older_file_size * 100L >= total_size * (long)ratio_to_compress) {
+ enable_compression = false;
+ break;
+ }
+ }
+ }
+
+ uint64_t estimated_total_size = 0;
+ for (unsigned int i = 0; i < first_index_after; i++) {
+ estimated_total_size += sorted_runs_[i].size;
+ }
+ uint32_t path_id =
+ GetPathId(ioptions_, mutable_cf_options_, estimated_total_size);
+ int start_level = sorted_runs_[start_index].level;
+ int output_level;
+ if (first_index_after == sorted_runs_.size()) {
+ output_level = vstorage_->num_levels() - 1;
+ } else if (sorted_runs_[first_index_after].level == 0) {
+ output_level = 0;
+ } else {
+ output_level = sorted_runs_[first_index_after].level - 1;
+ }
+
+ // last level is reserved for the files ingested behind
+ if (ioptions_.allow_ingest_behind &&
+ (output_level == vstorage_->num_levels() - 1)) {
+ assert(output_level > 1);
+ output_level--;
+ }
+
+ std::vector<CompactionInputFiles> inputs(vstorage_->num_levels());
+ for (size_t i = 0; i < inputs.size(); ++i) {
+ inputs[i].level = start_level + static_cast<int>(i);
+ }
+ for (size_t i = start_index; i < first_index_after; i++) {
+ auto& picking_sr = sorted_runs_[i];
+ if (picking_sr.level == 0) {
+ FileMetaData* picking_file = picking_sr.file;
+ inputs[0].files.push_back(picking_file);
+ } else {
+ auto& files = inputs[picking_sr.level - start_level].files;
+ for (auto* f : vstorage_->LevelFiles(picking_sr.level)) {
+ files.push_back(f);
+ }
+ }
+ char file_num_buf[256];
+ picking_sr.DumpSizeInfo(file_num_buf, sizeof(file_num_buf), i);
+ ROCKS_LOG_BUFFER(log_buffer_, "[%s] Universal: Picking %s",
+ cf_name_.c_str(), file_num_buf);
+ }
+
+ std::vector<FileMetaData*> grandparents;
+ // Include grandparents for potential file cutting in incremental
+ // mode. It is for aligning file cutting boundaries across levels,
+ // so that subsequent compactions can pick files with aligned
+ // buffer.
+ // Single files are only picked up in incremental mode, so that
+ // there is no need for full range.
+ if (mutable_cf_options_.compaction_options_universal.incremental &&
+ first_index_after < sorted_runs_.size() &&
+ sorted_runs_[first_index_after].level > 1) {
+ grandparents = vstorage_->LevelFiles(sorted_runs_[first_index_after].level);
+ }
+
+ if (output_level != 0 &&
+ picker_->FilesRangeOverlapWithCompaction(
+ inputs, output_level,
+ Compaction::EvaluatePenultimateLevel(vstorage_, ioptions_,
+ start_level, output_level))) {
+ return nullptr;
+ }
+ CompactionReason compaction_reason;
+ if (max_number_of_files_to_compact == UINT_MAX) {
+ compaction_reason = CompactionReason::kUniversalSizeRatio;
+ } else {
+ compaction_reason = CompactionReason::kUniversalSortedRunNum;
+ }
+ return new Compaction(vstorage_, ioptions_, mutable_cf_options_,
+ mutable_db_options_, std::move(inputs), output_level,
+ MaxFileSizeForLevel(mutable_cf_options_, output_level,
+ kCompactionStyleUniversal),
+ GetMaxOverlappingBytes(), path_id,
+ GetCompressionType(vstorage_, mutable_cf_options_,
+ output_level, 1, enable_compression),
+ GetCompressionOptions(mutable_cf_options_, vstorage_,
+ output_level, enable_compression),
+ Temperature::kUnknown,
+ /* max_subcompactions */ 0, grandparents,
+ /* is manual */ false, /* trim_ts */ "", score_,
+ false /* deletion_compaction */,
+ /* l0_files_might_overlap */ true, compaction_reason);
+}
+
+// Look at overall size amplification. If size amplification
+// exceeds the configured value, then do a compaction
+// of the candidate files all the way up to the earliest
+// base file (overrides configured values of file-size ratios,
+// min_merge_width and max_merge_width).
+//
+Compaction* UniversalCompactionBuilder::PickCompactionToReduceSizeAmp() {
+ // percentage flexibility while reducing size amplification
+ uint64_t ratio = mutable_cf_options_.compaction_options_universal
+ .max_size_amplification_percent;
+
+ unsigned int candidate_count = 0;
+ uint64_t candidate_size = 0;
+ size_t start_index = 0;
+ const SortedRun* sr = nullptr;
+
+ assert(!sorted_runs_.empty());
+ if (sorted_runs_.back().being_compacted) {
+ return nullptr;
+ }
+
+ // Skip files that are already being compacted
+ for (size_t loop = 0; loop + 1 < sorted_runs_.size(); loop++) {
+ sr = &sorted_runs_[loop];
+ if (!sr->being_compacted) {
+ start_index = loop; // Consider this as the first candidate.
+ break;
+ }
+ char file_num_buf[kFormatFileNumberBufSize];
+ sr->Dump(file_num_buf, sizeof(file_num_buf), true);
+ ROCKS_LOG_BUFFER(log_buffer_,
+ "[%s] Universal: skipping %s[%d] compacted %s",
+ cf_name_.c_str(), file_num_buf, loop,
+ " cannot be a candidate to reduce size amp.\n");
+ sr = nullptr;
+ }
+
+ if (sr == nullptr) {
+ return nullptr; // no candidate files
+ }
+ {
+ char file_num_buf[kFormatFileNumberBufSize];
+ sr->Dump(file_num_buf, sizeof(file_num_buf), true);
+ ROCKS_LOG_BUFFER(
+ log_buffer_,
+ "[%s] Universal: First candidate %s[%" ROCKSDB_PRIszt "] %s",
+ cf_name_.c_str(), file_num_buf, start_index, " to reduce size amp.\n");
+ }
+
+ // size of the base sorted run for size amp calculation
+ uint64_t base_sr_size = sorted_runs_.back().size;
+ size_t sr_end_idx = sorted_runs_.size() - 1;
+  // If tiered compaction is enabled and the last sorted run is the last
+  // level, use the second-to-last sorted run as the base for the size-amp
+  // calculation.
+ if (ioptions_.preclude_last_level_data_seconds > 0 &&
+ ioptions_.num_levels > 2 &&
+ sorted_runs_.back().level == ioptions_.num_levels - 1 &&
+ sorted_runs_.size() > 1) {
+ sr_end_idx = sorted_runs_.size() - 2;
+ base_sr_size = sorted_runs_[sr_end_idx].size;
+ }
+
+ // keep adding up all the remaining files
+ for (size_t loop = start_index; loop < sr_end_idx; loop++) {
+ sr = &sorted_runs_[loop];
+ if (sr->being_compacted) {
+      // TODO: once incremental compaction is supported, we might want to
+      // schedule some incremental compactions in parallel if needed.
+ char file_num_buf[kFormatFileNumberBufSize];
+ sr->Dump(file_num_buf, sizeof(file_num_buf), true);
+ ROCKS_LOG_BUFFER(
+ log_buffer_, "[%s] Universal: Possible candidate %s[%d] %s",
+ cf_name_.c_str(), file_num_buf, start_index,
+ " is already being compacted. No size amp reduction possible.\n");
+ return nullptr;
+ }
+ candidate_size += sr->compensated_file_size;
+ candidate_count++;
+ }
+ if (candidate_count == 0) {
+ return nullptr;
+ }
+
+ // size amplification = percentage of additional size
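+  // For illustration: with max_size_amplification_percent = 200 and
+  // base_sr_size = 100, the compaction is skipped while candidate_size < 200,
+  // i.e. it kicks in once the newer sorted runs total at least twice the base
+  // sorted run.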
+ if (candidate_size * 100 < ratio * base_sr_size) {
+ ROCKS_LOG_BUFFER(
+ log_buffer_,
+ "[%s] Universal: size amp not needed. newer-files-total-size %" PRIu64
+ " earliest-file-size %" PRIu64,
+ cf_name_.c_str(), candidate_size, base_sr_size);
+ return nullptr;
+ } else {
+ ROCKS_LOG_BUFFER(
+ log_buffer_,
+ "[%s] Universal: size amp needed. newer-files-total-size %" PRIu64
+ " earliest-file-size %" PRIu64,
+ cf_name_.c_str(), candidate_size, base_sr_size);
+ }
+  // Since incremental compaction can't include more than the second-to-last
+  // level, it can introduce a penalty compared to full compaction. We
+  // hard-code the penalty to be 80%: if the incremental compaction would end
+  // up with a fanout more than 80% higher than that of a full-level
+  // compaction, we fall back to full-level compaction.
+  // The 80% threshold is arbitrary and can be adjusted or made
+  // configurable in the future.
+  // This also prevents the case where compaction falls behind and we
+  // need to compact more levels for compactions to catch up.
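+  // For illustration: with base_sr_size = 100 and candidate_size = 50, a full
+  // compaction has a fanout of 2.0, so the incremental picker below only
+  // accepts a window whose fanout (bottom_size / non_bottom_size) stays below
+  // 3.6.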
+ if (mutable_cf_options_.compaction_options_universal.incremental) {
+ double fanout_threshold = static_cast<double>(base_sr_size) /
+ static_cast<double>(candidate_size) * 1.8;
+ Compaction* picked = PickIncrementalForReduceSizeAmp(fanout_threshold);
+ if (picked != nullptr) {
+      // As the feature is still incremental, picking an incremental
+      // compaction might fail and we will fall back to compacting the full
+      // level.
+ return picked;
+ }
+ }
+ return PickCompactionWithSortedRunRange(
+ start_index, sr_end_idx, CompactionReason::kUniversalSizeAmplification);
+}
+
+Compaction* UniversalCompactionBuilder::PickIncrementalForReduceSizeAmp(
+ double fanout_threshold) {
+  // Try to find all potential compactions with total size just over
+  // options.max_compaction_size / 2, and take the one with the lowest
+  // fanout (defined in the declaration of the function).
+  // This is done by keeping a sliding window of the files at the second
+  // lowest level and expanding it while collecting the overlapping files in
+  // the last level. Once the total size exceeds the size threshold, the
+  // fanout value is calculated, and the window is then shrunk from its small
+  // side. Keep doing this until the end.
+  // Finally, we try to include upper-level files if they fall into
+  // the range.
+  //
+  // Note that this is a similar problem to leveled compaction's
+  // kMinOverlappingRatio priority, but instead of picking single files
+  // we expand to a target compaction size. The reason is that in
+  // leveled compaction, the actual fanout value tends to be high, e.g. 10,
+  // so even with a single file in the down-merging level, the extra size
+  // compacted in boundary files is at a lower ratio. But here users often
+  // size the second-to-last level to be 1/4, 1/3 or even 1/2 of the
+  // bottommost level, so picking a single file in the second-to-last level
+  // would cause significant waste, which is not desirable.
+  //
+  // This algorithm has lots of room for improvement to pick more efficient
+  // compactions.
+ assert(sorted_runs_.size() >= 2);
+ int second_last_level = sorted_runs_[sorted_runs_.size() - 2].level;
+ if (second_last_level == 0) {
+ // Can't split Level 0.
+ return nullptr;
+ }
+ int output_level = sorted_runs_.back().level;
+ const std::vector<FileMetaData*>& bottom_files =
+ vstorage_->LevelFiles(output_level);
+ const std::vector<FileMetaData*>& files =
+ vstorage_->LevelFiles(second_last_level);
+ assert(!bottom_files.empty());
+ assert(!files.empty());
+
+ // std::unordered_map<uint64_t, uint64_t> file_to_order;
+
+ int picked_start_idx = 0;
+ int picked_end_idx = 0;
+ double picked_fanout = fanout_threshold;
+
+  // Use half of the target compaction bytes as the anchor to stop growing the
+  // second-to-last-level files, and reserve growing space for more overlapping
+  // bottom-level files, clean cuts, files from other levels, etc.
+ uint64_t comp_thres_size = mutable_cf_options_.max_compaction_bytes / 2;
+ int start_idx = 0;
+ int bottom_end_idx = 0;
+ int bottom_start_idx = 0;
+ uint64_t non_bottom_size = 0;
+ uint64_t bottom_size = 0;
+ bool end_bottom_size_counted = false;
+ for (int end_idx = 0; end_idx < static_cast<int>(files.size()); end_idx++) {
+ FileMetaData* end_file = files[end_idx];
+
+    // Include bottommost-level files smaller than the current
+    // second-to-last-level file.
+ int num_skipped = 0;
+ while (bottom_end_idx < static_cast<int>(bottom_files.size()) &&
+ icmp_->Compare(bottom_files[bottom_end_idx]->largest,
+ end_file->smallest) < 0) {
+ if (!end_bottom_size_counted) {
+ bottom_size += bottom_files[bottom_end_idx]->fd.file_size;
+ }
+ bottom_end_idx++;
+ end_bottom_size_counted = false;
+ num_skipped++;
+ }
+
+ if (num_skipped > 1) {
+      // At least one file in the bottommost level falls into the file gap. No
+      // reason to include the file. We cut the range and start a new sliding
+      // window.
+ start_idx = end_idx;
+ }
+
+ if (start_idx == end_idx) {
+ // new sliding window.
+ non_bottom_size = 0;
+ bottom_size = 0;
+ bottom_start_idx = bottom_end_idx;
+ end_bottom_size_counted = false;
+ }
+
+ non_bottom_size += end_file->fd.file_size;
+
+ // Include all overlapping files in bottom level.
+ while (bottom_end_idx < static_cast<int>(bottom_files.size()) &&
+ icmp_->Compare(bottom_files[bottom_end_idx]->smallest,
+ end_file->largest) < 0) {
+ if (!end_bottom_size_counted) {
+ bottom_size += bottom_files[bottom_end_idx]->fd.file_size;
+ end_bottom_size_counted = true;
+ }
+ if (icmp_->Compare(bottom_files[bottom_end_idx]->largest,
+ end_file->largest) > 0) {
+        // The next-level file crosses the largest-key boundary of the current file.
+ break;
+ }
+ bottom_end_idx++;
+ end_bottom_size_counted = false;
+ }
+
+ if ((non_bottom_size + bottom_size > comp_thres_size ||
+ end_idx == static_cast<int>(files.size()) - 1) &&
+        non_bottom_size > 0) {  // Do we allow 0-size files at all?
+ // If it is a better compaction, remember it in picked* variables.
+ double fanout = static_cast<double>(bottom_size) /
+ static_cast<double>(non_bottom_size);
+ if (fanout < picked_fanout) {
+ picked_start_idx = start_idx;
+ picked_end_idx = end_idx;
+ picked_fanout = fanout;
+ }
+      // Shrink the window from its start side until the total is under
+      // comp_thres_size.
+ while (non_bottom_size + bottom_size > comp_thres_size &&
+ start_idx <= end_idx) {
+ non_bottom_size -= files[start_idx]->fd.file_size;
+ start_idx++;
+ if (start_idx < static_cast<int>(files.size())) {
+ while (bottom_start_idx <= bottom_end_idx &&
+ icmp_->Compare(bottom_files[bottom_start_idx]->largest,
+ files[start_idx]->smallest) < 0) {
+ bottom_size -= bottom_files[bottom_start_idx]->fd.file_size;
+ bottom_start_idx++;
+ }
+ }
+ }
+ }
+ }
+
+ if (picked_fanout >= fanout_threshold) {
+ assert(picked_fanout == fanout_threshold);
+ return nullptr;
+ }
+
+ std::vector<CompactionInputFiles> inputs;
+ CompactionInputFiles bottom_level_inputs;
+ CompactionInputFiles second_last_level_inputs;
+ second_last_level_inputs.level = second_last_level;
+ bottom_level_inputs.level = output_level;
+ for (int i = picked_start_idx; i <= picked_end_idx; i++) {
+ if (files[i]->being_compacted) {
+ return nullptr;
+ }
+ second_last_level_inputs.files.push_back(files[i]);
+ }
+ assert(!second_last_level_inputs.empty());
+ if (!picker_->ExpandInputsToCleanCut(cf_name_, vstorage_,
+ &second_last_level_inputs,
+ /*next_smallest=*/nullptr)) {
+ return nullptr;
+ }
+ // We might be able to avoid this binary search if we save and expand
+ // from bottom_start_idx and bottom_end_idx, but for now, we use
+ // SetupOtherInputs() for simplicity.
+ int parent_index = -1; // Create and use bottom_start_idx?
+ if (!picker_->SetupOtherInputs(cf_name_, mutable_cf_options_, vstorage_,
+ &second_last_level_inputs,
+ &bottom_level_inputs, &parent_index,
+ /*base_index=*/-1)) {
+ return nullptr;
+ }
+
+  // Try to include files in upper levels if they fall into the range.
+  // Since we need to go from the lower level up, which is the reverse of the
+  // level order, we first write to a reversed data structure and finally copy
+  // it to the compaction inputs.
+ InternalKey smallest, largest;
+ picker_->GetRange(second_last_level_inputs, &smallest, &largest);
+ std::vector<CompactionInputFiles> inputs_reverse;
+ for (auto it = ++(++sorted_runs_.rbegin()); it != sorted_runs_.rend(); it++) {
+ SortedRun& sr = *it;
+ if (sr.level == 0) {
+ break;
+ }
+ std::vector<FileMetaData*> level_inputs;
+ vstorage_->GetCleanInputsWithinInterval(sr.level, &smallest, &largest,
+ &level_inputs);
+ if (!level_inputs.empty()) {
+ inputs_reverse.push_back({});
+ inputs_reverse.back().level = sr.level;
+ inputs_reverse.back().files = level_inputs;
+ picker_->GetRange(inputs_reverse.back(), &smallest, &largest);
+ }
+ }
+ for (auto it = inputs_reverse.rbegin(); it != inputs_reverse.rend(); it++) {
+ inputs.push_back(*it);
+ }
+
+ inputs.push_back(second_last_level_inputs);
+ inputs.push_back(bottom_level_inputs);
+
+ int start_level = Compaction::kInvalidLevel;
+ for (const auto& in : inputs) {
+ if (!in.empty()) {
+ // inputs should already be sorted by level
+ start_level = in.level;
+ break;
+ }
+ }
+
+ // intra L0 compactions outputs could have overlap
+ if (output_level != 0 &&
+ picker_->FilesRangeOverlapWithCompaction(
+ inputs, output_level,
+ Compaction::EvaluatePenultimateLevel(vstorage_, ioptions_,
+ start_level, output_level))) {
+ return nullptr;
+ }
+
+ // TODO support multi paths?
+ uint32_t path_id = 0;
+ return new Compaction(
+ vstorage_, ioptions_, mutable_cf_options_, mutable_db_options_,
+ std::move(inputs), output_level,
+ MaxFileSizeForLevel(mutable_cf_options_, output_level,
+ kCompactionStyleUniversal),
+ GetMaxOverlappingBytes(), path_id,
+ GetCompressionType(vstorage_, mutable_cf_options_, output_level, 1,
+ true /* enable_compression */),
+ GetCompressionOptions(mutable_cf_options_, vstorage_, output_level,
+ true /* enable_compression */),
+ Temperature::kUnknown,
+ /* max_subcompactions */ 0, /* grandparents */ {}, /* is manual */ false,
+ /* trim_ts */ "", score_, false /* deletion_compaction */,
+ /* l0_files_might_overlap */ true,
+ CompactionReason::kUniversalSizeAmplification);
+}
+
+// Pick files marked for compaction. Typically, files are marked by
+// CompactOnDeleteCollector due to the presence of tombstones.
+Compaction* UniversalCompactionBuilder::PickDeleteTriggeredCompaction() {
+ CompactionInputFiles start_level_inputs;
+ int output_level;
+ std::vector<CompactionInputFiles> inputs;
+ std::vector<FileMetaData*> grandparents;
+
+ if (vstorage_->num_levels() == 1) {
+ // This is single level universal. Since we're basically trying to reclaim
+ // space by processing files marked for compaction due to high tombstone
+ // density, let's do the same thing as compaction to reduce size amp which
+ // has the same goals.
+ int start_index = -1;
+
+ start_level_inputs.level = 0;
+ start_level_inputs.files.clear();
+ output_level = 0;
+ // Find the first file marked for compaction. Ignore the last file
+ for (size_t loop = 0; loop + 1 < sorted_runs_.size(); loop++) {
+ SortedRun* sr = &sorted_runs_[loop];
+ if (sr->being_compacted) {
+ continue;
+ }
+ FileMetaData* f = vstorage_->LevelFiles(0)[loop];
+ if (f->marked_for_compaction) {
+ start_level_inputs.files.push_back(f);
+ start_index =
+ static_cast<int>(loop); // Consider this as the first candidate.
+ break;
+ }
+ }
+ if (start_index < 0) {
+ // Either no file marked, or they're already being compacted
+ return nullptr;
+ }
+
+ for (size_t loop = start_index + 1; loop < sorted_runs_.size(); loop++) {
+ SortedRun* sr = &sorted_runs_[loop];
+ if (sr->being_compacted) {
+ break;
+ }
+
+ FileMetaData* f = vstorage_->LevelFiles(0)[loop];
+ start_level_inputs.files.push_back(f);
+ }
+ if (start_level_inputs.size() <= 1) {
+ // If only the last file in L0 is marked for compaction, ignore it
+ return nullptr;
+ }
+ inputs.push_back(start_level_inputs);
+ } else {
+ int start_level;
+
+ // For multi-level universal, the strategy is to make this look more like
+ // leveled. We pick one of the files marked for compaction and compact with
+ // overlapping files in the adjacent level.
+ picker_->PickFilesMarkedForCompaction(cf_name_, vstorage_, &start_level,
+ &output_level, &start_level_inputs);
+ if (start_level_inputs.empty()) {
+ return nullptr;
+ }
+
+ // Pick the first non-empty level after the start_level
+ for (output_level = start_level + 1; output_level < vstorage_->num_levels();
+ output_level++) {
+ if (vstorage_->NumLevelFiles(output_level) != 0) {
+ break;
+ }
+ }
+
+ // If all higher levels are empty, pick the highest level as output level
+ if (output_level == vstorage_->num_levels()) {
+ if (start_level == 0) {
+ output_level = vstorage_->num_levels() - 1;
+ } else {
+ // If start level is non-zero and all higher levels are empty, this
+ // compaction will translate into a trivial move. Since the idea is
+ // to reclaim space and trivial move doesn't help with that, we
+ // skip compaction in this case and return nullptr
+ return nullptr;
+ }
+ }
+ if (ioptions_.allow_ingest_behind &&
+ output_level == vstorage_->num_levels() - 1) {
+ assert(output_level > 1);
+ output_level--;
+ }
+
+ if (output_level != 0) {
+ if (start_level == 0) {
+ if (!picker_->GetOverlappingL0Files(vstorage_, &start_level_inputs,
+ output_level, nullptr)) {
+ return nullptr;
+ }
+ }
+
+ CompactionInputFiles output_level_inputs;
+ int parent_index = -1;
+
+ output_level_inputs.level = output_level;
+ if (!picker_->SetupOtherInputs(cf_name_, mutable_cf_options_, vstorage_,
+ &start_level_inputs, &output_level_inputs,
+ &parent_index, -1)) {
+ return nullptr;
+ }
+ inputs.push_back(start_level_inputs);
+ if (!output_level_inputs.empty()) {
+ inputs.push_back(output_level_inputs);
+ }
+ if (picker_->FilesRangeOverlapWithCompaction(
+ inputs, output_level,
+ Compaction::EvaluatePenultimateLevel(
+ vstorage_, ioptions_, start_level, output_level))) {
+ return nullptr;
+ }
+
+ picker_->GetGrandparents(vstorage_, start_level_inputs,
+ output_level_inputs, &grandparents);
+ } else {
+ inputs.push_back(start_level_inputs);
+ }
+ }
+
+ uint64_t estimated_total_size = 0;
+ // Use size of the output level as estimated file size
+ for (FileMetaData* f : vstorage_->LevelFiles(output_level)) {
+ estimated_total_size += f->fd.GetFileSize();
+ }
+ uint32_t path_id =
+ GetPathId(ioptions_, mutable_cf_options_, estimated_total_size);
+ return new Compaction(
+ vstorage_, ioptions_, mutable_cf_options_, mutable_db_options_,
+ std::move(inputs), output_level,
+ MaxFileSizeForLevel(mutable_cf_options_, output_level,
+ kCompactionStyleUniversal),
+ /* max_grandparent_overlap_bytes */ GetMaxOverlappingBytes(), path_id,
+ GetCompressionType(vstorage_, mutable_cf_options_, output_level, 1),
+ GetCompressionOptions(mutable_cf_options_, vstorage_, output_level),
+ Temperature::kUnknown,
+ /* max_subcompactions */ 0, grandparents, /* is manual */ false,
+ /* trim_ts */ "", score_, false /* deletion_compaction */,
+ /* l0_files_might_overlap */ true,
+ CompactionReason::kFilesMarkedForCompaction);
+}
+
+Compaction* UniversalCompactionBuilder::PickCompactionToOldest(
+ size_t start_index, CompactionReason compaction_reason) {
+ return PickCompactionWithSortedRunRange(start_index, sorted_runs_.size() - 1,
+ compaction_reason);
+}
+
+Compaction* UniversalCompactionBuilder::PickCompactionWithSortedRunRange(
+ size_t start_index, size_t end_index, CompactionReason compaction_reason) {
+ assert(start_index < sorted_runs_.size());
+
+ // Estimate total file size
+ uint64_t estimated_total_size = 0;
+ for (size_t loop = start_index; loop <= end_index; loop++) {
+ estimated_total_size += sorted_runs_[loop].size;
+ }
+ uint32_t path_id =
+ GetPathId(ioptions_, mutable_cf_options_, estimated_total_size);
+ int start_level = sorted_runs_[start_index].level;
+
+ std::vector<CompactionInputFiles> inputs(vstorage_->num_levels());
+ for (size_t i = 0; i < inputs.size(); ++i) {
+ inputs[i].level = start_level + static_cast<int>(i);
+ }
+ for (size_t loop = start_index; loop <= end_index; loop++) {
+ auto& picking_sr = sorted_runs_[loop];
+ if (picking_sr.level == 0) {
+ FileMetaData* f = picking_sr.file;
+ inputs[0].files.push_back(f);
+ } else {
+ auto& files = inputs[picking_sr.level - start_level].files;
+ for (auto* f : vstorage_->LevelFiles(picking_sr.level)) {
+ files.push_back(f);
+ }
+ }
+ std::string comp_reason_print_string;
+ if (compaction_reason == CompactionReason::kPeriodicCompaction) {
+ comp_reason_print_string = "periodic compaction";
+ } else if (compaction_reason ==
+ CompactionReason::kUniversalSizeAmplification) {
+ comp_reason_print_string = "size amp";
+ } else {
+ assert(false);
+ comp_reason_print_string = "unknown: ";
+ comp_reason_print_string.append(
+ std::to_string(static_cast<int>(compaction_reason)));
+ }
+
+ char file_num_buf[256];
+ picking_sr.DumpSizeInfo(file_num_buf, sizeof(file_num_buf), loop);
+ ROCKS_LOG_BUFFER(log_buffer_, "[%s] Universal: %s picking %s",
+ cf_name_.c_str(), comp_reason_print_string.c_str(),
+ file_num_buf);
+ }
+
+ int output_level;
+ if (end_index == sorted_runs_.size() - 1) {
+ // output files at the last level, unless it's reserved
+ output_level = vstorage_->num_levels() - 1;
+ // last level is reserved for the files ingested behind
+ if (ioptions_.allow_ingest_behind) {
+ assert(output_level > 1);
+ output_level--;
+ }
+ } else {
+    // If not all sorted runs are included, the compaction can only output to
+    // the level above the `end_index + 1` sorted run.
+ output_level = sorted_runs_[end_index + 1].level - 1;
+ }
+
+  // Intra-L0 compaction outputs could have overlap.
+ if (output_level != 0 &&
+ picker_->FilesRangeOverlapWithCompaction(
+ inputs, output_level,
+ Compaction::EvaluatePenultimateLevel(vstorage_, ioptions_,
+ start_level, output_level))) {
+ return nullptr;
+ }
+
+  // We never check the size against
+  // compaction_options_universal.compression_size_percent here,
+  // because we always compact all the files, so we always compress.
+ return new Compaction(
+ vstorage_, ioptions_, mutable_cf_options_, mutable_db_options_,
+ std::move(inputs), output_level,
+ MaxFileSizeForLevel(mutable_cf_options_, output_level,
+ kCompactionStyleUniversal),
+ GetMaxOverlappingBytes(), path_id,
+ GetCompressionType(vstorage_, mutable_cf_options_, output_level, 1,
+ true /* enable_compression */),
+ GetCompressionOptions(mutable_cf_options_, vstorage_, output_level,
+ true /* enable_compression */),
+ Temperature::kUnknown,
+ /* max_subcompactions */ 0, /* grandparents */ {}, /* is manual */ false,
+ /* trim_ts */ "", score_, false /* deletion_compaction */,
+ /* l0_files_might_overlap */ true, compaction_reason);
+}
+
+Compaction* UniversalCompactionBuilder::PickPeriodicCompaction() {
+ ROCKS_LOG_BUFFER(log_buffer_, "[%s] Universal: Periodic Compaction",
+ cf_name_.c_str());
+
+  // In universal compaction, sorted runs containing older data are almost
+  // always generated earlier too. To simplify the problem, we just try to
+  // trigger a full compaction: we start from the oldest sorted run and
+  // include all newer sorted runs, until we hit a sorted run that is already
+  // being compacted. Since the largest (and usually oldest) sorted run is
+  // typically included anyway, doing a full compaction won't increase write
+  // amplification much.
+
+ // Get some information from marked files to check whether a file is
+ // included in the compaction.
+
+ size_t start_index = sorted_runs_.size();
+ while (start_index > 0 && !sorted_runs_[start_index - 1].being_compacted) {
+ start_index--;
+ }
+ if (start_index == sorted_runs_.size()) {
+ return nullptr;
+ }
+
+  // There is a rare corner case where we can't pick up all the files
+  // because some files are being compacted, and we end up picking files
+  // none of which actually need periodic compaction. Unless we would simply
+  // be recompacting the last sorted run (either the last level or the last
+  // L0 file), we just execute the compaction anyway, in order to simplify
+  // the logic.
+ if (start_index == sorted_runs_.size() - 1) {
+ bool included_file_marked = false;
+ int start_level = sorted_runs_[start_index].level;
+ FileMetaData* start_file = sorted_runs_[start_index].file;
+ for (const std::pair<int, FileMetaData*>& level_file_pair :
+ vstorage_->FilesMarkedForPeriodicCompaction()) {
+ if (start_level != 0) {
+ // Last sorted run is a level
+ if (start_level == level_file_pair.first) {
+ included_file_marked = true;
+ break;
+ }
+ } else {
+ // Last sorted run is a L0 file.
+ if (start_file == level_file_pair.second) {
+ included_file_marked = true;
+ break;
+ }
+ }
+ }
+ if (!included_file_marked) {
+ ROCKS_LOG_BUFFER(log_buffer_,
+ "[%s] Universal: Cannot form a compaction covering file "
+ "marked for periodic compaction",
+ cf_name_.c_str());
+ return nullptr;
+ }
+ }
+
+ Compaction* c = PickCompactionToOldest(start_index,
+ CompactionReason::kPeriodicCompaction);
+
+ TEST_SYNC_POINT_CALLBACK(
+ "UniversalCompactionPicker::PickPeriodicCompaction:Return", c);
+
+ return c;
+}
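Editorial aside, not part of the patch: the backward scan in PickPeriodicCompaction() picks the smallest start index whose suffix of sorted runs is entirely free to compact, so the oldest (and usually largest) run is included whenever possible. A minimal standalone sketch of that scan, using a plain vector of being_compacted flags as a stand-in for sorted_runs_:

// Sketch only; `being_compacted[i]` stands in for sorted_runs_[i].being_compacted.
#include <cstddef>
#include <vector>

size_t PickStartIndexSketch(const std::vector<bool>& being_compacted) {
  size_t start_index = being_compacted.size();
  while (start_index > 0 && !being_compacted[start_index - 1]) {
    --start_index;
  }
  return start_index;  // == size() means even the oldest run is busy
}

// Example: {false, false, true, false, false} (oldest run last) yields 3,
// so only runs 3..4 would be handed to PickCompactionToOldest().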
+
+uint64_t UniversalCompactionBuilder::GetMaxOverlappingBytes() const {
+ if (!mutable_cf_options_.compaction_options_universal.incremental) {
+ return std::numeric_limits<uint64_t>::max();
+ } else {
+    // Try to align the cutting boundary with files at the next level: allow
+    // up to 1.5x the target size of overlap, so a file isn't cut down to
+    // less than 1/2 of the target size yet can never overlap two full-size
+    // files at the next level.
+ return mutable_cf_options_.target_file_size_base / 2 * 3;
+ }
+}
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // !ROCKSDB_LITE
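Editorial note on GetMaxOverlappingBytes() above: when incremental universal compaction is enabled, the cap is 3/2 of target_file_size_base, i.e. an output file is allowed to overlap roughly one and a half target-size files at the next level. A hedged worked example, assuming (for illustration only) a target_file_size_base of 64 MiB:

// Illustrative arithmetic only; the 64 MiB figure is an assumption, not a
// value taken from this patch.
#include <cstdint>

constexpr uint64_t kAssumedTargetFileSizeBase = 64ull << 20;  // 64 MiB
constexpr uint64_t kMaxOverlappingBytes =
    kAssumedTargetFileSizeBase / 2 * 3;                       // 96 MiB
static_assert(kMaxOverlappingBytes == 96ull << 20,
              "the cap is 1.5x the target file size");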
diff --git a/src/rocksdb/db/compaction/compaction_picker_universal.h b/src/rocksdb/db/compaction/compaction_picker_universal.h
new file mode 100644
index 000000000..5f897cc9b
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_picker_universal.h
@@ -0,0 +1,32 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include "db/compaction/compaction_picker.h"
+
+namespace ROCKSDB_NAMESPACE {
+class UniversalCompactionPicker : public CompactionPicker {
+ public:
+ UniversalCompactionPicker(const ImmutableOptions& ioptions,
+ const InternalKeyComparator* icmp)
+ : CompactionPicker(ioptions, icmp) {}
+ virtual Compaction* PickCompaction(
+ const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+ const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
+ LogBuffer* log_buffer,
+ SequenceNumber earliest_memtable_seqno = kMaxSequenceNumber) override;
+ virtual int MaxOutputLevel() const override { return NumberLevels() - 1; }
+
+ virtual bool NeedsCompaction(
+ const VersionStorageInfo* vstorage) const override;
+};
+} // namespace ROCKSDB_NAMESPACE
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/compaction/compaction_service_job.cc b/src/rocksdb/db/compaction/compaction_service_job.cc
new file mode 100644
index 000000000..1d2e99d99
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_service_job.cc
@@ -0,0 +1,829 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/compaction/compaction_job.h"
+#include "db/compaction/compaction_state.h"
+#include "logging/logging.h"
+#include "monitoring/iostats_context_imp.h"
+#include "monitoring/thread_status_util.h"
+#include "options/options_helper.h"
+#include "rocksdb/utilities/options_type.h"
+
+#ifndef ROCKSDB_LITE
+namespace ROCKSDB_NAMESPACE {
+class SubcompactionState;
+
+CompactionServiceJobStatus
+CompactionJob::ProcessKeyValueCompactionWithCompactionService(
+ SubcompactionState* sub_compact) {
+ assert(sub_compact);
+ assert(sub_compact->compaction);
+ assert(db_options_.compaction_service);
+
+ const Compaction* compaction = sub_compact->compaction;
+ CompactionServiceInput compaction_input;
+ compaction_input.output_level = compaction->output_level();
+ compaction_input.db_id = db_id_;
+
+ const std::vector<CompactionInputFiles>& inputs =
+ *(compact_->compaction->inputs());
+ for (const auto& files_per_level : inputs) {
+ for (const auto& file : files_per_level.files) {
+ compaction_input.input_files.emplace_back(
+ MakeTableFileName(file->fd.GetNumber()));
+ }
+ }
+ compaction_input.column_family.name =
+ compaction->column_family_data()->GetName();
+ compaction_input.column_family.options =
+ compaction->column_family_data()->GetLatestCFOptions();
+ compaction_input.db_options =
+ BuildDBOptions(db_options_, mutable_db_options_copy_);
+ compaction_input.snapshots = existing_snapshots_;
+ compaction_input.has_begin = sub_compact->start.has_value();
+ compaction_input.begin =
+ compaction_input.has_begin ? sub_compact->start->ToString() : "";
+ compaction_input.has_end = sub_compact->end.has_value();
+ compaction_input.end =
+ compaction_input.has_end ? sub_compact->end->ToString() : "";
+
+ std::string compaction_input_binary;
+ Status s = compaction_input.Write(&compaction_input_binary);
+ if (!s.ok()) {
+ sub_compact->status = s;
+ return CompactionServiceJobStatus::kFailure;
+ }
+
+ std::ostringstream input_files_oss;
+ bool is_first_one = true;
+ for (const auto& file : compaction_input.input_files) {
+ input_files_oss << (is_first_one ? "" : ", ") << file;
+ is_first_one = false;
+ }
+
+ ROCKS_LOG_INFO(
+ db_options_.info_log,
+ "[%s] [JOB %d] Starting remote compaction (output level: %d): %s",
+ compaction_input.column_family.name.c_str(), job_id_,
+ compaction_input.output_level, input_files_oss.str().c_str());
+ CompactionServiceJobInfo info(dbname_, db_id_, db_session_id_,
+ GetCompactionId(sub_compact), thread_pri_);
+ CompactionServiceJobStatus compaction_status =
+ db_options_.compaction_service->StartV2(info, compaction_input_binary);
+ switch (compaction_status) {
+ case CompactionServiceJobStatus::kSuccess:
+ break;
+ case CompactionServiceJobStatus::kFailure:
+ sub_compact->status = Status::Incomplete(
+ "CompactionService failed to start compaction job.");
+ ROCKS_LOG_WARN(db_options_.info_log,
+ "[%s] [JOB %d] Remote compaction failed to start.",
+ compaction_input.column_family.name.c_str(), job_id_);
+ return compaction_status;
+ case CompactionServiceJobStatus::kUseLocal:
+ ROCKS_LOG_INFO(
+ db_options_.info_log,
+ "[%s] [JOB %d] Remote compaction fallback to local by API Start.",
+ compaction_input.column_family.name.c_str(), job_id_);
+ return compaction_status;
+ default:
+ assert(false); // unknown status
+ break;
+ }
+
+ ROCKS_LOG_INFO(db_options_.info_log,
+ "[%s] [JOB %d] Waiting for remote compaction...",
+ compaction_input.column_family.name.c_str(), job_id_);
+ std::string compaction_result_binary;
+ compaction_status = db_options_.compaction_service->WaitForCompleteV2(
+ info, &compaction_result_binary);
+
+ if (compaction_status == CompactionServiceJobStatus::kUseLocal) {
+ ROCKS_LOG_INFO(db_options_.info_log,
+ "[%s] [JOB %d] Remote compaction fallback to local by API "
+ "WaitForComplete.",
+ compaction_input.column_family.name.c_str(), job_id_);
+ return compaction_status;
+ }
+
+ CompactionServiceResult compaction_result;
+ s = CompactionServiceResult::Read(compaction_result_binary,
+ &compaction_result);
+
+ if (compaction_status == CompactionServiceJobStatus::kFailure) {
+ if (s.ok()) {
+ if (compaction_result.status.ok()) {
+ sub_compact->status = Status::Incomplete(
+ "CompactionService failed to run the compaction job (even though "
+ "the internal status is okay).");
+ } else {
+ // set the current sub compaction status with the status returned from
+ // remote
+ sub_compact->status = compaction_result.status;
+ }
+ } else {
+ sub_compact->status = Status::Incomplete(
+ "CompactionService failed to run the compaction job (and no valid "
+ "result is returned).");
+ compaction_result.status.PermitUncheckedError();
+ }
+ ROCKS_LOG_WARN(db_options_.info_log,
+ "[%s] [JOB %d] Remote compaction failed.",
+ compaction_input.column_family.name.c_str(), job_id_);
+ return compaction_status;
+ }
+
+ if (!s.ok()) {
+ sub_compact->status = s;
+ compaction_result.status.PermitUncheckedError();
+ return CompactionServiceJobStatus::kFailure;
+ }
+ sub_compact->status = compaction_result.status;
+
+ std::ostringstream output_files_oss;
+ is_first_one = true;
+ for (const auto& file : compaction_result.output_files) {
+ output_files_oss << (is_first_one ? "" : ", ") << file.file_name;
+ is_first_one = false;
+ }
+
+ ROCKS_LOG_INFO(db_options_.info_log,
+ "[%s] [JOB %d] Receive remote compaction result, output path: "
+ "%s, files: %s",
+ compaction_input.column_family.name.c_str(), job_id_,
+ compaction_result.output_path.c_str(),
+ output_files_oss.str().c_str());
+
+ if (!s.ok()) {
+ sub_compact->status = s;
+ return CompactionServiceJobStatus::kFailure;
+ }
+
+ for (const auto& file : compaction_result.output_files) {
+ uint64_t file_num = versions_->NewFileNumber();
+ auto src_file = compaction_result.output_path + "/" + file.file_name;
+ auto tgt_file = TableFileName(compaction->immutable_options()->cf_paths,
+ file_num, compaction->output_path_id());
+ s = fs_->RenameFile(src_file, tgt_file, IOOptions(), nullptr);
+ if (!s.ok()) {
+ sub_compact->status = s;
+ return CompactionServiceJobStatus::kFailure;
+ }
+
+ FileMetaData meta;
+ uint64_t file_size;
+ s = fs_->GetFileSize(tgt_file, IOOptions(), &file_size, nullptr);
+ if (!s.ok()) {
+ sub_compact->status = s;
+ return CompactionServiceJobStatus::kFailure;
+ }
+ meta.fd = FileDescriptor(file_num, compaction->output_path_id(), file_size,
+ file.smallest_seqno, file.largest_seqno);
+ meta.smallest.DecodeFrom(file.smallest_internal_key);
+ meta.largest.DecodeFrom(file.largest_internal_key);
+ meta.oldest_ancester_time = file.oldest_ancester_time;
+ meta.file_creation_time = file.file_creation_time;
+ meta.marked_for_compaction = file.marked_for_compaction;
+ meta.unique_id = file.unique_id;
+
+ auto cfd = compaction->column_family_data();
+ sub_compact->Current().AddOutput(std::move(meta),
+ cfd->internal_comparator(), false, false,
+ true, file.paranoid_hash);
+ }
+ sub_compact->compaction_job_stats = compaction_result.stats;
+ sub_compact->Current().SetNumOutputRecords(
+ compaction_result.num_output_records);
+ sub_compact->Current().SetTotalBytes(compaction_result.total_bytes);
+ RecordTick(stats_, REMOTE_COMPACT_READ_BYTES, compaction_result.bytes_read);
+ RecordTick(stats_, REMOTE_COMPACT_WRITE_BYTES,
+ compaction_result.bytes_written);
+ return CompactionServiceJobStatus::kSuccess;
+}
+
+std::string CompactionServiceCompactionJob::GetTableFileName(
+ uint64_t file_number) {
+ return MakeTableFileName(output_path_, file_number);
+}
+
+void CompactionServiceCompactionJob::RecordCompactionIOStats() {
+ compaction_result_->bytes_read += IOSTATS(bytes_read);
+ compaction_result_->bytes_written += IOSTATS(bytes_written);
+ CompactionJob::RecordCompactionIOStats();
+}
+
+CompactionServiceCompactionJob::CompactionServiceCompactionJob(
+ int job_id, Compaction* compaction, const ImmutableDBOptions& db_options,
+ const MutableDBOptions& mutable_db_options, const FileOptions& file_options,
+ VersionSet* versions, const std::atomic<bool>* shutting_down,
+ LogBuffer* log_buffer, FSDirectory* output_directory, Statistics* stats,
+ InstrumentedMutex* db_mutex, ErrorHandler* db_error_handler,
+ std::vector<SequenceNumber> existing_snapshots,
+ std::shared_ptr<Cache> table_cache, EventLogger* event_logger,
+ const std::string& dbname, const std::shared_ptr<IOTracer>& io_tracer,
+ const std::atomic<bool>& manual_compaction_canceled,
+ const std::string& db_id, const std::string& db_session_id,
+ std::string output_path,
+ const CompactionServiceInput& compaction_service_input,
+ CompactionServiceResult* compaction_service_result)
+ : CompactionJob(
+ job_id, compaction, db_options, mutable_db_options, file_options,
+ versions, shutting_down, log_buffer, nullptr, output_directory,
+ nullptr, stats, db_mutex, db_error_handler,
+ std::move(existing_snapshots), kMaxSequenceNumber, nullptr, nullptr,
+ std::move(table_cache), event_logger,
+ compaction->mutable_cf_options()->paranoid_file_checks,
+ compaction->mutable_cf_options()->report_bg_io_stats, dbname,
+ &(compaction_service_result->stats), Env::Priority::USER, io_tracer,
+ manual_compaction_canceled, db_id, db_session_id,
+ compaction->column_family_data()->GetFullHistoryTsLow()),
+ output_path_(std::move(output_path)),
+ compaction_input_(compaction_service_input),
+ compaction_result_(compaction_service_result) {}
+
+Status CompactionServiceCompactionJob::Run() {
+ AutoThreadOperationStageUpdater stage_updater(
+ ThreadStatus::STAGE_COMPACTION_RUN);
+
+ auto* c = compact_->compaction;
+ assert(c->column_family_data() != nullptr);
+ assert(c->column_family_data()->current()->storage_info()->NumLevelFiles(
+ compact_->compaction->level()) > 0);
+
+ write_hint_ =
+ c->column_family_data()->CalculateSSTWriteHint(c->output_level());
+ bottommost_level_ = c->bottommost_level();
+
+ Slice begin = compaction_input_.begin;
+ Slice end = compaction_input_.end;
+ compact_->sub_compact_states.emplace_back(
+ c,
+ compaction_input_.has_begin ? std::optional<Slice>(begin)
+ : std::optional<Slice>(),
+ compaction_input_.has_end ? std::optional<Slice>(end)
+ : std::optional<Slice>(),
+ /*sub_job_id*/ 0);
+
+ log_buffer_->FlushBufferToLog();
+ LogCompaction();
+ const uint64_t start_micros = db_options_.clock->NowMicros();
+ // Pick the only sub-compaction we should have
+ assert(compact_->sub_compact_states.size() == 1);
+ SubcompactionState* sub_compact = compact_->sub_compact_states.data();
+
+ ProcessKeyValueCompaction(sub_compact);
+
+ compaction_stats_.stats.micros =
+ db_options_.clock->NowMicros() - start_micros;
+ compaction_stats_.stats.cpu_micros =
+ sub_compact->compaction_job_stats.cpu_micros;
+
+ RecordTimeToHistogram(stats_, COMPACTION_TIME,
+ compaction_stats_.stats.micros);
+ RecordTimeToHistogram(stats_, COMPACTION_CPU_TIME,
+ compaction_stats_.stats.cpu_micros);
+
+ Status status = sub_compact->status;
+ IOStatus io_s = sub_compact->io_status;
+
+ if (io_status_.ok()) {
+ io_status_ = io_s;
+ }
+
+ if (status.ok()) {
+ constexpr IODebugContext* dbg = nullptr;
+
+ if (output_directory_) {
+ io_s = output_directory_->FsyncWithDirOptions(IOOptions(), dbg,
+ DirFsyncOptions());
+ }
+ }
+ if (io_status_.ok()) {
+ io_status_ = io_s;
+ }
+ if (status.ok()) {
+ status = io_s;
+ }
+ if (status.ok()) {
+ // TODO: Add verify_table()
+ }
+
+ // Finish up all book-keeping to unify the subcompaction results
+ compact_->AggregateCompactionStats(compaction_stats_, *compaction_job_stats_);
+ UpdateCompactionStats();
+ RecordCompactionIOStats();
+
+ LogFlush(db_options_.info_log);
+ compact_->status = status;
+ compact_->status.PermitUncheckedError();
+
+ // Build compaction result
+ compaction_result_->output_level = compact_->compaction->output_level();
+ compaction_result_->output_path = output_path_;
+ for (const auto& output_file : sub_compact->GetOutputs()) {
+ auto& meta = output_file.meta;
+ compaction_result_->output_files.emplace_back(
+ MakeTableFileName(meta.fd.GetNumber()), meta.fd.smallest_seqno,
+ meta.fd.largest_seqno, meta.smallest.Encode().ToString(),
+ meta.largest.Encode().ToString(), meta.oldest_ancester_time,
+ meta.file_creation_time, output_file.validator.GetHash(),
+ meta.marked_for_compaction, meta.unique_id);
+ }
+ InternalStats::CompactionStatsFull compaction_stats;
+ sub_compact->AggregateCompactionStats(compaction_stats);
+ compaction_result_->num_output_records =
+ compaction_stats.stats.num_output_records;
+ compaction_result_->total_bytes = compaction_stats.TotalBytesWritten();
+
+ return status;
+}
+
+void CompactionServiceCompactionJob::CleanupCompaction() {
+ CompactionJob::CleanupCompaction();
+}
+
+// Internal binary format for the input and result data
+enum BinaryFormatVersion : uint32_t {
+ kOptionsString = 1, // Use string format similar to Option string format
+};
+
+static std::unordered_map<std::string, OptionTypeInfo> cfd_type_info = {
+ {"name",
+ {offsetof(struct ColumnFamilyDescriptor, name), OptionType::kEncodedString,
+ OptionVerificationType::kNormal, OptionTypeFlags::kNone}},
+ {"options",
+ {offsetof(struct ColumnFamilyDescriptor, options),
+ OptionType::kConfigurable, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone,
+ [](const ConfigOptions& opts, const std::string& /*name*/,
+ const std::string& value, void* addr) {
+ auto cf_options = static_cast<ColumnFamilyOptions*>(addr);
+ return GetColumnFamilyOptionsFromString(opts, ColumnFamilyOptions(),
+ value, cf_options);
+ },
+ [](const ConfigOptions& opts, const std::string& /*name*/,
+ const void* addr, std::string* value) {
+ const auto cf_options = static_cast<const ColumnFamilyOptions*>(addr);
+ std::string result;
+ auto status =
+ GetStringFromColumnFamilyOptions(opts, *cf_options, &result);
+ *value = "{" + result + "}";
+ return status;
+ },
+ [](const ConfigOptions& opts, const std::string& name, const void* addr1,
+ const void* addr2, std::string* mismatch) {
+ const auto this_one = static_cast<const ColumnFamilyOptions*>(addr1);
+ const auto that_one = static_cast<const ColumnFamilyOptions*>(addr2);
+ auto this_conf = CFOptionsAsConfigurable(*this_one);
+ auto that_conf = CFOptionsAsConfigurable(*that_one);
+ std::string mismatch_opt;
+ bool result =
+ this_conf->AreEquivalent(opts, that_conf.get(), &mismatch_opt);
+ if (!result) {
+ *mismatch = name + "." + mismatch_opt;
+ }
+ return result;
+ }}},
+};
+
+static std::unordered_map<std::string, OptionTypeInfo> cs_input_type_info = {
+ {"column_family",
+ OptionTypeInfo::Struct(
+ "column_family", &cfd_type_info,
+ offsetof(struct CompactionServiceInput, column_family),
+ OptionVerificationType::kNormal, OptionTypeFlags::kNone)},
+ {"db_options",
+ {offsetof(struct CompactionServiceInput, db_options),
+ OptionType::kConfigurable, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone,
+ [](const ConfigOptions& opts, const std::string& /*name*/,
+ const std::string& value, void* addr) {
+ auto options = static_cast<DBOptions*>(addr);
+ return GetDBOptionsFromString(opts, DBOptions(), value, options);
+ },
+ [](const ConfigOptions& opts, const std::string& /*name*/,
+ const void* addr, std::string* value) {
+ const auto options = static_cast<const DBOptions*>(addr);
+ std::string result;
+ auto status = GetStringFromDBOptions(opts, *options, &result);
+ *value = "{" + result + "}";
+ return status;
+ },
+ [](const ConfigOptions& opts, const std::string& name, const void* addr1,
+ const void* addr2, std::string* mismatch) {
+ const auto this_one = static_cast<const DBOptions*>(addr1);
+ const auto that_one = static_cast<const DBOptions*>(addr2);
+ auto this_conf = DBOptionsAsConfigurable(*this_one);
+ auto that_conf = DBOptionsAsConfigurable(*that_one);
+ std::string mismatch_opt;
+ bool result =
+ this_conf->AreEquivalent(opts, that_conf.get(), &mismatch_opt);
+ if (!result) {
+ *mismatch = name + "." + mismatch_opt;
+ }
+ return result;
+ }}},
+ {"snapshots", OptionTypeInfo::Vector<uint64_t>(
+ offsetof(struct CompactionServiceInput, snapshots),
+ OptionVerificationType::kNormal, OptionTypeFlags::kNone,
+ {0, OptionType::kUInt64T})},
+ {"input_files", OptionTypeInfo::Vector<std::string>(
+ offsetof(struct CompactionServiceInput, input_files),
+ OptionVerificationType::kNormal, OptionTypeFlags::kNone,
+ {0, OptionType::kEncodedString})},
+ {"output_level",
+ {offsetof(struct CompactionServiceInput, output_level), OptionType::kInt,
+ OptionVerificationType::kNormal, OptionTypeFlags::kNone}},
+ {"db_id",
+ {offsetof(struct CompactionServiceInput, db_id),
+ OptionType::kEncodedString}},
+ {"has_begin",
+ {offsetof(struct CompactionServiceInput, has_begin), OptionType::kBoolean,
+ OptionVerificationType::kNormal, OptionTypeFlags::kNone}},
+ {"begin",
+ {offsetof(struct CompactionServiceInput, begin),
+ OptionType::kEncodedString, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"has_end",
+ {offsetof(struct CompactionServiceInput, has_end), OptionType::kBoolean,
+ OptionVerificationType::kNormal, OptionTypeFlags::kNone}},
+ {"end",
+ {offsetof(struct CompactionServiceInput, end), OptionType::kEncodedString,
+ OptionVerificationType::kNormal, OptionTypeFlags::kNone}},
+};
+
+static std::unordered_map<std::string, OptionTypeInfo>
+ cs_output_file_type_info = {
+ {"file_name",
+ {offsetof(struct CompactionServiceOutputFile, file_name),
+ OptionType::kEncodedString, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"smallest_seqno",
+ {offsetof(struct CompactionServiceOutputFile, smallest_seqno),
+ OptionType::kUInt64T, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"largest_seqno",
+ {offsetof(struct CompactionServiceOutputFile, largest_seqno),
+ OptionType::kUInt64T, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"smallest_internal_key",
+ {offsetof(struct CompactionServiceOutputFile, smallest_internal_key),
+ OptionType::kEncodedString, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"largest_internal_key",
+ {offsetof(struct CompactionServiceOutputFile, largest_internal_key),
+ OptionType::kEncodedString, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"oldest_ancester_time",
+ {offsetof(struct CompactionServiceOutputFile, oldest_ancester_time),
+ OptionType::kUInt64T, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"file_creation_time",
+ {offsetof(struct CompactionServiceOutputFile, file_creation_time),
+ OptionType::kUInt64T, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"paranoid_hash",
+ {offsetof(struct CompactionServiceOutputFile, paranoid_hash),
+ OptionType::kUInt64T, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"marked_for_compaction",
+ {offsetof(struct CompactionServiceOutputFile, marked_for_compaction),
+ OptionType::kBoolean, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"unique_id",
+ OptionTypeInfo::Array<uint64_t, 2>(
+ offsetof(struct CompactionServiceOutputFile, unique_id),
+ OptionVerificationType::kNormal, OptionTypeFlags::kNone,
+ {0, OptionType::kUInt64T})},
+};
+
+static std::unordered_map<std::string, OptionTypeInfo>
+ compaction_job_stats_type_info = {
+ {"elapsed_micros",
+ {offsetof(struct CompactionJobStats, elapsed_micros),
+ OptionType::kUInt64T, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"cpu_micros",
+ {offsetof(struct CompactionJobStats, cpu_micros), OptionType::kUInt64T,
+ OptionVerificationType::kNormal, OptionTypeFlags::kNone}},
+ {"num_input_records",
+ {offsetof(struct CompactionJobStats, num_input_records),
+ OptionType::kUInt64T, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"num_blobs_read",
+ {offsetof(struct CompactionJobStats, num_blobs_read),
+ OptionType::kUInt64T, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"num_input_files",
+ {offsetof(struct CompactionJobStats, num_input_files),
+ OptionType::kSizeT, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"num_input_files_at_output_level",
+ {offsetof(struct CompactionJobStats, num_input_files_at_output_level),
+ OptionType::kSizeT, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"num_output_records",
+ {offsetof(struct CompactionJobStats, num_output_records),
+ OptionType::kUInt64T, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"num_output_files",
+ {offsetof(struct CompactionJobStats, num_output_files),
+ OptionType::kSizeT, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"num_output_files_blob",
+ {offsetof(struct CompactionJobStats, num_output_files_blob),
+ OptionType::kSizeT, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"is_full_compaction",
+ {offsetof(struct CompactionJobStats, is_full_compaction),
+ OptionType::kBoolean, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"is_manual_compaction",
+ {offsetof(struct CompactionJobStats, is_manual_compaction),
+ OptionType::kBoolean, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"total_input_bytes",
+ {offsetof(struct CompactionJobStats, total_input_bytes),
+ OptionType::kUInt64T, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"total_blob_bytes_read",
+ {offsetof(struct CompactionJobStats, total_blob_bytes_read),
+ OptionType::kUInt64T, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"total_output_bytes",
+ {offsetof(struct CompactionJobStats, total_output_bytes),
+ OptionType::kUInt64T, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"total_output_bytes_blob",
+ {offsetof(struct CompactionJobStats, total_output_bytes_blob),
+ OptionType::kUInt64T, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"num_records_replaced",
+ {offsetof(struct CompactionJobStats, num_records_replaced),
+ OptionType::kUInt64T, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"total_input_raw_key_bytes",
+ {offsetof(struct CompactionJobStats, total_input_raw_key_bytes),
+ OptionType::kUInt64T, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"total_input_raw_value_bytes",
+ {offsetof(struct CompactionJobStats, total_input_raw_value_bytes),
+ OptionType::kUInt64T, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"num_input_deletion_records",
+ {offsetof(struct CompactionJobStats, num_input_deletion_records),
+ OptionType::kUInt64T, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"num_expired_deletion_records",
+ {offsetof(struct CompactionJobStats, num_expired_deletion_records),
+ OptionType::kUInt64T, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"num_corrupt_keys",
+ {offsetof(struct CompactionJobStats, num_corrupt_keys),
+ OptionType::kUInt64T, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"file_write_nanos",
+ {offsetof(struct CompactionJobStats, file_write_nanos),
+ OptionType::kUInt64T, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"file_range_sync_nanos",
+ {offsetof(struct CompactionJobStats, file_range_sync_nanos),
+ OptionType::kUInt64T, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"file_fsync_nanos",
+ {offsetof(struct CompactionJobStats, file_fsync_nanos),
+ OptionType::kUInt64T, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"file_prepare_write_nanos",
+ {offsetof(struct CompactionJobStats, file_prepare_write_nanos),
+ OptionType::kUInt64T, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"smallest_output_key_prefix",
+ {offsetof(struct CompactionJobStats, smallest_output_key_prefix),
+ OptionType::kEncodedString, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"largest_output_key_prefix",
+ {offsetof(struct CompactionJobStats, largest_output_key_prefix),
+ OptionType::kEncodedString, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"num_single_del_fallthru",
+ {offsetof(struct CompactionJobStats, num_single_del_fallthru),
+ OptionType::kUInt64T, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"num_single_del_mismatch",
+ {offsetof(struct CompactionJobStats, num_single_del_mismatch),
+ OptionType::kUInt64T, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+};
+
+namespace {
+// This is a helper struct used to serialize and deserialize the Status class,
+// because Status's members are not public.
+struct StatusSerializationAdapter {
+ uint8_t code;
+ uint8_t subcode;
+ uint8_t severity;
+ std::string message;
+
+ StatusSerializationAdapter() = default;
+ explicit StatusSerializationAdapter(const Status& s) {
+ code = s.code();
+ subcode = s.subcode();
+ severity = s.severity();
+ auto msg = s.getState();
+ message = msg ? msg : "";
+ }
+
+ Status GetStatus() const {
+ return Status{static_cast<Status::Code>(code),
+ static_cast<Status::SubCode>(subcode),
+ static_cast<Status::Severity>(severity), message};
+ }
+};
+} // namespace
+
+static std::unordered_map<std::string, OptionTypeInfo>
+ status_adapter_type_info = {
+ {"code",
+ {offsetof(struct StatusSerializationAdapter, code),
+ OptionType::kUInt8T, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"subcode",
+ {offsetof(struct StatusSerializationAdapter, subcode),
+ OptionType::kUInt8T, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"severity",
+ {offsetof(struct StatusSerializationAdapter, severity),
+ OptionType::kUInt8T, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"message",
+ {offsetof(struct StatusSerializationAdapter, message),
+ OptionType::kEncodedString, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+};
+
+static std::unordered_map<std::string, OptionTypeInfo> cs_result_type_info = {
+ {"status",
+ {offsetof(struct CompactionServiceResult, status),
+ OptionType::kCustomizable, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone,
+ [](const ConfigOptions& opts, const std::string& /*name*/,
+ const std::string& value, void* addr) {
+ auto status_obj = static_cast<Status*>(addr);
+ StatusSerializationAdapter adapter;
+ Status s = OptionTypeInfo::ParseType(
+ opts, value, status_adapter_type_info, &adapter);
+ *status_obj = adapter.GetStatus();
+ return s;
+ },
+ [](const ConfigOptions& opts, const std::string& /*name*/,
+ const void* addr, std::string* value) {
+ const auto status_obj = static_cast<const Status*>(addr);
+ StatusSerializationAdapter adapter(*status_obj);
+ std::string result;
+ Status s = OptionTypeInfo::SerializeType(opts, status_adapter_type_info,
+ &adapter, &result);
+ *value = "{" + result + "}";
+ return s;
+ },
+ [](const ConfigOptions& opts, const std::string& /*name*/,
+ const void* addr1, const void* addr2, std::string* mismatch) {
+ const auto status1 = static_cast<const Status*>(addr1);
+ const auto status2 = static_cast<const Status*>(addr2);
+
+      StatusSerializationAdapter adapter1(*status1);
+      StatusSerializationAdapter adapter2(*status2);
+      return OptionTypeInfo::TypesAreEqual(opts, status_adapter_type_info,
+                                           &adapter1, &adapter2, mismatch);
+ }}},
+ {"output_files",
+ OptionTypeInfo::Vector<CompactionServiceOutputFile>(
+ offsetof(struct CompactionServiceResult, output_files),
+ OptionVerificationType::kNormal, OptionTypeFlags::kNone,
+ OptionTypeInfo::Struct("output_files", &cs_output_file_type_info, 0,
+ OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone))},
+ {"output_level",
+ {offsetof(struct CompactionServiceResult, output_level), OptionType::kInt,
+ OptionVerificationType::kNormal, OptionTypeFlags::kNone}},
+ {"output_path",
+ {offsetof(struct CompactionServiceResult, output_path),
+ OptionType::kEncodedString, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"num_output_records",
+ {offsetof(struct CompactionServiceResult, num_output_records),
+ OptionType::kUInt64T, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"total_bytes",
+ {offsetof(struct CompactionServiceResult, total_bytes),
+ OptionType::kUInt64T, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"bytes_read",
+ {offsetof(struct CompactionServiceResult, bytes_read),
+ OptionType::kUInt64T, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"bytes_written",
+ {offsetof(struct CompactionServiceResult, bytes_written),
+ OptionType::kUInt64T, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"stats", OptionTypeInfo::Struct(
+ "stats", &compaction_job_stats_type_info,
+ offsetof(struct CompactionServiceResult, stats),
+ OptionVerificationType::kNormal, OptionTypeFlags::kNone)},
+};
+
+Status CompactionServiceInput::Read(const std::string& data_str,
+ CompactionServiceInput* obj) {
+ if (data_str.size() <= sizeof(BinaryFormatVersion)) {
+ return Status::InvalidArgument("Invalid CompactionServiceInput string");
+ }
+ auto format_version = DecodeFixed32(data_str.data());
+ if (format_version == kOptionsString) {
+ ConfigOptions cf;
+ cf.invoke_prepare_options = false;
+ cf.ignore_unknown_options = true;
+ return OptionTypeInfo::ParseType(
+ cf, data_str.substr(sizeof(BinaryFormatVersion)), cs_input_type_info,
+ obj);
+ } else {
+ return Status::NotSupported(
+ "Compaction Service Input data version not supported: " +
+ std::to_string(format_version));
+ }
+}
+
+Status CompactionServiceInput::Write(std::string* output) {
+ char buf[sizeof(BinaryFormatVersion)];
+ EncodeFixed32(buf, kOptionsString);
+ output->append(buf, sizeof(BinaryFormatVersion));
+ ConfigOptions cf;
+ cf.invoke_prepare_options = false;
+ return OptionTypeInfo::SerializeType(cf, cs_input_type_info, this, output);
+}
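A hedged usage sketch, not part of the patch, of the serialization format implemented above: a 4-byte version tag (kOptionsString) followed by an options-style string, round-tripped through Write() and Read(). The field values below are illustrative only, and the sketch assumes the CompactionServiceInput declaration pulled in via the headers included at the top of this file:

// Sketch only; error handling is reduced to asserts for brevity.
#include <cassert>
#include <string>

void CompactionServiceInputRoundTripSketch() {
  ROCKSDB_NAMESPACE::CompactionServiceInput in;
  in.db_id = "example-db-id";                        // illustrative values
  in.output_level = 3;
  in.input_files = {"000123.sst", "000124.sst"};

  std::string payload;
  ROCKSDB_NAMESPACE::Status s = in.Write(&payload);  // version tag + options string
  assert(s.ok());

  ROCKSDB_NAMESPACE::CompactionServiceInput parsed;
  s = ROCKSDB_NAMESPACE::CompactionServiceInput::Read(payload, &parsed);
  assert(s.ok() && parsed.output_level == 3);
}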
+
+Status CompactionServiceResult::Read(const std::string& data_str,
+ CompactionServiceResult* obj) {
+ if (data_str.size() <= sizeof(BinaryFormatVersion)) {
+ return Status::InvalidArgument("Invalid CompactionServiceResult string");
+ }
+ auto format_version = DecodeFixed32(data_str.data());
+ if (format_version == kOptionsString) {
+ ConfigOptions cf;
+ cf.invoke_prepare_options = false;
+ cf.ignore_unknown_options = true;
+ return OptionTypeInfo::ParseType(
+ cf, data_str.substr(sizeof(BinaryFormatVersion)), cs_result_type_info,
+ obj);
+ } else {
+ return Status::NotSupported(
+ "Compaction Service Result data version not supported: " +
+ std::to_string(format_version));
+ }
+}
+
+Status CompactionServiceResult::Write(std::string* output) {
+ char buf[sizeof(BinaryFormatVersion)];
+ EncodeFixed32(buf, kOptionsString);
+ output->append(buf, sizeof(BinaryFormatVersion));
+ ConfigOptions cf;
+ cf.invoke_prepare_options = false;
+ return OptionTypeInfo::SerializeType(cf, cs_result_type_info, this, output);
+}
+
+#ifndef NDEBUG
+bool CompactionServiceResult::TEST_Equals(CompactionServiceResult* other) {
+ std::string mismatch;
+ return TEST_Equals(other, &mismatch);
+}
+
+bool CompactionServiceResult::TEST_Equals(CompactionServiceResult* other,
+ std::string* mismatch) {
+ ConfigOptions cf;
+ cf.invoke_prepare_options = false;
+ return OptionTypeInfo::TypesAreEqual(cf, cs_result_type_info, this, other,
+ mismatch);
+}
+
+bool CompactionServiceInput::TEST_Equals(CompactionServiceInput* other) {
+ std::string mismatch;
+ return TEST_Equals(other, &mismatch);
+}
+
+bool CompactionServiceInput::TEST_Equals(CompactionServiceInput* other,
+ std::string* mismatch) {
+ ConfigOptions cf;
+ cf.invoke_prepare_options = false;
+ return OptionTypeInfo::TypesAreEqual(cf, cs_input_type_info, this, other,
+ mismatch);
+}
+#endif // NDEBUG
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/compaction/compaction_service_test.cc b/src/rocksdb/db/compaction/compaction_service_test.cc
new file mode 100644
index 000000000..c475c4e3b
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_service_test.cc
@@ -0,0 +1,966 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "db/db_test_util.h"
+#include "port/stack_trace.h"
+#include "table/unique_id_impl.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class MyTestCompactionService : public CompactionService {
+ public:
+ MyTestCompactionService(
+ std::string db_path, Options& options,
+ std::shared_ptr<Statistics>& statistics,
+ std::vector<std::shared_ptr<EventListener>>& listeners,
+ std::vector<std::shared_ptr<TablePropertiesCollectorFactory>>
+ table_properties_collector_factories)
+ : db_path_(std::move(db_path)),
+ options_(options),
+ statistics_(statistics),
+ start_info_("na", "na", "na", 0, Env::TOTAL),
+ wait_info_("na", "na", "na", 0, Env::TOTAL),
+ listeners_(listeners),
+ table_properties_collector_factories_(
+ std::move(table_properties_collector_factories)) {}
+
+ static const char* kClassName() { return "MyTestCompactionService"; }
+
+ const char* Name() const override { return kClassName(); }
+
+ CompactionServiceJobStatus StartV2(
+ const CompactionServiceJobInfo& info,
+ const std::string& compaction_service_input) override {
+ InstrumentedMutexLock l(&mutex_);
+ start_info_ = info;
+ assert(info.db_name == db_path_);
+ jobs_.emplace(info.job_id, compaction_service_input);
+ CompactionServiceJobStatus s = CompactionServiceJobStatus::kSuccess;
+ if (is_override_start_status_) {
+ return override_start_status_;
+ }
+ return s;
+ }
+
+ CompactionServiceJobStatus WaitForCompleteV2(
+ const CompactionServiceJobInfo& info,
+ std::string* compaction_service_result) override {
+ std::string compaction_input;
+ assert(info.db_name == db_path_);
+ {
+ InstrumentedMutexLock l(&mutex_);
+ wait_info_ = info;
+ auto i = jobs_.find(info.job_id);
+ if (i == jobs_.end()) {
+ return CompactionServiceJobStatus::kFailure;
+ }
+ compaction_input = std::move(i->second);
+ jobs_.erase(i);
+ }
+
+ if (is_override_wait_status_) {
+ return override_wait_status_;
+ }
+
+ CompactionServiceOptionsOverride options_override;
+ options_override.env = options_.env;
+ options_override.file_checksum_gen_factory =
+ options_.file_checksum_gen_factory;
+ options_override.comparator = options_.comparator;
+ options_override.merge_operator = options_.merge_operator;
+ options_override.compaction_filter = options_.compaction_filter;
+ options_override.compaction_filter_factory =
+ options_.compaction_filter_factory;
+ options_override.prefix_extractor = options_.prefix_extractor;
+ options_override.table_factory = options_.table_factory;
+ options_override.sst_partitioner_factory = options_.sst_partitioner_factory;
+ options_override.statistics = statistics_;
+ if (!listeners_.empty()) {
+ options_override.listeners = listeners_;
+ }
+
+ if (!table_properties_collector_factories_.empty()) {
+ options_override.table_properties_collector_factories =
+ table_properties_collector_factories_;
+ }
+
+ OpenAndCompactOptions options;
+ options.canceled = &canceled_;
+
+ Status s = DB::OpenAndCompact(
+ options, db_path_, db_path_ + "/" + std::to_string(info.job_id),
+ compaction_input, compaction_service_result, options_override);
+ if (is_override_wait_result_) {
+ *compaction_service_result = override_wait_result_;
+ }
+ compaction_num_.fetch_add(1);
+ if (s.ok()) {
+ return CompactionServiceJobStatus::kSuccess;
+ } else {
+ return CompactionServiceJobStatus::kFailure;
+ }
+ }
+
+ int GetCompactionNum() { return compaction_num_.load(); }
+
+ CompactionServiceJobInfo GetCompactionInfoForStart() { return start_info_; }
+ CompactionServiceJobInfo GetCompactionInfoForWait() { return wait_info_; }
+
+ void OverrideStartStatus(CompactionServiceJobStatus s) {
+ is_override_start_status_ = true;
+ override_start_status_ = s;
+ }
+
+ void OverrideWaitStatus(CompactionServiceJobStatus s) {
+ is_override_wait_status_ = true;
+ override_wait_status_ = s;
+ }
+
+ void OverrideWaitResult(std::string str) {
+ is_override_wait_result_ = true;
+ override_wait_result_ = std::move(str);
+ }
+
+ void ResetOverride() {
+ is_override_wait_result_ = false;
+ is_override_start_status_ = false;
+ is_override_wait_status_ = false;
+ }
+
+ void SetCanceled(bool canceled) { canceled_ = canceled; }
+
+ private:
+ InstrumentedMutex mutex_;
+ std::atomic_int compaction_num_{0};
+ std::map<uint64_t, std::string> jobs_;
+ const std::string db_path_;
+ Options options_;
+ std::shared_ptr<Statistics> statistics_;
+ CompactionServiceJobInfo start_info_;
+ CompactionServiceJobInfo wait_info_;
+ bool is_override_start_status_ = false;
+ CompactionServiceJobStatus override_start_status_ =
+ CompactionServiceJobStatus::kFailure;
+ bool is_override_wait_status_ = false;
+ CompactionServiceJobStatus override_wait_status_ =
+ CompactionServiceJobStatus::kFailure;
+ bool is_override_wait_result_ = false;
+ std::string override_wait_result_;
+ std::vector<std::shared_ptr<EventListener>> listeners_;
+ std::vector<std::shared_ptr<TablePropertiesCollectorFactory>>
+ table_properties_collector_factories_;
+ std::atomic_bool canceled_{false};
+};
+
+class CompactionServiceTest : public DBTestBase {
+ public:
+ explicit CompactionServiceTest()
+ : DBTestBase("compaction_service_test", true) {}
+
+ protected:
+ void ReopenWithCompactionService(Options* options) {
+ options->env = env_;
+ primary_statistics_ = CreateDBStatistics();
+ options->statistics = primary_statistics_;
+ compactor_statistics_ = CreateDBStatistics();
+
+ compaction_service_ = std::make_shared<MyTestCompactionService>(
+ dbname_, *options, compactor_statistics_, remote_listeners,
+ remote_table_properties_collector_factories);
+ options->compaction_service = compaction_service_;
+ DestroyAndReopen(*options);
+ }
+
+ Statistics* GetCompactorStatistics() { return compactor_statistics_.get(); }
+
+ Statistics* GetPrimaryStatistics() { return primary_statistics_.get(); }
+
+ MyTestCompactionService* GetCompactionService() {
+ CompactionService* cs = compaction_service_.get();
+ return static_cast_with_check<MyTestCompactionService>(cs);
+ }
+
+ void GenerateTestData() {
+ // Generate 20 files @ L2
+ for (int i = 0; i < 20; i++) {
+ for (int j = 0; j < 10; j++) {
+ int key_id = i * 10 + j;
+ ASSERT_OK(Put(Key(key_id), "value" + std::to_string(key_id)));
+ }
+ ASSERT_OK(Flush());
+ }
+ MoveFilesToLevel(2);
+
+    // Generate 10 files @ L1 overlapping with all 20 files @ L2
+ for (int i = 0; i < 10; i++) {
+ for (int j = 0; j < 10; j++) {
+ int key_id = i * 20 + j * 2;
+ ASSERT_OK(Put(Key(key_id), "value_new" + std::to_string(key_id)));
+ }
+ ASSERT_OK(Flush());
+ }
+ MoveFilesToLevel(1);
+ ASSERT_EQ(FilesPerLevel(), "0,10,20");
+ }
+
+ void VerifyTestData() {
+ for (int i = 0; i < 200; i++) {
+ auto result = Get(Key(i));
+ if (i % 2) {
+ ASSERT_EQ(result, "value" + std::to_string(i));
+ } else {
+ ASSERT_EQ(result, "value_new" + std::to_string(i));
+ }
+ }
+ }
+
+ std::vector<std::shared_ptr<EventListener>> remote_listeners;
+ std::vector<std::shared_ptr<TablePropertiesCollectorFactory>>
+ remote_table_properties_collector_factories;
+
+ private:
+ std::shared_ptr<Statistics> compactor_statistics_;
+ std::shared_ptr<Statistics> primary_statistics_;
+ std::shared_ptr<CompactionService> compaction_service_;
+};
+
+TEST_F(CompactionServiceTest, BasicCompactions) {
+ Options options = CurrentOptions();
+ ReopenWithCompactionService(&options);
+
+ Statistics* primary_statistics = GetPrimaryStatistics();
+ Statistics* compactor_statistics = GetCompactorStatistics();
+
+ for (int i = 0; i < 20; i++) {
+ for (int j = 0; j < 10; j++) {
+ int key_id = i * 10 + j;
+ ASSERT_OK(Put(Key(key_id), "value" + std::to_string(key_id)));
+ }
+ ASSERT_OK(Flush());
+ }
+
+ for (int i = 0; i < 10; i++) {
+ for (int j = 0; j < 10; j++) {
+ int key_id = i * 20 + j * 2;
+ ASSERT_OK(Put(Key(key_id), "value_new" + std::to_string(key_id)));
+ }
+ ASSERT_OK(Flush());
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ // verify result
+ for (int i = 0; i < 200; i++) {
+ auto result = Get(Key(i));
+ if (i % 2) {
+ ASSERT_EQ(result, "value" + std::to_string(i));
+ } else {
+ ASSERT_EQ(result, "value_new" + std::to_string(i));
+ }
+ }
+ auto my_cs = GetCompactionService();
+ ASSERT_GE(my_cs->GetCompactionNum(), 1);
+
+  // make sure the compaction statistics are only recorded on the remote side
+ ASSERT_GE(compactor_statistics->getTickerCount(COMPACT_WRITE_BYTES), 1);
+ ASSERT_GE(compactor_statistics->getTickerCount(COMPACT_READ_BYTES), 1);
+ ASSERT_EQ(primary_statistics->getTickerCount(COMPACT_WRITE_BYTES), 0);
+  // Even with remote compaction, the primary host still needs to read SST
+  // files for `verify_table()`.
+ ASSERT_GE(primary_statistics->getTickerCount(COMPACT_READ_BYTES), 1);
+ // all the compaction write happens on the remote side
+ ASSERT_EQ(primary_statistics->getTickerCount(REMOTE_COMPACT_WRITE_BYTES),
+ compactor_statistics->getTickerCount(COMPACT_WRITE_BYTES));
+ ASSERT_GE(primary_statistics->getTickerCount(REMOTE_COMPACT_READ_BYTES), 1);
+ ASSERT_GT(primary_statistics->getTickerCount(COMPACT_READ_BYTES),
+ primary_statistics->getTickerCount(REMOTE_COMPACT_READ_BYTES));
+  // The compactor is already the remote side, which doesn't have a remote of its own.
+ ASSERT_EQ(compactor_statistics->getTickerCount(REMOTE_COMPACT_READ_BYTES), 0);
+ ASSERT_EQ(compactor_statistics->getTickerCount(REMOTE_COMPACT_WRITE_BYTES),
+ 0);
+
+ // Test failed compaction
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImplSecondary::CompactWithoutInstallation::End", [&](void* status) {
+ // override job status
+ auto s = static_cast<Status*>(status);
+ *s = Status::Aborted("MyTestCompactionService failed to compact!");
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ Status s;
+ for (int i = 0; i < 10; i++) {
+ for (int j = 0; j < 10; j++) {
+ int key_id = i * 20 + j * 2;
+ s = Put(Key(key_id), "value_new" + std::to_string(key_id));
+ if (s.IsAborted()) {
+ break;
+ }
+ }
+ if (s.IsAborted()) {
+ break;
+ }
+ s = Flush();
+ if (s.IsAborted()) {
+ break;
+ }
+ s = dbfull()->TEST_WaitForCompact();
+ if (s.IsAborted()) {
+ break;
+ }
+ }
+ ASSERT_TRUE(s.IsAborted());
+
+ // Test re-open and successful unique id verification
+ std::atomic_int verify_passed{0};
+ SyncPoint::GetInstance()->SetCallBack(
+ "BlockBasedTable::Open::PassedVerifyUniqueId", [&](void* arg) {
+        // count successful unique id verifications
+ auto id = static_cast<UniqueId64x2*>(arg);
+ assert(*id != kNullUniqueId64x2);
+ verify_passed++;
+ });
+ Reopen(options);
+ ASSERT_GT(verify_passed, 0);
+ Close();
+}
+
+TEST_F(CompactionServiceTest, ManualCompaction) {
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ ReopenWithCompactionService(&options);
+ GenerateTestData();
+
+ auto my_cs = GetCompactionService();
+
+ std::string start_str = Key(15);
+ std::string end_str = Key(45);
+ Slice start(start_str);
+ Slice end(end_str);
+ uint64_t comp_num = my_cs->GetCompactionNum();
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &start, &end));
+ ASSERT_GE(my_cs->GetCompactionNum(), comp_num + 1);
+ VerifyTestData();
+
+ start_str = Key(120);
+ start = start_str;
+ comp_num = my_cs->GetCompactionNum();
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &start, nullptr));
+ ASSERT_GE(my_cs->GetCompactionNum(), comp_num + 1);
+ VerifyTestData();
+
+ end_str = Key(92);
+ end = end_str;
+ comp_num = my_cs->GetCompactionNum();
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, &end));
+ ASSERT_GE(my_cs->GetCompactionNum(), comp_num + 1);
+ VerifyTestData();
+
+ comp_num = my_cs->GetCompactionNum();
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_GE(my_cs->GetCompactionNum(), comp_num + 1);
+ VerifyTestData();
+}
+
+TEST_F(CompactionServiceTest, CancelCompactionOnRemoteSide) {
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ ReopenWithCompactionService(&options);
+ GenerateTestData();
+
+ auto my_cs = GetCompactionService();
+
+ std::string start_str = Key(15);
+ std::string end_str = Key(45);
+ Slice start(start_str);
+ Slice end(end_str);
+ uint64_t comp_num = my_cs->GetCompactionNum();
+
+ // Test cancel compaction at the beginning
+ my_cs->SetCanceled(true);
+ auto s = db_->CompactRange(CompactRangeOptions(), &start, &end);
+ ASSERT_TRUE(s.IsIncomplete());
+ // compaction number is not increased
+ ASSERT_GE(my_cs->GetCompactionNum(), comp_num);
+ VerifyTestData();
+
+ // Test cancel compaction in progress
+ ReopenWithCompactionService(&options);
+ GenerateTestData();
+ my_cs = GetCompactionService();
+ my_cs->SetCanceled(false);
+
+ std::atomic_bool cancel_issued{false};
+ SyncPoint::GetInstance()->SetCallBack("CompactionJob::Run():Inprogress",
+ [&](void* /*arg*/) {
+ cancel_issued = true;
+ my_cs->SetCanceled(true);
+ });
+
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ s = db_->CompactRange(CompactRangeOptions(), &start, &end);
+ ASSERT_TRUE(s.IsIncomplete());
+ ASSERT_TRUE(cancel_issued);
+ // compaction number is not increased
+ ASSERT_GE(my_cs->GetCompactionNum(), comp_num);
+ VerifyTestData();
+}
+
+TEST_F(CompactionServiceTest, FailedToStart) {
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ ReopenWithCompactionService(&options);
+
+ GenerateTestData();
+
+ auto my_cs = GetCompactionService();
+ my_cs->OverrideStartStatus(CompactionServiceJobStatus::kFailure);
+
+ std::string start_str = Key(15);
+ std::string end_str = Key(45);
+ Slice start(start_str);
+ Slice end(end_str);
+ Status s = db_->CompactRange(CompactRangeOptions(), &start, &end);
+ ASSERT_TRUE(s.IsIncomplete());
+}
+
+TEST_F(CompactionServiceTest, InvalidResult) {
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ ReopenWithCompactionService(&options);
+
+ GenerateTestData();
+
+ auto my_cs = GetCompactionService();
+ my_cs->OverrideWaitResult("Invalid Str");
+
+ std::string start_str = Key(15);
+ std::string end_str = Key(45);
+ Slice start(start_str);
+ Slice end(end_str);
+ Status s = db_->CompactRange(CompactRangeOptions(), &start, &end);
+ ASSERT_FALSE(s.ok());
+}
+
+TEST_F(CompactionServiceTest, SubCompaction) {
+ Options options = CurrentOptions();
+ options.max_subcompactions = 10;
+ options.target_file_size_base = 1 << 10; // 1KB
+ options.disable_auto_compactions = true;
+ ReopenWithCompactionService(&options);
+
+ GenerateTestData();
+ VerifyTestData();
+
+ auto my_cs = GetCompactionService();
+ int compaction_num_before = my_cs->GetCompactionNum();
+
+ auto cro = CompactRangeOptions();
+ cro.max_subcompactions = 10;
+ Status s = db_->CompactRange(cro, nullptr, nullptr);
+ ASSERT_OK(s);
+ VerifyTestData();
+ int compaction_num = my_cs->GetCompactionNum() - compaction_num_before;
+  // make sure sub-compactions happened by checking the compaction number
+ ASSERT_GE(compaction_num, 2);
+}
+
+class PartialDeleteCompactionFilter : public CompactionFilter {
+ public:
+ CompactionFilter::Decision FilterV2(
+ int /*level*/, const Slice& key, ValueType /*value_type*/,
+ const Slice& /*existing_value*/, std::string* /*new_value*/,
+ std::string* /*skip_until*/) const override {
+ int i = std::stoi(key.ToString().substr(3));
+ if (i > 5 && i <= 105) {
+ return CompactionFilter::Decision::kRemove;
+ }
+ return CompactionFilter::Decision::kKeep;
+ }
+
+ const char* Name() const override { return "PartialDeleteCompactionFilter"; }
+};
+
+TEST_F(CompactionServiceTest, CompactionFilter) {
+ Options options = CurrentOptions();
+ std::unique_ptr<CompactionFilter> delete_comp_filter(
+ new PartialDeleteCompactionFilter());
+ options.compaction_filter = delete_comp_filter.get();
+ ReopenWithCompactionService(&options);
+
+ for (int i = 0; i < 20; i++) {
+ for (int j = 0; j < 10; j++) {
+ int key_id = i * 10 + j;
+ ASSERT_OK(Put(Key(key_id), "value" + std::to_string(key_id)));
+ }
+ ASSERT_OK(Flush());
+ }
+
+ for (int i = 0; i < 10; i++) {
+ for (int j = 0; j < 10; j++) {
+ int key_id = i * 20 + j * 2;
+ ASSERT_OK(Put(Key(key_id), "value_new" + std::to_string(key_id)));
+ }
+ ASSERT_OK(Flush());
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+ // verify result
+ for (int i = 0; i < 200; i++) {
+ auto result = Get(Key(i));
+ if (i > 5 && i <= 105) {
+ ASSERT_EQ(result, "NOT_FOUND");
+ } else if (i % 2) {
+ ASSERT_EQ(result, "value" + std::to_string(i));
+ } else {
+ ASSERT_EQ(result, "value_new" + std::to_string(i));
+ }
+ }
+ auto my_cs = GetCompactionService();
+ ASSERT_GE(my_cs->GetCompactionNum(), 1);
+}
+
+TEST_F(CompactionServiceTest, Snapshot) {
+ Options options = CurrentOptions();
+ ReopenWithCompactionService(&options);
+
+ ASSERT_OK(Put(Key(1), "value1"));
+ ASSERT_OK(Put(Key(2), "value1"));
+ const Snapshot* s1 = db_->GetSnapshot();
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put(Key(1), "value2"));
+ ASSERT_OK(Put(Key(3), "value2"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ auto my_cs = GetCompactionService();
+ ASSERT_GE(my_cs->GetCompactionNum(), 1);
+ ASSERT_EQ("value1", Get(Key(1), s1));
+ ASSERT_EQ("value2", Get(Key(1)));
+ db_->ReleaseSnapshot(s1);
+}
+
+TEST_F(CompactionServiceTest, ConcurrentCompaction) {
+ Options options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = 100;
+ options.max_background_jobs = 20;
+ ReopenWithCompactionService(&options);
+ GenerateTestData();
+
+ ColumnFamilyMetaData meta;
+ db_->GetColumnFamilyMetaData(&meta);
+
+ std::vector<std::thread> threads;
+ for (const auto& file : meta.levels[1].files) {
+ threads.emplace_back(std::thread([&]() {
+ std::string fname = file.db_path + "/" + file.name;
+ ASSERT_OK(db_->CompactFiles(CompactionOptions(), {fname}, 2));
+ }));
+ }
+
+ for (auto& thread : threads) {
+ thread.join();
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ // verify result
+ for (int i = 0; i < 200; i++) {
+ auto result = Get(Key(i));
+ if (i % 2) {
+ ASSERT_EQ(result, "value" + std::to_string(i));
+ } else {
+ ASSERT_EQ(result, "value_new" + std::to_string(i));
+ }
+ }
+ auto my_cs = GetCompactionService();
+ ASSERT_EQ(my_cs->GetCompactionNum(), 10);
+ ASSERT_EQ(FilesPerLevel(), "0,0,10");
+}
+
+TEST_F(CompactionServiceTest, CompactionInfo) {
+ Options options = CurrentOptions();
+ ReopenWithCompactionService(&options);
+
+ for (int i = 0; i < 20; i++) {
+ for (int j = 0; j < 10; j++) {
+ int key_id = i * 10 + j;
+ ASSERT_OK(Put(Key(key_id), "value" + std::to_string(key_id)));
+ }
+ ASSERT_OK(Flush());
+ }
+
+ for (int i = 0; i < 10; i++) {
+ for (int j = 0; j < 10; j++) {
+ int key_id = i * 20 + j * 2;
+ ASSERT_OK(Put(Key(key_id), "value_new" + std::to_string(key_id)));
+ }
+ ASSERT_OK(Flush());
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ auto my_cs =
+ static_cast_with_check<MyTestCompactionService>(GetCompactionService());
+ uint64_t comp_num = my_cs->GetCompactionNum();
+ ASSERT_GE(comp_num, 1);
+
+ CompactionServiceJobInfo info = my_cs->GetCompactionInfoForStart();
+ ASSERT_EQ(dbname_, info.db_name);
+ std::string db_id, db_session_id;
+ ASSERT_OK(db_->GetDbIdentity(db_id));
+ ASSERT_EQ(db_id, info.db_id);
+ ASSERT_OK(db_->GetDbSessionId(db_session_id));
+ ASSERT_EQ(db_session_id, info.db_session_id);
+ ASSERT_EQ(Env::LOW, info.priority);
+ info = my_cs->GetCompactionInfoForWait();
+ ASSERT_EQ(dbname_, info.db_name);
+ ASSERT_EQ(db_id, info.db_id);
+ ASSERT_EQ(db_session_id, info.db_session_id);
+ ASSERT_EQ(Env::LOW, info.priority);
+
+ // Test priority USER
+ ColumnFamilyMetaData meta;
+ db_->GetColumnFamilyMetaData(&meta);
+ SstFileMetaData file = meta.levels[1].files[0];
+ ASSERT_OK(db_->CompactFiles(CompactionOptions(),
+ {file.db_path + "/" + file.name}, 2));
+ info = my_cs->GetCompactionInfoForStart();
+ ASSERT_EQ(Env::USER, info.priority);
+ info = my_cs->GetCompactionInfoForWait();
+ ASSERT_EQ(Env::USER, info.priority);
+
+ // Test priority BOTTOM
+ env_->SetBackgroundThreads(1, Env::BOTTOM);
+ options.num_levels = 2;
+ ReopenWithCompactionService(&options);
+ my_cs =
+ static_cast_with_check<MyTestCompactionService>(GetCompactionService());
+
+ for (int i = 0; i < 20; i++) {
+ for (int j = 0; j < 10; j++) {
+ int key_id = i * 10 + j;
+ ASSERT_OK(Put(Key(key_id), "value" + std::to_string(key_id)));
+ }
+ ASSERT_OK(Flush());
+ }
+
+ for (int i = 0; i < 4; i++) {
+ for (int j = 0; j < 10; j++) {
+ int key_id = i * 20 + j * 2;
+ ASSERT_OK(Put(Key(key_id), "value_new" + std::to_string(key_id)));
+ }
+ ASSERT_OK(Flush());
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ info = my_cs->GetCompactionInfoForStart();
+ ASSERT_EQ(Env::BOTTOM, info.priority);
+ info = my_cs->GetCompactionInfoForWait();
+ ASSERT_EQ(Env::BOTTOM, info.priority);
+}
+
+TEST_F(CompactionServiceTest, FallbackLocalAuto) {
+ Options options = CurrentOptions();
+ ReopenWithCompactionService(&options);
+
+ auto my_cs = GetCompactionService();
+ Statistics* compactor_statistics = GetCompactorStatistics();
+ Statistics* primary_statistics = GetPrimaryStatistics();
+ uint64_t compactor_write_bytes =
+ compactor_statistics->getTickerCount(COMPACT_WRITE_BYTES);
+ uint64_t primary_write_bytes =
+ primary_statistics->getTickerCount(COMPACT_WRITE_BYTES);
+
+ my_cs->OverrideStartStatus(CompactionServiceJobStatus::kUseLocal);
+
+ for (int i = 0; i < 20; i++) {
+ for (int j = 0; j < 10; j++) {
+ int key_id = i * 10 + j;
+ ASSERT_OK(Put(Key(key_id), "value" + std::to_string(key_id)));
+ }
+ ASSERT_OK(Flush());
+ }
+
+ for (int i = 0; i < 10; i++) {
+ for (int j = 0; j < 10; j++) {
+ int key_id = i * 20 + j * 2;
+ ASSERT_OK(Put(Key(key_id), "value_new" + std::to_string(key_id)));
+ }
+ ASSERT_OK(Flush());
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ // verify result
+ for (int i = 0; i < 200; i++) {
+ auto result = Get(Key(i));
+ if (i % 2) {
+ ASSERT_EQ(result, "value" + std::to_string(i));
+ } else {
+ ASSERT_EQ(result, "value_new" + std::to_string(i));
+ }
+ }
+
+ ASSERT_EQ(my_cs->GetCompactionNum(), 0);
+
+  // make sure the compaction statistics are only recorded on the local side
+ ASSERT_EQ(compactor_statistics->getTickerCount(COMPACT_WRITE_BYTES),
+ compactor_write_bytes);
+ ASSERT_GT(primary_statistics->getTickerCount(COMPACT_WRITE_BYTES),
+ primary_write_bytes);
+ ASSERT_EQ(primary_statistics->getTickerCount(REMOTE_COMPACT_READ_BYTES), 0);
+ ASSERT_EQ(primary_statistics->getTickerCount(REMOTE_COMPACT_WRITE_BYTES), 0);
+}
+
+TEST_F(CompactionServiceTest, FallbackLocalManual) {
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ ReopenWithCompactionService(&options);
+
+ GenerateTestData();
+ VerifyTestData();
+
+ auto my_cs = GetCompactionService();
+ Statistics* compactor_statistics = GetCompactorStatistics();
+ Statistics* primary_statistics = GetPrimaryStatistics();
+ uint64_t compactor_write_bytes =
+ compactor_statistics->getTickerCount(COMPACT_WRITE_BYTES);
+ uint64_t primary_write_bytes =
+ primary_statistics->getTickerCount(COMPACT_WRITE_BYTES);
+
+ // re-enable remote compaction
+ my_cs->ResetOverride();
+ std::string start_str = Key(15);
+ std::string end_str = Key(45);
+ Slice start(start_str);
+ Slice end(end_str);
+ uint64_t comp_num = my_cs->GetCompactionNum();
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &start, &end));
+ ASSERT_GE(my_cs->GetCompactionNum(), comp_num + 1);
+  // make sure the compaction statistics are only recorded on the remote side
+ ASSERT_GT(compactor_statistics->getTickerCount(COMPACT_WRITE_BYTES),
+ compactor_write_bytes);
+ ASSERT_EQ(primary_statistics->getTickerCount(REMOTE_COMPACT_WRITE_BYTES),
+ compactor_statistics->getTickerCount(COMPACT_WRITE_BYTES));
+ ASSERT_EQ(primary_statistics->getTickerCount(COMPACT_WRITE_BYTES),
+ primary_write_bytes);
+
+  // fall back to running locally again, this time via the WaitForComplete API
+ my_cs->OverrideWaitStatus(CompactionServiceJobStatus::kUseLocal);
+ start_str = Key(120);
+ start = start_str;
+ comp_num = my_cs->GetCompactionNum();
+ compactor_write_bytes =
+ compactor_statistics->getTickerCount(COMPACT_WRITE_BYTES);
+ primary_write_bytes = primary_statistics->getTickerCount(COMPACT_WRITE_BYTES);
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &start, nullptr));
+ ASSERT_EQ(my_cs->GetCompactionNum(),
+ comp_num); // no remote compaction is run
+  // make sure the compaction statistics are only recorded on the local side
+ ASSERT_EQ(compactor_statistics->getTickerCount(COMPACT_WRITE_BYTES),
+ compactor_write_bytes);
+ ASSERT_GT(primary_statistics->getTickerCount(COMPACT_WRITE_BYTES),
+ primary_write_bytes);
+ ASSERT_EQ(primary_statistics->getTickerCount(REMOTE_COMPACT_WRITE_BYTES),
+ compactor_write_bytes);
+
+ // verify result after 2 manual compactions
+ VerifyTestData();
+}
+
+TEST_F(CompactionServiceTest, RemoteEventListener) {
+ class RemoteEventListenerTest : public EventListener {
+ public:
+ const char* Name() const override { return "RemoteEventListenerTest"; }
+
+ void OnSubcompactionBegin(const SubcompactionJobInfo& info) override {
+ auto result = on_going_compactions.emplace(info.job_id);
+ ASSERT_TRUE(result.second); // make sure there's no duplication
+ compaction_num++;
+ EventListener::OnSubcompactionBegin(info);
+ }
+ void OnSubcompactionCompleted(const SubcompactionJobInfo& info) override {
+ auto num = on_going_compactions.erase(info.job_id);
+ ASSERT_TRUE(num == 1); // make sure the compaction id exists
+ EventListener::OnSubcompactionCompleted(info);
+ }
+ void OnTableFileCreated(const TableFileCreationInfo& info) override {
+ ASSERT_EQ(on_going_compactions.count(info.job_id), 1);
+ file_created++;
+ EventListener::OnTableFileCreated(info);
+ }
+ void OnTableFileCreationStarted(
+ const TableFileCreationBriefInfo& info) override {
+ ASSERT_EQ(on_going_compactions.count(info.job_id), 1);
+ file_creation_started++;
+ EventListener::OnTableFileCreationStarted(info);
+ }
+
+ bool ShouldBeNotifiedOnFileIO() override {
+ file_io_notified++;
+ return EventListener::ShouldBeNotifiedOnFileIO();
+ }
+
+ std::atomic_uint64_t file_io_notified{0};
+ std::atomic_uint64_t file_creation_started{0};
+ std::atomic_uint64_t file_created{0};
+
+ std::set<int> on_going_compactions; // store the job_id
+ std::atomic_uint64_t compaction_num{0};
+ };
+
+ auto listener = new RemoteEventListenerTest();
+ remote_listeners.emplace_back(listener);
+
+ Options options = CurrentOptions();
+ ReopenWithCompactionService(&options);
+
+ for (int i = 0; i < 20; i++) {
+ for (int j = 0; j < 10; j++) {
+ int key_id = i * 10 + j;
+ ASSERT_OK(Put(Key(key_id), "value" + std::to_string(key_id)));
+ }
+ ASSERT_OK(Flush());
+ }
+
+ for (int i = 0; i < 10; i++) {
+ for (int j = 0; j < 10; j++) {
+ int key_id = i * 20 + j * 2;
+ ASSERT_OK(Put(Key(key_id), "value_new" + std::to_string(key_id)));
+ }
+ ASSERT_OK(Flush());
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ // check the events are triggered
+ ASSERT_TRUE(listener->file_io_notified > 0);
+ ASSERT_TRUE(listener->file_creation_started > 0);
+ ASSERT_TRUE(listener->file_created > 0);
+ ASSERT_TRUE(listener->compaction_num > 0);
+ ASSERT_TRUE(listener->on_going_compactions.empty());
+
+ // verify result
+ for (int i = 0; i < 200; i++) {
+ auto result = Get(Key(i));
+ if (i % 2) {
+ ASSERT_EQ(result, "value" + std::to_string(i));
+ } else {
+ ASSERT_EQ(result, "value_new" + std::to_string(i));
+ }
+ }
+}
+
+TEST_F(CompactionServiceTest, TablePropertiesCollector) {
+ const static std::string kUserPropertyName = "TestCount";
+
+ class TablePropertiesCollectorTest : public TablePropertiesCollector {
+ public:
+ Status Finish(UserCollectedProperties* properties) override {
+ *properties = UserCollectedProperties{
+ {kUserPropertyName, std::to_string(count_)},
+ };
+ return Status::OK();
+ }
+
+ UserCollectedProperties GetReadableProperties() const override {
+ return UserCollectedProperties();
+ }
+
+ const char* Name() const override { return "TablePropertiesCollectorTest"; }
+
+ Status AddUserKey(const Slice& /*user_key*/, const Slice& /*value*/,
+ EntryType /*type*/, SequenceNumber /*seq*/,
+ uint64_t /*file_size*/) override {
+ count_++;
+ return Status::OK();
+ }
+
+ private:
+ uint32_t count_ = 0;
+ };
+
+ class TablePropertiesCollectorFactoryTest
+ : public TablePropertiesCollectorFactory {
+ public:
+ TablePropertiesCollector* CreateTablePropertiesCollector(
+ TablePropertiesCollectorFactory::Context /*context*/) override {
+ return new TablePropertiesCollectorTest();
+ }
+
+ const char* Name() const override {
+ return "TablePropertiesCollectorFactoryTest";
+ }
+ };
+
+ auto factory = new TablePropertiesCollectorFactoryTest();
+ remote_table_properties_collector_factories.emplace_back(factory);
+
+ const int kNumSst = 3;
+ const int kLevel0Trigger = 4;
+ Options options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = kLevel0Trigger;
+ ReopenWithCompactionService(&options);
+
+  // generate a few SSTs locally, which should not have the user property
+ for (int i = 0; i < kNumSst; i++) {
+ for (int j = 0; j < 100; j++) {
+ ASSERT_OK(Put(Key(i * 10 + j), "value"));
+ }
+ ASSERT_OK(Flush());
+ }
+
+ TablePropertiesCollection fname_to_props;
+ ASSERT_OK(db_->GetPropertiesOfAllTables(&fname_to_props));
+ for (const auto& file_props : fname_to_props) {
+ auto properties = file_props.second->user_collected_properties;
+ auto it = properties.find(kUserPropertyName);
+ ASSERT_EQ(it, properties.end());
+ }
+
+ // trigger compaction
+ for (int i = kNumSst; i < kLevel0Trigger; i++) {
+ for (int j = 0; j < 100; j++) {
+ ASSERT_OK(Put(Key(i * 10 + j), "value"));
+ }
+ ASSERT_OK(Flush());
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+
+ ASSERT_OK(db_->GetPropertiesOfAllTables(&fname_to_props));
+
+ bool has_user_property = false;
+ for (const auto& file_props : fname_to_props) {
+ auto properties = file_props.second->user_collected_properties;
+ auto it = properties.find(kUserPropertyName);
+ if (it != properties.end()) {
+ has_user_property = true;
+ ASSERT_GT(std::stoi(it->second), 0);
+ }
+ }
+ ASSERT_TRUE(has_user_property);
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ RegisterCustomObjects(argc, argv);
+ return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+ fprintf(stderr,
+ "SKIPPED as CompactionService is not supported in ROCKSDB_LITE\n");
+ return 0;
+}
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/db/compaction/compaction_state.cc b/src/rocksdb/db/compaction/compaction_state.cc
new file mode 100644
index 000000000..ee4b0c189
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_state.cc
@@ -0,0 +1,46 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/compaction/compaction_state.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+Slice CompactionState::SmallestUserKey() {
+ for (const auto& sub_compact_state : sub_compact_states) {
+ Slice smallest = sub_compact_state.SmallestUserKey();
+ if (!smallest.empty()) {
+ return smallest;
+ }
+ }
+ // If there is no finished output, return an empty slice.
+ return Slice{nullptr, 0};
+}
+
+Slice CompactionState::LargestUserKey() {
+ for (auto it = sub_compact_states.rbegin(); it < sub_compact_states.rend();
+ ++it) {
+ Slice largest = it->LargestUserKey();
+ if (!largest.empty()) {
+ return largest;
+ }
+ }
+ // If there is no finished output, return an empty slice.
+ return Slice{nullptr, 0};
+}
+
+void CompactionState::AggregateCompactionStats(
+ InternalStats::CompactionStatsFull& compaction_stats,
+ CompactionJobStats& compaction_job_stats) {
+ for (const auto& sc : sub_compact_states) {
+ sc.AggregateCompactionStats(compaction_stats);
+ compaction_job_stats.Add(sc.compaction_job_stats);
+ }
+}
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/compaction_state.h b/src/rocksdb/db/compaction/compaction_state.h
new file mode 100644
index 000000000..cc5b66c68
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_state.h
@@ -0,0 +1,42 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include "db/compaction/compaction.h"
+#include "db/compaction/subcompaction_state.h"
+#include "db/internal_stats.h"
+
+// Data structures used by compaction_job and compaction_service_job, holding
+// the list of sub_compact_states and the aggregated information for the
+// compaction.
+namespace ROCKSDB_NAMESPACE {
+
+// Maintains state for the entire compaction
+class CompactionState {
+ public:
+ Compaction* const compaction;
+
+ // REQUIRED: subcompaction states are stored in order of increasing key-range
+ std::vector<SubcompactionState> sub_compact_states;
+ Status status;
+
+ void AggregateCompactionStats(
+ InternalStats::CompactionStatsFull& compaction_stats,
+ CompactionJobStats& compaction_job_stats);
+
+ explicit CompactionState(Compaction* c) : compaction(c) {}
+
+ Slice SmallestUserKey();
+
+ Slice LargestUserKey();
+};
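+
+// Illustrative sketch of the intended usage (variable names are hypothetical,
+// not taken from this file): a compaction job owns one CompactionState per
+// compaction, fills sub_compact_states, and aggregates the results:
+//
+//   CompactionState state(compaction);
+//   state.sub_compact_states.emplace_back(compaction, start, end,
+//                                         /*sub_job_id=*/0);
+//   // ... run the sub-compactions ...
+//   InternalStats::CompactionStatsFull aggregated_stats;
+//   CompactionJobStats aggregated_job_stats;
+//   state.AggregateCompactionStats(aggregated_stats, aggregated_job_stats);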
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/file_pri.h b/src/rocksdb/db/compaction/file_pri.h
new file mode 100644
index 000000000..82dddcf93
--- /dev/null
+++ b/src/rocksdb/db/compaction/file_pri.h
@@ -0,0 +1,92 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+
+#pragma once
+#include <algorithm>
+
+#include "db/version_edit.h"
+
+namespace ROCKSDB_NAMESPACE {
+// We boost files that are closer to the TTL limit. This boosting could be
+// done through FileMetaData.compensated_file_size, but that compensated size
+// is widely used as something similar to file size, so dramatically boosting
+// the value might cause unintended consequences.
+//
+// This boosting algorithm could get very fancy, but here we use a simple
+// formula which satisfies:
+// (1) Different levels are triggered slightly differently to avoid
+//     too many cascading cases
+// (2) Files in the same level get boosted more as TTL gets closer.
+//
+// Don't do any boosting before half of the TTL has passed. This keeps
+// write amplification lower in most cases. All levels should be fully
+// boosted by the time the total TTL compaction threshold triggers.
+// Differentiate the boosting ranges of adjacent levels by a factor of 1/2,
+// which makes the range for each level grow exponentially. We could instead
+// make the ranges equal, or go even fancier. We can adjust it after we
+// observe the behavior in production.
+// The thresholds where boosting starts:
+// +------------------------------------------------------------------ +
+//  ^             ^                ^               ^  ^      ^
+//  Age 0    ...  |                |               second last level threshold
+//                |                |
+//                |                third last level
+//                |
+//                fourth last level
+//
+// The boost starts at 0 when a file's age reaches boost_age_start and grows
+// linearly. The ratio is arbitrarily chosen so that when the next level
+// starts to boost, the previous level's boosting amount is 16.
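+//
+// A rough worked example with hypothetical numbers (seconds as the unit):
+// with ttl = 2,592,000 (30 days), num_non_empty_levels = 5 and level = 2,
+//   all_boost_start_age = ttl / 2                    = 1,296,000
+//   all_boost_age_range = (ttl / 32) * 31 - ttl / 2  = 1,215,000
+//   boost_age_range     = 1,215,000 >> (5 - 2 - 1)   =   303,750
+//   boost_age_start_    = 1,296,000 + 303,750        = 1,599,750
+//   boost_step_         = max(303,750 / 16, 1)       =    18,984
+// A file whose age is 1,700,000 then gets a boost score of
+// (1,700,000 - 1,599,750) / 18,984 + 1 = 6, while any file younger than
+// boost_age_start_ keeps the default score of 1.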
+class FileTtlBooster {
+ public:
+ FileTtlBooster(uint64_t current_time, uint64_t ttl, int num_non_empty_levels,
+ int level)
+ : current_time_(current_time) {
+ if (ttl == 0 || level == 0 || level >= num_non_empty_levels - 1) {
+ enabled_ = false;
+ boost_age_start_ = 0;
+ boost_step_ = 1;
+ } else {
+ enabled_ = true;
+ uint64_t all_boost_start_age = ttl / 2;
+ uint64_t all_boost_age_range = (ttl / 32) * 31 - all_boost_start_age;
+ uint64_t boost_age_range =
+ all_boost_age_range >> (num_non_empty_levels - level - 1);
+ boost_age_start_ = all_boost_start_age + boost_age_range;
+ const uint64_t kBoostRatio = 16;
+      // Prevent a zero value to avoid a divide-by-zero error.
+ boost_step_ = std::max(boost_age_range / kBoostRatio, uint64_t{1});
+ }
+ }
+
+ uint64_t GetBoostScore(FileMetaData* f) {
+ if (!enabled_) {
+ return 1;
+ }
+ uint64_t oldest_ancester_time = f->TryGetOldestAncesterTime();
+ if (oldest_ancester_time >= current_time_) {
+ return 1;
+ }
+ uint64_t age = current_time_ - oldest_ancester_time;
+ if (age > boost_age_start_) {
+      // Use an integer just for convenience; we could make all file_to_order
+      // values doubles if we wanted to.
+      // Technically this can overflow if users override timing and
+      // give a very high current time. Ignore that case for simplicity.
+      // Boosting is added to the current value, so +1. This effectively
+      // makes boosting kick in after the first boost_step_ is reached.
+ return (age - boost_age_start_) / boost_step_ + 1;
+ }
+ return 1;
+ }
+
+ private:
+ bool enabled_;
+ uint64_t current_time_;
+ uint64_t boost_age_start_;
+ uint64_t boost_step_;
+};
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/sst_partitioner.cc b/src/rocksdb/db/compaction/sst_partitioner.cc
new file mode 100644
index 000000000..9e7f9fa89
--- /dev/null
+++ b/src/rocksdb/db/compaction/sst_partitioner.cc
@@ -0,0 +1,90 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+
+#include "rocksdb/sst_partitioner.h"
+
+#include <algorithm>
+
+#include "rocksdb/utilities/customizable_util.h"
+#include "rocksdb/utilities/object_registry.h"
+#include "rocksdb/utilities/options_type.h"
+
+namespace ROCKSDB_NAMESPACE {
+static std::unordered_map<std::string, OptionTypeInfo>
+ sst_fixed_prefix_type_info = {
+#ifndef ROCKSDB_LITE
+ {"length",
+ {0, OptionType::kSizeT, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+#endif // ROCKSDB_LITE
+};
+
+SstPartitionerFixedPrefixFactory::SstPartitionerFixedPrefixFactory(size_t len)
+ : len_(len) {
+ RegisterOptions("Length", &len_, &sst_fixed_prefix_type_info);
+}
+
+PartitionerResult SstPartitionerFixedPrefix::ShouldPartition(
+ const PartitionerRequest& request) {
+ Slice last_key_fixed(*request.prev_user_key);
+ if (last_key_fixed.size() > len_) {
+ last_key_fixed.size_ = len_;
+ }
+ Slice current_key_fixed(*request.current_user_key);
+ if (current_key_fixed.size() > len_) {
+ current_key_fixed.size_ = len_;
+ }
+ return last_key_fixed.compare(current_key_fixed) != 0 ? kRequired
+ : kNotRequired;
+}
+
+bool SstPartitionerFixedPrefix::CanDoTrivialMove(
+ const Slice& smallest_user_key, const Slice& largest_user_key) {
+ return ShouldPartition(PartitionerRequest(smallest_user_key, largest_user_key,
+ 0)) == kNotRequired;
+}
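+
+// Illustrative behavior (example keys are hypothetical): with len_ == 3,
+// consecutive keys "abc1" and "abc2" share the fixed prefix "abc", so
+// ShouldPartition() returns kNotRequired; for "abc9" followed by "abd0" the
+// prefixes "abc" and "abd" differ, so it returns kRequired and the output
+// file is cut between them. CanDoTrivialMove() applies the same check to a
+// file's smallest and largest user keys.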
+
+std::unique_ptr<SstPartitioner>
+SstPartitionerFixedPrefixFactory::CreatePartitioner(
+ const SstPartitioner::Context& /* context */) const {
+ return std::unique_ptr<SstPartitioner>(new SstPartitionerFixedPrefix(len_));
+}
+
+std::shared_ptr<SstPartitionerFactory> NewSstPartitionerFixedPrefixFactory(
+ size_t prefix_len) {
+ return std::make_shared<SstPartitionerFixedPrefixFactory>(prefix_len);
+}
+
+#ifndef ROCKSDB_LITE
+namespace {
+static int RegisterSstPartitionerFactories(ObjectLibrary& library,
+ const std::string& /*arg*/) {
+ library.AddFactory<SstPartitionerFactory>(
+ SstPartitionerFixedPrefixFactory::kClassName(),
+ [](const std::string& /*uri*/,
+ std::unique_ptr<SstPartitionerFactory>* guard,
+ std::string* /* errmsg */) {
+ guard->reset(new SstPartitionerFixedPrefixFactory(0));
+ return guard->get();
+ });
+ return 1;
+}
+} // namespace
+#endif // ROCKSDB_LITE
+
+Status SstPartitionerFactory::CreateFromString(
+ const ConfigOptions& options, const std::string& value,
+ std::shared_ptr<SstPartitionerFactory>* result) {
+#ifndef ROCKSDB_LITE
+ static std::once_flag once;
+ std::call_once(once, [&]() {
+ RegisterSstPartitionerFactories(*(ObjectLibrary::Default().get()), "");
+ });
+#endif // ROCKSDB_LITE
+ return LoadSharedObject<SstPartitionerFactory>(options, value, nullptr,
+ result);
+}
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/subcompaction_state.cc b/src/rocksdb/db/compaction/subcompaction_state.cc
new file mode 100644
index 000000000..0c56471e9
--- /dev/null
+++ b/src/rocksdb/db/compaction/subcompaction_state.cc
@@ -0,0 +1,106 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/compaction/subcompaction_state.h"
+
+#include "rocksdb/sst_partitioner.h"
+
+namespace ROCKSDB_NAMESPACE {
+void SubcompactionState::AggregateCompactionStats(
+ InternalStats::CompactionStatsFull& compaction_stats) const {
+ compaction_stats.stats.Add(compaction_outputs_.stats_);
+ if (HasPenultimateLevelOutputs()) {
+ compaction_stats.has_penultimate_level_output = true;
+ compaction_stats.penultimate_level_stats.Add(
+ penultimate_level_outputs_.stats_);
+ }
+}
+
+OutputIterator SubcompactionState::GetOutputs() const {
+ return OutputIterator(penultimate_level_outputs_.outputs_,
+ compaction_outputs_.outputs_);
+}
+
+void SubcompactionState::Cleanup(Cache* cache) {
+ penultimate_level_outputs_.Cleanup();
+ compaction_outputs_.Cleanup();
+
+ if (!status.ok()) {
+ for (const auto& out : GetOutputs()) {
+ // If this file was inserted into the table cache then remove
+ // them here because this compaction was not committed.
+ TableCache::Evict(cache, out.meta.fd.GetNumber());
+ }
+ }
+  // TODO: sub_compact.io_status is not checked like status. Not sure if that's
+  // intentional. So we ignore the io_status for now.
+ io_status.PermitUncheckedError();
+}
+
+Slice SubcompactionState::SmallestUserKey() const {
+ if (has_penultimate_level_outputs_) {
+ Slice a = compaction_outputs_.SmallestUserKey();
+ Slice b = penultimate_level_outputs_.SmallestUserKey();
+ if (a.empty()) {
+ return b;
+ }
+ if (b.empty()) {
+ return a;
+ }
+ const Comparator* user_cmp =
+ compaction->column_family_data()->user_comparator();
+ if (user_cmp->Compare(a, b) > 0) {
+ return b;
+ } else {
+ return a;
+ }
+ } else {
+ return compaction_outputs_.SmallestUserKey();
+ }
+}
+
+Slice SubcompactionState::LargestUserKey() const {
+ if (has_penultimate_level_outputs_) {
+ Slice a = compaction_outputs_.LargestUserKey();
+ Slice b = penultimate_level_outputs_.LargestUserKey();
+ if (a.empty()) {
+ return b;
+ }
+ if (b.empty()) {
+ return a;
+ }
+ const Comparator* user_cmp =
+ compaction->column_family_data()->user_comparator();
+ if (user_cmp->Compare(a, b) < 0) {
+ return b;
+ } else {
+ return a;
+ }
+ } else {
+ return compaction_outputs_.LargestUserKey();
+ }
+}
+
+Status SubcompactionState::AddToOutput(
+ const CompactionIterator& iter,
+ const CompactionFileOpenFunc& open_file_func,
+ const CompactionFileCloseFunc& close_file_func) {
+ // update target output first
+ is_current_penultimate_level_ = iter.output_to_penultimate_level();
+ current_outputs_ = is_current_penultimate_level_ ? &penultimate_level_outputs_
+ : &compaction_outputs_;
+ if (is_current_penultimate_level_) {
+ has_penultimate_level_outputs_ = true;
+ }
+
+ return Current().AddToOutput(iter, open_file_func, close_file_func);
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/subcompaction_state.h b/src/rocksdb/db/compaction/subcompaction_state.h
new file mode 100644
index 000000000..13e63120f
--- /dev/null
+++ b/src/rocksdb/db/compaction/subcompaction_state.h
@@ -0,0 +1,214 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <optional>
+
+#include "db/blob/blob_file_addition.h"
+#include "db/blob/blob_garbage_meter.h"
+#include "db/compaction/compaction.h"
+#include "db/compaction/compaction_iterator.h"
+#include "db/compaction/compaction_outputs.h"
+#include "db/internal_stats.h"
+#include "db/output_validator.h"
+#include "db/range_del_aggregator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Maintains state and outputs for each sub-compaction.
+// It contains 2 `CompactionOutputs`:
+//  1. one for the normal output files
+//  2. another for the penultimate level outputs
+// A `current` pointer tracks the current output group. When `AddToOutput()` is
+// called, it checks the output of the current compaction_iterator key and
+// points `current` to the target output group. By default, it points to the
+// normal compaction_outputs; if the compaction_iterator key should be placed
+// on the penultimate level, `current` is changed to point to
+// `penultimate_level_outputs`.
+// Later operations use `Current()` to get the target group.
+//
+// +----------+ +-----------------------------+ +---------+
+// | *current |--------> | compaction_outputs |----->| output |
+// +----------+ +-----------------------------+ +---------+
+// | | output |
+// | +---------+
+// | | ... |
+// |
+// | +-----------------------------+ +---------+
+// +-------------> | penultimate_level_outputs |----->| output |
+// +-----------------------------+ +---------+
+// | ... |
+
+class SubcompactionState {
+ public:
+ const Compaction* compaction;
+
+ // The boundaries of the key-range this compaction is interested in. No two
+ // sub-compactions may have overlapping key-ranges.
+ // 'start' is inclusive, 'end' is exclusive, and nullptr means unbounded
+ const std::optional<Slice> start, end;
+
+ // The return status of this sub-compaction
+ Status status;
+
+ // The return IO Status of this sub-compaction
+ IOStatus io_status;
+
+ // Notify on sub-compaction completion only if listener was notified on
+ // sub-compaction begin.
+ bool notify_on_subcompaction_completion = false;
+
+ // compaction job stats for this sub-compaction
+ CompactionJobStats compaction_job_stats;
+
+  // sub-compaction job id, which is used to identify different sub-compactions
+  // within the same compaction job.
+ const uint32_t sub_job_id;
+
+ Slice SmallestUserKey() const;
+
+ Slice LargestUserKey() const;
+
+ // Get all outputs from the subcompaction. For per_key_placement compaction,
+ // it returns both the last level outputs and penultimate level outputs.
+ OutputIterator GetOutputs() const;
+
+  // Assign the range-del aggregator. Each range_del can only be assigned to
+  // one output level; for per_key_placement, that is going to be the
+  // penultimate level.
+ void AssignRangeDelAggregator(
+ std::unique_ptr<CompactionRangeDelAggregator>&& range_del_agg) {
+ if (compaction->SupportsPerKeyPlacement()) {
+ penultimate_level_outputs_.AssignRangeDelAggregator(
+ std::move(range_del_agg));
+ } else {
+ compaction_outputs_.AssignRangeDelAggregator(std::move(range_del_agg));
+ }
+ }
+
+ void RemoveLastEmptyOutput() {
+ compaction_outputs_.RemoveLastEmptyOutput();
+ penultimate_level_outputs_.RemoveLastEmptyOutput();
+ }
+
+#ifndef ROCKSDB_LITE
+ void BuildSubcompactionJobInfo(
+ SubcompactionJobInfo& subcompaction_job_info) const {
+ const Compaction* c = compaction;
+ const ColumnFamilyData* cfd = c->column_family_data();
+
+ subcompaction_job_info.cf_id = cfd->GetID();
+ subcompaction_job_info.cf_name = cfd->GetName();
+ subcompaction_job_info.status = status;
+ subcompaction_job_info.subcompaction_job_id = static_cast<int>(sub_job_id);
+ subcompaction_job_info.base_input_level = c->start_level();
+ subcompaction_job_info.output_level = c->output_level();
+ subcompaction_job_info.stats = compaction_job_stats;
+ }
+#endif // !ROCKSDB_LITE
+
+ SubcompactionState() = delete;
+ SubcompactionState(const SubcompactionState&) = delete;
+ SubcompactionState& operator=(const SubcompactionState&) = delete;
+
+ SubcompactionState(Compaction* c, const std::optional<Slice> _start,
+ const std::optional<Slice> _end, uint32_t _sub_job_id)
+ : compaction(c),
+ start(_start),
+ end(_end),
+ sub_job_id(_sub_job_id),
+ compaction_outputs_(c, /*is_penultimate_level=*/false),
+ penultimate_level_outputs_(c, /*is_penultimate_level=*/true) {
+ assert(compaction != nullptr);
+    // Set the output split key (used for the RoundRobin feature) only for the
+    // normal compaction_outputs; the output to penultimate_level feature
+    // doesn't support RoundRobin (and may never be supported, because for
+    // RoundRobin the data is mostly naturally sorted by time, so there is no
+    // need for per-key placement with output_to_penultimate_level).
+ compaction_outputs_.SetOutputSlitKey(start, end);
+ }
+
+ SubcompactionState(SubcompactionState&& state) noexcept
+ : compaction(state.compaction),
+ start(state.start),
+ end(state.end),
+ status(std::move(state.status)),
+ io_status(std::move(state.io_status)),
+ notify_on_subcompaction_completion(
+ state.notify_on_subcompaction_completion),
+ compaction_job_stats(std::move(state.compaction_job_stats)),
+ sub_job_id(state.sub_job_id),
+ compaction_outputs_(std::move(state.compaction_outputs_)),
+ penultimate_level_outputs_(std::move(state.penultimate_level_outputs_)),
+ is_current_penultimate_level_(state.is_current_penultimate_level_),
+ has_penultimate_level_outputs_(state.has_penultimate_level_outputs_) {
+ current_outputs_ = is_current_penultimate_level_
+ ? &penultimate_level_outputs_
+ : &compaction_outputs_;
+ }
+
+ bool HasPenultimateLevelOutputs() const {
+ return has_penultimate_level_outputs_ ||
+ penultimate_level_outputs_.HasRangeDel();
+ }
+
+ bool IsCurrentPenultimateLevel() const {
+ return is_current_penultimate_level_;
+ }
+
+ // Add all the new files from this compaction to version_edit
+ void AddOutputsEdit(VersionEdit* out_edit) const {
+ for (const auto& file : penultimate_level_outputs_.outputs_) {
+ out_edit->AddFile(compaction->GetPenultimateLevel(), file.meta);
+ }
+ for (const auto& file : compaction_outputs_.outputs_) {
+ out_edit->AddFile(compaction->output_level(), file.meta);
+ }
+ }
+
+ void Cleanup(Cache* cache);
+
+ void AggregateCompactionStats(
+ InternalStats::CompactionStatsFull& compaction_stats) const;
+
+ CompactionOutputs& Current() const {
+ assert(current_outputs_);
+ return *current_outputs_;
+ }
+
+ // Add compaction_iterator key/value to the `Current` output group.
+ Status AddToOutput(const CompactionIterator& iter,
+ const CompactionFileOpenFunc& open_file_func,
+ const CompactionFileCloseFunc& close_file_func);
+
+ // Close all compaction output files, both output_to_penultimate_level outputs
+ // and normal outputs.
+ Status CloseCompactionFiles(const Status& curr_status,
+ const CompactionFileOpenFunc& open_file_func,
+ const CompactionFileCloseFunc& close_file_func) {
+ // Call FinishCompactionOutputFile() even if status is not ok: it needs to
+ // close the output file.
+ Status s = penultimate_level_outputs_.CloseOutput(
+ curr_status, open_file_func, close_file_func);
+ s = compaction_outputs_.CloseOutput(s, open_file_func, close_file_func);
+ return s;
+ }
+
+ private:
+ // State kept for output being generated
+ CompactionOutputs compaction_outputs_;
+ CompactionOutputs penultimate_level_outputs_;
+ CompactionOutputs* current_outputs_ = &compaction_outputs_;
+ bool is_current_penultimate_level_ = false;
+ bool has_penultimate_level_outputs_ = false;
+};
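+
+// Illustrative call sequence (a sketch with hypothetical variable names; the
+// open/close functors are supplied by the compaction job):
+//
+//   SubcompactionState sub(compaction, start, end, /*sub_job_id=*/0);
+//   Status s;
+//   while (s.ok() && c_iter->Valid()) {
+//     s = sub.AddToOutput(*c_iter, open_file_func, close_file_func);
+//     c_iter->Next();
+//   }
+//   s = sub.CloseCompactionFiles(s, open_file_func, close_file_func);
+//   sub.RemoveLastEmptyOutput();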
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/tiered_compaction_test.cc b/src/rocksdb/db/compaction/tiered_compaction_test.cc
new file mode 100644
index 000000000..aaebcfd94
--- /dev/null
+++ b/src/rocksdb/db/compaction/tiered_compaction_test.cc
@@ -0,0 +1,2028 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/db_test_util.h"
+#include "port/stack_trace.h"
+#include "rocksdb/iostats_context.h"
+#include "rocksdb/listener.h"
+#include "rocksdb/utilities/debug.h"
+#include "test_util/mock_time_env.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#if !defined(ROCKSDB_LITE)
+
+class TieredCompactionTest : public DBTestBase,
+ public testing::WithParamInterface<bool> {
+ public:
+ TieredCompactionTest()
+ : DBTestBase("tiered_compaction_test", /*env_do_fsync=*/true),
+ kBasicCompStats(CompactionReason::kUniversalSizeAmplification, 1),
+ kBasicPerKeyPlacementCompStats(
+ CompactionReason::kUniversalSizeAmplification, 1),
+ kBasicFlushStats(CompactionReason::kFlush, 1) {
+ kBasicCompStats.micros = kHasValue;
+ kBasicCompStats.cpu_micros = kHasValue;
+ kBasicCompStats.bytes_read_non_output_levels = kHasValue;
+ kBasicCompStats.num_input_files_in_non_output_levels = kHasValue;
+ kBasicCompStats.num_input_records = kHasValue;
+ kBasicCompStats.num_dropped_records = kHasValue;
+
+ kBasicPerLevelStats.num_output_records = kHasValue;
+ kBasicPerLevelStats.bytes_written = kHasValue;
+ kBasicPerLevelStats.num_output_files = kHasValue;
+
+ kBasicPerKeyPlacementCompStats.micros = kHasValue;
+ kBasicPerKeyPlacementCompStats.cpu_micros = kHasValue;
+ kBasicPerKeyPlacementCompStats.Add(kBasicPerLevelStats);
+
+ kBasicFlushStats.micros = kHasValue;
+ kBasicFlushStats.cpu_micros = kHasValue;
+ kBasicFlushStats.bytes_written = kHasValue;
+ kBasicFlushStats.num_output_files = kHasValue;
+ }
+
+ protected:
+ static constexpr uint8_t kHasValue = 1;
+
+ InternalStats::CompactionStats kBasicCompStats;
+ InternalStats::CompactionStats kBasicPerKeyPlacementCompStats;
+ InternalStats::CompactionOutputsStats kBasicPerLevelStats;
+ InternalStats::CompactionStats kBasicFlushStats;
+
+ std::atomic_bool enable_per_key_placement = true;
+
+ void SetUp() override {
+ SyncPoint::GetInstance()->SetCallBack(
+ "Compaction::SupportsPerKeyPlacement:Enabled", [&](void* arg) {
+ auto supports_per_key_placement = static_cast<bool*>(arg);
+ *supports_per_key_placement = enable_per_key_placement;
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ }
+
+ const std::vector<InternalStats::CompactionStats>& GetCompactionStats() {
+ VersionSet* const versions = dbfull()->GetVersionSet();
+ assert(versions);
+ assert(versions->GetColumnFamilySet());
+
+ ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault();
+ assert(cfd);
+
+ const InternalStats* const internal_stats = cfd->internal_stats();
+ assert(internal_stats);
+
+ return internal_stats->TEST_GetCompactionStats();
+ }
+
+ const InternalStats::CompactionStats& GetPerKeyPlacementCompactionStats() {
+ VersionSet* const versions = dbfull()->GetVersionSet();
+ assert(versions);
+ assert(versions->GetColumnFamilySet());
+
+ ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault();
+ assert(cfd);
+
+ const InternalStats* const internal_stats = cfd->internal_stats();
+ assert(internal_stats);
+
+ return internal_stats->TEST_GetPerKeyPlacementCompactionStats();
+ }
+
+  // Verify the compaction stats; the stats are only roughly compared
+ void VerifyCompactionStats(
+ const std::vector<InternalStats::CompactionStats>& expect_stats,
+ const InternalStats::CompactionStats& expect_pl_stats) {
+ const std::vector<InternalStats::CompactionStats>& stats =
+ GetCompactionStats();
+ const size_t kLevels = expect_stats.size();
+ ASSERT_EQ(kLevels, stats.size());
+
+ for (auto it = stats.begin(), expect = expect_stats.begin();
+ it != stats.end(); it++, expect++) {
+ VerifyCompactionStats(*it, *expect);
+ }
+
+ const InternalStats::CompactionStats& pl_stats =
+ GetPerKeyPlacementCompactionStats();
+ VerifyCompactionStats(pl_stats, expect_pl_stats);
+ }
+
+ void ResetAllStats(std::vector<InternalStats::CompactionStats>& stats,
+ InternalStats::CompactionStats& pl_stats) {
+ ASSERT_OK(dbfull()->ResetStats());
+ for (auto& level_stats : stats) {
+ level_stats.Clear();
+ }
+ pl_stats.Clear();
+ }
+
+  // bottommost_temperature is being renamed to last_level_temperature; setting
+  // either of them should have the same effect.
+ void SetColdTemperature(Options& options) {
+ if (GetParam()) {
+ options.bottommost_temperature = Temperature::kCold;
+ } else {
+ options.last_level_temperature = Temperature::kCold;
+ }
+ }
+
+ private:
+ void CompareStats(uint64_t val, uint64_t expect) {
+ if (expect > 0) {
+ ASSERT_TRUE(val > 0);
+ } else {
+ ASSERT_EQ(val, 0);
+ }
+ }
+
+ void VerifyCompactionStats(
+ const InternalStats::CompactionStats& stats,
+ const InternalStats::CompactionStats& expect_stats) {
+ CompareStats(stats.micros, expect_stats.micros);
+ CompareStats(stats.cpu_micros, expect_stats.cpu_micros);
+ CompareStats(stats.bytes_read_non_output_levels,
+ expect_stats.bytes_read_non_output_levels);
+ CompareStats(stats.bytes_read_output_level,
+ expect_stats.bytes_read_output_level);
+ CompareStats(stats.bytes_read_blob, expect_stats.bytes_read_blob);
+ CompareStats(stats.bytes_written, expect_stats.bytes_written);
+ CompareStats(stats.bytes_moved, expect_stats.bytes_moved);
+ CompareStats(stats.num_input_files_in_non_output_levels,
+ expect_stats.num_input_files_in_non_output_levels);
+ CompareStats(stats.num_input_files_in_output_level,
+ expect_stats.num_input_files_in_output_level);
+ CompareStats(stats.num_output_files, expect_stats.num_output_files);
+ CompareStats(stats.num_output_files_blob,
+ expect_stats.num_output_files_blob);
+ CompareStats(stats.num_input_records, expect_stats.num_input_records);
+ CompareStats(stats.num_dropped_records, expect_stats.num_dropped_records);
+ CompareStats(stats.num_output_records, expect_stats.num_output_records);
+ ASSERT_EQ(stats.count, expect_stats.count);
+ for (int i = 0; i < static_cast<int>(CompactionReason::kNumOfReasons);
+ i++) {
+ ASSERT_EQ(stats.counts[i], expect_stats.counts[i]);
+ }
+ }
+};
+
+TEST_P(TieredCompactionTest, SequenceBasedTieredStorageUniversal) {
+ const int kNumTrigger = 4;
+ const int kNumLevels = 7;
+ const int kNumKeys = 100;
+ const int kLastLevel = kNumLevels - 1;
+
+ auto options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ SetColdTemperature(options);
+ options.level0_file_num_compaction_trigger = kNumTrigger;
+ options.statistics = CreateDBStatistics();
+ options.max_subcompactions = 10;
+ DestroyAndReopen(options);
+
+ std::atomic_uint64_t latest_cold_seq = 0;
+ std::vector<SequenceNumber> seq_history;
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "CompactionIterator::PrepareOutput.context", [&](void* arg) {
+ auto context = static_cast<PerKeyPlacementContext*>(arg);
+ context->output_to_penultimate_level =
+ context->seq_num > latest_cold_seq;
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ std::vector<InternalStats::CompactionStats> expect_stats(kNumLevels);
+ InternalStats::CompactionStats& last_stats = expect_stats[kLastLevel];
+ InternalStats::CompactionStats expect_pl_stats;
+
+ for (int i = 0; i < kNumTrigger; i++) {
+ for (int j = 0; j < kNumKeys; j++) {
+ ASSERT_OK(Put(Key(i * 10 + j), "value" + std::to_string(i)));
+ }
+ ASSERT_OK(Flush());
+ seq_history.emplace_back(dbfull()->GetLatestSequenceNumber());
+ expect_stats[0].Add(kBasicFlushStats);
+ }
+ ASSERT_OK(dbfull()->WaitForCompact(true));
+
+  // the penultimate level file temperature is not cold, so all data are output
+  // to the penultimate level.
+ ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel());
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
+
+ // basic compaction stats are still counted to the last level
+ expect_stats[kLastLevel].Add(kBasicCompStats);
+ expect_pl_stats.Add(kBasicPerKeyPlacementCompStats);
+
+ VerifyCompactionStats(expect_stats, expect_pl_stats);
+
+ ResetAllStats(expect_stats, expect_pl_stats);
+
+  // Move the cold_seq forward to split the file into 2 levels, so we should
+  // have both the last level stats and the output_to_penultimate_level stats
+ latest_cold_seq = seq_history[0];
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel());
+
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+ last_stats.Add(kBasicCompStats);
+ last_stats.ResetCompactionReason(CompactionReason::kManualCompaction);
+ last_stats.Add(kBasicPerLevelStats);
+ last_stats.num_dropped_records = 0;
+ expect_pl_stats.Add(kBasicPerKeyPlacementCompStats);
+ expect_pl_stats.ResetCompactionReason(CompactionReason::kManualCompaction);
+ VerifyCompactionStats(expect_stats, expect_pl_stats);
+
+ // delete all cold data, so all data will be on penultimate level
+ for (int i = 0; i < 10; i++) {
+ ASSERT_OK(Delete(Key(i)));
+ }
+ ASSERT_OK(Flush());
+
+ ResetAllStats(expect_stats, expect_pl_stats);
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel());
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
+
+ last_stats.Add(kBasicCompStats);
+ last_stats.ResetCompactionReason(CompactionReason::kManualCompaction);
+ last_stats.bytes_read_output_level = kHasValue;
+ last_stats.num_input_files_in_output_level = kHasValue;
+ expect_pl_stats.Add(kBasicPerKeyPlacementCompStats);
+ expect_pl_stats.ResetCompactionReason(CompactionReason::kManualCompaction);
+ VerifyCompactionStats(expect_stats, expect_pl_stats);
+
+  // Move the cold_seq forward again, with a range delete; take a snapshot to
+  // keep the range dels in both cold and hot SSTs
+ auto snap = db_->GetSnapshot();
+ latest_cold_seq = seq_history[2];
+ std::string start = Key(25), end = Key(35);
+ ASSERT_OK(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), start, end));
+ ASSERT_OK(Flush());
+
+ ResetAllStats(expect_stats, expect_pl_stats);
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel());
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+ last_stats.Add(kBasicCompStats);
+ last_stats.Add(kBasicPerLevelStats);
+ last_stats.ResetCompactionReason(CompactionReason::kManualCompaction);
+ expect_pl_stats.Add(kBasicPerKeyPlacementCompStats);
+ expect_pl_stats.ResetCompactionReason(CompactionReason::kManualCompaction);
+ VerifyCompactionStats(expect_stats, expect_pl_stats);
+
+ // verify data
+ std::string value;
+ for (int i = 0; i < kNumKeys; i++) {
+ if (i < 10 || (i >= 25 && i < 35)) {
+ ASSERT_TRUE(db_->Get(ReadOptions(), Key(i), &value).IsNotFound());
+ } else {
+ ASSERT_OK(db_->Get(ReadOptions(), Key(i), &value));
+ }
+ }
+
+ // range delete all hot data
+ start = Key(30);
+ end = Key(130);
+ ASSERT_OK(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), start, end));
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel());
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+ // no range del is dropped because of snapshot
+ ASSERT_EQ(
+ options.statistics->getTickerCount(COMPACTION_RANGE_DEL_DROP_OBSOLETE),
+ 0);
+
+  // releasing the snapshot and compacting again should remove all hot data
+ db_->ReleaseSnapshot(snap);
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel());
+ ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+ // 2 range dels are dropped
+ ASSERT_EQ(
+ options.statistics->getTickerCount(COMPACTION_RANGE_DEL_DROP_OBSOLETE),
+ 3);
+
+  // Move the cold_seq backward; for example, the user may change the hot/cold
+  // setting. It won't impact the existing cold data, as the sequence number is
+  // zeroed out.
+ latest_cold_seq = seq_history[1];
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel());
+ ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+}
+
+TEST_P(TieredCompactionTest, RangeBasedTieredStorageUniversal) {
+ const int kNumTrigger = 4;
+ const int kNumLevels = 7;
+ const int kNumKeys = 100;
+ const int kLastLevel = kNumLevels - 1;
+
+ auto options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ SetColdTemperature(options);
+ options.level0_file_num_compaction_trigger = kNumTrigger;
+ options.statistics = CreateDBStatistics();
+ options.max_subcompactions = 10;
+ DestroyAndReopen(options);
+ auto cmp = options.comparator;
+
+ port::Mutex mutex;
+ std::string hot_start = Key(10);
+ std::string hot_end = Key(50);
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "CompactionIterator::PrepareOutput.context", [&](void* arg) {
+ auto context = static_cast<PerKeyPlacementContext*>(arg);
+ MutexLock l(&mutex);
+ context->output_to_penultimate_level =
+ cmp->Compare(context->key, hot_start) >= 0 &&
+ cmp->Compare(context->key, hot_end) < 0;
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ std::vector<InternalStats::CompactionStats> expect_stats(kNumLevels);
+ InternalStats::CompactionStats& last_stats = expect_stats[kLastLevel];
+ InternalStats::CompactionStats expect_pl_stats;
+
+ for (int i = 0; i < kNumTrigger; i++) {
+ for (int j = 0; j < kNumKeys; j++) {
+ ASSERT_OK(Put(Key(j), "value" + std::to_string(j)));
+ }
+ ASSERT_OK(Flush());
+ expect_stats[0].Add(kBasicFlushStats);
+ }
+ ASSERT_OK(dbfull()->WaitForCompact(true));
+ ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel());
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+ last_stats.Add(kBasicCompStats);
+ last_stats.Add(kBasicPerLevelStats);
+ expect_pl_stats.Add(kBasicPerKeyPlacementCompStats);
+ VerifyCompactionStats(expect_stats, expect_pl_stats);
+
+ ResetAllStats(expect_stats, expect_pl_stats);
+
+ // change to all cold, no output_to_penultimate_level output
+ {
+ MutexLock l(&mutex);
+ hot_start = Key(100);
+ hot_end = Key(200);
+ }
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel());
+ ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+ last_stats.Add(kBasicCompStats);
+ last_stats.ResetCompactionReason(CompactionReason::kManualCompaction);
+ last_stats.Add(kBasicPerLevelStats);
+ last_stats.num_dropped_records = 0;
+ last_stats.bytes_read_output_level = kHasValue;
+ last_stats.num_input_files_in_output_level = kHasValue;
+ VerifyCompactionStats(expect_stats, expect_pl_stats);
+
+  // change to all hot; universal compaction supports moving data to an upper
+  // level if it's within the compaction level range.
+ {
+ MutexLock l(&mutex);
+ hot_start = Key(0);
+ hot_end = Key(100);
+ }
+
+  // No data is moved from the cold tier to the hot tier because there are no
+  // input files from L5 or higher; it's not safe to move data to the
+  // output_to_penultimate_level level.
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel());
+
+  // Add 2 keys in a higher level, but in separate files; all keys can be moved
+  // up if they are hot
+ ASSERT_OK(Put(Key(0), "value" + std::to_string(0)));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put(Key(50), "value" + std::to_string(0)));
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel());
+
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
+
+  // change to only 1 cold key, to test that compaction can stop even if it
+  // matches the size amp compaction threshold
+ {
+ MutexLock l(&mutex);
+ hot_start = Key(1);
+ hot_end = Key(1000);
+ }
+
+  // generate just enough files to trigger compaction
+ for (int i = 0; i < kNumTrigger - 1; i++) {
+ for (int j = 0; j < 1000; j++) {
+ ASSERT_OK(Put(Key(j), "value" + std::to_string(j)));
+ }
+ ASSERT_OK(Flush());
+ }
+ ASSERT_OK(dbfull()->WaitForCompact(
+ true)); // make sure the compaction is able to finish
+ ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel());
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+ auto opts = db_->GetOptions();
+ auto max_size_amp =
+ opts.compaction_options_universal.max_size_amplification_percent / 100;
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown),
+ GetSstSizeHelper(Temperature::kCold) * max_size_amp);
+
+ // delete all cold data
+ ASSERT_OK(Delete(Key(0)));
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel());
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
+
+  // range delete overlapping both hot and cold data, with a snapshot to make
+  // sure the range del is preserved
+ auto snap = db_->GetSnapshot();
+ {
+ MutexLock l(&mutex);
+ hot_start = Key(50);
+ hot_end = Key(100);
+ }
+ std::string start = Key(1), end = Key(70);
+ ASSERT_OK(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), start, end));
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel());
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+ // no range del is dropped until snapshot is released
+ ASSERT_EQ(
+ options.statistics->getTickerCount(COMPACTION_RANGE_DEL_DROP_OBSOLETE),
+ 0);
+
+ // verify data
+ std::string value;
+ for (int i = 0; i < kNumKeys; i++) {
+ if (i < 70) {
+ ASSERT_TRUE(db_->Get(ReadOptions(), Key(i), &value).IsNotFound());
+ } else {
+ ASSERT_OK(db_->Get(ReadOptions(), Key(i), &value));
+ }
+ }
+
+ db_->ReleaseSnapshot(snap);
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel());
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+ // range del is dropped
+ ASSERT_EQ(
+ options.statistics->getTickerCount(COMPACTION_RANGE_DEL_DROP_OBSOLETE),
+ 1);
+}
+
+TEST_P(TieredCompactionTest, LevelColdRangeDelete) {
+ const int kNumTrigger = 4;
+ const int kNumLevels = 7;
+ const int kNumKeys = 100;
+ const int kLastLevel = kNumLevels - 1;
+
+ auto options = CurrentOptions();
+ SetColdTemperature(options);
+ options.level0_file_num_compaction_trigger = kNumTrigger;
+ options.num_levels = kNumLevels;
+ options.statistics = CreateDBStatistics();
+ options.max_subcompactions = 10;
+ DestroyAndReopen(options);
+
+ std::atomic_uint64_t latest_cold_seq = 0;
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "CompactionIterator::PrepareOutput.context", [&](void* arg) {
+ auto context = static_cast<PerKeyPlacementContext*>(arg);
+ context->output_to_penultimate_level =
+ context->seq_num > latest_cold_seq;
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ for (int i = 0; i < kNumKeys; i++) {
+ ASSERT_OK(Put(Key(i), "value" + std::to_string(i)));
+ }
+ ASSERT_OK(Flush());
+
+ CompactRangeOptions cro;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ ASSERT_EQ("0,1",
+ FilesPerLevel()); // bottommost but not last level file is hot
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
+
+ // explicitly move the data to the last level
+ MoveFilesToLevel(kLastLevel);
+
+ ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel());
+
+ auto snap = db_->GetSnapshot();
+
+ std::string start = Key(10);
+ std::string end = Key(50);
+ ASSERT_OK(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), start, end));
+
+  // Keys 20->30 will be marked as cold data, but they cannot be placed in the
+  // cold tier (bottommost level), otherwise they would be "deleted" by the
+  // range del in the penultimate level. Verify that these keys can still be
+  // queried.
+ for (int i = 20; i < 30; i++) {
+ ASSERT_OK(Put(Key(i), "value" + std::to_string(i)));
+ }
+  // make the range tombstone and the data written after it cold
+ latest_cold_seq = dbfull()->GetLatestSequenceNumber();
+
+  // add some hot data, just for the test
+ for (int i = 30; i < 40; i++) {
+ ASSERT_OK(Put(Key(i), "value" + std::to_string(i)));
+ }
+
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+ std::string value;
+ for (int i = 0; i < kNumKeys; i++) {
+ auto s = db_->Get(ReadOptions(), Key(i), &value);
+ if ((i >= 10 && i < 20) || (i >= 40 && i < 50)) {
+ ASSERT_TRUE(s.IsNotFound());
+ } else {
+ ASSERT_OK(s);
+ }
+ }
+
+ db_->ReleaseSnapshot(snap);
+}
+
+// Test SST partitioner cut after every single key
+class SingleKeySstPartitioner : public SstPartitioner {
+ public:
+ const char* Name() const override { return "SingleKeySstPartitioner"; }
+
+ PartitionerResult ShouldPartition(
+ const PartitionerRequest& /*request*/) override {
+ return kRequired;
+ }
+
+ bool CanDoTrivialMove(const Slice& /*smallest_user_key*/,
+ const Slice& /*largest_user_key*/) override {
+ return false;
+ }
+};
+
+class SingleKeySstPartitionerFactory : public SstPartitionerFactory {
+ public:
+ static const char* kClassName() { return "SingleKeySstPartitionerFactory"; }
+ const char* Name() const override { return kClassName(); }
+
+ std::unique_ptr<SstPartitioner> CreatePartitioner(
+ const SstPartitioner::Context& /* context */) const override {
+ return std::unique_ptr<SstPartitioner>(new SingleKeySstPartitioner());
+ }
+};
+
+TEST_P(TieredCompactionTest, LevelOutofBoundaryRangeDelete) {
+ const int kNumTrigger = 4;
+ const int kNumLevels = 3;
+ const int kNumKeys = 10;
+
+ auto factory = std::make_shared<SingleKeySstPartitionerFactory>();
+ auto options = CurrentOptions();
+ SetColdTemperature(options);
+ options.level0_file_num_compaction_trigger = kNumTrigger;
+ options.num_levels = kNumLevels;
+ options.statistics = CreateDBStatistics();
+ options.sst_partitioner_factory = factory;
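+  // The single-key partitioner cuts an SST boundary after every key, so each
+  // key ends up in its own file and the per-level file counts below match the
+  // key counts.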
+ options.max_subcompactions = 10;
+ DestroyAndReopen(options);
+
+ std::atomic_uint64_t latest_cold_seq = 0;
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "CompactionIterator::PrepareOutput.context", [&](void* arg) {
+ auto context = static_cast<PerKeyPlacementContext*>(arg);
+ context->output_to_penultimate_level =
+ context->seq_num > latest_cold_seq;
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ for (int i = 0; i < kNumKeys; i++) {
+ ASSERT_OK(Put(Key(i), "value" + std::to_string(i)));
+ }
+ ASSERT_OK(Flush());
+
+ MoveFilesToLevel(kNumLevels - 1);
+ ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+ ASSERT_EQ("0,0,10", FilesPerLevel());
+
+ auto snap = db_->GetSnapshot();
+
+ // only range delete
+ std::string start = Key(3);
+ std::string end = Key(5);
+ ASSERT_OK(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), start, end));
+ ASSERT_OK(Flush());
+
+ CompactRangeOptions cro;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+  ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown),
+            0);  // tombstone has no size, even though it's in the hot tier
+ ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+ ASSERT_EQ("0,1,10",
+ FilesPerLevel()); // one file is at the penultimate level which
+ // only contains a range delete
+
+  // Add 2 hot keys, each in its own new SST. They will be placed in the same
+  // level as the range del but don't overlap with it; make sure the range del
+  // still stays at that level.
+ latest_cold_seq = dbfull()->GetLatestSequenceNumber();
+ ASSERT_OK(Put(Key(0), "new value" + std::to_string(0)));
+ auto snap2 = db_->GetSnapshot();
+ ASSERT_OK(Put(Key(6), "new value" + std::to_string(6)));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ ASSERT_EQ("0,2,10",
+ FilesPerLevel()); // one file is at the penultimate level
+ // which only contains a range delete
+ std::vector<LiveFileMetaData> live_file_meta;
+ db_->GetLiveFilesMetaData(&live_file_meta);
+ bool found_sst_with_del = false;
+ uint64_t sst_with_del_num = 0;
+ for (const auto& meta : live_file_meta) {
+ if (meta.num_deletions > 0) {
+      // found an SST with del: it has 2 entries, one for data, one for range del
+ ASSERT_EQ(meta.level,
+ kNumLevels - 2); // output to penultimate level
+ ASSERT_EQ(meta.num_entries, 2);
+ ASSERT_EQ(meta.num_deletions, 1);
+ found_sst_with_del = true;
+ sst_with_del_num = meta.file_number;
+ }
+ }
+ ASSERT_TRUE(found_sst_with_del);
+
+  // release the first snapshot and compact, which should drop the range del;
+  // but the newly inserted keys `0` and `6` are still hot data and will be
+  // placed on the penultimate level
+ db_->ReleaseSnapshot(snap);
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ ASSERT_EQ("0,2,7", FilesPerLevel());
+ db_->GetLiveFilesMetaData(&live_file_meta);
+ found_sst_with_del = false;
+ for (const auto& meta : live_file_meta) {
+ // check new SST with del (the old one may not yet be deleted after
+ // compaction)
+ if (meta.num_deletions > 0 && meta.file_number != sst_with_del_num) {
+ found_sst_with_del = true;
+ }
+ }
+ ASSERT_FALSE(found_sst_with_del);
+
+  // Now make all data cold. Key 0 will be moved to the last level, but key 6 is
+  // still protected by snap2, so it will be kept at the penultimate level.
+ latest_cold_seq = dbfull()->GetLatestSequenceNumber();
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ ASSERT_EQ("0,1,8", FilesPerLevel());
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+ db_->ReleaseSnapshot(snap2);
+
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ ASSERT_EQ("0,0,8", FilesPerLevel());
+ ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+}
+
+TEST_P(TieredCompactionTest, UniversalRangeDelete) {
+ const int kNumTrigger = 4;
+ const int kNumLevels = 7;
+ const int kNumKeys = 10;
+
+ auto factory = std::make_shared<SingleKeySstPartitionerFactory>();
+
+ auto options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ SetColdTemperature(options);
+ options.level0_file_num_compaction_trigger = kNumTrigger;
+ options.statistics = CreateDBStatistics();
+ options.sst_partitioner_factory = factory;
+ options.max_subcompactions = 10;
+ DestroyAndReopen(options);
+
+ std::atomic_uint64_t latest_cold_seq = 0;
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "CompactionIterator::PrepareOutput.context", [&](void* arg) {
+ auto context = static_cast<PerKeyPlacementContext*>(arg);
+ context->output_to_penultimate_level =
+ context->seq_num > latest_cold_seq;
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ for (int i = 0; i < kNumKeys; i++) {
+ ASSERT_OK(Put(Key(i), "value" + std::to_string(i)));
+ }
+ ASSERT_OK(Flush());
+
+ // compact to the penultimate level with 10 files
+ CompactRangeOptions cro;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+ ASSERT_EQ("0,0,0,0,0,10", FilesPerLevel());
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
+
+ // make all data cold
+ latest_cold_seq = dbfull()->GetLatestSequenceNumber();
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ ASSERT_EQ("0,0,0,0,0,0,10", FilesPerLevel());
+ ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+  // the range del is considered hot data, but it will be merged with and
+  // deleted along with the last-level data
+ std::string start = Key(3);
+ std::string end = Key(5);
+ ASSERT_OK(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), start, end));
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+ ASSERT_EQ("0,0,0,0,0,0,8", FilesPerLevel());
+
+ // range del with snapshot should be preserved in the penultimate level
+ auto snap = db_->GetSnapshot();
+
+ start = Key(6);
+ end = Key(8);
+ ASSERT_OK(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), start, end));
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ ASSERT_EQ("0,0,0,0,0,1,8", FilesPerLevel());
+
+  // Add 2 hot keys, each in its own new SST. They will be placed in the same
+  // level as the range del but don't overlap with it.
+ latest_cold_seq = dbfull()->GetLatestSequenceNumber();
+ ASSERT_OK(Put(Key(4), "new value" + std::to_string(0)));
+ auto snap2 = db_->GetSnapshot();
+ ASSERT_OK(Put(Key(9), "new value" + std::to_string(6)));
+
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ ASSERT_EQ("0,0,0,0,0,2,8", FilesPerLevel());
+ // find the SST with range del
+ std::vector<LiveFileMetaData> live_file_meta;
+ db_->GetLiveFilesMetaData(&live_file_meta);
+ bool found_sst_with_del = false;
+ uint64_t sst_with_del_num = 0;
+ for (const auto& meta : live_file_meta) {
+ if (meta.num_deletions > 0) {
+      // found an SST with del: it has 2 entries, one for data, one for range del
+ ASSERT_EQ(meta.level,
+ kNumLevels - 2); // output_to_penultimate_level level
+ ASSERT_EQ(meta.num_entries, 2);
+ ASSERT_EQ(meta.num_deletions, 1);
+ found_sst_with_del = true;
+ sst_with_del_num = meta.file_number;
+ }
+ }
+ ASSERT_TRUE(found_sst_with_del);
+
+  // releasing the first snapshot should compact away the range del, but data on
+  // the same level is still hot
+ db_->ReleaseSnapshot(snap);
+
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ ASSERT_EQ("0,0,0,0,0,2,6", FilesPerLevel());
+ db_->GetLiveFilesMetaData(&live_file_meta);
+ // no range del should be found in SST
+ found_sst_with_del = false;
+ for (const auto& meta : live_file_meta) {
+ // check new SST with del (the old one may not yet be deleted after
+ // compaction)
+ if (meta.num_deletions > 0 && meta.file_number != sst_with_del_num) {
+ found_sst_with_del = true;
+ }
+ }
+ ASSERT_FALSE(found_sst_with_del);
+
+  // make all data cold, but key 6 is still protected by snap2
+ latest_cold_seq = dbfull()->GetLatestSequenceNumber();
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ ASSERT_EQ("0,0,0,0,0,1,7", FilesPerLevel());
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+ db_->ReleaseSnapshot(snap2);
+
+  // release the snapshot; everything goes to the bottommost level
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ ASSERT_EQ("0,0,0,0,0,0,7", FilesPerLevel());
+ ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+}
+
+TEST_P(TieredCompactionTest, SequenceBasedTieredStorageLevel) {
+ const int kNumTrigger = 4;
+ const int kNumLevels = 7;
+ const int kNumKeys = 100;
+ const int kLastLevel = kNumLevels - 1;
+
+ auto options = CurrentOptions();
+ SetColdTemperature(options);
+ options.level0_file_num_compaction_trigger = kNumTrigger;
+ options.num_levels = kNumLevels;
+ options.statistics = CreateDBStatistics();
+ options.max_subcompactions = 10;
+ DestroyAndReopen(options);
+
+ std::atomic_uint64_t latest_cold_seq = 0;
+ std::vector<SequenceNumber> seq_history;
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "CompactionIterator::PrepareOutput.context", [&](void* arg) {
+ auto context = static_cast<PerKeyPlacementContext*>(arg);
+ context->output_to_penultimate_level =
+ context->seq_num > latest_cold_seq;
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ std::vector<InternalStats::CompactionStats> expect_stats(kNumLevels);
+ InternalStats::CompactionStats& last_stats = expect_stats[kLastLevel];
+ InternalStats::CompactionStats expect_pl_stats;
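+  // expect_stats accumulates the expected per-level compaction stats as the
+  // test proceeds; kBasicFlushStats/kBasicCompStats/kBasicPerLevelStats are
+  // baseline stat templates defined elsewhere in this test.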
+
+ for (int i = 0; i < kNumTrigger; i++) {
+ for (int j = 0; j < kNumKeys; j++) {
+ ASSERT_OK(Put(Key(i * 10 + j), "value" + std::to_string(i)));
+ }
+ ASSERT_OK(Flush());
+ expect_stats[0].Add(kBasicFlushStats);
+ }
+ ASSERT_OK(dbfull()->WaitForCompact(true));
+
+  // the non-last-level file is hot
+ ASSERT_EQ("0,1", FilesPerLevel());
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
+
+ expect_stats[1].Add(kBasicCompStats);
+ expect_stats[1].Add(kBasicPerLevelStats);
+ expect_stats[1].ResetCompactionReason(CompactionReason::kLevelL0FilesNum);
+ VerifyCompactionStats(expect_stats, expect_pl_stats);
+
+ // move all data to the last level
+ MoveFilesToLevel(kLastLevel);
+
+ ResetAllStats(expect_stats, expect_pl_stats);
+
+ // The compaction won't move the data up
+ CompactRangeOptions cro;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel());
+ ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+ last_stats.Add(kBasicCompStats);
+ last_stats.Add(kBasicPerLevelStats);
+ last_stats.num_dropped_records = 0;
+ last_stats.bytes_read_non_output_levels = 0;
+ last_stats.num_input_files_in_non_output_levels = 0;
+ last_stats.bytes_read_output_level = kHasValue;
+ last_stats.num_input_files_in_output_level = kHasValue;
+ last_stats.ResetCompactionReason(CompactionReason::kManualCompaction);
+ VerifyCompactionStats(expect_stats, expect_pl_stats);
+
+  // Add new data, which is all hot and overwrites all existing data
+ for (int i = 0; i < kNumTrigger; i++) {
+ for (int j = 0; j < kNumKeys; j++) {
+ ASSERT_OK(Put(Key(i * 10 + j), "value" + std::to_string(i)));
+ }
+ ASSERT_OK(Flush());
+ seq_history.emplace_back(dbfull()->GetLatestSequenceNumber());
+ }
+ ASSERT_OK(dbfull()->WaitForCompact(true));
+ ASSERT_EQ("0,1,0,0,0,0,1", FilesPerLevel());
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+ ResetAllStats(expect_stats, expect_pl_stats);
+
+  // after compaction, all data is hot
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel());
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
+
+ for (int level = 2; level < kNumLevels - 1; level++) {
+ expect_stats[level].bytes_moved = kHasValue;
+ }
+
+ last_stats.Add(kBasicCompStats);
+ last_stats.bytes_read_output_level = kHasValue;
+ last_stats.num_input_files_in_output_level = kHasValue;
+ last_stats.ResetCompactionReason(CompactionReason::kManualCompaction);
+ expect_pl_stats.Add(kBasicPerKeyPlacementCompStats);
+ expect_pl_stats.ResetCompactionReason(CompactionReason::kManualCompaction);
+ VerifyCompactionStats(expect_stats, expect_pl_stats);
+
+  // Move the cold_seq forward and try to split the data into cold and hot. In
+  // this case it's unsafe to split the data: the file is a bottommost file on a
+  // non-last level, so its sequence numbers have been zeroed out and the time
+  // information is lost (with `level_compaction_dynamic_level_bytes` or
+  // Universal Compaction, this should be rare).
+  // TODO(zjay): ideally we should avoid zeroing out non-last-level bottommost
+  // files
+ latest_cold_seq = seq_history[1];
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel());
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
+
+ seq_history.clear();
+
+ // manually move all data (cold) to last level
+ MoveFilesToLevel(kLastLevel);
+ seq_history.clear();
+ // Add new data once again
+ for (int i = 0; i < kNumTrigger; i++) {
+ for (int j = 0; j < kNumKeys; j++) {
+ ASSERT_OK(Put(Key(i * 10 + j), "value" + std::to_string(i)));
+ }
+ ASSERT_OK(Flush());
+ seq_history.emplace_back(dbfull()->GetLatestSequenceNumber());
+ }
+ ASSERT_OK(dbfull()->WaitForCompact(true));
+
+ latest_cold_seq = seq_history[0];
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel());
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+ // delete all cold data
+ for (int i = 0; i < 10; i++) {
+ ASSERT_OK(Delete(Key(i)));
+ }
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel());
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
+
+ latest_cold_seq = seq_history[2];
+
+ MoveFilesToLevel(kLastLevel);
+
+  // move the cold_seq forward again together with a range delete; take a
+  // snapshot to keep the range dels in the bottommost level
+ auto snap = db_->GetSnapshot();
+
+ std::string start = Key(25), end = Key(35);
+ ASSERT_OK(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), start, end));
+  // add one small key and one large key in the input level, to make sure hot
+  // data within that range can be moved up to the input level
+ ASSERT_OK(Put(Key(0), "value" + std::to_string(0)));
+ ASSERT_OK(Put(Key(100), "value" + std::to_string(0)));
+
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel());
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+ // verify data
+ std::string value;
+ for (int i = 1; i < 130; i++) {
+ if (i < 10 || (i >= 25 && i < 35)) {
+ ASSERT_TRUE(db_->Get(ReadOptions(), Key(i), &value).IsNotFound());
+ } else {
+ ASSERT_OK(db_->Get(ReadOptions(), Key(i), &value));
+ }
+ }
+
+ // delete all hot data
+ ASSERT_OK(Delete(Key(0)));
+ start = Key(30);
+  end = Key(101);  // range [101, 130] is cold, because it was not in the input
+                   // range of the previous compaction
+ ASSERT_OK(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), start, end));
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel());
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+ // no range del is dropped because of snapshot
+ ASSERT_EQ(
+ options.statistics->getTickerCount(COMPACTION_RANGE_DEL_DROP_OBSOLETE),
+ 0);
+
+ db_->ReleaseSnapshot(snap);
+
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel());
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+  // 3 range dels are dropped; the first one is double counted as expected,
+  // because it is spread across 2 SST files
+ ASSERT_EQ(
+ options.statistics->getTickerCount(COMPACTION_RANGE_DEL_DROP_OBSOLETE),
+ 3);
+
+  // Move the cold_seq backward, which might happen when the user changes the
+  // setting. The hot data won't move up; just make sure it still runs fine.
+  // This is because:
+  // 1. sequence numbers are zeroed out, so there is no time information
+  // 2. leveled compaction only supports moving data up within the higher-level
+  //    input range
+ latest_cold_seq = seq_history[1];
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel());
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+}
+
+TEST_P(TieredCompactionTest, RangeBasedTieredStorageLevel) {
+ const int kNumTrigger = 4;
+ const int kNumLevels = 7;
+ const int kNumKeys = 100;
+
+ auto options = CurrentOptions();
+ SetColdTemperature(options);
+ options.level0_file_num_compaction_trigger = kNumTrigger;
+ options.level_compaction_dynamic_level_bytes = true;
+ options.num_levels = kNumLevels;
+ options.statistics = CreateDBStatistics();
+ options.max_subcompactions = 10;
+ DestroyAndReopen(options);
+ auto cmp = options.comparator;
+
+ port::Mutex mutex;
+ std::string hot_start = Key(10);
+ std::string hot_end = Key(50);
+
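+  // Unlike the sequence-number-based tests above, placement here is decided by
+  // key range: keys in [hot_start, hot_end) stay on the penultimate (hot)
+  // level and everything else goes to the cold tier.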
+ SyncPoint::GetInstance()->SetCallBack(
+ "CompactionIterator::PrepareOutput.context", [&](void* arg) {
+ auto context = static_cast<PerKeyPlacementContext*>(arg);
+ MutexLock l(&mutex);
+ context->output_to_penultimate_level =
+ cmp->Compare(context->key, hot_start) >= 0 &&
+ cmp->Compare(context->key, hot_end) < 0;
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ for (int i = 0; i < kNumTrigger; i++) {
+ for (int j = 0; j < kNumKeys; j++) {
+ ASSERT_OK(Put(Key(j), "value" + std::to_string(j)));
+ }
+ ASSERT_OK(Flush());
+ }
+ ASSERT_OK(dbfull()->WaitForCompact(true));
+ ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel());
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+ // change to all cold
+ {
+ MutexLock l(&mutex);
+ hot_start = Key(100);
+ hot_end = Key(200);
+ }
+ CompactRangeOptions cro;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel());
+ ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+  // change to all hot, but level compaction only supports moving cold data to
+  // hot within its higher-level input range.
+ {
+ MutexLock l(&mutex);
+ hot_start = Key(0);
+ hot_end = Key(100);
+ }
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel());
+ ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+ // with mixed hot/cold data
+ {
+ MutexLock l(&mutex);
+ hot_start = Key(50);
+ hot_end = Key(100);
+ }
+ ASSERT_OK(Put(Key(0), "value" + std::to_string(0)));
+ ASSERT_OK(Put(Key(100), "value" + std::to_string(100)));
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel());
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+ // delete all hot data, but with snapshot to keep the range del
+ auto snap = db_->GetSnapshot();
+ std::string start = Key(50);
+ std::string end = Key(100);
+ ASSERT_OK(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), start, end));
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel());
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+ // no range del is dropped because of snapshot
+ ASSERT_EQ(
+ options.statistics->getTickerCount(COMPACTION_RANGE_DEL_DROP_OBSOLETE),
+ 0);
+
+ // release the snapshot and do compaction again should remove all hot data
+ db_->ReleaseSnapshot(snap);
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel());
+ ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+ ASSERT_EQ(
+ options.statistics->getTickerCount(COMPACTION_RANGE_DEL_DROP_OBSOLETE),
+ 1);
+}
+
+INSTANTIATE_TEST_CASE_P(TieredCompactionTest, TieredCompactionTest,
+ testing::Bool());
+
+class PrecludeLastLevelTest : public DBTestBase {
+ public:
+ PrecludeLastLevelTest()
+ : DBTestBase("preclude_last_level_test", /*env_do_fsync=*/false) {
+ mock_clock_ = std::make_shared<MockSystemClock>(env_->GetSystemClock());
+ mock_env_ = std::make_unique<CompositeEnvWrapper>(env_, mock_clock_);
+ }
+
+ protected:
+ std::unique_ptr<Env> mock_env_;
+ std::shared_ptr<MockSystemClock> mock_clock_;
+
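+  // SetUp wires the mock clock into the periodic task scheduler so the tests
+  // can advance time deterministically via MockSleepForSeconds().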
+ void SetUp() override {
+ mock_clock_->InstallTimedWaitFixCallback();
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::StartPeriodicTaskScheduler:Init", [&](void* arg) {
+ auto periodic_task_scheduler_ptr =
+ reinterpret_cast<PeriodicTaskScheduler*>(arg);
+ periodic_task_scheduler_ptr->TEST_OverrideTimer(mock_clock_.get());
+ });
+ mock_clock_->SetCurrentTime(0);
+ }
+};
+
+TEST_F(PrecludeLastLevelTest, MigrationFromPreserveTimeManualCompaction) {
+ const int kNumTrigger = 4;
+ const int kNumLevels = 7;
+ const int kNumKeys = 100;
+ const int kKeyPerSec = 10;
+
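+  // preserve_internal_time_seconds makes the DB record a seqno-to-time
+  // mapping, so that when preclude_last_level_data_seconds is enabled on
+  // reopen below, compaction can tell which existing data is recent enough to
+  // stay in the hot tier.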
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.preserve_internal_time_seconds = 10000;
+ options.env = mock_env_.get();
+ options.level0_file_num_compaction_trigger = kNumTrigger;
+ options.num_levels = kNumLevels;
+ DestroyAndReopen(options);
+
+  // pass some time first, otherwise the write time of the first few keys would
+  // be zero, and internally zero has a special meaning: kUnknownSeqnoTime
+ dbfull()->TEST_WaitForPeridicTaskRun(
+ [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(kKeyPerSec)); });
+
+ int sst_num = 0;
+  // Write files that overlap and are enough to trigger compaction
+ for (; sst_num < kNumTrigger; sst_num++) {
+ for (int i = 0; i < kNumKeys; i++) {
+ ASSERT_OK(Put(Key(sst_num * (kNumKeys - 1) + i), "value"));
+ dbfull()->TEST_WaitForPeridicTaskRun([&] {
+ mock_clock_->MockSleepForSeconds(static_cast<int>(kKeyPerSec));
+ });
+ }
+ ASSERT_OK(Flush());
+ }
+ ASSERT_OK(dbfull()->WaitForCompact(true));
+
+ // all data is pushed to the last level
+ ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel());
+
+ // enable preclude feature
+ options.preclude_last_level_data_seconds = 10000;
+ options.last_level_temperature = Temperature::kCold;
+ Reopen(options);
+
+  // all data is hot, even though it's in the last level
+ ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+
+ // Generate a sstable and trigger manual compaction
+ ASSERT_OK(Put(Key(10), "value"));
+ ASSERT_OK(Flush());
+
+ CompactRangeOptions cro;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+ // all data is moved up to the penultimate level
+ ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel());
+ ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+
+  // close explicitly, because the env is a local variable which will be
+  // released first.
+ Close();
+}
+
+TEST_F(PrecludeLastLevelTest, MigrationFromPreserveTimeAutoCompaction) {
+ const int kNumTrigger = 4;
+ const int kNumLevels = 7;
+ const int kNumKeys = 100;
+ const int kKeyPerSec = 10;
+
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.preserve_internal_time_seconds = 10000;
+ options.env = mock_env_.get();
+ options.level0_file_num_compaction_trigger = kNumTrigger;
+ options.num_levels = kNumLevels;
+ DestroyAndReopen(options);
+
+  // pass some time first, otherwise the write time of the first few keys would
+  // be zero, and internally zero has a special meaning: kUnknownSeqnoTime
+ dbfull()->TEST_WaitForPeridicTaskRun(
+ [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(kKeyPerSec)); });
+
+ int sst_num = 0;
+  // Write files that overlap and are enough to trigger compaction
+ for (; sst_num < kNumTrigger; sst_num++) {
+ for (int i = 0; i < kNumKeys; i++) {
+ ASSERT_OK(Put(Key(sst_num * (kNumKeys - 1) + i), "value"));
+ dbfull()->TEST_WaitForPeridicTaskRun([&] {
+ mock_clock_->MockSleepForSeconds(static_cast<int>(kKeyPerSec));
+ });
+ }
+ ASSERT_OK(Flush());
+ }
+ ASSERT_OK(dbfull()->WaitForCompact(true));
+
+ // all data is pushed to the last level
+ ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel());
+
+ // enable preclude feature
+ options.preclude_last_level_data_seconds = 10000;
+ options.last_level_temperature = Temperature::kCold;
+  // Make sure it won't trigger a size-amp compaction. Unlike normal size-amp
+  // compaction, which is typically a last-level compaction, when tiered storage
+  // ("preclude_last_level") is enabled, size amp won't include the last level:
+  // the last level would be in the cold tier and its size would not be a
+  // problem, which also avoids frequent hot-to-cold storage compactions.
+ options.compaction_options_universal.max_size_amplification_percent = 400;
+ Reopen(options);
+
+  // all data is hot, even though it's in the last level
+ ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+
+  // Write more data, but still all hot until the 10th SST: a key is written
+  // every 10 seconds and there are 100 keys per SST, so each SST spans 1000
+  // seconds, while preclude_last_level_data_seconds is 10k.
+ Random rnd(301);
+ for (; sst_num < kNumTrigger * 2 - 1; sst_num++) {
+ for (int i = 0; i < kNumKeys; i++) {
+ // the value needs to be big enough to trigger full compaction
+ ASSERT_OK(Put(Key(sst_num * (kNumKeys - 1) + i), rnd.RandomString(100)));
+ dbfull()->TEST_WaitForPeridicTaskRun([&] {
+ mock_clock_->MockSleepForSeconds(static_cast<int>(kKeyPerSec));
+ });
+ }
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->WaitForCompact(true));
+ }
+
+ // all data is moved up to the penultimate level
+ ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel());
+ ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+
+  // close explicitly, because the env is a local variable which will be
+  // released first.
+ Close();
+}
+
+TEST_F(PrecludeLastLevelTest, MigrationFromPreserveTimePartial) {
+ const int kNumTrigger = 4;
+ const int kNumLevels = 7;
+ const int kNumKeys = 100;
+ const int kKeyPerSec = 10;
+
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.preserve_internal_time_seconds = 2000;
+ options.env = mock_env_.get();
+ options.level0_file_num_compaction_trigger = kNumTrigger;
+ options.num_levels = kNumLevels;
+ DestroyAndReopen(options);
+
+  // pass some time first, otherwise the write time of the first few keys would
+  // be zero, and internally zero has a special meaning: kUnknownSeqnoTime
+ dbfull()->TEST_WaitForPeridicTaskRun(
+ [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(kKeyPerSec)); });
+
+ int sst_num = 0;
+  // Write files that overlap and are enough to trigger compaction
+ for (; sst_num < kNumTrigger; sst_num++) {
+ for (int i = 0; i < kNumKeys; i++) {
+ ASSERT_OK(Put(Key(sst_num * (kNumKeys - 1) + i), "value"));
+ dbfull()->TEST_WaitForPeridicTaskRun([&] {
+ mock_clock_->MockSleepForSeconds(static_cast<int>(kKeyPerSec));
+ });
+ }
+ ASSERT_OK(Flush());
+ }
+ ASSERT_OK(dbfull()->WaitForCompact(true));
+
+ // all data is pushed to the last level
+ ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel());
+
+ std::vector<KeyVersion> key_versions;
+ ASSERT_OK(GetAllKeyVersions(db_, Slice(), Slice(),
+ std::numeric_limits<size_t>::max(),
+ &key_versions));
+
+  // make sure there are more than 300 keys, the first 100 keys have their
+  // seqnos zeroed out, and the last 100 keys do not
+ ASSERT_GT(key_versions.size(), 300);
+ for (int i = 0; i < 100; i++) {
+ ASSERT_EQ(key_versions[i].sequence, 0);
+ }
+ auto rit = key_versions.rbegin();
+ for (int i = 0; i < 100; i++) {
+ ASSERT_GT(rit->sequence, 0);
+ rit++;
+ }
+
+ // enable preclude feature
+ options.preclude_last_level_data_seconds = 2000;
+ options.last_level_temperature = Temperature::kCold;
+ Reopen(options);
+
+ // Generate a sstable and trigger manual compaction
+ ASSERT_OK(Put(Key(10), "value"));
+ ASSERT_OK(Flush());
+
+ CompactRangeOptions cro;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+ // some data are moved up, some are not
+ ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel());
+ ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+
+ Close();
+}
+
+TEST_F(PrecludeLastLevelTest, SmallPrecludeTime) {
+ const int kNumTrigger = 4;
+ const int kNumLevels = 7;
+ const int kNumKeys = 100;
+
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.preclude_last_level_data_seconds = 60;
+ options.preserve_internal_time_seconds = 0;
+ options.env = mock_env_.get();
+ options.level0_file_num_compaction_trigger = kNumTrigger;
+ options.num_levels = kNumLevels;
+ options.last_level_temperature = Temperature::kCold;
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+
+ dbfull()->TEST_WaitForPeridicTaskRun([&] {
+ mock_clock_->MockSleepForSeconds(static_cast<int>(rnd.Uniform(10) + 1));
+ });
+
+ for (int i = 0; i < kNumKeys; i++) {
+ ASSERT_OK(Put(Key(i), rnd.RandomString(100)));
+ dbfull()->TEST_WaitForPeridicTaskRun([&] {
+ mock_clock_->MockSleepForSeconds(static_cast<int>(rnd.Uniform(2)));
+ });
+ }
+ ASSERT_OK(Flush());
+
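+  // Even with a small preclude window, the flushed SST should carry a
+  // seqno-to-time mapping table property; verify it can be parsed back.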
+ TablePropertiesCollection tables_props;
+ ASSERT_OK(dbfull()->GetPropertiesOfAllTables(&tables_props));
+ ASSERT_EQ(tables_props.size(), 1);
+ ASSERT_FALSE(tables_props.begin()->second->seqno_to_time_mapping.empty());
+ SeqnoToTimeMapping tp_mapping;
+ ASSERT_OK(
+ tp_mapping.Add(tables_props.begin()->second->seqno_to_time_mapping));
+ ASSERT_OK(tp_mapping.Sort());
+ ASSERT_FALSE(tp_mapping.Empty());
+ auto seqs = tp_mapping.TEST_GetInternalMapping();
+ ASSERT_FALSE(seqs.empty());
+
+  // Wait more than the preclude_last_level time, then make sure all the data is
+  // compacted to the last level even though there's no write (no seqno -> time
+  // information was flushed to any SST).
+ mock_clock_->MockSleepForSeconds(100);
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel());
+ ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+ Close();
+}
+
+TEST_F(PrecludeLastLevelTest, LastLevelOnlyCompactionPartial) {
+ const int kNumTrigger = 4;
+ const int kNumLevels = 7;
+ const int kNumKeys = 100;
+ const int kKeyPerSec = 10;
+
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.preserve_internal_time_seconds = 2000;
+ options.env = mock_env_.get();
+ options.level0_file_num_compaction_trigger = kNumTrigger;
+ options.num_levels = kNumLevels;
+ DestroyAndReopen(options);
+
+  // pass some time first, otherwise the write time of the first few keys would
+  // be zero, and internally zero has a special meaning: kUnknownSeqnoTime
+ dbfull()->TEST_WaitForPeridicTaskRun(
+ [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(kKeyPerSec)); });
+
+ int sst_num = 0;
+  // Write files that overlap and are enough to trigger compaction
+ for (; sst_num < kNumTrigger; sst_num++) {
+ for (int i = 0; i < kNumKeys; i++) {
+ ASSERT_OK(Put(Key(sst_num * (kNumKeys - 1) + i), "value"));
+ dbfull()->TEST_WaitForPeridicTaskRun([&] {
+ mock_clock_->MockSleepForSeconds(static_cast<int>(kKeyPerSec));
+ });
+ }
+ ASSERT_OK(Flush());
+ }
+ ASSERT_OK(dbfull()->WaitForCompact(true));
+
+ // all data is pushed to the last level
+ ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel());
+
+ // enable preclude feature
+ options.preclude_last_level_data_seconds = 2000;
+ options.last_level_temperature = Temperature::kCold;
+ Reopen(options);
+
+ CompactRangeOptions cro;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+ // some data are moved up, some are not
+ ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel());
+ ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+
+ std::vector<KeyVersion> key_versions;
+ ASSERT_OK(GetAllKeyVersions(db_, Slice(), Slice(),
+ std::numeric_limits<size_t>::max(),
+ &key_versions));
+
+  // make sure there are more than 300 keys, the first 100 keys have their
+  // seqnos zeroed out, and the last 100 keys do not
+ ASSERT_GT(key_versions.size(), 300);
+ for (int i = 0; i < 100; i++) {
+ ASSERT_EQ(key_versions[i].sequence, 0);
+ }
+ auto rit = key_versions.rbegin();
+ for (int i = 0; i < 100; i++) {
+ ASSERT_GT(rit->sequence, 0);
+ rit++;
+ }
+
+ Close();
+}
+
+class PrecludeLastLevelTestWithParms
+ : public PrecludeLastLevelTest,
+ public testing::WithParamInterface<bool> {
+ public:
+ PrecludeLastLevelTestWithParms() : PrecludeLastLevelTest() {}
+};
+
+TEST_P(PrecludeLastLevelTestWithParms, LastLevelOnlyCompactionNoPreclude) {
+ const int kNumTrigger = 4;
+ const int kNumLevels = 7;
+ const int kNumKeys = 100;
+ const int kKeyPerSec = 10;
+
+ bool enable_preclude_last_level = GetParam();
+
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.preserve_internal_time_seconds = 2000;
+ options.env = mock_env_.get();
+ options.level0_file_num_compaction_trigger = kNumTrigger;
+ options.num_levels = kNumLevels;
+ DestroyAndReopen(options);
+
+  // pass some time first, otherwise the write time of the first few keys would
+  // be zero, and internally zero has a special meaning: kUnknownSeqnoTime
+ dbfull()->TEST_WaitForPeridicTaskRun(
+ [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(kKeyPerSec)); });
+
+ Random rnd(301);
+ int sst_num = 0;
+  // Write files that overlap and are enough to trigger compaction
+ for (; sst_num < kNumTrigger; sst_num++) {
+ for (int i = 0; i < kNumKeys; i++) {
+ ASSERT_OK(Put(Key(sst_num * (kNumKeys - 1) + i), rnd.RandomString(100)));
+ dbfull()->TEST_WaitForPeridicTaskRun([&] {
+ mock_clock_->MockSleepForSeconds(static_cast<int>(kKeyPerSec));
+ });
+ }
+ ASSERT_OK(Flush());
+ }
+ ASSERT_OK(dbfull()->WaitForCompact(true));
+
+ // all data is pushed to the last level
+ ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel());
+
+ std::atomic_bool is_manual_compaction_running = false;
+ std::atomic_bool verified_compaction_order = false;
+
+  // Make sure the manual compaction is in progress, then try to trigger a
+  // SizeRatio compaction by flushing 4 files to L0. That compaction will try to
+  // compact the 4 files at L0 to L5 (the last empty level).
+  // If the preclude_last_level feature is enabled, the auto-triggered
+  // compaction cannot be picked. Otherwise, the auto-triggered compaction can
+  // run in parallel with the last-level compaction.
+ // L0: [a] [b] [c] [d]
+ // L5: (locked if preclude_last_level is enabled)
+ // L6: [z] (locked: manual compaction in progress)
+  // TODO: in this case, the L0 files should just be compacted to L4, so the 2
+  // compactions won't overlap.
+ SyncPoint::GetInstance()->SetCallBack(
+ "CompactionJob::ProcessKeyValueCompaction()::Processing", [&](void* arg) {
+ auto compaction = static_cast<Compaction*>(arg);
+ if (compaction->is_manual_compaction()) {
+ is_manual_compaction_running = true;
+ TEST_SYNC_POINT(
+ "PrecludeLastLevelTest::LastLevelOnlyCompactionConflit:"
+ "ManualCompaction1");
+ TEST_SYNC_POINT(
+ "PrecludeLastLevelTest::LastLevelOnlyCompactionConflit:"
+ "ManualCompaction2");
+ is_manual_compaction_running = false;
+ }
+ });
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "UniversalCompactionBuilder::PickCompaction:Return", [&](void* arg) {
+ auto compaction = static_cast<Compaction*>(arg);
+ if (enable_preclude_last_level && is_manual_compaction_running) {
+ ASSERT_TRUE(compaction == nullptr);
+ verified_compaction_order = true;
+ } else {
+ ASSERT_TRUE(compaction != nullptr);
+ verified_compaction_order = true;
+ }
+ if (!compaction || !compaction->is_manual_compaction()) {
+ TEST_SYNC_POINT(
+ "PrecludeLastLevelTest::LastLevelOnlyCompactionConflit:"
+ "AutoCompactionPicked");
+ }
+ });
+
+ SyncPoint::GetInstance()->LoadDependency({
+ {"PrecludeLastLevelTest::LastLevelOnlyCompactionConflit:"
+ "ManualCompaction1",
+ "PrecludeLastLevelTest::LastLevelOnlyCompactionConflit:StartWrite"},
+ {"PrecludeLastLevelTest::LastLevelOnlyCompactionConflit:"
+ "AutoCompactionPicked",
+ "PrecludeLastLevelTest::LastLevelOnlyCompactionConflit:"
+ "ManualCompaction2"},
+ });
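+  // The dependencies above enforce this ordering: the manual compaction
+  // starts, then the test flushes new L0 files, an auto compaction is picked
+  // (or rejected), and only then is the manual compaction allowed to finish.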
+
+ SyncPoint::GetInstance()->EnableProcessing();
+
+  // only enable the preclude feature if the parameter is true
+ if (enable_preclude_last_level) {
+ options.preclude_last_level_data_seconds = 2000;
+ }
+ options.max_background_jobs = 8;
+ options.last_level_temperature = Temperature::kCold;
+ Reopen(options);
+
+ auto manual_compaction_thread = port::Thread([this]() {
+ CompactRangeOptions cro;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+ cro.exclusive_manual_compaction = false;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ });
+
+ TEST_SYNC_POINT(
+ "PrecludeLastLevelTest::LastLevelOnlyCompactionConflit:StartWrite");
+ auto stop_token =
+ dbfull()->TEST_write_controler().GetCompactionPressureToken();
+
+ for (; sst_num < kNumTrigger * 2; sst_num++) {
+ for (int i = 0; i < kNumKeys; i++) {
+ // the value needs to be big enough to trigger full compaction
+ ASSERT_OK(Put(Key(sst_num * (kNumKeys - 1) + i), "value"));
+ dbfull()->TEST_WaitForPeridicTaskRun([&] {
+ mock_clock_->MockSleepForSeconds(static_cast<int>(kKeyPerSec));
+ });
+ }
+ ASSERT_OK(Flush());
+ }
+
+ manual_compaction_thread.join();
+
+ ASSERT_OK(dbfull()->WaitForCompact(true));
+
+ if (enable_preclude_last_level) {
+ ASSERT_NE("0,0,0,0,0,1,1", FilesPerLevel());
+ } else {
+ ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel());
+ }
+ ASSERT_TRUE(verified_compaction_order);
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ stop_token.reset();
+
+ Close();
+}
+
+INSTANTIATE_TEST_CASE_P(PrecludeLastLevelTestWithParms,
+ PrecludeLastLevelTestWithParms, testing::Bool());
+
+// partition the SST into 3 ranges [0, 19] [20, 39] [40, ...]
+class ThreeRangesPartitioner : public SstPartitioner {
+ public:
+ const char* Name() const override { return "SingleKeySstPartitioner"; }
+
+ PartitionerResult ShouldPartition(
+ const PartitionerRequest& request) override {
+ if ((cmp->CompareWithoutTimestamp(*request.current_user_key,
+ DBTestBase::Key(20)) >= 0 &&
+ cmp->CompareWithoutTimestamp(*request.prev_user_key,
+ DBTestBase::Key(20)) < 0) ||
+ (cmp->CompareWithoutTimestamp(*request.current_user_key,
+ DBTestBase::Key(40)) >= 0 &&
+ cmp->CompareWithoutTimestamp(*request.prev_user_key,
+ DBTestBase::Key(40)) < 0)) {
+ return kRequired;
+ } else {
+ return kNotRequired;
+ }
+ }
+
+ bool CanDoTrivialMove(const Slice& /*smallest_user_key*/,
+ const Slice& /*largest_user_key*/) override {
+ return false;
+ }
+
+ const Comparator* cmp = BytewiseComparator();
+};
+
+class ThreeRangesPartitionerFactory : public SstPartitionerFactory {
+ public:
+ static const char* kClassName() {
+ return "TombstoneTestSstPartitionerFactory";
+ }
+ const char* Name() const override { return kClassName(); }
+
+ std::unique_ptr<SstPartitioner> CreatePartitioner(
+ const SstPartitioner::Context& /* context */) const override {
+ return std::unique_ptr<SstPartitioner>(new ThreeRangesPartitioner());
+ }
+};
+
+TEST_F(PrecludeLastLevelTest, PartialPenultimateLevelCompaction) {
+ const int kNumTrigger = 4;
+ const int kNumLevels = 7;
+ const int kKeyPerSec = 10;
+
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.env = mock_env_.get();
+ options.level0_file_num_compaction_trigger = kNumTrigger;
+ options.preserve_internal_time_seconds = 10000;
+ options.num_levels = kNumLevels;
+ DestroyAndReopen(options);
+
+  // pass some time first, otherwise the write time of the first few keys would
+  // be zero, and internally zero has a special meaning: kUnknownSeqnoTime
+ dbfull()->TEST_WaitForPeridicTaskRun(
+ [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(10)); });
+
+ Random rnd(301);
+
+ for (int i = 0; i < 300; i++) {
+ ASSERT_OK(Put(Key(i), rnd.RandomString(100)));
+ dbfull()->TEST_WaitForPeridicTaskRun(
+ [&] { mock_clock_->MockSleepForSeconds(kKeyPerSec); });
+ }
+ ASSERT_OK(Flush());
+ CompactRangeOptions cro;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+ // make sure all data is compacted to the last level
+ ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel());
+
+ // Create 3 L5 files
+ auto factory = std::make_shared<ThreeRangesPartitionerFactory>();
+ options.sst_partitioner_factory = factory;
+
+ Reopen(options);
+
+ for (int i = 0; i < kNumTrigger - 1; i++) {
+ for (int j = 0; j < 100; j++) {
+ ASSERT_OK(Put(Key(i * 100 + j), rnd.RandomString(10)));
+ }
+ ASSERT_OK(Flush());
+ }
+
+ ASSERT_OK(dbfull()->WaitForCompact(true));
+
+ // L5: [0,19] [20,39] [40,299]
+ // L6: [0, 299]
+ ASSERT_EQ("0,0,0,0,0,3,1", FilesPerLevel());
+
+ // enable tiered storage feature
+ options.preclude_last_level_data_seconds = 10000;
+ options.last_level_temperature = Temperature::kCold;
+ options.statistics = CreateDBStatistics();
+ Reopen(options);
+
+ ColumnFamilyMetaData meta;
+ db_->GetColumnFamilyMetaData(&meta);
+ ASSERT_EQ(meta.levels[5].files.size(), 3);
+ ASSERT_EQ(meta.levels[6].files.size(), 1);
+ ASSERT_EQ(meta.levels[6].files[0].smallestkey, Key(0));
+ ASSERT_EQ(meta.levels[6].files[0].largestkey, Key(299));
+
+ std::string file_path = meta.levels[5].files[1].db_path;
+ std::vector<std::string> files;
+ // pick 3rd file @L5 + file@L6 for compaction
+ files.push_back(file_path + "/" + meta.levels[5].files[2].name);
+ files.push_back(file_path + "/" + meta.levels[6].files[0].name);
+ ASSERT_OK(db_->CompactFiles(CompactionOptions(), files, 6));
+
+  // The compaction only moved part of the hot data to the hot tier; range
+  // [0,39] is unsafe to move up, otherwise it would overlap with the existing
+  // files @L5.
+ // The output should be:
+ // L5: [0,19] [20,39] [40,299] <-- Temperature::kUnknown
+ // L6: [0,19] [20,39] <-- Temperature::kCold
+ // L6 file is split because of the customized partitioner
+ ASSERT_EQ("0,0,0,0,0,3,2", FilesPerLevel());
+
+  // even though all the data is hot, not all of it is moved to the hot tier
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+ db_->GetColumnFamilyMetaData(&meta);
+ ASSERT_EQ(meta.levels[5].files.size(), 3);
+ ASSERT_EQ(meta.levels[6].files.size(), 2);
+ for (const auto& file : meta.levels[5].files) {
+ ASSERT_EQ(file.temperature, Temperature::kUnknown);
+ }
+ for (const auto& file : meta.levels[6].files) {
+ ASSERT_EQ(file.temperature, Temperature::kCold);
+ }
+ ASSERT_EQ(meta.levels[6].files[0].smallestkey, Key(0));
+ ASSERT_EQ(meta.levels[6].files[0].largestkey, Key(19));
+ ASSERT_EQ(meta.levels[6].files[1].smallestkey, Key(20));
+ ASSERT_EQ(meta.levels[6].files[1].largestkey, Key(39));
+
+ Close();
+}
+
+struct TestPropertiesCollector : public TablePropertiesCollector {
+ Status AddUserKey(const Slice& key, const Slice& /*value*/,
+ EntryType /*type*/, SequenceNumber /*seq*/,
+ uint64_t /*file_size*/) override {
+ if (cmp->Compare(key, DBTestBase::Key(100)) == 0) {
+ has_key_100 = true;
+ }
+ if (cmp->Compare(key, DBTestBase::Key(200)) == 0) {
+ has_key_200 = true;
+ }
+
+ return Status::OK();
+ }
+
+ const char* Name() const override { return "TestTablePropertiesCollector"; }
+
+ UserCollectedProperties GetReadableProperties() const override {
+ UserCollectedProperties ret;
+ return ret;
+ }
+
+ Status Finish(UserCollectedProperties* /*properties*/) override {
+ // The LSM tree would be like:
+ // L5: [0,19] [20,39] [40,299]
+ // L6: [0, 299]
+    // the 3rd file @L5 has both 100 and 200, so it will be marked for
+    // compaction.
+    // Also avoid marking a freshly flushed SST for compaction: it won't have
+    // both 100 and 200.
+ if (has_key_100 && has_key_200) {
+ need_compact_ = true;
+ } else {
+ need_compact_ = false;
+ }
+ has_key_100 = false;
+ has_key_200 = false;
+ return Status::OK();
+ }
+
+ bool NeedCompact() const override { return need_compact_; }
+
+ const Comparator* cmp = BytewiseComparator();
+
+ private:
+ bool has_key_100 = false;
+ bool has_key_200 = false;
+
+ bool need_compact_ = false;
+};
+
+class TestPropertiesCollectorFactory : public TablePropertiesCollectorFactory {
+ public:
+ TablePropertiesCollector* CreateTablePropertiesCollector(
+ TablePropertiesCollectorFactory::Context /*context*/) override {
+ return new TestPropertiesCollector;
+ }
+ const char* Name() const override { return "TestTablePropertiesCollector"; }
+};
+
+TEST_F(PrecludeLastLevelTest, PartialPenultimateLevelCompactionWithRangeDel) {
+ const int kNumTrigger = 4;
+ const int kNumLevels = 7;
+ const int kKeyPerSec = 10;
+
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.env = mock_env_.get();
+ options.level0_file_num_compaction_trigger = kNumTrigger;
+ options.preserve_internal_time_seconds = 10000;
+ options.num_levels = kNumLevels;
+ // set a small max_compaction_bytes to avoid input level expansion
+ options.max_compaction_bytes = 30000;
+ options.ignore_max_compaction_bytes_for_input = false;
+ DestroyAndReopen(options);
+
+  // pass some time first, otherwise the write time of the first few keys would
+  // be zero, and internally zero has a special meaning: kUnknownSeqnoTime
+ dbfull()->TEST_WaitForPeridicTaskRun(
+ [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(10)); });
+
+ Random rnd(301);
+
+ for (int i = 0; i < 300; i++) {
+ ASSERT_OK(Put(Key(i), rnd.RandomString(100)));
+ dbfull()->TEST_WaitForPeridicTaskRun(
+ [&] { mock_clock_->MockSleepForSeconds(kKeyPerSec); });
+ }
+ ASSERT_OK(Flush());
+ CompactRangeOptions cro;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+ // make sure all data is compacted to the last level
+ ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel());
+
+ // Create 3 L5 files
+ auto factory = std::make_shared<ThreeRangesPartitionerFactory>();
+ options.sst_partitioner_factory = factory;
+
+  // the user-defined properties collector will mark the 3rd file for compaction
+ auto collector_factory = std::make_shared<TestPropertiesCollectorFactory>();
+ options.table_properties_collector_factories.resize(1);
+ options.table_properties_collector_factories[0] = collector_factory;
+ // enable tiered storage feature
+ options.preclude_last_level_data_seconds = 10000;
+ options.last_level_temperature = Temperature::kCold;
+ Reopen(options);
+
+ for (int i = 0; i < kNumTrigger - 2; i++) {
+ for (int j = 0; j < 100; j++) {
+ ASSERT_OK(Put(Key(i * 100 + j), rnd.RandomString(10)));
+ }
+ ASSERT_OK(Flush());
+ }
+
+  // make sure there is one and only one compaction that supports per-key
+  // placement but has the penultimate-level output disabled.
+ std::atomic_int per_key_comp_num = 0;
+ SyncPoint::GetInstance()->SetCallBack(
+ "UniversalCompactionBuilder::PickCompaction:Return", [&](void* arg) {
+ auto compaction = static_cast<Compaction*>(arg);
+ if (compaction->SupportsPerKeyPlacement()) {
+ ASSERT_EQ(compaction->GetPenultimateOutputRangeType(),
+ Compaction::PenultimateOutputRangeType::kDisabled);
+ per_key_comp_num++;
+ }
+ });
+
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ for (int j = 0; j < 100; j++) {
+ ASSERT_OK(Put(Key(200 + j), rnd.RandomString(10)));
+ }
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ Key(32), Key(40)));
+ ASSERT_OK(Flush());
+
+  // Before the per-key placement compaction, the LSM tree should be like:
+ // L5: [0,19] [20,40] [40,299]
+ // L6: [0, 299]
+ // The 2nd file @L5 has the largest key 40 because of range del
+
+ ASSERT_OK(dbfull()->WaitForCompact(true));
+
+ ASSERT_EQ(per_key_comp_num, 1);
+
+ // the compaction won't move any data to the penultimate level
+ ASSERT_EQ("0,0,0,0,0,2,3", FilesPerLevel());
+
+ Close();
+}
+
+#endif // !defined(ROCKSDB_LITE)
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+#if !defined(ROCKSDB_LITE)
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+#else
+ (void)argc;
+ (void)argv;
+ return 0;
+#endif
+}
diff --git a/src/rocksdb/db/comparator_db_test.cc b/src/rocksdb/db/comparator_db_test.cc
new file mode 100644
index 000000000..e5e3493b3
--- /dev/null
+++ b/src/rocksdb/db/comparator_db_test.cc
@@ -0,0 +1,678 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#include <array>
+#include <map>
+#include <string>
+
+#include "memtable/stl_wrappers.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/hash.h"
+#include "util/kv_map.h"
+#include "util/random.h"
+#include "util/string_util.h"
+#include "utilities/merge_operators.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace {
+
+static const Comparator* kTestComparator = nullptr;
+
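+// KVIter adapts a stl_wrappers::KVMap (ordered by the test comparator) to the
+// Iterator interface, so the map can serve as a reference implementation for
+// the DB iterator.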
+class KVIter : public Iterator {
+ public:
+ explicit KVIter(const stl_wrappers::KVMap* map)
+ : map_(map), iter_(map_->end()) {}
+ bool Valid() const override { return iter_ != map_->end(); }
+ void SeekToFirst() override { iter_ = map_->begin(); }
+ void SeekToLast() override {
+ if (map_->empty()) {
+ iter_ = map_->end();
+ } else {
+ iter_ = map_->find(map_->rbegin()->first);
+ }
+ }
+ void Seek(const Slice& k) override {
+ iter_ = map_->lower_bound(k.ToString());
+ }
+ void SeekForPrev(const Slice& k) override {
+ iter_ = map_->upper_bound(k.ToString());
+ Prev();
+ }
+ void Next() override { ++iter_; }
+ void Prev() override {
+ if (iter_ == map_->begin()) {
+ iter_ = map_->end();
+ return;
+ }
+ --iter_;
+ }
+
+ Slice key() const override { return iter_->first; }
+ Slice value() const override { return iter_->second; }
+ Status status() const override { return Status::OK(); }
+
+ private:
+ const stl_wrappers::KVMap* const map_;
+ stl_wrappers::KVMap::const_iterator iter_;
+};
+
+void AssertItersEqual(Iterator* iter1, Iterator* iter2) {
+ ASSERT_EQ(iter1->Valid(), iter2->Valid());
+ if (iter1->Valid()) {
+ ASSERT_EQ(iter1->key().ToString(), iter2->key().ToString());
+ ASSERT_EQ(iter1->value().ToString(), iter2->value().ToString());
+ }
+}
+
+// Run random operations against a DB (expected to be empty initially) and
+// mirror them on an in-memory map; source_strings are the candidate keys.
+void DoRandomIteraratorTest(DB* db, std::vector<std::string> source_strings,
+ Random* rnd, int num_writes, int num_iter_ops,
+ int num_trigger_flush) {
+ stl_wrappers::KVMap map((stl_wrappers::LessOfComparator(kTestComparator)));
+
+ for (int i = 0; i < num_writes; i++) {
+ if (num_trigger_flush > 0 && i != 0 && i % num_trigger_flush == 0) {
+ db->Flush(FlushOptions());
+ }
+
+ int type = rnd->Uniform(2);
+ int index = rnd->Uniform(static_cast<int>(source_strings.size()));
+ auto& key = source_strings[index];
+ switch (type) {
+ case 0:
+ // put
+ map[key] = key;
+ ASSERT_OK(db->Put(WriteOptions(), key, key));
+ break;
+ case 1:
+ // delete
+ if (map.find(key) != map.end()) {
+ map.erase(key);
+ }
+ ASSERT_OK(db->Delete(WriteOptions(), key));
+ break;
+ default:
+ assert(false);
+ }
+ }
+
+ std::unique_ptr<Iterator> iter(db->NewIterator(ReadOptions()));
+ std::unique_ptr<Iterator> result_iter(new KVIter(&map));
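+  // iter walks the actual DB while result_iter walks the in-memory reference
+  // map; every operation below is mirrored on both and the results compared.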
+
+ bool is_valid = false;
+ for (int i = 0; i < num_iter_ops; i++) {
+    // Random walk and make sure iter and result_iter return the
+    // same key and value
+ int type = rnd->Uniform(6);
+ ASSERT_OK(iter->status());
+ switch (type) {
+ case 0:
+ // Seek to First
+ iter->SeekToFirst();
+ result_iter->SeekToFirst();
+ break;
+ case 1:
+ // Seek to last
+ iter->SeekToLast();
+ result_iter->SeekToLast();
+ break;
+ case 2: {
+ // Seek to random key
+ auto key_idx = rnd->Uniform(static_cast<int>(source_strings.size()));
+ auto key = source_strings[key_idx];
+ iter->Seek(key);
+ result_iter->Seek(key);
+ break;
+ }
+ case 3:
+ // Next
+ if (is_valid) {
+ iter->Next();
+ result_iter->Next();
+ } else {
+ continue;
+ }
+ break;
+ case 4:
+ // Prev
+ if (is_valid) {
+ iter->Prev();
+ result_iter->Prev();
+ } else {
+ continue;
+ }
+ break;
+ default: {
+ assert(type == 5);
+ auto key_idx = rnd->Uniform(static_cast<int>(source_strings.size()));
+ auto key = source_strings[key_idx];
+ std::string result;
+ auto status = db->Get(ReadOptions(), key, &result);
+ if (map.find(key) == map.end()) {
+ ASSERT_TRUE(status.IsNotFound());
+ } else {
+ ASSERT_EQ(map[key], result);
+ }
+ break;
+ }
+ }
+ AssertItersEqual(iter.get(), result_iter.get());
+ is_valid = iter->Valid();
+ }
+}
+
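+// Orders keys by their numeric (double) value, falling back to a bytewise
+// comparison for ties so that distinct keys never compare equal.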
+class DoubleComparator : public Comparator {
+ public:
+ DoubleComparator() {}
+
+ const char* Name() const override { return "DoubleComparator"; }
+
+ int Compare(const Slice& a, const Slice& b) const override {
+#ifndef CYGWIN
+ double da = std::stod(a.ToString());
+ double db = std::stod(b.ToString());
+#else
+ double da = std::strtod(a.ToString().c_str(), 0 /* endptr */);
+    double db = std::strtod(b.ToString().c_str(), 0 /* endptr */);
+#endif
+ if (da == db) {
+ return a.compare(b);
+ } else if (da > db) {
+ return 1;
+ } else {
+ return -1;
+ }
+ }
+ void FindShortestSeparator(std::string* /*start*/,
+ const Slice& /*limit*/) const override {}
+
+ void FindShortSuccessor(std::string* /*key*/) const override {}
+};
+
+class HashComparator : public Comparator {
+ public:
+ HashComparator() {}
+
+ const char* Name() const override { return "HashComparator"; }
+
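+  // Orders keys by their 32-bit hash (seed 66); hash collisions fall back to
+  // bytewise comparison.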
+ int Compare(const Slice& a, const Slice& b) const override {
+ uint32_t ha = Hash(a.data(), a.size(), 66);
+ uint32_t hb = Hash(b.data(), b.size(), 66);
+ if (ha == hb) {
+ return a.compare(b);
+ } else if (ha > hb) {
+ return 1;
+ } else {
+ return -1;
+ }
+ }
+ void FindShortestSeparator(std::string* /*start*/,
+ const Slice& /*limit*/) const override {}
+
+ void FindShortSuccessor(std::string* /*key*/) const override {}
+};
+
+class TwoStrComparator : public Comparator {
+ public:
+ TwoStrComparator() {}
+
+ const char* Name() const override { return "TwoStrComparator"; }
+
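+  // Keys are encoded as [len1][len2][str1][str2], where len1 and len2 are
+  // single length bytes. Keys are compared by str1 first; str2 is used only
+  // to break ties.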
+ int Compare(const Slice& a, const Slice& b) const override {
+ assert(a.size() >= 2);
+ assert(b.size() >= 2);
+ size_t size_a1 = static_cast<size_t>(a[0]);
+ size_t size_b1 = static_cast<size_t>(b[0]);
+ size_t size_a2 = static_cast<size_t>(a[1]);
+ size_t size_b2 = static_cast<size_t>(b[1]);
+ assert(size_a1 + size_a2 + 2 == a.size());
+ assert(size_b1 + size_b2 + 2 == b.size());
+
+ Slice a1 = Slice(a.data() + 2, size_a1);
+ Slice b1 = Slice(b.data() + 2, size_b1);
+ Slice a2 = Slice(a.data() + 2 + size_a1, size_a2);
+ Slice b2 = Slice(b.data() + 2 + size_b1, size_b2);
+
+ if (a1 != b1) {
+ return a1.compare(b1);
+ }
+ return a2.compare(b2);
+ }
+ void FindShortestSeparator(std::string* /*start*/,
+ const Slice& /*limit*/) const override {}
+
+ void FindShortSuccessor(std::string* /*key*/) const override {}
+};
+} // anonymous namespace
+
+class ComparatorDBTest
+ : public testing::Test,
+ virtual public ::testing::WithParamInterface<uint32_t> {
+ private:
+ std::string dbname_;
+ Env* env_;
+ DB* db_;
+ Options last_options_;
+ std::unique_ptr<const Comparator> comparator_guard;
+
+ public:
+ ComparatorDBTest() : env_(Env::Default()), db_(nullptr) {
+ kTestComparator = BytewiseComparator();
+ dbname_ = test::PerThreadDBPath("comparator_db_test");
+ BlockBasedTableOptions toptions;
+ toptions.format_version = GetParam();
+ last_options_.table_factory.reset(
+ ROCKSDB_NAMESPACE::NewBlockBasedTableFactory(toptions));
+ EXPECT_OK(DestroyDB(dbname_, last_options_));
+ }
+
+ ~ComparatorDBTest() override {
+ delete db_;
+ EXPECT_OK(DestroyDB(dbname_, last_options_));
+ kTestComparator = BytewiseComparator();
+ }
+
+ DB* GetDB() { return db_; }
+
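+  // Installs `cmp` as the comparator used by the test. Ownership of `cmp` is
+  // taken only when `owner` is true; pass false for singleton comparators
+  // such as test::Uint64Comparator().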
+ void SetOwnedComparator(const Comparator* cmp, bool owner = true) {
+ if (owner) {
+ comparator_guard.reset(cmp);
+ } else {
+ comparator_guard.reset();
+ }
+ kTestComparator = cmp;
+ last_options_.comparator = cmp;
+ }
+
+ // Return the current option configuration.
+ Options* GetOptions() { return &last_options_; }
+
+ void DestroyAndReopen() {
+ // Destroy using last options
+ Destroy();
+ ASSERT_OK(TryReopen());
+ }
+
+ void Destroy() {
+ delete db_;
+ db_ = nullptr;
+ ASSERT_OK(DestroyDB(dbname_, last_options_));
+ }
+
+ Status TryReopen() {
+ delete db_;
+ db_ = nullptr;
+ last_options_.create_if_missing = true;
+
+ return DB::Open(last_options_, dbname_, &db_);
+ }
+};
+
+INSTANTIATE_TEST_CASE_P(FormatDef, ComparatorDBTest,
+ testing::Values(test::kDefaultFormatVersion));
+INSTANTIATE_TEST_CASE_P(FormatLatest, ComparatorDBTest,
+ testing::Values(kLatestFormatVersion));
+
+TEST_P(ComparatorDBTest, Bytewise) {
+ for (int rand_seed = 301; rand_seed < 306; rand_seed++) {
+ DestroyAndReopen();
+ Random rnd(rand_seed);
+ DoRandomIteraratorTest(GetDB(),
+ {"a", "b", "c", "d", "e", "f", "g", "h", "i"}, &rnd,
+ 8, 100, 3);
+ }
+}
+
+TEST_P(ComparatorDBTest, SimpleSuffixReverseComparator) {
+ SetOwnedComparator(new test::SimpleSuffixReverseComparator());
+
+ for (int rnd_seed = 301; rnd_seed < 316; rnd_seed++) {
+ Options* opt = GetOptions();
+ opt->comparator = kTestComparator;
+ DestroyAndReopen();
+ Random rnd(rnd_seed);
+
+ std::vector<std::string> source_strings;
+ std::vector<std::string> source_prefixes;
+ // Randomly generate 5 prefixes
+ for (int i = 0; i < 5; i++) {
+ source_prefixes.push_back(rnd.HumanReadableString(8));
+ }
+ for (int j = 0; j < 20; j++) {
+ int prefix_index = rnd.Uniform(static_cast<int>(source_prefixes.size()));
+ std::string key = source_prefixes[prefix_index] +
+ rnd.HumanReadableString(rnd.Uniform(8));
+ source_strings.push_back(key);
+ }
+
+ DoRandomIteraratorTest(GetDB(), source_strings, &rnd, 30, 600, 66);
+ }
+}
+
+TEST_P(ComparatorDBTest, Uint64Comparator) {
+ SetOwnedComparator(test::Uint64Comparator(), false /* owner */);
+
+ for (int rnd_seed = 301; rnd_seed < 316; rnd_seed++) {
+ Options* opt = GetOptions();
+ opt->comparator = kTestComparator;
+ DestroyAndReopen();
+ Random rnd(rnd_seed);
+ Random64 rnd64(rnd_seed);
+
+ std::vector<std::string> source_strings;
+ // Randomly generate source keys
+ for (int i = 0; i < 100; i++) {
+ uint64_t r = rnd64.Next();
+ std::string str;
+ str.resize(8);
+ memcpy(&str[0], static_cast<void*>(&r), 8);
+ source_strings.push_back(str);
+ }
+
+ DoRandomIteraratorTest(GetDB(), source_strings, &rnd, 200, 1000, 66);
+ }
+}
+
+TEST_P(ComparatorDBTest, DoubleComparator) {
+ SetOwnedComparator(new DoubleComparator());
+
+ for (int rnd_seed = 301; rnd_seed < 316; rnd_seed++) {
+ Options* opt = GetOptions();
+ opt->comparator = kTestComparator;
+ DestroyAndReopen();
+ Random rnd(rnd_seed);
+
+ std::vector<std::string> source_strings;
+ // Randomly generate source keys
+ for (int i = 0; i < 100; i++) {
+ uint32_t r = rnd.Next();
+ uint32_t divide_order = rnd.Uniform(8);
+ double to_divide = 1.0;
+ for (uint32_t j = 0; j < divide_order; j++) {
+ to_divide *= 10.0;
+ }
+ source_strings.push_back(std::to_string(r / to_divide));
+ }
+
+ DoRandomIteraratorTest(GetDB(), source_strings, &rnd, 200, 1000, 66);
+ }
+}
+
+TEST_P(ComparatorDBTest, HashComparator) {
+ SetOwnedComparator(new HashComparator());
+
+ for (int rnd_seed = 301; rnd_seed < 316; rnd_seed++) {
+ Options* opt = GetOptions();
+ opt->comparator = kTestComparator;
+ DestroyAndReopen();
+ Random rnd(rnd_seed);
+
+ std::vector<std::string> source_strings;
+ // Randomly generate source keys
+ for (int i = 0; i < 100; i++) {
+ source_strings.push_back(test::RandomKey(&rnd, 8));
+ }
+
+ DoRandomIteraratorTest(GetDB(), source_strings, &rnd, 200, 1000, 66);
+ }
+}
+
+TEST_P(ComparatorDBTest, TwoStrComparator) {
+ SetOwnedComparator(new TwoStrComparator());
+
+ for (int rnd_seed = 301; rnd_seed < 316; rnd_seed++) {
+ Options* opt = GetOptions();
+ opt->comparator = kTestComparator;
+ DestroyAndReopen();
+ Random rnd(rnd_seed);
+
+ std::vector<std::string> source_strings;
+ // Randomly generate source keys
+ for (int i = 0; i < 100; i++) {
+ std::string str;
+ uint32_t size1 = rnd.Uniform(8);
+ uint32_t size2 = rnd.Uniform(8);
+ str.append(1, static_cast<char>(size1));
+ str.append(1, static_cast<char>(size2));
+ str.append(test::RandomKey(&rnd, size1));
+ str.append(test::RandomKey(&rnd, size2));
+ source_strings.push_back(str);
+ }
+
+ DoRandomIteraratorTest(GetDB(), source_strings, &rnd, 200, 1000, 66);
+ }
+}
+
+namespace {
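+// Helpers asserting the expected IsSameLengthImmediateSuccessor() results for
+// both argument orders under the bytewise and reverse-bytewise comparators.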
+void VerifyNotSuccessor(const Slice& s, const Slice& t) {
+ auto bc = BytewiseComparator();
+ auto rbc = ReverseBytewiseComparator();
+ ASSERT_FALSE(bc->IsSameLengthImmediateSuccessor(s, t));
+ ASSERT_FALSE(rbc->IsSameLengthImmediateSuccessor(s, t));
+ ASSERT_FALSE(bc->IsSameLengthImmediateSuccessor(t, s));
+ ASSERT_FALSE(rbc->IsSameLengthImmediateSuccessor(t, s));
+}
+
+void VerifySuccessor(const Slice& s, const Slice& t) {
+ auto bc = BytewiseComparator();
+ auto rbc = ReverseBytewiseComparator();
+ ASSERT_TRUE(bc->IsSameLengthImmediateSuccessor(s, t));
+ ASSERT_FALSE(rbc->IsSameLengthImmediateSuccessor(s, t));
+ ASSERT_FALSE(bc->IsSameLengthImmediateSuccessor(t, s));
+ // Should be true but that increases exposure to a design bug in
+ // auto_prefix_mode, so currently set to FALSE
+ ASSERT_FALSE(rbc->IsSameLengthImmediateSuccessor(t, s));
+}
+
+} // anonymous namespace
+
+TEST_P(ComparatorDBTest, IsSameLengthImmediateSuccessor) {
+ {
+ // different length
+ Slice s("abcxy");
+ Slice t("abcxyz");
+ VerifyNotSuccessor(s, t);
+ }
+ {
+ Slice s("abcxyz");
+ Slice t("abcxy");
+ VerifyNotSuccessor(s, t);
+ }
+ {
+ // not last byte different
+ Slice s("abc1xyz");
+ Slice t("abc2xyz");
+ VerifyNotSuccessor(s, t);
+ }
+ {
+ // same string
+ Slice s("abcxyz");
+ Slice t("abcxyz");
+ VerifyNotSuccessor(s, t);
+ }
+ {
+ Slice s("abcxy");
+ Slice t("abcxz");
+ VerifySuccessor(s, t);
+ }
+ {
+ const char s_array[] = "\x50\x8a\xac";
+ const char t_array[] = "\x50\x8a\xad";
+ Slice s(s_array);
+ Slice t(t_array);
+ VerifySuccessor(s, t);
+ }
+ {
+ const char s_array[] = "\x50\x8a\xff";
+ const char t_array[] = "\x50\x8b\x00";
+ Slice s(s_array, 3);
+ Slice t(t_array, 3);
+ VerifySuccessor(s, t);
+ }
+ {
+ const char s_array[] = "\x50\x8a\xff\xff";
+ const char t_array[] = "\x50\x8b\x00\x00";
+ Slice s(s_array, 4);
+ Slice t(t_array, 4);
+ VerifySuccessor(s, t);
+ }
+ {
+ const char s_array[] = "\x50\x8a\xff\xff";
+ const char t_array[] = "\x50\x8b\x00\x01";
+ Slice s(s_array, 4);
+ Slice t(t_array, 4);
+ VerifyNotSuccessor(s, t);
+ }
+}
+
+TEST_P(ComparatorDBTest, FindShortestSeparator) {
+ std::string s1 = "abc1xyz";
+ std::string s2 = "abc3xy";
+
+ BytewiseComparator()->FindShortestSeparator(&s1, s2);
+ ASSERT_EQ("abc2", s1);
+
+ s1 = "abc5xyztt";
+
+ ReverseBytewiseComparator()->FindShortestSeparator(&s1, s2);
+ ASSERT_EQ("abc5", s1);
+
+ s1 = "abc3";
+ s2 = "abc2xy";
+ ReverseBytewiseComparator()->FindShortestSeparator(&s1, s2);
+ ASSERT_EQ("abc3", s1);
+
+ s1 = "abc3xyz";
+ s2 = "abc2xy";
+ ReverseBytewiseComparator()->FindShortestSeparator(&s1, s2);
+ ASSERT_EQ("abc3", s1);
+
+ s1 = "abc3xyz";
+ s2 = "abc2";
+ ReverseBytewiseComparator()->FindShortestSeparator(&s1, s2);
+ ASSERT_EQ("abc3", s1);
+
+ std::string old_s1 = s1 = "abc2xy";
+ s2 = "abc2";
+ ReverseBytewiseComparator()->FindShortestSeparator(&s1, s2);
+ ASSERT_TRUE(old_s1 >= s1);
+ ASSERT_TRUE(s1 > s2);
+}
+
+TEST_P(ComparatorDBTest, SeparatorSuccessorRandomizeTest) {
+ // Char list for boundary cases.
+ std::array<unsigned char, 6> char_list{{0, 1, 2, 253, 254, 255}};
+ Random rnd(301);
+
+ for (int attempts = 0; attempts < 1000; attempts++) {
+ uint32_t size1 = rnd.Skewed(4);
+ uint32_t size2;
+
+ if (rnd.OneIn(2)) {
+      // Let size2 be a random size
+ size2 = rnd.Skewed(4);
+ } else {
+      // size2 is within [-2, +2] of size1
+ int diff = static_cast<int>(rnd.Uniform(5)) - 2;
+ int tmp_size2 = static_cast<int>(size1) + diff;
+ if (tmp_size2 < 0) {
+ tmp_size2 = 0;
+ }
+ size2 = static_cast<uint32_t>(tmp_size2);
+ }
+
+ std::string s1;
+ std::string s2;
+ for (uint32_t i = 0; i < size1; i++) {
+ if (rnd.OneIn(2)) {
+ // Use random byte
+ s1 += static_cast<char>(rnd.Uniform(256));
+ } else {
+ // Use one byte in char_list
+ char c = static_cast<char>(char_list[rnd.Uniform(sizeof(char_list))]);
+ s1 += c;
+ }
+ }
+
+ // First set s2 to be the same as s1, and then modify s2.
+ s2 = s1;
+ s2.resize(size2);
+ // We start from the back of the string
+ if (size2 > 0) {
+ uint32_t pos = size2 - 1;
+ do {
+ if (pos >= size1 || rnd.OneIn(4)) {
+ // For 1/4 chance, use random byte
+ s2[pos] = static_cast<char>(rnd.Uniform(256));
+ } else if (rnd.OneIn(4)) {
+ // In 1/4 chance, stop here.
+ break;
+ } else {
+ // Create a char within [-2, +2] of the matching char of s1.
+ int diff = static_cast<int>(rnd.Uniform(5)) - 2;
+ // char may be signed or unsigned based on platform.
+ int s1_char = static_cast<int>(static_cast<unsigned char>(s1[pos]));
+ int s2_char = s1_char + diff;
+ if (s2_char < 0) {
+ s2_char = 0;
+ }
+ if (s2_char > 255) {
+ s2_char = 255;
+ }
+ s2[pos] = static_cast<char>(s2_char);
+ }
+ } while (pos-- != 0);
+ }
+
+ // Test separators
+ for (int rev = 0; rev < 2; rev++) {
+ if (rev == 1) {
+ // switch s1 and s2
+ std::string t = s1;
+ s1 = s2;
+ s2 = t;
+ }
+ std::string separator = s1;
+ BytewiseComparator()->FindShortestSeparator(&separator, s2);
+ std::string rev_separator = s1;
+ ReverseBytewiseComparator()->FindShortestSeparator(&rev_separator, s2);
+
+ if (s1 == s2) {
+ ASSERT_EQ(s1, separator);
+ ASSERT_EQ(s2, rev_separator);
+ } else if (s1 < s2) {
+ ASSERT_TRUE(s1 <= separator);
+ ASSERT_TRUE(s2 > separator);
+ ASSERT_LE(separator.size(), std::max(s1.size(), s2.size()));
+ ASSERT_EQ(s1, rev_separator);
+ } else {
+ ASSERT_TRUE(s1 >= rev_separator);
+ ASSERT_TRUE(s2 < rev_separator);
+ ASSERT_LE(rev_separator.size(), std::max(s1.size(), s2.size()));
+ ASSERT_EQ(s1, separator);
+ }
+ }
+
+ // Test successors
+ std::string succ = s1;
+ BytewiseComparator()->FindShortSuccessor(&succ);
+ ASSERT_TRUE(succ >= s1);
+
+ succ = s1;
+ ReverseBytewiseComparator()->FindShortSuccessor(&succ);
+ ASSERT_TRUE(succ <= s1);
+ }
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/convenience.cc b/src/rocksdb/db/convenience.cc
new file mode 100644
index 000000000..6344d356d
--- /dev/null
+++ b/src/rocksdb/db/convenience.cc
@@ -0,0 +1,81 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+
+#ifndef ROCKSDB_LITE
+
+#include "rocksdb/convenience.h"
+
+#include "db/db_impl/db_impl.h"
+#include "util/cast_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+void CancelAllBackgroundWork(DB* db, bool wait) {
+ (static_cast_with_check<DBImpl>(db->GetRootDB()))
+ ->CancelAllBackgroundWork(wait);
+}
+
+Status DeleteFilesInRange(DB* db, ColumnFamilyHandle* column_family,
+ const Slice* begin, const Slice* end,
+ bool include_end) {
+ RangePtr range(begin, end);
+ return DeleteFilesInRanges(db, column_family, &range, 1, include_end);
+}
+
+Status DeleteFilesInRanges(DB* db, ColumnFamilyHandle* column_family,
+ const RangePtr* ranges, size_t n, bool include_end) {
+ return (static_cast_with_check<DBImpl>(db->GetRootDB()))
+ ->DeleteFilesInRanges(column_family, ranges, n, include_end);
+}
+
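+// Convenience overload using default ReadOptions. A minimal usage sketch (not
+// part of the library; `sst_path` is a hypothetical path to an existing table
+// file written with compatible Options):
+//
+//   Options options;
+//   Status s = VerifySstFileChecksum(options, EnvOptions(), sst_path);
+//   if (!s.ok()) {
+//     // checksum mismatch or I/O error
+//   }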
+Status VerifySstFileChecksum(const Options& options,
+ const EnvOptions& env_options,
+ const std::string& file_path) {
+ return VerifySstFileChecksum(options, env_options, ReadOptions(), file_path);
+}
+Status VerifySstFileChecksum(const Options& options,
+ const EnvOptions& env_options,
+ const ReadOptions& read_options,
+ const std::string& file_path,
+ const SequenceNumber& largest_seqno) {
+ std::unique_ptr<FSRandomAccessFile> file;
+ uint64_t file_size;
+ InternalKeyComparator internal_comparator(options.comparator);
+ ImmutableOptions ioptions(options);
+
+ Status s = ioptions.fs->NewRandomAccessFile(
+ file_path, FileOptions(env_options), &file, nullptr);
+ if (s.ok()) {
+ s = ioptions.fs->GetFileSize(file_path, IOOptions(), &file_size, nullptr);
+ } else {
+ return s;
+ }
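+  // Build a TableReader over the file and delegate to its VerifyChecksum(),
+  // which validates the checksum of every block in the table.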
+ std::unique_ptr<TableReader> table_reader;
+ std::unique_ptr<RandomAccessFileReader> file_reader(
+ new RandomAccessFileReader(
+ std::move(file), file_path, ioptions.clock, nullptr /* io_tracer */,
+ nullptr /* stats */, 0 /* hist_type */, nullptr /* file_read_hist */,
+ ioptions.rate_limiter.get()));
+ const bool kImmortal = true;
+ auto reader_options = TableReaderOptions(
+ ioptions, options.prefix_extractor, env_options, internal_comparator,
+ false /* skip_filters */, !kImmortal, false /* force_direct_prefetch */,
+ -1 /* level */);
+ reader_options.largest_seqno = largest_seqno;
+ s = ioptions.table_factory->NewTableReader(
+ reader_options, std::move(file_reader), file_size, &table_reader,
+ false /* prefetch_index_and_filter_in_cache */);
+ if (!s.ok()) {
+ return s;
+ }
+ s = table_reader->VerifyChecksum(read_options,
+ TableReaderCaller::kUserVerifyChecksum);
+ return s;
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/db/corruption_test.cc b/src/rocksdb/db/corruption_test.cc
new file mode 100644
index 000000000..8ccac6130
--- /dev/null
+++ b/src/rocksdb/db/corruption_test.cc
@@ -0,0 +1,1587 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "rocksdb/options.h"
+#ifndef ROCKSDB_LITE
+
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+
+#include <cinttypes>
+
+#include "db/db_impl/db_impl.h"
+#include "db/db_test_util.h"
+#include "db/log_format.h"
+#include "db/version_set.h"
+#include "file/filename.h"
+#include "port/stack_trace.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/table.h"
+#include "rocksdb/utilities/transaction_db.h"
+#include "rocksdb/write_batch.h"
+#include "table/block_based/block_based_table_builder.h"
+#include "table/meta_blocks.h"
+#include "table/mock_table.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/cast_util.h"
+#include "util/random.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+static constexpr int kValueSize = 1000;
+namespace {
+// A wrapper that allows injection of errors.
+class ErrorEnv : public EnvWrapper {
+ public:
+ bool writable_file_error_;
+ int num_writable_file_errors_;
+
+ explicit ErrorEnv(Env* _target)
+ : EnvWrapper(_target),
+ writable_file_error_(false),
+ num_writable_file_errors_(0) {}
+ const char* Name() const override { return "ErrorEnv"; }
+
+ virtual Status NewWritableFile(const std::string& fname,
+ std::unique_ptr<WritableFile>* result,
+ const EnvOptions& soptions) override {
+ result->reset();
+ if (writable_file_error_) {
+ ++num_writable_file_errors_;
+ return Status::IOError(fname, "fake error");
+ }
+ return target()->NewWritableFile(fname, result, soptions);
+ }
+};
+} // anonymous namespace
+class CorruptionTest : public testing::Test {
+ public:
+ std::shared_ptr<Env> env_guard_;
+ ErrorEnv* env_;
+ std::string dbname_;
+ std::shared_ptr<Cache> tiny_cache_;
+ Options options_;
+ DB* db_;
+
+ CorruptionTest() {
+    // If the LRU cache shard bits value is smaller than 2 (or -1, which will
+    // automatically set it to 0), test SequenceNumberRecovery will fail,
+    // likely because of a bug in recovery code. Keep it at 4 for now to make
+    // the test pass.
+ tiny_cache_ = NewLRUCache(100, 4);
+ Env* base_env = Env::Default();
+ EXPECT_OK(
+ test::CreateEnvFromSystem(ConfigOptions(), &base_env, &env_guard_));
+ EXPECT_NE(base_env, nullptr);
+ env_ = new ErrorEnv(base_env);
+ options_.wal_recovery_mode = WALRecoveryMode::kTolerateCorruptedTailRecords;
+ options_.env = env_;
+ dbname_ = test::PerThreadDBPath(env_, "corruption_test");
+ Status s = DestroyDB(dbname_, options_);
+ EXPECT_OK(s);
+
+ db_ = nullptr;
+ options_.create_if_missing = true;
+ BlockBasedTableOptions table_options;
+ table_options.block_size_deviation = 0; // make unit test pass for now
+ options_.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ Reopen();
+ options_.create_if_missing = false;
+ }
+
+ ~CorruptionTest() override {
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->LoadDependency({});
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ delete db_;
+ db_ = nullptr;
+ if (getenv("KEEP_DB")) {
+ fprintf(stdout, "db is still at %s\n", dbname_.c_str());
+ } else {
+ Options opts;
+ opts.env = env_->target();
+ EXPECT_OK(DestroyDB(dbname_, opts));
+ }
+ delete env_;
+ }
+
+ void CloseDb() {
+ delete db_;
+ db_ = nullptr;
+ }
+
+ Status TryReopen(Options* options = nullptr) {
+ delete db_;
+ db_ = nullptr;
+ Options opt = (options ? *options : options_);
+ if (opt.env == Options().env) {
+ // If env is not overridden, replace it with ErrorEnv.
+ // Otherwise, the test already uses a non-default Env.
+ opt.env = env_;
+ }
+ opt.arena_block_size = 4096;
+ BlockBasedTableOptions table_options;
+ table_options.block_cache = tiny_cache_;
+ table_options.block_size_deviation = 0;
+ opt.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ return DB::Open(opt, dbname_, &db_);
+ }
+
+ void Reopen(Options* options = nullptr) { ASSERT_OK(TryReopen(options)); }
+
+ void RepairDB() {
+ delete db_;
+ db_ = nullptr;
+ ASSERT_OK(::ROCKSDB_NAMESPACE::RepairDB(dbname_, options_));
+ }
+
+ void Build(int n, int start, int flush_every) {
+ std::string key_space, value_space;
+ WriteBatch batch;
+ for (int i = 0; i < n; i++) {
+ if (flush_every != 0 && i != 0 && i % flush_every == 0) {
+ DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
+ ASSERT_OK(dbi->TEST_FlushMemTable());
+ }
+ // if ((i % 100) == 0) fprintf(stderr, "@ %d of %d\n", i, n);
+ Slice key = Key(i + start, &key_space);
+ batch.Clear();
+ ASSERT_OK(batch.Put(key, Value(i + start, &value_space)));
+ ASSERT_OK(db_->Write(WriteOptions(), &batch));
+ }
+ }
+
+ void Build(int n, int flush_every = 0) { Build(n, 0, flush_every); }
+
+ void Check(int min_expected, int max_expected) {
+ uint64_t next_expected = 0;
+ uint64_t missed = 0;
+ int bad_keys = 0;
+ int bad_values = 0;
+ int correct = 0;
+ std::string value_space;
+ // Do not verify checksums. If we verify checksums then the
+ // db itself will raise errors because data is corrupted.
+ // Instead, we want the reads to be successful and this test
+ // will detect whether the appropriate corruptions have
+ // occurred.
+ Iterator* iter = db_->NewIterator(ReadOptions(false, true));
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ ASSERT_OK(iter->status());
+ uint64_t key;
+ Slice in(iter->key());
+ if (!ConsumeDecimalNumber(&in, &key) || !in.empty() ||
+ key < next_expected) {
+ bad_keys++;
+ continue;
+ }
+ missed += (key - next_expected);
+ next_expected = key + 1;
+ if (iter->value() != Value(static_cast<int>(key), &value_space)) {
+ bad_values++;
+ } else {
+ correct++;
+ }
+ }
+ iter->status().PermitUncheckedError();
+ delete iter;
+
+ fprintf(
+ stderr,
+ "expected=%d..%d; got=%d; bad_keys=%d; bad_values=%d; missed=%llu\n",
+ min_expected, max_expected, correct, bad_keys, bad_values,
+ static_cast<unsigned long long>(missed));
+ ASSERT_LE(min_expected, correct);
+ ASSERT_GE(max_expected, correct);
+ }
+
+ void Corrupt(FileType filetype, int offset, int bytes_to_corrupt) {
+ // Pick file to corrupt
+ std::vector<std::string> filenames;
+ ASSERT_OK(env_->GetChildren(dbname_, &filenames));
+ uint64_t number;
+ FileType type;
+ std::string fname;
+ int picked_number = -1;
+ for (size_t i = 0; i < filenames.size(); i++) {
+ if (ParseFileName(filenames[i], &number, &type) && type == filetype &&
+ static_cast<int>(number) > picked_number) { // Pick latest file
+ fname = dbname_ + "/" + filenames[i];
+ picked_number = static_cast<int>(number);
+ }
+ }
+ ASSERT_TRUE(!fname.empty()) << filetype;
+
+ ASSERT_OK(test::CorruptFile(env_, fname, offset, bytes_to_corrupt));
+ }
+
+  // Corrupts exactly one file at level `level`. Asserts if no file is found
+  // at that level.
+ void CorruptTableFileAtLevel(int level, int offset, int bytes_to_corrupt) {
+ std::vector<LiveFileMetaData> metadata;
+ db_->GetLiveFilesMetaData(&metadata);
+ for (const auto& m : metadata) {
+ if (m.level == level) {
+ ASSERT_OK(test::CorruptFile(env_, dbname_ + "/" + m.name, offset,
+ bytes_to_corrupt));
+ return;
+ }
+ }
+ FAIL() << "no file found at level";
+ }
+
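+  // Returns the integer value of DB property `name`, or -1 if the property is
+  // unavailable or not numeric.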
+ int Property(const std::string& name) {
+ std::string property;
+ int result;
+ if (db_->GetProperty(name, &property) &&
+ sscanf(property.c_str(), "%d", &result) == 1) {
+ return result;
+ } else {
+ return -1;
+ }
+ }
+
+ // Return the ith key
+ Slice Key(int i, std::string* storage) {
+ char buf[100];
+ snprintf(buf, sizeof(buf), "%016d", i);
+ storage->assign(buf, strlen(buf));
+ return Slice(*storage);
+ }
+
+ // Return the value to associate with the specified key
+ Slice Value(int k, std::string* storage) {
+ if (k == 0) {
+ // Ugh. Random seed of 0 used to produce no entropy. This code
+ // preserves the implementation that was in place when all of the
+ // magic values in this file were picked.
+ *storage = std::string(kValueSize, ' ');
+ } else {
+ Random r(k);
+ *storage = r.RandomString(kValueSize);
+ }
+ return Slice(*storage);
+ }
+
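+  // Collects the file numbers of all WAL files in the DB directory, sorted in
+  // ascending order.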
+ void GetSortedWalFiles(std::vector<uint64_t>& file_nums) {
+ std::vector<std::string> tmp_files;
+ ASSERT_OK(env_->GetChildren(dbname_, &tmp_files));
+ FileType type = kWalFile;
+ for (const auto& file : tmp_files) {
+ uint64_t number = 0;
+ if (ParseFileName(file, &number, &type) && type == kWalFile) {
+ file_nums.push_back(number);
+ }
+ }
+ std::sort(file_nums.begin(), file_nums.end());
+ }
+
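+  // Truncates the given WAL file by `bytes_to_truncate` bytes (0 truncates the
+  // file to empty) to simulate an incomplete write, e.g. after a power loss.
+  // Only kWalFile is currently supported.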
+ void CorruptFileWithTruncation(FileType file, uint64_t number,
+ uint64_t bytes_to_truncate = 0) {
+ std::string path;
+ switch (file) {
+ case FileType::kWalFile:
+ path = LogFileName(dbname_, number);
+ break;
+ // TODO: Add other file types as this method is being used for those file
+ // types.
+ default:
+ return;
+ }
+ uint64_t old_size = 0;
+ ASSERT_OK(env_->GetFileSize(path, &old_size));
+ assert(old_size > bytes_to_truncate);
+ uint64_t new_size = old_size - bytes_to_truncate;
+ // If bytes_to_truncate == 0, it will do full truncation.
+ if (bytes_to_truncate == 0) {
+ new_size = 0;
+ }
+ ASSERT_OK(test::TruncateFile(env_, path, new_size));
+ }
+};
+
+TEST_F(CorruptionTest, Recovery) {
+ Build(100);
+ Check(100, 100);
+#ifdef OS_WIN
+  // On Windows the OS disk cache does not behave properly:
+  // we do not call FlushBuffers on every Flush. If we do not close
+  // the log file prior to the corruption, we end up with the first
+  // block not corrupted but only the second. Under the debugger
+  // things work just fine, but the test never passes when running normally.
+  // For that reason one may want to run with unbuffered I/O, but that option
+  // is not available for the WAL.
+ CloseDb();
+#endif
+ Corrupt(kWalFile, 19, 1); // WriteBatch tag for first record
+ Corrupt(kWalFile, log::kBlockSize + 1000, 1); // Somewhere in second block
+ ASSERT_TRUE(!TryReopen().ok());
+ options_.paranoid_checks = false;
+ Reopen(&options_);
+
+ // The 64 records in the first two log blocks are completely lost.
+ Check(36, 36);
+}
+
+TEST_F(CorruptionTest, PostPITRCorruptionWALsRetained) {
+ // Repro for bug where WALs following the point-in-time recovery were not
+ // retained leading to the next recovery failing.
+ CloseDb();
+
+ options_.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;
+
+ const std::string test_cf_name = "test_cf";
+ std::vector<ColumnFamilyDescriptor> cf_descs;
+ cf_descs.emplace_back(kDefaultColumnFamilyName, ColumnFamilyOptions());
+ cf_descs.emplace_back(test_cf_name, ColumnFamilyOptions());
+
+ uint64_t log_num;
+ {
+ options_.create_missing_column_families = true;
+ std::vector<ColumnFamilyHandle*> cfhs;
+ ASSERT_OK(DB::Open(options_, dbname_, cf_descs, &cfhs, &db_));
+ assert(db_ != nullptr); // suppress false clang-analyze report
+
+ ASSERT_OK(db_->Put(WriteOptions(), cfhs[0], "k", "v"));
+ ASSERT_OK(db_->Put(WriteOptions(), cfhs[1], "k", "v"));
+ ASSERT_OK(db_->Put(WriteOptions(), cfhs[0], "k2", "v2"));
+ std::vector<uint64_t> file_nums;
+ GetSortedWalFiles(file_nums);
+ log_num = file_nums.back();
+ for (auto* cfh : cfhs) {
+ delete cfh;
+ }
+ CloseDb();
+ }
+
+ CorruptFileWithTruncation(FileType::kWalFile, log_num,
+ /*bytes_to_truncate=*/1);
+
+ {
+ // Recover "k" -> "v" for both CFs. "k2" -> "v2" is lost due to truncation.
+ options_.avoid_flush_during_recovery = true;
+ std::vector<ColumnFamilyHandle*> cfhs;
+ ASSERT_OK(DB::Open(options_, dbname_, cf_descs, &cfhs, &db_));
+ assert(db_ != nullptr); // suppress false clang-analyze report
+
+ // Flush one but not both CFs and write some data so there's a seqno gap
+ // between the PITR corruption and the next DB session's first WAL.
+ ASSERT_OK(db_->Put(WriteOptions(), cfhs[1], "k2", "v2"));
+ ASSERT_OK(db_->Flush(FlushOptions(), cfhs[1]));
+
+ for (auto* cfh : cfhs) {
+ delete cfh;
+ }
+ CloseDb();
+ }
+
+ // With the bug, this DB open would remove the WALs following the PITR
+ // corruption. Then, the next recovery would fail.
+ for (int i = 0; i < 2; ++i) {
+ std::vector<ColumnFamilyHandle*> cfhs;
+ ASSERT_OK(DB::Open(options_, dbname_, cf_descs, &cfhs, &db_));
+ assert(db_ != nullptr); // suppress false clang-analyze report
+
+ for (auto* cfh : cfhs) {
+ delete cfh;
+ }
+ CloseDb();
+ }
+}
+
+TEST_F(CorruptionTest, RecoverWriteError) {
+ env_->writable_file_error_ = true;
+ Status s = TryReopen();
+ ASSERT_TRUE(!s.ok());
+}
+
+TEST_F(CorruptionTest, NewFileErrorDuringWrite) {
+ // Do enough writing to force minor compaction
+ env_->writable_file_error_ = true;
+ const int num =
+ static_cast<int>(3 + (Options().write_buffer_size / kValueSize));
+ std::string value_storage;
+ Status s;
+ bool failed = false;
+ for (int i = 0; i < num; i++) {
+ WriteBatch batch;
+ ASSERT_OK(batch.Put("a", Value(100, &value_storage)));
+ s = db_->Write(WriteOptions(), &batch);
+ if (!s.ok()) {
+ failed = true;
+ }
+ ASSERT_TRUE(!failed || !s.ok());
+ }
+ ASSERT_TRUE(!s.ok());
+ ASSERT_GE(env_->num_writable_file_errors_, 1);
+ env_->writable_file_error_ = false;
+ Reopen();
+}
+
+TEST_F(CorruptionTest, TableFile) {
+ Build(100);
+ DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
+ ASSERT_OK(dbi->TEST_FlushMemTable());
+ ASSERT_OK(dbi->TEST_CompactRange(0, nullptr, nullptr));
+ ASSERT_OK(dbi->TEST_CompactRange(1, nullptr, nullptr));
+
+ Corrupt(kTableFile, 100, 1);
+ Check(99, 99);
+ ASSERT_NOK(dbi->VerifyChecksum());
+}
+
+TEST_F(CorruptionTest, VerifyChecksumReadahead) {
+ Options options;
+ SpecialEnv senv(env_->target());
+ options.env = &senv;
+ // Disable block cache as we are going to check checksum for
+ // the same file twice and measure number of reads.
+ BlockBasedTableOptions table_options_no_bc;
+ table_options_no_bc.no_block_cache = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options_no_bc));
+
+ Reopen(&options);
+
+ Build(10000);
+ DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
+ ASSERT_OK(dbi->TEST_FlushMemTable());
+ ASSERT_OK(dbi->TEST_CompactRange(0, nullptr, nullptr));
+ ASSERT_OK(dbi->TEST_CompactRange(1, nullptr, nullptr));
+
+ senv.count_random_reads_ = true;
+ senv.random_read_counter_.Reset();
+ ASSERT_OK(dbi->VerifyChecksum());
+
+ // Make sure the counter is enabled.
+ ASSERT_GT(senv.random_read_counter_.Read(), 0);
+
+ // The SST file is about 10MB. Default readahead size is 256KB.
+  // Give a conservative 20 reads for metadata blocks. The number
+  // of random reads should be within 10 MB / 256KB + 20 = 60.
+ ASSERT_LT(senv.random_read_counter_.Read(), 60);
+
+ senv.random_read_bytes_counter_ = 0;
+ ReadOptions ro;
+ ro.readahead_size = size_t{32 * 1024};
+ ASSERT_OK(dbi->VerifyChecksum(ro));
+ // The SST file is about 10MB. We set readahead size to 32KB.
+  // Give 0 to 20 reads for metadata blocks, and allow real reads
+  // to range from 24KB to 48KB. The lower bound would be:
+ // 10MB / 48KB + 0 = 213
+ // The higher bound is
+ // 10MB / 24KB + 20 = 447.
+ ASSERT_GE(senv.random_read_counter_.Read(), 213);
+ ASSERT_LE(senv.random_read_counter_.Read(), 447);
+
+ // Test readahead shouldn't break mmap mode (where it should be
+ // disabled).
+ options.allow_mmap_reads = true;
+ Reopen(&options);
+ dbi = static_cast<DBImpl*>(db_);
+ ASSERT_OK(dbi->VerifyChecksum(ro));
+
+ CloseDb();
+}
+
+TEST_F(CorruptionTest, TableFileIndexData) {
+ Options options;
+ // very big, we'll trigger flushes manually
+ options.write_buffer_size = 100 * 1024 * 1024;
+ Reopen(&options);
+ // build 2 tables, flush at 5000
+ Build(10000, 5000);
+ DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
+ ASSERT_OK(dbi->TEST_FlushMemTable());
+
+ // corrupt an index block of an entire file
+ Corrupt(kTableFile, -2000, 500);
+ options.paranoid_checks = false;
+ Reopen(&options);
+ dbi = static_cast_with_check<DBImpl>(db_);
+  // One full file may be readable, since only one was corrupted;
+  // the other file should be fully non-readable, since its index was corrupted.
+ Check(0, 5000);
+ ASSERT_NOK(dbi->VerifyChecksum());
+
+ // In paranoid mode, the db cannot be opened due to the corrupted file.
+ ASSERT_TRUE(TryReopen().IsCorruption());
+}
+
+TEST_F(CorruptionTest, MissingDescriptor) {
+ Build(1000);
+ RepairDB();
+ Reopen();
+ Check(1000, 1000);
+}
+
+TEST_F(CorruptionTest, SequenceNumberRecovery) {
+ ASSERT_OK(db_->Put(WriteOptions(), "foo", "v1"));
+ ASSERT_OK(db_->Put(WriteOptions(), "foo", "v2"));
+ ASSERT_OK(db_->Put(WriteOptions(), "foo", "v3"));
+ ASSERT_OK(db_->Put(WriteOptions(), "foo", "v4"));
+ ASSERT_OK(db_->Put(WriteOptions(), "foo", "v5"));
+ RepairDB();
+ Reopen();
+ std::string v;
+ ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
+ ASSERT_EQ("v5", v);
+ // Write something. If sequence number was not recovered properly,
+ // it will be hidden by an earlier write.
+ ASSERT_OK(db_->Put(WriteOptions(), "foo", "v6"));
+ ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
+ ASSERT_EQ("v6", v);
+ Reopen();
+ ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
+ ASSERT_EQ("v6", v);
+}
+
+TEST_F(CorruptionTest, CorruptedDescriptor) {
+ ASSERT_OK(db_->Put(WriteOptions(), "foo", "hello"));
+ DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
+ ASSERT_OK(dbi->TEST_FlushMemTable());
+ CompactRangeOptions cro;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+ ASSERT_OK(
+ dbi->CompactRange(cro, dbi->DefaultColumnFamily(), nullptr, nullptr));
+
+ Corrupt(kDescriptorFile, 0, 1000);
+ Status s = TryReopen();
+ ASSERT_TRUE(!s.ok());
+
+ RepairDB();
+ Reopen();
+ std::string v;
+ ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
+ ASSERT_EQ("hello", v);
+}
+
+TEST_F(CorruptionTest, CompactionInputError) {
+ Options options;
+ options.env = env_;
+ Reopen(&options);
+ Build(10);
+ DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
+ ASSERT_OK(dbi->TEST_FlushMemTable());
+ ASSERT_OK(dbi->TEST_CompactRange(0, nullptr, nullptr));
+ ASSERT_OK(dbi->TEST_CompactRange(1, nullptr, nullptr));
+ ASSERT_EQ(1, Property("rocksdb.num-files-at-level2"));
+
+ Corrupt(kTableFile, 100, 1);
+ Check(9, 9);
+ ASSERT_NOK(dbi->VerifyChecksum());
+
+ // Force compactions by writing lots of values
+ Build(10000);
+ Check(10000, 10000);
+ ASSERT_NOK(dbi->VerifyChecksum());
+}
+
+TEST_F(CorruptionTest, CompactionInputErrorParanoid) {
+ Options options;
+ options.env = env_;
+ options.paranoid_checks = true;
+ options.write_buffer_size = 131072;
+ options.max_write_buffer_number = 2;
+ Reopen(&options);
+ DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
+
+ // Fill levels >= 1
+ for (int level = 1; level < dbi->NumberLevels(); level++) {
+ ASSERT_OK(dbi->Put(WriteOptions(), "", "begin"));
+ ASSERT_OK(dbi->Put(WriteOptions(), "~", "end"));
+ ASSERT_OK(dbi->TEST_FlushMemTable());
+ for (int comp_level = 0; comp_level < dbi->NumberLevels() - level;
+ ++comp_level) {
+ ASSERT_OK(dbi->TEST_CompactRange(comp_level, nullptr, nullptr));
+ }
+ }
+
+ Reopen(&options);
+
+ dbi = static_cast_with_check<DBImpl>(db_);
+ Build(10);
+ ASSERT_OK(dbi->TEST_FlushMemTable());
+ ASSERT_OK(dbi->TEST_WaitForCompact());
+ ASSERT_EQ(1, Property("rocksdb.num-files-at-level0"));
+
+ CorruptTableFileAtLevel(0, 100, 1);
+ Check(9, 9);
+ ASSERT_NOK(dbi->VerifyChecksum());
+
+ // Write must eventually fail because of corrupted table
+ Status s;
+ std::string tmp1, tmp2;
+ bool failed = false;
+ for (int i = 0; i < 10000; i++) {
+ s = db_->Put(WriteOptions(), Key(i, &tmp1), Value(i, &tmp2));
+ if (!s.ok()) {
+ failed = true;
+ }
+ // if one write failed, every subsequent write must fail, too
+ ASSERT_TRUE(!failed || !s.ok()) << "write did not fail in a corrupted db";
+ }
+ ASSERT_TRUE(!s.ok()) << "write did not fail in corrupted paranoid db";
+}
+
+TEST_F(CorruptionTest, UnrelatedKeys) {
+ Build(10);
+ DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
+ ASSERT_OK(dbi->TEST_FlushMemTable());
+ Corrupt(kTableFile, 100, 1);
+ ASSERT_NOK(dbi->VerifyChecksum());
+
+ std::string tmp1, tmp2;
+ ASSERT_OK(db_->Put(WriteOptions(), Key(1000, &tmp1), Value(1000, &tmp2)));
+ std::string v;
+ ASSERT_OK(db_->Get(ReadOptions(), Key(1000, &tmp1), &v));
+ ASSERT_EQ(Value(1000, &tmp2).ToString(), v);
+ ASSERT_OK(dbi->TEST_FlushMemTable());
+ ASSERT_OK(db_->Get(ReadOptions(), Key(1000, &tmp1), &v));
+ ASSERT_EQ(Value(1000, &tmp2).ToString(), v);
+}
+
+TEST_F(CorruptionTest, RangeDeletionCorrupted) {
+ ASSERT_OK(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "b"));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ std::vector<LiveFileMetaData> metadata;
+ db_->GetLiveFilesMetaData(&metadata);
+ ASSERT_EQ(static_cast<size_t>(1), metadata.size());
+ std::string filename = dbname_ + metadata[0].name;
+
+ FileOptions file_opts;
+ const auto& fs = options_.env->GetFileSystem();
+ std::unique_ptr<RandomAccessFileReader> file_reader;
+ ASSERT_OK(RandomAccessFileReader::Create(fs, filename, file_opts,
+ &file_reader, nullptr));
+
+ uint64_t file_size;
+ ASSERT_OK(
+ fs->GetFileSize(filename, file_opts.io_options, &file_size, nullptr));
+
+ BlockHandle range_del_handle;
+ ASSERT_OK(FindMetaBlockInFile(
+ file_reader.get(), file_size, kBlockBasedTableMagicNumber,
+ ImmutableOptions(options_), kRangeDelBlockName, &range_del_handle));
+
+ ASSERT_OK(TryReopen());
+ ASSERT_OK(test::CorruptFile(env_, filename,
+ static_cast<int>(range_del_handle.offset()), 1));
+ ASSERT_TRUE(TryReopen().IsCorruption());
+}
+
+TEST_F(CorruptionTest, FileSystemStateCorrupted) {
+ for (int iter = 0; iter < 2; ++iter) {
+ Options options;
+ options.env = env_;
+ options.paranoid_checks = true;
+ options.create_if_missing = true;
+ Reopen(&options);
+ Build(10);
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
+ std::vector<LiveFileMetaData> metadata;
+ dbi->GetLiveFilesMetaData(&metadata);
+ ASSERT_GT(metadata.size(), 0);
+ std::string filename = dbname_ + metadata[0].name;
+
+ delete db_;
+ db_ = nullptr;
+
+ if (iter == 0) { // corrupt file size
+ std::unique_ptr<WritableFile> file;
+ ASSERT_OK(env_->NewWritableFile(filename, &file, EnvOptions()));
+ ASSERT_OK(file->Append(Slice("corrupted sst")));
+ file.reset();
+ Status x = TryReopen(&options);
+ ASSERT_TRUE(x.IsCorruption());
+ } else { // delete the file
+ ASSERT_OK(env_->DeleteFile(filename));
+ Status x = TryReopen(&options);
+ ASSERT_TRUE(x.IsCorruption());
+ }
+
+ ASSERT_OK(DestroyDB(dbname_, options_));
+ }
+}
+
+static const auto& corruption_modes = {
+ mock::MockTableFactory::kCorruptNone, mock::MockTableFactory::kCorruptKey,
+ mock::MockTableFactory::kCorruptValue,
+ mock::MockTableFactory::kCorruptReorderKey};
+
+TEST_F(CorruptionTest, ParanoidFileChecksOnFlush) {
+ Options options;
+ options.env = env_;
+ options.check_flush_compaction_key_order = false;
+ options.paranoid_file_checks = true;
+ options.create_if_missing = true;
+ Status s;
+ for (const auto& mode : corruption_modes) {
+ delete db_;
+ db_ = nullptr;
+ s = DestroyDB(dbname_, options);
+ ASSERT_OK(s);
+ std::shared_ptr<mock::MockTableFactory> mock =
+ std::make_shared<mock::MockTableFactory>();
+ options.table_factory = mock;
+ mock->SetCorruptionMode(mode);
+ ASSERT_OK(DB::Open(options, dbname_, &db_));
+ assert(db_ != nullptr); // suppress false clang-analyze report
+ Build(10);
+ s = db_->Flush(FlushOptions());
+ if (mode == mock::MockTableFactory::kCorruptNone) {
+ ASSERT_OK(s);
+ } else {
+ ASSERT_NOK(s);
+ }
+ }
+}
+
+TEST_F(CorruptionTest, ParanoidFileChecksOnCompact) {
+ Options options;
+ options.env = env_;
+ options.paranoid_file_checks = true;
+ options.create_if_missing = true;
+ options.check_flush_compaction_key_order = false;
+ Status s;
+ for (const auto& mode : corruption_modes) {
+ delete db_;
+ db_ = nullptr;
+ s = DestroyDB(dbname_, options);
+ ASSERT_OK(s);
+ std::shared_ptr<mock::MockTableFactory> mock =
+ std::make_shared<mock::MockTableFactory>();
+ options.table_factory = mock;
+ ASSERT_OK(DB::Open(options, dbname_, &db_));
+ assert(db_ != nullptr); // suppress false clang-analyze report
+ Build(100, 2);
+ // ASSERT_OK(db_->Flush(FlushOptions()));
+ DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
+ ASSERT_OK(dbi->TEST_FlushMemTable());
+ mock->SetCorruptionMode(mode);
+ CompactRangeOptions cro;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+ s = dbi->CompactRange(cro, dbi->DefaultColumnFamily(), nullptr, nullptr);
+ if (mode == mock::MockTableFactory::kCorruptNone) {
+ ASSERT_OK(s);
+ } else {
+ ASSERT_NOK(s);
+ }
+ }
+}
+
+TEST_F(CorruptionTest, ParanoidFileChecksWithDeleteRangeFirst) {
+ Options options;
+ options.env = env_;
+ options.check_flush_compaction_key_order = false;
+ options.paranoid_file_checks = true;
+ options.create_if_missing = true;
+ for (bool do_flush : {true, false}) {
+ delete db_;
+ db_ = nullptr;
+ ASSERT_OK(DestroyDB(dbname_, options));
+ ASSERT_OK(DB::Open(options, dbname_, &db_));
+ std::string start, end;
+ assert(db_ != nullptr); // suppress false clang-analyze report
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ Key(3, &start), Key(7, &end)));
+ auto snap = db_->GetSnapshot();
+ ASSERT_NE(snap, nullptr);
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ Key(8, &start), Key(9, &end)));
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ Key(2, &start), Key(5, &end)));
+ Build(10);
+ if (do_flush) {
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ } else {
+ DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
+ ASSERT_OK(dbi->TEST_FlushMemTable());
+ CompactRangeOptions cro;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+ ASSERT_OK(
+ dbi->CompactRange(cro, dbi->DefaultColumnFamily(), nullptr, nullptr));
+ }
+ db_->ReleaseSnapshot(snap);
+ }
+}
+
+TEST_F(CorruptionTest, ParanoidFileChecksWithDeleteRange) {
+ Options options;
+ options.env = env_;
+ options.check_flush_compaction_key_order = false;
+ options.paranoid_file_checks = true;
+ options.create_if_missing = true;
+ for (bool do_flush : {true, false}) {
+ delete db_;
+ db_ = nullptr;
+ ASSERT_OK(DestroyDB(dbname_, options));
+ ASSERT_OK(DB::Open(options, dbname_, &db_));
+ assert(db_ != nullptr); // suppress false clang-analyze report
+ Build(10, 0, 0);
+ std::string start, end;
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ Key(5, &start), Key(15, &end)));
+ auto snap = db_->GetSnapshot();
+ ASSERT_NE(snap, nullptr);
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ Key(8, &start), Key(9, &end)));
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ Key(12, &start), Key(17, &end)));
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ Key(2, &start), Key(4, &end)));
+ Build(10, 10, 0);
+ if (do_flush) {
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ } else {
+ DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
+ ASSERT_OK(dbi->TEST_FlushMemTable());
+ CompactRangeOptions cro;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+ ASSERT_OK(
+ dbi->CompactRange(cro, dbi->DefaultColumnFamily(), nullptr, nullptr));
+ }
+ db_->ReleaseSnapshot(snap);
+ }
+}
+
+TEST_F(CorruptionTest, ParanoidFileChecksWithDeleteRangeLast) {
+ Options options;
+ options.env = env_;
+ options.check_flush_compaction_key_order = false;
+ options.paranoid_file_checks = true;
+ options.create_if_missing = true;
+ for (bool do_flush : {true, false}) {
+ delete db_;
+ db_ = nullptr;
+ ASSERT_OK(DestroyDB(dbname_, options));
+ ASSERT_OK(DB::Open(options, dbname_, &db_));
+ assert(db_ != nullptr); // suppress false clang-analyze report
+ std::string start, end;
+ Build(10);
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ Key(3, &start), Key(7, &end)));
+ auto snap = db_->GetSnapshot();
+ ASSERT_NE(snap, nullptr);
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ Key(6, &start), Key(8, &end)));
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ Key(2, &start), Key(5, &end)));
+ if (do_flush) {
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ } else {
+ DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
+ ASSERT_OK(dbi->TEST_FlushMemTable());
+ CompactRangeOptions cro;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+ ASSERT_OK(
+ dbi->CompactRange(cro, dbi->DefaultColumnFamily(), nullptr, nullptr));
+ }
+ db_->ReleaseSnapshot(snap);
+ }
+}
+
+TEST_F(CorruptionTest, LogCorruptionErrorsInCompactionIterator) {
+ Options options;
+ options.env = env_;
+ options.create_if_missing = true;
+ options.allow_data_in_errors = true;
+ auto mode = mock::MockTableFactory::kCorruptKey;
+ delete db_;
+ db_ = nullptr;
+ ASSERT_OK(DestroyDB(dbname_, options));
+
+ std::shared_ptr<mock::MockTableFactory> mock =
+ std::make_shared<mock::MockTableFactory>();
+ mock->SetCorruptionMode(mode);
+ options.table_factory = mock;
+
+ ASSERT_OK(DB::Open(options, dbname_, &db_));
+ assert(db_ != nullptr); // suppress false clang-analyze report
+ Build(100, 2);
+
+ DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
+ ASSERT_OK(dbi->TEST_FlushMemTable());
+ CompactRangeOptions cro;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+ Status s =
+ dbi->CompactRange(cro, dbi->DefaultColumnFamily(), nullptr, nullptr);
+ ASSERT_NOK(s);
+ ASSERT_TRUE(s.IsCorruption());
+}
+
+TEST_F(CorruptionTest, CompactionKeyOrderCheck) {
+ Options options;
+ options.env = env_;
+ options.paranoid_file_checks = false;
+ options.create_if_missing = true;
+ options.check_flush_compaction_key_order = false;
+ delete db_;
+ db_ = nullptr;
+ ASSERT_OK(DestroyDB(dbname_, options));
+ std::shared_ptr<mock::MockTableFactory> mock =
+ std::make_shared<mock::MockTableFactory>();
+ options.table_factory = mock;
+ ASSERT_OK(DB::Open(options, dbname_, &db_));
+ assert(db_ != nullptr); // suppress false clang-analyze report
+ mock->SetCorruptionMode(mock::MockTableFactory::kCorruptReorderKey);
+ Build(100, 2);
+ DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
+ ASSERT_OK(dbi->TEST_FlushMemTable());
+
+ mock->SetCorruptionMode(mock::MockTableFactory::kCorruptNone);
+ ASSERT_OK(db_->SetOptions({{"check_flush_compaction_key_order", "true"}}));
+ CompactRangeOptions cro;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+ ASSERT_NOK(
+ dbi->CompactRange(cro, dbi->DefaultColumnFamily(), nullptr, nullptr));
+}
+
+TEST_F(CorruptionTest, FlushKeyOrderCheck) {
+ Options options;
+ options.env = env_;
+ options.paranoid_file_checks = false;
+ options.create_if_missing = true;
+ ASSERT_OK(db_->SetOptions({{"check_flush_compaction_key_order", "true"}}));
+
+ ASSERT_OK(db_->Put(WriteOptions(), "foo1", "v1"));
+ ASSERT_OK(db_->Put(WriteOptions(), "foo2", "v1"));
+ ASSERT_OK(db_->Put(WriteOptions(), "foo3", "v1"));
+ ASSERT_OK(db_->Put(WriteOptions(), "foo4", "v1"));
+
+ int cnt = 0;
+ // Generate some out of order keys from the memtable
+ SyncPoint::GetInstance()->SetCallBack(
+ "MemTableIterator::Next:0", [&](void* arg) {
+ MemTableRep::Iterator* mem_iter =
+ static_cast<MemTableRep::Iterator*>(arg);
+ if (++cnt == 3) {
+ mem_iter->Prev();
+ mem_iter->Prev();
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ Status s = static_cast_with_check<DBImpl>(db_)->TEST_FlushMemTable();
+ ASSERT_NOK(s);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_F(CorruptionTest, DisableKeyOrderCheck) {
+ ASSERT_OK(db_->SetOptions({{"check_flush_compaction_key_order", "false"}}));
+ DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "OutputValidator::Add:order_check",
+ [&](void* /*arg*/) { ASSERT_TRUE(false); });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ ASSERT_OK(db_->Put(WriteOptions(), "foo1", "v1"));
+ ASSERT_OK(db_->Put(WriteOptions(), "foo3", "v1"));
+ ASSERT_OK(dbi->TEST_FlushMemTable());
+ ASSERT_OK(db_->Put(WriteOptions(), "foo2", "v1"));
+ ASSERT_OK(db_->Put(WriteOptions(), "foo4", "v1"));
+ ASSERT_OK(dbi->TEST_FlushMemTable());
+ CompactRangeOptions cro;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+ ASSERT_OK(
+ dbi->CompactRange(cro, dbi->DefaultColumnFamily(), nullptr, nullptr));
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_F(CorruptionTest, VerifyWholeTableChecksum) {
+ CloseDb();
+ Options options;
+ options.env = env_;
+ ASSERT_OK(DestroyDB(dbname_, options));
+ options.create_if_missing = true;
+ options.file_checksum_gen_factory =
+ ROCKSDB_NAMESPACE::GetFileChecksumGenCrc32cFactory();
+ Reopen(&options);
+
+ Build(10, 5);
+
+ ASSERT_OK(db_->VerifyFileChecksums(ReadOptions()));
+ CloseDb();
+
+  // Corrupt the first byte of each table file; this must be a data block.
+ Corrupt(kTableFile, 0, 1);
+
+ ASSERT_OK(TryReopen(&options));
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ int count{0};
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::VerifyFullFileChecksum:mismatch", [&](void* arg) {
+ auto* s = reinterpret_cast<Status*>(arg);
+ ASSERT_NE(s, nullptr);
+ ++count;
+ ASSERT_NOK(*s);
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ ASSERT_TRUE(db_->VerifyFileChecksums(ReadOptions()).IsCorruption());
+ ASSERT_EQ(1, count);
+}
+
+class CrashDuringRecoveryWithCorruptionTest
+ : public CorruptionTest,
+ public testing::WithParamInterface<std::tuple<bool, bool>> {
+ public:
+ explicit CrashDuringRecoveryWithCorruptionTest()
+ : CorruptionTest(),
+ avoid_flush_during_recovery_(std::get<0>(GetParam())),
+ track_and_verify_wals_in_manifest_(std::get<1>(GetParam())) {}
+
+ protected:
+ const bool avoid_flush_during_recovery_;
+ const bool track_and_verify_wals_in_manifest_;
+};
+
+INSTANTIATE_TEST_CASE_P(CorruptionTest, CrashDuringRecoveryWithCorruptionTest,
+ ::testing::Values(std::make_tuple(true, false),
+ std::make_tuple(false, false),
+ std::make_tuple(true, true),
+ std::make_tuple(false, true)));
+
+// In the case of a non-TransactionDB with avoid_flush_during_recovery = true,
+// RocksDB won't flush the data from the WAL to L0 for all column families if
+// possible. As a result, not all column families can increase their
+// log_numbers, and min_log_number_to_keep won't change.
+// RocksDB may prematurely persist a new MANIFEST that advances log_numbers for
+// some column families even before we can declare the DB to be in a consistent
+// state after recovery (which is only the case once the new WAL is synced).
+//
+// If there is a power failure before we sync the new WAL, we end up in a
+// situation in which, after persisting the MANIFEST, RocksDB will see some
+// column families' log_numbers larger than the corrupted WAL, and the
+// "Column family inconsistency: SST file contains data beyond the point of
+// corruption" error will be hit, causing recovery to fail.
+//
+// After adding the fix, RocksDB persists a new MANIFEST with the column
+// families only after the new WAL is synced, to ensure RocksDB is in a
+// consistent state. RocksDB writes an empty WriteBatch as a sentinel to the
+// new WAL, which is synced immediately afterwards. The sequence number of the
+// sentinel WriteBatch will be the next sequence number immediately after the
+// largest sequence number recovered from previous WALs and the MANIFEST,
+// because of which the DB will be in a consistent state.
+// If a future recovery starts from the new MANIFEST, it means the new WAL was
+// successfully synced. Due to the sentinel empty write batch at the beginning,
+// kPointInTimeRecovery of the WAL is guaranteed to go after this point.
+// If a future recovery starts from the old MANIFEST, it means writing the new
+// MANIFEST failed. It won't have the "SST ahead of WAL" error.
+//
+// The combination of corrupting a WAL and injecting an error during subsequent
+// re-open exposes the bug of prematurely persisting a new MANIFEST with
+// advanced ColumnFamilyData::log_number.
+TEST_P(CrashDuringRecoveryWithCorruptionTest, CrashDuringRecovery) {
+ CloseDb();
+ Options options;
+ options.track_and_verify_wals_in_manifest =
+ track_and_verify_wals_in_manifest_;
+ options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;
+ options.avoid_flush_during_recovery = false;
+ options.env = env_;
+ ASSERT_OK(DestroyDB(dbname_, options));
+ options.create_if_missing = true;
+ options.max_write_buffer_number = 8;
+
+ Reopen(&options);
+ Status s;
+ const std::string test_cf_name = "test_cf";
+ ColumnFamilyHandle* cfh = nullptr;
+ s = db_->CreateColumnFamily(options, test_cf_name, &cfh);
+ ASSERT_OK(s);
+ delete cfh;
+ CloseDb();
+
+ std::vector<ColumnFamilyDescriptor> cf_descs;
+ cf_descs.emplace_back(kDefaultColumnFamilyName, options);
+ cf_descs.emplace_back(test_cf_name, options);
+ std::vector<ColumnFamilyHandle*> handles;
+
+  // 1. Open and populate the DB. Write to and flush default_cf several times
+  // to advance the WAL number so that some column families have an advanced
+  // log_number while others don't.
+ {
+ ASSERT_OK(DB::Open(options, dbname_, cf_descs, &handles, &db_));
+ auto* dbimpl = static_cast_with_check<DBImpl>(db_);
+ assert(dbimpl);
+
+ // Write one key to test_cf.
+ ASSERT_OK(db_->Put(WriteOptions(), handles[1], "old_key", "dontcare"));
+ ASSERT_OK(db_->Flush(FlushOptions(), handles[1]));
+
+    // Write to default_cf and flush this cf several times to advance the WAL
+    // number. TEST_SwitchMemtable makes sure WALs are not synced so the test
+    // can corrupt an un-synced WAL.
+ for (int i = 0; i < 2; ++i) {
+ ASSERT_OK(db_->Put(WriteOptions(), "key" + std::to_string(i),
+ "value" + std::to_string(i)));
+ ASSERT_OK(dbimpl->TEST_SwitchMemtable());
+ }
+
+ for (auto* h : handles) {
+ delete h;
+ }
+ handles.clear();
+ CloseDb();
+ }
+
+  // 2. Corrupt the second-to-last un-synced WAL file to emulate a power reset
+  // that causes the DB to lose the un-synced WAL.
+ {
+ std::vector<uint64_t> file_nums;
+ GetSortedWalFiles(file_nums);
+ size_t size = file_nums.size();
+ assert(size >= 2);
+ uint64_t log_num = file_nums[size - 2];
+ CorruptFileWithTruncation(FileType::kWalFile, log_num,
+ /*bytes_to_truncate=*/8);
+ }
+
+  // 3. After the first crash, reopen the DB, which contains a corrupted WAL.
+  // The default family has a higher log number than the corrupted WAL's
+  // number.
+ //
+ // Case1: If avoid_flush_during_recovery = true, RocksDB won't flush the data
+ // from WAL to L0 for all column families (test_cf_name in this case). As a
+ // result, not all column families can increase their log_numbers, and
+ // min_log_number_to_keep won't change.
+ //
+ // Case2: If avoid_flush_during_recovery = false, all column families have
+ // flushed their data from WAL to L0 during recovery, and none of them will
+ // ever need to read the WALs again.
+
+ // 4. Fault is injected to fail the recovery.
+ {
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::GetLogSizeAndMaybeTruncate:0", [&](void* arg) {
+ auto* tmp_s = reinterpret_cast<Status*>(arg);
+ assert(tmp_s);
+ *tmp_s = Status::IOError("Injected");
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ handles.clear();
+ options.avoid_flush_during_recovery = true;
+ s = DB::Open(options, dbname_, cf_descs, &handles, &db_);
+ ASSERT_TRUE(s.IsIOError());
+ ASSERT_EQ("IO error: Injected", s.ToString());
+ for (auto* h : handles) {
+ delete h;
+ }
+ CloseDb();
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ }
+
+  // 5. After the second crash, reopen the DB with the second corruption. The
+  // default family has a higher log number than the corrupted WAL's number.
+ //
+ // Case1: If avoid_flush_during_recovery = true, we persist a new
+ // MANIFEST with advanced log_numbers for some column families only after
+ // syncing the WAL. So during second crash, RocksDB will skip the corrupted
+ // WAL files as they have been moved to different folder. Since newly synced
+ // WAL file's sequence number (sentinel WriteBatch) will be the next
+ // sequence number immediately after the largest sequence number recovered
+ // from previous WALs and MANIFEST, db will be in consistent state and opens
+ // successfully.
+ //
+ // Case2: If avoid_flush_during_recovery = false, the corrupted WAL is below
+ // this number. So during a second crash after persisting the new MANIFEST,
+ // RocksDB will skip the corrupted WAL(s) because they are all below this
+ // bound. Therefore, we won't hit the "column family inconsistency" error
+ // message.
+ {
+ options.avoid_flush_during_recovery = avoid_flush_during_recovery_;
+ ASSERT_OK(DB::Open(options, dbname_, cf_descs, &handles, &db_));
+
+ // Verify that data is not lost.
+ {
+ std::string v;
+ ASSERT_OK(db_->Get(ReadOptions(), handles[1], "old_key", &v));
+ ASSERT_EQ("dontcare", v);
+
+ v.clear();
+ ASSERT_OK(db_->Get(ReadOptions(), "key" + std::to_string(0), &v));
+ ASSERT_EQ("value" + std::to_string(0), v);
+
+ // Since the second last WAL was corrupted, the key below is not found.
+ v.clear();
+ ASSERT_EQ(db_->Get(ReadOptions(), "key" + std::to_string(1), &v),
+ Status::NotFound());
+ }
+
+ for (auto* h : handles) {
+ delete h;
+ }
+ handles.clear();
+ CloseDb();
+ }
+}
+
+// In case of TransactionDB, two-phase commit is enabled. The prepare section
+// of an uncommitted transaction always needs to be kept. Even if we perform a
+// flush during recovery, we may still need to hold an old WAL. The
+// min_log_number_to_keep won't change, and the "Column family inconsistency:
+// SST file contains data beyond the point of corruption" error will be hit,
+// causing recovery to fail.
+//
+// After adding the fix, RocksDB persists a new MANIFEST with the column
+// families only after the new WAL is synced, to ensure RocksDB is in a
+// consistent state. RocksDB writes an empty WriteBatch as a sentinel to the
+// new WAL, which is synced immediately afterwards. The sequence number of the
+// sentinel WriteBatch will be the next sequence number immediately after the
+// largest sequence number recovered from previous WALs and the MANIFEST, so
+// the DB will be in a consistent state.
+// If a future recovery starts from the new MANIFEST, then it means the new
+// WAL was successfully synced. Due to the sentinel empty write batch at the
+// beginning, kPointInTimeRecovery of the WAL is guaranteed to go past this
+// point. If a future recovery starts from the old MANIFEST, it means writing
+// the new MANIFEST failed. It won't have the "SST ahead of WAL" error.
+//
+// The combination of corrupting a WAL and injecting an error during subsequent
+// re-open exposes the bug of prematurely persisting a new MANIFEST with
+// advanced ColumnFamilyData::log_number.
+TEST_P(CrashDuringRecoveryWithCorruptionTest, TxnDbCrashDuringRecovery) {
+ CloseDb();
+ Options options;
+ options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;
+ options.track_and_verify_wals_in_manifest =
+ track_and_verify_wals_in_manifest_;
+ options.avoid_flush_during_recovery = false;
+ options.env = env_;
+ ASSERT_OK(DestroyDB(dbname_, options));
+ options.create_if_missing = true;
+ options.max_write_buffer_number = 3;
+ Reopen(&options);
+
+ // Create cf test_cf_name.
+ ColumnFamilyHandle* cfh = nullptr;
+ const std::string test_cf_name = "test_cf";
+ Status s = db_->CreateColumnFamily(options, test_cf_name, &cfh);
+ ASSERT_OK(s);
+ delete cfh;
+ CloseDb();
+
+ std::vector<ColumnFamilyDescriptor> cf_descs;
+ cf_descs.emplace_back(kDefaultColumnFamilyName, options);
+ cf_descs.emplace_back(test_cf_name, options);
+ std::vector<ColumnFamilyHandle*> handles;
+
+ TransactionDB* txn_db = nullptr;
+ TransactionDBOptions txn_db_opts;
+
+ // 1. Open and populate the DB. Write and flush default_cf several times to
+ // advance the WAL number so that some column families have an advanced
+ // log_number while others don't.
+ {
+ ASSERT_OK(TransactionDB::Open(options, txn_db_opts, dbname_, cf_descs,
+ &handles, &txn_db));
+
+ auto* txn = txn_db->BeginTransaction(WriteOptions(), TransactionOptions());
+ // Put cf1
+ ASSERT_OK(txn->Put(handles[1], "foo", "value"));
+ ASSERT_OK(txn->SetName("txn0"));
+ ASSERT_OK(txn->Prepare());
+ ASSERT_OK(txn_db->Flush(FlushOptions()));
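+ // Note: txn0 is prepared but not committed, so the WAL holding its prepare
+ // record must be kept alive even after the flush above (see the comment at
+ // the top of this test).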
+
+ delete txn;
+ txn = nullptr;
+
+ auto* dbimpl = static_cast_with_check<DBImpl>(txn_db->GetRootDB());
+ assert(dbimpl);
+
+ // Put and flush cf0
+ for (int i = 0; i < 2; ++i) {
+ ASSERT_OK(txn_db->Put(WriteOptions(), "key" + std::to_string(i),
+ "value" + std::to_string(i)));
+ ASSERT_OK(dbimpl->TEST_SwitchMemtable());
+ }
+
+ // Put cf1
+ txn = txn_db->BeginTransaction(WriteOptions(), TransactionOptions());
+ ASSERT_OK(txn->Put(handles[1], "foo1", "value1"));
+ ASSERT_OK(txn->Commit());
+
+ delete txn;
+ txn = nullptr;
+
+ for (auto* h : handles) {
+ delete h;
+ }
+ handles.clear();
+ delete txn_db;
+ }
+
+ // 2. Corrupt the second last WAL file to emulate a power reset which caused
+ // the DB to lose the un-synced WAL.
+ {
+ std::vector<uint64_t> file_nums;
+ GetSortedWalFiles(file_nums);
+ size_t size = file_nums.size();
+ assert(size >= 2);
+ uint64_t log_num = file_nums[size - 2];
+ CorruptFileWithTruncation(FileType::kWalFile, log_num,
+ /*bytes_to_truncate=*/8);
+ }
+
+ // 3. After the first crash, reopen the DB which contains the corrupted WAL.
+ // The default family has a higher log number than the corrupted WAL's
+ // number. There may be old WAL files that must not be deleted because they
+ // can contain data of uncommitted transactions. As a result,
+ // min_log_number_to_keep won't change.
+
+ {
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
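+ // The "DBImpl::Open::BeforeSyncWAL" sync point fires right before the newly
+ // created WAL is synced during recovery; overwriting the passed-in Status
+ // with an IOError emulates a crash at that point, i.e. before the fixed code
+ // would persist the new MANIFEST.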
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::Open::BeforeSyncWAL", [&](void* arg) {
+ auto* tmp_s = reinterpret_cast<Status*>(arg);
+ assert(tmp_s);
+ *tmp_s = Status::IOError("Injected");
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ handles.clear();
+ s = TransactionDB::Open(options, txn_db_opts, dbname_, cf_descs, &handles,
+ &txn_db);
+ ASSERT_TRUE(s.IsIOError());
+ ASSERT_EQ("IO error: Injected", s.ToString());
+ for (auto* h : handles) {
+ delete h;
+ }
+ CloseDb();
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ }
+
+ // 4. Corrupt the latest WAL file (max_wal_num).
+ {
+ std::vector<uint64_t> file_nums;
+ GetSortedWalFiles(file_nums);
+ size_t size = file_nums.size();
+ uint64_t log_num = file_nums[size - 1];
+ CorruptFileWithTruncation(FileType::kWalFile, log_num);
+ }
+
+ // 5. After the second crash, reopen the db with the second corruption. The
+ // default family has a higher log number than the corrupted WAL's number.
+ // We persist a new MANIFEST with advanced log_numbers for some column
+ // families only after syncing the WAL. So during the second crash, RocksDB
+ // will skip the corrupted WAL files as they have been moved to a different
+ // folder. Since the newly synced WAL file's sequence number (sentinel
+ // WriteBatch) will be the next sequence number immediately after the largest
+ // sequence number recovered from previous WALs and MANIFEST, the db will be
+ // in a consistent state and opens successfully.
+ {
+ ASSERT_OK(TransactionDB::Open(options, txn_db_opts, dbname_, cf_descs,
+ &handles, &txn_db));
+
+ // Verify that data is not lost.
+ {
+ std::string v;
+ // Key not visible since it's not committed.
+ ASSERT_EQ(txn_db->Get(ReadOptions(), handles[1], "foo", &v),
+ Status::NotFound());
+
+ v.clear();
+ ASSERT_OK(txn_db->Get(ReadOptions(), "key" + std::to_string(0), &v));
+ ASSERT_EQ("value" + std::to_string(0), v);
+
+ // The last WAL, which contains the two keys below, is corrupted.
+ v.clear();
+ ASSERT_EQ(txn_db->Get(ReadOptions(), "key" + std::to_string(1), &v),
+ Status::NotFound());
+ v.clear();
+ ASSERT_EQ(txn_db->Get(ReadOptions(), handles[1], "foo1", &v),
+ Status::NotFound());
+ }
+
+ for (auto* h : handles) {
+ delete h;
+ }
+ delete txn_db;
+ }
+}
+
+// This test is similar to
+// CrashDuringRecoveryWithCorruptionTest.CrashDuringRecovery except that it
+// calls flush and corrupts the last WAL. Flush syncs some of the WALs; the
+// remaining WALs are un-synced, one of which is then corrupted to simulate a
+// crash.
+//
+// In case of a non-TransactionDB with avoid_flush_during_recovery = true,
+// RocksDB won't flush the data from the WAL to L0 for all column families if
+// possible. As a result, not all column families can increase their
+// log_numbers, and min_log_number_to_keep won't change.
+// Without the fix, RocksDB may prematurely persist a new MANIFEST that
+// advances log_numbers for some column families, even before we can declare
+// the DB to be in a consistent state after recovery (which is when the new
+// WAL is synced).
+//
+// If there is a power failure before we sync the new WAL, we will end up in
+// a situation in which, after persisting the MANIFEST, RocksDB will see some
+// column families' log_numbers larger than the corrupted WAL, and the
+// "Column family inconsistency: SST file contains data beyond the point of
+// corruption" error will be hit, causing recovery to fail.
+//
+// After adding the fix, RocksDB persists a new MANIFEST with the column
+// families only after the new WAL is synced, to ensure RocksDB is in a
+// consistent state. RocksDB writes an empty WriteBatch as a sentinel to the
+// new WAL, which is synced immediately afterwards. The sequence number of the
+// sentinel WriteBatch will be the next sequence number immediately after the
+// largest sequence number recovered from previous WALs and the MANIFEST, so
+// the DB will be in a consistent state.
+// If a future recovery starts from the new MANIFEST, then it means the new
+// WAL was successfully synced. Due to the sentinel empty write batch at the
+// beginning, kPointInTimeRecovery of the WAL is guaranteed to go past this
+// point. If a future recovery starts from the old MANIFEST, it means writing
+// the new MANIFEST failed. It won't have the "SST ahead of WAL" error.
+//
+// The combination of corrupting a WAL and injecting an error during subsequent
+// re-open exposes the bug of prematurely persisting a new MANIFEST with
+// advanced ColumnFamilyData::log_number.
+TEST_P(CrashDuringRecoveryWithCorruptionTest, CrashDuringRecoveryWithFlush) {
+ CloseDb();
+ Options options;
+ options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;
+ options.avoid_flush_during_recovery = false;
+ options.env = env_;
+ options.create_if_missing = true;
+
+ ASSERT_OK(DestroyDB(dbname_, options));
+ Reopen(&options);
+
+ ColumnFamilyHandle* cfh = nullptr;
+ const std::string test_cf_name = "test_cf";
+ Status s = db_->CreateColumnFamily(options, test_cf_name, &cfh);
+ ASSERT_OK(s);
+ delete cfh;
+
+ CloseDb();
+
+ std::vector<ColumnFamilyDescriptor> cf_descs;
+ cf_descs.emplace_back(kDefaultColumnFamilyName, options);
+ cf_descs.emplace_back(test_cf_name, options);
+ std::vector<ColumnFamilyHandle*> handles;
+
+ {
+ ASSERT_OK(DB::Open(options, dbname_, cf_descs, &handles, &db_));
+
+ // Write one key to test_cf.
+ ASSERT_OK(db_->Put(WriteOptions(), handles[1], "old_key", "dontcare"));
+
+ // Write to default_cf and flush this cf several times to advance wal
+ // number.
+ for (int i = 0; i < 2; ++i) {
+ ASSERT_OK(db_->Put(WriteOptions(), "key" + std::to_string(i),
+ "value" + std::to_string(i)));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ }
+
+ ASSERT_OK(db_->Put(WriteOptions(), handles[1], "dontcare", "dontcare"));
+ for (auto* h : handles) {
+ delete h;
+ }
+ handles.clear();
+ CloseDb();
+ }
+
+ // Corrupt the last un-synced WAL file to emulate a power reset which
+ // caused the DB to lose the un-synced WAL.
+ {
+ std::vector<uint64_t> file_nums;
+ GetSortedWalFiles(file_nums);
+ size_t size = file_nums.size();
+ uint64_t log_num = file_nums[size - 1];
+ CorruptFileWithTruncation(FileType::kWalFile, log_num,
+ /*bytes_to_truncate=*/8);
+ }
+
+ // Fault is injected to fail the recovery.
+ {
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::GetLogSizeAndMaybeTruncate:0", [&](void* arg) {
+ auto* tmp_s = reinterpret_cast<Status*>(arg);
+ assert(tmp_s);
+ *tmp_s = Status::IOError("Injected");
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ handles.clear();
+ options.avoid_flush_during_recovery = true;
+ s = DB::Open(options, dbname_, cf_descs, &handles, &db_);
+ ASSERT_TRUE(s.IsIOError());
+ ASSERT_EQ("IO error: Injected", s.ToString());
+ for (auto* h : handles) {
+ delete h;
+ }
+ CloseDb();
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ }
+
+ // Reopen the DB again.
+ {
+ options.avoid_flush_during_recovery = avoid_flush_during_recovery_;
+ ASSERT_OK(DB::Open(options, dbname_, cf_descs, &handles, &db_));
+
+ // Verify that data is not lost.
+ {
+ std::string v;
+ ASSERT_OK(db_->Get(ReadOptions(), handles[1], "old_key", &v));
+ ASSERT_EQ("dontcare", v);
+
+ for (int i = 0; i < 2; ++i) {
+ v.clear();
+ ASSERT_OK(db_->Get(ReadOptions(), "key" + std::to_string(i), &v));
+ ASSERT_EQ("value" + std::to_string(i), v);
+ }
+
+ // Since the last WAL, written after the flushes, was corrupted, the key
+ // below is not found.
+ v.clear();
+ ASSERT_EQ(db_->Get(ReadOptions(), handles[1], "dontcare", &v),
+ Status::NotFound());
+ }
+
+ for (auto* h : handles) {
+ delete h;
+ }
+ }
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ RegisterCustomObjects(argc, argv);
+ return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+ fprintf(stderr, "SKIPPED as RepairDB() is not supported in ROCKSDB_LITE\n");
+ return 0;
+}
+
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/cuckoo_table_db_test.cc b/src/rocksdb/db/cuckoo_table_db_test.cc
new file mode 100644
index 000000000..868b798ea
--- /dev/null
+++ b/src/rocksdb/db/cuckoo_table_db_test.cc
@@ -0,0 +1,361 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "db/db_impl/db_impl.h"
+#include "db/db_test_util.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "table/cuckoo/cuckoo_table_factory.h"
+#include "table/cuckoo/cuckoo_table_reader.h"
+#include "table/meta_blocks.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/cast_util.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class CuckooTableDBTest : public testing::Test {
+ private:
+ std::string dbname_;
+ Env* env_;
+ DB* db_;
+
+ public:
+ CuckooTableDBTest() : env_(Env::Default()) {
+ dbname_ = test::PerThreadDBPath("cuckoo_table_db_test");
+ EXPECT_OK(DestroyDB(dbname_, Options()));
+ db_ = nullptr;
+ Reopen();
+ }
+
+ ~CuckooTableDBTest() override {
+ delete db_;
+ EXPECT_OK(DestroyDB(dbname_, Options()));
+ }
+
+ Options CurrentOptions() {
+ Options options;
+ options.table_factory.reset(NewCuckooTableFactory());
+ options.memtable_factory.reset(NewHashLinkListRepFactory(4, 0, 3, true));
+ options.allow_mmap_reads = true;
+ options.create_if_missing = true;
+ options.allow_concurrent_memtable_write = false;
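+ // Note: the cuckoo table reader requires mmap reads, and the HashLinkList
+ // memtable rep does not support concurrent memtable writes, hence the
+ // settings above.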
+ return options;
+ }
+
+ DBImpl* dbfull() { return static_cast_with_check<DBImpl>(db_); }
+
+ // The following util methods are copied from plain_table_db_test.
+ void Reopen(Options* options = nullptr) {
+ delete db_;
+ db_ = nullptr;
+ Options opts;
+ if (options != nullptr) {
+ opts = *options;
+ } else {
+ opts = CurrentOptions();
+ opts.create_if_missing = true;
+ }
+ ASSERT_OK(DB::Open(opts, dbname_, &db_));
+ }
+
+ void DestroyAndReopen(Options* options) {
+ assert(options);
+ ASSERT_OK(db_->Close());
+ delete db_;
+ db_ = nullptr;
+ ASSERT_OK(DestroyDB(dbname_, *options));
+ Reopen(options);
+ }
+
+ Status Put(const Slice& k, const Slice& v) {
+ return db_->Put(WriteOptions(), k, v);
+ }
+
+ Status Delete(const std::string& k) { return db_->Delete(WriteOptions(), k); }
+
+ std::string Get(const std::string& k) {
+ ReadOptions options;
+ std::string result;
+ Status s = db_->Get(options, k, &result);
+ if (s.IsNotFound()) {
+ result = "NOT_FOUND";
+ } else if (!s.ok()) {
+ result = s.ToString();
+ }
+ return result;
+ }
+
+ int NumTableFilesAtLevel(int level) {
+ std::string property;
+ EXPECT_TRUE(db_->GetProperty(
+ "rocksdb.num-files-at-level" + std::to_string(level), &property));
+ return atoi(property.c_str());
+ }
+
+ // Return spread of files per level
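+ // e.g. "1" means one file in L0; "0,2" means no L0 files and two L1 files.
+ // Trailing levels with zero files are trimmed from the string.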
+ std::string FilesPerLevel() {
+ std::string result;
+ size_t last_non_zero_offset = 0;
+ for (int level = 0; level < db_->NumberLevels(); level++) {
+ int f = NumTableFilesAtLevel(level);
+ char buf[100];
+ snprintf(buf, sizeof(buf), "%s%d", (level ? "," : ""), f);
+ result += buf;
+ if (f > 0) {
+ last_non_zero_offset = result.size();
+ }
+ }
+ result.resize(last_non_zero_offset);
+ return result;
+ }
+};
+
+TEST_F(CuckooTableDBTest, Flush) {
+ // Try with empty DB first.
+ ASSERT_TRUE(dbfull() != nullptr);
+ ASSERT_EQ("NOT_FOUND", Get("key2"));
+
+ // Add some values to db.
+ Options options = CurrentOptions();
+ Reopen(&options);
+
+ ASSERT_OK(Put("key1", "v1"));
+ ASSERT_OK(Put("key2", "v2"));
+ ASSERT_OK(Put("key3", "v3"));
+ ASSERT_OK(dbfull()->TEST_FlushMemTable());
+
+ TablePropertiesCollection ptc;
+ ASSERT_OK(reinterpret_cast<DB*>(dbfull())->GetPropertiesOfAllTables(&ptc));
+ VerifySstUniqueIds(ptc);
+ ASSERT_EQ(1U, ptc.size());
+ ASSERT_EQ(3U, ptc.begin()->second->num_entries);
+ ASSERT_EQ("1", FilesPerLevel());
+
+ ASSERT_EQ("v1", Get("key1"));
+ ASSERT_EQ("v2", Get("key2"));
+ ASSERT_EQ("v3", Get("key3"));
+ ASSERT_EQ("NOT_FOUND", Get("key4"));
+
+ // Now add more keys and flush.
+ ASSERT_OK(Put("key4", "v4"));
+ ASSERT_OK(Put("key5", "v5"));
+ ASSERT_OK(Put("key6", "v6"));
+ ASSERT_OK(dbfull()->TEST_FlushMemTable());
+
+ ASSERT_OK(reinterpret_cast<DB*>(dbfull())->GetPropertiesOfAllTables(&ptc));
+ VerifySstUniqueIds(ptc);
+ ASSERT_EQ(2U, ptc.size());
+ auto row = ptc.begin();
+ ASSERT_EQ(3U, row->second->num_entries);
+ ASSERT_EQ(3U, (++row)->second->num_entries);
+ ASSERT_EQ("2", FilesPerLevel());
+ ASSERT_EQ("v1", Get("key1"));
+ ASSERT_EQ("v2", Get("key2"));
+ ASSERT_EQ("v3", Get("key3"));
+ ASSERT_EQ("v4", Get("key4"));
+ ASSERT_EQ("v5", Get("key5"));
+ ASSERT_EQ("v6", Get("key6"));
+
+ ASSERT_OK(Delete("key6"));
+ ASSERT_OK(Delete("key5"));
+ ASSERT_OK(Delete("key4"));
+ ASSERT_OK(dbfull()->TEST_FlushMemTable());
+ ASSERT_OK(reinterpret_cast<DB*>(dbfull())->GetPropertiesOfAllTables(&ptc));
+ VerifySstUniqueIds(ptc);
+ ASSERT_EQ(3U, ptc.size());
+ row = ptc.begin();
+ ASSERT_EQ(3U, row->second->num_entries);
+ ASSERT_EQ(3U, (++row)->second->num_entries);
+ ASSERT_EQ(3U, (++row)->second->num_entries);
+ ASSERT_EQ("3", FilesPerLevel());
+ ASSERT_EQ("v1", Get("key1"));
+ ASSERT_EQ("v2", Get("key2"));
+ ASSERT_EQ("v3", Get("key3"));
+ ASSERT_EQ("NOT_FOUND", Get("key4"));
+ ASSERT_EQ("NOT_FOUND", Get("key5"));
+ ASSERT_EQ("NOT_FOUND", Get("key6"));
+}
+
+TEST_F(CuckooTableDBTest, FlushWithDuplicateKeys) {
+ Options options = CurrentOptions();
+ Reopen(&options);
+ ASSERT_OK(Put("key1", "v1"));
+ ASSERT_OK(Put("key2", "v2"));
+ ASSERT_OK(Put("key1", "v3")); // Duplicate
+ ASSERT_OK(dbfull()->TEST_FlushMemTable());
+
+ TablePropertiesCollection ptc;
+ ASSERT_OK(reinterpret_cast<DB*>(dbfull())->GetPropertiesOfAllTables(&ptc));
+ VerifySstUniqueIds(ptc);
+ ASSERT_EQ(1U, ptc.size());
+ ASSERT_EQ(2U, ptc.begin()->second->num_entries);
+ ASSERT_EQ("1", FilesPerLevel());
+ ASSERT_EQ("v3", Get("key1"));
+ ASSERT_EQ("v2", Get("key2"));
+}
+
+namespace {
+static std::string Key(int i) {
+ char buf[100];
+ snprintf(buf, sizeof(buf), "key_______%06d", i);
+ return std::string(buf);
+}
+static std::string Uint64Key(uint64_t i) {
+ std::string str;
+ str.resize(8);
+ memcpy(&str[0], static_cast<void*>(&i), 8);
+ return str;
+}
+} // namespace
+
+TEST_F(CuckooTableDBTest, Uint64Comparator) {
+ Options options = CurrentOptions();
+ options.comparator = test::Uint64Comparator();
+ DestroyAndReopen(&options);
+
+ ASSERT_OK(Put(Uint64Key(1), "v1"));
+ ASSERT_OK(Put(Uint64Key(2), "v2"));
+ ASSERT_OK(Put(Uint64Key(3), "v3"));
+ ASSERT_OK(dbfull()->TEST_FlushMemTable());
+
+ ASSERT_EQ("v1", Get(Uint64Key(1)));
+ ASSERT_EQ("v2", Get(Uint64Key(2)));
+ ASSERT_EQ("v3", Get(Uint64Key(3)));
+ ASSERT_EQ("NOT_FOUND", Get(Uint64Key(4)));
+
+ // Add more keys.
+ ASSERT_OK(Delete(Uint64Key(2))); // Delete.
+ ASSERT_OK(dbfull()->TEST_FlushMemTable());
+ ASSERT_OK(Put(Uint64Key(3), "v0")); // Update.
+ ASSERT_OK(Put(Uint64Key(4), "v4"));
+ ASSERT_OK(dbfull()->TEST_FlushMemTable());
+ ASSERT_EQ("v1", Get(Uint64Key(1)));
+ ASSERT_EQ("NOT_FOUND", Get(Uint64Key(2)));
+ ASSERT_EQ("v0", Get(Uint64Key(3)));
+ ASSERT_EQ("v4", Get(Uint64Key(4)));
+}
+
+TEST_F(CuckooTableDBTest, CompactionIntoMultipleFiles) {
+ // Create a big L0 file and check it compacts into multiple files in L1.
+ Options options = CurrentOptions();
+ options.write_buffer_size = 270 << 10;
+ // Two SST files should be created, each containing 14 keys.
+ // Number of buckets will be 16. Total size ~156 KB.
+ options.target_file_size_base = 160 << 10;
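+ // Roughly: 28 values of ~10 KB each (~280 KB total) are written below, which
+ // exceeds target_file_size_base, so the full compaction output is expected
+ // to be split across two files.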
+ Reopen(&options);
+
+ // Write 28 values, each 10016 B ~ 10KB
+ for (int idx = 0; idx < 28; ++idx) {
+ ASSERT_OK(Put(Key(idx), std::string(10000, 'a' + char(idx))));
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_EQ("1", FilesPerLevel());
+
+ ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr,
+ true /* disallow trivial move */));
+ ASSERT_EQ("0,2", FilesPerLevel());
+ for (int idx = 0; idx < 28; ++idx) {
+ ASSERT_EQ(std::string(10000, 'a' + char(idx)), Get(Key(idx)));
+ }
+}
+
+TEST_F(CuckooTableDBTest, SameKeyInsertedInTwoDifferentFilesAndCompacted) {
+ // Insert same key twice so that they go to different SST files. Then wait for
+ // compaction and check if the latest value is stored and old value removed.
+ Options options = CurrentOptions();
+ options.write_buffer_size = 100 << 10; // 100KB
+ options.level0_file_num_compaction_trigger = 2;
+ Reopen(&options);
+
+ // Write 11 values, each 10016 B
+ for (int idx = 0; idx < 11; ++idx) {
+ ASSERT_OK(Put(Key(idx), std::string(10000, 'a')));
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_EQ("1", FilesPerLevel());
+
+ // Generate one more file in level-0, and should trigger level-0 compaction
+ for (int idx = 0; idx < 11; ++idx) {
+ ASSERT_OK(Put(Key(idx), std::string(10000, 'a' + char(idx))));
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr));
+
+ ASSERT_EQ("0,1", FilesPerLevel());
+ for (int idx = 0; idx < 11; ++idx) {
+ ASSERT_EQ(std::string(10000, 'a' + char(idx)), Get(Key(idx)));
+ }
+}
+
+TEST_F(CuckooTableDBTest, AdaptiveTable) {
+ Options options = CurrentOptions();
+
+ // Ensure options compatible with PlainTable
+ options.prefix_extractor.reset(NewCappedPrefixTransform(8));
+
+ // Write some keys using cuckoo table.
+ options.table_factory.reset(NewCuckooTableFactory());
+ Reopen(&options);
+
+ ASSERT_OK(Put("key1", "v1"));
+ ASSERT_OK(Put("key2", "v2"));
+ ASSERT_OK(Put("key3", "v3"));
+ ASSERT_OK(dbfull()->TEST_FlushMemTable());
+
+ // Write some keys using plain table.
+ std::shared_ptr<TableFactory> block_based_factory(
+ NewBlockBasedTableFactory());
+ std::shared_ptr<TableFactory> plain_table_factory(NewPlainTableFactory());
+ std::shared_ptr<TableFactory> cuckoo_table_factory(NewCuckooTableFactory());
+ options.create_if_missing = false;
+ options.table_factory.reset(
+ NewAdaptiveTableFactory(plain_table_factory, block_based_factory,
+ plain_table_factory, cuckoo_table_factory));
+ Reopen(&options);
+ ASSERT_OK(Put("key4", "v4"));
+ ASSERT_OK(Put("key1", "v5"));
+ ASSERT_OK(dbfull()->TEST_FlushMemTable());
+
+ // Write some keys using block based table.
+ options.table_factory.reset(
+ NewAdaptiveTableFactory(block_based_factory, block_based_factory,
+ plain_table_factory, cuckoo_table_factory));
+ Reopen(&options);
+ ASSERT_OK(Put("key5", "v6"));
+ ASSERT_OK(Put("key2", "v7"));
+ ASSERT_OK(dbfull()->TEST_FlushMemTable());
+
+ ASSERT_EQ("v5", Get("key1"));
+ ASSERT_EQ("v7", Get("key2"));
+ ASSERT_EQ("v3", Get("key3"));
+ ASSERT_EQ("v4", Get("key4"));
+ ASSERT_EQ("v6", Get("key5"));
+}
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ if (ROCKSDB_NAMESPACE::port::kLittleEndian) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+ } else {
+ fprintf(stderr, "SKIPPED as Cuckoo table doesn't support Big Endian\n");
+ return 0;
+ }
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+ fprintf(stderr, "SKIPPED as Cuckoo table is not supported in ROCKSDB_LITE\n");
+ return 0;
+}
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/db/db_basic_test.cc b/src/rocksdb/db/db_basic_test.cc
new file mode 100644
index 000000000..a28ac2b88
--- /dev/null
+++ b/src/rocksdb/db/db_basic_test.cc
@@ -0,0 +1,4643 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <cstring>
+
+#include "db/db_test_util.h"
+#include "options/options_helper.h"
+#include "port/stack_trace.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/flush_block_policy.h"
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/perf_context.h"
+#include "rocksdb/table.h"
+#include "rocksdb/utilities/debug.h"
+#include "table/block_based/block_based_table_reader.h"
+#include "table/block_based/block_builder.h"
+#if !defined(ROCKSDB_LITE)
+#include "test_util/sync_point.h"
+#endif
+#include "util/file_checksum_helper.h"
+#include "util/random.h"
+#include "utilities/counted_fs.h"
+#include "utilities/fault_injection_env.h"
+#include "utilities/merge_operators.h"
+#include "utilities/merge_operators/string_append/stringappend.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBBasicTest : public DBTestBase {
+ public:
+ DBBasicTest() : DBTestBase("db_basic_test", /*env_do_fsync=*/false) {}
+};
+
+TEST_F(DBBasicTest, OpenWhenOpen) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ DB* db2 = nullptr;
+ Status s = DB::Open(options, dbname_, &db2);
+ ASSERT_NOK(s) << [db2]() {
+ delete db2;
+ return "db2 open: ok";
+ }();
+ ASSERT_EQ(Status::Code::kIOError, s.code());
+ ASSERT_EQ(Status::SubCode::kNone, s.subcode());
+ ASSERT_TRUE(strstr(s.getState(), "lock ") != nullptr);
+
+ delete db2;
+}
+
+TEST_F(DBBasicTest, EnableDirectIOWithZeroBuf) {
+ if (!IsDirectIOSupported()) {
+ ROCKSDB_GTEST_BYPASS("Direct IO not supported");
+ return;
+ }
+ Options options = GetDefaultOptions();
+ options.create_if_missing = true;
+ options.use_direct_io_for_flush_and_compaction = true;
+ options.writable_file_max_buffer_size = 0;
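+ // Direct I/O writes go through an aligned internal buffer, so a zero
+ // writable_file_max_buffer_size is expected to be rejected as
+ // InvalidArgument, both at open time and via SetDBOptions below.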
+ ASSERT_TRUE(TryReopen(options).IsInvalidArgument());
+
+ options.writable_file_max_buffer_size = 1024;
+ Reopen(options);
+ const std::unordered_map<std::string, std::string> new_db_opts = {
+ {"writable_file_max_buffer_size", "0"}};
+ ASSERT_TRUE(db_->SetDBOptions(new_db_opts).IsInvalidArgument());
+}
+
+TEST_F(DBBasicTest, UniqueSession) {
+ Options options = CurrentOptions();
+ std::string sid1, sid2, sid3, sid4;
+
+ ASSERT_OK(db_->GetDbSessionId(sid1));
+ Reopen(options);
+ ASSERT_OK(db_->GetDbSessionId(sid2));
+ ASSERT_OK(Put("foo", "v1"));
+ ASSERT_OK(db_->GetDbSessionId(sid4));
+ Reopen(options);
+ ASSERT_OK(db_->GetDbSessionId(sid3));
+
+ ASSERT_NE(sid1, sid2);
+ ASSERT_NE(sid1, sid3);
+ ASSERT_NE(sid2, sid3);
+
+ ASSERT_EQ(sid2, sid4);
+
+ // Expected compact format for session ids (see notes in implementation)
+ TestRegex expected("[0-9A-Z]{20}");
+ EXPECT_MATCHES_REGEX(sid1, expected);
+ EXPECT_MATCHES_REGEX(sid2, expected);
+ EXPECT_MATCHES_REGEX(sid3, expected);
+
+#ifndef ROCKSDB_LITE
+ Close();
+ ASSERT_OK(ReadOnlyReopen(options));
+ ASSERT_OK(db_->GetDbSessionId(sid1));
+ // Test uniqueness between readonly open (sid1) and regular open (sid3)
+ ASSERT_NE(sid1, sid3);
+ Close();
+ ASSERT_OK(ReadOnlyReopen(options));
+ ASSERT_OK(db_->GetDbSessionId(sid2));
+ ASSERT_EQ("v1", Get("foo"));
+ ASSERT_OK(db_->GetDbSessionId(sid3));
+
+ ASSERT_NE(sid1, sid2);
+
+ ASSERT_EQ(sid2, sid3);
+#endif // ROCKSDB_LITE
+
+ CreateAndReopenWithCF({"goku"}, options);
+ ASSERT_OK(db_->GetDbSessionId(sid1));
+ ASSERT_OK(Put("bar", "e1"));
+ ASSERT_OK(db_->GetDbSessionId(sid2));
+ ASSERT_EQ("e1", Get("bar"));
+ ASSERT_OK(db_->GetDbSessionId(sid3));
+ ReopenWithColumnFamilies({"default", "goku"}, options);
+ ASSERT_OK(db_->GetDbSessionId(sid4));
+
+ ASSERT_EQ(sid1, sid2);
+ ASSERT_EQ(sid2, sid3);
+
+ ASSERT_NE(sid1, sid4);
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBBasicTest, ReadOnlyDB) {
+ ASSERT_OK(Put("foo", "v1"));
+ ASSERT_OK(Put("bar", "v2"));
+ ASSERT_OK(Put("foo", "v3"));
+ Close();
+
+ auto verify_one_iter = [&](Iterator* iter) {
+ int count = 0;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ ASSERT_OK(iter->status());
+ ++count;
+ }
+ // Always expect two keys: "foo" and "bar"
+ ASSERT_EQ(count, 2);
+ };
+
+ auto verify_all_iters = [&]() {
+ Iterator* iter = db_->NewIterator(ReadOptions());
+ verify_one_iter(iter);
+ delete iter;
+
+ std::vector<Iterator*> iters;
+ ASSERT_OK(db_->NewIterators(ReadOptions(),
+ {dbfull()->DefaultColumnFamily()}, &iters));
+ ASSERT_EQ(static_cast<uint64_t>(1), iters.size());
+ verify_one_iter(iters[0]);
+ delete iters[0];
+ };
+
+ auto options = CurrentOptions();
+ assert(options.env == env_);
+ ASSERT_OK(ReadOnlyReopen(options));
+ ASSERT_EQ("v3", Get("foo"));
+ ASSERT_EQ("v2", Get("bar"));
+ verify_all_iters();
+ Close();
+
+ // Reopen and flush memtable.
+ Reopen(options);
+ ASSERT_OK(Flush());
+ Close();
+ // Now check keys in read only mode.
+ ASSERT_OK(ReadOnlyReopen(options));
+ ASSERT_EQ("v3", Get("foo"));
+ ASSERT_EQ("v2", Get("bar"));
+ verify_all_iters();
+ ASSERT_TRUE(db_->SyncWAL().IsNotSupported());
+}
+
+// TODO akanksha: Update the test to check that combination
+// does not actually write to FS (use open read-only with
+// CompositeEnvWrapper+ReadOnlyFileSystem).
+TEST_F(DBBasicTest, DISABLED_ReadOnlyDBWithWriteDBIdToManifestSet) {
+ ASSERT_OK(Put("foo", "v1"));
+ ASSERT_OK(Put("bar", "v2"));
+ ASSERT_OK(Put("foo", "v3"));
+ Close();
+
+ auto options = CurrentOptions();
+ options.write_dbid_to_manifest = true;
+ assert(options.env == env_);
+ ASSERT_OK(ReadOnlyReopen(options));
+ std::string db_id1;
+ ASSERT_OK(db_->GetDbIdentity(db_id1));
+ ASSERT_EQ("v3", Get("foo"));
+ ASSERT_EQ("v2", Get("bar"));
+ Iterator* iter = db_->NewIterator(ReadOptions());
+ int count = 0;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ ASSERT_OK(iter->status());
+ ++count;
+ }
+ ASSERT_EQ(count, 2);
+ delete iter;
+ Close();
+
+ // Reopen and flush memtable.
+ Reopen(options);
+ ASSERT_OK(Flush());
+ Close();
+ // Now check keys in read only mode.
+ ASSERT_OK(ReadOnlyReopen(options));
+ ASSERT_EQ("v3", Get("foo"));
+ ASSERT_EQ("v2", Get("bar"));
+ ASSERT_TRUE(db_->SyncWAL().IsNotSupported());
+ std::string db_id2;
+ ASSERT_OK(db_->GetDbIdentity(db_id2));
+ ASSERT_EQ(db_id1, db_id2);
+}
+
+TEST_F(DBBasicTest, CompactedDB) {
+ const uint64_t kFileSize = 1 << 20;
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.write_buffer_size = kFileSize;
+ options.target_file_size_base = kFileSize;
+ options.max_bytes_for_level_base = 1 << 30;
+ options.compression = kNoCompression;
+ Reopen(options);
+ // 1 L0 file, use CompactedDB if max_open_files = -1
+ ASSERT_OK(Put("aaa", DummyString(kFileSize / 2, '1')));
+ ASSERT_OK(Flush());
+ Close();
+ ASSERT_OK(ReadOnlyReopen(options));
+ Status s = Put("new", "value");
+ ASSERT_EQ(s.ToString(),
+ "Not implemented: Not supported operation in read only mode.");
+ ASSERT_EQ(DummyString(kFileSize / 2, '1'), Get("aaa"));
+ Close();
+ options.max_open_files = -1;
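+ // With max_open_files == -1 and all live data in a single compacted run,
+ // the read-only open path can use the CompactedDB implementation, which
+ // reports writes as "Not supported in compacted db mode".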
+ ASSERT_OK(ReadOnlyReopen(options));
+ s = Put("new", "value");
+ ASSERT_EQ(s.ToString(),
+ "Not implemented: Not supported in compacted db mode.");
+ ASSERT_EQ(DummyString(kFileSize / 2, '1'), Get("aaa"));
+ Close();
+ Reopen(options);
+ // Add more L0 files
+ ASSERT_OK(Put("bbb", DummyString(kFileSize / 2, '2')));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("aaa", DummyString(kFileSize / 2, 'a')));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("bbb", DummyString(kFileSize / 2, 'b')));
+ ASSERT_OK(Put("eee", DummyString(kFileSize / 2, 'e')));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("something_not_flushed", "x"));
+ Close();
+
+ ASSERT_OK(ReadOnlyReopen(options));
+ // Fallback to read-only DB
+ s = Put("new", "value");
+ ASSERT_EQ(s.ToString(),
+ "Not implemented: Not supported operation in read only mode.");
+
+ // TODO: validate that other write ops return NotImplemented
+ // (DBImplReadOnly is missing some overrides)
+
+ // Ensure no deadlock on flush triggered by another API function
+ // (Old deadlock bug depends on something_not_flushed above.)
+ std::vector<std::string> files;
+ uint64_t manifest_file_size;
+ ASSERT_OK(db_->GetLiveFiles(files, &manifest_file_size, /*flush*/ true));
+ LiveFilesStorageInfoOptions lfsi_opts;
+ lfsi_opts.wal_size_for_flush = 0; // always
+ std::vector<LiveFileStorageInfo> files2;
+ ASSERT_OK(db_->GetLiveFilesStorageInfo(lfsi_opts, &files2));
+
+ Close();
+
+ // Full compaction
+ Reopen(options);
+ // Add more keys
+ ASSERT_OK(Put("fff", DummyString(kFileSize / 2, 'f')));
+ ASSERT_OK(Put("hhh", DummyString(kFileSize / 2, 'h')));
+ ASSERT_OK(Put("iii", DummyString(kFileSize / 2, 'i')));
+ ASSERT_OK(Put("jjj", DummyString(kFileSize / 2, 'j')));
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ(3, NumTableFilesAtLevel(1));
+ Close();
+
+ // CompactedDB
+ ASSERT_OK(ReadOnlyReopen(options));
+ s = Put("new", "value");
+ ASSERT_EQ(s.ToString(),
+ "Not implemented: Not supported in compacted db mode.");
+ ASSERT_EQ("NOT_FOUND", Get("abc"));
+ ASSERT_EQ(DummyString(kFileSize / 2, 'a'), Get("aaa"));
+ ASSERT_EQ(DummyString(kFileSize / 2, 'b'), Get("bbb"));
+ ASSERT_EQ("NOT_FOUND", Get("ccc"));
+ ASSERT_EQ(DummyString(kFileSize / 2, 'e'), Get("eee"));
+ ASSERT_EQ(DummyString(kFileSize / 2, 'f'), Get("fff"));
+ ASSERT_EQ("NOT_FOUND", Get("ggg"));
+ ASSERT_EQ(DummyString(kFileSize / 2, 'h'), Get("hhh"));
+ ASSERT_EQ(DummyString(kFileSize / 2, 'i'), Get("iii"));
+ ASSERT_EQ(DummyString(kFileSize / 2, 'j'), Get("jjj"));
+ ASSERT_EQ("NOT_FOUND", Get("kkk"));
+
+ // TODO: validate that other write ops return NotImplemented
+ // (CompactedDB is missing some overrides)
+
+ // Ensure no deadlock on flush triggered by another API function
+ ASSERT_OK(db_->GetLiveFiles(files, &manifest_file_size, /*flush*/ true));
+ ASSERT_OK(db_->GetLiveFilesStorageInfo(lfsi_opts, &files2));
+
+ // MultiGet
+ std::vector<std::string> values;
+ std::vector<Status> status_list = dbfull()->MultiGet(
+ ReadOptions(),
+ std::vector<Slice>({Slice("aaa"), Slice("ccc"), Slice("eee"),
+ Slice("ggg"), Slice("iii"), Slice("kkk")}),
+ &values);
+ ASSERT_EQ(status_list.size(), static_cast<uint64_t>(6));
+ ASSERT_EQ(values.size(), static_cast<uint64_t>(6));
+ ASSERT_OK(status_list[0]);
+ ASSERT_EQ(DummyString(kFileSize / 2, 'a'), values[0]);
+ ASSERT_TRUE(status_list[1].IsNotFound());
+ ASSERT_OK(status_list[2]);
+ ASSERT_EQ(DummyString(kFileSize / 2, 'e'), values[2]);
+ ASSERT_TRUE(status_list[3].IsNotFound());
+ ASSERT_OK(status_list[4]);
+ ASSERT_EQ(DummyString(kFileSize / 2, 'i'), values[4]);
+ ASSERT_TRUE(status_list[5].IsNotFound());
+
+ Reopen(options);
+ // Add a key
+ ASSERT_OK(Put("fff", DummyString(kFileSize / 2, 'f')));
+ Close();
+ ASSERT_OK(ReadOnlyReopen(options));
+ s = Put("new", "value");
+ ASSERT_EQ(s.ToString(),
+ "Not implemented: Not supported operation in read only mode.");
+}
+
+TEST_F(DBBasicTest, LevelLimitReopen) {
+ Options options = CurrentOptions();
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ const std::string value(1024 * 1024, ' ');
+ int i = 0;
+ while (NumTableFilesAtLevel(2, 1) == 0) {
+ ASSERT_OK(Put(1, Key(i++), value));
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+
+ options.num_levels = 1;
+ options.max_bytes_for_level_multiplier_additional.resize(1, 1);
+ Status s = TryReopenWithColumnFamilies({"default", "pikachu"}, options);
+ ASSERT_EQ(s.IsInvalidArgument(), true);
+ ASSERT_EQ(s.ToString(),
+ "Invalid argument: db has more levels than options.num_levels");
+
+ options.num_levels = 10;
+ options.max_bytes_for_level_multiplier_additional.resize(10, 1);
+ ASSERT_OK(TryReopenWithColumnFamilies({"default", "pikachu"}, options));
+}
+#endif // ROCKSDB_LITE
+
+TEST_F(DBBasicTest, PutDeleteGet) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ ASSERT_OK(Put(1, "foo", "v1"));
+ ASSERT_EQ("v1", Get(1, "foo"));
+ ASSERT_OK(Put(1, "foo", "v2"));
+ ASSERT_EQ("v2", Get(1, "foo"));
+ ASSERT_OK(Delete(1, "foo"));
+ ASSERT_EQ("NOT_FOUND", Get(1, "foo"));
+ } while (ChangeOptions());
+}
+
+TEST_F(DBBasicTest, PutSingleDeleteGet) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ ASSERT_OK(Put(1, "foo", "v1"));
+ ASSERT_EQ("v1", Get(1, "foo"));
+ ASSERT_OK(Put(1, "foo2", "v2"));
+ ASSERT_EQ("v2", Get(1, "foo2"));
+ ASSERT_OK(SingleDelete(1, "foo"));
+ ASSERT_EQ("NOT_FOUND", Get(1, "foo"));
+ // Skip FIFO and universal compaction because they do not apply to the test
+ // case. Skip MergePut because single delete does not get removed when it
+ // encounters a merge.
+ } while (ChangeOptions(kSkipFIFOCompaction | kSkipUniversalCompaction |
+ kSkipMergePut));
+}
+
+TEST_F(DBBasicTest, EmptyFlush) {
+ // It is possible to produce empty flushes when using single deletes. Tests
+ // whether empty flushes cause issues.
+ do {
+ Random rnd(301);
+
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ ASSERT_OK(Put(1, "a", Slice()));
+ ASSERT_OK(SingleDelete(1, "a"));
+ ASSERT_OK(Flush(1));
+
+ ASSERT_EQ("[ ]", AllEntriesFor("a", 1));
+ // Skip FIFO and universal compaction as they do not apply to the test
+ // case. Skip MergePut because merges cannot be combined with single
+ // deletions.
+ } while (ChangeOptions(kSkipFIFOCompaction | kSkipUniversalCompaction |
+ kSkipMergePut));
+}
+
+TEST_F(DBBasicTest, GetFromVersions) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ ASSERT_OK(Put(1, "foo", "v1"));
+ ASSERT_OK(Flush(1));
+ ASSERT_EQ("v1", Get(1, "foo"));
+ ASSERT_EQ("NOT_FOUND", Get(0, "foo"));
+ } while (ChangeOptions());
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBBasicTest, GetSnapshot) {
+ anon::OptionsOverride options_override;
+ options_override.skip_policy = kSkipNoSnapshot;
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions(options_override));
+ // Try with both a short key and a long key
+ for (int i = 0; i < 2; i++) {
+ std::string key = (i == 0) ? std::string("foo") : std::string(200, 'x');
+ ASSERT_OK(Put(1, key, "v1"));
+ const Snapshot* s1 = db_->GetSnapshot();
+ ASSERT_OK(Put(1, key, "v2"));
+ ASSERT_EQ("v2", Get(1, key));
+ ASSERT_EQ("v1", Get(1, key, s1));
+ ASSERT_OK(Flush(1));
+ ASSERT_EQ("v2", Get(1, key));
+ ASSERT_EQ("v1", Get(1, key, s1));
+ db_->ReleaseSnapshot(s1);
+ }
+ } while (ChangeOptions());
+}
+#endif // ROCKSDB_LITE
+
+TEST_F(DBBasicTest, CheckLock) {
+ do {
+ DB* localdb = nullptr;
+ Options options = CurrentOptions();
+ ASSERT_OK(TryReopen(options));
+
+ // second open should fail
+ Status s = DB::Open(options, dbname_, &localdb);
+ ASSERT_NOK(s) << [localdb]() {
+ delete localdb;
+ return "localdb open: ok";
+ }();
+#ifdef OS_LINUX
+ ASSERT_TRUE(s.ToString().find("lock ") != std::string::npos);
+#endif // OS_LINUX
+ } while (ChangeCompactOptions());
+}
+
+TEST_F(DBBasicTest, FlushMultipleMemtable) {
+ do {
+ Options options = CurrentOptions();
+ WriteOptions writeOpt = WriteOptions();
+ writeOpt.disableWAL = true;
+ options.max_write_buffer_number = 4;
+ options.min_write_buffer_number_to_merge = 3;
+ options.max_write_buffer_size_to_maintain = -1;
+ CreateAndReopenWithCF({"pikachu"}, options);
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v1"));
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v1"));
+
+ ASSERT_EQ("v1", Get(1, "foo"));
+ ASSERT_EQ("v1", Get(1, "bar"));
+ ASSERT_OK(Flush(1));
+ } while (ChangeCompactOptions());
+}
+
+TEST_F(DBBasicTest, FlushEmptyColumnFamily) {
+ // Block flush thread and disable compaction thread
+ env_->SetBackgroundThreads(1, Env::HIGH);
+ env_->SetBackgroundThreads(1, Env::LOW);
+ test::SleepingBackgroundTask sleeping_task_low;
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+ Env::Priority::LOW);
+ test::SleepingBackgroundTask sleeping_task_high;
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
+ &sleeping_task_high, Env::Priority::HIGH);
+
+ Options options = CurrentOptions();
+ // disable compaction
+ options.disable_auto_compactions = true;
+ WriteOptions writeOpt = WriteOptions();
+ writeOpt.disableWAL = true;
+ options.max_write_buffer_number = 2;
+ options.min_write_buffer_number_to_merge = 1;
+ options.max_write_buffer_size_to_maintain =
+ static_cast<int64_t>(options.write_buffer_size);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // Compaction can still go through even if no thread can flush the
+ // mem table.
+ ASSERT_OK(Flush(0));
+ ASSERT_OK(Flush(1));
+
+ // Insert can go through
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[0], "foo", "v1"));
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v1"));
+
+ ASSERT_EQ("v1", Get(0, "foo"));
+ ASSERT_EQ("v1", Get(1, "bar"));
+
+ sleeping_task_high.WakeUp();
+ sleeping_task_high.WaitUntilDone();
+
+ // Flush can still go through.
+ ASSERT_OK(Flush(0));
+ ASSERT_OK(Flush(1));
+
+ sleeping_task_low.WakeUp();
+ sleeping_task_low.WaitUntilDone();
+}
+
+TEST_F(DBBasicTest, Flush) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ WriteOptions writeOpt = WriteOptions();
+ writeOpt.disableWAL = true;
+ SetPerfLevel(kEnableTime);
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v1"));
+ // this will now also flush the last 2 writes
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v1"));
+
+ get_perf_context()->Reset();
+ Get(1, "foo");
+ ASSERT_TRUE((int)get_perf_context()->get_from_output_files_time > 0);
+ ASSERT_EQ(2, (int)get_perf_context()->get_read_bytes);
+
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ ASSERT_EQ("v1", Get(1, "foo"));
+ ASSERT_EQ("v1", Get(1, "bar"));
+
+ writeOpt.disableWAL = true;
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v2"));
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v2"));
+ ASSERT_OK(Flush(1));
+
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ ASSERT_EQ("v2", Get(1, "bar"));
+ get_perf_context()->Reset();
+ ASSERT_EQ("v2", Get(1, "foo"));
+ ASSERT_TRUE((int)get_perf_context()->get_from_output_files_time > 0);
+
+ writeOpt.disableWAL = false;
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v3"));
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v3"));
+ ASSERT_OK(Flush(1));
+
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ // 'foo' should be there because its put
+ // has WAL enabled.
+ ASSERT_EQ("v3", Get(1, "foo"));
+ ASSERT_EQ("v3", Get(1, "bar"));
+
+ SetPerfLevel(kDisable);
+ } while (ChangeCompactOptions());
+}
+
+TEST_F(DBBasicTest, ManifestRollOver) {
+ do {
+ Options options;
+ options.max_manifest_file_size = 10; // 10 bytes
+ options = CurrentOptions(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+ {
+ ASSERT_OK(Put(1, "manifest_key1", std::string(1000, '1')));
+ ASSERT_OK(Put(1, "manifest_key2", std::string(1000, '2')));
+ ASSERT_OK(Put(1, "manifest_key3", std::string(1000, '3')));
+ uint64_t manifest_before_flush = dbfull()->TEST_Current_Manifest_FileNo();
+ ASSERT_OK(Flush(1)); // This should trigger LogAndApply.
+ uint64_t manifest_after_flush = dbfull()->TEST_Current_Manifest_FileNo();
+ ASSERT_GT(manifest_after_flush, manifest_before_flush);
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ ASSERT_GT(dbfull()->TEST_Current_Manifest_FileNo(), manifest_after_flush);
+ // check if a new manifest file got inserted or not.
+ ASSERT_EQ(std::string(1000, '1'), Get(1, "manifest_key1"));
+ ASSERT_EQ(std::string(1000, '2'), Get(1, "manifest_key2"));
+ ASSERT_EQ(std::string(1000, '3'), Get(1, "manifest_key3"));
+ }
+ } while (ChangeCompactOptions());
+}
+
+TEST_F(DBBasicTest, IdentityAcrossRestarts) {
+ constexpr size_t kMinIdSize = 10;
+ do {
+ for (bool with_manifest : {false, true}) {
+ std::string idfilename = IdentityFileName(dbname_);
+ std::string id1, tmp;
+ ASSERT_OK(db_->GetDbIdentity(id1));
+ ASSERT_GE(id1.size(), kMinIdSize);
+
+ Options options = CurrentOptions();
+ options.write_dbid_to_manifest = with_manifest;
+ Reopen(options);
+ std::string id2;
+ ASSERT_OK(db_->GetDbIdentity(id2));
+ // id2 should match id1 because identity was not regenerated
+ ASSERT_EQ(id1, id2);
+ ASSERT_OK(ReadFileToString(env_, idfilename, &tmp));
+ ASSERT_EQ(tmp, id2);
+
+ // Recover from deleted/missing IDENTITY
+ ASSERT_OK(env_->DeleteFile(idfilename));
+ Reopen(options);
+ std::string id3;
+ ASSERT_OK(db_->GetDbIdentity(id3));
+ if (with_manifest) {
+ // id3 should match id1 because identity was restored from manifest
+ ASSERT_EQ(id1, id3);
+ } else {
+ // id3 should NOT match id1 because identity was regenerated
+ ASSERT_NE(id1, id3);
+ ASSERT_GE(id3.size(), kMinIdSize);
+ }
+ ASSERT_OK(ReadFileToString(env_, idfilename, &tmp));
+ ASSERT_EQ(tmp, id3);
+
+ // Recover from truncated IDENTITY
+ {
+ std::unique_ptr<WritableFile> w;
+ ASSERT_OK(env_->NewWritableFile(idfilename, &w, EnvOptions()));
+ ASSERT_OK(w->Close());
+ }
+ Reopen(options);
+ std::string id4;
+ ASSERT_OK(db_->GetDbIdentity(id4));
+ if (with_manifest) {
+ // id4 should match id1 because identity was restored from manifest
+ ASSERT_EQ(id1, id4);
+ } else {
+ // id4 should NOT match id1 because identity was regenerated
+ ASSERT_NE(id1, id4);
+ ASSERT_GE(id4.size(), kMinIdSize);
+ }
+ ASSERT_OK(ReadFileToString(env_, idfilename, &tmp));
+ ASSERT_EQ(tmp, id4);
+
+ // Recover from overwritten IDENTITY
+ std::string silly_id = "asdf123456789";
+ {
+ std::unique_ptr<WritableFile> w;
+ ASSERT_OK(env_->NewWritableFile(idfilename, &w, EnvOptions()));
+ ASSERT_OK(w->Append(silly_id));
+ ASSERT_OK(w->Close());
+ }
+ Reopen(options);
+ std::string id5;
+ ASSERT_OK(db_->GetDbIdentity(id5));
+ if (with_manifest) {
+ // id5 should match id1 because identity was restored from manifest
+ ASSERT_EQ(id1, id5);
+ } else {
+ ASSERT_EQ(id5, silly_id);
+ }
+ ASSERT_OK(ReadFileToString(env_, idfilename, &tmp));
+ ASSERT_EQ(tmp, id5);
+ }
+ } while (ChangeCompactOptions());
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBBasicTest, Snapshot) {
+ env_->SetMockSleep();
+ anon::OptionsOverride options_override;
+ options_override.skip_policy = kSkipNoSnapshot;
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions(options_override));
+ ASSERT_OK(Put(0, "foo", "0v1"));
+ ASSERT_OK(Put(1, "foo", "1v1"));
+
+ const Snapshot* s1 = db_->GetSnapshot();
+ ASSERT_EQ(1U, GetNumSnapshots());
+ uint64_t time_snap1 = GetTimeOldestSnapshots();
+ ASSERT_GT(time_snap1, 0U);
+ ASSERT_EQ(GetSequenceOldestSnapshots(), s1->GetSequenceNumber());
+ ASSERT_OK(Put(0, "foo", "0v2"));
+ ASSERT_OK(Put(1, "foo", "1v2"));
+
+ env_->MockSleepForSeconds(1);
+
+ const Snapshot* s2 = db_->GetSnapshot();
+ ASSERT_EQ(2U, GetNumSnapshots());
+ ASSERT_EQ(time_snap1, GetTimeOldestSnapshots());
+ ASSERT_EQ(GetSequenceOldestSnapshots(), s1->GetSequenceNumber());
+ ASSERT_OK(Put(0, "foo", "0v3"));
+ ASSERT_OK(Put(1, "foo", "1v3"));
+
+ {
+ ManagedSnapshot s3(db_);
+ ASSERT_EQ(3U, GetNumSnapshots());
+ ASSERT_EQ(time_snap1, GetTimeOldestSnapshots());
+ ASSERT_EQ(GetSequenceOldestSnapshots(), s1->GetSequenceNumber());
+
+ ASSERT_OK(Put(0, "foo", "0v4"));
+ ASSERT_OK(Put(1, "foo", "1v4"));
+ ASSERT_EQ("0v1", Get(0, "foo", s1));
+ ASSERT_EQ("1v1", Get(1, "foo", s1));
+ ASSERT_EQ("0v2", Get(0, "foo", s2));
+ ASSERT_EQ("1v2", Get(1, "foo", s2));
+ ASSERT_EQ("0v3", Get(0, "foo", s3.snapshot()));
+ ASSERT_EQ("1v3", Get(1, "foo", s3.snapshot()));
+ ASSERT_EQ("0v4", Get(0, "foo"));
+ ASSERT_EQ("1v4", Get(1, "foo"));
+ }
+
+ ASSERT_EQ(2U, GetNumSnapshots());
+ ASSERT_EQ(time_snap1, GetTimeOldestSnapshots());
+ ASSERT_EQ(GetSequenceOldestSnapshots(), s1->GetSequenceNumber());
+ ASSERT_EQ("0v1", Get(0, "foo", s1));
+ ASSERT_EQ("1v1", Get(1, "foo", s1));
+ ASSERT_EQ("0v2", Get(0, "foo", s2));
+ ASSERT_EQ("1v2", Get(1, "foo", s2));
+ ASSERT_EQ("0v4", Get(0, "foo"));
+ ASSERT_EQ("1v4", Get(1, "foo"));
+
+ db_->ReleaseSnapshot(s1);
+ ASSERT_EQ("0v2", Get(0, "foo", s2));
+ ASSERT_EQ("1v2", Get(1, "foo", s2));
+ ASSERT_EQ("0v4", Get(0, "foo"));
+ ASSERT_EQ("1v4", Get(1, "foo"));
+ ASSERT_EQ(1U, GetNumSnapshots());
+ ASSERT_LT(time_snap1, GetTimeOldestSnapshots());
+ ASSERT_EQ(GetSequenceOldestSnapshots(), s2->GetSequenceNumber());
+
+ db_->ReleaseSnapshot(s2);
+ ASSERT_EQ(0U, GetNumSnapshots());
+ ASSERT_EQ(GetSequenceOldestSnapshots(), 0);
+ ASSERT_EQ("0v4", Get(0, "foo"));
+ ASSERT_EQ("1v4", Get(1, "foo"));
+ } while (ChangeOptions());
+}
+
+#endif // ROCKSDB_LITE
+
+class DBBasicMultiConfigs : public DBBasicTest,
+ public ::testing::WithParamInterface<int> {
+ public:
+ DBBasicMultiConfigs() { option_config_ = GetParam(); }
+
+ static std::vector<int> GenerateOptionConfigs() {
+ std::vector<int> option_configs;
+ for (int option_config = kDefault; option_config < kEnd; ++option_config) {
+ if (!ShouldSkipOptions(option_config, kSkipFIFOCompaction)) {
+ option_configs.push_back(option_config);
+ }
+ }
+ return option_configs;
+ }
+};
+
+TEST_P(DBBasicMultiConfigs, CompactBetweenSnapshots) {
+ anon::OptionsOverride options_override;
+ options_override.skip_policy = kSkipNoSnapshot;
+ Options options = CurrentOptions(options_override);
+ options.disable_auto_compactions = true;
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+ Random rnd(301);
+ FillLevels("a", "z", 1);
+
+ ASSERT_OK(Put(1, "foo", "first"));
+ const Snapshot* snapshot1 = db_->GetSnapshot();
+ ASSERT_OK(Put(1, "foo", "second"));
+ ASSERT_OK(Put(1, "foo", "third"));
+ ASSERT_OK(Put(1, "foo", "fourth"));
+ const Snapshot* snapshot2 = db_->GetSnapshot();
+ ASSERT_OK(Put(1, "foo", "fifth"));
+ ASSERT_OK(Put(1, "foo", "sixth"));
+
+ // All entries (including duplicates) exist
+ // before any compaction or flush is triggered.
+ ASSERT_EQ(AllEntriesFor("foo", 1),
+ "[ sixth, fifth, fourth, third, second, first ]");
+ ASSERT_EQ("sixth", Get(1, "foo"));
+ ASSERT_EQ("fourth", Get(1, "foo", snapshot2));
+ ASSERT_EQ("first", Get(1, "foo", snapshot1));
+
+ // After a flush, "second", "third" and "fifth" should
+ // be removed
+ ASSERT_OK(Flush(1));
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ sixth, fourth, first ]");
+
+ // after we release the snapshot1, only two values left
+ db_->ReleaseSnapshot(snapshot1);
+ FillLevels("a", "z", 1);
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr,
+ nullptr));
+
+ // We have only one valid snapshot, snapshot2. Since snapshot1 is
+ // not valid anymore, "first" should be removed by a compaction.
+ ASSERT_EQ("sixth", Get(1, "foo"));
+ ASSERT_EQ("fourth", Get(1, "foo", snapshot2));
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ sixth, fourth ]");
+
+ // after we release the snapshot2, only one value should be left
+ db_->ReleaseSnapshot(snapshot2);
+ FillLevels("a", "z", 1);
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr,
+ nullptr));
+ ASSERT_EQ("sixth", Get(1, "foo"));
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ sixth ]");
+}
+
+INSTANTIATE_TEST_CASE_P(
+ DBBasicMultiConfigs, DBBasicMultiConfigs,
+ ::testing::ValuesIn(DBBasicMultiConfigs::GenerateOptionConfigs()));
+
+TEST_F(DBBasicTest, DBOpen_Options) {
+ Options options = CurrentOptions();
+ Close();
+ Destroy(options);
+
+ // Does not exist, and create_if_missing == false: error
+ DB* db = nullptr;
+ options.create_if_missing = false;
+ Status s = DB::Open(options, dbname_, &db);
+ ASSERT_TRUE(strstr(s.ToString().c_str(), "does not exist") != nullptr);
+ ASSERT_TRUE(db == nullptr);
+
+ // Does not exist, and create_if_missing == true: OK
+ options.create_if_missing = true;
+ s = DB::Open(options, dbname_, &db);
+ ASSERT_OK(s);
+ ASSERT_TRUE(db != nullptr);
+
+ delete db;
+ db = nullptr;
+
+ // Does exist, and error_if_exists == true: error
+ options.create_if_missing = false;
+ options.error_if_exists = true;
+ s = DB::Open(options, dbname_, &db);
+ ASSERT_TRUE(strstr(s.ToString().c_str(), "exists") != nullptr);
+ ASSERT_TRUE(db == nullptr);
+
+ // Does exist, and error_if_exists == false: OK
+ options.create_if_missing = true;
+ options.error_if_exists = false;
+ s = DB::Open(options, dbname_, &db);
+ ASSERT_OK(s);
+ ASSERT_TRUE(db != nullptr);
+
+ delete db;
+ db = nullptr;
+}
+
+TEST_F(DBBasicTest, CompactOnFlush) {
+ anon::OptionsOverride options_override;
+ options_override.skip_policy = kSkipNoSnapshot;
+ do {
+ Options options = CurrentOptions(options_override);
+ options.disable_auto_compactions = true;
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ ASSERT_OK(Put(1, "foo", "v1"));
+ ASSERT_OK(Flush(1));
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ v1 ]");
+
+ // Write two new keys
+ ASSERT_OK(Put(1, "a", "begin"));
+ ASSERT_OK(Put(1, "z", "end"));
+ ASSERT_OK(Flush(1));
+
+ // Case 1: Delete followed by a put
+ ASSERT_OK(Delete(1, "foo"));
+ ASSERT_OK(Put(1, "foo", "v2"));
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2, DEL, v1 ]");
+
+ // After the current memtable is flushed, the DEL should
+ // have been removed
+ ASSERT_OK(Flush(1));
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2, v1 ]");
+
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[1],
+ nullptr, nullptr));
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2 ]");
+
+ // Case 2: Delete followed by another delete
+ ASSERT_OK(Delete(1, "foo"));
+ ASSERT_OK(Delete(1, "foo"));
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, DEL, v2 ]");
+ ASSERT_OK(Flush(1));
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, v2 ]");
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[1],
+ nullptr, nullptr));
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ ]");
+
+ // Case 3: Put followed by a delete
+ ASSERT_OK(Put(1, "foo", "v3"));
+ ASSERT_OK(Delete(1, "foo"));
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, v3 ]");
+ ASSERT_OK(Flush(1));
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL ]");
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[1],
+ nullptr, nullptr));
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ ]");
+
+ // Case 4: Put followed by another Put
+ ASSERT_OK(Put(1, "foo", "v4"));
+ ASSERT_OK(Put(1, "foo", "v5"));
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ v5, v4 ]");
+ ASSERT_OK(Flush(1));
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ v5 ]");
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[1],
+ nullptr, nullptr));
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ v5 ]");
+
+ // clear database
+ ASSERT_OK(Delete(1, "foo"));
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[1],
+ nullptr, nullptr));
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ ]");
+
+ // Case 5: Put followed by snapshot followed by another Put
+ // Both puts should remain.
+ ASSERT_OK(Put(1, "foo", "v6"));
+ const Snapshot* snapshot = db_->GetSnapshot();
+ ASSERT_OK(Put(1, "foo", "v7"));
+ ASSERT_OK(Flush(1));
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ v7, v6 ]");
+ db_->ReleaseSnapshot(snapshot);
+
+ // clear database
+ ASSERT_OK(Delete(1, "foo"));
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[1],
+ nullptr, nullptr));
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ ]");
+
+ // Case 6: snapshot followed by a put followed by another Put
+ // Only the last put should remain.
+ const Snapshot* snapshot1 = db_->GetSnapshot();
+ ASSERT_OK(Put(1, "foo", "v8"));
+ ASSERT_OK(Put(1, "foo", "v9"));
+ ASSERT_OK(Flush(1));
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ v9 ]");
+ db_->ReleaseSnapshot(snapshot1);
+ } while (ChangeCompactOptions());
+}
+
+TEST_F(DBBasicTest, FlushOneColumnFamily) {
+ Options options = CurrentOptions();
+ CreateAndReopenWithCF({"pikachu", "ilya", "muromec", "dobrynia", "nikitich",
+ "alyosha", "popovich"},
+ options);
+
+ ASSERT_OK(Put(0, "Default", "Default"));
+ ASSERT_OK(Put(1, "pikachu", "pikachu"));
+ ASSERT_OK(Put(2, "ilya", "ilya"));
+ ASSERT_OK(Put(3, "muromec", "muromec"));
+ ASSERT_OK(Put(4, "dobrynia", "dobrynia"));
+ ASSERT_OK(Put(5, "nikitich", "nikitich"));
+ ASSERT_OK(Put(6, "alyosha", "alyosha"));
+ ASSERT_OK(Put(7, "popovich", "popovich"));
+
+ for (int i = 0; i < 8; ++i) {
+ ASSERT_OK(Flush(i));
+ auto tables = ListTableFiles(env_, dbname_);
+ ASSERT_EQ(tables.size(), i + 1U);
+ }
+}
+
+TEST_F(DBBasicTest, MultiGetSimple) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ SetPerfLevel(kEnableCount);
+ ASSERT_OK(Put(1, "k1", "v1"));
+ ASSERT_OK(Put(1, "k2", "v2"));
+ ASSERT_OK(Put(1, "k3", "v3"));
+ ASSERT_OK(Put(1, "k4", "v4"));
+ ASSERT_OK(Delete(1, "k4"));
+ ASSERT_OK(Put(1, "k5", "v5"));
+ ASSERT_OK(Delete(1, "no_key"));
+
+ std::vector<Slice> keys({"k1", "k2", "k3", "k4", "k5", "no_key"});
+
+ std::vector<std::string> values(20, "Temporary data to be overwritten");
+ std::vector<ColumnFamilyHandle*> cfs(keys.size(), handles_[1]);
+
+ get_perf_context()->Reset();
+ std::vector<Status> s = db_->MultiGet(ReadOptions(), cfs, keys, &values);
+ ASSERT_EQ(values.size(), keys.size());
+ ASSERT_EQ(values[0], "v1");
+ ASSERT_EQ(values[1], "v2");
+ ASSERT_EQ(values[2], "v3");
+ ASSERT_EQ(values[4], "v5");
+ // four kv pairs * two bytes per value
+ ASSERT_EQ(8, (int)get_perf_context()->multiget_read_bytes);
+
+ ASSERT_OK(s[0]);
+ ASSERT_OK(s[1]);
+ ASSERT_OK(s[2]);
+ ASSERT_TRUE(s[3].IsNotFound());
+ ASSERT_OK(s[4]);
+ ASSERT_TRUE(s[5].IsNotFound());
+ SetPerfLevel(kDisable);
+ } while (ChangeCompactOptions());
+}
+
+TEST_F(DBBasicTest, MultiGetEmpty) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ // Empty Key Set
+ std::vector<Slice> keys;
+ std::vector<std::string> values;
+ std::vector<ColumnFamilyHandle*> cfs;
+ std::vector<Status> s = db_->MultiGet(ReadOptions(), cfs, keys, &values);
+ ASSERT_EQ(s.size(), 0U);
+
+ // Empty Database, Empty Key Set
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+ s = db_->MultiGet(ReadOptions(), cfs, keys, &values);
+ ASSERT_EQ(s.size(), 0U);
+
+ // Empty Database, Search for Keys
+ keys.resize(2);
+ keys[0] = "a";
+ keys[1] = "b";
+ cfs.push_back(handles_[0]);
+ cfs.push_back(handles_[1]);
+ s = db_->MultiGet(ReadOptions(), cfs, keys, &values);
+ ASSERT_EQ(static_cast<int>(s.size()), 2);
+ ASSERT_TRUE(s[0].IsNotFound() && s[1].IsNotFound());
+ } while (ChangeCompactOptions());
+}
+
+class DBBlockChecksumTest : public DBBasicTest,
+ public testing::WithParamInterface<uint32_t> {};
+
+INSTANTIATE_TEST_CASE_P(FormatVersions, DBBlockChecksumTest,
+ testing::ValuesIn(test::kFooterFormatVersionsToTest));
+
+TEST_P(DBBlockChecksumTest, BlockChecksumTest) {
+ BlockBasedTableOptions table_options;
+ table_options.format_version = GetParam();
+ Options options = CurrentOptions();
+ const int kNumPerFile = 2;
+
+ const auto algs = GetSupportedChecksums();
+ const int algs_size = static_cast<int>(algs.size());
+
+ // generate one table with each type of checksum
+ for (int i = 0; i < algs_size; ++i) {
+ table_options.checksum = algs[i];
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ Reopen(options);
+ for (int j = 0; j < kNumPerFile; ++j) {
+ ASSERT_OK(Put(Key(i * kNumPerFile + j), Key(i * kNumPerFile + j)));
+ }
+ ASSERT_OK(Flush());
+ }
+
+ // with each valid checksum type setting...
+ for (int i = 0; i < algs_size; ++i) {
+ table_options.checksum = algs[i];
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ Reopen(options);
+    // verify reads across files written with every checksum type
+    // (verification should succeed regardless of the current setting)
+ for (int j = 0; j < algs_size * kNumPerFile; ++j) {
+ ASSERT_EQ(Key(j), Get(Key(j)));
+ }
+ }
+
+ // Now test invalid checksum type
+ table_options.checksum = static_cast<ChecksumType>(123);
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ ASSERT_TRUE(TryReopen(options).IsInvalidArgument());
+}
+
+// On Windows a file can be opened for either memory-mapped or unbuffered
+// access, but not both, so this combination asserts and the test does not
+// make sense to run
+#ifndef OS_WIN
+TEST_F(DBBasicTest, MmapAndBufferOptions) {
+ if (!IsMemoryMappedAccessSupported()) {
+ return;
+ }
+ Options options = CurrentOptions();
+
+ options.use_direct_reads = true;
+ options.allow_mmap_reads = true;
+ ASSERT_NOK(TryReopen(options));
+
+ // All other combinations are acceptable
+ options.use_direct_reads = false;
+ ASSERT_OK(TryReopen(options));
+
+ if (IsDirectIOSupported()) {
+ options.use_direct_reads = true;
+ options.allow_mmap_reads = false;
+ ASSERT_OK(TryReopen(options));
+ }
+
+ options.use_direct_reads = false;
+ ASSERT_OK(TryReopen(options));
+}
+#endif
+
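+// Env wrapper whose loggers report an IOError from Close() and count how many
+// times Close() is invoked, so tests can verify that DB teardown closes the
+// info log and propagates the error.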
+class TestEnv : public EnvWrapper {
+ public:
+ explicit TestEnv(Env* base_env) : EnvWrapper(base_env), close_count(0) {}
+ static const char* kClassName() { return "TestEnv"; }
+ const char* Name() const override { return kClassName(); }
+
+ class TestLogger : public Logger {
+ public:
+ using Logger::Logv;
+ explicit TestLogger(TestEnv* env_ptr) : Logger() { env = env_ptr; }
+ ~TestLogger() override {
+ if (!closed_) {
+ CloseHelper().PermitUncheckedError();
+ }
+ }
+ void Logv(const char* /*format*/, va_list /*ap*/) override {}
+
+ protected:
+ Status CloseImpl() override { return CloseHelper(); }
+
+ private:
+ Status CloseHelper() {
+ env->CloseCountInc();
+ return Status::IOError();
+ }
+ TestEnv* env;
+ };
+
+ void CloseCountInc() { close_count++; }
+
+ int GetCloseCount() { return close_count; }
+
+ Status NewLogger(const std::string& /*fname*/,
+ std::shared_ptr<Logger>* result) override {
+ result->reset(new TestLogger(this));
+ return Status::OK();
+ }
+
+ private:
+ int close_count;
+};
+
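+// Verify that DB::Close() closes the DB-owned info log and propagates the
+// logger's close error, that deleting the DB without an explicit Close() still
+// closes the logger, and that a user-provided info_log is left for the caller.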
+TEST_F(DBBasicTest, DBClose) {
+ Options options = GetDefaultOptions();
+ std::string dbname = test::PerThreadDBPath("db_close_test");
+ ASSERT_OK(DestroyDB(dbname, options));
+
+ DB* db = nullptr;
+ TestEnv* env = new TestEnv(env_);
+ std::unique_ptr<TestEnv> local_env_guard(env);
+ options.create_if_missing = true;
+ options.env = env;
+ Status s = DB::Open(options, dbname, &db);
+ ASSERT_OK(s);
+ ASSERT_TRUE(db != nullptr);
+
+ s = db->Close();
+ ASSERT_EQ(env->GetCloseCount(), 1);
+ ASSERT_EQ(s, Status::IOError());
+
+ delete db;
+ ASSERT_EQ(env->GetCloseCount(), 1);
+
+ // Do not call DB::Close() and ensure our logger Close() still gets called
+ s = DB::Open(options, dbname, &db);
+ ASSERT_OK(s);
+ ASSERT_TRUE(db != nullptr);
+ delete db;
+ ASSERT_EQ(env->GetCloseCount(), 2);
+
+ // Provide our own logger and ensure DB::Close() does not close it
+ options.info_log.reset(new TestEnv::TestLogger(env));
+ options.create_if_missing = false;
+ s = DB::Open(options, dbname, &db);
+ ASSERT_OK(s);
+ ASSERT_TRUE(db != nullptr);
+
+ s = db->Close();
+ ASSERT_EQ(s, Status::OK());
+ delete db;
+ ASSERT_EQ(env->GetCloseCount(), 2);
+ options.info_log.reset();
+ ASSERT_EQ(env->GetCloseCount(), 3);
+}
+
+TEST_F(DBBasicTest, DBCloseAllDirectoryFDs) {
+ Options options = GetDefaultOptions();
+ std::string dbname = test::PerThreadDBPath("db_close_all_dir_fds_test");
+ // Configure a specific WAL directory
+ options.wal_dir = dbname + "_wal_dir";
+ // Configure 3 different data directories
+ options.db_paths.emplace_back(dbname + "_1", 512 * 1024);
+ options.db_paths.emplace_back(dbname + "_2", 4 * 1024 * 1024);
+ options.db_paths.emplace_back(dbname + "_3", 1024 * 1024 * 1024);
+
+ ASSERT_OK(DestroyDB(dbname, options));
+
+ DB* db = nullptr;
+ std::unique_ptr<Env> env = NewCompositeEnv(
+ std::make_shared<CountedFileSystem>(FileSystem::Default()));
+ options.create_if_missing = true;
+ options.env = env.get();
+ Status s = DB::Open(options, dbname, &db);
+ ASSERT_OK(s);
+ ASSERT_TRUE(db != nullptr);
+
+  // Explicitly close the database to ensure the open and close counters for
+  // directories are equal
+ s = db->Close();
+ auto* counted_fs =
+ options.env->GetFileSystem()->CheckedCast<CountedFileSystem>();
+ ASSERT_TRUE(counted_fs != nullptr);
+ ASSERT_EQ(counted_fs->counters()->dir_opens,
+ counted_fs->counters()->dir_closes);
+ ASSERT_OK(s);
+ delete db;
+}
+
+TEST_F(DBBasicTest, DBCloseFlushError) {
+ std::unique_ptr<FaultInjectionTestEnv> fault_injection_env(
+ new FaultInjectionTestEnv(env_));
+ Options options = GetDefaultOptions();
+ options.create_if_missing = true;
+ options.manual_wal_flush = true;
+ options.write_buffer_size = 100;
+ options.env = fault_injection_env.get();
+
+ Reopen(options);
+ ASSERT_OK(Put("key1", "value1"));
+ ASSERT_OK(Put("key2", "value2"));
+ ASSERT_OK(dbfull()->TEST_SwitchMemtable());
+ ASSERT_OK(Put("key3", "value3"));
+ fault_injection_env->SetFilesystemActive(false);
+ Status s = dbfull()->Close();
+ ASSERT_NE(s, Status::OK());
+ // retry should return the same error
+ s = dbfull()->Close();
+ ASSERT_NE(s, Status::OK());
+ fault_injection_env->SetFilesystemActive(true);
+  // Retrying Close() is a no-op even after the file system is back. This
+  // could be improved if Close() were retry-able: #9029
+ s = dbfull()->Close();
+ ASSERT_NE(s, Status::OK());
+ Destroy(options);
+}
+
+class DBMultiGetTestWithParam
+ : public DBBasicTest,
+ public testing::WithParamInterface<std::tuple<bool, bool>> {};
+
+TEST_P(DBMultiGetTestWithParam, MultiGetMultiCF) {
+#ifndef USE_COROUTINES
+ if (std::get<1>(GetParam())) {
+ ROCKSDB_GTEST_SKIP("This test requires coroutine support");
+ return;
+ }
+#endif // USE_COROUTINES
+ Options options = CurrentOptions();
+ CreateAndReopenWithCF({"pikachu", "ilya", "muromec", "dobrynia", "nikitich",
+ "alyosha", "popovich"},
+ options);
+ // <CF, key, value> tuples
+ std::vector<std::tuple<int, std::string, std::string>> cf_kv_vec;
+ static const int num_keys = 24;
+ cf_kv_vec.reserve(num_keys);
+
+ for (int i = 0; i < num_keys; ++i) {
+ int cf = i / 3;
+    int cf_key = i % 3;
+ cf_kv_vec.emplace_back(std::make_tuple(
+ cf, "cf" + std::to_string(cf) + "_key_" + std::to_string(cf_key),
+ "cf" + std::to_string(cf) + "_val_" + std::to_string(cf_key)));
+ ASSERT_OK(Put(std::get<0>(cf_kv_vec[i]), std::get<1>(cf_kv_vec[i]),
+ std::get<2>(cf_kv_vec[i])));
+ }
+
+ int get_sv_count = 0;
+ ROCKSDB_NAMESPACE::DBImpl* db = static_cast_with_check<DBImpl>(db_);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::MultiGet::AfterRefSV", [&](void* /*arg*/) {
+ if (++get_sv_count == 2) {
+ // After MultiGet refs a couple of CFs, flush all CFs so MultiGet
+ // is forced to repeat the process
+ for (int i = 0; i < num_keys; ++i) {
+ int cf = i / 3;
+ int cf_key = i % 8;
+ if (cf_key == 0) {
+ ASSERT_OK(Flush(cf));
+ }
+ ASSERT_OK(Put(std::get<0>(cf_kv_vec[i]), std::get<1>(cf_kv_vec[i]),
+ std::get<2>(cf_kv_vec[i]) + "_2"));
+ }
+ }
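+        // By the time get_sv_count reaches 11, MultiGet is expected to have
+        // re-acquired and pinned a SuperVersion for every column family
+        // (kSVInUse), which the checks below verify.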
+ if (get_sv_count == 11) {
+ for (int i = 0; i < 8; ++i) {
+ auto* cfd = static_cast_with_check<ColumnFamilyHandleImpl>(
+ db->GetColumnFamilyHandle(i))
+ ->cfd();
+ ASSERT_EQ(cfd->TEST_GetLocalSV()->Get(), SuperVersion::kSVInUse);
+ }
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ std::vector<int> cfs;
+ std::vector<std::string> keys;
+ std::vector<std::string> values;
+
+ for (int i = 0; i < num_keys; ++i) {
+ cfs.push_back(std::get<0>(cf_kv_vec[i]));
+ keys.push_back(std::get<1>(cf_kv_vec[i]));
+ }
+
+ values = MultiGet(cfs, keys, nullptr, std::get<0>(GetParam()),
+ std::get<1>(GetParam()));
+ ASSERT_EQ(values.size(), num_keys);
+ for (unsigned int j = 0; j < values.size(); ++j) {
+ ASSERT_EQ(values[j], std::get<2>(cf_kv_vec[j]) + "_2");
+ }
+
+ keys.clear();
+ cfs.clear();
+ cfs.push_back(std::get<0>(cf_kv_vec[0]));
+ keys.push_back(std::get<1>(cf_kv_vec[0]));
+ cfs.push_back(std::get<0>(cf_kv_vec[3]));
+ keys.push_back(std::get<1>(cf_kv_vec[3]));
+ cfs.push_back(std::get<0>(cf_kv_vec[4]));
+ keys.push_back(std::get<1>(cf_kv_vec[4]));
+ values = MultiGet(cfs, keys, nullptr, std::get<0>(GetParam()),
+ std::get<1>(GetParam()));
+ ASSERT_EQ(values[0], std::get<2>(cf_kv_vec[0]) + "_2");
+ ASSERT_EQ(values[1], std::get<2>(cf_kv_vec[3]) + "_2");
+ ASSERT_EQ(values[2], std::get<2>(cf_kv_vec[4]) + "_2");
+
+ keys.clear();
+ cfs.clear();
+ cfs.push_back(std::get<0>(cf_kv_vec[7]));
+ keys.push_back(std::get<1>(cf_kv_vec[7]));
+ cfs.push_back(std::get<0>(cf_kv_vec[6]));
+ keys.push_back(std::get<1>(cf_kv_vec[6]));
+ cfs.push_back(std::get<0>(cf_kv_vec[1]));
+ keys.push_back(std::get<1>(cf_kv_vec[1]));
+ values = MultiGet(cfs, keys, nullptr, std::get<0>(GetParam()),
+ std::get<1>(GetParam()));
+ ASSERT_EQ(values[0], std::get<2>(cf_kv_vec[7]) + "_2");
+ ASSERT_EQ(values[1], std::get<2>(cf_kv_vec[6]) + "_2");
+ ASSERT_EQ(values[2], std::get<2>(cf_kv_vec[1]) + "_2");
+
+ for (int cf = 0; cf < 8; ++cf) {
+ auto* cfd =
+ static_cast_with_check<ColumnFamilyHandleImpl>(
+ static_cast_with_check<DBImpl>(db_)->GetColumnFamilyHandle(cf))
+ ->cfd();
+ ASSERT_NE(cfd->TEST_GetLocalSV()->Get(), SuperVersion::kSVInUse);
+ ASSERT_NE(cfd->TEST_GetLocalSV()->Get(), SuperVersion::kSVObsolete);
+ }
+}
+
+TEST_P(DBMultiGetTestWithParam, MultiGetMultiCFMutex) {
+#ifndef USE_COROUTINES
+ if (std::get<1>(GetParam())) {
+ ROCKSDB_GTEST_SKIP("This test requires coroutine support");
+ return;
+ }
+#endif // USE_COROUTINES
+ Options options = CurrentOptions();
+ CreateAndReopenWithCF({"pikachu", "ilya", "muromec", "dobrynia", "nikitich",
+ "alyosha", "popovich"},
+ options);
+
+ for (int i = 0; i < 8; ++i) {
+ ASSERT_OK(Put(i, "cf" + std::to_string(i) + "_key",
+ "cf" + std::to_string(i) + "_val"));
+ }
+
+ int get_sv_count = 0;
+ int retries = 0;
+ bool last_try = false;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::MultiGet::LastTry", [&](void* /*arg*/) {
+ last_try = true;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::MultiGet::AfterRefSV", [&](void* /*arg*/) {
+ if (last_try) {
+ return;
+ }
+ if (++get_sv_count == 2) {
+ ++retries;
+ get_sv_count = 0;
+ for (int i = 0; i < 8; ++i) {
+ ASSERT_OK(Flush(i));
+ ASSERT_OK(Put(
+ i, "cf" + std::to_string(i) + "_key",
+ "cf" + std::to_string(i) + "_val" + std::to_string(retries)));
+ }
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ std::vector<int> cfs;
+ std::vector<std::string> keys;
+ std::vector<std::string> values;
+
+ for (int i = 0; i < 8; ++i) {
+ cfs.push_back(i);
+ keys.push_back("cf" + std::to_string(i) + "_key");
+ }
+
+ values = MultiGet(cfs, keys, nullptr, std::get<0>(GetParam()),
+ std::get<1>(GetParam()));
+ ASSERT_TRUE(last_try);
+ ASSERT_EQ(values.size(), 8);
+ for (unsigned int j = 0; j < values.size(); ++j) {
+ ASSERT_EQ(values[j],
+ "cf" + std::to_string(j) + "_val" + std::to_string(retries));
+ }
+ for (int i = 0; i < 8; ++i) {
+ auto* cfd =
+ static_cast_with_check<ColumnFamilyHandleImpl>(
+ static_cast_with_check<DBImpl>(db_)->GetColumnFamilyHandle(i))
+ ->cfd();
+ ASSERT_NE(cfd->TEST_GetLocalSV()->Get(), SuperVersion::kSVInUse);
+ }
+}
+
+TEST_P(DBMultiGetTestWithParam, MultiGetMultiCFSnapshot) {
+#ifndef USE_COROUTINES
+ if (std::get<1>(GetParam())) {
+ ROCKSDB_GTEST_SKIP("This test requires coroutine support");
+ return;
+ }
+#endif // USE_COROUTINES
+ Options options = CurrentOptions();
+ CreateAndReopenWithCF({"pikachu", "ilya", "muromec", "dobrynia", "nikitich",
+ "alyosha", "popovich"},
+ options);
+
+ for (int i = 0; i < 8; ++i) {
+ ASSERT_OK(Put(i, "cf" + std::to_string(i) + "_key",
+ "cf" + std::to_string(i) + "_val"));
+ }
+
+ int get_sv_count = 0;
+ ROCKSDB_NAMESPACE::DBImpl* db = static_cast_with_check<DBImpl>(db_);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::MultiGet::AfterRefSV", [&](void* /*arg*/) {
+ if (++get_sv_count == 2) {
+ for (int i = 0; i < 8; ++i) {
+ ASSERT_OK(Flush(i));
+ ASSERT_OK(Put(i, "cf" + std::to_string(i) + "_key",
+ "cf" + std::to_string(i) + "_val2"));
+ }
+ }
+ if (get_sv_count == 8) {
+ for (int i = 0; i < 8; ++i) {
+ auto* cfd = static_cast_with_check<ColumnFamilyHandleImpl>(
+ db->GetColumnFamilyHandle(i))
+ ->cfd();
+ ASSERT_TRUE(
+ (cfd->TEST_GetLocalSV()->Get() == SuperVersion::kSVInUse) ||
+ (cfd->TEST_GetLocalSV()->Get() == SuperVersion::kSVObsolete));
+ }
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ std::vector<int> cfs;
+ std::vector<std::string> keys;
+ std::vector<std::string> values;
+
+ for (int i = 0; i < 8; ++i) {
+ cfs.push_back(i);
+ keys.push_back("cf" + std::to_string(i) + "_key");
+ }
+
+ const Snapshot* snapshot = db_->GetSnapshot();
+ values = MultiGet(cfs, keys, snapshot, std::get<0>(GetParam()),
+ std::get<1>(GetParam()));
+ db_->ReleaseSnapshot(snapshot);
+ ASSERT_EQ(values.size(), 8);
+ for (unsigned int j = 0; j < values.size(); ++j) {
+ ASSERT_EQ(values[j], "cf" + std::to_string(j) + "_val");
+ }
+ for (int i = 0; i < 8; ++i) {
+ auto* cfd =
+ static_cast_with_check<ColumnFamilyHandleImpl>(
+ static_cast_with_check<DBImpl>(db_)->GetColumnFamilyHandle(i))
+ ->cfd();
+ ASSERT_NE(cfd->TEST_GetLocalSV()->Get(), SuperVersion::kSVInUse);
+ }
+}
+
+TEST_P(DBMultiGetTestWithParam, MultiGetMultiCFUnsorted) {
+#ifndef USE_COROUTINES
+ if (std::get<1>(GetParam())) {
+ ROCKSDB_GTEST_SKIP("This test requires coroutine support");
+ return;
+ }
+#endif // USE_COROUTINES
+ Options options = CurrentOptions();
+ CreateAndReopenWithCF({"one", "two"}, options);
+
+ ASSERT_OK(Put(1, "foo", "bar"));
+ ASSERT_OK(Put(2, "baz", "xyz"));
+ ASSERT_OK(Put(1, "abc", "def"));
+
+ // Note: keys for the same CF do not form a consecutive range
+ std::vector<int> cfs{1, 2, 1};
+ std::vector<std::string> keys{"foo", "baz", "abc"};
+ std::vector<std::string> values;
+
+ values = MultiGet(cfs, keys, /* snapshot */ nullptr,
+ /* batched */ std::get<0>(GetParam()),
+ /* async */ std::get<1>(GetParam()));
+
+ ASSERT_EQ(values.size(), 3);
+ ASSERT_EQ(values[0], "bar");
+ ASSERT_EQ(values[1], "xyz");
+ ASSERT_EQ(values[2], "def");
+}
+
+TEST_P(DBMultiGetTestWithParam, MultiGetBatchedSimpleUnsorted) {
+#ifndef USE_COROUTINES
+ if (std::get<1>(GetParam())) {
+ ROCKSDB_GTEST_SKIP("This test requires coroutine support");
+ return;
+ }
+#endif // USE_COROUTINES
+ // Skip for unbatched MultiGet
+ if (!std::get<0>(GetParam())) {
+ ROCKSDB_GTEST_BYPASS("This test is only for batched MultiGet");
+ return;
+ }
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ SetPerfLevel(kEnableCount);
+ ASSERT_OK(Put(1, "k1", "v1"));
+ ASSERT_OK(Put(1, "k2", "v2"));
+ ASSERT_OK(Put(1, "k3", "v3"));
+ ASSERT_OK(Put(1, "k4", "v4"));
+ ASSERT_OK(Delete(1, "k4"));
+ ASSERT_OK(Put(1, "k5", "v5"));
+ ASSERT_OK(Delete(1, "no_key"));
+
+ get_perf_context()->Reset();
+
+ std::vector<Slice> keys({"no_key", "k5", "k4", "k3", "k2", "k1"});
+ std::vector<PinnableSlice> values(keys.size());
+ std::vector<ColumnFamilyHandle*> cfs(keys.size(), handles_[1]);
+ std::vector<Status> s(keys.size());
+
+ ReadOptions ro;
+ ro.async_io = std::get<1>(GetParam());
+ db_->MultiGet(ro, handles_[1], keys.size(), keys.data(), values.data(),
+ s.data(), false);
+
+ ASSERT_EQ(values.size(), keys.size());
+ ASSERT_EQ(std::string(values[5].data(), values[5].size()), "v1");
+ ASSERT_EQ(std::string(values[4].data(), values[4].size()), "v2");
+ ASSERT_EQ(std::string(values[3].data(), values[3].size()), "v3");
+ ASSERT_EQ(std::string(values[1].data(), values[1].size()), "v5");
+ // four kv pairs * two bytes per value
+ ASSERT_EQ(8, (int)get_perf_context()->multiget_read_bytes);
+
+ ASSERT_TRUE(s[0].IsNotFound());
+ ASSERT_OK(s[1]);
+ ASSERT_TRUE(s[2].IsNotFound());
+ ASSERT_OK(s[3]);
+ ASSERT_OK(s[4]);
+ ASSERT_OK(s[5]);
+
+ SetPerfLevel(kDisable);
+ } while (ChangeCompactOptions());
+}
+
+TEST_P(DBMultiGetTestWithParam, MultiGetBatchedSortedMultiFile) {
+#ifndef USE_COROUTINES
+ if (std::get<1>(GetParam())) {
+ ROCKSDB_GTEST_SKIP("This test requires coroutine support");
+ return;
+ }
+#endif // USE_COROUTINES
+ // Skip for unbatched MultiGet
+ if (!std::get<0>(GetParam())) {
+ ROCKSDB_GTEST_BYPASS("This test is only for batched MultiGet");
+ return;
+ }
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ SetPerfLevel(kEnableCount);
+    // To make the test more thorough, generate more than one table file and
+    // also mix in memtable entries
+ ASSERT_OK(Put(1, "k1", "v1"));
+ ASSERT_OK(Put(1, "k2", "v2"));
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(Put(1, "k3", "v3"));
+ ASSERT_OK(Put(1, "k4", "v4"));
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(Delete(1, "k4"));
+ ASSERT_OK(Put(1, "k5", "v5"));
+ ASSERT_OK(Delete(1, "no_key"));
+
+ get_perf_context()->Reset();
+
+ std::vector<Slice> keys({"k1", "k2", "k3", "k4", "k5", "no_key"});
+ std::vector<PinnableSlice> values(keys.size());
+ std::vector<ColumnFamilyHandle*> cfs(keys.size(), handles_[1]);
+ std::vector<Status> s(keys.size());
+
+ ReadOptions ro;
+ ro.async_io = std::get<1>(GetParam());
+ db_->MultiGet(ro, handles_[1], keys.size(), keys.data(), values.data(),
+ s.data(), true);
+
+ ASSERT_EQ(values.size(), keys.size());
+ ASSERT_EQ(std::string(values[0].data(), values[0].size()), "v1");
+ ASSERT_EQ(std::string(values[1].data(), values[1].size()), "v2");
+ ASSERT_EQ(std::string(values[2].data(), values[2].size()), "v3");
+ ASSERT_EQ(std::string(values[4].data(), values[4].size()), "v5");
+ // four kv pairs * two bytes per value
+ ASSERT_EQ(8, (int)get_perf_context()->multiget_read_bytes);
+
+ ASSERT_OK(s[0]);
+ ASSERT_OK(s[1]);
+ ASSERT_OK(s[2]);
+ ASSERT_TRUE(s[3].IsNotFound());
+ ASSERT_OK(s[4]);
+ ASSERT_TRUE(s[5].IsNotFound());
+
+ SetPerfLevel(kDisable);
+ } while (ChangeOptions());
+}
+
+TEST_P(DBMultiGetTestWithParam, MultiGetBatchedDuplicateKeys) {
+#ifndef USE_COROUTINES
+ if (std::get<1>(GetParam())) {
+ ROCKSDB_GTEST_SKIP("This test requires coroutine support");
+ return;
+ }
+#endif // USE_COROUTINES
+ // Skip for unbatched MultiGet
+ if (!std::get<0>(GetParam())) {
+ ROCKSDB_GTEST_BYPASS("This test is only for batched MultiGet");
+ return;
+ }
+ Options opts = CurrentOptions();
+ opts.merge_operator = MergeOperators::CreateStringAppendOperator();
+ CreateAndReopenWithCF({"pikachu"}, opts);
+ SetPerfLevel(kEnableCount);
+  // To make the test more thorough, generate more than one table file and
+  // also mix in memtable entries
+ ASSERT_OK(Merge(1, "k1", "v1"));
+ ASSERT_OK(Merge(1, "k2", "v2"));
+ ASSERT_OK(Flush(1));
+ MoveFilesToLevel(2, 1);
+ ASSERT_OK(Merge(1, "k3", "v3"));
+ ASSERT_OK(Merge(1, "k4", "v4"));
+ ASSERT_OK(Flush(1));
+ MoveFilesToLevel(2, 1);
+ ASSERT_OK(Merge(1, "k4", "v4_2"));
+ ASSERT_OK(Merge(1, "k6", "v6"));
+ ASSERT_OK(Flush(1));
+ MoveFilesToLevel(2, 1);
+ ASSERT_OK(Merge(1, "k7", "v7"));
+ ASSERT_OK(Merge(1, "k8", "v8"));
+ ASSERT_OK(Flush(1));
+ MoveFilesToLevel(2, 1);
+
+ get_perf_context()->Reset();
+
+ std::vector<Slice> keys({"k8", "k8", "k8", "k4", "k4", "k1", "k3"});
+ std::vector<PinnableSlice> values(keys.size());
+ std::vector<ColumnFamilyHandle*> cfs(keys.size(), handles_[1]);
+ std::vector<Status> s(keys.size());
+
+ ReadOptions ro;
+ ro.async_io = std::get<1>(GetParam());
+ db_->MultiGet(ro, handles_[1], keys.size(), keys.data(), values.data(),
+ s.data(), false);
+
+ ASSERT_EQ(values.size(), keys.size());
+ ASSERT_EQ(std::string(values[0].data(), values[0].size()), "v8");
+ ASSERT_EQ(std::string(values[1].data(), values[1].size()), "v8");
+ ASSERT_EQ(std::string(values[2].data(), values[2].size()), "v8");
+ ASSERT_EQ(std::string(values[3].data(), values[3].size()), "v4,v4_2");
+ ASSERT_EQ(std::string(values[4].data(), values[4].size()), "v4,v4_2");
+ ASSERT_EQ(std::string(values[5].data(), values[5].size()), "v1");
+ ASSERT_EQ(std::string(values[6].data(), values[6].size()), "v3");
+ ASSERT_EQ(24, (int)get_perf_context()->multiget_read_bytes);
+
+ for (Status& status : s) {
+ ASSERT_OK(status);
+ }
+
+ SetPerfLevel(kDisable);
+}
+
+TEST_P(DBMultiGetTestWithParam, MultiGetBatchedMultiLevel) {
+#ifndef USE_COROUTINES
+ if (std::get<1>(GetParam())) {
+ ROCKSDB_GTEST_SKIP("This test requires coroutine support");
+ return;
+ }
+#endif // USE_COROUTINES
+ // Skip for unbatched MultiGet
+ if (!std::get<0>(GetParam())) {
+ ROCKSDB_GTEST_BYPASS("This test is only for batched MultiGet");
+ return;
+ }
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ Reopen(options);
+ int num_keys = 0;
+
+ for (int i = 0; i < 128; ++i) {
+ ASSERT_OK(Put("key_" + std::to_string(i), "val_l2_" + std::to_string(i)));
+ num_keys++;
+ if (num_keys == 8) {
+ ASSERT_OK(Flush());
+ num_keys = 0;
+ }
+ }
+ if (num_keys > 0) {
+ ASSERT_OK(Flush());
+ num_keys = 0;
+ }
+ MoveFilesToLevel(2);
+
+ for (int i = 0; i < 128; i += 3) {
+ ASSERT_OK(Put("key_" + std::to_string(i), "val_l1_" + std::to_string(i)));
+ num_keys++;
+ if (num_keys == 8) {
+ ASSERT_OK(Flush());
+ num_keys = 0;
+ }
+ }
+ if (num_keys > 0) {
+ ASSERT_OK(Flush());
+ num_keys = 0;
+ }
+ MoveFilesToLevel(1);
+
+ for (int i = 0; i < 128; i += 5) {
+ ASSERT_OK(Put("key_" + std::to_string(i), "val_l0_" + std::to_string(i)));
+ num_keys++;
+ if (num_keys == 8) {
+ ASSERT_OK(Flush());
+ num_keys = 0;
+ }
+ }
+ if (num_keys > 0) {
+ ASSERT_OK(Flush());
+ num_keys = 0;
+ }
+ ASSERT_EQ(0, num_keys);
+
+ for (int i = 0; i < 128; i += 9) {
+ ASSERT_OK(Put("key_" + std::to_string(i), "val_mem_" + std::to_string(i)));
+ }
+
+ std::vector<std::string> keys;
+ std::vector<std::string> values;
+
+ for (int i = 64; i < 80; ++i) {
+ keys.push_back("key_" + std::to_string(i));
+ }
+
+ values = MultiGet(keys, nullptr, std::get<1>(GetParam()));
+ ASSERT_EQ(values.size(), 16);
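+  // The most recent write for each key wins: memtable overrides L0, which
+  // overrides L1, which overrides L2.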
+ for (unsigned int j = 0; j < values.size(); ++j) {
+ int key = j + 64;
+ if (key % 9 == 0) {
+ ASSERT_EQ(values[j], "val_mem_" + std::to_string(key));
+ } else if (key % 5 == 0) {
+ ASSERT_EQ(values[j], "val_l0_" + std::to_string(key));
+ } else if (key % 3 == 0) {
+ ASSERT_EQ(values[j], "val_l1_" + std::to_string(key));
+ } else {
+ ASSERT_EQ(values[j], "val_l2_" + std::to_string(key));
+ }
+ }
+}
+
+TEST_P(DBMultiGetTestWithParam, MultiGetBatchedMultiLevelMerge) {
+#ifndef USE_COROUTINES
+ if (std::get<1>(GetParam())) {
+ ROCKSDB_GTEST_SKIP("This test requires coroutine support");
+ return;
+ }
+#endif // USE_COROUTINES
+ // Skip for unbatched MultiGet
+ if (!std::get<0>(GetParam())) {
+ ROCKSDB_GTEST_BYPASS("This test is only for batched MultiGet");
+ return;
+ }
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.merge_operator = MergeOperators::CreateStringAppendOperator();
+ BlockBasedTableOptions bbto;
+ bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ Reopen(options);
+ int num_keys = 0;
+
+ for (int i = 0; i < 128; ++i) {
+ ASSERT_OK(Put("key_" + std::to_string(i), "val_l2_" + std::to_string(i)));
+ num_keys++;
+ if (num_keys == 8) {
+ ASSERT_OK(Flush());
+ num_keys = 0;
+ }
+ }
+ if (num_keys > 0) {
+ ASSERT_OK(Flush());
+ num_keys = 0;
+ }
+ MoveFilesToLevel(2);
+
+ for (int i = 0; i < 128; i += 3) {
+ ASSERT_OK(Merge("key_" + std::to_string(i), "val_l1_" + std::to_string(i)));
+ num_keys++;
+ if (num_keys == 8) {
+ ASSERT_OK(Flush());
+ num_keys = 0;
+ }
+ }
+ if (num_keys > 0) {
+ ASSERT_OK(Flush());
+ num_keys = 0;
+ }
+ MoveFilesToLevel(1);
+
+ for (int i = 0; i < 128; i += 5) {
+ ASSERT_OK(Merge("key_" + std::to_string(i), "val_l0_" + std::to_string(i)));
+ num_keys++;
+ if (num_keys == 8) {
+ ASSERT_OK(Flush());
+ num_keys = 0;
+ }
+ }
+ if (num_keys > 0) {
+ ASSERT_OK(Flush());
+ num_keys = 0;
+ }
+ ASSERT_EQ(0, num_keys);
+
+ for (int i = 0; i < 128; i += 9) {
+ ASSERT_OK(
+ Merge("key_" + std::to_string(i), "val_mem_" + std::to_string(i)));
+ }
+
+ std::vector<std::string> keys;
+ std::vector<std::string> values;
+
+ for (int i = 32; i < 80; ++i) {
+ keys.push_back("key_" + std::to_string(i));
+ }
+
+ values = MultiGet(keys, nullptr, std::get<1>(GetParam()));
+ ASSERT_EQ(values.size(), keys.size());
+ for (unsigned int j = 0; j < 48; ++j) {
+ int key = j + 32;
+ std::string value;
+ value.append("val_l2_" + std::to_string(key));
+ if (key % 3 == 0) {
+ value.append(",");
+ value.append("val_l1_" + std::to_string(key));
+ }
+ if (key % 5 == 0) {
+ value.append(",");
+ value.append("val_l0_" + std::to_string(key));
+ }
+ if (key % 9 == 0) {
+ value.append(",");
+ value.append("val_mem_" + std::to_string(key));
+ }
+ ASSERT_EQ(values[j], value);
+ }
+}
+
+TEST_P(DBMultiGetTestWithParam, MultiGetBatchedValueSizeInMemory) {
+#ifndef USE_COROUTINES
+ if (std::get<1>(GetParam())) {
+ ROCKSDB_GTEST_SKIP("This test requires coroutine support");
+ return;
+ }
+#endif // USE_COROUTINES
+ // Skip for unbatched MultiGet
+ if (!std::get<0>(GetParam())) {
+ ROCKSDB_GTEST_BYPASS("This test is only for batched MultiGet");
+ return;
+ }
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ SetPerfLevel(kEnableCount);
+ ASSERT_OK(Put(1, "k1", "v_1"));
+ ASSERT_OK(Put(1, "k2", "v_2"));
+ ASSERT_OK(Put(1, "k3", "v_3"));
+ ASSERT_OK(Put(1, "k4", "v_4"));
+ ASSERT_OK(Put(1, "k5", "v_5"));
+ ASSERT_OK(Put(1, "k6", "v_6"));
+ std::vector<Slice> keys = {"k1", "k2", "k3", "k4", "k5", "k6"};
+ std::vector<PinnableSlice> values(keys.size());
+ std::vector<Status> s(keys.size());
+ std::vector<ColumnFamilyHandle*> cfs(keys.size(), handles_[1]);
+
+ get_perf_context()->Reset();
+ ReadOptions ro;
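+  // value_size_soft_limit caps the cumulative size of values returned by one
+  // MultiGet batch; keys processed after the limit is reached are Aborted.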
+ ro.value_size_soft_limit = 11;
+ ro.async_io = std::get<1>(GetParam());
+ db_->MultiGet(ro, handles_[1], keys.size(), keys.data(), values.data(),
+ s.data(), false);
+
+ ASSERT_EQ(values.size(), keys.size());
+ for (unsigned int i = 0; i < 4; i++) {
+ ASSERT_EQ(std::string(values[i].data(), values[i].size()),
+ "v_" + std::to_string(i + 1));
+ }
+
+ for (unsigned int i = 4; i < 6; i++) {
+ ASSERT_TRUE(s[i].IsAborted());
+ }
+
+ ASSERT_EQ(12, (int)get_perf_context()->multiget_read_bytes);
+ SetPerfLevel(kDisable);
+}
+
+TEST_P(DBMultiGetTestWithParam, MultiGetBatchedValueSize) {
+#ifndef USE_COROUTINES
+ if (std::get<1>(GetParam())) {
+ ROCKSDB_GTEST_SKIP("This test requires coroutine support");
+ return;
+ }
+#endif // USE_COROUTINES
+ // Skip for unbatched MultiGet
+ if (!std::get<0>(GetParam())) {
+ return;
+ }
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ SetPerfLevel(kEnableCount);
+
+ ASSERT_OK(Put(1, "k6", "v6"));
+ ASSERT_OK(Put(1, "k7", "v7_"));
+ ASSERT_OK(Put(1, "k3", "v3_"));
+ ASSERT_OK(Put(1, "k4", "v4"));
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(Delete(1, "k4"));
+ ASSERT_OK(Put(1, "k11", "v11"));
+ ASSERT_OK(Delete(1, "no_key"));
+ ASSERT_OK(Put(1, "k8", "v8_"));
+ ASSERT_OK(Put(1, "k13", "v13"));
+ ASSERT_OK(Put(1, "k14", "v14"));
+ ASSERT_OK(Put(1, "k15", "v15"));
+ ASSERT_OK(Put(1, "k16", "v16"));
+ ASSERT_OK(Put(1, "k17", "v17"));
+ ASSERT_OK(Flush(1));
+
+ ASSERT_OK(Put(1, "k1", "v1_"));
+ ASSERT_OK(Put(1, "k2", "v2_"));
+ ASSERT_OK(Put(1, "k5", "v5_"));
+ ASSERT_OK(Put(1, "k9", "v9_"));
+ ASSERT_OK(Put(1, "k10", "v10"));
+ ASSERT_OK(Delete(1, "k2"));
+ ASSERT_OK(Delete(1, "k6"));
+
+ get_perf_context()->Reset();
+
+ std::vector<Slice> keys({"k1", "k10", "k11", "k12", "k13", "k14", "k15",
+ "k16", "k17", "k2", "k3", "k4", "k5", "k6", "k7",
+ "k8", "k9", "no_key"});
+ std::vector<PinnableSlice> values(keys.size());
+ std::vector<ColumnFamilyHandle*> cfs(keys.size(), handles_[1]);
+ std::vector<Status> s(keys.size());
+
+ ReadOptions ro;
+ ro.value_size_soft_limit = 20;
+ ro.async_io = std::get<1>(GetParam());
+ db_->MultiGet(ro, handles_[1], keys.size(), keys.data(), values.data(),
+ s.data(), false);
+
+ ASSERT_EQ(values.size(), keys.size());
+
+ // In memory keys
+ ASSERT_EQ(std::string(values[0].data(), values[0].size()), "v1_");
+ ASSERT_EQ(std::string(values[1].data(), values[1].size()), "v10");
+ ASSERT_TRUE(s[9].IsNotFound()); // k2
+ ASSERT_EQ(std::string(values[12].data(), values[12].size()), "v5_");
+ ASSERT_TRUE(s[13].IsNotFound()); // k6
+ ASSERT_EQ(std::string(values[16].data(), values[16].size()), "v9_");
+
+ // In sst files
+    ASSERT_EQ(std::string(values[2].data(), values[2].size()), "v11");
+ ASSERT_EQ(std::string(values[4].data(), values[4].size()), "v13");
+ ASSERT_EQ(std::string(values[5].data(), values[5].size()), "v14");
+
+    // Remaining keys are aborted once value_size_soft_limit is exceeded.
+ ASSERT_TRUE(s[3].IsAborted());
+ ASSERT_TRUE(s[6].IsAborted());
+ ASSERT_TRUE(s[7].IsAborted());
+ ASSERT_TRUE(s[8].IsAborted());
+ ASSERT_TRUE(s[10].IsAborted());
+ ASSERT_TRUE(s[11].IsAborted());
+ ASSERT_TRUE(s[14].IsAborted());
+ ASSERT_TRUE(s[15].IsAborted());
+ ASSERT_TRUE(s[17].IsAborted());
+
+    // 7 kv pairs * 3 bytes per value (i.e. 21)
+ ASSERT_EQ(21, (int)get_perf_context()->multiget_read_bytes);
+ SetPerfLevel(kDisable);
+ } while (ChangeCompactOptions());
+}
+
+TEST_P(DBMultiGetTestWithParam, MultiGetBatchedValueSizeMultiLevelMerge) {
+ if (std::get<1>(GetParam())) {
+ ROCKSDB_GTEST_BYPASS("This test needs to be fixed for async IO");
+ return;
+ }
+ // Skip for unbatched MultiGet
+ if (!std::get<0>(GetParam())) {
+ ROCKSDB_GTEST_BYPASS("This test is only for batched MultiGet");
+ return;
+ }
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.merge_operator = MergeOperators::CreateStringAppendOperator();
+ BlockBasedTableOptions bbto;
+ bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ Reopen(options);
+ int num_keys = 0;
+
+ for (int i = 0; i < 64; ++i) {
+ ASSERT_OK(Put("key_" + std::to_string(i), "val_l2_" + std::to_string(i)));
+ num_keys++;
+ if (num_keys == 8) {
+ ASSERT_OK(Flush());
+ num_keys = 0;
+ }
+ }
+ if (num_keys > 0) {
+ ASSERT_OK(Flush());
+ num_keys = 0;
+ }
+ MoveFilesToLevel(2);
+
+ for (int i = 0; i < 64; i += 3) {
+ ASSERT_OK(Merge("key_" + std::to_string(i), "val_l1_" + std::to_string(i)));
+ num_keys++;
+ if (num_keys == 8) {
+ ASSERT_OK(Flush());
+ num_keys = 0;
+ }
+ }
+ if (num_keys > 0) {
+ ASSERT_OK(Flush());
+ num_keys = 0;
+ }
+ MoveFilesToLevel(1);
+
+ for (int i = 0; i < 64; i += 5) {
+ ASSERT_OK(Merge("key_" + std::to_string(i), "val_l0_" + std::to_string(i)));
+ num_keys++;
+ if (num_keys == 8) {
+ ASSERT_OK(Flush());
+ num_keys = 0;
+ }
+ }
+ if (num_keys > 0) {
+ ASSERT_OK(Flush());
+ num_keys = 0;
+ }
+ ASSERT_EQ(0, num_keys);
+
+ for (int i = 0; i < 64; i += 9) {
+ ASSERT_OK(
+ Merge("key_" + std::to_string(i), "val_mem_" + std::to_string(i)));
+ }
+
+ std::vector<std::string> keys_str;
+ for (int i = 10; i < 50; ++i) {
+ keys_str.push_back("key_" + std::to_string(i));
+ }
+
+ std::vector<Slice> keys(keys_str.size());
+ for (int i = 0; i < 40; i++) {
+ keys[i] = Slice(keys_str[i]);
+ }
+
+ std::vector<PinnableSlice> values(keys_str.size());
+ std::vector<Status> statuses(keys_str.size());
+ ReadOptions read_options;
+ read_options.verify_checksums = true;
+ read_options.value_size_soft_limit = 380;
+ read_options.async_io = std::get<1>(GetParam());
+ db_->MultiGet(read_options, dbfull()->DefaultColumnFamily(), keys.size(),
+ keys.data(), values.data(), statuses.data());
+
+ ASSERT_EQ(values.size(), keys.size());
+
+ for (unsigned int j = 0; j < 26; ++j) {
+ int key = j + 10;
+ std::string value;
+ value.append("val_l2_" + std::to_string(key));
+ if (key % 3 == 0) {
+ value.append(",");
+ value.append("val_l1_" + std::to_string(key));
+ }
+ if (key % 5 == 0) {
+ value.append(",");
+ value.append("val_l0_" + std::to_string(key));
+ }
+ if (key % 9 == 0) {
+ value.append(",");
+ value.append("val_mem_" + std::to_string(key));
+ }
+ ASSERT_EQ(values[j], value);
+ ASSERT_OK(statuses[j]);
+ }
+
+  // All remaining keys' statuses are set to Status::Aborted()
+ for (unsigned int j = 26; j < 40; j++) {
+ ASSERT_TRUE(statuses[j].IsAborted());
+ }
+}
+
+INSTANTIATE_TEST_CASE_P(DBMultiGetTestWithParam, DBMultiGetTestWithParam,
+ testing::Combine(testing::Bool(), testing::Bool()));
+
+#if USE_COROUTINES
+class DBMultiGetAsyncIOTest : public DBBasicTest,
+ public ::testing::WithParamInterface<bool> {
+ public:
+ DBMultiGetAsyncIOTest()
+ : DBBasicTest(), statistics_(ROCKSDB_NAMESPACE::CreateDBStatistics()) {
+ BlockBasedTableOptions bbto;
+ bbto.filter_policy.reset(NewBloomFilterPolicy(10));
+ options_ = CurrentOptions();
+ options_.disable_auto_compactions = true;
+ options_.statistics = statistics_;
+ options_.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ Reopen(options_);
+ int num_keys = 0;
+
+ // Put all keys in the bottommost level, and overwrite some keys
+ // in L0 and L1
+ for (int i = 0; i < 256; ++i) {
+ EXPECT_OK(Put(Key(i), "val_l2_" + std::to_string(i)));
+ num_keys++;
+ if (num_keys == 8) {
+ EXPECT_OK(Flush());
+ num_keys = 0;
+ }
+ }
+ if (num_keys > 0) {
+ EXPECT_OK(Flush());
+ num_keys = 0;
+ }
+ MoveFilesToLevel(2);
+
+ for (int i = 0; i < 128; i += 3) {
+ EXPECT_OK(Put(Key(i), "val_l1_" + std::to_string(i)));
+ num_keys++;
+ if (num_keys == 8) {
+ EXPECT_OK(Flush());
+ num_keys = 0;
+ }
+ }
+ if (num_keys > 0) {
+ EXPECT_OK(Flush());
+ num_keys = 0;
+ }
+ // Put some range deletes in L1
+ for (int i = 128; i < 256; i += 32) {
+ std::string range_begin = Key(i);
+ std::string range_end = Key(i + 16);
+ EXPECT_OK(dbfull()->DeleteRange(WriteOptions(),
+ dbfull()->DefaultColumnFamily(),
+ range_begin, range_end));
+ // Also do some Puts to force creation of bloom filter
+ for (int j = i + 16; j < i + 32; ++j) {
+ if (j % 3 == 0) {
+ EXPECT_OK(Put(Key(j), "val_l1_" + std::to_string(j)));
+ }
+ }
+ EXPECT_OK(Flush());
+ }
+ MoveFilesToLevel(1);
+
+ for (int i = 0; i < 128; i += 5) {
+ EXPECT_OK(Put(Key(i), "val_l0_" + std::to_string(i)));
+ num_keys++;
+ if (num_keys == 8) {
+ EXPECT_OK(Flush());
+ num_keys = 0;
+ }
+ }
+ if (num_keys > 0) {
+ EXPECT_OK(Flush());
+ num_keys = 0;
+ }
+ EXPECT_EQ(0, num_keys);
+ }
+
+ const std::shared_ptr<Statistics>& statistics() { return statistics_; }
+
+ protected:
+ void ReopenDB() { Reopen(options_); }
+
+ private:
+ std::shared_ptr<Statistics> statistics_;
+ Options options_;
+};
+
+TEST_P(DBMultiGetAsyncIOTest, GetFromL0) {
+ // All 3 keys in L0. The L0 files should be read serially.
+ std::vector<std::string> key_strs{Key(0), Key(40), Key(80)};
+ std::vector<Slice> keys{key_strs[0], key_strs[1], key_strs[2]};
+ std::vector<PinnableSlice> values(key_strs.size());
+ std::vector<Status> statuses(key_strs.size());
+
+ ReadOptions ro;
+ ro.async_io = true;
+ ro.optimize_multiget_for_io = GetParam();
+ dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
+ keys.data(), values.data(), statuses.data());
+ ASSERT_EQ(values.size(), 3);
+ ASSERT_OK(statuses[0]);
+ ASSERT_OK(statuses[1]);
+ ASSERT_OK(statuses[2]);
+ ASSERT_EQ(values[0], "val_l0_" + std::to_string(0));
+ ASSERT_EQ(values[1], "val_l0_" + std::to_string(40));
+ ASSERT_EQ(values[2], "val_l0_" + std::to_string(80));
+
+ HistogramData multiget_io_batch_size;
+
+ statistics()->histogramData(MULTIGET_IO_BATCH_SIZE, &multiget_io_batch_size);
+
+ // With async IO, lookups will happen in parallel for each key
+ if (GetParam()) {
+ ASSERT_EQ(multiget_io_batch_size.count, 1);
+ ASSERT_EQ(multiget_io_batch_size.max, 3);
+ ASSERT_EQ(statistics()->getTickerCount(MULTIGET_COROUTINE_COUNT), 3);
+ } else {
+ // Without Async IO, MultiGet will call MultiRead 3 times, once for each
+ // L0 file
+ ASSERT_EQ(multiget_io_batch_size.count, 3);
+ }
+}
+
+TEST_P(DBMultiGetAsyncIOTest, GetFromL1) {
+ std::vector<std::string> key_strs;
+ std::vector<Slice> keys;
+ std::vector<PinnableSlice> values;
+ std::vector<Status> statuses;
+
+ key_strs.push_back(Key(33));
+ key_strs.push_back(Key(54));
+ key_strs.push_back(Key(102));
+ keys.push_back(key_strs[0]);
+ keys.push_back(key_strs[1]);
+ keys.push_back(key_strs[2]);
+ values.resize(keys.size());
+ statuses.resize(keys.size());
+
+ ReadOptions ro;
+ ro.async_io = true;
+ ro.optimize_multiget_for_io = GetParam();
+ dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
+ keys.data(), values.data(), statuses.data());
+ ASSERT_EQ(values.size(), 3);
+ ASSERT_EQ(statuses[0], Status::OK());
+ ASSERT_EQ(statuses[1], Status::OK());
+ ASSERT_EQ(statuses[2], Status::OK());
+ ASSERT_EQ(values[0], "val_l1_" + std::to_string(33));
+ ASSERT_EQ(values[1], "val_l1_" + std::to_string(54));
+ ASSERT_EQ(values[2], "val_l1_" + std::to_string(102));
+
+ HistogramData multiget_io_batch_size;
+
+ statistics()->histogramData(MULTIGET_IO_BATCH_SIZE, &multiget_io_batch_size);
+
+ // A batch of 3 async IOs is expected, one for each overlapping file in L1
+ ASSERT_EQ(multiget_io_batch_size.count, 1);
+ ASSERT_EQ(multiget_io_batch_size.max, 3);
+ ASSERT_EQ(statistics()->getTickerCount(MULTIGET_COROUTINE_COUNT), 3);
+}
+
+TEST_P(DBMultiGetAsyncIOTest, GetFromL1Error) {
+ std::vector<std::string> key_strs;
+ std::vector<Slice> keys;
+ std::vector<PinnableSlice> values;
+ std::vector<Status> statuses;
+
+ key_strs.push_back(Key(33));
+ key_strs.push_back(Key(54));
+ key_strs.push_back(Key(102));
+ keys.push_back(key_strs[0]);
+ keys.push_back(key_strs[1]);
+ keys.push_back(key_strs[2]);
+ values.resize(keys.size());
+ statuses.resize(keys.size());
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "TableCache::GetTableReader:BeforeOpenFile", [&](void* status) {
+ static int count = 0;
+ count++;
+ // Fail the last table reader open, which is the 6th SST file
+ // since 3 overlapping L0 files + 3 L1 files containing the keys
+ if (count == 6) {
+ Status* s = static_cast<Status*>(status);
+ *s = Status::IOError();
+ }
+ });
+ // DB open will create table readers unless we reduce the table cache
+ // capacity.
+  // SanitizeOptions will set max_open_files to a minimum of 20. The table
+  // cache is allocated with max_open_files - 10 as its capacity. So override
+  // max_open_files to 11 so the table cache capacity becomes 1. This prevents
+  // files from being opened during DB open and forces them to be opened
+  // during MultiGet
+ SyncPoint::GetInstance()->SetCallBack(
+ "SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) {
+ int* max_open_files = (int*)arg;
+ *max_open_files = 11;
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ ReopenDB();
+
+ ReadOptions ro;
+ ro.async_io = true;
+ ro.optimize_multiget_for_io = GetParam();
+ dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
+ keys.data(), values.data(), statuses.data());
+ SyncPoint::GetInstance()->DisableProcessing();
+ ASSERT_EQ(values.size(), 3);
+ ASSERT_EQ(statuses[0], Status::OK());
+ ASSERT_EQ(statuses[1], Status::OK());
+ ASSERT_EQ(statuses[2], Status::IOError());
+
+ HistogramData multiget_io_batch_size;
+
+ statistics()->histogramData(MULTIGET_IO_BATCH_SIZE, &multiget_io_batch_size);
+
+ // A batch of 3 async IOs is expected, one for each overlapping file in L1
+ ASSERT_EQ(multiget_io_batch_size.count, 1);
+ ASSERT_EQ(multiget_io_batch_size.max, 2);
+ ASSERT_EQ(statistics()->getTickerCount(MULTIGET_COROUTINE_COUNT), 2);
+}
+
+TEST_P(DBMultiGetAsyncIOTest, LastKeyInFile) {
+ std::vector<std::string> key_strs;
+ std::vector<Slice> keys;
+ std::vector<PinnableSlice> values;
+ std::vector<Status> statuses;
+
+ // 21 is the last key in the first L1 file
+ key_strs.push_back(Key(21));
+ key_strs.push_back(Key(54));
+ key_strs.push_back(Key(102));
+ keys.push_back(key_strs[0]);
+ keys.push_back(key_strs[1]);
+ keys.push_back(key_strs[2]);
+ values.resize(keys.size());
+ statuses.resize(keys.size());
+
+ ReadOptions ro;
+ ro.async_io = true;
+ ro.optimize_multiget_for_io = GetParam();
+ dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
+ keys.data(), values.data(), statuses.data());
+ ASSERT_EQ(values.size(), 3);
+ ASSERT_EQ(statuses[0], Status::OK());
+ ASSERT_EQ(statuses[1], Status::OK());
+ ASSERT_EQ(statuses[2], Status::OK());
+ ASSERT_EQ(values[0], "val_l1_" + std::to_string(21));
+ ASSERT_EQ(values[1], "val_l1_" + std::to_string(54));
+ ASSERT_EQ(values[2], "val_l1_" + std::to_string(102));
+
+ HistogramData multiget_io_batch_size;
+
+ statistics()->histogramData(MULTIGET_IO_BATCH_SIZE, &multiget_io_batch_size);
+
+ // Since the first MultiGet key is the last key in a file, the MultiGet is
+ // expected to lookup in that file first, before moving on to other files.
+ // So the first file lookup will issue one async read, and the next lookup
+ // will lookup 2 files in parallel and issue 2 async reads
+ ASSERT_EQ(multiget_io_batch_size.count, 2);
+ ASSERT_EQ(multiget_io_batch_size.max, 2);
+}
+
+TEST_P(DBMultiGetAsyncIOTest, GetFromL1AndL2) {
+ std::vector<std::string> key_strs;
+ std::vector<Slice> keys;
+ std::vector<PinnableSlice> values;
+ std::vector<Status> statuses;
+
+ // 33 and 102 are in L1, and 56 is in L2
+ key_strs.push_back(Key(33));
+ key_strs.push_back(Key(56));
+ key_strs.push_back(Key(102));
+ keys.push_back(key_strs[0]);
+ keys.push_back(key_strs[1]);
+ keys.push_back(key_strs[2]);
+ values.resize(keys.size());
+ statuses.resize(keys.size());
+
+ ReadOptions ro;
+ ro.async_io = true;
+ ro.optimize_multiget_for_io = GetParam();
+ dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
+ keys.data(), values.data(), statuses.data());
+ ASSERT_EQ(values.size(), 3);
+ ASSERT_EQ(statuses[0], Status::OK());
+ ASSERT_EQ(statuses[1], Status::OK());
+ ASSERT_EQ(statuses[2], Status::OK());
+ ASSERT_EQ(values[0], "val_l1_" + std::to_string(33));
+ ASSERT_EQ(values[1], "val_l2_" + std::to_string(56));
+ ASSERT_EQ(values[2], "val_l1_" + std::to_string(102));
+
+ HistogramData multiget_io_batch_size;
+
+ statistics()->histogramData(MULTIGET_IO_BATCH_SIZE, &multiget_io_batch_size);
+
+  // There are 2 keys in L1 in two separate files, and 1 in L2. With
+ // optimize_multiget_for_io, all three lookups will happen in parallel.
+ // Otherwise, the L2 lookup will happen after L1.
+ ASSERT_EQ(multiget_io_batch_size.count, GetParam() ? 1 : 2);
+ ASSERT_EQ(multiget_io_batch_size.max, GetParam() ? 3 : 2);
+}
+
+TEST_P(DBMultiGetAsyncIOTest, GetFromL2WithRangeOverlapL0L1) {
+ std::vector<std::string> key_strs;
+ std::vector<Slice> keys;
+ std::vector<PinnableSlice> values;
+ std::vector<Status> statuses;
+
+ // 19 and 26 are in L2, but overlap with L0 and L1 file ranges
+ key_strs.push_back(Key(19));
+ key_strs.push_back(Key(26));
+ keys.push_back(key_strs[0]);
+ keys.push_back(key_strs[1]);
+ values.resize(keys.size());
+ statuses.resize(keys.size());
+
+ ReadOptions ro;
+ ro.async_io = true;
+ ro.optimize_multiget_for_io = GetParam();
+ dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
+ keys.data(), values.data(), statuses.data());
+ ASSERT_EQ(values.size(), 2);
+ ASSERT_EQ(statuses[0], Status::OK());
+ ASSERT_EQ(statuses[1], Status::OK());
+ ASSERT_EQ(values[0], "val_l2_" + std::to_string(19));
+ ASSERT_EQ(values[1], "val_l2_" + std::to_string(26));
+
+ // Bloom filters in L0/L1 will avoid the coroutine calls in those levels
+ ASSERT_EQ(statistics()->getTickerCount(MULTIGET_COROUTINE_COUNT), 2);
+}
+
+TEST_P(DBMultiGetAsyncIOTest, GetFromL2WithRangeDelInL1) {
+ std::vector<std::string> key_strs;
+ std::vector<Slice> keys;
+ std::vector<PinnableSlice> values;
+ std::vector<Status> statuses;
+
+  // 139 and 163 are in L2, but overlap with range deletes in L1
+ key_strs.push_back(Key(139));
+ key_strs.push_back(Key(163));
+ keys.push_back(key_strs[0]);
+ keys.push_back(key_strs[1]);
+ values.resize(keys.size());
+ statuses.resize(keys.size());
+
+ ReadOptions ro;
+ ro.async_io = true;
+ ro.optimize_multiget_for_io = GetParam();
+ dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
+ keys.data(), values.data(), statuses.data());
+ ASSERT_EQ(values.size(), 2);
+ ASSERT_EQ(statuses[0], Status::NotFound());
+ ASSERT_EQ(statuses[1], Status::NotFound());
+
+ // Bloom filters in L0/L1 will avoid the coroutine calls in those levels
+ ASSERT_EQ(statistics()->getTickerCount(MULTIGET_COROUTINE_COUNT), 2);
+}
+
+TEST_P(DBMultiGetAsyncIOTest, GetFromL1AndL2WithRangeDelInL1) {
+ std::vector<std::string> key_strs;
+ std::vector<Slice> keys;
+ std::vector<PinnableSlice> values;
+ std::vector<Status> statuses;
+
+  // 139 and 163 are in L2, but overlap with range deletes in L1
+ key_strs.push_back(Key(139));
+ key_strs.push_back(Key(144));
+ key_strs.push_back(Key(163));
+ keys.push_back(key_strs[0]);
+ keys.push_back(key_strs[1]);
+ keys.push_back(key_strs[2]);
+ values.resize(keys.size());
+ statuses.resize(keys.size());
+
+ ReadOptions ro;
+ ro.async_io = true;
+ ro.optimize_multiget_for_io = GetParam();
+ dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
+ keys.data(), values.data(), statuses.data());
+ ASSERT_EQ(values.size(), keys.size());
+ ASSERT_EQ(statuses[0], Status::NotFound());
+ ASSERT_EQ(statuses[1], Status::OK());
+ ASSERT_EQ(values[1], "val_l1_" + std::to_string(144));
+ ASSERT_EQ(statuses[2], Status::NotFound());
+
+ // Bloom filters in L0/L1 will avoid the coroutine calls in those levels
+ ASSERT_EQ(statistics()->getTickerCount(MULTIGET_COROUTINE_COUNT), 3);
+}
+
+INSTANTIATE_TEST_CASE_P(DBMultiGetAsyncIOTest, DBMultiGetAsyncIOTest,
+ testing::Bool());
+#endif // USE_COROUTINES
+
+TEST_F(DBBasicTest, MultiGetStats) {
+ Options options;
+ options.create_if_missing = true;
+ options.disable_auto_compactions = true;
+ options.env = env_;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ BlockBasedTableOptions table_options;
+ table_options.block_size = 1;
+ table_options.index_type =
+ BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+ table_options.partition_filters = true;
+ table_options.no_block_cache = true;
+ table_options.cache_index_and_filter_blocks = false;
+ table_options.filter_policy.reset(NewBloomFilterPolicy(10, false));
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ int total_keys = 2000;
+ std::vector<std::string> keys_str(total_keys);
+ std::vector<Slice> keys(total_keys);
+ static size_t kMultiGetBatchSize = 100;
+ std::vector<PinnableSlice> values(kMultiGetBatchSize);
+ std::vector<Status> s(kMultiGetBatchSize);
+ ReadOptions read_opts;
+
+ Random rnd(309);
+  // Create multiple SST files at multiple levels.
+ for (int i = 0; i < 500; ++i) {
+ keys_str[i] = "k" + std::to_string(i);
+ keys[i] = Slice(keys_str[i]);
+ ASSERT_OK(Put(1, "k" + std::to_string(i), rnd.RandomString(1000)));
+ if (i % 100 == 0) {
+ ASSERT_OK(Flush(1));
+ }
+ }
+ ASSERT_OK(Flush(1));
+ MoveFilesToLevel(2, 1);
+
+ for (int i = 501; i < 1000; ++i) {
+ keys_str[i] = "k" + std::to_string(i);
+ keys[i] = Slice(keys_str[i]);
+ ASSERT_OK(Put(1, "k" + std::to_string(i), rnd.RandomString(1000)));
+ if (i % 100 == 0) {
+ ASSERT_OK(Flush(1));
+ }
+ }
+
+ ASSERT_OK(Flush(1));
+ MoveFilesToLevel(2, 1);
+
+ for (int i = 1001; i < total_keys; ++i) {
+ keys_str[i] = "k" + std::to_string(i);
+ keys[i] = Slice(keys_str[i]);
+ ASSERT_OK(Put(1, "k" + std::to_string(i), rnd.RandomString(1000)));
+ if (i % 100 == 0) {
+ ASSERT_OK(Flush(1));
+ }
+ }
+ ASSERT_OK(Flush(1));
+ MoveFilesToLevel(1, 1);
+ Close();
+
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ ASSERT_OK(options.statistics->Reset());
+
+ db_->MultiGet(read_opts, handles_[1], kMultiGetBatchSize, &keys[1250],
+ values.data(), s.data(), false);
+
+ ASSERT_EQ(values.size(), kMultiGetBatchSize);
+ HistogramData hist_level;
+ HistogramData hist_index_and_filter_blocks;
+ HistogramData hist_sst;
+
+ options.statistics->histogramData(NUM_LEVEL_READ_PER_MULTIGET, &hist_level);
+ options.statistics->histogramData(NUM_INDEX_AND_FILTER_BLOCKS_READ_PER_LEVEL,
+ &hist_index_and_filter_blocks);
+ options.statistics->histogramData(NUM_SST_READ_PER_LEVEL, &hist_sst);
+
+ // Maximum number of blocks read from a file system in a level.
+ ASSERT_EQ(hist_level.max, 1);
+ ASSERT_GT(hist_index_and_filter_blocks.max, 0);
+  // Maximum number of sst files read from the file system in a level.
+ ASSERT_EQ(hist_sst.max, 2);
+
+  // Minimum number of blocks read in a level.
+ ASSERT_EQ(hist_level.min, 1);
+ ASSERT_GT(hist_index_and_filter_blocks.min, 0);
+  // Minimum number of sst files read in a level.
+ ASSERT_EQ(hist_sst.min, 1);
+
+ for (PinnableSlice& value : values) {
+ value.Reset();
+ }
+ for (Status& status : s) {
+ status = Status::OK();
+ }
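+  // The second batch starts at keys[950] and spans keys that were moved to
+  // L2 as well as keys that were moved to L1, so reads should touch two
+  // levels.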
+ db_->MultiGet(read_opts, handles_[1], kMultiGetBatchSize, &keys[950],
+ values.data(), s.data(), false);
+ options.statistics->histogramData(NUM_LEVEL_READ_PER_MULTIGET, &hist_level);
+ ASSERT_EQ(hist_level.max, 2);
+}
+
+// Test class for batched MultiGet with prefix extractor
+// Param bool - If true, use partitioned filters
+// If false, use full filter block
+class MultiGetPrefixExtractorTest : public DBBasicTest,
+ public ::testing::WithParamInterface<bool> {
+};
+
+TEST_P(MultiGetPrefixExtractorTest, Batched) {
+ Options options = CurrentOptions();
+ options.prefix_extractor.reset(NewFixedPrefixTransform(2));
+ options.memtable_prefix_bloom_size_ratio = 10;
+ BlockBasedTableOptions bbto;
+ if (GetParam()) {
+ bbto.index_type = BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+ bbto.partition_filters = true;
+ }
+ bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
+ bbto.whole_key_filtering = false;
+ bbto.cache_index_and_filter_blocks = false;
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ Reopen(options);
+
+ SetPerfLevel(kEnableCount);
+ get_perf_context()->Reset();
+
+ ASSERT_OK(Put("k", "v0"));
+ ASSERT_OK(Put("kk1", "v1"));
+ ASSERT_OK(Put("kk2", "v2"));
+ ASSERT_OK(Put("kk3", "v3"));
+ ASSERT_OK(Put("kk4", "v4"));
+ std::vector<std::string> keys(
+ {"k", "kk1", "kk2", "kk3", "kk4", "rofl", "lmho"});
+ std::vector<std::string> expected(
+ {"v0", "v1", "v2", "v3", "v4", "NOT_FOUND", "NOT_FOUND"});
+ std::vector<std::string> values;
+ values = MultiGet(keys, nullptr);
+ ASSERT_EQ(values, expected);
+ // One key ("k") is not queried against the filter because it is outside
+ // the prefix_extractor domain, leaving 6 keys with queried prefixes.
+ ASSERT_EQ(get_perf_context()->bloom_memtable_miss_count, 2);
+ ASSERT_EQ(get_perf_context()->bloom_memtable_hit_count, 4);
+ ASSERT_OK(Flush());
+
+ get_perf_context()->Reset();
+ values = MultiGet(keys, nullptr);
+ ASSERT_EQ(values, expected);
+ ASSERT_EQ(get_perf_context()->bloom_sst_miss_count, 2);
+ ASSERT_EQ(get_perf_context()->bloom_sst_hit_count, 4);
+
+ // Also check Get stat
+ get_perf_context()->Reset();
+ for (size_t i = 0; i < keys.size(); ++i) {
+ values[i] = Get(keys[i]);
+ }
+ ASSERT_EQ(values, expected);
+ ASSERT_EQ(get_perf_context()->bloom_sst_miss_count, 2);
+ ASSERT_EQ(get_perf_context()->bloom_sst_hit_count, 4);
+}
+
+INSTANTIATE_TEST_CASE_P(MultiGetPrefix, MultiGetPrefixExtractorTest,
+ ::testing::Bool());
+
+#ifndef ROCKSDB_LITE
+class DBMultiGetRowCacheTest : public DBBasicTest,
+ public ::testing::WithParamInterface<bool> {};
+
+TEST_P(DBMultiGetRowCacheTest, MultiGetBatched) {
+ do {
+ option_config_ = kRowCache;
+ Options options = CurrentOptions();
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ CreateAndReopenWithCF({"pikachu"}, options);
+ SetPerfLevel(kEnableCount);
+ ASSERT_OK(Put(1, "k1", "v1"));
+ ASSERT_OK(Put(1, "k2", "v2"));
+ ASSERT_OK(Put(1, "k3", "v3"));
+ ASSERT_OK(Put(1, "k4", "v4"));
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(Put(1, "k5", "v5"));
+ const Snapshot* snap1 = dbfull()->GetSnapshot();
+ ASSERT_OK(Delete(1, "k4"));
+ ASSERT_OK(Flush(1));
+ const Snapshot* snap2 = dbfull()->GetSnapshot();
+
+ get_perf_context()->Reset();
+
+ std::vector<Slice> keys({"no_key", "k5", "k4", "k3", "k1"});
+ std::vector<PinnableSlice> values(keys.size());
+ std::vector<ColumnFamilyHandle*> cfs(keys.size(), handles_[1]);
+ std::vector<Status> s(keys.size());
+
+ ReadOptions ro;
+ bool use_snapshots = GetParam();
+ if (use_snapshots) {
+ ro.snapshot = snap2;
+ }
+ db_->MultiGet(ro, handles_[1], keys.size(), keys.data(), values.data(),
+ s.data(), false);
+
+ ASSERT_EQ(values.size(), keys.size());
+ ASSERT_EQ(std::string(values[4].data(), values[4].size()), "v1");
+ ASSERT_EQ(std::string(values[3].data(), values[3].size()), "v3");
+ ASSERT_EQ(std::string(values[1].data(), values[1].size()), "v5");
+    // three kv pairs * two bytes per value
+ ASSERT_EQ(6, (int)get_perf_context()->multiget_read_bytes);
+
+ ASSERT_TRUE(s[0].IsNotFound());
+ ASSERT_OK(s[1]);
+ ASSERT_TRUE(s[2].IsNotFound());
+ ASSERT_OK(s[3]);
+ ASSERT_OK(s[4]);
+
+ // Call MultiGet() again with some intersection with the previous set of
+ // keys. Those should already be in the row cache.
+ keys.assign({"no_key", "k5", "k3", "k2"});
+ for (size_t i = 0; i < keys.size(); ++i) {
+ values[i].Reset();
+ s[i] = Status::OK();
+ }
+ get_perf_context()->Reset();
+
+ if (use_snapshots) {
+ ro.snapshot = snap1;
+ }
+ db_->MultiGet(ReadOptions(), handles_[1], keys.size(), keys.data(),
+ values.data(), s.data(), false);
+
+ ASSERT_EQ(std::string(values[3].data(), values[3].size()), "v2");
+ ASSERT_EQ(std::string(values[2].data(), values[2].size()), "v3");
+ ASSERT_EQ(std::string(values[1].data(), values[1].size()), "v5");
+    // three kv pairs * two bytes per value
+ ASSERT_EQ(6, (int)get_perf_context()->multiget_read_bytes);
+
+ ASSERT_TRUE(s[0].IsNotFound());
+ ASSERT_OK(s[1]);
+ ASSERT_OK(s[2]);
+ ASSERT_OK(s[3]);
+ if (use_snapshots) {
+ // Only reads from the first SST file would have been cached, since
+ // snapshot seq no is > fd.largest_seqno
+ ASSERT_EQ(1, TestGetTickerCount(options, ROW_CACHE_HIT));
+ } else {
+ ASSERT_EQ(2, TestGetTickerCount(options, ROW_CACHE_HIT));
+ }
+
+ SetPerfLevel(kDisable);
+ dbfull()->ReleaseSnapshot(snap1);
+ dbfull()->ReleaseSnapshot(snap2);
+ } while (ChangeCompactOptions());
+}
+
+INSTANTIATE_TEST_CASE_P(DBMultiGetRowCacheTest, DBMultiGetRowCacheTest,
+ testing::Values(true, false));
+
+TEST_F(DBBasicTest, GetAllKeyVersions) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ options.disable_auto_compactions = true;
+ CreateAndReopenWithCF({"pikachu"}, options);
+ ASSERT_EQ(2, handles_.size());
+ const size_t kNumInserts = 4;
+ const size_t kNumDeletes = 4;
+ const size_t kNumUpdates = 4;
+
+ // Check default column family
+ for (size_t i = 0; i != kNumInserts; ++i) {
+ ASSERT_OK(Put(std::to_string(i), "value"));
+ }
+ for (size_t i = 0; i != kNumUpdates; ++i) {
+ ASSERT_OK(Put(std::to_string(i), "value1"));
+ }
+ for (size_t i = 0; i != kNumDeletes; ++i) {
+ ASSERT_OK(Delete(std::to_string(i)));
+ }
+ std::vector<KeyVersion> key_versions;
+ ASSERT_OK(GetAllKeyVersions(db_, Slice(), Slice(),
+ std::numeric_limits<size_t>::max(),
+ &key_versions));
+ ASSERT_EQ(kNumInserts + kNumDeletes + kNumUpdates, key_versions.size());
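+  // Entries come back in internal key order, i.e. newest version first for
+  // each user key: the deletion, then the "value1" put, then the original
+  // put, so every third entry should be a deletion.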
+ for (size_t i = 0; i < kNumInserts + kNumDeletes + kNumUpdates; i++) {
+ if (i % 3 == 0) {
+ ASSERT_EQ(key_versions[i].GetTypeName(), "TypeDeletion");
+ } else {
+ ASSERT_EQ(key_versions[i].GetTypeName(), "TypeValue");
+ }
+ }
+ ASSERT_OK(GetAllKeyVersions(db_, handles_[0], Slice(), Slice(),
+ std::numeric_limits<size_t>::max(),
+ &key_versions));
+ ASSERT_EQ(kNumInserts + kNumDeletes + kNumUpdates, key_versions.size());
+
+ // Check non-default column family
+ for (size_t i = 0; i + 1 != kNumInserts; ++i) {
+ ASSERT_OK(Put(1, std::to_string(i), "value"));
+ }
+ for (size_t i = 0; i + 1 != kNumUpdates; ++i) {
+ ASSERT_OK(Put(1, std::to_string(i), "value1"));
+ }
+ for (size_t i = 0; i + 1 != kNumDeletes; ++i) {
+ ASSERT_OK(Delete(1, std::to_string(i)));
+ }
+ ASSERT_OK(GetAllKeyVersions(db_, handles_[1], Slice(), Slice(),
+ std::numeric_limits<size_t>::max(),
+ &key_versions));
+ ASSERT_EQ(kNumInserts + kNumDeletes + kNumUpdates - 3, key_versions.size());
+}
+
+TEST_F(DBBasicTest, ValueTypeString) {
+ KeyVersion key_version;
+  // When adding a new type, please also update `value_type_string_map`
+ for (unsigned char i = ValueType::kTypeDeletion; i < ValueType::kTypeMaxValid;
+ i++) {
+ key_version.type = i;
+ ASSERT_TRUE(key_version.GetTypeName() != "Invalid");
+ }
+}
+#endif // !ROCKSDB_LITE
+
+TEST_F(DBBasicTest, MultiGetIOBufferOverrun) {
+ Options options = CurrentOptions();
+ Random rnd(301);
+ BlockBasedTableOptions table_options;
+ table_options.pin_l0_filter_and_index_blocks_in_cache = true;
+ table_options.block_size = 16 * 1024;
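+  // Use blocks larger than MultiGet's stack read buffer so the read path has
+  // to fall back to a heap-allocated IO buffer.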
+ ASSERT_TRUE(table_options.block_size >
+ BlockBasedTable::kMultiGetReadStackBufSize);
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ Reopen(options);
+
+ std::string zero_str(128, '\0');
+ for (int i = 0; i < 100; ++i) {
+ // Make the value compressible. A purely random string doesn't compress
+ // and the resultant data block will not be compressed
+ std::string value(rnd.RandomString(128) + zero_str);
+ assert(Put(Key(i), value) == Status::OK());
+ }
+ ASSERT_OK(Flush());
+
+ std::vector<std::string> key_data(10);
+ std::vector<Slice> keys;
+ // We cannot resize a PinnableSlice vector, so just set initial size to
+ // largest we think we will need
+ std::vector<PinnableSlice> values(10);
+ std::vector<Status> statuses;
+ ReadOptions ro;
+
+ // Warm up the cache first
+ key_data.emplace_back(Key(0));
+ keys.emplace_back(Slice(key_data.back()));
+ key_data.emplace_back(Key(50));
+ keys.emplace_back(Slice(key_data.back()));
+ statuses.resize(keys.size());
+
+ dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
+ keys.data(), values.data(), statuses.data(), true);
+}
+
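+// With the WAL disabled and every SST file left intact, best-efforts recovery
+// should recover all flushed data in all column families.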
+TEST_F(DBBasicTest, IncrementalRecoveryNoCorrupt) {
+ Options options = CurrentOptions();
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu", "eevee"}, options);
+ size_t num_cfs = handles_.size();
+ ASSERT_EQ(3, num_cfs);
+ WriteOptions write_opts;
+ write_opts.disableWAL = true;
+ for (size_t cf = 0; cf != num_cfs; ++cf) {
+ for (size_t i = 0; i != 10000; ++i) {
+ std::string key_str = Key(static_cast<int>(i));
+ std::string value_str = std::to_string(cf) + "_" + std::to_string(i);
+
+ ASSERT_OK(Put(static_cast<int>(cf), key_str, value_str));
+ if (0 == (i % 1000)) {
+ ASSERT_OK(Flush(static_cast<int>(cf)));
+ }
+ }
+ }
+ for (size_t cf = 0; cf != num_cfs; ++cf) {
+ ASSERT_OK(Flush(static_cast<int>(cf)));
+ }
+ Close();
+ options.best_efforts_recovery = true;
+ ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu", "eevee"},
+ options);
+ num_cfs = handles_.size();
+ ASSERT_EQ(3, num_cfs);
+ for (size_t cf = 0; cf != num_cfs; ++cf) {
+ for (int i = 0; i != 10000; ++i) {
+ std::string key_str = Key(static_cast<int>(i));
+ std::string expected_value_str =
+ std::to_string(cf) + "_" + std::to_string(i);
+ ASSERT_EQ(expected_value_str, Get(static_cast<int>(cf), key_str));
+ }
+ }
+}
+
+TEST_F(DBBasicTest, BestEffortsRecoveryWithVersionBuildingFailure) {
+ Options options = CurrentOptions();
+ DestroyAndReopen(options);
+ ASSERT_OK(Put("foo", "value"));
+ ASSERT_OK(Flush());
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->SetCallBack(
+ "VersionBuilder::CheckConsistencyBeforeReturn", [&](void* arg) {
+ ASSERT_NE(nullptr, arg);
+ *(reinterpret_cast<Status*>(arg)) =
+ Status::Corruption("Inject corruption");
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ options.best_efforts_recovery = true;
+ Status s = TryReopen(options);
+ ASSERT_TRUE(s.IsCorruption());
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+#ifndef ROCKSDB_LITE
+namespace {
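+// Records the paths of table files created for each column family so tests
+// can locate, corrupt or delete them on disk later.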
+class TableFileListener : public EventListener {
+ public:
+ void OnTableFileCreated(const TableFileCreationInfo& info) override {
+ InstrumentedMutexLock lock(&mutex_);
+ cf_to_paths_[info.cf_name].push_back(info.file_path);
+ }
+ std::vector<std::string>& GetFiles(const std::string& cf_name) {
+ InstrumentedMutexLock lock(&mutex_);
+ return cf_to_paths_[cf_name];
+ }
+
+ private:
+ InstrumentedMutex mutex_;
+ std::unordered_map<std::string, std::vector<std::string>> cf_to_paths_;
+};
+} // anonymous namespace
+
+TEST_F(DBBasicTest, LastSstFileNotInManifest) {
+ // If the last sst file is not tracked in MANIFEST,
+ // or the VersionEdit for the last sst file is not synced,
+ // on recovery, the last sst file should be deleted,
+ // and new sst files shouldn't reuse its file number.
+ Options options = CurrentOptions();
+ DestroyAndReopen(options);
+ Close();
+
+ // Manually add a sst file.
+ constexpr uint64_t kSstFileNumber = 100;
+ const std::string kSstFile = MakeTableFileName(dbname_, kSstFileNumber);
+ ASSERT_OK(WriteStringToFile(env_, /* data = */ "bad sst file content",
+ /* fname = */ kSstFile,
+ /* should_sync = */ true));
+ ASSERT_OK(env_->FileExists(kSstFile));
+
+ TableFileListener* listener = new TableFileListener();
+ options.listeners.emplace_back(listener);
+ Reopen(options);
+ // kSstFile should already be deleted.
+ ASSERT_TRUE(env_->FileExists(kSstFile).IsNotFound());
+
+ ASSERT_OK(Put("k", "v"));
+ ASSERT_OK(Flush());
+ // New sst file should have file number > kSstFileNumber.
+ std::vector<std::string>& files =
+ listener->GetFiles(kDefaultColumnFamilyName);
+ ASSERT_EQ(files.size(), 1);
+ const std::string fname = files[0].erase(0, (dbname_ + "/").size());
+ uint64_t number = 0;
+ FileType type = kTableFile;
+ ASSERT_TRUE(ParseFileName(fname, &number, &type));
+ ASSERT_EQ(type, kTableFile);
+ ASSERT_GT(number, kSstFileNumber);
+}
+
+TEST_F(DBBasicTest, RecoverWithMissingFiles) {
+ Options options = CurrentOptions();
+ DestroyAndReopen(options);
+ TableFileListener* listener = new TableFileListener();
+ // Disable auto compaction to simplify SST file name tracking.
+ options.disable_auto_compactions = true;
+ options.listeners.emplace_back(listener);
+ CreateAndReopenWithCF({"pikachu", "eevee"}, options);
+ std::vector<std::string> all_cf_names = {kDefaultColumnFamilyName, "pikachu",
+ "eevee"};
+ size_t num_cfs = handles_.size();
+ ASSERT_EQ(3, num_cfs);
+ for (size_t cf = 0; cf != num_cfs; ++cf) {
+ ASSERT_OK(Put(static_cast<int>(cf), "a", "0_value"));
+ ASSERT_OK(Flush(static_cast<int>(cf)));
+ ASSERT_OK(Put(static_cast<int>(cf), "b", "0_value"));
+ ASSERT_OK(Flush(static_cast<int>(cf)));
+ ASSERT_OK(Put(static_cast<int>(cf), "c", "0_value"));
+ ASSERT_OK(Flush(static_cast<int>(cf)));
+ }
+
+ // Delete and corrupt files
+ for (size_t i = 0; i < all_cf_names.size(); ++i) {
+ std::vector<std::string>& files = listener->GetFiles(all_cf_names[i]);
+ ASSERT_EQ(3, files.size());
+ std::string corrupted_data;
+ ASSERT_OK(ReadFileToString(env_, files[files.size() - 1], &corrupted_data));
+ ASSERT_OK(WriteStringToFile(
+ env_, corrupted_data.substr(0, corrupted_data.size() - 2),
+ files[files.size() - 1], /*should_sync=*/true));
+ for (int j = static_cast<int>(files.size() - 2); j >= static_cast<int>(i);
+ --j) {
+ ASSERT_OK(env_->DeleteFile(files[j]));
+ }
+ }
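+  // At this point the default CF has no intact SST file left, "pikachu" keeps
+  // only its first file ("a"), and "eevee" keeps its first two files ("a" and
+  // "b"); best-efforts recovery should roll each CF back accordingly.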
+ options.best_efforts_recovery = true;
+ ReopenWithColumnFamilies(all_cf_names, options);
+ // Verify data
+ ReadOptions read_opts;
+ read_opts.total_order_seek = true;
+ {
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts, handles_[0]));
+ iter->SeekToFirst();
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_OK(iter->status());
+ iter.reset(db_->NewIterator(read_opts, handles_[1]));
+ iter->SeekToFirst();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("a", iter->key());
+ iter->Next();
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_OK(iter->status());
+ iter.reset(db_->NewIterator(read_opts, handles_[2]));
+ iter->SeekToFirst();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("a", iter->key());
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("b", iter->key());
+ iter->Next();
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_OK(iter->status());
+ }
+}
+
+TEST_F(DBBasicTest, BestEffortsRecoveryTryMultipleManifests) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ DestroyAndReopen(options);
+ ASSERT_OK(Put("foo", "value0"));
+ ASSERT_OK(Flush());
+ Close();
+ {
+ // Hack by adding a new MANIFEST with high file number
+ std::string garbage(10, '\0');
+ ASSERT_OK(WriteStringToFile(env_, garbage, dbname_ + "/MANIFEST-001000",
+ /*should_sync=*/true));
+ }
+ {
+ // Hack by adding a corrupted SST not referenced by any MANIFEST
+ std::string garbage(10, '\0');
+ ASSERT_OK(WriteStringToFile(env_, garbage, dbname_ + "/001001.sst",
+ /*should_sync=*/true));
+ }
+
+ options.best_efforts_recovery = true;
+
+ Reopen(options);
+ ASSERT_OK(Put("bar", "value"));
+}
+
+TEST_F(DBBasicTest, RecoverWithNoCurrentFile) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+ options.best_efforts_recovery = true;
+ ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"}, options);
+ ASSERT_EQ(2, handles_.size());
+ ASSERT_OK(Put("foo", "value"));
+ ASSERT_OK(Put(1, "bar", "value"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Flush(1));
+ Close();
+ ASSERT_OK(env_->DeleteFile(CurrentFileName(dbname_)));
+ ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"}, options);
+ std::vector<std::string> cf_names;
+ ASSERT_OK(DB::ListColumnFamilies(DBOptions(options), dbname_, &cf_names));
+ ASSERT_EQ(2, cf_names.size());
+ for (const auto& name : cf_names) {
+ ASSERT_TRUE(name == kDefaultColumnFamilyName || name == "pikachu");
+ }
+}
+
+TEST_F(DBBasicTest, RecoverWithNoManifest) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ DestroyAndReopen(options);
+ ASSERT_OK(Put("foo", "value"));
+ ASSERT_OK(Flush());
+ Close();
+ {
+ // Delete all MANIFEST.
+ std::vector<std::string> files;
+ ASSERT_OK(env_->GetChildren(dbname_, &files));
+ for (const auto& file : files) {
+ uint64_t number = 0;
+ FileType type = kWalFile;
+ if (ParseFileName(file, &number, &type) && type == kDescriptorFile) {
+ ASSERT_OK(env_->DeleteFile(dbname_ + "/" + file));
+ }
+ }
+ }
+ options.best_efforts_recovery = true;
+ options.create_if_missing = false;
+ Status s = TryReopen(options);
+ ASSERT_TRUE(s.IsInvalidArgument());
+ options.create_if_missing = true;
+ Reopen(options);
+ // Since no MANIFEST exists, best-efforts recovery creates a new, empty db.
+ ASSERT_EQ("NOT_FOUND", Get("foo"));
+}
+
+TEST_F(DBBasicTest, SkipWALIfMissingTableFiles) {
+ Options options = CurrentOptions();
+ DestroyAndReopen(options);
+ TableFileListener* listener = new TableFileListener();
+ options.listeners.emplace_back(listener);
+ CreateAndReopenWithCF({"pikachu"}, options);
+ std::vector<std::string> kAllCfNames = {kDefaultColumnFamilyName, "pikachu"};
+ size_t num_cfs = handles_.size();
+ ASSERT_EQ(2, num_cfs);
+ for (int cf = 0; cf < static_cast<int>(kAllCfNames.size()); ++cf) {
+ ASSERT_OK(Put(cf, "a", "0_value"));
+ ASSERT_OK(Flush(cf));
+ ASSERT_OK(Put(cf, "b", "0_value"));
+ }
+ // Delete files
+ for (size_t i = 0; i < kAllCfNames.size(); ++i) {
+ std::vector<std::string>& files = listener->GetFiles(kAllCfNames[i]);
+ ASSERT_EQ(1, files.size());
+ for (int j = static_cast<int>(files.size() - 1); j >= static_cast<int>(i);
+ --j) {
+ ASSERT_OK(env_->DeleteFile(files[j]));
+ }
+ }
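+  // The default CF loses its only SST file while "pikachu" keeps its file.
+  // Recovery has to roll back past the missing file, so the unflushed "b"
+  // entries in the WAL must not be applied.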
+ options.best_efforts_recovery = true;
+ ReopenWithColumnFamilies(kAllCfNames, options);
+ // Verify WAL is not applied
+ ReadOptions read_opts;
+ read_opts.total_order_seek = true;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts, handles_[0]));
+ iter->SeekToFirst();
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_OK(iter->status());
+ iter.reset(db_->NewIterator(read_opts, handles_[1]));
+ iter->SeekToFirst();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("a", iter->key());
+ iter->Next();
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_OK(iter->status());
+}
+
+TEST_F(DBBasicTest, DisableTrackWal) {
+ // If WAL tracking was enabled, and then disabled during reopen,
+ // the previously tracked WALs should be removed from MANIFEST.
+
+ Options options = CurrentOptions();
+ options.track_and_verify_wals_in_manifest = true;
+ // extremely small write buffer size,
+ // so that new WALs are created more frequently.
+ options.write_buffer_size = 100;
+ options.env = env_;
+ DestroyAndReopen(options);
+ for (int i = 0; i < 100; i++) {
+ ASSERT_OK(Put("foo" + std::to_string(i), "value" + std::to_string(i)));
+ }
+ ASSERT_OK(dbfull()->TEST_SwitchMemtable());
+ ASSERT_OK(db_->SyncWAL());
+ // Some WALs are tracked.
+ ASSERT_FALSE(dbfull()->GetVersionSet()->GetWalSet().GetWals().empty());
+ Close();
+
+ // Disable WAL tracking.
+ options.track_and_verify_wals_in_manifest = false;
+ options.create_if_missing = false;
+ ASSERT_OK(TryReopen(options));
+ // Previously tracked WALs are cleared.
+ ASSERT_TRUE(dbfull()->GetVersionSet()->GetWalSet().GetWals().empty());
+ Close();
+
+ // Re-enable WAL tracking again.
+ options.track_and_verify_wals_in_manifest = true;
+ options.create_if_missing = false;
+ ASSERT_OK(TryReopen(options));
+ ASSERT_TRUE(dbfull()->GetVersionSet()->GetWalSet().GetWals().empty());
+ Close();
+}
+#endif // !ROCKSDB_LITE
+
+TEST_F(DBBasicTest, ManifestChecksumMismatch) {
+ Options options = CurrentOptions();
+ DestroyAndReopen(options);
+ ASSERT_OK(Put("bar", "value"));
+ ASSERT_OK(Flush());
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
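+  // Corrupt the CRC of log records written while this callback is active.
+  // Since the WAL is disabled for the next write, the corruption lands in the
+  // MANIFEST record written by the flush, so the final reopen should fail.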
+ SyncPoint::GetInstance()->SetCallBack(
+ "LogWriter::EmitPhysicalRecord:BeforeEncodeChecksum", [&](void* arg) {
+ auto* crc = reinterpret_cast<uint32_t*>(arg);
+ *crc = *crc + 1;
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ WriteOptions write_opts;
+ write_opts.disableWAL = true;
+ Status s = db_->Put(write_opts, "foo", "value");
+ ASSERT_OK(s);
+ ASSERT_OK(Flush());
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ ASSERT_OK(Put("foo", "value1"));
+ ASSERT_OK(Flush());
+ s = TryReopen(options);
+ ASSERT_TRUE(s.IsCorruption());
+}
+
+TEST_F(DBBasicTest, ConcurrentlyCloseDB) {
+ Options options = CurrentOptions();
+ DestroyAndReopen(options);
+ std::vector<std::thread> workers;
+ for (int i = 0; i < 10; i++) {
+ workers.push_back(std::thread([&]() {
+ auto s = db_->Close();
+ ASSERT_OK(s);
+ }));
+ }
+ for (auto& w : workers) {
+ w.join();
+ }
+}
+
+#ifndef ROCKSDB_LITE
+class DBBasicTestTrackWal : public DBTestBase,
+ public testing::WithParamInterface<bool> {
+ public:
+ DBBasicTestTrackWal()
+ : DBTestBase("db_basic_test_track_wal", /*env_do_fsync=*/false) {}
+
+ int CountWalFiles() {
+ VectorLogPtr log_files;
+ EXPECT_OK(dbfull()->GetSortedWalFiles(log_files));
+ return static_cast<int>(log_files.size());
+ };
+};
+
+TEST_P(DBBasicTestTrackWal, DoNotTrackObsoleteWal) {
+  // If a WAL becomes obsolete after flushing but has not been deleted from
+  // disk yet, a subsequent SyncWAL should not track the obsolete WAL in
+  // MANIFEST.
+
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.track_and_verify_wals_in_manifest = true;
+ options.atomic_flush = GetParam();
+
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"cf"}, options);
+ ASSERT_EQ(handles_.size(), 2); // default, cf
+ // Do not delete WALs.
+ ASSERT_OK(db_->DisableFileDeletions());
+ constexpr int n = 10;
+ std::vector<std::unique_ptr<LogFile>> wals(n);
+ for (size_t i = 0; i < n; i++) {
+ // Generate a new WAL for each key-value.
+ const int cf = i % 2;
+ ASSERT_OK(db_->GetCurrentWalFile(&wals[i]));
+ ASSERT_OK(Put(cf, "k" + std::to_string(i), "v" + std::to_string(i)));
+ ASSERT_OK(Flush({0, 1}));
+ }
+ ASSERT_EQ(CountWalFiles(), n);
+ // Since all WALs are obsolete, no WAL should be tracked in MANIFEST.
+ ASSERT_OK(db_->SyncWAL());
+
+ // Manually delete all WALs.
+ Close();
+ for (const auto& wal : wals) {
+ ASSERT_OK(env_->DeleteFile(LogFileName(dbname_, wal->LogNumber())));
+ }
+
+ // If SyncWAL tracks the obsolete WALs in MANIFEST,
+ // reopen will fail because the WALs are missing from disk.
+ ASSERT_OK(TryReopenWithColumnFamilies({"default", "cf"}, options));
+ Destroy(options);
+}
+
+INSTANTIATE_TEST_CASE_P(DBBasicTestTrackWal, DBBasicTestTrackWal,
+ testing::Bool());
+#endif // ROCKSDB_LITE
+
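+// Shared fixture for the MultiGet tests below. It populates one or more
+// column families with compressible and incompressible values, wraps the
+// block caches so that lookups, hits and inserts can be counted, and cuts a
+// new data block every 10 keys via a custom flush block policy.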
+class DBBasicTestMultiGet : public DBTestBase {
+ public:
+ DBBasicTestMultiGet(std::string test_dir, int num_cfs, bool compressed_cache,
+ bool uncompressed_cache, bool _compression_enabled,
+ bool _fill_cache, uint32_t compression_parallel_threads)
+ : DBTestBase(test_dir, /*env_do_fsync=*/false) {
+ compression_enabled_ = _compression_enabled;
+ fill_cache_ = _fill_cache;
+
+ if (compressed_cache) {
+ std::shared_ptr<Cache> cache = NewLRUCache(1048576);
+ compressed_cache_ = std::make_shared<MyBlockCache>(cache);
+ }
+ if (uncompressed_cache) {
+ std::shared_ptr<Cache> cache = NewLRUCache(1048576);
+ uncompressed_cache_ = std::make_shared<MyBlockCache>(cache);
+ }
+
+ env_->count_random_reads_ = true;
+
+ Options options = CurrentOptions();
+ Random rnd(301);
+ BlockBasedTableOptions table_options;
+
+#ifndef ROCKSDB_LITE
+ if (compression_enabled_) {
+ std::vector<CompressionType> compression_types;
+ compression_types = GetSupportedCompressions();
+ // Not every platform may have compression libraries available, so
+ // dynamically pick based on what's available
+ CompressionType tmp_type = kNoCompression;
+ for (auto c_type : compression_types) {
+ if (c_type != kNoCompression) {
+ tmp_type = c_type;
+ break;
+ }
+ }
+ if (tmp_type != kNoCompression) {
+ options.compression = tmp_type;
+ } else {
+ compression_enabled_ = false;
+ }
+ }
+#else
+ // GetSupportedCompressions() is not available in LITE build
+ if (!Snappy_Supported()) {
+ compression_enabled_ = false;
+ }
+#endif // ROCKSDB_LITE
+
+ table_options.block_cache = uncompressed_cache_;
+ if (table_options.block_cache == nullptr) {
+ table_options.no_block_cache = true;
+ } else {
+ table_options.pin_l0_filter_and_index_blocks_in_cache = true;
+ }
+ table_options.block_cache_compressed = compressed_cache_;
+ table_options.flush_block_policy_factory.reset(
+ new MyFlushBlockPolicyFactory());
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ if (!compression_enabled_) {
+ options.compression = kNoCompression;
+ } else {
+ options.compression_opts.parallel_threads = compression_parallel_threads;
+ }
+ options_ = options;
+ Reopen(options);
+
+ if (num_cfs > 1) {
+ for (int cf = 0; cf < num_cfs; ++cf) {
+ cf_names_.emplace_back("cf" + std::to_string(cf));
+ }
+ CreateColumnFamilies(cf_names_, options);
+ cf_names_.emplace_back("default");
+ }
+
+ std::string zero_str(128, '\0');
+ for (int cf = 0; cf < num_cfs; ++cf) {
+ for (int i = 0; i < 100; ++i) {
+ // Make the value compressible. A purely random string doesn't compress
+ // and the resultant data block will not be compressed
+ values_.emplace_back(rnd.RandomString(128) + zero_str);
+ assert(((num_cfs == 1) ? Put(Key(i), values_[i])
+ : Put(cf, Key(i), values_[i])) == Status::OK());
+ }
+ if (num_cfs == 1) {
+ EXPECT_OK(Flush());
+ } else {
+ EXPECT_OK(dbfull()->Flush(FlushOptions(), handles_[cf]));
+ }
+
+ for (int i = 0; i < 100; ++i) {
+        // Make the value incompressible so the block cannot gain space by
+        // compression
+ uncompressable_values_.emplace_back(rnd.RandomString(256) + '\0');
+ std::string tmp_key = "a" + Key(i);
+ assert(((num_cfs == 1) ? Put(tmp_key, uncompressable_values_[i])
+ : Put(cf, tmp_key, uncompressable_values_[i])) ==
+ Status::OK());
+ }
+ if (num_cfs == 1) {
+ EXPECT_OK(Flush());
+ } else {
+ EXPECT_OK(dbfull()->Flush(FlushOptions(), handles_[cf]));
+ }
+ }
+ // Clear compressed cache, which is always pre-populated
+ if (compressed_cache_) {
+ compressed_cache_->SetCapacity(0);
+ compressed_cache_->SetCapacity(1048576);
+ }
+ }
+
+ bool CheckValue(int i, const std::string& value) {
+ if (values_[i].compare(value) == 0) {
+ return true;
+ }
+ return false;
+ }
+
+ bool CheckUncompressableValue(int i, const std::string& value) {
+ if (uncompressable_values_[i].compare(value) == 0) {
+ return true;
+ }
+ return false;
+ }
+
+ const std::vector<std::string>& GetCFNames() const { return cf_names_; }
+
+ int num_lookups() { return uncompressed_cache_->num_lookups(); }
+ int num_found() { return uncompressed_cache_->num_found(); }
+ int num_inserts() { return uncompressed_cache_->num_inserts(); }
+
+ int num_lookups_compressed() { return compressed_cache_->num_lookups(); }
+ int num_found_compressed() { return compressed_cache_->num_found(); }
+ int num_inserts_compressed() { return compressed_cache_->num_inserts(); }
+
+ bool fill_cache() { return fill_cache_; }
+ bool compression_enabled() { return compression_enabled_; }
+ bool has_compressed_cache() { return compressed_cache_ != nullptr; }
+ bool has_uncompressed_cache() { return uncompressed_cache_ != nullptr; }
+ Options get_options() { return options_; }
+
+ static void SetUpTestCase() {}
+ static void TearDownTestCase() {}
+
+ protected:
+ class MyFlushBlockPolicyFactory : public FlushBlockPolicyFactory {
+ public:
+ MyFlushBlockPolicyFactory() {}
+
+ virtual const char* Name() const override {
+ return "MyFlushBlockPolicyFactory";
+ }
+
+ virtual FlushBlockPolicy* NewFlushBlockPolicy(
+ const BlockBasedTableOptions& /*table_options*/,
+ const BlockBuilder& data_block_builder) const override {
+ return new MyFlushBlockPolicy(data_block_builder);
+ }
+ };
+
+ class MyFlushBlockPolicy : public FlushBlockPolicy {
+ public:
+ explicit MyFlushBlockPolicy(const BlockBuilder& data_block_builder)
+ : num_keys_(0), data_block_builder_(data_block_builder) {}
+
+ bool Update(const Slice& /*key*/, const Slice& /*value*/) override {
+ if (data_block_builder_.empty()) {
+ // First key in this block
+ num_keys_ = 1;
+ return false;
+ }
+ // Flush every 10 keys
+ if (num_keys_ == 10) {
+ num_keys_ = 1;
+ return true;
+ }
+ num_keys_++;
+ return false;
+ }
+
+ private:
+ int num_keys_;
+ const BlockBuilder& data_block_builder_;
+ };
+
+ class MyBlockCache : public CacheWrapper {
+ public:
+ explicit MyBlockCache(std::shared_ptr<Cache> target)
+ : CacheWrapper(target),
+ num_lookups_(0),
+ num_found_(0),
+ num_inserts_(0) {}
+
+ const char* Name() const override { return "MyBlockCache"; }
+
+ using Cache::Insert;
+ Status Insert(const Slice& key, void* value, size_t charge,
+ void (*deleter)(const Slice& key, void* value),
+ Handle** handle = nullptr,
+ Priority priority = Priority::LOW) override {
+ num_inserts_++;
+ return target_->Insert(key, value, charge, deleter, handle, priority);
+ }
+
+ using Cache::Lookup;
+ Handle* Lookup(const Slice& key, Statistics* stats = nullptr) override {
+ num_lookups_++;
+ Handle* handle = target_->Lookup(key, stats);
+ if (handle != nullptr) {
+ num_found_++;
+ }
+ return handle;
+ }
+ int num_lookups() { return num_lookups_; }
+
+ int num_found() { return num_found_; }
+
+ int num_inserts() { return num_inserts_; }
+
+ private:
+ int num_lookups_;
+ int num_found_;
+ int num_inserts_;
+ };
+
+ std::shared_ptr<MyBlockCache> compressed_cache_;
+ std::shared_ptr<MyBlockCache> uncompressed_cache_;
+ Options options_;
+ bool compression_enabled_;
+ std::vector<std::string> values_;
+ std::vector<std::string> uncompressable_values_;
+ bool fill_cache_;
+ std::vector<std::string> cf_names_;
+};
+
+class DBBasicTestWithParallelIO
+ : public DBBasicTestMultiGet,
+ public testing::WithParamInterface<
+ std::tuple<bool, bool, bool, bool, uint32_t>> {
+ public:
+ DBBasicTestWithParallelIO()
+ : DBBasicTestMultiGet("/db_basic_test_with_parallel_io", 1,
+ std::get<0>(GetParam()), std::get<1>(GetParam()),
+ std::get<2>(GetParam()), std::get<3>(GetParam()),
+ std::get<4>(GetParam())) {}
+};
+
+TEST_P(DBBasicTestWithParallelIO, MultiGet) {
+ std::vector<std::string> key_data(10);
+ std::vector<Slice> keys;
+ // We cannot resize a PinnableSlice vector, so just set initial size to
+ // largest we think we will need
+ std::vector<PinnableSlice> values(10);
+ std::vector<Status> statuses;
+ ReadOptions ro;
+ ro.fill_cache = fill_cache();
+
+ // Warm up the cache first
+ key_data.emplace_back(Key(0));
+ keys.emplace_back(Slice(key_data.back()));
+ key_data.emplace_back(Key(50));
+ keys.emplace_back(Slice(key_data.back()));
+ statuses.resize(keys.size());
+
+ dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
+ keys.data(), values.data(), statuses.data(), true);
+ ASSERT_TRUE(CheckValue(0, values[0].ToString()));
+ ASSERT_TRUE(CheckValue(50, values[1].ToString()));
+
+ int random_reads = env_->random_read_counter_.Read();
+ key_data[0] = Key(1);
+ key_data[1] = Key(51);
+ keys[0] = Slice(key_data[0]);
+ keys[1] = Slice(key_data[1]);
+ values[0].Reset();
+ values[1].Reset();
+ dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
+ keys.data(), values.data(), statuses.data(), true);
+ ASSERT_TRUE(CheckValue(1, values[0].ToString()));
+ ASSERT_TRUE(CheckValue(51, values[1].ToString()));
+
+ bool read_from_cache = false;
+ if (fill_cache()) {
+ if (has_uncompressed_cache()) {
+ read_from_cache = true;
+ } else if (has_compressed_cache() && compression_enabled()) {
+ read_from_cache = true;
+ }
+ }
+
+ int expected_reads = random_reads + (read_from_cache ? 0 : 2);
+ ASSERT_EQ(env_->random_read_counter_.Read(), expected_reads);
+
+ keys.resize(10);
+ statuses.resize(10);
+ std::vector<int> key_ints{1, 2, 15, 16, 55, 81, 82, 83, 84, 85};
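+  // With the flush block policy cutting a block every 10 keys, these keys
+  // should span four data blocks ({1,2}, {15,16}, {55}, {81..85}); the blocks
+  // holding keys 1 and 55 were already warmed up by the lookups above.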
+ for (size_t i = 0; i < key_ints.size(); ++i) {
+ key_data[i] = Key(key_ints[i]);
+ keys[i] = Slice(key_data[i]);
+ statuses[i] = Status::OK();
+ values[i].Reset();
+ }
+ dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
+ keys.data(), values.data(), statuses.data(), true);
+ for (size_t i = 0; i < key_ints.size(); ++i) {
+ ASSERT_OK(statuses[i]);
+ ASSERT_TRUE(CheckValue(key_ints[i], values[i].ToString()));
+ }
+ if (compression_enabled() && !has_compressed_cache()) {
+ expected_reads += (read_from_cache ? 2 : 3);
+ } else {
+ expected_reads += (read_from_cache ? 2 : 4);
+ }
+ ASSERT_EQ(env_->random_read_counter_.Read(), expected_reads);
+
+ keys.resize(10);
+ statuses.resize(10);
+ std::vector<int> key_uncmp{1, 2, 15, 16, 55, 81, 82, 83, 84, 85};
+ for (size_t i = 0; i < key_uncmp.size(); ++i) {
+ key_data[i] = "a" + Key(key_uncmp[i]);
+ keys[i] = Slice(key_data[i]);
+ statuses[i] = Status::OK();
+ values[i].Reset();
+ }
+ dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
+ keys.data(), values.data(), statuses.data(), true);
+ for (size_t i = 0; i < key_uncmp.size(); ++i) {
+ ASSERT_OK(statuses[i]);
+ ASSERT_TRUE(CheckUncompressableValue(key_uncmp[i], values[i].ToString()));
+ }
+ if (compression_enabled() && !has_compressed_cache()) {
+ expected_reads += (read_from_cache ? 3 : 3);
+ } else {
+ expected_reads += (read_from_cache ? 4 : 4);
+ }
+ ASSERT_EQ(env_->random_read_counter_.Read(), expected_reads);
+
+ keys.resize(5);
+ statuses.resize(5);
+ std::vector<int> key_tr{1, 2, 15, 16, 55};
+ for (size_t i = 0; i < key_tr.size(); ++i) {
+ key_data[i] = "a" + Key(key_tr[i]);
+ keys[i] = Slice(key_data[i]);
+ statuses[i] = Status::OK();
+ values[i].Reset();
+ }
+ dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
+ keys.data(), values.data(), statuses.data(), true);
+ for (size_t i = 0; i < key_tr.size(); ++i) {
+ ASSERT_OK(statuses[i]);
+ ASSERT_TRUE(CheckUncompressableValue(key_tr[i], values[i].ToString()));
+ }
+ if (compression_enabled() && !has_compressed_cache()) {
+ expected_reads += (read_from_cache ? 0 : 2);
+ ASSERT_EQ(env_->random_read_counter_.Read(), expected_reads);
+ } else {
+ if (has_uncompressed_cache()) {
+ expected_reads += (read_from_cache ? 0 : 3);
+ ASSERT_EQ(env_->random_read_counter_.Read(), expected_reads);
+ } else {
+        // A rare case: even with block compression enabled, some data
+        // blocks may not be compressed due to their content. If only the
+        // compressed cache is enabled, the uncompressed blocks will not be
+        // cached, and block reads will be triggered. The number of reads
+        // depends on the compression algorithm.
+ ASSERT_TRUE(env_->random_read_counter_.Read() >= expected_reads);
+ }
+ }
+}
+
+#ifndef ROCKSDB_LITE
+TEST_P(DBBasicTestWithParallelIO, MultiGetDirectIO) {
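+  // An Env that claims to do direct IO while actually issuing buffered reads,
+  // so the direct IO MultiGet path can be exercised on any filesystem.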
+ class FakeDirectIOEnv : public EnvWrapper {
+ class FakeDirectIOSequentialFile;
+ class FakeDirectIORandomAccessFile;
+
+ public:
+ FakeDirectIOEnv(Env* env) : EnvWrapper(env) {}
+ static const char* kClassName() { return "FakeDirectIOEnv"; }
+ const char* Name() const override { return kClassName(); }
+
+ Status NewRandomAccessFile(const std::string& fname,
+ std::unique_ptr<RandomAccessFile>* result,
+ const EnvOptions& options) override {
+ std::unique_ptr<RandomAccessFile> file;
+ assert(options.use_direct_reads);
+ EnvOptions opts = options;
+ opts.use_direct_reads = false;
+ Status s = target()->NewRandomAccessFile(fname, &file, opts);
+ if (!s.ok()) {
+ return s;
+ }
+ result->reset(new FakeDirectIORandomAccessFile(std::move(file)));
+ return s;
+ }
+
+ private:
+ class FakeDirectIOSequentialFile : public SequentialFileWrapper {
+ public:
+ FakeDirectIOSequentialFile(std::unique_ptr<SequentialFile>&& file)
+ : SequentialFileWrapper(file.get()), file_(std::move(file)) {}
+ ~FakeDirectIOSequentialFile() {}
+
+ bool use_direct_io() const override { return true; }
+ size_t GetRequiredBufferAlignment() const override { return 1; }
+
+ private:
+ std::unique_ptr<SequentialFile> file_;
+ };
+
+ class FakeDirectIORandomAccessFile : public RandomAccessFileWrapper {
+ public:
+ FakeDirectIORandomAccessFile(std::unique_ptr<RandomAccessFile>&& file)
+ : RandomAccessFileWrapper(file.get()), file_(std::move(file)) {}
+ ~FakeDirectIORandomAccessFile() {}
+
+ bool use_direct_io() const override { return true; }
+ size_t GetRequiredBufferAlignment() const override { return 1; }
+
+ private:
+ std::unique_ptr<RandomAccessFile> file_;
+ };
+ };
+
+ std::unique_ptr<FakeDirectIOEnv> env(new FakeDirectIOEnv(env_));
+ Options opts = get_options();
+ opts.env = env.get();
+ opts.use_direct_reads = true;
+ Reopen(opts);
+
+ std::vector<std::string> key_data(10);
+ std::vector<Slice> keys;
+ // We cannot resize a PinnableSlice vector, so just set initial size to
+ // largest we think we will need
+ std::vector<PinnableSlice> values(10);
+ std::vector<Status> statuses;
+ ReadOptions ro;
+ ro.fill_cache = fill_cache();
+
+ // Warm up the cache first
+ key_data.emplace_back(Key(0));
+ keys.emplace_back(Slice(key_data.back()));
+ key_data.emplace_back(Key(50));
+ keys.emplace_back(Slice(key_data.back()));
+ statuses.resize(keys.size());
+
+ dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
+ keys.data(), values.data(), statuses.data(), true);
+ ASSERT_TRUE(CheckValue(0, values[0].ToString()));
+ ASSERT_TRUE(CheckValue(50, values[1].ToString()));
+
+ int random_reads = env_->random_read_counter_.Read();
+ key_data[0] = Key(1);
+ key_data[1] = Key(51);
+ keys[0] = Slice(key_data[0]);
+ keys[1] = Slice(key_data[1]);
+ values[0].Reset();
+ values[1].Reset();
+ if (uncompressed_cache_) {
+ uncompressed_cache_->SetCapacity(0);
+ uncompressed_cache_->SetCapacity(1048576);
+ }
+ dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
+ keys.data(), values.data(), statuses.data(), true);
+ ASSERT_TRUE(CheckValue(1, values[0].ToString()));
+ ASSERT_TRUE(CheckValue(51, values[1].ToString()));
+
+ bool read_from_cache = false;
+ if (fill_cache()) {
+ if (has_uncompressed_cache()) {
+ read_from_cache = true;
+ } else if (has_compressed_cache() && compression_enabled()) {
+ read_from_cache = true;
+ }
+ }
+
+ int expected_reads = random_reads;
+ if (!compression_enabled() || !has_compressed_cache()) {
+ expected_reads += 2;
+ } else {
+ expected_reads += (read_from_cache ? 0 : 2);
+ }
+ if (env_->random_read_counter_.Read() != expected_reads) {
+ ASSERT_EQ(env_->random_read_counter_.Read(), expected_reads);
+ }
+ Close();
+}
+#endif // ROCKSDB_LITE
+
+TEST_P(DBBasicTestWithParallelIO, MultiGetWithChecksumMismatch) {
+ std::vector<std::string> key_data(10);
+ std::vector<Slice> keys;
+ // We cannot resize a PinnableSlice vector, so just set initial size to
+ // largest we think we will need
+ std::vector<PinnableSlice> values(10);
+ std::vector<Status> statuses;
+ int read_count = 0;
+ ReadOptions ro;
+ ro.fill_cache = fill_cache();
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "RetrieveMultipleBlocks:VerifyChecksum", [&](void* status) {
+ Status* s = static_cast<Status*>(status);
+ read_count++;
+ if (read_count == 2) {
+ *s = Status::Corruption();
+ }
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ // Warm up the cache first
+ key_data.emplace_back(Key(0));
+ keys.emplace_back(Slice(key_data.back()));
+ key_data.emplace_back(Key(50));
+ keys.emplace_back(Slice(key_data.back()));
+ statuses.resize(keys.size());
+
+ dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
+ keys.data(), values.data(), statuses.data(), true);
+ ASSERT_TRUE(CheckValue(0, values[0].ToString()));
+ // ASSERT_TRUE(CheckValue(50, values[1].ToString()));
+ ASSERT_EQ(statuses[0], Status::OK());
+ ASSERT_EQ(statuses[1], Status::Corruption());
+
+ SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_P(DBBasicTestWithParallelIO, MultiGetWithMissingFile) {
+ std::vector<std::string> key_data(10);
+ std::vector<Slice> keys;
+ // We cannot resize a PinnableSlice vector, so just set initial size to
+ // largest we think we will need
+ std::vector<PinnableSlice> values(10);
+ std::vector<Status> statuses;
+ ReadOptions ro;
+ ro.fill_cache = fill_cache();
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "TableCache::MultiGet:FindTable", [&](void* status) {
+ Status* s = static_cast<Status*>(status);
+ *s = Status::IOError();
+ });
+ // DB open will create table readers unless we reduce the table cache
+ // capacity.
+ // SanitizeOptions will set max_open_files to minimum of 20. Table cache
+ // is allocated with max_open_files - 10 as capacity. So override
+ // max_open_files to 11 so table cache capacity will become 1. This will
+ // prevent file open during DB open and force the file to be opened
+ // during MultiGet
+ SyncPoint::GetInstance()->SetCallBack(
+ "SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) {
+ int* max_open_files = (int*)arg;
+ *max_open_files = 11;
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ Reopen(CurrentOptions());
+
+ // Warm up the cache first
+ key_data.emplace_back(Key(0));
+ keys.emplace_back(Slice(key_data.back()));
+ key_data.emplace_back(Key(50));
+ keys.emplace_back(Slice(key_data.back()));
+ statuses.resize(keys.size());
+
+ dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
+ keys.data(), values.data(), statuses.data(), true);
+ ASSERT_EQ(statuses[0], Status::IOError());
+ ASSERT_EQ(statuses[1], Status::IOError());
+
+ SyncPoint::GetInstance()->DisableProcessing();
+}
+
+INSTANTIATE_TEST_CASE_P(ParallelIO, DBBasicTestWithParallelIO,
+ // Params are as follows -
+ // Param 0 - Compressed cache enabled
+ // Param 1 - Uncompressed cache enabled
+ // Param 2 - Data compression enabled
+ // Param 3 - ReadOptions::fill_cache
+ // Param 4 - CompressionOptions::parallel_threads
+ ::testing::Combine(::testing::Bool(), ::testing::Bool(),
+ ::testing::Bool(), ::testing::Bool(),
+ ::testing::Values(1, 4)));
+
+// Forward declaration
+class DeadlineFS;
+
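+// A random access file wrapper that verifies the IO timeout passed down in
+// IOOptions and lets DeadlineFS decide whether to inject a delay on each read.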
+class DeadlineRandomAccessFile : public FSRandomAccessFileOwnerWrapper {
+ public:
+ DeadlineRandomAccessFile(DeadlineFS& fs,
+ std::unique_ptr<FSRandomAccessFile>& file)
+ : FSRandomAccessFileOwnerWrapper(std::move(file)), fs_(fs) {}
+
+ IOStatus Read(uint64_t offset, size_t len, const IOOptions& opts,
+ Slice* result, char* scratch,
+ IODebugContext* dbg) const override;
+
+ IOStatus MultiRead(FSReadRequest* reqs, size_t num_reqs,
+ const IOOptions& options, IODebugContext* dbg) override;
+
+ IOStatus ReadAsync(FSReadRequest& req, const IOOptions& opts,
+ std::function<void(const FSReadRequest&, void*)> cb,
+ void* cb_arg, void** io_handle, IOHandleDeleter* del_fn,
+ IODebugContext* dbg) override;
+
+ private:
+ DeadlineFS& fs_;
+ std::unique_ptr<FSRandomAccessFile> file_;
+};
+
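+// A FileSystem wrapper that injects an artificial delay on a configurable IO
+// so that ReadOptions::deadline and io_timeout handling can be exercised
+// deterministically.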
+class DeadlineFS : public FileSystemWrapper {
+ public:
+  // The error_on_delay parameter specifies whether an IOStatus::TimedOut()
+  // status should be returned after delaying the IO to exceed the timeout,
+  // or to simply delay but return success anyway. The latter mimics the
+  // behavior of PosixFileSystem, which does not enforce any timeout.
+ explicit DeadlineFS(SpecialEnv* env, bool error_on_delay)
+ : FileSystemWrapper(env->GetFileSystem()),
+ deadline_(std::chrono::microseconds::zero()),
+ io_timeout_(std::chrono::microseconds::zero()),
+ env_(env),
+ timedout_(false),
+ ignore_deadline_(false),
+ error_on_delay_(error_on_delay) {}
+
+ static const char* kClassName() { return "DeadlineFileSystem"; }
+ const char* Name() const override { return kClassName(); }
+
+ IOStatus NewRandomAccessFile(const std::string& fname,
+ const FileOptions& opts,
+ std::unique_ptr<FSRandomAccessFile>* result,
+ IODebugContext* dbg) override {
+ std::unique_ptr<FSRandomAccessFile> file;
+ IOStatus s = target()->NewRandomAccessFile(fname, opts, &file, dbg);
+ EXPECT_OK(s);
+ result->reset(new DeadlineRandomAccessFile(*this, file));
+
+ const std::chrono::microseconds deadline = GetDeadline();
+ const std::chrono::microseconds io_timeout = GetIOTimeout();
+ if (deadline.count() || io_timeout.count()) {
+ AssertDeadline(deadline, io_timeout, opts.io_options);
+ }
+ return ShouldDelay(opts.io_options);
+ }
+
+  // Set the deadline and IO timeout to enforce, and the IO count at which to
+  // inject a delay that exceeds them
+ void SetDelayTrigger(const std::chrono::microseconds deadline,
+ const std::chrono::microseconds io_timeout,
+ const int trigger) {
+ delay_trigger_ = trigger;
+ io_count_ = 0;
+ deadline_ = deadline;
+ io_timeout_ = io_timeout;
+ timedout_ = false;
+ }
+
+  // Increment the IO counter and, when the delay trigger is reached, delay
+  // the IO past the configured timeout, optionally returning
+  // IOStatus::TimedOut()
+ IOStatus ShouldDelay(const IOOptions& opts) {
+ if (timedout_) {
+ return IOStatus::TimedOut();
+ } else if (!deadline_.count() && !io_timeout_.count()) {
+ return IOStatus::OK();
+ }
+ if (!ignore_deadline_ && delay_trigger_ == io_count_++) {
+ env_->SleepForMicroseconds(static_cast<int>(opts.timeout.count() + 1));
+ timedout_ = true;
+ if (error_on_delay_) {
+ return IOStatus::TimedOut();
+ }
+ }
+ return IOStatus::OK();
+ }
+
+ const std::chrono::microseconds GetDeadline() {
+ return ignore_deadline_ ? std::chrono::microseconds::zero() : deadline_;
+ }
+
+ const std::chrono::microseconds GetIOTimeout() {
+ return ignore_deadline_ ? std::chrono::microseconds::zero() : io_timeout_;
+ }
+
+ bool TimedOut() { return timedout_; }
+
+ void IgnoreDeadline(bool ignore) { ignore_deadline_ = ignore; }
+
+ void AssertDeadline(const std::chrono::microseconds deadline,
+ const std::chrono::microseconds io_timeout,
+ const IOOptions& opts) const {
+ // Give a leeway of +- 10us as it can take some time for the Get/
+ // MultiGet call to reach here, in order to avoid false alarms
+ std::chrono::microseconds now =
+ std::chrono::microseconds(env_->NowMicros());
+ std::chrono::microseconds timeout;
+ if (deadline.count()) {
+ timeout = deadline - now;
+ if (io_timeout.count()) {
+ timeout = std::min(timeout, io_timeout);
+ }
+ } else {
+ timeout = io_timeout;
+ }
+ if (opts.timeout != timeout) {
+ ASSERT_EQ(timeout, opts.timeout);
+ }
+ }
+
+ private:
+ // The number of IOs to trigger the delay after
+ int delay_trigger_;
+ // Current IO count
+ int io_count_;
+ // ReadOptions deadline for the Get/MultiGet/Iterator
+ std::chrono::microseconds deadline_;
+ // ReadOptions io_timeout for the Get/MultiGet/Iterator
+ std::chrono::microseconds io_timeout_;
+ SpecialEnv* env_;
+ // Flag to indicate whether we injected a delay
+ bool timedout_;
+ // Temporarily ignore deadlines/timeouts
+ bool ignore_deadline_;
+ // Return IOStatus::TimedOut() or IOStatus::OK()
+ bool error_on_delay_;
+};
+
+IOStatus DeadlineRandomAccessFile::Read(uint64_t offset, size_t len,
+ const IOOptions& opts, Slice* result,
+ char* scratch,
+ IODebugContext* dbg) const {
+ const std::chrono::microseconds deadline = fs_.GetDeadline();
+ const std::chrono::microseconds io_timeout = fs_.GetIOTimeout();
+ IOStatus s;
+ if (deadline.count() || io_timeout.count()) {
+ fs_.AssertDeadline(deadline, io_timeout, opts);
+ }
+ if (s.ok()) {
+ s = FSRandomAccessFileWrapper::Read(offset, len, opts, result, scratch,
+ dbg);
+ }
+ if (s.ok()) {
+ s = fs_.ShouldDelay(opts);
+ }
+ return s;
+}
+
+IOStatus DeadlineRandomAccessFile::ReadAsync(
+ FSReadRequest& req, const IOOptions& opts,
+ std::function<void(const FSReadRequest&, void*)> cb, void* cb_arg,
+ void** io_handle, IOHandleDeleter* del_fn, IODebugContext* dbg) {
+ const std::chrono::microseconds deadline = fs_.GetDeadline();
+ const std::chrono::microseconds io_timeout = fs_.GetIOTimeout();
+ IOStatus s;
+ if (deadline.count() || io_timeout.count()) {
+ fs_.AssertDeadline(deadline, io_timeout, opts);
+ }
+ if (s.ok()) {
+ s = FSRandomAccessFileWrapper::ReadAsync(req, opts, cb, cb_arg, io_handle,
+ del_fn, dbg);
+ }
+ if (s.ok()) {
+ s = fs_.ShouldDelay(opts);
+ }
+ return s;
+}
+
+IOStatus DeadlineRandomAccessFile::MultiRead(FSReadRequest* reqs,
+ size_t num_reqs,
+ const IOOptions& options,
+ IODebugContext* dbg) {
+ const std::chrono::microseconds deadline = fs_.GetDeadline();
+ const std::chrono::microseconds io_timeout = fs_.GetIOTimeout();
+ IOStatus s;
+ if (deadline.count() || io_timeout.count()) {
+ fs_.AssertDeadline(deadline, io_timeout, options);
+ }
+ if (s.ok()) {
+ s = FSRandomAccessFileWrapper::MultiRead(reqs, num_reqs, options, dbg);
+ }
+ if (s.ok()) {
+ s = fs_.ShouldDelay(options);
+ }
+ return s;
+}
+
+// A test class for intercepting random reads and injecting artificial
+// delays. Used for testing the MultiGet deadline feature
+class DBBasicTestMultiGetDeadline : public DBBasicTestMultiGet,
+ public testing::WithParamInterface<bool> {
+ public:
+ DBBasicTestMultiGetDeadline()
+ : DBBasicTestMultiGet(
+ "db_basic_test_multiget_deadline" /*Test dir*/,
+ 10 /*# of column families*/, false /*compressed cache enabled*/,
+ true /*uncompressed cache enabled*/, true /*compression enabled*/,
+ true /*ReadOptions.fill_cache*/,
+ 1 /*# of parallel compression threads*/) {}
+
+ inline void CheckStatus(std::vector<Status>& statuses, size_t num_ok) {
+ for (size_t i = 0; i < statuses.size(); ++i) {
+ if (i < num_ok) {
+ EXPECT_OK(statuses[i]);
+ } else {
+ if (statuses[i] != Status::TimedOut()) {
+ EXPECT_EQ(statuses[i], Status::TimedOut());
+ }
+ }
+ }
+ }
+};
+
+TEST_P(DBBasicTestMultiGetDeadline, MultiGetDeadlineExceeded) {
+#ifndef USE_COROUTINES
+ if (GetParam()) {
+ ROCKSDB_GTEST_SKIP("This test requires coroutine support");
+ return;
+ }
+#endif // USE_COROUTINES
+ std::shared_ptr<DeadlineFS> fs = std::make_shared<DeadlineFS>(env_, false);
+ std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, fs));
+ Options options = CurrentOptions();
+
+ std::shared_ptr<Cache> cache = NewLRUCache(1048576);
+ BlockBasedTableOptions table_options;
+ table_options.block_cache = cache;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ options.env = env.get();
+ SetTimeElapseOnlySleepOnReopen(&options);
+ ReopenWithColumnFamilies(GetCFNames(), options);
+
+ // Test the non-batched version of MultiGet with multiple column
+ // families
+ std::vector<std::string> key_str;
+ size_t i;
+ for (i = 0; i < 5; ++i) {
+ key_str.emplace_back(Key(static_cast<int>(i)));
+ }
+  std::vector<ColumnFamilyHandle*> cfs(key_str.size());
+ std::vector<Slice> keys(key_str.size());
+ std::vector<std::string> values(key_str.size());
+ for (i = 0; i < key_str.size(); ++i) {
+ cfs[i] = handles_[i];
+ keys[i] = Slice(key_str[i].data(), key_str[i].size());
+ }
+
+ ReadOptions ro;
+ ro.deadline = std::chrono::microseconds{env->NowMicros() + 10000};
+ ro.async_io = GetParam();
+ // Delay the first IO
+ fs->SetDelayTrigger(ro.deadline, ro.io_timeout, 0);
+
+ std::vector<Status> statuses = dbfull()->MultiGet(ro, cfs, keys, &values);
+ // The first key is successful because we check after the lookup, but
+ // subsequent keys fail due to deadline exceeded
+ CheckStatus(statuses, 1);
+
+ // Clear the cache
+ cache->SetCapacity(0);
+ cache->SetCapacity(1048576);
+ // Test non-batched Multiget with multiple column families and
+ // introducing an IO delay in one of the middle CFs
+ key_str.clear();
+ for (i = 0; i < 10; ++i) {
+ key_str.emplace_back(Key(static_cast<int>(i)));
+ }
+ cfs.resize(key_str.size());
+ keys.resize(key_str.size());
+ values.resize(key_str.size());
+ for (i = 0; i < key_str.size(); ++i) {
+ // 2 keys per CF
+ cfs[i] = handles_[i / 2];
+ keys[i] = Slice(key_str[i].data(), key_str[i].size());
+ }
+ ro.deadline = std::chrono::microseconds{env->NowMicros() + 10000};
+ fs->SetDelayTrigger(ro.deadline, ro.io_timeout, 1);
+ statuses = dbfull()->MultiGet(ro, cfs, keys, &values);
+ CheckStatus(statuses, 3);
+
+ // Test batched MultiGet with an IO delay in the first data block read.
+ // Both keys in the first CF should succeed as they're in the same data
+ // block and would form one batch, and we check for deadline between
+ // batches.
+ std::vector<PinnableSlice> pin_values(keys.size());
+ cache->SetCapacity(0);
+ cache->SetCapacity(1048576);
+ statuses.clear();
+ statuses.resize(keys.size());
+ ro.deadline = std::chrono::microseconds{env->NowMicros() + 10000};
+ fs->SetDelayTrigger(ro.deadline, ro.io_timeout, 0);
+ dbfull()->MultiGet(ro, keys.size(), cfs.data(), keys.data(),
+ pin_values.data(), statuses.data());
+ CheckStatus(statuses, 2);
+
+ // Similar to the previous one, but an IO delay in the third CF data block
+ // read
+ for (PinnableSlice& value : pin_values) {
+ value.Reset();
+ }
+ cache->SetCapacity(0);
+ cache->SetCapacity(1048576);
+ statuses.clear();
+ statuses.resize(keys.size());
+ ro.deadline = std::chrono::microseconds{env->NowMicros() + 10000};
+ fs->SetDelayTrigger(ro.deadline, ro.io_timeout, 2);
+ dbfull()->MultiGet(ro, keys.size(), cfs.data(), keys.data(),
+ pin_values.data(), statuses.data());
+ CheckStatus(statuses, 6);
+
+ // Similar to the previous one, but an IO delay in the last but one CF
+ for (PinnableSlice& value : pin_values) {
+ value.Reset();
+ }
+ cache->SetCapacity(0);
+ cache->SetCapacity(1048576);
+ statuses.clear();
+ statuses.resize(keys.size());
+ ro.deadline = std::chrono::microseconds{env->NowMicros() + 10000};
+ fs->SetDelayTrigger(ro.deadline, ro.io_timeout, 3);
+ dbfull()->MultiGet(ro, keys.size(), cfs.data(), keys.data(),
+ pin_values.data(), statuses.data());
+ CheckStatus(statuses, 8);
+
+  // Test batched MultiGet with a single CF and lots of keys. Inject a delay
+  // into the second batch of keys. As each batch is 32 keys, the first 64
+  // keys, i.e. the first two batches, should succeed and the rest should
+  // time out
+ for (PinnableSlice& value : pin_values) {
+ value.Reset();
+ }
+ cache->SetCapacity(0);
+ cache->SetCapacity(1048576);
+ key_str.clear();
+ for (i = 0; i < 100; ++i) {
+ key_str.emplace_back(Key(static_cast<int>(i)));
+ }
+ keys.resize(key_str.size());
+ pin_values.clear();
+ pin_values.resize(key_str.size());
+ for (i = 0; i < key_str.size(); ++i) {
+ keys[i] = Slice(key_str[i].data(), key_str[i].size());
+ }
+ statuses.clear();
+ statuses.resize(keys.size());
+ ro.deadline = std::chrono::microseconds{env->NowMicros() + 10000};
+ fs->SetDelayTrigger(ro.deadline, ro.io_timeout, 1);
+ dbfull()->MultiGet(ro, handles_[0], keys.size(), keys.data(),
+ pin_values.data(), statuses.data());
+ CheckStatus(statuses, 64);
+ Close();
+}
+
+INSTANTIATE_TEST_CASE_P(DeadlineIO, DBBasicTestMultiGetDeadline,
+ ::testing::Bool());
+
+TEST_F(DBBasicTest, ManifestWriteFailure) {
+ Options options = GetDefaultOptions();
+ options.create_if_missing = true;
+ options.disable_auto_compactions = true;
+ options.env = env_;
+ DestroyAndReopen(options);
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Flush());
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->SetCallBack(
+ "VersionSet::ProcessManifestWrites:AfterSyncManifest", [&](void* arg) {
+ ASSERT_NE(nullptr, arg);
+ auto* s = reinterpret_cast<Status*>(arg);
+ ASSERT_OK(*s);
+ // Manually overwrite return status
+ *s = Status::IOError();
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ ASSERT_OK(Put("key", "value"));
+ ASSERT_NOK(Flush());
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->EnableProcessing();
+ Reopen(options);
+}
+
+TEST_F(DBBasicTest, DestroyDefaultCfHandle) {
+ Options options = GetDefaultOptions();
+ options.create_if_missing = true;
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+ for (const auto* h : handles_) {
+ ASSERT_NE(db_->DefaultColumnFamily(), h);
+ }
+
+ // We have two handles to the default column family. The two handles point to
+ // different ColumnFamilyHandle objects.
+ assert(db_->DefaultColumnFamily());
+ ASSERT_EQ(0U, db_->DefaultColumnFamily()->GetID());
+ assert(handles_[0]);
+ ASSERT_EQ(0U, handles_[0]->GetID());
+
+ // You can destroy handles_[...].
+ for (auto* h : handles_) {
+ ASSERT_OK(db_->DestroyColumnFamilyHandle(h));
+ }
+ handles_.clear();
+
+ // But you should not destroy db_->DefaultColumnFamily(), since it's going to
+ // be deleted in `DBImpl::CloseHelper()`. Before that, it may be used
+ // elsewhere internally too.
+ ColumnFamilyHandle* default_cf = db_->DefaultColumnFamily();
+ ASSERT_TRUE(db_->DestroyColumnFamilyHandle(default_cf).IsInvalidArgument());
+}
+
+TEST_F(DBBasicTest, FailOpenIfLoggerCreationFail) {
+ Options options = GetDefaultOptions();
+ options.create_if_missing = true;
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->SetCallBack(
+ "rocksdb::CreateLoggerFromOptions:AfterGetPath", [&](void* arg) {
+ auto* s = reinterpret_cast<Status*>(arg);
+ assert(s);
+ *s = Status::IOError("Injected");
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ Status s = TryReopen(options);
+ ASSERT_EQ(nullptr, options.info_log);
+ ASSERT_TRUE(s.IsIOError());
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBBasicTest, VerifyFileChecksums) {
+ Options options = GetDefaultOptions();
+ options.create_if_missing = true;
+ options.env = env_;
+ DestroyAndReopen(options);
+ ASSERT_OK(Put("a", "value"));
+ ASSERT_OK(Flush());
+ ASSERT_TRUE(db_->VerifyFileChecksums(ReadOptions()).IsInvalidArgument());
+
+ options.file_checksum_gen_factory = GetFileChecksumGenCrc32cFactory();
+ Reopen(options);
+ ASSERT_OK(db_->VerifyFileChecksums(ReadOptions()));
+
+ // Write an L0 with checksum computed.
+ ASSERT_OK(Put("b", "value"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(db_->VerifyFileChecksums(ReadOptions()));
+
+ // Does the right thing but with the wrong name -- using it should lead to an
+ // error.
+ class MisnamedFileChecksumGenerator : public FileChecksumGenCrc32c {
+ public:
+ MisnamedFileChecksumGenerator(const FileChecksumGenContext& context)
+ : FileChecksumGenCrc32c(context) {}
+
+ const char* Name() const override { return "sha1"; }
+ };
+
+ class MisnamedFileChecksumGenFactory : public FileChecksumGenCrc32cFactory {
+ public:
+ std::unique_ptr<FileChecksumGenerator> CreateFileChecksumGenerator(
+ const FileChecksumGenContext& context) override {
+ return std::unique_ptr<FileChecksumGenerator>(
+ new MisnamedFileChecksumGenerator(context));
+ }
+ };
+
+ options.file_checksum_gen_factory.reset(new MisnamedFileChecksumGenFactory());
+ Reopen(options);
+ ASSERT_TRUE(db_->VerifyFileChecksums(ReadOptions()).IsInvalidArgument());
+}
+
+// TODO: re-enable after we provide finer-grained control for WAL tracking to
+// meet the needs of different use cases, durability levels and recovery modes.
+TEST_F(DBBasicTest, DISABLED_ManualWalSync) {
+ Options options = CurrentOptions();
+ options.track_and_verify_wals_in_manifest = true;
+ options.wal_recovery_mode = WALRecoveryMode::kAbsoluteConsistency;
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put("x", "y"));
+ // This does not create a new WAL.
+ ASSERT_OK(db_->SyncWAL());
+ EXPECT_FALSE(dbfull()->GetVersionSet()->GetWalSet().GetWals().empty());
+
+ std::unique_ptr<LogFile> wal;
+ Status s = db_->GetCurrentWalFile(&wal);
+ ASSERT_OK(s);
+ Close();
+
+ EXPECT_OK(env_->DeleteFile(LogFileName(dbname_, wal->LogNumber())));
+
+ ASSERT_TRUE(TryReopen(options).IsCorruption());
+}
+#endif // !ROCKSDB_LITE
+
+// A test class for intercepting random reads and injecting artificial
+// delays. Used for testing the deadline/timeout feature
+class DBBasicTestDeadline
+ : public DBBasicTest,
+ public testing::WithParamInterface<std::tuple<bool, bool>> {};
+
+TEST_P(DBBasicTestDeadline, PointLookupDeadline) {
+ std::shared_ptr<DeadlineFS> fs = std::make_shared<DeadlineFS>(env_, true);
+ std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, fs));
+ bool set_deadline = std::get<0>(GetParam());
+ bool set_timeout = std::get<1>(GetParam());
+
+ for (int option_config = kDefault; option_config < kEnd; ++option_config) {
+ if (ShouldSkipOptions(option_config, kSkipPlainTable | kSkipMmapReads)) {
+ continue;
+ }
+ option_config_ = option_config;
+ Options options = CurrentOptions();
+ if (options.use_direct_reads) {
+ continue;
+ }
+ options.env = env.get();
+ options.disable_auto_compactions = true;
+ Cache* block_cache = nullptr;
+    // Filter block reads currently don't cause the request to get
+    // aborted on a read timeout, so it's possible those block reads
+    // may get issued even if the deadline is past
+ SyncPoint::GetInstance()->SetCallBack(
+ "BlockBasedTable::Get:BeforeFilterMatch",
+ [&](void* /*arg*/) { fs->IgnoreDeadline(true); });
+ SyncPoint::GetInstance()->SetCallBack(
+ "BlockBasedTable::Get:AfterFilterMatch",
+ [&](void* /*arg*/) { fs->IgnoreDeadline(false); });
+    // DB open will create table readers unless we reduce the table cache
+    // capacity.
+    // SanitizeOptions will set max_open_files to a minimum of 20. Table cache
+    // is allocated with max_open_files - 10 as capacity. So override
+    // max_open_files to 11 so table cache capacity will become 1. This will
+    // prevent file open during DB open and force the file to be opened
+    // during Get
+ SyncPoint::GetInstance()->SetCallBack(
+ "SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) {
+ int* max_open_files = (int*)arg;
+ *max_open_files = 11;
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ SetTimeElapseOnlySleepOnReopen(&options);
+ Reopen(options);
+
+ if (options.table_factory) {
+ block_cache = options.table_factory->GetOptions<Cache>(
+ TableFactory::kBlockCacheOpts());
+ }
+
+ Random rnd(301);
+ for (int i = 0; i < 400; ++i) {
+ std::string key = "k" + std::to_string(i);
+ ASSERT_OK(Put(key, rnd.RandomString(100)));
+ }
+ ASSERT_OK(Flush());
+
+ bool timedout = true;
+ // A timeout will be forced when the IO counter reaches this value
+ int io_deadline_trigger = 0;
+ // Keep incrementing io_deadline_trigger and calling Get() until there is
+ // an iteration that doesn't cause a timeout. This ensures that we cover
+ // all file reads in the point lookup path that can potentially time out
+ // and cause the Get() to fail.
+ while (timedout) {
+ ReadOptions ro;
+ if (set_deadline) {
+ ro.deadline = std::chrono::microseconds{env->NowMicros() + 10000};
+ }
+ if (set_timeout) {
+ ro.io_timeout = std::chrono::microseconds{5000};
+ }
+ fs->SetDelayTrigger(ro.deadline, ro.io_timeout, io_deadline_trigger);
+
+ block_cache->SetCapacity(0);
+ block_cache->SetCapacity(1048576);
+
+ std::string value;
+ Status s = dbfull()->Get(ro, "k50", &value);
+ if (fs->TimedOut()) {
+ ASSERT_EQ(s, Status::TimedOut());
+ } else {
+ timedout = false;
+ ASSERT_OK(s);
+ }
+ io_deadline_trigger++;
+ }
+ // Reset the delay sequence in order to avoid false alarms during Reopen
+ fs->SetDelayTrigger(std::chrono::microseconds::zero(),
+ std::chrono::microseconds::zero(), 0);
+ }
+ Close();
+}
+
+TEST_P(DBBasicTestDeadline, IteratorDeadline) {
+ std::shared_ptr<DeadlineFS> fs = std::make_shared<DeadlineFS>(env_, true);
+ std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, fs));
+ bool set_deadline = std::get<0>(GetParam());
+ bool set_timeout = std::get<1>(GetParam());
+
+ for (int option_config = kDefault; option_config < kEnd; ++option_config) {
+ if (ShouldSkipOptions(option_config, kSkipPlainTable | kSkipMmapReads)) {
+ continue;
+ }
+ Options options = CurrentOptions();
+ if (options.use_direct_reads) {
+ continue;
+ }
+ options.env = env.get();
+ options.disable_auto_compactions = true;
+ Cache* block_cache = nullptr;
+ // DB open will create table readers unless we reduce the table cache
+ // capacity.
+ // SanitizeOptions will set max_open_files to a minimum of 20. The table
+ // cache is allocated with max_open_files - 10 as its capacity, so override
+ // max_open_files to 11 to make the table cache capacity 1. This prevents
+ // the file from being opened during DB open and forces it to be opened
+ // during iteration.
+ SyncPoint::GetInstance()->SetCallBack(
+ "SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) {
+ int* max_open_files = (int*)arg;
+ *max_open_files = 11;
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ SetTimeElapseOnlySleepOnReopen(&options);
+ Reopen(options);
+
+ if (options.table_factory) {
+ block_cache = options.table_factory->GetOptions<Cache>(
+ TableFactory::kBlockCacheOpts());
+ }
+
+ Random rnd(301);
+ for (int i = 0; i < 400; ++i) {
+ std::string key = "k" + std::to_string(i);
+ ASSERT_OK(Put(key, rnd.RandomString(100)));
+ }
+ ASSERT_OK(Flush());
+
+ bool timedout = true;
+ // A timeout will be forced when the IO counter reaches this value
+ int io_deadline_trigger = 0;
+ // Keep incrementing io_deadline_trigger and re-running the iteration until
+ // there is a pass that doesn't cause a timeout. This ensures that we cover
+ // all file reads in the iterator path that can potentially time out.
+ while (timedout) {
+ ReadOptions ro;
+ if (set_deadline) {
+ ro.deadline = std::chrono::microseconds{env->NowMicros() + 10000};
+ }
+ if (set_timeout) {
+ ro.io_timeout = std::chrono::microseconds{5000};
+ }
+ fs->SetDelayTrigger(ro.deadline, ro.io_timeout, io_deadline_trigger);
+
+ block_cache->SetCapacity(0);
+ block_cache->SetCapacity(1048576);
+
+ Iterator* iter = dbfull()->NewIterator(ro);
+ int count = 0;
+ iter->Seek("k50");
+ while (iter->Valid() && count++ < 100) {
+ iter->Next();
+ }
+ if (fs->TimedOut()) {
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_EQ(iter->status(), Status::TimedOut());
+ } else {
+ timedout = false;
+ ASSERT_OK(iter->status());
+ }
+ delete iter;
+ io_deadline_trigger++;
+ }
+ // Reset the delay sequence in order to avoid false alarms during Reopen
+ fs->SetDelayTrigger(std::chrono::microseconds::zero(),
+ std::chrono::microseconds::zero(), 0);
+ }
+ Close();
+}
+
+// Param 0: If true, set read_options.deadline
+// Param 1: If true, set read_options.io_timeout
+INSTANTIATE_TEST_CASE_P(DBBasicTestDeadline, DBBasicTestDeadline,
+ ::testing::Values(std::make_tuple(true, false),
+ std::make_tuple(false, true),
+ std::make_tuple(true, true)));
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ RegisterCustomObjects(argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_block_cache_test.cc b/src/rocksdb/db/db_block_cache_test.cc
new file mode 100644
index 000000000..db80b82cb
--- /dev/null
+++ b/src/rocksdb/db/db_block_cache_test.cc
@@ -0,0 +1,2313 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#include <cstdlib>
+#include <functional>
+#include <memory>
+#include <unordered_set>
+
+#include "cache/cache_entry_roles.h"
+#include "cache/cache_key.h"
+#include "cache/lru_cache.h"
+#include "db/column_family.h"
+#include "db/db_impl/db_impl.h"
+#include "db/db_test_util.h"
+#include "env/unique_id_gen.h"
+#include "port/stack_trace.h"
+#include "rocksdb/persistent_cache.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/table.h"
+#include "rocksdb/table_properties.h"
+#include "table/block_based/block_based_table_reader.h"
+#include "table/unique_id_impl.h"
+#include "util/compression.h"
+#include "util/defer.h"
+#include "util/hash.h"
+#include "util/math.h"
+#include "util/random.h"
+#include "utilities/fault_injection_fs.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBBlockCacheTest : public DBTestBase {
+ private:
+ size_t miss_count_ = 0;
+ size_t hit_count_ = 0;
+ size_t insert_count_ = 0;
+ size_t failure_count_ = 0;
+ size_t compression_dict_miss_count_ = 0;
+ size_t compression_dict_hit_count_ = 0;
+ size_t compression_dict_insert_count_ = 0;
+ size_t compressed_miss_count_ = 0;
+ size_t compressed_hit_count_ = 0;
+ size_t compressed_insert_count_ = 0;
+ size_t compressed_failure_count_ = 0;
+
+ public:
+ const size_t kNumBlocks = 10;
+ const size_t kValueSize = 100;
+
+ DBBlockCacheTest()
+ : DBTestBase("db_block_cache_test", /*env_do_fsync=*/true) {}
+
+ BlockBasedTableOptions GetTableOptions() {
+ BlockBasedTableOptions table_options;
+ // Set a small enough block size so that each key-value pair gets its own
+ // block.
+ table_options.block_size = 1;
+ return table_options;
+ }
+
+ Options GetOptions(const BlockBasedTableOptions& table_options) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.avoid_flush_during_recovery = false;
+ // options.compression = kNoCompression;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ return options;
+ }
+
+ void InitTable(const Options& /*options*/) {
+ std::string value(kValueSize, 'a');
+ for (size_t i = 0; i < kNumBlocks; i++) {
+ ASSERT_OK(Put(std::to_string(i), value.c_str()));
+ }
+ }
+
+ void RecordCacheCounters(const Options& options) {
+ miss_count_ = TestGetTickerCount(options, BLOCK_CACHE_MISS);
+ hit_count_ = TestGetTickerCount(options, BLOCK_CACHE_HIT);
+ insert_count_ = TestGetTickerCount(options, BLOCK_CACHE_ADD);
+ failure_count_ = TestGetTickerCount(options, BLOCK_CACHE_ADD_FAILURES);
+ compressed_miss_count_ =
+ TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_MISS);
+ compressed_hit_count_ =
+ TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_HIT);
+ compressed_insert_count_ =
+ TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_ADD);
+ compressed_failure_count_ =
+ TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_ADD_FAILURES);
+ }
+
+ void RecordCacheCountersForCompressionDict(const Options& options) {
+ compression_dict_miss_count_ =
+ TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_MISS);
+ compression_dict_hit_count_ =
+ TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_HIT);
+ compression_dict_insert_count_ =
+ TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_ADD);
+ }
+
+ void CheckCacheCounters(const Options& options, size_t expected_misses,
+ size_t expected_hits, size_t expected_inserts,
+ size_t expected_failures) {
+ size_t new_miss_count = TestGetTickerCount(options, BLOCK_CACHE_MISS);
+ size_t new_hit_count = TestGetTickerCount(options, BLOCK_CACHE_HIT);
+ size_t new_insert_count = TestGetTickerCount(options, BLOCK_CACHE_ADD);
+ size_t new_failure_count =
+ TestGetTickerCount(options, BLOCK_CACHE_ADD_FAILURES);
+ ASSERT_EQ(miss_count_ + expected_misses, new_miss_count);
+ ASSERT_EQ(hit_count_ + expected_hits, new_hit_count);
+ ASSERT_EQ(insert_count_ + expected_inserts, new_insert_count);
+ ASSERT_EQ(failure_count_ + expected_failures, new_failure_count);
+ miss_count_ = new_miss_count;
+ hit_count_ = new_hit_count;
+ insert_count_ = new_insert_count;
+ failure_count_ = new_failure_count;
+ }
+
+ void CheckCacheCountersForCompressionDict(
+ const Options& options, size_t expected_compression_dict_misses,
+ size_t expected_compression_dict_hits,
+ size_t expected_compression_dict_inserts) {
+ size_t new_compression_dict_miss_count =
+ TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_MISS);
+ size_t new_compression_dict_hit_count =
+ TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_HIT);
+ size_t new_compression_dict_insert_count =
+ TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_ADD);
+ ASSERT_EQ(compression_dict_miss_count_ + expected_compression_dict_misses,
+ new_compression_dict_miss_count);
+ ASSERT_EQ(compression_dict_hit_count_ + expected_compression_dict_hits,
+ new_compression_dict_hit_count);
+ ASSERT_EQ(
+ compression_dict_insert_count_ + expected_compression_dict_inserts,
+ new_compression_dict_insert_count);
+ compression_dict_miss_count_ = new_compression_dict_miss_count;
+ compression_dict_hit_count_ = new_compression_dict_hit_count;
+ compression_dict_insert_count_ = new_compression_dict_insert_count;
+ }
+
+ void CheckCompressedCacheCounters(const Options& options,
+ size_t expected_misses,
+ size_t expected_hits,
+ size_t expected_inserts,
+ size_t expected_failures) {
+ size_t new_miss_count =
+ TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_MISS);
+ size_t new_hit_count =
+ TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_HIT);
+ size_t new_insert_count =
+ TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_ADD);
+ size_t new_failure_count =
+ TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_ADD_FAILURES);
+ ASSERT_EQ(compressed_miss_count_ + expected_misses, new_miss_count);
+ ASSERT_EQ(compressed_hit_count_ + expected_hits, new_hit_count);
+ ASSERT_EQ(compressed_insert_count_ + expected_inserts, new_insert_count);
+ ASSERT_EQ(compressed_failure_count_ + expected_failures, new_failure_count);
+ compressed_miss_count_ = new_miss_count;
+ compressed_hit_count_ = new_hit_count;
+ compressed_insert_count_ = new_insert_count;
+ compressed_failure_count_ = new_failure_count;
+ }
+
+#ifndef ROCKSDB_LITE
+ const std::array<size_t, kNumCacheEntryRoles> GetCacheEntryRoleCountsBg() {
+ // Read the current cache entry role counts (for verification) from the
+ // fast block cache entry stats property.
+ std::array<size_t, kNumCacheEntryRoles> cache_entry_role_counts;
+ std::map<std::string, std::string> values;
+ EXPECT_TRUE(db_->GetMapProperty(DB::Properties::kFastBlockCacheEntryStats,
+ &values));
+ for (size_t i = 0; i < kNumCacheEntryRoles; ++i) {
+ auto role = static_cast<CacheEntryRole>(i);
+ cache_entry_role_counts[i] =
+ ParseSizeT(values[BlockCacheEntryStatsMapKeys::EntryCount(role)]);
+ }
+ return cache_entry_role_counts;
+ }
+#endif // ROCKSDB_LITE
+};
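+
+// Illustrative sketch (not a test): the common way the tests below attach a
+// block cache is through BlockBasedTableOptions. Capacity and sharding values
+// here are arbitrary.
+//
+//   LRUCacheOptions co;
+//   co.capacity = 1 << 20;
+//   co.num_shard_bits = 0;
+//   co.metadata_charge_policy = kDontChargeCacheMetadata;
+//   BlockBasedTableOptions table_options;
+//   table_options.block_cache = NewLRUCache(co);
+//   Options options = CurrentOptions();
+//   options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+//   Reopen(options);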
+
+TEST_F(DBBlockCacheTest, IteratorBlockCacheUsage) {
+ ReadOptions read_options;
+ read_options.fill_cache = false;
+ auto table_options = GetTableOptions();
+ auto options = GetOptions(table_options);
+ InitTable(options);
+
+ LRUCacheOptions co;
+ co.capacity = 0;
+ co.num_shard_bits = 0;
+ co.strict_capacity_limit = false;
+ // Needed so that the entry stats collector is not counted toward usage
+ co.metadata_charge_policy = kDontChargeCacheMetadata;
+ std::shared_ptr<Cache> cache = NewLRUCache(co);
+ table_options.block_cache = cache;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ Reopen(options);
+ RecordCacheCounters(options);
+
+ std::vector<std::unique_ptr<Iterator>> iterators(kNumBlocks - 1);
+ Iterator* iter = nullptr;
+
+ ASSERT_EQ(0, cache->GetUsage());
+ iter = db_->NewIterator(read_options);
+ iter->Seek(std::to_string(0));
+ ASSERT_LT(0, cache->GetUsage());
+ delete iter;
+ iter = nullptr;
+ ASSERT_EQ(0, cache->GetUsage());
+}
+
+TEST_F(DBBlockCacheTest, TestWithoutCompressedBlockCache) {
+ ReadOptions read_options;
+ auto table_options = GetTableOptions();
+ auto options = GetOptions(table_options);
+ InitTable(options);
+
+ LRUCacheOptions co;
+ co.capacity = 0;
+ co.num_shard_bits = 0;
+ co.strict_capacity_limit = false;
+ // Needed so that the entry stats collector is not counted toward usage
+ co.metadata_charge_policy = kDontChargeCacheMetadata;
+ std::shared_ptr<Cache> cache = NewLRUCache(co);
+ table_options.block_cache = cache;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ Reopen(options);
+ RecordCacheCounters(options);
+
+ std::vector<std::unique_ptr<Iterator>> iterators(kNumBlocks - 1);
+ Iterator* iter = nullptr;
+
+ // Load blocks into cache.
+ for (size_t i = 0; i + 1 < kNumBlocks; i++) {
+ iter = db_->NewIterator(read_options);
+ iter->Seek(std::to_string(i));
+ ASSERT_OK(iter->status());
+ CheckCacheCounters(options, 1, 0, 1, 0);
+ iterators[i].reset(iter);
+ }
+ size_t usage = cache->GetUsage();
+ ASSERT_LT(0, usage);
+ cache->SetCapacity(usage);
+ ASSERT_EQ(usage, cache->GetPinnedUsage());
+
+ // Test with strict capacity limit.
+ cache->SetStrictCapacityLimit(true);
+ iter = db_->NewIterator(read_options);
+ iter->Seek(std::to_string(kNumBlocks - 1));
+ ASSERT_TRUE(iter->status().IsMemoryLimit());
+ CheckCacheCounters(options, 1, 0, 0, 1);
+ delete iter;
+ iter = nullptr;
+
+ // Release iterators and access cache again.
+ for (size_t i = 0; i + 1 < kNumBlocks; i++) {
+ iterators[i].reset();
+ CheckCacheCounters(options, 0, 0, 0, 0);
+ }
+ ASSERT_EQ(0, cache->GetPinnedUsage());
+ for (size_t i = 0; i + 1 < kNumBlocks; i++) {
+ iter = db_->NewIterator(read_options);
+ iter->Seek(std::to_string(i));
+ ASSERT_OK(iter->status());
+ CheckCacheCounters(options, 0, 1, 0, 0);
+ iterators[i].reset(iter);
+ }
+}
+
+#ifdef SNAPPY
+TEST_F(DBBlockCacheTest, TestWithCompressedBlockCache) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+
+ BlockBasedTableOptions table_options;
+ table_options.no_block_cache = true;
+ table_options.block_cache_compressed = nullptr;
+ table_options.block_size = 1;
+ table_options.filter_policy.reset(NewBloomFilterPolicy(20));
+ table_options.cache_index_and_filter_blocks = false;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ options.compression = CompressionType::kSnappyCompression;
+
+ DestroyAndReopen(options);
+
+ std::string value(kValueSize, 'a');
+ for (size_t i = 0; i < kNumBlocks; i++) {
+ ASSERT_OK(Put(std::to_string(i), value));
+ ASSERT_OK(Flush());
+ }
+
+ ReadOptions read_options;
+ std::shared_ptr<Cache> compressed_cache = NewLRUCache(1 << 25, 0, false);
+ LRUCacheOptions co;
+ co.capacity = 0;
+ co.num_shard_bits = 0;
+ co.strict_capacity_limit = false;
+ // Needed so that the entry stats collector is not counted toward usage
+ co.metadata_charge_policy = kDontChargeCacheMetadata;
+ std::shared_ptr<Cache> cache = NewLRUCache(co);
+ table_options.block_cache = cache;
+ table_options.no_block_cache = false;
+ table_options.block_cache_compressed = compressed_cache;
+ table_options.max_auto_readahead_size = 0;
+ table_options.cache_index_and_filter_blocks = false;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ Reopen(options);
+ RecordCacheCounters(options);
+
+ // Load blocks into cache.
+ for (size_t i = 0; i < kNumBlocks - 1; i++) {
+ ASSERT_EQ(value, Get(std::to_string(i)));
+ CheckCacheCounters(options, 1, 0, 1, 0);
+ CheckCompressedCacheCounters(options, 1, 0, 1, 0);
+ }
+
+ size_t usage = cache->GetUsage();
+ ASSERT_EQ(0, usage);
+ ASSERT_EQ(usage, cache->GetPinnedUsage());
+ size_t compressed_usage = compressed_cache->GetUsage();
+ ASSERT_LT(0, compressed_usage);
+ // Compressed block cache cannot be pinned.
+ ASSERT_EQ(0, compressed_cache->GetPinnedUsage());
+
+ // Set the strict capacity limit flag. Now blocks will only load into the
+ // compressed block cache.
+ cache->SetCapacity(usage);
+ cache->SetStrictCapacityLimit(true);
+ ASSERT_EQ(usage, cache->GetPinnedUsage());
+
+ // Load last key block.
+ ASSERT_EQ(
+ "Operation aborted: Memory limit reached: Insert failed due to LRU cache "
+ "being full.",
+ Get(std::to_string(kNumBlocks - 1)));
+ // Failure will also record the miss counter.
+ CheckCacheCounters(options, 1, 0, 0, 1);
+ CheckCompressedCacheCounters(options, 1, 0, 1, 0);
+
+ // Clear strict capacity limit flag. This time we shall hit compressed block
+ // cache and load into block cache.
+ cache->SetStrictCapacityLimit(false);
+ // Load last key block.
+ ASSERT_EQ(value, Get(std::to_string(kNumBlocks - 1)));
+ CheckCacheCounters(options, 1, 0, 1, 0);
+ CheckCompressedCacheCounters(options, 0, 1, 0, 0);
+}
+
+namespace {
+class PersistentCacheFromCache : public PersistentCache {
+ public:
+ PersistentCacheFromCache(std::shared_ptr<Cache> cache, bool read_only)
+ : cache_(cache), read_only_(read_only) {}
+
+ Status Insert(const Slice& key, const char* data,
+ const size_t size) override {
+ if (read_only_) {
+ return Status::NotSupported();
+ }
+ std::unique_ptr<char[]> copy{new char[size]};
+ std::copy_n(data, size, copy.get());
+ Status s = cache_->Insert(
+ key, copy.get(), size,
+ GetCacheEntryDeleterForRole<char[], CacheEntryRole::kMisc>());
+ if (s.ok()) {
+ copy.release();
+ }
+ return s;
+ }
+
+ Status Lookup(const Slice& key, std::unique_ptr<char[]>* data,
+ size_t* size) override {
+ auto handle = cache_->Lookup(key);
+ if (handle) {
+ char* ptr = static_cast<char*>(cache_->Value(handle));
+ *size = cache_->GetCharge(handle);
+ data->reset(new char[*size]);
+ std::copy_n(ptr, *size, data->get());
+ cache_->Release(handle);
+ return Status::OK();
+ } else {
+ return Status::NotFound();
+ }
+ }
+
+ bool IsCompressed() override { return false; }
+
+ StatsType Stats() override { return StatsType(); }
+
+ std::string GetPrintableOptions() const override { return ""; }
+
+ uint64_t NewId() override { return cache_->NewId(); }
+
+ private:
+ std::shared_ptr<Cache> cache_;
+ bool read_only_;
+};
+
+class ReadOnlyCacheWrapper : public CacheWrapper {
+ using CacheWrapper::CacheWrapper;
+
+ using Cache::Insert;
+ Status Insert(const Slice& /*key*/, void* /*value*/, size_t /*charge*/,
+ void (*)(const Slice& key, void* value) /*deleter*/,
+ Handle** /*handle*/, Priority /*priority*/) override {
+ return Status::NotSupported();
+ }
+};
+
+} // anonymous namespace
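+
+// Illustrative sketch (not a test): a PersistentCache implementation like the
+// one above is attached through BlockBasedTableOptions, as the cases below do.
+//
+//   auto pcache = std::make_shared<PersistentCacheFromCache>(
+//       NewLRUCache(1 << 20), /*read_only=*/false);
+//   BlockBasedTableOptions table_options;
+//   table_options.persistent_cache = pcache;
+//   options.table_factory.reset(NewBlockBasedTableFactory(table_options));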
+
+TEST_F(DBBlockCacheTest, TestWithSameCompressed) {
+ auto table_options = GetTableOptions();
+ auto options = GetOptions(table_options);
+ InitTable(options);
+
+ std::shared_ptr<Cache> rw_cache{NewLRUCache(1000000)};
+ std::shared_ptr<PersistentCacheFromCache> rw_pcache{
+ new PersistentCacheFromCache(rw_cache, /*read_only*/ false)};
+ // Exercise some obscure behavior with read-only wrappers
+ std::shared_ptr<Cache> ro_cache{new ReadOnlyCacheWrapper(rw_cache)};
+ std::shared_ptr<PersistentCacheFromCache> ro_pcache{
+ new PersistentCacheFromCache(rw_cache, /*read_only*/ true)};
+
+ // Simple same pointer
+ table_options.block_cache = rw_cache;
+ table_options.block_cache_compressed = rw_cache;
+ table_options.persistent_cache.reset();
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ ASSERT_EQ(TryReopen(options).ToString(),
+ "Invalid argument: block_cache same as block_cache_compressed not "
+ "currently supported, and would be bad for performance anyway");
+
+ // Other cases
+ table_options.block_cache = ro_cache;
+ table_options.block_cache_compressed = rw_cache;
+ table_options.persistent_cache.reset();
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ ASSERT_EQ(TryReopen(options).ToString(),
+ "Invalid argument: block_cache and block_cache_compressed share "
+ "the same key space, which is not supported");
+
+ table_options.block_cache = rw_cache;
+ table_options.block_cache_compressed = ro_cache;
+ table_options.persistent_cache.reset();
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ ASSERT_EQ(TryReopen(options).ToString(),
+ "Invalid argument: block_cache_compressed and block_cache share "
+ "the same key space, which is not supported");
+
+ table_options.block_cache = ro_cache;
+ table_options.block_cache_compressed.reset();
+ table_options.persistent_cache = rw_pcache;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ ASSERT_EQ(TryReopen(options).ToString(),
+ "Invalid argument: block_cache and persistent_cache share the same "
+ "key space, which is not supported");
+
+ table_options.block_cache = rw_cache;
+ table_options.block_cache_compressed.reset();
+ table_options.persistent_cache = ro_pcache;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ ASSERT_EQ(TryReopen(options).ToString(),
+ "Invalid argument: persistent_cache and block_cache share the same "
+ "key space, which is not supported");
+
+ table_options.block_cache.reset();
+ table_options.no_block_cache = true;
+ table_options.block_cache_compressed = ro_cache;
+ table_options.persistent_cache = rw_pcache;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ ASSERT_EQ(TryReopen(options).ToString(),
+ "Invalid argument: block_cache_compressed and persistent_cache "
+ "share the same key space, which is not supported");
+
+ table_options.block_cache.reset();
+ table_options.no_block_cache = true;
+ table_options.block_cache_compressed = rw_cache;
+ table_options.persistent_cache = ro_pcache;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ ASSERT_EQ(TryReopen(options).ToString(),
+ "Invalid argument: persistent_cache and block_cache_compressed "
+ "share the same key space, which is not supported");
+}
+#endif // SNAPPY
+
+#ifndef ROCKSDB_LITE
+
+// Make sure that when options.block_cache is set, after a new table is
+// created its index/filter blocks are added to block cache.
+TEST_F(DBBlockCacheTest, IndexAndFilterBlocksOfNewTableAddedToCache) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ BlockBasedTableOptions table_options;
+ table_options.cache_index_and_filter_blocks = true;
+ table_options.filter_policy.reset(NewBloomFilterPolicy(20));
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ ASSERT_OK(Put(1, "key", "val"));
+ // Create a new table.
+ ASSERT_OK(Flush(1));
+
+ // index/filter blocks added to block cache right after table creation.
+ ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+ ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+ ASSERT_EQ(2, /* only index/filter were added */
+ TestGetTickerCount(options, BLOCK_CACHE_ADD));
+ ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_DATA_MISS));
+ uint64_t int_num;
+ ASSERT_TRUE(
+ dbfull()->GetIntProperty("rocksdb.estimate-table-readers-mem", &int_num));
+ ASSERT_EQ(int_num, 0U);
+
+ // Make sure filter block is in cache.
+ std::string value;
+ ReadOptions ropt;
+ db_->KeyMayExist(ReadOptions(), handles_[1], "key", &value);
+
+ // Miss count should remain the same.
+ ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+ ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+
+ db_->KeyMayExist(ReadOptions(), handles_[1], "key", &value);
+ ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+ ASSERT_EQ(2, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+
+ // Make sure index block is in cache.
+ auto index_block_hit = TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT);
+ value = Get(1, "key");
+ ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+ ASSERT_EQ(index_block_hit + 1,
+ TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+
+ value = Get(1, "key");
+ ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+ ASSERT_EQ(index_block_hit + 2,
+ TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+}
+
+// With fill_cache = false, fill up the cache, then iterate over the entire
+// db to verify that the dummy entries inserted in
+// `BlockBasedTable::NewDataBlockIterator` do not cause heap-use-after-free
+// errors in COMPILE_WITH_ASAN=1 runs.
+TEST_F(DBBlockCacheTest, FillCacheAndIterateDB) {
+ ReadOptions read_options;
+ read_options.fill_cache = false;
+ auto table_options = GetTableOptions();
+ auto options = GetOptions(table_options);
+ InitTable(options);
+
+ std::shared_ptr<Cache> cache = NewLRUCache(10, 0, true);
+ table_options.block_cache = cache;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ Reopen(options);
+ ASSERT_OK(Put("key1", "val1"));
+ ASSERT_OK(Put("key2", "val2"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("key3", "val3"));
+ ASSERT_OK(Put("key4", "val4"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("key5", "val5"));
+ ASSERT_OK(Put("key6", "val6"));
+ ASSERT_OK(Flush());
+
+ Iterator* iter = nullptr;
+
+ iter = db_->NewIterator(read_options);
+ iter->Seek(std::to_string(0));
+ while (iter->Valid()) {
+ iter->Next();
+ }
+ delete iter;
+ iter = nullptr;
+}
+
+TEST_F(DBBlockCacheTest, IndexAndFilterBlocksStats) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ BlockBasedTableOptions table_options;
+ table_options.cache_index_and_filter_blocks = true;
+ LRUCacheOptions co;
+ // 500 bytes are enough to hold the first two blocks
+ co.capacity = 500;
+ co.num_shard_bits = 0;
+ co.strict_capacity_limit = false;
+ co.metadata_charge_policy = kDontChargeCacheMetadata;
+ std::shared_ptr<Cache> cache = NewLRUCache(co);
+ table_options.block_cache = cache;
+ table_options.filter_policy.reset(NewBloomFilterPolicy(20, true));
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ ASSERT_OK(Put(1, "longer_key", "val"));
+ // Create a new table
+ ASSERT_OK(Flush(1));
+ size_t index_bytes_insert =
+ TestGetTickerCount(options, BLOCK_CACHE_INDEX_BYTES_INSERT);
+ size_t filter_bytes_insert =
+ TestGetTickerCount(options, BLOCK_CACHE_FILTER_BYTES_INSERT);
+ ASSERT_GT(index_bytes_insert, 0);
+ ASSERT_GT(filter_bytes_insert, 0);
+ ASSERT_EQ(cache->GetUsage(), index_bytes_insert + filter_bytes_insert);
+ // set the cache capacity to the current usage
+ cache->SetCapacity(index_bytes_insert + filter_bytes_insert);
+ // The index and filter eviction statistics were broken by the refactoring
+ // that moved the readers out of the block cache. Disabling these until we can
+ // bring the stats back.
+ // ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_INDEX_BYTES_EVICT), 0);
+ // ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_FILTER_BYTES_EVICT), 0);
+ // Note that the second key needs to be no longer than the first one.
+ // Otherwise the second index block may not fit in cache.
+ ASSERT_OK(Put(1, "key", "val"));
+ // Create a new table
+ ASSERT_OK(Flush(1));
+ // cache evicted old index and block entries
+ ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_INDEX_BYTES_INSERT),
+ index_bytes_insert);
+ ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_FILTER_BYTES_INSERT),
+ filter_bytes_insert);
+ // The index and filter eviction statistics were broken by the refactoring
+ // that moved the readers out of the block cache. Disabling these until we can
+ // bring the stats back.
+ // ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_INDEX_BYTES_EVICT),
+ // index_bytes_insert);
+ // ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_FILTER_BYTES_EVICT),
+ // filter_bytes_insert);
+}
+
+#if (defined OS_LINUX || defined OS_WIN)
+TEST_F(DBBlockCacheTest, WarmCacheWithDataBlocksDuringFlush) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+
+ BlockBasedTableOptions table_options;
+ table_options.block_cache = NewLRUCache(1 << 25, 0, false);
+ table_options.cache_index_and_filter_blocks = false;
+ table_options.prepopulate_block_cache =
+ BlockBasedTableOptions::PrepopulateBlockCache::kFlushOnly;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ DestroyAndReopen(options);
+
+ std::string value(kValueSize, 'a');
+ for (size_t i = 1; i <= kNumBlocks; i++) {
+ ASSERT_OK(Put(std::to_string(i), value));
+ ASSERT_OK(Flush());
+ ASSERT_EQ(i, options.statistics->getTickerCount(BLOCK_CACHE_DATA_ADD));
+ ASSERT_EQ(value, Get(std::to_string(i)));
+ ASSERT_EQ(0, options.statistics->getTickerCount(BLOCK_CACHE_DATA_MISS));
+ ASSERT_EQ(i, options.statistics->getTickerCount(BLOCK_CACHE_DATA_HIT));
+ }
+ // Verify compaction not counted
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr,
+ /*end=*/nullptr));
+ EXPECT_EQ(kNumBlocks,
+ options.statistics->getTickerCount(BLOCK_CACHE_DATA_ADD));
+}
+
+// This test caches data, index and filter blocks during flush.
+class DBBlockCacheTest1 : public DBTestBase,
+ public ::testing::WithParamInterface<uint32_t> {
+ public:
+ const size_t kNumBlocks = 10;
+ const size_t kValueSize = 100;
+ DBBlockCacheTest1() : DBTestBase("db_block_cache_test1", true) {}
+};
+
+INSTANTIATE_TEST_CASE_P(DBBlockCacheTest1, DBBlockCacheTest1,
+ ::testing::Values(1, 2));
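+
+// Illustrative sketch (not a test): the cache warming exercised by the
+// surrounding tests is controlled by one table option, and (as a later test
+// shows) can be toggled at runtime through SetOptions(). The option strings
+// are copied from those tests.
+//
+//   table_options.prepopulate_block_cache =
+//       BlockBasedTableOptions::PrepopulateBlockCache::kFlushOnly;
+//   ...
+//   db->SetOptions(
+//       {{"block_based_table_factory", "{prepopulate_block_cache=kDisable;}"}});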
+
+TEST_P(DBBlockCacheTest1, WarmCacheWithBlocksDuringFlush) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.disable_auto_compactions = true;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+
+ BlockBasedTableOptions table_options;
+ table_options.block_cache = NewLRUCache(1 << 25, 0, false);
+
+ uint32_t filter_type = GetParam();
+ switch (filter_type) {
+ case 1: // partition_filter
+ table_options.partition_filters = true;
+ table_options.index_type =
+ BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+ table_options.filter_policy.reset(NewBloomFilterPolicy(10));
+ break;
+ case 2: // full filter
+ table_options.filter_policy.reset(NewBloomFilterPolicy(10));
+ break;
+ default:
+ assert(false);
+ }
+
+ table_options.cache_index_and_filter_blocks = true;
+ table_options.prepopulate_block_cache =
+ BlockBasedTableOptions::PrepopulateBlockCache::kFlushOnly;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ DestroyAndReopen(options);
+
+ std::string value(kValueSize, 'a');
+ for (size_t i = 1; i <= kNumBlocks; i++) {
+ ASSERT_OK(Put(std::to_string(i), value));
+ ASSERT_OK(Flush());
+ ASSERT_EQ(i, options.statistics->getTickerCount(BLOCK_CACHE_DATA_ADD));
+ if (filter_type == 1) {
+ ASSERT_EQ(2 * i,
+ options.statistics->getTickerCount(BLOCK_CACHE_INDEX_ADD));
+ ASSERT_EQ(2 * i,
+ options.statistics->getTickerCount(BLOCK_CACHE_FILTER_ADD));
+ } else {
+ ASSERT_EQ(i, options.statistics->getTickerCount(BLOCK_CACHE_INDEX_ADD));
+ ASSERT_EQ(i, options.statistics->getTickerCount(BLOCK_CACHE_FILTER_ADD));
+ }
+ ASSERT_EQ(value, Get(std::to_string(i)));
+
+ ASSERT_EQ(0, options.statistics->getTickerCount(BLOCK_CACHE_DATA_MISS));
+ ASSERT_EQ(i, options.statistics->getTickerCount(BLOCK_CACHE_DATA_HIT));
+
+ ASSERT_EQ(0, options.statistics->getTickerCount(BLOCK_CACHE_INDEX_MISS));
+ ASSERT_EQ(i * 3, options.statistics->getTickerCount(BLOCK_CACHE_INDEX_HIT));
+ if (filter_type == 1) {
+ ASSERT_EQ(i * 3,
+ options.statistics->getTickerCount(BLOCK_CACHE_FILTER_HIT));
+ } else {
+ ASSERT_EQ(i * 2,
+ options.statistics->getTickerCount(BLOCK_CACHE_FILTER_HIT));
+ }
+ ASSERT_EQ(0, options.statistics->getTickerCount(BLOCK_CACHE_FILTER_MISS));
+ }
+
+ // Verify compaction not counted
+ CompactRangeOptions cro;
+ // Ensure files are rewritten, not just trivially moved.
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForceOptimized;
+ ASSERT_OK(db_->CompactRange(cro, /*begin=*/nullptr, /*end=*/nullptr));
+ EXPECT_EQ(kNumBlocks,
+ options.statistics->getTickerCount(BLOCK_CACHE_DATA_ADD));
+ // Index and filter blocks are automatically warmed when the new table file
+ // is opened at the end of compaction. This is not easily disabled, so it
+ // results in the new index and filter blocks being warmed as well.
+ if (filter_type == 1) {
+ EXPECT_EQ(2 * (1 + kNumBlocks),
+ options.statistics->getTickerCount(BLOCK_CACHE_INDEX_ADD));
+ EXPECT_EQ(2 * (1 + kNumBlocks),
+ options.statistics->getTickerCount(BLOCK_CACHE_FILTER_ADD));
+ } else {
+ EXPECT_EQ(1 + kNumBlocks,
+ options.statistics->getTickerCount(BLOCK_CACHE_INDEX_ADD));
+ EXPECT_EQ(1 + kNumBlocks,
+ options.statistics->getTickerCount(BLOCK_CACHE_FILTER_ADD));
+ }
+}
+
+TEST_F(DBBlockCacheTest, DynamicallyWarmCacheDuringFlush) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+
+ BlockBasedTableOptions table_options;
+ table_options.block_cache = NewLRUCache(1 << 25, 0, false);
+ table_options.cache_index_and_filter_blocks = false;
+ table_options.prepopulate_block_cache =
+ BlockBasedTableOptions::PrepopulateBlockCache::kFlushOnly;
+
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ DestroyAndReopen(options);
+
+ std::string value(kValueSize, 'a');
+
+ for (size_t i = 1; i <= 5; i++) {
+ ASSERT_OK(Put(std::to_string(i), value));
+ ASSERT_OK(Flush());
+ ASSERT_EQ(1,
+ options.statistics->getAndResetTickerCount(BLOCK_CACHE_DATA_ADD));
+
+ ASSERT_EQ(value, Get(std::to_string(i)));
+ ASSERT_EQ(0,
+ options.statistics->getAndResetTickerCount(BLOCK_CACHE_DATA_ADD));
+ ASSERT_EQ(
+ 0, options.statistics->getAndResetTickerCount(BLOCK_CACHE_DATA_MISS));
+ ASSERT_EQ(1,
+ options.statistics->getAndResetTickerCount(BLOCK_CACHE_DATA_HIT));
+ }
+
+ ASSERT_OK(dbfull()->SetOptions(
+ {{"block_based_table_factory", "{prepopulate_block_cache=kDisable;}"}}));
+
+ for (size_t i = 6; i <= kNumBlocks; i++) {
+ ASSERT_OK(Put(std::to_string(i), value));
+ ASSERT_OK(Flush());
+ ASSERT_EQ(0,
+ options.statistics->getAndResetTickerCount(BLOCK_CACHE_DATA_ADD));
+
+ ASSERT_EQ(value, Get(std::to_string(i)));
+ ASSERT_EQ(1,
+ options.statistics->getAndResetTickerCount(BLOCK_CACHE_DATA_ADD));
+ ASSERT_EQ(
+ 1, options.statistics->getAndResetTickerCount(BLOCK_CACHE_DATA_MISS));
+ ASSERT_EQ(0,
+ options.statistics->getAndResetTickerCount(BLOCK_CACHE_DATA_HIT));
+ }
+}
+#endif
+
+namespace {
+
+// A mock cache that wraps LRUCache and records how many entries have been
+// inserted for each priority.
+class MockCache : public LRUCache {
+ public:
+ static uint32_t high_pri_insert_count;
+ static uint32_t low_pri_insert_count;
+
+ MockCache()
+ : LRUCache((size_t)1 << 25 /*capacity*/, 0 /*num_shard_bits*/,
+ false /*strict_capacity_limit*/, 0.0 /*high_pri_pool_ratio*/,
+ 0.0 /*low_pri_pool_ratio*/) {}
+
+ using ShardedCache::Insert;
+
+ Status Insert(const Slice& key, void* value,
+ const Cache::CacheItemHelper* helper_cb, size_t charge,
+ Handle** handle, Priority priority) override {
+ DeleterFn delete_cb = helper_cb->del_cb;
+ if (priority == Priority::LOW) {
+ low_pri_insert_count++;
+ } else {
+ high_pri_insert_count++;
+ }
+ return LRUCache::Insert(key, value, charge, delete_cb, handle, priority);
+ }
+};
+
+uint32_t MockCache::high_pri_insert_count = 0;
+uint32_t MockCache::low_pri_insert_count = 0;
+
+} // anonymous namespace
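+
+// Illustrative sketch (not a test): to get real benefit from the high-priority
+// insertions counted by MockCache, an application would typically reserve a
+// high-priority pool in the LRU cache and opt index/filter blocks into it.
+// The ratio and capacity here are arbitrary.
+//
+//   LRUCacheOptions co;
+//   co.capacity = 1 << 25;
+//   co.high_pri_pool_ratio = 0.5;
+//   BlockBasedTableOptions table_options;
+//   table_options.block_cache = NewLRUCache(co);
+//   table_options.cache_index_and_filter_blocks = true;
+//   table_options.cache_index_and_filter_blocks_with_high_priority = true;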
+
+TEST_F(DBBlockCacheTest, IndexAndFilterBlocksCachePriority) {
+ for (auto priority : {Cache::Priority::LOW, Cache::Priority::HIGH}) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ BlockBasedTableOptions table_options;
+ table_options.cache_index_and_filter_blocks = true;
+ table_options.block_cache.reset(new MockCache());
+ table_options.filter_policy.reset(NewBloomFilterPolicy(20));
+ table_options.cache_index_and_filter_blocks_with_high_priority =
+ priority == Cache::Priority::HIGH ? true : false;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ DestroyAndReopen(options);
+
+ MockCache::high_pri_insert_count = 0;
+ MockCache::low_pri_insert_count = 0;
+
+ // Create a new table.
+ ASSERT_OK(Put("foo", "value"));
+ ASSERT_OK(Put("bar", "value"));
+ ASSERT_OK(Flush());
+ ASSERT_EQ(1, NumTableFilesAtLevel(0));
+
+ // index/filter blocks added to block cache right after table creation.
+ ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+ ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+ ASSERT_EQ(2, /* only index/filter were added */
+ TestGetTickerCount(options, BLOCK_CACHE_ADD));
+ ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_DATA_MISS));
+ if (priority == Cache::Priority::LOW) {
+ ASSERT_EQ(0u, MockCache::high_pri_insert_count);
+ ASSERT_EQ(2u, MockCache::low_pri_insert_count);
+ } else {
+ ASSERT_EQ(2u, MockCache::high_pri_insert_count);
+ ASSERT_EQ(0u, MockCache::low_pri_insert_count);
+ }
+
+ // Access data block.
+ ASSERT_EQ("value", Get("foo"));
+
+ ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+ ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+ ASSERT_EQ(3, /*adding data block*/
+ TestGetTickerCount(options, BLOCK_CACHE_ADD));
+ ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_DATA_MISS));
+
+ // Data block should be inserted with low priority.
+ if (priority == Cache::Priority::LOW) {
+ ASSERT_EQ(0u, MockCache::high_pri_insert_count);
+ ASSERT_EQ(3u, MockCache::low_pri_insert_count);
+ } else {
+ ASSERT_EQ(2u, MockCache::high_pri_insert_count);
+ ASSERT_EQ(1u, MockCache::low_pri_insert_count);
+ }
+ }
+}
+
+namespace {
+
+// An LRUCache wrapper that can falsely report "not found" on Lookup.
+// This allows us to manipulate BlockBasedTableReader into thinking
+// another thread inserted the data in between Lookup and Insert,
+// while mostly preserving the LRUCache interface/behavior.
+class LookupLiarCache : public CacheWrapper {
+ int nth_lookup_not_found_ = 0;
+
+ public:
+ explicit LookupLiarCache(std::shared_ptr<Cache> target)
+ : CacheWrapper(std::move(target)) {}
+
+ using Cache::Lookup;
+ Handle* Lookup(const Slice& key, Statistics* stats) override {
+ if (nth_lookup_not_found_ == 1) {
+ nth_lookup_not_found_ = 0;
+ return nullptr;
+ }
+ if (nth_lookup_not_found_ > 1) {
+ --nth_lookup_not_found_;
+ }
+ return CacheWrapper::Lookup(key, stats);
+ }
+
+ // 1 == next lookup, 2 == after next, etc.
+ void SetNthLookupNotFound(int n) { nth_lookup_not_found_ = n; }
+};
+
+} // anonymous namespace
+
+TEST_F(DBBlockCacheTest, AddRedundantStats) {
+ const size_t capacity = size_t{1} << 25;
+ const int num_shard_bits = 0; // 1 shard
+ int iterations_tested = 0;
+ for (std::shared_ptr<Cache> base_cache :
+ {NewLRUCache(capacity, num_shard_bits),
+ HyperClockCacheOptions(
+ capacity,
+ BlockBasedTableOptions().block_size /*estimated_value_size*/,
+ num_shard_bits)
+ .MakeSharedCache()}) {
+ if (!base_cache) {
+ // Skip clock cache when not supported
+ continue;
+ }
+ ++iterations_tested;
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+
+ std::shared_ptr<LookupLiarCache> cache =
+ std::make_shared<LookupLiarCache>(base_cache);
+
+ BlockBasedTableOptions table_options;
+ table_options.cache_index_and_filter_blocks = true;
+ table_options.block_cache = cache;
+ table_options.filter_policy.reset(NewBloomFilterPolicy(50));
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ DestroyAndReopen(options);
+
+ // Create a new table.
+ ASSERT_OK(Put("foo", "value"));
+ ASSERT_OK(Put("bar", "value"));
+ ASSERT_OK(Flush());
+ ASSERT_EQ(1, NumTableFilesAtLevel(0));
+
+ // Normal access filter+index+data.
+ ASSERT_EQ("value", Get("foo"));
+
+ ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_ADD));
+ ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_ADD));
+ ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_DATA_ADD));
+ // --------
+ ASSERT_EQ(3, TestGetTickerCount(options, BLOCK_CACHE_ADD));
+
+ ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_INDEX_ADD_REDUNDANT));
+ ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_FILTER_ADD_REDUNDANT));
+ ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_DATA_ADD_REDUNDANT));
+ // --------
+ ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_ADD_REDUNDANT));
+
+ // Again access filter+index+data, but force a redundant load+insert on the
+ // index.
+ cache->SetNthLookupNotFound(2);
+ ASSERT_EQ("value", Get("bar"));
+
+ ASSERT_EQ(2, TestGetTickerCount(options, BLOCK_CACHE_INDEX_ADD));
+ ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_ADD));
+ ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_DATA_ADD));
+ // --------
+ ASSERT_EQ(4, TestGetTickerCount(options, BLOCK_CACHE_ADD));
+
+ ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_ADD_REDUNDANT));
+ ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_FILTER_ADD_REDUNDANT));
+ ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_DATA_ADD_REDUNDANT));
+ // --------
+ ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_ADD_REDUNDANT));
+
+ // Access just filter (with high probability), and force redundant
+ // load+insert
+ cache->SetNthLookupNotFound(1);
+ ASSERT_EQ("NOT_FOUND", Get("this key was not added"));
+
+ EXPECT_EQ(2, TestGetTickerCount(options, BLOCK_CACHE_INDEX_ADD));
+ EXPECT_EQ(2, TestGetTickerCount(options, BLOCK_CACHE_FILTER_ADD));
+ EXPECT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_DATA_ADD));
+ // --------
+ EXPECT_EQ(5, TestGetTickerCount(options, BLOCK_CACHE_ADD));
+
+ EXPECT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_ADD_REDUNDANT));
+ EXPECT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_ADD_REDUNDANT));
+ EXPECT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_DATA_ADD_REDUNDANT));
+ // --------
+ EXPECT_EQ(2, TestGetTickerCount(options, BLOCK_CACHE_ADD_REDUNDANT));
+
+ // Access just data, forcing redundant load+insert
+ ReadOptions read_options;
+ std::unique_ptr<Iterator> iter{db_->NewIterator(read_options)};
+ cache->SetNthLookupNotFound(1);
+ iter->SeekToFirst();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key(), "bar");
+
+ EXPECT_EQ(2, TestGetTickerCount(options, BLOCK_CACHE_INDEX_ADD));
+ EXPECT_EQ(2, TestGetTickerCount(options, BLOCK_CACHE_FILTER_ADD));
+ EXPECT_EQ(2, TestGetTickerCount(options, BLOCK_CACHE_DATA_ADD));
+ // --------
+ EXPECT_EQ(6, TestGetTickerCount(options, BLOCK_CACHE_ADD));
+
+ EXPECT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_ADD_REDUNDANT));
+ EXPECT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_ADD_REDUNDANT));
+ EXPECT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_DATA_ADD_REDUNDANT));
+ // --------
+ EXPECT_EQ(3, TestGetTickerCount(options, BLOCK_CACHE_ADD_REDUNDANT));
+ }
+ EXPECT_GE(iterations_tested, 1);
+}
+
+TEST_F(DBBlockCacheTest, ParanoidFileChecks) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ options.level0_file_num_compaction_trigger = 2;
+ options.paranoid_file_checks = true;
+ BlockBasedTableOptions table_options;
+ table_options.cache_index_and_filter_blocks = false;
+ table_options.filter_policy.reset(NewBloomFilterPolicy(20));
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ ASSERT_OK(Put(1, "1_key", "val"));
+ ASSERT_OK(Put(1, "9_key", "val"));
+ // Create a new table.
+ ASSERT_OK(Flush(1));
+ ASSERT_EQ(1, /* read and cache data block */
+ TestGetTickerCount(options, BLOCK_CACHE_ADD));
+
+ ASSERT_OK(Put(1, "1_key2", "val2"));
+ ASSERT_OK(Put(1, "9_key2", "val2"));
+ // Create a new SST file. This will further trigger a compaction
+ // and generate another file.
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(3, /* Totally 3 files created up to now */
+ TestGetTickerCount(options, BLOCK_CACHE_ADD));
+
+ // After disabling options.paranoid_file_checks, no further block
+ // is added after generating a new file.
+ ASSERT_OK(
+ dbfull()->SetOptions(handles_[1], {{"paranoid_file_checks", "false"}}));
+
+ ASSERT_OK(Put(1, "1_key3", "val3"));
+ ASSERT_OK(Put(1, "9_key3", "val3"));
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(Put(1, "1_key4", "val4"));
+ ASSERT_OK(Put(1, "9_key4", "val4"));
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(3, /* Totally 3 files created up to now */
+ TestGetTickerCount(options, BLOCK_CACHE_ADD));
+}
+
+TEST_F(DBBlockCacheTest, CompressedCache) {
+ if (!Snappy_Supported()) {
+ return;
+ }
+ int num_iter = 80;
+
+ // Run this test in four iterations.
+ // Iteration 1: only an uncompressed block cache
+ // Iteration 2: only a compressed block cache
+ // Iteration 3: both block cache and compressed cache
+ // Iteration 4: both block cache and compressed cache, but DB is not
+ // compressed
+ for (int iter = 0; iter < 4; iter++) {
+ Options options = CurrentOptions();
+ options.write_buffer_size = 64 * 1024; // small write buffer
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+
+ BlockBasedTableOptions table_options;
+ switch (iter) {
+ case 0:
+ // only uncompressed block cache
+ table_options.block_cache = NewLRUCache(8 * 1024);
+ table_options.block_cache_compressed = nullptr;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ break;
+ case 1:
+ // no block cache, only compressed cache
+ table_options.no_block_cache = true;
+ table_options.block_cache = nullptr;
+ table_options.block_cache_compressed = NewLRUCache(8 * 1024);
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ break;
+ case 2:
+ // both compressed and uncompressed block cache
+ table_options.block_cache = NewLRUCache(1024);
+ table_options.block_cache_compressed = NewLRUCache(8 * 1024);
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ break;
+ case 3:
+ // both block cache and compressed cache, but DB is not compressed
+ // also, make block cache sizes bigger, to trigger block cache hits
+ table_options.block_cache = NewLRUCache(1024 * 1024);
+ table_options.block_cache_compressed = NewLRUCache(8 * 1024 * 1024);
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ options.compression = kNoCompression;
+ break;
+ default:
+ FAIL();
+ }
+ CreateAndReopenWithCF({"pikachu"}, options);
+ // default column family doesn't have block cache
+ Options no_block_cache_opts;
+ no_block_cache_opts.statistics = options.statistics;
+ no_block_cache_opts = CurrentOptions(no_block_cache_opts);
+ BlockBasedTableOptions table_options_no_bc;
+ table_options_no_bc.no_block_cache = true;
+ no_block_cache_opts.table_factory.reset(
+ NewBlockBasedTableFactory(table_options_no_bc));
+ ReopenWithColumnFamilies(
+ {"default", "pikachu"},
+ std::vector<Options>({no_block_cache_opts, options}));
+
+ Random rnd(301);
+
+ // Write 80 values of roughly 1KB each (each value is reused for 4
+ // consecutive keys to get a high compression ratio)
+ ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+ std::vector<std::string> values;
+ std::string str;
+ for (int i = 0; i < num_iter; i++) {
+ if (i % 4 == 0) { // high compression ratio
+ str = rnd.RandomString(1000);
+ }
+ values.push_back(str);
+ ASSERT_OK(Put(1, Key(i), values[i]));
+ }
+
+ // flush all data from memtable so that reads are from block cache
+ ASSERT_OK(Flush(1));
+
+ for (int i = 0; i < num_iter; i++) {
+ ASSERT_EQ(Get(1, Key(i)), values[i]);
+ }
+
+ // check that we triggered the appropriate code paths in the cache
+ switch (iter) {
+ case 0:
+ // only uncompressed block cache
+ ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_MISS), 0);
+ ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_MISS), 0);
+ break;
+ case 1:
+ // no block cache, only compressed cache
+ ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_MISS), 0);
+ ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_MISS), 0);
+ break;
+ case 2:
+ // both compressed and uncompressed block cache
+ ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_MISS), 0);
+ ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_MISS), 0);
+ break;
+ case 3:
+ // both compressed and uncompressed block cache
+ ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_MISS), 0);
+ ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_HIT), 0);
+ ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_MISS), 0);
+ // the compressed cache doesn't have any hits since blocks are not
+ // compressed on storage
+ ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_HIT), 0);
+ break;
+ default:
+ FAIL();
+ }
+
+ options.create_if_missing = true;
+ DestroyAndReopen(options);
+ }
+}
+
+TEST_F(DBBlockCacheTest, CacheCompressionDict) {
+ const int kNumFiles = 4;
+ const int kNumEntriesPerFile = 128;
+ const int kNumBytesPerEntry = 1024;
+
+ // Try all the available libraries that support dictionary compression
+ std::vector<CompressionType> compression_types;
+ if (Zlib_Supported()) {
+ compression_types.push_back(kZlibCompression);
+ }
+ if (LZ4_Supported()) {
+ compression_types.push_back(kLZ4Compression);
+ compression_types.push_back(kLZ4HCCompression);
+ }
+ if (ZSTD_Supported()) {
+ compression_types.push_back(kZSTD);
+ } else if (ZSTDNotFinal_Supported()) {
+ compression_types.push_back(kZSTDNotFinalCompression);
+ }
+ Random rnd(301);
+ for (auto compression_type : compression_types) {
+ Options options = CurrentOptions();
+ options.bottommost_compression = compression_type;
+ options.bottommost_compression_opts.max_dict_bytes = 4096;
+ options.bottommost_compression_opts.enabled = true;
+ options.create_if_missing = true;
+ options.num_levels = 2;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ options.target_file_size_base = kNumEntriesPerFile * kNumBytesPerEntry;
+ BlockBasedTableOptions table_options;
+ table_options.cache_index_and_filter_blocks = true;
+ table_options.block_cache.reset(new MockCache());
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ DestroyAndReopen(options);
+
+ RecordCacheCountersForCompressionDict(options);
+
+ for (int i = 0; i < kNumFiles; ++i) {
+ ASSERT_EQ(i, NumTableFilesAtLevel(0, 0));
+ for (int j = 0; j < kNumEntriesPerFile; ++j) {
+ std::string value = rnd.RandomString(kNumBytesPerEntry);
+ ASSERT_OK(Put(Key(j * kNumFiles + i), value.c_str()));
+ }
+ ASSERT_OK(Flush());
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+ ASSERT_EQ(kNumFiles, NumTableFilesAtLevel(1));
+
+ // Compression dictionary blocks are preloaded.
+ CheckCacheCountersForCompressionDict(
+ options, kNumFiles /* expected_compression_dict_misses */,
+ 0 /* expected_compression_dict_hits */,
+ kNumFiles /* expected_compression_dict_inserts */);
+
+ // Seek to a key in a file. It should cause the SST's dictionary meta-block
+ // to be read.
+ RecordCacheCounters(options);
+ RecordCacheCountersForCompressionDict(options);
+ ReadOptions read_options;
+ ASSERT_NE("NOT_FOUND", Get(Key(kNumFiles * kNumEntriesPerFile - 1)));
+ // Two block hits: index and dictionary since they are prefetched
+ // One block missed/added: data block
+ CheckCacheCounters(options, 1 /* expected_misses */, 2 /* expected_hits */,
+ 1 /* expected_inserts */, 0 /* expected_failures */);
+ CheckCacheCountersForCompressionDict(
+ options, 0 /* expected_compression_dict_misses */,
+ 1 /* expected_compression_dict_hits */,
+ 0 /* expected_compression_dict_inserts */);
+ }
+}
+
+static void ClearCache(Cache* cache) {
+ auto roles = CopyCacheDeleterRoleMap();
+ std::deque<std::string> keys;
+ Cache::ApplyToAllEntriesOptions opts;
+ auto callback = [&](const Slice& key, void* /*value*/, size_t /*charge*/,
+ Cache::DeleterFn deleter) {
+ if (roles.find(deleter) == roles.end()) {
+ // Keep the stats collector
+ return;
+ }
+ keys.push_back(key.ToString());
+ };
+ cache->ApplyToAllEntries(callback, opts);
+ for (auto& k : keys) {
+ cache->Erase(k);
+ }
+}
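+
+// Illustrative sketch (not a test): outside of the test helpers, per-role
+// block cache entry counts are typically read through the map property used
+// below; the role shown here is just an example.
+//
+//   std::map<std::string, std::string> stats;
+//   db->GetMapProperty(DB::Properties::kBlockCacheEntryStats, &stats);
+//   const std::string& data_blocks =
+//       stats[BlockCacheEntryStatsMapKeys::EntryCount(CacheEntryRole::kDataBlock)];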
+
+TEST_F(DBBlockCacheTest, CacheEntryRoleStats) {
+ const size_t capacity = size_t{1} << 25;
+ int iterations_tested = 0;
+ for (bool partition : {false, true}) {
+ for (std::shared_ptr<Cache> cache :
+ {NewLRUCache(capacity),
+ HyperClockCacheOptions(
+ capacity,
+ BlockBasedTableOptions().block_size /*estimated_value_size*/)
+ .MakeSharedCache()}) {
+ ++iterations_tested;
+
+ Options options = CurrentOptions();
+ SetTimeElapseOnlySleepOnReopen(&options);
+ options.create_if_missing = true;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ options.max_open_files = 13;
+ options.table_cache_numshardbits = 0;
+ // If this wakes up, it could interfere with the test
+ options.stats_dump_period_sec = 0;
+
+ BlockBasedTableOptions table_options;
+ table_options.block_cache = cache;
+ table_options.cache_index_and_filter_blocks = true;
+ table_options.filter_policy.reset(NewBloomFilterPolicy(50));
+ if (partition) {
+ table_options.index_type = BlockBasedTableOptions::kTwoLevelIndexSearch;
+ table_options.partition_filters = true;
+ }
+ table_options.metadata_cache_options.top_level_index_pinning =
+ PinningTier::kNone;
+ table_options.metadata_cache_options.partition_pinning =
+ PinningTier::kNone;
+ table_options.metadata_cache_options.unpartitioned_pinning =
+ PinningTier::kNone;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ DestroyAndReopen(options);
+
+ // Create a new table.
+ ASSERT_OK(Put("foo", "value"));
+ ASSERT_OK(Put("bar", "value"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put("zfoo", "value"));
+ ASSERT_OK(Put("zbar", "value"));
+ ASSERT_OK(Flush());
+
+ ASSERT_EQ(2, NumTableFilesAtLevel(0));
+
+ // Fresh cache
+ ClearCache(cache.get());
+
+ std::array<size_t, kNumCacheEntryRoles> expected{};
+ // For CacheEntryStatsCollector
+ expected[static_cast<size_t>(CacheEntryRole::kMisc)] = 1;
+ EXPECT_EQ(expected, GetCacheEntryRoleCountsBg());
+
+ std::array<size_t, kNumCacheEntryRoles> prev_expected = expected;
+
+ // First access only filters
+ ASSERT_EQ("NOT_FOUND", Get("different from any key added"));
+ expected[static_cast<size_t>(CacheEntryRole::kFilterBlock)] += 2;
+ if (partition) {
+ expected[static_cast<size_t>(CacheEntryRole::kFilterMetaBlock)] += 2;
+ }
+ // Within some time window, we will get cached entry stats
+ EXPECT_EQ(prev_expected, GetCacheEntryRoleCountsBg());
+ // Not enough to force a miss
+ env_->MockSleepForSeconds(45);
+ EXPECT_EQ(prev_expected, GetCacheEntryRoleCountsBg());
+ // Enough to force a miss
+ env_->MockSleepForSeconds(601);
+ EXPECT_EQ(expected, GetCacheEntryRoleCountsBg());
+
+ // Now access index and data block
+ ASSERT_EQ("value", Get("foo"));
+ expected[static_cast<size_t>(CacheEntryRole::kIndexBlock)]++;
+ if (partition) {
+ // top-level
+ expected[static_cast<size_t>(CacheEntryRole::kIndexBlock)]++;
+ }
+ expected[static_cast<size_t>(CacheEntryRole::kDataBlock)]++;
+ // Enough to force a miss
+ env_->MockSleepForSeconds(601);
+ // But inject a simulated long scan so that we need a longer
+ // interval to force a miss next time.
+ SyncPoint::GetInstance()->SetCallBack(
+ "CacheEntryStatsCollector::GetStats:AfterApplyToAllEntries",
+ [this](void*) {
+ // To spend no more than 0.2% of time scanning, we would need an
+ // interval of at least 10000s
+ env_->MockSleepForSeconds(20);
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ EXPECT_EQ(expected, GetCacheEntryRoleCountsBg());
+ prev_expected = expected;
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+
+ // The same for the other file
+ ASSERT_EQ("value", Get("zfoo"));
+ expected[static_cast<size_t>(CacheEntryRole::kIndexBlock)]++;
+ if (partition) {
+ // top-level
+ expected[static_cast<size_t>(CacheEntryRole::kIndexBlock)]++;
+ }
+ expected[static_cast<size_t>(CacheEntryRole::kDataBlock)]++;
+ // Because of the simulated long scan, this is not enough to force
+ // a miss
+ env_->MockSleepForSeconds(601);
+ EXPECT_EQ(prev_expected, GetCacheEntryRoleCountsBg());
+ // But this is enough
+ env_->MockSleepForSeconds(10000);
+ EXPECT_EQ(expected, GetCacheEntryRoleCountsBg());
+ prev_expected = expected;
+
+ // Also check the GetProperty interface
+ std::map<std::string, std::string> values;
+ ASSERT_TRUE(
+ db_->GetMapProperty(DB::Properties::kBlockCacheEntryStats, &values));
+
+ for (size_t i = 0; i < kNumCacheEntryRoles; ++i) {
+ auto role = static_cast<CacheEntryRole>(i);
+ EXPECT_EQ(std::to_string(expected[i]),
+ values[BlockCacheEntryStatsMapKeys::EntryCount(role)]);
+ }
+
+ // Add one for kWriteBuffer
+ {
+ WriteBufferManager wbm(size_t{1} << 20, cache);
+ wbm.ReserveMem(1024);
+ expected[static_cast<size_t>(CacheEntryRole::kWriteBuffer)]++;
+ // Now we check that the GetProperty interface is more aggressive about
+ // re-scanning stats, but not totally aggressive.
+ // Within some time window, we will get cached entry stats
+ env_->MockSleepForSeconds(1);
+ EXPECT_EQ(std::to_string(prev_expected[static_cast<size_t>(
+ CacheEntryRole::kWriteBuffer)]),
+ values[BlockCacheEntryStatsMapKeys::EntryCount(
+ CacheEntryRole::kWriteBuffer)]);
+ // Not enough for a "background" miss but enough for a "foreground" miss
+ env_->MockSleepForSeconds(45);
+
+ ASSERT_TRUE(db_->GetMapProperty(DB::Properties::kBlockCacheEntryStats,
+ &values));
+ EXPECT_EQ(
+ std::to_string(
+ expected[static_cast<size_t>(CacheEntryRole::kWriteBuffer)]),
+ values[BlockCacheEntryStatsMapKeys::EntryCount(
+ CacheEntryRole::kWriteBuffer)]);
+ }
+ prev_expected = expected;
+
+ // With the collector pinned in the cache, we should be able to hit
+ // even if the cache is full
+ ClearCache(cache.get());
+ Cache::Handle* h = nullptr;
+ if (strcmp(cache->Name(), "LRUCache") == 0) {
+ ASSERT_OK(cache->Insert("Fill-it-up", nullptr, capacity + 1,
+ GetNoopDeleterForRole<CacheEntryRole::kMisc>(),
+ &h, Cache::Priority::HIGH));
+ } else {
+ // For ClockCache we use a 16-byte key.
+ ASSERT_OK(cache->Insert("Fill-it-up-xxxxx", nullptr, capacity + 1,
+ GetNoopDeleterForRole<CacheEntryRole::kMisc>(),
+ &h, Cache::Priority::HIGH));
+ }
+ ASSERT_GT(cache->GetUsage(), cache->GetCapacity());
+ expected = {};
+ // For CacheEntryStatsCollector
+ expected[static_cast<size_t>(CacheEntryRole::kMisc)] = 1;
+ // For Fill-it-up
+ expected[static_cast<size_t>(CacheEntryRole::kMisc)]++;
+ // Still able to hit on saved stats
+ EXPECT_EQ(prev_expected, GetCacheEntryRoleCountsBg());
+ // Enough to force a miss
+ env_->MockSleepForSeconds(1000);
+ EXPECT_EQ(expected, GetCacheEntryRoleCountsBg());
+
+ cache->Release(h);
+
+ // Now we test that the DB mutex is not held during scans, for the ways
+ // we know how to (possibly) trigger them. Without a better way to
+ // check this, we simply inject an acquire & release of the DB mutex
+ // deep in the stat collection code. If we were already holding the
+ // mutex, that would be undefined behavior, which TSAN would at least catch.
+ int scan_count = 0;
+ SyncPoint::GetInstance()->SetCallBack(
+ "CacheEntryStatsCollector::GetStats:AfterApplyToAllEntries",
+ [this, &scan_count](void*) {
+ dbfull()->TEST_LockMutex();
+ dbfull()->TEST_UnlockMutex();
+ ++scan_count;
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ // Different things that might trigger a scan, with mock sleeps to
+ // force a miss.
+ env_->MockSleepForSeconds(10000);
+ dbfull()->DumpStats();
+ ASSERT_EQ(scan_count, 1);
+
+ env_->MockSleepForSeconds(60);
+ ASSERT_TRUE(db_->GetMapProperty(DB::Properties::kFastBlockCacheEntryStats,
+ &values));
+ ASSERT_EQ(scan_count, 1);
+ ASSERT_TRUE(
+ db_->GetMapProperty(DB::Properties::kBlockCacheEntryStats, &values));
+ ASSERT_EQ(scan_count, 2);
+
+ env_->MockSleepForSeconds(10000);
+ ASSERT_TRUE(db_->GetMapProperty(DB::Properties::kFastBlockCacheEntryStats,
+ &values));
+ ASSERT_EQ(scan_count, 3);
+
+ env_->MockSleepForSeconds(60);
+ std::string value_str;
+ ASSERT_TRUE(db_->GetProperty(DB::Properties::kFastBlockCacheEntryStats,
+ &value_str));
+ ASSERT_EQ(scan_count, 3);
+ ASSERT_TRUE(
+ db_->GetProperty(DB::Properties::kBlockCacheEntryStats, &value_str));
+ ASSERT_EQ(scan_count, 4);
+
+ env_->MockSleepForSeconds(10000);
+ ASSERT_TRUE(db_->GetProperty(DB::Properties::kFastBlockCacheEntryStats,
+ &value_str));
+ ASSERT_EQ(scan_count, 5);
+
+ ASSERT_TRUE(db_->GetProperty(DB::Properties::kCFStats, &value_str));
+ // To match historical speed, querying this property no longer triggers
+ // a scan, even if results are old. But periodic dump stats should keep
+ // things reasonably updated.
+ ASSERT_EQ(scan_count, /*unchanged*/ 5);
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ }
+ EXPECT_GE(iterations_tested, 1);
+ }
+}
+
+namespace {
+
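+// Fills `cache` to capacity with dummy entries of roughly `entry_size`,
+// keeping a guard handle on each one in `handles` so they stay referenced
+// and cannot be evicted.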
+void DummyFillCache(Cache& cache, size_t entry_size,
+ std::vector<CacheHandleGuard<void>>& handles) {
+ // fprintf(stderr, "Entry size: %zu\n", entry_size);
+ handles.clear();
+ cache.EraseUnRefEntries();
+ void* fake_value = &cache;
+ size_t capacity = cache.GetCapacity();
+ OffsetableCacheKey ck{"abc", "abc", 42};
+ for (size_t my_usage = 0; my_usage < capacity;) {
+ size_t charge = std::min(entry_size, capacity - my_usage);
+ Cache::Handle* handle;
+ Status st = cache.Insert(ck.WithOffset(my_usage).AsSlice(), fake_value,
+ charge, /*deleter*/ nullptr, &handle);
+ ASSERT_OK(st);
+ handles.emplace_back(&cache, handle);
+ my_usage += charge;
+ }
+}
+
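+// Counts INFO/WARN/ERROR log lines whose format string mentions
+// "HyperClockCache", so the test below can check what DumpStats() reports.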
+class CountingLogger : public Logger {
+ public:
+ ~CountingLogger() override {}
+ using Logger::Logv;
+ void Logv(const InfoLogLevel log_level, const char* format,
+ va_list /*ap*/) override {
+ if (std::strstr(format, "HyperClockCache") == nullptr) {
+ // Not a match
+ return;
+ }
+ // static StderrLogger debug;
+ // debug.Logv(log_level, format, ap);
+ if (log_level == InfoLogLevel::INFO_LEVEL) {
+ ++info_count_;
+ } else if (log_level == InfoLogLevel::WARN_LEVEL) {
+ ++warn_count_;
+ } else if (log_level == InfoLogLevel::ERROR_LEVEL) {
+ ++error_count_;
+ }
+ }
+
+ std::array<int, 3> PopCounts() {
+ std::array<int, 3> rv{{info_count_, warn_count_, error_count_}};
+ info_count_ = warn_count_ = error_count_ = 0;
+ return rv;
+ }
+
+ private:
+ int info_count_{};
+ int warn_count_{};
+ int error_count_{};
+};
+
+} // namespace
+
+TEST_F(DBBlockCacheTest, HyperClockCacheReportProblems) {
+ size_t capacity = 1024 * 1024;
+ size_t value_size_est = 8 * 1024;
+ HyperClockCacheOptions hcc_opts{capacity, value_size_est};
+ hcc_opts.num_shard_bits = 2; // 4 shards
+ hcc_opts.metadata_charge_policy = kDontChargeCacheMetadata;
+ std::shared_ptr<Cache> cache = hcc_opts.MakeSharedCache();
+ std::shared_ptr<CountingLogger> logger = std::make_shared<CountingLogger>();
+
+ auto table_options = GetTableOptions();
+ auto options = GetOptions(table_options);
+ table_options.block_cache = cache;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ options.info_log = logger;
+ // Going to sample more directly
+ options.stats_dump_period_sec = 0;
+ Reopen(options);
+
+ std::vector<CacheHandleGuard<void>> handles;
+
+ // Clear anything from DB startup
+ logger->PopCounts();
+
+ // Fill the cache based on the expected entry size and check that we
+ // don't report anything relevant in the periodic stats dump
+ DummyFillCache(*cache, value_size_est, handles);
+ dbfull()->DumpStats();
+ EXPECT_EQ(logger->PopCounts(), (std::array<int, 3>{{0, 0, 0}}));
+
+ // Same, within reasonable bounds
+ DummyFillCache(*cache, value_size_est - value_size_est / 4, handles);
+ dbfull()->DumpStats();
+ EXPECT_EQ(logger->PopCounts(), (std::array<int, 3>{{0, 0, 0}}));
+
+ DummyFillCache(*cache, value_size_est + value_size_est / 3, handles);
+ dbfull()->DumpStats();
+ EXPECT_EQ(logger->PopCounts(), (std::array<int, 3>{{0, 0, 0}}));
+
+ // Estimate too high (value size too low) eventually reports ERROR
+ DummyFillCache(*cache, value_size_est / 2, handles);
+ dbfull()->DumpStats();
+ EXPECT_EQ(logger->PopCounts(), (std::array<int, 3>{{0, 1, 0}}));
+
+ DummyFillCache(*cache, value_size_est / 3, handles);
+ dbfull()->DumpStats();
+ EXPECT_EQ(logger->PopCounts(), (std::array<int, 3>{{0, 0, 1}}));
+
+ // Estimate too low (value size too high) starts with INFO
+ // and is only WARNING in the worst case
+ DummyFillCache(*cache, value_size_est * 2, handles);
+ dbfull()->DumpStats();
+ EXPECT_EQ(logger->PopCounts(), (std::array<int, 3>{{1, 0, 0}}));
+
+ DummyFillCache(*cache, value_size_est * 3, handles);
+ dbfull()->DumpStats();
+ EXPECT_EQ(logger->PopCounts(), (std::array<int, 3>{{0, 1, 0}}));
+
+ DummyFillCache(*cache, value_size_est * 20, handles);
+ dbfull()->DumpStats();
+ EXPECT_EQ(logger->PopCounts(), (std::array<int, 3>{{0, 1, 0}}));
+}
+
+#endif // ROCKSDB_LITE
+
+class DBBlockCacheKeyTest
+ : public DBTestBase,
+ public testing::WithParamInterface<std::tuple<bool, bool>> {
+ public:
+ DBBlockCacheKeyTest()
+ : DBTestBase("db_block_cache_test", /*env_do_fsync=*/false) {}
+
+ void SetUp() override {
+ use_compressed_cache_ = std::get<0>(GetParam());
+ exclude_file_numbers_ = std::get<1>(GetParam());
+ }
+
+ bool use_compressed_cache_;
+ bool exclude_file_numbers_;
+};
+
+// Disable LinkFile so that we can physically copy a DB using Checkpoint.
+// Disable file GetUniqueId to enable stable cache keys.
+class StableCacheKeyTestFS : public FaultInjectionTestFS {
+ public:
+ explicit StableCacheKeyTestFS(const std::shared_ptr<FileSystem>& base)
+ : FaultInjectionTestFS(base) {
+ SetFailGetUniqueId(true);
+ }
+
+ virtual ~StableCacheKeyTestFS() override {}
+
+ IOStatus LinkFile(const std::string&, const std::string&, const IOOptions&,
+ IODebugContext*) override {
+ return IOStatus::NotSupported("Disabled");
+ }
+};
+
+TEST_P(DBBlockCacheKeyTest, StableCacheKeys) {
+ std::shared_ptr<StableCacheKeyTestFS> test_fs{
+ new StableCacheKeyTestFS(env_->GetFileSystem())};
+ std::unique_ptr<CompositeEnvWrapper> test_env{
+ new CompositeEnvWrapper(env_, test_fs)};
+
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ options.env = test_env.get();
+
+ // Corrupting the table properties corrupts the unique id.
+ // Ignore the unique id recorded in the manifest.
+ options.verify_sst_unique_id_in_manifest = false;
+
+ BlockBasedTableOptions table_options;
+
+ int key_count = 0;
+ uint64_t expected_stat = 0;
+
+ std::function<void()> verify_stats;
+ if (use_compressed_cache_) {
+ if (!Snappy_Supported()) {
+ ROCKSDB_GTEST_SKIP("Compressed cache test requires snappy support");
+ return;
+ }
+ options.compression = CompressionType::kSnappyCompression;
+ table_options.no_block_cache = true;
+ table_options.block_cache_compressed = NewLRUCache(1 << 25, 0, false);
+ verify_stats = [&options, &expected_stat] {
+ // One for ordinary SST file and one for external SST file
+ ASSERT_EQ(expected_stat,
+ options.statistics->getTickerCount(BLOCK_CACHE_COMPRESSED_ADD));
+ };
+ } else {
+ table_options.cache_index_and_filter_blocks = true;
+ table_options.block_cache = NewLRUCache(1 << 25, 0, false);
+ verify_stats = [&options, &expected_stat] {
+ ASSERT_EQ(expected_stat,
+ options.statistics->getTickerCount(BLOCK_CACHE_DATA_ADD));
+ ASSERT_EQ(expected_stat,
+ options.statistics->getTickerCount(BLOCK_CACHE_INDEX_ADD));
+ ASSERT_EQ(expected_stat,
+ options.statistics->getTickerCount(BLOCK_CACHE_FILTER_ADD));
+ };
+ }
+
+ table_options.filter_policy.reset(NewBloomFilterPolicy(10, false));
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ CreateAndReopenWithCF({"koko"}, options);
+
+ if (exclude_file_numbers_) {
+ // Simulate something like old behavior without file numbers in properties.
+ // This is a "control" side of the test that also ensures safely degraded
+ // behavior on old files.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "BlockBasedTableBuilder::BlockBasedTableBuilder:PreSetupBaseCacheKey",
+ [&](void* arg) {
+ TableProperties* props = reinterpret_cast<TableProperties*>(arg);
+ props->orig_file_number = 0;
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ }
+
+ std::function<void()> perform_gets = [&key_count, &expected_stat, this]() {
+ if (exclude_file_numbers_) {
+ // No cache key reuse should happen, because we can't rely on the current
+ // file number being stable
+ expected_stat += key_count;
+ } else {
+ // Cache keys should be stable
+ expected_stat = key_count;
+ }
+ for (int i = 0; i < key_count; ++i) {
+ ASSERT_EQ(Get(1, Key(i)), "abc");
+ }
+ };
+
+ // Ordinary SST files with same session id
+ const std::string something_compressible(500U, 'x');
+ for (int i = 0; i < 2; ++i) {
+ ASSERT_OK(Put(1, Key(key_count), "abc"));
+ ASSERT_OK(Put(1, Key(key_count) + "a", something_compressible));
+ ASSERT_OK(Flush(1));
+ ++key_count;
+ }
+
+#ifndef ROCKSDB_LITE
+ // Save an export of those ordinary SST files for later
+ std::string export_files_dir = dbname_ + "/exported";
+ ExportImportFilesMetaData* metadata_ptr_ = nullptr;
+ Checkpoint* checkpoint;
+ ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+ ASSERT_OK(checkpoint->ExportColumnFamily(handles_[1], export_files_dir,
+ &metadata_ptr_));
+ ASSERT_NE(metadata_ptr_, nullptr);
+ delete checkpoint;
+ checkpoint = nullptr;
+
+ // External SST files with same session id
+ SstFileWriter sst_file_writer(EnvOptions(), options);
+ std::vector<std::string> external;
+ for (int i = 0; i < 2; ++i) {
+ std::string f = dbname_ + "/external" + std::to_string(i) + ".sst";
+ external.push_back(f);
+ ASSERT_OK(sst_file_writer.Open(f));
+ ASSERT_OK(sst_file_writer.Put(Key(key_count), "abc"));
+ ASSERT_OK(
+ sst_file_writer.Put(Key(key_count) + "a", something_compressible));
+ ++key_count;
+ ExternalSstFileInfo external_info;
+ ASSERT_OK(sst_file_writer.Finish(&external_info));
+ IngestExternalFileOptions ingest_opts;
+ ASSERT_OK(db_->IngestExternalFile(handles_[1], {f}, ingest_opts));
+ }
+
+ if (exclude_file_numbers_) {
+ // FIXME(peterd): figure out where these extra ADDs are coming from
+ options.statistics->recordTick(BLOCK_CACHE_COMPRESSED_ADD,
+ uint64_t{0} - uint64_t{2});
+ }
+#endif
+
+ perform_gets();
+ verify_stats();
+
+ // Make sure we still get cache hits after re-open
+ ReopenWithColumnFamilies({"default", "koko"}, options);
+
+ perform_gets();
+ verify_stats();
+
+ // Make sure we still get cache hits even on a full copy of the DB. With
+ // StableCacheKeyTestFS, Checkpoint resorts to a full copy rather than a
+ // hard link. (Checkpoint is not available in LITE mode to test this.)
+#ifndef ROCKSDB_LITE
+ auto db_copy_name = dbname_ + "-copy";
+ ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+ ASSERT_OK(checkpoint->CreateCheckpoint(db_copy_name));
+ delete checkpoint;
+
+ Close();
+ Destroy(options);
+
+ // Switch to the DB copy
+ SaveAndRestore<std::string> save_dbname(&dbname_, db_copy_name);
+ ReopenWithColumnFamilies({"default", "koko"}, options);
+
+ perform_gets();
+ verify_stats();
+
+ // And ensure that re-importing + ingesting the same files into a
+ // different DB uses the same cache keys
+ DestroyAndReopen(options);
+
+ ColumnFamilyHandle* cfh = nullptr;
+ ASSERT_OK(db_->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "yoyo",
+ ImportColumnFamilyOptions(),
+ *metadata_ptr_, &cfh));
+ ASSERT_NE(cfh, nullptr);
+ delete cfh;
+ cfh = nullptr;
+ delete metadata_ptr_;
+ metadata_ptr_ = nullptr;
+
+ ASSERT_OK(DestroyDB(export_files_dir, options));
+
+ ReopenWithColumnFamilies({"default", "yoyo"}, options);
+
+ IngestExternalFileOptions ingest_opts;
+ ASSERT_OK(db_->IngestExternalFile(handles_[1], {external}, ingest_opts));
+
+ perform_gets();
+ verify_stats();
+#endif // !ROCKSDB_LITE
+
+ Close();
+ Destroy(options);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+class CacheKeyTest : public testing::Test {
+ public:
+ CacheKey GetBaseCacheKey() {
+ CacheKey rv = GetOffsetableCacheKey(0, /*min file_number*/ 1).WithOffset(0);
+ // Correct for file_number_ == 1
+ *reinterpret_cast<uint64_t*>(&rv) ^= ReverseBits(uint64_t{1});
+ return rv;
+ }
+ CacheKey GetCacheKey(uint64_t session_counter, uint64_t file_number,
+ uint64_t offset) {
+ OffsetableCacheKey offsetable =
+ GetOffsetableCacheKey(session_counter, file_number);
+ // * 4 to counteract optimization that strips lower 2 bits in encoding
+ // the offset in BlockBasedTable::GetCacheKey (which we prefer to include
+ // in unit tests to maximize functional coverage).
+ EXPECT_GE(offset * 4, offset); // no overflow
+ return BlockBasedTable::GetCacheKey(offsetable,
+ BlockHandle(offset * 4, /*size*/ 5));
+ }
+
+ protected:
+ OffsetableCacheKey GetOffsetableCacheKey(uint64_t session_counter,
+ uint64_t file_number) {
+ // Like SemiStructuredUniqueIdGen::GenerateNext
+ tp_.db_session_id = EncodeSessionId(base_session_upper_,
+ base_session_lower_ ^ session_counter);
+ tp_.db_id = std::to_string(db_id_);
+ tp_.orig_file_number = file_number;
+ bool is_stable;
+ std::string cur_session_id = ""; // ignored
+ uint64_t cur_file_number = 42; // ignored
+ OffsetableCacheKey rv;
+ BlockBasedTable::SetupBaseCacheKey(&tp_, cur_session_id, cur_file_number,
+ &rv, &is_stable);
+ EXPECT_TRUE(is_stable);
+ EXPECT_TRUE(!rv.IsEmpty());
+ // BEGIN some assertions in relation to SST unique IDs
+ std::string external_unique_id_str;
+ EXPECT_OK(GetUniqueIdFromTableProperties(tp_, &external_unique_id_str));
+ UniqueId64x2 sst_unique_id = {};
+ EXPECT_OK(DecodeUniqueIdBytes(external_unique_id_str, &sst_unique_id));
+ ExternalUniqueIdToInternal(&sst_unique_id);
+ OffsetableCacheKey ock =
+ OffsetableCacheKey::FromInternalUniqueId(&sst_unique_id);
+ EXPECT_EQ(rv.WithOffset(0).AsSlice(), ock.WithOffset(0).AsSlice());
+ EXPECT_EQ(ock.ToInternalUniqueId(), sst_unique_id);
+ // END some assertions in relation to SST unique IDs
+ return rv;
+ }
+
+ TableProperties tp_;
+ uint64_t base_session_upper_ = 0;
+ uint64_t base_session_lower_ = 0;
+ uint64_t db_id_ = 0;
+};
+
+TEST_F(CacheKeyTest, DBImplSessionIdStructure) {
+ // We have to generate our own session IDs for simulation purposes in other
+ // tests. Here we verify that the DBImpl implementation appears to match
+ // our construction here, which uses the lowest XORed-in bits as the
+ // "session counter."
+ std::string session_id1 = DBImpl::GenerateDbSessionId(/*env*/ nullptr);
+ std::string session_id2 = DBImpl::GenerateDbSessionId(/*env*/ nullptr);
+ uint64_t upper1, upper2, lower1, lower2;
+ ASSERT_OK(DecodeSessionId(session_id1, &upper1, &lower1));
+ ASSERT_OK(DecodeSessionId(session_id2, &upper2, &lower2));
+ // Because generated in same process
+ ASSERT_EQ(upper1, upper2);
+ // Unless we generate > 4 billion session IDs in this process...
+ ASSERT_EQ(Upper32of64(lower1), Upper32of64(lower2));
+ // But they must be different somewhere
+ ASSERT_NE(Lower32of64(lower1), Lower32of64(lower2));
+}
+
+namespace {
+// Deconstruct cache key, based on knowledge of implementation details.
+void DeconstructNonemptyCacheKey(const CacheKey& key, uint64_t* file_num_etc64,
+ uint64_t* offset_etc64) {
+ *file_num_etc64 = *reinterpret_cast<const uint64_t*>(key.AsSlice().data());
+ *offset_etc64 = *reinterpret_cast<const uint64_t*>(key.AsSlice().data() + 8);
+ assert(*file_num_etc64 != 0);
+ if (*offset_etc64 == 0) {
+ std::swap(*file_num_etc64, *offset_etc64);
+ }
+ assert(*offset_etc64 != 0);
+}
+
+// Make a bit mask of 0 to 64 bits
+uint64_t MakeMask64(int bits) {
+ if (bits >= 64) {
+ return uint64_t{0} - 1;
+ } else {
+ return (uint64_t{1} << bits) - 1;
+ }
+}
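+// For example (quick sanity sketch, not exercised by the tests below):
+//   MakeMask64(0) == 0, MakeMask64(3) == 0x7, MakeMask64(64) == ~uint64_t{0}.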
+
+// See CacheKeyTest::Encodings
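+// Given a base cache key, bit-width bounds on each field, and an encoded
+// cache key, recovers the session counter, file number, and offset.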
+struct CacheKeyDecoder {
+ // Inputs
+ uint64_t base_file_num_etc64, base_offset_etc64;
+ int session_counter_bits, file_number_bits, offset_bits;
+
+ // Derived
+ uint64_t session_counter_mask, file_number_mask, offset_mask;
+
+ // Outputs
+ uint64_t decoded_session_counter, decoded_file_num, decoded_offset;
+
+ void SetBaseCacheKey(const CacheKey& base) {
+ DeconstructNonemptyCacheKey(base, &base_file_num_etc64, &base_offset_etc64);
+ }
+
+ void SetRanges(int _session_counter_bits, int _file_number_bits,
+ int _offset_bits) {
+ session_counter_bits = _session_counter_bits;
+ session_counter_mask = MakeMask64(session_counter_bits);
+ file_number_bits = _file_number_bits;
+ file_number_mask = MakeMask64(file_number_bits);
+ offset_bits = _offset_bits;
+ offset_mask = MakeMask64(offset_bits);
+ }
+
+ void Decode(const CacheKey& key) {
+ uint64_t file_num_etc64, offset_etc64;
+ DeconstructNonemptyCacheKey(key, &file_num_etc64, &offset_etc64);
+
+ // First decode session counter
+ if (offset_bits + session_counter_bits <= 64) {
+ // fully recoverable from offset_etc64
+ decoded_session_counter =
+ ReverseBits((offset_etc64 ^ base_offset_etc64)) &
+ session_counter_mask;
+ } else if (file_number_bits + session_counter_bits <= 64) {
+ // fully recoverable from file_num_etc64
+ decoded_session_counter = DownwardInvolution(
+ (file_num_etc64 ^ base_file_num_etc64) & session_counter_mask);
+ } else {
+ // Need to combine parts from each word.
+ // Piece1 will contain some correct prefix of the bottom bits of
+ // session counter.
+ uint64_t piece1 =
+ ReverseBits((offset_etc64 ^ base_offset_etc64) & ~offset_mask);
+ int piece1_bits = 64 - offset_bits;
+ // Piece2 will contain involuted bits that we can combine with piece1
+ // to infer the rest of the session counter
+ int piece2_bits = std::min(64 - file_number_bits, 64 - piece1_bits);
+ ASSERT_LT(piece2_bits, 64);
+ uint64_t piece2_mask = MakeMask64(piece2_bits);
+ uint64_t piece2 = (file_num_etc64 ^ base_file_num_etc64) & piece2_mask;
+
+ // Cancel out the part of piece2 that we can infer from piece1
+ // (DownwardInvolution distributes over xor)
+ piece2 ^= DownwardInvolution(piece1) & piece2_mask;
+
+ // Now we need to solve for the unknown original bits in higher
+ // positions than piece1 provides. We use Gaussian elimination
+ // because we know that a piece2_bits X piece2_bits submatrix of
+ // the matrix underlying DownwardInvolution times the vector of
+ // unknown original bits equals piece2.
+ //
+ // Build an augmented row matrix for that submatrix, built column by
+ // column.
+ std::array<uint64_t, 64> aug_rows{};
+ for (int i = 0; i < piece2_bits; ++i) { // over columns
+ uint64_t col_i = DownwardInvolution(uint64_t{1} << piece1_bits << i);
+ ASSERT_NE(col_i & 1U, 0);
+ for (int j = 0; j < piece2_bits; ++j) { // over rows
+ aug_rows[j] |= (col_i & 1U) << i;
+ col_i >>= 1;
+ }
+ }
+ // Augment with right hand side
+ for (int j = 0; j < piece2_bits; ++j) { // over rows
+ aug_rows[j] |= (piece2 & 1U) << piece2_bits;
+ piece2 >>= 1;
+ }
+ // Run Gaussian elimination
+ for (int i = 0; i < piece2_bits; ++i) { // over columns
+ // Find a row that can be used to cancel others
+ uint64_t canceller = 0;
+ // Note: Rows 0 through i-1 contain 1s in columns already eliminated
+ for (int j = i; j < piece2_bits; ++j) { // over rows
+ if (aug_rows[j] & (uint64_t{1} << i)) {
+ // Swap into appropriate row
+ std::swap(aug_rows[i], aug_rows[j]);
+ // Keep a handy copy for row reductions
+ canceller = aug_rows[i];
+ break;
+ }
+ }
+ ASSERT_NE(canceller, 0);
+ for (int j = 0; j < piece2_bits; ++j) { // over rows
+ if (i != j && ((aug_rows[j] >> i) & 1) != 0) {
+ // Row reduction
+ aug_rows[j] ^= canceller;
+ }
+ }
+ }
+ // Extract result
+ decoded_session_counter = piece1;
+ for (int j = 0; j < piece2_bits; ++j) { // over rows
+ ASSERT_EQ(aug_rows[j] & piece2_mask, uint64_t{1} << j);
+ decoded_session_counter |= aug_rows[j] >> piece2_bits << piece1_bits
+ << j;
+ }
+ }
+
+ decoded_offset =
+ offset_etc64 ^ base_offset_etc64 ^ ReverseBits(decoded_session_counter);
+
+ decoded_file_num = ReverseBits(file_num_etc64 ^ base_file_num_etc64 ^
+ DownwardInvolution(decoded_session_counter));
+ }
+};
+} // anonymous namespace
+
+TEST_F(CacheKeyTest, Encodings) {
+ // This test primarily verifies this claim from cache_key.cc:
+ // // In fact, if DB ids were not involved, we would be guaranteed unique
+ // // cache keys for files generated in a single process until total bits for
+ // // biggest session_id_counter, orig_file_number, and offset_in_file
+ // // reach 128 bits.
+ //
+ // To demonstrate this, CacheKeyDecoder can reconstruct the structured inputs
+ // to the cache key when provided an output cache key, the unstructured
+ // inputs, and bounds on the structured inputs.
+ //
+ // See OffsetableCacheKey comments in cache_key.cc.
+
+ // We are going to randomly initialize some values that *should* not affect
+ // the result
+ Random64 r{std::random_device{}()};
+
+ CacheKeyDecoder decoder;
+ db_id_ = r.Next();
+ base_session_upper_ = r.Next();
+ base_session_lower_ = r.Next();
+ if (base_session_lower_ == 0) {
+ base_session_lower_ = 1;
+ }
+
+ decoder.SetBaseCacheKey(GetBaseCacheKey());
+
+ // Loop over configurations and test those
+ for (int session_counter_bits = 0; session_counter_bits <= 64;
+ ++session_counter_bits) {
+ for (int file_number_bits = 1; file_number_bits <= 64; ++file_number_bits) {
+ // 62 bits max because unoptimized offset will be 64 bits in that case
+ for (int offset_bits = 0; offset_bits <= 62; ++offset_bits) {
+ if (session_counter_bits + file_number_bits + offset_bits > 128) {
+ break;
+ }
+
+ decoder.SetRanges(session_counter_bits, file_number_bits, offset_bits);
+
+ uint64_t session_counter = r.Next() & decoder.session_counter_mask;
+ uint64_t file_number = r.Next() & decoder.file_number_mask;
+ if (file_number == 0) {
+ // Minimum
+ file_number = 1;
+ }
+ uint64_t offset = r.Next() & decoder.offset_mask;
+ decoder.Decode(GetCacheKey(session_counter, file_number, offset));
+
+ EXPECT_EQ(decoder.decoded_session_counter, session_counter);
+ EXPECT_EQ(decoder.decoded_file_num, file_number);
+ EXPECT_EQ(decoder.decoded_offset, offset);
+ }
+ }
+ }
+}
+
+INSTANTIATE_TEST_CASE_P(DBBlockCacheKeyTest, DBBlockCacheKeyTest,
+ ::testing::Combine(::testing::Bool(),
+ ::testing::Bool()));
+
+class DBBlockCachePinningTest
+ : public DBTestBase,
+ public testing::WithParamInterface<
+ std::tuple<bool, PinningTier, PinningTier, PinningTier>> {
+ public:
+ DBBlockCachePinningTest()
+ : DBTestBase("db_block_cache_test", /*env_do_fsync=*/false) {}
+
+ void SetUp() override {
+ partition_index_and_filters_ = std::get<0>(GetParam());
+ top_level_index_pinning_ = std::get<1>(GetParam());
+ partition_pinning_ = std::get<2>(GetParam());
+ unpartitioned_pinning_ = std::get<3>(GetParam());
+ }
+
+ bool partition_index_and_filters_;
+ PinningTier top_level_index_pinning_;
+ PinningTier partition_pinning_;
+ PinningTier unpartitioned_pinning_;
+};
+
+TEST_P(DBBlockCachePinningTest, TwoLevelDB) {
+ // Creates one file in L0 and one file in L1. Both files have enough data that
+ // their index and filter blocks are partitioned. The L1 file will also have
+ // a compression dictionary (those are trained only during compaction), which
+ // must be unpartitioned.
+ const int kKeySize = 32;
+ const int kBlockSize = 128;
+ const int kNumBlocksPerFile = 128;
+ const int kNumKeysPerFile = kBlockSize * kNumBlocksPerFile / kKeySize;
+
+ Options options = CurrentOptions();
+ // `kNoCompression` makes the unit test more portable. But it relies on the
+ // current behavior of persisting/accessing the dictionary even when there's
+ // no (de)compression happening, which seems fairly likely to change over time.
+ options.compression = kNoCompression;
+ options.compression_opts.max_dict_bytes = 4 << 10;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ BlockBasedTableOptions table_options;
+ table_options.block_cache = NewLRUCache(1 << 20 /* capacity */);
+ table_options.block_size = kBlockSize;
+ table_options.metadata_block_size = kBlockSize;
+ table_options.cache_index_and_filter_blocks = true;
+ table_options.metadata_cache_options.top_level_index_pinning =
+ top_level_index_pinning_;
+ table_options.metadata_cache_options.partition_pinning = partition_pinning_;
+ table_options.metadata_cache_options.unpartitioned_pinning =
+ unpartitioned_pinning_;
+ table_options.filter_policy.reset(
+ NewBloomFilterPolicy(10 /* bits_per_key */));
+ if (partition_index_and_filters_) {
+ table_options.index_type =
+ BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+ table_options.partition_filters = true;
+ }
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ Reopen(options);
+
+ Random rnd(301);
+ for (int i = 0; i < 2; ++i) {
+ for (int j = 0; j < kNumKeysPerFile; ++j) {
+ ASSERT_OK(Put(Key(i * kNumKeysPerFile + j), rnd.RandomString(kKeySize)));
+ }
+ ASSERT_OK(Flush());
+ if (i == 0) {
+ // Prevent trivial move so file will be rewritten with dictionary and
+ // reopened with L1's pinning settings.
+ CompactRangeOptions cro;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ }
+ }
+
+ // Clear all unpinned blocks so unpinned blocks will show up as cache misses
+ // when reading a key from a file.
+ table_options.block_cache->EraseUnRefEntries();
+
+ // Get base cache values
+ uint64_t filter_misses = TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS);
+ uint64_t index_misses = TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS);
+ uint64_t compression_dict_misses =
+ TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_MISS);
+
+ // Read a key from the L0 file
+ Get(Key(kNumKeysPerFile));
+ uint64_t expected_filter_misses = filter_misses;
+ uint64_t expected_index_misses = index_misses;
+ uint64_t expected_compression_dict_misses = compression_dict_misses;
+ if (partition_index_and_filters_) {
+ if (top_level_index_pinning_ == PinningTier::kNone) {
+ ++expected_filter_misses;
+ ++expected_index_misses;
+ }
+ if (partition_pinning_ == PinningTier::kNone) {
+ ++expected_filter_misses;
+ ++expected_index_misses;
+ }
+ } else {
+ if (unpartitioned_pinning_ == PinningTier::kNone) {
+ ++expected_filter_misses;
+ ++expected_index_misses;
+ }
+ }
+ if (unpartitioned_pinning_ == PinningTier::kNone) {
+ ++expected_compression_dict_misses;
+ }
+ ASSERT_EQ(expected_filter_misses,
+ TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+ ASSERT_EQ(expected_index_misses,
+ TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+ ASSERT_EQ(expected_compression_dict_misses,
+ TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_MISS));
+
+ // Clear all unpinned blocks so unpinned blocks will show up as cache misses
+ // when reading a key from a file.
+ table_options.block_cache->EraseUnRefEntries();
+
+ // Read a key from the L1 file
+ Get(Key(0));
+ if (partition_index_and_filters_) {
+ if (top_level_index_pinning_ == PinningTier::kNone ||
+ top_level_index_pinning_ == PinningTier::kFlushedAndSimilar) {
+ ++expected_filter_misses;
+ ++expected_index_misses;
+ }
+ if (partition_pinning_ == PinningTier::kNone ||
+ partition_pinning_ == PinningTier::kFlushedAndSimilar) {
+ ++expected_filter_misses;
+ ++expected_index_misses;
+ }
+ } else {
+ if (unpartitioned_pinning_ == PinningTier::kNone ||
+ unpartitioned_pinning_ == PinningTier::kFlushedAndSimilar) {
+ ++expected_filter_misses;
+ ++expected_index_misses;
+ }
+ }
+ if (unpartitioned_pinning_ == PinningTier::kNone ||
+ unpartitioned_pinning_ == PinningTier::kFlushedAndSimilar) {
+ ++expected_compression_dict_misses;
+ }
+ ASSERT_EQ(expected_filter_misses,
+ TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+ ASSERT_EQ(expected_index_misses,
+ TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+ ASSERT_EQ(expected_compression_dict_misses,
+ TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_MISS));
+}
+
+INSTANTIATE_TEST_CASE_P(
+ DBBlockCachePinningTest, DBBlockCachePinningTest,
+ ::testing::Combine(
+ ::testing::Bool(),
+ ::testing::Values(PinningTier::kNone, PinningTier::kFlushedAndSimilar,
+ PinningTier::kAll),
+ ::testing::Values(PinningTier::kNone, PinningTier::kFlushedAndSimilar,
+ PinningTier::kAll),
+ ::testing::Values(PinningTier::kNone, PinningTier::kFlushedAndSimilar,
+ PinningTier::kAll)));
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_bloom_filter_test.cc b/src/rocksdb/db/db_bloom_filter_test.cc
new file mode 100644
index 000000000..d68ab6115
--- /dev/null
+++ b/src/rocksdb/db/db_bloom_filter_test.cc
@@ -0,0 +1,3498 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <cstring>
+#include <iomanip>
+#include <sstream>
+#include <string>
+
+#include "cache/cache_entry_roles.h"
+#include "cache/cache_reservation_manager.h"
+#include "db/db_test_util.h"
+#include "options/options_helper.h"
+#include "port/stack_trace.h"
+#include "rocksdb/advanced_options.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/perf_context.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/table.h"
+#include "table/block_based/block_based_table_reader.h"
+#include "table/block_based/filter_policy_internal.h"
+#include "table/format.h"
+#include "test_util/testutil.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
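+// Constructs, by name, the Bloom-like filter policy variants exercised by
+// these tests.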
+std::shared_ptr<const FilterPolicy> Create(double bits_per_key,
+ const std::string& name) {
+ return BloomLikeFilterPolicy::Create(name, bits_per_key);
+}
+const std::string kLegacyBloom = test::LegacyBloomFilterPolicy::kClassName();
+const std::string kFastLocalBloom =
+ test::FastLocalBloomFilterPolicy::kClassName();
+const std::string kStandard128Ribbon =
+ test::Standard128RibbonFilterPolicy::kClassName();
+const std::string kAutoBloom = BloomFilterPolicy::kClassName();
+const std::string kAutoRibbon = RibbonFilterPolicy::kClassName();
+} // anonymous namespace
+
+// DB tests related to bloom filter.
+
+class DBBloomFilterTest : public DBTestBase {
+ public:
+ DBBloomFilterTest()
+ : DBTestBase("db_bloom_filter_test", /*env_do_fsync=*/true) {}
+};
+
+class DBBloomFilterTestWithParam
+ : public DBTestBase,
+ public testing::WithParamInterface<
+ std::tuple<std::string, bool, uint32_t>> {
+ // public testing::WithParamInterface<bool> {
+ protected:
+ std::string bfp_impl_;
+ bool partition_filters_;
+ uint32_t format_version_;
+
+ public:
+ DBBloomFilterTestWithParam()
+ : DBTestBase("db_bloom_filter_tests", /*env_do_fsync=*/true) {}
+
+ ~DBBloomFilterTestWithParam() override {}
+
+ void SetUp() override {
+ bfp_impl_ = std::get<0>(GetParam());
+ partition_filters_ = std::get<1>(GetParam());
+ format_version_ = std::get<2>(GetParam());
+ }
+};
+
+class DBBloomFilterTestDefFormatVersion : public DBBloomFilterTestWithParam {};
+
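+// A prefix extractor whose domain is limited to keys of at least 5 bytes;
+// the prefix is always the first 5 bytes of the key.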
+class SliceTransformLimitedDomainGeneric : public SliceTransform {
+ const char* Name() const override {
+ return "SliceTransformLimitedDomainGeneric";
+ }
+
+ Slice Transform(const Slice& src) const override {
+ return Slice(src.data(), 5);
+ }
+
+ bool InDomain(const Slice& src) const override {
+ // prefix will be x????
+ return src.size() >= 5;
+ }
+
+ bool InRange(const Slice& dst) const override {
+ // prefix will be x????
+ return dst.size() == 5;
+ }
+};
+
+// KeyMayExist can lead to a few false positives, but not false negatives.
+// To make the test deterministic, use a much larger number of bits per key
+// (20) than bits in the key, so that false positives are eliminated.
+TEST_P(DBBloomFilterTestDefFormatVersion, KeyMayExist) {
+ do {
+ ReadOptions ropts;
+ std::string value;
+ anon::OptionsOverride options_override;
+ options_override.filter_policy = Create(20, bfp_impl_);
+ options_override.partition_filters = partition_filters_;
+ options_override.metadata_block_size = 32;
+ options_override.full_block_cache = true;
+ Options options = CurrentOptions(options_override);
+ if (partition_filters_) {
+ auto* table_options =
+ options.table_factory->GetOptions<BlockBasedTableOptions>();
+ if (table_options != nullptr &&
+ table_options->index_type !=
+ BlockBasedTableOptions::kTwoLevelIndexSearch) {
+ // In the current implementation partitioned filters depend on
+ // partitioned indexes
+ continue;
+ }
+ }
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ ASSERT_TRUE(!db_->KeyMayExist(ropts, handles_[1], "a", &value));
+
+ ASSERT_OK(Put(1, "a", "b"));
+ bool value_found = false;
+ ASSERT_TRUE(
+ db_->KeyMayExist(ropts, handles_[1], "a", &value, &value_found));
+ ASSERT_TRUE(value_found);
+ ASSERT_EQ("b", value);
+
+ ASSERT_OK(Flush(1));
+ value.clear();
+
+ uint64_t numopen = TestGetTickerCount(options, NO_FILE_OPENS);
+ uint64_t cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD);
+ ASSERT_TRUE(
+ db_->KeyMayExist(ropts, handles_[1], "a", &value, &value_found));
+ ASSERT_TRUE(!value_found);
+ // assert that no new files were opened and no new blocks were
+ // read into block cache.
+ ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS));
+ ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD));
+
+ ASSERT_OK(Delete(1, "a"));
+
+ numopen = TestGetTickerCount(options, NO_FILE_OPENS);
+ cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD);
+ ASSERT_TRUE(!db_->KeyMayExist(ropts, handles_[1], "a", &value));
+ ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS));
+ ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD));
+
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1],
+ true /* disallow trivial move */));
+
+ numopen = TestGetTickerCount(options, NO_FILE_OPENS);
+ cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD);
+ ASSERT_TRUE(!db_->KeyMayExist(ropts, handles_[1], "a", &value));
+ ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS));
+ ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD));
+
+ ASSERT_OK(Delete(1, "c"));
+
+ numopen = TestGetTickerCount(options, NO_FILE_OPENS);
+ cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD);
+ ASSERT_TRUE(!db_->KeyMayExist(ropts, handles_[1], "c", &value));
+ ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS));
+ ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD));
+
+ // The KeyMayExist function only checks data in the block cache, which is
+ // not used by the plain table format.
+ } while (
+ ChangeOptions(kSkipPlainTable | kSkipHashIndex | kSkipFIFOCompaction));
+}
+
+TEST_F(DBBloomFilterTest, GetFilterByPrefixBloomCustomPrefixExtractor) {
+ for (bool partition_filters : {true, false}) {
+ Options options = last_options_;
+ options.prefix_extractor =
+ std::make_shared<SliceTransformLimitedDomainGeneric>();
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ get_perf_context()->EnablePerLevelPerfContext();
+ BlockBasedTableOptions bbto;
+ bbto.filter_policy.reset(NewBloomFilterPolicy(10));
+ if (partition_filters) {
+ bbto.partition_filters = true;
+ bbto.index_type = BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+ }
+ bbto.whole_key_filtering = false;
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ DestroyAndReopen(options);
+
+ WriteOptions wo;
+ ReadOptions ro;
+ FlushOptions fo;
+ fo.wait = true;
+ std::string value;
+
+ ASSERT_OK(dbfull()->Put(wo, "barbarbar", "foo"));
+ ASSERT_OK(dbfull()->Put(wo, "barbarbar2", "foo2"));
+ ASSERT_OK(dbfull()->Put(wo, "foofoofoo", "bar"));
+
+ ASSERT_OK(dbfull()->Flush(fo));
+
+ ASSERT_EQ("foo", Get("barbarbar"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0);
+ ASSERT_EQ(
+ 0,
+ (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful);
+ ASSERT_EQ("foo2", Get("barbarbar2"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0);
+ ASSERT_EQ(
+ 0,
+ (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful);
+ ASSERT_EQ("NOT_FOUND", Get("barbarbar3"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0);
+ ASSERT_EQ(
+ 0,
+ (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful);
+
+ ASSERT_EQ("NOT_FOUND", Get("barfoofoo"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
+ ASSERT_EQ(
+ 1,
+ (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful);
+
+ ASSERT_EQ("NOT_FOUND", Get("foobarbar"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 2);
+ ASSERT_EQ(
+ 2,
+ (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful);
+
+ ro.total_order_seek = true;
+ // NOTE: total_order_seek no longer affects Get()
+ ASSERT_EQ("NOT_FOUND", Get("foobarbar"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 3);
+ ASSERT_EQ(
+ 3,
+ (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful);
+
+ // No bloom on extractor changed
+#ifndef ROCKSDB_LITE
+ ASSERT_OK(db_->SetOptions({{"prefix_extractor", "capped:10"}}));
+ ASSERT_EQ("NOT_FOUND", Get("foobarbar"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 3);
+ ASSERT_EQ(
+ 3,
+ (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful);
+#endif // ROCKSDB_LITE
+
+ // No bloom on extractor changed, after re-open
+ options.prefix_extractor.reset(NewCappedPrefixTransform(10));
+ Reopen(options);
+ ASSERT_EQ("NOT_FOUND", Get("foobarbar"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 3);
+ ASSERT_EQ(
+ 3,
+ (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful);
+
+ get_perf_context()->Reset();
+ }
+}
+
+TEST_F(DBBloomFilterTest, GetFilterByPrefixBloom) {
+ for (bool partition_filters : {true, false}) {
+ Options options = last_options_;
+ options.prefix_extractor.reset(NewFixedPrefixTransform(8));
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ get_perf_context()->EnablePerLevelPerfContext();
+ BlockBasedTableOptions bbto;
+ bbto.filter_policy.reset(NewBloomFilterPolicy(10));
+ if (partition_filters) {
+ bbto.partition_filters = true;
+ bbto.index_type = BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+ }
+ bbto.whole_key_filtering = false;
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ DestroyAndReopen(options);
+
+ WriteOptions wo;
+ ReadOptions ro;
+ FlushOptions fo;
+ fo.wait = true;
+ std::string value;
+
+ ASSERT_OK(dbfull()->Put(wo, "barbarbar", "foo"));
+ ASSERT_OK(dbfull()->Put(wo, "barbarbar2", "foo2"));
+ ASSERT_OK(dbfull()->Put(wo, "foofoofoo", "bar"));
+
+ ASSERT_OK(dbfull()->Flush(fo));
+
+ ASSERT_EQ("foo", Get("barbarbar"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0);
+ ASSERT_EQ("foo2", Get("barbarbar2"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0);
+ ASSERT_EQ("NOT_FOUND", Get("barbarbar3"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0);
+
+ ASSERT_EQ("NOT_FOUND", Get("barfoofoo"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
+
+ ASSERT_EQ("NOT_FOUND", Get("foobarbar"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 2);
+
+ ro.total_order_seek = true;
+ // NOTE: total_order_seek no longer affects Get()
+ ASSERT_EQ("NOT_FOUND", Get("foobarbar"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 3);
+ ASSERT_EQ(
+ 3,
+ (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful);
+
+ // No bloom on extractor changed
+#ifndef ROCKSDB_LITE
+ ASSERT_OK(db_->SetOptions({{"prefix_extractor", "capped:10"}}));
+ ASSERT_EQ("NOT_FOUND", Get("foobarbar"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 3);
+ ASSERT_EQ(
+ 3,
+ (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful);
+#endif // ROCKSDB_LITE
+
+ get_perf_context()->Reset();
+ }
+}
+
+TEST_F(DBBloomFilterTest, WholeKeyFilterProp) {
+ for (bool partition_filters : {true, false}) {
+ Options options = last_options_;
+ options.prefix_extractor.reset(NewFixedPrefixTransform(3));
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ get_perf_context()->EnablePerLevelPerfContext();
+
+ BlockBasedTableOptions bbto;
+ bbto.filter_policy.reset(NewBloomFilterPolicy(10));
+ bbto.whole_key_filtering = false;
+ if (partition_filters) {
+ bbto.partition_filters = true;
+ bbto.index_type = BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+ }
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ DestroyAndReopen(options);
+
+ WriteOptions wo;
+ ReadOptions ro;
+ FlushOptions fo;
+ fo.wait = true;
+ std::string value;
+
+ ASSERT_OK(dbfull()->Put(wo, "foobar", "foo"));
+ // Need to insert some keys to make sure files are not filtered out by key
+ // ranges.
+ ASSERT_OK(dbfull()->Put(wo, "aaa", ""));
+ ASSERT_OK(dbfull()->Put(wo, "zzz", ""));
+ ASSERT_OK(dbfull()->Flush(fo));
+
+ Reopen(options);
+ ASSERT_EQ("NOT_FOUND", Get("foo"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0);
+ ASSERT_EQ("NOT_FOUND", Get("bar"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
+ ASSERT_EQ("foo", Get("foobar"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
+
+ // Reopen with whole key filtering enabled and the prefix extractor set to
+ // NULL. Bloom filtering should be off for both whole key and prefix
+ // bloom.
+ bbto.whole_key_filtering = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ options.prefix_extractor.reset();
+ Reopen(options);
+
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
+ ASSERT_EQ("NOT_FOUND", Get("foo"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
+ ASSERT_EQ("NOT_FOUND", Get("bar"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
+ ASSERT_EQ("foo", Get("foobar"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
+ // Write DB with only full key filtering.
+ ASSERT_OK(dbfull()->Put(wo, "foobar", "foo"));
+ // Need to insert some keys to make sure files are not filtered out by key
+ // ranges.
+ ASSERT_OK(dbfull()->Put(wo, "aaa", ""));
+ ASSERT_OK(dbfull()->Put(wo, "zzz", ""));
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+ // Reopen with whole key filtering off and the prefix extractor enabled.
+ // Still, no bloom filter should be used.
+ options.prefix_extractor.reset(NewFixedPrefixTransform(3));
+ bbto.whole_key_filtering = false;
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ Reopen(options);
+
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
+ ASSERT_EQ("NOT_FOUND", Get("foo"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
+ ASSERT_EQ("NOT_FOUND", Get("bar"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
+ ASSERT_EQ("foo", Get("foobar"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
+
+ // Try to create a DB with mixed files:
+ ASSERT_OK(dbfull()->Put(wo, "foobar", "foo"));
+ // Need to insert some keys to make sure files are not filtered out by key
+ // ranges.
+ ASSERT_OK(dbfull()->Put(wo, "aaa", ""));
+ ASSERT_OK(dbfull()->Put(wo, "zzz", ""));
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+ options.prefix_extractor.reset();
+ bbto.whole_key_filtering = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ Reopen(options);
+
+ // Try to create a DB with mixed files.
+ ASSERT_OK(dbfull()->Put(wo, "barfoo", "bar"));
+ // In this case we need to insert some keys to make sure files are
+ // not filtered out by key ranges.
+ ASSERT_OK(dbfull()->Put(wo, "aaa", ""));
+ ASSERT_OK(dbfull()->Put(wo, "zzz", ""));
+ ASSERT_OK(Flush());
+
+ // Now we have two files:
+ // File 1: An older file with prefix bloom.
+ // File 2: A newer file with a whole key bloom filter.
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
+ ASSERT_EQ("NOT_FOUND", Get("foo"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 2);
+ ASSERT_EQ("NOT_FOUND", Get("bar"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 3);
+ ASSERT_EQ("foo", Get("foobar"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 4);
+ ASSERT_EQ("bar", Get("barfoo"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 4);
+
+ // Reopen with the same setting: only whole key filtering is used
+ Reopen(options);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 4);
+ ASSERT_EQ("NOT_FOUND", Get("foo"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 5);
+ ASSERT_EQ("NOT_FOUND", Get("bar"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 6);
+ ASSERT_EQ("foo", Get("foobar"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 7);
+ ASSERT_EQ("bar", Get("barfoo"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 7);
+
+ // Restart with both filters allowed
+ options.prefix_extractor.reset(NewFixedPrefixTransform(3));
+ bbto.whole_key_filtering = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ Reopen(options);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 7);
+ // File 1 will have it filtered out.
+ // File 2 will not, as the prefix `foo` exists in the file.
+ ASSERT_EQ("NOT_FOUND", Get("foo"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 8);
+ ASSERT_EQ("NOT_FOUND", Get("bar"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 10);
+ ASSERT_EQ("foo", Get("foobar"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 11);
+ ASSERT_EQ("bar", Get("barfoo"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 11);
+
+ // Restart with only prefix bloom allowed.
+ options.prefix_extractor.reset(NewFixedPrefixTransform(3));
+ bbto.whole_key_filtering = false;
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ Reopen(options);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 11);
+ ASSERT_EQ("NOT_FOUND", Get("foo"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 11);
+ ASSERT_EQ("NOT_FOUND", Get("bar"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 12);
+ ASSERT_EQ("foo", Get("foobar"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 12);
+ ASSERT_EQ("bar", Get("barfoo"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 12);
+ uint64_t bloom_filter_useful_all_levels = 0;
+ for (auto& kv : (*(get_perf_context()->level_to_perf_context))) {
+ if (kv.second.bloom_filter_useful > 0) {
+ bloom_filter_useful_all_levels += kv.second.bloom_filter_useful;
+ }
+ }
+ ASSERT_EQ(12, bloom_filter_useful_all_levels);
+ get_perf_context()->Reset();
+ }
+}
+
+TEST_P(DBBloomFilterTestWithParam, BloomFilter) {
+ do {
+ Options options = CurrentOptions();
+ env_->count_random_reads_ = true;
+ options.env = env_;
+ // ChangeCompactOptions() only changes compaction style, which does not
+ // trigger reset of table_factory
+ BlockBasedTableOptions table_options;
+ table_options.no_block_cache = true;
+ table_options.filter_policy = Create(10, bfp_impl_);
+ table_options.partition_filters = partition_filters_;
+ if (partition_filters_) {
+ table_options.index_type =
+ BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+ }
+ table_options.format_version = format_version_;
+ if (format_version_ >= 4) {
+ // value delta encoding is challenged more with an index interval > 1
+ table_options.index_block_restart_interval = 8;
+ }
+ table_options.metadata_block_size = 32;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // Populate multiple layers
+ const int N = 10000;
+ for (int i = 0; i < N; i++) {
+ ASSERT_OK(Put(1, Key(i), Key(i)));
+ }
+ Compact(1, "a", "z");
+ for (int i = 0; i < N; i += 100) {
+ ASSERT_OK(Put(1, Key(i), Key(i)));
+ }
+ ASSERT_OK(Flush(1));
+
+ // Prevent auto compactions triggered by seeks
+ env_->delay_sstable_sync_.store(true, std::memory_order_release);
+
+ // Lookup present keys. Should rarely read from small sstable.
+ env_->random_read_counter_.Reset();
+ for (int i = 0; i < N; i++) {
+ ASSERT_EQ(Key(i), Get(1, Key(i)));
+ }
+ int reads = env_->random_read_counter_.Read();
+ fprintf(stderr, "%d present => %d reads\n", N, reads);
+ ASSERT_GE(reads, N);
+ if (partition_filters_) {
+ // Without a block cache, we read an extra partition filter per level per
+ // read, plus a partition index per read
+ ASSERT_LE(reads, 4 * N + 2 * N / 100);
+ } else {
+ ASSERT_LE(reads, N + 2 * N / 100);
+ }
+
+ // Lookup missing keys. Should rarely read from either sstable.
+ env_->random_read_counter_.Reset();
+ for (int i = 0; i < N; i++) {
+ ASSERT_EQ("NOT_FOUND", Get(1, Key(i) + ".missing"));
+ }
+ reads = env_->random_read_counter_.Read();
+ fprintf(stderr, "%d missing => %d reads\n", N, reads);
+ if (partition_filters_) {
+ // With partitioned filters we read one extra filter per level for each
+ // missed read.
+ ASSERT_LE(reads, 2 * N + 3 * N / 100);
+ } else {
+ ASSERT_LE(reads, 3 * N / 100);
+ }
+
+#ifndef ROCKSDB_LITE
+ // Sanity check some table properties
+ std::map<std::string, std::string> props;
+ ASSERT_TRUE(db_->GetMapProperty(
+ handles_[1], DB::Properties::kAggregatedTableProperties, &props));
+ uint64_t nkeys = N + N / 100;
+ uint64_t filter_size = ParseUint64(props["filter_size"]);
+ EXPECT_LE(filter_size,
+ (partition_filters_ ? 12 : 11) * nkeys / /*bits / byte*/ 8);
+ if (bfp_impl_ == kAutoRibbon) {
+ // Sometimes using Ribbon filter which is more space-efficient
+ EXPECT_GE(filter_size, 7 * nkeys / /*bits / byte*/ 8);
+ } else {
+ // Always Bloom
+ EXPECT_GE(filter_size, 10 * nkeys / /*bits / byte*/ 8);
+ }
+
+ uint64_t num_filter_entries = ParseUint64(props["num_filter_entries"]);
+ EXPECT_EQ(num_filter_entries, nkeys);
+#endif // ROCKSDB_LITE
+
+ env_->delay_sstable_sync_.store(false, std::memory_order_release);
+ Close();
+ } while (ChangeCompactOptions());
+}
+
+namespace {
+
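+// Builds a degenerate filter that ignores added keys and always answers
+// "may exist" (100% false positive rate); see Finish() below.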
+class AlwaysTrueBitsBuilder : public FilterBitsBuilder {
+ public:
+ void AddKey(const Slice&) override {}
+ size_t EstimateEntriesAdded() override { return 0U; }
+ Slice Finish(std::unique_ptr<const char[]>* /* buf */) override {
+ // Interpreted as "always true" filter (0 probes over 1 byte of
+ // payload, 5 bytes metadata)
+ return Slice("\0\0\0\0\0\0", 6);
+ }
+ using FilterBitsBuilder::Finish;
+ size_t ApproximateNumEntries(size_t) override { return SIZE_MAX; }
+};
+
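+// With skip == true, returns no builder so no filter block is written at all;
+// with skip == false, writes an AlwaysTrueBitsBuilder filter that is read but
+// never filters anything out.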
+class AlwaysTrueFilterPolicy : public ReadOnlyBuiltinFilterPolicy {
+ public:
+ explicit AlwaysTrueFilterPolicy(bool skip) : skip_(skip) {}
+
+ FilterBitsBuilder* GetBuilderWithContext(
+ const FilterBuildingContext&) const override {
+ if (skip_) {
+ return nullptr;
+ } else {
+ return new AlwaysTrueBitsBuilder();
+ }
+ }
+
+ private:
+ bool skip_;
+};
+
+} // anonymous namespace
+
+TEST_P(DBBloomFilterTestWithParam, SkipFilterOnEssentiallyZeroBpk) {
+ constexpr int maxKey = 10;
+ auto PutFn = [&]() {
+ int i;
+ // Put
+ for (i = 0; i < maxKey; i++) {
+ ASSERT_OK(Put(Key(i), Key(i)));
+ }
+ Flush();
+ };
+ auto GetFn = [&]() {
+ int i;
+ // Get OK
+ for (i = 0; i < maxKey; i++) {
+ ASSERT_EQ(Key(i), Get(Key(i)));
+ }
+ // Get NotFound
+ for (; i < maxKey * 2; i++) {
+ ASSERT_EQ(Get(Key(i)), "NOT_FOUND");
+ }
+ };
+ auto PutAndGetFn = [&]() {
+ PutFn();
+ GetFn();
+ };
+#ifndef ROCKSDB_LITE
+ std::map<std::string, std::string> props;
+ const auto& kAggTableProps = DB::Properties::kAggregatedTableProperties;
+#endif // ROCKSDB_LITE
+
+ Options options = CurrentOptions();
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ BlockBasedTableOptions table_options;
+ table_options.partition_filters = partition_filters_;
+ if (partition_filters_) {
+ table_options.index_type =
+ BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+ }
+ table_options.format_version = format_version_;
+
+ // Test 1: bits per key < 0.5 means skip filters -> no filter
+ // constructed or read.
+ table_options.filter_policy = Create(0.4, bfp_impl_);
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ DestroyAndReopen(options);
+ PutAndGetFn();
+
+ // Verify no filter access nor construction
+ EXPECT_EQ(TestGetTickerCount(options, BLOOM_FILTER_FULL_POSITIVE), 0);
+ EXPECT_EQ(TestGetTickerCount(options, BLOOM_FILTER_FULL_TRUE_POSITIVE), 0);
+
+#ifndef ROCKSDB_LITE
+ props.clear();
+ ASSERT_TRUE(db_->GetMapProperty(kAggTableProps, &props));
+ EXPECT_EQ(props["filter_size"], "0");
+#endif // ROCKSDB_LITE
+
+ // Test 2: use custom API to skip filters -> no filter constructed
+ // or read.
+ table_options.filter_policy.reset(
+ new AlwaysTrueFilterPolicy(/* skip */ true));
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ DestroyAndReopen(options);
+ PutAndGetFn();
+
+ // Verify no filter access nor construction
+ EXPECT_EQ(TestGetTickerCount(options, BLOOM_FILTER_FULL_POSITIVE), 0);
+ EXPECT_EQ(TestGetTickerCount(options, BLOOM_FILTER_FULL_TRUE_POSITIVE), 0);
+
+#ifndef ROCKSDB_LITE
+ props.clear();
+ ASSERT_TRUE(db_->GetMapProperty(kAggTableProps, &props));
+ EXPECT_EQ(props["filter_size"], "0");
+#endif // ROCKSDB_LITE
+
+ // Control test: using an actual filter with 100% FP rate -> the filter
+ // is constructed and checked on read.
+ table_options.filter_policy.reset(
+ new AlwaysTrueFilterPolicy(/* skip */ false));
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ DestroyAndReopen(options);
+ PutAndGetFn();
+
+ // Verify filter is accessed (and constructed)
+ EXPECT_EQ(TestGetAndResetTickerCount(options, BLOOM_FILTER_FULL_POSITIVE),
+ maxKey * 2);
+ EXPECT_EQ(
+ TestGetAndResetTickerCount(options, BLOOM_FILTER_FULL_TRUE_POSITIVE),
+ maxKey);
+#ifndef ROCKSDB_LITE
+ props.clear();
+ ASSERT_TRUE(db_->GetMapProperty(kAggTableProps, &props));
+ EXPECT_NE(props["filter_size"], "0");
+#endif // ROCKSDB_LITE
+
+ // Test 3 (options test): Able to read existing filters with longstanding
+ // generated options file entry `filter_policy=rocksdb.BuiltinBloomFilter`
+ ASSERT_OK(FilterPolicy::CreateFromString(ConfigOptions(),
+ "rocksdb.BuiltinBloomFilter",
+ &table_options.filter_policy));
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ Reopen(options);
+ GetFn();
+
+ // Verify filter is accessed
+ EXPECT_EQ(TestGetAndResetTickerCount(options, BLOOM_FILTER_FULL_POSITIVE),
+ maxKey * 2);
+ EXPECT_EQ(
+ TestGetAndResetTickerCount(options, BLOOM_FILTER_FULL_TRUE_POSITIVE),
+ maxKey);
+
+ // But new filters are not generated (configuration details unknown)
+ DestroyAndReopen(options);
+ PutAndGetFn();
+
+ // Verify no filter access nor construction
+ EXPECT_EQ(TestGetTickerCount(options, BLOOM_FILTER_FULL_POSITIVE), 0);
+ EXPECT_EQ(TestGetTickerCount(options, BLOOM_FILTER_FULL_TRUE_POSITIVE), 0);
+
+#ifndef ROCKSDB_LITE
+ props.clear();
+ ASSERT_TRUE(db_->GetMapProperty(kAggTableProps, &props));
+ EXPECT_EQ(props["filter_size"], "0");
+#endif // ROCKSDB_LITE
+}
+
+#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+INSTANTIATE_TEST_CASE_P(
+ FormatDef, DBBloomFilterTestDefFormatVersion,
+ ::testing::Values(
+ std::make_tuple(kAutoBloom, true, test::kDefaultFormatVersion),
+ std::make_tuple(kAutoBloom, false, test::kDefaultFormatVersion),
+ std::make_tuple(kAutoRibbon, false, test::kDefaultFormatVersion)));
+
+INSTANTIATE_TEST_CASE_P(
+ FormatDef, DBBloomFilterTestWithParam,
+ ::testing::Values(
+ std::make_tuple(kAutoBloom, true, test::kDefaultFormatVersion),
+ std::make_tuple(kAutoBloom, false, test::kDefaultFormatVersion),
+ std::make_tuple(kAutoRibbon, false, test::kDefaultFormatVersion)));
+
+INSTANTIATE_TEST_CASE_P(
+ FormatLatest, DBBloomFilterTestWithParam,
+ ::testing::Values(std::make_tuple(kAutoBloom, true, kLatestFormatVersion),
+ std::make_tuple(kAutoBloom, false, kLatestFormatVersion),
+ std::make_tuple(kAutoRibbon, false,
+ kLatestFormatVersion)));
+#endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+
+TEST_F(DBBloomFilterTest, BloomFilterRate) {
+ while (ChangeFilterOptions()) {
+ Options options = CurrentOptions();
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ get_perf_context()->EnablePerLevelPerfContext();
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ const int maxKey = 10000;
+ for (int i = 0; i < maxKey; i++) {
+ ASSERT_OK(Put(1, Key(i), Key(i)));
+ }
+ // Add a large key to make the file contain a wide range
+ ASSERT_OK(Put(1, Key(maxKey + 55555), Key(maxKey + 55555)));
+ Flush(1);
+
+ // Check if they can be found
+ for (int i = 0; i < maxKey; i++) {
+ ASSERT_EQ(Key(i), Get(1, Key(i)));
+ }
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0);
+
+ // Check if filter is useful
+ for (int i = 0; i < maxKey; i++) {
+ ASSERT_EQ("NOT_FOUND", Get(1, Key(i + 33333)));
+ }
+ ASSERT_GE(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), maxKey * 0.98);
+ ASSERT_GE(
+ (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful,
+ maxKey * 0.98);
+ get_perf_context()->Reset();
+ }
+}
+
+namespace {
+struct CompatibilityConfig {
+ std::shared_ptr<const FilterPolicy> policy;
+ bool partitioned;
+ uint32_t format_version;
+
+ void SetInTableOptions(BlockBasedTableOptions* table_options) {
+ table_options->filter_policy = policy;
+ table_options->partition_filters = partitioned;
+ if (partitioned) {
+ table_options->index_type =
+ BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+ } else {
+ table_options->index_type =
+ BlockBasedTableOptions::IndexType::kBinarySearch;
+ }
+ table_options->format_version = format_version;
+ }
+};
+// High bits per key -> almost no FPs
+std::shared_ptr<const FilterPolicy> kCompatibilityBloomPolicy{
+ NewBloomFilterPolicy(20)};
+// bloom_before_level=-1 -> always use Ribbon
+std::shared_ptr<const FilterPolicy> kCompatibilityRibbonPolicy{
+ NewRibbonFilterPolicy(20, -1)};
+
+std::vector<CompatibilityConfig> kCompatibilityConfigs = {
+ {kCompatibilityBloomPolicy, false, BlockBasedTableOptions().format_version},
+ {kCompatibilityBloomPolicy, true, BlockBasedTableOptions().format_version},
+ {kCompatibilityBloomPolicy, false, /* legacy Bloom */ 4U},
+ {kCompatibilityRibbonPolicy, false,
+ BlockBasedTableOptions().format_version},
+ {kCompatibilityRibbonPolicy, true, BlockBasedTableOptions().format_version},
+};
+} // anonymous namespace
+
+TEST_F(DBBloomFilterTest, BloomFilterCompatibility) {
+ Options options = CurrentOptions();
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ options.level0_file_num_compaction_trigger =
+ static_cast<int>(kCompatibilityConfigs.size()) + 1;
+ options.max_open_files = -1;
+
+ Close();
+
+ // Create one file for each kind of filter. Each file covers a distinct key
+ // range.
+ for (size_t i = 0; i < kCompatibilityConfigs.size(); ++i) {
+ BlockBasedTableOptions table_options;
+ kCompatibilityConfigs[i].SetInTableOptions(&table_options);
+ ASSERT_TRUE(table_options.filter_policy != nullptr);
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ Reopen(options);
+
+ std::string prefix = std::to_string(i) + "_";
+ ASSERT_OK(Put(prefix + "A", "val"));
+ ASSERT_OK(Put(prefix + "Z", "val"));
+ ASSERT_OK(Flush());
+ }
+
+ // Test filter is used between each pair of {reader,writer} configurations,
+ // because any built-in FilterPolicy should be able to read filters from any
+ // other built-in FilterPolicy
+ for (size_t i = 0; i < kCompatibilityConfigs.size(); ++i) {
+ BlockBasedTableOptions table_options;
+ kCompatibilityConfigs[i].SetInTableOptions(&table_options);
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ Reopen(options);
+ for (size_t j = 0; j < kCompatibilityConfigs.size(); ++j) {
+ std::string prefix = std::to_string(j) + "_";
+ ASSERT_EQ("val", Get(prefix + "A")); // Filter positive
+ ASSERT_EQ("val", Get(prefix + "Z")); // Filter positive
+ // Filter negative, with high probability
+ ASSERT_EQ("NOT_FOUND", Get(prefix + "Q"));
+ EXPECT_EQ(TestGetAndResetTickerCount(options, BLOOM_FILTER_FULL_POSITIVE),
+ 2);
+ EXPECT_EQ(TestGetAndResetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
+ }
+ }
+}
+
+// To align with the type of hash entry being reserved in implementation.
+using FilterConstructionReserveMemoryHash = uint64_t;
+
+class ChargeFilterConstructionTestWithParam
+ : public DBTestBase,
+ public testing::WithParamInterface<std::tuple<
+ CacheEntryRoleOptions::Decision, std::string, bool, bool>> {
+ public:
+ ChargeFilterConstructionTestWithParam()
+ : DBTestBase("db_bloom_filter_tests",
+ /*env_do_fsync=*/true),
+ num_key_(0),
+ charge_filter_construction_(std::get<0>(GetParam())),
+ policy_(std::get<1>(GetParam())),
+ partition_filters_(std::get<2>(GetParam())),
+ detect_filter_construct_corruption_(std::get<3>(GetParam())) {
+ if (charge_filter_construction_ ==
+ CacheEntryRoleOptions::Decision::kDisabled ||
+ policy_ == kLegacyBloom) {
+ // For these cases, we are only interested in whether filter construction
+ // cache charging happens at all, not in its accuracy. Therefore we don't
+ // need many keys.
+ num_key_ = 5;
+ } else if (partition_filters_) {
+ // For PartitionFilter case, since we set
+ // table_options.metadata_block_size big enough such that each partition
+ // triggers at least 1 dummy entry reservation each for hash entries and
+ // final filter, we need a large number of keys to ensure we have at least
+ // two partitions.
+ num_key_ = 18 *
+ CacheReservationManagerImpl<
+ CacheEntryRole::kFilterConstruction>::GetDummyEntrySize() /
+ sizeof(FilterConstructionReserveMemoryHash);
+ } else if (policy_ == kFastLocalBloom) {
+ // For the Bloom Filter + FullFilter case, since we design num_key_ to
+ // make the hash entry cache charging a multiple of dummy entries,
+ // correctly charging the final filter on top of it will trigger at
+ // least one more dummy entry insertion. Therefore we can assert that
+ // behavior, and we don't need a large number of keys to verify that we
+ // indeed charge the final filter in cache, even though the final
+ // filter is a lot smaller than the hash entries.
+ num_key_ = 1 *
+ CacheReservationManagerImpl<
+ CacheEntryRole::kFilterConstruction>::GetDummyEntrySize() /
+ sizeof(FilterConstructionReserveMemoryHash);
+ } else {
+ // For the Ribbon Filter + FullFilter case, we need a large enough number
+ // of keys so that charging the final filter after releasing the hash
+ // entry reservation triggers at least one more dummy entry (or
+ // equivalently, causes another peak in cache charging), since the banding
+ // reservation might not be a multiple of the dummy entry size.
+ num_key_ = 12 *
+ CacheReservationManagerImpl<
+ CacheEntryRole::kFilterConstruction>::GetDummyEntrySize() /
+ sizeof(FilterConstructionReserveMemoryHash);
+ }
+ }
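+
+ // Worked example of the num_key_ choices above (a sketch, not used by the
+ // test): assuming GetDummyEntrySize() returns 256 KiB and each reserved
+ // hash entry is sizeof(FilterConstructionReserveMemoryHash) = 8 bytes
+ // (both are illustrative assumptions, not asserted here), the three
+ // key-count branches above yield roughly:
+ //   PartitionedFilter:         num_key_ = 18 * 262144 / 8 = 589824
+ //   kFastLocalBloom + Full:    num_key_ =  1 * 262144 / 8 =  32768
+ //   kStandard128Ribbon + Full: num_key_ = 12 * 262144 / 8 = 393216
+ // so the hash entry reservation is an exact multiple (18x, 1x, 12x) of one
+ // dummy entry, which is what the predictions in the test body rely on.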
+
+ BlockBasedTableOptions GetBlockBasedTableOptions() {
+ BlockBasedTableOptions table_options;
+
+ // We set the cache capacity big enough to prevent the cache from becoming
+ // full, for convenience in calculation.
+ constexpr std::size_t kCacheCapacity = 100 * 1024 * 1024;
+
+ table_options.cache_usage_options.options_overrides.insert(
+ {CacheEntryRole::kFilterConstruction,
+ {/*.charged = */ charge_filter_construction_}});
+ table_options.filter_policy = Create(10, policy_);
+ table_options.partition_filters = partition_filters_;
+ if (table_options.partition_filters) {
+ table_options.index_type =
+ BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+ // We set table_options.metadata_block_size big enough so that each
+ // partition triggers at least 1 dummy entry insertion each for hash
+ // entries and final filter.
+ table_options.metadata_block_size = 409000;
+ }
+ table_options.detect_filter_construct_corruption =
+ detect_filter_construct_corruption_;
+
+ LRUCacheOptions lo;
+ lo.capacity = kCacheCapacity;
+ lo.num_shard_bits = 0; // 2^0 shard
+ lo.strict_capacity_limit = true;
+ cache_ = std::make_shared<
+ TargetCacheChargeTrackingCache<CacheEntryRole::kFilterConstruction>>(
+ (NewLRUCache(lo)));
+ table_options.block_cache = cache_;
+
+ return table_options;
+ }
+
+ std::size_t GetNumKey() { return num_key_; }
+
+ CacheEntryRoleOptions::Decision ChargeFilterConstructMemory() {
+ return charge_filter_construction_;
+ }
+
+ std::string GetFilterPolicy() { return policy_; }
+
+ bool PartitionFilters() { return partition_filters_; }
+
+ std::shared_ptr<
+ TargetCacheChargeTrackingCache<CacheEntryRole::kFilterConstruction>>
+ GetCache() {
+ return cache_;
+ }
+
+ private:
+ std::size_t num_key_;
+ CacheEntryRoleOptions::Decision charge_filter_construction_;
+ std::string policy_;
+ bool partition_filters_;
+ std::shared_ptr<
+ TargetCacheChargeTrackingCache<CacheEntryRole::kFilterConstruction>>
+ cache_;
+ bool detect_filter_construct_corruption_;
+};
+
+INSTANTIATE_TEST_CASE_P(
+ ChargeFilterConstructionTestWithParam,
+ ChargeFilterConstructionTestWithParam,
+ ::testing::Values(
+ std::make_tuple(CacheEntryRoleOptions::Decision::kDisabled,
+ kFastLocalBloom, false, false),
+
+ std::make_tuple(CacheEntryRoleOptions::Decision::kEnabled,
+ kFastLocalBloom, false, false),
+ std::make_tuple(CacheEntryRoleOptions::Decision::kEnabled,
+ kFastLocalBloom, false, true),
+ std::make_tuple(CacheEntryRoleOptions::Decision::kEnabled,
+ kFastLocalBloom, true, false),
+ std::make_tuple(CacheEntryRoleOptions::Decision::kEnabled,
+ kFastLocalBloom, true, true),
+
+ std::make_tuple(CacheEntryRoleOptions::Decision::kEnabled,
+ kStandard128Ribbon, false, false),
+ std::make_tuple(CacheEntryRoleOptions::Decision::kEnabled,
+ kStandard128Ribbon, false, true),
+ std::make_tuple(CacheEntryRoleOptions::Decision::kEnabled,
+ kStandard128Ribbon, true, false),
+ std::make_tuple(CacheEntryRoleOptions::Decision::kEnabled,
+ kStandard128Ribbon, true, true),
+
+ std::make_tuple(CacheEntryRoleOptions::Decision::kEnabled, kLegacyBloom,
+ false, false)));
+
+// TODO: Speed up this test, and reduce disk space usage (~700MB)
+// The current test inserts many keys (on the scale of the dummy entry size)
+// in order to make the small memory users (e.g., final filter, partitioned
+// hash entries/filter/banding), which are proportional to the number of
+// keys, big enough that their cache charging triggers dummy entry insertion
+// and becomes observable in the test.
+//
+// However, inserting that many keys slows down this test and leaves future
+// developers an opportunity to speed it up.
+//
+// Possible approaches & challenges:
+// 1. Use sync point during cache charging of filter construction
+//
+// Benefit: It does not rely on triggering dummy entry insertion
+// but the sync point to verify small memory user is charged correctly.
+//
+// Challenge: this approach is intrusive.
+//
+// 2. Make dummy entry size configurable and set it small in the test
+//
+// Benefit: It increases the precision of cache charging and therefore
+// small memory usage can still trigger insertion of dummy entry.
+//
+// Challenge: it requires changing CacheReservationManager-related APIs,
+// and a hack might be needed to control the size of the dummy entry of
+// the CacheReservationManager used in filter construction for testing,
+// since CacheReservationManager is not exposed at the high level.
+//
+TEST_P(ChargeFilterConstructionTestWithParam, Basic) {
+ Options options = CurrentOptions();
+ // We set write_buffer_size big enough so that, when filter construction
+ // cache charging is in effect, no flush is triggered before we manually
+ // trigger it, for clean testing
+ options.write_buffer_size = 640 << 20;
+ BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ std::shared_ptr<
+ TargetCacheChargeTrackingCache<CacheEntryRole::kFilterConstruction>>
+ cache = GetCache();
+ options.create_if_missing = true;
+ // Disable auto compaction to prevent its unexpected side effects
+ // on the number of keys per partition we designed for in the test
+ options.disable_auto_compactions = true;
+ DestroyAndReopen(options);
+ int num_key = static_cast<int>(GetNumKey());
+ for (int i = 0; i < num_key; i++) {
+ ASSERT_OK(Put(Key(i), Key(i)));
+ }
+
+ ASSERT_EQ(cache->GetChargedCacheIncrementSum(), 0)
+ << "Flush was triggered too early in the test case with filter "
+ "construction cache charging - please make sure no flush triggered "
+ "during the key insertions above";
+
+ ASSERT_OK(Flush());
+
+ bool charge_filter_construction = (ChargeFilterConstructMemory() ==
+ CacheEntryRoleOptions::Decision::kEnabled);
+ std::string policy = GetFilterPolicy();
+ bool partition_filters = PartitionFilters();
+ bool detect_filter_construct_corruption =
+ table_options.detect_filter_construct_corruption;
+
+ std::deque<std::size_t> filter_construction_cache_res_peaks =
+ cache->GetChargedCachePeaks();
+ std::size_t filter_construction_cache_res_increments_sum =
+ cache->GetChargedCacheIncrementSum();
+
+ if (!charge_filter_construction) {
+ EXPECT_EQ(filter_construction_cache_res_peaks.size(), 0);
+ return;
+ }
+
+ if (policy == kLegacyBloom) {
+ EXPECT_EQ(filter_construction_cache_res_peaks.size(), 0)
+ << "There shouldn't be filter construction cache charging as this "
+ "feature does not support kLegacyBloom";
+ return;
+ }
+
+ const std::size_t kDummyEntrySize = CacheReservationManagerImpl<
+ CacheEntryRole::kFilterConstruction>::GetDummyEntrySize();
+
+ const std::size_t predicted_hash_entries_cache_res =
+ num_key * sizeof(FilterConstructionReserveMemoryHash);
+ ASSERT_EQ(predicted_hash_entries_cache_res % kDummyEntrySize, 0)
+ << "It's by this test's design that predicted_hash_entries_cache_res is "
+ "a multipe of dummy entry";
+
+ const std::size_t predicted_hash_entries_cache_res_dummy_entry_num =
+ predicted_hash_entries_cache_res / kDummyEntrySize;
+ const std::size_t predicted_final_filter_cache_res =
+ static_cast<std::size_t>(
+ std::ceil(1.0 * predicted_hash_entries_cache_res_dummy_entry_num / 6 *
+ (policy == kStandard128Ribbon ? 0.7 : 1))) *
+ kDummyEntrySize;
+ const std::size_t predicted_banding_cache_res =
+ static_cast<std::size_t>(
+ std::ceil(predicted_hash_entries_cache_res_dummy_entry_num * 2.5)) *
+ kDummyEntrySize;
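+
+ // Rough numeric illustration of the predictions above (a sketch, assuming
+ // a 256 KiB dummy entry and 8-byte hash entries, which are illustrative
+ // assumptions): for kStandard128Ribbon + FullFilter, the hash entries
+ // amount to 12 dummy entries, so
+ //   predicted_final_filter_cache_res = ceil(12 / 6 * 0.7) = 2 dummy entries
+ //   predicted_banding_cache_res      = ceil(12 * 2.5)     = 30 dummy entries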
+
+ if (policy == kFastLocalBloom) {
+ /* kFastLocalBloom + FullFilter
+ * p0
+ * / \
+ * b / \
+ * / \
+ * / \
+ * 0/ \
+ * hash entries = b - 0, final filter = p0 - b
+ * p0 = hash entries + final filter
+ *
+ * The test is designed in a way such that the reservation for b is a
+ * multiple of dummy entries so that reservation for (p0 - b)
+ * will trigger at least another dummy entry insertion.
+ *
+ * kFastLocalBloom + FullFilter +
+ * detect_filter_construct_corruption
+ * The peak p0 stays the same as
+ * (kFastLocalBloom + FullFilter) but just lasts
+ * longer since we release hash entries reservation later.
+ *
+ * kFastLocalBloom + PartitionedFilter
+ * p1
+ * / \
+ * p0 b'/ \
+ * / \ / \
+ * b / \ / \
+ * / \ / \
+ * / a \
+ * 0/ \
+ * partitioned hash entries1 = b - 0, partitioned hash entries2 = b' - a
+ * partitioned final filter1 = p0 - b, partitioned final filter2 = p1 - b'
+ *
+ * (increment p0 - 0) + (increment p1 - a)
+ * = partitioned hash entries1 + partitioned hash entries2
+ * + partitioned final filter1 + partitioned final filter2
+ * = hash entries + final filter
+ *
+ * kFastLocalBloom + PartitionedFilter +
+ * detect_filter_construct_corruption
+ * The peaks p0 and p1 stay the same as
+ * (kFastLocalBloom + PartitionedFilter) but just
+ * last longer since we release hash entries reservation later.
+ *
+ */
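+ // Numeric illustration for the FullFilter case (same illustrative size
+ // assumptions as above): the hash entries reserve exactly 1 dummy entry
+ // (b), and charging the final filter (~1/6 of that, rounded up to a whole
+ // dummy entry) raises the reservation to about 2 dummy entries at p0,
+ // which is why the peak is expected to exceed
+ // predicted_hash_entries_cache_res below.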
+ if (!partition_filters) {
+ EXPECT_EQ(filter_construction_cache_res_peaks.size(), 1)
+ << "Filter construction cache charging should have only 1 peak in "
+ "case: kFastLocalBloom + FullFilter";
+ std::size_t filter_construction_cache_res_peak =
+ filter_construction_cache_res_peaks[0];
+ EXPECT_GT(filter_construction_cache_res_peak,
+ predicted_hash_entries_cache_res)
+ << "The testing number of hash entries is designed to make hash "
+ "entries cache charging be multiples of dummy entries"
+ " so the correct behavior of charging final filter on top of it"
+ " should've triggered at least another dummy entry insertion";
+
+ std::size_t predicted_filter_construction_cache_res_peak =
+ predicted_hash_entries_cache_res + predicted_final_filter_cache_res;
+ EXPECT_GE(filter_construction_cache_res_peak,
+ predicted_filter_construction_cache_res_peak * 0.9);
+ EXPECT_LE(filter_construction_cache_res_peak,
+ predicted_filter_construction_cache_res_peak * 1.1);
+ return;
+ } else {
+ EXPECT_GE(filter_construction_cache_res_peaks.size(), 2)
+ << "Filter construction cache charging should have multiple peaks "
+ "in case: kFastLocalBloom + "
+ "PartitionedFilter";
+ std::size_t predicted_filter_construction_cache_res_increments_sum =
+ predicted_hash_entries_cache_res + predicted_final_filter_cache_res;
+ EXPECT_GE(filter_construction_cache_res_increments_sum,
+ predicted_filter_construction_cache_res_increments_sum * 0.9);
+ EXPECT_LE(filter_construction_cache_res_increments_sum,
+ predicted_filter_construction_cache_res_increments_sum * 1.1);
+ return;
+ }
+ }
+
+ if (policy == kStandard128Ribbon) {
+ /* kStandard128Ribbon + FullFilter
+ * p0
+ * / \ p1
+ * / \/\
+ * b / b' \
+ * / \
+ * 0/ \
+ * hash entries = b - 0, banding = p0 - b, final filter = p1 - b'
+ * p0 = hash entries + banding
+ *
+ * The test is designed in a way such that the reservation for (p1 - b')
+ * will trigger at least another dummy entry insertion
+ * (or equivalently, creating another peak).
+ *
+ * kStandard128Ribbon + FullFilter +
+ * detect_filter_construct_corruption
+ *
+ * new p0
+ * / \
+ * / \
+ * pre p0 \
+ * / \
+ * / \
+ * b / \
+ * / \
+ * 0/ \
+ * hash entries = b - 0, banding = pre p0 - b,
+ * final filter = new p0 - pre p0
+ * new p0 = hash entries + banding + final filter
+ *
+ * The previous p0 will no longer be a peak since under
+ * detect_filter_construct_corruption == true, we do not release hash
+ * entries reservation (like p0 - b' previously) until after final filter
+ * creation and post-verification
+ *
+ * kStandard128Ribbon + PartitionedFilter
+ * p3
+ * p0 /\ p4
+ * / \ p1 / \ /\
+ * / \/\ b''/ a' \
+ * b / b' \ / \
+ * / \ / \
+ * 0/ a \
+ * partitioned hash entries1 = b - 0, partitioned hash entries2 = b'' - a
+ * partitioned banding1 = p0 - b, partitioned banding2 = p3 - b''
+ * partitioned final filter1 = p1 - b', partitioned final filter2 = p4 - a'
+ *
+ * (increment p0 - 0) + (increment p1 - b')
+ * + (increment p3 - a) + (increment p4 - a')
+ * = partitioned hash entries1 + partitioned hash entries2
+ * + partitioned banding1 + partitioned banding2
+ * + partitioned final filter1 + partitioned final filter2
+ * = hash entries + banding + final filter
+ *
+ * kStandard128Ribbon + PartitionedFilter +
+ * detect_filter_construct_corruption
+ *
+ * new p3
+ * / \
+ * pre p3 \
+ * new p0 / \
+ * / \ / \
+ * pre p0 \ / \
+ * / \ b'/ \
+ * / \ / \
+ * b / \ / \
+ * / \a \
+ * 0/ \
+ * partitioned hash entries1 = b - 0, partitioned hash entries2 = b' - a
+ * partitioned banding1 = pre p0 - b, partitioned banding2 = pre p3 - b'
+ * partitioned final filter1 = new p0 - pre p0,
+ * partitioned final filter2 = new p3 - pre p3
+ *
+ * The previous p0 and p3 will no longer be peaks since, under
+ * detect_filter_construct_corruption == true, we do not release hash
+ * entries reservation (like p0 - b', p3 - a' previously) until after
+ * partitioned final filter creation and post-verification
+ *
+ * However, the increments sum stays the same, as shown below:
+ * (increment new p0 - 0) + (increment new p3 - a)
+ * = partitioned hash entries1 + partitioned hash entries2
+ * + partitioned banding1 + partitioned banding2
+ * + partitioned final filter1 + partitioned final filter2
+ * = hash entries + banding + final filter
+ *
+ */
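+ // Numeric illustration for the FullFilter case (same illustrative size
+ // assumptions as above): hash entries reserve about 12 dummy entries (b),
+ // banding adds about 30 more (p0), and the final filter adds about 2.
+ // Without corruption detection the final filter is charged after the hash
+ // entries are released, producing the separate smaller peak p1; with
+ // detection the hash entries are held, so everything folds into one
+ // larger peak.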
+ if (!partition_filters) {
+ ASSERT_GE(
+ std::floor(
+ 1.0 * predicted_final_filter_cache_res /
+ CacheReservationManagerImpl<
+ CacheEntryRole::kFilterConstruction>::GetDummyEntrySize()),
+ 1)
+ << "Final filter cache charging too small for this test - please "
+ "increase the number of keys";
+ if (!detect_filter_construct_corruption) {
+ EXPECT_EQ(filter_construction_cache_res_peaks.size(), 2)
+ << "Filter construction cache charging should have 2 peaks in "
+ "case: kStandard128Ribbon + "
+ "FullFilter. "
+ "The second peak is resulted from charging the final filter "
+ "after "
+ "decreasing the hash entry reservation since the testing final "
+ "filter reservation is designed to be at least 1 dummy entry "
+ "size";
+
+ std::size_t filter_construction_cache_res_peak =
+ filter_construction_cache_res_peaks[0];
+ std::size_t predicted_filter_construction_cache_res_peak =
+ predicted_hash_entries_cache_res + predicted_banding_cache_res;
+ EXPECT_GE(filter_construction_cache_res_peak,
+ predicted_filter_construction_cache_res_peak * 0.9);
+ EXPECT_LE(filter_construction_cache_res_peak,
+ predicted_filter_construction_cache_res_peak * 1.1);
+ } else {
+ EXPECT_EQ(filter_construction_cache_res_peaks.size(), 1)
+ << "Filter construction cache charging should have 1 peaks in "
+ "case: kStandard128Ribbon + FullFilter "
+ "+ detect_filter_construct_corruption. "
+ "The previous second peak now disappears since we don't "
+ "decrease the hash entry reservation"
+ "until after final filter reservation and post-verification";
+
+ std::size_t filter_construction_cache_res_peak =
+ filter_construction_cache_res_peaks[0];
+ std::size_t predicted_filter_construction_cache_res_peak =
+ predicted_hash_entries_cache_res + predicted_banding_cache_res +
+ predicted_final_filter_cache_res;
+ EXPECT_GE(filter_construction_cache_res_peak,
+ predicted_filter_construction_cache_res_peak * 0.9);
+ EXPECT_LE(filter_construction_cache_res_peak,
+ predicted_filter_construction_cache_res_peak * 1.1);
+ }
+ return;
+ } else {
+ if (!detect_filter_construct_corruption) {
+ EXPECT_GE(filter_construction_cache_res_peaks.size(), 3)
+ << "Filter construction cache charging should have more than 3 "
+ "peaks "
+ "in case: kStandard128Ribbon + "
+ "PartitionedFilter";
+ } else {
+ EXPECT_GE(filter_construction_cache_res_peaks.size(), 2)
+ << "Filter construction cache charging should have more than 2 "
+ "peaks "
+ "in case: kStandard128Ribbon + "
+ "PartitionedFilter + detect_filter_construct_corruption";
+ }
+ std::size_t predicted_filter_construction_cache_res_increments_sum =
+ predicted_hash_entries_cache_res + predicted_banding_cache_res +
+ predicted_final_filter_cache_res;
+ EXPECT_GE(filter_construction_cache_res_increments_sum,
+ predicted_filter_construction_cache_res_increments_sum * 0.9);
+ EXPECT_LE(filter_construction_cache_res_increments_sum,
+ predicted_filter_construction_cache_res_increments_sum * 1.1);
+ return;
+ }
+ }
+}
+
+class DBFilterConstructionCorruptionTestWithParam
+ : public DBTestBase,
+ public testing::WithParamInterface<
+ std::tuple<bool /* detect_filter_construct_corruption */, std::string,
+ bool /* partition_filters */>> {
+ public:
+ DBFilterConstructionCorruptionTestWithParam()
+ : DBTestBase("db_bloom_filter_tests",
+ /*env_do_fsync=*/true) {}
+
+ BlockBasedTableOptions GetBlockBasedTableOptions() {
+ BlockBasedTableOptions table_options;
+ table_options.detect_filter_construct_corruption = std::get<0>(GetParam());
+ table_options.filter_policy = Create(10, std::get<1>(GetParam()));
+ table_options.partition_filters = std::get<2>(GetParam());
+ if (table_options.partition_filters) {
+ table_options.index_type =
+ BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+ // We set table_options.metadata_block_size small enough so we can
+ // trigger filter partitioning with GetNumKey() keys
+ table_options.metadata_block_size = 10;
+ }
+
+ return table_options;
+ }
+
+ // Return an appropriate number of keys for testing
+ // to generate a long filter (i.e., size >= 8 + kMetadataLen)
+ std::size_t GetNumKey() { return 5000; }
+};
+
+INSTANTIATE_TEST_CASE_P(
+ DBFilterConstructionCorruptionTestWithParam,
+ DBFilterConstructionCorruptionTestWithParam,
+ ::testing::Values(std::make_tuple(false, kFastLocalBloom, false),
+ std::make_tuple(true, kFastLocalBloom, false),
+ std::make_tuple(true, kFastLocalBloom, true),
+ std::make_tuple(true, kStandard128Ribbon, false),
+ std::make_tuple(true, kStandard128Ribbon, true)));
+
+TEST_P(DBFilterConstructionCorruptionTestWithParam, DetectCorruption) {
+ Options options = CurrentOptions();
+ BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ options.create_if_missing = true;
+ options.disable_auto_compactions = true;
+
+ DestroyAndReopen(options);
+ int num_key = static_cast<int>(GetNumKey());
+ Status s;
+
+ // Case 1: No corruption in filter construction
+ for (int i = 0; i < num_key; i++) {
+ ASSERT_OK(Put(Key(i), Key(i)));
+ }
+ s = Flush();
+ EXPECT_TRUE(s.ok());
+
+ // Case 2: Corruption of hash entries in filter construction
+ for (int i = 0; i < num_key; i++) {
+ ASSERT_OK(Put(Key(i), Key(i)));
+ }
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "XXPH3FilterBitsBuilder::Finish::TamperHashEntries", [&](void* arg) {
+ std::deque<uint64_t>* hash_entries_to_corrupt =
+ (std::deque<uint64_t>*)arg;
+ assert(!hash_entries_to_corrupt->empty());
+ *(hash_entries_to_corrupt->begin()) =
+ *(hash_entries_to_corrupt->begin()) ^ uint64_t { 1 };
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ s = Flush();
+
+ if (table_options.detect_filter_construct_corruption) {
+ EXPECT_TRUE(s.IsCorruption());
+ EXPECT_TRUE(
+ s.ToString().find("Filter's hash entries checksum mismatched") !=
+ std::string::npos);
+ } else {
+ EXPECT_TRUE(s.ok());
+ }
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearCallBack(
+ "XXPH3FilterBitsBuilder::Finish::"
+ "TamperHashEntries");
+
+ // Case 3: Corruption of filter content in filter construction
+ DestroyAndReopen(options);
+
+ for (int i = 0; i < num_key; i++) {
+ ASSERT_OK(Put(Key(i), Key(i)));
+ }
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "XXPH3FilterBitsBuilder::Finish::TamperFilter", [&](void* arg) {
+ std::pair<std::unique_ptr<char[]>*, std::size_t>* TEST_arg_pair =
+ (std::pair<std::unique_ptr<char[]>*, std::size_t>*)arg;
+ std::size_t filter_size = TEST_arg_pair->second;
+ // 8 is the number of bytes zeroed out below and 5 is kMetadataLen; the
+ // filter must be long enough that the corruption hits filter content
+ // rather than the metadata trailer
+ assert(filter_size >= 8 + 5);
+ std::unique_ptr<char[]>* filter_content_to_corrupt =
+ TEST_arg_pair->first;
+ std::memset(filter_content_to_corrupt->get(), '\0', 8);
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ s = Flush();
+
+ if (table_options.detect_filter_construct_corruption) {
+ EXPECT_TRUE(s.IsCorruption());
+ EXPECT_TRUE(s.ToString().find("Corrupted filter content") !=
+ std::string::npos);
+ } else {
+ EXPECT_TRUE(s.ok());
+ }
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearCallBack(
+ "XXPH3FilterBitsBuilder::Finish::"
+ "TamperFilter");
+}
+
+// RocksDB lite does not support dynamic options
+#ifndef ROCKSDB_LITE
+TEST_P(DBFilterConstructionCorruptionTestWithParam,
+ DynamicallyTurnOnAndOffDetectConstructCorruption) {
+ Options options = CurrentOptions();
+ BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+ // We intend to turn on
+ // table_options.detect_filter_construct_corruption dynamically;
+ // therefore we override this test parameter's value
+ table_options.detect_filter_construct_corruption = false;
+
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ options.create_if_missing = true;
+
+ int num_key = static_cast<int>(GetNumKey());
+ Status s;
+
+ DestroyAndReopen(options);
+
+ // Case 1: !table_options.detect_filter_construct_corruption
+ for (int i = 0; i < num_key; i++) {
+ ASSERT_OK(Put(Key(i), Key(i)));
+ }
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "XXPH3FilterBitsBuilder::Finish::TamperHashEntries", [&](void* arg) {
+ std::deque<uint64_t>* hash_entries_to_corrupt =
+ (std::deque<uint64_t>*)arg;
+ assert(!hash_entries_to_corrupt->empty());
+ *(hash_entries_to_corrupt->begin()) =
+ *(hash_entries_to_corrupt->begin()) ^ uint64_t { 1 };
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ s = Flush();
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearCallBack(
+ "XXPH3FilterBitsBuilder::Finish::"
+ "TamperHashEntries");
+
+ ASSERT_FALSE(table_options.detect_filter_construct_corruption);
+ EXPECT_TRUE(s.ok());
+
+ // Case 2: dynamically turn on
+ // table_options.detect_filter_construct_corruption
+ ASSERT_OK(db_->SetOptions({{"block_based_table_factory",
+ "{detect_filter_construct_corruption=true;}"}}));
+
+ for (int i = 0; i < num_key; i++) {
+ ASSERT_OK(Put(Key(i), Key(i)));
+ }
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "XXPH3FilterBitsBuilder::Finish::TamperHashEntries", [&](void* arg) {
+ std::deque<uint64_t>* hash_entries_to_corrupt =
+ (std::deque<uint64_t>*)arg;
+ assert(!hash_entries_to_corrupt->empty());
+ *(hash_entries_to_corrupt->begin()) =
+ *(hash_entries_to_corrupt->begin()) ^ uint64_t { 1 };
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ s = Flush();
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearCallBack(
+ "XXPH3FilterBitsBuilder::Finish::"
+ "TamperHashEntries");
+
+ auto updated_table_options =
+ db_->GetOptions().table_factory->GetOptions<BlockBasedTableOptions>();
+ EXPECT_TRUE(updated_table_options->detect_filter_construct_corruption);
+ EXPECT_TRUE(s.IsCorruption());
+ EXPECT_TRUE(s.ToString().find("Filter's hash entries checksum mismatched") !=
+ std::string::npos);
+
+ // Case 3: dynamically turn off
+ // table_options.detect_filter_construct_corruption
+ ASSERT_OK(db_->SetOptions({{"block_based_table_factory",
+ "{detect_filter_construct_corruption=false;}"}}));
+ updated_table_options =
+ db_->GetOptions().table_factory->GetOptions<BlockBasedTableOptions>();
+ EXPECT_FALSE(updated_table_options->detect_filter_construct_corruption);
+}
+#endif // ROCKSDB_LITE
+
+namespace {
+// NOTE: This class is referenced by HISTORY.md as a model for a wrapper
+// FilterPolicy selecting among configurations based on context.
+class LevelAndStyleCustomFilterPolicy : public FilterPolicy {
+ public:
+ explicit LevelAndStyleCustomFilterPolicy(int bpk_fifo, int bpk_l0_other,
+ int bpk_otherwise)
+ : policy_fifo_(NewBloomFilterPolicy(bpk_fifo)),
+ policy_l0_other_(NewBloomFilterPolicy(bpk_l0_other)),
+ policy_otherwise_(NewBloomFilterPolicy(bpk_otherwise)) {}
+
+ const char* Name() const override {
+ return "LevelAndStyleCustomFilterPolicy";
+ }
+
+ // OK to use built-in policy name because we are deferring to a
+ // built-in builder. We aren't changing the serialized format.
+ const char* CompatibilityName() const override {
+ return policy_fifo_->CompatibilityName();
+ }
+
+ FilterBitsBuilder* GetBuilderWithContext(
+ const FilterBuildingContext& context) const override {
+ if (context.compaction_style == kCompactionStyleFIFO) {
+ return policy_fifo_->GetBuilderWithContext(context);
+ } else if (context.level_at_creation == 0) {
+ return policy_l0_other_->GetBuilderWithContext(context);
+ } else {
+ return policy_otherwise_->GetBuilderWithContext(context);
+ }
+ }
+
+ FilterBitsReader* GetFilterBitsReader(const Slice& contents) const override {
+ // OK to defer to any of them; they all can parse built-in filters
+ // from any settings.
+ return policy_fifo_->GetFilterBitsReader(contents);
+ }
+
+ private:
+ const std::unique_ptr<const FilterPolicy> policy_fifo_;
+ const std::unique_ptr<const FilterPolicy> policy_l0_other_;
+ const std::unique_ptr<const FilterPolicy> policy_otherwise_;
+};
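+
+// A minimal usage sketch for the wrapper policy above (illustration only,
+// not exercised by these tests; the bits-per-key values are arbitrary):
+//
+//   Options options;
+//   BlockBasedTableOptions bbto;
+//   bbto.filter_policy = std::make_shared<LevelAndStyleCustomFilterPolicy>(
+//       /*bpk_fifo=*/15, /*bpk_l0_other=*/8, /*bpk_otherwise=*/5);
+//   options.table_factory.reset(NewBlockBasedTableFactory(bbto));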
+
+static std::map<TableFileCreationReason, std::string>
+ table_file_creation_reason_to_string{
+ {TableFileCreationReason::kCompaction, "kCompaction"},
+ {TableFileCreationReason::kFlush, "kFlush"},
+ {TableFileCreationReason::kMisc, "kMisc"},
+ {TableFileCreationReason::kRecovery, "kRecovery"},
+ };
+
+class TestingContextCustomFilterPolicy
+ : public LevelAndStyleCustomFilterPolicy {
+ public:
+ explicit TestingContextCustomFilterPolicy(int bpk_fifo, int bpk_l0_other,
+ int bpk_otherwise)
+ : LevelAndStyleCustomFilterPolicy(bpk_fifo, bpk_l0_other, bpk_otherwise) {
+ }
+
+ FilterBitsBuilder* GetBuilderWithContext(
+ const FilterBuildingContext& context) const override {
+ test_report_ += "cf=";
+ test_report_ += context.column_family_name;
+ test_report_ += ",s=";
+ test_report_ +=
+ OptionsHelper::compaction_style_to_string[context.compaction_style];
+ test_report_ += ",n=";
+ test_report_ += std::to_string(context.num_levels);
+ test_report_ += ",l=";
+ test_report_ += std::to_string(context.level_at_creation);
+ test_report_ += ",b=";
+ test_report_ += std::to_string(int{context.is_bottommost});
+ test_report_ += ",r=";
+ test_report_ += table_file_creation_reason_to_string[context.reason];
+ test_report_ += "\n";
+
+ return LevelAndStyleCustomFilterPolicy::GetBuilderWithContext(context);
+ }
+
+ std::string DumpTestReport() {
+ std::string rv;
+ std::swap(rv, test_report_);
+ return rv;
+ }
+
+ private:
+ mutable std::string test_report_;
+};
+} // anonymous namespace
+
+TEST_F(DBBloomFilterTest, ContextCustomFilterPolicy) {
+ auto policy = std::make_shared<TestingContextCustomFilterPolicy>(15, 8, 5);
+ Options options;
+ for (bool fifo : {true, false}) {
+ options = CurrentOptions();
+ options.max_open_files = fifo ? -1 : options.max_open_files;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ options.compaction_style =
+ fifo ? kCompactionStyleFIFO : kCompactionStyleLevel;
+
+ BlockBasedTableOptions table_options;
+ table_options.filter_policy = policy;
+ table_options.format_version = 5;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ TryReopen(options);
+ CreateAndReopenWithCF({fifo ? "abe" : "bob"}, options);
+
+ const int maxKey = 10000;
+ for (int i = 0; i < maxKey / 2; i++) {
+ ASSERT_OK(Put(1, Key(i), Key(i)));
+ }
+ // Add a large key to make the file contain a wide range
+ ASSERT_OK(Put(1, Key(maxKey + 55555), Key(maxKey + 55555)));
+ Flush(1);
+ EXPECT_EQ(policy->DumpTestReport(),
+ fifo ? "cf=abe,s=kCompactionStyleFIFO,n=7,l=0,b=0,r=kFlush\n"
+ : "cf=bob,s=kCompactionStyleLevel,n=7,l=0,b=0,r=kFlush\n");
+
+ for (int i = maxKey / 2; i < maxKey; i++) {
+ ASSERT_OK(Put(1, Key(i), Key(i)));
+ }
+ Flush(1);
+ EXPECT_EQ(policy->DumpTestReport(),
+ fifo ? "cf=abe,s=kCompactionStyleFIFO,n=7,l=0,b=0,r=kFlush\n"
+ : "cf=bob,s=kCompactionStyleLevel,n=7,l=0,b=0,r=kFlush\n");
+
+ // Check that they can be found
+ for (int i = 0; i < maxKey; i++) {
+ ASSERT_EQ(Key(i), Get(1, Key(i)));
+ }
+ // Since we have two tables / two filters, we might have Bloom checks on
+ // our queries, but no more than one "useful" per query on a found key.
+ EXPECT_LE(TestGetAndResetTickerCount(options, BLOOM_FILTER_USEFUL), maxKey);
+
+ // Check that we have two filters, each about
+ // fifo: 0.12% FP rate (15 bits per key)
+ // level: 2.3% FP rate (8 bits per key)
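+ // (For reference: an idealized Bloom filter with b bits/key has a
+ // theoretical FP rate of roughly 0.6185^b, i.e. about 0.07% at 15 and
+ // about 2.1% at 8; the figures quoted above are the empirical rates of
+ // the actual implementation, which deviates slightly from the ideal.)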
+ for (int i = 0; i < maxKey; i++) {
+ ASSERT_EQ("NOT_FOUND", Get(1, Key(i + 33333)));
+ }
+ {
+ auto useful_count =
+ TestGetAndResetTickerCount(options, BLOOM_FILTER_USEFUL);
+ EXPECT_GE(useful_count, maxKey * 2 * (fifo ? 0.9980 : 0.975));
+ EXPECT_LE(useful_count, maxKey * 2 * (fifo ? 0.9995 : 0.98));
+ }
+
+ if (!fifo) { // FIFO doesn't fully support CompactRange
+ // Full compaction
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), handles_[1], nullptr,
+ nullptr));
+ EXPECT_EQ(policy->DumpTestReport(),
+ "cf=bob,s=kCompactionStyleLevel,n=7,l=1,b=1,r=kCompaction\n");
+
+ // Check that we now have one filter, about 9.2% FP rate (5 bits per key)
+ for (int i = 0; i < maxKey; i++) {
+ ASSERT_EQ("NOT_FOUND", Get(1, Key(i + 33333)));
+ }
+ {
+ auto useful_count =
+ TestGetAndResetTickerCount(options, BLOOM_FILTER_USEFUL);
+ EXPECT_GE(useful_count, maxKey * 0.90);
+ EXPECT_LE(useful_count, maxKey * 0.91);
+ }
+ } else {
+#ifndef ROCKSDB_LITE
+ // Also try external SST file
+ {
+ std::string file_path = dbname_ + "/external.sst";
+ SstFileWriter sst_file_writer(EnvOptions(), options, handles_[1]);
+ ASSERT_OK(sst_file_writer.Open(file_path));
+ ASSERT_OK(sst_file_writer.Put("key", "value"));
+ ASSERT_OK(sst_file_writer.Finish());
+ }
+ // Note: kCompactionStyleLevel is default, ignored if num_levels == -1
+ EXPECT_EQ(policy->DumpTestReport(),
+ "cf=abe,s=kCompactionStyleLevel,n=-1,l=-1,b=0,r=kMisc\n");
+#endif
+ }
+
+ // Destroy
+ ASSERT_OK(dbfull()->DropColumnFamily(handles_[1]));
+ ASSERT_OK(dbfull()->DestroyColumnFamilyHandle(handles_[1]));
+ handles_[1] = nullptr;
+ }
+}
+
+class SliceTransformLimitedDomain : public SliceTransform {
+ const char* Name() const override { return "SliceTransformLimitedDomain"; }
+
+ Slice Transform(const Slice& src) const override {
+ return Slice(src.data(), 5);
+ }
+
+ bool InDomain(const Slice& src) const override {
+ // prefix will be x????
+ return src.size() >= 5 && src[0] == 'x';
+ }
+
+ bool InRange(const Slice& dst) const override {
+ // prefix will be x????
+ return dst.size() == 5 && dst[0] == 'x';
+ }
+};
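+
+// Behavior sketch for the transform above (for reference, derived from the
+// code): Transform("x1111_AAAA") yields the 5-byte prefix "x1111", while
+// "zzzzz_AAAA" is not InDomain (it does not start with 'x'), so no prefix
+// for such keys is added to the prefix bloom filter.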
+
+TEST_F(DBBloomFilterTest, PrefixExtractorWithFilter1) {
+ BlockBasedTableOptions bbto;
+ bbto.filter_policy.reset(ROCKSDB_NAMESPACE::NewBloomFilterPolicy(10));
+ bbto.whole_key_filtering = false;
+
+ Options options = CurrentOptions();
+ options.prefix_extractor = std::make_shared<SliceTransformLimitedDomain>();
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put("x1111_AAAA", "val1"));
+ ASSERT_OK(Put("x1112_AAAA", "val2"));
+ ASSERT_OK(Put("x1113_AAAA", "val3"));
+ ASSERT_OK(Put("x1114_AAAA", "val4"));
+ // Not in domain, won't be added to filter
+ ASSERT_OK(Put("zzzzz_AAAA", "val5"));
+
+ ASSERT_OK(Flush());
+
+ ASSERT_EQ(Get("x1111_AAAA"), "val1");
+ ASSERT_EQ(Get("x1112_AAAA"), "val2");
+ ASSERT_EQ(Get("x1113_AAAA"), "val3");
+ ASSERT_EQ(Get("x1114_AAAA"), "val4");
+ // Was not added to the filter, but RocksDB will try to read it from the filter
+ ASSERT_EQ(Get("zzzzz_AAAA"), "val5");
+}
+
+TEST_F(DBBloomFilterTest, PrefixExtractorWithFilter2) {
+ BlockBasedTableOptions bbto;
+ bbto.filter_policy.reset(ROCKSDB_NAMESPACE::NewBloomFilterPolicy(10));
+
+ Options options = CurrentOptions();
+ options.prefix_extractor = std::make_shared<SliceTransformLimitedDomain>();
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put("x1113_AAAA", "val3"));
+ ASSERT_OK(Put("x1114_AAAA", "val4"));
+ // Not in domain, won't be added to filter
+ ASSERT_OK(Put("zzzzz_AAAA", "val1"));
+ ASSERT_OK(Put("zzzzz_AAAB", "val2"));
+ ASSERT_OK(Put("zzzzz_AAAC", "val3"));
+ ASSERT_OK(Put("zzzzz_AAAD", "val4"));
+
+ ASSERT_OK(Flush());
+
+ std::vector<std::string> iter_res;
+ auto iter = db_->NewIterator(ReadOptions());
+ // Seek to a key that was not in Domain
+ for (iter->Seek("zzzzz_AAAA"); iter->Valid(); iter->Next()) {
+ iter_res.emplace_back(iter->value().ToString());
+ }
+
+ std::vector<std::string> expected_res = {"val1", "val2", "val3", "val4"};
+ ASSERT_EQ(iter_res, expected_res);
+ delete iter;
+}
+
+TEST_F(DBBloomFilterTest, MemtableWholeKeyBloomFilter) {
+ // Regression test for #2743: the range delete tombstones in memtable should
+ // be added even when Get() skips searching due to its prefix bloom filter
+ const int kMemtableSize = 1 << 20; // 1MB
+ const int kMemtablePrefixFilterSize = 1 << 13; // 8KB
+ const int kPrefixLen = 4;
+ Options options = CurrentOptions();
+ options.memtable_prefix_bloom_size_ratio =
+ static_cast<double>(kMemtablePrefixFilterSize) / kMemtableSize;
+ options.prefix_extractor.reset(
+ ROCKSDB_NAMESPACE::NewFixedPrefixTransform(kPrefixLen));
+ options.write_buffer_size = kMemtableSize;
+ options.memtable_whole_key_filtering = false;
+ Reopen(options);
+ std::string key1("AAAABBBB");
+ std::string key2("AAAACCCC"); // not in DB
+ std::string key3("AAAADDDD");
+ std::string key4("AAAAEEEE");
+ std::string value1("Value1");
+ std::string value3("Value3");
+ std::string value4("Value4");
+
+ ASSERT_OK(Put(key1, value1, WriteOptions()));
+
+ // check memtable bloom stats
+ ASSERT_EQ("NOT_FOUND", Get(key2));
+ ASSERT_EQ(0, get_perf_context()->bloom_memtable_miss_count);
+ // same prefix, bloom filter false positive
+ ASSERT_EQ(1, get_perf_context()->bloom_memtable_hit_count);
+
+ // enable whole key bloom filter
+ options.memtable_whole_key_filtering = true;
+ Reopen(options);
+ // check memtable bloom stats
+ ASSERT_OK(Put(key3, value3, WriteOptions()));
+ ASSERT_EQ("NOT_FOUND", Get(key2));
+ // whole key bloom filter kicks in and determines it's a miss
+ ASSERT_EQ(1, get_perf_context()->bloom_memtable_miss_count);
+ ASSERT_EQ(1, get_perf_context()->bloom_memtable_hit_count);
+
+ // verify whole key filtering does not depend on prefix_extractor
+ options.prefix_extractor.reset();
+ Reopen(options);
+ // check memtable bloom stats
+ ASSERT_OK(Put(key4, value4, WriteOptions()));
+ ASSERT_EQ("NOT_FOUND", Get(key2));
+ // whole key bloom filter kicks in and determines it's a miss
+ ASSERT_EQ(2, get_perf_context()->bloom_memtable_miss_count);
+ ASSERT_EQ(1, get_perf_context()->bloom_memtable_hit_count);
+}
+
+TEST_F(DBBloomFilterTest, MemtableWholeKeyBloomFilterMultiGet) {
+ Options options = CurrentOptions();
+ options.memtable_prefix_bloom_size_ratio = 0.015;
+ options.memtable_whole_key_filtering = true;
+ Reopen(options);
+ std::string key1("AA");
+ std::string key2("BB");
+ std::string key3("CC");
+ std::string key4("DD");
+ std::string key_not("EE");
+ std::string value1("Value1");
+ std::string value2("Value2");
+ std::string value3("Value3");
+ std::string value4("Value4");
+
+ ASSERT_OK(Put(key1, value1, WriteOptions()));
+ ASSERT_OK(Put(key2, value2, WriteOptions()));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put(key3, value3, WriteOptions()));
+ const Snapshot* snapshot = db_->GetSnapshot();
+ ASSERT_OK(Put(key4, value4, WriteOptions()));
+
+ // Delete key2 and key3
+ ASSERT_OK(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "BA", "CZ"));
+
+ // Read without snapshot
+ auto results = MultiGet({key_not, key1, key2, key3, key4});
+ ASSERT_EQ(results[0], "NOT_FOUND");
+ ASSERT_EQ(results[1], value1);
+ ASSERT_EQ(results[2], "NOT_FOUND");
+ ASSERT_EQ(results[3], "NOT_FOUND");
+ ASSERT_EQ(results[4], value4);
+
+ // Also check Get
+ ASSERT_EQ(Get(key1), value1);
+ ASSERT_EQ(Get(key2), "NOT_FOUND");
+ ASSERT_EQ(Get(key3), "NOT_FOUND");
+ ASSERT_EQ(Get(key4), value4);
+
+ // Read with snapshot
+ results = MultiGet({key_not, key1, key2, key3, key4}, snapshot);
+ ASSERT_EQ(results[0], "NOT_FOUND");
+ ASSERT_EQ(results[1], value1);
+ ASSERT_EQ(results[2], value2);
+ ASSERT_EQ(results[3], value3);
+ ASSERT_EQ(results[4], "NOT_FOUND");
+
+ // Also check Get
+ ASSERT_EQ(Get(key1, snapshot), value1);
+ ASSERT_EQ(Get(key2, snapshot), value2);
+ ASSERT_EQ(Get(key3, snapshot), value3);
+ ASSERT_EQ(Get(key4, snapshot), "NOT_FOUND");
+
+ db_->ReleaseSnapshot(snapshot);
+}
+
+TEST_F(DBBloomFilterTest, MemtablePrefixBloomOutOfDomain) {
+ constexpr size_t kPrefixSize = 8;
+ const std::string kKey = "key";
+ assert(kKey.size() < kPrefixSize);
+ Options options = CurrentOptions();
+ options.prefix_extractor.reset(NewFixedPrefixTransform(kPrefixSize));
+ options.memtable_prefix_bloom_size_ratio = 0.25;
+ Reopen(options);
+ ASSERT_OK(Put(kKey, "v"));
+ ASSERT_EQ("v", Get(kKey));
+ std::unique_ptr<Iterator> iter(dbfull()->NewIterator(ReadOptions()));
+ iter->Seek(kKey);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(kKey, iter->key());
+ iter->SeekForPrev(kKey);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(kKey, iter->key());
+}
+
+class DBBloomFilterTestVaryPrefixAndFormatVer
+ : public DBTestBase,
+ public testing::WithParamInterface<std::tuple<bool, uint32_t>> {
+ protected:
+ bool use_prefix_;
+ uint32_t format_version_;
+
+ public:
+ DBBloomFilterTestVaryPrefixAndFormatVer()
+ : DBTestBase("db_bloom_filter_tests", /*env_do_fsync=*/true) {}
+
+ ~DBBloomFilterTestVaryPrefixAndFormatVer() override {}
+
+ void SetUp() override {
+ use_prefix_ = std::get<0>(GetParam());
+ format_version_ = std::get<1>(GetParam());
+ }
+
+ static std::string UKey(uint32_t i) { return Key(static_cast<int>(i)); }
+};
+
+TEST_P(DBBloomFilterTestVaryPrefixAndFormatVer, PartitionedMultiGet) {
+ Options options = CurrentOptions();
+ if (use_prefix_) {
+ // Entire key from UKey()
+ options.prefix_extractor.reset(NewCappedPrefixTransform(9));
+ }
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ BlockBasedTableOptions bbto;
+ bbto.filter_policy.reset(NewBloomFilterPolicy(20));
+ bbto.partition_filters = true;
+ bbto.index_type = BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+ bbto.whole_key_filtering = !use_prefix_;
+ if (use_prefix_) { // (not related to prefix, just alternating between)
+ // Make sure code appropriately deals with metadata block size setting
+ // that is "too small" (smaller than minimum size for filter builder)
+ bbto.metadata_block_size = 63;
+ } else {
+ // Make sure the test will work even on platforms with large minimum
+ // filter size, due to large cache line size.
+ // (Largest cache line size + 10+% overhead.)
+ bbto.metadata_block_size = 290;
+ }
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ DestroyAndReopen(options);
+ ReadOptions ropts;
+
+ constexpr uint32_t N = 12000;
+ // Add N/2 evens
+ for (uint32_t i = 0; i < N; i += 2) {
+ ASSERT_OK(Put(UKey(i), UKey(i)));
+ }
+ ASSERT_OK(Flush());
+#ifndef ROCKSDB_LITE
+ ASSERT_EQ(TotalTableFiles(), 1);
+#endif
+
+ constexpr uint32_t Q = 29;
+ // MultiGet In
+ std::array<std::string, Q> keys;
+ std::array<Slice, Q> key_slices;
+ std::array<ColumnFamilyHandle*, Q> column_families;
+ // MultiGet Out
+ std::array<Status, Q> statuses;
+ std::array<PinnableSlice, Q> values;
+
+ TestGetAndResetTickerCount(options, BLOCK_CACHE_FILTER_HIT);
+ TestGetAndResetTickerCount(options, BLOCK_CACHE_FILTER_MISS);
+ TestGetAndResetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL);
+ TestGetAndResetTickerCount(options, BLOOM_FILTER_USEFUL);
+ TestGetAndResetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED);
+ TestGetAndResetTickerCount(options, BLOOM_FILTER_FULL_POSITIVE);
+ TestGetAndResetTickerCount(options, BLOOM_FILTER_FULL_TRUE_POSITIVE);
+
+ // Check that initial clump of keys only loads one partition filter from
+ // block cache.
+ // And that spread out keys load many partition filters.
+ // In both cases, mix present vs. not present keys.
+ for (uint32_t stride : {uint32_t{1}, (N / Q) | 1}) {
+ for (uint32_t i = 0; i < Q; ++i) {
+ keys[i] = UKey(i * stride);
+ key_slices[i] = Slice(keys[i]);
+ column_families[i] = db_->DefaultColumnFamily();
+ statuses[i] = Status();
+ values[i] = PinnableSlice();
+ }
+
+ db_->MultiGet(ropts, Q, &column_families[0], &key_slices[0], &values[0],
+ /*timestamps=*/nullptr, &statuses[0], true);
+
+ // Confirm correct status results
+ uint32_t number_not_found = 0;
+ for (uint32_t i = 0; i < Q; ++i) {
+ if ((i * stride % 2) == 0) {
+ ASSERT_OK(statuses[i]);
+ } else {
+ ASSERT_TRUE(statuses[i].IsNotFound());
+ ++number_not_found;
+ }
+ }
+
+ // Confirm correct Bloom stats (no FPs)
+ uint64_t filter_useful = TestGetAndResetTickerCount(
+ options,
+ use_prefix_ ? BLOOM_FILTER_PREFIX_USEFUL : BLOOM_FILTER_USEFUL);
+ uint64_t filter_checked =
+ TestGetAndResetTickerCount(options, use_prefix_
+ ? BLOOM_FILTER_PREFIX_CHECKED
+ : BLOOM_FILTER_FULL_POSITIVE) +
+ (use_prefix_ ? 0 : filter_useful);
+ EXPECT_EQ(filter_useful, number_not_found);
+ EXPECT_EQ(filter_checked, Q);
+ if (!use_prefix_) {
+ EXPECT_EQ(
+ TestGetAndResetTickerCount(options, BLOOM_FILTER_FULL_TRUE_POSITIVE),
+ Q - number_not_found);
+ }
+
+ // Confirm no duplicate loading same filter partition
+ uint64_t filter_accesses =
+ TestGetAndResetTickerCount(options, BLOCK_CACHE_FILTER_HIT) +
+ TestGetAndResetTickerCount(options, BLOCK_CACHE_FILTER_MISS);
+ if (stride == 1) {
+ EXPECT_EQ(filter_accesses, 1);
+ } else {
+ // for large stride
+ EXPECT_GE(filter_accesses, Q / 2 + 1);
+ }
+ }
+
+ // Check that a clump of keys (present and not) works when spanning
+ // two partitions
+ int found_spanning = 0;
+ for (uint32_t start = 0; start < N / 2;) {
+ for (uint32_t i = 0; i < Q; ++i) {
+ keys[i] = UKey(start + i);
+ key_slices[i] = Slice(keys[i]);
+ column_families[i] = db_->DefaultColumnFamily();
+ statuses[i] = Status();
+ values[i] = PinnableSlice();
+ }
+
+ db_->MultiGet(ropts, Q, &column_families[0], &key_slices[0], &values[0],
+ /*timestamps=*/nullptr, &statuses[0], true);
+
+ // Confirm correct status results
+ uint32_t number_not_found = 0;
+ for (uint32_t i = 0; i < Q; ++i) {
+ if (((start + i) % 2) == 0) {
+ ASSERT_OK(statuses[i]);
+ } else {
+ ASSERT_TRUE(statuses[i].IsNotFound());
+ ++number_not_found;
+ }
+ }
+
+ // Confirm correct Bloom stats (might see some FPs)
+ uint64_t filter_useful = TestGetAndResetTickerCount(
+ options,
+ use_prefix_ ? BLOOM_FILTER_PREFIX_USEFUL : BLOOM_FILTER_USEFUL);
+ uint64_t filter_checked =
+ TestGetAndResetTickerCount(options, use_prefix_
+ ? BLOOM_FILTER_PREFIX_CHECKED
+ : BLOOM_FILTER_FULL_POSITIVE) +
+ (use_prefix_ ? 0 : filter_useful);
+ EXPECT_GE(filter_useful, number_not_found - 2); // possible FP
+ EXPECT_EQ(filter_checked, Q);
+ if (!use_prefix_) {
+ EXPECT_EQ(
+ TestGetAndResetTickerCount(options, BLOOM_FILTER_FULL_TRUE_POSITIVE),
+ Q - number_not_found);
+ }
+
+ // Confirm no duplicate loading of same filter partition
+ uint64_t filter_accesses =
+ TestGetAndResetTickerCount(options, BLOCK_CACHE_FILTER_HIT) +
+ TestGetAndResetTickerCount(options, BLOCK_CACHE_FILTER_MISS);
+ if (filter_accesses == 2) {
+ // Spanned across partitions.
+ ++found_spanning;
+ if (found_spanning >= 2) {
+ break;
+ } else {
+ // Ensure that at least once we have at least one present and
+ // one non-present key on both sides of partition boundary.
+ start += 2;
+ }
+ } else {
+ EXPECT_EQ(filter_accesses, 1);
+ // See explanation at "start += 2"
+ start += Q - 4;
+ }
+ }
+ EXPECT_TRUE(found_spanning >= 2);
+}
+
+INSTANTIATE_TEST_CASE_P(DBBloomFilterTestVaryPrefixAndFormatVer,
+ DBBloomFilterTestVaryPrefixAndFormatVer,
+ ::testing::Values(
+ // (use_prefix, format_version)
+ std::make_tuple(false, 2),
+ std::make_tuple(false, 3),
+ std::make_tuple(false, 4),
+ std::make_tuple(false, 5), std::make_tuple(true, 2),
+ std::make_tuple(true, 3), std::make_tuple(true, 4),
+ std::make_tuple(true, 5)));
+
+#ifndef ROCKSDB_LITE
+namespace {
+static const std::string kPlainTable = "test_PlainTableBloom";
+} // anonymous namespace
+
+class BloomStatsTestWithParam
+ : public DBBloomFilterTest,
+ public testing::WithParamInterface<std::tuple<std::string, bool>> {
+ public:
+ BloomStatsTestWithParam() {
+ bfp_impl_ = std::get<0>(GetParam());
+ partition_filters_ = std::get<1>(GetParam());
+
+ options_.create_if_missing = true;
+ options_.prefix_extractor.reset(
+ ROCKSDB_NAMESPACE::NewFixedPrefixTransform(4));
+ options_.memtable_prefix_bloom_size_ratio =
+ 8.0 * 1024.0 / static_cast<double>(options_.write_buffer_size);
+ if (bfp_impl_ == kPlainTable) {
+ assert(!partition_filters_); // not supported in plain table
+ PlainTableOptions table_options;
+ options_.table_factory.reset(NewPlainTableFactory(table_options));
+ } else {
+ BlockBasedTableOptions table_options;
+ if (partition_filters_) {
+ table_options.partition_filters = partition_filters_;
+ table_options.index_type =
+ BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+ }
+ table_options.filter_policy = Create(10, bfp_impl_);
+ options_.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ }
+ options_.env = env_;
+
+ get_perf_context()->Reset();
+ DestroyAndReopen(options_);
+ }
+
+ ~BloomStatsTestWithParam() override {
+ get_perf_context()->Reset();
+ Destroy(options_);
+ }
+
+ // Required if inheriting from testing::WithParamInterface<>
+ static void SetUpTestCase() {}
+ static void TearDownTestCase() {}
+
+ std::string bfp_impl_;
+ bool partition_filters_;
+ Options options_;
+};
+
+// 1 Insert 2 K-V pairs into DB
+// 2 Call Get() for both keys - expect memtable bloom hit stat to be 2
+// 3 Call Get() for nonexisting key - expect memtable bloom miss stat to be 1
+// 4 Call Flush() to create SST
+// 5 Call Get() for both keys - expect SST bloom hit stat to be 2
+// 6 Call Get() for nonexisting key - expect SST bloom miss stat to be 1
+// Test both: block and plain SST
+TEST_P(BloomStatsTestWithParam, BloomStatsTest) {
+ std::string key1("AAAA");
+ std::string key2("RXDB"); // not in DB
+ std::string key3("ZBRA");
+ std::string value1("Value1");
+ std::string value3("Value3");
+
+ ASSERT_OK(Put(key1, value1, WriteOptions()));
+ ASSERT_OK(Put(key3, value3, WriteOptions()));
+
+ // check memtable bloom stats
+ ASSERT_EQ(value1, Get(key1));
+ ASSERT_EQ(1, get_perf_context()->bloom_memtable_hit_count);
+ ASSERT_EQ(value3, Get(key3));
+ ASSERT_EQ(2, get_perf_context()->bloom_memtable_hit_count);
+ ASSERT_EQ(0, get_perf_context()->bloom_memtable_miss_count);
+
+ ASSERT_EQ("NOT_FOUND", Get(key2));
+ ASSERT_EQ(1, get_perf_context()->bloom_memtable_miss_count);
+ ASSERT_EQ(2, get_perf_context()->bloom_memtable_hit_count);
+
+ // sanity checks
+ ASSERT_EQ(0, get_perf_context()->bloom_sst_hit_count);
+ ASSERT_EQ(0, get_perf_context()->bloom_sst_miss_count);
+
+ Flush();
+
+ // sanity checks
+ ASSERT_EQ(0, get_perf_context()->bloom_sst_hit_count);
+ ASSERT_EQ(0, get_perf_context()->bloom_sst_miss_count);
+
+ // check SST bloom stats
+ ASSERT_EQ(value1, Get(key1));
+ ASSERT_EQ(1, get_perf_context()->bloom_sst_hit_count);
+ ASSERT_EQ(value3, Get(key3));
+ ASSERT_EQ(2, get_perf_context()->bloom_sst_hit_count);
+
+ ASSERT_EQ("NOT_FOUND", Get(key2));
+ ASSERT_EQ(1, get_perf_context()->bloom_sst_miss_count);
+}
+
+// Same scenario as in BloomStatsTest but using an iterator
+TEST_P(BloomStatsTestWithParam, BloomStatsTestWithIter) {
+ std::string key1("AAAA");
+ std::string key2("RXDB"); // not in DB
+ std::string key3("ZBRA");
+ std::string value1("Value1");
+ std::string value3("Value3");
+
+ ASSERT_OK(Put(key1, value1, WriteOptions()));
+ ASSERT_OK(Put(key3, value3, WriteOptions()));
+
+ std::unique_ptr<Iterator> iter(dbfull()->NewIterator(ReadOptions()));
+
+ // check memtable bloom stats
+ iter->Seek(key1);
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(value1, iter->value().ToString());
+ ASSERT_EQ(1, get_perf_context()->bloom_memtable_hit_count);
+ ASSERT_EQ(0, get_perf_context()->bloom_memtable_miss_count);
+
+ iter->Seek(key3);
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(value3, iter->value().ToString());
+ ASSERT_EQ(2, get_perf_context()->bloom_memtable_hit_count);
+ ASSERT_EQ(0, get_perf_context()->bloom_memtable_miss_count);
+
+ iter->Seek(key2);
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(!iter->Valid());
+ ASSERT_EQ(1, get_perf_context()->bloom_memtable_miss_count);
+ ASSERT_EQ(2, get_perf_context()->bloom_memtable_hit_count);
+
+ Flush();
+
+ iter.reset(dbfull()->NewIterator(ReadOptions()));
+
+ // Check SST bloom stats
+ iter->Seek(key1);
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(value1, iter->value().ToString());
+ ASSERT_EQ(1, get_perf_context()->bloom_sst_hit_count);
+
+ iter->Seek(key3);
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(value3, iter->value().ToString());
+ uint64_t expected_hits = 2;
+ ASSERT_EQ(expected_hits, get_perf_context()->bloom_sst_hit_count);
+
+ iter->Seek(key2);
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(!iter->Valid());
+ ASSERT_EQ(1, get_perf_context()->bloom_sst_miss_count);
+ ASSERT_EQ(expected_hits, get_perf_context()->bloom_sst_hit_count);
+}
+
+INSTANTIATE_TEST_CASE_P(
+ BloomStatsTestWithParam, BloomStatsTestWithParam,
+ ::testing::Values(std::make_tuple(kLegacyBloom, false),
+ std::make_tuple(kLegacyBloom, true),
+ std::make_tuple(kFastLocalBloom, false),
+ std::make_tuple(kFastLocalBloom, true),
+ std::make_tuple(kPlainTable, false)));
+
+namespace {
+void PrefixScanInit(DBBloomFilterTest* dbtest) {
+ char buf[100];
+ std::string keystr;
+ const int small_range_sstfiles = 5;
+ const int big_range_sstfiles = 5;
+
+ // Generate 11 sst files with the following prefix ranges.
+ // GROUP 0: [0,10] (level 1)
+ // GROUP 1: [1,2], [2,3], [3,4], [4,5], [5, 6] (level 0)
+ // GROUP 2: [0,6], [0,7], [0,8], [0,9], [0,10] (level 0)
+ //
+ // A seek with the previous API would do 11 random I/Os (to all the
+ // files). With the new API and a prefix filter enabled, we should
+  // only do 2 random I/Os, to the 2 files containing the key.
+
+ // GROUP 0
+ snprintf(buf, sizeof(buf), "%02d______:start", 0);
+ keystr = std::string(buf);
+ ASSERT_OK(dbtest->Put(keystr, keystr));
+ snprintf(buf, sizeof(buf), "%02d______:end", 10);
+ keystr = std::string(buf);
+ ASSERT_OK(dbtest->Put(keystr, keystr));
+ ASSERT_OK(dbtest->Flush());
+ ASSERT_OK(dbtest->dbfull()->CompactRange(CompactRangeOptions(), nullptr,
+ nullptr)); // move to level 1
+
+ // GROUP 1
+ for (int i = 1; i <= small_range_sstfiles; i++) {
+ snprintf(buf, sizeof(buf), "%02d______:start", i);
+ keystr = std::string(buf);
+ ASSERT_OK(dbtest->Put(keystr, keystr));
+ snprintf(buf, sizeof(buf), "%02d______:end", i + 1);
+ keystr = std::string(buf);
+ ASSERT_OK(dbtest->Put(keystr, keystr));
+ dbtest->Flush();
+ }
+
+ // GROUP 2
+ for (int i = 1; i <= big_range_sstfiles; i++) {
+ snprintf(buf, sizeof(buf), "%02d______:start", 0);
+ keystr = std::string(buf);
+ ASSERT_OK(dbtest->Put(keystr, keystr));
+ snprintf(buf, sizeof(buf), "%02d______:end", small_range_sstfiles + i + 1);
+ keystr = std::string(buf);
+ ASSERT_OK(dbtest->Put(keystr, keystr));
+ dbtest->Flush();
+ }
+}
+} // anonymous namespace
+
+TEST_F(DBBloomFilterTest, PrefixScan) {
+ while (ChangeFilterOptions()) {
+ int count;
+ Slice prefix;
+ Slice key;
+ char buf[100];
+ Iterator* iter;
+ snprintf(buf, sizeof(buf), "03______:");
+ prefix = Slice(buf, 8);
+ key = Slice(buf, 9);
+ ASSERT_EQ(key.difference_offset(prefix), 8);
+ ASSERT_EQ(prefix.difference_offset(key), 8);
+ // db configs
+ env_->count_random_reads_ = true;
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.prefix_extractor.reset(NewFixedPrefixTransform(8));
+ options.disable_auto_compactions = true;
+ options.max_background_compactions = 2;
+ options.create_if_missing = true;
+ options.memtable_factory.reset(NewHashSkipListRepFactory(16));
+    // HashSkipList does not support concurrent memtable writes, and
+    // unordered_write is incompatible with
+    // allow_concurrent_memtable_write=false.
+    assert(!options.unordered_write);
+    options.allow_concurrent_memtable_write = false;
+
+ BlockBasedTableOptions table_options;
+ table_options.no_block_cache = true;
+ table_options.filter_policy.reset(NewBloomFilterPolicy(10));
+ table_options.whole_key_filtering = false;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+    // PrefixScanInit creates 11 SST files; without the prefix filter a seek
+    // would need 11 random I/Os.
+ DestroyAndReopen(options);
+ PrefixScanInit(this);
+ count = 0;
+ env_->random_read_counter_.Reset();
+ iter = db_->NewIterator(ReadOptions());
+ for (iter->Seek(prefix); iter->Valid(); iter->Next()) {
+ if (!iter->key().starts_with(prefix)) {
+ break;
+ }
+ count++;
+ }
+ ASSERT_OK(iter->status());
+ delete iter;
+ ASSERT_EQ(count, 2);
+ ASSERT_EQ(env_->random_read_counter_.Read(), 2);
+ Close();
+ } // end of while
+}
+
+TEST_F(DBBloomFilterTest, OptimizeFiltersForHits) {
+ Options options = CurrentOptions();
+ options.write_buffer_size = 64 * 1024;
+ options.arena_block_size = 4 * 1024;
+ options.target_file_size_base = 64 * 1024;
+ options.level0_file_num_compaction_trigger = 2;
+ options.level0_slowdown_writes_trigger = 2;
+ options.level0_stop_writes_trigger = 4;
+ options.max_bytes_for_level_base = 256 * 1024;
+ options.max_write_buffer_number = 2;
+ options.max_background_compactions = 8;
+ options.max_background_flushes = 8;
+ options.compression = kNoCompression;
+ options.compaction_style = kCompactionStyleLevel;
+ options.level_compaction_dynamic_level_bytes = true;
+ BlockBasedTableOptions bbto;
+ bbto.cache_index_and_filter_blocks = true;
+ bbto.filter_policy.reset(NewBloomFilterPolicy(10));
+ bbto.whole_key_filtering = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ options.optimize_filters_for_hits = true;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ get_perf_context()->Reset();
+ get_perf_context()->EnablePerLevelPerfContext();
+ CreateAndReopenWithCF({"mypikachu"}, options);
+
+ int numkeys = 200000;
+
+ // Generate randomly shuffled keys, so the updates are almost
+ // random.
+ std::vector<int> keys;
+ keys.reserve(numkeys);
+ for (int i = 0; i < numkeys; i += 2) {
+ keys.push_back(i);
+ }
+ RandomShuffle(std::begin(keys), std::end(keys), /*seed*/ 42);
+ int num_inserted = 0;
+ for (int key : keys) {
+ ASSERT_OK(Put(1, Key(key), "val"));
+ if (++num_inserted % 1000 == 0) {
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+ }
+ ASSERT_OK(Put(1, Key(0), "val"));
+ ASSERT_OK(Put(1, Key(numkeys), "val"));
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ if (NumTableFilesAtLevel(0, 1) == 0) {
+ // No Level 0 file. Create one.
+ ASSERT_OK(Put(1, Key(0), "val"));
+ ASSERT_OK(Put(1, Key(numkeys), "val"));
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+
+ for (int i = 1; i < numkeys; i += 2) {
+ ASSERT_EQ(Get(1, Key(i)), "NOT_FOUND");
+ }
+
+ ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L0));
+ ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L1));
+ ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L2_AND_UP));
+
+  // Now we have three sorted runs: L0, L5 and L6, where most files in L6 have
+  // no bloom filter. Most keys get their bloom filters checked twice.
+ ASSERT_GT(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 65000 * 2);
+ ASSERT_LT(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 120000 * 2);
+ uint64_t bloom_filter_useful_all_levels = 0;
+ for (auto& kv : (*(get_perf_context()->level_to_perf_context))) {
+ if (kv.second.bloom_filter_useful > 0) {
+ bloom_filter_useful_all_levels += kv.second.bloom_filter_useful;
+ }
+ }
+ ASSERT_GT(bloom_filter_useful_all_levels, 65000 * 2);
+ ASSERT_LT(bloom_filter_useful_all_levels, 120000 * 2);
+
+ for (int i = 0; i < numkeys; i += 2) {
+ ASSERT_EQ(Get(1, Key(i)), "val");
+ }
+
+ // Part 2 (read path): rewrite last level with blooms, then verify they get
+ // cached only if !optimize_filters_for_hits
+ options.disable_auto_compactions = true;
+ options.num_levels = 9;
+ options.optimize_filters_for_hits = false;
+ options.statistics = CreateDBStatistics();
+ bbto.block_cache.reset();
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+
+ ReopenWithColumnFamilies({"default", "mypikachu"}, options);
+ MoveFilesToLevel(7 /* level */, 1 /* column family index */);
+
+ std::string value = Get(1, Key(0));
+ uint64_t prev_cache_filter_hits =
+ TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT);
+ value = Get(1, Key(0));
+ ASSERT_EQ(prev_cache_filter_hits + 1,
+ TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+
+ // Now that we know the filter blocks exist in the last level files, see if
+ // filter caching is skipped for this optimization
+ options.optimize_filters_for_hits = true;
+ options.statistics = CreateDBStatistics();
+ bbto.block_cache.reset();
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+
+ ReopenWithColumnFamilies({"default", "mypikachu"}, options);
+
+ value = Get(1, Key(0));
+ ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+ ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+ ASSERT_EQ(2 /* index and data block */,
+ TestGetTickerCount(options, BLOCK_CACHE_ADD));
+
+ // Check filter block ignored for files preloaded during DB::Open()
+ options.max_open_files = -1;
+ options.statistics = CreateDBStatistics();
+ bbto.block_cache.reset();
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+
+ ReopenWithColumnFamilies({"default", "mypikachu"}, options);
+
+ uint64_t prev_cache_filter_misses =
+ TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS);
+ prev_cache_filter_hits = TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT);
+ Get(1, Key(0));
+ ASSERT_EQ(prev_cache_filter_misses,
+ TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+ ASSERT_EQ(prev_cache_filter_hits,
+ TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+
+ // Check filter block ignored for file trivially-moved to bottom level
+ bbto.block_cache.reset();
+ options.max_open_files = 100; // setting > -1 makes it not preload all files
+ options.statistics = CreateDBStatistics();
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+
+ ReopenWithColumnFamilies({"default", "mypikachu"}, options);
+
+ ASSERT_OK(Put(1, Key(numkeys + 1), "val"));
+ ASSERT_OK(Flush(1));
+
+ int32_t trivial_move = 0;
+ int32_t non_trivial_move = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:TrivialMove",
+ [&](void* /*arg*/) { trivial_move++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:NonTrivial",
+ [&](void* /*arg*/) { non_trivial_move++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ CompactRangeOptions compact_options;
+ compact_options.bottommost_level_compaction =
+ BottommostLevelCompaction::kSkip;
+ compact_options.change_level = true;
+ compact_options.target_level = 7;
+ ASSERT_TRUE(db_->CompactRange(compact_options, handles_[1], nullptr, nullptr)
+ .IsNotSupported());
+
+ ASSERT_EQ(trivial_move, 1);
+ ASSERT_EQ(non_trivial_move, 0);
+
+ prev_cache_filter_hits = TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT);
+ prev_cache_filter_misses =
+ TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS);
+ value = Get(1, Key(numkeys + 1));
+ ASSERT_EQ(prev_cache_filter_hits,
+ TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+ ASSERT_EQ(prev_cache_filter_misses,
+ TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+
+ // Check filter block not cached for iterator
+ bbto.block_cache.reset();
+ options.statistics = CreateDBStatistics();
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+
+ ReopenWithColumnFamilies({"default", "mypikachu"}, options);
+
+ std::unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions(), handles_[1]));
+ iter->SeekToFirst();
+ ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+ ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+ ASSERT_EQ(2 /* index and data block */,
+ TestGetTickerCount(options, BLOCK_CACHE_ADD));
+ get_perf_context()->Reset();
+}
+
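+// Seeks to key and counts how many entries the iterator yields before it
+// becomes invalid (bounded by prefix_same_as_start / iterate_upper_bound).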
+int CountIter(std::unique_ptr<Iterator>& iter, const Slice& key) {
+ int count = 0;
+ for (iter->Seek(key); iter->Valid(); iter->Next()) {
+ count++;
+ }
+ EXPECT_OK(iter->status());
+ return count;
+}
+
+// Use iterate_upper_bound to hint compatibility of existing bloom filters.
+// The BF is considered compatible if 1) upper bound and seek key transform
+// into the same string, or 2) the transformed seek key is of the same length
+// as the upper bound and two keys are adjacent according to the comparator.
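+// For example, with a capped:4 extractor, seek key "abcd0000" with upper
+// bound "abce" qualifies (both transform to 4 bytes and "abcd"/"abce" are
+// adjacent), while upper bound "abcey" does not (longer than the prefix).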
+TEST_F(DBBloomFilterTest, DynamicBloomFilterUpperBound) {
+ for (const auto& bfp_impl : BloomLikeFilterPolicy::GetAllFixedImpls()) {
+ Options options;
+ options.create_if_missing = true;
+ options.env = CurrentOptions().env;
+ options.prefix_extractor.reset(NewCappedPrefixTransform(4));
+ options.disable_auto_compactions = true;
+ options.statistics = CreateDBStatistics();
+ // Enable prefix bloom for SST files
+ BlockBasedTableOptions table_options;
+ table_options.cache_index_and_filter_blocks = true;
+ table_options.filter_policy = Create(10, bfp_impl);
+ table_options.index_shortening = BlockBasedTableOptions::
+ IndexShorteningMode::kShortenSeparatorsAndSuccessor;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put("abcdxxx0", "val1"));
+ ASSERT_OK(Put("abcdxxx1", "val2"));
+ ASSERT_OK(Put("abcdxxx2", "val3"));
+ ASSERT_OK(Put("abcdxxx3", "val4"));
+ ASSERT_OK(dbfull()->Flush(FlushOptions()));
+ {
+ // prefix_extractor has not changed, BF will always be read
+ Slice upper_bound("abce");
+ ReadOptions read_options;
+ read_options.prefix_same_as_start = true;
+ read_options.iterate_upper_bound = &upper_bound;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ ASSERT_EQ(CountIter(iter, "abcd0000"), 4);
+ }
+ {
+ Slice upper_bound("abcdzzzz");
+ ReadOptions read_options;
+ read_options.prefix_same_as_start = true;
+ read_options.iterate_upper_bound = &upper_bound;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ ASSERT_EQ(CountIter(iter, "abcd0000"), 4);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 2);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
+ }
+ ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "fixed:5"}}));
+ ASSERT_EQ(dbfull()->GetOptions().prefix_extractor->AsString(),
+ "rocksdb.FixedPrefix.5");
+ {
+ // BF changed, [abcdxx00, abce) is a valid bound, will trigger BF read
+ Slice upper_bound("abce");
+ ReadOptions read_options;
+ read_options.prefix_same_as_start = true;
+ read_options.iterate_upper_bound = &upper_bound;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ ASSERT_EQ(CountIter(iter, "abcdxx00"), 4);
+ // should check bloom filter since upper bound meets requirement
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 3);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
+ }
+ {
+      // [abcdxx01, abcey) is not a valid bound since the upper bound is too
+      // long for the BF in the SST (capped:4)
+ Slice upper_bound("abcey");
+ ReadOptions read_options;
+ read_options.prefix_same_as_start = true;
+ read_options.iterate_upper_bound = &upper_bound;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ ASSERT_EQ(CountIter(iter, "abcdxx01"), 4);
+ // should skip bloom filter since upper bound is too long
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 3);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
+ }
+ {
+ // [abcdxx02, abcdy) is a valid bound since the prefix is the same
+ Slice upper_bound("abcdy");
+ ReadOptions read_options;
+ read_options.prefix_same_as_start = true;
+ read_options.iterate_upper_bound = &upper_bound;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ ASSERT_EQ(CountIter(iter, "abcdxx02"), 4);
+ // should check bloom filter since upper bound matches transformed seek
+ // key
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 4);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
+ }
+ {
+ // [aaaaaaaa, abce) is not a valid bound since 1) they don't share the
+ // same prefix, 2) the prefixes are not consecutive
+ Slice upper_bound("abce");
+ ReadOptions read_options;
+ read_options.prefix_same_as_start = true;
+ read_options.iterate_upper_bound = &upper_bound;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ ASSERT_EQ(CountIter(iter, "aaaaaaaa"), 0);
+ // should skip bloom filter since mismatch is found
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 4);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
+ }
+ ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "fixed:3"}}));
+ {
+ // [abc, abd) is not a valid bound since the upper bound is too short
+ // for BF (capped:4)
+ Slice upper_bound("abd");
+ ReadOptions read_options;
+ read_options.prefix_same_as_start = true;
+ read_options.iterate_upper_bound = &upper_bound;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ ASSERT_EQ(CountIter(iter, "abc"), 4);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 4);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
+ }
+ // Same with re-open
+ options.prefix_extractor.reset(NewFixedPrefixTransform(3));
+ Reopen(options);
+ {
+ Slice upper_bound("abd");
+ ReadOptions read_options;
+ read_options.prefix_same_as_start = true;
+ read_options.iterate_upper_bound = &upper_bound;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ ASSERT_EQ(CountIter(iter, "abc"), 4);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 4);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
+ }
+ // Set back to capped:4 and verify BF is always read
+ options.prefix_extractor.reset(NewCappedPrefixTransform(4));
+ Reopen(options);
+ {
+ Slice upper_bound("abd");
+ ReadOptions read_options;
+ read_options.prefix_same_as_start = true;
+ read_options.iterate_upper_bound = &upper_bound;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ ASSERT_EQ(CountIter(iter, "abc"), 0);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 5);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 1);
+ }
+    // Same if there's a problem initially loading the prefix transform
+ SyncPoint::GetInstance()->SetCallBack(
+ "BlockBasedTable::Open::ForceNullTablePrefixExtractor",
+ [&](void* arg) { *static_cast<bool*>(arg) = true; });
+ SyncPoint::GetInstance()->EnableProcessing();
+ Reopen(options);
+ {
+ Slice upper_bound("abd");
+ ReadOptions read_options;
+ read_options.prefix_same_as_start = true;
+ read_options.iterate_upper_bound = &upper_bound;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ ASSERT_EQ(CountIter(iter, "abc"), 0);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 6);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 2);
+ }
+ SyncPoint::GetInstance()->DisableProcessing();
+ }
+}
+
+// Create multiple SST files each with a different prefix_extractor config,
+// verify iterators can read all SST files using the latest config.
+TEST_F(DBBloomFilterTest, DynamicBloomFilterMultipleSST) {
+ for (const auto& bfp_impl : BloomLikeFilterPolicy::GetAllFixedImpls()) {
+ Options options;
+ options.env = CurrentOptions().env;
+ options.create_if_missing = true;
+ options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+ options.disable_auto_compactions = true;
+ options.statistics = CreateDBStatistics();
+ // Enable prefix bloom for SST files
+ BlockBasedTableOptions table_options;
+ table_options.filter_policy = Create(10, bfp_impl);
+ table_options.cache_index_and_filter_blocks = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ DestroyAndReopen(options);
+
+ Slice upper_bound("foz90000");
+ ReadOptions read_options;
+ read_options.prefix_same_as_start = true;
+
+ // first SST with fixed:1 BF
+ ASSERT_OK(Put("foo2", "bar2"));
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Put("foq1", "bar1"));
+ ASSERT_OK(Put("fpa", "0"));
+ dbfull()->Flush(FlushOptions());
+ std::unique_ptr<Iterator> iter_old(db_->NewIterator(read_options));
+ ASSERT_EQ(CountIter(iter_old, "foo"), 4);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 1);
+
+ ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "capped:3"}}));
+ ASSERT_EQ(dbfull()->GetOptions().prefix_extractor->AsString(),
+ "rocksdb.CappedPrefix.3");
+ read_options.iterate_upper_bound = &upper_bound;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ ASSERT_EQ(CountIter(iter, "foo"), 2);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 2);
+ ASSERT_EQ(CountIter(iter, "gpk"), 0);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 2);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
+
+ // second SST with capped:3 BF
+ ASSERT_OK(Put("foo3", "bar3"));
+ ASSERT_OK(Put("foo4", "bar4"));
+ ASSERT_OK(Put("foq5", "bar5"));
+ ASSERT_OK(Put("fpb", "1"));
+ ASSERT_OK(dbfull()->Flush(FlushOptions()));
+ {
+      // BF is capped:3 now
+ std::unique_ptr<Iterator> iter_tmp(db_->NewIterator(read_options));
+ ASSERT_EQ(CountIter(iter_tmp, "foo"), 4);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 4);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
+ ASSERT_EQ(CountIter(iter_tmp, "gpk"), 0);
+      // both counters are incremented because the BF config is unchanged for
+      // 1 of the 2 SST files, so that filter is checked once and finds no
+      // match.
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 5);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 1);
+ }
+
+ ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "fixed:2"}}));
+ ASSERT_EQ(dbfull()->GetOptions().prefix_extractor->AsString(),
+ "rocksdb.FixedPrefix.2");
+ // third SST with fixed:2 BF
+ ASSERT_OK(Put("foo6", "bar6"));
+ ASSERT_OK(Put("foo7", "bar7"));
+ ASSERT_OK(Put("foq8", "bar8"));
+ ASSERT_OK(Put("fpc", "2"));
+ ASSERT_OK(dbfull()->Flush(FlushOptions()));
+ {
+ // BF is fixed:2 now
+ std::unique_ptr<Iterator> iter_tmp(db_->NewIterator(read_options));
+ ASSERT_EQ(CountIter(iter_tmp, "foo"), 9);
+ // the first and last BF are checked
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 7);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 1);
+ ASSERT_EQ(CountIter(iter_tmp, "gpk"), 0);
+ // only last BF is checked and not found
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 8);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 2);
+ }
+
+ // iter_old can only see the first SST, so checked plus 1
+ ASSERT_EQ(CountIter(iter_old, "foo"), 4);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 9);
+ // iter was created after the first setoptions call so only full filter
+ // will check the filter
+ ASSERT_EQ(CountIter(iter, "foo"), 2);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 10);
+
+ {
+      // keys in all three SSTs are visible to the iterator
+      // The range [foo, foz90000) is compatible with (fixed:1) and (fixed:2),
+      // so +2 for the checked counter
+ std::unique_ptr<Iterator> iter_all(db_->NewIterator(read_options));
+ ASSERT_EQ(CountIter(iter_all, "foo"), 9);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 12);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 2);
+ ASSERT_EQ(CountIter(iter_all, "gpk"), 0);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 13);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 3);
+ }
+ ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "capped:3"}}));
+ ASSERT_EQ(dbfull()->GetOptions().prefix_extractor->AsString(),
+ "rocksdb.CappedPrefix.3");
+ {
+ std::unique_ptr<Iterator> iter_all(db_->NewIterator(read_options));
+ ASSERT_EQ(CountIter(iter_all, "foo"), 6);
+      // all three SSTs are checked because the current prefix_extractor
+      // matches the remaining SST's (capped:3)
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 16);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 3);
+ ASSERT_EQ(CountIter(iter_all, "gpk"), 0);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 17);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 4);
+ }
+    // TODO(Zhongyi): Maybe also need to add Get calls to test point lookups?
+ }
+}
+
+// Create a new column family in a running DB, change prefix_extractor
+// dynamically, verify the iterator created on the new column family behaves
+// as expected
+TEST_F(DBBloomFilterTest, DynamicBloomFilterNewColumnFamily) {
+ int iteration = 0;
+ for (const auto& bfp_impl : BloomLikeFilterPolicy::GetAllFixedImpls()) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+ options.disable_auto_compactions = true;
+ options.statistics = CreateDBStatistics();
+ // Enable prefix bloom for SST files
+ BlockBasedTableOptions table_options;
+ table_options.cache_index_and_filter_blocks = true;
+ table_options.filter_policy = Create(10, bfp_impl);
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ CreateAndReopenWithCF({"pikachu" + std::to_string(iteration)}, options);
+ ReadOptions read_options;
+ read_options.prefix_same_as_start = true;
+ // create a new CF and set prefix_extractor dynamically
+ options.prefix_extractor.reset(NewCappedPrefixTransform(3));
+ CreateColumnFamilies({"ramen_dojo_" + std::to_string(iteration)}, options);
+ ASSERT_EQ(dbfull()->GetOptions(handles_[2]).prefix_extractor->AsString(),
+ "rocksdb.CappedPrefix.3");
+ ASSERT_OK(Put(2, "foo3", "bar3"));
+ ASSERT_OK(Put(2, "foo4", "bar4"));
+ ASSERT_OK(Put(2, "foo5", "bar5"));
+ ASSERT_OK(Put(2, "foq6", "bar6"));
+ ASSERT_OK(Put(2, "fpq7", "bar7"));
+ dbfull()->Flush(FlushOptions());
+ {
+ std::unique_ptr<Iterator> iter(
+ db_->NewIterator(read_options, handles_[2]));
+ ASSERT_EQ(CountIter(iter, "foo"), 3);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 0);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
+ }
+ ASSERT_OK(
+ dbfull()->SetOptions(handles_[2], {{"prefix_extractor", "fixed:2"}}));
+ ASSERT_EQ(dbfull()->GetOptions(handles_[2]).prefix_extractor->AsString(),
+ "rocksdb.FixedPrefix.2");
+ {
+ std::unique_ptr<Iterator> iter(
+ db_->NewIterator(read_options, handles_[2]));
+ ASSERT_EQ(CountIter(iter, "foo"), 4);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 0);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
+ }
+ ASSERT_OK(dbfull()->DropColumnFamily(handles_[2]));
+ ASSERT_OK(dbfull()->DestroyColumnFamilyHandle(handles_[2]));
+ handles_[2] = nullptr;
+ ASSERT_OK(dbfull()->DropColumnFamily(handles_[1]));
+ ASSERT_OK(dbfull()->DestroyColumnFamilyHandle(handles_[1]));
+ handles_[1] = nullptr;
+ iteration++;
+ }
+}
+
+// Verify it's possible to change prefix_extractor at runtime and iterators
+// behave as expected
+TEST_F(DBBloomFilterTest, DynamicBloomFilterOptions) {
+ for (const auto& bfp_impl : BloomLikeFilterPolicy::GetAllFixedImpls()) {
+ Options options;
+ options.env = CurrentOptions().env;
+ options.create_if_missing = true;
+ options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+ options.disable_auto_compactions = true;
+ options.statistics = CreateDBStatistics();
+ // Enable prefix bloom for SST files
+ BlockBasedTableOptions table_options;
+ table_options.cache_index_and_filter_blocks = true;
+ table_options.filter_policy = Create(10, bfp_impl);
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put("foo2", "bar2"));
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Put("foo1", "bar1"));
+ ASSERT_OK(Put("fpa", "0"));
+ dbfull()->Flush(FlushOptions());
+ ASSERT_OK(Put("foo3", "bar3"));
+ ASSERT_OK(Put("foo4", "bar4"));
+ ASSERT_OK(Put("foo5", "bar5"));
+ ASSERT_OK(Put("fpb", "1"));
+ dbfull()->Flush(FlushOptions());
+ ASSERT_OK(Put("foo6", "bar6"));
+ ASSERT_OK(Put("foo7", "bar7"));
+ ASSERT_OK(Put("foo8", "bar8"));
+ ASSERT_OK(Put("fpc", "2"));
+ dbfull()->Flush(FlushOptions());
+
+ ReadOptions read_options;
+ read_options.prefix_same_as_start = true;
+ {
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ ASSERT_EQ(CountIter(iter, "foo"), 12);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 3);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
+ }
+ std::unique_ptr<Iterator> iter_old(db_->NewIterator(read_options));
+ ASSERT_EQ(CountIter(iter_old, "foo"), 12);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 6);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
+
+ ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "capped:3"}}));
+ ASSERT_EQ(dbfull()->GetOptions().prefix_extractor->AsString(),
+ "rocksdb.CappedPrefix.3");
+ {
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ // "fp*" should be skipped
+ ASSERT_EQ(CountIter(iter, "foo"), 9);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 6);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
+ }
+
+ // iterator created before should not be affected and see all keys
+ ASSERT_EQ(CountIter(iter_old, "foo"), 12);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 9);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
+ ASSERT_EQ(CountIter(iter_old, "abc"), 0);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 12);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 3);
+ }
+}
+
+TEST_F(DBBloomFilterTest, SeekForPrevWithPartitionedFilters) {
+ Options options = CurrentOptions();
+ constexpr size_t kNumKeys = 10000;
+  static_assert(kNumKeys <= 10000, "kNumKeys has to be <= 10000");
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(kNumKeys + 10));
+ options.create_if_missing = true;
+ constexpr size_t kPrefixLength = 4;
+ options.prefix_extractor.reset(NewFixedPrefixTransform(kPrefixLength));
+ options.compression = kNoCompression;
+ BlockBasedTableOptions bbto;
+ bbto.filter_policy.reset(NewBloomFilterPolicy(50));
+ bbto.index_shortening =
+ BlockBasedTableOptions::IndexShorteningMode::kNoShortening;
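+  // Tiny data and metadata blocks force the filter to be split into many
+  // partitions, so the SeekForPrev calls below cross partition boundaries.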
+ bbto.block_size = 128;
+ bbto.metadata_block_size = 128;
+ bbto.partition_filters = true;
+ bbto.index_type = BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ DestroyAndReopen(options);
+
+ const std::string value(64, '\0');
+
+ WriteOptions write_opts;
+ write_opts.disableWAL = true;
+ for (size_t i = 0; i < kNumKeys; ++i) {
+ std::ostringstream oss;
+ oss << std::setfill('0') << std::setw(4) << std::fixed << i;
+ ASSERT_OK(db_->Put(write_opts, oss.str(), value));
+ }
+ ASSERT_OK(Flush());
+
+ ReadOptions read_opts;
+ // Use legacy, implicit prefix seek
+ read_opts.total_order_seek = false;
+ read_opts.auto_prefix_mode = false;
+ std::unique_ptr<Iterator> it(db_->NewIterator(read_opts));
+ for (size_t i = 0; i < kNumKeys; ++i) {
+    // Seek with a key after each one added but with the same prefix. One will
+    // surely cross a partition boundary.
+ std::ostringstream oss;
+ oss << std::setfill('0') << std::setw(4) << std::fixed << i << "a";
+ it->SeekForPrev(oss.str());
+ ASSERT_OK(it->status());
+ ASSERT_TRUE(it->Valid());
+ }
+ it.reset();
+}
+
+namespace {
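+// Compares keys byte-by-byte starting from their last byte; on a common
+// suffix, the shorter key orders first.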
+class BackwardBytewiseComparator : public Comparator {
+ public:
+ const char* Name() const override { return "BackwardBytewiseComparator"; }
+
+ int Compare(const Slice& a, const Slice& b) const override {
+ int min_size_neg = -static_cast<int>(std::min(a.size(), b.size()));
+ const char* a_end = a.data() + a.size();
+ const char* b_end = b.data() + b.size();
+ for (int i = -1; i >= min_size_neg; --i) {
+ if (a_end[i] != b_end[i]) {
+ if (static_cast<unsigned char>(a_end[i]) <
+ static_cast<unsigned char>(b_end[i])) {
+ return -1;
+ } else {
+ return 1;
+ }
+ }
+ }
+ return static_cast<int>(a.size()) - static_cast<int>(b.size());
+ }
+
+ void FindShortestSeparator(std::string* /*start*/,
+ const Slice& /*limit*/) const override {}
+
+ void FindShortSuccessor(std::string* /*key*/) const override {}
+};
+
+const BackwardBytewiseComparator kBackwardBytewiseComparator{};
+
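+// Uses the last four bytes of a key as its "prefix"; keys shorter than four
+// bytes are out of domain.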
+class FixedSuffix4Transform : public SliceTransform {
+ const char* Name() const override { return "FixedSuffixTransform"; }
+
+ Slice Transform(const Slice& src) const override {
+ return Slice(src.data() + src.size() - 4, 4);
+ }
+
+ bool InDomain(const Slice& src) const override { return src.size() >= 4; }
+};
+
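+// Returns {checked, useful} from the SST prefix bloom tickers when sst is
+// true, otherwise {hits + misses, misses} from the memtable bloom perf
+// context; both sources are reset as a side effect.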
+std::pair<uint64_t, uint64_t> GetBloomStat(const Options& options, bool sst) {
+ if (sst) {
+ return {
+ options.statistics->getAndResetTickerCount(BLOOM_FILTER_PREFIX_CHECKED),
+ options.statistics->getAndResetTickerCount(BLOOM_FILTER_PREFIX_USEFUL)};
+ } else {
+ auto hit = std::exchange(get_perf_context()->bloom_memtable_hit_count, 0);
+ auto miss = std::exchange(get_perf_context()->bloom_memtable_miss_count, 0);
+ return {hit + miss, miss};
+ }
+}
+
+std::pair<uint64_t, uint64_t> CheckedAndUseful(uint64_t checked,
+ uint64_t useful) {
+ return {checked, useful};
+}
+} // anonymous namespace
+
+// This uses a prefix_extractor + comparator combination that violates
+// one of the old obsolete, unnecessary axioms of prefix extraction:
+// * key.starts_with(prefix(key))
+// This axiom is not really needed, and we validate that here.
+TEST_F(DBBloomFilterTest, WeirdPrefixExtractorWithFilter1) {
+ BlockBasedTableOptions bbto;
+ bbto.filter_policy.reset(ROCKSDB_NAMESPACE::NewBloomFilterPolicy(10));
+ bbto.whole_key_filtering = false;
+
+ Options options = CurrentOptions();
+ options.comparator = &kBackwardBytewiseComparator;
+ options.prefix_extractor = std::make_shared<FixedSuffix4Transform>();
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ options.memtable_prefix_bloom_size_ratio = 0.1;
+ options.statistics = CreateDBStatistics();
+
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put("321aaaa", "val1"));
+ ASSERT_OK(Put("112aaaa", "val2"));
+ ASSERT_OK(Put("009aaaa", "val3"));
+ ASSERT_OK(Put("baa", "val4")); // out of domain
+ ASSERT_OK(Put("321abaa", "val5"));
+ ASSERT_OK(Put("zzz", "val6")); // out of domain
+
+ for (auto flushed : {false, true}) {
+ SCOPED_TRACE("flushed=" + std::to_string(flushed));
+ if (flushed) {
+ ASSERT_OK(Flush());
+ }
+ ReadOptions read_options;
+ if (flushed) { // TODO: support auto_prefix_mode in memtable?
+ read_options.auto_prefix_mode = true;
+ }
+ EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(0, 0));
+ {
+ Slice ub("999aaaa");
+ read_options.iterate_upper_bound = &ub;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ EXPECT_EQ(CountIter(iter, "aaaa"), 3);
+ EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 0));
+ }
+ {
+ Slice ub("999abaa");
+ read_options.iterate_upper_bound = &ub;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ EXPECT_EQ(CountIter(iter, "abaa"), 1);
+ EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 0));
+ }
+ {
+ Slice ub("999acaa");
+ read_options.iterate_upper_bound = &ub;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ EXPECT_EQ(CountIter(iter, "acaa"), 0);
+ EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 1));
+ }
+ {
+ Slice ub("zzzz");
+ read_options.iterate_upper_bound = &ub;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ EXPECT_EQ(CountIter(iter, "baa"), 3);
+ if (flushed) { // TODO: fix memtable case
+ EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(0, 0));
+ }
+ }
+ }
+}
+
+// This uses a prefix_extractor + comparator combination that violates
+// one of the old obsolete, unnecessary axioms of prefix extraction:
+// * Compare(prefix(key), key) <= 0
+// This axiom is not really needed, and we validate that here.
+TEST_F(DBBloomFilterTest, WeirdPrefixExtractorWithFilter2) {
+ BlockBasedTableOptions bbto;
+ bbto.filter_policy.reset(ROCKSDB_NAMESPACE::NewBloomFilterPolicy(10));
+ bbto.whole_key_filtering = false;
+
+ Options options = CurrentOptions();
+ options.comparator = ReverseBytewiseComparator();
+ options.prefix_extractor.reset(NewFixedPrefixTransform(4));
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ options.memtable_prefix_bloom_size_ratio = 0.1;
+ options.statistics = CreateDBStatistics();
+
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put("aaaa123", "val1"));
+ ASSERT_OK(Put("aaaa211", "val2"));
+ ASSERT_OK(Put("aaaa900", "val3"));
+ ASSERT_OK(Put("aab", "val4")); // out of domain
+ ASSERT_OK(Put("aaba123", "val5"));
+ ASSERT_OK(Put("qqqq123", "val7"));
+ ASSERT_OK(Put("qqqq", "val8"));
+ ASSERT_OK(Put("zzz", "val8")); // out of domain
+
+ for (auto flushed : {false, true}) {
+ SCOPED_TRACE("flushed=" + std::to_string(flushed));
+ if (flushed) {
+ ASSERT_OK(Flush());
+ }
+ ReadOptions read_options;
+ if (flushed) { // TODO: support auto_prefix_mode in memtable?
+ read_options.auto_prefix_mode = true;
+ } else {
+ // TODO: why needed?
+ get_perf_context()->bloom_memtable_hit_count = 0;
+ get_perf_context()->bloom_memtable_miss_count = 0;
+ }
+ EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(0, 0));
+ {
+ Slice ub("aaaa000");
+ read_options.iterate_upper_bound = &ub;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ EXPECT_EQ(CountIter(iter, "aaaa999"), 3);
+ EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 0));
+ }
+ {
+ // Note: prefix does work as upper bound
+ Slice ub("aaaa");
+ read_options.iterate_upper_bound = &ub;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ EXPECT_EQ(CountIter(iter, "aaaa999"), 3);
+ EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 0));
+ }
+ {
+ // Note: prefix does not work here as seek key
+ Slice ub("aaaa500");
+ read_options.iterate_upper_bound = &ub;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ EXPECT_EQ(CountIter(iter, "aaaa"), 0);
+ EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 0));
+ }
+ {
+ Slice ub("aaba000");
+ read_options.iterate_upper_bound = &ub;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ EXPECT_EQ(CountIter(iter, "aaba999"), 1);
+ EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 0));
+ }
+ {
+ Slice ub("aaca000");
+ read_options.iterate_upper_bound = &ub;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ EXPECT_EQ(CountIter(iter, "aaca999"), 0);
+ EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 1));
+ }
+ {
+ Slice ub("aaaz");
+ read_options.iterate_upper_bound = &ub;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ EXPECT_EQ(CountIter(iter, "zzz"), 5);
+ EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(0, 0));
+ }
+ {
+ // Note: prefix does work here as seek key, but only finds key equal
+ // to prefix (others with same prefix are less)
+ read_options.auto_prefix_mode = false;
+ read_options.iterate_upper_bound = nullptr;
+ read_options.prefix_same_as_start = true;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ EXPECT_EQ(CountIter(iter, "qqqq"), 1);
+ EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 0));
+ }
+ }
+}
+
+namespace {
+// A weird comparator that in combination with NonIdempotentFixed4Transform
+// breaks an old axiom of prefix filtering.
+class WeirdComparator : public Comparator {
+ public:
+ const char* Name() const override { return "WeirdComparator"; }
+
+ int Compare(const Slice& a, const Slice& b) const override {
+ bool a_in = a.size() >= 5;
+ bool b_in = b.size() >= 5;
+ if (a_in != b_in) {
+ // Order keys after prefixes
+ return a_in - b_in;
+ }
+ if (a_in) {
+ return BytewiseComparator()->Compare(a, b);
+ } else {
+ // Different ordering on the prefixes
+ return ReverseBytewiseComparator()->Compare(a, b);
+ }
+ }
+
+ void FindShortestSeparator(std::string* /*start*/,
+ const Slice& /*limit*/) const override {}
+
+ void FindShortSuccessor(std::string* /*key*/) const override {}
+};
+const WeirdComparator kWeirdComparator{};
+
+// Non-idempotent because the prefix is always 4 bytes, but that is
+// out-of-domain for keys to be assigned prefixes (>= 5 bytes)
+class NonIdempotentFixed4Transform : public SliceTransform {
+ const char* Name() const override { return "NonIdempotentFixed4Transform"; }
+
+ Slice Transform(const Slice& src) const override {
+ return Slice(src.data(), 4);
+ }
+
+ bool InDomain(const Slice& src) const override { return src.size() >= 5; }
+};
+} // anonymous namespace
+
+// This uses a prefix_extractor + comparator combination that violates
+// two of the old obsolete, unnecessary axioms of prefix extraction:
+// * prefix(prefix(key)) == prefix(key)
+// * If Compare(k1, k2) <= 0, then Compare(prefix(k1), prefix(k2)) <= 0
+// These axioms are not really needed, and we validate that here.
+TEST_F(DBBloomFilterTest, WeirdPrefixExtractorWithFilter3) {
+ BlockBasedTableOptions bbto;
+ bbto.filter_policy.reset(ROCKSDB_NAMESPACE::NewBloomFilterPolicy(10));
+ bbto.whole_key_filtering = false;
+
+ Options options = CurrentOptions();
+ options.prefix_extractor = std::make_shared<NonIdempotentFixed4Transform>();
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ options.memtable_prefix_bloom_size_ratio = 0.1;
+ options.statistics = CreateDBStatistics();
+
+ for (auto weird_comparator : {false, true}) {
+ if (weird_comparator) {
+ options.comparator = &kWeirdComparator;
+ }
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put("aaaa123", "val1"));
+ ASSERT_OK(Put("aaaa211", "val2"));
+ ASSERT_OK(Put("aaaa900", "val3"));
+ ASSERT_OK(Put("aab", "val4")); // out of domain
+ ASSERT_OK(Put("aaba123", "val5"));
+ ASSERT_OK(Put("qqqq123", "val7"));
+ ASSERT_OK(Put("qqqq", "val8")); // out of domain
+ ASSERT_OK(Put("zzzz", "val8")); // out of domain
+
+ for (auto flushed : {false, true}) {
+ SCOPED_TRACE("flushed=" + std::to_string(flushed));
+ if (flushed) {
+ ASSERT_OK(Flush());
+ }
+ ReadOptions read_options;
+ if (flushed) { // TODO: support auto_prefix_mode in memtable?
+ read_options.auto_prefix_mode = true;
+ } else {
+ // TODO: why needed?
+ get_perf_context()->bloom_memtable_hit_count = 0;
+ get_perf_context()->bloom_memtable_miss_count = 0;
+ }
+ EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(0, 0));
+ {
+ Slice ub("aaaa999");
+ read_options.iterate_upper_bound = &ub;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ EXPECT_EQ(CountIter(iter, "aaaa000"), 3);
+ EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 0));
+ }
+ {
+ // Note: prefix as seek key is not bloom-optimized
+ // Note: the count works with weird_comparator because "aaaa" is
+ // ordered as the last of the prefixes
+ Slice ub("aaaa999");
+ read_options.iterate_upper_bound = &ub;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ EXPECT_EQ(CountIter(iter, "aaaa"), 3);
+ EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(0, 0));
+ }
+ {
+ Slice ub("aaba9");
+ read_options.iterate_upper_bound = &ub;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ EXPECT_EQ(CountIter(iter, "aaba0"), 1);
+ EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 0));
+ }
+ {
+ Slice ub("aaca9");
+ read_options.iterate_upper_bound = &ub;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ EXPECT_EQ(CountIter(iter, "aaca0"), 0);
+ EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 1));
+ }
+ {
+ Slice ub("qqqq9");
+ read_options.iterate_upper_bound = &ub;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ EXPECT_EQ(CountIter(iter, "qqqq0"), 1);
+ EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 0));
+ }
+ {
+ // Note: prefix as seek key is not bloom-optimized
+ Slice ub("qqqq9");
+ read_options.iterate_upper_bound = &ub;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ EXPECT_EQ(CountIter(iter, "qqqq"), weird_comparator ? 7 : 2);
+ EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(0, 0));
+ }
+ {
+ // Note: prefix as seek key is not bloom-optimized
+ Slice ub("zzzz9");
+ read_options.iterate_upper_bound = &ub;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ EXPECT_EQ(CountIter(iter, "zzzz"), weird_comparator ? 8 : 1);
+ EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(0, 0));
+ }
+ {
+ Slice ub("zzzz9");
+ read_options.iterate_upper_bound = &ub;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ EXPECT_EQ(CountIter(iter, "aab"), weird_comparator ? 6 : 5);
+ EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(0, 0));
+ }
+ }
+ }
+}
+
+#endif // ROCKSDB_LITE
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_compaction_filter_test.cc b/src/rocksdb/db/db_compaction_filter_test.cc
new file mode 100644
index 000000000..be863d4f6
--- /dev/null
+++ b/src/rocksdb/db/db_compaction_filter_test.cc
@@ -0,0 +1,1036 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/db_test_util.h"
+#include "port/stack_trace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
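+// Counters bumped by the test compaction filters below; individual tests
+// reset them inline.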
+static int cfilter_count = 0;
+static int cfilter_skips = 0;
+
+// Value that ChangeFilter substitutes for existing values
+// during the compaction process.
+static std::string NEW_VALUE = "NewValue";
+
+class DBTestCompactionFilter : public DBTestBase {
+ public:
+ DBTestCompactionFilter()
+ : DBTestBase("db_compaction_filter_test", /*env_do_fsync=*/true) {}
+};
+
+// Param variant of DBTestBase::ChangeCompactOptions
+class DBTestCompactionFilterWithCompactParam
+ : public DBTestCompactionFilter,
+ public ::testing::WithParamInterface<DBTestBase::OptionConfig> {
+ public:
+ DBTestCompactionFilterWithCompactParam() : DBTestCompactionFilter() {
+ option_config_ = GetParam();
+ Destroy(last_options_);
+ auto options = CurrentOptions();
+ if (option_config_ == kDefault || option_config_ == kUniversalCompaction ||
+ option_config_ == kUniversalCompactionMultiLevel) {
+ options.create_if_missing = true;
+ }
+ if (option_config_ == kLevelSubcompactions ||
+ option_config_ == kUniversalSubcompactions) {
+ assert(options.max_subcompactions > 1);
+ }
+ Reopen(options);
+ }
+};
+
+#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+INSTANTIATE_TEST_CASE_P(
+ CompactionFilterWithOption, DBTestCompactionFilterWithCompactParam,
+ ::testing::Values(DBTestBase::OptionConfig::kDefault,
+ DBTestBase::OptionConfig::kUniversalCompaction,
+ DBTestBase::OptionConfig::kUniversalCompactionMultiLevel,
+ DBTestBase::OptionConfig::kLevelSubcompactions,
+ DBTestBase::OptionConfig::kUniversalSubcompactions));
+#else
+// Run fewer cases in non-full valgrind to save time.
+INSTANTIATE_TEST_CASE_P(CompactionFilterWithOption,
+ DBTestCompactionFilterWithCompactParam,
+ ::testing::Values(DBTestBase::OptionConfig::kDefault));
+#endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+
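+// Keeps every key/value pair; only counts how many entries it was asked to
+// filter.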
+class KeepFilter : public CompactionFilter {
+ public:
+ bool Filter(int /*level*/, const Slice& /*key*/, const Slice& /*value*/,
+ std::string* /*new_value*/,
+ bool* /*value_changed*/) const override {
+ cfilter_count++;
+ return false;
+ }
+
+ const char* Name() const override { return "KeepFilter"; }
+};
+
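+// Unconditionally drops every key/value pair and merge operand it sees.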
+class DeleteFilter : public CompactionFilter {
+ public:
+ bool Filter(int /*level*/, const Slice& /*key*/, const Slice& /*value*/,
+ std::string* /*new_value*/,
+ bool* /*value_changed*/) const override {
+ cfilter_count++;
+ return true;
+ }
+
+ bool FilterMergeOperand(int /*level*/, const Slice& /*key*/,
+ const Slice& /*operand*/) const override {
+ return true;
+ }
+
+ const char* Name() const override { return "DeleteFilter"; }
+};
+
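+// Drops keys whose numeric value is in (5, 105]; ignores snapshots.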
+class DeleteISFilter : public CompactionFilter {
+ public:
+ bool Filter(int /*level*/, const Slice& key, const Slice& /*value*/,
+ std::string* /*new_value*/,
+ bool* /*value_changed*/) const override {
+ cfilter_count++;
+ int i = std::stoi(key.ToString());
+ if (i > 5 && i <= 105) {
+ return true;
+ }
+ return false;
+ }
+
+ bool IgnoreSnapshots() const override { return true; }
+
+ const char* Name() const override { return "DeleteFilter"; }
+};
+
+// Skip x if floor(x/10) is even, use range skips. Requires that keys are
+// zero-padded to length 10.
+class SkipEvenFilter : public CompactionFilter {
+ public:
+ Decision FilterV2(int /*level*/, const Slice& key, ValueType /*value_type*/,
+ const Slice& /*existing_value*/, std::string* /*new_value*/,
+ std::string* skip_until) const override {
+ cfilter_count++;
+ int i = std::stoi(key.ToString());
+ if (i / 10 % 2 == 0) {
+ char key_str[100];
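+      // Skip ahead to the first key of the next decade, e.g. 23 -> 30.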
+ snprintf(key_str, sizeof(key_str), "%010d", i / 10 * 10 + 10);
+ *skip_until = key_str;
+ ++cfilter_skips;
+ return Decision::kRemoveAndSkipUntil;
+ }
+ return Decision::kKeep;
+ }
+
+ bool IgnoreSnapshots() const override { return true; }
+
+ const char* Name() const override { return "DeleteFilter"; }
+};
+
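+// Drops an entry iff its value equals *filtered_value_.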
+class ConditionalFilter : public CompactionFilter {
+ public:
+ explicit ConditionalFilter(const std::string* filtered_value)
+ : filtered_value_(filtered_value) {}
+ bool Filter(int /*level*/, const Slice& /*key*/, const Slice& value,
+ std::string* /*new_value*/,
+ bool* /*value_changed*/) const override {
+ return value.ToString() == *filtered_value_;
+ }
+
+ const char* Name() const override { return "ConditionalFilter"; }
+
+ private:
+ const std::string* filtered_value_;
+};
+
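+// Keeps every key but rewrites its value to NEW_VALUE.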
+class ChangeFilter : public CompactionFilter {
+ public:
+ explicit ChangeFilter() {}
+
+ bool Filter(int /*level*/, const Slice& /*key*/, const Slice& /*value*/,
+ std::string* new_value, bool* value_changed) const override {
+ assert(new_value != nullptr);
+ *new_value = NEW_VALUE;
+ *value_changed = true;
+ return false;
+ }
+
+ const char* Name() const override { return "ChangeFilter"; }
+};
+
+class KeepFilterFactory : public CompactionFilterFactory {
+ public:
+ explicit KeepFilterFactory(bool check_context = false,
+ bool check_context_cf_id = false)
+ : check_context_(check_context),
+ check_context_cf_id_(check_context_cf_id),
+ compaction_filter_created_(false) {}
+
+ std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+ const CompactionFilter::Context& context) override {
+ if (check_context_) {
+ EXPECT_EQ(expect_full_compaction_.load(), context.is_full_compaction);
+ EXPECT_EQ(expect_manual_compaction_.load(), context.is_manual_compaction);
+ }
+ if (check_context_cf_id_) {
+ EXPECT_EQ(expect_cf_id_.load(), context.column_family_id);
+ }
+ compaction_filter_created_ = true;
+ return std::unique_ptr<CompactionFilter>(new KeepFilter());
+ }
+
+ bool compaction_filter_created() const { return compaction_filter_created_; }
+
+ const char* Name() const override { return "KeepFilterFactory"; }
+ bool check_context_;
+ bool check_context_cf_id_;
+ std::atomic_bool expect_full_compaction_;
+ std::atomic_bool expect_manual_compaction_;
+ std::atomic<uint32_t> expect_cf_id_;
+ bool compaction_filter_created_;
+};
+
+// This filter factory is configured with a `TableFileCreationReason`. Only
+// table files created for that reason will undergo filtering. This
+// configurability makes it useful for testing the filtering of non-compaction
+// table files, as in "CompactionFilterFlush" and "CompactionFilterRecovery".
+class DeleteFilterFactory : public CompactionFilterFactory {
+ public:
+ explicit DeleteFilterFactory(TableFileCreationReason reason)
+ : reason_(reason) {}
+
+ std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+ const CompactionFilter::Context& context) override {
+ EXPECT_EQ(reason_, context.reason);
+ if (context.reason == TableFileCreationReason::kCompaction &&
+ !context.is_manual_compaction) {
+ // Table files created by automatic compaction do not undergo filtering.
+ // Presumably some tests rely on this.
+ return std::unique_ptr<CompactionFilter>(nullptr);
+ }
+ return std::unique_ptr<CompactionFilter>(new DeleteFilter());
+ }
+
+ bool ShouldFilterTableFileCreation(
+ TableFileCreationReason reason) const override {
+ return reason_ == reason;
+ }
+
+ const char* Name() const override { return "DeleteFilterFactory"; }
+
+ private:
+ const TableFileCreationReason reason_;
+};
+
+// Delete Filter Factory which ignores snapshots
+class DeleteISFilterFactory : public CompactionFilterFactory {
+ public:
+ std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+ const CompactionFilter::Context& context) override {
+ if (context.is_manual_compaction) {
+ return std::unique_ptr<CompactionFilter>(new DeleteISFilter());
+ } else {
+ return std::unique_ptr<CompactionFilter>(nullptr);
+ }
+ }
+
+ const char* Name() const override { return "DeleteFilterFactory"; }
+};
+
+class SkipEvenFilterFactory : public CompactionFilterFactory {
+ public:
+ std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+ const CompactionFilter::Context& context) override {
+ if (context.is_manual_compaction) {
+ return std::unique_ptr<CompactionFilter>(new SkipEvenFilter());
+ } else {
+ return std::unique_ptr<CompactionFilter>(nullptr);
+ }
+ }
+
+ const char* Name() const override { return "SkipEvenFilterFactory"; }
+};
+
+class ConditionalFilterFactory : public CompactionFilterFactory {
+ public:
+ explicit ConditionalFilterFactory(const Slice& filtered_value)
+ : filtered_value_(filtered_value.ToString()) {}
+
+ std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+ const CompactionFilter::Context& /*context*/) override {
+ return std::unique_ptr<CompactionFilter>(
+ new ConditionalFilter(&filtered_value_));
+ }
+
+ const char* Name() const override { return "ConditionalFilterFactory"; }
+
+ private:
+ std::string filtered_value_;
+};
+
+class ChangeFilterFactory : public CompactionFilterFactory {
+ public:
+ explicit ChangeFilterFactory() {}
+
+ std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+ const CompactionFilter::Context& /*context*/) override {
+ return std::unique_ptr<CompactionFilter>(new ChangeFilter());
+ }
+
+ const char* Name() const override { return "ChangeFilterFactory"; }
+};
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTestCompactionFilter, CompactionFilter) {
+ Options options = CurrentOptions();
+ options.max_open_files = -1;
+ options.num_levels = 3;
+ options.compaction_filter_factory = std::make_shared<KeepFilterFactory>();
+ options = CurrentOptions(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // Write 100K keys, these are written to a few files in L0.
+ const std::string value(10, 'x');
+ for (int i = 0; i < 100000; i++) {
+ char key[100];
+ snprintf(key, sizeof(key), "B%010d", i);
+ ASSERT_OK(Put(1, key, value));
+ }
+ ASSERT_OK(Flush(1));
+
+  // Push all files to the highest level L2. Verify that
+  // the compaction at each level invokes the filter for
+  // all the keys in that level.
+ cfilter_count = 0;
+ ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]));
+ ASSERT_EQ(cfilter_count, 100000);
+ cfilter_count = 0;
+ ASSERT_OK(dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]));
+ ASSERT_EQ(cfilter_count, 100000);
+
+ ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+ ASSERT_EQ(NumTableFilesAtLevel(1, 1), 0);
+ ASSERT_NE(NumTableFilesAtLevel(2, 1), 0);
+ cfilter_count = 0;
+
+  // All the files are in the lowest level.
+  // Verify that every record now has sequence number zero: no snapshot is
+  // held, so the sequence numbers of all keys in the bottommost level can
+  // be zeroed out.
+ int count = 0;
+ int total = 0;
+ Arena arena;
+ {
+ InternalKeyComparator icmp(options.comparator);
+ ReadOptions read_options;
+ ScopedArenaIterator iter(dbfull()->NewInternalIterator(
+ read_options, &arena, kMaxSequenceNumber, handles_[1]));
+ iter->SeekToFirst();
+ ASSERT_OK(iter->status());
+ while (iter->Valid()) {
+ ParsedInternalKey ikey(Slice(), 0, kTypeValue);
+ ASSERT_OK(ParseInternalKey(iter->key(), &ikey, true /* log_err_key */));
+ total++;
+ if (ikey.sequence != 0) {
+ count++;
+ }
+ iter->Next();
+ }
+ ASSERT_OK(iter->status());
+ }
+ ASSERT_EQ(total, 100000);
+ ASSERT_EQ(count, 0);
+
+ // overwrite all the 100K keys once again.
+ for (int i = 0; i < 100000; i++) {
+ char key[100];
+ snprintf(key, sizeof(key), "B%010d", i);
+ ASSERT_OK(Put(1, key, value));
+ }
+ ASSERT_OK(Flush(1));
+
+ // push all files to the highest level L2. This
+ // means that all keys should pass at least once
+ // via the compaction filter
+ cfilter_count = 0;
+ ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]));
+ ASSERT_EQ(cfilter_count, 100000);
+ cfilter_count = 0;
+ ASSERT_OK(dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]));
+ ASSERT_EQ(cfilter_count, 100000);
+ ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+ ASSERT_EQ(NumTableFilesAtLevel(1, 1), 0);
+ ASSERT_NE(NumTableFilesAtLevel(2, 1), 0);
+
+ // create a new database with the compaction
+ // filter in such a way that it deletes all keys
+ options.compaction_filter_factory = std::make_shared<DeleteFilterFactory>(
+ TableFileCreationReason::kCompaction);
+ options.create_if_missing = true;
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // write all the keys once again.
+ for (int i = 0; i < 100000; i++) {
+ char key[100];
+ snprintf(key, sizeof(key), "B%010d", i);
+ ASSERT_OK(Put(1, key, value));
+ }
+ ASSERT_OK(Flush(1));
+ ASSERT_NE(NumTableFilesAtLevel(0, 1), 0);
+ ASSERT_EQ(NumTableFilesAtLevel(1, 1), 0);
+ ASSERT_EQ(NumTableFilesAtLevel(2, 1), 0);
+
+ // Push all files to the highest level L2. This
+ // triggers the compaction filter to delete all keys,
+ // verify that at the end of the compaction process,
+ // nothing is left.
+ cfilter_count = 0;
+ ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]));
+ ASSERT_EQ(cfilter_count, 100000);
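+  // The first pass (L0 -> L1) already removed every key, so the second pass
+  // over L1 finds nothing and the filter is never invoked.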
+ cfilter_count = 0;
+ ASSERT_OK(dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]));
+ ASSERT_EQ(cfilter_count, 0);
+ ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+ ASSERT_EQ(NumTableFilesAtLevel(1, 1), 0);
+
+ {
+ // Scan the entire database to ensure that nothing is left
+ std::unique_ptr<Iterator> iter(
+ db_->NewIterator(ReadOptions(), handles_[1]));
+ iter->SeekToFirst();
+ count = 0;
+ while (iter->Valid()) {
+ count++;
+ iter->Next();
+ }
+ ASSERT_OK(iter->status());
+ ASSERT_EQ(count, 0);
+ }
+
+  // The sequence number of any remaining record would not be zeroed out,
+  // even at the bottommost level, because it would be at the tip. However,
+  // the delete filter removed every key, so nothing should remain.
+ count = 0;
+ {
+ InternalKeyComparator icmp(options.comparator);
+ ReadOptions read_options;
+ ScopedArenaIterator iter(dbfull()->NewInternalIterator(
+ read_options, &arena, kMaxSequenceNumber, handles_[1]));
+ iter->SeekToFirst();
+ ASSERT_OK(iter->status());
+ while (iter->Valid()) {
+ ParsedInternalKey ikey(Slice(), 0, kTypeValue);
+ ASSERT_OK(ParseInternalKey(iter->key(), &ikey, true /* log_err_key */));
+ ASSERT_NE(ikey.sequence, (unsigned)0);
+ count++;
+ iter->Next();
+ }
+ ASSERT_EQ(count, 0);
+ }
+}
+
+// Tests the edge case where compaction does not produce any output -- all
+// entries are deleted. The compaction should create a bunch of 'DeleteFile'
+// entries in the VersionEdit, but no 'AddFile' entries.
+TEST_F(DBTestCompactionFilter, CompactionFilterDeletesAll) {
+ Options options = CurrentOptions();
+ options.compaction_filter_factory = std::make_shared<DeleteFilterFactory>(
+ TableFileCreationReason::kCompaction);
+ options.disable_auto_compactions = true;
+ options.create_if_missing = true;
+ DestroyAndReopen(options);
+
+ // put some data
+ for (int table = 0; table < 4; ++table) {
+ for (int i = 0; i < 10 + table; ++i) {
+ ASSERT_OK(Put(std::to_string(table * 100 + i), "val"));
+ }
+ ASSERT_OK(Flush());
+ }
+
+ // this will produce empty file (delete compaction filter)
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ(0U, CountLiveFiles());
+
+ Reopen(options);
+
+ Iterator* itr = db_->NewIterator(ReadOptions());
+ itr->SeekToFirst();
+ ASSERT_OK(itr->status());
+ // empty db
+ ASSERT_TRUE(!itr->Valid());
+
+ delete itr;
+}
+#endif // ROCKSDB_LITE
+
+TEST_F(DBTestCompactionFilter, CompactionFilterFlush) {
+  // Tests a `CompactionFilterFactory` that filters when a table file is
+  // created by flush.
+ Options options = CurrentOptions();
+ options.compaction_filter_factory =
+ std::make_shared<DeleteFilterFactory>(TableFileCreationReason::kFlush);
+ options.merge_operator = MergeOperators::CreateStringAppendOperator();
+ Reopen(options);
+
+ // Puts and Merges are purged in flush.
+ ASSERT_OK(Put("a", "v"));
+ ASSERT_OK(Merge("b", "v"));
+ ASSERT_OK(Flush());
+ ASSERT_EQ("NOT_FOUND", Get("a"));
+ ASSERT_EQ("NOT_FOUND", Get("b"));
+
+ // However, Puts and Merges are preserved by recovery.
+ ASSERT_OK(Put("a", "v"));
+ ASSERT_OK(Merge("b", "v"));
+ Reopen(options);
+ ASSERT_EQ("v", Get("a"));
+ ASSERT_EQ("v", Get("b"));
+
+ // Likewise, compaction does not apply filtering.
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ("v", Get("a"));
+ ASSERT_EQ("v", Get("b"));
+}
+
+TEST_F(DBTestCompactionFilter, CompactionFilterRecovery) {
+  // Tests a `CompactionFilterFactory` that filters when a table file is
+  // created by recovery.
+ Options options = CurrentOptions();
+ options.compaction_filter_factory =
+ std::make_shared<DeleteFilterFactory>(TableFileCreationReason::kRecovery);
+ options.merge_operator = MergeOperators::CreateStringAppendOperator();
+ Reopen(options);
+
+ // Puts and Merges are purged in recovery.
+ ASSERT_OK(Put("a", "v"));
+ ASSERT_OK(Merge("b", "v"));
+ Reopen(options);
+ ASSERT_EQ("NOT_FOUND", Get("a"));
+ ASSERT_EQ("NOT_FOUND", Get("b"));
+
+ // However, Puts and Merges are preserved by flush.
+ ASSERT_OK(Put("a", "v"));
+ ASSERT_OK(Merge("b", "v"));
+ ASSERT_OK(Flush());
+ ASSERT_EQ("v", Get("a"));
+ ASSERT_EQ("v", Get("b"));
+
+ // Likewise, compaction does not apply filtering.
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ("v", Get("a"));
+ ASSERT_EQ("v", Get("b"));
+}
+
+TEST_P(DBTestCompactionFilterWithCompactParam,
+ CompactionFilterWithValueChange) {
+ Options options = CurrentOptions();
+ options.num_levels = 3;
+ options.compaction_filter_factory = std::make_shared<ChangeFilterFactory>();
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+  // Write 100K+1 keys; these are written to a few files
+  // in L0. We do this so that the current snapshot points
+  // to the 100001st key. The compaction filter is not invoked
+  // on keys that are visible via a snapshot because we
+  // cannot delete them anyway.
+ const std::string value(10, 'x');
+ for (int i = 0; i < 100001; i++) {
+ char key[100];
+ snprintf(key, sizeof(key), "B%010d", i);
+ ASSERT_OK(Put(1, key, value));
+ }
+
+ // push all files to lower levels
+ ASSERT_OK(Flush(1));
+ if (option_config_ != kUniversalCompactionMultiLevel &&
+ option_config_ != kUniversalSubcompactions) {
+ ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]));
+ ASSERT_OK(dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]));
+ } else {
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[1],
+ nullptr, nullptr));
+ }
+
+ // re-write all data again
+ for (int i = 0; i < 100001; i++) {
+ char key[100];
+ snprintf(key, sizeof(key), "B%010d", i);
+ ASSERT_OK(Put(1, key, value));
+ }
+
+ // push all files to lower levels. This should
+ // invoke the compaction filter for all 100000 keys.
+ ASSERT_OK(Flush(1));
+ if (option_config_ != kUniversalCompactionMultiLevel &&
+ option_config_ != kUniversalSubcompactions) {
+ ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]));
+ ASSERT_OK(dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]));
+ } else {
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[1],
+ nullptr, nullptr));
+ }
+
+ // verify that all keys now have the new value that
+ // was set by the compaction process.
+ for (int i = 0; i < 100001; i++) {
+ char key[100];
+ snprintf(key, sizeof(key), "B%010d", i);
+ std::string newvalue = Get(1, key);
+ ASSERT_EQ(newvalue.compare(NEW_VALUE), 0);
+ }
+}
+
+TEST_F(DBTestCompactionFilter, CompactionFilterWithMergeOperator) {
+ std::string one, two, three, four;
+ PutFixed64(&one, 1);
+ PutFixed64(&two, 2);
+ PutFixed64(&three, 3);
+ PutFixed64(&four, 4);
+
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.merge_operator = MergeOperators::CreateUInt64AddOperator();
+ options.num_levels = 3;
+  // Filter out keys whose value is 2.
+ options.compaction_filter_factory =
+ std::make_shared<ConditionalFilterFactory>(two);
+ DestroyAndReopen(options);
+
+  // In the same compaction, a value-type entry would be deleted based on the
+  // compaction filter, but there is also a merge-type entry for the key, so
+  // the compaction filter result is ignored.
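+  // With the UInt64AddOperator, Put("foo", 2) followed by Merge("foo", 1)
+  // reads back as 3 (2 + 1), both before and after the compaction below.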
+ ASSERT_OK(db_->Put(WriteOptions(), "foo", two));
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->Merge(WriteOptions(), "foo", one));
+ ASSERT_OK(Flush());
+ std::string newvalue = Get("foo");
+ ASSERT_EQ(newvalue, three);
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ newvalue = Get("foo");
+ ASSERT_EQ(newvalue, three);
+
+  // A value-type key can be deleted based on the compaction filter, leaving
+  // only merge keys.
+ ASSERT_OK(db_->Put(WriteOptions(), "bar", two));
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ newvalue = Get("bar");
+ ASSERT_EQ("NOT_FOUND", newvalue);
+ ASSERT_OK(db_->Merge(WriteOptions(), "bar", two));
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ newvalue = Get("bar");
+  ASSERT_EQ(newvalue, two);
+
+ // Compaction filter never applies to merge keys.
+ ASSERT_OK(db_->Put(WriteOptions(), "foobar", one));
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->Merge(WriteOptions(), "foobar", two));
+ ASSERT_OK(Flush());
+ newvalue = Get("foobar");
+ ASSERT_EQ(newvalue, three);
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ newvalue = Get("foobar");
+ ASSERT_EQ(newvalue, three);
+
+  // In the same compaction, both the value-type and the merge-type entries of
+  // the key match the filter condition, and a merge is stacked on top of the
+  // value. For both entries, the compaction filter result is ignored.
+ ASSERT_OK(db_->Put(WriteOptions(), "barfoo", two));
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->Merge(WriteOptions(), "barfoo", two));
+ ASSERT_OK(Flush());
+ newvalue = Get("barfoo");
+ ASSERT_EQ(newvalue, four);
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ newvalue = Get("barfoo");
+ ASSERT_EQ(newvalue, four);
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTestCompactionFilter, CompactionFilterContextManual) {
+ KeepFilterFactory* filter = new KeepFilterFactory(true, true);
+
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.compaction_filter_factory.reset(filter);
+ options.compression = kNoCompression;
+ options.level0_file_num_compaction_trigger = 8;
+ Reopen(options);
+ int num_keys_per_file = 400;
+ for (int j = 0; j < 3; j++) {
+ // Write several keys.
+ const std::string value(10, 'x');
+ for (int i = 0; i < num_keys_per_file; i++) {
+ char key[100];
+ snprintf(key, sizeof(key), "B%08d%02d", i, j);
+ ASSERT_OK(Put(key, value));
+ }
+ ASSERT_OK(dbfull()->TEST_FlushMemTable());
+ // Make sure next file is much smaller so automatic compaction will not
+ // be triggered.
+ num_keys_per_file /= 2;
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
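+  // The three flushed files contain 400, 200, and 100 keys respectively, so
+  // a full manual compaction should run the filter over 700 keys in total.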
+ // Force a manual compaction
+ cfilter_count = 0;
+ filter->expect_manual_compaction_.store(true);
+ filter->expect_full_compaction_.store(true);
+ filter->expect_cf_id_.store(0);
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ(cfilter_count, 700);
+ ASSERT_EQ(NumSortedRuns(0), 1);
+ ASSERT_TRUE(filter->compaction_filter_created());
+
+ // Verify total number of keys is correct after manual compaction.
+ {
+ int count = 0;
+ int total = 0;
+ Arena arena;
+ InternalKeyComparator icmp(options.comparator);
+ ReadOptions read_options;
+ ScopedArenaIterator iter(dbfull()->NewInternalIterator(read_options, &arena,
+ kMaxSequenceNumber));
+ iter->SeekToFirst();
+ ASSERT_OK(iter->status());
+ while (iter->Valid()) {
+ ParsedInternalKey ikey(Slice(), 0, kTypeValue);
+ ASSERT_OK(ParseInternalKey(iter->key(), &ikey, true /* log_err_key */));
+ total++;
+ if (ikey.sequence != 0) {
+ count++;
+ }
+ iter->Next();
+ }
+ ASSERT_EQ(total, 700);
+ ASSERT_EQ(count, 0);
+ }
+}
+#endif // ROCKSDB_LITE
+
+TEST_F(DBTestCompactionFilter, CompactionFilterContextCfId) {
+ KeepFilterFactory* filter = new KeepFilterFactory(false, true);
+ filter->expect_cf_id_.store(1);
+
+ Options options = CurrentOptions();
+ options.compaction_filter_factory.reset(filter);
+ options.compression = kNoCompression;
+ options.level0_file_num_compaction_trigger = 2;
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ int num_keys_per_file = 400;
+ for (int j = 0; j < 3; j++) {
+ // Write several keys.
+ const std::string value(10, 'x');
+ for (int i = 0; i < num_keys_per_file; i++) {
+ char key[100];
+ snprintf(key, sizeof(key), "B%08d%02d", i, j);
+ ASSERT_OK(Put(1, key, value));
+ }
+ ASSERT_OK(Flush(1));
+ // Make sure next file is much smaller so automatic compaction will not
+ // be triggered.
+ num_keys_per_file /= 2;
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ ASSERT_TRUE(filter->compaction_filter_created());
+}
+
+#ifndef ROCKSDB_LITE
+// Compaction filters apply to all records, regardless of snapshots.
+TEST_F(DBTestCompactionFilter, CompactionFilterIgnoreSnapshot) {
+ std::string five = std::to_string(5);
+ Options options = CurrentOptions();
+ options.compaction_filter_factory = std::make_shared<DeleteISFilterFactory>();
+ options.disable_auto_compactions = true;
+ options.create_if_missing = true;
+ DestroyAndReopen(options);
+
+ // Put some data.
+ const Snapshot* snapshot = nullptr;
+ for (int table = 0; table < 4; ++table) {
+ for (int i = 0; i < 10; ++i) {
+ ASSERT_OK(Put(std::to_string(table * 100 + i), "val"));
+ }
+ ASSERT_OK(Flush());
+
+ if (table == 0) {
+ snapshot = db_->GetSnapshot();
+ }
+ }
+ assert(snapshot != nullptr);
+
+ cfilter_count = 0;
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  // The filter should be invoked on all 40 records.
+ ASSERT_EQ(40, cfilter_count);
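+  // Per the checks below, only keys 6-9 and 100-105 are actually dropped;
+  // the remaining 30 keys survive the compaction.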
+
+ {
+    // Scan the entire database as of the snapshot to see which of its
+    // keys survived the snapshot-ignoring filter.
+ ReadOptions read_options;
+ read_options.snapshot = snapshot;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ iter->SeekToFirst();
+ ASSERT_OK(iter->status());
+ int count = 0;
+ while (iter->Valid()) {
+ count++;
+ iter->Next();
+ }
+ ASSERT_EQ(count, 6);
+ read_options.snapshot = nullptr;
+ std::unique_ptr<Iterator> iter1(db_->NewIterator(read_options));
+ ASSERT_OK(iter1->status());
+ iter1->SeekToFirst();
+ count = 0;
+ while (iter1->Valid()) {
+ count++;
+ iter1->Next();
+ }
+ // We have deleted 10 keys from 40 using the compaction filter
+ // Keys 6-9 before the snapshot and 100-105 after the snapshot
+ ASSERT_EQ(count, 30);
+ }
+
+ // Release the snapshot and compact again -> now all records should be
+ // removed.
+ db_->ReleaseSnapshot(snapshot);
+}
+#endif // ROCKSDB_LITE
+
+TEST_F(DBTestCompactionFilter, SkipUntil) {
+ Options options = CurrentOptions();
+ options.compaction_filter_factory = std::make_shared<SkipEvenFilterFactory>();
+ options.disable_auto_compactions = true;
+ options.create_if_missing = true;
+ DestroyAndReopen(options);
+
+  // Write four batches of keys; each batch is flushed to its own L0 file.
+ for (int table = 0; table < 4; ++table) {
+ // Key ranges in tables are [0, 38], [106, 149], [212, 260], [318, 371].
+ for (int i = table * 6; i < 39 + table * 11; ++i) {
+ char key[100];
+ snprintf(key, sizeof(key), "%010d", table * 100 + i);
+ ASSERT_OK(Put(key, std::to_string(table * 1000 + i)));
+ }
+ ASSERT_OK(Flush());
+ }
+
+ cfilter_skips = 0;
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ // Number of skips in tables: 2, 3, 3, 3.
+ ASSERT_EQ(11, cfilter_skips);
+
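+  // Keys whose decade (k / 10) is even should have been dropped by the
+  // filter; all other keys should still be present with their original values.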
+ for (int table = 0; table < 4; ++table) {
+ for (int i = table * 6; i < 39 + table * 11; ++i) {
+ int k = table * 100 + i;
+ char key[100];
+ snprintf(key, sizeof(key), "%010d", table * 100 + i);
+ auto expected = std::to_string(table * 1000 + i);
+ std::string val;
+ Status s = db_->Get(ReadOptions(), key, &val);
+ if (k / 10 % 2 == 0) {
+ ASSERT_TRUE(s.IsNotFound());
+ } else {
+ ASSERT_OK(s);
+ ASSERT_EQ(expected, val);
+ }
+ }
+ }
+}
+
+TEST_F(DBTestCompactionFilter, SkipUntilWithBloomFilter) {
+ BlockBasedTableOptions table_options;
+ table_options.whole_key_filtering = false;
+ table_options.filter_policy.reset(NewBloomFilterPolicy(100, false));
+
+ Options options = CurrentOptions();
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ options.prefix_extractor.reset(NewCappedPrefixTransform(9));
+ options.compaction_filter_factory = std::make_shared<SkipEvenFilterFactory>();
+ options.disable_auto_compactions = true;
+ options.create_if_missing = true;
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put("0000000010", "v10"));
+ ASSERT_OK(Put("0000000020", "v20")); // skipped
+ ASSERT_OK(Put("0000000050", "v50"));
+ ASSERT_OK(Flush());
+
+ cfilter_skips = 0;
+ EXPECT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ EXPECT_EQ(1, cfilter_skips);
+
+ Status s;
+ std::string val;
+
+ s = db_->Get(ReadOptions(), "0000000010", &val);
+ ASSERT_OK(s);
+ EXPECT_EQ("v10", val);
+
+ s = db_->Get(ReadOptions(), "0000000020", &val);
+ EXPECT_TRUE(s.IsNotFound());
+
+ s = db_->Get(ReadOptions(), "0000000050", &val);
+ ASSERT_OK(s);
+ EXPECT_EQ("v50", val);
+}
+
+class TestNotSupportedFilter : public CompactionFilter {
+ public:
+ bool Filter(int /*level*/, const Slice& /*key*/, const Slice& /*value*/,
+ std::string* /*new_value*/,
+ bool* /*value_changed*/) const override {
+ return true;
+ }
+
+ const char* Name() const override { return "NotSupported"; }
+ bool IgnoreSnapshots() const override { return false; }
+};
+
+TEST_F(DBTestCompactionFilter, IgnoreSnapshotsFalse) {
+ Options options = CurrentOptions();
+ options.compaction_filter = new TestNotSupportedFilter();
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put("a", "v10"));
+ ASSERT_OK(Put("z", "v20"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put("a", "v10"));
+ ASSERT_OK(Put("z", "v20"));
+ ASSERT_OK(Flush());
+
+  // Compaction should fail because IgnoreSnapshots() = false
+ EXPECT_TRUE(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)
+ .IsNotSupported());
+
+ delete options.compaction_filter;
+}
+
+class TestNotSupportedFilterFactory : public CompactionFilterFactory {
+ public:
+ explicit TestNotSupportedFilterFactory(TableFileCreationReason reason)
+ : reason_(reason) {}
+
+ bool ShouldFilterTableFileCreation(
+ TableFileCreationReason reason) const override {
+ return reason_ == reason;
+ }
+
+ std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+ const CompactionFilter::Context& /* context */) override {
+ return std::unique_ptr<CompactionFilter>(new TestNotSupportedFilter());
+ }
+
+ const char* Name() const override { return "TestNotSupportedFilterFactory"; }
+
+ private:
+ const TableFileCreationReason reason_;
+};
+
+TEST_F(DBTestCompactionFilter, IgnoreSnapshotsFalseDuringFlush) {
+ Options options = CurrentOptions();
+ options.compaction_filter_factory =
+ std::make_shared<TestNotSupportedFilterFactory>(
+ TableFileCreationReason::kFlush);
+ Reopen(options);
+
+ ASSERT_OK(Put("a", "v10"));
+ ASSERT_TRUE(Flush().IsNotSupported());
+}
+
+TEST_F(DBTestCompactionFilter, IgnoreSnapshotsFalseRecovery) {
+ Options options = CurrentOptions();
+ options.compaction_filter_factory =
+ std::make_shared<TestNotSupportedFilterFactory>(
+ TableFileCreationReason::kRecovery);
+ Reopen(options);
+
+ ASSERT_OK(Put("a", "v10"));
+ ASSERT_TRUE(TryReopen(options).IsNotSupported());
+}
+
+TEST_F(DBTestCompactionFilter, DropKeyWithSingleDelete) {
+ Options options = GetDefaultOptions();
+ options.create_if_missing = true;
+
+ Reopen(options);
+
+ ASSERT_OK(Put("a", "v0"));
+ ASSERT_OK(Put("b", "v0"));
+ const Snapshot* snapshot = db_->GetSnapshot();
+
+ ASSERT_OK(SingleDelete("b"));
+ ASSERT_OK(Flush());
+
+ {
+ CompactRangeOptions cro;
+ cro.change_level = true;
+ cro.target_level = options.num_levels - 1;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ }
+
+ db_->ReleaseSnapshot(snapshot);
+ Close();
+
+ class DeleteFilterV2 : public CompactionFilter {
+ public:
+ Decision FilterV2(int /*level*/, const Slice& key, ValueType /*value_type*/,
+ const Slice& /*existing_value*/,
+ std::string* /*new_value*/,
+ std::string* /*skip_until*/) const override {
+ if (key.starts_with("b")) {
+ return Decision::kPurge;
+ }
+ return Decision::kRemove;
+ }
+
+ const char* Name() const override { return "DeleteFilterV2"; }
+ } delete_filter_v2;
+
+ options.compaction_filter = &delete_filter_v2;
+ options.level0_file_num_compaction_trigger = 2;
+ Reopen(options);
+
+ ASSERT_OK(Put("b", "v1"));
+ ASSERT_OK(Put("x", "v1"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put("r", "v1"));
+ ASSERT_OK(Put("z", "v1"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ Close();
+
+ options.compaction_filter = nullptr;
+ Reopen(options);
+ ASSERT_OK(SingleDelete("b"));
+ ASSERT_OK(Flush());
+ {
+ CompactRangeOptions cro;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ }
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_compaction_test.cc b/src/rocksdb/db/db_compaction_test.cc
new file mode 100644
index 000000000..ba9c50b9a
--- /dev/null
+++ b/src/rocksdb/db/db_compaction_test.cc
@@ -0,0 +1,8227 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <tuple>
+
+#include "db/blob/blob_index.h"
+#include "db/db_test_util.h"
+#include "env/mock_env.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "rocksdb/concurrent_task_limiter.h"
+#include "rocksdb/experimental.h"
+#include "rocksdb/sst_file_writer.h"
+#include "rocksdb/utilities/convenience.h"
+#include "test_util/sync_point.h"
+#include "test_util/testutil.h"
+#include "util/concurrent_task_limiter_impl.h"
+#include "util/random.h"
+#include "utilities/fault_injection_env.h"
+#include "utilities/fault_injection_fs.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// SYNC_POINT is not supported in released Windows mode.
+#if !defined(ROCKSDB_LITE)
+
+class CompactionStatsCollector : public EventListener {
+ public:
+ CompactionStatsCollector()
+ : compaction_completed_(
+ static_cast<int>(CompactionReason::kNumOfReasons)) {
+ for (auto& v : compaction_completed_) {
+ v.store(0);
+ }
+ }
+
+ ~CompactionStatsCollector() override {}
+
+ void OnCompactionCompleted(DB* /* db */,
+ const CompactionJobInfo& info) override {
+ int k = static_cast<int>(info.compaction_reason);
+ int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
+ assert(k >= 0 && k < num_of_reasons);
+ compaction_completed_[k]++;
+ }
+
+ void OnExternalFileIngested(
+ DB* /* db */, const ExternalFileIngestionInfo& /* info */) override {
+ int k = static_cast<int>(CompactionReason::kExternalSstIngestion);
+ compaction_completed_[k]++;
+ }
+
+ void OnFlushCompleted(DB* /* db */, const FlushJobInfo& /* info */) override {
+ int k = static_cast<int>(CompactionReason::kFlush);
+ compaction_completed_[k]++;
+ }
+
+ int NumberOfCompactions(CompactionReason reason) const {
+ int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
+ int k = static_cast<int>(reason);
+ assert(k >= 0 && k < num_of_reasons);
+ return compaction_completed_.at(k).load();
+ }
+
+ private:
+ std::vector<std::atomic<int>> compaction_completed_;
+};
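+
+// A wiring sketch (assumption: registered like the other EventListeners in
+// this file, via Options::listeners):
+//   auto collector = std::make_shared<CompactionStatsCollector>();
+//   options.listeners.emplace_back(collector);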
+
+class DBCompactionTest : public DBTestBase {
+ public:
+ DBCompactionTest()
+ : DBTestBase("db_compaction_test", /*env_do_fsync=*/true) {}
+
+ protected:
+ /*
+ * Verifies compaction stats of cfd are valid.
+ *
+ * For each level of cfd, its compaction stats are valid if
+ * 1) sum(stat.counts) == stat.count, and
+ * 2) stat.counts[i] == collector.NumberOfCompactions(i)
+ */
+ void VerifyCompactionStats(ColumnFamilyData& cfd,
+ const CompactionStatsCollector& collector) {
+#ifndef NDEBUG
+ InternalStats* internal_stats_ptr = cfd.internal_stats();
+ ASSERT_NE(internal_stats_ptr, nullptr);
+ const std::vector<InternalStats::CompactionStats>& comp_stats =
+ internal_stats_ptr->TEST_GetCompactionStats();
+ const int num_of_reasons =
+ static_cast<int>(CompactionReason::kNumOfReasons);
+ std::vector<int> counts(num_of_reasons, 0);
+ // Count the number of compactions caused by each CompactionReason across
+ // all levels.
+ for (const auto& stat : comp_stats) {
+ int sum = 0;
+ for (int i = 0; i < num_of_reasons; i++) {
+ counts[i] += stat.counts[i];
+ sum += stat.counts[i];
+ }
+ ASSERT_EQ(sum, stat.count);
+ }
+ // Verify InternalStats bookkeeping matches that of
+ // CompactionStatsCollector, assuming that all compactions complete.
+ for (int i = 0; i < num_of_reasons; i++) {
+ ASSERT_EQ(collector.NumberOfCompactions(static_cast<CompactionReason>(i)),
+ counts[i]);
+ }
+#endif /* NDEBUG */
+ }
+};
+
+class DBCompactionTestWithParam
+ : public DBTestBase,
+ public testing::WithParamInterface<std::tuple<uint32_t, bool>> {
+ public:
+ DBCompactionTestWithParam()
+ : DBTestBase("db_compaction_test", /*env_do_fsync=*/true) {
+ max_subcompactions_ = std::get<0>(GetParam());
+ exclusive_manual_compaction_ = std::get<1>(GetParam());
+ }
+
+ // Required if inheriting from testing::WithParamInterface<>
+ static void SetUpTestCase() {}
+ static void TearDownTestCase() {}
+
+ uint32_t max_subcompactions_;
+ bool exclusive_manual_compaction_;
+};
+
+class DBCompactionTestWithBottommostParam
+ : public DBTestBase,
+ public testing::WithParamInterface<BottommostLevelCompaction> {
+ public:
+ DBCompactionTestWithBottommostParam()
+ : DBTestBase("db_compaction_test", /*env_do_fsync=*/true) {
+ bottommost_level_compaction_ = GetParam();
+ }
+
+ BottommostLevelCompaction bottommost_level_compaction_;
+};
+
+class DBCompactionDirectIOTest : public DBCompactionTest,
+ public ::testing::WithParamInterface<bool> {
+ public:
+ DBCompactionDirectIOTest() : DBCompactionTest() {}
+};
+
+// Param = true : target level is non-empty
+// Param = false: level between target level and source level
+// is not empty.
+class ChangeLevelConflictsWithAuto
+ : public DBCompactionTest,
+ public ::testing::WithParamInterface<bool> {
+ public:
+ ChangeLevelConflictsWithAuto() : DBCompactionTest() {}
+};
+
+// Param = true: grab the compaction pressure token (enable
+// parallel compactions)
+// Param = false: do not grab the token (no parallel compactions)
+class RoundRobinSubcompactionsAgainstPressureToken
+ : public DBCompactionTest,
+ public ::testing::WithParamInterface<bool> {
+ public:
+ RoundRobinSubcompactionsAgainstPressureToken() {
+ grab_pressure_token_ = GetParam();
+ }
+ bool grab_pressure_token_;
+};
+
+class RoundRobinSubcompactionsAgainstResources
+ : public DBCompactionTest,
+ public ::testing::WithParamInterface<std::tuple<int, int>> {
+ public:
+ RoundRobinSubcompactionsAgainstResources() {
+ total_low_pri_threads_ = std::get<0>(GetParam());
+ max_compaction_limits_ = std::get<1>(GetParam());
+ }
+ int total_low_pri_threads_;
+ int max_compaction_limits_;
+};
+
+namespace {
+class FlushedFileCollector : public EventListener {
+ public:
+ FlushedFileCollector() {}
+ ~FlushedFileCollector() override {}
+
+ void OnFlushCompleted(DB* /*db*/, const FlushJobInfo& info) override {
+ std::lock_guard<std::mutex> lock(mutex_);
+ flushed_files_.push_back(info.file_path);
+ }
+
+ std::vector<std::string> GetFlushedFiles() {
+ std::lock_guard<std::mutex> lock(mutex_);
+ std::vector<std::string> result;
+ for (auto fname : flushed_files_) {
+ result.push_back(fname);
+ }
+ return result;
+ }
+
+ void ClearFlushedFiles() { flushed_files_.clear(); }
+
+ private:
+ std::vector<std::string> flushed_files_;
+ std::mutex mutex_;
+};
+
+class SstStatsCollector : public EventListener {
+ public:
+ SstStatsCollector() : num_ssts_creation_started_(0) {}
+
+ void OnTableFileCreationStarted(
+ const TableFileCreationBriefInfo& /* info */) override {
+ ++num_ssts_creation_started_;
+ }
+
+ int num_ssts_creation_started() { return num_ssts_creation_started_; }
+
+ private:
+ std::atomic<int> num_ssts_creation_started_;
+};
+
+static const int kCDTValueSize = 1000;
+static const int kCDTKeysPerBuffer = 4;
+static const int kCDTNumLevels = 8;
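+// Returns options tuned so that deletions quickly trigger compactions: a tiny
+// write buffer (kCDTKeysPerBuffer keys per memtable), a level-0 trigger of a
+// single file, and small per-level size targets.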
+Options DeletionTriggerOptions(Options options) {
+ options.compression = kNoCompression;
+ options.write_buffer_size = kCDTKeysPerBuffer * (kCDTValueSize + 24);
+ options.min_write_buffer_number_to_merge = 1;
+ options.max_write_buffer_size_to_maintain = 0;
+ options.num_levels = kCDTNumLevels;
+ options.level0_file_num_compaction_trigger = 1;
+ options.target_file_size_base = options.write_buffer_size * 2;
+ options.target_file_size_multiplier = 2;
+ options.max_bytes_for_level_base =
+ options.target_file_size_base * options.target_file_size_multiplier;
+ options.max_bytes_for_level_multiplier = 2;
+ options.disable_auto_compactions = false;
+ options.compaction_options_universal.max_size_amplification_percent = 100;
+ return options;
+}
+
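+// Returns true iff the key ranges of files `a` and `b` overlap. For example,
+// ranges ["a", "c"] and ["b", "d"] overlap, while ["a", "b"] and ["c", "d"]
+// do not.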
+bool HaveOverlappingKeyRanges(const Comparator* c, const SstFileMetaData& a,
+ const SstFileMetaData& b) {
+ if (c->CompareWithoutTimestamp(a.smallestkey, b.smallestkey) >= 0) {
+ if (c->CompareWithoutTimestamp(a.smallestkey, b.largestkey) <= 0) {
+ // b.smallestkey <= a.smallestkey <= b.largestkey
+ return true;
+ }
+ } else if (c->CompareWithoutTimestamp(a.largestkey, b.smallestkey) >= 0) {
+ // a.smallestkey < b.smallestkey <= a.largestkey
+ return true;
+ }
+ if (c->CompareWithoutTimestamp(a.largestkey, b.largestkey) <= 0) {
+ if (c->CompareWithoutTimestamp(a.largestkey, b.smallestkey) >= 0) {
+ // b.smallestkey <= a.largestkey <= b.largestkey
+ return true;
+ }
+ } else if (c->CompareWithoutTimestamp(a.smallestkey, b.largestkey) <= 0) {
+ // a.smallestkey <= b.largestkey < a.largestkey
+ return true;
+ }
+ return false;
+}
+
+// Identifies all files between level "min_level" and "max_level"
+// that have an overlapping key range with "input_file_meta".
+void GetOverlappingFileNumbersForLevelCompaction(
+ const ColumnFamilyMetaData& cf_meta, const Comparator* comparator,
+ int min_level, int max_level, const SstFileMetaData* input_file_meta,
+ std::set<std::string>* overlapping_file_names) {
+ std::set<const SstFileMetaData*> overlapping_files;
+ overlapping_files.insert(input_file_meta);
+ for (int m = min_level; m <= max_level; ++m) {
+ for (auto& file : cf_meta.levels[m].files) {
+ for (auto* included_file : overlapping_files) {
+ if (HaveOverlappingKeyRanges(comparator, *included_file, file)) {
+ overlapping_files.insert(&file);
+ overlapping_file_names->insert(file.name);
+ break;
+ }
+ }
+ }
+ }
+}
+
+void VerifyCompactionResult(
+ const ColumnFamilyMetaData& cf_meta,
+ const std::set<std::string>& overlapping_file_numbers) {
+#ifndef NDEBUG
+ for (auto& level : cf_meta.levels) {
+ for (auto& file : level.files) {
+ assert(overlapping_file_numbers.find(file.name) ==
+ overlapping_file_numbers.end());
+ }
+ }
+#endif
+}
+
+const SstFileMetaData* PickFileRandomly(const ColumnFamilyMetaData& cf_meta,
+ Random* rand, int* level = nullptr) {
+ auto file_id = rand->Uniform(static_cast<int>(cf_meta.file_count)) + 1;
+ for (auto& level_meta : cf_meta.levels) {
+ if (file_id <= level_meta.files.size()) {
+ if (level != nullptr) {
+ *level = level_meta.level;
+ }
+ auto result = rand->Uniform(file_id);
+ return &(level_meta.files[result]);
+ }
+ file_id -= static_cast<uint32_t>(level_meta.files.size());
+ }
+ assert(false);
+ return nullptr;
+}
+} // anonymous namespace
+
+#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+// All the TEST_P tests run once with subcompactions disabled (i.e.
+// options.max_subcompactions = 1) and once with them enabled.
+TEST_P(DBCompactionTestWithParam, CompactionDeletionTrigger) {
+ for (int tid = 0; tid < 3; ++tid) {
+ uint64_t db_size[2];
+ Options options = DeletionTriggerOptions(CurrentOptions());
+ options.max_subcompactions = max_subcompactions_;
+
+ if (tid == 1) {
+      // the following only disables the stats update in DB::Open()
+      // and should not affect the result of this test.
+ options.skip_stats_update_on_db_open = true;
+ } else if (tid == 2) {
+ // third pass with universal compaction
+ options.compaction_style = kCompactionStyleUniversal;
+ options.num_levels = 1;
+ }
+
+ DestroyAndReopen(options);
+ Random rnd(301);
+
+ const int kTestSize = kCDTKeysPerBuffer * 1024;
+ std::vector<std::string> values;
+ for (int k = 0; k < kTestSize; ++k) {
+ values.push_back(rnd.RandomString(kCDTValueSize));
+ ASSERT_OK(Put(Key(k), values[k]));
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_OK(Size(Key(0), Key(kTestSize - 1), &db_size[0]));
+
+ for (int k = 0; k < kTestSize; ++k) {
+ ASSERT_OK(Delete(Key(k)));
+ }
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_OK(Size(Key(0), Key(kTestSize - 1), &db_size[1]));
+
+ if (options.compaction_style == kCompactionStyleUniversal) {
+ // Claim: in universal compaction none of the original data will remain
+ // once compactions settle.
+ //
+ // Proof: The compensated size of the file containing the most tombstones
+ // is enough on its own to trigger size amp compaction. Size amp
+ // compaction is a full compaction, so all tombstones meet the obsolete
+ // keys they cover.
+ ASSERT_EQ(0, db_size[1]);
+ } else {
+ // Claim: in level compaction at most `db_size[0] / 2` of the original
+ // data will remain once compactions settle.
+ //
+ // Proof: Assume the original data is all in the bottom level. If it were
+ // not, it would meet its tombstone sooner. The original data size is
+ // large enough to require fanout to bottom level to be greater than
+ // `max_bytes_for_level_multiplier == 2`. In the level just above,
+ // tombstones must cover less than `db_size[0] / 4` bytes since fanout >=
+ // 2 and file size is compensated by doubling the size of values we expect
+ // are covered (`kDeletionWeightOnCompaction == 2`). The tombstones in
+ // levels above must cover less than `db_size[0] / 8` bytes of original
+ // data, `db_size[0] / 16`, and so on.
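+      // Summing those bounds gives a geometric series:
+      //   db_size[0] / 4 + db_size[0] / 8 + db_size[0] / 16 + ...
+      // which is strictly less than db_size[0] / 2, hence the assertion
+      // below.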
+ ASSERT_GT(db_size[0] / 2, db_size[1]);
+ }
+ }
+}
+#endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+
+TEST_F(DBCompactionTest, SkipStatsUpdateTest) {
+  // This test verifies that UpdateAccumulatedStats is not called when
+  // options.skip_stats_update_on_db_open = true.
+  // The test will need to be updated if the internal behavior changes.
+
+ Options options = DeletionTriggerOptions(CurrentOptions());
+ options.disable_auto_compactions = true;
+ options.env = env_;
+ DestroyAndReopen(options);
+ Random rnd(301);
+
+ const int kTestSize = kCDTKeysPerBuffer * 512;
+ std::vector<std::string> values;
+ for (int k = 0; k < kTestSize; ++k) {
+ values.push_back(rnd.RandomString(kCDTValueSize));
+ ASSERT_OK(Put(Key(k), values[k]));
+ }
+
+ ASSERT_OK(Flush());
+
+ Close();
+
+ int update_acc_stats_called = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "VersionStorageInfo::UpdateAccumulatedStats",
+ [&](void* /* arg */) { ++update_acc_stats_called; });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ // Reopen the DB with stats-update disabled
+ options.skip_stats_update_on_db_open = true;
+ options.max_open_files = 20;
+ Reopen(options);
+
+ ASSERT_EQ(update_acc_stats_called, 0);
+
+ // Repeat the reopen process, but this time we enable
+ // stats-update.
+ options.skip_stats_update_on_db_open = false;
+ Reopen(options);
+
+ ASSERT_GT(update_acc_stats_called, 0);
+
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBCompactionTest, TestTableReaderForCompaction) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.max_open_files = 20;
+ options.level0_file_num_compaction_trigger = 3;
+ // Avoid many shards with small max_open_files, where as little as
+ // two table insertions could lead to an LRU eviction, depending on
+ // hash values.
+ options.table_cache_numshardbits = 2;
+ DestroyAndReopen(options);
+ Random rnd(301);
+
+ int num_table_cache_lookup = 0;
+ int num_new_table_reader = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "TableCache::FindTable:0", [&](void* arg) {
+ assert(arg != nullptr);
+ bool no_io = *(reinterpret_cast<bool*>(arg));
+ if (!no_io) {
+ // filter out cases for table properties queries.
+ num_table_cache_lookup++;
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "TableCache::GetTableReader:0",
+ [&](void* /*arg*/) { num_new_table_reader++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ for (int k = 0; k < options.level0_file_num_compaction_trigger; ++k) {
+ ASSERT_OK(Put(Key(k), Key(k)));
+ ASSERT_OK(Put(Key(10 - k), "bar"));
+ if (k < options.level0_file_num_compaction_trigger - 1) {
+ num_table_cache_lookup = 0;
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+      // Preloading the iterator issues one table cache lookup and creates
+      // a new table reader, if not preloaded.
+ int old_num_table_cache_lookup = num_table_cache_lookup;
+ ASSERT_GE(num_table_cache_lookup, 1);
+ ASSERT_EQ(num_new_table_reader, 1);
+
+ num_table_cache_lookup = 0;
+ num_new_table_reader = 0;
+ ASSERT_EQ(Key(k), Get(Key(k)));
+      // Get() finds the reader in the table cache; no new table reader is
+      // created.
+ ASSERT_EQ(old_num_table_cache_lookup + num_table_cache_lookup, 2);
+ ASSERT_EQ(num_new_table_reader, 0);
+ }
+ }
+
+ num_table_cache_lookup = 0;
+ num_new_table_reader = 0;
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ // Preloading iterator issues one table cache lookup and creates
+ // a new table reader. One file is created for flush and one for compaction.
+ // Compaction inputs make no table cache look-up for data/range deletion
+ // iterators
+ // May preload table cache too.
+ ASSERT_GE(num_table_cache_lookup, 2);
+ int old_num_table_cache_lookup2 = num_table_cache_lookup;
+
+  // New table readers are created for:
+  // (1) verifying flush results, and
+  // (2) verifying compaction results.
+  // (3) No new TableReaders are created for compaction inputs.
+ ASSERT_EQ(num_new_table_reader, 2);
+
+ num_table_cache_lookup = 0;
+ num_new_table_reader = 0;
+ ASSERT_EQ(Key(1), Get(Key(1)));
+ ASSERT_EQ(num_table_cache_lookup + old_num_table_cache_lookup2, 5);
+ ASSERT_EQ(num_new_table_reader, 0);
+
+ num_table_cache_lookup = 0;
+ num_new_table_reader = 0;
+ CompactRangeOptions cro;
+ cro.change_level = true;
+ cro.target_level = 2;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForceOptimized;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  // Only verifying compaction outputs issues one table cache lookup
+  // (for both the data block and the range deletion block).
+  // It may preload the table cache too.
+ ASSERT_GE(num_table_cache_lookup, 1);
+ old_num_table_cache_lookup2 = num_table_cache_lookup;
+ // One for verifying compaction results.
+ // No new iterator created for compaction.
+ ASSERT_EQ(num_new_table_reader, 1);
+
+ num_table_cache_lookup = 0;
+ num_new_table_reader = 0;
+ ASSERT_EQ(Key(1), Get(Key(1)));
+ ASSERT_EQ(num_table_cache_lookup + old_num_table_cache_lookup2, 3);
+ ASSERT_EQ(num_new_table_reader, 0);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_P(DBCompactionTestWithParam, CompactionDeletionTriggerReopen) {
+ for (int tid = 0; tid < 2; ++tid) {
+ uint64_t db_size[3];
+ Options options = DeletionTriggerOptions(CurrentOptions());
+ options.max_subcompactions = max_subcompactions_;
+
+ if (tid == 1) {
+ // second pass with universal compaction
+ options.compaction_style = kCompactionStyleUniversal;
+ options.num_levels = 1;
+ }
+
+ DestroyAndReopen(options);
+ Random rnd(301);
+
+ // round 1 --- insert key/value pairs.
+ const int kTestSize = kCDTKeysPerBuffer * 512;
+ std::vector<std::string> values;
+ for (int k = 0; k < kTestSize; ++k) {
+ values.push_back(rnd.RandomString(kCDTValueSize));
+ ASSERT_OK(Put(Key(k), values[k]));
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_OK(Size(Key(0), Key(kTestSize - 1), &db_size[0]));
+ Close();
+
+ // round 2 --- disable auto-compactions and issue deletions.
+ options.create_if_missing = false;
+ options.disable_auto_compactions = true;
+ Reopen(options);
+
+ for (int k = 0; k < kTestSize; ++k) {
+ ASSERT_OK(Delete(Key(k)));
+ }
+ ASSERT_OK(Size(Key(0), Key(kTestSize - 1), &db_size[1]));
+ Close();
+ // as auto_compaction is off, we shouldn't see any reduction in db size.
+ ASSERT_LE(db_size[0], db_size[1]);
+
+ // round 3 --- reopen db with auto_compaction on and see if
+    // deletion compensation still works.
+ options.disable_auto_compactions = false;
+ Reopen(options);
+ // insert relatively small amount of data to trigger auto compaction.
+ for (int k = 0; k < kTestSize / 10; ++k) {
+ ASSERT_OK(Put(Key(k), values[k]));
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_OK(Size(Key(0), Key(kTestSize - 1), &db_size[2]));
+ // this time we're expecting significant drop in size.
+ //
+ // See "CompactionDeletionTrigger" test for proof that at most
+ // `db_size[0] / 2` of the original data remains. In addition to that, this
+ // test inserts `db_size[0] / 10` to push the tombstones into SST files and
+ // then through automatic compactions. So in total `3 * db_size[0] / 5` of
+ // the original data may remain.
+ ASSERT_GT(3 * db_size[0] / 5, db_size[2]);
+ }
+}
+
+TEST_F(DBCompactionTest, CompactRangeBottomPri) {
+ ASSERT_OK(Put(Key(50), ""));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put(Key(100), ""));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put(Key(200), ""));
+ ASSERT_OK(Flush());
+
+ {
+ CompactRangeOptions cro;
+ cro.change_level = true;
+ cro.target_level = 2;
+ ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr));
+ }
+ ASSERT_EQ("0,0,3", FilesPerLevel(0));
+
+ ASSERT_OK(Put(Key(1), ""));
+ ASSERT_OK(Put(Key(199), ""));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put(Key(2), ""));
+ ASSERT_OK(Put(Key(199), ""));
+ ASSERT_OK(Flush());
+ ASSERT_EQ("2,0,3", FilesPerLevel(0));
+
+  // Now we have 2 L0 files and 3 L2 files, and a manual compaction will
+  // be triggered.
+  // Two compaction jobs will run: one compacts the 2 L0 files in the low-pri
+  // pool and one compacts into L2 in the bottom-pri pool.
+ int low_pri_count = 0;
+ int bottom_pri_count = 0;
+ SyncPoint::GetInstance()->SetCallBack(
+ "ThreadPoolImpl::Impl::BGThread:BeforeRun", [&](void* arg) {
+ Env::Priority* pri = reinterpret_cast<Env::Priority*>(arg);
+ // First time is low pri pool in the test case.
+ if (low_pri_count == 0 && bottom_pri_count == 0) {
+ ASSERT_EQ(Env::Priority::LOW, *pri);
+ }
+ if (*pri == Env::Priority::LOW) {
+ low_pri_count++;
+ } else {
+ bottom_pri_count++;
+ }
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ env_->SetBackgroundThreads(1, Env::Priority::BOTTOM);
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ(1, low_pri_count);
+ ASSERT_EQ(1, bottom_pri_count);
+ ASSERT_EQ("0,0,2", FilesPerLevel(0));
+
+  // Recompacting the bottommost level uses the bottom pool.
+ CompactRangeOptions cro;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+ ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr));
+ ASSERT_EQ(1, low_pri_count);
+ ASSERT_EQ(2, bottom_pri_count);
+
+ env_->SetBackgroundThreads(0, Env::Priority::BOTTOM);
+ ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr));
+ // Low pri pool is used if bottom pool has size 0.
+ ASSERT_EQ(2, low_pri_count);
+ ASSERT_EQ(2, bottom_pri_count);
+
+ SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBCompactionTest, DisableStatsUpdateReopen) {
+ uint64_t db_size[3];
+ for (int test = 0; test < 2; ++test) {
+ Options options = DeletionTriggerOptions(CurrentOptions());
+ options.skip_stats_update_on_db_open = (test == 0);
+
+ env_->random_read_counter_.Reset();
+ DestroyAndReopen(options);
+ Random rnd(301);
+
+ // round 1 --- insert key/value pairs.
+ const int kTestSize = kCDTKeysPerBuffer * 512;
+ std::vector<std::string> values;
+ for (int k = 0; k < kTestSize; ++k) {
+ values.push_back(rnd.RandomString(kCDTValueSize));
+ ASSERT_OK(Put(Key(k), values[k]));
+ }
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ // L1 and L2 can fit deletions iff size compensation does not take effect,
+ // i.e., when `skip_stats_update_on_db_open == true`. Move any remaining
+ // files at or above L2 down to L3 to ensure obsolete data does not
+ // accidentally meet its tombstone above L3. This makes the final size more
+ // deterministic and easy to see whether size compensation for deletions
+ // took effect.
+ MoveFilesToLevel(3 /* level */);
+ ASSERT_OK(Size(Key(0), Key(kTestSize - 1), &db_size[0]));
+ Close();
+
+ // round 2 --- disable auto-compactions and issue deletions.
+ options.create_if_missing = false;
+ options.disable_auto_compactions = true;
+
+ env_->random_read_counter_.Reset();
+ Reopen(options);
+
+ for (int k = 0; k < kTestSize; ++k) {
+ ASSERT_OK(Delete(Key(k)));
+ }
+ ASSERT_OK(Size(Key(0), Key(kTestSize - 1), &db_size[1]));
+ Close();
+ // as auto_compaction is off, we shouldn't see any reduction in db size.
+ ASSERT_LE(db_size[0], db_size[1]);
+
+ // round 3 --- reopen db with auto_compaction on and see if
+    // deletion compensation still works.
+ options.disable_auto_compactions = false;
+ Reopen(options);
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_OK(Size(Key(0), Key(kTestSize - 1), &db_size[2]));
+
+ if (options.skip_stats_update_on_db_open) {
+      // If updating stats on DB::Open is disabled, we don't expect
+      // the deletion entries to take effect.
+ //
+ // The deletions are small enough to fit in L1 and L2, and obsolete keys
+ // were moved to L3+, so none of the original data should have been
+ // dropped.
+ ASSERT_LE(db_size[0], db_size[2]);
+ } else {
+ // Otherwise, we should see a significant drop in db size.
+ //
+ // See "CompactionDeletionTrigger" test for proof that at most
+ // `db_size[0] / 2` of the original data remains.
+ ASSERT_GT(db_size[0] / 2, db_size[2]);
+ }
+ }
+}
+
+TEST_P(DBCompactionTestWithParam, CompactionTrigger) {
+ const int kNumKeysPerFile = 100;
+
+ Options options = CurrentOptions();
+ options.write_buffer_size = 110 << 10; // 110KB
+ options.arena_block_size = 4 << 10;
+ options.num_levels = 3;
+ options.level0_file_num_compaction_trigger = 3;
+ options.max_subcompactions = max_subcompactions_;
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(kNumKeysPerFile));
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ Random rnd(301);
+
+ for (int num = 0; num < options.level0_file_num_compaction_trigger - 1;
+ num++) {
+ std::vector<std::string> values;
+ // Write 100KB (100 values, each 1K)
+ for (int i = 0; i < kNumKeysPerFile; i++) {
+ values.push_back(rnd.RandomString(990));
+ ASSERT_OK(Put(1, Key(i), values[i]));
+ }
+ // put extra key to trigger flush
+ ASSERT_OK(Put(1, "", ""));
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1]));
+ ASSERT_EQ(NumTableFilesAtLevel(0, 1), num + 1);
+ }
+
+ // generate one more file in level-0, and should trigger level-0 compaction
+ std::vector<std::string> values;
+ for (int i = 0; i < kNumKeysPerFile; i++) {
+ values.push_back(rnd.RandomString(990));
+ ASSERT_OK(Put(1, Key(i), values[i]));
+ }
+ // put extra key to trigger flush
+ ASSERT_OK(Put(1, "", ""));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+ ASSERT_EQ(NumTableFilesAtLevel(1, 1), 1);
+}
+
+TEST_F(DBCompactionTest, BGCompactionsAllowed) {
+  // Create several column families. Trigger compaction in all of them
+  // and verify that the number of compactions scheduled stays within the
+  // allowed limit.
+ const int kNumKeysPerFile = 100;
+
+ Options options = CurrentOptions();
+ options.write_buffer_size = 110 << 10; // 110KB
+ options.arena_block_size = 4 << 10;
+ options.num_levels = 3;
+ // Should speed up compaction when there are 4 files.
+ options.level0_file_num_compaction_trigger = 2;
+ options.level0_slowdown_writes_trigger = 20;
+ options.soft_pending_compaction_bytes_limit = 1 << 30; // Infinitely large
+ options.max_background_compactions = 3;
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(kNumKeysPerFile));
+
+ // Block all threads in thread pool.
+ const size_t kTotalTasks = 4;
+ env_->SetBackgroundThreads(4, Env::LOW);
+ test::SleepingBackgroundTask sleeping_tasks[kTotalTasks];
+ for (size_t i = 0; i < kTotalTasks; i++) {
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
+ &sleeping_tasks[i], Env::Priority::LOW);
+ sleeping_tasks[i].WaitUntilSleeping();
+ }
+
+ CreateAndReopenWithCF({"one", "two", "three"}, options);
+
+ Random rnd(301);
+ for (int cf = 0; cf < 4; cf++) {
+ for (int num = 0; num < options.level0_file_num_compaction_trigger; num++) {
+ for (int i = 0; i < kNumKeysPerFile; i++) {
+ ASSERT_OK(Put(cf, Key(i), ""));
+ }
+ // put extra key to trigger flush
+ ASSERT_OK(Put(cf, "", ""));
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[cf]));
+ ASSERT_EQ(NumTableFilesAtLevel(0, cf), num + 1);
+ }
+ }
+
+  // Now all column families qualify for compaction but only one should be
+  // scheduled, because no column family hits the speed-up condition.
+ ASSERT_EQ(1u, env_->GetThreadPoolQueueLen(Env::Priority::LOW));
+
+  // Create two more files for one column family, which triggers the speed-up
+  // condition; three compactions will be scheduled.
+ for (int num = 0; num < options.level0_file_num_compaction_trigger; num++) {
+ for (int i = 0; i < kNumKeysPerFile; i++) {
+ ASSERT_OK(Put(2, Key(i), ""));
+ }
+ // put extra key to trigger flush
+ ASSERT_OK(Put(2, "", ""));
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[2]));
+ ASSERT_EQ(options.level0_file_num_compaction_trigger + num + 1,
+ NumTableFilesAtLevel(0, 2));
+ }
+ ASSERT_EQ(3U, env_->GetThreadPoolQueueLen(Env::Priority::LOW));
+
+ // Unblock all threads to unblock all compactions.
+ for (size_t i = 0; i < kTotalTasks; i++) {
+ sleeping_tasks[i].WakeUp();
+ sleeping_tasks[i].WaitUntilDone();
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ // Verify number of compactions allowed will come back to 1.
+
+ for (size_t i = 0; i < kTotalTasks; i++) {
+ sleeping_tasks[i].Reset();
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
+ &sleeping_tasks[i], Env::Priority::LOW);
+ sleeping_tasks[i].WaitUntilSleeping();
+ }
+ for (int cf = 0; cf < 4; cf++) {
+ for (int num = 0; num < options.level0_file_num_compaction_trigger; num++) {
+ for (int i = 0; i < kNumKeysPerFile; i++) {
+ ASSERT_OK(Put(cf, Key(i), ""));
+ }
+ // put extra key to trigger flush
+ ASSERT_OK(Put(cf, "", ""));
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[cf]));
+ ASSERT_EQ(NumTableFilesAtLevel(0, cf), num + 1);
+ }
+ }
+
+  // Now all column families qualify for compaction but only one should be
+  // scheduled, because no column family hits the speed-up condition.
+ ASSERT_EQ(1U, env_->GetThreadPoolQueueLen(Env::Priority::LOW));
+
+ for (size_t i = 0; i < kTotalTasks; i++) {
+ sleeping_tasks[i].WakeUp();
+ sleeping_tasks[i].WaitUntilDone();
+ }
+}
+
+TEST_P(DBCompactionTestWithParam, CompactionsGenerateMultipleFiles) {
+ Options options = CurrentOptions();
+ options.write_buffer_size = 100000000; // Large write buffer
+ options.max_subcompactions = max_subcompactions_;
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ Random rnd(301);
+
+ // Write 8MB (80 values, each 100K)
+ ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+ std::vector<std::string> values;
+ for (int i = 0; i < 80; i++) {
+ values.push_back(rnd.RandomString(100000));
+ ASSERT_OK(Put(1, Key(i), values[i]));
+ }
+
+ // Reopening moves updates to level-0
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1],
+ true /* disallow trivial move */));
+
+ ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+ ASSERT_GT(NumTableFilesAtLevel(1, 1), 1);
+ for (int i = 0; i < 80; i++) {
+ ASSERT_EQ(Get(1, Key(i)), values[i]);
+ }
+}
+
+TEST_F(DBCompactionTest, MinorCompactionsHappen) {
+ do {
+ Options options = CurrentOptions();
+ options.write_buffer_size = 10000;
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ const int N = 500;
+
+ int starting_num_tables = TotalTableFiles(1);
+ for (int i = 0; i < N; i++) {
+ ASSERT_OK(Put(1, Key(i), Key(i) + std::string(1000, 'v')));
+ }
+ int ending_num_tables = TotalTableFiles(1);
+ ASSERT_GT(ending_num_tables, starting_num_tables);
+
+ for (int i = 0; i < N; i++) {
+ ASSERT_EQ(Key(i) + std::string(1000, 'v'), Get(1, Key(i)));
+ }
+
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+ for (int i = 0; i < N; i++) {
+ ASSERT_EQ(Key(i) + std::string(1000, 'v'), Get(1, Key(i)));
+ }
+ } while (ChangeCompactOptions());
+}
+
+TEST_F(DBCompactionTest, UserKeyCrossFile1) {
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleLevel;
+ options.level0_file_num_compaction_trigger = 3;
+
+ DestroyAndReopen(options);
+
+ // create first file and flush to l0
+ ASSERT_OK(Put("4", "A"));
+ ASSERT_OK(Put("3", "A"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+
+ ASSERT_OK(Put("2", "A"));
+ ASSERT_OK(Delete("3"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_EQ("NOT_FOUND", Get("3"));
+
+ // move both files down to l1
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ("NOT_FOUND", Get("3"));
+
+ for (int i = 0; i < 3; i++) {
+ ASSERT_OK(Put("2", "B"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ ASSERT_EQ("NOT_FOUND", Get("3"));
+}
+
+TEST_F(DBCompactionTest, UserKeyCrossFile2) {
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleLevel;
+ options.level0_file_num_compaction_trigger = 3;
+
+ DestroyAndReopen(options);
+
+ // create first file and flush to l0
+ ASSERT_OK(Put("4", "A"));
+ ASSERT_OK(Put("3", "A"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+
+ ASSERT_OK(Put("2", "A"));
+ ASSERT_OK(SingleDelete("3"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_EQ("NOT_FOUND", Get("3"));
+
+ // move both files down to l1
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ("NOT_FOUND", Get("3"));
+
+ for (int i = 0; i < 3; i++) {
+ ASSERT_OK(Put("2", "B"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ ASSERT_EQ("NOT_FOUND", Get("3"));
+}
+
+TEST_F(DBCompactionTest, CompactionSstPartitioner) {
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleLevel;
+ options.level0_file_num_compaction_trigger = 3;
+ std::shared_ptr<SstPartitionerFactory> factory(
+ NewSstPartitionerFixedPrefixFactory(4));
+ options.sst_partitioner_factory = factory;
+
+ DestroyAndReopen(options);
+
+ // create first file and flush to l0
+ ASSERT_OK(Put("aaaa1", "A"));
+ ASSERT_OK(Put("bbbb1", "B"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+
+ ASSERT_OK(Put("aaaa1", "A2"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+
+ // move both files down to l1
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+ std::vector<LiveFileMetaData> files;
+ dbfull()->GetLiveFilesMetaData(&files);
+ ASSERT_EQ(2, files.size());
+ ASSERT_EQ("A2", Get("aaaa1"));
+ ASSERT_EQ("B", Get("bbbb1"));
+}
+
+TEST_F(DBCompactionTest, CompactionSstPartitionerNonTrivial) {
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleLevel;
+ options.level0_file_num_compaction_trigger = 1;
+ std::shared_ptr<SstPartitionerFactory> factory(
+ NewSstPartitionerFixedPrefixFactory(4));
+ options.sst_partitioner_factory = factory;
+
+ DestroyAndReopen(options);
+
+ // create first file and flush to l0
+ ASSERT_OK(Put("aaaa1", "A"));
+ ASSERT_OK(Put("bbbb1", "B"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+
+ std::vector<LiveFileMetaData> files;
+ dbfull()->GetLiveFilesMetaData(&files);
+ ASSERT_EQ(2, files.size());
+ ASSERT_EQ("A", Get("aaaa1"));
+ ASSERT_EQ("B", Get("bbbb1"));
+}
+
+TEST_F(DBCompactionTest, ZeroSeqIdCompaction) {
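+  // Compactions can rewrite a key with sequence number 0 once no snapshot
+  // needs its older versions (see the snapshot release below); this exercises
+  // that path with the duplicate user key "3" spanning file boundaries.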
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleLevel;
+ options.level0_file_num_compaction_trigger = 3;
+
+ FlushedFileCollector* collector = new FlushedFileCollector();
+ options.listeners.emplace_back(collector);
+
+ // compaction options
+ CompactionOptions compact_opt;
+ compact_opt.compression = kNoCompression;
+ compact_opt.output_file_size_limit = 4096;
+ const size_t key_len =
+ static_cast<size_t>(compact_opt.output_file_size_limit) / 5;
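+  // Each value is ~1/5 of the 4KB output_file_size_limit, so every
+  // compaction output file holds only a few keys.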
+
+ DestroyAndReopen(options);
+
+ std::vector<const Snapshot*> snaps;
+
+ // create first file and flush to l0
+ for (auto& key : {"1", "2", "3", "3", "3", "3"}) {
+ ASSERT_OK(Put(key, std::string(key_len, 'A')));
+ snaps.push_back(dbfull()->GetSnapshot());
+ }
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+
+ // create second file and flush to l0
+ for (auto& key : {"3", "4", "5", "6", "7", "8"}) {
+ ASSERT_OK(Put(key, std::string(key_len, 'A')));
+ snaps.push_back(dbfull()->GetSnapshot());
+ }
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+
+ // move both files down to l1
+ ASSERT_OK(
+ dbfull()->CompactFiles(compact_opt, collector->GetFlushedFiles(), 1));
+
+ // release snap so that first instance of key(3) can have seqId=0
+ for (auto snap : snaps) {
+ dbfull()->ReleaseSnapshot(snap);
+ }
+
+  // create 3 files in l0 to trigger compaction
+ for (int i = 0; i < options.level0_file_num_compaction_trigger; i++) {
+ ASSERT_OK(Put("2", std::string(1, 'A')));
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ }
+
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_OK(Put("", ""));
+}
+
+TEST_F(DBCompactionTest, ManualCompactionUnknownOutputSize) {
+ // github issue #2249
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleLevel;
+ options.level0_file_num_compaction_trigger = 3;
+ DestroyAndReopen(options);
+
+ // create two files in l1 that we can compact
+ for (int i = 0; i < 2; ++i) {
+ for (int j = 0; j < options.level0_file_num_compaction_trigger; j++) {
+ ASSERT_OK(Put(std::to_string(2 * i), std::string(1, 'A')));
+ ASSERT_OK(Put(std::to_string(2 * i + 1), std::string(1, 'A')));
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+ ASSERT_OK(
+ dbfull()->SetOptions({{"level0_file_num_compaction_trigger", "2"}}));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(NumTableFilesAtLevel(0, 0), 0);
+ ASSERT_EQ(NumTableFilesAtLevel(1, 0), 2);
+ ASSERT_OK(
+ dbfull()->SetOptions({{"level0_file_num_compaction_trigger", "3"}}));
+
+ ColumnFamilyMetaData cf_meta;
+ dbfull()->GetColumnFamilyMetaData(dbfull()->DefaultColumnFamily(), &cf_meta);
+ ASSERT_EQ(2, cf_meta.levels[1].files.size());
+ std::vector<std::string> input_filenames;
+ for (const auto& sst_file : cf_meta.levels[1].files) {
+ input_filenames.push_back(sst_file.name);
+ }
+
+ // note CompactionOptions::output_file_size_limit is unset.
+ CompactionOptions compact_opt;
+ compact_opt.compression = kNoCompression;
+ ASSERT_OK(dbfull()->CompactFiles(compact_opt, input_filenames, 1));
+}
+
+// Check that writes done during a memtable compaction are recovered
+// if the database is shut down during the memtable compaction.
+TEST_F(DBCompactionTest, RecoverDuringMemtableCompaction) {
+ do {
+ Options options = CurrentOptions();
+ options.env = env_;
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // Trigger a long memtable compaction and reopen the database during it
+ ASSERT_OK(Put(1, "foo", "v1")); // Goes to 1st log file
+ ASSERT_OK(Put(1, "big1", std::string(10000000, 'x'))); // Fills memtable
+ ASSERT_OK(Put(1, "big2", std::string(1000, 'y'))); // Triggers compaction
+ ASSERT_OK(Put(1, "bar", "v2")); // Goes to new log file
+
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ ASSERT_EQ("v1", Get(1, "foo"));
+ ASSERT_EQ("v2", Get(1, "bar"));
+ ASSERT_EQ(std::string(10000000, 'x'), Get(1, "big1"));
+ ASSERT_EQ(std::string(1000, 'y'), Get(1, "big2"));
+ } while (ChangeOptions());
+}
+
+TEST_P(DBCompactionTestWithParam, TrivialMoveOneFile) {
+ int32_t trivial_move = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:TrivialMove",
+ [&](void* /*arg*/) { trivial_move++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Options options = CurrentOptions();
+ options.write_buffer_size = 100000000;
+ options.max_subcompactions = max_subcompactions_;
+ DestroyAndReopen(options);
+
+ int32_t num_keys = 80;
+ int32_t value_size = 100 * 1024; // 100 KB
+
+ Random rnd(301);
+ std::vector<std::string> values;
+ for (int i = 0; i < num_keys; i++) {
+ values.push_back(rnd.RandomString(value_size));
+ ASSERT_OK(Put(Key(i), values[i]));
+ }
+
+ // Reopening moves updates to L0
+ Reopen(options);
+ ASSERT_EQ(NumTableFilesAtLevel(0, 0), 1); // 1 file in L0
+ ASSERT_EQ(NumTableFilesAtLevel(1, 0), 0); // 0 files in L1
+
+ std::vector<LiveFileMetaData> metadata;
+ db_->GetLiveFilesMetaData(&metadata);
+ ASSERT_EQ(metadata.size(), 1U);
+ LiveFileMetaData level0_file = metadata[0]; // L0 file meta
+
+ CompactRangeOptions cro;
+ cro.exclusive_manual_compaction = exclusive_manual_compaction_;
+
+ // Compaction will initiate a trivial move from L0 to L1
+ ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr));
+
+  // File moved from L0 to L1
+ ASSERT_EQ(NumTableFilesAtLevel(0, 0), 0); // 0 files in L0
+ ASSERT_EQ(NumTableFilesAtLevel(1, 0), 1); // 1 file in L1
+
+ metadata.clear();
+ db_->GetLiveFilesMetaData(&metadata);
+ ASSERT_EQ(metadata.size(), 1U);
+ ASSERT_EQ(metadata[0].name /* level1_file.name */, level0_file.name);
+ ASSERT_EQ(metadata[0].size /* level1_file.size */, level0_file.size);
+
+ for (int i = 0; i < num_keys; i++) {
+ ASSERT_EQ(Get(Key(i)), values[i]);
+ }
+
+ ASSERT_EQ(trivial_move, 1);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_P(DBCompactionTestWithParam, TrivialMoveNonOverlappingFiles) {
+ int32_t trivial_move = 0;
+ int32_t non_trivial_move = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:TrivialMove",
+ [&](void* /*arg*/) { trivial_move++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:NonTrivial",
+ [&](void* /*arg*/) { non_trivial_move++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.write_buffer_size = 10 * 1024 * 1024;
+ options.max_subcompactions = max_subcompactions_;
+
+ DestroyAndReopen(options);
+  // non-overlapping ranges
+ std::vector<std::pair<int32_t, int32_t>> ranges = {
+ {100, 199}, {300, 399}, {0, 99}, {200, 299},
+ {600, 699}, {400, 499}, {500, 550}, {551, 599},
+ };
+ int32_t value_size = 10 * 1024; // 10 KB
+
+ Random rnd(301);
+ std::map<int32_t, std::string> values;
+ for (size_t i = 0; i < ranges.size(); i++) {
+ for (int32_t j = ranges[i].first; j <= ranges[i].second; j++) {
+ values[j] = rnd.RandomString(value_size);
+ ASSERT_OK(Put(Key(j), values[j]));
+ }
+ ASSERT_OK(Flush());
+ }
+
+ int32_t level0_files = NumTableFilesAtLevel(0, 0);
+ ASSERT_EQ(level0_files, ranges.size()); // Multiple files in L0
+ ASSERT_EQ(NumTableFilesAtLevel(1, 0), 0); // No files in L1
+
+ CompactRangeOptions cro;
+ cro.exclusive_manual_compaction = exclusive_manual_compaction_;
+
+ // Since data is non-overlapping we expect compaction to initiate
+ // a trivial move
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ // We expect that all the files were trivially moved from L0 to L1
+ ASSERT_EQ(NumTableFilesAtLevel(0, 0), 0);
+ ASSERT_EQ(NumTableFilesAtLevel(1, 0) /* level1_files */, level0_files);
+
+ for (size_t i = 0; i < ranges.size(); i++) {
+ for (int32_t j = ranges[i].first; j <= ranges[i].second; j++) {
+ ASSERT_EQ(Get(Key(j)), values[j]);
+ }
+ }
+
+ ASSERT_EQ(trivial_move, 1);
+ ASSERT_EQ(non_trivial_move, 0);
+
+ trivial_move = 0;
+ non_trivial_move = 0;
+ values.clear();
+ DestroyAndReopen(options);
+ // Same ranges as above but overlapping
+ ranges = {
+ {100, 199},
+ {300, 399},
+ {0, 99},
+ {200, 299},
+ {600, 699},
+ {400, 499},
+      {500, 560},  // this range overlaps with the next one
+ {551, 599},
+ };
+ for (size_t i = 0; i < ranges.size(); i++) {
+ for (int32_t j = ranges[i].first; j <= ranges[i].second; j++) {
+ values[j] = rnd.RandomString(value_size);
+ ASSERT_OK(Put(Key(j), values[j]));
+ }
+ ASSERT_OK(Flush());
+ }
+
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+ for (size_t i = 0; i < ranges.size(); i++) {
+ for (int32_t j = ranges[i].first; j <= ranges[i].second; j++) {
+ ASSERT_EQ(Get(Key(j)), values[j]);
+ }
+ }
+ ASSERT_EQ(trivial_move, 0);
+ ASSERT_EQ(non_trivial_move, 1);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_P(DBCompactionTestWithParam, TrivialMoveTargetLevel) {
+ int32_t trivial_move = 0;
+ int32_t non_trivial_move = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:TrivialMove",
+ [&](void* /*arg*/) { trivial_move++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:NonTrivial",
+ [&](void* /*arg*/) { non_trivial_move++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.write_buffer_size = 10 * 1024 * 1024;
+ options.num_levels = 7;
+ options.max_subcompactions = max_subcompactions_;
+
+ DestroyAndReopen(options);
+ int32_t value_size = 10 * 1024; // 10 KB
+
+ // Add 2 non-overlapping files
+ Random rnd(301);
+ std::map<int32_t, std::string> values;
+
+ // file 1 [0 => 300]
+ for (int32_t i = 0; i <= 300; i++) {
+ values[i] = rnd.RandomString(value_size);
+ ASSERT_OK(Put(Key(i), values[i]));
+ }
+ ASSERT_OK(Flush());
+
+ // file 2 [600 => 700]
+ for (int32_t i = 600; i <= 700; i++) {
+ values[i] = rnd.RandomString(value_size);
+ ASSERT_OK(Put(Key(i), values[i]));
+ }
+ ASSERT_OK(Flush());
+
+ // 2 files in L0
+ ASSERT_EQ("2", FilesPerLevel(0));
+ CompactRangeOptions compact_options;
+ compact_options.change_level = true;
+ compact_options.target_level = 6;
+ compact_options.exclusive_manual_compaction = exclusive_manual_compaction_;
+ ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
+ // 2 files in L6
+ ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel(0));
+
+ ASSERT_EQ(trivial_move, 1);
+ ASSERT_EQ(non_trivial_move, 0);
+
+ for (int32_t i = 0; i <= 300; i++) {
+ ASSERT_EQ(Get(Key(i)), values[i]);
+ }
+ for (int32_t i = 600; i <= 700; i++) {
+ ASSERT_EQ(Get(Key(i)), values[i]);
+ }
+}
+
+TEST_P(DBCompactionTestWithParam, PartialOverlappingL0) {
+ class SubCompactionEventListener : public EventListener {
+ public:
+ void OnSubcompactionCompleted(const SubcompactionJobInfo&) override {
+ sub_compaction_finished_++;
+ }
+ std::atomic<int> sub_compaction_finished_{0};
+ };
+
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.write_buffer_size = 10 * 1024 * 1024;
+ options.max_subcompactions = max_subcompactions_;
+ SubCompactionEventListener* listener = new SubCompactionEventListener();
+ options.listeners.emplace_back(listener);
+
+ DestroyAndReopen(options);
+
+  // For subcompactions to trigger, the output level needs to be non-empty.
+ ASSERT_OK(Put("key", ""));
+ ASSERT_OK(Put("kez", ""));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("key", ""));
+ ASSERT_OK(Put("kez", ""));
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+  // Ranges that overlap only slightly, so the files won't be trivially
+  // moved, but each subcompaction range will contain only a subset of files.
+ std::vector<std::pair<int32_t, int32_t>> ranges = {
+ {100, 199}, {198, 399}, {397, 600}, {598, 800}, {799, 900}, {895, 999},
+ };
+ int32_t value_size = 10 * 1024; // 10 KB
+
+ Random rnd(301);
+ std::map<int32_t, std::string> values;
+ for (size_t i = 0; i < ranges.size(); i++) {
+ for (int32_t j = ranges[i].first; j <= ranges[i].second; j++) {
+ values[j] = rnd.RandomString(value_size);
+ ASSERT_OK(Put(Key(j), values[j]));
+ }
+ ASSERT_OK(Flush());
+ }
+
+ int32_t level0_files = NumTableFilesAtLevel(0, 0);
+ ASSERT_EQ(level0_files, ranges.size()); // Multiple files in L0
+ ASSERT_EQ(NumTableFilesAtLevel(1, 0), 1); // One file in L1
+
+ listener->sub_compaction_finished_ = 0;
+ ASSERT_OK(db_->EnableAutoCompaction({db_->DefaultColumnFamily()}));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ if (max_subcompactions_ > 3) {
+    // RocksDB might not generate the exact number of subcompactions.
+    // Here we only validate that more than two subcompactions happened.
+ ASSERT_GT(listener->sub_compaction_finished_.load(), 2);
+ }
+
+ // We expect that all the files were compacted to L1
+ ASSERT_EQ(NumTableFilesAtLevel(0, 0), 0);
+ ASSERT_GT(NumTableFilesAtLevel(1, 0), 1);
+
+ for (size_t i = 0; i < ranges.size(); i++) {
+ for (int32_t j = ranges[i].first; j <= ranges[i].second; j++) {
+ ASSERT_EQ(Get(Key(j)), values[j]);
+ }
+ }
+}
+
+TEST_P(DBCompactionTestWithParam, ManualCompactionPartial) {
+ int32_t trivial_move = 0;
+ int32_t non_trivial_move = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:TrivialMove",
+ [&](void* /*arg*/) { trivial_move++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:NonTrivial",
+ [&](void* /*arg*/) { non_trivial_move++; });
+ bool first = true;
+ // Purpose of dependencies:
+ // 4 -> 1: ensure the order of two non-trivial compactions
+ // 5 -> 2 and 5 -> 3: ensure we do a check before two non-trivial compactions
+ // are installed
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBCompaction::ManualPartial:4", "DBCompaction::ManualPartial:1"},
+ {"DBCompaction::ManualPartial:5", "DBCompaction::ManualPartial:2"},
+ {"DBCompaction::ManualPartial:5", "DBCompaction::ManualPartial:3"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:NonTrivial:AfterRun", [&](void* /*arg*/) {
+ if (first) {
+ first = false;
+ TEST_SYNC_POINT("DBCompaction::ManualPartial:4");
+ TEST_SYNC_POINT("DBCompaction::ManualPartial:3");
+ } else { // second non-trivial compaction
+ TEST_SYNC_POINT("DBCompaction::ManualPartial:2");
+ }
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Options options = CurrentOptions();
+ options.write_buffer_size = 10 * 1024 * 1024;
+ options.num_levels = 7;
+ options.max_subcompactions = max_subcompactions_;
+ options.level0_file_num_compaction_trigger = 3;
+ options.max_background_compactions = 3;
+ options.target_file_size_base = 1 << 23; // 8 MB
+
+ DestroyAndReopen(options);
+ int32_t value_size = 10 * 1024; // 10 KB
+
+ // Add 2 non-overlapping files
+ Random rnd(301);
+ std::map<int32_t, std::string> values;
+
+ // file 1 [0 => 100]
+ for (int32_t i = 0; i < 100; i++) {
+ values[i] = rnd.RandomString(value_size);
+ ASSERT_OK(Put(Key(i), values[i]));
+ }
+ ASSERT_OK(Flush());
+
+ // file 2 [100 => 300]
+ for (int32_t i = 100; i < 300; i++) {
+ values[i] = rnd.RandomString(value_size);
+ ASSERT_OK(Put(Key(i), values[i]));
+ }
+ ASSERT_OK(Flush());
+
+ // 2 files in L0
+ ASSERT_EQ("2", FilesPerLevel(0));
+ CompactRangeOptions compact_options;
+ compact_options.change_level = true;
+ compact_options.target_level = 6;
+ compact_options.exclusive_manual_compaction = exclusive_manual_compaction_;
+ // Trivial move the two non-overlapping files to level 6
+ ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
+ // 2 files in L6
+ ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel(0));
+
+ ASSERT_EQ(trivial_move, 1);
+ ASSERT_EQ(non_trivial_move, 0);
+
+ // file 3 [ 0 => 200]
+ for (int32_t i = 0; i < 200; i++) {
+ values[i] = rnd.RandomString(value_size);
+ ASSERT_OK(Put(Key(i), values[i]));
+ }
+ ASSERT_OK(Flush());
+
+  // 1 file in L0
+ ASSERT_EQ("1,0,0,0,0,0,2", FilesPerLevel(0));
+ ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr, false));
+ ASSERT_OK(dbfull()->TEST_CompactRange(1, nullptr, nullptr, nullptr, false));
+ ASSERT_OK(dbfull()->TEST_CompactRange(2, nullptr, nullptr, nullptr, false));
+ ASSERT_OK(dbfull()->TEST_CompactRange(3, nullptr, nullptr, nullptr, false));
+ ASSERT_OK(dbfull()->TEST_CompactRange(4, nullptr, nullptr, nullptr, false));
+ // 2 files in L6, 1 file in L5
+ ASSERT_EQ("0,0,0,0,0,1,2", FilesPerLevel(0));
+
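+  // 6 trivial moves total: 1 from the earlier change-level compaction plus 5
+  // from the level-by-level TEST_CompactRange calls above.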
+ ASSERT_EQ(trivial_move, 6);
+ ASSERT_EQ(non_trivial_move, 0);
+
+ ROCKSDB_NAMESPACE::port::Thread threads([&] {
+ compact_options.change_level = false;
+ compact_options.exclusive_manual_compaction = false;
+ std::string begin_string = Key(0);
+ std::string end_string = Key(199);
+ Slice begin(begin_string);
+ Slice end(end_string);
+ // First non-trivial compaction is triggered
+ ASSERT_OK(db_->CompactRange(compact_options, &begin, &end));
+ });
+
+ TEST_SYNC_POINT("DBCompaction::ManualPartial:1");
+  // file 4 [300 => 400]
+ for (int32_t i = 300; i <= 400; i++) {
+ values[i] = rnd.RandomString(value_size);
+ ASSERT_OK(Put(Key(i), values[i]));
+ }
+ ASSERT_OK(Flush());
+
+  // file 5 [400 => 500]
+ for (int32_t i = 400; i <= 500; i++) {
+ values[i] = rnd.RandomString(value_size);
+ ASSERT_OK(Put(Key(i), values[i]));
+ }
+ ASSERT_OK(Flush());
+
+  // file 6 [500 => 600]
+ for (int32_t i = 500; i <= 600; i++) {
+ values[i] = rnd.RandomString(value_size);
+ ASSERT_OK(Put(Key(i), values[i]));
+ }
+ // Second non-trivial compaction is triggered
+ ASSERT_OK(Flush());
+
+ // Before two non-trivial compactions are installed, there are 3 files in L0
+ ASSERT_EQ("3,0,0,0,0,1,2", FilesPerLevel(0));
+ TEST_SYNC_POINT("DBCompaction::ManualPartial:5");
+
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ // After two non-trivial compactions are installed, there is 1 file in L6, and
+ // 1 file in L1
+ ASSERT_EQ("0,1,0,0,0,0,1", FilesPerLevel(0));
+ threads.join();
+
+ for (int32_t i = 0; i < 600; i++) {
+ ASSERT_EQ(Get(Key(i)), values[i]);
+ }
+}
+
+// Disabled because the test is flaky.
+TEST_F(DBCompactionTest, DISABLED_ManualPartialFill) {
+ int32_t trivial_move = 0;
+ int32_t non_trivial_move = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:TrivialMove",
+ [&](void* /*arg*/) { trivial_move++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:NonTrivial",
+ [&](void* /*arg*/) { non_trivial_move++; });
+ bool first = true;
+ bool second = true;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBCompaction::PartialFill:4", "DBCompaction::PartialFill:1"},
+ {"DBCompaction::PartialFill:2", "DBCompaction::PartialFill:3"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:NonTrivial:AfterRun", [&](void* /*arg*/) {
+ if (first) {
+ TEST_SYNC_POINT("DBCompaction::PartialFill:4");
+ first = false;
+ TEST_SYNC_POINT("DBCompaction::PartialFill:3");
+ } else if (second) {
+ }
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Options options = CurrentOptions();
+ options.write_buffer_size = 10 * 1024 * 1024;
+ options.max_bytes_for_level_multiplier = 2;
+ options.num_levels = 4;
+ options.level0_file_num_compaction_trigger = 3;
+ options.max_background_compactions = 3;
+
+ DestroyAndReopen(options);
+ // make sure all background compaction jobs can be scheduled
+ auto stop_token =
+ dbfull()->TEST_write_controler().GetCompactionPressureToken();
+ int32_t value_size = 10 * 1024; // 10 KB
+
+ // Add 2 non-overlapping files
+ Random rnd(301);
+ std::map<int32_t, std::string> values;
+
+ // file 1 [0 => 100]
+ for (int32_t i = 0; i < 100; i++) {
+ values[i] = rnd.RandomString(value_size);
+ ASSERT_OK(Put(Key(i), values[i]));
+ }
+ ASSERT_OK(Flush());
+
+ // file 2 [100 => 300]
+ for (int32_t i = 100; i < 300; i++) {
+ values[i] = rnd.RandomString(value_size);
+ ASSERT_OK(Put(Key(i), values[i]));
+ }
+ ASSERT_OK(Flush());
+
+ // 2 files in L0
+ ASSERT_EQ("2", FilesPerLevel(0));
+ CompactRangeOptions compact_options;
+ compact_options.change_level = true;
+ compact_options.target_level = 2;
+ ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
+ // 2 files in L2
+ ASSERT_EQ("0,0,2", FilesPerLevel(0));
+
+ ASSERT_EQ(trivial_move, 1);
+ ASSERT_EQ(non_trivial_move, 0);
+
+ // file 3 [ 0 => 200]
+ for (int32_t i = 0; i < 200; i++) {
+ values[i] = rnd.RandomString(value_size);
+ ASSERT_OK(Put(Key(i), values[i]));
+ }
+ ASSERT_OK(Flush());
+
+ // 2 files in L2, 1 in L0
+ ASSERT_EQ("1,0,2", FilesPerLevel(0));
+ ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr, false));
+ // 2 files in L2, 1 in L1
+ ASSERT_EQ("0,1,2", FilesPerLevel(0));
+
+ ASSERT_EQ(trivial_move, 2);
+ ASSERT_EQ(non_trivial_move, 0);
+
+ ROCKSDB_NAMESPACE::port::Thread threads([&] {
+ compact_options.change_level = false;
+ compact_options.exclusive_manual_compaction = false;
+ std::string begin_string = Key(0);
+ std::string end_string = Key(199);
+ Slice begin(begin_string);
+ Slice end(end_string);
+ ASSERT_OK(db_->CompactRange(compact_options, &begin, &end));
+ });
+
+ TEST_SYNC_POINT("DBCompaction::PartialFill:1");
+  // Batch 4: many files covering keys [300 => 4300)
+ for (int32_t i = 0; i <= 5; i++) {
+ for (int32_t j = 300; j < 4300; j++) {
+ if (j == 2300) {
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ }
+ values[j] = rnd.RandomString(value_size);
+ ASSERT_OK(Put(Key(j), values[j]));
+ }
+ }
+
+ // Verify level sizes
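+  // The allowed size for level i is 4 * max_bytes_for_level_base *
+  // multiplier^(i-1); the 4x factor leaves some slack for data that has not
+  // been compacted down yet.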
+ uint64_t target_size = 4 * options.max_bytes_for_level_base;
+ for (int32_t i = 1; i < options.num_levels; i++) {
+ ASSERT_LE(SizeAtLevel(i), target_size);
+ target_size = static_cast<uint64_t>(target_size *
+ options.max_bytes_for_level_multiplier);
+ }
+
+ TEST_SYNC_POINT("DBCompaction::PartialFill:2");
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ threads.join();
+
+ for (int32_t i = 0; i < 4300; i++) {
+ ASSERT_EQ(Get(Key(i)), values[i]);
+ }
+}
+
+TEST_F(DBCompactionTest, ManualCompactionWithUnorderedWrite) {
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::WriteImpl:UnorderedWriteAfterWriteWAL",
+ "DBCompactionTest::ManualCompactionWithUnorderedWrite:WaitWriteWAL"},
+ {"DBImpl::WaitForPendingWrites:BeforeBlock",
+ "DBImpl::WriteImpl:BeforeUnorderedWriteMemtable"}});
+
+ Options options = CurrentOptions();
+ options.unordered_write = true;
+ DestroyAndReopen(options);
+ ASSERT_OK(Put("foo", "v1"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put("bar", "v1"));
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ port::Thread writer([&]() { ASSERT_OK(Put("foo", "v2")); });
+
+ TEST_SYNC_POINT(
+ "DBCompactionTest::ManualCompactionWithUnorderedWrite:WaitWriteWAL");
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+ writer.join();
+ ASSERT_EQ(Get("foo"), "v2");
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+
+ Reopen(options);
+ ASSERT_EQ(Get("foo"), "v2");
+}
+
+TEST_F(DBCompactionTest, DeleteFileRange) {
+ Options options = CurrentOptions();
+ options.write_buffer_size = 10 * 1024 * 1024;
+ options.max_bytes_for_level_multiplier = 2;
+ options.num_levels = 4;
+ options.level0_file_num_compaction_trigger = 3;
+ options.max_background_compactions = 3;
+
+ DestroyAndReopen(options);
+ int32_t value_size = 10 * 1024; // 10 KB
+
+ // Add 2 non-overlapping files
+ Random rnd(301);
+ std::map<int32_t, std::string> values;
+
+ // file 1 [0 => 100]
+ for (int32_t i = 0; i < 100; i++) {
+ values[i] = rnd.RandomString(value_size);
+ ASSERT_OK(Put(Key(i), values[i]));
+ }
+ ASSERT_OK(Flush());
+
+ // file 2 [100 => 300]
+ for (int32_t i = 100; i < 300; i++) {
+ values[i] = rnd.RandomString(value_size);
+ ASSERT_OK(Put(Key(i), values[i]));
+ }
+ ASSERT_OK(Flush());
+
+ // 2 files in L0
+ ASSERT_EQ("2", FilesPerLevel(0));
+ CompactRangeOptions compact_options;
+ compact_options.change_level = true;
+ compact_options.target_level = 2;
+ ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
+ // 2 files in L2
+ ASSERT_EQ("0,0,2", FilesPerLevel(0));
+
+ // file 3 [ 0 => 200]
+ for (int32_t i = 0; i < 200; i++) {
+ values[i] = rnd.RandomString(value_size);
+ ASSERT_OK(Put(Key(i), values[i]));
+ }
+ ASSERT_OK(Flush());
+
+  // Batch 4: many files covering keys [300 => 4300)
+ for (int32_t i = 0; i <= 5; i++) {
+ for (int32_t j = 300; j < 4300; j++) {
+ if (j == 2300) {
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ }
+ values[j] = rnd.RandomString(value_size);
+ ASSERT_OK(Put(Key(j), values[j]));
+ }
+ }
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ // Verify level sizes
+ uint64_t target_size = 4 * options.max_bytes_for_level_base;
+ for (int32_t i = 1; i < options.num_levels; i++) {
+ ASSERT_LE(SizeAtLevel(i), target_size);
+ target_size = static_cast<uint64_t>(target_size *
+ options.max_bytes_for_level_multiplier);
+ }
+
+ const size_t old_num_files = CountFiles();
+ std::string begin_string = Key(1000);
+ std::string end_string = Key(2000);
+ Slice begin(begin_string);
+ Slice end(end_string);
+ ASSERT_OK(DeleteFilesInRange(db_, db_->DefaultColumnFamily(), &begin, &end));
+
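+  // DeleteFilesInRange only drops SST files whose key range lies entirely
+  // inside [begin, end]; keys near the boundaries may survive, so both
+  // NotFound and OK are accepted below.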
+ int32_t deleted_count = 0;
+ for (int32_t i = 0; i < 4300; i++) {
+ if (i < 1000 || i > 2000) {
+ ASSERT_EQ(Get(Key(i)), values[i]);
+ } else {
+ ReadOptions roptions;
+ std::string result;
+ Status s = db_->Get(roptions, Key(i), &result);
+ ASSERT_TRUE(s.IsNotFound() || s.ok());
+ if (s.IsNotFound()) {
+ deleted_count++;
+ }
+ }
+ }
+ ASSERT_GT(deleted_count, 0);
+ begin_string = Key(5000);
+ end_string = Key(6000);
+ Slice begin1(begin_string);
+ Slice end1(end_string);
+  // Try deleting files in a range which contains no keys
+ ASSERT_OK(
+ DeleteFilesInRange(db_, db_->DefaultColumnFamily(), &begin1, &end1));
+
+ // Push data from level 0 to level 1 to force all data to be deleted
+ // Note that we don't delete level 0 files
+ compact_options.change_level = true;
+ compact_options.target_level = 1;
+ ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ ASSERT_OK(
+ DeleteFilesInRange(db_, db_->DefaultColumnFamily(), nullptr, nullptr));
+
+ int32_t deleted_count2 = 0;
+ for (int32_t i = 0; i < 4300; i++) {
+ ReadOptions roptions;
+ std::string result;
+ ASSERT_TRUE(db_->Get(roptions, Key(i), &result).IsNotFound());
+ deleted_count2++;
+ }
+ ASSERT_GT(deleted_count2, deleted_count);
+ const size_t new_num_files = CountFiles();
+ ASSERT_GT(old_num_files, new_num_files);
+}
+
+TEST_F(DBCompactionTest, DeleteFilesInRanges) {
+ Options options = CurrentOptions();
+ options.write_buffer_size = 10 * 1024 * 1024;
+ options.max_bytes_for_level_multiplier = 2;
+ options.num_levels = 4;
+ options.max_background_compactions = 3;
+ options.disable_auto_compactions = true;
+
+ DestroyAndReopen(options);
+ int32_t value_size = 10 * 1024; // 10 KB
+
+ Random rnd(301);
+ std::map<int32_t, std::string> values;
+
+ // file [0 => 100), [100 => 200), ... [900, 1000)
+ for (auto i = 0; i < 10; i++) {
+ for (auto j = 0; j < 100; j++) {
+ auto k = i * 100 + j;
+ values[k] = rnd.RandomString(value_size);
+ ASSERT_OK(Put(Key(k), values[k]));
+ }
+ ASSERT_OK(Flush());
+ }
+ ASSERT_EQ("10", FilesPerLevel(0));
+ CompactRangeOptions compact_options;
+ compact_options.change_level = true;
+ compact_options.target_level = 2;
+ ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
+ ASSERT_EQ("0,0,10", FilesPerLevel(0));
+
+ // file [0 => 100), [200 => 300), ... [800, 900)
+ for (auto i = 0; i < 10; i += 2) {
+ for (auto j = 0; j < 100; j++) {
+ auto k = i * 100 + j;
+ ASSERT_OK(Put(Key(k), values[k]));
+ }
+ ASSERT_OK(Flush());
+ }
+ ASSERT_EQ("5,0,10", FilesPerLevel(0));
+ ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr));
+ ASSERT_EQ("0,5,10", FilesPerLevel(0));
+
+ // Delete files in range [0, 299] (inclusive)
+ {
+ auto begin_str1 = Key(0), end_str1 = Key(100);
+ auto begin_str2 = Key(100), end_str2 = Key(200);
+ auto begin_str3 = Key(200), end_str3 = Key(299);
+ Slice begin1(begin_str1), end1(end_str1);
+ Slice begin2(begin_str2), end2(end_str2);
+ Slice begin3(begin_str3), end3(end_str3);
+ std::vector<RangePtr> ranges;
+ ranges.push_back(RangePtr(&begin1, &end1));
+ ranges.push_back(RangePtr(&begin2, &end2));
+ ranges.push_back(RangePtr(&begin3, &end3));
+ ASSERT_OK(DeleteFilesInRanges(db_, db_->DefaultColumnFamily(),
+ ranges.data(), ranges.size()));
+ ASSERT_EQ("0,3,7", FilesPerLevel(0));
+
+ // Keys [0, 300) should not exist.
+ for (auto i = 0; i < 300; i++) {
+ ReadOptions ropts;
+ std::string result;
+ auto s = db_->Get(ropts, Key(i), &result);
+ ASSERT_TRUE(s.IsNotFound());
+ }
+ for (auto i = 300; i < 1000; i++) {
+ ASSERT_EQ(Get(Key(i)), values[i]);
+ }
+ }
+
+ // Delete files in range [600, 999) (exclusive)
+ {
+ auto begin_str1 = Key(600), end_str1 = Key(800);
+ auto begin_str2 = Key(700), end_str2 = Key(900);
+ auto begin_str3 = Key(800), end_str3 = Key(999);
+ Slice begin1(begin_str1), end1(end_str1);
+ Slice begin2(begin_str2), end2(end_str2);
+ Slice begin3(begin_str3), end3(end_str3);
+ std::vector<RangePtr> ranges;
+ ranges.push_back(RangePtr(&begin1, &end1));
+ ranges.push_back(RangePtr(&begin2, &end2));
+ ranges.push_back(RangePtr(&begin3, &end3));
+ ASSERT_OK(DeleteFilesInRanges(db_, db_->DefaultColumnFamily(),
+ ranges.data(), ranges.size(), false));
+ ASSERT_EQ("0,1,4", FilesPerLevel(0));
+
+ // Keys [600, 900) should not exist.
+ for (auto i = 600; i < 900; i++) {
+ ReadOptions ropts;
+ std::string result;
+ auto s = db_->Get(ropts, Key(i), &result);
+ ASSERT_TRUE(s.IsNotFound());
+ }
+ for (auto i = 300; i < 600; i++) {
+ ASSERT_EQ(Get(Key(i)), values[i]);
+ }
+ for (auto i = 900; i < 1000; i++) {
+ ASSERT_EQ(Get(Key(i)), values[i]);
+ }
+ }
+
+ // Delete all files.
+ {
+ RangePtr range;
+ ASSERT_OK(DeleteFilesInRanges(db_, db_->DefaultColumnFamily(), &range, 1));
+ ASSERT_EQ("", FilesPerLevel(0));
+
+ for (auto i = 0; i < 1000; i++) {
+ ReadOptions ropts;
+ std::string result;
+ auto s = db_->Get(ropts, Key(i), &result);
+ ASSERT_TRUE(s.IsNotFound());
+ }
+ }
+}
+
+TEST_F(DBCompactionTest, DeleteFileRangeFileEndpointsOverlapBug) {
+ // regression test for #2833: groups of files whose user-keys overlap at the
+ // endpoints could be split by `DeleteFilesInRange`. This caused old data to
+ // reappear, either because a new version of the key was removed, or a range
+  // deletion was partially dropped. It could also cause the non-overlapping
+  // invariant to be violated if the files dropped by DeleteFilesInRange were
+  // a subset of the files that a range deletion spans.
+ const int kNumL0Files = 2;
+ const int kValSize = 8 << 10; // 8KB
+ Options options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = kNumL0Files;
+ options.target_file_size_base = 1 << 10; // 1KB
+ DestroyAndReopen(options);
+
+ // The snapshot prevents key 1 from having its old version dropped. The low
+ // `target_file_size_base` ensures two keys will be in each output file.
+ const Snapshot* snapshot = nullptr;
+ Random rnd(301);
+ // The value indicates which flush the key belonged to, which is enough
+ // for us to determine the keys' relative ages. After L0 flushes finish,
+ // files look like:
+ //
+ // File 0: 0 -> vals[0], 1 -> vals[0]
+ // File 1: 1 -> vals[1], 2 -> vals[1]
+ //
+ // Then L0->L1 compaction happens, which outputs keys as follows:
+ //
+ // File 0: 0 -> vals[0], 1 -> vals[1]
+ // File 1: 1 -> vals[0], 2 -> vals[1]
+ //
+ // DeleteFilesInRange shouldn't be allowed to drop just file 0, as that
+ // would cause `1 -> vals[0]` (an older key) to reappear.
+ std::string vals[kNumL0Files];
+ for (int i = 0; i < kNumL0Files; ++i) {
+ vals[i] = rnd.RandomString(kValSize);
+ ASSERT_OK(Put(Key(i), vals[i]));
+ ASSERT_OK(Put(Key(i + 1), vals[i]));
+ ASSERT_OK(Flush());
+ if (i == 0) {
+ snapshot = db_->GetSnapshot();
+ }
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  // Verify `DeleteFilesInRange` can't drop only file 0, which would cause
+ // "1 -> vals[0]" to reappear.
+ std::string begin_str = Key(0), end_str = Key(1);
+ Slice begin = begin_str, end = end_str;
+ ASSERT_OK(DeleteFilesInRange(db_, db_->DefaultColumnFamily(), &begin, &end));
+ ASSERT_EQ(vals[1], Get(Key(1)));
+
+ db_->ReleaseSnapshot(snapshot);
+}
+
+TEST_P(DBCompactionTestWithParam, TrivialMoveToLastLevelWithFiles) {
+ int32_t trivial_move = 0;
+ int32_t non_trivial_move = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:TrivialMove",
+ [&](void* /*arg*/) { trivial_move++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:NonTrivial",
+ [&](void* /*arg*/) { non_trivial_move++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Options options = CurrentOptions();
+ options.write_buffer_size = 100000000;
+ options.max_subcompactions = max_subcompactions_;
+ DestroyAndReopen(options);
+
+ int32_t value_size = 10 * 1024; // 10 KB
+
+ Random rnd(301);
+ std::vector<std::string> values;
+ // File with keys [ 0 => 99 ]
+ for (int i = 0; i < 100; i++) {
+ values.push_back(rnd.RandomString(value_size));
+ ASSERT_OK(Put(Key(i), values[i]));
+ }
+ ASSERT_OK(Flush());
+
+ ASSERT_EQ("1", FilesPerLevel(0));
+ // Compaction will do L0=>L1 (trivial move) then move L1 files to L3
+ CompactRangeOptions compact_options;
+ compact_options.change_level = true;
+ compact_options.target_level = 3;
+ compact_options.exclusive_manual_compaction = exclusive_manual_compaction_;
+ ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
+ ASSERT_EQ("0,0,0,1", FilesPerLevel(0));
+ ASSERT_EQ(trivial_move, 1);
+ ASSERT_EQ(non_trivial_move, 0);
+
+ // File with keys [ 100 => 199 ]
+ for (int i = 100; i < 200; i++) {
+ values.push_back(rnd.RandomString(value_size));
+ ASSERT_OK(Put(Key(i), values[i]));
+ }
+ ASSERT_OK(Flush());
+
+ ASSERT_EQ("1,0,0,1", FilesPerLevel(0));
+ CompactRangeOptions cro;
+ cro.exclusive_manual_compaction = exclusive_manual_compaction_;
+ // Compaction will do L0=>L1 L1=>L2 L2=>L3 (3 trivial moves)
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ ASSERT_EQ("0,0,0,2", FilesPerLevel(0));
+ ASSERT_EQ(trivial_move, 4);
+ ASSERT_EQ(non_trivial_move, 0);
+
+ for (int i = 0; i < 200; i++) {
+ ASSERT_EQ(Get(Key(i)), values[i]);
+ }
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_P(DBCompactionTestWithParam, LevelCompactionThirdPath) {
+ Options options = CurrentOptions();
+ options.db_paths.emplace_back(dbname_, 500 * 1024);
+ options.db_paths.emplace_back(dbname_ + "_2", 4 * 1024 * 1024);
+ options.db_paths.emplace_back(dbname_ + "_3", 1024 * 1024 * 1024);
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(KNumKeysByGenerateNewFile - 1));
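+  // The special memtable reports itself full after
+  // KNumKeysByGenerateNewFile - 1 entries, so each GenerateNewFile() call
+  // flushes roughly one new SST.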
+ options.compaction_style = kCompactionStyleLevel;
+ options.write_buffer_size = 110 << 10; // 110KB
+ options.arena_block_size = 4 << 10;
+ options.level0_file_num_compaction_trigger = 2;
+ options.num_levels = 4;
+ options.max_bytes_for_level_base = 400 * 1024;
+ options.max_subcompactions = max_subcompactions_;
+
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ int key_idx = 0;
+
+  // The first three 110KB files do not go to the second path.
+ // After that, (100K, 200K)
+ for (int num = 0; num < 3; num++) {
+ GenerateNewFile(&rnd, &key_idx);
+ }
+
+  // Another 110KB triggers a compaction to a 400K file, filling up the first path
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ(3, GetSstFileCount(options.db_paths[1].path));
+
+ // (1, 4)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,4", FilesPerLevel(0));
+ ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+ // (1, 4, 1)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,4,1", FilesPerLevel(0));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[2].path));
+ ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+ // (1, 4, 2)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,4,2", FilesPerLevel(0));
+ ASSERT_EQ(2, GetSstFileCount(options.db_paths[2].path));
+ ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+ // (1, 4, 3)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,4,3", FilesPerLevel(0));
+ ASSERT_EQ(3, GetSstFileCount(options.db_paths[2].path));
+ ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+ // (1, 4, 4)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,4,4", FilesPerLevel(0));
+ ASSERT_EQ(4, GetSstFileCount(options.db_paths[2].path));
+ ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+ // (1, 4, 5)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,4,5", FilesPerLevel(0));
+ ASSERT_EQ(5, GetSstFileCount(options.db_paths[2].path));
+ ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+ // (1, 4, 6)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,4,6", FilesPerLevel(0));
+ ASSERT_EQ(6, GetSstFileCount(options.db_paths[2].path));
+ ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+ // (1, 4, 7)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,4,7", FilesPerLevel(0));
+ ASSERT_EQ(7, GetSstFileCount(options.db_paths[2].path));
+ ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+ // (1, 4, 8)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,4,8", FilesPerLevel(0));
+ ASSERT_EQ(8, GetSstFileCount(options.db_paths[2].path));
+ ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+ for (int i = 0; i < key_idx; i++) {
+ auto v = Get(Key(i));
+ ASSERT_NE(v, "NOT_FOUND");
+ ASSERT_TRUE(v.size() == 1 || v.size() == 990);
+ }
+
+ Reopen(options);
+
+ for (int i = 0; i < key_idx; i++) {
+ auto v = Get(Key(i));
+ ASSERT_NE(v, "NOT_FOUND");
+ ASSERT_TRUE(v.size() == 1 || v.size() == 990);
+ }
+
+ Destroy(options);
+}
+
+TEST_P(DBCompactionTestWithParam, LevelCompactionPathUse) {
+ Options options = CurrentOptions();
+ options.db_paths.emplace_back(dbname_, 500 * 1024);
+ options.db_paths.emplace_back(dbname_ + "_2", 4 * 1024 * 1024);
+ options.db_paths.emplace_back(dbname_ + "_3", 1024 * 1024 * 1024);
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(KNumKeysByGenerateNewFile - 1));
+ options.compaction_style = kCompactionStyleLevel;
+ options.write_buffer_size = 110 << 10; // 110KB
+ options.arena_block_size = 4 << 10;
+ options.level0_file_num_compaction_trigger = 2;
+ options.num_levels = 4;
+ options.max_bytes_for_level_base = 400 * 1024;
+ options.max_subcompactions = max_subcompactions_;
+
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ int key_idx = 0;
+
+  // Each batch always gets compacted into 1 Level1 file,
+  // leaving 0 or 1 Level 0 files.
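+  // key_idx is reset before each batch, so all files cover the same key
+  // range and each L0->L1 compaction rewrites into a single L1 file.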
+ for (int num = 0; num < 3; num++) {
+ key_idx = 0;
+ GenerateNewFile(&rnd, &key_idx);
+ }
+
+ key_idx = 0;
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+
+ key_idx = 0;
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,1", FilesPerLevel(0));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+ key_idx = 0;
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("0,1", FilesPerLevel(0));
+ ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+ key_idx = 0;
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,1", FilesPerLevel(0));
+ ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+ key_idx = 0;
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("0,1", FilesPerLevel(0));
+ ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+ key_idx = 0;
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,1", FilesPerLevel(0));
+ ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+ key_idx = 0;
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("0,1", FilesPerLevel(0));
+ ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+ key_idx = 0;
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,1", FilesPerLevel(0));
+ ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+ key_idx = 0;
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("0,1", FilesPerLevel(0));
+ ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+ key_idx = 0;
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,1", FilesPerLevel(0));
+ ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+ for (int i = 0; i < key_idx; i++) {
+ auto v = Get(Key(i));
+ ASSERT_NE(v, "NOT_FOUND");
+ ASSERT_TRUE(v.size() == 1 || v.size() == 990);
+ }
+
+ Reopen(options);
+
+ for (int i = 0; i < key_idx; i++) {
+ auto v = Get(Key(i));
+ ASSERT_NE(v, "NOT_FOUND");
+ ASSERT_TRUE(v.size() == 1 || v.size() == 990);
+ }
+
+ Destroy(options);
+}
+
+TEST_P(DBCompactionTestWithParam, LevelCompactionCFPathUse) {
+ Options options = CurrentOptions();
+ options.db_paths.emplace_back(dbname_, 500 * 1024);
+ options.db_paths.emplace_back(dbname_ + "_2", 4 * 1024 * 1024);
+ options.db_paths.emplace_back(dbname_ + "_3", 1024 * 1024 * 1024);
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(KNumKeysByGenerateNewFile - 1));
+ options.compaction_style = kCompactionStyleLevel;
+ options.write_buffer_size = 110 << 10; // 110KB
+ options.arena_block_size = 4 << 10;
+ options.level0_file_num_compaction_trigger = 2;
+ options.num_levels = 4;
+ options.max_bytes_for_level_base = 400 * 1024;
+ options.max_subcompactions = max_subcompactions_;
+
+ std::vector<Options> option_vector;
+ option_vector.emplace_back(options);
+ ColumnFamilyOptions cf_opt1(options), cf_opt2(options);
+ // Configure CF1 specific paths.
+ cf_opt1.cf_paths.emplace_back(dbname_ + "cf1", 500 * 1024);
+ cf_opt1.cf_paths.emplace_back(dbname_ + "cf1_2", 4 * 1024 * 1024);
+ cf_opt1.cf_paths.emplace_back(dbname_ + "cf1_3", 1024 * 1024 * 1024);
+ option_vector.emplace_back(DBOptions(options), cf_opt1);
+ CreateColumnFamilies({"one"}, option_vector[1]);
+
+ // Configure CF2 specific paths.
+ cf_opt2.cf_paths.emplace_back(dbname_ + "cf2", 500 * 1024);
+ cf_opt2.cf_paths.emplace_back(dbname_ + "cf2_2", 4 * 1024 * 1024);
+ cf_opt2.cf_paths.emplace_back(dbname_ + "cf2_3", 1024 * 1024 * 1024);
+ option_vector.emplace_back(DBOptions(options), cf_opt2);
+ CreateColumnFamilies({"two"}, option_vector[2]);
+
+ ReopenWithColumnFamilies({"default", "one", "two"}, option_vector);
+
+ Random rnd(301);
+ int key_idx = 0;
+ int key_idx1 = 0;
+ int key_idx2 = 0;
+
+ auto generate_file = [&]() {
+ GenerateNewFile(0, &rnd, &key_idx);
+ GenerateNewFile(1, &rnd, &key_idx1);
+ GenerateNewFile(2, &rnd, &key_idx2);
+ };
+
+ auto check_sstfilecount = [&](int path_id, int expected) {
+ ASSERT_EQ(expected, GetSstFileCount(options.db_paths[path_id].path));
+ ASSERT_EQ(expected, GetSstFileCount(cf_opt1.cf_paths[path_id].path));
+ ASSERT_EQ(expected, GetSstFileCount(cf_opt2.cf_paths[path_id].path));
+ };
+
+ auto check_filesperlevel = [&](const std::string& expected) {
+ ASSERT_EQ(expected, FilesPerLevel(0));
+ ASSERT_EQ(expected, FilesPerLevel(1));
+ ASSERT_EQ(expected, FilesPerLevel(2));
+ };
+
+ auto check_getvalues = [&]() {
+ for (int i = 0; i < key_idx; i++) {
+ auto v = Get(0, Key(i));
+ ASSERT_NE(v, "NOT_FOUND");
+ ASSERT_TRUE(v.size() == 1 || v.size() == 990);
+ }
+
+ for (int i = 0; i < key_idx1; i++) {
+ auto v = Get(1, Key(i));
+ ASSERT_NE(v, "NOT_FOUND");
+ ASSERT_TRUE(v.size() == 1 || v.size() == 990);
+ }
+
+ for (int i = 0; i < key_idx2; i++) {
+ auto v = Get(2, Key(i));
+ ASSERT_NE(v, "NOT_FOUND");
+ ASSERT_TRUE(v.size() == 1 || v.size() == 990);
+ }
+ };
+
+  // Check that the default column family uses db_paths,
+  // and column family "one" uses cf_paths.
+
+  // The compactions from level0 output their sst files to level1.
+  // The first path cannot hold level1's data (400KB + 400KB > 500KB),
+  // so every compaction moves an sst file to the second path. Please
+  // refer to LevelCompactionBuilder::GetPathId.
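+  // (Roughly: GetPathId walks the configured paths in order and picks the
+  // first one whose target size still has room for the level's data, falling
+  // back to the last path otherwise.)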
+ for (int num = 0; num < 3; num++) {
+ generate_file();
+ }
+ check_sstfilecount(0, 1);
+ check_sstfilecount(1, 2);
+
+ generate_file();
+ check_sstfilecount(1, 3);
+
+ // (1, 4)
+ generate_file();
+ check_filesperlevel("1,4");
+ check_sstfilecount(1, 4);
+ check_sstfilecount(0, 1);
+
+ // (1, 4, 1)
+ generate_file();
+ check_filesperlevel("1,4,1");
+ check_sstfilecount(2, 1);
+ check_sstfilecount(1, 4);
+ check_sstfilecount(0, 1);
+
+ // (1, 4, 2)
+ generate_file();
+ check_filesperlevel("1,4,2");
+ check_sstfilecount(2, 2);
+ check_sstfilecount(1, 4);
+ check_sstfilecount(0, 1);
+
+ check_getvalues();
+
+ { // Also verify GetLiveFilesStorageInfo with db_paths / cf_paths
+ std::vector<LiveFileStorageInfo> new_infos;
+ LiveFilesStorageInfoOptions lfsio;
+ lfsio.wal_size_for_flush = UINT64_MAX; // no flush
+ ASSERT_OK(db_->GetLiveFilesStorageInfo(lfsio, &new_infos));
+ std::unordered_map<std::string, int> live_sst_by_dir;
+ for (auto& info : new_infos) {
+ if (info.file_type == kTableFile) {
+ live_sst_by_dir[info.directory]++;
+ // Verify file on disk (no directory confusion)
+ uint64_t size;
+ ASSERT_OK(env_->GetFileSize(
+ info.directory + "/" + info.relative_filename, &size));
+ ASSERT_EQ(info.size, size);
+ }
+ }
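+    // 3 configured paths per entity x 3 entities (default CF via db_paths,
+    // CFs "one" and "two" via cf_paths) = 9 distinct directories.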
+ ASSERT_EQ(3U * 3U, live_sst_by_dir.size());
+ for (auto& paths : {options.db_paths, cf_opt1.cf_paths, cf_opt2.cf_paths}) {
+ ASSERT_EQ(1, live_sst_by_dir[paths[0].path]);
+ ASSERT_EQ(4, live_sst_by_dir[paths[1].path]);
+ ASSERT_EQ(2, live_sst_by_dir[paths[2].path]);
+ }
+ }
+
+ ReopenWithColumnFamilies({"default", "one", "two"}, option_vector);
+
+ check_getvalues();
+
+ Destroy(options, true);
+}
+
+TEST_P(DBCompactionTestWithParam, ConvertCompactionStyle) {
+ Random rnd(301);
+ int max_key_level_insert = 200;
+ int max_key_universal_insert = 600;
+
+ // Stage 1: generate a db with level compaction
+ Options options = CurrentOptions();
+ options.write_buffer_size = 110 << 10; // 110KB
+ options.arena_block_size = 4 << 10;
+ options.num_levels = 4;
+ options.level0_file_num_compaction_trigger = 3;
+ options.max_bytes_for_level_base = 500 << 10; // 500KB
+ options.max_bytes_for_level_multiplier = 1;
+ options.target_file_size_base = 200 << 10; // 200KB
+ options.target_file_size_multiplier = 1;
+ options.max_subcompactions = max_subcompactions_;
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ for (int i = 0; i <= max_key_level_insert; i++) {
+ // each value is 10K
+ ASSERT_OK(Put(1, Key(i), rnd.RandomString(10000)));
+ }
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ ASSERT_GT(TotalTableFiles(1, 4), 1);
+ int non_level0_num_files = 0;
+ for (int i = 1; i < options.num_levels; i++) {
+ non_level0_num_files += NumTableFilesAtLevel(i, 1);
+ }
+ ASSERT_GT(non_level0_num_files, 0);
+
+ // Stage 2: reopen with universal compaction - should fail
+ options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.num_levels = 1;
+ options = CurrentOptions(options);
+ Status s = TryReopenWithColumnFamilies({"default", "pikachu"}, options);
+ ASSERT_TRUE(s.IsInvalidArgument());
+
+ // Stage 3: compact into a single file and move the file to level 0
+ options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.target_file_size_base = INT_MAX;
+ options.target_file_size_multiplier = 1;
+ options.max_bytes_for_level_base = INT_MAX;
+ options.max_bytes_for_level_multiplier = 1;
+ options.num_levels = 4;
+ options = CurrentOptions(options);
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+ CompactRangeOptions compact_options;
+ compact_options.change_level = true;
+ compact_options.target_level = 0;
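+  // With everything in a single L0 file, the universal-compaction reopen in
+  // stage 4 succeeds (unlike stage 2, where data below L0 made it fail).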
+  // cannot use kForceOptimized because the compaction here is expected
+  // to generate exactly one output file
+ compact_options.bottommost_level_compaction =
+ BottommostLevelCompaction::kForce;
+ compact_options.exclusive_manual_compaction = exclusive_manual_compaction_;
+ ASSERT_OK(
+ dbfull()->CompactRange(compact_options, handles_[1], nullptr, nullptr));
+
+ // Only 1 file in L0
+ ASSERT_EQ("1", FilesPerLevel(1));
+
+ // Stage 4: re-open in universal compaction style and do some db operations
+ options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.num_levels = 4;
+ options.write_buffer_size = 110 << 10; // 110KB
+ options.arena_block_size = 4 << 10;
+ options.level0_file_num_compaction_trigger = 3;
+ options = CurrentOptions(options);
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+ options.num_levels = 1;
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+ for (int i = max_key_level_insert / 2; i <= max_key_universal_insert; i++) {
+ ASSERT_OK(Put(1, Key(i), rnd.RandomString(10000)));
+ }
+ ASSERT_OK(dbfull()->Flush(FlushOptions()));
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ for (int i = 1; i < options.num_levels; i++) {
+ ASSERT_EQ(NumTableFilesAtLevel(i, 1), 0);
+ }
+
+ // verify keys inserted in both level compaction style and universal
+ // compaction style
+ std::string keys_in_db;
+ Iterator* iter = dbfull()->NewIterator(ReadOptions(), handles_[1]);
+ ASSERT_OK(iter->status());
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ keys_in_db.append(iter->key().ToString());
+ keys_in_db.push_back(',');
+ }
+ delete iter;
+
+ std::string expected_keys;
+ for (int i = 0; i <= max_key_universal_insert; i++) {
+ expected_keys.append(Key(i));
+ expected_keys.push_back(',');
+ }
+
+ ASSERT_EQ(keys_in_db, expected_keys);
+}
+
+TEST_F(DBCompactionTest, L0_CompactionBug_Issue44_a) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ ASSERT_OK(Put(1, "b", "v"));
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ ASSERT_OK(Delete(1, "b"));
+ ASSERT_OK(Delete(1, "a"));
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ ASSERT_OK(Delete(1, "a"));
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ ASSERT_OK(Put(1, "a", "v"));
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ ASSERT_EQ("(a->v)", Contents(1));
+ env_->SleepForMicroseconds(1000000); // Wait for compaction to finish
+ ASSERT_EQ("(a->v)", Contents(1));
+ } while (ChangeCompactOptions());
+}
+
+TEST_F(DBCompactionTest, L0_CompactionBug_Issue44_b) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ ASSERT_OK(Put(1, "", ""));
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ ASSERT_OK(Delete(1, "e"));
+ ASSERT_OK(Put(1, "", ""));
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ ASSERT_OK(Put(1, "c", "cv"));
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ ASSERT_OK(Put(1, "", ""));
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ ASSERT_OK(Put(1, "", ""));
+ env_->SleepForMicroseconds(1000000); // Wait for compaction to finish
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ ASSERT_OK(Put(1, "d", "dv"));
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ ASSERT_OK(Put(1, "", ""));
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ ASSERT_OK(Delete(1, "d"));
+ ASSERT_OK(Delete(1, "b"));
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ ASSERT_EQ("(->)(c->cv)", Contents(1));
+ env_->SleepForMicroseconds(1000000); // Wait for compaction to finish
+ ASSERT_EQ("(->)(c->cv)", Contents(1));
+ } while (ChangeCompactOptions());
+}
+
+TEST_F(DBCompactionTest, ManualAutoRace) {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::BGWorkCompaction", "DBCompactionTest::ManualAutoRace:1"},
+ {"DBImpl::RunManualCompaction:WaitScheduled",
+ "BackgroundCallCompaction:0"}});
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(Put(1, "foo", ""));
+ ASSERT_OK(Put(1, "bar", ""));
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(Put(1, "foo", ""));
+ ASSERT_OK(Put(1, "bar", ""));
+ // Generate four files in CF 0, which should trigger an auto compaction
+ ASSERT_OK(Put("foo", ""));
+ ASSERT_OK(Put("bar", ""));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("foo", ""));
+ ASSERT_OK(Put("bar", ""));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("foo", ""));
+ ASSERT_OK(Put("bar", ""));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("foo", ""));
+ ASSERT_OK(Put("bar", ""));
+ ASSERT_OK(Flush());
+
+  // The auto compaction is scheduled but blocked until here
+ TEST_SYNC_POINT("DBCompactionTest::ManualAutoRace:1");
+  // The auto compaction will wait until the manual compaction is registered
+  // before processing, so that it will be cancelled.
+ CompactRangeOptions cro;
+ cro.exclusive_manual_compaction = true;
+ ASSERT_OK(dbfull()->CompactRange(cro, handles_[1], nullptr, nullptr));
+ ASSERT_EQ("0,1", FilesPerLevel(1));
+
+ // Eventually the cancelled compaction will be rescheduled and executed.
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ("0,1", FilesPerLevel(0));
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_P(DBCompactionTestWithParam, ManualCompaction) {
+ Options options = CurrentOptions();
+ options.max_subcompactions = max_subcompactions_;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // iter - 0 with 7 levels
+ // iter - 1 with 3 levels
+ for (int iter = 0; iter < 2; ++iter) {
+ MakeTables(3, "p", "q", 1);
+ ASSERT_EQ("1,1,1", FilesPerLevel(1));
+
+ // Compaction range falls before files
+ Compact(1, "", "c");
+ ASSERT_EQ("1,1,1", FilesPerLevel(1));
+
+ // Compaction range falls after files
+ Compact(1, "r", "z");
+ ASSERT_EQ("1,1,1", FilesPerLevel(1));
+
+ // Compaction range overlaps files
+ Compact(1, "p", "q");
+ ASSERT_EQ("0,0,1", FilesPerLevel(1));
+
+ // Populate a different range
+ MakeTables(3, "c", "e", 1);
+ ASSERT_EQ("1,1,2", FilesPerLevel(1));
+
+ // Compact just the new range
+ Compact(1, "b", "f");
+ ASSERT_EQ("0,0,2", FilesPerLevel(1));
+
+ // Compact all
+ MakeTables(1, "a", "z", 1);
+ ASSERT_EQ("1,0,2", FilesPerLevel(1));
+
+ uint64_t prev_block_cache_add =
+ options.statistics->getTickerCount(BLOCK_CACHE_ADD);
+ CompactRangeOptions cro;
+ cro.exclusive_manual_compaction = exclusive_manual_compaction_;
+ ASSERT_OK(db_->CompactRange(cro, handles_[1], nullptr, nullptr));
+ // Verify manual compaction doesn't fill block cache
+ ASSERT_EQ(prev_block_cache_add,
+ options.statistics->getTickerCount(BLOCK_CACHE_ADD));
+
+ ASSERT_EQ("0,0,1", FilesPerLevel(1));
+
+ if (iter == 0) {
+ options = CurrentOptions();
+ options.num_levels = 3;
+ options.create_if_missing = true;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+ }
+ }
+}
+
+TEST_P(DBCompactionTestWithParam, ManualLevelCompactionOutputPathId) {
+ Options options = CurrentOptions();
+ options.db_paths.emplace_back(dbname_ + "_2", 2 * 10485760);
+ options.db_paths.emplace_back(dbname_ + "_3", 100 * 10485760);
+ options.db_paths.emplace_back(dbname_ + "_4", 120 * 10485760);
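+ // Flushes go to db_paths[0] by default; the manual compactions below direct
+ // their output to db_paths[1] via target path id 1.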
+ options.max_subcompactions = max_subcompactions_;
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // iter - 0 with 7 levels
+ // iter - 1 with 3 levels
+ for (int iter = 0; iter < 2; ++iter) {
+ for (int i = 0; i < 3; ++i) {
+ ASSERT_OK(Put(1, "p", "begin"));
+ ASSERT_OK(Put(1, "q", "end"));
+ ASSERT_OK(Flush(1));
+ }
+ ASSERT_EQ("3", FilesPerLevel(1));
+ ASSERT_EQ(3, GetSstFileCount(options.db_paths[0].path));
+ ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+ // Compaction range falls before files
+ Compact(1, "", "c");
+ ASSERT_EQ("3", FilesPerLevel(1));
+
+ // Compaction range falls after files
+ Compact(1, "r", "z");
+ ASSERT_EQ("3", FilesPerLevel(1));
+
+ // Compaction range overlaps files
+ Compact(1, "p", "q", 1);
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ("0,1", FilesPerLevel(1));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(0, GetSstFileCount(options.db_paths[0].path));
+ ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+ // Populate a different range
+ for (int i = 0; i < 3; ++i) {
+ ASSERT_OK(Put(1, "c", "begin"));
+ ASSERT_OK(Put(1, "e", "end"));
+ ASSERT_OK(Flush(1));
+ }
+ ASSERT_EQ("3,1", FilesPerLevel(1));
+
+ // Compact just the new range
+ Compact(1, "b", "f", 1);
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ("0,2", FilesPerLevel(1));
+ ASSERT_EQ(2, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(0, GetSstFileCount(options.db_paths[0].path));
+ ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+ // Compact all
+ ASSERT_OK(Put(1, "a", "begin"));
+ ASSERT_OK(Put(1, "z", "end"));
+ ASSERT_OK(Flush(1));
+ ASSERT_EQ("1,2", FilesPerLevel(1));
+ ASSERT_EQ(2, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[0].path));
+ CompactRangeOptions compact_options;
+ compact_options.target_path_id = 1;
+ compact_options.exclusive_manual_compaction = exclusive_manual_compaction_;
+ ASSERT_OK(
+ db_->CompactRange(compact_options, handles_[1], nullptr, nullptr));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ ASSERT_EQ("0,1", FilesPerLevel(1));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(0, GetSstFileCount(options.db_paths[0].path));
+ ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+ if (iter == 0) {
+ DestroyAndReopen(options);
+ options = CurrentOptions();
+ options.db_paths.emplace_back(dbname_ + "_2", 2 * 10485760);
+ options.db_paths.emplace_back(dbname_ + "_3", 100 * 10485760);
+ options.db_paths.emplace_back(dbname_ + "_4", 120 * 10485760);
+ options.max_background_flushes = 1;
+ options.num_levels = 3;
+ options.create_if_missing = true;
+ CreateAndReopenWithCF({"pikachu"}, options);
+ }
+ }
+}
+
+TEST_F(DBCompactionTest, FilesDeletedAfterCompaction) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ ASSERT_OK(Put(1, "foo", "v2"));
+ Compact(1, "a", "z");
+ const size_t num_files = CountLiveFiles();
+ for (int i = 0; i < 10; i++) {
+ ASSERT_OK(Put(1, "foo", "v2"));
+ Compact(1, "a", "z");
+ }
+ ASSERT_EQ(CountLiveFiles(), num_files);
+ } while (ChangeCompactOptions());
+}
+
+// Check level compaction with CompactFiles()
+TEST_P(DBCompactionTestWithParam, DISABLED_CompactFilesOnLevelCompaction) {
+ const int kTestKeySize = 16;
+ const int kTestValueSize = 984;
+ const int kEntrySize = kTestKeySize + kTestValueSize;
+ const int kEntriesPerBuffer = 100;
+ Options options;
+ options.create_if_missing = true;
+ options.write_buffer_size = kEntrySize * kEntriesPerBuffer;
+ options.compaction_style = kCompactionStyleLevel;
+ options.target_file_size_base = options.write_buffer_size;
+ options.max_bytes_for_level_base = options.target_file_size_base * 2;
+ options.level0_stop_writes_trigger = 2;
+ options.max_bytes_for_level_multiplier = 2;
+ options.compression = kNoCompression;
+ options.max_subcompactions = max_subcompactions_;
+ options = CurrentOptions(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ Random rnd(301);
+ for (int key = 64 * kEntriesPerBuffer; key >= 0; --key) {
+ ASSERT_OK(Put(1, std::to_string(key), rnd.RandomString(kTestValueSize)));
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1]));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ ColumnFamilyMetaData cf_meta;
+ dbfull()->GetColumnFamilyMetaData(handles_[1], &cf_meta);
+ int output_level = static_cast<int>(cf_meta.levels.size()) - 1;
+ for (int file_picked = 5; file_picked > 0; --file_picked) {
+ std::set<std::string> overlapping_file_names;
+ std::vector<std::string> compaction_input_file_names;
+ for (int f = 0; f < file_picked; ++f) {
+ int level = 0;
+ auto file_meta = PickFileRandomly(cf_meta, &rnd, &level);
+ compaction_input_file_names.push_back(file_meta->name);
+ GetOverlappingFileNumbersForLevelCompaction(
+ cf_meta, options.comparator, level, output_level, file_meta,
+ &overlapping_file_names);
+ }
+
+ ASSERT_OK(dbfull()->CompactFiles(CompactionOptions(), handles_[1],
+ compaction_input_file_names,
+ output_level));
+
+ // Make sure all overlapping files do not exist after compaction
+ dbfull()->GetColumnFamilyMetaData(handles_[1], &cf_meta);
+ VerifyCompactionResult(cf_meta, overlapping_file_names);
+ }
+
+ // make sure all key-values are still there.
+ for (int key = 64 * kEntriesPerBuffer; key >= 0; --key) {
+ ASSERT_NE(Get(1, std::to_string(key)), "NOT_FOUND");
+ }
+}
+
+TEST_P(DBCompactionTestWithParam, PartialCompactionFailure) {
+ Options options;
+ const int kKeySize = 16;
+ const int kKvSize = 1000;
+ const int kKeysPerBuffer = 100;
+ const int kNumL1Files = 5;
+ options.create_if_missing = true;
+ options.write_buffer_size = kKeysPerBuffer * kKvSize;
+ options.max_write_buffer_number = 2;
+ options.target_file_size_base =
+ options.write_buffer_size * (options.max_write_buffer_number - 1);
+ options.level0_file_num_compaction_trigger = kNumL1Files;
+ options.max_bytes_for_level_base =
+ options.level0_file_num_compaction_trigger *
+ options.target_file_size_base;
+ options.max_bytes_for_level_multiplier = 2;
+ options.compression = kNoCompression;
+ options.max_subcompactions = max_subcompactions_;
+
+ env_->SetBackgroundThreads(1, Env::HIGH);
+ env_->SetBackgroundThreads(1, Env::LOW);
+ // stop the compaction thread until we simulate the file creation failure.
+ test::SleepingBackgroundTask sleeping_task_low;
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+ Env::Priority::LOW);
+
+ options.env = env_;
+
+ DestroyAndReopen(options);
+
+ const int kNumInsertedKeys = options.level0_file_num_compaction_trigger *
+ (options.max_write_buffer_number - 1) *
+ kKeysPerBuffer;
+
+ Random rnd(301);
+ std::vector<std::string> keys;
+ std::vector<std::string> values;
+ for (int k = 0; k < kNumInsertedKeys; ++k) {
+ keys.emplace_back(rnd.RandomString(kKeySize));
+ values.emplace_back(rnd.RandomString(kKvSize - kKeySize));
+ ASSERT_OK(Put(Slice(keys[k]), Slice(values[k])));
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ }
+
+ ASSERT_OK(dbfull()->TEST_FlushMemTable(true));
+ // Make sure the number of L0 files can trigger compaction.
+ ASSERT_GE(NumTableFilesAtLevel(0),
+ options.level0_file_num_compaction_trigger);
+
+ auto previous_num_level0_files = NumTableFilesAtLevel(0);
+
+ // Fail the first file creation.
+ env_->non_writable_count_ = 1;
+ sleeping_task_low.WakeUp();
+ sleeping_task_low.WaitUntilDone();
+
+ // Expect compaction to fail here as one file will fail its
+ // creation.
+ ASSERT_TRUE(!dbfull()->TEST_WaitForCompact().ok());
+
+ // Verify L0 -> L1 compaction does fail.
+ ASSERT_EQ(NumTableFilesAtLevel(1), 0);
+
+ // Verify all L0 files are still there.
+ ASSERT_EQ(NumTableFilesAtLevel(0), previous_num_level0_files);
+
+ // All key-values must exist after compaction fails.
+ for (int k = 0; k < kNumInsertedKeys; ++k) {
+ ASSERT_EQ(values[k], Get(keys[k]));
+ }
+
+ env_->non_writable_count_ = 0;
+
+ // Make sure RocksDB will not get into corrupted state.
+ Reopen(options);
+
+ // Verify again after reopen.
+ for (int k = 0; k < kNumInsertedKeys; ++k) {
+ ASSERT_EQ(values[k], Get(keys[k]));
+ }
+}
+
+TEST_P(DBCompactionTestWithParam, DeleteMovedFileAfterCompaction) {
+ // iter 1 -- delete_obsolete_files_period_micros == 0
+ for (int iter = 0; iter < 2; ++iter) {
+ // This test triggers move compaction and verifies that the file is not
+ // deleted when it's part of move compaction
+ Options options = CurrentOptions();
+ options.env = env_;
+ if (iter == 1) {
+ options.delete_obsolete_files_period_micros = 0;
+ }
+ options.create_if_missing = true;
+ options.level0_file_num_compaction_trigger =
+ 2; // trigger compaction when we have 2 files
+ OnFileDeletionListener* listener = new OnFileDeletionListener();
+ options.listeners.emplace_back(listener);
+ options.max_subcompactions = max_subcompactions_;
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ // Create two 1MB sst files
+ for (int i = 0; i < 2; ++i) {
+ // Create 1MB sst file
+ for (int j = 0; j < 100; ++j) {
+ ASSERT_OK(Put(Key(i * 50 + j), rnd.RandomString(10 * 1024)));
+ }
+ ASSERT_OK(Flush());
+ }
+ // this should execute L0->L1
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ("0,1", FilesPerLevel(0));
+
+ // block compactions
+ test::SleepingBackgroundTask sleeping_task;
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task,
+ Env::Priority::LOW);
+
+ options.max_bytes_for_level_base = 1024 * 1024; // 1 MB
+ Reopen(options);
+ std::unique_ptr<Iterator> iterator(db_->NewIterator(ReadOptions()));
+ ASSERT_EQ("0,1", FilesPerLevel(0));
+ // let compactions go
+ sleeping_task.WakeUp();
+ sleeping_task.WaitUntilDone();
+
+ // this should execute L1->L2 (move)
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ ASSERT_EQ("0,0,1", FilesPerLevel(0));
+
+ std::vector<LiveFileMetaData> metadata;
+ db_->GetLiveFilesMetaData(&metadata);
+ ASSERT_EQ(metadata.size(), 1U);
+ auto moved_file_name = metadata[0].name;
+
+ // Create two more 1MB sst files
+ for (int i = 0; i < 2; ++i) {
+ // Create 1MB sst file
+ for (int j = 0; j < 100; ++j) {
+ ASSERT_OK(Put(Key(i * 50 + j + 100), rnd.RandomString(10 * 1024)));
+ }
+ ASSERT_OK(Flush());
+ }
+ // this should execute both L0->L1 and L1->L2 (merge with previous file)
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ ASSERT_EQ("0,0,2", FilesPerLevel(0));
+
+ // iterator is holding the file
+ ASSERT_OK(env_->FileExists(dbname_ + moved_file_name));
+
+ listener->SetExpectedFileName(dbname_ + moved_file_name);
+ ASSERT_OK(iterator->status());
+ iterator.reset();
+
+ // this file should have been compacted away
+ ASSERT_NOK(env_->FileExists(dbname_ + moved_file_name));
+ listener->VerifyMatchedCount(1);
+ }
+}
+
+TEST_P(DBCompactionTestWithParam, CompressLevelCompaction) {
+ if (!Zlib_Supported()) {
+ return;
+ }
+ Options options = CurrentOptions();
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(KNumKeysByGenerateNewFile - 1));
+ options.compaction_style = kCompactionStyleLevel;
+ options.write_buffer_size = 110 << 10; // 110KB
+ options.arena_block_size = 4 << 10;
+ options.level0_file_num_compaction_trigger = 2;
+ options.num_levels = 4;
+ options.max_bytes_for_level_base = 400 * 1024;
+ options.max_subcompactions = max_subcompactions_;
+ // First two levels have no compression, so that a trivial move between
+ // them will be allowed. Level 2 has Zlib compression so that a trivial
+ // move to level 3 will not be allowed
+ options.compression_per_level = {kNoCompression, kNoCompression,
+ kZlibCompression};
+ int matches = 0, didnt_match = 0, trivial_move = 0, non_trivial = 0;
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "Compaction::InputCompressionMatchesOutput:Matches",
+ [&](void* /*arg*/) { matches++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "Compaction::InputCompressionMatchesOutput:DidntMatch",
+ [&](void* /*arg*/) { didnt_match++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:NonTrivial",
+ [&](void* /*arg*/) { non_trivial++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:TrivialMove",
+ [&](void* /*arg*/) { trivial_move++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Reopen(options);
+
+ Random rnd(301);
+ int key_idx = 0;
+
+ // First three 110KB files are going to level 0
+ // After that, (100K, 200K)
+ for (int num = 0; num < 3; num++) {
+ GenerateNewFile(&rnd, &key_idx);
+ }
+
+ // Another 110KB triggers a compaction to 400K file to fill up level 0
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ(4, GetSstFileCount(dbname_));
+
+ // (1, 4)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,4", FilesPerLevel(0));
+
+ // (1, 4, 1)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,4,1", FilesPerLevel(0));
+
+ // (1, 4, 2)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,4,2", FilesPerLevel(0));
+
+ // (1, 4, 3)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,4,3", FilesPerLevel(0));
+
+ // (1, 4, 4)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,4,4", FilesPerLevel(0));
+
+ // (1, 4, 5)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,4,5", FilesPerLevel(0));
+
+ // (1, 4, 6)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,4,6", FilesPerLevel(0));
+
+ // (1, 4, 7)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,4,7", FilesPerLevel(0));
+
+ // (1, 4, 8)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,4,8", FilesPerLevel(0));
+
+ ASSERT_EQ(matches, 12);
+ // Currently, the test relies on the number of calls to
+ // InputCompressionMatchesOutput() per compaction.
+ const int kCallsToInputCompressionMatch = 2;
+ ASSERT_EQ(didnt_match, 8 * kCallsToInputCompressionMatch);
+ ASSERT_EQ(trivial_move, 12);
+ ASSERT_EQ(non_trivial, 8);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+ for (int i = 0; i < key_idx; i++) {
+ auto v = Get(Key(i));
+ ASSERT_NE(v, "NOT_FOUND");
+ ASSERT_TRUE(v.size() == 1 || v.size() == 990);
+ }
+
+ Reopen(options);
+
+ for (int i = 0; i < key_idx; i++) {
+ auto v = Get(Key(i));
+ ASSERT_NE(v, "NOT_FOUND");
+ ASSERT_TRUE(v.size() == 1 || v.size() == 990);
+ }
+
+ Destroy(options);
+}
+
+TEST_F(DBCompactionTest, SanitizeCompactionOptionsTest) {
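+ // A soft_pending_compaction_bytes_limit of 0, or one larger than the hard
+ // limit, is expected to be sanitized to hard_pending_compaction_bytes_limit.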
+ Options options = CurrentOptions();
+ options.max_background_compactions = 5;
+ options.soft_pending_compaction_bytes_limit = 0;
+ options.hard_pending_compaction_bytes_limit = 100;
+ options.create_if_missing = true;
+ DestroyAndReopen(options);
+ ASSERT_EQ(100, db_->GetOptions().soft_pending_compaction_bytes_limit);
+
+ options.max_background_compactions = 3;
+ options.soft_pending_compaction_bytes_limit = 200;
+ options.hard_pending_compaction_bytes_limit = 150;
+ DestroyAndReopen(options);
+ ASSERT_EQ(150, db_->GetOptions().soft_pending_compaction_bytes_limit);
+}
+
+// This tests for a bug that could cause two level0 compactions running
+// concurrently
+// TODO(aekmekji): Make sure that the reason this fails when run with
+// max_subcompactions > 1 is not a correctness issue but just inherent to
+// running parallel L0-L1 compactions
+TEST_F(DBCompactionTest, SuggestCompactRangeNoTwoLevel0Compactions) {
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleLevel;
+ options.write_buffer_size = 110 << 10;
+ options.arena_block_size = 4 << 10;
+ options.level0_file_num_compaction_trigger = 4;
+ options.num_levels = 4;
+ options.compression = kNoCompression;
+ options.max_bytes_for_level_base = 450 << 10;
+ options.target_file_size_base = 98 << 10;
+ options.max_write_buffer_number = 2;
+ options.max_background_compactions = 2;
+
+ DestroyAndReopen(options);
+
+ // fill up the DB
+ Random rnd(301);
+ for (int num = 0; num < 10; num++) {
+ GenerateNewRandomFile(&rnd);
+ }
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
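+ // Hold the first compaction job between Run():Start and Run():End while the
+ // test piles up more L0 files; no second L0 compaction should start while
+ // the first one is still running.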
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"CompactionJob::Run():Start",
+ "DBCompactionTest::SuggestCompactRangeNoTwoLevel0Compactions:1"},
+ {"DBCompactionTest::SuggestCompactRangeNoTwoLevel0Compactions:2",
+ "CompactionJob::Run():End"}});
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // trigger L0 compaction
+ for (int num = 0; num < options.level0_file_num_compaction_trigger + 1;
+ num++) {
+ GenerateNewRandomFile(&rnd, /* nowait */ true);
+ ASSERT_OK(Flush());
+ }
+
+ TEST_SYNC_POINT(
+ "DBCompactionTest::SuggestCompactRangeNoTwoLevel0Compactions:1");
+
+ GenerateNewRandomFile(&rnd, /* nowait */ true);
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_OK(experimental::SuggestCompactRange(db_, nullptr, nullptr));
+ for (int num = 0; num < options.level0_file_num_compaction_trigger + 1;
+ num++) {
+ GenerateNewRandomFile(&rnd, /* nowait */ true);
+ ASSERT_OK(Flush());
+ }
+
+ TEST_SYNC_POINT(
+ "DBCompactionTest::SuggestCompactRangeNoTwoLevel0Compactions:2");
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+}
+
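+// ShortKey() produces keys of the form "keyNNNN" (7 characters), which fits
+// within the 8-byte limit asserted by ShortKeyComparator below.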
+static std::string ShortKey(int i) {
+ assert(i < 10000);
+ char buf[100];
+ snprintf(buf, sizeof(buf), "key%04d", i);
+ return std::string(buf);
+}
+
+TEST_P(DBCompactionTestWithParam, ForceBottommostLevelCompaction) {
+ int32_t trivial_move = 0;
+ int32_t non_trivial_move = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:TrivialMove",
+ [&](void* /*arg*/) { trivial_move++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:NonTrivial",
+ [&](void* /*arg*/) { non_trivial_move++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // The key size is guaranteed to be <= 8
+ class ShortKeyComparator : public Comparator {
+ int Compare(const ROCKSDB_NAMESPACE::Slice& a,
+ const ROCKSDB_NAMESPACE::Slice& b) const override {
+ assert(a.size() <= 8);
+ assert(b.size() <= 8);
+ return BytewiseComparator()->Compare(a, b);
+ }
+ const char* Name() const override { return "ShortKeyComparator"; }
+ void FindShortestSeparator(
+ std::string* start,
+ const ROCKSDB_NAMESPACE::Slice& limit) const override {
+ return BytewiseComparator()->FindShortestSeparator(start, limit);
+ }
+ void FindShortSuccessor(std::string* key) const override {
+ return BytewiseComparator()->FindShortSuccessor(key);
+ }
+ } short_key_cmp;
+ Options options = CurrentOptions();
+ options.target_file_size_base = 100000000;
+ options.write_buffer_size = 100000000;
+ options.max_subcompactions = max_subcompactions_;
+ options.comparator = &short_key_cmp;
+ DestroyAndReopen(options);
+
+ int32_t value_size = 10 * 1024; // 10 KB
+
+ Random rnd(301);
+ std::vector<std::string> values;
+ // File with keys [ 0 => 99 ]
+ for (int i = 0; i < 100; i++) {
+ values.push_back(rnd.RandomString(value_size));
+ ASSERT_OK(Put(ShortKey(i), values[i]));
+ }
+ ASSERT_OK(Flush());
+
+ ASSERT_EQ("1", FilesPerLevel(0));
+ // Compaction will do L0=>L1 (trivial move) then move L1 files to L3
+ CompactRangeOptions compact_options;
+ compact_options.change_level = true;
+ compact_options.target_level = 3;
+ ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
+ ASSERT_EQ("0,0,0,1", FilesPerLevel(0));
+ ASSERT_EQ(trivial_move, 1);
+ ASSERT_EQ(non_trivial_move, 0);
+
+ // File with keys [ 100 => 199 ]
+ for (int i = 100; i < 200; i++) {
+ values.push_back(rnd.RandomString(value_size));
+ ASSERT_OK(Put(ShortKey(i), values[i]));
+ }
+ ASSERT_OK(Flush());
+
+ ASSERT_EQ("1,0,0,1", FilesPerLevel(0));
+ // Compaction will do L0=>L1 L1=>L2 L2=>L3 (3 trivial moves)
+ // then compact the bottommost level L3=>L3 (non-trivial move)
+ compact_options = CompactRangeOptions();
+ compact_options.bottommost_level_compaction =
+ BottommostLevelCompaction::kForceOptimized;
+ ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
+ ASSERT_EQ("0,0,0,1", FilesPerLevel(0));
+ ASSERT_EQ(trivial_move, 4);
+ ASSERT_EQ(non_trivial_move, 1);
+
+ // File with keys [ 200 => 299 ]
+ for (int i = 200; i < 300; i++) {
+ values.push_back(rnd.RandomString(value_size));
+ ASSERT_OK(Put(ShortKey(i), values[i]));
+ }
+ ASSERT_OK(Flush());
+
+ ASSERT_EQ("1,0,0,1", FilesPerLevel(0));
+ trivial_move = 0;
+ non_trivial_move = 0;
+ compact_options = CompactRangeOptions();
+ compact_options.bottommost_level_compaction =
+ BottommostLevelCompaction::kSkip;
+ // Compaction will do L0=>L1 L1=>L2 L2=>L3 (3 trivial moves)
+ // and will skip bottommost level compaction
+ ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
+ ASSERT_EQ("0,0,0,2", FilesPerLevel(0));
+ ASSERT_EQ(trivial_move, 3);
+ ASSERT_EQ(non_trivial_move, 0);
+
+ for (int i = 0; i < 300; i++) {
+ ASSERT_EQ(Get(ShortKey(i)), values[i]);
+ }
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_P(DBCompactionTestWithParam, IntraL0Compaction) {
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ options.level0_file_num_compaction_trigger = 5;
+ options.max_background_compactions = 2;
+ options.max_subcompactions = max_subcompactions_;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ options.write_buffer_size = 2 << 20; // 2MB
+
+ BlockBasedTableOptions table_options;
+ table_options.block_cache = NewLRUCache(64 << 20); // 64MB
+ table_options.cache_index_and_filter_blocks = true;
+ table_options.pin_l0_filter_and_index_blocks_in_cache = true;
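+ // Index/filter blocks of L0 files are pinned so the test can later verify
+ // that the file produced by intra-L0 compaction does not get the same
+ // treatment.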
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ DestroyAndReopen(options);
+
+ const size_t kValueSize = 1 << 20;
+ Random rnd(301);
+ std::string value(rnd.RandomString(kValueSize));
+
+ // The L0->L1 must be picked before we begin flushing files to trigger
+ // intra-L0 compaction, and must not finish until after an intra-L0
+ // compaction has been picked.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"LevelCompactionPicker::PickCompaction:Return",
+ "DBCompactionTest::IntraL0Compaction:L0ToL1Ready"},
+ {"LevelCompactionPicker::PickCompactionBySize:0",
+ "CompactionJob::Run():Start"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // index: 0 1 2 3 4 5 6 7 8 9
+ // size: 1MB 1MB 1MB 1MB 1MB 2MB 1MB 1MB 1MB 1MB
+ // score: 1.5 1.3 1.5 2.0 inf
+ //
+ // Files 0-4 will be included in an L0->L1 compaction.
+ //
+ // L0->L0 will be triggered since the sync points guarantee compaction to base
+ // level is still blocked when files 5-9 trigger another compaction.
+ //
+ // Files 6-9 are the longest span of available files for which
+ // work-per-deleted-file decreases (see "score" row above).
+ for (int i = 0; i < 10; ++i) {
+ ASSERT_OK(Put(Key(0), "")); // prevents trivial move
+ if (i == 5) {
+ TEST_SYNC_POINT("DBCompactionTest::IntraL0Compaction:L0ToL1Ready");
+ ASSERT_OK(Put(Key(i + 1), value + value));
+ } else {
+ ASSERT_OK(Put(Key(i + 1), value));
+ }
+ ASSERT_OK(Flush());
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+ std::vector<std::vector<FileMetaData>> level_to_files;
+ dbfull()->TEST_GetFilesMetaData(dbfull()->DefaultColumnFamily(),
+ &level_to_files);
+ ASSERT_GE(level_to_files.size(), 2); // at least L0 and L1
+ // L0 has the 2MB file (not compacted) and 4MB file (output of L0->L0)
+ ASSERT_EQ(2, level_to_files[0].size());
+ ASSERT_GT(level_to_files[1].size(), 0);
+ for (int i = 0; i < 2; ++i) {
+ ASSERT_GE(level_to_files[0][i].fd.file_size, 1 << 21);
+ }
+
+ // The index/filter in the file produced by intra-L0 should not be pinned.
+ // That means clearing unref'd entries in block cache and re-accessing the
+ // file produced by intra-L0 should bump the index block miss count.
+ uint64_t prev_index_misses =
+ TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS);
+ table_options.block_cache->EraseUnRefEntries();
+ ASSERT_EQ("", Get(Key(0)));
+ ASSERT_EQ(prev_index_misses + 1,
+ TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+}
+
+TEST_P(DBCompactionTestWithParam, IntraL0CompactionDoesNotObsoleteDeletions) {
+ // regression test for issue #2722: L0->L0 compaction can resurrect deleted
+ // keys from older L0 files if L1+ files' key-ranges do not include the key.
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ options.level0_file_num_compaction_trigger = 5;
+ options.max_background_compactions = 2;
+ options.max_subcompactions = max_subcompactions_;
+ DestroyAndReopen(options);
+
+ const size_t kValueSize = 1 << 20;
+ Random rnd(301);
+ std::string value(rnd.RandomString(kValueSize));
+
+ // The L0->L1 must be picked before we begin flushing files to trigger
+ // intra-L0 compaction, and must not finish until after an intra-L0
+ // compaction has been picked.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"LevelCompactionPicker::PickCompaction:Return",
+ "DBCompactionTest::IntraL0CompactionDoesNotObsoleteDeletions:"
+ "L0ToL1Ready"},
+ {"LevelCompactionPicker::PickCompactionBySize:0",
+ "CompactionJob::Run():Start"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // index: 0 1 2 3 4 5 6 7 8 9
+ // size: 1MB 1MB 1MB 1MB 1MB 1MB 1MB 1MB 1MB 1MB
+ // score: 1.25 1.33 1.5 2.0 inf
+ //
+ // Files 0-4 will be included in an L0->L1 compaction.
+ //
+ // L0->L0 will be triggered since the sync points guarantee compaction to base
+ // level is still blocked when files 5-9 trigger another compaction. All files
+ // 5-9 are included in the L0->L0 due to work-per-deleted-file decreasing.
+ //
+ // Put a key-value in files 0-4. Delete that key in files 5-9. Verify the
+ // L0->L0 preserves the deletion such that the key remains deleted.
+ for (int i = 0; i < 10; ++i) {
+ // key 0 serves both to prevent trivial move and as the key we want to
+ // verify is not resurrected by L0->L0 compaction.
+ if (i < 5) {
+ ASSERT_OK(Put(Key(0), ""));
+ } else {
+ ASSERT_OK(Delete(Key(0)));
+ }
+ if (i == 5) {
+ TEST_SYNC_POINT(
+ "DBCompactionTest::IntraL0CompactionDoesNotObsoleteDeletions:"
+ "L0ToL1Ready");
+ }
+ ASSERT_OK(Put(Key(i + 1), value));
+ ASSERT_OK(Flush());
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+ std::vector<std::vector<FileMetaData>> level_to_files;
+ dbfull()->TEST_GetFilesMetaData(dbfull()->DefaultColumnFamily(),
+ &level_to_files);
+ ASSERT_GE(level_to_files.size(), 2); // at least L0 and L1
+ // L0 has a single output file from L0->L0
+ ASSERT_EQ(1, level_to_files[0].size());
+ ASSERT_GT(level_to_files[1].size(), 0);
+ ASSERT_GE(level_to_files[0][0].fd.file_size, 1 << 22);
+
+ ReadOptions roptions;
+ std::string result;
+ ASSERT_TRUE(db_->Get(roptions, Key(0), &result).IsNotFound());
+}
+
+TEST_P(DBCompactionTestWithParam, FullCompactionInBottomPriThreadPool) {
+ const int kNumFilesTrigger = 3;
+ Env::Default()->SetBackgroundThreads(1, Env::Priority::BOTTOM);
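+ // With a thread available in the BOTTOM pool, the full compaction (whose
+ // output is the bottommost level) is expected to run there via
+ // DBImpl::BGWorkBottomCompaction.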
+ for (bool use_universal_compaction : {false, true}) {
+ Options options = CurrentOptions();
+ if (use_universal_compaction) {
+ options.compaction_style = kCompactionStyleUniversal;
+ } else {
+ options.compaction_style = kCompactionStyleLevel;
+ options.level_compaction_dynamic_level_bytes = true;
+ }
+ options.num_levels = 4;
+ options.write_buffer_size = 100 << 10; // 100KB
+ options.target_file_size_base = 32 << 10; // 32KB
+ options.level0_file_num_compaction_trigger = kNumFilesTrigger;
+ // Trigger compaction if size amplification exceeds 110%
+ options.compaction_options_universal.max_size_amplification_percent = 110;
+ DestroyAndReopen(options);
+
+ int num_bottom_pri_compactions = 0;
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BGWorkBottomCompaction",
+ [&](void* /*arg*/) { ++num_bottom_pri_compactions; });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ Random rnd(301);
+ for (int num = 0; num < kNumFilesTrigger; num++) {
+ ASSERT_EQ(NumSortedRuns(), num);
+ int key_idx = 0;
+ GenerateNewFile(&rnd, &key_idx);
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ ASSERT_EQ(1, num_bottom_pri_compactions);
+
+ // Verify that size amplification did occur
+ ASSERT_EQ(NumSortedRuns(), 1);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ }
+ Env::Default()->SetBackgroundThreads(0, Env::Priority::BOTTOM);
+}
+
+TEST_F(DBCompactionTest, OptimizedDeletionObsoleting) {
+ // Deletions can be dropped when compacted to non-last level if they fall
+ // outside the lower-level files' key-ranges.
+ const int kNumL0Files = 4;
+ Options options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = kNumL0Files;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ DestroyAndReopen(options);
+
+ // Put keys 1 and 3 in separate L1 and L2 files, so keys 0, 2, and 4+ fall
+ // outside these levels' key-ranges.
+ for (int level = 2; level >= 1; --level) {
+ for (int i = 0; i < 2; ++i) {
+ ASSERT_OK(Put(Key(2 * i + 1), "val"));
+ ASSERT_OK(Flush());
+ }
+ MoveFilesToLevel(level);
+ ASSERT_EQ(2, NumTableFilesAtLevel(level));
+ }
+
+ // Delete keys in range [1, 4]. These L0 files will be compacted with L1:
+ // - Tombstones for keys 2 and 4 can be dropped early.
+ // - Tombstones for keys 1 and 3 must be kept due to L2 files' key-ranges.
+ for (int i = 0; i < kNumL0Files; ++i) {
+ ASSERT_OK(Put(Key(0), "val")); // sentinel to prevent trivial move
+ ASSERT_OK(Delete(Key(i + 1)));
+ ASSERT_OK(Flush());
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ for (int i = 0; i < kNumL0Files; ++i) {
+ std::string value;
+ ASSERT_TRUE(db_->Get(ReadOptions(), Key(i + 1), &value).IsNotFound());
+ }
+ ASSERT_EQ(2, options.statistics->getTickerCount(
+ COMPACTION_OPTIMIZED_DEL_DROP_OBSOLETE));
+ ASSERT_EQ(2,
+ options.statistics->getTickerCount(COMPACTION_KEY_DROP_OBSOLETE));
+}
+
+TEST_F(DBCompactionTest, CompactFilesPendingL0Bug) {
+ // https://www.facebook.com/groups/rocksdb.dev/permalink/1389452781153232/
+ // CompactFiles() had a bug where it failed to pick a compaction when an L0
+ // compaction existed, but marked it as scheduled anyway. It'd never be
+ // unmarked as scheduled, so future compactions or DB close could hang.
+ const int kNumL0Files = 5;
+ Options options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = kNumL0Files - 1;
+ options.max_background_compactions = 2;
+ DestroyAndReopen(options);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"LevelCompactionPicker::PickCompaction:Return",
+ "DBCompactionTest::CompactFilesPendingL0Bug:Picked"},
+ {"DBCompactionTest::CompactFilesPendingL0Bug:ManualCompacted",
+ "DBImpl::BackgroundCompaction:NonTrivial:AfterRun"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
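+ // Holding a compaction pressure token lets the DB schedule more than one
+ // background compaction at a time.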
+ auto schedule_multi_compaction_token =
+ dbfull()->TEST_write_controler().GetCompactionPressureToken();
+
+ // Files 0-3 will be included in an L0->L1 compaction.
+ //
+ // File 4 will be included in a call to CompactFiles() while the first
+ // compaction is running.
+ for (int i = 0; i < kNumL0Files - 1; ++i) {
+ ASSERT_OK(Put(Key(0), "val")); // sentinel to prevent trivial move
+ ASSERT_OK(Put(Key(i + 1), "val"));
+ ASSERT_OK(Flush());
+ }
+ TEST_SYNC_POINT("DBCompactionTest::CompactFilesPendingL0Bug:Picked");
+ // file 4 flushed after 0-3 picked
+ ASSERT_OK(Put(Key(kNumL0Files), "val"));
+ ASSERT_OK(Flush());
+
+ // Previously, DB close would hang forever because this situation caused the
+ // scheduled compaction count to never decrement to zero.
+ ColumnFamilyMetaData cf_meta;
+ dbfull()->GetColumnFamilyMetaData(dbfull()->DefaultColumnFamily(), &cf_meta);
+ ASSERT_EQ(kNumL0Files, cf_meta.levels[0].files.size());
+ std::vector<std::string> input_filenames;
+ input_filenames.push_back(cf_meta.levels[0].files.front().name);
+ ASSERT_OK(dbfull()->CompactFiles(CompactionOptions(), input_filenames,
+ 0 /* output_level */));
+ TEST_SYNC_POINT("DBCompactionTest::CompactFilesPendingL0Bug:ManualCompacted");
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBCompactionTest, CompactFilesOverlapInL0Bug) {
+ // Regression test for bug of not pulling in L0 files that overlap the user-
+ // specified input files in time- and key-ranges.
+ ASSERT_OK(Put(Key(0), "old_val"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put(Key(0), "new_val"));
+ ASSERT_OK(Flush());
+
+ ColumnFamilyMetaData cf_meta;
+ dbfull()->GetColumnFamilyMetaData(dbfull()->DefaultColumnFamily(), &cf_meta);
+ ASSERT_GE(cf_meta.levels.size(), 2);
+ ASSERT_EQ(2, cf_meta.levels[0].files.size());
+
+ // Compacting {new L0 file, L1 file} should pull in the old L0 file since it
+ // overlaps in key-range and time-range.
+ std::vector<std::string> input_filenames;
+ input_filenames.push_back(cf_meta.levels[0].files.front().name);
+ ASSERT_OK(dbfull()->CompactFiles(CompactionOptions(), input_filenames,
+ 1 /* output_level */));
+ ASSERT_EQ("new_val", Get(Key(0)));
+}
+
+TEST_F(DBCompactionTest, DeleteFilesInRangeConflictWithCompaction) {
+ Options options = CurrentOptions();
+ DestroyAndReopen(options);
+ const Snapshot* snapshot = nullptr;
+ const int kMaxKey = 10;
+
+ for (int i = 0; i < kMaxKey; i++) {
+ ASSERT_OK(Put(Key(i), Key(i)));
+ ASSERT_OK(Delete(Key(i)));
+ if (!snapshot) {
+ snapshot = db_->GetSnapshot();
+ }
+ }
+ ASSERT_OK(Flush());
+ MoveFilesToLevel(1);
+ ASSERT_OK(Put(Key(kMaxKey), Key(kMaxKey)));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ // Test that DeleteFilesInRange() deletes the files already picked for
+ // compaction
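+ // The compaction is held until DeleteFilesInRange() starts writing its
+ // manifest update, and that manifest write cannot complete until the
+ // compaction has finished, forcing the two operations to overlap.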
+ SyncPoint::GetInstance()->LoadDependency(
+ {{"VersionSet::LogAndApply:WriteManifestStart",
+ "BackgroundCallCompaction:0"},
+ {"DBImpl::BackgroundCompaction:Finish",
+ "VersionSet::LogAndApply:WriteManifestDone"}});
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ // Release the snapshot, which marks the bottommost file for compaction
+ db_->ReleaseSnapshot(snapshot);
+ std::string begin_string = Key(0);
+ std::string end_string = Key(kMaxKey + 1);
+ Slice begin(begin_string);
+ Slice end(end_string);
+ ASSERT_OK(DeleteFilesInRange(db_, db_->DefaultColumnFamily(), &begin, &end));
+ SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBCompactionTest, CompactBottomLevelFilesWithDeletions) {
+ // bottom-level files may contain deletions due to snapshots protecting the
+ // deleted keys. Once the snapshot is released, we should see files with many
+ // such deletions undergo single-file compactions.
+ const int kNumKeysPerFile = 1024;
+ const int kNumLevelFiles = 4;
+ const int kValueSize = 128;
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ options.level0_file_num_compaction_trigger = kNumLevelFiles;
+ // inflate it a bit to account for key/metadata overhead
+ options.target_file_size_base = 120 * kNumKeysPerFile * kValueSize / 100;
+ CreateAndReopenWithCF({"one"}, options);
+
+ Random rnd(301);
+ const Snapshot* snapshot = nullptr;
+ for (int i = 0; i < kNumLevelFiles; ++i) {
+ for (int j = 0; j < kNumKeysPerFile; ++j) {
+ ASSERT_OK(
+ Put(Key(i * kNumKeysPerFile + j), rnd.RandomString(kValueSize)));
+ }
+ if (i == kNumLevelFiles - 1) {
+ snapshot = db_->GetSnapshot();
+ // delete every other key after grabbing a snapshot, so these deletions
+ // and the keys they cover can't be dropped until after the snapshot is
+ // released.
+ for (int j = 0; j < kNumLevelFiles * kNumKeysPerFile; j += 2) {
+ ASSERT_OK(Delete(Key(j)));
+ }
+ }
+ ASSERT_OK(Flush());
+ if (i < kNumLevelFiles - 1) {
+ ASSERT_EQ(i + 1, NumTableFilesAtLevel(0));
+ }
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(kNumLevelFiles, NumTableFilesAtLevel(1));
+
+ std::vector<LiveFileMetaData> pre_release_metadata, post_release_metadata;
+ db_->GetLiveFilesMetaData(&pre_release_metadata);
+ // just need to bump seqnum so ReleaseSnapshot knows the newest key in the SST
+ // files does not need to be preserved in case of a future snapshot.
+ ASSERT_OK(Put(Key(0), "val"));
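+ // While the snapshot blocks the bottommost files from being marked for
+ // compaction, the mark threshold holds a real sequence number; after
+ // ReleaseSnapshot it goes back to kMaxSequenceNumber.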
+ ASSERT_NE(kMaxSequenceNumber, dbfull()->bottommost_files_mark_threshold_);
+ // release snapshot and wait for compactions to finish. Single-file
+ // compactions should be triggered, which reduce the size of each bottom-level
+ // file without changing file count.
+ db_->ReleaseSnapshot(snapshot);
+ ASSERT_EQ(kMaxSequenceNumber, dbfull()->bottommost_files_mark_threshold_);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
+ Compaction* compaction = reinterpret_cast<Compaction*>(arg);
+ ASSERT_TRUE(compaction->compaction_reason() ==
+ CompactionReason::kBottommostFiles);
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ db_->GetLiveFilesMetaData(&post_release_metadata);
+ ASSERT_EQ(pre_release_metadata.size(), post_release_metadata.size());
+
+ for (size_t i = 0; i < pre_release_metadata.size(); ++i) {
+ const auto& pre_file = pre_release_metadata[i];
+ const auto& post_file = post_release_metadata[i];
+ ASSERT_EQ(1, pre_file.level);
+ ASSERT_EQ(1, post_file.level);
+ // each file is smaller than it was before as it was rewritten without
+ // deletion markers/deleted keys.
+ ASSERT_LT(post_file.size, pre_file.size);
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBCompactionTest, NoCompactBottomLevelFilesWithDeletions) {
+ // bottom-level files may contain deletions due to snapshots protecting the
+ // deleted keys. Once the snapshot is released, we should see files with many
+ // such deletions undergo single-file compactions. But when auto compactions
+ // are disabled, such compactions shouldn't be triggered, as they may cause
+ // too many background jobs.
+ const int kNumKeysPerFile = 1024;
+ const int kNumLevelFiles = 4;
+ const int kValueSize = 128;
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ options.disable_auto_compactions = true;
+ options.level0_file_num_compaction_trigger = kNumLevelFiles;
+ // inflate it a bit to account for key/metadata overhead
+ options.target_file_size_base = 120 * kNumKeysPerFile * kValueSize / 100;
+ Reopen(options);
+
+ Random rnd(301);
+ const Snapshot* snapshot = nullptr;
+ for (int i = 0; i < kNumLevelFiles; ++i) {
+ for (int j = 0; j < kNumKeysPerFile; ++j) {
+ ASSERT_OK(
+ Put(Key(i * kNumKeysPerFile + j), rnd.RandomString(kValueSize)));
+ }
+ if (i == kNumLevelFiles - 1) {
+ snapshot = db_->GetSnapshot();
+ // delete every other key after grabbing a snapshot, so these deletions
+ // and the keys they cover can't be dropped until after the snapshot is
+ // released.
+ for (int j = 0; j < kNumLevelFiles * kNumKeysPerFile; j += 2) {
+ ASSERT_OK(Delete(Key(j)));
+ }
+ }
+ ASSERT_OK(Flush());
+ if (i < kNumLevelFiles - 1) {
+ ASSERT_EQ(i + 1, NumTableFilesAtLevel(0));
+ }
+ }
+ ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr));
+ ASSERT_EQ(kNumLevelFiles, NumTableFilesAtLevel(1));
+
+ std::vector<LiveFileMetaData> pre_release_metadata, post_release_metadata;
+ db_->GetLiveFilesMetaData(&pre_release_metadata);
+ // just need to bump seqnum so ReleaseSnapshot knows the newest key in the SST
+ // files does not need to be preserved in case of a future snapshot.
+ ASSERT_OK(Put(Key(0), "val"));
+
+ // release snapshot and no compaction should be triggered.
+ std::atomic<int> num_compactions{0};
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:Start",
+ [&](void* /*arg*/) { num_compactions.fetch_add(1); });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ db_->ReleaseSnapshot(snapshot);
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(0, num_compactions);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+ db_->GetLiveFilesMetaData(&post_release_metadata);
+ ASSERT_EQ(pre_release_metadata.size(), post_release_metadata.size());
+ for (size_t i = 0; i < pre_release_metadata.size(); ++i) {
+ const auto& pre_file = pre_release_metadata[i];
+ const auto& post_file = post_release_metadata[i];
+ ASSERT_EQ(1, pre_file.level);
+ ASSERT_EQ(1, post_file.level);
+ // each file is the same size as before, still containing deletion
+ // markers/deleted keys.
+ ASSERT_EQ(post_file.size, pre_file.size);
+ }
+}
+
+TEST_F(DBCompactionTest, RoundRobinTtlCompactionNormal) {
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ options.level0_file_num_compaction_trigger = 20;
+ options.ttl = 24 * 60 * 60; // 24 hours
+ options.compaction_pri = kRoundRobin;
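+ // With kRoundRobin compaction priority, TTL compactions are picked starting
+ // from each level's round-robin cursor and are reported as
+ // CompactionReason::kRoundRobinTtl.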
+ env_->now_cpu_count_.store(0);
+ env_->SetMockSleep();
+ options.env = env_;
+
+ // Add a small number of extra seconds to each wait time to make sure the
+ // file is expired
+ int small_seconds = 1;
+
+ std::atomic_int ttl_compactions{0};
+ std::atomic_int round_robin_ttl_compactions{0};
+ std::atomic_int other_compactions{0};
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
+ Compaction* compaction = reinterpret_cast<Compaction*>(arg);
+ auto compaction_reason = compaction->compaction_reason();
+ if (compaction_reason == CompactionReason::kTtl) {
+ ttl_compactions++;
+ } else if (compaction_reason == CompactionReason::kRoundRobinTtl) {
+ round_robin_ttl_compactions++;
+ } else {
+ other_compactions++;
+ }
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ DestroyAndReopen(options);
+
+ // Set up the files from the lower levels to the upper levels; each file is
+ // 1 hour older than the next one.
+ // create 10 files on the last level (L6)
+ for (int i = 0; i < 10; i++) {
+ for (int j = 0; j < 100; j++) {
+ ASSERT_OK(Put(Key(i * 100 + j), "value" + std::to_string(i * 100 + j)));
+ }
+ ASSERT_OK(Flush());
+ env_->MockSleepForSeconds(60 * 60); // generate 1 file per hour
+ }
+ MoveFilesToLevel(6);
+
+ // create 5 files on L5
+ for (int i = 0; i < 5; i++) {
+ for (int j = 0; j < 200; j++) {
+ ASSERT_OK(Put(Key(i * 200 + j), "value" + std::to_string(i * 200 + j)));
+ }
+ ASSERT_OK(Flush());
+ env_->MockSleepForSeconds(60 * 60);
+ }
+ MoveFilesToLevel(5);
+
+ // create 3 files on L4
+ for (int i = 0; i < 3; i++) {
+ for (int j = 0; j < 300; j++) {
+ ASSERT_OK(Put(Key(i * 300 + j), "value" + std::to_string(i * 300 + j)));
+ }
+ ASSERT_OK(Flush());
+ env_->MockSleepForSeconds(60 * 60);
+ }
+ MoveFilesToLevel(4);
+
+ // The LSM tree should be like:
+ // L4: [0, 299], [300, 599], [600, 899]
+ // L5: [0, 199] [200, 399]...............[800, 999]
+ // L6: [0,99][100,199][200,299][300,399]...............[800,899][900,999]
+ ASSERT_EQ("0,0,0,0,3,5,10", FilesPerLevel());
+
+ // make sure the first L5 file is expired
+ env_->MockSleepForSeconds(16 * 60 * 60 + small_seconds++);
+
+ // trigger TTL compaction
+ ASSERT_OK(Put(Key(4), "value" + std::to_string(1)));
+ ASSERT_OK(Put(Key(5), "value" + std::to_string(1)));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ // verify there's a RoundRobin TTL compaction
+ ASSERT_EQ(1, round_robin_ttl_compactions);
+ round_robin_ttl_compactions = 0;
+
+ // expire 2 more files
+ env_->MockSleepForSeconds(2 * 60 * 60 + small_seconds++);
+ // trigger TTL compaction
+ ASSERT_OK(Put(Key(4), "value" + std::to_string(2)));
+ ASSERT_OK(Put(Key(5), "value" + std::to_string(2)));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ ASSERT_EQ(2, round_robin_ttl_compactions);
+ round_robin_ttl_compactions = 0;
+
+ // expire 4 more files, 2 out of 3 files on L4 are expired
+ env_->MockSleepForSeconds(4 * 60 * 60 + small_seconds++);
+ // trigger TTL compaction
+ ASSERT_OK(Put(Key(6), "value" + std::to_string(3)));
+ ASSERT_OK(Put(Key(7), "value" + std::to_string(3)));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ ASSERT_EQ(1, NumTableFilesAtLevel(4));
+ ASSERT_EQ(0, NumTableFilesAtLevel(5));
+
+ ASSERT_GT(round_robin_ttl_compactions, 0);
+ round_robin_ttl_compactions = 0;
+
+ // Make the first L0 file expired, which triggers a normal TTL compaction
+ // instead of a round-robin TTL compaction; it will also include an extra
+ // file from L0 because of overlap
+ ASSERT_EQ(0, ttl_compactions);
+ env_->MockSleepForSeconds(19 * 60 * 60 + small_seconds++);
+
+ // trigger TTL compaction
+ ASSERT_OK(Put(Key(6), "value" + std::to_string(4)));
+ ASSERT_OK(Put(Key(7), "value" + std::to_string(4)));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ // The L0 -> L1 compaction is a normal TTL compaction; the L1 -> next-level
+ // compactions are round-robin TTL compactions.
+ ASSERT_GT(ttl_compactions, 0);
+ ttl_compactions = 0;
+ ASSERT_GT(round_robin_ttl_compactions, 0);
+ round_robin_ttl_compactions = 0;
+
+ // All files are expired, so only the last level has data
+ env_->MockSleepForSeconds(24 * 60 * 60);
+ // trigger TTL compaction
+ ASSERT_OK(Put(Key(6), "value" + std::to_string(4)));
+ ASSERT_OK(Put(Key(7), "value" + std::to_string(4)));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel());
+
+ ASSERT_GT(ttl_compactions, 0);
+ ttl_compactions = 0;
+ ASSERT_GT(round_robin_ttl_compactions, 0);
+ round_robin_ttl_compactions = 0;
+
+ ASSERT_EQ(0, other_compactions);
+}
+
+TEST_F(DBCompactionTest, RoundRobinTtlCompactionUnsortedTime) {
+ // This tests the case where the round-robin compaction cursor is not
+ // pointing to the oldest file; round-robin compaction should still compact
+ // the files after the cursor until all expired files are compacted.
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ options.level0_file_num_compaction_trigger = 20;
+ options.ttl = 24 * 60 * 60; // 24 hours
+ options.compaction_pri = kRoundRobin;
+ env_->now_cpu_count_.store(0);
+ env_->SetMockSleep();
+ options.env = env_;
+
+ std::atomic_int ttl_compactions{0};
+ std::atomic_int round_robin_ttl_compactions{0};
+ std::atomic_int other_compactions{0};
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
+ Compaction* compaction = reinterpret_cast<Compaction*>(arg);
+ auto compaction_reason = compaction->compaction_reason();
+ if (compaction_reason == CompactionReason::kTtl) {
+ ttl_compactions++;
+ } else if (compaction_reason == CompactionReason::kRoundRobinTtl) {
+ round_robin_ttl_compactions++;
+ } else {
+ other_compactions++;
+ }
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ DestroyAndReopen(options);
+
+ // create 10 files on the last level (L6)
+ for (int i = 0; i < 10; i++) {
+ for (int j = 0; j < 100; j++) {
+ ASSERT_OK(Put(Key(i * 100 + j), "value" + std::to_string(i * 100 + j)));
+ }
+ ASSERT_OK(Flush());
+ env_->MockSleepForSeconds(60 * 60); // generate 1 file per hour
+ }
+ MoveFilesToLevel(6);
+
+ // create 5 files on L5
+ for (int i = 0; i < 5; i++) {
+ for (int j = 0; j < 200; j++) {
+ ASSERT_OK(Put(Key(i * 200 + j), "value" + std::to_string(i * 200 + j)));
+ }
+ ASSERT_OK(Flush());
+ env_->MockSleepForSeconds(60 * 60); // 1 hour
+ }
+ MoveFilesToLevel(5);
+
+ // The LSM tree should be like:
+ // L5: [0, 199] [200, 399] [400,599] [600,799] [800, 999]
+ // L6: [0,99][100,199][200,299][300,399]....................[800,899][900,999]
+ ASSERT_EQ("0,0,0,0,0,5,10", FilesPerLevel());
+
+ // point the compaction cursor to the 4th file on L5
+ VersionSet* const versions = dbfull()->GetVersionSet();
+ assert(versions);
+ ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault();
+ ASSERT_NE(cfd, nullptr);
+ Version* const current = cfd->current();
+ ASSERT_NE(current, nullptr);
+ VersionStorageInfo* storage_info = current->storage_info();
+ ASSERT_NE(storage_info, nullptr);
+ const InternalKey split_cursor = InternalKey(Key(600), 100000, kTypeValue);
+ storage_info->AddCursorForOneLevel(5, split_cursor);
+
+ // Make the first file on L5 expired; there should be 3 TTL compactions:
+ // the 4th file, then the 5th, then the 1st.
+ env_->MockSleepForSeconds(19 * 60 * 60 + 1);
+ // trigger TTL compaction
+ ASSERT_OK(Put(Key(6), "value" + std::to_string(4)));
+ ASSERT_OK(Put(Key(7), "value" + std::to_string(4)));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(2, NumTableFilesAtLevel(5));
+
+ ASSERT_EQ(3, round_robin_ttl_compactions);
+ ASSERT_EQ(0, ttl_compactions);
+ ASSERT_EQ(0, other_compactions);
+}
+
+TEST_F(DBCompactionTest, LevelCompactExpiredTtlFiles) {
+ const int kNumKeysPerFile = 32;
+ const int kNumLevelFiles = 2;
+ const int kValueSize = 1024;
+
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ options.ttl = 24 * 60 * 60; // 24 hours
+ options.max_open_files = -1;
+ env_->SetMockSleep();
+ options.env = env_;
+
+ // NOTE: Presumed unnecessary and removed: resetting mock time in env
+
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ for (int i = 0; i < kNumLevelFiles; ++i) {
+ for (int j = 0; j < kNumKeysPerFile; ++j) {
+ ASSERT_OK(
+ Put(Key(i * kNumKeysPerFile + j), rnd.RandomString(kValueSize)));
+ }
+ ASSERT_OK(Flush());
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ MoveFilesToLevel(3);
+ ASSERT_EQ("0,0,0,2", FilesPerLevel());
+
+ // Delete previously written keys.
+ for (int i = 0; i < kNumLevelFiles; ++i) {
+ for (int j = 0; j < kNumKeysPerFile; ++j) {
+ ASSERT_OK(Delete(Key(i * kNumKeysPerFile + j)));
+ }
+ ASSERT_OK(Flush());
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ("2,0,0,2", FilesPerLevel());
+ MoveFilesToLevel(1);
+ ASSERT_EQ("0,2,0,2", FilesPerLevel());
+
+ env_->MockSleepForSeconds(36 * 60 * 60); // 36 hours
+ ASSERT_EQ("0,2,0,2", FilesPerLevel());
+
+ // Just do a simple write + flush so that the TTL-expired files get
+ // compacted.
+ ASSERT_OK(Put("a", "1"));
+ ASSERT_OK(Flush());
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
+ Compaction* compaction = reinterpret_cast<Compaction*>(arg);
+ ASSERT_TRUE(compaction->compaction_reason() == CompactionReason::kTtl);
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ // All non-L0 files are deleted, as they contained only deleted data.
+ ASSERT_EQ("1", FilesPerLevel());
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+ // Test dynamically changing ttl.
+
+ // NOTE: Presumed unnecessary and removed: resetting mock time in env
+
+ DestroyAndReopen(options);
+
+ for (int i = 0; i < kNumLevelFiles; ++i) {
+ for (int j = 0; j < kNumKeysPerFile; ++j) {
+ ASSERT_OK(
+ Put(Key(i * kNumKeysPerFile + j), rnd.RandomString(kValueSize)));
+ }
+ ASSERT_OK(Flush());
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ MoveFilesToLevel(3);
+ ASSERT_EQ("0,0,0,2", FilesPerLevel());
+
+ // Delete previously written keys.
+ for (int i = 0; i < kNumLevelFiles; ++i) {
+ for (int j = 0; j < kNumKeysPerFile; ++j) {
+ ASSERT_OK(Delete(Key(i * kNumKeysPerFile + j)));
+ }
+ ASSERT_OK(Flush());
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ("2,0,0,2", FilesPerLevel());
+ MoveFilesToLevel(1);
+ ASSERT_EQ("0,2,0,2", FilesPerLevel());
+
+ // Move time forward by 12 hours, and make sure that compaction still doesn't
+ // trigger as ttl is set to 24 hours.
+ env_->MockSleepForSeconds(12 * 60 * 60);
+ ASSERT_OK(Put("a", "1"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ("1,2,0,2", FilesPerLevel());
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
+ Compaction* compaction = reinterpret_cast<Compaction*>(arg);
+ ASSERT_TRUE(compaction->compaction_reason() == CompactionReason::kTtl);
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // Dynamically change ttl to 10 hours.
+ // This should trigger a ttl compaction, as 12 hours have already passed.
+ ASSERT_OK(dbfull()->SetOptions({{"ttl", "36000"}}));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ // All non-L0 files are deleted, as they contained only deleted data.
+ ASSERT_EQ("1", FilesPerLevel());
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBCompactionTest, LevelTtlCascadingCompactions) {
+ env_->SetMockSleep();
+ const int kValueSize = 100;
+
+ for (bool if_restart : {false, true}) {
+ for (bool if_open_all_files : {false, true}) {
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ options.ttl = 24 * 60 * 60; // 24 hours
+ if (if_open_all_files) {
+ options.max_open_files = -1;
+ } else {
+ options.max_open_files = 20;
+ }
+ // RocksDB sanitizes max_open_files to at least 20. Modify it back.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) {
+ int* max_open_files = static_cast<int*>(arg);
+ *max_open_files = 2;
+ });
+ // In the case where all files are opened and the DB is restarted, force the
+ // oldest ancestor time in the manifest file to be 0 to simulate reading
+ // from an old version.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "VersionEdit::EncodeTo:VarintOldestAncesterTime", [&](void* arg) {
+ if (if_restart && if_open_all_files) {
+ std::string* encoded_fieled = static_cast<std::string*>(arg);
+ *encoded_fieled = "";
+ PutVarint64(encoded_fieled, 0);
+ }
+ });
+
+ options.env = env_;
+
+ // NOTE: Presumed unnecessary and removed: resetting mock time in env
+
+ DestroyAndReopen(options);
+
+ int ttl_compactions = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
+ Compaction* compaction = reinterpret_cast<Compaction*>(arg);
+ auto compaction_reason = compaction->compaction_reason();
+ if (compaction_reason == CompactionReason::kTtl) {
+ ttl_compactions++;
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // Add two L6 files with key ranges: [1 .. 100], [101 .. 200].
+ Random rnd(301);
+ for (int i = 1; i <= 100; ++i) {
+ ASSERT_OK(Put(Key(i), rnd.RandomString(kValueSize)));
+ }
+ ASSERT_OK(Flush());
+ // Get the first file's creation time. This will be the oldest file in the
+ // DB. Compactions involving this file's descendants should keep getting
+ // this time.
+ std::vector<std::vector<FileMetaData>> level_to_files;
+ dbfull()->TEST_GetFilesMetaData(dbfull()->DefaultColumnFamily(),
+ &level_to_files);
+ uint64_t oldest_time = level_to_files[0][0].oldest_ancester_time;
+ // Add 1 hour and do another flush.
+ env_->MockSleepForSeconds(1 * 60 * 60);
+ for (int i = 101; i <= 200; ++i) {
+ ASSERT_OK(Put(Key(i), rnd.RandomString(kValueSize)));
+ }
+ ASSERT_OK(Flush());
+ MoveFilesToLevel(6);
+ ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel());
+
+ env_->MockSleepForSeconds(1 * 60 * 60);
+ // Add two L4 files with key ranges: [1 .. 50], [51 .. 150].
+ for (int i = 1; i <= 50; ++i) {
+ ASSERT_OK(Put(Key(i), rnd.RandomString(kValueSize)));
+ }
+ ASSERT_OK(Flush());
+ env_->MockSleepForSeconds(1 * 60 * 60);
+ for (int i = 51; i <= 150; ++i) {
+ ASSERT_OK(Put(Key(i), rnd.RandomString(kValueSize)));
+ }
+ ASSERT_OK(Flush());
+ MoveFilesToLevel(4);
+ ASSERT_EQ("0,0,0,0,2,0,2", FilesPerLevel());
+
+ env_->MockSleepForSeconds(1 * 60 * 60);
+ // Add one L1 file with key range: [26, 75].
+ for (int i = 26; i <= 75; ++i) {
+ ASSERT_OK(Put(Key(i), rnd.RandomString(kValueSize)));
+ }
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ MoveFilesToLevel(1);
+ ASSERT_EQ("0,1,0,0,2,0,2", FilesPerLevel());
+
+ // LSM tree:
+ // L1: [26 .. 75]
+ // L4: [1 .. 50][51 ..... 150]
+ // L6: [1 ........ 100][101 .... 200]
+ //
+ // On TTL expiry, a TTL compaction should be initiated on the L1 file, and
+ // the compactions should keep going until the key range hits the bottom
+ // level. In other words, the compaction on this data range "cascades" until
+ // reaching the bottom level.
+ //
+ // Order of events on TTL expiry:
+ // 1. The L1 file falls to L3 via 2 trivial moves initiated by the TTL
+ //    compaction.
+ // 2. A TTL compaction happens between the L3 and L4 files. Output file in L4.
+ // 3. The new output file from L4 falls to L5 via 1 trivial move initiated
+ //    by the TTL compaction.
+ // 4. A TTL compaction happens between the L5 and L6 files. Output in L6.
+
+ // Add 25 hours and do a write
+ env_->MockSleepForSeconds(25 * 60 * 60);
+
+ ASSERT_OK(Put(Key(1), "1"));
+ if (if_restart) {
+ Reopen(options);
+ } else {
+ ASSERT_OK(Flush());
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ("1,0,0,0,0,0,1", FilesPerLevel());
+ ASSERT_EQ(5, ttl_compactions);
+
+ dbfull()->TEST_GetFilesMetaData(dbfull()->DefaultColumnFamily(),
+ &level_to_files);
+ ASSERT_EQ(oldest_time, level_to_files[6][0].oldest_ancester_time);
+
+ env_->MockSleepForSeconds(25 * 60 * 60);
+ ASSERT_OK(Put(Key(2), "1"));
+ if (if_restart) {
+ Reopen(options);
+ } else {
+ ASSERT_OK(Flush());
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ("1,0,0,0,0,0,1", FilesPerLevel());
+ ASSERT_GE(ttl_compactions, 6);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ }
+ }
+}
+
+TEST_F(DBCompactionTest, LevelPeriodicCompaction) {
+ env_->SetMockSleep();
+ const int kNumKeysPerFile = 32;
+ const int kNumLevelFiles = 2;
+ const int kValueSize = 100;
+
+ for (bool if_restart : {false, true}) {
+ for (bool if_open_all_files : {false, true}) {
+ Options options = CurrentOptions();
+ options.periodic_compaction_seconds = 48 * 60 * 60; // 2 days
+ if (if_open_all_files) {
+ options.max_open_files = -1; // needed for periodic compaction
+ } else {
+ options.max_open_files = 20;
+ }
+ // RocksDB sanitizes max_open_files to at least 20. Modify it back.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) {
+ int* max_open_files = static_cast<int*>(arg);
+ *max_open_files = 0;
+ });
+ // When all files are opened and the DB is restarted, force the file
+ // creation time in the manifest file to 0 to simulate reading from an old
+ // version.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "VersionEdit::EncodeTo:VarintFileCreationTime", [&](void* arg) {
+ if (if_restart && if_open_all_files) {
+ std::string* encoded_fieled = static_cast<std::string*>(arg);
+ *encoded_fieled = "";
+ PutVarint64(encoded_fieled, 0);
+ }
+ });
+
+ options.env = env_;
+
+ // NOTE: Presumed unnecessary and removed: resetting mock time in env
+
+ DestroyAndReopen(options);
+
+ int periodic_compactions = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
+ Compaction* compaction = reinterpret_cast<Compaction*>(arg);
+ auto compaction_reason = compaction->compaction_reason();
+ if (compaction_reason == CompactionReason::kPeriodicCompaction) {
+ periodic_compactions++;
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Random rnd(301);
+ for (int i = 0; i < kNumLevelFiles; ++i) {
+ for (int j = 0; j < kNumKeysPerFile; ++j) {
+ ASSERT_OK(
+ Put(Key(i * kNumKeysPerFile + j), rnd.RandomString(kValueSize)));
+ }
+ ASSERT_OK(Flush());
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ ASSERT_EQ("2", FilesPerLevel());
+ ASSERT_EQ(0, periodic_compactions);
+
+ // Add 50 hours and do a write
+ env_->MockSleepForSeconds(50 * 60 * 60);
+ ASSERT_OK(Put("a", "1"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ // Assert that the files stay in the same level
+ ASSERT_EQ("3", FilesPerLevel());
+ // The two old files go through the periodic compaction process
+ ASSERT_EQ(2, periodic_compactions);
+
+ MoveFilesToLevel(1);
+ ASSERT_EQ("0,3", FilesPerLevel());
+
+ // Add another 50 hours and do another write
+ env_->MockSleepForSeconds(50 * 60 * 60);
+ ASSERT_OK(Put("b", "2"));
+ if (if_restart) {
+ Reopen(options);
+ } else {
+ ASSERT_OK(Flush());
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ("1,3", FilesPerLevel());
+ // The three old files now go through the periodic compaction process:
+ // 2 + 3 = 5 in total.
+ ASSERT_EQ(5, periodic_compactions);
+
+ // Add another 50 hours and do another write
+ env_->MockSleepForSeconds(50 * 60 * 60);
+ ASSERT_OK(Put("c", "3"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ("2,3", FilesPerLevel());
+ // The four old files now go through the periodic compaction process:
+ // 5 + 4 = 9 in total.
+ ASSERT_EQ(9, periodic_compactions);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ }
+ }
+}
+
+TEST_F(DBCompactionTest, LevelPeriodicCompactionWithOldDB) {
+ // This test makes sure that periodic compactions work with a DB where the
+ // file_creation_time of some files is 0. After compaction, the new files
+ // are created with a valid file_creation_time.
+
+ const int kNumKeysPerFile = 32;
+ const int kNumFiles = 4;
+ const int kValueSize = 100;
+
+ Options options = CurrentOptions();
+ env_->SetMockSleep();
+ options.env = env_;
+
+ // NOTE: Presumed unnecessary and removed: resetting mock time in env
+
+ DestroyAndReopen(options);
+
+ int periodic_compactions = 0;
+ bool set_file_creation_time_to_zero = true;
+ bool set_creation_time_to_zero = true;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
+ Compaction* compaction = reinterpret_cast<Compaction*>(arg);
+ auto compaction_reason = compaction->compaction_reason();
+ if (compaction_reason == CompactionReason::kPeriodicCompaction) {
+ periodic_compactions++;
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "PropertyBlockBuilder::AddTableProperty:Start", [&](void* arg) {
+ TableProperties* props = reinterpret_cast<TableProperties*>(arg);
+ if (set_file_creation_time_to_zero) {
+ props->file_creation_time = 0;
+ }
+ if (set_creation_time_to_zero) {
+ props->creation_time = 0;
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Random rnd(301);
+ for (int i = 0; i < kNumFiles; ++i) {
+ for (int j = 0; j < kNumKeysPerFile; ++j) {
+ ASSERT_OK(
+ Put(Key(i * kNumKeysPerFile + j), rnd.RandomString(kValueSize)));
+ }
+ ASSERT_OK(Flush());
+ // Move the first two files to L2.
+ if (i == 1) {
+ MoveFilesToLevel(2);
+ set_creation_time_to_zero = false;
+ }
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ ASSERT_EQ("2,0,2", FilesPerLevel());
+ ASSERT_EQ(0, periodic_compactions);
+
+ Close();
+
+ set_file_creation_time_to_zero = false;
+ // Forward the clock by 2 days.
+ env_->MockSleepForSeconds(2 * 24 * 60 * 60);
+ options.periodic_compaction_seconds = 1 * 24 * 60 * 60; // 1 day
+
+ Reopen(options);
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ("2,0,2", FilesPerLevel());
+ // Make sure that all files go through periodic compaction.
+ ASSERT_EQ(kNumFiles, periodic_compactions);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBCompactionTest, LevelPeriodicAndTtlCompaction) {
+ const int kNumKeysPerFile = 32;
+ const int kNumLevelFiles = 2;
+ const int kValueSize = 100;
+
+ Options options = CurrentOptions();
+ options.ttl = 10 * 60 * 60; // 10 hours
+ options.periodic_compaction_seconds = 48 * 60 * 60; // 2 days
+ options.max_open_files = -1; // needed for both periodic and ttl compactions
+ env_->SetMockSleep();
+ options.env = env_;
+
+ // NOTE: Presumed unnecessary and removed: resetting mock time in env
+
+ DestroyAndReopen(options);
+
+ int periodic_compactions = 0;
+ int ttl_compactions = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
+ Compaction* compaction = reinterpret_cast<Compaction*>(arg);
+ auto compaction_reason = compaction->compaction_reason();
+ if (compaction_reason == CompactionReason::kPeriodicCompaction) {
+ periodic_compactions++;
+ } else if (compaction_reason == CompactionReason::kTtl) {
+ ttl_compactions++;
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Random rnd(301);
+ for (int i = 0; i < kNumLevelFiles; ++i) {
+ for (int j = 0; j < kNumKeysPerFile; ++j) {
+ ASSERT_OK(
+ Put(Key(i * kNumKeysPerFile + j), rnd.RandomString(kValueSize)));
+ }
+ ASSERT_OK(Flush());
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ MoveFilesToLevel(3);
+
+ ASSERT_EQ("0,0,0,2", FilesPerLevel());
+ ASSERT_EQ(0, periodic_compactions);
+ ASSERT_EQ(0, ttl_compactions);
+
+ // Add some time greater than periodic_compaction_seconds.
+ env_->MockSleepForSeconds(50 * 60 * 60);
+ ASSERT_OK(Put("a", "1"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ // Files in the bottom level go through periodic compactions.
+ ASSERT_EQ("1,0,0,2", FilesPerLevel());
+ ASSERT_EQ(2, periodic_compactions);
+ ASSERT_EQ(0, ttl_compactions);
+
+ // Add a little more time than ttl
+ env_->MockSleepForSeconds(11 * 60 * 60);
+ ASSERT_OK(Put("b", "1"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ // Notice that the previous file in level 1 falls down to the bottom level
+ // due to ttl compactions, one level at a time.
+ // And bottom level files don't get picked up for ttl compactions.
+ ASSERT_EQ("1,0,0,3", FilesPerLevel());
+ ASSERT_EQ(2, periodic_compactions);
+ ASSERT_EQ(3, ttl_compactions);
+
+ // Add some time greater than periodic_compaction_seconds.
+ env_->MockSleepForSeconds(50 * 60 * 60);
+ ASSERT_OK(Put("c", "1"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ // Previous L0 file falls one level at a time to bottom level due to ttl.
+ // And all 4 bottom files go through periodic compactions.
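+ // Running totals expected below: periodic 2 + 4 = 6, TTL 3 + 3 = 6.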
+ ASSERT_EQ("1,0,0,4", FilesPerLevel());
+ ASSERT_EQ(6, periodic_compactions);
+ ASSERT_EQ(6, ttl_compactions);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBCompactionTest, LevelTtlBooster) {
+ const int kNumKeysPerFile = 32;
+ const int kNumLevelFiles = 3;
+ const int kValueSize = 1000;
+
+ Options options = CurrentOptions();
+ options.ttl = 10 * 60 * 60; // 10 hours
+ options.periodic_compaction_seconds = 480 * 60 * 60; // very long
+ options.level0_file_num_compaction_trigger = 2;
+ options.max_bytes_for_level_base = 5 * uint64_t{kNumKeysPerFile * kValueSize};
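+ // = 5 * 32 * 1000 = 160000 bytes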
+ options.max_open_files = -1; // needed for both periodic and ttl compactions
+ options.compaction_pri = CompactionPri::kMinOverlappingRatio;
+ env_->SetMockSleep();
+ options.env = env_;
+
+ // NOTE: Presumed unnecessary and removed: resetting mock time in env
+
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ for (int i = 0; i < kNumLevelFiles; ++i) {
+ for (int j = 0; j < kNumKeysPerFile; ++j) {
+ ASSERT_OK(
+ Put(Key(i * kNumKeysPerFile + j), rnd.RandomString(kValueSize)));
+ }
+ ASSERT_OK(Flush());
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ MoveFilesToLevel(2);
+
+ ASSERT_EQ("0,0,3", FilesPerLevel());
+
+ // Create some files for L1
+ for (int i = 0; i < 2; i++) {
+ for (int j = 0; j < kNumKeysPerFile; ++j) {
+ ASSERT_OK(Put(Key(2 * j + i), rnd.RandomString(kValueSize)));
+ }
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+
+ ASSERT_EQ("0,1,3", FilesPerLevel());
+
+ // Make the new L0 files qualify for TTL boosting, and generate one more to
+ // trigger an L1 -> L2 compaction. The old files will be picked even though
+ // their priority would be lower without boosting.
+ env_->MockSleepForSeconds(8 * 60 * 60);
+ for (int i = 0; i < 2; i++) {
+ for (int j = 0; j < kNumKeysPerFile; ++j) {
+ ASSERT_OK(Put(Key(kNumKeysPerFile * 2 + 2 * j + i),
+ rnd.RandomString(kValueSize * 2)));
+ }
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+ // Force files to be compacted to L1
+ ASSERT_OK(
+ dbfull()->SetOptions({{"level0_file_num_compaction_trigger", "1"}}));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ("0,1,2", FilesPerLevel());
+ ASSERT_OK(
+ dbfull()->SetOptions({{"level0_file_num_compaction_trigger", "2"}}));
+
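+ // kNumKeysPerFile * 4 * kValueSize = 32 * 4 * 1000 = 128000 bytes, roughly
+ // the newer, larger-valued data expected to remain in L1 once the boosted
+ // old file has been compacted away.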
+ ASSERT_GT(SizeAtLevel(1), kNumKeysPerFile * 4 * kValueSize);
+}
+
+TEST_F(DBCompactionTest, LevelPeriodicCompactionWithCompactionFilters) {
+ class TestCompactionFilter : public CompactionFilter {
+ const char* Name() const override { return "TestCompactionFilter"; }
+ };
+ class TestCompactionFilterFactory : public CompactionFilterFactory {
+ const char* Name() const override { return "TestCompactionFilterFactory"; }
+ std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+ const CompactionFilter::Context& /*context*/) override {
+ return std::unique_ptr<CompactionFilter>(new TestCompactionFilter());
+ }
+ };
+
+ const int kNumKeysPerFile = 32;
+ const int kNumLevelFiles = 2;
+ const int kValueSize = 100;
+
+ Random rnd(301);
+
+ Options options = CurrentOptions();
+ TestCompactionFilter test_compaction_filter;
+ env_->SetMockSleep();
+ options.env = env_;
+
+ // NOTE: Presumed unnecessary and removed: resetting mock time in env
+
+ enum CompactionFilterType {
+ kUseCompactionFilter,
+ kUseCompactionFilterFactory
+ };
+
+ for (CompactionFilterType comp_filter_type :
+ {kUseCompactionFilter, kUseCompactionFilterFactory}) {
+ // Assert that periodic compactions are not enabled.
+ ASSERT_EQ(std::numeric_limits<uint64_t>::max() - 1,
+ options.periodic_compaction_seconds);
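+ // (std::numeric_limits<uint64_t>::max() - 1 is the default, i.e. periodic
+ // compaction is effectively disabled until sanitized below.)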
+
+ if (comp_filter_type == kUseCompactionFilter) {
+ options.compaction_filter = &test_compaction_filter;
+ options.compaction_filter_factory.reset();
+ } else if (comp_filter_type == kUseCompactionFilterFactory) {
+ options.compaction_filter = nullptr;
+ options.compaction_filter_factory.reset(
+ new TestCompactionFilterFactory());
+ }
+ DestroyAndReopen(options);
+
+ // periodic_compaction_seconds should be set to the sanitized value when
+ // a compaction filter or a compaction filter factory is used.
+ ASSERT_EQ(30 * 24 * 60 * 60,
+ dbfull()->GetOptions().periodic_compaction_seconds);
+
+ int periodic_compactions = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
+ Compaction* compaction = reinterpret_cast<Compaction*>(arg);
+ auto compaction_reason = compaction->compaction_reason();
+ if (compaction_reason == CompactionReason::kPeriodicCompaction) {
+ periodic_compactions++;
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ for (int i = 0; i < kNumLevelFiles; ++i) {
+ for (int j = 0; j < kNumKeysPerFile; ++j) {
+ ASSERT_OK(
+ Put(Key(i * kNumKeysPerFile + j), rnd.RandomString(kValueSize)));
+ }
+ ASSERT_OK(Flush());
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ ASSERT_EQ("2", FilesPerLevel());
+ ASSERT_EQ(0, periodic_compactions);
+
+ // Add 31 days and do a write
+ env_->MockSleepForSeconds(31 * 24 * 60 * 60);
+ ASSERT_OK(Put("a", "1"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ // Assert that the files stay in the same level
+ ASSERT_EQ("3", FilesPerLevel());
+ // The two old files go through the periodic compaction process
+ ASSERT_EQ(2, periodic_compactions);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ }
+}
+
+TEST_F(DBCompactionTest, CompactRangeDelayedByL0FileCount) {
+ // Verify that, when `CompactRangeOptions::allow_write_stall == false`, manual
+ // compaction only triggers a flush once it is sure a stall won't be triggered
+ // by the L0 file count going too high.
+ const int kNumL0FilesTrigger = 4;
+ const int kNumL0FilesLimit = 8;
+ // i == 0: verifies normal case where stall is avoided by delay
+ // i == 1: verifies no delay in edge case where stall trigger is same as
+ // compaction trigger, so stall can't be avoided
+ for (int i = 0; i < 2; ++i) {
+ Options options = CurrentOptions();
+ options.level0_slowdown_writes_trigger = kNumL0FilesLimit;
+ if (i == 0) {
+ options.level0_file_num_compaction_trigger = kNumL0FilesTrigger;
+ } else {
+ options.level0_file_num_compaction_trigger = kNumL0FilesLimit;
+ }
+ Reopen(options);
+
+ if (i == 0) {
+ // ensure the auto compaction doesn't finish until manual compaction has
+ // had a chance to be delayed.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::WaitUntilFlushWouldNotStallWrites:StallWait",
+ "CompactionJob::Run():End"}});
+ } else {
+ // ensure the auto-compaction doesn't finish until manual compaction has
+ // continued without delay.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::FlushMemTable:StallWaitDone",
+ "CompactionJob::Run():End"}});
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Random rnd(301);
+ for (int j = 0; j < kNumL0FilesLimit - 1; ++j) {
+ for (int k = 0; k < 2; ++k) {
+ ASSERT_OK(Put(Key(k), rnd.RandomString(1024)));
+ }
+ ASSERT_OK(Flush());
+ }
+ auto manual_compaction_thread = port::Thread([this]() {
+ CompactRangeOptions cro;
+ cro.allow_write_stall = false;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ });
+
+ manual_compaction_thread.join();
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+ ASSERT_GT(NumTableFilesAtLevel(1), 0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ }
+}
+
+TEST_F(DBCompactionTest, CompactRangeDelayedByImmMemTableCount) {
+ // Verify that, when `CompactRangeOptions::allow_write_stall == false`, manual
+ // compaction only triggers a flush once it is sure a stall won't be triggered
+ // by the immutable memtable count going too high.
+ const int kNumImmMemTableLimit = 8;
+ // i == 0: verifies normal case where stall is avoided by delay
+ // i == 1: verifies no delay in edge case where stall trigger is same as flush
+ // trigger, so stall can't be avoided
+ for (int i = 0; i < 2; ++i) {
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ // The delay limit is one less than the stop limit. This test focuses on
+ // avoiding the delay limit, but this option sets the stop limit, so add one.
+ options.max_write_buffer_number = kNumImmMemTableLimit + 1;
+ if (i == 1) {
+ options.min_write_buffer_number_to_merge = kNumImmMemTableLimit;
+ }
+ Reopen(options);
+
+ if (i == 0) {
+ // ensure the flush doesn't finish until manual compaction has had a
+ // chance to be delayed.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::WaitUntilFlushWouldNotStallWrites:StallWait",
+ "FlushJob::WriteLevel0Table"}});
+ } else {
+ // ensure the flush doesn't finish until manual compaction has continued
+ // without delay.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::FlushMemTable:StallWaitDone",
+ "FlushJob::WriteLevel0Table"}});
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Random rnd(301);
+ for (int j = 0; j < kNumImmMemTableLimit - 1; ++j) {
+ ASSERT_OK(Put(Key(0), rnd.RandomString(1024)));
+ FlushOptions flush_opts;
+ flush_opts.wait = false;
+ flush_opts.allow_write_stall = true;
+ ASSERT_OK(dbfull()->Flush(flush_opts));
+ }
+
+ auto manual_compaction_thread = port::Thread([this]() {
+ CompactRangeOptions cro;
+ cro.allow_write_stall = false;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ });
+
+ manual_compaction_thread.join();
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+ ASSERT_GT(NumTableFilesAtLevel(1), 0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ }
+}
+
+TEST_F(DBCompactionTest, CompactRangeShutdownWhileDelayed) {
+ // Verify that, when `CompactRangeOptions::allow_write_stall == false`, delay
+ // does not hang if CF is dropped or DB is closed
+ const int kNumL0FilesTrigger = 4;
+ const int kNumL0FilesLimit = 8;
+ Options options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = kNumL0FilesTrigger;
+ options.level0_slowdown_writes_trigger = kNumL0FilesLimit;
+ // i == 0: DB::DropColumnFamily() on CompactRange's target CF unblocks it
+ // i == 1: DB::CancelAllBackgroundWork() unblocks CompactRange. This is to
+ // simulate what happens during Close as we can't call Close (it
+ // blocks on the auto-compaction, making a cycle).
+ for (int i = 0; i < 2; ++i) {
+ CreateAndReopenWithCF({"one"}, options);
+ // The calls to close CF/DB wait until the manual compaction stalls.
+ // The auto-compaction waits until the manual compaction finishes to ensure
+ // the signal comes from closing CF/DB, not from compaction making progress.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::WaitUntilFlushWouldNotStallWrites:StallWait",
+ "DBCompactionTest::CompactRangeShutdownWhileDelayed:PreShutdown"},
+ {"DBCompactionTest::CompactRangeShutdownWhileDelayed:PostManual",
+ "CompactionJob::Run():End"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Random rnd(301);
+ for (int j = 0; j < kNumL0FilesLimit - 1; ++j) {
+ for (int k = 0; k < 2; ++k) {
+ ASSERT_OK(Put(1, Key(k), rnd.RandomString(1024)));
+ }
+ ASSERT_OK(Flush(1));
+ }
+ auto manual_compaction_thread = port::Thread([this, i]() {
+ CompactRangeOptions cro;
+ cro.allow_write_stall = false;
+ if (i == 0) {
+ ASSERT_TRUE(db_->CompactRange(cro, handles_[1], nullptr, nullptr)
+ .IsColumnFamilyDropped());
+ } else {
+ ASSERT_TRUE(db_->CompactRange(cro, handles_[1], nullptr, nullptr)
+ .IsShutdownInProgress());
+ }
+ });
+
+ TEST_SYNC_POINT(
+ "DBCompactionTest::CompactRangeShutdownWhileDelayed:PreShutdown");
+ if (i == 0) {
+ ASSERT_OK(db_->DropColumnFamily(handles_[1]));
+ } else {
+ dbfull()->CancelAllBackgroundWork(false /* wait */);
+ }
+ manual_compaction_thread.join();
+ TEST_SYNC_POINT(
+ "DBCompactionTest::CompactRangeShutdownWhileDelayed:PostManual");
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ }
+}
+
+TEST_F(DBCompactionTest, CompactRangeSkipFlushAfterDelay) {
+ // Verify that, when `CompactRangeOptions::allow_write_stall == false`,
+ // CompactRange skips its flush if the delay is long enough that the memtables
+ // existing at the beginning of the call have already been flushed.
+ const int kNumL0FilesTrigger = 4;
+ const int kNumL0FilesLimit = 8;
+ Options options = CurrentOptions();
+ options.level0_slowdown_writes_trigger = kNumL0FilesLimit;
+ options.level0_file_num_compaction_trigger = kNumL0FilesTrigger;
+ Reopen(options);
+
+ Random rnd(301);
+ // The manual flush includes the memtable that was active when CompactRange
+ // began. So it unblocks CompactRange and precludes its flush. Throughout the
+ // test, stall conditions are upheld via high L0 file count.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::WaitUntilFlushWouldNotStallWrites:StallWait",
+ "DBCompactionTest::CompactRangeSkipFlushAfterDelay:PreFlush"},
+ {"DBCompactionTest::CompactRangeSkipFlushAfterDelay:PostFlush",
+ "DBImpl::FlushMemTable:StallWaitDone"},
+ {"DBImpl::FlushMemTable:StallWaitDone", "CompactionJob::Run():End"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // used for the delayable flushes
+ FlushOptions flush_opts;
+ flush_opts.allow_write_stall = true;
+ for (int i = 0; i < kNumL0FilesLimit - 1; ++i) {
+ for (int j = 0; j < 2; ++j) {
+ ASSERT_OK(Put(Key(j), rnd.RandomString(1024)));
+ }
+ ASSERT_OK(dbfull()->Flush(flush_opts));
+ }
+ auto manual_compaction_thread = port::Thread([this]() {
+ CompactRangeOptions cro;
+ cro.allow_write_stall = false;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ });
+
+ TEST_SYNC_POINT("DBCompactionTest::CompactRangeSkipFlushAfterDelay:PreFlush");
+ ASSERT_OK(Put(std::to_string(0), rnd.RandomString(1024)));
+ ASSERT_OK(dbfull()->Flush(flush_opts));
+ ASSERT_OK(Put(std::to_string(0), rnd.RandomString(1024)));
+ TEST_SYNC_POINT(
+ "DBCompactionTest::CompactRangeSkipFlushAfterDelay:PostFlush");
+ manual_compaction_thread.join();
+
+ // If CompactRange's flush was skipped, the final Put above will still be
+ // in the active memtable.
+ std::string num_keys_in_memtable;
+ ASSERT_TRUE(db_->GetProperty(DB::Properties::kNumEntriesActiveMemTable,
+ &num_keys_in_memtable));
+ ASSERT_EQ(std::to_string(1), num_keys_in_memtable);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBCompactionTest, CompactRangeFlushOverlappingMemtable) {
+ // Verify memtable only gets flushed if it contains data overlapping the range
+ // provided to `CompactRange`. Tests all kinds of overlap/non-overlap.
+ const int kNumEndpointKeys = 5;
+ std::string keys[kNumEndpointKeys] = {"a", "b", "c", "d", "e"};
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ Reopen(options);
+
+ // One extra iteration for nullptr, which means left side of interval is
+ // unbounded.
+ for (int i = 0; i <= kNumEndpointKeys; ++i) {
+ Slice begin;
+ Slice* begin_ptr;
+ if (i == 0) {
+ begin_ptr = nullptr;
+ } else {
+ begin = keys[i - 1];
+ begin_ptr = &begin;
+ }
+ // Start at `i` so right endpoint comes after left endpoint. One extra
+ // iteration for nullptr, which means right side of interval is unbounded.
+ for (int j = std::max(0, i - 1); j <= kNumEndpointKeys; ++j) {
+ Slice end;
+ Slice* end_ptr;
+ if (j == kNumEndpointKeys) {
+ end_ptr = nullptr;
+ } else {
+ end = keys[j];
+ end_ptr = &end;
+ }
+ ASSERT_OK(Put("b", "val"));
+ ASSERT_OK(Put("d", "val"));
+ CompactRangeOptions compact_range_opts;
+ ASSERT_OK(db_->CompactRange(compact_range_opts, begin_ptr, end_ptr));
+
+ uint64_t get_prop_tmp, num_memtable_entries = 0;
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kNumEntriesImmMemTables,
+ &get_prop_tmp));
+ num_memtable_entries += get_prop_tmp;
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kNumEntriesActiveMemTable,
+ &get_prop_tmp));
+ num_memtable_entries += get_prop_tmp;
+ if (begin_ptr == nullptr || end_ptr == nullptr ||
+ (i <= 4 && j >= 1 && (begin != "c" || end != "c"))) {
+ // In this case `CompactRange`'s range overlapped in some way with the
+ // memtable's range, so flush should've happened. Then "b" and "d" won't
+ // be in the memtable.
+ ASSERT_EQ(0, num_memtable_entries);
+ } else {
+ ASSERT_EQ(2, num_memtable_entries);
+ // flush anyway to prepare for the next iteration
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ }
+ }
+ }
+}
+
+TEST_F(DBCompactionTest, CompactionStatsTest) {
+ Options options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = 2;
+ CompactionStatsCollector* collector = new CompactionStatsCollector();
+ options.listeners.emplace_back(collector);
+ DestroyAndReopen(options);
+
+ for (int i = 0; i < 32; i++) {
+ for (int j = 0; j < 5000; j++) {
+ ASSERT_OK(Put(std::to_string(j), std::string(1, 'A')));
+ }
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ColumnFamilyHandleImpl* cfh =
+ static_cast<ColumnFamilyHandleImpl*>(dbfull()->DefaultColumnFamily());
+ ColumnFamilyData* cfd = cfh->cfd();
+
+ VerifyCompactionStats(*cfd, *collector);
+}
+
+TEST_F(DBCompactionTest, SubcompactionEvent) {
+ class SubCompactionEventListener : public EventListener {
+ public:
+ void OnCompactionBegin(DB* /*db*/, const CompactionJobInfo& ci) override {
+ InstrumentedMutexLock l(&mutex_);
+ ASSERT_EQ(running_compactions_.find(ci.job_id),
+ running_compactions_.end());
+ running_compactions_.emplace(ci.job_id, std::unordered_set<int>());
+ }
+
+ void OnCompactionCompleted(DB* /*db*/,
+ const CompactionJobInfo& ci) override {
+ InstrumentedMutexLock l(&mutex_);
+ auto it = running_compactions_.find(ci.job_id);
+ ASSERT_NE(it, running_compactions_.end());
+ ASSERT_EQ(it->second.size(), 0);
+ running_compactions_.erase(it);
+ }
+
+ void OnSubcompactionBegin(const SubcompactionJobInfo& si) override {
+ InstrumentedMutexLock l(&mutex_);
+ auto it = running_compactions_.find(si.job_id);
+ ASSERT_NE(it, running_compactions_.end());
+ auto r = it->second.insert(si.subcompaction_job_id);
+ ASSERT_TRUE(r.second); // each subcompaction_job_id should be different
+ total_subcompaction_cnt_++;
+ }
+
+ void OnSubcompactionCompleted(const SubcompactionJobInfo& si) override {
+ InstrumentedMutexLock l(&mutex_);
+ auto it = running_compactions_.find(si.job_id);
+ ASSERT_NE(it, running_compactions_.end());
+ auto r = it->second.erase(si.subcompaction_job_id);
+ ASSERT_EQ(r, 1);
+ }
+
+ size_t GetRunningCompactionCount() {
+ InstrumentedMutexLock l(&mutex_);
+ return running_compactions_.size();
+ }
+
+ size_t GetTotalSubcompactionCount() {
+ InstrumentedMutexLock l(&mutex_);
+ return total_subcompaction_cnt_;
+ }
+
+ private:
+ InstrumentedMutex mutex_;
+ std::unordered_map<int, std::unordered_set<int>> running_compactions_;
+ size_t total_subcompaction_cnt_ = 0;
+ };
+
+ Options options = CurrentOptions();
+ options.target_file_size_base = 1024;
+ options.level0_file_num_compaction_trigger = 10;
+ auto* listener = new SubCompactionEventListener();
+ options.listeners.emplace_back(listener);
+
+ DestroyAndReopen(options);
+
+ // generate 4 files @ L2
+ for (int i = 0; i < 4; i++) {
+ for (int j = 0; j < 10; j++) {
+ int key_id = i * 10 + j;
+ ASSERT_OK(Put(Key(key_id), "value" + std::to_string(key_id)));
+ }
+ ASSERT_OK(Flush());
+ }
+ MoveFilesToLevel(2);
+
+ // generate 2 files @ L1 which overlaps with L2 files
+ for (int i = 0; i < 2; i++) {
+ for (int j = 0; j < 10; j++) {
+ int key_id = i * 20 + j * 2;
+ ASSERT_OK(Put(Key(key_id), "value" + std::to_string(key_id)));
+ }
+ ASSERT_OK(Flush());
+ }
+ MoveFilesToLevel(1);
+ ASSERT_EQ(FilesPerLevel(), "0,2,4");
+
+ CompactRangeOptions comp_opts;
+ comp_opts.max_subcompactions = 4;
+ Status s = dbfull()->CompactRange(comp_opts, nullptr, nullptr);
+ ASSERT_OK(s);
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ // make sure there's no running compaction
+ ASSERT_EQ(listener->GetRunningCompactionCount(), 0);
+ // and that subcompactions were triggered
+ ASSERT_GT(listener->GetTotalSubcompactionCount(), 0);
+}
+
+TEST_F(DBCompactionTest, CompactFilesOutputRangeConflict) {
+ // LSM setup:
+ // L1: [ba bz]
+ // L2: [a b] [c d]
+ // L3: [a b] [c d]
+ //
+ // Thread 1: Thread 2:
+ // Begin compacting all L2->L3
+ // Compact [ba bz] L1->L3
+ // End compacting all L2->L3
+ //
+ // The compaction operation in thread 2 should be disallowed because the range
+ // overlaps with the compaction in thread 1, which also covers that range in
+ // L3.
+ Options options = CurrentOptions();
+ FlushedFileCollector* collector = new FlushedFileCollector();
+ options.listeners.emplace_back(collector);
+ Reopen(options);
+
+ for (int level = 3; level >= 2; --level) {
+ ASSERT_OK(Put("a", "val"));
+ ASSERT_OK(Put("b", "val"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("c", "val"));
+ ASSERT_OK(Put("d", "val"));
+ ASSERT_OK(Flush());
+ MoveFilesToLevel(level);
+ }
+ ASSERT_OK(Put("ba", "val"));
+ ASSERT_OK(Put("bz", "val"));
+ ASSERT_OK(Flush());
+ MoveFilesToLevel(1);
+
+ SyncPoint::GetInstance()->LoadDependency({
+ {"CompactFilesImpl:0",
+ "DBCompactionTest::CompactFilesOutputRangeConflict:Thread2Begin"},
+ {"DBCompactionTest::CompactFilesOutputRangeConflict:Thread2End",
+ "CompactFilesImpl:1"},
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ auto bg_thread = port::Thread([&]() {
+ // Thread 1
+ std::vector<std::string> filenames = collector->GetFlushedFiles();
+ filenames.pop_back();
+ ASSERT_OK(db_->CompactFiles(CompactionOptions(), filenames,
+ 3 /* output_level */));
+ });
+
+ // Thread 2
+ TEST_SYNC_POINT(
+ "DBCompactionTest::CompactFilesOutputRangeConflict:Thread2Begin");
+ std::string filename = collector->GetFlushedFiles().back();
+ ASSERT_FALSE(
+ db_->CompactFiles(CompactionOptions(), {filename}, 3 /* output_level */)
+ .ok());
+ TEST_SYNC_POINT(
+ "DBCompactionTest::CompactFilesOutputRangeConflict:Thread2End");
+
+ bg_thread.join();
+}
+
+TEST_F(DBCompactionTest, CompactionHasEmptyOutput) {
+ Options options = CurrentOptions();
+ SstStatsCollector* collector = new SstStatsCollector();
+ options.level0_file_num_compaction_trigger = 2;
+ options.listeners.emplace_back(collector);
+ Reopen(options);
+
+ // Make sure the L0 files overlap to prevent trivial move.
+ ASSERT_OK(Put("a", "val"));
+ ASSERT_OK(Put("b", "val"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Delete("a"));
+ ASSERT_OK(Delete("b"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+ ASSERT_EQ(NumTableFilesAtLevel(1), 0);
+
+ // Expect one file creation to start for each flush, and zero for compaction
+ // since no keys are written.
+ ASSERT_EQ(2, collector->num_ssts_creation_started());
+}
+
+TEST_F(DBCompactionTest, CompactionLimiter) {
+ const int kNumKeysPerFile = 10;
+ const int kMaxBackgroundThreads = 64;
+
+ struct CompactionLimiter {
+ std::string name;
+ int limit_tasks;
+ int max_tasks;
+ int tasks;
+ std::shared_ptr<ConcurrentTaskLimiter> limiter;
+ };
+
+ std::vector<CompactionLimiter> limiter_settings;
+ limiter_settings.push_back({"limiter_1", 1, 0, 0, nullptr});
+ limiter_settings.push_back({"limiter_2", 2, 0, 0, nullptr});
+ limiter_settings.push_back({"limiter_3", 3, 0, 0, nullptr});
+
+ for (auto& ls : limiter_settings) {
+ ls.limiter.reset(NewConcurrentTaskLimiter(ls.name, ls.limit_tasks));
+ }
+
+ std::shared_ptr<ConcurrentTaskLimiter> unique_limiter(
+ NewConcurrentTaskLimiter("unique_limiter", -1));
+
+ const char* cf_names[] = {"default", "0", "1", "2", "3", "4", "5", "6", "7",
+ "8", "9", "a", "b", "c", "d", "e", "f"};
+ const unsigned int cf_count = sizeof cf_names / sizeof cf_names[0];
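+ // cf_count == 17: "default" plus 16 additional column families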
+
+ std::unordered_map<std::string, CompactionLimiter*> cf_to_limiter;
+
+ Options options = CurrentOptions();
+ options.write_buffer_size = 110 * 1024; // 110KB
+ options.arena_block_size = 4096;
+ options.num_levels = 3;
+ options.level0_file_num_compaction_trigger = 4;
+ options.level0_slowdown_writes_trigger = 64;
+ options.level0_stop_writes_trigger = 64;
+ options.max_background_jobs = kMaxBackgroundThreads; // Enough threads
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(kNumKeysPerFile));
+ options.max_write_buffer_number = 10; // Enough memtables
+ DestroyAndReopen(options);
+
+ std::vector<Options> option_vector;
+ option_vector.reserve(cf_count);
+
+ for (unsigned int cf = 0; cf < cf_count; cf++) {
+ ColumnFamilyOptions cf_opt(options);
+ if (cf == 0) {
+ // "Default" CF does't use compaction limiter
+ cf_opt.compaction_thread_limiter = nullptr;
+ } else if (cf == 1) {
+ // "1" CF uses bypass compaction limiter
+ unique_limiter->SetMaxOutstandingTask(-1);
+ cf_opt.compaction_thread_limiter = unique_limiter;
+ } else {
+ // Assign limiter by mod
+ auto& ls = limiter_settings[cf % 3];
+ cf_opt.compaction_thread_limiter = ls.limiter;
+ cf_to_limiter[cf_names[cf]] = &ls;
+ }
+ option_vector.emplace_back(DBOptions(options), cf_opt);
+ }
+
+ for (unsigned int cf = 1; cf < cf_count; cf++) {
+ CreateColumnFamilies({cf_names[cf]}, option_vector[cf]);
+ }
+
+ ReopenWithColumnFamilies(
+ std::vector<std::string>(cf_names, cf_names + cf_count), option_vector);
+
+ port::Mutex mutex;
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:BeforeCompaction", [&](void* arg) {
+ const auto& cf_name = static_cast<ColumnFamilyData*>(arg)->GetName();
+ auto iter = cf_to_limiter.find(cf_name);
+ if (iter != cf_to_limiter.end()) {
+ MutexLock l(&mutex);
+ ASSERT_GE(iter->second->limit_tasks, ++iter->second->tasks);
+ iter->second->max_tasks =
+ std::max(iter->second->max_tasks, iter->second->limit_tasks);
+ }
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:AfterCompaction", [&](void* arg) {
+ const auto& cf_name = static_cast<ColumnFamilyData*>(arg)->GetName();
+ auto iter = cf_to_limiter.find(cf_name);
+ if (iter != cf_to_limiter.end()) {
+ MutexLock l(&mutex);
+ ASSERT_GE(--iter->second->tasks, 0);
+ }
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // Size the flush and compaction thread pools.
+ const size_t kTotalFlushTasks = kMaxBackgroundThreads / 4;
+ const size_t kTotalCompactTasks = kMaxBackgroundThreads - kTotalFlushTasks;
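+ // With kMaxBackgroundThreads == 64: 16 flush threads (HIGH) and 48
+ // compaction threads (LOW).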
+ env_->SetBackgroundThreads((int)kTotalFlushTasks, Env::HIGH);
+ env_->SetBackgroundThreads((int)kTotalCompactTasks, Env::LOW);
+
+ test::SleepingBackgroundTask sleeping_compact_tasks[kTotalCompactTasks];
+
+ // Block all compaction threads in thread pool.
+ for (size_t i = 0; i < kTotalCompactTasks; i++) {
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
+ &sleeping_compact_tasks[i], Env::LOW);
+ sleeping_compact_tasks[i].WaitUntilSleeping();
+ }
+
+ int keyIndex = 0;
+
+ for (int n = 0; n < options.level0_file_num_compaction_trigger; n++) {
+ for (unsigned int cf = 0; cf < cf_count; cf++) {
+ for (int i = 0; i < kNumKeysPerFile; i++) {
+ ASSERT_OK(Put(cf, Key(keyIndex++), ""));
+ }
+ // put extra key to trigger flush
+ ASSERT_OK(Put(cf, "", ""));
+ }
+
+ for (unsigned int cf = 0; cf < cf_count; cf++) {
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[cf]));
+ }
+ }
+
+ // Enough L0 files to trigger compaction
+ for (unsigned int cf = 0; cf < cf_count; cf++) {
+ ASSERT_EQ(NumTableFilesAtLevel(0, cf),
+ options.level0_file_num_compaction_trigger);
+ }
+
+ // Create more files for one column family, which triggers the speed-up
+ // condition; all compactions will be scheduled.
+ for (int num = 0; num < options.level0_file_num_compaction_trigger; num++) {
+ for (int i = 0; i < kNumKeysPerFile; i++) {
+ ASSERT_OK(Put(0, Key(i), ""));
+ }
+ // put extra key to trigger flush
+ ASSERT_OK(Put(0, "", ""));
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[0]));
+ ASSERT_EQ(options.level0_file_num_compaction_trigger + num + 1,
+ NumTableFilesAtLevel(0, 0));
+ }
+
+ // All CFs are pending compaction
+ ASSERT_EQ(cf_count, env_->GetThreadPoolQueueLen(Env::LOW));
+
+ // Unblock all compaction threads
+ for (size_t i = 0; i < kTotalCompactTasks; i++) {
+ sleeping_compact_tasks[i].WakeUp();
+ sleeping_compact_tasks[i].WaitUntilDone();
+ }
+
+ for (unsigned int cf = 0; cf < cf_count; cf++) {
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[cf]));
+ }
+
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ // Max outstanding compaction tasks reached the limit
+ for (auto& ls : limiter_settings) {
+ ASSERT_EQ(ls.limit_tasks, ls.max_tasks);
+ ASSERT_EQ(0, ls.limiter->GetOutstandingTask());
+ }
+
+ // test manual compaction under a fully throttled limiter
+ int cf_test = 1;
+ unique_limiter->SetMaxOutstandingTask(0);
+
+ // flush one more file to cf 1
+ for (int i = 0; i < kNumKeysPerFile; i++) {
+ ASSERT_OK(Put(cf_test, Key(keyIndex++), ""));
+ }
+ // put extra key to trigger flush
+ ASSERT_OK(Put(cf_test, "", ""));
+
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[cf_test]));
+ ASSERT_EQ(1, NumTableFilesAtLevel(0, cf_test));
+
+ Compact(cf_test, Key(0), Key(keyIndex));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+}
+
+INSTANTIATE_TEST_CASE_P(DBCompactionTestWithParam, DBCompactionTestWithParam,
+ ::testing::Values(std::make_tuple(1, true),
+ std::make_tuple(1, false),
+ std::make_tuple(4, true),
+ std::make_tuple(4, false)));
+
+TEST_P(DBCompactionDirectIOTest, DirectIO) {
+ Options options = CurrentOptions();
+ Destroy(options);
+ options.create_if_missing = true;
+ options.disable_auto_compactions = true;
+ options.use_direct_io_for_flush_and_compaction = GetParam();
+ options.env = MockEnv::Create(Env::Default());
+ Reopen(options);
+ bool readahead = false;
+ SyncPoint::GetInstance()->SetCallBack(
+ "CompactionJob::OpenCompactionOutputFile", [&](void* arg) {
+ bool* use_direct_writes = static_cast<bool*>(arg);
+ ASSERT_EQ(*use_direct_writes,
+ options.use_direct_io_for_flush_and_compaction);
+ });
+ if (options.use_direct_io_for_flush_and_compaction) {
+ SyncPoint::GetInstance()->SetCallBack(
+ "SanitizeOptions:direct_io", [&](void* /*arg*/) { readahead = true; });
+ }
+ SyncPoint::GetInstance()->EnableProcessing();
+ CreateAndReopenWithCF({"pikachu"}, options);
+ MakeTables(3, "p", "q", 1);
+ ASSERT_EQ("1,1,1", FilesPerLevel(1));
+ Compact(1, "p", "q");
+ ASSERT_EQ(readahead, options.use_direct_reads);
+ ASSERT_EQ("0,0,1", FilesPerLevel(1));
+ Destroy(options);
+ delete options.env;
+}
+
+INSTANTIATE_TEST_CASE_P(DBCompactionDirectIOTest, DBCompactionDirectIOTest,
+ testing::Bool());
+
+class CompactionPriTest : public DBTestBase,
+ public testing::WithParamInterface<uint32_t> {
+ public:
+ CompactionPriTest()
+ : DBTestBase("compaction_pri_test", /*env_do_fsync=*/true) {
+ compaction_pri_ = GetParam();
+ }
+
+ // Required if inheriting from testing::WithParamInterface<>
+ static void SetUpTestCase() {}
+ static void TearDownTestCase() {}
+
+ uint32_t compaction_pri_;
+};
+
+TEST_P(CompactionPriTest, Test) {
+ Options options = CurrentOptions();
+ options.write_buffer_size = 16 * 1024;
+ options.compaction_pri = static_cast<CompactionPri>(compaction_pri_);
+ options.hard_pending_compaction_bytes_limit = 256 * 1024;
+ options.max_bytes_for_level_base = 64 * 1024;
+ options.max_bytes_for_level_multiplier = 4;
+ options.compression = kNoCompression;
+
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ const int kNKeys = 5000;
+ int keys[kNKeys];
+ for (int i = 0; i < kNKeys; i++) {
+ keys[i] = i;
+ }
+ RandomShuffle(std::begin(keys), std::end(keys), rnd.Next());
+
+ for (int i = 0; i < kNKeys; i++) {
+ ASSERT_OK(Put(Key(keys[i]), rnd.RandomString(102)));
+ }
+
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ for (int i = 0; i < kNKeys; i++) {
+ ASSERT_NE("NOT_FOUND", Get(Key(i)));
+ }
+}
+
+INSTANTIATE_TEST_CASE_P(
+ CompactionPriTest, CompactionPriTest,
+ ::testing::Values(CompactionPri::kByCompensatedSize,
+ CompactionPri::kOldestLargestSeqFirst,
+ CompactionPri::kOldestSmallestSeqFirst,
+ CompactionPri::kMinOverlappingRatio,
+ CompactionPri::kRoundRobin));
+
+TEST_F(DBCompactionTest, PersistRoundRobinCompactCursor) {
+ Options options = CurrentOptions();
+ options.write_buffer_size = 16 * 1024;
+ options.max_bytes_for_level_base = 128 * 1024;
+ options.target_file_size_base = 64 * 1024;
+ options.level0_file_num_compaction_trigger = 4;
+ options.compaction_pri = CompactionPri::kRoundRobin;
+ options.max_bytes_for_level_multiplier = 4;
+ options.num_levels = 3;
+ options.compression = kNoCompression;
+
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+
+ // 30 Files in L0 to trigger compactions between L1 and L2
+ for (int i = 0; i < 30; i++) {
+ for (int j = 0; j < 16; j++) {
+ ASSERT_OK(Put(rnd.RandomString(24), rnd.RandomString(1000)));
+ }
+ ASSERT_OK(Flush());
+ }
+
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ VersionSet* const versions = dbfull()->GetVersionSet();
+ assert(versions);
+
+ ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault();
+ ASSERT_NE(cfd, nullptr);
+
+ Version* const current = cfd->current();
+ ASSERT_NE(current, nullptr);
+
+ const VersionStorageInfo* const storage_info = current->storage_info();
+ ASSERT_NE(storage_info, nullptr);
+
+ const std::vector<InternalKey> compact_cursors =
+ storage_info->GetCompactCursors();
+
+ Reopen(options);
+
+ VersionSet* const reopened_versions = dbfull()->GetVersionSet();
+ assert(reopened_versions);
+
+ ColumnFamilyData* const reopened_cfd =
+ reopened_versions->GetColumnFamilySet()->GetDefault();
+ ASSERT_NE(reopened_cfd, nullptr);
+
+ Version* const reopened_current = reopened_cfd->current();
+ ASSERT_NE(reopened_current, nullptr);
+
+ const VersionStorageInfo* const reopened_storage_info =
+ reopened_current->storage_info();
+ ASSERT_NE(reopened_storage_info, nullptr);
+
+ const std::vector<InternalKey> reopened_compact_cursors =
+ reopened_storage_info->GetCompactCursors();
+ const auto icmp = reopened_storage_info->InternalComparator();
+ ASSERT_EQ(compact_cursors.size(), reopened_compact_cursors.size());
+ for (size_t i = 0; i < compact_cursors.size(); i++) {
+ if (compact_cursors[i].Valid()) {
+ ASSERT_EQ(0,
+ icmp->Compare(compact_cursors[i], reopened_compact_cursors[i]));
+ } else {
+ ASSERT_TRUE(!reopened_compact_cursors[i].Valid());
+ }
+ }
+}
+
+TEST_P(RoundRobinSubcompactionsAgainstPressureToken, PressureTokenTest) {
+ const int kKeysPerBuffer = 100;
+ Options options = CurrentOptions();
+ options.num_levels = 4;
+ options.max_bytes_for_level_multiplier = 2;
+ options.level0_file_num_compaction_trigger = 4;
+ options.target_file_size_base = kKeysPerBuffer * 1024;
+ options.compaction_pri = CompactionPri::kRoundRobin;
+ options.max_bytes_for_level_base = 8 * kKeysPerBuffer * 1024;
+ options.disable_auto_compactions = true;
+ // Set up 7 threads but limit subcompactions so that
+ // RoundRobin requires extra compactions from reserved threads
+ options.max_subcompactions = 1;
+ options.max_background_compactions = 7;
+ options.max_compaction_bytes = 100000000;
+ DestroyAndReopen(options);
+ env_->SetBackgroundThreads(7, Env::LOW);
+
+ Random rnd(301);
+ const std::vector<int> files_per_level = {0, 15, 25};
+ for (int lvl = 2; lvl > 0; lvl--) {
+ for (int i = 0; i < files_per_level[lvl]; i++) {
+ for (int j = 0; j < kKeysPerBuffer; j++) {
+ // Add (lvl-1) to ensure a nearly equivalent number of files
+ // in L2 are overlapped with the files selected to compact from
+ // L1
+ ASSERT_OK(Put(Key(2 * i * kKeysPerBuffer + 2 * j + (lvl - 1)),
+ rnd.RandomString(1010)));
+ }
+ ASSERT_OK(Flush());
+ }
+ MoveFilesToLevel(lvl);
+ ASSERT_EQ(files_per_level[lvl], NumTableFilesAtLevel(lvl, 0));
+ }
+ // 15 files in L1; 25 files in L2
+
+ // This variable makes sure the following callback is called
+ // and the assertions in it are indeed executed.
+ bool num_planned_subcompactions_verified = false;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "CompactionJob::GenSubcompactionBoundaries:0", [&](void* arg) {
+ uint64_t num_planned_subcompactions = *(static_cast<uint64_t*>(arg));
+ if (grab_pressure_token_) {
+ // 7 files are selected for round-robin under auto
+ // compaction. The number of planned subcompactions is restricted by
+ // the limited max_background_compactions
+ ASSERT_EQ(num_planned_subcompactions, 7);
+ } else {
+ ASSERT_EQ(num_planned_subcompactions, 1);
+ }
+ num_planned_subcompactions_verified = true;
+ });
+
+ // The following 3 dependencies have to be added to ensure the auto
+ // compaction and the pressure token are correctly enabled. Same for
+ // RoundRobinSubcompactionsUsingResources and
+ // DBCompactionTest.RoundRobinSubcompactionsShrinkResources
+ SyncPoint::GetInstance()->LoadDependency(
+ {{"RoundRobinSubcompactionsAgainstPressureToken:0",
+ "BackgroundCallCompaction:0"},
+ {"CompactionJob::AcquireSubcompactionResources:0",
+ "RoundRobinSubcompactionsAgainstPressureToken:1"},
+ {"RoundRobinSubcompactionsAgainstPressureToken:2",
+ "CompactionJob::AcquireSubcompactionResources:1"}});
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(dbfull()->EnableAutoCompaction({dbfull()->DefaultColumnFamily()}));
+ TEST_SYNC_POINT("RoundRobinSubcompactionsAgainstPressureToken:0");
+ TEST_SYNC_POINT("RoundRobinSubcompactionsAgainstPressureToken:1");
+ std::unique_ptr<WriteControllerToken> pressure_token;
+ if (grab_pressure_token_) {
+ pressure_token =
+ dbfull()->TEST_write_controler().GetCompactionPressureToken();
+ }
+ TEST_SYNC_POINT("RoundRobinSubcompactionsAgainstPressureToken:2");
+
+ ASSERT_OK(dbfull()->WaitForCompact());
+ ASSERT_TRUE(num_planned_subcompactions_verified);
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+INSTANTIATE_TEST_CASE_P(RoundRobinSubcompactionsAgainstPressureToken,
+ RoundRobinSubcompactionsAgainstPressureToken,
+ testing::Bool());
+
+TEST_P(RoundRobinSubcompactionsAgainstResources, SubcompactionsUsingResources) {
+ const int kKeysPerBuffer = 200;
+ Options options = CurrentOptions();
+ options.num_levels = 4;
+ options.level0_file_num_compaction_trigger = 3;
+ options.target_file_size_base = kKeysPerBuffer * 1024;
+ options.compaction_pri = CompactionPri::kRoundRobin;
+ options.max_bytes_for_level_base = 30 * kKeysPerBuffer * 1024;
+ options.disable_auto_compactions = true;
+ options.max_subcompactions = 1;
+ options.max_background_compactions = max_compaction_limits_;
+ // Set a large number for max_compaction_bytes so that one round-robin
+ // compaction is enough to make post-compaction L1 size less than
+ // the maximum size (this test assumes only one round-robin compaction
+ // is triggered by kLevelMaxLevelSize)
+ options.max_compaction_bytes = 100000000;
+
+ DestroyAndReopen(options);
+ env_->SetBackgroundThreads(total_low_pri_threads_, Env::LOW);
+
+ Random rnd(301);
+ const std::vector<int> files_per_level = {0, 40, 100};
+ for (int lvl = 2; lvl > 0; lvl--) {
+ for (int i = 0; i < files_per_level[lvl]; i++) {
+ for (int j = 0; j < kKeysPerBuffer; j++) {
+ // Add (lvl-1) to ensure a nearly equivalent number of files
+ // in L2 are overlapped with the files selected to compact from
+ // L1
+ ASSERT_OK(Put(Key(2 * i * kKeysPerBuffer + 2 * j + (lvl - 1)),
+ rnd.RandomString(1010)));
+ }
+ ASSERT_OK(Flush());
+ }
+ MoveFilesToLevel(lvl);
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(files_per_level[lvl], NumTableFilesAtLevel(lvl, 0));
+ }
+
+ // 40 files in L1; 100 files in L2
+ // This variable makes sure the following callback is called
+ // and the assertions in it are indeed executed.
+ bool num_planned_subcompactions_verified = false;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "CompactionJob::GenSubcompactionBoundaries:0", [&](void* arg) {
+ uint64_t num_planned_subcompactions = *(static_cast<uint64_t*>(arg));
+ // More than 10 files are selected for round-robin under auto
+ // compaction. The number of planned subcompactions is restricted to
+ // the minimum of the available threads and the compaction limit.
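+ // e.g. for the (total_low_pri_threads_, max_compaction_limits_) = (10, 5)
+ // parameterization: 1 + (min(10, 5) - 1) = 5 planned subcompactions.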
+ ASSERT_EQ(num_planned_subcompactions - options.max_subcompactions,
+ std::min(total_low_pri_threads_, max_compaction_limits_) - 1);
+ num_planned_subcompactions_verified = true;
+ });
+ SyncPoint::GetInstance()->LoadDependency(
+ {{"RoundRobinSubcompactionsAgainstResources:0",
+ "BackgroundCallCompaction:0"},
+ {"CompactionJob::AcquireSubcompactionResources:0",
+ "RoundRobinSubcompactionsAgainstResources:1"},
+ {"RoundRobinSubcompactionsAgainstResources:2",
+ "CompactionJob::AcquireSubcompactionResources:1"},
+ {"CompactionJob::ReleaseSubcompactionResources:0",
+ "RoundRobinSubcompactionsAgainstResources:3"},
+ {"RoundRobinSubcompactionsAgainstResources:4",
+ "CompactionJob::ReleaseSubcompactionResources:1"}});
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(dbfull()->WaitForCompact());
+ ASSERT_OK(dbfull()->EnableAutoCompaction({dbfull()->DefaultColumnFamily()}));
+ TEST_SYNC_POINT("RoundRobinSubcompactionsAgainstResources:0");
+ TEST_SYNC_POINT("RoundRobinSubcompactionsAgainstResources:1");
+ auto pressure_token =
+ dbfull()->TEST_write_controler().GetCompactionPressureToken();
+
+ TEST_SYNC_POINT("RoundRobinSubcompactionsAgainstResources:2");
+ TEST_SYNC_POINT("RoundRobinSubcompactionsAgainstResources:3");
+ // We can now reserve all threads except the one being used
+ ASSERT_EQ(total_low_pri_threads_ - 1,
+ env_->ReserveThreads(total_low_pri_threads_, Env::Priority::LOW));
+ ASSERT_EQ(
+ total_low_pri_threads_ - 1,
+ env_->ReleaseThreads(total_low_pri_threads_ - 1, Env::Priority::LOW));
+ TEST_SYNC_POINT("RoundRobinSubcompactionsAgainstResources:4");
+ ASSERT_OK(dbfull()->WaitForCompact());
+ ASSERT_TRUE(num_planned_subcompactions_verified);
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+INSTANTIATE_TEST_CASE_P(RoundRobinSubcompactionsAgainstResources,
+ RoundRobinSubcompactionsAgainstResources,
+ ::testing::Values(std::make_tuple(1, 5),
+ std::make_tuple(5, 1),
+ std::make_tuple(10, 5),
+ std::make_tuple(5, 10),
+ std::make_tuple(10, 10)));
+
+TEST_P(DBCompactionTestWithParam, RoundRobinWithoutAdditionalResources) {
+ const int kKeysPerBuffer = 200;
+ Options options = CurrentOptions();
+ options.num_levels = 4;
+ options.level0_file_num_compaction_trigger = 3;
+ options.target_file_size_base = kKeysPerBuffer * 1024;
+ options.compaction_pri = CompactionPri::kRoundRobin;
+ options.max_bytes_for_level_base = 30 * kKeysPerBuffer * 1024;
+ options.disable_auto_compactions = true;
+ options.max_subcompactions = max_subcompactions_;
+ options.max_background_compactions = 1;
+ options.max_compaction_bytes = 100000000;
+ // Similar experiment setup as above, except that max_subcompactions is
+ // given by max_subcompactions_ (1 or 4), and we fix the additional
+ // resources at (1, 1), so no extra resources can be used.
+ DestroyAndReopen(options);
+ env_->SetBackgroundThreads(1, Env::LOW);
+
+ Random rnd(301);
+ const std::vector<int> files_per_level = {0, 33, 100};
+ for (int lvl = 2; lvl > 0; lvl--) {
+ for (int i = 0; i < files_per_level[lvl]; i++) {
+ for (int j = 0; j < kKeysPerBuffer; j++) {
+ // Add (lvl-1) to ensure a nearly equivalent number of files
+ // in L2 are overlapped with the files selected to compact from
+ // L1
+ ASSERT_OK(Put(Key(2 * i * kKeysPerBuffer + 2 * j + (lvl - 1)),
+ rnd.RandomString(1010)));
+ }
+ ASSERT_OK(Flush());
+ }
+ MoveFilesToLevel(lvl);
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(files_per_level[lvl], NumTableFilesAtLevel(lvl, 0));
+ }
+
+ // 33 files in L1; 100 files in L2
+ // This variable makes sure the following callback is called
+ // and the assertions in it are indeed executed.
+ bool num_planned_subcompactions_verified = false;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "CompactionJob::GenSubcompactionBoundaries:0", [&](void* arg) {
+ uint64_t num_planned_subcompactions = *(static_cast<uint64_t*>(arg));
+ // At most 4 files are selected for round-robin under auto
+ // compaction. The number of planned subcompactions is restricted by
+ // max_subcompactions since no extra resources can be used
+ ASSERT_EQ(num_planned_subcompactions, options.max_subcompactions);
+ num_planned_subcompactions_verified = true;
+ });
+ // No need to set up a dependency for the pressure token since
+ // AcquireSubcompactionResources may not be called, and it cannot reserve
+ // any additional resources anyway
+ SyncPoint::GetInstance()->LoadDependency(
+ {{"DBCompactionTest::RoundRobinWithoutAdditionalResources:0",
+ "BackgroundCallCompaction:0"}});
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(dbfull()->WaitForCompact());
+ ASSERT_OK(dbfull()->EnableAutoCompaction({dbfull()->DefaultColumnFamily()}));
+ TEST_SYNC_POINT("DBCompactionTest::RoundRobinWithoutAdditionalResources:0");
+
+ ASSERT_OK(dbfull()->WaitForCompact());
+ ASSERT_TRUE(num_planned_subcompactions_verified);
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_F(DBCompactionTest, RoundRobinCutOutputAtCompactCursor) {
+ Options options = CurrentOptions();
+ options.num_levels = 3;
+ options.compression = kNoCompression;
+ options.write_buffer_size = 4 * 1024;
+ options.max_bytes_for_level_base = 64 * 1024;
+ options.max_bytes_for_level_multiplier = 4;
+ options.level0_file_num_compaction_trigger = 4;
+ options.compaction_pri = CompactionPri::kRoundRobin;
+
+ DestroyAndReopen(options);
+
+ VersionSet* const versions = dbfull()->GetVersionSet();
+ assert(versions);
+
+ ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault();
+ ASSERT_NE(cfd, nullptr);
+
+ Version* const current = cfd->current();
+ ASSERT_NE(current, nullptr);
+
+ VersionStorageInfo* storage_info = current->storage_info();
+ ASSERT_NE(storage_info, nullptr);
+
+ const InternalKey split_cursor = InternalKey(Key(600), 100, kTypeValue);
+ storage_info->AddCursorForOneLevel(2, split_cursor);
+
+ Random rnd(301);
+
+ for (int i = 0; i < 50; i++) {
+ for (int j = 0; j < 50; j++) {
+ ASSERT_OK(Put(Key(j * 2 + i * 100), rnd.RandomString(102)));
+ }
+ }
+ // Add more overlapping files (to avoid trivial moves) so that a compaction
+ // outputs files in L2. Note that a trivial move does not trigger a
+ // compaction, in which case the cursor is not necessarily a file boundary.
+ for (int i = 0; i < 50; i++) {
+ for (int j = 0; j < 50; j++) {
+ ASSERT_OK(Put(Key(j * 2 + 1 + i * 100), rnd.RandomString(1014)));
+ }
+ }
+
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ std::vector<std::vector<FileMetaData>> level_to_files;
+ dbfull()->TEST_GetFilesMetaData(dbfull()->DefaultColumnFamily(),
+ &level_to_files);
+ const auto icmp = cfd->current()->storage_info()->InternalComparator();
+ // Files in level 2 should be split by the cursor
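+ // (i.e. no output file straddles the cursor: each file either starts at or
+ // after the cursor, or ends strictly before it)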
+ for (const auto& file : level_to_files[2]) {
+ ASSERT_TRUE(
+ icmp->Compare(file.smallest.Encode(), split_cursor.Encode()) >= 0 ||
+ icmp->Compare(file.largest.Encode(), split_cursor.Encode()) < 0);
+ }
+}
+
+class NoopMergeOperator : public MergeOperator {
+ public:
+ NoopMergeOperator() {}
+
+ bool FullMergeV2(const MergeOperationInput& /*merge_in*/,
+ MergeOperationOutput* merge_out) const override {
+ std::string val("bar");
+ merge_out->new_value = val;
+ return true;
+ }
+
+ const char* Name() const override { return "Noop"; }
+};
+
+TEST_F(DBCompactionTest, PartialManualCompaction) {
+ Options opts = CurrentOptions();
+ opts.num_levels = 3;
+ opts.level0_file_num_compaction_trigger = 10;
+ opts.compression = kNoCompression;
+ opts.merge_operator.reset(new NoopMergeOperator());
+ opts.target_file_size_base = 10240;
+ DestroyAndReopen(opts);
+
+ Random rnd(301);
+ for (auto i = 0; i < 8; ++i) {
+ for (auto j = 0; j < 10; ++j) {
+ ASSERT_OK(Merge("foo", rnd.RandomString(1024)));
+ }
+ ASSERT_OK(Flush());
+ }
+
+ MoveFilesToLevel(2);
+
+ std::string prop;
+ EXPECT_TRUE(dbfull()->GetProperty(DB::Properties::kLiveSstFilesSize, &prop));
+ uint64_t max_compaction_bytes = atoi(prop.c_str()) / 2;
+ ASSERT_OK(dbfull()->SetOptions(
+ {{"max_compaction_bytes", std::to_string(max_compaction_bytes)}}));
+
+ CompactRangeOptions cro;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForceOptimized;
+ ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr));
+}
+
+TEST_F(DBCompactionTest, ManualCompactionFailsInReadOnlyMode) {
+  // Regression test for a bug where manual compaction hangs forever when the
+  // DB is in read-only mode. Verify it now at least returns, despite failing.
+ const int kNumL0Files = 4;
+ std::unique_ptr<FaultInjectionTestEnv> mock_env(
+ new FaultInjectionTestEnv(env_));
+ Options opts = CurrentOptions();
+ opts.disable_auto_compactions = true;
+ opts.env = mock_env.get();
+ DestroyAndReopen(opts);
+
+ Random rnd(301);
+ for (int i = 0; i < kNumL0Files; ++i) {
+ // Make sure files are overlapping in key-range to prevent trivial move.
+ ASSERT_OK(Put("key1", rnd.RandomString(1024)));
+ ASSERT_OK(Put("key2", rnd.RandomString(1024)));
+ ASSERT_OK(Flush());
+ }
+ ASSERT_EQ(kNumL0Files, NumTableFilesAtLevel(0));
+
+ // Enter read-only mode by failing a write.
+ mock_env->SetFilesystemActive(false);
+  // Make sure this key is outside `CompactRange`'s range so that it doesn't
+  // fail early trying to flush the memtable.
+ ASSERT_NOK(Put("key3", rnd.RandomString(1024)));
+
+ // In the bug scenario, the first manual compaction would fail and forget to
+ // unregister itself, causing the second one to hang forever due to conflict
+ // with a non-running compaction.
+ CompactRangeOptions cro;
+ cro.exclusive_manual_compaction = false;
+ Slice begin_key("key1");
+ Slice end_key("key2");
+ ASSERT_NOK(dbfull()->CompactRange(cro, &begin_key, &end_key));
+ ASSERT_NOK(dbfull()->CompactRange(cro, &begin_key, &end_key));
+
+ // Close before mock_env destruct.
+ Close();
+}
+
+// ManualCompactionBottomLevelOptimized tests the bottommost-level manual
+// compaction optimization, which skips recompacting files created by an
+// Ln-1 to Ln compaction.
+TEST_F(DBCompactionTest, ManualCompactionBottomLevelOptimized) {
+ Options opts = CurrentOptions();
+ opts.num_levels = 3;
+ opts.level0_file_num_compaction_trigger = 5;
+ opts.compression = kNoCompression;
+ opts.merge_operator.reset(new NoopMergeOperator());
+ opts.target_file_size_base = 1024;
+ opts.max_bytes_for_level_multiplier = 2;
+ opts.disable_auto_compactions = true;
+ DestroyAndReopen(opts);
+ ColumnFamilyHandleImpl* cfh =
+ static_cast<ColumnFamilyHandleImpl*>(dbfull()->DefaultColumnFamily());
+ ColumnFamilyData* cfd = cfh->cfd();
+ InternalStats* internal_stats_ptr = cfd->internal_stats();
+ ASSERT_NE(internal_stats_ptr, nullptr);
+
+ Random rnd(301);
+ for (auto i = 0; i < 8; ++i) {
+ for (auto j = 0; j < 10; ++j) {
+ ASSERT_OK(
+ Put("foo" + std::to_string(i * 10 + j), rnd.RandomString(1024)));
+ }
+ ASSERT_OK(Flush());
+ }
+
+ MoveFilesToLevel(2);
+
+ for (auto i = 0; i < 8; ++i) {
+ for (auto j = 0; j < 10; ++j) {
+ ASSERT_OK(
+ Put("bar" + std::to_string(i * 10 + j), rnd.RandomString(1024)));
+ }
+ ASSERT_OK(Flush());
+ }
+ const std::vector<InternalStats::CompactionStats>& comp_stats =
+ internal_stats_ptr->TEST_GetCompactionStats();
+ int num = comp_stats[2].num_input_files_in_output_level;
+ ASSERT_EQ(num, 0);
+
+ CompactRangeOptions cro;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForceOptimized;
+ ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr));
+
+ const std::vector<InternalStats::CompactionStats>& comp_stats2 =
+ internal_stats_ptr->TEST_GetCompactionStats();
+ num = comp_stats2[2].num_input_files_in_output_level;
+ ASSERT_EQ(num, 0);
+}
+
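+// Verify how max_compaction_bytes and target_file_size_base control the number
+// of compactions a full-range manual compaction is split into.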
+TEST_F(DBCompactionTest, ManualCompactionMax) {
+ uint64_t l1_avg_size = 0, l2_avg_size = 0;
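+  // Helper: fill L2 with ~100 files and L1 with ~10 files, then record the
+  // average compensated file size on each level.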
+ auto generate_sst_func = [&]() {
+ Random rnd(301);
+ for (auto i = 0; i < 100; i++) {
+ for (auto j = 0; j < 10; j++) {
+ ASSERT_OK(Put(Key(i * 10 + j), rnd.RandomString(1024)));
+ }
+ ASSERT_OK(Flush());
+ }
+ MoveFilesToLevel(2);
+
+ for (auto i = 0; i < 10; i++) {
+ for (auto j = 0; j < 10; j++) {
+ ASSERT_OK(Put(Key(i * 100 + j * 10), rnd.RandomString(1024)));
+ }
+ ASSERT_OK(Flush());
+ }
+ MoveFilesToLevel(1);
+
+ std::vector<std::vector<FileMetaData>> level_to_files;
+ dbfull()->TEST_GetFilesMetaData(dbfull()->DefaultColumnFamily(),
+ &level_to_files);
+
+ uint64_t total = 0;
+ for (const auto& file : level_to_files[1]) {
+ total += file.compensated_file_size;
+ }
+ l1_avg_size = total / level_to_files[1].size();
+
+ total = 0;
+ for (const auto& file : level_to_files[2]) {
+ total += file.compensated_file_size;
+ }
+ l2_avg_size = total / level_to_files[2].size();
+ };
+
+ std::atomic_int num_compactions(0);
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BGWorkCompaction", [&](void* /*arg*/) { ++num_compactions; });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ Options opts = CurrentOptions();
+ opts.disable_auto_compactions = true;
+
+  // With the default max_compaction_bytes (1.6G), the manual compaction should
+  // cover all files in a single compaction.
+ DestroyAndReopen(opts);
+ generate_sst_func();
+ num_compactions.store(0);
+ CompactRangeOptions cro;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ ASSERT_TRUE(num_compactions.load() == 1);
+
+  // Split the compaction into 5 compactions.
+ int num_split = 5;
+ DestroyAndReopen(opts);
+ generate_sst_func();
+ uint64_t total_size = (l1_avg_size * 10) + (l2_avg_size * 100);
+ opts.max_compaction_bytes = total_size / num_split;
+ opts.target_file_size_base = total_size / num_split;
+ Reopen(opts);
+ num_compactions.store(0);
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ ASSERT_TRUE(num_compactions.load() == num_split);
+
+  // With a very small max_compaction_bytes, compaction should still make
+  // progress.
+ opts.max_compaction_bytes = l1_avg_size / 2;
+ opts.target_file_size_base = l1_avg_size / 2;
+ DestroyAndReopen(opts);
+ generate_sst_func();
+ num_compactions.store(0);
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ ASSERT_TRUE(num_compactions.load() > 10);
+
+ // dynamically set the option
+ num_split = 2;
+ opts.max_compaction_bytes = 0;
+ DestroyAndReopen(opts);
+ generate_sst_func();
+ total_size = (l1_avg_size * 10) + (l2_avg_size * 100);
+ Status s = db_->SetOptions(
+ {{"max_compaction_bytes", std::to_string(total_size / num_split)},
+ {"target_file_size_base", std::to_string(total_size / num_split)}});
+ ASSERT_OK(s);
+
+ num_compactions.store(0);
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ ASSERT_TRUE(num_compactions.load() == num_split);
+}
+
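+// A compaction interrupted by shutdown should return either OK or
+// ShutdownInProgress and must not leave a background error behind.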
+TEST_F(DBCompactionTest, CompactionDuringShutdown) {
+ Options opts = CurrentOptions();
+ opts.level0_file_num_compaction_trigger = 2;
+ opts.disable_auto_compactions = true;
+ DestroyAndReopen(opts);
+ ColumnFamilyHandleImpl* cfh =
+ static_cast<ColumnFamilyHandleImpl*>(dbfull()->DefaultColumnFamily());
+ ColumnFamilyData* cfd = cfh->cfd();
+ InternalStats* internal_stats_ptr = cfd->internal_stats();
+ ASSERT_NE(internal_stats_ptr, nullptr);
+
+ Random rnd(301);
+ for (auto i = 0; i < 2; ++i) {
+ for (auto j = 0; j < 10; ++j) {
+ ASSERT_OK(
+ Put("foo" + std::to_string(i * 10 + j), rnd.RandomString(1024)));
+ }
+ ASSERT_OK(Flush());
+ }
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:NonTrivial:BeforeRun",
+ [&](void* /*arg*/) { dbfull()->shutting_down_.store(true); });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ Status s = dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+ ASSERT_TRUE(s.ok() || s.IsShutdownInProgress());
+ ASSERT_OK(dbfull()->error_handler_.GetBGError());
+}
+
+// FixFileIngestionCompactionDeadlock tests and verifies that compaction and
+// file ingestion do not cause deadlock in the event of write stall triggered
+// by number of L0 files reaching level0_stop_writes_trigger.
+TEST_P(DBCompactionTestWithParam, FixFileIngestionCompactionDeadlock) {
+ const int kNumKeysPerFile = 100;
+ // Generate SST files.
+ Options options = CurrentOptions();
+
+  // Generate an external SST file containing a single key, Key(99).
+ std::string sst_files_dir = dbname_ + "/sst_files/";
+ ASSERT_OK(DestroyDir(env_, sst_files_dir));
+ ASSERT_OK(env_->CreateDir(sst_files_dir));
+ SstFileWriter sst_writer(EnvOptions(), options);
+ const std::string sst_file_path = sst_files_dir + "test.sst";
+ ASSERT_OK(sst_writer.Open(sst_file_path));
+ ASSERT_OK(sst_writer.Put(Key(kNumKeysPerFile - 1), "value"));
+ ASSERT_OK(sst_writer.Finish());
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->LoadDependency({
+ {"DBImpl::IngestExternalFile:AfterIncIngestFileCounter",
+ "BackgroundCallCompaction:0"},
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ options.write_buffer_size = 110 << 10; // 110KB
+ options.level0_file_num_compaction_trigger =
+ options.level0_stop_writes_trigger;
+ options.max_subcompactions = max_subcompactions_;
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(kNumKeysPerFile));
+ DestroyAndReopen(options);
+ Random rnd(301);
+
+ // Generate level0_stop_writes_trigger L0 files to trigger write stop
+ for (int i = 0; i != options.level0_file_num_compaction_trigger; ++i) {
+ for (int j = 0; j != kNumKeysPerFile; ++j) {
+ ASSERT_OK(Put(Key(j), rnd.RandomString(990)));
+ }
+ if (i > 0) {
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_EQ(NumTableFilesAtLevel(0 /*level*/, 0 /*cf*/), i);
+ }
+ }
+  // When we reach this point, there will be level0_stop_writes_trigger L0
+  // files and one extra key (99) in memory, which overlaps with the external
+  // SST file. A write stall is triggered, and it can be cleared only after
+  // compaction reduces the number of L0 files.
+
+  // Compaction will also be triggered since we have reached the threshold for
+  // auto compaction. Note that the compaction may begin after the following
+  // file ingestion thread starts, and it waits for the ingestion to finish.
+
+  // Thread to ingest a file with a key range overlapping the current
+  // memtable. Consequently, ingestion will trigger a flush. The flush MUST
+  // proceed without waiting for the write stall condition to clear, otherwise
+  // a deadlock can happen.
+ port::Thread ingestion_thr([&]() {
+ IngestExternalFileOptions ifo;
+ Status s = db_->IngestExternalFile({sst_file_path}, ifo);
+ ASSERT_OK(s);
+ });
+
+  // Wait for the ingestion to finish, then for compaction to clear the stall.
+ ingestion_thr.join();
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ Close();
+}
+
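+// Swap two FileMetaData entries at the CheckConsistency sync point so the
+// second flush hits a corruption, after which further writes must fail.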
+TEST_F(DBCompactionTest, ConsistencyFailTest) {
+ Options options = CurrentOptions();
+ options.force_consistency_checks = true;
+ DestroyAndReopen(options);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "VersionBuilder::CheckConsistency0", [&](void* arg) {
+ auto p =
+ reinterpret_cast<std::pair<FileMetaData**, FileMetaData**>*>(arg);
+        // Swap the two FileMetaData entries so that we hit an error in the
+        // CheckConsistency function.
+ FileMetaData* temp = *(p->first);
+ *(p->first) = *(p->second);
+ *(p->second) = temp;
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ for (int k = 0; k < 2; ++k) {
+ ASSERT_OK(Put("foo", "bar"));
+ Status s = Flush();
+ if (k < 1) {
+ ASSERT_OK(s);
+ } else {
+ ASSERT_TRUE(s.IsCorruption());
+ }
+ }
+
+ ASSERT_NOK(Put("foo", "bar"));
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_F(DBCompactionTest, ConsistencyFailTest2) {
+ Options options = CurrentOptions();
+ options.force_consistency_checks = true;
+ options.target_file_size_base = 1000;
+ options.level0_file_num_compaction_trigger = 2;
+ BlockBasedTableOptions bbto;
+ bbto.block_size = 400; // small block size
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ DestroyAndReopen(options);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "VersionBuilder::CheckConsistency1", [&](void* arg) {
+ auto p =
+ reinterpret_cast<std::pair<FileMetaData**, FileMetaData**>*>(arg);
+        // Swap the two FileMetaData entries so that we hit an error in the
+        // CheckConsistency function.
+ FileMetaData* temp = *(p->first);
+ *(p->first) = *(p->second);
+ *(p->second) = temp;
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Random rnd(301);
+ std::string value = rnd.RandomString(1000);
+
+ ASSERT_OK(Put("foo1", value));
+ ASSERT_OK(Put("z", ""));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("foo2", value));
+ ASSERT_OK(Put("z", ""));
+ Status s = Flush();
+ ASSERT_TRUE(s.ok() || s.IsCorruption());
+
+  // This probably returns non-OK, but we rely on the next Put() to determine
+  // that the DB is frozen.
+ ASSERT_NOK(dbfull()->TEST_WaitForCompact());
+ ASSERT_NOK(Put("foo", "bar"));
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
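+// Helper: write a single key/value to an external SST file and ingest it into
+// `db`.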
+void IngestOneKeyValue(DBImpl* db, const std::string& key,
+ const std::string& value, const Options& options) {
+ ExternalSstFileInfo info;
+ std::string f = test::PerThreadDBPath("sst_file" + key);
+ EnvOptions env;
+ ROCKSDB_NAMESPACE::SstFileWriter writer(env, options);
+ auto s = writer.Open(f);
+ ASSERT_OK(s);
+ // ASSERT_OK(writer.Put(Key(), ""));
+ ASSERT_OK(writer.Put(key, value));
+
+ ASSERT_OK(writer.Finish(&info));
+ IngestExternalFileOptions ingest_opt;
+
+ ASSERT_OK(db->IngestExternalFile({info.file_path}, ingest_opt));
+}
+
+TEST_P(DBCompactionTestWithParam,
+ FlushAfterIntraL0CompactionCheckConsistencyFail) {
+ Options options = CurrentOptions();
+ options.force_consistency_checks = true;
+ options.compression = kNoCompression;
+ options.level0_file_num_compaction_trigger = 5;
+ options.max_background_compactions = 2;
+ options.max_subcompactions = max_subcompactions_;
+ DestroyAndReopen(options);
+
+ const size_t kValueSize = 1 << 20;
+ Random rnd(301);
+ std::atomic<int> pick_intra_l0_count(0);
+ std::string value(rnd.RandomString(kValueSize));
+
+  // The L0->L1 compaction must be picked before we begin ingesting files to
+  // trigger intra-L0 compaction, and it must not finish until after an
+  // intra-L0 compaction has been picked.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"LevelCompactionPicker::PickCompaction:Return",
+ "DBCompactionTestWithParam::"
+ "FlushAfterIntraL0CompactionCheckConsistencyFail:L0ToL1Ready"},
+ {"LevelCompactionPicker::PickCompactionBySize:0",
+ "CompactionJob::Run():Start"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "FindIntraL0Compaction",
+ [&](void* /*arg*/) { pick_intra_l0_count.fetch_add(1); });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  // Write keys 0..9 with empty values to prevent trivial move.
+  for (int i = 0; i < 10; ++i) {
+    ASSERT_OK(Put(Key(i), ""));
+ }
+ ASSERT_OK(Flush());
+ Compact("", Key(99));
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+
+  // Flush 5 L0 SSTs.
+ for (int i = 0; i < 5; ++i) {
+ ASSERT_OK(Put(Key(i + 1), value));
+ ASSERT_OK(Flush());
+ }
+ ASSERT_EQ(5, NumTableFilesAtLevel(0));
+
+  // Put one key so that the smallest sequence number in this memtable is less
+  // than that of the SSTs that will be ingested in the next step.
+ ASSERT_OK(Put(Key(0), "a"));
+
+ ASSERT_EQ(5, NumTableFilesAtLevel(0));
+ TEST_SYNC_POINT(
+ "DBCompactionTestWithParam::"
+ "FlushAfterIntraL0CompactionCheckConsistencyFail:L0ToL1Ready");
+
+  // Ingest 5 L0 SSTs. These files will trigger PickIntraL0Compaction.
+ for (int i = 5; i < 10; i++) {
+ ASSERT_EQ(i, NumTableFilesAtLevel(0));
+ IngestOneKeyValue(dbfull(), Key(i), value, options);
+ }
+
+  // Put one key so that the largest sequence number in this memtable is
+  // bigger than that of the ingested SSTs.
+ ASSERT_OK(Put(Key(2), "b"));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ std::vector<std::vector<FileMetaData>> level_to_files;
+ dbfull()->TEST_GetFilesMetaData(dbfull()->DefaultColumnFamily(),
+ &level_to_files);
+ ASSERT_GT(level_to_files[0].size(), 0);
+ ASSERT_GT(pick_intra_l0_count.load(), 0);
+
+ ASSERT_OK(Flush());
+}
+
+TEST_P(DBCompactionTestWithParam,
+ IntraL0CompactionAfterFlushCheckConsistencyFail) {
+ Options options = CurrentOptions();
+ options.force_consistency_checks = true;
+ options.compression = kNoCompression;
+ options.level0_file_num_compaction_trigger = 5;
+ options.max_background_compactions = 2;
+ options.max_subcompactions = max_subcompactions_;
+ options.write_buffer_size = 2 << 20;
+ options.max_write_buffer_number = 6;
+ DestroyAndReopen(options);
+
+ const size_t kValueSize = 1 << 20;
+ Random rnd(301);
+ std::string value(rnd.RandomString(kValueSize));
+ std::string value2(rnd.RandomString(kValueSize));
+ std::string bigvalue = value + value;
+
+  // Write keys 0..9 with empty values to prevent trivial move.
+  for (int i = 0; i < 10; ++i) {
+    ASSERT_OK(Put(Key(i), ""));
+ }
+ ASSERT_OK(Flush());
+ Compact("", Key(99));
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+
+ std::atomic<int> pick_intra_l0_count(0);
+  // The L0->L1 compaction must be picked before we begin ingesting files to
+  // trigger intra-L0 compaction, and it must not finish until after an
+  // intra-L0 compaction has been picked.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"LevelCompactionPicker::PickCompaction:Return",
+ "DBCompactionTestWithParam::"
+ "IntraL0CompactionAfterFlushCheckConsistencyFail:L0ToL1Ready"},
+ {"LevelCompactionPicker::PickCompactionBySize:0",
+ "CompactionJob::Run():Start"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "FindIntraL0Compaction",
+ [&](void* /*arg*/) { pick_intra_l0_count.fetch_add(1); });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  // Make 6 L0 SSTs, alternating between ingestion and flush.
+ for (int i = 0; i < 6; ++i) {
+ if (i % 2 == 0) {
+ IngestOneKeyValue(dbfull(), Key(i), value, options);
+ } else {
+ ASSERT_OK(Put(Key(i), value));
+ ASSERT_OK(Flush());
+ }
+ }
+
+ ASSERT_EQ(6, NumTableFilesAtLevel(0));
+
+  // Block the HIGH-priority background thread so the flush job cannot run yet.
+ env_->SetBackgroundThreads(1, Env::HIGH);
+ test::SleepingBackgroundTask sleeping_tasks;
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_tasks,
+ Env::Priority::HIGH);
+ sleeping_tasks.WaitUntilSleeping();
+
+  // Put enough data to make the memtable request a flush.
+ for (int i = 0; i < 6; ++i) {
+ ASSERT_OK(Put(Key(i), bigvalue));
+ }
+
+ ASSERT_EQ(6, NumTableFilesAtLevel(0));
+ TEST_SYNC_POINT(
+ "DBCompactionTestWithParam::"
+ "IntraL0CompactionAfterFlushCheckConsistencyFail:L0ToL1Ready");
+  // Ingest files to trigger intra-L0 compaction.
+ for (int i = 6; i < 10; ++i) {
+ ASSERT_EQ(i, NumTableFilesAtLevel(0));
+ IngestOneKeyValue(dbfull(), Key(i), value2, options);
+ }
+
+ // Wake up flush job
+ sleeping_tasks.WakeUp();
+ sleeping_tasks.WaitUntilDone();
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+ uint64_t error_count = 0;
+ db_->GetIntProperty("rocksdb.background-errors", &error_count);
+ ASSERT_EQ(error_count, 0);
+ ASSERT_GT(pick_intra_l0_count.load(), 0);
+ for (int i = 0; i < 6; ++i) {
+ ASSERT_EQ(bigvalue, Get(Key(i)));
+ }
+ for (int i = 6; i < 10; ++i) {
+ ASSERT_EQ(value2, Get(Key(i)));
+ }
+}
+
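+// With non-overlapping L0 files, only kForce/kForceOptimized should trigger a
+// real compaction into a single L1 file; other settings result in trivial
+// moves that keep one file per input.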
+TEST_P(DBCompactionTestWithBottommostParam, SequenceKeysManualCompaction) {
+ constexpr int kSstNum = 10;
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ DestroyAndReopen(options);
+
+  // Generate some SST files on level 0 with sequential, non-overlapping keys.
+ for (int i = 0; i < kSstNum; i++) {
+ for (int j = 1; j < UCHAR_MAX; j++) {
+ auto key = std::string(kSstNum, '\0');
+ key[kSstNum - i] += static_cast<char>(j);
+ ASSERT_OK(Put(key, std::string(i % 1000, 'A')));
+ }
+ ASSERT_OK(Flush());
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+
+ ASSERT_EQ(std::to_string(kSstNum), FilesPerLevel(0));
+
+ auto cro = CompactRangeOptions();
+ cro.bottommost_level_compaction = bottommost_level_compaction_;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ if (bottommost_level_compaction_ == BottommostLevelCompaction::kForce ||
+ bottommost_level_compaction_ ==
+ BottommostLevelCompaction::kForceOptimized) {
+    // A real compaction compacts all SST files from level 0 into one file on
+    // level 1.
+ ASSERT_EQ("0,1", FilesPerLevel(0));
+ } else {
+ // Just trivial move from level 0 -> 1
+ ASSERT_EQ("0," + std::to_string(kSstNum), FilesPerLevel(0));
+ }
+}
+
+INSTANTIATE_TEST_CASE_P(
+ DBCompactionTestWithBottommostParam, DBCompactionTestWithBottommostParam,
+ ::testing::Values(BottommostLevelCompaction::kSkip,
+ BottommostLevelCompaction::kIfHaveCompactionFilter,
+ BottommostLevelCompaction::kForce,
+ BottommostLevelCompaction::kForceOptimized));
+
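+// Changing max_subcompactions via SetDBOptions should take effect for
+// subsequently picked level compactions.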
+TEST_F(DBCompactionTest, UpdateLevelSubCompactionTest) {
+ Options options = CurrentOptions();
+ options.max_subcompactions = 10;
+ options.target_file_size_base = 1 << 10; // 1KB
+ DestroyAndReopen(options);
+
+ bool has_compaction = false;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
+ Compaction* compaction = reinterpret_cast<Compaction*>(arg);
+ ASSERT_TRUE(compaction->max_subcompactions() == 10);
+ has_compaction = true;
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_TRUE(dbfull()->GetDBOptions().max_subcompactions == 10);
+ // Trigger compaction
+ for (int i = 0; i < 32; i++) {
+ for (int j = 0; j < 5000; j++) {
+ ASSERT_OK(Put(std::to_string(j), std::string(1, 'A')));
+ }
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_TRUE(has_compaction);
+
+ has_compaction = false;
+ ASSERT_OK(dbfull()->SetDBOptions({{"max_subcompactions", "2"}}));
+ ASSERT_TRUE(dbfull()->GetDBOptions().max_subcompactions == 2);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
+ Compaction* compaction = reinterpret_cast<Compaction*>(arg);
+ ASSERT_TRUE(compaction->max_subcompactions() == 2);
+ has_compaction = true;
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // Trigger compaction
+ for (int i = 0; i < 32; i++) {
+ for (int j = 0; j < 5000; j++) {
+ ASSERT_OK(Put(std::to_string(j), std::string(1, 'A')));
+ }
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_TRUE(has_compaction);
+}
+
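+// Same as above, but for universal compaction.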
+TEST_F(DBCompactionTest, UpdateUniversalSubCompactionTest) {
+ Options options = CurrentOptions();
+ options.max_subcompactions = 10;
+ options.compaction_style = kCompactionStyleUniversal;
+ options.target_file_size_base = 1 << 10; // 1KB
+ DestroyAndReopen(options);
+
+ bool has_compaction = false;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "UniversalCompactionBuilder::PickCompaction:Return", [&](void* arg) {
+ Compaction* compaction = reinterpret_cast<Compaction*>(arg);
+ ASSERT_TRUE(compaction->max_subcompactions() == 10);
+ has_compaction = true;
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // Trigger compaction
+ for (int i = 0; i < 32; i++) {
+ for (int j = 0; j < 5000; j++) {
+ ASSERT_OK(Put(std::to_string(j), std::string(1, 'A')));
+ }
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_TRUE(has_compaction);
+ has_compaction = false;
+
+ ASSERT_OK(dbfull()->SetDBOptions({{"max_subcompactions", "2"}}));
+ ASSERT_TRUE(dbfull()->GetDBOptions().max_subcompactions == 2);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "UniversalCompactionBuilder::PickCompaction:Return", [&](void* arg) {
+ Compaction* compaction = reinterpret_cast<Compaction*>(arg);
+ ASSERT_TRUE(compaction->max_subcompactions() == 2);
+ has_compaction = true;
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // Trigger compaction
+ for (int i = 0; i < 32; i++) {
+ for (int j = 0; j < 5000; j++) {
+ ASSERT_OK(Put(std::to_string(j), std::string(1, 'A')));
+ }
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_TRUE(has_compaction);
+}
+
+TEST_P(ChangeLevelConflictsWithAuto, TestConflict) {
+  // A `CompactRange()` may race with an automatic compaction; we need to make
+  // sure it doesn't corrupt the data.
+ Options options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = 2;
+ Reopen(options);
+
+ ASSERT_OK(Put("foo", "v1"));
+ ASSERT_OK(Put("bar", "v1"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+
+ {
+ CompactRangeOptions cro;
+ cro.change_level = true;
+ cro.target_level = 2;
+ ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr));
+ }
+ ASSERT_EQ("0,0,1", FilesPerLevel(0));
+
+  // Run a refit to level 1 while another thread is writing to the same level.
+ SyncPoint::GetInstance()->LoadDependency({
+      // These dependencies ensure the background writes and automatic
+      // compaction happen between the two refit checkpoints.
+ {
+ "DBImpl::CompactRange:BeforeRefit:1",
+ "AutoCompactionFinished1",
+ },
+ {
+ "AutoCompactionFinished2",
+ "DBImpl::CompactRange:BeforeRefit:2",
+ },
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ std::thread auto_comp([&] {
+ TEST_SYNC_POINT("AutoCompactionFinished1");
+ ASSERT_OK(Put("bar", "v2"));
+ ASSERT_OK(Put("foo", "v2"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("bar", "v3"));
+ ASSERT_OK(Put("foo", "v3"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ TEST_SYNC_POINT("AutoCompactionFinished2");
+ });
+
+ {
+ CompactRangeOptions cro;
+ cro.change_level = true;
+ cro.target_level = GetParam() ? 1 : 0;
+ // This should return non-OK, but it's more important for the test to
+ // make sure that the DB is not corrupted.
+ ASSERT_NOK(dbfull()->CompactRange(cro, nullptr, nullptr));
+ }
+ auto_comp.join();
+ // Refitting didn't happen.
+ SyncPoint::GetInstance()->DisableProcessing();
+
+  // Write something to the DB to make sure the consistency check didn't fail
+  // and the DB is still readable.
+}
+
+INSTANTIATE_TEST_CASE_P(ChangeLevelConflictsWithAuto,
+ ChangeLevelConflictsWithAuto, testing::Bool());
+
+TEST_F(DBCompactionTest, ChangeLevelCompactRangeConflictsWithManual) {
+ // A `CompactRange()` with `change_level == true` needs to execute its final
+ // step, `ReFitLevel()`, in isolation. Previously there was a bug where
+ // refitting could target the same level as an ongoing manual compaction,
+ // leading to overlapping files in that level.
+ //
+ // This test ensures that case is not possible by verifying any manual
+ // compaction issued during the `ReFitLevel()` phase fails with
+ // `Status::Incomplete`.
+ Options options = CurrentOptions();
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(KNumKeysByGenerateNewFile - 1));
+ options.level0_file_num_compaction_trigger = 2;
+ options.num_levels = 3;
+ Reopen(options);
+
+ // Setup an LSM with three levels populated.
+ Random rnd(301);
+ int key_idx = 0;
+ GenerateNewFile(&rnd, &key_idx);
+ {
+ CompactRangeOptions cro;
+ cro.change_level = true;
+ cro.target_level = 2;
+ ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr));
+ }
+ ASSERT_EQ("0,0,2", FilesPerLevel(0));
+
+ GenerateNewFile(&rnd, &key_idx);
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,1,2", FilesPerLevel(0));
+
+ // The background thread will refit L2->L1 while the
+ // foreground thread will try to simultaneously compact L0->L1.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ // The first two dependencies ensure the foreground creates an L0 file
+ // between the background compaction's L0->L1 and its L1->L2.
+ {
+ "DBImpl::RunManualCompaction()::1",
+ "DBCompactionTest::ChangeLevelCompactRangeConflictsWithManual:"
+ "PutFG",
+ },
+ {
+ "DBCompactionTest::ChangeLevelCompactRangeConflictsWithManual:"
+ "FlushedFG",
+ "DBImpl::RunManualCompaction()::2",
+ },
+ // The next two dependencies ensure the foreground invokes
+ // `CompactRange()` while the background is refitting. The
+ // foreground's `CompactRange()` is guaranteed to attempt an L0->L1
+ // as we set it up with an empty memtable and a new L0 file.
+ {
+ "DBImpl::CompactRange:PreRefitLevel",
+ "DBCompactionTest::ChangeLevelCompactRangeConflictsWithManual:"
+ "CompactFG",
+ },
+ {
+ "DBCompactionTest::ChangeLevelCompactRangeConflictsWithManual:"
+ "CompactedFG",
+ "DBImpl::CompactRange:PostRefitLevel",
+ },
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ ROCKSDB_NAMESPACE::port::Thread refit_level_thread([&] {
+ CompactRangeOptions cro;
+ cro.change_level = true;
+ cro.target_level = 1;
+ ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr));
+ });
+
+ TEST_SYNC_POINT(
+ "DBCompactionTest::ChangeLevelCompactRangeConflictsWithManual:PutFG");
+ // Make sure we have something new to compact in the foreground.
+ // Note key 1 is carefully chosen as it ensures the file we create here
+ // overlaps with one of the files being refitted L2->L1 in the background.
+ // If we chose key 0, the file created here would not overlap.
+ ASSERT_OK(Put(Key(1), "val"));
+ ASSERT_OK(Flush());
+ TEST_SYNC_POINT(
+ "DBCompactionTest::ChangeLevelCompactRangeConflictsWithManual:FlushedFG");
+
+ TEST_SYNC_POINT(
+ "DBCompactionTest::ChangeLevelCompactRangeConflictsWithManual:CompactFG");
+ ASSERT_TRUE(dbfull()
+ ->CompactRange(CompactRangeOptions(), nullptr, nullptr)
+ .IsIncomplete());
+ TEST_SYNC_POINT(
+ "DBCompactionTest::ChangeLevelCompactRangeConflictsWithManual:"
+ "CompactedFG");
+ refit_level_thread.join();
+}
+
+TEST_F(DBCompactionTest, ChangeLevelErrorPathTest) {
+  // This test ensures that RefitLevel() error paths clear internal flags and
+  // that subsequent valid RefitLevel() calls succeed.
+ Options options = CurrentOptions();
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(KNumKeysByGenerateNewFile - 1));
+ options.level0_file_num_compaction_trigger = 2;
+ options.num_levels = 3;
+ Reopen(options);
+
+ ASSERT_EQ("", FilesPerLevel(0));
+
+ // Setup an LSM with three levels populated.
+ Random rnd(301);
+ int key_idx = 0;
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1", FilesPerLevel(0));
+ {
+ CompactRangeOptions cro;
+ cro.change_level = true;
+ cro.target_level = 2;
+ ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr));
+ }
+ ASSERT_EQ("0,0,2", FilesPerLevel(0));
+
+ auto start_idx = key_idx;
+ GenerateNewFile(&rnd, &key_idx);
+ GenerateNewFile(&rnd, &key_idx);
+ auto end_idx = key_idx - 1;
+ ASSERT_EQ("1,1,2", FilesPerLevel(0));
+
+  // The next two CompactRange() calls exercise error paths within RefitLevel()
+  // before a valid RefitLevel() call is triggered.
+
+ // Trigger a refit to L1 first
+ {
+ std::string begin_string = Key(start_idx);
+ std::string end_string = Key(end_idx);
+ Slice begin(begin_string);
+ Slice end(end_string);
+
+ CompactRangeOptions cro;
+ cro.change_level = true;
+ cro.target_level = 1;
+ ASSERT_OK(dbfull()->CompactRange(cro, &begin, &end));
+ }
+ ASSERT_EQ("0,3,2", FilesPerLevel(0));
+
+ // Try a refit from L2->L1 - this should fail and exercise error paths in
+ // RefitLevel()
+ {
+    // Select a key range that matches the bottommost level (L2).
+ std::string begin_string = Key(0);
+ std::string end_string = Key(start_idx - 1);
+ Slice begin(begin_string);
+ Slice end(end_string);
+
+ CompactRangeOptions cro;
+ cro.change_level = true;
+ cro.target_level = 1;
+ ASSERT_NOK(dbfull()->CompactRange(cro, &begin, &end));
+ }
+ ASSERT_EQ("0,3,2", FilesPerLevel(0));
+
+  // Try a valid refit request to ensure the path still works.
+ {
+ CompactRangeOptions cro;
+ cro.change_level = true;
+ cro.target_level = 1;
+ ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr));
+ }
+ ASSERT_EQ("0,5", FilesPerLevel(0));
+}
+
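+// Write three versions of two keys with blob files disabled, then enable blob
+// files and compact. The result should be one L1 table file referencing one
+// blob file that holds the latest values, and the compaction stats should
+// reflect the blob bytes written.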
+TEST_F(DBCompactionTest, CompactionWithBlob) {
+ Options options;
+ options.env = env_;
+ options.disable_auto_compactions = true;
+
+ Reopen(options);
+
+ constexpr char first_key[] = "first_key";
+ constexpr char second_key[] = "second_key";
+ constexpr char first_value[] = "first_value";
+ constexpr char second_value[] = "second_value";
+ constexpr char third_value[] = "third_value";
+
+ ASSERT_OK(Put(first_key, first_value));
+ ASSERT_OK(Put(second_key, first_value));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put(first_key, second_value));
+ ASSERT_OK(Put(second_key, second_value));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put(first_key, third_value));
+ ASSERT_OK(Put(second_key, third_value));
+ ASSERT_OK(Flush());
+
+ options.enable_blob_files = true;
+
+ Reopen(options);
+
+ constexpr Slice* begin = nullptr;
+ constexpr Slice* end = nullptr;
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), begin, end));
+
+ ASSERT_EQ(Get(first_key), third_value);
+ ASSERT_EQ(Get(second_key), third_value);
+
+ VersionSet* const versions = dbfull()->GetVersionSet();
+ assert(versions);
+
+ ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault();
+ ASSERT_NE(cfd, nullptr);
+
+ Version* const current = cfd->current();
+ ASSERT_NE(current, nullptr);
+
+ const VersionStorageInfo* const storage_info = current->storage_info();
+ ASSERT_NE(storage_info, nullptr);
+
+ const auto& l1_files = storage_info->LevelFiles(1);
+ ASSERT_EQ(l1_files.size(), 1);
+
+ const FileMetaData* const table_file = l1_files[0];
+ ASSERT_NE(table_file, nullptr);
+
+ const auto& blob_files = storage_info->GetBlobFiles();
+ ASSERT_EQ(blob_files.size(), 1);
+
+ const auto& blob_file = blob_files.front();
+ ASSERT_NE(blob_file, nullptr);
+
+ ASSERT_EQ(table_file->smallest.user_key(), first_key);
+ ASSERT_EQ(table_file->largest.user_key(), second_key);
+ ASSERT_EQ(table_file->oldest_blob_file_number,
+ blob_file->GetBlobFileNumber());
+
+ ASSERT_EQ(blob_file->GetTotalBlobCount(), 2);
+
+ const InternalStats* const internal_stats = cfd->internal_stats();
+ ASSERT_NE(internal_stats, nullptr);
+
+ const auto& compaction_stats = internal_stats->TEST_GetCompactionStats();
+ ASSERT_GE(compaction_stats.size(), 2);
+ ASSERT_EQ(compaction_stats[1].bytes_read_blob, 0);
+ ASSERT_EQ(compaction_stats[1].bytes_written, table_file->fd.GetFileSize());
+ ASSERT_EQ(compaction_stats[1].bytes_written_blob,
+ blob_file->GetTotalBlobBytes());
+ ASSERT_EQ(compaction_stats[1].num_output_files, 1);
+ ASSERT_EQ(compaction_stats[1].num_output_files_blob, 1);
+}
+
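+// Parameterized over the sync point at which a blob file write error is
+// injected: while adding a blob record vs. while appending the footer.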
+class DBCompactionTestBlobError
+ : public DBCompactionTest,
+ public testing::WithParamInterface<std::string> {
+ public:
+ DBCompactionTestBlobError() : sync_point_(GetParam()) {}
+
+ std::string sync_point_;
+};
+
+INSTANTIATE_TEST_CASE_P(DBCompactionTestBlobError, DBCompactionTestBlobError,
+ ::testing::ValuesIn(std::vector<std::string>{
+ "BlobFileBuilder::WriteBlobToFile:AddRecord",
+ "BlobFileBuilder::WriteBlobToFile:AppendFooter"}));
+
+TEST_P(DBCompactionTestBlobError, CompactionError) {
+ Options options;
+ options.disable_auto_compactions = true;
+ options.env = env_;
+
+ Reopen(options);
+
+ constexpr char first_key[] = "first_key";
+ constexpr char second_key[] = "second_key";
+ constexpr char first_value[] = "first_value";
+ constexpr char second_value[] = "second_value";
+ constexpr char third_value[] = "third_value";
+
+ ASSERT_OK(Put(first_key, first_value));
+ ASSERT_OK(Put(second_key, first_value));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put(first_key, second_value));
+ ASSERT_OK(Put(second_key, second_value));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put(first_key, third_value));
+ ASSERT_OK(Put(second_key, third_value));
+ ASSERT_OK(Flush());
+
+ options.enable_blob_files = true;
+
+ Reopen(options);
+
+ SyncPoint::GetInstance()->SetCallBack(sync_point_, [this](void* arg) {
+ Status* const s = static_cast<Status*>(arg);
+ assert(s);
+
+ (*s) = Status::IOError(sync_point_);
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ constexpr Slice* begin = nullptr;
+ constexpr Slice* end = nullptr;
+
+ ASSERT_TRUE(db_->CompactRange(CompactRangeOptions(), begin, end).IsIOError());
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+
+ VersionSet* const versions = dbfull()->GetVersionSet();
+ assert(versions);
+
+ ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault();
+ ASSERT_NE(cfd, nullptr);
+
+ Version* const current = cfd->current();
+ ASSERT_NE(current, nullptr);
+
+ const VersionStorageInfo* const storage_info = current->storage_info();
+ ASSERT_NE(storage_info, nullptr);
+
+ const auto& l1_files = storage_info->LevelFiles(1);
+ ASSERT_TRUE(l1_files.empty());
+
+ const auto& blob_files = storage_info->GetBlobFiles();
+ ASSERT_TRUE(blob_files.empty());
+
+ const InternalStats* const internal_stats = cfd->internal_stats();
+ ASSERT_NE(internal_stats, nullptr);
+
+ const auto& compaction_stats = internal_stats->TEST_GetCompactionStats();
+ ASSERT_GE(compaction_stats.size(), 2);
+
+ if (sync_point_ == "BlobFileBuilder::WriteBlobToFile:AddRecord") {
+ ASSERT_EQ(compaction_stats[1].bytes_read_blob, 0);
+ ASSERT_EQ(compaction_stats[1].bytes_written, 0);
+ ASSERT_EQ(compaction_stats[1].bytes_written_blob, 0);
+ ASSERT_EQ(compaction_stats[1].num_output_files, 0);
+ ASSERT_EQ(compaction_stats[1].num_output_files_blob, 0);
+ } else {
+ // SST file writing succeeded; blob file writing failed (during Finish)
+ ASSERT_EQ(compaction_stats[1].bytes_read_blob, 0);
+ ASSERT_GT(compaction_stats[1].bytes_written, 0);
+ ASSERT_EQ(compaction_stats[1].bytes_written_blob, 0);
+ ASSERT_EQ(compaction_stats[1].num_output_files, 1);
+ ASSERT_EQ(compaction_stats[1].num_output_files_blob, 0);
+ }
+}
+
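+// Parameterized over the blob GC age cutoff and whether blob files remain
+// enabled during the compaction.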
+class DBCompactionTestBlobGC
+ : public DBCompactionTest,
+ public testing::WithParamInterface<std::tuple<double, bool>> {
+ public:
+ DBCompactionTestBlobGC()
+ : blob_gc_age_cutoff_(std::get<0>(GetParam())),
+ updated_enable_blob_files_(std::get<1>(GetParam())) {}
+
+ double blob_gc_age_cutoff_;
+ bool updated_enable_blob_files_;
+};
+
+INSTANTIATE_TEST_CASE_P(DBCompactionTestBlobGC, DBCompactionTestBlobGC,
+ ::testing::Combine(::testing::Values(0.0, 0.5, 1.0),
+ ::testing::Bool()));
+
+TEST_P(DBCompactionTestBlobGC, CompactionWithBlobGCOverrides) {
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.enable_blob_files = true;
+ options.blob_file_size = 32; // one blob per file
+ options.enable_blob_garbage_collection = true;
+ options.blob_garbage_collection_age_cutoff = 0;
+
+ DestroyAndReopen(options);
+
+ for (int i = 0; i < 128; i += 2) {
+ ASSERT_OK(Put("key" + std::to_string(i), "value" + std::to_string(i)));
+ ASSERT_OK(
+ Put("key" + std::to_string(i + 1), "value" + std::to_string(i + 1)));
+ ASSERT_OK(Flush());
+ }
+
+ std::vector<uint64_t> original_blob_files = GetBlobFileNumbers();
+ ASSERT_EQ(original_blob_files.size(), 128);
+
+ // Note: turning off enable_blob_files before the compaction results in
+ // garbage collected values getting inlined.
+ ASSERT_OK(db_->SetOptions({{"enable_blob_files", "false"}}));
+
+ CompactRangeOptions cro;
+ cro.blob_garbage_collection_policy = BlobGarbageCollectionPolicy::kForce;
+ cro.blob_garbage_collection_age_cutoff = blob_gc_age_cutoff_;
+
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+ // Check that the GC stats are correct
+ {
+ VersionSet* const versions = dbfull()->GetVersionSet();
+ assert(versions);
+ assert(versions->GetColumnFamilySet());
+
+ ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault();
+ assert(cfd);
+
+ const InternalStats* const internal_stats = cfd->internal_stats();
+ assert(internal_stats);
+
+ const auto& compaction_stats = internal_stats->TEST_GetCompactionStats();
+ ASSERT_GE(compaction_stats.size(), 2);
+
+ ASSERT_GE(compaction_stats[1].bytes_read_blob, 0);
+ ASSERT_EQ(compaction_stats[1].bytes_written_blob, 0);
+ }
+
+ const size_t cutoff_index = static_cast<size_t>(
+ cro.blob_garbage_collection_age_cutoff * original_blob_files.size());
+ const size_t expected_num_files = original_blob_files.size() - cutoff_index;
+
+ const std::vector<uint64_t> new_blob_files = GetBlobFileNumbers();
+
+ ASSERT_EQ(new_blob_files.size(), expected_num_files);
+
+  // Original blob files below the cutoff should be gone; original blob files
+  // at or above the cutoff should still be there.
+ for (size_t i = cutoff_index; i < original_blob_files.size(); ++i) {
+ ASSERT_EQ(new_blob_files[i - cutoff_index], original_blob_files[i]);
+ }
+
+ for (size_t i = 0; i < 128; ++i) {
+ ASSERT_EQ(Get("key" + std::to_string(i)), "value" + std::to_string(i));
+ }
+}
+
+TEST_P(DBCompactionTestBlobGC, CompactionWithBlobGC) {
+ Options options;
+ options.env = env_;
+ options.disable_auto_compactions = true;
+ options.enable_blob_files = true;
+ options.blob_file_size = 32; // one blob per file
+ options.enable_blob_garbage_collection = true;
+ options.blob_garbage_collection_age_cutoff = blob_gc_age_cutoff_;
+
+ Reopen(options);
+
+ constexpr char first_key[] = "first_key";
+ constexpr char first_value[] = "first_value";
+ constexpr char second_key[] = "second_key";
+ constexpr char second_value[] = "second_value";
+
+ ASSERT_OK(Put(first_key, first_value));
+ ASSERT_OK(Put(second_key, second_value));
+ ASSERT_OK(Flush());
+
+ constexpr char third_key[] = "third_key";
+ constexpr char third_value[] = "third_value";
+ constexpr char fourth_key[] = "fourth_key";
+ constexpr char fourth_value[] = "fourth_value";
+
+ ASSERT_OK(Put(third_key, third_value));
+ ASSERT_OK(Put(fourth_key, fourth_value));
+ ASSERT_OK(Flush());
+
+ const std::vector<uint64_t> original_blob_files = GetBlobFileNumbers();
+
+ ASSERT_EQ(original_blob_files.size(), 4);
+
+ const size_t cutoff_index = static_cast<size_t>(
+ options.blob_garbage_collection_age_cutoff * original_blob_files.size());
+
+ // Note: turning off enable_blob_files before the compaction results in
+ // garbage collected values getting inlined.
+ size_t expected_number_of_files = original_blob_files.size();
+
+ if (!updated_enable_blob_files_) {
+ ASSERT_OK(db_->SetOptions({{"enable_blob_files", "false"}}));
+
+ expected_number_of_files -= cutoff_index;
+ }
+
+ constexpr Slice* begin = nullptr;
+ constexpr Slice* end = nullptr;
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), begin, end));
+
+ ASSERT_EQ(Get(first_key), first_value);
+ ASSERT_EQ(Get(second_key), second_value);
+ ASSERT_EQ(Get(third_key), third_value);
+ ASSERT_EQ(Get(fourth_key), fourth_value);
+
+ const std::vector<uint64_t> new_blob_files = GetBlobFileNumbers();
+
+ ASSERT_EQ(new_blob_files.size(), expected_number_of_files);
+
+  // Original blob files below the cutoff should be gone; original blob files
+  // at or above the cutoff should still be there.
+ for (size_t i = cutoff_index; i < original_blob_files.size(); ++i) {
+ ASSERT_EQ(new_blob_files[i - cutoff_index], original_blob_files[i]);
+ }
+
+ VersionSet* const versions = dbfull()->GetVersionSet();
+ assert(versions);
+ assert(versions->GetColumnFamilySet());
+
+ ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault();
+ assert(cfd);
+
+ const InternalStats* const internal_stats = cfd->internal_stats();
+ assert(internal_stats);
+
+ const auto& compaction_stats = internal_stats->TEST_GetCompactionStats();
+ ASSERT_GE(compaction_stats.size(), 2);
+
+ if (blob_gc_age_cutoff_ > 0.0) {
+ ASSERT_GT(compaction_stats[1].bytes_read_blob, 0);
+
+ if (updated_enable_blob_files_) {
+ // GC relocated some blobs to new blob files
+ ASSERT_GT(compaction_stats[1].bytes_written_blob, 0);
+ ASSERT_EQ(compaction_stats[1].bytes_read_blob,
+ compaction_stats[1].bytes_written_blob);
+ } else {
+ // GC moved some blobs back to the LSM, no new blob files
+ ASSERT_EQ(compaction_stats[1].bytes_written_blob, 0);
+ }
+ } else {
+ ASSERT_EQ(compaction_stats[1].bytes_read_blob, 0);
+ ASSERT_EQ(compaction_stats[1].bytes_written_blob, 0);
+ }
+}
+
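+// Tampering with a blob index during GC should surface as a Corruption status
+// from the compaction.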
+TEST_F(DBCompactionTest, CompactionWithBlobGCError_CorruptIndex) {
+ Options options;
+ options.env = env_;
+ options.disable_auto_compactions = true;
+ options.enable_blob_files = true;
+ options.enable_blob_garbage_collection = true;
+ options.blob_garbage_collection_age_cutoff = 1.0;
+
+ Reopen(options);
+
+ constexpr char first_key[] = "first_key";
+ constexpr char first_value[] = "first_value";
+ ASSERT_OK(Put(first_key, first_value));
+
+ constexpr char second_key[] = "second_key";
+ constexpr char second_value[] = "second_value";
+ ASSERT_OK(Put(second_key, second_value));
+
+ ASSERT_OK(Flush());
+
+ constexpr char third_key[] = "third_key";
+ constexpr char third_value[] = "third_value";
+ ASSERT_OK(Put(third_key, third_value));
+
+ constexpr char fourth_key[] = "fourth_key";
+ constexpr char fourth_value[] = "fourth_value";
+ ASSERT_OK(Put(fourth_key, fourth_value));
+
+ ASSERT_OK(Flush());
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "CompactionIterator::GarbageCollectBlobIfNeeded::TamperWithBlobIndex",
+ [](void* arg) {
+ Slice* const blob_index = static_cast<Slice*>(arg);
+ assert(blob_index);
+ assert(!blob_index->empty());
+ blob_index->remove_prefix(1);
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ constexpr Slice* begin = nullptr;
+ constexpr Slice* end = nullptr;
+
+ ASSERT_TRUE(
+ db_->CompactRange(CompactRangeOptions(), begin, end).IsCorruption());
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
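+// A faked inlined TTL blob index encountered during GC should cause the
+// compaction to fail with Corruption.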
+TEST_F(DBCompactionTest, CompactionWithBlobGCError_InlinedTTLIndex) {
+ constexpr uint64_t min_blob_size = 10;
+
+ Options options;
+ options.env = env_;
+ options.disable_auto_compactions = true;
+ options.enable_blob_files = true;
+ options.min_blob_size = min_blob_size;
+ options.enable_blob_garbage_collection = true;
+ options.blob_garbage_collection_age_cutoff = 1.0;
+
+ Reopen(options);
+
+ constexpr char first_key[] = "first_key";
+ constexpr char first_value[] = "first_value";
+ ASSERT_OK(Put(first_key, first_value));
+
+ constexpr char second_key[] = "second_key";
+ constexpr char second_value[] = "second_value";
+ ASSERT_OK(Put(second_key, second_value));
+
+ ASSERT_OK(Flush());
+
+ constexpr char third_key[] = "third_key";
+ constexpr char third_value[] = "third_value";
+ ASSERT_OK(Put(third_key, third_value));
+
+ constexpr char fourth_key[] = "fourth_key";
+ constexpr char blob[] = "short";
+  static_assert(sizeof(blob) - 1 < min_blob_size,
+                "Blob too long to be inlined");
+
+ // Fake an inlined TTL blob index.
+ std::string blob_index;
+
+ constexpr uint64_t expiration = 1234567890;
+
+ BlobIndex::EncodeInlinedTTL(&blob_index, expiration, blob);
+
+ WriteBatch batch;
+ ASSERT_OK(
+ WriteBatchInternal::PutBlobIndex(&batch, 0, fourth_key, blob_index));
+ ASSERT_OK(db_->Write(WriteOptions(), &batch));
+
+ ASSERT_OK(Flush());
+
+ constexpr Slice* begin = nullptr;
+ constexpr Slice* end = nullptr;
+
+ ASSERT_TRUE(
+ db_->CompactRange(CompactRangeOptions(), begin, end).IsCorruption());
+}
+
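+// A blob index referencing a nonexistent blob file should cause the GC
+// compaction to fail with Corruption.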
+TEST_F(DBCompactionTest, CompactionWithBlobGCError_IndexWithInvalidFileNumber) {
+ Options options;
+ options.env = env_;
+ options.disable_auto_compactions = true;
+ options.enable_blob_files = true;
+ options.enable_blob_garbage_collection = true;
+ options.blob_garbage_collection_age_cutoff = 1.0;
+
+ Reopen(options);
+
+ constexpr char first_key[] = "first_key";
+ constexpr char first_value[] = "first_value";
+ ASSERT_OK(Put(first_key, first_value));
+
+ constexpr char second_key[] = "second_key";
+ constexpr char second_value[] = "second_value";
+ ASSERT_OK(Put(second_key, second_value));
+
+ ASSERT_OK(Flush());
+
+ constexpr char third_key[] = "third_key";
+ constexpr char third_value[] = "third_value";
+ ASSERT_OK(Put(third_key, third_value));
+
+ constexpr char fourth_key[] = "fourth_key";
+
+ // Fake a blob index referencing a non-existent blob file.
+ std::string blob_index;
+
+ constexpr uint64_t blob_file_number = 1000;
+ constexpr uint64_t offset = 1234;
+ constexpr uint64_t size = 5678;
+
+ BlobIndex::EncodeBlob(&blob_index, blob_file_number, offset, size,
+ kNoCompression);
+
+ WriteBatch batch;
+ ASSERT_OK(
+ WriteBatchInternal::PutBlobIndex(&batch, 0, fourth_key, blob_index));
+ ASSERT_OK(db_->Write(WriteOptions(), &batch));
+
+ ASSERT_OK(Flush());
+
+ constexpr Slice* begin = nullptr;
+ constexpr Slice* end = nullptr;
+
+ ASSERT_TRUE(
+ db_->CompactRange(CompactRangeOptions(), begin, end).IsCorruption());
+}
+
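+// Checksum handoff for table files: a checksum mismatch or a corrupted write
+// during compaction is treated as an unrecoverable error, while a file system
+// without handoff support is simply ignored.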
+TEST_F(DBCompactionTest, CompactionWithChecksumHandoff1) {
+ if (mem_env_ || encrypted_env_) {
+    ROCKSDB_GTEST_SKIP("Test requires a non-mem, non-encrypted environment");
+ return;
+ }
+ std::shared_ptr<FaultInjectionTestFS> fault_fs(
+ new FaultInjectionTestFS(FileSystem::Default()));
+ std::unique_ptr<Env> fault_fs_env(NewCompositeEnv(fault_fs));
+ Options options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = 2;
+ options.num_levels = 3;
+ options.env = fault_fs_env.get();
+ options.create_if_missing = true;
+ options.checksum_handoff_file_types.Add(FileType::kTableFile);
+ Status s;
+ Reopen(options);
+
+ fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c);
+ ASSERT_OK(Put(Key(0), "value1"));
+ ASSERT_OK(Put(Key(2), "value2"));
+ s = Flush();
+ ASSERT_EQ(s, Status::OK());
+ ASSERT_OK(Put(Key(1), "value3"));
+ s = Flush();
+ ASSERT_EQ(s, Status::OK());
+ s = dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ(s, Status::OK());
+ Destroy(options);
+ Reopen(options);
+
+  // The hash does not match (the checksum type is switched to kxxHash in the
+  // BackgroundCallCompaction callback below), so the compaction write fails.
+  // Since the file system returns IOStatus::Corruption, it is an
+  // unrecoverable error.
+ ASSERT_OK(Put(Key(0), "value1"));
+ ASSERT_OK(Put(Key(2), "value2"));
+ s = Flush();
+ ASSERT_EQ(s, Status::OK());
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::FlushMemTable:FlushMemTableFinished",
+ "BackgroundCallCompaction:0"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "BackgroundCallCompaction:0", [&](void*) {
+ fault_fs->SetChecksumHandoffFuncType(ChecksumType::kxxHash);
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ ASSERT_OK(Put(Key(1), "value3"));
+ s = Flush();
+ ASSERT_EQ(s, Status::OK());
+ s = dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ(s.severity(),
+ ROCKSDB_NAMESPACE::Status::Severity::kUnrecoverableError);
+ SyncPoint::GetInstance()->DisableProcessing();
+ Destroy(options);
+ Reopen(options);
+
+ // The file system does not support checksum handoff. The check
+ // will be ignored.
+ fault_fs->SetChecksumHandoffFuncType(ChecksumType::kNoChecksum);
+ ASSERT_OK(Put(Key(0), "value1"));
+ ASSERT_OK(Put(Key(2), "value2"));
+ s = Flush();
+ ASSERT_EQ(s, Status::OK());
+ ASSERT_OK(Put(Key(1), "value3"));
+ s = Flush();
+ ASSERT_EQ(s, Status::OK());
+ s = dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ(s, Status::OK());
+
+  // Each write will be simulated as corrupted.
+  // Since the file system returns IOStatus::Corruption, it is an
+  // unrecoverable error.
+ fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c);
+ ASSERT_OK(Put(Key(0), "value1"));
+ ASSERT_OK(Put(Key(2), "value2"));
+ s = Flush();
+ ASSERT_EQ(s, Status::OK());
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::FlushMemTable:FlushMemTableFinished",
+ "BackgroundCallCompaction:0"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "BackgroundCallCompaction:0",
+ [&](void*) { fault_fs->IngestDataCorruptionBeforeWrite(); });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ ASSERT_OK(Put(Key(1), "value3"));
+ s = Flush();
+ ASSERT_EQ(s, Status::OK());
+ s = dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ(s.severity(),
+ ROCKSDB_NAMESPACE::Status::Severity::kUnrecoverableError);
+ SyncPoint::GetInstance()->DisableProcessing();
+
+ Destroy(options);
+}
+
+TEST_F(DBCompactionTest, CompactionWithChecksumHandoff2) {
+ if (mem_env_ || encrypted_env_) {
+    ROCKSDB_GTEST_SKIP("Test requires a non-mem, non-encrypted environment");
+ return;
+ }
+ std::shared_ptr<FaultInjectionTestFS> fault_fs(
+ new FaultInjectionTestFS(FileSystem::Default()));
+ std::unique_ptr<Env> fault_fs_env(NewCompositeEnv(fault_fs));
+ Options options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = 2;
+ options.num_levels = 3;
+ options.env = fault_fs_env.get();
+ options.create_if_missing = true;
+ Status s;
+ Reopen(options);
+
+ fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c);
+ ASSERT_OK(Put(Key(0), "value1"));
+ ASSERT_OK(Put(Key(2), "value2"));
+ s = Flush();
+ ASSERT_EQ(s, Status::OK());
+ ASSERT_OK(Put(Key(1), "value3"));
+ s = Flush();
+ ASSERT_EQ(s, Status::OK());
+ s = dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ(s, Status::OK());
+ Destroy(options);
+ Reopen(options);
+
+  // checksum_handoff_file_types is not set, so checksum handoff will not be
+  // triggered.
+ ASSERT_OK(Put(Key(0), "value1"));
+ ASSERT_OK(Put(Key(2), "value2"));
+ s = Flush();
+ ASSERT_EQ(s, Status::OK());
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::FlushMemTable:FlushMemTableFinished",
+ "BackgroundCallCompaction:0"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "BackgroundCallCompaction:0", [&](void*) {
+ fault_fs->SetChecksumHandoffFuncType(ChecksumType::kxxHash);
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ ASSERT_OK(Put(Key(1), "value3"));
+ s = Flush();
+ ASSERT_EQ(s, Status::OK());
+ s = dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ(s, Status::OK());
+ SyncPoint::GetInstance()->DisableProcessing();
+ Destroy(options);
+ Reopen(options);
+
+ // The file system does not support checksum handoff. The check
+ // will be ignored.
+ fault_fs->SetChecksumHandoffFuncType(ChecksumType::kNoChecksum);
+ ASSERT_OK(Put(Key(0), "value1"));
+ ASSERT_OK(Put(Key(2), "value2"));
+ s = Flush();
+ ASSERT_EQ(s, Status::OK());
+ ASSERT_OK(Put(Key(1), "value3"));
+ s = Flush();
+ ASSERT_EQ(s, Status::OK());
+ s = dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ(s, Status::OK());
+
+  // checksum_handoff_file_types is not set, so checksum handoff will not be
+  // triggered.
+ fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c);
+ ASSERT_OK(Put(Key(0), "value1"));
+ ASSERT_OK(Put(Key(2), "value2"));
+ s = Flush();
+ ASSERT_EQ(s, Status::OK());
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::FlushMemTable:FlushMemTableFinished",
+ "BackgroundCallCompaction:0"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "BackgroundCallCompaction:0",
+ [&](void*) { fault_fs->IngestDataCorruptionBeforeWrite(); });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ ASSERT_OK(Put(Key(1), "value3"));
+ s = Flush();
+ ASSERT_EQ(s, Status::OK());
+ s = dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ(s, Status::OK());
+
+ Destroy(options);
+}
+
+TEST_F(DBCompactionTest, CompactionWithChecksumHandoffManifest1) {
+ if (mem_env_ || encrypted_env_) {
+    ROCKSDB_GTEST_SKIP("Test requires a non-mem, non-encrypted environment");
+ return;
+ }
+ std::shared_ptr<FaultInjectionTestFS> fault_fs(
+ new FaultInjectionTestFS(FileSystem::Default()));
+ std::unique_ptr<Env> fault_fs_env(NewCompositeEnv(fault_fs));
+ Options options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = 2;
+ options.num_levels = 3;
+ options.env = fault_fs_env.get();
+ options.create_if_missing = true;
+ options.checksum_handoff_file_types.Add(FileType::kDescriptorFile);
+ Status s;
+ fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c);
+ Reopen(options);
+
+ ASSERT_OK(Put(Key(0), "value1"));
+ ASSERT_OK(Put(Key(2), "value2"));
+ s = Flush();
+ ASSERT_EQ(s, Status::OK());
+ ASSERT_OK(Put(Key(1), "value3"));
+ s = Flush();
+ ASSERT_EQ(s, Status::OK());
+ s = dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ(s, Status::OK());
+ Destroy(options);
+ Reopen(options);
+
+  // The hash does not match (the checksum type is switched to kxxHash in the
+  // BackgroundCallCompaction callback below), so the compaction write fails.
+  // Since the file system returns IOStatus::Corruption, it is mapped to a
+  // kFatalError.
+ ASSERT_OK(Put(Key(0), "value1"));
+ ASSERT_OK(Put(Key(2), "value2"));
+ s = Flush();
+ ASSERT_EQ(s, Status::OK());
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::FlushMemTable:FlushMemTableFinished",
+ "BackgroundCallCompaction:0"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "BackgroundCallCompaction:0", [&](void*) {
+ fault_fs->SetChecksumHandoffFuncType(ChecksumType::kxxHash);
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ ASSERT_OK(Put(Key(1), "value3"));
+ s = Flush();
+ ASSERT_EQ(s, Status::OK());
+ s = dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kFatalError);
+ SyncPoint::GetInstance()->DisableProcessing();
+ Destroy(options);
+}
+
+TEST_F(DBCompactionTest, CompactionWithChecksumHandoffManifest2) {
+ if (mem_env_ || encrypted_env_) {
+ ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment");
+ return;
+ }
+ std::shared_ptr<FaultInjectionTestFS> fault_fs(
+ new FaultInjectionTestFS(FileSystem::Default()));
+ std::unique_ptr<Env> fault_fs_env(NewCompositeEnv(fault_fs));
+ Options options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = 2;
+ options.num_levels = 3;
+ options.env = fault_fs_env.get();
+ options.create_if_missing = true;
+ options.checksum_handoff_file_types.Add(FileType::kDescriptorFile);
+ Status s;
+ fault_fs->SetChecksumHandoffFuncType(ChecksumType::kNoChecksum);
+ Reopen(options);
+
+ // The file system does not support checksum handoff. The check
+ // will be ignored.
+ ASSERT_OK(Put(Key(0), "value1"));
+ ASSERT_OK(Put(Key(2), "value2"));
+ s = Flush();
+ ASSERT_EQ(s, Status::OK());
+ ASSERT_OK(Put(Key(1), "value3"));
+ s = Flush();
+ ASSERT_EQ(s, Status::OK());
+ s = dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ(s, Status::OK());
+
+  // Each write will be simulated as corrupted.
+  // Since the file system returns IOStatus::Corruption, it is mapped to a
+  // kFatalError.
+ fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c);
+ ASSERT_OK(Put(Key(0), "value1"));
+ ASSERT_OK(Put(Key(2), "value2"));
+ s = Flush();
+ ASSERT_EQ(s, Status::OK());
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::FlushMemTable:FlushMemTableFinished",
+ "BackgroundCallCompaction:0"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "BackgroundCallCompaction:0",
+ [&](void*) { fault_fs->IngestDataCorruptionBeforeWrite(); });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ ASSERT_OK(Put(Key(1), "value3"));
+ s = Flush();
+ ASSERT_EQ(s, Status::OK());
+ s = dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kFatalError);
+ SyncPoint::GetInstance()->DisableProcessing();
+
+ Destroy(options);
+}
+
+TEST_F(DBCompactionTest, FIFOWarm) {
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleFIFO;
+ options.num_levels = 1;
+ options.max_open_files = -1;
+ options.level0_file_num_compaction_trigger = 2;
+ options.create_if_missing = true;
+ CompactionOptionsFIFO fifo_options;
+ fifo_options.age_for_warm = 1000;
+ fifo_options.max_table_files_size = 100000000;
+ options.compaction_options_fifo = fifo_options;
+ env_->SetMockSleep();
+ Reopen(options);
+
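+  // Count how many newly created files are opened with the warm temperature,
+  // via the FileOptions.temperature sync point callback.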
+ int total_warm = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "NewWritableFile::FileOptions.temperature", [&](void* arg) {
+ Temperature temperature = *(static_cast<Temperature*>(arg));
+ if (temperature == Temperature::kWarm) {
+ total_warm++;
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  // Write files with mocked time advancing between them so FIFO compaction
+  // can move the older files to the warm tier based on age_for_warm.
+ ASSERT_OK(Put(Key(0), "value1"));
+ env_->MockSleepForSeconds(800);
+ ASSERT_OK(Put(Key(2), "value2"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put(Key(0), "value1"));
+ env_->MockSleepForSeconds(800);
+ ASSERT_OK(Put(Key(2), "value2"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put(Key(0), "value1"));
+ env_->MockSleepForSeconds(800);
+ ASSERT_OK(Put(Key(2), "value2"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ ASSERT_OK(Put(Key(0), "value1"));
+ env_->MockSleepForSeconds(800);
+ ASSERT_OK(Put(Key(2), "value2"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
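+  // The two newer files should still have the default (unknown) temperature,
+  // while the two older files should have been moved to warm after aging past
+  // age_for_warm.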
+ ColumnFamilyMetaData metadata;
+ db_->GetColumnFamilyMetaData(&metadata);
+ ASSERT_EQ(4, metadata.file_count);
+ ASSERT_EQ(Temperature::kUnknown, metadata.levels[0].files[0].temperature);
+ ASSERT_EQ(Temperature::kUnknown, metadata.levels[0].files[1].temperature);
+ ASSERT_EQ(Temperature::kWarm, metadata.levels[0].files[2].temperature);
+ ASSERT_EQ(Temperature::kWarm, metadata.levels[0].files[3].temperature);
+ ASSERT_EQ(2, total_warm);
+
+ Destroy(options);
+}
+
+TEST_F(DBCompactionTest, DisableMultiManualCompaction) {
+ const int kNumL0Files = 10;
+
+ Options options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = kNumL0Files;
+ Reopen(options);
+
+  // Generate 2 levels of files to make sure the manual compaction is not
+  // skipped.
+ for (int i = 0; i < 10; i++) {
+ ASSERT_OK(Put(Key(i), "value"));
+ if (i % 2) {
+ ASSERT_OK(Flush());
+ }
+ }
+ MoveFilesToLevel(2);
+
+ for (int i = 0; i < 10; i++) {
+ ASSERT_OK(Put(Key(i), "value"));
+ if (i % 2) {
+ ASSERT_OK(Flush());
+ }
+ }
+ MoveFilesToLevel(1);
+
+ // Block compaction queue
+ test::SleepingBackgroundTask sleeping_task_low;
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+ Env::Priority::LOW);
+
+ port::Thread compact_thread1([&]() {
+ CompactRangeOptions cro;
+ cro.exclusive_manual_compaction = false;
+ std::string begin_str = Key(0);
+ std::string end_str = Key(3);
+ Slice b = begin_str;
+ Slice e = end_str;
+ auto s = db_->CompactRange(cro, &b, &e);
+ ASSERT_TRUE(s.IsIncomplete());
+ });
+
+ port::Thread compact_thread2([&]() {
+ CompactRangeOptions cro;
+ cro.exclusive_manual_compaction = false;
+ std::string begin_str = Key(4);
+ std::string end_str = Key(7);
+ Slice b = begin_str;
+ Slice e = end_str;
+ auto s = db_->CompactRange(cro, &b, &e);
+ ASSERT_TRUE(s.IsIncomplete());
+ });
+
+  // Disabling manual compaction should cancel both manual compactions, and
+  // both compactions should return Incomplete.
+ db_->DisableManualCompaction();
+
+ compact_thread1.join();
+ compact_thread2.join();
+
+ sleeping_task_low.WakeUp();
+ sleeping_task_low.WaitUntilDone();
+ ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+}
+
+TEST_F(DBCompactionTest, DisableJustStartedManualCompaction) {
+ const int kNumL0Files = 4;
+
+ Options options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = kNumL0Files;
+ Reopen(options);
+
+  // Generate files, but avoid triggering auto compaction.
+ for (int i = 0; i < kNumL0Files / 2; i++) {
+ ASSERT_OK(Put(Key(1), "value1"));
+ ASSERT_OK(Put(Key(2), "value2"));
+ ASSERT_OK(Flush());
+ }
+
+  // Make sure the manual compaction background job has started but has not
+  // yet set its status to in_progress, then cancel the manual compaction,
+  // which should not result in a segfault.
+ SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::BGWorkCompaction",
+ "DBCompactionTest::DisableJustStartedManualCompaction:"
+ "PreDisableManualCompaction"},
+ {"DBImpl::RunManualCompaction:Unscheduled",
+ "BackgroundCallCompaction:0"}});
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ port::Thread compact_thread([&]() {
+ CompactRangeOptions cro;
+ cro.exclusive_manual_compaction = true;
+ auto s = db_->CompactRange(cro, nullptr, nullptr);
+ ASSERT_TRUE(s.IsIncomplete());
+ });
+ TEST_SYNC_POINT(
+ "DBCompactionTest::DisableJustStartedManualCompaction:"
+ "PreDisableManualCompaction");
+ db_->DisableManualCompaction();
+
+ compact_thread.join();
+}
+
+TEST_F(DBCompactionTest, DisableInProgressManualCompaction) {
+ const int kNumL0Files = 4;
+
+ Options options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = kNumL0Files;
+ Reopen(options);
+
+ SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::BackgroundCompaction:InProgress",
+ "DBCompactionTest::DisableInProgressManualCompaction:"
+ "PreDisableManualCompaction"},
+ {"DBImpl::RunManualCompaction:Unscheduled",
+ "CompactionJob::Run():Start"}});
+ SyncPoint::GetInstance()->EnableProcessing();
+
+  // Generate files, but avoid triggering auto compaction.
+ for (int i = 0; i < kNumL0Files / 2; i++) {
+ ASSERT_OK(Put(Key(1), "value1"));
+ ASSERT_OK(Put(Key(2), "value2"));
+ ASSERT_OK(Flush());
+ }
+
+ port::Thread compact_thread([&]() {
+ CompactRangeOptions cro;
+ cro.exclusive_manual_compaction = true;
+ auto s = db_->CompactRange(cro, nullptr, nullptr);
+ ASSERT_TRUE(s.IsIncomplete());
+ });
+
+ TEST_SYNC_POINT(
+ "DBCompactionTest::DisableInProgressManualCompaction:"
+ "PreDisableManualCompaction");
+ db_->DisableManualCompaction();
+
+ compact_thread.join();
+}
+
+TEST_F(DBCompactionTest, DisableManualCompactionThreadQueueFull) {
+ const int kNumL0Files = 4;
+
+ SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::RunManualCompaction:Scheduled",
+ "DBCompactionTest::DisableManualCompactionThreadQueueFull:"
+ "PreDisableManualCompaction"}});
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ Options options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = kNumL0Files;
+ Reopen(options);
+
+ // Block compaction queue
+ test::SleepingBackgroundTask sleeping_task_low;
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+ Env::Priority::LOW);
+
+  // Generate files, but avoid triggering auto compaction.
+ for (int i = 0; i < kNumL0Files / 2; i++) {
+ ASSERT_OK(Put(Key(1), "value1"));
+ ASSERT_OK(Put(Key(2), "value2"));
+ ASSERT_OK(Flush());
+ }
+
+ port::Thread compact_thread([&]() {
+ CompactRangeOptions cro;
+ cro.exclusive_manual_compaction = true;
+ auto s = db_->CompactRange(cro, nullptr, nullptr);
+ ASSERT_TRUE(s.IsIncomplete());
+ });
+
+ TEST_SYNC_POINT(
+ "DBCompactionTest::DisableManualCompactionThreadQueueFull:"
+ "PreDisableManualCompaction");
+
+  // Generate more files to trigger auto compaction, which is scheduled after
+  // the manual compaction. We have to generate 4 more files because the
+  // existing files are already pending compaction.
+ for (int i = 0; i < kNumL0Files; i++) {
+ ASSERT_OK(Put(Key(1), "value1"));
+ ASSERT_OK(Put(Key(2), "value2"));
+ ASSERT_OK(Flush());
+ }
+ ASSERT_EQ(std::to_string(kNumL0Files + (kNumL0Files / 2)), FilesPerLevel(0));
+
+ db_->DisableManualCompaction();
+
+ // CompactRange should return before the compaction has the chance to run
+ compact_thread.join();
+
+ sleeping_task_low.WakeUp();
+ sleeping_task_low.WaitUntilDone();
+ ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+ ASSERT_EQ("0,1", FilesPerLevel(0));
+}
+
+TEST_F(DBCompactionTest, DisableManualCompactionThreadQueueFullDBClose) {
+ const int kNumL0Files = 4;
+
+ SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::RunManualCompaction:Scheduled",
+ "DBCompactionTest::DisableManualCompactionThreadQueueFullDBClose:"
+ "PreDisableManualCompaction"}});
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ Options options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = kNumL0Files;
+ Reopen(options);
+
+ // Block compaction queue
+ test::SleepingBackgroundTask sleeping_task_low;
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+ Env::Priority::LOW);
+
+  // Generate files, but avoid triggering auto compaction.
+ for (int i = 0; i < kNumL0Files / 2; i++) {
+ ASSERT_OK(Put(Key(1), "value1"));
+ ASSERT_OK(Put(Key(2), "value2"));
+ ASSERT_OK(Flush());
+ }
+
+ port::Thread compact_thread([&]() {
+ CompactRangeOptions cro;
+ cro.exclusive_manual_compaction = true;
+ auto s = db_->CompactRange(cro, nullptr, nullptr);
+ ASSERT_TRUE(s.IsIncomplete());
+ });
+
+ TEST_SYNC_POINT(
+ "DBCompactionTest::DisableManualCompactionThreadQueueFullDBClose:"
+ "PreDisableManualCompaction");
+
+  // Generate more files to trigger auto compaction, which is scheduled after
+  // the manual compaction. We have to generate 4 more files because the
+  // existing files are already pending compaction.
+ for (int i = 0; i < kNumL0Files; i++) {
+ ASSERT_OK(Put(Key(1), "value1"));
+ ASSERT_OK(Put(Key(2), "value2"));
+ ASSERT_OK(Flush());
+ }
+ ASSERT_EQ(std::to_string(kNumL0Files + (kNumL0Files / 2)), FilesPerLevel(0));
+
+ db_->DisableManualCompaction();
+
+ // CompactRange should return before the compaction has the chance to run
+ compact_thread.join();
+
+  // Try closing the DB while the manual compaction is canceled but still in
+  // the queue, and an auto-triggered compaction is also in the queue.
+ auto s = db_->Close();
+ ASSERT_OK(s);
+
+ sleeping_task_low.WakeUp();
+ sleeping_task_low.WaitUntilDone();
+}
+
+TEST_F(DBCompactionTest, DBCloseWithManualCompaction) {
+ const int kNumL0Files = 4;
+
+ SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::RunManualCompaction:Scheduled",
+ "DBCompactionTest::DisableManualCompactionThreadQueueFullDBClose:"
+ "PreDisableManualCompaction"}});
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ Options options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = kNumL0Files;
+ Reopen(options);
+
+ // Block compaction queue
+ test::SleepingBackgroundTask sleeping_task_low;
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+ Env::Priority::LOW);
+
+  // Generate files, but avoid triggering auto compaction.
+ for (int i = 0; i < kNumL0Files / 2; i++) {
+ ASSERT_OK(Put(Key(1), "value1"));
+ ASSERT_OK(Put(Key(2), "value2"));
+ ASSERT_OK(Flush());
+ }
+
+ port::Thread compact_thread([&]() {
+ CompactRangeOptions cro;
+ cro.exclusive_manual_compaction = true;
+ auto s = db_->CompactRange(cro, nullptr, nullptr);
+ ASSERT_TRUE(s.IsIncomplete());
+ });
+
+ TEST_SYNC_POINT(
+ "DBCompactionTest::DisableManualCompactionThreadQueueFullDBClose:"
+ "PreDisableManualCompaction");
+
+  // Generate more files to trigger auto compaction, which is scheduled after
+  // the manual compaction. We have to generate 4 more files because the
+  // existing files are already pending compaction.
+ for (int i = 0; i < kNumL0Files; i++) {
+ ASSERT_OK(Put(Key(1), "value1"));
+ ASSERT_OK(Put(Key(2), "value2"));
+ ASSERT_OK(Flush());
+ }
+ ASSERT_EQ(std::to_string(kNumL0Files + (kNumL0Files / 2)), FilesPerLevel(0));
+
+  // Close the DB with the manual compaction and an auto-triggered compaction
+  // in the queue.
+ auto s = db_->Close();
+ ASSERT_OK(s);
+
+  // The manual compaction thread should return with Incomplete().
+ compact_thread.join();
+
+ sleeping_task_low.WakeUp();
+ sleeping_task_low.WaitUntilDone();
+}
+
+TEST_F(DBCompactionTest,
+ DisableManualCompactionDoesNotWaitForDrainingAutomaticCompaction) {
+ // When `CompactRangeOptions::exclusive_manual_compaction == true`, we wait
+ // for automatic compactions to drain before starting the manual compaction.
+ // This test verifies `DisableManualCompaction()` can cancel such a compaction
+ // without waiting for the drain to complete.
+ const int kNumL0Files = 4;
+
+  // Enforce that the manual compaction enters the wait loop due to a pending
+  // automatic compaction.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::BGWorkCompaction", "DBImpl::RunManualCompaction:NotScheduled"},
+ {"DBImpl::RunManualCompaction:WaitScheduled",
+ "BackgroundCallCompaction:0"}});
+  // The automatic compaction will cancel the waiting manual compaction.
+  // Completing this callback implies the cancellation did not wait for
+  // automatic compactions to finish.
+ bool callback_completed = false;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "BackgroundCallCompaction:0", [&](void* /*arg*/) {
+ db_->DisableManualCompaction();
+ callback_completed = true;
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Options options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = kNumL0Files;
+ Reopen(options);
+
+ for (int i = 0; i < kNumL0Files; ++i) {
+ ASSERT_OK(Put(Key(1), "value1"));
+ ASSERT_OK(Put(Key(2), "value2"));
+ ASSERT_OK(Flush());
+ }
+
+ CompactRangeOptions cro;
+ cro.exclusive_manual_compaction = true;
+ ASSERT_TRUE(db_->CompactRange(cro, nullptr, nullptr).IsIncomplete());
+
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_TRUE(callback_completed);
+}
+
+TEST_F(DBCompactionTest, ChangeLevelConflictsWithManual) {
+ Options options = CurrentOptions();
+ options.num_levels = 3;
+ Reopen(options);
+
+ // Setup an LSM with L2 populated.
+ Random rnd(301);
+ ASSERT_OK(Put(Key(0), rnd.RandomString(990)));
+ ASSERT_OK(Put(Key(1), rnd.RandomString(990)));
+ {
+ CompactRangeOptions cro;
+ cro.change_level = true;
+ cro.target_level = 2;
+ ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr));
+ }
+ ASSERT_EQ("0,0,1", FilesPerLevel(0));
+
+  // The background thread will refit L2->L1 while the foreground thread
+  // attempts to run a compaction on new data. The following dependencies
+  // ensure the background manual compaction's refitting phase disables manual
+  // compaction immediately before the foreground manual compaction can
+  // register itself. Manual compaction is kept disabled until the foreground
+  // manual compaction checks for the failure once.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ // Only do Put()s for foreground CompactRange() once the background
+ // CompactRange() has reached the refitting phase.
+ {
+ "DBImpl::CompactRange:BeforeRefit:1",
+ "DBCompactionTest::ChangeLevelConflictsWithManual:"
+ "PreForegroundCompactRange",
+ },
+ // Right before we register the manual compaction, proceed with
+ // the refitting phase so manual compactions are disabled. Stay in
+ // the refitting phase with manual compactions disabled until it is
+ // noticed.
+ {
+ "DBImpl::RunManualCompaction:0",
+ "DBImpl::CompactRange:BeforeRefit:2",
+ },
+ {
+ "DBImpl::CompactRange:PreRefitLevel",
+ "DBImpl::RunManualCompaction:1",
+ },
+ {
+ "DBImpl::RunManualCompaction:PausedAtStart",
+ "DBImpl::CompactRange:PostRefitLevel",
+ },
+      // If a compaction somehow were scheduled, let it run after re-enabling
+      // manual compactions. This dependency is not expected to be hit, but is
+      // here to speculatively surface future bugs.
+ {
+ "DBImpl::CompactRange:PostRefitLevel:ManualCompactionEnabled",
+ "BackgroundCallCompaction:0",
+ },
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ ROCKSDB_NAMESPACE::port::Thread refit_level_thread([&] {
+ CompactRangeOptions cro;
+ cro.change_level = true;
+ cro.target_level = 1;
+ ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr));
+ });
+
+ TEST_SYNC_POINT(
+ "DBCompactionTest::ChangeLevelConflictsWithManual:"
+ "PreForegroundCompactRange");
+ ASSERT_OK(Put(Key(0), rnd.RandomString(990)));
+ ASSERT_OK(Put(Key(1), rnd.RandomString(990)));
+ ASSERT_TRUE(dbfull()
+ ->CompactRange(CompactRangeOptions(), nullptr, nullptr)
+ .IsIncomplete());
+
+ refit_level_thread.join();
+}
+
+TEST_F(DBCompactionTest, BottomPriCompactionCountsTowardConcurrencyLimit) {
+ // Flushes several files to trigger compaction while lock is released during
+ // a bottom-pri compaction. Verifies it does not get scheduled to thread pool
+ // because per-DB limit for compaction parallelism is one (default).
+ const int kNumL0Files = 4;
+ const int kNumLevels = 3;
+
+ env_->SetBackgroundThreads(1, Env::Priority::BOTTOM);
+
+ Options options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = kNumL0Files;
+ options.num_levels = kNumLevels;
+ DestroyAndReopen(options);
+
+  // Set up the last level to be non-empty, since it's a bit unclear whether
+  // compaction to an empty level would be considered "bottommost".
+ ASSERT_OK(Put(Key(0), "val"));
+ ASSERT_OK(Flush());
+ MoveFilesToLevel(kNumLevels - 1);
+
+ SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::BGWorkBottomCompaction",
+ "DBCompactionTest::BottomPriCompactionCountsTowardConcurrencyLimit:"
+ "PreTriggerCompaction"},
+ {"DBCompactionTest::BottomPriCompactionCountsTowardConcurrencyLimit:"
+ "PostTriggerCompaction",
+ "BackgroundCallCompaction:0"}});
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ port::Thread compact_range_thread([&] {
+ CompactRangeOptions cro;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+ cro.exclusive_manual_compaction = false;
+ ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr));
+ });
+
+ // Sleep in the low-pri thread so any newly scheduled compaction will be
+ // queued. Otherwise it might finish before we check its existence.
+ test::SleepingBackgroundTask sleeping_task_low;
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+ Env::Priority::LOW);
+
+ TEST_SYNC_POINT(
+ "DBCompactionTest::BottomPriCompactionCountsTowardConcurrencyLimit:"
+ "PreTriggerCompaction");
+ for (int i = 0; i < kNumL0Files; ++i) {
+ ASSERT_OK(Put(Key(0), "val"));
+ ASSERT_OK(Flush());
+ }
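+  // Even though L0 has reached the compaction trigger, nothing should be
+  // queued in the LOW pool: the bottom-pri compaction already occupies the
+  // per-DB compaction concurrency limit of one.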
+ ASSERT_EQ(0u, env_->GetThreadPoolQueueLen(Env::Priority::LOW));
+ TEST_SYNC_POINT(
+ "DBCompactionTest::BottomPriCompactionCountsTowardConcurrencyLimit:"
+ "PostTriggerCompaction");
+
+ sleeping_task_low.WakeUp();
+ sleeping_task_low.WaitUntilDone();
+ compact_range_thread.join();
+}
+
+TEST_F(DBCompactionTest, BottommostFileCompactionAllowIngestBehind) {
+ // allow_ingest_behind prevents seqnum zeroing, and could cause
+ // compaction loop with reason kBottommostFiles.
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.compaction_style = kCompactionStyleLevel;
+ options.allow_ingest_behind = true;
+ options.comparator = BytewiseComparator();
+ DestroyAndReopen(options);
+
+ WriteOptions write_opts;
+ ASSERT_OK(db_->Put(write_opts, "infinite", "compaction loop"));
+ ASSERT_OK(db_->Put(write_opts, "infinite", "loop"));
+
+ ASSERT_OK(Flush());
+ MoveFilesToLevel(1);
+ ASSERT_OK(db_->Put(write_opts, "bumpseqnum", ""));
+ ASSERT_OK(Flush());
+ auto snapshot = db_->GetSnapshot();
+ // Bump up oldest_snapshot_seqnum_ in VersionStorageInfo.
+ db_->ReleaseSnapshot(snapshot);
+ bool compacted = false;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "LevelCompactionPicker::PickCompaction:Return", [&](void* /* arg */) {
+ // There should not be a compaction.
+ compacted = true;
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ // Wait for compaction to be scheduled.
+ env_->SleepForMicroseconds(2000000);
+ ASSERT_FALSE(compacted);
+ // The following assert can be used to check for compaction loop:
+ // it used to wait forever before the fix.
+ // ASSERT_OK(dbfull()->TEST_WaitForCompact(true /* wait_unscheduled */));
+}
+
+#endif // !defined(ROCKSDB_LITE)
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+#if !defined(ROCKSDB_LITE)
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+#else
+ (void)argc;
+ (void)argv;
+ return 0;
+#endif
+}
diff --git a/src/rocksdb/db/db_dynamic_level_test.cc b/src/rocksdb/db/db_dynamic_level_test.cc
new file mode 100644
index 000000000..17fa67cb2
--- /dev/null
+++ b/src/rocksdb/db/db_dynamic_level_test.cc
@@ -0,0 +1,507 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+// The introduction of SyncPoint effectively disabled building and running
+// this test in Release builds, which is a pity because it is a good test.
+#if !defined(ROCKSDB_LITE)
+
+#include "db/db_test_util.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "rocksdb/env.h"
+#include "util/random.h"
+
+namespace ROCKSDB_NAMESPACE {
+class DBTestDynamicLevel : public DBTestBase {
+ public:
+ DBTestDynamicLevel()
+ : DBTestBase("db_dynamic_level_test", /*env_do_fsync=*/true) {}
+};
+
+TEST_F(DBTestDynamicLevel, DynamicLevelMaxBytesBase) {
+ if (!Snappy_Supported() || !LZ4_Supported()) {
+ return;
+ }
+ // Use InMemoryEnv, or it would be too slow.
+ std::unique_ptr<Env> env(NewMemEnv(env_));
+
+ const int kNKeys = 1000;
+ int keys[kNKeys];
+
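+  // The first 10% of the middle key range is deleted below; everything else
+  // written by the test must remain readable.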
+ auto verify_func = [&]() {
+ for (int i = 0; i < kNKeys; i++) {
+ ASSERT_NE("NOT_FOUND", Get(Key(i)));
+ ASSERT_NE("NOT_FOUND", Get(Key(kNKeys * 2 + i)));
+ if (i < kNKeys / 10) {
+ ASSERT_EQ("NOT_FOUND", Get(Key(kNKeys + keys[i])));
+ } else {
+ ASSERT_NE("NOT_FOUND", Get(Key(kNKeys + keys[i])));
+ }
+ }
+ };
+
+ Random rnd(301);
+ for (int ordered_insert = 0; ordered_insert <= 1; ordered_insert++) {
+ for (int i = 0; i < kNKeys; i++) {
+ keys[i] = i;
+ }
+ if (ordered_insert == 0) {
+ RandomShuffle(std::begin(keys), std::end(keys), rnd.Next());
+ }
+ for (int max_background_compactions = 1; max_background_compactions < 4;
+ max_background_compactions += 2) {
+ Options options;
+ options.env = env.get();
+ options.create_if_missing = true;
+ options.write_buffer_size = 2048;
+ options.max_write_buffer_number = 2;
+ options.level0_file_num_compaction_trigger = 2;
+ options.level0_slowdown_writes_trigger = 2;
+ options.level0_stop_writes_trigger = 2;
+ options.target_file_size_base = 2048;
+ options.level_compaction_dynamic_level_bytes = true;
+ options.max_bytes_for_level_base = 10240;
+ options.max_bytes_for_level_multiplier = 4;
+ options.max_background_compactions = max_background_compactions;
+ options.num_levels = 5;
+
+ options.compression_per_level.resize(3);
+ options.compression_per_level[0] = kNoCompression;
+ options.compression_per_level[1] = kLZ4Compression;
+ options.compression_per_level[2] = kSnappyCompression;
+ options.env = env_;
+
+ DestroyAndReopen(options);
+
+ for (int i = 0; i < kNKeys; i++) {
+ int key = keys[i];
+ ASSERT_OK(Put(Key(kNKeys + key), rnd.RandomString(102)));
+ ASSERT_OK(Put(Key(key), rnd.RandomString(102)));
+ ASSERT_OK(Put(Key(kNKeys * 2 + key), rnd.RandomString(102)));
+ ASSERT_OK(Delete(Key(kNKeys + keys[i / 10])));
+ env_->SleepForMicroseconds(5000);
+ }
+
+ uint64_t int_prop;
+ ASSERT_TRUE(db_->GetIntProperty("rocksdb.background-errors", &int_prop));
+ ASSERT_EQ(0U, int_prop);
+
+ // Verify DB
+ for (int j = 0; j < 2; j++) {
+ verify_func();
+ if (j == 0) {
+ Reopen(options);
+ }
+ }
+
+ // Test compact range works
+ ASSERT_OK(
+ dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ // All data should be in the last level.
+ ColumnFamilyMetaData cf_meta;
+ db_->GetColumnFamilyMetaData(&cf_meta);
+ ASSERT_EQ(5U, cf_meta.levels.size());
+ for (int i = 0; i < 4; i++) {
+ ASSERT_EQ(0U, cf_meta.levels[i].files.size());
+ }
+ ASSERT_GT(cf_meta.levels[4U].files.size(), 0U);
+ verify_func();
+
+ Close();
+ }
+ }
+
+ env_->SetBackgroundThreads(1, Env::LOW);
+ env_->SetBackgroundThreads(1, Env::HIGH);
+}
+
+// Test specific cases in dynamic max bytes
+TEST_F(DBTestDynamicLevel, DynamicLevelMaxBytesBase2) {
+ Random rnd(301);
+ int kMaxKey = 1000000;
+
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ options.create_if_missing = true;
+ options.write_buffer_size = 20480;
+ options.max_write_buffer_number = 2;
+ options.level0_file_num_compaction_trigger = 2;
+ options.level0_slowdown_writes_trigger = 9999;
+ options.level0_stop_writes_trigger = 9999;
+ options.target_file_size_base = 9102;
+ options.level_compaction_dynamic_level_bytes = true;
+ options.max_bytes_for_level_base = 40960;
+ options.max_bytes_for_level_multiplier = 4;
+ options.max_background_compactions = 2;
+ options.num_levels = 5;
+ options.max_compaction_bytes = 0; // Force not expanding in compactions
+ options.db_host_id = ""; // Setting this messes up the file size calculation
+ BlockBasedTableOptions table_options;
+ table_options.block_size = 1024;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ DestroyAndReopen(options);
+ ASSERT_OK(dbfull()->SetOptions({
+ {"disable_auto_compactions", "true"},
+ }));
+
+ uint64_t int_prop;
+ std::string str_prop;
+
+ // Initial base level is the last level
+ ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop));
+ ASSERT_EQ(4U, int_prop);
+
+ // Put about 28K to L0
+ for (int i = 0; i < 70; i++) {
+ ASSERT_OK(Put(Key(static_cast<int>(rnd.Uniform(kMaxKey))),
+ rnd.RandomString(380)));
+ }
+ ASSERT_OK(dbfull()->SetOptions({
+ {"disable_auto_compactions", "false"},
+ }));
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop));
+ ASSERT_EQ(4U, int_prop);
+
+ // Insert extra about 28K to L0. After they are compacted to L4, the base
+ // level should be changed to L3.
+ ASSERT_OK(dbfull()->SetOptions({
+ {"disable_auto_compactions", "true"},
+ }));
+ for (int i = 0; i < 70; i++) {
+ ASSERT_OK(Put(Key(static_cast<int>(rnd.Uniform(kMaxKey))),
+ rnd.RandomString(380)));
+ }
+
+ ASSERT_OK(dbfull()->SetOptions({
+ {"disable_auto_compactions", "false"},
+ }));
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop));
+ ASSERT_EQ(3U, int_prop);
+ ASSERT_TRUE(db_->GetProperty("rocksdb.num-files-at-level1", &str_prop));
+ ASSERT_EQ("0", str_prop);
+ ASSERT_TRUE(db_->GetProperty("rocksdb.num-files-at-level2", &str_prop));
+ ASSERT_EQ("0", str_prop);
+
+ // Write even more data while leaving the base level at L3.
+ ASSERT_OK(dbfull()->SetOptions({
+ {"disable_auto_compactions", "true"},
+ }));
+ // Write about 40K more
+ for (int i = 0; i < 100; i++) {
+ ASSERT_OK(Put(Key(static_cast<int>(rnd.Uniform(kMaxKey))),
+ rnd.RandomString(380)));
+ }
+ ASSERT_OK(dbfull()->SetOptions({
+ {"disable_auto_compactions", "false"},
+ }));
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop));
+ ASSERT_EQ(3U, int_prop);
+
+ // Fill up L0, and then run an (auto) L0->Lmax compaction to raise the base
+ // level to 2.
+ ASSERT_OK(dbfull()->SetOptions({
+ {"disable_auto_compactions", "true"},
+ }));
+ // Write about 650K more.
+ // Each file is about 11KB, with 9KB of data.
+ for (int i = 0; i < 1300; i++) {
+ ASSERT_OK(Put(Key(static_cast<int>(rnd.Uniform(kMaxKey))),
+ rnd.RandomString(380)));
+ }
+
+ // Make sure that the compaction starts before the last bit of data is
+ // flushed, so that the base level isn't raised to L1.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ {"CompactionJob::Run():Start", "DynamicLevelMaxBytesBase2:0"},
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(dbfull()->SetOptions({
+ {"disable_auto_compactions", "false"},
+ }));
+
+ TEST_SYNC_POINT("DynamicLevelMaxBytesBase2:0");
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop));
+ ASSERT_EQ(2U, int_prop);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+
+ // Write more data until the base level changes to L1. There will be
+ // a manual compaction going on at the same time.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ {"CompactionJob::Run():Start", "DynamicLevelMaxBytesBase2:1"},
+ {"DynamicLevelMaxBytesBase2:2", "CompactionJob::Run():End"},
+ {"DynamicLevelMaxBytesBase2:compact_range_finish",
+ "FlushJob::WriteLevel0Table"},
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ ROCKSDB_NAMESPACE::port::Thread thread([this] {
+ TEST_SYNC_POINT("DynamicLevelMaxBytesBase2:compact_range_start");
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ TEST_SYNC_POINT("DynamicLevelMaxBytesBase2:compact_range_finish");
+ });
+
+ TEST_SYNC_POINT("DynamicLevelMaxBytesBase2:1");
+ for (int i = 0; i < 2; i++) {
+ ASSERT_OK(Put(Key(static_cast<int>(rnd.Uniform(kMaxKey))),
+ rnd.RandomString(380)));
+ }
+ TEST_SYNC_POINT("DynamicLevelMaxBytesBase2:2");
+
+ ASSERT_OK(Flush());
+
+ thread.join();
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+
+ ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop));
+ ASSERT_EQ(1U, int_prop);
+}
+
+// Test specific cases in dynamic max bytes
+TEST_F(DBTestDynamicLevel, DynamicLevelMaxBytesCompactRange) {
+ Random rnd(301);
+ int kMaxKey = 1000000;
+
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.write_buffer_size = 2048;
+ options.max_write_buffer_number = 2;
+ options.level0_file_num_compaction_trigger = 2;
+ options.level0_slowdown_writes_trigger = 9999;
+ options.level0_stop_writes_trigger = 9999;
+ options.target_file_size_base = 2;
+ options.level_compaction_dynamic_level_bytes = true;
+ options.max_bytes_for_level_base = 10240;
+ options.max_bytes_for_level_multiplier = 4;
+ options.max_background_compactions = 1;
+ const int kNumLevels = 5;
+ options.num_levels = kNumLevels;
+ options.max_compaction_bytes = 1; // Force not expanding in compactions
+ BlockBasedTableOptions table_options;
+ table_options.block_size = 1024;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ DestroyAndReopen(options);
+
+ // Compact against empty DB
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+ uint64_t int_prop;
+ std::string str_prop;
+
+ // Initial base level is the last level
+ ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop));
+ ASSERT_EQ(4U, int_prop);
+
+ // Put about 7K to L0
+ for (int i = 0; i < 140; i++) {
+ ASSERT_OK(
+ Put(Key(static_cast<int>(rnd.Uniform(kMaxKey))), rnd.RandomString(80)));
+ }
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ if (NumTableFilesAtLevel(0) == 0) {
+ // Make sure level 0 is not empty
+ ASSERT_OK(
+ Put(Key(static_cast<int>(rnd.Uniform(kMaxKey))), rnd.RandomString(80)));
+ ASSERT_OK(Flush());
+ }
+
+ ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop));
+ ASSERT_EQ(3U, int_prop);
+ ASSERT_TRUE(db_->GetProperty("rocksdb.num-files-at-level1", &str_prop));
+ ASSERT_EQ("0", str_prop);
+ ASSERT_TRUE(db_->GetProperty("rocksdb.num-files-at-level2", &str_prop));
+ ASSERT_EQ("0", str_prop);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+
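+  // Track which output levels the manual CompactRange picks; it is expected
+  // to compact into both the base level (L3) and the last level (L4).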
+ std::set<int> output_levels;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "CompactionPicker::CompactRange:Return", [&](void* arg) {
+ Compaction* compaction = reinterpret_cast<Compaction*>(arg);
+ output_levels.insert(compaction->output_level());
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ(output_levels.size(), 2);
+ ASSERT_TRUE(output_levels.find(3) != output_levels.end());
+ ASSERT_TRUE(output_levels.find(4) != output_levels.end());
+ ASSERT_TRUE(db_->GetProperty("rocksdb.num-files-at-level0", &str_prop));
+ ASSERT_EQ("0", str_prop);
+ ASSERT_TRUE(db_->GetProperty("rocksdb.num-files-at-level3", &str_prop));
+ ASSERT_EQ("0", str_prop);
+ // Base level is still level 3.
+ ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop));
+ ASSERT_EQ(3U, int_prop);
+}
+
+TEST_F(DBTestDynamicLevel, DynamicLevelMaxBytesBaseInc) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.write_buffer_size = 2048;
+ options.max_write_buffer_number = 2;
+ options.level0_file_num_compaction_trigger = 2;
+ options.level0_slowdown_writes_trigger = 2;
+ options.level0_stop_writes_trigger = 2;
+ options.target_file_size_base = 2048;
+ options.level_compaction_dynamic_level_bytes = true;
+ options.max_bytes_for_level_base = 10240;
+ options.max_bytes_for_level_multiplier = 4;
+ options.max_background_compactions = 2;
+ options.num_levels = 5;
+ options.max_compaction_bytes = 100000000;
+
+ DestroyAndReopen(options);
+
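+  // Count non-trivial compactions. With a large max_compaction_bytes, all of
+  // the compactions in this test are expected to be trivial moves.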
+ int non_trivial = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:NonTrivial",
+ [&](void* /*arg*/) { non_trivial++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Random rnd(301);
+ const int total_keys = 3000;
+ const int random_part_size = 100;
+ for (int i = 0; i < total_keys; i++) {
+ std::string value = rnd.RandomString(random_part_size);
+ PutFixed32(&value, static_cast<uint32_t>(i));
+ ASSERT_OK(Put(Key(i), value));
+ }
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+ ASSERT_EQ(non_trivial, 0);
+
+ for (int i = 0; i < total_keys; i++) {
+ std::string value = Get(Key(i));
+ ASSERT_EQ(DecodeFixed32(value.c_str() + random_part_size),
+ static_cast<uint32_t>(i));
+ }
+
+ env_->SetBackgroundThreads(1, Env::LOW);
+ env_->SetBackgroundThreads(1, Env::HIGH);
+}
+
+TEST_F(DBTestDynamicLevel, DISABLED_MigrateToDynamicLevelMaxBytesBase) {
+ Random rnd(301);
+ const int kMaxKey = 2000;
+
+ Options options;
+ options.create_if_missing = true;
+ options.write_buffer_size = 2048;
+ options.max_write_buffer_number = 8;
+ options.level0_file_num_compaction_trigger = 4;
+ options.level0_slowdown_writes_trigger = 4;
+ options.level0_stop_writes_trigger = 8;
+ options.target_file_size_base = 2048;
+ options.level_compaction_dynamic_level_bytes = false;
+ options.max_bytes_for_level_base = 10240;
+ options.max_bytes_for_level_multiplier = 4;
+ options.num_levels = 8;
+
+ DestroyAndReopen(options);
+
+ auto verify_func = [&](int num_keys, bool if_sleep) {
+ for (int i = 0; i < num_keys; i++) {
+ ASSERT_NE("NOT_FOUND", Get(Key(kMaxKey + i)));
+ if (i < num_keys / 10) {
+ ASSERT_EQ("NOT_FOUND", Get(Key(i)));
+ } else {
+ ASSERT_NE("NOT_FOUND", Get(Key(i)));
+ }
+ if (if_sleep && i % 1000 == 0) {
+ // Without it, valgrind may choose not to give another
+ // thread a chance to run before finishing the function,
+ // causing the test to be extremely slow.
+ env_->SleepForMicroseconds(1);
+ }
+ }
+ };
+
+ int total_keys = 1000;
+ for (int i = 0; i < total_keys; i++) {
+ ASSERT_OK(Put(Key(i), rnd.RandomString(102)));
+ ASSERT_OK(Put(Key(kMaxKey + i), rnd.RandomString(102)));
+ ASSERT_OK(Delete(Key(i / 10)));
+ }
+ verify_func(total_keys, false);
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ options.level_compaction_dynamic_level_bytes = true;
+ options.disable_auto_compactions = true;
+ Reopen(options);
+ verify_func(total_keys, false);
+
+ std::atomic_bool compaction_finished;
+ compaction_finished = false;
+  // Issue a manual compaction in one thread and still verify the DB state
+  // in the main thread.
+ ROCKSDB_NAMESPACE::port::Thread t([&]() {
+ CompactRangeOptions compact_options;
+ compact_options.change_level = true;
+ compact_options.target_level = options.num_levels - 1;
+ ASSERT_OK(dbfull()->CompactRange(compact_options, nullptr, nullptr));
+ compaction_finished.store(true);
+ });
+ do {
+ verify_func(total_keys, true);
+ } while (!compaction_finished.load());
+ t.join();
+
+ ASSERT_OK(dbfull()->SetOptions({
+ {"disable_auto_compactions", "false"},
+ }));
+
+ int total_keys2 = 2000;
+ for (int i = total_keys; i < total_keys2; i++) {
+ ASSERT_OK(Put(Key(i), rnd.RandomString(102)));
+ ASSERT_OK(Put(Key(kMaxKey + i), rnd.RandomString(102)));
+ ASSERT_OK(Delete(Key(i / 10)));
+ }
+
+ verify_func(total_keys2, false);
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ verify_func(total_keys2, false);
+
+ // Base level is not level 1
+ ASSERT_EQ(NumTableFilesAtLevel(1), 0);
+ ASSERT_EQ(NumTableFilesAtLevel(2), 0);
+}
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // !defined(ROCKSDB_LITE)
+
+int main(int argc, char** argv) {
+#if !defined(ROCKSDB_LITE)
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+#else
+ (void)argc;
+ (void)argv;
+ return 0;
+#endif
+}
diff --git a/src/rocksdb/db/db_encryption_test.cc b/src/rocksdb/db/db_encryption_test.cc
new file mode 100644
index 000000000..73e89d158
--- /dev/null
+++ b/src/rocksdb/db/db_encryption_test.cc
@@ -0,0 +1,130 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#include "db/db_test_util.h"
+#include "port/stack_trace.h"
+#include "rocksdb/perf_context.h"
+#if !defined(ROCKSDB_LITE)
+#include "test_util/sync_point.h"
+#endif
+#include <iostream>
+#include <string>
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBEncryptionTest : public DBTestBase {
+ public:
+ DBEncryptionTest()
+ : DBTestBase("db_encryption_test", /*env_do_fsync=*/true) {}
+ Env* GetTargetEnv() {
+ if (encrypted_env_ != nullptr) {
+ return (static_cast<EnvWrapper*>(encrypted_env_))->target();
+ } else {
+ return env_;
+ }
+ }
+};
+
+#ifndef ROCKSDB_LITE
+
+TEST_F(DBEncryptionTest, CheckEncrypted) {
+ ASSERT_OK(Put("foo567", "v1.fetdq"));
+ ASSERT_OK(Put("bar123", "v2.dfgkjdfghsd"));
+ Close();
+
+ // Open all files and look for the values we've put in there.
+ // They should not be found if encrypted, otherwise
+ // they should be found.
+ std::vector<std::string> fileNames;
+ auto status = env_->GetChildren(dbname_, &fileNames);
+ ASSERT_OK(status);
+
+ Env* target = GetTargetEnv();
+ int hits = 0;
+ for (auto it = fileNames.begin(); it != fileNames.end(); ++it) {
+ if (*it == "LOCK") {
+ continue;
+ }
+ auto filePath = dbname_ + "/" + *it;
+ std::unique_ptr<SequentialFile> seqFile;
+ auto envOptions = EnvOptions(CurrentOptions());
+ status = target->NewSequentialFile(filePath, &seqFile, envOptions);
+ ASSERT_OK(status);
+
+ uint64_t fileSize;
+ status = target->GetFileSize(filePath, &fileSize);
+ ASSERT_OK(status);
+
+ std::string scratch;
+ scratch.reserve(fileSize);
+ Slice data;
+ status = seqFile->Read(fileSize, &data, (char*)scratch.data());
+ ASSERT_OK(status);
+
+ if (data.ToString().find("foo567") != std::string::npos) {
+ hits++;
+ // std::cout << "Hit in " << filePath << "\n";
+ }
+ if (data.ToString().find("v1.fetdq") != std::string::npos) {
+ hits++;
+ // std::cout << "Hit in " << filePath << "\n";
+ }
+ if (data.ToString().find("bar123") != std::string::npos) {
+ hits++;
+ // std::cout << "Hit in " << filePath << "\n";
+ }
+ if (data.ToString().find("v2.dfgkjdfghsd") != std::string::npos) {
+ hits++;
+ // std::cout << "Hit in " << filePath << "\n";
+ }
+ if (data.ToString().find("dfgk") != std::string::npos) {
+ hits++;
+ // std::cout << "Hit in " << filePath << "\n";
+ }
+ }
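+  // With an encrypted env none of the plaintext keys or values should appear
+  // on disk; without encryption we expect to find at least four of them.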
+ if (encrypted_env_) {
+ ASSERT_EQ(hits, 0);
+ } else {
+ ASSERT_GE(hits, 4);
+ }
+}
+
+TEST_F(DBEncryptionTest, ReadEmptyFile) {
+ auto defaultEnv = GetTargetEnv();
+
+  // Create an empty file to read back later.
+ auto envOptions = EnvOptions(CurrentOptions());
+ auto filePath = dbname_ + "/empty.empty";
+
+ Status status;
+ {
+ std::unique_ptr<WritableFile> writableFile;
+ status = defaultEnv->NewWritableFile(filePath, &writableFile, envOptions);
+ ASSERT_OK(status);
+ }
+
+ std::unique_ptr<SequentialFile> seqFile;
+ status = defaultEnv->NewSequentialFile(filePath, &seqFile, envOptions);
+ ASSERT_OK(status);
+
+ std::string scratch;
+ Slice data;
+  // Reading back 16 bytes from the empty file shouldn't trigger an assertion;
+  // it should just work and return an empty slice.
+ status = seqFile->Read(16, &data, (char*)scratch.data());
+ ASSERT_OK(status);
+
+ ASSERT_TRUE(data.empty());
+}
+
+#endif // ROCKSDB_LITE
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_filesnapshot.cc b/src/rocksdb/db/db_filesnapshot.cc
new file mode 100644
index 000000000..aa9bd738a
--- /dev/null
+++ b/src/rocksdb/db/db_filesnapshot.cc
@@ -0,0 +1,442 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+
+#ifndef ROCKSDB_LITE
+
+#include <algorithm>
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "db/db_impl/db_impl.h"
+#include "db/job_context.h"
+#include "db/version_set.h"
+#include "file/file_util.h"
+#include "file/filename.h"
+#include "logging/logging.h"
+#include "port/port.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/metadata.h"
+#include "rocksdb/types.h"
+#include "test_util/sync_point.h"
+#include "util/file_checksum_helper.h"
+#include "util/mutexlock.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+Status DBImpl::FlushForGetLiveFiles() {
+ mutex_.AssertHeld();
+
+ // flush all dirty data to disk.
+ Status status;
+ if (immutable_db_options_.atomic_flush) {
+ autovector<ColumnFamilyData*> cfds;
+ SelectColumnFamiliesForAtomicFlush(&cfds);
+ mutex_.Unlock();
+ status =
+ AtomicFlushMemTables(cfds, FlushOptions(), FlushReason::kGetLiveFiles);
+ if (status.IsColumnFamilyDropped()) {
+ status = Status::OK();
+ }
+ mutex_.Lock();
+ } else {
+ for (auto cfd : versions_->GetRefedColumnFamilySet()) {
+ if (cfd->IsDropped()) {
+ continue;
+ }
+ mutex_.Unlock();
+ status = FlushMemTable(cfd, FlushOptions(), FlushReason::kGetLiveFiles);
+ TEST_SYNC_POINT("DBImpl::GetLiveFiles:1");
+ TEST_SYNC_POINT("DBImpl::GetLiveFiles:2");
+ mutex_.Lock();
+ if (!status.ok() && !status.IsColumnFamilyDropped()) {
+ break;
+ } else if (status.IsColumnFamilyDropped()) {
+ status = Status::OK();
+ }
+ }
+ }
+ return status;
+}
+
+Status DBImpl::GetLiveFiles(std::vector<std::string>& ret,
+ uint64_t* manifest_file_size, bool flush_memtable) {
+ *manifest_file_size = 0;
+
+ mutex_.Lock();
+
+ if (flush_memtable) {
+ Status status = FlushForGetLiveFiles();
+ if (!status.ok()) {
+ mutex_.Unlock();
+ ROCKS_LOG_ERROR(immutable_db_options_.info_log, "Cannot Flush data %s\n",
+ status.ToString().c_str());
+ return status;
+ }
+ }
+
+ // Make a set of all of the live table and blob files
+ std::vector<uint64_t> live_table_files;
+ std::vector<uint64_t> live_blob_files;
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ if (cfd->IsDropped()) {
+ continue;
+ }
+ cfd->current()->AddLiveFiles(&live_table_files, &live_blob_files);
+ }
+
+ ret.clear();
+ ret.reserve(live_table_files.size() + live_blob_files.size() +
+ 3); // for CURRENT + MANIFEST + OPTIONS
+
+  // Create names of the live files. The names are not absolute
+  // paths; instead, they are relative to dbname_.
+ for (const auto& table_file_number : live_table_files) {
+ ret.emplace_back(MakeTableFileName("", table_file_number));
+ }
+
+ for (const auto& blob_file_number : live_blob_files) {
+ ret.emplace_back(BlobFileName("", blob_file_number));
+ }
+
+ ret.emplace_back(CurrentFileName(""));
+ ret.emplace_back(DescriptorFileName("", versions_->manifest_file_number()));
+ // The OPTIONS file number is zero in read-write mode when OPTIONS file
+ // writing failed and the DB was configured with
+ // `fail_if_options_file_error == false`. In read-only mode the OPTIONS file
+ // number is zero when no OPTIONS file exist at all. In those cases we do not
+ // record any OPTIONS file in the live file list.
+ if (versions_->options_file_number() != 0) {
+ ret.emplace_back(OptionsFileName("", versions_->options_file_number()));
+ }
+
+ // find length of manifest file while holding the mutex lock
+ *manifest_file_size = versions_->manifest_file_size();
+
+ mutex_.Unlock();
+ return Status::OK();
+}
+
+Status DBImpl::GetSortedWalFiles(VectorLogPtr& files) {
+ // Record tracked WALs as a (minimum) cross-check for directory scan
+ std::vector<uint64_t> required_by_manifest;
+
+ // If caller disabled deletions, this function should return files that are
+ // guaranteed not to be deleted until deletions are re-enabled. We need to
+ // wait for pending purges to finish since WalManager doesn't know which
+ // files are going to be purged. Additional purges won't be scheduled as
+ // long as deletions are disabled (so the below loop must terminate).
+ // Also note that we disable deletions anyway to avoid the case where a
+ // file is deleted in the middle of the scan, causing IO error.
+ Status deletions_disabled = DisableFileDeletions();
+ {
+ InstrumentedMutexLock l(&mutex_);
+ while (pending_purge_obsolete_files_ > 0 || bg_purge_scheduled_ > 0) {
+ bg_cv_.Wait();
+ }
+
+ // Record tracked WALs as a (minimum) cross-check for directory scan
+ const auto& manifest_wals = versions_->GetWalSet().GetWals();
+ required_by_manifest.reserve(manifest_wals.size());
+ for (const auto& wal : manifest_wals) {
+ required_by_manifest.push_back(wal.first);
+ }
+ }
+
+ Status s = wal_manager_.GetSortedWalFiles(files);
+
+ // DisableFileDeletions / EnableFileDeletions not supported in read-only DB
+ if (deletions_disabled.ok()) {
+ Status s2 = EnableFileDeletions(/*force*/ false);
+ assert(s2.ok());
+ s2.PermitUncheckedError();
+ } else {
+ assert(deletions_disabled.IsNotSupported());
+ }
+
+ if (s.ok()) {
+ // Verify includes those required by manifest (one sorted list is superset
+ // of the other)
+ auto required = required_by_manifest.begin();
+ auto included = files.begin();
+
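+    // Walk both sorted lists in lockstep: every WAL number required by the
+    // manifest must also appear in the directory listing.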
+ while (required != required_by_manifest.end()) {
+ if (included == files.end() || *required < (*included)->LogNumber()) {
+ // FAIL - did not find
+ return Status::Corruption(
+ "WAL file " + std::to_string(*required) +
+ " required by manifest but not in directory list");
+ }
+ if (*required == (*included)->LogNumber()) {
+ ++required;
+ ++included;
+ } else {
+ assert(*required > (*included)->LogNumber());
+ ++included;
+ }
+ }
+ }
+
+ return s;
+}
+
+Status DBImpl::GetCurrentWalFile(std::unique_ptr<LogFile>* current_log_file) {
+ uint64_t current_logfile_number;
+ {
+ InstrumentedMutexLock l(&mutex_);
+ current_logfile_number = logfile_number_;
+ }
+
+ return wal_manager_.GetLiveWalFile(current_logfile_number, current_log_file);
+}
+
+Status DBImpl::GetLiveFilesStorageInfo(
+ const LiveFilesStorageInfoOptions& opts,
+ std::vector<LiveFileStorageInfo>* files) {
+ // To avoid returning partial results, only move results to files on success.
+ assert(files);
+ files->clear();
+ std::vector<LiveFileStorageInfo> results;
+
+ // NOTE: This implementation was largely migrated from Checkpoint.
+
+ Status s;
+ VectorLogPtr live_wal_files;
+ bool flush_memtable = true;
+ if (!immutable_db_options_.allow_2pc) {
+ if (opts.wal_size_for_flush == std::numeric_limits<uint64_t>::max()) {
+ flush_memtable = false;
+ } else if (opts.wal_size_for_flush > 0) {
+ // If the outstanding log files are small, we skip the flush.
+ s = GetSortedWalFiles(live_wal_files);
+
+ if (!s.ok()) {
+ return s;
+ }
+
+ // Don't flush column families if total log size is smaller than
+ // log_size_for_flush. We copy the log files instead.
+ // We may be able to cover 2PC case too.
+ uint64_t total_wal_size = 0;
+ for (auto& wal : live_wal_files) {
+ total_wal_size += wal->SizeFileBytes();
+ }
+ if (total_wal_size < opts.wal_size_for_flush) {
+ flush_memtable = false;
+ }
+ live_wal_files.clear();
+ }
+ }
+
+ // This is a modified version of GetLiveFiles, to get access to more
+ // metadata.
+ mutex_.Lock();
+ if (flush_memtable) {
+ Status status = FlushForGetLiveFiles();
+ if (!status.ok()) {
+ mutex_.Unlock();
+ ROCKS_LOG_ERROR(immutable_db_options_.info_log, "Cannot Flush data %s\n",
+ status.ToString().c_str());
+ return status;
+ }
+ }
+
+ // Make a set of all of the live table and blob files
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ if (cfd->IsDropped()) {
+ continue;
+ }
+ VersionStorageInfo& vsi = *cfd->current()->storage_info();
+ auto& cf_paths = cfd->ioptions()->cf_paths;
+
+ auto GetDir = [&](size_t path_id) {
+ // Matching TableFileName() behavior
+ if (path_id >= cf_paths.size()) {
+ assert(false);
+ return cf_paths.back().path;
+ } else {
+ return cf_paths[path_id].path;
+ }
+ };
+
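+    // Record every live table file on every level, including checksum info
+    // when requested.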
+ for (int level = 0; level < vsi.num_levels(); ++level) {
+ const auto& level_files = vsi.LevelFiles(level);
+ for (const auto& meta : level_files) {
+ assert(meta);
+
+ results.emplace_back();
+ LiveFileStorageInfo& info = results.back();
+
+ info.relative_filename = MakeTableFileName(meta->fd.GetNumber());
+ info.directory = GetDir(meta->fd.GetPathId());
+ info.file_number = meta->fd.GetNumber();
+ info.file_type = kTableFile;
+ info.size = meta->fd.GetFileSize();
+ if (opts.include_checksum_info) {
+ info.file_checksum_func_name = meta->file_checksum_func_name;
+ info.file_checksum = meta->file_checksum;
+ if (info.file_checksum_func_name.empty()) {
+ info.file_checksum_func_name = kUnknownFileChecksumFuncName;
+ info.file_checksum = kUnknownFileChecksum;
+ }
+ }
+ info.temperature = meta->temperature;
+ }
+ }
+ const auto& blob_files = vsi.GetBlobFiles();
+ for (const auto& meta : blob_files) {
+ assert(meta);
+
+ results.emplace_back();
+ LiveFileStorageInfo& info = results.back();
+
+ info.relative_filename = BlobFileName(meta->GetBlobFileNumber());
+ info.directory = GetDir(/* path_id */ 0);
+ info.file_number = meta->GetBlobFileNumber();
+ info.file_type = kBlobFile;
+ info.size = meta->GetBlobFileSize();
+ if (opts.include_checksum_info) {
+ info.file_checksum_func_name = meta->GetChecksumMethod();
+ info.file_checksum = meta->GetChecksumValue();
+ if (info.file_checksum_func_name.empty()) {
+ info.file_checksum_func_name = kUnknownFileChecksumFuncName;
+ info.file_checksum = kUnknownFileChecksum;
+ }
+ }
+ // TODO?: info.temperature
+ }
+ }
+
+ // Capture some final info before releasing mutex
+ const uint64_t manifest_number = versions_->manifest_file_number();
+ const uint64_t manifest_size = versions_->manifest_file_size();
+ const uint64_t options_number = versions_->options_file_number();
+ const uint64_t options_size = versions_->options_file_size_;
+ const uint64_t min_log_num = MinLogNumberToKeep();
+
+ mutex_.Unlock();
+
+ std::string manifest_fname = DescriptorFileName(manifest_number);
+ { // MANIFEST
+ results.emplace_back();
+ LiveFileStorageInfo& info = results.back();
+
+ info.relative_filename = manifest_fname;
+ info.directory = GetName();
+ info.file_number = manifest_number;
+ info.file_type = kDescriptorFile;
+ info.size = manifest_size;
+ info.trim_to_size = true;
+ if (opts.include_checksum_info) {
+ info.file_checksum_func_name = kUnknownFileChecksumFuncName;
+ info.file_checksum = kUnknownFileChecksum;
+ }
+ }
+
+ { // CURRENT
+ results.emplace_back();
+ LiveFileStorageInfo& info = results.back();
+
+ info.relative_filename = kCurrentFileName;
+ info.directory = GetName();
+ info.file_type = kCurrentFile;
+ // CURRENT could be replaced so we have to record the contents as needed.
+ info.replacement_contents = manifest_fname + "\n";
+ info.size = manifest_fname.size() + 1;
+ if (opts.include_checksum_info) {
+ info.file_checksum_func_name = kUnknownFileChecksumFuncName;
+ info.file_checksum = kUnknownFileChecksum;
+ }
+ }
+
+ // The OPTIONS file number is zero in read-write mode when OPTIONS file
+ // writing failed and the DB was configured with
+ // `fail_if_options_file_error == false`. In read-only mode the OPTIONS file
+ // number is zero when no OPTIONS file exist at all. In those cases we do not
+ // record any OPTIONS file in the live file list.
+ if (options_number != 0) {
+ results.emplace_back();
+ LiveFileStorageInfo& info = results.back();
+
+ info.relative_filename = OptionsFileName(options_number);
+ info.directory = GetName();
+ info.file_number = options_number;
+ info.file_type = kOptionsFile;
+ info.size = options_size;
+ if (opts.include_checksum_info) {
+ info.file_checksum_func_name = kUnknownFileChecksumFuncName;
+ info.file_checksum = kUnknownFileChecksum;
+ }
+ }
+
+ // Some legacy testing stuff TODO: carefully clean up obsolete parts
+ TEST_SYNC_POINT("CheckpointImpl::CreateCheckpoint:FlushDone");
+
+ TEST_SYNC_POINT("CheckpointImpl::CreateCheckpoint:SavedLiveFiles1");
+ TEST_SYNC_POINT("CheckpointImpl::CreateCheckpoint:SavedLiveFiles2");
+
+ if (s.ok()) {
+ // To maximize the effectiveness of track_and_verify_wals_in_manifest,
+ // sync WAL when it is enabled.
+ s = FlushWAL(
+ immutable_db_options_.track_and_verify_wals_in_manifest /* sync */);
+ if (s.IsNotSupported()) { // read-only DB or similar
+ s = Status::OK();
+ }
+ }
+
+ TEST_SYNC_POINT("CheckpointImpl::CreateCustomCheckpoint:AfterGetLive1");
+ TEST_SYNC_POINT("CheckpointImpl::CreateCustomCheckpoint:AfterGetLive2");
+
+ // If we have more than one column family, we also need to get WAL files.
+ if (s.ok()) {
+ s = GetSortedWalFiles(live_wal_files);
+ }
+ if (!s.ok()) {
+ return s;
+ }
+
+ size_t wal_size = live_wal_files.size();
+
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "Number of log files %" ROCKSDB_PRIszt, live_wal_files.size());
+
+ // Link WAL files. Copy exact size of last one because it is the only one
+ // that has changes after the last flush.
+ auto wal_dir = immutable_db_options_.GetWalDir();
+ for (size_t i = 0; s.ok() && i < wal_size; ++i) {
+ if ((live_wal_files[i]->Type() == kAliveLogFile) &&
+ (!flush_memtable || live_wal_files[i]->LogNumber() >= min_log_num)) {
+ results.emplace_back();
+ LiveFileStorageInfo& info = results.back();
+ auto f = live_wal_files[i]->PathName();
+ assert(!f.empty() && f[0] == '/');
+ info.relative_filename = f.substr(1);
+ info.directory = wal_dir;
+ info.file_number = live_wal_files[i]->LogNumber();
+ info.file_type = kWalFile;
+ info.size = live_wal_files[i]->SizeFileBytes();
+      // Only the last WAL should need to be trimmed to size
+ info.trim_to_size = (i + 1 == wal_size);
+ if (opts.include_checksum_info) {
+ info.file_checksum_func_name = kUnknownFileChecksumFuncName;
+ info.file_checksum = kUnknownFileChecksum;
+ }
+ }
+ }
+
+ if (s.ok()) {
+ // Only move results to output on success.
+ *files = std::move(results);
+ }
+ return s;
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/db/db_flush_test.cc b/src/rocksdb/db/db_flush_test.cc
new file mode 100644
index 000000000..3b3f7e183
--- /dev/null
+++ b/src/rocksdb/db/db_flush_test.cc
@@ -0,0 +1,3084 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <atomic>
+#include <limits>
+
+#include "db/db_impl/db_impl.h"
+#include "db/db_test_util.h"
+#include "env/mock_env.h"
+#include "file/filename.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "rocksdb/utilities/transaction_db.h"
+#include "test_util/sync_point.h"
+#include "test_util/testutil.h"
+#include "util/cast_util.h"
+#include "util/mutexlock.h"
+#include "utilities/fault_injection_env.h"
+#include "utilities/fault_injection_fs.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Value written by the compaction filter (defined further below) when it
+// updates kvs during flush/compaction.
+static std::string NEW_VALUE = "NewValue";
+
+class DBFlushTest : public DBTestBase {
+ public:
+ DBFlushTest() : DBTestBase("db_flush_test", /*env_do_fsync=*/true) {}
+};
+
+class DBFlushDirectIOTest : public DBFlushTest,
+ public ::testing::WithParamInterface<bool> {
+ public:
+ DBFlushDirectIOTest() : DBFlushTest() {}
+};
+
+class DBAtomicFlushTest : public DBFlushTest,
+ public ::testing::WithParamInterface<bool> {
+ public:
+ DBAtomicFlushTest() : DBFlushTest() {}
+};
+
+// We had an issue where, when two background threads tried to flush at the
+// same time, only one of them got committed. This test verifies the issue is
+// fixed.
+TEST_F(DBFlushTest, FlushWhileWritingManifest) {
+ Options options;
+ options.disable_auto_compactions = true;
+ options.max_background_flushes = 2;
+ options.env = env_;
+ Reopen(options);
+ FlushOptions no_wait;
+ no_wait.wait = false;
+ no_wait.allow_write_stall = true;
+
+ SyncPoint::GetInstance()->LoadDependency(
+ {{"VersionSet::LogAndApply:WriteManifest",
+ "DBFlushTest::FlushWhileWritingManifest:1"},
+ {"MemTableList::TryInstallMemtableFlushResults:InProgress",
+ "VersionSet::LogAndApply:WriteManifestDone"}});
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(Put("foo", "v"));
+ ASSERT_OK(dbfull()->Flush(no_wait));
+ TEST_SYNC_POINT("DBFlushTest::FlushWhileWritingManifest:1");
+ ASSERT_OK(Put("bar", "v"));
+ ASSERT_OK(dbfull()->Flush(no_wait));
+ // If the issue is hit we will wait here forever.
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+#ifndef ROCKSDB_LITE
+ ASSERT_EQ(2, TotalTableFiles());
+#endif // ROCKSDB_LITE
+}
+
+// Disable this test temporarily on Travis as it fails intermittently.
+// Github issue: #4151
+TEST_F(DBFlushTest, SyncFail) {
+ std::unique_ptr<FaultInjectionTestEnv> fault_injection_env(
+ new FaultInjectionTestEnv(env_));
+ Options options;
+ options.disable_auto_compactions = true;
+ options.env = fault_injection_env.get();
+
+ SyncPoint::GetInstance()->LoadDependency(
+ {{"DBFlushTest::SyncFail:1", "DBImpl::SyncClosedLogs:Start"},
+ {"DBImpl::SyncClosedLogs:Failed", "DBFlushTest::SyncFail:2"}});
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ CreateAndReopenWithCF({"pikachu"}, options);
+ ASSERT_OK(Put("key", "value"));
+ FlushOptions flush_options;
+ flush_options.wait = false;
+ ASSERT_OK(dbfull()->Flush(flush_options));
+ // Flush installs a new super-version. Get the ref count after that.
+ fault_injection_env->SetFilesystemActive(false);
+ TEST_SYNC_POINT("DBFlushTest::SyncFail:1");
+ TEST_SYNC_POINT("DBFlushTest::SyncFail:2");
+ fault_injection_env->SetFilesystemActive(true);
+ // Now the background job will do the flush; wait for it.
+  // It returns the IO error that happened during the flush.
+ ASSERT_NOK(dbfull()->TEST_WaitForFlushMemTable());
+#ifndef ROCKSDB_LITE
+ ASSERT_EQ("", FilesPerLevel()); // flush failed.
+#endif // ROCKSDB_LITE
+ Destroy(options);
+}
+
+TEST_F(DBFlushTest, SyncSkip) {
+ Options options = CurrentOptions();
+
+ SyncPoint::GetInstance()->LoadDependency(
+ {{"DBFlushTest::SyncSkip:1", "DBImpl::SyncClosedLogs:Skip"},
+ {"DBImpl::SyncClosedLogs:Skip", "DBFlushTest::SyncSkip:2"}});
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ Reopen(options);
+ ASSERT_OK(Put("key", "value"));
+
+ FlushOptions flush_options;
+ flush_options.wait = false;
+ ASSERT_OK(dbfull()->Flush(flush_options));
+
+ TEST_SYNC_POINT("DBFlushTest::SyncSkip:1");
+ TEST_SYNC_POINT("DBFlushTest::SyncSkip:2");
+
+ // Now the background job will do the flush; wait for it.
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+
+ Destroy(options);
+}
+
+TEST_F(DBFlushTest, FlushInLowPriThreadPool) {
+ // Verify setting an empty high-pri (flush) thread pool causes flushes to be
+ // scheduled in the low-pri (compaction) thread pool.
+ Options options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = 4;
+ options.memtable_factory.reset(test::NewSpecialSkipListFactory(1));
+ Reopen(options);
+ env_->SetBackgroundThreads(0, Env::HIGH);
+
+ std::thread::id tid;
+ int num_flushes = 0, num_compactions = 0;
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BGWorkFlush", [&](void* /*arg*/) {
+ if (tid == std::thread::id()) {
+ tid = std::this_thread::get_id();
+ } else {
+ ASSERT_EQ(tid, std::this_thread::get_id());
+ }
+ ++num_flushes;
+ });
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BGWorkCompaction", [&](void* /*arg*/) {
+ ASSERT_EQ(tid, std::this_thread::get_id());
+ ++num_compactions;
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(Put("key", "val"));
+ for (int i = 0; i < 4; ++i) {
+ ASSERT_OK(Put("key", "val"));
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(4, num_flushes);
+ ASSERT_EQ(1, num_compactions);
+}
+
+// Test when flush job is submitted to low priority thread pool and when DB is
+// closed in the meanwhile, CloseHelper doesn't hang.
+TEST_F(DBFlushTest, CloseDBWhenFlushInLowPri) {
+ Options options = CurrentOptions();
+ options.max_background_flushes = 1;
+ options.max_total_wal_size = 8192;
+
+ DestroyAndReopen(options);
+ CreateColumnFamilies({"cf1", "cf2"}, options);
+
+ env_->SetBackgroundThreads(0, Env::HIGH);
+ env_->SetBackgroundThreads(1, Env::LOW);
+ test::SleepingBackgroundTask sleeping_task_low;
+ int num_flushes = 0;
+
+ SyncPoint::GetInstance()->SetCallBack("DBImpl::BGWorkFlush",
+ [&](void* /*arg*/) { ++num_flushes; });
+
+ int num_low_flush_unscheduled = 0;
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::UnscheduleLowFlushCallback", [&](void* /*arg*/) {
+ num_low_flush_unscheduled++;
+ // There should be one flush job in low pool that needs to be
+ // unscheduled
+ ASSERT_EQ(num_low_flush_unscheduled, 1);
+ });
+
+ int num_high_flush_unscheduled = 0;
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::UnscheduleHighFlushCallback", [&](void* /*arg*/) {
+ num_high_flush_unscheduled++;
+ // There should be no flush job in high pool
+ ASSERT_EQ(num_high_flush_unscheduled, 0);
+ });
+
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(Put(0, "key1", DummyString(8192)));
+  // Block the thread so that the flush cannot run and can be removed from the
+  // queue when Unschedule is called.
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+ Env::Priority::LOW);
+ sleeping_task_low.WaitUntilSleeping();
+
+ // Trigger flush and flush job will be scheduled to LOW priority thread.
+ ASSERT_OK(Put(0, "key2", DummyString(8192)));
+
+ // Close DB and flush job in low priority queue will be removed without
+ // running.
+ Close();
+ sleeping_task_low.WakeUp();
+ sleeping_task_low.WaitUntilDone();
+ ASSERT_EQ(0, num_flushes);
+
+ TryReopenWithColumnFamilies({"default", "cf1", "cf2"}, options);
+ ASSERT_OK(Put(0, "key3", DummyString(8192)));
+ ASSERT_OK(Flush(0));
+ ASSERT_EQ(1, num_flushes);
+}
+
+TEST_F(DBFlushTest, ManualFlushWithMinWriteBufferNumberToMerge) {
+ Options options = CurrentOptions();
+ options.write_buffer_size = 100;
+ options.max_write_buffer_number = 4;
+ options.min_write_buffer_number_to_merge = 3;
+ Reopen(options);
+
+ SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::BGWorkFlush",
+ "DBFlushTest::ManualFlushWithMinWriteBufferNumberToMerge:1"},
+ {"DBFlushTest::ManualFlushWithMinWriteBufferNumberToMerge:2",
+ "FlushJob::WriteLevel0Table"}});
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(Put("key1", "value1"));
+
+ port::Thread t([&]() {
+    // The call waits for the flush to finish, i.e. with flush_options.wait = true.
+ ASSERT_OK(Flush());
+ });
+
+ // Wait for flush start.
+ TEST_SYNC_POINT("DBFlushTest::ManualFlushWithMinWriteBufferNumberToMerge:1");
+  // Insert a second memtable before the manual flush finishes. At the end of
+  // the manual flush job, it will check whether a further flush is needed, but
+  // it will not trigger a flush of the second memtable because
+  // min_write_buffer_number_to_merge has not been reached.
+ ASSERT_OK(Put("key2", "value2"));
+ ASSERT_OK(dbfull()->TEST_SwitchMemtable());
+ TEST_SYNC_POINT("DBFlushTest::ManualFlushWithMinWriteBufferNumberToMerge:2");
+
+  // Manual flush should return without waiting for the flush indefinitely.
+ t.join();
+}
+
+TEST_F(DBFlushTest, ScheduleOnlyOneBgThread) {
+ Options options = CurrentOptions();
+ Reopen(options);
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ int called = 0;
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::MaybeScheduleFlushOrCompaction:AfterSchedule:0", [&](void* arg) {
+ ASSERT_NE(nullptr, arg);
+ auto unscheduled_flushes = *reinterpret_cast<int*>(arg);
+ ASSERT_EQ(0, unscheduled_flushes);
+ ++called;
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(Put("a", "foo"));
+ FlushOptions flush_opts;
+ ASSERT_OK(dbfull()->Flush(flush_opts));
+ ASSERT_EQ(1, called);
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+// The following 3 tests are designed for testing garbage statistics at flush
+// time.
+//
+// ======= General Information ======= (from GitHub Wiki).
+// There are three scenarios where memtable flush can be triggered:
+//
+// 1 - Memtable size exceeds ColumnFamilyOptions::write_buffer_size
+// after a write.
+// 2 - Total memtable size across all column families exceeds
+// DBOptions::db_write_buffer_size,
+// or DBOptions::write_buffer_manager signals a flush. In this scenario
+// the largest memtable will be flushed.
+// 3 - Total WAL file size exceeds DBOptions::max_total_wal_size.
+// In this scenario the memtable with the oldest data will be flushed,
+// in order to allow the WAL file with data from this memtable to be
+// purged.
+//
+// As a result, a memtable can be flushed before it is full. This is one
+// reason the generated SST file can be smaller than the corresponding
+// memtable. Compression is another factor that can make the SST file smaller
+// than the corresponding memtable, since data in the memtable is uncompressed.
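+
+// Illustrative sketch only (not used by any test in this file): the three
+// triggers above roughly map onto the options shown here. The values are
+// arbitrary examples chosen for this sketch, not recommendations.
+[[maybe_unused]] static Options FlushTriggerOptionsSketch() {
+  Options options;
+  // Trigger 1: per-memtable size limit (ColumnFamilyOptions::write_buffer_size).
+  options.write_buffer_size = 64 << 20;
+  // Trigger 2: total memtable budget across all column families
+  // (DBOptions::db_write_buffer_size); the largest memtable gets flushed.
+  options.db_write_buffer_size = 256 << 20;
+  // Trigger 3: total WAL size limit (DBOptions::max_total_wal_size); the
+  // memtable holding the oldest data gets flushed so its WAL can be purged.
+  options.max_total_wal_size = 512 << 20;
+  return options;
+}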
+
+TEST_F(DBFlushTest, StatisticsGarbageBasic) {
+ Options options = CurrentOptions();
+
+ // The following options are used to enforce several values that
+ // may already exist as default values to make this test resilient
+ // to default value updates in the future.
+ options.statistics = CreateDBStatistics();
+
+ // Record all statistics.
+ options.statistics->set_stats_level(StatsLevel::kAll);
+
+ // create the DB if it's not already present
+ options.create_if_missing = true;
+
+ // Useful for now as we are trying to compare uncompressed data savings on
+ // flush().
+ options.compression = kNoCompression;
+
+ // Prevent memtable in place updates. Should already be disabled
+ // (from Wiki:
+ // In place updates can be enabled by toggling on the bool
+ // inplace_update_support flag. However, this flag is by default set to
+ // false
+ // because this thread-safe in-place update support is not compatible
+ // with concurrent memtable writes. Note that the bool
+ // allow_concurrent_memtable_write is set to true by default )
+ options.inplace_update_support = false;
+ options.allow_concurrent_memtable_write = true;
+
+ // Enforce size of a single MemTable to 64MB (64MB = 67108864 bytes).
+ options.write_buffer_size = 64 << 20;
+
+ ASSERT_OK(TryReopen(options));
+
+  // Put the same key-values multiple times.
+  // The encoded length of a db entry in the memtable is defined in
+  // db/memtable.cc (MemTable::Add) as:
+  //   encoded_len = VarintLength(internal_key_size)  // min # of bytes needed
+  //                                                  // to store internal_key_size
+  //               + internal_key_size  // actual key string (w/o terminating
+  //                                    // null char), plus 8 bytes for the
+  //                                    // fixed uint64 "seq number +
+  //                                    // insertion type"
+  //               + VarintLength(val_size)  // min # of bytes to store val_size
+  //               + val_size                // actual value string
+  // For example, in our situation, "key1" has size 4 and "value1" has size 6
+  // (the terminating null characters are not copied over to the memtable),
+  // and therefore encoded_len = 1 + (4+8) + 1 + 6 = 20 bytes per entry.
+  // However, in terms of raw data contained in the memtable, and written
+  // over to the SSTable, we only count internal_key_size and val_size,
+  // because this is the only raw chunk of bytes that contains everything
+  // necessary to reconstruct a user entry: sequence number, insertion type,
+  // key, and value.
+
+ // To test the relevance of our Memtable garbage statistics,
+ // namely MEMTABLE_PAYLOAD_BYTES_AT_FLUSH and MEMTABLE_GARBAGE_BYTES_AT_FLUSH,
+ // we insert K-V pairs with 3 distinct keys (of length 4),
+ // and random values of arbitrary length RAND_VALUES_LENGTH,
+ // and we repeat this step NUM_REPEAT times total.
+ // At the end, we insert 3 final K-V pairs with the same 3 keys
+ // and known values (these will be the final values, of length 6).
+ // I chose NUM_REPEAT=2,000 such that no automatic flush is
+ // triggered (the number of bytes in the memtable is therefore
+ // well below any meaningful heuristic for a memtable of size 64MB).
+  // As a result, since each K-V pair is inserted as a payload
+  // of N meaningful bytes (sequence number, insertion type,
+  // key, and value = 8 + 4 + RAND_VALUES_LENGTH), and 3 such pairs are
+  // inserted per repeat, MEMTABLE_GARBAGE_BYTES_AT_FLUSH should be equal to
+  // 2,000 * 3 * N bytes and MEMTABLE_PAYLOAD_BYTES_AT_FLUSH =
+  // MEMTABLE_GARBAGE_BYTES_AT_FLUSH + (3*(8 + 4 + 6)) bytes.
+  // For RAND_VALUES_LENGTH = 172 (arbitrary value), we expect:
+  // N = 8 + 4 + 172 = 184 bytes
+  // MEMTABLE_GARBAGE_BYTES_AT_FLUSH = 2,000 * 3 * 184 = 1,104,000 bytes.
+  // MEMTABLE_PAYLOAD_BYTES_AT_FLUSH = 1,104,000 + 3*18 = 1,104,054 bytes.
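+  // The arithmetic above can be sanity-checked at compile time. The constants
+  // below are local to this illustrative check only (key size 4, value size
+  // 172, final value size 6) and are not used by the test itself.
+  {
+    constexpr uint64_t kEntryBytes = 8 + 4 + 172;  // seq+type, key, value
+    static_assert(2000 * 3 * kEntryBytes == 1104000,
+                  "expected memtable garbage bytes at flush");
+    static_assert(2000 * 3 * kEntryBytes + 3 * (8 + 4 + 6) == 1104054,
+                  "expected memtable payload bytes at flush");
+  }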
+
+ const size_t NUM_REPEAT = 2000;
+ const size_t RAND_VALUES_LENGTH = 172;
+ const std::string KEY1 = "key1";
+ const std::string KEY2 = "key2";
+ const std::string KEY3 = "key3";
+ const std::string VALUE1 = "value1";
+ const std::string VALUE2 = "value2";
+ const std::string VALUE3 = "value3";
+ uint64_t EXPECTED_MEMTABLE_PAYLOAD_BYTES_AT_FLUSH = 0;
+ uint64_t EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH = 0;
+
+ Random rnd(301);
+  // Insertion of K-V pairs, multiple times.
+ for (size_t i = 0; i < NUM_REPEAT; i++) {
+ // Create value strings of arbitrary length RAND_VALUES_LENGTH bytes.
+ std::string p_v1 = rnd.RandomString(RAND_VALUES_LENGTH);
+ std::string p_v2 = rnd.RandomString(RAND_VALUES_LENGTH);
+ std::string p_v3 = rnd.RandomString(RAND_VALUES_LENGTH);
+ ASSERT_OK(Put(KEY1, p_v1));
+ ASSERT_OK(Put(KEY2, p_v2));
+ ASSERT_OK(Put(KEY3, p_v3));
+ EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH +=
+ KEY1.size() + p_v1.size() + sizeof(uint64_t);
+ EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH +=
+ KEY2.size() + p_v2.size() + sizeof(uint64_t);
+ EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH +=
+ KEY3.size() + p_v3.size() + sizeof(uint64_t);
+ }
+
+ // The memtable data bytes includes the "garbage"
+ // bytes along with the useful payload.
+ EXPECTED_MEMTABLE_PAYLOAD_BYTES_AT_FLUSH =
+ EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH;
+
+ ASSERT_OK(Put(KEY1, VALUE1));
+ ASSERT_OK(Put(KEY2, VALUE2));
+ ASSERT_OK(Put(KEY3, VALUE3));
+
+ // Add useful payload to the memtable data bytes:
+ EXPECTED_MEMTABLE_PAYLOAD_BYTES_AT_FLUSH +=
+ KEY1.size() + VALUE1.size() + KEY2.size() + VALUE2.size() + KEY3.size() +
+ VALUE3.size() + 3 * sizeof(uint64_t);
+
+ // We assert that the last K-V pairs have been successfully inserted,
+ // and that the valid values are VALUE1, VALUE2, VALUE3.
+ PinnableSlice value;
+ ASSERT_OK(Get(KEY1, &value));
+ ASSERT_EQ(value.ToString(), VALUE1);
+ ASSERT_OK(Get(KEY2, &value));
+ ASSERT_EQ(value.ToString(), VALUE2);
+ ASSERT_OK(Get(KEY3, &value));
+ ASSERT_EQ(value.ToString(), VALUE3);
+
+ // Force flush to SST. Increments the statistics counter.
+ ASSERT_OK(Flush());
+
+ // Collect statistics.
+ uint64_t mem_data_bytes =
+ TestGetTickerCount(options, MEMTABLE_PAYLOAD_BYTES_AT_FLUSH);
+ uint64_t mem_garbage_bytes =
+ TestGetTickerCount(options, MEMTABLE_GARBAGE_BYTES_AT_FLUSH);
+
+ EXPECT_EQ(mem_data_bytes, EXPECTED_MEMTABLE_PAYLOAD_BYTES_AT_FLUSH);
+ EXPECT_EQ(mem_garbage_bytes, EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH);
+
+ Close();
+}
+
+TEST_F(DBFlushTest, StatisticsGarbageInsertAndDeletes) {
+ Options options = CurrentOptions();
+ options.statistics = CreateDBStatistics();
+ options.statistics->set_stats_level(StatsLevel::kAll);
+ options.create_if_missing = true;
+ options.compression = kNoCompression;
+ options.inplace_update_support = false;
+ options.allow_concurrent_memtable_write = true;
+ options.write_buffer_size = 67108864;
+
+ ASSERT_OK(TryReopen(options));
+
+ const size_t NUM_REPEAT = 2000;
+ const size_t RAND_VALUES_LENGTH = 37;
+ const std::string KEY1 = "key1";
+ const std::string KEY2 = "key2";
+ const std::string KEY3 = "key3";
+ const std::string KEY4 = "key4";
+ const std::string KEY5 = "key5";
+ const std::string KEY6 = "key6";
+
+ uint64_t EXPECTED_MEMTABLE_PAYLOAD_BYTES_AT_FLUSH = 0;
+ uint64_t EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH = 0;
+
+ WriteBatch batch;
+
+ Random rnd(301);
+  // Insertion of K-V pairs, multiple times.
+ for (size_t i = 0; i < NUM_REPEAT; i++) {
+ // Create value strings of arbitrary length RAND_VALUES_LENGTH bytes.
+ std::string p_v1 = rnd.RandomString(RAND_VALUES_LENGTH);
+ std::string p_v2 = rnd.RandomString(RAND_VALUES_LENGTH);
+ std::string p_v3 = rnd.RandomString(RAND_VALUES_LENGTH);
+ ASSERT_OK(Put(KEY1, p_v1));
+ ASSERT_OK(Put(KEY2, p_v2));
+ ASSERT_OK(Put(KEY3, p_v3));
+ EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH +=
+ KEY1.size() + p_v1.size() + sizeof(uint64_t);
+ EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH +=
+ KEY2.size() + p_v2.size() + sizeof(uint64_t);
+ EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH +=
+ KEY3.size() + p_v3.size() + sizeof(uint64_t);
+ ASSERT_OK(Delete(KEY1));
+ ASSERT_OK(Delete(KEY2));
+ ASSERT_OK(Delete(KEY3));
+ EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH +=
+ KEY1.size() + KEY2.size() + KEY3.size() + 3 * sizeof(uint64_t);
+ }
+
+ // The memtable data bytes includes the "garbage"
+ // bytes along with the useful payload.
+ EXPECTED_MEMTABLE_PAYLOAD_BYTES_AT_FLUSH =
+ EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH;
+
+  // Note: one set of deletes for KEY1, KEY2, KEY3 is written to the SSTable
+  // to propagate the delete operations to K-V pairs that could have been
+  // inserted into the database during past Flush operations.
+ EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH -=
+ KEY1.size() + KEY2.size() + KEY3.size() + 3 * sizeof(uint64_t);
+
+  // Additional useful payload.
+ ASSERT_OK(Delete(KEY4));
+ ASSERT_OK(Delete(KEY5));
+ ASSERT_OK(Delete(KEY6));
+
+  // Add useful payload to the memtable data bytes:
+ EXPECTED_MEMTABLE_PAYLOAD_BYTES_AT_FLUSH +=
+ KEY4.size() + KEY5.size() + KEY6.size() + 3 * sizeof(uint64_t);
+
+ // We assert that the K-V pairs have been successfully deleted.
+ PinnableSlice value;
+ ASSERT_NOK(Get(KEY1, &value));
+ ASSERT_NOK(Get(KEY2, &value));
+ ASSERT_NOK(Get(KEY3, &value));
+
+ // Force flush to SST. Increments the statistics counter.
+ ASSERT_OK(Flush());
+
+ // Collect statistics.
+ uint64_t mem_data_bytes =
+ TestGetTickerCount(options, MEMTABLE_PAYLOAD_BYTES_AT_FLUSH);
+ uint64_t mem_garbage_bytes =
+ TestGetTickerCount(options, MEMTABLE_GARBAGE_BYTES_AT_FLUSH);
+
+ EXPECT_EQ(mem_data_bytes, EXPECTED_MEMTABLE_PAYLOAD_BYTES_AT_FLUSH);
+ EXPECT_EQ(mem_garbage_bytes, EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH);
+
+ Close();
+}
+
+TEST_F(DBFlushTest, StatisticsGarbageRangeDeletes) {
+ Options options = CurrentOptions();
+ options.statistics = CreateDBStatistics();
+ options.statistics->set_stats_level(StatsLevel::kAll);
+ options.create_if_missing = true;
+ options.compression = kNoCompression;
+ options.inplace_update_support = false;
+ options.allow_concurrent_memtable_write = true;
+ options.write_buffer_size = 67108864;
+
+ ASSERT_OK(TryReopen(options));
+
+ const size_t NUM_REPEAT = 1000;
+ const size_t RAND_VALUES_LENGTH = 42;
+ const std::string KEY1 = "key1";
+ const std::string KEY2 = "key2";
+ const std::string KEY3 = "key3";
+ const std::string KEY4 = "key4";
+ const std::string KEY5 = "key5";
+ const std::string KEY6 = "key6";
+ const std::string VALUE3 = "value3";
+
+ uint64_t EXPECTED_MEMTABLE_PAYLOAD_BYTES_AT_FLUSH = 0;
+ uint64_t EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH = 0;
+
+ Random rnd(301);
+  // Insertion of K-V pairs, multiple times.
+ // Also insert DeleteRange
+ for (size_t i = 0; i < NUM_REPEAT; i++) {
+ // Create value strings of arbitrary length RAND_VALUES_LENGTH bytes.
+ std::string p_v1 = rnd.RandomString(RAND_VALUES_LENGTH);
+ std::string p_v2 = rnd.RandomString(RAND_VALUES_LENGTH);
+ std::string p_v3 = rnd.RandomString(RAND_VALUES_LENGTH);
+ ASSERT_OK(Put(KEY1, p_v1));
+ ASSERT_OK(Put(KEY2, p_v2));
+ ASSERT_OK(Put(KEY3, p_v3));
+ EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH +=
+ KEY1.size() + p_v1.size() + sizeof(uint64_t);
+ EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH +=
+ KEY2.size() + p_v2.size() + sizeof(uint64_t);
+ EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH +=
+ KEY3.size() + p_v3.size() + sizeof(uint64_t);
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), KEY1,
+ KEY2));
+    // Note: DeleteRange has an exclusive upper bound, e.g. here [KEY2,KEY3)
+    // is deleted.
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), KEY2,
+ KEY3));
+ // Delete ranges are stored as a regular K-V pair, with key=STARTKEY,
+ // value=ENDKEY.
+ EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH +=
+ (KEY1.size() + KEY2.size() + sizeof(uint64_t)) +
+ (KEY2.size() + KEY3.size() + sizeof(uint64_t));
+ }
+
+ // The memtable data bytes includes the "garbage"
+ // bytes along with the useful payload.
+ EXPECTED_MEMTABLE_PAYLOAD_BYTES_AT_FLUSH =
+ EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH;
+
+  // Note: one set of DeleteRanges for (KEY1, KEY2) and (KEY2, KEY3) is written
+  // to the SSTable to propagate the DeleteRange operations to K-V pairs that
+  // could have been inserted into the database during past Flush operations.
+ EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH -=
+ (KEY1.size() + KEY2.size() + sizeof(uint64_t)) +
+ (KEY2.size() + KEY3.size() + sizeof(uint64_t));
+
+ // Overwrite KEY3 with known value (VALUE3)
+ // Note that during the whole time KEY3 has never been deleted
+ // by the RangeDeletes.
+ ASSERT_OK(Put(KEY3, VALUE3));
+ EXPECTED_MEMTABLE_PAYLOAD_BYTES_AT_FLUSH +=
+ KEY3.size() + VALUE3.size() + sizeof(uint64_t);
+
+  // Additional useful payload.
+ ASSERT_OK(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), KEY4, KEY5));
+ ASSERT_OK(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), KEY5, KEY6));
+
+ // Add useful payload to the memtable data bytes:
+ EXPECTED_MEMTABLE_PAYLOAD_BYTES_AT_FLUSH +=
+ (KEY4.size() + KEY5.size() + sizeof(uint64_t)) +
+ (KEY5.size() + KEY6.size() + sizeof(uint64_t));
+
+ // We assert that the K-V pairs have been successfully deleted.
+ PinnableSlice value;
+ ASSERT_NOK(Get(KEY1, &value));
+ ASSERT_NOK(Get(KEY2, &value));
+ // And that KEY3's value is correct.
+ ASSERT_OK(Get(KEY3, &value));
+ ASSERT_EQ(value, VALUE3);
+
+ // Force flush to SST. Increments the statistics counter.
+ ASSERT_OK(Flush());
+
+ // Collect statistics.
+ uint64_t mem_data_bytes =
+ TestGetTickerCount(options, MEMTABLE_PAYLOAD_BYTES_AT_FLUSH);
+ uint64_t mem_garbage_bytes =
+ TestGetTickerCount(options, MEMTABLE_GARBAGE_BYTES_AT_FLUSH);
+
+ EXPECT_EQ(mem_data_bytes, EXPECTED_MEMTABLE_PAYLOAD_BYTES_AT_FLUSH);
+ EXPECT_EQ(mem_garbage_bytes, EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH);
+
+ Close();
+}
+
+#ifndef ROCKSDB_LITE
+// This simple Listener can only handle one flush at a time.
+class TestFlushListener : public EventListener {
+ public:
+ TestFlushListener(Env* env, DBFlushTest* test)
+ : slowdown_count(0), stop_count(0), db_closed(), env_(env), test_(test) {
+ db_closed = false;
+ }
+
+ ~TestFlushListener() override {
+ prev_fc_info_.status.PermitUncheckedError(); // Ignore the status
+ }
+
+ void OnTableFileCreated(const TableFileCreationInfo& info) override {
+ // remember the info for later checking the FlushJobInfo.
+ prev_fc_info_ = info;
+ ASSERT_GT(info.db_name.size(), 0U);
+ ASSERT_GT(info.cf_name.size(), 0U);
+ ASSERT_GT(info.file_path.size(), 0U);
+ ASSERT_GT(info.job_id, 0);
+ ASSERT_GT(info.table_properties.data_size, 0U);
+ ASSERT_GT(info.table_properties.raw_key_size, 0U);
+ ASSERT_GT(info.table_properties.raw_value_size, 0U);
+ ASSERT_GT(info.table_properties.num_data_blocks, 0U);
+ ASSERT_GT(info.table_properties.num_entries, 0U);
+ ASSERT_EQ(info.file_checksum, kUnknownFileChecksum);
+ ASSERT_EQ(info.file_checksum_func_name, kUnknownFileChecksumFuncName);
+ }
+
+ void OnFlushCompleted(DB* db, const FlushJobInfo& info) override {
+ flushed_dbs_.push_back(db);
+ flushed_column_family_names_.push_back(info.cf_name);
+ if (info.triggered_writes_slowdown) {
+ slowdown_count++;
+ }
+ if (info.triggered_writes_stop) {
+ stop_count++;
+ }
+ // verify whether the previously created file matches the flushed file.
+ ASSERT_EQ(prev_fc_info_.db_name, db->GetName());
+ ASSERT_EQ(prev_fc_info_.cf_name, info.cf_name);
+ ASSERT_EQ(prev_fc_info_.job_id, info.job_id);
+ ASSERT_EQ(prev_fc_info_.file_path, info.file_path);
+ ASSERT_EQ(TableFileNameToNumber(info.file_path), info.file_number);
+
+ // Note: the following chunk relies on the notification pertaining to the
+ // database pointed to by DBTestBase::db_, and is thus bypassed when
+ // that assumption does not hold (see the test case MultiDBMultiListeners
+ // below).
+ ASSERT_TRUE(test_);
+ if (db == test_->db_) {
+ std::vector<std::vector<FileMetaData>> files_by_level;
+ test_->dbfull()->TEST_GetFilesMetaData(db->DefaultColumnFamily(),
+ &files_by_level);
+
+ ASSERT_FALSE(files_by_level.empty());
+ auto it = std::find_if(files_by_level[0].begin(), files_by_level[0].end(),
+ [&](const FileMetaData& meta) {
+ return meta.fd.GetNumber() == info.file_number;
+ });
+ ASSERT_NE(it, files_by_level[0].end());
+ ASSERT_EQ(info.oldest_blob_file_number, it->oldest_blob_file_number);
+ }
+
+ ASSERT_EQ(db->GetEnv()->GetThreadID(), info.thread_id);
+ ASSERT_GT(info.thread_id, 0U);
+ }
+
+ std::vector<std::string> flushed_column_family_names_;
+ std::vector<DB*> flushed_dbs_;
+ int slowdown_count;
+ int stop_count;
+ bool db_closing;
+ std::atomic_bool db_closed;
+ TableFileCreationInfo prev_fc_info_;
+
+ protected:
+ Env* env_;
+ DBFlushTest* test_;
+};
+#endif // !ROCKSDB_LITE
+
+TEST_F(DBFlushTest, MemPurgeBasic) {
+ Options options = CurrentOptions();
+
+ // The following options are used to enforce several values that
+ // may already exist as default values to make this test resilient
+ // to default value updates in the future.
+ options.statistics = CreateDBStatistics();
+
+ // Record all statistics.
+ options.statistics->set_stats_level(StatsLevel::kAll);
+
+ // create the DB if it's not already present
+ options.create_if_missing = true;
+
+ // Useful for now as we are trying to compare uncompressed data savings on
+ // flush().
+ options.compression = kNoCompression;
+
+ // Prevent memtable in place updates. Should already be disabled
+ // (from Wiki:
+ // In place updates can be enabled by toggling on the bool
+ // inplace_update_support flag. However, this flag is by default set to
+ // false
+ // because this thread-safe in-place update support is not compatible
+ // with concurrent memtable writes. Note that the bool
+ // allow_concurrent_memtable_write is set to true by default )
+ options.inplace_update_support = false;
+ options.allow_concurrent_memtable_write = true;
+
+  // Enforce size of a single MemTable to 1MB (1MB = 1048576 bytes).
+ options.write_buffer_size = 1 << 20;
+#ifndef ROCKSDB_LITE
+ // Initially deactivate the MemPurge prototype.
+ options.experimental_mempurge_threshold = 0.0;
+ TestFlushListener* listener = new TestFlushListener(options.env, this);
+ options.listeners.emplace_back(listener);
+#else
+  // Directly activate the MemPurge prototype.
+  // (RocksDB lite does not support dynamic options.)
+ options.experimental_mempurge_threshold = 1.0;
+#endif // !ROCKSDB_LITE
+ ASSERT_OK(TryReopen(options));
+
+ // RocksDB lite does not support dynamic options
+#ifndef ROCKSDB_LITE
+ // Dynamically activate the MemPurge prototype without restarting the DB.
+ ColumnFamilyHandle* cfh = db_->DefaultColumnFamily();
+ ASSERT_OK(db_->SetOptions(cfh, {{"experimental_mempurge_threshold", "1.0"}}));
+#endif
+
+ std::atomic<uint32_t> mempurge_count{0};
+ std::atomic<uint32_t> sst_count{0};
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::FlushJob:MemPurgeSuccessful",
+ [&](void* /*arg*/) { mempurge_count++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::FlushJob:SSTFileCreated", [&](void* /*arg*/) { sst_count++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ std::string KEY1 = "IamKey1";
+ std::string KEY2 = "IamKey2";
+ std::string KEY3 = "IamKey3";
+ std::string KEY4 = "IamKey4";
+ std::string KEY5 = "IamKey5";
+ std::string KEY6 = "IamKey6";
+ std::string KEY7 = "IamKey7";
+ std::string KEY8 = "IamKey8";
+ std::string KEY9 = "IamKey9";
+ std::string RNDKEY1, RNDKEY2, RNDKEY3;
+ const std::string NOT_FOUND = "NOT_FOUND";
+
+ // Heavy overwrite workload,
+ // more than would fit in maximum allowed memtables.
+ Random rnd(719);
+ const size_t NUM_REPEAT = 100;
+ const size_t RAND_KEYS_LENGTH = 57;
+ const size_t RAND_VALUES_LENGTH = 10240;
+ std::string p_v1, p_v2, p_v3, p_v4, p_v5, p_v6, p_v7, p_v8, p_v9, p_rv1,
+ p_rv2, p_rv3;
+
+ // Insert a very first set of keys that will be
+ // mempurged at least once.
+ p_v1 = rnd.RandomString(RAND_VALUES_LENGTH);
+ p_v2 = rnd.RandomString(RAND_VALUES_LENGTH);
+ p_v3 = rnd.RandomString(RAND_VALUES_LENGTH);
+ p_v4 = rnd.RandomString(RAND_VALUES_LENGTH);
+ ASSERT_OK(Put(KEY1, p_v1));
+ ASSERT_OK(Put(KEY2, p_v2));
+ ASSERT_OK(Put(KEY3, p_v3));
+ ASSERT_OK(Put(KEY4, p_v4));
+ ASSERT_EQ(Get(KEY1), p_v1);
+ ASSERT_EQ(Get(KEY2), p_v2);
+ ASSERT_EQ(Get(KEY3), p_v3);
+ ASSERT_EQ(Get(KEY4), p_v4);
+
+  // Insertion of K-V pairs, multiple times (overwrites).
+ for (size_t i = 0; i < NUM_REPEAT; i++) {
+ // Create value strings of arbitrary length RAND_VALUES_LENGTH bytes.
+ p_v5 = rnd.RandomString(RAND_VALUES_LENGTH);
+ p_v6 = rnd.RandomString(RAND_VALUES_LENGTH);
+ p_v7 = rnd.RandomString(RAND_VALUES_LENGTH);
+ p_v8 = rnd.RandomString(RAND_VALUES_LENGTH);
+ p_v9 = rnd.RandomString(RAND_VALUES_LENGTH);
+
+ ASSERT_OK(Put(KEY5, p_v5));
+ ASSERT_OK(Put(KEY6, p_v6));
+ ASSERT_OK(Put(KEY7, p_v7));
+ ASSERT_OK(Put(KEY8, p_v8));
+ ASSERT_OK(Put(KEY9, p_v9));
+
+ ASSERT_EQ(Get(KEY1), p_v1);
+ ASSERT_EQ(Get(KEY2), p_v2);
+ ASSERT_EQ(Get(KEY3), p_v3);
+ ASSERT_EQ(Get(KEY4), p_v4);
+ ASSERT_EQ(Get(KEY5), p_v5);
+ ASSERT_EQ(Get(KEY6), p_v6);
+ ASSERT_EQ(Get(KEY7), p_v7);
+ ASSERT_EQ(Get(KEY8), p_v8);
+ ASSERT_EQ(Get(KEY9), p_v9);
+ }
+
+ // Check that there was at least one mempurge
+ const uint32_t EXPECTED_MIN_MEMPURGE_COUNT = 1;
+  // Check that there were no SST files created during flush.
+ const uint32_t EXPECTED_SST_COUNT = 0;
+
+ EXPECT_GE(mempurge_count.exchange(0), EXPECTED_MIN_MEMPURGE_COUNT);
+ EXPECT_EQ(sst_count.exchange(0), EXPECTED_SST_COUNT);
+
+  // Insertion of K-V pairs, no overwrites.
+ for (size_t i = 0; i < NUM_REPEAT; i++) {
+    // Create random keys of RAND_KEYS_LENGTH bytes and value strings of
+    // RAND_VALUES_LENGTH bytes.
+ RNDKEY1 = rnd.RandomString(RAND_KEYS_LENGTH);
+ RNDKEY2 = rnd.RandomString(RAND_KEYS_LENGTH);
+ RNDKEY3 = rnd.RandomString(RAND_KEYS_LENGTH);
+ p_rv1 = rnd.RandomString(RAND_VALUES_LENGTH);
+ p_rv2 = rnd.RandomString(RAND_VALUES_LENGTH);
+ p_rv3 = rnd.RandomString(RAND_VALUES_LENGTH);
+
+ ASSERT_OK(Put(RNDKEY1, p_rv1));
+ ASSERT_OK(Put(RNDKEY2, p_rv2));
+ ASSERT_OK(Put(RNDKEY3, p_rv3));
+
+ ASSERT_EQ(Get(KEY1), p_v1);
+ ASSERT_EQ(Get(KEY2), p_v2);
+ ASSERT_EQ(Get(KEY3), p_v3);
+ ASSERT_EQ(Get(KEY4), p_v4);
+ ASSERT_EQ(Get(KEY5), p_v5);
+ ASSERT_EQ(Get(KEY6), p_v6);
+ ASSERT_EQ(Get(KEY7), p_v7);
+ ASSERT_EQ(Get(KEY8), p_v8);
+ ASSERT_EQ(Get(KEY9), p_v9);
+ ASSERT_EQ(Get(RNDKEY1), p_rv1);
+ ASSERT_EQ(Get(RNDKEY2), p_rv2);
+ ASSERT_EQ(Get(RNDKEY3), p_rv3);
+ }
+
+ // Assert that at least one flush to storage has been performed
+ EXPECT_GT(sst_count.exchange(0), EXPECTED_SST_COUNT);
+ // (which will consequently increase the number of mempurges recorded too).
+ EXPECT_GE(mempurge_count.exchange(0), EXPECTED_MIN_MEMPURGE_COUNT);
+
+ // Assert that there is no data corruption, even with
+ // a flush to storage.
+ ASSERT_EQ(Get(KEY1), p_v1);
+ ASSERT_EQ(Get(KEY2), p_v2);
+ ASSERT_EQ(Get(KEY3), p_v3);
+ ASSERT_EQ(Get(KEY4), p_v4);
+ ASSERT_EQ(Get(KEY5), p_v5);
+ ASSERT_EQ(Get(KEY6), p_v6);
+ ASSERT_EQ(Get(KEY7), p_v7);
+ ASSERT_EQ(Get(KEY8), p_v8);
+ ASSERT_EQ(Get(KEY9), p_v9);
+ ASSERT_EQ(Get(RNDKEY1), p_rv1);
+ ASSERT_EQ(Get(RNDKEY2), p_rv2);
+ ASSERT_EQ(Get(RNDKEY3), p_rv3);
+
+ Close();
+}
+
+// RocksDB lite does not support dynamic options
+#ifndef ROCKSDB_LITE
+TEST_F(DBFlushTest, MemPurgeBasicToggle) {
+ Options options = CurrentOptions();
+
+ // The following options are used to enforce several values that
+ // may already exist as default values to make this test resilient
+ // to default value updates in the future.
+ options.statistics = CreateDBStatistics();
+
+ // Record all statistics.
+ options.statistics->set_stats_level(StatsLevel::kAll);
+
+ // create the DB if it's not already present
+ options.create_if_missing = true;
+
+ // Useful for now as we are trying to compare uncompressed data savings on
+ // flush().
+ options.compression = kNoCompression;
+
+ // Prevent memtable in place updates. Should already be disabled
+ // (from Wiki:
+ // In place updates can be enabled by toggling on the bool
+ // inplace_update_support flag. However, this flag is by default set to
+ // false
+ // because this thread-safe in-place update support is not compatible
+ // with concurrent memtable writes. Note that the bool
+ // allow_concurrent_memtable_write is set to true by default )
+ options.inplace_update_support = false;
+ options.allow_concurrent_memtable_write = true;
+
+  // Enforce size of a single MemTable to 1MB (1MB = 1048576 bytes).
+ options.write_buffer_size = 1 << 20;
+ // Initially deactivate the MemPurge prototype.
+ // (negative values are equivalent to 0.0).
+ options.experimental_mempurge_threshold = -25.3;
+ TestFlushListener* listener = new TestFlushListener(options.env, this);
+ options.listeners.emplace_back(listener);
+
+ ASSERT_OK(TryReopen(options));
+ // Dynamically activate the MemPurge prototype without restarting the DB.
+ ColumnFamilyHandle* cfh = db_->DefaultColumnFamily();
+ // Values greater than 1.0 are equivalent to 1.0
+ ASSERT_OK(
+ db_->SetOptions(cfh, {{"experimental_mempurge_threshold", "3.7898"}}));
+ std::atomic<uint32_t> mempurge_count{0};
+ std::atomic<uint32_t> sst_count{0};
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::FlushJob:MemPurgeSuccessful",
+ [&](void* /*arg*/) { mempurge_count++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::FlushJob:SSTFileCreated", [&](void* /*arg*/) { sst_count++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ const size_t KVSIZE = 3;
+ std::vector<std::string> KEYS(KVSIZE);
+ for (size_t k = 0; k < KVSIZE; k++) {
+ KEYS[k] = "IamKey" + std::to_string(k);
+ }
+
+ std::vector<std::string> RNDVALS(KVSIZE);
+ const std::string NOT_FOUND = "NOT_FOUND";
+
+ // Heavy overwrite workload,
+ // more than would fit in maximum allowed memtables.
+ Random rnd(719);
+ const size_t NUM_REPEAT = 100;
+ const size_t RAND_VALUES_LENGTH = 10240;
+
+  // Insertion of K-V pairs, multiple times (overwrites).
+ for (size_t i = 0; i < NUM_REPEAT; i++) {
+ for (size_t j = 0; j < KEYS.size(); j++) {
+ RNDVALS[j] = rnd.RandomString(RAND_VALUES_LENGTH);
+ ASSERT_OK(Put(KEYS[j], RNDVALS[j]));
+ ASSERT_EQ(Get(KEYS[j]), RNDVALS[j]);
+ }
+ for (size_t j = 0; j < KEYS.size(); j++) {
+ ASSERT_EQ(Get(KEYS[j]), RNDVALS[j]);
+ }
+ }
+
+ // Check that there was at least one mempurge
+ const uint32_t EXPECTED_MIN_MEMPURGE_COUNT = 1;
+  // Check that there were no SST files created during flush.
+ const uint32_t EXPECTED_SST_COUNT = 0;
+
+ EXPECT_GE(mempurge_count.exchange(0), EXPECTED_MIN_MEMPURGE_COUNT);
+ EXPECT_EQ(sst_count.exchange(0), EXPECTED_SST_COUNT);
+
+ // Dynamically deactivate MemPurge.
+ ASSERT_OK(
+ db_->SetOptions(cfh, {{"experimental_mempurge_threshold", "-1023.0"}}));
+
+  // Insertion of K-V pairs, multiple times (overwrites).
+ for (size_t i = 0; i < NUM_REPEAT; i++) {
+ for (size_t j = 0; j < KEYS.size(); j++) {
+ RNDVALS[j] = rnd.RandomString(RAND_VALUES_LENGTH);
+ ASSERT_OK(Put(KEYS[j], RNDVALS[j]));
+ ASSERT_EQ(Get(KEYS[j]), RNDVALS[j]);
+ }
+ for (size_t j = 0; j < KEYS.size(); j++) {
+ ASSERT_EQ(Get(KEYS[j]), RNDVALS[j]);
+ }
+ }
+
+  // With MemPurge deactivated, the mempurge count should remain zero.
+ const uint32_t ZERO = 0;
+ // Assert that at least one flush to storage has been performed
+ EXPECT_GT(sst_count.exchange(0), EXPECTED_SST_COUNT);
+ // The mempurge count is expected to be set to 0 when the options are updated.
+ // We expect no mempurge at all.
+ EXPECT_EQ(mempurge_count.exchange(0), ZERO);
+
+ Close();
+}
+// End of MemPurgeBasicToggle, which is not supported with RocksDB LITE because
+// it relies on dynamically changing the experimental_mempurge_threshold option.
+#endif  // !ROCKSDB_LITE
+
+// At the moment, the MemPurge feature is deactivated when atomic_flush is
+// enabled. This is because the level of garbage between column families is
+// not guaranteed to be consistent, so one CF could hypothetically trigger a
+// MemPurge while another CF triggers a regular Flush.
+TEST_F(DBFlushTest, MemPurgeWithAtomicFlush) {
+ Options options = CurrentOptions();
+
+ // The following options are used to enforce several values that
+ // may already exist as default values to make this test resilient
+ // to default value updates in the future.
+ options.statistics = CreateDBStatistics();
+
+ // Record all statistics.
+ options.statistics->set_stats_level(StatsLevel::kAll);
+
+ // create the DB if it's not already present
+ options.create_if_missing = true;
+
+ // Useful for now as we are trying to compare uncompressed data savings on
+ // flush().
+ options.compression = kNoCompression;
+
+ // Prevent memtable in place updates. Should already be disabled
+ // (from Wiki:
+ // In place updates can be enabled by toggling on the bool
+ // inplace_update_support flag. However, this flag is by default set to
+ // false
+ // because this thread-safe in-place update support is not compatible
+ // with concurrent memtable writes. Note that the bool
+ // allow_concurrent_memtable_write is set to true by default )
+ options.inplace_update_support = false;
+ options.allow_concurrent_memtable_write = true;
+
+  // Enforce size of a single MemTable to 1MB (1MB = 1048576 bytes).
+ options.write_buffer_size = 1 << 20;
+ // Activate the MemPurge prototype.
+ options.experimental_mempurge_threshold = 153.245;
+ // Activate atomic_flush.
+ options.atomic_flush = true;
+
+ const std::vector<std::string> new_cf_names = {"pikachu", "eevie"};
+ CreateColumnFamilies(new_cf_names, options);
+
+ Close();
+
+  // 3 CFs: default will be filled with overwrites (would normally trigger
+  // mempurge), new_cf_names[0] will be filled with random values (would
+  // trigger flush), and new_cf_names[1] is not filled with anything.
+ ReopenWithColumnFamilies(
+ {kDefaultColumnFamilyName, new_cf_names[0], new_cf_names[1]}, options);
+ size_t num_cfs = handles_.size();
+ ASSERT_EQ(3, num_cfs);
+ ASSERT_OK(Put(1, "foo", "bar"));
+ ASSERT_OK(Put(2, "bar", "baz"));
+
+ std::atomic<uint32_t> mempurge_count{0};
+ std::atomic<uint32_t> sst_count{0};
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::FlushJob:MemPurgeSuccessful",
+ [&](void* /*arg*/) { mempurge_count++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::FlushJob:SSTFileCreated", [&](void* /*arg*/) { sst_count++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ const size_t KVSIZE = 3;
+ std::vector<std::string> KEYS(KVSIZE);
+ for (size_t k = 0; k < KVSIZE; k++) {
+ KEYS[k] = "IamKey" + std::to_string(k);
+ }
+
+ std::string RNDKEY;
+ std::vector<std::string> RNDVALS(KVSIZE);
+ const std::string NOT_FOUND = "NOT_FOUND";
+
+ // Heavy overwrite workload,
+ // more than would fit in maximum allowed memtables.
+ Random rnd(106);
+ const size_t NUM_REPEAT = 100;
+ const size_t RAND_KEY_LENGTH = 128;
+ const size_t RAND_VALUES_LENGTH = 10240;
+
+  // Insertion of K-V pairs, multiple times (overwrites).
+ for (size_t i = 0; i < NUM_REPEAT; i++) {
+ for (size_t j = 0; j < KEYS.size(); j++) {
+ RNDKEY = rnd.RandomString(RAND_KEY_LENGTH);
+ RNDVALS[j] = rnd.RandomString(RAND_VALUES_LENGTH);
+ ASSERT_OK(Put(KEYS[j], RNDVALS[j]));
+ ASSERT_OK(Put(1, RNDKEY, RNDVALS[j]));
+ ASSERT_EQ(Get(KEYS[j]), RNDVALS[j]);
+ ASSERT_EQ(Get(1, RNDKEY), RNDVALS[j]);
+ }
+ }
+
+ // Check that there was no mempurge because atomic_flush option is true.
+ const uint32_t EXPECTED_MIN_MEMPURGE_COUNT = 0;
+  // Check that there was at least one SST file created during flush.
+ const uint32_t EXPECTED_SST_COUNT = 1;
+
+ EXPECT_EQ(mempurge_count.exchange(0), EXPECTED_MIN_MEMPURGE_COUNT);
+ EXPECT_GE(sst_count.exchange(0), EXPECTED_SST_COUNT);
+
+ Close();
+}
+
+TEST_F(DBFlushTest, MemPurgeDeleteAndDeleteRange) {
+ Options options = CurrentOptions();
+
+ options.statistics = CreateDBStatistics();
+ options.statistics->set_stats_level(StatsLevel::kAll);
+ options.create_if_missing = true;
+ options.compression = kNoCompression;
+ options.inplace_update_support = false;
+ options.allow_concurrent_memtable_write = true;
+#ifndef ROCKSDB_LITE
+ TestFlushListener* listener = new TestFlushListener(options.env, this);
+ options.listeners.emplace_back(listener);
+#endif // !ROCKSDB_LITE
+  // Enforce size of a single MemTable to 1MB (1MB = 1048576 bytes).
+ options.write_buffer_size = 1 << 20;
+ // Activate the MemPurge prototype.
+ options.experimental_mempurge_threshold = 15.0;
+
+ ASSERT_OK(TryReopen(options));
+
+ std::atomic<uint32_t> mempurge_count{0};
+ std::atomic<uint32_t> sst_count{0};
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::FlushJob:MemPurgeSuccessful",
+ [&](void* /*arg*/) { mempurge_count++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::FlushJob:SSTFileCreated", [&](void* /*arg*/) { sst_count++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ std::string KEY1 = "ThisIsKey1";
+ std::string KEY2 = "ThisIsKey2";
+ std::string KEY3 = "ThisIsKey3";
+ std::string KEY4 = "ThisIsKey4";
+ std::string KEY5 = "ThisIsKey5";
+ const std::string NOT_FOUND = "NOT_FOUND";
+
+ Random rnd(117);
+ const size_t NUM_REPEAT = 100;
+ const size_t RAND_VALUES_LENGTH = 10240;
+
+ std::string key, value, p_v1, p_v2, p_v3, p_v3b, p_v4, p_v5;
+ int count = 0;
+ const int EXPECTED_COUNT_FORLOOP = 3;
+ const int EXPECTED_COUNT_END = 4;
+
+ ReadOptions ropt;
+ ropt.pin_data = true;
+ ropt.total_order_seek = true;
+ Iterator* iter = nullptr;
+
+  // Insertion of K-V pairs, multiple times.
+ // Also insert DeleteRange
+ for (size_t i = 0; i < NUM_REPEAT; i++) {
+ // Create value strings of arbitrary length RAND_VALUES_LENGTH bytes.
+ p_v1 = rnd.RandomString(RAND_VALUES_LENGTH);
+ p_v2 = rnd.RandomString(RAND_VALUES_LENGTH);
+ p_v3 = rnd.RandomString(RAND_VALUES_LENGTH);
+ p_v3b = rnd.RandomString(RAND_VALUES_LENGTH);
+ p_v4 = rnd.RandomString(RAND_VALUES_LENGTH);
+ p_v5 = rnd.RandomString(RAND_VALUES_LENGTH);
+ ASSERT_OK(Put(KEY1, p_v1));
+ ASSERT_OK(Put(KEY2, p_v2));
+ ASSERT_OK(Put(KEY3, p_v3));
+ ASSERT_OK(Put(KEY4, p_v4));
+ ASSERT_OK(Put(KEY5, p_v5));
+ ASSERT_OK(Delete(KEY2));
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), KEY2,
+ KEY4));
+ ASSERT_OK(Put(KEY3, p_v3b));
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), KEY1,
+ KEY3));
+ ASSERT_OK(Delete(KEY1));
+
+ ASSERT_EQ(Get(KEY1), NOT_FOUND);
+ ASSERT_EQ(Get(KEY2), NOT_FOUND);
+ ASSERT_EQ(Get(KEY3), p_v3b);
+ ASSERT_EQ(Get(KEY4), p_v4);
+ ASSERT_EQ(Get(KEY5), p_v5);
+
+ iter = db_->NewIterator(ropt);
+ iter->SeekToFirst();
+ count = 0;
+ for (; iter->Valid(); iter->Next()) {
+ ASSERT_OK(iter->status());
+ key = (iter->key()).ToString(false);
+ value = (iter->value()).ToString(false);
+ if (key.compare(KEY3) == 0)
+ ASSERT_EQ(value, p_v3b);
+ else if (key.compare(KEY4) == 0)
+ ASSERT_EQ(value, p_v4);
+ else if (key.compare(KEY5) == 0)
+ ASSERT_EQ(value, p_v5);
+ else
+ ASSERT_EQ(value, NOT_FOUND);
+ count++;
+ }
+
+ // Expected count here is 3: KEY3, KEY4, KEY5.
+ ASSERT_EQ(count, EXPECTED_COUNT_FORLOOP);
+ if (iter) {
+ delete iter;
+ }
+ }
+
+ // Check that there was at least one mempurge
+ const uint32_t EXPECTED_MIN_MEMPURGE_COUNT = 1;
+  // Check that there were no SST files created during flush.
+ const uint32_t EXPECTED_SST_COUNT = 0;
+
+ EXPECT_GE(mempurge_count.exchange(0), EXPECTED_MIN_MEMPURGE_COUNT);
+ EXPECT_EQ(sst_count.exchange(0), EXPECTED_SST_COUNT);
+
+ // Additional test for the iterator+memPurge.
+ ASSERT_OK(Put(KEY2, p_v2));
+ iter = db_->NewIterator(ropt);
+ iter->SeekToFirst();
+ ASSERT_OK(Put(KEY4, p_v4));
+ count = 0;
+ for (; iter->Valid(); iter->Next()) {
+ ASSERT_OK(iter->status());
+ key = (iter->key()).ToString(false);
+ value = (iter->value()).ToString(false);
+ if (key.compare(KEY2) == 0)
+ ASSERT_EQ(value, p_v2);
+ else if (key.compare(KEY3) == 0)
+ ASSERT_EQ(value, p_v3b);
+ else if (key.compare(KEY4) == 0)
+ ASSERT_EQ(value, p_v4);
+ else if (key.compare(KEY5) == 0)
+ ASSERT_EQ(value, p_v5);
+ else
+ ASSERT_EQ(value, NOT_FOUND);
+ count++;
+ }
+
+ // Expected count here is 4: KEY2, KEY3, KEY4, KEY5.
+ ASSERT_EQ(count, EXPECTED_COUNT_END);
+ if (iter) delete iter;
+
+ Close();
+}
+
+// Create a CompactionFilter that will be invoked at flush time and will
+// update the value of a KV pair if the key string is "lower" than the
+// filtered_key_ string.
+class ConditionalUpdateFilter : public CompactionFilter {
+ public:
+ explicit ConditionalUpdateFilter(const std::string* filtered_key)
+ : filtered_key_(filtered_key) {}
+ bool Filter(int /*level*/, const Slice& key, const Slice& /*value*/,
+ std::string* new_value, bool* value_changed) const override {
+ // If key<filtered_key_, update the value of the KV-pair.
+ if (key.compare(*filtered_key_) < 0) {
+ assert(new_value != nullptr);
+ *new_value = NEW_VALUE;
+ *value_changed = true;
+ }
+ return false /*do not remove this KV-pair*/;
+ }
+
+ const char* Name() const override { return "ConditionalUpdateFilter"; }
+
+ private:
+ const std::string* filtered_key_;
+};
+
+class ConditionalUpdateFilterFactory : public CompactionFilterFactory {
+ public:
+ explicit ConditionalUpdateFilterFactory(const Slice& filtered_key)
+ : filtered_key_(filtered_key.ToString()) {}
+
+ std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+ const CompactionFilter::Context& /*context*/) override {
+ return std::unique_ptr<CompactionFilter>(
+ new ConditionalUpdateFilter(&filtered_key_));
+ }
+
+ const char* Name() const override { return "ConditionalUpdateFilterFactory"; }
+
+ bool ShouldFilterTableFileCreation(
+ TableFileCreationReason reason) const override {
+ // This compaction filter will be invoked
+ // at flush time (and therefore at MemPurge time).
+ return (reason == TableFileCreationReason::kFlush);
+ }
+
+ private:
+ std::string filtered_key_;
+};
+
+TEST_F(DBFlushTest, MemPurgeAndCompactionFilter) {
+ Options options = CurrentOptions();
+
+ std::string KEY1 = "ThisIsKey1";
+ std::string KEY2 = "ThisIsKey2";
+ std::string KEY3 = "ThisIsKey3";
+ std::string KEY4 = "ThisIsKey4";
+ std::string KEY5 = "ThisIsKey5";
+ std::string KEY6 = "ThisIsKey6";
+ std::string KEY7 = "ThisIsKey7";
+ std::string KEY8 = "ThisIsKey8";
+ std::string KEY9 = "ThisIsKey9";
+ const std::string NOT_FOUND = "NOT_FOUND";
+
+ options.statistics = CreateDBStatistics();
+ options.statistics->set_stats_level(StatsLevel::kAll);
+ options.create_if_missing = true;
+ options.compression = kNoCompression;
+ options.inplace_update_support = false;
+ options.allow_concurrent_memtable_write = true;
+#ifndef ROCKSDB_LITE
+ TestFlushListener* listener = new TestFlushListener(options.env, this);
+ options.listeners.emplace_back(listener);
+#endif // !ROCKSDB_LITE
+ // Create a ConditionalUpdate compaction filter
+ // that will update all the values of the KV pairs
+ // where the keys are "lower" than KEY4.
+ options.compaction_filter_factory =
+ std::make_shared<ConditionalUpdateFilterFactory>(KEY4);
+
+  // Enforce size of a single MemTable to 1MB (1MB = 1048576 bytes).
+ options.write_buffer_size = 1 << 20;
+ // Activate the MemPurge prototype.
+ options.experimental_mempurge_threshold = 26.55;
+
+ ASSERT_OK(TryReopen(options));
+
+ std::atomic<uint32_t> mempurge_count{0};
+ std::atomic<uint32_t> sst_count{0};
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::FlushJob:MemPurgeSuccessful",
+ [&](void* /*arg*/) { mempurge_count++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::FlushJob:SSTFileCreated", [&](void* /*arg*/) { sst_count++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Random rnd(53);
+ const size_t NUM_REPEAT = 1000;
+ const size_t RAND_VALUES_LENGTH = 10240;
+ std::string p_v1, p_v2, p_v3, p_v4, p_v5, p_v6, p_v7, p_v8, p_v9;
+
+ p_v1 = rnd.RandomString(RAND_VALUES_LENGTH);
+ p_v2 = rnd.RandomString(RAND_VALUES_LENGTH);
+ p_v3 = rnd.RandomString(RAND_VALUES_LENGTH);
+ p_v4 = rnd.RandomString(RAND_VALUES_LENGTH);
+ p_v5 = rnd.RandomString(RAND_VALUES_LENGTH);
+ ASSERT_OK(Put(KEY1, p_v1));
+ ASSERT_OK(Put(KEY2, p_v2));
+ ASSERT_OK(Put(KEY3, p_v3));
+ ASSERT_OK(Put(KEY4, p_v4));
+ ASSERT_OK(Put(KEY5, p_v5));
+ ASSERT_OK(Delete(KEY1));
+
+  // Insertion of K-V pairs, multiple times.
+ for (size_t i = 0; i < NUM_REPEAT; i++) {
+ // Create value strings of arbitrary
+ // length RAND_VALUES_LENGTH bytes.
+ p_v6 = rnd.RandomString(RAND_VALUES_LENGTH);
+ p_v7 = rnd.RandomString(RAND_VALUES_LENGTH);
+ p_v8 = rnd.RandomString(RAND_VALUES_LENGTH);
+ p_v9 = rnd.RandomString(RAND_VALUES_LENGTH);
+ ASSERT_OK(Put(KEY6, p_v6));
+ ASSERT_OK(Put(KEY7, p_v7));
+ ASSERT_OK(Put(KEY8, p_v8));
+ ASSERT_OK(Put(KEY9, p_v9));
+
+ ASSERT_OK(Delete(KEY7));
+ }
+
+ // Check that there was at least one mempurge
+ const uint32_t EXPECTED_MIN_MEMPURGE_COUNT = 1;
+  // Check that there were no SST files created during flush.
+ const uint32_t EXPECTED_SST_COUNT = 0;
+
+ EXPECT_GE(mempurge_count.exchange(0), EXPECTED_MIN_MEMPURGE_COUNT);
+ EXPECT_EQ(sst_count.exchange(0), EXPECTED_SST_COUNT);
+
+ // Verify that the ConditionalUpdateCompactionFilter
+ // updated the values of KEY2 and KEY3, and not KEY4 and KEY5.
+ ASSERT_EQ(Get(KEY1), NOT_FOUND);
+ ASSERT_EQ(Get(KEY2), NEW_VALUE);
+ ASSERT_EQ(Get(KEY3), NEW_VALUE);
+ ASSERT_EQ(Get(KEY4), p_v4);
+ ASSERT_EQ(Get(KEY5), p_v5);
+}
+
+TEST_F(DBFlushTest, DISABLED_MemPurgeWALSupport) {
+ Options options = CurrentOptions();
+
+ options.statistics = CreateDBStatistics();
+ options.statistics->set_stats_level(StatsLevel::kAll);
+ options.create_if_missing = true;
+ options.compression = kNoCompression;
+ options.inplace_update_support = false;
+ options.allow_concurrent_memtable_write = true;
+
+ // Enforce size of a single MemTable to 128KB.
+ options.write_buffer_size = 128 << 10;
+ // Activate the MemPurge prototype
+ // (values >1.0 are equivalent to 1.0).
+ options.experimental_mempurge_threshold = 2.5;
+
+ ASSERT_OK(TryReopen(options));
+
+ const size_t KVSIZE = 10;
+
+ do {
+ CreateAndReopenWithCF({"pikachu"}, options);
+ ASSERT_OK(Put(1, "foo", "v1"));
+ ASSERT_OK(Put(1, "baz", "v5"));
+
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ ASSERT_EQ("v1", Get(1, "foo"));
+
+ ASSERT_EQ("v1", Get(1, "foo"));
+ ASSERT_EQ("v5", Get(1, "baz"));
+ ASSERT_OK(Put(0, "bar", "v2"));
+ ASSERT_OK(Put(1, "bar", "v2"));
+ ASSERT_OK(Put(1, "foo", "v3"));
+ std::atomic<uint32_t> mempurge_count{0};
+ std::atomic<uint32_t> sst_count{0};
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::FlushJob:MemPurgeSuccessful",
+ [&](void* /*arg*/) { mempurge_count++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::FlushJob:SSTFileCreated", [&](void* /*arg*/) { sst_count++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ std::vector<std::string> keys;
+ for (size_t k = 0; k < KVSIZE; k++) {
+ keys.push_back("IamKey" + std::to_string(k));
+ }
+
+ std::string RNDKEY, RNDVALUE;
+ const std::string NOT_FOUND = "NOT_FOUND";
+
+ // Heavy overwrite workload,
+ // more than would fit in maximum allowed memtables.
+ Random rnd(719);
+ const size_t NUM_REPEAT = 100;
+ const size_t RAND_KEY_LENGTH = 4096;
+ const size_t RAND_VALUES_LENGTH = 1024;
+ std::vector<std::string> values_default(KVSIZE), values_pikachu(KVSIZE);
+
+ // Insert a very first set of keys that will be
+ // mempurged at least once.
+ for (size_t k = 0; k < KVSIZE / 2; k++) {
+ values_default[k] = rnd.RandomString(RAND_VALUES_LENGTH);
+ values_pikachu[k] = rnd.RandomString(RAND_VALUES_LENGTH);
+ }
+
+ // Insert keys[0:KVSIZE/2] to
+ // both 'default' and 'pikachu' CFs.
+ for (size_t k = 0; k < KVSIZE / 2; k++) {
+ ASSERT_OK(Put(0, keys[k], values_default[k]));
+ ASSERT_OK(Put(1, keys[k], values_pikachu[k]));
+ }
+
+ // Check that the insertion was seamless.
+ for (size_t k = 0; k < KVSIZE / 2; k++) {
+ ASSERT_EQ(Get(0, keys[k]), values_default[k]);
+ ASSERT_EQ(Get(1, keys[k]), values_pikachu[k]);
+ }
+
+    // Insertion of K-V pairs, multiple times (overwrites)
+ // into 'default' CF. Will trigger mempurge.
+ for (size_t j = 0; j < NUM_REPEAT; j++) {
+ // Create value strings of arbitrary length RAND_VALUES_LENGTH bytes.
+ for (size_t k = KVSIZE / 2; k < KVSIZE; k++) {
+ values_default[k] = rnd.RandomString(RAND_VALUES_LENGTH);
+ }
+
+ // Insert K-V into default CF.
+ for (size_t k = KVSIZE / 2; k < KVSIZE; k++) {
+ ASSERT_OK(Put(0, keys[k], values_default[k]));
+ }
+
+      // Check key validity for all keys in the default CF (and, just below,
+      // for the keys inserted so far into the pikachu CF).
+ for (size_t k = 0; k < KVSIZE; k++) {
+ ASSERT_EQ(Get(0, keys[k]), values_default[k]);
+ }
+ // Note that at this point, only keys[0:KVSIZE/2]
+ // have been inserted into Pikachu.
+ for (size_t k = 0; k < KVSIZE / 2; k++) {
+ ASSERT_EQ(Get(1, keys[k]), values_pikachu[k]);
+ }
+ }
+
+    // Insertion of K-V pairs, multiple times (overwrites)
+ // into 'pikachu' CF. Will trigger mempurge.
+ // Check that we keep the older logs for 'default' imm().
+ for (size_t j = 0; j < NUM_REPEAT; j++) {
+ // Create value strings of length RAND_VALUES_LENGTH bytes.
+ for (size_t k = KVSIZE / 2; k < KVSIZE; k++) {
+ values_pikachu[k] = rnd.RandomString(RAND_VALUES_LENGTH);
+ }
+
+ // Insert K-V into pikachu CF.
+ for (size_t k = KVSIZE / 2; k < KVSIZE; k++) {
+ ASSERT_OK(Put(1, keys[k], values_pikachu[k]));
+ }
+
+ // Check key validity, for all keys,
+ // both in default and pikachu.
+ for (size_t k = 0; k < KVSIZE; k++) {
+ ASSERT_EQ(Get(0, keys[k]), values_default[k]);
+ ASSERT_EQ(Get(1, keys[k]), values_pikachu[k]);
+ }
+ }
+
+ // Check that there was at least one mempurge
+ const uint32_t EXPECTED_MIN_MEMPURGE_COUNT = 1;
+ // Check that no SST files were created during flush.
+ const uint32_t EXPECTED_SST_COUNT = 0;
+
+ EXPECT_GE(mempurge_count.exchange(0), EXPECTED_MIN_MEMPURGE_COUNT);
+ if (options.experimental_mempurge_threshold ==
+ std::numeric_limits<double>::max()) {
+ EXPECT_EQ(sst_count.exchange(0), EXPECTED_SST_COUNT);
+ }
+
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ // Check that there was no data corruption anywhere,
+ // neither in the 'default' nor in the 'pikachu' CF.
+ ASSERT_EQ("v3", Get(1, "foo"));
+ ASSERT_OK(Put(1, "foo", "v4"));
+ ASSERT_EQ("v4", Get(1, "foo"));
+ ASSERT_EQ("v2", Get(1, "bar"));
+ ASSERT_EQ("v5", Get(1, "baz"));
+ // Check keys in 'default' and 'pikachu'.
+ // keys[0:KVSIZE/2] were certainly contained
+ // in the imm() at Reopen/recovery time.
+ for (size_t k = 0; k < KVSIZE; k++) {
+ ASSERT_EQ(Get(0, keys[k]), values_default[k]);
+ ASSERT_EQ(Get(1, keys[k]), values_pikachu[k]);
+ }
+ // Insertion of random K-V pairs to trigger
+ // a flush in the Pikachu CF.
+ for (size_t j = 0; j < NUM_REPEAT; j++) {
+ RNDKEY = rnd.RandomString(RAND_KEY_LENGTH);
+ RNDVALUE = rnd.RandomString(RAND_VALUES_LENGTH);
+ ASSERT_OK(Put(1, RNDKEY, RNDVALUE));
+ }
+ // Assert that there was at least one flush to storage.
+ EXPECT_GT(sst_count.exchange(0), EXPECTED_SST_COUNT);
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ ASSERT_EQ("v4", Get(1, "foo"));
+ ASSERT_EQ("v2", Get(1, "bar"));
+ ASSERT_EQ("v5", Get(1, "baz"));
+ // Since values in 'default' are held in the mutable mem()
+ // and in imm(), check that the flush in 'pikachu' did not
+ // affect these values.
+ for (size_t k = 0; k < KVSIZE; k++) {
+ ASSERT_EQ(Get(0, keys[k]), values_default[k]);
+ ASSERT_EQ(Get(1, keys[k]), values_pikachu[k]);
+ }
+ ASSERT_EQ(Get(1, RNDKEY), RNDVALUE);
+ } while (ChangeWalOptions());
+}
+
+TEST_F(DBFlushTest, MemPurgeCorrectLogNumberAndSSTFileCreation) {
+ // Before our bug fix, we noticed that when 2 memtables were
+ // being flushed (with one memtable being the output of a
+ // previous MemPurge and one memtable being a newly-sealed memtable),
+ // the SST file created was not properly added to the DB version
+ // (via the VersionEdit obj), leading to data loss (the SST file
+ // was later being purged as an obsolete file).
+ // Therefore, we reproduce this scenario to test our fix.
+ Options options = CurrentOptions();
+
+ options.create_if_missing = true;
+ options.compression = kNoCompression;
+ options.inplace_update_support = false;
+ options.allow_concurrent_memtable_write = true;
+
+ // Enforce size of a single MemTable to 1MB (1MB = 1048576 bytes).
+ options.write_buffer_size = 1 << 20;
+ // Activate the MemPurge prototype.
+ options.experimental_mempurge_threshold = 1.0;
+
+ // Force more than one memtable to be present before a flush is triggered.
+ // For some reason this option does not seem to be enforced,
+ // so the checks below are designed to make sure that we
+ // are indeed exercising the intended scenario.
+ options.min_write_buffer_number_to_merge = 3;
+ options.max_write_buffer_number = 5;
+ options.max_write_buffer_size_to_maintain = 2 * (options.write_buffer_size);
+ options.disable_auto_compactions = true;
+ ASSERT_OK(TryReopen(options));
+
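+ // Use sync points to count successful mempurge events and SST files
+ // created at flush time; the counters are checked below.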
+ std::atomic<uint32_t> mempurge_count{0};
+ std::atomic<uint32_t> sst_count{0};
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::FlushJob:MemPurgeSuccessful",
+ [&](void* /*arg*/) { mempurge_count++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::FlushJob:SSTFileCreated", [&](void* /*arg*/) { sst_count++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // Dummy variable used for the following callback function.
+ uint64_t ZERO = 0;
+ // We will first execute mempurge operations exclusively.
+ // Therefore, when the first flush is triggered, we want to make
+ // sure there are at least 2 memtables being flushed: one output
+ // from a previous mempurge, and one newly sealed memtable.
+ // This is when we observed in the past that some SST files created
+ // were not properly added to the DB version (via the VersionEdit obj).
+ std::atomic<uint64_t> num_memtable_at_first_flush(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "FlushJob::WriteLevel0Table:num_memtables", [&](void* arg) {
+ uint64_t* mems_size = reinterpret_cast<uint64_t*>(arg);
+ // atomic_compare_exchange_strong updates the value of ZERO
+ // (the "expected" object) when the exchange fails, so we make sure
+ // ZERO is reset to zero before each call.
+ ZERO = 0;
+ std::atomic_compare_exchange_strong(&num_memtable_at_first_flush, &ZERO,
+ *mems_size);
+ });
+
+ const std::vector<std::string> KEYS = {
+ "ThisIsKey1", "ThisIsKey2", "ThisIsKey3", "ThisIsKey4", "ThisIsKey5",
+ "ThisIsKey6", "ThisIsKey7", "ThisIsKey8", "ThisIsKey9"};
+ const std::string NOT_FOUND = "NOT_FOUND";
+
+ Random rnd(117);
+ const uint64_t NUM_REPEAT_OVERWRITES = 100;
+ const uint64_t NUM_RAND_INSERTS = 500;
+ const uint64_t RAND_VALUES_LENGTH = 10240;
+
+ std::string key, value;
+ std::vector<std::string> values(9, "");
+
+ // Keys used to check that no SST file disappeared.
+ for (uint64_t k = 0; k < 5; k++) {
+ values[k] = rnd.RandomString(RAND_VALUES_LENGTH);
+ ASSERT_OK(Put(KEYS[k], values[k]));
+ }
+
+ // Insertion of K-V pairs, multiple times.
+ // Trigger at least one mempurge and no SST file creation.
+ for (size_t i = 0; i < NUM_REPEAT_OVERWRITES; i++) {
+ // Create value strings of length RAND_VALUES_LENGTH bytes.
+ for (uint64_t k = 5; k < values.size(); k++) {
+ values[k] = rnd.RandomString(RAND_VALUES_LENGTH);
+ ASSERT_OK(Put(KEYS[k], values[k]));
+ }
+ // Check database consistency.
+ for (uint64_t k = 0; k < values.size(); k++) {
+ ASSERT_EQ(Get(KEYS[k]), values[k]);
+ }
+ }
+
+ // Check that there was at least one mempurge
+ uint32_t expected_min_mempurge_count = 1;
+ // Check that no SST files were created during flush.
+ uint32_t expected_sst_count = 0;
+ EXPECT_GE(mempurge_count.load(), expected_min_mempurge_count);
+ EXPECT_EQ(sst_count.load(), expected_sst_count);
+
+ // Trigger an SST file creation and no mempurge.
+ for (size_t i = 0; i < NUM_RAND_INSERTS; i++) {
+ key = rnd.RandomString(RAND_VALUES_LENGTH);
+ // Create value strings of length RAND_VALUES_LENGTH bytes.
+ value = rnd.RandomString(RAND_VALUES_LENGTH);
+ ASSERT_OK(Put(key, value));
+ // Check database consistency.
+ for (uint64_t k = 0; k < values.size(); k++) {
+ ASSERT_EQ(Get(KEYS[k]), values[k]);
+ }
+ ASSERT_EQ(Get(key), value);
+ }
+
+ // Check that at least one SST file was created during flush.
+ expected_sst_count = 1;
+ EXPECT_GE(sst_count.load(), expected_sst_count);
+
+ // Oddly enough, num_memtable_at_first_flush is not enforced to be
+ // equal to min_write_buffer_number_to_merge. Therefore, we assert that
+ // the first SST file creation was fed by at least two memtables: one
+ // output memtable from a previous mempurge, and one newly sealed
+ // memtable. This is the scenario where we observed that some SST files
+ // created were not properly added to the DB version before our bug fix.
+ ASSERT_GE(num_memtable_at_first_flush.load(), 2);
+
+ // Check that no data was lost after SST file creation.
+ for (uint64_t k = 0; k < values.size(); k++) {
+ ASSERT_EQ(Get(KEYS[k]), values[k]);
+ }
+ // Extra check of database consistency.
+ ASSERT_EQ(Get(key), value);
+
+ Close();
+}
+
+TEST_P(DBFlushDirectIOTest, DirectIO) {
+ Options options;
+ options.create_if_missing = true;
+ options.disable_auto_compactions = true;
+ options.max_background_flushes = 2;
+ options.use_direct_io_for_flush_and_compaction = GetParam();
+ options.env = MockEnv::Create(Env::Default());
+ SyncPoint::GetInstance()->SetCallBack(
+ "BuildTable:create_file", [&](void* arg) {
+ bool* use_direct_writes = static_cast<bool*>(arg);
+ ASSERT_EQ(*use_direct_writes,
+ options.use_direct_io_for_flush_and_compaction);
+ });
+
+ SyncPoint::GetInstance()->EnableProcessing();
+ Reopen(options);
+ ASSERT_OK(Put("foo", "v"));
+ FlushOptions flush_options;
+ flush_options.wait = true;
+ ASSERT_OK(dbfull()->Flush(flush_options));
+ Destroy(options);
+ delete options.env;
+}
+
+TEST_F(DBFlushTest, FlushError) {
+ Options options;
+ std::unique_ptr<FaultInjectionTestEnv> fault_injection_env(
+ new FaultInjectionTestEnv(env_));
+ options.write_buffer_size = 100;
+ options.max_write_buffer_number = 4;
+ options.min_write_buffer_number_to_merge = 3;
+ options.disable_auto_compactions = true;
+ options.env = fault_injection_env.get();
+ Reopen(options);
+
+ ASSERT_OK(Put("key1", "value1"));
+ ASSERT_OK(Put("key2", "value2"));
+ fault_injection_env->SetFilesystemActive(false);
+ Status s = dbfull()->TEST_SwitchMemtable();
+ fault_injection_env->SetFilesystemActive(true);
+ Destroy(options);
+ ASSERT_NE(s, Status::OK());
+}
+
+TEST_F(DBFlushTest, ManualFlushFailsInReadOnlyMode) {
+ // Regression test for bug where manual flush hangs forever when the DB
+ // is in read-only mode. Verify it now at least returns, despite failing.
+ Options options;
+ std::unique_ptr<FaultInjectionTestEnv> fault_injection_env(
+ new FaultInjectionTestEnv(env_));
+ options.env = fault_injection_env.get();
+ options.max_write_buffer_number = 2;
+ Reopen(options);
+
+ // Trigger a first flush but don't let it run
+ ASSERT_OK(db_->PauseBackgroundWork());
+ ASSERT_OK(Put("key1", "value1"));
+ FlushOptions flush_opts;
+ flush_opts.wait = false;
+ ASSERT_OK(db_->Flush(flush_opts));
+
+ // Write a key to the second memtable so we have something to flush later
+ // after the DB is in read-only mode.
+ ASSERT_OK(Put("key2", "value2"));
+
+ // Let the first flush continue, hit an error, and put the DB in read-only
+ // mode.
+ fault_injection_env->SetFilesystemActive(false);
+ ASSERT_OK(db_->ContinueBackgroundWork());
+ // We injected the error into the env, so the returned status is not OK.
+ ASSERT_NOK(dbfull()->TEST_WaitForFlushMemTable());
+#ifndef ROCKSDB_LITE
+ uint64_t num_bg_errors;
+ ASSERT_TRUE(
+ db_->GetIntProperty(DB::Properties::kBackgroundErrors, &num_bg_errors));
+ ASSERT_GT(num_bg_errors, 0);
+#endif // ROCKSDB_LITE
+
+ // In the bug scenario, triggering another flush would cause the second flush
+ // to hang forever. After the fix we expect it to return an error.
+ ASSERT_NOK(db_->Flush(FlushOptions()));
+
+ Close();
+}
+
+TEST_F(DBFlushTest, CFDropRaceWithWaitForFlushMemTables) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ CreateAndReopenWithCF({"pikachu"}, options);
+ SyncPoint::GetInstance()->DisableProcessing();
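+ // The dependencies below make the column family drop happen after the
+ // flush has been scheduled but before the background flush starts, so
+ // that waiting for the flush races with the CF drop.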
+ SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::FlushMemTable:AfterScheduleFlush",
+ "DBFlushTest::CFDropRaceWithWaitForFlushMemTables:BeforeDrop"},
+ {"DBFlushTest::CFDropRaceWithWaitForFlushMemTables:AfterFree",
+ "DBImpl::BackgroundCallFlush:start"},
+ {"DBImpl::BackgroundCallFlush:start",
+ "DBImpl::FlushMemTable:BeforeWaitForBgFlush"}});
+ SyncPoint::GetInstance()->EnableProcessing();
+ ASSERT_EQ(2, handles_.size());
+ ASSERT_OK(Put(1, "key", "value"));
+ auto* cfd = static_cast<ColumnFamilyHandleImpl*>(handles_[1])->cfd();
+ port::Thread drop_cf_thr([&]() {
+ TEST_SYNC_POINT(
+ "DBFlushTest::CFDropRaceWithWaitForFlushMemTables:BeforeDrop");
+ ASSERT_OK(dbfull()->DropColumnFamily(handles_[1]));
+ ASSERT_OK(dbfull()->DestroyColumnFamilyHandle(handles_[1]));
+ handles_.resize(1);
+ TEST_SYNC_POINT(
+ "DBFlushTest::CFDropRaceWithWaitForFlushMemTables:AfterFree");
+ });
+ FlushOptions flush_opts;
+ flush_opts.allow_write_stall = true;
+ ASSERT_NOK(dbfull()->TEST_FlushMemTable(cfd, flush_opts));
+ drop_cf_thr.join();
+ Close();
+ SyncPoint::GetInstance()->DisableProcessing();
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBFlushTest, FireOnFlushCompletedAfterCommittedResult) {
+ class TestListener : public EventListener {
+ public:
+ void OnFlushCompleted(DB* db, const FlushJobInfo& info) override {
+ // There's only one key in each flush.
+ ASSERT_EQ(info.smallest_seqno, info.largest_seqno);
+ ASSERT_NE(0, info.smallest_seqno);
+ if (info.smallest_seqno == seq1) {
+ // First flush completed
+ ASSERT_FALSE(completed1);
+ completed1 = true;
+ CheckFlushResultCommitted(db, seq1);
+ } else {
+ // Second flush completed
+ ASSERT_FALSE(completed2);
+ completed2 = true;
+ ASSERT_EQ(info.smallest_seqno, seq2);
+ CheckFlushResultCommitted(db, seq2);
+ }
+ }
+
+ void CheckFlushResultCommitted(DB* db, SequenceNumber seq) {
+ DBImpl* db_impl = static_cast_with_check<DBImpl>(db);
+ InstrumentedMutex* mutex = db_impl->mutex();
+ mutex->Lock();
+ auto* cfd = static_cast_with_check<ColumnFamilyHandleImpl>(
+ db->DefaultColumnFamily())
+ ->cfd();
+ ASSERT_LT(seq, cfd->imm()->current()->GetEarliestSequenceNumber());
+ mutex->Unlock();
+ }
+
+ std::atomic<SequenceNumber> seq1{0};
+ std::atomic<SequenceNumber> seq2{0};
+ std::atomic<bool> completed1{false};
+ std::atomic<bool> completed2{false};
+ };
+ std::shared_ptr<TestListener> listener = std::make_shared<TestListener>();
+
+ SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::FlushMemTableToOutputFile:AfterPickMemtables",
+ "DBFlushTest::FireOnFlushCompletedAfterCommittedResult:WaitFirst"},
+ {"DBImpl::FlushMemTableToOutputFile:Finish",
+ "DBFlushTest::FireOnFlushCompletedAfterCommittedResult:WaitSecond"}});
+ SyncPoint::GetInstance()->SetCallBack(
+ "FlushJob::WriteLevel0Table", [&listener](void* arg) {
+ // Wait for the second flush to finish, outside of the mutex.
+ auto* mems = reinterpret_cast<autovector<MemTable*>*>(arg);
+ if (mems->front()->GetEarliestSequenceNumber() == listener->seq1 - 1) {
+ TEST_SYNC_POINT(
+ "DBFlushTest::FireOnFlushCompletedAfterCommittedResult:"
+ "WaitSecond");
+ }
+ });
+
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.listeners.push_back(listener);
+ // Setting max_flush_jobs = max_background_jobs / 4 = 2.
+ options.max_background_jobs = 8;
+ // Allow 2 immutable memtables.
+ options.max_write_buffer_number = 3;
+ Reopen(options);
+ SyncPoint::GetInstance()->EnableProcessing();
+ ASSERT_OK(Put("foo", "v"));
+ listener->seq1 = db_->GetLatestSequenceNumber();
+ // t1 will wait for the second flush complete before committing flush result.
+ auto t1 = port::Thread([&]() {
+ // flush_opts.wait = true
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ });
+ // Wait for first flush started.
+ TEST_SYNC_POINT(
+ "DBFlushTest::FireOnFlushCompletedAfterCommittedResult:WaitFirst");
+ // The second flush will exit early without committing its result. The
+ // work is delegated to the first flush.
+ ASSERT_OK(Put("bar", "v"));
+ listener->seq2 = db_->GetLatestSequenceNumber();
+ FlushOptions flush_opts;
+ flush_opts.wait = false;
+ ASSERT_OK(db_->Flush(flush_opts));
+ t1.join();
+ // Ensure background work is fully finished including listener callbacks
+ // before accessing listener state.
+ ASSERT_OK(dbfull()->TEST_WaitForBackgroundWork());
+ ASSERT_TRUE(listener->completed1);
+ ASSERT_TRUE(listener->completed2);
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+#endif // !ROCKSDB_LITE
+
+TEST_F(DBFlushTest, FlushWithBlob) {
+ constexpr uint64_t min_blob_size = 10;
+
+ Options options;
+ options.enable_blob_files = true;
+ options.min_blob_size = min_blob_size;
+ options.disable_auto_compactions = true;
+ options.env = env_;
+
+ Reopen(options);
+
+ constexpr char short_value[] = "short";
+ static_assert(sizeof(short_value) - 1 < min_blob_size,
+ "short_value too long");
+
+ constexpr char long_value[] = "long_value";
+ static_assert(sizeof(long_value) - 1 >= min_blob_size,
+ "long_value too short");
+
+ ASSERT_OK(Put("key1", short_value));
+ ASSERT_OK(Put("key2", long_value));
+
+ ASSERT_OK(Flush());
+
+ ASSERT_EQ(Get("key1"), short_value);
+ ASSERT_EQ(Get("key2"), long_value);
+
+ VersionSet* const versions = dbfull()->GetVersionSet();
+ assert(versions);
+
+ ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault();
+ assert(cfd);
+
+ Version* const current = cfd->current();
+ assert(current);
+
+ const VersionStorageInfo* const storage_info = current->storage_info();
+ assert(storage_info);
+
+ const auto& l0_files = storage_info->LevelFiles(0);
+ ASSERT_EQ(l0_files.size(), 1);
+
+ const FileMetaData* const table_file = l0_files[0];
+ assert(table_file);
+
+ const auto& blob_files = storage_info->GetBlobFiles();
+ ASSERT_EQ(blob_files.size(), 1);
+
+ const auto& blob_file = blob_files.front();
+ assert(blob_file);
+
+ ASSERT_EQ(table_file->smallest.user_key(), "key1");
+ ASSERT_EQ(table_file->largest.user_key(), "key2");
+ ASSERT_EQ(table_file->fd.smallest_seqno, 1);
+ ASSERT_EQ(table_file->fd.largest_seqno, 2);
+ ASSERT_EQ(table_file->oldest_blob_file_number,
+ blob_file->GetBlobFileNumber());
+
+ ASSERT_EQ(blob_file->GetTotalBlobCount(), 1);
+
+#ifndef ROCKSDB_LITE
+ const InternalStats* const internal_stats = cfd->internal_stats();
+ assert(internal_stats);
+
+ const auto& compaction_stats = internal_stats->TEST_GetCompactionStats();
+ ASSERT_FALSE(compaction_stats.empty());
+ ASSERT_EQ(compaction_stats[0].bytes_written, table_file->fd.GetFileSize());
+ ASSERT_EQ(compaction_stats[0].bytes_written_blob,
+ blob_file->GetTotalBlobBytes());
+ ASSERT_EQ(compaction_stats[0].num_output_files, 1);
+ ASSERT_EQ(compaction_stats[0].num_output_files_blob, 1);
+
+ const uint64_t* const cf_stats_value = internal_stats->TEST_GetCFStatsValue();
+ ASSERT_EQ(cf_stats_value[InternalStats::BYTES_FLUSHED],
+ compaction_stats[0].bytes_written +
+ compaction_stats[0].bytes_written_blob);
+#endif // ROCKSDB_LITE
+}
+
+TEST_F(DBFlushTest, FlushWithChecksumHandoff1) {
+ if (mem_env_ || encrypted_env_) {
+ ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment");
+ return;
+ }
+ std::shared_ptr<FaultInjectionTestFS> fault_fs(
+ new FaultInjectionTestFS(FileSystem::Default()));
+ std::unique_ptr<Env> fault_fs_env(NewCompositeEnv(fault_fs));
+ Options options = CurrentOptions();
+ options.write_buffer_size = 100;
+ options.max_write_buffer_number = 4;
+ options.min_write_buffer_number_to_merge = 3;
+ options.disable_auto_compactions = true;
+ options.env = fault_fs_env.get();
+ options.checksum_handoff_file_types.Add(FileType::kTableFile);
+ Reopen(options);
+
+ fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c);
+ ASSERT_OK(Put("key1", "value1"));
+ ASSERT_OK(Put("key2", "value2"));
+ ASSERT_OK(dbfull()->TEST_SwitchMemtable());
+
+ // The hash does not match, so the write fails.
+ // fault_fs->SetChecksumHandoffFuncType(ChecksumType::kxxHash);
+ // Since the file system returns IOStatus::Corruption, it is an
+ // unrecoverable error.
+ SyncPoint::GetInstance()->SetCallBack("FlushJob::Start", [&](void*) {
+ fault_fs->SetChecksumHandoffFuncType(ChecksumType::kxxHash);
+ });
+ ASSERT_OK(Put("key3", "value3"));
+ ASSERT_OK(Put("key4", "value4"));
+ SyncPoint::GetInstance()->EnableProcessing();
+ Status s = Flush();
+ ASSERT_EQ(s.severity(),
+ ROCKSDB_NAMESPACE::Status::Severity::kUnrecoverableError);
+ SyncPoint::GetInstance()->DisableProcessing();
+ Destroy(options);
+ Reopen(options);
+
+ // The file system does not support checksum handoff. The check
+ // will be ignored.
+ fault_fs->SetChecksumHandoffFuncType(ChecksumType::kNoChecksum);
+ ASSERT_OK(Put("key5", "value5"));
+ ASSERT_OK(Put("key6", "value6"));
+ ASSERT_OK(dbfull()->TEST_SwitchMemtable());
+
+ // Each write will be simulated as corrupted.
+ // Since the file system returns IOStatus::Corruption, it is an
+ // unrecoverable error.
+ fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c);
+ SyncPoint::GetInstance()->SetCallBack("FlushJob::Start", [&](void*) {
+ fault_fs->IngestDataCorruptionBeforeWrite();
+ });
+ ASSERT_OK(Put("key7", "value7"));
+ ASSERT_OK(Put("key8", "value8"));
+ SyncPoint::GetInstance()->EnableProcessing();
+ s = Flush();
+ ASSERT_EQ(s.severity(),
+ ROCKSDB_NAMESPACE::Status::Severity::kUnrecoverableError);
+ SyncPoint::GetInstance()->DisableProcessing();
+
+ Destroy(options);
+}
+
+TEST_F(DBFlushTest, FlushWithChecksumHandoff2) {
+ if (mem_env_ || encrypted_env_) {
+ ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment");
+ return;
+ }
+ std::shared_ptr<FaultInjectionTestFS> fault_fs(
+ new FaultInjectionTestFS(FileSystem::Default()));
+ std::unique_ptr<Env> fault_fs_env(NewCompositeEnv(fault_fs));
+ Options options = CurrentOptions();
+ options.write_buffer_size = 100;
+ options.max_write_buffer_number = 4;
+ options.min_write_buffer_number_to_merge = 3;
+ options.disable_auto_compactions = true;
+ options.env = fault_fs_env.get();
+ Reopen(options);
+
+ fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c);
+ ASSERT_OK(Put("key1", "value1"));
+ ASSERT_OK(Put("key2", "value2"));
+ ASSERT_OK(Flush());
+
+ // The option is not set, so the checksum handoff will not be triggered.
+ SyncPoint::GetInstance()->SetCallBack("FlushJob::Start", [&](void*) {
+ fault_fs->SetChecksumHandoffFuncType(ChecksumType::kxxHash);
+ });
+ ASSERT_OK(Put("key3", "value3"));
+ ASSERT_OK(Put("key4", "value4"));
+ SyncPoint::GetInstance()->EnableProcessing();
+ ASSERT_OK(Flush());
+ SyncPoint::GetInstance()->DisableProcessing();
+ Destroy(options);
+ Reopen(options);
+
+ // The file system does not support checksum handoff. The check
+ // will be ignored.
+ fault_fs->SetChecksumHandoffFuncType(ChecksumType::kNoChecksum);
+ ASSERT_OK(Put("key5", "value5"));
+ ASSERT_OK(Put("key6", "value6"));
+ ASSERT_OK(Flush());
+
+ // The option is not set, so the checksum handoff will not be triggered.
+ fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c);
+ SyncPoint::GetInstance()->SetCallBack("FlushJob::Start", [&](void*) {
+ fault_fs->IngestDataCorruptionBeforeWrite();
+ });
+ ASSERT_OK(Put("key7", "value7"));
+ ASSERT_OK(Put("key8", "value8"));
+ SyncPoint::GetInstance()->EnableProcessing();
+ ASSERT_OK(Flush());
+ SyncPoint::GetInstance()->DisableProcessing();
+
+ Destroy(options);
+}
+
+TEST_F(DBFlushTest, FlushWithChecksumHandoffManifest1) {
+ if (mem_env_ || encrypted_env_) {
+ ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment");
+ return;
+ }
+ std::shared_ptr<FaultInjectionTestFS> fault_fs(
+ new FaultInjectionTestFS(FileSystem::Default()));
+ std::unique_ptr<Env> fault_fs_env(NewCompositeEnv(fault_fs));
+ Options options = CurrentOptions();
+ options.write_buffer_size = 100;
+ options.max_write_buffer_number = 4;
+ options.min_write_buffer_number_to_merge = 3;
+ options.disable_auto_compactions = true;
+ options.env = fault_fs_env.get();
+ options.checksum_handoff_file_types.Add(FileType::kDescriptorFile);
+ fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c);
+ Reopen(options);
+
+ ASSERT_OK(Put("key1", "value1"));
+ ASSERT_OK(Put("key2", "value2"));
+ ASSERT_OK(Flush());
+
+ // The hash does not match, so the write fails.
+ // fault_fs->SetChecksumHandoffFuncType(ChecksumType::kxxHash);
+ // Since the file system returns IOStatus::Corruption, it is mapped to
+ // kFatalError error.
+ ASSERT_OK(Put("key3", "value3"));
+ SyncPoint::GetInstance()->SetCallBack(
+ "VersionSet::LogAndApply:WriteManifest", [&](void*) {
+ fault_fs->SetChecksumHandoffFuncType(ChecksumType::kxxHash);
+ });
+ ASSERT_OK(Put("key3", "value3"));
+ ASSERT_OK(Put("key4", "value4"));
+ SyncPoint::GetInstance()->EnableProcessing();
+ Status s = Flush();
+ ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kFatalError);
+ SyncPoint::GetInstance()->DisableProcessing();
+ Destroy(options);
+}
+
+TEST_F(DBFlushTest, FlushWithChecksumHandoffManifest2) {
+ if (mem_env_ || encrypted_env_) {
+ ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment");
+ return;
+ }
+ std::shared_ptr<FaultInjectionTestFS> fault_fs(
+ new FaultInjectionTestFS(FileSystem::Default()));
+ std::unique_ptr<Env> fault_fs_env(NewCompositeEnv(fault_fs));
+ Options options = CurrentOptions();
+ options.write_buffer_size = 100;
+ options.max_write_buffer_number = 4;
+ options.min_write_buffer_number_to_merge = 3;
+ options.disable_auto_compactions = true;
+ options.env = fault_fs_env.get();
+ options.checksum_handoff_file_types.Add(FileType::kDescriptorFile);
+ fault_fs->SetChecksumHandoffFuncType(ChecksumType::kNoChecksum);
+ Reopen(options);
+ // The file system does not support checksum handoff. The check
+ // will be ignored.
+ ASSERT_OK(Put("key5", "value5"));
+ ASSERT_OK(Put("key6", "value6"));
+ ASSERT_OK(Flush());
+
+ // Each write will be simulated as corrupted.
+ // Since the file system returns IOStatus::Corruption, it is mapped to
+ // kFatalError error.
+ fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c);
+ SyncPoint::GetInstance()->SetCallBack(
+ "VersionSet::LogAndApply:WriteManifest",
+ [&](void*) { fault_fs->IngestDataCorruptionBeforeWrite(); });
+ ASSERT_OK(Put("key7", "value7"));
+ ASSERT_OK(Put("key8", "value8"));
+ SyncPoint::GetInstance()->EnableProcessing();
+ Status s = Flush();
+ ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kFatalError);
+ SyncPoint::GetInstance()->DisableProcessing();
+
+ Destroy(options);
+}
+
+TEST_F(DBFlushTest, PickRightMemtables) {
+ Options options = CurrentOptions();
+ DestroyAndReopen(options);
+ options.create_if_missing = true;
+
+ const std::string test_cf_name = "test_cf";
+ options.max_write_buffer_number = 128;
+ CreateColumnFamilies({test_cf_name}, options);
+
+ Close();
+
+ ReopenWithColumnFamilies({kDefaultColumnFamilyName, test_cf_name}, options);
+
+ ASSERT_OK(db_->Put(WriteOptions(), "key", "value"));
+
+ ASSERT_OK(db_->Put(WriteOptions(), handles_[1], "key", "value"));
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
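+ // During SyncClosedLogs, write another key to the second CF and seal a
+ // new memtable; the flush job must still pick only the first memtable
+ // (ID 1), which the second callback verifies.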
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::SyncClosedLogs:BeforeReLock", [&](void* /*arg*/) {
+ ASSERT_OK(db_->Put(WriteOptions(), handles_[1], "what", "v"));
+ auto* cfhi =
+ static_cast_with_check<ColumnFamilyHandleImpl>(handles_[1]);
+ assert(cfhi);
+ ASSERT_OK(dbfull()->TEST_SwitchMemtable(cfhi->cfd()));
+ });
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::FlushMemTableToOutputFile:AfterPickMemtables", [&](void* arg) {
+ auto* job = reinterpret_cast<FlushJob*>(arg);
+ assert(job);
+ const auto& mems = job->GetMemTables();
+ assert(mems.size() == 1);
+ assert(mems[0]);
+ ASSERT_EQ(1, mems[0]->GetID());
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(db_->Flush(FlushOptions(), handles_[1]));
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+class DBFlushTestBlobError : public DBFlushTest,
+ public testing::WithParamInterface<std::string> {
+ public:
+ DBFlushTestBlobError() : sync_point_(GetParam()) {}
+
+ std::string sync_point_;
+};
+
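+// The test parameter is the sync point at which an IO error is injected:
+// either while adding a blob record or while appending the blob file footer.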
+INSTANTIATE_TEST_CASE_P(DBFlushTestBlobError, DBFlushTestBlobError,
+ ::testing::ValuesIn(std::vector<std::string>{
+ "BlobFileBuilder::WriteBlobToFile:AddRecord",
+ "BlobFileBuilder::WriteBlobToFile:AppendFooter"}));
+
+TEST_P(DBFlushTestBlobError, FlushError) {
+ Options options;
+ options.enable_blob_files = true;
+ options.disable_auto_compactions = true;
+ options.env = env_;
+
+ Reopen(options);
+
+ ASSERT_OK(Put("key", "blob"));
+
+ SyncPoint::GetInstance()->SetCallBack(sync_point_, [this](void* arg) {
+ Status* const s = static_cast<Status*>(arg);
+ assert(s);
+
+ (*s) = Status::IOError(sync_point_);
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_NOK(Flush());
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+
+ VersionSet* const versions = dbfull()->GetVersionSet();
+ assert(versions);
+
+ ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault();
+ assert(cfd);
+
+ Version* const current = cfd->current();
+ assert(current);
+
+ const VersionStorageInfo* const storage_info = current->storage_info();
+ assert(storage_info);
+
+ const auto& l0_files = storage_info->LevelFiles(0);
+ ASSERT_TRUE(l0_files.empty());
+
+ const auto& blob_files = storage_info->GetBlobFiles();
+ ASSERT_TRUE(blob_files.empty());
+
+ // Make sure the files generated by the failed job have been deleted
+ std::vector<std::string> files;
+ ASSERT_OK(env_->GetChildren(dbname_, &files));
+ for (const auto& file : files) {
+ uint64_t number = 0;
+ FileType type = kTableFile;
+
+ if (!ParseFileName(file, &number, &type)) {
+ continue;
+ }
+
+ ASSERT_NE(type, kTableFile);
+ ASSERT_NE(type, kBlobFile);
+ }
+
+#ifndef ROCKSDB_LITE
+ const InternalStats* const internal_stats = cfd->internal_stats();
+ assert(internal_stats);
+
+ const auto& compaction_stats = internal_stats->TEST_GetCompactionStats();
+ ASSERT_FALSE(compaction_stats.empty());
+
+ if (sync_point_ == "BlobFileBuilder::WriteBlobToFile:AddRecord") {
+ ASSERT_EQ(compaction_stats[0].bytes_written, 0);
+ ASSERT_EQ(compaction_stats[0].bytes_written_blob, 0);
+ ASSERT_EQ(compaction_stats[0].num_output_files, 0);
+ ASSERT_EQ(compaction_stats[0].num_output_files_blob, 0);
+ } else {
+ // SST file writing succeeded; blob file writing failed (during Finish)
+ ASSERT_GT(compaction_stats[0].bytes_written, 0);
+ ASSERT_EQ(compaction_stats[0].bytes_written_blob, 0);
+ ASSERT_EQ(compaction_stats[0].num_output_files, 1);
+ ASSERT_EQ(compaction_stats[0].num_output_files_blob, 0);
+ }
+
+ const uint64_t* const cf_stats_value = internal_stats->TEST_GetCFStatsValue();
+ ASSERT_EQ(cf_stats_value[InternalStats::BYTES_FLUSHED],
+ compaction_stats[0].bytes_written +
+ compaction_stats[0].bytes_written_blob);
+#endif // ROCKSDB_LITE
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBFlushTest, TombstoneVisibleInSnapshot) {
+ class SimpleTestFlushListener : public EventListener {
+ public:
+ explicit SimpleTestFlushListener(DBFlushTest* _test) : test_(_test) {}
+ ~SimpleTestFlushListener() override {}
+
+ void OnFlushBegin(DB* db, const FlushJobInfo& info) override {
+ ASSERT_EQ(static_cast<uint32_t>(0), info.cf_id);
+
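+ // Write a tombstone for "foo", take a snapshot that should see the
+ // tombstone, overwrite "foo", and switch the memtable so these updates
+ // end up in an immutable memtable while the flush is running.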
+ ASSERT_OK(db->Delete(WriteOptions(), "foo"));
+ snapshot_ = db->GetSnapshot();
+ ASSERT_OK(db->Put(WriteOptions(), "foo", "value"));
+
+ auto* dbimpl = static_cast_with_check<DBImpl>(db);
+ assert(dbimpl);
+
+ ColumnFamilyHandle* cfh = db->DefaultColumnFamily();
+ auto* cfhi = static_cast_with_check<ColumnFamilyHandleImpl>(cfh);
+ assert(cfhi);
+ ASSERT_OK(dbimpl->TEST_SwitchMemtable(cfhi->cfd()));
+ }
+
+ DBFlushTest* test_ = nullptr;
+ const Snapshot* snapshot_ = nullptr;
+ };
+
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ auto* listener = new SimpleTestFlushListener(this);
+ options.listeners.emplace_back(listener);
+ DestroyAndReopen(options);
+
+ ASSERT_OK(db_->Put(WriteOptions(), "foo", "value0"));
+
+ ManagedSnapshot snapshot_guard(db_);
+
+ ColumnFamilyHandle* default_cf = db_->DefaultColumnFamily();
+ ASSERT_OK(db_->Flush(FlushOptions(), default_cf));
+
+ const Snapshot* snapshot = listener->snapshot_;
+ assert(snapshot);
+
+ ReadOptions read_opts;
+ read_opts.snapshot = snapshot;
+
+ // Reading with the snapshot should not see "foo".
+ {
+ std::string value;
+ Status s = db_->Get(read_opts, "foo", &value);
+ ASSERT_TRUE(s.IsNotFound());
+ }
+
+ db_->ReleaseSnapshot(snapshot);
+}
+
+TEST_P(DBAtomicFlushTest, ManualFlushUnder2PC) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.allow_2pc = true;
+ options.atomic_flush = GetParam();
+ // 64MB so that memtable flush won't be triggered by the small writes.
+ options.write_buffer_size = (static_cast<size_t>(64) << 20);
+
+ // Destroy the DB to recreate as a TransactionDB.
+ Close();
+ Destroy(options, true);
+
+ // Create a TransactionDB.
+ TransactionDB* txn_db = nullptr;
+ TransactionDBOptions txn_db_opts;
+ txn_db_opts.write_policy = TxnDBWritePolicy::WRITE_COMMITTED;
+ ASSERT_OK(TransactionDB::Open(options, txn_db_opts, dbname_, &txn_db));
+ ASSERT_NE(txn_db, nullptr);
+ db_ = txn_db;
+
+ // Create two more column families in addition to the default CF.
+ std::vector<std::string> cfs = {"puppy", "kitty"};
+ CreateColumnFamilies(cfs, options);
+ ASSERT_EQ(handles_.size(), 2);
+ ASSERT_EQ(handles_[0]->GetName(), cfs[0]);
+ ASSERT_EQ(handles_[1]->GetName(), cfs[1]);
+ const size_t kNumCfToFlush = options.atomic_flush ? 2 : 1;
+
+ WriteOptions wopts;
+ TransactionOptions txn_opts;
+ // txn1 only prepares, but does not commit.
+ // The WAL containing the prepared but uncommitted data must be kept.
+ Transaction* txn1 = txn_db->BeginTransaction(wopts, txn_opts, nullptr);
+ // txn2 not only prepares, but also commits.
+ Transaction* txn2 = txn_db->BeginTransaction(wopts, txn_opts, nullptr);
+ ASSERT_NE(txn1, nullptr);
+ ASSERT_NE(txn2, nullptr);
+ for (size_t i = 0; i < kNumCfToFlush; i++) {
+ ASSERT_OK(txn1->Put(handles_[i], "k1", "v1"));
+ ASSERT_OK(txn2->Put(handles_[i], "k2", "v2"));
+ }
+ // A txn must be named before prepare.
+ ASSERT_OK(txn1->SetName("txn1"));
+ ASSERT_OK(txn2->SetName("txn2"));
+ // Prepare writes to WAL, but not to memtable. (WriteCommitted)
+ ASSERT_OK(txn1->Prepare());
+ ASSERT_OK(txn2->Prepare());
+ // Commit writes to memtable.
+ ASSERT_OK(txn2->Commit());
+ delete txn1;
+ delete txn2;
+
+ // There is still unflushed data in the memtables.
+ // But since the data is small enough to reside in the active memtable,
+ // there are no immutable memtables.
+ for (size_t i = 0; i < kNumCfToFlush; i++) {
+ auto cfh = static_cast<ColumnFamilyHandleImpl*>(handles_[i]);
+ ASSERT_EQ(0, cfh->cfd()->imm()->NumNotFlushed());
+ ASSERT_FALSE(cfh->cfd()->mem()->IsEmpty());
+ }
+
+ // Flush the memtables (atomically when atomic_flush is enabled);
+ // the min log with prepared data should be written to the MANIFEST.
+ std::vector<ColumnFamilyHandle*> cfs_to_flush(kNumCfToFlush);
+ for (size_t i = 0; i < kNumCfToFlush; i++) {
+ cfs_to_flush[i] = handles_[i];
+ }
+ ASSERT_OK(txn_db->Flush(FlushOptions(), cfs_to_flush));
+
+ // There is no remaining data in the memtables after the flush.
+ for (size_t i = 0; i < kNumCfToFlush; i++) {
+ auto cfh = static_cast<ColumnFamilyHandleImpl*>(handles_[i]);
+ ASSERT_EQ(0, cfh->cfd()->imm()->NumNotFlushed());
+ ASSERT_TRUE(cfh->cfd()->mem()->IsEmpty());
+ ASSERT_EQ(cfh->cfd()->GetFlushReason(), FlushReason::kManualFlush);
+ }
+
+ // The recovered min log number with prepared data should be non-zero.
+ // In 2pc mode, MinLogNumberToKeep returns the
+ // VersionSet::min_log_number_to_keep recovered from the MANIFEST; if it is
+ // 0, it means the flush didn't write min_log_number_to_keep to the MANIFEST.
+ cfs.push_back(kDefaultColumnFamilyName);
+ ASSERT_OK(TryReopenWithColumnFamilies(cfs, options));
+ DBImpl* db_impl = reinterpret_cast<DBImpl*>(db_);
+ ASSERT_TRUE(db_impl->allow_2pc());
+ ASSERT_NE(db_impl->MinLogNumberToKeep(), 0);
+}
+#endif // ROCKSDB_LITE
+
+TEST_P(DBAtomicFlushTest, ManualAtomicFlush) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.atomic_flush = GetParam();
+ options.write_buffer_size = (static_cast<size_t>(64) << 20);
+
+ CreateAndReopenWithCF({"pikachu", "eevee"}, options);
+ size_t num_cfs = handles_.size();
+ ASSERT_EQ(3, num_cfs);
+ WriteOptions wopts;
+ wopts.disableWAL = true;
+ for (size_t i = 0; i != num_cfs; ++i) {
+ ASSERT_OK(Put(static_cast<int>(i) /*cf*/, "key", "value", wopts));
+ }
+
+ for (size_t i = 0; i != num_cfs; ++i) {
+ auto cfh = static_cast<ColumnFamilyHandleImpl*>(handles_[i]);
+ ASSERT_EQ(0, cfh->cfd()->imm()->NumNotFlushed());
+ ASSERT_FALSE(cfh->cfd()->mem()->IsEmpty());
+ }
+
+ std::vector<int> cf_ids;
+ for (size_t i = 0; i != num_cfs; ++i) {
+ cf_ids.emplace_back(static_cast<int>(i));
+ }
+ ASSERT_OK(Flush(cf_ids));
+
+ for (size_t i = 0; i != num_cfs; ++i) {
+ auto cfh = static_cast<ColumnFamilyHandleImpl*>(handles_[i]);
+ ASSERT_EQ(cfh->cfd()->GetFlushReason(), FlushReason::kManualFlush);
+ ASSERT_EQ(0, cfh->cfd()->imm()->NumNotFlushed());
+ ASSERT_TRUE(cfh->cfd()->mem()->IsEmpty());
+ }
+}
+
+TEST_P(DBAtomicFlushTest, PrecomputeMinLogNumberToKeepNon2PC) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.atomic_flush = GetParam();
+ options.write_buffer_size = (static_cast<size_t>(64) << 20);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ const size_t num_cfs = handles_.size();
+ ASSERT_EQ(num_cfs, 2);
+ WriteOptions wopts;
+ for (size_t i = 0; i != num_cfs; ++i) {
+ ASSERT_OK(Put(static_cast<int>(i) /*cf*/, "key", "value", wopts));
+ }
+
+ {
+ // Flush the default CF only.
+ std::vector<int> cf_ids{0};
+ ASSERT_OK(Flush(cf_ids));
+
+ autovector<ColumnFamilyData*> flushed_cfds;
+ autovector<autovector<VersionEdit*>> flush_edits;
+ auto flushed_cfh = static_cast<ColumnFamilyHandleImpl*>(handles_[0]);
+ flushed_cfds.push_back(flushed_cfh->cfd());
+ flush_edits.push_back({});
+ auto unflushed_cfh = static_cast<ColumnFamilyHandleImpl*>(handles_[1]);
+
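+ // With only the default CF flushed, the earliest log that must be kept
+ // is the one still referenced by the unflushed CF.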
+ ASSERT_EQ(PrecomputeMinLogNumberToKeepNon2PC(dbfull()->GetVersionSet(),
+ flushed_cfds, flush_edits),
+ unflushed_cfh->cfd()->GetLogNumber());
+ }
+
+ {
+ // Flush all CFs.
+ std::vector<int> cf_ids;
+ for (size_t i = 0; i != num_cfs; ++i) {
+ cf_ids.emplace_back(static_cast<int>(i));
+ }
+ ASSERT_OK(Flush(cf_ids));
+ uint64_t log_num_after_flush = dbfull()->TEST_GetCurrentLogNumber();
+
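+ // Once every CF has been flushed, no CF references an older log anymore,
+ // so the computed min log number should equal the current log number.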
+ uint64_t min_log_number_to_keep = std::numeric_limits<uint64_t>::max();
+ autovector<ColumnFamilyData*> flushed_cfds;
+ autovector<autovector<VersionEdit*>> flush_edits;
+ for (size_t i = 0; i != num_cfs; ++i) {
+ auto cfh = static_cast<ColumnFamilyHandleImpl*>(handles_[i]);
+ flushed_cfds.push_back(cfh->cfd());
+ flush_edits.push_back({});
+ min_log_number_to_keep =
+ std::min(min_log_number_to_keep, cfh->cfd()->GetLogNumber());
+ }
+ ASSERT_EQ(min_log_number_to_keep, log_num_after_flush);
+ ASSERT_EQ(PrecomputeMinLogNumberToKeepNon2PC(dbfull()->GetVersionSet(),
+ flushed_cfds, flush_edits),
+ min_log_number_to_keep);
+ }
+}
+
+TEST_P(DBAtomicFlushTest, AtomicFlushTriggeredByMemTableFull) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.atomic_flush = GetParam();
+ // 4KB so that we can easily trigger auto flush.
+ options.write_buffer_size = 4096;
+
+ SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::BackgroundCallFlush:FlushFinish:0",
+ "DBAtomicFlushTest::AtomicFlushTriggeredByMemTableFull:BeforeCheck"}});
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ CreateAndReopenWithCF({"pikachu", "eevee"}, options);
+ size_t num_cfs = handles_.size();
+ ASSERT_EQ(3, num_cfs);
+ WriteOptions wopts;
+ wopts.disableWAL = true;
+ for (size_t i = 0; i != num_cfs; ++i) {
+ ASSERT_OK(Put(static_cast<int>(i) /*cf*/, "key", "value", wopts));
+ }
+ // Keep writing to one of the column families to trigger an auto flush.
+ for (int i = 0; i != 4000; ++i) {
+ ASSERT_OK(Put(static_cast<int>(num_cfs) - 1 /*cf*/,
+ "key" + std::to_string(i), "value" + std::to_string(i),
+ wopts));
+ }
+
+ TEST_SYNC_POINT(
+ "DBAtomicFlushTest::AtomicFlushTriggeredByMemTableFull:BeforeCheck");
+ if (options.atomic_flush) {
+ for (size_t i = 0; i + 1 != num_cfs; ++i) {
+ auto cfh = static_cast<ColumnFamilyHandleImpl*>(handles_[i]);
+ ASSERT_EQ(0, cfh->cfd()->imm()->NumNotFlushed());
+ ASSERT_TRUE(cfh->cfd()->mem()->IsEmpty());
+ }
+ } else {
+ for (size_t i = 0; i + 1 != num_cfs; ++i) {
+ auto cfh = static_cast<ColumnFamilyHandleImpl*>(handles_[i]);
+ ASSERT_EQ(0, cfh->cfd()->imm()->NumNotFlushed());
+ ASSERT_FALSE(cfh->cfd()->mem()->IsEmpty());
+ }
+ }
+ SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_P(DBAtomicFlushTest, AtomicFlushRollbackSomeJobs) {
+ bool atomic_flush = GetParam();
+ if (!atomic_flush) {
+ return;
+ }
+ std::unique_ptr<FaultInjectionTestEnv> fault_injection_env(
+ new FaultInjectionTestEnv(env_));
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.atomic_flush = atomic_flush;
+ options.env = fault_injection_env.get();
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::AtomicFlushMemTablesToOutputFiles:SomeFlushJobsComplete:1",
+ "DBAtomicFlushTest::AtomicFlushRollbackSomeJobs:1"},
+ {"DBAtomicFlushTest::AtomicFlushRollbackSomeJobs:2",
+ "DBImpl::AtomicFlushMemTablesToOutputFiles:SomeFlushJobsComplete:2"}});
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ CreateAndReopenWithCF({"pikachu", "eevee"}, options);
+ size_t num_cfs = handles_.size();
+ ASSERT_EQ(3, num_cfs);
+ WriteOptions wopts;
+ wopts.disableWAL = true;
+ for (size_t i = 0; i != num_cfs; ++i) {
+ int cf_id = static_cast<int>(i);
+ ASSERT_OK(Put(cf_id, "key", "value", wopts));
+ }
+ FlushOptions flush_opts;
+ flush_opts.wait = false;
+ ASSERT_OK(dbfull()->Flush(flush_opts, handles_));
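+ // Let some flush jobs complete, then deactivate the filesystem so the
+ // remaining jobs fail; the completed jobs must be rolled back, leaving
+ // one unflushed immutable memtable per CF.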
+ TEST_SYNC_POINT("DBAtomicFlushTest::AtomicFlushRollbackSomeJobs:1");
+ fault_injection_env->SetFilesystemActive(false);
+ TEST_SYNC_POINT("DBAtomicFlushTest::AtomicFlushRollbackSomeJobs:2");
+ for (auto* cfh : handles_) {
+ // Returns the IO error that happened during the flush.
+ ASSERT_NOK(dbfull()->TEST_WaitForFlushMemTable(cfh));
+ }
+ for (size_t i = 0; i != num_cfs; ++i) {
+ auto cfh = static_cast<ColumnFamilyHandleImpl*>(handles_[i]);
+ ASSERT_EQ(1, cfh->cfd()->imm()->NumNotFlushed());
+ ASSERT_TRUE(cfh->cfd()->mem()->IsEmpty());
+ }
+ fault_injection_env->SetFilesystemActive(true);
+ Destroy(options);
+}
+
+TEST_P(DBAtomicFlushTest, FlushMultipleCFs_DropSomeBeforeRequestFlush) {
+ bool atomic_flush = GetParam();
+ if (!atomic_flush) {
+ return;
+ }
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.atomic_flush = atomic_flush;
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ CreateAndReopenWithCF({"pikachu", "eevee"}, options);
+ size_t num_cfs = handles_.size();
+ ASSERT_EQ(3, num_cfs);
+ WriteOptions wopts;
+ wopts.disableWAL = true;
+ std::vector<int> cf_ids;
+ for (size_t i = 0; i != num_cfs; ++i) {
+ int cf_id = static_cast<int>(i);
+ ASSERT_OK(Put(cf_id, "key", "value", wopts));
+ cf_ids.push_back(cf_id);
+ }
+ ASSERT_OK(dbfull()->DropColumnFamily(handles_[1]));
+ ASSERT_TRUE(Flush(cf_ids).IsColumnFamilyDropped());
+ Destroy(options);
+}
+
+TEST_P(DBAtomicFlushTest,
+ FlushMultipleCFs_DropSomeAfterScheduleFlushBeforeFlushJobRun) {
+ bool atomic_flush = GetParam();
+ if (!atomic_flush) {
+ return;
+ }
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.atomic_flush = atomic_flush;
+
+ CreateAndReopenWithCF({"pikachu", "eevee"}, options);
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::AtomicFlushMemTables:AfterScheduleFlush",
+ "DBAtomicFlushTest::BeforeDropCF"},
+ {"DBAtomicFlushTest::AfterDropCF",
+ "DBImpl::BackgroundCallFlush:start"}});
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ size_t num_cfs = handles_.size();
+ ASSERT_EQ(3, num_cfs);
+ WriteOptions wopts;
+ wopts.disableWAL = true;
+ for (size_t i = 0; i != num_cfs; ++i) {
+ int cf_id = static_cast<int>(i);
+ ASSERT_OK(Put(cf_id, "key", "value", wopts));
+ }
+ port::Thread user_thread([&]() {
+ TEST_SYNC_POINT("DBAtomicFlushTest::BeforeDropCF");
+ ASSERT_OK(dbfull()->DropColumnFamily(handles_[1]));
+ TEST_SYNC_POINT("DBAtomicFlushTest::AfterDropCF");
+ });
+ FlushOptions flush_opts;
+ flush_opts.wait = true;
+ ASSERT_OK(dbfull()->Flush(flush_opts, handles_));
+ user_thread.join();
+ for (size_t i = 0; i != num_cfs; ++i) {
+ int cf_id = static_cast<int>(i);
+ ASSERT_EQ("value", Get(cf_id, "key"));
+ }
+
+ ReopenWithColumnFamilies({kDefaultColumnFamilyName, "eevee"}, options);
+ num_cfs = handles_.size();
+ ASSERT_EQ(2, num_cfs);
+ for (size_t i = 0; i != num_cfs; ++i) {
+ int cf_id = static_cast<int>(i);
+ ASSERT_EQ("value", Get(cf_id, "key"));
+ }
+ Destroy(options);
+}
+
+TEST_P(DBAtomicFlushTest, TriggerFlushAndClose) {
+ bool atomic_flush = GetParam();
+ if (!atomic_flush) {
+ return;
+ }
+ const int kNumKeysTriggerFlush = 4;
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.atomic_flush = atomic_flush;
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(kNumKeysTriggerFlush));
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ for (int i = 0; i != kNumKeysTriggerFlush; ++i) {
+ ASSERT_OK(Put(0, "key" + std::to_string(i), "value" + std::to_string(i)));
+ }
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->EnableProcessing();
+ ASSERT_OK(Put(0, "key", "value"));
+ Close();
+
+ ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"}, options);
+ ASSERT_EQ("value", Get(0, "key"));
+}
+
+TEST_P(DBAtomicFlushTest, PickMemtablesRaceWithBackgroundFlush) {
+ bool atomic_flush = GetParam();
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.atomic_flush = atomic_flush;
+ options.max_write_buffer_number = 4;
+ // Set min_write_buffer_number_to_merge to be greater than 1, so that
+ // a column family with one memtable in the imm will not cause IsFlushPending
+ // to return true when flush_requested_ is false.
+ options.min_write_buffer_number_to_merge = 2;
+ CreateAndReopenWithCF({"pikachu"}, options);
+ ASSERT_EQ(2, handles_.size());
+ ASSERT_OK(dbfull()->PauseBackgroundWork());
+ ASSERT_OK(Put(0, "key00", "value00"));
+ ASSERT_OK(Put(1, "key10", "value10"));
+ FlushOptions flush_opts;
+ flush_opts.wait = false;
+ ASSERT_OK(dbfull()->Flush(flush_opts, handles_));
+ ASSERT_OK(Put(0, "key01", "value01"));
+ // Since max_write_buffer_number is 4, the following flush won't cause write
+ // stall.
+ ASSERT_OK(dbfull()->Flush(flush_opts));
+ ASSERT_OK(dbfull()->DropColumnFamily(handles_[1]));
+ ASSERT_OK(dbfull()->DestroyColumnFamilyHandle(handles_[1]));
+ handles_[1] = nullptr;
+ ASSERT_OK(dbfull()->ContinueBackgroundWork());
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[0]));
+ delete handles_[0];
+ handles_.clear();
+}
+
+TEST_P(DBAtomicFlushTest, CFDropRaceWithWaitForFlushMemTables) {
+ bool atomic_flush = GetParam();
+ if (!atomic_flush) {
+ return;
+ }
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.atomic_flush = atomic_flush;
+ CreateAndReopenWithCF({"pikachu"}, options);
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::AtomicFlushMemTables:AfterScheduleFlush",
+ "DBAtomicFlushTest::CFDropRaceWithWaitForFlushMemTables:BeforeDrop"},
+ {"DBAtomicFlushTest::CFDropRaceWithWaitForFlushMemTables:AfterFree",
+ "DBImpl::BackgroundCallFlush:start"},
+ {"DBImpl::BackgroundCallFlush:start",
+ "DBImpl::AtomicFlushMemTables:BeforeWaitForBgFlush"}});
+ SyncPoint::GetInstance()->EnableProcessing();
+ ASSERT_EQ(2, handles_.size());
+ ASSERT_OK(Put(0, "key", "value"));
+ ASSERT_OK(Put(1, "key", "value"));
+ auto* cfd_default =
+ static_cast<ColumnFamilyHandleImpl*>(dbfull()->DefaultColumnFamily())
+ ->cfd();
+ auto* cfd_pikachu = static_cast<ColumnFamilyHandleImpl*>(handles_[1])->cfd();
+ port::Thread drop_cf_thr([&]() {
+ TEST_SYNC_POINT(
+ "DBAtomicFlushTest::CFDropRaceWithWaitForFlushMemTables:BeforeDrop");
+ ASSERT_OK(dbfull()->DropColumnFamily(handles_[1]));
+ delete handles_[1];
+ handles_.resize(1);
+ TEST_SYNC_POINT(
+ "DBAtomicFlushTest::CFDropRaceWithWaitForFlushMemTables:AfterFree");
+ });
+ FlushOptions flush_opts;
+ flush_opts.allow_write_stall = true;
+ ASSERT_OK(dbfull()->TEST_AtomicFlushMemTables({cfd_default, cfd_pikachu},
+ flush_opts));
+ drop_cf_thr.join();
+ Close();
+ SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_P(DBAtomicFlushTest, RollbackAfterFailToInstallResults) {
+ bool atomic_flush = GetParam();
+ if (!atomic_flush) {
+ return;
+ }
+ auto fault_injection_env = std::make_shared<FaultInjectionTestEnv>(env_);
+ Options options = CurrentOptions();
+ options.env = fault_injection_env.get();
+ options.create_if_missing = true;
+ options.atomic_flush = atomic_flush;
+ CreateAndReopenWithCF({"pikachu"}, options);
+ ASSERT_EQ(2, handles_.size());
+ for (size_t cf = 0; cf < handles_.size(); ++cf) {
+ ASSERT_OK(Put(static_cast<int>(cf), "a", "value"));
+ }
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->SetCallBack(
+ "VersionSet::ProcessManifestWrites:BeforeWriteLastVersionEdit:0",
+ [&](void* /*arg*/) { fault_injection_env->SetFilesystemActive(false); });
+ SyncPoint::GetInstance()->EnableProcessing();
+ FlushOptions flush_opts;
+ Status s = db_->Flush(flush_opts, handles_);
+ ASSERT_NOK(s);
+ fault_injection_env->SetFilesystemActive(true);
+ Close();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+// In atomic flush, concurrent bg flush threads commit to the MANIFEST
+// serially, in the order of their picked memtables for each column family.
+// Only when a bg flush thread finds out that its memtables are the earliest
+// unflushed ones for all the included column families will this bg flush
+// thread continue to commit to MANIFEST.
+// This unit test uses sync points to coordinate the execution of two bg
+// threads executing the same sequence of functions. The interleaving is as
+// follows.
+// time   bg1                                bg2
+//  |     pick memtables to flush
+//  |     flush memtables cf1_m1, cf2_m1
+//  |     join MANIFEST write queue
+//  |                                        pick memtables to flush
+//  |                                        flush memtables cf1_(m1+1)
+//  |                                        join MANIFEST write queue
+//  |                                        wait to write MANIFEST
+//  |     write MANIFEST
+//  |     IO error
+//  |                                        detect IO error and stop waiting
+//  V
+TEST_P(DBAtomicFlushTest, BgThreadNoWaitAfterManifestError) {
+ bool atomic_flush = GetParam();
+ if (!atomic_flush) {
+ return;
+ }
+ auto fault_injection_env = std::make_shared<FaultInjectionTestEnv>(env_);
+ Options options = GetDefaultOptions();
+ options.create_if_missing = true;
+ options.atomic_flush = true;
+ options.env = fault_injection_env.get();
+ // Set a larger value than default so that RocksDB can schedule concurrent
+ // background flush threads.
+ options.max_background_jobs = 8;
+ options.max_write_buffer_number = 8;
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ assert(2 == handles_.size());
+
+ WriteOptions write_opts;
+ write_opts.disableWAL = true;
+
+ ASSERT_OK(Put(0, "a", "v_0_a", write_opts));
+ ASSERT_OK(Put(1, "a", "v_1_a", write_opts));
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+
+ SyncPoint::GetInstance()->LoadDependency({
+ {"BgFlushThr2:WaitToCommit", "BgFlushThr1:BeforeWriteManifest"},
+ });
+
+ std::thread::id bg_flush_thr1, bg_flush_thr2;
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCallFlush:start", [&](void*) {
+ if (bg_flush_thr1 == std::thread::id()) {
+ bg_flush_thr1 = std::this_thread::get_id();
+ } else if (bg_flush_thr2 == std::thread::id()) {
+ bg_flush_thr2 = std::this_thread::get_id();
+ }
+ });
+
+ int called = 0;
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::AtomicFlushMemTablesToOutputFiles:WaitToCommit", [&](void* arg) {
+ if (std::this_thread::get_id() == bg_flush_thr2) {
+ const auto* ptr = reinterpret_cast<std::pair<Status, bool>*>(arg);
+ assert(ptr);
+ if (0 == called) {
+ // When bg flush thread 2 reaches here for the first time.
+ ASSERT_OK(ptr->first);
+ ASSERT_TRUE(ptr->second);
+ } else if (1 == called) {
+ // When bg flush thread 2 reaches here for the second time.
+ ASSERT_TRUE(ptr->first.IsIOError());
+ ASSERT_FALSE(ptr->second);
+ }
+ ++called;
+ TEST_SYNC_POINT("BgFlushThr2:WaitToCommit");
+ }
+ });
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "VersionSet::ProcessManifestWrites:BeforeWriteLastVersionEdit:0",
+ [&](void*) {
+ if (std::this_thread::get_id() == bg_flush_thr1) {
+ TEST_SYNC_POINT("BgFlushThr1:BeforeWriteManifest");
+ }
+ });
+
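+ // While bg flush thread 1 is writing the MANIFEST, schedule another
+ // flush of the default CF so that bg flush thread 2 joins the MANIFEST
+ // write queue behind it.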
+ SyncPoint::GetInstance()->SetCallBack(
+ "VersionSet::LogAndApply:WriteManifest", [&](void*) {
+ if (std::this_thread::get_id() != bg_flush_thr1) {
+ return;
+ }
+ ASSERT_OK(db_->Put(write_opts, "b", "v_1_b"));
+
+ FlushOptions flush_opts;
+ flush_opts.wait = false;
+ std::vector<ColumnFamilyHandle*> cfhs(1, db_->DefaultColumnFamily());
+ ASSERT_OK(dbfull()->Flush(flush_opts, cfhs));
+ });
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "VersionSet::ProcessManifestWrites:AfterSyncManifest", [&](void* arg) {
+ auto* ptr = reinterpret_cast<IOStatus*>(arg);
+ assert(ptr);
+ *ptr = IOStatus::IOError("Injected failure");
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_TRUE(dbfull()->Flush(FlushOptions(), handles_).IsIOError());
+
+ Close();
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_P(DBAtomicFlushTest, NoWaitWhenWritesStopped) {
+ Options options = GetDefaultOptions();
+ options.create_if_missing = true;
+ options.atomic_flush = GetParam();
+ options.max_write_buffer_number = 2;
+ options.memtable_factory.reset(test::NewSpecialSkipListFactory(1));
+
+ Reopen(options);
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::DelayWrite:Start",
+ "DBAtomicFlushTest::NoWaitWhenWritesStopped:0"}});
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(dbfull()->PauseBackgroundWork());
+ for (int i = 0; i < options.max_write_buffer_number; ++i) {
+ ASSERT_OK(Put("k" + std::to_string(i), "v" + std::to_string(i)));
+ }
+ std::thread stalled_writer([&]() { ASSERT_OK(Put("k", "v")); });
+
+ TEST_SYNC_POINT("DBAtomicFlushTest::NoWaitWhenWritesStopped:0");
+
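+ // With writes stalled and background work paused, a non-waiting flush
+ // that allows write stall should return TryAgain instead of blocking.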
+ {
+ FlushOptions flush_opts;
+ flush_opts.wait = false;
+ flush_opts.allow_write_stall = true;
+ ASSERT_TRUE(db_->Flush(flush_opts).IsTryAgain());
+ }
+
+ ASSERT_OK(dbfull()->ContinueBackgroundWork());
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+
+ stalled_writer.join();
+
+ SyncPoint::GetInstance()->DisableProcessing();
+}
+
+INSTANTIATE_TEST_CASE_P(DBFlushDirectIOTest, DBFlushDirectIOTest,
+ testing::Bool());
+
+INSTANTIATE_TEST_CASE_P(DBAtomicFlushTest, DBAtomicFlushTest, testing::Bool());
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_impl/compacted_db_impl.cc b/src/rocksdb/db/db_impl/compacted_db_impl.cc
new file mode 100644
index 000000000..f18ee0d72
--- /dev/null
+++ b/src/rocksdb/db/db_impl/compacted_db_impl.cc
@@ -0,0 +1,257 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+#include "db/db_impl/compacted_db_impl.h"
+
+#include "db/db_impl/db_impl.h"
+#include "db/version_set.h"
+#include "logging/logging.h"
+#include "table/get_context.h"
+#include "util/cast_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+extern void MarkKeyMayExist(void* arg);
+extern bool SaveValue(void* arg, const ParsedInternalKey& parsed_key,
+ const Slice& v, bool hit_and_return);
+
+CompactedDBImpl::CompactedDBImpl(const DBOptions& options,
+ const std::string& dbname)
+ : DBImpl(options, dbname, /*seq_per_batch*/ false, /*batch_per_txn*/ true,
+ /*read_only*/ true),
+ cfd_(nullptr),
+ version_(nullptr),
+ user_comparator_(nullptr) {}
+
+CompactedDBImpl::~CompactedDBImpl() {}
+
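+// Binary-search the sorted file list for the first file whose largest key
+// is not less than the lookup key.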
+size_t CompactedDBImpl::FindFile(const Slice& key) {
+ size_t right = files_.num_files - 1;
+ auto cmp = [&](const FdWithKeyRange& f, const Slice& k) -> bool {
+ return user_comparator_->Compare(ExtractUserKey(f.largest_key), k) < 0;
+ };
+ return static_cast<size_t>(
+ std::lower_bound(files_.files, files_.files + right, key, cmp) -
+ files_.files);
+}
+
+Status CompactedDBImpl::Get(const ReadOptions& options, ColumnFamilyHandle*,
+ const Slice& key, PinnableSlice* value) {
+ return Get(options, /*column_family*/ nullptr, key, value,
+ /*timestamp*/ nullptr);
+}
+
+Status CompactedDBImpl::Get(const ReadOptions& options, ColumnFamilyHandle*,
+ const Slice& key, PinnableSlice* value,
+ std::string* timestamp) {
+ assert(user_comparator_);
+ if (options.timestamp) {
+ const Status s = FailIfTsMismatchCf(
+ DefaultColumnFamily(), *(options.timestamp), /*ts_for_read=*/true);
+ if (!s.ok()) {
+ return s;
+ }
+ } else {
+ const Status s = FailIfCfHasTs(DefaultColumnFamily());
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ // Clear the timestamps for returning results so that we can distinguish
+ // between a tombstone and a key that has never been written.
+ if (timestamp) {
+ timestamp->clear();
+ }
+
+ GetWithTimestampReadCallback read_cb(kMaxSequenceNumber);
+ std::string* ts =
+ user_comparator_->timestamp_size() > 0 ? timestamp : nullptr;
+ LookupKey lkey(key, kMaxSequenceNumber, options.timestamp);
+ GetContext get_context(user_comparator_, nullptr, nullptr, nullptr,
+ GetContext::kNotFound, lkey.user_key(), value,
+ /*columns=*/nullptr, ts, nullptr, nullptr, true,
+ nullptr, nullptr, nullptr, nullptr, &read_cb);
+
+ const FdWithKeyRange& f = files_.files[FindFile(lkey.user_key())];
+ if (user_comparator_->CompareWithoutTimestamp(
+ key, /*a_has_ts=*/false,
+ ExtractUserKeyAndStripTimestamp(f.smallest_key,
+ user_comparator_->timestamp_size()),
+ /*b_has_ts=*/false) < 0) {
+ return Status::NotFound();
+ }
+ Status s = f.fd.table_reader->Get(options, lkey.internal_key(), &get_context,
+ nullptr);
+ if (!s.ok() && !s.IsNotFound()) {
+ return s;
+ }
+ if (get_context.State() == GetContext::kFound) {
+ return Status::OK();
+ }
+ return Status::NotFound();
+}
+
+std::vector<Status> CompactedDBImpl::MultiGet(
+ const ReadOptions& options, const std::vector<ColumnFamilyHandle*>&,
+ const std::vector<Slice>& keys, std::vector<std::string>* values) {
+ return MultiGet(options, keys, values, /*timestamps*/ nullptr);
+}
+
+std::vector<Status> CompactedDBImpl::MultiGet(
+ const ReadOptions& options, const std::vector<ColumnFamilyHandle*>&,
+ const std::vector<Slice>& keys, std::vector<std::string>* values,
+ std::vector<std::string>* timestamps) {
+ assert(user_comparator_);
+ size_t num_keys = keys.size();
+
+ if (options.timestamp) {
+ Status s = FailIfTsMismatchCf(DefaultColumnFamily(), *(options.timestamp),
+ /*ts_for_read=*/true);
+ if (!s.ok()) {
+ return std::vector<Status>(num_keys, s);
+ }
+ } else {
+ Status s = FailIfCfHasTs(DefaultColumnFamily());
+ if (!s.ok()) {
+ return std::vector<Status>(num_keys, s);
+ }
+ }
+
+ // Clear the timestamps for returning results so that we can distinguish
+ // between a tombstone and a key that has never been written
+ if (timestamps) {
+ for (auto& ts : *timestamps) {
+ ts.clear();
+ }
+ }
+
+ GetWithTimestampReadCallback read_cb(kMaxSequenceNumber);
+ autovector<TableReader*, 16> reader_list;
+ for (const auto& key : keys) {
+ LookupKey lkey(key, kMaxSequenceNumber, options.timestamp);
+ const FdWithKeyRange& f = files_.files[FindFile(lkey.user_key())];
+ if (user_comparator_->CompareWithoutTimestamp(
+ key, /*a_has_ts=*/false,
+ ExtractUserKeyAndStripTimestamp(f.smallest_key,
+ user_comparator_->timestamp_size()),
+ /*b_has_ts=*/false) < 0) {
+ reader_list.push_back(nullptr);
+ } else {
+ f.fd.table_reader->Prepare(lkey.internal_key());
+ reader_list.push_back(f.fd.table_reader);
+ }
+ }
+ std::vector<Status> statuses(num_keys, Status::NotFound());
+ values->resize(num_keys);
+ if (timestamps) {
+ timestamps->resize(num_keys);
+ }
+ int idx = 0;
+ for (auto* r : reader_list) {
+ if (r != nullptr) {
+ PinnableSlice pinnable_val;
+ std::string& value = (*values)[idx];
+ LookupKey lkey(keys[idx], kMaxSequenceNumber, options.timestamp);
+ std::string* timestamp = timestamps ? &(*timestamps)[idx] : nullptr;
+ GetContext get_context(
+ user_comparator_, nullptr, nullptr, nullptr, GetContext::kNotFound,
+ lkey.user_key(), &pinnable_val, /*columns=*/nullptr,
+ user_comparator_->timestamp_size() > 0 ? timestamp : nullptr, nullptr,
+ nullptr, true, nullptr, nullptr, nullptr, nullptr, &read_cb);
+ Status s = r->Get(options, lkey.internal_key(), &get_context, nullptr);
+ assert(static_cast<size_t>(idx) < statuses.size());
+ if (!s.ok() && !s.IsNotFound()) {
+ statuses[idx] = s;
+ } else {
+ value.assign(pinnable_val.data(), pinnable_val.size());
+ if (get_context.State() == GetContext::kFound) {
+ statuses[idx] = Status::OK();
+ }
+ }
+ }
+ ++idx;
+ }
+ return statuses;
+}
+
+Status CompactedDBImpl::Init(const Options& options) {
+ SuperVersionContext sv_context(/* create_superversion */ true);
+ mutex_.Lock();
+ ColumnFamilyDescriptor cf(kDefaultColumnFamilyName,
+ ColumnFamilyOptions(options));
+ Status s = Recover({cf}, true /* read only */, false, true);
+ if (s.ok()) {
+ cfd_ = static_cast_with_check<ColumnFamilyHandleImpl>(DefaultColumnFamily())
+ ->cfd();
+ cfd_->InstallSuperVersion(&sv_context, &mutex_);
+ }
+ mutex_.Unlock();
+ sv_context.Clean();
+ if (!s.ok()) {
+ return s;
+ }
+ NewThreadStatusCfInfo(cfd_);
+ version_ = cfd_->GetSuperVersion()->current;
+ user_comparator_ = cfd_->user_comparator();
+ auto* vstorage = version_->storage_info();
+ if (vstorage->num_non_empty_levels() == 0) {
+ return Status::NotSupported("no file exists");
+ }
+ const LevelFilesBrief& l0 = vstorage->LevelFilesBrief(0);
+ // L0 should have at most one file
+ if (l0.num_files > 1) {
+ return Status::NotSupported("L0 contain more than 1 file");
+ }
+ if (l0.num_files == 1) {
+ if (vstorage->num_non_empty_levels() > 1) {
+ return Status::NotSupported("Both L0 and other level contain files");
+ }
+ files_ = l0;
+ return Status::OK();
+ }
+
+ for (int i = 1; i < vstorage->num_non_empty_levels() - 1; ++i) {
+ if (vstorage->LevelFilesBrief(i).num_files > 0) {
+ return Status::NotSupported("Other levels also contain files");
+ }
+ }
+
+ int level = vstorage->num_non_empty_levels() - 1;
+ if (vstorage->LevelFilesBrief(level).num_files > 0) {
+ files_ = vstorage->LevelFilesBrief(level);
+ return Status::OK();
+ }
+ return Status::NotSupported("no file exists");
+}
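Init() only accepts a DB whose live data sits entirely in a single L0 file or in a single non-empty bottommost level. Below is a minimal, illustrative sketch (not part of this patch) of how an application might prepare such a DB using only the public DB::CompactRange API, so that the checks above succeed.

    #include <cassert>
    #include <string>
    #include "rocksdb/db.h"
    #include "rocksdb/options.h"

    // Illustrative only: compact everything so CompactedDBImpl::Init() will
    // accept the DB afterwards.
    void PrepareFullyCompactedDb(const std::string& path) {
      rocksdb::Options options;
      options.create_if_missing = true;
      rocksdb::DB* db = nullptr;
      rocksdb::Status s = rocksdb::DB::Open(options, path, &db);
      assert(s.ok());
      // Full-range manual compaction; afterwards all live data should reside
      // in a single (bottommost) level.
      s = db->CompactRange(rocksdb::CompactRangeOptions(), nullptr, nullptr);
      assert(s.ok());
      s = db->Close();
      assert(s.ok());
      delete db;
    }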
+
+Status CompactedDBImpl::Open(const Options& options, const std::string& dbname,
+ DB** dbptr) {
+ *dbptr = nullptr;
+
+ if (options.max_open_files != -1) {
+ return Status::InvalidArgument("require max_open_files = -1");
+ }
+ if (options.merge_operator.get() != nullptr) {
+ return Status::InvalidArgument("merge operator is not supported");
+ }
+ DBOptions db_options(options);
+ std::unique_ptr<CompactedDBImpl> db(new CompactedDBImpl(db_options, dbname));
+ Status s = db->Init(options);
+ if (s.ok()) {
+ s = db->StartPeriodicTaskScheduler();
+ }
+ if (s.ok()) {
+ ROCKS_LOG_INFO(db->immutable_db_options_.info_log,
+ "Opened the db as fully compacted mode");
+ LogFlush(db->immutable_db_options_.info_log);
+ *dbptr = db.release();
+ }
+ return s;
+}
+
+} // namespace ROCKSDB_NAMESPACE
+#endif // ROCKSDB_LITE
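CompactedDBImpl::Open() above rejects anything other than max_open_files == -1 and a DB without a merge operator before probing the LSM shape in Init(). The sketch below shows how an application could end up on this code path; to the best of my knowledge the usual entry point is DB::OpenForReadOnly(), which tries the compacted implementation first, but treat that calling chain as an assumption rather than something this diff shows.

    #include <cassert>
    #include <string>
    #include "rocksdb/db.h"

    void OpenFullyCompacted(const std::string& path) {
      rocksdb::Options options;
      options.max_open_files = -1;  // required by CompactedDBImpl::Open()
      // options.merge_operator must stay unset; merge operators are rejected.
      rocksdb::DB* db = nullptr;
      // Assumption: the read-only open path prefers the compacted
      // implementation when the DB is fully compacted.
      rocksdb::Status s = rocksdb::DB::OpenForReadOnly(options, path, &db);
      assert(s.ok());
      std::string value;
      s = db->Get(rocksdb::ReadOptions(), "some-key", &value);
      assert(s.ok() || s.IsNotFound());
      delete db;
    }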
diff --git a/src/rocksdb/db/db_impl/compacted_db_impl.h b/src/rocksdb/db/db_impl/compacted_db_impl.h
new file mode 100644
index 000000000..eb458b85d
--- /dev/null
+++ b/src/rocksdb/db/db_impl/compacted_db_impl.h
@@ -0,0 +1,154 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#ifndef ROCKSDB_LITE
+#include <string>
+#include <vector>
+
+#include "db/db_impl/db_impl.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// TODO: Share common structure with DBImplSecondary and DBImplReadOnly
+class CompactedDBImpl : public DBImpl {
+ public:
+ CompactedDBImpl(const DBOptions& options, const std::string& dbname);
+ // No copying allowed
+ CompactedDBImpl(const CompactedDBImpl&) = delete;
+ void operator=(const CompactedDBImpl&) = delete;
+
+ ~CompactedDBImpl() override;
+
+ static Status Open(const Options& options, const std::string& dbname,
+ DB** dbptr);
+
+ // Implementations of the DB interface
+ using DB::Get;
+ virtual Status Get(const ReadOptions& options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ PinnableSlice* value) override;
+
+ Status Get(const ReadOptions& options, ColumnFamilyHandle* column_family,
+ const Slice& key, PinnableSlice* value,
+ std::string* timestamp) override;
+
+ using DB::MultiGet;
+ // Note that CompactedDBImpl::MultiGet is not the optimized version of
+ // MultiGet to use.
+ // TODO: optimize CompactedDBImpl::MultiGet, see DBImpl::MultiGet for details.
+ virtual std::vector<Status> MultiGet(
+ const ReadOptions& options, const std::vector<ColumnFamilyHandle*>&,
+ const std::vector<Slice>& keys,
+ std::vector<std::string>* values) override;
+
+ std::vector<Status> MultiGet(const ReadOptions& options,
+ const std::vector<ColumnFamilyHandle*>&,
+ const std::vector<Slice>& keys,
+ std::vector<std::string>* values,
+ std::vector<std::string>* timestamps) override;
+
+ using DBImpl::Put;
+ virtual Status Put(const WriteOptions& /*options*/,
+ ColumnFamilyHandle* /*column_family*/,
+ const Slice& /*key*/, const Slice& /*value*/) override {
+ return Status::NotSupported("Not supported in compacted db mode.");
+ }
+
+ using DBImpl::PutEntity;
+ Status PutEntity(const WriteOptions& /* options */,
+ ColumnFamilyHandle* /* column_family */,
+ const Slice& /* key */,
+ const WideColumns& /* columns */) override {
+ return Status::NotSupported("Not supported in compacted db mode.");
+ }
+
+ using DBImpl::Merge;
+ virtual Status Merge(const WriteOptions& /*options*/,
+ ColumnFamilyHandle* /*column_family*/,
+ const Slice& /*key*/, const Slice& /*value*/) override {
+ return Status::NotSupported("Not supported in compacted db mode.");
+ }
+
+ using DBImpl::Delete;
+ virtual Status Delete(const WriteOptions& /*options*/,
+ ColumnFamilyHandle* /*column_family*/,
+ const Slice& /*key*/) override {
+ return Status::NotSupported("Not supported in compacted db mode.");
+ }
+ virtual Status Write(const WriteOptions& /*options*/,
+ WriteBatch* /*updates*/) override {
+ return Status::NotSupported("Not supported in compacted db mode.");
+ }
+ using DBImpl::CompactRange;
+ virtual Status CompactRange(const CompactRangeOptions& /*options*/,
+ ColumnFamilyHandle* /*column_family*/,
+ const Slice* /*begin*/,
+ const Slice* /*end*/) override {
+ return Status::NotSupported("Not supported in compacted db mode.");
+ }
+
+ virtual Status DisableFileDeletions() override {
+ return Status::NotSupported("Not supported in compacted db mode.");
+ }
+ virtual Status EnableFileDeletions(bool /*force*/) override {
+ return Status::NotSupported("Not supported in compacted db mode.");
+ }
+ virtual Status GetLiveFiles(std::vector<std::string>& ret,
+ uint64_t* manifest_file_size,
+ bool /*flush_memtable*/) override {
+ return DBImpl::GetLiveFiles(ret, manifest_file_size,
+ false /* flush_memtable */);
+ }
+ using DBImpl::Flush;
+ virtual Status Flush(const FlushOptions& /*options*/,
+ ColumnFamilyHandle* /*column_family*/) override {
+ return Status::NotSupported("Not supported in compacted db mode.");
+ }
+
+ virtual Status SyncWAL() override {
+ return Status::NotSupported("Not supported in compacted db mode.");
+ }
+
+ using DB::IngestExternalFile;
+ virtual Status IngestExternalFile(
+ ColumnFamilyHandle* /*column_family*/,
+ const std::vector<std::string>& /*external_files*/,
+ const IngestExternalFileOptions& /*ingestion_options*/) override {
+ return Status::NotSupported("Not supported in compacted db mode.");
+ }
+ using DB::CreateColumnFamilyWithImport;
+ virtual Status CreateColumnFamilyWithImport(
+ const ColumnFamilyOptions& /*options*/,
+ const std::string& /*column_family_name*/,
+ const ImportColumnFamilyOptions& /*import_options*/,
+ const ExportImportFilesMetaData& /*metadata*/,
+ ColumnFamilyHandle** /*handle*/) override {
+ return Status::NotSupported("Not supported in compacted db mode.");
+ }
+
+ // FIXME: some missing overrides for more "write" functions
+ // Share with DBImplReadOnly?
+
+ protected:
+#ifndef ROCKSDB_LITE
+ Status FlushForGetLiveFiles() override {
+ // No-op for read-only DB
+ return Status::OK();
+ }
+#endif // !ROCKSDB_LITE
+
+ private:
+ friend class DB;
+ inline size_t FindFile(const Slice& key);
+ Status Init(const Options& options);
+
+ ColumnFamilyData* cfd_;
+ Version* version_;
+ const Comparator* user_comparator_;
+ LevelFilesBrief files_;
+};
+} // namespace ROCKSDB_NAMESPACE
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/db/db_impl/db_impl.cc b/src/rocksdb/db/db_impl/db_impl.cc
new file mode 100644
index 000000000..a431111d4
--- /dev/null
+++ b/src/rocksdb/db/db_impl/db_impl.cc
@@ -0,0 +1,5918 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#include "db/db_impl/db_impl.h"
+
+#include <stdint.h>
+#ifdef OS_SOLARIS
+#include <alloca.h>
+#endif
+
+#include <algorithm>
+#include <cinttypes>
+#include <cstdio>
+#include <map>
+#include <set>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include "db/arena_wrapped_db_iter.h"
+#include "db/builder.h"
+#include "db/compaction/compaction_job.h"
+#include "db/db_info_dumper.h"
+#include "db/db_iter.h"
+#include "db/dbformat.h"
+#include "db/error_handler.h"
+#include "db/event_helpers.h"
+#include "db/external_sst_file_ingestion_job.h"
+#include "db/flush_job.h"
+#include "db/forward_iterator.h"
+#include "db/import_column_family_job.h"
+#include "db/job_context.h"
+#include "db/log_reader.h"
+#include "db/log_writer.h"
+#include "db/malloc_stats.h"
+#include "db/memtable.h"
+#include "db/memtable_list.h"
+#include "db/merge_context.h"
+#include "db/merge_helper.h"
+#include "db/periodic_task_scheduler.h"
+#include "db/range_tombstone_fragmenter.h"
+#include "db/table_cache.h"
+#include "db/table_properties_collector.h"
+#include "db/transaction_log_impl.h"
+#include "db/version_set.h"
+#include "db/write_batch_internal.h"
+#include "db/write_callback.h"
+#include "env/unique_id_gen.h"
+#include "file/file_util.h"
+#include "file/filename.h"
+#include "file/random_access_file_reader.h"
+#include "file/sst_file_manager_impl.h"
+#include "logging/auto_roll_logger.h"
+#include "logging/log_buffer.h"
+#include "logging/logging.h"
+#include "monitoring/in_memory_stats_history.h"
+#include "monitoring/instrumented_mutex.h"
+#include "monitoring/iostats_context_imp.h"
+#include "monitoring/perf_context_imp.h"
+#include "monitoring/persistent_stats_history.h"
+#include "monitoring/thread_status_updater.h"
+#include "monitoring/thread_status_util.h"
+#include "options/cf_options.h"
+#include "options/options_helper.h"
+#include "options/options_parser.h"
+#include "port/port.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/stats_history.h"
+#include "rocksdb/status.h"
+#include "rocksdb/table.h"
+#include "rocksdb/version.h"
+#include "rocksdb/write_buffer_manager.h"
+#include "table/block_based/block.h"
+#include "table/block_based/block_based_table_factory.h"
+#include "table/get_context.h"
+#include "table/merging_iterator.h"
+#include "table/multiget_context.h"
+#include "table/sst_file_dumper.h"
+#include "table/table_builder.h"
+#include "table/two_level_iterator.h"
+#include "table/unique_id_impl.h"
+#include "test_util/sync_point.h"
+#include "trace_replay/trace_replay.h"
+#include "util/autovector.h"
+#include "util/cast_util.h"
+#include "util/coding.h"
+#include "util/compression.h"
+#include "util/crc32c.h"
+#include "util/defer.h"
+#include "util/distributed_mutex.h"
+#include "util/hash_containers.h"
+#include "util/mutexlock.h"
+#include "util/stop_watch.h"
+#include "util/string_util.h"
+#include "utilities/trace/replayer_impl.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+const std::string kDefaultColumnFamilyName("default");
+const std::string kPersistentStatsColumnFamilyName(
+ "___rocksdb_stats_history___");
+void DumpRocksDBBuildVersion(Logger* log);
+
+CompressionType GetCompressionFlush(
+ const ImmutableCFOptions& ioptions,
+ const MutableCFOptions& mutable_cf_options) {
+ // Compressing memtable flushes might not help unless the sequential load
+ // optimization is used for leveled compaction. Otherwise the CPU and
+ // latency overhead is not offset by saving much space.
+ if (ioptions.compaction_style == kCompactionStyleUniversal &&
+ mutable_cf_options.compaction_options_universal
+ .compression_size_percent >= 0) {
+ return kNoCompression;
+ }
+ if (mutable_cf_options.compression_per_level.empty()) {
+ return mutable_cf_options.compression;
+ } else {
+ // For leveled compress when min_level_to_compress != 0.
+ return mutable_cf_options.compression_per_level[0];
+ }
+}
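GetCompressionFlush() decides what compression a flush output gets: universal compaction with compression_size_percent >= 0 skips compression at flush time, otherwise the column family's compression (or the level-0 entry of compression_per_level) applies. A hedged configuration sketch of the two cases, with arbitrary example values:

    #include "rocksdb/options.h"

    // Case 1: flushes come out uncompressed; compression is applied later by
    // universal compaction based on compression_size_percent.
    rocksdb::Options MakeUniversalOptions() {
      rocksdb::Options options;
      options.compaction_style = rocksdb::kCompactionStyleUniversal;
      options.compaction_options_universal.compression_size_percent = 80;
      options.compression = rocksdb::kSnappyCompression;
      return options;  // GetCompressionFlush() would pick kNoCompression
    }

    // Case 2: leveled compaction with per-level compression; flushes use the
    // level-0 entry of compression_per_level.
    rocksdb::Options MakeLeveledOptions() {
      rocksdb::Options options;
      options.compression_per_level = {rocksdb::kNoCompression,
                                       rocksdb::kNoCompression,
                                       rocksdb::kZSTD};
      return options;  // GetCompressionFlush() would pick the L0 entry
    }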
+
+namespace {
+void DumpSupportInfo(Logger* logger) {
+ ROCKS_LOG_HEADER(logger, "Compression algorithms supported:");
+ for (auto& compression : OptionsHelper::compression_type_string_map) {
+ if (compression.second != kNoCompression &&
+ compression.second != kDisableCompressionOption) {
+ ROCKS_LOG_HEADER(logger, "\t%s supported: %d", compression.first.c_str(),
+ CompressionTypeSupported(compression.second));
+ }
+ }
+ ROCKS_LOG_HEADER(logger, "Fast CRC32 supported: %s",
+ crc32c::IsFastCrc32Supported().c_str());
+
+ ROCKS_LOG_HEADER(logger, "DMutex implementation: %s", DMutex::kName());
+}
+} // namespace
+
+DBImpl::DBImpl(const DBOptions& options, const std::string& dbname,
+ const bool seq_per_batch, const bool batch_per_txn,
+ bool read_only)
+ : dbname_(dbname),
+ own_info_log_(options.info_log == nullptr),
+ init_logger_creation_s_(),
+ initial_db_options_(SanitizeOptions(dbname, options, read_only,
+ &init_logger_creation_s_)),
+ env_(initial_db_options_.env),
+ io_tracer_(std::make_shared<IOTracer>()),
+ immutable_db_options_(initial_db_options_),
+ fs_(immutable_db_options_.fs, io_tracer_),
+ mutable_db_options_(initial_db_options_),
+ stats_(immutable_db_options_.stats),
+#ifdef COERCE_CONTEXT_SWITCH
+ mutex_(stats_, immutable_db_options_.clock, DB_MUTEX_WAIT_MICROS, &bg_cv_,
+ immutable_db_options_.use_adaptive_mutex),
+#else // COERCE_CONTEXT_SWITCH
+ mutex_(stats_, immutable_db_options_.clock, DB_MUTEX_WAIT_MICROS,
+ immutable_db_options_.use_adaptive_mutex),
+#endif // COERCE_CONTEXT_SWITCH
+ default_cf_handle_(nullptr),
+ error_handler_(this, immutable_db_options_, &mutex_),
+ event_logger_(immutable_db_options_.info_log.get()),
+ max_total_in_memory_state_(0),
+ file_options_(BuildDBOptions(immutable_db_options_, mutable_db_options_)),
+ file_options_for_compaction_(fs_->OptimizeForCompactionTableWrite(
+ file_options_, immutable_db_options_)),
+ seq_per_batch_(seq_per_batch),
+ batch_per_txn_(batch_per_txn),
+ next_job_id_(1),
+ shutting_down_(false),
+ db_lock_(nullptr),
+ manual_compaction_paused_(false),
+ bg_cv_(&mutex_),
+ logfile_number_(0),
+ log_dir_synced_(false),
+ log_empty_(true),
+ persist_stats_cf_handle_(nullptr),
+ log_sync_cv_(&log_write_mutex_),
+ total_log_size_(0),
+ is_snapshot_supported_(true),
+ write_buffer_manager_(immutable_db_options_.write_buffer_manager.get()),
+ write_thread_(immutable_db_options_),
+ nonmem_write_thread_(immutable_db_options_),
+ write_controller_(mutable_db_options_.delayed_write_rate),
+ last_batch_group_size_(0),
+ unscheduled_flushes_(0),
+ unscheduled_compactions_(0),
+ bg_bottom_compaction_scheduled_(0),
+ bg_compaction_scheduled_(0),
+ num_running_compactions_(0),
+ bg_flush_scheduled_(0),
+ num_running_flushes_(0),
+ bg_purge_scheduled_(0),
+ disable_delete_obsolete_files_(0),
+ pending_purge_obsolete_files_(0),
+ delete_obsolete_files_last_run_(immutable_db_options_.clock->NowMicros()),
+ last_stats_dump_time_microsec_(0),
+ has_unpersisted_data_(false),
+ unable_to_release_oldest_log_(false),
+ num_running_ingest_file_(0),
+#ifndef ROCKSDB_LITE
+ wal_manager_(immutable_db_options_, file_options_, io_tracer_,
+ seq_per_batch),
+#endif // ROCKSDB_LITE
+ bg_work_paused_(0),
+ bg_compaction_paused_(0),
+ refitting_level_(false),
+ opened_successfully_(false),
+#ifndef ROCKSDB_LITE
+ periodic_task_scheduler_(),
+#endif // ROCKSDB_LITE
+ two_write_queues_(options.two_write_queues),
+ manual_wal_flush_(options.manual_wal_flush),
+ // last_sequence_ is always maintained by the main queue that also writes
+ // to the memtable. When two_write_queues_ is disabled last seq in
+ // memtable is the same as last seq published to the readers. When it is
+ // enabled but seq_per_batch_ is disabled, last seq in memtable still
+ // indicates last published seq since wal-only writes that go to the 2nd
+ // queue do not consume a sequence number. Otherwise writes performed by
+ // the 2nd queue could change what is visible to the readers. In these
+ // cases, last_seq_same_as_publish_seq_ == false and the 2nd queue maintains
+ // a separate variable to indicate the last published sequence.
+ last_seq_same_as_publish_seq_(
+ !(seq_per_batch && options.two_write_queues)),
+ // Since seq_per_batch_ is currently set only by WritePreparedTxn which
+ // requires a custom gc for compaction, we use that to set use_custom_gc_
+ // as well.
+ use_custom_gc_(seq_per_batch),
+ shutdown_initiated_(false),
+ own_sfm_(options.sst_file_manager == nullptr),
+ closed_(false),
+ atomic_flush_install_cv_(&mutex_),
+ blob_callback_(immutable_db_options_.sst_file_manager.get(), &mutex_,
+ &error_handler_, &event_logger_,
+ immutable_db_options_.listeners, dbname_) {
+ // !batch_per_txn_ implies seq_per_batch_ because it is only unset for
+ // WriteUnprepared, which should use seq_per_batch_.
+ assert(batch_per_txn_ || seq_per_batch_);
+
+ // Reserve ten files or so for other uses and give the rest to TableCache.
+ // Use a large capacity when max_open_files is set to "infinite" (-1).
+ const int table_cache_size = (mutable_db_options_.max_open_files == -1)
+ ? TableCache::kInfiniteCapacity
+ : mutable_db_options_.max_open_files - 10;
+ LRUCacheOptions co;
+ co.capacity = table_cache_size;
+ co.num_shard_bits = immutable_db_options_.table_cache_numshardbits;
+ co.metadata_charge_policy = kDontChargeCacheMetadata;
+ table_cache_ = NewLRUCache(co);
+ SetDbSessionId();
+ assert(!db_session_id_.empty());
+
+#ifndef ROCKSDB_LITE
+ periodic_task_functions_.emplace(PeriodicTaskType::kDumpStats,
+ [this]() { this->DumpStats(); });
+ periodic_task_functions_.emplace(PeriodicTaskType::kPersistStats,
+ [this]() { this->PersistStats(); });
+ periodic_task_functions_.emplace(PeriodicTaskType::kFlushInfoLog,
+ [this]() { this->FlushInfoLog(); });
+ periodic_task_functions_.emplace(
+ PeriodicTaskType::kRecordSeqnoTime,
+ [this]() { this->RecordSeqnoToTimeMapping(); });
+#endif // ROCKSDB_LITE
+
+ versions_.reset(new VersionSet(dbname_, &immutable_db_options_, file_options_,
+ table_cache_.get(), write_buffer_manager_,
+ &write_controller_, &block_cache_tracer_,
+ io_tracer_, db_id_, db_session_id_));
+ column_family_memtables_.reset(
+ new ColumnFamilyMemTablesImpl(versions_->GetColumnFamilySet()));
+
+ DumpRocksDBBuildVersion(immutable_db_options_.info_log.get());
+ DumpDBFileSummary(immutable_db_options_, dbname_, db_session_id_);
+ immutable_db_options_.Dump(immutable_db_options_.info_log.get());
+ mutable_db_options_.Dump(immutable_db_options_.info_log.get());
+ DumpSupportInfo(immutable_db_options_.info_log.get());
+
+ max_total_wal_size_.store(mutable_db_options_.max_total_wal_size,
+ std::memory_order_relaxed);
+ if (write_buffer_manager_) {
+ wbm_stall_.reset(new WBMStallInterface());
+ }
+}
+
+Status DBImpl::Resume() {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log, "Resuming DB");
+
+ InstrumentedMutexLock db_mutex(&mutex_);
+
+ if (!error_handler_.IsDBStopped() && !error_handler_.IsBGWorkStopped()) {
+ // Nothing to do
+ return Status::OK();
+ }
+
+ if (error_handler_.IsRecoveryInProgress()) {
+ // Don't allow a mix of manual and automatic recovery
+ return Status::Busy();
+ }
+
+ mutex_.Unlock();
+ Status s = error_handler_.RecoverFromBGError(true);
+ mutex_.Lock();
+ return s;
+}
+
+// This function implements the guts of recovery from a background error. It
+// is eventually called for both manual as well as automatic recovery. It does
+// the following -
+// 1. Wait for currently scheduled background flush/compaction to exit, in
+// order to avoid inadvertently causing an error and thinking recovery failed
+// 2. Flush memtables if there's any data for all the CFs. This may result in
+// another error, which will be saved by error_handler_ and reported later
+// as the recovery status
+// 3. Find and delete any obsolete files
+// 4. Schedule compactions if needed for all the CFs. This is needed as the
+// flush in the prior step might have been a no-op for some CFs, which
+// means a new super version wouldn't have been installed
+Status DBImpl::ResumeImpl(DBRecoverContext context) {
+ mutex_.AssertHeld();
+ WaitForBackgroundWork();
+
+ Status s;
+ if (shutdown_initiated_) {
+ // Returning shutdown status to SFM during auto recovery will cause it
+ // to abort the recovery and allow the shutdown to progress
+ s = Status::ShutdownInProgress();
+ }
+
+ if (s.ok()) {
+ Status bg_error = error_handler_.GetBGError();
+ if (bg_error.severity() > Status::Severity::kHardError) {
+ ROCKS_LOG_INFO(
+ immutable_db_options_.info_log,
+ "DB resume requested but failed due to Fatal/Unrecoverable error");
+ s = bg_error;
+ }
+ }
+
+ // Make sure the IO Status stored in version set is set to OK.
+ bool file_deletion_disabled = !IsFileDeletionsEnabled();
+ if (s.ok()) {
+ IOStatus io_s = versions_->io_status();
+ if (io_s.IsIOError()) {
+ // If resuming from an IOError that resulted from a MANIFEST write, then
+ // assert that we must have already set the MANIFEST writer to nullptr
+ // during the clean-up phase of MANIFEST writing. We must have also
+ // disabled file deletions.
+ assert(!versions_->descriptor_log_);
+ assert(file_deletion_disabled);
+ // Since we are trying to recover from MANIFEST write error, we need to
+ // switch to a new MANIFEST anyway. The old MANIFEST can be corrupted.
+ // Therefore, force writing a dummy version edit because we do not know
+ // whether there are flush jobs with non-empty data to flush, triggering
+ // appends to MANIFEST.
+ VersionEdit edit;
+ auto cfh =
+ static_cast_with_check<ColumnFamilyHandleImpl>(default_cf_handle_);
+ assert(cfh);
+ ColumnFamilyData* cfd = cfh->cfd();
+ const MutableCFOptions& cf_opts = *cfd->GetLatestMutableCFOptions();
+ s = versions_->LogAndApply(cfd, cf_opts, &edit, &mutex_,
+ directories_.GetDbDir());
+ if (!s.ok()) {
+ io_s = versions_->io_status();
+ if (!io_s.ok()) {
+ s = error_handler_.SetBGError(io_s,
+ BackgroundErrorReason::kManifestWrite);
+ }
+ }
+ }
+ }
+
+ // We cannot guarantee consistency of the WAL. So force flush Memtables of
+ // all the column families
+ if (s.ok()) {
+ FlushOptions flush_opts;
+ // We allow flush to stall write since we are trying to resume from error.
+ flush_opts.allow_write_stall = true;
+ if (immutable_db_options_.atomic_flush) {
+ autovector<ColumnFamilyData*> cfds;
+ SelectColumnFamiliesForAtomicFlush(&cfds);
+ mutex_.Unlock();
+ s = AtomicFlushMemTables(cfds, flush_opts, context.flush_reason);
+ mutex_.Lock();
+ } else {
+ for (auto cfd : versions_->GetRefedColumnFamilySet()) {
+ if (cfd->IsDropped()) {
+ continue;
+ }
+ InstrumentedMutexUnlock u(&mutex_);
+ s = FlushMemTable(cfd, flush_opts, context.flush_reason);
+ if (!s.ok()) {
+ break;
+ }
+ }
+ }
+ if (!s.ok()) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "DB resume requested but failed due to Flush failure [%s]",
+ s.ToString().c_str());
+ }
+ }
+
+ JobContext job_context(0);
+ FindObsoleteFiles(&job_context, true);
+ mutex_.Unlock();
+
+ job_context.manifest_file_number = 1;
+ if (job_context.HaveSomethingToDelete()) {
+ PurgeObsoleteFiles(job_context);
+ }
+ job_context.Clean();
+
+ if (s.ok()) {
+ assert(versions_->io_status().ok());
+ // If we reach here, we should re-enable file deletions if it was disabled
+ // during previous error handling.
+ if (file_deletion_disabled) {
+ // Always return ok
+ s = EnableFileDeletions(/*force=*/true);
+ if (!s.ok()) {
+ ROCKS_LOG_INFO(
+ immutable_db_options_.info_log,
+ "DB resume requested but could not enable file deletions [%s]",
+ s.ToString().c_str());
+ assert(false);
+ }
+ }
+ }
+
+ mutex_.Lock();
+ if (s.ok()) {
+ // This will notify and unblock threads waiting for error recovery to
+ // finish. Those previously waiting threads can now proceed, which may
+ // include closing the db.
+ s = error_handler_.ClearBGError();
+ } else {
+ // NOTE: this is needed to pass ASSERT_STATUS_CHECKED
+ // in the DBSSTTest.DBWithMaxSpaceAllowedRandomized test.
+ // See https://github.com/facebook/rocksdb/pull/7715#issuecomment-754947952
+ error_handler_.GetRecoveryError().PermitUncheckedError();
+ }
+
+ if (s.ok()) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log, "Successfully resumed DB");
+ } else {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log, "Failed to resume DB [%s]",
+ s.ToString().c_str());
+ }
+
+ // Check for shutdown again before scheduling further compactions,
+ // since we released and re-acquired the lock above
+ if (shutdown_initiated_) {
+ s = Status::ShutdownInProgress();
+ }
+ if (s.ok()) {
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ SchedulePendingCompaction(cfd);
+ }
+ MaybeScheduleFlushOrCompaction();
+ }
+
+ // Wake up any waiters - in this case, it could be the shutdown thread
+ bg_cv_.SignalAll();
+
+ // No need to check BGError again. If something happened, event listener would
+ // be notified and the operation causing it would have failed
+ return s;
+}
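The recovery flow above is what ultimately runs when an application calls DB::Resume() after a background error has stopped writes. A minimal, illustrative sketch of manual recovery from the application side; the listener class and its fields are assumptions made for the example, not part of this diff:

    #include <atomic>
    #include "rocksdb/db.h"
    #include "rocksdb/listener.h"

    // Hypothetical listener that remembers whether a background error occurred.
    class BgErrorFlagListener : public rocksdb::EventListener {
     public:
      void OnBackgroundError(rocksdb::BackgroundErrorReason /*reason*/,
                             rocksdb::Status* /*bg_error*/) override {
        saw_error_.store(true);
      }
      bool SawError() const { return saw_error_.load(); }

     private:
      std::atomic<bool> saw_error_{false};
    };

    // Later, e.g. after freeing disk space, attempt manual recovery:
    //   if (listener->SawError()) {
    //     rocksdb::Status s = db->Resume();  // drives ResumeImpl() above
    //   }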
+
+void DBImpl::WaitForBackgroundWork() {
+ // Wait for background work to finish
+ while (bg_bottom_compaction_scheduled_ || bg_compaction_scheduled_ ||
+ bg_flush_scheduled_) {
+ bg_cv_.Wait();
+ }
+}
+
+// Will lock the mutex_, will wait for completion if wait is true
+void DBImpl::CancelAllBackgroundWork(bool wait) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "Shutdown: canceling all background work");
+
+#ifndef ROCKSDB_LITE
+ for (uint8_t task_type = 0;
+ task_type < static_cast<uint8_t>(PeriodicTaskType::kMax); task_type++) {
+ Status s = periodic_task_scheduler_.Unregister(
+ static_cast<PeriodicTaskType>(task_type));
+ if (!s.ok()) {
+ ROCKS_LOG_WARN(immutable_db_options_.info_log,
+ "Failed to unregister periodic task %d, status: %s",
+ task_type, s.ToString().c_str());
+ }
+ }
+#endif // !ROCKSDB_LITE
+
+ InstrumentedMutexLock l(&mutex_);
+ if (!shutting_down_.load(std::memory_order_acquire) &&
+ has_unpersisted_data_.load(std::memory_order_relaxed) &&
+ !mutable_db_options_.avoid_flush_during_shutdown) {
+ if (immutable_db_options_.atomic_flush) {
+ autovector<ColumnFamilyData*> cfds;
+ SelectColumnFamiliesForAtomicFlush(&cfds);
+ mutex_.Unlock();
+ Status s =
+ AtomicFlushMemTables(cfds, FlushOptions(), FlushReason::kShutDown);
+ s.PermitUncheckedError(); //**TODO: What to do on error?
+ mutex_.Lock();
+ } else {
+ for (auto cfd : versions_->GetRefedColumnFamilySet()) {
+ if (!cfd->IsDropped() && cfd->initialized() && !cfd->mem()->IsEmpty()) {
+ InstrumentedMutexUnlock u(&mutex_);
+ Status s = FlushMemTable(cfd, FlushOptions(), FlushReason::kShutDown);
+ s.PermitUncheckedError(); //**TODO: What to do on error?
+ }
+ }
+ }
+ }
+
+ shutting_down_.store(true, std::memory_order_release);
+ bg_cv_.SignalAll();
+ if (!wait) {
+ return;
+ }
+ WaitForBackgroundWork();
+}
+
+Status DBImpl::MaybeReleaseTimestampedSnapshotsAndCheck() {
+ size_t num_snapshots = 0;
+ ReleaseTimestampedSnapshotsOlderThan(std::numeric_limits<uint64_t>::max(),
+ &num_snapshots);
+
+ // If there is unreleased snapshot, fail the close call
+ if (num_snapshots > 0) {
+ return Status::Aborted("Cannot close DB with unreleased snapshot.");
+ }
+
+ return Status::OK();
+}
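From the application side this means DB::Close() can return an Aborted status while a timestamped snapshot is still unreleased. A hedged sketch of handling that case (the retry strategy is application-specific, not something this patch prescribes):

    #include "rocksdb/db.h"

    // Illustrative only: Close() fails with Aborted while a snapshot is still
    // unreleased, so release outstanding snapshots/iterators and retry.
    rocksdb::Status CloseCarefully(rocksdb::DB* db) {
      rocksdb::Status s = db->Close();
      if (s.IsAborted()) {
        // Application-specific: release remaining snapshots here, then retry.
        s = db->Close();
      }
      return s;
    }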
+
+Status DBImpl::CloseHelper() {
+ // Guarantee that there is no background error recovery in progress before
+ // continuing with the shutdown
+ mutex_.Lock();
+ shutdown_initiated_ = true;
+ error_handler_.CancelErrorRecovery();
+ while (error_handler_.IsRecoveryInProgress()) {
+ bg_cv_.Wait();
+ }
+ mutex_.Unlock();
+
+ // The check below is added because recovery_error_ is otherwise unchecked,
+ // which causes a crash in DBSSTTest.DBWithMaxSpaceAllowedWithBlobFiles when
+ // the space limit is reached.
+ error_handler_.GetRecoveryError().PermitUncheckedError();
+
+ // CancelAllBackgroundWork called with false means we just set the shutdown
+ // marker. After this we do a variant of the waiting and unschedule work
+ // (to consider: moving all the waiting into CancelAllBackgroundWork(true))
+ CancelAllBackgroundWork(false);
+
+ // Cancel manual compaction if there's any
+ if (HasPendingManualCompaction()) {
+ DisableManualCompaction();
+ }
+ mutex_.Lock();
+ // Unschedule all tasks for this DB
+ for (uint8_t i = 0; i < static_cast<uint8_t>(TaskType::kCount); i++) {
+ env_->UnSchedule(GetTaskTag(i), Env::Priority::BOTTOM);
+ env_->UnSchedule(GetTaskTag(i), Env::Priority::LOW);
+ env_->UnSchedule(GetTaskTag(i), Env::Priority::HIGH);
+ }
+
+ Status ret = Status::OK();
+
+ // Wait for background work to finish
+ while (bg_bottom_compaction_scheduled_ || bg_compaction_scheduled_ ||
+ bg_flush_scheduled_ || bg_purge_scheduled_ ||
+ pending_purge_obsolete_files_ ||
+ error_handler_.IsRecoveryInProgress()) {
+ TEST_SYNC_POINT("DBImpl::~DBImpl:WaitJob");
+ bg_cv_.Wait();
+ }
+ TEST_SYNC_POINT_CALLBACK("DBImpl::CloseHelper:PendingPurgeFinished",
+ &files_grabbed_for_purge_);
+ EraseThreadStatusDbInfo();
+ flush_scheduler_.Clear();
+ trim_history_scheduler_.Clear();
+
+ while (!flush_queue_.empty()) {
+ const FlushRequest& flush_req = PopFirstFromFlushQueue();
+ for (const auto& iter : flush_req) {
+ iter.first->UnrefAndTryDelete();
+ }
+ }
+
+ while (!compaction_queue_.empty()) {
+ auto cfd = PopFirstFromCompactionQueue();
+ cfd->UnrefAndTryDelete();
+ }
+
+ if (default_cf_handle_ != nullptr || persist_stats_cf_handle_ != nullptr) {
+ // we need to delete handle outside of lock because it does its own locking
+ mutex_.Unlock();
+ if (default_cf_handle_) {
+ delete default_cf_handle_;
+ default_cf_handle_ = nullptr;
+ }
+ if (persist_stats_cf_handle_) {
+ delete persist_stats_cf_handle_;
+ persist_stats_cf_handle_ = nullptr;
+ }
+ mutex_.Lock();
+ }
+
+ // Clean up obsolete files due to SuperVersion release.
+ // (1) Need to delete obsolete files before closing because RepairDB()
+ // scans all existing files in the file system and builds manifest file.
+ // Keeping obsolete files confuses the repair process.
+ // (2) Need to check if we Open()/Recover() the DB successfully before
+ // deleting because if VersionSet recover fails (may be due to corrupted
+ // manifest file), it is not able to identify live files correctly. As a
+ // result, all "live" files can get deleted by accident. However, corrupted
+ // manifest is recoverable by RepairDB().
+ if (opened_successfully_) {
+ JobContext job_context(next_job_id_.fetch_add(1));
+ FindObsoleteFiles(&job_context, true);
+
+ mutex_.Unlock();
+ // manifest number starting from 2
+ job_context.manifest_file_number = 1;
+ if (job_context.HaveSomethingToDelete()) {
+ PurgeObsoleteFiles(job_context);
+ }
+ job_context.Clean();
+ mutex_.Lock();
+ }
+ {
+ InstrumentedMutexLock lock(&log_write_mutex_);
+ for (auto l : logs_to_free_) {
+ delete l;
+ }
+ for (auto& log : logs_) {
+ uint64_t log_number = log.writer->get_log_number();
+ Status s = log.ClearWriter();
+ if (!s.ok()) {
+ ROCKS_LOG_WARN(
+ immutable_db_options_.info_log,
+ "Unable to Sync WAL file %s with error -- %s",
+ LogFileName(immutable_db_options_.GetWalDir(), log_number).c_str(),
+ s.ToString().c_str());
+ // Retain the first error
+ if (ret.ok()) {
+ ret = s;
+ }
+ }
+ }
+ logs_.clear();
+ }
+
+ // Table cache may have table handles holding blocks from the block cache.
+ // We need to release them before the block cache is destroyed. The block
+ // cache may be destroyed inside versions_.reset(), when column family data
+ // list is destroyed, so leaving handles in table cache after
+ // versions_.reset() may cause issues.
+ // Here we clean all unreferenced handles in table cache.
+ // Now we assume all user queries have finished, so only version set itself
+ // can possibly hold the blocks from block cache. After releasing unreferenced
+ // handles here, only handles held by the version set are left; those are
+ // released inside versions_.reset(). There, we need to make sure every
+ // time a handle is released, we erase it from the cache too. By doing that,
+ // we can guarantee that after versions_.reset(), table cache is empty
+ // so the cache can be safely destroyed.
+ table_cache_->EraseUnRefEntries();
+
+ for (auto& txn_entry : recovered_transactions_) {
+ delete txn_entry.second;
+ }
+
+ // versions need to be destroyed before table_cache since it can hold
+ // references to table_cache.
+ versions_.reset();
+ mutex_.Unlock();
+ if (db_lock_ != nullptr) {
+ // TODO: Check for unlock error
+ env_->UnlockFile(db_lock_).PermitUncheckedError();
+ }
+
+ ROCKS_LOG_INFO(immutable_db_options_.info_log, "Shutdown complete");
+ LogFlush(immutable_db_options_.info_log);
+
+#ifndef ROCKSDB_LITE
+ // If the sst_file_manager was allocated by us during DB::Open(), call
+ // Close() on it before closing the info_log. Otherwise, a background thread
+ // in SstFileManagerImpl might try to log something
+ if (immutable_db_options_.sst_file_manager && own_sfm_) {
+ auto sfm = static_cast<SstFileManagerImpl*>(
+ immutable_db_options_.sst_file_manager.get());
+ sfm->Close();
+ }
+#endif // ROCKSDB_LITE
+
+ if (immutable_db_options_.info_log && own_info_log_) {
+ Status s = immutable_db_options_.info_log->Close();
+ if (!s.ok() && !s.IsNotSupported() && ret.ok()) {
+ ret = s;
+ }
+ }
+
+ if (write_buffer_manager_ && wbm_stall_) {
+ write_buffer_manager_->RemoveDBFromQueue(wbm_stall_.get());
+ }
+
+ IOStatus io_s = directories_.Close(IOOptions(), nullptr /* dbg */);
+ if (!io_s.ok()) {
+ ret = io_s;
+ }
+ if (ret.IsAborted()) {
+ // Reserve the IsAborted() error for cases where the user did not release
+ // certain resources; they can release them and come back to retry. In this
+ // case, we wrap this error in something else.
+ return Status::Incomplete(ret.ToString());
+ }
+
+ return ret;
+}
+
+Status DBImpl::CloseImpl() { return CloseHelper(); }
+
+DBImpl::~DBImpl() {
+ // TODO: remove this.
+ init_logger_creation_s_.PermitUncheckedError();
+
+ InstrumentedMutexLock closing_lock_guard(&closing_mutex_);
+ if (closed_) {
+ return;
+ }
+
+ closed_ = true;
+
+ {
+ const Status s = MaybeReleaseTimestampedSnapshotsAndCheck();
+ s.PermitUncheckedError();
+ }
+
+ closing_status_ = CloseImpl();
+ closing_status_.PermitUncheckedError();
+}
+
+void DBImpl::MaybeIgnoreError(Status* s) const {
+ if (s->ok() || immutable_db_options_.paranoid_checks) {
+ // No change needed
+ } else {
+ ROCKS_LOG_WARN(immutable_db_options_.info_log, "Ignoring error %s",
+ s->ToString().c_str());
+ *s = Status::OK();
+ }
+}
+
+const Status DBImpl::CreateArchivalDirectory() {
+ if (immutable_db_options_.WAL_ttl_seconds > 0 ||
+ immutable_db_options_.WAL_size_limit_MB > 0) {
+ std::string archivalPath =
+ ArchivalDirectory(immutable_db_options_.GetWalDir());
+ return env_->CreateDirIfMissing(archivalPath);
+ }
+ return Status::OK();
+}
+
+void DBImpl::PrintStatistics() {
+ auto dbstats = immutable_db_options_.stats;
+ if (dbstats) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log, "STATISTICS:\n %s",
+ dbstats->ToString().c_str());
+ }
+}
+
+Status DBImpl::StartPeriodicTaskScheduler() {
+#ifndef ROCKSDB_LITE
+
+#ifndef NDEBUG
+ // It is only used by tests to disable the scheduler
+ bool disable_scheduler = false;
+ TEST_SYNC_POINT_CALLBACK(
+ "DBImpl::StartPeriodicTaskScheduler:DisableScheduler",
+ &disable_scheduler);
+ if (disable_scheduler) {
+ return Status::OK();
+ }
+
+ {
+ InstrumentedMutexLock l(&mutex_);
+ TEST_SYNC_POINT_CALLBACK("DBImpl::StartPeriodicTaskScheduler:Init",
+ &periodic_task_scheduler_);
+ }
+
+#endif // !NDEBUG
+ if (mutable_db_options_.stats_dump_period_sec > 0) {
+ Status s = periodic_task_scheduler_.Register(
+ PeriodicTaskType::kDumpStats,
+ periodic_task_functions_.at(PeriodicTaskType::kDumpStats),
+ mutable_db_options_.stats_dump_period_sec);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+ if (mutable_db_options_.stats_persist_period_sec > 0) {
+ Status s = periodic_task_scheduler_.Register(
+ PeriodicTaskType::kPersistStats,
+ periodic_task_functions_.at(PeriodicTaskType::kPersistStats),
+ mutable_db_options_.stats_persist_period_sec);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ Status s = periodic_task_scheduler_.Register(
+ PeriodicTaskType::kFlushInfoLog,
+ periodic_task_functions_.at(PeriodicTaskType::kFlushInfoLog));
+
+ return s;
+#else
+ return Status::OK();
+#endif // !ROCKSDB_LITE
+}
+
+Status DBImpl::RegisterRecordSeqnoTimeWorker() {
+#ifndef ROCKSDB_LITE
+ uint64_t min_time_duration = std::numeric_limits<uint64_t>::max();
+ uint64_t max_time_duration = std::numeric_limits<uint64_t>::min();
+ {
+ InstrumentedMutexLock l(&mutex_);
+
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ // preserve time is the max of 2 options.
+ uint64_t preserve_time_duration =
+ std::max(cfd->ioptions()->preserve_internal_time_seconds,
+ cfd->ioptions()->preclude_last_level_data_seconds);
+ if (!cfd->IsDropped() && preserve_time_duration > 0) {
+ min_time_duration = std::min(preserve_time_duration, min_time_duration);
+ max_time_duration = std::max(preserve_time_duration, max_time_duration);
+ }
+ }
+ if (min_time_duration == std::numeric_limits<uint64_t>::max()) {
+ seqno_time_mapping_.Resize(0, 0);
+ } else {
+ seqno_time_mapping_.Resize(min_time_duration, max_time_duration);
+ }
+ }
+
+ uint64_t seqno_time_cadence = 0;
+ if (min_time_duration != std::numeric_limits<uint64_t>::max()) {
+ // round up to 1 when the time_duration is smaller than
+ // kMaxSeqnoTimePairsPerCF
+ seqno_time_cadence =
+ (min_time_duration + SeqnoToTimeMapping::kMaxSeqnoTimePairsPerCF - 1) /
+ SeqnoToTimeMapping::kMaxSeqnoTimePairsPerCF;
+ }
+
+ Status s;
+ if (seqno_time_cadence == 0) {
+ s = periodic_task_scheduler_.Unregister(PeriodicTaskType::kRecordSeqnoTime);
+ } else {
+ s = periodic_task_scheduler_.Register(
+ PeriodicTaskType::kRecordSeqnoTime,
+ periodic_task_functions_.at(PeriodicTaskType::kRecordSeqnoTime),
+ seqno_time_cadence);
+ }
+
+ return s;
+#else
+ return Status::OK();
+#endif // !ROCKSDB_LITE
+}
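As a worked example of the cadence formula above: if the shortest preserve duration across live CFs (the max of preserve_internal_time_seconds and preclude_last_level_data_seconds per CF) is 3600 seconds, and kMaxSeqnoTimePairsPerCF were 100 (the actual constant lives in SeqnoToTimeMapping and is not shown in this hunk, so treat the value as an assumption), the worker would be registered with a cadence of (3600 + 100 - 1) / 100 = 36 seconds, i.e. roughly kMaxSeqnoTimePairsPerCF samples over the shortest preserve window.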
+
+// estimate the total size of stats_history_
+size_t DBImpl::EstimateInMemoryStatsHistorySize() const {
+ size_t size_total =
+ sizeof(std::map<uint64_t, std::map<std::string, uint64_t>>);
+ if (stats_history_.size() == 0) return size_total;
+ size_t size_per_slice =
+ sizeof(uint64_t) + sizeof(std::map<std::string, uint64_t>);
+ // non-empty map, stats_history_.begin() guaranteed to exist
+ for (const auto& pairs : stats_history_.begin()->second) {
+ size_per_slice +=
+ pairs.first.capacity() + sizeof(pairs.first) + sizeof(pairs.second);
+ }
+ size_total = size_per_slice * stats_history_.size();
+ return size_total;
+}
+
+void DBImpl::PersistStats() {
+ TEST_SYNC_POINT("DBImpl::PersistStats:Entry");
+#ifndef ROCKSDB_LITE
+ if (shutdown_initiated_) {
+ return;
+ }
+ TEST_SYNC_POINT("DBImpl::PersistStats:StartRunning");
+ uint64_t now_seconds =
+ immutable_db_options_.clock->NowMicros() / kMicrosInSecond;
+
+ Statistics* statistics = immutable_db_options_.stats;
+ if (!statistics) {
+ return;
+ }
+ size_t stats_history_size_limit = 0;
+ {
+ InstrumentedMutexLock l(&mutex_);
+ stats_history_size_limit = mutable_db_options_.stats_history_buffer_size;
+ }
+
+ std::map<std::string, uint64_t> stats_map;
+ if (!statistics->getTickerMap(&stats_map)) {
+ return;
+ }
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "------- PERSISTING STATS -------");
+
+ if (immutable_db_options_.persist_stats_to_disk) {
+ WriteBatch batch;
+ Status s = Status::OK();
+ if (stats_slice_initialized_) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "Reading %" ROCKSDB_PRIszt " stats from statistics\n",
+ stats_slice_.size());
+ for (const auto& stat : stats_map) {
+ if (s.ok()) {
+ char key[100];
+ int length =
+ EncodePersistentStatsKey(now_seconds, stat.first, 100, key);
+ // calculate the delta from last time
+ if (stats_slice_.find(stat.first) != stats_slice_.end()) {
+ uint64_t delta = stat.second - stats_slice_[stat.first];
+ s = batch.Put(persist_stats_cf_handle_,
+ Slice(key, std::min(100, length)),
+ std::to_string(delta));
+ }
+ }
+ }
+ }
+ stats_slice_initialized_ = true;
+ std::swap(stats_slice_, stats_map);
+ if (s.ok()) {
+ WriteOptions wo;
+ wo.low_pri = true;
+ wo.no_slowdown = true;
+ wo.sync = false;
+ s = Write(wo, &batch);
+ }
+ if (!s.ok()) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "Writing to persistent stats CF failed -- %s",
+ s.ToString().c_str());
+ } else {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "Writing %" ROCKSDB_PRIszt " stats with timestamp %" PRIu64
+ " to persistent stats CF succeeded",
+ stats_slice_.size(), now_seconds);
+ }
+ // TODO(Zhongyi): add purging for persisted data
+ } else {
+ InstrumentedMutexLock l(&stats_history_mutex_);
+ // calculate the delta from last time
+ if (stats_slice_initialized_) {
+ std::map<std::string, uint64_t> stats_delta;
+ for (const auto& stat : stats_map) {
+ if (stats_slice_.find(stat.first) != stats_slice_.end()) {
+ stats_delta[stat.first] = stat.second - stats_slice_[stat.first];
+ }
+ }
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "Storing %" ROCKSDB_PRIszt " stats with timestamp %" PRIu64
+ " to in-memory stats history",
+ stats_slice_.size(), now_seconds);
+ stats_history_[now_seconds] = stats_delta;
+ }
+ stats_slice_initialized_ = true;
+ std::swap(stats_slice_, stats_map);
+ TEST_SYNC_POINT("DBImpl::PersistStats:StatsCopied");
+
+ // delete older stats snapshots to control memory consumption
+ size_t stats_history_size = EstimateInMemoryStatsHistorySize();
+ bool purge_needed = stats_history_size > stats_history_size_limit;
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "[Pre-GC] In-memory stats history size: %" ROCKSDB_PRIszt
+ " bytes, slice count: %" ROCKSDB_PRIszt,
+ stats_history_size, stats_history_.size());
+ while (purge_needed && !stats_history_.empty()) {
+ stats_history_.erase(stats_history_.begin());
+ purge_needed =
+ EstimateInMemoryStatsHistorySize() > stats_history_size_limit;
+ }
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "[Post-GC] In-memory stats history size: %" ROCKSDB_PRIszt
+ " bytes, slice count: %" ROCKSDB_PRIszt,
+ stats_history_size, stats_history_.size());
+ }
+ TEST_SYNC_POINT("DBImpl::PersistStats:End");
+#endif // !ROCKSDB_LITE
+}
+
+bool DBImpl::FindStatsByTime(uint64_t start_time, uint64_t end_time,
+ uint64_t* new_time,
+ std::map<std::string, uint64_t>* stats_map) {
+ assert(new_time);
+ assert(stats_map);
+ if (!new_time || !stats_map) return false;
+ // lock when searching for start_time
+ {
+ InstrumentedMutexLock l(&stats_history_mutex_);
+ auto it = stats_history_.lower_bound(start_time);
+ if (it != stats_history_.end() && it->first < end_time) {
+ // make a copy for timestamp and stats_map
+ *new_time = it->first;
+ *stats_map = it->second;
+ return true;
+ } else {
+ return false;
+ }
+ }
+}
+
+Status DBImpl::GetStatsHistory(
+ uint64_t start_time, uint64_t end_time,
+ std::unique_ptr<StatsHistoryIterator>* stats_iterator) {
+ if (!stats_iterator) {
+ return Status::InvalidArgument("stats_iterator not preallocated.");
+ }
+ if (immutable_db_options_.persist_stats_to_disk) {
+ stats_iterator->reset(
+ new PersistentStatsHistoryIterator(start_time, end_time, this));
+ } else {
+ stats_iterator->reset(
+ new InMemoryStatsHistoryIterator(start_time, end_time, this));
+ }
+ return (*stats_iterator)->status();
+}
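PersistStats() above feeds either the persistent stats column family or the in-memory stats_history_ map, and GetStatsHistory() is the public way to read it back. A minimal usage sketch, assuming the public headers rocksdb/statistics.h and rocksdb/stats_history.h and that the iterator exposes Valid()/Next()/GetStatsTime()/GetStatsMap() as declared there; the option values are arbitrary examples:

    #include <iostream>
    #include <limits>
    #include <memory>
    #include "rocksdb/db.h"
    #include "rocksdb/statistics.h"
    #include "rocksdb/stats_history.h"

    void DumpRecentStats(rocksdb::DB* db) {
      // Requires a DB opened with options.statistics = CreateDBStatistics()
      // and a non-zero stats_persist_period_sec.
      std::unique_ptr<rocksdb::StatsHistoryIterator> it;
      rocksdb::Status s = db->GetStatsHistory(
          0 /* start_time */, std::numeric_limits<uint64_t>::max(), &it);
      if (!s.ok()) {
        return;
      }
      for (; it->Valid(); it->Next()) {
        std::cout << "t=" << it->GetStatsTime() << "\n";
        for (const auto& kv : it->GetStatsMap()) {
          std::cout << "  " << kv.first << " = " << kv.second << "\n";
        }
      }
    }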
+
+void DBImpl::DumpStats() {
+ TEST_SYNC_POINT("DBImpl::DumpStats:1");
+#ifndef ROCKSDB_LITE
+ std::string stats;
+ if (shutdown_initiated_) {
+ return;
+ }
+
+ // Also probe block cache(s) for problems, dump to info log
+ UnorderedSet<Cache*> probed_caches;
+ TEST_SYNC_POINT("DBImpl::DumpStats:StartRunning");
+ {
+ InstrumentedMutexLock l(&mutex_);
+ for (auto cfd : versions_->GetRefedColumnFamilySet()) {
+ if (!cfd->initialized()) {
+ continue;
+ }
+
+ // Release DB mutex for gathering cache entry stats. Pass over all
+ // column families for this first so that other stats are dumped
+ // near-atomically.
+ InstrumentedMutexUnlock u(&mutex_);
+ cfd->internal_stats()->CollectCacheEntryStats(/*foreground=*/false);
+
+ // Probe block cache for problems (if not already via another CF)
+ if (immutable_db_options_.info_log) {
+ auto* table_factory = cfd->ioptions()->table_factory.get();
+ assert(table_factory != nullptr);
+ Cache* cache =
+ table_factory->GetOptions<Cache>(TableFactory::kBlockCacheOpts());
+ if (cache && probed_caches.insert(cache).second) {
+ cache->ReportProblems(immutable_db_options_.info_log);
+ }
+ }
+ }
+
+ const std::string* property = &DB::Properties::kDBStats;
+ const DBPropertyInfo* property_info = GetPropertyInfo(*property);
+ assert(property_info != nullptr);
+ assert(!property_info->need_out_of_mutex);
+ default_cf_internal_stats_->GetStringProperty(*property_info, *property,
+ &stats);
+
+ property = &InternalStats::kPeriodicCFStats;
+ property_info = GetPropertyInfo(*property);
+ assert(property_info != nullptr);
+ assert(!property_info->need_out_of_mutex);
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ if (cfd->initialized()) {
+ cfd->internal_stats()->GetStringProperty(*property_info, *property,
+ &stats);
+ }
+ }
+ }
+ TEST_SYNC_POINT("DBImpl::DumpStats:2");
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "------- DUMPING STATS -------");
+ ROCKS_LOG_INFO(immutable_db_options_.info_log, "%s", stats.c_str());
+ if (immutable_db_options_.dump_malloc_stats) {
+ stats.clear();
+ DumpMallocStats(&stats);
+ if (!stats.empty()) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "------- Malloc STATS -------");
+ ROCKS_LOG_INFO(immutable_db_options_.info_log, "%s", stats.c_str());
+ }
+ }
+#endif // !ROCKSDB_LITE
+
+ PrintStatistics();
+}
+
+// Periodically flush info log out of application buffer at a low frequency.
+// This improves debuggability in case of RocksDB hanging since it ensures the
+// log messages leading up to the hang will eventually become visible in the
+// log.
+void DBImpl::FlushInfoLog() {
+ if (shutdown_initiated_) {
+ return;
+ }
+ TEST_SYNC_POINT("DBImpl::FlushInfoLog:StartRunning");
+ LogFlush(immutable_db_options_.info_log);
+}
+
+Status DBImpl::TablesRangeTombstoneSummary(ColumnFamilyHandle* column_family,
+ int max_entries_to_print,
+ std::string* out_str) {
+ auto* cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+ ColumnFamilyData* cfd = cfh->cfd();
+
+ SuperVersion* super_version = cfd->GetReferencedSuperVersion(this);
+ Version* version = super_version->current;
+
+ Status s =
+ version->TablesRangeTombstoneSummary(max_entries_to_print, out_str);
+
+ CleanupSuperVersion(super_version);
+ return s;
+}
+
+void DBImpl::ScheduleBgLogWriterClose(JobContext* job_context) {
+ mutex_.AssertHeld();
+ if (!job_context->logs_to_free.empty()) {
+ for (auto l : job_context->logs_to_free) {
+ AddToLogsToFreeQueue(l);
+ }
+ job_context->logs_to_free.clear();
+ }
+}
+
+FSDirectory* DBImpl::GetDataDir(ColumnFamilyData* cfd, size_t path_id) const {
+ assert(cfd);
+ FSDirectory* ret_dir = cfd->GetDataDir(path_id);
+ if (ret_dir == nullptr) {
+ return directories_.GetDataDir(path_id);
+ }
+ return ret_dir;
+}
+
+Status DBImpl::SetOptions(
+ ColumnFamilyHandle* column_family,
+ const std::unordered_map<std::string, std::string>& options_map) {
+#ifdef ROCKSDB_LITE
+ (void)column_family;
+ (void)options_map;
+ return Status::NotSupported("Not supported in ROCKSDB LITE");
+#else
+ auto* cfd =
+ static_cast_with_check<ColumnFamilyHandleImpl>(column_family)->cfd();
+ if (options_map.empty()) {
+ ROCKS_LOG_WARN(immutable_db_options_.info_log,
+ "SetOptions() on column family [%s], empty input",
+ cfd->GetName().c_str());
+ return Status::InvalidArgument("empty input");
+ }
+
+ MutableCFOptions new_options;
+ Status s;
+ Status persist_options_status;
+ SuperVersionContext sv_context(/* create_superversion */ true);
+ {
+ auto db_options = GetDBOptions();
+ InstrumentedMutexLock l(&mutex_);
+ s = cfd->SetOptions(db_options, options_map);
+ if (s.ok()) {
+ new_options = *cfd->GetLatestMutableCFOptions();
+ // Append new version to recompute compaction score.
+ VersionEdit dummy_edit;
+ s = versions_->LogAndApply(cfd, new_options, &dummy_edit, &mutex_,
+ directories_.GetDbDir());
+ // Trigger possible flush/compactions. This has to be before we persist
+ // options to file, otherwise there will be a deadlock with writer
+ // thread.
+ InstallSuperVersionAndScheduleWork(cfd, &sv_context, new_options);
+
+ persist_options_status = WriteOptionsFile(
+ false /*need_mutex_lock*/, true /*need_enter_write_thread*/);
+ bg_cv_.SignalAll();
+ }
+ }
+ sv_context.Clean();
+
+ ROCKS_LOG_INFO(
+ immutable_db_options_.info_log,
+ "SetOptions() on column family [%s], inputs:", cfd->GetName().c_str());
+ for (const auto& o : options_map) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log, "%s: %s\n", o.first.c_str(),
+ o.second.c_str());
+ }
+ if (s.ok()) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "[%s] SetOptions() succeeded", cfd->GetName().c_str());
+ new_options.Dump(immutable_db_options_.info_log.get());
+ if (!persist_options_status.ok()) {
+ // NOTE: WriteOptionsFile already logs on failure
+ s = persist_options_status;
+ }
+ } else {
+ persist_options_status.PermitUncheckedError(); // less important
+ ROCKS_LOG_WARN(immutable_db_options_.info_log, "[%s] SetOptions() failed",
+ cfd->GetName().c_str());
+ }
+ LogFlush(immutable_db_options_.info_log);
+ return s;
+#endif // ROCKSDB_LITE
+}
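SetOptions() takes its new values as strings and, on success, installs a new SuperVersion and persists the OPTIONS file. A small illustrative call against the default column family; the option names and values are just examples of mutable CF options:

    #include "rocksdb/db.h"

    rocksdb::Status BumpWriteBuffer(rocksdb::DB* db) {
      // Mutable CF options are passed as strings; this targets the default CF.
      return db->SetOptions({{"write_buffer_size", "67108864"},
                             {"level0_file_num_compaction_trigger", "8"}});
    }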
+
+Status DBImpl::SetDBOptions(
+ const std::unordered_map<std::string, std::string>& options_map) {
+#ifdef ROCKSDB_LITE
+ (void)options_map;
+ return Status::NotSupported("Not supported in ROCKSDB LITE");
+#else
+ if (options_map.empty()) {
+ ROCKS_LOG_WARN(immutable_db_options_.info_log,
+ "SetDBOptions(), empty input.");
+ return Status::InvalidArgument("empty input");
+ }
+
+ MutableDBOptions new_options;
+ Status s;
+ Status persist_options_status = Status::OK();
+ bool wal_changed = false;
+ WriteContext write_context;
+ {
+ InstrumentedMutexLock l(&mutex_);
+ s = GetMutableDBOptionsFromStrings(mutable_db_options_, options_map,
+ &new_options);
+
+ if (new_options.bytes_per_sync == 0) {
+ new_options.bytes_per_sync = 1024 * 1024;
+ }
+
+ if (MutableDBOptionsAreEqual(mutable_db_options_, new_options)) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "SetDBOptions(), input option value is not changed, "
+ "skipping updating.");
+ persist_options_status.PermitUncheckedError();
+ return s;
+ }
+
+ DBOptions new_db_options =
+ BuildDBOptions(immutable_db_options_, new_options);
+ if (s.ok()) {
+ s = ValidateOptions(new_db_options);
+ }
+ if (s.ok()) {
+ for (auto c : *versions_->GetColumnFamilySet()) {
+ if (!c->IsDropped()) {
+ auto cf_options = c->GetLatestCFOptions();
+ s = ColumnFamilyData::ValidateOptions(new_db_options, cf_options);
+ if (!s.ok()) {
+ break;
+ }
+ }
+ }
+ }
+ if (s.ok()) {
+ const BGJobLimits current_bg_job_limits =
+ GetBGJobLimits(mutable_db_options_.max_background_flushes,
+ mutable_db_options_.max_background_compactions,
+ mutable_db_options_.max_background_jobs,
+ /* parallelize_compactions */ true);
+ const BGJobLimits new_bg_job_limits = GetBGJobLimits(
+ new_options.max_background_flushes,
+ new_options.max_background_compactions,
+ new_options.max_background_jobs, /* parallelize_compactions */ true);
+
+ const bool max_flushes_increased =
+ new_bg_job_limits.max_flushes > current_bg_job_limits.max_flushes;
+ const bool max_compactions_increased =
+ new_bg_job_limits.max_compactions >
+ current_bg_job_limits.max_compactions;
+
+ if (max_flushes_increased || max_compactions_increased) {
+ if (max_flushes_increased) {
+ env_->IncBackgroundThreadsIfNeeded(new_bg_job_limits.max_flushes,
+ Env::Priority::HIGH);
+ }
+
+ if (max_compactions_increased) {
+ env_->IncBackgroundThreadsIfNeeded(new_bg_job_limits.max_compactions,
+ Env::Priority::LOW);
+ }
+
+ MaybeScheduleFlushOrCompaction();
+ }
+
+ mutex_.Unlock();
+ if (new_options.stats_dump_period_sec == 0) {
+ s = periodic_task_scheduler_.Unregister(PeriodicTaskType::kDumpStats);
+ } else {
+ s = periodic_task_scheduler_.Register(
+ PeriodicTaskType::kDumpStats,
+ periodic_task_functions_.at(PeriodicTaskType::kDumpStats),
+ new_options.stats_dump_period_sec);
+ }
+ if (new_options.max_total_wal_size !=
+ mutable_db_options_.max_total_wal_size) {
+ max_total_wal_size_.store(new_options.max_total_wal_size,
+ std::memory_order_release);
+ }
+ if (s.ok()) {
+ if (new_options.stats_persist_period_sec == 0) {
+ s = periodic_task_scheduler_.Unregister(
+ PeriodicTaskType::kPersistStats);
+ } else {
+ s = periodic_task_scheduler_.Register(
+ PeriodicTaskType::kPersistStats,
+ periodic_task_functions_.at(PeriodicTaskType::kPersistStats),
+ new_options.stats_persist_period_sec);
+ }
+ }
+ mutex_.Lock();
+ if (!s.ok()) {
+ return s;
+ }
+
+ write_controller_.set_max_delayed_write_rate(
+ new_options.delayed_write_rate);
+ table_cache_.get()->SetCapacity(new_options.max_open_files == -1
+ ? TableCache::kInfiniteCapacity
+ : new_options.max_open_files - 10);
+ wal_changed = mutable_db_options_.wal_bytes_per_sync !=
+ new_options.wal_bytes_per_sync;
+ mutable_db_options_ = new_options;
+ file_options_for_compaction_ = FileOptions(new_db_options);
+ file_options_for_compaction_ = fs_->OptimizeForCompactionTableWrite(
+ file_options_for_compaction_, immutable_db_options_);
+ versions_->ChangeFileOptions(mutable_db_options_);
+ // TODO(xiez): clarify why apply optimize for read to write options
+ file_options_for_compaction_ = fs_->OptimizeForCompactionTableRead(
+ file_options_for_compaction_, immutable_db_options_);
+ file_options_for_compaction_.compaction_readahead_size =
+ mutable_db_options_.compaction_readahead_size;
+ WriteThread::Writer w;
+ write_thread_.EnterUnbatched(&w, &mutex_);
+ if (total_log_size_ > GetMaxTotalWalSize() || wal_changed) {
+ Status purge_wal_status = SwitchWAL(&write_context);
+ if (!purge_wal_status.ok()) {
+ ROCKS_LOG_WARN(immutable_db_options_.info_log,
+ "Unable to purge WAL files in SetDBOptions() -- %s",
+ purge_wal_status.ToString().c_str());
+ }
+ }
+ persist_options_status = WriteOptionsFile(
+ false /*need_mutex_lock*/, false /*need_enter_write_thread*/);
+ write_thread_.ExitUnbatched(&w);
+ } else {
+ // To get here, we must have had invalid options and will not attempt to
+ // persist the options, which means the status is "OK/Uninitialized".
+ persist_options_status.PermitUncheckedError();
+ }
+ }
+ ROCKS_LOG_INFO(immutable_db_options_.info_log, "SetDBOptions(), inputs:");
+ for (const auto& o : options_map) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log, "%s: %s\n", o.first.c_str(),
+ o.second.c_str());
+ }
+ if (s.ok()) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log, "SetDBOptions() succeeded");
+ new_options.Dump(immutable_db_options_.info_log.get());
+ if (!persist_options_status.ok()) {
+ if (immutable_db_options_.fail_if_options_file_error) {
+ s = Status::IOError(
+ "SetDBOptions() succeeded, but unable to persist options",
+ persist_options_status.ToString());
+ }
+ ROCKS_LOG_WARN(immutable_db_options_.info_log,
+ "Unable to persist options in SetDBOptions() -- %s",
+ persist_options_status.ToString().c_str());
+ }
+ } else {
+ ROCKS_LOG_WARN(immutable_db_options_.info_log, "SetDBOptions failed");
+ }
+ LogFlush(immutable_db_options_.info_log);
+ return s;
+#endif // ROCKSDB_LITE
+}
+
+// return the same level if it cannot be moved
+int DBImpl::FindMinimumEmptyLevelFitting(
+ ColumnFamilyData* cfd, const MutableCFOptions& /*mutable_cf_options*/,
+ int level) {
+ mutex_.AssertHeld();
+ const auto* vstorage = cfd->current()->storage_info();
+ int minimum_level = level;
+ for (int i = level - 1; i > 0; --i) {
+ // stop if level i is not empty
+ if (vstorage->NumLevelFiles(i) > 0) break;
+ // stop if level i is too small (cannot fit the level files)
+ if (vstorage->MaxBytesForLevel(i) < vstorage->NumLevelBytes(level)) {
+ break;
+ }
+
+ minimum_level = i;
+ }
+ return minimum_level;
+}
+
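+// With DBOptions::manual_wal_flush == true, writes only reach the WAL
+// writer's in-memory buffer; a typical pattern (illustrative, not part of
+// this change) is to batch several writes and then call
+// db->FlushWAL(/*sync=*/true) to push the buffer to the file and fsync it in
+// one step. With sync == false only the buffered bytes are handed to the
+// file system, without a sync.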
+Status DBImpl::FlushWAL(bool sync) {
+ if (manual_wal_flush_) {
+ IOStatus io_s;
+ {
+ // We need to lock log_write_mutex_ since logs_ might change concurrently
+ InstrumentedMutexLock wl(&log_write_mutex_);
+ log::Writer* cur_log_writer = logs_.back().writer;
+ io_s = cur_log_writer->WriteBuffer();
+ }
+ if (!io_s.ok()) {
+ ROCKS_LOG_ERROR(immutable_db_options_.info_log, "WAL flush error %s",
+ io_s.ToString().c_str());
+ // In case there is a fs error we should set it globally to prevent
+ // future writes
+ IOStatusCheck(io_s);
+ // whether sync or not, we should abort the rest of the function upon error
+ return static_cast<Status>(io_s);
+ }
+ if (!sync) {
+ ROCKS_LOG_DEBUG(immutable_db_options_.info_log, "FlushWAL sync=false");
+ return static_cast<Status>(io_s);
+ }
+ }
+ if (!sync) {
+ return Status::OK();
+ }
+ // sync = true
+ ROCKS_LOG_DEBUG(immutable_db_options_.info_log, "FlushWAL sync=true");
+ return SyncWAL();
+}
+
+bool DBImpl::WALBufferIsEmpty(bool lock) {
+ if (lock) {
+ log_write_mutex_.Lock();
+ }
+ log::Writer* cur_log_writer = logs_.back().writer;
+ auto res = cur_log_writer->BufferIsEmpty();
+ if (lock) {
+ log_write_mutex_.Unlock();
+ }
+ return res;
+}
+
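+// Syncs all WALs up to the current log number: the writers are collected
+// under log_write_mutex_, synced outside the mutex via SyncWithoutFlush(),
+// and the outcome is recorded with MarkLogsSynced()/MarkLogsNotSynced();
+// newly synced WALs may also be recorded in the MANIFEST through
+// ApplyWALToManifest().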
+Status DBImpl::SyncWAL() {
+ TEST_SYNC_POINT("DBImpl::SyncWAL:Begin");
+ autovector<log::Writer*, 1> logs_to_sync;
+ bool need_log_dir_sync;
+ uint64_t current_log_number;
+
+ {
+ InstrumentedMutexLock l(&log_write_mutex_);
+ assert(!logs_.empty());
+
+ // This SyncWAL() call only cares about logs up to this number.
+ current_log_number = logfile_number_;
+
+ while (logs_.front().number <= current_log_number &&
+ logs_.front().IsSyncing()) {
+ log_sync_cv_.Wait();
+ }
+ // First check that logs are safe to sync in background.
+ for (auto it = logs_.begin();
+ it != logs_.end() && it->number <= current_log_number; ++it) {
+ if (!it->writer->file()->writable_file()->IsSyncThreadSafe()) {
+ return Status::NotSupported(
+ "SyncWAL() is not supported for this implementation of WAL file",
+ immutable_db_options_.allow_mmap_writes
+ ? "try setting Options::allow_mmap_writes to false"
+ : Slice());
+ }
+ }
+ for (auto it = logs_.begin();
+ it != logs_.end() && it->number <= current_log_number; ++it) {
+ auto& log = *it;
+ log.PrepareForSync();
+ logs_to_sync.push_back(log.writer);
+ }
+
+ need_log_dir_sync = !log_dir_synced_;
+ }
+
+ TEST_SYNC_POINT("DBWALTest::SyncWALNotWaitWrite:1");
+ RecordTick(stats_, WAL_FILE_SYNCED);
+ Status status;
+ IOStatus io_s;
+ for (log::Writer* log : logs_to_sync) {
+ io_s = log->file()->SyncWithoutFlush(immutable_db_options_.use_fsync);
+ if (!io_s.ok()) {
+ status = io_s;
+ break;
+ }
+ }
+ if (!io_s.ok()) {
+ ROCKS_LOG_ERROR(immutable_db_options_.info_log, "WAL Sync error %s",
+ io_s.ToString().c_str());
+ // In case there is a fs error we should set it globally to prevent
+ // future writes
+ IOStatusCheck(io_s);
+ }
+ if (status.ok() && need_log_dir_sync) {
+ status = directories_.GetWalDir()->FsyncWithDirOptions(
+ IOOptions(), nullptr,
+ DirFsyncOptions(DirFsyncOptions::FsyncReason::kNewFileSynced));
+ }
+ TEST_SYNC_POINT("DBWALTest::SyncWALNotWaitWrite:2");
+
+ TEST_SYNC_POINT("DBImpl::SyncWAL:BeforeMarkLogsSynced:1");
+ VersionEdit synced_wals;
+ {
+ InstrumentedMutexLock l(&log_write_mutex_);
+ if (status.ok()) {
+ MarkLogsSynced(current_log_number, need_log_dir_sync, &synced_wals);
+ } else {
+ MarkLogsNotSynced(current_log_number);
+ }
+ }
+ if (status.ok() && synced_wals.IsWalAddition()) {
+ InstrumentedMutexLock l(&mutex_);
+ status = ApplyWALToManifest(&synced_wals);
+ }
+
+ TEST_SYNC_POINT("DBImpl::SyncWAL:BeforeMarkLogsSynced:2");
+
+ return status;
+}
+
+Status DBImpl::ApplyWALToManifest(VersionEdit* synced_wals) {
+ // synced_wals is not empty (checked by the caller), so record it in the
+ // MANIFEST.
+ mutex_.AssertHeld();
+ Status status = versions_->LogAndApplyToDefaultColumnFamily(
+ synced_wals, &mutex_, directories_.GetDbDir());
+ if (!status.ok() && versions_->io_status().IsIOError()) {
+ status = error_handler_.SetBGError(versions_->io_status(),
+ BackgroundErrorReason::kManifestWrite);
+ }
+ return status;
+}
+
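+// LockWAL() flushes the current WAL buffer and then keeps log_write_mutex_
+// held until the matching UnlockWAL() call, blocking concurrent WAL writes
+// in between.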
+Status DBImpl::LockWAL() {
+ log_write_mutex_.Lock();
+ auto cur_log_writer = logs_.back().writer;
+ IOStatus status = cur_log_writer->WriteBuffer();
+ if (!status.ok()) {
+ ROCKS_LOG_ERROR(immutable_db_options_.info_log, "WAL flush error %s",
+ status.ToString().c_str());
+ // In case there is a fs error we should set it globally to prevent
+ // future writes
+ WriteStatusCheck(status);
+ }
+ return static_cast<Status>(status);
+}
+
+Status DBImpl::UnlockWAL() {
+ log_write_mutex_.Unlock();
+ return Status::OK();
+}
+
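+// Called under log_write_mutex_ after a sync attempt: inactive WALs with a
+// non-zero pre-sync size are reported via `synced_wals` when
+// track_and_verify_wals_in_manifest is set; fully synced inactive WALs are
+// removed from logs_ and their writers queued for deletion, while partially
+// synced and active WALs just have their syncing flag cleared.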
+void DBImpl::MarkLogsSynced(uint64_t up_to, bool synced_dir,
+ VersionEdit* synced_wals) {
+ log_write_mutex_.AssertHeld();
+ if (synced_dir && logfile_number_ == up_to) {
+ log_dir_synced_ = true;
+ }
+ for (auto it = logs_.begin(); it != logs_.end() && it->number <= up_to;) {
+ auto& wal = *it;
+ assert(wal.IsSyncing());
+
+ if (wal.number < logs_.back().number) {
+ // Inactive WAL
+ if (immutable_db_options_.track_and_verify_wals_in_manifest &&
+ wal.GetPreSyncSize() > 0) {
+ synced_wals->AddWal(wal.number, WalMetadata(wal.GetPreSyncSize()));
+ }
+ if (wal.GetPreSyncSize() == wal.writer->file()->GetFlushedSize()) {
+ // Fully synced
+ logs_to_free_.push_back(wal.ReleaseWriter());
+ it = logs_.erase(it);
+ } else {
+ assert(wal.GetPreSyncSize() < wal.writer->file()->GetFlushedSize());
+ wal.FinishSync();
+ ++it;
+ }
+ } else {
+ assert(wal.number == logs_.back().number);
+ // Active WAL
+ wal.FinishSync();
+ ++it;
+ }
+ }
+ log_sync_cv_.SignalAll();
+}
+
+void DBImpl::MarkLogsNotSynced(uint64_t up_to) {
+ log_write_mutex_.AssertHeld();
+ for (auto it = logs_.begin(); it != logs_.end() && it->number <= up_to;
+ ++it) {
+ auto& wal = *it;
+ wal.FinishSync();
+ }
+ log_sync_cv_.SignalAll();
+}
+
+SequenceNumber DBImpl::GetLatestSequenceNumber() const {
+ return versions_->LastSequence();
+}
+
+void DBImpl::SetLastPublishedSequence(SequenceNumber seq) {
+ versions_->SetLastPublishedSequence(seq);
+}
+
+Status DBImpl::GetFullHistoryTsLow(ColumnFamilyHandle* column_family,
+ std::string* ts_low) {
+ if (ts_low == nullptr) {
+ return Status::InvalidArgument("ts_low is nullptr");
+ }
+ ColumnFamilyData* cfd = nullptr;
+ if (column_family == nullptr) {
+ cfd = default_cf_handle_->cfd();
+ } else {
+ auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+ assert(cfh != nullptr);
+ cfd = cfh->cfd();
+ }
+ assert(cfd != nullptr && cfd->user_comparator() != nullptr);
+ if (cfd->user_comparator()->timestamp_size() == 0) {
+ return Status::InvalidArgument(
+ "Timestamp is not enabled in this column family");
+ }
+ InstrumentedMutexLock l(&mutex_);
+ *ts_low = cfd->GetFullHistoryTsLow();
+ assert(cfd->user_comparator()->timestamp_size() == ts_low->size());
+ return Status::OK();
+}
+
+InternalIterator* DBImpl::NewInternalIterator(const ReadOptions& read_options,
+ Arena* arena,
+ SequenceNumber sequence,
+ ColumnFamilyHandle* column_family,
+ bool allow_unprepared_value) {
+ ColumnFamilyData* cfd;
+ if (column_family == nullptr) {
+ cfd = default_cf_handle_->cfd();
+ } else {
+ auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+ cfd = cfh->cfd();
+ }
+
+ mutex_.Lock();
+ SuperVersion* super_version = cfd->GetSuperVersion()->Ref();
+ mutex_.Unlock();
+ return NewInternalIterator(read_options, cfd, super_version, arena, sequence,
+ allow_unprepared_value);
+}
+
+void DBImpl::SchedulePurge() {
+ mutex_.AssertHeld();
+ assert(opened_successfully_);
+
+ // Purge operations are put into the high-priority queue
+ bg_purge_scheduled_++;
+ env_->Schedule(&DBImpl::BGWorkPurge, this, Env::Priority::HIGH, nullptr);
+}
+
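+// Runs on a HIGH-priority background thread: frees queued WAL writers and
+// SuperVersions, then deletes files queued in purge_files_, dropping mutex_
+// around each blocking operation.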
+void DBImpl::BackgroundCallPurge() {
+ mutex_.Lock();
+
+ while (!logs_to_free_queue_.empty()) {
+ assert(!logs_to_free_queue_.empty());
+ log::Writer* log_writer = *(logs_to_free_queue_.begin());
+ logs_to_free_queue_.pop_front();
+ mutex_.Unlock();
+ delete log_writer;
+ mutex_.Lock();
+ }
+ while (!superversions_to_free_queue_.empty()) {
+ assert(!superversions_to_free_queue_.empty());
+ SuperVersion* sv = superversions_to_free_queue_.front();
+ superversions_to_free_queue_.pop_front();
+ mutex_.Unlock();
+ delete sv;
+ mutex_.Lock();
+ }
+
+ assert(bg_purge_scheduled_ > 0);
+
+ // Can't use iterator to go over purge_files_ because inside the loop we're
+ // unlocking the mutex that protects purge_files_.
+ while (!purge_files_.empty()) {
+ auto it = purge_files_.begin();
+ // Need to make a copy of the PurgeFileInfo before unlocking the mutex.
+ PurgeFileInfo purge_file = it->second;
+
+ const std::string& fname = purge_file.fname;
+ const std::string& dir_to_sync = purge_file.dir_to_sync;
+ FileType type = purge_file.type;
+ uint64_t number = purge_file.number;
+ int job_id = purge_file.job_id;
+
+ purge_files_.erase(it);
+
+ mutex_.Unlock();
+ DeleteObsoleteFileImpl(job_id, fname, dir_to_sync, type, number);
+ mutex_.Lock();
+ }
+
+ bg_purge_scheduled_--;
+
+ bg_cv_.SignalAll();
+ // IMPORTANT: there should be no code after calling SignalAll. This call may
+ // signal the DB destructor that it's OK to proceed with destruction. In
+ // that case, all DB variables will be deallocated and referencing them
+ // will cause trouble.
+ mutex_.Unlock();
+}
+
+namespace {
+
+// A `SuperVersionHandle` holds a non-null `SuperVersion*` pointing at a
+// `SuperVersion` referenced once for this object. It also contains the state
+// needed to clean up the `SuperVersion` reference from outside of `DBImpl`
+// using `CleanupSuperVersionHandle()`.
+struct SuperVersionHandle {
+ // `_super_version` must be non-nullptr and `Ref()`'d once as long as the
+ // `SuperVersionHandle` may use it.
+ SuperVersionHandle(DBImpl* _db, InstrumentedMutex* _mu,
+ SuperVersion* _super_version, bool _background_purge)
+ : db(_db),
+ mu(_mu),
+ super_version(_super_version),
+ background_purge(_background_purge) {}
+
+ DBImpl* db;
+ InstrumentedMutex* mu;
+ SuperVersion* super_version;
+ bool background_purge;
+};
+
+static void CleanupSuperVersionHandle(void* arg1, void* /*arg2*/) {
+ SuperVersionHandle* sv_handle = reinterpret_cast<SuperVersionHandle*>(arg1);
+
+ if (sv_handle->super_version->Unref()) {
+ // Job id == 0 means that this is not our background process, but rather
+ // a user thread
+ JobContext job_context(0);
+
+ sv_handle->mu->Lock();
+ sv_handle->super_version->Cleanup();
+ sv_handle->db->FindObsoleteFiles(&job_context, false, true);
+ if (sv_handle->background_purge) {
+ sv_handle->db->ScheduleBgLogWriterClose(&job_context);
+ sv_handle->db->AddSuperVersionsToFreeQueue(sv_handle->super_version);
+ sv_handle->db->SchedulePurge();
+ }
+ sv_handle->mu->Unlock();
+
+ if (!sv_handle->background_purge) {
+ delete sv_handle->super_version;
+ }
+ if (job_context.HaveSomethingToDelete()) {
+ sv_handle->db->PurgeObsoleteFiles(job_context,
+ sv_handle->background_purge);
+ }
+ job_context.Clean();
+ }
+
+ delete sv_handle;
+}
+
+struct GetMergeOperandsState {
+ MergeContext merge_context;
+ PinnedIteratorsManager pinned_iters_mgr;
+ SuperVersionHandle* sv_handle;
+};
+
+static void CleanupGetMergeOperandsState(void* arg1, void* /*arg2*/) {
+ GetMergeOperandsState* state = static_cast<GetMergeOperandsState*>(arg1);
+ CleanupSuperVersionHandle(state->sv_handle /* arg1 */, nullptr /* arg2 */);
+ delete state;
+}
+
+} // namespace
+
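+// Builds the internal merging iterator over the mutable memtable, the
+// immutable memtables and the SST files of `super_version`, wiring in range
+// tombstone iterators unless read_options.ignore_range_deletions is set, and
+// registers a SuperVersionHandle cleanup so the SuperVersion reference is
+// dropped when the iterator is destroyed.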
+InternalIterator* DBImpl::NewInternalIterator(
+ const ReadOptions& read_options, ColumnFamilyData* cfd,
+ SuperVersion* super_version, Arena* arena, SequenceNumber sequence,
+ bool allow_unprepared_value, ArenaWrappedDBIter* db_iter) {
+ InternalIterator* internal_iter;
+ assert(arena != nullptr);
+ // Need to create internal iterator from the arena.
+ MergeIteratorBuilder merge_iter_builder(
+ &cfd->internal_comparator(), arena,
+ !read_options.total_order_seek &&
+ super_version->mutable_cf_options.prefix_extractor != nullptr,
+ read_options.iterate_upper_bound);
+ // Collect iterator for mutable memtable
+ auto mem_iter = super_version->mem->NewIterator(read_options, arena);
+ Status s;
+ if (!read_options.ignore_range_deletions) {
+ TruncatedRangeDelIterator* mem_tombstone_iter = nullptr;
+ auto range_del_iter = super_version->mem->NewRangeTombstoneIterator(
+ read_options, sequence, false /* immutable_memtable */);
+ if (range_del_iter == nullptr || range_del_iter->empty()) {
+ delete range_del_iter;
+ } else {
+ mem_tombstone_iter = new TruncatedRangeDelIterator(
+ std::unique_ptr<FragmentedRangeTombstoneIterator>(range_del_iter),
+ &cfd->ioptions()->internal_comparator, nullptr /* smallest */,
+ nullptr /* largest */);
+ }
+ merge_iter_builder.AddPointAndTombstoneIterator(mem_iter,
+ mem_tombstone_iter);
+ } else {
+ merge_iter_builder.AddIterator(mem_iter);
+ }
+
+ // Collect all needed child iterators for immutable memtables
+ if (s.ok()) {
+ super_version->imm->AddIterators(read_options, &merge_iter_builder,
+ !read_options.ignore_range_deletions);
+ }
+ TEST_SYNC_POINT_CALLBACK("DBImpl::NewInternalIterator:StatusCallback", &s);
+ if (s.ok()) {
+ // Collect iterators for files in L0 - Ln
+ if (read_options.read_tier != kMemtableTier) {
+ super_version->current->AddIterators(read_options, file_options_,
+ &merge_iter_builder,
+ allow_unprepared_value);
+ }
+ internal_iter = merge_iter_builder.Finish(
+ read_options.ignore_range_deletions ? nullptr : db_iter);
+ SuperVersionHandle* cleanup = new SuperVersionHandle(
+ this, &mutex_, super_version,
+ read_options.background_purge_on_iterator_cleanup ||
+ immutable_db_options_.avoid_unnecessary_blocking_io);
+ internal_iter->RegisterCleanup(CleanupSuperVersionHandle, cleanup, nullptr);
+
+ return internal_iter;
+ } else {
+ CleanupSuperVersion(super_version);
+ }
+ return NewErrorInternalIterator<Slice>(s, arena);
+}
+
+ColumnFamilyHandle* DBImpl::DefaultColumnFamily() const {
+ return default_cf_handle_;
+}
+
+ColumnFamilyHandle* DBImpl::PersistentStatsColumnFamily() const {
+ return persist_stats_cf_handle_;
+}
+
+Status DBImpl::Get(const ReadOptions& read_options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ PinnableSlice* value) {
+ return Get(read_options, column_family, key, value, /*timestamp=*/nullptr);
+}
+
+Status DBImpl::Get(const ReadOptions& read_options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ PinnableSlice* value, std::string* timestamp) {
+ assert(value != nullptr);
+ value->Reset();
+ GetImplOptions get_impl_options;
+ get_impl_options.column_family = column_family;
+ get_impl_options.value = value;
+ get_impl_options.timestamp = timestamp;
+ Status s = GetImpl(read_options, key, get_impl_options);
+ return s;
+}
+
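+// Illustrative caller-side usage (added note, not part of this change):
+//   PinnableWideColumns columns;
+//   Status s = db->GetEntity(ReadOptions(), cfh, "key", &columns);
+//   if (s.ok()) { /* columns now holds the entity's wide columns */ }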
+Status DBImpl::GetEntity(const ReadOptions& read_options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ PinnableWideColumns* columns) {
+ if (!column_family) {
+ return Status::InvalidArgument(
+ "Cannot call GetEntity without a column family handle");
+ }
+
+ if (!columns) {
+ return Status::InvalidArgument(
+ "Cannot call GetEntity without a PinnableWideColumns object");
+ }
+
+ columns->Reset();
+
+ GetImplOptions get_impl_options;
+ get_impl_options.column_family = column_family;
+ get_impl_options.columns = columns;
+
+ return GetImpl(read_options, key, get_impl_options);
+}
+
+bool DBImpl::ShouldReferenceSuperVersion(const MergeContext& merge_context) {
+ // If both thresholds are reached, a function returning merge operands as
+ // `PinnableSlice`s should reference the `SuperVersion` to avoid large and/or
+ // numerous `memcpy()`s.
+ //
+ // The below constants enable the optimization conservatively. They are
+ // verified to not regress `GetMergeOperands()` latency in the following
+ // scenarios.
+ //
+ // - CPU: two socket Intel(R) Xeon(R) Gold 6138 CPU @ 2.00GHz
+ // - `GetMergeOperands()` threads: 1 - 32
+ // - Entry size: 32 bytes - 4KB
+ // - Merges per key: 1 - 16K
+ // - LSM component: memtable
+ //
+ // TODO(ajkr): expand measurement to SST files.
+ static const size_t kNumBytesForSvRef = 32768;
+ static const size_t kLog2AvgBytesForSvRef = 8; // 256 bytes
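+ // Illustrative numbers (derived from the constants above, not measured):
+ // 128 operands of 512 bytes total 64 KiB and average 512 bytes, so the
+ // SuperVersion is referenced; 1024 operands of 32 bytes meet the 32 KiB
+ // total but average only 32 bytes, so they are copied instead.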
+
+ size_t num_bytes = 0;
+ for (const Slice& sl : merge_context.GetOperands()) {
+ num_bytes += sl.size();
+ }
+ return num_bytes >= kNumBytesForSvRef &&
+ (num_bytes >> kLog2AvgBytesForSvRef) >=
+ merge_context.GetOperands().size();
+}
+
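+// Shared implementation behind Get(), GetEntity() and GetMergeOperands():
+// looks the key up in the mutable memtable, then the immutable memtables,
+// and finally the SST files of the current version, under a referenced
+// SuperVersion and a consistent snapshot sequence number.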
+Status DBImpl::GetImpl(const ReadOptions& read_options, const Slice& key,
+ GetImplOptions& get_impl_options) {
+ assert(get_impl_options.value != nullptr ||
+ get_impl_options.merge_operands != nullptr ||
+ get_impl_options.columns != nullptr);
+
+ assert(get_impl_options.column_family);
+
+ if (read_options.timestamp) {
+ const Status s = FailIfTsMismatchCf(get_impl_options.column_family,
+ *(read_options.timestamp),
+ /*ts_for_read=*/true);
+ if (!s.ok()) {
+ return s;
+ }
+ } else {
+ const Status s = FailIfCfHasTs(get_impl_options.column_family);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ // Clear the timestamps for returning results so that we can distinguish
+ // between a tombstone and a key that has never been written
+ if (get_impl_options.timestamp) {
+ get_impl_options.timestamp->clear();
+ }
+
+ GetWithTimestampReadCallback read_cb(0); // Will call Refresh
+
+ PERF_CPU_TIMER_GUARD(get_cpu_nanos, immutable_db_options_.clock);
+ StopWatch sw(immutable_db_options_.clock, stats_, DB_GET);
+ PERF_TIMER_GUARD(get_snapshot_time);
+
+ auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(
+ get_impl_options.column_family);
+ auto cfd = cfh->cfd();
+
+ if (tracer_) {
+ // TODO: This mutex should be removed later, to improve performance when
+ // tracing is enabled.
+ InstrumentedMutexLock lock(&trace_mutex_);
+ if (tracer_) {
+ // TODO: maybe handle the tracing status?
+ tracer_->Get(get_impl_options.column_family, key).PermitUncheckedError();
+ }
+ }
+
+ if (get_impl_options.get_merge_operands_options != nullptr) {
+ for (int i = 0; i < get_impl_options.get_merge_operands_options
+ ->expected_max_number_of_operands;
+ ++i) {
+ get_impl_options.merge_operands[i].Reset();
+ }
+ }
+
+ // Acquire SuperVersion
+ SuperVersion* sv = GetAndRefSuperVersion(cfd);
+
+ TEST_SYNC_POINT("DBImpl::GetImpl:1");
+ TEST_SYNC_POINT("DBImpl::GetImpl:2");
+
+ SequenceNumber snapshot;
+ if (read_options.snapshot != nullptr) {
+ if (get_impl_options.callback) {
+ // Already calculated based on read_options.snapshot
+ snapshot = get_impl_options.callback->max_visible_seq();
+ } else {
+ snapshot =
+ reinterpret_cast<const SnapshotImpl*>(read_options.snapshot)->number_;
+ }
+ } else {
+ // Note that the snapshot is assigned AFTER referencing the super
+ // version because otherwise a flush happening in between may compact away
+ // data for the snapshot, so the reader would see neither data that was
+ // visible to the snapshot before compaction nor the newer data inserted
+ // afterwards.
+ snapshot = GetLastPublishedSequence();
+ if (get_impl_options.callback) {
+ // The unprep_seqs are not published for write unprepared, so it could be
+ // that max_visible_seq is larger. Seek to the std::max of the two.
+ // However, we still want our callback to contain the actual snapshot so
+ // that it can do the correct visibility filtering.
+ get_impl_options.callback->Refresh(snapshot);
+
+ // Internally, WriteUnpreparedTxnReadCallback::Refresh would set
+ // max_visible_seq = max(max_visible_seq, snapshot)
+ //
+ // Currently, the commented out assert is broken by
+ // InvalidSnapshotReadCallback, but if write unprepared recovery followed
+ // the regular transaction flow, then this special read callback would not
+ // be needed.
+ //
+ // assert(callback->max_visible_seq() >= snapshot);
+ snapshot = get_impl_options.callback->max_visible_seq();
+ }
+ }
+ // If timestamp is used, we use read callback to ensure <key,t,s> is returned
+ // only if t <= read_opts.timestamp and s <= snapshot.
+ // HACK: temporarily overwrite input struct field but restore
+ SaveAndRestore<ReadCallback*> restore_callback(&get_impl_options.callback);
+ const Comparator* ucmp = get_impl_options.column_family->GetComparator();
+ assert(ucmp);
+ if (ucmp->timestamp_size() > 0) {
+ assert(!get_impl_options
+ .callback); // timestamp with callback is not supported
+ read_cb.Refresh(snapshot);
+ get_impl_options.callback = &read_cb;
+ }
+ TEST_SYNC_POINT("DBImpl::GetImpl:3");
+ TEST_SYNC_POINT("DBImpl::GetImpl:4");
+
+ // Prepare to store a list of merge operations if merge occurs.
+ MergeContext merge_context;
+ SequenceNumber max_covering_tombstone_seq = 0;
+
+ Status s;
+ // First look in the memtable, then in the immutable memtable (if any).
+ // s is both in/out. When in, s could either be OK or MergeInProgress.
+ // merge_operands will contain the sequence of merges in the latter case.
+ LookupKey lkey(key, snapshot, read_options.timestamp);
+ PERF_TIMER_STOP(get_snapshot_time);
+
+ bool skip_memtable = (read_options.read_tier == kPersistedTier &&
+ has_unpersisted_data_.load(std::memory_order_relaxed));
+ bool done = false;
+ std::string* timestamp =
+ ucmp->timestamp_size() > 0 ? get_impl_options.timestamp : nullptr;
+ if (!skip_memtable) {
+ // Get value associated with key
+ if (get_impl_options.get_value) {
+ if (sv->mem->Get(
+ lkey,
+ get_impl_options.value ? get_impl_options.value->GetSelf()
+ : nullptr,
+ get_impl_options.columns, timestamp, &s, &merge_context,
+ &max_covering_tombstone_seq, read_options,
+ false /* immutable_memtable */, get_impl_options.callback,
+ get_impl_options.is_blob_index)) {
+ done = true;
+
+ if (get_impl_options.value) {
+ get_impl_options.value->PinSelf();
+ }
+
+ RecordTick(stats_, MEMTABLE_HIT);
+ } else if ((s.ok() || s.IsMergeInProgress()) &&
+ sv->imm->Get(lkey,
+ get_impl_options.value
+ ? get_impl_options.value->GetSelf()
+ : nullptr,
+ get_impl_options.columns, timestamp, &s,
+ &merge_context, &max_covering_tombstone_seq,
+ read_options, get_impl_options.callback,
+ get_impl_options.is_blob_index)) {
+ done = true;
+
+ if (get_impl_options.value) {
+ get_impl_options.value->PinSelf();
+ }
+
+ RecordTick(stats_, MEMTABLE_HIT);
+ }
+ } else {
+ // Get the merge operands associated with the key. Merge operands should
+ // not be merged; the raw values should be returned to the user.
+ if (sv->mem->Get(lkey, /*value=*/nullptr, /*columns=*/nullptr,
+ /*timestamp=*/nullptr, &s, &merge_context,
+ &max_covering_tombstone_seq, read_options,
+ false /* immutable_memtable */, nullptr, nullptr,
+ false)) {
+ done = true;
+ RecordTick(stats_, MEMTABLE_HIT);
+ } else if ((s.ok() || s.IsMergeInProgress()) &&
+ sv->imm->GetMergeOperands(lkey, &s, &merge_context,
+ &max_covering_tombstone_seq,
+ read_options)) {
+ done = true;
+ RecordTick(stats_, MEMTABLE_HIT);
+ }
+ }
+ if (!done && !s.ok() && !s.IsMergeInProgress()) {
+ ReturnAndCleanupSuperVersion(cfd, sv);
+ return s;
+ }
+ }
+ TEST_SYNC_POINT("DBImpl::GetImpl:PostMemTableGet:0");
+ TEST_SYNC_POINT("DBImpl::GetImpl:PostMemTableGet:1");
+ PinnedIteratorsManager pinned_iters_mgr;
+ if (!done) {
+ PERF_TIMER_GUARD(get_from_output_files_time);
+ sv->current->Get(
+ read_options, lkey, get_impl_options.value, get_impl_options.columns,
+ timestamp, &s, &merge_context, &max_covering_tombstone_seq,
+ &pinned_iters_mgr,
+ get_impl_options.get_value ? get_impl_options.value_found : nullptr,
+ nullptr, nullptr,
+ get_impl_options.get_value ? get_impl_options.callback : nullptr,
+ get_impl_options.get_value ? get_impl_options.is_blob_index : nullptr,
+ get_impl_options.get_value);
+ RecordTick(stats_, MEMTABLE_MISS);
+ }
+
+ {
+ PERF_TIMER_GUARD(get_post_process_time);
+
+ RecordTick(stats_, NUMBER_KEYS_READ);
+ size_t size = 0;
+ if (s.ok()) {
+ if (get_impl_options.get_value) {
+ if (get_impl_options.value) {
+ size = get_impl_options.value->size();
+ } else if (get_impl_options.columns) {
+ size = get_impl_options.columns->serialized_size();
+ }
+ } else {
+ // Return all merge operands for get_impl_options.key
+ *get_impl_options.number_of_operands =
+ static_cast<int>(merge_context.GetNumOperands());
+ if (*get_impl_options.number_of_operands >
+ get_impl_options.get_merge_operands_options
+ ->expected_max_number_of_operands) {
+ s = Status::Incomplete(
+ Status::SubCode::KMergeOperandsInsufficientCapacity);
+ } else {
+ // Each operand depends on one of the following resources: `sv`,
+ // `pinned_iters_mgr`, or `merge_context`. It would be crazy expensive
+ // to reference `sv` for each operand relying on it because `sv` is
+ // (un)ref'd in all threads using the DB. Furthermore, we do not track
+ // on which resource each operand depends.
+ //
+ // To solve this, we bundle the resources in a `GetMergeOperandsState`
+ // and manage them with a `SharedCleanablePtr` shared among the
+ // `PinnableSlice`s we return. This bundle includes one `sv` reference
+ // and ownership of the `merge_context` and `pinned_iters_mgr`
+ // objects.
+ bool ref_sv = ShouldReferenceSuperVersion(merge_context);
+ if (ref_sv) {
+ assert(!merge_context.GetOperands().empty());
+ SharedCleanablePtr shared_cleanable;
+ GetMergeOperandsState* state = nullptr;
+ state = new GetMergeOperandsState();
+ state->merge_context = std::move(merge_context);
+ state->pinned_iters_mgr = std::move(pinned_iters_mgr);
+
+ sv->Ref();
+
+ state->sv_handle = new SuperVersionHandle(
+ this, &mutex_, sv,
+ immutable_db_options_.avoid_unnecessary_blocking_io);
+
+ shared_cleanable.Allocate();
+ shared_cleanable->RegisterCleanup(CleanupGetMergeOperandsState,
+ state /* arg1 */,
+ nullptr /* arg2 */);
+ for (size_t i = 0; i < state->merge_context.GetOperands().size();
+ ++i) {
+ const Slice& sl = state->merge_context.GetOperands()[i];
+ size += sl.size();
+
+ get_impl_options.merge_operands->PinSlice(
+ sl, nullptr /* cleanable */);
+ if (i == state->merge_context.GetOperands().size() - 1) {
+ shared_cleanable.MoveAsCleanupTo(
+ get_impl_options.merge_operands);
+ } else {
+ shared_cleanable.RegisterCopyWith(
+ get_impl_options.merge_operands);
+ }
+ get_impl_options.merge_operands++;
+ }
+ } else {
+ for (const Slice& sl : merge_context.GetOperands()) {
+ size += sl.size();
+ get_impl_options.merge_operands->PinSelf(sl);
+ get_impl_options.merge_operands++;
+ }
+ }
+ }
+ }
+ RecordTick(stats_, BYTES_READ, size);
+ PERF_COUNTER_ADD(get_read_bytes, size);
+ }
+
+ ReturnAndCleanupSuperVersion(cfd, sv);
+
+ RecordInHistogram(stats_, BYTES_PER_READ, size);
+ }
+ return s;
+}
+
+std::vector<Status> DBImpl::MultiGet(
+ const ReadOptions& read_options,
+ const std::vector<ColumnFamilyHandle*>& column_family,
+ const std::vector<Slice>& keys, std::vector<std::string>* values) {
+ return MultiGet(read_options, column_family, keys, values,
+ /*timestamps=*/nullptr);
+}
+
+std::vector<Status> DBImpl::MultiGet(
+ const ReadOptions& read_options,
+ const std::vector<ColumnFamilyHandle*>& column_family,
+ const std::vector<Slice>& keys, std::vector<std::string>* values,
+ std::vector<std::string>* timestamps) {
+ PERF_CPU_TIMER_GUARD(get_cpu_nanos, immutable_db_options_.clock);
+ StopWatch sw(immutable_db_options_.clock, stats_, DB_MULTIGET);
+ PERF_TIMER_GUARD(get_snapshot_time);
+
+ size_t num_keys = keys.size();
+ assert(column_family.size() == num_keys);
+ std::vector<Status> stat_list(num_keys);
+
+ bool should_fail = false;
+ for (size_t i = 0; i < num_keys; ++i) {
+ assert(column_family[i]);
+ if (read_options.timestamp) {
+ stat_list[i] = FailIfTsMismatchCf(
+ column_family[i], *(read_options.timestamp), /*ts_for_read=*/true);
+ if (!stat_list[i].ok()) {
+ should_fail = true;
+ }
+ } else {
+ stat_list[i] = FailIfCfHasTs(column_family[i]);
+ if (!stat_list[i].ok()) {
+ should_fail = true;
+ }
+ }
+ }
+
+ if (should_fail) {
+ for (auto& s : stat_list) {
+ if (s.ok()) {
+ s = Status::Incomplete(
+ "DB not queried due to invalid argument(s) in the same MultiGet");
+ }
+ }
+ return stat_list;
+ }
+
+ if (tracer_) {
+ // TODO: This mutex should be removed later, to improve performance when
+ // tracing is enabled.
+ InstrumentedMutexLock lock(&trace_mutex_);
+ if (tracer_) {
+ // TODO: maybe handle the tracing status?
+ tracer_->MultiGet(column_family, keys).PermitUncheckedError();
+ }
+ }
+
+ SequenceNumber consistent_seqnum;
+
+ UnorderedMap<uint32_t, MultiGetColumnFamilyData> multiget_cf_data(
+ column_family.size());
+ for (auto cf : column_family) {
+ auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(cf);
+ auto cfd = cfh->cfd();
+ if (multiget_cf_data.find(cfd->GetID()) == multiget_cf_data.end()) {
+ multiget_cf_data.emplace(cfd->GetID(),
+ MultiGetColumnFamilyData(cfh, nullptr));
+ }
+ }
+
+ std::function<MultiGetColumnFamilyData*(
+ UnorderedMap<uint32_t, MultiGetColumnFamilyData>::iterator&)>
+ iter_deref_lambda =
+ [](UnorderedMap<uint32_t, MultiGetColumnFamilyData>::iterator&
+ cf_iter) { return &cf_iter->second; };
+
+ bool unref_only =
+ MultiCFSnapshot<UnorderedMap<uint32_t, MultiGetColumnFamilyData>>(
+ read_options, nullptr, iter_deref_lambda, &multiget_cf_data,
+ &consistent_seqnum);
+
+ TEST_SYNC_POINT("DBImpl::MultiGet:AfterGetSeqNum1");
+ TEST_SYNC_POINT("DBImpl::MultiGet:AfterGetSeqNum2");
+
+ // Contains a list of merge operations if merge occurs.
+ MergeContext merge_context;
+
+ // Note: this always resizes the values array
+ values->resize(num_keys);
+ if (timestamps) {
+ timestamps->resize(num_keys);
+ }
+
+ // Keep track of bytes that we read for statistics-recording later
+ uint64_t bytes_read = 0;
+ PERF_TIMER_STOP(get_snapshot_time);
+
+ // For each of the given keys, apply the entire "get" process as follows:
+ // First look in the memtable, then in the immutable memtable (if any).
+ // s is both in/out. When in, s could either be OK or MergeInProgress.
+ // merge_operands will contain the sequence of merges in the latter case.
+ size_t num_found = 0;
+ size_t keys_read;
+ uint64_t curr_value_size = 0;
+
+ GetWithTimestampReadCallback timestamp_read_callback(0);
+ ReadCallback* read_callback = nullptr;
+ if (read_options.timestamp && read_options.timestamp->size() > 0) {
+ timestamp_read_callback.Refresh(consistent_seqnum);
+ read_callback = &timestamp_read_callback;
+ }
+
+ for (keys_read = 0; keys_read < num_keys; ++keys_read) {
+ merge_context.Clear();
+ Status& s = stat_list[keys_read];
+ std::string* value = &(*values)[keys_read];
+ std::string* timestamp = timestamps ? &(*timestamps)[keys_read] : nullptr;
+
+ LookupKey lkey(keys[keys_read], consistent_seqnum, read_options.timestamp);
+ auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(
+ column_family[keys_read]);
+ SequenceNumber max_covering_tombstone_seq = 0;
+ auto mgd_iter = multiget_cf_data.find(cfh->cfd()->GetID());
+ assert(mgd_iter != multiget_cf_data.end());
+ auto mgd = mgd_iter->second;
+ auto super_version = mgd.super_version;
+ bool skip_memtable =
+ (read_options.read_tier == kPersistedTier &&
+ has_unpersisted_data_.load(std::memory_order_relaxed));
+ bool done = false;
+ if (!skip_memtable) {
+ if (super_version->mem->Get(
+ lkey, value, /*columns=*/nullptr, timestamp, &s, &merge_context,
+ &max_covering_tombstone_seq, read_options,
+ false /* immutable_memtable */, read_callback)) {
+ done = true;
+ RecordTick(stats_, MEMTABLE_HIT);
+ } else if (super_version->imm->Get(lkey, value, /*columns=*/nullptr,
+ timestamp, &s, &merge_context,
+ &max_covering_tombstone_seq,
+ read_options, read_callback)) {
+ done = true;
+ RecordTick(stats_, MEMTABLE_HIT);
+ }
+ }
+ if (!done) {
+ PinnableSlice pinnable_val;
+ PERF_TIMER_GUARD(get_from_output_files_time);
+ PinnedIteratorsManager pinned_iters_mgr;
+ super_version->current->Get(read_options, lkey, &pinnable_val,
+ /*columns=*/nullptr, timestamp, &s,
+ &merge_context, &max_covering_tombstone_seq,
+ &pinned_iters_mgr, /*value_found=*/nullptr,
+ /*key_exists=*/nullptr,
+ /*seq=*/nullptr, read_callback);
+ value->assign(pinnable_val.data(), pinnable_val.size());
+ RecordTick(stats_, MEMTABLE_MISS);
+ }
+
+ if (s.ok()) {
+ bytes_read += value->size();
+ num_found++;
+ curr_value_size += value->size();
+ if (curr_value_size > read_options.value_size_soft_limit) {
+ while (++keys_read < num_keys) {
+ stat_list[keys_read] = Status::Aborted();
+ }
+ break;
+ }
+ }
+ if (read_options.deadline.count() &&
+ immutable_db_options_.clock->NowMicros() >
+ static_cast<uint64_t>(read_options.deadline.count())) {
+ break;
+ }
+ }
+
+ if (keys_read < num_keys) {
+ // The only reason to break out of the loop is when the deadline is
+ // exceeded
+ assert(immutable_db_options_.clock->NowMicros() >
+ static_cast<uint64_t>(read_options.deadline.count()));
+ for (++keys_read; keys_read < num_keys; ++keys_read) {
+ stat_list[keys_read] = Status::TimedOut();
+ }
+ }
+
+ // Post processing (decrement reference counts and record statistics)
+ PERF_TIMER_GUARD(get_post_process_time);
+ autovector<SuperVersion*> superversions_to_delete;
+
+ for (auto mgd_iter : multiget_cf_data) {
+ auto mgd = mgd_iter.second;
+ if (!unref_only) {
+ ReturnAndCleanupSuperVersion(mgd.cfd, mgd.super_version);
+ } else {
+ mgd.cfd->GetSuperVersion()->Unref();
+ }
+ }
+ RecordTick(stats_, NUMBER_MULTIGET_CALLS);
+ RecordTick(stats_, NUMBER_MULTIGET_KEYS_READ, num_keys);
+ RecordTick(stats_, NUMBER_MULTIGET_KEYS_FOUND, num_found);
+ RecordTick(stats_, NUMBER_MULTIGET_BYTES_READ, bytes_read);
+ RecordInHistogram(stats_, BYTES_PER_MULTIGET, bytes_read);
+ PERF_COUNTER_ADD(multiget_read_bytes, bytes_read);
+ PERF_TIMER_STOP(get_post_process_time);
+
+ return stat_list;
+}
+
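+// Acquires a SuperVersion reference and a consistent sequence number for
+// every column family in cf_list. Returns true iff the references were taken
+// directly under the DB mutex on the final retry; in that case the caller
+// must release them with SuperVersion::Unref() instead of
+// ReturnAndCleanupSuperVersion().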
+template <class T>
+bool DBImpl::MultiCFSnapshot(
+ const ReadOptions& read_options, ReadCallback* callback,
+ std::function<MultiGetColumnFamilyData*(typename T::iterator&)>&
+ iter_deref_func,
+ T* cf_list, SequenceNumber* snapshot) {
+ PERF_TIMER_GUARD(get_snapshot_time);
+
+ bool last_try = false;
+ if (cf_list->size() == 1) {
+ // Fast path for a single column family. We can simply get the thread local
+ // super version
+ auto cf_iter = cf_list->begin();
+ auto node = iter_deref_func(cf_iter);
+ node->super_version = GetAndRefSuperVersion(node->cfd);
+ if (read_options.snapshot != nullptr) {
+ // Note: In WritePrepared txns this is not necessary but not harmful
+ // either. Because prep_seq > snapshot => commit_seq > snapshot so if
+ // a snapshot is specified we should be fine with skipping seq numbers
+ // that are greater than that.
+ //
+ // In WriteUnprepared, we cannot set snapshot in the lookup key because we
+ // may skip uncommitted data that should be visible to the transaction for
+ // reading own writes.
+ *snapshot =
+ static_cast<const SnapshotImpl*>(read_options.snapshot)->number_;
+ if (callback) {
+ *snapshot = std::max(*snapshot, callback->max_visible_seq());
+ }
+ } else {
+ // Since we get and reference the super version before getting
+ // the snapshot number, without a mutex protection, it is possible
+ // that a memtable switch happened in the middle and not all the
+ // data for this snapshot is available. But it will contain all
+ // the data available in the super version we have, which is also
+ // a valid snapshot to read from.
+ // We shouldn't get snapshot before finding and referencing the super
+ // version because a flush happening in between may compact away data for
+ // the snapshot, but the snapshot is earlier than the data overwriting it,
+ // so users may see wrong results.
+ *snapshot = GetLastPublishedSequence();
+ }
+ } else {
+ // If we end up with the same issue of the memtable getting sealed during 2
+ // consecutive retries, it means the write rate is very high. In that case
+ // it's probably ok to take the mutex on the 3rd try so we can succeed for
+ // sure
+ constexpr int num_retries = 3;
+ for (int i = 0; i < num_retries; ++i) {
+ last_try = (i == num_retries - 1);
+ bool retry = false;
+
+ if (i > 0) {
+ for (auto cf_iter = cf_list->begin(); cf_iter != cf_list->end();
+ ++cf_iter) {
+ auto node = iter_deref_func(cf_iter);
+ SuperVersion* super_version = node->super_version;
+ ColumnFamilyData* cfd = node->cfd;
+ if (super_version != nullptr) {
+ ReturnAndCleanupSuperVersion(cfd, super_version);
+ }
+ node->super_version = nullptr;
+ }
+ }
+ if (read_options.snapshot == nullptr) {
+ if (last_try) {
+ TEST_SYNC_POINT("DBImpl::MultiGet::LastTry");
+ // We're close to max number of retries. For the last retry,
+ // acquire the lock so we're sure to succeed
+ mutex_.Lock();
+ }
+ *snapshot = GetLastPublishedSequence();
+ } else {
+ *snapshot =
+ static_cast_with_check<const SnapshotImpl>(read_options.snapshot)
+ ->number_;
+ }
+ for (auto cf_iter = cf_list->begin(); cf_iter != cf_list->end();
+ ++cf_iter) {
+ auto node = iter_deref_func(cf_iter);
+ if (!last_try) {
+ node->super_version = GetAndRefSuperVersion(node->cfd);
+ } else {
+ node->super_version = node->cfd->GetSuperVersion()->Ref();
+ }
+ TEST_SYNC_POINT("DBImpl::MultiGet::AfterRefSV");
+ if (read_options.snapshot != nullptr || last_try) {
+ // If user passed a snapshot, then we don't care if a memtable is
+ // sealed or compaction happens because the snapshot would ensure
+ // that older key versions are kept around. If this is the last
+ // retry, then we have the lock so nothing bad can happen
+ continue;
+ }
+ // We could get the earliest sequence number for the whole list of
+ // memtables, which will include immutable memtables as well, but that
+ // might be tricky to maintain in case we decide, in future, to do
+ // memtable compaction.
+ if (!last_try) {
+ SequenceNumber seq =
+ node->super_version->mem->GetEarliestSequenceNumber();
+ if (seq > *snapshot) {
+ retry = true;
+ break;
+ }
+ }
+ }
+ if (!retry) {
+ if (last_try) {
+ mutex_.Unlock();
+ }
+ break;
+ }
+ }
+ }
+
+ PERF_TIMER_STOP(get_snapshot_time);
+
+ return last_try;
+}
+
+void DBImpl::MultiGet(const ReadOptions& read_options, const size_t num_keys,
+ ColumnFamilyHandle** column_families, const Slice* keys,
+ PinnableSlice* values, Status* statuses,
+ const bool sorted_input) {
+ return MultiGet(read_options, num_keys, column_families, keys, values,
+ /*timestamps=*/nullptr, statuses, sorted_input);
+}
+
+void DBImpl::MultiGet(const ReadOptions& read_options, const size_t num_keys,
+ ColumnFamilyHandle** column_families, const Slice* keys,
+ PinnableSlice* values, std::string* timestamps,
+ Status* statuses, const bool sorted_input) {
+ if (num_keys == 0) {
+ return;
+ }
+
+ bool should_fail = false;
+ for (size_t i = 0; i < num_keys; ++i) {
+ ColumnFamilyHandle* cfh = column_families[i];
+ assert(cfh);
+ if (read_options.timestamp) {
+ statuses[i] = FailIfTsMismatchCf(cfh, *(read_options.timestamp),
+ /*ts_for_read=*/true);
+ if (!statuses[i].ok()) {
+ should_fail = true;
+ }
+ } else {
+ statuses[i] = FailIfCfHasTs(cfh);
+ if (!statuses[i].ok()) {
+ should_fail = true;
+ }
+ }
+ }
+ if (should_fail) {
+ for (size_t i = 0; i < num_keys; ++i) {
+ if (statuses[i].ok()) {
+ statuses[i] = Status::Incomplete(
+ "DB not queried due to invalid argument(s) in the same MultiGet");
+ }
+ }
+ return;
+ }
+
+ if (tracer_) {
+ // TODO: This mutex should be removed later, to improve performance when
+ // tracing is enabled.
+ InstrumentedMutexLock lock(&trace_mutex_);
+ if (tracer_) {
+ // TODO: maybe handle the tracing status?
+ tracer_->MultiGet(num_keys, column_families, keys).PermitUncheckedError();
+ }
+ }
+
+ autovector<KeyContext, MultiGetContext::MAX_BATCH_SIZE> key_context;
+ autovector<KeyContext*, MultiGetContext::MAX_BATCH_SIZE> sorted_keys;
+ sorted_keys.resize(num_keys);
+ for (size_t i = 0; i < num_keys; ++i) {
+ values[i].Reset();
+ key_context.emplace_back(column_families[i], keys[i], &values[i],
+ timestamps ? &timestamps[i] : nullptr,
+ &statuses[i]);
+ }
+ for (size_t i = 0; i < num_keys; ++i) {
+ sorted_keys[i] = &key_context[i];
+ }
+ PrepareMultiGetKeys(num_keys, sorted_input, &sorted_keys);
+
+ autovector<MultiGetColumnFamilyData, MultiGetContext::MAX_BATCH_SIZE>
+ multiget_cf_data;
+ size_t cf_start = 0;
+ ColumnFamilyHandle* cf = sorted_keys[0]->column_family;
+
+ for (size_t i = 0; i < num_keys; ++i) {
+ KeyContext* key_ctx = sorted_keys[i];
+ if (key_ctx->column_family != cf) {
+ multiget_cf_data.emplace_back(cf, cf_start, i - cf_start, nullptr);
+ cf_start = i;
+ cf = key_ctx->column_family;
+ }
+ }
+
+ multiget_cf_data.emplace_back(cf, cf_start, num_keys - cf_start, nullptr);
+
+ std::function<MultiGetColumnFamilyData*(
+ autovector<MultiGetColumnFamilyData,
+ MultiGetContext::MAX_BATCH_SIZE>::iterator&)>
+ iter_deref_lambda =
+ [](autovector<MultiGetColumnFamilyData,
+ MultiGetContext::MAX_BATCH_SIZE>::iterator& cf_iter) {
+ return &(*cf_iter);
+ };
+
+ SequenceNumber consistent_seqnum;
+ bool unref_only = MultiCFSnapshot<
+ autovector<MultiGetColumnFamilyData, MultiGetContext::MAX_BATCH_SIZE>>(
+ read_options, nullptr, iter_deref_lambda, &multiget_cf_data,
+ &consistent_seqnum);
+
+ GetWithTimestampReadCallback timestamp_read_callback(0);
+ ReadCallback* read_callback = nullptr;
+ if (read_options.timestamp && read_options.timestamp->size() > 0) {
+ timestamp_read_callback.Refresh(consistent_seqnum);
+ read_callback = &timestamp_read_callback;
+ }
+
+ Status s;
+ auto cf_iter = multiget_cf_data.begin();
+ for (; cf_iter != multiget_cf_data.end(); ++cf_iter) {
+ s = MultiGetImpl(read_options, cf_iter->start, cf_iter->num_keys,
+ &sorted_keys, cf_iter->super_version, consistent_seqnum,
+ read_callback);
+ if (!s.ok()) {
+ break;
+ }
+ }
+ if (!s.ok()) {
+ assert(s.IsTimedOut() || s.IsAborted());
+ for (++cf_iter; cf_iter != multiget_cf_data.end(); ++cf_iter) {
+ for (size_t i = cf_iter->start; i < cf_iter->start + cf_iter->num_keys;
+ ++i) {
+ *sorted_keys[i]->s = s;
+ }
+ }
+ }
+
+ for (const auto& iter : multiget_cf_data) {
+ if (!unref_only) {
+ ReturnAndCleanupSuperVersion(iter.cfd, iter.super_version);
+ } else {
+ iter.cfd->GetSuperVersion()->Unref();
+ }
+ }
+}
+
+namespace {
+// Order keys by CF ID, followed by key contents
+struct CompareKeyContext {
+ inline bool operator()(const KeyContext* lhs, const KeyContext* rhs) {
+ ColumnFamilyHandleImpl* cfh =
+ static_cast<ColumnFamilyHandleImpl*>(lhs->column_family);
+ uint32_t cfd_id1 = cfh->cfd()->GetID();
+ const Comparator* comparator = cfh->cfd()->user_comparator();
+ cfh = static_cast<ColumnFamilyHandleImpl*>(rhs->column_family);
+ uint32_t cfd_id2 = cfh->cfd()->GetID();
+
+ if (cfd_id1 < cfd_id2) {
+ return true;
+ } else if (cfd_id1 > cfd_id2) {
+ return false;
+ }
+
+ // Both keys are from the same column family
+ int cmp = comparator->CompareWithoutTimestamp(
+ *(lhs->key), /*a_has_ts=*/false, *(rhs->key), /*b_has_ts=*/false);
+ if (cmp < 0) {
+ return true;
+ }
+ return false;
+ }
+};
+
+} // anonymous namespace
+
+void DBImpl::PrepareMultiGetKeys(
+ size_t num_keys, bool sorted_input,
+ autovector<KeyContext*, MultiGetContext::MAX_BATCH_SIZE>* sorted_keys) {
+ if (sorted_input) {
+#ifndef NDEBUG
+ assert(std::is_sorted(sorted_keys->begin(), sorted_keys->end(),
+ CompareKeyContext()));
+#endif
+ return;
+ }
+
+ std::sort(sorted_keys->begin(), sorted_keys->begin() + num_keys,
+ CompareKeyContext());
+}
+
+void DBImpl::MultiGet(const ReadOptions& read_options,
+ ColumnFamilyHandle* column_family, const size_t num_keys,
+ const Slice* keys, PinnableSlice* values,
+ Status* statuses, const bool sorted_input) {
+ return MultiGet(read_options, column_family, num_keys, keys, values,
+ /*timestamp=*/nullptr, statuses, sorted_input);
+}
+
+void DBImpl::MultiGet(const ReadOptions& read_options,
+ ColumnFamilyHandle* column_family, const size_t num_keys,
+ const Slice* keys, PinnableSlice* values,
+ std::string* timestamps, Status* statuses,
+ const bool sorted_input) {
+ if (tracer_) {
+ // TODO: This mutex should be removed later, to improve performance when
+ // tracing is enabled.
+ InstrumentedMutexLock lock(&trace_mutex_);
+ if (tracer_) {
+ // TODO: maybe handle the tracing status?
+ tracer_->MultiGet(num_keys, column_family, keys).PermitUncheckedError();
+ }
+ }
+ autovector<KeyContext, MultiGetContext::MAX_BATCH_SIZE> key_context;
+ autovector<KeyContext*, MultiGetContext::MAX_BATCH_SIZE> sorted_keys;
+ sorted_keys.resize(num_keys);
+ for (size_t i = 0; i < num_keys; ++i) {
+ values[i].Reset();
+ key_context.emplace_back(column_family, keys[i], &values[i],
+ timestamps ? &timestamps[i] : nullptr,
+ &statuses[i]);
+ }
+ for (size_t i = 0; i < num_keys; ++i) {
+ sorted_keys[i] = &key_context[i];
+ }
+ PrepareMultiGetKeys(num_keys, sorted_input, &sorted_keys);
+ MultiGetWithCallback(read_options, column_family, nullptr, &sorted_keys);
+}
+
+void DBImpl::MultiGetWithCallback(
+ const ReadOptions& read_options, ColumnFamilyHandle* column_family,
+ ReadCallback* callback,
+ autovector<KeyContext*, MultiGetContext::MAX_BATCH_SIZE>* sorted_keys) {
+ std::array<MultiGetColumnFamilyData, 1> multiget_cf_data;
+ multiget_cf_data[0] = MultiGetColumnFamilyData(column_family, nullptr);
+ std::function<MultiGetColumnFamilyData*(
+ std::array<MultiGetColumnFamilyData, 1>::iterator&)>
+ iter_deref_lambda =
+ [](std::array<MultiGetColumnFamilyData, 1>::iterator& cf_iter) {
+ return &(*cf_iter);
+ };
+
+ size_t num_keys = sorted_keys->size();
+ SequenceNumber consistent_seqnum;
+ bool unref_only = MultiCFSnapshot<std::array<MultiGetColumnFamilyData, 1>>(
+ read_options, callback, iter_deref_lambda, &multiget_cf_data,
+ &consistent_seqnum);
+#ifndef NDEBUG
+ assert(!unref_only);
+#else
+ // Silence unused variable warning
+ (void)unref_only;
+#endif // NDEBUG
+
+ if (callback && read_options.snapshot == nullptr) {
+ // The unprep_seqs are not published for write unprepared, so it could be
+ // that max_visible_seq is larger. Seek to the std::max of the two.
+ // However, we still want our callback to contain the actual snapshot so
+ // that it can do the correct visibility filtering.
+ callback->Refresh(consistent_seqnum);
+
+ // Internally, WriteUnpreparedTxnReadCallback::Refresh would set
+ // max_visible_seq = max(max_visible_seq, snapshot)
+ //
+ // Currently, the commented out assert is broken by
+ // InvalidSnapshotReadCallback, but if write unprepared recovery followed
+ // the regular transaction flow, then this special read callback would not
+ // be needed.
+ //
+ // assert(callback->max_visible_seq() >= snapshot);
+ consistent_seqnum = callback->max_visible_seq();
+ }
+
+ GetWithTimestampReadCallback timestamp_read_callback(0);
+ ReadCallback* read_callback = callback;
+ if (read_options.timestamp && read_options.timestamp->size() > 0) {
+ assert(!read_callback); // timestamp with callback is not supported
+ timestamp_read_callback.Refresh(consistent_seqnum);
+ read_callback = &timestamp_read_callback;
+ }
+
+ Status s = MultiGetImpl(read_options, 0, num_keys, sorted_keys,
+ multiget_cf_data[0].super_version, consistent_seqnum,
+ read_callback);
+ assert(s.ok() || s.IsTimedOut() || s.IsAborted());
+ ReturnAndCleanupSuperVersion(multiget_cf_data[0].cfd,
+ multiget_cf_data[0].super_version);
+}
+
+// The actual implementation of batched MultiGet. Parameters -
+// start_key - Index in the sorted_keys vector to start processing from
+// num_keys - Number of keys to lookup, starting with sorted_keys[start_key]
+// sorted_keys - The entire batch of sorted keys for this CF
+//
+// The per key status is returned in the KeyContext structures pointed to by
+// sorted_keys. An overall Status is also returned, with the possible values
+// being Status::OK(), Status::TimedOut() and Status::Aborted(). TimedOut
+// indicates that the call exceeded read_options.deadline, and Aborted
+// indicates that read_options.value_size_soft_limit was exceeded.
+Status DBImpl::MultiGetImpl(
+ const ReadOptions& read_options, size_t start_key, size_t num_keys,
+ autovector<KeyContext*, MultiGetContext::MAX_BATCH_SIZE>* sorted_keys,
+ SuperVersion* super_version, SequenceNumber snapshot,
+ ReadCallback* callback) {
+ PERF_CPU_TIMER_GUARD(get_cpu_nanos, immutable_db_options_.clock);
+ StopWatch sw(immutable_db_options_.clock, stats_, DB_MULTIGET);
+
+ assert(sorted_keys);
+ // Clear the timestamps for returning results so that we can distinguish
+ // between a tombstone and a key that has never been written
+ for (auto* kctx : *sorted_keys) {
+ assert(kctx);
+ if (kctx->timestamp) {
+ kctx->timestamp->clear();
+ }
+ }
+
+ // For each of the given keys, apply the entire "get" process as follows:
+ // First look in the memtable, then in the immutable memtable (if any).
+ // s is both in/out. When in, s could either be OK or MergeInProgress.
+ // merge_operands will contain the sequence of merges in the latter case.
+ size_t keys_left = num_keys;
+ Status s;
+ uint64_t curr_value_size = 0;
+ while (keys_left) {
+ if (read_options.deadline.count() &&
+ immutable_db_options_.clock->NowMicros() >
+ static_cast<uint64_t>(read_options.deadline.count())) {
+ s = Status::TimedOut();
+ break;
+ }
+
+ size_t batch_size = (keys_left > MultiGetContext::MAX_BATCH_SIZE)
+ ? MultiGetContext::MAX_BATCH_SIZE
+ : keys_left;
+ MultiGetContext ctx(sorted_keys, start_key + num_keys - keys_left,
+ batch_size, snapshot, read_options, GetFileSystem(),
+ stats_);
+ MultiGetRange range = ctx.GetMultiGetRange();
+ range.AddValueSize(curr_value_size);
+ bool lookup_current = false;
+
+ keys_left -= batch_size;
+ for (auto mget_iter = range.begin(); mget_iter != range.end();
+ ++mget_iter) {
+ mget_iter->merge_context.Clear();
+ *mget_iter->s = Status::OK();
+ }
+
+ bool skip_memtable =
+ (read_options.read_tier == kPersistedTier &&
+ has_unpersisted_data_.load(std::memory_order_relaxed));
+ if (!skip_memtable) {
+ super_version->mem->MultiGet(read_options, &range, callback,
+ false /* immutable_memtable */);
+ if (!range.empty()) {
+ super_version->imm->MultiGet(read_options, &range, callback);
+ }
+ if (!range.empty()) {
+ lookup_current = true;
+ uint64_t left = range.KeysLeft();
+ RecordTick(stats_, MEMTABLE_MISS, left);
+ }
+ }
+ if (lookup_current) {
+ PERF_TIMER_GUARD(get_from_output_files_time);
+ super_version->current->MultiGet(read_options, &range, callback);
+ }
+ curr_value_size = range.GetValueSize();
+ if (curr_value_size > read_options.value_size_soft_limit) {
+ s = Status::Aborted();
+ break;
+ }
+ }
+
+ // Post processing (decrement reference counts and record statistics)
+ PERF_TIMER_GUARD(get_post_process_time);
+ size_t num_found = 0;
+ uint64_t bytes_read = 0;
+ for (size_t i = start_key; i < start_key + num_keys - keys_left; ++i) {
+ KeyContext* key = (*sorted_keys)[i];
+ if (key->s->ok()) {
+ bytes_read += key->value->size();
+ num_found++;
+ }
+ }
+ if (keys_left) {
+ assert(s.IsTimedOut() || s.IsAborted());
+ for (size_t i = start_key + num_keys - keys_left; i < start_key + num_keys;
+ ++i) {
+ KeyContext* key = (*sorted_keys)[i];
+ *key->s = s;
+ }
+ }
+
+ RecordTick(stats_, NUMBER_MULTIGET_CALLS);
+ RecordTick(stats_, NUMBER_MULTIGET_KEYS_READ, num_keys);
+ RecordTick(stats_, NUMBER_MULTIGET_KEYS_FOUND, num_found);
+ RecordTick(stats_, NUMBER_MULTIGET_BYTES_READ, bytes_read);
+ RecordInHistogram(stats_, BYTES_PER_MULTIGET, bytes_read);
+ PERF_COUNTER_ADD(multiget_read_bytes, bytes_read);
+ PERF_TIMER_STOP(get_post_process_time);
+
+ return s;
+}
+
+Status DBImpl::CreateColumnFamily(const ColumnFamilyOptions& cf_options,
+ const std::string& column_family,
+ ColumnFamilyHandle** handle) {
+ assert(handle != nullptr);
+ Status s = CreateColumnFamilyImpl(cf_options, column_family, handle);
+ if (s.ok()) {
+ s = WriteOptionsFile(true /*need_mutex_lock*/,
+ true /*need_enter_write_thread*/);
+ }
+ return s;
+}
+
+Status DBImpl::CreateColumnFamilies(
+ const ColumnFamilyOptions& cf_options,
+ const std::vector<std::string>& column_family_names,
+ std::vector<ColumnFamilyHandle*>* handles) {
+ assert(handles != nullptr);
+ handles->clear();
+ size_t num_cf = column_family_names.size();
+ Status s;
+ bool success_once = false;
+ for (size_t i = 0; i < num_cf; i++) {
+ ColumnFamilyHandle* handle;
+ s = CreateColumnFamilyImpl(cf_options, column_family_names[i], &handle);
+ if (!s.ok()) {
+ break;
+ }
+ handles->push_back(handle);
+ success_once = true;
+ }
+ if (success_once) {
+ Status persist_options_status = WriteOptionsFile(
+ true /*need_mutex_lock*/, true /*need_enter_write_thread*/);
+ if (s.ok() && !persist_options_status.ok()) {
+ s = persist_options_status;
+ }
+ }
+ return s;
+}
+
+Status DBImpl::CreateColumnFamilies(
+ const std::vector<ColumnFamilyDescriptor>& column_families,
+ std::vector<ColumnFamilyHandle*>* handles) {
+ assert(handles != nullptr);
+ handles->clear();
+ size_t num_cf = column_families.size();
+ Status s;
+ bool success_once = false;
+ for (size_t i = 0; i < num_cf; i++) {
+ ColumnFamilyHandle* handle;
+ s = CreateColumnFamilyImpl(column_families[i].options,
+ column_families[i].name, &handle);
+ if (!s.ok()) {
+ break;
+ }
+ handles->push_back(handle);
+ success_once = true;
+ }
+ if (success_once) {
+ Status persist_options_status = WriteOptionsFile(
+ true /*need_mutex_lock*/, true /*need_enter_write_thread*/);
+ if (s.ok() && !persist_options_status.ok()) {
+ s = persist_options_status;
+ }
+ }
+ return s;
+}
+
+Status DBImpl::CreateColumnFamilyImpl(const ColumnFamilyOptions& cf_options,
+ const std::string& column_family_name,
+ ColumnFamilyHandle** handle) {
+ Status s;
+ *handle = nullptr;
+
+ DBOptions db_options =
+ BuildDBOptions(immutable_db_options_, mutable_db_options_);
+ s = ColumnFamilyData::ValidateOptions(db_options, cf_options);
+ if (s.ok()) {
+ for (auto& cf_path : cf_options.cf_paths) {
+ s = env_->CreateDirIfMissing(cf_path.path);
+ if (!s.ok()) {
+ break;
+ }
+ }
+ }
+ if (!s.ok()) {
+ return s;
+ }
+
+ SuperVersionContext sv_context(/* create_superversion */ true);
+ {
+ InstrumentedMutexLock l(&mutex_);
+
+ if (versions_->GetColumnFamilySet()->GetColumnFamily(column_family_name) !=
+ nullptr) {
+ return Status::InvalidArgument("Column family already exists");
+ }
+ VersionEdit edit;
+ edit.AddColumnFamily(column_family_name);
+ uint32_t new_id = versions_->GetColumnFamilySet()->GetNextColumnFamilyID();
+ edit.SetColumnFamily(new_id);
+ edit.SetLogNumber(logfile_number_);
+ edit.SetComparatorName(cf_options.comparator->Name());
+
+ // LogAndApply will both write the creation in MANIFEST and create
+ // ColumnFamilyData object
+ { // write thread
+ WriteThread::Writer w;
+ write_thread_.EnterUnbatched(&w, &mutex_);
+ s = versions_->LogAndApply(nullptr, MutableCFOptions(cf_options), &edit,
+ &mutex_, directories_.GetDbDir(), false,
+ &cf_options);
+ write_thread_.ExitUnbatched(&w);
+ }
+ if (s.ok()) {
+ auto* cfd =
+ versions_->GetColumnFamilySet()->GetColumnFamily(column_family_name);
+ assert(cfd != nullptr);
+ std::map<std::string, std::shared_ptr<FSDirectory>> dummy_created_dirs;
+ s = cfd->AddDirectories(&dummy_created_dirs);
+ }
+ if (s.ok()) {
+ auto* cfd =
+ versions_->GetColumnFamilySet()->GetColumnFamily(column_family_name);
+ assert(cfd != nullptr);
+ InstallSuperVersionAndScheduleWork(cfd, &sv_context,
+ *cfd->GetLatestMutableCFOptions());
+
+ if (!cfd->mem()->IsSnapshotSupported()) {
+ is_snapshot_supported_ = false;
+ }
+
+ cfd->set_initialized();
+
+ *handle = new ColumnFamilyHandleImpl(cfd, this, &mutex_);
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "Created column family [%s] (ID %u)",
+ column_family_name.c_str(), (unsigned)cfd->GetID());
+ } else {
+ ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+ "Creating column family [%s] FAILED -- %s",
+ column_family_name.c_str(), s.ToString().c_str());
+ }
+ } // InstrumentedMutexLock l(&mutex_)
+
+ if (cf_options.preserve_internal_time_seconds > 0 ||
+ cf_options.preclude_last_level_data_seconds > 0) {
+ s = RegisterRecordSeqnoTimeWorker();
+ }
+ sv_context.Clean();
+ // this is outside the mutex
+ if (s.ok()) {
+ NewThreadStatusCfInfo(
+ static_cast_with_check<ColumnFamilyHandleImpl>(*handle)->cfd());
+ }
+ return s;
+}
+
+Status DBImpl::DropColumnFamily(ColumnFamilyHandle* column_family) {
+ assert(column_family != nullptr);
+ Status s = DropColumnFamilyImpl(column_family);
+ if (s.ok()) {
+ s = WriteOptionsFile(true /*need_mutex_lock*/,
+ true /*need_enter_write_thread*/);
+ }
+ return s;
+}
+
+Status DBImpl::DropColumnFamilies(
+ const std::vector<ColumnFamilyHandle*>& column_families) {
+ Status s;
+ bool success_once = false;
+ for (auto* handle : column_families) {
+ s = DropColumnFamilyImpl(handle);
+ if (!s.ok()) {
+ break;
+ }
+ success_once = true;
+ }
+ if (success_once) {
+ Status persist_options_status = WriteOptionsFile(
+ true /*need_mutex_lock*/, true /*need_enter_write_thread*/);
+ if (s.ok() && !persist_options_status.ok()) {
+ s = persist_options_status;
+ }
+ }
+ return s;
+}
+
+Status DBImpl::DropColumnFamilyImpl(ColumnFamilyHandle* column_family) {
+ auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+ auto cfd = cfh->cfd();
+ if (cfd->GetID() == 0) {
+ return Status::InvalidArgument("Can't drop default column family");
+ }
+
+ bool cf_support_snapshot = cfd->mem()->IsSnapshotSupported();
+
+ VersionEdit edit;
+ edit.DropColumnFamily();
+ edit.SetColumnFamily(cfd->GetID());
+
+ Status s;
+ {
+ InstrumentedMutexLock l(&mutex_);
+ if (cfd->IsDropped()) {
+ s = Status::InvalidArgument("Column family already dropped!\n");
+ }
+ if (s.ok()) {
+ // we drop column family from a single write thread
+ WriteThread::Writer w;
+ write_thread_.EnterUnbatched(&w, &mutex_);
+ s = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), &edit,
+ &mutex_, directories_.GetDbDir());
+ write_thread_.ExitUnbatched(&w);
+ }
+ if (s.ok()) {
+ auto* mutable_cf_options = cfd->GetLatestMutableCFOptions();
+ max_total_in_memory_state_ -= mutable_cf_options->write_buffer_size *
+ mutable_cf_options->max_write_buffer_number;
+ }
+
+ if (!cf_support_snapshot) {
+      // The dropped column family didn't support snapshots, so recalculate
+      // is_snapshot_supported_.
+ bool new_is_snapshot_supported = true;
+ for (auto c : *versions_->GetColumnFamilySet()) {
+ if (!c->IsDropped() && !c->mem()->IsSnapshotSupported()) {
+ new_is_snapshot_supported = false;
+ break;
+ }
+ }
+ is_snapshot_supported_ = new_is_snapshot_supported;
+ }
+ bg_cv_.SignalAll();
+ }
+
+ if (cfd->ioptions()->preserve_internal_time_seconds > 0 ||
+ cfd->ioptions()->preclude_last_level_data_seconds > 0) {
+ s = RegisterRecordSeqnoTimeWorker();
+ }
+
+ if (s.ok()) {
+ // Note that here we erase the associated cf_info of the to-be-dropped
+ // cfd before its ref-count goes to zero to avoid having to erase cf_info
+    // later while holding the db mutex.
+ EraseThreadStatusCfInfo(cfd);
+ assert(cfd->IsDropped());
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "Dropped column family with id %u\n", cfd->GetID());
+ } else {
+ ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+ "Dropping column family with id %u FAILED -- %s\n",
+ cfd->GetID(), s.ToString().c_str());
+ }
+
+ return s;
+}
+
+bool DBImpl::KeyMayExist(const ReadOptions& read_options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ std::string* value, std::string* timestamp,
+ bool* value_found) {
+ assert(value != nullptr);
+ if (value_found != nullptr) {
+ // falsify later if key-may-exist but can't fetch value
+ *value_found = true;
+ }
+ ReadOptions roptions = read_options;
+ roptions.read_tier = kBlockCacheTier; // read from block cache only
+ PinnableSlice pinnable_val;
+ GetImplOptions get_impl_options;
+ get_impl_options.column_family = column_family;
+ get_impl_options.value = &pinnable_val;
+ get_impl_options.value_found = value_found;
+ get_impl_options.timestamp = timestamp;
+ auto s = GetImpl(roptions, key, get_impl_options);
+ value->assign(pinnable_val.data(), pinnable_val.size());
+
+  // If block_cache is enabled and the index block of the table is not
+  // present in block_cache, the return value will be Status::Incomplete.
+  // In this case, the key may still exist in the table.
+ return s.ok() || s.IsIncomplete();
+}
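+
+// Example (illustrative sketch): how KeyMayExist() above is typically used;
+// `db`, `cf` and `key` are hypothetical.
+//
+//   std::string value;
+//   bool value_found = false;
+//   if (db->KeyMayExist(ReadOptions(), cf, key, &value, &value_found)) {
+//     if (value_found) {
+//       // `value` was served from the memtables / block cache
+//     } else {
+//       // the key may exist, but the value was not readily available
+//     }
+//   } else {
+//     // the key definitely does not exist
+//   }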
+
+Iterator* DBImpl::NewIterator(const ReadOptions& read_options,
+ ColumnFamilyHandle* column_family) {
+ if (read_options.managed) {
+ return NewErrorIterator(
+ Status::NotSupported("Managed iterator is not supported anymore."));
+ }
+ Iterator* result = nullptr;
+ if (read_options.read_tier == kPersistedTier) {
+ return NewErrorIterator(Status::NotSupported(
+ "ReadTier::kPersistedData is not yet supported in iterators."));
+ }
+
+ assert(column_family);
+
+ if (read_options.timestamp) {
+ const Status s = FailIfTsMismatchCf(
+ column_family, *(read_options.timestamp), /*ts_for_read=*/true);
+ if (!s.ok()) {
+ return NewErrorIterator(s);
+ }
+ } else {
+ const Status s = FailIfCfHasTs(column_family);
+ if (!s.ok()) {
+ return NewErrorIterator(s);
+ }
+ }
+
+ auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+ ColumnFamilyData* cfd = cfh->cfd();
+ assert(cfd != nullptr);
+ ReadCallback* read_callback = nullptr; // No read callback provided.
+ if (read_options.tailing) {
+#ifdef ROCKSDB_LITE
+ // not supported in lite version
+ result = nullptr;
+
+#else
+ SuperVersion* sv = cfd->GetReferencedSuperVersion(this);
+ auto iter = new ForwardIterator(this, read_options, cfd, sv,
+ /* allow_unprepared_value */ true);
+ result = NewDBIterator(
+ env_, read_options, *cfd->ioptions(), sv->mutable_cf_options,
+ cfd->user_comparator(), iter, sv->current, kMaxSequenceNumber,
+ sv->mutable_cf_options.max_sequential_skip_in_iterations, read_callback,
+ this, cfd);
+#endif
+ } else {
+ // Note: no need to consider the special case of
+ // last_seq_same_as_publish_seq_==false since NewIterator is overridden in
+ // WritePreparedTxnDB
+ result = NewIteratorImpl(read_options, cfd,
+ (read_options.snapshot != nullptr)
+ ? read_options.snapshot->GetSequenceNumber()
+ : kMaxSequenceNumber,
+ read_callback);
+ }
+ return result;
+}
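+
+// Example (illustrative sketch): iterating a column family through the
+// NewIterator() API above; `db` and `cf` are hypothetical.
+//
+//   std::unique_ptr<Iterator> it(db->NewIterator(ReadOptions(), cf));
+//   for (it->SeekToFirst(); it->Valid(); it->Next()) {
+//     // it->key() and it->value() are valid here
+//   }
+//   assert(it->status().ok());  // check for errors after the scan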
+
+ArenaWrappedDBIter* DBImpl::NewIteratorImpl(const ReadOptions& read_options,
+ ColumnFamilyData* cfd,
+ SequenceNumber snapshot,
+ ReadCallback* read_callback,
+ bool expose_blob_index,
+ bool allow_refresh) {
+ SuperVersion* sv = cfd->GetReferencedSuperVersion(this);
+
+ TEST_SYNC_POINT("DBImpl::NewIterator:1");
+ TEST_SYNC_POINT("DBImpl::NewIterator:2");
+
+ if (snapshot == kMaxSequenceNumber) {
+ // Note that the snapshot is assigned AFTER referencing the super
+ // version because otherwise a flush happening in between may compact away
+    // data for the snapshot, so the reader would see neither data that was
+ // visible to the snapshot before compaction nor the newer data inserted
+ // afterwards.
+ // Note that the super version might not contain all the data available
+ // to this snapshot, but in that case it can see all the data in the
+ // super version, which is a valid consistent state after the user
+ // calls NewIterator().
+ snapshot = versions_->LastSequence();
+ TEST_SYNC_POINT("DBImpl::NewIterator:3");
+ TEST_SYNC_POINT("DBImpl::NewIterator:4");
+ }
+
+  // Try to generate a DB iterator tree in a contiguous memory area to be
+  // cache friendly. Here is an example of the result:
+ // +-------------------------------+
+ // | |
+ // | ArenaWrappedDBIter |
+ // | + |
+ // | +---> Inner Iterator ------------+
+ // | | | |
+ // | | +-- -- -- -- -- -- -- --+ |
+ // | +--- | Arena | |
+ // | | | |
+ // | Allocated Memory: | |
+ // | | +-------------------+ |
+ // | | | DBIter | <---+
+ // | | + |
+ // | | | +-> iter_ ------------+
+ // | | | | |
+ // | | +-------------------+ |
+ // | | | MergingIterator | <---+
+ // | | + |
+ // | | | +->child iter1 ------------+
+ // | | | | | |
+ // | | +->child iter2 ----------+ |
+ // | | | | | | |
+ // | | | +->child iter3 --------+ | |
+ // | | | | | |
+ // | | +-------------------+ | | |
+ // | | | Iterator1 | <--------+
+ // | | +-------------------+ | |
+ // | | | Iterator2 | <------+
+ // | | +-------------------+ |
+ // | | | Iterator3 | <----+
+ // | | +-------------------+
+ // | | |
+ // +-------+-----------------------+
+ //
+ // ArenaWrappedDBIter inlines an arena area where all the iterators in
+ // the iterator tree are allocated in the order of being accessed when
+ // querying.
+ // Laying out the iterators in the order of being accessed makes it more
+ // likely that any iterator pointer is close to the iterator it points to so
+ // that they are likely to be in the same cache line and/or page.
+ ArenaWrappedDBIter* db_iter = NewArenaWrappedDbIterator(
+ env_, read_options, *cfd->ioptions(), sv->mutable_cf_options, sv->current,
+ snapshot, sv->mutable_cf_options.max_sequential_skip_in_iterations,
+ sv->version_number, read_callback, this, cfd, expose_blob_index,
+ read_options.snapshot != nullptr ? false : allow_refresh);
+
+ InternalIterator* internal_iter = NewInternalIterator(
+ db_iter->GetReadOptions(), cfd, sv, db_iter->GetArena(), snapshot,
+ /* allow_unprepared_value */ true, db_iter);
+ db_iter->SetIterUnderDBIter(internal_iter);
+
+ return db_iter;
+}
+
+Status DBImpl::NewIterators(
+ const ReadOptions& read_options,
+ const std::vector<ColumnFamilyHandle*>& column_families,
+ std::vector<Iterator*>* iterators) {
+ if (read_options.managed) {
+ return Status::NotSupported("Managed iterator is not supported anymore.");
+ }
+ if (read_options.read_tier == kPersistedTier) {
+ return Status::NotSupported(
+ "ReadTier::kPersistedData is not yet supported in iterators.");
+ }
+
+ if (read_options.timestamp) {
+ for (auto* cf : column_families) {
+ assert(cf);
+ const Status s = FailIfTsMismatchCf(cf, *(read_options.timestamp),
+ /*ts_for_read=*/true);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+ } else {
+ for (auto* cf : column_families) {
+ assert(cf);
+ const Status s = FailIfCfHasTs(cf);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+ }
+
+ ReadCallback* read_callback = nullptr; // No read callback provided.
+ iterators->clear();
+ iterators->reserve(column_families.size());
+ if (read_options.tailing) {
+#ifdef ROCKSDB_LITE
+ return Status::InvalidArgument(
+ "Tailing iterator not supported in RocksDB lite");
+#else
+ for (auto cfh : column_families) {
+ auto cfd = static_cast_with_check<ColumnFamilyHandleImpl>(cfh)->cfd();
+ SuperVersion* sv = cfd->GetReferencedSuperVersion(this);
+ auto iter = new ForwardIterator(this, read_options, cfd, sv,
+ /* allow_unprepared_value */ true);
+ iterators->push_back(NewDBIterator(
+ env_, read_options, *cfd->ioptions(), sv->mutable_cf_options,
+ cfd->user_comparator(), iter, sv->current, kMaxSequenceNumber,
+ sv->mutable_cf_options.max_sequential_skip_in_iterations,
+ read_callback, this, cfd));
+ }
+#endif
+ } else {
+ // Note: no need to consider the special case of
+ // last_seq_same_as_publish_seq_==false since NewIterators is overridden in
+ // WritePreparedTxnDB
+ auto snapshot = read_options.snapshot != nullptr
+ ? read_options.snapshot->GetSequenceNumber()
+ : versions_->LastSequence();
+ for (size_t i = 0; i < column_families.size(); ++i) {
+ auto* cfd =
+ static_cast_with_check<ColumnFamilyHandleImpl>(column_families[i])
+ ->cfd();
+ iterators->push_back(
+ NewIteratorImpl(read_options, cfd, snapshot, read_callback));
+ }
+ }
+
+ return Status::OK();
+}
+
+const Snapshot* DBImpl::GetSnapshot() { return GetSnapshotImpl(false); }
+
+#ifndef ROCKSDB_LITE
+const Snapshot* DBImpl::GetSnapshotForWriteConflictBoundary() {
+ return GetSnapshotImpl(true);
+}
+#endif // ROCKSDB_LITE
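+
+// Example (illustrative sketch): point-in-time reads using GetSnapshot() and
+// ReleaseSnapshot() defined in this file; `db` and `key` are hypothetical.
+// (GetSnapshot() may return nullptr if snapshots are not supported.)
+//
+//   const Snapshot* snap = db->GetSnapshot();
+//   ReadOptions ropts;
+//   ropts.snapshot = snap;
+//   std::string value;
+//   Status s = db->Get(ropts, key, &value);  // sees data as of the snapshot
+//   db->ReleaseSnapshot(snap);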
+
+std::pair<Status, std::shared_ptr<const Snapshot>>
+DBImpl::CreateTimestampedSnapshot(SequenceNumber snapshot_seq, uint64_t ts) {
+ assert(ts != std::numeric_limits<uint64_t>::max());
+
+ auto ret = CreateTimestampedSnapshotImpl(snapshot_seq, ts, /*lock=*/true);
+ return ret;
+}
+
+std::shared_ptr<const SnapshotImpl> DBImpl::GetTimestampedSnapshot(
+ uint64_t ts) const {
+ InstrumentedMutexLock lock_guard(&mutex_);
+ return timestamped_snapshots_.GetSnapshot(ts);
+}
+
+void DBImpl::ReleaseTimestampedSnapshotsOlderThan(uint64_t ts,
+ size_t* remaining_total_ss) {
+ autovector<std::shared_ptr<const SnapshotImpl>> snapshots_to_release;
+ {
+ InstrumentedMutexLock lock_guard(&mutex_);
+ timestamped_snapshots_.ReleaseSnapshotsOlderThan(ts, snapshots_to_release);
+ }
+ snapshots_to_release.clear();
+
+ if (remaining_total_ss) {
+ InstrumentedMutexLock lock_guard(&mutex_);
+ *remaining_total_ss = static_cast<size_t>(snapshots_.count());
+ }
+}
+
+Status DBImpl::GetTimestampedSnapshots(
+ uint64_t ts_lb, uint64_t ts_ub,
+ std::vector<std::shared_ptr<const Snapshot>>& timestamped_snapshots) const {
+ if (ts_lb >= ts_ub) {
+ return Status::InvalidArgument(
+ "timestamp lower bound must be smaller than upper bound");
+ }
+ timestamped_snapshots.clear();
+ InstrumentedMutexLock lock_guard(&mutex_);
+ timestamped_snapshots_.GetSnapshots(ts_lb, ts_ub, timestamped_snapshots);
+ return Status::OK();
+}
+
+SnapshotImpl* DBImpl::GetSnapshotImpl(bool is_write_conflict_boundary,
+ bool lock) {
+ int64_t unix_time = 0;
+ immutable_db_options_.clock->GetCurrentTime(&unix_time)
+ .PermitUncheckedError(); // Ignore error
+ SnapshotImpl* s = new SnapshotImpl;
+
+ if (lock) {
+ mutex_.Lock();
+ } else {
+ mutex_.AssertHeld();
+ }
+  // Returns nullptr if the underlying memtable does not support snapshots.
+ if (!is_snapshot_supported_) {
+ if (lock) {
+ mutex_.Unlock();
+ }
+ delete s;
+ return nullptr;
+ }
+ auto snapshot_seq = GetLastPublishedSequence();
+ SnapshotImpl* snapshot =
+ snapshots_.New(s, snapshot_seq, unix_time, is_write_conflict_boundary);
+ if (lock) {
+ mutex_.Unlock();
+ }
+ return snapshot;
+}
+
+std::pair<Status, std::shared_ptr<const SnapshotImpl>>
+DBImpl::CreateTimestampedSnapshotImpl(SequenceNumber snapshot_seq, uint64_t ts,
+ bool lock) {
+ int64_t unix_time = 0;
+ immutable_db_options_.clock->GetCurrentTime(&unix_time)
+ .PermitUncheckedError(); // Ignore error
+ SnapshotImpl* s = new SnapshotImpl;
+
+ const bool need_update_seq = (snapshot_seq != kMaxSequenceNumber);
+
+ if (lock) {
+ mutex_.Lock();
+ } else {
+ mutex_.AssertHeld();
+ }
+  // Return a NotSupported status (and a null snapshot) if the underlying
+  // memtable does not support snapshots.
+ if (!is_snapshot_supported_) {
+ if (lock) {
+ mutex_.Unlock();
+ }
+ delete s;
+ return std::make_pair(
+ Status::NotSupported("Memtable does not support snapshot"), nullptr);
+ }
+
+  // The caller is not the write thread and thus didn't provide a valid
+  // snapshot_seq. Obtain the seq from the db.
+ if (!need_update_seq) {
+ snapshot_seq = GetLastPublishedSequence();
+ }
+
+ std::shared_ptr<const SnapshotImpl> latest =
+ timestamped_snapshots_.GetSnapshot(std::numeric_limits<uint64_t>::max());
+
+ // If there is already a latest timestamped snapshot, then we need to do some
+ // checks.
+ if (latest) {
+ uint64_t latest_snap_ts = latest->GetTimestamp();
+ SequenceNumber latest_snap_seq = latest->GetSequenceNumber();
+ assert(latest_snap_seq <= snapshot_seq);
+ bool needs_create_snap = true;
+ Status status;
+ std::shared_ptr<const SnapshotImpl> ret;
+ if (latest_snap_ts > ts) {
+ // A snapshot created later cannot have smaller timestamp than a previous
+ // timestamped snapshot.
+ needs_create_snap = false;
+ std::ostringstream oss;
+ oss << "snapshot exists with larger timestamp " << latest_snap_ts << " > "
+ << ts;
+ status = Status::InvalidArgument(oss.str());
+ } else if (latest_snap_ts == ts) {
+ if (latest_snap_seq == snapshot_seq) {
+ // We are requesting the same sequence number and timestamp, thus can
+ // safely reuse (share) the current latest timestamped snapshot.
+ needs_create_snap = false;
+ ret = latest;
+ } else if (latest_snap_seq < snapshot_seq) {
+ // There may have been writes to the database since the latest
+ // timestamped snapshot, yet we are still requesting the same
+ // timestamp. In this case, we cannot create the new timestamped
+ // snapshot.
+ needs_create_snap = false;
+ std::ostringstream oss;
+ oss << "Allocated seq is " << snapshot_seq
+ << ", while snapshot exists with smaller seq " << latest_snap_seq
+ << " but same timestamp " << ts;
+ status = Status::InvalidArgument(oss.str());
+ }
+ }
+ if (!needs_create_snap) {
+ if (lock) {
+ mutex_.Unlock();
+ }
+ delete s;
+ return std::make_pair(status, ret);
+ } else {
+ status.PermitUncheckedError();
+ }
+ }
+
+ SnapshotImpl* snapshot =
+ snapshots_.New(s, snapshot_seq, unix_time,
+ /*is_write_conflict_boundary=*/true, ts);
+
+ std::shared_ptr<const SnapshotImpl> ret(
+ snapshot,
+ std::bind(&DBImpl::ReleaseSnapshot, this, std::placeholders::_1));
+ timestamped_snapshots_.AddSnapshot(ret);
+
+ // Caller is from write thread, and we need to update database's sequence
+ // number.
+ if (need_update_seq) {
+ assert(versions_);
+ if (last_seq_same_as_publish_seq_) {
+ versions_->SetLastSequence(snapshot_seq);
+ } else {
+ // TODO: support write-prepared/write-unprepared transactions with two
+ // write queues.
+ assert(false);
+ }
+ }
+
+ if (lock) {
+ mutex_.Unlock();
+ }
+ return std::make_pair(Status::OK(), ret);
+}
+
+namespace {
+using CfdList = autovector<ColumnFamilyData*, 2>;
+bool CfdListContains(const CfdList& list, ColumnFamilyData* cfd) {
+ for (const ColumnFamilyData* t : list) {
+ if (t == cfd) {
+ return true;
+ }
+ }
+ return false;
+}
+} // namespace
+
+void DBImpl::ReleaseSnapshot(const Snapshot* s) {
+ if (s == nullptr) {
+    // DBImpl::GetSnapshot() can return nullptr when snapshots are not
+    // supported, e.g. when inplace_update_support is enabled.
+ return;
+ }
+ const SnapshotImpl* casted_s = reinterpret_cast<const SnapshotImpl*>(s);
+ {
+ InstrumentedMutexLock l(&mutex_);
+ snapshots_.Delete(casted_s);
+ uint64_t oldest_snapshot;
+ if (snapshots_.empty()) {
+ oldest_snapshot = GetLastPublishedSequence();
+ } else {
+ oldest_snapshot = snapshots_.oldest()->number_;
+ }
+    // Avoid going through every column family by checking a global threshold
+    // first.
+ if (oldest_snapshot > bottommost_files_mark_threshold_) {
+ CfdList cf_scheduled;
+ for (auto* cfd : *versions_->GetColumnFamilySet()) {
+ if (!cfd->ioptions()->allow_ingest_behind) {
+ cfd->current()->storage_info()->UpdateOldestSnapshot(oldest_snapshot);
+ if (!cfd->current()
+ ->storage_info()
+ ->BottommostFilesMarkedForCompaction()
+ .empty()) {
+ SchedulePendingCompaction(cfd);
+ MaybeScheduleFlushOrCompaction();
+ cf_scheduled.push_back(cfd);
+ }
+ }
+ }
+
+      // Calculate a new threshold, skipping those CFs where compactions are
+      // scheduled. We do not fold this into the previous loop because the
+      // mutex might be unlocked during that loop, making the result
+      // inaccurate.
+ SequenceNumber new_bottommost_files_mark_threshold = kMaxSequenceNumber;
+ for (auto* cfd : *versions_->GetColumnFamilySet()) {
+ if (CfdListContains(cf_scheduled, cfd) ||
+ cfd->ioptions()->allow_ingest_behind) {
+ continue;
+ }
+ new_bottommost_files_mark_threshold = std::min(
+ new_bottommost_files_mark_threshold,
+ cfd->current()->storage_info()->bottommost_files_mark_threshold());
+ }
+ bottommost_files_mark_threshold_ = new_bottommost_files_mark_threshold;
+ }
+ }
+ delete casted_s;
+}
+
+#ifndef ROCKSDB_LITE
+Status DBImpl::GetPropertiesOfAllTables(ColumnFamilyHandle* column_family,
+ TablePropertiesCollection* props) {
+ auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+ auto cfd = cfh->cfd();
+
+ // Increment the ref count
+ mutex_.Lock();
+ auto version = cfd->current();
+ version->Ref();
+ mutex_.Unlock();
+
+ auto s = version->GetPropertiesOfAllTables(props);
+
+ // Decrement the ref count
+ mutex_.Lock();
+ version->Unref();
+ mutex_.Unlock();
+
+ return s;
+}
+
+Status DBImpl::GetPropertiesOfTablesInRange(ColumnFamilyHandle* column_family,
+ const Range* range, std::size_t n,
+ TablePropertiesCollection* props) {
+ auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+ auto cfd = cfh->cfd();
+
+ // Increment the ref count
+ mutex_.Lock();
+ auto version = cfd->current();
+ version->Ref();
+ mutex_.Unlock();
+
+ auto s = version->GetPropertiesOfTablesInRange(range, n, props);
+
+ // Decrement the ref count
+ mutex_.Lock();
+ version->Unref();
+ mutex_.Unlock();
+
+ return s;
+}
+
+#endif // ROCKSDB_LITE
+
+const std::string& DBImpl::GetName() const { return dbname_; }
+
+Env* DBImpl::GetEnv() const { return env_; }
+
+FileSystem* DB::GetFileSystem() const {
+ const auto& fs = GetEnv()->GetFileSystem();
+ return fs.get();
+}
+
+FileSystem* DBImpl::GetFileSystem() const {
+ return immutable_db_options_.fs.get();
+}
+
+SystemClock* DBImpl::GetSystemClock() const {
+ return immutable_db_options_.clock;
+}
+
+#ifndef ROCKSDB_LITE
+
+Status DBImpl::StartIOTrace(const TraceOptions& trace_options,
+ std::unique_ptr<TraceWriter>&& trace_writer) {
+ assert(trace_writer != nullptr);
+ return io_tracer_->StartIOTrace(GetSystemClock(), trace_options,
+ std::move(trace_writer));
+}
+
+Status DBImpl::EndIOTrace() {
+ io_tracer_->EndIOTrace();
+ return Status::OK();
+}
+
+#endif // ROCKSDB_LITE
+
+Options DBImpl::GetOptions(ColumnFamilyHandle* column_family) const {
+ InstrumentedMutexLock l(&mutex_);
+ auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+ return Options(BuildDBOptions(immutable_db_options_, mutable_db_options_),
+ cfh->cfd()->GetLatestCFOptions());
+}
+
+DBOptions DBImpl::GetDBOptions() const {
+ InstrumentedMutexLock l(&mutex_);
+ return BuildDBOptions(immutable_db_options_, mutable_db_options_);
+}
+
+bool DBImpl::GetProperty(ColumnFamilyHandle* column_family,
+ const Slice& property, std::string* value) {
+ const DBPropertyInfo* property_info = GetPropertyInfo(property);
+ value->clear();
+ auto cfd =
+ static_cast_with_check<ColumnFamilyHandleImpl>(column_family)->cfd();
+ if (property_info == nullptr) {
+ return false;
+ } else if (property_info->handle_int) {
+ uint64_t int_value;
+ bool ret_value =
+ GetIntPropertyInternal(cfd, *property_info, false, &int_value);
+ if (ret_value) {
+ *value = std::to_string(int_value);
+ }
+ return ret_value;
+ } else if (property_info->handle_string) {
+ if (property_info->need_out_of_mutex) {
+ return cfd->internal_stats()->GetStringProperty(*property_info, property,
+ value);
+ } else {
+ InstrumentedMutexLock l(&mutex_);
+ return cfd->internal_stats()->GetStringProperty(*property_info, property,
+ value);
+ }
+ } else if (property_info->handle_string_dbimpl) {
+ if (property_info->need_out_of_mutex) {
+ return (this->*(property_info->handle_string_dbimpl))(value);
+ } else {
+ InstrumentedMutexLock l(&mutex_);
+ return (this->*(property_info->handle_string_dbimpl))(value);
+ }
+ }
+ // Shouldn't reach here since exactly one of handle_string and handle_int
+ // should be non-nullptr.
+ assert(false);
+ return false;
+}
+
+bool DBImpl::GetMapProperty(ColumnFamilyHandle* column_family,
+ const Slice& property,
+ std::map<std::string, std::string>* value) {
+ const DBPropertyInfo* property_info = GetPropertyInfo(property);
+ value->clear();
+ auto cfd =
+ static_cast_with_check<ColumnFamilyHandleImpl>(column_family)->cfd();
+ if (property_info == nullptr) {
+ return false;
+ } else if (property_info->handle_map) {
+ if (property_info->need_out_of_mutex) {
+ return cfd->internal_stats()->GetMapProperty(*property_info, property,
+ value);
+ } else {
+ InstrumentedMutexLock l(&mutex_);
+ return cfd->internal_stats()->GetMapProperty(*property_info, property,
+ value);
+ }
+ }
+ // If we reach this point it means that handle_map is not provided for the
+ // requested property
+ return false;
+}
+
+bool DBImpl::GetIntProperty(ColumnFamilyHandle* column_family,
+ const Slice& property, uint64_t* value) {
+ const DBPropertyInfo* property_info = GetPropertyInfo(property);
+ if (property_info == nullptr || property_info->handle_int == nullptr) {
+ return false;
+ }
+ auto cfd =
+ static_cast_with_check<ColumnFamilyHandleImpl>(column_family)->cfd();
+ return GetIntPropertyInternal(cfd, *property_info, false, value);
+}
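+
+// Example (illustrative sketch): querying an integer property through the
+// API above; `db` and `cf` are hypothetical.
+//
+//   uint64_t num_keys = 0;
+//   if (db->GetIntProperty(cf, "rocksdb.estimate-num-keys", &num_keys)) {
+//     // num_keys now holds an estimate of the number of keys in `cf`
+//   }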
+
+bool DBImpl::GetIntPropertyInternal(ColumnFamilyData* cfd,
+ const DBPropertyInfo& property_info,
+ bool is_locked, uint64_t* value) {
+ assert(property_info.handle_int != nullptr);
+ if (!property_info.need_out_of_mutex) {
+ if (is_locked) {
+ mutex_.AssertHeld();
+ return cfd->internal_stats()->GetIntProperty(property_info, value, this);
+ } else {
+ InstrumentedMutexLock l(&mutex_);
+ return cfd->internal_stats()->GetIntProperty(property_info, value, this);
+ }
+ } else {
+ SuperVersion* sv = nullptr;
+ if (is_locked) {
+ mutex_.Unlock();
+ }
+ sv = GetAndRefSuperVersion(cfd);
+
+ bool ret = cfd->internal_stats()->GetIntPropertyOutOfMutex(
+ property_info, sv->current, value);
+
+ ReturnAndCleanupSuperVersion(cfd, sv);
+ if (is_locked) {
+ mutex_.Lock();
+ }
+
+ return ret;
+ }
+}
+
+bool DBImpl::GetPropertyHandleOptionsStatistics(std::string* value) {
+ assert(value != nullptr);
+ Statistics* statistics = immutable_db_options_.stats;
+ if (!statistics) {
+ return false;
+ }
+ *value = statistics->ToString();
+ return true;
+}
+
+#ifndef ROCKSDB_LITE
+Status DBImpl::ResetStats() {
+ InstrumentedMutexLock l(&mutex_);
+ for (auto* cfd : *versions_->GetColumnFamilySet()) {
+ if (cfd->initialized()) {
+ cfd->internal_stats()->Clear();
+ }
+ }
+ return Status::OK();
+}
+#endif // ROCKSDB_LITE
+
+bool DBImpl::GetAggregatedIntProperty(const Slice& property,
+ uint64_t* aggregated_value) {
+ const DBPropertyInfo* property_info = GetPropertyInfo(property);
+ if (property_info == nullptr || property_info->handle_int == nullptr) {
+ return false;
+ }
+
+ uint64_t sum = 0;
+ bool ret = true;
+ {
+ // Needs mutex to protect the list of column families.
+ InstrumentedMutexLock l(&mutex_);
+ uint64_t value;
+ for (auto* cfd : versions_->GetRefedColumnFamilySet()) {
+ if (!cfd->initialized()) {
+ continue;
+ }
+ ret = GetIntPropertyInternal(cfd, *property_info, true, &value);
+ // GetIntPropertyInternal may release db mutex and re-acquire it.
+ mutex_.AssertHeld();
+ if (ret) {
+ sum += value;
+ } else {
+ ret = false;
+ break;
+ }
+ }
+ }
+ *aggregated_value = sum;
+ return ret;
+}
+
+SuperVersion* DBImpl::GetAndRefSuperVersion(ColumnFamilyData* cfd) {
+ // TODO(ljin): consider using GetReferencedSuperVersion() directly
+ return cfd->GetThreadLocalSuperVersion(this);
+}
+
+// REQUIRED: this function should only be called on the write thread or if the
+// mutex is held.
+SuperVersion* DBImpl::GetAndRefSuperVersion(uint32_t column_family_id) {
+ auto column_family_set = versions_->GetColumnFamilySet();
+ auto cfd = column_family_set->GetColumnFamily(column_family_id);
+ if (!cfd) {
+ return nullptr;
+ }
+
+ return GetAndRefSuperVersion(cfd);
+}
+
+void DBImpl::CleanupSuperVersion(SuperVersion* sv) {
+ // Release SuperVersion
+ if (sv->Unref()) {
+ bool defer_purge = immutable_db_options().avoid_unnecessary_blocking_io;
+ {
+ InstrumentedMutexLock l(&mutex_);
+ sv->Cleanup();
+ if (defer_purge) {
+ AddSuperVersionsToFreeQueue(sv);
+ SchedulePurge();
+ }
+ }
+ if (!defer_purge) {
+ delete sv;
+ }
+ RecordTick(stats_, NUMBER_SUPERVERSION_CLEANUPS);
+ }
+ RecordTick(stats_, NUMBER_SUPERVERSION_RELEASES);
+}
+
+void DBImpl::ReturnAndCleanupSuperVersion(ColumnFamilyData* cfd,
+ SuperVersion* sv) {
+ if (!cfd->ReturnThreadLocalSuperVersion(sv)) {
+ CleanupSuperVersion(sv);
+ }
+}
+
+// REQUIRED: this function should only be called on the write thread.
+void DBImpl::ReturnAndCleanupSuperVersion(uint32_t column_family_id,
+ SuperVersion* sv) {
+ auto column_family_set = versions_->GetColumnFamilySet();
+ auto cfd = column_family_set->GetColumnFamily(column_family_id);
+
+ // If SuperVersion is held, and we successfully fetched a cfd using
+ // GetAndRefSuperVersion(), it must still exist.
+ assert(cfd != nullptr);
+ ReturnAndCleanupSuperVersion(cfd, sv);
+}
+
+// REQUIRED: this function should only be called on the write thread or if the
+// mutex is held.
+ColumnFamilyHandle* DBImpl::GetColumnFamilyHandle(uint32_t column_family_id) {
+ ColumnFamilyMemTables* cf_memtables = column_family_memtables_.get();
+
+ if (!cf_memtables->Seek(column_family_id)) {
+ return nullptr;
+ }
+
+ return cf_memtables->GetColumnFamilyHandle();
+}
+
+// REQUIRED: mutex is NOT held.
+std::unique_ptr<ColumnFamilyHandle> DBImpl::GetColumnFamilyHandleUnlocked(
+ uint32_t column_family_id) {
+ InstrumentedMutexLock l(&mutex_);
+
+ auto* cfd =
+ versions_->GetColumnFamilySet()->GetColumnFamily(column_family_id);
+ if (cfd == nullptr) {
+ return nullptr;
+ }
+
+ return std::unique_ptr<ColumnFamilyHandleImpl>(
+ new ColumnFamilyHandleImpl(cfd, this, &mutex_));
+}
+
+void DBImpl::GetApproximateMemTableStats(ColumnFamilyHandle* column_family,
+ const Range& range,
+ uint64_t* const count,
+ uint64_t* const size) {
+ ColumnFamilyHandleImpl* cfh =
+ static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+ ColumnFamilyData* cfd = cfh->cfd();
+ SuperVersion* sv = GetAndRefSuperVersion(cfd);
+
+ // Convert user_key into a corresponding internal key.
+ InternalKey k1(range.start, kMaxSequenceNumber, kValueTypeForSeek);
+ InternalKey k2(range.limit, kMaxSequenceNumber, kValueTypeForSeek);
+ MemTable::MemTableStats memStats =
+ sv->mem->ApproximateStats(k1.Encode(), k2.Encode());
+ MemTable::MemTableStats immStats =
+ sv->imm->ApproximateStats(k1.Encode(), k2.Encode());
+ *count = memStats.count + immStats.count;
+ *size = memStats.size + immStats.size;
+
+ ReturnAndCleanupSuperVersion(cfd, sv);
+}
+
+Status DBImpl::GetApproximateSizes(const SizeApproximationOptions& options,
+ ColumnFamilyHandle* column_family,
+ const Range* range, int n, uint64_t* sizes) {
+ if (!options.include_memtables && !options.include_files) {
+ return Status::InvalidArgument("Invalid options");
+ }
+
+ const Comparator* const ucmp = column_family->GetComparator();
+ assert(ucmp);
+ size_t ts_sz = ucmp->timestamp_size();
+
+ Version* v;
+ auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+ auto cfd = cfh->cfd();
+ SuperVersion* sv = GetAndRefSuperVersion(cfd);
+ v = sv->current;
+
+ for (int i = 0; i < n; i++) {
+ Slice start = range[i].start;
+ Slice limit = range[i].limit;
+
+ // Add timestamp if needed
+ std::string start_with_ts, limit_with_ts;
+ if (ts_sz > 0) {
+      // Maximum timestamp means including all keys with any timestamp
+ AppendKeyWithMaxTimestamp(&start_with_ts, start, ts_sz);
+ // Append a maximum timestamp as the range limit is exclusive:
+ // [start, limit)
+ AppendKeyWithMaxTimestamp(&limit_with_ts, limit, ts_sz);
+ start = start_with_ts;
+ limit = limit_with_ts;
+ }
+ // Convert user_key into a corresponding internal key.
+ InternalKey k1(start, kMaxSequenceNumber, kValueTypeForSeek);
+ InternalKey k2(limit, kMaxSequenceNumber, kValueTypeForSeek);
+ sizes[i] = 0;
+ if (options.include_files) {
+ sizes[i] += versions_->ApproximateSize(
+ options, v, k1.Encode(), k2.Encode(), /*start_level=*/0,
+ /*end_level=*/-1, TableReaderCaller::kUserApproximateSize);
+ }
+ if (options.include_memtables) {
+ sizes[i] += sv->mem->ApproximateStats(k1.Encode(), k2.Encode()).size;
+ sizes[i] += sv->imm->ApproximateStats(k1.Encode(), k2.Encode()).size;
+ }
+ }
+
+ ReturnAndCleanupSuperVersion(cfd, sv);
+ return Status::OK();
+}
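+
+// Example (illustrative sketch): estimating the size of a key range with the
+// API above; `db`, `cf` and the key bounds are hypothetical.
+//
+//   SizeApproximationOptions opts;
+//   opts.include_memtables = true;  // also count memtable data
+//   Range r("a", "z");
+//   uint64_t size = 0;
+//   Status s = db->GetApproximateSizes(opts, cf, &r, 1, &size);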
+
+std::list<uint64_t>::iterator
+DBImpl::CaptureCurrentFileNumberInPendingOutputs() {
+ // We need to remember the iterator of our insert, because after the
+ // background job is done, we need to remove that element from
+ // pending_outputs_.
+ pending_outputs_.push_back(versions_->current_next_file_number());
+ auto pending_outputs_inserted_elem = pending_outputs_.end();
+ --pending_outputs_inserted_elem;
+ return pending_outputs_inserted_elem;
+}
+
+void DBImpl::ReleaseFileNumberFromPendingOutputs(
+ std::unique_ptr<std::list<uint64_t>::iterator>& v) {
+ if (v.get() != nullptr) {
+ pending_outputs_.erase(*v.get());
+ v.reset();
+ }
+}
+
+#ifndef ROCKSDB_LITE
+Status DBImpl::GetUpdatesSince(
+ SequenceNumber seq, std::unique_ptr<TransactionLogIterator>* iter,
+ const TransactionLogIterator::ReadOptions& read_options) {
+ RecordTick(stats_, GET_UPDATES_SINCE_CALLS);
+ if (seq_per_batch_) {
+ return Status::NotSupported(
+ "This API is not yet compatible with write-prepared/write-unprepared "
+ "transactions");
+ }
+ if (seq > versions_->LastSequence()) {
+ return Status::NotFound("Requested sequence not yet written in the db");
+ }
+ return wal_manager_.GetUpdatesSince(seq, iter, read_options, versions_.get());
+}
+
+Status DBImpl::DeleteFile(std::string name) {
+ uint64_t number;
+ FileType type;
+ WalFileType log_type;
+ if (!ParseFileName(name, &number, &type, &log_type) ||
+ (type != kTableFile && type != kWalFile)) {
+ ROCKS_LOG_ERROR(immutable_db_options_.info_log, "DeleteFile %s failed.\n",
+ name.c_str());
+ return Status::InvalidArgument("Invalid file name");
+ }
+
+ if (type == kWalFile) {
+ // Only allow deleting archived log files
+ if (log_type != kArchivedLogFile) {
+ ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+ "DeleteFile %s failed - not archived log.\n",
+ name.c_str());
+ return Status::NotSupported("Delete only supported for archived logs");
+ }
+ Status status = wal_manager_.DeleteFile(name, number);
+ if (!status.ok()) {
+ ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+ "DeleteFile %s failed -- %s.\n", name.c_str(),
+ status.ToString().c_str());
+ }
+ return status;
+ }
+
+ Status status;
+ int level;
+ FileMetaData* metadata;
+ ColumnFamilyData* cfd;
+ VersionEdit edit;
+ JobContext job_context(next_job_id_.fetch_add(1), true);
+ {
+ InstrumentedMutexLock l(&mutex_);
+ status = versions_->GetMetadataForFile(number, &level, &metadata, &cfd);
+ if (!status.ok()) {
+ ROCKS_LOG_WARN(immutable_db_options_.info_log,
+ "DeleteFile %s failed. File not found\n", name.c_str());
+ job_context.Clean();
+ return Status::InvalidArgument("File not found");
+ }
+ assert(level < cfd->NumberLevels());
+
+    // If the file is being compacted, there is no need to delete it.
+ if (metadata->being_compacted) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "DeleteFile %s Skipped. File about to be compacted\n",
+ name.c_str());
+ job_context.Clean();
+ return Status::OK();
+ }
+
+ // Only the files in the last level can be deleted externally.
+ // This is to make sure that any deletion tombstones are not
+ // lost. Check that the level passed is the last level.
+    auto* vstorage = cfd->current()->storage_info();
+ for (int i = level + 1; i < cfd->NumberLevels(); i++) {
+      if (vstorage->NumLevelFiles(i) != 0) {
+ ROCKS_LOG_WARN(immutable_db_options_.info_log,
+ "DeleteFile %s FAILED. File not in last level\n",
+ name.c_str());
+ job_context.Clean();
+ return Status::InvalidArgument("File not in last level");
+ }
+ }
+ // if level == 0, it has to be the oldest file
+ if (level == 0 &&
+        vstorage->LevelFiles(0).back()->fd.GetNumber() != number) {
+ ROCKS_LOG_WARN(immutable_db_options_.info_log,
+ "DeleteFile %s failed ---"
+ " target file in level 0 must be the oldest.",
+ name.c_str());
+ job_context.Clean();
+ return Status::InvalidArgument("File in level 0, but not oldest");
+ }
+ edit.SetColumnFamily(cfd->GetID());
+ edit.DeleteFile(level, number);
+ status = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(),
+ &edit, &mutex_, directories_.GetDbDir());
+ if (status.ok()) {
+ InstallSuperVersionAndScheduleWork(cfd,
+ &job_context.superversion_contexts[0],
+ *cfd->GetLatestMutableCFOptions());
+ }
+ FindObsoleteFiles(&job_context, false);
+ } // lock released here
+
+ LogFlush(immutable_db_options_.info_log);
+ // remove files outside the db-lock
+ if (job_context.HaveSomethingToDelete()) {
+ // Call PurgeObsoleteFiles() without holding mutex.
+ PurgeObsoleteFiles(job_context);
+ }
+ job_context.Clean();
+ return status;
+}
+
+Status DBImpl::DeleteFilesInRanges(ColumnFamilyHandle* column_family,
+ const RangePtr* ranges, size_t n,
+ bool include_end) {
+ Status status = Status::OK();
+ auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+ ColumnFamilyData* cfd = cfh->cfd();
+ VersionEdit edit;
+ std::set<FileMetaData*> deleted_files;
+ JobContext job_context(next_job_id_.fetch_add(1), true);
+ {
+ InstrumentedMutexLock l(&mutex_);
+ Version* input_version = cfd->current();
+
+ auto* vstorage = input_version->storage_info();
+ for (size_t r = 0; r < n; r++) {
+ auto begin = ranges[r].start, end = ranges[r].limit;
+ for (int i = 1; i < cfd->NumberLevels(); i++) {
+ if (vstorage->LevelFiles(i).empty() ||
+ !vstorage->OverlapInLevel(i, begin, end)) {
+ continue;
+ }
+ std::vector<FileMetaData*> level_files;
+ InternalKey begin_storage, end_storage, *begin_key, *end_key;
+ if (begin == nullptr) {
+ begin_key = nullptr;
+ } else {
+ begin_storage.SetMinPossibleForUserKey(*begin);
+ begin_key = &begin_storage;
+ }
+ if (end == nullptr) {
+ end_key = nullptr;
+ } else {
+ end_storage.SetMaxPossibleForUserKey(*end);
+ end_key = &end_storage;
+ }
+
+ vstorage->GetCleanInputsWithinInterval(
+ i, begin_key, end_key, &level_files, -1 /* hint_index */,
+ nullptr /* file_index */);
+ FileMetaData* level_file;
+ for (uint32_t j = 0; j < level_files.size(); j++) {
+ level_file = level_files[j];
+ if (level_file->being_compacted) {
+ continue;
+ }
+ if (deleted_files.find(level_file) != deleted_files.end()) {
+ continue;
+ }
+ if (!include_end && end != nullptr &&
+ cfd->user_comparator()->Compare(level_file->largest.user_key(),
+ *end) == 0) {
+ continue;
+ }
+ edit.SetColumnFamily(cfd->GetID());
+ edit.DeleteFile(i, level_file->fd.GetNumber());
+ deleted_files.insert(level_file);
+ level_file->being_compacted = true;
+ }
+ vstorage->ComputeCompactionScore(*cfd->ioptions(),
+ *cfd->GetLatestMutableCFOptions());
+ }
+ }
+ if (edit.GetDeletedFiles().empty()) {
+ job_context.Clean();
+ return status;
+ }
+ input_version->Ref();
+ status = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(),
+ &edit, &mutex_, directories_.GetDbDir());
+ if (status.ok()) {
+ InstallSuperVersionAndScheduleWork(cfd,
+ &job_context.superversion_contexts[0],
+ *cfd->GetLatestMutableCFOptions());
+ }
+ for (auto* deleted_file : deleted_files) {
+ deleted_file->being_compacted = false;
+ }
+ input_version->Unref();
+ FindObsoleteFiles(&job_context, false);
+ } // lock released here
+
+ LogFlush(immutable_db_options_.info_log);
+ // remove files outside the db-lock
+ if (job_context.HaveSomethingToDelete()) {
+ // Call PurgeObsoleteFiles() without holding mutex.
+ PurgeObsoleteFiles(job_context);
+ }
+ job_context.Clean();
+ return status;
+}
+
+void DBImpl::GetLiveFilesMetaData(std::vector<LiveFileMetaData>* metadata) {
+ InstrumentedMutexLock l(&mutex_);
+ versions_->GetLiveFilesMetaData(metadata);
+}
+
+Status DBImpl::GetLiveFilesChecksumInfo(FileChecksumList* checksum_list) {
+ InstrumentedMutexLock l(&mutex_);
+ return versions_->GetLiveFilesChecksumInfo(checksum_list);
+}
+
+void DBImpl::GetColumnFamilyMetaData(ColumnFamilyHandle* column_family,
+ ColumnFamilyMetaData* cf_meta) {
+ assert(column_family);
+ auto* cfd =
+ static_cast_with_check<ColumnFamilyHandleImpl>(column_family)->cfd();
+ auto* sv = GetAndRefSuperVersion(cfd);
+ {
+    // Without the mutex, Version::GetColumnFamilyMetaData has a data race
+    // with Compaction::MarkFilesBeingCompacted. One solution is to take the
+    // mutex, but this may cause a regression. An alternative is to make
+    // FileMetaData::being_compacted atomic, but that would make FileMetaData
+    // non-copyable. Another option is to separate these variables from the
+    // original FileMetaData struct, which requires re-organizing the data
+    // structures. For now, we take the easy approach: if
+    // DB::GetColumnFamilyMetaData is not called frequently, the regression
+    // should not be significant. We still need to keep an eye on it.
+ InstrumentedMutexLock l(&mutex_);
+ sv->current->GetColumnFamilyMetaData(cf_meta);
+ }
+ ReturnAndCleanupSuperVersion(cfd, sv);
+}
+
+void DBImpl::GetAllColumnFamilyMetaData(
+ std::vector<ColumnFamilyMetaData>* metadata) {
+ InstrumentedMutexLock l(&mutex_);
+ for (auto cfd : *(versions_->GetColumnFamilySet())) {
+ {
+ metadata->emplace_back();
+ cfd->current()->GetColumnFamilyMetaData(&metadata->back());
+ }
+ }
+}
+
+#endif // ROCKSDB_LITE
+
+Status DBImpl::CheckConsistency() {
+ mutex_.AssertHeld();
+ std::vector<LiveFileMetaData> metadata;
+ versions_->GetLiveFilesMetaData(&metadata);
+ TEST_SYNC_POINT("DBImpl::CheckConsistency:AfterGetLiveFilesMetaData");
+
+ std::string corruption_messages;
+
+ if (immutable_db_options_.skip_checking_sst_file_sizes_on_db_open) {
+ // Instead of calling GetFileSize() for each expected file, call
+ // GetChildren() for the DB directory and check that all expected files
+ // are listed, without checking their sizes.
+ // Since sst files might be in different directories, do it for each
+ // directory separately.
+ std::map<std::string, std::vector<std::string>> files_by_directory;
+ for (const auto& md : metadata) {
+ // md.name has a leading "/". Remove it.
+ std::string fname = md.name;
+ if (!fname.empty() && fname[0] == '/') {
+ fname = fname.substr(1);
+ }
+ files_by_directory[md.db_path].push_back(fname);
+ }
+
+ IOOptions io_opts;
+ io_opts.do_not_recurse = true;
+ for (const auto& dir_files : files_by_directory) {
+ std::string directory = dir_files.first;
+ std::vector<std::string> existing_files;
+ Status s = fs_->GetChildren(directory, io_opts, &existing_files,
+ /*IODebugContext*=*/nullptr);
+ if (!s.ok()) {
+ corruption_messages +=
+ "Can't list files in " + directory + ": " + s.ToString() + "\n";
+ continue;
+ }
+ std::sort(existing_files.begin(), existing_files.end());
+
+ for (const std::string& fname : dir_files.second) {
+ if (!std::binary_search(existing_files.begin(), existing_files.end(),
+ fname) &&
+ !std::binary_search(existing_files.begin(), existing_files.end(),
+ Rocks2LevelTableFileName(fname))) {
+ corruption_messages +=
+ "Missing sst file " + fname + " in " + directory + "\n";
+ }
+ }
+ }
+ } else {
+ for (const auto& md : metadata) {
+ // md.name has a leading "/".
+ std::string file_path = md.db_path + md.name;
+
+ uint64_t fsize = 0;
+ TEST_SYNC_POINT("DBImpl::CheckConsistency:BeforeGetFileSize");
+ Status s = env_->GetFileSize(file_path, &fsize);
+ if (!s.ok() &&
+ env_->GetFileSize(Rocks2LevelTableFileName(file_path), &fsize).ok()) {
+ s = Status::OK();
+ }
+ if (!s.ok()) {
+ corruption_messages +=
+ "Can't access " + md.name + ": " + s.ToString() + "\n";
+ } else if (fsize != md.size) {
+ corruption_messages += "Sst file size mismatch: " + file_path +
+ ". Size recorded in manifest " +
+ std::to_string(md.size) + ", actual size " +
+ std::to_string(fsize) + "\n";
+ }
+ }
+ }
+
+ if (corruption_messages.size() == 0) {
+ return Status::OK();
+ } else {
+ return Status::Corruption(corruption_messages);
+ }
+}
+
+Status DBImpl::GetDbIdentity(std::string& identity) const {
+ identity.assign(db_id_);
+ return Status::OK();
+}
+
+Status DBImpl::GetDbIdentityFromIdentityFile(std::string* identity) const {
+ std::string idfilename = IdentityFileName(dbname_);
+ const FileOptions soptions;
+
+ Status s = ReadFileToString(fs_.get(), idfilename, identity);
+ if (!s.ok()) {
+ return s;
+ }
+
+ // If last character is '\n' remove it from identity. (Old implementations
+ // of Env::GenerateUniqueId() would include a trailing '\n'.)
+ if (identity->size() > 0 && identity->back() == '\n') {
+ identity->pop_back();
+ }
+ return s;
+}
+
+Status DBImpl::GetDbSessionId(std::string& session_id) const {
+ session_id.assign(db_session_id_);
+ return Status::OK();
+}
+
+namespace {
+SemiStructuredUniqueIdGen* DbSessionIdGen() {
+ static SemiStructuredUniqueIdGen gen;
+ return &gen;
+}
+} // namespace
+
+void DBImpl::TEST_ResetDbSessionIdGen() { DbSessionIdGen()->Reset(); }
+
+std::string DBImpl::GenerateDbSessionId(Env*) {
+ // See SemiStructuredUniqueIdGen for its desirable properties.
+ auto gen = DbSessionIdGen();
+
+ uint64_t lo, hi;
+ gen->GenerateNext(&hi, &lo);
+ if (lo == 0) {
+ // Avoid emitting session ID with lo==0, so that SST unique
+ // IDs can be more easily ensured non-zero
+ gen->GenerateNext(&hi, &lo);
+ assert(lo != 0);
+ }
+ return EncodeSessionId(hi, lo);
+}
+
+void DBImpl::SetDbSessionId() {
+ db_session_id_ = GenerateDbSessionId(env_);
+ TEST_SYNC_POINT_CALLBACK("DBImpl::SetDbSessionId", &db_session_id_);
+}
+
+// Default implementation -- returns not supported status
+Status DB::CreateColumnFamily(const ColumnFamilyOptions& /*cf_options*/,
+ const std::string& /*column_family_name*/,
+ ColumnFamilyHandle** /*handle*/) {
+ return Status::NotSupported("");
+}
+
+Status DB::CreateColumnFamilies(
+ const ColumnFamilyOptions& /*cf_options*/,
+ const std::vector<std::string>& /*column_family_names*/,
+ std::vector<ColumnFamilyHandle*>* /*handles*/) {
+ return Status::NotSupported("");
+}
+
+Status DB::CreateColumnFamilies(
+ const std::vector<ColumnFamilyDescriptor>& /*column_families*/,
+ std::vector<ColumnFamilyHandle*>* /*handles*/) {
+ return Status::NotSupported("");
+}
+
+Status DB::DropColumnFamily(ColumnFamilyHandle* /*column_family*/) {
+ return Status::NotSupported("");
+}
+
+Status DB::DropColumnFamilies(
+ const std::vector<ColumnFamilyHandle*>& /*column_families*/) {
+ return Status::NotSupported("");
+}
+
+Status DB::DestroyColumnFamilyHandle(ColumnFamilyHandle* column_family) {
+ if (DefaultColumnFamily() == column_family) {
+ return Status::InvalidArgument(
+ "Cannot destroy the handle returned by DefaultColumnFamily()");
+ }
+ delete column_family;
+ return Status::OK();
+}
+
+DB::~DB() {}
+
+Status DBImpl::Close() {
+ InstrumentedMutexLock closing_lock_guard(&closing_mutex_);
+ if (closed_) {
+ return closing_status_;
+ }
+
+ {
+ const Status s = MaybeReleaseTimestampedSnapshotsAndCheck();
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ closing_status_ = CloseImpl();
+ closed_ = true;
+ return closing_status_;
+}
+
+Status DB::ListColumnFamilies(const DBOptions& db_options,
+ const std::string& name,
+ std::vector<std::string>* column_families) {
+ const std::shared_ptr<FileSystem>& fs = db_options.env->GetFileSystem();
+ return VersionSet::ListColumnFamilies(column_families, name, fs.get());
+}
+
+Snapshot::~Snapshot() {}
+
+Status DestroyDB(const std::string& dbname, const Options& options,
+ const std::vector<ColumnFamilyDescriptor>& column_families) {
+ ImmutableDBOptions soptions(SanitizeOptions(dbname, options));
+ Env* env = soptions.env;
+ std::vector<std::string> filenames;
+ bool wal_in_db_path = soptions.IsWalDirSameAsDBPath();
+
+ // Reset the logger because it holds a handle to the
+ // log file and prevents cleanup and directory removal
+ soptions.info_log.reset();
+ IOOptions io_opts;
+ // Ignore error in case directory does not exist
+ soptions.fs
+ ->GetChildren(dbname, io_opts, &filenames,
+ /*IODebugContext*=*/nullptr)
+ .PermitUncheckedError();
+
+ FileLock* lock;
+ const std::string lockname = LockFileName(dbname);
+ Status result = env->LockFile(lockname, &lock);
+ if (result.ok()) {
+ uint64_t number;
+ FileType type;
+ InfoLogPrefix info_log_prefix(!soptions.db_log_dir.empty(), dbname);
+ for (const auto& fname : filenames) {
+ if (ParseFileName(fname, &number, info_log_prefix.prefix, &type) &&
+ type != kDBLockFile) { // Lock file will be deleted at end
+ Status del;
+ std::string path_to_delete = dbname + "/" + fname;
+ if (type == kMetaDatabase) {
+ del = DestroyDB(path_to_delete, options);
+ } else if (type == kTableFile || type == kWalFile ||
+ type == kBlobFile) {
+ del = DeleteDBFile(
+ &soptions, path_to_delete, dbname,
+ /*force_bg=*/false,
+ /*force_fg=*/(type == kWalFile) ? !wal_in_db_path : false);
+ } else {
+ del = env->DeleteFile(path_to_delete);
+ }
+ if (!del.ok() && result.ok()) {
+ result = del;
+ }
+ }
+ }
+
+ std::set<std::string> paths;
+ for (const DbPath& db_path : options.db_paths) {
+ paths.insert(db_path.path);
+ }
+ for (const ColumnFamilyDescriptor& cf : column_families) {
+ for (const DbPath& cf_path : cf.options.cf_paths) {
+ paths.insert(cf_path.path);
+ }
+ }
+
+ for (const auto& path : paths) {
+ if (soptions.fs
+ ->GetChildren(path, io_opts, &filenames,
+ /*IODebugContext*=*/nullptr)
+ .ok()) {
+ for (const auto& fname : filenames) {
+ if (ParseFileName(fname, &number, &type) &&
+ (type == kTableFile ||
+               type == kBlobFile)) {
+ std::string file_path = path + "/" + fname;
+ Status del = DeleteDBFile(&soptions, file_path, dbname,
+ /*force_bg=*/false, /*force_fg=*/false);
+ if (!del.ok() && result.ok()) {
+ result = del;
+ }
+ }
+ }
+ // TODO: Should we return an error if we cannot delete the directory?
+ env->DeleteDir(path).PermitUncheckedError();
+ }
+ }
+
+ std::vector<std::string> walDirFiles;
+ std::string archivedir = ArchivalDirectory(dbname);
+ bool wal_dir_exists = false;
+ if (!soptions.IsWalDirSameAsDBPath(dbname)) {
+ wal_dir_exists =
+ soptions.fs
+ ->GetChildren(soptions.wal_dir, io_opts, &walDirFiles,
+ /*IODebugContext*=*/nullptr)
+ .ok();
+ archivedir = ArchivalDirectory(soptions.wal_dir);
+ }
+
+    // The archive dir may be inside the wal dir or dbname and should be
+    // processed and removed before those; otherwise we have issues
+    // removing them.
+ std::vector<std::string> archiveFiles;
+ if (soptions.fs
+ ->GetChildren(archivedir, io_opts, &archiveFiles,
+ /*IODebugContext*=*/nullptr)
+ .ok()) {
+ // Delete archival files.
+ for (const auto& file : archiveFiles) {
+ if (ParseFileName(file, &number, &type) && type == kWalFile) {
+ Status del =
+ DeleteDBFile(&soptions, archivedir + "/" + file, archivedir,
+ /*force_bg=*/false, /*force_fg=*/!wal_in_db_path);
+ if (!del.ok() && result.ok()) {
+ result = del;
+ }
+ }
+ }
+ // Ignore error in case dir contains other files
+ env->DeleteDir(archivedir).PermitUncheckedError();
+ }
+
+ // Delete log files in the WAL dir
+ if (wal_dir_exists) {
+ for (const auto& file : walDirFiles) {
+ if (ParseFileName(file, &number, &type) && type == kWalFile) {
+ Status del =
+ DeleteDBFile(&soptions, LogFileName(soptions.wal_dir, number),
+ soptions.wal_dir, /*force_bg=*/false,
+ /*force_fg=*/!wal_in_db_path);
+ if (!del.ok() && result.ok()) {
+ result = del;
+ }
+ }
+ }
+ // Ignore error in case dir contains other files
+ env->DeleteDir(soptions.wal_dir).PermitUncheckedError();
+ }
+
+ // Ignore error since state is already gone
+ env->UnlockFile(lock).PermitUncheckedError();
+ env->DeleteFile(lockname).PermitUncheckedError();
+
+ // sst_file_manager holds a ref to the logger. Make sure the logger is
+ // gone before trying to remove the directory.
+ soptions.sst_file_manager.reset();
+
+ // Ignore error in case dir contains other files
+ env->DeleteDir(dbname).PermitUncheckedError();
+ }
+ return result;
+}
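+
+// Example (illustrative sketch): removing all files of a closed database with
+// DestroyDB() above; the path is hypothetical.
+//
+//   Options options;
+//   Status s = DestroyDB("/tmp/testdb", options);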
+
+Status DBImpl::WriteOptionsFile(bool need_mutex_lock,
+ bool need_enter_write_thread) {
+#ifndef ROCKSDB_LITE
+ WriteThread::Writer w;
+ if (need_mutex_lock) {
+ mutex_.Lock();
+ } else {
+ mutex_.AssertHeld();
+ }
+ if (need_enter_write_thread) {
+ write_thread_.EnterUnbatched(&w, &mutex_);
+ }
+
+ std::vector<std::string> cf_names;
+ std::vector<ColumnFamilyOptions> cf_opts;
+
+ // This part requires mutex to protect the column family options
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ if (cfd->IsDropped()) {
+ continue;
+ }
+ cf_names.push_back(cfd->GetName());
+ cf_opts.push_back(cfd->GetLatestCFOptions());
+ }
+
+ // Unlock during expensive operations. New writes cannot get here
+ // because the single write thread ensures all new writes get queued.
+ DBOptions db_options =
+ BuildDBOptions(immutable_db_options_, mutable_db_options_);
+ mutex_.Unlock();
+
+ TEST_SYNC_POINT("DBImpl::WriteOptionsFile:1");
+ TEST_SYNC_POINT("DBImpl::WriteOptionsFile:2");
+ TEST_SYNC_POINT_CALLBACK("DBImpl::WriteOptionsFile:PersistOptions",
+ &db_options);
+
+ std::string file_name =
+ TempOptionsFileName(GetName(), versions_->NewFileNumber());
+ Status s = PersistRocksDBOptions(db_options, cf_names, cf_opts, file_name,
+ fs_.get());
+
+ if (s.ok()) {
+ s = RenameTempFileToOptionsFile(file_name);
+ }
+ // restore lock
+ if (!need_mutex_lock) {
+ mutex_.Lock();
+ }
+ if (need_enter_write_thread) {
+ write_thread_.ExitUnbatched(&w);
+ }
+ if (!s.ok()) {
+ ROCKS_LOG_WARN(immutable_db_options_.info_log,
+ "Unnable to persist options -- %s", s.ToString().c_str());
+ if (immutable_db_options_.fail_if_options_file_error) {
+ return Status::IOError("Unable to persist options.",
+ s.ToString().c_str());
+ }
+ }
+#else
+ (void)need_mutex_lock;
+ (void)need_enter_write_thread;
+#endif // !ROCKSDB_LITE
+ return Status::OK();
+}
+
+#ifndef ROCKSDB_LITE
+namespace {
+void DeleteOptionsFilesHelper(const std::map<uint64_t, std::string>& filenames,
+ const size_t num_files_to_keep,
+ const std::shared_ptr<Logger>& info_log,
+ Env* env) {
+ if (filenames.size() <= num_files_to_keep) {
+ return;
+ }
+ for (auto iter = std::next(filenames.begin(), num_files_to_keep);
+ iter != filenames.end(); ++iter) {
+ if (!env->DeleteFile(iter->second).ok()) {
+ ROCKS_LOG_WARN(info_log, "Unable to delete options file %s",
+ iter->second.c_str());
+ }
+ }
+}
+} // namespace
+#endif // !ROCKSDB_LITE
+
+Status DBImpl::DeleteObsoleteOptionsFiles() {
+#ifndef ROCKSDB_LITE
+ std::vector<std::string> filenames;
+  // Use an ordered map to keep the filenames sorted from the newest
+  // to the oldest.
+ std::map<uint64_t, std::string> options_filenames;
+ Status s;
+ IOOptions io_opts;
+ io_opts.do_not_recurse = true;
+ s = fs_->GetChildren(GetName(), io_opts, &filenames,
+ /*IODebugContext*=*/nullptr);
+ if (!s.ok()) {
+ return s;
+ }
+ for (auto& filename : filenames) {
+ uint64_t file_number;
+ FileType type;
+ if (ParseFileName(filename, &file_number, &type) && type == kOptionsFile) {
+ options_filenames.insert(
+ {std::numeric_limits<uint64_t>::max() - file_number,
+ GetName() + "/" + filename});
+ }
+ }
+
+  // Keep the latest 2 options files.
+ const size_t kNumOptionsFilesKept = 2;
+ DeleteOptionsFilesHelper(options_filenames, kNumOptionsFilesKept,
+ immutable_db_options_.info_log, GetEnv());
+ return Status::OK();
+#else
+ return Status::OK();
+#endif // !ROCKSDB_LITE
+}
+
+Status DBImpl::RenameTempFileToOptionsFile(const std::string& file_name) {
+#ifndef ROCKSDB_LITE
+ Status s;
+
+ uint64_t options_file_number = versions_->NewFileNumber();
+ std::string options_file_name =
+ OptionsFileName(GetName(), options_file_number);
+ uint64_t options_file_size = 0;
+ s = GetEnv()->GetFileSize(file_name, &options_file_size);
+ if (s.ok()) {
+    // Retry if the file name happens to conflict with an existing one.
+ s = GetEnv()->RenameFile(file_name, options_file_name);
+ std::unique_ptr<FSDirectory> dir_obj;
+ if (s.ok()) {
+ s = fs_->NewDirectory(GetName(), IOOptions(), &dir_obj, nullptr);
+ }
+ if (s.ok()) {
+ s = dir_obj->FsyncWithDirOptions(IOOptions(), nullptr,
+ DirFsyncOptions(options_file_name));
+ }
+ if (s.ok()) {
+ Status temp_s = dir_obj->Close(IOOptions(), nullptr);
+      // The default Close() could return "NotSupported" and we bypass it
+      // if it is not implemented. Detailed explanations can be found in
+      // db/db_impl/db_impl.h
+ if (!temp_s.ok()) {
+ if (temp_s.IsNotSupported()) {
+ temp_s.PermitUncheckedError();
+ } else {
+ s = temp_s;
+ }
+ }
+ }
+ }
+ if (s.ok()) {
+ InstrumentedMutexLock l(&mutex_);
+ versions_->options_file_number_ = options_file_number;
+ versions_->options_file_size_ = options_file_size;
+ }
+
+ if (0 == disable_delete_obsolete_files_) {
+ // TODO: Should we check for errors here?
+ DeleteObsoleteOptionsFiles().PermitUncheckedError();
+ }
+ return s;
+#else
+ (void)file_name;
+ return Status::OK();
+#endif // !ROCKSDB_LITE
+}
+
+#ifdef ROCKSDB_USING_THREAD_STATUS
+
+void DBImpl::NewThreadStatusCfInfo(ColumnFamilyData* cfd) const {
+ if (immutable_db_options_.enable_thread_tracking) {
+ ThreadStatusUtil::NewColumnFamilyInfo(this, cfd, cfd->GetName(),
+ cfd->ioptions()->env);
+ }
+}
+
+void DBImpl::EraseThreadStatusCfInfo(ColumnFamilyData* cfd) const {
+ if (immutable_db_options_.enable_thread_tracking) {
+ ThreadStatusUtil::EraseColumnFamilyInfo(cfd);
+ }
+}
+
+void DBImpl::EraseThreadStatusDbInfo() const {
+ if (immutable_db_options_.enable_thread_tracking) {
+ ThreadStatusUtil::EraseDatabaseInfo(this);
+ }
+}
+
+#else
+void DBImpl::NewThreadStatusCfInfo(ColumnFamilyData* /*cfd*/) const {}
+
+void DBImpl::EraseThreadStatusCfInfo(ColumnFamilyData* /*cfd*/) const {}
+
+void DBImpl::EraseThreadStatusDbInfo() const {}
+#endif // ROCKSDB_USING_THREAD_STATUS
+
+// A global method that can dump out the build version
+void DumpRocksDBBuildVersion(Logger* log) {
+ ROCKS_LOG_HEADER(log, "RocksDB version: %s\n",
+ GetRocksVersionAsString().c_str());
+ const auto& props = GetRocksBuildProperties();
+ const auto& sha = props.find("rocksdb_build_git_sha");
+ if (sha != props.end()) {
+ ROCKS_LOG_HEADER(log, "Git sha %s", sha->second.c_str());
+ }
+ const auto date = props.find("rocksdb_build_date");
+ if (date != props.end()) {
+ ROCKS_LOG_HEADER(log, "Compile date %s", date->second.c_str());
+ }
+}
+
+#ifndef ROCKSDB_LITE
+SequenceNumber DBImpl::GetEarliestMemTableSequenceNumber(SuperVersion* sv,
+ bool include_history) {
+ // Find the earliest sequence number that we know we can rely on reading
+ // from the memtable without needing to check sst files.
+ SequenceNumber earliest_seq =
+ sv->imm->GetEarliestSequenceNumber(include_history);
+ if (earliest_seq == kMaxSequenceNumber) {
+ earliest_seq = sv->mem->GetEarliestSequenceNumber();
+ }
+ assert(sv->mem->GetEarliestSequenceNumber() >= earliest_seq);
+
+ return earliest_seq;
+}
+
+Status DBImpl::GetLatestSequenceForKey(
+ SuperVersion* sv, const Slice& key, bool cache_only,
+ SequenceNumber lower_bound_seq, SequenceNumber* seq, std::string* timestamp,
+ bool* found_record_for_key, bool* is_blob_index) {
+ Status s;
+ MergeContext merge_context;
+ SequenceNumber max_covering_tombstone_seq = 0;
+
+ ReadOptions read_options;
+ SequenceNumber current_seq = versions_->LastSequence();
+
+ ColumnFamilyData* cfd = sv->cfd;
+ assert(cfd);
+ const Comparator* const ucmp = cfd->user_comparator();
+ assert(ucmp);
+ size_t ts_sz = ucmp->timestamp_size();
+ std::string ts_buf;
+ if (ts_sz > 0) {
+ assert(timestamp);
+ ts_buf.assign(ts_sz, '\xff');
+ } else {
+ assert(!timestamp);
+ }
+ Slice ts(ts_buf);
+
+ LookupKey lkey(key, current_seq, ts_sz == 0 ? nullptr : &ts);
+
+ *seq = kMaxSequenceNumber;
+ *found_record_for_key = false;
+
+ // Check if there is a record for this key in the latest memtable
+ sv->mem->Get(lkey, /*value=*/nullptr, /*columns=*/nullptr, timestamp, &s,
+ &merge_context, &max_covering_tombstone_seq, seq, read_options,
+ false /* immutable_memtable */, nullptr /*read_callback*/,
+ is_blob_index);
+
+ if (!(s.ok() || s.IsNotFound() || s.IsMergeInProgress())) {
+ // unexpected error reading memtable.
+ ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+ "Unexpected status returned from MemTable::Get: %s\n",
+ s.ToString().c_str());
+
+ return s;
+ }
+ assert(!ts_sz ||
+ (*seq != kMaxSequenceNumber &&
+ *timestamp != std::string(ts_sz, '\xff')) ||
+ (*seq == kMaxSequenceNumber && timestamp->empty()));
+
+ TEST_SYNC_POINT_CALLBACK("DBImpl::GetLatestSequenceForKey:mem", timestamp);
+
+ if (*seq != kMaxSequenceNumber) {
+ // Found a sequence number, no need to check immutable memtables
+ *found_record_for_key = true;
+ return Status::OK();
+ }
+
+ SequenceNumber lower_bound_in_mem = sv->mem->GetEarliestSequenceNumber();
+ if (lower_bound_in_mem != kMaxSequenceNumber &&
+ lower_bound_in_mem < lower_bound_seq) {
+ *found_record_for_key = false;
+ return Status::OK();
+ }
+
+ // Check if there is a record for this key in the immutable memtables
+ sv->imm->Get(lkey, /*value=*/nullptr, /*columns=*/nullptr, timestamp, &s,
+ &merge_context, &max_covering_tombstone_seq, seq, read_options,
+ nullptr /*read_callback*/, is_blob_index);
+
+ if (!(s.ok() || s.IsNotFound() || s.IsMergeInProgress())) {
+ // unexpected error reading memtable.
+ ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+ "Unexpected status returned from MemTableList::Get: %s\n",
+ s.ToString().c_str());
+
+ return s;
+ }
+
+ assert(!ts_sz ||
+ (*seq != kMaxSequenceNumber &&
+ *timestamp != std::string(ts_sz, '\xff')) ||
+ (*seq == kMaxSequenceNumber && timestamp->empty()));
+
+ if (*seq != kMaxSequenceNumber) {
+ // Found a sequence number, no need to check memtable history
+ *found_record_for_key = true;
+ return Status::OK();
+ }
+
+ SequenceNumber lower_bound_in_imm = sv->imm->GetEarliestSequenceNumber();
+ if (lower_bound_in_imm != kMaxSequenceNumber &&
+ lower_bound_in_imm < lower_bound_seq) {
+ *found_record_for_key = false;
+ return Status::OK();
+ }
+
+ // Check if there is a record for this key in the immutable memtables
+ sv->imm->GetFromHistory(lkey, /*value=*/nullptr, /*columns=*/nullptr,
+ timestamp, &s, &merge_context,
+ &max_covering_tombstone_seq, seq, read_options,
+ is_blob_index);
+
+ if (!(s.ok() || s.IsNotFound() || s.IsMergeInProgress())) {
+ // unexpected error reading memtable.
+ ROCKS_LOG_ERROR(
+ immutable_db_options_.info_log,
+ "Unexpected status returned from MemTableList::GetFromHistory: %s\n",
+ s.ToString().c_str());
+
+ return s;
+ }
+
+ assert(!ts_sz ||
+ (*seq != kMaxSequenceNumber &&
+ *timestamp != std::string(ts_sz, '\xff')) ||
+ (*seq == kMaxSequenceNumber && timestamp->empty()));
+
+ if (*seq != kMaxSequenceNumber) {
+ // Found a sequence number, no need to check SST files
+ assert(0 == ts_sz || *timestamp != std::string(ts_sz, '\xff'));
+ *found_record_for_key = true;
+ return Status::OK();
+ }
+
+ // We could do a sv->imm->GetEarliestSequenceNumber(/*include_history*/ true)
+ // check here to skip the history if possible. But currently the caller
+ // already does that. Maybe we should move the logic here later.
+
+ // TODO(agiardullo): possible optimization: consider checking cached
+ // SST files if cache_only=true?
+ if (!cache_only) {
+ // Check tables
+ PinnedIteratorsManager pinned_iters_mgr;
+ sv->current->Get(read_options, lkey, /*value=*/nullptr, /*columns=*/nullptr,
+ timestamp, &s, &merge_context, &max_covering_tombstone_seq,
+ &pinned_iters_mgr, nullptr /* value_found */,
+ found_record_for_key, seq, nullptr /*read_callback*/,
+ is_blob_index);
+
+ if (!(s.ok() || s.IsNotFound() || s.IsMergeInProgress())) {
+ // unexpected error reading SST files
+ ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+ "Unexpected status returned from Version::Get: %s\n",
+ s.ToString().c_str());
+ }
+ }
+
+ return s;
+}
+
+Status DBImpl::IngestExternalFile(
+ ColumnFamilyHandle* column_family,
+ const std::vector<std::string>& external_files,
+ const IngestExternalFileOptions& ingestion_options) {
+ IngestExternalFileArg arg;
+ arg.column_family = column_family;
+ arg.external_files = external_files;
+ arg.options = ingestion_options;
+ return IngestExternalFiles({arg});
+}
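+
+// Illustrative usage sketch (comment only, not part of the implementation):
+// a typical caller-side use of the single-column-family entry point above.
+// The file path and option values are hypothetical.
+//
+//   IngestExternalFileOptions ifo;
+//   ifo.move_files = true;  // link/move the SST file instead of copying it
+//   Status s = db->IngestExternalFile(cf_handle, {"/tmp/bulk/000001.sst"},
+//                                     ifo);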
+
+Status DBImpl::IngestExternalFiles(
+ const std::vector<IngestExternalFileArg>& args) {
+ if (args.empty()) {
+ return Status::InvalidArgument("ingestion arg list is empty");
+ }
+ {
+ std::unordered_set<ColumnFamilyHandle*> unique_cfhs;
+ for (const auto& arg : args) {
+ if (arg.column_family == nullptr) {
+ return Status::InvalidArgument("column family handle is null");
+ } else if (unique_cfhs.count(arg.column_family) > 0) {
+ return Status::InvalidArgument(
+ "ingestion args have duplicate column families");
+ }
+ unique_cfhs.insert(arg.column_family);
+ }
+ }
+ // Ingest multiple external SST files atomically.
+ const size_t num_cfs = args.size();
+ for (size_t i = 0; i != num_cfs; ++i) {
+ if (args[i].external_files.empty()) {
+ char err_msg[128] = {0};
+ snprintf(err_msg, 128, "external_files[%zu] is empty", i);
+ return Status::InvalidArgument(err_msg);
+ }
+ }
+ for (const auto& arg : args) {
+ const IngestExternalFileOptions& ingest_opts = arg.options;
+ if (ingest_opts.ingest_behind &&
+ !immutable_db_options_.allow_ingest_behind) {
+ return Status::InvalidArgument(
+ "can't ingest_behind file in DB with allow_ingest_behind=false");
+ }
+ }
+
+ // TODO (yanqin) maybe handle the case in which column_families have
+ // duplicates
+ std::unique_ptr<std::list<uint64_t>::iterator> pending_output_elem;
+ size_t total = 0;
+ for (const auto& arg : args) {
+ total += arg.external_files.size();
+ }
+ uint64_t next_file_number = 0;
+ Status status = ReserveFileNumbersBeforeIngestion(
+ static_cast<ColumnFamilyHandleImpl*>(args[0].column_family)->cfd(), total,
+ pending_output_elem, &next_file_number);
+ if (!status.ok()) {
+ InstrumentedMutexLock l(&mutex_);
+ ReleaseFileNumberFromPendingOutputs(pending_output_elem);
+ return status;
+ }
+
+ std::vector<ExternalSstFileIngestionJob> ingestion_jobs;
+ for (const auto& arg : args) {
+ auto* cfd = static_cast<ColumnFamilyHandleImpl*>(arg.column_family)->cfd();
+ ingestion_jobs.emplace_back(versions_.get(), cfd, immutable_db_options_,
+ file_options_, &snapshots_, arg.options,
+ &directories_, &event_logger_, io_tracer_);
+ }
+
+ // TODO(yanqin) maybe make jobs run in parallel
+ uint64_t start_file_number = next_file_number;
+ for (size_t i = 1; i != num_cfs; ++i) {
+ start_file_number += args[i - 1].external_files.size();
+ auto* cfd =
+ static_cast<ColumnFamilyHandleImpl*>(args[i].column_family)->cfd();
+ SuperVersion* super_version = cfd->GetReferencedSuperVersion(this);
+ Status es = ingestion_jobs[i].Prepare(
+ args[i].external_files, args[i].files_checksums,
+ args[i].files_checksum_func_names, args[i].file_temperature,
+ start_file_number, super_version);
+ // capture first error only
+ if (!es.ok() && status.ok()) {
+ status = es;
+ }
+ CleanupSuperVersion(super_version);
+ }
+ TEST_SYNC_POINT("DBImpl::IngestExternalFiles:BeforeLastJobPrepare:0");
+ TEST_SYNC_POINT("DBImpl::IngestExternalFiles:BeforeLastJobPrepare:1");
+ {
+ auto* cfd =
+ static_cast<ColumnFamilyHandleImpl*>(args[0].column_family)->cfd();
+ SuperVersion* super_version = cfd->GetReferencedSuperVersion(this);
+ Status es = ingestion_jobs[0].Prepare(
+ args[0].external_files, args[0].files_checksums,
+ args[0].files_checksum_func_names, args[0].file_temperature,
+ next_file_number, super_version);
+ if (!es.ok()) {
+ status = es;
+ }
+ CleanupSuperVersion(super_version);
+ }
+ if (!status.ok()) {
+ for (size_t i = 0; i != num_cfs; ++i) {
+ ingestion_jobs[i].Cleanup(status);
+ }
+ InstrumentedMutexLock l(&mutex_);
+ ReleaseFileNumberFromPendingOutputs(pending_output_elem);
+ return status;
+ }
+
+ std::vector<SuperVersionContext> sv_ctxs;
+ for (size_t i = 0; i != num_cfs; ++i) {
+ sv_ctxs.emplace_back(true /* create_superversion */);
+ }
+ TEST_SYNC_POINT("DBImpl::IngestExternalFiles:BeforeJobsRun:0");
+ TEST_SYNC_POINT("DBImpl::IngestExternalFiles:BeforeJobsRun:1");
+ TEST_SYNC_POINT("DBImpl::AddFile:Start");
+ {
+ InstrumentedMutexLock l(&mutex_);
+ TEST_SYNC_POINT("DBImpl::AddFile:MutexLock");
+
+ // Stop writes to the DB by entering both write threads
+ WriteThread::Writer w;
+ write_thread_.EnterUnbatched(&w, &mutex_);
+ WriteThread::Writer nonmem_w;
+ if (two_write_queues_) {
+ nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_);
+ }
+
+    // When unordered_write is enabled, keys are written to the memtable in an
+    // unordered way. If the ingestion job checks the memtable key range before
+    // the keys land in the memtable, the ingestion job may skip the necessary
+    // memtable flush.
+    // So wait here to ensure there is no pending write to the memtable.
+ WaitForPendingWrites();
+
+ num_running_ingest_file_ += static_cast<int>(num_cfs);
+ TEST_SYNC_POINT("DBImpl::IngestExternalFile:AfterIncIngestFileCounter");
+
+ bool at_least_one_cf_need_flush = false;
+ std::vector<bool> need_flush(num_cfs, false);
+ for (size_t i = 0; i != num_cfs; ++i) {
+ auto* cfd =
+ static_cast<ColumnFamilyHandleImpl*>(args[i].column_family)->cfd();
+ if (cfd->IsDropped()) {
+ // TODO (yanqin) investigate whether we should abort ingestion or
+ // proceed with other non-dropped column families.
+ status = Status::InvalidArgument(
+ "cannot ingest an external file into a dropped CF");
+ break;
+ }
+ bool tmp = false;
+ status = ingestion_jobs[i].NeedsFlush(&tmp, cfd->GetSuperVersion());
+ need_flush[i] = tmp;
+ at_least_one_cf_need_flush = (at_least_one_cf_need_flush || tmp);
+ if (!status.ok()) {
+ break;
+ }
+ }
+ TEST_SYNC_POINT_CALLBACK("DBImpl::IngestExternalFile:NeedFlush",
+ &at_least_one_cf_need_flush);
+
+ if (status.ok() && at_least_one_cf_need_flush) {
+ FlushOptions flush_opts;
+ flush_opts.allow_write_stall = true;
+ if (immutable_db_options_.atomic_flush) {
+ autovector<ColumnFamilyData*> cfds_to_flush;
+ SelectColumnFamiliesForAtomicFlush(&cfds_to_flush);
+ mutex_.Unlock();
+ status = AtomicFlushMemTables(cfds_to_flush, flush_opts,
+ FlushReason::kExternalFileIngestion,
+ true /* entered_write_thread */);
+ mutex_.Lock();
+ } else {
+ for (size_t i = 0; i != num_cfs; ++i) {
+ if (need_flush[i]) {
+ mutex_.Unlock();
+ auto* cfd =
+ static_cast<ColumnFamilyHandleImpl*>(args[i].column_family)
+ ->cfd();
+ status = FlushMemTable(cfd, flush_opts,
+ FlushReason::kExternalFileIngestion,
+ true /* entered_write_thread */);
+ mutex_.Lock();
+ if (!status.ok()) {
+ break;
+ }
+ }
+ }
+ }
+ }
+ // Run ingestion jobs.
+ if (status.ok()) {
+ for (size_t i = 0; i != num_cfs; ++i) {
+ status = ingestion_jobs[i].Run();
+ if (!status.ok()) {
+ break;
+ }
+ }
+ }
+ if (status.ok()) {
+ autovector<ColumnFamilyData*> cfds_to_commit;
+ autovector<const MutableCFOptions*> mutable_cf_options_list;
+ autovector<autovector<VersionEdit*>> edit_lists;
+ uint32_t num_entries = 0;
+ for (size_t i = 0; i != num_cfs; ++i) {
+ auto* cfd =
+ static_cast<ColumnFamilyHandleImpl*>(args[i].column_family)->cfd();
+ if (cfd->IsDropped()) {
+ continue;
+ }
+ cfds_to_commit.push_back(cfd);
+ mutable_cf_options_list.push_back(cfd->GetLatestMutableCFOptions());
+ autovector<VersionEdit*> edit_list;
+ edit_list.push_back(ingestion_jobs[i].edit());
+ edit_lists.push_back(edit_list);
+ ++num_entries;
+ }
+ // Mark the version edits as an atomic group if the number of version
+ // edits exceeds 1.
+ if (cfds_to_commit.size() > 1) {
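+        // MarkAtomicGroup records, for each edit, how many edits of the group
+        // are still to come; the last edit records 0, which is how recovery
+        // detects that a complete atomic group has been read.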
+ for (auto& edits : edit_lists) {
+ assert(edits.size() == 1);
+ edits[0]->MarkAtomicGroup(--num_entries);
+ }
+ assert(0 == num_entries);
+ }
+ status =
+ versions_->LogAndApply(cfds_to_commit, mutable_cf_options_list,
+ edit_lists, &mutex_, directories_.GetDbDir());
+      // It is safe to update the VersionSet last seqno here after LogAndApply
+      // since LogAndApply persists the last sequence number from the
+      // VersionEdits, which is taken from the files' largest seqnos and not
+      // from the VersionSet.
+ //
+ // It is necessary to update last seqno here since LogAndApply releases
+ // mutex when persisting MANIFEST file, and the snapshots taken during
+ // that period will not be stable if VersionSet last seqno is updated
+ // before LogAndApply.
+ int consumed_seqno_count =
+ ingestion_jobs[0].ConsumedSequenceNumbersCount();
+ for (size_t i = 1; i != num_cfs; ++i) {
+ consumed_seqno_count =
+ std::max(consumed_seqno_count,
+ ingestion_jobs[i].ConsumedSequenceNumbersCount());
+ }
+ if (consumed_seqno_count > 0) {
+ const SequenceNumber last_seqno = versions_->LastSequence();
+ versions_->SetLastAllocatedSequence(last_seqno + consumed_seqno_count);
+ versions_->SetLastPublishedSequence(last_seqno + consumed_seqno_count);
+ versions_->SetLastSequence(last_seqno + consumed_seqno_count);
+ }
+ }
+
+ if (status.ok()) {
+ for (size_t i = 0; i != num_cfs; ++i) {
+ auto* cfd =
+ static_cast<ColumnFamilyHandleImpl*>(args[i].column_family)->cfd();
+ if (!cfd->IsDropped()) {
+ InstallSuperVersionAndScheduleWork(cfd, &sv_ctxs[i],
+ *cfd->GetLatestMutableCFOptions());
+#ifndef NDEBUG
+ if (0 == i && num_cfs > 1) {
+ TEST_SYNC_POINT(
+ "DBImpl::IngestExternalFiles:InstallSVForFirstCF:0");
+ TEST_SYNC_POINT(
+ "DBImpl::IngestExternalFiles:InstallSVForFirstCF:1");
+ }
+#endif // !NDEBUG
+ }
+ }
+ } else if (versions_->io_status().IsIOError()) {
+ // Error while writing to MANIFEST.
+ // In fact, versions_->io_status() can also be the result of renaming
+ // CURRENT file. With current code, it's just difficult to tell. So just
+      // be pessimistic and try writing to a new MANIFEST.
+ // TODO: distinguish between MANIFEST write and CURRENT renaming
+ const IOStatus& io_s = versions_->io_status();
+ // Should handle return error?
+ error_handler_.SetBGError(io_s, BackgroundErrorReason::kManifestWrite);
+ }
+
+ // Resume writes to the DB
+ if (two_write_queues_) {
+ nonmem_write_thread_.ExitUnbatched(&nonmem_w);
+ }
+ write_thread_.ExitUnbatched(&w);
+
+ if (status.ok()) {
+ for (auto& job : ingestion_jobs) {
+ job.UpdateStats();
+ }
+ }
+ ReleaseFileNumberFromPendingOutputs(pending_output_elem);
+ num_running_ingest_file_ -= static_cast<int>(num_cfs);
+ if (0 == num_running_ingest_file_) {
+ bg_cv_.SignalAll();
+ }
+ TEST_SYNC_POINT("DBImpl::AddFile:MutexUnlock");
+ }
+ // mutex_ is unlocked here
+
+ // Cleanup
+ for (size_t i = 0; i != num_cfs; ++i) {
+ sv_ctxs[i].Clean();
+    // This may roll back jobs that have completed successfully. This is
+    // intended for atomicity.
+ ingestion_jobs[i].Cleanup(status);
+ }
+ if (status.ok()) {
+ for (size_t i = 0; i != num_cfs; ++i) {
+ auto* cfd =
+ static_cast<ColumnFamilyHandleImpl*>(args[i].column_family)->cfd();
+ if (!cfd->IsDropped()) {
+ NotifyOnExternalFileIngested(cfd, ingestion_jobs[i]);
+ }
+ }
+ }
+ return status;
+}
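+
+// Illustrative usage sketch (comment only, not part of the implementation):
+// atomic ingestion into multiple column families via the batched entry point
+// above. Column family handles and paths are hypothetical.
+//
+//   IngestExternalFileArg arg1, arg2;
+//   arg1.column_family = cf1;
+//   arg1.external_files = {"/tmp/bulk/cf1.sst"};
+//   arg2.column_family = cf2;
+//   arg2.external_files = {"/tmp/bulk/cf2.sst"};
+//   Status s = db->IngestExternalFiles({arg1, arg2});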
+
+Status DBImpl::CreateColumnFamilyWithImport(
+ const ColumnFamilyOptions& options, const std::string& column_family_name,
+ const ImportColumnFamilyOptions& import_options,
+ const ExportImportFilesMetaData& metadata, ColumnFamilyHandle** handle) {
+ assert(handle != nullptr);
+ assert(*handle == nullptr);
+ std::string cf_comparator_name = options.comparator->Name();
+ if (cf_comparator_name != metadata.db_comparator_name) {
+ return Status::InvalidArgument("Comparator name mismatch");
+ }
+
+ // Create column family.
+ auto status = CreateColumnFamily(options, column_family_name, handle);
+ if (!status.ok()) {
+ return status;
+ }
+
+ // Import sst files from metadata.
+ auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(*handle);
+ auto cfd = cfh->cfd();
+ ImportColumnFamilyJob import_job(versions_.get(), cfd, immutable_db_options_,
+ file_options_, import_options,
+ metadata.files, io_tracer_);
+
+ SuperVersionContext dummy_sv_ctx(/* create_superversion */ true);
+ VersionEdit dummy_edit;
+ uint64_t next_file_number = 0;
+ std::unique_ptr<std::list<uint64_t>::iterator> pending_output_elem;
+ {
+ // Lock db mutex
+ InstrumentedMutexLock l(&mutex_);
+ if (error_handler_.IsDBStopped()) {
+ // Don't import files when there is a bg_error
+ status = error_handler_.GetBGError();
+ }
+
+    // Make sure that bg cleanup won't delete the files that we are importing.
+ pending_output_elem.reset(new std::list<uint64_t>::iterator(
+ CaptureCurrentFileNumberInPendingOutputs()));
+
+ if (status.ok()) {
+      // If a crash happens after a hard link is established, the Recover
+      // function may reuse a file number that has already been assigned to the
+      // internal file, and this will overwrite the external file. To protect
+      // the external file, we have to make sure the file number will never be
+      // reused.
+ next_file_number = versions_->FetchAddFileNumber(metadata.files.size());
+ auto cf_options = cfd->GetLatestMutableCFOptions();
+ status = versions_->LogAndApply(cfd, *cf_options, &dummy_edit, &mutex_,
+ directories_.GetDbDir());
+ if (status.ok()) {
+ InstallSuperVersionAndScheduleWork(cfd, &dummy_sv_ctx, *cf_options);
+ }
+ }
+ }
+ dummy_sv_ctx.Clean();
+
+ if (status.ok()) {
+ SuperVersion* sv = cfd->GetReferencedSuperVersion(this);
+ status = import_job.Prepare(next_file_number, sv);
+ CleanupSuperVersion(sv);
+ }
+
+ if (status.ok()) {
+ SuperVersionContext sv_context(true /*create_superversion*/);
+ {
+ // Lock db mutex
+ InstrumentedMutexLock l(&mutex_);
+
+ // Stop writes to the DB by entering both write threads
+ WriteThread::Writer w;
+ write_thread_.EnterUnbatched(&w, &mutex_);
+ WriteThread::Writer nonmem_w;
+ if (two_write_queues_) {
+ nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_);
+ }
+
+ num_running_ingest_file_++;
+ assert(!cfd->IsDropped());
+ status = import_job.Run();
+
+ // Install job edit [Mutex will be unlocked here]
+ if (status.ok()) {
+ auto cf_options = cfd->GetLatestMutableCFOptions();
+ status = versions_->LogAndApply(cfd, *cf_options, import_job.edit(),
+ &mutex_, directories_.GetDbDir());
+ if (status.ok()) {
+ InstallSuperVersionAndScheduleWork(cfd, &sv_context, *cf_options);
+ }
+ }
+
+ // Resume writes to the DB
+ if (two_write_queues_) {
+ nonmem_write_thread_.ExitUnbatched(&nonmem_w);
+ }
+ write_thread_.ExitUnbatched(&w);
+
+ num_running_ingest_file_--;
+ if (num_running_ingest_file_ == 0) {
+ bg_cv_.SignalAll();
+ }
+ }
+ // mutex_ is unlocked here
+
+ sv_context.Clean();
+ }
+
+ {
+ InstrumentedMutexLock l(&mutex_);
+ ReleaseFileNumberFromPendingOutputs(pending_output_elem);
+ }
+
+ import_job.Cleanup(status);
+ if (!status.ok()) {
+ Status temp_s = DropColumnFamily(*handle);
+ if (!temp_s.ok()) {
+ ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+ "DropColumnFamily failed with error %s",
+ temp_s.ToString().c_str());
+ }
+ // Always returns Status::OK()
+ temp_s = DestroyColumnFamilyHandle(*handle);
+ assert(temp_s.ok());
+ *handle = nullptr;
+ }
+ return status;
+}
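+
+// Illustrative usage sketch (comment only, not part of the implementation):
+// the ExportImportFilesMetaData consumed above is typically produced by
+// Checkpoint::ExportColumnFamily. The export directory and column family
+// name are hypothetical.
+//
+//   Checkpoint* checkpoint = nullptr;
+//   Status s = Checkpoint::Create(db, &checkpoint);
+//   ExportImportFilesMetaData* metadata = nullptr;
+//   if (s.ok()) {
+//     s = checkpoint->ExportColumnFamily(src_cf, "/tmp/cf_export", &metadata);
+//   }
+//   ColumnFamilyHandle* imported = nullptr;
+//   if (s.ok()) {
+//     s = db->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "imported",
+//                                          ImportColumnFamilyOptions(),
+//                                          *metadata, &imported);
+//   }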
+
+Status DBImpl::VerifyFileChecksums(const ReadOptions& read_options) {
+ return VerifyChecksumInternal(read_options, /*use_file_checksum=*/true);
+}
+
+Status DBImpl::VerifyChecksum(const ReadOptions& read_options) {
+ return VerifyChecksumInternal(read_options, /*use_file_checksum=*/false);
+}
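+
+// Illustrative usage sketch (comment only, not part of the implementation):
+// both entry points above take a ReadOptions that controls verification I/O,
+// e.g. the readahead size. The values below are hypothetical.
+//
+//   ReadOptions ro;
+//   ro.readahead_size = 2 * 1024 * 1024;
+//   Status s1 = db->VerifyFileChecksums(ro);  // compare full-file checksums
+//   Status s2 = db->VerifyChecksum(ro);       // verify per-block checksums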
+
+Status DBImpl::VerifyChecksumInternal(const ReadOptions& read_options,
+ bool use_file_checksum) {
+ // `bytes_read` stat is enabled based on compile-time support and cannot
+ // be dynamically toggled. So we do not need to worry about `PerfLevel`
+ // here, unlike many other `IOStatsContext` / `PerfContext` stats.
+ uint64_t prev_bytes_read = IOSTATS(bytes_read);
+
+ Status s;
+
+ if (use_file_checksum) {
+ FileChecksumGenFactory* const file_checksum_gen_factory =
+ immutable_db_options_.file_checksum_gen_factory.get();
+ if (!file_checksum_gen_factory) {
+ s = Status::InvalidArgument(
+ "Cannot verify file checksum if options.file_checksum_gen_factory is "
+ "null");
+ return s;
+ }
+ }
+
+ // TODO: simplify using GetRefedColumnFamilySet?
+ std::vector<ColumnFamilyData*> cfd_list;
+ {
+ InstrumentedMutexLock l(&mutex_);
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ if (!cfd->IsDropped() && cfd->initialized()) {
+ cfd->Ref();
+ cfd_list.push_back(cfd);
+ }
+ }
+ }
+ std::vector<SuperVersion*> sv_list;
+ for (auto cfd : cfd_list) {
+ sv_list.push_back(cfd->GetReferencedSuperVersion(this));
+ }
+
+ for (auto& sv : sv_list) {
+ VersionStorageInfo* vstorage = sv->current->storage_info();
+ ColumnFamilyData* cfd = sv->current->cfd();
+ Options opts;
+ if (!use_file_checksum) {
+ InstrumentedMutexLock l(&mutex_);
+ opts = Options(BuildDBOptions(immutable_db_options_, mutable_db_options_),
+ cfd->GetLatestCFOptions());
+ }
+ for (int i = 0; i < vstorage->num_non_empty_levels() && s.ok(); i++) {
+ for (size_t j = 0; j < vstorage->LevelFilesBrief(i).num_files && s.ok();
+ j++) {
+ const auto& fd_with_krange = vstorage->LevelFilesBrief(i).files[j];
+ const auto& fd = fd_with_krange.fd;
+ const FileMetaData* fmeta = fd_with_krange.file_metadata;
+ assert(fmeta);
+ std::string fname = TableFileName(cfd->ioptions()->cf_paths,
+ fd.GetNumber(), fd.GetPathId());
+ if (use_file_checksum) {
+ s = VerifyFullFileChecksum(fmeta->file_checksum,
+ fmeta->file_checksum_func_name, fname,
+ read_options);
+ } else {
+ s = ROCKSDB_NAMESPACE::VerifySstFileChecksum(
+ opts, file_options_, read_options, fname, fd.largest_seqno);
+ }
+ RecordTick(stats_, VERIFY_CHECKSUM_READ_BYTES,
+ IOSTATS(bytes_read) - prev_bytes_read);
+ prev_bytes_read = IOSTATS(bytes_read);
+ }
+ }
+
+ if (s.ok() && use_file_checksum) {
+ const auto& blob_files = vstorage->GetBlobFiles();
+ for (const auto& meta : blob_files) {
+ assert(meta);
+
+ const uint64_t blob_file_number = meta->GetBlobFileNumber();
+
+ const std::string blob_file_name = BlobFileName(
+ cfd->ioptions()->cf_paths.front().path, blob_file_number);
+ s = VerifyFullFileChecksum(meta->GetChecksumValue(),
+ meta->GetChecksumMethod(), blob_file_name,
+ read_options);
+ RecordTick(stats_, VERIFY_CHECKSUM_READ_BYTES,
+ IOSTATS(bytes_read) - prev_bytes_read);
+ prev_bytes_read = IOSTATS(bytes_read);
+ if (!s.ok()) {
+ break;
+ }
+ }
+ }
+ if (!s.ok()) {
+ break;
+ }
+ }
+
+ bool defer_purge = immutable_db_options().avoid_unnecessary_blocking_io;
+ {
+ InstrumentedMutexLock l(&mutex_);
+ for (auto sv : sv_list) {
+ if (sv && sv->Unref()) {
+ sv->Cleanup();
+ if (defer_purge) {
+ AddSuperVersionsToFreeQueue(sv);
+ } else {
+ delete sv;
+ }
+ }
+ }
+ if (defer_purge) {
+ SchedulePurge();
+ }
+ for (auto cfd : cfd_list) {
+ cfd->UnrefAndTryDelete();
+ }
+ }
+ RecordTick(stats_, VERIFY_CHECKSUM_READ_BYTES,
+ IOSTATS(bytes_read) - prev_bytes_read);
+ return s;
+}
+
+Status DBImpl::VerifyFullFileChecksum(const std::string& file_checksum_expected,
+ const std::string& func_name_expected,
+ const std::string& fname,
+ const ReadOptions& read_options) {
+ Status s;
+ if (file_checksum_expected == kUnknownFileChecksum) {
+ return s;
+ }
+ std::string file_checksum;
+ std::string func_name;
+ s = ROCKSDB_NAMESPACE::GenerateOneFileChecksum(
+ fs_.get(), fname, immutable_db_options_.file_checksum_gen_factory.get(),
+ func_name_expected, &file_checksum, &func_name,
+ read_options.readahead_size, immutable_db_options_.allow_mmap_reads,
+ io_tracer_, immutable_db_options_.rate_limiter.get(),
+ read_options.rate_limiter_priority);
+ if (s.ok()) {
+ assert(func_name_expected == func_name);
+ if (file_checksum != file_checksum_expected) {
+ std::ostringstream oss;
+ oss << fname << " file checksum mismatch, ";
+ oss << "expecting "
+ << Slice(file_checksum_expected).ToString(/*hex=*/true);
+ oss << ", but actual " << Slice(file_checksum).ToString(/*hex=*/true);
+ s = Status::Corruption(oss.str());
+ TEST_SYNC_POINT_CALLBACK("DBImpl::VerifyFullFileChecksum:mismatch", &s);
+ }
+ }
+ return s;
+}
+
+void DBImpl::NotifyOnExternalFileIngested(
+ ColumnFamilyData* cfd, const ExternalSstFileIngestionJob& ingestion_job) {
+ if (immutable_db_options_.listeners.empty()) {
+ return;
+ }
+
+ for (const IngestedFileInfo& f : ingestion_job.files_to_ingest()) {
+ ExternalFileIngestionInfo info;
+ info.cf_name = cfd->GetName();
+ info.external_file_path = f.external_file_path;
+ info.internal_file_path = f.internal_file_path;
+ info.global_seqno = f.assigned_seqno;
+ info.table_properties = f.table_properties;
+ for (auto listener : immutable_db_options_.listeners) {
+ listener->OnExternalFileIngested(this, info);
+ }
+ }
+}
+
+void DBImpl::WaitForIngestFile() {
+ mutex_.AssertHeld();
+ while (num_running_ingest_file_ > 0) {
+ bg_cv_.Wait();
+ }
+}
+
+Status DBImpl::StartTrace(const TraceOptions& trace_options,
+ std::unique_ptr<TraceWriter>&& trace_writer) {
+ InstrumentedMutexLock lock(&trace_mutex_);
+ tracer_.reset(new Tracer(immutable_db_options_.clock, trace_options,
+ std::move(trace_writer)));
+ return Status::OK();
+}
+
+Status DBImpl::EndTrace() {
+ InstrumentedMutexLock lock(&trace_mutex_);
+ Status s;
+ if (tracer_ != nullptr) {
+ s = tracer_->Close();
+ tracer_.reset();
+ } else {
+ s = Status::IOError("No trace file to close");
+ }
+ return s;
+}
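+
+// Illustrative usage sketch (comment only, not part of the implementation):
+// a trace session driven through the public DB API with a file-backed
+// TraceWriter. The trace file path is hypothetical.
+//
+//   std::unique_ptr<TraceWriter> trace_writer;
+//   Status s = NewFileTraceWriter(db->GetEnv(), EnvOptions(),
+//                                 "/tmp/rocksdb.trace", &trace_writer);
+//   if (s.ok()) {
+//     s = db->StartTrace(TraceOptions(), std::move(trace_writer));
+//   }
+//   // ... run the workload to be traced ...
+//   if (s.ok()) {
+//     s = db->EndTrace();
+//   }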
+
+Status DBImpl::NewDefaultReplayer(
+ const std::vector<ColumnFamilyHandle*>& handles,
+ std::unique_ptr<TraceReader>&& reader,
+ std::unique_ptr<Replayer>* replayer) {
+ replayer->reset(new ReplayerImpl(this, handles, std::move(reader)));
+ return Status::OK();
+}
+
+Status DBImpl::StartBlockCacheTrace(
+ const TraceOptions& trace_options,
+ std::unique_ptr<TraceWriter>&& trace_writer) {
+ BlockCacheTraceOptions block_trace_opts;
+ block_trace_opts.sampling_frequency = trace_options.sampling_frequency;
+
+ BlockCacheTraceWriterOptions trace_writer_opt;
+ trace_writer_opt.max_trace_file_size = trace_options.max_trace_file_size;
+
+ std::unique_ptr<BlockCacheTraceWriter> block_cache_trace_writer =
+ NewBlockCacheTraceWriter(env_->GetSystemClock().get(), trace_writer_opt,
+ std::move(trace_writer));
+
+ return block_cache_tracer_.StartTrace(block_trace_opts,
+ std::move(block_cache_trace_writer));
+}
+
+Status DBImpl::StartBlockCacheTrace(
+ const BlockCacheTraceOptions& trace_options,
+ std::unique_ptr<BlockCacheTraceWriter>&& trace_writer) {
+ return block_cache_tracer_.StartTrace(trace_options, std::move(trace_writer));
+}
+
+Status DBImpl::EndBlockCacheTrace() {
+ block_cache_tracer_.EndTrace();
+ return Status::OK();
+}
+
+Status DBImpl::TraceIteratorSeek(const uint32_t& cf_id, const Slice& key,
+ const Slice& lower_bound,
+ const Slice upper_bound) {
+ Status s;
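+  // tracer_ is checked twice: once without the mutex to keep the common
+  // (tracing disabled) path cheap, and again under trace_mutex_ because
+  // EndTrace() may reset tracer_ concurrently.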
+ if (tracer_) {
+ InstrumentedMutexLock lock(&trace_mutex_);
+ if (tracer_) {
+ s = tracer_->IteratorSeek(cf_id, key, lower_bound, upper_bound);
+ }
+ }
+ return s;
+}
+
+Status DBImpl::TraceIteratorSeekForPrev(const uint32_t& cf_id, const Slice& key,
+ const Slice& lower_bound,
+ const Slice upper_bound) {
+ Status s;
+ if (tracer_) {
+ InstrumentedMutexLock lock(&trace_mutex_);
+ if (tracer_) {
+ s = tracer_->IteratorSeekForPrev(cf_id, key, lower_bound, upper_bound);
+ }
+ }
+ return s;
+}
+
+Status DBImpl::ReserveFileNumbersBeforeIngestion(
+ ColumnFamilyData* cfd, uint64_t num,
+ std::unique_ptr<std::list<uint64_t>::iterator>& pending_output_elem,
+ uint64_t* next_file_number) {
+ Status s;
+ SuperVersionContext dummy_sv_ctx(true /* create_superversion */);
+ assert(nullptr != next_file_number);
+ InstrumentedMutexLock l(&mutex_);
+ if (error_handler_.IsDBStopped()) {
+ // Do not ingest files when there is a bg_error
+ return error_handler_.GetBGError();
+ }
+ pending_output_elem.reset(new std::list<uint64_t>::iterator(
+ CaptureCurrentFileNumberInPendingOutputs()));
+ *next_file_number = versions_->FetchAddFileNumber(static_cast<uint64_t>(num));
+ auto cf_options = cfd->GetLatestMutableCFOptions();
+ VersionEdit dummy_edit;
+  // If a crash happens after a hard link is established, the Recover function
+  // may reuse a file number that has already been assigned to the internal
+  // file, and this will overwrite the external file. To protect the external
+  // file, we have to make sure the file number will never be reused.
+ s = versions_->LogAndApply(cfd, *cf_options, &dummy_edit, &mutex_,
+ directories_.GetDbDir());
+ if (s.ok()) {
+ InstallSuperVersionAndScheduleWork(cfd, &dummy_sv_ctx, *cf_options);
+ }
+ dummy_sv_ctx.Clean();
+ return s;
+}
+
+Status DBImpl::GetCreationTimeOfOldestFile(uint64_t* creation_time) {
+ if (mutable_db_options_.max_open_files == -1) {
+ uint64_t oldest_time = std::numeric_limits<uint64_t>::max();
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ if (!cfd->IsDropped()) {
+ uint64_t ctime;
+ {
+ SuperVersion* sv = GetAndRefSuperVersion(cfd);
+ Version* version = sv->current;
+ version->GetCreationTimeOfOldestFile(&ctime);
+ ReturnAndCleanupSuperVersion(cfd, sv);
+ }
+
+ if (ctime < oldest_time) {
+ oldest_time = ctime;
+ }
+ if (oldest_time == 0) {
+ break;
+ }
+ }
+ }
+ *creation_time = oldest_time;
+ return Status::OK();
+ } else {
+ return Status::NotSupported("This API only works if max_open_files = -1");
+ }
+}
+
+void DBImpl::RecordSeqnoToTimeMapping() {
+  // Get the time first and then the sequence number, so the actual time of the
+  // seqno is <= the recorded unix_time.
+ int64_t unix_time = 0;
+ immutable_db_options_.clock->GetCurrentTime(&unix_time)
+ .PermitUncheckedError(); // Ignore error
+ SequenceNumber seqno = GetLatestSequenceNumber();
+ bool appended = false;
+ {
+ InstrumentedMutexLock l(&mutex_);
+ appended = seqno_time_mapping_.Append(seqno, unix_time);
+ }
+ if (!appended) {
+ ROCKS_LOG_WARN(immutable_db_options_.info_log,
+ "Failed to insert sequence number to time entry: %" PRIu64
+ " -> %" PRIu64,
+ seqno, unix_time);
+ }
+}
+#endif // ROCKSDB_LITE
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/db_impl/db_impl.h b/src/rocksdb/db/db_impl/db_impl.h
new file mode 100644
index 000000000..725e77c18
--- /dev/null
+++ b/src/rocksdb/db/db_impl/db_impl.h
@@ -0,0 +1,2804 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+
+#include <atomic>
+#include <deque>
+#include <functional>
+#include <limits>
+#include <list>
+#include <map>
+#include <set>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "db/column_family.h"
+#include "db/compaction/compaction_iterator.h"
+#include "db/compaction/compaction_job.h"
+#include "db/error_handler.h"
+#include "db/event_helpers.h"
+#include "db/external_sst_file_ingestion_job.h"
+#include "db/flush_job.h"
+#include "db/flush_scheduler.h"
+#include "db/import_column_family_job.h"
+#include "db/internal_stats.h"
+#include "db/log_writer.h"
+#include "db/logs_with_prep_tracker.h"
+#include "db/memtable_list.h"
+#include "db/periodic_task_scheduler.h"
+#include "db/post_memtable_callback.h"
+#include "db/pre_release_callback.h"
+#include "db/range_del_aggregator.h"
+#include "db/read_callback.h"
+#include "db/seqno_to_time_mapping.h"
+#include "db/snapshot_checker.h"
+#include "db/snapshot_impl.h"
+#include "db/trim_history_scheduler.h"
+#include "db/version_edit.h"
+#include "db/wal_manager.h"
+#include "db/write_controller.h"
+#include "db/write_thread.h"
+#include "logging/event_logger.h"
+#include "monitoring/instrumented_mutex.h"
+#include "options/db_options.h"
+#include "port/port.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/status.h"
+#ifndef ROCKSDB_LITE
+#include "rocksdb/trace_reader_writer.h"
+#endif // ROCKSDB_LITE
+#include "rocksdb/transaction_log.h"
+#ifndef ROCKSDB_LITE
+#include "rocksdb/utilities/replayer.h"
+#endif // ROCKSDB_LITE
+#include "rocksdb/write_buffer_manager.h"
+#include "table/merging_iterator.h"
+#include "table/scoped_arena_iterator.h"
+#include "util/autovector.h"
+#include "util/hash.h"
+#include "util/repeatable_thread.h"
+#include "util/stop_watch.h"
+#include "util/thread_local.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Arena;
+class ArenaWrappedDBIter;
+class InMemoryStatsHistoryIterator;
+class MemTable;
+class PersistentStatsHistoryIterator;
+class TableCache;
+class TaskLimiterToken;
+class Version;
+class VersionEdit;
+class VersionSet;
+class WriteCallback;
+struct JobContext;
+struct ExternalSstFileInfo;
+struct MemTableInfo;
+
+// Class to maintain directories for all database paths other than the main
+// one.
+class Directories {
+ public:
+ IOStatus SetDirectories(FileSystem* fs, const std::string& dbname,
+ const std::string& wal_dir,
+ const std::vector<DbPath>& data_paths);
+
+ FSDirectory* GetDataDir(size_t path_id) const {
+ assert(path_id < data_dirs_.size());
+ FSDirectory* ret_dir = data_dirs_[path_id].get();
+ if (ret_dir == nullptr) {
+ // Should use db_dir_
+ return db_dir_.get();
+ }
+ return ret_dir;
+ }
+
+ FSDirectory* GetWalDir() {
+ if (wal_dir_) {
+ return wal_dir_.get();
+ }
+ return db_dir_.get();
+ }
+
+ FSDirectory* GetDbDir() { return db_dir_.get(); }
+
+ IOStatus Close(const IOOptions& options, IODebugContext* dbg) {
+ // close all directories for all database paths
+ IOStatus s = IOStatus::OK();
+
+    // The default implementation of Close() in the Directory/FSDirectory
+    // class returns a "NotSupported" status; the upper level interface should
+    // be able to handle this error so that Close() does not fail after
+    // upgrading when run on FileSystems that have not implemented
+    // `Directory::Close()` or `FSDirectory::Close()` yet.
+
+ if (db_dir_) {
+ IOStatus temp_s = db_dir_->Close(options, dbg);
+ if (!temp_s.ok() && !temp_s.IsNotSupported() && s.ok()) {
+ s = std::move(temp_s);
+ }
+ }
+
+ // Attempt to close everything even if one fails
+ s.PermitUncheckedError();
+
+ if (wal_dir_) {
+ IOStatus temp_s = wal_dir_->Close(options, dbg);
+ if (!temp_s.ok() && !temp_s.IsNotSupported() && s.ok()) {
+ s = std::move(temp_s);
+ }
+ }
+
+ s.PermitUncheckedError();
+
+ for (auto& data_dir_ptr : data_dirs_) {
+ if (data_dir_ptr) {
+ IOStatus temp_s = data_dir_ptr->Close(options, dbg);
+ if (!temp_s.ok() && !temp_s.IsNotSupported() && s.ok()) {
+ s = std::move(temp_s);
+ }
+ }
+ }
+
+ // Ready for caller
+ s.MustCheck();
+ return s;
+ }
+
+ private:
+ std::unique_ptr<FSDirectory> db_dir_;
+ std::vector<std::unique_ptr<FSDirectory>> data_dirs_;
+ std::unique_ptr<FSDirectory> wal_dir_;
+};
+
+// While DB is the public interface of RocksDB, DBImpl is the actual
+// class implementing it. It's the entry point of the core RocksDB engine.
+// All other DB implementations, e.g. TransactionDB, BlobDB, etc, wrap a
+// DBImpl internally.
+// Other than functions implementing the DB interface, some public
+// functions are there for other internal components to call. For
+// example, TransactionDB directly calls DBImpl::WriteImpl() and
+// BlobDB directly calls DBImpl::GetImpl(). Some other functions
+// are for sub-components to call. For example, ColumnFamilyHandleImpl
+// calls DBImpl::FindObsoleteFiles().
+//
+// Since it's a very large class, the definitions of its functions are
+// split across several db_impl_*.cc files, besides db_impl.cc.
+class DBImpl : public DB {
+ public:
+ DBImpl(const DBOptions& options, const std::string& dbname,
+ const bool seq_per_batch = false, const bool batch_per_txn = true,
+ bool read_only = false);
+ // No copying allowed
+ DBImpl(const DBImpl&) = delete;
+ void operator=(const DBImpl&) = delete;
+
+ virtual ~DBImpl();
+
+ // ---- Implementations of the DB interface ----
+
+ using DB::Resume;
+ Status Resume() override;
+
+ using DB::Put;
+ Status Put(const WriteOptions& options, ColumnFamilyHandle* column_family,
+ const Slice& key, const Slice& value) override;
+ Status Put(const WriteOptions& options, ColumnFamilyHandle* column_family,
+ const Slice& key, const Slice& ts, const Slice& value) override;
+
+ using DB::PutEntity;
+ Status PutEntity(const WriteOptions& options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ const WideColumns& columns) override;
+
+ using DB::Merge;
+ Status Merge(const WriteOptions& options, ColumnFamilyHandle* column_family,
+ const Slice& key, const Slice& value) override;
+ Status Merge(const WriteOptions& options, ColumnFamilyHandle* column_family,
+ const Slice& key, const Slice& ts, const Slice& value) override;
+
+ using DB::Delete;
+ Status Delete(const WriteOptions& options, ColumnFamilyHandle* column_family,
+ const Slice& key) override;
+ Status Delete(const WriteOptions& options, ColumnFamilyHandle* column_family,
+ const Slice& key, const Slice& ts) override;
+
+ using DB::SingleDelete;
+ Status SingleDelete(const WriteOptions& options,
+ ColumnFamilyHandle* column_family,
+ const Slice& key) override;
+ Status SingleDelete(const WriteOptions& options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ const Slice& ts) override;
+
+ using DB::DeleteRange;
+ Status DeleteRange(const WriteOptions& options,
+ ColumnFamilyHandle* column_family, const Slice& begin_key,
+ const Slice& end_key) override;
+ Status DeleteRange(const WriteOptions& options,
+ ColumnFamilyHandle* column_family, const Slice& begin_key,
+ const Slice& end_key, const Slice& ts) override;
+
+ using DB::Write;
+ virtual Status Write(const WriteOptions& options,
+ WriteBatch* updates) override;
+
+ using DB::Get;
+ virtual Status Get(const ReadOptions& options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ PinnableSlice* value) override;
+ virtual Status Get(const ReadOptions& options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ PinnableSlice* value, std::string* timestamp) override;
+
+ using DB::GetEntity;
+ Status GetEntity(const ReadOptions& options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ PinnableWideColumns* columns) override;
+
+ using DB::GetMergeOperands;
+ Status GetMergeOperands(const ReadOptions& options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ PinnableSlice* merge_operands,
+ GetMergeOperandsOptions* get_merge_operands_options,
+ int* number_of_operands) override {
+ GetImplOptions get_impl_options;
+ get_impl_options.column_family = column_family;
+ get_impl_options.merge_operands = merge_operands;
+ get_impl_options.get_merge_operands_options = get_merge_operands_options;
+ get_impl_options.number_of_operands = number_of_operands;
+ get_impl_options.get_value = false;
+ return GetImpl(options, key, get_impl_options);
+ }
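+
+  // Illustrative usage sketch (comment only, not part of the implementation):
+  // a typical caller-side use of GetMergeOperands. The operand count is
+  // hypothetical.
+  //
+  //   GetMergeOperandsOptions gmo_options;
+  //   gmo_options.expected_max_number_of_operands = 16;
+  //   std::vector<PinnableSlice> operands(
+  //       gmo_options.expected_max_number_of_operands);
+  //   int number_of_operands = 0;
+  //   Status s = db->GetMergeOperands(ReadOptions(), cf_handle, "key",
+  //                                   operands.data(), &gmo_options,
+  //                                   &number_of_operands);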
+
+ using DB::MultiGet;
+ virtual std::vector<Status> MultiGet(
+ const ReadOptions& options,
+ const std::vector<ColumnFamilyHandle*>& column_family,
+ const std::vector<Slice>& keys,
+ std::vector<std::string>* values) override;
+ virtual std::vector<Status> MultiGet(
+ const ReadOptions& options,
+ const std::vector<ColumnFamilyHandle*>& column_family,
+ const std::vector<Slice>& keys, std::vector<std::string>* values,
+ std::vector<std::string>* timestamps) override;
+
+  // This MultiGet is a batched version, which may be faster than calling Get
+  // multiple times, especially if the keys have some spatial locality that
+  // enables them to be queried in the same SST files/set of files. The larger
+  // the batch size, the more scope for batching and performance improvement.
+  // The values and statuses parameters are arrays with the number of elements
+  // equal to the number of keys. This allows the storage for those to be
+  // allocated by the caller on the stack for small batches.
+ virtual void MultiGet(const ReadOptions& options,
+ ColumnFamilyHandle* column_family,
+ const size_t num_keys, const Slice* keys,
+ PinnableSlice* values, Status* statuses,
+ const bool sorted_input = false) override;
+ virtual void MultiGet(const ReadOptions& options,
+ ColumnFamilyHandle* column_family,
+ const size_t num_keys, const Slice* keys,
+ PinnableSlice* values, std::string* timestamps,
+ Status* statuses,
+ const bool sorted_input = false) override;
+
+ virtual void MultiGet(const ReadOptions& options, const size_t num_keys,
+ ColumnFamilyHandle** column_families, const Slice* keys,
+ PinnableSlice* values, Status* statuses,
+ const bool sorted_input = false) override;
+ virtual void MultiGet(const ReadOptions& options, const size_t num_keys,
+ ColumnFamilyHandle** column_families, const Slice* keys,
+ PinnableSlice* values, std::string* timestamps,
+ Status* statuses,
+ const bool sorted_input = false) override;
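+
+  // Illustrative usage sketch (comment only, not part of the implementation):
+  // the batched MultiGet overloads above take parallel arrays, so for small
+  // batches the caller can keep everything on the stack. Keys are
+  // hypothetical.
+  //
+  //   std::array<Slice, 3> keys{{"k1", "k2", "k3"}};
+  //   std::array<PinnableSlice, 3> values;
+  //   std::array<Status, 3> statuses;
+  //   db->MultiGet(ReadOptions(), cf_handle, keys.size(), keys.data(),
+  //                values.data(), statuses.data());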
+
+ virtual void MultiGetWithCallback(
+ const ReadOptions& options, ColumnFamilyHandle* column_family,
+ ReadCallback* callback,
+ autovector<KeyContext*, MultiGetContext::MAX_BATCH_SIZE>* sorted_keys);
+
+ virtual Status CreateColumnFamily(const ColumnFamilyOptions& cf_options,
+ const std::string& column_family,
+ ColumnFamilyHandle** handle) override;
+ virtual Status CreateColumnFamilies(
+ const ColumnFamilyOptions& cf_options,
+ const std::vector<std::string>& column_family_names,
+ std::vector<ColumnFamilyHandle*>* handles) override;
+ virtual Status CreateColumnFamilies(
+ const std::vector<ColumnFamilyDescriptor>& column_families,
+ std::vector<ColumnFamilyHandle*>* handles) override;
+ virtual Status DropColumnFamily(ColumnFamilyHandle* column_family) override;
+ virtual Status DropColumnFamilies(
+ const std::vector<ColumnFamilyHandle*>& column_families) override;
+
+  // Returns false if the key doesn't exist in the database and true if it
+  // may. If value_found is not passed in as null, then the value is returned
+  // if it is found in memory. On return, if the value was found, then
+  // value_found will be set to true, otherwise false.
+ using DB::KeyMayExist;
+ virtual bool KeyMayExist(const ReadOptions& options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ std::string* value, std::string* timestamp,
+ bool* value_found = nullptr) override;
+
+ using DB::NewIterator;
+ virtual Iterator* NewIterator(const ReadOptions& options,
+ ColumnFamilyHandle* column_family) override;
+ virtual Status NewIterators(
+ const ReadOptions& options,
+ const std::vector<ColumnFamilyHandle*>& column_families,
+ std::vector<Iterator*>* iterators) override;
+
+ virtual const Snapshot* GetSnapshot() override;
+ virtual void ReleaseSnapshot(const Snapshot* snapshot) override;
+ // Create a timestamped snapshot. This snapshot can be shared by multiple
+ // readers. If any of them uses it for write conflict checking, then
+ // is_write_conflict_boundary is true. For simplicity, set it to true by
+ // default.
+ std::pair<Status, std::shared_ptr<const Snapshot>> CreateTimestampedSnapshot(
+ SequenceNumber snapshot_seq, uint64_t ts);
+ std::shared_ptr<const SnapshotImpl> GetTimestampedSnapshot(uint64_t ts) const;
+ void ReleaseTimestampedSnapshotsOlderThan(
+ uint64_t ts, size_t* remaining_total_ss = nullptr);
+ Status GetTimestampedSnapshots(uint64_t ts_lb, uint64_t ts_ub,
+ std::vector<std::shared_ptr<const Snapshot>>&
+ timestamped_snapshots) const;
+
+ using DB::GetProperty;
+ virtual bool GetProperty(ColumnFamilyHandle* column_family,
+ const Slice& property, std::string* value) override;
+ using DB::GetMapProperty;
+ virtual bool GetMapProperty(
+ ColumnFamilyHandle* column_family, const Slice& property,
+ std::map<std::string, std::string>* value) override;
+ using DB::GetIntProperty;
+ virtual bool GetIntProperty(ColumnFamilyHandle* column_family,
+ const Slice& property, uint64_t* value) override;
+ using DB::GetAggregatedIntProperty;
+ virtual bool GetAggregatedIntProperty(const Slice& property,
+ uint64_t* aggregated_value) override;
+ using DB::GetApproximateSizes;
+ virtual Status GetApproximateSizes(const SizeApproximationOptions& options,
+ ColumnFamilyHandle* column_family,
+ const Range* range, int n,
+ uint64_t* sizes) override;
+ using DB::GetApproximateMemTableStats;
+ virtual void GetApproximateMemTableStats(ColumnFamilyHandle* column_family,
+ const Range& range,
+ uint64_t* const count,
+ uint64_t* const size) override;
+ using DB::CompactRange;
+ virtual Status CompactRange(const CompactRangeOptions& options,
+ ColumnFamilyHandle* column_family,
+ const Slice* begin, const Slice* end) override;
+
+ using DB::CompactFiles;
+ virtual Status CompactFiles(
+ const CompactionOptions& compact_options,
+ ColumnFamilyHandle* column_family,
+ const std::vector<std::string>& input_file_names, const int output_level,
+ const int output_path_id = -1,
+ std::vector<std::string>* const output_file_names = nullptr,
+ CompactionJobInfo* compaction_job_info = nullptr) override;
+
+ virtual Status PauseBackgroundWork() override;
+ virtual Status ContinueBackgroundWork() override;
+
+ virtual Status EnableAutoCompaction(
+ const std::vector<ColumnFamilyHandle*>& column_family_handles) override;
+
+ virtual void EnableManualCompaction() override;
+ virtual void DisableManualCompaction() override;
+
+ using DB::SetOptions;
+ Status SetOptions(
+ ColumnFamilyHandle* column_family,
+ const std::unordered_map<std::string, std::string>& options_map) override;
+
+ virtual Status SetDBOptions(
+ const std::unordered_map<std::string, std::string>& options_map) override;
+
+ using DB::NumberLevels;
+ virtual int NumberLevels(ColumnFamilyHandle* column_family) override;
+ using DB::MaxMemCompactionLevel;
+ virtual int MaxMemCompactionLevel(ColumnFamilyHandle* column_family) override;
+ using DB::Level0StopWriteTrigger;
+ virtual int Level0StopWriteTrigger(
+ ColumnFamilyHandle* column_family) override;
+ virtual const std::string& GetName() const override;
+ virtual Env* GetEnv() const override;
+ virtual FileSystem* GetFileSystem() const override;
+ using DB::GetOptions;
+ virtual Options GetOptions(ColumnFamilyHandle* column_family) const override;
+ using DB::GetDBOptions;
+ virtual DBOptions GetDBOptions() const override;
+ using DB::Flush;
+ virtual Status Flush(const FlushOptions& options,
+ ColumnFamilyHandle* column_family) override;
+ virtual Status Flush(
+ const FlushOptions& options,
+ const std::vector<ColumnFamilyHandle*>& column_families) override;
+ virtual Status FlushWAL(bool sync) override;
+ bool WALBufferIsEmpty(bool lock = true);
+ virtual Status SyncWAL() override;
+ virtual Status LockWAL() override;
+ virtual Status UnlockWAL() override;
+
+ virtual SequenceNumber GetLatestSequenceNumber() const override;
+
+ // IncreaseFullHistoryTsLow(ColumnFamilyHandle*, std::string) will acquire
+ // and release db_mutex
+ Status IncreaseFullHistoryTsLow(ColumnFamilyHandle* column_family,
+ std::string ts_low) override;
+
+ // GetFullHistoryTsLow(ColumnFamilyHandle*, std::string*) will acquire and
+ // release db_mutex
+ Status GetFullHistoryTsLow(ColumnFamilyHandle* column_family,
+ std::string* ts_low) override;
+
+ virtual Status GetDbIdentity(std::string& identity) const override;
+
+ virtual Status GetDbIdentityFromIdentityFile(std::string* identity) const;
+
+ virtual Status GetDbSessionId(std::string& session_id) const override;
+
+ ColumnFamilyHandle* DefaultColumnFamily() const override;
+
+ ColumnFamilyHandle* PersistentStatsColumnFamily() const;
+
+ virtual Status Close() override;
+
+ virtual Status DisableFileDeletions() override;
+
+ virtual Status EnableFileDeletions(bool force) override;
+
+ virtual bool IsFileDeletionsEnabled() const;
+
+ Status GetStatsHistory(
+ uint64_t start_time, uint64_t end_time,
+ std::unique_ptr<StatsHistoryIterator>* stats_iterator) override;
+
+#ifndef ROCKSDB_LITE
+ using DB::ResetStats;
+ virtual Status ResetStats() override;
+ // All the returned filenames start with "/"
+ virtual Status GetLiveFiles(std::vector<std::string>&,
+ uint64_t* manifest_file_size,
+ bool flush_memtable = true) override;
+ virtual Status GetSortedWalFiles(VectorLogPtr& files) override;
+ virtual Status GetCurrentWalFile(
+ std::unique_ptr<LogFile>* current_log_file) override;
+ virtual Status GetCreationTimeOfOldestFile(
+ uint64_t* creation_time) override;
+
+ virtual Status GetUpdatesSince(
+ SequenceNumber seq_number, std::unique_ptr<TransactionLogIterator>* iter,
+ const TransactionLogIterator::ReadOptions& read_options =
+ TransactionLogIterator::ReadOptions()) override;
+ virtual Status DeleteFile(std::string name) override;
+ Status DeleteFilesInRanges(ColumnFamilyHandle* column_family,
+ const RangePtr* ranges, size_t n,
+ bool include_end = true);
+
+ virtual void GetLiveFilesMetaData(
+ std::vector<LiveFileMetaData>* metadata) override;
+
+ virtual Status GetLiveFilesChecksumInfo(
+ FileChecksumList* checksum_list) override;
+
+ virtual Status GetLiveFilesStorageInfo(
+ const LiveFilesStorageInfoOptions& opts,
+ std::vector<LiveFileStorageInfo>* files) override;
+
+ // Obtains the meta data of the specified column family of the DB.
+ // TODO(yhchiang): output parameter is placed in the end in this codebase.
+ virtual void GetColumnFamilyMetaData(ColumnFamilyHandle* column_family,
+ ColumnFamilyMetaData* metadata) override;
+
+ void GetAllColumnFamilyMetaData(
+ std::vector<ColumnFamilyMetaData>* metadata) override;
+
+ Status SuggestCompactRange(ColumnFamilyHandle* column_family,
+ const Slice* begin, const Slice* end) override;
+
+ Status PromoteL0(ColumnFamilyHandle* column_family,
+ int target_level) override;
+
+ using DB::IngestExternalFile;
+ virtual Status IngestExternalFile(
+ ColumnFamilyHandle* column_family,
+ const std::vector<std::string>& external_files,
+ const IngestExternalFileOptions& ingestion_options) override;
+
+ using DB::IngestExternalFiles;
+ virtual Status IngestExternalFiles(
+ const std::vector<IngestExternalFileArg>& args) override;
+
+ using DB::CreateColumnFamilyWithImport;
+ virtual Status CreateColumnFamilyWithImport(
+ const ColumnFamilyOptions& options, const std::string& column_family_name,
+ const ImportColumnFamilyOptions& import_options,
+ const ExportImportFilesMetaData& metadata,
+ ColumnFamilyHandle** handle) override;
+
+ using DB::VerifyFileChecksums;
+ Status VerifyFileChecksums(const ReadOptions& read_options) override;
+
+ using DB::VerifyChecksum;
+ virtual Status VerifyChecksum(const ReadOptions& /*read_options*/) override;
+ // Verify the checksums of files in db. Currently only tables are checked.
+ //
+ // read_options: controls file I/O behavior, e.g. read ahead size while
+ // reading all the live table files.
+ //
+  // use_file_checksum: if false, verify the block checksums of all live
+  //                    tables in the db. Otherwise, obtain the file checksums
+  //                    and compare
+ // with the MANIFEST. Currently, file checksums are
+ // recomputed by reading all table files.
+ //
+ // Returns: OK if there is no file whose file or block checksum mismatches.
+ Status VerifyChecksumInternal(const ReadOptions& read_options,
+ bool use_file_checksum);
+
+ Status VerifyFullFileChecksum(const std::string& file_checksum_expected,
+ const std::string& func_name_expected,
+ const std::string& fpath,
+ const ReadOptions& read_options);
+
+ using DB::StartTrace;
+ virtual Status StartTrace(
+ const TraceOptions& options,
+ std::unique_ptr<TraceWriter>&& trace_writer) override;
+
+ using DB::EndTrace;
+ virtual Status EndTrace() override;
+
+ using DB::NewDefaultReplayer;
+ virtual Status NewDefaultReplayer(
+ const std::vector<ColumnFamilyHandle*>& handles,
+ std::unique_ptr<TraceReader>&& reader,
+ std::unique_ptr<Replayer>* replayer) override;
+
+ using DB::StartBlockCacheTrace;
+ Status StartBlockCacheTrace(
+ const TraceOptions& trace_options,
+ std::unique_ptr<TraceWriter>&& trace_writer) override;
+
+ Status StartBlockCacheTrace(
+ const BlockCacheTraceOptions& options,
+ std::unique_ptr<BlockCacheTraceWriter>&& trace_writer) override;
+
+ using DB::EndBlockCacheTrace;
+ Status EndBlockCacheTrace() override;
+
+ using DB::StartIOTrace;
+ Status StartIOTrace(const TraceOptions& options,
+ std::unique_ptr<TraceWriter>&& trace_writer) override;
+
+ using DB::EndIOTrace;
+ Status EndIOTrace() override;
+
+ using DB::GetPropertiesOfAllTables;
+ virtual Status GetPropertiesOfAllTables(
+ ColumnFamilyHandle* column_family,
+ TablePropertiesCollection* props) override;
+ virtual Status GetPropertiesOfTablesInRange(
+ ColumnFamilyHandle* column_family, const Range* range, std::size_t n,
+ TablePropertiesCollection* props) override;
+
+#endif // ROCKSDB_LITE
+
+ // ---- End of implementations of the DB interface ----
+ SystemClock* GetSystemClock() const;
+
+ struct GetImplOptions {
+ ColumnFamilyHandle* column_family = nullptr;
+ PinnableSlice* value = nullptr;
+ PinnableWideColumns* columns = nullptr;
+ std::string* timestamp = nullptr;
+ bool* value_found = nullptr;
+ ReadCallback* callback = nullptr;
+ bool* is_blob_index = nullptr;
+    // If true, return the value associated with key via the value pointer;
+    // otherwise, return all merge operands for key via the merge_operands
+    // pointer.
+ bool get_value = true;
+ // Pointer to an array of size
+ // get_merge_operands_options.expected_max_number_of_operands allocated by
+ // user
+ PinnableSlice* merge_operands = nullptr;
+ GetMergeOperandsOptions* get_merge_operands_options = nullptr;
+ int* number_of_operands = nullptr;
+ };
+
+  // Function that Get and KeyMayExist call with no_io true or false.
+  // Note: 'value_found' from KeyMayExist propagates here.
+  // This function is also called by GetMergeOperands.
+  // If get_impl_options.get_value = true, get the value associated with
+  // get_impl_options.key via get_impl_options.value.
+  // If get_impl_options.get_value = false, get the merge operands associated
+  // with get_impl_options.key via get_impl_options.merge_operands.
+ Status GetImpl(const ReadOptions& options, const Slice& key,
+ GetImplOptions& get_impl_options);
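+
+ // Illustrative sketch of how an internal caller might fill GetImplOptions
+ // (hypothetical values; `read_options`, `key` and `column_family` are
+ // assumed to be provided by the caller):
+ //
+ //   PinnableSlice pinnable_val;
+ //   GetImplOptions get_impl_options;
+ //   get_impl_options.column_family = column_family;
+ //   get_impl_options.value = &pinnable_val;
+ //   Status s = GetImpl(read_options, key, get_impl_options);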
+
+ // If `snapshot` == kMaxSequenceNumber, set a recent one inside the file.
+ ArenaWrappedDBIter* NewIteratorImpl(const ReadOptions& options,
+ ColumnFamilyData* cfd,
+ SequenceNumber snapshot,
+ ReadCallback* read_callback,
+ bool expose_blob_index = false,
+ bool allow_refresh = true);
+
+ virtual SequenceNumber GetLastPublishedSequence() const {
+ if (last_seq_same_as_publish_seq_) {
+ return versions_->LastSequence();
+ } else {
+ return versions_->LastPublishedSequence();
+ }
+ }
+
+ // REQUIRES: joined the main write queue if two_write_queues is disabled, and
+ // the second write queue otherwise.
+ virtual void SetLastPublishedSequence(SequenceNumber seq);
+ // Returns LastSequence in last_seq_same_as_publish_seq_
+ // mode and LastAllocatedSequence otherwise. This is useful when visibility
+ // depends also on data written to the WAL but not to the memtable.
+ SequenceNumber TEST_GetLastVisibleSequence() const;
+
+#ifndef ROCKSDB_LITE
+ // Similar to Write() but will call the callback once on the single write
+ // thread to determine whether it is safe to perform the write.
+ virtual Status WriteWithCallback(const WriteOptions& write_options,
+ WriteBatch* my_batch,
+ WriteCallback* callback);
+
+ // Returns the sequence number that is guaranteed to be smaller than or equal
+ // to the sequence number of any key that could be inserted into the current
+ // memtables. It can then be assumed that any write with a larger (or equal)
+ // sequence number will be present in this memtable or a later memtable.
+ //
+ // If the earliest sequence number could not be determined,
+ // kMaxSequenceNumber will be returned.
+ //
+ // If include_history=true, will also search Memtables in MemTableList
+ // History.
+ SequenceNumber GetEarliestMemTableSequenceNumber(SuperVersion* sv,
+ bool include_history);
+
+ // For a given key, check to see if there are any records for this key
+ // in the memtables, including memtable history. If cache_only is false,
+ // SST files will also be checked.
+ //
+ // `key` should NOT have user-defined timestamp appended to user key even if
+ // timestamp is enabled.
+ //
+ // If a key is found, *found_record_for_key will be set to true and
+ // *seq will be set to the stored sequence number for the latest
+ // operation on this key or kMaxSequenceNumber if unknown. If user-defined
+ // timestamp is enabled for this column family and timestamp is not nullptr,
+ // then *timestamp will be set to the stored timestamp for the latest
+ // operation on this key.
+ // If no key is found, *found_record_for_key will be set to false.
+ //
+ // Note: If cache_only=false, it is possible for *seq to be set to 0 if
+ // the sequence number has been cleared from the record. If the caller is
+ // holding an active db snapshot, we know the missing sequence must be less
+ // than the snapshot's sequence number (sequence numbers are only cleared
+ // when there are no earlier active snapshots).
+ //
+ // If NotFound is returned and found_record_for_key is set to false, then no
+ // record for this key was found. If the caller is holding an active db
+ // snapshot, we know that no key could have existed after this snapshot
+ // (since we do not compact keys that have an earlier snapshot).
+ //
+ // Only records newer than or at `lower_bound_seq` are guaranteed to be
+ // returned. Memtables and files may not be checked if they only contain data
+ // older than `lower_bound_seq`.
+ //
+ // Returns OK or NotFound on success,
+ // other status on unexpected error.
+ // TODO(andrewkr): this API needs to be aware of range deletion operations
+ Status GetLatestSequenceForKey(SuperVersion* sv, const Slice& key,
+ bool cache_only,
+ SequenceNumber lower_bound_seq,
+ SequenceNumber* seq, std::string* timestamp,
+ bool* found_record_for_key,
+ bool* is_blob_index);
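+
+ // Illustrative sketch of a conflict-check style caller (hypothetical; `sv`,
+ // `key` and `snapshot_seq` are assumed to be provided by the caller):
+ //
+ //   SequenceNumber seq = kMaxSequenceNumber;
+ //   bool found_record_for_key = false;
+ //   Status s = GetLatestSequenceForKey(sv, key, /*cache_only=*/true,
+ //                                      /*lower_bound_seq=*/snapshot_seq,
+ //                                      &seq, /*timestamp=*/nullptr,
+ //                                      &found_record_for_key,
+ //                                      /*is_blob_index=*/nullptr);
+ //   if (s.ok() && found_record_for_key && seq > snapshot_seq) {
+ //     // A newer write to `key` exists, i.e. a potential write conflict.
+ //   }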
+
+ Status TraceIteratorSeek(const uint32_t& cf_id, const Slice& key,
+ const Slice& lower_bound, const Slice upper_bound);
+ Status TraceIteratorSeekForPrev(const uint32_t& cf_id, const Slice& key,
+ const Slice& lower_bound,
+ const Slice upper_bound);
+#endif // ROCKSDB_LITE
+
+ // Similar to GetSnapshot(), but also lets the db know that this snapshot
+ // will be used for transaction write-conflict checking. The DB can then
+ // make sure not to compact any keys that would prevent a write-conflict from
+ // being detected.
+ const Snapshot* GetSnapshotForWriteConflictBoundary();
+
+ // checks if all live files exist on file system and that their file sizes
+ // match to our in-memory records
+ virtual Status CheckConsistency();
+
+ // max_file_num_to_ignore allows bottom level compaction to filter out newly
+ // compacted SST files. Setting max_file_num_to_ignore to kMaxUint64 will
+ // disable the filtering
+ Status RunManualCompaction(ColumnFamilyData* cfd, int input_level,
+ int output_level,
+ const CompactRangeOptions& compact_range_options,
+ const Slice* begin, const Slice* end,
+ bool exclusive, bool disallow_trivial_move,
+ uint64_t max_file_num_to_ignore,
+ const std::string& trim_ts);
+
+ // Return an internal iterator over the current state of the database.
+ // The keys of this iterator are internal keys (see format.h).
+ // The returned iterator should be deleted when no longer needed.
+ // If allow_unprepared_value is true, the returned iterator may defer reading
+ // the value and so will require PrepareValue() to be called before value();
+ // allow_unprepared_value = false is convenient when this optimization is not
+ // useful, e.g. when reading the whole column family.
+ //
+ // read_options.ignore_range_deletions determines whether range tombstones are
+ // processed in the returned iterator internally, i.e., whether range
+ // tombstone covered keys are in this iterator's output.
+ // @param read_options Must outlive the returned iterator.
+ InternalIterator* NewInternalIterator(
+ const ReadOptions& read_options, Arena* arena, SequenceNumber sequence,
+ ColumnFamilyHandle* column_family = nullptr,
+ bool allow_unprepared_value = false);
+
+ // Note: to support DB iterator refresh, memtable range tombstones in the
+ // underlying merging iterator need to be refreshed. If db_iter is not
+ // nullptr, db_iter->SetMemtableRangetombstoneIter() is called with the
+ // memtable range tombstone iterator used by the underlying merging iterator.
+ // This range tombstone iterator can be refreshed later by db_iter.
+ // @param read_options Must outlive the returned iterator.
+ InternalIterator* NewInternalIterator(const ReadOptions& read_options,
+ ColumnFamilyData* cfd,
+ SuperVersion* super_version,
+ Arena* arena, SequenceNumber sequence,
+ bool allow_unprepared_value,
+ ArenaWrappedDBIter* db_iter = nullptr);
+
+ LogsWithPrepTracker* logs_with_prep_tracker() {
+ return &logs_with_prep_tracker_;
+ }
+
+ struct BGJobLimits {
+ int max_flushes;
+ int max_compactions;
+ };
+ // Returns maximum background flushes and compactions allowed to be scheduled
+ BGJobLimits GetBGJobLimits() const;
+ // Need a static version that can be called during SanitizeOptions().
+ static BGJobLimits GetBGJobLimits(int max_background_flushes,
+ int max_background_compactions,
+ int max_background_jobs,
+ bool parallelize_compactions);
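+
+ // Illustrative sketch (hypothetical option values) of querying the static
+ // overload, e.g. while sanitizing options:
+ //
+ //   DBImpl::BGJobLimits limits = DBImpl::GetBGJobLimits(
+ //       /*max_background_flushes=*/-1, /*max_background_compactions=*/-1,
+ //       /*max_background_jobs=*/8, /*parallelize_compactions=*/true);
+ //   // limits.max_flushes / limits.max_compactions then bound scheduling.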
+
+ // move logs pending closing from job_context to the DB queue and
+ // schedule a purge
+ void ScheduleBgLogWriterClose(JobContext* job_context);
+
+ uint64_t MinLogNumberToKeep();
+
+ // Returns the lower bound file number for SSTs that won't be deleted, even if
+ // they're obsolete. This lower bound is used internally to prevent newly
+ // created flush/compaction output files from being deleted before they're
+ // installed. This technique avoids the need for tracking the exact numbers of
+ // files pending creation, although it prevents more files than necessary from
+ // being deleted.
+ uint64_t MinObsoleteSstNumberToKeep();
+
+ // Returns the list of live files in 'live' and the list
+ // of all files in the filesystem in 'candidate_files'.
+ // If force == false and the last call was less than
+ // db_options_.delete_obsolete_files_period_micros microseconds ago,
+ // it will not fill up the job_context
+ void FindObsoleteFiles(JobContext* job_context, bool force,
+ bool no_full_scan = false);
+
+ // Diffs the files listed in filenames against the live files; those that do
+ // not belong to live files are possibly removed. Also removes all the
+ // files in sst_delete_files and log_delete_files.
+ // It is not necessary to hold the mutex when invoking this method.
+ // If FindObsoleteFiles() was run, we need to also run
+ // PurgeObsoleteFiles(), even if disable_delete_obsolete_files_ is true
+ void PurgeObsoleteFiles(JobContext& background_context,
+ bool schedule_only = false);
+
+ // Schedule a background job to actually delete obsolete files.
+ void SchedulePurge();
+
+ const SnapshotList& snapshots() const { return snapshots_; }
+
+ // Load the list of snapshots that are no newer than `max_seq` into
+ // `snap_vector`, in ascending order.
+ // `oldest_write_conflict_snapshot` is filled with the oldest snapshot
+ // which satisfies SnapshotImpl.is_write_conflict_boundary_ = true.
+ void LoadSnapshots(std::vector<SequenceNumber>* snap_vector,
+ SequenceNumber* oldest_write_conflict_snapshot,
+ const SequenceNumber& max_seq) const {
+ InstrumentedMutexLock l(mutex());
+ snapshots().GetAll(snap_vector, oldest_write_conflict_snapshot, max_seq);
+ }
+
+ const ImmutableDBOptions& immutable_db_options() const {
+ return immutable_db_options_;
+ }
+
+ // Cancel all background jobs, including flush, compaction, background
+ // purging, stats dumping threads, etc. If `wait` = true, wait for the
+ // running jobs to abort or finish before returning. Otherwise, only
+ // sends the signals.
+ void CancelAllBackgroundWork(bool wait);
+
+ // Find Super version and reference it. Based on options, it might return
+ // the thread local cached one.
+ // Call ReturnAndCleanupSuperVersion() when it is no longer needed.
+ SuperVersion* GetAndRefSuperVersion(ColumnFamilyData* cfd);
+
+ // Similar to the previous function but looks up based on a column family id.
+ // nullptr will be returned if this column family no longer exists.
+ // REQUIRED: this function should only be called on the write thread or if the
+ // mutex is held.
+ SuperVersion* GetAndRefSuperVersion(uint32_t column_family_id);
+
+ // Un-reference the super version and clean it up if it is the last reference.
+ void CleanupSuperVersion(SuperVersion* sv);
+
+ // Un-reference the super version and return it to the thread-local cache if
+ // needed. If it is the last reference of the super version, clean it up
+ // after un-referencing it.
+ void ReturnAndCleanupSuperVersion(ColumnFamilyData* cfd, SuperVersion* sv);
+
+ // Similar to the previous function but looks up based on a column family id.
+ // nullptr will be returned if this column family no longer exists.
+ // REQUIRED: this function should only be called on the write thread.
+ void ReturnAndCleanupSuperVersion(uint32_t column_family_id, SuperVersion* sv);
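+
+ // Illustrative sketch of the acquire/release pattern above (hypothetical;
+ // `cfd` is assumed to be a valid ColumnFamilyData*):
+ //
+ //   SuperVersion* sv = GetAndRefSuperVersion(cfd);
+ //   // ... read from sv->mem, sv->imm and sv->current ...
+ //   ReturnAndCleanupSuperVersion(cfd, sv);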
+
+ // REQUIRED: this function should only be called on the write thread or if the
+ // mutex is held. Return value only valid until next call to this function or
+ // mutex is released.
+ ColumnFamilyHandle* GetColumnFamilyHandle(uint32_t column_family_id);
+
+ // Same as above, but should be called without the mutex held and not on the write thread.
+ std::unique_ptr<ColumnFamilyHandle> GetColumnFamilyHandleUnlocked(
+ uint32_t column_family_id);
+
+ // Returns the number of currently running flushes.
+ // REQUIREMENT: mutex_ must be held when calling this function.
+ int num_running_flushes() {
+ mutex_.AssertHeld();
+ return num_running_flushes_;
+ }
+
+ // Returns the number of currently running compactions.
+ // REQUIREMENT: mutex_ must be held when calling this function.
+ int num_running_compactions() {
+ mutex_.AssertHeld();
+ return num_running_compactions_;
+ }
+
+ const WriteController& write_controller() { return write_controller_; }
+
+ // Hollow transaction shells used for recovery.
+ // These will then be passed to TransactionDB so that
+ // locks can be reacquired before writing can resume.
+ struct RecoveredTransaction {
+ std::string name_;
+ bool unprepared_;
+
+ struct BatchInfo {
+ uint64_t log_number_;
+ // TODO(lth): For unprepared, the memory usage here can be big for
+ // unprepared transactions. This is only useful for rollbacks, and we
+ // can in theory just keep keyset for that.
+ WriteBatch* batch_;
+ // Number of sub-batches. A new sub-batch is created if the txn attempts to
+ // insert a duplicate (key, seq) into the memtable. This is currently used in
+ // WritePreparedTxn/WriteUnpreparedTxn.
+ size_t batch_cnt_;
+ };
+
+ // This maps the seq of the first key in the batch to BatchInfo, which
+ // contains WriteBatch and other information relevant to the batch.
+ //
+ // For WriteUnprepared, batches_ can have size greater than 1, but for
+ // other write policies, it must be of size 1.
+ std::map<SequenceNumber, BatchInfo> batches_;
+
+ explicit RecoveredTransaction(const uint64_t log, const std::string& name,
+ WriteBatch* batch, SequenceNumber seq,
+ size_t batch_cnt, bool unprepared)
+ : name_(name), unprepared_(unprepared) {
+ batches_[seq] = {log, batch, batch_cnt};
+ }
+
+ ~RecoveredTransaction() {
+ for (auto& it : batches_) {
+ delete it.second.batch_;
+ }
+ }
+
+ void AddBatch(SequenceNumber seq, uint64_t log_number, WriteBatch* batch,
+ size_t batch_cnt, bool unprepared) {
+ assert(batches_.count(seq) == 0);
+ batches_[seq] = {log_number, batch, batch_cnt};
+ // Prior state must be unprepared, since the prepare batch must be the
+ // last batch.
+ assert(unprepared_);
+ unprepared_ = unprepared;
+ }
+ };
+
+ bool allow_2pc() const { return immutable_db_options_.allow_2pc; }
+
+ std::unordered_map<std::string, RecoveredTransaction*>
+ recovered_transactions() {
+ return recovered_transactions_;
+ }
+
+ RecoveredTransaction* GetRecoveredTransaction(const std::string& name) {
+ auto it = recovered_transactions_.find(name);
+ if (it == recovered_transactions_.end()) {
+ return nullptr;
+ } else {
+ return it->second;
+ }
+ }
+
+ void InsertRecoveredTransaction(const uint64_t log, const std::string& name,
+ WriteBatch* batch, SequenceNumber seq,
+ size_t batch_cnt, bool unprepared_batch) {
+ // For WriteUnpreparedTxn, InsertRecoveredTransaction is called multiple
+ // times for every unprepared batch encountered during recovery.
+ //
+ // If the transaction is prepared, then the last call to
+ // InsertRecoveredTransaction will have unprepared_batch = false.
+ auto rtxn = recovered_transactions_.find(name);
+ if (rtxn == recovered_transactions_.end()) {
+ recovered_transactions_[name] = new RecoveredTransaction(
+ log, name, batch, seq, batch_cnt, unprepared_batch);
+ } else {
+ rtxn->second->AddBatch(seq, log, batch, batch_cnt, unprepared_batch);
+ }
+ logs_with_prep_tracker_.MarkLogAsContainingPrepSection(log);
+ }
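+
+ // Illustrative sketch (hypothetical numbers): recovering a WriteUnprepared
+ // transaction that spans two WAL batches might call this twice for the same
+ // name, with the final call carrying unprepared_batch = false once the
+ // prepare marker is seen:
+ //
+ //   InsertRecoveredTransaction(/*log=*/10, "txn1", batch1, /*seq=*/100,
+ //                              /*batch_cnt=*/1, /*unprepared_batch=*/true);
+ //   InsertRecoveredTransaction(/*log=*/11, "txn1", batch2, /*seq=*/105,
+ //                              /*batch_cnt=*/1, /*unprepared_batch=*/false);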
+
+ void DeleteRecoveredTransaction(const std::string& name) {
+ auto it = recovered_transactions_.find(name);
+ assert(it != recovered_transactions_.end());
+ auto* trx = it->second;
+ recovered_transactions_.erase(it);
+ for (const auto& info : trx->batches_) {
+ logs_with_prep_tracker_.MarkLogAsHavingPrepSectionFlushed(
+ info.second.log_number_);
+ }
+ delete trx;
+ }
+
+ void DeleteAllRecoveredTransactions() {
+ for (auto it = recovered_transactions_.begin();
+ it != recovered_transactions_.end(); ++it) {
+ delete it->second;
+ }
+ recovered_transactions_.clear();
+ }
+
+ void AddToLogsToFreeQueue(log::Writer* log_writer) {
+ mutex_.AssertHeld();
+ logs_to_free_queue_.push_back(log_writer);
+ }
+
+ void AddSuperVersionsToFreeQueue(SuperVersion* sv) {
+ superversions_to_free_queue_.push_back(sv);
+ }
+
+ void SetSnapshotChecker(SnapshotChecker* snapshot_checker);
+
+ // Fill JobContext with snapshot information needed by flush and compaction.
+ void GetSnapshotContext(JobContext* job_context,
+ std::vector<SequenceNumber>* snapshot_seqs,
+ SequenceNumber* earliest_write_conflict_snapshot,
+ SnapshotChecker** snapshot_checker);
+
+ // Not thread-safe.
+ void SetRecoverableStatePreReleaseCallback(PreReleaseCallback* callback);
+
+ InstrumentedMutex* mutex() const { return &mutex_; }
+
+ // Initialize a brand new DB. The DB directory is expected to be empty before
+ // calling it. Push new manifest file name into `new_filenames`.
+ Status NewDB(std::vector<std::string>* new_filenames);
+
+ // This is to be used only by internal rocksdb classes.
+ static Status Open(const DBOptions& db_options, const std::string& name,
+ const std::vector<ColumnFamilyDescriptor>& column_families,
+ std::vector<ColumnFamilyHandle*>* handles, DB** dbptr,
+ const bool seq_per_batch, const bool batch_per_txn);
+
+ static IOStatus CreateAndNewDirectory(
+ FileSystem* fs, const std::string& dirname,
+ std::unique_ptr<FSDirectory>* directory);
+
+ // Find the stats map from stats_history_ with the smallest timestamp in
+ // the range [start_time, end_time).
+ bool FindStatsByTime(uint64_t start_time, uint64_t end_time,
+ uint64_t* new_time,
+ std::map<std::string, uint64_t>* stats_map);
+
+ // Print information about all range tombstones from all iterators to the
+ // given std::string. This is only used by ldb. The output might be capped.
+ // Tombstones printed out are not guaranteed to be in any order.
+ Status TablesRangeTombstoneSummary(ColumnFamilyHandle* column_family,
+ int max_entries_to_print,
+ std::string* out_str);
+
+ VersionSet* GetVersionSet() const { return versions_.get(); }
+
+ // Wait for any compaction to finish.
+ // The bool parameter additionally waits until unscheduledCompactions_ == 0;
+ // this is only for the special CancelledCompactions test.
+ Status WaitForCompact(bool waitUnscheduled = false);
+
+#ifndef NDEBUG
+ // Compact any files in the named level that overlap [*begin, *end]
+ Status TEST_CompactRange(int level, const Slice* begin, const Slice* end,
+ ColumnFamilyHandle* column_family = nullptr,
+ bool disallow_trivial_move = false);
+
+ Status TEST_SwitchWAL();
+
+ bool TEST_UnableToReleaseOldestLog() { return unable_to_release_oldest_log_; }
+
+ bool TEST_IsLogGettingFlushed() {
+ return alive_log_files_.begin()->getting_flushed;
+ }
+
+ Status TEST_SwitchMemtable(ColumnFamilyData* cfd = nullptr);
+
+ // Force current memtable contents to be flushed.
+ Status TEST_FlushMemTable(bool wait = true, bool allow_write_stall = false,
+ ColumnFamilyHandle* cfh = nullptr);
+
+ Status TEST_FlushMemTable(ColumnFamilyData* cfd,
+ const FlushOptions& flush_opts);
+
+ // Flush (multiple) ColumnFamilyData without using ColumnFamilyHandle. This
+ // is because in certain cases, we can flush column families, wait for the
+ // flush to complete, but delete the column family handle before the wait
+ // finishes. For example in CompactRange.
+ Status TEST_AtomicFlushMemTables(const autovector<ColumnFamilyData*>& cfds,
+ const FlushOptions& flush_opts);
+
+ // Wait for background threads to complete scheduled work.
+ Status TEST_WaitForBackgroundWork();
+
+ // Wait for memtable compaction
+ Status TEST_WaitForFlushMemTable(ColumnFamilyHandle* column_family = nullptr);
+
+ // Wait for any compaction to finish.
+ // The bool parameter additionally waits until unscheduledCompactions_ == 0;
+ // this is only for the special CancelledCompactions test.
+ Status TEST_WaitForCompact(bool waitUnscheduled = false);
+
+ // Wait for any background purge
+ Status TEST_WaitForPurge();
+
+ // Get the background error status
+ Status TEST_GetBGError();
+
+ // Return the maximum overlapping data (in bytes) at next level for any
+ // file at a level >= 1.
+ uint64_t TEST_MaxNextLevelOverlappingBytes(
+ ColumnFamilyHandle* column_family = nullptr);
+
+ // Return the current manifest file no.
+ uint64_t TEST_Current_Manifest_FileNo();
+
+ // Returns the number that'll be assigned to the next file that's created.
+ uint64_t TEST_Current_Next_FileNo();
+
+ // get total level0 file size. Only for testing.
+ uint64_t TEST_GetLevel0TotalSize();
+
+ void TEST_GetFilesMetaData(
+ ColumnFamilyHandle* column_family,
+ std::vector<std::vector<FileMetaData>>* metadata,
+ std::vector<std::shared_ptr<BlobFileMetaData>>* blob_metadata = nullptr);
+
+ void TEST_LockMutex();
+
+ void TEST_UnlockMutex();
+
+ // REQUIRES: mutex locked
+ void* TEST_BeginWrite();
+
+ // REQUIRES: mutex locked
+ // pass the pointer that you got from TEST_BeginWrite()
+ void TEST_EndWrite(void* w);
+
+ uint64_t TEST_MaxTotalInMemoryState() const {
+ return max_total_in_memory_state_;
+ }
+
+ size_t TEST_LogsToFreeSize();
+
+ uint64_t TEST_LogfileNumber();
+
+ uint64_t TEST_total_log_size() const { return total_log_size_; }
+
+ // Returns column family name to ImmutableCFOptions map.
+ Status TEST_GetAllImmutableCFOptions(
+ std::unordered_map<std::string, const ImmutableCFOptions*>* iopts_map);
+
+ // Return the latest MutableCFOptions of a column family
+ Status TEST_GetLatestMutableCFOptions(ColumnFamilyHandle* column_family,
+ MutableCFOptions* mutable_cf_options);
+
+ Cache* TEST_table_cache() { return table_cache_.get(); }
+
+ WriteController& TEST_write_controler() { return write_controller_; }
+
+ uint64_t TEST_FindMinLogContainingOutstandingPrep();
+ uint64_t TEST_FindMinPrepLogReferencedByMemTable();
+ size_t TEST_PreparedSectionCompletedSize();
+ size_t TEST_LogsWithPrepSize();
+
+ int TEST_BGCompactionsAllowed() const;
+ int TEST_BGFlushesAllowed() const;
+ size_t TEST_GetWalPreallocateBlockSize(uint64_t write_buffer_size) const;
+ void TEST_WaitForPeridicTaskRun(std::function<void()> callback) const;
+ SeqnoToTimeMapping TEST_GetSeqnoToTimeMapping() const;
+ size_t TEST_EstimateInMemoryStatsHistorySize() const;
+
+ uint64_t TEST_GetCurrentLogNumber() const {
+ InstrumentedMutexLock l(mutex());
+ assert(!logs_.empty());
+ return logs_.back().number;
+ }
+
+ const std::unordered_set<uint64_t>& TEST_GetFilesGrabbedForPurge() const {
+ return files_grabbed_for_purge_;
+ }
+
+#ifndef ROCKSDB_LITE
+ const PeriodicTaskScheduler& TEST_GetPeriodicTaskScheduler() const;
+#endif // !ROCKSDB_LITE
+
+#endif // NDEBUG
+
+ // persist stats to column family "_persistent_stats"
+ void PersistStats();
+
+ // dump rocksdb.stats to LOG
+ void DumpStats();
+
+ // flush LOG out of application buffer
+ void FlushInfoLog();
+
+ // record current sequence number to time mapping
+ void RecordSeqnoToTimeMapping();
+
+ // Interface to block and signal the DB in case of stalling writes by
+ // WriteBufferManager. Each DBImpl object contains a ptr to WBMStallInterface.
+ // When DB needs to be blocked or signalled by WriteBufferManager,
+ // state_ is changed accordingly.
+ class WBMStallInterface : public StallInterface {
+ public:
+ enum State {
+ BLOCKED = 0,
+ RUNNING,
+ };
+
+ WBMStallInterface() : state_cv_(&state_mutex_) {
+ MutexLock lock(&state_mutex_);
+ state_ = State::RUNNING;
+ }
+
+ void SetState(State state) {
+ MutexLock lock(&state_mutex_);
+ state_ = state;
+ }
+
+ // Change the state_ to State::BLOCKED and wait until its state is
+ // changed by WriteBufferManager. When stall is cleared, Signal() is
+ // called to change the state and unblock the DB.
+ void Block() override {
+ MutexLock lock(&state_mutex_);
+ while (state_ == State::BLOCKED) {
+ TEST_SYNC_POINT("WBMStallInterface::BlockDB");
+ state_cv_.Wait();
+ }
+ }
+
+ // Called from WriteBufferManager. This function changes the state_
+ // to State::RUNNING indicating the stall is cleared and DB can proceed.
+ void Signal() override {
+ {
+ MutexLock lock(&state_mutex_);
+ state_ = State::RUNNING;
+ }
+ state_cv_.Signal();
+ }
+
+ private:
+ // Condition variable and mutex to block and
+ // signal the DB during the stalling process.
+ port::Mutex state_mutex_;
+ port::CondVar state_cv_;
+ // State representing whether the DB is running or blocked because of a
+ // stall by WriteBufferManager.
+ State state_;
+ };
+
+ static void TEST_ResetDbSessionIdGen();
+ static std::string GenerateDbSessionId(Env* env);
+
+ bool seq_per_batch() const { return seq_per_batch_; }
+
+ protected:
+ const std::string dbname_;
+ // TODO(peterd): unify with VersionSet::db_id_
+ std::string db_id_;
+ // db_session_id_ is an identifier that gets reset
+ // every time the DB is opened
+ std::string db_session_id_;
+ std::unique_ptr<VersionSet> versions_;
+ // Flag to check whether we allocated and own the info log file
+ bool own_info_log_;
+ Status init_logger_creation_s_;
+ const DBOptions initial_db_options_;
+ Env* const env_;
+ std::shared_ptr<IOTracer> io_tracer_;
+ const ImmutableDBOptions immutable_db_options_;
+ FileSystemPtr fs_;
+ MutableDBOptions mutable_db_options_;
+ Statistics* stats_;
+ std::unordered_map<std::string, RecoveredTransaction*>
+ recovered_transactions_;
+ std::unique_ptr<Tracer> tracer_;
+ InstrumentedMutex trace_mutex_;
+ BlockCacheTracer block_cache_tracer_;
+
+ // constant false canceled flag, used when the compaction is not manual
+ const std::atomic<bool> kManualCompactionCanceledFalse_{false};
+
+ // State below is protected by mutex_
+ // With two_write_queues enabled, some of the variables that are accessed during
+ // WriteToWAL need different synchronization: log_empty_, alive_log_files_,
+ // logs_, logfile_number_. Refer to the definition of each variable below for
+ // more description.
+ //
+ // `mutex_` can be a hot lock in some workloads, so it deserves dedicated
+ // cachelines.
+ mutable CacheAlignedInstrumentedMutex mutex_;
+
+ ColumnFamilyHandleImpl* default_cf_handle_;
+ InternalStats* default_cf_internal_stats_;
+
+ // table_cache_ provides its own synchronization
+ std::shared_ptr<Cache> table_cache_;
+
+ ErrorHandler error_handler_;
+
+ // Unified interface for logging events
+ EventLogger event_logger_;
+
+ // Only used for dynamically adjusting max_total_wal_size. It is a sum of
+ // [write_buffer_size * max_write_buffer_number] over all column families.
+ std::atomic<uint64_t> max_total_in_memory_state_;
+
+ // The options to access storage files
+ const FileOptions file_options_;
+
+ // Additional options for compaction and flush
+ FileOptions file_options_for_compaction_;
+
+ std::unique_ptr<ColumnFamilyMemTablesImpl> column_family_memtables_;
+
+ // Increase the sequence number after writing each batch, whether memtable is
+ // disabled for that or not. Otherwise the sequence number is increased after
+ // writing each key into memtable. This implies that when disable_memtable is
+ // set, the seq is not increased at all.
+ //
+ // Default: false
+ const bool seq_per_batch_;
+ // This determines during recovery whether we expect one writebatch per
+ // recovered transaction, or potentially multiple writebatches per
+ // transaction. For WriteUnprepared, this is set to false, since multiple
+ // batches can exist per transaction.
+ //
+ // Default: true
+ const bool batch_per_txn_;
+
+ // Each flush or compaction gets its own job id. This counter makes sure
+ // they're unique.
+ std::atomic<int> next_job_id_;
+
+ std::atomic<bool> shutting_down_;
+
+ // RecoveryContext stores the context about version edits along
+ // with corresponding column_family_data and column_family_options.
+ class RecoveryContext {
+ public:
+ ~RecoveryContext() {
+ for (auto& edit_list : edit_lists_) {
+ for (auto* edit : edit_list) {
+ delete edit;
+ }
+ }
+ }
+
+ void UpdateVersionEdits(ColumnFamilyData* cfd, const VersionEdit& edit) {
+ assert(cfd != nullptr);
+ if (map_.find(cfd->GetID()) == map_.end()) {
+ uint32_t size = static_cast<uint32_t>(map_.size());
+ map_.emplace(cfd->GetID(), size);
+ cfds_.emplace_back(cfd);
+ mutable_cf_opts_.emplace_back(cfd->GetLatestMutableCFOptions());
+ edit_lists_.emplace_back(autovector<VersionEdit*>());
+ }
+ uint32_t i = map_[cfd->GetID()];
+ edit_lists_[i].emplace_back(new VersionEdit(edit));
+ }
+
+ std::unordered_map<uint32_t, uint32_t> map_; // cf_id to index;
+ autovector<ColumnFamilyData*> cfds_;
+ autovector<const MutableCFOptions*> mutable_cf_opts_;
+ autovector<autovector<VersionEdit*>> edit_lists_;
+ // files_to_delete_ contains sst files
+ std::unordered_set<std::string> files_to_delete_;
+ };
+
+ // Persist options to the options file.
+ // Except in DB::Open(), WriteOptionsFile can only be called when:
+ // if need_mutex_lock = false, the DB mutex is already held;
+ // if need_enter_write_thread = false, the caller is already in the write
+ // thread. Otherwise the method locks the mutex / enters the write thread itself.
+ Status WriteOptionsFile(bool need_mutex_lock, bool need_enter_write_thread);
+
+ Status CompactRangeInternal(const CompactRangeOptions& options,
+ ColumnFamilyHandle* column_family,
+ const Slice* begin, const Slice* end,
+ const std::string& trim_ts);
+
+ // The following two functions can only be called when:
+ // 1. WriteThread::Writer::EnterUnbatched() is used.
+ // 2. db_mutex is NOT held
+ Status RenameTempFileToOptionsFile(const std::string& file_name);
+ Status DeleteObsoleteOptionsFiles();
+
+ void NotifyOnFlushBegin(ColumnFamilyData* cfd, FileMetaData* file_meta,
+ const MutableCFOptions& mutable_cf_options,
+ int job_id);
+
+ void NotifyOnFlushCompleted(
+ ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options,
+ std::list<std::unique_ptr<FlushJobInfo>>* flush_jobs_info);
+
+ void NotifyOnCompactionBegin(ColumnFamilyData* cfd, Compaction* c,
+ const Status& st,
+ const CompactionJobStats& job_stats, int job_id);
+
+ void NotifyOnCompactionCompleted(ColumnFamilyData* cfd, Compaction* c,
+ const Status& st,
+ const CompactionJobStats& job_stats,
+ int job_id);
+ void NotifyOnMemTableSealed(ColumnFamilyData* cfd,
+ const MemTableInfo& mem_table_info);
+
+#ifndef ROCKSDB_LITE
+ void NotifyOnExternalFileIngested(
+ ColumnFamilyData* cfd, const ExternalSstFileIngestionJob& ingestion_job);
+
+ virtual Status FlushForGetLiveFiles();
+#endif // !ROCKSDB_LITE
+
+ void NewThreadStatusCfInfo(ColumnFamilyData* cfd) const;
+
+ void EraseThreadStatusCfInfo(ColumnFamilyData* cfd) const;
+
+ void EraseThreadStatusDbInfo() const;
+
+ // If disable_memtable is set, the application logic must guarantee that the
+ // batch will still be skipped from the memtable during recovery. An exception
+ // to this is seq_per_batch_ mode, in which since each batch already takes one
+ // seq, it is ok for the batch to write to memtable during recovery as long as
+ // it only takes one sequence number: i.e., no duplicate keys.
+ // In WriteCommitted it is guaranteed since disable_memtable is used for
+ // prepare batch which will be written to memtable later during the commit,
+ // and in WritePrepared it is guaranteed since it will be used only for WAL
+ // markers which will never be written to memtable. If the commit marker is
+ // accompanied with CommitTimeWriteBatch that is not written to memtable as
+ // long as it has no duplicate keys, it does not violate the one-seq-per-batch
+ // policy.
+ // batch_cnt is expected to be non-zero in seq_per_batch mode and
+ // indicates the number of sub-batches. A sub-batch is a subset of the write
+ // batch that does not have duplicate keys.
+ Status WriteImpl(const WriteOptions& options, WriteBatch* updates,
+ WriteCallback* callback = nullptr,
+ uint64_t* log_used = nullptr, uint64_t log_ref = 0,
+ bool disable_memtable = false, uint64_t* seq_used = nullptr,
+ size_t batch_cnt = 0,
+ PreReleaseCallback* pre_release_callback = nullptr,
+ PostMemTableCallback* post_memtable_callback = nullptr);
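+
+ // Worked example of the sub-batch definition above: in seq_per_batch mode a
+ // batch {Put(k1), Put(k2), Put(k1)} splits into two sub-batches, {k1, k2}
+ // and {k1}, because the second Put(k1) duplicates a key; its batch_cnt
+ // would therefore be 2.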
+
+ Status PipelinedWriteImpl(const WriteOptions& options, WriteBatch* updates,
+ WriteCallback* callback = nullptr,
+ uint64_t* log_used = nullptr, uint64_t log_ref = 0,
+ bool disable_memtable = false,
+ uint64_t* seq_used = nullptr);
+
+ // Write only to memtables without joining any write queue
+ Status UnorderedWriteMemtable(const WriteOptions& write_options,
+ WriteBatch* my_batch, WriteCallback* callback,
+ uint64_t log_ref, SequenceNumber seq,
+ const size_t sub_batch_cnt);
+
+ // Whether the batch requires to be assigned with an order
+ enum AssignOrder : bool { kDontAssignOrder, kDoAssignOrder };
+ // Whether it requires publishing last sequence or not
+ enum PublishLastSeq : bool { kDontPublishLastSeq, kDoPublishLastSeq };
+
+ // Join the write_thread to write the batch only to the WAL. It is the
+ // responsibility of the caller to also write the write batch to the memtable
+ // if it is required.
+ //
+ // sub_batch_cnt is expected to be non-zero when assign_order = kDoAssignOrder
+ // indicating the number of sub-batches in my_batch. A sub-batch is a subset
+ // of the write batch that does not have duplicate keys. When seq_per_batch is
+ // not set, each key is a separate sub_batch. Otherwise each duplicate key
+ // marks start of a new sub-batch.
+ Status WriteImplWALOnly(
+ WriteThread* write_thread, const WriteOptions& options,
+ WriteBatch* updates, WriteCallback* callback, uint64_t* log_used,
+ const uint64_t log_ref, uint64_t* seq_used, const size_t sub_batch_cnt,
+ PreReleaseCallback* pre_release_callback, const AssignOrder assign_order,
+ const PublishLastSeq publish_last_seq, const bool disable_memtable);
+
+ // Write cached_recoverable_state_ to the memtable if it is not empty.
+ // The writer must be the leader in write_thread_ and hold mutex_.
+ Status WriteRecoverableState();
+
+ // Actual implementation of Close()
+ Status CloseImpl();
+
+ // Recover the descriptor from persistent storage. May do a significant
+ // amount of work to recover recently logged updates. Any changes to
+ // be made to the descriptor are added to *edit.
+ // recovered_seq is set to less than kMaxSequenceNumber if the log's tail is
+ // skipped.
+ // recovery_ctx stores the context about version edits and all those
+ // edits are persisted to new Manifest after successfully syncing the new WAL.
+ virtual Status Recover(
+ const std::vector<ColumnFamilyDescriptor>& column_families,
+ bool read_only = false, bool error_if_wal_file_exists = false,
+ bool error_if_data_exists_in_wals = false,
+ uint64_t* recovered_seq = nullptr,
+ RecoveryContext* recovery_ctx = nullptr);
+
+ virtual bool OwnTablesAndLogs() const { return true; }
+
+ // Setup DB identity file, and write DB ID to manifest if necessary.
+ Status SetupDBId(bool read_only, RecoveryContext* recovery_ctx);
+ // Assign db_id_ and write DB ID to manifest if necessary.
+ void SetDBId(std::string&& id, bool read_only, RecoveryContext* recovery_ctx);
+
+ // REQUIRES: db mutex held when calling this function, but the db mutex can
+ // be released and re-acquired. Db mutex will be held when the function
+ // returns.
+ // After recovery, there may be SST files in db/cf paths that are
+ // not referenced in the MANIFEST, e.g. because
+ // 1. it's a best-effort recovery, or
+ // 2. the VersionEdits referencing the SST files were appended to
+ // RecoveryContext but the DB crashed while syncing the MANIFEST, so the
+ // VersionEdits were still not synced to the MANIFEST during recovery.
+ // It stores the SST files to be deleted in RecoveryContext. In the
+ // meantime, we find out the largest file number present in the paths, and
+ // bump up the version set's next_file_number_ to be 1 + largest_file_number.
+ // recovery_ctx stores the context about version edits and files to be
+ // deleted. All those edits are persisted to new Manifest after successfully
+ // syncing the new WAL.
+ Status DeleteUnreferencedSstFiles(RecoveryContext* recovery_ctx);
+
+ // SetDbSessionId() should be called in the constructor DBImpl()
+ // to ensure that db_session_id_ gets updated every time the DB is opened
+ void SetDbSessionId();
+
+ Status FailIfCfHasTs(const ColumnFamilyHandle* column_family) const;
+ Status FailIfTsMismatchCf(ColumnFamilyHandle* column_family, const Slice& ts,
+ bool ts_for_read) const;
+
+ // recovery_ctx stores the context about version edits;
+ // LogAndApplyForRecovery persists all those edits to a new Manifest after
+ // successfully syncing the new WAL.
+ // LogAndApplyForRecovery should be called only once during recovery, and it
+ // should be called when RocksDB writes its first new MANIFEST during this
+ // recovery.
+ Status LogAndApplyForRecovery(const RecoveryContext& recovery_ctx);
+
+ void InvokeWalFilterIfNeededOnColumnFamilyToWalNumberMap();
+
+ // Return true to proceed with current WAL record whose content is stored in
+ // `batch`. Return false to skip current WAL record.
+ bool InvokeWalFilterIfNeededOnWalRecord(uint64_t wal_number,
+ const std::string& wal_fname,
+ log::Reader::Reporter& reporter,
+ Status& status, bool& stop_replay,
+ WriteBatch& batch);
+
+ private:
+ friend class DB;
+ friend class ErrorHandler;
+ friend class InternalStats;
+ friend class PessimisticTransaction;
+ friend class TransactionBaseImpl;
+ friend class WriteCommittedTxn;
+ friend class WritePreparedTxn;
+ friend class WritePreparedTxnDB;
+ friend class WriteBatchWithIndex;
+ friend class WriteUnpreparedTxnDB;
+ friend class WriteUnpreparedTxn;
+
+#ifndef ROCKSDB_LITE
+ friend class ForwardIterator;
+#endif
+ friend struct SuperVersion;
+ friend class CompactedDBImpl;
+ friend class DBTest_ConcurrentFlushWAL_Test;
+ friend class DBTest_MixedSlowdownOptionsStop_Test;
+ friend class DBCompactionTest_CompactBottomLevelFilesWithDeletions_Test;
+ friend class DBCompactionTest_CompactionDuringShutdown_Test;
+ friend class StatsHistoryTest_PersistentStatsCreateColumnFamilies_Test;
+#ifndef NDEBUG
+ friend class DBTest2_ReadCallbackTest_Test;
+ friend class WriteCallbackPTest_WriteWithCallbackTest_Test;
+ friend class XFTransactionWriteHandler;
+ friend class DBBlobIndexTest;
+ friend class WriteUnpreparedTransactionTest_RecoveryTest_Test;
+#endif
+
+ struct CompactionState;
+ struct PrepickedCompaction;
+ struct PurgeFileInfo;
+
+ struct WriteContext {
+ SuperVersionContext superversion_context;
+ autovector<MemTable*> memtables_to_free_;
+
+ explicit WriteContext(bool create_superversion = false)
+ : superversion_context(create_superversion) {}
+
+ ~WriteContext() {
+ superversion_context.Clean();
+ for (auto& m : memtables_to_free_) {
+ delete m;
+ }
+ }
+ };
+
+ struct LogFileNumberSize {
+ explicit LogFileNumberSize(uint64_t _number) : number(_number) {}
+ LogFileNumberSize() {}
+ void AddSize(uint64_t new_size) { size += new_size; }
+ uint64_t number;
+ uint64_t size = 0;
+ bool getting_flushed = false;
+ };
+
+ struct LogWriterNumber {
+ // pass ownership of _writer
+ LogWriterNumber(uint64_t _number, log::Writer* _writer)
+ : number(_number), writer(_writer) {}
+
+ log::Writer* ReleaseWriter() {
+ auto* w = writer;
+ writer = nullptr;
+ return w;
+ }
+ Status ClearWriter() {
+ Status s = writer->WriteBuffer();
+ delete writer;
+ writer = nullptr;
+ return s;
+ }
+
+ bool IsSyncing() { return getting_synced; }
+
+ uint64_t GetPreSyncSize() {
+ assert(getting_synced);
+ return pre_sync_size;
+ }
+
+ void PrepareForSync() {
+ assert(!getting_synced);
+ // Size is expected to be monotonically increasing.
+ assert(writer->file()->GetFlushedSize() >= pre_sync_size);
+ getting_synced = true;
+ pre_sync_size = writer->file()->GetFlushedSize();
+ }
+
+ void FinishSync() {
+ assert(getting_synced);
+ getting_synced = false;
+ }
+
+ uint64_t number;
+ // Visual Studio doesn't support a deque whose elements are noncopyable,
+ // which a std::unique_ptr member would make this struct, so a raw pointer
+ // is used instead.
+ log::Writer* writer; // own
+
+ private:
+ // true for some prefix of logs_
+ bool getting_synced = false;
+ // The size of the file before the sync happens. This amount is guaranteed
+ // to be persisted even if appends happen during sync so it can be used for
+ // tracking the synced size in MANIFEST.
+ uint64_t pre_sync_size = 0;
+ };
+
+ struct LogContext {
+ explicit LogContext(bool need_sync = false)
+ : need_log_sync(need_sync), need_log_dir_sync(need_sync) {}
+ bool need_log_sync = false;
+ bool need_log_dir_sync = false;
+ log::Writer* writer = nullptr;
+ LogFileNumberSize* log_file_number_size = nullptr;
+ };
+
+ // PurgeFileInfo is a structure to hold information of files to be deleted in
+ // purge_files_
+ struct PurgeFileInfo {
+ std::string fname;
+ std::string dir_to_sync;
+ FileType type;
+ uint64_t number;
+ int job_id;
+ PurgeFileInfo(std::string fn, std::string d, FileType t, uint64_t num,
+ int jid)
+ : fname(fn), dir_to_sync(d), type(t), number(num), job_id(jid) {}
+ };
+
+ // Argument required by background flush thread.
+ struct BGFlushArg {
+ BGFlushArg()
+ : cfd_(nullptr), max_memtable_id_(0), superversion_context_(nullptr) {}
+ BGFlushArg(ColumnFamilyData* cfd, uint64_t max_memtable_id,
+ SuperVersionContext* superversion_context)
+ : cfd_(cfd),
+ max_memtable_id_(max_memtable_id),
+ superversion_context_(superversion_context) {}
+
+ // Column family to flush.
+ ColumnFamilyData* cfd_;
+ // Maximum ID of memtable to flush. In this column family, memtables with
+ // IDs smaller than this value must be flushed before this flush completes.
+ uint64_t max_memtable_id_;
+ // Pointer to a SuperVersionContext object. After flush completes, RocksDB
+ // installs a new superversion for the column family. This operation
+ // requires a SuperVersionContext object (currently embedded in JobContext).
+ SuperVersionContext* superversion_context_;
+ };
+
+ // Argument passed to flush thread.
+ struct FlushThreadArg {
+ DBImpl* db_;
+
+ Env::Priority thread_pri_;
+ };
+
+ // Information for a manual compaction
+ struct ManualCompactionState {
+ ManualCompactionState(ColumnFamilyData* _cfd, int _input_level,
+ int _output_level, uint32_t _output_path_id,
+ bool _exclusive, bool _disallow_trivial_move,
+ std::atomic<bool>* _canceled)
+ : cfd(_cfd),
+ input_level(_input_level),
+ output_level(_output_level),
+ output_path_id(_output_path_id),
+ exclusive(_exclusive),
+ disallow_trivial_move(_disallow_trivial_move),
+ canceled(_canceled ? *_canceled : canceled_internal_storage) {}
+ // When _canceled is not provided by the user, we assign the reference of
+ // canceled_internal_storage to it to consolidate canceled and
+ // manual_compaction_paused, since DisableManualCompaction() might be
+ // called.
+
+ ColumnFamilyData* cfd;
+ int input_level;
+ int output_level;
+ uint32_t output_path_id;
+ Status status;
+ bool done = false;
+ bool in_progress = false; // compaction request being processed?
+ bool incomplete = false; // only part of requested range compacted
+ bool exclusive; // current behavior of only one manual
+ bool disallow_trivial_move; // Force actual compaction to run
+ const InternalKey* begin = nullptr; // nullptr means beginning of key range
+ const InternalKey* end = nullptr; // nullptr means end of key range
+ InternalKey* manual_end = nullptr; // how far we are compacting
+ InternalKey tmp_storage; // Used to keep track of compaction progress
+ InternalKey tmp_storage1; // Used to keep track of compaction progress
+
+ // When the user provides a canceled pointer in CompactRangeOptions, the
+ // above variable is a reference to the user-provided `canceled`; otherwise,
+ // it is a reference to canceled_internal_storage.
+ std::atomic<bool> canceled_internal_storage = false;
+ std::atomic<bool>& canceled; // Compaction canceled pointer reference
+ };
+ struct PrepickedCompaction {
+ // background compaction takes ownership of `compaction`.
+ Compaction* compaction;
+ // caller retains ownership of `manual_compaction_state` as it is reused
+ // across background compactions.
+ ManualCompactionState* manual_compaction_state; // nullptr if non-manual
+ // task limiter token is requested during compaction picking.
+ std::unique_ptr<TaskLimiterToken> task_token;
+ };
+
+ struct CompactionArg {
+ // caller retains ownership of `db`.
+ DBImpl* db;
+ // background compaction takes ownership of `prepicked_compaction`.
+ PrepickedCompaction* prepicked_compaction;
+ Env::Priority compaction_pri_;
+ };
+
+ // Initialize the built-in column family for persistent stats. Depending on
+ // whether on-disk persistent stats have been enabled before, it may either
+ // create a new column family and column family handle or just a column family
+ // handle.
+ // Required: DB mutex held
+ Status InitPersistStatsColumnFamily();
+
+ // The Persistent Stats column family has two format version keys which are used
+ // for compatibility check. Write format version if it's created for the
+ // first time, read format version and check compatibility if recovering
+ // from disk. This function requires DB mutex held at entrance but may
+ // release and re-acquire DB mutex in the process.
+ // Required: DB mutex held
+ Status PersistentStatsProcessFormatVersion();
+
+ Status ResumeImpl(DBRecoverContext context);
+
+ void MaybeIgnoreError(Status* s) const;
+
+ const Status CreateArchivalDirectory();
+
+ Status CreateColumnFamilyImpl(const ColumnFamilyOptions& cf_options,
+ const std::string& cf_name,
+ ColumnFamilyHandle** handle);
+
+ Status DropColumnFamilyImpl(ColumnFamilyHandle* column_family);
+
+ // Delete any unneeded files and stale in-memory entries.
+ void DeleteObsoleteFiles();
+ // Delete obsolete files and log status and information of file deletion
+ void DeleteObsoleteFileImpl(int job_id, const std::string& fname,
+ const std::string& path_to_sync, FileType type,
+ uint64_t number);
+
+ // Background process needs to call
+ // auto x = CaptureCurrentFileNumberInPendingOutputs()
+ // auto file_num = versions_->NewFileNumber();
+ // <do something>
+ // ReleaseFileNumberFromPendingOutputs(x)
+ // This will protect any file with number `file_num` or greater from being
+ // deleted while <do something> is running.
+ // -----------
+ // This function will capture current file number and append it to
+ // pending_outputs_. This will prevent any background process from deleting
+ // any file created after this point.
+ std::list<uint64_t>::iterator CaptureCurrentFileNumberInPendingOutputs();
+ // This function should be called with the result of
+ // CaptureCurrentFileNumberInPendingOutputs(). It then marks that any file
+ // created between the calls CaptureCurrentFileNumberInPendingOutputs() and
+ // ReleaseFileNumberFromPendingOutputs() can now be deleted (if it's not live
+ // and blocked by any other pending_outputs_ calls)
+ void ReleaseFileNumberFromPendingOutputs(
+ std::unique_ptr<std::list<uint64_t>::iterator>& v);
+
+ IOStatus SyncClosedLogs(JobContext* job_context, VersionEdit* synced_wals);
+
+ // Flush the in-memory write buffer to storage. Switches to a new
+ // log-file/memtable and writes a new descriptor iff successful. Then
+ // installs a new super version for the column family.
+ Status FlushMemTableToOutputFile(
+ ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options,
+ bool* madeProgress, JobContext* job_context,
+ SuperVersionContext* superversion_context,
+ std::vector<SequenceNumber>& snapshot_seqs,
+ SequenceNumber earliest_write_conflict_snapshot,
+ SnapshotChecker* snapshot_checker, LogBuffer* log_buffer,
+ Env::Priority thread_pri);
+
+ // Flush the memtables of (multiple) column families to multiple files on
+ // persistent storage.
+ Status FlushMemTablesToOutputFiles(
+ const autovector<BGFlushArg>& bg_flush_args, bool* made_progress,
+ JobContext* job_context, LogBuffer* log_buffer, Env::Priority thread_pri);
+
+ Status AtomicFlushMemTablesToOutputFiles(
+ const autovector<BGFlushArg>& bg_flush_args, bool* made_progress,
+ JobContext* job_context, LogBuffer* log_buffer, Env::Priority thread_pri);
+
+ // REQUIRES: log_numbers are sorted in ascending order
+ // corrupted_log_found is set to true if we recover from a corrupted log file.
+ Status RecoverLogFiles(const std::vector<uint64_t>& log_numbers,
+ SequenceNumber* next_sequence, bool read_only,
+ bool* corrupted_log_found,
+ RecoveryContext* recovery_ctx);
+
+ // The following two methods are used to flush a memtable to
+ // storage. The first one is used at database recovery time (when the
+ // database is opened) and is heavyweight because it holds the mutex
+ // for the entire period. The second method, WriteLevel0Table, supports
+ // flushing memtables to storage concurrently.
+ Status WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
+ MemTable* mem, VersionEdit* edit);
+
+ // Get the size of a log file and, if truncate is true, truncate the
+ // log file to its actual size, thereby freeing preallocated space.
+ // Return success even if truncate fails
+ Status GetLogSizeAndMaybeTruncate(uint64_t wal_number, bool truncate,
+ LogFileNumberSize* log);
+
+ // Restore alive_log_files_ and total_log_size_ after recovery.
+ // It needs to run only when there's no flush during recovery
+ // (e.g. avoid_flush_during_recovery=true). May also trigger flush
+ // in case total_log_size > max_total_wal_size.
+ Status RestoreAliveLogFiles(const std::vector<uint64_t>& log_numbers);
+
+ // num_bytes: for slowdown case, delay time is calculated based on
+ // `num_bytes` going through.
+ Status DelayWrite(uint64_t num_bytes, const WriteOptions& write_options);
+
+ // Begin stalling of writes when memory usage increases beyond a certain
+ // threshold.
+ void WriteBufferManagerStallWrites();
+
+ Status ThrottleLowPriWritesIfNeeded(const WriteOptions& write_options,
+ WriteBatch* my_batch);
+
+ // REQUIRES: mutex locked and in write thread.
+ Status ScheduleFlushes(WriteContext* context);
+
+ void MaybeFlushStatsCF(autovector<ColumnFamilyData*>* cfds);
+
+ Status TrimMemtableHistory(WriteContext* context);
+
+ Status SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context);
+
+ void SelectColumnFamiliesForAtomicFlush(autovector<ColumnFamilyData*>* cfds);
+
+ // Force current memtable contents to be flushed.
+ Status FlushMemTable(ColumnFamilyData* cfd, const FlushOptions& options,
+ FlushReason flush_reason,
+ bool entered_write_thread = false);
+
+ Status AtomicFlushMemTables(
+ const autovector<ColumnFamilyData*>& column_family_datas,
+ const FlushOptions& options, FlushReason flush_reason,
+ bool entered_write_thread = false);
+
+ // Wait until flushing this column family won't stall writes
+ Status WaitUntilFlushWouldNotStallWrites(ColumnFamilyData* cfd,
+ bool* flush_needed);
+
+ // Wait for memtable flushed.
+ // If flush_memtable_id is non-null, wait until the memtable with the ID
+ // gets flushed. Otherwise, wait until the column family doesn't have any
+ // memtable pending flush.
+ // resuming_from_bg_err indicates whether the caller is attempting to resume
+ // from background error.
+ Status WaitForFlushMemTable(ColumnFamilyData* cfd,
+ const uint64_t* flush_memtable_id = nullptr,
+ bool resuming_from_bg_err = false) {
+ return WaitForFlushMemTables({cfd}, {flush_memtable_id},
+ resuming_from_bg_err);
+ }
+ // Wait for memtables to be flushed for multiple column families.
+ Status WaitForFlushMemTables(
+ const autovector<ColumnFamilyData*>& cfds,
+ const autovector<const uint64_t*>& flush_memtable_ids,
+ bool resuming_from_bg_err);
+
+ inline void WaitForPendingWrites() {
+ mutex_.AssertHeld();
+ TEST_SYNC_POINT("DBImpl::WaitForPendingWrites:BeforeBlock");
+ // If pipelined write is enabled, wait for all pending memtable
+ // writers.
+ if (immutable_db_options_.enable_pipelined_write) {
+ // Memtable writers may call DB::Get in case max_successive_merges > 0,
+ // which may lock mutex. Unlocking mutex here to avoid deadlock.
+ mutex_.Unlock();
+ write_thread_.WaitForMemTableWriters();
+ mutex_.Lock();
+ }
+
+ if (!immutable_db_options_.unordered_write) {
+ // Then the writes are finished before the next write group starts
+ return;
+ }
+
+ // Wait for the ones who already wrote to the WAL to finish their
+ // memtable write.
+ if (pending_memtable_writes_.load() != 0) {
+ std::unique_lock<std::mutex> guard(switch_mutex_);
+ switch_cv_.wait(guard,
+ [&] { return pending_memtable_writes_.load() == 0; });
+ }
+ }
+
+ // TaskType is used to identify tasks in the thread-pool; currently it only
+ // differentiates manual compaction, which can be unscheduled from the
+ // thread-pool.
+ enum class TaskType : uint8_t {
+ kDefault = 0,
+ kManualCompaction = 1,
+ kCount = 2,
+ };
+
+ // A task tag is used to identify tasks in the thread-pool; it is the
+ // DBImpl object's address + type.
+ inline void* GetTaskTag(TaskType type) {
+ return GetTaskTag(static_cast<uint8_t>(type));
+ }
+
+ inline void* GetTaskTag(uint8_t type) {
+ return static_cast<uint8_t*>(static_cast<void*>(this)) + type;
+ }
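+
+ // Example: the tag used to unschedule manual compactions is this DBImpl's
+ // address offset by the task type, i.e. GetTaskTag(TaskType::kManualCompaction)
+ // yields `this + 1` as an opaque pointer value.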
+
+ // REQUIRES: mutex locked and in write thread.
+ void AssignAtomicFlushSeq(const autovector<ColumnFamilyData*>& cfds);
+
+ // REQUIRES: mutex locked and in write thread.
+ Status SwitchWAL(WriteContext* write_context);
+
+ // REQUIRES: mutex locked and in write thread.
+ Status HandleWriteBufferManagerFlush(WriteContext* write_context);
+
+ // REQUIRES: mutex locked
+ Status PreprocessWrite(const WriteOptions& write_options,
+ LogContext* log_context, WriteContext* write_context);
+
+ // Merge write batches in the write group into merged_batch.
+ // Returns OK if merge is successful.
+ // Returns Corruption if corruption in write batch is detected.
+ Status MergeBatch(const WriteThread::WriteGroup& write_group,
+ WriteBatch* tmp_batch, WriteBatch** merged_batch,
+ size_t* write_with_wal, WriteBatch** to_be_cached_state);
+
+ // rate_limiter_priority is used to charge `DBOptions::rate_limiter`
+ // for automatic WAL flush (`Options::manual_wal_flush` == false)
+ // associated with this WriteToWAL
+ IOStatus WriteToWAL(const WriteBatch& merged_batch, log::Writer* log_writer,
+ uint64_t* log_used, uint64_t* log_size,
+ Env::IOPriority rate_limiter_priority,
+ LogFileNumberSize& log_file_number_size);
+
+ IOStatus WriteToWAL(const WriteThread::WriteGroup& write_group,
+ log::Writer* log_writer, uint64_t* log_used,
+ bool need_log_sync, bool need_log_dir_sync,
+ SequenceNumber sequence,
+ LogFileNumberSize& log_file_number_size);
+
+ IOStatus ConcurrentWriteToWAL(const WriteThread::WriteGroup& write_group,
+ uint64_t* log_used,
+ SequenceNumber* last_sequence, size_t seq_inc);
+
+ // Used by WriteImpl to update bg_error_ if paranoid check is enabled.
+ // Caller must hold mutex_.
+ void WriteStatusCheckOnLocked(const Status& status);
+
+ // Used by WriteImpl to update bg_error_ if paranoid check is enabled.
+ void WriteStatusCheck(const Status& status);
+
+ // Used by WriteImpl to update bg_error_ when IO error happens, e.g., write
+ // WAL, sync WAL fails, if paranoid check is enabled.
+ void IOStatusCheck(const IOStatus& status);
+
+ // Used by WriteImpl to update bg_error_ in case of memtable insert error.
+ void MemTableInsertStatusCheck(const Status& memtable_insert_status);
+
+#ifndef ROCKSDB_LITE
+ Status CompactFilesImpl(const CompactionOptions& compact_options,
+ ColumnFamilyData* cfd, Version* version,
+ const std::vector<std::string>& input_file_names,
+ std::vector<std::string>* const output_file_names,
+ const int output_level, int output_path_id,
+ JobContext* job_context, LogBuffer* log_buffer,
+ CompactionJobInfo* compaction_job_info);
+
+ // Wait for current IngestExternalFile() calls to finish.
+ // REQUIRES: mutex_ held
+ void WaitForIngestFile();
+#else
+ // IngestExternalFile is not supported in ROCKSDB_LITE so this function
+ // is a no-op
+ void WaitForIngestFile() {}
+#endif // ROCKSDB_LITE
+
+ ColumnFamilyData* GetColumnFamilyDataByName(const std::string& cf_name);
+
+ void MaybeScheduleFlushOrCompaction();
+
+ // A flush request specifies the column families to flush as well as the
+ // largest memtable id to persist for each column family. Once all the
+ // memtables whose IDs are smaller than or equal to this per-column-family
+ // value have been persisted, the flush request is considered to have
+ // completed its work for that column family. After completing the work for
+ // all column families in this request, the flush is considered complete.
+ using FlushRequest = std::vector<std::pair<ColumnFamilyData*, uint64_t>>;
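+
+ // Illustrative example (hypothetical IDs): a request to flush `cf1` up to
+ // memtable ID 12 and `cf2` up to memtable ID 7 would be the vector
+ // {{cf1, 12}, {cf2, 7}}, where cf1/cf2 are ColumnFamilyData*.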
+
+ void GenerateFlushRequest(const autovector<ColumnFamilyData*>& cfds,
+ FlushRequest* req);
+
+ void SchedulePendingFlush(const FlushRequest& req, FlushReason flush_reason);
+
+ void SchedulePendingCompaction(ColumnFamilyData* cfd);
+ void SchedulePendingPurge(std::string fname, std::string dir_to_sync,
+ FileType type, uint64_t number, int job_id);
+ static void BGWorkCompaction(void* arg);
+ // Runs a pre-chosen universal compaction involving bottom level in a
+ // separate, bottom-pri thread pool.
+ static void BGWorkBottomCompaction(void* arg);
+ static void BGWorkFlush(void* arg);
+ static void BGWorkPurge(void* arg);
+ static void UnscheduleCompactionCallback(void* arg);
+ static void UnscheduleFlushCallback(void* arg);
+ void BackgroundCallCompaction(PrepickedCompaction* prepicked_compaction,
+ Env::Priority thread_pri);
+ void BackgroundCallFlush(Env::Priority thread_pri);
+ void BackgroundCallPurge();
+ Status BackgroundCompaction(bool* madeProgress, JobContext* job_context,
+ LogBuffer* log_buffer,
+ PrepickedCompaction* prepicked_compaction,
+ Env::Priority thread_pri);
+ Status BackgroundFlush(bool* madeProgress, JobContext* job_context,
+ LogBuffer* log_buffer, FlushReason* reason,
+ Env::Priority thread_pri);
+
+ bool EnoughRoomForCompaction(ColumnFamilyData* cfd,
+ const std::vector<CompactionInputFiles>& inputs,
+ bool* sfm_bookkeeping, LogBuffer* log_buffer);
+
+ // Request compaction tasks token from compaction thread limiter.
+ // It always succeeds if force = true or the limiter is disabled.
+ bool RequestCompactionToken(ColumnFamilyData* cfd, bool force,
+ std::unique_ptr<TaskLimiterToken>* token,
+ LogBuffer* log_buffer);
+
+ // Schedule background tasks
+ Status StartPeriodicTaskScheduler();
+
+ Status RegisterRecordSeqnoTimeWorker();
+
+ void PrintStatistics();
+
+ size_t EstimateInMemoryStatsHistorySize() const;
+
+ // Return the minimum empty level that could hold the total data in the
+ // input level. Return the input level, if such a level cannot be found.
+ int FindMinimumEmptyLevelFitting(ColumnFamilyData* cfd,
+ const MutableCFOptions& mutable_cf_options,
+ int level);
+
+ // Move the files in the input level to the target level.
+ // If target_level < 0, automatically calculate the minimum level that could
+ // hold the data set.
+ Status ReFitLevel(ColumnFamilyData* cfd, int level, int target_level = -1);
+
+ // helper functions for adding and removing from flush & compaction queues
+ void AddToCompactionQueue(ColumnFamilyData* cfd);
+ ColumnFamilyData* PopFirstFromCompactionQueue();
+ FlushRequest PopFirstFromFlushQueue();
+
+ // Pick the first unthrottled compaction with task token from queue.
+ ColumnFamilyData* PickCompactionFromQueue(
+ std::unique_ptr<TaskLimiterToken>* token, LogBuffer* log_buffer);
+
+ // helper function to call after some of the logs_ were synced
+ void MarkLogsSynced(uint64_t up_to, bool synced_dir, VersionEdit* edit);
+ Status ApplyWALToManifest(VersionEdit* edit);
+ // WALs with log number up to up_to are not synced successfully.
+ void MarkLogsNotSynced(uint64_t up_to);
+
+ SnapshotImpl* GetSnapshotImpl(bool is_write_conflict_boundary,
+ bool lock = true);
+
+ // If snapshot_seq != kMaxSequenceNumber, then this function can only be
+ // called from the write thread that publishes sequence numbers to readers.
+ // For 1) write-committed, or 2) write-prepared + one-write-queue, this will
+ // be the write thread performing memtable writes. For write-prepared with
+ // two write queues, this will be the write thread writing commit marker to
+ // the WAL.
+ // If snapshot_seq == kMaxSequenceNumber, this function is called by a caller
+ // ensuring no writes to the database.
+ std::pair<Status, std::shared_ptr<const SnapshotImpl>>
+ CreateTimestampedSnapshotImpl(SequenceNumber snapshot_seq, uint64_t ts,
+ bool lock = true);
+
+ uint64_t GetMaxTotalWalSize() const;
+
+ FSDirectory* GetDataDir(ColumnFamilyData* cfd, size_t path_id) const;
+
+ Status MaybeReleaseTimestampedSnapshotsAndCheck();
+
+ Status CloseHelper();
+
+ void WaitForBackgroundWork();
+
+ // Background threads call this function, which is just a wrapper around
+ // the InstallSuperVersion() function. Background threads carry
+ // sv_context which can have new_superversion already
+ // allocated.
+ // All ColumnFamily state changes go through this function. Here we analyze
+ // the new state and we schedule background work if we detect that the new
+ // state needs flush or compaction.
+ void InstallSuperVersionAndScheduleWork(
+ ColumnFamilyData* cfd, SuperVersionContext* sv_context,
+ const MutableCFOptions& mutable_cf_options);
+
+ bool GetIntPropertyInternal(ColumnFamilyData* cfd,
+ const DBPropertyInfo& property_info,
+ bool is_locked, uint64_t* value);
+ bool GetPropertyHandleOptionsStatistics(std::string* value);
+
+ bool HasPendingManualCompaction();
+ bool HasExclusiveManualCompaction();
+ void AddManualCompaction(ManualCompactionState* m);
+ void RemoveManualCompaction(ManualCompactionState* m);
+ bool ShouldntRunManualCompaction(ManualCompactionState* m);
+ bool HaveManualCompaction(ColumnFamilyData* cfd);
+ bool MCOverlap(ManualCompactionState* m, ManualCompactionState* m1);
+#ifndef ROCKSDB_LITE
+ void BuildCompactionJobInfo(const ColumnFamilyData* cfd, Compaction* c,
+ const Status& st,
+ const CompactionJobStats& compaction_job_stats,
+ const int job_id, const Version* current,
+ CompactionJobInfo* compaction_job_info) const;
+ // Reserve the next 'num' file numbers for to-be-ingested external SST files,
+ // and return the current file_number in 'next_file_number'.
+ // Write a version edit to the MANIFEST.
+ Status ReserveFileNumbersBeforeIngestion(
+ ColumnFamilyData* cfd, uint64_t num,
+ std::unique_ptr<std::list<uint64_t>::iterator>& pending_output_elem,
+ uint64_t* next_file_number);
+#endif //! ROCKSDB_LITE
+
+ bool ShouldPurge(uint64_t file_number) const;
+ void MarkAsGrabbedForPurge(uint64_t file_number);
+
+ size_t GetWalPreallocateBlockSize(uint64_t write_buffer_size) const;
+ Env::WriteLifeTimeHint CalculateWALWriteHint() { return Env::WLTH_SHORT; }
+
+ IOStatus CreateWAL(uint64_t log_file_num, uint64_t recycle_log_number,
+ size_t preallocate_block_size, log::Writer** new_log);
+
+ // Validate self-consistency of DB options
+ static Status ValidateOptions(const DBOptions& db_options);
+ // Validate self-consistency of DB options and its consistency with cf options
+ static Status ValidateOptions(
+ const DBOptions& db_options,
+ const std::vector<ColumnFamilyDescriptor>& column_families);
+
+ // Utility function to do some debug validation and sort the given vector
+ // of MultiGet keys
+ void PrepareMultiGetKeys(
+ const size_t num_keys, bool sorted,
+ autovector<KeyContext*, MultiGetContext::MAX_BATCH_SIZE>* key_ptrs);
+
+ // A structure to hold the information required to process MultiGet of keys
+ // belonging to one column family. For a multi column family MultiGet, there
+ // will be a container of these objects.
+ struct MultiGetColumnFamilyData {
+ ColumnFamilyHandle* cf;
+ ColumnFamilyData* cfd;
+
+ // For the batched MultiGet which relies on sorted keys, start specifies
+ // the index of first key belonging to this column family in the sorted
+ // list.
+ size_t start;
+
+ // For the batched MultiGet case, num_keys specifies the number of keys
+ // belonging to this column family in the sorted list
+ size_t num_keys;
+
+ // SuperVersion for the column family obtained in a manner that ensures a
+ // consistent view across all column families in the DB
+ SuperVersion* super_version;
+ MultiGetColumnFamilyData(ColumnFamilyHandle* column_family,
+ SuperVersion* sv)
+ : cf(column_family),
+ cfd(static_cast<ColumnFamilyHandleImpl*>(cf)->cfd()),
+ start(0),
+ num_keys(0),
+ super_version(sv) {}
+
+ MultiGetColumnFamilyData(ColumnFamilyHandle* column_family, size_t first,
+ size_t count, SuperVersion* sv)
+ : cf(column_family),
+ cfd(static_cast<ColumnFamilyHandleImpl*>(cf)->cfd()),
+ start(first),
+ num_keys(count),
+ super_version(sv) {}
+
+ MultiGetColumnFamilyData() = default;
+ };
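+ // For illustration (handles and SuperVersions below are hypothetical): a
+ // batched MultiGet over two column families whose sorted keys occupy
+ // indices [0, 4) and [4, 6) of the key list would be described by:
+ //   MultiGetColumnFamilyData(cf1, /*first=*/0, /*count=*/4, sv1)
+ //   MultiGetColumnFamilyData(cf2, /*first=*/4, /*count=*/2, sv2)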
+
+ // A common function to obtain a consistent snapshot, which can be implicit
+ // if the user doesn't specify a snapshot in read_options, across
+ // multiple column families for MultiGet. It will attempt to get an implicit
+ // snapshot without acquiring the db_mutex, but will give up after a few
+ // tries and acquire the mutex if a memtable flush happens. The template
+ // allows both the batched and non-batched MultiGet to call this with
+ // either an std::unordered_map or autovector of column families.
+ //
+ // If callback is non-null, the callback is refreshed with the snapshot
+ // sequence number
+ //
+ // A return value of true indicates that the SuperVersions were obtained
+ // from the ColumnFamilyData, whereas false indicates they are thread
+ // local
+ template <class T>
+ bool MultiCFSnapshot(
+ const ReadOptions& read_options, ReadCallback* callback,
+ std::function<MultiGetColumnFamilyData*(typename T::iterator&)>&
+ iter_deref_func,
+ T* cf_list, SequenceNumber* snapshot);
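+ // Rough usage sketch (names are illustrative): the batched MultiGet builds
+ // a container of MultiGetColumnFamilyData, passes a functor that turns a
+ // container iterator into a MultiGetColumnFamilyData*, and afterwards
+ // either unrefs the SuperVersions through the ColumnFamilyData (return
+ // value true) or returns them to thread-local storage (return value false).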
+
+ // The actual implementation of the batching MultiGet. The caller is expected
+ // to have acquired the SuperVersion and pass in a snapshot sequence number
+ // in order to construct the LookupKeys. The start_key and num_keys specify
+ // the range of keys in the sorted_keys vector for a single column family.
+ Status MultiGetImpl(
+ const ReadOptions& read_options, size_t start_key, size_t num_keys,
+ autovector<KeyContext*, MultiGetContext::MAX_BATCH_SIZE>* sorted_keys,
+ SuperVersion* sv, SequenceNumber snap_seqnum, ReadCallback* callback);
+
+ Status DisableFileDeletionsWithLock();
+
+ Status IncreaseFullHistoryTsLowImpl(ColumnFamilyData* cfd,
+ std::string ts_low);
+
+ bool ShouldReferenceSuperVersion(const MergeContext& merge_context);
+
+ // Lock over the persistent DB state. Non-nullptr iff successfully acquired.
+ FileLock* db_lock_;
+
+ // In addition to mutex_, stats_history_mutex_ protects writes to stats_history_
+ InstrumentedMutex stats_history_mutex_;
+ // In addition to mutex_, log_write_mutex_ protects writes to logs_ and
+ // logfile_number_. With two_write_queues it also protects alive_log_files_
+ // and log_empty_. Refer to the definition of each variable below for more
+ // details.
+ // Note: to avoid deadlock, if both log_write_mutex_ and mutex_ need to be
+ // acquired, the order should be first mutex_ and then log_write_mutex_.
+ InstrumentedMutex log_write_mutex_;
+
+ // If zero, manual compactions are allowed to proceed. If non-zero, manual
+ // compactions may still be running, but will quickly fail with
+ // `Status::Incomplete`. The value indicates how many threads have paused
+ // manual compactions. It is accessed in read mode outside the DB mutex in
+ // compaction code paths.
+ std::atomic<int> manual_compaction_paused_;
+
+ // This condition variable is signaled on these conditions:
+ // * whenever bg_compaction_scheduled_ goes down to 0
+ // * if AnyManualCompaction, whenever a compaction finishes, even if it hasn't
+ // made any progress
+ // * whenever a compaction made any progress
+ // * whenever bg_flush_scheduled_ or bg_purge_scheduled_ value decreases
+ // (i.e. whenever a flush is done, even if it didn't make any progress)
+ // * whenever there is an error in background purge, flush or compaction
+ // * whenever num_running_ingest_file_ goes to 0.
+ // * whenever pending_purge_obsolete_files_ goes to 0.
+ // * whenever disable_delete_obsolete_files_ goes to 0.
+ // * whenever SetOptions successfully updates options.
+ // * whenever a column family is dropped.
+ InstrumentedCondVar bg_cv_;
+ // Writes are protected by locking both mutex_ and log_write_mutex_, and reads
+ // must be under either mutex_ or log_write_mutex_. Since after ::Open,
+ // logfile_number_ is currently updated only in write_thread_, it can be read
+ // from the same write_thread_ without any locks.
+ uint64_t logfile_number_;
+ // Log files that we can recycle. Must be protected by db mutex_.
+ std::deque<uint64_t> log_recycle_files_;
+ // Protected by log_write_mutex_.
+ bool log_dir_synced_;
+ // Without two_write_queues, reads and writes to log_empty_ are protected by
+ // mutex_. Since it is currently updated/read only in write_thread_, it can be
+ // accessed from the same write_thread_ without any locks. With
+ // two_write_queues, where it can be updated by different threads, reads and
+ // writes are protected by log_write_mutex_ instead. This is to avoid an
+ // expensive mutex_ lock during WAL writes, which update log_empty_.
+ bool log_empty_;
+
+ ColumnFamilyHandleImpl* persist_stats_cf_handle_;
+
+ bool persistent_stats_cfd_exists_ = true;
+
+ // alive_log_files_ is protected by mutex_ and log_write_mutex_ with details
+ // as follows:
+ // 1. read by FindObsoleteFiles() which can be called in either application
+ // thread or RocksDB bg threads, both mutex_ and log_write_mutex_ are
+ // held.
+ // 2. pop_front() by FindObsoleteFiles(), both mutex_ and log_write_mutex_
+ // are held.
+ // 3. push_back() by DBImpl::Open() and DBImpl::RestoreAliveLogFiles()
+ // (actually called by Open()), only mutex_ is held because at this point,
+ // the DB::Open() call has not returned success to application, and the
+ // only other thread(s) that can conflict are bg threads calling
+ // FindObsoleteFiles() which ensure that both mutex_ and log_write_mutex_
+ // are held when accessing alive_log_files_.
+ // 4. read by DBImpl::Open() is protected by mutex_.
+ // 5. push_back() by SwitchMemtable(). Both mutex_ and log_write_mutex_ are
+ // held. This is done by the write group leader. Note that in the case of
+ // two-write-queues, another WAL-only write thread can be writing to the
+ // WAL concurrently. See 9.
+ // 6. read by SwitchWAL() with both mutex_ and log_write_mutex_ held. This is
+ // done by write group leader.
+ // 7. read by ConcurrentWriteToWAL() by the write group leader in the case of
+ // two-write-queues. Only log_write_mutex_ is held to protect concurrent
+ // pop_front() by FindObsoleteFiles().
+ // 8. read by PreprocessWrite() by the write group leader. log_write_mutex_
+ // is held to protect the data structure from concurrent pop_front() by
+ // FindObsoleteFiles().
+ // 9. read by ConcurrentWriteToWAL() by a WAL-only write thread in the case
+ // of two-write-queues. Only log_write_mutex_ is held. This suffices to
+ // protect the data structure from concurrent push_back() by current
+ // write group leader as well as pop_front() by FindObsoleteFiles().
+ std::deque<LogFileNumberSize> alive_log_files_;
+
+ // Log files that aren't fully synced, and the current log file.
+ // Synchronization:
+ // 1. read by FindObsoleteFiles() which can be called either in application
+ // thread or RocksDB bg threads. log_write_mutex_ is always held, while
+ // some reads are performed without mutex_.
+ // 2. pop_front() by FindObsoleteFiles() with only log_write_mutex_ held.
+ // 3. read by DBImpl::Open() with both mutex_ and log_write_mutex_.
+ // 4. emplace_back() by DBImpl::Open() with both mutex_ and log_write_mutex_.
+ // Note that at this point, DB::Open() has not returned success to
+ // application, thus the only other thread(s) that can conflict are bg
+ // threads calling FindObsoleteFiles(). See 1.
+ // 5. iteration and clear() from CloseHelper() always hold log_write_mutex_
+ // and mutex_.
+ // 6. back() called by APIs FlushWAL() and LockWAL() are protected by only
+ // log_write_mutex_. These two can be called by application threads after
+ // DB::Open() returns success to applications.
+ // 7. read by SyncWAL(), another API, protected by only log_write_mutex_.
+ // 8. read by MarkLogsNotSynced() and MarkLogsSynced() are protected by
+ // log_write_mutex_.
+ // 9. erase() by MarkLogsSynced() protected by log_write_mutex_.
+ // 10. read by SyncClosedLogs() protected by only log_write_mutex_. This can
+ // happen in bg flush threads after DB::Open() returns success to
+ // applications.
+ // 11. reads, e.g. front(), iteration, and back() called by PreprocessWrite()
+ // holds only the log_write_mutex_. This is done by the write group
+ // leader. A bg thread calling FindObsoleteFiles() or MarkLogsSynced()
+ // can happen concurrently. This is fine because log_write_mutex_ is used
+ // by all parties. See 2, 5, 9.
+ // 12. reads, empty(), back() called by SwitchMemtable() hold both mutex_ and
+ // log_write_mutex_. This happens in the write group leader.
+ // 13. emplace_back() by SwitchMemtable() hold both mutex_ and
+ // log_write_mutex_. This happens in the write group leader. Can conflict
+ // with bg threads calling FindObsoleteFiles(), MarkLogsSynced(),
+ // SyncClosedLogs(), etc. as well as application threads calling
+ // FlushWAL(), SyncWAL(), LockWAL(). This is fine because all parties
+ // require at least log_write_mutex_.
+ // 14. iteration called in WriteToWAL(write_group) protected by
+ // log_write_mutex_. This is done by write group leader when
+ // two-write-queues is disabled and write needs to sync logs.
+ // 15. back() called in ConcurrentWriteToWAL() protected by log_write_mutex_.
+ // This can be done by the write group leader if two-write-queues is
+ // enabled. It can also be done by another WAL-only write thread.
+ //
+ // Other observations:
+ // - back() and items with getting_synced=true are not popped,
+ // - The same thread that sets getting_synced=true will reset it.
+ // - it follows that the object referred to by back() can be safely read from
+ // the write_thread_ without using mutex. Note that calling back() without
+ // mutex may be unsafe because different implementations of deque::back() may
+ // access other member variables of deque, causing undefined behaviors.
+ // Generally, do not access stl containers without proper synchronization.
+ // - it follows that the items with getting_synced=true can be safely read
+ // from the same thread that has set getting_synced=true
+ std::deque<LogWriterNumber> logs_;
+
+ // Signaled when getting_synced becomes false for some of the logs_.
+ InstrumentedCondVar log_sync_cv_;
+ // This is the app-level state that is written to the WAL but will be used
+ // only during recovery. Using this feature avoids writing the state to the
+ // memtable on normal writes and hence improves throughput. Each new write of
+ // the state will replace the previous state entirely even if the keys in the
+ // two consecutive states do not overlap.
+ // It is protected by log_write_mutex_ when two_write_queues_ is enabled.
+ // Otherwise only the head of write_thread_ can access it.
+ WriteBatch cached_recoverable_state_;
+ std::atomic<bool> cached_recoverable_state_empty_ = {true};
+ std::atomic<uint64_t> total_log_size_;
+
+ // If this is non-empty, we need to delete these log files in background
+ // threads. Protected by log_write_mutex_.
+ autovector<log::Writer*> logs_to_free_;
+
+ bool is_snapshot_supported_;
+
+ std::map<uint64_t, std::map<std::string, uint64_t>> stats_history_;
+
+ std::map<std::string, uint64_t> stats_slice_;
+
+ bool stats_slice_initialized_ = false;
+
+ Directories directories_;
+
+ WriteBufferManager* write_buffer_manager_;
+
+ WriteThread write_thread_;
+ WriteBatch tmp_batch_;
+ // The write thread for writers that have no memtable write. This will be used
+ // in 2PC to batch the prepares separately from the serial commit.
+ WriteThread nonmem_write_thread_;
+
+ WriteController write_controller_;
+
+ // Size of the last batch group. In slowdown mode, next write needs to
+ // sleep if it uses up the quota.
+ // Note: This is to protect memtable and compaction. If the batch only writes
+ // to the WAL, its size need not be included in this.
+ uint64_t last_batch_group_size_;
+
+ FlushScheduler flush_scheduler_;
+
+ TrimHistoryScheduler trim_history_scheduler_;
+
+ SnapshotList snapshots_;
+
+ TimestampedSnapshotList timestamped_snapshots_;
+
+ // For each background job, pending_outputs_ keeps the current file number at
+ // the time that background job started.
+ // FindObsoleteFiles()/PurgeObsoleteFiles() never deletes any file that has
+ // number bigger than any of the file number in pending_outputs_. Since file
+ // numbers grow monotonically, this also means that pending_outputs_ is always
+ // sorted. After a background job is done executing, its file number is
+ // deleted from pending_outputs_, which allows PurgeObsoleteFiles() to clean
+ // it up.
+ // State is protected with db mutex.
+ std::list<uint64_t> pending_outputs_;
+
+ // flush_queue_ and compaction_queue_ hold column families that we need to
+ // flush and compact, respectively.
+ // A column family is inserted into flush_queue_ when it satisfies the
+ // condition cfd->imm()->IsFlushPending().
+ // A column family is inserted into compaction_queue_ when it satisfies the
+ // condition cfd->NeedsCompaction().
+ // Column families in this list are all Ref()-erenced
+ // TODO(icanadi) Provide some kind of ReferencedColumnFamily class that will
+ // do RAII on ColumnFamilyData
+ // Column families are in this queue when they need to be flushed or
+ // compacted. Consumers of these queues are flush and compaction threads. When
+ // column family is put on this queue, we increase unscheduled_flushes_ and
+ // unscheduled_compactions_. When these variables are bigger than zero, that
+ // means we need to schedule background threads for flush and compaction.
+ // Once the background threads are scheduled, we decrease unscheduled_flushes_
+ // and unscheduled_compactions_. That way we keep track of number of
+ // compaction and flush threads we need to schedule. This scheduling is done
+ // in MaybeScheduleFlushOrCompaction()
+ // invariant(column family present in flush_queue_ <==>
+ // ColumnFamilyData::pending_flush_ == true)
+ std::deque<FlushRequest> flush_queue_;
+ // invariant(column family present in compaction_queue_ <==>
+ // ColumnFamilyData::pending_compaction_ == true)
+ std::deque<ColumnFamilyData*> compaction_queue_;
+
+ // A map to store file numbers and filenames of the files to be purged
+ std::unordered_map<uint64_t, PurgeFileInfo> purge_files_;
+
+ // A set to store the file numbers that have been assigned to certain
+ // JobContext. Current implementation tracks table and blob files only.
+ std::unordered_set<uint64_t> files_grabbed_for_purge_;
+
+ // A queue to store log writers to close. Protected by db mutex_.
+ std::deque<log::Writer*> logs_to_free_queue_;
+
+ std::deque<SuperVersion*> superversions_to_free_queue_;
+
+ int unscheduled_flushes_;
+
+ int unscheduled_compactions_;
+
+ // count how many background compactions are running or have been scheduled in
+ // the BOTTOM pool
+ int bg_bottom_compaction_scheduled_;
+
+ // count how many background compactions are running or have been scheduled
+ int bg_compaction_scheduled_;
+
+ // stores the number of compactions that are currently running
+ int num_running_compactions_;
+
+ // number of background memtable flush jobs, submitted to the HIGH pool
+ int bg_flush_scheduled_;
+
+ // stores the number of flushes that are currently running
+ int num_running_flushes_;
+
+ // number of background obsolete file purge jobs, submitted to the HIGH pool
+ int bg_purge_scheduled_;
+
+ std::deque<ManualCompactionState*> manual_compaction_dequeue_;
+
+ // Controls whether deletion of obsolete files is disabled:
+ // if 0, deletion is enabled;
+ // if non-zero, files will not be deleted.
+ // This enables two different threads to call
+ // EnableFileDeletions() and DisableFileDeletions()
+ // without any synchronization.
+ int disable_delete_obsolete_files_;
+
+ // Number of times FindObsoleteFiles has found deletable files and the
+ // corresponding call to PurgeObsoleteFiles has not yet finished.
+ int pending_purge_obsolete_files_;
+
+ // last time when DeleteObsoleteFiles with full scan was executed. Originally
+ // initialized with startup time.
+ uint64_t delete_obsolete_files_last_run_;
+
+ // last time stats were dumped to LOG
+ std::atomic<uint64_t> last_stats_dump_time_microsec_;
+
+ // The thread that wants to switch memtable can wait on this cv until the
+ // pending writes to the memtable finish.
+ std::condition_variable switch_cv_;
+ // The mutex used by switch_cv_. mutex_ should be acquired beforehand.
+ std::mutex switch_mutex_;
+ // Number of threads intending to write to memtable
+ std::atomic<size_t> pending_memtable_writes_ = {};
+
+ // A flag indicating whether the current rocksdb database has any
+ // data that is not yet persisted into either WAL or SST file.
+ // Used when disableWAL is true.
+ std::atomic<bool> has_unpersisted_data_;
+
+ // Set when an attempt was made to flush all column families that
+ // the oldest log depends on, but uncommitted data in the oldest
+ // log prevents the log from being released.
+ // We must attempt to free the dependent memtables again
+ // at a later time after the transaction in the oldest
+ // log is fully committed.
+
+ // Number of running IngestExternalFile() or CreateColumnFamilyWithImport()
+ // calls.
+ // REQUIRES: mutex held
+ int num_running_ingest_file_;
+
+#ifndef ROCKSDB_LITE
+ WalManager wal_manager_;
+#endif // ROCKSDB_LITE
+
+ // A value of > 0 temporarily disables scheduling of background work
+ int bg_work_paused_;
+
+ // A value of > 0 temporarily disables scheduling of background compaction
+ int bg_compaction_paused_;
+
+ // Guard against multiple concurrent refitting
+ bool refitting_level_;
+
+ // Indicate DB was opened successfully
+ bool opened_successfully_;
+
+ // The min threshold to trigger bottommost compaction for removing
+ // garbage, among all column families.
+ SequenceNumber bottommost_files_mark_threshold_ = kMaxSequenceNumber;
+
+ LogsWithPrepTracker logs_with_prep_tracker_;
+
+ // Callback for compaction to check if a key is visible to a snapshot.
+ // REQUIRES: mutex held
+ std::unique_ptr<SnapshotChecker> snapshot_checker_;
+
+ // Callback for when the cached_recoverable_state_ is written to memtable
+ // Only to be set during initialization
+ std::unique_ptr<PreReleaseCallback> recoverable_state_pre_release_callback_;
+
+#ifndef ROCKSDB_LITE
+ // Scheduler to run DumpStats(), PersistStats(), and FlushInfoLog().
+ // Currently, internally it has a global timer instance for running the tasks.
+ PeriodicTaskScheduler periodic_task_scheduler_;
+
+ // It contains the implementations for each periodic task.
+ std::map<PeriodicTaskType, const PeriodicTaskFunc> periodic_task_functions_;
+#endif
+
+ // When set, we use a separate queue for writes that don't write to memtable.
+ // In 2PC these are the writes at Prepare phase.
+ const bool two_write_queues_;
+ const bool manual_wal_flush_;
+
+ // If true, LastSequence also indicates the last published sequence visible
+ // to the readers. Otherwise LastPublishedSequence should be used.
+ const bool last_seq_same_as_publish_seq_;
+ // It indicates that a customized gc algorithm must be used for
+ // flush/compaction and if it is not provided via SnapshotChecker, we should
+ // disable gc to be safe.
+ const bool use_custom_gc_;
+ // Flag to indicate that the DB instance shutdown has been initiated. This is
+ // different from the shutting_down_ atomic in that it is set at the beginning
+ // of shutdown sequence, specifically in order to prevent any background
+ // error recovery from going on in parallel. The latter, shutting_down_,
+ // is set a little later during the shutdown after scheduling memtable
+ // flushes
+ std::atomic<bool> shutdown_initiated_;
+ // Flag to indicate whether sst_file_manager object was allocated in
+ // DB::Open() or passed to us
+ bool own_sfm_;
+
+ // Flag to check whether Close() has been called on this DB
+ bool closed_;
+ // save the closing status, for re-calling the close()
+ Status closing_status_;
+ // mutex for DB::Close()
+ InstrumentedMutex closing_mutex_;
+
+ // Conditional variable to coordinate installation of atomic flush results.
+ // With atomic flush, each bg thread installs the result of flushing multiple
+ // column families, and different threads can flush different column
+ // families. It's difficult to rely on one thread to perform batch
+ // installation for all threads. This is different from the non-atomic flush
+ // case.
+ // atomic_flush_install_cv_ makes sure that threads install atomic flush
+ // results sequentially. Flush results of memtables with lower IDs get
+ // installed to MANIFEST first.
+ InstrumentedCondVar atomic_flush_install_cv_;
+
+ bool wal_in_db_path_;
+ std::atomic<uint64_t> max_total_wal_size_;
+
+ BlobFileCompletionCallback blob_callback_;
+
+ // Pointer to WriteBufferManager stalling interface.
+ std::unique_ptr<StallInterface> wbm_stall_;
+
+ // seqno_time_mapping_ stores the sequence number to time mapping; it is not
+ // thread safe, so both reads and writes need the db mutex held.
+ SeqnoToTimeMapping seqno_time_mapping_;
+};
+
+class GetWithTimestampReadCallback : public ReadCallback {
+ public:
+ explicit GetWithTimestampReadCallback(SequenceNumber seq)
+ : ReadCallback(seq) {}
+ bool IsVisibleFullCheck(SequenceNumber seq) override {
+ return seq <= max_visible_seq_;
+ }
+};
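+// Usage sketch (snap_seq is a hypothetical sequence number): visibility of a
+// read can be bounded by constructing the callback and passing its address
+// wherever a ReadCallback* is accepted:
+//   GetWithTimestampReadCallback read_cb(snap_seq);
+//   // ... pass &read_cb to the internal lookup path as the ReadCallback*.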
+
+extern Options SanitizeOptions(const std::string& db, const Options& src,
+ bool read_only = false,
+ Status* logger_creation_s = nullptr);
+
+extern DBOptions SanitizeOptions(const std::string& db, const DBOptions& src,
+ bool read_only = false,
+ Status* logger_creation_s = nullptr);
+
+extern CompressionType GetCompressionFlush(
+ const ImmutableCFOptions& ioptions,
+ const MutableCFOptions& mutable_cf_options);
+
+// Return the earliest log file to keep after the memtable flush is
+// finalized.
+// `cfd_to_flush` is the column family whose memtable (specified in
+// `memtables_to_flush`) will be flushed and thus will not depend on any WAL
+// file.
+// The function is only applicable to 2pc mode.
+extern uint64_t PrecomputeMinLogNumberToKeep2PC(
+ VersionSet* vset, const ColumnFamilyData& cfd_to_flush,
+ const autovector<VersionEdit*>& edit_list,
+ const autovector<MemTable*>& memtables_to_flush,
+ LogsWithPrepTracker* prep_tracker);
+// For atomic flush.
+extern uint64_t PrecomputeMinLogNumberToKeep2PC(
+ VersionSet* vset, const autovector<ColumnFamilyData*>& cfds_to_flush,
+ const autovector<autovector<VersionEdit*>>& edit_lists,
+ const autovector<const autovector<MemTable*>*>& memtables_to_flush,
+ LogsWithPrepTracker* prep_tracker);
+
+// In non-2PC mode, WALs with log number < the returned number can be
+// deleted after the cfd_to_flush column family is flushed successfully.
+extern uint64_t PrecomputeMinLogNumberToKeepNon2PC(
+ VersionSet* vset, const ColumnFamilyData& cfd_to_flush,
+ const autovector<VersionEdit*>& edit_list);
+// For atomic flush.
+extern uint64_t PrecomputeMinLogNumberToKeepNon2PC(
+ VersionSet* vset, const autovector<ColumnFamilyData*>& cfds_to_flush,
+ const autovector<autovector<VersionEdit*>>& edit_lists);
+
+// `cfd_to_flush` is the column family whose memtable will be flushed and thus
+// will not depend on any WAL file. nullptr means no memtable is being flushed.
+// The function is only applicable to 2pc mode.
+extern uint64_t FindMinPrepLogReferencedByMemTable(
+ VersionSet* vset, const autovector<MemTable*>& memtables_to_flush);
+// For atomic flush.
+extern uint64_t FindMinPrepLogReferencedByMemTable(
+ VersionSet* vset,
+ const autovector<const autovector<MemTable*>*>& memtables_to_flush);
+
+// Fix user-supplied options to be reasonable
+template <class T, class V>
+static void ClipToRange(T* ptr, V minvalue, V maxvalue) {
+ if (static_cast<V>(*ptr) > maxvalue) *ptr = maxvalue;
+ if (static_cast<V>(*ptr) < minvalue) *ptr = minvalue;
+}
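+// For example (bounds are hypothetical), a sanitizer could clamp a
+// user-supplied option into a supported range:
+//   int max_open_files = user_options.max_open_files;
+//   ClipToRange(&max_open_files, 20, 1000000);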
+
+inline Status DBImpl::FailIfCfHasTs(
+ const ColumnFamilyHandle* column_family) const {
+ column_family = column_family ? column_family : DefaultColumnFamily();
+ assert(column_family);
+ const Comparator* const ucmp = column_family->GetComparator();
+ assert(ucmp);
+ if (ucmp->timestamp_size() > 0) {
+ std::ostringstream oss;
+ oss << "cannot call this method on column family "
+ << column_family->GetName() << " that enables timestamp";
+ return Status::InvalidArgument(oss.str());
+ }
+ return Status::OK();
+}
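+// Typical call pattern (sketch): DB APIs that do not take a timestamp
+// validate the column family up front and bail out early:
+//   Status s = FailIfCfHasTs(column_family);
+//   if (!s.ok()) {
+//     return s;
+//   }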
+
+inline Status DBImpl::FailIfTsMismatchCf(ColumnFamilyHandle* column_family,
+ const Slice& ts,
+ bool ts_for_read) const {
+ if (!column_family) {
+ return Status::InvalidArgument("column family handle cannot be null");
+ }
+ assert(column_family);
+ const Comparator* const ucmp = column_family->GetComparator();
+ assert(ucmp);
+ if (0 == ucmp->timestamp_size()) {
+ std::stringstream oss;
+ oss << "cannot call this method on column family "
+ << column_family->GetName() << " that does not enable timestamp";
+ return Status::InvalidArgument(oss.str());
+ }
+ const size_t ts_sz = ts.size();
+ if (ts_sz != ucmp->timestamp_size()) {
+ std::stringstream oss;
+ oss << "Timestamp sizes mismatch: expect " << ucmp->timestamp_size() << ", "
+ << ts_sz << " given";
+ return Status::InvalidArgument(oss.str());
+ }
+ if (ts_for_read) {
+ auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+ auto cfd = cfh->cfd();
+ std::string current_ts_low = cfd->GetFullHistoryTsLow();
+ if (!current_ts_low.empty() &&
+ ucmp->CompareTimestamp(ts, current_ts_low) < 0) {
+ std::stringstream oss;
+ oss << "Read timestamp: " << ts.ToString(true)
+ << " is smaller than full_history_ts_low: "
+ << Slice(current_ts_low).ToString(true) << std::endl;
+ return Status::InvalidArgument(oss.str());
+ }
+ }
+ return Status::OK();
+}
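+// Read-path counterpart (sketch; read_options is a hypothetical ReadOptions
+// with a non-null timestamp): APIs that accept a user timestamp validate it
+// against the column family comparator first:
+//   Status s = FailIfTsMismatchCf(column_family, *read_options.timestamp,
+//                                 /*ts_for_read=*/true);
+//   if (!s.ok()) {
+//     return s;
+//   }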
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/db_impl/db_impl_compaction_flush.cc b/src/rocksdb/db/db_impl/db_impl_compaction_flush.cc
new file mode 100644
index 000000000..a605fac87
--- /dev/null
+++ b/src/rocksdb/db/db_impl/db_impl_compaction_flush.cc
@@ -0,0 +1,3857 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#include <cinttypes>
+#include <deque>
+
+#include "db/builder.h"
+#include "db/db_impl/db_impl.h"
+#include "db/error_handler.h"
+#include "db/event_helpers.h"
+#include "file/sst_file_manager_impl.h"
+#include "logging/logging.h"
+#include "monitoring/iostats_context_imp.h"
+#include "monitoring/perf_context_imp.h"
+#include "monitoring/thread_status_updater.h"
+#include "monitoring/thread_status_util.h"
+#include "test_util/sync_point.h"
+#include "util/cast_util.h"
+#include "util/concurrent_task_limiter_impl.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+bool DBImpl::EnoughRoomForCompaction(
+ ColumnFamilyData* cfd, const std::vector<CompactionInputFiles>& inputs,
+ bool* sfm_reserved_compact_space, LogBuffer* log_buffer) {
+ // Check if we have enough room to do the compaction
+ bool enough_room = true;
+#ifndef ROCKSDB_LITE
+ auto sfm = static_cast<SstFileManagerImpl*>(
+ immutable_db_options_.sst_file_manager.get());
+ if (sfm) {
+ // Pass the current bg_error_ to SFM so it can decide what checks to
+ // perform. If this DB instance hasn't seen any error yet, the SFM can be
+ // optimistic and not do disk space checks
+ Status bg_error = error_handler_.GetBGError();
+ enough_room = sfm->EnoughRoomForCompaction(cfd, inputs, bg_error);
+ bg_error.PermitUncheckedError(); // bg_error is just a copy of the Status
+ // from the error_handler_
+ if (enough_room) {
+ *sfm_reserved_compact_space = true;
+ }
+ }
+#else
+ (void)cfd;
+ (void)inputs;
+ (void)sfm_reserved_compact_space;
+#endif // ROCKSDB_LITE
+ if (!enough_room) {
+ // Just in case tests want to change the value of enough_room
+ TEST_SYNC_POINT_CALLBACK(
+ "DBImpl::BackgroundCompaction():CancelledCompaction", &enough_room);
+ ROCKS_LOG_BUFFER(log_buffer,
+ "Cancelled compaction because not enough room");
+ RecordTick(stats_, COMPACTION_CANCELLED, 1);
+ }
+ return enough_room;
+}
+
+bool DBImpl::RequestCompactionToken(ColumnFamilyData* cfd, bool force,
+ std::unique_ptr<TaskLimiterToken>* token,
+ LogBuffer* log_buffer) {
+ assert(*token == nullptr);
+ auto limiter = static_cast<ConcurrentTaskLimiterImpl*>(
+ cfd->ioptions()->compaction_thread_limiter.get());
+ if (limiter == nullptr) {
+ return true;
+ }
+ *token = limiter->GetToken(force);
+ if (*token != nullptr) {
+ ROCKS_LOG_BUFFER(log_buffer,
+ "Thread limiter [%s] increase [%s] compaction task, "
+ "force: %s, tasks after: %d",
+ limiter->GetName().c_str(), cfd->GetName().c_str(),
+ force ? "true" : "false", limiter->GetOutstandingTask());
+ return true;
+ }
+ return false;
+}
+
+IOStatus DBImpl::SyncClosedLogs(JobContext* job_context,
+ VersionEdit* synced_wals) {
+ TEST_SYNC_POINT("DBImpl::SyncClosedLogs:Start");
+ InstrumentedMutexLock l(&log_write_mutex_);
+ autovector<log::Writer*, 1> logs_to_sync;
+ uint64_t current_log_number = logfile_number_;
+ while (logs_.front().number < current_log_number &&
+ logs_.front().IsSyncing()) {
+ log_sync_cv_.Wait();
+ }
+ for (auto it = logs_.begin();
+ it != logs_.end() && it->number < current_log_number; ++it) {
+ auto& log = *it;
+ log.PrepareForSync();
+ logs_to_sync.push_back(log.writer);
+ }
+
+ IOStatus io_s;
+ if (!logs_to_sync.empty()) {
+ log_write_mutex_.Unlock();
+
+ assert(job_context);
+
+ for (log::Writer* log : logs_to_sync) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "[JOB %d] Syncing log #%" PRIu64, job_context->job_id,
+ log->get_log_number());
+ if (error_handler_.IsRecoveryInProgress()) {
+ log->file()->reset_seen_error();
+ }
+ io_s = log->file()->Sync(immutable_db_options_.use_fsync);
+ if (!io_s.ok()) {
+ break;
+ }
+
+ if (immutable_db_options_.recycle_log_file_num > 0) {
+ if (error_handler_.IsRecoveryInProgress()) {
+ log->file()->reset_seen_error();
+ }
+ io_s = log->Close();
+ if (!io_s.ok()) {
+ break;
+ }
+ }
+ }
+ if (io_s.ok()) {
+ io_s = directories_.GetWalDir()->FsyncWithDirOptions(
+ IOOptions(), nullptr,
+ DirFsyncOptions(DirFsyncOptions::FsyncReason::kNewFileSynced));
+ }
+
+ TEST_SYNC_POINT_CALLBACK("DBImpl::SyncClosedLogs:BeforeReLock",
+ /*arg=*/nullptr);
+ log_write_mutex_.Lock();
+
+ // "number <= current_log_number - 1" is equivalent to
+ // "number < current_log_number".
+ if (io_s.ok()) {
+ MarkLogsSynced(current_log_number - 1, true, synced_wals);
+ } else {
+ MarkLogsNotSynced(current_log_number - 1);
+ }
+ if (!io_s.ok()) {
+ TEST_SYNC_POINT("DBImpl::SyncClosedLogs:Failed");
+ return io_s;
+ }
+ }
+ TEST_SYNC_POINT("DBImpl::SyncClosedLogs:end");
+ return io_s;
+}
+
+Status DBImpl::FlushMemTableToOutputFile(
+ ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options,
+ bool* made_progress, JobContext* job_context,
+ SuperVersionContext* superversion_context,
+ std::vector<SequenceNumber>& snapshot_seqs,
+ SequenceNumber earliest_write_conflict_snapshot,
+ SnapshotChecker* snapshot_checker, LogBuffer* log_buffer,
+ Env::Priority thread_pri) {
+ mutex_.AssertHeld();
+ assert(cfd);
+ assert(cfd->imm());
+ assert(cfd->imm()->NumNotFlushed() != 0);
+ assert(cfd->imm()->IsFlushPending());
+ assert(versions_);
+ assert(versions_->GetColumnFamilySet());
+ // If there is more than one column family, we need to make sure that
+ // all the log files except the most recent one are synced. Otherwise if
+ // the host crashes after flushing and before WAL is persistent, the
+ // flushed SST may contain data from write batches whose updates to
+ // other (unflushed) column families are missing.
+ const bool needs_to_sync_closed_wals =
+ logfile_number_ > 0 &&
+ versions_->GetColumnFamilySet()->NumberOfColumnFamilies() > 1;
+
+ // If needs_to_sync_closed_wals is true, we need to record the current
+ // maximum memtable ID of this column family so that a later PickMemtables()
+ // call will not pick memtables whose IDs are higher. This is due to the fact
+ // that SyncClosedLogs() may release the db mutex, and memtable switch can
+ // happen for this column family in the meantime. The newly created memtables
+ // have their data backed by unsynced WALs, thus they cannot be included in
+ // this flush job.
+ // Another reason why we must record the current maximum memtable ID of this
+ // column family: SyncClosedLogs() may release db mutex, thus it's possible
+ // for application to continue to insert into memtables increasing db's
+ // sequence number. The application may take a snapshot, but this snapshot is
+ // not included in `snapshot_seqs` which will be passed to flush job because
+ // `snapshot_seqs` has already been computed before this function starts.
+ // Recording the max memtable ID ensures that the flush job does not flush
+ // a memtable without knowing such snapshot(s).
+ uint64_t max_memtable_id = needs_to_sync_closed_wals
+ ? cfd->imm()->GetLatestMemTableID()
+ : std::numeric_limits<uint64_t>::max();
+
+ // If needs_to_sync_closed_wals is false, then the flush job will pick ALL
+ // existing memtables of the column family when PickMemTable() is called
+ // later. Although we won't call SyncClosedLogs() in this case, we may still
+ // call the callbacks of the listeners, i.e. NotifyOnFlushBegin() which also
+ // releases and re-acquires the db mutex. In the meantime, the application
+ // can still insert into the memtables and increase the db's sequence number.
+ // The application can take a snapshot, hoping that the latest visible state
+ // to this snapshot is preserved. This is hard to guarantee since the db
+ // mutex is not held. This newly-created snapshot is not in `snapshot_seqs`
+ // and the flush job is unaware of its presence. Consequently, the flush job
+ // may drop certain keys when generating the L0, causing incorrect data to be
+ // returned for snapshot read using this snapshot.
+ // To address this, we make sure NotifyOnFlushBegin() executes after memtable
+ // picking so that no new snapshot can be taken between the two functions.
+
+ FlushJob flush_job(
+ dbname_, cfd, immutable_db_options_, mutable_cf_options, max_memtable_id,
+ file_options_for_compaction_, versions_.get(), &mutex_, &shutting_down_,
+ snapshot_seqs, earliest_write_conflict_snapshot, snapshot_checker,
+ job_context, log_buffer, directories_.GetDbDir(), GetDataDir(cfd, 0U),
+ GetCompressionFlush(*cfd->ioptions(), mutable_cf_options), stats_,
+ &event_logger_, mutable_cf_options.report_bg_io_stats,
+ true /* sync_output_directory */, true /* write_manifest */, thread_pri,
+ io_tracer_, seqno_time_mapping_, db_id_, db_session_id_,
+ cfd->GetFullHistoryTsLow(), &blob_callback_);
+ FileMetaData file_meta;
+
+ Status s;
+ bool need_cancel = false;
+ IOStatus log_io_s = IOStatus::OK();
+ if (needs_to_sync_closed_wals) {
+ // SyncClosedLogs() may unlock and re-lock the log_write_mutex multiple
+ // times.
+ VersionEdit synced_wals;
+ mutex_.Unlock();
+ log_io_s = SyncClosedLogs(job_context, &synced_wals);
+ mutex_.Lock();
+ if (log_io_s.ok() && synced_wals.IsWalAddition()) {
+ log_io_s = status_to_io_status(ApplyWALToManifest(&synced_wals));
+ TEST_SYNC_POINT_CALLBACK("DBImpl::FlushMemTableToOutputFile:CommitWal:1",
+ nullptr);
+ }
+
+ if (!log_io_s.ok() && !log_io_s.IsShutdownInProgress() &&
+ !log_io_s.IsColumnFamilyDropped()) {
+ error_handler_.SetBGError(log_io_s, BackgroundErrorReason::kFlush);
+ }
+ } else {
+ TEST_SYNC_POINT("DBImpl::SyncClosedLogs:Skip");
+ }
+ s = log_io_s;
+
+ // If the log sync failed, we do not need to pick the memtable. Otherwise,
+ // num_flush_not_started_ needs to be rolled back.
+ TEST_SYNC_POINT("DBImpl::FlushMemTableToOutputFile:BeforePickMemtables");
+ if (s.ok()) {
+ flush_job.PickMemTable();
+ need_cancel = true;
+ }
+ TEST_SYNC_POINT_CALLBACK(
+ "DBImpl::FlushMemTableToOutputFile:AfterPickMemtables", &flush_job);
+
+#ifndef ROCKSDB_LITE
+ // may temporarily unlock and lock the mutex.
+ NotifyOnFlushBegin(cfd, &file_meta, mutable_cf_options, job_context->job_id);
+#endif // ROCKSDB_LITE
+
+ bool switched_to_mempurge = false;
+ // Within flush_job.Run, rocksdb may call event listener to notify
+ // file creation and deletion.
+ //
+ // Note that flush_job.Run will unlock and lock the db_mutex,
+ // and EventListener callback will be called when the db_mutex
+ // is unlocked by the current thread.
+ if (s.ok()) {
+ s = flush_job.Run(&logs_with_prep_tracker_, &file_meta,
+ &switched_to_mempurge);
+ need_cancel = false;
+ }
+
+ if (!s.ok() && need_cancel) {
+ flush_job.Cancel();
+ }
+
+ if (s.ok()) {
+ InstallSuperVersionAndScheduleWork(cfd, superversion_context,
+ mutable_cf_options);
+ if (made_progress) {
+ *made_progress = true;
+ }
+
+ const std::string& column_family_name = cfd->GetName();
+
+ Version* const current = cfd->current();
+ assert(current);
+
+ const VersionStorageInfo* const storage_info = current->storage_info();
+ assert(storage_info);
+
+ VersionStorageInfo::LevelSummaryStorage tmp;
+ ROCKS_LOG_BUFFER(log_buffer, "[%s] Level summary: %s\n",
+ column_family_name.c_str(),
+ storage_info->LevelSummary(&tmp));
+
+ const auto& blob_files = storage_info->GetBlobFiles();
+ if (!blob_files.empty()) {
+ assert(blob_files.front());
+ assert(blob_files.back());
+
+ ROCKS_LOG_BUFFER(
+ log_buffer,
+ "[%s] Blob file summary: head=%" PRIu64 ", tail=%" PRIu64 "\n",
+ column_family_name.c_str(), blob_files.front()->GetBlobFileNumber(),
+ blob_files.back()->GetBlobFileNumber());
+ }
+ }
+
+ if (!s.ok() && !s.IsShutdownInProgress() && !s.IsColumnFamilyDropped()) {
+ if (log_io_s.ok()) {
+ // Error while writing to MANIFEST.
+ // In fact, versions_->io_status() can also be the result of renaming
+ // CURRENT file. With current code, it's just difficult to tell. So just
+ // be pessimistic and try writing to a new MANIFEST.
+ // TODO: distinguish between MANIFEST write and CURRENT renaming
+ if (!versions_->io_status().ok()) {
+ // If WAL sync is successful (either WAL size is 0 or there is no IO
+ // error), all the MANIFEST write errors will be mapped to soft errors.
+ // TODO: kManifestWriteNoWAL and kFlushNoWAL are misleading. Refactor is
+ // needed.
+ error_handler_.SetBGError(s,
+ BackgroundErrorReason::kManifestWriteNoWAL);
+ } else {
+ // If WAL sync is successful (either WAL size is 0 or there is no IO
+ // error), all the other SST file write errors will be set as
+ // kFlushNoWAL.
+ error_handler_.SetBGError(s, BackgroundErrorReason::kFlushNoWAL);
+ }
+ } else {
+ assert(s == log_io_s);
+ Status new_bg_error = s;
+ error_handler_.SetBGError(new_bg_error, BackgroundErrorReason::kFlush);
+ }
+ }
+ // If flush ran smoothly and no mempurge happened
+ // install new SST file path.
+ if (s.ok() && (!switched_to_mempurge)) {
+#ifndef ROCKSDB_LITE
+ // may temporarily unlock and lock the mutex.
+ NotifyOnFlushCompleted(cfd, mutable_cf_options,
+ flush_job.GetCommittedFlushJobsInfo());
+ auto sfm = static_cast<SstFileManagerImpl*>(
+ immutable_db_options_.sst_file_manager.get());
+ if (sfm) {
+ // Notify sst_file_manager that a new file was added
+ std::string file_path = MakeTableFileName(
+ cfd->ioptions()->cf_paths[0].path, file_meta.fd.GetNumber());
+ // TODO (PR7798). We should only add the file to the FileManager if it
+ // exists. Otherwise, some tests may fail. Ignore the error in the
+ // interim.
+ sfm->OnAddFile(file_path).PermitUncheckedError();
+ if (sfm->IsMaxAllowedSpaceReached()) {
+ Status new_bg_error =
+ Status::SpaceLimit("Max allowed space was reached");
+ TEST_SYNC_POINT_CALLBACK(
+ "DBImpl::FlushMemTableToOutputFile:MaxAllowedSpaceReached",
+ &new_bg_error);
+ error_handler_.SetBGError(new_bg_error, BackgroundErrorReason::kFlush);
+ }
+ }
+#endif // ROCKSDB_LITE
+ }
+ TEST_SYNC_POINT("DBImpl::FlushMemTableToOutputFile:Finish");
+ return s;
+}
+
+Status DBImpl::FlushMemTablesToOutputFiles(
+ const autovector<BGFlushArg>& bg_flush_args, bool* made_progress,
+ JobContext* job_context, LogBuffer* log_buffer, Env::Priority thread_pri) {
+ if (immutable_db_options_.atomic_flush) {
+ return AtomicFlushMemTablesToOutputFiles(
+ bg_flush_args, made_progress, job_context, log_buffer, thread_pri);
+ }
+ assert(bg_flush_args.size() == 1);
+ std::vector<SequenceNumber> snapshot_seqs;
+ SequenceNumber earliest_write_conflict_snapshot;
+ SnapshotChecker* snapshot_checker;
+ GetSnapshotContext(job_context, &snapshot_seqs,
+ &earliest_write_conflict_snapshot, &snapshot_checker);
+ const auto& bg_flush_arg = bg_flush_args[0];
+ ColumnFamilyData* cfd = bg_flush_arg.cfd_;
+ // intentional infrequent copy for each flush
+ MutableCFOptions mutable_cf_options_copy = *cfd->GetLatestMutableCFOptions();
+ SuperVersionContext* superversion_context =
+ bg_flush_arg.superversion_context_;
+ Status s = FlushMemTableToOutputFile(
+ cfd, mutable_cf_options_copy, made_progress, job_context,
+ superversion_context, snapshot_seqs, earliest_write_conflict_snapshot,
+ snapshot_checker, log_buffer, thread_pri);
+ return s;
+}
+
+/*
+ * Atomically flushes multiple column families.
+ *
+ * For each column family, all memtables with ID smaller than or equal to the
+ * ID specified in bg_flush_args will be flushed. Only after all column
+ * families finish flush will this function commit to MANIFEST. If any of the
+ * column families are not flushed successfully, this function does not have
+ * any side-effect on the state of the database.
+ */
+Status DBImpl::AtomicFlushMemTablesToOutputFiles(
+ const autovector<BGFlushArg>& bg_flush_args, bool* made_progress,
+ JobContext* job_context, LogBuffer* log_buffer, Env::Priority thread_pri) {
+ mutex_.AssertHeld();
+
+ autovector<ColumnFamilyData*> cfds;
+ for (const auto& arg : bg_flush_args) {
+ cfds.emplace_back(arg.cfd_);
+ }
+
+#ifndef NDEBUG
+ for (const auto cfd : cfds) {
+ assert(cfd->imm()->NumNotFlushed() != 0);
+ assert(cfd->imm()->IsFlushPending());
+ assert(cfd->GetFlushReason() == cfds[0]->GetFlushReason());
+ }
+#endif /* !NDEBUG */
+
+ std::vector<SequenceNumber> snapshot_seqs;
+ SequenceNumber earliest_write_conflict_snapshot;
+ SnapshotChecker* snapshot_checker;
+ GetSnapshotContext(job_context, &snapshot_seqs,
+ &earliest_write_conflict_snapshot, &snapshot_checker);
+
+ autovector<FSDirectory*> distinct_output_dirs;
+ autovector<std::string> distinct_output_dir_paths;
+ std::vector<std::unique_ptr<FlushJob>> jobs;
+ std::vector<MutableCFOptions> all_mutable_cf_options;
+ int num_cfs = static_cast<int>(cfds.size());
+ all_mutable_cf_options.reserve(num_cfs);
+ for (int i = 0; i < num_cfs; ++i) {
+ auto cfd = cfds[i];
+ FSDirectory* data_dir = GetDataDir(cfd, 0U);
+ const std::string& curr_path = cfd->ioptions()->cf_paths[0].path;
+
+ // Add to distinct output directories if eligible. Use linear search. Since
+ // the number of elements in the vector is not large, performance should be
+ // tolerable.
+ bool found = false;
+ for (const auto& path : distinct_output_dir_paths) {
+ if (path == curr_path) {
+ found = true;
+ break;
+ }
+ }
+ if (!found) {
+ distinct_output_dir_paths.emplace_back(curr_path);
+ distinct_output_dirs.emplace_back(data_dir);
+ }
+
+ all_mutable_cf_options.emplace_back(*cfd->GetLatestMutableCFOptions());
+ const MutableCFOptions& mutable_cf_options = all_mutable_cf_options.back();
+ uint64_t max_memtable_id = bg_flush_args[i].max_memtable_id_;
+ jobs.emplace_back(new FlushJob(
+ dbname_, cfd, immutable_db_options_, mutable_cf_options,
+ max_memtable_id, file_options_for_compaction_, versions_.get(), &mutex_,
+ &shutting_down_, snapshot_seqs, earliest_write_conflict_snapshot,
+ snapshot_checker, job_context, log_buffer, directories_.GetDbDir(),
+ data_dir, GetCompressionFlush(*cfd->ioptions(), mutable_cf_options),
+ stats_, &event_logger_, mutable_cf_options.report_bg_io_stats,
+ false /* sync_output_directory */, false /* write_manifest */,
+ thread_pri, io_tracer_, seqno_time_mapping_, db_id_, db_session_id_,
+ cfd->GetFullHistoryTsLow(), &blob_callback_));
+ }
+
+ std::vector<FileMetaData> file_meta(num_cfs);
+ // Use of deque<bool> because vector<bool>
+ // is specialized and doesn't allow &v[i].
+ std::deque<bool> switched_to_mempurge(num_cfs, false);
+ Status s;
+ IOStatus log_io_s = IOStatus::OK();
+ assert(num_cfs == static_cast<int>(jobs.size()));
+
+#ifndef ROCKSDB_LITE
+ for (int i = 0; i != num_cfs; ++i) {
+ const MutableCFOptions& mutable_cf_options = all_mutable_cf_options.at(i);
+ // may temporarily unlock and lock the mutex.
+ NotifyOnFlushBegin(cfds[i], &file_meta[i], mutable_cf_options,
+ job_context->job_id);
+ }
+#endif /* !ROCKSDB_LITE */
+
+ if (logfile_number_ > 0) {
+ // TODO (yanqin) investigate whether we should sync the closed logs for
+ // single column family case.
+ VersionEdit synced_wals;
+ mutex_.Unlock();
+ log_io_s = SyncClosedLogs(job_context, &synced_wals);
+ mutex_.Lock();
+ if (log_io_s.ok() && synced_wals.IsWalAddition()) {
+ log_io_s = status_to_io_status(ApplyWALToManifest(&synced_wals));
+ }
+
+ if (!log_io_s.ok() && !log_io_s.IsShutdownInProgress() &&
+ !log_io_s.IsColumnFamilyDropped()) {
+ if (total_log_size_ > 0) {
+ error_handler_.SetBGError(log_io_s, BackgroundErrorReason::kFlush);
+ } else {
+ // If the WAL is empty, we use different error reason
+ error_handler_.SetBGError(log_io_s, BackgroundErrorReason::kFlushNoWAL);
+ }
+ }
+ }
+ s = log_io_s;
+
+ // exec_status stores the execution status of flush_jobs as
+ // <bool /* executed */, Status /* status code */>
+ autovector<std::pair<bool, Status>> exec_status;
+ std::vector<bool> pick_status;
+ for (int i = 0; i != num_cfs; ++i) {
+ // Initially all jobs are not executed, with status OK.
+ exec_status.emplace_back(false, Status::OK());
+ pick_status.push_back(false);
+ }
+
+ if (s.ok()) {
+ for (int i = 0; i != num_cfs; ++i) {
+ jobs[i]->PickMemTable();
+ pick_status[i] = true;
+ }
+ }
+
+ if (s.ok()) {
+ assert(switched_to_mempurge.size() ==
+ static_cast<long unsigned int>(num_cfs));
+ // TODO (yanqin): parallelize jobs with threads.
+ for (int i = 1; i != num_cfs; ++i) {
+ exec_status[i].second =
+ jobs[i]->Run(&logs_with_prep_tracker_, &file_meta[i],
+ &(switched_to_mempurge.at(i)));
+ exec_status[i].first = true;
+ }
+ if (num_cfs > 1) {
+ TEST_SYNC_POINT(
+ "DBImpl::AtomicFlushMemTablesToOutputFiles:SomeFlushJobsComplete:1");
+ TEST_SYNC_POINT(
+ "DBImpl::AtomicFlushMemTablesToOutputFiles:SomeFlushJobsComplete:2");
+ }
+ assert(exec_status.size() > 0);
+ assert(!file_meta.empty());
+ exec_status[0].second = jobs[0]->Run(
+ &logs_with_prep_tracker_, file_meta.data() /* &file_meta[0] */,
+ switched_to_mempurge.empty() ? nullptr : &(switched_to_mempurge.at(0)));
+ exec_status[0].first = true;
+
+ Status error_status;
+ for (const auto& e : exec_status) {
+ if (!e.second.ok()) {
+ s = e.second;
+ if (!e.second.IsShutdownInProgress() &&
+ !e.second.IsColumnFamilyDropped()) {
+ // If a flush job did not return OK, and the CF is not dropped, and
+ // the DB is not shutting down, then we have to return this result to
+ // the caller later.
+ error_status = e.second;
+ }
+ }
+ }
+
+ s = error_status.ok() ? s : error_status;
+ }
+
+ if (s.IsColumnFamilyDropped()) {
+ s = Status::OK();
+ }
+
+ if (s.ok() || s.IsShutdownInProgress()) {
+ // Sync on all distinct output directories.
+ for (auto dir : distinct_output_dirs) {
+ if (dir != nullptr) {
+ Status error_status = dir->FsyncWithDirOptions(
+ IOOptions(), nullptr,
+ DirFsyncOptions(DirFsyncOptions::FsyncReason::kNewFileSynced));
+ if (!error_status.ok()) {
+ s = error_status;
+ break;
+ }
+ }
+ }
+ } else {
+ // Need to undo atomic flush if something went wrong, i.e. s is not OK and
+ // it is not because of CF drop.
+ // Have to cancel the flush jobs that have NOT executed because we need to
+ // unref the versions.
+ for (int i = 0; i != num_cfs; ++i) {
+ if (pick_status[i] && !exec_status[i].first) {
+ jobs[i]->Cancel();
+ }
+ }
+ for (int i = 0; i != num_cfs; ++i) {
+ if (exec_status[i].second.ok() && exec_status[i].first) {
+ auto& mems = jobs[i]->GetMemTables();
+ cfds[i]->imm()->RollbackMemtableFlush(mems,
+ file_meta[i].fd.GetNumber());
+ }
+ }
+ }
+
+ if (s.ok()) {
+ const auto wait_to_install_func =
+ [&]() -> std::pair<Status, bool /*continue to wait*/> {
+ if (!versions_->io_status().ok()) {
+ // Something went wrong elsewhere, we cannot count on waiting for our
+ // turn to write/sync to MANIFEST or CURRENT. Just return.
+ return std::make_pair(versions_->io_status(), false);
+ } else if (shutting_down_.load(std::memory_order_acquire)) {
+ return std::make_pair(Status::ShutdownInProgress(), false);
+ }
+ bool ready = true;
+ for (size_t i = 0; i != cfds.size(); ++i) {
+ const auto& mems = jobs[i]->GetMemTables();
+ if (cfds[i]->IsDropped()) {
+ // If the column family is dropped, then do not wait.
+ continue;
+ } else if (!mems.empty() &&
+ cfds[i]->imm()->GetEarliestMemTableID() < mems[0]->GetID()) {
+ // If a flush job needs to install the flush result for mems and
+ // mems[0] is not the earliest memtable, it means another thread must
+ // be installing flush results for the same column family, then the
+ // current thread needs to wait.
+ ready = false;
+ break;
+ } else if (mems.empty() && cfds[i]->imm()->GetEarliestMemTableID() <=
+ bg_flush_args[i].max_memtable_id_) {
+ // If a flush job does not need to install flush results, then it has
+ // to wait until all memtables up to max_memtable_id_ (inclusive) are
+ // installed.
+ ready = false;
+ break;
+ }
+ }
+ return std::make_pair(Status::OK(), !ready);
+ };
+
+ bool resuming_from_bg_err =
+ error_handler_.IsDBStopped() ||
+ (cfds[0]->GetFlushReason() == FlushReason::kErrorRecovery ||
+ cfds[0]->GetFlushReason() == FlushReason::kErrorRecoveryRetryFlush);
+ while ((!resuming_from_bg_err || error_handler_.GetRecoveryError().ok())) {
+ std::pair<Status, bool> res = wait_to_install_func();
+
+ TEST_SYNC_POINT_CALLBACK(
+ "DBImpl::AtomicFlushMemTablesToOutputFiles:WaitToCommit", &res);
+
+ if (!res.first.ok()) {
+ s = res.first;
+ break;
+ } else if (!res.second) {
+ break;
+ }
+ atomic_flush_install_cv_.Wait();
+
+ resuming_from_bg_err =
+ error_handler_.IsDBStopped() ||
+ (cfds[0]->GetFlushReason() == FlushReason::kErrorRecovery ||
+ cfds[0]->GetFlushReason() == FlushReason::kErrorRecoveryRetryFlush);
+ }
+
+ if (!resuming_from_bg_err) {
+ // If not resuming from bg err, then we determine future action based on
+      // whether we hit a background error.
+ if (s.ok()) {
+ s = error_handler_.GetBGError();
+ }
+ } else if (s.ok()) {
+ // If resuming from bg err, we still rely on wait_to_install_func()'s
+ // result to determine future action. If wait_to_install_func() returns
+ // non-ok already, then we should not proceed to flush result
+ // installation.
+ s = error_handler_.GetRecoveryError();
+ }
+ }
+
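+  // Commit the flush results for all column families that were not dropped
+  // and still have memtables to install, writing them to the MANIFEST in a
+  // single atomic operation via InstallMemtableAtomicFlushResults().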
+ if (s.ok()) {
+ autovector<ColumnFamilyData*> tmp_cfds;
+ autovector<const autovector<MemTable*>*> mems_list;
+ autovector<const MutableCFOptions*> mutable_cf_options_list;
+ autovector<FileMetaData*> tmp_file_meta;
+ autovector<std::list<std::unique_ptr<FlushJobInfo>>*>
+ committed_flush_jobs_info;
+ for (int i = 0; i != num_cfs; ++i) {
+ const auto& mems = jobs[i]->GetMemTables();
+ if (!cfds[i]->IsDropped() && !mems.empty()) {
+ tmp_cfds.emplace_back(cfds[i]);
+ mems_list.emplace_back(&mems);
+ mutable_cf_options_list.emplace_back(&all_mutable_cf_options[i]);
+ tmp_file_meta.emplace_back(&file_meta[i]);
+#ifndef ROCKSDB_LITE
+ committed_flush_jobs_info.emplace_back(
+ jobs[i]->GetCommittedFlushJobsInfo());
+#endif //! ROCKSDB_LITE
+ }
+ }
+
+ s = InstallMemtableAtomicFlushResults(
+ nullptr /* imm_lists */, tmp_cfds, mutable_cf_options_list, mems_list,
+ versions_.get(), &logs_with_prep_tracker_, &mutex_, tmp_file_meta,
+ committed_flush_jobs_info, &job_context->memtables_to_free,
+ directories_.GetDbDir(), log_buffer);
+ }
+
+ if (s.ok()) {
+ assert(num_cfs ==
+ static_cast<int>(job_context->superversion_contexts.size()));
+ for (int i = 0; i != num_cfs; ++i) {
+ assert(cfds[i]);
+
+ if (cfds[i]->IsDropped()) {
+ continue;
+ }
+ InstallSuperVersionAndScheduleWork(cfds[i],
+ &job_context->superversion_contexts[i],
+ all_mutable_cf_options[i]);
+
+ const std::string& column_family_name = cfds[i]->GetName();
+
+ Version* const current = cfds[i]->current();
+ assert(current);
+
+ const VersionStorageInfo* const storage_info = current->storage_info();
+ assert(storage_info);
+
+ VersionStorageInfo::LevelSummaryStorage tmp;
+ ROCKS_LOG_BUFFER(log_buffer, "[%s] Level summary: %s\n",
+ column_family_name.c_str(),
+ storage_info->LevelSummary(&tmp));
+
+ const auto& blob_files = storage_info->GetBlobFiles();
+ if (!blob_files.empty()) {
+ assert(blob_files.front());
+ assert(blob_files.back());
+
+ ROCKS_LOG_BUFFER(
+ log_buffer,
+ "[%s] Blob file summary: head=%" PRIu64 ", tail=%" PRIu64 "\n",
+ column_family_name.c_str(), blob_files.front()->GetBlobFileNumber(),
+ blob_files.back()->GetBlobFileNumber());
+ }
+ }
+ if (made_progress) {
+ *made_progress = true;
+ }
+#ifndef ROCKSDB_LITE
+ auto sfm = static_cast<SstFileManagerImpl*>(
+ immutable_db_options_.sst_file_manager.get());
+ assert(all_mutable_cf_options.size() == static_cast<size_t>(num_cfs));
+ for (int i = 0; s.ok() && i != num_cfs; ++i) {
+      // If a mempurge happened instead of a flush, there is no
+      // NotifyOnFlushCompleted call (no SST file was created).
+ if (switched_to_mempurge[i]) {
+ continue;
+ }
+ if (cfds[i]->IsDropped()) {
+ continue;
+ }
+ NotifyOnFlushCompleted(cfds[i], all_mutable_cf_options[i],
+ jobs[i]->GetCommittedFlushJobsInfo());
+ if (sfm) {
+ std::string file_path = MakeTableFileName(
+ cfds[i]->ioptions()->cf_paths[0].path, file_meta[i].fd.GetNumber());
+ // TODO (PR7798). We should only add the file to the FileManager if it
+ // exists. Otherwise, some tests may fail. Ignore the error in the
+ // interim.
+ sfm->OnAddFile(file_path).PermitUncheckedError();
+ if (sfm->IsMaxAllowedSpaceReached() &&
+ error_handler_.GetBGError().ok()) {
+ Status new_bg_error =
+ Status::SpaceLimit("Max allowed space was reached");
+ error_handler_.SetBGError(new_bg_error,
+ BackgroundErrorReason::kFlush);
+ }
+ }
+ }
+#endif // ROCKSDB_LITE
+ }
+
+ // Need to undo atomic flush if something went wrong, i.e. s is not OK and
+ // it is not because of CF drop.
+ if (!s.ok() && !s.IsColumnFamilyDropped()) {
+ if (log_io_s.ok()) {
+ // Error while writing to MANIFEST.
+      // In fact, versions_->io_status() can also be the result of renaming
+      // the CURRENT file. With the current code, it's just difficult to tell.
+      // So just be pessimistic and try writing to a new MANIFEST.
+ // TODO: distinguish between MANIFEST write and CURRENT renaming
+ if (!versions_->io_status().ok()) {
+        // If WAL sync is successful (either the WAL size is 0 or there is no
+        // IO error), all MANIFEST write errors will be mapped to soft errors.
+ // TODO: kManifestWriteNoWAL and kFlushNoWAL are misleading. Refactor
+ // is needed.
+ error_handler_.SetBGError(s,
+ BackgroundErrorReason::kManifestWriteNoWAL);
+ } else {
+ // If WAL sync is successful (either WAL size is 0 or there is no IO
+ // error), all the other SST file write errors will be set as
+ // kFlushNoWAL.
+ error_handler_.SetBGError(s, BackgroundErrorReason::kFlushNoWAL);
+ }
+ } else {
+ assert(s == log_io_s);
+ Status new_bg_error = s;
+ error_handler_.SetBGError(new_bg_error, BackgroundErrorReason::kFlush);
+ }
+ }
+
+ return s;
+}
+
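+// Notify the EventListeners registered in DBOptions::listeners that a flush
+// for `cfd` is about to begin. The DB mutex must be held on entry; it is
+// released while the OnFlushBegin callbacks run and re-acquired afterwards.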
+void DBImpl::NotifyOnFlushBegin(ColumnFamilyData* cfd, FileMetaData* file_meta,
+ const MutableCFOptions& mutable_cf_options,
+ int job_id) {
+#ifndef ROCKSDB_LITE
+ if (immutable_db_options_.listeners.size() == 0U) {
+ return;
+ }
+ mutex_.AssertHeld();
+ if (shutting_down_.load(std::memory_order_acquire)) {
+ return;
+ }
+ bool triggered_writes_slowdown =
+ (cfd->current()->storage_info()->NumLevelFiles(0) >=
+ mutable_cf_options.level0_slowdown_writes_trigger);
+ bool triggered_writes_stop =
+ (cfd->current()->storage_info()->NumLevelFiles(0) >=
+ mutable_cf_options.level0_stop_writes_trigger);
+ // release lock while notifying events
+ mutex_.Unlock();
+ {
+ FlushJobInfo info{};
+ info.cf_id = cfd->GetID();
+ info.cf_name = cfd->GetName();
+ // TODO(yhchiang): make db_paths dynamic in case flush does not
+ // go to L0 in the future.
+ const uint64_t file_number = file_meta->fd.GetNumber();
+ info.file_path =
+ MakeTableFileName(cfd->ioptions()->cf_paths[0].path, file_number);
+ info.file_number = file_number;
+ info.thread_id = env_->GetThreadID();
+ info.job_id = job_id;
+ info.triggered_writes_slowdown = triggered_writes_slowdown;
+ info.triggered_writes_stop = triggered_writes_stop;
+ info.smallest_seqno = file_meta->fd.smallest_seqno;
+ info.largest_seqno = file_meta->fd.largest_seqno;
+ info.flush_reason = cfd->GetFlushReason();
+ for (auto listener : immutable_db_options_.listeners) {
+ listener->OnFlushBegin(this, info);
+ }
+ }
+ mutex_.Lock();
+// no need to signal bg_cv_ as it will be signaled at the end of the
+// flush process.
+#else
+ (void)cfd;
+ (void)file_meta;
+ (void)mutable_cf_options;
+ (void)job_id;
+#endif // ROCKSDB_LITE
+}
+
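+// Notify the registered EventListeners that the flushes described by
+// `flush_jobs_info` have completed. As with NotifyOnFlushBegin(), the DB
+// mutex is released while the OnFlushCompleted callbacks run.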
+void DBImpl::NotifyOnFlushCompleted(
+ ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options,
+ std::list<std::unique_ptr<FlushJobInfo>>* flush_jobs_info) {
+#ifndef ROCKSDB_LITE
+ assert(flush_jobs_info != nullptr);
+ if (immutable_db_options_.listeners.size() == 0U) {
+ return;
+ }
+ mutex_.AssertHeld();
+ if (shutting_down_.load(std::memory_order_acquire)) {
+ return;
+ }
+ bool triggered_writes_slowdown =
+ (cfd->current()->storage_info()->NumLevelFiles(0) >=
+ mutable_cf_options.level0_slowdown_writes_trigger);
+ bool triggered_writes_stop =
+ (cfd->current()->storage_info()->NumLevelFiles(0) >=
+ mutable_cf_options.level0_stop_writes_trigger);
+ // release lock while notifying events
+ mutex_.Unlock();
+ {
+ for (auto& info : *flush_jobs_info) {
+ info->triggered_writes_slowdown = triggered_writes_slowdown;
+ info->triggered_writes_stop = triggered_writes_stop;
+ for (auto listener : immutable_db_options_.listeners) {
+ listener->OnFlushCompleted(this, *info);
+ }
+ TEST_SYNC_POINT(
+ "DBImpl::NotifyOnFlushCompleted::PostAllOnFlushCompleted");
+ }
+ flush_jobs_info->clear();
+ }
+ mutex_.Lock();
+ // no need to signal bg_cv_ as it will be signaled at the end of the
+ // flush process.
+#else
+ (void)cfd;
+ (void)mutable_cf_options;
+ (void)flush_jobs_info;
+#endif // ROCKSDB_LITE
+}
+
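+// Public entry point for manual range compaction. When the column family's
+// comparator has no user-defined timestamp, this forwards directly to
+// CompactRangeInternal(); otherwise the begin/end user keys are expanded with
+// the maximum/minimum timestamps so that every timestamped version in the
+// range is covered.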
+Status DBImpl::CompactRange(const CompactRangeOptions& options,
+ ColumnFamilyHandle* column_family,
+ const Slice* begin_without_ts,
+ const Slice* end_without_ts) {
+ if (manual_compaction_paused_.load(std::memory_order_acquire) > 0) {
+ return Status::Incomplete(Status::SubCode::kManualCompactionPaused);
+ }
+
+ if (options.canceled && options.canceled->load(std::memory_order_acquire)) {
+ return Status::Incomplete(Status::SubCode::kManualCompactionPaused);
+ }
+
+ const Comparator* const ucmp = column_family->GetComparator();
+ assert(ucmp);
+ size_t ts_sz = ucmp->timestamp_size();
+ if (ts_sz == 0) {
+ return CompactRangeInternal(options, column_family, begin_without_ts,
+ end_without_ts, "" /*trim_ts*/);
+ }
+
+ std::string begin_str;
+ std::string end_str;
+
+  // CompactRange compacts all keys in [begin, end] inclusively. Append the
+  // maximum timestamp to include all `begin` keys, and the minimum timestamp
+  // to include all `end` keys.
+ if (begin_without_ts != nullptr) {
+ AppendKeyWithMaxTimestamp(&begin_str, *begin_without_ts, ts_sz);
+ }
+ if (end_without_ts != nullptr) {
+ AppendKeyWithMinTimestamp(&end_str, *end_without_ts, ts_sz);
+ }
+ Slice begin(begin_str);
+ Slice end(end_str);
+
+ Slice* begin_with_ts = begin_without_ts ? &begin : nullptr;
+ Slice* end_with_ts = end_without_ts ? &end : nullptr;
+
+ return CompactRangeInternal(options, column_family, begin_with_ts,
+ end_with_ts, "" /*trim_ts*/);
+}
+
+Status DBImpl::IncreaseFullHistoryTsLow(ColumnFamilyHandle* column_family,
+ std::string ts_low) {
+ ColumnFamilyData* cfd = nullptr;
+ if (column_family == nullptr) {
+ cfd = default_cf_handle_->cfd();
+ } else {
+ auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+ assert(cfh != nullptr);
+ cfd = cfh->cfd();
+ }
+ assert(cfd != nullptr && cfd->user_comparator() != nullptr);
+ if (cfd->user_comparator()->timestamp_size() == 0) {
+ return Status::InvalidArgument(
+ "Timestamp is not enabled in this column family");
+ }
+ if (cfd->user_comparator()->timestamp_size() != ts_low.size()) {
+ return Status::InvalidArgument("ts_low size mismatch");
+ }
+ return IncreaseFullHistoryTsLowImpl(cfd, ts_low);
+}
+
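+// Record the new full_history_ts_low for `cfd` in a VersionEdit and persist
+// it through VersionSet::LogAndApply(). Decreasing the value is rejected, and
+// a TryAgain status is returned if a concurrently applied value ends up
+// higher than the requested one.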
+Status DBImpl::IncreaseFullHistoryTsLowImpl(ColumnFamilyData* cfd,
+ std::string ts_low) {
+ VersionEdit edit;
+ edit.SetColumnFamily(cfd->GetID());
+ edit.SetFullHistoryTsLow(ts_low);
+ TEST_SYNC_POINT_CALLBACK("DBImpl::IncreaseFullHistoryTsLowImpl:BeforeEdit",
+ &edit);
+
+ InstrumentedMutexLock l(&mutex_);
+ std::string current_ts_low = cfd->GetFullHistoryTsLow();
+ const Comparator* ucmp = cfd->user_comparator();
+ assert(ucmp->timestamp_size() == ts_low.size() && !ts_low.empty());
+ if (!current_ts_low.empty() &&
+ ucmp->CompareTimestamp(ts_low, current_ts_low) < 0) {
+ return Status::InvalidArgument("Cannot decrease full_history_ts_low");
+ }
+
+ Status s = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(),
+ &edit, &mutex_, directories_.GetDbDir());
+ if (!s.ok()) {
+ return s;
+ }
+ current_ts_low = cfd->GetFullHistoryTsLow();
+ if (!current_ts_low.empty() &&
+ ucmp->CompareTimestamp(current_ts_low, ts_low) > 0) {
+ std::stringstream oss;
+ oss << "full_history_ts_low: " << Slice(current_ts_low).ToString(true)
+ << " is set to be higher than the requested "
+ "timestamp: "
+ << Slice(ts_low).ToString(true) << std::endl;
+ return Status::TryAgain(oss.str());
+ }
+ return Status::OK();
+}
+
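+// Core of the manual compaction path: optionally flush memtables that overlap
+// the requested range, then run manual compactions level by level from the
+// first to the last overlapped level, and finally refit the output to
+// `options.target_level` if `options.change_level` is set.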
+Status DBImpl::CompactRangeInternal(const CompactRangeOptions& options,
+ ColumnFamilyHandle* column_family,
+ const Slice* begin, const Slice* end,
+ const std::string& trim_ts) {
+ auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+ auto cfd = cfh->cfd();
+
+ if (options.target_path_id >= cfd->ioptions()->cf_paths.size()) {
+ return Status::InvalidArgument("Invalid target path ID");
+ }
+
+ bool flush_needed = true;
+
+ // Update full_history_ts_low if it's set
+ if (options.full_history_ts_low != nullptr &&
+ !options.full_history_ts_low->empty()) {
+ std::string ts_low = options.full_history_ts_low->ToString();
+ if (begin != nullptr || end != nullptr) {
+ return Status::InvalidArgument(
+ "Cannot specify compaction range with full_history_ts_low");
+ }
+ Status s = IncreaseFullHistoryTsLowImpl(cfd, ts_low);
+ if (!s.ok()) {
+ LogFlush(immutable_db_options_.info_log);
+ return s;
+ }
+ }
+
+ Status s;
+ if (begin != nullptr && end != nullptr) {
+ // TODO(ajkr): We could also optimize away the flush in certain cases where
+ // one/both sides of the interval are unbounded. But it requires more
+ // changes to RangesOverlapWithMemtables.
+ Range range(*begin, *end);
+ SuperVersion* super_version = cfd->GetReferencedSuperVersion(this);
+ s = cfd->RangesOverlapWithMemtables(
+ {range}, super_version, immutable_db_options_.allow_data_in_errors,
+ &flush_needed);
+ CleanupSuperVersion(super_version);
+ }
+
+ if (s.ok() && flush_needed) {
+ FlushOptions fo;
+ fo.allow_write_stall = options.allow_write_stall;
+ if (immutable_db_options_.atomic_flush) {
+ autovector<ColumnFamilyData*> cfds;
+ mutex_.Lock();
+ SelectColumnFamiliesForAtomicFlush(&cfds);
+ mutex_.Unlock();
+ s = AtomicFlushMemTables(cfds, fo, FlushReason::kManualCompaction,
+ false /* entered_write_thread */);
+ } else {
+ s = FlushMemTable(cfd, fo, FlushReason::kManualCompaction,
+ false /* entered_write_thread */);
+ }
+ if (!s.ok()) {
+ LogFlush(immutable_db_options_.info_log);
+ return s;
+ }
+ }
+
+ constexpr int kInvalidLevel = -1;
+ int final_output_level = kInvalidLevel;
+ bool exclusive = options.exclusive_manual_compaction;
+ if (cfd->ioptions()->compaction_style == kCompactionStyleUniversal &&
+ cfd->NumberLevels() > 1) {
+ // Always compact all files together.
+ final_output_level = cfd->NumberLevels() - 1;
+    // if the bottommost level is reserved
+ if (immutable_db_options_.allow_ingest_behind) {
+ final_output_level--;
+ }
+ s = RunManualCompaction(cfd, ColumnFamilyData::kCompactAllLevels,
+ final_output_level, options, begin, end, exclusive,
+ false, std::numeric_limits<uint64_t>::max(),
+ trim_ts);
+ } else {
+ int first_overlapped_level = kInvalidLevel;
+ int max_overlapped_level = kInvalidLevel;
+ {
+ SuperVersion* super_version = cfd->GetReferencedSuperVersion(this);
+ Version* current_version = super_version->current;
+ ReadOptions ro;
+ ro.total_order_seek = true;
+ bool overlap;
+ for (int level = 0;
+ level < current_version->storage_info()->num_non_empty_levels();
+ level++) {
+ overlap = true;
+ if (begin != nullptr && end != nullptr) {
+ Status status = current_version->OverlapWithLevelIterator(
+ ro, file_options_, *begin, *end, level, &overlap);
+ if (!status.ok()) {
+ overlap = current_version->storage_info()->OverlapInLevel(
+ level, begin, end);
+ }
+ } else {
+ overlap = current_version->storage_info()->OverlapInLevel(level,
+ begin, end);
+ }
+ if (overlap) {
+ if (first_overlapped_level == kInvalidLevel) {
+ first_overlapped_level = level;
+ }
+ max_overlapped_level = level;
+ }
+ }
+ CleanupSuperVersion(super_version);
+ }
+ if (s.ok() && first_overlapped_level != kInvalidLevel) {
+ // max_file_num_to_ignore can be used to filter out newly created SST
+ // files, useful for bottom level compaction in a manual compaction
+ uint64_t max_file_num_to_ignore = std::numeric_limits<uint64_t>::max();
+ uint64_t next_file_number = versions_->current_next_file_number();
+ final_output_level = max_overlapped_level;
+ int output_level;
+ for (int level = first_overlapped_level; level <= max_overlapped_level;
+ level++) {
+ bool disallow_trivial_move = false;
+        // In case the compaction is universal or we're compacting the
+        // bottommost level, the output level will be the same as the input
+        // one. Level 0 can never be the bottommost level (i.e. if all files
+        // are in level 0, we will compact to level 1).
+ if (cfd->ioptions()->compaction_style == kCompactionStyleUniversal ||
+ cfd->ioptions()->compaction_style == kCompactionStyleFIFO) {
+ output_level = level;
+ } else if (level == max_overlapped_level && level > 0) {
+ if (options.bottommost_level_compaction ==
+ BottommostLevelCompaction::kSkip) {
+ // Skip bottommost level compaction
+ continue;
+ } else if (options.bottommost_level_compaction ==
+ BottommostLevelCompaction::kIfHaveCompactionFilter &&
+ cfd->ioptions()->compaction_filter == nullptr &&
+ cfd->ioptions()->compaction_filter_factory == nullptr) {
+ // Skip bottommost level compaction since we don't have a compaction
+ // filter
+ continue;
+ }
+ output_level = level;
+ // update max_file_num_to_ignore only for bottom level compaction
+ // because data in newly compacted files in middle levels may still
+ // need to be pushed down
+ max_file_num_to_ignore = next_file_number;
+ } else {
+ output_level = level + 1;
+ if (cfd->ioptions()->compaction_style == kCompactionStyleLevel &&
+ cfd->ioptions()->level_compaction_dynamic_level_bytes &&
+ level == 0) {
+ output_level = ColumnFamilyData::kCompactToBaseLevel;
+ }
+ // if it's a BottommostLevel compaction and `kForce*` compaction is
+ // set, disallow trivial move
+ if (level == max_overlapped_level &&
+ (options.bottommost_level_compaction ==
+ BottommostLevelCompaction::kForce ||
+ options.bottommost_level_compaction ==
+ BottommostLevelCompaction::kForceOptimized)) {
+ disallow_trivial_move = true;
+ }
+ }
+        // trim_ts needs a real compaction to remove the latest record
+ if (!trim_ts.empty()) {
+ disallow_trivial_move = true;
+ }
+ s = RunManualCompaction(cfd, level, output_level, options, begin, end,
+ exclusive, disallow_trivial_move,
+ max_file_num_to_ignore, trim_ts);
+ if (!s.ok()) {
+ break;
+ }
+ if (output_level == ColumnFamilyData::kCompactToBaseLevel) {
+ final_output_level = cfd->NumberLevels() - 1;
+ } else if (output_level > final_output_level) {
+ final_output_level = output_level;
+ }
+ TEST_SYNC_POINT("DBImpl::RunManualCompaction()::1");
+ TEST_SYNC_POINT("DBImpl::RunManualCompaction()::2");
+ }
+ }
+ }
+ if (!s.ok() || final_output_level == kInvalidLevel) {
+ LogFlush(immutable_db_options_.info_log);
+ return s;
+ }
+
+ if (options.change_level) {
+ TEST_SYNC_POINT("DBImpl::CompactRange:BeforeRefit:1");
+ TEST_SYNC_POINT("DBImpl::CompactRange:BeforeRefit:2");
+
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "[RefitLevel] waiting for background threads to stop");
+ DisableManualCompaction();
+ s = PauseBackgroundWork();
+ if (s.ok()) {
+ TEST_SYNC_POINT("DBImpl::CompactRange:PreRefitLevel");
+ s = ReFitLevel(cfd, final_output_level, options.target_level);
+ TEST_SYNC_POINT("DBImpl::CompactRange:PostRefitLevel");
+      // ContinueBackgroundWork always returns Status::OK().
+ Status temp_s = ContinueBackgroundWork();
+ assert(temp_s.ok());
+ }
+ EnableManualCompaction();
+ TEST_SYNC_POINT(
+ "DBImpl::CompactRange:PostRefitLevel:ManualCompactionEnabled");
+ }
+ LogFlush(immutable_db_options_.info_log);
+
+ {
+ InstrumentedMutexLock l(&mutex_);
+ // an automatic compaction that has been scheduled might have been
+ // preempted by the manual compactions. Need to schedule it back.
+ MaybeScheduleFlushOrCompaction();
+ }
+
+ return s;
+}
+
+Status DBImpl::CompactFiles(const CompactionOptions& compact_options,
+ ColumnFamilyHandle* column_family,
+ const std::vector<std::string>& input_file_names,
+ const int output_level, const int output_path_id,
+ std::vector<std::string>* const output_file_names,
+ CompactionJobInfo* compaction_job_info) {
+#ifdef ROCKSDB_LITE
+ (void)compact_options;
+ (void)column_family;
+ (void)input_file_names;
+ (void)output_level;
+ (void)output_path_id;
+ (void)output_file_names;
+ (void)compaction_job_info;
+ // not supported in lite version
+ return Status::NotSupported("Not supported in ROCKSDB LITE");
+#else
+ if (column_family == nullptr) {
+ return Status::InvalidArgument("ColumnFamilyHandle must be non-null.");
+ }
+
+ auto cfd =
+ static_cast_with_check<ColumnFamilyHandleImpl>(column_family)->cfd();
+ assert(cfd);
+
+ Status s;
+ JobContext job_context(next_job_id_.fetch_add(1), true);
+ LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL,
+ immutable_db_options_.info_log.get());
+
+ // Perform CompactFiles
+ TEST_SYNC_POINT("TestCompactFiles::IngestExternalFile2");
+ TEST_SYNC_POINT_CALLBACK(
+ "TestCompactFiles:PausingManualCompaction:3",
+ reinterpret_cast<void*>(
+ const_cast<std::atomic<int>*>(&manual_compaction_paused_)));
+ {
+ InstrumentedMutexLock l(&mutex_);
+
+ // This call will unlock/lock the mutex to wait for current running
+ // IngestExternalFile() calls to finish.
+ WaitForIngestFile();
+
+ // We need to get current after `WaitForIngestFile`, because
+ // `IngestExternalFile` may add files that overlap with `input_file_names`
+ auto* current = cfd->current();
+ current->Ref();
+
+ s = CompactFilesImpl(compact_options, cfd, current, input_file_names,
+ output_file_names, output_level, output_path_id,
+ &job_context, &log_buffer, compaction_job_info);
+
+ current->Unref();
+ }
+
+ // Find and delete obsolete files
+ {
+ InstrumentedMutexLock l(&mutex_);
+ // If !s.ok(), this means that Compaction failed. In that case, we want
+ // to delete all obsolete files we might have created and we force
+ // FindObsoleteFiles(). This is because job_context does not
+ // catch all created files if compaction failed.
+ FindObsoleteFiles(&job_context, !s.ok());
+ } // release the mutex
+
+ // delete unnecessary files if any, this is done outside the mutex
+ if (job_context.HaveSomethingToClean() ||
+ job_context.HaveSomethingToDelete() || !log_buffer.IsEmpty()) {
+    // Have to flush the info logs before bg_compaction_scheduled_--
+    // because if bg_flush_scheduled_ becomes 0 and the lock is
+    // released, the destructor of DB can kick in and destroy all the
+    // state of the DB, so info_log might not be available after that point.
+    // The same applies to accessing any other state that the DB owns.
+ log_buffer.FlushBufferToLog();
+ if (job_context.HaveSomethingToDelete()) {
+ // no mutex is locked here. No need to Unlock() and Lock() here.
+ PurgeObsoleteFiles(job_context);
+ }
+ job_context.Clean();
+ }
+
+ return s;
+#endif // ROCKSDB_LITE
+}
+
+#ifndef ROCKSDB_LITE
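+// Implementation of CompactFiles(). Called with the DB mutex held: it
+// sanitizes the requested input files, forms a Compaction via the compaction
+// picker, runs the CompactionJob with the mutex released, and installs the
+// result while reporting any failure to the error handler.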
+Status DBImpl::CompactFilesImpl(
+ const CompactionOptions& compact_options, ColumnFamilyData* cfd,
+ Version* version, const std::vector<std::string>& input_file_names,
+ std::vector<std::string>* const output_file_names, const int output_level,
+ int output_path_id, JobContext* job_context, LogBuffer* log_buffer,
+ CompactionJobInfo* compaction_job_info) {
+ mutex_.AssertHeld();
+
+ if (shutting_down_.load(std::memory_order_acquire)) {
+ return Status::ShutdownInProgress();
+ }
+ if (manual_compaction_paused_.load(std::memory_order_acquire) > 0) {
+ return Status::Incomplete(Status::SubCode::kManualCompactionPaused);
+ }
+
+ std::unordered_set<uint64_t> input_set;
+ for (const auto& file_name : input_file_names) {
+ input_set.insert(TableFileNameToNumber(file_name));
+ }
+
+ ColumnFamilyMetaData cf_meta;
+ // TODO(yhchiang): can directly use version here if none of the
+  // following function calls is pluggable to external developers.
+ version->GetColumnFamilyMetaData(&cf_meta);
+
+ if (output_path_id < 0) {
+ if (cfd->ioptions()->cf_paths.size() == 1U) {
+ output_path_id = 0;
+ } else {
+ return Status::NotSupported(
+ "Automatic output path selection is not "
+ "yet supported in CompactFiles()");
+ }
+ }
+
+ Status s = cfd->compaction_picker()->SanitizeCompactionInputFiles(
+ &input_set, cf_meta, output_level);
+ if (!s.ok()) {
+ return s;
+ }
+
+ std::vector<CompactionInputFiles> input_files;
+ s = cfd->compaction_picker()->GetCompactionInputsFromFileNumbers(
+ &input_files, &input_set, version->storage_info(), compact_options);
+ if (!s.ok()) {
+ return s;
+ }
+
+ for (const auto& inputs : input_files) {
+ if (cfd->compaction_picker()->AreFilesInCompaction(inputs.files)) {
+ return Status::Aborted(
+ "Some of the necessary compaction input "
+ "files are already being compacted");
+ }
+ }
+ bool sfm_reserved_compact_space = false;
+ // First check if we have enough room to do the compaction
+ bool enough_room = EnoughRoomForCompaction(
+ cfd, input_files, &sfm_reserved_compact_space, log_buffer);
+
+ if (!enough_room) {
+ // m's vars will get set properly at the end of this function,
+ // as long as status == CompactionTooLarge
+ return Status::CompactionTooLarge();
+ }
+
+ // At this point, CompactFiles will be run.
+ bg_compaction_scheduled_++;
+
+ std::unique_ptr<Compaction> c;
+ assert(cfd->compaction_picker());
+ c.reset(cfd->compaction_picker()->CompactFiles(
+ compact_options, input_files, output_level, version->storage_info(),
+ *cfd->GetLatestMutableCFOptions(), mutable_db_options_, output_path_id));
+ // we already sanitized the set of input files and checked for conflicts
+ // without releasing the lock, so we're guaranteed a compaction can be formed.
+ assert(c != nullptr);
+
+ c->SetInputVersion(version);
+ // deletion compaction currently not allowed in CompactFiles.
+ assert(!c->deletion_compaction());
+
+ std::vector<SequenceNumber> snapshot_seqs;
+ SequenceNumber earliest_write_conflict_snapshot;
+ SnapshotChecker* snapshot_checker;
+ GetSnapshotContext(job_context, &snapshot_seqs,
+ &earliest_write_conflict_snapshot, &snapshot_checker);
+
+ std::unique_ptr<std::list<uint64_t>::iterator> pending_outputs_inserted_elem(
+ new std::list<uint64_t>::iterator(
+ CaptureCurrentFileNumberInPendingOutputs()));
+
+ assert(is_snapshot_supported_ || snapshots_.empty());
+ CompactionJobStats compaction_job_stats;
+ CompactionJob compaction_job(
+ job_context->job_id, c.get(), immutable_db_options_, mutable_db_options_,
+ file_options_for_compaction_, versions_.get(), &shutting_down_,
+ log_buffer, directories_.GetDbDir(),
+ GetDataDir(c->column_family_data(), c->output_path_id()),
+ GetDataDir(c->column_family_data(), 0), stats_, &mutex_, &error_handler_,
+ snapshot_seqs, earliest_write_conflict_snapshot, snapshot_checker,
+ job_context, table_cache_, &event_logger_,
+ c->mutable_cf_options()->paranoid_file_checks,
+ c->mutable_cf_options()->report_bg_io_stats, dbname_,
+ &compaction_job_stats, Env::Priority::USER, io_tracer_,
+ kManualCompactionCanceledFalse_, db_id_, db_session_id_,
+ c->column_family_data()->GetFullHistoryTsLow(), c->trim_ts(),
+ &blob_callback_, &bg_compaction_scheduled_,
+ &bg_bottom_compaction_scheduled_);
+
+ // Creating a compaction influences the compaction score because the score
+ // takes running compactions into account (by skipping files that are already
+ // being compacted). Since we just changed compaction score, we recalculate it
+ // here.
+ version->storage_info()->ComputeCompactionScore(*cfd->ioptions(),
+ *c->mutable_cf_options());
+
+ compaction_job.Prepare();
+
+ mutex_.Unlock();
+ TEST_SYNC_POINT("CompactFilesImpl:0");
+ TEST_SYNC_POINT("CompactFilesImpl:1");
+ // Ignore the status here, as it will be checked in the Install down below...
+ compaction_job.Run().PermitUncheckedError();
+ TEST_SYNC_POINT("CompactFilesImpl:2");
+ TEST_SYNC_POINT("CompactFilesImpl:3");
+ mutex_.Lock();
+
+ Status status = compaction_job.Install(*c->mutable_cf_options());
+ if (status.ok()) {
+ assert(compaction_job.io_status().ok());
+ InstallSuperVersionAndScheduleWork(c->column_family_data(),
+ &job_context->superversion_contexts[0],
+ *c->mutable_cf_options());
+ }
+  // status above captures any error during compaction_job.Install, so it's OK
+  // not to check compaction_job.io_status() explicitly if we're not calling
+  // SetBGError.
+ compaction_job.io_status().PermitUncheckedError();
+ c->ReleaseCompactionFiles(s);
+#ifndef ROCKSDB_LITE
+ // Need to make sure SstFileManager does its bookkeeping
+ auto sfm = static_cast<SstFileManagerImpl*>(
+ immutable_db_options_.sst_file_manager.get());
+ if (sfm && sfm_reserved_compact_space) {
+ sfm->OnCompactionCompletion(c.get());
+ }
+#endif // ROCKSDB_LITE
+
+ ReleaseFileNumberFromPendingOutputs(pending_outputs_inserted_elem);
+
+ if (compaction_job_info != nullptr) {
+ BuildCompactionJobInfo(cfd, c.get(), s, compaction_job_stats,
+ job_context->job_id, version, compaction_job_info);
+ }
+
+ if (status.ok()) {
+ // Done
+ } else if (status.IsColumnFamilyDropped() || status.IsShutdownInProgress()) {
+ // Ignore compaction errors found during shutting down
+ } else if (status.IsManualCompactionPaused()) {
+ // Don't report stopping manual compaction as error
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "[%s] [JOB %d] Stopping manual compaction",
+ c->column_family_data()->GetName().c_str(),
+ job_context->job_id);
+ } else {
+ ROCKS_LOG_WARN(immutable_db_options_.info_log,
+ "[%s] [JOB %d] Compaction error: %s",
+ c->column_family_data()->GetName().c_str(),
+ job_context->job_id, status.ToString().c_str());
+ IOStatus io_s = compaction_job.io_status();
+ if (!io_s.ok()) {
+ error_handler_.SetBGError(io_s, BackgroundErrorReason::kCompaction);
+ } else {
+ error_handler_.SetBGError(status, BackgroundErrorReason::kCompaction);
+ }
+ }
+
+ if (output_file_names != nullptr) {
+ for (const auto& newf : c->edit()->GetNewFiles()) {
+ output_file_names->push_back(TableFileName(
+ c->immutable_options()->cf_paths, newf.second.fd.GetNumber(),
+ newf.second.fd.GetPathId()));
+ }
+
+ for (const auto& blob_file : c->edit()->GetBlobFileAdditions()) {
+ output_file_names->push_back(
+ BlobFileName(c->immutable_options()->cf_paths.front().path,
+ blob_file.GetBlobFileNumber()));
+ }
+ }
+
+ c.reset();
+
+ bg_compaction_scheduled_--;
+ if (bg_compaction_scheduled_ == 0) {
+ bg_cv_.SignalAll();
+ }
+ MaybeScheduleFlushOrCompaction();
+ TEST_SYNC_POINT("CompactFilesImpl:End");
+
+ return status;
+}
+#endif // ROCKSDB_LITE
+
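+// Block new background work and wait for the currently scheduled flushes and
+// compactions to drain before returning. A later ContinueBackgroundWork()
+// call decrements the pause counters bumped here.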
+Status DBImpl::PauseBackgroundWork() {
+ InstrumentedMutexLock guard_lock(&mutex_);
+ bg_compaction_paused_++;
+ while (bg_bottom_compaction_scheduled_ > 0 || bg_compaction_scheduled_ > 0 ||
+ bg_flush_scheduled_ > 0) {
+ bg_cv_.Wait();
+ }
+ bg_work_paused_++;
+ return Status::OK();
+}
+
+Status DBImpl::ContinueBackgroundWork() {
+ InstrumentedMutexLock guard_lock(&mutex_);
+ if (bg_work_paused_ == 0) {
+ return Status::InvalidArgument();
+ }
+ assert(bg_work_paused_ > 0);
+ assert(bg_compaction_paused_ > 0);
+ bg_compaction_paused_--;
+ bg_work_paused_--;
+ // It's sufficient to check just bg_work_paused_ here since
+ // bg_work_paused_ is always no greater than bg_compaction_paused_
+ if (bg_work_paused_ == 0) {
+ MaybeScheduleFlushOrCompaction();
+ }
+ return Status::OK();
+}
+
+void DBImpl::NotifyOnCompactionBegin(ColumnFamilyData* cfd, Compaction* c,
+ const Status& st,
+ const CompactionJobStats& job_stats,
+ int job_id) {
+#ifndef ROCKSDB_LITE
+ if (immutable_db_options_.listeners.empty()) {
+ return;
+ }
+ mutex_.AssertHeld();
+ if (shutting_down_.load(std::memory_order_acquire)) {
+ return;
+ }
+ if (c->is_manual_compaction() &&
+ manual_compaction_paused_.load(std::memory_order_acquire) > 0) {
+ return;
+ }
+
+ c->SetNotifyOnCompactionCompleted();
+ Version* current = cfd->current();
+ current->Ref();
+ // release lock while notifying events
+ mutex_.Unlock();
+ TEST_SYNC_POINT("DBImpl::NotifyOnCompactionBegin::UnlockMutex");
+ {
+ CompactionJobInfo info{};
+ BuildCompactionJobInfo(cfd, c, st, job_stats, job_id, current, &info);
+ for (auto listener : immutable_db_options_.listeners) {
+ listener->OnCompactionBegin(this, info);
+ }
+ info.status.PermitUncheckedError();
+ }
+ mutex_.Lock();
+ current->Unref();
+#else
+ (void)cfd;
+ (void)c;
+ (void)st;
+ (void)job_stats;
+ (void)job_id;
+#endif // ROCKSDB_LITE
+}
+
+void DBImpl::NotifyOnCompactionCompleted(
+ ColumnFamilyData* cfd, Compaction* c, const Status& st,
+ const CompactionJobStats& compaction_job_stats, const int job_id) {
+#ifndef ROCKSDB_LITE
+ if (immutable_db_options_.listeners.size() == 0U) {
+ return;
+ }
+ mutex_.AssertHeld();
+ if (shutting_down_.load(std::memory_order_acquire)) {
+ return;
+ }
+
+ if (c->ShouldNotifyOnCompactionCompleted() == false) {
+ return;
+ }
+
+ Version* current = cfd->current();
+ current->Ref();
+ // release lock while notifying events
+ mutex_.Unlock();
+ TEST_SYNC_POINT("DBImpl::NotifyOnCompactionCompleted::UnlockMutex");
+ {
+ CompactionJobInfo info{};
+ BuildCompactionJobInfo(cfd, c, st, compaction_job_stats, job_id, current,
+ &info);
+ for (auto listener : immutable_db_options_.listeners) {
+ listener->OnCompactionCompleted(this, info);
+ }
+ }
+ mutex_.Lock();
+ current->Unref();
+ // no need to signal bg_cv_ as it will be signaled at the end of the
+ // flush process.
+#else
+ (void)cfd;
+ (void)c;
+ (void)st;
+ (void)compaction_job_stats;
+ (void)job_id;
+#endif // ROCKSDB_LITE
+}
+
+// REQUIREMENT: block all background work by calling PauseBackgroundWork()
+// before calling this function
+Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) {
+ assert(level < cfd->NumberLevels());
+ if (target_level >= cfd->NumberLevels()) {
+ return Status::InvalidArgument("Target level exceeds number of levels");
+ }
+
+ SuperVersionContext sv_context(/* create_superversion */ true);
+
+ InstrumentedMutexLock guard_lock(&mutex_);
+
+ // only allow one thread refitting
+ if (refitting_level_) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "[ReFitLevel] another thread is refitting");
+ return Status::NotSupported("another thread is refitting");
+ }
+ refitting_level_ = true;
+
+ const MutableCFOptions mutable_cf_options = *cfd->GetLatestMutableCFOptions();
+ // move to a smaller level
+ int to_level = target_level;
+ if (target_level < 0) {
+ to_level = FindMinimumEmptyLevelFitting(cfd, mutable_cf_options, level);
+ }
+
+ auto* vstorage = cfd->current()->storage_info();
+ if (to_level != level) {
+ if (to_level > level) {
+ if (level == 0) {
+ refitting_level_ = false;
+ return Status::NotSupported(
+ "Cannot change from level 0 to other levels.");
+ }
+ // Check levels are empty for a trivial move
+ for (int l = level + 1; l <= to_level; l++) {
+ if (vstorage->NumLevelFiles(l) > 0) {
+ refitting_level_ = false;
+ return Status::NotSupported(
+ "Levels between source and target are not empty for a move.");
+ }
+ }
+ } else {
+ // to_level < level
+ // Check levels are empty for a trivial move
+ for (int l = to_level; l < level; l++) {
+ if (vstorage->NumLevelFiles(l) > 0) {
+ refitting_level_ = false;
+ return Status::NotSupported(
+ "Levels between source and target are not empty for a move.");
+ }
+ }
+ }
+ ROCKS_LOG_DEBUG(immutable_db_options_.info_log,
+ "[%s] Before refitting:\n%s", cfd->GetName().c_str(),
+ cfd->current()->DebugString().data());
+
+ VersionEdit edit;
+ edit.SetColumnFamily(cfd->GetID());
+ for (const auto& f : vstorage->LevelFiles(level)) {
+ edit.DeleteFile(level, f->fd.GetNumber());
+ edit.AddFile(
+ to_level, f->fd.GetNumber(), f->fd.GetPathId(), f->fd.GetFileSize(),
+ f->smallest, f->largest, f->fd.smallest_seqno, f->fd.largest_seqno,
+ f->marked_for_compaction, f->temperature, f->oldest_blob_file_number,
+ f->oldest_ancester_time, f->file_creation_time, f->file_checksum,
+ f->file_checksum_func_name, f->unique_id);
+ }
+ ROCKS_LOG_DEBUG(immutable_db_options_.info_log,
+ "[%s] Apply version edit:\n%s", cfd->GetName().c_str(),
+ edit.DebugString().data());
+
+ Status status = versions_->LogAndApply(cfd, mutable_cf_options, &edit,
+ &mutex_, directories_.GetDbDir());
+
+ InstallSuperVersionAndScheduleWork(cfd, &sv_context, mutable_cf_options);
+
+ ROCKS_LOG_DEBUG(immutable_db_options_.info_log, "[%s] LogAndApply: %s\n",
+ cfd->GetName().c_str(), status.ToString().data());
+
+ if (status.ok()) {
+ ROCKS_LOG_DEBUG(immutable_db_options_.info_log,
+ "[%s] After refitting:\n%s", cfd->GetName().c_str(),
+ cfd->current()->DebugString().data());
+ }
+ sv_context.Clean();
+ refitting_level_ = false;
+
+ return status;
+ }
+
+ refitting_level_ = false;
+ return Status::OK();
+}
+
+int DBImpl::NumberLevels(ColumnFamilyHandle* column_family) {
+ auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+ return cfh->cfd()->NumberLevels();
+}
+
+int DBImpl::MaxMemCompactionLevel(ColumnFamilyHandle* /*column_family*/) {
+ return 0;
+}
+
+int DBImpl::Level0StopWriteTrigger(ColumnFamilyHandle* column_family) {
+ auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+ InstrumentedMutexLock l(&mutex_);
+ return cfh->cfd()
+ ->GetSuperVersion()
+ ->mutable_cf_options.level0_stop_writes_trigger;
+}
+
+Status DBImpl::Flush(const FlushOptions& flush_options,
+ ColumnFamilyHandle* column_family) {
+ auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+ ROCKS_LOG_INFO(immutable_db_options_.info_log, "[%s] Manual flush start.",
+ cfh->GetName().c_str());
+ Status s;
+ if (immutable_db_options_.atomic_flush) {
+ s = AtomicFlushMemTables({cfh->cfd()}, flush_options,
+ FlushReason::kManualFlush);
+ } else {
+ s = FlushMemTable(cfh->cfd(), flush_options, FlushReason::kManualFlush);
+ }
+
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "[%s] Manual flush finished, status: %s\n",
+ cfh->GetName().c_str(), s.ToString().c_str());
+ return s;
+}
+
+Status DBImpl::Flush(const FlushOptions& flush_options,
+ const std::vector<ColumnFamilyHandle*>& column_families) {
+ Status s;
+ if (!immutable_db_options_.atomic_flush) {
+ for (auto cfh : column_families) {
+ s = Flush(flush_options, cfh);
+ if (!s.ok()) {
+ break;
+ }
+ }
+ } else {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "Manual atomic flush start.\n"
+ "=====Column families:=====");
+ for (auto cfh : column_families) {
+ auto cfhi = static_cast<ColumnFamilyHandleImpl*>(cfh);
+ ROCKS_LOG_INFO(immutable_db_options_.info_log, "%s",
+ cfhi->GetName().c_str());
+ }
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "=====End of column families list=====");
+ autovector<ColumnFamilyData*> cfds;
+ std::for_each(column_families.begin(), column_families.end(),
+ [&cfds](ColumnFamilyHandle* elem) {
+ auto cfh = static_cast<ColumnFamilyHandleImpl*>(elem);
+ cfds.emplace_back(cfh->cfd());
+ });
+ s = AtomicFlushMemTables(cfds, flush_options, FlushReason::kManualFlush);
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "Manual atomic flush finished, status: %s\n"
+ "=====Column families:=====",
+ s.ToString().c_str());
+ for (auto cfh : column_families) {
+ auto cfhi = static_cast<ColumnFamilyHandleImpl*>(cfh);
+ ROCKS_LOG_INFO(immutable_db_options_.info_log, "%s",
+ cfhi->GetName().c_str());
+ }
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "=====End of column families list=====");
+ }
+ return s;
+}
+
+Status DBImpl::RunManualCompaction(
+ ColumnFamilyData* cfd, int input_level, int output_level,
+ const CompactRangeOptions& compact_range_options, const Slice* begin,
+ const Slice* end, bool exclusive, bool disallow_trivial_move,
+ uint64_t max_file_num_to_ignore, const std::string& trim_ts) {
+ assert(input_level == ColumnFamilyData::kCompactAllLevels ||
+ input_level >= 0);
+
+ InternalKey begin_storage, end_storage;
+ CompactionArg* ca = nullptr;
+
+ bool scheduled = false;
+ bool unscheduled = false;
+ Env::Priority thread_pool_priority = Env::Priority::TOTAL;
+ bool manual_conflict = false;
+
+ ManualCompactionState manual(
+ cfd, input_level, output_level, compact_range_options.target_path_id,
+ exclusive, disallow_trivial_move, compact_range_options.canceled);
+ // For universal compaction, we enforce every manual compaction to compact
+ // all files.
+ if (begin == nullptr ||
+ cfd->ioptions()->compaction_style == kCompactionStyleUniversal ||
+ cfd->ioptions()->compaction_style == kCompactionStyleFIFO) {
+ manual.begin = nullptr;
+ } else {
+ begin_storage.SetMinPossibleForUserKey(*begin);
+ manual.begin = &begin_storage;
+ }
+ if (end == nullptr ||
+ cfd->ioptions()->compaction_style == kCompactionStyleUniversal ||
+ cfd->ioptions()->compaction_style == kCompactionStyleFIFO) {
+ manual.end = nullptr;
+ } else {
+ end_storage.SetMaxPossibleForUserKey(*end);
+ manual.end = &end_storage;
+ }
+
+ TEST_SYNC_POINT("DBImpl::RunManualCompaction:0");
+ TEST_SYNC_POINT("DBImpl::RunManualCompaction:1");
+ InstrumentedMutexLock l(&mutex_);
+
+ if (manual_compaction_paused_ > 0) {
+ // Does not make sense to `AddManualCompaction()` in this scenario since
+ // `DisableManualCompaction()` just waited for the manual compaction queue
+ // to drain. So return immediately.
+ TEST_SYNC_POINT("DBImpl::RunManualCompaction:PausedAtStart");
+ manual.status =
+ Status::Incomplete(Status::SubCode::kManualCompactionPaused);
+ manual.done = true;
+ return manual.status;
+ }
+
+ // When a manual compaction arrives, temporarily disable scheduling of
+ // non-manual compactions and wait until the number of scheduled compaction
+ // jobs drops to zero. This used to be needed to ensure that this manual
+ // compaction can compact any range of keys/files. Now it is optional
+ // (see `CompactRangeOptions::exclusive_manual_compaction`). The use case for
+ // `exclusive_manual_compaction=true` is unclear beyond not trusting the code.
+ //
+ // HasPendingManualCompaction() is true when at least one thread is inside
+ // RunManualCompaction(), i.e. during that time no other compaction will
+ // get scheduled (see MaybeScheduleFlushOrCompaction).
+ //
+  // Note that the following loop doesn't stop more than one thread calling
+ // RunManualCompaction() from getting to the second while loop below.
+ // However, only one of them will actually schedule compaction, while
+ // others will wait on a condition variable until it completes.
+
+ AddManualCompaction(&manual);
+ TEST_SYNC_POINT_CALLBACK("DBImpl::RunManualCompaction:NotScheduled", &mutex_);
+ if (exclusive) {
+ // Limitation: there's no way to wake up the below loop when user sets
+ // `*manual.canceled`. So `CompactRangeOptions::exclusive_manual_compaction`
+ // and `CompactRangeOptions::canceled` might not work well together.
+ while (bg_bottom_compaction_scheduled_ > 0 ||
+ bg_compaction_scheduled_ > 0) {
+ if (manual_compaction_paused_ > 0 || manual.canceled == true) {
+ // Pretend the error came from compaction so the below cleanup/error
+ // handling code can process it.
+ manual.done = true;
+ manual.status =
+ Status::Incomplete(Status::SubCode::kManualCompactionPaused);
+ break;
+ }
+ TEST_SYNC_POINT("DBImpl::RunManualCompaction:WaitScheduled");
+ ROCKS_LOG_INFO(
+ immutable_db_options_.info_log,
+ "[%s] Manual compaction waiting for all other scheduled background "
+ "compactions to finish",
+ cfd->GetName().c_str());
+ bg_cv_.Wait();
+ }
+ }
+
+ LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL,
+ immutable_db_options_.info_log.get());
+
+ ROCKS_LOG_BUFFER(&log_buffer, "[%s] Manual compaction starting",
+ cfd->GetName().c_str());
+
+ // We don't check bg_error_ here, because if we get the error in compaction,
+ // the compaction will set manual.status to bg_error_ and set manual.done to
+ // true.
+ while (!manual.done) {
+ assert(HasPendingManualCompaction());
+ manual_conflict = false;
+ Compaction* compaction = nullptr;
+ if (ShouldntRunManualCompaction(&manual) || (manual.in_progress == true) ||
+ scheduled ||
+ (((manual.manual_end = &manual.tmp_storage1) != nullptr) &&
+ ((compaction = manual.cfd->CompactRange(
+ *manual.cfd->GetLatestMutableCFOptions(), mutable_db_options_,
+ manual.input_level, manual.output_level, compact_range_options,
+ manual.begin, manual.end, &manual.manual_end, &manual_conflict,
+ max_file_num_to_ignore, trim_ts)) == nullptr &&
+ manual_conflict))) {
+ // exclusive manual compactions should not see a conflict during
+ // CompactRange
+ assert(!exclusive || !manual_conflict);
+ // Running either this or some other manual compaction
+ bg_cv_.Wait();
+ if (manual_compaction_paused_ > 0 && scheduled && !unscheduled) {
+ assert(thread_pool_priority != Env::Priority::TOTAL);
+ // unschedule all manual compactions
+ auto unscheduled_task_num = env_->UnSchedule(
+ GetTaskTag(TaskType::kManualCompaction), thread_pool_priority);
+ if (unscheduled_task_num > 0) {
+ ROCKS_LOG_INFO(
+ immutable_db_options_.info_log,
+ "[%s] Unscheduled %d number of manual compactions from the "
+ "thread-pool",
+ cfd->GetName().c_str(), unscheduled_task_num);
+ // it may unschedule other manual compactions, notify others.
+ bg_cv_.SignalAll();
+ }
+ unscheduled = true;
+ TEST_SYNC_POINT("DBImpl::RunManualCompaction:Unscheduled");
+ }
+ if (scheduled && manual.incomplete == true) {
+ assert(!manual.in_progress);
+ scheduled = false;
+ manual.incomplete = false;
+ }
+ } else if (!scheduled) {
+ if (compaction == nullptr) {
+ manual.done = true;
+ bg_cv_.SignalAll();
+ continue;
+ }
+ ca = new CompactionArg;
+ ca->db = this;
+ ca->prepicked_compaction = new PrepickedCompaction;
+ ca->prepicked_compaction->manual_compaction_state = &manual;
+ ca->prepicked_compaction->compaction = compaction;
+ if (!RequestCompactionToken(
+ cfd, true, &ca->prepicked_compaction->task_token, &log_buffer)) {
+ // Don't throttle manual compaction, only count outstanding tasks.
+ assert(false);
+ }
+ manual.incomplete = false;
+ if (compaction->bottommost_level() &&
+ env_->GetBackgroundThreads(Env::Priority::BOTTOM) > 0) {
+ bg_bottom_compaction_scheduled_++;
+ ca->compaction_pri_ = Env::Priority::BOTTOM;
+ env_->Schedule(&DBImpl::BGWorkBottomCompaction, ca,
+ Env::Priority::BOTTOM,
+ GetTaskTag(TaskType::kManualCompaction),
+ &DBImpl::UnscheduleCompactionCallback);
+ thread_pool_priority = Env::Priority::BOTTOM;
+ } else {
+ bg_compaction_scheduled_++;
+ ca->compaction_pri_ = Env::Priority::LOW;
+ env_->Schedule(&DBImpl::BGWorkCompaction, ca, Env::Priority::LOW,
+ GetTaskTag(TaskType::kManualCompaction),
+ &DBImpl::UnscheduleCompactionCallback);
+ thread_pool_priority = Env::Priority::LOW;
+ }
+ scheduled = true;
+ TEST_SYNC_POINT("DBImpl::RunManualCompaction:Scheduled");
+ }
+ }
+
+ log_buffer.FlushBufferToLog();
+ assert(!manual.in_progress);
+ assert(HasPendingManualCompaction());
+ RemoveManualCompaction(&manual);
+  // If the manual job is unscheduled, try to schedule other jobs in case
+  // there are any unscheduled compaction jobs which were blocked by the
+  // exclusive manual compaction.
+ if (manual.status.IsIncomplete() &&
+ manual.status.subcode() == Status::SubCode::kManualCompactionPaused) {
+ MaybeScheduleFlushOrCompaction();
+ }
+ bg_cv_.SignalAll();
+ return manual.status;
+}
+
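+// Build a FlushRequest that pairs each non-null column family with the ID of
+// its latest immutable memtable, i.e. the newest memtable the flush needs to
+// cover.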
+void DBImpl::GenerateFlushRequest(const autovector<ColumnFamilyData*>& cfds,
+ FlushRequest* req) {
+ assert(req != nullptr);
+ req->reserve(cfds.size());
+ for (const auto cfd : cfds) {
+ if (nullptr == cfd) {
+ // cfd may be null, see DBImpl::ScheduleFlushes
+ continue;
+ }
+ uint64_t max_memtable_id = cfd->imm()->GetLatestMemTableID();
+ req->emplace_back(cfd, max_memtable_id);
+ }
+}
+
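+// Flush path for the non-atomic-flush case: switch the active memtable of
+// `cfd` (and, if needed, of the persistent stats column family), schedule the
+// resulting flush requests, and optionally wait for the flushes to finish
+// when flush_options.wait is set.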
+Status DBImpl::FlushMemTable(ColumnFamilyData* cfd,
+ const FlushOptions& flush_options,
+ FlushReason flush_reason,
+ bool entered_write_thread) {
+ // This method should not be called if atomic_flush is true.
+ assert(!immutable_db_options_.atomic_flush);
+ if (!flush_options.wait && write_controller_.IsStopped()) {
+ std::ostringstream oss;
+ oss << "Writes have been stopped, thus unable to perform manual flush. "
+ "Please try again later after writes are resumed";
+ return Status::TryAgain(oss.str());
+ }
+ Status s;
+ if (!flush_options.allow_write_stall) {
+ bool flush_needed = true;
+ s = WaitUntilFlushWouldNotStallWrites(cfd, &flush_needed);
+ TEST_SYNC_POINT("DBImpl::FlushMemTable:StallWaitDone");
+ if (!s.ok() || !flush_needed) {
+ return s;
+ }
+ }
+
+ const bool needs_to_join_write_thread = !entered_write_thread;
+ autovector<FlushRequest> flush_reqs;
+ autovector<uint64_t> memtable_ids_to_wait;
+ {
+ WriteContext context;
+ InstrumentedMutexLock guard_lock(&mutex_);
+
+ WriteThread::Writer w;
+ WriteThread::Writer nonmem_w;
+ if (needs_to_join_write_thread) {
+ write_thread_.EnterUnbatched(&w, &mutex_);
+ if (two_write_queues_) {
+ nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_);
+ }
+ }
+ WaitForPendingWrites();
+
+ if (flush_reason != FlushReason::kErrorRecoveryRetryFlush &&
+ (!cfd->mem()->IsEmpty() || !cached_recoverable_state_empty_.load())) {
+ // Note that, when flush reason is kErrorRecoveryRetryFlush, during the
+ // auto retry resume, we want to avoid creating new small memtables.
+ // Therefore, SwitchMemtable will not be called. Also, since ResumeImpl
+ // will iterate through all the CFs and call FlushMemtable during auto
+ // retry resume, it is possible that in some CFs,
+      // cfd->imm()->NumNotFlushed() == 0. In this case, no flush request will
+      // be created or scheduled, and Status::OK() will be returned.
+ s = SwitchMemtable(cfd, &context);
+ }
+ const uint64_t flush_memtable_id = std::numeric_limits<uint64_t>::max();
+ if (s.ok()) {
+ if (cfd->imm()->NumNotFlushed() != 0 || !cfd->mem()->IsEmpty() ||
+ !cached_recoverable_state_empty_.load()) {
+ FlushRequest req{{cfd, flush_memtable_id}};
+ flush_reqs.emplace_back(std::move(req));
+ memtable_ids_to_wait.emplace_back(cfd->imm()->GetLatestMemTableID());
+ }
+ if (immutable_db_options_.persist_stats_to_disk &&
+ flush_reason != FlushReason::kErrorRecoveryRetryFlush) {
+ ColumnFamilyData* cfd_stats =
+ versions_->GetColumnFamilySet()->GetColumnFamily(
+ kPersistentStatsColumnFamilyName);
+ if (cfd_stats != nullptr && cfd_stats != cfd &&
+ !cfd_stats->mem()->IsEmpty()) {
+ // only force flush stats CF when it will be the only CF lagging
+ // behind after the current flush
+ bool stats_cf_flush_needed = true;
+ for (auto* loop_cfd : *versions_->GetColumnFamilySet()) {
+ if (loop_cfd == cfd_stats || loop_cfd == cfd) {
+ continue;
+ }
+ if (loop_cfd->GetLogNumber() <= cfd_stats->GetLogNumber()) {
+ stats_cf_flush_needed = false;
+ }
+ }
+ if (stats_cf_flush_needed) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "Force flushing stats CF with manual flush of %s "
+ "to avoid holding old logs",
+ cfd->GetName().c_str());
+ s = SwitchMemtable(cfd_stats, &context);
+ FlushRequest req{{cfd_stats, flush_memtable_id}};
+ flush_reqs.emplace_back(std::move(req));
+ memtable_ids_to_wait.emplace_back(
+ cfd->imm()->GetLatestMemTableID());
+ }
+ }
+ }
+ }
+
+ if (s.ok() && !flush_reqs.empty()) {
+ for (const auto& req : flush_reqs) {
+ assert(req.size() == 1);
+ ColumnFamilyData* loop_cfd = req[0].first;
+ loop_cfd->imm()->FlushRequested();
+ }
+ // If the caller wants to wait for this flush to complete, it indicates
+      // that the caller expects the ColumnFamilyData not to be freed by
+ // other threads which may drop the column family concurrently.
+ // Therefore, we increase the cfd's ref count.
+ if (flush_options.wait) {
+ for (const auto& req : flush_reqs) {
+ assert(req.size() == 1);
+ ColumnFamilyData* loop_cfd = req[0].first;
+ loop_cfd->Ref();
+ }
+ }
+ for (const auto& req : flush_reqs) {
+ SchedulePendingFlush(req, flush_reason);
+ }
+ MaybeScheduleFlushOrCompaction();
+ }
+
+ if (needs_to_join_write_thread) {
+ write_thread_.ExitUnbatched(&w);
+ if (two_write_queues_) {
+ nonmem_write_thread_.ExitUnbatched(&nonmem_w);
+ }
+ }
+ }
+ TEST_SYNC_POINT("DBImpl::FlushMemTable:AfterScheduleFlush");
+ TEST_SYNC_POINT("DBImpl::FlushMemTable:BeforeWaitForBgFlush");
+ if (s.ok() && flush_options.wait) {
+ autovector<ColumnFamilyData*> cfds;
+ autovector<const uint64_t*> flush_memtable_ids;
+ assert(flush_reqs.size() == memtable_ids_to_wait.size());
+ for (size_t i = 0; i < flush_reqs.size(); ++i) {
+ assert(flush_reqs[i].size() == 1);
+ cfds.push_back(flush_reqs[i][0].first);
+ flush_memtable_ids.push_back(&(memtable_ids_to_wait[i]));
+ }
+ s = WaitForFlushMemTables(
+ cfds, flush_memtable_ids,
+ (flush_reason == FlushReason::kErrorRecovery ||
+ flush_reason == FlushReason::kErrorRecoveryRetryFlush));
+ InstrumentedMutexLock lock_guard(&mutex_);
+ for (auto* tmp_cfd : cfds) {
+ tmp_cfd->UnrefAndTryDelete();
+ }
+ }
+ TEST_SYNC_POINT("DBImpl::FlushMemTable:FlushMemTableFinished");
+ return s;
+}
+
+// Flush all elements in 'column_family_datas'
+// and atomically record the result to the MANIFEST.
+Status DBImpl::AtomicFlushMemTables(
+ const autovector<ColumnFamilyData*>& column_family_datas,
+ const FlushOptions& flush_options, FlushReason flush_reason,
+ bool entered_write_thread) {
+ assert(immutable_db_options_.atomic_flush);
+ if (!flush_options.wait && write_controller_.IsStopped()) {
+ std::ostringstream oss;
+ oss << "Writes have been stopped, thus unable to perform manual flush. "
+ "Please try again later after writes are resumed";
+ return Status::TryAgain(oss.str());
+ }
+ Status s;
+ if (!flush_options.allow_write_stall) {
+ int num_cfs_to_flush = 0;
+ for (auto cfd : column_family_datas) {
+ bool flush_needed = true;
+ s = WaitUntilFlushWouldNotStallWrites(cfd, &flush_needed);
+ if (!s.ok()) {
+ return s;
+ } else if (flush_needed) {
+ ++num_cfs_to_flush;
+ }
+ }
+ if (0 == num_cfs_to_flush) {
+ return s;
+ }
+ }
+ const bool needs_to_join_write_thread = !entered_write_thread;
+ FlushRequest flush_req;
+ autovector<ColumnFamilyData*> cfds;
+ {
+ WriteContext context;
+ InstrumentedMutexLock guard_lock(&mutex_);
+
+ WriteThread::Writer w;
+ WriteThread::Writer nonmem_w;
+ if (needs_to_join_write_thread) {
+ write_thread_.EnterUnbatched(&w, &mutex_);
+ if (two_write_queues_) {
+ nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_);
+ }
+ }
+ WaitForPendingWrites();
+
+ for (auto cfd : column_family_datas) {
+ if (cfd->IsDropped()) {
+ continue;
+ }
+ if (cfd->imm()->NumNotFlushed() != 0 || !cfd->mem()->IsEmpty() ||
+ !cached_recoverable_state_empty_.load()) {
+ cfds.emplace_back(cfd);
+ }
+ }
+ for (auto cfd : cfds) {
+ if ((cfd->mem()->IsEmpty() && cached_recoverable_state_empty_.load()) ||
+ flush_reason == FlushReason::kErrorRecoveryRetryFlush) {
+ continue;
+ }
+ cfd->Ref();
+ s = SwitchMemtable(cfd, &context);
+ cfd->UnrefAndTryDelete();
+ if (!s.ok()) {
+ break;
+ }
+ }
+ if (s.ok()) {
+ AssignAtomicFlushSeq(cfds);
+ for (auto cfd : cfds) {
+ cfd->imm()->FlushRequested();
+ }
+ // If the caller wants to wait for this flush to complete, it indicates
+      // that the caller expects the ColumnFamilyData not to be freed by
+ // other threads which may drop the column family concurrently.
+ // Therefore, we increase the cfd's ref count.
+ if (flush_options.wait) {
+ for (auto cfd : cfds) {
+ cfd->Ref();
+ }
+ }
+ GenerateFlushRequest(cfds, &flush_req);
+ SchedulePendingFlush(flush_req, flush_reason);
+ MaybeScheduleFlushOrCompaction();
+ }
+
+ if (needs_to_join_write_thread) {
+ write_thread_.ExitUnbatched(&w);
+ if (two_write_queues_) {
+ nonmem_write_thread_.ExitUnbatched(&nonmem_w);
+ }
+ }
+ }
+ TEST_SYNC_POINT("DBImpl::AtomicFlushMemTables:AfterScheduleFlush");
+ TEST_SYNC_POINT("DBImpl::AtomicFlushMemTables:BeforeWaitForBgFlush");
+ if (s.ok() && flush_options.wait) {
+ autovector<const uint64_t*> flush_memtable_ids;
+ for (auto& iter : flush_req) {
+ flush_memtable_ids.push_back(&(iter.second));
+ }
+ s = WaitForFlushMemTables(
+ cfds, flush_memtable_ids,
+ (flush_reason == FlushReason::kErrorRecovery ||
+ flush_reason == FlushReason::kErrorRecoveryRetryFlush));
+ InstrumentedMutexLock lock_guard(&mutex_);
+ for (auto* cfd : cfds) {
+ cfd->UnrefAndTryDelete();
+ }
+ }
+ return s;
+}
+
+// Calling FlushMemTable(), whether from DB::Flush() or from Backup Engine, can
+// cause a write stall, for example if one memtable is already being flushed.
+// This method tries to avoid write stalls (similar to CompactRange() behavior):
+// it emulates how the SuperVersion / LSM would change if the flush happened,
+// checks that against various constraints, and delays the flush if it would
+// cause a write stall.
+// The caller should check status and flush_needed to see if a flush already
+// happened.
+Status DBImpl::WaitUntilFlushWouldNotStallWrites(ColumnFamilyData* cfd,
+ bool* flush_needed) {
+ {
+ *flush_needed = true;
+ InstrumentedMutexLock l(&mutex_);
+ uint64_t orig_active_memtable_id = cfd->mem()->GetID();
+ WriteStallCondition write_stall_condition = WriteStallCondition::kNormal;
+ do {
+ if (write_stall_condition != WriteStallCondition::kNormal) {
+ // Same error handling as user writes: Don't wait if there's a
+ // background error, even if it's a soft error. We might wait here
+ // indefinitely as the pending flushes/compactions may never finish
+ // successfully, resulting in the stall condition lasting indefinitely
+ if (error_handler_.IsBGWorkStopped()) {
+ return error_handler_.GetBGError();
+ }
+
+ TEST_SYNC_POINT("DBImpl::WaitUntilFlushWouldNotStallWrites:StallWait");
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "[%s] WaitUntilFlushWouldNotStallWrites"
+ " waiting on stall conditions to clear",
+ cfd->GetName().c_str());
+ bg_cv_.Wait();
+ }
+ if (cfd->IsDropped()) {
+ return Status::ColumnFamilyDropped();
+ }
+ if (shutting_down_.load(std::memory_order_acquire)) {
+ return Status::ShutdownInProgress();
+ }
+
+ uint64_t earliest_memtable_id =
+ std::min(cfd->mem()->GetID(), cfd->imm()->GetEarliestMemTableID());
+ if (earliest_memtable_id > orig_active_memtable_id) {
+ // We waited so long that the memtable we were originally waiting on was
+ // flushed.
+ *flush_needed = false;
+ return Status::OK();
+ }
+
+ const auto& mutable_cf_options = *cfd->GetLatestMutableCFOptions();
+ const auto* vstorage = cfd->current()->storage_info();
+
+ // Skip stalling check if we're below auto-flush and auto-compaction
+ // triggers. If it stalled in these conditions, that'd mean the stall
+ // triggers are so low that stalling is needed for any background work. In
+ // that case we shouldn't wait since background work won't be scheduled.
+ if (cfd->imm()->NumNotFlushed() <
+ cfd->ioptions()->min_write_buffer_number_to_merge &&
+ vstorage->l0_delay_trigger_count() <
+ mutable_cf_options.level0_file_num_compaction_trigger) {
+ break;
+ }
+
+ // Check whether one extra immutable memtable or an extra L0 file would
+ // cause write stalling mode to be entered. It could still enter stall
+ // mode due to pending compaction bytes, but that's less common.
+ write_stall_condition = ColumnFamilyData::GetWriteStallConditionAndCause(
+ cfd->imm()->NumNotFlushed() + 1,
+ vstorage->l0_delay_trigger_count() + 1,
+ vstorage->estimated_compaction_needed_bytes(),
+ mutable_cf_options, *cfd->ioptions())
+ .first;
+ } while (write_stall_condition != WriteStallCondition::kNormal);
+ }
+ return Status::OK();
+}
+
+// Wait for memtables to be flushed for multiple column families.
+// let N = cfds.size()
+// for i in [0, N),
+//  1) if flush_memtable_ids[i] is not null, then the memtables with IDs no
+//     greater than *flush_memtable_ids[i] have to be flushed for THIS column
+//     family;
+// 2) if flush_memtable_ids[i] is null, then all memtables in THIS column
+// family have to be flushed.
+// Finish waiting when ALL column families finish flushing memtables.
+// resuming_from_bg_err indicates whether the caller is trying to resume from a
+// background error or is in normal processing.
+Status DBImpl::WaitForFlushMemTables(
+ const autovector<ColumnFamilyData*>& cfds,
+ const autovector<const uint64_t*>& flush_memtable_ids,
+ bool resuming_from_bg_err) {
+ int num = static_cast<int>(cfds.size());
+ // Wait until the flushes complete
+ InstrumentedMutexLock l(&mutex_);
+ Status s;
+ // If the caller is trying to resume from bg error, then
+ // error_handler_.IsDBStopped() is true.
+ while (resuming_from_bg_err || !error_handler_.IsDBStopped()) {
+ if (shutting_down_.load(std::memory_order_acquire)) {
+ s = Status::ShutdownInProgress();
+ return s;
+ }
+ // If an error has occurred during resumption, then no need to wait.
+ // But the flush operation may fail because of this error, so we need to
+ // return the status.
+ if (!error_handler_.GetRecoveryError().ok()) {
+ s = error_handler_.GetRecoveryError();
+ break;
+ }
+ // If BGWorkStopped, there is a BG error and either 1) it is a soft error
+ // that requires no BG work, or 2) auto_recovery_ is not enabled.
+ if (!resuming_from_bg_err && error_handler_.IsBGWorkStopped() &&
+ error_handler_.GetBGError().severity() < Status::Severity::kHardError) {
+ s = error_handler_.GetBGError();
+ return s;
+ }
+
+ // Number of column families that have been dropped.
+ int num_dropped = 0;
+ // Number of column families that have finished flush.
+ int num_finished = 0;
+ for (int i = 0; i < num; ++i) {
+ if (cfds[i]->IsDropped()) {
+ ++num_dropped;
+ } else if (cfds[i]->imm()->NumNotFlushed() == 0 ||
+ (flush_memtable_ids[i] != nullptr &&
+ cfds[i]->imm()->GetEarliestMemTableID() >
+ *flush_memtable_ids[i])) {
+ ++num_finished;
+ }
+ }
+ if (1 == num_dropped && 1 == num) {
+ s = Status::ColumnFamilyDropped();
+ return s;
+ }
+ // Column families involved in this flush request have either been dropped
+ // or finished flushing, so it's time to finish waiting.
+ if (num_dropped + num_finished == num) {
+ break;
+ }
+ bg_cv_.Wait();
+ }
+ // If not resuming from bg error, and an error has caused the DB to stop,
+ // then report the bg error to caller.
+ if (!resuming_from_bg_err && error_handler_.IsDBStopped()) {
+ s = error_handler_.GetBGError();
+ }
+ return s;
+}
+
+Status DBImpl::EnableAutoCompaction(
+ const std::vector<ColumnFamilyHandle*>& column_family_handles) {
+ Status s;
+ for (auto cf_ptr : column_family_handles) {
+ Status status =
+ this->SetOptions(cf_ptr, {{"disable_auto_compactions", "false"}});
+ if (!status.ok()) {
+ s = status;
+ }
+ }
+
+ return s;
+}
+
+// NOTE: Calling DisableManualCompaction() may overwrite the
+// user-provided canceled variable in CompactRangeOptions
+void DBImpl::DisableManualCompaction() {
+ InstrumentedMutexLock l(&mutex_);
+ manual_compaction_paused_.fetch_add(1, std::memory_order_release);
+
+ // Mark `canceled` as true when the cancellation is triggered by
+ // manual_compaction_paused (this may overwrite a user-provided `canceled`).
+ for (const auto& manual_compaction : manual_compaction_dequeue_) {
+ manual_compaction->canceled = true;
+ }
+
+ // Wake up manual compactions waiting to start.
+ bg_cv_.SignalAll();
+
+ // Wait for any pending manual compactions to finish (typically through
+ // failing with `Status::Incomplete`) prior to returning. This way we are
+ // guaranteed no pending manual compaction will commit while manual
+ // compactions are "disabled".
+ while (HasPendingManualCompaction()) {
+ bg_cv_.Wait();
+ }
+}
+
+// NOTE: In contrast to DisableManualCompaction(), calling
+// EnableManualCompaction() does NOT reset the user-provided *canceled
+// variable to false, since a canceled compaction can never be un-canceled.
+// In other words, a canceled compaction must already have been dropped out
+// of the manual compaction queue when manual compaction was disabled.
+void DBImpl::EnableManualCompaction() {
+ InstrumentedMutexLock l(&mutex_);
+ assert(manual_compaction_paused_ > 0);
+ manual_compaction_paused_.fetch_sub(1, std::memory_order_release);
+}
+
+void DBImpl::MaybeScheduleFlushOrCompaction() {
+ mutex_.AssertHeld();
+ if (!opened_successfully_) {
+ // Compaction may introduce a data race with DB open
+ return;
+ }
+ if (bg_work_paused_ > 0) {
+ // we paused the background work
+ return;
+ } else if (error_handler_.IsBGWorkStopped() &&
+ !error_handler_.IsRecoveryInProgress()) {
+ // There has been a hard error and this call is not part of the recovery
+ // sequence. Bail out here so we don't get into an endless loop of
+ // scheduling BG work which will again call this function
+ return;
+ } else if (shutting_down_.load(std::memory_order_acquire)) {
+ // DB is being deleted; no more background compactions
+ return;
+ }
+ auto bg_job_limits = GetBGJobLimits();
+ bool is_flush_pool_empty =
+ env_->GetBackgroundThreads(Env::Priority::HIGH) == 0;
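+ // If the high-priority pool has threads, schedule as many pending flushes
+ // there as the flush job limit allows.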
+ while (!is_flush_pool_empty && unscheduled_flushes_ > 0 &&
+ bg_flush_scheduled_ < bg_job_limits.max_flushes) {
+ bg_flush_scheduled_++;
+ FlushThreadArg* fta = new FlushThreadArg;
+ fta->db_ = this;
+ fta->thread_pri_ = Env::Priority::HIGH;
+ env_->Schedule(&DBImpl::BGWorkFlush, fta, Env::Priority::HIGH, this,
+ &DBImpl::UnscheduleFlushCallback);
+ --unscheduled_flushes_;
+ TEST_SYNC_POINT_CALLBACK(
+ "DBImpl::MaybeScheduleFlushOrCompaction:AfterSchedule:0",
+ &unscheduled_flushes_);
+ }
+
+ // special case -- if high-pri (flush) thread pool is empty, then schedule
+ // flushes in low-pri (compaction) thread pool.
+ if (is_flush_pool_empty) {
+ while (unscheduled_flushes_ > 0 &&
+ bg_flush_scheduled_ + bg_compaction_scheduled_ <
+ bg_job_limits.max_flushes) {
+ bg_flush_scheduled_++;
+ FlushThreadArg* fta = new FlushThreadArg;
+ fta->db_ = this;
+ fta->thread_pri_ = Env::Priority::LOW;
+ env_->Schedule(&DBImpl::BGWorkFlush, fta, Env::Priority::LOW, this,
+ &DBImpl::UnscheduleFlushCallback);
+ --unscheduled_flushes_;
+ }
+ }
+
+ if (bg_compaction_paused_ > 0) {
+ // we paused the background compaction
+ return;
+ } else if (error_handler_.IsBGWorkStopped()) {
+ // Compaction is not part of the recovery sequence from a hard error. We
+ // might get here because recovery might do a flush and install a new
+ // super version, which will try to schedule pending compactions. Bail
+ // out here and let the higher level recovery handle compactions
+ return;
+ }
+
+ if (HasExclusiveManualCompaction()) {
+ // only manual compactions are allowed to run. don't schedule automatic
+ // compactions
+ TEST_SYNC_POINT("DBImpl::MaybeScheduleFlushOrCompaction:Conflict");
+ return;
+ }
+
+ while (bg_compaction_scheduled_ + bg_bottom_compaction_scheduled_ <
+ bg_job_limits.max_compactions &&
+ unscheduled_compactions_ > 0) {
+ CompactionArg* ca = new CompactionArg;
+ ca->db = this;
+ ca->compaction_pri_ = Env::Priority::LOW;
+ ca->prepicked_compaction = nullptr;
+ bg_compaction_scheduled_++;
+ unscheduled_compactions_--;
+ env_->Schedule(&DBImpl::BGWorkCompaction, ca, Env::Priority::LOW, this,
+ &DBImpl::UnscheduleCompactionCallback);
+ }
+}
+
+DBImpl::BGJobLimits DBImpl::GetBGJobLimits() const {
+ mutex_.AssertHeld();
+ return GetBGJobLimits(mutable_db_options_.max_background_flushes,
+ mutable_db_options_.max_background_compactions,
+ mutable_db_options_.max_background_jobs,
+ write_controller_.NeedSpeedupCompaction());
+}
+
+DBImpl::BGJobLimits DBImpl::GetBGJobLimits(int max_background_flushes,
+ int max_background_compactions,
+ int max_background_jobs,
+ bool parallelize_compactions) {
+ BGJobLimits res;
+ if (max_background_flushes == -1 && max_background_compactions == -1) {
+ // For our first stab at implementing max_background_jobs, simply allocate a
+ // quarter of the threads to flushes.
+ res.max_flushes = std::max(1, max_background_jobs / 4);
+ res.max_compactions = std::max(1, max_background_jobs - res.max_flushes);
+ } else {
+ // compatibility code in case users haven't migrated to max_background_jobs,
+ // which automatically computes flush/compaction limits
+ res.max_flushes = std::max(1, max_background_flushes);
+ res.max_compactions = std::max(1, max_background_compactions);
+ }
+ if (!parallelize_compactions) {
+ // throttle background compactions until we deem it necessary
+ res.max_compactions = 1;
+ }
+ return res;
+}
+
+void DBImpl::AddToCompactionQueue(ColumnFamilyData* cfd) {
+ assert(!cfd->queued_for_compaction());
+ cfd->Ref();
+ compaction_queue_.push_back(cfd);
+ cfd->set_queued_for_compaction(true);
+}
+
+ColumnFamilyData* DBImpl::PopFirstFromCompactionQueue() {
+ assert(!compaction_queue_.empty());
+ auto cfd = *compaction_queue_.begin();
+ compaction_queue_.pop_front();
+ assert(cfd->queued_for_compaction());
+ cfd->set_queued_for_compaction(false);
+ return cfd;
+}
+
+DBImpl::FlushRequest DBImpl::PopFirstFromFlushQueue() {
+ assert(!flush_queue_.empty());
+ FlushRequest flush_req = flush_queue_.front();
+ flush_queue_.pop_front();
+ if (!immutable_db_options_.atomic_flush) {
+ assert(flush_req.size() == 1);
+ }
+ for (const auto& elem : flush_req) {
+ if (!immutable_db_options_.atomic_flush) {
+ ColumnFamilyData* cfd = elem.first;
+ assert(cfd);
+ assert(cfd->queued_for_flush());
+ cfd->set_queued_for_flush(false);
+ }
+ }
+ // TODO: need to unset flush reason?
+ return flush_req;
+}
+
+ColumnFamilyData* DBImpl::PickCompactionFromQueue(
+ std::unique_ptr<TaskLimiterToken>* token, LogBuffer* log_buffer) {
+ assert(!compaction_queue_.empty());
+ assert(*token == nullptr);
+ autovector<ColumnFamilyData*> throttled_candidates;
+ ColumnFamilyData* cfd = nullptr;
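+ // Pop column families off the queue until one acquires a compaction token
+ // from the thread limiter; throttled candidates are re-queued below.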
+ while (!compaction_queue_.empty()) {
+ auto first_cfd = *compaction_queue_.begin();
+ compaction_queue_.pop_front();
+ assert(first_cfd->queued_for_compaction());
+ if (!RequestCompactionToken(first_cfd, false, token, log_buffer)) {
+ throttled_candidates.push_back(first_cfd);
+ continue;
+ }
+ cfd = first_cfd;
+ cfd->set_queued_for_compaction(false);
+ break;
+ }
+ // Add throttled compaction candidates back to queue in the original order.
+ for (auto iter = throttled_candidates.rbegin();
+ iter != throttled_candidates.rend(); ++iter) {
+ compaction_queue_.push_front(*iter);
+ }
+ return cfd;
+}
+
+void DBImpl::SchedulePendingFlush(const FlushRequest& flush_req,
+ FlushReason flush_reason) {
+ mutex_.AssertHeld();
+ if (flush_req.empty()) {
+ return;
+ }
+ if (!immutable_db_options_.atomic_flush) {
+ // For the non-atomic flush case, we never schedule multiple column
+ // families in the same flush request.
+ assert(flush_req.size() == 1);
+ ColumnFamilyData* cfd = flush_req[0].first;
+ assert(cfd);
+
+ if (!cfd->queued_for_flush() && cfd->imm()->IsFlushPending()) {
+ cfd->Ref();
+ cfd->set_queued_for_flush(true);
+ cfd->SetFlushReason(flush_reason);
+ ++unscheduled_flushes_;
+ flush_queue_.push_back(flush_req);
+ }
+ } else {
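+ // Atomic flush: all column families in the request are scheduled together
+ // as a single flush unit.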
+ for (auto& iter : flush_req) {
+ ColumnFamilyData* cfd = iter.first;
+ cfd->Ref();
+ cfd->SetFlushReason(flush_reason);
+ }
+ ++unscheduled_flushes_;
+ flush_queue_.push_back(flush_req);
+ }
+}
+
+void DBImpl::SchedulePendingCompaction(ColumnFamilyData* cfd) {
+ mutex_.AssertHeld();
+ if (!cfd->queued_for_compaction() && cfd->NeedsCompaction()) {
+ AddToCompactionQueue(cfd);
+ ++unscheduled_compactions_;
+ }
+}
+
+void DBImpl::SchedulePendingPurge(std::string fname, std::string dir_to_sync,
+ FileType type, uint64_t number, int job_id) {
+ mutex_.AssertHeld();
+ PurgeFileInfo file_info(fname, dir_to_sync, type, number, job_id);
+ purge_files_.insert({{number, std::move(file_info)}});
+}
+
+void DBImpl::BGWorkFlush(void* arg) {
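+ // Take ownership of the heap-allocated argument: copy it, then free it
+ // before running the flush.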
+ FlushThreadArg fta = *(reinterpret_cast<FlushThreadArg*>(arg));
+ delete reinterpret_cast<FlushThreadArg*>(arg);
+
+ IOSTATS_SET_THREAD_POOL_ID(fta.thread_pri_);
+ TEST_SYNC_POINT("DBImpl::BGWorkFlush");
+ static_cast_with_check<DBImpl>(fta.db_)->BackgroundCallFlush(fta.thread_pri_);
+ TEST_SYNC_POINT("DBImpl::BGWorkFlush:done");
+}
+
+void DBImpl::BGWorkCompaction(void* arg) {
+ CompactionArg ca = *(reinterpret_cast<CompactionArg*>(arg));
+ delete reinterpret_cast<CompactionArg*>(arg);
+ IOSTATS_SET_THREAD_POOL_ID(Env::Priority::LOW);
+ TEST_SYNC_POINT("DBImpl::BGWorkCompaction");
+ auto prepicked_compaction =
+ static_cast<PrepickedCompaction*>(ca.prepicked_compaction);
+ static_cast_with_check<DBImpl>(ca.db)->BackgroundCallCompaction(
+ prepicked_compaction, Env::Priority::LOW);
+ delete prepicked_compaction;
+}
+
+void DBImpl::BGWorkBottomCompaction(void* arg) {
+ CompactionArg ca = *(static_cast<CompactionArg*>(arg));
+ delete static_cast<CompactionArg*>(arg);
+ IOSTATS_SET_THREAD_POOL_ID(Env::Priority::BOTTOM);
+ TEST_SYNC_POINT("DBImpl::BGWorkBottomCompaction");
+ auto* prepicked_compaction = ca.prepicked_compaction;
+ assert(prepicked_compaction && prepicked_compaction->compaction);
+ ca.db->BackgroundCallCompaction(prepicked_compaction, Env::Priority::BOTTOM);
+ delete prepicked_compaction;
+}
+
+void DBImpl::BGWorkPurge(void* db) {
+ IOSTATS_SET_THREAD_POOL_ID(Env::Priority::HIGH);
+ TEST_SYNC_POINT("DBImpl::BGWorkPurge:start");
+ reinterpret_cast<DBImpl*>(db)->BackgroundCallPurge();
+ TEST_SYNC_POINT("DBImpl::BGWorkPurge:end");
+}
+
+void DBImpl::UnscheduleCompactionCallback(void* arg) {
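+ // Invoked when a scheduled compaction job is unscheduled from the thread
+ // pool without running: undo the scheduling bookkeeping and clean up the
+ // prepicked compaction, marking any manual compaction as paused.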
+ CompactionArg* ca_ptr = reinterpret_cast<CompactionArg*>(arg);
+ Env::Priority compaction_pri = ca_ptr->compaction_pri_;
+ if (Env::Priority::BOTTOM == compaction_pri) {
+ // Decrement bg_bottom_compaction_scheduled_ if priority is BOTTOM
+ ca_ptr->db->bg_bottom_compaction_scheduled_--;
+ } else if (Env::Priority::LOW == compaction_pri) {
+ // Decrement bg_compaction_scheduled_ if priority is LOW
+ ca_ptr->db->bg_compaction_scheduled_--;
+ }
+ CompactionArg ca = *(ca_ptr);
+ delete reinterpret_cast<CompactionArg*>(arg);
+ if (ca.prepicked_compaction != nullptr) {
+ // if it's a manual compaction, set status to ManualCompactionPaused
+ if (ca.prepicked_compaction->manual_compaction_state) {
+ ca.prepicked_compaction->manual_compaction_state->done = true;
+ ca.prepicked_compaction->manual_compaction_state->status =
+ Status::Incomplete(Status::SubCode::kManualCompactionPaused);
+ }
+ if (ca.prepicked_compaction->compaction != nullptr) {
+ ca.prepicked_compaction->compaction->ReleaseCompactionFiles(
+ Status::Incomplete(Status::SubCode::kManualCompactionPaused));
+ delete ca.prepicked_compaction->compaction;
+ }
+ delete ca.prepicked_compaction;
+ }
+ TEST_SYNC_POINT("DBImpl::UnscheduleCompactionCallback");
+}
+
+void DBImpl::UnscheduleFlushCallback(void* arg) {
+ // Decrement bg_flush_scheduled_ in flush callback
+ reinterpret_cast<FlushThreadArg*>(arg)->db_->bg_flush_scheduled_--;
+ Env::Priority flush_pri = reinterpret_cast<FlushThreadArg*>(arg)->thread_pri_;
+ if (Env::Priority::LOW == flush_pri) {
+ TEST_SYNC_POINT("DBImpl::UnscheduleLowFlushCallback");
+ } else if (Env::Priority::HIGH == flush_pri) {
+ TEST_SYNC_POINT("DBImpl::UnscheduleHighFlushCallback");
+ }
+ delete reinterpret_cast<FlushThreadArg*>(arg);
+ TEST_SYNC_POINT("DBImpl::UnscheduleFlushCallback");
+}
+
+Status DBImpl::BackgroundFlush(bool* made_progress, JobContext* job_context,
+ LogBuffer* log_buffer, FlushReason* reason,
+ Env::Priority thread_pri) {
+ mutex_.AssertHeld();
+
+ Status status;
+ *reason = FlushReason::kOthers;
+ // If BG work is stopped due to an error, but a recovery is in progress,
+ // that means this flush is part of the recovery. So allow it to go through
+ if (!error_handler_.IsBGWorkStopped()) {
+ if (shutting_down_.load(std::memory_order_acquire)) {
+ status = Status::ShutdownInProgress();
+ }
+ } else if (!error_handler_.IsRecoveryInProgress()) {
+ status = error_handler_.GetBGError();
+ }
+
+ if (!status.ok()) {
+ return status;
+ }
+
+ autovector<BGFlushArg> bg_flush_args;
+ std::vector<SuperVersionContext>& superversion_contexts =
+ job_context->superversion_contexts;
+ autovector<ColumnFamilyData*> column_families_not_to_flush;
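+ // Pop flush requests until one yields at least one column family that can
+ // actually be flushed; dropped or non-pending ones are unreferenced below.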
+ while (!flush_queue_.empty()) {
+ // This cfd is already referenced
+ const FlushRequest& flush_req = PopFirstFromFlushQueue();
+ superversion_contexts.clear();
+ superversion_contexts.reserve(flush_req.size());
+
+ for (const auto& iter : flush_req) {
+ ColumnFamilyData* cfd = iter.first;
+ if (cfd->GetMempurgeUsed()) {
+ // If imm() contains silent memtables (e.g., because
+ // MemPurge was activated), requesting a flush will
+ // mark imm_needed as true.
+ cfd->imm()->FlushRequested();
+ }
+
+ if (cfd->IsDropped() || !cfd->imm()->IsFlushPending()) {
+ // can't flush this CF, try next one
+ column_families_not_to_flush.push_back(cfd);
+ continue;
+ }
+ superversion_contexts.emplace_back(SuperVersionContext(true));
+ bg_flush_args.emplace_back(cfd, iter.second,
+ &(superversion_contexts.back()));
+ }
+ if (!bg_flush_args.empty()) {
+ break;
+ }
+ }
+
+ if (!bg_flush_args.empty()) {
+ auto bg_job_limits = GetBGJobLimits();
+ for (const auto& arg : bg_flush_args) {
+ ColumnFamilyData* cfd = arg.cfd_;
+ ROCKS_LOG_BUFFER(
+ log_buffer,
+ "Calling FlushMemTableToOutputFile with column "
+ "family [%s], flush slots available %d, compaction slots available "
+ "%d, "
+ "flush slots scheduled %d, compaction slots scheduled %d",
+ cfd->GetName().c_str(), bg_job_limits.max_flushes,
+ bg_job_limits.max_compactions, bg_flush_scheduled_,
+ bg_compaction_scheduled_);
+ }
+ status = FlushMemTablesToOutputFiles(bg_flush_args, made_progress,
+ job_context, log_buffer, thread_pri);
+ TEST_SYNC_POINT("DBImpl::BackgroundFlush:BeforeFlush");
+ // All the CFDs in the FlushReq must have the same flush reason, so just
+ // grab the first one
+ *reason = bg_flush_args[0].cfd_->GetFlushReason();
+ for (auto& arg : bg_flush_args) {
+ ColumnFamilyData* cfd = arg.cfd_;
+ if (cfd->UnrefAndTryDelete()) {
+ arg.cfd_ = nullptr;
+ }
+ }
+ }
+ for (auto cfd : column_families_not_to_flush) {
+ cfd->UnrefAndTryDelete();
+ }
+ return status;
+}
+
+void DBImpl::BackgroundCallFlush(Env::Priority thread_pri) {
+ bool made_progress = false;
+ JobContext job_context(next_job_id_.fetch_add(1), true);
+
+ TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCallFlush:start", nullptr);
+
+ LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL,
+ immutable_db_options_.info_log.get());
+ TEST_SYNC_POINT("DBImpl::BackgroundCallFlush:Start:1");
+ TEST_SYNC_POINT("DBImpl::BackgroundCallFlush:Start:2");
+ {
+ InstrumentedMutexLock l(&mutex_);
+ assert(bg_flush_scheduled_);
+ num_running_flushes_++;
+
+ std::unique_ptr<std::list<uint64_t>::iterator>
+ pending_outputs_inserted_elem(new std::list<uint64_t>::iterator(
+ CaptureCurrentFileNumberInPendingOutputs()));
+ FlushReason reason;
+
+ Status s = BackgroundFlush(&made_progress, &job_context, &log_buffer,
+ &reason, thread_pri);
+ if (!s.ok() && !s.IsShutdownInProgress() && !s.IsColumnFamilyDropped() &&
+ reason != FlushReason::kErrorRecovery) {
+ // Wait a little bit before retrying background flush in
+ // case this is an environmental problem and we do not want to
+ // chew up resources for failed flushes for the duration of
+ // the problem.
+ uint64_t error_cnt =
+ default_cf_internal_stats_->BumpAndGetBackgroundErrorCount();
+ bg_cv_.SignalAll(); // In case a waiter can proceed despite the error
+ mutex_.Unlock();
+ ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+ "Waiting after background flush error: %s, "
+ "Accumulated background error counts: %" PRIu64,
+ s.ToString().c_str(), error_cnt);
+ log_buffer.FlushBufferToLog();
+ LogFlush(immutable_db_options_.info_log);
+ immutable_db_options_.clock->SleepForMicroseconds(1000000);
+ mutex_.Lock();
+ }
+
+ TEST_SYNC_POINT("DBImpl::BackgroundCallFlush:FlushFinish:0");
+ ReleaseFileNumberFromPendingOutputs(pending_outputs_inserted_elem);
+
+ // If flush failed, we want to delete all temporary files that we might have
+ // created. Thus, we force full scan in FindObsoleteFiles()
+ FindObsoleteFiles(&job_context, !s.ok() && !s.IsShutdownInProgress() &&
+ !s.IsColumnFamilyDropped());
+ // delete unnecessary files if any, this is done outside the mutex
+ if (job_context.HaveSomethingToClean() ||
+ job_context.HaveSomethingToDelete() || !log_buffer.IsEmpty()) {
+ mutex_.Unlock();
+ TEST_SYNC_POINT("DBImpl::BackgroundCallFlush:FilesFound");
+ // Have to flush the info logs before bg_flush_scheduled_--
+ // because if bg_flush_scheduled_ becomes 0 and the lock is
+ // released, the destructor of DB can kick in and destroy all the
+ // state of the DB, so info_log might not be available after that point.
+ // The same applies to accessing other state that the DB owns.
+ log_buffer.FlushBufferToLog();
+ if (job_context.HaveSomethingToDelete()) {
+ PurgeObsoleteFiles(job_context);
+ }
+ job_context.Clean();
+ mutex_.Lock();
+ }
+ TEST_SYNC_POINT("DBImpl::BackgroundCallFlush:ContextCleanedUp");
+
+ assert(num_running_flushes_ > 0);
+ num_running_flushes_--;
+ bg_flush_scheduled_--;
+ // See if there's more work to be done
+ MaybeScheduleFlushOrCompaction();
+ atomic_flush_install_cv_.SignalAll();
+ bg_cv_.SignalAll();
+ // IMPORTANT: there should be no code after calling SignalAll. This call may
+ // signal the DB destructor that it's OK to proceed with destruction. In
+ // that case, all DB variables will be deallocated and referencing them
+ // will cause trouble.
+ }
+}
+
+void DBImpl::BackgroundCallCompaction(PrepickedCompaction* prepicked_compaction,
+ Env::Priority bg_thread_pri) {
+ bool made_progress = false;
+ JobContext job_context(next_job_id_.fetch_add(1), true);
+ TEST_SYNC_POINT("BackgroundCallCompaction:0");
+ LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL,
+ immutable_db_options_.info_log.get());
+ {
+ InstrumentedMutexLock l(&mutex_);
+
+ // This call will unlock/lock the mutex to wait for current running
+ // IngestExternalFile() calls to finish.
+ WaitForIngestFile();
+
+ num_running_compactions_++;
+
+ std::unique_ptr<std::list<uint64_t>::iterator>
+ pending_outputs_inserted_elem(new std::list<uint64_t>::iterator(
+ CaptureCurrentFileNumberInPendingOutputs()));
+
+ assert((bg_thread_pri == Env::Priority::BOTTOM &&
+ bg_bottom_compaction_scheduled_) ||
+ (bg_thread_pri == Env::Priority::LOW && bg_compaction_scheduled_));
+ Status s = BackgroundCompaction(&made_progress, &job_context, &log_buffer,
+ prepicked_compaction, bg_thread_pri);
+ TEST_SYNC_POINT("BackgroundCallCompaction:1");
+ if (s.IsBusy()) {
+ bg_cv_.SignalAll(); // In case a waiter can proceed despite the error
+ mutex_.Unlock();
+ immutable_db_options_.clock->SleepForMicroseconds(
+ 10000); // prevent hot loop
+ mutex_.Lock();
+ } else if (!s.ok() && !s.IsShutdownInProgress() &&
+ !s.IsManualCompactionPaused() && !s.IsColumnFamilyDropped()) {
+ // Wait a little bit before retrying background compaction in
+ // case this is an environmental problem and we do not want to
+ // chew up resources for failed compactions for the duration of
+ // the problem.
+ uint64_t error_cnt =
+ default_cf_internal_stats_->BumpAndGetBackgroundErrorCount();
+ bg_cv_.SignalAll(); // In case a waiter can proceed despite the error
+ mutex_.Unlock();
+ log_buffer.FlushBufferToLog();
+ ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+ "Waiting after background compaction error: %s, "
+ "Accumulated background error counts: %" PRIu64,
+ s.ToString().c_str(), error_cnt);
+ LogFlush(immutable_db_options_.info_log);
+ immutable_db_options_.clock->SleepForMicroseconds(1000000);
+ mutex_.Lock();
+ } else if (s.IsManualCompactionPaused()) {
+ assert(prepicked_compaction);
+ ManualCompactionState* m = prepicked_compaction->manual_compaction_state;
+ assert(m);
+ ROCKS_LOG_BUFFER(&log_buffer, "[%s] [JOB %d] Manual compaction paused",
+ m->cfd->GetName().c_str(), job_context.job_id);
+ }
+
+ ReleaseFileNumberFromPendingOutputs(pending_outputs_inserted_elem);
+
+ // If compaction failed, we want to delete all temporary files that we
+ // might have created (they might not be all recorded in job_context in
+ // case of a failure). Thus, we force full scan in FindObsoleteFiles()
+ FindObsoleteFiles(&job_context, !s.ok() && !s.IsShutdownInProgress() &&
+ !s.IsManualCompactionPaused() &&
+ !s.IsColumnFamilyDropped() &&
+ !s.IsBusy());
+ TEST_SYNC_POINT("DBImpl::BackgroundCallCompaction:FoundObsoleteFiles");
+
+ // delete unnecessary files if any, this is done outside the mutex
+ if (job_context.HaveSomethingToClean() ||
+ job_context.HaveSomethingToDelete() || !log_buffer.IsEmpty()) {
+ mutex_.Unlock();
+ // Have to flush the info logs before bg_compaction_scheduled_--
+ // because if bg_compaction_scheduled_ becomes 0 and the lock is
+ // released, the destructor of DB can kick in and destroy all the
+ // state of the DB, so info_log might not be available after that point.
+ // The same applies to accessing other state that the DB owns.
+ log_buffer.FlushBufferToLog();
+ if (job_context.HaveSomethingToDelete()) {
+ PurgeObsoleteFiles(job_context);
+ TEST_SYNC_POINT("DBImpl::BackgroundCallCompaction:PurgedObsoleteFiles");
+ }
+ job_context.Clean();
+ mutex_.Lock();
+ }
+
+ assert(num_running_compactions_ > 0);
+ num_running_compactions_--;
+
+ if (bg_thread_pri == Env::Priority::LOW) {
+ bg_compaction_scheduled_--;
+ } else {
+ assert(bg_thread_pri == Env::Priority::BOTTOM);
+ bg_bottom_compaction_scheduled_--;
+ }
+
+ // See if there's more work to be done
+ MaybeScheduleFlushOrCompaction();
+
+ if (prepicked_compaction != nullptr &&
+ prepicked_compaction->task_token != nullptr) {
+ // Releasing task tokens affects (and asserts on) the DB state, so
+ // must be done before we potentially signal the DB close process to
+ // proceed below.
+ prepicked_compaction->task_token.reset();
+ }
+
+ if (made_progress ||
+ (bg_compaction_scheduled_ == 0 &&
+ bg_bottom_compaction_scheduled_ == 0) ||
+ HasPendingManualCompaction() || unscheduled_compactions_ == 0) {
+ // signal if
+ // * made_progress -- need to wake up DelayWrite
+ // * bg_{bottom,}_compaction_scheduled_ == 0 -- need to wake up ~DBImpl
+ // * HasPendingManualCompaction -- need to wake up RunManualCompaction
+ // If none of this is true, there is no need to signal since nobody is
+ // waiting for it
+ bg_cv_.SignalAll();
+ }
+ // IMPORTANT: there should be no code after calling SignalAll. This call may
+ // signal the DB destructor that it's OK to proceed with destruction. In
+ // that case, all DB variables will be deallocated and referencing them
+ // will cause trouble.
+ }
+}
+
+Status DBImpl::BackgroundCompaction(bool* made_progress,
+ JobContext* job_context,
+ LogBuffer* log_buffer,
+ PrepickedCompaction* prepicked_compaction,
+ Env::Priority thread_pri) {
+ ManualCompactionState* manual_compaction =
+ prepicked_compaction == nullptr
+ ? nullptr
+ : prepicked_compaction->manual_compaction_state;
+ *made_progress = false;
+ mutex_.AssertHeld();
+ TEST_SYNC_POINT("DBImpl::BackgroundCompaction:Start");
+
+ bool is_manual = (manual_compaction != nullptr);
+ std::unique_ptr<Compaction> c;
+ if (prepicked_compaction != nullptr &&
+ prepicked_compaction->compaction != nullptr) {
+ c.reset(prepicked_compaction->compaction);
+ }
+ bool is_prepicked = is_manual || c;
+
+ // (manual_compaction->in_progress == false);
+ bool trivial_move_disallowed =
+ is_manual && manual_compaction->disallow_trivial_move;
+
+ CompactionJobStats compaction_job_stats;
+ Status status;
+ if (!error_handler_.IsBGWorkStopped()) {
+ if (shutting_down_.load(std::memory_order_acquire)) {
+ status = Status::ShutdownInProgress();
+ } else if (is_manual &&
+ manual_compaction->canceled.load(std::memory_order_acquire)) {
+ status = Status::Incomplete(Status::SubCode::kManualCompactionPaused);
+ }
+ } else {
+ status = error_handler_.GetBGError();
+ // If we get here, it means a hard error happened after this compaction
+ // was scheduled by MaybeScheduleFlushOrCompaction(), but before it got
+ // a chance to execute. Since we didn't pop a cfd from the compaction
+ // queue, increment unscheduled_compactions_
+ unscheduled_compactions_++;
+ }
+
+ if (!status.ok()) {
+ if (is_manual) {
+ manual_compaction->status = status;
+ manual_compaction->done = true;
+ manual_compaction->in_progress = false;
+ manual_compaction = nullptr;
+ }
+ if (c) {
+ c->ReleaseCompactionFiles(status);
+ c.reset();
+ }
+ return status;
+ }
+
+ if (is_manual) {
+ // another thread cannot pick up the same work
+ manual_compaction->in_progress = true;
+ }
+
+ TEST_SYNC_POINT("DBImpl::BackgroundCompaction:InProgress");
+
+ std::unique_ptr<TaskLimiterToken> task_token;
+
+ // InternalKey manual_end_storage;
+ // InternalKey* manual_end = &manual_end_storage;
+ bool sfm_reserved_compact_space = false;
+ if (is_manual) {
+ ManualCompactionState* m = manual_compaction;
+ assert(m->in_progress);
+ if (!c) {
+ m->done = true;
+ m->manual_end = nullptr;
+ ROCKS_LOG_BUFFER(
+ log_buffer,
+ "[%s] Manual compaction from level-%d from %s .. "
+ "%s; nothing to do\n",
+ m->cfd->GetName().c_str(), m->input_level,
+ (m->begin ? m->begin->DebugString(true).c_str() : "(begin)"),
+ (m->end ? m->end->DebugString(true).c_str() : "(end)"));
+ } else {
+ // First check if we have enough room to do the compaction
+ bool enough_room = EnoughRoomForCompaction(
+ m->cfd, *(c->inputs()), &sfm_reserved_compact_space, log_buffer);
+
+ if (!enough_room) {
+ // Then don't do the compaction
+ c->ReleaseCompactionFiles(status);
+ c.reset();
+ // m's vars will get set properly at the end of this function,
+ // as long as status == CompactionTooLarge
+ status = Status::CompactionTooLarge();
+ } else {
+ ROCKS_LOG_BUFFER(
+ log_buffer,
+ "[%s] Manual compaction from level-%d to level-%d from %s .. "
+ "%s; will stop at %s\n",
+ m->cfd->GetName().c_str(), m->input_level, c->output_level(),
+ (m->begin ? m->begin->DebugString(true).c_str() : "(begin)"),
+ (m->end ? m->end->DebugString(true).c_str() : "(end)"),
+ ((m->done || m->manual_end == nullptr)
+ ? "(end)"
+ : m->manual_end->DebugString(true).c_str()));
+ }
+ }
+ } else if (!is_prepicked && !compaction_queue_.empty()) {
+ if (HasExclusiveManualCompaction()) {
+ // Can't compact right now, but try again later
+ TEST_SYNC_POINT("DBImpl::BackgroundCompaction()::Conflict");
+
+ // Stay in the compaction queue.
+ unscheduled_compactions_++;
+
+ return Status::OK();
+ }
+
+ auto cfd = PickCompactionFromQueue(&task_token, log_buffer);
+ if (cfd == nullptr) {
+ // Can't find any executable task from the compaction queue.
+ // All tasks have been throttled by compaction thread limiter.
+ ++unscheduled_compactions_;
+ return Status::Busy();
+ }
+
+ // We unreference here because the following code will take a Ref() on
+ // this cfd if it is going to use it (Compaction class holds a
+ // reference).
+ // This will all happen under a mutex so we don't have to be afraid of
+ // somebody else deleting it.
+ if (cfd->UnrefAndTryDelete()) {
+ // This was the last reference of the column family, so no need to
+ // compact.
+ return Status::OK();
+ }
+
+ // Pick up latest mutable CF Options and use it throughout the
+ // compaction job
+ // Compaction makes a copy of the latest MutableCFOptions. It should be used
+ // throughout the compaction procedure to ensure consistency. It will
+ // eventually be installed into the SuperVersion.
+ auto* mutable_cf_options = cfd->GetLatestMutableCFOptions();
+ if (!mutable_cf_options->disable_auto_compactions && !cfd->IsDropped()) {
+ // NOTE: try to avoid unnecessary copy of MutableCFOptions if
+ // compaction is not necessary. Need to make sure mutex is held
+ // until we make a copy in the following code
+ TEST_SYNC_POINT("DBImpl::BackgroundCompaction():BeforePickCompaction");
+ c.reset(cfd->PickCompaction(*mutable_cf_options, mutable_db_options_,
+ log_buffer));
+ TEST_SYNC_POINT("DBImpl::BackgroundCompaction():AfterPickCompaction");
+
+ if (c != nullptr) {
+ bool enough_room = EnoughRoomForCompaction(
+ cfd, *(c->inputs()), &sfm_reserved_compact_space, log_buffer);
+
+ if (!enough_room) {
+ // Then don't do the compaction
+ c->ReleaseCompactionFiles(status);
+ c->column_family_data()
+ ->current()
+ ->storage_info()
+ ->ComputeCompactionScore(*(c->immutable_options()),
+ *(c->mutable_cf_options()));
+ AddToCompactionQueue(cfd);
+ ++unscheduled_compactions_;
+
+ c.reset();
+ // Don't need to sleep here, because BackgroundCallCompaction
+ // will sleep if !s.ok()
+ status = Status::CompactionTooLarge();
+ } else {
+ // update statistics
+ size_t num_files = 0;
+ for (auto& each_level : *c->inputs()) {
+ num_files += each_level.files.size();
+ }
+ RecordInHistogram(stats_, NUM_FILES_IN_SINGLE_COMPACTION, num_files);
+
+ // There are three things that can change compaction score:
+ // 1) When flush or compaction finish. This case is covered by
+ // InstallSuperVersionAndScheduleWork
+ // 2) When MutableCFOptions changes. This case is also covered by
+ // InstallSuperVersionAndScheduleWork, because this is when the new
+ // options take effect.
+ // 3) When we Pick a new compaction, we "remove" those files being
+ // compacted from the calculation, which then influences compaction
+ // score. Here we check if we need the new compaction even without the
+ // files that are currently being compacted. If we need another
+ // compaction, we might be able to execute it in parallel, so we add
+ // it to the queue and schedule a new thread.
+ if (cfd->NeedsCompaction()) {
+ // Yes, we need more compactions!
+ AddToCompactionQueue(cfd);
+ ++unscheduled_compactions_;
+ MaybeScheduleFlushOrCompaction();
+ }
+ }
+ }
+ }
+ }
+
+ IOStatus io_s;
+ if (!c) {
+ // Nothing to do
+ ROCKS_LOG_BUFFER(log_buffer, "Compaction nothing to do");
+ } else if (c->deletion_compaction()) {
+ // TODO(icanadi) Do we want to honor snapshots here? i.e. not delete old
+ // file if there is a live snapshot pointing to it
+ TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:BeforeCompaction",
+ c->column_family_data());
+ assert(c->num_input_files(1) == 0);
+ assert(c->column_family_data()->ioptions()->compaction_style ==
+ kCompactionStyleFIFO);
+
+ compaction_job_stats.num_input_files = c->num_input_files(0);
+
+ NotifyOnCompactionBegin(c->column_family_data(), c.get(), status,
+ compaction_job_stats, job_context->job_id);
+
+ for (const auto& f : *c->inputs(0)) {
+ c->edit()->DeleteFile(c->level(), f->fd.GetNumber());
+ }
+ status = versions_->LogAndApply(c->column_family_data(),
+ *c->mutable_cf_options(), c->edit(),
+ &mutex_, directories_.GetDbDir());
+ io_s = versions_->io_status();
+ InstallSuperVersionAndScheduleWork(c->column_family_data(),
+ &job_context->superversion_contexts[0],
+ *c->mutable_cf_options());
+ ROCKS_LOG_BUFFER(log_buffer, "[%s] Deleted %d files\n",
+ c->column_family_data()->GetName().c_str(),
+ c->num_input_files(0));
+ *made_progress = true;
+ TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:AfterCompaction",
+ c->column_family_data());
+ } else if (!trivial_move_disallowed && c->IsTrivialMove()) {
+ TEST_SYNC_POINT("DBImpl::BackgroundCompaction:TrivialMove");
+ TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:BeforeCompaction",
+ c->column_family_data());
+ // Instrument for event update
+ // TODO(yhchiang): add op details for showing trivial-move.
+ ThreadStatusUtil::SetColumnFamily(
+ c->column_family_data(), c->column_family_data()->ioptions()->env,
+ immutable_db_options_.enable_thread_tracking);
+ ThreadStatusUtil::SetThreadOperation(ThreadStatus::OP_COMPACTION);
+
+ compaction_job_stats.num_input_files = c->num_input_files(0);
+
+ NotifyOnCompactionBegin(c->column_family_data(), c.get(), status,
+ compaction_job_stats, job_context->job_id);
+
+ // Move files to next level
+ int32_t moved_files = 0;
+ int64_t moved_bytes = 0;
+ for (unsigned int l = 0; l < c->num_input_levels(); l++) {
+ if (c->level(l) == c->output_level()) {
+ continue;
+ }
+ for (size_t i = 0; i < c->num_input_files(l); i++) {
+ FileMetaData* f = c->input(l, i);
+ c->edit()->DeleteFile(c->level(l), f->fd.GetNumber());
+ c->edit()->AddFile(
+ c->output_level(), f->fd.GetNumber(), f->fd.GetPathId(),
+ f->fd.GetFileSize(), f->smallest, f->largest, f->fd.smallest_seqno,
+ f->fd.largest_seqno, f->marked_for_compaction, f->temperature,
+ f->oldest_blob_file_number, f->oldest_ancester_time,
+ f->file_creation_time, f->file_checksum, f->file_checksum_func_name,
+ f->unique_id);
+
+ ROCKS_LOG_BUFFER(
+ log_buffer,
+ "[%s] Moving #%" PRIu64 " to level-%d %" PRIu64 " bytes\n",
+ c->column_family_data()->GetName().c_str(), f->fd.GetNumber(),
+ c->output_level(), f->fd.GetFileSize());
+ ++moved_files;
+ moved_bytes += f->fd.GetFileSize();
+ }
+ }
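+ // For round-robin compaction priority, record the updated compaction cursor
+ // so the next compaction at this level starts after the files just moved.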
+ if (c->compaction_reason() == CompactionReason::kLevelMaxLevelSize &&
+ c->immutable_options()->compaction_pri == kRoundRobin) {
+ int start_level = c->start_level();
+ if (start_level > 0) {
+ auto vstorage = c->input_version()->storage_info();
+ c->edit()->AddCompactCursor(
+ start_level,
+ vstorage->GetNextCompactCursor(start_level, c->num_input_files(0)));
+ }
+ }
+ status = versions_->LogAndApply(c->column_family_data(),
+ *c->mutable_cf_options(), c->edit(),
+ &mutex_, directories_.GetDbDir());
+ io_s = versions_->io_status();
+ // Use latest MutableCFOptions
+ InstallSuperVersionAndScheduleWork(c->column_family_data(),
+ &job_context->superversion_contexts[0],
+ *c->mutable_cf_options());
+
+ VersionStorageInfo::LevelSummaryStorage tmp;
+ c->column_family_data()->internal_stats()->IncBytesMoved(c->output_level(),
+ moved_bytes);
+ {
+ event_logger_.LogToBuffer(log_buffer)
+ << "job" << job_context->job_id << "event"
+ << "trivial_move"
+ << "destination_level" << c->output_level() << "files" << moved_files
+ << "total_files_size" << moved_bytes;
+ }
+ ROCKS_LOG_BUFFER(
+ log_buffer,
+ "[%s] Moved #%d files to level-%d %" PRIu64 " bytes %s: %s\n",
+ c->column_family_data()->GetName().c_str(), moved_files,
+ c->output_level(), moved_bytes, status.ToString().c_str(),
+ c->column_family_data()->current()->storage_info()->LevelSummary(&tmp));
+ *made_progress = true;
+
+ // Clear Instrument
+ ThreadStatusUtil::ResetThreadStatus();
+ TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:AfterCompaction",
+ c->column_family_data());
+ } else if (!is_prepicked && c->output_level() > 0 &&
+ c->output_level() ==
+ c->column_family_data()
+ ->current()
+ ->storage_info()
+ ->MaxOutputLevel(
+ immutable_db_options_.allow_ingest_behind) &&
+ env_->GetBackgroundThreads(Env::Priority::BOTTOM) > 0) {
+ // Forward compactions involving last level to the bottom pool if it exists,
+ // such that compactions unlikely to contribute to write stalls can be
+ // delayed or deprioritized.
+ TEST_SYNC_POINT("DBImpl::BackgroundCompaction:ForwardToBottomPriPool");
+ CompactionArg* ca = new CompactionArg;
+ ca->db = this;
+ ca->compaction_pri_ = Env::Priority::BOTTOM;
+ ca->prepicked_compaction = new PrepickedCompaction;
+ ca->prepicked_compaction->compaction = c.release();
+ ca->prepicked_compaction->manual_compaction_state = nullptr;
+ // Transfer the requested token so the bottom-priority job doesn't need to
+ // request it again.
+ ca->prepicked_compaction->task_token = std::move(task_token);
+ ++bg_bottom_compaction_scheduled_;
+ env_->Schedule(&DBImpl::BGWorkBottomCompaction, ca, Env::Priority::BOTTOM,
+ this, &DBImpl::UnscheduleCompactionCallback);
+ } else {
+ TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:BeforeCompaction",
+ c->column_family_data());
+ int output_level __attribute__((__unused__));
+ output_level = c->output_level();
+ TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:NonTrivial",
+ &output_level);
+ std::vector<SequenceNumber> snapshot_seqs;
+ SequenceNumber earliest_write_conflict_snapshot;
+ SnapshotChecker* snapshot_checker;
+ GetSnapshotContext(job_context, &snapshot_seqs,
+ &earliest_write_conflict_snapshot, &snapshot_checker);
+ assert(is_snapshot_supported_ || snapshots_.empty());
+
+ CompactionJob compaction_job(
+ job_context->job_id, c.get(), immutable_db_options_,
+ mutable_db_options_, file_options_for_compaction_, versions_.get(),
+ &shutting_down_, log_buffer, directories_.GetDbDir(),
+ GetDataDir(c->column_family_data(), c->output_path_id()),
+ GetDataDir(c->column_family_data(), 0), stats_, &mutex_,
+ &error_handler_, snapshot_seqs, earliest_write_conflict_snapshot,
+ snapshot_checker, job_context, table_cache_, &event_logger_,
+ c->mutable_cf_options()->paranoid_file_checks,
+ c->mutable_cf_options()->report_bg_io_stats, dbname_,
+ &compaction_job_stats, thread_pri, io_tracer_,
+ is_manual ? manual_compaction->canceled
+ : kManualCompactionCanceledFalse_,
+ db_id_, db_session_id_, c->column_family_data()->GetFullHistoryTsLow(),
+ c->trim_ts(), &blob_callback_, &bg_compaction_scheduled_,
+ &bg_bottom_compaction_scheduled_);
+ compaction_job.Prepare();
+
+ NotifyOnCompactionBegin(c->column_family_data(), c.get(), status,
+ compaction_job_stats, job_context->job_id);
+ mutex_.Unlock();
+ TEST_SYNC_POINT_CALLBACK(
+ "DBImpl::BackgroundCompaction:NonTrivial:BeforeRun", nullptr);
+ // Should we handle the error?
+ compaction_job.Run().PermitUncheckedError();
+ TEST_SYNC_POINT("DBImpl::BackgroundCompaction:NonTrivial:AfterRun");
+ mutex_.Lock();
+
+ status = compaction_job.Install(*c->mutable_cf_options());
+ io_s = compaction_job.io_status();
+ if (status.ok()) {
+ InstallSuperVersionAndScheduleWork(c->column_family_data(),
+ &job_context->superversion_contexts[0],
+ *c->mutable_cf_options());
+ }
+ *made_progress = true;
+ TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:AfterCompaction",
+ c->column_family_data());
+ }
+
+ if (status.ok() && !io_s.ok()) {
+ status = io_s;
+ } else {
+ io_s.PermitUncheckedError();
+ }
+
+ if (c != nullptr) {
+ c->ReleaseCompactionFiles(status);
+ *made_progress = true;
+
+#ifndef ROCKSDB_LITE
+ // Need to make sure SstFileManager does its bookkeeping
+ auto sfm = static_cast<SstFileManagerImpl*>(
+ immutable_db_options_.sst_file_manager.get());
+ if (sfm && sfm_reserved_compact_space) {
+ sfm->OnCompactionCompletion(c.get());
+ }
+#endif // ROCKSDB_LITE
+
+ NotifyOnCompactionCompleted(c->column_family_data(), c.get(), status,
+ compaction_job_stats, job_context->job_id);
+ }
+
+ if (status.ok() || status.IsCompactionTooLarge() ||
+ status.IsManualCompactionPaused()) {
+ // Done
+ } else if (status.IsColumnFamilyDropped() || status.IsShutdownInProgress()) {
+ // Ignore compaction errors found during shutting down
+ } else {
+ ROCKS_LOG_WARN(immutable_db_options_.info_log, "Compaction error: %s",
+ status.ToString().c_str());
+ if (!io_s.ok()) {
+ // Error while writing to MANIFEST.
+ // In fact, versions_->io_status() can also be the result of renaming the
+ // CURRENT file. With the current code, it's just difficult to tell. So just
+ // be pessimistic and try writing to a new MANIFEST.
+ // TODO: distinguish between MANIFEST write and CURRENT renaming
+ auto err_reason = versions_->io_status().ok()
+ ? BackgroundErrorReason::kCompaction
+ : BackgroundErrorReason::kManifestWrite;
+ error_handler_.SetBGError(io_s, err_reason);
+ } else {
+ error_handler_.SetBGError(status, BackgroundErrorReason::kCompaction);
+ }
+ if (c != nullptr && !is_manual && !error_handler_.IsBGWorkStopped()) {
+ // Put this cfd back in the compaction queue so we can retry after some
+ // time
+ auto cfd = c->column_family_data();
+ assert(cfd != nullptr);
+ // Since this compaction failed, we need to recompute the score so it
+ // takes the original input files into account
+ c->column_family_data()
+ ->current()
+ ->storage_info()
+ ->ComputeCompactionScore(*(c->immutable_options()),
+ *(c->mutable_cf_options()));
+ if (!cfd->queued_for_compaction()) {
+ AddToCompactionQueue(cfd);
+ ++unscheduled_compactions_;
+ }
+ }
+ }
+ // this will unref its input_version and column_family_data
+ c.reset();
+
+ if (is_manual) {
+ ManualCompactionState* m = manual_compaction;
+ if (!status.ok()) {
+ m->status = status;
+ m->done = true;
+ }
+ // For universal compaction:
+ // Universal compaction always happens at level 0, so one
+ // compaction will pick up all overlapped files. No files will be
+ // filtered out due to the size limit and left for a successive compaction.
+ // So we can safely conclude the current compaction.
+ //
+ // Also note that, if we don't stop here, then the current compaction
+ // writes a new file back to level 0, which will be used in successive
+ // compaction. Hence the manual compaction will never finish.
+ //
+ // Stop the compaction if manual_end points to nullptr -- this means
+ // that we compacted the whole range. manual_end should always point
+ // to nullptr in case of universal compaction
+ if (m->manual_end == nullptr) {
+ m->done = true;
+ }
+ if (!m->done) {
+ // We only compacted part of the requested range. Update *m
+ // to the range that is left to be compacted.
+ // Universal and FIFO compactions should always compact the whole range
+ assert(m->cfd->ioptions()->compaction_style !=
+ kCompactionStyleUniversal ||
+ m->cfd->ioptions()->num_levels > 1);
+ assert(m->cfd->ioptions()->compaction_style != kCompactionStyleFIFO);
+ m->tmp_storage = *m->manual_end;
+ m->begin = &m->tmp_storage;
+ m->incomplete = true;
+ }
+ m->in_progress = false; // not being processed anymore
+ }
+ TEST_SYNC_POINT("DBImpl::BackgroundCompaction:Finish");
+ return status;
+}
+
+bool DBImpl::HasPendingManualCompaction() {
+ return (!manual_compaction_dequeue_.empty());
+}
+
+void DBImpl::AddManualCompaction(DBImpl::ManualCompactionState* m) {
+ assert(manual_compaction_paused_ == 0);
+ manual_compaction_dequeue_.push_back(m);
+}
+
+void DBImpl::RemoveManualCompaction(DBImpl::ManualCompactionState* m) {
+ // Remove from queue
+ std::deque<ManualCompactionState*>::iterator it =
+ manual_compaction_dequeue_.begin();
+ while (it != manual_compaction_dequeue_.end()) {
+ if (m == (*it)) {
+ it = manual_compaction_dequeue_.erase(it);
+ return;
+ }
+ ++it;
+ }
+ assert(false);
+ return;
+}
+
+bool DBImpl::ShouldntRunManualCompaction(ManualCompactionState* m) {
+ if (num_running_ingest_file_ > 0) {
+ // We need to wait for other IngestExternalFile() calls to finish
+ // before running a manual compaction.
+ return true;
+ }
+ if (m->exclusive) {
+ return (bg_bottom_compaction_scheduled_ > 0 ||
+ bg_compaction_scheduled_ > 0);
+ }
+ std::deque<ManualCompactionState*>::iterator it =
+ manual_compaction_dequeue_.begin();
+ bool seen = false;
+ while (it != manual_compaction_dequeue_.end()) {
+ if (m == (*it)) {
+ ++it;
+ seen = true;
+ continue;
+ } else if (MCOverlap(m, (*it)) && (!seen && !(*it)->in_progress)) {
+ // The other manual compaction *it conflicts with m if it
+ // overlaps with m, is ahead of m in the queue, and is not yet
+ // in progress.
+ return true;
+ }
+ ++it;
+ }
+ return false;
+}
+
+bool DBImpl::HaveManualCompaction(ColumnFamilyData* cfd) {
+ // Scan the manual compaction queue
+ std::deque<ManualCompactionState*>::iterator it =
+ manual_compaction_dequeue_.begin();
+ while (it != manual_compaction_dequeue_.end()) {
+ if ((*it)->exclusive) {
+ return true;
+ }
+ if ((cfd == (*it)->cfd) && (!((*it)->in_progress || (*it)->done))) {
+ // A manual compaction for this CF is queued but not yet started;
+ // in-progress or finished manual compactions do not block automatic
+ // compactions.
+ return true;
+ }
+ ++it;
+ }
+ return false;
+}
+
+bool DBImpl::HasExclusiveManualCompaction() {
+ // Scan the manual compaction queue
+ std::deque<ManualCompactionState*>::iterator it =
+ manual_compaction_dequeue_.begin();
+ while (it != manual_compaction_dequeue_.end()) {
+ if ((*it)->exclusive) {
+ return true;
+ }
+ ++it;
+ }
+ return false;
+}
+
+bool DBImpl::MCOverlap(ManualCompactionState* m, ManualCompactionState* m1) {
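+ // Only exclusive manual compactions are treated as conflicting here;
+ // non-exclusive manual compactions never report overlap with each other.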
+ if ((m->exclusive) || (m1->exclusive)) {
+ return true;
+ }
+ if (m->cfd != m1->cfd) {
+ return false;
+ }
+ return false;
+}
+
+#ifndef ROCKSDB_LITE
+void DBImpl::BuildCompactionJobInfo(
+ const ColumnFamilyData* cfd, Compaction* c, const Status& st,
+ const CompactionJobStats& compaction_job_stats, const int job_id,
+ const Version* current, CompactionJobInfo* compaction_job_info) const {
+ assert(compaction_job_info != nullptr);
+ compaction_job_info->cf_id = cfd->GetID();
+ compaction_job_info->cf_name = cfd->GetName();
+ compaction_job_info->status = st;
+ compaction_job_info->thread_id = env_->GetThreadID();
+ compaction_job_info->job_id = job_id;
+ compaction_job_info->base_input_level = c->start_level();
+ compaction_job_info->output_level = c->output_level();
+ compaction_job_info->stats = compaction_job_stats;
+ compaction_job_info->table_properties = c->GetOutputTableProperties();
+ compaction_job_info->compaction_reason = c->compaction_reason();
+ compaction_job_info->compression = c->output_compression();
+ for (size_t i = 0; i < c->num_input_levels(); ++i) {
+ for (const auto fmd : *c->inputs(i)) {
+ const FileDescriptor& desc = fmd->fd;
+ const uint64_t file_number = desc.GetNumber();
+ auto fn = TableFileName(c->immutable_options()->cf_paths, file_number,
+ desc.GetPathId());
+ compaction_job_info->input_files.push_back(fn);
+ compaction_job_info->input_file_infos.push_back(CompactionFileInfo{
+ static_cast<int>(i), file_number, fmd->oldest_blob_file_number});
+ if (compaction_job_info->table_properties.count(fn) == 0) {
+ std::shared_ptr<const TableProperties> tp;
+ auto s = current->GetTableProperties(&tp, fmd, &fn);
+ if (s.ok()) {
+ compaction_job_info->table_properties[fn] = tp;
+ }
+ }
+ }
+ }
+ for (const auto& newf : c->edit()->GetNewFiles()) {
+ const FileMetaData& meta = newf.second;
+ const FileDescriptor& desc = meta.fd;
+ const uint64_t file_number = desc.GetNumber();
+ compaction_job_info->output_files.push_back(TableFileName(
+ c->immutable_options()->cf_paths, file_number, desc.GetPathId()));
+ compaction_job_info->output_file_infos.push_back(CompactionFileInfo{
+ newf.first, file_number, meta.oldest_blob_file_number});
+ }
+ compaction_job_info->blob_compression_type =
+ c->mutable_cf_options()->blob_compression_type;
+
+ // Update BlobFilesInfo.
+ for (const auto& blob_file : c->edit()->GetBlobFileAdditions()) {
+ BlobFileAdditionInfo blob_file_addition_info(
+ BlobFileName(c->immutable_options()->cf_paths.front().path,
+ blob_file.GetBlobFileNumber()) /*blob_file_path*/,
+ blob_file.GetBlobFileNumber(), blob_file.GetTotalBlobCount(),
+ blob_file.GetTotalBlobBytes());
+ compaction_job_info->blob_file_addition_infos.emplace_back(
+ std::move(blob_file_addition_info));
+ }
+
+ // Update BlobFilesGarbageInfo.
+ for (const auto& blob_file : c->edit()->GetBlobFileGarbages()) {
+ BlobFileGarbageInfo blob_file_garbage_info(
+ BlobFileName(c->immutable_options()->cf_paths.front().path,
+ blob_file.GetBlobFileNumber()) /*blob_file_path*/,
+ blob_file.GetBlobFileNumber(), blob_file.GetGarbageBlobCount(),
+ blob_file.GetGarbageBlobBytes());
+ compaction_job_info->blob_file_garbage_infos.emplace_back(
+ std::move(blob_file_garbage_info));
+ }
+}
+#endif
+
+// SuperVersionContext gets created and destructed outside of the lock --
+// we use this conveniently to:
+// * malloc one SuperVersion() outside of the lock -- new_superversion
+// * delete SuperVersion()s outside of the lock -- superversions_to_free
+//
+// However, if InstallSuperVersionAndScheduleWork() gets called twice with the
+// same sv_context, we can't reuse the SuperVersion() that got malloced,
+// because the first call already used it. In that rare case, we take a hit
+// and create a new SuperVersion() inside of the mutex. We do a similar thing
+// for superversions_to_free.
+
+void DBImpl::InstallSuperVersionAndScheduleWork(
+ ColumnFamilyData* cfd, SuperVersionContext* sv_context,
+ const MutableCFOptions& mutable_cf_options) {
+ mutex_.AssertHeld();
+
+ // Update max_total_in_memory_state_
+ size_t old_memtable_size = 0;
+ auto* old_sv = cfd->GetSuperVersion();
+ if (old_sv) {
+ old_memtable_size = old_sv->mutable_cf_options.write_buffer_size *
+ old_sv->mutable_cf_options.max_write_buffer_number;
+ }
+
+ // This branch is unlikely to be taken.
+ if (UNLIKELY(sv_context->new_superversion == nullptr)) {
+ sv_context->NewSuperVersion();
+ }
+ cfd->InstallSuperVersion(sv_context, mutable_cf_options);
+
+ // There may be a small data race here. The snapshot whose release would
+ // trigger bottommost compaction may already have been released here. But
+ // assuming newer snapshots are always created and released frequently, the
+ // compaction will be triggered soon anyway.
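+ // Recompute bottommost_files_mark_threshold_ as the minimum across all
+ // column families that do not allow ingest-behind.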
+ bottommost_files_mark_threshold_ = kMaxSequenceNumber;
+ for (auto* my_cfd : *versions_->GetColumnFamilySet()) {
+ if (!my_cfd->ioptions()->allow_ingest_behind) {
+ bottommost_files_mark_threshold_ = std::min(
+ bottommost_files_mark_threshold_,
+ my_cfd->current()->storage_info()->bottommost_files_mark_threshold());
+ }
+ }
+
+ // Whenever we install a new SuperVersion, we might need to issue new flushes
+ // or compactions.
+ SchedulePendingCompaction(cfd);
+ MaybeScheduleFlushOrCompaction();
+
+ // Update max_total_in_memory_state_
+ max_total_in_memory_state_ = max_total_in_memory_state_ - old_memtable_size +
+ mutable_cf_options.write_buffer_size *
+ mutable_cf_options.max_write_buffer_number;
+}
+
+// ShouldPurge is called by FindObsoleteFiles when doing a full scan,
+// and db mutex (mutex_) should already be held.
+// Actually, the current implementation of FindObsoleteFiles with
+// full_scan=true can issue I/O requests to obtain the list of files in
+// directories, e.g. env_->GetChildren(), while holding the db mutex.
+bool DBImpl::ShouldPurge(uint64_t file_number) const {
+ return files_grabbed_for_purge_.find(file_number) ==
+ files_grabbed_for_purge_.end() &&
+ purge_files_.find(file_number) == purge_files_.end();
+}
+
+// MarkAsGrabbedForPurge is called by FindObsoleteFiles, and db mutex
+// (mutex_) should already be held.
+void DBImpl::MarkAsGrabbedForPurge(uint64_t file_number) {
+ files_grabbed_for_purge_.insert(file_number);
+}
+
+void DBImpl::SetSnapshotChecker(SnapshotChecker* snapshot_checker) {
+ InstrumentedMutexLock l(&mutex_);
+ // snapshot_checker_ should only be set once. If we need to set it multiple
+ // times, we need to make sure the old one is not deleted while it is still
+ // being used by a compaction job.
+ assert(!snapshot_checker_);
+ snapshot_checker_.reset(snapshot_checker);
+}
+
+void DBImpl::GetSnapshotContext(
+ JobContext* job_context, std::vector<SequenceNumber>* snapshot_seqs,
+ SequenceNumber* earliest_write_conflict_snapshot,
+ SnapshotChecker** snapshot_checker_ptr) {
+ mutex_.AssertHeld();
+ assert(job_context != nullptr);
+ assert(snapshot_seqs != nullptr);
+ assert(earliest_write_conflict_snapshot != nullptr);
+ assert(snapshot_checker_ptr != nullptr);
+
+ *snapshot_checker_ptr = snapshot_checker_.get();
+ if (use_custom_gc_ && *snapshot_checker_ptr == nullptr) {
+ *snapshot_checker_ptr = DisableGCSnapshotChecker::Instance();
+ }
+ if (*snapshot_checker_ptr != nullptr) {
+ // If snapshot_checker is used, that means the flush/compaction may
+ // contain values not visible to a snapshot taken after the
+ // flush/compaction job starts. Take a snapshot and it will appear
+ // in snapshot_seqs and force the compaction iterator to consider such
+ // snapshots.
+ const Snapshot* job_snapshot =
+ GetSnapshotImpl(false /*write_conflict_boundary*/, false /*lock*/);
+ job_context->job_snapshot.reset(new ManagedSnapshot(this, job_snapshot));
+ }
+ *snapshot_seqs = snapshots_.GetAll(earliest_write_conflict_snapshot);
+}
+
+Status DBImpl::WaitForCompact(bool wait_unscheduled) {
+ // Wait until the compaction completes
+ InstrumentedMutexLock l(&mutex_);
+ while ((bg_bottom_compaction_scheduled_ || bg_compaction_scheduled_ ||
+ bg_flush_scheduled_ ||
+ (wait_unscheduled && unscheduled_compactions_)) &&
+ (error_handler_.GetBGError().ok())) {
+ bg_cv_.Wait();
+ }
+ return error_handler_.GetBGError();
+}
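+
+// For illustration only: WaitForCompact() above follows the standard
+// "wait on a condition variable and re-check the predicate after every
+// wakeup" pattern. Below is a minimal sketch of the same pattern using only
+// the C++ standard library (hypothetical names; it assumes <mutex> and
+// <condition_variable> are available, whereas DBImpl itself uses
+// InstrumentedMutex and InstrumentedCondVar).
+class BackgroundWorkTrackerExample {
+ public:
+  void WaitUntilIdle() {
+    std::unique_lock<std::mutex> lock(mu_);
+    // Re-check the predicate after every wakeup; spurious wakeups are
+    // harmless because the condition is evaluated again under the lock.
+    cv_.wait(lock, [this] { return scheduled_jobs_ == 0; });
+  }
+  void JobScheduled() {
+    std::lock_guard<std::mutex> lock(mu_);
+    ++scheduled_jobs_;
+  }
+  void JobFinished() {
+    {
+      std::lock_guard<std::mutex> lock(mu_);
+      --scheduled_jobs_;
+    }
+    cv_.notify_all();
+  }
+
+ private:
+  std::mutex mu_;
+  std::condition_variable cv_;
+  int scheduled_jobs_ = 0;
+};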
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/db_impl/db_impl_debug.cc b/src/rocksdb/db/db_impl/db_impl_debug.cc
new file mode 100644
index 000000000..7054b0669
--- /dev/null
+++ b/src/rocksdb/db/db_impl/db_impl_debug.cc
@@ -0,0 +1,312 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef NDEBUG
+
+#include "db/column_family.h"
+#include "db/db_impl/db_impl.h"
+#include "db/error_handler.h"
+#include "db/periodic_task_scheduler.h"
+#include "monitoring/thread_status_updater.h"
+#include "util/cast_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+uint64_t DBImpl::TEST_GetLevel0TotalSize() {
+ InstrumentedMutexLock l(&mutex_);
+ return default_cf_handle_->cfd()->current()->storage_info()->NumLevelBytes(0);
+}
+
+Status DBImpl::TEST_SwitchWAL() {
+ WriteContext write_context;
+ InstrumentedMutexLock l(&mutex_);
+ void* writer = TEST_BeginWrite();
+ auto s = SwitchWAL(&write_context);
+ TEST_EndWrite(writer);
+ return s;
+}
+
+uint64_t DBImpl::TEST_MaxNextLevelOverlappingBytes(
+ ColumnFamilyHandle* column_family) {
+ ColumnFamilyData* cfd;
+ if (column_family == nullptr) {
+ cfd = default_cf_handle_->cfd();
+ } else {
+ auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+ cfd = cfh->cfd();
+ }
+ InstrumentedMutexLock l(&mutex_);
+ return cfd->current()->storage_info()->MaxNextLevelOverlappingBytes();
+}
+
+void DBImpl::TEST_GetFilesMetaData(
+ ColumnFamilyHandle* column_family,
+ std::vector<std::vector<FileMetaData>>* metadata,
+ std::vector<std::shared_ptr<BlobFileMetaData>>* blob_metadata) {
+ assert(metadata);
+
+ auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+ assert(cfh);
+
+ auto cfd = cfh->cfd();
+ assert(cfd);
+
+ InstrumentedMutexLock l(&mutex_);
+
+ const auto* current = cfd->current();
+ assert(current);
+
+ const auto* vstorage = current->storage_info();
+ assert(vstorage);
+
+ metadata->resize(NumberLevels());
+
+ for (int level = 0; level < NumberLevels(); ++level) {
+ const std::vector<FileMetaData*>& files = vstorage->LevelFiles(level);
+
+ (*metadata)[level].clear();
+ (*metadata)[level].reserve(files.size());
+
+ for (const auto& f : files) {
+ (*metadata)[level].push_back(*f);
+ }
+ }
+
+ if (blob_metadata) {
+ *blob_metadata = vstorage->GetBlobFiles();
+ }
+}
+
+uint64_t DBImpl::TEST_Current_Manifest_FileNo() {
+ return versions_->manifest_file_number();
+}
+
+uint64_t DBImpl::TEST_Current_Next_FileNo() {
+ return versions_->current_next_file_number();
+}
+
+Status DBImpl::TEST_CompactRange(int level, const Slice* begin,
+ const Slice* end,
+ ColumnFamilyHandle* column_family,
+ bool disallow_trivial_move) {
+ ColumnFamilyData* cfd;
+ if (column_family == nullptr) {
+ cfd = default_cf_handle_->cfd();
+ } else {
+ auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+ cfd = cfh->cfd();
+ }
+ int output_level =
+ (cfd->ioptions()->compaction_style == kCompactionStyleUniversal ||
+ cfd->ioptions()->compaction_style == kCompactionStyleFIFO)
+ ? level
+ : level + 1;
+ return RunManualCompaction(
+ cfd, level, output_level, CompactRangeOptions(), begin, end, true,
+ disallow_trivial_move,
+ std::numeric_limits<uint64_t>::max() /*max_file_num_to_ignore*/,
+ "" /*trim_ts*/);
+}
+
+Status DBImpl::TEST_SwitchMemtable(ColumnFamilyData* cfd) {
+ WriteContext write_context;
+ InstrumentedMutexLock l(&mutex_);
+ if (cfd == nullptr) {
+ cfd = default_cf_handle_->cfd();
+ }
+
+ Status s;
+ void* writer = TEST_BeginWrite();
+ if (two_write_queues_) {
+ WriteThread::Writer nonmem_w;
+ nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_);
+ s = SwitchMemtable(cfd, &write_context);
+ nonmem_write_thread_.ExitUnbatched(&nonmem_w);
+ } else {
+ s = SwitchMemtable(cfd, &write_context);
+ }
+ TEST_EndWrite(writer);
+ return s;
+}
+
+Status DBImpl::TEST_FlushMemTable(bool wait, bool allow_write_stall,
+ ColumnFamilyHandle* cfh) {
+ FlushOptions fo;
+ fo.wait = wait;
+ fo.allow_write_stall = allow_write_stall;
+ ColumnFamilyData* cfd;
+ if (cfh == nullptr) {
+ cfd = default_cf_handle_->cfd();
+ } else {
+ auto cfhi = static_cast_with_check<ColumnFamilyHandleImpl>(cfh);
+ cfd = cfhi->cfd();
+ }
+ return FlushMemTable(cfd, fo, FlushReason::kTest);
+}
+
+Status DBImpl::TEST_FlushMemTable(ColumnFamilyData* cfd,
+ const FlushOptions& flush_opts) {
+ return FlushMemTable(cfd, flush_opts, FlushReason::kTest);
+}
+
+Status DBImpl::TEST_AtomicFlushMemTables(
+ const autovector<ColumnFamilyData*>& cfds, const FlushOptions& flush_opts) {
+ return AtomicFlushMemTables(cfds, flush_opts, FlushReason::kTest);
+}
+
+Status DBImpl::TEST_WaitForBackgroundWork() {
+ InstrumentedMutexLock l(&mutex_);
+ WaitForBackgroundWork();
+ return error_handler_.GetBGError();
+}
+
+Status DBImpl::TEST_WaitForFlushMemTable(ColumnFamilyHandle* column_family) {
+ ColumnFamilyData* cfd;
+ if (column_family == nullptr) {
+ cfd = default_cf_handle_->cfd();
+ } else {
+ auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+ cfd = cfh->cfd();
+ }
+ return WaitForFlushMemTable(cfd, nullptr, false);
+}
+
+Status DBImpl::TEST_WaitForCompact(bool wait_unscheduled) {
+ // Wait until the compaction completes
+ return WaitForCompact(wait_unscheduled);
+}
+
+Status DBImpl::TEST_WaitForPurge() {
+ InstrumentedMutexLock l(&mutex_);
+ while (bg_purge_scheduled_ && error_handler_.GetBGError().ok()) {
+ bg_cv_.Wait();
+ }
+ return error_handler_.GetBGError();
+}
+
+Status DBImpl::TEST_GetBGError() {
+ InstrumentedMutexLock l(&mutex_);
+ return error_handler_.GetBGError();
+}
+
+void DBImpl::TEST_LockMutex() { mutex_.Lock(); }
+
+void DBImpl::TEST_UnlockMutex() { mutex_.Unlock(); }
+
+void* DBImpl::TEST_BeginWrite() {
+ auto w = new WriteThread::Writer();
+ write_thread_.EnterUnbatched(w, &mutex_);
+ return reinterpret_cast<void*>(w);
+}
+
+void DBImpl::TEST_EndWrite(void* w) {
+ auto writer = reinterpret_cast<WriteThread::Writer*>(w);
+ write_thread_.ExitUnbatched(writer);
+ delete writer;
+}
+
+size_t DBImpl::TEST_LogsToFreeSize() {
+ InstrumentedMutexLock l(&log_write_mutex_);
+ return logs_to_free_.size();
+}
+
+uint64_t DBImpl::TEST_LogfileNumber() {
+ InstrumentedMutexLock l(&mutex_);
+ return logfile_number_;
+}
+
+Status DBImpl::TEST_GetAllImmutableCFOptions(
+ std::unordered_map<std::string, const ImmutableCFOptions*>* iopts_map) {
+ std::vector<std::string> cf_names;
+ std::vector<const ImmutableCFOptions*> iopts;
+ {
+ InstrumentedMutexLock l(&mutex_);
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ cf_names.push_back(cfd->GetName());
+ iopts.push_back(cfd->ioptions());
+ }
+ }
+ iopts_map->clear();
+ for (size_t i = 0; i < cf_names.size(); ++i) {
+ iopts_map->insert({cf_names[i], iopts[i]});
+ }
+
+ return Status::OK();
+}
+
+uint64_t DBImpl::TEST_FindMinLogContainingOutstandingPrep() {
+ return logs_with_prep_tracker_.FindMinLogContainingOutstandingPrep();
+}
+
+size_t DBImpl::TEST_PreparedSectionCompletedSize() {
+ return logs_with_prep_tracker_.TEST_PreparedSectionCompletedSize();
+}
+
+size_t DBImpl::TEST_LogsWithPrepSize() {
+ return logs_with_prep_tracker_.TEST_LogsWithPrepSize();
+}
+
+uint64_t DBImpl::TEST_FindMinPrepLogReferencedByMemTable() {
+ autovector<MemTable*> empty_list;
+ return FindMinPrepLogReferencedByMemTable(versions_.get(), empty_list);
+}
+
+Status DBImpl::TEST_GetLatestMutableCFOptions(
+ ColumnFamilyHandle* column_family, MutableCFOptions* mutable_cf_options) {
+ InstrumentedMutexLock l(&mutex_);
+
+ auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+ *mutable_cf_options = *cfh->cfd()->GetLatestMutableCFOptions();
+ return Status::OK();
+}
+
+int DBImpl::TEST_BGCompactionsAllowed() const {
+ InstrumentedMutexLock l(&mutex_);
+ return GetBGJobLimits().max_compactions;
+}
+
+int DBImpl::TEST_BGFlushesAllowed() const {
+ InstrumentedMutexLock l(&mutex_);
+ return GetBGJobLimits().max_flushes;
+}
+
+SequenceNumber DBImpl::TEST_GetLastVisibleSequence() const {
+ if (last_seq_same_as_publish_seq_) {
+ return versions_->LastSequence();
+ } else {
+ return versions_->LastAllocatedSequence();
+ }
+}
+
+size_t DBImpl::TEST_GetWalPreallocateBlockSize(
+ uint64_t write_buffer_size) const {
+ InstrumentedMutexLock l(&mutex_);
+ return GetWalPreallocateBlockSize(write_buffer_size);
+}
+
+#ifndef ROCKSDB_LITE
+void DBImpl::TEST_WaitForPeridicTaskRun(std::function<void()> callback) const {
+ periodic_task_scheduler_.TEST_WaitForRun(callback);
+}
+
+const PeriodicTaskScheduler& DBImpl::TEST_GetPeriodicTaskScheduler() const {
+ return periodic_task_scheduler_;
+}
+
+SeqnoToTimeMapping DBImpl::TEST_GetSeqnoToTimeMapping() const {
+ InstrumentedMutexLock l(&mutex_);
+ return seqno_time_mapping_;
+}
+
+#endif // !ROCKSDB_LITE
+
+size_t DBImpl::TEST_EstimateInMemoryStatsHistorySize() const {
+ return EstimateInMemoryStatsHistorySize();
+}
+} // namespace ROCKSDB_NAMESPACE
+#endif // NDEBUG
diff --git a/src/rocksdb/db/db_impl/db_impl_experimental.cc b/src/rocksdb/db/db_impl/db_impl_experimental.cc
new file mode 100644
index 000000000..c1b1e4137
--- /dev/null
+++ b/src/rocksdb/db/db_impl/db_impl_experimental.cc
@@ -0,0 +1,158 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <cinttypes>
+#include <vector>
+
+#include "db/column_family.h"
+#include "db/db_impl/db_impl.h"
+#include "db/job_context.h"
+#include "db/version_set.h"
+#include "logging/logging.h"
+#include "rocksdb/status.h"
+#include "util/cast_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#ifndef ROCKSDB_LITE
+Status DBImpl::SuggestCompactRange(ColumnFamilyHandle* column_family,
+ const Slice* begin, const Slice* end) {
+ auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+ auto cfd = cfh->cfd();
+ InternalKey start_key, end_key;
+ if (begin != nullptr) {
+ start_key.SetMinPossibleForUserKey(*begin);
+ }
+ if (end != nullptr) {
+ end_key.SetMaxPossibleForUserKey(*end);
+ }
+ {
+ InstrumentedMutexLock l(&mutex_);
+ auto vstorage = cfd->current()->storage_info();
+ for (int level = 0; level < vstorage->num_non_empty_levels() - 1; ++level) {
+ std::vector<FileMetaData*> inputs;
+ vstorage->GetOverlappingInputs(
+ level, begin == nullptr ? nullptr : &start_key,
+ end == nullptr ? nullptr : &end_key, &inputs);
+ for (auto f : inputs) {
+ f->marked_for_compaction = true;
+ }
+ }
+ // Since we have some more files to compact, we should also recompute
+ // compaction score
+ vstorage->ComputeCompactionScore(*cfd->ioptions(),
+ *cfd->GetLatestMutableCFOptions());
+ SchedulePendingCompaction(cfd);
+ MaybeScheduleFlushOrCompaction();
+ }
+ return Status::OK();
+}
+
+Status DBImpl::PromoteL0(ColumnFamilyHandle* column_family, int target_level) {
+ assert(column_family);
+
+ if (target_level < 1) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "PromoteL0 FAILED. Invalid target level %d\n", target_level);
+ return Status::InvalidArgument("Invalid target level");
+ }
+
+ Status status;
+ VersionEdit edit;
+ JobContext job_context(next_job_id_.fetch_add(1), true);
+ {
+ InstrumentedMutexLock l(&mutex_);
+ auto* cfd = static_cast<ColumnFamilyHandleImpl*>(column_family)->cfd();
+ const auto* vstorage = cfd->current()->storage_info();
+
+ if (target_level >= vstorage->num_levels()) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "PromoteL0 FAILED. Target level %d does not exist\n",
+ target_level);
+ job_context.Clean();
+ status = Status::InvalidArgument("Target level does not exist");
+ return status;
+ }
+
+ // Sort L0 files by range.
+ const InternalKeyComparator* icmp = &cfd->internal_comparator();
+ auto l0_files = vstorage->LevelFiles(0);
+ std::sort(l0_files.begin(), l0_files.end(),
+ [icmp](FileMetaData* f1, FileMetaData* f2) {
+ return icmp->Compare(f1->largest, f2->largest) < 0;
+ });
+
+ // Check that no L0 file is being compacted and that they have
+ // non-overlapping ranges.
+ for (size_t i = 0; i < l0_files.size(); ++i) {
+ auto f = l0_files[i];
+ if (f->being_compacted) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "PromoteL0 FAILED. File %" PRIu64 " being compacted\n",
+ f->fd.GetNumber());
+ job_context.Clean();
+ status =
+ Status::InvalidArgument("PromoteL0 called during L0 compaction");
+ return status;
+ }
+
+ if (i == 0) continue;
+ auto prev_f = l0_files[i - 1];
+ if (icmp->Compare(prev_f->largest, f->smallest) >= 0) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "PromoteL0 FAILED. Files %" PRIu64 " and %" PRIu64
+ " have overlapping ranges\n",
+ prev_f->fd.GetNumber(), f->fd.GetNumber());
+ job_context.Clean();
+ status = Status::InvalidArgument("L0 has overlapping files");
+ return status;
+ }
+ }
+
+ // Check that all levels up to target_level are empty.
+ for (int level = 1; level <= target_level; ++level) {
+ if (vstorage->NumLevelFiles(level) > 0) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "PromoteL0 FAILED. Level %d not empty\n", level);
+ job_context.Clean();
+ status = Status::InvalidArgument(
+ "All levels up to target_level "
+ "must be empty");
+ return status;
+ }
+ }
+
+ edit.SetColumnFamily(cfd->GetID());
+ for (const auto& f : l0_files) {
+ edit.DeleteFile(0, f->fd.GetNumber());
+ edit.AddFile(target_level, f->fd.GetNumber(), f->fd.GetPathId(),
+ f->fd.GetFileSize(), f->smallest, f->largest,
+ f->fd.smallest_seqno, f->fd.largest_seqno,
+ f->marked_for_compaction, f->temperature,
+ f->oldest_blob_file_number, f->oldest_ancester_time,
+ f->file_creation_time, f->file_checksum,
+ f->file_checksum_func_name, f->unique_id);
+ }
+
+ status = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(),
+ &edit, &mutex_, directories_.GetDbDir());
+ if (status.ok()) {
+ InstallSuperVersionAndScheduleWork(cfd,
+ &job_context.superversion_contexts[0],
+ *cfd->GetLatestMutableCFOptions());
+ }
+ } // lock released here
+ LogFlush(immutable_db_options_.info_log);
+ job_context.Clean();
+
+ return status;
+}
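+
+// For illustration only: applications typically reach PromoteL0() and
+// SuggestCompactRange() through the free functions declared in
+// "rocksdb/experimental.h". The sketch below assumes that header is included
+// and exposes the signatures used here (consult the header for the
+// authoritative API); error handling is intentionally minimal.
+void PromoteAllL0FilesExample(DB* db, ColumnFamilyHandle* cf) {
+  // Hint that the whole key range of this column family is worth compacting.
+  Status s = experimental::SuggestCompactRange(db, cf, /*begin=*/nullptr,
+                                               /*end=*/nullptr);
+  if (s.ok()) {
+    // Move the current non-overlapping, not-being-compacted L0 files straight
+    // to the target level, subject to the checks implemented above.
+    s = experimental::PromoteL0(db, cf, /*target_level=*/1);
+  }
+}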
+#endif // ROCKSDB_LITE
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/db_impl/db_impl_files.cc b/src/rocksdb/db/db_impl/db_impl_files.cc
new file mode 100644
index 000000000..058df4da7
--- /dev/null
+++ b/src/rocksdb/db/db_impl/db_impl_files.cc
@@ -0,0 +1,1013 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#include <cinttypes>
+#include <set>
+#include <unordered_set>
+
+#include "db/db_impl/db_impl.h"
+#include "db/event_helpers.h"
+#include "db/memtable_list.h"
+#include "file/file_util.h"
+#include "file/filename.h"
+#include "file/sst_file_manager_impl.h"
+#include "logging/logging.h"
+#include "port/port.h"
+#include "util/autovector.h"
+#include "util/defer.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+uint64_t DBImpl::MinLogNumberToKeep() {
+ return versions_->min_log_number_to_keep();
+}
+
+uint64_t DBImpl::MinObsoleteSstNumberToKeep() {
+ mutex_.AssertHeld();
+ if (!pending_outputs_.empty()) {
+ return *pending_outputs_.begin();
+ }
+ return std::numeric_limits<uint64_t>::max();
+}
+
+Status DBImpl::DisableFileDeletions() {
+ Status s;
+ int my_disable_delete_obsolete_files;
+ {
+ InstrumentedMutexLock l(&mutex_);
+ s = DisableFileDeletionsWithLock();
+ my_disable_delete_obsolete_files = disable_delete_obsolete_files_;
+ }
+ if (my_disable_delete_obsolete_files == 1) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log, "File Deletions Disabled");
+ } else {
+ ROCKS_LOG_WARN(immutable_db_options_.info_log,
+ "File Deletions Disabled, but already disabled. Counter: %d",
+ my_disable_delete_obsolete_files);
+ }
+ return s;
+}
+
+// FIXME: can be inconsistent with DisableFileDeletions in cases like
+// DBImplReadOnly
+Status DBImpl::DisableFileDeletionsWithLock() {
+ mutex_.AssertHeld();
+ ++disable_delete_obsolete_files_;
+ return Status::OK();
+}
+
+Status DBImpl::EnableFileDeletions(bool force) {
+ // Job id == 0 means that this is not one of our background processes, but
+ // rather a user thread.
+ JobContext job_context(0);
+ int saved_counter; // initialize on all paths
+ {
+ InstrumentedMutexLock l(&mutex_);
+ if (force) {
+ // if force, we need to enable file deletions right away
+ disable_delete_obsolete_files_ = 0;
+ } else if (disable_delete_obsolete_files_ > 0) {
+ --disable_delete_obsolete_files_;
+ }
+ saved_counter = disable_delete_obsolete_files_;
+ if (saved_counter == 0) {
+ FindObsoleteFiles(&job_context, true);
+ bg_cv_.SignalAll();
+ }
+ }
+ if (saved_counter == 0) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log, "File Deletions Enabled");
+ if (job_context.HaveSomethingToDelete()) {
+ PurgeObsoleteFiles(job_context);
+ }
+ } else {
+ ROCKS_LOG_WARN(immutable_db_options_.info_log,
+ "File Deletions Enable, but not really enabled. Counter: %d",
+ saved_counter);
+ }
+ job_context.Clean();
+ LogFlush(immutable_db_options_.info_log);
+ return Status::OK();
+}
+
+bool DBImpl::IsFileDeletionsEnabled() const {
+ return 0 == disable_delete_obsolete_files_;
+}
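+
+// For illustration only: Disable/EnableFileDeletions() above behave like a
+// reference-counted latch. A minimal sketch of the counter semantics, with
+// hypothetical names and without any locking or I/O:
+class FileDeletionGuardExample {
+ public:
+  void Disable() { ++disable_count_; }
+
+  // With force == true the counter is reset, re-enabling deletions even if
+  // Disable() was called several times; otherwise a single Disable() is
+  // undone.
+  void Enable(bool force) {
+    if (force) {
+      disable_count_ = 0;
+    } else if (disable_count_ > 0) {
+      --disable_count_;
+    }
+  }
+
+  // Deletions are allowed only when every Disable() has been matched.
+  bool DeletionsEnabled() const { return disable_count_ == 0; }
+
+ private:
+  int disable_count_ = 0;
+};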
+
+// * Returns the list of live files in 'sst_live' and 'blob_live'.
+// If it is doing a full scan:
+// * Returns the list of all files in the filesystem in
+// 'full_scan_candidate_files'.
+// Otherwise, gets obsolete files from VersionSet.
+// no_full_scan = true -- never do the full scan using GetChildren()
+// force = false -- don't force the full scan, except once every
+// mutable_db_options_.delete_obsolete_files_period_micros
+// force = true -- force the full scan
+void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force,
+ bool no_full_scan) {
+ mutex_.AssertHeld();
+
+ // if deletion is disabled, do nothing
+ if (disable_delete_obsolete_files_ > 0) {
+ return;
+ }
+
+ bool doing_the_full_scan = false;
+
+ // logic for figuring out if we're doing the full scan
+ if (no_full_scan) {
+ doing_the_full_scan = false;
+ } else if (force ||
+ mutable_db_options_.delete_obsolete_files_period_micros == 0) {
+ doing_the_full_scan = true;
+ } else {
+ const uint64_t now_micros = immutable_db_options_.clock->NowMicros();
+ if ((delete_obsolete_files_last_run_ +
+ mutable_db_options_.delete_obsolete_files_period_micros) <
+ now_micros) {
+ doing_the_full_scan = true;
+ delete_obsolete_files_last_run_ = now_micros;
+ }
+ }
+
+ // Don't delete files that might currently be written to by compaction
+ // threads.
+ // Since job_context->min_pending_output is set here, mutex_ cannot be
+ // released until the file scan finishes. Otherwise, we might see no
+ // min_pending_output here but later find newly generated, unfinalized files
+ // while scanning.
+ job_context->min_pending_output = MinObsoleteSstNumberToKeep();
+
+ // Get obsolete files. This function will also update the list of
+ // pending files in VersionSet().
+ versions_->GetObsoleteFiles(
+ &job_context->sst_delete_files, &job_context->blob_delete_files,
+ &job_context->manifest_delete_files, job_context->min_pending_output);
+
+ // Mark the elements in job_context->sst_delete_files and
+ // job_context->blob_delete_files as "grabbed for purge" so that other threads
+ // calling FindObsoleteFiles with full_scan=true will not add these files to
+ // candidate list for purge.
+ for (const auto& sst_to_del : job_context->sst_delete_files) {
+ MarkAsGrabbedForPurge(sst_to_del.metadata->fd.GetNumber());
+ }
+
+ for (const auto& blob_file : job_context->blob_delete_files) {
+ MarkAsGrabbedForPurge(blob_file.GetBlobFileNumber());
+ }
+
+ // store the current filenum, lognum, etc
+ job_context->manifest_file_number = versions_->manifest_file_number();
+ job_context->pending_manifest_file_number =
+ versions_->pending_manifest_file_number();
+ job_context->log_number = MinLogNumberToKeep();
+ job_context->prev_log_number = versions_->prev_log_number();
+
+ if (doing_the_full_scan) {
+ versions_->AddLiveFiles(&job_context->sst_live, &job_context->blob_live);
+ InfoLogPrefix info_log_prefix(!immutable_db_options_.db_log_dir.empty(),
+ dbname_);
+ std::set<std::string> paths;
+ for (size_t path_id = 0; path_id < immutable_db_options_.db_paths.size();
+ path_id++) {
+ paths.insert(immutable_db_options_.db_paths[path_id].path);
+ }
+
+ // Note that if cf_paths is not specified in the ColumnFamilyOptions
+ // of a particular column family, we use db_paths as the cf_paths
+ // setting. Hence, there can be multiple duplicates of files from db_paths
+ // in the following code. The duplicate are removed while identifying
+ // unique files in PurgeObsoleteFiles.
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ for (size_t path_id = 0; path_id < cfd->ioptions()->cf_paths.size();
+ path_id++) {
+ auto& path = cfd->ioptions()->cf_paths[path_id].path;
+
+ if (paths.find(path) == paths.end()) {
+ paths.insert(path);
+ }
+ }
+ }
+
+ IOOptions io_opts;
+ io_opts.do_not_recurse = true;
+ for (auto& path : paths) {
+ // Set of all files in the directory. We'll exclude files that are still
+ // alive in the subsequent processing.
+ std::vector<std::string> files;
+ Status s = immutable_db_options_.fs->GetChildren(
+ path, io_opts, &files, /*IODebugContext*=*/nullptr);
+ s.PermitUncheckedError(); // TODO: What should we do on error?
+ for (const std::string& file : files) {
+ uint64_t number;
+ FileType type;
+ // 1. If we cannot parse the file name, we skip it;
+ // 2. If the file whose file_number equals number has already been
+ // grabbed for purge by another compaction job, or has already been
+ // scheduled for purge, we also skip it during a full scan in order to
+ // avoid double deletion of the same file under race conditions. See
+ // https://github.com/facebook/rocksdb/issues/3573
+ if (!ParseFileName(file, &number, info_log_prefix.prefix, &type) ||
+ !ShouldPurge(number)) {
+ continue;
+ }
+
+ // TODO(icanadi) clean up this mess to avoid having one-off "/"
+ // prefixes
+ job_context->full_scan_candidate_files.emplace_back("/" + file, path);
+ }
+ }
+
+ // Add log files in wal_dir
+ if (!immutable_db_options_.IsWalDirSameAsDBPath(dbname_)) {
+ std::vector<std::string> log_files;
+ Status s = immutable_db_options_.fs->GetChildren(
+ immutable_db_options_.wal_dir, io_opts, &log_files,
+ /*IODebugContext*=*/nullptr);
+ s.PermitUncheckedError(); // TODO: What should we do on error?
+ for (const std::string& log_file : log_files) {
+ job_context->full_scan_candidate_files.emplace_back(
+ log_file, immutable_db_options_.wal_dir);
+ }
+ }
+
+ // Add info log files in db_log_dir
+ if (!immutable_db_options_.db_log_dir.empty() &&
+ immutable_db_options_.db_log_dir != dbname_) {
+ std::vector<std::string> info_log_files;
+ Status s = immutable_db_options_.fs->GetChildren(
+ immutable_db_options_.db_log_dir, io_opts, &info_log_files,
+ /*IODebugContext*=*/nullptr);
+ s.PermitUncheckedError(); // TODO: What should we do on error?
+ for (std::string& log_file : info_log_files) {
+ job_context->full_scan_candidate_files.emplace_back(
+ log_file, immutable_db_options_.db_log_dir);
+ }
+ }
+ } else {
+ // Instead of filling job_context->sst_live and job_context->blob_live,
+ // directly remove files that show up in any Version. This is because
+ // candidate files tend to be a small percentage of all files, so it is
+ // usually cheaper to check them against every version, compared to
+ // building a map for all files.
+ versions_->RemoveLiveFiles(job_context->sst_delete_files,
+ job_context->blob_delete_files);
+ }
+
+ // Before potentially releasing mutex and waiting on condvar, increment
+ // pending_purge_obsolete_files_ so that another thread executing
+ // `GetSortedWals` will wait until this thread finishes execution since the
+ // other thread will be waiting for `pending_purge_obsolete_files_`.
+ // pending_purge_obsolete_files_ MUST be decremented if there is nothing to
+ // delete.
+ ++pending_purge_obsolete_files_;
+
+ Defer cleanup([job_context, this]() {
+ assert(job_context != nullptr);
+ if (!job_context->HaveSomethingToDelete()) {
+ mutex_.AssertHeld();
+ --pending_purge_obsolete_files_;
+ }
+ });
+
+ // logs_ is empty when called during recovery, in which case there can't yet
+ // be any tracked obsolete logs
+ log_write_mutex_.Lock();
+
+ if (alive_log_files_.empty() || logs_.empty()) {
+ mutex_.AssertHeld();
+ // We may reach here if the db is DBImplSecondary
+ log_write_mutex_.Unlock();
+ return;
+ }
+
+ if (!alive_log_files_.empty() && !logs_.empty()) {
+ uint64_t min_log_number = job_context->log_number;
+ size_t num_alive_log_files = alive_log_files_.size();
+ // find newly obsoleted log files
+ while (alive_log_files_.begin()->number < min_log_number) {
+ auto& earliest = *alive_log_files_.begin();
+ if (immutable_db_options_.recycle_log_file_num >
+ log_recycle_files_.size()) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "adding log %" PRIu64 " to recycle list\n",
+ earliest.number);
+ log_recycle_files_.push_back(earliest.number);
+ } else {
+ job_context->log_delete_files.push_back(earliest.number);
+ }
+ if (job_context->size_log_to_delete == 0) {
+ job_context->prev_total_log_size = total_log_size_;
+ job_context->num_alive_log_files = num_alive_log_files;
+ }
+ job_context->size_log_to_delete += earliest.size;
+ total_log_size_ -= earliest.size;
+ alive_log_files_.pop_front();
+
+ // Current log should always stay alive since it can't have
+ // number < MinLogNumber().
+ assert(alive_log_files_.size());
+ }
+ log_write_mutex_.Unlock();
+ mutex_.Unlock();
+ log_write_mutex_.Lock();
+ while (!logs_.empty() && logs_.front().number < min_log_number) {
+ auto& log = logs_.front();
+ if (log.IsSyncing()) {
+ log_sync_cv_.Wait();
+ // logs_ could have changed while we were waiting.
+ continue;
+ }
+ logs_to_free_.push_back(log.ReleaseWriter());
+ logs_.pop_front();
+ }
+ // Current log cannot be obsolete.
+ assert(!logs_.empty());
+ }
+
+ // We're just cleaning up for DB::Write().
+ assert(job_context->logs_to_free.empty());
+ job_context->logs_to_free = logs_to_free_;
+
+ logs_to_free_.clear();
+ log_write_mutex_.Unlock();
+ mutex_.Lock();
+ job_context->log_recycle_files.assign(log_recycle_files_.begin(),
+ log_recycle_files_.end());
+}
+
+// Delete an obsolete file and log the status and details of the deletion.
+void DBImpl::DeleteObsoleteFileImpl(int job_id, const std::string& fname,
+ const std::string& path_to_sync,
+ FileType type, uint64_t number) {
+ TEST_SYNC_POINT_CALLBACK("DBImpl::DeleteObsoleteFileImpl::BeforeDeletion",
+ const_cast<std::string*>(&fname));
+
+ Status file_deletion_status;
+ if (type == kTableFile || type == kBlobFile || type == kWalFile) {
+ // Rate limit WAL deletion only if it's in the DB dir
+ file_deletion_status = DeleteDBFile(
+ &immutable_db_options_, fname, path_to_sync,
+ /*force_bg=*/false,
+ /*force_fg=*/(type == kWalFile) ? !wal_in_db_path_ : false);
+ } else {
+ file_deletion_status = env_->DeleteFile(fname);
+ }
+ TEST_SYNC_POINT_CALLBACK("DBImpl::DeleteObsoleteFileImpl:AfterDeletion",
+ &file_deletion_status);
+ if (file_deletion_status.ok()) {
+ ROCKS_LOG_DEBUG(immutable_db_options_.info_log,
+ "[JOB %d] Delete %s type=%d #%" PRIu64 " -- %s\n", job_id,
+ fname.c_str(), type, number,
+ file_deletion_status.ToString().c_str());
+ } else if (env_->FileExists(fname).IsNotFound()) {
+ ROCKS_LOG_INFO(
+ immutable_db_options_.info_log,
+ "[JOB %d] Tried to delete a non-existing file %s type=%d #%" PRIu64
+ " -- %s\n",
+ job_id, fname.c_str(), type, number,
+ file_deletion_status.ToString().c_str());
+ } else {
+ ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+ "[JOB %d] Failed to delete %s type=%d #%" PRIu64 " -- %s\n",
+ job_id, fname.c_str(), type, number,
+ file_deletion_status.ToString().c_str());
+ }
+ if (type == kTableFile) {
+ EventHelpers::LogAndNotifyTableFileDeletion(
+ &event_logger_, job_id, number, fname, file_deletion_status, GetName(),
+ immutable_db_options_.listeners);
+ }
+ if (type == kBlobFile) {
+ EventHelpers::LogAndNotifyBlobFileDeletion(
+ &event_logger_, immutable_db_options_.listeners, job_id, number, fname,
+ file_deletion_status, GetName());
+ }
+}
+
+// Diffs the candidate files against the live files; those that do not
+// belong to the live set may be removed. Also removes all the
+// files in sst_delete_files and log_delete_files.
+// It is not necessary to hold the mutex when invoking this method.
+void DBImpl::PurgeObsoleteFiles(JobContext& state, bool schedule_only) {
+ TEST_SYNC_POINT("DBImpl::PurgeObsoleteFiles:Begin");
+ // We'd better have something to delete.
+ assert(state.HaveSomethingToDelete());
+
+ // FindObsoleteFiles() should have populated this, so it is nonzero.
+ assert(state.manifest_file_number != 0);
+
+ // Now, convert lists to unordered sets, WITHOUT mutex held; set is slow.
+ std::unordered_set<uint64_t> sst_live_set(state.sst_live.begin(),
+ state.sst_live.end());
+ std::unordered_set<uint64_t> blob_live_set(state.blob_live.begin(),
+ state.blob_live.end());
+ std::unordered_set<uint64_t> log_recycle_files_set(
+ state.log_recycle_files.begin(), state.log_recycle_files.end());
+
+ auto candidate_files = state.full_scan_candidate_files;
+ candidate_files.reserve(
+ candidate_files.size() + state.sst_delete_files.size() +
+ state.blob_delete_files.size() + state.log_delete_files.size() +
+ state.manifest_delete_files.size());
+ // We may ignore the dbname when generating the file names.
+ for (auto& file : state.sst_delete_files) {
+ if (!file.only_delete_metadata) {
+ candidate_files.emplace_back(
+ MakeTableFileName(file.metadata->fd.GetNumber()), file.path);
+ }
+ if (file.metadata->table_reader_handle) {
+ table_cache_->Release(file.metadata->table_reader_handle);
+ }
+ file.DeleteMetadata();
+ }
+
+ for (const auto& blob_file : state.blob_delete_files) {
+ candidate_files.emplace_back(BlobFileName(blob_file.GetBlobFileNumber()),
+ blob_file.GetPath());
+ }
+
+ auto wal_dir = immutable_db_options_.GetWalDir();
+ for (auto file_num : state.log_delete_files) {
+ if (file_num > 0) {
+ candidate_files.emplace_back(LogFileName(file_num), wal_dir);
+ }
+ }
+ for (const auto& filename : state.manifest_delete_files) {
+ candidate_files.emplace_back(filename, dbname_);
+ }
+
+ // dedup state.candidate_files so we don't try to delete the same
+ // file twice
+ std::sort(candidate_files.begin(), candidate_files.end(),
+ [](const JobContext::CandidateFileInfo& lhs,
+ const JobContext::CandidateFileInfo& rhs) {
+ if (lhs.file_name > rhs.file_name) {
+ return true;
+ } else if (lhs.file_name < rhs.file_name) {
+ return false;
+ } else {
+ return (lhs.file_path > rhs.file_path);
+ }
+ });
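+ // After the sort, identical (file_name, file_path) pairs are adjacent, so
+ // the std::unique() call below leaves exactly one entry per candidate file.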
+ candidate_files.erase(
+ std::unique(candidate_files.begin(), candidate_files.end()),
+ candidate_files.end());
+
+ if (state.prev_total_log_size > 0) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "[JOB %d] Try to delete WAL files size %" PRIu64
+ ", prev total WAL file size %" PRIu64
+ ", number of live WAL files %" ROCKSDB_PRIszt ".\n",
+ state.job_id, state.size_log_to_delete,
+ state.prev_total_log_size, state.num_alive_log_files);
+ }
+
+ std::vector<std::string> old_info_log_files;
+ InfoLogPrefix info_log_prefix(!immutable_db_options_.db_log_dir.empty(),
+ dbname_);
+
+ // File numbers of the two most recent OPTIONS files in candidate_files
+ // (found in the previous FindObsoleteFiles(full_scan=true)).
+ // At this point, there must not be any duplicate file numbers in
+ // candidate_files.
+ uint64_t optsfile_num1 = std::numeric_limits<uint64_t>::min();
+ uint64_t optsfile_num2 = std::numeric_limits<uint64_t>::min();
+ for (const auto& candidate_file : candidate_files) {
+ const std::string& fname = candidate_file.file_name;
+ uint64_t number;
+ FileType type;
+ if (!ParseFileName(fname, &number, info_log_prefix.prefix, &type) ||
+ type != kOptionsFile) {
+ continue;
+ }
+ if (number > optsfile_num1) {
+ optsfile_num2 = optsfile_num1;
+ optsfile_num1 = number;
+ } else if (number > optsfile_num2) {
+ optsfile_num2 = number;
+ }
+ }
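+ // optsfile_num1 and optsfile_num2 now hold the two largest (i.e. most
+ // recent) OPTIONS file numbers seen, so the kOptionsFile case below keeps
+ // exactly the two most recent OPTIONS files.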
+
+ // Close WALs before trying to delete them.
+ for (const auto w : state.logs_to_free) {
+ // TODO: maybe check the return value of Close.
+ auto s = w->Close();
+ s.PermitUncheckedError();
+ }
+
+ bool own_files = OwnTablesAndLogs();
+ std::unordered_set<uint64_t> files_to_del;
+ for (const auto& candidate_file : candidate_files) {
+ const std::string& to_delete = candidate_file.file_name;
+ uint64_t number;
+ FileType type;
+ // Ignore file if we cannot recognize it.
+ if (!ParseFileName(to_delete, &number, info_log_prefix.prefix, &type)) {
+ continue;
+ }
+
+ bool keep = true;
+ switch (type) {
+ case kWalFile:
+ keep = ((number >= state.log_number) ||
+ (number == state.prev_log_number) ||
+ (log_recycle_files_set.find(number) !=
+ log_recycle_files_set.end()));
+ break;
+ case kDescriptorFile:
+ // Keep my manifest file, and any newer incarnations
+ // (these can appear during a manifest roll).
+ keep = (number >= state.manifest_file_number);
+ break;
+ case kTableFile:
+ // If the second condition were not there, the
+ // DontDeletePendingOutputs test would fail.
+ keep = (sst_live_set.find(number) != sst_live_set.end()) ||
+ number >= state.min_pending_output;
+ if (!keep) {
+ files_to_del.insert(number);
+ }
+ break;
+ case kBlobFile:
+ keep = number >= state.min_pending_output ||
+ (blob_live_set.find(number) != blob_live_set.end());
+ if (!keep) {
+ files_to_del.insert(number);
+ }
+ break;
+ case kTempFile:
+ // Any temp files that are currently being written to must
+ // be recorded in pending_outputs_, which is inserted into "live".
+ // Also, SetCurrentFile creates a temp file when writing out the new
+ // manifest; its number is equal to state.pending_manifest_file_number.
+ // We should not delete that file.
+ //
+ // TODO(yhchiang): carefully modify the third condition to safely
+ // remove the temp options files.
+ keep = (sst_live_set.find(number) != sst_live_set.end()) ||
+ (blob_live_set.find(number) != blob_live_set.end()) ||
+ (number == state.pending_manifest_file_number) ||
+ (to_delete.find(kOptionsFileNamePrefix) != std::string::npos);
+ break;
+ case kInfoLogFile:
+ keep = true;
+ if (number != 0) {
+ old_info_log_files.push_back(to_delete);
+ }
+ break;
+ case kOptionsFile:
+ keep = (number >= optsfile_num2);
+ break;
+ case kCurrentFile:
+ case kDBLockFile:
+ case kIdentityFile:
+ case kMetaDatabase:
+ keep = true;
+ break;
+ }
+
+ if (keep) {
+ continue;
+ }
+
+ std::string fname;
+ std::string dir_to_sync;
+ if (type == kTableFile) {
+ // evict from cache
+ TableCache::Evict(table_cache_.get(), number);
+ fname = MakeTableFileName(candidate_file.file_path, number);
+ dir_to_sync = candidate_file.file_path;
+ } else if (type == kBlobFile) {
+ fname = BlobFileName(candidate_file.file_path, number);
+ dir_to_sync = candidate_file.file_path;
+ } else {
+ dir_to_sync = (type == kWalFile) ? wal_dir : dbname_;
+ fname = dir_to_sync +
+ ((!dir_to_sync.empty() && dir_to_sync.back() == '/') ||
+ (!to_delete.empty() && to_delete.front() == '/')
+ ? ""
+ : "/") +
+ to_delete;
+ }
+
+#ifndef ROCKSDB_LITE
+ if (type == kWalFile && (immutable_db_options_.WAL_ttl_seconds > 0 ||
+ immutable_db_options_.WAL_size_limit_MB > 0)) {
+ wal_manager_.ArchiveWALFile(fname, number);
+ continue;
+ }
+#endif // !ROCKSDB_LITE
+
+ // If I do not own these files, e.g. a secondary instance with
+ // max_open_files = -1, then there is no need to delete or schedule
+ // deletion of these files since they will be removed by their owner,
+ // e.g. the primary instance.
+ if (!own_files) {
+ continue;
+ }
+ if (schedule_only) {
+ InstrumentedMutexLock guard_lock(&mutex_);
+ SchedulePendingPurge(fname, dir_to_sync, type, number, state.job_id);
+ } else {
+ DeleteObsoleteFileImpl(state.job_id, fname, dir_to_sync, type, number);
+ }
+ }
+
+ {
+ // After purging obsolete files, remove them from files_grabbed_for_purge_.
+ InstrumentedMutexLock guard_lock(&mutex_);
+ autovector<uint64_t> to_be_removed;
+ for (auto fn : files_grabbed_for_purge_) {
+ if (files_to_del.count(fn) != 0) {
+ to_be_removed.emplace_back(fn);
+ }
+ }
+ for (auto fn : to_be_removed) {
+ files_grabbed_for_purge_.erase(fn);
+ }
+ }
+
+ // Delete old info log files.
+ size_t old_info_log_file_count = old_info_log_files.size();
+ if (old_info_log_file_count != 0 &&
+ old_info_log_file_count >= immutable_db_options_.keep_log_file_num) {
+ std::sort(old_info_log_files.begin(), old_info_log_files.end());
+ size_t end =
+ old_info_log_file_count - immutable_db_options_.keep_log_file_num;
+ for (unsigned int i = 0; i <= end; i++) {
+ std::string& to_delete = old_info_log_files.at(i);
+ std::string full_path_to_delete =
+ (immutable_db_options_.db_log_dir.empty()
+ ? dbname_
+ : immutable_db_options_.db_log_dir) +
+ "/" + to_delete;
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "[JOB %d] Delete info log file %s\n", state.job_id,
+ full_path_to_delete.c_str());
+ Status s = env_->DeleteFile(full_path_to_delete);
+ if (!s.ok()) {
+ if (env_->FileExists(full_path_to_delete).IsNotFound()) {
+ ROCKS_LOG_INFO(
+ immutable_db_options_.info_log,
+ "[JOB %d] Tried to delete non-existing info log file %s FAILED "
+ "-- %s\n",
+ state.job_id, to_delete.c_str(), s.ToString().c_str());
+ } else {
+ ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+ "[JOB %d] Delete info log file %s FAILED -- %s\n",
+ state.job_id, to_delete.c_str(),
+ s.ToString().c_str());
+ }
+ }
+ }
+ }
+#ifndef ROCKSDB_LITE
+ wal_manager_.PurgeObsoleteWALFiles();
+#endif // ROCKSDB_LITE
+ LogFlush(immutable_db_options_.info_log);
+ InstrumentedMutexLock l(&mutex_);
+ --pending_purge_obsolete_files_;
+ assert(pending_purge_obsolete_files_ >= 0);
+ if (schedule_only) {
+ // Must change from pending_purge_obsolete_files_ to bg_purge_scheduled_
+ // while holding mutex (for GetSortedWalFiles() etc.)
+ SchedulePurge();
+ }
+ if (pending_purge_obsolete_files_ == 0) {
+ bg_cv_.SignalAll();
+ }
+ TEST_SYNC_POINT("DBImpl::PurgeObsoleteFiles:End");
+}
+
+void DBImpl::DeleteObsoleteFiles() {
+ mutex_.AssertHeld();
+ JobContext job_context(next_job_id_.fetch_add(1));
+ FindObsoleteFiles(&job_context, true);
+
+ mutex_.Unlock();
+ if (job_context.HaveSomethingToDelete()) {
+ bool defer_purge = immutable_db_options_.avoid_unnecessary_blocking_io;
+ PurgeObsoleteFiles(job_context, defer_purge);
+ }
+ job_context.Clean();
+ mutex_.Lock();
+}
+
+uint64_t FindMinPrepLogReferencedByMemTable(
+ VersionSet* vset, const autovector<MemTable*>& memtables_to_flush) {
+ uint64_t min_log = 0;
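+ // A min_log of 0 means "no prepared section found yet"; it doubles as the
+ // sentinel value in the comparisons below.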
+
+ // We must look through the memtables for two-phase transactions
+ // that have been committed but not yet flushed.
+ std::unordered_set<MemTable*> memtables_to_flush_set(
+ memtables_to_flush.begin(), memtables_to_flush.end());
+ for (auto loop_cfd : *vset->GetColumnFamilySet()) {
+ if (loop_cfd->IsDropped()) {
+ continue;
+ }
+
+ auto log = loop_cfd->imm()->PrecomputeMinLogContainingPrepSection(
+ &memtables_to_flush_set);
+
+ if (log > 0 && (min_log == 0 || log < min_log)) {
+ min_log = log;
+ }
+
+ log = loop_cfd->mem()->GetMinLogContainingPrepSection();
+
+ if (log > 0 && (min_log == 0 || log < min_log)) {
+ min_log = log;
+ }
+ }
+
+ return min_log;
+}
+
+uint64_t FindMinPrepLogReferencedByMemTable(
+ VersionSet* vset,
+ const autovector<const autovector<MemTable*>*>& memtables_to_flush) {
+ uint64_t min_log = 0;
+
+ std::unordered_set<MemTable*> memtables_to_flush_set;
+ for (const autovector<MemTable*>* memtables : memtables_to_flush) {
+ memtables_to_flush_set.insert(memtables->begin(), memtables->end());
+ }
+ for (auto loop_cfd : *vset->GetColumnFamilySet()) {
+ if (loop_cfd->IsDropped()) {
+ continue;
+ }
+
+ auto log = loop_cfd->imm()->PrecomputeMinLogContainingPrepSection(
+ &memtables_to_flush_set);
+ if (log > 0 && (min_log == 0 || log < min_log)) {
+ min_log = log;
+ }
+
+ log = loop_cfd->mem()->GetMinLogContainingPrepSection();
+ if (log > 0 && (min_log == 0 || log < min_log)) {
+ min_log = log;
+ }
+ }
+
+ return min_log;
+}
+
+uint64_t PrecomputeMinLogNumberToKeepNon2PC(
+ VersionSet* vset, const ColumnFamilyData& cfd_to_flush,
+ const autovector<VersionEdit*>& edit_list) {
+ assert(vset != nullptr);
+
+ // Precompute the min log number containing unflushed data for the column
+ // family being flushed (`cfd_to_flush`).
+ uint64_t cf_min_log_number_to_keep = 0;
+ for (auto& e : edit_list) {
+ if (e->HasLogNumber()) {
+ cf_min_log_number_to_keep =
+ std::max(cf_min_log_number_to_keep, e->GetLogNumber());
+ }
+ }
+ if (cf_min_log_number_to_keep == 0) {
+ // No version edit contains information on log number. The log number
+ // for this column family should stay the same as it is.
+ cf_min_log_number_to_keep = cfd_to_flush.GetLogNumber();
+ }
+
+ // Get min log number containing unflushed data for other column families.
+ uint64_t min_log_number_to_keep =
+ vset->PreComputeMinLogNumberWithUnflushedData(&cfd_to_flush);
+ if (cf_min_log_number_to_keep != 0) {
+ min_log_number_to_keep =
+ std::min(cf_min_log_number_to_keep, min_log_number_to_keep);
+ }
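+ // Example: if this flush advances the flushed column family's log number to
+ // 12 while another column family still has unflushed data in WAL 9, the
+ // result is 9: WALs numbered below 9 become obsolete, while WAL 9 and
+ // everything newer are kept.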
+ return min_log_number_to_keep;
+}
+
+uint64_t PrecomputeMinLogNumberToKeepNon2PC(
+ VersionSet* vset, const autovector<ColumnFamilyData*>& cfds_to_flush,
+ const autovector<autovector<VersionEdit*>>& edit_lists) {
+ assert(vset != nullptr);
+ assert(!cfds_to_flush.empty());
+ assert(cfds_to_flush.size() == edit_lists.size());
+
+ uint64_t min_log_number_to_keep = std::numeric_limits<uint64_t>::max();
+ for (const auto& edit_list : edit_lists) {
+ uint64_t log = 0;
+ for (const auto& e : edit_list) {
+ if (e->HasLogNumber()) {
+ log = std::max(log, e->GetLogNumber());
+ }
+ }
+ if (log != 0) {
+ min_log_number_to_keep = std::min(min_log_number_to_keep, log);
+ }
+ }
+ if (min_log_number_to_keep == std::numeric_limits<uint64_t>::max()) {
+ min_log_number_to_keep = cfds_to_flush[0]->GetLogNumber();
+ for (size_t i = 1; i < cfds_to_flush.size(); i++) {
+ min_log_number_to_keep =
+ std::min(min_log_number_to_keep, cfds_to_flush[i]->GetLogNumber());
+ }
+ }
+
+ std::unordered_set<const ColumnFamilyData*> flushed_cfds(
+ cfds_to_flush.begin(), cfds_to_flush.end());
+ min_log_number_to_keep =
+ std::min(min_log_number_to_keep,
+ vset->PreComputeMinLogNumberWithUnflushedData(flushed_cfds));
+
+ return min_log_number_to_keep;
+}
+
+uint64_t PrecomputeMinLogNumberToKeep2PC(
+ VersionSet* vset, const ColumnFamilyData& cfd_to_flush,
+ const autovector<VersionEdit*>& edit_list,
+ const autovector<MemTable*>& memtables_to_flush,
+ LogsWithPrepTracker* prep_tracker) {
+ assert(vset != nullptr);
+ assert(prep_tracker != nullptr);
+ // Calculate the updated min_log_number_to_keep.
+ // Since this function should only be called in 2PC mode, the log number in
+ // the version edit should be sufficient.
+
+ uint64_t min_log_number_to_keep =
+ PrecomputeMinLogNumberToKeepNon2PC(vset, cfd_to_flush, edit_list);
+
+ // In 2PC mode we must consider logs containing prepared
+ // sections of outstanding transactions.
+ //
+ // We must check the min log with outstanding prepared sections before we
+ // check logs referenced by memtables, because a log referenced by the
+ // first data structure could transition to the second under us.
+ //
+ // TODO: we iterate over all column families under the db mutex;
+ // we should find a more optimal solution.
+ auto min_log_in_prep_heap =
+ prep_tracker->FindMinLogContainingOutstandingPrep();
+
+ if (min_log_in_prep_heap != 0 &&
+ min_log_in_prep_heap < min_log_number_to_keep) {
+ min_log_number_to_keep = min_log_in_prep_heap;
+ }
+
+ uint64_t min_log_refed_by_mem =
+ FindMinPrepLogReferencedByMemTable(vset, memtables_to_flush);
+
+ if (min_log_refed_by_mem != 0 &&
+ min_log_refed_by_mem < min_log_number_to_keep) {
+ min_log_number_to_keep = min_log_refed_by_mem;
+ }
+ return min_log_number_to_keep;
+}
+
+uint64_t PrecomputeMinLogNumberToKeep2PC(
+ VersionSet* vset, const autovector<ColumnFamilyData*>& cfds_to_flush,
+ const autovector<autovector<VersionEdit*>>& edit_lists,
+ const autovector<const autovector<MemTable*>*>& memtables_to_flush,
+ LogsWithPrepTracker* prep_tracker) {
+ assert(vset != nullptr);
+ assert(prep_tracker != nullptr);
+ assert(cfds_to_flush.size() == edit_lists.size());
+ assert(cfds_to_flush.size() == memtables_to_flush.size());
+
+ uint64_t min_log_number_to_keep =
+ PrecomputeMinLogNumberToKeepNon2PC(vset, cfds_to_flush, edit_lists);
+
+ uint64_t min_log_in_prep_heap =
+ prep_tracker->FindMinLogContainingOutstandingPrep();
+
+ if (min_log_in_prep_heap != 0 &&
+ min_log_in_prep_heap < min_log_number_to_keep) {
+ min_log_number_to_keep = min_log_in_prep_heap;
+ }
+
+ uint64_t min_log_refed_by_mem =
+ FindMinPrepLogReferencedByMemTable(vset, memtables_to_flush);
+
+ if (min_log_refed_by_mem != 0 &&
+ min_log_refed_by_mem < min_log_number_to_keep) {
+ min_log_number_to_keep = min_log_refed_by_mem;
+ }
+
+ return min_log_number_to_keep;
+}
+
+void DBImpl::SetDBId(std::string&& id, bool read_only,
+ RecoveryContext* recovery_ctx) {
+ assert(db_id_.empty());
+ assert(!id.empty());
+ db_id_ = std::move(id);
+ if (!read_only && immutable_db_options_.write_dbid_to_manifest) {
+ assert(recovery_ctx != nullptr);
+ assert(versions_->GetColumnFamilySet() != nullptr);
+ VersionEdit edit;
+ edit.SetDBId(db_id_);
+ versions_->db_id_ = db_id_;
+ recovery_ctx->UpdateVersionEdits(
+ versions_->GetColumnFamilySet()->GetDefault(), edit);
+ }
+}
+
+Status DBImpl::SetupDBId(bool read_only, RecoveryContext* recovery_ctx) {
+ Status s;
+ // Check for the IDENTITY file and create it if it is missing, broken, or
+ // does not match the manifest.
+ std::string db_id_in_file;
+ s = fs_->FileExists(IdentityFileName(dbname_), IOOptions(), nullptr);
+ if (s.ok()) {
+ s = GetDbIdentityFromIdentityFile(&db_id_in_file);
+ if (s.ok() && !db_id_in_file.empty()) {
+ if (db_id_.empty()) {
+ // Loaded from file and wasn't already known from manifest
+ SetDBId(std::move(db_id_in_file), read_only, recovery_ctx);
+ return s;
+ } else if (db_id_ == db_id_in_file) {
+ // Loaded from file and matches manifest
+ return s;
+ }
+ }
+ }
+ if (s.IsNotFound()) {
+ s = Status::OK();
+ }
+ if (!s.ok()) {
+ assert(s.IsIOError());
+ return s;
+ }
+ // Otherwise IDENTITY file is missing or no good.
+ // Generate new id if needed
+ if (db_id_.empty()) {
+ SetDBId(env_->GenerateUniqueId(), read_only, recovery_ctx);
+ }
+ // Persist it to IDENTITY file if allowed
+ if (!read_only) {
+ s = SetIdentityFile(env_, dbname_, db_id_);
+ }
+ return s;
+}
+
+Status DBImpl::DeleteUnreferencedSstFiles(RecoveryContext* recovery_ctx) {
+ mutex_.AssertHeld();
+ std::vector<std::string> paths;
+ paths.push_back(NormalizePath(dbname_ + std::string(1, kFilePathSeparator)));
+ for (const auto& db_path : immutable_db_options_.db_paths) {
+ paths.push_back(
+ NormalizePath(db_path.path + std::string(1, kFilePathSeparator)));
+ }
+ for (const auto* cfd : *versions_->GetColumnFamilySet()) {
+ for (const auto& cf_path : cfd->ioptions()->cf_paths) {
+ paths.push_back(
+ NormalizePath(cf_path.path + std::string(1, kFilePathSeparator)));
+ }
+ }
+ // Dedup paths
+ std::sort(paths.begin(), paths.end());
+ paths.erase(std::unique(paths.begin(), paths.end()), paths.end());
+
+ uint64_t next_file_number = versions_->current_next_file_number();
+ uint64_t largest_file_number = next_file_number;
+ Status s;
+ for (const auto& path : paths) {
+ std::vector<std::string> files;
+ s = env_->GetChildren(path, &files);
+ if (!s.ok()) {
+ break;
+ }
+ for (const auto& fname : files) {
+ uint64_t number = 0;
+ FileType type;
+ if (!ParseFileName(fname, &number, &type)) {
+ continue;
+ }
+ // path ends with '/' or '\\'
+ const std::string normalized_fpath = path + fname;
+ largest_file_number = std::max(largest_file_number, number);
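+ // An SST file numbered at or above the recovered next_file_number cannot be
+ // referenced by the recovered MANIFEST, so it is a leftover from an
+ // interrupted run and is safe to schedule for deletion.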
+ if (type == kTableFile && number >= next_file_number &&
+ recovery_ctx->files_to_delete_.find(normalized_fpath) ==
+ recovery_ctx->files_to_delete_.end()) {
+ recovery_ctx->files_to_delete_.emplace(normalized_fpath);
+ }
+ }
+ }
+ if (!s.ok()) {
+ return s;
+ }
+
+ if (largest_file_number >= next_file_number) {
+ versions_->next_file_number_.store(largest_file_number + 1);
+ }
+
+ VersionEdit edit;
+ edit.SetNextFile(versions_->next_file_number_.load());
+ assert(versions_->GetColumnFamilySet());
+ ColumnFamilyData* default_cfd = versions_->GetColumnFamilySet()->GetDefault();
+ assert(default_cfd);
+ recovery_ctx->UpdateVersionEdits(default_cfd, edit);
+ return s;
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/db_impl/db_impl_open.cc b/src/rocksdb/db/db_impl/db_impl_open.cc
new file mode 100644
index 000000000..40ffa2e85
--- /dev/null
+++ b/src/rocksdb/db/db_impl/db_impl_open.cc
@@ -0,0 +1,2106 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#include <cinttypes>
+
+#include "db/builder.h"
+#include "db/db_impl/db_impl.h"
+#include "db/error_handler.h"
+#include "db/periodic_task_scheduler.h"
+#include "env/composite_env_wrapper.h"
+#include "file/filename.h"
+#include "file/read_write_util.h"
+#include "file/sst_file_manager_impl.h"
+#include "file/writable_file_writer.h"
+#include "logging/logging.h"
+#include "monitoring/persistent_stats_history.h"
+#include "options/options_helper.h"
+#include "rocksdb/table.h"
+#include "rocksdb/wal_filter.h"
+#include "test_util/sync_point.h"
+#include "util/rate_limiter.h"
+
+namespace ROCKSDB_NAMESPACE {
+Options SanitizeOptions(const std::string& dbname, const Options& src,
+ bool read_only, Status* logger_creation_s) {
+ auto db_options =
+ SanitizeOptions(dbname, DBOptions(src), read_only, logger_creation_s);
+ ImmutableDBOptions immutable_db_options(db_options);
+ auto cf_options =
+ SanitizeOptions(immutable_db_options, ColumnFamilyOptions(src));
+ return Options(db_options, cf_options);
+}
+
+DBOptions SanitizeOptions(const std::string& dbname, const DBOptions& src,
+ bool read_only, Status* logger_creation_s) {
+ DBOptions result(src);
+
+ if (result.env == nullptr) {
+ result.env = Env::Default();
+ }
+
+ // A result.max_open_files of -1 means an "infinite" number of open files.
+ if (result.max_open_files != -1) {
+ int max_max_open_files = port::GetMaxOpenFiles();
+ if (max_max_open_files == -1) {
+ max_max_open_files = 0x400000;
+ }
+ ClipToRange(&result.max_open_files, 20, max_max_open_files);
+ TEST_SYNC_POINT_CALLBACK("SanitizeOptions::AfterChangeMaxOpenFiles",
+ &result.max_open_files);
+ }
+
+ if (result.info_log == nullptr && !read_only) {
+ Status s = CreateLoggerFromOptions(dbname, result, &result.info_log);
+ if (!s.ok()) {
+ // No place suitable for logging
+ result.info_log = nullptr;
+ if (logger_creation_s) {
+ *logger_creation_s = s;
+ }
+ }
+ }
+
+ if (!result.write_buffer_manager) {
+ result.write_buffer_manager.reset(
+ new WriteBufferManager(result.db_write_buffer_size));
+ }
+ auto bg_job_limits = DBImpl::GetBGJobLimits(
+ result.max_background_flushes, result.max_background_compactions,
+ result.max_background_jobs, true /* parallelize_compactions */);
+ result.env->IncBackgroundThreadsIfNeeded(bg_job_limits.max_compactions,
+ Env::Priority::LOW);
+ result.env->IncBackgroundThreadsIfNeeded(bg_job_limits.max_flushes,
+ Env::Priority::HIGH);
+
+ if (result.rate_limiter.get() != nullptr) {
+ if (result.bytes_per_sync == 0) {
+ result.bytes_per_sync = 1024 * 1024;
+ }
+ }
+
+ if (result.delayed_write_rate == 0) {
+ if (result.rate_limiter.get() != nullptr) {
+ result.delayed_write_rate = result.rate_limiter->GetBytesPerSecond();
+ }
+ if (result.delayed_write_rate == 0) {
+ result.delayed_write_rate = 16 * 1024 * 1024;
+ }
+ }
+
+ if (result.WAL_ttl_seconds > 0 || result.WAL_size_limit_MB > 0) {
+ result.recycle_log_file_num = 0;
+ }
+
+ if (result.recycle_log_file_num &&
+ (result.wal_recovery_mode ==
+ WALRecoveryMode::kTolerateCorruptedTailRecords ||
+ result.wal_recovery_mode == WALRecoveryMode::kPointInTimeRecovery ||
+ result.wal_recovery_mode == WALRecoveryMode::kAbsoluteConsistency)) {
+ // - kTolerateCorruptedTailRecords is inconsistent with recycle log file
+ // feature. WAL recycling expects recovery success upon encountering a
+ // corrupt record at the point where new data ends and recycled data
+ // remains at the tail. However, `kTolerateCorruptedTailRecords` must fail
+ // upon encountering any such corrupt record, as it cannot differentiate
+ // between this and a real corruption, which would cause committed updates
+ // to be truncated -- a violation of the recovery guarantee.
+ // - kPointInTimeRecovery and kAbsoluteConsistency are incompatible with
+ // recycle log file feature temporarily due to a bug found introducing a
+ // hole in the recovered data
+ // (https://github.com/facebook/rocksdb/pull/7252#issuecomment-673766236).
+ // Besides this bug, we believe the features are fundamentally compatible.
+ result.recycle_log_file_num = 0;
+ }
+
+ if (result.db_paths.size() == 0) {
+ result.db_paths.emplace_back(dbname, std::numeric_limits<uint64_t>::max());
+ } else if (result.wal_dir.empty()) {
+ // Use dbname as default
+ result.wal_dir = dbname;
+ }
+ if (!result.wal_dir.empty()) {
+ // If there is a wal_dir already set, check to see if the wal_dir is the
+ // same as the dbname AND the same as the db_path[0] (which must exist from
+ // a few lines ago). If the wal_dir matches both of these values, then clear
+ // the wal_dir value, which will make wal_dir == dbname. Most likely this
+ // condition was the result of reading an old options file where we forced
+ // wal_dir to be set (to dbname).
+ auto npath = NormalizePath(dbname + "/");
+ if (npath == NormalizePath(result.wal_dir + "/") &&
+ npath == NormalizePath(result.db_paths[0].path + "/")) {
+ result.wal_dir.clear();
+ }
+ }
+
+ if (!result.wal_dir.empty() && result.wal_dir.back() == '/') {
+ result.wal_dir = result.wal_dir.substr(0, result.wal_dir.size() - 1);
+ }
+
+ if (result.use_direct_reads && result.compaction_readahead_size == 0) {
+ TEST_SYNC_POINT_CALLBACK("SanitizeOptions:direct_io", nullptr);
+ result.compaction_readahead_size = 1024 * 1024 * 2;
+ }
+
+ // Force flush on DB open if 2PC is enabled, since with 2PC we have no
+ // guarantee that consecutive log files have consecutive sequence ids, which
+ // makes recovery complicated.
+ if (result.allow_2pc) {
+ result.avoid_flush_during_recovery = false;
+ }
+
+#ifndef ROCKSDB_LITE
+ ImmutableDBOptions immutable_db_options(result);
+ if (!immutable_db_options.IsWalDirSameAsDBPath()) {
+ // Either the WAL dir and db_paths[0]/db_name are not the same, or we
+ // cannot tell for sure. In either case, assume they're different and
+ // explicitly cleanup the trash log files (bypass DeleteScheduler)
+ // Do this first so even if we end up calling
+ // DeleteScheduler::CleanupDirectory on the same dir later, it will be
+ // safe
+ std::vector<std::string> filenames;
+ IOOptions io_opts;
+ io_opts.do_not_recurse = true;
+ auto wal_dir = immutable_db_options.GetWalDir();
+ Status s = immutable_db_options.fs->GetChildren(
+ wal_dir, io_opts, &filenames, /*IODebugContext*=*/nullptr);
+ s.PermitUncheckedError();  // TODO: What should we do on error?
+ for (std::string& filename : filenames) {
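+ // find() starting at length() - strlen(".log.trash") can only match when
+ // the filename actually ends with ".log.trash".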
+ if (filename.find(".log.trash", filename.length() -
+ std::string(".log.trash").length()) !=
+ std::string::npos) {
+ std::string trash_file = wal_dir + "/" + filename;
+ result.env->DeleteFile(trash_file).PermitUncheckedError();
+ }
+ }
+ }
+ // When the DB is stopped, it's possible that there are some .trash files
+ // that were not deleted yet. When we open the DB we will find these .trash
+ // files and schedule them to be deleted (or delete them immediately if
+ // SstFileManager was not used).
+ auto sfm = static_cast<SstFileManagerImpl*>(result.sst_file_manager.get());
+ for (size_t i = 0; i < result.db_paths.size(); i++) {
+ DeleteScheduler::CleanupDirectory(result.env, sfm, result.db_paths[i].path)
+ .PermitUncheckedError();
+ }
+
+ // Create a default SstFileManager for purposes of tracking compaction size
+ // and facilitating recovery from out of space errors.
+ if (result.sst_file_manager.get() == nullptr) {
+ std::shared_ptr<SstFileManager> sst_file_manager(
+ NewSstFileManager(result.env, result.info_log));
+ result.sst_file_manager = sst_file_manager;
+ }
+#endif // !ROCKSDB_LITE
+
+ // Supported wal compression types
+ if (!StreamingCompressionTypeSupported(result.wal_compression)) {
+ result.wal_compression = kNoCompression;
+ ROCKS_LOG_WARN(result.info_log,
+ "wal_compression is disabled since only zstd is supported");
+ }
+
+ if (!result.paranoid_checks) {
+ result.skip_checking_sst_file_sizes_on_db_open = true;
+ ROCKS_LOG_INFO(result.info_log,
+ "file size check will be skipped during open.");
+ }
+
+ return result;
+}
+
+namespace {
+Status ValidateOptionsByTable(
+ const DBOptions& db_opts,
+ const std::vector<ColumnFamilyDescriptor>& column_families) {
+ Status s;
+ for (auto cf : column_families) {
+ s = ValidateOptions(db_opts, cf.options);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+ return Status::OK();
+}
+} // namespace
+
+Status DBImpl::ValidateOptions(
+ const DBOptions& db_options,
+ const std::vector<ColumnFamilyDescriptor>& column_families) {
+ Status s;
+ for (auto& cfd : column_families) {
+ s = ColumnFamilyData::ValidateOptions(db_options, cfd.options);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+ s = ValidateOptions(db_options);
+ return s;
+}
+
+Status DBImpl::ValidateOptions(const DBOptions& db_options) {
+ if (db_options.db_paths.size() > 4) {
+ return Status::NotSupported(
+ "More than four DB paths are not supported yet. ");
+ }
+
+ if (db_options.allow_mmap_reads && db_options.use_direct_reads) {
+ // Protect against assert in PosixMMapReadableFile constructor
+ return Status::NotSupported(
+ "If memory mapped reads (allow_mmap_reads) are enabled "
+ "then direct I/O reads (use_direct_reads) must be disabled. ");
+ }
+
+ if (db_options.allow_mmap_writes &&
+ db_options.use_direct_io_for_flush_and_compaction) {
+ return Status::NotSupported(
+ "If memory mapped writes (allow_mmap_writes) are enabled "
+ "then direct I/O writes (use_direct_io_for_flush_and_compaction) must "
+ "be disabled. ");
+ }
+
+ if (db_options.keep_log_file_num == 0) {
+ return Status::InvalidArgument("keep_log_file_num must be greater than 0");
+ }
+
+ if (db_options.unordered_write &&
+ !db_options.allow_concurrent_memtable_write) {
+ return Status::InvalidArgument(
+ "unordered_write is incompatible with "
+ "!allow_concurrent_memtable_write");
+ }
+
+ if (db_options.unordered_write && db_options.enable_pipelined_write) {
+ return Status::InvalidArgument(
+ "unordered_write is incompatible with enable_pipelined_write");
+ }
+
+ if (db_options.atomic_flush && db_options.enable_pipelined_write) {
+ return Status::InvalidArgument(
+ "atomic_flush is incompatible with enable_pipelined_write");
+ }
+
+ // TODO remove this restriction
+ if (db_options.atomic_flush && db_options.best_efforts_recovery) {
+ return Status::InvalidArgument(
+ "atomic_flush is currently incompatible with best-efforts recovery");
+ }
+
+ if (db_options.use_direct_io_for_flush_and_compaction &&
+ 0 == db_options.writable_file_max_buffer_size) {
+ return Status::InvalidArgument(
+ "writes in direct IO require writable_file_max_buffer_size > 0");
+ }
+
+ return Status::OK();
+}
+
+Status DBImpl::NewDB(std::vector<std::string>* new_filenames) {
+ VersionEdit new_db;
+ Status s = SetIdentityFile(env_, dbname_);
+ if (!s.ok()) {
+ return s;
+ }
+ if (immutable_db_options_.write_dbid_to_manifest) {
+ std::string temp_db_id;
+ GetDbIdentityFromIdentityFile(&temp_db_id);
+ new_db.SetDBId(temp_db_id);
+ }
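+ // A brand new DB starts from an empty VersionEdit: no WAL yet (log number
+ // 0), and since MANIFEST file number 1 is created below, the next file
+ // number is 2.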
+ new_db.SetLogNumber(0);
+ new_db.SetNextFile(2);
+ new_db.SetLastSequence(0);
+
+ ROCKS_LOG_INFO(immutable_db_options_.info_log, "Creating manifest 1 \n");
+ const std::string manifest = DescriptorFileName(dbname_, 1);
+ {
+ if (fs_->FileExists(manifest, IOOptions(), nullptr).ok()) {
+ fs_->DeleteFile(manifest, IOOptions(), nullptr).PermitUncheckedError();
+ }
+ std::unique_ptr<FSWritableFile> file;
+ FileOptions file_options = fs_->OptimizeForManifestWrite(file_options_);
+ s = NewWritableFile(fs_.get(), manifest, &file, file_options);
+ if (!s.ok()) {
+ return s;
+ }
+ FileTypeSet tmp_set = immutable_db_options_.checksum_handoff_file_types;
+ file->SetPreallocationBlockSize(
+ immutable_db_options_.manifest_preallocation_size);
+ std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
+ std::move(file), manifest, file_options, immutable_db_options_.clock,
+ io_tracer_, nullptr /* stats */, immutable_db_options_.listeners,
+ nullptr, tmp_set.Contains(FileType::kDescriptorFile),
+ tmp_set.Contains(FileType::kDescriptorFile)));
+ log::Writer log(std::move(file_writer), 0, false);
+ std::string record;
+ new_db.EncodeTo(&record);
+ s = log.AddRecord(record);
+ if (s.ok()) {
+ s = SyncManifest(&immutable_db_options_, log.file());
+ }
+ }
+ if (s.ok()) {
+ // Make "CURRENT" file that points to the new manifest file.
+ s = SetCurrentFile(fs_.get(), dbname_, 1, directories_.GetDbDir());
+ if (new_filenames) {
+ new_filenames->emplace_back(
+ manifest.substr(manifest.find_last_of("/\\") + 1));
+ }
+ } else {
+ fs_->DeleteFile(manifest, IOOptions(), nullptr).PermitUncheckedError();
+ }
+ return s;
+}
+
+IOStatus DBImpl::CreateAndNewDirectory(
+ FileSystem* fs, const std::string& dirname,
+ std::unique_ptr<FSDirectory>* directory) {
+ // We call CreateDirIfMissing() as the directory may already exist (if we
+ // are reopening a DB); when this happens we don't want creating the
+ // directory to cause an error. However, we need to check if creating the
+ // directory fails or else we may get an obscure message about the lock
+ // file not existing. One real-world example of this occurring is if
+ // env->CreateDirIfMissing() doesn't create intermediate directories, e.g.
+ // when dbname_ is "dir/db" but "dir" doesn't exist.
+ IOStatus io_s = fs->CreateDirIfMissing(dirname, IOOptions(), nullptr);
+ if (!io_s.ok()) {
+ return io_s;
+ }
+ return fs->NewDirectory(dirname, IOOptions(), directory, nullptr);
+}
+
+IOStatus Directories::SetDirectories(FileSystem* fs, const std::string& dbname,
+ const std::string& wal_dir,
+ const std::vector<DbPath>& data_paths) {
+ IOStatus io_s = DBImpl::CreateAndNewDirectory(fs, dbname, &db_dir_);
+ if (!io_s.ok()) {
+ return io_s;
+ }
+ if (!wal_dir.empty() && dbname != wal_dir) {
+ io_s = DBImpl::CreateAndNewDirectory(fs, wal_dir, &wal_dir_);
+ if (!io_s.ok()) {
+ return io_s;
+ }
+ }
+
+ data_dirs_.clear();
+ for (auto& p : data_paths) {
+ const std::string db_path = p.path;
+ if (db_path == dbname) {
+ data_dirs_.emplace_back(nullptr);
+ } else {
+ std::unique_ptr<FSDirectory> path_directory;
+ io_s = DBImpl::CreateAndNewDirectory(fs, db_path, &path_directory);
+ if (!io_s.ok()) {
+ return io_s;
+ }
+ data_dirs_.emplace_back(path_directory.release());
+ }
+ }
+ assert(data_dirs_.size() == data_paths.size());
+ return IOStatus::OK();
+}
+
+Status DBImpl::Recover(
+ const std::vector<ColumnFamilyDescriptor>& column_families, bool read_only,
+ bool error_if_wal_file_exists, bool error_if_data_exists_in_wals,
+ uint64_t* recovered_seq, RecoveryContext* recovery_ctx) {
+ mutex_.AssertHeld();
+
+ bool is_new_db = false;
+ assert(db_lock_ == nullptr);
+ std::vector<std::string> files_in_dbname;
+ if (!read_only) {
+ Status s = directories_.SetDirectories(fs_.get(), dbname_,
+ immutable_db_options_.wal_dir,
+ immutable_db_options_.db_paths);
+ if (!s.ok()) {
+ return s;
+ }
+
+ s = env_->LockFile(LockFileName(dbname_), &db_lock_);
+ if (!s.ok()) {
+ return s;
+ }
+
+ std::string current_fname = CurrentFileName(dbname_);
+ // Path to any MANIFEST file in the db dir. It does not matter which one.
+ // Since best-efforts recovery ignores the CURRENT file, the existence of a
+ // MANIFEST indicates that recovery should recover an existing db. If no
+ // MANIFEST can be found, a new db will be created.
+ std::string manifest_path;
+ if (!immutable_db_options_.best_efforts_recovery) {
+ s = env_->FileExists(current_fname);
+ } else {
+ s = Status::NotFound();
+ IOOptions io_opts;
+ io_opts.do_not_recurse = true;
+ Status io_s = immutable_db_options_.fs->GetChildren(
+ dbname_, io_opts, &files_in_dbname, /*IODebugContext*=*/nullptr);
+ if (!io_s.ok()) {
+ s = io_s;
+ files_in_dbname.clear();
+ }
+ for (const std::string& file : files_in_dbname) {
+ uint64_t number = 0;
+ FileType type = kWalFile; // initialize
+ if (ParseFileName(file, &number, &type) && type == kDescriptorFile) {
+ uint64_t bytes;
+ s = env_->GetFileSize(DescriptorFileName(dbname_, number), &bytes);
+ if (s.ok() && bytes != 0) {
+ // Found non-empty MANIFEST (descriptor log), thus best-efforts
+ // recovery does not have to treat the db as empty.
+ manifest_path = dbname_ + "/" + file;
+ break;
+ }
+ }
+ }
+ }
+ if (s.IsNotFound()) {
+ if (immutable_db_options_.create_if_missing) {
+ s = NewDB(&files_in_dbname);
+ is_new_db = true;
+ if (!s.ok()) {
+ return s;
+ }
+ } else {
+ return Status::InvalidArgument(
+ current_fname, "does not exist (create_if_missing is false)");
+ }
+ } else if (s.ok()) {
+ if (immutable_db_options_.error_if_exists) {
+ return Status::InvalidArgument(dbname_,
+ "exists (error_if_exists is true)");
+ }
+ } else {
+ // Unexpected error reading file
+ assert(s.IsIOError());
+ return s;
+ }
+ // Verify compatibility of file_options_ and filesystem
+ {
+ std::unique_ptr<FSRandomAccessFile> idfile;
+ FileOptions customized_fs(file_options_);
+ customized_fs.use_direct_reads |=
+ immutable_db_options_.use_direct_io_for_flush_and_compaction;
+ const std::string& fname =
+ manifest_path.empty() ? current_fname : manifest_path;
+ s = fs_->NewRandomAccessFile(fname, customized_fs, &idfile, nullptr);
+ if (!s.ok()) {
+ std::string error_str = s.ToString();
+ // Check if unsupported Direct I/O is the root cause
+ customized_fs.use_direct_reads = false;
+ s = fs_->NewRandomAccessFile(fname, customized_fs, &idfile, nullptr);
+ if (s.ok()) {
+ return Status::InvalidArgument(
+ "Direct I/O is not supported by the specified DB.");
+ } else {
+ return Status::InvalidArgument(
+ "Found options incompatible with filesystem", error_str.c_str());
+ }
+ }
+ }
+ } else if (immutable_db_options_.best_efforts_recovery) {
+ assert(files_in_dbname.empty());
+ IOOptions io_opts;
+ io_opts.do_not_recurse = true;
+ Status s = immutable_db_options_.fs->GetChildren(
+ dbname_, io_opts, &files_in_dbname, /*IODebugContext*=*/nullptr);
+ if (s.IsNotFound()) {
+ return Status::InvalidArgument(dbname_,
+ "does not exist (open for read only)");
+ } else if (s.IsIOError()) {
+ return s;
+ }
+ assert(s.ok());
+ }
+ assert(db_id_.empty());
+ Status s;
+ bool missing_table_file = false;
+ if (!immutable_db_options_.best_efforts_recovery) {
+ s = versions_->Recover(column_families, read_only, &db_id_);
+ } else {
+ assert(!files_in_dbname.empty());
+ s = versions_->TryRecover(column_families, read_only, files_in_dbname,
+ &db_id_, &missing_table_file);
+ if (s.ok()) {
+ // TryRecover may delete previous column_family_set_.
+ column_family_memtables_.reset(
+ new ColumnFamilyMemTablesImpl(versions_->GetColumnFamilySet()));
+ }
+ }
+ if (!s.ok()) {
+ return s;
+ }
+ s = SetupDBId(read_only, recovery_ctx);
+ ROCKS_LOG_INFO(immutable_db_options_.info_log, "DB ID: %s\n", db_id_.c_str());
+ if (s.ok() && !read_only) {
+ s = DeleteUnreferencedSstFiles(recovery_ctx);
+ }
+
+ if (immutable_db_options_.paranoid_checks && s.ok()) {
+ s = CheckConsistency();
+ }
+ if (s.ok() && !read_only) {
+ // TODO: share file descriptors (FSDirectory) with SetDirectories above
+ std::map<std::string, std::shared_ptr<FSDirectory>> created_dirs;
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ s = cfd->AddDirectories(&created_dirs);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+ }
+
+ std::vector<std::string> files_in_wal_dir;
+ if (s.ok()) {
+ // Initialize max_total_in_memory_state_ before recovering WALs. Log recovery
+ // may check this value to decide whether to flush.
+ max_total_in_memory_state_ = 0;
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ auto* mutable_cf_options = cfd->GetLatestMutableCFOptions();
+ max_total_in_memory_state_ += mutable_cf_options->write_buffer_size *
+ mutable_cf_options->max_write_buffer_number;
+ }
+
+ SequenceNumber next_sequence(kMaxSequenceNumber);
+ default_cf_handle_ = new ColumnFamilyHandleImpl(
+ versions_->GetColumnFamilySet()->GetDefault(), this, &mutex_);
+ default_cf_internal_stats_ = default_cf_handle_->cfd()->internal_stats();
+
+ // Recover from all newer log files than the ones named in the
+ // descriptor (new log files may have been added by the previous
+ // incarnation without registering them in the descriptor).
+ //
+ // Note that prev_log_number() is no longer used, but we pay
+ // attention to it in case we are recovering a database
+ // produced by an older version of rocksdb.
+ auto wal_dir = immutable_db_options_.GetWalDir();
+ if (!immutable_db_options_.best_efforts_recovery) {
+ IOOptions io_opts;
+ io_opts.do_not_recurse = true;
+ s = immutable_db_options_.fs->GetChildren(
+ wal_dir, io_opts, &files_in_wal_dir, /*IODebugContext*=*/nullptr);
+ }
+ if (s.IsNotFound()) {
+ return Status::InvalidArgument("wal_dir not found", wal_dir);
+ } else if (!s.ok()) {
+ return s;
+ }
+
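+ // Map each WAL number found in the wal_dir to its full file name so the
+ // WALs can be verified against the MANIFEST and replayed in order below.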
+ std::unordered_map<uint64_t, std::string> wal_files;
+ for (const auto& file : files_in_wal_dir) {
+ uint64_t number;
+ FileType type;
+ if (ParseFileName(file, &number, &type) && type == kWalFile) {
+ if (is_new_db) {
+ return Status::Corruption(
+ "While creating a new Db, wal_dir contains "
+ "existing log file: ",
+ file);
+ } else {
+ wal_files[number] = LogFileName(wal_dir, number);
+ }
+ }
+ }
+
+ if (immutable_db_options_.track_and_verify_wals_in_manifest) {
+ if (!immutable_db_options_.best_efforts_recovery) {
+ // Verify WALs in MANIFEST.
+ s = versions_->GetWalSet().CheckWals(env_, wal_files);
+ } // else since best effort recovery does not recover from WALs, no need
+ // to check WALs.
+ } else if (!versions_->GetWalSet().GetWals().empty()) {
+ // Tracking is disabled, clear previously tracked WALs from MANIFEST,
+ // otherwise, in the future, if WAL tracking is enabled again,
+ // since the WALs deleted when WAL tracking is disabled are not persisted
+ // into MANIFEST, WAL check may fail.
+ VersionEdit edit;
+ WalNumber max_wal_number =
+ versions_->GetWalSet().GetWals().rbegin()->first;
+ edit.DeleteWalsBefore(max_wal_number + 1);
+ assert(recovery_ctx != nullptr);
+ assert(versions_->GetColumnFamilySet() != nullptr);
+ recovery_ctx->UpdateVersionEdits(
+ versions_->GetColumnFamilySet()->GetDefault(), edit);
+ }
+ if (!s.ok()) {
+ return s;
+ }
+
+ if (!wal_files.empty()) {
+ if (error_if_wal_file_exists) {
+ return Status::Corruption(
+ "The db was opened in readonly mode with the error_if_wal_file_exists "
+ "flag but a WAL file already exists");
+ } else if (error_if_data_exists_in_wals) {
+ for (auto& wal_file : wal_files) {
+ uint64_t bytes;
+ s = env_->GetFileSize(wal_file.second, &bytes);
+ if (s.ok()) {
+ if (bytes > 0) {
+ return Status::Corruption(
+ "error_if_data_exists_in_wals is set but there is data "
+ "in WAL files.");
+ }
+ }
+ }
+ }
+ }
+
+ if (!wal_files.empty()) {
+ // Recover in the order in which the wals were generated
+ std::vector<uint64_t> wals;
+ wals.reserve(wal_files.size());
+ for (const auto& wal_file : wal_files) {
+ wals.push_back(wal_file.first);
+ }
+ std::sort(wals.begin(), wals.end());
+
+ bool corrupted_wal_found = false;
+ s = RecoverLogFiles(wals, &next_sequence, read_only, &corrupted_wal_found,
+ recovery_ctx);
+ if (corrupted_wal_found && recovered_seq != nullptr) {
+ *recovered_seq = next_sequence;
+ }
+ if (!s.ok()) {
+ // Clear memtables if recovery failed
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ cfd->CreateNewMemtable(*cfd->GetLatestMutableCFOptions(),
+ kMaxSequenceNumber);
+ }
+ }
+ }
+ }
+
+ if (read_only) {
+ // If we are opening as read-only, we need to update options_file_number_
+ // to reflect the most recent OPTIONS file. It does not matter for regular
+ // read-write db instance because options_file_number_ will later be
+ // updated to versions_->NewFileNumber() in RenameTempFileToOptionsFile.
+ std::vector<std::string> filenames;
+ if (s.ok()) {
+ const std::string normalized_dbname = NormalizePath(dbname_);
+ const std::string normalized_wal_dir =
+ NormalizePath(immutable_db_options_.GetWalDir());
+ if (immutable_db_options_.best_efforts_recovery) {
+ filenames = std::move(files_in_dbname);
+ } else if (normalized_dbname == normalized_wal_dir) {
+ filenames = std::move(files_in_wal_dir);
+ } else {
+ IOOptions io_opts;
+ io_opts.do_not_recurse = true;
+ s = immutable_db_options_.fs->GetChildren(
+ GetName(), io_opts, &filenames, /*IODebugContext*=*/nullptr);
+ }
+ }
+ if (s.ok()) {
+ uint64_t number = 0;
+ uint64_t options_file_number = 0;
+ FileType type;
+ for (const auto& fname : filenames) {
+ if (ParseFileName(fname, &number, &type) && type == kOptionsFile) {
+ options_file_number = std::max(number, options_file_number);
+ }
+ }
+ versions_->options_file_number_ = options_file_number;
+ uint64_t options_file_size = 0;
+ if (options_file_number > 0) {
+ s = env_->GetFileSize(OptionsFileName(GetName(), options_file_number),
+ &options_file_size);
+ }
+ versions_->options_file_size_ = options_file_size;
+ }
+ }
+ return s;
+}
+
+Status DBImpl::PersistentStatsProcessFormatVersion() {
+ mutex_.AssertHeld();
+ Status s;
+ // persist version when stats CF doesn't exist
+ bool should_persist_format_version = !persistent_stats_cfd_exists_;
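+ // The DB mutex is released while reading/writing the stats CF below and
+ // re-acquired before returning.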
+ mutex_.Unlock();
+ if (persistent_stats_cfd_exists_) {
+ // Check persistent stats format version compatibility. Drop and recreate
+ // persistent stats CF if format version is incompatible
+ uint64_t format_version_recovered = 0;
+ Status s_format = DecodePersistentStatsVersionNumber(
+ this, StatsVersionKeyType::kFormatVersion, &format_version_recovered);
+ uint64_t compatible_version_recovered = 0;
+ Status s_compatible = DecodePersistentStatsVersionNumber(
+ this, StatsVersionKeyType::kCompatibleVersion,
+ &compatible_version_recovered);
+ // Abort reading from the existing stats CF if any of the following is true:
+ // 1. we failed to read the format version or compatible version from disk
+ // 2. the sst's format version is greater than the current format version,
+ // meaning the sst is encoded by a newer RocksDB release, and the current
+ // compatible version is below the sst's compatible version
+ if (!s_format.ok() || !s_compatible.ok() ||
+ (kStatsCFCurrentFormatVersion < format_version_recovered &&
+ kStatsCFCompatibleFormatVersion < compatible_version_recovered)) {
+ if (!s_format.ok() || !s_compatible.ok()) {
+ ROCKS_LOG_WARN(
+ immutable_db_options_.info_log,
+ "Recreating persistent stats column family since reading "
+ "persistent stats version key failed. Format key: %s, compatible "
+ "key: %s",
+ s_format.ToString().c_str(), s_compatible.ToString().c_str());
+ } else {
+ ROCKS_LOG_WARN(
+ immutable_db_options_.info_log,
+ "Recreating persistent stats column family due to corrupted or "
+ "incompatible format version. Recovered format: %" PRIu64
+ "; recovered format compatible since: %" PRIu64 "\n",
+ format_version_recovered, compatible_version_recovered);
+ }
+ s = DropColumnFamily(persist_stats_cf_handle_);
+ if (s.ok()) {
+ s = DestroyColumnFamilyHandle(persist_stats_cf_handle_);
+ }
+ ColumnFamilyHandle* handle = nullptr;
+ if (s.ok()) {
+ ColumnFamilyOptions cfo;
+ OptimizeForPersistentStats(&cfo);
+ s = CreateColumnFamily(cfo, kPersistentStatsColumnFamilyName, &handle);
+ }
+ if (s.ok()) {
+ persist_stats_cf_handle_ = static_cast<ColumnFamilyHandleImpl*>(handle);
+ // should also persist version here because old stats CF is discarded
+ should_persist_format_version = true;
+ }
+ }
+ }
+ if (should_persist_format_version) {
+ // The persistent stats CF is being created for the first time, so we need
+ // to write the format version key.
+ WriteBatch batch;
+ if (s.ok()) {
+ s = batch.Put(persist_stats_cf_handle_, kFormatVersionKeyString,
+ std::to_string(kStatsCFCurrentFormatVersion));
+ }
+ if (s.ok()) {
+ s = batch.Put(persist_stats_cf_handle_, kCompatibleVersionKeyString,
+ std::to_string(kStatsCFCompatibleFormatVersion));
+ }
+ if (s.ok()) {
+ WriteOptions wo;
+ wo.low_pri = true;
+ wo.no_slowdown = true;
+ wo.sync = false;
+ s = Write(wo, &batch);
+ }
+ }
+ mutex_.Lock();
+ return s;
+}
+
+Status DBImpl::InitPersistStatsColumnFamily() {
+ mutex_.AssertHeld();
+ assert(!persist_stats_cf_handle_);
+ ColumnFamilyData* persistent_stats_cfd =
+ versions_->GetColumnFamilySet()->GetColumnFamily(
+ kPersistentStatsColumnFamilyName);
+ persistent_stats_cfd_exists_ = persistent_stats_cfd != nullptr;
+
+ Status s;
+ if (persistent_stats_cfd != nullptr) {
+ // We are recovering from a DB which already contains the persistent stats
+ // CF. The CF is already created in VersionSet::ApplyOneVersionEdit, but the
+ // column family handle was not. We need to explicitly create the handle here.
+ persist_stats_cf_handle_ =
+ new ColumnFamilyHandleImpl(persistent_stats_cfd, this, &mutex_);
+ } else {
+ mutex_.Unlock();
+ ColumnFamilyHandle* handle = nullptr;
+ ColumnFamilyOptions cfo;
+ OptimizeForPersistentStats(&cfo);
+ s = CreateColumnFamily(cfo, kPersistentStatsColumnFamilyName, &handle);
+ persist_stats_cf_handle_ = static_cast<ColumnFamilyHandleImpl*>(handle);
+ mutex_.Lock();
+ }
+ return s;
+}
+
+Status DBImpl::LogAndApplyForRecovery(const RecoveryContext& recovery_ctx) {
+ mutex_.AssertHeld();
+ assert(versions_->descriptor_log_ == nullptr);
+ Status s = versions_->LogAndApply(
+ recovery_ctx.cfds_, recovery_ctx.mutable_cf_opts_,
+ recovery_ctx.edit_lists_, &mutex_, directories_.GetDbDir());
+ if (s.ok() && !(recovery_ctx.files_to_delete_.empty())) {
+ mutex_.Unlock();
+ for (const auto& fname : recovery_ctx.files_to_delete_) {
+ s = env_->DeleteFile(fname);
+ if (!s.ok()) {
+ break;
+ }
+ }
+ mutex_.Lock();
+ }
+ return s;
+}
+
+void DBImpl::InvokeWalFilterIfNeededOnColumnFamilyToWalNumberMap() {
+#ifndef ROCKSDB_LITE
+ if (immutable_db_options_.wal_filter == nullptr) {
+ return;
+ }
+ assert(immutable_db_options_.wal_filter != nullptr);
+ WalFilter& wal_filter = *(immutable_db_options_.wal_filter);
+
+ std::map<std::string, uint32_t> cf_name_id_map;
+ std::map<uint32_t, uint64_t> cf_lognumber_map;
+ assert(versions_);
+ assert(versions_->GetColumnFamilySet());
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ assert(cfd);
+ cf_name_id_map.insert(std::make_pair(cfd->GetName(), cfd->GetID()));
+ cf_lognumber_map.insert(std::make_pair(cfd->GetID(), cfd->GetLogNumber()));
+ }
+
+ wal_filter.ColumnFamilyLogNumberMap(cf_lognumber_map, cf_name_id_map);
+#endif // !ROCKSDB_LITE
+}
+
+bool DBImpl::InvokeWalFilterIfNeededOnWalRecord(uint64_t wal_number,
+ const std::string& wal_fname,
+ log::Reader::Reporter& reporter,
+ Status& status,
+ bool& stop_replay,
+ WriteBatch& batch) {
+#ifndef ROCKSDB_LITE
+ if (immutable_db_options_.wal_filter == nullptr) {
+ return true;
+ }
+ assert(immutable_db_options_.wal_filter != nullptr);
+ WalFilter& wal_filter = *(immutable_db_options_.wal_filter);
+
+ WriteBatch new_batch;
+ bool batch_changed = false;
+
+ bool process_current_record = true;
+
+ WalFilter::WalProcessingOption wal_processing_option =
+ wal_filter.LogRecordFound(wal_number, wal_fname, batch, &new_batch,
+ &batch_changed);
+
+ switch (wal_processing_option) {
+ case WalFilter::WalProcessingOption::kContinueProcessing:
+ // do nothing, proceed normally
+ break;
+ case WalFilter::WalProcessingOption::kIgnoreCurrentRecord:
+ // skip current record
+ process_current_record = false;
+ break;
+ case WalFilter::WalProcessingOption::kStopReplay:
+ // skip current record and stop replay
+ process_current_record = false;
+ stop_replay = true;
+ break;
+ case WalFilter::WalProcessingOption::kCorruptedRecord: {
+ status = Status::Corruption("Corruption reported by Wal Filter ",
+ wal_filter.Name());
+ MaybeIgnoreError(&status);
+ if (!status.ok()) {
+ process_current_record = false;
+ reporter.Corruption(batch.GetDataSize(), status);
+ }
+ break;
+ }
+ default: {
+ // Logical error which should not happen. If RocksDB used exceptions, we
+ // would just `throw std::logic_error` here.
+ assert(false);
+ status = Status::NotSupported(
+ "Unknown WalProcessingOption returned by Wal Filter ",
+ wal_filter.Name());
+ MaybeIgnoreError(&status);
+ if (!status.ok()) {
+ // Ignore the error with current record processing.
+ stop_replay = true;
+ }
+ break;
+ }
+ }
+
+ if (!process_current_record) {
+ return false;
+ }
+
+ if (batch_changed) {
+ // Make sure that the count in the new batch is
+ // within the original count.
+ int new_count = WriteBatchInternal::Count(&new_batch);
+ int original_count = WriteBatchInternal::Count(&batch);
+ if (new_count > original_count) {
+ ROCKS_LOG_FATAL(
+ immutable_db_options_.info_log,
+ "Recovering log #%" PRIu64
+ " mode %d log filter %s returned "
+ "more records (%d) than original (%d) which is not allowed. "
+ "Aborting recovery.",
+ wal_number, static_cast<int>(immutable_db_options_.wal_recovery_mode),
+ wal_filter.Name(), new_count, original_count);
+ status = Status::NotSupported(
+ "More than original # of records "
+ "returned by Wal Filter ",
+ wal_filter.Name());
+ return false;
+ }
+ // Set the same sequence number in the new_batch
+ // as the original batch.
+ WriteBatchInternal::SetSequence(&new_batch,
+ WriteBatchInternal::Sequence(&batch));
+ batch = new_batch;
+ }
+ return true;
+#else // !ROCKSDB_LITE
+ (void)wal_number;
+ (void)wal_fname;
+ (void)reporter;
+ (void)status;
+ (void)stop_replay;
+ (void)batch;
+ return true;
+#endif // ROCKSDB_LITE
+}
+
+// REQUIRES: wal_numbers are sorted in ascending order
+Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& wal_numbers,
+ SequenceNumber* next_sequence, bool read_only,
+ bool* corrupted_wal_found,
+ RecoveryContext* recovery_ctx) {
+ struct LogReporter : public log::Reader::Reporter {
+ Env* env;
+ Logger* info_log;
+ const char* fname;
+ Status* status; // nullptr if immutable_db_options_.paranoid_checks==false
+ void Corruption(size_t bytes, const Status& s) override {
+ ROCKS_LOG_WARN(info_log, "%s%s: dropping %d bytes; %s",
+ (status == nullptr ? "(ignoring error) " : ""), fname,
+ static_cast<int>(bytes), s.ToString().c_str());
+ if (status != nullptr && status->ok()) {
+ *status = s;
+ }
+ }
+ };
+
+ mutex_.AssertHeld();
+ Status status;
+ std::unordered_map<int, VersionEdit> version_edits;
+ // no need to refcount because iteration is under mutex
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ VersionEdit edit;
+ edit.SetColumnFamily(cfd->GetID());
+ version_edits.insert({cfd->GetID(), edit});
+ }
+ int job_id = next_job_id_.fetch_add(1);
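+ // Emit a structured "recovery_started" event listing the WAL files that are
+ // about to be replayed.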
+ {
+ auto stream = event_logger_.Log();
+ stream << "job" << job_id << "event"
+ << "recovery_started";
+ stream << "wal_files";
+ stream.StartArray();
+ for (auto wal_number : wal_numbers) {
+ stream << wal_number;
+ }
+ stream.EndArray();
+ }
+
+ // No-op for immutable_db_options_.wal_filter == nullptr.
+ InvokeWalFilterIfNeededOnColumnFamilyToWalNumberMap();
+
+ bool stop_replay_by_wal_filter = false;
+ bool stop_replay_for_corruption = false;
+ bool flushed = false;
+ uint64_t corrupted_wal_number = kMaxSequenceNumber;
+ uint64_t min_wal_number = MinLogNumberToKeep();
+ if (!allow_2pc()) {
+ // In non-2pc mode, we skip WALs that do not back unflushed data.
+ min_wal_number =
+ std::max(min_wal_number, versions_->MinLogNumberWithUnflushedData());
+ }
+ for (auto wal_number : wal_numbers) {
+ if (wal_number < min_wal_number) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "Skipping log #%" PRIu64
+ " since it is older than min log to keep #%" PRIu64,
+ wal_number, min_wal_number);
+ continue;
+ }
+ // The previous incarnation may not have written any MANIFEST
+ // records after allocating this log number. So we manually
+ // update the file number allocation counter in VersionSet.
+ versions_->MarkFileNumberUsed(wal_number);
+ // Open the log file
+ std::string fname =
+ LogFileName(immutable_db_options_.GetWalDir(), wal_number);
+
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "Recovering log #%" PRIu64 " mode %d", wal_number,
+ static_cast<int>(immutable_db_options_.wal_recovery_mode));
+ auto logFileDropped = [this, &fname]() {
+ uint64_t bytes;
+ if (env_->GetFileSize(fname, &bytes).ok()) {
+ auto info_log = immutable_db_options_.info_log.get();
+ ROCKS_LOG_WARN(info_log, "%s: dropping %d bytes", fname.c_str(),
+ static_cast<int>(bytes));
+ }
+ };
+ if (stop_replay_by_wal_filter) {
+ logFileDropped();
+ continue;
+ }
+
+ std::unique_ptr<SequentialFileReader> file_reader;
+ {
+ std::unique_ptr<FSSequentialFile> file;
+ status = fs_->NewSequentialFile(
+ fname, fs_->OptimizeForLogRead(file_options_), &file, nullptr);
+ if (!status.ok()) {
+ MaybeIgnoreError(&status);
+ if (!status.ok()) {
+ return status;
+ } else {
+ // Failed on one log file, but that's ok. Try the next one.
+ continue;
+ }
+ }
+ file_reader.reset(new SequentialFileReader(
+ std::move(file), fname, immutable_db_options_.log_readahead_size,
+ io_tracer_));
+ }
+
+ // Create the log reader.
+ LogReporter reporter;
+ reporter.env = env_;
+ reporter.info_log = immutable_db_options_.info_log.get();
+ reporter.fname = fname.c_str();
+ if (!immutable_db_options_.paranoid_checks ||
+ immutable_db_options_.wal_recovery_mode ==
+ WALRecoveryMode::kSkipAnyCorruptedRecords) {
+ reporter.status = nullptr;
+ } else {
+ reporter.status = &status;
+ }
+ // We intentionally make log::Reader do checksumming even if
+ // paranoid_checks==false so that corruptions cause entire commits
+ // to be skipped instead of propagating bad information (like overly
+ // large sequence numbers).
+ log::Reader reader(immutable_db_options_.info_log, std::move(file_reader),
+ &reporter, true /*checksum*/, wal_number);
+
+ // Read all the records and add them to a memtable. Whether incomplete
+ // records at the tail end of the log are tolerated is governed by the
+ // WAL recovery mode passed to ReadRecord() below.
+ std::string scratch;
+ Slice record;
+
+ TEST_SYNC_POINT_CALLBACK("DBImpl::RecoverLogFiles:BeforeReadWal",
+ /*arg=*/nullptr);
+ uint64_t record_checksum;
+ while (!stop_replay_by_wal_filter &&
+ reader.ReadRecord(&record, &scratch,
+ immutable_db_options_.wal_recovery_mode,
+ &record_checksum) &&
+ status.ok()) {
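+ // WriteBatchInternal::kHeader is the minimum size of a valid encoded batch
+ // (the 8-byte sequence number plus the 4-byte count).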
+ if (record.size() < WriteBatchInternal::kHeader) {
+ reporter.Corruption(record.size(),
+ Status::Corruption("log record too small"));
+ continue;
+ }
+
+ // We create a new batch and initialize with a valid prot_info_ to store
+ // the data checksums
+ WriteBatch batch;
+
+ status = WriteBatchInternal::SetContents(&batch, record);
+ if (!status.ok()) {
+ return status;
+ }
+ TEST_SYNC_POINT_CALLBACK(
+ "DBImpl::RecoverLogFiles:BeforeUpdateProtectionInfo:batch", &batch);
+ TEST_SYNC_POINT_CALLBACK(
+ "DBImpl::RecoverLogFiles:BeforeUpdateProtectionInfo:checksum",
+ &record_checksum);
+ status = WriteBatchInternal::UpdateProtectionInfo(
+ &batch, 8 /* bytes_per_key */, &record_checksum);
+ if (!status.ok()) {
+ return status;
+ }
+
+ SequenceNumber sequence = WriteBatchInternal::Sequence(&batch);
+
+ if (immutable_db_options_.wal_recovery_mode ==
+ WALRecoveryMode::kPointInTimeRecovery) {
+ // In point-in-time recovery mode, if the sequence ids of the log files
+ // are consecutive, we continue recovery despite corruption. This could
+ // happen when we open and write to a corrupted DB, where the sequence id
+ // will start from the last sequence id we recovered.
+ if (sequence == *next_sequence) {
+ stop_replay_for_corruption = false;
+ }
+ if (stop_replay_for_corruption) {
+ logFileDropped();
+ break;
+ }
+ }
+
+ // For the default case of wal_filter == nullptr, always performs no-op
+ // and returns true.
+ if (!InvokeWalFilterIfNeededOnWalRecord(wal_number, fname, reporter,
+ status, stop_replay_by_wal_filter,
+ batch)) {
+ continue;
+ }
+
+ // If the column family was not found, it might mean that the WAL write
+ // batch references a column family that was dropped after the
+ // insert. We don't want to fail the whole write batch in that case --
+ // we just ignore the update.
+ // That's why we set ignore_missing_column_families to true.
+ bool has_valid_writes = false;
+ status = WriteBatchInternal::InsertInto(
+ &batch, column_family_memtables_.get(), &flush_scheduler_,
+ &trim_history_scheduler_, true, wal_number, this,
+ false /* concurrent_memtable_writes */, next_sequence,
+ &has_valid_writes, seq_per_batch_, batch_per_txn_);
+ MaybeIgnoreError(&status);
+ if (!status.ok()) {
+ // We are treating this as a failure while reading since we read valid
+ // blocks that do not form coherent data
+ reporter.Corruption(record.size(), status);
+ continue;
+ }
+
+ if (has_valid_writes && !read_only) {
+ // We can do this because this is called before the client has access to
+ // the DB and there is only a single thread operating on the DB.
+ ColumnFamilyData* cfd;
+
+ while ((cfd = flush_scheduler_.TakeNextColumnFamily()) != nullptr) {
+ cfd->UnrefAndTryDelete();
+ // If this asserts, it means that InsertInto failed in
+ // filtering updates to already-flushed column families
+ assert(cfd->GetLogNumber() <= wal_number);
+ auto iter = version_edits.find(cfd->GetID());
+ assert(iter != version_edits.end());
+ VersionEdit* edit = &iter->second;
+ status = WriteLevel0TableForRecovery(job_id, cfd, cfd->mem(), edit);
+ if (!status.ok()) {
+ // Reflect errors immediately so that conditions like full
+ // file-systems cause the DB::Open() to fail.
+ return status;
+ }
+ flushed = true;
+
+ cfd->CreateNewMemtable(*cfd->GetLatestMutableCFOptions(),
+ *next_sequence);
+ }
+ }
+ }
+
+ if (!status.ok()) {
+ if (status.IsNotSupported()) {
+ // We should not treat NotSupported as corruption. It is rather a clear
+ // sign that we are processing a WAL that is produced by an incompatible
+ // version of the code.
+ return status;
+ }
+ if (immutable_db_options_.wal_recovery_mode ==
+ WALRecoveryMode::kSkipAnyCorruptedRecords) {
+ // We should ignore all errors unconditionally
+ status = Status::OK();
+ } else if (immutable_db_options_.wal_recovery_mode ==
+ WALRecoveryMode::kPointInTimeRecovery) {
+ if (status.IsIOError()) {
+ ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+ "IOError during point-in-time reading log #%" PRIu64
+ " seq #%" PRIu64
+ ". %s. This likely means loss of synced WAL, "
+ "thus recovery fails.",
+ wal_number, *next_sequence,
+ status.ToString().c_str());
+ return status;
+ }
+ // We should ignore the error but not continue replaying
+ status = Status::OK();
+ stop_replay_for_corruption = true;
+ corrupted_wal_number = wal_number;
+ if (corrupted_wal_found != nullptr) {
+ *corrupted_wal_found = true;
+ }
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "Point in time recovered to log #%" PRIu64
+ " seq #%" PRIu64,
+ wal_number, *next_sequence);
+ } else {
+ assert(immutable_db_options_.wal_recovery_mode ==
+ WALRecoveryMode::kTolerateCorruptedTailRecords ||
+ immutable_db_options_.wal_recovery_mode ==
+ WALRecoveryMode::kAbsoluteConsistency);
+ return status;
+ }
+ }
+
+ flush_scheduler_.Clear();
+ trim_history_scheduler_.Clear();
+ auto last_sequence = *next_sequence - 1;
+ if ((*next_sequence != kMaxSequenceNumber) &&
+ (versions_->LastSequence() <= last_sequence)) {
+ versions_->SetLastAllocatedSequence(last_sequence);
+ versions_->SetLastPublishedSequence(last_sequence);
+ versions_->SetLastSequence(last_sequence);
+ }
+ }
+ // Compare the corrupted log number to every column family's current log
+ // number. Abort Open() if any column family's log number is greater than
+ // the corrupted log number, which means the CF contains data beyond the
+ // point of corruption. This could happen during PIT recovery when the WAL
+ // is corrupted and some (but not all) CFs are flushed.
+ // Exclude the PIT case where no log is dropped after the corruption point.
+ // This is to cover the case of empty WALs after a corrupted log, in which
+ // we don't reset stop_replay_for_corruption.
+ if (stop_replay_for_corruption == true &&
+ (immutable_db_options_.wal_recovery_mode ==
+ WALRecoveryMode::kPointInTimeRecovery ||
+ immutable_db_options_.wal_recovery_mode ==
+ WALRecoveryMode::kTolerateCorruptedTailRecords)) {
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ // One special case causes cfd->GetLogNumber() > corrupted_wal_number while
+ // the CF is still consistent: if a new column family is created during
+ // the flush and the WAL sync fails at the same time, the new CF points to
+ // the new WAL but the old WAL is corrupted. Since the new CF is empty, it
+ // is still consistent. We add the check of CF sst file size to avoid this
+ // false positive alert.
+
+ // Note that the check of (cfd->GetLiveSstFilesSize() > 0) may lead to
+ // ignoring a very rare inconsistency case caused by data cancellation:
+ // one CF is empty due to KV deletions, but those operations are in the
+ // WAL. If the WAL is corrupted, the state of this CF might not be
+ // consistent with the others. However, the consistency check will be
+ // bypassed because the CF is empty.
+ // TODO: a better and complete implementation is needed to ensure a strict
+ // consistency check in WAL recovery, including handling the tailing
+ // issues.
+ if (cfd->GetLogNumber() > corrupted_wal_number &&
+ cfd->GetLiveSstFilesSize() > 0) {
+ ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+ "Column family inconsistency: SST file contains data"
+ " beyond the point of corruption.");
+ return Status::Corruption("SST file is ahead of WALs in CF " +
+ cfd->GetName());
+ }
+ }
+ }
+
+ // True if there's any data in the WALs; if not, we can skip re-processing
+ // them later
+ bool data_seen = false;
+ if (!read_only) {
+ // no need to refcount since client still doesn't have access
+ // to the DB and cannot drop column families while we iterate
+ const WalNumber max_wal_number = wal_numbers.back();
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ auto iter = version_edits.find(cfd->GetID());
+ assert(iter != version_edits.end());
+ VersionEdit* edit = &iter->second;
+
+ if (cfd->GetLogNumber() > max_wal_number) {
+ // Column family cfd has already flushed the data
+ // from all wals. Memtable has to be empty because
+ // we filter the updates based on wal_number
+ // (in WriteBatchInternal::InsertInto)
+ assert(cfd->mem()->GetFirstSequenceNumber() == 0);
+ assert(edit->NumEntries() == 0);
+ continue;
+ }
+
+ TEST_SYNC_POINT_CALLBACK(
+ "DBImpl::RecoverLogFiles:BeforeFlushFinalMemtable", /*arg=*/nullptr);
+
+ // flush the final memtable (if non-empty)
+ if (cfd->mem()->GetFirstSequenceNumber() != 0) {
+ // If a flush happened in the middle of recovery (e.g. due to the memtable
+ // being full), we flush at the end. Otherwise we'd need to record
+ // where we were on the last flush, which would make the logic complicated.
+ if (flushed || !immutable_db_options_.avoid_flush_during_recovery) {
+ status = WriteLevel0TableForRecovery(job_id, cfd, cfd->mem(), edit);
+ if (!status.ok()) {
+ // Recovery failed
+ break;
+ }
+ flushed = true;
+
+ cfd->CreateNewMemtable(*cfd->GetLatestMutableCFOptions(),
+ versions_->LastSequence());
+ }
+ data_seen = true;
+ }
+
+ // Update the log number info in the version edit corresponding to this
+ // column family. Note that the version edits will be written to MANIFEST
+ // together later.
+ // Writing wal_number in the manifest means that any log file
+ // with a number strictly less than (wal_number + 1) is already
+ // recovered and should be ignored on next reincarnation.
+ // Since we already recovered max_wal_number, we want all wals
+ // with numbers `<= max_wal_number` (including this one) to be ignored.
+ if (flushed || cfd->mem()->GetFirstSequenceNumber() == 0) {
+ edit->SetLogNumber(max_wal_number + 1);
+ }
+ }
+ if (status.ok()) {
+ // We must mark the next log number as used, even though it's
+ // not actually used. That is because VersionSet assumes
+ // VersionSet::next_file_number_ is always strictly greater than any
+ // log number.
+ versions_->MarkFileNumberUsed(max_wal_number + 1);
+ assert(recovery_ctx != nullptr);
+
+ for (auto* cfd : *versions_->GetColumnFamilySet()) {
+ auto iter = version_edits.find(cfd->GetID());
+ assert(iter != version_edits.end());
+ recovery_ctx->UpdateVersionEdits(cfd, iter->second);
+ }
+
+ if (flushed) {
+ VersionEdit wal_deletion;
+ if (immutable_db_options_.track_and_verify_wals_in_manifest) {
+ wal_deletion.DeleteWalsBefore(max_wal_number + 1);
+ }
+ if (!allow_2pc()) {
+ // In non-2pc mode, flushing the memtables of the column families
+ // means we can advance min_log_number_to_keep.
+ wal_deletion.SetMinLogNumberToKeep(max_wal_number + 1);
+ }
+ assert(versions_->GetColumnFamilySet() != nullptr);
+ recovery_ctx->UpdateVersionEdits(
+ versions_->GetColumnFamilySet()->GetDefault(), wal_deletion);
+ }
+ }
+ }
+
+ if (status.ok()) {
+ if (data_seen && !flushed) {
+ status = RestoreAliveLogFiles(wal_numbers);
+ } else if (!wal_numbers.empty()) {
+ // If there's no data in the WAL, or we flushed all the data, still
+ // truncate the log file. If the process goes into a crash loop before
+ // the file is deleted, the preallocated space will never get freed.
+ const bool truncate = !read_only;
+ GetLogSizeAndMaybeTruncate(wal_numbers.back(), truncate, nullptr)
+ .PermitUncheckedError();
+ }
+ }
+
+ event_logger_.Log() << "job" << job_id << "event"
+ << "recovery_finished";
+
+ return status;
+}
+
+Status DBImpl::GetLogSizeAndMaybeTruncate(uint64_t wal_number, bool truncate,
+ LogFileNumberSize* log_ptr) {
+ LogFileNumberSize log(wal_number);
+ std::string fname =
+ LogFileName(immutable_db_options_.GetWalDir(), wal_number);
+ Status s;
+ // This gets the apparent size of the wals, not including preallocated space.
+ s = env_->GetFileSize(fname, &log.size);
+ TEST_SYNC_POINT_CALLBACK("DBImpl::GetLogSizeAndMaybeTruncate:0", /*arg=*/&s);
+ if (s.ok() && truncate) {
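+ // Reopen the WAL for append and truncate it to its current data size so
+ // any preallocated-but-unused space at the end is released.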
+ std::unique_ptr<FSWritableFile> last_log;
+ Status truncate_status = fs_->ReopenWritableFile(
+ fname,
+ fs_->OptimizeForLogWrite(
+ file_options_,
+ BuildDBOptions(immutable_db_options_, mutable_db_options_)),
+ &last_log, nullptr);
+ if (truncate_status.ok()) {
+ truncate_status = last_log->Truncate(log.size, IOOptions(), nullptr);
+ }
+ if (truncate_status.ok()) {
+ truncate_status = last_log->Close(IOOptions(), nullptr);
+ }
+ // Not a critical error if fail to truncate.
+ if (!truncate_status.ok() && !truncate_status.IsNotSupported()) {
+ ROCKS_LOG_WARN(immutable_db_options_.info_log,
+ "Failed to truncate log #%" PRIu64 ": %s", wal_number,
+ truncate_status.ToString().c_str());
+ }
+ }
+ if (log_ptr) {
+ *log_ptr = log;
+ }
+ return s;
+}
+
+Status DBImpl::RestoreAliveLogFiles(const std::vector<uint64_t>& wal_numbers) {
+ if (wal_numbers.empty()) {
+ return Status::OK();
+ }
+ Status s;
+ mutex_.AssertHeld();
+ assert(immutable_db_options_.avoid_flush_during_recovery);
+ // Mark these as alive so they'll be considered for deletion later by
+ // FindObsoleteFiles()
+ total_log_size_ = 0;
+ log_empty_ = false;
+ uint64_t min_wal_with_unflushed_data =
+ versions_->MinLogNumberWithUnflushedData();
+ for (auto wal_number : wal_numbers) {
+ if (!allow_2pc() && wal_number < min_wal_with_unflushed_data) {
+ // In non-2pc mode, the WAL files not backing unflushed data are not
+ // alive, thus should not be added to the alive_log_files_.
+ continue;
+ }
+ // We preallocate space for WALs, but after a crash and restart that
+ // preallocated space is not needed anymore. It is likely that only the
+ // last log has such preallocated space, so we only truncate the last log.
+ LogFileNumberSize log;
+ s = GetLogSizeAndMaybeTruncate(
+ wal_number, /*truncate=*/(wal_number == wal_numbers.back()), &log);
+ if (!s.ok()) {
+ break;
+ }
+ total_log_size_ += log.size;
+ alive_log_files_.push_back(log);
+ }
+ return s;
+}
+
+Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
+ MemTable* mem, VersionEdit* edit) {
+ mutex_.AssertHeld();
+ assert(cfd);
+ assert(cfd->imm());
+ // The immutable memtable list must be empty.
+ assert(std::numeric_limits<uint64_t>::max() ==
+ cfd->imm()->GetEarliestMemTableID());
+
+ const uint64_t start_micros = immutable_db_options_.clock->NowMicros();
+
+ FileMetaData meta;
+ std::vector<BlobFileAddition> blob_file_additions;
+
+ std::unique_ptr<std::list<uint64_t>::iterator> pending_outputs_inserted_elem(
+ new std::list<uint64_t>::iterator(
+ CaptureCurrentFileNumberInPendingOutputs()));
+ meta.fd = FileDescriptor(versions_->NewFileNumber(), 0, 0);
+ ReadOptions ro;
+ ro.total_order_seek = true;
+ Arena arena;
+ Status s;
+ TableProperties table_properties;
+ {
+ ScopedArenaIterator iter(mem->NewIterator(ro, &arena));
+ ROCKS_LOG_DEBUG(immutable_db_options_.info_log,
+ "[%s] [WriteLevel0TableForRecovery]"
+ " Level-0 table #%" PRIu64 ": started",
+ cfd->GetName().c_str(), meta.fd.GetNumber());
+
+ // Get the latest mutable cf options while the mutex is still locked
+ const MutableCFOptions mutable_cf_options =
+ *cfd->GetLatestMutableCFOptions();
+ bool paranoid_file_checks =
+ cfd->GetLatestMutableCFOptions()->paranoid_file_checks;
+
+ int64_t _current_time = 0;
+ immutable_db_options_.clock->GetCurrentTime(&_current_time)
+ .PermitUncheckedError(); // ignore error
+ const uint64_t current_time = static_cast<uint64_t>(_current_time);
+ meta.oldest_ancester_time = current_time;
+
+ {
+ auto write_hint = cfd->CalculateSSTWriteHint(0);
+ mutex_.Unlock();
+
+ SequenceNumber earliest_write_conflict_snapshot;
+ std::vector<SequenceNumber> snapshot_seqs =
+ snapshots_.GetAll(&earliest_write_conflict_snapshot);
+ auto snapshot_checker = snapshot_checker_.get();
+ if (use_custom_gc_ && snapshot_checker == nullptr) {
+ snapshot_checker = DisableGCSnapshotChecker::Instance();
+ }
+ std::vector<std::unique_ptr<FragmentedRangeTombstoneIterator>>
+ range_del_iters;
+ auto range_del_iter =
+ // This is called during recovery, where a live memtable is flushed
+ // directly. In this case, no fragmented tombstone list is cached in
+ // this memtable yet.
+ mem->NewRangeTombstoneIterator(ro, kMaxSequenceNumber,
+ false /* immutable_memtable */);
+ if (range_del_iter != nullptr) {
+ range_del_iters.emplace_back(range_del_iter);
+ }
+
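+ // Build a level-0 SST (and possibly blob files) directly from the memtable
+ // iterator; the resulting file metadata is recorded in `meta` and
+ // `blob_file_additions`, which are added to the version edit further below.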
+ IOStatus io_s;
+ TableBuilderOptions tboptions(
+ *cfd->ioptions(), mutable_cf_options, cfd->internal_comparator(),
+ cfd->int_tbl_prop_collector_factories(),
+ GetCompressionFlush(*cfd->ioptions(), mutable_cf_options),
+ mutable_cf_options.compression_opts, cfd->GetID(), cfd->GetName(),
+ 0 /* level */, false /* is_bottommost */,
+ TableFileCreationReason::kRecovery, 0 /* oldest_key_time */,
+ 0 /* file_creation_time */, db_id_, db_session_id_,
+ 0 /* target_file_size */, meta.fd.GetNumber());
+ SeqnoToTimeMapping empty_seqno_time_mapping;
+ s = BuildTable(
+ dbname_, versions_.get(), immutable_db_options_, tboptions,
+ file_options_for_compaction_, cfd->table_cache(), iter.get(),
+ std::move(range_del_iters), &meta, &blob_file_additions,
+ snapshot_seqs, earliest_write_conflict_snapshot, kMaxSequenceNumber,
+ snapshot_checker, paranoid_file_checks, cfd->internal_stats(), &io_s,
+ io_tracer_, BlobFileCreationReason::kRecovery,
+ empty_seqno_time_mapping, &event_logger_, job_id, Env::IO_HIGH,
+ nullptr /* table_properties */, write_hint,
+ nullptr /*full_history_ts_low*/, &blob_callback_);
+ LogFlush(immutable_db_options_.info_log);
+ ROCKS_LOG_DEBUG(immutable_db_options_.info_log,
+ "[%s] [WriteLevel0TableForRecovery]"
+ " Level-0 table #%" PRIu64 ": %" PRIu64 " bytes %s",
+ cfd->GetName().c_str(), meta.fd.GetNumber(),
+ meta.fd.GetFileSize(), s.ToString().c_str());
+ mutex_.Lock();
+
+ // TODO(AR) is this ok?
+ if (!io_s.ok() && s.ok()) {
+ s = io_s;
+ }
+ }
+ }
+ ReleaseFileNumberFromPendingOutputs(pending_outputs_inserted_elem);
+
+ // Note that if file_size is zero, the file has been deleted and
+ // should not be added to the manifest.
+ const bool has_output = meta.fd.GetFileSize() > 0;
+
+ constexpr int level = 0;
+
+ if (s.ok() && has_output) {
+ edit->AddFile(level, meta.fd.GetNumber(), meta.fd.GetPathId(),
+ meta.fd.GetFileSize(), meta.smallest, meta.largest,
+ meta.fd.smallest_seqno, meta.fd.largest_seqno,
+ meta.marked_for_compaction, meta.temperature,
+ meta.oldest_blob_file_number, meta.oldest_ancester_time,
+ meta.file_creation_time, meta.file_checksum,
+ meta.file_checksum_func_name, meta.unique_id);
+
+ for (const auto& blob : blob_file_additions) {
+ edit->AddBlobFile(blob);
+ }
+ }
+
+ InternalStats::CompactionStats stats(CompactionReason::kFlush, 1);
+ stats.micros = immutable_db_options_.clock->NowMicros() - start_micros;
+
+ if (has_output) {
+ stats.bytes_written = meta.fd.GetFileSize();
+ stats.num_output_files = 1;
+ }
+
+ const auto& blobs = edit->GetBlobFileAdditions();
+ for (const auto& blob : blobs) {
+ stats.bytes_written_blob += blob.GetTotalBlobBytes();
+ }
+
+ stats.num_output_files_blob = static_cast<int>(blobs.size());
+
+ cfd->internal_stats()->AddCompactionStats(level, Env::Priority::USER, stats);
+ cfd->internal_stats()->AddCFStats(
+ InternalStats::BYTES_FLUSHED,
+ stats.bytes_written + stats.bytes_written_blob);
+ RecordTick(stats_, COMPACT_WRITE_BYTES, meta.fd.GetFileSize());
+ return s;
+}
+
+Status DB::Open(const Options& options, const std::string& dbname, DB** dbptr) {
+ DBOptions db_options(options);
+ ColumnFamilyOptions cf_options(options);
+ std::vector<ColumnFamilyDescriptor> column_families;
+ column_families.push_back(
+ ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options));
+ if (db_options.persist_stats_to_disk) {
+ column_families.push_back(
+ ColumnFamilyDescriptor(kPersistentStatsColumnFamilyName, cf_options));
+ }
+ std::vector<ColumnFamilyHandle*> handles;
+ Status s = DB::Open(db_options, dbname, column_families, &handles, dbptr);
+ if (s.ok()) {
+ if (db_options.persist_stats_to_disk) {
+ assert(handles.size() == 2);
+ } else {
+ assert(handles.size() == 1);
+ }
+ // We can delete the handle since DBImpl always holds a reference to the
+ // default column family.
+ if (db_options.persist_stats_to_disk && handles[1] != nullptr) {
+ delete handles[1];
+ }
+ delete handles[0];
+ }
+ return s;
+}
+
+Status DB::Open(const DBOptions& db_options, const std::string& dbname,
+ const std::vector<ColumnFamilyDescriptor>& column_families,
+ std::vector<ColumnFamilyHandle*>* handles, DB** dbptr) {
+ const bool kSeqPerBatch = true;
+ const bool kBatchPerTxn = true;
+ return DBImpl::Open(db_options, dbname, column_families, handles, dbptr,
+ !kSeqPerBatch, kBatchPerTxn);
+}
+
+// TODO: Implement the trimming in flush code path.
+// TODO: Perform trimming before inserting into memtable during recovery.
+// TODO: Pick files with max_timestamp > trim_ts by each file's timestamp meta
+// info, and handle only these files to reduce io.
+Status DB::OpenAndTrimHistory(
+ const DBOptions& db_options, const std::string& dbname,
+ const std::vector<ColumnFamilyDescriptor>& column_families,
+ std::vector<ColumnFamilyHandle*>* handles, DB** dbptr,
+ std::string trim_ts) {
+ assert(dbptr != nullptr);
+ assert(handles != nullptr);
+ auto validate_options = [&db_options] {
+ if (db_options.avoid_flush_during_recovery) {
+ return Status::InvalidArgument(
+ "avoid_flush_during_recovery incompatible with "
+ "OpenAndTrimHistory");
+ }
+ return Status::OK();
+ };
+ auto s = validate_options();
+ if (!s.ok()) {
+ return s;
+ }
+
+ DB* db = nullptr;
+ s = DB::Open(db_options, dbname, column_families, handles, &db);
+ if (!s.ok()) {
+ return s;
+ }
+ assert(db);
+ CompactRangeOptions options;
+ options.bottommost_level_compaction =
+ BottommostLevelCompaction::kForceOptimized;
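+ // The compaction below is what performs the trimming: column families with
+ // user-defined timestamps are compacted down to the bottommost level with
+ // trim_ts so that data newer than trim_ts is removed.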
+ auto db_impl = static_cast_with_check<DBImpl>(db);
+ for (auto handle : *handles) {
+ assert(handle != nullptr);
+ auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(handle);
+ auto cfd = cfh->cfd();
+ assert(cfd != nullptr);
+ // Only compact column families with timestamp enabled
+ if (cfd->user_comparator() != nullptr &&
+ cfd->user_comparator()->timestamp_size() > 0) {
+ s = db_impl->CompactRangeInternal(options, handle, nullptr, nullptr,
+ trim_ts);
+ if (!s.ok()) {
+ break;
+ }
+ }
+ }
+ auto clean_op = [&handles, &db] {
+ for (auto handle : *handles) {
+ auto temp_s = db->DestroyColumnFamilyHandle(handle);
+ assert(temp_s.ok());
+ }
+ handles->clear();
+ delete db;
+ };
+ if (!s.ok()) {
+ clean_op();
+ return s;
+ }
+
+ *dbptr = db;
+ return s;
+}
+
+IOStatus DBImpl::CreateWAL(uint64_t log_file_num, uint64_t recycle_log_number,
+ size_t preallocate_block_size,
+ log::Writer** new_log) {
+ IOStatus io_s;
+ std::unique_ptr<FSWritableFile> lfile;
+
+ DBOptions db_options =
+ BuildDBOptions(immutable_db_options_, mutable_db_options_);
+ FileOptions opt_file_options =
+ fs_->OptimizeForLogWrite(file_options_, db_options);
+ std::string wal_dir = immutable_db_options_.GetWalDir();
+ std::string log_fname = LogFileName(wal_dir, log_file_num);
+
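+ // Either reuse a WAL from the recycle list (renaming it in place) or create
+ // a brand new file for the next log number.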
+ if (recycle_log_number) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "reusing log %" PRIu64 " from recycle list\n",
+ recycle_log_number);
+ std::string old_log_fname = LogFileName(wal_dir, recycle_log_number);
+ TEST_SYNC_POINT("DBImpl::CreateWAL:BeforeReuseWritableFile1");
+ TEST_SYNC_POINT("DBImpl::CreateWAL:BeforeReuseWritableFile2");
+ io_s = fs_->ReuseWritableFile(log_fname, old_log_fname, opt_file_options,
+ &lfile, /*dbg=*/nullptr);
+ } else {
+ io_s = NewWritableFile(fs_.get(), log_fname, &lfile, opt_file_options);
+ }
+
+ if (io_s.ok()) {
+ lfile->SetWriteLifeTimeHint(CalculateWALWriteHint());
+ lfile->SetPreallocationBlockSize(preallocate_block_size);
+
+ const auto& listeners = immutable_db_options_.listeners;
+ FileTypeSet tmp_set = immutable_db_options_.checksum_handoff_file_types;
+ std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
+ std::move(lfile), log_fname, opt_file_options,
+ immutable_db_options_.clock, io_tracer_, nullptr /* stats */, listeners,
+ nullptr, tmp_set.Contains(FileType::kWalFile),
+ tmp_set.Contains(FileType::kWalFile)));
+ *new_log = new log::Writer(std::move(file_writer), log_file_num,
+ immutable_db_options_.recycle_log_file_num > 0,
+ immutable_db_options_.manual_wal_flush,
+ immutable_db_options_.wal_compression);
+ io_s = (*new_log)->AddCompressionTypeRecord();
+ }
+ return io_s;
+}
+
+Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname,
+ const std::vector<ColumnFamilyDescriptor>& column_families,
+ std::vector<ColumnFamilyHandle*>* handles, DB** dbptr,
+ const bool seq_per_batch, const bool batch_per_txn) {
+ Status s = ValidateOptionsByTable(db_options, column_families);
+ if (!s.ok()) {
+ return s;
+ }
+
+ s = ValidateOptions(db_options, column_families);
+ if (!s.ok()) {
+ return s;
+ }
+
+ *dbptr = nullptr;
+ assert(handles);
+ handles->clear();
+
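+ // The WAL preallocation size is derived from the largest write_buffer_size
+ // across the column families (passed to GetWalPreallocateBlockSize below).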
+ size_t max_write_buffer_size = 0;
+ for (auto cf : column_families) {
+ max_write_buffer_size =
+ std::max(max_write_buffer_size, cf.options.write_buffer_size);
+ }
+
+ DBImpl* impl = new DBImpl(db_options, dbname, seq_per_batch, batch_per_txn);
+ if (!impl->immutable_db_options_.info_log) {
+ s = impl->init_logger_creation_s_;
+ delete impl;
+ return s;
+ } else {
+ assert(impl->init_logger_creation_s_.ok());
+ }
+ s = impl->env_->CreateDirIfMissing(impl->immutable_db_options_.GetWalDir());
+ if (s.ok()) {
+ std::vector<std::string> paths;
+ for (auto& db_path : impl->immutable_db_options_.db_paths) {
+ paths.emplace_back(db_path.path);
+ }
+ for (auto& cf : column_families) {
+ for (auto& cf_path : cf.options.cf_paths) {
+ paths.emplace_back(cf_path.path);
+ }
+ }
+ for (auto& path : paths) {
+ s = impl->env_->CreateDirIfMissing(path);
+ if (!s.ok()) {
+ break;
+ }
+ }
+
+ // For recovery from NoSpace() error, we can only handle
+ // the case where the database is stored in a single path
+ if (paths.size() <= 1) {
+ impl->error_handler_.EnableAutoRecovery();
+ }
+ }
+ if (s.ok()) {
+ s = impl->CreateArchivalDirectory();
+ }
+ if (!s.ok()) {
+ delete impl;
+ return s;
+ }
+
+ impl->wal_in_db_path_ = impl->immutable_db_options_.IsWalDirSameAsDBPath();
+ RecoveryContext recovery_ctx;
+ impl->mutex_.Lock();
+
+ // Handles create_if_missing, error_if_exists
+ uint64_t recovered_seq(kMaxSequenceNumber);
+ s = impl->Recover(column_families, false, false, false, &recovered_seq,
+ &recovery_ctx);
+ if (s.ok()) {
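+ // Recovery succeeded: allocate a new file number and create the WAL
+ // that will receive writes made after this open.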
+ uint64_t new_log_number = impl->versions_->NewFileNumber();
+ log::Writer* new_log = nullptr;
+ const size_t preallocate_block_size =
+ impl->GetWalPreallocateBlockSize(max_write_buffer_size);
+ s = impl->CreateWAL(new_log_number, 0 /*recycle_log_number*/,
+ preallocate_block_size, &new_log);
+ if (s.ok()) {
+ InstrumentedMutexLock wl(&impl->log_write_mutex_);
+ impl->logfile_number_ = new_log_number;
+ assert(new_log != nullptr);
+ assert(impl->logs_.empty());
+ impl->logs_.emplace_back(new_log_number, new_log);
+ }
+
+ if (s.ok()) {
+ impl->alive_log_files_.push_back(
+ DBImpl::LogFileNumberSize(impl->logfile_number_));
+ // In WritePrepared mode there could be gaps in the sequence numbers. This
+ // breaks the trick we use in kPointInTimeRecovery, which assumes that the
+ // first seq in the log right after the corrupted log is one larger than
+ // the last seq we read from the WALs. To keep this trick working, we add
+ // a dummy entry with the expected sequence to the first log right after
+ // recovery. In the non-WritePrepared case the new log after recovery
+ // could also be empty, and thus be missing the consecutive-seq hint that
+ // distinguishes middle-of-log corruption from a corrupted log left over
+ // after recovery. That case is also addressed by the dummy write.
+ if (recovered_seq != kMaxSequenceNumber) {
+ WriteBatch empty_batch;
+ WriteBatchInternal::SetSequence(&empty_batch, recovered_seq);
+ WriteOptions write_options;
+ uint64_t log_used, log_size;
+ log::Writer* log_writer = impl->logs_.back().writer;
+ LogFileNumberSize& log_file_number_size = impl->alive_log_files_.back();
+
+ assert(log_writer->get_log_number() == log_file_number_size.number);
+ impl->mutex_.AssertHeld();
+ s = impl->WriteToWAL(empty_batch, log_writer, &log_used, &log_size,
+ Env::IO_TOTAL, log_file_number_size);
+ if (s.ok()) {
+ // Need to fsync, otherwise it might get lost after a power reset.
+ s = impl->FlushWAL(false);
+ TEST_SYNC_POINT_CALLBACK("DBImpl::Open::BeforeSyncWAL", /*arg=*/&s);
+ if (s.ok()) {
+ s = log_writer->file()->Sync(impl->immutable_db_options_.use_fsync);
+ }
+ }
+ }
+ }
+ }
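+ // Persist the version edits accumulated during recovery to the MANIFEST.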
+ if (s.ok()) {
+ s = impl->LogAndApplyForRecovery(recovery_ctx);
+ }
+
+ if (s.ok() && impl->immutable_db_options_.persist_stats_to_disk) {
+ impl->mutex_.AssertHeld();
+ s = impl->InitPersistStatsColumnFamily();
+ }
+
+ if (s.ok()) {
+ // set column family handles
+ for (auto cf : column_families) {
+ auto cfd =
+ impl->versions_->GetColumnFamilySet()->GetColumnFamily(cf.name);
+ if (cfd != nullptr) {
+ handles->push_back(
+ new ColumnFamilyHandleImpl(cfd, impl, &impl->mutex_));
+ impl->NewThreadStatusCfInfo(cfd);
+ } else {
+ if (db_options.create_missing_column_families) {
+ // missing column family, create it
+ ColumnFamilyHandle* handle = nullptr;
+ impl->mutex_.Unlock();
+ s = impl->CreateColumnFamily(cf.options, cf.name, &handle);
+ impl->mutex_.Lock();
+ if (s.ok()) {
+ handles->push_back(handle);
+ } else {
+ break;
+ }
+ } else {
+ s = Status::InvalidArgument("Column family not found", cf.name);
+ break;
+ }
+ }
+ }
+ }
+
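+ // Install a super version for each column family so subsequent reads see
+ // the recovered state, and schedule any flush/compaction work it implies.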
+ if (s.ok()) {
+ SuperVersionContext sv_context(/* create_superversion */ true);
+ for (auto cfd : *impl->versions_->GetColumnFamilySet()) {
+ impl->InstallSuperVersionAndScheduleWork(
+ cfd, &sv_context, *cfd->GetLatestMutableCFOptions());
+ }
+ sv_context.Clean();
+ }
+
+ if (s.ok() && impl->immutable_db_options_.persist_stats_to_disk) {
+ // try to read format version
+ s = impl->PersistentStatsProcessFormatVersion();
+ }
+
+ if (s.ok()) {
+ for (auto cfd : *impl->versions_->GetColumnFamilySet()) {
+ if (!cfd->mem()->IsSnapshotSupported()) {
+ impl->is_snapshot_supported_ = false;
+ }
+ if (cfd->ioptions()->merge_operator != nullptr &&
+ !cfd->mem()->IsMergeOperatorSupported()) {
+ s = Status::InvalidArgument(
+ "The memtable of column family %s does not support merge operator, "
+ "but its options.merge_operator is non-null",
+ cfd->GetName().c_str());
+ }
+ if (!s.ok()) {
+ break;
+ }
+ }
+ }
+ TEST_SYNC_POINT("DBImpl::Open:Opened");
+ Status persist_options_status;
+ if (s.ok()) {
+ // Persist RocksDB Options before scheduling the compaction.
+ // WriteOptionsFile() will release and re-acquire the mutex internally.
+ persist_options_status = impl->WriteOptionsFile(
+ false /*need_mutex_lock*/, false /*need_enter_write_thread*/);
+
+ *dbptr = impl;
+ impl->opened_successfully_ = true;
+ impl->DeleteObsoleteFiles();
+ TEST_SYNC_POINT("DBImpl::Open:AfterDeleteFiles");
+ impl->MaybeScheduleFlushOrCompaction();
+ } else {
+ persist_options_status.PermitUncheckedError();
+ }
+ impl->mutex_.Unlock();
+
+#ifndef ROCKSDB_LITE
+ auto sfm = static_cast<SstFileManagerImpl*>(
+ impl->immutable_db_options_.sst_file_manager.get());
+ if (s.ok() && sfm) {
+ // Set Statistics ptr for SstFileManager to dump the stats of
+ // DeleteScheduler.
+ sfm->SetStatisticsPtr(impl->immutable_db_options_.statistics);
+ ROCKS_LOG_INFO(impl->immutable_db_options_.info_log,
+ "SstFileManager instance %p", sfm);
+
+ // Notify SstFileManager about all sst files that already exist in
+ // db_paths[0] and cf_paths[0] when the DB is opened.
+
+ // SstFileManagerImpl needs to know the sizes of the files. For files whose
+ // size we already know (sst files that appear in the manifest -- typically
+ // the vast majority of all files), we'll pass the size to SstFileManager.
+ // For all other files, SstFileManager will query the size from the
+ // filesystem.
+
+ std::vector<ColumnFamilyMetaData> metadata;
+ impl->GetAllColumnFamilyMetaData(&metadata);
+
+ std::unordered_map<std::string, uint64_t> known_file_sizes;
+ for (const auto& md : metadata) {
+ for (const auto& lmd : md.levels) {
+ for (const auto& fmd : lmd.files) {
+ known_file_sizes[fmd.relative_filename] = fmd.size;
+ }
+ }
+ for (const auto& bmd : md.blob_files) {
+ std::string name = bmd.blob_file_name;
+ // The BlobMetaData.blob_file_name may start with "/".
+ if (!name.empty() && name[0] == '/') {
+ name = name.substr(1);
+ }
+ known_file_sizes[name] = bmd.blob_file_size;
+ }
+ }
+
+ std::vector<std::string> paths;
+ paths.emplace_back(impl->immutable_db_options_.db_paths[0].path);
+ for (auto& cf : column_families) {
+ if (!cf.options.cf_paths.empty()) {
+ paths.emplace_back(cf.options.cf_paths[0].path);
+ }
+ }
+ // Remove duplicate paths.
+ std::sort(paths.begin(), paths.end());
+ paths.erase(std::unique(paths.begin(), paths.end()), paths.end());
+ IOOptions io_opts;
+ io_opts.do_not_recurse = true;
+ for (auto& path : paths) {
+ std::vector<std::string> existing_files;
+ impl->immutable_db_options_.fs
+ ->GetChildren(path, io_opts, &existing_files,
+ /*IODebugContext*=*/nullptr)
+ .PermitUncheckedError(); // TODO: What to do on error?
+ for (auto& file_name : existing_files) {
+ uint64_t file_number;
+ FileType file_type;
+ std::string file_path = path + "/" + file_name;
+ if (ParseFileName(file_name, &file_number, &file_type) &&
+ (file_type == kTableFile || file_type == kBlobFile)) {
+ // TODO: Check for errors from OnAddFile?
+ if (known_file_sizes.count(file_name)) {
+ // We're assuming that each sst file name exists in at most one of
+ // the paths.
+ sfm->OnAddFile(file_path, known_file_sizes.at(file_name))
+ .PermitUncheckedError();
+ } else {
+ sfm->OnAddFile(file_path).PermitUncheckedError();
+ }
+ }
+ }
+ }
+
+ // Reserve some disk buffer space. This is a heuristic: when we run out
+ // of disk space, it ensures that there is at least write_buffer_size
+ // worth of free space before we resume DB writes. In low-disk-space
+ // conditions, we want to avoid a lot of small L0 files due to frequent
+ // WAL write failures and the resulting forced flushes.
+ sfm->ReserveDiskBuffer(max_write_buffer_size,
+ impl->immutable_db_options_.db_paths[0].path);
+ }
+
+#endif // !ROCKSDB_LITE
+
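+ // Final steps: flush/sync any WAL data buffered during open, surface a
+ // failure to persist the options file, and start the periodic task
+ // scheduler and seqno-time worker.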
+ if (s.ok()) {
+ ROCKS_LOG_HEADER(impl->immutable_db_options_.info_log, "DB pointer %p",
+ impl);
+ LogFlush(impl->immutable_db_options_.info_log);
+ if (!impl->WALBufferIsEmpty()) {
+ s = impl->FlushWAL(false);
+ if (s.ok()) {
+ // Sync is needed; otherwise buffered WAL data might get lost after a
+ // power reset.
+ log::Writer* log_writer = impl->logs_.back().writer;
+ s = log_writer->file()->Sync(impl->immutable_db_options_.use_fsync);
+ }
+ }
+ if (s.ok() && !persist_options_status.ok()) {
+ s = Status::IOError(
+ "DB::Open() failed --- Unable to persist Options file",
+ persist_options_status.ToString());
+ }
+ }
+ if (!s.ok()) {
+ ROCKS_LOG_WARN(impl->immutable_db_options_.info_log,
+ "DB::Open() failed: %s", s.ToString().c_str());
+ }
+ if (s.ok()) {
+ s = impl->StartPeriodicTaskScheduler();
+ }
+
+ if (s.ok()) {
+ s = impl->RegisterRecordSeqnoTimeWorker();
+ }
+ if (!s.ok()) {
+ for (auto* h : *handles) {
+ delete h;
+ }
+ handles->clear();
+ delete impl;
+ *dbptr = nullptr;
+ }
+ return s;
+}
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/db_impl/db_impl_readonly.cc b/src/rocksdb/db/db_impl/db_impl_readonly.cc
new file mode 100644
index 000000000..0f10baf24
--- /dev/null
+++ b/src/rocksdb/db/db_impl/db_impl_readonly.cc
@@ -0,0 +1,341 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/db_impl/db_impl_readonly.h"
+
+#include "db/arena_wrapped_db_iter.h"
+#include "db/db_impl/compacted_db_impl.h"
+#include "db/db_impl/db_impl.h"
+#include "db/db_iter.h"
+#include "db/merge_context.h"
+#include "logging/logging.h"
+#include "monitoring/perf_context_imp.h"
+#include "util/cast_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#ifndef ROCKSDB_LITE
+
+DBImplReadOnly::DBImplReadOnly(const DBOptions& db_options,
+ const std::string& dbname)
+ : DBImpl(db_options, dbname, /*seq_per_batch*/ false,
+ /*batch_per_txn*/ true, /*read_only*/ true) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "Opening the db in read only mode");
+ LogFlush(immutable_db_options_.info_log);
+}
+
+DBImplReadOnly::~DBImplReadOnly() {}
+
+// Implementations of the DB interface
+Status DBImplReadOnly::Get(const ReadOptions& read_options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ PinnableSlice* pinnable_val) {
+ return Get(read_options, column_family, key, pinnable_val,
+ /*timestamp*/ nullptr);
+}
+
+Status DBImplReadOnly::Get(const ReadOptions& read_options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ PinnableSlice* pinnable_val,
+ std::string* timestamp) {
+ assert(pinnable_val != nullptr);
+ // TODO: stopwatch DB_GET needed?, perf timer needed?
+ PERF_TIMER_GUARD(get_snapshot_time);
+
+ assert(column_family);
+ if (read_options.timestamp) {
+ const Status s = FailIfTsMismatchCf(
+ column_family, *(read_options.timestamp), /*ts_for_read=*/true);
+ if (!s.ok()) {
+ return s;
+ }
+ } else {
+ const Status s = FailIfCfHasTs(column_family);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ // Clear the timestamp for the returned result so that we can distinguish
+ // between a tombstone and a key that has never been written
+ if (timestamp) {
+ timestamp->clear();
+ }
+
+ const Comparator* ucmp = column_family->GetComparator();
+ assert(ucmp);
+ std::string* ts = ucmp->timestamp_size() > 0 ? timestamp : nullptr;
+
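+ // Serve the read at the last sequence number known to this read-only
+ // instance; there are no concurrent writes to coordinate with.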
+ Status s;
+ SequenceNumber snapshot = versions_->LastSequence();
+ GetWithTimestampReadCallback read_cb(snapshot);
+ auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+ auto cfd = cfh->cfd();
+ if (tracer_) {
+ InstrumentedMutexLock lock(&trace_mutex_);
+ if (tracer_) {
+ tracer_->Get(column_family, key);
+ }
+ }
+ SuperVersion* super_version = cfd->GetSuperVersion();
+ MergeContext merge_context;
+ SequenceNumber max_covering_tombstone_seq = 0;
+ LookupKey lkey(key, snapshot, read_options.timestamp);
+ PERF_TIMER_STOP(get_snapshot_time);
+ if (super_version->mem->Get(lkey, pinnable_val->GetSelf(),
+ /*columns=*/nullptr, ts, &s, &merge_context,
+ &max_covering_tombstone_seq, read_options,
+ false /* immutable_memtable */, &read_cb)) {
+ pinnable_val->PinSelf();
+ RecordTick(stats_, MEMTABLE_HIT);
+ } else {
+ PERF_TIMER_GUARD(get_from_output_files_time);
+ PinnedIteratorsManager pinned_iters_mgr;
+ super_version->current->Get(
+ read_options, lkey, pinnable_val, /*columns=*/nullptr, ts, &s,
+ &merge_context, &max_covering_tombstone_seq, &pinned_iters_mgr,
+ /*value_found*/ nullptr,
+ /*key_exists*/ nullptr, /*seq*/ nullptr, &read_cb,
+ /*is_blob*/ nullptr,
+ /*do_merge*/ true);
+ RecordTick(stats_, MEMTABLE_MISS);
+ }
+ RecordTick(stats_, NUMBER_KEYS_READ);
+ size_t size = pinnable_val->size();
+ RecordTick(stats_, BYTES_READ, size);
+ RecordInHistogram(stats_, BYTES_PER_READ, size);
+ PERF_COUNTER_ADD(get_read_bytes, size);
+ return s;
+}
+
+Iterator* DBImplReadOnly::NewIterator(const ReadOptions& read_options,
+ ColumnFamilyHandle* column_family) {
+ assert(column_family);
+ if (read_options.timestamp) {
+ const Status s = FailIfTsMismatchCf(
+ column_family, *(read_options.timestamp), /*ts_for_read=*/true);
+ if (!s.ok()) {
+ return NewErrorIterator(s);
+ }
+ } else {
+ const Status s = FailIfCfHasTs(column_family);
+ if (!s.ok()) {
+ return NewErrorIterator(s);
+ }
+ }
+ auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+ auto cfd = cfh->cfd();
+ SuperVersion* super_version = cfd->GetSuperVersion()->Ref();
+ SequenceNumber latest_snapshot = versions_->LastSequence();
+ SequenceNumber read_seq =
+ read_options.snapshot != nullptr
+ ? reinterpret_cast<const SnapshotImpl*>(read_options.snapshot)
+ ->number_
+ : latest_snapshot;
+ ReadCallback* read_callback = nullptr; // No read callback provided.
+ auto db_iter = NewArenaWrappedDbIterator(
+ env_, read_options, *cfd->ioptions(), super_version->mutable_cf_options,
+ super_version->current, read_seq,
+ super_version->mutable_cf_options.max_sequential_skip_in_iterations,
+ super_version->version_number, read_callback);
+ auto internal_iter = NewInternalIterator(
+ db_iter->GetReadOptions(), cfd, super_version, db_iter->GetArena(),
+ read_seq, /* allow_unprepared_value */ true, db_iter);
+ db_iter->SetIterUnderDBIter(internal_iter);
+ return db_iter;
+}
+
+Status DBImplReadOnly::NewIterators(
+ const ReadOptions& read_options,
+ const std::vector<ColumnFamilyHandle*>& column_families,
+ std::vector<Iterator*>* iterators) {
+ if (read_options.timestamp) {
+ for (auto* cf : column_families) {
+ assert(cf);
+ const Status s = FailIfTsMismatchCf(cf, *(read_options.timestamp),
+ /*ts_for_read=*/true);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+ } else {
+ for (auto* cf : column_families) {
+ assert(cf);
+ const Status s = FailIfCfHasTs(cf);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+ }
+
+ ReadCallback* read_callback = nullptr; // No read callback provided.
+ if (iterators == nullptr) {
+ return Status::InvalidArgument("iterators not allowed to be nullptr");
+ }
+ iterators->clear();
+ iterators->reserve(column_families.size());
+ SequenceNumber latest_snapshot = versions_->LastSequence();
+ SequenceNumber read_seq =
+ read_options.snapshot != nullptr
+ ? reinterpret_cast<const SnapshotImpl*>(read_options.snapshot)
+ ->number_
+ : latest_snapshot;
+
+ for (auto cfh : column_families) {
+ auto* cfd = static_cast_with_check<ColumnFamilyHandleImpl>(cfh)->cfd();
+ auto* sv = cfd->GetSuperVersion()->Ref();
+ auto* db_iter = NewArenaWrappedDbIterator(
+ env_, read_options, *cfd->ioptions(), sv->mutable_cf_options,
+ sv->current, read_seq,
+ sv->mutable_cf_options.max_sequential_skip_in_iterations,
+ sv->version_number, read_callback);
+ auto* internal_iter = NewInternalIterator(
+ db_iter->GetReadOptions(), cfd, sv, db_iter->GetArena(), read_seq,
+ /* allow_unprepared_value */ true, db_iter);
+ db_iter->SetIterUnderDBIter(internal_iter);
+ iterators->push_back(db_iter);
+ }
+
+ return Status::OK();
+}
+
+namespace {
+// Return OK if dbname exists in the file system, or create it if
+// create_if_missing is set
+Status OpenForReadOnlyCheckExistence(const DBOptions& db_options,
+ const std::string& dbname) {
+ Status s;
+ if (!db_options.create_if_missing) {
+ // Attempt to read "CURRENT" file
+ const std::shared_ptr<FileSystem>& fs = db_options.env->GetFileSystem();
+ std::string manifest_path;
+ uint64_t manifest_file_number;
+ s = VersionSet::GetCurrentManifestPath(dbname, fs.get(), &manifest_path,
+ &manifest_file_number);
+ } else {
+ // Historic behavior that doesn't necessarily make sense
+ s = db_options.env->CreateDirIfMissing(dbname);
+ }
+ return s;
+}
+} // namespace
+
+Status DB::OpenForReadOnly(const Options& options, const std::string& dbname,
+ DB** dbptr, bool /*error_if_wal_file_exists*/) {
+ Status s = OpenForReadOnlyCheckExistence(options, dbname);
+ if (!s.ok()) {
+ return s;
+ }
+
+ *dbptr = nullptr;
+
+ // Try to first open DB as fully compacted DB
+ s = CompactedDBImpl::Open(options, dbname, dbptr);
+ if (s.ok()) {
+ return s;
+ }
+
+ DBOptions db_options(options);
+ ColumnFamilyOptions cf_options(options);
+ std::vector<ColumnFamilyDescriptor> column_families;
+ column_families.push_back(
+ ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options));
+ std::vector<ColumnFamilyHandle*> handles;
+
+ s = DBImplReadOnly::OpenForReadOnlyWithoutCheck(
+ db_options, dbname, column_families, &handles, dbptr);
+ if (s.ok()) {
+ assert(handles.size() == 1);
+ // We can delete the handle since DBImpl always holds a reference to
+ // the default column family
+ delete handles[0];
+ }
+ return s;
+}
+
+Status DB::OpenForReadOnly(
+ const DBOptions& db_options, const std::string& dbname,
+ const std::vector<ColumnFamilyDescriptor>& column_families,
+ std::vector<ColumnFamilyHandle*>* handles, DB** dbptr,
+ bool error_if_wal_file_exists) {
+ // If dbname does not exist in the file system, should not do anything
+ Status s = OpenForReadOnlyCheckExistence(db_options, dbname);
+ if (!s.ok()) {
+ return s;
+ }
+
+ return DBImplReadOnly::OpenForReadOnlyWithoutCheck(
+ db_options, dbname, column_families, handles, dbptr,
+ error_if_wal_file_exists);
+}
+
+Status DBImplReadOnly::OpenForReadOnlyWithoutCheck(
+ const DBOptions& db_options, const std::string& dbname,
+ const std::vector<ColumnFamilyDescriptor>& column_families,
+ std::vector<ColumnFamilyHandle*>* handles, DB** dbptr,
+ bool error_if_wal_file_exists) {
+ *dbptr = nullptr;
+ handles->clear();
+
+ SuperVersionContext sv_context(/* create_superversion */ true);
+ DBImplReadOnly* impl = new DBImplReadOnly(db_options, dbname);
+ impl->mutex_.Lock();
+ Status s = impl->Recover(column_families, true /* read only */,
+ error_if_wal_file_exists);
+ if (s.ok()) {
+ // set column family handles
+ for (auto cf : column_families) {
+ auto cfd =
+ impl->versions_->GetColumnFamilySet()->GetColumnFamily(cf.name);
+ if (cfd == nullptr) {
+ s = Status::InvalidArgument("Column family not found", cf.name);
+ break;
+ }
+ handles->push_back(new ColumnFamilyHandleImpl(cfd, impl, &impl->mutex_));
+ }
+ }
+ if (s.ok()) {
+ for (auto cfd : *impl->versions_->GetColumnFamilySet()) {
+ sv_context.NewSuperVersion();
+ cfd->InstallSuperVersion(&sv_context, &impl->mutex_);
+ }
+ }
+ impl->mutex_.Unlock();
+ sv_context.Clean();
+ if (s.ok()) {
+ *dbptr = impl;
+ for (auto* h : *handles) {
+ impl->NewThreadStatusCfInfo(
+ static_cast_with_check<ColumnFamilyHandleImpl>(h)->cfd());
+ }
+ } else {
+ for (auto h : *handles) {
+ delete h;
+ }
+ handles->clear();
+ delete impl;
+ }
+ return s;
+}
+
+#else // !ROCKSDB_LITE
+
+Status DB::OpenForReadOnly(const Options& /*options*/,
+ const std::string& /*dbname*/, DB** /*dbptr*/,
+ bool /*error_if_wal_file_exists*/) {
+ return Status::NotSupported("Not supported in ROCKSDB_LITE.");
+}
+
+Status DB::OpenForReadOnly(
+ const DBOptions& /*db_options*/, const std::string& /*dbname*/,
+ const std::vector<ColumnFamilyDescriptor>& /*column_families*/,
+ std::vector<ColumnFamilyHandle*>* /*handles*/, DB** /*dbptr*/,
+ bool /*error_if_wal_file_exists*/) {
+ return Status::NotSupported("Not supported in ROCKSDB_LITE.");
+}
+#endif // !ROCKSDB_LITE
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/db_impl/db_impl_readonly.h b/src/rocksdb/db/db_impl/db_impl_readonly.h
new file mode 100644
index 000000000..b876a0fda
--- /dev/null
+++ b/src/rocksdb/db/db_impl/db_impl_readonly.h
@@ -0,0 +1,170 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <string>
+#include <vector>
+
+#include "db/db_impl/db_impl.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// TODO: Share common structure with CompactedDBImpl and DBImplSecondary
+class DBImplReadOnly : public DBImpl {
+ public:
+ DBImplReadOnly(const DBOptions& options, const std::string& dbname);
+ // No copying allowed
+ DBImplReadOnly(const DBImplReadOnly&) = delete;
+ void operator=(const DBImplReadOnly&) = delete;
+
+ virtual ~DBImplReadOnly();
+
+ // Implementations of the DB interface
+ using DB::Get;
+ virtual Status Get(const ReadOptions& options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ PinnableSlice* value) override;
+ Status Get(const ReadOptions& options, ColumnFamilyHandle* column_family,
+ const Slice& key, PinnableSlice* value,
+ std::string* timestamp) override;
+
+ // TODO: Implement ReadOnly MultiGet?
+
+ using DBImpl::NewIterator;
+ virtual Iterator* NewIterator(const ReadOptions&,
+ ColumnFamilyHandle* column_family) override;
+
+ virtual Status NewIterators(
+ const ReadOptions& options,
+ const std::vector<ColumnFamilyHandle*>& column_families,
+ std::vector<Iterator*>* iterators) override;
+
+ using DBImpl::Put;
+ virtual Status Put(const WriteOptions& /*options*/,
+ ColumnFamilyHandle* /*column_family*/,
+ const Slice& /*key*/, const Slice& /*value*/) override {
+ return Status::NotSupported("Not supported operation in read only mode.");
+ }
+
+ using DBImpl::PutEntity;
+ Status PutEntity(const WriteOptions& /* options */,
+ ColumnFamilyHandle* /* column_family */,
+ const Slice& /* key */,
+ const WideColumns& /* columns */) override {
+ return Status::NotSupported("Not supported operation in read only mode.");
+ }
+
+ using DBImpl::Merge;
+ virtual Status Merge(const WriteOptions& /*options*/,
+ ColumnFamilyHandle* /*column_family*/,
+ const Slice& /*key*/, const Slice& /*value*/) override {
+ return Status::NotSupported("Not supported operation in read only mode.");
+ }
+ using DBImpl::Delete;
+ virtual Status Delete(const WriteOptions& /*options*/,
+ ColumnFamilyHandle* /*column_family*/,
+ const Slice& /*key*/) override {
+ return Status::NotSupported("Not supported operation in read only mode.");
+ }
+ using DBImpl::SingleDelete;
+ virtual Status SingleDelete(const WriteOptions& /*options*/,
+ ColumnFamilyHandle* /*column_family*/,
+ const Slice& /*key*/) override {
+ return Status::NotSupported("Not supported operation in read only mode.");
+ }
+ virtual Status Write(const WriteOptions& /*options*/,
+ WriteBatch* /*updates*/) override {
+ return Status::NotSupported("Not supported operation in read only mode.");
+ }
+ using DBImpl::CompactRange;
+ virtual Status CompactRange(const CompactRangeOptions& /*options*/,
+ ColumnFamilyHandle* /*column_family*/,
+ const Slice* /*begin*/,
+ const Slice* /*end*/) override {
+ return Status::NotSupported("Not supported operation in read only mode.");
+ }
+
+ using DBImpl::CompactFiles;
+ virtual Status CompactFiles(
+ const CompactionOptions& /*compact_options*/,
+ ColumnFamilyHandle* /*column_family*/,
+ const std::vector<std::string>& /*input_file_names*/,
+ const int /*output_level*/, const int /*output_path_id*/ = -1,
+ std::vector<std::string>* const /*output_file_names*/ = nullptr,
+ CompactionJobInfo* /*compaction_job_info*/ = nullptr) override {
+ return Status::NotSupported("Not supported operation in read only mode.");
+ }
+
+ virtual Status DisableFileDeletions() override {
+ return Status::NotSupported("Not supported operation in read only mode.");
+ }
+
+ virtual Status EnableFileDeletions(bool /*force*/) override {
+ return Status::NotSupported("Not supported operation in read only mode.");
+ }
+ virtual Status GetLiveFiles(std::vector<std::string>& ret,
+ uint64_t* manifest_file_size,
+ bool /*flush_memtable*/) override {
+ return DBImpl::GetLiveFiles(ret, manifest_file_size,
+ false /* flush_memtable */);
+ }
+
+ using DBImpl::Flush;
+ virtual Status Flush(const FlushOptions& /*options*/,
+ ColumnFamilyHandle* /*column_family*/) override {
+ return Status::NotSupported("Not supported operation in read only mode.");
+ }
+
+ using DBImpl::SyncWAL;
+ virtual Status SyncWAL() override {
+ return Status::NotSupported("Not supported operation in read only mode.");
+ }
+
+ using DB::IngestExternalFile;
+ virtual Status IngestExternalFile(
+ ColumnFamilyHandle* /*column_family*/,
+ const std::vector<std::string>& /*external_files*/,
+ const IngestExternalFileOptions& /*ingestion_options*/) override {
+ return Status::NotSupported("Not supported operation in read only mode.");
+ }
+
+ using DB::CreateColumnFamilyWithImport;
+ virtual Status CreateColumnFamilyWithImport(
+ const ColumnFamilyOptions& /*options*/,
+ const std::string& /*column_family_name*/,
+ const ImportColumnFamilyOptions& /*import_options*/,
+ const ExportImportFilesMetaData& /*metadata*/,
+ ColumnFamilyHandle** /*handle*/) override {
+ return Status::NotSupported("Not supported operation in read only mode.");
+ }
+
+ // FIXME: some missing overrides for more "write" functions
+
+ protected:
+#ifndef ROCKSDB_LITE
+ Status FlushForGetLiveFiles() override {
+ // No-op for read-only DB
+ return Status::OK();
+ }
+#endif // !ROCKSDB_LITE
+
+ private:
+ // A "helper" function for DB::OpenForReadOnly without column families
+ // to reduce unnecessary I/O. It has the same functionality as
+ // DB::OpenForReadOnly with column families, but does not check the
+ // existence of dbname in the file system.
+ static Status OpenForReadOnlyWithoutCheck(
+ const DBOptions& db_options, const std::string& dbname,
+ const std::vector<ColumnFamilyDescriptor>& column_families,
+ std::vector<ColumnFamilyHandle*>* handles, DB** dbptr,
+ bool error_if_wal_file_exists = false);
+ friend class DB;
+};
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/db_impl/db_impl_secondary.cc b/src/rocksdb/db/db_impl/db_impl_secondary.cc
new file mode 100644
index 000000000..5189d17d9
--- /dev/null
+++ b/src/rocksdb/db/db_impl/db_impl_secondary.cc
@@ -0,0 +1,967 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/db_impl/db_impl_secondary.h"
+
+#include <cinttypes>
+
+#include "db/arena_wrapped_db_iter.h"
+#include "db/merge_context.h"
+#include "logging/auto_roll_logger.h"
+#include "logging/logging.h"
+#include "monitoring/perf_context_imp.h"
+#include "rocksdb/configurable.h"
+#include "util/cast_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#ifndef ROCKSDB_LITE
+DBImplSecondary::DBImplSecondary(const DBOptions& db_options,
+ const std::string& dbname,
+ std::string secondary_path)
+ : DBImpl(db_options, dbname, false, true, true),
+ secondary_path_(std::move(secondary_path)) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "Opening the db in secondary mode");
+ LogFlush(immutable_db_options_.info_log);
+}
+
+DBImplSecondary::~DBImplSecondary() {}
+
+Status DBImplSecondary::Recover(
+ const std::vector<ColumnFamilyDescriptor>& column_families,
+ bool /*readonly*/, bool /*error_if_wal_file_exists*/,
+ bool /*error_if_data_exists_in_wals*/, uint64_t*,
+ RecoveryContext* /*recovery_ctx*/) {
+ mutex_.AssertHeld();
+
+ JobContext job_context(0);
+ Status s;
+ s = static_cast<ReactiveVersionSet*>(versions_.get())
+ ->Recover(column_families, &manifest_reader_, &manifest_reporter_,
+ &manifest_reader_status_);
+ if (!s.ok()) {
+ if (manifest_reader_status_) {
+ manifest_reader_status_->PermitUncheckedError();
+ }
+ return s;
+ }
+ if (immutable_db_options_.paranoid_checks && s.ok()) {
+ s = CheckConsistency();
+ }
+ // Initialize max_total_in_memory_state_ before recovering the logs.
+ max_total_in_memory_state_ = 0;
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ auto* mutable_cf_options = cfd->GetLatestMutableCFOptions();
+ max_total_in_memory_state_ += mutable_cf_options->write_buffer_size *
+ mutable_cf_options->max_write_buffer_number;
+ }
+ if (s.ok()) {
+ default_cf_handle_ = new ColumnFamilyHandleImpl(
+ versions_->GetColumnFamilySet()->GetDefault(), this, &mutex_);
+ default_cf_internal_stats_ = default_cf_handle_->cfd()->internal_stats();
+
+ std::unordered_set<ColumnFamilyData*> cfds_changed;
+ s = FindAndRecoverLogFiles(&cfds_changed, &job_context);
+ }
+
+ if (s.IsPathNotFound()) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "Secondary tries to read WAL, but WAL file(s) have already "
+ "been purged by primary.");
+ s = Status::OK();
+ }
+ // TODO: update options_file_number_ needed?
+
+ job_context.Clean();
+ return s;
+}
+
+// Find new WALs and apply them, in order, to the secondary instance
+Status DBImplSecondary::FindAndRecoverLogFiles(
+ std::unordered_set<ColumnFamilyData*>* cfds_changed,
+ JobContext* job_context) {
+ assert(nullptr != cfds_changed);
+ assert(nullptr != job_context);
+ Status s;
+ std::vector<uint64_t> logs;
+ s = FindNewLogNumbers(&logs);
+ if (s.ok() && !logs.empty()) {
+ SequenceNumber next_sequence(kMaxSequenceNumber);
+ s = RecoverLogFiles(logs, &next_sequence, cfds_changed, job_context);
+ }
+ return s;
+}
+
+// List wal_dir, find all new WALs, and return their log numbers
+Status DBImplSecondary::FindNewLogNumbers(std::vector<uint64_t>* logs) {
+ assert(logs != nullptr);
+ std::vector<std::string> filenames;
+ Status s;
+ IOOptions io_opts;
+ io_opts.do_not_recurse = true;
+ s = immutable_db_options_.fs->GetChildren(immutable_db_options_.GetWalDir(),
+ io_opts, &filenames,
+ /*IODebugContext*=*/nullptr);
+ if (s.IsNotFound()) {
+ return Status::InvalidArgument("Failed to open wal_dir",
+ immutable_db_options_.GetWalDir());
+ } else if (!s.ok()) {
+ return s;
+ }
+
+ // if log_readers_ is non-empty, it means we have applied all logs with log
+ // numbers smaller than the smallest log in log_readers_, so there is no
+ // need to pass these logs to RecoverLogFiles
+ uint64_t log_number_min = 0;
+ if (!log_readers_.empty()) {
+ log_number_min = log_readers_.begin()->first;
+ }
+ for (size_t i = 0; i < filenames.size(); i++) {
+ uint64_t number;
+ FileType type;
+ if (ParseFileName(filenames[i], &number, &type) && type == kWalFile &&
+ number >= log_number_min) {
+ logs->push_back(number);
+ }
+ }
+ // Recover logs in the order that they were generated
+ if (!logs->empty()) {
+ std::sort(logs->begin(), logs->end());
+ }
+ return s;
+}
+
+Status DBImplSecondary::MaybeInitLogReader(
+ uint64_t log_number, log::FragmentBufferedReader** log_reader) {
+ auto iter = log_readers_.find(log_number);
+ // make sure the log file is still present
+ if (iter == log_readers_.end() ||
+ iter->second->reader_->GetLogNumber() != log_number) {
+ // delete the obsolete log reader if the log number does not match
+ if (iter != log_readers_.end()) {
+ log_readers_.erase(iter);
+ }
+ // initialize log reader from log_number
+ // TODO: min_log_number_to_keep_2pc check needed?
+ // Open the log file
+ std::string fname =
+ LogFileName(immutable_db_options_.GetWalDir(), log_number);
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "Recovering log #%" PRIu64 " mode %d", log_number,
+ static_cast<int>(immutable_db_options_.wal_recovery_mode));
+
+ std::unique_ptr<SequentialFileReader> file_reader;
+ {
+ std::unique_ptr<FSSequentialFile> file;
+ Status status = fs_->NewSequentialFile(
+ fname, fs_->OptimizeForLogRead(file_options_), &file, nullptr);
+ if (!status.ok()) {
+ *log_reader = nullptr;
+ return status;
+ }
+ file_reader.reset(new SequentialFileReader(
+ std::move(file), fname, immutable_db_options_.log_readahead_size,
+ io_tracer_));
+ }
+
+ // Create the log reader.
+ LogReaderContainer* log_reader_container = new LogReaderContainer(
+ env_, immutable_db_options_.info_log, std::move(fname),
+ std::move(file_reader), log_number);
+ log_readers_.insert(std::make_pair(
+ log_number, std::unique_ptr<LogReaderContainer>(log_reader_container)));
+ }
+ iter = log_readers_.find(log_number);
+ assert(iter != log_readers_.end());
+ *log_reader = iter->second->reader_;
+ return Status::OK();
+}
+
+// After manifest recovery, replay WALs and refresh log_readers_ if necessary
+// REQUIRES: log_numbers are sorted in ascending order
+Status DBImplSecondary::RecoverLogFiles(
+ const std::vector<uint64_t>& log_numbers, SequenceNumber* next_sequence,
+ std::unordered_set<ColumnFamilyData*>* cfds_changed,
+ JobContext* job_context) {
+ assert(nullptr != cfds_changed);
+ assert(nullptr != job_context);
+ mutex_.AssertHeld();
+ Status status;
+ for (auto log_number : log_numbers) {
+ log::FragmentBufferedReader* reader = nullptr;
+ status = MaybeInitLogReader(log_number, &reader);
+ if (!status.ok()) {
+ return status;
+ }
+ assert(reader != nullptr);
+ }
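+ // Replay the WALs in order, inserting each write batch into the
+ // memtables of the column families it touches.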
+ for (auto log_number : log_numbers) {
+ auto it = log_readers_.find(log_number);
+ assert(it != log_readers_.end());
+ log::FragmentBufferedReader* reader = it->second->reader_;
+ Status* wal_read_status = it->second->status_;
+ assert(wal_read_status);
+ // Manually update the file number allocation counter in VersionSet.
+ versions_->MarkFileNumberUsed(log_number);
+
+ // Determine if we should tolerate incomplete records at the tail end of
+ // the log.
+ // Read all the records and add them to a memtable
+ std::string scratch;
+ Slice record;
+ WriteBatch batch;
+
+ while (reader->ReadRecord(&record, &scratch,
+ immutable_db_options_.wal_recovery_mode) &&
+ wal_read_status->ok() && status.ok()) {
+ if (record.size() < WriteBatchInternal::kHeader) {
+ reader->GetReporter()->Corruption(
+ record.size(), Status::Corruption("log record too small"));
+ continue;
+ }
+ status = WriteBatchInternal::SetContents(&batch, record);
+ if (!status.ok()) {
+ break;
+ }
+ SequenceNumber seq_of_batch = WriteBatchInternal::Sequence(&batch);
+ std::vector<uint32_t> column_family_ids;
+ status = CollectColumnFamilyIdsFromWriteBatch(batch, &column_family_ids);
+ if (status.ok()) {
+ for (const auto id : column_family_ids) {
+ ColumnFamilyData* cfd =
+ versions_->GetColumnFamilySet()->GetColumnFamily(id);
+ if (cfd == nullptr) {
+ continue;
+ }
+ if (cfds_changed->count(cfd) == 0) {
+ cfds_changed->insert(cfd);
+ }
+ const std::vector<FileMetaData*>& l0_files =
+ cfd->current()->storage_info()->LevelFiles(0);
+ SequenceNumber seq =
+ l0_files.empty() ? 0 : l0_files.back()->fd.largest_seqno;
+ // If the write batch's sequence number is no larger than the largest
+ // sequence number persisted for this column family, then its data must
+ // reside in an SST that has already been added in the prior MANIFEST
+ // replay.
+ if (seq_of_batch <= seq) {
+ continue;
+ }
+ auto curr_log_num = std::numeric_limits<uint64_t>::max();
+ if (cfd_to_current_log_.count(cfd) > 0) {
+ curr_log_num = cfd_to_current_log_[cfd];
+ }
+ // If the active memtable contains records added by replaying an
+ // earlier WAL, then we need to seal the memtable, add it to the
+ // immutable memtable list and create a new active memtable.
+ if (!cfd->mem()->IsEmpty() &&
+ (curr_log_num == std::numeric_limits<uint64_t>::max() ||
+ curr_log_num != log_number)) {
+ const MutableCFOptions mutable_cf_options =
+ *cfd->GetLatestMutableCFOptions();
+ MemTable* new_mem =
+ cfd->ConstructNewMemtable(mutable_cf_options, seq_of_batch);
+ cfd->mem()->SetNextLogNumber(log_number);
+ cfd->mem()->ConstructFragmentedRangeTombstones();
+ cfd->imm()->Add(cfd->mem(), &job_context->memtables_to_free);
+ new_mem->Ref();
+ cfd->SetMemtable(new_mem);
+ }
+ }
+ bool has_valid_writes = false;
+ status = WriteBatchInternal::InsertInto(
+ &batch, column_family_memtables_.get(),
+ nullptr /* flush_scheduler */, nullptr /* trim_history_scheduler*/,
+ true, log_number, this, false /* concurrent_memtable_writes */,
+ next_sequence, &has_valid_writes, seq_per_batch_, batch_per_txn_);
+ }
+ // If a column family was not found, it might mean that the WAL write
+ // batch references a column family that was dropped after the insert.
+ // We don't want to fail the whole write batch in that case -- we just
+ // ignore the update. That's why we set ignore missing column families
+ // to true. Passing a null flush_scheduler disables memtable flushing,
+ // which is needed for secondary instances.
+ if (status.ok()) {
+ for (const auto id : column_family_ids) {
+ ColumnFamilyData* cfd =
+ versions_->GetColumnFamilySet()->GetColumnFamily(id);
+ if (cfd == nullptr) {
+ continue;
+ }
+ std::unordered_map<ColumnFamilyData*, uint64_t>::iterator iter =
+ cfd_to_current_log_.find(cfd);
+ if (iter == cfd_to_current_log_.end()) {
+ cfd_to_current_log_.insert({cfd, log_number});
+ } else if (log_number > iter->second) {
+ iter->second = log_number;
+ }
+ }
+ auto last_sequence = *next_sequence - 1;
+ if ((*next_sequence != kMaxSequenceNumber) &&
+ (versions_->LastSequence() <= last_sequence)) {
+ versions_->SetLastAllocatedSequence(last_sequence);
+ versions_->SetLastPublishedSequence(last_sequence);
+ versions_->SetLastSequence(last_sequence);
+ }
+ } else {
+ // We are treating this as a failure while reading since we read valid
+ // blocks that do not form coherent data
+ reader->GetReporter()->Corruption(record.size(), status);
+ }
+ }
+ if (status.ok() && !wal_read_status->ok()) {
+ status = *wal_read_status;
+ }
+ if (!status.ok()) {
+ return status;
+ }
+ }
+ // Remove log readers from the map after successfully recovering the WALs
+ if (log_readers_.size() > 1) {
+ auto erase_iter = log_readers_.begin();
+ std::advance(erase_iter, log_readers_.size() - 1);
+ log_readers_.erase(log_readers_.begin(), erase_iter);
+ }
+ return status;
+}
+
+// Implementation of the DB interface
+Status DBImplSecondary::Get(const ReadOptions& read_options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ PinnableSlice* value) {
+ return GetImpl(read_options, column_family, key, value,
+ /*timestamp*/ nullptr);
+}
+
+Status DBImplSecondary::Get(const ReadOptions& read_options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ PinnableSlice* value, std::string* timestamp) {
+ return GetImpl(read_options, column_family, key, value, timestamp);
+}
+
+Status DBImplSecondary::GetImpl(const ReadOptions& read_options,
+ ColumnFamilyHandle* column_family,
+ const Slice& key, PinnableSlice* pinnable_val,
+ std::string* timestamp) {
+ assert(pinnable_val != nullptr);
+ PERF_CPU_TIMER_GUARD(get_cpu_nanos, immutable_db_options_.clock);
+ StopWatch sw(immutable_db_options_.clock, stats_, DB_GET);
+ PERF_TIMER_GUARD(get_snapshot_time);
+
+ assert(column_family);
+ if (read_options.timestamp) {
+ const Status s = FailIfTsMismatchCf(
+ column_family, *(read_options.timestamp), /*ts_for_read=*/true);
+ if (!s.ok()) {
+ return s;
+ }
+ } else {
+ const Status s = FailIfCfHasTs(column_family);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ // Clear the timestamp for the returned result so that we can later
+ // distinguish between a tombstone and a key that has never been written.
+ if (timestamp) {
+ timestamp->clear();
+ }
+
+ auto cfh = static_cast<ColumnFamilyHandleImpl*>(column_family);
+ ColumnFamilyData* cfd = cfh->cfd();
+ if (tracer_) {
+ InstrumentedMutexLock lock(&trace_mutex_);
+ if (tracer_) {
+ tracer_->Get(column_family, key);
+ }
+ }
+ // Acquire SuperVersion
+ SuperVersion* super_version = GetAndRefSuperVersion(cfd);
+ SequenceNumber snapshot = versions_->LastSequence();
+ GetWithTimestampReadCallback read_cb(snapshot);
+ MergeContext merge_context;
+ SequenceNumber max_covering_tombstone_seq = 0;
+ Status s;
+ LookupKey lkey(key, snapshot, read_options.timestamp);
+ PERF_TIMER_STOP(get_snapshot_time);
+
+ bool done = false;
+ const Comparator* ucmp = column_family->GetComparator();
+ assert(ucmp);
+ std::string* ts = ucmp->timestamp_size() > 0 ? timestamp : nullptr;
+ if (super_version->mem->Get(lkey, pinnable_val->GetSelf(),
+ /*columns=*/nullptr, ts, &s, &merge_context,
+ &max_covering_tombstone_seq, read_options,
+ false /* immutable_memtable */, &read_cb)) {
+ done = true;
+ pinnable_val->PinSelf();
+ RecordTick(stats_, MEMTABLE_HIT);
+ } else if ((s.ok() || s.IsMergeInProgress()) &&
+ super_version->imm->Get(
+ lkey, pinnable_val->GetSelf(), /*columns=*/nullptr, ts, &s,
+ &merge_context, &max_covering_tombstone_seq, read_options,
+ &read_cb)) {
+ done = true;
+ pinnable_val->PinSelf();
+ RecordTick(stats_, MEMTABLE_HIT);
+ }
+ if (!done && !s.ok() && !s.IsMergeInProgress()) {
+ ReturnAndCleanupSuperVersion(cfd, super_version);
+ return s;
+ }
+ if (!done) {
+ PERF_TIMER_GUARD(get_from_output_files_time);
+ PinnedIteratorsManager pinned_iters_mgr;
+ super_version->current->Get(
+ read_options, lkey, pinnable_val, /*columns=*/nullptr, ts, &s,
+ &merge_context, &max_covering_tombstone_seq, &pinned_iters_mgr,
+ /*value_found*/ nullptr,
+ /*key_exists*/ nullptr, /*seq*/ nullptr, &read_cb, /*is_blob*/ nullptr,
+ /*do_merge*/ true);
+ RecordTick(stats_, MEMTABLE_MISS);
+ }
+ {
+ PERF_TIMER_GUARD(get_post_process_time);
+ ReturnAndCleanupSuperVersion(cfd, super_version);
+ RecordTick(stats_, NUMBER_KEYS_READ);
+ size_t size = pinnable_val->size();
+ RecordTick(stats_, BYTES_READ, size);
+ RecordTimeToHistogram(stats_, BYTES_PER_READ, size);
+ PERF_COUNTER_ADD(get_read_bytes, size);
+ }
+ return s;
+}
+
+Iterator* DBImplSecondary::NewIterator(const ReadOptions& read_options,
+ ColumnFamilyHandle* column_family) {
+ if (read_options.managed) {
+ return NewErrorIterator(
+ Status::NotSupported("Managed iterator is not supported anymore."));
+ }
+ if (read_options.read_tier == kPersistedTier) {
+ return NewErrorIterator(Status::NotSupported(
+ "ReadTier::kPersistedData is not yet supported in iterators."));
+ }
+
+ assert(column_family);
+ if (read_options.timestamp) {
+ const Status s = FailIfTsMismatchCf(
+ column_family, *(read_options.timestamp), /*ts_for_read=*/true);
+ if (!s.ok()) {
+ return NewErrorIterator(s);
+ }
+ } else {
+ const Status s = FailIfCfHasTs(column_family);
+ if (!s.ok()) {
+ return NewErrorIterator(s);
+ }
+ }
+
+ Iterator* result = nullptr;
+ auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+ auto cfd = cfh->cfd();
+ ReadCallback* read_callback = nullptr; // No read callback provided.
+ if (read_options.tailing) {
+ return NewErrorIterator(Status::NotSupported(
+ "tailing iterator not supported in secondary mode"));
+ } else if (read_options.snapshot != nullptr) {
+ // TODO (yanqin) support snapshot.
+ return NewErrorIterator(
+ Status::NotSupported("snapshot not supported in secondary mode"));
+ } else {
+ SequenceNumber snapshot(kMaxSequenceNumber);
+ result = NewIteratorImpl(read_options, cfd, snapshot, read_callback);
+ }
+ return result;
+}
+
+ArenaWrappedDBIter* DBImplSecondary::NewIteratorImpl(
+ const ReadOptions& read_options, ColumnFamilyData* cfd,
+ SequenceNumber snapshot, ReadCallback* read_callback,
+ bool expose_blob_index, bool allow_refresh) {
+ assert(nullptr != cfd);
+ SuperVersion* super_version = cfd->GetReferencedSuperVersion(this);
+ assert(snapshot == kMaxSequenceNumber);
+ snapshot = versions_->LastSequence();
+ assert(snapshot != kMaxSequenceNumber);
+ auto db_iter = NewArenaWrappedDbIterator(
+ env_, read_options, *cfd->ioptions(), super_version->mutable_cf_options,
+ super_version->current, snapshot,
+ super_version->mutable_cf_options.max_sequential_skip_in_iterations,
+ super_version->version_number, read_callback, this, cfd,
+ expose_blob_index, read_options.snapshot ? false : allow_refresh);
+ auto internal_iter = NewInternalIterator(
+ db_iter->GetReadOptions(), cfd, super_version, db_iter->GetArena(),
+ snapshot, /* allow_unprepared_value */ true, db_iter);
+ db_iter->SetIterUnderDBIter(internal_iter);
+ return db_iter;
+}
+
+Status DBImplSecondary::NewIterators(
+ const ReadOptions& read_options,
+ const std::vector<ColumnFamilyHandle*>& column_families,
+ std::vector<Iterator*>* iterators) {
+ if (read_options.managed) {
+ return Status::NotSupported("Managed iterator is not supported anymore.");
+ }
+ if (read_options.read_tier == kPersistedTier) {
+ return Status::NotSupported(
+ "ReadTier::kPersistedData is not yet supported in iterators.");
+ }
+ ReadCallback* read_callback = nullptr; // No read callback provided.
+ if (iterators == nullptr) {
+ return Status::InvalidArgument("iterators not allowed to be nullptr");
+ }
+
+ if (read_options.timestamp) {
+ for (auto* cf : column_families) {
+ assert(cf);
+ const Status s = FailIfTsMismatchCf(cf, *(read_options.timestamp),
+ /*ts_for_read=*/true);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+ } else {
+ for (auto* cf : column_families) {
+ assert(cf);
+ const Status s = FailIfCfHasTs(cf);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+ }
+ iterators->clear();
+ iterators->reserve(column_families.size());
+ if (read_options.tailing) {
+ return Status::NotSupported(
+ "tailing iterator not supported in secondary mode");
+ } else if (read_options.snapshot != nullptr) {
+ // TODO (yanqin) support snapshot.
+ return Status::NotSupported("snapshot not supported in secondary mode");
+ } else {
+ SequenceNumber read_seq(kMaxSequenceNumber);
+ for (auto cfh : column_families) {
+ ColumnFamilyData* cfd = static_cast<ColumnFamilyHandleImpl*>(cfh)->cfd();
+ iterators->push_back(
+ NewIteratorImpl(read_options, cfd, read_seq, read_callback));
+ }
+ }
+ return Status::OK();
+}
+
+Status DBImplSecondary::CheckConsistency() {
+ mutex_.AssertHeld();
+ Status s = DBImpl::CheckConsistency();
+ // If DBImpl::CheckConsistency(), which is stricter, returns success, then
+ // we do not need to give it a second chance.
+ if (s.ok()) {
+ return s;
+ }
+ // It's possible for DBImpl::CheckConsistency() to fail because the primary
+ // may have removed certain files, causing the GetFileSize(name) call to
+ // fail and return PathNotFound. In this case, we take a best-effort
+ // approach and just proceed.
+ TEST_SYNC_POINT_CALLBACK(
+ "DBImplSecondary::CheckConsistency:AfterFirstAttempt", &s);
+
+ if (immutable_db_options_.skip_checking_sst_file_sizes_on_db_open) {
+ return Status::OK();
+ }
+
+ std::vector<LiveFileMetaData> metadata;
+ versions_->GetLiveFilesMetaData(&metadata);
+
+ std::string corruption_messages;
+ for (const auto& md : metadata) {
+ // md.name has a leading "/".
+ std::string file_path = md.db_path + md.name;
+
+ uint64_t fsize = 0;
+ s = env_->GetFileSize(file_path, &fsize);
+ if (!s.ok() &&
+ (env_->GetFileSize(Rocks2LevelTableFileName(file_path), &fsize).ok() ||
+ s.IsPathNotFound())) {
+ s = Status::OK();
+ }
+ if (!s.ok()) {
+ corruption_messages +=
+ "Can't access " + md.name + ": " + s.ToString() + "\n";
+ }
+ }
+ return corruption_messages.empty() ? Status::OK()
+ : Status::Corruption(corruption_messages);
+}
+
+Status DBImplSecondary::TryCatchUpWithPrimary() {
+ assert(versions_.get() != nullptr);
+ assert(manifest_reader_.get() != nullptr);
+ Status s;
+ // read the manifest and apply new changes to the secondary instance
+ std::unordered_set<ColumnFamilyData*> cfds_changed;
+ JobContext job_context(0, true /*create_superversion*/);
+ {
+ InstrumentedMutexLock lock_guard(&mutex_);
+ s = static_cast_with_check<ReactiveVersionSet>(versions_.get())
+ ->ReadAndApply(&mutex_, &manifest_reader_,
+ manifest_reader_status_.get(), &cfds_changed);
+
+ ROCKS_LOG_INFO(immutable_db_options_.info_log, "Last sequence is %" PRIu64,
+ static_cast<uint64_t>(versions_->LastSequence()));
+ for (ColumnFamilyData* cfd : cfds_changed) {
+ if (cfd->IsDropped()) {
+ ROCKS_LOG_DEBUG(immutable_db_options_.info_log, "[%s] is dropped\n",
+ cfd->GetName().c_str());
+ continue;
+ }
+ VersionStorageInfo::LevelSummaryStorage tmp;
+ ROCKS_LOG_DEBUG(immutable_db_options_.info_log,
+ "[%s] Level summary: %s\n", cfd->GetName().c_str(),
+ cfd->current()->storage_info()->LevelSummary(&tmp));
+ }
+
+ // list wal_dir to discover new WALs and apply new changes to the secondary
+ // instance
+ if (s.ok()) {
+ s = FindAndRecoverLogFiles(&cfds_changed, &job_context);
+ }
+ if (s.IsPathNotFound()) {
+ ROCKS_LOG_INFO(
+ immutable_db_options_.info_log,
+ "Secondary tries to read WAL, but WAL file(s) have already "
+ "been purged by primary.");
+ s = Status::OK();
+ }
+ if (s.ok()) {
+ for (auto cfd : cfds_changed) {
+ cfd->imm()->RemoveOldMemTables(cfd->GetLogNumber(),
+ &job_context.memtables_to_free);
+ auto& sv_context = job_context.superversion_contexts.back();
+ cfd->InstallSuperVersion(&sv_context, &mutex_);
+ sv_context.NewSuperVersion();
+ }
+ }
+ }
+ job_context.Clean();
+
+ // Cleanup unused, obsolete files.
+ JobContext purge_files_job_context(0);
+ {
+ InstrumentedMutexLock lock_guard(&mutex_);
+ // Currently, the secondary instance does not own the database files, so
+ // it is unnecessary for the secondary to force a full scan.
+ FindObsoleteFiles(&purge_files_job_context, /*force=*/false);
+ }
+ if (purge_files_job_context.HaveSomethingToDelete()) {
+ PurgeObsoleteFiles(purge_files_job_context);
+ }
+ purge_files_job_context.Clean();
+ return s;
+}
+
+Status DB::OpenAsSecondary(const Options& options, const std::string& dbname,
+ const std::string& secondary_path, DB** dbptr) {
+ *dbptr = nullptr;
+
+ DBOptions db_options(options);
+ ColumnFamilyOptions cf_options(options);
+ std::vector<ColumnFamilyDescriptor> column_families;
+ column_families.emplace_back(kDefaultColumnFamilyName, cf_options);
+ std::vector<ColumnFamilyHandle*> handles;
+
+ Status s = DB::OpenAsSecondary(db_options, dbname, secondary_path,
+ column_families, &handles, dbptr);
+ if (s.ok()) {
+ assert(handles.size() == 1);
+ delete handles[0];
+ }
+ return s;
+}
+
+Status DB::OpenAsSecondary(
+ const DBOptions& db_options, const std::string& dbname,
+ const std::string& secondary_path,
+ const std::vector<ColumnFamilyDescriptor>& column_families,
+ std::vector<ColumnFamilyHandle*>* handles, DB** dbptr) {
+ *dbptr = nullptr;
+
+ DBOptions tmp_opts(db_options);
+ Status s;
+ if (nullptr == tmp_opts.info_log) {
+ s = CreateLoggerFromOptions(secondary_path, tmp_opts, &tmp_opts.info_log);
+ if (!s.ok()) {
+ tmp_opts.info_log = nullptr;
+ return s;
+ }
+ }
+
+ assert(tmp_opts.info_log != nullptr);
+ if (db_options.max_open_files != -1) {
+ std::ostringstream oss;
+ oss << "The primary instance may delete all types of files after they "
+ "become obsolete. The application can coordinate the primary and "
+ "secondary so that primary does not delete/rename files that are "
+ "currently being used by the secondary. Alternatively, a custom "
+ "Env/FS can be provided such that files become inaccessible only "
+ "after all primary and secondaries indicate that they are obsolete "
+ "and deleted. If the above two are not possible, you can open the "
+ "secondary instance with `max_open_files==-1` so that secondary "
+ "will eagerly keep all table files open. Even if a file is deleted, "
+ "its content can still be accessed via a prior open file "
+ "descriptor. This is a hacky workaround for only table files. If "
+ "none of the above is done, then point lookup or "
+ "range scan via the secondary instance can result in IOError: file "
+ "not found. This can be resolved by retrying "
+ "TryCatchUpWithPrimary().";
+ ROCKS_LOG_WARN(tmp_opts.info_log, "%s", oss.str().c_str());
+ }
+
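+ // The secondary uses a ReactiveVersionSet, which tails the primary's
+ // MANIFEST instead of owning it.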
+ handles->clear();
+ DBImplSecondary* impl = new DBImplSecondary(tmp_opts, dbname, secondary_path);
+ impl->versions_.reset(new ReactiveVersionSet(
+ dbname, &impl->immutable_db_options_, impl->file_options_,
+ impl->table_cache_.get(), impl->write_buffer_manager_,
+ &impl->write_controller_, impl->io_tracer_));
+ impl->column_family_memtables_.reset(
+ new ColumnFamilyMemTablesImpl(impl->versions_->GetColumnFamilySet()));
+ impl->wal_in_db_path_ = impl->immutable_db_options_.IsWalDirSameAsDBPath();
+
+ impl->mutex_.Lock();
+ s = impl->Recover(column_families, true, false, false);
+ if (s.ok()) {
+ for (auto cf : column_families) {
+ auto cfd =
+ impl->versions_->GetColumnFamilySet()->GetColumnFamily(cf.name);
+ if (nullptr == cfd) {
+ s = Status::InvalidArgument("Column family not found", cf.name);
+ break;
+ }
+ handles->push_back(new ColumnFamilyHandleImpl(cfd, impl, &impl->mutex_));
+ }
+ }
+ SuperVersionContext sv_context(true /* create_superversion */);
+ if (s.ok()) {
+ for (auto cfd : *impl->versions_->GetColumnFamilySet()) {
+ sv_context.NewSuperVersion();
+ cfd->InstallSuperVersion(&sv_context, &impl->mutex_);
+ }
+ }
+ impl->mutex_.Unlock();
+ sv_context.Clean();
+ if (s.ok()) {
+ *dbptr = impl;
+ for (auto h : *handles) {
+ impl->NewThreadStatusCfInfo(
+ static_cast_with_check<ColumnFamilyHandleImpl>(h)->cfd());
+ }
+ } else {
+ for (auto h : *handles) {
+ delete h;
+ }
+ handles->clear();
+ delete impl;
+ }
+ return s;
+}
+
+Status DBImplSecondary::CompactWithoutInstallation(
+ const OpenAndCompactOptions& options, ColumnFamilyHandle* cfh,
+ const CompactionServiceInput& input, CompactionServiceResult* result) {
+ if (options.canceled && options.canceled->load(std::memory_order_acquire)) {
+ return Status::Incomplete(Status::SubCode::kManualCompactionPaused);
+ }
+ InstrumentedMutexLock l(&mutex_);
+ auto cfd = static_cast_with_check<ColumnFamilyHandleImpl>(cfh)->cfd();
+ if (!cfd) {
+ return Status::InvalidArgument("Cannot find column family " +
+ cfh->GetName());
+ }
+
+ std::unordered_set<uint64_t> input_set;
+ for (const auto& file_name : input.input_files) {
+ input_set.insert(TableFileNameToNumber(file_name));
+ }
+
+ auto* version = cfd->current();
+
+ ColumnFamilyMetaData cf_meta;
+ version->GetColumnFamilyMetaData(&cf_meta);
+
+ const MutableCFOptions* mutable_cf_options = cfd->GetLatestMutableCFOptions();
+ ColumnFamilyOptions cf_options = cfd->GetLatestCFOptions();
+ VersionStorageInfo* vstorage = version->storage_info();
+
+ // Use comp_options to reuse some CompactFiles functions
+ CompactionOptions comp_options;
+ comp_options.compression = kDisableCompressionOption;
+ comp_options.output_file_size_limit = MaxFileSizeForLevel(
+ *mutable_cf_options, input.output_level, cf_options.compaction_style,
+ vstorage->base_level(), cf_options.level_compaction_dynamic_level_bytes);
+
+ std::vector<CompactionInputFiles> input_files;
+ Status s = cfd->compaction_picker()->GetCompactionInputsFromFileNumbers(
+ &input_files, &input_set, vstorage, comp_options);
+ if (!s.ok()) {
+ return s;
+ }
+
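+ // Assemble a Compaction over the requested input files; its output will be
+ // written under secondary_path_ and described in `result` without being
+ // installed into this instance's MANIFEST.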
+ std::unique_ptr<Compaction> c;
+ assert(cfd->compaction_picker());
+ c.reset(cfd->compaction_picker()->CompactFiles(
+ comp_options, input_files, input.output_level, vstorage,
+ *mutable_cf_options, mutable_db_options_, 0));
+ assert(c != nullptr);
+
+ c->SetInputVersion(version);
+
+ // Create the output directory if it does not exist yet
+ std::unique_ptr<FSDirectory> output_dir;
+ s = CreateAndNewDirectory(fs_.get(), secondary_path_, &output_dir);
+ if (!s.ok()) {
+ return s;
+ }
+
+ LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL,
+ immutable_db_options_.info_log.get());
+
+ const int job_id = next_job_id_.fetch_add(1);
+
+ // Use the primary host's db_id for running the compaction, but use the
+ // local db_session_id, to make sure the unique id differs from those of
+ // the remote compactors. The id is generated from db_id, db_session_id
+ // and orig_file_number; unlike local compaction, remote compaction cannot
+ // guarantee the uniqueness of orig_file_number, because the file number
+ // is only assigned when the compaction is done.
+ CompactionServiceCompactionJob compaction_job(
+ job_id, c.get(), immutable_db_options_, mutable_db_options_,
+ file_options_for_compaction_, versions_.get(), &shutting_down_,
+ &log_buffer, output_dir.get(), stats_, &mutex_, &error_handler_,
+ input.snapshots, table_cache_, &event_logger_, dbname_, io_tracer_,
+ options.canceled ? *options.canceled : kManualCompactionCanceledFalse_,
+ input.db_id, db_session_id_, secondary_path_, input, result);
+
+ mutex_.Unlock();
+ s = compaction_job.Run();
+ mutex_.Lock();
+
+ // clean up
+ compaction_job.io_status().PermitUncheckedError();
+ compaction_job.CleanupCompaction();
+ c->ReleaseCompactionFiles(s);
+ c.reset();
+
+ TEST_SYNC_POINT_CALLBACK("DBImplSecondary::CompactWithoutInstallation::End",
+ &s);
+ result->status = s;
+ return s;
+}
+
+Status DB::OpenAndCompact(
+ const OpenAndCompactOptions& options, const std::string& name,
+ const std::string& output_directory, const std::string& input,
+ std::string* output,
+ const CompactionServiceOptionsOverride& override_options) {
+ if (options.canceled && options.canceled->load(std::memory_order_acquire)) {
+ return Status::Incomplete(Status::SubCode::kManualCompactionPaused);
+ }
+ CompactionServiceInput compaction_input;
+ Status s = CompactionServiceInput::Read(input, &compaction_input);
+ if (!s.ok()) {
+ return s;
+ }
+
+ compaction_input.db_options.max_open_files = -1;
+ compaction_input.db_options.compaction_service = nullptr;
+ if (compaction_input.db_options.statistics) {
+ compaction_input.db_options.statistics.reset();
+ }
+ compaction_input.db_options.env = override_options.env;
+ compaction_input.db_options.file_checksum_gen_factory =
+ override_options.file_checksum_gen_factory;
+ compaction_input.db_options.statistics = override_options.statistics;
+ compaction_input.column_family.options.comparator =
+ override_options.comparator;
+ compaction_input.column_family.options.merge_operator =
+ override_options.merge_operator;
+ compaction_input.column_family.options.compaction_filter =
+ override_options.compaction_filter;
+ compaction_input.column_family.options.compaction_filter_factory =
+ override_options.compaction_filter_factory;
+ compaction_input.column_family.options.prefix_extractor =
+ override_options.prefix_extractor;
+ compaction_input.column_family.options.table_factory =
+ override_options.table_factory;
+ compaction_input.column_family.options.sst_partitioner_factory =
+ override_options.sst_partitioner_factory;
+ compaction_input.column_family.options.table_properties_collector_factories =
+ override_options.table_properties_collector_factories;
+ compaction_input.db_options.listeners = override_options.listeners;
+
+ std::vector<ColumnFamilyDescriptor> column_families;
+ column_families.push_back(compaction_input.column_family);
+  // TODO: we have to open the default CF because of an implementation
+  // limitation; currently we just reuse the CF options from the input, which
+  // are not correct for the default CF, so the open may fail.
+ if (compaction_input.column_family.name != kDefaultColumnFamilyName) {
+ column_families.emplace_back(kDefaultColumnFamilyName,
+ compaction_input.column_family.options);
+ }
+
+ DB* db;
+ std::vector<ColumnFamilyHandle*> handles;
+
+ s = DB::OpenAsSecondary(compaction_input.db_options, name, output_directory,
+ column_families, &handles, &db);
+ if (!s.ok()) {
+ return s;
+ }
+
+ CompactionServiceResult compaction_result;
+ DBImplSecondary* db_secondary = static_cast_with_check<DBImplSecondary>(db);
+ assert(handles.size() > 0);
+ s = db_secondary->CompactWithoutInstallation(
+ options, handles[0], compaction_input, &compaction_result);
+
+ Status serialization_status = compaction_result.Write(output);
+
+ for (auto& handle : handles) {
+ delete handle;
+ }
+ delete db;
+ if (s.ok()) {
+ return serialization_status;
+ }
+ return s;
+}
+
+Status DB::OpenAndCompact(
+ const std::string& name, const std::string& output_directory,
+ const std::string& input, std::string* output,
+ const CompactionServiceOptionsOverride& override_options) {
+ return OpenAndCompact(OpenAndCompactOptions(), name, output_directory, input,
+ output, override_options);
+}
+
+#else // !ROCKSDB_LITE
+
+Status DB::OpenAsSecondary(const Options& /*options*/,
+ const std::string& /*name*/,
+ const std::string& /*secondary_path*/,
+ DB** /*dbptr*/) {
+ return Status::NotSupported("Not supported in ROCKSDB_LITE.");
+}
+
+Status DB::OpenAsSecondary(
+ const DBOptions& /*db_options*/, const std::string& /*dbname*/,
+ const std::string& /*secondary_path*/,
+ const std::vector<ColumnFamilyDescriptor>& /*column_families*/,
+ std::vector<ColumnFamilyHandle*>* /*handles*/, DB** /*dbptr*/) {
+ return Status::NotSupported("Not supported in ROCKSDB_LITE.");
+}
+#endif // !ROCKSDB_LITE
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/db_impl/db_impl_secondary.h b/src/rocksdb/db/db_impl/db_impl_secondary.h
new file mode 100644
index 000000000..eb9361875
--- /dev/null
+++ b/src/rocksdb/db/db_impl/db_impl_secondary.h
@@ -0,0 +1,410 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <string>
+#include <vector>
+
+#include "db/db_impl/db_impl.h"
+#include "logging/logging.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// A wrapper class to hold the log reader, log reporter and log status.
+class LogReaderContainer {
+ public:
+ LogReaderContainer()
+ : reader_(nullptr), reporter_(nullptr), status_(nullptr) {}
+ LogReaderContainer(Env* env, std::shared_ptr<Logger> info_log,
+ std::string fname,
+ std::unique_ptr<SequentialFileReader>&& file_reader,
+ uint64_t log_number) {
+ LogReporter* reporter = new LogReporter();
+ status_ = new Status();
+ reporter->env = env;
+ reporter->info_log = info_log.get();
+ reporter->fname = std::move(fname);
+ reporter->status = status_;
+ reporter_ = reporter;
+    // We intentionally make log::Reader do checksumming even if
+ // paranoid_checks==false so that corruptions cause entire commits
+ // to be skipped instead of propagating bad information (like overly
+ // large sequence numbers).
+ reader_ = new log::FragmentBufferedReader(info_log, std::move(file_reader),
+ reporter, true /*checksum*/,
+ log_number);
+ }
+ log::FragmentBufferedReader* reader_;
+ log::Reader::Reporter* reporter_;
+ Status* status_;
+ ~LogReaderContainer() {
+ delete reader_;
+ delete reporter_;
+ delete status_;
+ }
+
+ private:
+ struct LogReporter : public log::Reader::Reporter {
+ Env* env;
+ Logger* info_log;
+ std::string fname;
+ Status* status; // nullptr if immutable_db_options_.paranoid_checks==false
+ void Corruption(size_t bytes, const Status& s) override {
+ ROCKS_LOG_WARN(info_log, "%s%s: dropping %d bytes; %s",
+ (this->status == nullptr ? "(ignoring error) " : ""),
+ fname.c_str(), static_cast<int>(bytes),
+ s.ToString().c_str());
+ if (this->status != nullptr && this->status->ok()) {
+ *this->status = s;
+ }
+ }
+ };
+};
+
+// The secondary instance shares access to the same storage as the primary.
+// The secondary is able to read and replay changes described in both the
+// MANIFEST and the WAL files without coordination with the primary.
+// The secondary instance can be opened using `DB::OpenAsSecondary`. After
+// that, it can call `DBImplSecondary::TryCatchUpWithPrimary` to make
+// best-effort attempts to catch up with the primary.
+// TODO: Share common structure with CompactedDBImpl and DBImplReadOnly
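+//
+// A minimal, illustrative usage sketch (hypothetical paths, error handling
+// elided; not part of the API surface of this file):
+//
+//   DB* db = nullptr;
+//   Options options;
+//   options.max_open_files = -1;  // optional workaround, see Get()/NewIterator()
+//   Status s = DB::OpenAsSecondary(options, "/path/to/primary_db",
+//                                  "/path/to/secondary_path", &db);
+//   if (s.ok()) {
+//     s = db->TryCatchUpWithPrimary();  // replay newly written MANIFEST/WAL
+//   }
+//   // ... serve reads via db->Get(...) / db->NewIterator(...) ...
+//   delete db;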
+class DBImplSecondary : public DBImpl {
+ public:
+ DBImplSecondary(const DBOptions& options, const std::string& dbname,
+ std::string secondary_path);
+ ~DBImplSecondary() override;
+
+ // Recover by replaying MANIFEST and WAL. Also initialize manifest_reader_
+ // and log_readers_ to facilitate future operations.
+ Status Recover(const std::vector<ColumnFamilyDescriptor>& column_families,
+ bool read_only, bool error_if_wal_file_exists,
+ bool error_if_data_exists_in_wals, uint64_t* = nullptr,
+ RecoveryContext* recovery_ctx = nullptr) override;
+
+ // Implementations of the DB interface.
+ using DB::Get;
+  // Can return IOError due to files being deleted by the primary. To avoid
+  // IOError in this case, the application can coordinate between the primary
+  // and the secondaries so that the primary will not delete files that are
+  // currently being used by the secondaries. The application can also provide
+  // a custom FS/Env implementation so that files will remain present until
+  // all primary and secondaries indicate that they can be deleted. As a
+  // partial hacky workaround, the secondaries can be opened with
+  // `max_open_files=-1` so that they eagerly keep all table files open and
+  // are able to access the contents of deleted files via previously open fds.
+ Status Get(const ReadOptions& options, ColumnFamilyHandle* column_family,
+ const Slice& key, PinnableSlice* value) override;
+
+ Status Get(const ReadOptions& options, ColumnFamilyHandle* column_family,
+ const Slice& key, PinnableSlice* value,
+ std::string* timestamp) override;
+
+ Status GetImpl(const ReadOptions& options, ColumnFamilyHandle* column_family,
+ const Slice& key, PinnableSlice* value,
+ std::string* timestamp);
+
+ using DBImpl::NewIterator;
+  // Operations on the created iterators can return IOError due to files being
+  // deleted by the primary. To avoid IOError in this case, the application
+  // can coordinate between the primary and the secondaries so that the
+  // primary will not delete files that are currently being used by the
+  // secondaries. The application can also provide a custom FS/Env
+  // implementation so that files will remain present until all primary and
+  // secondaries indicate that they can be deleted. As a partial hacky
+  // workaround, the secondaries can be opened with `max_open_files=-1` so
+  // that they eagerly keep all table files open and are able to access the
+  // contents of deleted files via previously open fds.
+ Iterator* NewIterator(const ReadOptions&,
+ ColumnFamilyHandle* column_family) override;
+
+ ArenaWrappedDBIter* NewIteratorImpl(const ReadOptions& read_options,
+ ColumnFamilyData* cfd,
+ SequenceNumber snapshot,
+ ReadCallback* read_callback,
+ bool expose_blob_index = false,
+ bool allow_refresh = true);
+
+ Status NewIterators(const ReadOptions& options,
+ const std::vector<ColumnFamilyHandle*>& column_families,
+ std::vector<Iterator*>* iterators) override;
+
+ using DBImpl::Put;
+ Status Put(const WriteOptions& /*options*/,
+ ColumnFamilyHandle* /*column_family*/, const Slice& /*key*/,
+ const Slice& /*value*/) override {
+ return Status::NotSupported("Not supported operation in secondary mode.");
+ }
+
+ using DBImpl::PutEntity;
+ Status PutEntity(const WriteOptions& /* options */,
+ ColumnFamilyHandle* /* column_family */,
+ const Slice& /* key */,
+ const WideColumns& /* columns */) override {
+ return Status::NotSupported("Not supported operation in secondary mode.");
+ }
+
+ using DBImpl::Merge;
+ Status Merge(const WriteOptions& /*options*/,
+ ColumnFamilyHandle* /*column_family*/, const Slice& /*key*/,
+ const Slice& /*value*/) override {
+ return Status::NotSupported("Not supported operation in secondary mode.");
+ }
+
+ using DBImpl::Delete;
+ Status Delete(const WriteOptions& /*options*/,
+ ColumnFamilyHandle* /*column_family*/,
+ const Slice& /*key*/) override {
+ return Status::NotSupported("Not supported operation in secondary mode.");
+ }
+
+ using DBImpl::SingleDelete;
+ Status SingleDelete(const WriteOptions& /*options*/,
+ ColumnFamilyHandle* /*column_family*/,
+ const Slice& /*key*/) override {
+ return Status::NotSupported("Not supported operation in secondary mode.");
+ }
+
+ Status Write(const WriteOptions& /*options*/,
+ WriteBatch* /*updates*/) override {
+ return Status::NotSupported("Not supported operation in secondary mode.");
+ }
+
+ using DBImpl::CompactRange;
+ Status CompactRange(const CompactRangeOptions& /*options*/,
+ ColumnFamilyHandle* /*column_family*/,
+ const Slice* /*begin*/, const Slice* /*end*/) override {
+ return Status::NotSupported("Not supported operation in secondary mode.");
+ }
+
+ using DBImpl::CompactFiles;
+ Status CompactFiles(
+ const CompactionOptions& /*compact_options*/,
+ ColumnFamilyHandle* /*column_family*/,
+ const std::vector<std::string>& /*input_file_names*/,
+ const int /*output_level*/, const int /*output_path_id*/ = -1,
+ std::vector<std::string>* const /*output_file_names*/ = nullptr,
+ CompactionJobInfo* /*compaction_job_info*/ = nullptr) override {
+ return Status::NotSupported("Not supported operation in secondary mode.");
+ }
+
+ Status DisableFileDeletions() override {
+ return Status::NotSupported("Not supported operation in secondary mode.");
+ }
+
+ Status EnableFileDeletions(bool /*force*/) override {
+ return Status::NotSupported("Not supported operation in secondary mode.");
+ }
+
+ Status GetLiveFiles(std::vector<std::string>&,
+ uint64_t* /*manifest_file_size*/,
+ bool /*flush_memtable*/ = true) override {
+ return Status::NotSupported("Not supported operation in secondary mode.");
+ }
+
+ using DBImpl::Flush;
+ Status Flush(const FlushOptions& /*options*/,
+ ColumnFamilyHandle* /*column_family*/) override {
+ return Status::NotSupported("Not supported operation in secondary mode.");
+ }
+
+ using DBImpl::SetDBOptions;
+ Status SetDBOptions(const std::unordered_map<std::string, std::string>&
+ /*options_map*/) override {
+ // Currently not supported because changing certain options may cause
+ // flush/compaction.
+ return Status::NotSupported("Not supported operation in secondary mode.");
+ }
+
+ using DBImpl::SetOptions;
+ Status SetOptions(
+ ColumnFamilyHandle* /*cfd*/,
+ const std::unordered_map<std::string, std::string>& /*options_map*/)
+ override {
+ // Currently not supported because changing certain options may cause
+ // flush/compaction and/or write to MANIFEST.
+ return Status::NotSupported("Not supported operation in secondary mode.");
+ }
+
+ using DBImpl::SyncWAL;
+ Status SyncWAL() override {
+ return Status::NotSupported("Not supported operation in secondary mode.");
+ }
+
+ using DB::IngestExternalFile;
+ Status IngestExternalFile(
+ ColumnFamilyHandle* /*column_family*/,
+ const std::vector<std::string>& /*external_files*/,
+ const IngestExternalFileOptions& /*ingestion_options*/) override {
+ return Status::NotSupported("Not supported operation in secondary mode.");
+ }
+
+  // Try to catch up with the primary by reading as much as possible from the
+  // log files until there is nothing more to read or an error is encountered.
+  // If the amount of information in the log files to process is huge, this
+  // method can take a long time due to all the I/O and CPU costs.
+ Status TryCatchUpWithPrimary() override;
+
+  // Try to find the log reader for log_number in the log_readers_ map;
+  // initialize one if it doesn't exist.
+ Status MaybeInitLogReader(uint64_t log_number,
+ log::FragmentBufferedReader** log_reader);
+
+  // Check that all live files exist on the file system and that their file
+  // sizes match the in-memory records. It is possible that some live files
+  // may have been deleted by the primary. In this case, CheckConsistency()
+  // does not flag the missing file as an inconsistency.
+ Status CheckConsistency() override;
+
+#ifndef NDEBUG
+ Status TEST_CompactWithoutInstallation(const OpenAndCompactOptions& options,
+ ColumnFamilyHandle* cfh,
+ const CompactionServiceInput& input,
+ CompactionServiceResult* result) {
+ return CompactWithoutInstallation(options, cfh, input, result);
+ }
+#endif // NDEBUG
+
+ protected:
+#ifndef ROCKSDB_LITE
+ Status FlushForGetLiveFiles() override {
+ // No-op for read-only DB
+ return Status::OK();
+ }
+#endif // !ROCKSDB_LITE
+
+ // ColumnFamilyCollector is a write batch handler which does nothing
+ // except recording unique column family IDs
+ class ColumnFamilyCollector : public WriteBatch::Handler {
+ std::unordered_set<uint32_t> column_family_ids_;
+
+ Status AddColumnFamilyId(uint32_t column_family_id) {
+ if (column_family_ids_.find(column_family_id) ==
+ column_family_ids_.end()) {
+ column_family_ids_.insert(column_family_id);
+ }
+ return Status::OK();
+ }
+
+ public:
+ explicit ColumnFamilyCollector() {}
+
+ ~ColumnFamilyCollector() override {}
+
+ Status PutCF(uint32_t column_family_id, const Slice&,
+ const Slice&) override {
+ return AddColumnFamilyId(column_family_id);
+ }
+
+ Status DeleteCF(uint32_t column_family_id, const Slice&) override {
+ return AddColumnFamilyId(column_family_id);
+ }
+
+ Status SingleDeleteCF(uint32_t column_family_id, const Slice&) override {
+ return AddColumnFamilyId(column_family_id);
+ }
+
+ Status DeleteRangeCF(uint32_t column_family_id, const Slice&,
+ const Slice&) override {
+ return AddColumnFamilyId(column_family_id);
+ }
+
+ Status MergeCF(uint32_t column_family_id, const Slice&,
+ const Slice&) override {
+ return AddColumnFamilyId(column_family_id);
+ }
+
+ Status PutBlobIndexCF(uint32_t column_family_id, const Slice&,
+ const Slice&) override {
+ return AddColumnFamilyId(column_family_id);
+ }
+
+ Status MarkBeginPrepare(bool) override { return Status::OK(); }
+
+ Status MarkEndPrepare(const Slice&) override { return Status::OK(); }
+
+ Status MarkRollback(const Slice&) override { return Status::OK(); }
+
+ Status MarkCommit(const Slice&) override { return Status::OK(); }
+
+ Status MarkCommitWithTimestamp(const Slice&, const Slice&) override {
+ return Status::OK();
+ }
+
+ Status MarkNoop(bool) override { return Status::OK(); }
+
+ const std::unordered_set<uint32_t>& column_families() const {
+ return column_family_ids_;
+ }
+ };
+
+ Status CollectColumnFamilyIdsFromWriteBatch(
+ const WriteBatch& batch, std::vector<uint32_t>* column_family_ids) {
+ assert(column_family_ids != nullptr);
+ column_family_ids->clear();
+ ColumnFamilyCollector handler;
+ Status s = batch.Iterate(&handler);
+ if (s.ok()) {
+ for (const auto& cf : handler.column_families()) {
+ column_family_ids->push_back(cf);
+ }
+ }
+ return s;
+ }
+
+ bool OwnTablesAndLogs() const override {
+ // Currently, the secondary instance does not own the database files. It
+ // simply opens the files of the primary instance and tracks their file
+ // descriptors until they become obsolete. In the future, the secondary may
+ // create links to database files. OwnTablesAndLogs will return true then.
+ return false;
+ }
+
+ private:
+ friend class DB;
+
+ // No copying allowed
+ DBImplSecondary(const DBImplSecondary&);
+ void operator=(const DBImplSecondary&);
+
+ using DBImpl::Recover;
+
+ Status FindAndRecoverLogFiles(
+ std::unordered_set<ColumnFamilyData*>* cfds_changed,
+ JobContext* job_context);
+ Status FindNewLogNumbers(std::vector<uint64_t>* logs);
+ // After manifest recovery, replay WALs and refresh log_readers_ if necessary
+ // REQUIRES: log_numbers are sorted in ascending order
+ Status RecoverLogFiles(const std::vector<uint64_t>& log_numbers,
+ SequenceNumber* next_sequence,
+ std::unordered_set<ColumnFamilyData*>* cfds_changed,
+ JobContext* job_context);
+
+  // Run a compaction without installation; the output files will be placed in
+  // the secondary DB path. The LSM tree won't be changed and the secondary DB
+  // stays in read-only mode.
+ Status CompactWithoutInstallation(const OpenAndCompactOptions& options,
+ ColumnFamilyHandle* cfh,
+ const CompactionServiceInput& input,
+ CompactionServiceResult* result);
+
+ std::unique_ptr<log::FragmentBufferedReader> manifest_reader_;
+ std::unique_ptr<log::Reader::Reporter> manifest_reporter_;
+ std::unique_ptr<Status> manifest_reader_status_;
+
+  // Cache log readers for each log number, used to continue WAL replay
+  // after recovery
+ std::map<uint64_t, std::unique_ptr<LogReaderContainer>> log_readers_;
+
+ // Current WAL number replayed for each column family.
+ std::unordered_map<ColumnFamilyData*, uint64_t> cfd_to_current_log_;
+
+ const std::string secondary_path_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/db_impl/db_impl_write.cc b/src/rocksdb/db/db_impl/db_impl_write.cc
new file mode 100644
index 000000000..a597c168d
--- /dev/null
+++ b/src/rocksdb/db/db_impl/db_impl_write.cc
@@ -0,0 +1,2435 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#include <cinttypes>
+
+#include "db/db_impl/db_impl.h"
+#include "db/error_handler.h"
+#include "db/event_helpers.h"
+#include "logging/logging.h"
+#include "monitoring/perf_context_imp.h"
+#include "options/options_helper.h"
+#include "test_util/sync_point.h"
+#include "util/cast_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+// Convenience methods
+Status DBImpl::Put(const WriteOptions& o, ColumnFamilyHandle* column_family,
+ const Slice& key, const Slice& val) {
+ const Status s = FailIfCfHasTs(column_family);
+ if (!s.ok()) {
+ return s;
+ }
+ return DB::Put(o, column_family, key, val);
+}
+
+Status DBImpl::Put(const WriteOptions& o, ColumnFamilyHandle* column_family,
+ const Slice& key, const Slice& ts, const Slice& val) {
+ const Status s = FailIfTsMismatchCf(column_family, ts, /*ts_for_read=*/false);
+ if (!s.ok()) {
+ return s;
+ }
+ return DB::Put(o, column_family, key, ts, val);
+}
+
+Status DBImpl::PutEntity(const WriteOptions& options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ const WideColumns& columns) {
+ const Status s = FailIfCfHasTs(column_family);
+ if (!s.ok()) {
+ return s;
+ }
+
+ return DB::PutEntity(options, column_family, key, columns);
+}
+
+Status DBImpl::Merge(const WriteOptions& o, ColumnFamilyHandle* column_family,
+ const Slice& key, const Slice& val) {
+ const Status s = FailIfCfHasTs(column_family);
+ if (!s.ok()) {
+ return s;
+ }
+ auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+ if (!cfh->cfd()->ioptions()->merge_operator) {
+ return Status::NotSupported("Provide a merge_operator when opening DB");
+ } else {
+ return DB::Merge(o, column_family, key, val);
+ }
+}
+
+Status DBImpl::Merge(const WriteOptions& o, ColumnFamilyHandle* column_family,
+ const Slice& key, const Slice& ts, const Slice& val) {
+ const Status s = FailIfTsMismatchCf(column_family, ts, /*ts_for_read=*/false);
+ if (!s.ok()) {
+ return s;
+ }
+ return DB::Merge(o, column_family, key, ts, val);
+}
+
+Status DBImpl::Delete(const WriteOptions& write_options,
+ ColumnFamilyHandle* column_family, const Slice& key) {
+ const Status s = FailIfCfHasTs(column_family);
+ if (!s.ok()) {
+ return s;
+ }
+ return DB::Delete(write_options, column_family, key);
+}
+
+Status DBImpl::Delete(const WriteOptions& write_options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ const Slice& ts) {
+ const Status s = FailIfTsMismatchCf(column_family, ts, /*ts_for_read=*/false);
+ if (!s.ok()) {
+ return s;
+ }
+ return DB::Delete(write_options, column_family, key, ts);
+}
+
+Status DBImpl::SingleDelete(const WriteOptions& write_options,
+ ColumnFamilyHandle* column_family,
+ const Slice& key) {
+ const Status s = FailIfCfHasTs(column_family);
+ if (!s.ok()) {
+ return s;
+ }
+ return DB::SingleDelete(write_options, column_family, key);
+}
+
+Status DBImpl::SingleDelete(const WriteOptions& write_options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ const Slice& ts) {
+ const Status s = FailIfTsMismatchCf(column_family, ts, /*ts_for_read=*/false);
+ if (!s.ok()) {
+ return s;
+ }
+ return DB::SingleDelete(write_options, column_family, key, ts);
+}
+
+Status DBImpl::DeleteRange(const WriteOptions& write_options,
+ ColumnFamilyHandle* column_family,
+ const Slice& begin_key, const Slice& end_key) {
+ const Status s = FailIfCfHasTs(column_family);
+ if (!s.ok()) {
+ return s;
+ }
+ return DB::DeleteRange(write_options, column_family, begin_key, end_key);
+}
+
+Status DBImpl::DeleteRange(const WriteOptions& write_options,
+ ColumnFamilyHandle* column_family,
+ const Slice& begin_key, const Slice& end_key,
+ const Slice& ts) {
+ const Status s = FailIfTsMismatchCf(column_family, ts, /*ts_for_read=*/false);
+ if (!s.ok()) {
+ return s;
+ }
+ return DB::DeleteRange(write_options, column_family, begin_key, end_key, ts);
+}
+
+void DBImpl::SetRecoverableStatePreReleaseCallback(
+ PreReleaseCallback* callback) {
+ recoverable_state_pre_release_callback_.reset(callback);
+}
+
+Status DBImpl::Write(const WriteOptions& write_options, WriteBatch* my_batch) {
+ Status s;
+ if (write_options.protection_bytes_per_key > 0) {
+ s = WriteBatchInternal::UpdateProtectionInfo(
+ my_batch, write_options.protection_bytes_per_key);
+ }
+ if (s.ok()) {
+ s = WriteImpl(write_options, my_batch, /*callback=*/nullptr,
+ /*log_used=*/nullptr);
+ }
+ return s;
+}
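+
+// Illustrative only (assumes an already-open DB* `db`; not part of this
+// file): a caller-side sketch of using WriteOptions::protection_bytes_per_key
+// with the write entry point above.
+//
+//   WriteBatch batch;
+//   batch.Put("key", "value");
+//   WriteOptions wo;
+//   wo.protection_bytes_per_key = 8;  // must be 0 or 8 (see WriteImpl below)
+//   Status s = db->Write(wo, &batch);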
+
+#ifndef ROCKSDB_LITE
+Status DBImpl::WriteWithCallback(const WriteOptions& write_options,
+ WriteBatch* my_batch,
+ WriteCallback* callback) {
+ Status s;
+ if (write_options.protection_bytes_per_key > 0) {
+ s = WriteBatchInternal::UpdateProtectionInfo(
+ my_batch, write_options.protection_bytes_per_key);
+ }
+ if (s.ok()) {
+ s = WriteImpl(write_options, my_batch, callback, nullptr);
+ }
+ return s;
+}
+#endif // ROCKSDB_LITE
+
+// The main write queue. This is the only write queue that updates LastSequence.
+// When using one write queue, the same sequence also indicates the last
+// published sequence.
+Status DBImpl::WriteImpl(const WriteOptions& write_options,
+ WriteBatch* my_batch, WriteCallback* callback,
+ uint64_t* log_used, uint64_t log_ref,
+ bool disable_memtable, uint64_t* seq_used,
+ size_t batch_cnt,
+ PreReleaseCallback* pre_release_callback,
+ PostMemTableCallback* post_memtable_callback) {
+ assert(!seq_per_batch_ || batch_cnt != 0);
+ assert(my_batch == nullptr || my_batch->Count() == 0 ||
+ write_options.protection_bytes_per_key == 0 ||
+ write_options.protection_bytes_per_key ==
+ my_batch->GetProtectionBytesPerKey());
+ if (my_batch == nullptr) {
+ return Status::InvalidArgument("Batch is nullptr!");
+ } else if (!disable_memtable &&
+ WriteBatchInternal::TimestampsUpdateNeeded(*my_batch)) {
+ // If writing to memtable, then we require the caller to set/update the
+ // timestamps for the keys in the write batch.
+ // Otherwise, it means we are just writing to the WAL, and we allow
+ // timestamps unset for the keys in the write batch. This can happen if we
+ // use TransactionDB with write-committed policy, and we currently do not
+ // support user-defined timestamp with other policies.
+ // In the prepare phase, a transaction can write the batch to the WAL
+ // without inserting to memtable. The keys in the batch do not have to be
+ // assigned timestamps because they will be used only during recovery if
+ // there is a commit marker which includes their commit timestamp.
+ return Status::InvalidArgument("write batch must have timestamp(s) set");
+ } else if (write_options.rate_limiter_priority != Env::IO_TOTAL &&
+ write_options.rate_limiter_priority != Env::IO_USER) {
+ return Status::InvalidArgument(
+ "WriteOptions::rate_limiter_priority only allows "
+ "Env::IO_TOTAL and Env::IO_USER due to implementation constraints");
+ } else if (write_options.rate_limiter_priority != Env::IO_TOTAL &&
+ (write_options.disableWAL || manual_wal_flush_)) {
+ return Status::InvalidArgument(
+ "WriteOptions::rate_limiter_priority currently only supports "
+ "rate-limiting automatic WAL flush, which requires "
+ "`WriteOptions::disableWAL` and "
+ "`DBOptions::manual_wal_flush` both set to false");
+ } else if (write_options.protection_bytes_per_key != 0 &&
+ write_options.protection_bytes_per_key != 8) {
+ return Status::InvalidArgument(
+ "`WriteOptions::protection_bytes_per_key` must be zero or eight");
+ }
+ // TODO: this use of operator bool on `tracer_` can avoid unnecessary lock
+ // grabs but does not seem thread-safe.
+ if (tracer_) {
+ InstrumentedMutexLock lock(&trace_mutex_);
+ if (tracer_ && !tracer_->IsWriteOrderPreserved()) {
+ // We don't have to preserve write order so can trace anywhere. It's more
+ // efficient to trace here than to add latency to a phase of the log/apply
+ // pipeline.
+ // TODO: maybe handle the tracing status?
+ tracer_->Write(my_batch).PermitUncheckedError();
+ }
+ }
+ if (write_options.sync && write_options.disableWAL) {
+ return Status::InvalidArgument("Sync writes has to enable WAL.");
+ }
+ if (two_write_queues_ && immutable_db_options_.enable_pipelined_write) {
+ return Status::NotSupported(
+ "pipelined_writes is not compatible with concurrent prepares");
+ }
+ if (seq_per_batch_ && immutable_db_options_.enable_pipelined_write) {
+ // TODO(yiwu): update pipeline write with seq_per_batch and batch_cnt
+ return Status::NotSupported(
+ "pipelined_writes is not compatible with seq_per_batch");
+ }
+ if (immutable_db_options_.unordered_write &&
+ immutable_db_options_.enable_pipelined_write) {
+ return Status::NotSupported(
+ "pipelined_writes is not compatible with unordered_write");
+ }
+ if (immutable_db_options_.enable_pipelined_write &&
+ post_memtable_callback != nullptr) {
+ return Status::NotSupported(
+ "pipelined write currently does not honor post_memtable_callback");
+ }
+ if (seq_per_batch_ && post_memtable_callback != nullptr) {
+ return Status::NotSupported(
+ "seq_per_batch currently does not honor post_memtable_callback");
+ }
+ // Otherwise IsLatestPersistentState optimization does not make sense
+ assert(!WriteBatchInternal::IsLatestPersistentState(my_batch) ||
+ disable_memtable);
+
+ if (write_options.low_pri) {
+ Status s = ThrottleLowPriWritesIfNeeded(write_options, my_batch);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ if (two_write_queues_ && disable_memtable) {
+ AssignOrder assign_order =
+ seq_per_batch_ ? kDoAssignOrder : kDontAssignOrder;
+    // Otherwise these are WAL-only Prepare batches under the WriteCommitted
+    // policy and they don't consume a sequence number.
+ return WriteImplWALOnly(&nonmem_write_thread_, write_options, my_batch,
+ callback, log_used, log_ref, seq_used, batch_cnt,
+ pre_release_callback, assign_order,
+ kDontPublishLastSeq, disable_memtable);
+ }
+
+ if (immutable_db_options_.unordered_write) {
+ const size_t sub_batch_cnt = batch_cnt != 0
+ ? batch_cnt
+ // every key is a sub-batch consuming a seq
+ : WriteBatchInternal::Count(my_batch);
+ uint64_t seq = 0;
+    // Use a write thread to i) optimize for WAL write, ii) publish last
+    // sequence in increasing order, iii) call pre_release_callback serially
+ Status status = WriteImplWALOnly(
+ &write_thread_, write_options, my_batch, callback, log_used, log_ref,
+ &seq, sub_batch_cnt, pre_release_callback, kDoAssignOrder,
+ kDoPublishLastSeq, disable_memtable);
+ TEST_SYNC_POINT("DBImpl::WriteImpl:UnorderedWriteAfterWriteWAL");
+ if (!status.ok()) {
+ return status;
+ }
+ if (seq_used) {
+ *seq_used = seq;
+ }
+ if (!disable_memtable) {
+ TEST_SYNC_POINT("DBImpl::WriteImpl:BeforeUnorderedWriteMemtable");
+ status = UnorderedWriteMemtable(write_options, my_batch, callback,
+ log_ref, seq, sub_batch_cnt);
+ }
+ return status;
+ }
+
+ if (immutable_db_options_.enable_pipelined_write) {
+ return PipelinedWriteImpl(write_options, my_batch, callback, log_used,
+ log_ref, disable_memtable, seq_used);
+ }
+
+ PERF_TIMER_GUARD(write_pre_and_post_process_time);
+ WriteThread::Writer w(write_options, my_batch, callback, log_ref,
+ disable_memtable, batch_cnt, pre_release_callback,
+ post_memtable_callback);
+ StopWatch write_sw(immutable_db_options_.clock, stats_, DB_WRITE);
+
+ write_thread_.JoinBatchGroup(&w);
+ if (w.state == WriteThread::STATE_PARALLEL_MEMTABLE_WRITER) {
+ // we are a non-leader in a parallel group
+
+ if (w.ShouldWriteToMemtable()) {
+ PERF_TIMER_STOP(write_pre_and_post_process_time);
+ PERF_TIMER_GUARD(write_memtable_time);
+
+ ColumnFamilyMemTablesImpl column_family_memtables(
+ versions_->GetColumnFamilySet());
+ w.status = WriteBatchInternal::InsertInto(
+ &w, w.sequence, &column_family_memtables, &flush_scheduler_,
+ &trim_history_scheduler_,
+ write_options.ignore_missing_column_families, 0 /*log_number*/, this,
+ true /*concurrent_memtable_writes*/, seq_per_batch_, w.batch_cnt,
+ batch_per_txn_, write_options.memtable_insert_hint_per_batch);
+
+ PERF_TIMER_START(write_pre_and_post_process_time);
+ }
+
+ if (write_thread_.CompleteParallelMemTableWriter(&w)) {
+ // we're responsible for exit batch group
+ // TODO(myabandeh): propagate status to write_group
+ auto last_sequence = w.write_group->last_sequence;
+ for (auto* tmp_w : *(w.write_group)) {
+ assert(tmp_w);
+ if (tmp_w->post_memtable_callback) {
+ Status tmp_s =
+ (*tmp_w->post_memtable_callback)(last_sequence, disable_memtable);
+ // TODO: propagate the execution status of post_memtable_callback to
+ // caller.
+ assert(tmp_s.ok());
+ }
+ }
+ versions_->SetLastSequence(last_sequence);
+ MemTableInsertStatusCheck(w.status);
+ write_thread_.ExitAsBatchGroupFollower(&w);
+ }
+ assert(w.state == WriteThread::STATE_COMPLETED);
+ // STATE_COMPLETED conditional below handles exit
+ }
+ if (w.state == WriteThread::STATE_COMPLETED) {
+ if (log_used != nullptr) {
+ *log_used = w.log_used;
+ }
+ if (seq_used != nullptr) {
+ *seq_used = w.sequence;
+ }
+ // write is complete and leader has updated sequence
+ return w.FinalStatus();
+ }
+ // else we are the leader of the write batch group
+ assert(w.state == WriteThread::STATE_GROUP_LEADER);
+ Status status;
+  // Once it reaches this point, the current writer "w" will try to do its
+  // write job. It may also pick up some of the remaining writers in the
+  // "writers_" queue when it finds it suitable, and finish them in the same
+  // write batch. This is how a write job could be done by another writer.
+ WriteContext write_context;
+ LogContext log_context(write_options.sync);
+ WriteThread::WriteGroup write_group;
+ bool in_parallel_group = false;
+ uint64_t last_sequence = kMaxSequenceNumber;
+
+ assert(!two_write_queues_ || !disable_memtable);
+ {
+    // With concurrent writes we preprocess only in the write thread that also
+    // writes to the memtable, to avoid synchronization issues on shared data
+    // structures with the other thread
+
+ // PreprocessWrite does its own perf timing.
+ PERF_TIMER_STOP(write_pre_and_post_process_time);
+
+ status = PreprocessWrite(write_options, &log_context, &write_context);
+ if (!two_write_queues_) {
+ // Assign it after ::PreprocessWrite since the sequence might advance
+ // inside it by WriteRecoverableState
+ last_sequence = versions_->LastSequence();
+ }
+
+ PERF_TIMER_START(write_pre_and_post_process_time);
+ }
+
+ // Add to log and apply to memtable. We can release the lock
+ // during this phase since &w is currently responsible for logging
+ // and protects against concurrent loggers and concurrent writes
+ // into memtables
+
+ TEST_SYNC_POINT("DBImpl::WriteImpl:BeforeLeaderEnters");
+ last_batch_group_size_ =
+ write_thread_.EnterAsBatchGroupLeader(&w, &write_group);
+
+ IOStatus io_s;
+ Status pre_release_cb_status;
+ if (status.ok()) {
+ // TODO: this use of operator bool on `tracer_` can avoid unnecessary lock
+ // grabs but does not seem thread-safe.
+ if (tracer_) {
+ InstrumentedMutexLock lock(&trace_mutex_);
+ if (tracer_ && tracer_->IsWriteOrderPreserved()) {
+ for (auto* writer : write_group) {
+ // TODO: maybe handle the tracing status?
+ tracer_->Write(writer->batch).PermitUncheckedError();
+ }
+ }
+ }
+ // Rules for when we can update the memtable concurrently
+ // 1. supported by memtable
+ // 2. Puts are not okay if inplace_update_support
+ // 3. Merges are not okay
+ //
+ // Rules 1..2 are enforced by checking the options
+ // during startup (CheckConcurrentWritesSupported), so if
+ // options.allow_concurrent_memtable_write is true then they can be
+ // assumed to be true. Rule 3 is checked for each batch. We could
+ // relax rules 2 if we could prevent write batches from referring
+ // more than once to a particular key.
+ bool parallel = immutable_db_options_.allow_concurrent_memtable_write &&
+ write_group.size > 1;
+ size_t total_count = 0;
+ size_t valid_batches = 0;
+ size_t total_byte_size = 0;
+ size_t pre_release_callback_cnt = 0;
+ for (auto* writer : write_group) {
+ assert(writer);
+ if (writer->CheckCallback(this)) {
+ valid_batches += writer->batch_cnt;
+ if (writer->ShouldWriteToMemtable()) {
+ total_count += WriteBatchInternal::Count(writer->batch);
+ parallel = parallel && !writer->batch->HasMerge();
+ }
+ total_byte_size = WriteBatchInternal::AppendedByteSize(
+ total_byte_size, WriteBatchInternal::ByteSize(writer->batch));
+ if (writer->pre_release_callback) {
+ pre_release_callback_cnt++;
+ }
+ }
+ }
+ // Note about seq_per_batch_: either disableWAL is set for the entire write
+ // group or not. In either case we inc seq for each write batch with no
+ // failed callback. This means that there could be a batch with
+    // disable_memtable in between; although we do not write this batch to
+ // memtable it still consumes a seq. Otherwise, if !seq_per_batch_, we inc
+ // the seq per valid written key to mem.
+ size_t seq_inc = seq_per_batch_ ? valid_batches : total_count;
+
+ const bool concurrent_update = two_write_queues_;
+ // Update stats while we are an exclusive group leader, so we know
+ // that nobody else can be writing to these particular stats.
+ // We're optimistic, updating the stats before we successfully
+ // commit. That lets us release our leader status early.
+ auto stats = default_cf_internal_stats_;
+ stats->AddDBStats(InternalStats::kIntStatsNumKeysWritten, total_count,
+ concurrent_update);
+ RecordTick(stats_, NUMBER_KEYS_WRITTEN, total_count);
+ stats->AddDBStats(InternalStats::kIntStatsBytesWritten, total_byte_size,
+ concurrent_update);
+ RecordTick(stats_, BYTES_WRITTEN, total_byte_size);
+ stats->AddDBStats(InternalStats::kIntStatsWriteDoneBySelf, 1,
+ concurrent_update);
+ RecordTick(stats_, WRITE_DONE_BY_SELF);
+ auto write_done_by_other = write_group.size - 1;
+ if (write_done_by_other > 0) {
+ stats->AddDBStats(InternalStats::kIntStatsWriteDoneByOther,
+ write_done_by_other, concurrent_update);
+ RecordTick(stats_, WRITE_DONE_BY_OTHER, write_done_by_other);
+ }
+ RecordInHistogram(stats_, BYTES_PER_WRITE, total_byte_size);
+
+ if (write_options.disableWAL) {
+ has_unpersisted_data_.store(true, std::memory_order_relaxed);
+ }
+
+ PERF_TIMER_STOP(write_pre_and_post_process_time);
+
+ if (!two_write_queues_) {
+ if (status.ok() && !write_options.disableWAL) {
+ assert(log_context.log_file_number_size);
+ LogFileNumberSize& log_file_number_size =
+ *(log_context.log_file_number_size);
+ PERF_TIMER_GUARD(write_wal_time);
+ io_s =
+ WriteToWAL(write_group, log_context.writer, log_used,
+ log_context.need_log_sync, log_context.need_log_dir_sync,
+ last_sequence + 1, log_file_number_size);
+ }
+ } else {
+ if (status.ok() && !write_options.disableWAL) {
+ PERF_TIMER_GUARD(write_wal_time);
+ // LastAllocatedSequence is increased inside WriteToWAL under
+ // wal_write_mutex_ to ensure ordered events in WAL
+ io_s = ConcurrentWriteToWAL(write_group, log_used, &last_sequence,
+ seq_inc);
+ } else {
+ // Otherwise we inc seq number for memtable writes
+ last_sequence = versions_->FetchAddLastAllocatedSequence(seq_inc);
+ }
+ }
+ status = io_s;
+ assert(last_sequence != kMaxSequenceNumber);
+ const SequenceNumber current_sequence = last_sequence + 1;
+ last_sequence += seq_inc;
+
+ // PreReleaseCallback is called after WAL write and before memtable write
+ if (status.ok()) {
+ SequenceNumber next_sequence = current_sequence;
+ size_t index = 0;
+ // Note: the logic for advancing seq here must be consistent with the
+ // logic in WriteBatchInternal::InsertInto(write_group...) as well as
+ // with WriteBatchInternal::InsertInto(write_batch...) that is called on
+ // the merged batch during recovery from the WAL.
+ for (auto* writer : write_group) {
+ if (writer->CallbackFailed()) {
+ continue;
+ }
+ writer->sequence = next_sequence;
+ if (writer->pre_release_callback) {
+ Status ws = writer->pre_release_callback->Callback(
+ writer->sequence, disable_memtable, writer->log_used, index++,
+ pre_release_callback_cnt);
+ if (!ws.ok()) {
+ status = pre_release_cb_status = ws;
+ break;
+ }
+ }
+ if (seq_per_batch_) {
+ assert(writer->batch_cnt);
+ next_sequence += writer->batch_cnt;
+ } else if (writer->ShouldWriteToMemtable()) {
+ next_sequence += WriteBatchInternal::Count(writer->batch);
+ }
+ }
+ }
+
+ if (status.ok()) {
+ PERF_TIMER_GUARD(write_memtable_time);
+
+ if (!parallel) {
+ // w.sequence will be set inside InsertInto
+ w.status = WriteBatchInternal::InsertInto(
+ write_group, current_sequence, column_family_memtables_.get(),
+ &flush_scheduler_, &trim_history_scheduler_,
+ write_options.ignore_missing_column_families,
+ 0 /*recovery_log_number*/, this, parallel, seq_per_batch_,
+ batch_per_txn_);
+ } else {
+ write_group.last_sequence = last_sequence;
+ write_thread_.LaunchParallelMemTableWriters(&write_group);
+ in_parallel_group = true;
+
+        // Each parallel follower is doing its own writes. The leader should
+        // also do its own.
+ if (w.ShouldWriteToMemtable()) {
+ ColumnFamilyMemTablesImpl column_family_memtables(
+ versions_->GetColumnFamilySet());
+ assert(w.sequence == current_sequence);
+ w.status = WriteBatchInternal::InsertInto(
+ &w, w.sequence, &column_family_memtables, &flush_scheduler_,
+ &trim_history_scheduler_,
+ write_options.ignore_missing_column_families, 0 /*log_number*/,
+ this, true /*concurrent_memtable_writes*/, seq_per_batch_,
+ w.batch_cnt, batch_per_txn_,
+ write_options.memtable_insert_hint_per_batch);
+ }
+ }
+ if (seq_used != nullptr) {
+ *seq_used = w.sequence;
+ }
+ }
+ }
+ PERF_TIMER_START(write_pre_and_post_process_time);
+
+ if (!io_s.ok()) {
+ // Check WriteToWAL status
+ IOStatusCheck(io_s);
+ }
+ if (!w.CallbackFailed()) {
+ if (!io_s.ok()) {
+ assert(pre_release_cb_status.ok());
+ } else {
+ WriteStatusCheck(pre_release_cb_status);
+ }
+ } else {
+ assert(pre_release_cb_status.ok());
+ }
+
+ if (log_context.need_log_sync) {
+ VersionEdit synced_wals;
+ log_write_mutex_.Lock();
+ if (status.ok()) {
+ MarkLogsSynced(logfile_number_, log_context.need_log_dir_sync,
+ &synced_wals);
+ } else {
+ MarkLogsNotSynced(logfile_number_);
+ }
+ log_write_mutex_.Unlock();
+ if (status.ok() && synced_wals.IsWalAddition()) {
+ InstrumentedMutexLock l(&mutex_);
+ status = ApplyWALToManifest(&synced_wals);
+ }
+
+ // Requesting sync with two_write_queues_ is expected to be very rare. We
+ // hence provide a simple implementation that is not necessarily efficient.
+ if (two_write_queues_) {
+ if (manual_wal_flush_) {
+ status = FlushWAL(true);
+ } else {
+ status = SyncWAL();
+ }
+ }
+ }
+
+ bool should_exit_batch_group = true;
+ if (in_parallel_group) {
+ // CompleteParallelWorker returns true if this thread should
+ // handle exit, false means somebody else did
+ should_exit_batch_group = write_thread_.CompleteParallelMemTableWriter(&w);
+ }
+ if (should_exit_batch_group) {
+ if (status.ok()) {
+ for (auto* tmp_w : write_group) {
+ assert(tmp_w);
+ if (tmp_w->post_memtable_callback) {
+ Status tmp_s =
+ (*tmp_w->post_memtable_callback)(last_sequence, disable_memtable);
+ // TODO: propagate the execution status of post_memtable_callback to
+ // caller.
+ assert(tmp_s.ok());
+ }
+ }
+      // Note: if we are to resume after non-OK statuses we need to revisit how
+      // we react to non-OK statuses here.
+ versions_->SetLastSequence(last_sequence);
+ }
+ MemTableInsertStatusCheck(w.status);
+ write_thread_.ExitAsBatchGroupLeader(write_group, status);
+ }
+
+ if (status.ok()) {
+ status = w.FinalStatus();
+ }
+ return status;
+}
+
+Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options,
+ WriteBatch* my_batch, WriteCallback* callback,
+ uint64_t* log_used, uint64_t log_ref,
+ bool disable_memtable, uint64_t* seq_used) {
+ PERF_TIMER_GUARD(write_pre_and_post_process_time);
+ StopWatch write_sw(immutable_db_options_.clock, stats_, DB_WRITE);
+
+ WriteContext write_context;
+
+ WriteThread::Writer w(write_options, my_batch, callback, log_ref,
+ disable_memtable, /*_batch_cnt=*/0,
+ /*_pre_release_callback=*/nullptr);
+ write_thread_.JoinBatchGroup(&w);
+ TEST_SYNC_POINT("DBImplWrite::PipelinedWriteImpl:AfterJoinBatchGroup");
+ if (w.state == WriteThread::STATE_GROUP_LEADER) {
+ WriteThread::WriteGroup wal_write_group;
+ if (w.callback && !w.callback->AllowWriteBatching()) {
+ write_thread_.WaitForMemTableWriters();
+ }
+ LogContext log_context(!write_options.disableWAL && write_options.sync);
+ // PreprocessWrite does its own perf timing.
+ PERF_TIMER_STOP(write_pre_and_post_process_time);
+ w.status = PreprocessWrite(write_options, &log_context, &write_context);
+ PERF_TIMER_START(write_pre_and_post_process_time);
+
+    // This can set a non-OK status if a callback fails.
+ last_batch_group_size_ =
+ write_thread_.EnterAsBatchGroupLeader(&w, &wal_write_group);
+ const SequenceNumber current_sequence =
+ write_thread_.UpdateLastSequence(versions_->LastSequence()) + 1;
+ size_t total_count = 0;
+ size_t total_byte_size = 0;
+
+ if (w.status.ok()) {
+ // TODO: this use of operator bool on `tracer_` can avoid unnecessary lock
+ // grabs but does not seem thread-safe.
+ if (tracer_) {
+ InstrumentedMutexLock lock(&trace_mutex_);
+ if (tracer_ != nullptr && tracer_->IsWriteOrderPreserved()) {
+ for (auto* writer : wal_write_group) {
+ // TODO: maybe handle the tracing status?
+ tracer_->Write(writer->batch).PermitUncheckedError();
+ }
+ }
+ }
+ SequenceNumber next_sequence = current_sequence;
+ for (auto* writer : wal_write_group) {
+ assert(writer);
+ if (writer->CheckCallback(this)) {
+ if (writer->ShouldWriteToMemtable()) {
+ writer->sequence = next_sequence;
+ size_t count = WriteBatchInternal::Count(writer->batch);
+ next_sequence += count;
+ total_count += count;
+ }
+ total_byte_size = WriteBatchInternal::AppendedByteSize(
+ total_byte_size, WriteBatchInternal::ByteSize(writer->batch));
+ }
+ }
+ if (w.disable_wal) {
+ has_unpersisted_data_.store(true, std::memory_order_relaxed);
+ }
+ write_thread_.UpdateLastSequence(current_sequence + total_count - 1);
+ }
+
+ auto stats = default_cf_internal_stats_;
+ stats->AddDBStats(InternalStats::kIntStatsNumKeysWritten, total_count);
+ RecordTick(stats_, NUMBER_KEYS_WRITTEN, total_count);
+ stats->AddDBStats(InternalStats::kIntStatsBytesWritten, total_byte_size);
+ RecordTick(stats_, BYTES_WRITTEN, total_byte_size);
+ RecordInHistogram(stats_, BYTES_PER_WRITE, total_byte_size);
+
+ PERF_TIMER_STOP(write_pre_and_post_process_time);
+
+ IOStatus io_s;
+ io_s.PermitUncheckedError(); // Allow io_s to be uninitialized
+
+ if (w.status.ok() && !write_options.disableWAL) {
+ PERF_TIMER_GUARD(write_wal_time);
+ stats->AddDBStats(InternalStats::kIntStatsWriteDoneBySelf, 1);
+ RecordTick(stats_, WRITE_DONE_BY_SELF, 1);
+ if (wal_write_group.size > 1) {
+ stats->AddDBStats(InternalStats::kIntStatsWriteDoneByOther,
+ wal_write_group.size - 1);
+ RecordTick(stats_, WRITE_DONE_BY_OTHER, wal_write_group.size - 1);
+ }
+ assert(log_context.log_file_number_size);
+ LogFileNumberSize& log_file_number_size =
+ *(log_context.log_file_number_size);
+ io_s =
+ WriteToWAL(wal_write_group, log_context.writer, log_used,
+ log_context.need_log_sync, log_context.need_log_dir_sync,
+ current_sequence, log_file_number_size);
+ w.status = io_s;
+ }
+
+ if (!io_s.ok()) {
+ // Check WriteToWAL status
+ IOStatusCheck(io_s);
+ } else if (!w.CallbackFailed()) {
+ WriteStatusCheck(w.status);
+ }
+
+ VersionEdit synced_wals;
+ if (log_context.need_log_sync) {
+ InstrumentedMutexLock l(&log_write_mutex_);
+ if (w.status.ok()) {
+ MarkLogsSynced(logfile_number_, log_context.need_log_dir_sync,
+ &synced_wals);
+ } else {
+ MarkLogsNotSynced(logfile_number_);
+ }
+ }
+ if (w.status.ok() && synced_wals.IsWalAddition()) {
+ InstrumentedMutexLock l(&mutex_);
+ w.status = ApplyWALToManifest(&synced_wals);
+ }
+ write_thread_.ExitAsBatchGroupLeader(wal_write_group, w.status);
+ }
+
+  // NOTE: the memtable_write_group is declared before the following
+  // `if` statement because its lifetime needs to be longer
+  // than the inner scope of the `if`, as a reference to it
+  // may be used further below within the outer write_thread_
+ WriteThread::WriteGroup memtable_write_group;
+
+ if (w.state == WriteThread::STATE_MEMTABLE_WRITER_LEADER) {
+ PERF_TIMER_GUARD(write_memtable_time);
+ assert(w.ShouldWriteToMemtable());
+ write_thread_.EnterAsMemTableWriter(&w, &memtable_write_group);
+ if (memtable_write_group.size > 1 &&
+ immutable_db_options_.allow_concurrent_memtable_write) {
+ write_thread_.LaunchParallelMemTableWriters(&memtable_write_group);
+ } else {
+ memtable_write_group.status = WriteBatchInternal::InsertInto(
+ memtable_write_group, w.sequence, column_family_memtables_.get(),
+ &flush_scheduler_, &trim_history_scheduler_,
+ write_options.ignore_missing_column_families, 0 /*log_number*/, this,
+ false /*concurrent_memtable_writes*/, seq_per_batch_, batch_per_txn_);
+ versions_->SetLastSequence(memtable_write_group.last_sequence);
+ write_thread_.ExitAsMemTableWriter(&w, memtable_write_group);
+ }
+ } else {
+ // NOTE: the memtable_write_group is never really used,
+ // so we need to set its status to pass ASSERT_STATUS_CHECKED
+ memtable_write_group.status.PermitUncheckedError();
+ }
+
+ if (w.state == WriteThread::STATE_PARALLEL_MEMTABLE_WRITER) {
+ assert(w.ShouldWriteToMemtable());
+ ColumnFamilyMemTablesImpl column_family_memtables(
+ versions_->GetColumnFamilySet());
+ w.status = WriteBatchInternal::InsertInto(
+ &w, w.sequence, &column_family_memtables, &flush_scheduler_,
+ &trim_history_scheduler_, write_options.ignore_missing_column_families,
+ 0 /*log_number*/, this, true /*concurrent_memtable_writes*/,
+ false /*seq_per_batch*/, 0 /*batch_cnt*/, true /*batch_per_txn*/,
+ write_options.memtable_insert_hint_per_batch);
+ if (write_thread_.CompleteParallelMemTableWriter(&w)) {
+ MemTableInsertStatusCheck(w.status);
+ versions_->SetLastSequence(w.write_group->last_sequence);
+ write_thread_.ExitAsMemTableWriter(&w, *w.write_group);
+ }
+ }
+ if (seq_used != nullptr) {
+ *seq_used = w.sequence;
+ }
+
+ assert(w.state == WriteThread::STATE_COMPLETED);
+ return w.FinalStatus();
+}
+
+Status DBImpl::UnorderedWriteMemtable(const WriteOptions& write_options,
+ WriteBatch* my_batch,
+ WriteCallback* callback, uint64_t log_ref,
+ SequenceNumber seq,
+ const size_t sub_batch_cnt) {
+ PERF_TIMER_GUARD(write_pre_and_post_process_time);
+ StopWatch write_sw(immutable_db_options_.clock, stats_, DB_WRITE);
+
+ WriteThread::Writer w(write_options, my_batch, callback, log_ref,
+ false /*disable_memtable*/);
+
+ if (w.CheckCallback(this) && w.ShouldWriteToMemtable()) {
+ w.sequence = seq;
+ size_t total_count = WriteBatchInternal::Count(my_batch);
+ InternalStats* stats = default_cf_internal_stats_;
+ stats->AddDBStats(InternalStats::kIntStatsNumKeysWritten, total_count);
+ RecordTick(stats_, NUMBER_KEYS_WRITTEN, total_count);
+
+ ColumnFamilyMemTablesImpl column_family_memtables(
+ versions_->GetColumnFamilySet());
+ w.status = WriteBatchInternal::InsertInto(
+ &w, w.sequence, &column_family_memtables, &flush_scheduler_,
+ &trim_history_scheduler_, write_options.ignore_missing_column_families,
+ 0 /*log_number*/, this, true /*concurrent_memtable_writes*/,
+ seq_per_batch_, sub_batch_cnt, true /*batch_per_txn*/,
+ write_options.memtable_insert_hint_per_batch);
+ if (write_options.disableWAL) {
+ has_unpersisted_data_.store(true, std::memory_order_relaxed);
+ }
+ }
+
+ size_t pending_cnt = pending_memtable_writes_.fetch_sub(1) - 1;
+ if (pending_cnt == 0) {
+    // switch_cv_ waits until pending_memtable_writes_ = 0. Locking its mutex
+    // before notifying ensures that the cv is in the waiting state when it is
+    // notified, thus not missing the update to pending_memtable_writes_ even
+    // though it is not modified under the mutex.
+ std::lock_guard<std::mutex> lck(switch_mutex_);
+ switch_cv_.notify_all();
+ }
+ WriteStatusCheck(w.status);
+
+ if (!w.FinalStatus().ok()) {
+ return w.FinalStatus();
+ }
+ return Status::OK();
+}
+
+// The 2nd write queue. If enabled it will be used only for WAL-only writes.
+// This is the only queue that updates LastPublishedSequence, which is only
+// applicable in a two-queue setting.
+Status DBImpl::WriteImplWALOnly(
+ WriteThread* write_thread, const WriteOptions& write_options,
+ WriteBatch* my_batch, WriteCallback* callback, uint64_t* log_used,
+ const uint64_t log_ref, uint64_t* seq_used, const size_t sub_batch_cnt,
+ PreReleaseCallback* pre_release_callback, const AssignOrder assign_order,
+ const PublishLastSeq publish_last_seq, const bool disable_memtable) {
+ PERF_TIMER_GUARD(write_pre_and_post_process_time);
+ WriteThread::Writer w(write_options, my_batch, callback, log_ref,
+ disable_memtable, sub_batch_cnt, pre_release_callback);
+ StopWatch write_sw(immutable_db_options_.clock, stats_, DB_WRITE);
+
+ write_thread->JoinBatchGroup(&w);
+ assert(w.state != WriteThread::STATE_PARALLEL_MEMTABLE_WRITER);
+ if (w.state == WriteThread::STATE_COMPLETED) {
+ if (log_used != nullptr) {
+ *log_used = w.log_used;
+ }
+ if (seq_used != nullptr) {
+ *seq_used = w.sequence;
+ }
+ return w.FinalStatus();
+ }
+ // else we are the leader of the write batch group
+ assert(w.state == WriteThread::STATE_GROUP_LEADER);
+
+ if (publish_last_seq == kDoPublishLastSeq) {
+ Status status;
+
+ // Currently we only use kDoPublishLastSeq in unordered_write
+ assert(immutable_db_options_.unordered_write);
+ WriteContext write_context;
+ if (error_handler_.IsDBStopped()) {
+ status = error_handler_.GetBGError();
+ }
+ // TODO(myabandeh): Make preliminary checks thread-safe so we could do them
+ // without paying the cost of obtaining the mutex.
+ if (status.ok()) {
+ LogContext log_context;
+ status = PreprocessWrite(write_options, &log_context, &write_context);
+ WriteStatusCheckOnLocked(status);
+ }
+ if (!status.ok()) {
+ WriteThread::WriteGroup write_group;
+ write_thread->EnterAsBatchGroupLeader(&w, &write_group);
+ write_thread->ExitAsBatchGroupLeader(write_group, status);
+ return status;
+ }
+ }
+
+ WriteThread::WriteGroup write_group;
+ uint64_t last_sequence;
+ write_thread->EnterAsBatchGroupLeader(&w, &write_group);
+ // Note: no need to update last_batch_group_size_ here since the batch writes
+ // to WAL only
+ // TODO: this use of operator bool on `tracer_` can avoid unnecessary lock
+ // grabs but does not seem thread-safe.
+ if (tracer_) {
+ InstrumentedMutexLock lock(&trace_mutex_);
+ if (tracer_ != nullptr && tracer_->IsWriteOrderPreserved()) {
+ for (auto* writer : write_group) {
+ // TODO: maybe handle the tracing status?
+ tracer_->Write(writer->batch).PermitUncheckedError();
+ }
+ }
+ }
+
+ size_t pre_release_callback_cnt = 0;
+ size_t total_byte_size = 0;
+ for (auto* writer : write_group) {
+ assert(writer);
+ if (writer->CheckCallback(this)) {
+ total_byte_size = WriteBatchInternal::AppendedByteSize(
+ total_byte_size, WriteBatchInternal::ByteSize(writer->batch));
+ if (writer->pre_release_callback) {
+ pre_release_callback_cnt++;
+ }
+ }
+ }
+
+ const bool concurrent_update = true;
+ // Update stats while we are an exclusive group leader, so we know
+ // that nobody else can be writing to these particular stats.
+ // We're optimistic, updating the stats before we successfully
+ // commit. That lets us release our leader status early.
+ auto stats = default_cf_internal_stats_;
+ stats->AddDBStats(InternalStats::kIntStatsBytesWritten, total_byte_size,
+ concurrent_update);
+ RecordTick(stats_, BYTES_WRITTEN, total_byte_size);
+ stats->AddDBStats(InternalStats::kIntStatsWriteDoneBySelf, 1,
+ concurrent_update);
+ RecordTick(stats_, WRITE_DONE_BY_SELF);
+ auto write_done_by_other = write_group.size - 1;
+ if (write_done_by_other > 0) {
+ stats->AddDBStats(InternalStats::kIntStatsWriteDoneByOther,
+ write_done_by_other, concurrent_update);
+ RecordTick(stats_, WRITE_DONE_BY_OTHER, write_done_by_other);
+ }
+ RecordInHistogram(stats_, BYTES_PER_WRITE, total_byte_size);
+
+ PERF_TIMER_STOP(write_pre_and_post_process_time);
+
+ PERF_TIMER_GUARD(write_wal_time);
+ // LastAllocatedSequence is increased inside WriteToWAL under
+ // wal_write_mutex_ to ensure ordered events in WAL
+ size_t seq_inc = 0 /* total_count */;
+ if (assign_order == kDoAssignOrder) {
+ size_t total_batch_cnt = 0;
+ for (auto* writer : write_group) {
+ assert(writer->batch_cnt || !seq_per_batch_);
+ if (!writer->CallbackFailed()) {
+ total_batch_cnt += writer->batch_cnt;
+ }
+ }
+ seq_inc = total_batch_cnt;
+ }
+ Status status;
+ if (!write_options.disableWAL) {
+ IOStatus io_s =
+ ConcurrentWriteToWAL(write_group, log_used, &last_sequence, seq_inc);
+ status = io_s;
+ // last_sequence may not be set if there is an error
+ // This error checking and return is moved up to avoid using uninitialized
+ // last_sequence.
+ if (!io_s.ok()) {
+ IOStatusCheck(io_s);
+ write_thread->ExitAsBatchGroupLeader(write_group, status);
+ return status;
+ }
+ } else {
+ // Otherwise we inc seq number to do solely the seq allocation
+ last_sequence = versions_->FetchAddLastAllocatedSequence(seq_inc);
+ }
+
+ size_t memtable_write_cnt = 0;
+ auto curr_seq = last_sequence + 1;
+ for (auto* writer : write_group) {
+ if (writer->CallbackFailed()) {
+ continue;
+ }
+ writer->sequence = curr_seq;
+ if (assign_order == kDoAssignOrder) {
+ assert(writer->batch_cnt || !seq_per_batch_);
+ curr_seq += writer->batch_cnt;
+ }
+ if (!writer->disable_memtable) {
+ memtable_write_cnt++;
+ }
+ // else seq advances only by memtable writes
+ }
+ if (status.ok() && write_options.sync) {
+ assert(!write_options.disableWAL);
+ // Requesting sync with two_write_queues_ is expected to be very rare. We
+    // hence provide a simple implementation that is not necessarily efficient.
+ if (manual_wal_flush_) {
+ status = FlushWAL(true);
+ } else {
+ status = SyncWAL();
+ }
+ }
+ PERF_TIMER_START(write_pre_and_post_process_time);
+
+ if (!w.CallbackFailed()) {
+ WriteStatusCheck(status);
+ }
+ if (status.ok()) {
+ size_t index = 0;
+ for (auto* writer : write_group) {
+ if (!writer->CallbackFailed() && writer->pre_release_callback) {
+ assert(writer->sequence != kMaxSequenceNumber);
+ Status ws = writer->pre_release_callback->Callback(
+ writer->sequence, disable_memtable, writer->log_used, index++,
+ pre_release_callback_cnt);
+ if (!ws.ok()) {
+ status = ws;
+ break;
+ }
+ }
+ }
+ }
+ if (publish_last_seq == kDoPublishLastSeq) {
+ versions_->SetLastSequence(last_sequence + seq_inc);
+ // Currently we only use kDoPublishLastSeq in unordered_write
+ assert(immutable_db_options_.unordered_write);
+ }
+ if (immutable_db_options_.unordered_write && status.ok()) {
+ pending_memtable_writes_ += memtable_write_cnt;
+ }
+ write_thread->ExitAsBatchGroupLeader(write_group, status);
+ if (status.ok()) {
+ status = w.FinalStatus();
+ }
+ if (seq_used != nullptr) {
+ *seq_used = w.sequence;
+ }
+ return status;
+}
+
+void DBImpl::WriteStatusCheckOnLocked(const Status& status) {
+ // Is setting bg_error_ enough here? This will at least stop
+ // compaction and fail any further writes.
+ InstrumentedMutexLock l(&mutex_);
+ assert(!status.IsIOFenced() || !error_handler_.GetBGError().ok());
+ if (immutable_db_options_.paranoid_checks && !status.ok() &&
+ !status.IsBusy() && !status.IsIncomplete()) {
+ // Maybe change the return status to void?
+ error_handler_.SetBGError(status, BackgroundErrorReason::kWriteCallback);
+ }
+}
+
+void DBImpl::WriteStatusCheck(const Status& status) {
+ // Is setting bg_error_ enough here? This will at least stop
+ // compaction and fail any further writes.
+ assert(!status.IsIOFenced() || !error_handler_.GetBGError().ok());
+ if (immutable_db_options_.paranoid_checks && !status.ok() &&
+ !status.IsBusy() && !status.IsIncomplete()) {
+ mutex_.Lock();
+ // Maybe change the return status to void?
+ error_handler_.SetBGError(status, BackgroundErrorReason::kWriteCallback);
+ mutex_.Unlock();
+ }
+}
+
+void DBImpl::IOStatusCheck(const IOStatus& io_status) {
+ // Is setting bg_error_ enough here? This will at least stop
+ // compaction and fail any further writes.
+ if ((immutable_db_options_.paranoid_checks && !io_status.ok() &&
+ !io_status.IsBusy() && !io_status.IsIncomplete()) ||
+ io_status.IsIOFenced()) {
+ mutex_.Lock();
+ // Maybe change the return status to void?
+ error_handler_.SetBGError(io_status, BackgroundErrorReason::kWriteCallback);
+ mutex_.Unlock();
+ } else {
+    // Force the writable file to continue to be writable.
+ logs_.back().writer->file()->reset_seen_error();
+ }
+}
+
+void DBImpl::MemTableInsertStatusCheck(const Status& status) {
+ // A non-OK status here indicates that the state implied by the
+ // WAL has diverged from the in-memory state. This could be
+ // because of a corrupt write_batch (very bad), or because the
+ // client specified an invalid column family and didn't specify
+ // ignore_missing_column_families.
+ if (!status.ok()) {
+ mutex_.Lock();
+ assert(!error_handler_.IsBGWorkStopped());
+ // Maybe change the return status to void?
+ error_handler_.SetBGError(status, BackgroundErrorReason::kMemTable)
+ .PermitUncheckedError();
+ mutex_.Unlock();
+ }
+}
+
+Status DBImpl::PreprocessWrite(const WriteOptions& write_options,
+ LogContext* log_context,
+ WriteContext* write_context) {
+ assert(write_context != nullptr && log_context != nullptr);
+ Status status;
+
+ if (error_handler_.IsDBStopped()) {
+ InstrumentedMutexLock l(&mutex_);
+ status = error_handler_.GetBGError();
+ }
+
+ PERF_TIMER_GUARD(write_scheduling_flushes_compactions_time);
+
+ if (UNLIKELY(status.ok() && total_log_size_ > GetMaxTotalWalSize())) {
+ assert(versions_);
+ InstrumentedMutexLock l(&mutex_);
+ const ColumnFamilySet* const column_families =
+ versions_->GetColumnFamilySet();
+ assert(column_families);
+ size_t num_cfs = column_families->NumberOfColumnFamilies();
+ assert(num_cfs >= 1);
+ if (num_cfs > 1) {
+ WaitForPendingWrites();
+ status = SwitchWAL(write_context);
+ }
+ }
+
+ if (UNLIKELY(status.ok() && write_buffer_manager_->ShouldFlush())) {
+ // Before a new memtable is added in SwitchMemtable(),
+ // write_buffer_manager_->ShouldFlush() will keep returning true. If another
+    // thread is writing to another DB with the same write buffer, that DB may
+    // also be flushed. We may end up flushing many more DBs than needed. It's
+    // suboptimal but still correct.
+ InstrumentedMutexLock l(&mutex_);
+ WaitForPendingWrites();
+ status = HandleWriteBufferManagerFlush(write_context);
+ }
+
+ if (UNLIKELY(status.ok() && !trim_history_scheduler_.Empty())) {
+ InstrumentedMutexLock l(&mutex_);
+ status = TrimMemtableHistory(write_context);
+ }
+
+ if (UNLIKELY(status.ok() && !flush_scheduler_.Empty())) {
+ InstrumentedMutexLock l(&mutex_);
+ WaitForPendingWrites();
+ status = ScheduleFlushes(write_context);
+ }
+
+ PERF_TIMER_STOP(write_scheduling_flushes_compactions_time);
+ PERF_TIMER_GUARD(write_pre_and_post_process_time);
+
+ if (UNLIKELY(status.ok() && (write_controller_.IsStopped() ||
+ write_controller_.NeedsDelay()))) {
+ PERF_TIMER_STOP(write_pre_and_post_process_time);
+ PERF_TIMER_GUARD(write_delay_time);
+    // We don't know the size of the current batch, so we always use the size
+    // of the previous one. This might create a fairness issue in that
+    // expiration might happen for smaller writes while larger writes can go
+    // through. Can optimize it if it becomes an issue.
+ InstrumentedMutexLock l(&mutex_);
+ status = DelayWrite(last_batch_group_size_, write_options);
+ PERF_TIMER_START(write_pre_and_post_process_time);
+ }
+
+  // If memory usage has exceeded a certain threshold,
+  // write_buffer_manager_->ShouldStall() returns true for all threads writing
+  // to all DBs and writers will be stalled.
+  // This is a soft check because WriteBufferManager::buffer_limit_ has already
+  // been exceeded at this point, so no new write (including the current one)
+  // will go through until memory usage is decreased.
+ if (UNLIKELY(status.ok() && write_buffer_manager_->ShouldStall())) {
+ if (write_options.no_slowdown) {
+ status = Status::Incomplete("Write stall");
+ } else {
+ InstrumentedMutexLock l(&mutex_);
+ WriteBufferManagerStallWrites();
+ }
+ }
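+  // From here on, prepare the WAL-related context under log_write_mutex_ so
+  // that logs_ and alive_log_files_ are read consistently.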
+ InstrumentedMutexLock l(&log_write_mutex_);
+ if (status.ok() && log_context->need_log_sync) {
+ // Wait until the parallel syncs are finished. Any sync process has to sync
+    // the front log too, so it is enough to check the status of front().
+    // We do a while loop since log_sync_cv_ is signalled whenever any sync
+    // finishes.
+ // Note: there does not seem to be a reason to wait for parallel sync at
+ // this early step but it is not important since parallel sync (SyncWAL) and
+ // need_log_sync are usually not used together.
+ while (logs_.front().IsSyncing()) {
+ log_sync_cv_.Wait();
+ }
+ for (auto& log : logs_) {
+      // This is just to prevent the logs from being synced by a parallel
+      // SyncWAL call. We will do the actual syncing later, after we write to
+      // the WAL.
+ // Note: there does not seem to be a reason to set this early before we
+ // actually write to the WAL
+ log.PrepareForSync();
+ }
+ } else {
+ log_context->need_log_sync = false;
+ }
+ log_context->writer = logs_.back().writer;
+ log_context->need_log_dir_sync =
+ log_context->need_log_dir_sync && !log_dir_synced_;
+ log_context->log_file_number_size = std::addressof(alive_log_files_.back());
+
+ return status;
+}
+
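+// Flattens the batches of a write group into a single batch destined for the
+// WAL. If the group contains a single eligible batch it is used directly;
+// otherwise the batches are appended into *tmp_batch. Also reports how many
+// batches go to the WAL and the last batch flagged as latest persistent state.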
+Status DBImpl::MergeBatch(const WriteThread::WriteGroup& write_group,
+ WriteBatch* tmp_batch, WriteBatch** merged_batch,
+ size_t* write_with_wal,
+ WriteBatch** to_be_cached_state) {
+ assert(write_with_wal != nullptr);
+ assert(tmp_batch != nullptr);
+ assert(*to_be_cached_state == nullptr);
+ *write_with_wal = 0;
+ auto* leader = write_group.leader;
+ assert(!leader->disable_wal); // Same holds for all in the batch group
+ if (write_group.size == 1 && !leader->CallbackFailed() &&
+ leader->batch->GetWalTerminationPoint().is_cleared()) {
+    // We simply write the leader's WriteBatch to the WAL if the group
+    // contains only one batch and that batch is not to be truncated
+    // (its WAL termination point is cleared).
+ *merged_batch = leader->batch;
+ if (WriteBatchInternal::IsLatestPersistentState(*merged_batch)) {
+ *to_be_cached_state = *merged_batch;
+ }
+ *write_with_wal = 1;
+ } else {
+ // WAL needs all of the batches flattened into a single batch.
+ // We could avoid copying here with an iov-like AddRecord
+ // interface
+ *merged_batch = tmp_batch;
+ for (auto writer : write_group) {
+ if (!writer->CallbackFailed()) {
+ Status s = WriteBatchInternal::Append(*merged_batch, writer->batch,
+ /*WAL_only*/ true);
+ if (!s.ok()) {
+ tmp_batch->Clear();
+ return s;
+ }
+ if (WriteBatchInternal::IsLatestPersistentState(writer->batch)) {
+ // We only need to cache the last of such write batch
+ *to_be_cached_state = writer->batch;
+ }
+ (*write_with_wal)++;
+ }
+ }
+ }
+ // return merged_batch;
+ return Status::OK();
+}
+
+// When two_write_queues_ is disabled, this function is called from the only
+// write thread. Otherwise this must be called holding log_write_mutex_.
+IOStatus DBImpl::WriteToWAL(const WriteBatch& merged_batch,
+ log::Writer* log_writer, uint64_t* log_used,
+ uint64_t* log_size,
+ Env::IOPriority rate_limiter_priority,
+ LogFileNumberSize& log_file_number_size) {
+ assert(log_size != nullptr);
+
+ Slice log_entry = WriteBatchInternal::Contents(&merged_batch);
+ TEST_SYNC_POINT_CALLBACK("DBImpl::WriteToWAL:log_entry", &log_entry);
+ auto s = merged_batch.VerifyChecksum();
+ if (!s.ok()) {
+ return status_to_io_status(std::move(s));
+ }
+ *log_size = log_entry.size();
+  // When two_write_queues_ is enabled, WriteToWAL has to be protected from
+  // concurrent calls from the two queues anyway, and log_write_mutex_ is
+  // already held. Otherwise, if manual_wal_flush_ is enabled, we need to
+  // protect log_writer->AddRecord from possible concurrent calls via FlushWAL
+  // by the application.
+ const bool needs_locking = manual_wal_flush_ && !two_write_queues_;
+  // Due to performance concerns about missed branch prediction, penalize the
+  // new manual_wal_flush_ feature (via UNLIKELY) instead of the more common
+  // case where we do not need any locking.
+ if (UNLIKELY(needs_locking)) {
+ log_write_mutex_.Lock();
+ }
+ IOStatus io_s = log_writer->AddRecord(log_entry, rate_limiter_priority);
+
+ if (UNLIKELY(needs_locking)) {
+ log_write_mutex_.Unlock();
+ }
+ if (log_used != nullptr) {
+ *log_used = logfile_number_;
+ }
+ total_log_size_ += log_entry.size();
+ log_file_number_size.AddSize(*log_size);
+ log_empty_ = false;
+ return io_s;
+}
+
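+// Writes an entire write group to the WAL (used when two_write_queues_ is
+// disabled) and, if requested, syncs the log files and the WAL directory.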
+IOStatus DBImpl::WriteToWAL(const WriteThread::WriteGroup& write_group,
+ log::Writer* log_writer, uint64_t* log_used,
+ bool need_log_sync, bool need_log_dir_sync,
+ SequenceNumber sequence,
+ LogFileNumberSize& log_file_number_size) {
+ IOStatus io_s;
+ assert(!two_write_queues_);
+ assert(!write_group.leader->disable_wal);
+ // Same holds for all in the batch group
+ size_t write_with_wal = 0;
+ WriteBatch* to_be_cached_state = nullptr;
+ WriteBatch* merged_batch;
+ io_s = status_to_io_status(MergeBatch(write_group, &tmp_batch_, &merged_batch,
+ &write_with_wal, &to_be_cached_state));
+ if (UNLIKELY(!io_s.ok())) {
+ return io_s;
+ }
+
+ if (merged_batch == write_group.leader->batch) {
+ write_group.leader->log_used = logfile_number_;
+ } else if (write_with_wal > 1) {
+ for (auto writer : write_group) {
+ writer->log_used = logfile_number_;
+ }
+ }
+
+ WriteBatchInternal::SetSequence(merged_batch, sequence);
+
+ uint64_t log_size;
+ io_s = WriteToWAL(*merged_batch, log_writer, log_used, &log_size,
+ write_group.leader->rate_limiter_priority,
+ log_file_number_size);
+ if (to_be_cached_state) {
+ cached_recoverable_state_ = *to_be_cached_state;
+ cached_recoverable_state_empty_ = false;
+ }
+
+ if (io_s.ok() && need_log_sync) {
+ StopWatch sw(immutable_db_options_.clock, stats_, WAL_FILE_SYNC_MICROS);
+ // It's safe to access logs_ with unlocked mutex_ here because:
+ // - we've set getting_synced=true for all logs,
+ // so other threads won't pop from logs_ while we're here,
+ // - only writer thread can push to logs_, and we're in
+ // writer thread, so no one will push to logs_,
+ // - as long as other threads don't modify it, it's safe to read
+ // from std::deque from multiple threads concurrently.
+ //
+    // The sync operation should be done while holding log_write_mutex_,
+    // because when DBOptions.manual_wal_flush_ is set, FlushWAL may be
+    // invoked by another thread; without log_write_mutex_ held, the log
+    // file may get corrupted.
+
+ const bool needs_locking = manual_wal_flush_ && !two_write_queues_;
+ if (UNLIKELY(needs_locking)) {
+ log_write_mutex_.Lock();
+ }
+
+ for (auto& log : logs_) {
+ io_s = log.writer->file()->Sync(immutable_db_options_.use_fsync);
+ if (!io_s.ok()) {
+ break;
+ }
+ }
+
+ if (UNLIKELY(needs_locking)) {
+ log_write_mutex_.Unlock();
+ }
+
+ if (io_s.ok() && need_log_dir_sync) {
+      // We only sync the WAL directory the first time WAL syncing is
+      // requested, so that if users never turn on WAL sync,
+      // we can avoid the disk I/O in the write code path.
+ io_s = directories_.GetWalDir()->FsyncWithDirOptions(
+ IOOptions(), nullptr,
+ DirFsyncOptions(DirFsyncOptions::FsyncReason::kNewFileSynced));
+ }
+ }
+
+ if (merged_batch == &tmp_batch_) {
+ tmp_batch_.Clear();
+ }
+ if (io_s.ok()) {
+ auto stats = default_cf_internal_stats_;
+ if (need_log_sync) {
+ stats->AddDBStats(InternalStats::kIntStatsWalFileSynced, 1);
+ RecordTick(stats_, WAL_FILE_SYNCED);
+ }
+ stats->AddDBStats(InternalStats::kIntStatsWalFileBytes, log_size);
+ RecordTick(stats_, WAL_FILE_BYTES, log_size);
+ stats->AddDBStats(InternalStats::kIntStatsWriteWithWal, write_with_wal);
+ RecordTick(stats_, WRITE_WITH_WAL, write_with_wal);
+ }
+ return io_s;
+}
+
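+// WAL write path used when two_write_queues_ or unordered_write is enabled.
+// It may be called concurrently from both write queues, so logs_ and
+// alive_log_files_ are accessed under log_write_mutex_ below.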
+IOStatus DBImpl::ConcurrentWriteToWAL(
+ const WriteThread::WriteGroup& write_group, uint64_t* log_used,
+ SequenceNumber* last_sequence, size_t seq_inc) {
+ IOStatus io_s;
+
+ assert(two_write_queues_ || immutable_db_options_.unordered_write);
+ assert(!write_group.leader->disable_wal);
+ // Same holds for all in the batch group
+ WriteBatch tmp_batch;
+ size_t write_with_wal = 0;
+ WriteBatch* to_be_cached_state = nullptr;
+ WriteBatch* merged_batch;
+ io_s = status_to_io_status(MergeBatch(write_group, &tmp_batch, &merged_batch,
+ &write_with_wal, &to_be_cached_state));
+ if (UNLIKELY(!io_s.ok())) {
+ return io_s;
+ }
+
+  // We need to lock log_write_mutex_ since logs_ and alive_log_files_ might
+  // be pushed back concurrently.
+ log_write_mutex_.Lock();
+ if (merged_batch == write_group.leader->batch) {
+ write_group.leader->log_used = logfile_number_;
+ } else if (write_with_wal > 1) {
+ for (auto writer : write_group) {
+ writer->log_used = logfile_number_;
+ }
+ }
+ *last_sequence = versions_->FetchAddLastAllocatedSequence(seq_inc);
+ auto sequence = *last_sequence + 1;
+ WriteBatchInternal::SetSequence(merged_batch, sequence);
+
+ log::Writer* log_writer = logs_.back().writer;
+ LogFileNumberSize& log_file_number_size = alive_log_files_.back();
+
+ assert(log_writer->get_log_number() == log_file_number_size.number);
+
+ uint64_t log_size;
+ io_s = WriteToWAL(*merged_batch, log_writer, log_used, &log_size,
+ write_group.leader->rate_limiter_priority,
+ log_file_number_size);
+ if (to_be_cached_state) {
+ cached_recoverable_state_ = *to_be_cached_state;
+ cached_recoverable_state_empty_ = false;
+ }
+ log_write_mutex_.Unlock();
+
+ if (io_s.ok()) {
+ const bool concurrent = true;
+ auto stats = default_cf_internal_stats_;
+ stats->AddDBStats(InternalStats::kIntStatsWalFileBytes, log_size,
+ concurrent);
+ RecordTick(stats_, WAL_FILE_BYTES, log_size);
+ stats->AddDBStats(InternalStats::kIntStatsWriteWithWal, write_with_wal,
+ concurrent);
+ RecordTick(stats_, WRITE_WITH_WAL, write_with_wal);
+ }
+ return io_s;
+}
+
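+// Persists the cached recoverable state (the most recent write batch flagged
+// as the latest persistent state, see MergeBatch) into the memtables so that
+// it is not lost when the WAL that carried it is deleted after a memtable
+// switch.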
+Status DBImpl::WriteRecoverableState() {
+ mutex_.AssertHeld();
+ if (!cached_recoverable_state_empty_) {
+ bool dont_care_bool;
+ SequenceNumber next_seq;
+ if (two_write_queues_) {
+ log_write_mutex_.Lock();
+ }
+ SequenceNumber seq;
+ if (two_write_queues_) {
+ seq = versions_->FetchAddLastAllocatedSequence(0);
+ } else {
+ seq = versions_->LastSequence();
+ }
+ WriteBatchInternal::SetSequence(&cached_recoverable_state_, seq + 1);
+ auto status = WriteBatchInternal::InsertInto(
+ &cached_recoverable_state_, column_family_memtables_.get(),
+ &flush_scheduler_, &trim_history_scheduler_, true,
+ 0 /*recovery_log_number*/, this, false /* concurrent_memtable_writes */,
+ &next_seq, &dont_care_bool, seq_per_batch_);
+ auto last_seq = next_seq - 1;
+ if (two_write_queues_) {
+ versions_->FetchAddLastAllocatedSequence(last_seq - seq);
+ versions_->SetLastPublishedSequence(last_seq);
+ }
+ versions_->SetLastSequence(last_seq);
+ if (two_write_queues_) {
+ log_write_mutex_.Unlock();
+ }
+ if (status.ok() && recoverable_state_pre_release_callback_) {
+ const bool DISABLE_MEMTABLE = true;
+ for (uint64_t sub_batch_seq = seq + 1;
+ sub_batch_seq < next_seq && status.ok(); sub_batch_seq++) {
+ uint64_t const no_log_num = 0;
+ // Unlock it since the callback might end up locking mutex. e.g.,
+ // AddCommitted -> AdvanceMaxEvictedSeq -> GetSnapshotListFromDB
+ mutex_.Unlock();
+ status = recoverable_state_pre_release_callback_->Callback(
+ sub_batch_seq, !DISABLE_MEMTABLE, no_log_num, 0, 1);
+ mutex_.Lock();
+ }
+ }
+ if (status.ok()) {
+ cached_recoverable_state_.Clear();
+ cached_recoverable_state_empty_ = true;
+ }
+ return status;
+ }
+ return Status::OK();
+}
+
+void DBImpl::SelectColumnFamiliesForAtomicFlush(
+ autovector<ColumnFamilyData*>* cfds) {
+ for (ColumnFamilyData* cfd : *versions_->GetColumnFamilySet()) {
+ if (cfd->IsDropped()) {
+ continue;
+ }
+ if (cfd->imm()->NumNotFlushed() != 0 || !cfd->mem()->IsEmpty() ||
+ !cached_recoverable_state_empty_.load()) {
+ cfds->push_back(cfd);
+ }
+ }
+}
+
+// Assign sequence number for atomic flush.
+void DBImpl::AssignAtomicFlushSeq(const autovector<ColumnFamilyData*>& cfds) {
+ assert(immutable_db_options_.atomic_flush);
+ auto seq = versions_->LastSequence();
+ for (auto cfd : cfds) {
+ cfd->imm()->AssignAtomicFlushSeq(seq);
+ }
+}
+
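+// Called when the total WAL size exceeds GetMaxTotalWalSize(): marks the
+// oldest alive WAL as getting flushed and schedules flushes of the column
+// families that still depend on it so the WAL can eventually be released.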
+Status DBImpl::SwitchWAL(WriteContext* write_context) {
+ mutex_.AssertHeld();
+ assert(write_context != nullptr);
+ Status status;
+
+ if (alive_log_files_.begin()->getting_flushed) {
+ return status;
+ }
+
+ auto oldest_alive_log = alive_log_files_.begin()->number;
+ bool flush_wont_release_oldest_log = false;
+ if (allow_2pc()) {
+ auto oldest_log_with_uncommitted_prep =
+ logs_with_prep_tracker_.FindMinLogContainingOutstandingPrep();
+
+ assert(oldest_log_with_uncommitted_prep == 0 ||
+ oldest_log_with_uncommitted_prep >= oldest_alive_log);
+ if (oldest_log_with_uncommitted_prep > 0 &&
+ oldest_log_with_uncommitted_prep == oldest_alive_log) {
+ if (unable_to_release_oldest_log_) {
+ // we already attempted to flush all column families dependent on
+ // the oldest alive log but the log still contained uncommitted
+ // transactions so there is still nothing that we can do.
+ return status;
+ } else {
+ ROCKS_LOG_WARN(
+ immutable_db_options_.info_log,
+ "Unable to release oldest log due to uncommitted transaction");
+ unable_to_release_oldest_log_ = true;
+ flush_wont_release_oldest_log = true;
+ }
+ }
+ }
+ if (!flush_wont_release_oldest_log) {
+ // we only mark this log as getting flushed if we have successfully
+ // flushed all data in this log. If this log contains outstanding prepared
+ // transactions then we cannot flush this log until those transactions are
+    // committed.
+ unable_to_release_oldest_log_ = false;
+ alive_log_files_.begin()->getting_flushed = true;
+ }
+
+ ROCKS_LOG_INFO(
+ immutable_db_options_.info_log,
+ "Flushing all column families with data in WAL number %" PRIu64
+ ". Total log size is %" PRIu64 " while max_total_wal_size is %" PRIu64,
+ oldest_alive_log, total_log_size_.load(), GetMaxTotalWalSize());
+  // No need to refcount because a column family drop happens in the write
+  // thread, so it can't happen while we're in the write thread.
+ autovector<ColumnFamilyData*> cfds;
+ if (immutable_db_options_.atomic_flush) {
+ SelectColumnFamiliesForAtomicFlush(&cfds);
+ } else {
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ if (cfd->IsDropped()) {
+ continue;
+ }
+ if (cfd->OldestLogToKeep() <= oldest_alive_log) {
+ cfds.push_back(cfd);
+ }
+ }
+ MaybeFlushStatsCF(&cfds);
+ }
+ WriteThread::Writer nonmem_w;
+ if (two_write_queues_) {
+ nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_);
+ }
+
+ for (const auto cfd : cfds) {
+ cfd->Ref();
+ status = SwitchMemtable(cfd, write_context);
+ cfd->UnrefAndTryDelete();
+ if (!status.ok()) {
+ break;
+ }
+ }
+ if (two_write_queues_) {
+ nonmem_write_thread_.ExitUnbatched(&nonmem_w);
+ }
+
+ if (status.ok()) {
+ if (immutable_db_options_.atomic_flush) {
+ AssignAtomicFlushSeq(cfds);
+ }
+ for (auto cfd : cfds) {
+ cfd->imm()->FlushRequested();
+ if (!immutable_db_options_.atomic_flush) {
+ FlushRequest flush_req;
+ GenerateFlushRequest({cfd}, &flush_req);
+ SchedulePendingFlush(flush_req, FlushReason::kWalFull);
+ }
+ }
+ if (immutable_db_options_.atomic_flush) {
+ FlushRequest flush_req;
+ GenerateFlushRequest(cfds, &flush_req);
+ SchedulePendingFlush(flush_req, FlushReason::kWalFull);
+ }
+ MaybeScheduleFlushOrCompaction();
+ }
+ return status;
+}
+
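+// Called when the WriteBufferManager reports memory pressure: picks the
+// column family whose mutable memtable has the oldest creation sequence (or
+// all eligible CFs when atomic_flush is enabled), switches its memtable and
+// schedules a flush.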
+Status DBImpl::HandleWriteBufferManagerFlush(WriteContext* write_context) {
+ mutex_.AssertHeld();
+ assert(write_context != nullptr);
+ Status status;
+
+ // Before a new memtable is added in SwitchMemtable(),
+ // write_buffer_manager_->ShouldFlush() will keep returning true. If another
+  // thread is writing to another DB with the same write buffer, that DB may
+  // also be flushed. We may end up flushing many more DBs than needed. It's
+  // suboptimal but still correct.
+  // No need to refcount because a column family drop happens in the write
+  // thread, so it can't happen while we're in the write thread.
+ autovector<ColumnFamilyData*> cfds;
+ if (immutable_db_options_.atomic_flush) {
+ SelectColumnFamiliesForAtomicFlush(&cfds);
+ } else {
+ ColumnFamilyData* cfd_picked = nullptr;
+ SequenceNumber seq_num_for_cf_picked = kMaxSequenceNumber;
+
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ if (cfd->IsDropped()) {
+ continue;
+ }
+ if (!cfd->mem()->IsEmpty() && !cfd->imm()->IsFlushPendingOrRunning()) {
+ // We only consider flush on CFs with bytes in the mutable memtable,
+ // and no immutable memtables for which flush has yet to finish. If
+ // we triggered flush on CFs already trying to flush, we would risk
+ // creating too many immutable memtables leading to write stalls.
+ uint64_t seq = cfd->mem()->GetCreationSeq();
+ if (cfd_picked == nullptr || seq < seq_num_for_cf_picked) {
+ cfd_picked = cfd;
+ seq_num_for_cf_picked = seq;
+ }
+ }
+ }
+ if (cfd_picked != nullptr) {
+ cfds.push_back(cfd_picked);
+ }
+ MaybeFlushStatsCF(&cfds);
+ }
+ if (!cfds.empty()) {
+ ROCKS_LOG_INFO(
+ immutable_db_options_.info_log,
+ "Flushing triggered to alleviate write buffer memory usage. Write "
+ "buffer is using %" ROCKSDB_PRIszt
+ " bytes out of a total of %" ROCKSDB_PRIszt ".",
+ write_buffer_manager_->memory_usage(),
+ write_buffer_manager_->buffer_size());
+ }
+
+ WriteThread::Writer nonmem_w;
+ if (two_write_queues_) {
+ nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_);
+ }
+ for (const auto cfd : cfds) {
+ if (cfd->mem()->IsEmpty()) {
+ continue;
+ }
+ cfd->Ref();
+ status = SwitchMemtable(cfd, write_context);
+ cfd->UnrefAndTryDelete();
+ if (!status.ok()) {
+ break;
+ }
+ }
+ if (two_write_queues_) {
+ nonmem_write_thread_.ExitUnbatched(&nonmem_w);
+ }
+
+ if (status.ok()) {
+ if (immutable_db_options_.atomic_flush) {
+ AssignAtomicFlushSeq(cfds);
+ }
+ for (const auto cfd : cfds) {
+ cfd->imm()->FlushRequested();
+ if (!immutable_db_options_.atomic_flush) {
+ FlushRequest flush_req;
+ GenerateFlushRequest({cfd}, &flush_req);
+ SchedulePendingFlush(flush_req, FlushReason::kWriteBufferManager);
+ }
+ }
+ if (immutable_db_options_.atomic_flush) {
+ FlushRequest flush_req;
+ GenerateFlushRequest(cfds, &flush_req);
+ SchedulePendingFlush(flush_req, FlushReason::kWriteBufferManager);
+ }
+ MaybeScheduleFlushOrCompaction();
+ }
+ return status;
+}
+
+uint64_t DBImpl::GetMaxTotalWalSize() const {
+ uint64_t max_total_wal_size =
+ max_total_wal_size_.load(std::memory_order_acquire);
+ if (max_total_wal_size > 0) {
+ return max_total_wal_size;
+ }
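+  // If max_total_wal_size is not explicitly set, default to four times the
+  // maximum total in-memory state (max_total_in_memory_state_).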
+ return 4 * max_total_in_memory_state_.load(std::memory_order_acquire);
+}
+
+// REQUIRES: mutex_ is held
+// REQUIRES: this thread is currently at the front of the writer queue
+Status DBImpl::DelayWrite(uint64_t num_bytes,
+ const WriteOptions& write_options) {
+ uint64_t time_delayed = 0;
+ bool delayed = false;
+ {
+ StopWatch sw(immutable_db_options_.clock, stats_, WRITE_STALL,
+ &time_delayed);
+ uint64_t delay =
+ write_controller_.GetDelay(immutable_db_options_.clock, num_bytes);
+ TEST_SYNC_POINT("DBImpl::DelayWrite:Start");
+ if (delay > 0) {
+ if (write_options.no_slowdown) {
+ return Status::Incomplete("Write stall");
+ }
+ TEST_SYNC_POINT("DBImpl::DelayWrite:Sleep");
+
+ // Notify write_thread_ about the stall so it can setup a barrier and
+ // fail any pending writers with no_slowdown
+ write_thread_.BeginWriteStall();
+ mutex_.Unlock();
+ TEST_SYNC_POINT("DBImpl::DelayWrite:BeginWriteStallDone");
+ // We will delay the write until we have slept for `delay` microseconds
+ // or we don't need a delay anymore. We check for cancellation every 1ms
+ // (slightly longer because WriteController minimum delay is 1ms, in
+ // case of sleep imprecision, rounding, etc.)
+ const uint64_t kDelayInterval = 1001;
+ uint64_t stall_end = sw.start_time() + delay;
+ while (write_controller_.NeedsDelay()) {
+ if (immutable_db_options_.clock->NowMicros() >= stall_end) {
+ // We already delayed this write `delay` microseconds
+ break;
+ }
+
+ delayed = true;
+ // Sleep for 0.001 seconds
+ immutable_db_options_.clock->SleepForMicroseconds(kDelayInterval);
+ }
+ mutex_.Lock();
+ write_thread_.EndWriteStall();
+ }
+
+    // Don't wait if there's a background error, even if it's a soft error. We
+    // might wait here indefinitely as the background compaction may never
+    // finish successfully, resulting in the stall condition lasting
+    // indefinitely.
+ while (error_handler_.GetBGError().ok() && write_controller_.IsStopped() &&
+ !shutting_down_.load(std::memory_order_relaxed)) {
+ if (write_options.no_slowdown) {
+ return Status::Incomplete("Write stall");
+ }
+ delayed = true;
+
+ // Notify write_thread_ about the stall so it can setup a barrier and
+ // fail any pending writers with no_slowdown
+ write_thread_.BeginWriteStall();
+ TEST_SYNC_POINT("DBImpl::DelayWrite:Wait");
+ bg_cv_.Wait();
+ write_thread_.EndWriteStall();
+ }
+ }
+ assert(!delayed || !write_options.no_slowdown);
+ if (delayed) {
+ default_cf_internal_stats_->AddDBStats(
+ InternalStats::kIntStatsWriteStallMicros, time_delayed);
+ RecordTick(stats_, STALL_MICROS, time_delayed);
+ }
+
+ // If DB is not in read-only mode and write_controller is not stopping
+ // writes, we can ignore any background errors and allow the write to
+ // proceed
+ Status s;
+ if (write_controller_.IsStopped()) {
+ if (!shutting_down_.load(std::memory_order_relaxed)) {
+      // If writes are still stopped and the DB is not shutting down, it means
+      // we bailed due to a background error.
+ s = Status::Incomplete(error_handler_.GetBGError().ToString());
+ } else {
+ s = Status::ShutdownInProgress("stalled writes");
+ }
+ }
+ if (error_handler_.IsDBStopped()) {
+ s = error_handler_.GetBGError();
+ }
+ return s;
+}
+
+// REQUIRES: mutex_ is held
+// REQUIRES: this thread is currently at the front of the writer queue
+void DBImpl::WriteBufferManagerStallWrites() {
+ mutex_.AssertHeld();
+ // First block future writer threads who want to add themselves to the queue
+ // of WriteThread.
+ write_thread_.BeginWriteStall();
+ mutex_.Unlock();
+
+ // Change the state to State::Blocked.
+ static_cast<WBMStallInterface*>(wbm_stall_.get())
+ ->SetState(WBMStallInterface::State::BLOCKED);
+ // Then WriteBufferManager will add DB instance to its queue
+ // and block this thread by calling WBMStallInterface::Block().
+ write_buffer_manager_->BeginWriteStall(wbm_stall_.get());
+ wbm_stall_->Block();
+
+ mutex_.Lock();
+ // Stall has ended. Signal writer threads so that they can add
+ // themselves to the WriteThread queue for writes.
+ write_thread_.EndWriteStall();
+}
+
+Status DBImpl::ThrottleLowPriWritesIfNeeded(const WriteOptions& write_options,
+ WriteBatch* my_batch) {
+ assert(write_options.low_pri);
+ // This is called outside the DB mutex. Although it is safe to make the call,
+ // the consistency condition is not guaranteed to hold. It's OK to live with
+ // it in this case.
+  // If we need to speed up compaction, it means compaction has fallen behind,
+  // so we start rate-limiting low-pri writes.
+ if (write_controller_.NeedSpeedupCompaction()) {
+ if (allow_2pc() && (my_batch->HasCommit() || my_batch->HasRollback())) {
+ // For 2PC, we only rate limit prepare, not commit.
+ return Status::OK();
+ }
+ if (write_options.no_slowdown) {
+ return Status::Incomplete("Low priority write stall");
+ } else {
+ assert(my_batch != nullptr);
+      // Rate limit those writes. The reason we don't wait completely is
+      // that under a heavy write load, low-pri writes may never get a
+      // chance to run. This way we guarantee they still slowly make
+      // progress.
+ PERF_TIMER_GUARD(write_delay_time);
+ write_controller_.low_pri_rate_limiter()->Request(
+ my_batch->GetDataSize(), Env::IO_HIGH, nullptr /* stats */,
+ RateLimiter::OpType::kWrite);
+ }
+ }
+ return Status::OK();
+}
+
+void DBImpl::MaybeFlushStatsCF(autovector<ColumnFamilyData*>* cfds) {
+ assert(cfds != nullptr);
+ if (!cfds->empty() && immutable_db_options_.persist_stats_to_disk) {
+ ColumnFamilyData* cfd_stats =
+ versions_->GetColumnFamilySet()->GetColumnFamily(
+ kPersistentStatsColumnFamilyName);
+ if (cfd_stats != nullptr && !cfd_stats->mem()->IsEmpty()) {
+ for (ColumnFamilyData* cfd : *cfds) {
+ if (cfd == cfd_stats) {
+ // stats CF already included in cfds
+ return;
+ }
+ }
+      // Force flush the stats CF when its log number is less than all other
+      // CFs' log numbers.
+ bool force_flush_stats_cf = true;
+ for (auto* loop_cfd : *versions_->GetColumnFamilySet()) {
+ if (loop_cfd == cfd_stats) {
+ continue;
+ }
+ if (loop_cfd->GetLogNumber() <= cfd_stats->GetLogNumber()) {
+ force_flush_stats_cf = false;
+ }
+ }
+ if (force_flush_stats_cf) {
+ cfds->push_back(cfd_stats);
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "Force flushing stats CF with automated flush "
+ "to avoid holding old logs");
+ }
+ }
+ }
+}
+
+Status DBImpl::TrimMemtableHistory(WriteContext* context) {
+ autovector<ColumnFamilyData*> cfds;
+ ColumnFamilyData* tmp_cfd;
+ while ((tmp_cfd = trim_history_scheduler_.TakeNextColumnFamily()) !=
+ nullptr) {
+ cfds.push_back(tmp_cfd);
+ }
+ for (auto& cfd : cfds) {
+ autovector<MemTable*> to_delete;
+ bool trimmed = cfd->imm()->TrimHistory(&context->memtables_to_free_,
+ cfd->mem()->MemoryAllocatedBytes());
+ if (trimmed) {
+ context->superversion_context.NewSuperVersion();
+ assert(context->superversion_context.new_superversion.get() != nullptr);
+ cfd->InstallSuperVersion(&context->superversion_context, &mutex_);
+ }
+
+ if (cfd->UnrefAndTryDelete()) {
+ cfd = nullptr;
+ }
+ }
+ return Status::OK();
+}
+
+Status DBImpl::ScheduleFlushes(WriteContext* context) {
+ autovector<ColumnFamilyData*> cfds;
+ if (immutable_db_options_.atomic_flush) {
+ SelectColumnFamiliesForAtomicFlush(&cfds);
+ for (auto cfd : cfds) {
+ cfd->Ref();
+ }
+ flush_scheduler_.Clear();
+ } else {
+ ColumnFamilyData* tmp_cfd;
+ while ((tmp_cfd = flush_scheduler_.TakeNextColumnFamily()) != nullptr) {
+ cfds.push_back(tmp_cfd);
+ }
+ MaybeFlushStatsCF(&cfds);
+ }
+ Status status;
+ WriteThread::Writer nonmem_w;
+ if (two_write_queues_) {
+ nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_);
+ }
+
+ for (auto& cfd : cfds) {
+ if (!cfd->mem()->IsEmpty()) {
+ status = SwitchMemtable(cfd, context);
+ }
+ if (cfd->UnrefAndTryDelete()) {
+ cfd = nullptr;
+ }
+ if (!status.ok()) {
+ break;
+ }
+ }
+
+ if (two_write_queues_) {
+ nonmem_write_thread_.ExitUnbatched(&nonmem_w);
+ }
+
+ if (status.ok()) {
+ if (immutable_db_options_.atomic_flush) {
+ AssignAtomicFlushSeq(cfds);
+ FlushRequest flush_req;
+ GenerateFlushRequest(cfds, &flush_req);
+ SchedulePendingFlush(flush_req, FlushReason::kWriteBufferFull);
+ } else {
+ for (auto* cfd : cfds) {
+ FlushRequest flush_req;
+ GenerateFlushRequest({cfd}, &flush_req);
+ SchedulePendingFlush(flush_req, FlushReason::kWriteBufferFull);
+ }
+ }
+ MaybeScheduleFlushOrCompaction();
+ }
+ return status;
+}
+
+#ifndef ROCKSDB_LITE
+void DBImpl::NotifyOnMemTableSealed(ColumnFamilyData* /*cfd*/,
+ const MemTableInfo& mem_table_info) {
+ if (immutable_db_options_.listeners.size() == 0U) {
+ return;
+ }
+ if (shutting_down_.load(std::memory_order_acquire)) {
+ return;
+ }
+
+ mutex_.Unlock();
+ for (auto listener : immutable_db_options_.listeners) {
+ listener->OnMemTableSealed(mem_table_info);
+ }
+ mutex_.Lock();
+}
+#endif // ROCKSDB_LITE
+
+// REQUIRES: mutex_ is held
+// REQUIRES: this thread is currently at the front of the writer queue
+// REQUIRES: this thread is currently at the front of the 2nd writer queue if
+// two_write_queues_ is true (This is to simplify the reasoning.)
+Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) {
+ mutex_.AssertHeld();
+ log::Writer* new_log = nullptr;
+ MemTable* new_mem = nullptr;
+ IOStatus io_s;
+
+ // Recoverable state is persisted in WAL. After memtable switch, WAL might
+ // be deleted, so we write the state to memtable to be persisted as well.
+ Status s = WriteRecoverableState();
+ if (!s.ok()) {
+ return s;
+ }
+
+ // Attempt to switch to a new memtable and trigger flush of old.
+  // Do this without holding the db mutex lock.
+ assert(versions_->prev_log_number() == 0);
+ if (two_write_queues_) {
+ log_write_mutex_.Lock();
+ }
+ bool creating_new_log = !log_empty_;
+ if (two_write_queues_) {
+ log_write_mutex_.Unlock();
+ }
+ uint64_t recycle_log_number = 0;
+ if (creating_new_log && immutable_db_options_.recycle_log_file_num &&
+ !log_recycle_files_.empty()) {
+ recycle_log_number = log_recycle_files_.front();
+ }
+ uint64_t new_log_number =
+ creating_new_log ? versions_->NewFileNumber() : logfile_number_;
+ const MutableCFOptions mutable_cf_options = *cfd->GetLatestMutableCFOptions();
+
+ // Set memtable_info for memtable sealed callback
+#ifndef ROCKSDB_LITE
+ MemTableInfo memtable_info;
+ memtable_info.cf_name = cfd->GetName();
+ memtable_info.first_seqno = cfd->mem()->GetFirstSequenceNumber();
+ memtable_info.earliest_seqno = cfd->mem()->GetEarliestSequenceNumber();
+ memtable_info.num_entries = cfd->mem()->num_entries();
+ memtable_info.num_deletes = cfd->mem()->num_deletes();
+#endif // ROCKSDB_LITE
+ // Log this later after lock release. It may be outdated, e.g., if background
+ // flush happens before logging, but that should be ok.
+ int num_imm_unflushed = cfd->imm()->NumNotFlushed();
+ const auto preallocate_block_size =
+ GetWalPreallocateBlockSize(mutable_cf_options.write_buffer_size);
+ mutex_.Unlock();
+ if (creating_new_log) {
+ // TODO: Write buffer size passed in should be max of all CF's instead
+ // of mutable_cf_options.write_buffer_size.
+ io_s = CreateWAL(new_log_number, recycle_log_number, preallocate_block_size,
+ &new_log);
+ if (s.ok()) {
+ s = io_s;
+ }
+ }
+ if (s.ok()) {
+ SequenceNumber seq = versions_->LastSequence();
+ new_mem = cfd->ConstructNewMemtable(mutable_cf_options, seq);
+ context->superversion_context.NewSuperVersion();
+ }
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "[%s] New memtable created with log file: #%" PRIu64
+ ". Immutable memtables: %d.\n",
+ cfd->GetName().c_str(), new_log_number, num_imm_unflushed);
+ // There should be no concurrent write as the thread is at the front of
+ // writer queue
+ cfd->mem()->ConstructFragmentedRangeTombstones();
+
+ mutex_.Lock();
+ if (recycle_log_number != 0) {
+ // Since renaming the file is done outside DB mutex, we need to ensure
+ // concurrent full purges don't delete the file while we're recycling it.
+ // To achieve that we hold the old log number in the recyclable list until
+ // after it has been renamed.
+ assert(log_recycle_files_.front() == recycle_log_number);
+ log_recycle_files_.pop_front();
+ }
+ if (s.ok() && creating_new_log) {
+ InstrumentedMutexLock l(&log_write_mutex_);
+ assert(new_log != nullptr);
+ if (!logs_.empty()) {
+      // Always flush the buffer of the last log before switching to a new one.
+ log::Writer* cur_log_writer = logs_.back().writer;
+ if (error_handler_.IsRecoveryInProgress()) {
+ // In recovery path, we force another try of writing WAL buffer.
+ cur_log_writer->file()->reset_seen_error();
+ }
+ io_s = cur_log_writer->WriteBuffer();
+ if (s.ok()) {
+ s = io_s;
+ }
+ if (!s.ok()) {
+ ROCKS_LOG_WARN(immutable_db_options_.info_log,
+ "[%s] Failed to switch from #%" PRIu64 " to #%" PRIu64
+ " WAL file\n",
+ cfd->GetName().c_str(), cur_log_writer->get_log_number(),
+ new_log_number);
+ }
+ }
+ if (s.ok()) {
+ logfile_number_ = new_log_number;
+ log_empty_ = true;
+ log_dir_synced_ = false;
+ logs_.emplace_back(logfile_number_, new_log);
+ alive_log_files_.push_back(LogFileNumberSize(logfile_number_));
+ }
+ }
+
+ if (!s.ok()) {
+ // how do we fail if we're not creating new log?
+ assert(creating_new_log);
+ delete new_mem;
+ delete new_log;
+ context->superversion_context.new_superversion.reset();
+ // We may have lost data from the WritableFileBuffer in-memory buffer for
+ // the current log, so treat it as a fatal error and set bg_error
+ if (!io_s.ok()) {
+ error_handler_.SetBGError(io_s, BackgroundErrorReason::kMemTable);
+ } else {
+ error_handler_.SetBGError(s, BackgroundErrorReason::kMemTable);
+ }
+ // Read back bg_error in order to get the right severity
+ s = error_handler_.GetBGError();
+ return s;
+ }
+
+ bool empty_cf_updated = false;
+ if (immutable_db_options_.track_and_verify_wals_in_manifest &&
+ !immutable_db_options_.allow_2pc && creating_new_log) {
+ // In non-2pc mode, WALs become obsolete if they do not contain unflushed
+ // data. Updating the empty CF's log number might cause some WALs to become
+ // obsolete. So we should track the WAL obsoletion event before actually
+ // updating the empty CF's log number.
+ uint64_t min_wal_number_to_keep =
+ versions_->PreComputeMinLogNumberWithUnflushedData(logfile_number_);
+ if (min_wal_number_to_keep >
+ versions_->GetWalSet().GetMinWalNumberToKeep()) {
+ // Get a snapshot of the empty column families.
+      // LogAndApply may release and reacquire the db mutex. During that
+      // period, a column family may become empty (e.g. its flush succeeds),
+      // which would affect the computed min_log_number_to_keep, so we take a
+      // snapshot for consistency of the column family data status. If a
+      // column family becomes non-empty afterwards, its active log
+ // should still be the created new log, so the min_log_number_to_keep is
+ // not affected.
+ autovector<ColumnFamilyData*> empty_cfs;
+ for (auto cf : *versions_->GetColumnFamilySet()) {
+ if (cf->IsEmpty()) {
+ empty_cfs.push_back(cf);
+ }
+ }
+
+ VersionEdit wal_deletion;
+ wal_deletion.DeleteWalsBefore(min_wal_number_to_keep);
+ s = versions_->LogAndApplyToDefaultColumnFamily(&wal_deletion, &mutex_,
+ directories_.GetDbDir());
+ if (!s.ok() && versions_->io_status().IsIOError()) {
+ s = error_handler_.SetBGError(versions_->io_status(),
+ BackgroundErrorReason::kManifestWrite);
+ }
+ if (!s.ok()) {
+ return s;
+ }
+
+ for (auto cf : empty_cfs) {
+ if (cf->IsEmpty()) {
+ cf->SetLogNumber(logfile_number_);
+ // MEMPURGE: No need to change this, because new adds
+ // should still receive new sequence numbers.
+ cf->mem()->SetCreationSeq(versions_->LastSequence());
+ } // cf may become non-empty.
+ }
+ empty_cf_updated = true;
+ }
+ }
+ if (!empty_cf_updated) {
+ for (auto cf : *versions_->GetColumnFamilySet()) {
+ // all this is just optimization to delete logs that
+ // are no longer needed -- if CF is empty, that means it
+ // doesn't need that particular log to stay alive, so we just
+ // advance the log number. no need to persist this in the manifest
+ if (cf->IsEmpty()) {
+ if (creating_new_log) {
+ cf->SetLogNumber(logfile_number_);
+ }
+ cf->mem()->SetCreationSeq(versions_->LastSequence());
+ }
+ }
+ }
+
+ cfd->mem()->SetNextLogNumber(logfile_number_);
+ assert(new_mem != nullptr);
+ cfd->imm()->Add(cfd->mem(), &context->memtables_to_free_);
+ new_mem->Ref();
+ cfd->SetMemtable(new_mem);
+ InstallSuperVersionAndScheduleWork(cfd, &context->superversion_context,
+ mutable_cf_options);
+
+#ifndef ROCKSDB_LITE
+ // Notify client that memtable is sealed, now that we have successfully
+ // installed a new memtable
+ NotifyOnMemTableSealed(cfd, memtable_info);
+#endif // ROCKSDB_LITE
+  // It is possible that we got here without checking the value of io_s, but
+  // that is okay. If we did, it most likely means that s was already an error.
+  // In any case, ignore any unchecked error for io_s here.
+ io_s.PermitUncheckedError();
+ return s;
+}
+
+size_t DBImpl::GetWalPreallocateBlockSize(uint64_t write_buffer_size) const {
+ mutex_.AssertHeld();
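+  // Preallocate roughly 1.1x the write buffer size, then clamp it by the
+  // WAL/DB-wide size limits checked below.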
+ size_t bsize =
+ static_cast<size_t>(write_buffer_size / 10 + write_buffer_size);
+ // Some users might set very high write_buffer_size and rely on
+ // max_total_wal_size or other parameters to control the WAL size.
+ if (mutable_db_options_.max_total_wal_size > 0) {
+ bsize = std::min<size_t>(
+ bsize, static_cast<size_t>(mutable_db_options_.max_total_wal_size));
+ }
+ if (immutable_db_options_.db_write_buffer_size > 0) {
+ bsize = std::min<size_t>(bsize, immutable_db_options_.db_write_buffer_size);
+ }
+ if (immutable_db_options_.write_buffer_manager &&
+ immutable_db_options_.write_buffer_manager->enabled()) {
+ bsize = std::min<size_t>(
+ bsize, immutable_db_options_.write_buffer_manager->buffer_size());
+ }
+
+ return bsize;
+}
+
+// Default implementations of convenience methods that subclasses of DB
+// can call if they wish
+Status DB::Put(const WriteOptions& opt, ColumnFamilyHandle* column_family,
+ const Slice& key, const Slice& value) {
+ // Pre-allocate size of write batch conservatively.
+ // 8 bytes are taken by header, 4 bytes for count, 1 byte for type,
+ // and we allocate 11 extra bytes for key length, as well as value length.
+ WriteBatch batch(key.size() + value.size() + 24, 0 /* max_bytes */,
+ opt.protection_bytes_per_key, 0 /* default_cf_ts_sz */);
+ Status s = batch.Put(column_family, key, value);
+ if (!s.ok()) {
+ return s;
+ }
+ return Write(opt, &batch);
+}
+
+Status DB::Put(const WriteOptions& opt, ColumnFamilyHandle* column_family,
+ const Slice& key, const Slice& ts, const Slice& value) {
+ ColumnFamilyHandle* default_cf = DefaultColumnFamily();
+ assert(default_cf);
+ const Comparator* const default_cf_ucmp = default_cf->GetComparator();
+ assert(default_cf_ucmp);
+ WriteBatch batch(0 /* reserved_bytes */, 0 /* max_bytes */,
+ opt.protection_bytes_per_key,
+ default_cf_ucmp->timestamp_size());
+ Status s = batch.Put(column_family, key, ts, value);
+ if (!s.ok()) {
+ return s;
+ }
+ return Write(opt, &batch);
+}
+
+Status DB::PutEntity(const WriteOptions& options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ const WideColumns& columns) {
+ const ColumnFamilyHandle* const default_cf = DefaultColumnFamily();
+ assert(default_cf);
+
+ const Comparator* const default_cf_ucmp = default_cf->GetComparator();
+ assert(default_cf_ucmp);
+
+ WriteBatch batch(/* reserved_bytes */ 0, /* max_bytes */ 0,
+ options.protection_bytes_per_key,
+ default_cf_ucmp->timestamp_size());
+
+ const Status s = batch.PutEntity(column_family, key, columns);
+ if (!s.ok()) {
+ return s;
+ }
+
+ return Write(options, &batch);
+}
+
+Status DB::Delete(const WriteOptions& opt, ColumnFamilyHandle* column_family,
+ const Slice& key) {
+ WriteBatch batch(0 /* reserved_bytes */, 0 /* max_bytes */,
+ opt.protection_bytes_per_key, 0 /* default_cf_ts_sz */);
+ Status s = batch.Delete(column_family, key);
+ if (!s.ok()) {
+ return s;
+ }
+ return Write(opt, &batch);
+}
+
+Status DB::Delete(const WriteOptions& opt, ColumnFamilyHandle* column_family,
+ const Slice& key, const Slice& ts) {
+ ColumnFamilyHandle* default_cf = DefaultColumnFamily();
+ assert(default_cf);
+ const Comparator* const default_cf_ucmp = default_cf->GetComparator();
+ assert(default_cf_ucmp);
+ WriteBatch batch(0 /* reserved_bytes */, 0 /* max_bytes */,
+ opt.protection_bytes_per_key,
+ default_cf_ucmp->timestamp_size());
+ Status s = batch.Delete(column_family, key, ts);
+ if (!s.ok()) {
+ return s;
+ }
+ return Write(opt, &batch);
+}
+
+Status DB::SingleDelete(const WriteOptions& opt,
+ ColumnFamilyHandle* column_family, const Slice& key) {
+ WriteBatch batch(0 /* reserved_bytes */, 0 /* max_bytes */,
+ opt.protection_bytes_per_key, 0 /* default_cf_ts_sz */);
+ Status s = batch.SingleDelete(column_family, key);
+ if (!s.ok()) {
+ return s;
+ }
+ return Write(opt, &batch);
+}
+
+Status DB::SingleDelete(const WriteOptions& opt,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ const Slice& ts) {
+ ColumnFamilyHandle* default_cf = DefaultColumnFamily();
+ assert(default_cf);
+ const Comparator* const default_cf_ucmp = default_cf->GetComparator();
+ assert(default_cf_ucmp);
+ WriteBatch batch(0 /* reserved_bytes */, 0 /* max_bytes */,
+ opt.protection_bytes_per_key,
+ default_cf_ucmp->timestamp_size());
+ Status s = batch.SingleDelete(column_family, key, ts);
+ if (!s.ok()) {
+ return s;
+ }
+ return Write(opt, &batch);
+}
+
+Status DB::DeleteRange(const WriteOptions& opt,
+ ColumnFamilyHandle* column_family,
+ const Slice& begin_key, const Slice& end_key) {
+ WriteBatch batch(0 /* reserved_bytes */, 0 /* max_bytes */,
+ opt.protection_bytes_per_key, 0 /* default_cf_ts_sz */);
+ Status s = batch.DeleteRange(column_family, begin_key, end_key);
+ if (!s.ok()) {
+ return s;
+ }
+ return Write(opt, &batch);
+}
+
+Status DB::DeleteRange(const WriteOptions& opt,
+ ColumnFamilyHandle* column_family,
+ const Slice& begin_key, const Slice& end_key,
+ const Slice& ts) {
+ ColumnFamilyHandle* default_cf = DefaultColumnFamily();
+ assert(default_cf);
+ const Comparator* const default_cf_ucmp = default_cf->GetComparator();
+ assert(default_cf_ucmp);
+ WriteBatch batch(0 /* reserved_bytes */, 0 /* max_bytes */,
+ opt.protection_bytes_per_key,
+ default_cf_ucmp->timestamp_size());
+ Status s = batch.DeleteRange(column_family, begin_key, end_key, ts);
+ if (!s.ok()) {
+ return s;
+ }
+ return Write(opt, &batch);
+}
+
+Status DB::Merge(const WriteOptions& opt, ColumnFamilyHandle* column_family,
+ const Slice& key, const Slice& value) {
+ WriteBatch batch(0 /* reserved_bytes */, 0 /* max_bytes */,
+ opt.protection_bytes_per_key, 0 /* default_cf_ts_sz */);
+ Status s = batch.Merge(column_family, key, value);
+ if (!s.ok()) {
+ return s;
+ }
+ return Write(opt, &batch);
+}
+
+Status DB::Merge(const WriteOptions& opt, ColumnFamilyHandle* column_family,
+ const Slice& key, const Slice& ts, const Slice& value) {
+ ColumnFamilyHandle* default_cf = DefaultColumnFamily();
+ assert(default_cf);
+ const Comparator* const default_cf_ucmp = default_cf->GetComparator();
+ assert(default_cf_ucmp);
+ WriteBatch batch(0 /* reserved_bytes */, 0 /* max_bytes */,
+ opt.protection_bytes_per_key,
+ default_cf_ucmp->timestamp_size());
+ Status s = batch.Merge(column_family, key, ts, value);
+ if (!s.ok()) {
+ return s;
+ }
+ return Write(opt, &batch);
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/db_info_dumper.cc b/src/rocksdb/db/db_info_dumper.cc
new file mode 100644
index 000000000..be8d5bee1
--- /dev/null
+++ b/src/rocksdb/db/db_info_dumper.cc
@@ -0,0 +1,147 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/db_info_dumper.h"
+
+#include <stdio.h>
+
+#include <algorithm>
+#include <cinttypes>
+#include <string>
+#include <vector>
+
+#include "file/filename.h"
+#include "rocksdb/env.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+void DumpDBFileSummary(const ImmutableDBOptions& options,
+ const std::string& dbname,
+ const std::string& session_id) {
+ if (options.info_log == nullptr) {
+ return;
+ }
+
+ auto* env = options.env;
+ uint64_t number = 0;
+ FileType type = kInfoLogFile;
+
+ std::vector<std::string> files;
+ uint64_t file_num = 0;
+ uint64_t file_size;
+ std::string file_info, wal_info;
+
+ Header(options.info_log, "DB SUMMARY\n");
+ Header(options.info_log, "DB Session ID: %s\n", session_id.c_str());
+
+ Status s;
+ // Get files in dbname dir
+ s = env->GetChildren(dbname, &files);
+ if (!s.ok()) {
+ Error(options.info_log, "Error when reading %s dir %s\n", dbname.c_str(),
+ s.ToString().c_str());
+ }
+ std::sort(files.begin(), files.end());
+ for (const std::string& file : files) {
+ if (!ParseFileName(file, &number, &type)) {
+ continue;
+ }
+ switch (type) {
+ case kCurrentFile:
+ Header(options.info_log, "CURRENT file: %s\n", file.c_str());
+ break;
+ case kIdentityFile:
+ Header(options.info_log, "IDENTITY file: %s\n", file.c_str());
+ break;
+ case kDescriptorFile:
+ s = env->GetFileSize(dbname + "/" + file, &file_size);
+ if (s.ok()) {
+ Header(options.info_log,
+ "MANIFEST file: %s size: %" PRIu64 " Bytes\n", file.c_str(),
+ file_size);
+ } else {
+ Error(options.info_log,
+ "Error when reading MANIFEST file: %s/%s %s\n", dbname.c_str(),
+ file.c_str(), s.ToString().c_str());
+ }
+ break;
+ case kWalFile:
+ s = env->GetFileSize(dbname + "/" + file, &file_size);
+ if (s.ok()) {
+ wal_info.append(file)
+ .append(" size: ")
+ .append(std::to_string(file_size))
+ .append(" ; ");
+ } else {
+ Error(options.info_log, "Error when reading LOG file: %s/%s %s\n",
+ dbname.c_str(), file.c_str(), s.ToString().c_str());
+ }
+ break;
+ case kTableFile:
+ if (++file_num < 10) {
+ file_info.append(file).append(" ");
+ }
+ break;
+ default:
+ break;
+ }
+ }
+
+ // Get sst files in db_path dir
+ for (auto& db_path : options.db_paths) {
+ if (dbname.compare(db_path.path) != 0) {
+ s = env->GetChildren(db_path.path, &files);
+ if (!s.ok()) {
+ Error(options.info_log, "Error when reading %s dir %s\n",
+ db_path.path.c_str(), s.ToString().c_str());
+ continue;
+ }
+ std::sort(files.begin(), files.end());
+ for (const std::string& file : files) {
+ if (ParseFileName(file, &number, &type)) {
+ if (type == kTableFile && ++file_num < 10) {
+ file_info.append(file).append(" ");
+ }
+ }
+ }
+ }
+ Header(options.info_log,
+ "SST files in %s dir, Total Num: %" PRIu64 ", files: %s\n",
+ db_path.path.c_str(), file_num, file_info.c_str());
+ file_num = 0;
+ file_info.clear();
+ }
+
+ // Get wal file in wal_dir
+ const auto& wal_dir = options.GetWalDir(dbname);
+ if (!options.IsWalDirSameAsDBPath(dbname)) {
+ s = env->GetChildren(wal_dir, &files);
+ if (!s.ok()) {
+ Error(options.info_log, "Error when reading %s dir %s\n", wal_dir.c_str(),
+ s.ToString().c_str());
+ return;
+ }
+ wal_info.clear();
+ for (const std::string& file : files) {
+ if (ParseFileName(file, &number, &type)) {
+ if (type == kWalFile) {
+ s = env->GetFileSize(wal_dir + "/" + file, &file_size);
+ if (s.ok()) {
+ wal_info.append(file)
+ .append(" size: ")
+ .append(std::to_string(file_size))
+ .append(" ; ");
+ } else {
+ Error(options.info_log, "Error when reading LOG file %s/%s %s\n",
+ wal_dir.c_str(), file.c_str(), s.ToString().c_str());
+ }
+ }
+ }
+ }
+ }
+ Header(options.info_log, "Write Ahead Log file in %s: %s\n", wal_dir.c_str(),
+ wal_info.c_str());
+}
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/db_info_dumper.h b/src/rocksdb/db/db_info_dumper.h
new file mode 100644
index 000000000..f518e840f
--- /dev/null
+++ b/src/rocksdb/db/db_info_dumper.h
@@ -0,0 +1,15 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include <string>
+
+#include "options/db_options.h"
+
+namespace ROCKSDB_NAMESPACE {
+void DumpDBFileSummary(const ImmutableDBOptions& options,
+ const std::string& dbname,
+ const std::string& session_id = "");
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/db_inplace_update_test.cc b/src/rocksdb/db/db_inplace_update_test.cc
new file mode 100644
index 000000000..3921a3b00
--- /dev/null
+++ b/src/rocksdb/db/db_inplace_update_test.cc
@@ -0,0 +1,262 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#include "db/db_test_util.h"
+#include "port/stack_trace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBTestInPlaceUpdate : public DBTestBase {
+ public:
+ DBTestInPlaceUpdate()
+ : DBTestBase("db_inplace_update_test", /*env_do_fsync=*/true) {}
+};
+
+TEST_F(DBTestInPlaceUpdate, InPlaceUpdate) {
+ do {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.inplace_update_support = true;
+ options.env = env_;
+ options.write_buffer_size = 100000;
+ options.allow_concurrent_memtable_write = false;
+ Reopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // Update key with values of smaller size
+ int numValues = 10;
+ for (int i = numValues; i > 0; i--) {
+ std::string value = DummyString(i, 'a');
+ ASSERT_OK(Put(1, "key", value));
+ ASSERT_EQ(value, Get(1, "key"));
+ }
+
+ // Only 1 instance for that key.
+ validateNumberOfEntries(1, 1);
+ } while (ChangeCompactOptions());
+}
+
+TEST_F(DBTestInPlaceUpdate, InPlaceUpdateLargeNewValue) {
+ do {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.inplace_update_support = true;
+ options.env = env_;
+ options.write_buffer_size = 100000;
+ options.allow_concurrent_memtable_write = false;
+ Reopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // Update key with values of larger size
+ int numValues = 10;
+ for (int i = 0; i < numValues; i++) {
+ std::string value = DummyString(i, 'a');
+ ASSERT_OK(Put(1, "key", value));
+ ASSERT_EQ(value, Get(1, "key"));
+ }
+
+ // All 10 updates exist in the internal iterator
+ validateNumberOfEntries(numValues, 1);
+ } while (ChangeCompactOptions());
+}
+
+TEST_F(DBTestInPlaceUpdate, InPlaceUpdateEntitySmallerNewValue) {
+ do {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.inplace_update_support = true;
+ options.env = env_;
+ options.allow_concurrent_memtable_write = false;
+
+ Reopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // Update key with values of smaller size
+ constexpr int num_values = 10;
+ for (int i = num_values; i > 0; --i) {
+ constexpr char key[] = "key";
+ const std::string value = DummyString(i, 'a');
+ WideColumns wide_columns{{"attr", value}};
+
+ ASSERT_OK(db_->PutEntity(WriteOptions(), handles_[1], key, wide_columns));
+ // TODO: use Get to check entity once it's supported
+ }
+
+ // Only 1 instance for that key.
+ validateNumberOfEntries(1, 1);
+ } while (ChangeCompactOptions());
+}
+
+TEST_F(DBTestInPlaceUpdate, InPlaceUpdateEntityLargerNewValue) {
+ do {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.inplace_update_support = true;
+ options.env = env_;
+ options.allow_concurrent_memtable_write = false;
+
+ Reopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // Update key with values of larger size
+ constexpr int num_values = 10;
+ for (int i = 0; i < num_values; ++i) {
+ constexpr char key[] = "key";
+ const std::string value = DummyString(i, 'a');
+ WideColumns wide_columns{{"attr", value}};
+
+ ASSERT_OK(db_->PutEntity(WriteOptions(), handles_[1], key, wide_columns));
+ // TODO: use Get to check entity once it's supported
+ }
+
+ // All 10 updates exist in the internal iterator
+ validateNumberOfEntries(num_values, 1);
+ } while (ChangeCompactOptions());
+}
+
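+// The remaining tests exercise Options::inplace_callback, which lets the
+// application decide per update whether to modify the value in place, replace
+// it, or do nothing; the updateInPlace* helpers referenced below come from the
+// test fixture.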
+TEST_F(DBTestInPlaceUpdate, InPlaceUpdateCallbackSmallerSize) {
+ do {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.inplace_update_support = true;
+
+ options.env = env_;
+ options.write_buffer_size = 100000;
+ options.inplace_callback =
+ ROCKSDB_NAMESPACE::DBTestInPlaceUpdate::updateInPlaceSmallerSize;
+ options.allow_concurrent_memtable_write = false;
+ Reopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // Update key with values of smaller size
+ int numValues = 10;
+ ASSERT_OK(Put(1, "key", DummyString(numValues, 'a')));
+ ASSERT_EQ(DummyString(numValues, 'c'), Get(1, "key"));
+
+ for (int i = numValues; i > 0; i--) {
+ ASSERT_OK(Put(1, "key", DummyString(i, 'a')));
+ ASSERT_EQ(DummyString(i - 1, 'b'), Get(1, "key"));
+ }
+
+ // Only 1 instance for that key.
+ validateNumberOfEntries(1, 1);
+ } while (ChangeCompactOptions());
+}
+
+TEST_F(DBTestInPlaceUpdate, InPlaceUpdateCallbackSmallerVarintSize) {
+ do {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.inplace_update_support = true;
+
+ options.env = env_;
+ options.write_buffer_size = 100000;
+ options.inplace_callback =
+ ROCKSDB_NAMESPACE::DBTestInPlaceUpdate::updateInPlaceSmallerVarintSize;
+ options.allow_concurrent_memtable_write = false;
+ Reopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // Update key with values of smaller varint size
+ int numValues = 265;
+ ASSERT_OK(Put(1, "key", DummyString(numValues, 'a')));
+ ASSERT_EQ(DummyString(numValues, 'c'), Get(1, "key"));
+
+ for (int i = numValues; i > 0; i--) {
+ ASSERT_OK(Put(1, "key", DummyString(i, 'a')));
+ ASSERT_EQ(DummyString(1, 'b'), Get(1, "key"));
+ }
+
+ // Only 1 instance for that key.
+ validateNumberOfEntries(1, 1);
+ } while (ChangeCompactOptions());
+}
+
+TEST_F(DBTestInPlaceUpdate, InPlaceUpdateCallbackLargeNewValue) {
+ do {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.inplace_update_support = true;
+
+ options.env = env_;
+ options.write_buffer_size = 100000;
+ options.inplace_callback =
+ ROCKSDB_NAMESPACE::DBTestInPlaceUpdate::updateInPlaceLargerSize;
+ options.allow_concurrent_memtable_write = false;
+ Reopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // Update key with values of larger size
+ int numValues = 10;
+ for (int i = 0; i < numValues; i++) {
+ ASSERT_OK(Put(1, "key", DummyString(i, 'a')));
+ ASSERT_EQ(DummyString(i, 'c'), Get(1, "key"));
+ }
+
+    // No in-place updates; every update becomes a regular put with a new
+    // sequence number.
+ // All 10 updates exist in the internal iterator
+ validateNumberOfEntries(numValues, 1);
+ } while (ChangeCompactOptions());
+}
+
+TEST_F(DBTestInPlaceUpdate, InPlaceUpdateCallbackNoAction) {
+ do {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.inplace_update_support = true;
+
+ options.env = env_;
+ options.write_buffer_size = 100000;
+ options.inplace_callback =
+ ROCKSDB_NAMESPACE::DBTestInPlaceUpdate::updateInPlaceNoAction;
+ options.allow_concurrent_memtable_write = false;
+ Reopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // Callback function requests no actions from db
+ ASSERT_OK(Put(1, "key", DummyString(1, 'a')));
+ ASSERT_EQ(Get(1, "key"), "NOT_FOUND");
+ } while (ChangeCompactOptions());
+}
+
+TEST_F(DBTestInPlaceUpdate, InPlaceUpdateAndSnapshot) {
+ do {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.inplace_update_support = true;
+ options.env = env_;
+ options.write_buffer_size = 100000;
+ options.allow_concurrent_memtable_write = false;
+ Reopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // Update key with values of smaller size, and
+ // run GetSnapshot and ReleaseSnapshot
+ int numValues = 2;
+ for (int i = numValues; i > 0; i--) {
+ const Snapshot* s = db_->GetSnapshot();
+ ASSERT_EQ(nullptr, s);
+ std::string value = DummyString(i, 'a');
+ ASSERT_OK(Put(1, "key", value));
+ ASSERT_EQ(value, Get(1, "key"));
+ // release s (nullptr)
+ db_->ReleaseSnapshot(s);
+ }
+
+ // Only 1 instance for that key.
+ validateNumberOfEntries(1, 1);
+ } while (ChangeCompactOptions());
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_io_failure_test.cc b/src/rocksdb/db/db_io_failure_test.cc
new file mode 100644
index 000000000..2a405fd38
--- /dev/null
+++ b/src/rocksdb/db/db_io_failure_test.cc
@@ -0,0 +1,593 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/db_test_util.h"
+#include "port/stack_trace.h"
+#include "test_util/testutil.h"
+#include "util/random.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBIOFailureTest : public DBTestBase {
+ public:
+ DBIOFailureTest() : DBTestBase("db_io_failure_test", /*env_do_fsync=*/true) {}
+};
+
+#ifndef ROCKSDB_LITE
+// Check that number of files does not grow when writes are dropped
+TEST_F(DBIOFailureTest, DropWrites) {
+ do {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.paranoid_checks = false;
+ Reopen(options);
+
+ ASSERT_OK(Put("foo", "v1"));
+ ASSERT_EQ("v1", Get("foo"));
+ Compact("a", "z");
+ const size_t num_files = CountFiles();
+ // Force out-of-space errors
+ env_->drop_writes_.store(true, std::memory_order_release);
+ env_->sleep_counter_.Reset();
+ env_->SetMockSleep();
+ for (int i = 0; i < 5; i++) {
+ if (option_config_ != kUniversalCompactionMultiLevel &&
+ option_config_ != kUniversalSubcompactions) {
+ for (int level = 0; level < dbfull()->NumberLevels(); level++) {
+ if (level > 0 && level == dbfull()->NumberLevels() - 1) {
+ break;
+ }
+ Status s =
+ dbfull()->TEST_CompactRange(level, nullptr, nullptr, nullptr,
+ true /* disallow trivial move */);
+ ASSERT_TRUE(s.ok() || s.IsCorruption());
+ }
+ } else {
+ Status s =
+ dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+ ASSERT_TRUE(s.ok() || s.IsCorruption());
+ }
+ }
+
+ std::string property_value;
+ ASSERT_TRUE(db_->GetProperty("rocksdb.background-errors", &property_value));
+ ASSERT_EQ("5", property_value);
+
+ env_->drop_writes_.store(false, std::memory_order_release);
+ const size_t count = CountFiles();
+ ASSERT_LT(count, num_files + 3);
+
+ // Check that compaction attempts slept after errors
+ // TODO @krad: Figure out why ASSERT_EQ 5 keeps failing in certain compiler
+ // versions
+ ASSERT_GE(env_->sleep_counter_.Read(), 4);
+ } while (ChangeCompactOptions());
+}
+
+// Check background error counter bumped on flush failures.
+TEST_F(DBIOFailureTest, DropWritesFlush) {
+ do {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.max_background_flushes = 1;
+ Reopen(options);
+
+ ASSERT_OK(Put("foo", "v1"));
+ // Force out-of-space errors
+ env_->drop_writes_.store(true, std::memory_order_release);
+
+ std::string property_value;
+ // Background error count is 0 now.
+ ASSERT_TRUE(db_->GetProperty("rocksdb.background-errors", &property_value));
+ ASSERT_EQ("0", property_value);
+
+    // The flush fails with a Corruption status: with writes dropped, the
+    // resulting SST comes out truncated ("file is too short").
+ ASSERT_TRUE(dbfull()->TEST_FlushMemTable(true).IsCorruption());
+
+ ASSERT_TRUE(db_->GetProperty("rocksdb.background-errors", &property_value));
+ ASSERT_EQ("1", property_value);
+
+ env_->drop_writes_.store(false, std::memory_order_release);
+ } while (ChangeCompactOptions());
+}
+
+// Check that CompactRange() returns failure if there is not enough space left
+// on device
+TEST_F(DBIOFailureTest, NoSpaceCompactRange) {
+ do {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.disable_auto_compactions = true;
+ Reopen(options);
+
+ // generate 5 tables
+ for (int i = 0; i < 5; ++i) {
+ ASSERT_OK(Put(Key(i), Key(i) + "v"));
+ ASSERT_OK(Flush());
+ }
+
+ // Force out-of-space errors
+ env_->no_space_.store(true, std::memory_order_release);
+
+ Status s = dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr,
+ true /* disallow trivial move */);
+ ASSERT_TRUE(s.IsIOError());
+ ASSERT_TRUE(s.IsNoSpace());
+
+ env_->no_space_.store(false, std::memory_order_release);
+ } while (ChangeCompactOptions());
+}
+#endif // ROCKSDB_LITE
+
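+// Puts should start failing, without crashing the process, once the env
+// refuses to create writable files.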
+TEST_F(DBIOFailureTest, NonWritableFileSystem) {
+ do {
+ Options options = CurrentOptions();
+ options.write_buffer_size = 4096;
+ options.arena_block_size = 4096;
+ options.env = env_;
+ Reopen(options);
+ ASSERT_OK(Put("foo", "v1"));
+ env_->non_writeable_rate_.store(100);
+ std::string big(100000, 'x');
+ int errors = 0;
+ for (int i = 0; i < 20; i++) {
+ if (!Put("foo", big).ok()) {
+ errors++;
+ env_->SleepForMicroseconds(100000);
+ }
+ }
+ ASSERT_GT(errors, 0);
+ env_->non_writeable_rate_.store(0);
+ } while (ChangeCompactOptions());
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBIOFailureTest, ManifestWriteError) {
+ // Test for the following problem:
+ // (a) Compaction produces file F
+ // (b) Log record containing F is written to MANIFEST file, but Sync() fails
+ // (c) GC deletes F
+ // (d) After reopening DB, reads fail since deleted F is named in log record
+
+ // We iterate twice. In the second iteration, everything is the
+ // same except the log record never makes it to the MANIFEST file.
+ for (int iter = 0; iter < 2; iter++) {
+ std::atomic<bool>* error_type = (iter == 0) ? &env_->manifest_sync_error_
+ : &env_->manifest_write_error_;
+
+ // Insert foo=>bar mapping
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ options.error_if_exists = false;
+ options.paranoid_checks = true;
+ DestroyAndReopen(options);
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_EQ("bar", Get("foo"));
+
+ // Memtable compaction (will succeed)
+ ASSERT_OK(Flush());
+ ASSERT_EQ("bar", Get("foo"));
+ const int last = 2;
+ MoveFilesToLevel(2);
+ ASSERT_EQ(NumTableFilesAtLevel(last), 1); // foo=>bar is now in last level
+
+ // Merging compaction (will fail)
+ error_type->store(true, std::memory_order_release);
+ ASSERT_NOK(
+ dbfull()->TEST_CompactRange(last, nullptr, nullptr)); // Should fail
+ ASSERT_EQ("bar", Get("foo"));
+
+ error_type->store(false, std::memory_order_release);
+
+ // Since paranoid_checks=true, writes should fail
+ ASSERT_NOK(Put("foo2", "bar2"));
+
+ // Recovery: should not lose data
+ ASSERT_EQ("bar", Get("foo"));
+
+ // Try again with paranoid_checks=false
+ Close();
+ options.paranoid_checks = false;
+ Reopen(options);
+
+ // Merging compaction (will fail)
+ error_type->store(true, std::memory_order_release);
+ Status s =
+ dbfull()->TEST_CompactRange(last, nullptr, nullptr); // Should fail
+ if (iter == 0) {
+ ASSERT_OK(s);
+ } else {
+ ASSERT_TRUE(s.IsIOError());
+ }
+ ASSERT_EQ("bar", Get("foo"));
+
+ // Recovery: should not lose data
+ error_type->store(false, std::memory_order_release);
+ Reopen(options);
+ ASSERT_EQ("bar", Get("foo"));
+
+ // Since paranoid_checks=false, writes should succeed
+ ASSERT_OK(Put("foo2", "bar2"));
+ ASSERT_EQ("bar", Get("foo"));
+ ASSERT_EQ("bar2", Get("foo2"));
+ }
+}
+
+TEST_F(DBIOFailureTest, PutFailsParanoid) {
+ // Test the following:
+ // (a) A random put fails in paranoid mode (simulate by sync fail)
+ // (b) All other puts have to fail, even if writes would succeed
+ // (c) All of that should happen ONLY if paranoid_checks = true
+
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ options.error_if_exists = false;
+ options.paranoid_checks = true;
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ ASSERT_OK(Put(1, "foo", "bar"));
+ ASSERT_OK(Put(1, "foo1", "bar1"));
+ // simulate error
+ env_->log_write_error_.store(true, std::memory_order_release);
+ ASSERT_NOK(Put(1, "foo2", "bar2"));
+ env_->log_write_error_.store(false, std::memory_order_release);
+ // the next put should fail, too
+ ASSERT_NOK(Put(1, "foo3", "bar3"));
+ // but we're still able to read
+ ASSERT_EQ("bar", Get(1, "foo"));
+
+ // do the same thing with paranoid checks off
+ options.paranoid_checks = false;
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ ASSERT_OK(Put(1, "foo", "bar"));
+ ASSERT_OK(Put(1, "foo1", "bar1"));
+ // simulate error
+ env_->log_write_error_.store(true, std::memory_order_release);
+ ASSERT_NOK(Put(1, "foo2", "bar2"));
+ env_->log_write_error_.store(false, std::memory_order_release);
+ // the next put should NOT fail
+ ASSERT_OK(Put(1, "foo3", "bar3"));
+}
+#if !(defined NDEBUG) || !defined(OS_WIN)
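+// Inject an IOError into the first RangeSync issued while flushing an SST:
+// the flush must surface the error, subsequent writes must fail (paranoid
+// checks are on), and previously written data must stay readable after reopen.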
+TEST_F(DBIOFailureTest, FlushSstRangeSyncError) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ options.error_if_exists = false;
+ options.paranoid_checks = true;
+ options.write_buffer_size = 256 * 1024 * 1024;
+ options.writable_file_max_buffer_size = 128 * 1024;
+ options.bytes_per_sync = 128 * 1024;
+ options.level0_file_num_compaction_trigger = 4;
+ options.memtable_factory.reset(test::NewSpecialSkipListFactory(10));
+ BlockBasedTableOptions table_options;
+ table_options.filter_policy.reset(NewBloomFilterPolicy(10));
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ const char* io_error_msg = "range sync dummy error";
+ std::atomic<int> range_sync_called(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "SpecialEnv::SStableFile::RangeSync", [&](void* arg) {
+ if (range_sync_called.fetch_add(1) == 0) {
+ Status* st = static_cast<Status*>(arg);
+ *st = Status::IOError(io_error_msg);
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Random rnd(301);
+ std::string rnd_str =
+ rnd.RandomString(static_cast<int>(options.bytes_per_sync / 2));
+ std::string rnd_str_512kb = rnd.RandomString(512 * 1024);
+
+ ASSERT_OK(Put(1, "foo", "bar"));
+ // First 1MB doesn't get range synced
+ ASSERT_OK(Put(1, "foo0_0", rnd_str_512kb));
+ ASSERT_OK(Put(1, "foo0_1", rnd_str_512kb));
+ ASSERT_OK(Put(1, "foo1_1", rnd_str));
+ ASSERT_OK(Put(1, "foo1_2", rnd_str));
+ ASSERT_OK(Put(1, "foo1_3", rnd_str));
+ ASSERT_OK(Put(1, "foo2", "bar"));
+ ASSERT_OK(Put(1, "foo3_1", rnd_str));
+ ASSERT_OK(Put(1, "foo3_2", rnd_str));
+ ASSERT_OK(Put(1, "foo3_3", rnd_str));
+ ASSERT_OK(Put(1, "foo4", "bar"));
+ Status s = dbfull()->TEST_WaitForFlushMemTable(handles_[1]);
+ ASSERT_TRUE(s.IsIOError());
+ ASSERT_STREQ(s.getState(), io_error_msg);
+
+ // Following writes should fail as flush failed.
+ ASSERT_NOK(Put(1, "foo2", "bar3"));
+ ASSERT_EQ("bar", Get(1, "foo"));
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ASSERT_GE(1, range_sync_called.load());
+
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ ASSERT_EQ("bar", Get(1, "foo"));
+}
+
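+// Same RangeSync error injection as above, but triggered while writing a
+// compaction output file instead of a flush output.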
+TEST_F(DBIOFailureTest, CompactSstRangeSyncError) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ options.error_if_exists = false;
+ options.paranoid_checks = true;
+ options.write_buffer_size = 256 * 1024 * 1024;
+ options.writable_file_max_buffer_size = 128 * 1024;
+ options.bytes_per_sync = 128 * 1024;
+ options.level0_file_num_compaction_trigger = 2;
+ options.target_file_size_base = 256 * 1024 * 1024;
+ options.disable_auto_compactions = true;
+ BlockBasedTableOptions table_options;
+ table_options.filter_policy.reset(NewBloomFilterPolicy(10));
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ Random rnd(301);
+ std::string rnd_str =
+ rnd.RandomString(static_cast<int>(options.bytes_per_sync / 2));
+ std::string rnd_str_512kb = rnd.RandomString(512 * 1024);
+
+ ASSERT_OK(Put(1, "foo", "bar"));
+ // First 1MB doesn't get range synced
+ ASSERT_OK(Put(1, "foo0_0", rnd_str_512kb));
+ ASSERT_OK(Put(1, "foo0_1", rnd_str_512kb));
+ ASSERT_OK(Put(1, "foo1_1", rnd_str));
+ ASSERT_OK(Put(1, "foo1_2", rnd_str));
+ ASSERT_OK(Put(1, "foo1_3", rnd_str));
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(Put(1, "foo", "bar"));
+ ASSERT_OK(Put(1, "foo3_1", rnd_str));
+ ASSERT_OK(Put(1, "foo3_2", rnd_str));
+ ASSERT_OK(Put(1, "foo3_3", rnd_str));
+ ASSERT_OK(Put(1, "foo4", "bar"));
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1]));
+
+ const char* io_error_msg = "range sync dummy error";
+ std::atomic<int> range_sync_called(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "SpecialEnv::SStableFile::RangeSync", [&](void* arg) {
+ if (range_sync_called.fetch_add(1) == 0) {
+ Status* st = static_cast<Status*>(arg);
+ *st = Status::IOError(io_error_msg);
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(dbfull()->SetOptions(handles_[1],
+ {
+ {"disable_auto_compactions", "false"},
+ }));
+ Status s = dbfull()->TEST_WaitForCompact();
+ ASSERT_TRUE(s.IsIOError());
+ ASSERT_STREQ(s.getState(), io_error_msg);
+
+  // Following writes should fail as compaction failed.
+ ASSERT_NOK(Put(1, "foo2", "bar3"));
+ ASSERT_EQ("bar", Get(1, "foo"));
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ASSERT_GE(1, range_sync_called.load());
+
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ ASSERT_EQ("bar", Get(1, "foo"));
+}
+
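+// Inject an IOError when closing the SST produced by a flush; the error must
+// propagate to the flush status and block further writes.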
+TEST_F(DBIOFailureTest, FlushSstCloseError) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ options.error_if_exists = false;
+ options.paranoid_checks = true;
+ options.level0_file_num_compaction_trigger = 4;
+ options.memtable_factory.reset(test::NewSpecialSkipListFactory(2));
+
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ const char* io_error_msg = "close dummy error";
+ std::atomic<int> close_called(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "SpecialEnv::SStableFile::Close", [&](void* arg) {
+ if (close_called.fetch_add(1) == 0) {
+ Status* st = static_cast<Status*>(arg);
+ *st = Status::IOError(io_error_msg);
+ }
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(Put(1, "foo", "bar"));
+ ASSERT_OK(Put(1, "foo1", "bar1"));
+ ASSERT_OK(Put(1, "foo", "bar2"));
+ Status s = dbfull()->TEST_WaitForFlushMemTable(handles_[1]);
+ ASSERT_TRUE(s.IsIOError());
+ ASSERT_STREQ(s.getState(), io_error_msg);
+
+ // Following writes should fail as flush failed.
+ ASSERT_NOK(Put(1, "foo2", "bar3"));
+ ASSERT_EQ("bar2", Get(1, "foo"));
+ ASSERT_EQ("bar1", Get(1, "foo1"));
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ ASSERT_EQ("bar2", Get(1, "foo"));
+ ASSERT_EQ("bar1", Get(1, "foo1"));
+}
+
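+// Inject an IOError when closing an SST produced by compaction.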
+TEST_F(DBIOFailureTest, CompactionSstCloseError) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ options.error_if_exists = false;
+ options.paranoid_checks = true;
+ options.level0_file_num_compaction_trigger = 2;
+ options.disable_auto_compactions = true;
+
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ ASSERT_OK(Put(1, "foo", "bar"));
+ ASSERT_OK(Put(1, "foo2", "bar"));
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(Put(1, "foo", "bar2"));
+ ASSERT_OK(Put(1, "foo2", "bar"));
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(Put(1, "foo", "bar3"));
+ ASSERT_OK(Put(1, "foo2", "bar"));
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ const char* io_error_msg = "close dummy error";
+ std::atomic<int> close_called(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "SpecialEnv::SStableFile::Close", [&](void* arg) {
+ if (close_called.fetch_add(1) == 0) {
+ Status* st = static_cast<Status*>(arg);
+ *st = Status::IOError(io_error_msg);
+ }
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ ASSERT_OK(dbfull()->SetOptions(handles_[1],
+ {
+ {"disable_auto_compactions", "false"},
+ }));
+ Status s = dbfull()->TEST_WaitForCompact();
+ ASSERT_TRUE(s.IsIOError());
+ ASSERT_STREQ(s.getState(), io_error_msg);
+
+ // Following writes should fail as compaction failed.
+ ASSERT_NOK(Put(1, "foo2", "bar3"));
+ ASSERT_EQ("bar3", Get(1, "foo"));
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ ASSERT_EQ("bar3", Get(1, "foo"));
+}
+
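+// Inject an IOError into the Sync of a flushed SST (use_fsync is off, so the
+// Sync path rather than Fsync is exercised).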
+TEST_F(DBIOFailureTest, FlushSstSyncError) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ options.error_if_exists = false;
+ options.paranoid_checks = true;
+ options.use_fsync = false;
+ options.level0_file_num_compaction_trigger = 4;
+ options.memtable_factory.reset(test::NewSpecialSkipListFactory(2));
+
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ const char* io_error_msg = "sync dummy error";
+ std::atomic<int> sync_called(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "SpecialEnv::SStableFile::Sync", [&](void* arg) {
+ if (sync_called.fetch_add(1) == 0) {
+ Status* st = static_cast<Status*>(arg);
+ *st = Status::IOError(io_error_msg);
+ }
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(Put(1, "foo", "bar"));
+ ASSERT_OK(Put(1, "foo1", "bar1"));
+ ASSERT_OK(Put(1, "foo", "bar2"));
+ Status s = dbfull()->TEST_WaitForFlushMemTable(handles_[1]);
+ ASSERT_TRUE(s.IsIOError());
+ ASSERT_STREQ(s.getState(), io_error_msg);
+
+ // Following writes should fail as flush failed.
+ ASSERT_NOK(Put(1, "foo2", "bar3"));
+ ASSERT_EQ("bar2", Get(1, "foo"));
+ ASSERT_EQ("bar1", Get(1, "foo1"));
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ ASSERT_EQ("bar2", Get(1, "foo"));
+ ASSERT_EQ("bar1", Get(1, "foo1"));
+}
+
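+// Same Sync error injection, this time on a compaction output file.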
+TEST_F(DBIOFailureTest, CompactionSstSyncError) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ options.error_if_exists = false;
+ options.paranoid_checks = true;
+ options.level0_file_num_compaction_trigger = 2;
+ options.disable_auto_compactions = true;
+ options.use_fsync = false;
+
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ ASSERT_OK(Put(1, "foo", "bar"));
+ ASSERT_OK(Put(1, "foo2", "bar"));
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(Put(1, "foo", "bar2"));
+ ASSERT_OK(Put(1, "foo2", "bar"));
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(Put(1, "foo", "bar3"));
+ ASSERT_OK(Put(1, "foo2", "bar"));
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ const char* io_error_msg = "sync dummy error";
+ std::atomic<int> sync_called(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "SpecialEnv::SStableFile::Sync", [&](void* arg) {
+ if (sync_called.fetch_add(1) == 0) {
+ Status* st = static_cast<Status*>(arg);
+ *st = Status::IOError(io_error_msg);
+ }
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ ASSERT_OK(dbfull()->SetOptions(handles_[1],
+ {
+ {"disable_auto_compactions", "false"},
+ }));
+ Status s = dbfull()->TEST_WaitForCompact();
+ ASSERT_TRUE(s.IsIOError());
+ ASSERT_STREQ(s.getState(), io_error_msg);
+
+ // Following writes should fail as compaction failed.
+ ASSERT_NOK(Put(1, "foo2", "bar3"));
+ ASSERT_EQ("bar3", Get(1, "foo"));
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ ASSERT_EQ("bar3", Get(1, "foo"));
+}
+#endif // !(defined NDEBUG) || !defined(OS_WIN)
+#endif // ROCKSDB_LITE
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ RegisterCustomObjects(argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_iter.cc b/src/rocksdb/db/db_iter.cc
new file mode 100644
index 000000000..e1375deb7
--- /dev/null
+++ b/src/rocksdb/db/db_iter.cc
@@ -0,0 +1,1708 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/db_iter.h"
+
+#include <iostream>
+#include <limits>
+#include <string>
+
+#include "db/dbformat.h"
+#include "db/merge_context.h"
+#include "db/merge_helper.h"
+#include "db/pinned_iterators_manager.h"
+#include "db/wide/wide_column_serialization.h"
+#include "file/filename.h"
+#include "logging/logging.h"
+#include "memory/arena.h"
+#include "monitoring/perf_context_imp.h"
+#include "rocksdb/env.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/options.h"
+#include "rocksdb/system_clock.h"
+#include "table/internal_iterator.h"
+#include "table/iterator_wrapper.h"
+#include "trace_replay/trace_replay.h"
+#include "util/mutexlock.h"
+#include "util/string_util.h"
+#include "util/user_comparator_wrapper.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+DBIter::DBIter(Env* _env, const ReadOptions& read_options,
+ const ImmutableOptions& ioptions,
+ const MutableCFOptions& mutable_cf_options,
+ const Comparator* cmp, InternalIterator* iter,
+ const Version* version, SequenceNumber s, bool arena_mode,
+ uint64_t max_sequential_skip_in_iterations,
+ ReadCallback* read_callback, DBImpl* db_impl,
+ ColumnFamilyData* cfd, bool expose_blob_index)
+ : prefix_extractor_(mutable_cf_options.prefix_extractor.get()),
+ env_(_env),
+ clock_(ioptions.clock),
+ logger_(ioptions.logger),
+ user_comparator_(cmp),
+ merge_operator_(ioptions.merge_operator.get()),
+ iter_(iter),
+ version_(version),
+ read_callback_(read_callback),
+ sequence_(s),
+ statistics_(ioptions.stats),
+ max_skip_(max_sequential_skip_in_iterations),
+ max_skippable_internal_keys_(read_options.max_skippable_internal_keys),
+ num_internal_keys_skipped_(0),
+ iterate_lower_bound_(read_options.iterate_lower_bound),
+ iterate_upper_bound_(read_options.iterate_upper_bound),
+ direction_(kForward),
+ valid_(false),
+ current_entry_is_merged_(false),
+ is_key_seqnum_zero_(false),
+ prefix_same_as_start_(mutable_cf_options.prefix_extractor
+ ? read_options.prefix_same_as_start
+ : false),
+ pin_thru_lifetime_(read_options.pin_data),
+ expect_total_order_inner_iter_(prefix_extractor_ == nullptr ||
+ read_options.total_order_seek ||
+ read_options.auto_prefix_mode),
+ read_tier_(read_options.read_tier),
+ fill_cache_(read_options.fill_cache),
+ verify_checksums_(read_options.verify_checksums),
+ expose_blob_index_(expose_blob_index),
+ is_blob_(false),
+ arena_mode_(arena_mode),
+ db_impl_(db_impl),
+ cfd_(cfd),
+ timestamp_ub_(read_options.timestamp),
+ timestamp_lb_(read_options.iter_start_ts),
+ timestamp_size_(timestamp_ub_ ? timestamp_ub_->size() : 0) {
+ RecordTick(statistics_, NO_ITERATOR_CREATED);
+ if (pin_thru_lifetime_) {
+ pinned_iters_mgr_.StartPinning();
+ }
+ if (iter_.iter()) {
+ iter_.iter()->SetPinnedItersMgr(&pinned_iters_mgr_);
+ }
+ status_.PermitUncheckedError();
+ assert(timestamp_size_ ==
+ user_comparator_.user_comparator()->timestamp_size());
+}
+
+Status DBIter::GetProperty(std::string prop_name, std::string* prop) {
+ if (prop == nullptr) {
+ return Status::InvalidArgument("prop is nullptr");
+ }
+ if (prop_name == "rocksdb.iterator.super-version-number") {
+ // First try to pass the value returned from inner iterator.
+ return iter_.iter()->GetProperty(prop_name, prop);
+ } else if (prop_name == "rocksdb.iterator.is-key-pinned") {
+ if (valid_) {
+ *prop = (pin_thru_lifetime_ && saved_key_.IsKeyPinned()) ? "1" : "0";
+ } else {
+ *prop = "Iterator is not valid.";
+ }
+ return Status::OK();
+ } else if (prop_name == "rocksdb.iterator.internal-key") {
+ *prop = saved_key_.GetUserKey().ToString();
+ return Status::OK();
+ }
+ return Status::InvalidArgument("Unidentified property.");
+}
+
+bool DBIter::ParseKey(ParsedInternalKey* ikey) {
+ Status s = ParseInternalKey(iter_.key(), ikey, false /* log_err_key */);
+ if (!s.ok()) {
+ status_ = Status::Corruption("In DBIter: ", s.getState());
+ valid_ = false;
+ ROCKS_LOG_ERROR(logger_, "In DBIter: %s", status_.getState());
+ return false;
+ } else {
+ return true;
+ }
+}
+
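+// Advance to the next visible user key. Unless the current entry came from a
+// merge (in which case the inner iterator is typically already past it), step
+// the inner iterator once, then skip forward to the next entry visible at
+// sequence_.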
+void DBIter::Next() {
+ assert(valid_);
+ assert(status_.ok());
+
+ PERF_CPU_TIMER_GUARD(iter_next_cpu_nanos, clock_);
+ // Release temporarily pinned blocks from last operation
+ ReleaseTempPinnedData();
+ ResetBlobValue();
+ ResetValueAndColumns();
+ local_stats_.skip_count_ += num_internal_keys_skipped_;
+ local_stats_.skip_count_--;
+ num_internal_keys_skipped_ = 0;
+ bool ok = true;
+ if (direction_ == kReverse) {
+ is_key_seqnum_zero_ = false;
+ if (!ReverseToForward()) {
+ ok = false;
+ }
+ } else if (!current_entry_is_merged_) {
+ // If the current value is not a merge, the iter position is the
+ // current key, which is already returned. We can safely issue a
+ // Next() without checking the current key.
+ // If the current key is a merge, very likely iter already points
+ // to the next internal position.
+ assert(iter_.Valid());
+ iter_.Next();
+ PERF_COUNTER_ADD(internal_key_skipped_count, 1);
+ }
+
+ local_stats_.next_count_++;
+ if (ok && iter_.Valid()) {
+ ClearSavedValue();
+
+ if (prefix_same_as_start_) {
+ assert(prefix_extractor_ != nullptr);
+ const Slice prefix = prefix_.GetUserKey();
+ FindNextUserEntry(true /* skipping the current user key */, &prefix);
+ } else {
+ FindNextUserEntry(true /* skipping the current user key */, nullptr);
+ }
+ } else {
+ is_key_seqnum_zero_ = false;
+ valid_ = false;
+ }
+ if (statistics_ != nullptr && valid_) {
+ local_stats_.next_found_count_++;
+ local_stats_.bytes_read_ += (key().size() + value().size());
+ }
+}
+
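+// Resolve a blob index into blob_value_. With expose_blob_index_ (stacked
+// BlobDB) the index itself is surfaced and only is_blob_ is set; otherwise the
+// blob is fetched through Version::GetBlob using this iterator's read options.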
+bool DBIter::SetBlobValueIfNeeded(const Slice& user_key,
+ const Slice& blob_index) {
+ assert(!is_blob_);
+ assert(blob_value_.empty());
+
+ if (expose_blob_index_) { // Stacked BlobDB implementation
+ is_blob_ = true;
+ return true;
+ }
+
+ if (!version_) {
+ status_ = Status::Corruption("Encountered unexpected blob index.");
+ valid_ = false;
+ return false;
+ }
+
+ // TODO: consider moving ReadOptions from ArenaWrappedDBIter to DBIter to
+ // avoid having to copy options back and forth.
+ ReadOptions read_options;
+ read_options.read_tier = read_tier_;
+ read_options.fill_cache = fill_cache_;
+ read_options.verify_checksums = verify_checksums_;
+
+ constexpr FilePrefetchBuffer* prefetch_buffer = nullptr;
+ constexpr uint64_t* bytes_read = nullptr;
+
+ const Status s = version_->GetBlob(read_options, user_key, blob_index,
+ prefetch_buffer, &blob_value_, bytes_read);
+
+ if (!s.ok()) {
+ status_ = s;
+ valid_ = false;
+ return false;
+ }
+
+ is_blob_ = true;
+ return true;
+}
+
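+// Deserialize a wide-column entity into wide_columns_; if the entity has a
+// default column, its value is also exposed through value().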
+bool DBIter::SetValueAndColumnsFromEntity(Slice slice) {
+ assert(value_.empty());
+ assert(wide_columns_.empty());
+
+ const Status s = WideColumnSerialization::Deserialize(slice, wide_columns_);
+
+ if (!s.ok()) {
+ status_ = s;
+ valid_ = false;
+ return false;
+ }
+
+ if (!wide_columns_.empty() &&
+ wide_columns_[0].name() == kDefaultWideColumnName) {
+ value_ = wide_columns_[0].value();
+ }
+
+ return true;
+}
+
+// PRE: saved_key_ has the current user key if skipping_saved_key
+// POST: saved_key_ should have the next user key if valid_,
+// if the current entry is a result of merge
+// current_entry_is_merged_ => true
+// saved_value_ => the merged value
+//
+// NOTE: In between, saved_key_ can point to a user key that has
+// a delete marker or a sequence number higher than sequence_
+// saved_key_ MUST have a proper user_key before calling this function
+//
+// The prefix parameter, if not null, indicates that we need to iterate
+// within the prefix, and the iterator needs to be made invalid, if no
+// more entry for the prefix can be found.
+bool DBIter::FindNextUserEntry(bool skipping_saved_key, const Slice* prefix) {
+ PERF_TIMER_GUARD(find_next_user_entry_time);
+ return FindNextUserEntryInternal(skipping_saved_key, prefix);
+}
+
+// Actual implementation of DBIter::FindNextUserEntry()
+bool DBIter::FindNextUserEntryInternal(bool skipping_saved_key,
+ const Slice* prefix) {
+ // Loop until we hit an acceptable entry to yield
+ assert(iter_.Valid());
+ assert(status_.ok());
+ assert(direction_ == kForward);
+ current_entry_is_merged_ = false;
+
+ // How many times in a row we have skipped an entry with user key less than
+ // or equal to saved_key_. We could skip these entries either because
+ // sequence numbers were too high or because skipping_saved_key = true.
+ // What saved_key_ contains throughout this method:
+ // - if skipping_saved_key : saved_key_ contains the key that we need
+ // to skip, and we haven't seen any keys greater
+ // than that,
+ // - if num_skipped > 0 : saved_key_ contains the key that we have skipped
+ // num_skipped times, and we haven't seen any keys
+ // greater than that,
+ // - none of the above : saved_key_ can contain anything, it doesn't
+ // matter.
+ uint64_t num_skipped = 0;
+ // For write unprepared, the target sequence number in reseek could be larger
+ // than the snapshot, and thus needs to be skipped again. This could result in
+ // an infinite loop of reseeks. To avoid that, we limit the number of reseeks
+ // to one.
+ bool reseek_done = false;
+
+ do {
+ // Will update is_key_seqnum_zero_ as soon as we parsed the current key
+ // but we need to save the previous value to be used in the loop.
+ bool is_prev_key_seqnum_zero = is_key_seqnum_zero_;
+ if (!ParseKey(&ikey_)) {
+ is_key_seqnum_zero_ = false;
+ return false;
+ }
+ Slice user_key_without_ts =
+ StripTimestampFromUserKey(ikey_.user_key, timestamp_size_);
+
+ is_key_seqnum_zero_ = (ikey_.sequence == 0);
+
+ assert(iterate_upper_bound_ == nullptr ||
+ iter_.UpperBoundCheckResult() != IterBoundCheck::kInbound ||
+ user_comparator_.CompareWithoutTimestamp(
+ user_key_without_ts, /*a_has_ts=*/false, *iterate_upper_bound_,
+ /*b_has_ts=*/false) < 0);
+ if (iterate_upper_bound_ != nullptr &&
+ iter_.UpperBoundCheckResult() != IterBoundCheck::kInbound &&
+ user_comparator_.CompareWithoutTimestamp(
+ user_key_without_ts, /*a_has_ts=*/false, *iterate_upper_bound_,
+ /*b_has_ts=*/false) >= 0) {
+ break;
+ }
+
+ assert(prefix == nullptr || prefix_extractor_ != nullptr);
+ if (prefix != nullptr &&
+ prefix_extractor_->Transform(user_key_without_ts).compare(*prefix) !=
+ 0) {
+ assert(prefix_same_as_start_);
+ break;
+ }
+
+ if (TooManyInternalKeysSkipped()) {
+ return false;
+ }
+
+ assert(ikey_.user_key.size() >= timestamp_size_);
+ Slice ts = timestamp_size_ > 0 ? ExtractTimestampFromUserKey(
+ ikey_.user_key, timestamp_size_)
+ : Slice();
+ bool more_recent = false;
+ if (IsVisible(ikey_.sequence, ts, &more_recent)) {
+      // If the previous entry has a sequence number of 0, the current entry
+      // cannot possibly be skipped. This condition could potentially be
+      // relaxed to prev_key.seq <= ikey_.sequence, but we stay cautious since
+      // that is more prone to bugs where the same user key appears with the
+      // same sequence number.
+ // Note that with current timestamp implementation, the same user key can
+ // have different timestamps and zero sequence number on the bottommost
+ // level. This may change in the future.
+ if ((!is_prev_key_seqnum_zero || timestamp_size_ > 0) &&
+ skipping_saved_key &&
+ CompareKeyForSkip(ikey_.user_key, saved_key_.GetUserKey()) <= 0) {
+ num_skipped++; // skip this entry
+ PERF_COUNTER_ADD(internal_key_skipped_count, 1);
+ } else {
+ assert(!skipping_saved_key ||
+ CompareKeyForSkip(ikey_.user_key, saved_key_.GetUserKey()) > 0);
+ if (!iter_.PrepareValue()) {
+ assert(!iter_.status().ok());
+ valid_ = false;
+ return false;
+ }
+ num_skipped = 0;
+ reseek_done = false;
+ switch (ikey_.type) {
+ case kTypeDeletion:
+ case kTypeDeletionWithTimestamp:
+ case kTypeSingleDeletion:
+ // Arrange to skip all upcoming entries for this key since
+ // they are hidden by this deletion.
+ if (timestamp_lb_) {
+ saved_key_.SetInternalKey(ikey_);
+ valid_ = true;
+ return true;
+ } else {
+ saved_key_.SetUserKey(
+ ikey_.user_key, !pin_thru_lifetime_ ||
+ !iter_.iter()->IsKeyPinned() /* copy */);
+ skipping_saved_key = true;
+ PERF_COUNTER_ADD(internal_delete_skipped_count, 1);
+ }
+ break;
+ case kTypeValue:
+ case kTypeBlobIndex:
+ case kTypeWideColumnEntity:
+ if (timestamp_lb_) {
+ saved_key_.SetInternalKey(ikey_);
+ } else {
+ saved_key_.SetUserKey(
+ ikey_.user_key, !pin_thru_lifetime_ ||
+ !iter_.iter()->IsKeyPinned() /* copy */);
+ }
+
+ if (ikey_.type == kTypeBlobIndex) {
+ if (!SetBlobValueIfNeeded(ikey_.user_key, iter_.value())) {
+ return false;
+ }
+
+ SetValueAndColumnsFromPlain(expose_blob_index_ ? iter_.value()
+ : blob_value_);
+ } else if (ikey_.type == kTypeWideColumnEntity) {
+ if (!SetValueAndColumnsFromEntity(iter_.value())) {
+ return false;
+ }
+ } else {
+ assert(ikey_.type == kTypeValue);
+ SetValueAndColumnsFromPlain(iter_.value());
+ }
+
+ valid_ = true;
+ return true;
+ break;
+ case kTypeMerge:
+ saved_key_.SetUserKey(
+ ikey_.user_key,
+ !pin_thru_lifetime_ || !iter_.iter()->IsKeyPinned() /* copy */);
+ // By now, we are sure the current ikey is going to yield a value
+ current_entry_is_merged_ = true;
+ valid_ = true;
+ return MergeValuesNewToOld(); // Go to a different state machine
+ break;
+ default:
+ valid_ = false;
+ status_ = Status::Corruption(
+ "Unknown value type: " +
+ std::to_string(static_cast<unsigned int>(ikey_.type)));
+ return false;
+ }
+ }
+ } else {
+ if (more_recent) {
+ PERF_COUNTER_ADD(internal_recent_skipped_count, 1);
+ }
+
+ // This key was inserted after our snapshot was taken or skipped by
+ // timestamp range. If this happens too many times in a row for the same
+ // user key, we want to seek to the target sequence number.
+ int cmp = user_comparator_.CompareWithoutTimestamp(
+ ikey_.user_key, saved_key_.GetUserKey());
+ if (cmp == 0 || (skipping_saved_key && cmp < 0)) {
+ num_skipped++;
+ } else {
+ saved_key_.SetUserKey(
+ ikey_.user_key,
+ !iter_.iter()->IsKeyPinned() || !pin_thru_lifetime_ /* copy */);
+ skipping_saved_key = false;
+ num_skipped = 0;
+ reseek_done = false;
+ }
+ }
+
+ // If we have sequentially iterated via numerous equal keys, then it's
+ // better to seek so that we can avoid too many key comparisons.
+ //
+ // To avoid infinite loops, do not reseek if we have already attempted to
+ // reseek previously.
+ //
+ // TODO(lth): If we reseek to sequence number greater than ikey_.sequence,
+ // then it does not make sense to reseek as we would actually land further
+ // away from the desired key. There is opportunity for optimization here.
+ if (num_skipped > max_skip_ && !reseek_done) {
+ is_key_seqnum_zero_ = false;
+ num_skipped = 0;
+ reseek_done = true;
+ std::string last_key;
+ if (skipping_saved_key) {
+ // We're looking for the next user-key but all we see are the same
+ // user-key with decreasing sequence numbers. Fast forward to
+ // sequence number 0 and type deletion (the smallest type).
+ if (timestamp_size_ == 0) {
+ AppendInternalKey(
+ &last_key,
+ ParsedInternalKey(saved_key_.GetUserKey(), 0, kTypeDeletion));
+ } else {
+ const std::string kTsMin(timestamp_size_, '\0');
+ AppendInternalKeyWithDifferentTimestamp(
+ &last_key,
+ ParsedInternalKey(saved_key_.GetUserKey(), 0, kTypeDeletion),
+ kTsMin);
+ }
+ // Don't set skipping_saved_key = false because we may still see more
+ // user-keys equal to saved_key_.
+ } else {
+ // We saw multiple entries with this user key and sequence numbers
+ // higher than sequence_. Fast forward to sequence_.
+ // Note that this only covers a case when a higher key was overwritten
+ // many times since our snapshot was taken, not the case when a lot of
+ // different keys were inserted after our snapshot was taken.
+ if (timestamp_size_ == 0) {
+ AppendInternalKey(
+ &last_key, ParsedInternalKey(saved_key_.GetUserKey(), sequence_,
+ kValueTypeForSeek));
+ } else {
+ AppendInternalKeyWithDifferentTimestamp(
+ &last_key,
+ ParsedInternalKey(saved_key_.GetUserKey(), sequence_,
+ kValueTypeForSeek),
+ *timestamp_ub_);
+ }
+ }
+ iter_.Seek(last_key);
+ RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION);
+ } else {
+ iter_.Next();
+ }
+ } while (iter_.Valid());
+
+ valid_ = false;
+ return iter_.status().ok();
+}
+
+// Merge values of the same user key starting from the current iter_ position
+// Scan from the newer entries to older entries.
+// PRE: iter_.key() points to the first merge type entry
+// saved_key_ stores the user key
+// iter_.PrepareValue() has been called
+// POST: saved_value_ has the merged value for the user key
+// iter_ points to the next entry (or invalid)
+bool DBIter::MergeValuesNewToOld() {
+ if (!merge_operator_) {
+ ROCKS_LOG_ERROR(logger_, "Options::merge_operator is null.");
+ status_ = Status::InvalidArgument("merge_operator_ must be set.");
+ valid_ = false;
+ return false;
+ }
+
+ // Temporarily pin the blocks that hold merge operands
+ TempPinData();
+ merge_context_.Clear();
+ // Start the merge process by pushing the first operand
+ merge_context_.PushOperand(
+ iter_.value(), iter_.iter()->IsValuePinned() /* operand_pinned */);
+ TEST_SYNC_POINT("DBIter::MergeValuesNewToOld:PushedFirstOperand");
+
+ ParsedInternalKey ikey;
+ for (iter_.Next(); iter_.Valid(); iter_.Next()) {
+ TEST_SYNC_POINT("DBIter::MergeValuesNewToOld:SteppedToNextOperand");
+ if (!ParseKey(&ikey)) {
+ return false;
+ }
+
+ if (!user_comparator_.EqualWithoutTimestamp(ikey.user_key,
+ saved_key_.GetUserKey())) {
+ // hit the next user key, stop right here
+ break;
+ }
+ if (kTypeDeletion == ikey.type || kTypeSingleDeletion == ikey.type ||
+ kTypeDeletionWithTimestamp == ikey.type) {
+ // hit a delete with the same user key, stop right here
+ // iter_ is positioned after delete
+ iter_.Next();
+ break;
+ }
+ if (!iter_.PrepareValue()) {
+ valid_ = false;
+ return false;
+ }
+
+ if (kTypeValue == ikey.type) {
+ // hit a put, merge the put value with operands and store the
+ // final result in saved_value_. We are done!
+ const Slice val = iter_.value();
+ if (!Merge(&val, ikey.user_key)) {
+ return false;
+ }
+ // iter_ is positioned after put
+ iter_.Next();
+ if (!iter_.status().ok()) {
+ valid_ = false;
+ return false;
+ }
+ return true;
+ } else if (kTypeMerge == ikey.type) {
+ // hit a merge, add the value as an operand and run associative merge.
+ // when complete, add result to operands and continue.
+ merge_context_.PushOperand(
+ iter_.value(), iter_.iter()->IsValuePinned() /* operand_pinned */);
+ PERF_COUNTER_ADD(internal_merge_count, 1);
+ } else if (kTypeBlobIndex == ikey.type) {
+ if (expose_blob_index_) {
+ status_ =
+ Status::NotSupported("BlobDB does not support merge operator.");
+ valid_ = false;
+ return false;
+ }
+      // hit a blob index: resolve the blob, merge its value with the operands
+      // and store the final result in saved_value_. We are done!
+ if (!SetBlobValueIfNeeded(ikey.user_key, iter_.value())) {
+ return false;
+ }
+ valid_ = true;
+ if (!Merge(&blob_value_, ikey.user_key)) {
+ return false;
+ }
+
+ ResetBlobValue();
+
+ // iter_ is positioned after put
+ iter_.Next();
+ if (!iter_.status().ok()) {
+ valid_ = false;
+ return false;
+ }
+ return true;
+ } else if (kTypeWideColumnEntity == ikey.type) {
+ if (!MergeEntity(iter_.value(), ikey.user_key)) {
+ return false;
+ }
+
+ // iter_ is positioned after put
+ iter_.Next();
+ if (!iter_.status().ok()) {
+ valid_ = false;
+ return false;
+ }
+
+ return true;
+ } else {
+ valid_ = false;
+ status_ = Status::Corruption(
+ "Unrecognized value type: " +
+ std::to_string(static_cast<unsigned int>(ikey.type)));
+ return false;
+ }
+ }
+
+ if (!iter_.status().ok()) {
+ valid_ = false;
+ return false;
+ }
+
+ // we either exhausted all internal keys under this user key, or hit
+ // a deletion marker.
+  // feed null as the existing value to the merge operator so that the client
+  // can recognize this scenario and act accordingly.
+ if (!Merge(nullptr, saved_key_.GetUserKey())) {
+ return false;
+ }
+ assert(status_.ok());
+ return true;
+}
+
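+// Move to the previous visible user key, first switching the iteration
+// direction if the last operation moved forward.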
+void DBIter::Prev() {
+ assert(valid_);
+ assert(status_.ok());
+
+ PERF_CPU_TIMER_GUARD(iter_prev_cpu_nanos, clock_);
+ ReleaseTempPinnedData();
+ ResetBlobValue();
+ ResetValueAndColumns();
+ ResetInternalKeysSkippedCounter();
+ bool ok = true;
+ if (direction_ == kForward) {
+ if (!ReverseToBackward()) {
+ ok = false;
+ }
+ }
+ if (ok) {
+ ClearSavedValue();
+
+ Slice prefix;
+ if (prefix_same_as_start_) {
+ assert(prefix_extractor_ != nullptr);
+ prefix = prefix_.GetUserKey();
+ }
+ PrevInternal(prefix_same_as_start_ ? &prefix : nullptr);
+ }
+
+ if (statistics_ != nullptr) {
+ local_stats_.prev_count_++;
+ if (valid_) {
+ local_stats_.prev_found_count_++;
+ local_stats_.bytes_read_ += (key().size() + value().size());
+ }
+ }
+}
+
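+// Switch from backward to forward iteration: reposition the inner iterator at
+// or after saved_key_ and skip any entries that precede it.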
+bool DBIter::ReverseToForward() {
+ assert(iter_.status().ok());
+
+ // When moving backwards, iter_ is positioned on _previous_ key, which may
+ // not exist or may have different prefix than the current key().
+ // If that's the case, seek iter_ to current key.
+ if (!expect_total_order_inner_iter() || !iter_.Valid()) {
+ IterKey last_key;
+ ParsedInternalKey pikey(saved_key_.GetUserKey(), kMaxSequenceNumber,
+ kValueTypeForSeek);
+ if (timestamp_size_ > 0) {
+ // TODO: pre-create kTsMax.
+ const std::string kTsMax(timestamp_size_, '\xff');
+ pikey.SetTimestamp(kTsMax);
+ }
+ last_key.SetInternalKey(pikey);
+ iter_.Seek(last_key.GetInternalKey());
+ RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION);
+ }
+
+ direction_ = kForward;
+ // Skip keys less than the current key() (a.k.a. saved_key_).
+ while (iter_.Valid()) {
+ ParsedInternalKey ikey;
+ if (!ParseKey(&ikey)) {
+ return false;
+ }
+ if (user_comparator_.Compare(ikey.user_key, saved_key_.GetUserKey()) >= 0) {
+ return true;
+ }
+ iter_.Next();
+ }
+
+ if (!iter_.status().ok()) {
+ valid_ = false;
+ return false;
+ }
+
+ return true;
+}
+
+// Move iter_ to the key before saved_key_.
+bool DBIter::ReverseToBackward() {
+ assert(iter_.status().ok());
+
+ // When current_entry_is_merged_ is true, iter_ may be positioned on the next
+ // key, which may not exist or may have prefix different from current.
+ // If that's the case, seek to saved_key_.
+ if (current_entry_is_merged_ &&
+ (!expect_total_order_inner_iter() || !iter_.Valid())) {
+ IterKey last_key;
+ // Using kMaxSequenceNumber and kValueTypeForSeek
+ // (not kValueTypeForSeekForPrev) to seek to a key strictly smaller
+ // than saved_key_.
+ last_key.SetInternalKey(ParsedInternalKey(
+ saved_key_.GetUserKey(), kMaxSequenceNumber, kValueTypeForSeek));
+ if (!expect_total_order_inner_iter()) {
+ iter_.SeekForPrev(last_key.GetInternalKey());
+ } else {
+ // Some iterators may not support SeekForPrev(), so we avoid using it
+ // when prefix seek mode is disabled. This is somewhat expensive
+ // (an extra Prev(), as well as an extra change of direction of iter_),
+ // so we may need to reconsider it later.
+ iter_.Seek(last_key.GetInternalKey());
+ if (!iter_.Valid() && iter_.status().ok()) {
+ iter_.SeekToLast();
+ }
+ }
+ RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION);
+ }
+
+ direction_ = kReverse;
+ return FindUserKeyBeforeSavedKey();
+}
+
+void DBIter::PrevInternal(const Slice* prefix) {
+ while (iter_.Valid()) {
+ saved_key_.SetUserKey(
+ ExtractUserKey(iter_.key()),
+ !iter_.iter()->IsKeyPinned() || !pin_thru_lifetime_ /* copy */);
+
+ assert(prefix == nullptr || prefix_extractor_ != nullptr);
+ if (prefix != nullptr &&
+ prefix_extractor_
+ ->Transform(StripTimestampFromUserKey(saved_key_.GetUserKey(),
+ timestamp_size_))
+ .compare(*prefix) != 0) {
+ assert(prefix_same_as_start_);
+ // Current key does not have the same prefix as start
+ valid_ = false;
+ return;
+ }
+
+ assert(iterate_lower_bound_ == nullptr || iter_.MayBeOutOfLowerBound() ||
+ user_comparator_.CompareWithoutTimestamp(
+ saved_key_.GetUserKey(), /*a_has_ts=*/true,
+ *iterate_lower_bound_, /*b_has_ts=*/false) >= 0);
+ if (iterate_lower_bound_ != nullptr && iter_.MayBeOutOfLowerBound() &&
+ user_comparator_.CompareWithoutTimestamp(
+ saved_key_.GetUserKey(), /*a_has_ts=*/true, *iterate_lower_bound_,
+ /*b_has_ts=*/false) < 0) {
+ // We've iterated earlier than the user-specified lower bound.
+ valid_ = false;
+ return;
+ }
+
+ if (!FindValueForCurrentKey()) { // assigns valid_
+ return;
+ }
+
+ // Whether or not we found a value for current key, we need iter_ to end up
+ // on a smaller key.
+ if (!FindUserKeyBeforeSavedKey()) {
+ return;
+ }
+
+ if (valid_) {
+ // Found the value.
+ return;
+ }
+
+ if (TooManyInternalKeysSkipped(false)) {
+ return;
+ }
+ }
+
+ // We haven't found any key - iterator is not valid
+ valid_ = false;
+}
+
+// Used for backwards iteration.
+// Looks at the entries with user key saved_key_ and finds the most up-to-date
+// value for it, or executes a merge, or determines that the value was deleted.
+// Sets valid_ to true if the value is found and is ready to be presented to
+// the user through value().
+// Sets valid_ to false if the value was deleted, and we should try another key.
+// Returns false if an error occurred, and !status().ok() and !valid_.
+//
+// PRE: iter_ is positioned on the last entry with user key equal to saved_key_.
+// POST: iter_ is positioned on one of the entries equal to saved_key_, or on
+// the entry just before them, or on the entry just after them.
+bool DBIter::FindValueForCurrentKey() {
+ assert(iter_.Valid());
+ merge_context_.Clear();
+ current_entry_is_merged_ = false;
+ // last entry before merge (could be kTypeDeletion,
+ // kTypeDeletionWithTimestamp, kTypeSingleDeletion, kTypeValue,
+ // kTypeBlobIndex, or kTypeWideColumnEntity)
+ ValueType last_not_merge_type = kTypeDeletion;
+ ValueType last_key_entry_type = kTypeDeletion;
+
+ // If false, it indicates that we have not seen any valid entry, even though
+ // last_key_entry_type is initialized to kTypeDeletion.
+ bool valid_entry_seen = false;
+
+ // Temporarily pin blocks that hold (merge operands / the value)
+ ReleaseTempPinnedData();
+ TempPinData();
+ size_t num_skipped = 0;
+ while (iter_.Valid()) {
+ ParsedInternalKey ikey;
+ if (!ParseKey(&ikey)) {
+ return false;
+ }
+
+ if (!user_comparator_.EqualWithoutTimestamp(ikey.user_key,
+ saved_key_.GetUserKey())) {
+ // Found a smaller user key, thus we are done with current user key.
+ break;
+ }
+
+ assert(ikey.user_key.size() >= timestamp_size_);
+ Slice ts;
+ if (timestamp_size_ > 0) {
+ ts = Slice(ikey.user_key.data() + ikey.user_key.size() - timestamp_size_,
+ timestamp_size_);
+ }
+
+ bool visible = IsVisible(ikey.sequence, ts);
+ if (!visible &&
+ (timestamp_lb_ == nullptr ||
+ user_comparator_.CompareTimestamp(ts, *timestamp_ub_) > 0)) {
+ // Found an invisible version of the current user key, and it must have
+ // a higher sequence number or timestamp. Therefore, we are done with the
+ // current user key.
+ break;
+ }
+
+ if (!ts.empty()) {
+ saved_timestamp_.assign(ts.data(), ts.size());
+ }
+
+ if (TooManyInternalKeysSkipped()) {
+ return false;
+ }
+
+ // This user key has lots of entries.
+ // We're going from old to new, and it's taking too long. Let's do a Seek()
+ // and go from new to old. This helps when a key was overwritten many times.
+ if (num_skipped >= max_skip_) {
+ return FindValueForCurrentKeyUsingSeek();
+ }
+
+ if (!iter_.PrepareValue()) {
+ valid_ = false;
+ return false;
+ }
+
+ if (timestamp_lb_ != nullptr) {
+ // Only needed when timestamp_lb_ is not null
+ [[maybe_unused]] const bool ret = ParseKey(&ikey_);
+ saved_ikey_.assign(iter_.key().data(), iter_.key().size());
+ // Since the preceding ParseKey(&ikey) succeeds, so must this.
+ assert(ret);
+ }
+
+ valid_entry_seen = true;
+ last_key_entry_type = ikey.type;
+ switch (last_key_entry_type) {
+ case kTypeValue:
+ case kTypeBlobIndex:
+ case kTypeWideColumnEntity:
+ if (iter_.iter()->IsValuePinned()) {
+ pinned_value_ = iter_.value();
+ } else {
+ valid_ = false;
+ status_ = Status::NotSupported(
+ "Backward iteration not supported if underlying iterator's value "
+ "cannot be pinned.");
+ }
+ merge_context_.Clear();
+ last_not_merge_type = last_key_entry_type;
+ if (!status_.ok()) {
+ return false;
+ }
+ break;
+ case kTypeDeletion:
+ case kTypeDeletionWithTimestamp:
+ case kTypeSingleDeletion:
+ merge_context_.Clear();
+ last_not_merge_type = last_key_entry_type;
+ PERF_COUNTER_ADD(internal_delete_skipped_count, 1);
+ break;
+ case kTypeMerge: {
+ assert(merge_operator_ != nullptr);
+ merge_context_.PushOperandBack(
+ iter_.value(), iter_.iter()->IsValuePinned() /* operand_pinned */);
+ PERF_COUNTER_ADD(internal_merge_count, 1);
+ } break;
+ default:
+ valid_ = false;
+ status_ = Status::Corruption(
+ "Unknown value type: " +
+ std::to_string(static_cast<unsigned int>(last_key_entry_type)));
+ return false;
+ }
+
+ PERF_COUNTER_ADD(internal_key_skipped_count, 1);
+ iter_.Prev();
+ ++num_skipped;
+
+ if (visible && timestamp_lb_ != nullptr) {
+ // If timestamp_lb_ is not nullptr, we do not have to look further for
+ // another internal key. We can return this current internal key. Yet we
+ // still keep the invariant that iter_ is positioned before the returned
+ // key.
+ break;
+ }
+ }
+
+ if (!iter_.status().ok()) {
+ valid_ = false;
+ return false;
+ }
+
+ if (!valid_entry_seen) {
+ // Since we haven't seen any valid entry, last_key_entry_type remains
+ // unchanged and the same as its initial value.
+ assert(last_key_entry_type == kTypeDeletion);
+ assert(last_not_merge_type == kTypeDeletion);
+ valid_ = false;
+ return true;
+ }
+
+ if (timestamp_lb_ != nullptr) {
+ assert(last_key_entry_type == ikey_.type);
+ }
+
+ Status s;
+ s.PermitUncheckedError();
+
+ switch (last_key_entry_type) {
+ case kTypeDeletion:
+ case kTypeDeletionWithTimestamp:
+ case kTypeSingleDeletion:
+ if (timestamp_lb_ == nullptr) {
+ valid_ = false;
+ } else {
+ saved_key_.SetInternalKey(saved_ikey_);
+ valid_ = true;
+ }
+ return true;
+ case kTypeMerge:
+ current_entry_is_merged_ = true;
+ if (last_not_merge_type == kTypeDeletion ||
+ last_not_merge_type == kTypeSingleDeletion ||
+ last_not_merge_type == kTypeDeletionWithTimestamp) {
+ if (!Merge(nullptr, saved_key_.GetUserKey())) {
+ return false;
+ }
+ return true;
+ } else if (last_not_merge_type == kTypeBlobIndex) {
+ if (expose_blob_index_) {
+ status_ =
+ Status::NotSupported("BlobDB does not support merge operator.");
+ valid_ = false;
+ return false;
+ }
+ if (!SetBlobValueIfNeeded(saved_key_.GetUserKey(), pinned_value_)) {
+ return false;
+ }
+ valid_ = true;
+ if (!Merge(&blob_value_, saved_key_.GetUserKey())) {
+ return false;
+ }
+
+ ResetBlobValue();
+
+ return true;
+ } else if (last_not_merge_type == kTypeWideColumnEntity) {
+ if (!MergeEntity(pinned_value_, saved_key_.GetUserKey())) {
+ return false;
+ }
+
+ return true;
+ } else {
+ assert(last_not_merge_type == kTypeValue);
+ if (!Merge(&pinned_value_, saved_key_.GetUserKey())) {
+ return false;
+ }
+ return true;
+ }
+ break;
+ case kTypeValue:
+ if (timestamp_lb_ != nullptr) {
+ saved_key_.SetInternalKey(saved_ikey_);
+ }
+
+ SetValueAndColumnsFromPlain(pinned_value_);
+
+ break;
+ case kTypeBlobIndex:
+ if (!SetBlobValueIfNeeded(saved_key_.GetUserKey(), pinned_value_)) {
+ return false;
+ }
+
+ SetValueAndColumnsFromPlain(expose_blob_index_ ? pinned_value_
+ : blob_value_);
+
+ break;
+ case kTypeWideColumnEntity:
+ if (!SetValueAndColumnsFromEntity(pinned_value_)) {
+ return false;
+ }
+ break;
+ default:
+ valid_ = false;
+ status_ = Status::Corruption(
+ "Unknown value type: " +
+ std::to_string(static_cast<unsigned int>(last_key_entry_type)));
+ return false;
+ }
+ if (!s.ok()) {
+ valid_ = false;
+ status_ = s;
+ return false;
+ }
+ valid_ = true;
+ return true;
+}
+
+// This function is used in FindValueForCurrentKey.
+// We use Seek() function instead of Prev() to find necessary value
+// TODO: This is very similar to FindNextUserEntry() and MergeValuesNewToOld().
+// Would be nice to reuse some code.
+bool DBIter::FindValueForCurrentKeyUsingSeek() {
+ // FindValueForCurrentKey will enable pinning before calling
+ // FindValueForCurrentKeyUsingSeek()
+ assert(pinned_iters_mgr_.PinningEnabled());
+ std::string last_key;
+ if (0 == timestamp_size_) {
+ AppendInternalKey(&last_key,
+ ParsedInternalKey(saved_key_.GetUserKey(), sequence_,
+ kValueTypeForSeek));
+ } else {
+ AppendInternalKeyWithDifferentTimestamp(
+ &last_key,
+ ParsedInternalKey(saved_key_.GetUserKey(), sequence_,
+ kValueTypeForSeek),
+ timestamp_lb_ == nullptr ? *timestamp_ub_ : *timestamp_lb_);
+ }
+ iter_.Seek(last_key);
+ RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION);
+
+ // In case a read_callback is present, the value we seek to may not be visible.
+ // Find the next value that's visible.
+ ParsedInternalKey ikey;
+
+ while (true) {
+ if (!iter_.Valid()) {
+ valid_ = false;
+ return iter_.status().ok();
+ }
+
+ if (!ParseKey(&ikey)) {
+ return false;
+ }
+ assert(ikey.user_key.size() >= timestamp_size_);
+ Slice ts;
+ if (timestamp_size_ > 0) {
+ ts = Slice(ikey.user_key.data() + ikey.user_key.size() - timestamp_size_,
+ timestamp_size_);
+ }
+
+ if (!user_comparator_.EqualWithoutTimestamp(ikey.user_key,
+ saved_key_.GetUserKey())) {
+ // No visible values for this key, even though FindValueForCurrentKey()
+ // has seen some. This is possible if we're using a tailing iterator, and
+ // the entries were discarded in a compaction.
+ valid_ = false;
+ return true;
+ }
+
+ if (IsVisible(ikey.sequence, ts)) {
+ break;
+ }
+
+ iter_.Next();
+ }
+
+ if (ikey.type == kTypeDeletion || ikey.type == kTypeSingleDeletion ||
+ kTypeDeletionWithTimestamp == ikey.type) {
+ if (timestamp_lb_ == nullptr) {
+ valid_ = false;
+ } else {
+ valid_ = true;
+ saved_key_.SetInternalKey(ikey);
+ }
+ return true;
+ }
+ if (!iter_.PrepareValue()) {
+ valid_ = false;
+ return false;
+ }
+ if (timestamp_size_ > 0) {
+ Slice ts = ExtractTimestampFromUserKey(ikey.user_key, timestamp_size_);
+ saved_timestamp_.assign(ts.data(), ts.size());
+ }
+ if (ikey.type == kTypeValue || ikey.type == kTypeBlobIndex ||
+ ikey.type == kTypeWideColumnEntity) {
+ assert(iter_.iter()->IsValuePinned());
+ pinned_value_ = iter_.value();
+ if (ikey.type == kTypeBlobIndex) {
+ if (!SetBlobValueIfNeeded(ikey.user_key, pinned_value_)) {
+ return false;
+ }
+
+ SetValueAndColumnsFromPlain(expose_blob_index_ ? pinned_value_
+ : blob_value_);
+ } else if (ikey.type == kTypeWideColumnEntity) {
+ if (!SetValueAndColumnsFromEntity(pinned_value_)) {
+ return false;
+ }
+ } else {
+ assert(ikey.type == kTypeValue);
+ SetValueAndColumnsFromPlain(pinned_value_);
+ }
+
+ if (timestamp_lb_ != nullptr) {
+ saved_key_.SetInternalKey(ikey);
+ }
+
+ valid_ = true;
+ return true;
+ }
+
+ // kTypeMerge. We need to collect all kTypeMerge values and save them
+ // in operands
+ assert(ikey.type == kTypeMerge);
+ current_entry_is_merged_ = true;
+ merge_context_.Clear();
+ merge_context_.PushOperand(
+ iter_.value(), iter_.iter()->IsValuePinned() /* operand_pinned */);
+ while (true) {
+ iter_.Next();
+
+ if (!iter_.Valid()) {
+ if (!iter_.status().ok()) {
+ valid_ = false;
+ return false;
+ }
+ break;
+ }
+ if (!ParseKey(&ikey)) {
+ return false;
+ }
+ if (!user_comparator_.EqualWithoutTimestamp(ikey.user_key,
+ saved_key_.GetUserKey())) {
+ break;
+ }
+ if (ikey.type == kTypeDeletion || ikey.type == kTypeSingleDeletion ||
+ ikey.type == kTypeDeletionWithTimestamp) {
+ break;
+ }
+ if (!iter_.PrepareValue()) {
+ valid_ = false;
+ return false;
+ }
+
+ if (ikey.type == kTypeValue) {
+ const Slice val = iter_.value();
+ if (!Merge(&val, saved_key_.GetUserKey())) {
+ return false;
+ }
+ return true;
+ } else if (ikey.type == kTypeMerge) {
+ merge_context_.PushOperand(
+ iter_.value(), iter_.iter()->IsValuePinned() /* operand_pinned */);
+ PERF_COUNTER_ADD(internal_merge_count, 1);
+ } else if (ikey.type == kTypeBlobIndex) {
+ if (expose_blob_index_) {
+ status_ =
+ Status::NotSupported("BlobDB does not support merge operator.");
+ valid_ = false;
+ return false;
+ }
+ if (!SetBlobValueIfNeeded(ikey.user_key, iter_.value())) {
+ return false;
+ }
+ valid_ = true;
+ if (!Merge(&blob_value_, saved_key_.GetUserKey())) {
+ return false;
+ }
+
+ ResetBlobValue();
+
+ return true;
+ } else if (ikey.type == kTypeWideColumnEntity) {
+ if (!MergeEntity(iter_.value(), saved_key_.GetUserKey())) {
+ return false;
+ }
+
+ return true;
+ } else {
+ valid_ = false;
+ status_ = Status::Corruption(
+ "Unknown value type: " +
+ std::to_string(static_cast<unsigned int>(ikey.type)));
+ return false;
+ }
+ }
+
+ if (!Merge(nullptr, saved_key_.GetUserKey())) {
+ return false;
+ }
+
+ // Make sure we leave iter_ in a good state. If it's valid and we don't care
+ // about prefixes, that's already good enough. Otherwise it needs to be
+ // seeked to the current key.
+ if (!expect_total_order_inner_iter() || !iter_.Valid()) {
+ if (!expect_total_order_inner_iter()) {
+ iter_.SeekForPrev(last_key);
+ } else {
+ iter_.Seek(last_key);
+ if (!iter_.Valid() && iter_.status().ok()) {
+ iter_.SeekToLast();
+ }
+ }
+ RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION);
+ }
+
+ valid_ = true;
+ return true;
+}
+
+bool DBIter::Merge(const Slice* val, const Slice& user_key) {
+ Status s = MergeHelper::TimedFullMerge(
+ merge_operator_, user_key, val, merge_context_.GetOperands(),
+ &saved_value_, logger_, statistics_, clock_, &pinned_value_,
+ /* update_num_ops_stats */ true);
+ if (!s.ok()) {
+ valid_ = false;
+ status_ = s;
+ return false;
+ }
+
+ SetValueAndColumnsFromPlain(pinned_value_.data() ? pinned_value_
+ : saved_value_);
+
+ valid_ = true;
+ return true;
+}
+
+bool DBIter::MergeEntity(const Slice& entity, const Slice& user_key) {
+ Status s = MergeHelper::TimedFullMergeWithEntity(
+ merge_operator_, user_key, entity, merge_context_.GetOperands(),
+ &saved_value_, logger_, statistics_, clock_,
+ /* update_num_ops_stats */ true);
+ if (!s.ok()) {
+ valid_ = false;
+ status_ = s;
+ return false;
+ }
+
+ if (!SetValueAndColumnsFromEntity(saved_value_)) {
+ return false;
+ }
+
+ valid_ = true;
+ return true;
+}
+
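The operands accumulated in merge_context_ by the routines above are ultimately resolved through MergeHelper::TimedFullMerge, which calls into the column family's MergeOperator. Below is a minimal sketch of the kind of associative merge operator a user might plug in; the 8-byte little-endian counter encoding and the class name are assumptions for illustration, not part of this patch.

// Illustrative sketch only: a minimal associative merge operator of the kind
// that DBIter::Merge() ultimately invokes through MergeHelper::TimedFullMerge.
#include <cstdint>
#include <cstring>
#include <string>

#include "rocksdb/merge_operator.h"
#include "rocksdb/slice.h"

class UInt64AddOperator : public rocksdb::AssociativeMergeOperator {
 public:
  bool Merge(const rocksdb::Slice& /*key*/,
             const rocksdb::Slice* existing_value,
             const rocksdb::Slice& value, std::string* new_value,
             rocksdb::Logger* /*logger*/) const override {
    uint64_t base = 0;
    if (existing_value != nullptr && existing_value->size() == sizeof(base)) {
      std::memcpy(&base, existing_value->data(), sizeof(base));
    }
    uint64_t operand = 0;
    if (value.size() == sizeof(operand)) {
      std::memcpy(&operand, value.data(), sizeof(operand));
    }
    const uint64_t sum = base + operand;
    new_value->assign(reinterpret_cast<const char*>(&sum), sizeof(sum));
    return true;  // Returning false would be treated as a merge failure.
  }

  const char* Name() const override { return "UInt64AddOperator"; }
};

With such an operator registered via options.merge_operator, db->Merge() calls are what produce the kTypeMerge entries that the code above collects and combines.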
+// Move backwards until the key is smaller than saved_key_.
+// Changes valid_ only if return value is false.
+bool DBIter::FindUserKeyBeforeSavedKey() {
+ assert(status_.ok());
+ size_t num_skipped = 0;
+ while (iter_.Valid()) {
+ ParsedInternalKey ikey;
+ if (!ParseKey(&ikey)) {
+ return false;
+ }
+
+ if (CompareKeyForSkip(ikey.user_key, saved_key_.GetUserKey()) < 0) {
+ return true;
+ }
+
+ if (TooManyInternalKeysSkipped()) {
+ return false;
+ }
+
+ assert(ikey.sequence != kMaxSequenceNumber);
+ assert(ikey.user_key.size() >= timestamp_size_);
+ Slice ts;
+ if (timestamp_size_ > 0) {
+ ts = Slice(ikey.user_key.data() + ikey.user_key.size() - timestamp_size_,
+ timestamp_size_);
+ }
+ if (!IsVisible(ikey.sequence, ts)) {
+ PERF_COUNTER_ADD(internal_recent_skipped_count, 1);
+ } else {
+ PERF_COUNTER_ADD(internal_key_skipped_count, 1);
+ }
+
+ if (num_skipped >= max_skip_) {
+ num_skipped = 0;
+ IterKey last_key;
+ ParsedInternalKey pikey(saved_key_.GetUserKey(), kMaxSequenceNumber,
+ kValueTypeForSeek);
+ if (timestamp_size_ > 0) {
+ // TODO: pre-create kTsMax.
+ const std::string kTsMax(timestamp_size_, '\xff');
+ pikey.SetTimestamp(kTsMax);
+ }
+ last_key.SetInternalKey(pikey);
+ // It would be more efficient to use SeekForPrev() here, but some
+ // iterators may not support it.
+ iter_.Seek(last_key.GetInternalKey());
+ RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION);
+ if (!iter_.Valid()) {
+ break;
+ }
+ } else {
+ ++num_skipped;
+ }
+
+ iter_.Prev();
+ }
+
+ if (!iter_.status().ok()) {
+ valid_ = false;
+ return false;
+ }
+
+ return true;
+}
+
+bool DBIter::TooManyInternalKeysSkipped(bool increment) {
+ if ((max_skippable_internal_keys_ > 0) &&
+ (num_internal_keys_skipped_ > max_skippable_internal_keys_)) {
+ valid_ = false;
+ status_ = Status::Incomplete("Too many internal keys skipped.");
+ return true;
+ } else if (increment) {
+ num_internal_keys_skipped_++;
+ }
+ return false;
+}
+
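TooManyInternalKeysSkipped() backs ReadOptions::max_skippable_internal_keys: once the limit is exceeded, the iterator invalidates itself with Status::Incomplete instead of scanning an unbounded run of invisible keys. A minimal sketch of how a caller might opt in and react follows; the limit value and the function name are illustrative.

// Illustrative sketch only: bounding tombstone skipping from the caller side.
#include <cassert>
#include <memory>

#include "rocksdb/db.h"
#include "rocksdb/options.h"

void ScanWithSkipLimit(rocksdb::DB* db) {
  rocksdb::ReadOptions ro;
  // Give up after skipping this many invisible or deleted internal keys.
  ro.max_skippable_internal_keys = 10000;

  std::unique_ptr<rocksdb::Iterator> it(db->NewIterator(ro));
  for (it->SeekToFirst(); it->Valid(); it->Next()) {
    // Consume it->key() / it->value().
  }
  if (it->status().IsIncomplete()) {
    // The iterator hit the skip limit; the caller may retry later or fall
    // back to an unbounded scan.
  } else {
    assert(it->status().ok());
  }
}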
+bool DBIter::IsVisible(SequenceNumber sequence, const Slice& ts,
+ bool* more_recent) {
+ // Remember that the comparator orders entries with larger timestamps first.
+ // TODO(yanqin): support timestamp in read_callback_.
+ bool visible_by_seq = (read_callback_ == nullptr)
+ ? sequence <= sequence_
+ : read_callback_->IsVisible(sequence);
+
+ bool visible_by_ts =
+ (timestamp_ub_ == nullptr ||
+ user_comparator_.CompareTimestamp(ts, *timestamp_ub_) <= 0) &&
+ (timestamp_lb_ == nullptr ||
+ user_comparator_.CompareTimestamp(ts, *timestamp_lb_) >= 0);
+
+ if (more_recent) {
+ *more_recent = !visible_by_seq;
+ }
+ return visible_by_seq && visible_by_ts;
+}
+
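IsVisible() combines two independent filters: sequence-number visibility (snapshot or read callback) and timestamp-range visibility. The same predicate on plain types, assuming bytewise-comparable timestamps and no read callback, looks roughly like this sketch (not the production code path):

// Illustrative sketch only: the visibility predicate on plain types.
#include <cstdint>
#include <string>

bool IsVisibleForRead(uint64_t entry_seq, const std::string& entry_ts,
                      uint64_t snapshot_seq, const std::string* ts_upper_bound,
                      const std::string* ts_lower_bound) {
  const bool visible_by_seq = entry_seq <= snapshot_seq;
  const bool visible_by_ts =
      (ts_upper_bound == nullptr || entry_ts <= *ts_upper_bound) &&
      (ts_lower_bound == nullptr || entry_ts >= *ts_lower_bound);
  return visible_by_seq && visible_by_ts;
}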
+void DBIter::SetSavedKeyToSeekTarget(const Slice& target) {
+ is_key_seqnum_zero_ = false;
+ SequenceNumber seq = sequence_;
+ saved_key_.Clear();
+ saved_key_.SetInternalKey(target, seq, kValueTypeForSeek, timestamp_ub_);
+
+ if (iterate_lower_bound_ != nullptr &&
+ user_comparator_.CompareWithoutTimestamp(
+ saved_key_.GetUserKey(), /*a_has_ts=*/true, *iterate_lower_bound_,
+ /*b_has_ts=*/false) < 0) {
+ // Seek key is smaller than the lower bound.
+ saved_key_.Clear();
+ saved_key_.SetInternalKey(*iterate_lower_bound_, seq, kValueTypeForSeek,
+ timestamp_ub_);
+ }
+}
+
+void DBIter::SetSavedKeyToSeekForPrevTarget(const Slice& target) {
+ is_key_seqnum_zero_ = false;
+ saved_key_.Clear();
+ // Now saved_key_ is used to store the internal key.
+ saved_key_.SetInternalKey(target, 0 /* sequence_number */,
+ kValueTypeForSeekForPrev, timestamp_ub_);
+
+ if (timestamp_size_ > 0) {
+ const std::string kTsMin(timestamp_size_, '\0');
+ Slice ts = kTsMin;
+ saved_key_.UpdateInternalKey(
+ /*seq=*/0, kValueTypeForSeekForPrev,
+ timestamp_lb_ == nullptr ? &ts : timestamp_lb_);
+ }
+
+ if (iterate_upper_bound_ != nullptr &&
+ user_comparator_.CompareWithoutTimestamp(
+ saved_key_.GetUserKey(), /*a_has_ts=*/true, *iterate_upper_bound_,
+ /*b_has_ts=*/false) >= 0) {
+ saved_key_.Clear();
+ saved_key_.SetInternalKey(*iterate_upper_bound_, kMaxSequenceNumber,
+ kValueTypeForSeekForPrev, timestamp_ub_);
+ if (timestamp_size_ > 0) {
+ const std::string kTsMax(timestamp_size_, '\xff');
+ Slice ts = kTsMax;
+ saved_key_.UpdateInternalKey(
+ kMaxSequenceNumber, kValueTypeForSeekForPrev,
+ timestamp_lb_ != nullptr ? timestamp_lb_ : &ts);
+ }
+ }
+}
+
+void DBIter::Seek(const Slice& target) {
+ PERF_CPU_TIMER_GUARD(iter_seek_cpu_nanos, clock_);
+ StopWatch sw(clock_, statistics_, DB_SEEK);
+
+#ifndef ROCKSDB_LITE
+ if (db_impl_ != nullptr && cfd_ != nullptr) {
+ // TODO: What do we do if this returns an error?
+ Slice lower_bound, upper_bound;
+ if (iterate_lower_bound_ != nullptr) {
+ lower_bound = *iterate_lower_bound_;
+ } else {
+ lower_bound = Slice("");
+ }
+ if (iterate_upper_bound_ != nullptr) {
+ upper_bound = *iterate_upper_bound_;
+ } else {
+ upper_bound = Slice("");
+ }
+ db_impl_->TraceIteratorSeek(cfd_->GetID(), target, lower_bound, upper_bound)
+ .PermitUncheckedError();
+ }
+#endif // ROCKSDB_LITE
+
+ status_ = Status::OK();
+ ReleaseTempPinnedData();
+ ResetBlobValue();
+ ResetValueAndColumns();
+ ResetInternalKeysSkippedCounter();
+
+ // Seek the inner iterator based on the target key.
+ {
+ PERF_TIMER_GUARD(seek_internal_seek_time);
+
+ SetSavedKeyToSeekTarget(target);
+ iter_.Seek(saved_key_.GetInternalKey());
+
+ RecordTick(statistics_, NUMBER_DB_SEEK);
+ }
+ if (!iter_.Valid()) {
+ valid_ = false;
+ return;
+ }
+ direction_ = kForward;
+
+ // Now the inner iterator is positioned at the target. From there,
+ // we need to find the next key that is visible to the user.
+ ClearSavedValue();
+ if (prefix_same_as_start_) {
+ // In this case the iterator needs to be invalidated once it has exhausted
+ // all keys sharing the seek key's prefix.
+ assert(prefix_extractor_ != nullptr);
+ Slice target_prefix = prefix_extractor_->Transform(target);
+ FindNextUserEntry(false /* not skipping saved_key */,
+ &target_prefix /* prefix */);
+ if (valid_) {
+ // Remember the prefix of the seek key for the future Next() call to
+ // check.
+ prefix_.SetUserKey(target_prefix);
+ }
+ } else {
+ FindNextUserEntry(false /* not skipping saved_key */, nullptr);
+ }
+ if (!valid_) {
+ return;
+ }
+
+ // Updating stats and perf context counters.
+ if (statistics_ != nullptr) {
+ // Decrement since we don't want to count this key as skipped
+ RecordTick(statistics_, NUMBER_DB_SEEK_FOUND);
+ RecordTick(statistics_, ITER_BYTES_READ, key().size() + value().size());
+ }
+ PERF_COUNTER_ADD(iter_read_bytes, key().size() + value().size());
+}
+
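The prefix bookkeeping in Seek() is only active when ReadOptions::prefix_same_as_start is set and the column family has a prefix extractor. A rough sketch of the corresponding user-side configuration follows; the 4-byte fixed prefix and the function name are assumptions for illustration.

// Illustrative sketch only: enabling prefix_same_as_start_ from the user side.
#include <memory>

#include "rocksdb/db.h"
#include "rocksdb/options.h"
#include "rocksdb/slice_transform.h"

void ScanOnePrefix(rocksdb::DB* db, const rocksdb::Slice& seek_key) {
  // The column family must have been opened with a prefix extractor, e.g.:
  //   options.prefix_extractor.reset(rocksdb::NewFixedPrefixTransform(4));
  rocksdb::ReadOptions ro;
  ro.prefix_same_as_start = true;  // iterator becomes !Valid() past the prefix

  std::unique_ptr<rocksdb::Iterator> it(db->NewIterator(ro));
  for (it->Seek(seek_key); it->Valid(); it->Next()) {
    // All keys seen here share seek_key's prefix.
  }
}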
+void DBIter::SeekForPrev(const Slice& target) {
+ PERF_CPU_TIMER_GUARD(iter_seek_cpu_nanos, clock_);
+ StopWatch sw(clock_, statistics_, DB_SEEK);
+
+#ifndef ROCKSDB_LITE
+ if (db_impl_ != nullptr && cfd_ != nullptr) {
+ // TODO: What do we do if this returns an error?
+ Slice lower_bound, upper_bound;
+ if (iterate_lower_bound_ != nullptr) {
+ lower_bound = *iterate_lower_bound_;
+ } else {
+ lower_bound = Slice("");
+ }
+ if (iterate_upper_bound_ != nullptr) {
+ upper_bound = *iterate_upper_bound_;
+ } else {
+ upper_bound = Slice("");
+ }
+ db_impl_
+ ->TraceIteratorSeekForPrev(cfd_->GetID(), target, lower_bound,
+ upper_bound)
+ .PermitUncheckedError();
+ }
+#endif // ROCKSDB_LITE
+
+ status_ = Status::OK();
+ ReleaseTempPinnedData();
+ ResetBlobValue();
+ ResetValueAndColumns();
+ ResetInternalKeysSkippedCounter();
+
+ // Seek the inner iterator based on the target key.
+ {
+ PERF_TIMER_GUARD(seek_internal_seek_time);
+ SetSavedKeyToSeekForPrevTarget(target);
+ iter_.SeekForPrev(saved_key_.GetInternalKey());
+ RecordTick(statistics_, NUMBER_DB_SEEK);
+ }
+ if (!iter_.Valid()) {
+ valid_ = false;
+ return;
+ }
+ direction_ = kReverse;
+
+ // Now the inner iterator is positioned at the target. From there,
+ // we need to find the first key that is visible to the user in the
+ // backward direction.
+ ClearSavedValue();
+ if (prefix_same_as_start_) {
+ // In this case the iterator needs to be invalidated once it has exhausted
+ // all keys sharing the seek key's prefix.
+ assert(prefix_extractor_ != nullptr);
+ Slice target_prefix = prefix_extractor_->Transform(target);
+ PrevInternal(&target_prefix);
+ if (valid_) {
+ // Remember the prefix of the seek key for the future Prev() call to
+ // check.
+ prefix_.SetUserKey(target_prefix);
+ }
+ } else {
+ PrevInternal(nullptr);
+ }
+
+ // Report stats and perf context.
+ if (statistics_ != nullptr && valid_) {
+ RecordTick(statistics_, NUMBER_DB_SEEK_FOUND);
+ RecordTick(statistics_, ITER_BYTES_READ, key().size() + value().size());
+ PERF_COUNTER_ADD(iter_read_bytes, key().size() + value().size());
+ }
+}
+
+void DBIter::SeekToFirst() {
+ if (iterate_lower_bound_ != nullptr) {
+ Seek(*iterate_lower_bound_);
+ return;
+ }
+ PERF_CPU_TIMER_GUARD(iter_seek_cpu_nanos, clock_);
+ // Don't use iter_.Seek() if we set a prefix extractor,
+ // because prefix seek will be used.
+ if (!expect_total_order_inner_iter()) {
+ max_skip_ = std::numeric_limits<uint64_t>::max();
+ }
+ status_ = Status::OK();
+ // if iterator is empty, this status_ could be unchecked.
+ status_.PermitUncheckedError();
+ direction_ = kForward;
+ ReleaseTempPinnedData();
+ ResetBlobValue();
+ ResetValueAndColumns();
+ ResetInternalKeysSkippedCounter();
+ ClearSavedValue();
+ is_key_seqnum_zero_ = false;
+
+ {
+ PERF_TIMER_GUARD(seek_internal_seek_time);
+ iter_.SeekToFirst();
+ }
+
+ RecordTick(statistics_, NUMBER_DB_SEEK);
+ if (iter_.Valid()) {
+ saved_key_.SetUserKey(
+ ExtractUserKey(iter_.key()),
+ !iter_.iter()->IsKeyPinned() || !pin_thru_lifetime_ /* copy */);
+ FindNextUserEntry(false /* not skipping saved_key */,
+ nullptr /* no prefix check */);
+ if (statistics_ != nullptr) {
+ if (valid_) {
+ RecordTick(statistics_, NUMBER_DB_SEEK_FOUND);
+ RecordTick(statistics_, ITER_BYTES_READ, key().size() + value().size());
+ PERF_COUNTER_ADD(iter_read_bytes, key().size() + value().size());
+ }
+ }
+ } else {
+ valid_ = false;
+ }
+ if (valid_ && prefix_same_as_start_) {
+ assert(prefix_extractor_ != nullptr);
+ prefix_.SetUserKey(prefix_extractor_->Transform(
+ StripTimestampFromUserKey(saved_key_.GetUserKey(), timestamp_size_)));
+ }
+}
+
+void DBIter::SeekToLast() {
+ if (iterate_upper_bound_ != nullptr) {
+ // Seek to last key strictly less than ReadOptions.iterate_upper_bound.
+ SeekForPrev(*iterate_upper_bound_);
+ const bool is_ikey = (timestamp_size_ > 0 && timestamp_lb_ != nullptr);
+ Slice k = Valid() ? key() : Slice();
+ if (is_ikey && Valid()) {
+ k.remove_suffix(kNumInternalBytes + timestamp_size_);
+ }
+ while (Valid() && 0 == user_comparator_.CompareWithoutTimestamp(
+ *iterate_upper_bound_, /*a_has_ts=*/false, k,
+ /*b_has_ts=*/false)) {
+ ReleaseTempPinnedData();
+ ResetBlobValue();
+ ResetValueAndColumns();
+ PrevInternal(nullptr);
+
+ k = key();
+ if (is_ikey) {
+ k.remove_suffix(kNumInternalBytes + timestamp_size_);
+ }
+ }
+ return;
+ }
+
+ PERF_CPU_TIMER_GUARD(iter_seek_cpu_nanos, clock_);
+ // Don't use iter_.Seek() if we set a prefix extractor,
+ // because prefix seek will be used.
+ if (!expect_total_order_inner_iter()) {
+ max_skip_ = std::numeric_limits<uint64_t>::max();
+ }
+ status_ = Status::OK();
+ // if iterator is empty, this status_ could be unchecked.
+ status_.PermitUncheckedError();
+ direction_ = kReverse;
+ ReleaseTempPinnedData();
+ ResetBlobValue();
+ ResetValueAndColumns();
+ ResetInternalKeysSkippedCounter();
+ ClearSavedValue();
+ is_key_seqnum_zero_ = false;
+
+ {
+ PERF_TIMER_GUARD(seek_internal_seek_time);
+ iter_.SeekToLast();
+ }
+ PrevInternal(nullptr);
+ if (statistics_ != nullptr) {
+ RecordTick(statistics_, NUMBER_DB_SEEK);
+ if (valid_) {
+ RecordTick(statistics_, NUMBER_DB_SEEK_FOUND);
+ RecordTick(statistics_, ITER_BYTES_READ, key().size() + value().size());
+ PERF_COUNTER_ADD(iter_read_bytes, key().size() + value().size());
+ }
+ }
+ if (valid_ && prefix_same_as_start_) {
+ assert(prefix_extractor_ != nullptr);
+ prefix_.SetUserKey(prefix_extractor_->Transform(
+ StripTimestampFromUserKey(saved_key_.GetUserKey(), timestamp_size_)));
+ }
+}
+
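As the code above shows, SeekToLast() with ReadOptions::iterate_upper_bound positions the iterator on the last key strictly below the bound. A small user-side sketch of that effect, with an illustrative bound:

// Illustrative sketch only: SeekToLast() with an exclusive upper bound.
#include <memory>

#include "rocksdb/db.h"
#include "rocksdb/options.h"

void LastKeyBelowBound(rocksdb::DB* db) {
  rocksdb::Slice upper("k50");      // exclusive upper bound (illustrative key)
  rocksdb::ReadOptions ro;
  ro.iterate_upper_bound = &upper;  // must outlive the iterator

  std::unique_ptr<rocksdb::Iterator> it(db->NewIterator(ro));
  it->SeekToLast();
  // If it->Valid(), it->key() is the largest key strictly less than "k50".
}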
+Iterator* NewDBIterator(Env* env, const ReadOptions& read_options,
+ const ImmutableOptions& ioptions,
+ const MutableCFOptions& mutable_cf_options,
+ const Comparator* user_key_comparator,
+ InternalIterator* internal_iter, const Version* version,
+ const SequenceNumber& sequence,
+ uint64_t max_sequential_skip_in_iterations,
+ ReadCallback* read_callback, DBImpl* db_impl,
+ ColumnFamilyData* cfd, bool expose_blob_index) {
+ DBIter* db_iter =
+ new DBIter(env, read_options, ioptions, mutable_cf_options,
+ user_key_comparator, internal_iter, version, sequence, false,
+ max_sequential_skip_in_iterations, read_callback, db_impl, cfd,
+ expose_blob_index);
+ return db_iter;
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/db_iter.h b/src/rocksdb/db/db_iter.h
new file mode 100644
index 000000000..e87c2b4c9
--- /dev/null
+++ b/src/rocksdb/db/db_iter.h
@@ -0,0 +1,420 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <cstdint>
+#include <string>
+
+#include "db/db_impl/db_impl.h"
+#include "db/range_del_aggregator.h"
+#include "memory/arena.h"
+#include "options/cf_options.h"
+#include "rocksdb/db.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/wide_columns.h"
+#include "table/iterator_wrapper.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+class Version;
+
+// This file declares the factory function NewDBIterator for DBIter; an
+// arena-allocated wrapper, ArenaWrappedDBIter, is defined in
+// arena_wrapped_db_iter.h.
+// Class DBIter, declared below and implemented in db_iter.cc, is an iterator
+// that converts internal keys (yielded by an InternalIterator) that were live
+// at the specified sequence number into appropriate user keys.
+// Each internal key consists of a user key, a sequence number, and a value
+// type. DBIter deals with multiple key versions, tombstones, merge operands,
+// etc., and exposes an Iterator.
+// For example, DBIter may wrap the following InternalIterator:
+// user key: AAA value: v3 seqno: 100 type: Put
+// user key: AAA value: v2 seqno: 97 type: Put
+// user key: AAA value: v1 seqno: 95 type: Put
+// user key: BBB value: v1 seqno: 90 type: Put
+// user key: BBC value: N/A seqno: 98 type: Delete
+// user key: BBC value: v1 seqno: 95 type: Put
+// If the snapshot passed in is 102, then the DBIter is expected to
+// expose the following iterator:
+// key: AAA value: v3
+// key: BBB value: v1
+// If the snapshot passed in is 96, then it should expose:
+// key: AAA value: v1
+// key: BBB value: v1
+// key: BBC value: v1
+//
+
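The sequence-number example above can be reproduced through the public API: an iterator created with ReadOptions::snapshot only sees versions that were live at that snapshot. A rough sketch follows; the keys, values, and DB path are illustrative, and error handling is reduced to an assert.

// Illustrative sketch only: observing the snapshot behavior described above.
#include <cassert>
#include <memory>

#include "rocksdb/db.h"
#include "rocksdb/options.h"

void SnapshotExample() {
  rocksdb::DB* raw = nullptr;
  rocksdb::Options options;
  options.create_if_missing = true;
  rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/dbiter_example", &raw);
  assert(s.ok());
  std::unique_ptr<rocksdb::DB> db(raw);

  rocksdb::WriteOptions wo;
  db->Put(wo, "AAA", "v1");
  db->Put(wo, "BBB", "v1");
  db->Put(wo, "BBC", "v1");
  const rocksdb::Snapshot* snap = db->GetSnapshot();  // like "seqno 96" above
  db->Delete(wo, "BBC");
  db->Put(wo, "AAA", "v2");
  db->Put(wo, "AAA", "v3");

  rocksdb::ReadOptions ro;
  ro.snapshot = snap;
  std::unique_ptr<rocksdb::Iterator> it(db->NewIterator(ro));
  for (it->SeekToFirst(); it->Valid(); it->Next()) {
    // Yields AAA=>v1, BBB=>v1, BBC=>v1: the versions live at the snapshot.
  }
  db->ReleaseSnapshot(snap);
}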
+// Memtables and sstables that make up the DB representation contain
+// (userkey,seq,type) => uservalue entries. DBIter
+// combines multiple entries for the same userkey found in the DB
+// representation into a single entry while accounting for sequence
+// numbers, deletion markers, overwrites, etc.
+class DBIter final : public Iterator {
+ public:
+ // The following is grossly complicated. TODO: clean it up
+ // Which direction is the iterator currently moving?
+ // (1) When moving forward:
+ // (1a) if current_entry_is_merged_ = false, the internal iterator is
+ // positioned at the exact entry that yields this->key(), this->value()
+ // (1b) if current_entry_is_merged_ = true, the internal iterator is
+ // positioned immediately after the last entry that contributed to the
+ // current this->value(). That entry may or may not have key equal to
+ // this->key().
+ // (2) When moving backwards, the internal iterator is positioned
+ // just before all entries whose user key == this->key().
+ enum Direction : uint8_t { kForward, kReverse };
+
+ // LocalStatistics contains Statistics counters that are aggregated per
+ // iterator instance and then sent to the global statistics when the
+ // iterator is destroyed.
+ //
+ // The purpose of this approach is to avoid the perf regression that happens
+ // when multiple threads bump the atomic counters from a DBIter::Next().
+ struct LocalStatistics {
+ explicit LocalStatistics() { ResetCounters(); }
+
+ void ResetCounters() {
+ next_count_ = 0;
+ next_found_count_ = 0;
+ prev_count_ = 0;
+ prev_found_count_ = 0;
+ bytes_read_ = 0;
+ skip_count_ = 0;
+ }
+
+ void BumpGlobalStatistics(Statistics* global_statistics) {
+ RecordTick(global_statistics, NUMBER_DB_NEXT, next_count_);
+ RecordTick(global_statistics, NUMBER_DB_NEXT_FOUND, next_found_count_);
+ RecordTick(global_statistics, NUMBER_DB_PREV, prev_count_);
+ RecordTick(global_statistics, NUMBER_DB_PREV_FOUND, prev_found_count_);
+ RecordTick(global_statistics, ITER_BYTES_READ, bytes_read_);
+ RecordTick(global_statistics, NUMBER_ITER_SKIP, skip_count_);
+ PERF_COUNTER_ADD(iter_read_bytes, bytes_read_);
+ ResetCounters();
+ }
+
+ // Map to Tickers::NUMBER_DB_NEXT
+ uint64_t next_count_;
+ // Map to Tickers::NUMBER_DB_NEXT_FOUND
+ uint64_t next_found_count_;
+ // Map to Tickers::NUMBER_DB_PREV
+ uint64_t prev_count_;
+ // Map to Tickers::NUMBER_DB_PREV_FOUND
+ uint64_t prev_found_count_;
+ // Map to Tickers::ITER_BYTES_READ
+ uint64_t bytes_read_;
+ // Map to Tickers::NUMBER_ITER_SKIP
+ uint64_t skip_count_;
+ };
+
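The same "accumulate locally, flush once" idea, stripped of the RocksDB types, looks roughly like the sketch below; the global atomic counter and the class name are illustrative stand-ins for Statistics tickers.

// Illustrative sketch only: batch counters locally, flush once on destruction.
#include <atomic>
#include <cstdint>

std::atomic<uint64_t> g_next_calls{0};

class LocalCounters {
 public:
  void BumpNext() { ++next_calls_; }  // no atomic traffic on the hot path
  ~LocalCounters() {
    // One atomic add per iterator lifetime instead of one per operation.
    g_next_calls.fetch_add(next_calls_, std::memory_order_relaxed);
  }

 private:
  uint64_t next_calls_ = 0;
};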
+ DBIter(Env* _env, const ReadOptions& read_options,
+ const ImmutableOptions& ioptions,
+ const MutableCFOptions& mutable_cf_options, const Comparator* cmp,
+ InternalIterator* iter, const Version* version, SequenceNumber s,
+ bool arena_mode, uint64_t max_sequential_skip_in_iterations,
+ ReadCallback* read_callback, DBImpl* db_impl, ColumnFamilyData* cfd,
+ bool expose_blob_index);
+
+ // No copying allowed
+ DBIter(const DBIter&) = delete;
+ void operator=(const DBIter&) = delete;
+
+ ~DBIter() override {
+ // Release pinned data if any
+ if (pinned_iters_mgr_.PinningEnabled()) {
+ pinned_iters_mgr_.ReleasePinnedData();
+ }
+ RecordTick(statistics_, NO_ITERATOR_DELETED);
+ ResetInternalKeysSkippedCounter();
+ local_stats_.BumpGlobalStatistics(statistics_);
+ iter_.DeleteIter(arena_mode_);
+ }
+ void SetIter(InternalIterator* iter) {
+ assert(iter_.iter() == nullptr);
+ iter_.Set(iter);
+ iter_.iter()->SetPinnedItersMgr(&pinned_iters_mgr_);
+ }
+
+ bool Valid() const override {
+#ifdef ROCKSDB_ASSERT_STATUS_CHECKED
+ if (valid_) {
+ status_.PermitUncheckedError();
+ }
+#endif // ROCKSDB_ASSERT_STATUS_CHECKED
+ return valid_;
+ }
+ Slice key() const override {
+ assert(valid_);
+ if (timestamp_lb_) {
+ return saved_key_.GetInternalKey();
+ } else {
+ const Slice ukey_and_ts = saved_key_.GetUserKey();
+ return Slice(ukey_and_ts.data(), ukey_and_ts.size() - timestamp_size_);
+ }
+ }
+ Slice value() const override {
+ assert(valid_);
+
+ return value_;
+ }
+
+ const WideColumns& columns() const override {
+ assert(valid_);
+
+ return wide_columns_;
+ }
+
+ Status status() const override {
+ if (status_.ok()) {
+ return iter_.status();
+ } else {
+ assert(!valid_);
+ return status_;
+ }
+ }
+ Slice timestamp() const override {
+ assert(valid_);
+ assert(timestamp_size_ > 0);
+ if (direction_ == kReverse) {
+ return saved_timestamp_;
+ }
+ const Slice ukey_and_ts = saved_key_.GetUserKey();
+ assert(timestamp_size_ < ukey_and_ts.size());
+ return ExtractTimestampFromUserKey(ukey_and_ts, timestamp_size_);
+ }
+ bool IsBlob() const {
+ assert(valid_);
+ return is_blob_;
+ }
+
+ Status GetProperty(std::string prop_name, std::string* prop) override;
+
+ void Next() final override;
+ void Prev() final override;
+ // 'target' does not contain timestamp, even if user timestamp feature is
+ // enabled.
+ void Seek(const Slice& target) final override;
+ void SeekForPrev(const Slice& target) final override;
+ void SeekToFirst() final override;
+ void SeekToLast() final override;
+ Env* env() const { return env_; }
+ void set_sequence(uint64_t s) {
+ sequence_ = s;
+ if (read_callback_) {
+ read_callback_->Refresh(s);
+ }
+ }
+ void set_valid(bool v) { valid_ = v; }
+
+ private:
+ // For all methods in this block:
+ // PRE: iter_.Valid() && status_.ok()
+ // Return false if there was an error; in that case status() is non-ok and
+ // valid_ is false, and callers would usually stop what they were doing and
+ // return.
+ bool ReverseToForward();
+ bool ReverseToBackward();
+ // Set saved_key_ to the seek target, with the proper sequence number set.
+ // It might get adjusted if the seek key is smaller than the iterator's lower
+ // bound. `target` does not contain a timestamp.
+ void SetSavedKeyToSeekTarget(const Slice& target);
+ // Set saved_key_ to the seek target, with the proper sequence number set.
+ // It might get adjusted if the seek key is larger than the iterator's upper
+ // bound. `target` does not contain a timestamp.
+ void SetSavedKeyToSeekForPrevTarget(const Slice& target);
+ bool FindValueForCurrentKey();
+ bool FindValueForCurrentKeyUsingSeek();
+ bool FindUserKeyBeforeSavedKey();
+ // If `skipping_saved_key` is true, the function will keep iterating until it
+ // finds a user key that is larger than `saved_key_`.
+ // If `prefix` is not null, the function stops once all keys for the prefix
+ // are exhausted, and the iterator is set to invalid.
+ bool FindNextUserEntry(bool skipping_saved_key, const Slice* prefix);
+ // Internal implementation of FindNextUserEntry().
+ bool FindNextUserEntryInternal(bool skipping_saved_key, const Slice* prefix);
+ bool ParseKey(ParsedInternalKey* key);
+ bool MergeValuesNewToOld();
+
+ // If prefix is not null, we need to set the iterator to invalid if no more
+ // entries can be found within the prefix.
+ void PrevInternal(const Slice* prefix);
+ bool TooManyInternalKeysSkipped(bool increment = true);
+ bool IsVisible(SequenceNumber sequence, const Slice& ts,
+ bool* more_recent = nullptr);
+
+ // Temporarily pin the blocks that we encounter until ReleaseTempPinnedData()
+ // is called
+ void TempPinData() {
+ if (!pin_thru_lifetime_) {
+ pinned_iters_mgr_.StartPinning();
+ }
+ }
+
+ // Release blocks pinned by TempPinData()
+ void ReleaseTempPinnedData() {
+ if (!pin_thru_lifetime_ && pinned_iters_mgr_.PinningEnabled()) {
+ pinned_iters_mgr_.ReleasePinnedData();
+ }
+ }
+
+ inline void ClearSavedValue() {
+ if (saved_value_.capacity() > 1048576) {
+ std::string empty;
+ swap(empty, saved_value_);
+ } else {
+ saved_value_.clear();
+ }
+ }
+
+ inline void ResetInternalKeysSkippedCounter() {
+ local_stats_.skip_count_ += num_internal_keys_skipped_;
+ if (valid_) {
+ local_stats_.skip_count_--;
+ }
+ num_internal_keys_skipped_ = 0;
+ }
+
+ bool expect_total_order_inner_iter() {
+ assert(expect_total_order_inner_iter_ || prefix_extractor_ != nullptr);
+ return expect_total_order_inner_iter_;
+ }
+
+ // If the lower bound of the timestamp is given by ReadOptions.iter_start_ts,
+ // we need to return multiple versions of the same key. We cannot simply skip
+ // entries whose user keys are equal when their timestamps differ but fall
+ // within the timestamp range.
+ inline int CompareKeyForSkip(const Slice& a, const Slice& b) {
+ return timestamp_lb_ != nullptr
+ ? user_comparator_.Compare(a, b)
+ : user_comparator_.CompareWithoutTimestamp(a, b);
+ }
+
+ // Retrieves the blob value for the specified user key using the given blob
+ // index when using the integrated BlobDB implementation.
+ bool SetBlobValueIfNeeded(const Slice& user_key, const Slice& blob_index);
+
+ void ResetBlobValue() {
+ is_blob_ = false;
+ blob_value_.Reset();
+ }
+
+ void SetValueAndColumnsFromPlain(const Slice& slice) {
+ assert(value_.empty());
+ assert(wide_columns_.empty());
+
+ value_ = slice;
+ wide_columns_.emplace_back(kDefaultWideColumnName, slice);
+ }
+
+ bool SetValueAndColumnsFromEntity(Slice slice);
+
+ void ResetValueAndColumns() {
+ value_.clear();
+ wide_columns_.clear();
+ }
+
+ // If user-defined timestamp is enabled, `user_key` includes timestamp.
+ bool Merge(const Slice* val, const Slice& user_key);
+ bool MergeEntity(const Slice& entity, const Slice& user_key);
+
+ const SliceTransform* prefix_extractor_;
+ Env* const env_;
+ SystemClock* clock_;
+ Logger* logger_;
+ UserComparatorWrapper user_comparator_;
+ const MergeOperator* const merge_operator_;
+ IteratorWrapper iter_;
+ const Version* version_;
+ ReadCallback* read_callback_;
+ // Max visible sequence number. It is normally the snapshot seq unless we have
+ // uncommitted data in db as in WriteUnCommitted.
+ SequenceNumber sequence_;
+
+ IterKey saved_key_;
+ // Reusable internal key data structure. This is only used inside one function
+ // and should not be used across functions. Reusing this object avoids the
+ // overhead of constructing it on every call.
+ ParsedInternalKey ikey_;
+ std::string saved_value_;
+ Slice pinned_value_;
+ // for prefix seek mode to support prev()
+ PinnableSlice blob_value_;
+ // Value of the default column
+ Slice value_;
+ // All columns (i.e. name-value pairs)
+ WideColumns wide_columns_;
+ Statistics* statistics_;
+ uint64_t max_skip_;
+ uint64_t max_skippable_internal_keys_;
+ uint64_t num_internal_keys_skipped_;
+ const Slice* iterate_lower_bound_;
+ const Slice* iterate_upper_bound_;
+
+ // The prefix of the seek key. It is only used when prefix_same_as_start_
+ // is true and the prefix extractor is not null. In Next() or Prev(), the
+ // current key will be checked against this prefix, so that the iterator can
+ // be invalidated once the keys in this prefix have been exhausted. Set it
+ // with SetUserKey() and read it with GetUserKey().
+ IterKey prefix_;
+
+ Status status_;
+ Direction direction_;
+ bool valid_;
+ bool current_entry_is_merged_;
+ // True if we know that the current entry's seqnum is 0.
+ // This information is used to infer that the next entry will be for another
+ // user key.
+ bool is_key_seqnum_zero_;
+ const bool prefix_same_as_start_;
+ // Means that we will pin all data blocks we read as long as the Iterator
+ // is not deleted; true if ReadOptions::pin_data is true.
+ const bool pin_thru_lifetime_;
+ // Expect the inner iterator to maintain a total order.
+ // prefix_extractor_ must be non-NULL if the value is false.
+ const bool expect_total_order_inner_iter_;
+ ReadTier read_tier_;
+ bool fill_cache_;
+ bool verify_checksums_;
+ // Whether the iterator is allowed to expose blob references. Set to true when
+ // the stacked BlobDB implementation is used, false otherwise.
+ bool expose_blob_index_;
+ bool is_blob_;
+ bool arena_mode_;
+ // List of operands for merge operator.
+ MergeContext merge_context_;
+ LocalStatistics local_stats_;
+ PinnedIteratorsManager pinned_iters_mgr_;
+#ifdef ROCKSDB_LITE
+ ROCKSDB_FIELD_UNUSED
+#endif
+ DBImpl* db_impl_;
+#ifdef ROCKSDB_LITE
+ ROCKSDB_FIELD_UNUSED
+#endif
+ ColumnFamilyData* cfd_;
+ const Slice* const timestamp_ub_;
+ const Slice* const timestamp_lb_;
+ const size_t timestamp_size_;
+ std::string saved_timestamp_;
+
+ // Used only if timestamp_lb_ is not nullptr.
+ std::string saved_ikey_;
+};
+
+// Return a new iterator that converts internal keys (yielded by
+// "*internal_iter") that were live at the specified `sequence` number
+// into appropriate user keys.
+extern Iterator* NewDBIterator(
+ Env* env, const ReadOptions& read_options, const ImmutableOptions& ioptions,
+ const MutableCFOptions& mutable_cf_options,
+ const Comparator* user_key_comparator, InternalIterator* internal_iter,
+ const Version* version, const SequenceNumber& sequence,
+ uint64_t max_sequential_skip_in_iterations, ReadCallback* read_callback,
+ DBImpl* db_impl = nullptr, ColumnFamilyData* cfd = nullptr,
+ bool expose_blob_index = false);
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/db_iter_stress_test.cc b/src/rocksdb/db/db_iter_stress_test.cc
new file mode 100644
index 000000000..872f7e6bd
--- /dev/null
+++ b/src/rocksdb/db/db_iter_stress_test.cc
@@ -0,0 +1,658 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/db_iter.h"
+#include "db/dbformat.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/options.h"
+#include "rocksdb/slice.h"
+#include "test_util/testharness.h"
+#include "util/random.h"
+#include "util/string_util.h"
+#include "utilities/merge_operators.h"
+
+#ifdef GFLAGS
+
+#include "util/gflags_compat.h"
+
+using GFLAGS_NAMESPACE::ParseCommandLineFlags;
+
+DEFINE_bool(verbose, false,
+ "Print huge, detailed trace. Intended for debugging failures.");
+
+#else
+
+void ParseCommandLineFlags(int*, char***, bool) {}
+bool FLAGS_verbose = false;
+
+#endif
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBIteratorStressTest : public testing::Test {
+ public:
+ Env* env_;
+
+ DBIteratorStressTest() : env_(Env::Default()) {}
+};
+
+namespace {
+
+struct Entry {
+ std::string key;
+ ValueType type; // kTypeValue, kTypeDeletion, kTypeMerge
+ uint64_t sequence;
+ std::string ikey; // internal key, made from `key`, `sequence` and `type`
+ std::string value;
+ // If false, we'll pretend that this entry doesn't exist.
+ bool visible = true;
+
+ bool operator<(const Entry& e) const {
+ if (key != e.key) return key < e.key;
+ return std::tie(sequence, type) > std::tie(e.sequence, e.type);
+ }
+};
+
+struct Data {
+ std::vector<Entry> entries;
+
+ // Indices in `entries` with `visible` = false.
+ std::vector<size_t> hidden;
+ // Keys of entries whose `visible` changed since the last seek of iterators.
+ std::set<std::string> recently_touched_keys;
+};
+
+struct StressTestIterator : public InternalIterator {
+ Data* data;
+ Random64* rnd;
+ InternalKeyComparator cmp;
+
+ // Each operation will return an error with this probability...
+ double error_probability = 0;
+ // ... and add/remove entries with this probability.
+ double mutation_probability = 0;
+ // The probability of adding vs removing entries will be chosen so that the
+ // fraction of hidden entries stays somewhat close to this number.
+ double target_hidden_fraction = 0;
+ // If true, print all mutations to stdout for debugging.
+ bool trace = false;
+
+ int iter = -1;
+ Status status_;
+
+ StressTestIterator(Data* _data, Random64* _rnd, const Comparator* _cmp)
+ : data(_data), rnd(_rnd), cmp(_cmp) {}
+
+ bool Valid() const override {
+ if (iter >= 0 && iter < (int)data->entries.size()) {
+ assert(status_.ok());
+ return true;
+ }
+ return false;
+ }
+
+ Status status() const override { return status_; }
+
+ bool MaybeFail() {
+ if (rnd->Next() >=
+ static_cast<double>(std::numeric_limits<uint64_t>::max()) *
+ error_probability) {
+ return false;
+ }
+ if (rnd->Next() % 2) {
+ status_ = Status::Incomplete("test");
+ } else {
+ status_ = Status::IOError("test");
+ }
+ if (trace) {
+ std::cout << "injecting " << status_.ToString() << std::endl;
+ }
+ iter = -1;
+ return true;
+ }
+
+ void MaybeMutate() {
+ if (rnd->Next() >=
+ static_cast<double>(std::numeric_limits<uint64_t>::max()) *
+ mutation_probability) {
+ return;
+ }
+ do {
+ // If too many entries are hidden, hide less, otherwise hide more.
+ double hide_probability =
+ data->hidden.size() > data->entries.size() * target_hidden_fraction
+ ? 1. / 3
+ : 2. / 3;
+ if (data->hidden.empty()) {
+ hide_probability = 1;
+ }
+ bool do_hide = rnd->Next() <
+ static_cast<double>(std::numeric_limits<uint64_t>::max()) *
+ hide_probability;
+ if (do_hide) {
+ // Hide a random entry.
+ size_t idx = rnd->Next() % data->entries.size();
+ Entry& e = data->entries[idx];
+ if (e.visible) {
+ if (trace) {
+ std::cout << "hiding idx " << idx << std::endl;
+ }
+ e.visible = false;
+ data->hidden.push_back(idx);
+ data->recently_touched_keys.insert(e.key);
+ } else {
+ // Already hidden. Let's go unhide something instead, just because
+ // it's easy and it doesn't really matter what we do.
+ do_hide = false;
+ }
+ }
+ if (!do_hide) {
+ // Unhide a random entry.
+ size_t hi = rnd->Next() % data->hidden.size();
+ size_t idx = data->hidden[hi];
+ if (trace) {
+ std::cout << "unhiding idx " << idx << std::endl;
+ }
+ Entry& e = data->entries[idx];
+ assert(!e.visible);
+ e.visible = true;
+ data->hidden[hi] = data->hidden.back();
+ data->hidden.pop_back();
+ data->recently_touched_keys.insert(e.key);
+ }
+ } while (rnd->Next() % 3 != 0); // do 3 mutations on average
+ }
+
+ void SkipForward() {
+ while (iter < (int)data->entries.size() && !data->entries[iter].visible) {
+ ++iter;
+ }
+ }
+ void SkipBackward() {
+ while (iter >= 0 && !data->entries[iter].visible) {
+ --iter;
+ }
+ }
+
+ void SeekToFirst() override {
+ if (MaybeFail()) return;
+ MaybeMutate();
+
+ status_ = Status::OK();
+ iter = 0;
+ SkipForward();
+ }
+ void SeekToLast() override {
+ if (MaybeFail()) return;
+ MaybeMutate();
+
+ status_ = Status::OK();
+ iter = (int)data->entries.size() - 1;
+ SkipBackward();
+ }
+
+ void Seek(const Slice& target) override {
+ if (MaybeFail()) return;
+ MaybeMutate();
+
+ status_ = Status::OK();
+ // Binary search.
+ auto it = std::partition_point(
+ data->entries.begin(), data->entries.end(),
+ [&](const Entry& e) { return cmp.Compare(e.ikey, target) < 0; });
+ iter = (int)(it - data->entries.begin());
+ SkipForward();
+ }
+ void SeekForPrev(const Slice& target) override {
+ if (MaybeFail()) return;
+ MaybeMutate();
+
+ status_ = Status::OK();
+ // Binary search.
+ auto it = std::partition_point(
+ data->entries.begin(), data->entries.end(),
+ [&](const Entry& e) { return cmp.Compare(e.ikey, target) <= 0; });
+ iter = (int)(it - data->entries.begin());
+ --iter;
+ SkipBackward();
+ }
+
+ void Next() override {
+ assert(Valid());
+ if (MaybeFail()) return;
+ MaybeMutate();
+ ++iter;
+ SkipForward();
+ }
+ void Prev() override {
+ assert(Valid());
+ if (MaybeFail()) return;
+ MaybeMutate();
+ --iter;
+ SkipBackward();
+ }
+
+ Slice key() const override {
+ assert(Valid());
+ return data->entries[iter].ikey;
+ }
+ Slice value() const override {
+ assert(Valid());
+ return data->entries[iter].value;
+ }
+
+ bool IsKeyPinned() const override { return true; }
+ bool IsValuePinned() const override { return true; }
+};
+
+// A small reimplementation of DBIter, supporting only some of the features,
+// and doing everything in O(log n).
+// Skips all keys that are in recently_touched_keys.
+struct ReferenceIterator {
+ Data* data;
+ uint64_t sequence; // ignore entries with sequence number below this
+
+ bool valid = false;
+ std::string key;
+ std::string value;
+
+ ReferenceIterator(Data* _data, uint64_t _sequence)
+ : data(_data), sequence(_sequence) {}
+
+ bool Valid() const { return valid; }
+
+ // Finds the first entry whose key is greater than / greater-or-equal to /
+ // less than / less-or-equal to `key`, depending on the arguments: if `skip`,
+ // the inequality is strict; if `forward`, it's greater / greater-or-equal,
+ // otherwise less / less-or-equal.
+ // Sets `key` to the result.
+ // If no such key exists, returns false. Doesn't check `visible`.
+ bool FindNextKey(bool skip, bool forward) {
+ valid = false;
+ auto it = std::partition_point(data->entries.begin(), data->entries.end(),
+ [&](const Entry& e) {
+ if (forward != skip) {
+ return e.key < key;
+ } else {
+ return e.key <= key;
+ }
+ });
+ if (forward) {
+ if (it != data->entries.end()) {
+ key = it->key;
+ return true;
+ }
+ } else {
+ if (it != data->entries.begin()) {
+ --it;
+ key = it->key;
+ return true;
+ }
+ }
+ return false;
+ }
+
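FindNextKey() and FindValueForCurrentKey() both rely on std::partition_point, which expects a predicate that is true for a prefix of the sorted range and returns the first element for which it is false. A tiny standalone demonstration with illustrative data:

// Illustrative sketch only: how std::partition_point locates the first key
// >= or > a target in a sorted range, as the binary searches above do.
#include <algorithm>
#include <cassert>
#include <string>
#include <vector>

int main() {
  const std::vector<std::string> keys = {"a", "b", "b", "d"};  // sorted

  // First element >= "b": predicate is "key < target".
  auto ge = std::partition_point(keys.begin(), keys.end(),
                                 [](const std::string& k) { return k < "b"; });
  assert(ge - keys.begin() == 1);

  // First element > "b": predicate is "key <= target".
  auto gt = std::partition_point(keys.begin(), keys.end(),
                                 [](const std::string& k) { return k <= "b"; });
  assert(gt - keys.begin() == 3);

  return 0;
}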
+ bool FindValueForCurrentKey() {
+ if (data->recently_touched_keys.count(key)) {
+ return false;
+ }
+
+ // Find the first entry for the key. The caller promises that it exists.
+ auto it = std::partition_point(data->entries.begin(), data->entries.end(),
+ [&](const Entry& e) {
+ if (e.key != key) {
+ return e.key < key;
+ }
+ return e.sequence > sequence;
+ });
+
+ // Find the first visible entry.
+ for (;; ++it) {
+ if (it == data->entries.end()) {
+ return false;
+ }
+ Entry& e = *it;
+ if (e.key != key) {
+ return false;
+ }
+ assert(e.sequence <= sequence);
+ if (!e.visible) continue;
+ if (e.type == kTypeDeletion) {
+ return false;
+ }
+ if (e.type == kTypeValue) {
+ value = e.value;
+ valid = true;
+ return true;
+ }
+ assert(e.type == kTypeMerge);
+ break;
+ }
+
+ // Collect merge operands.
+ std::vector<Slice> operands;
+ for (; it != data->entries.end(); ++it) {
+ Entry& e = *it;
+ if (e.key != key) {
+ break;
+ }
+ assert(e.sequence <= sequence);
+ if (!e.visible) continue;
+ if (e.type == kTypeDeletion) {
+ break;
+ }
+ operands.push_back(e.value);
+ if (e.type == kTypeValue) {
+ break;
+ }
+ }
+
+ // Do a merge.
+ value = operands.back().ToString();
+ for (int i = (int)operands.size() - 2; i >= 0; --i) {
+ value.append(",");
+ value.append(operands[i].data(), operands[i].size());
+ }
+
+ valid = true;
+ return true;
+ }
+
+ // Start at `key` and move until we encounter a valid value.
+ // `forward` defines the direction of movement.
+ // If `skip` is true, we're looking for a key not equal to `key`.
+ void DoTheThing(bool skip, bool forward) {
+ while (FindNextKey(skip, forward) && !FindValueForCurrentKey()) {
+ skip = true;
+ }
+ }
+
+ void Seek(const Slice& target) {
+ key = target.ToString();
+ DoTheThing(false, true);
+ }
+ void SeekForPrev(const Slice& target) {
+ key = target.ToString();
+ DoTheThing(false, false);
+ }
+ void SeekToFirst() { Seek(""); }
+ void SeekToLast() {
+ key = data->entries.back().key;
+ DoTheThing(false, false);
+ }
+ void Next() {
+ assert(Valid());
+ DoTheThing(true, true);
+ }
+ void Prev() {
+ assert(Valid());
+ DoTheThing(true, false);
+ }
+};
+
+} // anonymous namespace
+
+// Use an internal iterator that sometimes returns errors and sometimes
+// adds/removes entries on the fly. Do random operations on a DBIter and
+// check results.
+// TODO: can be improved for more coverage:
+// * Override IsKeyPinned() and IsValuePinned() to actually use
+// PinnedIteratorManager and check that there's no use-after free.
+// * Try different combinations of prefix_extractor, total_order_seek,
+// prefix_same_as_start, iterate_lower_bound, iterate_upper_bound.
+TEST_F(DBIteratorStressTest, StressTest) {
+ // We use a deterministic RNG, and everything happens in a single thread.
+ Random64 rnd(826909345792864532ll);
+
+ auto gen_key = [&](int max_key) {
+ assert(max_key > 0);
+ int len = 0;
+ int a = max_key;
+ while (a) {
+ a /= 10;
+ ++len;
+ }
+ std::string s = std::to_string(rnd.Next() % static_cast<uint64_t>(max_key));
+ s.insert(0, len - (int)s.size(), '0');
+ return s;
+ };
+
+ Options options;
+ options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
+ ReadOptions ropt;
+
+ size_t num_matching = 0;
+ size_t num_at_end = 0;
+ size_t num_not_ok = 0;
+ size_t num_recently_removed = 0;
+
+ // Number of iterations for each combination of parameters
+ // (there are ~250 of those).
+ // Tweak this to change the test run time.
+ // As of the time of writing, the test takes ~4 seconds for value of 5000.
+ const int num_iterations = 5000;
+ // Enable this to print all the operations for debugging.
+ bool trace = FLAGS_verbose;
+
+ for (int num_entries : {5, 10, 100}) {
+ for (double key_space : {0.1, 1.0, 3.0}) {
+ for (ValueType prevalent_entry_type :
+ {kTypeValue, kTypeDeletion, kTypeMerge}) {
+ for (double error_probability : {0.01, 0.1}) {
+ for (double mutation_probability : {0.01, 0.5}) {
+ for (double target_hidden_fraction : {0.1, 0.5}) {
+ std::string trace_str =
+ "entries: " + std::to_string(num_entries) +
+ ", key_space: " + std::to_string(key_space) +
+ ", error_probability: " + std::to_string(error_probability) +
+ ", mutation_probability: " +
+ std::to_string(mutation_probability) +
+ ", target_hidden_fraction: " +
+ std::to_string(target_hidden_fraction);
+ SCOPED_TRACE(trace_str);
+ if (trace) {
+ std::cout << trace_str << std::endl;
+ }
+
+ // Generate data.
+ Data data;
+ int max_key = (int)(num_entries * key_space) + 1;
+ for (int i = 0; i < num_entries; ++i) {
+ Entry e;
+ e.key = gen_key(max_key);
+ if (rnd.Next() % 10 != 0) {
+ e.type = prevalent_entry_type;
+ } else {
+ const ValueType types[] = {kTypeValue, kTypeDeletion,
+ kTypeMerge};
+ e.type =
+ types[rnd.Next() % (sizeof(types) / sizeof(types[0]))];
+ }
+ e.sequence = i;
+ e.value = "v" + std::to_string(i);
+ ParsedInternalKey internal_key(e.key, e.sequence, e.type);
+ AppendInternalKey(&e.ikey, internal_key);
+
+ data.entries.push_back(e);
+ }
+ std::sort(data.entries.begin(), data.entries.end());
+ if (trace) {
+ std::cout << "entries:";
+ for (size_t i = 0; i < data.entries.size(); ++i) {
+ Entry& e = data.entries[i];
+ std::cout << "\n idx " << i << ": \"" << e.key << "\": \""
+ << e.value << "\" seq: " << e.sequence << " type: "
+ << (e.type == kTypeValue ? "val"
+ : e.type == kTypeDeletion ? "del"
+ : "merge");
+ }
+ std::cout << std::endl;
+ }
+
+ std::unique_ptr<Iterator> db_iter;
+ std::unique_ptr<ReferenceIterator> ref_iter;
+ for (int iteration = 0; iteration < num_iterations; ++iteration) {
+ SCOPED_TRACE(iteration);
+ // Create a new iterator every ~30 operations.
+ if (db_iter == nullptr || rnd.Next() % 30 == 0) {
+ uint64_t sequence = rnd.Next() % (data.entries.size() + 2);
+ ref_iter.reset(new ReferenceIterator(&data, sequence));
+ if (trace) {
+ std::cout << "new iterator, seq: " << sequence << std::endl;
+ }
+
+ auto internal_iter =
+ new StressTestIterator(&data, &rnd, BytewiseComparator());
+ internal_iter->error_probability = error_probability;
+ internal_iter->mutation_probability = mutation_probability;
+ internal_iter->target_hidden_fraction =
+ target_hidden_fraction;
+ internal_iter->trace = trace;
+ db_iter.reset(NewDBIterator(
+ env_, ropt, ImmutableOptions(options),
+ MutableCFOptions(options), BytewiseComparator(),
+ internal_iter, nullptr /* version */, sequence,
+ options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+ }
+
+ // Do a random operation. It's important to do it on ref_iter
+ // later than on db_iter to make sure ref_iter sees the correct
+ // recently_touched_keys.
+ std::string old_key;
+ bool forward = rnd.Next() % 2 > 0;
+ // Do Next()/Prev() ~90% of the time.
+ bool seek = !ref_iter->Valid() || rnd.Next() % 10 == 0;
+ if (trace) {
+ std::cout << iteration << ": ";
+ }
+
+ if (!seek) {
+ assert(db_iter->Valid());
+ old_key = ref_iter->key;
+ if (trace) {
+ std::cout << (forward ? "Next" : "Prev") << std::endl;
+ }
+
+ if (forward) {
+ db_iter->Next();
+ ref_iter->Next();
+ } else {
+ db_iter->Prev();
+ ref_iter->Prev();
+ }
+ } else {
+ data.recently_touched_keys.clear();
+ // Do SeekToFirst less often than Seek.
+ if (rnd.Next() % 4 == 0) {
+ if (trace) {
+ std::cout << (forward ? "SeekToFirst" : "SeekToLast")
+ << std::endl;
+ }
+
+ if (forward) {
+ old_key = "";
+ db_iter->SeekToFirst();
+ ref_iter->SeekToFirst();
+ } else {
+ old_key = data.entries.back().key;
+ db_iter->SeekToLast();
+ ref_iter->SeekToLast();
+ }
+ } else {
+ old_key = gen_key(max_key);
+ if (trace) {
+ std::cout << (forward ? "Seek" : "SeekForPrev") << " \""
+ << old_key << '"' << std::endl;
+ }
+ if (forward) {
+ db_iter->Seek(old_key);
+ ref_iter->Seek(old_key);
+ } else {
+ db_iter->SeekForPrev(old_key);
+ ref_iter->SeekForPrev(old_key);
+ }
+ }
+ }
+
+ // Check the result.
+ if (db_iter->Valid()) {
+ ASSERT_TRUE(db_iter->status().ok());
+ if (data.recently_touched_keys.count(
+ db_iter->key().ToString())) {
+ // Ended on a key that may have been mutated during the
+ // operation. Reference iterator skips such keys, so we
+ // can't check the exact result.
+
+ // Check that the key moved in the right direction.
+ if (forward) {
+ if (seek)
+ ASSERT_GE(db_iter->key().ToString(), old_key);
+ else
+ ASSERT_GT(db_iter->key().ToString(), old_key);
+ } else {
+ if (seek)
+ ASSERT_LE(db_iter->key().ToString(), old_key);
+ else
+ ASSERT_LT(db_iter->key().ToString(), old_key);
+ }
+
+ if (ref_iter->Valid()) {
+ // Check that DBIter didn't miss any non-mutated key.
+ if (forward) {
+ ASSERT_LT(db_iter->key().ToString(), ref_iter->key);
+ } else {
+ ASSERT_GT(db_iter->key().ToString(), ref_iter->key);
+ }
+ }
+ // Tell the next iteration of the loop to reseek the
+ // iterators.
+ ref_iter->valid = false;
+
+ ++num_recently_removed;
+ } else {
+ ASSERT_TRUE(ref_iter->Valid());
+ ASSERT_EQ(ref_iter->key, db_iter->key().ToString());
+ ASSERT_EQ(ref_iter->value, db_iter->value());
+ ++num_matching;
+ }
+ } else if (db_iter->status().ok()) {
+ ASSERT_FALSE(ref_iter->Valid());
+ ++num_at_end;
+ } else {
+ // Non-ok status. Nothing to check here.
+ // Tell the next iteration of the loop to reseek the
+ // iterators.
+ ref_iter->valid = false;
+ ++num_not_ok;
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+
+ // Check that all cases were hit many times.
+ EXPECT_GT(num_matching, 10000);
+ EXPECT_GT(num_at_end, 10000);
+ EXPECT_GT(num_not_ok, 10000);
+ EXPECT_GT(num_recently_removed, 10000);
+
+ std::cout << "stats:\n exact matches: " << num_matching
+ << "\n end reached: " << num_at_end
+ << "\n non-ok status: " << num_not_ok
+ << "\n mutated on the fly: " << num_recently_removed << std::endl;
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ ParseCommandLineFlags(&argc, &argv, true);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_iter_test.cc b/src/rocksdb/db/db_iter_test.cc
new file mode 100644
index 000000000..65290bfad
--- /dev/null
+++ b/src/rocksdb/db/db_iter_test.cc
@@ -0,0 +1,3195 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/db_iter.h"
+
+#include <algorithm>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "db/dbformat.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/options.h"
+#include "rocksdb/perf_context.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/statistics.h"
+#include "table/iterator_wrapper.h"
+#include "table/merging_iterator.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "util/string_util.h"
+#include "utilities/merge_operators.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+static uint64_t TestGetTickerCount(const Options& options,
+ Tickers ticker_type) {
+ return options.statistics->getTickerCount(ticker_type);
+}
+
+class TestIterator : public InternalIterator {
+ public:
+ explicit TestIterator(const Comparator* comparator)
+ : initialized_(false),
+ valid_(false),
+ sequence_number_(0),
+ iter_(0),
+ cmp(comparator) {
+ data_.reserve(16);
+ }
+
+ void AddPut(std::string argkey, std::string argvalue) {
+ Add(argkey, kTypeValue, argvalue);
+ }
+
+ void AddDeletion(std::string argkey) {
+ Add(argkey, kTypeDeletion, std::string());
+ }
+
+ void AddSingleDeletion(std::string argkey) {
+ Add(argkey, kTypeSingleDeletion, std::string());
+ }
+
+ void AddMerge(std::string argkey, std::string argvalue) {
+ Add(argkey, kTypeMerge, argvalue);
+ }
+
+ void Add(std::string argkey, ValueType type, std::string argvalue) {
+ Add(argkey, type, argvalue, sequence_number_++);
+ }
+
+ void Add(std::string argkey, ValueType type, std::string argvalue,
+ size_t seq_num, bool update_iter = false) {
+ valid_ = true;
+ ParsedInternalKey internal_key(argkey, seq_num, type);
+ data_.push_back(
+ std::pair<std::string, std::string>(std::string(), argvalue));
+ AppendInternalKey(&data_.back().first, internal_key);
+ if (update_iter && valid_ && cmp.Compare(data_.back().first, key()) < 0) {
+ // A key smaller than the current key was inserted.
+ Finish();
+ // data_[iter_] is no longer the current element of the iterator.
+ // Increment iter_ to reposition it correctly.
+ iter_++;
+ }
+ }
+
+ // should be called before operations with iterator
+ void Finish() {
+ initialized_ = true;
+ std::sort(data_.begin(), data_.end(),
+ [this](std::pair<std::string, std::string> a,
+ std::pair<std::string, std::string> b) {
+ return (cmp.Compare(a.first, b.first) < 0);
+ });
+ }
+
+ // Removes the key from the set of keys over which this iterator iterates.
+ // Not to be confused with AddDeletion().
+ // If the iterator is currently positioned on this key, the deletion will
+ // apply next time the iterator moves.
+ // Used for simulating ForwardIterator updating to a new version that doesn't
+ // have some of the keys (e.g. after compaction with a filter).
+ void Vanish(std::string _key) {
+ if (valid_ && data_[iter_].first == _key) {
+ delete_current_ = true;
+ return;
+ }
+ for (auto it = data_.begin(); it != data_.end(); ++it) {
+ ParsedInternalKey ikey;
+ Status pik_status =
+ ParseInternalKey(it->first, &ikey, true /* log_err_key */);
+ pik_status.PermitUncheckedError();
+ assert(pik_status.ok());
+ if (!pik_status.ok() || ikey.user_key != _key) {
+ continue;
+ }
+ if (valid_ && data_.begin() + iter_ > it) {
+ --iter_;
+ }
+ data_.erase(it);
+ return;
+ }
+ assert(false);
+ }
+
+ // Number of operations done on this iterator since construction.
+ size_t steps() const { return steps_; }
+
+ bool Valid() const override {
+ assert(initialized_);
+ return valid_;
+ }
+
+ void SeekToFirst() override {
+ assert(initialized_);
+ ++steps_;
+ DeleteCurrentIfNeeded();
+ valid_ = (data_.size() > 0);
+ iter_ = 0;
+ }
+
+ void SeekToLast() override {
+ assert(initialized_);
+ ++steps_;
+ DeleteCurrentIfNeeded();
+ valid_ = (data_.size() > 0);
+ iter_ = data_.size() - 1;
+ }
+
+ void Seek(const Slice& target) override {
+ assert(initialized_);
+ SeekToFirst();
+ ++steps_;
+ if (!valid_) {
+ return;
+ }
+ while (iter_ < data_.size() &&
+ (cmp.Compare(data_[iter_].first, target) < 0)) {
+ ++iter_;
+ }
+
+ if (iter_ == data_.size()) {
+ valid_ = false;
+ }
+ }
+
+ void SeekForPrev(const Slice& target) override {
+ assert(initialized_);
+ DeleteCurrentIfNeeded();
+ SeekForPrevImpl(target, &cmp);
+ }
+
+ void Next() override {
+ assert(initialized_);
+ assert(valid_);
+ assert(iter_ < data_.size());
+
+ ++steps_;
+ if (delete_current_) {
+ DeleteCurrentIfNeeded();
+ } else {
+ ++iter_;
+ }
+ valid_ = iter_ < data_.size();
+ }
+
+ void Prev() override {
+ assert(initialized_);
+ assert(valid_);
+ assert(iter_ < data_.size());
+
+ ++steps_;
+ DeleteCurrentIfNeeded();
+ if (iter_ == 0) {
+ valid_ = false;
+ } else {
+ --iter_;
+ }
+ }
+
+ Slice key() const override {
+ assert(initialized_);
+ return data_[iter_].first;
+ }
+
+ Slice value() const override {
+ assert(initialized_);
+ return data_[iter_].second;
+ }
+
+ Status status() const override {
+ assert(initialized_);
+ return Status::OK();
+ }
+
+ bool IsKeyPinned() const override { return true; }
+ bool IsValuePinned() const override { return true; }
+
+ private:
+ bool initialized_;
+ bool valid_;
+ size_t sequence_number_;
+ size_t iter_;
+ size_t steps_ = 0;
+
+ InternalKeyComparator cmp;
+ std::vector<std::pair<std::string, std::string>> data_;
+ bool delete_current_ = false;
+
+ void DeleteCurrentIfNeeded() {
+ if (!delete_current_) {
+ return;
+ }
+ data_.erase(data_.begin() + iter_);
+ delete_current_ = false;
+ }
+};
+
+class DBIteratorTest : public testing::Test {
+ public:
+ Env* env_;
+
+ DBIteratorTest() : env_(Env::Default()) {}
+};
+
+TEST_F(DBIteratorTest, DBIteratorPrevNext) {
+ Options options;
+ ImmutableOptions ioptions = ImmutableOptions(options);
+ MutableCFOptions mutable_cf_options = MutableCFOptions(options);
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddDeletion("a");
+ internal_iter->AddDeletion("a");
+ internal_iter->AddDeletion("a");
+ internal_iter->AddDeletion("a");
+ internal_iter->AddPut("a", "val_a");
+
+ internal_iter->AddPut("b", "val_b");
+ internal_iter->Finish();
+
+ ReadOptions ro;
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+ internal_iter, nullptr /* version */, 10 /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "val_b");
+
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "val_a");
+
+ db_iter->Next();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "val_b");
+
+ db_iter->Next();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+ // Test to check the SeekToLast() with iterate_upper_bound not set
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddPut("b", "val_b");
+ internal_iter->AddPut("b", "val_b");
+ internal_iter->AddPut("c", "val_c");
+ internal_iter->Finish();
+
+ ReadOptions ro;
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+ internal_iter, nullptr /* version */, 10 /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+ }
+
+ // Test to check the SeekToLast() with iterate_upper_bound set
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddPut("b", "val_b");
+ internal_iter->AddPut("c", "val_c");
+ internal_iter->AddPut("d", "val_d");
+ internal_iter->AddPut("e", "val_e");
+ internal_iter->AddPut("f", "val_f");
+ internal_iter->Finish();
+
+ Slice prefix("d");
+
+ ReadOptions ro;
+ ro.iterate_upper_bound = &prefix;
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+ internal_iter, nullptr /* version */, 10 /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+
+ db_iter->Next();
+ ASSERT_TRUE(!db_iter->Valid());
+
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+ }
+ // Test to check the SeekToLast() with iterate_upper_bound set to a key
+ // that has not been Put yet
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddPut("b", "val_b");
+ internal_iter->AddPut("c", "val_c");
+ internal_iter->AddPut("d", "val_d");
+ internal_iter->Finish();
+
+ Slice prefix("z");
+
+ ReadOptions ro;
+ ro.iterate_upper_bound = &prefix;
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+ internal_iter, nullptr /* version */, 10 /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "d");
+
+ db_iter->Next();
+ ASSERT_TRUE(!db_iter->Valid());
+
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "d");
+
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+ }
+ // Test to check the SeekToLast() with iterate_upper_bound set to the
+ // first key
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddPut("b", "val_b");
+ internal_iter->AddPut("b", "val_b");
+ internal_iter->Finish();
+
+ Slice prefix("a");
+
+ ReadOptions ro;
+ ro.iterate_upper_bound = &prefix;
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+ internal_iter, nullptr /* version */, 10 /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+
+ db_iter->SeekToLast();
+ ASSERT_TRUE(!db_iter->Valid());
+ ASSERT_OK(db_iter->status());
+ }
+ // Test case to check SeekToLast with iterate_upper_bound set
+ // (same key put many times - SeekToLast should start with the
+ // maximum sequence id of the upper bound)
+
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddPut("b", "val_b");
+ internal_iter->AddPut("c", "val_c");
+ internal_iter->AddPut("c", "val_c");
+ internal_iter->AddPut("c", "val_c");
+ internal_iter->AddPut("c", "val_c");
+ internal_iter->AddPut("c", "val_c");
+ internal_iter->AddPut("c", "val_c");
+ internal_iter->AddPut("c", "val_c");
+ internal_iter->Finish();
+
+ Slice prefix("c");
+
+ ReadOptions ro;
+ ro.iterate_upper_bound = &prefix;
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+ internal_iter, nullptr /* version */, 7 /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+
+ SetPerfLevel(kEnableCount);
+ ASSERT_TRUE(GetPerfLevel() == kEnableCount);
+
+ get_perf_context()->Reset();
+ db_iter->SeekToLast();
+
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(static_cast<int>(get_perf_context()->internal_key_skipped_count),
+ 1);
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+
+ SetPerfLevel(kDisable);
+ }
+ // Test to check the SeekToLast() with the iterate_upper_bound set
+ // (Checking the value of the key which has sequence ids greater than
+ // and less than the iterator's sequence id)
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+
+ internal_iter->AddPut("a", "val_a1");
+ internal_iter->AddPut("a", "val_a2");
+ internal_iter->AddPut("b", "val_b1");
+ internal_iter->AddPut("c", "val_c1");
+ internal_iter->AddPut("c", "val_c2");
+ internal_iter->AddPut("c", "val_c3");
+ internal_iter->AddPut("b", "val_b2");
+ internal_iter->AddPut("d", "val_d1");
+ internal_iter->Finish();
+
+ Slice prefix("c");
+
+ ReadOptions ro;
+ ro.iterate_upper_bound = &prefix;
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+ internal_iter, nullptr /* version */, 4 /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "val_b1");
+ }
+
+ // Test to check the SeekToLast() with the iterate_upper_bound set to a
+ // key that has been deleted
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddDeletion("a");
+ internal_iter->AddPut("b", "val_b");
+ internal_iter->AddPut("c", "val_c");
+ internal_iter->Finish();
+
+ Slice prefix("a");
+
+ ReadOptions ro;
+ ro.iterate_upper_bound = &prefix;
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+ internal_iter, nullptr /* version */, 10 /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+
+ db_iter->SeekToLast();
+ ASSERT_TRUE(!db_iter->Valid());
+ ASSERT_OK(db_iter->status());
+ }
+ // Test to check the SeekToLast() with the iterate_upper_bound set
+ // (Deletion cases)
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddPut("b", "val_b");
+ internal_iter->AddDeletion("b");
+ internal_iter->AddPut("c", "val_c");
+ internal_iter->Finish();
+
+ Slice prefix("c");
+
+ ReadOptions ro;
+ ro.iterate_upper_bound = &prefix;
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+ internal_iter, nullptr /* version */, 10 /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+
+ db_iter->Next();
+ ASSERT_TRUE(!db_iter->Valid());
+
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ }
+ // Test to check the SeekToLast() with iterate_upper_bound set
+ // (Deletion cases - many internal keys at and after the upper_bound
+ // are deleted)
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddPut("b", "val_b");
+ internal_iter->AddDeletion("c");
+ internal_iter->AddDeletion("d");
+ internal_iter->AddDeletion("e");
+ internal_iter->AddDeletion("f");
+ internal_iter->AddDeletion("g");
+ internal_iter->AddDeletion("h");
+ internal_iter->Finish();
+
+ Slice prefix("c");
+
+ ReadOptions ro;
+ ro.iterate_upper_bound = &prefix;
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+ internal_iter, nullptr /* version */, 7 /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+
+ SetPerfLevel(kEnableCount);
+ ASSERT_TRUE(GetPerfLevel() == kEnableCount);
+
+ get_perf_context()->Reset();
+ db_iter->SeekToLast();
+
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(
+ static_cast<int>(get_perf_context()->internal_delete_skipped_count), 0);
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+
+ SetPerfLevel(kDisable);
+ }
+
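+ // Forward and backward stepping (SeekToFirst/Next/Prev) when the newest
+ // visible entry for "a" is a put that supersedes several older deletions.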
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddDeletion("a");
+ internal_iter->AddDeletion("a");
+ internal_iter->AddDeletion("a");
+ internal_iter->AddDeletion("a");
+ internal_iter->AddPut("a", "val_a");
+
+ internal_iter->AddPut("b", "val_b");
+ internal_iter->Finish();
+
+ ReadOptions ro;
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+ internal_iter, nullptr /* version */, 10 /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+
+ db_iter->SeekToFirst();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "val_a");
+
+ db_iter->Next();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "val_b");
+
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "val_a");
+
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ ASSERT_OK(db_iter->status());
+ }
+
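+ // With snapshot sequence 2 only the oldest versions of "a" and "b" are
+ // visible; SeekToLast() should land on "b" even after the iterator has
+ // walked off the end.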
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddPut("b", "val_b");
+
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddPut("b", "val_b");
+
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddPut("b", "val_b");
+
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddPut("b", "val_b");
+
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddPut("b", "val_b");
+ internal_iter->Finish();
+
+ ReadOptions ro;
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+ internal_iter, nullptr /* version */, 2 /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "val_b");
+
+ db_iter->Next();
+ ASSERT_TRUE(!db_iter->Valid());
+
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "val_b");
+ }
+
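+ // SeekToLast()/Prev()/Next() near the end of the data when an earlier key
+ // ("a") has many overwritten versions.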
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddPut("a", "val_a");
+
+ internal_iter->AddPut("b", "val_b");
+
+ internal_iter->AddPut("c", "val_c");
+ internal_iter->Finish();
+
+ ReadOptions ro;
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+ internal_iter, nullptr /* version */, 10 /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+ ASSERT_EQ(db_iter->value().ToString(), "val_c");
+
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "val_b");
+
+ db_iter->Next();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+ ASSERT_EQ(db_iter->value().ToString(), "val_c");
+ }
+}
+
+TEST_F(DBIteratorTest, DBIteratorEmpty) {
+ Options options;
+ ImmutableOptions ioptions = ImmutableOptions(options);
+ MutableCFOptions mutable_cf_options = MutableCFOptions(options);
+ ReadOptions ro;
+
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+ internal_iter, nullptr /* version */, 0 /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(!db_iter->Valid());
+ ASSERT_OK(db_iter->status());
+ }
+
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+ internal_iter, nullptr /* version */, 0 /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+ db_iter->SeekToFirst();
+ ASSERT_TRUE(!db_iter->Valid());
+ ASSERT_OK(db_iter->status());
+ }
+}
+
+TEST_F(DBIteratorTest, DBIteratorUseSkipCountSkips) {
+ ReadOptions ro;
+ Options options;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
+
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ for (size_t i = 0; i < 200; ++i) {
+ internal_iter->AddPut("a", "a");
+ internal_iter->AddPut("b", "b");
+ internal_iter->AddPut("c", "c");
+ }
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ImmutableOptions(options), MutableCFOptions(options),
+ BytewiseComparator(), internal_iter, nullptr /* version */,
+ 2 /* sequence */, options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+ ASSERT_EQ(db_iter->value().ToString(), "c");
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 1u);
+
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "b");
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 2u);
+
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "a");
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 3u);
+
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ ASSERT_OK(db_iter->status());
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 3u);
+}
+
+TEST_F(DBIteratorTest, DBIteratorUseSkip) {
+ ReadOptions ro;
+ Options options;
+ options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
+ ImmutableOptions ioptions = ImmutableOptions(options);
+ MutableCFOptions mutable_cf_options = MutableCFOptions(options);
+
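+ // For each snapshot sequence number, SeekToLast() should land on the
+ // newest visible put of "c", and Prev() should then reach the merge
+ // entries for "b" and "a".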
+ {
+ for (size_t i = 0; i < 200; ++i) {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("b", "merge_1");
+ internal_iter->AddMerge("a", "merge_2");
+ for (size_t k = 0; k < 200; ++k) {
+ internal_iter->AddPut("c", std::to_string(k));
+ }
+ internal_iter->Finish();
+
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+ internal_iter, nullptr /* version */, i + 2 /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+ ASSERT_EQ(db_iter->value().ToString(), std::to_string(i));
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_1");
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_2");
+ db_iter->Prev();
+
+ ASSERT_TRUE(!db_iter->Valid());
+ ASSERT_OK(db_iter->status());
+ }
+ }
+
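+ // While the final put of "c" is not yet visible, the deletions hide "c"
+ // entirely, so reverse iteration only sees the merges for "b" and "a".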
+ {
+ for (size_t i = 0; i < 200; ++i) {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("b", "merge_1");
+ internal_iter->AddMerge("a", "merge_2");
+ for (size_t k = 0; k < 200; ++k) {
+ internal_iter->AddDeletion("c");
+ }
+ internal_iter->AddPut("c", "200");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+ internal_iter, nullptr /* version */, i + 2 /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_1");
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_2");
+ db_iter->Prev();
+
+ ASSERT_TRUE(!db_iter->Valid());
+ ASSERT_OK(db_iter->status());
+ }
+
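+ // With a sequence number high enough to see the final put of "c", the
+ // deletions underneath it are skipped and all three keys are visited.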
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("b", "merge_1");
+ internal_iter->AddMerge("a", "merge_2");
+ for (size_t i = 0; i < 200; ++i) {
+ internal_iter->AddDeletion("c");
+ }
+ internal_iter->AddPut("c", "200");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+ internal_iter, nullptr /* version */, 202 /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+ ASSERT_EQ(db_iter->value().ToString(), "200");
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_1");
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_2");
+ db_iter->Prev();
+
+ ASSERT_TRUE(!db_iter->Valid());
+ ASSERT_OK(db_iter->status());
+ }
+ }
+
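+ // A key whose visible entries are all deletions yields an empty iterator;
+ // once the final put of "c" becomes visible, "c" is the only key in
+ // either direction.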
+ {
+ for (size_t i = 0; i < 200; ++i) {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ for (size_t k = 0; k < 200; ++k) {
+ internal_iter->AddDeletion("c");
+ }
+ internal_iter->AddPut("c", "200");
+ internal_iter->Finish();
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+ internal_iter, nullptr /* version */, i /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(!db_iter->Valid());
+ ASSERT_OK(db_iter->status());
+
+ db_iter->SeekToFirst();
+ ASSERT_TRUE(!db_iter->Valid());
+ ASSERT_OK(db_iter->status());
+ }
+
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ for (size_t i = 0; i < 200; ++i) {
+ internal_iter->AddDeletion("c");
+ }
+ internal_iter->AddPut("c", "200");
+ internal_iter->Finish();
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+ internal_iter, nullptr /* version */, 200 /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+ ASSERT_EQ(db_iter->value().ToString(), "200");
+
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ ASSERT_OK(db_iter->status());
+
+ db_iter->SeekToFirst();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+ ASSERT_EQ(db_iter->value().ToString(), "200");
+
+ db_iter->Next();
+ ASSERT_TRUE(!db_iter->Valid());
+ ASSERT_OK(db_iter->status());
+ }
+
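+ // A key whose every version is newer than the snapshot ("c" here) is
+ // invisible, so Prev() from "d" should skip straight to "b".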
+ {
+ for (size_t i = 0; i < 200; ++i) {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("b", "merge_1");
+ internal_iter->AddMerge("a", "merge_2");
+ for (size_t k = 0; k < 200; ++k) {
+ internal_iter->AddPut("d", std::to_string(k));
+ }
+
+ for (size_t k = 0; k < 200; ++k) {
+ internal_iter->AddPut("c", std::to_string(k));
+ }
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+ internal_iter, nullptr /* version */, i + 2 /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "d");
+ ASSERT_EQ(db_iter->value().ToString(), std::to_string(i));
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_1");
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_2");
+ db_iter->Prev();
+
+ ASSERT_TRUE(!db_iter->Valid());
+ ASSERT_OK(db_iter->status());
+ }
+ }
+
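+ // Merge operands are combined up to the snapshot: the value of "c" is the
+ // string-append concatenation of all visible operands.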
+ {
+ for (size_t i = 0; i < 200; ++i) {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("b", "b");
+ internal_iter->AddMerge("a", "a");
+ for (size_t k = 0; k < 200; ++k) {
+ internal_iter->AddMerge("c", std::to_string(k));
+ }
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+ internal_iter, nullptr /* version */, i + 2 /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+ std::string merge_result = "0";
+ for (size_t j = 1; j <= i; ++j) {
+ merge_result += "," + std::to_string(j);
+ }
+ ASSERT_EQ(db_iter->value().ToString(), merge_result);
+
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "b");
+
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "a");
+
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ ASSERT_OK(db_iter->status());
+ }
+ }
+}
+
+TEST_F(DBIteratorTest, DBIteratorSkipInternalKeys) {
+ Options options;
+ ImmutableOptions ioptions = ImmutableOptions(options);
+ MutableCFOptions mutable_cf_options = MutableCFOptions(options);
+ ReadOptions ro;
+
+ // Basic test case ... Make sure explicitly passing the default value works.
+ // The limit on skippable internal keys is disabled by default, i.e. when
+ // the value is 0.
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddDeletion("b");
+ internal_iter->AddDeletion("b");
+ internal_iter->AddPut("c", "val_c");
+ internal_iter->AddPut("c", "val_c");
+ internal_iter->AddDeletion("c");
+ internal_iter->AddPut("d", "val_d");
+ internal_iter->Finish();
+
+ ro.max_skippable_internal_keys = 0;
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+ internal_iter, nullptr /* version */, 10 /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+
+ db_iter->SeekToFirst();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "val_a");
+
+ db_iter->Next();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "d");
+ ASSERT_EQ(db_iter->value().ToString(), "val_d");
+
+ db_iter->Next();
+ ASSERT_TRUE(!db_iter->Valid());
+ ASSERT_TRUE(db_iter->status().ok());
+
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "d");
+ ASSERT_EQ(db_iter->value().ToString(), "val_d");
+
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "val_a");
+
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ ASSERT_OK(db_iter->status());
+ }
+
+ // Test to make sure that the request will *not* fail as incomplete if
+ // num_internal_keys_skipped is *equal* to max_skippable_internal_keys
+ // threshold. (It will fail as incomplete only when the threshold is
+ // exceeded.)
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddDeletion("b");
+ internal_iter->AddDeletion("b");
+ internal_iter->AddPut("c", "val_c");
+ internal_iter->Finish();
+
+ ro.max_skippable_internal_keys = 2;
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+ internal_iter, nullptr /* version */, 10 /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+
+ db_iter->SeekToFirst();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "val_a");
+
+ db_iter->Next();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+ ASSERT_EQ(db_iter->value().ToString(), "val_c");
+
+ db_iter->Next();
+ ASSERT_TRUE(!db_iter->Valid());
+ ASSERT_TRUE(db_iter->status().ok());
+
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+ ASSERT_EQ(db_iter->value().ToString(), "val_c");
+
+ db_iter->Prev();
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "val_a");
+
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ ASSERT_TRUE(db_iter->status().ok());
+ }
+
+ // Fail the request as incomplete when num_internal_keys_skipped >
+ // max_skippable_internal_keys
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddDeletion("b");
+ internal_iter->AddDeletion("b");
+ internal_iter->AddDeletion("b");
+ internal_iter->AddPut("c", "val_c");
+ internal_iter->Finish();
+
+ ro.max_skippable_internal_keys = 2;
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+ internal_iter, nullptr /* version */, 10 /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+
+ db_iter->SeekToFirst();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "val_a");
+
+ db_iter->Next();
+ ASSERT_TRUE(!db_iter->Valid());
+ ASSERT_TRUE(db_iter->status().IsIncomplete());
+
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+ ASSERT_EQ(db_iter->value().ToString(), "val_c");
+
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ ASSERT_TRUE(db_iter->status().IsIncomplete());
+ }
+
+ // Test that the num_internal_keys_skipped counter resets after a successful
+ // read.
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddDeletion("b");
+ internal_iter->AddDeletion("b");
+ internal_iter->AddPut("c", "val_c");
+ internal_iter->AddDeletion("d");
+ internal_iter->AddDeletion("d");
+ internal_iter->AddDeletion("d");
+ internal_iter->AddPut("e", "val_e");
+ internal_iter->Finish();
+
+ ro.max_skippable_internal_keys = 2;
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+ internal_iter, nullptr /* version */, 10 /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+
+ db_iter->SeekToFirst();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "val_a");
+
+ db_iter->Next();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+ ASSERT_EQ(db_iter->value().ToString(), "val_c");
+
+ db_iter->Next(); // num_internal_keys_skipped counter resets here.
+ ASSERT_TRUE(!db_iter->Valid());
+ ASSERT_TRUE(db_iter->status().IsIncomplete());
+ }
+
+ // Test that the num_internal_keys_skipped counter resets after a successful
+ // read.
+ // Reverse direction
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddDeletion("b");
+ internal_iter->AddDeletion("b");
+ internal_iter->AddDeletion("b");
+ internal_iter->AddPut("c", "val_c");
+ internal_iter->AddDeletion("d");
+ internal_iter->AddDeletion("d");
+ internal_iter->AddPut("e", "val_e");
+ internal_iter->Finish();
+
+ ro.max_skippable_internal_keys = 2;
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+ internal_iter, nullptr /* version */, 10 /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "e");
+ ASSERT_EQ(db_iter->value().ToString(), "val_e");
+
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+ ASSERT_EQ(db_iter->value().ToString(), "val_c");
+
+ db_iter->Prev(); // num_internal_keys_skipped counter resets here.
+ ASSERT_TRUE(!db_iter->Valid());
+ ASSERT_TRUE(db_iter->status().IsIncomplete());
+ }
+
+ // Test that skipping deletions of several distinct keys is handled
+ // correctly
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddDeletion("b");
+ internal_iter->AddDeletion("c");
+ internal_iter->AddDeletion("d");
+ internal_iter->AddPut("e", "val_e");
+ internal_iter->Finish();
+
+ ro.max_skippable_internal_keys = 2;
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+ internal_iter, nullptr /* version */, 10 /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+
+ db_iter->SeekToFirst();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "val_a");
+
+ db_iter->Next();
+ ASSERT_TRUE(!db_iter->Valid());
+ ASSERT_TRUE(db_iter->status().IsIncomplete());
+
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "e");
+ ASSERT_EQ(db_iter->value().ToString(), "val_e");
+
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ ASSERT_TRUE(db_iter->status().IsIncomplete());
+ }
+
+ // Test if alternating puts and deletes of the same key are handled correctly.
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddPut("b", "val_b");
+ internal_iter->AddDeletion("b");
+ internal_iter->AddPut("c", "val_c");
+ internal_iter->AddDeletion("c");
+ internal_iter->AddPut("d", "val_d");
+ internal_iter->AddDeletion("d");
+ internal_iter->AddPut("e", "val_e");
+ internal_iter->Finish();
+
+ ro.max_skippable_internal_keys = 2;
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+ internal_iter, nullptr /* version */, 10 /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+
+ db_iter->SeekToFirst();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "val_a");
+
+ db_iter->Next();
+ ASSERT_TRUE(!db_iter->Valid());
+ ASSERT_TRUE(db_iter->status().IsIncomplete());
+
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "e");
+ ASSERT_EQ(db_iter->value().ToString(), "val_e");
+
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ ASSERT_TRUE(db_iter->status().IsIncomplete());
+ }
+
+ // Test for large number of skippable internal keys with *default*
+ // max_sequential_skip_in_iterations.
+ {
+ for (size_t i = 1; i <= 200; ++i) {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("a", "val_a");
+ for (size_t j = 1; j <= i; ++j) {
+ internal_iter->AddPut("b", "val_b");
+ internal_iter->AddDeletion("b");
+ }
+ internal_iter->AddPut("c", "val_c");
+ internal_iter->Finish();
+
+ ro.max_skippable_internal_keys = i;
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+ internal_iter, nullptr /* version */, 2 * i + 1 /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+
+ db_iter->SeekToFirst();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "val_a");
+
+ db_iter->Next();
+ if ((options.max_sequential_skip_in_iterations + 1) >=
+ ro.max_skippable_internal_keys) {
+ ASSERT_TRUE(!db_iter->Valid());
+ ASSERT_TRUE(db_iter->status().IsIncomplete());
+ } else {
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+ ASSERT_EQ(db_iter->value().ToString(), "val_c");
+ }
+
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+ ASSERT_EQ(db_iter->value().ToString(), "val_c");
+
+ db_iter->Prev();
+ if ((options.max_sequential_skip_in_iterations + 1) >=
+ ro.max_skippable_internal_keys) {
+ ASSERT_TRUE(!db_iter->Valid());
+ ASSERT_TRUE(db_iter->status().IsIncomplete());
+ } else {
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "val_a");
+ }
+ }
+ }
+
+ // Test for large number of skippable internal keys with a *non-default*
+ // max_sequential_skip_in_iterations.
+ {
+ for (size_t i = 1; i <= 200; ++i) {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("a", "val_a");
+ for (size_t j = 1; j <= i; ++j) {
+ internal_iter->AddPut("b", "val_b");
+ internal_iter->AddDeletion("b");
+ }
+ internal_iter->AddPut("c", "val_c");
+ internal_iter->Finish();
+
+ options.max_sequential_skip_in_iterations = 1000;
+ ro.max_skippable_internal_keys = i;
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+ internal_iter, nullptr /* version */, 2 * i + 1 /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+
+ db_iter->SeekToFirst();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "val_a");
+
+ db_iter->Next();
+ ASSERT_TRUE(!db_iter->Valid());
+ ASSERT_TRUE(db_iter->status().IsIncomplete());
+
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+ ASSERT_EQ(db_iter->value().ToString(), "val_c");
+
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ ASSERT_TRUE(db_iter->status().IsIncomplete());
+ }
+ }
+}
+
+TEST_F(DBIteratorTest, DBIterator1) {
+ ReadOptions ro;
+ Options options;
+ options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
+
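+ // DBIterator1 through DBIterator4 build identical internal data and differ
+ // only in the snapshot sequence number passed to NewDBIterator.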
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("a", "0");
+ internal_iter->AddPut("b", "0");
+ internal_iter->AddDeletion("b");
+ internal_iter->AddMerge("a", "1");
+ internal_iter->AddMerge("b", "2");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ImmutableOptions(options), MutableCFOptions(options),
+ BytewiseComparator(), internal_iter, nullptr /* version */,
+ 1 /* sequence */, options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+ db_iter->SeekToFirst();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "0");
+ db_iter->Next();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ db_iter->Next();
+ ASSERT_FALSE(db_iter->Valid());
+}
+
+TEST_F(DBIteratorTest, DBIterator2) {
+ ReadOptions ro;
+ Options options;
+ options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
+
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("a", "0");
+ internal_iter->AddPut("b", "0");
+ internal_iter->AddDeletion("b");
+ internal_iter->AddMerge("a", "1");
+ internal_iter->AddMerge("b", "2");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ImmutableOptions(options), MutableCFOptions(options),
+ BytewiseComparator(), internal_iter, nullptr /* version */,
+ 0 /* sequence */, options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+ db_iter->SeekToFirst();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "0");
+ db_iter->Next();
+ ASSERT_TRUE(!db_iter->Valid());
+}
+
+TEST_F(DBIteratorTest, DBIterator3) {
+ ReadOptions ro;
+ Options options;
+ options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
+
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("a", "0");
+ internal_iter->AddPut("b", "0");
+ internal_iter->AddDeletion("b");
+ internal_iter->AddMerge("a", "1");
+ internal_iter->AddMerge("b", "2");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ImmutableOptions(options), MutableCFOptions(options),
+ BytewiseComparator(), internal_iter, nullptr /* version */,
+ 2 /* sequence */, options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+ db_iter->SeekToFirst();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "0");
+ db_iter->Next();
+ ASSERT_TRUE(!db_iter->Valid());
+}
+
+TEST_F(DBIteratorTest, DBIterator4) {
+ ReadOptions ro;
+ Options options;
+ options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
+
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("a", "0");
+ internal_iter->AddPut("b", "0");
+ internal_iter->AddDeletion("b");
+ internal_iter->AddMerge("a", "1");
+ internal_iter->AddMerge("b", "2");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ImmutableOptions(options), MutableCFOptions(options),
+ BytewiseComparator(), internal_iter, nullptr /* version */,
+ 4 /* sequence */, options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+ db_iter->SeekToFirst();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "0,1");
+ db_iter->Next();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "2");
+ db_iter->Next();
+ ASSERT_TRUE(!db_iter->Valid());
+}
+
+TEST_F(DBIteratorTest, DBIterator5) {
+ ReadOptions ro;
+ Options options;
+ options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
+ ImmutableOptions ioptions = ImmutableOptions(options);
+ MutableCFOptions mutable_cf_options = MutableCFOptions(options);
+
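+ // The first seven blocks use the same merge operands around a single put
+ // of "a" and increase the snapshot sequence number from 0 to 6, checking
+ // how the merged value returned by SeekToLast() evolves.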
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("a", "merge_1");
+ internal_iter->AddMerge("a", "merge_2");
+ internal_iter->AddMerge("a", "merge_3");
+ internal_iter->AddPut("a", "put_1");
+ internal_iter->AddMerge("a", "merge_4");
+ internal_iter->AddMerge("a", "merge_5");
+ internal_iter->AddMerge("a", "merge_6");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+ internal_iter, nullptr /* version */, 0 /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_1");
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("a", "merge_1");
+ internal_iter->AddMerge("a", "merge_2");
+ internal_iter->AddMerge("a", "merge_3");
+ internal_iter->AddPut("a", "put_1");
+ internal_iter->AddMerge("a", "merge_4");
+ internal_iter->AddMerge("a", "merge_5");
+ internal_iter->AddMerge("a", "merge_6");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+ internal_iter, nullptr /* version */, 1 /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_1,merge_2");
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("a", "merge_1");
+ internal_iter->AddMerge("a", "merge_2");
+ internal_iter->AddMerge("a", "merge_3");
+ internal_iter->AddPut("a", "put_1");
+ internal_iter->AddMerge("a", "merge_4");
+ internal_iter->AddMerge("a", "merge_5");
+ internal_iter->AddMerge("a", "merge_6");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+ internal_iter, nullptr /* version */, 2 /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_1,merge_2,merge_3");
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("a", "merge_1");
+ internal_iter->AddMerge("a", "merge_2");
+ internal_iter->AddMerge("a", "merge_3");
+ internal_iter->AddPut("a", "put_1");
+ internal_iter->AddMerge("a", "merge_4");
+ internal_iter->AddMerge("a", "merge_5");
+ internal_iter->AddMerge("a", "merge_6");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+ internal_iter, nullptr /* version */, 3 /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "put_1");
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("a", "merge_1");
+ internal_iter->AddMerge("a", "merge_2");
+ internal_iter->AddMerge("a", "merge_3");
+ internal_iter->AddPut("a", "put_1");
+ internal_iter->AddMerge("a", "merge_4");
+ internal_iter->AddMerge("a", "merge_5");
+ internal_iter->AddMerge("a", "merge_6");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+ internal_iter, nullptr /* version */, 4 /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "put_1,merge_4");
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("a", "merge_1");
+ internal_iter->AddMerge("a", "merge_2");
+ internal_iter->AddMerge("a", "merge_3");
+ internal_iter->AddPut("a", "put_1");
+ internal_iter->AddMerge("a", "merge_4");
+ internal_iter->AddMerge("a", "merge_5");
+ internal_iter->AddMerge("a", "merge_6");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+ internal_iter, nullptr /* version */, 5 /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "put_1,merge_4,merge_5");
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("a", "merge_1");
+ internal_iter->AddMerge("a", "merge_2");
+ internal_iter->AddMerge("a", "merge_3");
+ internal_iter->AddPut("a", "put_1");
+ internal_iter->AddMerge("a", "merge_4");
+ internal_iter->AddMerge("a", "merge_5");
+ internal_iter->AddMerge("a", "merge_6");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+ internal_iter, nullptr /* version */, 6 /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "put_1,merge_4,merge_5,merge_6");
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+
+ {
+ // put, singledelete, merge
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddSingleDeletion("a");
+ internal_iter->AddMerge("a", "merge_1");
+ internal_iter->AddMerge("a", "merge_2");
+ internal_iter->AddPut("b", "val_b");
+ internal_iter->Finish();
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+ internal_iter, nullptr /* version */, 10 /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+ db_iter->Seek("b");
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ }
+}
+
+TEST_F(DBIteratorTest, DBIterator6) {
+ ReadOptions ro;
+ Options options;
+ options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
+ ImmutableOptions ioptions = ImmutableOptions(options);
+ MutableCFOptions mutable_cf_options = MutableCFOptions(options);
+
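+ // Same structure as DBIterator5, but with a deletion of "a" in the middle
+ // of the merge operands; the snapshot sequence number again increases
+ // from 0 to 6.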
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("a", "merge_1");
+ internal_iter->AddMerge("a", "merge_2");
+ internal_iter->AddMerge("a", "merge_3");
+ internal_iter->AddDeletion("a");
+ internal_iter->AddMerge("a", "merge_4");
+ internal_iter->AddMerge("a", "merge_5");
+ internal_iter->AddMerge("a", "merge_6");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+ internal_iter, nullptr /* version */, 0 /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_1");
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("a", "merge_1");
+ internal_iter->AddMerge("a", "merge_2");
+ internal_iter->AddMerge("a", "merge_3");
+ internal_iter->AddDeletion("a");
+ internal_iter->AddMerge("a", "merge_4");
+ internal_iter->AddMerge("a", "merge_5");
+ internal_iter->AddMerge("a", "merge_6");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+ internal_iter, nullptr /* version */, 1 /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_1,merge_2");
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("a", "merge_1");
+ internal_iter->AddMerge("a", "merge_2");
+ internal_iter->AddMerge("a", "merge_3");
+ internal_iter->AddDeletion("a");
+ internal_iter->AddMerge("a", "merge_4");
+ internal_iter->AddMerge("a", "merge_5");
+ internal_iter->AddMerge("a", "merge_6");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+ internal_iter, nullptr /* version */, 2 /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_1,merge_2,merge_3");
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("a", "merge_1");
+ internal_iter->AddMerge("a", "merge_2");
+ internal_iter->AddMerge("a", "merge_3");
+ internal_iter->AddDeletion("a");
+ internal_iter->AddMerge("a", "merge_4");
+ internal_iter->AddMerge("a", "merge_5");
+ internal_iter->AddMerge("a", "merge_6");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+ internal_iter, nullptr /* version */, 3 /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("a", "merge_1");
+ internal_iter->AddMerge("a", "merge_2");
+ internal_iter->AddMerge("a", "merge_3");
+ internal_iter->AddDeletion("a");
+ internal_iter->AddMerge("a", "merge_4");
+ internal_iter->AddMerge("a", "merge_5");
+ internal_iter->AddMerge("a", "merge_6");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+ internal_iter, nullptr /* version */, 4 /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_4");
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("a", "merge_1");
+ internal_iter->AddMerge("a", "merge_2");
+ internal_iter->AddMerge("a", "merge_3");
+ internal_iter->AddDeletion("a");
+ internal_iter->AddMerge("a", "merge_4");
+ internal_iter->AddMerge("a", "merge_5");
+ internal_iter->AddMerge("a", "merge_6");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+ internal_iter, nullptr /* version */, 5 /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_4,merge_5");
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("a", "merge_1");
+ internal_iter->AddMerge("a", "merge_2");
+ internal_iter->AddMerge("a", "merge_3");
+ internal_iter->AddDeletion("a");
+ internal_iter->AddMerge("a", "merge_4");
+ internal_iter->AddMerge("a", "merge_5");
+ internal_iter->AddMerge("a", "merge_6");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+ internal_iter, nullptr /* version */, 6 /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_4,merge_5,merge_6");
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+}
+
+TEST_F(DBIteratorTest, DBIterator7) {
+ ReadOptions ro;
+ Options options;
+ options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
+ ImmutableOptions ioptions = ImmutableOptions(options);
+ MutableCFOptions mutable_cf_options = MutableCFOptions(options);
+
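+ // Each block builds the same mix of puts, merges and deletions across
+ // "a", "b" and "c" and checks reverse iteration at a different snapshot
+ // sequence number.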
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("a", "merge_1");
+ internal_iter->AddPut("b", "val");
+ internal_iter->AddMerge("b", "merge_2");
+
+ internal_iter->AddDeletion("b");
+ internal_iter->AddMerge("b", "merge_3");
+
+ internal_iter->AddMerge("c", "merge_4");
+ internal_iter->AddMerge("c", "merge_5");
+
+ internal_iter->AddDeletion("b");
+ internal_iter->AddMerge("b", "merge_6");
+ internal_iter->AddMerge("b", "merge_7");
+ internal_iter->AddMerge("b", "merge_8");
+ internal_iter->AddMerge("b", "merge_9");
+ internal_iter->AddMerge("b", "merge_10");
+ internal_iter->AddMerge("b", "merge_11");
+
+ internal_iter->AddDeletion("c");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+ internal_iter, nullptr /* version */, 0 /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_1");
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("a", "merge_1");
+ internal_iter->AddPut("b", "val");
+ internal_iter->AddMerge("b", "merge_2");
+
+ internal_iter->AddDeletion("b");
+ internal_iter->AddMerge("b", "merge_3");
+
+ internal_iter->AddMerge("c", "merge_4");
+ internal_iter->AddMerge("c", "merge_5");
+
+ internal_iter->AddDeletion("b");
+ internal_iter->AddMerge("b", "merge_6");
+ internal_iter->AddMerge("b", "merge_7");
+ internal_iter->AddMerge("b", "merge_8");
+ internal_iter->AddMerge("b", "merge_9");
+ internal_iter->AddMerge("b", "merge_10");
+ internal_iter->AddMerge("b", "merge_11");
+
+ internal_iter->AddDeletion("c");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+ internal_iter, nullptr /* version */, 2 /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "val,merge_2");
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_1");
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("a", "merge_1");
+ internal_iter->AddPut("b", "val");
+ internal_iter->AddMerge("b", "merge_2");
+
+ internal_iter->AddDeletion("b");
+ internal_iter->AddMerge("b", "merge_3");
+
+ internal_iter->AddMerge("c", "merge_4");
+ internal_iter->AddMerge("c", "merge_5");
+
+ internal_iter->AddDeletion("b");
+ internal_iter->AddMerge("b", "merge_6");
+ internal_iter->AddMerge("b", "merge_7");
+ internal_iter->AddMerge("b", "merge_8");
+ internal_iter->AddMerge("b", "merge_9");
+ internal_iter->AddMerge("b", "merge_10");
+ internal_iter->AddMerge("b", "merge_11");
+
+ internal_iter->AddDeletion("c");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+ internal_iter, nullptr /* version */, 4 /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_3");
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_1");
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("a", "merge_1");
+ internal_iter->AddPut("b", "val");
+ internal_iter->AddMerge("b", "merge_2");
+
+ internal_iter->AddDeletion("b");
+ internal_iter->AddMerge("b", "merge_3");
+
+ internal_iter->AddMerge("c", "merge_4");
+ internal_iter->AddMerge("c", "merge_5");
+
+ internal_iter->AddDeletion("b");
+ internal_iter->AddMerge("b", "merge_6");
+ internal_iter->AddMerge("b", "merge_7");
+ internal_iter->AddMerge("b", "merge_8");
+ internal_iter->AddMerge("b", "merge_9");
+ internal_iter->AddMerge("b", "merge_10");
+ internal_iter->AddMerge("b", "merge_11");
+
+ internal_iter->AddDeletion("c");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+ internal_iter, nullptr /* version */, 5 /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_4");
+ db_iter->Prev();
+
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_3");
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_1");
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("a", "merge_1");
+ internal_iter->AddPut("b", "val");
+ internal_iter->AddMerge("b", "merge_2");
+
+ internal_iter->AddDeletion("b");
+ internal_iter->AddMerge("b", "merge_3");
+
+ internal_iter->AddMerge("c", "merge_4");
+ internal_iter->AddMerge("c", "merge_5");
+
+ internal_iter->AddDeletion("b");
+ internal_iter->AddMerge("b", "merge_6");
+ internal_iter->AddMerge("b", "merge_7");
+ internal_iter->AddMerge("b", "merge_8");
+ internal_iter->AddMerge("b", "merge_9");
+ internal_iter->AddMerge("b", "merge_10");
+ internal_iter->AddMerge("b", "merge_11");
+
+ internal_iter->AddDeletion("c");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+ internal_iter, nullptr /* version */, 6 /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_4,merge_5");
+ db_iter->Prev();
+    ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_3");
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_1");
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("a", "merge_1");
+ internal_iter->AddPut("b", "val");
+ internal_iter->AddMerge("b", "merge_2");
+
+ internal_iter->AddDeletion("b");
+ internal_iter->AddMerge("b", "merge_3");
+
+ internal_iter->AddMerge("c", "merge_4");
+ internal_iter->AddMerge("c", "merge_5");
+
+ internal_iter->AddDeletion("b");
+ internal_iter->AddMerge("b", "merge_6");
+ internal_iter->AddMerge("b", "merge_7");
+ internal_iter->AddMerge("b", "merge_8");
+ internal_iter->AddMerge("b", "merge_9");
+ internal_iter->AddMerge("b", "merge_10");
+ internal_iter->AddMerge("b", "merge_11");
+
+ internal_iter->AddDeletion("c");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+ internal_iter, nullptr /* version */, 7 /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_4,merge_5");
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_1");
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("a", "merge_1");
+ internal_iter->AddPut("b", "val");
+ internal_iter->AddMerge("b", "merge_2");
+
+ internal_iter->AddDeletion("b");
+ internal_iter->AddMerge("b", "merge_3");
+
+ internal_iter->AddMerge("c", "merge_4");
+ internal_iter->AddMerge("c", "merge_5");
+
+ internal_iter->AddDeletion("b");
+ internal_iter->AddMerge("b", "merge_6");
+ internal_iter->AddMerge("b", "merge_7");
+ internal_iter->AddMerge("b", "merge_8");
+ internal_iter->AddMerge("b", "merge_9");
+ internal_iter->AddMerge("b", "merge_10");
+ internal_iter->AddMerge("b", "merge_11");
+
+ internal_iter->AddDeletion("c");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+ internal_iter, nullptr /* version */, 9 /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_4,merge_5");
+ db_iter->Prev();
+    ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_6,merge_7");
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_1");
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("a", "merge_1");
+ internal_iter->AddPut("b", "val");
+ internal_iter->AddMerge("b", "merge_2");
+
+ internal_iter->AddDeletion("b");
+ internal_iter->AddMerge("b", "merge_3");
+
+ internal_iter->AddMerge("c", "merge_4");
+ internal_iter->AddMerge("c", "merge_5");
+
+ internal_iter->AddDeletion("b");
+ internal_iter->AddMerge("b", "merge_6");
+ internal_iter->AddMerge("b", "merge_7");
+ internal_iter->AddMerge("b", "merge_8");
+ internal_iter->AddMerge("b", "merge_9");
+ internal_iter->AddMerge("b", "merge_10");
+ internal_iter->AddMerge("b", "merge_11");
+
+ internal_iter->AddDeletion("c");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+ internal_iter, nullptr /* version */, 13 /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_4,merge_5");
+ db_iter->Prev();
+    ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(),
+ "merge_6,merge_7,merge_8,merge_9,merge_10,merge_11");
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_1");
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("a", "merge_1");
+ internal_iter->AddPut("b", "val");
+ internal_iter->AddMerge("b", "merge_2");
+
+ internal_iter->AddDeletion("b");
+ internal_iter->AddMerge("b", "merge_3");
+
+ internal_iter->AddMerge("c", "merge_4");
+ internal_iter->AddMerge("c", "merge_5");
+
+ internal_iter->AddDeletion("b");
+ internal_iter->AddMerge("b", "merge_6");
+ internal_iter->AddMerge("b", "merge_7");
+ internal_iter->AddMerge("b", "merge_8");
+ internal_iter->AddMerge("b", "merge_9");
+ internal_iter->AddMerge("b", "merge_10");
+ internal_iter->AddMerge("b", "merge_11");
+
+ internal_iter->AddDeletion("c");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+ internal_iter, nullptr /* version */, 14 /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(),
+ "merge_6,merge_7,merge_8,merge_9,merge_10,merge_11");
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_1");
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+}
+
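+// DBIterator8: a deletion of "a" is followed by newer puts of "a" and "b", so
+// reverse iteration at sequence 10 should see both keys with their reinserted
+// values.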
+TEST_F(DBIteratorTest, DBIterator8) {
+ ReadOptions ro;
+ Options options;
+ options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
+
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddDeletion("a");
+ internal_iter->AddPut("a", "0");
+ internal_iter->AddPut("b", "0");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ImmutableOptions(options), MutableCFOptions(options),
+ BytewiseComparator(), internal_iter, nullptr /* version */,
+ 10 /* sequence */, options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "0");
+
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "0");
+}
+
+// TODO(3.13): fix the issue that Seek() followed by Prev() might not
+// necessarily return the largest element smaller than the seek key.
+TEST_F(DBIteratorTest, DBIterator9) {
+ ReadOptions ro;
+ Options options;
+ options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("a", "merge_1");
+ internal_iter->AddMerge("a", "merge_2");
+ internal_iter->AddMerge("b", "merge_3");
+ internal_iter->AddMerge("b", "merge_4");
+ internal_iter->AddMerge("d", "merge_5");
+ internal_iter->AddMerge("d", "merge_6");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ImmutableOptions(options), MutableCFOptions(options),
+ BytewiseComparator(), internal_iter, nullptr /* version */,
+ 10 /* sequence */, options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_3,merge_4");
+ db_iter->Next();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "d");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_5,merge_6");
+
+ db_iter->Seek("b");
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_3,merge_4");
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_1,merge_2");
+
+ db_iter->SeekForPrev("b");
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_3,merge_4");
+ db_iter->Next();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "d");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_5,merge_6");
+
+ db_iter->Seek("c");
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "d");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_5,merge_6");
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_3,merge_4");
+
+ db_iter->SeekForPrev("c");
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_3,merge_4");
+ db_iter->Next();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "d");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_5,merge_6");
+ }
+}
+
+// TODO(3.13): fix the issue that Seek() followed by Prev() might not
+// necessarily return the largest element smaller than the seek key.
+TEST_F(DBIteratorTest, DBIterator10) {
+ ReadOptions ro;
+ Options options;
+
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("a", "1");
+ internal_iter->AddPut("b", "2");
+ internal_iter->AddPut("c", "3");
+ internal_iter->AddPut("d", "4");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ImmutableOptions(options), MutableCFOptions(options),
+ BytewiseComparator(), internal_iter, nullptr /* version */,
+ 10 /* sequence */, options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+
+ db_iter->Seek("c");
+ ASSERT_TRUE(db_iter->Valid());
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "2");
+
+ db_iter->Next();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+ ASSERT_EQ(db_iter->value().ToString(), "3");
+
+ db_iter->SeekForPrev("c");
+ ASSERT_TRUE(db_iter->Valid());
+ db_iter->Next();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "d");
+ ASSERT_EQ(db_iter->value().ToString(), "4");
+
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+ ASSERT_EQ(db_iter->value().ToString(), "3");
+}
+
+TEST_F(DBIteratorTest, SeekToLastOccurrenceSeq0) {
+ ReadOptions ro;
+ Options options;
+ options.merge_operator = nullptr;
+
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("a", "1");
+ internal_iter->AddPut("b", "2");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ImmutableOptions(options), MutableCFOptions(options),
+ BytewiseComparator(), internal_iter, nullptr /* version */,
+ 10 /* sequence */, 0 /* force seek */, nullptr /* read_callback */));
+ db_iter->SeekToFirst();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "1");
+ db_iter->Next();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "2");
+ db_iter->Next();
+ ASSERT_FALSE(db_iter->Valid());
+}
+
+TEST_F(DBIteratorTest, DBIterator11) {
+ ReadOptions ro;
+ Options options;
+ options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
+
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("a", "0");
+ internal_iter->AddPut("b", "0");
+ internal_iter->AddSingleDeletion("b");
+ internal_iter->AddMerge("a", "1");
+ internal_iter->AddMerge("b", "2");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ImmutableOptions(options), MutableCFOptions(options),
+ BytewiseComparator(), internal_iter, nullptr /* version */,
+ 1 /* sequence */, options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+ db_iter->SeekToFirst();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "0");
+ db_iter->Next();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ db_iter->Next();
+ ASSERT_FALSE(db_iter->Valid());
+}
+
+TEST_F(DBIteratorTest, DBIterator12) {
+ ReadOptions ro;
+ Options options;
+ options.merge_operator = nullptr;
+
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("a", "1");
+ internal_iter->AddPut("b", "2");
+ internal_iter->AddPut("c", "3");
+ internal_iter->AddSingleDeletion("b");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ImmutableOptions(options), MutableCFOptions(options),
+ BytewiseComparator(), internal_iter, nullptr /* version */,
+ 10 /* sequence */, 0 /* force seek */, nullptr /* read_callback */));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+ ASSERT_EQ(db_iter->value().ToString(), "3");
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "1");
+ db_iter->Prev();
+ ASSERT_FALSE(db_iter->Valid());
+}
+
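+// DBIterator13: nine versions of a single 9-byte key with
+// max_sequential_skip_in_iterations == 3, so Seek("b") has to reseek
+// internally and must still return the version visible at sequence 2.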
+TEST_F(DBIteratorTest, DBIterator13) {
+ ReadOptions ro;
+ Options options;
+ options.merge_operator = nullptr;
+
+ std::string key;
+ key.resize(9);
+ key.assign(9, static_cast<char>(0));
+ key[0] = 'b';
+
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut(key, "0");
+ internal_iter->AddPut(key, "1");
+ internal_iter->AddPut(key, "2");
+ internal_iter->AddPut(key, "3");
+ internal_iter->AddPut(key, "4");
+ internal_iter->AddPut(key, "5");
+ internal_iter->AddPut(key, "6");
+ internal_iter->AddPut(key, "7");
+ internal_iter->AddPut(key, "8");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ImmutableOptions(options), MutableCFOptions(options),
+ BytewiseComparator(), internal_iter, nullptr /* version */,
+ 2 /* sequence */, 3 /* max_sequential_skip_in_iterations */,
+ nullptr /* read_callback */));
+ db_iter->Seek("b");
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), key);
+ ASSERT_EQ(db_iter->value().ToString(), "2");
+}
+
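+// DBIterator14: several versions each of "a", "b" and "c" with
+// max_sequential_skip_in_iterations == 1, so Seek("b") and SeekToFirst() both
+// force internal reseeks and must still land on the versions visible at
+// sequence 4.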
+TEST_F(DBIteratorTest, DBIterator14) {
+ ReadOptions ro;
+ Options options;
+ options.merge_operator = nullptr;
+
+ std::string key("b");
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("b", "0");
+ internal_iter->AddPut("b", "1");
+ internal_iter->AddPut("b", "2");
+ internal_iter->AddPut("b", "3");
+ internal_iter->AddPut("a", "4");
+ internal_iter->AddPut("a", "5");
+ internal_iter->AddPut("a", "6");
+ internal_iter->AddPut("c", "7");
+ internal_iter->AddPut("c", "8");
+ internal_iter->AddPut("c", "9");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ImmutableOptions(options), MutableCFOptions(options),
+ BytewiseComparator(), internal_iter, nullptr /* version */,
+ 4 /* sequence */, 1 /* max_sequential_skip_in_iterations */,
+ nullptr /* read_callback */));
+ db_iter->Seek("b");
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "3");
+ db_iter->SeekToFirst();
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "4");
+}
+
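+// Fixture that drives DBIter through a MergingIterator built from two
+// TestIterators: internal_iter1_ holds "a", "f", "g" and internal_iter2_
+// holds "a", "b", "c", "d". Reads see data with sequence numbers below 8, and
+// the DataRace tests below use sync points to append keys to internal_iter2_
+// while DBIter is changing direction.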
+class DBIterWithMergeIterTest : public testing::Test {
+ public:
+ DBIterWithMergeIterTest()
+ : env_(Env::Default()), icomp_(BytewiseComparator()) {
+ options_.merge_operator = nullptr;
+
+ internal_iter1_ = new TestIterator(BytewiseComparator());
+ internal_iter1_->Add("a", kTypeValue, "1", 3u);
+ internal_iter1_->Add("f", kTypeValue, "2", 5u);
+ internal_iter1_->Add("g", kTypeValue, "3", 7u);
+ internal_iter1_->Finish();
+
+ internal_iter2_ = new TestIterator(BytewiseComparator());
+ internal_iter2_->Add("a", kTypeValue, "4", 6u);
+ internal_iter2_->Add("b", kTypeValue, "5", 1u);
+ internal_iter2_->Add("c", kTypeValue, "6", 2u);
+ internal_iter2_->Add("d", kTypeValue, "7", 3u);
+ internal_iter2_->Finish();
+
+ std::vector<InternalIterator*> child_iters;
+ child_iters.push_back(internal_iter1_);
+ child_iters.push_back(internal_iter2_);
+    InternalIterator* merge_iter =
+        NewMergingIterator(&icomp_, &child_iters[0], 2u);
+
+ db_iter_.reset(NewDBIterator(
+ env_, ro_, ImmutableOptions(options_), MutableCFOptions(options_),
+ BytewiseComparator(), merge_iter, nullptr /* version */,
+ 8 /* read data earlier than seqId 8 */,
+        3 /* max sequential skips before reseek */,
+        nullptr /* read_callback */));
+ }
+
+ Env* env_;
+ ReadOptions ro_;
+ Options options_;
+ TestIterator* internal_iter1_;
+ TestIterator* internal_iter2_;
+ InternalKeyComparator icomp_;
+ Iterator* merge_iter_;
+ std::unique_ptr<Iterator> db_iter_;
+};
+
+TEST_F(DBIterWithMergeIterTest, InnerMergeIterator1) {
+ db_iter_->SeekToFirst();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "a");
+ ASSERT_EQ(db_iter_->value().ToString(), "4");
+ db_iter_->Next();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "b");
+ ASSERT_EQ(db_iter_->value().ToString(), "5");
+ db_iter_->Next();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "c");
+ ASSERT_EQ(db_iter_->value().ToString(), "6");
+ db_iter_->Next();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "d");
+ ASSERT_EQ(db_iter_->value().ToString(), "7");
+ db_iter_->Next();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "f");
+ ASSERT_EQ(db_iter_->value().ToString(), "2");
+ db_iter_->Next();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "g");
+ ASSERT_EQ(db_iter_->value().ToString(), "3");
+ db_iter_->Next();
+ ASSERT_FALSE(db_iter_->Valid());
+}
+
+TEST_F(DBIterWithMergeIterTest, InnerMergeIterator2) {
+ // Test Prev() when one child iterator is at its end.
+ db_iter_->SeekForPrev("g");
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "g");
+ ASSERT_EQ(db_iter_->value().ToString(), "3");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "f");
+ ASSERT_EQ(db_iter_->value().ToString(), "2");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "d");
+ ASSERT_EQ(db_iter_->value().ToString(), "7");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "c");
+ ASSERT_EQ(db_iter_->value().ToString(), "6");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "b");
+ ASSERT_EQ(db_iter_->value().ToString(), "5");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "a");
+ ASSERT_EQ(db_iter_->value().ToString(), "4");
+}
+
+TEST_F(DBIterWithMergeIterTest, InnerMergeIteratorDataRace1) {
+ // Test Prev() when one child iterator is at its end but more rows
+ // are added.
+ db_iter_->Seek("f");
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "f");
+ ASSERT_EQ(db_iter_->value().ToString(), "2");
+
+  // The test callback inserts a key at the end of the mem table after
+  // MergeIterator::Prev() has realized the mem table iterator is at its end
+  // and before SeekToLast() is called.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "MergeIterator::Prev:BeforePrev",
+ [&](void* /*arg*/) { internal_iter2_->Add("z", kTypeValue, "7", 12u); });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "d");
+ ASSERT_EQ(db_iter_->value().ToString(), "7");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "c");
+ ASSERT_EQ(db_iter_->value().ToString(), "6");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "b");
+ ASSERT_EQ(db_iter_->value().ToString(), "5");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "a");
+ ASSERT_EQ(db_iter_->value().ToString(), "4");
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBIterWithMergeIterTest, InnerMergeIteratorDataRace2) {
+ // Test Prev() when one child iterator is at its end but more rows
+ // are added.
+ db_iter_->Seek("f");
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "f");
+ ASSERT_EQ(db_iter_->value().ToString(), "2");
+
+  // The test callback inserts entries that update a key at the end of the
+  // mem table after MergeIterator::Prev() has realized the mem table iterator
+  // is at its end and before SeekToLast() is called.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "MergeIterator::Prev:BeforePrev", [&](void* /*arg*/) {
+ internal_iter2_->Add("z", kTypeValue, "7", 12u);
+ internal_iter2_->Add("z", kTypeValue, "7", 11u);
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "d");
+ ASSERT_EQ(db_iter_->value().ToString(), "7");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "c");
+ ASSERT_EQ(db_iter_->value().ToString(), "6");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "b");
+ ASSERT_EQ(db_iter_->value().ToString(), "5");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "a");
+ ASSERT_EQ(db_iter_->value().ToString(), "4");
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBIterWithMergeIterTest, InnerMergeIteratorDataRace3) {
+ // Test Prev() when one child iterator is at its end but more rows
+ // are added and max_skipped is triggered.
+ db_iter_->Seek("f");
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "f");
+ ASSERT_EQ(db_iter_->value().ToString(), "2");
+
+  // The test callback inserts entries that update a key at the end of the
+  // mem table after MergeIterator::Prev() has realized the mem table iterator
+  // is at its end and before SeekToLast() is called.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "MergeIterator::Prev:BeforePrev", [&](void* /*arg*/) {
+ internal_iter2_->Add("z", kTypeValue, "7", 16u, true);
+ internal_iter2_->Add("z", kTypeValue, "7", 15u, true);
+ internal_iter2_->Add("z", kTypeValue, "7", 14u, true);
+ internal_iter2_->Add("z", kTypeValue, "7", 13u, true);
+ internal_iter2_->Add("z", kTypeValue, "7", 12u, true);
+ internal_iter2_->Add("z", kTypeValue, "7", 11u, true);
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "d");
+ ASSERT_EQ(db_iter_->value().ToString(), "7");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "c");
+ ASSERT_EQ(db_iter_->value().ToString(), "6");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "b");
+ ASSERT_EQ(db_iter_->value().ToString(), "5");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "a");
+ ASSERT_EQ(db_iter_->value().ToString(), "4");
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBIterWithMergeIterTest, InnerMergeIteratorDataRace4) {
+ // Test Prev() when one child iterator has more rows inserted
+ // between Seek() and Prev() when changing directions.
+ internal_iter2_->Add("z", kTypeValue, "9", 4u);
+
+ db_iter_->Seek("g");
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "g");
+ ASSERT_EQ(db_iter_->value().ToString(), "3");
+
+  // The test callback inserts entries that update a key before "z" in the
+  // mem table after MergeIterator::Prev() calls the mem table iterator's
+  // Seek() and before calling Prev().
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "MergeIterator::Prev:BeforePrev", [&](void* arg) {
+ IteratorWrapper* it = reinterpret_cast<IteratorWrapper*>(arg);
+ if (it->key().starts_with("z")) {
+ internal_iter2_->Add("x", kTypeValue, "7", 16u, true);
+ internal_iter2_->Add("x", kTypeValue, "7", 15u, true);
+ internal_iter2_->Add("x", kTypeValue, "7", 14u, true);
+ internal_iter2_->Add("x", kTypeValue, "7", 13u, true);
+ internal_iter2_->Add("x", kTypeValue, "7", 12u, true);
+ internal_iter2_->Add("x", kTypeValue, "7", 11u, true);
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "f");
+ ASSERT_EQ(db_iter_->value().ToString(), "2");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "d");
+ ASSERT_EQ(db_iter_->value().ToString(), "7");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "c");
+ ASSERT_EQ(db_iter_->value().ToString(), "6");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "b");
+ ASSERT_EQ(db_iter_->value().ToString(), "5");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "a");
+ ASSERT_EQ(db_iter_->value().ToString(), "4");
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBIterWithMergeIterTest, InnerMergeIteratorDataRace5) {
+ internal_iter2_->Add("z", kTypeValue, "9", 4u);
+
+ // Test Prev() when one child iterator has more rows inserted
+ // between Seek() and Prev() when changing directions.
+ db_iter_->Seek("g");
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "g");
+ ASSERT_EQ(db_iter_->value().ToString(), "3");
+
+  // The test callback inserts entries that update a key before "z" in the
+  // mem table after MergeIterator::Prev() calls the mem table iterator's
+  // Seek() and before calling Prev().
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "MergeIterator::Prev:BeforePrev", [&](void* arg) {
+ IteratorWrapper* it = reinterpret_cast<IteratorWrapper*>(arg);
+ if (it->key().starts_with("z")) {
+ internal_iter2_->Add("x", kTypeValue, "7", 16u, true);
+ internal_iter2_->Add("x", kTypeValue, "7", 15u, true);
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "f");
+ ASSERT_EQ(db_iter_->value().ToString(), "2");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "d");
+ ASSERT_EQ(db_iter_->value().ToString(), "7");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "c");
+ ASSERT_EQ(db_iter_->value().ToString(), "6");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "b");
+ ASSERT_EQ(db_iter_->value().ToString(), "5");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "a");
+ ASSERT_EQ(db_iter_->value().ToString(), "4");
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBIterWithMergeIterTest, InnerMergeIteratorDataRace6) {
+ internal_iter2_->Add("z", kTypeValue, "9", 4u);
+
+ // Test Prev() when one child iterator has more rows inserted
+ // between Seek() and Prev() when changing directions.
+ db_iter_->Seek("g");
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "g");
+ ASSERT_EQ(db_iter_->value().ToString(), "3");
+
+  // The test callback inserts an entry that updates a key before "z" in the
+  // mem table after MergeIterator::Prev() calls the mem table iterator's
+  // Seek() and before calling Prev().
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "MergeIterator::Prev:BeforePrev", [&](void* arg) {
+ IteratorWrapper* it = reinterpret_cast<IteratorWrapper*>(arg);
+ if (it->key().starts_with("z")) {
+ internal_iter2_->Add("x", kTypeValue, "7", 16u, true);
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "f");
+ ASSERT_EQ(db_iter_->value().ToString(), "2");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "d");
+ ASSERT_EQ(db_iter_->value().ToString(), "7");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "c");
+ ASSERT_EQ(db_iter_->value().ToString(), "6");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "b");
+ ASSERT_EQ(db_iter_->value().ToString(), "5");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "a");
+ ASSERT_EQ(db_iter_->value().ToString(), "4");
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBIterWithMergeIterTest, InnerMergeIteratorDataRace7) {
+ internal_iter1_->Add("u", kTypeValue, "10", 4u);
+ internal_iter1_->Add("v", kTypeValue, "11", 4u);
+ internal_iter1_->Add("w", kTypeValue, "12", 4u);
+ internal_iter2_->Add("z", kTypeValue, "9", 4u);
+
+ // Test Prev() when one child iterator has more rows inserted
+ // between Seek() and Prev() when changing directions.
+ db_iter_->Seek("g");
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "g");
+ ASSERT_EQ(db_iter_->value().ToString(), "3");
+
+  // The test callback inserts entries that update a key before "z" in the
+  // mem table after MergeIterator::Prev() calls the mem table iterator's
+  // Seek() and before calling Prev().
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "MergeIterator::Prev:BeforePrev", [&](void* arg) {
+ IteratorWrapper* it = reinterpret_cast<IteratorWrapper*>(arg);
+ if (it->key().starts_with("z")) {
+ internal_iter2_->Add("x", kTypeValue, "7", 16u, true);
+ internal_iter2_->Add("x", kTypeValue, "7", 15u, true);
+ internal_iter2_->Add("x", kTypeValue, "7", 14u, true);
+ internal_iter2_->Add("x", kTypeValue, "7", 13u, true);
+ internal_iter2_->Add("x", kTypeValue, "7", 12u, true);
+ internal_iter2_->Add("x", kTypeValue, "7", 11u, true);
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "f");
+ ASSERT_EQ(db_iter_->value().ToString(), "2");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "d");
+ ASSERT_EQ(db_iter_->value().ToString(), "7");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "c");
+ ASSERT_EQ(db_iter_->value().ToString(), "6");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "b");
+ ASSERT_EQ(db_iter_->value().ToString(), "5");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "a");
+ ASSERT_EQ(db_iter_->value().ToString(), "4");
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBIterWithMergeIterTest, InnerMergeIteratorDataRace8) {
+ // internal_iter1_: a, f, g
+ // internal_iter2_: a, b, c, d, adding (z)
+ internal_iter2_->Add("z", kTypeValue, "9", 4u);
+
+ // Test Prev() when one child iterator has more rows inserted
+ // between Seek() and Prev() when changing directions.
+ db_iter_->Seek("g");
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "g");
+ ASSERT_EQ(db_iter_->value().ToString(), "3");
+
+  // The test callback inserts two keys before "z" in the mem table after
+  // MergeIterator::Prev() calls the mem table iterator's Seek() and
+  // before calling Prev().
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "MergeIterator::Prev:BeforePrev", [&](void* arg) {
+ IteratorWrapper* it = reinterpret_cast<IteratorWrapper*>(arg);
+ if (it->key().starts_with("z")) {
+ internal_iter2_->Add("x", kTypeValue, "7", 16u, true);
+ internal_iter2_->Add("y", kTypeValue, "7", 17u, true);
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "f");
+ ASSERT_EQ(db_iter_->value().ToString(), "2");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "d");
+ ASSERT_EQ(db_iter_->value().ToString(), "7");
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
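+// With the noop prefix extractor every key is its own prefix, so with
+// prefix_same_as_start a Seek() or SeekForPrev() that lands outside the
+// target prefix should stop immediately rather than stepping over the run of
+// tombstones; the perf context must report zero skipped internal keys.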
+TEST_F(DBIteratorTest, SeekPrefixTombstones) {
+ ReadOptions ro;
+ Options options;
+ options.prefix_extractor.reset(NewNoopTransform());
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddDeletion("b");
+ internal_iter->AddDeletion("c");
+ internal_iter->AddDeletion("d");
+ internal_iter->AddDeletion("e");
+ internal_iter->AddDeletion("f");
+ internal_iter->AddDeletion("g");
+ internal_iter->Finish();
+
+ ro.prefix_same_as_start = true;
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ImmutableOptions(options), MutableCFOptions(options),
+ BytewiseComparator(), internal_iter, nullptr /* version */,
+ 10 /* sequence */, options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+
+ int skipped_keys = 0;
+
+ get_perf_context()->Reset();
+ db_iter->SeekForPrev("z");
+ skipped_keys =
+ static_cast<int>(get_perf_context()->internal_key_skipped_count);
+ ASSERT_EQ(skipped_keys, 0);
+
+ get_perf_context()->Reset();
+ db_iter->Seek("a");
+ skipped_keys =
+ static_cast<int>(get_perf_context()->internal_key_skipped_count);
+ ASSERT_EQ(skipped_keys, 0);
+}
+
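+// SeekToFirst() with iterate_lower_bound set: the iterator should start at
+// the first key when the bound precedes all keys, at the bound itself when it
+// falls inside the key range, and become invalid when the bound is past the
+// last key.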
+TEST_F(DBIteratorTest, SeekToFirstLowerBound) {
+ const int kNumKeys = 3;
+ for (int i = 0; i < kNumKeys + 2; ++i) {
+ // + 2 for two special cases: lower bound before and lower bound after the
+ // internal iterator's keys
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ for (int j = 1; j <= kNumKeys; ++j) {
+ internal_iter->AddPut(std::to_string(j), "val");
+ }
+ internal_iter->Finish();
+
+ ReadOptions ro;
+ auto lower_bound_str = std::to_string(i);
+ Slice lower_bound(lower_bound_str);
+ ro.iterate_lower_bound = &lower_bound;
+ Options options;
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ImmutableOptions(options), MutableCFOptions(options),
+ BytewiseComparator(), internal_iter, nullptr /* version */,
+ 10 /* sequence */, options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+
+ db_iter->SeekToFirst();
+ if (i == kNumKeys + 1) {
+ // lower bound was beyond the last key
+ ASSERT_FALSE(db_iter->Valid());
+ ASSERT_OK(db_iter->status());
+ } else {
+ ASSERT_TRUE(db_iter->Valid());
+ int expected;
+ if (i == 0) {
+ // lower bound was before the first key
+ expected = 1;
+ } else {
+ // lower bound was at the ith key
+ expected = i;
+ }
+ ASSERT_EQ(std::to_string(expected), db_iter->key().ToString());
+ }
+ }
+}
+
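+// Reverse iteration should respect iterate_lower_bound: Prev() walks down to
+// the bound and then invalidates the iterator instead of returning keys
+// below it.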
+TEST_F(DBIteratorTest, PrevLowerBound) {
+ const int kNumKeys = 3;
+ const int kLowerBound = 2;
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ for (int j = 1; j <= kNumKeys; ++j) {
+ internal_iter->AddPut(std::to_string(j), "val");
+ }
+ internal_iter->Finish();
+
+ ReadOptions ro;
+ auto lower_bound_str = std::to_string(kLowerBound);
+ Slice lower_bound(lower_bound_str);
+ ro.iterate_lower_bound = &lower_bound;
+ Options options;
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ImmutableOptions(options), MutableCFOptions(options),
+ BytewiseComparator(), internal_iter, nullptr /* version */,
+ 10 /* sequence */, options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+
+ db_iter->SeekToLast();
+ for (int i = kNumKeys; i >= kLowerBound; --i) {
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(std::to_string(i), db_iter->key().ToString());
+ db_iter->Prev();
+ }
+ ASSERT_FALSE(db_iter->Valid());
+}
+
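+// Seeking to a target below iterate_lower_bound should be clamped to the
+// bound, so the iterator lands on the lower bound key itself.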
+TEST_F(DBIteratorTest, SeekLessLowerBound) {
+ const int kNumKeys = 3;
+ const int kLowerBound = 2;
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ for (int j = 1; j <= kNumKeys; ++j) {
+ internal_iter->AddPut(std::to_string(j), "val");
+ }
+ internal_iter->Finish();
+
+ ReadOptions ro;
+ auto lower_bound_str = std::to_string(kLowerBound);
+ Slice lower_bound(lower_bound_str);
+ ro.iterate_lower_bound = &lower_bound;
+ Options options;
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ImmutableOptions(options), MutableCFOptions(options),
+ BytewiseComparator(), internal_iter, nullptr /* version */,
+ 10 /* sequence */, options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+
+ auto before_lower_bound_str = std::to_string(kLowerBound - 1);
+  Slice before_lower_bound(before_lower_bound_str);
+
+ db_iter->Seek(before_lower_bound);
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(lower_bound_str, db_iter->key().ToString());
+}
+
+TEST_F(DBIteratorTest, ReverseToForwardWithDisappearingKeys) {
+ Options options;
+ options.prefix_extractor.reset(NewCappedPrefixTransform(0));
+
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("a", "A");
+ internal_iter->AddPut("b", "B");
+ for (int i = 0; i < 100; ++i) {
+ internal_iter->AddPut("c" + std::to_string(i), "");
+ }
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ReadOptions(), ImmutableOptions(options), MutableCFOptions(options),
+ BytewiseComparator(), internal_iter, nullptr /* version */,
+ 10 /* sequence */, options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+
+ db_iter->SeekForPrev("a");
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_OK(db_iter->status());
+ ASSERT_EQ("a", db_iter->key().ToString());
+
+ internal_iter->Vanish("a");
+ db_iter->Next();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_OK(db_iter->status());
+ ASSERT_EQ("b", db_iter->key().ToString());
+
+ // A (sort of) bug used to cause DBIter to pointlessly drag the internal
+ // iterator all the way to the end. But this doesn't really matter at the time
+ // of writing because the only iterator that can see disappearing keys is
+ // ForwardIterator, which doesn't support SeekForPrev().
+ EXPECT_LT(internal_iter->steps(), 20);
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_iterator_test.cc b/src/rocksdb/db/db_iterator_test.cc
new file mode 100644
index 000000000..aaf1408b4
--- /dev/null
+++ b/src/rocksdb/db/db_iterator_test.cc
@@ -0,0 +1,3265 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <functional>
+
+#include "db/arena_wrapped_db_iter.h"
+#include "db/db_iter.h"
+#include "db/db_test_util.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "rocksdb/iostats_context.h"
+#include "rocksdb/perf_context.h"
+#include "table/block_based/flush_block_policy.h"
+#include "util/random.h"
+#include "utilities/merge_operators/string_append/stringappend2.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// A dummy ReadCallback that says every key is committed.
+class DummyReadCallback : public ReadCallback {
+ public:
+ DummyReadCallback() : ReadCallback(kMaxSequenceNumber) {}
+ bool IsVisibleFullCheck(SequenceNumber /*seq*/) override { return true; }
+ void SetSnapshot(SequenceNumber seq) { max_visible_seq_ = seq; }
+};
+
+// Test param:
+// bool: whether to pass read_callback to NewIterator().
+class DBIteratorTest : public DBTestBase,
+ public testing::WithParamInterface<bool> {
+ public:
+ DBIteratorTest() : DBTestBase("db_iterator_test", /*env_do_fsync=*/true) {}
+
+ Iterator* NewIterator(const ReadOptions& read_options,
+ ColumnFamilyHandle* column_family = nullptr) {
+ if (column_family == nullptr) {
+ column_family = db_->DefaultColumnFamily();
+ }
+ auto* cfd =
+ static_cast_with_check<ColumnFamilyHandleImpl>(column_family)->cfd();
+ SequenceNumber seq = read_options.snapshot != nullptr
+ ? read_options.snapshot->GetSequenceNumber()
+ : db_->GetLatestSequenceNumber();
+ bool use_read_callback = GetParam();
+ DummyReadCallback* read_callback = nullptr;
+ if (use_read_callback) {
+ read_callback = new DummyReadCallback();
+ read_callback->SetSnapshot(seq);
+ InstrumentedMutexLock lock(&mutex_);
+ read_callbacks_.push_back(
+ std::unique_ptr<DummyReadCallback>(read_callback));
+ }
+ return dbfull()->NewIteratorImpl(read_options, cfd, seq, read_callback);
+ }
+
+ private:
+ InstrumentedMutex mutex_;
+ std::vector<std::unique_ptr<DummyReadCallback>> read_callbacks_;
+};
+
+TEST_P(DBIteratorTest, IteratorProperty) {
+ // The test needs to be changed if kPersistedTier is supported in iterator.
+ Options options = CurrentOptions();
+ CreateAndReopenWithCF({"pikachu"}, options);
+ ASSERT_OK(Put(1, "1", "2"));
+ ASSERT_OK(Delete(1, "2"));
+ ReadOptions ropt;
+ ropt.pin_data = false;
+ {
+ std::unique_ptr<Iterator> iter(NewIterator(ropt, handles_[1]));
+ iter->SeekToFirst();
+ std::string prop_value;
+ ASSERT_NOK(iter->GetProperty("non_existing.value", &prop_value));
+ ASSERT_OK(iter->GetProperty("rocksdb.iterator.is-key-pinned", &prop_value));
+ ASSERT_EQ("0", prop_value);
+ ASSERT_OK(iter->GetProperty("rocksdb.iterator.internal-key", &prop_value));
+ ASSERT_EQ("1", prop_value);
+ iter->Next();
+ ASSERT_OK(iter->GetProperty("rocksdb.iterator.is-key-pinned", &prop_value));
+ ASSERT_EQ("Iterator is not valid.", prop_value);
+
+ // Get internal key at which the iteration stopped (tombstone in this case).
+ ASSERT_OK(iter->GetProperty("rocksdb.iterator.internal-key", &prop_value));
+ ASSERT_EQ("2", prop_value);
+ }
+ Close();
+}
+
+TEST_P(DBIteratorTest, PersistedTierOnIterator) {
+ // The test needs to be changed if kPersistedTier is supported in iterator.
+ Options options = CurrentOptions();
+ CreateAndReopenWithCF({"pikachu"}, options);
+ ReadOptions ropt;
+ ropt.read_tier = kPersistedTier;
+
+ auto* iter = db_->NewIterator(ropt, handles_[1]);
+ ASSERT_TRUE(iter->status().IsNotSupported());
+ delete iter;
+
+ std::vector<Iterator*> iters;
+ ASSERT_TRUE(db_->NewIterators(ropt, {handles_[1]}, &iters).IsNotSupported());
+ Close();
+}
+
+TEST_P(DBIteratorTest, NonBlockingIteration) {
+ do {
+ ReadOptions non_blocking_opts, regular_opts;
+ anon::OptionsOverride options_override;
+ options_override.full_block_cache = true;
+ Options options = CurrentOptions(options_override);
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ non_blocking_opts.read_tier = kBlockCacheTier;
+
+ CreateAndReopenWithCF({"pikachu"}, options);
+ // write one kv to the database.
+ ASSERT_OK(Put(1, "a", "b"));
+
+    // Scan using a non-blocking iterator. We should find the key because
+    // it is in the memtable.
+ Iterator* iter = NewIterator(non_blocking_opts, handles_[1]);
+ int count = 0;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ ASSERT_OK(iter->status());
+ count++;
+ }
+ ASSERT_EQ(count, 1);
+ delete iter;
+
+    // Flush the memtable to storage. Now the key should be neither in the
+    // memtable nor in the block cache.
+ ASSERT_OK(Flush(1));
+
+    // Verify that a non-blocking iterator does not find any kvs and does
+    // not do any IO to storage.
+ uint64_t numopen = TestGetTickerCount(options, NO_FILE_OPENS);
+ uint64_t cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD);
+ iter = NewIterator(non_blocking_opts, handles_[1]);
+ count = 0;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ count++;
+ }
+ ASSERT_EQ(count, 0);
+ ASSERT_TRUE(iter->status().IsIncomplete());
+ ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS));
+ ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD));
+ delete iter;
+
+ // read in the specified block via a regular get
+ ASSERT_EQ(Get(1, "a"), "b");
+
+ // verify that we can find it via a non-blocking scan
+ numopen = TestGetTickerCount(options, NO_FILE_OPENS);
+ cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD);
+ iter = NewIterator(non_blocking_opts, handles_[1]);
+ count = 0;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ ASSERT_OK(iter->status());
+ count++;
+ }
+ ASSERT_EQ(count, 1);
+ ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS));
+ ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD));
+ delete iter;
+
+    // This test verifies block cache behavior, which the plain table format
+    // does not use.
+ } while (ChangeOptions(kSkipPlainTable | kSkipNoSeekToLast | kSkipMmapReads));
+}
+
+TEST_P(DBIteratorTest, IterSeekBeforePrev) {
+ ASSERT_OK(Put("a", "b"));
+ ASSERT_OK(Put("c", "d"));
+ EXPECT_OK(dbfull()->Flush(FlushOptions()));
+ ASSERT_OK(Put("0", "f"));
+ ASSERT_OK(Put("1", "h"));
+ EXPECT_OK(dbfull()->Flush(FlushOptions()));
+ ASSERT_OK(Put("2", "j"));
+ auto iter = NewIterator(ReadOptions());
+ iter->Seek(Slice("c"));
+ iter->Prev();
+ iter->Seek(Slice("a"));
+ iter->Prev();
+ delete iter;
+}
+
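+// The iterator reads iterate_upper_bound through the pointer stored in
+// ReadOptions, so rebinding ub from "aa" to "b" before the second Seek()
+// widens the bound and lets the iterator reach "aaef".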
+TEST_P(DBIteratorTest, IterReseekNewUpperBound) {
+ Random rnd(301);
+ Options options = CurrentOptions();
+ BlockBasedTableOptions table_options;
+ table_options.block_size = 1024;
+ table_options.block_size_deviation = 50;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ options.compression = kNoCompression;
+ Reopen(options);
+
+ ASSERT_OK(Put("a", rnd.RandomString(400)));
+ ASSERT_OK(Put("aabb", rnd.RandomString(400)));
+ ASSERT_OK(Put("aaef", rnd.RandomString(400)));
+ ASSERT_OK(Put("b", rnd.RandomString(400)));
+ EXPECT_OK(dbfull()->Flush(FlushOptions()));
+ ReadOptions opts;
+ Slice ub = Slice("aa");
+ opts.iterate_upper_bound = &ub;
+ auto iter = NewIterator(opts);
+ iter->Seek(Slice("a"));
+ ub = Slice("b");
+ iter->Seek(Slice("aabc"));
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().ToString(), "aaef");
+ delete iter;
+}
+
+TEST_P(DBIteratorTest, IterSeekForPrevBeforeNext) {
+ ASSERT_OK(Put("a", "b"));
+ ASSERT_OK(Put("c", "d"));
+ EXPECT_OK(dbfull()->Flush(FlushOptions()));
+ ASSERT_OK(Put("0", "f"));
+ ASSERT_OK(Put("1", "h"));
+ EXPECT_OK(dbfull()->Flush(FlushOptions()));
+ ASSERT_OK(Put("2", "j"));
+ auto iter = NewIterator(ReadOptions());
+ iter->SeekForPrev(Slice("0"));
+ iter->Next();
+ iter->SeekForPrev(Slice("1"));
+ iter->Next();
+ delete iter;
+}
+
+namespace {
+std::string MakeLongKey(size_t length, char c) {
+ return std::string(length, c);
+}
+} // anonymous namespace
+
+TEST_P(DBIteratorTest, IterLongKeys) {
+ ASSERT_OK(Put(MakeLongKey(20, 0), "0"));
+ ASSERT_OK(Put(MakeLongKey(32, 2), "2"));
+ ASSERT_OK(Put("a", "b"));
+ EXPECT_OK(dbfull()->Flush(FlushOptions()));
+ ASSERT_OK(Put(MakeLongKey(50, 1), "1"));
+ ASSERT_OK(Put(MakeLongKey(127, 3), "3"));
+ ASSERT_OK(Put(MakeLongKey(64, 4), "4"));
+ auto iter = NewIterator(ReadOptions());
+
+  // Seek to the first long key and walk forward through the remaining keys.
+ iter->Seek(MakeLongKey(20, 0));
+ ASSERT_EQ(IterStatus(iter), MakeLongKey(20, 0) + "->0");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), MakeLongKey(50, 1) + "->1");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), MakeLongKey(32, 2) + "->2");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), MakeLongKey(127, 3) + "->3");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), MakeLongKey(64, 4) + "->4");
+
+ iter->SeekForPrev(MakeLongKey(127, 3));
+ ASSERT_EQ(IterStatus(iter), MakeLongKey(127, 3) + "->3");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), MakeLongKey(32, 2) + "->2");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), MakeLongKey(50, 1) + "->1");
+ delete iter;
+
+ iter = NewIterator(ReadOptions());
+ iter->Seek(MakeLongKey(50, 1));
+ ASSERT_EQ(IterStatus(iter), MakeLongKey(50, 1) + "->1");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), MakeLongKey(32, 2) + "->2");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), MakeLongKey(127, 3) + "->3");
+ delete iter;
+}
+
+TEST_P(DBIteratorTest, IterNextWithNewerSeq) {
+ ASSERT_OK(Put("0", "0"));
+ EXPECT_OK(dbfull()->Flush(FlushOptions()));
+ ASSERT_OK(Put("a", "b"));
+ ASSERT_OK(Put("c", "d"));
+ ASSERT_OK(Put("d", "e"));
+ auto iter = NewIterator(ReadOptions());
+
+ // Create a key that needs to be skipped for Seq too new
+ for (uint64_t i = 0; i < last_options_.max_sequential_skip_in_iterations + 1;
+ i++) {
+ ASSERT_OK(Put("b", "f"));
+ }
+
+ iter->Seek(Slice("a"));
+ ASSERT_EQ(IterStatus(iter), "a->b");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "c->d");
+ iter->SeekForPrev(Slice("b"));
+ ASSERT_EQ(IterStatus(iter), "a->b");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "c->d");
+
+ delete iter;
+}
+
+TEST_P(DBIteratorTest, IterPrevWithNewerSeq) {
+ ASSERT_OK(Put("0", "0"));
+ EXPECT_OK(dbfull()->Flush(FlushOptions()));
+ ASSERT_OK(Put("a", "b"));
+ ASSERT_OK(Put("c", "d"));
+ ASSERT_OK(Put("d", "e"));
+ auto iter = NewIterator(ReadOptions());
+
+ // Create a key that needs to be skipped for Seq too new
+ for (uint64_t i = 0; i < last_options_.max_sequential_skip_in_iterations + 1;
+ i++) {
+ ASSERT_OK(Put("b", "f"));
+ }
+
+ iter->Seek(Slice("d"));
+ ASSERT_EQ(IterStatus(iter), "d->e");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "c->d");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "a->b");
+ iter->Prev();
+ iter->SeekForPrev(Slice("d"));
+ ASSERT_EQ(IterStatus(iter), "d->e");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "c->d");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "a->b");
+ iter->Prev();
+ delete iter;
+}
+
+TEST_P(DBIteratorTest, IterPrevWithNewerSeq2) {
+ ASSERT_OK(Put("0", "0"));
+ EXPECT_OK(dbfull()->Flush(FlushOptions()));
+ ASSERT_OK(Put("a", "b"));
+ ASSERT_OK(Put("c", "d"));
+ ASSERT_OK(Put("e", "f"));
+ auto iter = NewIterator(ReadOptions());
+ auto iter2 = NewIterator(ReadOptions());
+ iter->Seek(Slice("c"));
+ iter2->SeekForPrev(Slice("d"));
+ ASSERT_EQ(IterStatus(iter), "c->d");
+ ASSERT_EQ(IterStatus(iter2), "c->d");
+
+ // Create a key that needs to be skipped for Seq too new
+ for (uint64_t i = 0; i < last_options_.max_sequential_skip_in_iterations + 1;
+ i++) {
+ ASSERT_OK(Put("b", "f"));
+ }
+
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "a->b");
+ iter->Prev();
+ iter2->Prev();
+ ASSERT_EQ(IterStatus(iter2), "a->b");
+ iter2->Prev();
+ delete iter;
+ delete iter2;
+}
+
+TEST_P(DBIteratorTest, IterEmpty) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ Iterator* iter = NewIterator(ReadOptions(), handles_[1]);
+
+ iter->SeekToFirst();
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+ iter->SeekToLast();
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+ iter->Seek("foo");
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+ iter->SeekForPrev("foo");
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+ ASSERT_OK(iter->status());
+
+ delete iter;
+ } while (ChangeCompactOptions());
+}
+
+TEST_P(DBIteratorTest, IterSingle) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ ASSERT_OK(Put(1, "a", "va"));
+ Iterator* iter = NewIterator(ReadOptions(), handles_[1]);
+
+ iter->SeekToFirst();
+ ASSERT_EQ(IterStatus(iter), "a->va");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+ iter->SeekToFirst();
+ ASSERT_EQ(IterStatus(iter), "a->va");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+ iter->SeekToLast();
+ ASSERT_EQ(IterStatus(iter), "a->va");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+ iter->SeekToLast();
+ ASSERT_EQ(IterStatus(iter), "a->va");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+ iter->Seek("");
+ ASSERT_EQ(IterStatus(iter), "a->va");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+ iter->SeekForPrev("");
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+ iter->Seek("a");
+ ASSERT_EQ(IterStatus(iter), "a->va");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+ iter->SeekForPrev("a");
+ ASSERT_EQ(IterStatus(iter), "a->va");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+ iter->Seek("b");
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+ iter->SeekForPrev("b");
+ ASSERT_EQ(IterStatus(iter), "a->va");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+ delete iter;
+ } while (ChangeCompactOptions());
+}
+
+TEST_P(DBIteratorTest, IterMulti) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ ASSERT_OK(Put(1, "a", "va"));
+ ASSERT_OK(Put(1, "b", "vb"));
+ ASSERT_OK(Put(1, "c", "vc"));
+ Iterator* iter = NewIterator(ReadOptions(), handles_[1]);
+
+ iter->SeekToFirst();
+ ASSERT_EQ(IterStatus(iter), "a->va");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "b->vb");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "c->vc");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+ iter->SeekToFirst();
+ ASSERT_EQ(IterStatus(iter), "a->va");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+ iter->SeekToLast();
+ ASSERT_EQ(IterStatus(iter), "c->vc");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "b->vb");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "a->va");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+ iter->SeekToLast();
+ ASSERT_EQ(IterStatus(iter), "c->vc");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+ iter->Seek("");
+ ASSERT_EQ(IterStatus(iter), "a->va");
+ iter->Seek("a");
+ ASSERT_EQ(IterStatus(iter), "a->va");
+ iter->Seek("ax");
+ ASSERT_EQ(IterStatus(iter), "b->vb");
+ iter->SeekForPrev("d");
+ ASSERT_EQ(IterStatus(iter), "c->vc");
+ iter->SeekForPrev("c");
+ ASSERT_EQ(IterStatus(iter), "c->vc");
+ iter->SeekForPrev("bx");
+ ASSERT_EQ(IterStatus(iter), "b->vb");
+
+ iter->Seek("b");
+ ASSERT_EQ(IterStatus(iter), "b->vb");
+ iter->Seek("z");
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+ iter->SeekForPrev("b");
+ ASSERT_EQ(IterStatus(iter), "b->vb");
+ iter->SeekForPrev("");
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+ // Switch from reverse to forward
+ iter->SeekToLast();
+ iter->Prev();
+ iter->Prev();
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "b->vb");
+
+ // Switch from forward to reverse
+ iter->SeekToFirst();
+ iter->Next();
+ iter->Next();
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "b->vb");
+
+ // Make sure iter stays at snapshot
+ ASSERT_OK(Put(1, "a", "va2"));
+ ASSERT_OK(Put(1, "a2", "va3"));
+ ASSERT_OK(Put(1, "b", "vb2"));
+ ASSERT_OK(Put(1, "c", "vc2"));
+ ASSERT_OK(Delete(1, "b"));
+ iter->SeekToFirst();
+ ASSERT_EQ(IterStatus(iter), "a->va");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "b->vb");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "c->vc");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+ iter->SeekToLast();
+ ASSERT_EQ(IterStatus(iter), "c->vc");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "b->vb");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "a->va");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+ delete iter;
+ } while (ChangeCompactOptions());
+}
+
+// Check that we can skip over a run of user keys
+// by using reseek rather than sequential scan
+TEST_P(DBIteratorTest, IterReseek) {
+ anon::OptionsOverride options_override;
+ options_override.skip_policy = kSkipNoSnapshot;
+ Options options = CurrentOptions(options_override);
+ options.max_sequential_skip_in_iterations = 3;
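+ // When an iterator has to skip more than max_sequential_skip_in_iterations
+ // internal entries for the same user key, it is expected to give up linear
+ // scanning and reseek instead, which is counted by
+ // NUMBER_OF_RESEEKS_IN_ITERATION. A limit of 3 keeps the test cases small.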
+ options.create_if_missing = true;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // Insert three versions of user key "a" (plus one "b") and verify that
+ // reseek is not invoked. For each of these test cases, verify that we
+ // can find the next key "b".
+ ASSERT_OK(Put(1, "a", "zero"));
+ ASSERT_OK(Put(1, "a", "one"));
+ ASSERT_OK(Put(1, "a", "two"));
+ ASSERT_OK(Put(1, "b", "bone"));
+ Iterator* iter = NewIterator(ReadOptions(), handles_[1]);
+ iter->SeekToFirst();
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 0);
+ ASSERT_EQ(IterStatus(iter), "a->two");
+ iter->Next();
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 0);
+ ASSERT_EQ(IterStatus(iter), "b->bone");
+ delete iter;
+
+ // Insert a fourth version of "a" and verify that reseek is still not
+ // invoked (the skip count does not yet exceed the limit of 3).
+ ASSERT_OK(Put(1, "a", "three"));
+ iter = NewIterator(ReadOptions(), handles_[1]);
+ iter->SeekToFirst();
+ ASSERT_EQ(IterStatus(iter), "a->three");
+ iter->Next();
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 0);
+ ASSERT_EQ(IterStatus(iter), "b->bone");
+ delete iter;
+
+ // Insert a fifth version of "a"; Next() now has to skip more entries than
+ // max_sequential_skip_in_iterations allows, so reseek is invoked.
+ ASSERT_OK(Put(1, "a", "four"));
+ iter = NewIterator(ReadOptions(), handles_[1]);
+ iter->SeekToFirst();
+ ASSERT_EQ(IterStatus(iter), "a->four");
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 0);
+ iter->Next();
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 1);
+ ASSERT_EQ(IterStatus(iter), "b->bone");
+ delete iter;
+
+ // Testing reverse iteration.
+ // At this point there are five versions of "a" and one version of "b",
+ // and the reseek statistic is already at 1.
+ int num_reseeks = static_cast<int>(
+ TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION));
+
+ // Insert another version of "b"; SeekToLast() should not invoke reseek,
+ // but the subsequent Prev() over the many versions of "a" should.
+ ASSERT_OK(Put(1, "b", "btwo"));
+ iter = NewIterator(ReadOptions(), handles_[1]);
+ iter->SeekToLast();
+ ASSERT_EQ(IterStatus(iter), "b->btwo");
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION),
+ num_reseeks);
+ iter->Prev();
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION),
+ num_reseeks + 1);
+ ASSERT_EQ(IterStatus(iter), "a->four");
+ delete iter;
+
+ // Insert two more versions of "b". This makes a total of four versions
+ // of "b" and five versions of "a".
+ ASSERT_OK(Put(1, "b", "bthree"));
+ ASSERT_OK(Put(1, "b", "bfour"));
+ iter = NewIterator(ReadOptions(), handles_[1]);
+ iter->SeekToLast();
+ ASSERT_EQ(IterStatus(iter), "b->bfour");
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION),
+ num_reseeks + 2);
+ iter->Prev();
+
+ // the previous Prev call should have invoked reseek
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION),
+ num_reseeks + 3);
+ ASSERT_EQ(IterStatus(iter), "a->four");
+ delete iter;
+}
+
+TEST_F(DBIteratorTest, ReseekUponDirectionChange) {
+ Options options = GetDefaultOptions();
+ options.create_if_missing = true;
+ options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ options.merge_operator.reset(
+ new StringAppendTESTOperator(/*delim_char=*/' '));
+ DestroyAndReopen(options);
+ ASSERT_OK(Put("foo", "value"));
+ ASSERT_OK(Put("bar", "value"));
+ {
+ std::unique_ptr<Iterator> it(db_->NewIterator(ReadOptions()));
+ it->SeekToLast();
+ it->Prev();
+ it->Next();
+ }
+ ASSERT_EQ(1,
+ options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION));
+
+ const std::string merge_key("good");
+ ASSERT_OK(Put(merge_key, "orig"));
+ ASSERT_OK(Merge(merge_key, "suffix"));
+ {
+ std::unique_ptr<Iterator> it(db_->NewIterator(ReadOptions()));
+ it->Seek(merge_key);
+ ASSERT_TRUE(it->Valid());
+ const uint64_t prev_reseek_count =
+ options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION);
+ it->Prev();
+ ASSERT_EQ(prev_reseek_count + 1, options.statistics->getTickerCount(
+ NUMBER_OF_RESEEKS_IN_ITERATION));
+ }
+}
+
+TEST_P(DBIteratorTest, IterSmallAndLargeMix) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ ASSERT_OK(Put(1, "a", "va"));
+ ASSERT_OK(Put(1, "b", std::string(100000, 'b')));
+ ASSERT_OK(Put(1, "c", "vc"));
+ ASSERT_OK(Put(1, "d", std::string(100000, 'd')));
+ ASSERT_OK(Put(1, "e", std::string(100000, 'e')));
+
+ Iterator* iter = NewIterator(ReadOptions(), handles_[1]);
+
+ iter->SeekToFirst();
+ ASSERT_EQ(IterStatus(iter), "a->va");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "b->" + std::string(100000, 'b'));
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "c->vc");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "d->" + std::string(100000, 'd'));
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "e->" + std::string(100000, 'e'));
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+ iter->SeekToLast();
+ ASSERT_EQ(IterStatus(iter), "e->" + std::string(100000, 'e'));
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "d->" + std::string(100000, 'd'));
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "c->vc");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "b->" + std::string(100000, 'b'));
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "a->va");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+ delete iter;
+ } while (ChangeCompactOptions());
+}
+
+TEST_P(DBIteratorTest, IterMultiWithDelete) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ ASSERT_OK(Put(1, "ka", "va"));
+ ASSERT_OK(Put(1, "kb", "vb"));
+ ASSERT_OK(Put(1, "kc", "vc"));
+ ASSERT_OK(Delete(1, "kb"));
+ ASSERT_EQ("NOT_FOUND", Get(1, "kb"));
+
+ Iterator* iter = NewIterator(ReadOptions(), handles_[1]);
+ iter->Seek("kc");
+ ASSERT_EQ(IterStatus(iter), "kc->vc");
+ if (!CurrentOptions().merge_operator) {
+ // TODO: merge operator does not support backward iteration yet
+ if (kPlainTableAllBytesPrefix != option_config_ &&
+ kBlockBasedTableWithWholeKeyHashIndex != option_config_ &&
+ kHashLinkList != option_config_ &&
+ kHashSkipList != option_config_) { // doesn't support SeekToLast
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "ka->va");
+ }
+ }
+ delete iter;
+ } while (ChangeOptions());
+}
+
+TEST_P(DBIteratorTest, IterPrevMaxSkip) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ for (int i = 0; i < 2; i++) {
+ ASSERT_OK(Put(1, "key1", "v1"));
+ ASSERT_OK(Put(1, "key2", "v2"));
+ ASSERT_OK(Put(1, "key3", "v3"));
+ ASSERT_OK(Put(1, "key4", "v4"));
+ ASSERT_OK(Put(1, "key5", "v5"));
+ }
+
+ VerifyIterLast("key5->v5", 1);
+
+ ASSERT_OK(Delete(1, "key5"));
+ VerifyIterLast("key4->v4", 1);
+
+ ASSERT_OK(Delete(1, "key4"));
+ VerifyIterLast("key3->v3", 1);
+
+ ASSERT_OK(Delete(1, "key3"));
+ VerifyIterLast("key2->v2", 1);
+
+ ASSERT_OK(Delete(1, "key2"));
+ VerifyIterLast("key1->v1", 1);
+
+ ASSERT_OK(Delete(1, "key1"));
+ VerifyIterLast("(invalid)", 1);
+ } while (ChangeOptions(kSkipMergePut | kSkipNoSeekToLast));
+}
+
+TEST_P(DBIteratorTest, IterWithSnapshot) {
+ anon::OptionsOverride options_override;
+ options_override.skip_policy = kSkipNoSnapshot;
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions(options_override));
+ ASSERT_OK(Put(1, "key1", "val1"));
+ ASSERT_OK(Put(1, "key2", "val2"));
+ ASSERT_OK(Put(1, "key3", "val3"));
+ ASSERT_OK(Put(1, "key4", "val4"));
+ ASSERT_OK(Put(1, "key5", "val5"));
+
+ const Snapshot* snapshot = db_->GetSnapshot();
+ ReadOptions options;
+ options.snapshot = snapshot;
+ Iterator* iter = NewIterator(options, handles_[1]);
+
+ ASSERT_OK(Put(1, "key0", "val0"));
+ // Put more values after the snapshot
+ ASSERT_OK(Put(1, "key100", "val100"));
+ ASSERT_OK(Put(1, "key101", "val101"));
+
+ iter->Seek("key5");
+ ASSERT_EQ(IterStatus(iter), "key5->val5");
+ if (!CurrentOptions().merge_operator) {
+ // TODO: merge operator does not support backward iteration yet
+ if (kPlainTableAllBytesPrefix != option_config_ &&
+ kBlockBasedTableWithWholeKeyHashIndex != option_config_ &&
+ kHashLinkList != option_config_ && kHashSkipList != option_config_) {
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "key4->val4");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "key3->val3");
+
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "key4->val4");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "key5->val5");
+ }
+ iter->Next();
+ ASSERT_TRUE(!iter->Valid());
+ }
+
+ if (!CurrentOptions().merge_operator) {
+ // TODO(gzh): merge operator does not support backward iteration yet
+ if (kPlainTableAllBytesPrefix != option_config_ &&
+ kBlockBasedTableWithWholeKeyHashIndex != option_config_ &&
+ kHashLinkList != option_config_ && kHashSkipList != option_config_) {
+ iter->SeekForPrev("key1");
+ ASSERT_EQ(IterStatus(iter), "key1->val1");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "key2->val2");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "key3->val3");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "key2->val2");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "key1->val1");
+ iter->Prev();
+ ASSERT_TRUE(!iter->Valid());
+ }
+ }
+ db_->ReleaseSnapshot(snapshot);
+ delete iter;
+ } while (ChangeOptions());
+}
+
+TEST_P(DBIteratorTest, IteratorPinsRef) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ ASSERT_OK(Put(1, "foo", "hello"));
+
+ // Get iterator that will yield the current contents of the DB.
+ Iterator* iter = NewIterator(ReadOptions(), handles_[1]);
+
+ // Write to force compactions
+ ASSERT_OK(Put(1, "foo", "newvalue1"));
+ for (int i = 0; i < 100; i++) {
+ // ~100KB values
+ ASSERT_OK(Put(1, Key(i), Key(i) + std::string(100000, 'v')));
+ }
+ ASSERT_OK(Put(1, "foo", "newvalue2"));
+
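+ // The iterator was created before the writes above, so it must still see
+ // only the original "foo" -> "hello" entry.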
+ iter->SeekToFirst();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("foo", iter->key().ToString());
+ ASSERT_EQ("hello", iter->value().ToString());
+ iter->Next();
+ ASSERT_TRUE(!iter->Valid());
+ delete iter;
+ } while (ChangeCompactOptions());
+}
+
+TEST_P(DBIteratorTest, IteratorDeleteAfterCfDelete) {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+
+ ASSERT_OK(Put(1, "foo", "delete-cf-then-delete-iter"));
+ ASSERT_OK(Put(1, "hello", "value2"));
+
+ ColumnFamilyHandle* cf = handles_[1];
+ ReadOptions ro;
+
+ auto* iter = db_->NewIterator(ro, cf);
+ iter->SeekToFirst();
+ ASSERT_EQ(IterStatus(iter), "foo->delete-cf-then-delete-iter");
+
+ // delete CF handle
+ EXPECT_OK(db_->DestroyColumnFamilyHandle(cf));
+ handles_.erase(std::begin(handles_) + 1);
+
+ // delete Iterator after CF handle is deleted
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "hello->value2");
+ delete iter;
+}
+
+TEST_P(DBIteratorTest, IteratorDeleteAfterCfDrop) {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+
+ ASSERT_OK(Put(1, "foo", "drop-cf-then-delete-iter"));
+
+ ReadOptions ro;
+ ColumnFamilyHandle* cf = handles_[1];
+
+ auto* iter = db_->NewIterator(ro, cf);
+ iter->SeekToFirst();
+ ASSERT_EQ(IterStatus(iter), "foo->drop-cf-then-delete-iter");
+
+ // drop and delete CF
+ EXPECT_OK(db_->DropColumnFamily(cf));
+ EXPECT_OK(db_->DestroyColumnFamilyHandle(cf));
+ handles_.erase(std::begin(handles_) + 1);
+
+ // delete Iterator after CF handle is dropped
+ delete iter;
+}
+
+// SetOptions() is not supported in ROCKSDB_LITE
+#ifndef ROCKSDB_LITE
+TEST_P(DBIteratorTest, DBIteratorBoundTest) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+
+ options.prefix_extractor = nullptr;
+ DestroyAndReopen(options);
+ ASSERT_OK(Put("a", "0"));
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Put("foo1", "bar1"));
+ ASSERT_OK(Put("g1", "0"));
+
+ // testing basic case with no iterate_upper_bound and no prefix_extractor
+ {
+ ReadOptions ro;
+ ro.iterate_upper_bound = nullptr;
+
+ std::unique_ptr<Iterator> iter(NewIterator(ro));
+
+ iter->Seek("foo");
+
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("foo")), 0);
+
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("foo1")), 0);
+
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("g1")), 0);
+
+ iter->SeekForPrev("g1");
+
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("g1")), 0);
+
+ iter->Prev();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("foo1")), 0);
+
+ iter->Prev();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("foo")), 0);
+ }
+
+ // testing iterate_upper_bound and forward iterator
+ // to make sure it stops at bound
+ {
+ ReadOptions ro;
+ // iterate_upper_bound points beyond the last expected entry
+ Slice prefix("foo2");
+ ro.iterate_upper_bound = &prefix;
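+ // The upper bound is exclusive: keys >= "foo2" must never be returned.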
+
+ std::unique_ptr<Iterator> iter(NewIterator(ro));
+
+ iter->Seek("foo");
+
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("foo")), 0);
+
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("foo1")), 0);
+
+ iter->Next();
+ // should stop here...
+ ASSERT_TRUE(!iter->Valid());
+ }
+ // Testing SeekToLast with iterate_upper_bound set
+ {
+ ReadOptions ro;
+
+ Slice prefix("foo");
+ ro.iterate_upper_bound = &prefix;
+
+ std::unique_ptr<Iterator> iter(NewIterator(ro));
+
+ iter->SeekToLast();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("a")), 0);
+ }
+
+ // prefix is the first letter of the key
+ ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "fixed:1"}}));
+ ASSERT_OK(Put("a", "0"));
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Put("foo1", "bar1"));
+ ASSERT_OK(Put("g1", "0"));
+
+ // testing with iterate_upper_bound and prefix_extractor
+ // Seek target and iterate_upper_bound are not in the same prefix
+ // This should be an error
+ {
+ ReadOptions ro;
+ Slice upper_bound("g");
+ ro.iterate_upper_bound = &upper_bound;
+
+ std::unique_ptr<Iterator> iter(NewIterator(ro));
+
+ iter->Seek("foo");
+
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("foo", iter->key().ToString());
+
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("foo1", iter->key().ToString());
+
+ iter->Next();
+ ASSERT_TRUE(!iter->Valid());
+ }
+
+ // testing that iterate_upper_bound prevents iterating over deleted items
+ // once the bound has been reached
+ {
+ options.prefix_extractor = nullptr;
+ DestroyAndReopen(options);
+ ASSERT_OK(Put("a", "0"));
+ ASSERT_OK(Put("b", "0"));
+ ASSERT_OK(Put("b1", "0"));
+ ASSERT_OK(Put("c", "0"));
+ ASSERT_OK(Put("d", "0"));
+ ASSERT_OK(Put("e", "0"));
+ ASSERT_OK(Delete("c"));
+ ASSERT_OK(Delete("d"));
+
+ // base case with no bound
+ ReadOptions ro;
+ ro.iterate_upper_bound = nullptr;
+
+ std::unique_ptr<Iterator> iter(NewIterator(ro));
+
+ iter->Seek("b");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("b")), 0);
+
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("b1")), 0);
+
+ get_perf_context()->Reset();
+ iter->Next();
+
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(
+ static_cast<int>(get_perf_context()->internal_delete_skipped_count), 2);
+
+ // now testing with iterate_bound
+ Slice prefix("c");
+ ro.iterate_upper_bound = &prefix;
+
+ iter.reset(NewIterator(ro));
+
+ get_perf_context()->Reset();
+
+ iter->Seek("b");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("b")), 0);
+
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("b1")), 0);
+
+ iter->Next();
+ // the iteration should stop as soon as the bound key is reached
+ // even though the key is deleted
+ // hence internal_delete_skipped_count should be 0
+ ASSERT_TRUE(!iter->Valid());
+ ASSERT_EQ(
+ static_cast<int>(get_perf_context()->internal_delete_skipped_count), 0);
+ }
+}
+
+TEST_P(DBIteratorTest, DBIteratorBoundMultiSeek) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ options.prefix_extractor = nullptr;
+ DestroyAndReopen(options);
+ ASSERT_OK(Put("a", "0"));
+ ASSERT_OK(Put("z", "0"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("foo1", "bar1"));
+ ASSERT_OK(Put("foo2", "bar2"));
+ ASSERT_OK(Put("foo3", "bar3"));
+ ASSERT_OK(Put("foo4", "bar4"));
+
+ {
+ std::string up_str = "foo5";
+ Slice up(up_str);
+ ReadOptions ro;
+ ro.iterate_upper_bound = &up;
+ std::unique_ptr<Iterator> iter(NewIterator(ro));
+
+ iter->Seek("foo1");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("foo1")), 0);
+
+ uint64_t prev_block_cache_hit =
+ TestGetTickerCount(options, BLOCK_CACHE_HIT);
+ uint64_t prev_block_cache_miss =
+ TestGetTickerCount(options, BLOCK_CACHE_MISS);
+
+ ASSERT_GT(prev_block_cache_hit + prev_block_cache_miss, 0);
+
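+ // The following Seeks should not cause any additional block cache
+ // activity; the hit and miss tickers are expected to stay at their
+ // previous values.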
+ iter->Seek("foo4");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("foo4")), 0);
+ ASSERT_EQ(prev_block_cache_hit,
+ TestGetTickerCount(options, BLOCK_CACHE_HIT));
+ ASSERT_EQ(prev_block_cache_miss,
+ TestGetTickerCount(options, BLOCK_CACHE_MISS));
+
+ iter->Seek("foo2");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("foo2")), 0);
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("foo3")), 0);
+ ASSERT_EQ(prev_block_cache_hit,
+ TestGetTickerCount(options, BLOCK_CACHE_HIT));
+ ASSERT_EQ(prev_block_cache_miss,
+ TestGetTickerCount(options, BLOCK_CACHE_MISS));
+ }
+}
+#endif  // !ROCKSDB_LITE
+
+TEST_P(DBIteratorTest, DBIteratorBoundOptimizationTest) {
+ for (auto format_version : {2, 3, 4}) {
+ int upper_bound_hits = 0;
+ Options options = CurrentOptions();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "BlockBasedTableIterator:out_of_bound",
+ [&upper_bound_hits](void*) { upper_bound_hits++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ options.env = env_;
+ options.create_if_missing = true;
+ options.prefix_extractor = nullptr;
+ BlockBasedTableOptions table_options;
+ table_options.format_version = format_version;
+ table_options.flush_block_policy_factory =
+ std::make_shared<FlushBlockEveryKeyPolicyFactory>();
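+ // FlushBlockEveryKeyPolicyFactory puts each key-value pair in its own data
+ // block, making the out_of_bound sync point fire at a predictable place.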
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ DestroyAndReopen(options);
+ ASSERT_OK(Put("foo1", "bar1"));
+ ASSERT_OK(Put("foo2", "bar2"));
+ ASSERT_OK(Put("foo4", "bar4"));
+ ASSERT_OK(Flush());
+
+ Slice ub("foo3");
+ ReadOptions ro;
+ ro.iterate_upper_bound = &ub;
+
+ std::unique_ptr<Iterator> iter(NewIterator(ro));
+
+ iter->Seek("foo");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("foo1")), 0);
+ ASSERT_EQ(upper_bound_hits, 0);
+
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("foo2")), 0);
+ ASSERT_EQ(upper_bound_hits, 0);
+
+ iter->Next();
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_EQ(upper_bound_hits, 1);
+ }
+}
+
+// Enable kBinarySearchWithFirstKey, do some iterator operations and check that
+// they don't do unnecessary block reads.
+TEST_P(DBIteratorTest, IndexWithFirstKey) {
+ for (int tailing = 0; tailing < 2; ++tailing) {
+ SCOPED_TRACE("tailing = " + std::to_string(tailing));
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ options.prefix_extractor = nullptr;
+ options.merge_operator = MergeOperators::CreateStringAppendOperator();
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ Statistics* stats = options.statistics.get();
+ BlockBasedTableOptions table_options;
+ table_options.index_type =
+ BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey;
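+ // kBinarySearchWithFirstKey stores each data block's first key in the
+ // index, which lets seeks position on a block without reading it until the
+ // value is actually needed; the test counts data block reads to verify
+ // that.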
+ table_options.index_shortening =
+ BlockBasedTableOptions::IndexShorteningMode::kNoShortening;
+ table_options.flush_block_policy_factory =
+ std::make_shared<FlushBlockEveryKeyPolicyFactory>();
+ table_options.block_cache =
+ NewLRUCache(8000); // fits all blocks and their cache metadata overhead
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ DestroyAndReopen(options);
+ ASSERT_OK(Merge("a1", "x1"));
+ ASSERT_OK(Merge("b1", "y1"));
+ ASSERT_OK(Merge("c0", "z1"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Merge("a2", "x2"));
+ ASSERT_OK(Merge("b2", "y2"));
+ ASSERT_OK(Merge("c0", "z2"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Merge("a3", "x3"));
+ ASSERT_OK(Merge("b3", "y3"));
+ ASSERT_OK(Merge("c3", "z3"));
+ ASSERT_OK(Flush());
+
+ // Block cache is not important for this test.
+ // We use BLOCK_CACHE_DATA_* counters just because they're the most readily
+ // available way of counting block accesses.
+
+ ReadOptions ropt;
+ ropt.tailing = tailing;
+ std::unique_ptr<Iterator> iter(NewIterator(ropt));
+
+ ropt.read_tier = ReadTier::kBlockCacheTier;
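+ // kBlockCacheTier restricts reads to the block cache: any access that
+ // would have to go to the file system is expected to fail with
+ // Status::Incomplete instead.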
+ std::unique_ptr<Iterator> nonblocking_iter(NewIterator(ropt));
+
+ iter->Seek("b10");
+ ASSERT_TRUE(iter->Valid());
+ EXPECT_EQ("b2", iter->key().ToString());
+ EXPECT_EQ("y2", iter->value().ToString());
+ EXPECT_EQ(1, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
+
+ // The cache-only iterator should succeed too, using the blocks pulled into
+ // the cache by the previous iterator.
+ nonblocking_iter->Seek("b10");
+ ASSERT_TRUE(nonblocking_iter->Valid());
+ EXPECT_EQ("b2", nonblocking_iter->key().ToString());
+ EXPECT_EQ("y2", nonblocking_iter->value().ToString());
+ EXPECT_EQ(1, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
+
+ // ... but it shouldn't be able to step forward since the next block is
+ // not in cache yet.
+ nonblocking_iter->Next();
+ ASSERT_FALSE(nonblocking_iter->Valid());
+ ASSERT_TRUE(nonblocking_iter->status().IsIncomplete());
+
+ // ... nor should a seek to the next key succeed.
+ nonblocking_iter->Seek("b20");
+ ASSERT_FALSE(nonblocking_iter->Valid());
+ ASSERT_TRUE(nonblocking_iter->status().IsIncomplete());
+
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ EXPECT_EQ("b3", iter->key().ToString());
+ EXPECT_EQ("y3", iter->value().ToString());
+ EXPECT_EQ(4, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
+ EXPECT_EQ(1, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
+
+ // After the blocking iterator loaded the next block, the nonblocking
+ // iterator's seek should succeed.
+ nonblocking_iter->Seek("b20");
+ ASSERT_TRUE(nonblocking_iter->Valid());
+ EXPECT_EQ("b3", nonblocking_iter->key().ToString());
+ EXPECT_EQ("y3", nonblocking_iter->value().ToString());
+ EXPECT_EQ(2, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
+
+ iter->Seek("c0");
+ ASSERT_TRUE(iter->Valid());
+ EXPECT_EQ("c0", iter->key().ToString());
+ EXPECT_EQ("z1,z2", iter->value().ToString());
+ EXPECT_EQ(2, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
+ EXPECT_EQ(6, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
+
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ EXPECT_EQ("c3", iter->key().ToString());
+ EXPECT_EQ("z3", iter->value().ToString());
+ EXPECT_EQ(2, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
+ EXPECT_EQ(7, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
+
+ iter.reset();
+
+ // Enable iterate_upper_bound and check that iterator is not trying to read
+ // blocks that are fully above upper bound.
+ std::string ub = "b3";
+ Slice ub_slice(ub);
+ ropt.iterate_upper_bound = &ub_slice;
+ iter.reset(NewIterator(ropt));
+
+ iter->Seek("b2");
+ ASSERT_TRUE(iter->Valid());
+ EXPECT_EQ("b2", iter->key().ToString());
+ EXPECT_EQ("y2", iter->value().ToString());
+ EXPECT_EQ(3, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
+ EXPECT_EQ(7, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
+
+ iter->Next();
+ ASSERT_FALSE(iter->Valid());
+ EXPECT_EQ(3, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
+ EXPECT_EQ(7, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
+ }
+}
+
+TEST_P(DBIteratorTest, IndexWithFirstKeyGet) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ options.prefix_extractor = nullptr;
+ options.merge_operator = MergeOperators::CreateStringAppendOperator();
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ Statistics* stats = options.statistics.get();
+ BlockBasedTableOptions table_options;
+ table_options.index_type =
+ BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey;
+ table_options.index_shortening =
+ BlockBasedTableOptions::IndexShorteningMode::kNoShortening;
+ table_options.flush_block_policy_factory =
+ std::make_shared<FlushBlockEveryKeyPolicyFactory>();
+ table_options.block_cache = NewLRUCache(1000); // fits all blocks
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ DestroyAndReopen(options);
+ ASSERT_OK(Merge("a", "x1"));
+ ASSERT_OK(Merge("c", "y1"));
+ ASSERT_OK(Merge("e", "z1"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Merge("c", "y2"));
+ ASSERT_OK(Merge("e", "z2"));
+ ASSERT_OK(Flush());
+
+ // Get() between blocks shouldn't read any blocks.
+ ASSERT_EQ("NOT_FOUND", Get("b"));
+ EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
+ EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
+
+ // Get() of an existing key shouldn't read any unnecessary blocks when there's
+ // only one key per block.
+
+ ASSERT_EQ("y1,y2", Get("c"));
+ EXPECT_EQ(2, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
+ EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
+
+ ASSERT_EQ("x1", Get("a"));
+ EXPECT_EQ(3, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
+ EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
+
+ EXPECT_EQ(std::vector<std::string>({"NOT_FOUND", "z1,z2"}),
+ MultiGet({"b", "e"}));
+}
+
+// TODO(3.13): fix the issue of Seek() + Prev() which might not necessarily
+// return the biggest key that is smaller than the seek key.
+TEST_P(DBIteratorTest, PrevAfterAndNextAfterMerge) {
+ Options options;
+ options.create_if_missing = true;
+ options.merge_operator = MergeOperators::CreatePutOperator();
+ options.env = env_;
+ DestroyAndReopen(options);
+
+ // write three entries with different keys using Merge()
+ WriteOptions wopts;
+ ASSERT_OK(db_->Merge(wopts, "1", "data1"));
+ ASSERT_OK(db_->Merge(wopts, "2", "data2"));
+ ASSERT_OK(db_->Merge(wopts, "3", "data3"));
+
+ std::unique_ptr<Iterator> it(NewIterator(ReadOptions()));
+
+ it->Seek("2");
+ ASSERT_TRUE(it->Valid());
+ ASSERT_EQ("2", it->key().ToString());
+
+ it->Prev();
+ ASSERT_TRUE(it->Valid());
+ ASSERT_EQ("1", it->key().ToString());
+
+ it->SeekForPrev("1");
+ ASSERT_TRUE(it->Valid());
+ ASSERT_EQ("1", it->key().ToString());
+
+ it->Next();
+ ASSERT_TRUE(it->Valid());
+ ASSERT_EQ("2", it->key().ToString());
+}
+
+class DBIteratorTestForPinnedData : public DBIteratorTest {
+ public:
+ enum TestConfig {
+ NORMAL,
+ CLOSE_AND_OPEN,
+ COMPACT_BEFORE_READ,
+ FLUSH_EVERY_1000,
+ MAX
+ };
+ DBIteratorTestForPinnedData() : DBIteratorTest() {}
+ void PinnedDataIteratorRandomized(TestConfig run_config) {
+ // Generate Random data
+ Random rnd(301);
+
+ int puts = 100000;
+ int key_pool = static_cast<int>(puts * 0.7);
+ int key_size = 100;
+ int val_size = 1000;
+ int seeks_percentage = 20; // 20% of keys will be used to test seek()
+ int delete_percentage = 20; // 20% of keys will be deleted
+ int merge_percentage = 20; // 20% of keys will be added using Merge()
+
+ Options options = CurrentOptions();
+ BlockBasedTableOptions table_options;
+ table_options.use_delta_encoding = false;
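+ // Delta encoding is disabled so that block iterators can return key slices
+ // that point directly into the block, which is what allows
+ // "rocksdb.iterator.is-key-pinned" to report 1 below.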
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ options.merge_operator = MergeOperators::CreatePutOperator();
+ DestroyAndReopen(options);
+
+ std::vector<std::string> generated_keys(key_pool);
+ for (int i = 0; i < key_pool; i++) {
+ generated_keys[i] = rnd.RandomString(key_size);
+ }
+
+ std::map<std::string, std::string> true_data;
+ std::vector<std::string> random_keys;
+ std::vector<std::string> deleted_keys;
+ for (int i = 0; i < puts; i++) {
+ auto& k = generated_keys[rnd.Next() % key_pool];
+ auto v = rnd.RandomString(val_size);
+
+ // Insert data to true_data map and to DB
+ true_data[k] = v;
+ if (rnd.PercentTrue(merge_percentage)) {
+ ASSERT_OK(db_->Merge(WriteOptions(), k, v));
+ } else {
+ ASSERT_OK(Put(k, v));
+ }
+
+ // Pick random keys to be used to test Seek()
+ if (rnd.PercentTrue(seeks_percentage)) {
+ random_keys.push_back(k);
+ }
+
+ // Delete some random keys
+ if (rnd.PercentTrue(delete_percentage)) {
+ deleted_keys.push_back(k);
+ true_data.erase(k);
+ ASSERT_OK(Delete(k));
+ }
+
+ if (run_config == TestConfig::FLUSH_EVERY_1000) {
+ if (i && i % 1000 == 0) {
+ ASSERT_OK(Flush());
+ }
+ }
+ }
+
+ if (run_config == TestConfig::CLOSE_AND_OPEN) {
+ Close();
+ Reopen(options);
+ } else if (run_config == TestConfig::COMPACT_BEFORE_READ) {
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ }
+
+ ReadOptions ro;
+ ro.pin_data = true;
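+ // pin_data asks the iterator to keep the data its returned Slices point
+ // into alive for the iterator's lifetime; the is-key-pinned property below
+ // reports whether the current key is indeed pinned rather than copied.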
+ auto iter = NewIterator(ro);
+
+ {
+ // Test Seek to random keys
+ std::vector<Slice> keys_slices;
+ std::vector<std::string> true_keys;
+ for (auto& k : random_keys) {
+ iter->Seek(k);
+ if (!iter->Valid()) {
+ ASSERT_EQ(true_data.lower_bound(k), true_data.end());
+ continue;
+ }
+ std::string prop_value;
+ ASSERT_OK(
+ iter->GetProperty("rocksdb.iterator.is-key-pinned", &prop_value));
+ ASSERT_EQ("1", prop_value);
+ keys_slices.push_back(iter->key());
+ true_keys.push_back(true_data.lower_bound(k)->first);
+ }
+
+ for (size_t i = 0; i < keys_slices.size(); i++) {
+ ASSERT_EQ(keys_slices[i].ToString(), true_keys[i]);
+ }
+ }
+
+ {
+ // Test SeekForPrev to random keys
+ std::vector<Slice> keys_slices;
+ std::vector<std::string> true_keys;
+ for (auto& k : random_keys) {
+ iter->SeekForPrev(k);
+ if (!iter->Valid()) {
+ ASSERT_EQ(true_data.upper_bound(k), true_data.begin());
+ continue;
+ }
+ std::string prop_value;
+ ASSERT_OK(
+ iter->GetProperty("rocksdb.iterator.is-key-pinned", &prop_value));
+ ASSERT_EQ("1", prop_value);
+ keys_slices.push_back(iter->key());
+ true_keys.push_back((--true_data.upper_bound(k))->first);
+ }
+
+ for (size_t i = 0; i < keys_slices.size(); i++) {
+ ASSERT_EQ(keys_slices[i].ToString(), true_keys[i]);
+ }
+ }
+
+ {
+ // Test iterating all data forward
+ std::vector<Slice> all_keys;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ std::string prop_value;
+ ASSERT_OK(
+ iter->GetProperty("rocksdb.iterator.is-key-pinned", &prop_value));
+ ASSERT_EQ("1", prop_value);
+ all_keys.push_back(iter->key());
+ }
+ ASSERT_EQ(all_keys.size(), true_data.size());
+
+ // Verify that all key slices are valid
+ auto data_iter = true_data.begin();
+ for (size_t i = 0; i < all_keys.size(); i++) {
+ ASSERT_EQ(all_keys[i].ToString(), data_iter->first);
+ data_iter++;
+ }
+ }
+
+ {
+ // Test iterating all data backward
+ std::vector<Slice> all_keys;
+ for (iter->SeekToLast(); iter->Valid(); iter->Prev()) {
+ std::string prop_value;
+ ASSERT_OK(
+ iter->GetProperty("rocksdb.iterator.is-key-pinned", &prop_value));
+ ASSERT_EQ("1", prop_value);
+ all_keys.push_back(iter->key());
+ }
+ ASSERT_EQ(all_keys.size(), true_data.size());
+
+ // Verify that all key slices are valid (backward)
+ auto data_iter = true_data.rbegin();
+ for (size_t i = 0; i < all_keys.size(); i++) {
+ ASSERT_EQ(all_keys[i].ToString(), data_iter->first);
+ data_iter++;
+ }
+ }
+
+ delete iter;
+ }
+};
+
+#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+TEST_P(DBIteratorTestForPinnedData, PinnedDataIteratorRandomizedNormal) {
+ PinnedDataIteratorRandomized(TestConfig::NORMAL);
+}
+#endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+
+TEST_P(DBIteratorTestForPinnedData, PinnedDataIteratorRandomizedCLoseAndOpen) {
+ PinnedDataIteratorRandomized(TestConfig::CLOSE_AND_OPEN);
+}
+
+TEST_P(DBIteratorTestForPinnedData,
+ PinnedDataIteratorRandomizedCompactBeforeRead) {
+ PinnedDataIteratorRandomized(TestConfig::COMPACT_BEFORE_READ);
+}
+
+TEST_P(DBIteratorTestForPinnedData, PinnedDataIteratorRandomizedFlush) {
+ PinnedDataIteratorRandomized(TestConfig::FLUSH_EVERY_1000);
+}
+
+INSTANTIATE_TEST_CASE_P(DBIteratorTestForPinnedDataInstance,
+ DBIteratorTestForPinnedData,
+ testing::Values(true, false));
+
+#ifndef ROCKSDB_LITE
+TEST_P(DBIteratorTest, PinnedDataIteratorMultipleFiles) {
+ Options options = CurrentOptions();
+ BlockBasedTableOptions table_options;
+ table_options.use_delta_encoding = false;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ options.disable_auto_compactions = true;
+ options.write_buffer_size = 1024 * 1024 * 10; // 10 Mb
+ DestroyAndReopen(options);
+
+ std::map<std::string, std::string> true_data;
+
+ // Generate 4 sst files and compact them into L1
+ Random rnd(301);
+ for (int i = 1; i <= 1000; i++) {
+ std::string k = Key(i * 3);
+ std::string v = rnd.RandomString(100);
+ ASSERT_OK(Put(k, v));
+ true_data[k] = v;
+ if (i % 250 == 0) {
+ ASSERT_OK(Flush());
+ }
+ }
+ ASSERT_EQ(FilesPerLevel(0), "4");
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ(FilesPerLevel(0), "0,4");
+
+ // Generate 4 sst files in L0
+ for (int i = 1; i <= 1000; i++) {
+ std::string k = Key(i * 2);
+ std::string v = rnd.RandomString(100);
+ ASSERT_OK(Put(k, v));
+ true_data[k] = v;
+ if (i % 250 == 0) {
+ ASSERT_OK(Flush());
+ }
+ }
+ ASSERT_EQ(FilesPerLevel(0), "4,4");
+
+ // Add some keys/values in memtables
+ for (int i = 1; i <= 1000; i++) {
+ std::string k = Key(i);
+ std::string v = rnd.RandomString(100);
+ ASSERT_OK(Put(k, v));
+ true_data[k] = v;
+ }
+ ASSERT_EQ(FilesPerLevel(0), "4,4");
+
+ ReadOptions ro;
+ ro.pin_data = true;
+ auto iter = NewIterator(ro);
+
+ std::vector<std::pair<Slice, std::string>> results;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ std::string prop_value;
+ ASSERT_OK(iter->GetProperty("rocksdb.iterator.is-key-pinned", &prop_value));
+ ASSERT_EQ("1", prop_value);
+ results.emplace_back(iter->key(), iter->value().ToString());
+ }
+
+ ASSERT_EQ(results.size(), true_data.size());
+ auto data_iter = true_data.begin();
+ for (size_t i = 0; i < results.size(); i++, data_iter++) {
+ auto& kv = results[i];
+ ASSERT_EQ(kv.first, data_iter->first);
+ ASSERT_EQ(kv.second, data_iter->second);
+ }
+
+ delete iter;
+}
+#endif  // !ROCKSDB_LITE
+
+TEST_P(DBIteratorTest, PinnedDataIteratorMergeOperator) {
+ Options options = CurrentOptions();
+ BlockBasedTableOptions table_options;
+ table_options.use_delta_encoding = false;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ options.merge_operator = MergeOperators::CreateUInt64AddOperator();
+ DestroyAndReopen(options);
+
+ std::string numbers[7];
+ for (int val = 0; val <= 6; val++) {
+ PutFixed64(numbers + val, val);
+ }
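+ // numbers[v] now holds the fixed 8-byte encoding of v, the operand format
+ // that the UInt64Add merge operator sums.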
+
+ // +1 all keys in range [0, 999]
+ for (int i = 0; i < 1000; i++) {
+ WriteOptions wo;
+ ASSERT_OK(db_->Merge(wo, Key(i), numbers[1]));
+ }
+
+ // +2 all keys divisible by 2 in range [0, 999]
+ for (int i = 0; i < 1000; i += 2) {
+ WriteOptions wo;
+ ASSERT_OK(db_->Merge(wo, Key(i), numbers[2]));
+ }
+
+ // +3 all keys divisible by 5 in range [0, 999]
+ for (int i = 0; i < 1000; i += 5) {
+ WriteOptions wo;
+ ASSERT_OK(db_->Merge(wo, Key(i), numbers[3]));
+ }
+
+ ReadOptions ro;
+ ro.pin_data = true;
+ auto iter = NewIterator(ro);
+
+ std::vector<std::pair<Slice, std::string>> results;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ std::string prop_value;
+ ASSERT_OK(iter->GetProperty("rocksdb.iterator.is-key-pinned", &prop_value));
+ ASSERT_EQ("1", prop_value);
+ results.emplace_back(iter->key(), iter->value().ToString());
+ }
+
+ ASSERT_EQ(results.size(), 1000);
+ for (size_t i = 0; i < results.size(); i++) {
+ auto& kv = results[i];
+ ASSERT_EQ(kv.first, Key(static_cast<int>(i)));
+ int expected_val = 1;
+ if (i % 2 == 0) {
+ expected_val += 2;
+ }
+ if (i % 5 == 0) {
+ expected_val += 3;
+ }
+ ASSERT_EQ(kv.second, numbers[expected_val]);
+ }
+
+ delete iter;
+}
+
+TEST_P(DBIteratorTest, PinnedDataIteratorReadAfterUpdate) {
+ Options options = CurrentOptions();
+ BlockBasedTableOptions table_options;
+ table_options.use_delta_encoding = false;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ options.write_buffer_size = 100000;
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+
+ std::map<std::string, std::string> true_data;
+ for (int i = 0; i < 1000; i++) {
+ std::string k = rnd.RandomString(10);
+ std::string v = rnd.RandomString(1000);
+ ASSERT_OK(Put(k, v));
+ true_data[k] = v;
+ }
+
+ ReadOptions ro;
+ ro.pin_data = true;
+ auto iter = NewIterator(ro);
+
+ // Delete 50% of the keys and update the other 50%
+ for (auto& kv : true_data) {
+ if (rnd.OneIn(2)) {
+ ASSERT_OK(Delete(kv.first));
+ } else {
+ std::string new_val = rnd.RandomString(1000);
+ ASSERT_OK(Put(kv.first, new_val));
+ }
+ }
+
+ std::vector<std::pair<Slice, std::string>> results;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ std::string prop_value;
+ ASSERT_OK(iter->GetProperty("rocksdb.iterator.is-key-pinned", &prop_value));
+ ASSERT_EQ("1", prop_value);
+ results.emplace_back(iter->key(), iter->value().ToString());
+ }
+
+ auto data_iter = true_data.begin();
+ for (size_t i = 0; i < results.size(); i++, data_iter++) {
+ auto& kv = results[i];
+ ASSERT_EQ(kv.first, data_iter->first);
+ ASSERT_EQ(kv.second, data_iter->second);
+ }
+
+ delete iter;
+}
+
+class SliceTransformLimitedDomainGeneric : public SliceTransform {
+ const char* Name() const override {
+ return "SliceTransformLimitedDomainGeneric";
+ }
+
+ Slice Transform(const Slice& src) const override {
+ return Slice(src.data(), 1);
+ }
+
+ bool InDomain(const Slice& src) const override {
+ // Any non-empty key is in the domain
+ return src.size() >= 1;
+ }
+
+ bool InRange(const Slice& dst) const override {
+ // A prefix produced by this transform is exactly one byte
+ return dst.size() == 1;
+ }
+};
+
+TEST_P(DBIteratorTest, IterSeekForPrevCrossingFiles) {
+ Options options = CurrentOptions();
+ options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+ options.disable_auto_compactions = true;
+ // Enable prefix bloom for SST files
+ BlockBasedTableOptions table_options;
+ table_options.filter_policy.reset(NewBloomFilterPolicy(10, true));
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put("a1", "va1"));
+ ASSERT_OK(Put("a2", "va2"));
+ ASSERT_OK(Put("a3", "va3"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put("b1", "vb1"));
+ ASSERT_OK(Put("b2", "vb2"));
+ ASSERT_OK(Put("b3", "vb3"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put("b4", "vb4"));
+ ASSERT_OK(Put("d1", "vd1"));
+ ASSERT_OK(Put("d2", "vd2"));
+ ASSERT_OK(Put("d4", "vd4"));
+ ASSERT_OK(Flush());
+
+ MoveFilesToLevel(1);
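+ // Each Flush() above produced one file; moving them to L1 makes
+ // SeekForPrev cross several non-overlapping files with prefix bloom
+ // filters.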
+ {
+ ReadOptions ro;
+ Iterator* iter = NewIterator(ro);
+
+ iter->SeekForPrev("a4");
+ ASSERT_EQ(iter->key().ToString(), "a3");
+ ASSERT_EQ(iter->value().ToString(), "va3");
+
+ iter->SeekForPrev("c2");
+ ASSERT_EQ(iter->key().ToString(), "b3");
+ iter->SeekForPrev("d3");
+ ASSERT_EQ(iter->key().ToString(), "d2");
+ iter->SeekForPrev("b5");
+ ASSERT_EQ(iter->key().ToString(), "b4");
+ delete iter;
+ }
+
+ {
+ ReadOptions ro;
+ ro.prefix_same_as_start = true;
+ Iterator* iter = NewIterator(ro);
+ iter->SeekForPrev("c2");
+ ASSERT_TRUE(!iter->Valid());
+ ASSERT_OK(iter->status());
+ delete iter;
+ }
+}
+
+TEST_P(DBIteratorTest, IterSeekForPrevCrossingFilesCustomPrefixExtractor) {
+ Options options = CurrentOptions();
+ options.prefix_extractor =
+ std::make_shared<SliceTransformLimitedDomainGeneric>();
+ options.disable_auto_compactions = true;
+ // Enable prefix bloom for SST files
+ BlockBasedTableOptions table_options;
+ table_options.filter_policy.reset(NewBloomFilterPolicy(10, true));
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put("a1", "va1"));
+ ASSERT_OK(Put("a2", "va2"));
+ ASSERT_OK(Put("a3", "va3"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put("b1", "vb1"));
+ ASSERT_OK(Put("b2", "vb2"));
+ ASSERT_OK(Put("b3", "vb3"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put("b4", "vb4"));
+ ASSERT_OK(Put("d1", "vd1"));
+ ASSERT_OK(Put("d2", "vd2"));
+ ASSERT_OK(Put("d4", "vd4"));
+ ASSERT_OK(Flush());
+
+ MoveFilesToLevel(1);
+ {
+ ReadOptions ro;
+ Iterator* iter = NewIterator(ro);
+
+ iter->SeekForPrev("a4");
+ ASSERT_EQ(iter->key().ToString(), "a3");
+ ASSERT_EQ(iter->value().ToString(), "va3");
+
+ iter->SeekForPrev("c2");
+ ASSERT_EQ(iter->key().ToString(), "b3");
+ iter->SeekForPrev("d3");
+ ASSERT_EQ(iter->key().ToString(), "d2");
+ iter->SeekForPrev("b5");
+ ASSERT_EQ(iter->key().ToString(), "b4");
+ delete iter;
+ }
+
+ {
+ ReadOptions ro;
+ ro.prefix_same_as_start = true;
+ Iterator* iter = NewIterator(ro);
+ iter->SeekForPrev("c2");
+ ASSERT_TRUE(!iter->Valid());
+ ASSERT_OK(iter->status());
+ delete iter;
+ }
+}
+
+TEST_P(DBIteratorTest, IterPrevKeyCrossingBlocks) {
+ Options options = CurrentOptions();
+ BlockBasedTableOptions table_options;
+ table_options.block_size = 1; // every block will contain one entry
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ options.merge_operator = MergeOperators::CreateStringAppendTESTOperator();
+ options.disable_auto_compactions = true;
+ options.max_sequential_skip_in_iterations = 8;
+
+ DestroyAndReopen(options);
+
+ // Putting such deletes will force DBIter::Prev() to fall back to a Seek
+ for (int file_num = 0; file_num < 10; file_num++) {
+ ASSERT_OK(Delete("key4"));
+ ASSERT_OK(Flush());
+ }
+
+ // First file contains 5 blocks of puts
+ ASSERT_OK(Put("key1", "val1.0"));
+ ASSERT_OK(Put("key2", "val2.0"));
+ ASSERT_OK(Put("key3", "val3.0"));
+ ASSERT_OK(Put("key4", "val4.0"));
+ ASSERT_OK(Put("key5", "val5.0"));
+ ASSERT_OK(Flush());
+
+ // Second file contains 9 blocks of merge operands
+ ASSERT_OK(db_->Merge(WriteOptions(), "key1", "val1.1"));
+ ASSERT_OK(db_->Merge(WriteOptions(), "key1", "val1.2"));
+
+ ASSERT_OK(db_->Merge(WriteOptions(), "key2", "val2.1"));
+ ASSERT_OK(db_->Merge(WriteOptions(), "key2", "val2.2"));
+ ASSERT_OK(db_->Merge(WriteOptions(), "key2", "val2.3"));
+
+ ASSERT_OK(db_->Merge(WriteOptions(), "key3", "val3.1"));
+ ASSERT_OK(db_->Merge(WriteOptions(), "key3", "val3.2"));
+ ASSERT_OK(db_->Merge(WriteOptions(), "key3", "val3.3"));
+ ASSERT_OK(db_->Merge(WriteOptions(), "key3", "val3.4"));
+ ASSERT_OK(Flush());
+
+ {
+ ReadOptions ro;
+ ro.fill_cache = false;
+ Iterator* iter = NewIterator(ro);
+
+ iter->SeekToLast();
+ ASSERT_EQ(iter->key().ToString(), "key5");
+ ASSERT_EQ(iter->value().ToString(), "val5.0");
+
+ iter->Prev();
+ ASSERT_EQ(iter->key().ToString(), "key4");
+ ASSERT_EQ(iter->value().ToString(), "val4.0");
+
+ iter->Prev();
+ ASSERT_EQ(iter->key().ToString(), "key3");
+ ASSERT_EQ(iter->value().ToString(), "val3.0,val3.1,val3.2,val3.3,val3.4");
+
+ iter->Prev();
+ ASSERT_EQ(iter->key().ToString(), "key2");
+ ASSERT_EQ(iter->value().ToString(), "val2.0,val2.1,val2.2,val2.3");
+
+ iter->Prev();
+ ASSERT_EQ(iter->key().ToString(), "key1");
+ ASSERT_EQ(iter->value().ToString(), "val1.0,val1.1,val1.2");
+
+ delete iter;
+ }
+}
+
+TEST_P(DBIteratorTest, IterPrevKeyCrossingBlocksRandomized) {
+ Options options = CurrentOptions();
+ options.merge_operator = MergeOperators::CreateStringAppendTESTOperator();
+ options.disable_auto_compactions = true;
+ options.level0_slowdown_writes_trigger = (1 << 30);
+ options.level0_stop_writes_trigger = (1 << 30);
+ options.max_sequential_skip_in_iterations = 8;
+ DestroyAndReopen(options);
+
+ const int kNumKeys = 500;
+ // Small number of merge operands to make sure that DBIter::Prev() doesn't
+ // fall back to Seek()
+ const int kNumMergeOperands = 3;
+ // Use a value size that ensures every block contains one key
+ const int kValSize =
+ static_cast<int>(BlockBasedTableOptions().block_size) * 4;
+ // Percentage of keys that won't get merge operations
+ const int kNoMergeOpPercentage = 20;
+ // Percentage of keys that will be deleted
+ const int kDeletePercentage = 10;
+
+ // For half of the key range we will write multiple deletes first to
+ // force DBIter::Prev() to fall back to Seek()
+ for (int file_num = 0; file_num < 10; file_num++) {
+ for (int i = 0; i < kNumKeys; i += 2) {
+ ASSERT_OK(Delete(Key(i)));
+ }
+ ASSERT_OK(Flush());
+ }
+
+ Random rnd(301);
+ std::map<std::string, std::string> true_data;
+ std::string gen_key;
+ std::string gen_val;
+
+ for (int i = 0; i < kNumKeys; i++) {
+ gen_key = Key(i);
+ gen_val = rnd.RandomString(kValSize);
+
+ ASSERT_OK(Put(gen_key, gen_val));
+ true_data[gen_key] = gen_val;
+ }
+ ASSERT_OK(Flush());
+
+ // Keep values and merge operands in separate files so that they are not
+ // merged during flush but are instead merged in the read path
+ for (int i = 0; i < kNumKeys; i++) {
+ if (rnd.PercentTrue(kNoMergeOpPercentage)) {
+ // Don't add merge operands for some keys
+ continue;
+ }
+
+ for (int j = 0; j < kNumMergeOperands; j++) {
+ gen_key = Key(i);
+ gen_val = rnd.RandomString(kValSize);
+
+ ASSERT_OK(db_->Merge(WriteOptions(), gen_key, gen_val));
+ true_data[gen_key] += "," + gen_val;
+ }
+ }
+ ASSERT_OK(Flush());
+
+ for (int i = 0; i < kNumKeys; i++) {
+ if (rnd.PercentTrue(kDeletePercentage)) {
+ gen_key = Key(i);
+
+ ASSERT_OK(Delete(gen_key));
+ true_data.erase(gen_key);
+ }
+ }
+ ASSERT_OK(Flush());
+
+ {
+ ReadOptions ro;
+ ro.fill_cache = false;
+ Iterator* iter = NewIterator(ro);
+ auto data_iter = true_data.rbegin();
+
+ for (iter->SeekToLast(); iter->Valid(); iter->Prev()) {
+ ASSERT_EQ(iter->key().ToString(), data_iter->first);
+ ASSERT_EQ(iter->value().ToString(), data_iter->second);
+ data_iter++;
+ }
+ ASSERT_EQ(data_iter, true_data.rend());
+
+ delete iter;
+ }
+
+ {
+ ReadOptions ro;
+ ro.fill_cache = false;
+ Iterator* iter = NewIterator(ro);
+ auto data_iter = true_data.rbegin();
+
+ int entries_right = 0;
+ std::string seek_key;
+ for (iter->SeekToLast(); iter->Valid(); iter->Prev()) {
+ // Verify key/value of current position
+ ASSERT_EQ(iter->key().ToString(), data_iter->first);
+ ASSERT_EQ(iter->value().ToString(), data_iter->second);
+
+ bool restore_position_with_seek = rnd.Uniform(2);
+ if (restore_position_with_seek) {
+ seek_key = iter->key().ToString();
+ }
+
+ // Do some Next() operations, then restore the iterator to its original position
+ int next_count =
+ entries_right > 0 ? rnd.Uniform(std::min(entries_right, 10)) : 0;
+ for (int i = 0; i < next_count; i++) {
+ iter->Next();
+ data_iter--;
+
+ ASSERT_EQ(iter->key().ToString(), data_iter->first);
+ ASSERT_EQ(iter->value().ToString(), data_iter->second);
+ }
+
+ if (restore_position_with_seek) {
+ // Restore original position using Seek()
+ iter->Seek(seek_key);
+ for (int i = 0; i < next_count; i++) {
+ data_iter++;
+ }
+
+ ASSERT_EQ(iter->key().ToString(), data_iter->first);
+ ASSERT_EQ(iter->value().ToString(), data_iter->second);
+ } else {
+ // Restore original position using Prev()
+ for (int i = 0; i < next_count; i++) {
+ iter->Prev();
+ data_iter++;
+
+ ASSERT_EQ(iter->key().ToString(), data_iter->first);
+ ASSERT_EQ(iter->value().ToString(), data_iter->second);
+ }
+ }
+
+ entries_right++;
+ data_iter++;
+ }
+ ASSERT_EQ(data_iter, true_data.rend());
+
+ delete iter;
+ }
+}
+
+TEST_P(DBIteratorTest, IteratorWithLocalStatistics) {
+ Options options = CurrentOptions();
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ for (int i = 0; i < 1000; i++) {
+ // Key 10 bytes / Value 10 bytes
+ ASSERT_OK(Put(rnd.RandomString(10), rnd.RandomString(10)));
+ }
+
+ std::atomic<uint64_t> total_next(0);
+ std::atomic<uint64_t> total_next_found(0);
+ std::atomic<uint64_t> total_prev(0);
+ std::atomic<uint64_t> total_prev_found(0);
+ std::atomic<uint64_t> total_bytes(0);
+
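+ // Each reader thread tallies its own counts via the perf context; at the
+ // end they must add up to the DB-wide tickers, verifying that
+ // iterator-local statistics are folded into the global statistics.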
+ std::vector<port::Thread> threads;
+ std::function<void()> reader_func_next = [&]() {
+ SetPerfLevel(kEnableCount);
+ get_perf_context()->Reset();
+ Iterator* iter = NewIterator(ReadOptions());
+
+ iter->SeekToFirst();
+ // Seek will bump ITER_BYTES_READ
+ uint64_t bytes = 0;
+ bytes += iter->key().size();
+ bytes += iter->value().size();
+ while (true) {
+ iter->Next();
+ total_next++;
+
+ if (!iter->Valid()) {
+ break;
+ }
+ total_next_found++;
+ bytes += iter->key().size();
+ bytes += iter->value().size();
+ }
+
+ delete iter;
+ ASSERT_EQ(bytes, get_perf_context()->iter_read_bytes);
+ SetPerfLevel(kDisable);
+ total_bytes += bytes;
+ };
+
+ std::function<void()> reader_func_prev = [&]() {
+ SetPerfLevel(kEnableCount);
+ Iterator* iter = NewIterator(ReadOptions());
+
+ iter->SeekToLast();
+ // Seek will bump ITER_BYTES_READ
+ uint64_t bytes = 0;
+ bytes += iter->key().size();
+ bytes += iter->value().size();
+ while (true) {
+ iter->Prev();
+ total_prev++;
+
+ if (!iter->Valid()) {
+ break;
+ }
+ total_prev_found++;
+ bytes += iter->key().size();
+ bytes += iter->value().size();
+ }
+
+ delete iter;
+ ASSERT_EQ(bytes, get_perf_context()->iter_read_bytes);
+ SetPerfLevel(kDisable);
+ total_bytes += bytes;
+ };
+
+ for (int i = 0; i < 10; i++) {
+ threads.emplace_back(reader_func_next);
+ }
+ for (int i = 0; i < 15; i++) {
+ threads.emplace_back(reader_func_prev);
+ }
+
+ for (auto& t : threads) {
+ t.join();
+ }
+
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_DB_NEXT), (uint64_t)total_next);
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_DB_NEXT_FOUND),
+ (uint64_t)total_next_found);
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_DB_PREV), (uint64_t)total_prev);
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_DB_PREV_FOUND),
+ (uint64_t)total_prev_found);
+ ASSERT_EQ(TestGetTickerCount(options, ITER_BYTES_READ),
+ (uint64_t)total_bytes);
+}
+
+TEST_P(DBIteratorTest, ReadAhead) {
+ Options options;
+ env_->count_random_reads_ = true;
+ options.env = env_;
+ options.disable_auto_compactions = true;
+ options.write_buffer_size = 4 << 20;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ BlockBasedTableOptions table_options;
+ table_options.block_size = 1024;
+ table_options.no_block_cache = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ Reopen(options);
+
+ std::string value(1024, 'a');
+ for (int i = 0; i < 100; i++) {
+ ASSERT_OK(Put(Key(i), value));
+ }
+ ASSERT_OK(Flush());
+ MoveFilesToLevel(2);
+
+ for (int i = 0; i < 100; i++) {
+ ASSERT_OK(Put(Key(i), value));
+ }
+ ASSERT_OK(Flush());
+ MoveFilesToLevel(1);
+
+ for (int i = 0; i < 100; i++) {
+ ASSERT_OK(Put(Key(i), value));
+ }
+ ASSERT_OK(Flush());
+#ifndef ROCKSDB_LITE
+ ASSERT_EQ("1,1,1", FilesPerLevel());
+#endif // !ROCKSDB_LITE
+
+ env_->random_read_bytes_counter_ = 0;
+ options.statistics->setTickerCount(NO_FILE_OPENS, 0);
+ ReadOptions read_options;
+ auto* iter = NewIterator(read_options);
+ iter->SeekToFirst();
+ int64_t num_file_opens = TestGetTickerCount(options, NO_FILE_OPENS);
+ size_t bytes_read = env_->random_read_bytes_counter_;
+ delete iter;
+
+ int64_t num_file_closes = TestGetTickerCount(options, NO_FILE_CLOSES);
+ env_->random_read_bytes_counter_ = 0;
+ options.statistics->setTickerCount(NO_FILE_OPENS, 0);
+ read_options.readahead_size = 1024 * 10;
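+ // With readahead enabled, the iterator is expected to open the same number
+ // of files but read noticeably more bytes from them, as checked below.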
+ iter = NewIterator(read_options);
+ iter->SeekToFirst();
+ int64_t num_file_opens_readahead = TestGetTickerCount(options, NO_FILE_OPENS);
+ size_t bytes_read_readahead = env_->random_read_bytes_counter_;
+ delete iter;
+ int64_t num_file_closes_readahead =
+ TestGetTickerCount(options, NO_FILE_CLOSES);
+ ASSERT_EQ(num_file_opens, num_file_opens_readahead);
+ ASSERT_EQ(num_file_closes, num_file_closes_readahead);
+ ASSERT_GT(bytes_read_readahead, bytes_read);
+ ASSERT_GT(bytes_read_readahead, read_options.readahead_size * 3);
+
+ // Verify correctness.
+ iter = NewIterator(read_options);
+ int count = 0;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ ASSERT_EQ(value, iter->value());
+ count++;
+ }
+ ASSERT_EQ(100, count);
+ for (int i = 0; i < 100; i++) {
+ iter->Seek(Key(i));
+ ASSERT_EQ(value, iter->value());
+ }
+ delete iter;
+}
+
+// Insert a key, create a snapshot iterator, overwrite key lots of times,
+// seek to a smaller key. Expect DBIter to fall back to a seek instead of
+// going through all the overwrites linearly.
+TEST_P(DBIteratorTest, DBIteratorSkipRecentDuplicatesTest) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ options.max_sequential_skip_in_iterations = 3;
+ options.prefix_extractor = nullptr;
+ options.write_buffer_size = 1 << 27; // big enough to avoid flush
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ DestroyAndReopen(options);
+
+ // Insert.
+ ASSERT_OK(Put("b", "0"));
+
+ // Create iterator.
+ ReadOptions ro;
+ std::unique_ptr<Iterator> iter(NewIterator(ro));
+
+ // Insert a lot.
+ for (int i = 0; i < 100; ++i) {
+ ASSERT_OK(Put("b", std::to_string(i + 1).c_str()));
+ }
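+ // All of these overwrites are newer than the iterator's implicit snapshot,
+ // so they are invisible to it and must be skipped; with a skip limit of 3
+ // this should happen via a single reseek rather than a linear scan.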
+
+#ifndef ROCKSDB_LITE
+ // Check that memtable wasn't flushed.
+ std::string val;
+ ASSERT_TRUE(db_->GetProperty("rocksdb.num-files-at-level0", &val));
+ EXPECT_EQ("0", val);
+#endif  // !ROCKSDB_LITE
+
+ // Seek iterator to a smaller key.
+ get_perf_context()->Reset();
+ iter->Seek("a");
+ ASSERT_TRUE(iter->Valid());
+ EXPECT_EQ("b", iter->key().ToString());
+ EXPECT_EQ("0", iter->value().ToString());
+
+ // Check that the seek didn't do too much work.
+ // Checks are not tight, just make sure that everything is well below 100.
+ EXPECT_LT(get_perf_context()->internal_key_skipped_count, 4);
+ EXPECT_LT(get_perf_context()->internal_recent_skipped_count, 8);
+ EXPECT_LT(get_perf_context()->seek_on_memtable_count, 10);
+ EXPECT_LT(get_perf_context()->next_on_memtable_count, 10);
+ EXPECT_LT(get_perf_context()->prev_on_memtable_count, 10);
+
+ // Check that iterator did something like what we expect.
+ EXPECT_EQ(get_perf_context()->internal_delete_skipped_count, 0);
+ EXPECT_EQ(get_perf_context()->internal_merge_count, 0);
+ EXPECT_GE(get_perf_context()->internal_recent_skipped_count, 2);
+ EXPECT_GE(get_perf_context()->seek_on_memtable_count, 2);
+ EXPECT_EQ(1,
+ options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION));
+}
+
+TEST_P(DBIteratorTest, Refresh) {
+ ASSERT_OK(Put("x", "y"));
+
+ std::unique_ptr<Iterator> iter(NewIterator(ReadOptions()));
+ ASSERT_OK(iter->status());
+ iter->Seek(Slice("a"));
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("x")), 0);
+ iter->Next();
+ ASSERT_FALSE(iter->Valid());
+
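+ // A write made after the iterator was created is not visible until
+ // Refresh() is called.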
+ ASSERT_OK(Put("c", "d"));
+
+ iter->Seek(Slice("a"));
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("x")), 0);
+ iter->Next();
+ ASSERT_FALSE(iter->Valid());
+
+ ASSERT_OK(iter->status());
+ ASSERT_OK(iter->Refresh());
+
+ iter->Seek(Slice("a"));
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("c")), 0);
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("x")), 0);
+ iter->Next();
+ ASSERT_FALSE(iter->Valid());
+
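+ // Flush and write another key; the previously refreshed view stays fixed
+ // until the next Refresh().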
+ EXPECT_OK(dbfull()->Flush(FlushOptions()));
+
+ ASSERT_OK(Put("m", "n"));
+
+ iter->Seek(Slice("a"));
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("c")), 0);
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("x")), 0);
+ iter->Next();
+ ASSERT_FALSE(iter->Valid());
+
+ ASSERT_OK(iter->status());
+ ASSERT_OK(iter->Refresh());
+
+ iter->Seek(Slice("a"));
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("c")), 0);
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("m")), 0);
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("x")), 0);
+ iter->Next();
+ ASSERT_FALSE(iter->Valid());
+
+ iter.reset();
+}
+
+TEST_P(DBIteratorTest, RefreshWithSnapshot) {
+ ASSERT_OK(Put("x", "y"));
+ const Snapshot* snapshot = db_->GetSnapshot();
+ ReadOptions options;
+ options.snapshot = snapshot;
+ Iterator* iter = NewIterator(options);
+ ASSERT_OK(iter->status());
+
+ iter->Seek(Slice("a"));
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("x")), 0);
+ iter->Next();
+ ASSERT_FALSE(iter->Valid());
+
+ ASSERT_OK(Put("c", "d"));
+
+ iter->Seek(Slice("a"));
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("x")), 0);
+ iter->Next();
+ ASSERT_FALSE(iter->Valid());
+
+ ASSERT_OK(iter->status());
+ Status s = iter->Refresh();
+ ASSERT_TRUE(s.IsNotSupported());
+ db_->ReleaseSnapshot(snapshot);
+ delete iter;
+}
+
+TEST_P(DBIteratorTest, CreationFailure) {
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::NewInternalIterator:StatusCallback", [](void* arg) {
+ *(reinterpret_cast<Status*>(arg)) = Status::Corruption("test status");
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ Iterator* iter = NewIterator(ReadOptions());
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_TRUE(iter->status().IsCorruption());
+ delete iter;
+}
+
+TEST_P(DBIteratorTest, UpperBoundWithChangeDirection) {
+ Options options = CurrentOptions();
+ options.max_sequential_skip_in_iterations = 3;
+ DestroyAndReopen(options);
+
+ // write a bunch of kvs to the database.
+ ASSERT_OK(Put("a", "1"));
+ ASSERT_OK(Put("y", "1"));
+ ASSERT_OK(Put("y1", "1"));
+ ASSERT_OK(Put("y2", "1"));
+ ASSERT_OK(Put("y3", "1"));
+ ASSERT_OK(Put("z", "1"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("a", "1"));
+ ASSERT_OK(Put("z", "1"));
+ ASSERT_OK(Put("bar", "1"));
+ ASSERT_OK(Put("foo", "1"));
+
+ std::string upper_bound = "x";
+ Slice ub_slice(upper_bound);
+ ReadOptions ro;
+ ro.iterate_upper_bound = &ub_slice;
+ ro.max_skippable_internal_keys = 1000;
+
+ Iterator* iter = NewIterator(ro);
+ iter->Seek("foo");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("foo", iter->key().ToString());
+
+ iter->Prev();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ("bar", iter->key().ToString());
+
+ delete iter;
+}
+
+TEST_P(DBIteratorTest, TableFilter) {
+ ASSERT_OK(Put("a", "1"));
+ EXPECT_OK(dbfull()->Flush(FlushOptions()));
+ ASSERT_OK(Put("b", "2"));
+ ASSERT_OK(Put("c", "3"));
+ EXPECT_OK(dbfull()->Flush(FlushOptions()));
+ ASSERT_OK(Put("d", "4"));
+ ASSERT_OK(Put("e", "5"));
+ ASSERT_OK(Put("f", "6"));
+ EXPECT_OK(dbfull()->Flush(FlushOptions()));
+
+ // Ensure the table_filter callback is called once for each table.
+ {
+ std::set<uint64_t> unseen{1, 2, 3};
+ ReadOptions opts;
+ opts.table_filter = [&](const TableProperties& props) {
+ auto it = unseen.find(props.num_entries);
+ if (it == unseen.end()) {
+ ADD_FAILURE() << "saw table properties with an unexpected "
+ << props.num_entries << " entries";
+ } else {
+ unseen.erase(it);
+ }
+ return true;
+ };
+ auto iter = NewIterator(opts);
+ iter->SeekToFirst();
+ ASSERT_EQ(IterStatus(iter), "a->1");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "b->2");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "c->3");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "d->4");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "e->5");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "f->6");
+ iter->Next();
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_TRUE(unseen.empty());
+ delete iter;
+ }
+
+ // Ensure returning false in the table_filter hides the keys from that table
+ // during iteration.
+ {
+ ReadOptions opts;
+ opts.table_filter = [](const TableProperties& props) {
+ return props.num_entries != 2;
+ };
+ auto iter = NewIterator(opts);
+ iter->SeekToFirst();
+ ASSERT_EQ(IterStatus(iter), "a->1");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "d->4");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "e->5");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "f->6");
+ iter->Next();
+ ASSERT_FALSE(iter->Valid());
+ delete iter;
+ }
+}
+
+TEST_P(DBIteratorTest, UpperBoundWithPrevReseek) {
+ Options options = CurrentOptions();
+ options.max_sequential_skip_in_iterations = 3;
+ DestroyAndReopen(options);
+
+ // write a bunch of kvs to the database.
+ ASSERT_OK(Put("a", "1"));
+ ASSERT_OK(Put("y", "1"));
+ ASSERT_OK(Put("z", "1"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("a", "1"));
+ ASSERT_OK(Put("z", "1"));
+ ASSERT_OK(Put("bar", "1"));
+ ASSERT_OK(Put("foo", "1"));
+ ASSERT_OK(Put("foo", "2"));
+
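+ // Additional versions of "foo"; the one written after the snapshot below is
+ // invisible to the iterator and has to be skipped when iterating backward.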
+ ASSERT_OK(Put("foo", "3"));
+ ASSERT_OK(Put("foo", "4"));
+ ASSERT_OK(Put("foo", "5"));
+ const Snapshot* snapshot = db_->GetSnapshot();
+ ASSERT_OK(Put("foo", "6"));
+
+ std::string upper_bound = "x";
+ Slice ub_slice(upper_bound);
+ ReadOptions ro;
+ ro.snapshot = snapshot;
+ ro.iterate_upper_bound = &ub_slice;
+
+ Iterator* iter = NewIterator(ro);
+ iter->SeekForPrev("goo");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("foo", iter->key().ToString());
+ iter->Prev();
+
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("bar", iter->key().ToString());
+
+ delete iter;
+ db_->ReleaseSnapshot(snapshot);
+}
+
+TEST_P(DBIteratorTest, SkipStatistics) {
+ Options options = CurrentOptions();
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ DestroyAndReopen(options);
+
+ int skip_count = 0;
+
+ // write a bunch of kvs to the database.
+ ASSERT_OK(Put("a", "1"));
+ ASSERT_OK(Put("b", "1"));
+ ASSERT_OK(Put("c", "1"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("d", "1"));
+ ASSERT_OK(Put("e", "1"));
+ ASSERT_OK(Put("f", "1"));
+ ASSERT_OK(Put("a", "2"));
+ ASSERT_OK(Put("b", "2"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Delete("d"));
+ ASSERT_OK(Delete("e"));
+ ASSERT_OK(Delete("f"));
+
+ Iterator* iter = NewIterator(ReadOptions());
+ int count = 0;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ ASSERT_OK(iter->status());
+ count++;
+ }
+ ASSERT_EQ(count, 3);
+ delete iter;
+ skip_count += 8; // 3 deletes + 3 original keys + 2 lower in sequence
+ ASSERT_EQ(skip_count, TestGetTickerCount(options, NUMBER_ITER_SKIP));
+
+ iter = NewIterator(ReadOptions());
+ count = 0;
+ for (iter->SeekToLast(); iter->Valid(); iter->Prev()) {
+ ASSERT_OK(iter->status());
+ count++;
+ }
+ ASSERT_EQ(count, 3);
+ delete iter;
+ skip_count += 8; // Same as above, but in reverse order
+ ASSERT_EQ(skip_count, TestGetTickerCount(options, NUMBER_ITER_SKIP));
+
+ ASSERT_OK(Put("aa", "1"));
+ ASSERT_OK(Put("ab", "1"));
+ ASSERT_OK(Put("ac", "1"));
+ ASSERT_OK(Put("ad", "1"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Delete("ab"));
+ ASSERT_OK(Delete("ac"));
+ ASSERT_OK(Delete("ad"));
+
+ ReadOptions ro;
+ Slice prefix("b");
+ ro.iterate_upper_bound = &prefix;
+
+ iter = NewIterator(ro);
+ count = 0;
+ for (iter->Seek("aa"); iter->Valid(); iter->Next()) {
+ ASSERT_OK(iter->status());
+ count++;
+ }
+ ASSERT_EQ(count, 1);
+ delete iter;
+ skip_count += 6; // 3 deletes + 3 original keys
+ ASSERT_EQ(skip_count, TestGetTickerCount(options, NUMBER_ITER_SKIP));
+
+ iter = NewIterator(ro);
+ count = 0;
+ for (iter->SeekToLast(); iter->Valid(); iter->Prev()) {
+ ASSERT_OK(iter->status());
+ count++;
+ }
+ ASSERT_EQ(count, 2);
+ delete iter;
+ // 3 deletes + 3 original keys + lower sequence of "a"
+ skip_count += 7;
+ ASSERT_EQ(skip_count, TestGetTickerCount(options, NUMBER_ITER_SKIP));
+}
+
+TEST_P(DBIteratorTest, SeekAfterHittingManyInternalKeys) {
+ Options options = CurrentOptions();
+ DestroyAndReopen(options);
+ ReadOptions ropts;
+ ropts.max_skippable_internal_keys = 2;
+
+ ASSERT_OK(Put("1", "val_1"));
+ // Add more tombstones than max_skippable_internal_keys so that Next() fails.
+ ASSERT_OK(Delete("2"));
+ ASSERT_OK(Delete("3"));
+ ASSERT_OK(Delete("4"));
+ ASSERT_OK(Delete("5"));
+ ASSERT_OK(Put("6", "val_6"));
+
+ std::unique_ptr<Iterator> iter(NewIterator(ropts));
+ iter->SeekToFirst();
+
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().ToString(), "1");
+ ASSERT_EQ(iter->value().ToString(), "val_1");
+
+ // This should fail as incomplete due to too many non-visible internal keys on
+ // the way to the next valid user key.
+ iter->Next();
+ ASSERT_TRUE(!iter->Valid());
+ ASSERT_TRUE(iter->status().IsIncomplete());
+
+ // Get the internal key at which Next() failed.
+ std::string prop_value;
+ ASSERT_OK(iter->GetProperty("rocksdb.iterator.internal-key", &prop_value));
+ ASSERT_EQ("4", prop_value);
+
+ // Create a new iterator to seek to the internal key.
+ std::unique_ptr<Iterator> iter2(NewIterator(ropts));
+ iter2->Seek(prop_value);
+ ASSERT_TRUE(iter2->Valid());
+ ASSERT_OK(iter2->status());
+
+ ASSERT_EQ(iter2->key().ToString(), "6");
+ ASSERT_EQ(iter2->value().ToString(), "val_6");
+}
+
+// Reproduces a former bug where the iterator would skip some records when
+// DBIter re-seeks a subiterator with Incomplete status.
+TEST_P(DBIteratorTest, NonBlockingIterationBugRepro) {
+ Options options = CurrentOptions();
+ BlockBasedTableOptions table_options;
+ // Make sure the sst file has more than one block.
+ table_options.flush_block_policy_factory =
+ std::make_shared<FlushBlockEveryKeyPolicyFactory>();
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ DestroyAndReopen(options);
+
+ // Two records in sst file, each in its own block.
+ ASSERT_OK(Put("b", ""));
+ ASSERT_OK(Put("d", ""));
+ ASSERT_OK(Flush());
+
+ // Create a nonblocking iterator before writing to memtable.
+ ReadOptions ropt;
+ ropt.read_tier = kBlockCacheTier;
+ std::unique_ptr<Iterator> iter(NewIterator(ropt));
+
+ // Overwrite a key in memtable many times to hit
+ // max_sequential_skip_in_iterations (which is 8 by default).
+ for (int i = 0; i < 20; ++i) {
+ ASSERT_OK(Put("c", ""));
+ }
+
+ // Load the second block in sst file into the block cache.
+ {
+ std::unique_ptr<Iterator> iter2(NewIterator(ReadOptions()));
+ iter2->Seek("d");
+ }
+
+ // Finally seek the nonblocking iterator.
+ iter->Seek("a");
+ // With the bug, the status used to be OK, and the iterator used to point to
+ // "d".
+ EXPECT_TRUE(iter->status().IsIncomplete());
+}
+
+TEST_P(DBIteratorTest, SeekBackwardAfterOutOfUpperBound) {
+ ASSERT_OK(Put("a", ""));
+ ASSERT_OK(Put("b", ""));
+ ASSERT_OK(Flush());
+
+ ReadOptions ropt;
+ Slice ub = "b";
+ ropt.iterate_upper_bound = &ub;
+
+ std::unique_ptr<Iterator> it(dbfull()->NewIterator(ropt));
+ it->SeekForPrev("a");
+ ASSERT_TRUE(it->Valid());
+ ASSERT_OK(it->status());
+ ASSERT_EQ("a", it->key().ToString());
+ it->Next();
+ ASSERT_FALSE(it->Valid());
+ ASSERT_OK(it->status());
+ it->SeekForPrev("a");
+ ASSERT_OK(it->status());
+
+ ASSERT_TRUE(it->Valid());
+ ASSERT_EQ("a", it->key().ToString());
+}
+
+TEST_P(DBIteratorTest, AvoidReseekLevelIterator) {
+ Options options = CurrentOptions();
+ options.compression = CompressionType::kNoCompression;
+ BlockBasedTableOptions table_options;
+ table_options.block_size = 800;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ Reopen(options);
+
+ Random rnd(301);
+ std::string random_str = rnd.RandomString(180);
+
+ ASSERT_OK(Put("1", random_str));
+ ASSERT_OK(Put("2", random_str));
+ ASSERT_OK(Put("3", random_str));
+ ASSERT_OK(Put("4", random_str));
+ // A new block
+ ASSERT_OK(Put("5", random_str));
+ ASSERT_OK(Put("6", random_str));
+ ASSERT_OK(Put("7", random_str));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("8", random_str));
+ ASSERT_OK(Put("9", random_str));
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+ int num_find_file_in_level = 0;
+ int num_idx_blk_seek = 0;
+ SyncPoint::GetInstance()->SetCallBack(
+ "LevelIterator::Seek:BeforeFindFile",
+ [&](void* /*arg*/) { num_find_file_in_level++; });
+ SyncPoint::GetInstance()->SetCallBack(
+ "IndexBlockIter::Seek:0", [&](void* /*arg*/) { num_idx_blk_seek++; });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ {
+ std::unique_ptr<Iterator> iter(NewIterator(ReadOptions()));
+ iter->Seek("1");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(1, num_find_file_in_level);
+ ASSERT_EQ(1, num_idx_blk_seek);
+
+ iter->Seek("2");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(1, num_find_file_in_level);
+ ASSERT_EQ(1, num_idx_blk_seek);
+
+ iter->Seek("3");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(1, num_find_file_in_level);
+ ASSERT_EQ(1, num_idx_blk_seek);
+
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(1, num_find_file_in_level);
+ ASSERT_EQ(1, num_idx_blk_seek);
+
+ iter->Seek("5");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(1, num_find_file_in_level);
+ ASSERT_EQ(2, num_idx_blk_seek);
+
+ iter->Seek("6");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(1, num_find_file_in_level);
+ ASSERT_EQ(2, num_idx_blk_seek);
+
+ iter->Seek("7");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(1, num_find_file_in_level);
+ ASSERT_EQ(3, num_idx_blk_seek);
+
+ iter->Seek("8");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(2, num_find_file_in_level);
+ // Still re-seek because "8" is the boundary key, which has
+ // the same user key as the seek key.
+ ASSERT_EQ(4, num_idx_blk_seek);
+
+ iter->Seek("5");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(3, num_find_file_in_level);
+ ASSERT_EQ(5, num_idx_blk_seek);
+
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(3, num_find_file_in_level);
+ ASSERT_EQ(5, num_idx_blk_seek);
+
+ // Seeking backward never allows the index block seek to be skipped
+ iter->Seek("5");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(3, num_find_file_in_level);
+ ASSERT_EQ(6, num_idx_blk_seek);
+ }
+
+ SyncPoint::GetInstance()->DisableProcessing();
+}
+
+// MyRocks may change iterate bounds before seek. Simply test to make sure such
+// usage doesn't break the iterator.
+TEST_P(DBIteratorTest, IterateBoundChangedBeforeSeek) {
+ Options options = CurrentOptions();
+ options.compression = CompressionType::kNoCompression;
+ BlockBasedTableOptions table_options;
+ table_options.block_size = 100;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ std::string value(50, 'v');
+ Reopen(options);
+ ASSERT_OK(Put("aaa", value));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("bbb", "v"));
+ ASSERT_OK(Put("ccc", "v"));
+ ASSERT_OK(Put("ddd", "v"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("eee", "v"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+ std::string ub1 = "e";
+ std::string ub2 = "c";
+ Slice ub(ub1);
+ ReadOptions read_opts1;
+ read_opts1.iterate_upper_bound = &ub;
+ Iterator* iter = NewIterator(read_opts1);
+ // Seek and iterate across the block boundary.
+ iter->Seek("b");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ("bbb", iter->key());
+ ub = Slice(ub2);
+ iter->Seek("b");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ("bbb", iter->key());
+ iter->Next();
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_OK(iter->status());
+ delete iter;
+
+ std::string lb1 = "a";
+ std::string lb2 = "c";
+ Slice lb(lb1);
+ ReadOptions read_opts2;
+ read_opts2.iterate_lower_bound = &lb;
+ iter = NewIterator(read_opts2);
+ iter->SeekForPrev("d");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ("ccc", iter->key());
+ lb = Slice(lb2);
+ iter->SeekForPrev("d");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ("ccc", iter->key());
+ iter->Prev();
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_OK(iter->status());
+ delete iter;
+}
+
+TEST_P(DBIteratorTest, IterateWithLowerBoundAcrossFileBoundary) {
+ ASSERT_OK(Put("aaa", "v"));
+ ASSERT_OK(Put("bbb", "v"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("ccc", "v"));
+ ASSERT_OK(Put("ddd", "v"));
+ ASSERT_OK(Flush());
+ // Move both files to bottom level.
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ Slice lower_bound("b");
+ ReadOptions read_opts;
+ read_opts.iterate_lower_bound = &lower_bound;
+ std::unique_ptr<Iterator> iter(NewIterator(read_opts));
+ iter->SeekForPrev("d");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ("ccc", iter->key());
+ iter->Prev();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ("bbb", iter->key());
+ iter->Prev();
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_OK(iter->status());
+}
+
+TEST_P(DBIteratorTest, Blob) {
+ Options options = CurrentOptions();
+ options.enable_blob_files = true;
+ options.max_sequential_skip_in_iterations = 2;
+ options.statistics = CreateDBStatistics();
+
+ Reopen(options);
+
+ // Note: we have 4 KVs (3 of which are hidden) for key "b" and
+ // max_sequential_skip_in_iterations is set to 2. Thus, we need to do a reseek
+ // anytime we move from "b" to "c" or vice versa.
+ ASSERT_OK(Put("a", "va"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("b", "vb0"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("b", "vb1"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("b", "vb2"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("b", "vb3"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("c", "vc"));
+ ASSERT_OK(Flush());
+
+ std::unique_ptr<Iterator> iter_guard(NewIterator(ReadOptions()));
+ Iterator* const iter = iter_guard.get();
+
+ iter->SeekToFirst();
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 0);
+ ASSERT_EQ(IterStatus(iter), "a->va");
+ iter->Next();
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 0);
+ ASSERT_EQ(IterStatus(iter), "b->vb3");
+ iter->Next();
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 1);
+ ASSERT_EQ(IterStatus(iter), "c->vc");
+ iter->Next();
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 1);
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+ iter->SeekToFirst();
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 1);
+ ASSERT_EQ(IterStatus(iter), "a->va");
+ iter->Prev();
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 1);
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+ iter->SeekToLast();
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 1);
+ ASSERT_EQ(IterStatus(iter), "c->vc");
+ iter->Prev();
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 2);
+ ASSERT_EQ(IterStatus(iter), "b->vb3");
+ iter->Prev();
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 2);
+ ASSERT_EQ(IterStatus(iter), "a->va");
+ iter->Prev();
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 2);
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+ iter->SeekToLast();
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 2);
+ ASSERT_EQ(IterStatus(iter), "c->vc");
+ iter->Next();
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 2);
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+ iter->Seek("");
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 2);
+ ASSERT_EQ(IterStatus(iter), "a->va");
+ iter->Seek("a");
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 2);
+ ASSERT_EQ(IterStatus(iter), "a->va");
+ iter->Seek("ax");
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 2);
+ ASSERT_EQ(IterStatus(iter), "b->vb3");
+
+ iter->SeekForPrev("d");
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 2);
+ ASSERT_EQ(IterStatus(iter), "c->vc");
+ iter->SeekForPrev("c");
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 2);
+ ASSERT_EQ(IterStatus(iter), "c->vc");
+ iter->SeekForPrev("bx");
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 3);
+ ASSERT_EQ(IterStatus(iter), "b->vb3");
+
+ iter->Seek("b");
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 3);
+ ASSERT_EQ(IterStatus(iter), "b->vb3");
+ iter->Seek("z");
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 3);
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+ iter->SeekForPrev("b");
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 4);
+ ASSERT_EQ(IterStatus(iter), "b->vb3");
+ iter->SeekForPrev("");
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 4);
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+ // Switch from reverse to forward
+ iter->SeekToLast();
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 4);
+ iter->Prev();
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 5);
+ iter->Prev();
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 5);
+ iter->Next();
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 6);
+ ASSERT_EQ(IterStatus(iter), "b->vb3");
+
+ // Switch from forward to reverse
+ iter->SeekToFirst();
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 6);
+ iter->Next();
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 6);
+ iter->Next();
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 7);
+ iter->Prev();
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 8);
+ ASSERT_EQ(IterStatus(iter), "b->vb3");
+}
+
+INSTANTIATE_TEST_CASE_P(DBIteratorTestInstance, DBIteratorTest,
+ testing::Values(true, false));
+
+// Tests how DBIter work with ReadCallback
+class DBIteratorWithReadCallbackTest : public DBIteratorTest {};
+
+TEST_F(DBIteratorWithReadCallbackTest, ReadCallback) {
+ class TestReadCallback : public ReadCallback {
+ public:
+ explicit TestReadCallback(SequenceNumber _max_visible_seq)
+ : ReadCallback(_max_visible_seq) {}
+
+ bool IsVisibleFullCheck(SequenceNumber seq) override {
+ return seq <= max_visible_seq_;
+ }
+ };
+
+ ASSERT_OK(Put("foo", "v1"));
+ ASSERT_OK(Put("foo", "v2"));
+ ASSERT_OK(Put("foo", "v3"));
+ ASSERT_OK(Put("a", "va"));
+ ASSERT_OK(Put("z", "vz"));
+ SequenceNumber seq1 = db_->GetLatestSequenceNumber();
+ TestReadCallback callback1(seq1);
+ ASSERT_OK(Put("foo", "v4"));
+ ASSERT_OK(Put("foo", "v5"));
+ ASSERT_OK(Put("bar", "v7"));
+
+ SequenceNumber seq2 = db_->GetLatestSequenceNumber();
+ auto* cfd =
+ static_cast_with_check<ColumnFamilyHandleImpl>(db_->DefaultColumnFamily())
+ ->cfd();
+ // The iterator is supposed to see data before seq1.
+ Iterator* iter =
+ dbfull()->NewIteratorImpl(ReadOptions(), cfd, seq2, &callback1);
+
+ // Seek
+ // The latest value of "foo" before seq1 is "v3"
+ iter->Seek("foo");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ("foo", iter->key());
+ ASSERT_EQ("v3", iter->value());
+ // "bar" is not visible to the iterator. It will move on to the next key
+ // "foo".
+ iter->Seek("bar");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ("foo", iter->key());
+ ASSERT_EQ("v3", iter->value());
+
+ // Next
+ // Seek to "a"
+ iter->Seek("a");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ("va", iter->value());
+ // "bar" is not visible to the iterator. It will move on to the next key
+ // "foo".
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ("foo", iter->key());
+ ASSERT_EQ("v3", iter->value());
+
+ // Prev
+ // Seek to "z"
+ iter->Seek("z");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ("vz", iter->value());
+ // The previous key is "foo", which is visible to the iterator.
+ iter->Prev();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ("foo", iter->key());
+ ASSERT_EQ("v3", iter->value());
+ // "bar" is not visible to the iterator. It will move on to the next key "a".
+ iter->Prev(); // skipping "bar"
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ("a", iter->key());
+ ASSERT_EQ("va", iter->value());
+
+ // SeekForPrev
+ // The previous key is "foo", which is visible to the iterator.
+ iter->SeekForPrev("y");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ("foo", iter->key());
+ ASSERT_EQ("v3", iter->value());
+ // "bar" is not visible to the iterator. It will move on to the next key "a".
+ iter->SeekForPrev("bar");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ("a", iter->key());
+ ASSERT_EQ("va", iter->value());
+
+ delete iter;
+
+ // Prev beyond max_sequential_skip_in_iterations
+ uint64_t num_versions =
+ CurrentOptions().max_sequential_skip_in_iterations + 10;
+ for (uint64_t i = 0; i < num_versions; i++) {
+ ASSERT_OK(Put("bar", std::to_string(i)));
+ }
+ SequenceNumber seq3 = db_->GetLatestSequenceNumber();
+ TestReadCallback callback2(seq3);
+ ASSERT_OK(Put("bar", "v8"));
+ SequenceNumber seq4 = db_->GetLatestSequenceNumber();
+
+ // The iterator is supposed to see data before seq3.
+ iter = dbfull()->NewIteratorImpl(ReadOptions(), cfd, seq4, &callback2);
+ // Seek to "z", which is visible.
+ iter->Seek("z");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ("vz", iter->value());
+ // Previous key is "foo" and the last value "v5" is visible.
+ iter->Prev();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ("foo", iter->key());
+ ASSERT_EQ("v5", iter->value());
+ // Since "bar" has more values than max_sequential_skip_in_iterations,
+ // Prev() will ultimately fall back to a seek in the forward direction.
+ // Here we test that the fallback seek is correct.
+ // The last visible value should be (num_versions - 1), as "v8" is not
+ // visible.
+ iter->Prev();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ("bar", iter->key());
+ ASSERT_EQ(std::to_string(num_versions - 1), iter->value());
+
+ delete iter;
+}
+
+TEST_F(DBIteratorTest, BackwardIterationOnInplaceUpdateMemtable) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.inplace_update_support = false;
+ options.env = env_;
+ DestroyAndReopen(options);
+ constexpr int kNumKeys = 10;
+
+ // Write kNumKeys to WAL.
+ for (int i = 0; i < kNumKeys; ++i) {
+ ASSERT_OK(Put(Key(i), "val"));
+ }
+ ReadOptions read_opts;
+ read_opts.total_order_seek = true;
+ {
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts));
+ int count = 0;
+ for (iter->SeekToLast(); iter->Valid(); iter->Prev()) {
+ ++count;
+ }
+ ASSERT_EQ(kNumKeys, count);
+ }
+
+ // Reopen and rebuild the memtable from WAL.
+ options.create_if_missing = false;
+ options.avoid_flush_during_recovery = true;
+ options.inplace_update_support = true;
+ options.allow_concurrent_memtable_write = false;
+ Reopen(options);
+ {
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts));
+ iter->SeekToLast();
+ // Backward iteration not supported due to inplace_update_support = true.
+ ASSERT_TRUE(iter->status().IsNotSupported());
+ ASSERT_FALSE(iter->Valid());
+ }
+}
+
+TEST_F(DBIteratorTest, IteratorRefreshReturnSV) {
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ DestroyAndReopen(options);
+ ASSERT_OK(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"));
+ std::unique_ptr<Iterator> iter{db_->NewIterator(ReadOptions())};
+ SyncPoint::GetInstance()->SetCallBack(
+ "ArenaWrappedDBIter::Refresh:SV", [&](void*) {
+ ASSERT_OK(db_->Put(WriteOptions(), "dummy", "new SV"));
+ // This makes the local SV obsolete.
+ ASSERT_OK(Flush());
+ SyncPoint::GetInstance()->DisableProcessing();
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ ASSERT_OK(iter->Refresh());
+ iter.reset();
+ // iter used to not clean up its SV, so the Close() below would hit an
+ // assertion error.
+ Close();
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_kv_checksum_test.cc b/src/rocksdb/db/db_kv_checksum_test.cc
new file mode 100644
index 000000000..614399243
--- /dev/null
+++ b/src/rocksdb/db/db_kv_checksum_test.cc
@@ -0,0 +1,885 @@
+// Copyright (c) 2020-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/blob/blob_index.h"
+#include "db/db_test_util.h"
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+enum class WriteBatchOpType {
+ kPut = 0,
+ kDelete,
+ kSingleDelete,
+ kMerge,
+ kPutEntity,
+ kDeleteRange,
+ kNum,
+};
+
+// Integer addition is needed for `::testing::Range()` to take the enum type.
+WriteBatchOpType operator+(WriteBatchOpType lhs, const int rhs) {
+ using T = std::underlying_type<WriteBatchOpType>::type;
+ return static_cast<WriteBatchOpType>(static_cast<T>(lhs) + rhs);
+}
+
+enum class WriteMode {
+ // `Write()` a `WriteBatch` constructed with `protection_bytes_per_key = 0`
+ // and `WriteOptions::protection_bytes_per_key = 0`
+ kWriteUnprotectedBatch = 0,
+ // `Write()` a `WriteBatch` constructed with `protection_bytes_per_key > 0`.
+ kWriteProtectedBatch,
+ // `Write()` a `WriteBatch` constructed with `protection_bytes_per_key == 0`.
+ // Protection is enabled via `WriteOptions::protection_bytes_per_key > 0`.
+ kWriteOptionProtectedBatch,
+ // TODO(ajkr): add a mode that uses `Write()` wrappers, e.g., `Put()`.
+ kNum,
+};
+
+// Integer addition is needed for `::testing::Range()` to take the enum type.
+WriteMode operator+(WriteMode lhs, const int rhs) {
+ using T = std::underlying_type<WriteMode>::type;
+ return static_cast<WriteMode>(static_cast<T>(lhs) + rhs);
+}
+
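+// Builds a WriteBatch containing a single entry of the given type, using the
+// requested per-key protection, and returns it together with the status of
+// the write operation.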
+std::pair<WriteBatch, Status> GetWriteBatch(ColumnFamilyHandle* cf_handle,
+ size_t protection_bytes_per_key,
+ WriteBatchOpType op_type) {
+ Status s;
+ WriteBatch wb(0 /* reserved_bytes */, 0 /* max_bytes */,
+ protection_bytes_per_key, 0 /* default_cf_ts_sz */);
+ switch (op_type) {
+ case WriteBatchOpType::kPut:
+ s = wb.Put(cf_handle, "key", "val");
+ break;
+ case WriteBatchOpType::kDelete:
+ s = wb.Delete(cf_handle, "key");
+ break;
+ case WriteBatchOpType::kSingleDelete:
+ s = wb.SingleDelete(cf_handle, "key");
+ break;
+ case WriteBatchOpType::kDeleteRange:
+ s = wb.DeleteRange(cf_handle, "begin", "end");
+ break;
+ case WriteBatchOpType::kMerge:
+ s = wb.Merge(cf_handle, "key", "val");
+ break;
+ case WriteBatchOpType::kPutEntity:
+ s = wb.PutEntity(cf_handle, "key",
+ {{"attr_name1", "foo"}, {"attr_name2", "bar"}});
+ break;
+ case WriteBatchOpType::kNum:
+ assert(false);
+ }
+ return {std::move(wb), std::move(s)};
+}
+
+class DbKvChecksumTestBase : public DBTestBase {
+ public:
+ DbKvChecksumTestBase(const std::string& path, bool env_do_fsync)
+ : DBTestBase(path, env_do_fsync) {}
+
+ ColumnFamilyHandle* GetCFHandleToUse(ColumnFamilyHandle* column_family,
+ WriteBatchOpType op_type) const {
+ // Note: PutEntity cannot be called without column family
+ if (op_type == WriteBatchOpType::kPutEntity && !column_family) {
+ return db_->DefaultColumnFamily();
+ }
+
+ return column_family;
+ }
+};
+
+class DbKvChecksumTest
+ : public DbKvChecksumTestBase,
+ public ::testing::WithParamInterface<
+ std::tuple<WriteBatchOpType, char, WriteMode,
+ uint32_t /* memtable_protection_bytes_per_key */>> {
+ public:
+ DbKvChecksumTest()
+ : DbKvChecksumTestBase("db_kv_checksum_test", /*env_do_fsync=*/false) {
+ op_type_ = std::get<0>(GetParam());
+ corrupt_byte_addend_ = std::get<1>(GetParam());
+ write_mode_ = std::get<2>(GetParam());
+ memtable_protection_bytes_per_key_ = std::get<3>(GetParam());
+ }
+
+ Status ExecuteWrite(ColumnFamilyHandle* cf_handle) {
+ switch (write_mode_) {
+ case WriteMode::kWriteUnprotectedBatch: {
+ auto batch_and_status =
+ GetWriteBatch(GetCFHandleToUse(cf_handle, op_type_),
+ 0 /* protection_bytes_per_key */, op_type_);
+ assert(batch_and_status.second.ok());
+ // Default write option has protection_bytes_per_key = 0
+ return db_->Write(WriteOptions(), &batch_and_status.first);
+ }
+ case WriteMode::kWriteProtectedBatch: {
+ auto batch_and_status =
+ GetWriteBatch(GetCFHandleToUse(cf_handle, op_type_),
+ 8 /* protection_bytes_per_key */, op_type_);
+ assert(batch_and_status.second.ok());
+ return db_->Write(WriteOptions(), &batch_and_status.first);
+ }
+ case WriteMode::kWriteOptionProtectedBatch: {
+ auto batch_and_status =
+ GetWriteBatch(GetCFHandleToUse(cf_handle, op_type_),
+ 0 /* protection_bytes_per_key */, op_type_);
+ assert(batch_and_status.second.ok());
+ WriteOptions write_opts;
+ write_opts.protection_bytes_per_key = 8;
+ return db_->Write(write_opts, &batch_and_status.first);
+ }
+ case WriteMode::kNum:
+ assert(false);
+ }
+ return Status::NotSupported("WriteMode " +
+ std::to_string(static_cast<int>(write_mode_)));
+ }
+
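+ // SyncPoint callback that corrupts one byte of the encoded entry per
+ // attempt and advances the offset so that successive attempts cover every
+ // byte.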
+ void CorruptNextByteCallBack(void* arg) {
+ Slice encoded = *static_cast<Slice*>(arg);
+ if (entry_len_ == std::numeric_limits<size_t>::max()) {
+ // We learn the entry size on the first attempt
+ entry_len_ = encoded.size();
+ }
+ char* buf = const_cast<char*>(encoded.data());
+ buf[corrupt_byte_offset_] += corrupt_byte_addend_;
+ ++corrupt_byte_offset_;
+ }
+
+ bool MoreBytesToCorrupt() { return corrupt_byte_offset_ < entry_len_; }
+
+ protected:
+ WriteBatchOpType op_type_;
+ char corrupt_byte_addend_;
+ WriteMode write_mode_;
+ uint32_t memtable_protection_bytes_per_key_;
+ size_t corrupt_byte_offset_ = 0;
+ size_t entry_len_ = std::numeric_limits<size_t>::max();
+};
+
+std::string GetOpTypeString(const WriteBatchOpType& op_type) {
+ switch (op_type) {
+ case WriteBatchOpType::kPut:
+ return "Put";
+ case WriteBatchOpType::kDelete:
+ return "Delete";
+ case WriteBatchOpType::kSingleDelete:
+ return "SingleDelete";
+ case WriteBatchOpType::kDeleteRange:
+ return "DeleteRange";
+ case WriteBatchOpType::kMerge:
+ return "Merge";
+ case WriteBatchOpType::kPutEntity:
+ return "PutEntity";
+ case WriteBatchOpType::kNum:
+ assert(false);
+ }
+ assert(false);
+ return "";
+}
+
+std::string GetWriteModeString(const WriteMode& mode) {
+ switch (mode) {
+ case WriteMode::kWriteUnprotectedBatch:
+ return "WriteUnprotectedBatch";
+ case WriteMode::kWriteProtectedBatch:
+ return "WriteProtectedBatch";
+ case WriteMode::kWriteOptionProtectedBatch:
+ return "kWriteOptionProtectedBatch";
+ case WriteMode::kNum:
+ assert(false);
+ }
+ return "";
+}
+
+INSTANTIATE_TEST_CASE_P(
+ DbKvChecksumTest, DbKvChecksumTest,
+ ::testing::Combine(::testing::Range(static_cast<WriteBatchOpType>(0),
+ WriteBatchOpType::kNum),
+ ::testing::Values(2, 103, 251),
+ ::testing::Range(WriteMode::kWriteProtectedBatch,
+ WriteMode::kNum),
+ ::testing::Values(0)),
+ [](const testing::TestParamInfo<
+ std::tuple<WriteBatchOpType, char, WriteMode, uint32_t>>& args) {
+ std::ostringstream oss;
+ oss << GetOpTypeString(std::get<0>(args.param)) << "Add"
+ << static_cast<int>(
+ static_cast<unsigned char>(std::get<1>(args.param)))
+ << GetWriteModeString(std::get<2>(args.param))
+ << static_cast<uint32_t>(std::get<3>(args.param));
+ return oss.str();
+ });
+
+// TODO(ajkr): add a test that corrupts the `WriteBatch` contents. Such
+// corruptions should only be detectable in `WriteMode::kWriteProtectedBatch`.
+
+TEST_P(DbKvChecksumTest, MemTableAddCorrupted) {
+ // This test repeatedly attempts to write `WriteBatch`es containing a single
+ // entry of type `op_type_`. Each attempt has one byte corrupted in its
+ // memtable entry by adding `corrupt_byte_addend_` to its original value. The
+ // test repeats until an attempt has been made on each byte in the encoded
+ // memtable entry. All attempts are expected to fail with `Status::Corruption`.
+ SyncPoint::GetInstance()->SetCallBack(
+ "MemTable::Add:Encoded",
+ std::bind(&DbKvChecksumTest::CorruptNextByteCallBack, this,
+ std::placeholders::_1));
+
+ while (MoreBytesToCorrupt()) {
+ // Failed memtable insert always leads to read-only mode, so we have to
+ // reopen for every attempt.
+ Options options = CurrentOptions();
+ if (op_type_ == WriteBatchOpType::kMerge) {
+ options.merge_operator = MergeOperators::CreateStringAppendOperator();
+ }
+ Reopen(options);
+
+ SyncPoint::GetInstance()->EnableProcessing();
+ ASSERT_TRUE(ExecuteWrite(nullptr /* cf_handle */).IsCorruption());
+ SyncPoint::GetInstance()->DisableProcessing();
+
+ // In case the above callback is not invoked, this test would run
+ // numeric_limits<size_t>::max() times until it reports an error (or
+ // exhausts disk space). This assert reports the error early.
+ ASSERT_TRUE(entry_len_ < std::numeric_limits<size_t>::max());
+ }
+}
+
+TEST_P(DbKvChecksumTest, MemTableAddWithColumnFamilyCorrupted) {
+ // This test repeatedly attempts to write `WriteBatch`es containing a single
+ // entry of type `op_type_` to a non-default column family. Each attempt has
+ // one byte corrupted in its memtable entry by adding `corrupt_byte_addend_`
+ // to its original value. The test repeats until an attempt has been made on
+ // each byte in the encoded memtable entry. All attempts are expected to fail
+ // with `Status::Corruption`.
+ Options options = CurrentOptions();
+ if (op_type_ == WriteBatchOpType::kMerge) {
+ options.merge_operator = MergeOperators::CreateStringAppendOperator();
+ }
+ CreateAndReopenWithCF({"pikachu"}, options);
+ SyncPoint::GetInstance()->SetCallBack(
+ "MemTable::Add:Encoded",
+ std::bind(&DbKvChecksumTest::CorruptNextByteCallBack, this,
+ std::placeholders::_1));
+
+ while (MoreBytesToCorrupt()) {
+ // Failed memtable insert always leads to read-only mode, so we have to
+ // reopen for every attempt.
+ ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"}, options);
+
+ SyncPoint::GetInstance()->EnableProcessing();
+ ASSERT_TRUE(ExecuteWrite(handles_[1]).IsCorruption());
+ SyncPoint::GetInstance()->DisableProcessing();
+
+ // In case the above callback is not invoked, this test would run
+ // numeric_limits<size_t>::max() times until it reports an error (or
+ // exhausts disk space). This assert reports the error early.
+ ASSERT_TRUE(entry_len_ < std::numeric_limits<size_t>::max());
+ }
+}
+
+TEST_P(DbKvChecksumTest, NoCorruptionCase) {
+ // If this test fails, we may have found a piece of malfunctioning hardware.
+ auto batch_and_status =
+ GetWriteBatch(GetCFHandleToUse(nullptr, op_type_),
+ 8 /* protection_bytes_per_key */, op_type_);
+ ASSERT_OK(batch_and_status.second);
+ ASSERT_OK(batch_and_status.first.VerifyChecksum());
+}
+
+TEST_P(DbKvChecksumTest, WriteToWALCorrupted) {
+ // This test repeatedly attempts to write `WriteBatch`es containing a single
+ // entry of type `op_type_`. Each attempt has one byte corrupted by adding
+ // `corrupt_byte_addend_` to its original value. The test repeats until an
+ // attempt has been made on each byte in the encoded write batch. All attempts
+ // are expected to fail with `Status::Corruption`.
+ Options options = CurrentOptions();
+ if (op_type_ == WriteBatchOpType::kMerge) {
+ options.merge_operator = MergeOperators::CreateStringAppendOperator();
+ }
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::WriteToWAL:log_entry",
+ std::bind(&DbKvChecksumTest::CorruptNextByteCallBack, this,
+ std::placeholders::_1));
+ // First 8 bytes are for sequence number which is not protected in write batch
+ corrupt_byte_offset_ = 8;
+
+ while (MoreBytesToCorrupt()) {
+ // Corrupted write batch leads to read-only mode, so we have to
+ // reopen for every attempt.
+ Reopen(options);
+ auto log_size_pre_write = dbfull()->TEST_total_log_size();
+
+ SyncPoint::GetInstance()->EnableProcessing();
+ ASSERT_TRUE(ExecuteWrite(nullptr /* cf_handle */).IsCorruption());
+ // Confirm that nothing was written to WAL
+ ASSERT_EQ(log_size_pre_write, dbfull()->TEST_total_log_size());
+ ASSERT_TRUE(dbfull()->TEST_GetBGError().IsCorruption());
+ SyncPoint::GetInstance()->DisableProcessing();
+
+ // In case the above callback is not invoked, this test would run
+ // numeric_limits<size_t>::max() times until it reports an error (or
+ // exhausts disk space). This assert reports the error early.
+ ASSERT_TRUE(entry_len_ < std::numeric_limits<size_t>::max());
+ }
+}
+
+TEST_P(DbKvChecksumTest, WriteToWALWithColumnFamilyCorrupted) {
+ // This test repeatedly attempts to write `WriteBatch`es containing a single
+ // entry of type `op_type_`. Each attempt has one byte corrupted by adding
+ // `corrupt_byte_addend_` to its original value. The test repeats until an
+ // attempt has been made on each byte in the encoded write batch. All attempts
+ // are expected to fail with `Status::Corruption`.
+ Options options = CurrentOptions();
+ if (op_type_ == WriteBatchOpType::kMerge) {
+ options.merge_operator = MergeOperators::CreateStringAppendOperator();
+ }
+ CreateAndReopenWithCF({"pikachu"}, options);
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::WriteToWAL:log_entry",
+ std::bind(&DbKvChecksumTest::CorruptNextByteCallBack, this,
+ std::placeholders::_1));
+ // First 8 bytes are for sequence number which is not protected in write batch
+ corrupt_byte_offset_ = 8;
+
+ while (MoreBytesToCorrupt()) {
+ // Corrupted write batch leads to read-only mode, so we have to
+ // reopen for every attempt.
+ ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"}, options);
+ auto log_size_pre_write = dbfull()->TEST_total_log_size();
+
+ SyncPoint::GetInstance()->EnableProcessing();
+ ASSERT_TRUE(ExecuteWrite(nullptr /* cf_handle */).IsCorruption());
+ // Confirm that nothing was written to WAL
+ ASSERT_EQ(log_size_pre_write, dbfull()->TEST_total_log_size());
+ ASSERT_TRUE(dbfull()->TEST_GetBGError().IsCorruption());
+ SyncPoint::GetInstance()->DisableProcessing();
+
+ // In case the above callback is not invoked, this test would run
+ // numeric_limits<size_t>::max() times until it reports an error (or
+ // exhausts disk space). This assert reports the error early.
+ ASSERT_TRUE(entry_len_ < std::numeric_limits<size_t>::max());
+ }
+}
+
+class DbKvChecksumTestMergedBatch
+ : public DbKvChecksumTestBase,
+ public ::testing::WithParamInterface<
+ std::tuple<WriteBatchOpType, WriteBatchOpType, char>> {
+ public:
+ DbKvChecksumTestMergedBatch()
+ : DbKvChecksumTestBase("db_kv_checksum_test", /*env_do_fsync=*/false) {
+ op_type1_ = std::get<0>(GetParam());
+ op_type2_ = std::get<1>(GetParam());
+ corrupt_byte_addend_ = std::get<2>(GetParam());
+ }
+
+ protected:
+ WriteBatchOpType op_type1_;
+ WriteBatchOpType op_type2_;
+ char corrupt_byte_addend_;
+};
+
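+// Corrupts the serialized write batch in place by adding
+// `corrupt_byte_addend` to the byte at `offset`.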
+void CorruptWriteBatch(Slice* content, size_t offset,
+ char corrupt_byte_addend) {
+ ASSERT_TRUE(offset < content->size());
+ char* buf = const_cast<char*>(content->data());
+ buf[offset] += corrupt_byte_addend;
+}
+
+TEST_P(DbKvChecksumTestMergedBatch, NoCorruptionCase) {
+ // Verify the write batch checksum after write batch append
+ auto batch1 = GetWriteBatch(GetCFHandleToUse(nullptr, op_type1_),
+ 8 /* protection_bytes_per_key */, op_type1_);
+ ASSERT_OK(batch1.second);
+ auto batch2 = GetWriteBatch(GetCFHandleToUse(nullptr, op_type2_),
+ 8 /* protection_bytes_per_key */, op_type2_);
+ ASSERT_OK(batch2.second);
+ ASSERT_OK(WriteBatchInternal::Append(&batch1.first, &batch2.first));
+ ASSERT_OK(batch1.first.VerifyChecksum());
+}
+
+TEST_P(DbKvChecksumTestMergedBatch, WriteToWALCorrupted) {
+ // This test has two writers repeatedly attempt to write `WriteBatch`es
+ // containing a single entry of type op_type1_ and op_type2_ respectively. The
+// leader of the write group writes the batch containing the entry of type
+ // op_type1_. One byte of the pre-merged write batches is corrupted by adding
+ // `corrupt_byte_addend_` to the batch's original value during each attempt.
+ // The test repeats until an attempt has been made on each byte in both
+ // pre-merged write batches. All attempts are expected to fail with
+ // `Status::Corruption`.
+ Options options = CurrentOptions();
+ if (op_type1_ == WriteBatchOpType::kMerge ||
+ op_type2_ == WriteBatchOpType::kMerge) {
+ options.merge_operator = MergeOperators::CreateStringAppendOperator();
+ }
+
+ auto leader_batch_and_status =
+ GetWriteBatch(GetCFHandleToUse(nullptr, op_type1_),
+ 8 /* protection_bytes_per_key */, op_type1_);
+ ASSERT_OK(leader_batch_and_status.second);
+ auto follower_batch_and_status =
+ GetWriteBatch(GetCFHandleToUse(nullptr, op_type2_),
+ 8 /* protection_bytes_per_key */, op_type2_);
+ size_t leader_batch_size = leader_batch_and_status.first.GetDataSize();
+ size_t total_bytes =
+ leader_batch_size + follower_batch_and_status.first.GetDataSize();
+ // First 8 bytes are for sequence number which is not protected in write batch
+ size_t corrupt_byte_offset = 8;
+
+ std::atomic<bool> follower_joined{false};
+ std::atomic<int> leader_count{0};
+ port::Thread follower_thread;
+ // This callback should only be called by the leader thread
+ SyncPoint::GetInstance()->SetCallBack(
+ "WriteThread::JoinBatchGroup:Wait2", [&](void* arg_leader) {
+ auto* leader = reinterpret_cast<WriteThread::Writer*>(arg_leader);
+ ASSERT_EQ(leader->state, WriteThread::STATE_GROUP_LEADER);
+
+ // This callback should only be called by the follower thread
+ SyncPoint::GetInstance()->SetCallBack(
+ "WriteThread::JoinBatchGroup:Wait", [&](void* arg_follower) {
+ auto* follower =
+ reinterpret_cast<WriteThread::Writer*>(arg_follower);
+ // The leader thread will wait on this bool and hence wait until
+ // this writer joins the write group
+ ASSERT_NE(follower->state, WriteThread::STATE_GROUP_LEADER);
+ if (corrupt_byte_offset >= leader_batch_size) {
+ Slice batch_content = follower->batch->Data();
+ CorruptWriteBatch(&batch_content,
+ corrupt_byte_offset - leader_batch_size,
+ corrupt_byte_addend_);
+ }
+ // Leader busy waits on this flag
+ follower_joined = true;
+ // So the follower does not enter the outer callback at
+ // WriteThread::JoinBatchGroup:Wait2
+ SyncPoint::GetInstance()->DisableProcessing();
+ });
+
+ // Start the other writer thread which will join the write group as
+ // follower
+ follower_thread = port::Thread([&]() {
+ follower_batch_and_status =
+ GetWriteBatch(GetCFHandleToUse(nullptr, op_type2_),
+ 8 /* protection_bytes_per_key */, op_type2_);
+ ASSERT_OK(follower_batch_and_status.second);
+ ASSERT_TRUE(
+ db_->Write(WriteOptions(), &follower_batch_and_status.first)
+ .IsCorruption());
+ });
+
+ ASSERT_EQ(leader->batch->GetDataSize(), leader_batch_size);
+ if (corrupt_byte_offset < leader_batch_size) {
+ Slice batch_content = leader->batch->Data();
+ CorruptWriteBatch(&batch_content, corrupt_byte_offset,
+ corrupt_byte_addend_);
+ }
+ leader_count++;
+ while (!follower_joined) {
+ // busy waiting
+ }
+ });
+ while (corrupt_byte_offset < total_bytes) {
+ // Reopen DB since the failed WAL write led to read-only mode
+ Reopen(options);
+ SyncPoint::GetInstance()->EnableProcessing();
+ auto log_size_pre_write = dbfull()->TEST_total_log_size();
+ leader_batch_and_status =
+ GetWriteBatch(GetCFHandleToUse(nullptr, op_type1_),
+ 8 /* protection_bytes_per_key */, op_type1_);
+ ASSERT_OK(leader_batch_and_status.second);
+ ASSERT_TRUE(db_->Write(WriteOptions(), &leader_batch_and_status.first)
+ .IsCorruption());
+ follower_thread.join();
+ // Prevent leader thread from entering this callback
+ SyncPoint::GetInstance()->ClearCallBack("WriteThread::JoinBatchGroup:Wait");
+ ASSERT_EQ(1, leader_count);
+ // Nothing should have been written to WAL
+ ASSERT_EQ(log_size_pre_write, dbfull()->TEST_total_log_size());
+ ASSERT_TRUE(dbfull()->TEST_GetBGError().IsCorruption());
+
+ corrupt_byte_offset++;
+ if (corrupt_byte_offset == leader_batch_size) {
+ // skip over the sequence number part of follower's write batch
+ corrupt_byte_offset += 8;
+ }
+ follower_joined = false;
+ leader_count = 0;
+ }
+ SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_P(DbKvChecksumTestMergedBatch, WriteToWALWithColumnFamilyCorrupted) {
+ // This test has two writers repeatedly attempt to write `WriteBatch`es
+ // containing a single entry of type op_type1_ and op_type2_ respectively. The
+// leader of the write group writes the batch containing the entry of type
+ // op_type1_. One byte of the pre-merged write batches is corrupted by adding
+ // `corrupt_byte_addend_` to the batch's original value during each attempt.
+ // The test repeats until an attempt has been made on each byte in both
+ // pre-merged write batches. All attempts are expected to fail with
+ // `Status::Corruption`.
+ Options options = CurrentOptions();
+ if (op_type1_ == WriteBatchOpType::kMerge ||
+ op_type2_ == WriteBatchOpType::kMerge) {
+ options.merge_operator = MergeOperators::CreateStringAppendOperator();
+ }
+ CreateAndReopenWithCF({"ramen"}, options);
+
+ auto leader_batch_and_status =
+ GetWriteBatch(GetCFHandleToUse(handles_[1], op_type1_),
+ 8 /* protection_bytes_per_key */, op_type1_);
+ ASSERT_OK(leader_batch_and_status.second);
+ auto follower_batch_and_status =
+ GetWriteBatch(GetCFHandleToUse(handles_[1], op_type2_),
+ 8 /* protection_bytes_per_key */, op_type2_);
+ size_t leader_batch_size = leader_batch_and_status.first.GetDataSize();
+ size_t total_bytes =
+ leader_batch_size + follower_batch_and_status.first.GetDataSize();
+ // First 8 bytes are for sequence number which is not protected in write batch
+ size_t corrupt_byte_offset = 8;
+
+ std::atomic<bool> follower_joined{false};
+ std::atomic<int> leader_count{0};
+ port::Thread follower_thread;
+ // This callback should only be called by the leader thread
+ SyncPoint::GetInstance()->SetCallBack(
+ "WriteThread::JoinBatchGroup:Wait2", [&](void* arg_leader) {
+ auto* leader = reinterpret_cast<WriteThread::Writer*>(arg_leader);
+ ASSERT_EQ(leader->state, WriteThread::STATE_GROUP_LEADER);
+
+ // This callback should only be called by the follower thread
+ SyncPoint::GetInstance()->SetCallBack(
+ "WriteThread::JoinBatchGroup:Wait", [&](void* arg_follower) {
+ auto* follower =
+ reinterpret_cast<WriteThread::Writer*>(arg_follower);
+ // The leader thread will wait on this bool and hence wait until
+ // this writer joins the write group
+ ASSERT_NE(follower->state, WriteThread::STATE_GROUP_LEADER);
+ if (corrupt_byte_offset >= leader_batch_size) {
+ Slice batch_content =
+ WriteBatchInternal::Contents(follower->batch);
+ CorruptWriteBatch(&batch_content,
+ corrupt_byte_offset - leader_batch_size,
+ corrupt_byte_addend_);
+ }
+ follower_joined = true;
+ // So the follower does not enter the outer callback at
+ // WriteThread::JoinBatchGroup:Wait2
+ SyncPoint::GetInstance()->DisableProcessing();
+ });
+
+ // Start the other writer thread which will join the write group as
+ // follower
+ follower_thread = port::Thread([&]() {
+ follower_batch_and_status =
+ GetWriteBatch(GetCFHandleToUse(handles_[1], op_type2_),
+ 8 /* protection_bytes_per_key */, op_type2_);
+ ASSERT_OK(follower_batch_and_status.second);
+ ASSERT_TRUE(
+ db_->Write(WriteOptions(), &follower_batch_and_status.first)
+ .IsCorruption());
+ });
+
+ ASSERT_EQ(leader->batch->GetDataSize(), leader_batch_size);
+ if (corrupt_byte_offset < leader_batch_size) {
+ Slice batch_content = WriteBatchInternal::Contents(leader->batch);
+ CorruptWriteBatch(&batch_content, corrupt_byte_offset,
+ corrupt_byte_addend_);
+ }
+ leader_count++;
+ while (!follower_joined) {
+ // busy waiting
+ }
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ while (corrupt_byte_offset < total_bytes) {
+ // Reopen DB since the failed WAL write led to read-only mode
+ ReopenWithColumnFamilies({kDefaultColumnFamilyName, "ramen"}, options);
+ SyncPoint::GetInstance()->EnableProcessing();
+ auto log_size_pre_write = dbfull()->TEST_total_log_size();
+ leader_batch_and_status =
+ GetWriteBatch(GetCFHandleToUse(handles_[1], op_type1_),
+ 8 /* protection_bytes_per_key */, op_type1_);
+ ASSERT_OK(leader_batch_and_status.second);
+ ASSERT_TRUE(db_->Write(WriteOptions(), &leader_batch_and_status.first)
+ .IsCorruption());
+ follower_thread.join();
+ // Prevent leader thread from entering this callback
+ SyncPoint::GetInstance()->ClearCallBack("WriteThread::JoinBatchGroup:Wait");
+
+ ASSERT_EQ(1, leader_count);
+ // Nothing should have been written to WAL
+ ASSERT_EQ(log_size_pre_write, dbfull()->TEST_total_log_size());
+ ASSERT_TRUE(dbfull()->TEST_GetBGError().IsCorruption());
+
+ corrupt_byte_offset++;
+ if (corrupt_byte_offset == leader_batch_size) {
+ // skip over the sequence number part of follower's write batch
+ corrupt_byte_offset += 8;
+ }
+ follower_joined = false;
+ leader_count = 0;
+ }
+ SyncPoint::GetInstance()->DisableProcessing();
+}
+
+INSTANTIATE_TEST_CASE_P(
+ DbKvChecksumTestMergedBatch, DbKvChecksumTestMergedBatch,
+ ::testing::Combine(::testing::Range(static_cast<WriteBatchOpType>(0),
+ WriteBatchOpType::kNum),
+ ::testing::Range(static_cast<WriteBatchOpType>(0),
+ WriteBatchOpType::kNum),
+ ::testing::Values(2, 103, 251)),
+ [](const testing::TestParamInfo<
+ std::tuple<WriteBatchOpType, WriteBatchOpType, char>>& args) {
+ std::ostringstream oss;
+ oss << GetOpTypeString(std::get<0>(args.param))
+ << GetOpTypeString(std::get<1>(args.param)) << "Add"
+ << static_cast<int>(
+ static_cast<unsigned char>(std::get<2>(args.param)));
+ return oss.str();
+ });
+
+// TODO: add test for transactions
+// TODO: add test for corrupted write batch with WAL disabled
+
+class DbKVChecksumWALToWriteBatchTest : public DBTestBase {
+ public:
+ DbKVChecksumWALToWriteBatchTest()
+ : DBTestBase("db_kv_checksum_test", /*env_do_fsync=*/false) {}
+};
+
+TEST_F(DbKVChecksumWALToWriteBatchTest, WriteBatchChecksumHandoff) {
+ Options options = CurrentOptions();
+ Reopen(options);
+ ASSERT_OK(db_->Put(WriteOptions(), "key", "val"));
+ std::string content = "";
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::RecoverLogFiles:BeforeUpdateProtectionInfo:batch",
+ [&](void* batch_ptr) {
+ WriteBatch* batch = reinterpret_cast<WriteBatch*>(batch_ptr);
+ content.assign(batch->Data().data(), batch->GetDataSize());
+ Slice batch_content = batch->Data();
+ // Corrupt first bit
+ CorruptWriteBatch(&batch_content, 0, 1);
+ });
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::RecoverLogFiles:BeforeUpdateProtectionInfo:checksum",
+ [&](void* checksum_ptr) {
+ // Verify that checksum is produced on the batch content
+ uint64_t checksum = *reinterpret_cast<uint64_t*>(checksum_ptr);
+ ASSERT_EQ(checksum, XXH3_64bits(content.data(), content.size()));
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ ASSERT_TRUE(TryReopen(options).IsCorruption());
+ SyncPoint::GetInstance()->DisableProcessing();
+};
+
+// TODO (cbi): add DeleteRange coverage once it is implemented
+class DbMemtableKVChecksumTest : public DbKvChecksumTest {
+ public:
+ DbMemtableKVChecksumTest() : DbKvChecksumTest() {}
+
+ protected:
+ // Indices in the memtable entry that we will not corrupt.
+ // For memtable entry format, see comments in MemTable::Add().
+ // We do not corrupt the key length and value length fields in this test
+ // case since doing so causes a segfault and ASAN will complain.
+ // For this test case, the key and value are both of length 3, so the
+ // key length field is at index 0 and the value length field is at index 12.
+ const std::set<size_t> index_not_to_corrupt{0, 12};
+
+ void SkipNotToCorruptEntry() {
+ if (index_not_to_corrupt.find(corrupt_byte_offset_) !=
+ index_not_to_corrupt.end()) {
+ corrupt_byte_offset_++;
+ }
+ }
+};
+
+INSTANTIATE_TEST_CASE_P(
+ DbMemtableKVChecksumTest, DbMemtableKVChecksumTest,
+ ::testing::Combine(::testing::Range(static_cast<WriteBatchOpType>(0),
+ WriteBatchOpType::kDeleteRange),
+ ::testing::Values(2, 103, 251),
+ ::testing::Range(static_cast<WriteMode>(0),
+ WriteMode::kWriteOptionProtectedBatch),
+ // skip 1 byte checksum as it makes test flaky
+ ::testing::Values(2, 4, 8)),
+ [](const testing::TestParamInfo<
+ std::tuple<WriteBatchOpType, char, WriteMode, uint32_t>>& args) {
+ std::ostringstream oss;
+ oss << GetOpTypeString(std::get<0>(args.param)) << "Add"
+ << static_cast<int>(
+ static_cast<unsigned char>(std::get<1>(args.param)))
+ << GetWriteModeString(std::get<2>(args.param))
+ << static_cast<uint32_t>(std::get<3>(args.param));
+ return oss.str();
+ });
+
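+// The tests below corrupt bytes of the encoded memtable entry (skipping the
+// key/value length fields) and expect the protected read paths (Get,
+// iterators, flush) to report Corruption.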
+TEST_P(DbMemtableKVChecksumTest, GetWithCorruptAfterMemtableInsert) {
+  // Record the memtable entry size.
+  // We do not corrupt the memtable entry here since doing so would segfault
+  // or fail some asserts inside the memtablerep implementation,
+  // e.g., when key_len is corrupted.
+ SyncPoint::GetInstance()->SetCallBack(
+ "MemTable::Add:BeforeReturn:Encoded", [&](void* arg) {
+ Slice encoded = *static_cast<Slice*>(arg);
+ entry_len_ = encoded.size();
+ });
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "Memtable::SaveValue:Begin:entry", [&](void* entry) {
+ char* buf = *static_cast<char**>(entry);
+ buf[corrupt_byte_offset_] += corrupt_byte_addend_;
+ ++corrupt_byte_offset_;
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ Options options = CurrentOptions();
+ options.memtable_protection_bytes_per_key =
+ memtable_protection_bytes_per_key_;
+ if (op_type_ == WriteBatchOpType::kMerge) {
+ options.merge_operator = MergeOperators::CreateStringAppendOperator();
+ }
+
+ SkipNotToCorruptEntry();
+ while (MoreBytesToCorrupt()) {
+ Reopen(options);
+ ASSERT_OK(ExecuteWrite(nullptr));
+ std::string val;
+ ASSERT_TRUE(db_->Get(ReadOptions(), "key", &val).IsCorruption());
+ Destroy(options);
+ SkipNotToCorruptEntry();
+ }
+}
+
+TEST_P(DbMemtableKVChecksumTest,
+ GetWithColumnFamilyCorruptAfterMemtableInsert) {
+  // Record the memtable entry size.
+  // We do not corrupt the memtable entry here since doing so would segfault
+  // or fail some asserts inside the memtablerep implementation,
+  // e.g., when key_len is corrupted.
+ SyncPoint::GetInstance()->SetCallBack(
+ "MemTable::Add:BeforeReturn:Encoded", [&](void* arg) {
+ Slice encoded = *static_cast<Slice*>(arg);
+ entry_len_ = encoded.size();
+ });
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "Memtable::SaveValue:Begin:entry", [&](void* entry) {
+ char* buf = *static_cast<char**>(entry);
+ buf[corrupt_byte_offset_] += corrupt_byte_addend_;
+ ++corrupt_byte_offset_;
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ Options options = CurrentOptions();
+ options.memtable_protection_bytes_per_key =
+ memtable_protection_bytes_per_key_;
+ if (op_type_ == WriteBatchOpType::kMerge) {
+ options.merge_operator = MergeOperators::CreateStringAppendOperator();
+ }
+
+ SkipNotToCorruptEntry();
+ while (MoreBytesToCorrupt()) {
+ Reopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+ ASSERT_OK(ExecuteWrite(handles_[1]));
+ std::string val;
+ ASSERT_TRUE(
+ db_->Get(ReadOptions(), handles_[1], "key", &val).IsCorruption());
+ Destroy(options);
+ SkipNotToCorruptEntry();
+ }
+}
+
+TEST_P(DbMemtableKVChecksumTest, IteratorWithCorruptAfterMemtableInsert) {
+ SyncPoint::GetInstance()->SetCallBack(
+ "MemTable::Add:BeforeReturn:Encoded",
+ std::bind(&DbKvChecksumTest::CorruptNextByteCallBack, this,
+ std::placeholders::_1));
+ SyncPoint::GetInstance()->EnableProcessing();
+ Options options = CurrentOptions();
+ options.memtable_protection_bytes_per_key =
+ memtable_protection_bytes_per_key_;
+ if (op_type_ == WriteBatchOpType::kMerge) {
+ options.merge_operator = MergeOperators::CreateStringAppendOperator();
+ }
+
+ SkipNotToCorruptEntry();
+ while (MoreBytesToCorrupt()) {
+ Reopen(options);
+ ASSERT_OK(ExecuteWrite(nullptr));
+ Iterator* it = db_->NewIterator(ReadOptions());
+ it->SeekToFirst();
+ ASSERT_FALSE(it->Valid());
+ ASSERT_TRUE(it->status().IsCorruption());
+ delete it;
+ Destroy(options);
+ SkipNotToCorruptEntry();
+ }
+}
+
+TEST_P(DbMemtableKVChecksumTest,
+ IteratorWithColumnFamilyCorruptAfterMemtableInsert) {
+ SyncPoint::GetInstance()->SetCallBack(
+ "MemTable::Add:BeforeReturn:Encoded",
+ std::bind(&DbKvChecksumTest::CorruptNextByteCallBack, this,
+ std::placeholders::_1));
+ SyncPoint::GetInstance()->EnableProcessing();
+ Options options = CurrentOptions();
+ options.memtable_protection_bytes_per_key =
+ memtable_protection_bytes_per_key_;
+ if (op_type_ == WriteBatchOpType::kMerge) {
+ options.merge_operator = MergeOperators::CreateStringAppendOperator();
+ }
+
+ SkipNotToCorruptEntry();
+ while (MoreBytesToCorrupt()) {
+ Reopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+ ASSERT_OK(ExecuteWrite(handles_[1]));
+ Iterator* it = db_->NewIterator(ReadOptions(), handles_[1]);
+ it->SeekToFirst();
+ ASSERT_FALSE(it->Valid());
+ ASSERT_TRUE(it->status().IsCorruption());
+ delete it;
+ Destroy(options);
+ SkipNotToCorruptEntry();
+ }
+}
+
+TEST_P(DbMemtableKVChecksumTest, FlushWithCorruptAfterMemtableInsert) {
+ SyncPoint::GetInstance()->SetCallBack(
+ "MemTable::Add:BeforeReturn:Encoded",
+ std::bind(&DbKvChecksumTest::CorruptNextByteCallBack, this,
+ std::placeholders::_1));
+ SyncPoint::GetInstance()->EnableProcessing();
+ Options options = CurrentOptions();
+ options.memtable_protection_bytes_per_key =
+ memtable_protection_bytes_per_key_;
+ if (op_type_ == WriteBatchOpType::kMerge) {
+ options.merge_operator = MergeOperators::CreateStringAppendOperator();
+ }
+
+ SkipNotToCorruptEntry();
+  // Unlike the other tests, we do not corrupt each byte since Flush() is slow.
+ Reopen(options);
+ ASSERT_OK(ExecuteWrite(nullptr));
+ ASSERT_TRUE(Flush().IsCorruption());
+ // DB enters read-only state when flush reads corrupted data
+ ASSERT_TRUE(dbfull()->TEST_GetBGError().IsCorruption());
+ Destroy(options);
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_log_iter_test.cc b/src/rocksdb/db/db_log_iter_test.cc
new file mode 100644
index 000000000..4e982858c
--- /dev/null
+++ b/src/rocksdb/db/db_log_iter_test.cc
@@ -0,0 +1,305 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+// The introduction of SyncPoint effectively disabled building and running this
+// test in Release builds, which is a pity, because it is a good test.
+#if !defined(ROCKSDB_LITE)
+
+#include "db/db_test_util.h"
+#include "env/mock_env.h"
+#include "port/stack_trace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBTestXactLogIterator : public DBTestBase {
+ public:
+ DBTestXactLogIterator()
+ : DBTestBase("db_log_iter_test", /*env_do_fsync=*/true) {}
+
+ std::unique_ptr<TransactionLogIterator> OpenTransactionLogIter(
+ const SequenceNumber seq) {
+ std::unique_ptr<TransactionLogIterator> iter;
+ Status status = dbfull()->GetUpdatesSince(seq, &iter);
+ EXPECT_OK(status);
+ EXPECT_TRUE(iter->Valid());
+ return iter;
+ }
+};
+
+namespace {
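+// Drains the given TransactionLogIterator, verifying that batch sequence
+// numbers are strictly increasing. Stores the number of batches read in
+// `count` and returns the sequence number of the last batch. When `expect_ok`
+// is false, the iterator is expected to end in a non-OK status (e.g. after
+// hitting a truncated log).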
+SequenceNumber ReadRecords(std::unique_ptr<TransactionLogIterator>& iter,
+ int& count, bool expect_ok = true) {
+ count = 0;
+ SequenceNumber lastSequence = 0;
+ BatchResult res;
+ while (iter->Valid()) {
+ res = iter->GetBatch();
+ EXPECT_TRUE(res.sequence > lastSequence);
+ ++count;
+ lastSequence = res.sequence;
+ EXPECT_OK(iter->status());
+ iter->Next();
+ }
+ if (expect_ok) {
+ EXPECT_OK(iter->status());
+ } else {
+ EXPECT_NOK(iter->status());
+ }
+ return res.sequence;
+}
+
+void ExpectRecords(const int expected_no_records,
+ std::unique_ptr<TransactionLogIterator>& iter) {
+ int num_records;
+ ReadRecords(iter, num_records);
+ ASSERT_EQ(num_records, expected_no_records);
+}
+} // anonymous namespace
+
+TEST_F(DBTestXactLogIterator, TransactionLogIterator) {
+ do {
+ Options options = OptionsForLogIterTest();
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+ ASSERT_OK(Put(0, "key1", DummyString(1024)));
+ ASSERT_OK(Put(1, "key2", DummyString(1024)));
+ ASSERT_OK(Put(1, "key2", DummyString(1024)));
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 3U);
+ {
+ auto iter = OpenTransactionLogIter(0);
+ ExpectRecords(3, iter);
+ }
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ env_->SleepForMicroseconds(2 * 1000 * 1000);
+ {
+ ASSERT_OK(Put(0, "key4", DummyString(1024)));
+ ASSERT_OK(Put(1, "key5", DummyString(1024)));
+ ASSERT_OK(Put(0, "key6", DummyString(1024)));
+ }
+ {
+ auto iter = OpenTransactionLogIter(0);
+ ExpectRecords(6, iter);
+ }
+ } while (ChangeCompactOptions());
+}
+
+#ifndef NDEBUG  // sync points are not included in -DNDEBUG builds
+TEST_F(DBTestXactLogIterator, TransactionLogIteratorRace) {
+ static const int LOG_ITERATOR_RACE_TEST_COUNT = 2;
+ static const char* sync_points[LOG_ITERATOR_RACE_TEST_COUNT][4] = {
+ {"WalManager::GetSortedWalFiles:1", "WalManager::PurgeObsoleteFiles:1",
+ "WalManager::PurgeObsoleteFiles:2", "WalManager::GetSortedWalFiles:2"},
+ {"WalManager::GetSortedWalsOfType:1", "WalManager::PurgeObsoleteFiles:1",
+ "WalManager::PurgeObsoleteFiles:2",
+ "WalManager::GetSortedWalsOfType:2"}};
+ for (int test = 0; test < LOG_ITERATOR_RACE_TEST_COUNT; ++test) {
+    // Set up sync point dependencies to reproduce the race condition where
+    // a log file is moved to the archive dir in the middle of GetSortedWalFiles
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ {sync_points[test][0], sync_points[test][1]},
+ {sync_points[test][2], sync_points[test][3]},
+ });
+
+ do {
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearTrace();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ Options options = OptionsForLogIterTest();
+ DestroyAndReopen(options);
+ ASSERT_OK(Put("key1", DummyString(1024)));
+ ASSERT_OK(dbfull()->Flush(FlushOptions()));
+ ASSERT_OK(Put("key2", DummyString(1024)));
+ ASSERT_OK(dbfull()->Flush(FlushOptions()));
+ ASSERT_OK(Put("key3", DummyString(1024)));
+ ASSERT_OK(dbfull()->Flush(FlushOptions()));
+ ASSERT_OK(Put("key4", DummyString(1024)));
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 4U);
+ ASSERT_OK(dbfull()->FlushWAL(false));
+
+ {
+ auto iter = OpenTransactionLogIter(0);
+ ExpectRecords(4, iter);
+ }
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+      // Trigger an async flush and a log move. The log move will wait until
+      // GetSortedWalFiles:1 is reached, to reproduce the race condition.
+ FlushOptions flush_options;
+ flush_options.wait = false;
+ ASSERT_OK(dbfull()->Flush(flush_options));
+
+ // "key5" would be written in a new memtable and log
+ ASSERT_OK(Put("key5", DummyString(1024)));
+ ASSERT_OK(dbfull()->FlushWAL(false));
+ {
+ // this iter would miss "key4" if not fixed
+ auto iter = OpenTransactionLogIter(0);
+ ExpectRecords(5, iter);
+ }
+ } while (ChangeCompactOptions());
+ }
+}
+#endif
+
+TEST_F(DBTestXactLogIterator, TransactionLogIteratorStallAtLastRecord) {
+ do {
+ Options options = OptionsForLogIterTest();
+ DestroyAndReopen(options);
+ ASSERT_OK(Put("key1", DummyString(1024)));
+ auto iter = OpenTransactionLogIter(0);
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ iter->Next();
+ ASSERT_TRUE(!iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_OK(Put("key2", DummyString(1024)));
+ iter->Next();
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ } while (ChangeCompactOptions());
+}
+
+TEST_F(DBTestXactLogIterator, TransactionLogIteratorCheckAfterRestart) {
+ do {
+ Options options = OptionsForLogIterTest();
+ DestroyAndReopen(options);
+ ASSERT_OK(Put("key1", DummyString(1024)));
+ ASSERT_OK(Put("key2", DummyString(1023)));
+ ASSERT_OK(dbfull()->Flush(FlushOptions()));
+ Reopen(options);
+ auto iter = OpenTransactionLogIter(0);
+ ExpectRecords(2, iter);
+ } while (ChangeCompactOptions());
+}
+
+TEST_F(DBTestXactLogIterator, TransactionLogIteratorCorruptedLog) {
+ do {
+ Options options = OptionsForLogIterTest();
+ DestroyAndReopen(options);
+
+ for (int i = 0; i < 1024; i++) {
+ ASSERT_OK(Put("key" + std::to_string(i), DummyString(10)));
+ }
+
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->FlushWAL(false));
+
+ // Corrupt this log to create a gap
+ ASSERT_OK(db_->DisableFileDeletions());
+
+ VectorLogPtr wal_files;
+ ASSERT_OK(db_->GetSortedWalFiles(wal_files));
+ ASSERT_FALSE(wal_files.empty());
+
+ const auto logfile_path = dbname_ + "/" + wal_files.front()->PathName();
+ ASSERT_OK(test::TruncateFile(env_, logfile_path,
+ wal_files.front()->SizeFileBytes() / 2));
+
+ ASSERT_OK(db_->EnableFileDeletions());
+
+ // Insert a new entry to a new log file
+ ASSERT_OK(Put("key1025", DummyString(10)));
+ ASSERT_OK(db_->FlushWAL(false));
+
+ // Try to read from the beginning. Should stop before the gap and read less
+ // than 1025 entries
+ auto iter = OpenTransactionLogIter(0);
+ int count = 0;
+ SequenceNumber last_sequence_read = ReadRecords(iter, count, false);
+ ASSERT_LT(last_sequence_read, 1025U);
+
+ // Try to read past the gap, should be able to seek to key1025
+ auto iter2 = OpenTransactionLogIter(last_sequence_read + 1);
+ ExpectRecords(1, iter2);
+ } while (ChangeCompactOptions());
+}
+
+TEST_F(DBTestXactLogIterator, TransactionLogIteratorBatchOperations) {
+ do {
+ Options options = OptionsForLogIterTest();
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+ WriteBatch batch;
+ ASSERT_OK(batch.Put(handles_[1], "key1", DummyString(1024)));
+ ASSERT_OK(batch.Put(handles_[0], "key2", DummyString(1024)));
+ ASSERT_OK(batch.Put(handles_[1], "key3", DummyString(1024)));
+ ASSERT_OK(batch.Delete(handles_[0], "key2"));
+ ASSERT_OK(dbfull()->Write(WriteOptions(), &batch));
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(Flush(0));
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ ASSERT_OK(Put(1, "key4", DummyString(1024)));
+ auto iter = OpenTransactionLogIter(3);
+ ExpectRecords(2, iter);
+ } while (ChangeCompactOptions());
+}
+
+TEST_F(DBTestXactLogIterator, TransactionLogIteratorBlobs) {
+ Options options = OptionsForLogIterTest();
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+ {
+ WriteBatch batch;
+ ASSERT_OK(batch.Put(handles_[1], "key1", DummyString(1024)));
+ ASSERT_OK(batch.Put(handles_[0], "key2", DummyString(1024)));
+ ASSERT_OK(batch.PutLogData(Slice("blob1")));
+ ASSERT_OK(batch.Put(handles_[1], "key3", DummyString(1024)));
+ ASSERT_OK(batch.PutLogData(Slice("blob2")));
+ ASSERT_OK(batch.Delete(handles_[0], "key2"));
+ ASSERT_OK(dbfull()->Write(WriteOptions(), &batch));
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ }
+
+ auto res = OpenTransactionLogIter(0)->GetBatch();
+ struct Handler : public WriteBatch::Handler {
+ std::string seen;
+ Status PutCF(uint32_t cf, const Slice& key, const Slice& value) override {
+ seen += "Put(" + std::to_string(cf) + ", " + key.ToString() + ", " +
+ std::to_string(value.size()) + ")";
+ return Status::OK();
+ }
+ Status MergeCF(uint32_t cf, const Slice& key, const Slice& value) override {
+ seen += "Merge(" + std::to_string(cf) + ", " + key.ToString() + ", " +
+ std::to_string(value.size()) + ")";
+ return Status::OK();
+ }
+ void LogData(const Slice& blob) override {
+ seen += "LogData(" + blob.ToString() + ")";
+ }
+ Status DeleteCF(uint32_t cf, const Slice& key) override {
+ seen += "Delete(" + std::to_string(cf) + ", " + key.ToString() + ")";
+ return Status::OK();
+ }
+ } handler;
+ ASSERT_OK(res.writeBatchPtr->Iterate(&handler));
+ ASSERT_EQ(
+ "Put(1, key1, 1024)"
+ "Put(0, key2, 1024)"
+ "LogData(blob1)"
+ "Put(1, key3, 1024)"
+ "LogData(blob2)"
+ "Delete(0, key2)",
+ handler.seen);
+}
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // !defined(ROCKSDB_LITE)
+
+int main(int argc, char** argv) {
+#if !defined(ROCKSDB_LITE)
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+#else
+ (void)argc;
+ (void)argv;
+ return 0;
+#endif
+}
diff --git a/src/rocksdb/db/db_logical_block_size_cache_test.cc b/src/rocksdb/db/db_logical_block_size_cache_test.cc
new file mode 100644
index 000000000..13c16618e
--- /dev/null
+++ b/src/rocksdb/db/db_logical_block_size_cache_test.cc
@@ -0,0 +1,521 @@
+// Copyright (c) 2020-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "test_util/testharness.h"
+
+#ifdef OS_LINUX
+#include "env/io_posix.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+
+namespace ROCKSDB_NAMESPACE {
+class EnvWithCustomLogicalBlockSizeCache : public EnvWrapper {
+ public:
+ EnvWithCustomLogicalBlockSizeCache(Env* env, LogicalBlockSizeCache* cache)
+ : EnvWrapper(env), cache_(cache) {}
+
+ Status RegisterDbPaths(const std::vector<std::string>& paths) override {
+ return cache_->RefAndCacheLogicalBlockSize(paths);
+ }
+
+ Status UnregisterDbPaths(const std::vector<std::string>& paths) override {
+ cache_->UnrefAndTryRemoveCachedLogicalBlockSize(paths);
+ return Status::OK();
+ }
+
+ private:
+ LogicalBlockSizeCache* cache_;
+};
+
+class DBLogicalBlockSizeCacheTest : public testing::Test {
+ public:
+ DBLogicalBlockSizeCacheTest()
+ : dbname_(test::PerThreadDBPath("logical_block_size_cache_test")),
+ data_path_0_(dbname_ + "/data_path_0"),
+ data_path_1_(dbname_ + "/data_path_1"),
+ cf_path_0_(dbname_ + "/cf_path_0"),
+ cf_path_1_(dbname_ + "/cf_path_1") {
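+    // Stub block-size callbacks: report the fd value itself as a file's
+    // logical block size and a fixed 1024 bytes for directories, so the tests
+    // below can assert on cache contents deterministically.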
+ auto get_fd_block_size = [&](int fd) { return fd; };
+ auto get_dir_block_size = [&](const std::string& /*dir*/, size_t* size) {
+ *size = 1024;
+ return Status::OK();
+ };
+ cache_.reset(
+ new LogicalBlockSizeCache(get_fd_block_size, get_dir_block_size));
+ env_.reset(
+ new EnvWithCustomLogicalBlockSizeCache(Env::Default(), cache_.get()));
+ }
+
+ protected:
+ std::string dbname_;
+ std::string data_path_0_;
+ std::string data_path_1_;
+ std::string cf_path_0_;
+ std::string cf_path_1_;
+ std::unique_ptr<LogicalBlockSizeCache> cache_;
+ std::unique_ptr<Env> env_;
+};
+
+TEST_F(DBLogicalBlockSizeCacheTest, OpenClose) {
+ // Tests that Open will cache the logical block size for data paths,
+ // and Close will remove the cached sizes.
+ Options options;
+ options.create_if_missing = true;
+ options.env = env_.get();
+ options.db_paths = {{data_path_0_, 2048}, {data_path_1_, 2048}};
+
+ for (int i = 0; i < 2; i++) {
+ DB* db;
+ if (!i) {
+ printf("Open\n");
+ ASSERT_OK(DB::Open(options, dbname_, &db));
+ } else {
+#ifdef ROCKSDB_LITE
+ break;
+#else
+ printf("OpenForReadOnly\n");
+ ASSERT_OK(DB::OpenForReadOnly(options, dbname_, &db));
+#endif
+ }
+ ASSERT_EQ(2, cache_->Size());
+ ASSERT_TRUE(cache_->Contains(data_path_0_));
+ ASSERT_EQ(1, cache_->GetRefCount(data_path_0_));
+ ASSERT_TRUE(cache_->Contains(data_path_1_));
+ ASSERT_EQ(1, cache_->GetRefCount(data_path_1_));
+ ASSERT_OK(db->Close());
+ ASSERT_EQ(0, cache_->Size());
+ delete db;
+ }
+ ASSERT_OK(DestroyDB(dbname_, options, {}));
+}
+
+TEST_F(DBLogicalBlockSizeCacheTest, OpenDelete) {
+  // Tests that Open will cache the logical block size for the db path,
+  // and that deleting the db pointer will remove the cached sizes.
+ Options options;
+ options.create_if_missing = true;
+ options.env = env_.get();
+
+ for (int i = 0; i < 2; i++) {
+ DB* db;
+ if (!i) {
+ printf("Open\n");
+ ASSERT_OK(DB::Open(options, dbname_, &db));
+ } else {
+#ifdef ROCKSDB_LITE
+ break;
+#else
+ printf("OpenForReadOnly\n");
+ ASSERT_OK(DB::OpenForReadOnly(options, dbname_, &db));
+#endif
+ }
+ ASSERT_EQ(1, cache_->Size());
+ ASSERT_TRUE(cache_->Contains(dbname_));
+ ASSERT_EQ(1, cache_->GetRefCount(dbname_));
+ delete db;
+ ASSERT_EQ(0, cache_->Size());
+ }
+ ASSERT_OK(DestroyDB(dbname_, options, {}));
+}
+
+TEST_F(DBLogicalBlockSizeCacheTest, CreateColumnFamily) {
+  // Tests that CreateColumnFamily will cache the cf_paths,
+  // that dropping the column family won't drop the cache,
+  // and that dropping and then destroying the column family handle will
+  // drop the cache.
+ Options options;
+ options.create_if_missing = true;
+ options.env = env_.get();
+ ColumnFamilyOptions cf_options;
+ cf_options.cf_paths = {{cf_path_0_, 1024}, {cf_path_1_, 2048}};
+
+ DB* db;
+ ASSERT_OK(DB::Open(options, dbname_, &db));
+ ASSERT_EQ(1, cache_->Size());
+ ASSERT_TRUE(cache_->Contains(dbname_));
+ ASSERT_EQ(1, cache_->GetRefCount(dbname_));
+
+ ColumnFamilyHandle* cf = nullptr;
+ ASSERT_OK(db->CreateColumnFamily(cf_options, "cf", &cf));
+ ASSERT_EQ(3, cache_->Size());
+ ASSERT_TRUE(cache_->Contains(dbname_));
+ ASSERT_EQ(1, cache_->GetRefCount(dbname_));
+ ASSERT_TRUE(cache_->Contains(cf_path_0_));
+ ASSERT_EQ(1, cache_->GetRefCount(cf_path_0_));
+ ASSERT_TRUE(cache_->Contains(cf_path_1_));
+ ASSERT_EQ(1, cache_->GetRefCount(cf_path_1_));
+
+ // Drop column family does not drop cache.
+ ASSERT_OK(db->DropColumnFamily(cf));
+ ASSERT_EQ(3, cache_->Size());
+ ASSERT_TRUE(cache_->Contains(dbname_));
+ ASSERT_EQ(1, cache_->GetRefCount(dbname_));
+ ASSERT_TRUE(cache_->Contains(cf_path_0_));
+ ASSERT_EQ(1, cache_->GetRefCount(cf_path_0_));
+ ASSERT_TRUE(cache_->Contains(cf_path_1_));
+ ASSERT_EQ(1, cache_->GetRefCount(cf_path_1_));
+
+ // Delete handle will drop cache.
+ ASSERT_OK(db->DestroyColumnFamilyHandle(cf));
+ ASSERT_TRUE(cache_->Contains(dbname_));
+ ASSERT_EQ(1, cache_->GetRefCount(dbname_));
+
+ delete db;
+ ASSERT_EQ(0, cache_->Size());
+ ASSERT_OK(DestroyDB(dbname_, options, {{"cf", cf_options}}));
+}
+
+TEST_F(DBLogicalBlockSizeCacheTest, CreateColumnFamilies) {
+ // To test:
+ // (1) CreateColumnFamilies will cache the cf_paths in
+ // DBLogicalBlockSizeCache
+ // (2) Dropping column family handles associated with
+ // that cf_paths won't drop the cached cf_paths
+ // (3) Deleting all the column family handles associated
+ // with that cf_paths will drop the cached cf_paths
+
+ Options options;
+ options.create_if_missing = true;
+ options.env = env_.get();
+ ColumnFamilyOptions cf_options;
+ cf_options.cf_paths = {{cf_path_0_, 1024}};
+
+ DB* db;
+ ASSERT_OK(DB::Open(options, dbname_, &db));
+ ASSERT_EQ(1, cache_->Size());
+ ASSERT_TRUE(cache_->Contains(dbname_));
+ ASSERT_EQ(1, cache_->GetRefCount(dbname_));
+
+ std::vector<ColumnFamilyHandle*> cfs;
+ ASSERT_OK(db->CreateColumnFamilies(cf_options, {"cf1", "cf2"}, &cfs));
+ ASSERT_EQ(2, cache_->Size());
+ ASSERT_TRUE(cache_->Contains(dbname_));
+ ASSERT_EQ(1, cache_->GetRefCount(dbname_));
+ ASSERT_TRUE(cache_->Contains(cf_path_0_));
+ ASSERT_EQ(2, cache_->GetRefCount(cf_path_0_));
+
+ // Drop column family does not drop cf_path_0_'s entry from cache
+ for (ColumnFamilyHandle* cf : cfs) {
+ ASSERT_OK(db->DropColumnFamily(cf));
+ ASSERT_EQ(2, cache_->Size());
+ ASSERT_TRUE(cache_->Contains(dbname_));
+ ASSERT_EQ(1, cache_->GetRefCount(dbname_));
+ ASSERT_TRUE(cache_->Contains(cf_path_0_));
+ ASSERT_EQ(2, cache_->GetRefCount(cf_path_0_));
+ }
+
+  // Deleting one cf handle will not drop cf_path_0_'s entry from the cache
+  // because another handle is still referencing cf_path_0_.
+ ASSERT_OK(db->DestroyColumnFamilyHandle(cfs[0]));
+ ASSERT_EQ(2, cache_->Size());
+ ASSERT_TRUE(cache_->Contains(dbname_));
+ ASSERT_EQ(1, cache_->GetRefCount(dbname_));
+ ASSERT_TRUE(cache_->Contains(cf_path_0_));
+
+ // Delete all cf handles and ensure the ref count of cf_path_0_ in cache_
+ // can be properly decreased by releasing any background reference to the
+ // ColumnFamilyData during db deletion
+ ASSERT_OK(db->DestroyColumnFamilyHandle(cfs[1]));
+ ASSERT_TRUE(cache_->Contains(dbname_));
+ ASSERT_EQ(1, cache_->GetRefCount(dbname_));
+ delete db;
+
+  // Now the ref count of cf_path_0_ in cache_ has been properly decreased and
+  // cf_path_0_'s entry is dropped from the cache.
+ ASSERT_EQ(0, cache_->Size());
+ ASSERT_OK(
+ DestroyDB(dbname_, options, {{"cf1", cf_options}, {"cf2", cf_options}}));
+}
+
+TEST_F(DBLogicalBlockSizeCacheTest, OpenWithColumnFamilies) {
+  // Tests that opening two column families with the same cf_path will cache
+  // the cf_path with 2 references to the cached size, that dropping a column
+  // family won't drop the cache, and that dropping and then destroying the
+  // column family handle will drop the cache.
+ Options options;
+ options.create_if_missing = true;
+ options.env = env_.get();
+
+ ColumnFamilyOptions cf_options;
+ cf_options.cf_paths = {{cf_path_0_, 1024}};
+
+ for (int i = 0; i < 2; i++) {
+ DB* db;
+ ColumnFamilyHandle* cf1 = nullptr;
+ ColumnFamilyHandle* cf2 = nullptr;
+ ASSERT_OK(DB::Open(options, dbname_, &db));
+ ASSERT_OK(db->CreateColumnFamily(cf_options, "cf1", &cf1));
+ ASSERT_OK(db->CreateColumnFamily(cf_options, "cf2", &cf2));
+ ASSERT_OK(db->DestroyColumnFamilyHandle(cf1));
+ ASSERT_OK(db->DestroyColumnFamilyHandle(cf2));
+ delete db;
+ ASSERT_EQ(0, cache_->Size());
+
+ std::vector<ColumnFamilyHandle*> cfs;
+ if (!i) {
+ printf("Open\n");
+ ASSERT_OK(DB::Open(options, dbname_,
+ {{"cf1", cf_options},
+ {"cf2", cf_options},
+ {"default", ColumnFamilyOptions()}},
+ &cfs, &db));
+ } else {
+#ifdef ROCKSDB_LITE
+ break;
+#else
+ printf("OpenForReadOnly\n");
+ ASSERT_OK(DB::OpenForReadOnly(options, dbname_,
+ {{"cf1", cf_options},
+ {"cf2", cf_options},
+ {"default", ColumnFamilyOptions()}},
+ &cfs, &db));
+#endif
+ }
+
+ // Logical block sizes of dbname_ and cf_path_0_ are cached during Open.
+ ASSERT_EQ(2, cache_->Size());
+ ASSERT_TRUE(cache_->Contains(dbname_));
+ ASSERT_EQ(1, cache_->GetRefCount(dbname_));
+ ASSERT_TRUE(cache_->Contains(cf_path_0_));
+ ASSERT_EQ(2, cache_->GetRefCount(cf_path_0_));
+
+ // Drop handles won't drop the cache.
+ ASSERT_OK(db->DropColumnFamily(cfs[0]));
+ ASSERT_OK(db->DropColumnFamily(cfs[1]));
+ ASSERT_EQ(2, cache_->Size());
+ ASSERT_TRUE(cache_->Contains(dbname_));
+ ASSERT_EQ(1, cache_->GetRefCount(dbname_));
+ ASSERT_TRUE(cache_->Contains(cf_path_0_));
+ ASSERT_EQ(2, cache_->GetRefCount(cf_path_0_));
+
+ // Delete 1st handle won't drop the cache for cf_path_0_.
+ ASSERT_OK(db->DestroyColumnFamilyHandle(cfs[0]));
+ ASSERT_EQ(2, cache_->Size());
+ ASSERT_TRUE(cache_->Contains(dbname_));
+ ASSERT_EQ(1, cache_->GetRefCount(dbname_));
+ ASSERT_TRUE(cache_->Contains(cf_path_0_));
+ ASSERT_EQ(1, cache_->GetRefCount(cf_path_0_));
+
+ // Delete 2nd handle will drop the cache for cf_path_0_.
+ ASSERT_OK(db->DestroyColumnFamilyHandle(cfs[1]));
+ ASSERT_EQ(1, cache_->Size());
+ ASSERT_TRUE(cache_->Contains(dbname_));
+ ASSERT_EQ(1, cache_->GetRefCount(dbname_));
+
+ // Delete the default handle won't affect the cache because db still refers
+ // to the default CF.
+ ASSERT_OK(db->DestroyColumnFamilyHandle(cfs[2]));
+ ASSERT_EQ(1, cache_->Size());
+ ASSERT_TRUE(cache_->Contains(dbname_));
+ ASSERT_EQ(1, cache_->GetRefCount(dbname_));
+
+ delete db;
+ ASSERT_EQ(0, cache_->Size());
+ }
+ ASSERT_OK(
+ DestroyDB(dbname_, options, {{"cf1", cf_options}, {"cf2", cf_options}}));
+}
+
+TEST_F(DBLogicalBlockSizeCacheTest, DestroyColumnFamilyHandle) {
+  // Tests that destroying a column family handle without dropping the column
+  // family won't drop the cache, because compaction and flush might still need
+  // to get the logical block size when opening new files.
+ Options options;
+ options.create_if_missing = true;
+ options.env = env_.get();
+ ColumnFamilyOptions cf_options;
+ cf_options.cf_paths = {{cf_path_0_, 1024}};
+
+ DB* db;
+ ASSERT_OK(DB::Open(options, dbname_, &db));
+ ASSERT_EQ(1, cache_->Size());
+ ASSERT_TRUE(cache_->Contains(dbname_));
+ ASSERT_EQ(1, cache_->GetRefCount(dbname_));
+ ColumnFamilyHandle* cf = nullptr;
+ ASSERT_OK(db->CreateColumnFamily(cf_options, "cf", &cf));
+ ASSERT_EQ(2, cache_->Size());
+ ASSERT_TRUE(cache_->Contains(dbname_));
+ ASSERT_EQ(1, cache_->GetRefCount(dbname_));
+ ASSERT_TRUE(cache_->Contains(cf_path_0_));
+ ASSERT_EQ(1, cache_->GetRefCount(cf_path_0_));
+
+ // Delete handle won't drop cache.
+ ASSERT_OK(db->DestroyColumnFamilyHandle(cf));
+ ASSERT_EQ(2, cache_->Size());
+ ASSERT_TRUE(cache_->Contains(dbname_));
+ ASSERT_EQ(1, cache_->GetRefCount(dbname_));
+ ASSERT_TRUE(cache_->Contains(cf_path_0_));
+ ASSERT_EQ(1, cache_->GetRefCount(cf_path_0_));
+
+ delete db;
+ ASSERT_EQ(0, cache_->Size());
+
+ // Open with column families.
+ std::vector<ColumnFamilyHandle*> cfs;
+ for (int i = 0; i < 2; i++) {
+ if (!i) {
+ printf("Open\n");
+ ASSERT_OK(DB::Open(
+ options, dbname_,
+ {{"cf", cf_options}, {"default", ColumnFamilyOptions()}}, &cfs, &db));
+ } else {
+#ifdef ROCKSDB_LITE
+ break;
+#else
+ printf("OpenForReadOnly\n");
+ ASSERT_OK(DB::OpenForReadOnly(
+ options, dbname_,
+ {{"cf", cf_options}, {"default", ColumnFamilyOptions()}}, &cfs, &db));
+#endif
+ }
+ // cf_path_0_ and dbname_ are cached.
+ ASSERT_EQ(2, cache_->Size());
+ ASSERT_TRUE(cache_->Contains(dbname_));
+ ASSERT_EQ(1, cache_->GetRefCount(dbname_));
+ ASSERT_TRUE(cache_->Contains(cf_path_0_));
+ ASSERT_EQ(1, cache_->GetRefCount(cf_path_0_));
+
+ // Deleting handle won't drop cache.
+ ASSERT_OK(db->DestroyColumnFamilyHandle(cfs[0]));
+ ASSERT_OK(db->DestroyColumnFamilyHandle(cfs[1]));
+ ASSERT_EQ(2, cache_->Size());
+ ASSERT_TRUE(cache_->Contains(dbname_));
+ ASSERT_EQ(1, cache_->GetRefCount(dbname_));
+ ASSERT_TRUE(cache_->Contains(cf_path_0_));
+ ASSERT_EQ(1, cache_->GetRefCount(cf_path_0_));
+
+ delete db;
+ ASSERT_EQ(0, cache_->Size());
+ }
+ ASSERT_OK(DestroyDB(dbname_, options, {{"cf", cf_options}}));
+}
+
+TEST_F(DBLogicalBlockSizeCacheTest, MultiDBWithDifferentPaths) {
+ // Tests the cache behavior when there are multiple DBs sharing the same env
+ // with different db_paths and cf_paths.
+ Options options;
+ options.create_if_missing = true;
+ options.env = env_.get();
+
+ ASSERT_OK(env_->CreateDirIfMissing(dbname_));
+
+ DB* db0;
+ ASSERT_OK(DB::Open(options, data_path_0_, &db0));
+ ASSERT_EQ(1, cache_->Size());
+ ASSERT_TRUE(cache_->Contains(data_path_0_));
+
+ ColumnFamilyOptions cf_options0;
+ cf_options0.cf_paths = {{cf_path_0_, 1024}};
+ ColumnFamilyHandle* cf0;
+ ASSERT_OK(db0->CreateColumnFamily(cf_options0, "cf", &cf0));
+ ASSERT_EQ(2, cache_->Size());
+ ASSERT_TRUE(cache_->Contains(data_path_0_));
+ ASSERT_EQ(1, cache_->GetRefCount(data_path_0_));
+ ASSERT_TRUE(cache_->Contains(cf_path_0_));
+ ASSERT_EQ(1, cache_->GetRefCount(cf_path_0_));
+
+ DB* db1;
+ ASSERT_OK(DB::Open(options, data_path_1_, &db1));
+ ASSERT_EQ(3, cache_->Size());
+ ASSERT_TRUE(cache_->Contains(data_path_0_));
+ ASSERT_EQ(1, cache_->GetRefCount(data_path_0_));
+ ASSERT_TRUE(cache_->Contains(cf_path_0_));
+ ASSERT_EQ(1, cache_->GetRefCount(cf_path_0_));
+ ASSERT_TRUE(cache_->Contains(data_path_1_));
+ ASSERT_EQ(1, cache_->GetRefCount(data_path_1_));
+
+ ColumnFamilyOptions cf_options1;
+ cf_options1.cf_paths = {{cf_path_1_, 1024}};
+ ColumnFamilyHandle* cf1;
+ ASSERT_OK(db1->CreateColumnFamily(cf_options1, "cf", &cf1));
+ ASSERT_EQ(4, cache_->Size());
+ ASSERT_TRUE(cache_->Contains(data_path_0_));
+ ASSERT_EQ(1, cache_->GetRefCount(data_path_0_));
+ ASSERT_TRUE(cache_->Contains(cf_path_0_));
+ ASSERT_EQ(1, cache_->GetRefCount(cf_path_0_));
+ ASSERT_TRUE(cache_->Contains(data_path_1_));
+ ASSERT_EQ(1, cache_->GetRefCount(data_path_1_));
+ ASSERT_TRUE(cache_->Contains(cf_path_1_));
+ ASSERT_EQ(1, cache_->GetRefCount(cf_path_1_));
+
+ ASSERT_OK(db0->DestroyColumnFamilyHandle(cf0));
+ delete db0;
+ ASSERT_EQ(2, cache_->Size());
+ ASSERT_TRUE(cache_->Contains(data_path_1_));
+ ASSERT_EQ(1, cache_->GetRefCount(data_path_1_));
+ ASSERT_TRUE(cache_->Contains(cf_path_1_));
+ ASSERT_EQ(1, cache_->GetRefCount(cf_path_1_));
+ ASSERT_OK(DestroyDB(data_path_0_, options, {{"cf", cf_options0}}));
+
+ ASSERT_OK(db1->DestroyColumnFamilyHandle(cf1));
+ delete db1;
+ ASSERT_EQ(0, cache_->Size());
+ ASSERT_OK(DestroyDB(data_path_1_, options, {{"cf", cf_options1}}));
+}
+
+TEST_F(DBLogicalBlockSizeCacheTest, MultiDBWithSamePaths) {
+ // Tests the cache behavior when there are multiple DBs sharing the same env
+ // with the same db_paths and cf_paths.
+ Options options;
+ options.create_if_missing = true;
+ options.env = env_.get();
+ options.db_paths = {{data_path_0_, 1024}};
+ ColumnFamilyOptions cf_options;
+ cf_options.cf_paths = {{cf_path_0_, 1024}};
+
+ ASSERT_OK(env_->CreateDirIfMissing(dbname_));
+
+ DB* db0;
+ ASSERT_OK(DB::Open(options, dbname_ + "/db0", &db0));
+ ASSERT_EQ(1, cache_->Size());
+ ASSERT_TRUE(cache_->Contains(data_path_0_));
+ ASSERT_EQ(1, cache_->GetRefCount(data_path_0_));
+
+ ColumnFamilyHandle* cf0;
+ ASSERT_OK(db0->CreateColumnFamily(cf_options, "cf", &cf0));
+ ASSERT_EQ(2, cache_->Size());
+ ASSERT_TRUE(cache_->Contains(data_path_0_));
+ ASSERT_EQ(1, cache_->GetRefCount(data_path_0_));
+ ASSERT_TRUE(cache_->Contains(cf_path_0_));
+ ASSERT_EQ(1, cache_->GetRefCount(cf_path_0_));
+
+ DB* db1;
+ ASSERT_OK(DB::Open(options, dbname_ + "/db1", &db1));
+ ASSERT_EQ(2, cache_->Size());
+ ASSERT_TRUE(cache_->Contains(data_path_0_));
+ ASSERT_EQ(2, cache_->GetRefCount(data_path_0_));
+ ASSERT_TRUE(cache_->Contains(cf_path_0_));
+ ASSERT_EQ(1, cache_->GetRefCount(cf_path_0_));
+
+ ColumnFamilyHandle* cf1;
+ ASSERT_OK(db1->CreateColumnFamily(cf_options, "cf", &cf1));
+ ASSERT_EQ(2, cache_->Size());
+ ASSERT_TRUE(cache_->Contains(data_path_0_));
+ ASSERT_EQ(2, cache_->GetRefCount(data_path_0_));
+ ASSERT_TRUE(cache_->Contains(cf_path_0_));
+ ASSERT_EQ(2, cache_->GetRefCount(cf_path_0_));
+
+ ASSERT_OK(db0->DestroyColumnFamilyHandle(cf0));
+ delete db0;
+ ASSERT_EQ(2, cache_->Size());
+ ASSERT_TRUE(cache_->Contains(data_path_0_));
+ ASSERT_EQ(1, cache_->GetRefCount(data_path_0_));
+ ASSERT_TRUE(cache_->Contains(cf_path_0_));
+ ASSERT_EQ(1, cache_->GetRefCount(cf_path_0_));
+ ASSERT_OK(DestroyDB(dbname_ + "/db0", options, {{"cf", cf_options}}));
+
+ ASSERT_OK(db1->DestroyColumnFamilyHandle(cf1));
+ delete db1;
+ ASSERT_EQ(0, cache_->Size());
+ ASSERT_OK(DestroyDB(dbname_ + "/db1", options, {{"cf", cf_options}}));
+}
+
+} // namespace ROCKSDB_NAMESPACE
+#endif // OS_LINUX
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_memtable_test.cc b/src/rocksdb/db/db_memtable_test.cc
new file mode 100644
index 000000000..cae592db3
--- /dev/null
+++ b/src/rocksdb/db/db_memtable_test.cc
@@ -0,0 +1,344 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include <memory>
+#include <string>
+
+#include "db/db_test_util.h"
+#include "db/memtable.h"
+#include "db/range_del_aggregator.h"
+#include "port/stack_trace.h"
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/slice_transform.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBMemTableTest : public DBTestBase {
+ public:
+ DBMemTableTest() : DBTestBase("db_memtable_test", /*env_do_fsync=*/true) {}
+};
+
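+// Wraps a real memtable rep (a skiplist in these tests) and records the hint
+// pointers passed through InsertWithHint so tests can verify that hints are
+// reused across inserts sharing a prefix.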
+class MockMemTableRep : public MemTableRep {
+ public:
+ explicit MockMemTableRep(Allocator* allocator, MemTableRep* rep)
+ : MemTableRep(allocator), rep_(rep), num_insert_with_hint_(0) {}
+
+ KeyHandle Allocate(const size_t len, char** buf) override {
+ return rep_->Allocate(len, buf);
+ }
+
+ void Insert(KeyHandle handle) override { rep_->Insert(handle); }
+
+ void InsertWithHint(KeyHandle handle, void** hint) override {
+ num_insert_with_hint_++;
+ EXPECT_NE(nullptr, hint);
+ last_hint_in_ = *hint;
+ rep_->InsertWithHint(handle, hint);
+ last_hint_out_ = *hint;
+ }
+
+ bool Contains(const char* key) const override { return rep_->Contains(key); }
+
+ void Get(const LookupKey& k, void* callback_args,
+ bool (*callback_func)(void* arg, const char* entry)) override {
+ rep_->Get(k, callback_args, callback_func);
+ }
+
+ size_t ApproximateMemoryUsage() override {
+ return rep_->ApproximateMemoryUsage();
+ }
+
+ Iterator* GetIterator(Arena* arena) override {
+ return rep_->GetIterator(arena);
+ }
+
+ void* last_hint_in() { return last_hint_in_; }
+ void* last_hint_out() { return last_hint_out_; }
+ int num_insert_with_hint() { return num_insert_with_hint_; }
+
+ private:
+ std::unique_ptr<MemTableRep> rep_;
+ void* last_hint_in_;
+ void* last_hint_out_;
+ int num_insert_with_hint_;
+};
+
+class MockMemTableRepFactory : public MemTableRepFactory {
+ public:
+ MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator& cmp,
+ Allocator* allocator,
+ const SliceTransform* transform,
+ Logger* logger) override {
+ SkipListFactory factory;
+ MemTableRep* skiplist_rep =
+ factory.CreateMemTableRep(cmp, allocator, transform, logger);
+ mock_rep_ = new MockMemTableRep(allocator, skiplist_rep);
+ return mock_rep_;
+ }
+
+ MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator& cmp,
+ Allocator* allocator,
+ const SliceTransform* transform,
+ Logger* logger,
+ uint32_t column_family_id) override {
+ last_column_family_id_ = column_family_id;
+ return CreateMemTableRep(cmp, allocator, transform, logger);
+ }
+
+ const char* Name() const override { return "MockMemTableRepFactory"; }
+
+ MockMemTableRep* rep() { return mock_rep_; }
+
+ bool IsInsertConcurrentlySupported() const override { return false; }
+
+ uint32_t GetLastColumnFamilyId() { return last_column_family_id_; }
+
+ private:
+ MockMemTableRep* mock_rep_;
+  // Initialized to the maximum uint32_t value as an "unset" sentinel.
+ uint32_t last_column_family_id_ = static_cast<uint32_t>(-1);
+};
+
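+// A prefix extractor that treats everything up to and including the first '_'
+// as the prefix; keys without a '_' are outside the prefix domain.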
+class TestPrefixExtractor : public SliceTransform {
+ public:
+ const char* Name() const override { return "TestPrefixExtractor"; }
+
+ Slice Transform(const Slice& key) const override {
+ const char* p = separator(key);
+ if (p == nullptr) {
+ return Slice();
+ }
+ return Slice(key.data(), p - key.data() + 1);
+ }
+
+ bool InDomain(const Slice& key) const override {
+ return separator(key) != nullptr;
+ }
+
+ bool InRange(const Slice& /*key*/) const override { return false; }
+
+ private:
+ const char* separator(const Slice& key) const {
+ return reinterpret_cast<const char*>(memchr(key.data(), '_', key.size()));
+ }
+};
+
+// Test that MemTable::Add properly returns Status::TryAgain when inserting
+// duplicate keys
+TEST_F(DBMemTableTest, DuplicateSeq) {
+ SequenceNumber seq = 123;
+ std::string value;
+ MergeContext merge_context;
+ Options options;
+ InternalKeyComparator ikey_cmp(options.comparator);
+ ReadRangeDelAggregator range_del_agg(&ikey_cmp,
+ kMaxSequenceNumber /* upper_bound */);
+
+ // Create a MemTable
+ InternalKeyComparator cmp(BytewiseComparator());
+ auto factory = std::make_shared<SkipListFactory>();
+ options.memtable_factory = factory;
+ ImmutableOptions ioptions(options);
+ WriteBufferManager wb(options.db_write_buffer_size);
+ MemTable* mem = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb,
+ kMaxSequenceNumber, 0 /* column_family_id */);
+
+  // Write some keys and make sure duplicates return Status::TryAgain
+ ASSERT_OK(
+ mem->Add(seq, kTypeValue, "key", "value2", nullptr /* kv_prot_info */));
+ ASSERT_TRUE(
+ mem->Add(seq, kTypeValue, "key", "value2", nullptr /* kv_prot_info */)
+ .IsTryAgain());
+  // Changing the type should still cause the duplicate key error
+ ASSERT_TRUE(
+ mem->Add(seq, kTypeMerge, "key", "value2", nullptr /* kv_prot_info */)
+ .IsTryAgain());
+ // Changing the seq number will make the key fresh
+ ASSERT_OK(mem->Add(seq + 1, kTypeMerge, "key", "value2",
+ nullptr /* kv_prot_info */));
+ // Test with different types for duplicate keys
+ ASSERT_TRUE(
+ mem->Add(seq, kTypeDeletion, "key", "", nullptr /* kv_prot_info */)
+ .IsTryAgain());
+ ASSERT_TRUE(
+ mem->Add(seq, kTypeSingleDeletion, "key", "", nullptr /* kv_prot_info */)
+ .IsTryAgain());
+
+ // Test the duplicate keys under stress
+ for (int i = 0; i < 10000; i++) {
+ bool insert_dup = i % 10 == 1;
+ if (!insert_dup) {
+ seq++;
+ }
+ Status s = mem->Add(seq, kTypeValue, "foo", "value" + std::to_string(seq),
+ nullptr /* kv_prot_info */);
+ if (insert_dup) {
+ ASSERT_TRUE(s.IsTryAgain());
+ } else {
+ ASSERT_OK(s);
+ }
+ }
+ delete mem;
+
+ // Test with InsertWithHint
+ options.memtable_insert_with_hint_prefix_extractor.reset(
+ new TestPrefixExtractor()); // which uses _ to extract the prefix
+ ioptions = ImmutableOptions(options);
+ mem = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb,
+ kMaxSequenceNumber, 0 /* column_family_id */);
+ // Insert a duplicate key with _ in it
+ ASSERT_OK(
+ mem->Add(seq, kTypeValue, "key_1", "value", nullptr /* kv_prot_info */));
+ ASSERT_TRUE(
+ mem->Add(seq, kTypeValue, "key_1", "value", nullptr /* kv_prot_info */)
+ .IsTryAgain());
+ delete mem;
+
+ // Test when InsertConcurrently will be invoked
+ options.allow_concurrent_memtable_write = true;
+ ioptions = ImmutableOptions(options);
+ mem = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb,
+ kMaxSequenceNumber, 0 /* column_family_id */);
+ MemTablePostProcessInfo post_process_info;
+ ASSERT_OK(mem->Add(seq, kTypeValue, "key", "value",
+ nullptr /* kv_prot_info */, true, &post_process_info));
+ ASSERT_TRUE(mem->Add(seq, kTypeValue, "key", "value",
+ nullptr /* kv_prot_info */, true, &post_process_info)
+ .IsTryAgain());
+ delete mem;
+}
+
+// A simple test to verify that concurrent merge writes are functional
+TEST_F(DBMemTableTest, ConcurrentMergeWrite) {
+ int num_ops = 1000;
+ std::string value;
+ MergeContext merge_context;
+ Options options;
+ // A merge operator that is not sensitive to concurrent writes since in this
+ // test we don't order the writes.
+ options.merge_operator = MergeOperators::CreateUInt64AddOperator();
+
+ // Create a MemTable
+ InternalKeyComparator cmp(BytewiseComparator());
+ auto factory = std::make_shared<SkipListFactory>();
+ options.memtable_factory = factory;
+ options.allow_concurrent_memtable_write = true;
+ ImmutableOptions ioptions(options);
+ WriteBufferManager wb(options.db_write_buffer_size);
+ MemTable* mem = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb,
+ kMaxSequenceNumber, 0 /* column_family_id */);
+
+ // Put 0 as the base
+ PutFixed64(&value, static_cast<uint64_t>(0));
+ ASSERT_OK(mem->Add(0, kTypeValue, "key", value, nullptr /* kv_prot_info */));
+ value.clear();
+
+ // Write Merge concurrently
+ ROCKSDB_NAMESPACE::port::Thread write_thread1([&]() {
+ MemTablePostProcessInfo post_process_info1;
+ std::string v1;
+ for (int seq = 1; seq < num_ops / 2; seq++) {
+ PutFixed64(&v1, seq);
+ ASSERT_OK(mem->Add(seq, kTypeMerge, "key", v1, nullptr /* kv_prot_info */,
+ true, &post_process_info1));
+ v1.clear();
+ }
+ });
+ ROCKSDB_NAMESPACE::port::Thread write_thread2([&]() {
+ MemTablePostProcessInfo post_process_info2;
+ std::string v2;
+ for (int seq = num_ops / 2; seq < num_ops; seq++) {
+ PutFixed64(&v2, seq);
+ ASSERT_OK(mem->Add(seq, kTypeMerge, "key", v2, nullptr /* kv_prot_info */,
+ true, &post_process_info2));
+ v2.clear();
+ }
+ });
+ write_thread1.join();
+ write_thread2.join();
+
+ Status status;
+ ReadOptions roptions;
+ SequenceNumber max_covering_tombstone_seq = 0;
+ LookupKey lkey("key", kMaxSequenceNumber);
+ bool res = mem->Get(lkey, &value, /*columns=*/nullptr, /*timestamp=*/nullptr,
+ &status, &merge_context, &max_covering_tombstone_seq,
+ roptions, false /* immutable_memtable */);
+ ASSERT_OK(status);
+ ASSERT_TRUE(res);
+ uint64_t ivalue = DecodeFixed64(Slice(value).data());
+ uint64_t sum = 0;
+ for (int seq = 0; seq < num_ops; seq++) {
+ sum += seq;
+ }
+ ASSERT_EQ(ivalue, sum);
+
+ delete mem;
+}
+
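+// Verifies that insert hints are tracked per prefix: consecutive keys sharing
+// a prefix reuse the same hint, a new prefix starts with a null hint, and keys
+// outside the prefix domain bypass InsertWithHint entirely.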
+TEST_F(DBMemTableTest, InsertWithHint) {
+ Options options;
+ options.allow_concurrent_memtable_write = false;
+ options.create_if_missing = true;
+ options.memtable_factory.reset(new MockMemTableRepFactory());
+ options.memtable_insert_with_hint_prefix_extractor.reset(
+ new TestPrefixExtractor());
+ options.env = env_;
+ Reopen(options);
+ MockMemTableRep* rep =
+ reinterpret_cast<MockMemTableRepFactory*>(options.memtable_factory.get())
+ ->rep();
+ ASSERT_OK(Put("foo_k1", "foo_v1"));
+ ASSERT_EQ(nullptr, rep->last_hint_in());
+ void* hint_foo = rep->last_hint_out();
+ ASSERT_OK(Put("foo_k2", "foo_v2"));
+ ASSERT_EQ(hint_foo, rep->last_hint_in());
+ ASSERT_EQ(hint_foo, rep->last_hint_out());
+ ASSERT_OK(Put("foo_k3", "foo_v3"));
+ ASSERT_EQ(hint_foo, rep->last_hint_in());
+ ASSERT_EQ(hint_foo, rep->last_hint_out());
+ ASSERT_OK(Put("bar_k1", "bar_v1"));
+ ASSERT_EQ(nullptr, rep->last_hint_in());
+ void* hint_bar = rep->last_hint_out();
+ ASSERT_NE(hint_foo, hint_bar);
+ ASSERT_OK(Put("bar_k2", "bar_v2"));
+ ASSERT_EQ(hint_bar, rep->last_hint_in());
+ ASSERT_EQ(hint_bar, rep->last_hint_out());
+ ASSERT_EQ(5, rep->num_insert_with_hint());
+ ASSERT_OK(Put("NotInPrefixDomain", "vvv"));
+ ASSERT_EQ(5, rep->num_insert_with_hint());
+ ASSERT_EQ("foo_v1", Get("foo_k1"));
+ ASSERT_EQ("foo_v2", Get("foo_k2"));
+ ASSERT_EQ("foo_v3", Get("foo_k3"));
+ ASSERT_EQ("bar_v1", Get("bar_k1"));
+ ASSERT_EQ("bar_v2", Get("bar_k2"));
+ ASSERT_EQ("vvv", Get("NotInPrefixDomain"));
+}
+
+TEST_F(DBMemTableTest, ColumnFamilyId) {
+ // Verifies MemTableRepFactory is told the right column family id.
+ Options options;
+ options.env = CurrentOptions().env;
+ options.allow_concurrent_memtable_write = false;
+ options.create_if_missing = true;
+ options.memtable_factory.reset(new MockMemTableRepFactory());
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ for (uint32_t cf = 0; cf < 2; ++cf) {
+ ASSERT_OK(Put(cf, "key", "val"));
+ ASSERT_OK(Flush(cf));
+ ASSERT_EQ(
+ cf, static_cast<MockMemTableRepFactory*>(options.memtable_factory.get())
+ ->GetLastColumnFamilyId());
+ }
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_merge_operand_test.cc b/src/rocksdb/db/db_merge_operand_test.cc
new file mode 100644
index 000000000..cbec37138
--- /dev/null
+++ b/src/rocksdb/db/db_merge_operand_test.cc
@@ -0,0 +1,448 @@
+// Copyright (c) 2018-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/db_test_util.h"
+#include "port/stack_trace.h"
+#include "rocksdb/perf_context.h"
+#include "rocksdb/utilities/debug.h"
+#include "table/block_based/block_builder.h"
+#if !defined(ROCKSDB_LITE)
+#include "test_util/sync_point.h"
+#endif
+#include "rocksdb/merge_operator.h"
+#include "utilities/fault_injection_env.h"
+#include "utilities/merge_operators.h"
+#include "utilities/merge_operators/sortlist.h"
+#include "utilities/merge_operators/string_append/stringappend2.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
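+// A string-append merge operator whose ShouldMerge() returns true once at
+// least `limit` operands have accumulated; the tests below use it to work
+// with only the latest operands.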
+class LimitedStringAppendMergeOp : public StringAppendTESTOperator {
+ public:
+ LimitedStringAppendMergeOp(int limit, char delim)
+ : StringAppendTESTOperator(delim), limit_(limit) {}
+
+ const char* Name() const override {
+ return "DBMergeOperatorTest::LimitedStringAppendMergeOp";
+ }
+
+ bool ShouldMerge(const std::vector<Slice>& operands) const override {
+ if (operands.size() > 0 && limit_ > 0 && operands.size() >= limit_) {
+ return true;
+ }
+ return false;
+ }
+
+ private:
+ size_t limit_ = 0;
+};
+} // anonymous namespace
+
+class DBMergeOperandTest : public DBTestBase {
+ public:
+ DBMergeOperandTest()
+ : DBTestBase("db_merge_operand_test", /*env_do_fsync=*/true) {}
+};
+
+TEST_F(DBMergeOperandTest, CacheEvictedMergeOperandReadAfterFreeBug) {
+  // There was a bug where merge operands were read after being mistakenly
+  // freed in DB::GetMergeOperands, which is surfaced when the cache is full.
+ // See PR#9507 for more.
+ Options options;
+ options.create_if_missing = true;
+ options.merge_operator = MergeOperators::CreateStringAppendOperator();
+ options.env = env_;
+ BlockBasedTableOptions table_options;
+
+ // Small cache to simulate cache full
+ table_options.block_cache = NewLRUCache(1);
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ Reopen(options);
+ int num_records = 4;
+ int number_of_operands = 0;
+ std::vector<PinnableSlice> values(num_records);
+ GetMergeOperandsOptions merge_operands_info;
+ merge_operands_info.expected_max_number_of_operands = num_records;
+
+ ASSERT_OK(Merge("k1", "v1"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Merge("k1", "v2"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Merge("k1", "v3"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Merge("k1", "v4"));
+
+ ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(),
+ "k1", values.data(), &merge_operands_info,
+ &number_of_operands));
+ ASSERT_EQ(number_of_operands, 4);
+ ASSERT_EQ(values[0].ToString(), "v1");
+ ASSERT_EQ(values[1].ToString(), "v2");
+ ASSERT_EQ(values[2].ToString(), "v3");
+ ASSERT_EQ(values[3].ToString(), "v4");
+}
+
+TEST_F(DBMergeOperandTest, FlushedMergeOperandReadAfterFreeBug) {
+ // Repro for a bug where a memtable containing a merge operand could be
+ // deleted before the merge operand was saved to the result.
+ auto options = CurrentOptions();
+ options.merge_operator = MergeOperators::CreateStringAppendOperator();
+ Reopen(options);
+
+ ASSERT_OK(Merge("key", "value"));
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::GetImpl:PostMemTableGet:0",
+ "DBMergeOperandTest::FlushedMergeOperandReadAfterFreeBug:PreFlush"},
+ {"DBMergeOperandTest::FlushedMergeOperandReadAfterFreeBug:PostFlush",
+ "DBImpl::GetImpl:PostMemTableGet:1"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ auto flush_thread = port::Thread([&]() {
+ TEST_SYNC_POINT(
+ "DBMergeOperandTest::FlushedMergeOperandReadAfterFreeBug:PreFlush");
+ ASSERT_OK(Flush());
+ TEST_SYNC_POINT(
+ "DBMergeOperandTest::FlushedMergeOperandReadAfterFreeBug:PostFlush");
+ });
+
+ PinnableSlice value;
+ GetMergeOperandsOptions merge_operands_info;
+ merge_operands_info.expected_max_number_of_operands = 1;
+ int number_of_operands;
+ ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(),
+ "key", &value, &merge_operands_info,
+ &number_of_operands));
+ ASSERT_EQ(1, number_of_operands);
+
+ flush_thread.join();
+}
+
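+// Exercises GetMergeOperands with operands spread across the memtable, single
+// L0 files, multiple levels, and an immutable memtable, with Put and Delete
+// bases interleaved among the merges.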
+TEST_F(DBMergeOperandTest, GetMergeOperandsBasic) {
+ Options options;
+ options.create_if_missing = true;
+ // Use only the latest two merge operands.
+ options.merge_operator = std::make_shared<LimitedStringAppendMergeOp>(2, ',');
+ options.env = env_;
+ Reopen(options);
+ int num_records = 4;
+ int number_of_operands = 0;
+ std::vector<PinnableSlice> values(num_records);
+ GetMergeOperandsOptions merge_operands_info;
+ merge_operands_info.expected_max_number_of_operands = num_records;
+
+ // k0 value in memtable
+ ASSERT_OK(Put("k0", "PutARock"));
+ ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(),
+ "k0", values.data(), &merge_operands_info,
+ &number_of_operands));
+ ASSERT_EQ(values[0], "PutARock");
+
+ // k0.1 value in SST
+ ASSERT_OK(Put("k0.1", "RockInSST"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(),
+ "k0.1", values.data(), &merge_operands_info,
+ &number_of_operands));
+ ASSERT_EQ(values[0], "RockInSST");
+
+ // All k1 values are in memtable.
+ ASSERT_OK(Merge("k1", "a"));
+ ASSERT_OK(Put("k1", "x"));
+ ASSERT_OK(Merge("k1", "b"));
+ ASSERT_OK(Merge("k1", "c"));
+ ASSERT_OK(Merge("k1", "d"));
+ ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(),
+ "k1", values.data(), &merge_operands_info,
+ &number_of_operands));
+ ASSERT_EQ(values[0], "x");
+ ASSERT_EQ(values[1], "b");
+ ASSERT_EQ(values[2], "c");
+ ASSERT_EQ(values[3], "d");
+
+ // expected_max_number_of_operands is less than number of merge operands so
+ // status should be Incomplete.
+ merge_operands_info.expected_max_number_of_operands = num_records - 1;
+ Status status = db_->GetMergeOperands(
+ ReadOptions(), db_->DefaultColumnFamily(), "k1", values.data(),
+ &merge_operands_info, &number_of_operands);
+ ASSERT_EQ(status.IsIncomplete(), true);
+ merge_operands_info.expected_max_number_of_operands = num_records;
+
+ // All k1.1 values are in memtable.
+ ASSERT_OK(Merge("k1.1", "r"));
+ ASSERT_OK(Delete("k1.1"));
+ ASSERT_OK(Merge("k1.1", "c"));
+ ASSERT_OK(Merge("k1.1", "k"));
+ ASSERT_OK(Merge("k1.1", "s"));
+ ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(),
+ "k1.1", values.data(), &merge_operands_info,
+ &number_of_operands));
+ ASSERT_EQ(values[0], "c");
+ ASSERT_EQ(values[1], "k");
+ ASSERT_EQ(values[2], "s");
+
+ // All k2 values are flushed to L0 into a single file.
+ ASSERT_OK(Merge("k2", "q"));
+ ASSERT_OK(Merge("k2", "w"));
+ ASSERT_OK(Merge("k2", "e"));
+ ASSERT_OK(Merge("k2", "r"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(),
+ "k2", values.data(), &merge_operands_info,
+ &number_of_operands));
+ ASSERT_EQ(values[0], "q");
+ ASSERT_EQ(values[1], "w");
+ ASSERT_EQ(values[2], "e");
+ ASSERT_EQ(values[3], "r");
+
+ // All k2.1 values are flushed to L0 into a single file.
+ ASSERT_OK(Merge("k2.1", "m"));
+ ASSERT_OK(Put("k2.1", "l"));
+ ASSERT_OK(Merge("k2.1", "n"));
+ ASSERT_OK(Merge("k2.1", "o"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(),
+ "k2.1", values.data(), &merge_operands_info,
+ &number_of_operands));
+ ASSERT_EQ(values[0], "l,n,o");
+
+ // All k2.2 values are flushed to L0 into a single file.
+ ASSERT_OK(Merge("k2.2", "g"));
+ ASSERT_OK(Delete("k2.2"));
+ ASSERT_OK(Merge("k2.2", "o"));
+ ASSERT_OK(Merge("k2.2", "t"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(),
+ "k2.2", values.data(), &merge_operands_info,
+ &number_of_operands));
+ ASSERT_EQ(values[0], "o,t");
+
+ // Do some compaction that will make the following tests more predictable
+ // Slice start("PutARock");
+ // Slice end("t");
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+ // All k3 values are flushed and are in different files.
+ ASSERT_OK(Merge("k3", "ab"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Merge("k3", "bc"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Merge("k3", "cd"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Merge("k3", "de"));
+ ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(),
+ "k3", values.data(), &merge_operands_info,
+ &number_of_operands));
+ ASSERT_EQ(values[0], "ab");
+ ASSERT_EQ(values[1], "bc");
+ ASSERT_EQ(values[2], "cd");
+ ASSERT_EQ(values[3], "de");
+
+ // All k3.1 values are flushed and are in different files.
+ ASSERT_OK(Merge("k3.1", "ab"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("k3.1", "bc"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Merge("k3.1", "cd"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Merge("k3.1", "de"));
+ ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(),
+ "k3.1", values.data(), &merge_operands_info,
+ &number_of_operands));
+ ASSERT_EQ(values[0], "bc");
+ ASSERT_EQ(values[1], "cd");
+ ASSERT_EQ(values[2], "de");
+
+ // All k3.2 values are flushed and are in different files.
+ ASSERT_OK(Merge("k3.2", "ab"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Delete("k3.2"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Merge("k3.2", "cd"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Merge("k3.2", "de"));
+ ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(),
+ "k3.2", values.data(), &merge_operands_info,
+ &number_of_operands));
+ ASSERT_EQ(values[0], "cd");
+ ASSERT_EQ(values[1], "de");
+
+ // All K4 values are in different levels
+ ASSERT_OK(Merge("k4", "ba"));
+ ASSERT_OK(Flush());
+ MoveFilesToLevel(4);
+ ASSERT_OK(Merge("k4", "cb"));
+ ASSERT_OK(Flush());
+ MoveFilesToLevel(3);
+ ASSERT_OK(Merge("k4", "dc"));
+ ASSERT_OK(Flush());
+ MoveFilesToLevel(1);
+ ASSERT_OK(Merge("k4", "ed"));
+ ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(),
+ "k4", values.data(), &merge_operands_info,
+ &number_of_operands));
+ ASSERT_EQ(values[0], "ba");
+ ASSERT_EQ(values[1], "cb");
+ ASSERT_EQ(values[2], "dc");
+ ASSERT_EQ(values[3], "ed");
+
+ // First 3 k5 values are in SST and next 4 k5 values are in Immutable
+ // Memtable
+ ASSERT_OK(Merge("k5", "who"));
+ ASSERT_OK(Merge("k5", "am"));
+ ASSERT_OK(Merge("k5", "i"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("k5", "remember"));
+ ASSERT_OK(Merge("k5", "i"));
+ ASSERT_OK(Merge("k5", "am"));
+ ASSERT_OK(Merge("k5", "rocks"));
+ ASSERT_OK(dbfull()->TEST_SwitchMemtable());
+ ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(),
+ "k5", values.data(), &merge_operands_info,
+ &number_of_operands));
+ ASSERT_EQ(values[0], "remember");
+ ASSERT_EQ(values[1], "i");
+ ASSERT_EQ(values[2], "am");
+}
+
+TEST_F(DBMergeOperandTest, BlobDBGetMergeOperandsBasic) {
+ Options options;
+ options.create_if_missing = true;
+ options.enable_blob_files = true;
+ options.min_blob_size = 0;
+ // Use only the latest two merge operands.
+ options.merge_operator = std::make_shared<LimitedStringAppendMergeOp>(2, ',');
+ options.env = env_;
+ Reopen(options);
+ int num_records = 4;
+ int number_of_operands = 0;
+ std::vector<PinnableSlice> values(num_records);
+ GetMergeOperandsOptions merge_operands_info;
+ merge_operands_info.expected_max_number_of_operands = num_records;
+
+ // All k1 values are in memtable.
+ ASSERT_OK(Put("k1", "x"));
+ ASSERT_OK(Merge("k1", "b"));
+ ASSERT_OK(Merge("k1", "c"));
+ ASSERT_OK(Merge("k1", "d"));
+ ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(),
+ "k1", values.data(), &merge_operands_info,
+ &number_of_operands));
+ ASSERT_EQ(values[0], "x");
+ ASSERT_EQ(values[1], "b");
+ ASSERT_EQ(values[2], "c");
+ ASSERT_EQ(values[3], "d");
+
+  // expected_max_number_of_operands is less than the number of merge operands,
+  // so the status should be Incomplete.
+ merge_operands_info.expected_max_number_of_operands = num_records - 1;
+ Status status = db_->GetMergeOperands(
+ ReadOptions(), db_->DefaultColumnFamily(), "k1", values.data(),
+ &merge_operands_info, &number_of_operands);
+ ASSERT_EQ(status.IsIncomplete(), true);
+ merge_operands_info.expected_max_number_of_operands = num_records;
+
+ // All k2 values are flushed to L0 into a single file.
+ ASSERT_OK(Put("k2", "q"));
+ ASSERT_OK(Merge("k2", "w"));
+ ASSERT_OK(Merge("k2", "e"));
+ ASSERT_OK(Merge("k2", "r"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(),
+ "k2", values.data(), &merge_operands_info,
+ &number_of_operands));
+ ASSERT_EQ(values[0], "q,w,e,r");
+
+ // Do some compaction that will make the following tests more predictable
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+ // All k3 values are flushed and are in different files.
+ ASSERT_OK(Put("k3", "ab"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Merge("k3", "bc"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Merge("k3", "cd"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Merge("k3", "de"));
+ ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(),
+ "k3", values.data(), &merge_operands_info,
+ &number_of_operands));
+ ASSERT_EQ(values[0], "ab");
+ ASSERT_EQ(values[1], "bc");
+ ASSERT_EQ(values[2], "cd");
+ ASSERT_EQ(values[3], "de");
+
+ // All K4 values are in different levels
+ ASSERT_OK(Put("k4", "ba"));
+ ASSERT_OK(Flush());
+ MoveFilesToLevel(4);
+ ASSERT_OK(Merge("k4", "cb"));
+ ASSERT_OK(Flush());
+ MoveFilesToLevel(3);
+ ASSERT_OK(Merge("k4", "dc"));
+ ASSERT_OK(Flush());
+ MoveFilesToLevel(1);
+ ASSERT_OK(Merge("k4", "ed"));
+ ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(),
+ "k4", values.data(), &merge_operands_info,
+ &number_of_operands));
+ ASSERT_EQ(values[0], "ba");
+ ASSERT_EQ(values[1], "cb");
+ ASSERT_EQ(values[2], "dc");
+ ASSERT_EQ(values[3], "ed");
+}
+
+TEST_F(DBMergeOperandTest, GetMergeOperandsLargeResultOptimization) {
+ // These constants are chosen to trigger the large result optimization
+ // (pinning a bundle of `DBImpl` resources).
+ const int kNumOperands = 1024;
+ const int kOperandLen = 1024;
+
+ Options options;
+ options.create_if_missing = true;
+ options.merge_operator = MergeOperators::CreateStringAppendOperator();
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ std::vector<std::string> expected_merge_operands;
+ expected_merge_operands.reserve(kNumOperands);
+ for (int i = 0; i < kNumOperands; ++i) {
+ expected_merge_operands.emplace_back(rnd.RandomString(kOperandLen));
+ ASSERT_OK(Merge("key", expected_merge_operands.back()));
+ }
+
+ std::vector<PinnableSlice> merge_operands(kNumOperands);
+ GetMergeOperandsOptions merge_operands_info;
+ merge_operands_info.expected_max_number_of_operands = kNumOperands;
+ int num_merge_operands = 0;
+ ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(),
+ "key", merge_operands.data(),
+ &merge_operands_info, &num_merge_operands));
+ ASSERT_EQ(num_merge_operands, kNumOperands);
+
+ // Ensures the large result optimization was used.
+ for (int i = 0; i < kNumOperands; ++i) {
+ ASSERT_TRUE(merge_operands[i].IsPinned());
+ }
+
+ // Add a Flush() to change the `SuperVersion` to challenge the resource
+ // pinning.
+ ASSERT_OK(Flush());
+
+ for (int i = 0; i < kNumOperands; ++i) {
+ ASSERT_EQ(expected_merge_operands[i], merge_operands[i]);
+ }
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_merge_operator_test.cc b/src/rocksdb/db/db_merge_operator_test.cc
new file mode 100644
index 000000000..7c5505bd1
--- /dev/null
+++ b/src/rocksdb/db/db_merge_operator_test.cc
@@ -0,0 +1,669 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#include <string>
+#include <vector>
+
+#include "db/db_test_util.h"
+#include "db/forward_iterator.h"
+#include "port/stack_trace.h"
+#include "rocksdb/merge_operator.h"
+#include "util/random.h"
+#include "utilities/merge_operators.h"
+#include "utilities/merge_operators/string_append/stringappend2.h"
+
+namespace ROCKSDB_NAMESPACE {
+
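+// A ReadCallback that defers visibility decisions to a SnapshotChecker: a
+// sequence number is considered visible only if the checker reports it as
+// being in the given snapshot.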
+class TestReadCallback : public ReadCallback {
+ public:
+ TestReadCallback(SnapshotChecker* snapshot_checker,
+ SequenceNumber snapshot_seq)
+ : ReadCallback(snapshot_seq),
+ snapshot_checker_(snapshot_checker),
+ snapshot_seq_(snapshot_seq) {}
+
+ bool IsVisibleFullCheck(SequenceNumber seq) override {
+ return snapshot_checker_->CheckInSnapshot(seq, snapshot_seq_) ==
+ SnapshotCheckerResult::kInSnapshot;
+ }
+
+ private:
+ SnapshotChecker* snapshot_checker_;
+ SequenceNumber snapshot_seq_;
+};
+
+// Test merge operator functionality.
+class DBMergeOperatorTest : public DBTestBase {
+ public:
+ DBMergeOperatorTest()
+ : DBTestBase("db_merge_operator_test", /*env_do_fsync=*/false) {}
+
+ std::string GetWithReadCallback(SnapshotChecker* snapshot_checker,
+ const Slice& key,
+ const Snapshot* snapshot = nullptr) {
+ SequenceNumber seq = snapshot == nullptr ? db_->GetLatestSequenceNumber()
+ : snapshot->GetSequenceNumber();
+ TestReadCallback read_callback(snapshot_checker, seq);
+ ReadOptions read_opt;
+ read_opt.snapshot = snapshot;
+ PinnableSlice value;
+ DBImpl::GetImplOptions get_impl_options;
+ get_impl_options.column_family = db_->DefaultColumnFamily();
+ get_impl_options.value = &value;
+ get_impl_options.callback = &read_callback;
+ Status s = dbfull()->GetImpl(read_opt, key, get_impl_options);
+ if (!s.ok()) {
+ return s.ToString();
+ }
+ return value.ToString();
+ }
+};
+
+TEST_F(DBMergeOperatorTest, LimitMergeOperands) {
+ class LimitedStringAppendMergeOp : public StringAppendTESTOperator {
+ public:
+ LimitedStringAppendMergeOp(int limit, char delim)
+ : StringAppendTESTOperator(delim), limit_(limit) {}
+
+ const char* Name() const override {
+ return "DBMergeOperatorTest::LimitedStringAppendMergeOp";
+ }
+
+ bool ShouldMerge(const std::vector<Slice>& operands) const override {
+ if (operands.size() > 0 && limit_ > 0 && operands.size() >= limit_) {
+ return true;
+ }
+ return false;
+ }
+
+ private:
+ size_t limit_ = 0;
+ };
+
+ Options options;
+ options.create_if_missing = true;
+ // Use only the latest two merge operands.
+ options.merge_operator = std::make_shared<LimitedStringAppendMergeOp>(2, ',');
+ options.env = env_;
+ Reopen(options);
+ // All K1 values are in memtable.
+ ASSERT_OK(Merge("k1", "a"));
+ ASSERT_OK(Merge("k1", "b"));
+ ASSERT_OK(Merge("k1", "c"));
+ ASSERT_OK(Merge("k1", "d"));
+ std::string value;
+ ASSERT_OK(db_->Get(ReadOptions(), "k1", &value));
+  // Make sure that only the latest two merge operands are used. If this were
+  // not the case, the value would be "a,b,c,d".
+ ASSERT_EQ(value, "c,d");
+
+ // All K2 values are flushed to L0 into a single file.
+ ASSERT_OK(Merge("k2", "a"));
+ ASSERT_OK(Merge("k2", "b"));
+ ASSERT_OK(Merge("k2", "c"));
+ ASSERT_OK(Merge("k2", "d"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->Get(ReadOptions(), "k2", &value));
+ ASSERT_EQ(value, "c,d");
+
+ // All K3 values are flushed and are in different files.
+ ASSERT_OK(Merge("k3", "ab"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Merge("k3", "bc"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Merge("k3", "cd"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Merge("k3", "de"));
+ ASSERT_OK(db_->Get(ReadOptions(), "k3", &value));
+ ASSERT_EQ(value, "cd,de");
+
+ // All K4 values are in different levels
+ ASSERT_OK(Merge("k4", "ab"));
+ ASSERT_OK(Flush());
+ MoveFilesToLevel(4);
+ ASSERT_OK(Merge("k4", "bc"));
+ ASSERT_OK(Flush());
+ MoveFilesToLevel(3);
+ ASSERT_OK(Merge("k4", "cd"));
+ ASSERT_OK(Flush());
+ MoveFilesToLevel(1);
+ ASSERT_OK(Merge("k4", "de"));
+ ASSERT_OK(db_->Get(ReadOptions(), "k4", &value));
+ ASSERT_EQ(value, "cd,de");
+}
+
+TEST_F(DBMergeOperatorTest, MergeErrorOnRead) {
+ Options options;
+ options.create_if_missing = true;
+ options.merge_operator.reset(new TestPutOperator());
+ options.env = env_;
+ Reopen(options);
+ ASSERT_OK(Merge("k1", "v1"));
+ ASSERT_OK(Merge("k1", "corrupted"));
+ std::string value;
+ ASSERT_TRUE(db_->Get(ReadOptions(), "k1", &value).IsCorruption());
+ VerifyDBInternal({{"k1", "corrupted"}, {"k1", "v1"}});
+}
+
+TEST_F(DBMergeOperatorTest, MergeErrorOnWrite) {
+ Options options;
+ options.create_if_missing = true;
+ options.merge_operator.reset(new TestPutOperator());
+ options.max_successive_merges = 3;
+ options.env = env_;
+ Reopen(options);
+ ASSERT_OK(Merge("k1", "v1"));
+ ASSERT_OK(Merge("k1", "v2"));
+ // Will trigger a merge when hitting max_successive_merges and the merge
+ // will fail. The delta will be inserted nevertheless.
+ ASSERT_OK(Merge("k1", "corrupted"));
+ // Data should stay unmerged after the error.
+ VerifyDBInternal({{"k1", "corrupted"}, {"k1", "v2"}, {"k1", "v1"}});
+}
+
+TEST_F(DBMergeOperatorTest, MergeErrorOnIteration) {
+ Options options;
+ options.create_if_missing = true;
+ options.merge_operator.reset(new TestPutOperator());
+ options.env = env_;
+
+ DestroyAndReopen(options);
+ ASSERT_OK(Merge("k1", "v1"));
+ ASSERT_OK(Merge("k1", "corrupted"));
+ ASSERT_OK(Put("k2", "v2"));
+ auto* iter = db_->NewIterator(ReadOptions());
+ iter->Seek("k1");
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_TRUE(iter->status().IsCorruption());
+ delete iter;
+ iter = db_->NewIterator(ReadOptions());
+ iter->Seek("k2");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ iter->Prev();
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_TRUE(iter->status().IsCorruption());
+ delete iter;
+ VerifyDBInternal({{"k1", "corrupted"}, {"k1", "v1"}, {"k2", "v2"}});
+
+ DestroyAndReopen(options);
+ ASSERT_OK(Merge("k1", "v1"));
+ ASSERT_OK(Put("k2", "v2"));
+ ASSERT_OK(Merge("k2", "corrupted"));
+ iter = db_->NewIterator(ReadOptions());
+ iter->Seek("k1");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ iter->Next();
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_TRUE(iter->status().IsCorruption());
+ delete iter;
+ VerifyDBInternal({{"k1", "v1"}, {"k2", "corrupted"}, {"k2", "v2"}});
+}
+
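+// Parameterized on whether the block cache is disabled, so operand pinning is
+// exercised both with and without block cache backing.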
+class MergeOperatorPinningTest : public DBMergeOperatorTest,
+ public testing::WithParamInterface<bool> {
+ public:
+ MergeOperatorPinningTest() { disable_block_cache_ = GetParam(); }
+
+ bool disable_block_cache_;
+};
+
+INSTANTIATE_TEST_CASE_P(MergeOperatorPinningTest, MergeOperatorPinningTest,
+ ::testing::Bool());
+
+#ifndef ROCKSDB_LITE
+TEST_P(MergeOperatorPinningTest, OperandsMultiBlocks) {
+ Options options = CurrentOptions();
+ BlockBasedTableOptions table_options;
+ table_options.block_size = 1; // every block will contain one entry
+ table_options.no_block_cache = disable_block_cache_;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ options.merge_operator = MergeOperators::CreateStringAppendTESTOperator();
+ options.level0_slowdown_writes_trigger = (1 << 30);
+ options.level0_stop_writes_trigger = (1 << 30);
+ options.disable_auto_compactions = true;
+ DestroyAndReopen(options);
+
+ const int kKeysPerFile = 10;
+ const int kOperandsPerKeyPerFile = 7;
+ const int kOperandSize = 100;
+  // Files to write in L0 before compacting to a lower level
+ const int kFilesPerLevel = 3;
+
+ Random rnd(301);
+ std::map<std::string, std::string> true_data;
+ int batch_num = 1;
+ int lvl_to_fill = 4;
+ int key_id = 0;
+ while (true) {
+ for (int j = 0; j < kKeysPerFile; j++) {
+ std::string key = Key(key_id % 35);
+ key_id++;
+ for (int k = 0; k < kOperandsPerKeyPerFile; k++) {
+ std::string val = rnd.RandomString(kOperandSize);
+ ASSERT_OK(db_->Merge(WriteOptions(), key, val));
+ if (true_data[key].size() == 0) {
+ true_data[key] = val;
+ } else {
+ true_data[key] += "," + val;
+ }
+ }
+ }
+
+ if (lvl_to_fill == -1) {
+ // Keep last batch in memtable and stop
+ break;
+ }
+
+ ASSERT_OK(Flush());
+ if (batch_num % kFilesPerLevel == 0) {
+ if (lvl_to_fill != 0) {
+ MoveFilesToLevel(lvl_to_fill);
+ }
+ lvl_to_fill--;
+ }
+ batch_num++;
+ }
+
+ // 3 L0 files
+ // 1 L1 file
+ // 3 L2 files
+ // 1 L3 file
+ // 3 L4 Files
+ ASSERT_EQ(FilesPerLevel(), "3,1,3,1,3");
+
+ VerifyDBFromMap(true_data);
+}
+
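+// Wraps another MergeOperator and invokes user-supplied callbacks immediately
+// before and after each FullMergeV2() call, so tests can inject work (e.g.
+// cache eviction) around the merge.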
+class MergeOperatorHook : public MergeOperator {
+ public:
+ explicit MergeOperatorHook(std::shared_ptr<MergeOperator> _merge_op)
+ : merge_op_(_merge_op) {}
+
+ bool FullMergeV2(const MergeOperationInput& merge_in,
+ MergeOperationOutput* merge_out) const override {
+ before_merge_();
+ bool res = merge_op_->FullMergeV2(merge_in, merge_out);
+ after_merge_();
+ return res;
+ }
+
+ const char* Name() const override { return merge_op_->Name(); }
+
+ std::shared_ptr<MergeOperator> merge_op_;
+ std::function<void()> before_merge_ = []() {};
+ std::function<void()> after_merge_ = []() {};
+};
+
+TEST_P(MergeOperatorPinningTest, EvictCacheBeforeMerge) {
+ Options options = CurrentOptions();
+
+ auto merge_hook =
+ std::make_shared<MergeOperatorHook>(MergeOperators::CreateMaxOperator());
+ options.merge_operator = merge_hook;
+ options.disable_auto_compactions = true;
+ options.level0_slowdown_writes_trigger = (1 << 30);
+ options.level0_stop_writes_trigger = (1 << 30);
+ options.max_open_files = 20;
+ BlockBasedTableOptions bbto;
+ bbto.no_block_cache = disable_block_cache_;
+ if (bbto.no_block_cache == false) {
+ bbto.block_cache = NewLRUCache(64 * 1024 * 1024);
+ } else {
+ bbto.block_cache = nullptr;
+ }
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ DestroyAndReopen(options);
+
+ const int kNumOperands = 30;
+ const int kNumKeys = 1000;
+ const int kOperandSize = 100;
+ Random rnd(301);
+
+  // 1000 keys; every key has 30 operands, and every operand is in a different
+  // file.
+ std::map<std::string, std::string> true_data;
+ for (int i = 0; i < kNumOperands; i++) {
+ for (int j = 0; j < kNumKeys; j++) {
+ std::string k = Key(j);
+ std::string v = rnd.RandomString(kOperandSize);
+ ASSERT_OK(db_->Merge(WriteOptions(), k, v));
+
+ true_data[k] = std::max(true_data[k], v);
+ }
+ ASSERT_OK(Flush());
+ }
+
+ std::vector<uint64_t> file_numbers = ListTableFiles(env_, dbname_);
+ ASSERT_EQ(file_numbers.size(), kNumOperands);
+ int merge_cnt = 0;
+
+ // Code executed before merge operation
+ merge_hook->before_merge_ = [&]() {
+ // Evict all tables from cache before every merge operation
+ auto* table_cache = dbfull()->TEST_table_cache();
+ for (uint64_t num : file_numbers) {
+ TableCache::Evict(table_cache, num);
+ }
+    // Decrease the cache capacity to force all unreferenced blocks to be
+    // evicted
+ if (bbto.block_cache) {
+ bbto.block_cache->SetCapacity(1);
+ }
+ merge_cnt++;
+ };
+
+ // Code executed after merge operation
+ merge_hook->after_merge_ = [&]() {
+ // Increase capacity again after doing the merge
+ if (bbto.block_cache) {
+ bbto.block_cache->SetCapacity(64 * 1024 * 1024);
+ }
+ };
+
+ size_t total_reads;
+ VerifyDBFromMap(true_data, &total_reads);
+ ASSERT_EQ(merge_cnt, total_reads);
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+ VerifyDBFromMap(true_data, &total_reads);
+}
+
+TEST_P(MergeOperatorPinningTest, TailingIterator) {
+ Options options = CurrentOptions();
+ options.merge_operator = MergeOperators::CreateMaxOperator();
+ BlockBasedTableOptions bbto;
+ bbto.no_block_cache = disable_block_cache_;
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ DestroyAndReopen(options);
+
+ const int kNumOperands = 100;
+ const int kNumWrites = 100000;
+
+ std::function<void()> writer_func = [&]() {
+ int k = 0;
+ for (int i = 0; i < kNumWrites; i++) {
+ ASSERT_OK(db_->Merge(WriteOptions(), Key(k), Key(k)));
+
+ if (i && i % kNumOperands == 0) {
+ k++;
+ }
+ if (i && i % 127 == 0) {
+ ASSERT_OK(Flush());
+ }
+ if (i && i % 317 == 0) {
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ }
+ }
+ };
+
+ std::function<void()> reader_func = [&]() {
+ ReadOptions ro;
+ ro.tailing = true;
+ Iterator* iter = db_->NewIterator(ro);
+ ASSERT_OK(iter->status());
+ iter->SeekToFirst();
+ for (int i = 0; i < (kNumWrites / kNumOperands); i++) {
+ while (!iter->Valid()) {
+ // wait for the key to be written
+ env_->SleepForMicroseconds(100);
+ iter->Seek(Key(i));
+ }
+ ASSERT_EQ(iter->key(), Key(i));
+ ASSERT_EQ(iter->value(), Key(i));
+
+ iter->Next();
+ }
+ ASSERT_OK(iter->status());
+
+ delete iter;
+ };
+
+ ROCKSDB_NAMESPACE::port::Thread writer_thread(writer_func);
+ ROCKSDB_NAMESPACE::port::Thread reader_thread(reader_func);
+
+ writer_thread.join();
+ reader_thread.join();
+}
+
+TEST_F(DBMergeOperatorTest, TailingIteratorMemtableUnrefedBySomeoneElse) {
+ Options options = CurrentOptions();
+ options.merge_operator = MergeOperators::CreateStringAppendOperator();
+ DestroyAndReopen(options);
+
+ // Overview of the test:
+ // * There are two merge operands for the same key: one in an sst file,
+ // another in a memtable.
+ // * Seek a tailing iterator to this key.
+ // * As part of the seek, the iterator will:
+ // (a) first visit the operand in the memtable and tell ForwardIterator
+ // to pin this operand, then
+ // (b) move on to the operand in the sst file, then pass both operands
+ // to merge operator.
+ // * The memtable may get flushed and unreferenced by another thread between
+ // (a) and (b). The test simulates it by flushing the memtable inside a
+ // SyncPoint callback located between (a) and (b).
+ // * In this case it's ForwardIterator's responsibility to keep the memtable
+ // pinned until (b) is complete. There used to be a bug causing
+ // ForwardIterator to not pin it in some circumstances. This test
+ // reproduces it.
+
+ ASSERT_OK(db_->Merge(WriteOptions(), "key", "sst"));
+ ASSERT_OK(db_->Flush(FlushOptions())); // Switch to SuperVersion A
+ ASSERT_OK(db_->Merge(WriteOptions(), "key", "memtable"));
+
+ // Pin SuperVersion A
+ std::unique_ptr<Iterator> someone_else(db_->NewIterator(ReadOptions()));
+ ASSERT_OK(someone_else->status());
+
+ bool pushed_first_operand = false;
+ bool stepped_to_next_operand = false;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBIter::MergeValuesNewToOld:PushedFirstOperand", [&](void*) {
+ EXPECT_FALSE(pushed_first_operand);
+ pushed_first_operand = true;
+ EXPECT_OK(db_->Flush(FlushOptions())); // Switch to SuperVersion B
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBIter::MergeValuesNewToOld:SteppedToNextOperand", [&](void*) {
+ EXPECT_FALSE(stepped_to_next_operand);
+ stepped_to_next_operand = true;
+ someone_else.reset(); // Unpin SuperVersion A
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ ReadOptions ro;
+ ro.tailing = true;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(ro));
+ iter->Seek("key");
+
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ EXPECT_EQ(std::string("sst,memtable"), iter->value().ToString());
+ EXPECT_TRUE(pushed_first_operand);
+ EXPECT_TRUE(stepped_to_next_operand);
+}
+#endif // ROCKSDB_LITE
+
+TEST_F(DBMergeOperatorTest, SnapshotCheckerAndReadCallback) {
+ Options options = CurrentOptions();
+ options.merge_operator = MergeOperators::CreateStringAppendOperator();
+ DestroyAndReopen(options);
+
+ class TestSnapshotChecker : public SnapshotChecker {
+ public:
+ SnapshotCheckerResult CheckInSnapshot(
+ SequenceNumber seq, SequenceNumber snapshot_seq) const override {
+ return IsInSnapshot(seq, snapshot_seq)
+ ? SnapshotCheckerResult::kInSnapshot
+ : SnapshotCheckerResult::kNotInSnapshot;
+ }
+
+ bool IsInSnapshot(SequenceNumber seq, SequenceNumber snapshot_seq) const {
+ switch (snapshot_seq) {
+ case 0:
+ return seq == 0;
+ case 1:
+ return seq <= 1;
+ case 2:
+ // seq = 2 not visible to snapshot with seq = 2
+ return seq <= 1;
+ case 3:
+ return seq <= 3;
+ case 4:
+          // seq = 4 not visible to snapshot with seq = 4
+ return seq <= 3;
+ default:
+          // seq >= 5 is uncommitted
+ return seq <= 4;
+ };
+ }
+ };
+ TestSnapshotChecker* snapshot_checker = new TestSnapshotChecker();
+ dbfull()->SetSnapshotChecker(snapshot_checker);
+
+ std::string value;
+ ASSERT_OK(Merge("foo", "v1"));
+ ASSERT_EQ(1, db_->GetLatestSequenceNumber());
+ ASSERT_EQ("v1", GetWithReadCallback(snapshot_checker, "foo"));
+ ASSERT_OK(Merge("foo", "v2"));
+ ASSERT_EQ(2, db_->GetLatestSequenceNumber());
+ // v2 is not visible to latest snapshot, which has seq = 2.
+ ASSERT_EQ("v1", GetWithReadCallback(snapshot_checker, "foo"));
+ // Take a snapshot with seq = 2.
+ const Snapshot* snapshot1 = db_->GetSnapshot();
+ ASSERT_EQ(2, snapshot1->GetSequenceNumber());
+ // v2 is not visible to snapshot1, which has seq = 2
+ ASSERT_EQ("v1", GetWithReadCallback(snapshot_checker, "foo", snapshot1));
+
+ // Verify flush doesn't alter the result.
+ ASSERT_OK(Flush());
+ ASSERT_EQ("v1", GetWithReadCallback(snapshot_checker, "foo", snapshot1));
+ ASSERT_EQ("v1", GetWithReadCallback(snapshot_checker, "foo"));
+
+ ASSERT_OK(Merge("foo", "v3"));
+ ASSERT_EQ(3, db_->GetLatestSequenceNumber());
+ ASSERT_EQ("v1,v2,v3", GetWithReadCallback(snapshot_checker, "foo"));
+ ASSERT_OK(Merge("foo", "v4"));
+ ASSERT_EQ(4, db_->GetLatestSequenceNumber());
+ // v4 is not visible to latest snapshot, which has seq = 4.
+ ASSERT_EQ("v1,v2,v3", GetWithReadCallback(snapshot_checker, "foo"));
+ const Snapshot* snapshot2 = db_->GetSnapshot();
+ ASSERT_EQ(4, snapshot2->GetSequenceNumber());
+ // v4 is not visible to snapshot2, which has seq = 4.
+ ASSERT_EQ("v1,v2,v3",
+ GetWithReadCallback(snapshot_checker, "foo", snapshot2));
+
+ // Verify flush doesn't alter the result.
+ ASSERT_OK(Flush());
+ ASSERT_EQ("v1", GetWithReadCallback(snapshot_checker, "foo", snapshot1));
+ ASSERT_EQ("v1,v2,v3",
+ GetWithReadCallback(snapshot_checker, "foo", snapshot2));
+ ASSERT_EQ("v1,v2,v3", GetWithReadCallback(snapshot_checker, "foo"));
+
+ ASSERT_OK(Merge("foo", "v5"));
+ ASSERT_EQ(5, db_->GetLatestSequenceNumber());
+ // v5 is uncommitted
+ ASSERT_EQ("v1,v2,v3,v4", GetWithReadCallback(snapshot_checker, "foo"));
+
+ // full manual compaction.
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+ // Verify compaction doesn't alter the result.
+ ASSERT_EQ("v1", GetWithReadCallback(snapshot_checker, "foo", snapshot1));
+ ASSERT_EQ("v1,v2,v3",
+ GetWithReadCallback(snapshot_checker, "foo", snapshot2));
+ ASSERT_EQ("v1,v2,v3,v4", GetWithReadCallback(snapshot_checker, "foo"));
+
+ db_->ReleaseSnapshot(snapshot1);
+ db_->ReleaseSnapshot(snapshot2);
+}
+
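+// Runs the pinning test across all DBTestBase option configurations, again
+// parameterized on whether the block cache is disabled.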
+class PerConfigMergeOperatorPinningTest
+ : public DBMergeOperatorTest,
+ public testing::WithParamInterface<std::tuple<bool, int>> {
+ public:
+ PerConfigMergeOperatorPinningTest() {
+ std::tie(disable_block_cache_, option_config_) = GetParam();
+ }
+
+ bool disable_block_cache_;
+};
+
+INSTANTIATE_TEST_CASE_P(
+ MergeOperatorPinningTest, PerConfigMergeOperatorPinningTest,
+ ::testing::Combine(::testing::Bool(),
+ ::testing::Range(static_cast<int>(DBTestBase::kDefault),
+ static_cast<int>(DBTestBase::kEnd))));
+
+TEST_P(PerConfigMergeOperatorPinningTest, Randomized) {
+ if (ShouldSkipOptions(option_config_, kSkipMergePut)) {
+ return;
+ }
+
+ Options options = CurrentOptions();
+ options.merge_operator = MergeOperators::CreateMaxOperator();
+ BlockBasedTableOptions table_options;
+ table_options.no_block_cache = disable_block_cache_;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ std::map<std::string, std::string> true_data;
+
+ const int kTotalMerges = 5000;
+ // Every key gets ~10 operands
+ const int kKeyRange = kTotalMerges / 10;
+ const int kOperandSize = 20;
+ const int kNumPutBefore = kKeyRange / 10; // 10% value
+ const int kNumPutAfter = kKeyRange / 10; // 10% overwrite
+ const int kNumDelete = kKeyRange / 10; // 10% delete
+
+ // kNumPutBefore keys will have base values
+ for (int i = 0; i < kNumPutBefore; i++) {
+ std::string key = Key(rnd.Next() % kKeyRange);
+ std::string value = rnd.RandomString(kOperandSize);
+ ASSERT_OK(db_->Put(WriteOptions(), key, value));
+
+ true_data[key] = value;
+ }
+
+ // Do kTotalMerges merges
+ for (int i = 0; i < kTotalMerges; i++) {
+ std::string key = Key(rnd.Next() % kKeyRange);
+ std::string value = rnd.RandomString(kOperandSize);
+ ASSERT_OK(db_->Merge(WriteOptions(), key, value));
+
+ if (true_data[key] < value) {
+ true_data[key] = value;
+ }
+ }
+
+ // Overwrite random kNumPutAfter keys
+ for (int i = 0; i < kNumPutAfter; i++) {
+ std::string key = Key(rnd.Next() % kKeyRange);
+ std::string value = rnd.RandomString(kOperandSize);
+ ASSERT_OK(db_->Put(WriteOptions(), key, value));
+
+ true_data[key] = value;
+ }
+
+ // Delete random kNumDelete keys
+ for (int i = 0; i < kNumDelete; i++) {
+ std::string key = Key(rnd.Next() % kKeyRange);
+ ASSERT_OK(db_->Delete(WriteOptions(), key));
+
+ true_data.erase(key);
+ }
+
+ VerifyDBFromMap(true_data);
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_options_test.cc b/src/rocksdb/db/db_options_test.cc
new file mode 100644
index 000000000..691081db9
--- /dev/null
+++ b/src/rocksdb/db/db_options_test.cc
@@ -0,0 +1,1219 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#include <limits>
+#include <string>
+#include <unordered_map>
+
+#include "db/column_family.h"
+#include "db/db_impl/db_impl.h"
+#include "db/db_test_util.h"
+#include "options/options_helper.h"
+#include "port/stack_trace.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/rate_limiter.h"
+#include "rocksdb/stats_history.h"
+#include "test_util/sync_point.h"
+#include "test_util/testutil.h"
+#include "util/random.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBOptionsTest : public DBTestBase {
+ public:
+ DBOptionsTest() : DBTestBase("db_options_test", /*env_do_fsync=*/true) {}
+
+#ifndef ROCKSDB_LITE
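+  // The helpers below serialize the mutable subset of the options and parse
+  // the result back into a name -> value map so tests can compare option
+  // states.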
+ std::unordered_map<std::string, std::string> GetMutableDBOptionsMap(
+ const DBOptions& options) {
+ std::string options_str;
+ std::unordered_map<std::string, std::string> mutable_map;
+ ConfigOptions config_options(options);
+ config_options.delimiter = "; ";
+
+ EXPECT_OK(GetStringFromMutableDBOptions(
+ config_options, MutableDBOptions(options), &options_str));
+ EXPECT_OK(StringToMap(options_str, &mutable_map));
+
+ return mutable_map;
+ }
+
+ std::unordered_map<std::string, std::string> GetMutableCFOptionsMap(
+ const ColumnFamilyOptions& options) {
+ std::string options_str;
+ ConfigOptions config_options;
+ config_options.delimiter = "; ";
+
+ std::unordered_map<std::string, std::string> mutable_map;
+ EXPECT_OK(GetStringFromMutableCFOptions(
+ config_options, MutableCFOptions(options), &options_str));
+ EXPECT_OK(StringToMap(options_str, &mutable_map));
+ return mutable_map;
+ }
+
+ std::unordered_map<std::string, std::string> GetRandomizedMutableCFOptionsMap(
+ Random* rnd) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ ImmutableDBOptions db_options(options);
+ test::RandomInitCFOptions(&options, options, rnd);
+ auto sanitized_options = SanitizeOptions(db_options, options);
+ auto opt_map = GetMutableCFOptionsMap(sanitized_options);
+ delete options.compaction_filter;
+ return opt_map;
+ }
+
+ std::unordered_map<std::string, std::string> GetRandomizedMutableDBOptionsMap(
+ Random* rnd) {
+ DBOptions db_options;
+ test::RandomInitDBOptions(&db_options, rnd);
+ auto sanitized_options = SanitizeOptions(dbname_, db_options);
+ return GetMutableDBOptionsMap(sanitized_options);
+ }
+#endif // ROCKSDB_LITE
+};
+
+TEST_F(DBOptionsTest, ImmutableTrackAndVerifyWalsInManifest) {
+ Options options;
+ options.env = env_;
+ options.track_and_verify_wals_in_manifest = true;
+
+ ImmutableDBOptions db_options(options);
+ ASSERT_TRUE(db_options.track_and_verify_wals_in_manifest);
+
+ Reopen(options);
+ ASSERT_TRUE(dbfull()->GetDBOptions().track_and_verify_wals_in_manifest);
+
+ Status s =
+ dbfull()->SetDBOptions({{"track_and_verify_wals_in_manifest", "false"}});
+ ASSERT_FALSE(s.ok());
+}
+
+TEST_F(DBOptionsTest, ImmutableVerifySstUniqueIdInManifest) {
+ Options options;
+ options.env = env_;
+ options.verify_sst_unique_id_in_manifest = true;
+
+ ImmutableDBOptions db_options(options);
+ ASSERT_TRUE(db_options.verify_sst_unique_id_in_manifest);
+
+ Reopen(options);
+ ASSERT_TRUE(dbfull()->GetDBOptions().verify_sst_unique_id_in_manifest);
+
+ Status s =
+ dbfull()->SetDBOptions({{"verify_sst_unique_id_in_manifest", "false"}});
+ ASSERT_FALSE(s.ok());
+}
+
+// RocksDB Lite doesn't support dynamic options.
+#ifndef ROCKSDB_LITE
+
+TEST_F(DBOptionsTest, AvoidUpdatingOptions) {
+ Options options;
+ options.env = env_;
+ options.max_background_jobs = 4;
+ options.delayed_write_rate = 1024;
+
+ Reopen(options);
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ bool is_changed_stats = false;
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::WriteOptionsFile:PersistOptions", [&](void* /*arg*/) {
+ ASSERT_FALSE(is_changed_stats); // should only save options file once
+ is_changed_stats = true;
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ // helper function to check the status and reset after each check
+ auto is_changed = [&] {
+ bool ret = is_changed_stats;
+ is_changed_stats = false;
+ return ret;
+ };
+
+ // without changing the value, but it's sanitized to a different value
+ ASSERT_OK(dbfull()->SetDBOptions({{"bytes_per_sync", "0"}}));
+ ASSERT_TRUE(is_changed());
+
+ // without changing the value
+ ASSERT_OK(dbfull()->SetDBOptions({{"max_background_jobs", "4"}}));
+ ASSERT_FALSE(is_changed());
+
+ // changing the value
+ ASSERT_OK(dbfull()->SetDBOptions({{"bytes_per_sync", "123"}}));
+ ASSERT_TRUE(is_changed());
+
+ // update again
+ ASSERT_OK(dbfull()->SetDBOptions({{"bytes_per_sync", "123"}}));
+ ASSERT_FALSE(is_changed());
+
+ // without changing a default value
+ ASSERT_OK(dbfull()->SetDBOptions({{"strict_bytes_per_sync", "false"}}));
+ ASSERT_FALSE(is_changed());
+
+ // now change
+ ASSERT_OK(dbfull()->SetDBOptions({{"strict_bytes_per_sync", "true"}}));
+ ASSERT_TRUE(is_changed());
+
+ // multiple values without change
+ ASSERT_OK(dbfull()->SetDBOptions(
+ {{"max_total_wal_size", "0"}, {"stats_dump_period_sec", "600"}}));
+ ASSERT_FALSE(is_changed());
+
+ // multiple values with change
+ ASSERT_OK(dbfull()->SetDBOptions(
+ {{"max_open_files", "100"}, {"stats_dump_period_sec", "600"}}));
+ ASSERT_TRUE(is_changed());
+}
+
+TEST_F(DBOptionsTest, GetLatestDBOptions) {
+ // GetOptions should be able to get latest option changed by SetOptions.
+ Options options;
+ options.create_if_missing = true;
+ options.env = env_;
+ Random rnd(228);
+ Reopen(options);
+ auto new_options = GetRandomizedMutableDBOptionsMap(&rnd);
+ ASSERT_OK(dbfull()->SetDBOptions(new_options));
+ ASSERT_EQ(new_options, GetMutableDBOptionsMap(dbfull()->GetDBOptions()));
+}
+
+TEST_F(DBOptionsTest, GetLatestCFOptions) {
+ // GetOptions should be able to get latest option changed by SetOptions.
+ Options options;
+ options.create_if_missing = true;
+ options.env = env_;
+ Random rnd(228);
+ Reopen(options);
+ CreateColumnFamilies({"foo"}, options);
+ ReopenWithColumnFamilies({"default", "foo"}, options);
+ auto options_default = GetRandomizedMutableCFOptionsMap(&rnd);
+ auto options_foo = GetRandomizedMutableCFOptionsMap(&rnd);
+ ASSERT_OK(dbfull()->SetOptions(handles_[0], options_default));
+ ASSERT_OK(dbfull()->SetOptions(handles_[1], options_foo));
+ ASSERT_EQ(options_default,
+ GetMutableCFOptionsMap(dbfull()->GetOptions(handles_[0])));
+ ASSERT_EQ(options_foo,
+ GetMutableCFOptionsMap(dbfull()->GetOptions(handles_[1])));
+}
+
+TEST_F(DBOptionsTest, SetMutableTableOptions) {
+ Options options;
+ options.create_if_missing = true;
+ options.env = env_;
+ options.blob_file_size = 16384;
+ BlockBasedTableOptions bbto;
+ bbto.no_block_cache = true;
+ bbto.block_size = 8192;
+ bbto.block_restart_interval = 7;
+
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ Reopen(options);
+
+ ColumnFamilyHandle* cfh = dbfull()->DefaultColumnFamily();
+ Options c_opts = dbfull()->GetOptions(cfh);
+
+ const auto* c_bbto =
+ c_opts.table_factory->GetOptions<BlockBasedTableOptions>();
+ ASSERT_NE(c_bbto, nullptr);
+ ASSERT_EQ(c_opts.blob_file_size, 16384);
+ ASSERT_EQ(c_bbto->no_block_cache, true);
+ ASSERT_EQ(c_bbto->block_size, 8192);
+ ASSERT_EQ(c_bbto->block_restart_interval, 7);
+ ASSERT_OK(dbfull()->SetOptions(
+ cfh, {{"table_factory.block_size", "16384"},
+ {"table_factory.block_restart_interval", "11"}}));
+ ASSERT_EQ(c_bbto->block_size, 16384);
+ ASSERT_EQ(c_bbto->block_restart_interval, 11);
+
+ // Now set an option that is not mutable - options should not change
+ ASSERT_NOK(
+ dbfull()->SetOptions(cfh, {{"table_factory.no_block_cache", "false"}}));
+ ASSERT_EQ(c_bbto->no_block_cache, true);
+ ASSERT_EQ(c_bbto->block_size, 16384);
+ ASSERT_EQ(c_bbto->block_restart_interval, 11);
+
+ // Set some that are mutable and some that are not - options should not change
+ ASSERT_NOK(dbfull()->SetOptions(
+ cfh, {{"table_factory.no_block_cache", "false"},
+ {"table_factory.block_size", "8192"},
+ {"table_factory.block_restart_interval", "7"}}));
+ ASSERT_EQ(c_bbto->no_block_cache, true);
+ ASSERT_EQ(c_bbto->block_size, 16384);
+ ASSERT_EQ(c_bbto->block_restart_interval, 11);
+
+ // Set some that are mutable and some that do not exist - options should not
+ // change
+ ASSERT_NOK(dbfull()->SetOptions(
+ cfh, {{"table_factory.block_size", "8192"},
+ {"table_factory.does_not_exist", "true"},
+ {"table_factory.block_restart_interval", "7"}}));
+ ASSERT_EQ(c_bbto->no_block_cache, true);
+ ASSERT_EQ(c_bbto->block_size, 16384);
+ ASSERT_EQ(c_bbto->block_restart_interval, 11);
+
+ // Trying to change the table factory fails
+ ASSERT_NOK(dbfull()->SetOptions(
+ cfh, {{"table_factory", TableFactory::kPlainTableName()}}));
+
+ // Set some on the table and some on the Column Family
+ ASSERT_OK(dbfull()->SetOptions(
+ cfh, {{"table_factory.block_size", "16384"},
+ {"blob_file_size", "32768"},
+ {"table_factory.block_restart_interval", "13"}}));
+ c_opts = dbfull()->GetOptions(cfh);
+ ASSERT_EQ(c_opts.blob_file_size, 32768);
+ ASSERT_EQ(c_bbto->block_size, 16384);
+ ASSERT_EQ(c_bbto->block_restart_interval, 13);
+ // Set some on the table and a bad one on the ColumnFamily - options should
+ // not change
+ ASSERT_NOK(dbfull()->SetOptions(
+ cfh, {{"table_factory.block_size", "1024"},
+ {"no_such_option", "32768"},
+ {"table_factory.block_restart_interval", "7"}}));
+ ASSERT_EQ(c_bbto->block_size, 16384);
+ ASSERT_EQ(c_bbto->block_restart_interval, 13);
+}
+
+TEST_F(DBOptionsTest, SetWithCustomMemTableFactory) {
+ class DummySkipListFactory : public SkipListFactory {
+ public:
+ static const char* kClassName() { return "DummySkipListFactory"; }
+ const char* Name() const override { return kClassName(); }
+ explicit DummySkipListFactory() : SkipListFactory(2) {}
+ };
+ {
+    // Verify that the DummySkipListFactory cannot be created by name
+ ConfigOptions config_options;
+ config_options.ignore_unsupported_options = false;
+ std::unique_ptr<MemTableRepFactory> factory;
+ ASSERT_NOK(MemTableRepFactory::CreateFromString(
+ config_options, DummySkipListFactory::kClassName(), &factory));
+ }
+ Options options;
+ options.create_if_missing = true;
+ // Try with fail_if_options_file_error=false/true to update the options
+ for (bool on_error : {false, true}) {
+ options.fail_if_options_file_error = on_error;
+ options.env = env_;
+ options.disable_auto_compactions = false;
+
+ options.memtable_factory.reset(new DummySkipListFactory());
+ Reopen(options);
+
+ ColumnFamilyHandle* cfh = dbfull()->DefaultColumnFamily();
+ ASSERT_OK(
+ dbfull()->SetOptions(cfh, {{"disable_auto_compactions", "true"}}));
+ ColumnFamilyDescriptor cfd;
+ ASSERT_OK(cfh->GetDescriptor(&cfd));
+ ASSERT_STREQ(cfd.options.memtable_factory->Name(),
+ DummySkipListFactory::kClassName());
+ ColumnFamilyHandle* test = nullptr;
+ ASSERT_OK(dbfull()->CreateColumnFamily(options, "test", &test));
+ ASSERT_OK(test->GetDescriptor(&cfd));
+ ASSERT_STREQ(cfd.options.memtable_factory->Name(),
+ DummySkipListFactory::kClassName());
+
+ ASSERT_OK(dbfull()->DropColumnFamily(test));
+ delete test;
+ }
+}
+
+TEST_F(DBOptionsTest, SetBytesPerSync) {
+ const size_t kValueSize = 1024 * 1024; // 1MB
+ Options options;
+ options.create_if_missing = true;
+ options.bytes_per_sync = 1024 * 1024;
+ options.use_direct_reads = false;
+ options.write_buffer_size = 400 * kValueSize;
+ options.disable_auto_compactions = true;
+ options.compression = kNoCompression;
+ options.env = env_;
+ Reopen(options);
+ int counter = 0;
+ int low_bytes_per_sync = 0;
+ int i = 0;
+ const std::string kValue(kValueSize, 'v');
+ ASSERT_EQ(options.bytes_per_sync, dbfull()->GetDBOptions().bytes_per_sync);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "WritableFileWriter::RangeSync:0", [&](void* /*arg*/) { counter++; });
+
+ WriteOptions write_opts;
+ // should sync approximately 40MB/1MB ~= 40 times.
+ for (i = 0; i < 40; i++) {
+ ASSERT_OK(Put(Key(i), kValue, write_opts));
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ low_bytes_per_sync = counter;
+ ASSERT_GT(low_bytes_per_sync, 35);
+ ASSERT_LT(low_bytes_per_sync, 45);
+
+ counter = 0;
+ // 8388608 = 8 * 1024 * 1024
+ ASSERT_OK(dbfull()->SetDBOptions({{"bytes_per_sync", "8388608"}}));
+ ASSERT_EQ(8388608, dbfull()->GetDBOptions().bytes_per_sync);
+ // should sync approximately 40MB*2/8MB ~= 10 times.
+  // Data will be 40MB*2 because of the previous Puts too.
+ for (i = 0; i < 40; i++) {
+ ASSERT_OK(Put(Key(i), kValue, write_opts));
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_GT(counter, 5);
+ ASSERT_LT(counter, 15);
+
+ // Redundant assert. But leaving it here just to get the point across that
+ // low_bytes_per_sync > counter.
+ ASSERT_GT(low_bytes_per_sync, counter);
+}
+
+TEST_F(DBOptionsTest, SetWalBytesPerSync) {
+ const size_t kValueSize = 1024 * 1024 * 3;
+ Options options;
+ options.create_if_missing = true;
+ options.wal_bytes_per_sync = 512;
+ options.write_buffer_size = 100 * kValueSize;
+ options.disable_auto_compactions = true;
+ options.compression = kNoCompression;
+ options.env = env_;
+ Reopen(options);
+ ASSERT_EQ(512, dbfull()->GetDBOptions().wal_bytes_per_sync);
+ std::atomic_int counter{0};
+ int low_bytes_per_sync = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "WritableFileWriter::RangeSync:0",
+ [&](void* /*arg*/) { counter.fetch_add(1); });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ const std::string kValue(kValueSize, 'v');
+ int i = 0;
+ for (; i < 10; i++) {
+ ASSERT_OK(Put(Key(i), kValue));
+ }
+  // Do not flush. If we flush here, SwitchWAL will reuse the old WAL file since
+  // it's empty and will not pick up the new wal_bytes_per_sync value.
+ low_bytes_per_sync = counter;
+ // 5242880 = 1024 * 1024 * 5
+ ASSERT_OK(dbfull()->SetDBOptions({{"wal_bytes_per_sync", "5242880"}}));
+ ASSERT_EQ(5242880, dbfull()->GetDBOptions().wal_bytes_per_sync);
+ counter = 0;
+ i = 0;
+ for (; i < 10; i++) {
+ ASSERT_OK(Put(Key(i), kValue));
+ }
+ ASSERT_GT(counter, 0);
+ ASSERT_GT(low_bytes_per_sync, 0);
+ ASSERT_GT(low_bytes_per_sync, counter);
+}
+
+TEST_F(DBOptionsTest, WritableFileMaxBufferSize) {
+ Options options;
+ options.create_if_missing = true;
+ options.writable_file_max_buffer_size = 1024 * 1024;
+ options.level0_file_num_compaction_trigger = 3;
+ options.max_manifest_file_size = 1;
+ options.env = env_;
+ int buffer_size = 1024 * 1024;
+ Reopen(options);
+ ASSERT_EQ(buffer_size,
+ dbfull()->GetDBOptions().writable_file_max_buffer_size);
+
+ std::atomic<int> match_cnt(0);
+ std::atomic<int> unmatch_cnt(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "WritableFileWriter::WritableFileWriter:0", [&](void* arg) {
+ int value = static_cast<int>(reinterpret_cast<uintptr_t>(arg));
+ if (value == buffer_size) {
+ match_cnt++;
+ } else {
+ unmatch_cnt++;
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ int i = 0;
+ for (; i < 3; i++) {
+ ASSERT_OK(Put("foo", std::to_string(i)));
+ ASSERT_OK(Put("bar", std::to_string(i)));
+ ASSERT_OK(Flush());
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(unmatch_cnt, 0);
+ ASSERT_GE(match_cnt, 11);
+
+ ASSERT_OK(
+ dbfull()->SetDBOptions({{"writable_file_max_buffer_size", "524288"}}));
+ buffer_size = 512 * 1024;
+ match_cnt = 0;
+ unmatch_cnt = 0; // SetDBOptions() will create a WritableFileWriter
+
+ ASSERT_EQ(buffer_size,
+ dbfull()->GetDBOptions().writable_file_max_buffer_size);
+ i = 0;
+ for (; i < 3; i++) {
+ ASSERT_OK(Put("foo", std::to_string(i)));
+ ASSERT_OK(Put("bar", std::to_string(i)));
+ ASSERT_OK(Flush());
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(unmatch_cnt, 0);
+ ASSERT_GE(match_cnt, 11);
+}
+
+TEST_F(DBOptionsTest, SetOptionsAndReopen) {
+ Random rnd(1044);
+ auto rand_opts = GetRandomizedMutableCFOptionsMap(&rnd);
+ ASSERT_OK(dbfull()->SetOptions(rand_opts));
+ // Verify if DB can be reopen after setting options.
+ Options options;
+ options.env = env_;
+ ASSERT_OK(TryReopen(options));
+}
+
+TEST_F(DBOptionsTest, EnableAutoCompactionAndTriggerStall) {
+ const std::string kValue(1024, 'v');
+ for (int method_type = 0; method_type < 2; method_type++) {
+ for (int option_type = 0; option_type < 4; option_type++) {
+ Options options;
+ options.create_if_missing = true;
+ options.disable_auto_compactions = true;
+ options.write_buffer_size = 1024 * 1024 * 10;
+ options.compression = CompressionType::kNoCompression;
+ options.level0_file_num_compaction_trigger = 1;
+ options.level0_stop_writes_trigger = std::numeric_limits<int>::max();
+ options.level0_slowdown_writes_trigger = std::numeric_limits<int>::max();
+ options.hard_pending_compaction_bytes_limit =
+ std::numeric_limits<uint64_t>::max();
+ options.soft_pending_compaction_bytes_limit =
+ std::numeric_limits<uint64_t>::max();
+ options.env = env_;
+
+ DestroyAndReopen(options);
+ int i = 0;
+ for (; i < 1024; i++) {
+ ASSERT_OK(Put(Key(i), kValue));
+ }
+ ASSERT_OK(Flush());
+ for (; i < 1024 * 2; i++) {
+ ASSERT_OK(Put(Key(i), kValue));
+ }
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_EQ(2, NumTableFilesAtLevel(0));
+ uint64_t l0_size = SizeAtLevel(0);
+
+ switch (option_type) {
+ case 0:
+ // test with level0_stop_writes_trigger
+ options.level0_stop_writes_trigger = 2;
+ options.level0_slowdown_writes_trigger = 2;
+ break;
+ case 1:
+ options.level0_slowdown_writes_trigger = 2;
+ break;
+ case 2:
+ options.hard_pending_compaction_bytes_limit = l0_size;
+ options.soft_pending_compaction_bytes_limit = l0_size;
+ break;
+ case 3:
+ options.soft_pending_compaction_bytes_limit = l0_size;
+ break;
+ }
+ Reopen(options);
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_FALSE(dbfull()->TEST_write_controler().IsStopped());
+ ASSERT_FALSE(dbfull()->TEST_write_controler().NeedsDelay());
+
+ SyncPoint::GetInstance()->LoadDependency(
+ {{"DBOptionsTest::EnableAutoCompactionAndTriggerStall:1",
+ "BackgroundCallCompaction:0"},
+ {"DBImpl::BackgroundCompaction():BeforePickCompaction",
+ "DBOptionsTest::EnableAutoCompactionAndTriggerStall:2"},
+ {"DBOptionsTest::EnableAutoCompactionAndTriggerStall:3",
+ "DBImpl::BackgroundCompaction():AfterPickCompaction"}});
+ // Block background compaction.
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ switch (method_type) {
+ case 0:
+ ASSERT_OK(
+ dbfull()->SetOptions({{"disable_auto_compactions", "false"}}));
+ break;
+ case 1:
+ ASSERT_OK(dbfull()->EnableAutoCompaction(
+ {dbfull()->DefaultColumnFamily()}));
+ break;
+ }
+ TEST_SYNC_POINT("DBOptionsTest::EnableAutoCompactionAndTriggerStall:1");
+      // Wait for the stall condition to be recalculated.
+ TEST_SYNC_POINT("DBOptionsTest::EnableAutoCompactionAndTriggerStall:2");
+
+ switch (option_type) {
+ case 0:
+ ASSERT_TRUE(dbfull()->TEST_write_controler().IsStopped());
+ break;
+ case 1:
+ ASSERT_FALSE(dbfull()->TEST_write_controler().IsStopped());
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ break;
+ case 2:
+ ASSERT_TRUE(dbfull()->TEST_write_controler().IsStopped());
+ break;
+ case 3:
+ ASSERT_FALSE(dbfull()->TEST_write_controler().IsStopped());
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ break;
+ }
+ TEST_SYNC_POINT("DBOptionsTest::EnableAutoCompactionAndTriggerStall:3");
+
+ // Background compaction executed.
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_FALSE(dbfull()->TEST_write_controler().IsStopped());
+ ASSERT_FALSE(dbfull()->TEST_write_controler().NeedsDelay());
+ }
+ }
+}
+
+TEST_F(DBOptionsTest, SetOptionsMayTriggerCompaction) {
+ Options options;
+ options.create_if_missing = true;
+ options.level0_file_num_compaction_trigger = 1000;
+ options.env = env_;
+ Reopen(options);
+ for (int i = 0; i < 3; i++) {
+ // Need to insert two keys to avoid trivial move.
+ ASSERT_OK(Put("foo", std::to_string(i)));
+ ASSERT_OK(Put("bar", std::to_string(i)));
+ ASSERT_OK(Flush());
+ }
+ ASSERT_EQ("3", FilesPerLevel());
+ ASSERT_OK(
+ dbfull()->SetOptions({{"level0_file_num_compaction_trigger", "3"}}));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ("0,1", FilesPerLevel());
+}
+
+TEST_F(DBOptionsTest, SetBackgroundCompactionThreads) {
+ Options options;
+ options.create_if_missing = true;
+ options.max_background_compactions = 1; // default value
+ options.env = env_;
+ Reopen(options);
+ ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed());
+ ASSERT_OK(dbfull()->SetDBOptions({{"max_background_compactions", "3"}}));
+ ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed());
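+  // The raised compaction limit is only reflected once the write controller
+  // enters the stopped state (via the stop token below).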
+ auto stop_token = dbfull()->TEST_write_controler().GetStopToken();
+ ASSERT_EQ(3, dbfull()->TEST_BGCompactionsAllowed());
+}
+
+TEST_F(DBOptionsTest, SetBackgroundFlushThreads) {
+ Options options;
+ options.create_if_missing = true;
+ options.max_background_flushes = 1;
+ options.env = env_;
+ Reopen(options);
+ ASSERT_EQ(1, dbfull()->TEST_BGFlushesAllowed());
+ ASSERT_EQ(1, env_->GetBackgroundThreads(Env::Priority::HIGH));
+ ASSERT_OK(dbfull()->SetDBOptions({{"max_background_flushes", "3"}}));
+ ASSERT_EQ(3, env_->GetBackgroundThreads(Env::Priority::HIGH));
+ ASSERT_EQ(3, dbfull()->TEST_BGFlushesAllowed());
+}
+
+TEST_F(DBOptionsTest, SetBackgroundJobs) {
+ Options options;
+ options.create_if_missing = true;
+ options.max_background_jobs = 8;
+ options.env = env_;
+ Reopen(options);
+
+ for (int i = 0; i < 2; ++i) {
+ if (i > 0) {
+ options.max_background_jobs = 12;
+ ASSERT_OK(dbfull()->SetDBOptions(
+ {{"max_background_jobs",
+ std::to_string(options.max_background_jobs)}}));
+ }
+
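+    // A quarter of max_background_jobs is reserved for flushes; compactions
+    // stay at 1 until the write controller is stopped, at which point the
+    // remaining three quarters become available to them.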
+ const int expected_max_flushes = options.max_background_jobs / 4;
+
+ ASSERT_EQ(expected_max_flushes, dbfull()->TEST_BGFlushesAllowed());
+ ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed());
+
+ auto stop_token = dbfull()->TEST_write_controler().GetStopToken();
+
+ const int expected_max_compactions = 3 * expected_max_flushes;
+
+ ASSERT_EQ(expected_max_flushes, dbfull()->TEST_BGFlushesAllowed());
+ ASSERT_EQ(expected_max_compactions, dbfull()->TEST_BGCompactionsAllowed());
+
+ ASSERT_EQ(expected_max_flushes,
+ env_->GetBackgroundThreads(Env::Priority::HIGH));
+ ASSERT_EQ(expected_max_compactions,
+ env_->GetBackgroundThreads(Env::Priority::LOW));
+ }
+}
+
+TEST_F(DBOptionsTest, AvoidFlushDuringShutdown) {
+ Options options;
+ options.create_if_missing = true;
+ options.disable_auto_compactions = true;
+ options.env = env_;
+ WriteOptions write_without_wal;
+ write_without_wal.disableWAL = true;
+
+ ASSERT_FALSE(options.avoid_flush_during_shutdown);
+ DestroyAndReopen(options);
+ ASSERT_OK(Put("foo", "v1", write_without_wal));
+ Reopen(options);
+ ASSERT_EQ("v1", Get("foo"));
+ ASSERT_EQ("1", FilesPerLevel());
+
+ DestroyAndReopen(options);
+ ASSERT_OK(Put("foo", "v2", write_without_wal));
+ ASSERT_OK(dbfull()->SetDBOptions({{"avoid_flush_during_shutdown", "true"}}));
+ Reopen(options);
+ ASSERT_EQ("NOT_FOUND", Get("foo"));
+ ASSERT_EQ("", FilesPerLevel());
+}
+
+TEST_F(DBOptionsTest, SetDelayedWriteRateOption) {
+ Options options;
+ options.create_if_missing = true;
+ options.delayed_write_rate = 2 * 1024U * 1024U;
+ options.env = env_;
+ Reopen(options);
+ ASSERT_EQ(2 * 1024U * 1024U,
+ dbfull()->TEST_write_controler().max_delayed_write_rate());
+
+ ASSERT_OK(dbfull()->SetDBOptions({{"delayed_write_rate", "20000"}}));
+ ASSERT_EQ(20000, dbfull()->TEST_write_controler().max_delayed_write_rate());
+}
+
+TEST_F(DBOptionsTest, MaxTotalWalSizeChange) {
+ Random rnd(1044);
+ const auto value_size = size_t(1024);
+ std::string value = rnd.RandomString(value_size);
+
+ Options options;
+ options.create_if_missing = true;
+ options.env = env_;
+ CreateColumnFamilies({"1", "2", "3"}, options);
+ ReopenWithColumnFamilies({"default", "1", "2", "3"}, options);
+
+ WriteOptions write_options;
+
+ const int key_count = 100;
+ for (int i = 0; i < key_count; ++i) {
+ for (size_t cf = 0; cf < handles_.size(); ++cf) {
+ ASSERT_OK(Put(static_cast<int>(cf), Key(i), value));
+ }
+ }
+ ASSERT_OK(dbfull()->SetDBOptions({{"max_total_wal_size", "10"}}));
+
+ for (size_t cf = 0; cf < handles_.size(); ++cf) {
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[cf]));
+ ASSERT_EQ("1", FilesPerLevel(static_cast<int>(cf)));
+ }
+}
+
+TEST_F(DBOptionsTest, SetStatsDumpPeriodSec) {
+ Options options;
+ options.create_if_missing = true;
+ options.stats_dump_period_sec = 5;
+ options.env = env_;
+ Reopen(options);
+ ASSERT_EQ(5u, dbfull()->GetDBOptions().stats_dump_period_sec);
+
+ for (int i = 0; i < 20; i++) {
+ unsigned int num = rand() % 5000 + 1;
+ ASSERT_OK(dbfull()->SetDBOptions(
+ {{"stats_dump_period_sec", std::to_string(num)}}));
+ ASSERT_EQ(num, dbfull()->GetDBOptions().stats_dump_period_sec);
+ }
+ Close();
+}
+
+TEST_F(DBOptionsTest, SetOptionsStatsPersistPeriodSec) {
+ Options options;
+ options.create_if_missing = true;
+ options.stats_persist_period_sec = 5;
+ options.env = env_;
+ Reopen(options);
+ ASSERT_EQ(5u, dbfull()->GetDBOptions().stats_persist_period_sec);
+
+ ASSERT_OK(dbfull()->SetDBOptions({{"stats_persist_period_sec", "12345"}}));
+ ASSERT_EQ(12345u, dbfull()->GetDBOptions().stats_persist_period_sec);
+ ASSERT_NOK(dbfull()->SetDBOptions({{"stats_persist_period_sec", "abcde"}}));
+ ASSERT_EQ(12345u, dbfull()->GetDBOptions().stats_persist_period_sec);
+}
+
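+// Runs FindObsoleteFiles() under the DB mutex without forcing a full scan and
+// checks whether the resulting candidate file set is empty as expected.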
+static void assert_candidate_files_empty(DBImpl* dbfull, const bool empty) {
+ dbfull->TEST_LockMutex();
+ JobContext job_context(0);
+ dbfull->FindObsoleteFiles(&job_context, false);
+ ASSERT_EQ(empty, job_context.full_scan_candidate_files.empty());
+ dbfull->TEST_UnlockMutex();
+ if (job_context.HaveSomethingToDelete()) {
+ // fulfill the contract of FindObsoleteFiles by calling PurgeObsoleteFiles
+ // afterwards; otherwise the test may hang on shutdown
+ dbfull->PurgeObsoleteFiles(job_context);
+ }
+ job_context.Clean();
+}
+
+TEST_F(DBOptionsTest, DeleteObsoleteFilesPeriodChange) {
+ Options options;
+ options.env = env_;
+ SetTimeElapseOnlySleepOnReopen(&options);
+ options.create_if_missing = true;
+ ASSERT_OK(TryReopen(options));
+
+ // Verify that candidate files set is empty when no full scan requested.
+ assert_candidate_files_empty(dbfull(), true);
+
+ ASSERT_OK(
+ dbfull()->SetDBOptions({{"delete_obsolete_files_period_micros", "0"}}));
+
+  // After delete_obsolete_files_period_micros is updated to 0, the next call
+  // to FindObsoleteFiles should do a full scan.
+ assert_candidate_files_empty(dbfull(), false);
+
+ ASSERT_OK(
+ dbfull()->SetDBOptions({{"delete_obsolete_files_period_micros", "20"}}));
+
+ assert_candidate_files_empty(dbfull(), true);
+
+ env_->MockSleepForMicroseconds(20);
+ assert_candidate_files_empty(dbfull(), true);
+
+ env_->MockSleepForMicroseconds(1);
+ assert_candidate_files_empty(dbfull(), false);
+
+ Close();
+}
+
+TEST_F(DBOptionsTest, MaxOpenFilesChange) {
+ SpecialEnv env(env_);
+ Options options;
+ options.env = CurrentOptions().env;
+ options.max_open_files = -1;
+
+ Reopen(options);
+
+ Cache* tc = dbfull()->TEST_table_cache();
+
+ ASSERT_EQ(-1, dbfull()->GetDBOptions().max_open_files);
+ ASSERT_LT(2000, tc->GetCapacity());
+ ASSERT_OK(dbfull()->SetDBOptions({{"max_open_files", "1024"}}));
+ ASSERT_EQ(1024, dbfull()->GetDBOptions().max_open_files);
+ // examine the table cache (actual size should be 1014)
+ ASSERT_GT(1500, tc->GetCapacity());
+ Close();
+}
+
+TEST_F(DBOptionsTest, SanitizeDelayedWriteRate) {
+ Options options;
+ options.env = CurrentOptions().env;
+ options.delayed_write_rate = 0;
+ Reopen(options);
+ ASSERT_EQ(16 * 1024 * 1024, dbfull()->GetDBOptions().delayed_write_rate);
+
+ options.rate_limiter.reset(NewGenericRateLimiter(31 * 1024 * 1024));
+ Reopen(options);
+ ASSERT_EQ(31 * 1024 * 1024, dbfull()->GetDBOptions().delayed_write_rate);
+}
+
+TEST_F(DBOptionsTest, SanitizeUniversalTTLCompaction) {
+ Options options;
+ options.env = CurrentOptions().env;
+ options.compaction_style = kCompactionStyleUniversal;
+
+ options.ttl = 0;
+ options.periodic_compaction_seconds = 0;
+ Reopen(options);
+ ASSERT_EQ(0, dbfull()->GetOptions().ttl);
+ ASSERT_EQ(0, dbfull()->GetOptions().periodic_compaction_seconds);
+
+ options.ttl = 0;
+ options.periodic_compaction_seconds = 100;
+ Reopen(options);
+ ASSERT_EQ(0, dbfull()->GetOptions().ttl);
+ ASSERT_EQ(100, dbfull()->GetOptions().periodic_compaction_seconds);
+
+ options.ttl = 100;
+ options.periodic_compaction_seconds = 0;
+ Reopen(options);
+ ASSERT_EQ(100, dbfull()->GetOptions().ttl);
+ ASSERT_EQ(100, dbfull()->GetOptions().periodic_compaction_seconds);
+
+ options.ttl = 100;
+ options.periodic_compaction_seconds = 500;
+ Reopen(options);
+ ASSERT_EQ(100, dbfull()->GetOptions().ttl);
+ ASSERT_EQ(100, dbfull()->GetOptions().periodic_compaction_seconds);
+}
+
+TEST_F(DBOptionsTest, SanitizeTtlDefault) {
+ Options options;
+ options.env = CurrentOptions().env;
+ Reopen(options);
+ ASSERT_EQ(30 * 24 * 60 * 60, dbfull()->GetOptions().ttl);
+
+ options.compaction_style = kCompactionStyleLevel;
+ options.ttl = 0;
+ Reopen(options);
+ ASSERT_EQ(0, dbfull()->GetOptions().ttl);
+
+ options.ttl = 100;
+ Reopen(options);
+ ASSERT_EQ(100, dbfull()->GetOptions().ttl);
+}
+
+TEST_F(DBOptionsTest, SanitizeFIFOPeriodicCompaction) {
+ Options options;
+ options.compaction_style = kCompactionStyleFIFO;
+ options.env = CurrentOptions().env;
+ options.ttl = 0;
+ Reopen(options);
+ ASSERT_EQ(30 * 24 * 60 * 60, dbfull()->GetOptions().ttl);
+
+ options.ttl = 100;
+ Reopen(options);
+ ASSERT_EQ(100, dbfull()->GetOptions().ttl);
+
+ options.ttl = 100 * 24 * 60 * 60;
+ Reopen(options);
+ ASSERT_EQ(100 * 24 * 60 * 60, dbfull()->GetOptions().ttl);
+
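+  // When both ttl and periodic_compaction_seconds are set under FIFO
+  // compaction, ttl is capped at periodic_compaction_seconds.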
+ options.ttl = 200;
+ options.periodic_compaction_seconds = 300;
+ Reopen(options);
+ ASSERT_EQ(200, dbfull()->GetOptions().ttl);
+
+ options.ttl = 500;
+ options.periodic_compaction_seconds = 300;
+ Reopen(options);
+ ASSERT_EQ(300, dbfull()->GetOptions().ttl);
+}
+
+TEST_F(DBOptionsTest, SetFIFOCompactionOptions) {
+ Options options;
+ options.env = CurrentOptions().env;
+ options.compaction_style = kCompactionStyleFIFO;
+ options.write_buffer_size = 10 << 10; // 10KB
+ options.arena_block_size = 4096;
+ options.compression = kNoCompression;
+ options.create_if_missing = true;
+ options.compaction_options_fifo.allow_compaction = false;
+ env_->SetMockSleep();
+ options.env = env_;
+
+ // NOTE: Presumed unnecessary and removed: resetting mock time in env
+
+ // Test dynamically changing ttl.
+ options.ttl = 1 * 60 * 60; // 1 hour
+ ASSERT_OK(TryReopen(options));
+
+ Random rnd(301);
+ for (int i = 0; i < 10; i++) {
+ // Generate and flush a file about 10KB.
+ for (int j = 0; j < 10; j++) {
+ ASSERT_OK(Put(std::to_string(i * 20 + j), rnd.RandomString(980)));
+ }
+ ASSERT_OK(Flush());
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(NumTableFilesAtLevel(0), 10);
+
+ env_->MockSleepForSeconds(61);
+
+ // No files should be compacted as ttl is set to 1 hour.
+ ASSERT_EQ(dbfull()->GetOptions().ttl, 3600);
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ(NumTableFilesAtLevel(0), 10);
+
+ // Set ttl to 1 minute. So all files should get deleted.
+ ASSERT_OK(dbfull()->SetOptions({{"ttl", "60"}}));
+ ASSERT_EQ(dbfull()->GetOptions().ttl, 60);
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+
+ // NOTE: Presumed unnecessary and removed: resetting mock time in env
+
+ // Test dynamically changing compaction_options_fifo.max_table_files_size
+  options.compaction_options_fifo.max_table_files_size = 500 << 10;  // 500KB
+ options.ttl = 0;
+ DestroyAndReopen(options);
+
+ for (int i = 0; i < 10; i++) {
+ // Generate and flush a file about 10KB.
+ for (int j = 0; j < 10; j++) {
+ ASSERT_OK(Put(std::to_string(i * 20 + j), rnd.RandomString(980)));
+ }
+ ASSERT_OK(Flush());
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(NumTableFilesAtLevel(0), 10);
+
+ // No files should be compacted as max_table_files_size is set to 500 KB.
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.max_table_files_size,
+ 500 << 10);
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ(NumTableFilesAtLevel(0), 10);
+
+ // Set max_table_files_size to 12 KB. So only 1 file should remain now.
+ ASSERT_OK(dbfull()->SetOptions(
+ {{"compaction_options_fifo", "{max_table_files_size=12288;}"}}));
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.max_table_files_size,
+ 12 << 10);
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(NumTableFilesAtLevel(0), 1);
+
+ // Test dynamically changing compaction_options_fifo.allow_compaction
+ options.compaction_options_fifo.max_table_files_size = 500 << 10; // 500KB
+ options.ttl = 0;
+ options.compaction_options_fifo.allow_compaction = false;
+ options.level0_file_num_compaction_trigger = 6;
+ DestroyAndReopen(options);
+
+ for (int i = 0; i < 10; i++) {
+ // Generate and flush a file about 10KB.
+ for (int j = 0; j < 10; j++) {
+ ASSERT_OK(Put(std::to_string(i * 20 + j), rnd.RandomString(980)));
+ }
+ ASSERT_OK(Flush());
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(NumTableFilesAtLevel(0), 10);
+
+ // No files should be compacted as max_table_files_size is set to 500 KB and
+ // allow_compaction is false
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.allow_compaction,
+ false);
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ(NumTableFilesAtLevel(0), 10);
+
+  // Set allow_compaction to true. So the number of files should be between 1
+  // and 5.
+ ASSERT_OK(dbfull()->SetOptions(
+ {{"compaction_options_fifo", "{allow_compaction=true;}"}}));
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.allow_compaction,
+ true);
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_GE(NumTableFilesAtLevel(0), 1);
+ ASSERT_LE(NumTableFilesAtLevel(0), 5);
+}
+
+TEST_F(DBOptionsTest, CompactionReadaheadSizeChange) {
+ SpecialEnv env(env_);
+ Options options;
+ options.env = &env;
+
+ options.compaction_readahead_size = 0;
+ options.level0_file_num_compaction_trigger = 2;
+ const std::string kValue(1024, 'v');
+ Reopen(options);
+
+ ASSERT_EQ(0, dbfull()->GetDBOptions().compaction_readahead_size);
+ ASSERT_OK(dbfull()->SetDBOptions({{"compaction_readahead_size", "256"}}));
+ ASSERT_EQ(256, dbfull()->GetDBOptions().compaction_readahead_size);
+ for (int i = 0; i < 1024; i++) {
+ ASSERT_OK(Put(Key(i), kValue));
+ }
+ ASSERT_OK(Flush());
+ for (int i = 0; i < 1024 * 2; i++) {
+ ASSERT_OK(Put(Key(i), kValue));
+ }
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(256, env_->compaction_readahead_size_);
+ Close();
+}
+
+TEST_F(DBOptionsTest, FIFOTtlBackwardCompatible) {
+ Options options;
+ options.compaction_style = kCompactionStyleFIFO;
+ options.write_buffer_size = 10 << 10; // 10KB
+ options.create_if_missing = true;
+ options.env = CurrentOptions().env;
+
+ ASSERT_OK(TryReopen(options));
+
+ Random rnd(301);
+ for (int i = 0; i < 10; i++) {
+ // Generate and flush a file about 10KB.
+ for (int j = 0; j < 10; j++) {
+ ASSERT_OK(Put(std::to_string(i * 20 + j), rnd.RandomString(980)));
+ }
+ ASSERT_OK(Flush());
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(NumTableFilesAtLevel(0), 10);
+
+ // In release 6.0, ttl was promoted from a secondary level option under
+ // compaction_options_fifo to a top level option under ColumnFamilyOptions.
+ // We still need to handle old SetOptions calls but should ignore
+ // ttl under compaction_options_fifo.
+ ASSERT_OK(dbfull()->SetOptions(
+ {{"compaction_options_fifo",
+ "{allow_compaction=true;max_table_files_size=1024;ttl=731;}"},
+ {"ttl", "60"}}));
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.allow_compaction,
+ true);
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.max_table_files_size,
+ 1024);
+ ASSERT_EQ(dbfull()->GetOptions().ttl, 60);
+
+ // Put ttl as the first option inside compaction_options_fifo. That works as
+ // it doesn't overwrite any other option.
+ ASSERT_OK(dbfull()->SetOptions(
+ {{"compaction_options_fifo",
+ "{ttl=985;allow_compaction=true;max_table_files_size=1024;}"},
+ {"ttl", "191"}}));
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.allow_compaction,
+ true);
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.max_table_files_size,
+ 1024);
+ ASSERT_EQ(dbfull()->GetOptions().ttl, 191);
+}
+
+TEST_F(DBOptionsTest, ChangeCompression) {
+ if (!Snappy_Supported() || !LZ4_Supported()) {
+ return;
+ }
+ Options options;
+ options.write_buffer_size = 10 << 10; // 10KB
+ options.level0_file_num_compaction_trigger = 2;
+ options.create_if_missing = true;
+ options.compression = CompressionType::kLZ4Compression;
+ options.bottommost_compression = CompressionType::kNoCompression;
+ options.bottommost_compression_opts.level = 2;
+ options.bottommost_compression_opts.parallel_threads = 1;
+ options.env = CurrentOptions().env;
+
+ ASSERT_OK(TryReopen(options));
+
+ CompressionType compression_used = CompressionType::kLZ4Compression;
+ CompressionOptions compression_opt_used;
+ bool compacted = false;
+ SyncPoint::GetInstance()->SetCallBack(
+ "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
+ Compaction* c = reinterpret_cast<Compaction*>(arg);
+ compression_used = c->output_compression();
+ compression_opt_used = c->output_compression_opts();
+ compacted = true;
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(Put("foo", "foofoofoo"));
+ ASSERT_OK(Put("bar", "foofoofoo"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("foo", "foofoofoo"));
+ ASSERT_OK(Put("bar", "foofoofoo"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_TRUE(compacted);
+ ASSERT_EQ(CompressionType::kNoCompression, compression_used);
+ ASSERT_EQ(options.compression_opts.level, compression_opt_used.level);
+ ASSERT_EQ(options.compression_opts.parallel_threads,
+ compression_opt_used.parallel_threads);
+
+ compression_used = CompressionType::kLZ4Compression;
+ compacted = false;
+ ASSERT_OK(dbfull()->SetOptions(
+ {{"bottommost_compression", "kSnappyCompression"},
+ {"bottommost_compression_opts", "0:6:0:0:4:true"}}));
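+  // The colon-separated string assigns CompressionOptions fields positionally;
+  // the "6" sets the compression level and the trailing "true" marks the
+  // options enabled, as checked below.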
+ ASSERT_OK(Put("foo", "foofoofoo"));
+ ASSERT_OK(Put("bar", "foofoofoo"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("foo", "foofoofoo"));
+ ASSERT_OK(Put("bar", "foofoofoo"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_TRUE(compacted);
+ ASSERT_EQ(CompressionType::kSnappyCompression, compression_used);
+ ASSERT_EQ(6, compression_opt_used.level);
+  // Right now parallel_threads is not yet allowed to be changed.
+
+ SyncPoint::GetInstance()->DisableProcessing();
+}
+
+#endif // ROCKSDB_LITE
+
+TEST_F(DBOptionsTest, BottommostCompressionOptsWithFallbackType) {
+ // Verify the bottommost compression options still take effect even when the
+ // bottommost compression type is left at its default value. Verify for both
+ // automatic and manual compaction.
+ if (!Snappy_Supported() || !LZ4_Supported()) {
+ return;
+ }
+
+ constexpr int kUpperCompressionLevel = 1;
+ constexpr int kBottommostCompressionLevel = 2;
+ constexpr int kNumL0Files = 2;
+
+ Options options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = kNumL0Files;
+ options.compression = CompressionType::kLZ4Compression;
+ options.compression_opts.level = kUpperCompressionLevel;
+ options.bottommost_compression_opts.level = kBottommostCompressionLevel;
+ options.bottommost_compression_opts.enabled = true;
+ Reopen(options);
+
+ CompressionType compression_used = CompressionType::kDisableCompressionOption;
+ CompressionOptions compression_opt_used;
+ bool compacted = false;
+ SyncPoint::GetInstance()->SetCallBack(
+ "CompactionPicker::RegisterCompaction:Registered", [&](void* arg) {
+ Compaction* c = static_cast<Compaction*>(arg);
+ compression_used = c->output_compression();
+ compression_opt_used = c->output_compression_opts();
+ compacted = true;
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ // First, verify for automatic compaction.
+ for (int i = 0; i < kNumL0Files; ++i) {
+ ASSERT_OK(Put("foo", "foofoofoo"));
+ ASSERT_OK(Put("bar", "foofoofoo"));
+ ASSERT_OK(Flush());
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ ASSERT_TRUE(compacted);
+ ASSERT_EQ(CompressionType::kLZ4Compression, compression_used);
+ ASSERT_EQ(kBottommostCompressionLevel, compression_opt_used.level);
+
+ // Second, verify for manual compaction.
+ compacted = false;
+ compression_used = CompressionType::kDisableCompressionOption;
+ compression_opt_used = CompressionOptions();
+ CompactRangeOptions cro;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForceOptimized;
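+  // kForceOptimized forces the bottommost level to be recompacted, so the
+  // registered-compaction callback also fires for this manual compaction.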
+ ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr));
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+
+ ASSERT_TRUE(compacted);
+ ASSERT_EQ(CompressionType::kLZ4Compression, compression_used);
+ ASSERT_EQ(kBottommostCompressionLevel, compression_opt_used.level);
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_properties_test.cc b/src/rocksdb/db/db_properties_test.cc
new file mode 100644
index 000000000..85cd5c04e
--- /dev/null
+++ b/src/rocksdb/db/db_properties_test.cc
@@ -0,0 +1,2206 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <stdio.h>
+
+#include <algorithm>
+#include <string>
+
+#include "db/db_test_util.h"
+#include "options/cf_options.h"
+#include "port/stack_trace.h"
+#include "rocksdb/listener.h"
+#include "rocksdb/options.h"
+#include "rocksdb/perf_context.h"
+#include "rocksdb/perf_level.h"
+#include "rocksdb/table.h"
+#include "table/block_based/block.h"
+#include "table/format.h"
+#include "table/meta_blocks.h"
+#include "table/table_builder.h"
+#include "test_util/mock_time_env.h"
+#include "util/random.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBPropertiesTest : public DBTestBase {
+ public:
+ DBPropertiesTest()
+ : DBTestBase("db_properties_test", /*env_do_fsync=*/false) {}
+
+ void AssertDbStats(const std::map<std::string, std::string>& db_stats,
+ double expected_uptime, int expected_user_bytes_written,
+ int expected_wal_bytes_written,
+ int expected_user_writes_by_self,
+ int expected_user_writes_with_wal) {
+ ASSERT_EQ(std::to_string(expected_uptime), db_stats.at("db.uptime"));
+ ASSERT_EQ(std::to_string(expected_wal_bytes_written),
+ db_stats.at("db.wal_bytes_written"));
+ ASSERT_EQ("0", db_stats.at("db.wal_syncs"));
+ ASSERT_EQ(std::to_string(expected_user_bytes_written),
+ db_stats.at("db.user_bytes_written"));
+ ASSERT_EQ("0", db_stats.at("db.user_writes_by_other"));
+ ASSERT_EQ(std::to_string(expected_user_writes_by_self),
+ db_stats.at("db.user_writes_by_self"));
+ ASSERT_EQ(std::to_string(expected_user_writes_with_wal),
+ db_stats.at("db.user_writes_with_wal"));
+ ASSERT_EQ("0", db_stats.at("db.user_write_stall_micros"));
+ }
+};
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBPropertiesTest, Empty) {
+ do {
+ Options options;
+ options.env = env_;
+ options.write_buffer_size = 100000; // Small write buffer
+ options.allow_concurrent_memtable_write = false;
+ options = CurrentOptions(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ std::string num;
+ ASSERT_TRUE(dbfull()->GetProperty(
+ handles_[1], "rocksdb.num-entries-active-mem-table", &num));
+ ASSERT_EQ("0", num);
+
+ ASSERT_OK(Put(1, "foo", "v1"));
+ ASSERT_EQ("v1", Get(1, "foo"));
+ ASSERT_TRUE(dbfull()->GetProperty(
+ handles_[1], "rocksdb.num-entries-active-mem-table", &num));
+ ASSERT_EQ("1", num);
+
+ // Block sync calls
+ env_->delay_sstable_sync_.store(true, std::memory_order_release);
+ ASSERT_OK(Put(1, "k1", std::string(100000, 'x'))); // Fill memtable
+ ASSERT_TRUE(dbfull()->GetProperty(
+ handles_[1], "rocksdb.num-entries-active-mem-table", &num));
+ ASSERT_EQ("2", num);
+
+ ASSERT_OK(Put(1, "k2", std::string(100000, 'y'))); // Trigger compaction
+ ASSERT_TRUE(dbfull()->GetProperty(
+ handles_[1], "rocksdb.num-entries-active-mem-table", &num));
+ ASSERT_EQ("1", num);
+
+ ASSERT_EQ("v1", Get(1, "foo"));
+ // Release sync calls
+ env_->delay_sstable_sync_.store(false, std::memory_order_release);
+
+ ASSERT_OK(db_->DisableFileDeletions());
+ ASSERT_TRUE(
+ dbfull()->GetProperty("rocksdb.is-file-deletions-enabled", &num));
+ ASSERT_EQ("0", num);
+
+ ASSERT_OK(db_->DisableFileDeletions());
+ ASSERT_TRUE(
+ dbfull()->GetProperty("rocksdb.is-file-deletions-enabled", &num));
+ ASSERT_EQ("0", num);
+
+ ASSERT_OK(db_->DisableFileDeletions());
+ ASSERT_TRUE(
+ dbfull()->GetProperty("rocksdb.is-file-deletions-enabled", &num));
+ ASSERT_EQ("0", num);
+
+ ASSERT_OK(db_->EnableFileDeletions(false));
+ ASSERT_TRUE(
+ dbfull()->GetProperty("rocksdb.is-file-deletions-enabled", &num));
+ ASSERT_EQ("0", num);
+
+ ASSERT_OK(db_->EnableFileDeletions());
+ ASSERT_TRUE(
+ dbfull()->GetProperty("rocksdb.is-file-deletions-enabled", &num));
+ ASSERT_EQ("1", num);
+ } while (ChangeOptions());
+}
+
+TEST_F(DBPropertiesTest, CurrentVersionNumber) {
+ uint64_t v1, v2, v3;
+ ASSERT_TRUE(
+ dbfull()->GetIntProperty("rocksdb.current-super-version-number", &v1));
+ ASSERT_OK(Put("12345678", ""));
+ ASSERT_TRUE(
+ dbfull()->GetIntProperty("rocksdb.current-super-version-number", &v2));
+ ASSERT_OK(Flush());
+ ASSERT_TRUE(
+ dbfull()->GetIntProperty("rocksdb.current-super-version-number", &v3));
+
+ ASSERT_EQ(v1, v2);
+ ASSERT_GT(v3, v2);
+}
+
+TEST_F(DBPropertiesTest, GetAggregatedIntPropertyTest) {
+ const int kKeySize = 100;
+ const int kValueSize = 500;
+ const int kKeyNum = 100;
+
+ Options options;
+ options.env = env_;
+ options.create_if_missing = true;
+ options.write_buffer_size = (kKeySize + kValueSize) * kKeyNum / 10;
+ // Make them never flush
+ options.min_write_buffer_number_to_merge = 1000;
+ options.max_write_buffer_number = 1000;
+ options = CurrentOptions(options);
+ CreateAndReopenWithCF({"one", "two", "three", "four"}, options);
+
+ Random rnd(301);
+ for (auto* handle : handles_) {
+ for (int i = 0; i < kKeyNum; ++i) {
+ ASSERT_OK(db_->Put(WriteOptions(), handle, rnd.RandomString(kKeySize),
+ rnd.RandomString(kValueSize)));
+ }
+ }
+
+ uint64_t manual_sum = 0;
+ uint64_t api_sum = 0;
+ uint64_t value = 0;
+ for (auto* handle : handles_) {
+ ASSERT_TRUE(
+ db_->GetIntProperty(handle, DB::Properties::kSizeAllMemTables, &value));
+ manual_sum += value;
+ }
+ ASSERT_TRUE(db_->GetAggregatedIntProperty(DB::Properties::kSizeAllMemTables,
+ &api_sum));
+ ASSERT_GT(manual_sum, 0);
+ ASSERT_EQ(manual_sum, api_sum);
+
+ ASSERT_FALSE(db_->GetAggregatedIntProperty(DB::Properties::kDBStats, &value));
+
+ uint64_t before_flush_trm;
+ uint64_t after_flush_trm;
+ for (auto* handle : handles_) {
+ ASSERT_TRUE(db_->GetAggregatedIntProperty(
+ DB::Properties::kEstimateTableReadersMem, &before_flush_trm));
+
+ // Issue flush and expect larger memory usage of table readers.
+ ASSERT_OK(db_->Flush(FlushOptions(), handle));
+
+ ASSERT_TRUE(db_->GetAggregatedIntProperty(
+ DB::Properties::kEstimateTableReadersMem, &after_flush_trm));
+ ASSERT_GT(after_flush_trm, before_flush_trm);
+ }
+}
+
+namespace {
+void ResetTableProperties(TableProperties* tp) {
+ tp->data_size = 0;
+ tp->index_size = 0;
+ tp->filter_size = 0;
+ tp->raw_key_size = 0;
+ tp->raw_value_size = 0;
+ tp->num_data_blocks = 0;
+ tp->num_entries = 0;
+ tp->num_deletions = 0;
+ tp->num_merge_operands = 0;
+ tp->num_range_deletions = 0;
+}
+
+void ParseTablePropertiesString(std::string tp_string, TableProperties* tp) {
+ double dummy_double;
+ std::replace(tp_string.begin(), tp_string.end(), ';', ' ');
+ std::replace(tp_string.begin(), tp_string.end(), '=', ' ');
+ ResetTableProperties(tp);
+ sscanf(tp_string.c_str(),
+ "# data blocks %" SCNu64 " # entries %" SCNu64 " # deletions %" SCNu64
+ " # merge operands %" SCNu64 " # range deletions %" SCNu64
+ " raw key size %" SCNu64
+ " raw average key size %lf "
+ " raw value size %" SCNu64
+ " raw average value size %lf "
+ " data block size %" SCNu64 " index block size (user-key? %" SCNu64
+ ", delta-value? %" SCNu64 ") %" SCNu64 " filter block size %" SCNu64,
+ &tp->num_data_blocks, &tp->num_entries, &tp->num_deletions,
+ &tp->num_merge_operands, &tp->num_range_deletions, &tp->raw_key_size,
+ &dummy_double, &tp->raw_value_size, &dummy_double, &tp->data_size,
+ &tp->index_key_is_user_key, &tp->index_value_is_delta_encoded,
+ &tp->index_size, &tp->filter_size);
+}
+
+void VerifySimilar(uint64_t a, uint64_t b, double bias) {
+ ASSERT_EQ(a == 0U, b == 0U);
+ if (a == 0) {
+ return;
+ }
+ double dbl_a = static_cast<double>(a);
+ double dbl_b = static_cast<double>(b);
+ if (dbl_a > dbl_b) {
+ ASSERT_LT(static_cast<double>(dbl_a - dbl_b) / (dbl_a + dbl_b), bias);
+ } else {
+ ASSERT_LT(static_cast<double>(dbl_b - dbl_a) / (dbl_a + dbl_b), bias);
+ }
+}
+
+void VerifyTableProperties(
+ const TableProperties& base_tp, const TableProperties& new_tp,
+ double filter_size_bias = CACHE_LINE_SIZE >= 256 ? 0.18 : 0.1,
+ double index_size_bias = 0.1, double data_size_bias = 0.1,
+ double num_data_blocks_bias = 0.05) {
+ VerifySimilar(base_tp.data_size, new_tp.data_size, data_size_bias);
+ VerifySimilar(base_tp.index_size, new_tp.index_size, index_size_bias);
+ VerifySimilar(base_tp.filter_size, new_tp.filter_size, filter_size_bias);
+ VerifySimilar(base_tp.num_data_blocks, new_tp.num_data_blocks,
+ num_data_blocks_bias);
+
+ ASSERT_EQ(base_tp.raw_key_size, new_tp.raw_key_size);
+ ASSERT_EQ(base_tp.raw_value_size, new_tp.raw_value_size);
+ ASSERT_EQ(base_tp.num_entries, new_tp.num_entries);
+ ASSERT_EQ(base_tp.num_deletions, new_tp.num_deletions);
+ ASSERT_EQ(base_tp.num_range_deletions, new_tp.num_range_deletions);
+
+  // Merge operands may become Puts, so we only have an upper bound on the
+  // exact number of merge operands.
+ ASSERT_GE(base_tp.num_merge_operands, new_tp.num_merge_operands);
+}
+
+void GetExpectedTableProperties(
+ TableProperties* expected_tp, const int kKeySize, const int kValueSize,
+ const int kPutsPerTable, const int kDeletionsPerTable,
+ const int kMergeOperandsPerTable, const int kRangeDeletionsPerTable,
+ const int kTableCount, const int kBloomBitsPerKey, const size_t kBlockSize,
+ const bool index_key_is_user_key, const bool value_delta_encoding) {
+ const int kKeysPerTable =
+ kPutsPerTable + kDeletionsPerTable + kMergeOperandsPerTable;
+ const int kPutCount = kTableCount * kPutsPerTable;
+ const int kDeletionCount = kTableCount * kDeletionsPerTable;
+ const int kMergeCount = kTableCount * kMergeOperandsPerTable;
+ const int kRangeDeletionCount = kTableCount * kRangeDeletionsPerTable;
+ const int kKeyCount =
+ kPutCount + kDeletionCount + kMergeCount + kRangeDeletionCount;
+ const int kAvgSuccessorSize = kKeySize / 5;
+ const int kEncodingSavePerKey = kKeySize / 4;
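+  // Each internal key carries an 8-byte footer (packed sequence number and
+  // value type), hence the "+ 8" terms below.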
+ expected_tp->raw_key_size = kKeyCount * (kKeySize + 8);
+ expected_tp->raw_value_size =
+ (kPutCount + kMergeCount + kRangeDeletionCount) * kValueSize;
+ expected_tp->num_entries = kKeyCount;
+ expected_tp->num_deletions = kDeletionCount + kRangeDeletionCount;
+ expected_tp->num_merge_operands = kMergeCount;
+ expected_tp->num_range_deletions = kRangeDeletionCount;
+ expected_tp->num_data_blocks =
+ kTableCount *
+ (kKeysPerTable * (kKeySize - kEncodingSavePerKey + kValueSize)) /
+ kBlockSize;
+ expected_tp->data_size =
+ kTableCount * (kKeysPerTable * (kKeySize + 8 + kValueSize));
+ expected_tp->index_size =
+ expected_tp->num_data_blocks *
+ (kAvgSuccessorSize + (index_key_is_user_key ? 0 : 8) -
+ // discount 1 byte as value size is not encoded in value delta encoding
+ (value_delta_encoding ? 1 : 0));
+ expected_tp->filter_size =
+ kTableCount * ((kKeysPerTable * kBloomBitsPerKey + 7) / 8 +
+ /*average-ish overhead*/ CACHE_LINE_SIZE / 2);
+}
+} // anonymous namespace
+
+TEST_F(DBPropertiesTest, ValidatePropertyInfo) {
+ for (const auto& ppt_name_and_info : InternalStats::ppt_name_to_info) {
+ // If C++ gets a std::string_literal, this would be better to check at
+ // compile-time using static_assert.
+ ASSERT_TRUE(ppt_name_and_info.first.empty() ||
+ !isdigit(ppt_name_and_info.first.back()));
+
+ int count = 0;
+ count += (ppt_name_and_info.second.handle_string == nullptr) ? 0 : 1;
+ count += (ppt_name_and_info.second.handle_int == nullptr) ? 0 : 1;
+ count += (ppt_name_and_info.second.handle_string_dbimpl == nullptr) ? 0 : 1;
+ ASSERT_TRUE(count == 1);
+ }
+}
+
+TEST_F(DBPropertiesTest, ValidateSampleNumber) {
+ // When "max_open_files" is -1, we read all the files for
+ // "rocksdb.estimate-num-keys" computation, which is the ground truth.
+  // Otherwise, we sample the 20 newest files to make an estimate.
+  // Formula: latest_20_files_active_key_ratio * total_files
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.level0_stop_writes_trigger = 1000;
+ DestroyAndReopen(options);
+ int key = 0;
+ for (int files = 20; files >= 10; files -= 10) {
+ for (int i = 0; i < files; i++) {
+ int rows = files / 10;
+ for (int j = 0; j < rows; j++) {
+ ASSERT_OK(db_->Put(WriteOptions(), std::to_string(++key), "foo"));
+ }
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ }
+ }
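+  // 30 files hold 50 keys in total. The 20 newest files are the 10 single-key
+  // files plus 10 two-key files, i.e. 30 keys across 20 files, so the sampled
+  // estimate is 1.5 keys/file * 30 files = 45.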
+ std::string num;
+ Reopen(options);
+ ASSERT_TRUE(dbfull()->GetProperty("rocksdb.estimate-num-keys", &num));
+ ASSERT_EQ("45", num);
+ options.max_open_files = -1;
+ Reopen(options);
+ ASSERT_TRUE(dbfull()->GetProperty("rocksdb.estimate-num-keys", &num));
+ ASSERT_EQ("50", num);
+}
+
+TEST_F(DBPropertiesTest, AggregatedTableProperties) {
+ for (int kTableCount = 40; kTableCount <= 100; kTableCount += 30) {
+ const int kDeletionsPerTable = 0;
+ const int kMergeOperandsPerTable = 15;
+ const int kRangeDeletionsPerTable = 5;
+ const int kPutsPerTable = 100;
+ const int kKeySize = 80;
+ const int kValueSize = 200;
+ const int kBloomBitsPerKey = 20;
+
+ Options options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = 8;
+ options.compression = kNoCompression;
+ options.create_if_missing = true;
+ options.merge_operator.reset(new TestPutOperator());
+
+ BlockBasedTableOptions table_options;
+ table_options.filter_policy.reset(
+ NewBloomFilterPolicy(kBloomBitsPerKey, false));
+ table_options.block_size = 1024;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ DestroyAndReopen(options);
+
+ // Hold open a snapshot to prevent range tombstones from being compacted
+ // away.
+ ManagedSnapshot snapshot(db_);
+
+ Random rnd(5632);
+ for (int table = 1; table <= kTableCount; ++table) {
+ for (int i = 0; i < kPutsPerTable; ++i) {
+ ASSERT_OK(db_->Put(WriteOptions(), rnd.RandomString(kKeySize),
+ rnd.RandomString(kValueSize)));
+ }
+ for (int i = 0; i < kDeletionsPerTable; i++) {
+ ASSERT_OK(db_->Delete(WriteOptions(), rnd.RandomString(kKeySize)));
+ }
+ for (int i = 0; i < kMergeOperandsPerTable; i++) {
+ ASSERT_OK(db_->Merge(WriteOptions(), rnd.RandomString(kKeySize),
+ rnd.RandomString(kValueSize)));
+ }
+ for (int i = 0; i < kRangeDeletionsPerTable; i++) {
+ std::string start = rnd.RandomString(kKeySize);
+ std::string end = start;
+ end.resize(kValueSize);
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ start, end));
+ }
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ }
+ std::string property;
+ db_->GetProperty(DB::Properties::kAggregatedTableProperties, &property);
+ TableProperties output_tp;
+ ParseTablePropertiesString(property, &output_tp);
+ bool index_key_is_user_key = output_tp.index_key_is_user_key > 0;
+ bool value_is_delta_encoded = output_tp.index_value_is_delta_encoded > 0;
+
+ TableProperties expected_tp;
+ GetExpectedTableProperties(
+ &expected_tp, kKeySize, kValueSize, kPutsPerTable, kDeletionsPerTable,
+ kMergeOperandsPerTable, kRangeDeletionsPerTable, kTableCount,
+ kBloomBitsPerKey, table_options.block_size, index_key_is_user_key,
+ value_is_delta_encoded);
+
+ VerifyTableProperties(expected_tp, output_tp);
+ }
+}
+
+TEST_F(DBPropertiesTest, ReadLatencyHistogramByLevel) {
+ Options options = CurrentOptions();
+ options.write_buffer_size = 110 << 10;
+ options.level0_file_num_compaction_trigger = 6;
+ options.num_levels = 4;
+ options.compression = kNoCompression;
+ options.max_bytes_for_level_base = 4500 << 10;
+ options.target_file_size_base = 98 << 10;
+ options.max_write_buffer_number = 2;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  options.max_open_files = 11;  // Make sure no preloading of table readers
+
+  // RocksDB sanitizes max_open_files to at least 20. Modify it back.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) {
+ int* max_open_files = static_cast<int*>(arg);
+ *max_open_files = 11;
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ BlockBasedTableOptions table_options;
+ table_options.no_block_cache = true;
+
+ CreateAndReopenWithCF({"pikachu"}, options);
+ int key_index = 0;
+ Random rnd(301);
+ for (int num = 0; num < 8; num++) {
+ ASSERT_OK(Put("foo", "bar"));
+ GenerateNewFile(&rnd, &key_index);
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ std::string prop;
+ ASSERT_TRUE(dbfull()->GetProperty("rocksdb.dbstats", &prop));
+
+  // Get() after flushes. See that the latency histogram is tracked.
+ for (int key = 0; key < key_index; key++) {
+ Get(Key(key));
+ }
+ ASSERT_TRUE(dbfull()->GetProperty("rocksdb.cfstats", &prop));
+ ASSERT_NE(std::string::npos, prop.find("** Level 0 read latency histogram"));
+ ASSERT_NE(std::string::npos, prop.find("** Level 1 read latency histogram"));
+ ASSERT_EQ(std::string::npos, prop.find("** Level 2 read latency histogram"));
+
+  // Reopen and issue Get(). See that the latency is tracked.
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ for (int key = 0; key < key_index; key++) {
+ Get(Key(key));
+ }
+
+ // Test for getting immutable_db_options_.statistics
+ ASSERT_TRUE(dbfull()->GetProperty(dbfull()->DefaultColumnFamily(),
+ "rocksdb.options-statistics", &prop));
+ ASSERT_NE(std::string::npos, prop.find("rocksdb.block.cache.miss"));
+ ASSERT_EQ(std::string::npos, prop.find("rocksdb.db.f.micros"));
+
+ ASSERT_TRUE(dbfull()->GetProperty(dbfull()->DefaultColumnFamily(),
+ "rocksdb.cf-file-histogram", &prop));
+ ASSERT_NE(std::string::npos, prop.find("** Level 0 read latency histogram"));
+ ASSERT_NE(std::string::npos, prop.find("** Level 1 read latency histogram"));
+ ASSERT_EQ(std::string::npos, prop.find("** Level 2 read latency histogram"));
+
+  // Reopen and iterate. See that the latency is tracked.
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ASSERT_TRUE(dbfull()->GetProperty("rocksdb.cf-file-histogram", &prop));
+ ASSERT_EQ(std::string::npos, prop.find("** Level 0 read latency histogram"));
+ ASSERT_EQ(std::string::npos, prop.find("** Level 1 read latency histogram"));
+ ASSERT_EQ(std::string::npos, prop.find("** Level 2 read latency histogram"));
+ {
+ std::unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions()));
+ for (iter->Seek(Key(0)); iter->Valid(); iter->Next()) {
+ }
+ ASSERT_OK(iter->status());
+ }
+ ASSERT_TRUE(dbfull()->GetProperty("rocksdb.cf-file-histogram", &prop));
+ ASSERT_NE(std::string::npos, prop.find("** Level 0 read latency histogram"));
+ ASSERT_NE(std::string::npos, prop.find("** Level 1 read latency histogram"));
+ ASSERT_EQ(std::string::npos, prop.find("** Level 2 read latency histogram"));
+
+ // CF 1 should show no histogram.
+ ASSERT_TRUE(
+ dbfull()->GetProperty(handles_[1], "rocksdb.cf-file-histogram", &prop));
+ ASSERT_EQ(std::string::npos, prop.find("** Level 0 read latency histogram"));
+ ASSERT_EQ(std::string::npos, prop.find("** Level 1 read latency histogram"));
+ ASSERT_EQ(std::string::npos, prop.find("** Level 2 read latency histogram"));
+  // Put something and read it back; CF 1 should now show a histogram.
+ ASSERT_OK(Put(1, "foo", "bar"));
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ("bar", Get(1, "foo"));
+
+ ASSERT_TRUE(
+ dbfull()->GetProperty(handles_[1], "rocksdb.cf-file-histogram", &prop));
+ ASSERT_NE(std::string::npos, prop.find("** Level 0 read latency histogram"));
+ ASSERT_EQ(std::string::npos, prop.find("** Level 1 read latency histogram"));
+ ASSERT_EQ(std::string::npos, prop.find("** Level 2 read latency histogram"));
+
+  // Setting options.max_open_files to -1 preloads table readers on open.
+ options.max_open_files = -1;
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ ASSERT_TRUE(dbfull()->GetProperty(dbfull()->DefaultColumnFamily(),
+ "rocksdb.cf-file-histogram", &prop));
+ ASSERT_NE(std::string::npos, prop.find("** Level 0 read latency histogram"));
+ ASSERT_NE(std::string::npos, prop.find("** Level 1 read latency histogram"));
+ ASSERT_EQ(std::string::npos, prop.find("** Level 2 read latency histogram"));
+ for (int key = 0; key < key_index; key++) {
+ Get(Key(key));
+ }
+ ASSERT_TRUE(dbfull()->GetProperty("rocksdb.cfstats", &prop));
+ ASSERT_NE(std::string::npos, prop.find("** Level 0 read latency histogram"));
+ ASSERT_NE(std::string::npos, prop.find("** Level 1 read latency histogram"));
+ ASSERT_EQ(std::string::npos, prop.find("** Level 2 read latency histogram"));
+
+ // Clear internal stats
+ ASSERT_OK(dbfull()->ResetStats());
+ ASSERT_TRUE(dbfull()->GetProperty("rocksdb.cfstats", &prop));
+ ASSERT_EQ(std::string::npos, prop.find("** Level 0 read latency histogram"));
+ ASSERT_EQ(std::string::npos, prop.find("** Level 1 read latency histogram"));
+ ASSERT_EQ(std::string::npos, prop.find("** Level 2 read latency histogram"));
+}
+
+TEST_F(DBPropertiesTest, AggregatedTablePropertiesAtLevel) {
+ const int kTableCount = 100;
+ const int kDeletionsPerTable = 0;
+ const int kMergeOperandsPerTable = 2;
+ const int kRangeDeletionsPerTable = 2;
+ const int kPutsPerTable = 10;
+ const int kKeySize = 50;
+ const int kValueSize = 400;
+ const int kMaxLevel = 7;
+ const int kBloomBitsPerKey = 20;
+ Random rnd(301);
+ Options options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = 8;
+ options.compression = kNoCompression;
+ options.create_if_missing = true;
+ options.level0_file_num_compaction_trigger = 2;
+ options.target_file_size_base = 8192;
+ options.max_bytes_for_level_base = 10000;
+ options.max_bytes_for_level_multiplier = 2;
+  // This ensures no compaction is happening when we call GetProperty().
+ options.disable_auto_compactions = true;
+ options.merge_operator.reset(new TestPutOperator());
+
+ BlockBasedTableOptions table_options;
+ table_options.filter_policy.reset(
+ NewBloomFilterPolicy(kBloomBitsPerKey, false));
+ table_options.block_size = 1024;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ DestroyAndReopen(options);
+
+ // Hold open a snapshot to prevent range tombstones from being compacted away.
+ ManagedSnapshot snapshot(db_);
+
+ std::string level_tp_strings[kMaxLevel];
+ std::string tp_string;
+ TableProperties level_tps[kMaxLevel];
+ TableProperties tp, sum_tp, expected_tp;
+ for (int table = 1; table <= kTableCount; ++table) {
+ for (int i = 0; i < kPutsPerTable; ++i) {
+ ASSERT_OK(db_->Put(WriteOptions(), rnd.RandomString(kKeySize),
+ rnd.RandomString(kValueSize)));
+ }
+ for (int i = 0; i < kDeletionsPerTable; i++) {
+ ASSERT_OK(db_->Delete(WriteOptions(), rnd.RandomString(kKeySize)));
+ }
+ for (int i = 0; i < kMergeOperandsPerTable; i++) {
+ ASSERT_OK(db_->Merge(WriteOptions(), rnd.RandomString(kKeySize),
+ rnd.RandomString(kValueSize)));
+ }
+ for (int i = 0; i < kRangeDeletionsPerTable; i++) {
+ std::string start = rnd.RandomString(kKeySize);
+ std::string end = start;
+ end.resize(kValueSize);
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ start, end));
+ }
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ResetTableProperties(&sum_tp);
+ for (int level = 0; level < kMaxLevel; ++level) {
+ db_->GetProperty(DB::Properties::kAggregatedTablePropertiesAtLevel +
+ std::to_string(level),
+ &level_tp_strings[level]);
+ ParseTablePropertiesString(level_tp_strings[level], &level_tps[level]);
+ sum_tp.data_size += level_tps[level].data_size;
+ sum_tp.index_size += level_tps[level].index_size;
+ sum_tp.filter_size += level_tps[level].filter_size;
+ sum_tp.raw_key_size += level_tps[level].raw_key_size;
+ sum_tp.raw_value_size += level_tps[level].raw_value_size;
+ sum_tp.num_data_blocks += level_tps[level].num_data_blocks;
+ sum_tp.num_entries += level_tps[level].num_entries;
+ sum_tp.num_deletions += level_tps[level].num_deletions;
+ sum_tp.num_merge_operands += level_tps[level].num_merge_operands;
+ sum_tp.num_range_deletions += level_tps[level].num_range_deletions;
+ }
+ db_->GetProperty(DB::Properties::kAggregatedTableProperties, &tp_string);
+ ParseTablePropertiesString(tp_string, &tp);
+ bool index_key_is_user_key = tp.index_key_is_user_key > 0;
+ bool value_is_delta_encoded = tp.index_value_is_delta_encoded > 0;
+ ASSERT_EQ(sum_tp.data_size, tp.data_size);
+ ASSERT_EQ(sum_tp.index_size, tp.index_size);
+ ASSERT_EQ(sum_tp.filter_size, tp.filter_size);
+ ASSERT_EQ(sum_tp.raw_key_size, tp.raw_key_size);
+ ASSERT_EQ(sum_tp.raw_value_size, tp.raw_value_size);
+ ASSERT_EQ(sum_tp.num_data_blocks, tp.num_data_blocks);
+ ASSERT_EQ(sum_tp.num_entries, tp.num_entries);
+ ASSERT_EQ(sum_tp.num_deletions, tp.num_deletions);
+ ASSERT_EQ(sum_tp.num_merge_operands, tp.num_merge_operands);
+ ASSERT_EQ(sum_tp.num_range_deletions, tp.num_range_deletions);
+ if (table > 3) {
+ GetExpectedTableProperties(
+ &expected_tp, kKeySize, kValueSize, kPutsPerTable, kDeletionsPerTable,
+ kMergeOperandsPerTable, kRangeDeletionsPerTable, table,
+ kBloomBitsPerKey, table_options.block_size, index_key_is_user_key,
+ value_is_delta_encoded);
+ // Gives larger bias here as index block size, filter block size,
+ // and data block size become much harder to estimate in this test.
+ VerifyTableProperties(expected_tp, tp, CACHE_LINE_SIZE >= 256 ? 0.6 : 0.5,
+ 0.5, 0.5, 0.25);
+ }
+ }
+}
+
+TEST_F(DBPropertiesTest, NumImmutableMemTable) {
+ do {
+ Options options = CurrentOptions();
+ WriteOptions writeOpt = WriteOptions();
+ writeOpt.disableWAL = true;
+ options.max_write_buffer_number = 4;
+ options.min_write_buffer_number_to_merge = 3;
+ options.write_buffer_size = 1000000;
+ options.max_write_buffer_size_to_maintain =
+ 5 * static_cast<int64_t>(options.write_buffer_size);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ std::string big_value(1000000 * 2, 'x');
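+    // Each value is twice write_buffer_size, so every Put fills the active
+    // memtable; with min_write_buffer_number_to_merge == 3, the resulting
+    // immutable memtables pile up instead of being flushed right away.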
+ std::string num;
+ uint64_t value;
+ SetPerfLevel(kEnableTime);
+ ASSERT_TRUE(GetPerfLevel() == kEnableTime);
+
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "k1", big_value));
+ ASSERT_TRUE(dbfull()->GetProperty(handles_[1],
+ "rocksdb.num-immutable-mem-table", &num));
+ ASSERT_EQ(num, "0");
+ ASSERT_TRUE(dbfull()->GetProperty(
+ handles_[1], DB::Properties::kNumImmutableMemTableFlushed, &num));
+ ASSERT_EQ(num, "0");
+ ASSERT_TRUE(dbfull()->GetProperty(
+ handles_[1], "rocksdb.num-entries-active-mem-table", &num));
+ ASSERT_EQ(num, "1");
+ get_perf_context()->Reset();
+ Get(1, "k1");
+ ASSERT_EQ(1, static_cast<int>(get_perf_context()->get_from_memtable_count));
+
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "k2", big_value));
+ ASSERT_TRUE(dbfull()->GetProperty(handles_[1],
+ "rocksdb.num-immutable-mem-table", &num));
+ ASSERT_EQ(num, "1");
+ ASSERT_TRUE(dbfull()->GetProperty(
+ handles_[1], "rocksdb.num-entries-active-mem-table", &num));
+ ASSERT_EQ(num, "1");
+ ASSERT_TRUE(dbfull()->GetProperty(
+ handles_[1], "rocksdb.num-entries-imm-mem-tables", &num));
+ ASSERT_EQ(num, "1");
+
+ get_perf_context()->Reset();
+ Get(1, "k1");
+ ASSERT_EQ(2, static_cast<int>(get_perf_context()->get_from_memtable_count));
+ get_perf_context()->Reset();
+ Get(1, "k2");
+ ASSERT_EQ(1, static_cast<int>(get_perf_context()->get_from_memtable_count));
+
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "k3", big_value));
+ ASSERT_TRUE(dbfull()->GetProperty(
+ handles_[1], "rocksdb.cur-size-active-mem-table", &num));
+ ASSERT_TRUE(dbfull()->GetProperty(handles_[1],
+ "rocksdb.num-immutable-mem-table", &num));
+ ASSERT_EQ(num, "2");
+ ASSERT_TRUE(dbfull()->GetProperty(
+ handles_[1], "rocksdb.num-entries-active-mem-table", &num));
+ ASSERT_EQ(num, "1");
+ ASSERT_TRUE(dbfull()->GetProperty(
+ handles_[1], "rocksdb.num-entries-imm-mem-tables", &num));
+ ASSERT_EQ(num, "2");
+ get_perf_context()->Reset();
+ Get(1, "k2");
+ ASSERT_EQ(2, static_cast<int>(get_perf_context()->get_from_memtable_count));
+ get_perf_context()->Reset();
+ Get(1, "k3");
+ ASSERT_EQ(1, static_cast<int>(get_perf_context()->get_from_memtable_count));
+ get_perf_context()->Reset();
+ Get(1, "k1");
+ ASSERT_EQ(3, static_cast<int>(get_perf_context()->get_from_memtable_count));
+
+ ASSERT_OK(Flush(1));
+ ASSERT_TRUE(dbfull()->GetProperty(handles_[1],
+ "rocksdb.num-immutable-mem-table", &num));
+ ASSERT_EQ(num, "0");
+ ASSERT_TRUE(dbfull()->GetProperty(
+ handles_[1], DB::Properties::kNumImmutableMemTableFlushed, &num));
+ ASSERT_EQ(num, "3");
+ ASSERT_TRUE(dbfull()->GetIntProperty(
+ handles_[1], "rocksdb.cur-size-active-mem-table", &value));
+    // "192" is the size of the metadata of two empty skiplists; this would
+    // break if we change the default skiplist implementation.
+ ASSERT_GE(value, 192);
+
+ uint64_t int_num;
+ uint64_t base_total_size;
+ ASSERT_TRUE(dbfull()->GetIntProperty(
+ handles_[1], "rocksdb.estimate-num-keys", &base_total_size));
+
+ ASSERT_OK(dbfull()->Delete(writeOpt, handles_[1], "k2"));
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "k3", ""));
+ ASSERT_OK(dbfull()->Delete(writeOpt, handles_[1], "k3"));
+ ASSERT_TRUE(dbfull()->GetIntProperty(
+ handles_[1], "rocksdb.num-deletes-active-mem-table", &int_num));
+ ASSERT_EQ(int_num, 2U);
+ ASSERT_TRUE(dbfull()->GetIntProperty(
+ handles_[1], "rocksdb.num-entries-active-mem-table", &int_num));
+ ASSERT_EQ(int_num, 3U);
+
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "k2", big_value));
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "k2", big_value));
+ ASSERT_TRUE(dbfull()->GetIntProperty(
+ handles_[1], "rocksdb.num-entries-imm-mem-tables", &int_num));
+ ASSERT_EQ(int_num, 4U);
+ ASSERT_TRUE(dbfull()->GetIntProperty(
+ handles_[1], "rocksdb.num-deletes-imm-mem-tables", &int_num));
+ ASSERT_EQ(int_num, 2U);
+
+ ASSERT_TRUE(dbfull()->GetIntProperty(
+ handles_[1], "rocksdb.estimate-num-keys", &int_num));
+ ASSERT_EQ(int_num, base_total_size + 1);
+
+ SetPerfLevel(kDisable);
+ ASSERT_TRUE(GetPerfLevel() == kDisable);
+ } while (ChangeCompactOptions());
+}
+
+// TODO(techdept) : Disabled flaky test #12863555
+TEST_F(DBPropertiesTest, DISABLED_GetProperty) {
+ // Set sizes to both background thread pool to be 1 and block them.
+ env_->SetBackgroundThreads(1, Env::HIGH);
+ env_->SetBackgroundThreads(1, Env::LOW);
+ test::SleepingBackgroundTask sleeping_task_low;
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+ Env::Priority::LOW);
+ test::SleepingBackgroundTask sleeping_task_high;
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
+ &sleeping_task_high, Env::Priority::HIGH);
+
+ Options options = CurrentOptions();
+ WriteOptions writeOpt = WriteOptions();
+ writeOpt.disableWAL = true;
+ options.compaction_style = kCompactionStyleUniversal;
+ options.level0_file_num_compaction_trigger = 1;
+ options.compaction_options_universal.size_ratio = 50;
+ options.max_background_compactions = 1;
+ options.max_background_flushes = 1;
+ options.max_write_buffer_number = 10;
+ options.min_write_buffer_number_to_merge = 1;
+ options.max_write_buffer_size_to_maintain = 0;
+ options.write_buffer_size = 1000000;
+ Reopen(options);
+
+ std::string big_value(1000000 * 2, 'x');
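+  // Each value is twice write_buffer_size, so every Put fills the active
+  // memtable, and the blocked HIGH-priority thread keeps the resulting
+  // immutable memtables from being flushed.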
+ std::string num;
+ uint64_t int_num;
+ SetPerfLevel(kEnableTime);
+
+ ASSERT_TRUE(
+ dbfull()->GetIntProperty("rocksdb.estimate-table-readers-mem", &int_num));
+ ASSERT_EQ(int_num, 0U);
+ ASSERT_TRUE(
+ dbfull()->GetIntProperty("rocksdb.estimate-live-data-size", &int_num));
+ ASSERT_EQ(int_num, 0U);
+
+ ASSERT_OK(dbfull()->Put(writeOpt, "k1", big_value));
+ ASSERT_TRUE(dbfull()->GetProperty("rocksdb.num-immutable-mem-table", &num));
+ ASSERT_EQ(num, "0");
+ ASSERT_TRUE(dbfull()->GetProperty("rocksdb.mem-table-flush-pending", &num));
+ ASSERT_EQ(num, "0");
+ ASSERT_TRUE(dbfull()->GetProperty("rocksdb.compaction-pending", &num));
+ ASSERT_EQ(num, "0");
+ ASSERT_TRUE(dbfull()->GetProperty("rocksdb.estimate-num-keys", &num));
+ ASSERT_EQ(num, "1");
+ get_perf_context()->Reset();
+
+ ASSERT_OK(dbfull()->Put(writeOpt, "k2", big_value));
+ ASSERT_TRUE(dbfull()->GetProperty("rocksdb.num-immutable-mem-table", &num));
+ ASSERT_EQ(num, "1");
+ ASSERT_OK(dbfull()->Delete(writeOpt, "k-non-existing"));
+ ASSERT_OK(dbfull()->Put(writeOpt, "k3", big_value));
+ ASSERT_TRUE(dbfull()->GetProperty("rocksdb.num-immutable-mem-table", &num));
+ ASSERT_EQ(num, "2");
+ ASSERT_TRUE(dbfull()->GetProperty("rocksdb.mem-table-flush-pending", &num));
+ ASSERT_EQ(num, "1");
+ ASSERT_TRUE(dbfull()->GetProperty("rocksdb.compaction-pending", &num));
+ ASSERT_EQ(num, "0");
+ ASSERT_TRUE(dbfull()->GetProperty("rocksdb.estimate-num-keys", &num));
+ ASSERT_EQ(num, "2");
+ // Verify the same set of properties through GetIntProperty
+ ASSERT_TRUE(
+ dbfull()->GetIntProperty("rocksdb.num-immutable-mem-table", &int_num));
+ ASSERT_EQ(int_num, 2U);
+ ASSERT_TRUE(
+ dbfull()->GetIntProperty("rocksdb.mem-table-flush-pending", &int_num));
+ ASSERT_EQ(int_num, 1U);
+ ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.compaction-pending", &int_num));
+ ASSERT_EQ(int_num, 0U);
+ ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.estimate-num-keys", &int_num));
+ ASSERT_EQ(int_num, 2U);
+
+ ASSERT_TRUE(
+ dbfull()->GetIntProperty("rocksdb.estimate-table-readers-mem", &int_num));
+ ASSERT_EQ(int_num, 0U);
+
+ sleeping_task_high.WakeUp();
+ sleeping_task_high.WaitUntilDone();
+ dbfull()->TEST_WaitForFlushMemTable();
+
+ ASSERT_OK(dbfull()->Put(writeOpt, "k4", big_value));
+ ASSERT_OK(dbfull()->Put(writeOpt, "k5", big_value));
+ dbfull()->TEST_WaitForFlushMemTable();
+ ASSERT_TRUE(dbfull()->GetProperty("rocksdb.mem-table-flush-pending", &num));
+ ASSERT_EQ(num, "0");
+ ASSERT_TRUE(dbfull()->GetProperty("rocksdb.compaction-pending", &num));
+ ASSERT_EQ(num, "1");
+ ASSERT_TRUE(dbfull()->GetProperty("rocksdb.estimate-num-keys", &num));
+ ASSERT_EQ(num, "4");
+
+ ASSERT_TRUE(
+ dbfull()->GetIntProperty("rocksdb.estimate-table-readers-mem", &int_num));
+ ASSERT_GT(int_num, 0U);
+
+ sleeping_task_low.WakeUp();
+ sleeping_task_low.WaitUntilDone();
+
+ // Wait for compaction to be done. This is important because otherwise RocksDB
+ // might schedule a compaction when reopening the database, failing assertion
+ // (A) as a result.
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ options.max_open_files = 10;
+ Reopen(options);
+  // After reopening, no table reader is loaded, so there is no memory used
+  // for table readers.
+ ASSERT_TRUE(
+ dbfull()->GetIntProperty("rocksdb.estimate-table-readers-mem", &int_num));
+ ASSERT_EQ(int_num, 0U); // (A)
+ ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.estimate-num-keys", &int_num));
+ ASSERT_GT(int_num, 0U);
+
+ // After reading a key, at least one table reader is loaded.
+ Get("k5");
+ ASSERT_TRUE(
+ dbfull()->GetIntProperty("rocksdb.estimate-table-readers-mem", &int_num));
+ ASSERT_GT(int_num, 0U);
+
+ // Test rocksdb.num-live-versions
+ {
+ options.level0_file_num_compaction_trigger = 20;
+ Reopen(options);
+ ASSERT_TRUE(
+ dbfull()->GetIntProperty("rocksdb.num-live-versions", &int_num));
+ ASSERT_EQ(int_num, 1U);
+
+ // Use an iterator to hold current version
+ std::unique_ptr<Iterator> iter1(dbfull()->NewIterator(ReadOptions()));
+
+ ASSERT_OK(dbfull()->Put(writeOpt, "k6", big_value));
+ ASSERT_OK(Flush());
+ ASSERT_TRUE(
+ dbfull()->GetIntProperty("rocksdb.num-live-versions", &int_num));
+ ASSERT_EQ(int_num, 2U);
+
+ // Use an iterator to hold current version
+ std::unique_ptr<Iterator> iter2(dbfull()->NewIterator(ReadOptions()));
+
+ ASSERT_OK(dbfull()->Put(writeOpt, "k7", big_value));
+ ASSERT_OK(Flush());
+ ASSERT_TRUE(
+ dbfull()->GetIntProperty("rocksdb.num-live-versions", &int_num));
+ ASSERT_EQ(int_num, 3U);
+
+ iter2.reset();
+ ASSERT_TRUE(
+ dbfull()->GetIntProperty("rocksdb.num-live-versions", &int_num));
+ ASSERT_EQ(int_num, 2U);
+
+ iter1.reset();
+ ASSERT_TRUE(
+ dbfull()->GetIntProperty("rocksdb.num-live-versions", &int_num));
+ ASSERT_EQ(int_num, 1U);
+ }
+}
+
+TEST_F(DBPropertiesTest, ApproximateMemoryUsage) {
+ const int kNumRounds = 10;
+ // TODO(noetzli) kFlushesPerRound does not really correlate with how many
+ // flushes happen.
+ const int kFlushesPerRound = 10;
+ const int kWritesPerFlush = 10;
+ const int kKeySize = 100;
+ const int kValueSize = 1000;
+ Options options;
+ options.write_buffer_size = 1000; // small write buffer
+ options.min_write_buffer_number_to_merge = 4;
+ options.compression = kNoCompression;
+ options.create_if_missing = true;
+ options = CurrentOptions(options);
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+
+ std::vector<Iterator*> iters;
+
+ uint64_t active_mem;
+ uint64_t unflushed_mem;
+ uint64_t all_mem;
+ uint64_t prev_all_mem;
+
+  // Phase 0. Verify that the initial values of all these properties are the
+  // same, as we have no mem-tables.
+ dbfull()->GetIntProperty("rocksdb.cur-size-active-mem-table", &active_mem);
+ dbfull()->GetIntProperty("rocksdb.cur-size-all-mem-tables", &unflushed_mem);
+ dbfull()->GetIntProperty("rocksdb.size-all-mem-tables", &all_mem);
+ ASSERT_EQ(all_mem, active_mem);
+ ASSERT_EQ(all_mem, unflushed_mem);
+
+  // Phase 1. Simply issue Put() and expect "cur-size-all-mem-tables" to equal
+  // "size-all-mem-tables".
+ for (int r = 0; r < kNumRounds; ++r) {
+ for (int f = 0; f < kFlushesPerRound; ++f) {
+ for (int w = 0; w < kWritesPerFlush; ++w) {
+ ASSERT_OK(
+ Put(rnd.RandomString(kKeySize), rnd.RandomString(kValueSize)));
+ }
+ }
+ // Make sure that there is no flush between getting the two properties.
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ dbfull()->GetIntProperty("rocksdb.cur-size-all-mem-tables", &unflushed_mem);
+ dbfull()->GetIntProperty("rocksdb.size-all-mem-tables", &all_mem);
+    // With no iterators, these two numbers should be the same.
+ ASSERT_EQ(unflushed_mem, all_mem);
+ }
+ prev_all_mem = all_mem;
+
+ // Phase 2. Keep issuing Put() but also create new iterators. This time we
+ // expect "size-all-mem-tables" > "cur-size-all-mem-tables".
+ for (int r = 0; r < kNumRounds; ++r) {
+ iters.push_back(db_->NewIterator(ReadOptions()));
+ for (int f = 0; f < kFlushesPerRound; ++f) {
+ for (int w = 0; w < kWritesPerFlush; ++w) {
+ ASSERT_OK(
+ Put(rnd.RandomString(kKeySize), rnd.RandomString(kValueSize)));
+ }
+ }
+    // Force a flush to prevent one from happening between getting the
+    // properties, or after getting the properties and before the new round.
+ ASSERT_OK(Flush());
+
+ // In the second round, add iterators.
+ dbfull()->GetIntProperty("rocksdb.cur-size-active-mem-table", &active_mem);
+ dbfull()->GetIntProperty("rocksdb.cur-size-all-mem-tables", &unflushed_mem);
+ dbfull()->GetIntProperty("rocksdb.size-all-mem-tables", &all_mem);
+ ASSERT_GT(all_mem, active_mem);
+ ASSERT_GT(all_mem, unflushed_mem);
+ ASSERT_GT(all_mem, prev_all_mem);
+ prev_all_mem = all_mem;
+ }
+
+ // Phase 3. Delete iterators and expect "size-all-mem-tables" shrinks
+ // whenever we release an iterator.
+ for (auto* iter : iters) {
+ ASSERT_OK(iter->status());
+ delete iter;
+ dbfull()->GetIntProperty("rocksdb.size-all-mem-tables", &all_mem);
+    // Expect the size to shrink.
+ ASSERT_LT(all_mem, prev_all_mem);
+ prev_all_mem = all_mem;
+ }
+
+  // Expect all three of these counters to be the same.
+ dbfull()->GetIntProperty("rocksdb.cur-size-active-mem-table", &active_mem);
+ dbfull()->GetIntProperty("rocksdb.cur-size-all-mem-tables", &unflushed_mem);
+ dbfull()->GetIntProperty("rocksdb.size-all-mem-tables", &all_mem);
+ ASSERT_EQ(active_mem, unflushed_mem);
+ ASSERT_EQ(unflushed_mem, all_mem);
+
+  // Phase 5. Reopen, and expect all three of these counters to be the same
+  // again.
+ Reopen(options);
+ dbfull()->GetIntProperty("rocksdb.cur-size-active-mem-table", &active_mem);
+ dbfull()->GetIntProperty("rocksdb.cur-size-all-mem-tables", &unflushed_mem);
+ dbfull()->GetIntProperty("rocksdb.size-all-mem-tables", &all_mem);
+ ASSERT_EQ(active_mem, unflushed_mem);
+ ASSERT_EQ(unflushed_mem, all_mem);
+}
+
+TEST_F(DBPropertiesTest, EstimatePendingCompBytes) {
+ // Set sizes to both background thread pool to be 1 and block them.
+ env_->SetBackgroundThreads(1, Env::HIGH);
+ env_->SetBackgroundThreads(1, Env::LOW);
+ test::SleepingBackgroundTask sleeping_task_low;
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+ Env::Priority::LOW);
+
+ Options options = CurrentOptions();
+ WriteOptions writeOpt = WriteOptions();
+ writeOpt.disableWAL = true;
+ options.compaction_style = kCompactionStyleLevel;
+ options.level0_file_num_compaction_trigger = 2;
+ options.max_background_compactions = 1;
+ options.max_background_flushes = 1;
+ options.max_write_buffer_number = 10;
+ options.min_write_buffer_number_to_merge = 1;
+ options.max_write_buffer_size_to_maintain = 0;
+ options.write_buffer_size = 1000000;
+ Reopen(options);
+
+ std::string big_value(1000000 * 2, 'x');
+ std::string num;
+ uint64_t int_num;
+
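+  // level0_file_num_compaction_trigger is 2 and the LOW-priority thread is
+  // blocked, so the pending compaction bytes stay at 0 with a single L0 file
+  // and turn positive once more files accumulate.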
+ ASSERT_OK(dbfull()->Put(writeOpt, "k1", big_value));
+ ASSERT_OK(Flush());
+ ASSERT_TRUE(dbfull()->GetIntProperty(
+ "rocksdb.estimate-pending-compaction-bytes", &int_num));
+ ASSERT_EQ(int_num, 0U);
+
+ ASSERT_OK(dbfull()->Put(writeOpt, "k2", big_value));
+ ASSERT_OK(Flush());
+ ASSERT_TRUE(dbfull()->GetIntProperty(
+ "rocksdb.estimate-pending-compaction-bytes", &int_num));
+ ASSERT_GT(int_num, 0U);
+
+ ASSERT_OK(dbfull()->Put(writeOpt, "k3", big_value));
+ ASSERT_OK(Flush());
+ ASSERT_TRUE(dbfull()->GetIntProperty(
+ "rocksdb.estimate-pending-compaction-bytes", &int_num));
+ ASSERT_GT(int_num, 0U);
+
+ sleeping_task_low.WakeUp();
+ sleeping_task_low.WaitUntilDone();
+
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_TRUE(dbfull()->GetIntProperty(
+ "rocksdb.estimate-pending-compaction-bytes", &int_num));
+ ASSERT_EQ(int_num, 0U);
+}
+
+TEST_F(DBPropertiesTest, EstimateCompressionRatio) {
+ if (!Snappy_Supported()) {
+ return;
+ }
+ const int kNumL0Files = 3;
+ const int kNumEntriesPerFile = 1000;
+
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.num_levels = 3;
+ Reopen(options);
+
+ ASSERT_OK(db_->SetOptions(
+ {{"compression_per_level", "kNoCompression:kSnappyCompression"}}));
+ auto opts = db_->GetOptions();
+ ASSERT_EQ(opts.compression_per_level.size(), 2);
+ ASSERT_EQ(opts.compression_per_level[0], kNoCompression);
+ ASSERT_EQ(opts.compression_per_level[1], kSnappyCompression);
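+  // Levels beyond the listed entries reuse the last entry, so L0 stays
+  // uncompressed while L1 and above use Snappy.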
+
+  // Compression ratio is -1.0 when there are no open files at the level.
+ ASSERT_EQ(CompressionRatioAtLevel(0), -1.0);
+
+ const std::string kVal(100, 'a');
+ for (int i = 0; i < kNumL0Files; ++i) {
+ for (int j = 0; j < kNumEntriesPerFile; ++j) {
+ // Put common data ("key") at end to prevent delta encoding from
+ // compressing the key effectively
+ std::string key = std::to_string(i) + std::to_string(j) + "key";
+ ASSERT_OK(dbfull()->Put(WriteOptions(), key, kVal));
+ }
+ ASSERT_OK(Flush());
+ }
+
+ // no compression at L0, so ratio is less than one
+ ASSERT_LT(CompressionRatioAtLevel(0), 1.0);
+ ASSERT_GT(CompressionRatioAtLevel(0), 0.0);
+ ASSERT_EQ(CompressionRatioAtLevel(1), -1.0);
+
+ ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr));
+
+ ASSERT_EQ(CompressionRatioAtLevel(0), -1.0);
+ // Data at L1 should be highly compressed thanks to Snappy and redundant data
+ // in values (ratio is 12.846 as of 4/19/2016).
+ ASSERT_GT(CompressionRatioAtLevel(1), 10.0);
+}
+
+#endif // ROCKSDB_LITE
+
+class CountingUserTblPropCollector : public TablePropertiesCollector {
+ public:
+ const char* Name() const override { return "CountingUserTblPropCollector"; }
+
+ Status Finish(UserCollectedProperties* properties) override {
+ std::string encoded;
+ PutVarint32(&encoded, count_);
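+    // The count is varint-encoded; a consumer decodes it with GetVarint32().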
+ *properties = UserCollectedProperties{
+ {"CountingUserTblPropCollector", message_},
+ {"Count", encoded},
+ };
+ return Status::OK();
+ }
+
+ Status AddUserKey(const Slice& /*user_key*/, const Slice& /*value*/,
+ EntryType /*type*/, SequenceNumber /*seq*/,
+ uint64_t /*file_size*/) override {
+ ++count_;
+ return Status::OK();
+ }
+
+ UserCollectedProperties GetReadableProperties() const override {
+ return UserCollectedProperties{};
+ }
+
+ private:
+ std::string message_ = "Rocksdb";
+ uint32_t count_ = 0;
+};
+
+class CountingUserTblPropCollectorFactory
+ : public TablePropertiesCollectorFactory {
+ public:
+ explicit CountingUserTblPropCollectorFactory(
+ uint32_t expected_column_family_id)
+ : expected_column_family_id_(expected_column_family_id),
+ num_created_(0) {}
+ TablePropertiesCollector* CreateTablePropertiesCollector(
+ TablePropertiesCollectorFactory::Context context) override {
+ EXPECT_EQ(expected_column_family_id_, context.column_family_id);
+ num_created_++;
+ return new CountingUserTblPropCollector();
+ }
+ const char* Name() const override {
+ return "CountingUserTblPropCollectorFactory";
+ }
+ void set_expected_column_family_id(uint32_t v) {
+ expected_column_family_id_ = v;
+ }
+ uint32_t expected_column_family_id_;
+ uint32_t num_created_;
+};
+
+class CountingDeleteTabPropCollector : public TablePropertiesCollector {
+ public:
+ const char* Name() const override { return "CountingDeleteTabPropCollector"; }
+
+ Status AddUserKey(const Slice& /*user_key*/, const Slice& /*value*/,
+ EntryType type, SequenceNumber /*seq*/,
+ uint64_t /*file_size*/) override {
+ if (type == kEntryDelete) {
+ num_deletes_++;
+ }
+ return Status::OK();
+ }
+
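+ // Mark the file as needing compaction once more than ten deletions have
+ // been seen; the need-compaction tests below rely on this hint.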
+ bool NeedCompact() const override { return num_deletes_ > 10; }
+
+ UserCollectedProperties GetReadableProperties() const override {
+ return UserCollectedProperties{};
+ }
+
+ Status Finish(UserCollectedProperties* properties) override {
+ *properties =
+ UserCollectedProperties{{"num_delete", std::to_string(num_deletes_)}};
+ return Status::OK();
+ }
+
+ private:
+ uint32_t num_deletes_ = 0;
+};
+
+class CountingDeleteTabPropCollectorFactory
+ : public TablePropertiesCollectorFactory {
+ public:
+ TablePropertiesCollector* CreateTablePropertiesCollector(
+ TablePropertiesCollectorFactory::Context /*context*/) override {
+ return new CountingDeleteTabPropCollector();
+ }
+ const char* Name() const override {
+ return "CountingDeleteTabPropCollectorFactory";
+ }
+};
+
+class BlockCountingTablePropertiesCollector : public TablePropertiesCollector {
+ public:
+ static const std::string kNumSampledBlocksPropertyName;
+
+ const char* Name() const override {
+ return "BlockCountingTablePropertiesCollector";
+ }
+
+ Status Finish(UserCollectedProperties* properties) override {
+ (*properties)[kNumSampledBlocksPropertyName] =
+ std::to_string(num_sampled_blocks_);
+ return Status::OK();
+ }
+
+ Status AddUserKey(const Slice& /*user_key*/, const Slice& /*value*/,
+ EntryType /*type*/, SequenceNumber /*seq*/,
+ uint64_t /*file_size*/) override {
+ return Status::OK();
+ }
+
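+ // A nonzero compressed-size estimate is only reported for blocks that were
+ // sampled for compression, so this counter tracks the number of sampled
+ // blocks.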
+ void BlockAdd(uint64_t /* block_uncomp_bytes */,
+ uint64_t block_compressed_bytes_fast,
+ uint64_t block_compressed_bytes_slow) override {
+ if (block_compressed_bytes_fast > 0 || block_compressed_bytes_slow > 0) {
+ num_sampled_blocks_++;
+ }
+ }
+
+ UserCollectedProperties GetReadableProperties() const override {
+ return UserCollectedProperties{
+ {kNumSampledBlocksPropertyName, std::to_string(num_sampled_blocks_)},
+ };
+ }
+
+ private:
+ uint32_t num_sampled_blocks_ = 0;
+};
+
+const std::string
+ BlockCountingTablePropertiesCollector::kNumSampledBlocksPropertyName =
+ "NumSampledBlocks";
+
+class BlockCountingTablePropertiesCollectorFactory
+ : public TablePropertiesCollectorFactory {
+ public:
+ const char* Name() const override {
+ return "BlockCountingTablePropertiesCollectorFactory";
+ }
+
+ TablePropertiesCollector* CreateTablePropertiesCollector(
+ TablePropertiesCollectorFactory::Context /* context */) override {
+ return new BlockCountingTablePropertiesCollector();
+ }
+};
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBPropertiesTest, GetUserDefinedTableProperties) {
+ Options options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = (1 << 30);
+ options.table_properties_collector_factories.resize(1);
+ std::shared_ptr<CountingUserTblPropCollectorFactory> collector_factory =
+ std::make_shared<CountingUserTblPropCollectorFactory>(0);
+ options.table_properties_collector_factories[0] = collector_factory;
+ Reopen(options);
+ // Create 4 tables
+ for (int table = 0; table < 4; ++table) {
+ for (int i = 0; i < 10 + table; ++i) {
+ ASSERT_OK(
+ db_->Put(WriteOptions(), std::to_string(table * 100 + i), "val"));
+ }
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ }
+
+ TablePropertiesCollection props;
+ ASSERT_OK(db_->GetPropertiesOfAllTables(&props));
+ ASSERT_EQ(4U, props.size());
+ uint32_t sum = 0;
+ for (const auto& item : props) {
+ auto& user_collected = item.second->user_collected_properties;
+ ASSERT_TRUE(user_collected.find("CountingUserTblPropCollector") !=
+ user_collected.end());
+ ASSERT_EQ(user_collected.at("CountingUserTblPropCollector"), "Rocksdb");
+ ASSERT_TRUE(user_collected.find("Count") != user_collected.end());
+ Slice key(user_collected.at("Count"));
+ uint32_t count;
+ ASSERT_TRUE(GetVarint32(&key, &count));
+ sum += count;
+ }
+ ASSERT_EQ(10u + 11u + 12u + 13u, sum);
+
+ ASSERT_GT(collector_factory->num_created_, 0U);
+ collector_factory->num_created_ = 0;
+ ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr));
+ ASSERT_GT(collector_factory->num_created_, 0U);
+}
+#endif // ROCKSDB_LITE
+
+TEST_F(DBPropertiesTest, UserDefinedTablePropertiesContext) {
+ Options options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = 3;
+ options.table_properties_collector_factories.resize(1);
+ std::shared_ptr<CountingUserTblPropCollectorFactory> collector_factory =
+ std::make_shared<CountingUserTblPropCollectorFactory>(1);
+ options.table_properties_collector_factories[0] = collector_factory;
+ CreateAndReopenWithCF({"pikachu"}, options);
+ // Create 2 files
+ for (int table = 0; table < 2; ++table) {
+ for (int i = 0; i < 10 + table; ++i) {
+ ASSERT_OK(Put(1, std::to_string(table * 100 + i), "val"));
+ }
+ ASSERT_OK(Flush(1));
+ }
+ ASSERT_GT(collector_factory->num_created_, 0U);
+
+ collector_factory->num_created_ = 0;
+ // Trigger automatic compactions.
+ for (int table = 0; table < 3; ++table) {
+ for (int i = 0; i < 10 + table; ++i) {
+ ASSERT_OK(Put(1, std::to_string(table * 100 + i), "val"));
+ }
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+ ASSERT_GT(collector_factory->num_created_, 0U);
+
+ collector_factory->num_created_ = 0;
+ ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]));
+ ASSERT_GT(collector_factory->num_created_, 0U);
+
+ // Come back to write to default column family
+ collector_factory->num_created_ = 0;
+ collector_factory->set_expected_column_family_id(0); // default CF
+ // Create 2 tables in the default column family
+ for (int table = 0; table < 2; ++table) {
+ for (int i = 0; i < 10 + table; ++i) {
+ ASSERT_OK(Put(std::to_string(table * 100 + i), "val"));
+ }
+ ASSERT_OK(Flush());
+ }
+ ASSERT_GT(collector_factory->num_created_, 0U);
+
+ collector_factory->num_created_ = 0;
+ // Trigger automatic compactions.
+ for (int table = 0; table < 3; ++table) {
+ for (int i = 0; i < 10 + table; ++i) {
+ ASSERT_OK(Put(std::to_string(table * 100 + i), "val"));
+ }
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+ ASSERT_GT(collector_factory->num_created_, 0U);
+
+ collector_factory->num_created_ = 0;
+ ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr));
+ ASSERT_GT(collector_factory->num_created_, 0U);
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBPropertiesTest, TablePropertiesNeedCompactTest) {
+ Random rnd(301);
+
+ Options options;
+ options.create_if_missing = true;
+ options.write_buffer_size = 4096;
+ options.max_write_buffer_number = 8;
+ options.level0_file_num_compaction_trigger = 2;
+ options.level0_slowdown_writes_trigger = 2;
+ options.level0_stop_writes_trigger = 4;
+ options.target_file_size_base = 2048;
+ options.max_bytes_for_level_base = 10240;
+ options.max_bytes_for_level_multiplier = 4;
+ options.soft_pending_compaction_bytes_limit = 1024 * 1024;
+ options.num_levels = 8;
+ options.env = env_;
+
+ std::shared_ptr<TablePropertiesCollectorFactory> collector_factory =
+ std::make_shared<CountingDeleteTabPropCollectorFactory>();
+ options.table_properties_collector_factories.resize(1);
+ options.table_properties_collector_factories[0] = collector_factory;
+
+ DestroyAndReopen(options);
+
+ const int kMaxKey = 1000;
+ for (int i = 0; i < kMaxKey; i++) {
+ ASSERT_OK(Put(Key(i), rnd.RandomString(102)));
+ ASSERT_OK(Put(Key(kMaxKey + i), rnd.RandomString(102)));
+ }
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ if (NumTableFilesAtLevel(0) == 1) {
+ // Clear Level 0 so that when we later flush a file with deletions,
+ // we don't trigger an organic compaction.
+ ASSERT_OK(Put(Key(0), ""));
+ ASSERT_OK(Put(Key(kMaxKey * 2), ""));
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+ ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+
+ {
+ int c = 0;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions()));
+ iter->Seek(Key(kMaxKey - 100));
+ while (iter->Valid() && iter->key().compare(Key(kMaxKey + 100)) < 0) {
+ iter->Next();
+ ++c;
+ }
+ ASSERT_OK(iter->status());
+ ASSERT_EQ(c, 200);
+ }
+
+ ASSERT_OK(Delete(Key(0)));
+ for (int i = kMaxKey - 100; i < kMaxKey + 100; i++) {
+ ASSERT_OK(Delete(Key(i)));
+ }
+ ASSERT_OK(Delete(Key(kMaxKey * 2)));
+
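+ // Over 200 deletions were just written, far above the collector's
+ // threshold of 10, so the flushed file will be marked as needing compaction
+ // and the wait below should observe that compaction.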
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ {
+ SetPerfLevel(kEnableCount);
+ get_perf_context()->Reset();
+ int c = 0;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions()));
+ iter->Seek(Key(kMaxKey - 100));
+ while (iter->Valid() && iter->key().compare(Key(kMaxKey + 100)) < 0) {
+ iter->Next();
+ ++c;
+ }
+ ASSERT_OK(iter->status());
+ ASSERT_EQ(c, 0);
+ ASSERT_LT(get_perf_context()->internal_delete_skipped_count, 30u);
+ ASSERT_LT(get_perf_context()->internal_key_skipped_count, 30u);
+ SetPerfLevel(kDisable);
+ }
+}
+
+TEST_F(DBPropertiesTest, NeedCompactHintPersistentTest) {
+ Random rnd(301);
+
+ Options options;
+ options.create_if_missing = true;
+ options.max_write_buffer_number = 8;
+ options.level0_file_num_compaction_trigger = 10;
+ options.level0_slowdown_writes_trigger = 10;
+ options.level0_stop_writes_trigger = 10;
+ options.disable_auto_compactions = true;
+ options.env = env_;
+
+ std::shared_ptr<TablePropertiesCollectorFactory> collector_factory =
+ std::make_shared<CountingDeleteTabPropCollectorFactory>();
+ options.table_properties_collector_factories.resize(1);
+ options.table_properties_collector_factories[0] = collector_factory;
+
+ DestroyAndReopen(options);
+
+ const int kMaxKey = 100;
+ for (int i = 0; i < kMaxKey; i++) {
+ ASSERT_OK(Put(Key(i), ""));
+ }
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+
+ for (int i = 1; i < kMaxKey - 1; i++) {
+ ASSERT_OK(Delete(Key(i)));
+ }
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_EQ(NumTableFilesAtLevel(0), 2);
+
+ // Restart the DB. Although the number of files didn't reach
+ // options.level0_file_num_compaction_trigger, compaction should
+ // still be triggered because of the need-compaction hint.
+ options.disable_auto_compactions = false;
+ Reopen(options);
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+ {
+ SetPerfLevel(kEnableCount);
+ get_perf_context()->Reset();
+ int c = 0;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions()));
+ for (iter->Seek(Key(0)); iter->Valid(); iter->Next()) {
+ c++;
+ }
+ ASSERT_OK(iter->status());
+ ASSERT_EQ(c, 2);
+ ASSERT_EQ(get_perf_context()->internal_delete_skipped_count, 0);
+ // We iterate every key twice. Is it a bug?
+ ASSERT_LE(get_perf_context()->internal_key_skipped_count, 2);
+ SetPerfLevel(kDisable);
+ }
+}
+
+// Excluded from RocksDB lite tests due to `GetPropertiesOfAllTables()` usage.
+TEST_F(DBPropertiesTest, BlockAddForCompressionSampling) {
+ // Sampled compression requires at least one of the following four types.
+ if (!Snappy_Supported() && !Zlib_Supported() && !LZ4_Supported() &&
+ !ZSTD_Supported()) {
+ return;
+ }
+
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.table_properties_collector_factories.emplace_back(
+ std::make_shared<BlockCountingTablePropertiesCollectorFactory>());
+
+ for (bool sample_for_compression : {false, true}) {
+ // For simplicity/determinism, sample 100% when enabled, or 0% when disabled
+ options.sample_for_compression = sample_for_compression ? 1 : 0;
+
+ DestroyAndReopen(options);
+
+ // Setup the following LSM:
+ //
+ // L0_0 ["a", "b"]
+ // L1_0 ["a", "b"]
+ //
+ // L0_0 was created by flush. L1_0 was created by compaction. Each file
+ // contains one data block.
+ for (int i = 0; i < 3; ++i) {
+ ASSERT_OK(Put("a", "val"));
+ ASSERT_OK(Put("b", "val"));
+ ASSERT_OK(Flush());
+ if (i == 1) {
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ }
+ }
+
+ // A `BlockAdd()` should have been seen for files generated by flush or
+ // compaction when `sample_for_compression` is enabled.
+ TablePropertiesCollection file_to_props;
+ ASSERT_OK(db_->GetPropertiesOfAllTables(&file_to_props));
+ ASSERT_EQ(2, file_to_props.size());
+ for (const auto& file_and_props : file_to_props) {
+ auto& user_props = file_and_props.second->user_collected_properties;
+ ASSERT_TRUE(user_props.find(BlockCountingTablePropertiesCollector::
+ kNumSampledBlocksPropertyName) !=
+ user_props.end());
+ ASSERT_EQ(user_props.at(BlockCountingTablePropertiesCollector::
+ kNumSampledBlocksPropertyName),
+ std::to_string(sample_for_compression ? 1 : 0));
+ }
+ }
+}
+
+class CompressionSamplingDBPropertiesTest
+ : public DBPropertiesTest,
+ public ::testing::WithParamInterface<bool> {
+ public:
+ CompressionSamplingDBPropertiesTest() : fast_(GetParam()) {}
+
+ protected:
+ const bool fast_;
+};
+
+INSTANTIATE_TEST_CASE_P(CompressionSamplingDBPropertiesTest,
+ CompressionSamplingDBPropertiesTest, ::testing::Bool());
+
+// Excluded from RocksDB lite tests due to `GetPropertiesOfAllTables()` usage.
+TEST_P(CompressionSamplingDBPropertiesTest,
+ EstimateDataSizeWithCompressionSampling) {
+ Options options = CurrentOptions();
+ if (fast_) {
+ // One of the following light compression libraries must be present.
+ if (LZ4_Supported()) {
+ options.compression = kLZ4Compression;
+ } else if (Snappy_Supported()) {
+ options.compression = kSnappyCompression;
+ } else {
+ return;
+ }
+ } else {
+ // One of the following heavy compression libraries must be present.
+ if (ZSTD_Supported()) {
+ options.compression = kZSTD;
+ } else if (Zlib_Supported()) {
+ options.compression = kZlibCompression;
+ } else {
+ return;
+ }
+ }
+ options.disable_auto_compactions = true;
+ // For simplicity/determinism, sample 100%.
+ options.sample_for_compression = 1;
+ Reopen(options);
+
+ // Setup the following LSM:
+ //
+ // L0_0 ["a", "b"]
+ // L1_0 ["a", "b"]
+ //
+ // L0_0 was created by flush. L1_0 was created by compaction. Each file
+ // contains one data block. The value consists of compressible data so the
+ // data block should be stored compressed.
+ std::string val(1024, 'a');
+ for (int i = 0; i < 3; ++i) {
+ ASSERT_OK(Put("a", val));
+ ASSERT_OK(Put("b", val));
+ ASSERT_OK(Flush());
+ if (i == 1) {
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ }
+ }
+
+ TablePropertiesCollection file_to_props;
+ ASSERT_OK(db_->GetPropertiesOfAllTables(&file_to_props));
+ ASSERT_EQ(2, file_to_props.size());
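+ // With 100% sampling, the estimated data size for the configured compression
+ // speed class should match the file's actual data size exactly.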
+ for (const auto& file_and_props : file_to_props) {
+ ASSERT_GT(file_and_props.second->data_size, 0);
+ if (fast_) {
+ ASSERT_EQ(file_and_props.second->data_size,
+ file_and_props.second->fast_compression_estimated_data_size);
+ } else {
+ ASSERT_EQ(file_and_props.second->data_size,
+ file_and_props.second->slow_compression_estimated_data_size);
+ }
+ }
+}
+
+TEST_F(DBPropertiesTest, EstimateNumKeysUnderflow) {
+ Options options = CurrentOptions();
+ Reopen(options);
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Delete("foo"));
+ ASSERT_OK(Delete("foo"));
+ uint64_t num_keys = 0;
+ ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.estimate-num-keys", &num_keys));
+ ASSERT_EQ(0, num_keys);
+}
+
+TEST_F(DBPropertiesTest, EstimateOldestKeyTime) {
+ uint64_t oldest_key_time = 0;
+ Options options = CurrentOptions();
+ SetTimeElapseOnlySleepOnReopen(&options);
+
+ // "rocksdb.estimate-oldest-key-time" only available to fifo compaction.
+ for (auto compaction : {kCompactionStyleLevel, kCompactionStyleUniversal,
+ kCompactionStyleNone}) {
+ options.compaction_style = compaction;
+ options.create_if_missing = true;
+ DestroyAndReopen(options);
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_FALSE(dbfull()->GetIntProperty(
+ DB::Properties::kEstimateOldestKeyTime, &oldest_key_time));
+ }
+
+ int64_t mock_start_time;
+ ASSERT_OK(env_->GetCurrentTime(&mock_start_time));
+
+ options.compaction_style = kCompactionStyleFIFO;
+ options.ttl = 300;
+ options.max_open_files = -1;
+ options.compaction_options_fifo.allow_compaction = false;
+ DestroyAndReopen(options);
+
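+ // Under FIFO compaction the property tracks the oldest remaining data, so it
+ // stays at the time of the first Put until older files are dropped by the
+ // compactions below.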
+ env_->MockSleepForSeconds(100);
+ ASSERT_OK(Put("k1", "v1"));
+ ASSERT_TRUE(dbfull()->GetIntProperty(DB::Properties::kEstimateOldestKeyTime,
+ &oldest_key_time));
+ ASSERT_EQ(100, oldest_key_time - mock_start_time);
+ ASSERT_OK(Flush());
+ ASSERT_EQ("1", FilesPerLevel());
+ ASSERT_TRUE(dbfull()->GetIntProperty(DB::Properties::kEstimateOldestKeyTime,
+ &oldest_key_time));
+ ASSERT_EQ(100, oldest_key_time - mock_start_time);
+
+ env_->MockSleepForSeconds(100); // -> 200
+ ASSERT_OK(Put("k2", "v2"));
+ ASSERT_OK(Flush());
+ ASSERT_EQ("2", FilesPerLevel());
+ ASSERT_TRUE(dbfull()->GetIntProperty(DB::Properties::kEstimateOldestKeyTime,
+ &oldest_key_time));
+ ASSERT_EQ(100, oldest_key_time - mock_start_time);
+
+ env_->MockSleepForSeconds(100); // -> 300
+ ASSERT_OK(Put("k3", "v3"));
+ ASSERT_OK(Flush());
+ ASSERT_EQ("3", FilesPerLevel());
+ ASSERT_TRUE(dbfull()->GetIntProperty(DB::Properties::kEstimateOldestKeyTime,
+ &oldest_key_time));
+ ASSERT_EQ(100, oldest_key_time - mock_start_time);
+
+ env_->MockSleepForSeconds(150); // -> 450
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ("2", FilesPerLevel());
+ ASSERT_TRUE(dbfull()->GetIntProperty(DB::Properties::kEstimateOldestKeyTime,
+ &oldest_key_time));
+ ASSERT_EQ(200, oldest_key_time - mock_start_time);
+
+ env_->MockSleepForSeconds(100); // -> 550
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ("1", FilesPerLevel());
+ ASSERT_TRUE(dbfull()->GetIntProperty(DB::Properties::kEstimateOldestKeyTime,
+ &oldest_key_time));
+ ASSERT_EQ(300, oldest_key_time - mock_start_time);
+
+ env_->MockSleepForSeconds(100); // -> 650
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ("", FilesPerLevel());
+ ASSERT_FALSE(dbfull()->GetIntProperty(DB::Properties::kEstimateOldestKeyTime,
+ &oldest_key_time));
+}
+
+TEST_F(DBPropertiesTest, SstFilesSize) {
+ struct TestListener : public EventListener {
+ void OnCompactionCompleted(DB* db,
+ const CompactionJobInfo& /*info*/) override {
+ assert(callback_triggered == false);
+ assert(size_before_compaction > 0);
+ callback_triggered = true;
+ uint64_t total_sst_size = 0;
+ uint64_t live_sst_size = 0;
+ bool ok = db->GetIntProperty(DB::Properties::kTotalSstFilesSize,
+ &total_sst_size);
+ ASSERT_TRUE(ok);
+ // total_sst_size includes files both before and after compaction.
+ ASSERT_GT(total_sst_size, size_before_compaction);
+ ok =
+ db->GetIntProperty(DB::Properties::kLiveSstFilesSize, &live_sst_size);
+ ASSERT_TRUE(ok);
+ // live_sst_size only includes files remaining after compaction.
+ ASSERT_GT(live_sst_size, 0);
+ ASSERT_LT(live_sst_size, size_before_compaction);
+ }
+
+ uint64_t size_before_compaction = 0;
+ bool callback_triggered = false;
+ };
+ std::shared_ptr<TestListener> listener = std::make_shared<TestListener>();
+
+ Options options;
+ options.env = CurrentOptions().env;
+ options.disable_auto_compactions = true;
+ options.listeners.push_back(listener);
+ Reopen(options);
+
+ for (int i = 0; i < 10; i++) {
+ ASSERT_OK(Put("key" + std::to_string(i), std::string(1000, 'v')));
+ }
+ ASSERT_OK(Flush());
+ for (int i = 0; i < 5; i++) {
+ ASSERT_OK(Delete("key" + std::to_string(i)));
+ }
+ ASSERT_OK(Flush());
+ uint64_t sst_size;
+ bool ok = db_->GetIntProperty(DB::Properties::kTotalSstFilesSize, &sst_size);
+ ASSERT_TRUE(ok);
+ ASSERT_GT(sst_size, 0);
+ listener->size_before_compaction = sst_size;
+ // Compact to clean all keys and trigger listener.
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_TRUE(listener->callback_triggered);
+}
+
+TEST_F(DBPropertiesTest, MinObsoleteSstNumberToKeep) {
+ class TestListener : public EventListener {
+ public:
+ void OnTableFileCreated(const TableFileCreationInfo& info) override {
+ if (info.reason == TableFileCreationReason::kCompaction) {
+ // Verify the property indicates that SSTs created by a running
+ // compaction cannot be deleted.
+ uint64_t created_file_num;
+ FileType created_file_type;
+ std::string filename =
+ info.file_path.substr(info.file_path.rfind('/') + 1);
+ ASSERT_TRUE(
+ ParseFileName(filename, &created_file_num, &created_file_type));
+ ASSERT_EQ(kTableFile, created_file_type);
+
+ uint64_t keep_sst_lower_bound;
+ ASSERT_TRUE(
+ db_->GetIntProperty(DB::Properties::kMinObsoleteSstNumberToKeep,
+ &keep_sst_lower_bound));
+
+ ASSERT_LE(keep_sst_lower_bound, created_file_num);
+ validated_ = true;
+ }
+ }
+
+ void SetDB(DB* db) { db_ = db; }
+
+ int GetNumCompactions() { return num_compactions_; }
+
+ // True if we've verified the property for at least one output file
+ bool Validated() { return validated_; }
+
+ private:
+ int num_compactions_ = 0;
+ bool validated_ = false;
+ DB* db_ = nullptr;
+ };
+
+ const int kNumL0Files = 4;
+
+ std::shared_ptr<TestListener> listener = std::make_shared<TestListener>();
+
+ Options options = CurrentOptions();
+ options.listeners.push_back(listener);
+ options.level0_file_num_compaction_trigger = kNumL0Files;
+ DestroyAndReopen(options);
+ listener->SetDB(db_);
+
+ for (int i = 0; i < kNumL0Files; ++i) {
+ // Make sure they overlap in keyspace to prevent trivial move
+ ASSERT_OK(Put("key1", "val"));
+ ASSERT_OK(Put("key2", "val"));
+ ASSERT_OK(Flush());
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_TRUE(listener->Validated());
+}
+
+TEST_F(DBPropertiesTest, BlobCacheProperties) {
+ Options options;
+ uint64_t value;
+
+ options.env = CurrentOptions().env;
+
+ // Test with empty blob cache.
+ constexpr size_t kCapacity = 100;
+ LRUCacheOptions co;
+ co.capacity = kCapacity;
+ co.num_shard_bits = 0;
+ co.metadata_charge_policy = kDontChargeCacheMetadata;
+ auto blob_cache = NewLRUCache(co);
+ options.blob_cache = blob_cache;
+
+ Reopen(options);
+
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlobCacheCapacity, &value));
+ ASSERT_EQ(kCapacity, value);
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlobCacheUsage, &value));
+ ASSERT_EQ(0, value);
+ ASSERT_TRUE(
+ db_->GetIntProperty(DB::Properties::kBlobCachePinnedUsage, &value));
+ ASSERT_EQ(0, value);
+
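+ // The properties below read directly from the configured blob cache, so
+ // entries inserted into blob_cache by hand are reflected immediately.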
+ // Insert unpinned blob to the cache and check size.
+ constexpr size_t kSize1 = 70;
+ ASSERT_OK(blob_cache->Insert("blob1", nullptr /*value*/, kSize1,
+ nullptr /*deleter*/));
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlobCacheCapacity, &value));
+ ASSERT_EQ(kCapacity, value);
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlobCacheUsage, &value));
+ ASSERT_EQ(kSize1, value);
+ ASSERT_TRUE(
+ db_->GetIntProperty(DB::Properties::kBlobCachePinnedUsage, &value));
+ ASSERT_EQ(0, value);
+
+ // Insert pinned blob to the cache and check size.
+ constexpr size_t kSize2 = 60;
+ Cache::Handle* blob2 = nullptr;
+ ASSERT_OK(blob_cache->Insert("blob2", nullptr /*value*/, kSize2,
+ nullptr /*deleter*/, &blob2));
+ ASSERT_NE(nullptr, blob2);
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlobCacheCapacity, &value));
+ ASSERT_EQ(kCapacity, value);
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlobCacheUsage, &value));
+ // blob1 is evicted.
+ ASSERT_EQ(kSize2, value);
+ ASSERT_TRUE(
+ db_->GetIntProperty(DB::Properties::kBlobCachePinnedUsage, &value));
+ ASSERT_EQ(kSize2, value);
+
+ // Insert another pinned blob to make the cache over-sized.
+ constexpr size_t kSize3 = 80;
+ Cache::Handle* blob3 = nullptr;
+ ASSERT_OK(blob_cache->Insert("blob3", nullptr /*value*/, kSize3,
+ nullptr /*deleter*/, &blob3));
+ ASSERT_NE(nullptr, blob3);
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlobCacheCapacity, &value));
+ ASSERT_EQ(kCapacity, value);
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlobCacheUsage, &value));
+ ASSERT_EQ(kSize2 + kSize3, value);
+ ASSERT_TRUE(
+ db_->GetIntProperty(DB::Properties::kBlobCachePinnedUsage, &value));
+ ASSERT_EQ(kSize2 + kSize3, value);
+
+ // Check size after release.
+ blob_cache->Release(blob2);
+ blob_cache->Release(blob3);
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlobCacheCapacity, &value));
+ ASSERT_EQ(kCapacity, value);
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlobCacheUsage, &value));
+ // blob2 will be evicted, while blob3 remains in the cache after release.
+ ASSERT_EQ(kSize3, value);
+ ASSERT_TRUE(
+ db_->GetIntProperty(DB::Properties::kBlobCachePinnedUsage, &value));
+ ASSERT_EQ(0, value);
+}
+
+TEST_F(DBPropertiesTest, BlockCacheProperties) {
+ Options options;
+ uint64_t value;
+
+ options.env = CurrentOptions().env;
+
+ // Block cache properties are not available for tables other than
+ // block-based table.
+ options.table_factory.reset(NewPlainTableFactory());
+ Reopen(options);
+ ASSERT_FALSE(
+ db_->GetIntProperty(DB::Properties::kBlockCacheCapacity, &value));
+ ASSERT_FALSE(db_->GetIntProperty(DB::Properties::kBlockCacheUsage, &value));
+ ASSERT_FALSE(
+ db_->GetIntProperty(DB::Properties::kBlockCachePinnedUsage, &value));
+
+ options.table_factory.reset(NewCuckooTableFactory());
+ Reopen(options);
+ ASSERT_FALSE(
+ db_->GetIntProperty(DB::Properties::kBlockCacheCapacity, &value));
+ ASSERT_FALSE(db_->GetIntProperty(DB::Properties::kBlockCacheUsage, &value));
+ ASSERT_FALSE(
+ db_->GetIntProperty(DB::Properties::kBlockCachePinnedUsage, &value));
+
+ // Block cache properties are not available if block cache is not used.
+ BlockBasedTableOptions table_options;
+ table_options.no_block_cache = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ Reopen(options);
+ ASSERT_FALSE(
+ db_->GetIntProperty(DB::Properties::kBlockCacheCapacity, &value));
+ ASSERT_FALSE(db_->GetIntProperty(DB::Properties::kBlockCacheUsage, &value));
+ ASSERT_FALSE(
+ db_->GetIntProperty(DB::Properties::kBlockCachePinnedUsage, &value));
+
+ // Test with empty block cache.
+ constexpr size_t kCapacity = 100;
+ LRUCacheOptions co;
+ co.capacity = kCapacity;
+ co.num_shard_bits = 0;
+ co.metadata_charge_policy = kDontChargeCacheMetadata;
+ auto block_cache = NewLRUCache(co);
+ table_options.block_cache = block_cache;
+ table_options.no_block_cache = false;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ Reopen(options);
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlockCacheCapacity, &value));
+ ASSERT_EQ(kCapacity, value);
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlockCacheUsage, &value));
+ ASSERT_EQ(0, value);
+ ASSERT_TRUE(
+ db_->GetIntProperty(DB::Properties::kBlockCachePinnedUsage, &value));
+ ASSERT_EQ(0, value);
+
+ // Insert unpinned item to the cache and check size.
+ constexpr size_t kSize1 = 50;
+ ASSERT_OK(block_cache->Insert("item1", nullptr /*value*/, kSize1,
+ nullptr /*deleter*/));
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlockCacheCapacity, &value));
+ ASSERT_EQ(kCapacity, value);
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlockCacheUsage, &value));
+ ASSERT_EQ(kSize1, value);
+ ASSERT_TRUE(
+ db_->GetIntProperty(DB::Properties::kBlockCachePinnedUsage, &value));
+ ASSERT_EQ(0, value);
+
+ // Insert pinned item to the cache and check size.
+ constexpr size_t kSize2 = 30;
+ Cache::Handle* item2 = nullptr;
+ ASSERT_OK(block_cache->Insert("item2", nullptr /*value*/, kSize2,
+ nullptr /*deleter*/, &item2));
+ ASSERT_NE(nullptr, item2);
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlockCacheCapacity, &value));
+ ASSERT_EQ(kCapacity, value);
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlockCacheUsage, &value));
+ ASSERT_EQ(kSize1 + kSize2, value);
+ ASSERT_TRUE(
+ db_->GetIntProperty(DB::Properties::kBlockCachePinnedUsage, &value));
+ ASSERT_EQ(kSize2, value);
+
+ // Insert another pinned item to make the cache over-sized.
+ constexpr size_t kSize3 = 80;
+ Cache::Handle* item3 = nullptr;
+ ASSERT_OK(block_cache->Insert("item3", nullptr /*value*/, kSize3,
+ nullptr /*deleter*/, &item3));
+ ASSERT_NE(nullptr, item3);
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlockCacheCapacity, &value));
+ ASSERT_EQ(kCapacity, value);
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlockCacheUsage, &value));
+ // item1 is evicted.
+ ASSERT_EQ(kSize2 + kSize3, value);
+ ASSERT_TRUE(
+ db_->GetIntProperty(DB::Properties::kBlockCachePinnedUsage, &value));
+ ASSERT_EQ(kSize2 + kSize3, value);
+
+ // Check size after release.
+ block_cache->Release(item2);
+ block_cache->Release(item3);
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlockCacheCapacity, &value));
+ ASSERT_EQ(kCapacity, value);
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlockCacheUsage, &value));
+ // item2 will be evicted, while item3 remains in the cache after release.
+ ASSERT_EQ(kSize3, value);
+ ASSERT_TRUE(
+ db_->GetIntProperty(DB::Properties::kBlockCachePinnedUsage, &value));
+ ASSERT_EQ(0, value);
+}
+
+TEST_F(DBPropertiesTest, GetMapPropertyDbStats) {
+ auto mock_clock = std::make_shared<MockSystemClock>(env_->GetSystemClock());
+ CompositeEnvWrapper env(env_, mock_clock);
+
+ Options opts = CurrentOptions();
+ opts.env = &env;
+ Reopen(opts);
+
+ {
+ std::map<std::string, std::string> db_stats;
+ ASSERT_TRUE(db_->GetMapProperty(DB::Properties::kDBStats, &db_stats));
+ AssertDbStats(db_stats, 0.0 /* expected_uptime */,
+ 0 /* expected_user_bytes_written */,
+ 0 /* expected_wal_bytes_written */,
+ 0 /* expected_user_writes_by_self */,
+ 0 /* expected_user_writes_with_wal */);
+ }
+
+ {
+ mock_clock->SleepForMicroseconds(1500000);
+
+ std::map<std::string, std::string> db_stats;
+ ASSERT_TRUE(db_->GetMapProperty(DB::Properties::kDBStats, &db_stats));
+ AssertDbStats(db_stats, 1.5 /* expected_uptime */,
+ 0 /* expected_user_bytes_written */,
+ 0 /* expected_wal_bytes_written */,
+ 0 /* expected_user_writes_by_self */,
+ 0 /* expected_user_writes_with_wal */);
+ }
+
+ int expected_user_bytes_written = 0;
+ {
+ // Write with WAL disabled.
+ WriteOptions write_opts;
+ write_opts.disableWAL = true;
+
+ WriteBatch batch;
+ ASSERT_OK(batch.Put("key", "val"));
+ expected_user_bytes_written += static_cast<int>(batch.GetDataSize());
+
+ ASSERT_OK(db_->Write(write_opts, &batch));
+
+ std::map<std::string, std::string> db_stats;
+ ASSERT_TRUE(db_->GetMapProperty(DB::Properties::kDBStats, &db_stats));
+ AssertDbStats(db_stats, 1.5 /* expected_uptime */,
+ expected_user_bytes_written,
+ 0 /* expected_wal_bytes_written */,
+ 1 /* expected_user_writes_by_self */,
+ 0 /* expected_user_writes_with_wal */);
+ }
+
+ int expected_wal_bytes_written = 0;
+ {
+ // Write with WAL enabled.
+ WriteBatch batch;
+ ASSERT_OK(batch.Delete("key"));
+ expected_user_bytes_written += static_cast<int>(batch.GetDataSize());
+ expected_wal_bytes_written += static_cast<int>(batch.GetDataSize());
+
+ ASSERT_OK(db_->Write(WriteOptions(), &batch));
+
+ std::map<std::string, std::string> db_stats;
+ ASSERT_TRUE(db_->GetMapProperty(DB::Properties::kDBStats, &db_stats));
+ AssertDbStats(db_stats, 1.5 /* expected_uptime */,
+ expected_user_bytes_written, expected_wal_bytes_written,
+ 2 /* expected_user_writes_by_self */,
+ 1 /* expected_user_writes_with_wal */);
+ }
+
+ Close();
+}
+
+TEST_F(DBPropertiesTest, GetMapPropertyBlockCacheEntryStats) {
+ // Currently only verifies the expected properties are present
+ std::map<std::string, std::string> values;
+ ASSERT_TRUE(
+ db_->GetMapProperty(DB::Properties::kBlockCacheEntryStats, &values));
+
+ ASSERT_TRUE(values.find(BlockCacheEntryStatsMapKeys::CacheId()) !=
+ values.end());
+ ASSERT_TRUE(values.find(BlockCacheEntryStatsMapKeys::CacheCapacityBytes()) !=
+ values.end());
+ ASSERT_TRUE(
+ values.find(
+ BlockCacheEntryStatsMapKeys::LastCollectionDurationSeconds()) !=
+ values.end());
+ ASSERT_TRUE(
+ values.find(BlockCacheEntryStatsMapKeys::LastCollectionAgeSeconds()) !=
+ values.end());
+ for (size_t i = 0; i < kNumCacheEntryRoles; ++i) {
+ CacheEntryRole role = static_cast<CacheEntryRole>(i);
+ ASSERT_TRUE(values.find(BlockCacheEntryStatsMapKeys::EntryCount(role)) !=
+ values.end());
+ ASSERT_TRUE(values.find(BlockCacheEntryStatsMapKeys::UsedBytes(role)) !=
+ values.end());
+ ASSERT_TRUE(values.find(BlockCacheEntryStatsMapKeys::UsedPercent(role)) !=
+ values.end());
+ }
+
+ // There should be no extra values in the map: three per-role keys plus the
+ // four cache-level keys checked above.
+ ASSERT_EQ(3 * kNumCacheEntryRoles + 4, values.size());
+}
+
+namespace {
+std::string PopMetaIndexKey(InternalIterator* meta_iter) {
+ Status s = meta_iter->status();
+ if (!s.ok()) {
+ return s.ToString();
+ } else if (meta_iter->Valid()) {
+ std::string rv = meta_iter->key().ToString();
+ meta_iter->Next();
+ return rv;
+ } else {
+ return "NOT_FOUND";
+ }
+}
+
+} // anonymous namespace
+
+TEST_F(DBPropertiesTest, TableMetaIndexKeys) {
+ // This is to detect unexpected churn in metaindex block keys. This is more
+ // of a "table test" but table_test.cc doesn't depend on db_test_util.h and
+ // we need ChangeOptions() for broad coverage.
+ constexpr int kKeyCount = 100;
+ do {
+ Options options;
+ options = CurrentOptions(options);
+ DestroyAndReopen(options);
+
+ // Create an SST file
+ for (int key = 0; key < kKeyCount; key++) {
+ ASSERT_OK(Put(Key(key), "val"));
+ }
+ ASSERT_OK(Flush());
+
+ // Find its file number
+ std::vector<LiveFileMetaData> files;
+ db_->GetLiveFilesMetaData(&files);
+ // 1 SST file
+ ASSERT_EQ(1, files.size());
+
+ // Open it for inspection
+ std::string sst_file =
+ files[0].directory + "/" + files[0].relative_filename;
+ std::unique_ptr<FSRandomAccessFile> f;
+ ASSERT_OK(env_->GetFileSystem()->NewRandomAccessFile(
+ sst_file, FileOptions(), &f, nullptr));
+ std::unique_ptr<RandomAccessFileReader> r;
+ r.reset(new RandomAccessFileReader(std::move(f), sst_file));
+ uint64_t file_size = 0;
+ ASSERT_OK(env_->GetFileSize(sst_file, &file_size));
+
+ // Read metaindex
+ BlockContents bc;
+ ASSERT_OK(ReadMetaIndexBlockInFile(r.get(), file_size, 0U,
+ ImmutableOptions(options), &bc));
+ Block metaindex_block(std::move(bc));
+ std::unique_ptr<InternalIterator> meta_iter;
+ meta_iter.reset(metaindex_block.NewMetaIterator());
+ meta_iter->SeekToFirst();
+
+ if (strcmp(options.table_factory->Name(),
+ TableFactory::kBlockBasedTableName()) == 0) {
+ auto bbto = options.table_factory->GetOptions<BlockBasedTableOptions>();
+ if (bbto->filter_policy) {
+ if (bbto->partition_filters) {
+ // The key names are intentionally hard-coded here to detect
+ // accidental compatibility regressions.
+ EXPECT_EQ("partitionedfilter.rocksdb.BuiltinBloomFilter",
+ PopMetaIndexKey(meta_iter.get()));
+ } else {
+ EXPECT_EQ("fullfilter.rocksdb.BuiltinBloomFilter",
+ PopMetaIndexKey(meta_iter.get()));
+ }
+ }
+ if (bbto->index_type == BlockBasedTableOptions::kHashSearch) {
+ EXPECT_EQ("rocksdb.hashindex.metadata",
+ PopMetaIndexKey(meta_iter.get()));
+ EXPECT_EQ("rocksdb.hashindex.prefixes",
+ PopMetaIndexKey(meta_iter.get()));
+ }
+ }
+ EXPECT_EQ("rocksdb.properties", PopMetaIndexKey(meta_iter.get()));
+ EXPECT_EQ("NOT_FOUND", PopMetaIndexKey(meta_iter.get()));
+ } while (ChangeOptions());
+}
+
+#endif // ROCKSDB_LITE
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_range_del_test.cc b/src/rocksdb/db/db_range_del_test.cc
new file mode 100644
index 000000000..d576f2217
--- /dev/null
+++ b/src/rocksdb/db/db_range_del_test.cc
@@ -0,0 +1,2807 @@
+// Copyright (c) 2016-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/db_test_util.h"
+#include "db/version_set.h"
+#include "port/stack_trace.h"
+#include "rocksdb/utilities/write_batch_with_index.h"
+#include "test_util/testutil.h"
+#include "util/random.h"
+#include "utilities/merge_operators.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// TODO(cbi): parameterize the test to cover user-defined timestamp cases
+class DBRangeDelTest : public DBTestBase {
+ public:
+ DBRangeDelTest() : DBTestBase("db_range_del_test", /*env_do_fsync=*/false) {}
+
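+ // Encodes the key as the raw 8-byte representation of a uint64_t; tests that
+ // use these keys pair them with test::Uint64Comparator so they sort
+ // numerically.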
+ std::string GetNumericStr(int key) {
+ uint64_t uint64_key = static_cast<uint64_t>(key);
+ std::string str;
+ str.resize(8);
+ memcpy(&str[0], static_cast<void*>(&uint64_key), 8);
+ return str;
+ }
+};
+
+// PlainTableFactory, WriteBatchWithIndex, and NumTableFilesAtLevel() are not
+// supported in ROCKSDB_LITE
+#ifndef ROCKSDB_LITE
+TEST_F(DBRangeDelTest, NonBlockBasedTableNotSupported) {
+ // TODO: figure out why MmapReads trips the iterator pinning assertion in
+ // RangeDelAggregator. Ideally it would be supported; otherwise it should at
+ // least be explicitly unsupported.
+ for (auto config : {kPlainTableAllBytesPrefix, /* kWalDirAndMmapReads */}) {
+ option_config_ = config;
+ DestroyAndReopen(CurrentOptions());
+ ASSERT_TRUE(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ "dr1", "dr1")
+ .IsNotSupported());
+ }
+}
+
+TEST_F(DBRangeDelTest, WriteBatchWithIndexNotSupported) {
+ WriteBatchWithIndex indexedBatch{};
+ ASSERT_TRUE(indexedBatch.DeleteRange(db_->DefaultColumnFamily(), "dr1", "dr1")
+ .IsNotSupported());
+ ASSERT_TRUE(indexedBatch.DeleteRange("dr1", "dr1").IsNotSupported());
+}
+
+TEST_F(DBRangeDelTest, EndSameAsStartCoversNothing) {
+ ASSERT_OK(db_->Put(WriteOptions(), "b", "val"));
+ ASSERT_OK(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "b", "b"));
+ ASSERT_EQ("val", Get("b"));
+}
+
+TEST_F(DBRangeDelTest, EndComesBeforeStartInvalidArgument) {
+ ASSERT_OK(db_->Put(WriteOptions(), "b", "val"));
+ ASSERT_TRUE(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "b", "a")
+ .IsInvalidArgument());
+ ASSERT_EQ("val", Get("b"));
+}
+
+TEST_F(DBRangeDelTest, FlushOutputHasOnlyRangeTombstones) {
+ do {
+ DestroyAndReopen(CurrentOptions());
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ "dr1", "dr2"));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ ASSERT_EQ(1, NumTableFilesAtLevel(0));
+ } while (ChangeOptions(kRangeDelSkipConfigs));
+}
+
+TEST_F(DBRangeDelTest, DictionaryCompressionWithOnlyRangeTombstones) {
+ Options opts = CurrentOptions();
+ opts.compression_opts.max_dict_bytes = 16384;
+ Reopen(opts);
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "dr1",
+ "dr2"));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+}
+
+TEST_F(DBRangeDelTest, CompactionOutputHasOnlyRangeTombstone) {
+ do {
+ Options opts = CurrentOptions();
+ opts.disable_auto_compactions = true;
+ opts.statistics = CreateDBStatistics();
+ DestroyAndReopen(opts);
+
+ // The snapshot prevents the range tombstone from being dropped as obsolete.
+ const Snapshot* snapshot = db_->GetSnapshot();
+ ASSERT_OK(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+
+ ASSERT_EQ(1, NumTableFilesAtLevel(0));
+ ASSERT_EQ(0, NumTableFilesAtLevel(1));
+ ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr,
+ true /* disallow_trivial_move */));
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+ ASSERT_EQ(1, NumTableFilesAtLevel(1));
+ ASSERT_EQ(0, TestGetTickerCount(opts, COMPACTION_RANGE_DEL_DROP_OBSOLETE));
+ db_->ReleaseSnapshot(snapshot);
+ // Skip cuckoo memtables, which do not support snapshots. Skip non-leveled
+ // compactions as the above assertions about the number of files in a level
+ // do not hold true.
+ } while (ChangeOptions(kRangeDelSkipConfigs | kSkipUniversalCompaction |
+ kSkipFIFOCompaction));
+}
+
+TEST_F(DBRangeDelTest, CompactionOutputFilesExactlyFilled) {
+ // regression test for exactly filled compaction output files. Previously
+ // another file would be generated containing all range deletions, which
+ // could invalidate the non-overlapping file boundary invariant.
+ const int kNumPerFile = 4, kNumFiles = 2, kFileBytes = 9 << 10;
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.level0_file_num_compaction_trigger = kNumFiles;
+ options.memtable_factory.reset(test::NewSpecialSkipListFactory(kNumPerFile));
+ options.num_levels = 2;
+ options.target_file_size_base = kFileBytes;
+ BlockBasedTableOptions table_options;
+ table_options.block_size_deviation = 50; // each block holds two keys
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ Reopen(options);
+
+ // The snapshot prevents the range tombstone from being dropped as obsolete.
+ const Snapshot* snapshot = db_->GetSnapshot();
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(0),
+ Key(1)));
+
+ Random rnd(301);
+ for (int i = 0; i < kNumFiles; ++i) {
+ std::vector<std::string> values;
+ // Write 12K (4 values, each 3K)
+ for (int j = 0; j < kNumPerFile; j++) {
+ values.push_back(rnd.RandomString(3 << 10));
+ ASSERT_OK(Put(Key(i * kNumPerFile + j), values[j]));
+ if (j == 0 && i > 0) {
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ }
+ }
+ }
+ // put extra key to trigger final flush
+ ASSERT_OK(Put("", ""));
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_EQ(kNumFiles, NumTableFilesAtLevel(0));
+ ASSERT_EQ(0, NumTableFilesAtLevel(1));
+
+ ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr,
+ true /* disallow_trivial_move */));
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+ ASSERT_EQ(2, NumTableFilesAtLevel(1));
+ db_->ReleaseSnapshot(snapshot);
+}
+
+TEST_F(DBRangeDelTest, MaxCompactionBytesCutsOutputFiles) {
+ // Ensures range deletion spanning multiple compaction output files that are
+ // cut by max_compaction_bytes will have non-overlapping key-ranges.
+ // https://github.com/facebook/rocksdb/issues/1778
+ const int kNumFiles = 2, kNumPerFile = 1 << 8, kBytesPerVal = 1 << 12;
+ Options opts = CurrentOptions();
+ opts.comparator = test::Uint64Comparator();
+ opts.disable_auto_compactions = true;
+ opts.level0_file_num_compaction_trigger = kNumFiles;
+ opts.max_compaction_bytes = kNumPerFile * kBytesPerVal;
+ opts.memtable_factory.reset(test::NewSpecialSkipListFactory(kNumPerFile));
+ // Want max_compaction_bytes to trigger the end of compaction output files, not
+ // target_file_size_base, so the latter is raised to 100 * max_compaction_bytes
+ // via SetOptions() below, after the initial files have been moved to L2.
+ opts.target_file_size_base = 1;
+ DestroyAndReopen(opts);
+
+ // The snapshot prevents the range tombstone from being dropped as obsolete.
+ const Snapshot* snapshot = db_->GetSnapshot();
+
+ Random rnd(301);
+
+ ASSERT_OK(Put(GetNumericStr(0), rnd.RandomString(kBytesPerVal)));
+ ASSERT_OK(
+ Put(GetNumericStr(kNumPerFile - 1), rnd.RandomString(kBytesPerVal)));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put(GetNumericStr(kNumPerFile), rnd.RandomString(kBytesPerVal)));
+ ASSERT_OK(
+ Put(GetNumericStr(kNumPerFile * 2 - 1), rnd.RandomString(kBytesPerVal)));
+ ASSERT_OK(Flush());
+ MoveFilesToLevel(2);
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+ ASSERT_EQ(NumTableFilesAtLevel(2), 2);
+
+ ASSERT_OK(
+ db_->SetOptions(db_->DefaultColumnFamily(),
+ {{"target_file_size_base",
+ std::to_string(100 * opts.max_compaction_bytes)}}));
+
+ // The range deletion spans the whole key-range, so it will be included in
+ // all output files.
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ GetNumericStr(0),
+ GetNumericStr(kNumFiles * kNumPerFile - 1)));
+
+ for (int i = 0; i < kNumFiles; ++i) {
+ std::vector<std::string> values;
+ // Write 1MB (256 values, each 4K)
+ for (int j = 0; j < kNumPerFile; j++) {
+ values.push_back(rnd.RandomString(kBytesPerVal));
+ ASSERT_OK(Put(GetNumericStr(kNumPerFile * i + j), values[j]));
+ }
+ // extra entry to trigger SpecialSkipListFactory's flush
+ ASSERT_OK(Put(GetNumericStr(kNumPerFile), ""));
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_EQ(i + 1, NumTableFilesAtLevel(0));
+ }
+
+ ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr,
+ /*column_family=*/nullptr,
+ /*disallow_trivial_move=*/true));
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+ ASSERT_GE(NumTableFilesAtLevel(1), 2);
+ std::vector<std::vector<FileMetaData>> files;
+ dbfull()->TEST_GetFilesMetaData(db_->DefaultColumnFamily(), &files);
+
+ for (size_t i = 0; i + 1 < files[1].size(); ++i) {
+ ASSERT_TRUE(InternalKeyComparator(opts.comparator)
+ .Compare(files[1][i].largest, files[1][i + 1].smallest) <
+ 0);
+ }
+ db_->ReleaseSnapshot(snapshot);
+}
+
+TEST_F(DBRangeDelTest, SentinelsOmittedFromOutputFile) {
+ // Regression test for bug where sentinel range deletions (i.e., ones with
+ // sequence number of zero) were included in output files.
+ // The snapshot prevents the range tombstones from being dropped as obsolete.
+ const Snapshot* snapshot = db_->GetSnapshot();
+
+ // Gaps between the ranges create sentinels in our internal representation.
+ std::vector<std::pair<std::string, std::string>> range_dels = {
+ {"a", "b"}, {"c", "d"}, {"e", "f"}};
+ for (const auto& range_del : range_dels) {
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ range_del.first, range_del.second));
+ }
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ ASSERT_EQ(1, NumTableFilesAtLevel(0));
+
+ std::vector<std::vector<FileMetaData>> files;
+ dbfull()->TEST_GetFilesMetaData(db_->DefaultColumnFamily(), &files);
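+ // A sentinel tombstone carries sequence number zero, so if one had leaked
+ // into the output file its smallest seqno would be zero.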
+ ASSERT_GT(files[0][0].fd.smallest_seqno, 0);
+
+ db_->ReleaseSnapshot(snapshot);
+}
+
+TEST_F(DBRangeDelTest, FlushRangeDelsSameStartKey) {
+ ASSERT_OK(db_->Put(WriteOptions(), "b1", "val"));
+ ASSERT_OK(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "c"));
+ ASSERT_OK(db_->Put(WriteOptions(), "b2", "val"));
+ ASSERT_OK(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "b"));
+ // first iteration verifies query correctness in memtable, second verifies
+ // query correctness for a single SST file
+ for (int i = 0; i < 2; ++i) {
+ if (i > 0) {
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ ASSERT_EQ(1, NumTableFilesAtLevel(0));
+ }
+ std::string value;
+ ASSERT_TRUE(db_->Get(ReadOptions(), "b1", &value).IsNotFound());
+ ASSERT_OK(db_->Get(ReadOptions(), "b2", &value));
+ }
+}
+
+TEST_F(DBRangeDelTest, CompactRangeDelsSameStartKey) {
+ ASSERT_OK(db_->Put(WriteOptions(), "unused",
+ "val")); // prevents empty after compaction
+ ASSERT_OK(db_->Put(WriteOptions(), "b1", "val"));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ ASSERT_OK(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "c"));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ ASSERT_OK(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "b"));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ ASSERT_EQ(3, NumTableFilesAtLevel(0));
+
+ for (int i = 0; i < 2; ++i) {
+ if (i > 0) {
+ ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr,
+ true /* disallow_trivial_move */));
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+ ASSERT_EQ(1, NumTableFilesAtLevel(1));
+ }
+ std::string value;
+ ASSERT_TRUE(db_->Get(ReadOptions(), "b1", &value).IsNotFound());
+ }
+}
+#endif // ROCKSDB_LITE
+
+TEST_F(DBRangeDelTest, FlushRemovesCoveredKeys) {
+ const int kNum = 300, kRangeBegin = 50, kRangeEnd = 250;
+ Options opts = CurrentOptions();
+ opts.comparator = test::Uint64Comparator();
+ DestroyAndReopen(opts);
+
+ // Write a third of the keys before the snapshot, a third between the snapshot
+ // and the tombstone, and a third after the tombstone. Keys older than the
+ // snapshot or newer than the tombstone should be preserved.
+ const Snapshot* snapshot = nullptr;
+ for (int i = 0; i < kNum; ++i) {
+ if (i == kNum / 3) {
+ snapshot = db_->GetSnapshot();
+ } else if (i == 2 * kNum / 3) {
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ GetNumericStr(kRangeBegin),
+ GetNumericStr(kRangeEnd)));
+ }
+ ASSERT_OK(db_->Put(WriteOptions(), GetNumericStr(i), "val"));
+ }
+ ASSERT_OK(db_->Flush(FlushOptions()));
+
+ for (int i = 0; i < kNum; ++i) {
+ ReadOptions read_opts;
+ read_opts.ignore_range_deletions = true;
+ std::string value;
+ if (i < kRangeBegin || i > kRangeEnd || i < kNum / 3 || i >= 2 * kNum / 3) {
+ ASSERT_OK(db_->Get(read_opts, GetNumericStr(i), &value));
+ } else {
+ ASSERT_TRUE(db_->Get(read_opts, GetNumericStr(i), &value).IsNotFound());
+ }
+ }
+ db_->ReleaseSnapshot(snapshot);
+}
+
+// NumTableFilesAtLevel() is not supported in ROCKSDB_LITE
+#ifndef ROCKSDB_LITE
+TEST_F(DBRangeDelTest, CompactionRemovesCoveredKeys) {
+ const int kNumPerFile = 100, kNumFiles = 4;
+ Options opts = CurrentOptions();
+ opts.comparator = test::Uint64Comparator();
+ opts.disable_auto_compactions = true;
+ opts.memtable_factory.reset(test::NewSpecialSkipListFactory(kNumPerFile));
+ opts.num_levels = 2;
+ opts.statistics = CreateDBStatistics();
+ DestroyAndReopen(opts);
+
+ for (int i = 0; i < kNumFiles; ++i) {
+ if (i > 0) {
+ // The range tombstone covers the first half of the previous file's keys.
+ ASSERT_OK(db_->DeleteRange(
+ WriteOptions(), db_->DefaultColumnFamily(),
+ GetNumericStr((i - 1) * kNumPerFile),
+ GetNumericStr((i - 1) * kNumPerFile + kNumPerFile / 2)));
+ }
+ // Make sure a given key appears in each file so compaction won't be able to
+ // use trivial move, which would happen if the ranges were non-overlapping.
+ // Also, we need an extra element since flush is only triggered when the
+ // number of keys is one greater than SpecialSkipListFactory's limit.
+ // We choose a key outside the key-range used by the test to avoid conflict.
+ ASSERT_OK(db_->Put(WriteOptions(), GetNumericStr(kNumPerFile * kNumFiles),
+ "val"));
+
+ for (int j = 0; j < kNumPerFile; ++j) {
+ ASSERT_OK(
+ db_->Put(WriteOptions(), GetNumericStr(i * kNumPerFile + j), "val"));
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_EQ(i + 1, NumTableFilesAtLevel(0));
+ }
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+ ASSERT_GT(NumTableFilesAtLevel(1), 0);
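+ // Each of the first kNumFiles - 1 files had half of its kNumPerFile keys
+ // covered by a range tombstone, so that many keys are dropped in total.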
+ ASSERT_EQ((kNumFiles - 1) * kNumPerFile / 2,
+ TestGetTickerCount(opts, COMPACTION_KEY_DROP_RANGE_DEL));
+
+ for (int i = 0; i < kNumFiles; ++i) {
+ for (int j = 0; j < kNumPerFile; ++j) {
+ ReadOptions read_opts;
+ read_opts.ignore_range_deletions = true;
+ std::string value;
+ if (i == kNumFiles - 1 || j >= kNumPerFile / 2) {
+ ASSERT_OK(
+ db_->Get(read_opts, GetNumericStr(i * kNumPerFile + j), &value));
+ } else {
+ ASSERT_TRUE(
+ db_->Get(read_opts, GetNumericStr(i * kNumPerFile + j), &value)
+ .IsNotFound());
+ }
+ }
+ }
+}
+
+TEST_F(DBRangeDelTest, ValidLevelSubcompactionBoundaries) {
+ const int kNumPerFile = 100, kNumFiles = 4, kFileBytes = 100 << 10;
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.level0_file_num_compaction_trigger = kNumFiles;
+ options.max_bytes_for_level_base = 2 * kFileBytes;
+ options.max_subcompactions = 4;
+ options.memtable_factory.reset(test::NewSpecialSkipListFactory(kNumPerFile));
+ options.num_levels = 3;
+ options.target_file_size_base = kFileBytes;
+ options.target_file_size_multiplier = 1;
+ options.max_compaction_bytes = 1500;
+ Reopen(options);
+
+ Random rnd(301);
+ for (int i = 0; i < 2; ++i) {
+ for (int j = 0; j < kNumFiles; ++j) {
+ if (i > 0) {
+ // delete [95,105) in two files, [295,305) in next two
+ int mid = (j + (1 - j % 2)) * kNumPerFile;
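+ // For j = 0,1 this yields mid = 100; for j = 2,3 it yields mid = 300
+ // (kNumPerFile == 100).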
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ Key(mid - 5), Key(mid + 5)));
+ }
+ std::vector<std::string> values;
+ // Write 100KB (100 values, each 1K)
+ for (int k = 0; k < kNumPerFile; k++) {
+ values.push_back(rnd.RandomString(990));
+ ASSERT_OK(Put(Key(j * kNumPerFile + k), values[k]));
+ }
+ // put extra key to trigger flush
+ ASSERT_OK(Put("", ""));
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ if (j < kNumFiles - 1) {
+ // background compaction may happen early for kNumFiles'th file
+ ASSERT_EQ(NumTableFilesAtLevel(0), j + 1);
+ }
+ if (j == options.level0_file_num_compaction_trigger - 1) {
+ // When i == 1, compaction will output some files to L1, at which point
+ // L1 is not bottommost so range deletions cannot be compacted away. The
+ // new L1 files must be generated with non-overlapping key ranges even
+ // though multiple subcompactions see the same ranges deleted, else an
+ // assertion will fail.
+ //
+ // Only enable auto-compactions when we're ready; otherwise, the
+ // oversized L0 (relative to base_level) causes the compaction to run
+ // earlier.
+ ASSERT_OK(db_->EnableAutoCompaction({db_->DefaultColumnFamily()}));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_OK(db_->SetOptions(db_->DefaultColumnFamily(),
+ {{"disable_auto_compactions", "true"}}));
+ ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+ ASSERT_GT(NumTableFilesAtLevel(1), 0);
+ ASSERT_GT(NumTableFilesAtLevel(2), 0);
+ }
+ }
+ }
+}
+
+TEST_F(DBRangeDelTest, ValidUniversalSubcompactionBoundaries) {
+ const int kNumPerFile = 100, kFilesPerLevel = 4, kNumLevels = 4;
+ Options options = CurrentOptions();
+ options.compaction_options_universal.min_merge_width = kFilesPerLevel;
+ options.compaction_options_universal.max_merge_width = kFilesPerLevel;
+ options.compaction_options_universal.size_ratio = 10;
+ options.compaction_style = kCompactionStyleUniversal;
+ options.level0_file_num_compaction_trigger = kFilesPerLevel;
+ options.max_subcompactions = 4;
+ options.memtable_factory.reset(test::NewSpecialSkipListFactory(kNumPerFile));
+ options.num_levels = kNumLevels;
+ options.target_file_size_base = kNumPerFile << 10;
+ options.target_file_size_multiplier = 1;
+ Reopen(options);
+
+ Random rnd(301);
+ for (int i = 0; i < kNumLevels - 1; ++i) {
+ for (int j = 0; j < kFilesPerLevel; ++j) {
+ if (i == kNumLevels - 2) {
+ // insert range deletions [95,105) in two files, [295,305) in next two
+ // to prepare L1 for later manual compaction.
+ int mid = (j + (1 - j % 2)) * kNumPerFile;
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ Key(mid - 5), Key(mid + 5)));
+ }
+ std::vector<std::string> values;
+ // Write 100KB (100 values, each 1K)
+ for (int k = 0; k < kNumPerFile; k++) {
+ values.push_back(rnd.RandomString(990));
+ ASSERT_OK(Put(Key(j * kNumPerFile + k), values[k]));
+ }
+ // put extra key to trigger flush
+ ASSERT_OK(Put("", ""));
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ if (j < kFilesPerLevel - 1) {
+ // background compaction may happen early for kFilesPerLevel'th file
+ ASSERT_EQ(NumTableFilesAtLevel(0), j + 1);
+ }
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+ ASSERT_GT(NumTableFilesAtLevel(kNumLevels - 1 - i), kFilesPerLevel - 1);
+ }
+ // Now L1-L3 are full. When we compact L1->L2 we should see (1) subcompactions
+ // happen since the input level > 0, and (2) range deletions preserved since
+ // the output level is not the bottommost. If no file boundary assertion
+ // fails, that probably means universal compaction + subcompaction + range
+ // deletion are compatible.
+ ASSERT_OK(dbfull()->RunManualCompaction(
+ static_cast_with_check<ColumnFamilyHandleImpl>(db_->DefaultColumnFamily())
+ ->cfd(),
+ 1 /* input_level */, 2 /* output_level */, CompactRangeOptions(),
+ nullptr /* begin */, nullptr /* end */, true /* exclusive */,
+ true /* disallow_trivial_move */,
+ std::numeric_limits<uint64_t>::max() /* max_file_num_to_ignore */,
+ "" /*trim_ts*/));
+}
+#endif // ROCKSDB_LITE
+
+TEST_F(DBRangeDelTest, CompactionRemovesCoveredMergeOperands) {
+ const int kNumPerFile = 3, kNumFiles = 3;
+ Options opts = CurrentOptions();
+ opts.disable_auto_compactions = true;
+ opts.memtable_factory.reset(test::NewSpecialSkipListFactory(2 * kNumPerFile));
+ opts.merge_operator = MergeOperators::CreateUInt64AddOperator();
+ opts.num_levels = 2;
+ Reopen(opts);
+
+ // Iterates kNumFiles * kNumPerFile + 1 times since flushing the last file
+ // requires an extra entry.
+ for (int i = 0; i <= kNumFiles * kNumPerFile; ++i) {
+ if (i % kNumPerFile == 0 && i / kNumPerFile == kNumFiles - 1) {
+ // Delete merge operands from all but the last file
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ "key", "key_"));
+ }
+ std::string val;
+ PutFixed64(&val, i);
+ ASSERT_OK(db_->Merge(WriteOptions(), "key", val));
+ // we need to prevent trivial move using Puts so compaction will actually
+ // process the merge operands.
+ ASSERT_OK(db_->Put(WriteOptions(), "prevent_trivial_move", ""));
+ if (i > 0 && i % kNumPerFile == 0) {
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ }
+ }
+
+ ReadOptions read_opts;
+ read_opts.ignore_range_deletions = true;
+ std::string expected, actual;
+ ASSERT_OK(db_->Get(read_opts, "key", &actual));
+ PutFixed64(&expected, 45); // 1+2+...+9
+ ASSERT_EQ(expected, actual);
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+ expected.clear();
+ ASSERT_OK(db_->Get(read_opts, "key", &actual));
+ uint64_t tmp;
+ Slice tmp2(actual);
+ GetFixed64(&tmp2, &tmp);
+ PutFixed64(&expected, 30); // 6+7+8+9 (earlier operands covered by tombstone)
+ ASSERT_EQ(expected, actual);
+}
+
+TEST_F(DBRangeDelTest, PutDeleteRangeMergeFlush) {
+ // Test the sequence of operations: (1) Put, (2) DeleteRange, (3) Merge, (4)
+ // Flush. The `CompactionIterator` previously had a bug where we forgot to
+ // check for covering range tombstones when processing the (1) Put, causing
+ // it to reappear after the flush.
+ Options opts = CurrentOptions();
+ opts.merge_operator = MergeOperators::CreateUInt64AddOperator();
+ Reopen(opts);
+
+ std::string val;
+ PutFixed64(&val, 1);
+ ASSERT_OK(db_->Put(WriteOptions(), "key", val));
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "key",
+ "key_"));
+ ASSERT_OK(db_->Merge(WriteOptions(), "key", val));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+
+ ReadOptions read_opts;
+ std::string expected, actual;
+ ASSERT_OK(db_->Get(read_opts, "key", &actual));
+ PutFixed64(&expected, 1);
+ ASSERT_EQ(expected, actual);
+}
+
+// NumTableFilesAtLevel() is not supported in ROCKSDB_LITE
+#ifndef ROCKSDB_LITE
+TEST_F(DBRangeDelTest, ObsoleteTombstoneCleanup) {
+ // During compaction to bottommost level, verify range tombstones older than
+ // the oldest snapshot are removed, while others are preserved.
+ Options opts = CurrentOptions();
+ opts.disable_auto_compactions = true;
+ opts.num_levels = 2;
+ opts.statistics = CreateDBStatistics();
+ Reopen(opts);
+
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "dr1",
+ "dr10")); // obsolete after compaction
+ ASSERT_OK(db_->Put(WriteOptions(), "key", "val"));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ const Snapshot* snapshot = db_->GetSnapshot();
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "dr2",
+ "dr20")); // protected by snapshot
+ ASSERT_OK(db_->Put(WriteOptions(), "key", "val"));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+
+ ASSERT_EQ(2, NumTableFilesAtLevel(0));
+ ASSERT_EQ(0, NumTableFilesAtLevel(1));
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+ ASSERT_EQ(1, NumTableFilesAtLevel(1));
+ ASSERT_EQ(1, TestGetTickerCount(opts, COMPACTION_RANGE_DEL_DROP_OBSOLETE));
+
+ db_->ReleaseSnapshot(snapshot);
+}
+
+TEST_F(DBRangeDelTest, TableEvictedDuringScan) {
+ // The RangeDelAggregator holds pointers into range deletion blocks created by
+ // table readers. This test ensures the aggregator can still access those
+ // blocks even if it outlives the table readers that created them.
+ //
+ // DBIter always keeps readers open for L0 files. So, in order to test
+ // aggregator outliving reader, we need to have deletions in L1 files, which
+ // are opened/closed on-demand during the scan. This is accomplished by
+ // setting kNumRanges > level0_stop_writes_trigger, which prevents deletions
+ // from all lingering in L0 (there is at most one range deletion per L0 file).
+ //
+ // The first L1 file will contain a range deletion since its begin key is 0.
+ // SeekToFirst() references that table's reader and adds its range tombstone
+ // to the aggregator. Upon advancing beyond that table's key-range via Next(),
+ // the table reader will be unreferenced by the iterator. Since we manually
+ // call Evict() on all readers before the full scan, this unreference causes
+ // the reader's refcount to drop to zero and thus be destroyed.
+ //
+ // When it is destroyed, we do not remove its range deletions from the
+ // aggregator. So, subsequent calls to Next() must be able to use these
+ // deletions to decide whether a key is covered. This will work as long as
+ // the aggregator properly references the range deletion block.
+ const int kNum = 25, kRangeBegin = 0, kRangeEnd = 7, kNumRanges = 5;
+ Options opts = CurrentOptions();
+ opts.comparator = test::Uint64Comparator();
+ opts.level0_file_num_compaction_trigger = 4;
+ opts.level0_stop_writes_trigger = 4;
+ opts.memtable_factory.reset(test::NewSpecialSkipListFactory(1));
+ opts.num_levels = 2;
+ BlockBasedTableOptions bbto;
+ bbto.cache_index_and_filter_blocks = true;
+ bbto.block_cache = NewLRUCache(8 << 20);
+ opts.table_factory.reset(NewBlockBasedTableFactory(bbto));
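+ // The shared block cache set up above holds the range deletion blocks loaded
+ // by the table readers; the scan below calls EraseUnRefEntries() so those
+ // blocks are freed as soon as their refcounts drop to zero.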
+ DestroyAndReopen(opts);
+
+ // Hold a snapshot so range deletions can't become obsolete during compaction
+ // to bottommost level (i.e., L1).
+ const Snapshot* snapshot = db_->GetSnapshot();
+ for (int i = 0; i < kNum; ++i) {
+ ASSERT_OK(db_->Put(WriteOptions(), GetNumericStr(i), "val"));
+ if (i > 0) {
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ }
+ if (i >= kNum / 2 && i < kNum / 2 + kNumRanges) {
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ GetNumericStr(kRangeBegin),
+ GetNumericStr(kRangeEnd)));
+ }
+ }
+ // Must be > 1 so the first L1 file can be closed before scan finishes
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_GT(NumTableFilesAtLevel(1), 1);
+ std::vector<uint64_t> file_numbers = ListTableFiles(env_, dbname_);
+
+ ReadOptions read_opts;
+ auto* iter = db_->NewIterator(read_opts);
+ ASSERT_OK(iter->status());
+ int expected = kRangeEnd;
+ iter->SeekToFirst();
+ for (auto file_number : file_numbers) {
+ // Evicting the cache entries leaves the iterator holding the only reference
+ // to each table reader, so a reader is destroyed as soon as the iterator
+ // releases it.
+ TableCache::Evict(dbfull()->TEST_table_cache(), file_number);
+ }
+ for (; iter->Valid(); iter->Next()) {
+ ASSERT_EQ(GetNumericStr(expected), iter->key());
+ ++expected;
+ // Keep clearing block cache's LRU so range deletion block can be freed as
+ // soon as its refcount drops to zero.
+ bbto.block_cache->EraseUnRefEntries();
+ }
+ ASSERT_EQ(kNum, expected);
+ delete iter;
+ db_->ReleaseSnapshot(snapshot);
+
+ // Also test proper cache handling in GetRangeTombstoneIterator,
+ // via TablesRangeTombstoneSummary. (This once triggered memory leak
+ // report with ASAN.)
+ opts.max_open_files = 1;
+ Reopen(opts);
+
+ std::string str;
+ ASSERT_OK(dbfull()->TablesRangeTombstoneSummary(db_->DefaultColumnFamily(),
+ 100, &str));
+}
+
+TEST_F(DBRangeDelTest, GetCoveredKeyFromMutableMemtable) {
+ do {
+ DestroyAndReopen(CurrentOptions());
+ ASSERT_OK(db_->Put(WriteOptions(), "key", "val"));
+ ASSERT_OK(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"));
+
+ ReadOptions read_opts;
+ std::string value;
+ ASSERT_TRUE(db_->Get(read_opts, "key", &value).IsNotFound());
+ } while (ChangeOptions(kRangeDelSkipConfigs));
+}
+
+TEST_F(DBRangeDelTest, GetCoveredKeyFromImmutableMemtable) {
+ do {
+ Options opts = CurrentOptions();
+ opts.max_write_buffer_number = 3;
+ opts.min_write_buffer_number_to_merge = 2;
+ // SpecialSkipListFactory lets us specify maximum number of elements the
+ // memtable can hold. It switches the active memtable to immutable (flush is
+ // prevented by the above options) upon inserting an element that would
+ // overflow the memtable.
+ opts.memtable_factory.reset(test::NewSpecialSkipListFactory(1));
+ DestroyAndReopen(opts);
+
+ ASSERT_OK(db_->Put(WriteOptions(), "key", "val"));
+ ASSERT_OK(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"));
+ ASSERT_OK(db_->Put(WriteOptions(), "blah", "val"));
+
+ ReadOptions read_opts;
+ std::string value;
+ ASSERT_TRUE(db_->Get(read_opts, "key", &value).IsNotFound());
+ } while (ChangeOptions(kRangeDelSkipConfigs));
+}
+
+TEST_F(DBRangeDelTest, GetCoveredKeyFromSst) {
+ do {
+ DestroyAndReopen(CurrentOptions());
+ ASSERT_OK(db_->Put(WriteOptions(), "key", "val"));
+ // snapshot prevents key from being deleted during flush
+ const Snapshot* snapshot = db_->GetSnapshot();
+ ASSERT_OK(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+
+ ReadOptions read_opts;
+ std::string value;
+ ASSERT_TRUE(db_->Get(read_opts, "key", &value).IsNotFound());
+ db_->ReleaseSnapshot(snapshot);
+ } while (ChangeOptions(kRangeDelSkipConfigs));
+}
+
+TEST_F(DBRangeDelTest, GetCoveredMergeOperandFromMemtable) {
+ const int kNumMergeOps = 10;
+ Options opts = CurrentOptions();
+ opts.merge_operator = MergeOperators::CreateUInt64AddOperator();
+ Reopen(opts);
+
+ for (int i = 0; i < kNumMergeOps; ++i) {
+ std::string val;
+ PutFixed64(&val, i);
+ ASSERT_OK(db_->Merge(WriteOptions(), "key", val));
+ if (i == kNumMergeOps / 2) {
+ // deletes [0, 5]
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ "key", "key_"));
+ }
+ }
+
+ ReadOptions read_opts;
+ std::string expected, actual;
+ ASSERT_OK(db_->Get(read_opts, "key", &actual));
+ PutFixed64(&expected, 30); // 6+7+8+9
+ ASSERT_EQ(expected, actual);
+
+ expected.clear();
+ read_opts.ignore_range_deletions = true;
+ ASSERT_OK(db_->Get(read_opts, "key", &actual));
+ PutFixed64(&expected, 45); // 0+1+2+...+9
+ ASSERT_EQ(expected, actual);
+}
+
+TEST_F(DBRangeDelTest, GetIgnoresRangeDeletions) {
+ Options opts = CurrentOptions();
+ opts.max_write_buffer_number = 4;
+ opts.min_write_buffer_number_to_merge = 3;
+ opts.memtable_factory.reset(test::NewSpecialSkipListFactory(1));
+ Reopen(opts);
+
+ ASSERT_OK(db_->Put(WriteOptions(), "sst_key", "val"));
+ // snapshot prevents key from being deleted during flush
+ const Snapshot* snapshot = db_->GetSnapshot();
+ ASSERT_OK(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ ASSERT_OK(db_->Put(WriteOptions(), "imm_key", "val"));
+ ASSERT_OK(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"));
+ ASSERT_OK(db_->Put(WriteOptions(), "mem_key", "val"));
+ ASSERT_OK(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"));
+
+ ReadOptions read_opts;
+ read_opts.ignore_range_deletions = true;
+ for (std::string key : {"sst_key", "imm_key", "mem_key"}) {
+ std::string value;
+ ASSERT_OK(db_->Get(read_opts, key, &value));
+ }
+ db_->ReleaseSnapshot(snapshot);
+}
+
+TEST_F(DBRangeDelTest, IteratorRemovesCoveredKeys) {
+ const int kNum = 200, kRangeBegin = 50, kRangeEnd = 150, kNumPerFile = 25;
+ Options opts = CurrentOptions();
+ opts.comparator = test::Uint64Comparator();
+ opts.memtable_factory.reset(test::NewSpecialSkipListFactory(kNumPerFile));
+ DestroyAndReopen(opts);
+
+ // Write half of the keys before the tombstone and half after the tombstone.
+ // Only covered keys (i.e., within the range and older than the tombstone)
+ // should be deleted.
+ for (int i = 0; i < kNum; ++i) {
+ if (i == kNum / 2) {
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ GetNumericStr(kRangeBegin),
+ GetNumericStr(kRangeEnd)));
+ }
+ ASSERT_OK(db_->Put(WriteOptions(), GetNumericStr(i), "val"));
+ }
+ ReadOptions read_opts;
+ auto* iter = db_->NewIterator(read_opts);
+ ASSERT_OK(iter->status());
+
+ int expected = 0;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ ASSERT_EQ(GetNumericStr(expected), iter->key());
+ if (expected == kRangeBegin - 1) {
+ expected = kNum / 2;
+ } else {
+ ++expected;
+ }
+ }
+ ASSERT_EQ(kNum, expected);
+ delete iter;
+}
+
+TEST_F(DBRangeDelTest, IteratorOverUserSnapshot) {
+ const int kNum = 200, kRangeBegin = 50, kRangeEnd = 150, kNumPerFile = 25;
+ Options opts = CurrentOptions();
+ opts.comparator = test::Uint64Comparator();
+ opts.memtable_factory.reset(test::NewSpecialSkipListFactory(kNumPerFile));
+ DestroyAndReopen(opts);
+
+ const Snapshot* snapshot = nullptr;
+ // Take a snapshot before the range tombstone and verify an iterator using
+ // that snapshot sees all keys written before it, i.e., none are hidden by
+ // the later tombstone.
+ for (int i = 0; i < kNum; ++i) {
+ if (i == kNum / 2) {
+ snapshot = db_->GetSnapshot();
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ GetNumericStr(kRangeBegin),
+ GetNumericStr(kRangeEnd)));
+ }
+ ASSERT_OK(db_->Put(WriteOptions(), GetNumericStr(i), "val"));
+ }
+ ReadOptions read_opts;
+ read_opts.snapshot = snapshot;
+ auto* iter = db_->NewIterator(read_opts);
+ ASSERT_OK(iter->status());
+
+ int expected = 0;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ ASSERT_EQ(GetNumericStr(expected), iter->key());
+ ++expected;
+ }
+ ASSERT_EQ(kNum / 2, expected);
+ delete iter;
+ db_->ReleaseSnapshot(snapshot);
+}
+
+TEST_F(DBRangeDelTest, IteratorIgnoresRangeDeletions) {
+ Options opts = CurrentOptions();
+ opts.max_write_buffer_number = 4;
+ opts.min_write_buffer_number_to_merge = 3;
+ opts.memtable_factory.reset(test::NewSpecialSkipListFactory(1));
+ Reopen(opts);
+
+ ASSERT_OK(db_->Put(WriteOptions(), "sst_key", "val"));
+ // snapshot prevents key from being deleted during flush
+ const Snapshot* snapshot = db_->GetSnapshot();
+ ASSERT_OK(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ ASSERT_OK(db_->Put(WriteOptions(), "imm_key", "val"));
+ ASSERT_OK(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"));
+ ASSERT_OK(db_->Put(WriteOptions(), "mem_key", "val"));
+ ASSERT_OK(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"));
+
+ ReadOptions read_opts;
+ read_opts.ignore_range_deletions = true;
+ auto* iter = db_->NewIterator(read_opts);
+ ASSERT_OK(iter->status());
+ int i = 0;
+ std::string expected[] = {"imm_key", "mem_key", "sst_key"};
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next(), ++i) {
+ std::string key;
+ ASSERT_EQ(expected[i], iter->key());
+ }
+ ASSERT_EQ(3, i);
+ delete iter;
+ db_->ReleaseSnapshot(snapshot);
+}
+
+#ifndef ROCKSDB_UBSAN_RUN
+TEST_F(DBRangeDelTest, TailingIteratorRangeTombstoneUnsupported) {
+ ASSERT_OK(db_->Put(WriteOptions(), "key", "val"));
+ // snapshot prevents key from being deleted during flush
+ const Snapshot* snapshot = db_->GetSnapshot();
+ ASSERT_OK(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"));
+
+ // The iterations check that tailing iterators are unsupported while the
+ // range tombstone is in the memtable, then L0, then L1.
+ for (int i = 0; i < 3; ++i) {
+ ReadOptions read_opts;
+ read_opts.tailing = true;
+ auto* iter = db_->NewIterator(read_opts);
+ if (i == 2) {
+ // For L1+, iterators over files are created on-demand, so need seek
+ iter->SeekToFirst();
+ }
+ ASSERT_TRUE(iter->status().IsNotSupported());
+
+ delete iter;
+ if (i == 0) {
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ } else if (i == 1) {
+ MoveFilesToLevel(1);
+ }
+ }
+ db_->ReleaseSnapshot(snapshot);
+}
+#endif // !ROCKSDB_UBSAN_RUN
+
+TEST_F(DBRangeDelTest, SubcompactionHasEmptyDedicatedRangeDelFile) {
+ const int kNumFiles = 2, kNumKeysPerFile = 4;
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ options.disable_auto_compactions = true;
+ options.level0_file_num_compaction_trigger = kNumFiles;
+ options.max_subcompactions = 2;
+ options.num_levels = 2;
+ options.target_file_size_base = 4096;
+ Reopen(options);
+
+ // need an L1 file for subcompaction to be triggered
+ ASSERT_OK(
+ db_->Put(WriteOptions(), db_->DefaultColumnFamily(), Key(0), "val"));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ MoveFilesToLevel(1);
+
+ // put enough keys to fill up the first subcompaction, and later range-delete
+ // them so that the first subcompaction outputs no key-values. In that case
+ // it'll consider making an SST file dedicated to range deletions.
+ for (int i = 0; i < kNumKeysPerFile; ++i) {
+ ASSERT_OK(db_->Put(WriteOptions(), db_->DefaultColumnFamily(), Key(i),
+ std::string(1024, 'a')));
+ }
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(0),
+ Key(kNumKeysPerFile)));
+
+ // The above range tombstone can be dropped, so that one alone won't cause a
+ // dedicated file to be opened. Add one protected by a snapshot so that it
+ // must be considered. Make its range outside the first subcompaction's range
+ // to exercise the tricky part of the code.
+ const Snapshot* snapshot = db_->GetSnapshot();
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ Key(kNumKeysPerFile + 1),
+ Key(kNumKeysPerFile + 2)));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+
+ ASSERT_EQ(kNumFiles, NumTableFilesAtLevel(0));
+ ASSERT_EQ(1, NumTableFilesAtLevel(1));
+
+ ASSERT_OK(db_->EnableAutoCompaction({db_->DefaultColumnFamily()}));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ db_->ReleaseSnapshot(snapshot);
+}
+
+TEST_F(DBRangeDelTest, MemtableBloomFilter) {
+ // Regression test for #2743. The range delete tombstones in the memtable
+ // should still be added (and applied) even when Get() skips its point-key
+ // lookup in the memtable due to the prefix bloom filter.
+ const int kMemtableSize = 1 << 20; // 1MB
+ const int kMemtablePrefixFilterSize = 1 << 13; // 8KB
+ const int kNumKeys = 1000;
+ const int kPrefixLen = 8;
+ Options options = CurrentOptions();
+ options.memtable_prefix_bloom_size_ratio =
+ static_cast<double>(kMemtablePrefixFilterSize) / kMemtableSize;
+ options.prefix_extractor.reset(
+ ROCKSDB_NAMESPACE::NewFixedPrefixTransform(kPrefixLen));
+ options.write_buffer_size = kMemtableSize;
+ Reopen(options);
+
+ for (int i = 0; i < kNumKeys; ++i) {
+ ASSERT_OK(Put(Key(i), "val"));
+ }
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(0),
+ Key(kNumKeys)));
+ for (int i = 0; i < kNumKeys; ++i) {
+ std::string value;
+ ASSERT_TRUE(db_->Get(ReadOptions(), Key(i), &value).IsNotFound());
+ }
+}
+
+TEST_F(DBRangeDelTest, CompactionTreatsSplitInputLevelDeletionAtomically) {
+ // This test originally verified that compaction treated files containing a
+ // split range deletion in the input level as an atomic unit. I.e.,
+ // compacting any input-level file(s) containing a portion of the range
+ // deletion causes all other input-level files containing portions of that
+ // same range deletion to be included in the compaction. Range deletion
+ // tombstones are now truncated to sstable boundaries which removed the need
+ // for that behavior (which could lead to excessively large
+ // compactions).
+ const int kNumFilesPerLevel = 4, kValueBytes = 4 << 10;
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ options.level0_file_num_compaction_trigger = kNumFilesPerLevel;
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(2 /* num_entries_flush */));
+ // max file size could be 2x of target file size, so set it to half of that
+ options.target_file_size_base = kValueBytes / 2;
+ // disable dynamic_file_size, as it will cut L1 files into more files (than
+ // kNumFilesPerLevel).
+ options.level_compaction_dynamic_file_size = false;
+ options.max_compaction_bytes = 1500;
+ // i == 0: CompactFiles
+ // i == 1: CompactRange
+ // i == 2: automatic compaction
+ for (int i = 0; i < 3; ++i) {
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put(Key(0), ""));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ MoveFilesToLevel(2);
+ ASSERT_EQ(1, NumTableFilesAtLevel(2));
+
+ // snapshot protects range tombstone from dropping due to becoming obsolete.
+ const Snapshot* snapshot = db_->GetSnapshot();
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ Key(0), Key(2 * kNumFilesPerLevel)));
+
+ Random rnd(301);
+ std::string value = rnd.RandomString(kValueBytes);
+ for (int j = 0; j < kNumFilesPerLevel; ++j) {
+ // give files overlapping key-ranges to prevent trivial move
+ ASSERT_OK(Put(Key(j), value));
+ ASSERT_OK(Put(Key(2 * kNumFilesPerLevel - 1 - j), value));
+ if (j > 0) {
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_EQ(j, NumTableFilesAtLevel(0));
+ }
+ }
+ // put extra key to trigger final flush
+ ASSERT_OK(Put("", ""));
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+ ASSERT_EQ(kNumFilesPerLevel, NumTableFilesAtLevel(1));
+
+ ColumnFamilyMetaData meta;
+ db_->GetColumnFamilyMetaData(&meta);
+ if (i == 0) {
+ ASSERT_OK(db_->CompactFiles(
+ CompactionOptions(), {meta.levels[1].files[0].name}, 2 /* level */));
+ ASSERT_EQ(0, NumTableFilesAtLevel(1));
+ } else if (i == 1) {
+ auto begin_str = Key(0), end_str = Key(1);
+ Slice begin = begin_str, end = end_str;
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &begin, &end));
+ ASSERT_EQ(3, NumTableFilesAtLevel(1));
+ } else if (i == 2) {
+ ASSERT_OK(db_->SetOptions(db_->DefaultColumnFamily(),
+ {{"max_bytes_for_level_base", "10000"}}));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(1, NumTableFilesAtLevel(1));
+ }
+ ASSERT_GT(NumTableFilesAtLevel(2), 0);
+
+ db_->ReleaseSnapshot(snapshot);
+ }
+}
+
+TEST_F(DBRangeDelTest, RangeTombstoneEndKeyAsSstableUpperBound) {
+ // Test the handling of the range-tombstone end-key as the
+ // upper-bound for an sstable.
+
+ const int kNumFilesPerLevel = 2, kValueBytes = 4 << 10;
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ options.level0_file_num_compaction_trigger = kNumFilesPerLevel;
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(2 /* num_entries_flush */));
+ options.target_file_size_base = kValueBytes;
+ options.disable_auto_compactions = true;
+ // disable it for now, otherwise the L1 files are going to be cut before
+ // key 1:
+ // L1: [0] [1,4]
+ // L2: [0,0]
+ // because the grandparent file is between [0]->[1] and its size is more than
+ // 1/8 of the target size (4k).
+ options.level_compaction_dynamic_file_size = false;
+
+ DestroyAndReopen(options);
+
+ // Create an initial sstable at L2:
+ // [key000000#1,1, key000000#1,1]
+ ASSERT_OK(Put(Key(0), ""));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ MoveFilesToLevel(2);
+ ASSERT_EQ(1, NumTableFilesAtLevel(2));
+
+ // A snapshot protects the range tombstone from dropping due to
+ // becoming obsolete.
+ const Snapshot* snapshot = db_->GetSnapshot();
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(0),
+ Key(2 * kNumFilesPerLevel)));
+
+ // Create 2 additional sstables in L0. Note that the first sstable
+ // contains the range tombstone.
+ // [key000000#3,1, key000004#72057594037927935,15]
+ // [key000001#5,1, key000002#6,1]
+ Random rnd(301);
+ std::string value = rnd.RandomString(kValueBytes);
+ for (int j = 0; j < kNumFilesPerLevel; ++j) {
+ // Give files overlapping key-ranges to prevent a trivial move when we
+ // compact from L0 to L1.
+ ASSERT_OK(Put(Key(j), value));
+ ASSERT_OK(Put(Key(2 * kNumFilesPerLevel - 1 - j), value));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ ASSERT_EQ(j + 1, NumTableFilesAtLevel(0));
+ }
+ // Compact the 2 L0 sstables to L1, resulting in the following LSM. There
+ // are 2 sstables generated in L1 due to the target_file_size_base setting.
+ // L1:
+ // [key000000#3,1, key000002#72057594037927935,15]
+ // [key000002#6,1, key000004#72057594037927935,15]
+ // L2:
+ // [key000000#1,1, key000000#1,1]
+ MoveFilesToLevel(1);
+ ASSERT_EQ(2, NumTableFilesAtLevel(1));
+
+ {
+ // Compact the second sstable in L1:
+ // L1:
+ // [key000000#3,1, key000002#72057594037927935,15]
+ // L2:
+ // [key000000#1,1, key000000#1,1]
+ // [key000002#6,1, key000004#72057594037927935,15]
+ //
+ // At the same time, verify the compaction does not cause the key at the
+ // endpoint (key000002#6,1) to disappear.
+ ASSERT_EQ(value, Get(Key(2)));
+ auto begin_str = Key(3);
+ const ROCKSDB_NAMESPACE::Slice begin = begin_str;
+ ASSERT_OK(dbfull()->TEST_CompactRange(1, &begin, nullptr));
+ ASSERT_EQ(1, NumTableFilesAtLevel(1));
+ ASSERT_EQ(2, NumTableFilesAtLevel(2));
+ ASSERT_EQ(value, Get(Key(2)));
+ }
+
+ {
+ // Compact the first sstable in L1. This should be copacetic, but
+ // was previously resulting in overlapping sstables in L2 due to
+ // mishandling of the range tombstone end-key when used as the
+ // largest key for an sstable. The resulting LSM structure should
+ // be:
+ //
+ // L2:
+ // [key000000#1,1, key000001#72057594037927935,15]
+ // [key000001#5,1, key000002#72057594037927935,15]
+ // [key000002#6,1, key000004#72057594037927935,15]
+ auto begin_str = Key(0);
+ const ROCKSDB_NAMESPACE::Slice begin = begin_str;
+ ASSERT_OK(dbfull()->TEST_CompactRange(1, &begin, &begin));
+ ASSERT_EQ(0, NumTableFilesAtLevel(1));
+ ASSERT_EQ(3, NumTableFilesAtLevel(2));
+ }
+
+ db_->ReleaseSnapshot(snapshot);
+}
+
+TEST_F(DBRangeDelTest, UnorderedTombstones) {
+ // Regression test for #2752. Range delete tombstones between
+ // different snapshot stripes are not stored in order, so the first
+ // tombstone of each snapshot stripe should be checked as a candidate
+ // for the smallest key.
+ Options options = CurrentOptions();
+ DestroyAndReopen(options);
+
+ auto cf = db_->DefaultColumnFamily();
+
+ ASSERT_OK(db_->Put(WriteOptions(), cf, "a", "a"));
+ ASSERT_OK(db_->Flush(FlushOptions(), cf));
+ ASSERT_EQ(1, NumTableFilesAtLevel(0));
+ ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr));
+ ASSERT_EQ(1, NumTableFilesAtLevel(1));
+
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), cf, "b", "c"));
+ // Hold a snapshot to separate these two delete ranges.
+ auto snapshot = db_->GetSnapshot();
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), cf, "a", "b"));
+ ASSERT_OK(db_->Flush(FlushOptions(), cf));
+ db_->ReleaseSnapshot(snapshot);
+
+ std::vector<std::vector<FileMetaData>> files;
+ dbfull()->TEST_GetFilesMetaData(cf, &files);
+ ASSERT_EQ(1, files[0].size());
+ ASSERT_EQ("a", files[0][0].smallest.user_key());
+ ASSERT_EQ("c", files[0][0].largest.user_key());
+
+ std::string v;
+ auto s = db_->Get(ReadOptions(), "a", &v);
+ ASSERT_TRUE(s.IsNotFound());
+}
+
+class MockMergeOperator : public MergeOperator {
+ // Mock non-associative operator. Non-associativity is expressed by lack of
+ // implementation for any `PartialMerge*` functions.
+ public:
+ bool FullMergeV2(const MergeOperationInput& merge_in,
+ MergeOperationOutput* merge_out) const override {
+ assert(merge_out != nullptr);
+ merge_out->new_value = merge_in.operand_list.back().ToString();
+ return true;
+ }
+
+ const char* Name() const override { return "MockMergeOperator"; }
+};
+
+TEST_F(DBRangeDelTest, KeyAtOverlappingEndpointReappears) {
+ // This test uses a non-associative merge operator since that is a convenient
+ // way to get compaction to write out files with overlapping user-keys at the
+ // endpoints. Note, however, overlapping endpoints can also occur with other
+ // value types (Put, etc.), assuming the right snapshots are present.
+ const int kFileBytes = 1 << 20;
+ const int kValueBytes = 1 << 10;
+ const int kNumFiles = 4;
+
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ options.disable_auto_compactions = true;
+ options.merge_operator.reset(new MockMergeOperator());
+ options.target_file_size_base = kFileBytes;
+ Reopen(options);
+
+ // Push dummy data to L3 so that our actual test files on L0-L2
+ // will not be considered "bottommost" level, otherwise compaction
+ // may prevent us from creating overlapping user keys, since on the
+ // bottommost level MergeHelper combines all merge operands into a single
+ // value.
+ ASSERT_OK(db_->Merge(WriteOptions(), "key", "dummy"));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ MoveFilesToLevel(3);
+
+ Random rnd(301);
+ const Snapshot* snapshot = nullptr;
+ for (int i = 0; i < kNumFiles; ++i) {
+ for (int j = 0; j < kFileBytes / kValueBytes; ++j) {
+ auto value = rnd.RandomString(kValueBytes);
+ ASSERT_OK(db_->Merge(WriteOptions(), "key", value));
+ }
+ if (i == kNumFiles - 1) {
+ // Take snapshot to prevent covered merge operands from being dropped by
+ // compaction.
+ snapshot = db_->GetSnapshot();
+ // The DeleteRange is the last write so all merge operands are covered.
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ "key", "key_"));
+ }
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ }
+ ASSERT_EQ(kNumFiles, NumTableFilesAtLevel(0));
+ std::string value;
+ ASSERT_TRUE(db_->Get(ReadOptions(), "key", &value).IsNotFound());
+
+ ASSERT_OK(dbfull()->TEST_CompactRange(
+ 0 /* level */, nullptr /* begin */, nullptr /* end */,
+ nullptr /* column_family */, true /* disallow_trivial_move */));
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+ // Now we have multiple files at L1 all containing a single user key, thus
+ // guaranteeing overlap in the file endpoints.
+ ASSERT_GT(NumTableFilesAtLevel(1), 1);
+
+ // Verify no merge operands reappeared after the compaction.
+ ASSERT_TRUE(db_->Get(ReadOptions(), "key", &value).IsNotFound());
+
+ // Compact and verify again. It's worthwhile because now the files have
+ // tighter endpoints, so we can verify that doesn't mess anything up.
+ ASSERT_OK(dbfull()->TEST_CompactRange(
+ 1 /* level */, nullptr /* begin */, nullptr /* end */,
+ nullptr /* column_family */, true /* disallow_trivial_move */));
+ ASSERT_GT(NumTableFilesAtLevel(2), 1);
+ ASSERT_TRUE(db_->Get(ReadOptions(), "key", &value).IsNotFound());
+
+ db_->ReleaseSnapshot(snapshot);
+}
+
+TEST_F(DBRangeDelTest, UntruncatedTombstoneDoesNotDeleteNewerKey) {
+ // Verify a key newer than a range tombstone cannot be deleted by being
+ // compacted to the bottom level (and thus having its seqnum zeroed) before
+ // the range tombstone. This used to happen when range tombstones were
+ // untruncated on reads such that they extended past their file boundaries.
+ //
+ // Test summary:
+ //
+ // - L1 is bottommost.
+ // - A couple snapshots are strategically taken to prevent seqnums from being
+ // zeroed, range tombstone from being dropped, merge operands from being
+ // dropped, and merge operands from being combined.
+ // - Left half of files in L1 all have same user key, ensuring their file
+ // boundaries overlap. In the past this would cause range tombstones to be
+ // untruncated.
+ // - Right half of L1 files all have different keys, ensuring no overlap.
+ // - A range tombstone spans all L1 keys, so it is stored in every L1 file.
+ // - Keys in the right side of the key-range are overwritten. These are
+ // compacted down to L1 after releasing snapshots such that their seqnums
+ // will be zeroed.
+ // - A full range scan is performed. If the tombstone in the left L1 files
+ // were untruncated, it would now cover keys newer than it (but with zeroed
+ // seqnums) in the right L1 files.
+ const int kFileBytes = 1 << 20;
+ const int kValueBytes = 1 << 10;
+ const int kNumFiles = 4;
+ const int kMaxKey = kNumFiles * kFileBytes / kValueBytes;
+ const int kKeysOverwritten = 10;
+
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ options.disable_auto_compactions = true;
+ options.merge_operator.reset(new MockMergeOperator());
+ options.num_levels = 2;
+ options.target_file_size_base = kFileBytes;
+ Reopen(options);
+
+ Random rnd(301);
+ // - snapshots[0] prevents merge operands from being combined during
+ // compaction.
+ // - snapshots[1] prevents merge operands from being dropped due to the
+ // covering range tombstone.
+ const Snapshot* snapshots[] = {nullptr, nullptr};
+ for (int i = 0; i < kNumFiles; ++i) {
+ for (int j = 0; j < kFileBytes / kValueBytes; ++j) {
+ auto value = rnd.RandomString(kValueBytes);
+ std::string key;
+ if (i < kNumFiles / 2) {
+ key = Key(0);
+ } else {
+ key = Key(1 + i * kFileBytes / kValueBytes + j);
+ }
+ ASSERT_OK(db_->Merge(WriteOptions(), key, value));
+ }
+ if (i == 0) {
+ snapshots[0] = db_->GetSnapshot();
+ }
+ if (i == kNumFiles - 1) {
+ snapshots[1] = db_->GetSnapshot();
+ // The DeleteRange is the last write so all merge operands are covered.
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ Key(0), Key(kMaxKey + 1)));
+ }
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ }
+ ASSERT_EQ(kNumFiles, NumTableFilesAtLevel(0));
+
+ auto get_key_count = [this]() -> int {
+ auto* iter = db_->NewIterator(ReadOptions());
+ assert(iter->status().ok());
+ iter->SeekToFirst();
+ int keys_found = 0;
+ for (; iter->Valid(); iter->Next()) {
+ ++keys_found;
+ }
+ delete iter;
+ return keys_found;
+ };
+
+ // All keys should be covered
+ ASSERT_EQ(0, get_key_count());
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr /* begin_key */,
+ nullptr /* end_key */));
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+ // Roughly the left half of L1 files should have overlapping boundary keys,
+ // while the right half should not.
+ ASSERT_GE(NumTableFilesAtLevel(1), kNumFiles);
+
+ // Now overwrite a few keys that are in L1 files that definitely don't have
+ // overlapping boundary keys.
+ for (int i = kMaxKey; i > kMaxKey - kKeysOverwritten; --i) {
+ auto value = rnd.RandomString(kValueBytes);
+ ASSERT_OK(db_->Merge(WriteOptions(), Key(i), value));
+ }
+ ASSERT_OK(db_->Flush(FlushOptions()));
+
+ // The overwritten keys are in L0 now, so clearly aren't covered by the range
+ // tombstone in L1.
+ ASSERT_EQ(kKeysOverwritten, get_key_count());
+
+ // Release snapshots so seqnums can be zeroed when L0->L1 happens.
+ db_->ReleaseSnapshot(snapshots[0]);
+ db_->ReleaseSnapshot(snapshots[1]);
+
+ auto begin_key_storage = Key(kMaxKey - kKeysOverwritten + 1);
+ auto end_key_storage = Key(kMaxKey);
+ Slice begin_key(begin_key_storage);
+ Slice end_key(end_key_storage);
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &begin_key, &end_key));
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+ ASSERT_GE(NumTableFilesAtLevel(1), kNumFiles);
+
+ ASSERT_EQ(kKeysOverwritten, get_key_count());
+}
+
+TEST_F(DBRangeDelTest, DeletedMergeOperandReappearsIterPrev) {
+ // Exposes a bug where we were using
+ // `RangeDelPositioningMode::kBackwardTraversal` while scanning merge operands
+ // in the forward direction. Confusingly, this case happened during
+ // `DBIter::Prev`. It could cause an assertion failure or reappearing keys.
+ const int kFileBytes = 1 << 20;
+ const int kValueBytes = 1 << 10;
+ // Need multiple keys so we can get results when calling `Prev()` after
+ // `SeekToLast()`.
+ const int kNumKeys = 3;
+ const int kNumFiles = 4;
+
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ options.disable_auto_compactions = true;
+ options.merge_operator.reset(new MockMergeOperator());
+ options.target_file_size_base = kFileBytes;
+ Reopen(options);
+
+ Random rnd(301);
+ const Snapshot* snapshot = nullptr;
+ for (int i = 0; i < kNumFiles; ++i) {
+ for (int j = 0; j < kFileBytes / kValueBytes; ++j) {
+ auto value = rnd.RandomString(kValueBytes);
+ ASSERT_OK(db_->Merge(WriteOptions(), Key(j % kNumKeys), value));
+ if (i == 0 && j == kNumKeys) {
+ // Take snapshot to prevent covered merge operands from being dropped or
+ // merged by compaction.
+ snapshot = db_->GetSnapshot();
+ // Do a DeleteRange near the beginning so only the oldest merge operand
+ // for each key is covered. This ensures the sequence of events:
+ //
+ // - `DBIter::Prev()` is called
+ // - After several versions of the same user key are encountered,
+ // it decides to seek using `DBIter::FindValueForCurrentKeyUsingSeek`.
+ // - Binary searches to the newest version of the key, which is in the
+ // leftmost file containing the user key.
+ // - Scans forwards to collect all merge operands. Eventually reaches
+ // the rightmost file containing the oldest merge operand, which
+ // should be covered by the `DeleteRange`. If `RangeDelAggregator`
+ // were not properly using `kForwardTraversal` here, that operand
+ // would reappear.
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ Key(0), Key(kNumKeys + 1)));
+ }
+ }
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ }
+ ASSERT_EQ(kNumFiles, NumTableFilesAtLevel(0));
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr /* begin_key */,
+ nullptr /* end_key */));
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+ ASSERT_GT(NumTableFilesAtLevel(1), 1);
+
+ auto* iter = db_->NewIterator(ReadOptions());
+ ASSERT_OK(iter->status());
+ iter->SeekToLast();
+ int keys_found = 0;
+ for (; iter->Valid(); iter->Prev()) {
+ ++keys_found;
+ }
+ delete iter;
+ ASSERT_EQ(kNumKeys, keys_found);
+
+ db_->ReleaseSnapshot(snapshot);
+}
+
+TEST_F(DBRangeDelTest, SnapshotPreventsDroppedKeys) {
+ const int kFileBytes = 1 << 20;
+
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ options.disable_auto_compactions = true;
+ options.target_file_size_base = kFileBytes;
+ Reopen(options);
+
+ ASSERT_OK(Put(Key(0), "a"));
+ const Snapshot* snapshot = db_->GetSnapshot();
+
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(0),
+ Key(10)));
+
+ ASSERT_OK(db_->Flush(FlushOptions()));
+
+ ReadOptions read_opts;
+ read_opts.snapshot = snapshot;
+ auto* iter = db_->NewIterator(read_opts);
+ ASSERT_OK(iter->status());
+
+ iter->SeekToFirst();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(Key(0), iter->key());
+
+ iter->Next();
+ ASSERT_FALSE(iter->Valid());
+
+ delete iter;
+ db_->ReleaseSnapshot(snapshot);
+}
+
+TEST_F(DBRangeDelTest, SnapshotPreventsDroppedKeysInImmMemTables) {
+ const int kFileBytes = 1 << 20;
+
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ options.disable_auto_compactions = true;
+ options.target_file_size_base = kFileBytes;
+ Reopen(options);
+
+ // Block the flush thread so the immutable memtables stay pinned in memory.
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->LoadDependency({
+ {"SnapshotPreventsDroppedKeysInImmMemTables:AfterNewIterator",
+ "DBImpl::BGWorkFlush"},
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
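+ // The dependency above makes the background flush ("DBImpl::BGWorkFlush")
+ // wait until the "AfterNewIterator" sync point below fires, so the immutable
+ // memtable cannot be flushed away before the iterator is created.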
+
+ ASSERT_OK(Put(Key(0), "a"));
+ std::unique_ptr<const Snapshot, std::function<void(const Snapshot*)>>
+ snapshot(db_->GetSnapshot(),
+ [this](const Snapshot* s) { db_->ReleaseSnapshot(s); });
+
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(0),
+ Key(10)));
+
+ ASSERT_OK(dbfull()->TEST_SwitchMemtable());
+
+ ReadOptions read_opts;
+ read_opts.snapshot = snapshot.get();
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts));
+ ASSERT_OK(iter->status());
+
+ TEST_SYNC_POINT("SnapshotPreventsDroppedKeysInImmMemTables:AfterNewIterator");
+
+ iter->SeekToFirst();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(Key(0), iter->key());
+
+ iter->Next();
+ ASSERT_FALSE(iter->Valid());
+}
+
+TEST_F(DBRangeDelTest, RangeTombstoneWrittenToMinimalSsts) {
+ // Adapted from
+ // https://github.com/cockroachdb/cockroach/blob/de8b3ea603dd1592d9dc26443c2cc92c356fbc2f/pkg/storage/engine/rocksdb_test.go#L1267-L1398.
+ // Regression test for issue where range tombstone was written to more files
+ // than necessary when it began exactly at the begin key in the next
+ // compaction output file.
+ const int kFileBytes = 1 << 20;
+ const int kValueBytes = 4 << 10;
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ options.disable_auto_compactions = true;
+ // Leave a bit of slack in the size limits; we enforce them more strictly
+ // when manually flushing/compacting.
+ options.max_compaction_bytes = 2 * kFileBytes;
+ options.target_file_size_base = 2 * kFileBytes;
+ options.write_buffer_size = 2 * kFileBytes;
+ Reopen(options);
+
+ Random rnd(301);
+ for (char first_char : {'a', 'b', 'c'}) {
+ for (int i = 0; i < kFileBytes / kValueBytes; ++i) {
+ std::string key(1, first_char);
+ key.append(Key(i));
+ std::string value = rnd.RandomString(kValueBytes);
+ ASSERT_OK(Put(key, value));
+ }
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ MoveFilesToLevel(2);
+ }
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+ ASSERT_EQ(3, NumTableFilesAtLevel(2));
+
+ // Populate the memtable lightly while spanning the whole key-space. The
+ // `max_compaction_bytes` setting will cause the L0->L1 compaction to output
+ // multiple files to prevent a large L1->L2 compaction later.
+ ASSERT_OK(Put("a", "val"));
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ "c" + Key(1), "d"));
+ // Our compaction output file cutting logic currently only considers point
+ // keys. So, in order for the range tombstone to have a chance at landing at
+ // the start of a new file, we need a point key at the range tombstone's
+ // start.
+ // TODO(ajkr): remove this `Put` after file cutting accounts for range
+ // tombstones (#3977).
+ ASSERT_OK(Put("c" + Key(1), "value"));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+
+ // Ensure manual L0->L1 compaction cuts the outputs before the range tombstone
+ // and the range tombstone is only placed in the second SST.
+ std::string begin_key_storage("c" + Key(1));
+ Slice begin_key(begin_key_storage);
+ std::string end_key_storage("d");
+ Slice end_key(end_key_storage);
+ ASSERT_OK(dbfull()->TEST_CompactRange(
+ 0 /* level */, &begin_key /* begin */, &end_key /* end */,
+ nullptr /* column_family */, true /* disallow_trivial_move */));
+ ASSERT_EQ(2, NumTableFilesAtLevel(1));
+
+ std::vector<LiveFileMetaData> all_metadata;
+ std::vector<LiveFileMetaData> l1_metadata;
+ db_->GetLiveFilesMetaData(&all_metadata);
+ for (const auto& metadata : all_metadata) {
+ if (metadata.level == 1) {
+ l1_metadata.push_back(metadata);
+ }
+ }
+ std::sort(l1_metadata.begin(), l1_metadata.end(),
+ [&](const LiveFileMetaData& a, const LiveFileMetaData& b) {
+ return options.comparator->Compare(a.smallestkey, b.smallestkey) <
+ 0;
+ });
+ ASSERT_EQ("a", l1_metadata[0].smallestkey);
+ ASSERT_EQ("a", l1_metadata[0].largestkey);
+ ASSERT_EQ("c" + Key(1), l1_metadata[1].smallestkey);
+ ASSERT_EQ("d", l1_metadata[1].largestkey);
+
+ TablePropertiesCollection all_table_props;
+ ASSERT_OK(db_->GetPropertiesOfAllTables(&all_table_props));
+ int64_t num_range_deletions = 0;
+ for (const auto& name_and_table_props : all_table_props) {
+ const auto& name = name_and_table_props.first;
+ const auto& table_props = name_and_table_props.second;
+ // The range tombstone should only be output to the second L1 SST.
+ if (name.size() >= l1_metadata[1].name.size() &&
+ name.substr(name.size() - l1_metadata[1].name.size())
+ .compare(l1_metadata[1].name) == 0) {
+ ASSERT_EQ(1, table_props->num_range_deletions);
+ ++num_range_deletions;
+ } else {
+ ASSERT_EQ(0, table_props->num_range_deletions);
+ }
+ }
+ ASSERT_EQ(1, num_range_deletions);
+}
+
+TEST_F(DBRangeDelTest, OverlappedTombstones) {
+ const int kNumPerFile = 4, kNumFiles = 2;
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.target_file_size_base = 9 * 1024;
+ options.max_compaction_bytes = 9 * 1024;
+ DestroyAndReopen(options);
+ Random rnd(301);
+ for (int i = 0; i < kNumFiles; ++i) {
+ std::vector<std::string> values;
+ // Write 12K (4 values, each 3K)
+ for (int j = 0; j < kNumPerFile; j++) {
+ values.push_back(rnd.RandomString(3 << 10));
+ ASSERT_OK(Put(Key(i * kNumPerFile + j), values[j]));
+ }
+ }
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ ASSERT_EQ(1, NumTableFilesAtLevel(0));
+ MoveFilesToLevel(2);
+ ASSERT_EQ(2, NumTableFilesAtLevel(2));
+
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(1),
+ Key((kNumFiles)*kNumPerFile + 1)));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+
+ ASSERT_EQ(1, NumTableFilesAtLevel(0));
+
+ ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr,
+ true /* disallow_trivial_move */));
+
+ // The tombstone range is not broken up into multiple SSTs, even though this
+ // may incur a large compaction with L2.
+ ASSERT_EQ(1, NumTableFilesAtLevel(1));
+ std::vector<std::vector<FileMetaData>> files;
+ ASSERT_OK(dbfull()->TEST_CompactRange(1, nullptr, nullptr, nullptr,
+ true /* disallow_trivial_move */));
+ ASSERT_EQ(1, NumTableFilesAtLevel(2));
+ ASSERT_EQ(0, NumTableFilesAtLevel(1));
+}
+
+TEST_F(DBRangeDelTest, OverlappedKeys) {
+ const int kNumPerFile = 4, kNumFiles = 2;
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.target_file_size_base = 9 * 1024;
+ options.max_compaction_bytes = 9 * 1024;
+ DestroyAndReopen(options);
+ Random rnd(301);
+ for (int i = 0; i < kNumFiles; ++i) {
+ std::vector<std::string> values;
+ // Write 12K (4 values, each 3K)
+ for (int j = 0; j < kNumPerFile; j++) {
+ values.push_back(rnd.RandomString(3 << 10));
+ ASSERT_OK(Put(Key(i * kNumPerFile + j), values[j]));
+ }
+ }
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ ASSERT_EQ(1, NumTableFilesAtLevel(0));
+ MoveFilesToLevel(2);
+ ASSERT_EQ(2, NumTableFilesAtLevel(2));
+
+ for (int i = 1; i < kNumFiles * kNumPerFile + 1; i++) {
+ ASSERT_OK(Put(Key(i), "0x123"));
+ }
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ ASSERT_EQ(1, NumTableFilesAtLevel(0));
+
+ // The key range is broken up into three SSTs to avoid a future big
+ // compaction with the grandparent.
+ ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr,
+ true /* disallow_trivial_move */));
+ ASSERT_EQ(3, NumTableFilesAtLevel(1));
+
+ ASSERT_OK(dbfull()->TEST_CompactRange(1, nullptr, nullptr, nullptr,
+ true /* disallow_trivial_move */));
+ // L1->L2 compaction size is limited to max_compaction_bytes
+ ASSERT_EQ(3, NumTableFilesAtLevel(2));
+ ASSERT_EQ(0, NumTableFilesAtLevel(1));
+}
+
+TEST_F(DBRangeDelTest, IteratorRefresh) {
+ // Refreshing an iterator after a range tombstone is added should cause the
+ // deleted range of keys to disappear.
+ for (bool sv_changed : {false, true}) {
+ ASSERT_OK(db_->Put(WriteOptions(), "key1", "value1"));
+ ASSERT_OK(db_->Put(WriteOptions(), "key2", "value2"));
+
+ auto* iter = db_->NewIterator(ReadOptions());
+ ASSERT_OK(iter->status());
+
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ "key2", "key3"));
+
+ if (sv_changed) {
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ }
+
+ ASSERT_OK(iter->Refresh());
+ ASSERT_OK(iter->status());
+ iter->SeekToFirst();
+ ASSERT_EQ("key1", iter->key());
+ iter->Next();
+ ASSERT_FALSE(iter->Valid());
+
+ delete iter;
+ }
+}
+
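+// Helpers asserting that an iterator has exhausted its range cleanly, i.e.,
+// it is invalid with an OK status rather than invalid due to an error.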
+void VerifyIteratorReachesEnd(InternalIterator* iter) {
+ ASSERT_TRUE(!iter->Valid() && iter->status().ok());
+}
+
+void VerifyIteratorReachesEnd(Iterator* iter) {
+ ASSERT_TRUE(!iter->Valid() && iter->status().ok());
+}
+
+TEST_F(DBRangeDelTest, IteratorReseek) {
+ // Range tombstone triggers reseek (seeking to a range tombstone end key) in
+ // merging iterator. Test set up:
+ // one memtable: range tombstone [0, 1)
+ // one immutable memtable: range tombstone [1, 2)
+ // one L0 file with range tombstone [2, 3)
+ // one L1 file with range tombstone [3, 4)
+ // Seek(0) should trigger cascading reseeks at all levels below memtable.
+ // Seek(1) should trigger cascading reseeks at all levels below immutable
+ // memtable. SeekToFirst and SeekToLast trigger no reseek.
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ options.disable_auto_compactions = true;
+
+ DestroyAndReopen(options);
+ // L1
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(3),
+ Key(4)));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ MoveFilesToLevel(1);
+ ASSERT_EQ(1, NumTableFilesAtLevel(1));
+ // L0
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(2),
+ Key(3)));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ ASSERT_EQ(1, NumTableFilesAtLevel(0));
+ // Immutable memtable
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(1),
+ Key(2)));
+ ASSERT_OK(static_cast_with_check<DBImpl>(db_)->TEST_SwitchMemtable());
+ std::string value;
+ ASSERT_TRUE(dbfull()->GetProperty(db_->DefaultColumnFamily(),
+ "rocksdb.num-immutable-mem-table", &value));
+ ASSERT_EQ(1, std::stoi(value));
+ // live memtable
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(0),
+ Key(1)));
+ // this memtable is still active
+ ASSERT_TRUE(dbfull()->GetProperty(db_->DefaultColumnFamily(),
+ "rocksdb.num-immutable-mem-table", &value));
+ ASSERT_EQ(1, std::stoi(value));
+
+ auto iter = db_->NewIterator(ReadOptions());
+ get_perf_context()->Reset();
+ iter->Seek(Key(0));
+ // Reseeked immutable memtable, L0 and L1
+ ASSERT_EQ(get_perf_context()->internal_range_del_reseek_count, 3);
+ VerifyIteratorReachesEnd(iter);
+ get_perf_context()->Reset();
+ iter->SeekForPrev(Key(1));
+ // Reseeked L0 and L1
+ ASSERT_EQ(get_perf_context()->internal_range_del_reseek_count, 2);
+ VerifyIteratorReachesEnd(iter);
+ get_perf_context()->Reset();
+ iter->SeekToFirst();
+ ASSERT_EQ(get_perf_context()->internal_range_del_reseek_count, 0);
+ VerifyIteratorReachesEnd(iter);
+ iter->SeekToLast();
+ ASSERT_EQ(get_perf_context()->internal_range_del_reseek_count, 0);
+ VerifyIteratorReachesEnd(iter);
+ delete iter;
+}
+
+TEST_F(DBRangeDelTest, ReseekDuringNextAndPrev) {
+ // Range tombstone triggers reseek during Next()/Prev() in merging iterator.
+ // Test set up:
+ // memtable has: [0, 1) [2, 3)
+ // L0 has: 2
+ // L1 has: 1, 2, 3
+ // Seek(0) will reseek to 1 for L0 and L1. Seek(1) will not trigger any
+ // reseek. Then Next() determines 2 is covered by [2, 3) and tries to
+ // reseek to 3 for L0 and L1. A similar story is tested for Prev() and
+ // SeekForPrev().
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ options.disable_auto_compactions = true;
+
+ DestroyAndReopen(options);
+ // L1
+ ASSERT_OK(db_->Put(WriteOptions(), Key(1), "foo"));
+ ASSERT_OK(db_->Put(WriteOptions(), Key(2), "foo"));
+ ASSERT_OK(db_->Put(WriteOptions(), Key(3), "foo"));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ MoveFilesToLevel(1);
+ ASSERT_EQ(1, NumTableFilesAtLevel(1));
+
+ // L0
+ ASSERT_OK(db_->Put(WriteOptions(), Key(2), "foo"));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ ASSERT_EQ(1, NumTableFilesAtLevel(0));
+
+ // Memtable
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(0),
+ Key(1)));
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(2),
+ Key(3)));
+
+ auto iter = db_->NewIterator(ReadOptions());
+ auto iter_test_forward = [&] {
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().ToString(), Key(1));
+
+ get_perf_context()->Reset();
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().ToString(), Key(3));
+ // Reseeked L0 and L1
+ ASSERT_EQ(get_perf_context()->internal_range_del_reseek_count, 2);
+
+ // Next to Prev
+ get_perf_context()->Reset();
+ iter->Prev();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().ToString(), Key(1));
+ // Reseeked L0 and L1
+ ASSERT_EQ(get_perf_context()->internal_range_del_reseek_count, 2);
+
+ // Prev to Next
+ get_perf_context()->Reset();
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().ToString(), Key(3));
+ // Reseeked L0 and L1
+ ASSERT_EQ(get_perf_context()->internal_range_del_reseek_count, 2);
+
+ iter->Next();
+ VerifyIteratorReachesEnd(iter);
+ };
+
+ get_perf_context()->Reset();
+ iter->Seek(Key(0));
+ // Reseeked L0 and L1
+ ASSERT_EQ(get_perf_context()->internal_range_del_reseek_count, 2);
+ iter_test_forward();
+ get_perf_context()->Reset();
+ iter->Seek(Key(1));
+ ASSERT_EQ(get_perf_context()->internal_range_del_reseek_count, 0);
+ iter_test_forward();
+
+ get_perf_context()->Reset();
+ iter->SeekForPrev(Key(2));
+ // Reseeked L0 and L1
+ ASSERT_EQ(get_perf_context()->internal_range_del_reseek_count, 2);
+ iter_test_forward();
+ get_perf_context()->Reset();
+ iter->SeekForPrev(Key(1));
+ ASSERT_EQ(get_perf_context()->internal_range_del_reseek_count, 0);
+ iter_test_forward();
+
+ get_perf_context()->Reset();
+ iter->SeekToFirst();
+ ASSERT_EQ(get_perf_context()->internal_range_del_reseek_count, 0);
+ iter_test_forward();
+
+ iter->SeekToLast();
+ iter->Prev();
+ iter_test_forward();
+ delete iter;
+}
+
+ // Range tombstone triggers reseek in the merging iterator when covering a
+ // key from the same level. Test set up:
+ // in merging iterator. Test set up:
+ // memtable has: [0, 1)
+ // L0 has: [2, 3), 2
+ // L1 has: 1, 2, 3
+ // Seek(0) will reseek to 1 for L0 and L1.
+ // Then Next() will reseek to 3 for L1 since 2 in L0 is covered by [2, 3) in
+ // L0.
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ options.disable_auto_compactions = true;
+
+ DestroyAndReopen(options);
+ // L1
+ ASSERT_OK(db_->Put(WriteOptions(), Key(1), "foo"));
+ ASSERT_OK(db_->Put(WriteOptions(), Key(2), "foo"));
+ ASSERT_OK(db_->Put(WriteOptions(), Key(3), "foo"));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ MoveFilesToLevel(1);
+ ASSERT_EQ(1, NumTableFilesAtLevel(1));
+
+ // L0
+ ASSERT_OK(db_->Put(WriteOptions(), Key(2), "foo"));
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(2),
+ Key(3)));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ ASSERT_EQ(1, NumTableFilesAtLevel(0));
+
+ // Memtable
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(0),
+ Key(1)));
+
+ auto iter = db_->NewIterator(ReadOptions());
+ get_perf_context()->Reset();
+ iter->Seek(Key(0));
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().ToString(), Key(1));
+ ASSERT_EQ(get_perf_context()->internal_range_del_reseek_count, 2);
+
+ get_perf_context()->Reset();
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().ToString(), Key(3));
+ ASSERT_EQ(get_perf_context()->internal_range_del_reseek_count, 1);
+
+ delete iter;
+}
+
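+// SST partitioner used by TombstoneAcrossFileBoundary below: ShouldPartition()
+// requests a cut whenever the current user key is Key(5), so compaction
+// output files are split right before that key. CanDoTrivialMove() returns
+// false so the partitioner cannot be bypassed by trivial moves.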
+class TombstoneTestSstPartitioner : public SstPartitioner {
+ public:
+ const char* Name() const override { return "SingleKeySstPartitioner"; }
+
+ PartitionerResult ShouldPartition(
+ const PartitionerRequest& request) override {
+ if (cmp->Compare(*request.current_user_key, DBTestBase::Key(5)) == 0) {
+ return kRequired;
+ } else {
+ return kNotRequired;
+ }
+ }
+
+ bool CanDoTrivialMove(const Slice& /*smallest_user_key*/,
+ const Slice& /*largest_user_key*/) override {
+ return false;
+ }
+
+ const Comparator* cmp = BytewiseComparator();
+};
+
+class TombstoneTestSstPartitionerFactory : public SstPartitionerFactory {
+ public:
+ static const char* kClassName() {
+ return "TombstoneTestSstPartitionerFactory";
+ }
+ const char* Name() const override { return kClassName(); }
+
+ std::unique_ptr<SstPartitioner> CreatePartitioner(
+ const SstPartitioner::Context& /* context */) const override {
+ return std::unique_ptr<SstPartitioner>(new TombstoneTestSstPartitioner());
+ }
+};
+
+TEST_F(DBRangeDelTest, TombstoneAcrossFileBoundary) {
+ // Verify that a range tombstone across file boundary covers keys from older
+ // levels. Test set up:
+ // L1_0: 1, 3, [2, 6)
+ // L1_1: 5, 7, [2, 6) ([2, 6) is from compaction with L1_0)
+ // L2: 5
+ // Seek(1) and then Next() should move the L1 level iterator to
+ // L1_1. Check if 5 is returned after Next().
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ options.disable_auto_compactions = true;
+ options.target_file_size_base = 2 * 1024;
+ options.max_compaction_bytes = 2 * 1024;
+
+ // Make sure L1 files are split before "5"
+ auto factory = std::make_shared<TombstoneTestSstPartitionerFactory>();
+ options.sst_partitioner_factory = factory;
+
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ // L2
+ // The L2 file should be smaller than max_compaction_bytes; otherwise the
+ // L1 file would be cut before 7.
+ ASSERT_OK(db_->Put(WriteOptions(), Key(5), rnd.RandomString(1 << 9)));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ MoveFilesToLevel(2);
+ ASSERT_EQ(1, NumTableFilesAtLevel(2));
+
+ // L1_1
+ ASSERT_OK(db_->Put(WriteOptions(), Key(5), rnd.RandomString(1 << 10)));
+ ASSERT_OK(db_->Put(WriteOptions(), Key(7), rnd.RandomString(1 << 10)));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ ASSERT_EQ(1, NumTableFilesAtLevel(0));
+
+ // L1_0
+ ASSERT_OK(db_->Put(WriteOptions(), Key(1), rnd.RandomString(1 << 10)));
+ ASSERT_OK(db_->Put(WriteOptions(), Key(3), rnd.RandomString(1 << 10)));
+ // Prevent keys being compacted away
+ const Snapshot* snapshot = db_->GetSnapshot();
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(2),
+ Key(6)));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ ASSERT_EQ(2, NumTableFilesAtLevel(0));
+ MoveFilesToLevel(1);
+ ASSERT_EQ(2, NumTableFilesAtLevel(1));
+
+ auto iter = db_->NewIterator(ReadOptions());
+ get_perf_context()->Reset();
+ iter->Seek(Key(1));
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().ToString(), Key(1));
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().ToString(), Key(7));
+ // 1 reseek into L2 when key 5 in L2 is covered by [2, 6) from L1
+ ASSERT_EQ(get_perf_context()->internal_range_del_reseek_count, 1);
+
+ delete iter;
+ db_->ReleaseSnapshot(snapshot);
+}
+
+TEST_F(DBRangeDelTest, NonOverlappingTombstonAtBoundary) {
+ // Verify that a range tombstone across file boundary covers keys from older
+ // levels.
+ // Test set up:
+ // L1_0: 1, 3, [4, 7) L1_1: 6, 8, [4, 7)
+ // L2: 5
+ // Note that [4, 7) is at the end of L1_0 and does not overlap with any point
+ // key in L1_0. [4, 7) from L1_0 should cover 5 if the sentinel key works.
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ options.disable_auto_compactions = true;
+ options.target_file_size_base = 2 * 1024;
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ // L2
+ ASSERT_OK(db_->Put(WriteOptions(), Key(5), rnd.RandomString(4 << 10)));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ MoveFilesToLevel(2);
+ ASSERT_EQ(1, NumTableFilesAtLevel(2));
+
+ // L1_1
+ ASSERT_OK(db_->Put(WriteOptions(), Key(6), rnd.RandomString(4 << 10)));
+ ASSERT_OK(db_->Put(WriteOptions(), Key(8), rnd.RandomString(4 << 10)));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ ASSERT_EQ(1, NumTableFilesAtLevel(0));
+
+ // L1_0
+ ASSERT_OK(db_->Put(WriteOptions(), Key(1), rnd.RandomString(4 << 10)));
+ ASSERT_OK(db_->Put(WriteOptions(), Key(3), rnd.RandomString(4 << 10)));
+ // Prevent keys being compacted away
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(4),
+ Key(7)));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ ASSERT_EQ(2, NumTableFilesAtLevel(0));
+ MoveFilesToLevel(1);
+ ASSERT_EQ(2, NumTableFilesAtLevel(1));
+
+ auto iter = db_->NewIterator(ReadOptions());
+ iter->Seek(Key(3));
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key(), Key(3));
+ get_perf_context()->Reset();
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().ToString(), Key(8));
+ // 1 reseek into L1 since 5 from L2 is covered by [4, 7) from L1
+ ASSERT_EQ(get_perf_context()->internal_range_del_reseek_count, 1);
+ for (auto& k : {4, 5, 6}) {
+ get_perf_context()->Reset();
+ iter->Seek(Key(k));
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().ToString(), Key(8));
+ // 1 reseek into L1
+ ASSERT_EQ(get_perf_context()->internal_range_del_reseek_count, 1);
+ }
+ delete iter;
+}
+
+TEST_F(DBRangeDelTest, OlderLevelHasNewerData) {
+ // L1_0: 1, 3, [2, 7) L1_1: 5, 6 at a newer sequence number than [2, 7)
+ // Compact L1_1 to L2. Seek(3) should not skip 5 or 6.
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ options.disable_auto_compactions = true;
+ options.target_file_size_base = 3 * 1024;
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ // L1_0
+ ASSERT_OK(db_->Put(WriteOptions(), Key(1), rnd.RandomString(4 << 10)));
+ ASSERT_OK(db_->Put(WriteOptions(), Key(3), rnd.RandomString(4 << 10)));
+ const Snapshot* snapshot = db_->GetSnapshot();
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(2),
+ Key(7)));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ MoveFilesToLevel(1);
+ ASSERT_EQ(1, NumTableFilesAtLevel(1));
+
+ // L1_1
+ ASSERT_OK(db_->Put(WriteOptions(), Key(5), rnd.RandomString(4 << 10)));
+ ASSERT_OK(db_->Put(WriteOptions(), Key(6), rnd.RandomString(4 << 10)));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ ASSERT_EQ(1, NumTableFilesAtLevel(0));
+ MoveFilesToLevel(1);
+ ASSERT_EQ(2, NumTableFilesAtLevel(1));
+
+ auto key = Key(6);
+ Slice begin(key);
+ EXPECT_OK(dbfull()->TEST_CompactRange(1, &begin, nullptr));
+ ASSERT_EQ(1, NumTableFilesAtLevel(1));
+ ASSERT_EQ(1, NumTableFilesAtLevel(2));
+
+ auto iter = db_->NewIterator(ReadOptions());
+ iter->Seek(Key(3));
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().ToString(), Key(5));
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().ToString(), Key(6));
+ delete iter;
+ db_->ReleaseSnapshot(snapshot);
+}
+
+TEST_F(DBRangeDelTest, LevelBoundaryDefinedByTombstone) {
+ // L1 has: 1, 2, [4, 5)
+ // L2 has: 4
+ // Seek(3), which is past all point keys in L1; check whether the
+ // sentinel key from L1 works in this case.
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ options.disable_auto_compactions = true;
+ options.target_file_size_base = 3 * 1024;
+ DestroyAndReopen(options);
+ Random rnd(301);
+ // L2
+ ASSERT_OK(db_->Put(WriteOptions(), Key(4), "foo"));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ const Snapshot* snapshot = db_->GetSnapshot();
+ MoveFilesToLevel(2);
+ ASSERT_EQ(1, NumTableFilesAtLevel(2));
+
+ // L1_0
+ ASSERT_OK(db_->Put(WriteOptions(), Key(1), rnd.RandomString(4 << 10)));
+ ASSERT_OK(db_->Put(WriteOptions(), Key(2), rnd.RandomString(4 << 10)));
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(4),
+ Key(5)));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ MoveFilesToLevel(1);
+ ASSERT_EQ(1, NumTableFilesAtLevel(1));
+ ASSERT_EQ(1, NumTableFilesAtLevel(2));
+
+ auto iter = db_->NewIterator(ReadOptions());
+ iter->Seek(Key(3));
+ ASSERT_TRUE(!iter->Valid());
+ ASSERT_OK(iter->status());
+
+ get_perf_context()->Reset();
+ iter->SeekForPrev(Key(5));
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key(), Key(2));
+ db_->ReleaseSnapshot(snapshot);
+ delete iter;
+}
+
+TEST_F(DBRangeDelTest, TombstoneOnlyFile) {
+ // L1_0: 1, 2, L1_1: [3, 5)
+ // L2: 3
+ // Seek(2) then Next() should advance the L1 iterator into L1_1.
+ // If the sentinel works with a tombstone-only file, it should cover the key
+ // in L2. Similar story for SeekForPrev(4).
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ options.disable_auto_compactions = true;
+ options.target_file_size_base = 3 * 1024;
+
+ DestroyAndReopen(options);
+ Random rnd(301);
+ // L2
+ ASSERT_OK(db_->Put(WriteOptions(), Key(3), "foo"));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ MoveFilesToLevel(2);
+ ASSERT_EQ(1, NumTableFilesAtLevel(2));
+
+ // L1_0
+ ASSERT_OK(db_->Put(WriteOptions(), Key(1), rnd.RandomString(4 << 10)));
+ ASSERT_OK(db_->Put(WriteOptions(), Key(2), rnd.RandomString(4 << 10)));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ MoveFilesToLevel(1);
+ ASSERT_EQ(1, NumTableFilesAtLevel(1));
+ ASSERT_EQ(1, NumTableFilesAtLevel(2));
+
+ // L1_1
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(3),
+ Key(5)));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ MoveFilesToLevel(1);
+ ASSERT_EQ(2, NumTableFilesAtLevel(1));
+ ASSERT_EQ(1, NumTableFilesAtLevel(2));
+
+ auto iter = db_->NewIterator(ReadOptions());
+ iter->Seek(Key(2));
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key(), Key(2));
+ iter->Next();
+ VerifyIteratorReachesEnd(iter);
+ iter->SeekForPrev(Key(4));
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key(), Key(2));
+ iter->Next();
+ VerifyIteratorReachesEnd(iter);
+ delete iter;
+}
+
+void VerifyIteratorKey(InternalIterator* iter,
+ const std::vector<std::string>& expected_keys,
+ bool forward = true) {
+ for (auto& key : expected_keys) {
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->user_key(), key);
+ if (forward) {
+ iter->Next();
+ } else {
+ iter->Prev();
+ }
+ }
+}
+
+TEST_F(DBRangeDelTest, TombstoneOnlyLevel) {
+ // L1 [3, 5)
+ // L2 has: 3, 4
+ // Any kind of iterator seek should skip 3 and 4 in L2.
+ // L1 level iterator should produce sentinel key.
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ options.disable_auto_compactions = true;
+ options.target_file_size_base = 3 * 1024;
+
+ DestroyAndReopen(options);
+ // L2
+ ASSERT_OK(db_->Put(WriteOptions(), Key(3), "foo"));
+ ASSERT_OK(db_->Put(WriteOptions(), Key(4), "bar"));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ MoveFilesToLevel(2);
+ ASSERT_EQ(1, NumTableFilesAtLevel(2));
+
+ // L1
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(3),
+ Key(5)));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ MoveFilesToLevel(1);
+ ASSERT_EQ(1, NumTableFilesAtLevel(1));
+
+ auto iter = db_->NewIterator(ReadOptions());
+ get_perf_context()->Reset();
+ uint64_t expected_reseek = 0;
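+ // Each seek that lands on a key covered by [3, 5) triggers one reseek:
+ // Seek(i) lands on 3 or 4 in L2 for i < 5, SeekForPrev(i) lands on 4 or 3
+ // for i > 2, and SeekToFirst()/SeekToLast() always land on a covered key.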
+ for (auto i = 0; i < 7; ++i) {
+ iter->Seek(Key(i));
+ VerifyIteratorReachesEnd(iter);
+ if (i < 5) {
+ ++expected_reseek;
+ }
+ ASSERT_EQ(get_perf_context()->internal_range_del_reseek_count,
+ expected_reseek);
+ iter->SeekForPrev(Key(i));
+ VerifyIteratorReachesEnd(iter);
+ if (i > 2) {
+ ++expected_reseek;
+ }
+ ASSERT_EQ(get_perf_context()->internal_range_del_reseek_count,
+ expected_reseek);
+ iter->SeekToFirst();
+ VerifyIteratorReachesEnd(iter);
+ ASSERT_EQ(get_perf_context()->internal_range_del_reseek_count,
+ ++expected_reseek);
+ iter->SeekToLast();
+ VerifyIteratorReachesEnd(iter);
+ ASSERT_EQ(get_perf_context()->internal_range_del_reseek_count,
+ ++expected_reseek);
+ }
+ delete iter;
+
+ // Check L1 LevelIterator behavior
+ ColumnFamilyData* cfd =
+ static_cast_with_check<ColumnFamilyHandleImpl>(db_->DefaultColumnFamily())
+ ->cfd();
+ SuperVersion* sv = cfd->GetSuperVersion();
+ Arena arena;
+ ReadOptions read_options;
+ MergeIteratorBuilder merge_iter_builder(&cfd->internal_comparator(), &arena,
+ false /* prefix seek */);
+ InternalIterator* level_iter = sv->current->TEST_GetLevelIterator(
+ read_options, &merge_iter_builder, 1 /* level */, true);
+ // This is needed to make LevelIterator range tombstone aware
+ auto miter = merge_iter_builder.Finish();
+ auto k = Key(3);
+ IterKey target;
+ target.SetInternalKey(k, kMaxSequenceNumber, kValueTypeForSeek);
+ level_iter->Seek(target.GetInternalKey());
+ // sentinel key (file boundary as a fake key)
+ VerifyIteratorKey(level_iter, {Key(5)});
+ VerifyIteratorReachesEnd(level_iter);
+
+ k = Key(5);
+ target.SetInternalKey(k, 0, kValueTypeForSeekForPrev);
+ level_iter->SeekForPrev(target.GetInternalKey());
+ VerifyIteratorKey(level_iter, {Key(3)}, false);
+ VerifyIteratorReachesEnd(level_iter);
+
+ level_iter->SeekToFirst();
+ VerifyIteratorKey(level_iter, {Key(5)});
+ VerifyIteratorReachesEnd(level_iter);
+
+ level_iter->SeekToLast();
+ VerifyIteratorKey(level_iter, {Key(3)}, false);
+ VerifyIteratorReachesEnd(level_iter);
+
+ miter->~InternalIterator();
+}
+
+TEST_F(DBRangeDelTest, TombstoneOnlyWithOlderVisibleKey) {
+ // L1: [3, 5)
+ // L2: 2, 4, 5
+ // 2 and 5 should be visible
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ options.disable_auto_compactions = true;
+ options.target_file_size_base = 3 * 1024;
+
+ DestroyAndReopen(options);
+ // L2
+ ASSERT_OK(db_->Put(WriteOptions(), Key(2), "foo"));
+ ASSERT_OK(db_->Put(WriteOptions(), Key(4), "bar"));
+ ASSERT_OK(db_->Put(WriteOptions(), Key(5), "foobar"));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ MoveFilesToLevel(2);
+ ASSERT_EQ(1, NumTableFilesAtLevel(2));
+
+ // L1
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(3),
+ Key(5)));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ MoveFilesToLevel(1);
+ ASSERT_EQ(1, NumTableFilesAtLevel(1));
+
+ auto iter = db_->NewIterator(ReadOptions());
+ auto iter_test_backward = [&] {
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key(), Key(5));
+ iter->Prev();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key(), Key(2));
+ iter->Prev();
+ VerifyIteratorReachesEnd(iter);
+ };
+ auto iter_test_forward = [&] {
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key(), Key(2));
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key(), Key(5));
+ iter->Next();
+ VerifyIteratorReachesEnd(iter);
+ };
+ iter->Seek(Key(4));
+ iter_test_backward();
+ iter->SeekForPrev(Key(4));
+ iter->Next();
+ iter_test_backward();
+
+ iter->Seek(Key(4));
+ iter->Prev();
+ iter_test_forward();
+ iter->SeekForPrev(Key(4));
+ iter_test_forward();
+
+ iter->SeekToFirst();
+ iter_test_forward();
+ iter->SeekToLast();
+ iter_test_backward();
+
+ delete iter;
+}
+
+TEST_F(DBRangeDelTest, TombstoneSentinelDirectionChange) {
+ // L1: 7
+ // L2: [4, 6)
+ // L3: 4
+ // Seek(5) will have the sentinel key 6 at the top of the min heap in the
+ // merging iterator; then do a Prev() and check how the sentinel behaves.
+ // Redo the test after Put(5) into L1 so that there is a visible key in range
+ // [4, 6).
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ options.disable_auto_compactions = true;
+ options.target_file_size_base = 3 * 1024;
+
+ DestroyAndReopen(options);
+ // L3
+ ASSERT_OK(db_->Put(WriteOptions(), Key(4), "bar"));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ MoveFilesToLevel(3);
+ ASSERT_EQ(1, NumTableFilesAtLevel(3));
+ // L2
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(4),
+ Key(6)));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ MoveFilesToLevel(2);
+ ASSERT_EQ(1, NumTableFilesAtLevel(2));
+
+ // L1
+ ASSERT_OK(db_->Put(WriteOptions(), Key(7), "foobar"));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ MoveFilesToLevel(1);
+ ASSERT_EQ(1, NumTableFilesAtLevel(1));
+
+ auto iter = db_->NewIterator(ReadOptions());
+ iter->Seek(Key(5));
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key(), Key(7));
+ iter->Prev();
+ ASSERT_TRUE(!iter->Valid() && iter->status().ok());
+ delete iter;
+
+ ASSERT_OK(db_->Put(WriteOptions(), Key(5), "foobar"));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ MoveFilesToLevel(1);
+ ASSERT_EQ(2, NumTableFilesAtLevel(1));
+
+ iter = db_->NewIterator(ReadOptions());
+ iter->Seek(Key(5));
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key(), Key(5));
+ iter->Prev();
+ ASSERT_TRUE(!iter->Valid() && iter->status().ok());
+ delete iter;
+}
+
+// The right sentinel is exercised in many of the test cases above.
+TEST_F(DBRangeDelTest, LeftSentinelKeyTest) {
+ // L1_0: 0, 1 L1_1: [2, 3), 5
+ // L2: 2
+ // SeekForPrev(4) should give 1 due to sentinel key keeping [2, 3) alive.
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ options.disable_auto_compactions = true;
+ options.target_file_size_base = 3 * 1024;
+ options.max_compaction_bytes = 1024;
+
+ DestroyAndReopen(options);
+ // L2
+ ASSERT_OK(db_->Put(WriteOptions(), Key(2), "foo"));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ MoveFilesToLevel(2);
+ ASSERT_EQ(1, NumTableFilesAtLevel(2));
+
+ // L1_0
+ Random rnd(301);
+ ASSERT_OK(db_->Put(WriteOptions(), Key(0), rnd.RandomString(4 << 10)));
+ ASSERT_OK(db_->Put(WriteOptions(), Key(1), rnd.RandomString(4 << 10)));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ MoveFilesToLevel(1);
+ ASSERT_EQ(1, NumTableFilesAtLevel(1));
+
+ // L1_1
+ ASSERT_OK(db_->Put(WriteOptions(), Key(5), "bar"));
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(2),
+ Key(3)));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ MoveFilesToLevel(1);
+ ASSERT_EQ(2, NumTableFilesAtLevel(1));
+
+ auto iter = db_->NewIterator(ReadOptions());
+ iter->SeekForPrev(Key(4));
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key(), Key(1));
+ iter->Prev();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key(), Key(0));
+ iter->Prev();
+ ASSERT_TRUE(!iter->Valid());
+ ASSERT_OK(iter->status());
+ delete iter;
+}
+
+TEST_F(DBRangeDelTest, LeftSentinelKeyTestWithNewerKey) {
+ // L1_0: 1, 2 newer than L1_1, L1_1: [2, 4), 5
+ // L2: 3
+ // SeekForPrev(4) then Prev() should give 2 and then 1.
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ options.disable_auto_compactions = true;
+ options.target_file_size_base = 3 * 1024;
+ options.max_compaction_bytes = 1024;
+
+ DestroyAndReopen(options);
+ // L2
+ ASSERT_OK(db_->Put(WriteOptions(), Key(3), "foo"));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ MoveFilesToLevel(2);
+ ASSERT_EQ(1, NumTableFilesAtLevel(2));
+
+ // L1_1
+ ASSERT_OK(db_->Put(WriteOptions(), Key(5), "bar"));
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(2),
+ Key(4)));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ MoveFilesToLevel(1);
+ ASSERT_EQ(1, NumTableFilesAtLevel(1));
+
+ // L1_0
+ Random rnd(301);
+ ASSERT_OK(db_->Put(WriteOptions(), Key(1), rnd.RandomString(4 << 10)));
+ ASSERT_OK(db_->Put(WriteOptions(), Key(2), rnd.RandomString(4 << 10)));
+ // Used to verify sequence number of iterator key later.
+ auto seq = dbfull()->TEST_GetLastVisibleSequence();
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ MoveFilesToLevel(1);
+ ASSERT_EQ(2, NumTableFilesAtLevel(1));
+
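+ // Use a raw internal iterator so the sequence number and value type of the
+ // returned entry can be checked below.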
+ Arena arena;
+ InternalKeyComparator icmp(options.comparator);
+ ReadOptions read_options;
+ ScopedArenaIterator iter;
+ iter.set(
+ dbfull()->NewInternalIterator(read_options, &arena, kMaxSequenceNumber));
+
+ auto k = Key(4);
+ IterKey target;
+ target.SetInternalKey(k, 0 /* sequence_number */, kValueTypeForSeekForPrev);
+ iter->SeekForPrev(target.GetInternalKey());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->user_key(), Key(2));
+ SequenceNumber actual_seq;
+ ValueType type;
+ UnPackSequenceAndType(ExtractInternalKeyFooter(iter->key()), &actual_seq,
+ &type);
+ ASSERT_EQ(seq, actual_seq);
+ // might as well check type
+ ASSERT_EQ(type, kTypeValue);
+
+ iter->Prev();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->user_key(), Key(1));
+ iter->Prev();
+ ASSERT_TRUE(!iter->Valid());
+ ASSERT_OK(iter->status());
+}
+
+TEST_F(DBRangeDelTest, SentinelKeyCommonCaseTest) {
+ // L1 has 3 files
+ // L1_0: 1, 2 L1_1: [3, 4) 5, 6, [7, 8) L1_2: 9
+ // Check iterator operations on LevelIterator.
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ options.disable_auto_compactions = true;
+ options.target_file_size_base = 3 * 1024;
+
+ DestroyAndReopen(options);
+ Random rnd(301);
+ // L1_0
+ ASSERT_OK(db_->Put(WriteOptions(), Key(1), rnd.RandomString(4 << 10)));
+ ASSERT_OK(db_->Put(WriteOptions(), Key(2), rnd.RandomString(4 << 10)));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ MoveFilesToLevel(1);
+ ASSERT_EQ(1, NumTableFilesAtLevel(1));
+
+ // L1_1
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(3),
+ Key(4)));
+ ASSERT_OK(db_->Put(WriteOptions(), Key(5), rnd.RandomString(4 << 10)));
+ ASSERT_OK(db_->Put(WriteOptions(), Key(6), rnd.RandomString(4 << 10)));
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(7),
+ Key(8)));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ MoveFilesToLevel(1);
+ ASSERT_EQ(2, NumTableFilesAtLevel(1));
+
+ // L1_2
+ ASSERT_OK(db_->Put(WriteOptions(), Key(9), rnd.RandomString(4 << 10)));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ MoveFilesToLevel(1);
+ ASSERT_EQ(3, NumTableFilesAtLevel(1));
+
+ ColumnFamilyData* cfd =
+ static_cast_with_check<ColumnFamilyHandleImpl>(db_->DefaultColumnFamily())
+ ->cfd();
+ SuperVersion* sv = cfd->GetSuperVersion();
+ Arena arena;
+ ReadOptions read_options;
+ MergeIteratorBuilder merge_iter_builder(&cfd->internal_comparator(), &arena,
+ false /* prefix seek */);
+ InternalIterator* level_iter = sv->current->TEST_GetLevelIterator(
+ read_options, &merge_iter_builder, 1 /* level */, true);
+ // This is needed to make LevelIterator range tombstone aware
+ auto miter = merge_iter_builder.Finish();
+ auto k = Key(7);
+ IterKey target;
+ target.SetInternalKey(k, kMaxSequenceNumber, kValueTypeForSeek);
+ level_iter->Seek(target.GetInternalKey());
+ // The last Key(9) is a sentinel key.
+ VerifyIteratorKey(level_iter, {Key(8), Key(9), Key(9)});
+ ASSERT_TRUE(!level_iter->Valid() && level_iter->status().ok());
+
+ k = Key(6);
+ target.SetInternalKey(k, kMaxSequenceNumber, kValueTypeForSeek);
+ level_iter->Seek(target.GetInternalKey());
+ VerifyIteratorKey(level_iter, {Key(6), Key(8), Key(9), Key(9)});
+ ASSERT_TRUE(!level_iter->Valid() && level_iter->status().ok());
+
+ k = Key(4);
+ target.SetInternalKey(k, 0, kValueTypeForSeekForPrev);
+ level_iter->SeekForPrev(target.GetInternalKey());
+ VerifyIteratorKey(level_iter, {Key(3), Key(2), Key(1), Key(1)}, false);
+ ASSERT_TRUE(!level_iter->Valid() && level_iter->status().ok());
+
+ k = Key(5);
+ target.SetInternalKey(k, 0, kValueTypeForSeekForPrev);
+ level_iter->SeekForPrev(target.GetInternalKey());
+ VerifyIteratorKey(level_iter, {Key(5), Key(3), Key(2), Key(1), Key(1)},
+ false);
+
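+ // In a full forward scan, each file also emits its largest key once as a
+ // file-boundary sentinel: 2 for L1_0, 8 for L1_1 (the end of [7, 8)), and
+ // 9 for L1_2.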
+ level_iter->SeekToFirst();
+ VerifyIteratorKey(level_iter, {Key(1), Key(2), Key(2), Key(5), Key(6), Key(8),
+ Key(9), Key(9)});
+ ASSERT_TRUE(!level_iter->Valid() && level_iter->status().ok());
+
+ level_iter->SeekToLast();
+ VerifyIteratorKey(
+ level_iter,
+ {Key(9), Key(9), Key(6), Key(5), Key(3), Key(2), Key(1), Key(1)}, false);
+ ASSERT_TRUE(!level_iter->Valid() && level_iter->status().ok());
+
+ miter->~InternalIterator();
+}
+
+TEST_F(DBRangeDelTest, PrefixSentinelKey) {
+ // L1: ['aaaa', 'aaad'), 'bbbb'
+ // L2: 'aaac', 'aaae'
+ // Prefix extracts first 3 chars
+ // Seek('aaab') should give 'aaae' as first key.
+ // This tests a previous bug where prefix seek saw that the prefix does not
+ // exist in the SST file, set the file iterator to null in LevelIterator, and
+ // could skip to the next SST file. In this case, we should keep the file's
+ // tombstone alive.
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ options.disable_auto_compactions = true;
+ options.prefix_extractor.reset(NewFixedPrefixTransform(3));
+ BlockBasedTableOptions table_options;
+ table_options.filter_policy.reset(NewBloomFilterPolicy(10, false));
+ table_options.whole_key_filtering = false;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ DestroyAndReopen(options);
+ Random rnd(301);
+
+ // L2:
+ ASSERT_OK(db_->Put(WriteOptions(), "aaac", rnd.RandomString(10)));
+ ASSERT_OK(db_->Put(WriteOptions(), "aaae", rnd.RandomString(10)));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ MoveFilesToLevel(2);
+ ASSERT_EQ(1, NumTableFilesAtLevel(2));
+
+ // L1
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "aaaa",
+ "aaad"));
+ ASSERT_OK(db_->Put(WriteOptions(), "bbbb", rnd.RandomString(10)));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ MoveFilesToLevel(1);
+ ASSERT_EQ(1, NumTableFilesAtLevel(1));
+
+ auto iter = db_->NewIterator(ReadOptions());
+ iter->Seek("aaab");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key(), "aaae");
+ delete iter;
+}
+
+TEST_F(DBRangeDelTest, RefreshMemtableIter) {
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ DestroyAndReopen(options);
+ ASSERT_OK(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"));
+ ReadOptions ro;
+ ro.read_tier = kMemtableTier;
+ std::unique_ptr<Iterator> iter{db_->NewIterator(ro)};
+ ASSERT_OK(Flush());
+ // The first Refresh() re-initializes the iterator. A previous bug did not
+ // reset iter.memtable_range_tombstone_iter_ to nullptr, which caused the
+ // subsequent Refresh() to double free.
+ ASSERT_OK(iter->Refresh());
+ ASSERT_OK(iter->Refresh());
+}
+
+TEST_F(DBRangeDelTest, RangeTombstoneRespectIterateUpperBound) {
+ // Memtable: a, [b, bz)
+ // Do a Seek on `a` with iterate_upper_bound set to "az"; the range
+ // tombstone [b, bz) should not be processed (added to and popped from the
+ // min_heap in MergingIterator).
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put("a", "bar"));
+ ASSERT_OK(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "b", "bz"));
+
+ // I could not find a cleaner way to test this without relying on an
+ // implementation detail. Checking the value of
+ // `internal_range_del_reseek_count` did not work, since the BlockBasedTable
+ // iterator becomes !Valid() when the point key is out of bounds, and that
+ // reseek only happens when a point key is covered by some range tombstone.
+ SyncPoint::GetInstance()->SetCallBack("MergeIterator::PopDeleteRangeStart",
+ [](void*) {
+ // there should not be any range
+ // tombstone in the heap.
+ FAIL();
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ ReadOptions read_opts;
+ std::string upper_bound = "az";
+ Slice upper_bound_slice = upper_bound;
+ read_opts.iterate_upper_bound = &upper_bound_slice;
+ std::unique_ptr<Iterator> iter{db_->NewIterator(read_opts)};
+ iter->Seek("a");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key(), "a");
+ iter->Next();
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_OK(iter->status());
+}
+
+#endif // ROCKSDB_LITE
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_rate_limiter_test.cc b/src/rocksdb/db/db_rate_limiter_test.cc
new file mode 100644
index 000000000..e44cc047d
--- /dev/null
+++ b/src/rocksdb/db/db_rate_limiter_test.cc
@@ -0,0 +1,451 @@
+// Copyright (c) 2022-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include <gtest/gtest.h>
+
+#include <cstdint>
+#include <string>
+
+#include "db/db_test_util.h"
+#include "port/stack_trace.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "test_util/testharness.h"
+#include "util/file_checksum_helper.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBRateLimiterOnReadTest
+ : public DBTestBase,
+ public ::testing::WithParamInterface<std::tuple<bool, bool, bool>> {
+ public:
+ explicit DBRateLimiterOnReadTest()
+ : DBTestBase("db_rate_limiter_on_read_test", /*env_do_fsync=*/false),
+ use_direct_io_(std::get<0>(GetParam())),
+ use_block_cache_(std::get<1>(GetParam())),
+ use_readahead_(std::get<2>(GetParam())) {}
+
+ void Init() {
+ options_ = GetOptions();
+ Reopen(options_);
+ for (int i = 0; i < kNumFiles; ++i) {
+ for (int j = 0; j < kNumKeysPerFile; ++j) {
+ ASSERT_OK(Put(Key(i * kNumKeysPerFile + j), "val"));
+ }
+ ASSERT_OK(Flush());
+ }
+ MoveFilesToLevel(1);
+ }
+
+ BlockBasedTableOptions GetTableOptions() {
+ BlockBasedTableOptions table_options;
+ table_options.no_block_cache = !use_block_cache_;
+ return table_options;
+ }
+
+ ReadOptions GetReadOptions() {
+ ReadOptions read_options;
+ read_options.rate_limiter_priority = Env::IO_USER;
+ read_options.readahead_size = use_readahead_ ? kReadaheadBytes : 0;
+ return read_options;
+ }
+
+ Options GetOptions() {
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.file_checksum_gen_factory.reset(new FileChecksumGenCrc32cFactory());
+ options.rate_limiter.reset(NewGenericRateLimiter(
+ 1 << 20 /* rate_bytes_per_sec */, 100 * 1000 /* refill_period_us */,
+ 10 /* fairness */, RateLimiter::Mode::kAllIo));
+ options.table_factory.reset(NewBlockBasedTableFactory(GetTableOptions()));
+ options.use_direct_reads = use_direct_io_;
+ return options;
+ }
+
+ protected:
+ const static int kNumKeysPerFile = 1;
+ const static int kNumFiles = 3;
+ const static int kReadaheadBytes = 32 << 10; // 32KB
+
+ Options options_;
+ const bool use_direct_io_;
+ const bool use_block_cache_;
+ const bool use_readahead_;
+};
+
+std::string GetTestNameSuffix(
+ ::testing::TestParamInfo<std::tuple<bool, bool, bool>> info) {
+ std::ostringstream oss;
+ if (std::get<0>(info.param)) {
+ oss << "DirectIO";
+ } else {
+ oss << "BufferedIO";
+ }
+ if (std::get<1>(info.param)) {
+ oss << "_BlockCache";
+ } else {
+ oss << "_NoBlockCache";
+ }
+ if (std::get<2>(info.param)) {
+ oss << "_Readahead";
+ } else {
+ oss << "_NoReadahead";
+ }
+ return oss.str();
+}
+
+#ifndef ROCKSDB_LITE
+INSTANTIATE_TEST_CASE_P(DBRateLimiterOnReadTest, DBRateLimiterOnReadTest,
+ ::testing::Combine(::testing::Bool(), ::testing::Bool(),
+ ::testing::Bool()),
+ GetTestNameSuffix);
+#else // ROCKSDB_LITE
+// Cannot use direct I/O in lite mode.
+INSTANTIATE_TEST_CASE_P(DBRateLimiterOnReadTest, DBRateLimiterOnReadTest,
+ ::testing::Combine(::testing::Values(false),
+ ::testing::Bool(),
+ ::testing::Bool()),
+ GetTestNameSuffix);
+#endif // ROCKSDB_LITE
+
+TEST_P(DBRateLimiterOnReadTest, Get) {
+ if (use_direct_io_ && !IsDirectIOSupported()) {
+ return;
+ }
+ Init();
+
+ ASSERT_EQ(0, options_.rate_limiter->GetTotalRequests(Env::IO_USER));
+
+ int expected = 0;
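+ // Each Get() that misses the block cache issues exactly one rate-limited
+ // read at Env::IO_USER; with the block cache enabled, the repeated Get()
+ // below is served from cache and issues no additional read.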
+ for (int i = 0; i < kNumFiles; ++i) {
+ {
+ std::string value;
+ ASSERT_OK(db_->Get(GetReadOptions(), Key(i * kNumKeysPerFile), &value));
+ ++expected;
+ }
+ ASSERT_EQ(expected, options_.rate_limiter->GetTotalRequests(Env::IO_USER));
+
+ {
+ std::string value;
+ ASSERT_OK(db_->Get(GetReadOptions(), Key(i * kNumKeysPerFile), &value));
+ if (!use_block_cache_) {
+ ++expected;
+ }
+ }
+ ASSERT_EQ(expected, options_.rate_limiter->GetTotalRequests(Env::IO_USER));
+ }
+}
+
+TEST_P(DBRateLimiterOnReadTest, NewMultiGet) {
+ if (use_direct_io_ && !IsDirectIOSupported()) {
+ return;
+ }
+ Init();
+
+ ASSERT_EQ(0, options_.rate_limiter->GetTotalRequests(Env::IO_USER));
+
+ const int kNumKeys = kNumFiles * kNumKeysPerFile;
+ int64_t expected = 0;
+ {
+ std::vector<std::string> key_bufs;
+ key_bufs.reserve(kNumKeys);
+ std::vector<Slice> keys;
+ keys.reserve(kNumKeys);
+ for (int i = 0; i < kNumKeys; ++i) {
+ key_bufs.emplace_back(Key(i));
+ keys.emplace_back(key_bufs[i]);
+ }
+ std::vector<Status> statuses(kNumKeys);
+ std::vector<PinnableSlice> values(kNumKeys);
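+ // Each of the kNumKeys keys lives in its own file, so the batched
+ // MultiGet() is expected to issue one rate-limited read per key, all
+ // charged at Env::IO_USER.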
+ const int64_t prev_total_rl_req = options_.rate_limiter->GetTotalRequests();
+ db_->MultiGet(GetReadOptions(), dbfull()->DefaultColumnFamily(), kNumKeys,
+ keys.data(), values.data(), statuses.data());
+ const int64_t cur_total_rl_req = options_.rate_limiter->GetTotalRequests();
+ for (int i = 0; i < kNumKeys; ++i) {
+ ASSERT_TRUE(statuses[i].ok());
+ }
+ ASSERT_GT(cur_total_rl_req, prev_total_rl_req);
+ ASSERT_EQ(cur_total_rl_req - prev_total_rl_req,
+ options_.rate_limiter->GetTotalRequests(Env::IO_USER));
+ }
+ expected += kNumKeys;
+ ASSERT_EQ(expected, options_.rate_limiter->GetTotalRequests(Env::IO_USER));
+}
+
+TEST_P(DBRateLimiterOnReadTest, OldMultiGet) {
+ // The old `vector<Status>`-returning `MultiGet()` APIs use `Read()`, which
+ // supports rate limiting.
+ if (use_direct_io_ && !IsDirectIOSupported()) {
+ return;
+ }
+ Init();
+
+ ASSERT_EQ(0, options_.rate_limiter->GetTotalRequests(Env::IO_USER));
+
+ const int kNumKeys = kNumFiles * kNumKeysPerFile;
+ int expected = 0;
+ {
+ std::vector<std::string> key_bufs;
+ key_bufs.reserve(kNumKeys);
+ std::vector<Slice> keys;
+ keys.reserve(kNumKeys);
+ for (int i = 0; i < kNumKeys; ++i) {
+ key_bufs.emplace_back(Key(i));
+ keys.emplace_back(key_bufs[i]);
+ }
+ std::vector<std::string> values;
+ std::vector<Status> statuses =
+ db_->MultiGet(GetReadOptions(), keys, &values);
+ for (int i = 0; i < kNumKeys; ++i) {
+ ASSERT_OK(statuses[i]);
+ }
+ }
+ expected += kNumKeys;
+ ASSERT_EQ(expected, options_.rate_limiter->GetTotalRequests(Env::IO_USER));
+}
+
+TEST_P(DBRateLimiterOnReadTest, Iterator) {
+ if (use_direct_io_ && !IsDirectIOSupported()) {
+ return;
+ }
+ Init();
+
+ std::unique_ptr<Iterator> iter(db_->NewIterator(GetReadOptions()));
+ ASSERT_EQ(0, options_.rate_limiter->GetTotalRequests(Env::IO_USER));
+
+ int expected = 0;
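+ // Each step of the forward scan reads one new data block, so one
+ // rate-limited request is expected per iteration.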
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ ++expected;
+ ASSERT_EQ(expected, options_.rate_limiter->GetTotalRequests(Env::IO_USER));
+ }
+
+ for (iter->SeekToLast(); iter->Valid(); iter->Prev()) {
+ // When `use_block_cache_ == true`, the reverse scan will access the blocks
+ // loaded to cache during the above forward scan, in which case no further
+ // file reads are expected.
+ if (!use_block_cache_) {
+ ++expected;
+ }
+ }
+ // Reverse scan does not read evenly (one block per iteration) due to
+ // descending seqno ordering, so wait until after the loop to check total.
+ ASSERT_EQ(expected, options_.rate_limiter->GetTotalRequests(Env::IO_USER));
+}
+
+#if !defined(ROCKSDB_LITE)
+
+TEST_P(DBRateLimiterOnReadTest, VerifyChecksum) {
+ if (use_direct_io_ && !IsDirectIOSupported()) {
+ return;
+ }
+ Init();
+
+ ASSERT_EQ(0, options_.rate_limiter->GetTotalRequests(Env::IO_USER));
+
+ ASSERT_OK(db_->VerifyChecksum(GetReadOptions()));
+ // The files are tiny so there should have just been one read per file.
+ int expected = kNumFiles;
+ ASSERT_EQ(expected, options_.rate_limiter->GetTotalRequests(Env::IO_USER));
+}
+
+TEST_P(DBRateLimiterOnReadTest, VerifyFileChecksums) {
+ if (use_direct_io_ && !IsDirectIOSupported()) {
+ return;
+ }
+ Init();
+
+ ASSERT_EQ(0, options_.rate_limiter->GetTotalRequests(Env::IO_USER));
+
+ ASSERT_OK(db_->VerifyFileChecksums(GetReadOptions()));
+ // The files are tiny so there should have just been one read per file.
+ int expected = kNumFiles;
+ ASSERT_EQ(expected, options_.rate_limiter->GetTotalRequests(Env::IO_USER));
+}
+
+#endif // !defined(ROCKSDB_LITE)
+
+class DBRateLimiterOnWriteTest : public DBTestBase {
+ public:
+ explicit DBRateLimiterOnWriteTest()
+ : DBTestBase("db_rate_limiter_on_write_test", /*env_do_fsync=*/false) {}
+
+ void Init() {
+ options_ = GetOptions();
+ ASSERT_OK(TryReopenWithColumnFamilies({"default"}, options_));
+ Random rnd(301);
+ for (int i = 0; i < kNumFiles; i++) {
+ ASSERT_OK(Put(0, kStartKey, rnd.RandomString(2)));
+ ASSERT_OK(Put(0, kEndKey, rnd.RandomString(2)));
+ ASSERT_OK(Flush(0));
+ }
+ }
+
+ Options GetOptions() {
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.rate_limiter.reset(NewGenericRateLimiter(
+ 1 << 20 /* rate_bytes_per_sec */, 100 * 1000 /* refill_period_us */,
+ 10 /* fairness */, RateLimiter::Mode::kWritesOnly));
+ options.table_factory.reset(
+ NewBlockBasedTableFactory(BlockBasedTableOptions()));
+ return options;
+ }
+
+ protected:
+ inline const static int64_t kNumFiles = 3;
+ inline const static std::string kStartKey = "a";
+ inline const static std::string kEndKey = "b";
+ Options options_;
+};
+
+TEST_F(DBRateLimiterOnWriteTest, Flush) {
+ std::int64_t prev_total_request = 0;
+
+ Init();
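+ // Init() flushes kNumFiles memtables; each flush is expected to issue one
+ // rate-limited write request at Env::IO_HIGH.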
+
+ std::int64_t actual_flush_request =
+ options_.rate_limiter->GetTotalRequests(Env::IO_TOTAL) -
+ prev_total_request;
+ std::int64_t expected_flush_request = kNumFiles;
+ EXPECT_EQ(actual_flush_request, expected_flush_request);
+ EXPECT_EQ(actual_flush_request,
+ options_.rate_limiter->GetTotalRequests(Env::IO_HIGH));
+}
+
+TEST_F(DBRateLimiterOnWriteTest, Compact) {
+ Init();
+
+ // Pre-compaction:
+ // level-0 : `kNumFiles` SST files overlapping on [kStartKey, kEndKey]
+#ifndef ROCKSDB_LITE
+ std::string files_per_level_pre_compaction = std::to_string(kNumFiles);
+ ASSERT_EQ(files_per_level_pre_compaction, FilesPerLevel(0 /* cf */));
+#endif // !ROCKSDB_LITE
+
+ std::int64_t prev_total_request =
+ options_.rate_limiter->GetTotalRequests(Env::IO_TOTAL);
+ ASSERT_EQ(0, options_.rate_limiter->GetTotalRequests(Env::IO_LOW));
+
+ Compact(kStartKey, kEndKey);
+
+ std::int64_t actual_compaction_request =
+ options_.rate_limiter->GetTotalRequests(Env::IO_TOTAL) -
+ prev_total_request;
+
+ // Post-compaction:
+ // level-0 : 0 SST files
+ // level-1 : 1 SST file
+#ifndef ROCKSDB_LITE
+ std::string files_per_level_post_compaction = "0,1";
+ ASSERT_EQ(files_per_level_post_compaction, FilesPerLevel(0 /* cf */));
+#endif // !ROCKSDB_LITE
+
+ std::int64_t expected_compaction_request = 1;
+ EXPECT_EQ(actual_compaction_request, expected_compaction_request);
+ EXPECT_EQ(actual_compaction_request,
+ options_.rate_limiter->GetTotalRequests(Env::IO_LOW));
+}
+
+class DBRateLimiterOnWriteWALTest
+ : public DBRateLimiterOnWriteTest,
+ public ::testing::WithParamInterface<std::tuple<
+ bool /* WriteOptions::disableWal */,
+ bool /* Options::manual_wal_flush */,
+ Env::IOPriority /* WriteOptions::rate_limiter_priority */>> {
+ public:
+ static std::string GetTestNameSuffix(
+ ::testing::TestParamInfo<std::tuple<bool, bool, Env::IOPriority>> info) {
+ std::ostringstream oss;
+ if (std::get<0>(info.param)) {
+ oss << "DisableWAL";
+ } else {
+ oss << "EnableWAL";
+ }
+ if (std::get<1>(info.param)) {
+ oss << "_ManualWALFlush";
+ } else {
+ oss << "_AutoWALFlush";
+ }
+ if (std::get<2>(info.param) == Env::IO_USER) {
+ oss << "_RateLimitAutoWALFlush";
+ } else if (std::get<2>(info.param) == Env::IO_TOTAL) {
+ oss << "_NoRateLimitAutoWALFlush";
+ } else {
+ oss << "_RateLimitAutoWALFlushWithIncorrectPriority";
+ }
+ return oss.str();
+ }
+
+ explicit DBRateLimiterOnWriteWALTest()
+ : disable_wal_(std::get<0>(GetParam())),
+ manual_wal_flush_(std::get<1>(GetParam())),
+ rate_limiter_priority_(std::get<2>(GetParam())) {}
+
+ void Init() {
+ options_ = GetOptions();
+ options_.manual_wal_flush = manual_wal_flush_;
+ Reopen(options_);
+ }
+
+ WriteOptions GetWriteOptions() {
+ WriteOptions write_options;
+ write_options.disableWAL = disable_wal_;
+ write_options.rate_limiter_priority = rate_limiter_priority_;
+ return write_options;
+ }
+
+ protected:
+ bool disable_wal_;
+ bool manual_wal_flush_;
+ Env::IOPriority rate_limiter_priority_;
+};
+
+INSTANTIATE_TEST_CASE_P(
+ DBRateLimiterOnWriteWALTest, DBRateLimiterOnWriteWALTest,
+ ::testing::Values(std::make_tuple(false, false, Env::IO_TOTAL),
+ std::make_tuple(false, false, Env::IO_USER),
+ std::make_tuple(false, false, Env::IO_HIGH),
+ std::make_tuple(false, true, Env::IO_USER),
+ std::make_tuple(true, false, Env::IO_USER)),
+ DBRateLimiterOnWriteWALTest::GetTestNameSuffix);
+
+TEST_P(DBRateLimiterOnWriteWALTest, AutoWalFlush) {
+ Init();
+
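+ // Rate-limiting the automatic WAL flush is only supported with
+ // Env::IO_USER, the WAL enabled, and manual_wal_flush disabled; any other
+ // combination with a non-IO_TOTAL priority is rejected as InvalidArgument
+ // below.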
+ const bool no_rate_limit_auto_wal_flush =
+ (rate_limiter_priority_ == Env::IO_TOTAL);
+ const bool valid_arg = (rate_limiter_priority_ == Env::IO_USER &&
+ !disable_wal_ && !manual_wal_flush_);
+
+ std::int64_t prev_total_request =
+ options_.rate_limiter->GetTotalRequests(Env::IO_TOTAL);
+ ASSERT_EQ(0, options_.rate_limiter->GetTotalRequests(Env::IO_USER));
+
+ Status s = Put("foo", "v1", GetWriteOptions());
+
+ if (no_rate_limit_auto_wal_flush || valid_arg) {
+ EXPECT_TRUE(s.ok());
+ } else {
+ EXPECT_TRUE(s.IsInvalidArgument());
+ EXPECT_TRUE(s.ToString().find("WriteOptions::rate_limiter_priority") !=
+ std::string::npos);
+ }
+
+ std::int64_t actual_auto_wal_flush_request =
+ options_.rate_limiter->GetTotalRequests(Env::IO_TOTAL) -
+ prev_total_request;
+ std::int64_t expected_auto_wal_flush_request = valid_arg ? 1 : 0;
+
+ EXPECT_EQ(actual_auto_wal_flush_request, expected_auto_wal_flush_request);
+ EXPECT_EQ(actual_auto_wal_flush_request,
+ options_.rate_limiter->GetTotalRequests(Env::IO_USER));
+}
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_readonly_with_timestamp_test.cc b/src/rocksdb/db/db_readonly_with_timestamp_test.cc
new file mode 100644
index 000000000..3f53e7806
--- /dev/null
+++ b/src/rocksdb/db/db_readonly_with_timestamp_test.cc
@@ -0,0 +1,960 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/db_with_timestamp_test_util.h"
+#include "test_util/testutil.h"
+
+namespace ROCKSDB_NAMESPACE {
+class DBReadOnlyTestWithTimestamp : public DBBasicTestWithTimestampBase {
+ public:
+ DBReadOnlyTestWithTimestamp()
+ : DBBasicTestWithTimestampBase("db_readonly_test_with_timestamp") {}
+
+ protected:
+#ifndef ROCKSDB_LITE
+ void CheckDBOpenedAsCompactedDBWithOneLevel0File() {
+ VersionSet* const versions = dbfull()->GetVersionSet();
+ ASSERT_NE(versions, nullptr);
+
+ ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault();
+ ASSERT_NE(cfd, nullptr);
+
+ Version* const current = cfd->current();
+ ASSERT_NE(current, nullptr);
+
+ const VersionStorageInfo* const storage_info = current->storage_info();
+ ASSERT_NE(storage_info, nullptr);
+
+ // Only 1 L0 file.
+ ASSERT_EQ(1, NumTableFilesAtLevel(0));
+ // L0 is the max level.
+ ASSERT_EQ(storage_info->num_non_empty_levels(), 1);
+ }
+
+ void CheckDBOpenedAsCompactedDBWithOnlyHighestNonEmptyLevelFiles() {
+ VersionSet* const versions = dbfull()->GetVersionSet();
+ ASSERT_NE(versions, nullptr);
+
+ ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault();
+ ASSERT_NE(cfd, nullptr);
+
+ Version* const current = cfd->current();
+ ASSERT_NE(current, nullptr);
+
+ const VersionStorageInfo* const storage_info = current->storage_info();
+ ASSERT_NE(storage_info, nullptr);
+
+ // L0 has no files.
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+
+ // All levels other than L0 and the highest non-empty level have no files.
+ for (int i = 1; i < storage_info->num_non_empty_levels() - 1; ++i) {
+ ASSERT_FALSE(storage_info->LevelFilesBrief(i).num_files > 0);
+ }
+
+ // The highest non-empty level has some files.
+ int highest_non_empty_level = storage_info->num_non_empty_levels() - 1;
+ ASSERT_TRUE(
+ storage_info->LevelFilesBrief(highest_non_empty_level).num_files > 0);
+ }
+#endif // !ROCKSDB_LITE
+};
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBReadOnlyTestWithTimestamp, IteratorAndGetReadTimestampSizeMismatch) {
+ const int kNumKeysPerFile = 128;
+ const uint64_t kMaxKey = 1024;
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(kNumKeysPerFile));
+ DestroyAndReopen(options);
+ const std::string write_timestamp = Timestamp(1, 0);
+ WriteOptions write_opts;
+ for (uint64_t key = 0; key <= kMaxKey; ++key) {
+ Status s = db_->Put(write_opts, Key1(key), write_timestamp,
+ "value" + std::to_string(key));
+ ASSERT_OK(s);
+ }
+
+ // Reopen the database in read only mode to test its timestamp support.
+ Close();
+ ASSERT_OK(ReadOnlyReopen(options));
+ ReadOptions read_opts;
+ std::string different_size_read_timestamp;
+ PutFixed32(&different_size_read_timestamp, 2);
+ Slice different_size_read_ts = different_size_read_timestamp;
+ read_opts.timestamp = &different_size_read_ts;
+ {
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts));
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_TRUE(iter->status().IsInvalidArgument());
+ }
+
+ for (uint64_t key = 0; key <= kMaxKey; ++key) {
+ std::string value_from_get;
+ std::string timestamp;
+ ASSERT_TRUE(db_->Get(read_opts, Key1(key), &value_from_get, &timestamp)
+ .IsInvalidArgument());
+ }
+
+ Close();
+}
+
+TEST_F(DBReadOnlyTestWithTimestamp,
+ IteratorAndGetReadTimestampSpecifiedWithoutWriteTimestamp) {
+ const int kNumKeysPerFile = 128;
+ const uint64_t kMaxKey = 1024;
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(kNumKeysPerFile));
+ DestroyAndReopen(options);
+ WriteOptions write_opts;
+ for (uint64_t key = 0; key <= kMaxKey; ++key) {
+ Status s = db_->Put(write_opts, Key1(key), "value" + std::to_string(key));
+ ASSERT_OK(s);
+ }
+
+ // Reopen the database in read only mode to test its timestamp support.
+ Close();
+ ASSERT_OK(ReadOnlyReopen(options));
+ ReadOptions read_opts;
+ const std::string read_timestamp = Timestamp(2, 0);
+ Slice read_ts = read_timestamp;
+ read_opts.timestamp = &read_ts;
+ {
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts));
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_TRUE(iter->status().IsInvalidArgument());
+ }
+
+ for (uint64_t key = 0; key <= kMaxKey; ++key) {
+ std::string value_from_get;
+ std::string timestamp;
+ ASSERT_TRUE(db_->Get(read_opts, Key1(key), &value_from_get, &timestamp)
+ .IsInvalidArgument());
+ }
+
+ Close();
+}
+
+TEST_F(DBReadOnlyTestWithTimestamp,
+ IteratorAndGetWriteWithTimestampReadWithoutTimestamp) {
+ const int kNumKeysPerFile = 128;
+ const uint64_t kMaxKey = 1024;
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(kNumKeysPerFile));
+ DestroyAndReopen(options);
+ const std::string write_timestamp = Timestamp(1, 0);
+ WriteOptions write_opts;
+ for (uint64_t key = 0; key <= kMaxKey; ++key) {
+ Status s = db_->Put(write_opts, Key1(key), write_timestamp,
+ "value" + std::to_string(key));
+ ASSERT_OK(s);
+ }
+
+ // Reopen the database in read only mode to test its timestamp support.
+ Close();
+ ASSERT_OK(ReadOnlyReopen(options));
+ ReadOptions read_opts;
+ {
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts));
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_TRUE(iter->status().IsInvalidArgument());
+ }
+
+ for (uint64_t key = 0; key <= kMaxKey; ++key) {
+ std::string value_from_get;
+ ASSERT_TRUE(
+ db_->Get(read_opts, Key1(key), &value_from_get).IsInvalidArgument());
+ }
+
+ Close();
+}
+
+TEST_F(DBReadOnlyTestWithTimestamp, IteratorAndGet) {
+ const int kNumKeysPerFile = 128;
+ const uint64_t kMaxKey = 1024;
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(kNumKeysPerFile));
+ DestroyAndReopen(options);
+ const std::vector<uint64_t> start_keys = {1, 0};
+ const std::vector<std::string> write_timestamps = {Timestamp(1, 0),
+ Timestamp(3, 0)};
+ const std::vector<std::string> read_timestamps = {Timestamp(2, 0),
+ Timestamp(4, 0)};
+ for (size_t i = 0; i < write_timestamps.size(); ++i) {
+ WriteOptions write_opts;
+ for (uint64_t key = start_keys[i]; key <= kMaxKey; ++key) {
+ Status s = db_->Put(write_opts, Key1(key), write_timestamps[i],
+ "value" + std::to_string(i));
+ ASSERT_OK(s);
+ }
+ }
+
+ // Reopen the database in read only mode to test its timestamp support.
+ Close();
+ ASSERT_OK(ReadOnlyReopen(options));
+
+ auto get_value_and_check = [](DB* db, ReadOptions read_opts, Slice key,
+ Slice expected_value, std::string expected_ts) {
+ std::string value_from_get;
+ std::string timestamp;
+ ASSERT_OK(db->Get(read_opts, key.ToString(), &value_from_get, &timestamp));
+ ASSERT_EQ(expected_value, value_from_get);
+ ASSERT_EQ(expected_ts, timestamp);
+ };
+ for (size_t i = 0; i < read_timestamps.size(); ++i) {
+ ReadOptions read_opts;
+ Slice read_ts = read_timestamps[i];
+ read_opts.timestamp = &read_ts;
+ std::unique_ptr<Iterator> it(db_->NewIterator(read_opts));
+ int count = 0;
+ uint64_t key = 0;
+ // Forward iterate.
+ for (it->Seek(Key1(0)), key = start_keys[i]; it->Valid();
+ it->Next(), ++count, ++key) {
+ CheckIterUserEntry(it.get(), Key1(key), kTypeValue,
+ "value" + std::to_string(i), write_timestamps[i]);
+ get_value_and_check(db_, read_opts, it->key(), it->value(),
+ write_timestamps[i]);
+ }
+ size_t expected_count = kMaxKey - start_keys[i] + 1;
+ ASSERT_EQ(expected_count, count);
+
+ // Backward iterate.
+ count = 0;
+ for (it->SeekForPrev(Key1(kMaxKey)), key = kMaxKey; it->Valid();
+ it->Prev(), ++count, --key) {
+ CheckIterUserEntry(it.get(), Key1(key), kTypeValue,
+ "value" + std::to_string(i), write_timestamps[i]);
+ get_value_and_check(db_, read_opts, it->key(), it->value(),
+ write_timestamps[i]);
+ }
+ ASSERT_EQ(static_cast<size_t>(kMaxKey) - start_keys[i] + 1, count);
+
+ // SeekToFirst()/SeekToLast() with lower/upper bounds.
+ // Then iter with lower and upper bounds.
+ uint64_t l = 0;
+ uint64_t r = kMaxKey + 1;
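+ // Shrink [l, r) from both ends each round and verify that forward and
+ // backward scans respect iterate_lower_bound/iterate_upper_bound.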
+ while (l < r) {
+ std::string lb_str = Key1(l);
+ Slice lb = lb_str;
+ std::string ub_str = Key1(r);
+ Slice ub = ub_str;
+ read_opts.iterate_lower_bound = &lb;
+ read_opts.iterate_upper_bound = &ub;
+ it.reset(db_->NewIterator(read_opts));
+ for (it->SeekToFirst(), key = std::max(l, start_keys[i]), count = 0;
+ it->Valid(); it->Next(), ++key, ++count) {
+ CheckIterUserEntry(it.get(), Key1(key), kTypeValue,
+ "value" + std::to_string(i), write_timestamps[i]);
+ get_value_and_check(db_, read_opts, it->key(), it->value(),
+ write_timestamps[i]);
+ }
+ ASSERT_EQ(r - std::max(l, start_keys[i]), count);
+
+ for (it->SeekToLast(), key = std::min(r, kMaxKey + 1), count = 0;
+ it->Valid(); it->Prev(), --key, ++count) {
+ CheckIterUserEntry(it.get(), Key1(key - 1), kTypeValue,
+ "value" + std::to_string(i), write_timestamps[i]);
+ get_value_and_check(db_, read_opts, it->key(), it->value(),
+ write_timestamps[i]);
+ }
+ l += (kMaxKey / 100);
+ r -= (kMaxKey / 100);
+ }
+ }
+ Close();
+}
+
+TEST_F(DBReadOnlyTestWithTimestamp, Iterators) {
+ const int kNumKeysPerFile = 128;
+ const uint64_t kMaxKey = 1024;
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(kNumKeysPerFile));
+ DestroyAndReopen(options);
+ const std::string write_timestamp = Timestamp(1, 0);
+ const std::string read_timestamp = Timestamp(2, 0);
+ WriteOptions write_opts;
+ for (uint64_t key = 0; key <= kMaxKey; ++key) {
+ Status s = db_->Put(write_opts, Key1(key), write_timestamp,
+ "value" + std::to_string(key));
+ ASSERT_OK(s);
+ }
+
+ // Reopen the database in read only mode to test its timestamp support.
+ Close();
+ ASSERT_OK(ReadOnlyReopen(options));
+ ReadOptions read_opts;
+ Slice read_ts = read_timestamp;
+ read_opts.timestamp = &read_ts;
+ std::vector<Iterator*> iters;
+ ASSERT_OK(db_->NewIterators(read_opts, {db_->DefaultColumnFamily()}, &iters));
+ ASSERT_EQ(static_cast<uint64_t>(1), iters.size());
+
+ int count = 0;
+ uint64_t key = 0;
+ // Forward iterate.
+ for (iters[0]->Seek(Key1(0)), key = 0; iters[0]->Valid();
+ iters[0]->Next(), ++count, ++key) {
+ CheckIterUserEntry(iters[0], Key1(key), kTypeValue,
+ "value" + std::to_string(key), write_timestamp);
+ }
+
+ size_t expected_count = kMaxKey - 0 + 1;
+ ASSERT_EQ(expected_count, count);
+ delete iters[0];
+
+ Close();
+}
+
+TEST_F(DBReadOnlyTestWithTimestamp, IteratorsReadTimestampSizeMismatch) {
+ const int kNumKeysPerFile = 128;
+ const uint64_t kMaxKey = 1024;
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(kNumKeysPerFile));
+ DestroyAndReopen(options);
+ const std::string write_timestamp = Timestamp(1, 0);
+ WriteOptions write_opts;
+ for (uint64_t key = 0; key <= kMaxKey; ++key) {
+ Status s = db_->Put(write_opts, Key1(key), write_timestamp,
+ "value" + std::to_string(key));
+ ASSERT_OK(s);
+ }
+
+ // Reopen the database in read only mode to test its timestamp support.
+ Close();
+ ASSERT_OK(ReadOnlyReopen(options));
+ ReadOptions read_opts;
+ std::string different_size_read_timestamp;
+ PutFixed32(&different_size_read_timestamp, 2);
+ Slice different_size_read_ts = different_size_read_timestamp;
+ read_opts.timestamp = &different_size_read_ts;
+ {
+ std::vector<Iterator*> iters;
+ ASSERT_TRUE(
+ db_->NewIterators(read_opts, {db_->DefaultColumnFamily()}, &iters)
+ .IsInvalidArgument());
+ }
+
+ Close();
+}
+
+TEST_F(DBReadOnlyTestWithTimestamp,
+ IteratorsReadTimestampSpecifiedWithoutWriteTimestamp) {
+ const int kNumKeysPerFile = 128;
+ const uint64_t kMaxKey = 1024;
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(kNumKeysPerFile));
+ DestroyAndReopen(options);
+ WriteOptions write_opts;
+ for (uint64_t key = 0; key <= kMaxKey; ++key) {
+ Status s = db_->Put(write_opts, Key1(key), "value" + std::to_string(key));
+ ASSERT_OK(s);
+ }
+
+ // Reopen the database in read only mode to test its timestamp support.
+ Close();
+ ASSERT_OK(ReadOnlyReopen(options));
+ ReadOptions read_opts;
+ const std::string read_timestamp = Timestamp(2, 0);
+ Slice read_ts = read_timestamp;
+ read_opts.timestamp = &read_ts;
+ {
+ std::vector<Iterator*> iters;
+ ASSERT_TRUE(
+ db_->NewIterators(read_opts, {db_->DefaultColumnFamily()}, &iters)
+ .IsInvalidArgument());
+ }
+
+ Close();
+}
+
+TEST_F(DBReadOnlyTestWithTimestamp,
+ IteratorsWriteWithTimestampReadWithoutTimestamp) {
+ const int kNumKeysPerFile = 128;
+ const uint64_t kMaxKey = 1024;
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(kNumKeysPerFile));
+ DestroyAndReopen(options);
+ const std::string write_timestamp = Timestamp(1, 0);
+ WriteOptions write_opts;
+ for (uint64_t key = 0; key <= kMaxKey; ++key) {
+ Status s = db_->Put(write_opts, Key1(key), write_timestamp,
+ "value" + std::to_string(key));
+ ASSERT_OK(s);
+ }
+
+ // Reopen the database in read only mode to test its timestamp support.
+ Close();
+ ASSERT_OK(ReadOnlyReopen(options));
+ ReadOptions read_opts;
+ {
+ std::vector<Iterator*> iters;
+ ASSERT_TRUE(
+ db_->NewIterators(read_opts, {db_->DefaultColumnFamily()}, &iters)
+ .IsInvalidArgument());
+ }
+
+ Close();
+}
+
+TEST_F(DBReadOnlyTestWithTimestamp, CompactedDBGetReadTimestampSizeMismatch) {
+ const int kNumKeysPerFile = 1026;
+ const uint64_t kMaxKey = 1024;
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ options.disable_auto_compactions = true;
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(kNumKeysPerFile));
+ DestroyAndReopen(options);
+ std::string write_timestamp = Timestamp(1, 0);
+ WriteOptions write_opts;
+ for (uint64_t key = 0; key <= kMaxKey; ++key) {
+ Status s = db_->Put(write_opts, Key1(key), write_timestamp,
+ "value" + std::to_string(0));
+ ASSERT_OK(s);
+ }
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ Close();
+
+ // Reopen the database in read only mode as a Compacted DB to test its
+ // timestamp support.
+ options.max_open_files = -1;
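+ // max_open_files == -1 is required for the read-only open below to take
+ // the compacted-DB (CompactedDBImpl) code path.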
+ ASSERT_OK(ReadOnlyReopen(options));
+ CheckDBOpenedAsCompactedDBWithOneLevel0File();
+
+ ReadOptions read_opts;
+ std::string different_size_read_timestamp;
+ PutFixed32(&different_size_read_timestamp, 2);
+ Slice different_size_read_ts = different_size_read_timestamp;
+ read_opts.timestamp = &different_size_read_ts;
+ for (uint64_t key = 0; key <= kMaxKey; ++key) {
+ std::string value_from_get;
+ std::string timestamp;
+ ASSERT_TRUE(db_->Get(read_opts, Key1(key), &value_from_get, &timestamp)
+ .IsInvalidArgument());
+ }
+ Close();
+}
+
+TEST_F(DBReadOnlyTestWithTimestamp,
+ CompactedDBGetReadTimestampSpecifiedWithoutWriteTimestamp) {
+ const int kNumKeysPerFile = 1026;
+ const uint64_t kMaxKey = 1024;
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ options.disable_auto_compactions = true;
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(kNumKeysPerFile));
+ DestroyAndReopen(options);
+ WriteOptions write_opts;
+ for (uint64_t key = 0; key <= kMaxKey; ++key) {
+ Status s = db_->Put(write_opts, Key1(key), "value" + std::to_string(0));
+ ASSERT_OK(s);
+ }
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ Close();
+
+ // Reopen the database in read only mode as a Compacted DB to test its
+ // timestamp support.
+ options.max_open_files = -1;
+ ASSERT_OK(ReadOnlyReopen(options));
+ CheckDBOpenedAsCompactedDBWithOneLevel0File();
+
+ ReadOptions read_opts;
+ const std::string read_timestamp = Timestamp(2, 0);
+ Slice read_ts = read_timestamp;
+ read_opts.timestamp = &read_ts;
+ for (uint64_t key = 0; key <= kMaxKey; ++key) {
+ std::string value_from_get;
+ std::string timestamp;
+ ASSERT_TRUE(db_->Get(read_opts, Key1(key), &value_from_get, &timestamp)
+ .IsInvalidArgument());
+ }
+ Close();
+}
+
+TEST_F(DBReadOnlyTestWithTimestamp,
+ CompactedDBGetWriteWithTimestampReadWithoutTimestamp) {
+ const int kNumKeysPerFile = 1026;
+ const uint64_t kMaxKey = 1024;
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ options.disable_auto_compactions = true;
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(kNumKeysPerFile));
+ DestroyAndReopen(options);
+ std::string write_timestamp = Timestamp(1, 0);
+ WriteOptions write_opts;
+ for (uint64_t key = 0; key <= kMaxKey; ++key) {
+ Status s = db_->Put(write_opts, Key1(key), write_timestamp,
+ "value" + std::to_string(0));
+ ASSERT_OK(s);
+ }
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ Close();
+
+ // Reopen the database in read only mode as a Compacted DB to test its
+ // timestamp support.
+ options.max_open_files = -1;
+ ASSERT_OK(ReadOnlyReopen(options));
+ CheckDBOpenedAsCompactedDBWithOneLevel0File();
+
+ ReadOptions read_opts;
+ for (uint64_t key = 0; key <= kMaxKey; ++key) {
+ std::string value_from_get;
+ ASSERT_TRUE(
+ db_->Get(read_opts, Key1(key), &value_from_get).IsInvalidArgument());
+ }
+ Close();
+}
+
+TEST_F(DBReadOnlyTestWithTimestamp, CompactedDBGetWithOnlyOneL0File) {
+ const int kNumKeysPerFile = 1026 * 2;
+ const uint64_t kMaxKey = 1024;
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ options.disable_auto_compactions = true;
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(kNumKeysPerFile));
+ DestroyAndReopen(options);
+ const std::vector<uint64_t> start_keys = {1, 0};
+ const std::vector<std::string> write_timestamps = {Timestamp(1, 0),
+ Timestamp(3, 0)};
+ const std::vector<std::string> read_timestamps = {Timestamp(2, 0),
+ Timestamp(4, 0)};
+ for (size_t i = 0; i < write_timestamps.size(); ++i) {
+ WriteOptions write_opts;
+ for (uint64_t key = start_keys[i]; key <= kMaxKey; ++key) {
+ Status s = db_->Put(write_opts, Key1(key), write_timestamps[i],
+ "value" + std::to_string(i));
+ ASSERT_OK(s);
+ }
+ }
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ Close();
+
+ // Reopen the database in read only mode as a Compacted DB to test its
+ // timestamp support.
+ options.max_open_files = -1;
+ ASSERT_OK(ReadOnlyReopen(options));
+ CheckDBOpenedAsCompactedDBWithOneLevel0File();
+
+ for (size_t i = 0; i < read_timestamps.size(); ++i) {
+ ReadOptions read_opts;
+ Slice read_ts = read_timestamps[i];
+ read_opts.timestamp = &read_ts;
+ int count = 0;
+ for (uint64_t key = start_keys[i]; key <= kMaxKey; ++key, ++count) {
+ std::string value_from_get;
+ std::string timestamp;
+ ASSERT_OK(db_->Get(read_opts, Key1(key), &value_from_get, &timestamp));
+ ASSERT_EQ("value" + std::to_string(i), value_from_get);
+ ASSERT_EQ(write_timestamps[i], timestamp);
+ }
+ size_t expected_count = kMaxKey - start_keys[i] + 1;
+ ASSERT_EQ(expected_count, count);
+ }
+ Close();
+}
+
+TEST_F(DBReadOnlyTestWithTimestamp,
+ CompactedDBGetWithOnlyHighestNonEmptyLevelFiles) {
+ const int kNumKeysPerFile = 128;
+ const uint64_t kMaxKey = 1024;
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ options.disable_auto_compactions = true;
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(kNumKeysPerFile));
+ DestroyAndReopen(options);
+ const std::vector<uint64_t> start_keys = {1, 0};
+ const std::vector<std::string> write_timestamps = {Timestamp(1, 0),
+ Timestamp(3, 0)};
+ const std::vector<std::string> read_timestamps = {Timestamp(2, 0),
+ Timestamp(4, 0)};
+ for (size_t i = 0; i < write_timestamps.size(); ++i) {
+ WriteOptions write_opts;
+ for (uint64_t key = start_keys[i]; key <= kMaxKey; ++key) {
+ Status s = db_->Put(write_opts, Key1(key), write_timestamps[i],
+ "value" + std::to_string(i));
+ ASSERT_OK(s);
+ }
+ }
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ Close();
+
+ // Reopen the database in read only mode as a Compacted DB to test its
+ // timestamp support.
+ options.max_open_files = -1;
+ ASSERT_OK(ReadOnlyReopen(options));
+ CheckDBOpenedAsCompactedDBWithOnlyHighestNonEmptyLevelFiles();
+
+ for (size_t i = 0; i < read_timestamps.size(); ++i) {
+ ReadOptions read_opts;
+ Slice read_ts = read_timestamps[i];
+ read_opts.timestamp = &read_ts;
+ int count = 0;
+ for (uint64_t key = start_keys[i]; key <= kMaxKey; ++key, ++count) {
+ std::string value_from_get;
+ std::string timestamp;
+ ASSERT_OK(db_->Get(read_opts, Key1(key), &value_from_get, &timestamp));
+ ASSERT_EQ("value" + std::to_string(i), value_from_get);
+ ASSERT_EQ(write_timestamps[i], timestamp);
+ }
+ size_t expected_count = kMaxKey - start_keys[i] + 1;
+ ASSERT_EQ(expected_count, count);
+ }
+ Close();
+}
+
+TEST_F(DBReadOnlyTestWithTimestamp,
+ CompactedDBMultiGetReadTimestampSizeMismatch) {
+ const int kNumKeysPerFile = 1026;
+ const uint64_t kMaxKey = 1024;
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ options.disable_auto_compactions = true;
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(kNumKeysPerFile));
+ DestroyAndReopen(options);
+ std::string write_timestamp = Timestamp(1, 0);
+ WriteOptions write_opts;
+ for (uint64_t key = 0; key <= kMaxKey; ++key) {
+ Status s = db_->Put(write_opts, Key1(key), write_timestamp,
+ "value" + std::to_string(0));
+ ASSERT_OK(s);
+ }
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ Close();
+
+ // Reopen the database in read only mode as a Compacted DB to test its
+ // timestamp support.
+ options.max_open_files = -1;
+ ASSERT_OK(ReadOnlyReopen(options));
+ CheckDBOpenedAsCompactedDBWithOneLevel0File();
+
+ ReadOptions read_opts;
+ std::string different_size_read_timestamp;
+ PutFixed32(&different_size_read_timestamp, 2);
+ Slice different_size_read_ts = different_size_read_timestamp;
+ read_opts.timestamp = &different_size_read_ts;
+ std::vector<std::string> key_strs;
+ std::vector<Slice> keys;
+ for (uint64_t key = 0; key <= kMaxKey; ++key) {
+ key_strs.push_back(Key1(key));
+ }
+ for (const auto& key_str : key_strs) {
+ keys.emplace_back(key_str);
+ }
+ std::vector<std::string> values;
+ std::vector<std::string> timestamps;
+ std::vector<Status> status_list =
+ db_->MultiGet(read_opts, keys, &values, &timestamps);
+ for (const auto& status : status_list) {
+ ASSERT_TRUE(status.IsInvalidArgument());
+ }
+ Close();
+}
+
+TEST_F(DBReadOnlyTestWithTimestamp,
+ CompactedDBMultiGetReadTimestampSpecifiedWithoutWriteTimestamp) {
+ const int kNumKeysPerFile = 1026;
+ const uint64_t kMaxKey = 1024;
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ options.disable_auto_compactions = true;
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(kNumKeysPerFile));
+ DestroyAndReopen(options);
+ WriteOptions write_opts;
+ for (uint64_t key = 0; key <= kMaxKey; ++key) {
+ Status s = db_->Put(write_opts, Key1(key), "value" + std::to_string(0));
+ ASSERT_OK(s);
+ }
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ Close();
+
+ // Reopen the database in read only mode as a Compacted DB to test its
+ // timestamp support.
+ options.max_open_files = -1;
+ ASSERT_OK(ReadOnlyReopen(options));
+ CheckDBOpenedAsCompactedDBWithOneLevel0File();
+
+ ReadOptions read_opts;
+ std::string read_timestamp = Timestamp(2, 0);
+ Slice read_ts = read_timestamp;
+ read_opts.timestamp = &read_ts;
+ std::vector<std::string> key_strs;
+ std::vector<Slice> keys;
+ for (uint64_t key = 0; key <= kMaxKey; ++key) {
+ key_strs.push_back(Key1(key));
+ }
+ for (const auto& key_str : key_strs) {
+ keys.emplace_back(key_str);
+ }
+ std::vector<std::string> values;
+ std::vector<std::string> timestamps;
+ std::vector<Status> status_list =
+ db_->MultiGet(read_opts, keys, &values, &timestamps);
+ for (const auto& status : status_list) {
+ ASSERT_TRUE(status.IsInvalidArgument());
+ }
+ Close();
+}
+
+TEST_F(DBReadOnlyTestWithTimestamp,
+ CompactedDBMultiGetWriteWithTimestampReadWithoutTimestamp) {
+ const int kNumKeysPerFile = 1026;
+ const uint64_t kMaxKey = 1024;
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ options.disable_auto_compactions = true;
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(kNumKeysPerFile));
+ DestroyAndReopen(options);
+ std::string write_timestamp = Timestamp(1, 0);
+ WriteOptions write_opts;
+ for (uint64_t key = 0; key <= kMaxKey; ++key) {
+ Status s = db_->Put(write_opts, Key1(key), write_timestamp,
+ "value" + std::to_string(0));
+ ASSERT_OK(s);
+ }
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ Close();
+
+ // Reopen the database in read only mode as a Compacted DB to test its
+ // timestamp support.
+ options.max_open_files = -1;
+ ASSERT_OK(ReadOnlyReopen(options));
+ CheckDBOpenedAsCompactedDBWithOneLevel0File();
+
+ ReadOptions read_opts;
+ std::vector<std::string> key_strs;
+ std::vector<Slice> keys;
+ for (uint64_t key = 0; key <= kMaxKey; ++key) {
+ key_strs.push_back(Key1(key));
+ }
+ for (const auto& key_str : key_strs) {
+ keys.emplace_back(key_str);
+ }
+ std::vector<std::string> values;
+ std::vector<Status> status_list = db_->MultiGet(read_opts, keys, &values);
+ for (const auto& status : status_list) {
+ ASSERT_TRUE(status.IsInvalidArgument());
+ }
+ Close();
+}
+
+TEST_F(DBReadOnlyTestWithTimestamp, CompactedDBMultiGetWithOnlyOneL0File) {
+ const int kNumKeysPerFile = 1026 * 2;
+ const uint64_t kMaxKey = 1024;
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ options.disable_auto_compactions = true;
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(kNumKeysPerFile));
+ DestroyAndReopen(options);
+ const std::vector<uint64_t> start_keys = {1, 0};
+ const std::vector<std::string> write_timestamps = {Timestamp(1, 0),
+ Timestamp(3, 0)};
+ const std::vector<std::string> read_timestamps = {Timestamp(2, 0),
+ Timestamp(4, 0)};
+ for (size_t i = 0; i < write_timestamps.size(); ++i) {
+ WriteOptions write_opts;
+ for (uint64_t key = start_keys[i]; key <= kMaxKey; ++key) {
+ Status s = db_->Put(write_opts, Key1(key), write_timestamps[i],
+ "value" + std::to_string(i));
+ ASSERT_OK(s);
+ }
+ }
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ Close();
+
+ // Reopen the database in read only mode as a Compacted DB to test its
+ // timestamp support.
+ options.max_open_files = -1;
+ ASSERT_OK(ReadOnlyReopen(options));
+ CheckDBOpenedAsCompactedDBWithOneLevel0File();
+
+ for (size_t i = 0; i < write_timestamps.size(); ++i) {
+ ReadOptions read_opts;
+ Slice read_ts = read_timestamps[i];
+ read_opts.timestamp = &read_ts;
+ std::vector<std::string> key_strs;
+ std::vector<Slice> keys;
+ for (uint64_t key = start_keys[i]; key <= kMaxKey; ++key) {
+ key_strs.push_back(Key1(key));
+ }
+ for (const auto& key_str : key_strs) {
+ keys.emplace_back(key_str);
+ }
+ size_t batch_size = kMaxKey - start_keys[i] + 1;
+ std::vector<std::string> values;
+ std::vector<std::string> timestamps;
+ std::vector<Status> status_list =
+ db_->MultiGet(read_opts, keys, &values, &timestamps);
+ ASSERT_EQ(batch_size, values.size());
+ ASSERT_EQ(batch_size, timestamps.size());
+ for (uint64_t idx = 0; idx < values.size(); ++idx) {
+ ASSERT_EQ("value" + std::to_string(i), values[idx]);
+ ASSERT_EQ(write_timestamps[i], timestamps[idx]);
+ ASSERT_OK(status_list[idx]);
+ }
+ }
+
+ Close();
+}
+
+TEST_F(DBReadOnlyTestWithTimestamp,
+ CompactedDBMultiGetWithOnlyHighestNonEmptyLevelFiles) {
+ const int kNumKeysPerFile = 128;
+ const uint64_t kMaxKey = 1024;
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ options.disable_auto_compactions = true;
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(kNumKeysPerFile));
+ DestroyAndReopen(options);
+ const std::vector<uint64_t> start_keys = {1, 0};
+ const std::vector<std::string> write_timestamps = {Timestamp(1, 0),
+ Timestamp(3, 0)};
+ const std::vector<std::string> read_timestamps = {Timestamp(2, 0),
+ Timestamp(4, 0)};
+ for (size_t i = 0; i < write_timestamps.size(); ++i) {
+ WriteOptions write_opts;
+ for (uint64_t key = start_keys[i]; key <= kMaxKey; ++key) {
+ Status s = db_->Put(write_opts, Key1(key), write_timestamps[i],
+ "value" + std::to_string(i));
+ ASSERT_OK(s);
+ }
+ }
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ Close();
+
+ // Reopen the database in read only mode as a Compacted DB to test its
+ // timestamp support.
+ options.max_open_files = -1;
+ ASSERT_OK(ReadOnlyReopen(options));
+ CheckDBOpenedAsCompactedDBWithOnlyHighestNonEmptyLevelFiles();
+
+ for (size_t i = 0; i < write_timestamps.size(); ++i) {
+ ReadOptions read_opts;
+ Slice read_ts = read_timestamps[i];
+ read_opts.timestamp = &read_ts;
+ std::vector<std::string> key_strs;
+ std::vector<Slice> keys;
+ for (uint64_t key = start_keys[i]; key <= kMaxKey; ++key) {
+ key_strs.push_back(Key1(key));
+ }
+ for (const auto& key_str : key_strs) {
+ keys.emplace_back(key_str);
+ }
+ size_t batch_size = kMaxKey - start_keys[i] + 1;
+ std::vector<std::string> values;
+ std::vector<std::string> timestamps;
+ std::vector<Status> status_list =
+ db_->MultiGet(read_opts, keys, &values, &timestamps);
+ ASSERT_EQ(batch_size, values.size());
+ ASSERT_EQ(batch_size, timestamps.size());
+ for (uint64_t idx = 0; idx < values.size(); ++idx) {
+ ASSERT_EQ("value" + std::to_string(i), values[idx]);
+ ASSERT_EQ(write_timestamps[i], timestamps[idx]);
+ ASSERT_OK(status_list[idx]);
+ }
+ }
+
+ Close();
+}
+#endif // !ROCKSDB_LITE
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ RegisterCustomObjects(argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_secondary_test.cc b/src/rocksdb/db/db_secondary_test.cc
new file mode 100644
index 000000000..20d7534e0
--- /dev/null
+++ b/src/rocksdb/db/db_secondary_test.cc
@@ -0,0 +1,1693 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/db_impl/db_impl_secondary.h"
+#include "db/db_test_util.h"
+#include "db/db_with_timestamp_test_util.h"
+#include "port/stack_trace.h"
+#include "rocksdb/utilities/transaction_db.h"
+#include "test_util/sync_point.h"
+#include "test_util/testutil.h"
+#include "utilities/fault_injection_env.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#ifndef ROCKSDB_LITE
+class DBSecondaryTestBase : public DBBasicTestWithTimestampBase {
+ public:
+ explicit DBSecondaryTestBase(const std::string& dbname)
+ : DBBasicTestWithTimestampBase(dbname),
+ secondary_path_(),
+ handles_secondary_(),
+ db_secondary_(nullptr) {
+ secondary_path_ =
+ test::PerThreadDBPath(env_, "/db_secondary_test_secondary");
+ }
+
+ ~DBSecondaryTestBase() override {
+ CloseSecondary();
+ if (getenv("KEEP_DB") != nullptr) {
+ fprintf(stdout, "Secondary DB is still at %s\n", secondary_path_.c_str());
+ } else {
+ Options options;
+ options.env = env_;
+ EXPECT_OK(DestroyDB(secondary_path_, options));
+ }
+ }
+
+ protected:
+ Status ReopenAsSecondary(const Options& options) {
+ return DB::OpenAsSecondary(options, dbname_, secondary_path_, &db_);
+ }
+
+ void OpenSecondary(const Options& options);
+
+ Status TryOpenSecondary(const Options& options);
+
+ void OpenSecondaryWithColumnFamilies(
+ const std::vector<std::string>& column_families, const Options& options);
+
+ void CloseSecondary() {
+ for (auto h : handles_secondary_) {
+ ASSERT_OK(db_secondary_->DestroyColumnFamilyHandle(h));
+ }
+ handles_secondary_.clear();
+ delete db_secondary_;
+ db_secondary_ = nullptr;
+ }
+
+ DBImplSecondary* db_secondary_full() {
+ return static_cast<DBImplSecondary*>(db_secondary_);
+ }
+
+ void CheckFileTypeCounts(const std::string& dir, int expected_log,
+ int expected_sst, int expected_manifest) const;
+
+ std::string secondary_path_;
+ std::vector<ColumnFamilyHandle*> handles_secondary_;
+ DB* db_secondary_;
+};
+
+void DBSecondaryTestBase::OpenSecondary(const Options& options) {
+ ASSERT_OK(TryOpenSecondary(options));
+}
+
+Status DBSecondaryTestBase::TryOpenSecondary(const Options& options) {
+ Status s =
+ DB::OpenAsSecondary(options, dbname_, secondary_path_, &db_secondary_);
+ return s;
+}
+
+void DBSecondaryTestBase::OpenSecondaryWithColumnFamilies(
+ const std::vector<std::string>& column_families, const Options& options) {
+ std::vector<ColumnFamilyDescriptor> cf_descs;
+ cf_descs.emplace_back(kDefaultColumnFamilyName, options);
+ for (const auto& cf_name : column_families) {
+ cf_descs.emplace_back(cf_name, options);
+ }
+ Status s = DB::OpenAsSecondary(options, dbname_, secondary_path_, cf_descs,
+ &handles_secondary_, &db_secondary_);
+ ASSERT_OK(s);
+}
+
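+// Counts the WAL, SST, and MANIFEST files under `dir` by parsing their file
+// names and asserts that the counts match the expected values.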
+void DBSecondaryTestBase::CheckFileTypeCounts(const std::string& dir,
+ int expected_log,
+ int expected_sst,
+ int expected_manifest) const {
+ std::vector<std::string> filenames;
+ ASSERT_OK(env_->GetChildren(dir, &filenames));
+
+ int log_cnt = 0, sst_cnt = 0, manifest_cnt = 0;
+ for (auto file : filenames) {
+ uint64_t number;
+ FileType type;
+ if (ParseFileName(file, &number, &type)) {
+ log_cnt += (type == kWalFile);
+ sst_cnt += (type == kTableFile);
+ manifest_cnt += (type == kDescriptorFile);
+ }
+ }
+ ASSERT_EQ(expected_log, log_cnt);
+ ASSERT_EQ(expected_sst, sst_cnt);
+ ASSERT_EQ(expected_manifest, manifest_cnt);
+}
+
+class DBSecondaryTest : public DBSecondaryTestBase {
+ public:
+ explicit DBSecondaryTest() : DBSecondaryTestBase("db_secondary_test") {}
+};
+
+TEST_F(DBSecondaryTest, FailOpenIfLoggerCreationFail) {
+ Options options = GetDefaultOptions();
+ options.create_if_missing = true;
+ Reopen(options);
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->SetCallBack(
+ "rocksdb::CreateLoggerFromOptions:AfterGetPath", [&](void* arg) {
+ auto* s = reinterpret_cast<Status*>(arg);
+ assert(s);
+ *s = Status::IOError("Injected");
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ options.max_open_files = -1;
+ Status s = TryOpenSecondary(options);
+ ASSERT_EQ(nullptr, options.info_log);
+ ASSERT_TRUE(s.IsIOError());
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_F(DBSecondaryTest, NonExistingDb) {
+ Destroy(last_options_);
+
+ Options options = GetDefaultOptions();
+ options.env = env_;
+ options.max_open_files = -1;
+ const std::string dbname = "/doesnt/exist";
+ Status s =
+ DB::OpenAsSecondary(options, dbname, secondary_path_, &db_secondary_);
+ ASSERT_TRUE(s.IsIOError());
+}
+
+TEST_F(DBSecondaryTest, ReopenAsSecondary) {
+ Options options;
+ options.env = env_;
+ Reopen(options);
+ ASSERT_OK(Put("foo", "foo_value"));
+ ASSERT_OK(Put("bar", "bar_value"));
+ ASSERT_OK(dbfull()->Flush(FlushOptions()));
+ Close();
+
+ ASSERT_OK(ReopenAsSecondary(options));
+ ASSERT_EQ("foo_value", Get("foo"));
+ ASSERT_EQ("bar_value", Get("bar"));
+ ReadOptions ropts;
+ ropts.verify_checksums = true;
+ auto db1 = static_cast<DBImplSecondary*>(db_);
+ ASSERT_NE(nullptr, db1);
+ Iterator* iter = db1->NewIterator(ropts);
+ ASSERT_NE(nullptr, iter);
+ size_t count = 0;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ if (0 == count) {
+ ASSERT_EQ("bar", iter->key().ToString());
+ ASSERT_EQ("bar_value", iter->value().ToString());
+ } else if (1 == count) {
+ ASSERT_EQ("foo", iter->key().ToString());
+ ASSERT_EQ("foo_value", iter->value().ToString());
+ }
+ ++count;
+ }
+ delete iter;
+ ASSERT_EQ(2, count);
+}
+
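+// The next several tests exercise TEST_CompactWithoutInstallation: the
+// secondary runs a compaction described by a CompactionServiceInput and
+// reports the output files through a CompactionServiceResult without
+// installing them into its own version, roughly mirroring a remote
+// compaction worker.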
+TEST_F(DBSecondaryTest, SimpleInternalCompaction) {
+ Options options;
+ options.env = env_;
+ Reopen(options);
+ for (int i = 0; i < 3; ++i) {
+ ASSERT_OK(Put("foo", "foo_value" + std::to_string(i)));
+ ASSERT_OK(Put("bar", "bar_value" + std::to_string(i)));
+ ASSERT_OK(Flush());
+ }
+ CompactionServiceInput input;
+
+ ColumnFamilyMetaData meta;
+ db_->GetColumnFamilyMetaData(&meta);
+ for (auto& file : meta.levels[0].files) {
+ ASSERT_EQ(0, meta.levels[0].level);
+ input.input_files.push_back(file.name);
+ }
+ ASSERT_EQ(input.input_files.size(), 3);
+
+ input.output_level = 1;
+ ASSERT_OK(db_->GetDbIdentity(input.db_id));
+ Close();
+
+ options.max_open_files = -1;
+ OpenSecondary(options);
+ auto cfh = db_secondary_->DefaultColumnFamily();
+
+ CompactionServiceResult result;
+ ASSERT_OK(db_secondary_full()->TEST_CompactWithoutInstallation(
+ OpenAndCompactOptions(), cfh, input, &result));
+
+ ASSERT_EQ(result.output_files.size(), 1);
+ InternalKey smallest, largest;
+ smallest.DecodeFrom(result.output_files[0].smallest_internal_key);
+ largest.DecodeFrom(result.output_files[0].largest_internal_key);
+ ASSERT_EQ(smallest.user_key().ToString(), "bar");
+ ASSERT_EQ(largest.user_key().ToString(), "foo");
+ ASSERT_EQ(result.output_level, 1);
+ ASSERT_EQ(result.output_path, this->secondary_path_);
+ ASSERT_EQ(result.num_output_records, 2);
+ ASSERT_GT(result.bytes_written, 0);
+ ASSERT_OK(result.status);
+}
+
+TEST_F(DBSecondaryTest, InternalCompactionMultiLevels) {
+ Options options;
+ options.env = env_;
+ options.disable_auto_compactions = true;
+ Reopen(options);
+ const int kRangeL2 = 10;
+ const int kRangeL1 = 30;
+ for (int i = 0; i < 10; i++) {
+ ASSERT_OK(Put(Key(i * kRangeL2), "value" + std::to_string(i)));
+ ASSERT_OK(Put(Key((i + 1) * kRangeL2 - 1), "value" + std::to_string(i)));
+ ASSERT_OK(Flush());
+ }
+ MoveFilesToLevel(2);
+ for (int i = 0; i < 5; i++) {
+ ASSERT_OK(Put(Key(i * kRangeL1), "value" + std::to_string(i)));
+ ASSERT_OK(Put(Key((i + 1) * kRangeL1 - 1), "value" + std::to_string(i)));
+ ASSERT_OK(Flush());
+ }
+ MoveFilesToLevel(1);
+ for (int i = 0; i < 4; i++) {
+ ASSERT_OK(Put(Key(i * 30), "value" + std::to_string(i)));
+ ASSERT_OK(Put(Key(i * 30 + 50), "value" + std::to_string(i)));
+ ASSERT_OK(Flush());
+ }
+
+ ColumnFamilyMetaData meta;
+ db_->GetColumnFamilyMetaData(&meta);
+
+  // Pick 2 files on level 0 for compaction; they overlap 3 files on L1.
+ CompactionServiceInput input1;
+ input1.input_files.push_back(meta.levels[0].files[2].name);
+ input1.input_files.push_back(meta.levels[0].files[3].name);
+ input1.input_files.push_back(meta.levels[1].files[0].name);
+ input1.input_files.push_back(meta.levels[1].files[1].name);
+ input1.input_files.push_back(meta.levels[1].files[2].name);
+
+ input1.output_level = 1;
+ ASSERT_OK(db_->GetDbIdentity(input1.db_id));
+
+ options.max_open_files = -1;
+ Close();
+
+ OpenSecondary(options);
+ auto cfh = db_secondary_->DefaultColumnFamily();
+ CompactionServiceResult result;
+ ASSERT_OK(db_secondary_full()->TEST_CompactWithoutInstallation(
+ OpenAndCompactOptions(), cfh, input1, &result));
+ ASSERT_OK(result.status);
+
+  // Pick 2 files on level 1 for compaction; they overlap 6 files on L2.
+ CompactionServiceInput input2;
+ input2.input_files.push_back(meta.levels[1].files[1].name);
+ input2.input_files.push_back(meta.levels[1].files[2].name);
+ for (int i = 3; i < 9; i++) {
+ input2.input_files.push_back(meta.levels[2].files[i].name);
+ }
+
+ input2.output_level = 2;
+ input2.db_id = input1.db_id;
+ ASSERT_OK(db_secondary_full()->TEST_CompactWithoutInstallation(
+ OpenAndCompactOptions(), cfh, input2, &result));
+ ASSERT_OK(result.status);
+
+ CloseSecondary();
+
+  // Delete all L2 files without updating the MANIFEST.
+ for (auto& file : meta.levels[2].files) {
+ ASSERT_OK(env_->DeleteFile(dbname_ + file.name));
+ }
+ OpenSecondary(options);
+ cfh = db_secondary_->DefaultColumnFamily();
+ Status s = db_secondary_full()->TEST_CompactWithoutInstallation(
+ OpenAndCompactOptions(), cfh, input2, &result);
+ ASSERT_TRUE(s.IsInvalidArgument());
+ ASSERT_OK(result.status);
+
+  // TODO: The L0 -> L1 compaction should succeed; currently the version is
+  // not built if any input file is missing.
+ // ASSERT_OK(db_secondary_full()->TEST_CompactWithoutInstallation(OpenAndCompactOptions(),
+ // cfh, input1, &result));
+}
+
+TEST_F(DBSecondaryTest, InternalCompactionCompactedFiles) {
+ Options options;
+ options.env = env_;
+ options.level0_file_num_compaction_trigger = 4;
+ Reopen(options);
+ for (int i = 0; i < 3; ++i) {
+ ASSERT_OK(Put("foo", "foo_value" + std::to_string(i)));
+ ASSERT_OK(Put("bar", "bar_value" + std::to_string(i)));
+ ASSERT_OK(Flush());
+ }
+ CompactionServiceInput input;
+
+ ColumnFamilyMetaData meta;
+ db_->GetColumnFamilyMetaData(&meta);
+ for (auto& file : meta.levels[0].files) {
+ ASSERT_EQ(0, meta.levels[0].level);
+ input.input_files.push_back(file.name);
+ }
+ ASSERT_EQ(input.input_files.size(), 3);
+
+ input.output_level = 1;
+ ASSERT_OK(db_->GetDbIdentity(input.db_id));
+
+  // Trigger a compaction so the input files above become obsolete and are
+  // deleted before the secondary instance tries to compact them.
+ ASSERT_OK(Put("foo", "foo_value" + std::to_string(3)));
+ ASSERT_OK(Put("bar", "bar_value" + std::to_string(3)));
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ Close();
+
+ options.max_open_files = -1;
+ OpenSecondary(options);
+ auto cfh = db_secondary_->DefaultColumnFamily();
+
+ CompactionServiceResult result;
+ Status s = db_secondary_full()->TEST_CompactWithoutInstallation(
+ OpenAndCompactOptions(), cfh, input, &result);
+ ASSERT_TRUE(s.IsInvalidArgument());
+ ASSERT_OK(result.status);
+}
+
+TEST_F(DBSecondaryTest, InternalCompactionMissingFiles) {
+ Options options;
+ options.env = env_;
+ options.level0_file_num_compaction_trigger = 4;
+ Reopen(options);
+ for (int i = 0; i < 3; ++i) {
+ ASSERT_OK(Put("foo", "foo_value" + std::to_string(i)));
+ ASSERT_OK(Put("bar", "bar_value" + std::to_string(i)));
+ ASSERT_OK(Flush());
+ }
+ CompactionServiceInput input;
+
+ ColumnFamilyMetaData meta;
+ db_->GetColumnFamilyMetaData(&meta);
+ for (auto& file : meta.levels[0].files) {
+ ASSERT_EQ(0, meta.levels[0].level);
+ input.input_files.push_back(file.name);
+ }
+ ASSERT_EQ(input.input_files.size(), 3);
+
+ input.output_level = 1;
+ ASSERT_OK(db_->GetDbIdentity(input.db_id));
+
+ Close();
+
+ ASSERT_OK(env_->DeleteFile(dbname_ + input.input_files[0]));
+
+ options.max_open_files = -1;
+ OpenSecondary(options);
+ auto cfh = db_secondary_->DefaultColumnFamily();
+
+ CompactionServiceResult result;
+ Status s = db_secondary_full()->TEST_CompactWithoutInstallation(
+ OpenAndCompactOptions(), cfh, input, &result);
+ ASSERT_TRUE(s.IsInvalidArgument());
+ ASSERT_OK(result.status);
+
+ input.input_files.erase(input.input_files.begin());
+
+ ASSERT_OK(db_secondary_full()->TEST_CompactWithoutInstallation(
+ OpenAndCompactOptions(), cfh, input, &result));
+ ASSERT_OK(result.status);
+}
+
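+// The catch-up tests below rely on TryCatchUpWithPrimary, which is expected
+// to tail the primary's MANIFEST and WALs so the secondary sees newly
+// flushed files and recent writes without reopening.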
+TEST_F(DBSecondaryTest, OpenAsSecondary) {
+ Options options;
+ options.env = env_;
+ options.level0_file_num_compaction_trigger = 4;
+ Reopen(options);
+ for (int i = 0; i < 3; ++i) {
+ ASSERT_OK(Put("foo", "foo_value" + std::to_string(i)));
+ ASSERT_OK(Put("bar", "bar_value" + std::to_string(i)));
+ ASSERT_OK(Flush());
+ }
+ Options options1;
+ options1.env = env_;
+ options1.max_open_files = -1;
+ OpenSecondary(options1);
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ ReadOptions ropts;
+ ropts.verify_checksums = true;
+ const auto verify_db_func = [&](const std::string& foo_val,
+ const std::string& bar_val) {
+ std::string value;
+ ASSERT_OK(db_secondary_->Get(ropts, "foo", &value));
+ ASSERT_EQ(foo_val, value);
+ ASSERT_OK(db_secondary_->Get(ropts, "bar", &value));
+ ASSERT_EQ(bar_val, value);
+ Iterator* iter = db_secondary_->NewIterator(ropts);
+ ASSERT_NE(nullptr, iter);
+ iter->Seek("foo");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("foo", iter->key().ToString());
+ ASSERT_EQ(foo_val, iter->value().ToString());
+ iter->Seek("bar");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("bar", iter->key().ToString());
+ ASSERT_EQ(bar_val, iter->value().ToString());
+ size_t count = 0;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ ++count;
+ }
+ ASSERT_EQ(2, count);
+ delete iter;
+ };
+
+ verify_db_func("foo_value2", "bar_value2");
+
+ ASSERT_OK(Put("foo", "new_foo_value"));
+ ASSERT_OK(Put("bar", "new_bar_value"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+ verify_db_func("new_foo_value", "new_bar_value");
+}
+
+namespace {
+class TraceFileEnv : public EnvWrapper {
+ public:
+ explicit TraceFileEnv(Env* _target) : EnvWrapper(_target) {}
+ static const char* kClassName() { return "TraceFileEnv"; }
+ const char* Name() const override { return kClassName(); }
+
+ Status NewRandomAccessFile(const std::string& f,
+ std::unique_ptr<RandomAccessFile>* r,
+ const EnvOptions& env_options) override {
+ class TracedRandomAccessFile : public RandomAccessFile {
+ public:
+ TracedRandomAccessFile(std::unique_ptr<RandomAccessFile>&& target,
+ std::atomic<int>& counter)
+ : target_(std::move(target)), files_closed_(counter) {}
+ ~TracedRandomAccessFile() override {
+ files_closed_.fetch_add(1, std::memory_order_relaxed);
+ }
+ Status Read(uint64_t offset, size_t n, Slice* result,
+ char* scratch) const override {
+ return target_->Read(offset, n, result, scratch);
+ }
+
+ private:
+ std::unique_ptr<RandomAccessFile> target_;
+ std::atomic<int>& files_closed_;
+ };
+ Status s = target()->NewRandomAccessFile(f, r, env_options);
+ if (s.ok()) {
+ r->reset(new TracedRandomAccessFile(std::move(*r), files_closed_));
+ }
+ return s;
+ }
+
+ int files_closed() const {
+ return files_closed_.load(std::memory_order_relaxed);
+ }
+
+ private:
+ std::atomic<int> files_closed_{0};
+};
+} // anonymous namespace
+
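+// SecondaryCloseFiles uses TraceFileEnv (above) to count destroyed
+// random-access files, as a rough check that the secondary closes obsolete
+// table files after catching up with the compacted primary.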
+TEST_F(DBSecondaryTest, SecondaryCloseFiles) {
+ Options options;
+ options.env = env_;
+ options.max_open_files = 1;
+ options.disable_auto_compactions = true;
+ Reopen(options);
+ Options options1;
+ std::unique_ptr<Env> traced_env(new TraceFileEnv(env_));
+ options1.env = traced_env.get();
+ OpenSecondary(options1);
+
+ static const auto verify_db = [&]() {
+ std::unique_ptr<Iterator> iter1(dbfull()->NewIterator(ReadOptions()));
+ std::unique_ptr<Iterator> iter2(db_secondary_->NewIterator(ReadOptions()));
+ for (iter1->SeekToFirst(), iter2->SeekToFirst();
+ iter1->Valid() && iter2->Valid(); iter1->Next(), iter2->Next()) {
+ ASSERT_EQ(iter1->key(), iter2->key());
+ ASSERT_EQ(iter1->value(), iter2->value());
+ }
+ ASSERT_FALSE(iter1->Valid());
+ ASSERT_FALSE(iter2->Valid());
+ };
+
+ ASSERT_OK(Put("a", "value"));
+ ASSERT_OK(Put("c", "value"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+ verify_db();
+
+ ASSERT_OK(Put("b", "value"));
+ ASSERT_OK(Put("d", "value"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+ verify_db();
+
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+ ASSERT_EQ(2, static_cast<TraceFileEnv*>(traced_env.get())->files_closed());
+
+ Status s = db_secondary_->SetDBOptions({{"max_open_files", "-1"}});
+ ASSERT_TRUE(s.IsNotSupported());
+ CloseSecondary();
+}
+
+TEST_F(DBSecondaryTest, OpenAsSecondaryWALTailing) {
+ Options options;
+ options.env = env_;
+ options.level0_file_num_compaction_trigger = 4;
+ Reopen(options);
+ for (int i = 0; i < 3; ++i) {
+ ASSERT_OK(Put("foo", "foo_value" + std::to_string(i)));
+ ASSERT_OK(Put("bar", "bar_value" + std::to_string(i)));
+ }
+ Options options1;
+ options1.env = env_;
+ options1.max_open_files = -1;
+ OpenSecondary(options1);
+
+ ReadOptions ropts;
+ ropts.verify_checksums = true;
+ const auto verify_db_func = [&](const std::string& foo_val,
+ const std::string& bar_val) {
+ std::string value;
+ ASSERT_OK(db_secondary_->Get(ropts, "foo", &value));
+ ASSERT_EQ(foo_val, value);
+ ASSERT_OK(db_secondary_->Get(ropts, "bar", &value));
+ ASSERT_EQ(bar_val, value);
+ Iterator* iter = db_secondary_->NewIterator(ropts);
+ ASSERT_NE(nullptr, iter);
+ iter->Seek("foo");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("foo", iter->key().ToString());
+ ASSERT_EQ(foo_val, iter->value().ToString());
+ iter->Seek("bar");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("bar", iter->key().ToString());
+ ASSERT_EQ(bar_val, iter->value().ToString());
+ size_t count = 0;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ ++count;
+ }
+ ASSERT_EQ(2, count);
+ delete iter;
+ };
+
+ verify_db_func("foo_value2", "bar_value2");
+
+ ASSERT_OK(Put("foo", "new_foo_value"));
+ ASSERT_OK(Put("bar", "new_bar_value"));
+
+ ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+ verify_db_func("new_foo_value", "new_bar_value");
+
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("foo", "new_foo_value_1"));
+ ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+ verify_db_func("new_foo_value_1", "new_bar_value");
+}
+
+TEST_F(DBSecondaryTest, SecondaryTailingBug_ISSUE_8467) {
+ Options options;
+ options.env = env_;
+ Reopen(options);
+ for (int i = 0; i < 3; ++i) {
+ ASSERT_OK(Put("foo", "foo_value" + std::to_string(i)));
+ ASSERT_OK(Put("bar", "bar_value" + std::to_string(i)));
+ }
+
+ Options options1;
+ options1.env = env_;
+ options1.max_open_files = -1;
+ OpenSecondary(options1);
+
+ const auto verify_db = [&](const std::string& foo_val,
+ const std::string& bar_val) {
+ std::string value;
+ ReadOptions ropts;
+ Status s = db_secondary_->Get(ropts, "foo", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ(foo_val, value);
+
+ s = db_secondary_->Get(ropts, "bar", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ(bar_val, value);
+ };
+
+ for (int i = 0; i < 2; ++i) {
+ ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+ verify_db("foo_value2", "bar_value2");
+ }
+}
+
+TEST_F(DBSecondaryTest, RefreshIterator) {
+ Options options;
+ options.env = env_;
+ Reopen(options);
+
+ Options options1;
+ options1.env = env_;
+ options1.max_open_files = -1;
+ OpenSecondary(options1);
+
+ std::unique_ptr<Iterator> it(db_secondary_->NewIterator(ReadOptions()));
+ for (int i = 0; i < 3; ++i) {
+ ASSERT_OK(Put("foo", "foo_value" + std::to_string(i)));
+
+ ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+ if (0 == i) {
+ it->Seek("foo");
+ ASSERT_FALSE(it->Valid());
+ ASSERT_OK(it->status());
+
+ ASSERT_OK(it->Refresh());
+
+ it->Seek("foo");
+ ASSERT_OK(it->status());
+ ASSERT_TRUE(it->Valid());
+ ASSERT_EQ("foo", it->key());
+ ASSERT_EQ("foo_value0", it->value());
+ } else {
+ it->Seek("foo");
+ ASSERT_TRUE(it->Valid());
+ ASSERT_EQ("foo", it->key());
+ ASSERT_EQ("foo_value" + std::to_string(i - 1), it->value());
+ ASSERT_OK(it->status());
+
+ ASSERT_OK(it->Refresh());
+
+ it->Seek("foo");
+ ASSERT_OK(it->status());
+ ASSERT_TRUE(it->Valid());
+ ASSERT_EQ("foo", it->key());
+ ASSERT_EQ("foo_value" + std::to_string(i), it->value());
+ }
+ }
+}
+
+TEST_F(DBSecondaryTest, OpenWithNonExistColumnFamily) {
+ Options options;
+ options.env = env_;
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ Options options1;
+ options1.env = env_;
+ options1.max_open_files = -1;
+ std::vector<ColumnFamilyDescriptor> cf_descs;
+ cf_descs.emplace_back(kDefaultColumnFamilyName, options1);
+ cf_descs.emplace_back("pikachu", options1);
+ cf_descs.emplace_back("eevee", options1);
+ Status s = DB::OpenAsSecondary(options1, dbname_, secondary_path_, cf_descs,
+ &handles_secondary_, &db_secondary_);
+ ASSERT_NOK(s);
+}
+
+TEST_F(DBSecondaryTest, OpenWithSubsetOfColumnFamilies) {
+ Options options;
+ options.env = env_;
+ CreateAndReopenWithCF({"pikachu"}, options);
+ Options options1;
+ options1.env = env_;
+ options1.max_open_files = -1;
+ OpenSecondary(options1);
+ ASSERT_EQ(0, handles_secondary_.size());
+ ASSERT_NE(nullptr, db_secondary_);
+
+ ASSERT_OK(Put(0 /*cf*/, "foo", "foo_value"));
+ ASSERT_OK(Put(1 /*cf*/, "foo", "foo_value"));
+ ASSERT_OK(Flush(0 /*cf*/));
+ ASSERT_OK(Flush(1 /*cf*/));
+ ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+ ReadOptions ropts;
+ ropts.verify_checksums = true;
+ std::string value;
+ ASSERT_OK(db_secondary_->Get(ropts, "foo", &value));
+ ASSERT_EQ("foo_value", value);
+}
+
+TEST_F(DBSecondaryTest, SwitchToNewManifestDuringOpen) {
+ Options options;
+ options.env = env_;
+ Reopen(options);
+ Close();
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->LoadDependency(
+ {{"ReactiveVersionSet::MaybeSwitchManifest:AfterGetCurrentManifestPath:0",
+ "VersionSet::ProcessManifestWrites:BeforeNewManifest"},
+ {"DBImpl::Open:AfterDeleteFiles",
+ "ReactiveVersionSet::MaybeSwitchManifest:AfterGetCurrentManifestPath:"
+ "1"}});
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ port::Thread ro_db_thread([&]() {
+ Options options1;
+ options1.env = env_;
+ options1.max_open_files = -1;
+ Status s = TryOpenSecondary(options1);
+ ASSERT_TRUE(s.IsTryAgain());
+
+ // Try again
+ OpenSecondary(options1);
+ CloseSecondary();
+ });
+ Reopen(options);
+ ro_db_thread.join();
+}
+
+TEST_F(DBSecondaryTest, MissingTableFileDuringOpen) {
+ Options options;
+ options.env = env_;
+ options.level0_file_num_compaction_trigger = 4;
+ Reopen(options);
+ for (int i = 0; i != options.level0_file_num_compaction_trigger; ++i) {
+ ASSERT_OK(Put("foo", "foo_value" + std::to_string(i)));
+ ASSERT_OK(Put("bar", "bar_value" + std::to_string(i)));
+ ASSERT_OK(dbfull()->Flush(FlushOptions()));
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ Options options1;
+ options1.env = env_;
+ options1.max_open_files = -1;
+ OpenSecondary(options1);
+ ReadOptions ropts;
+ ropts.verify_checksums = true;
+ std::string value;
+ ASSERT_OK(db_secondary_->Get(ropts, "foo", &value));
+ ASSERT_EQ("foo_value" +
+ std::to_string(options.level0_file_num_compaction_trigger - 1),
+ value);
+ ASSERT_OK(db_secondary_->Get(ropts, "bar", &value));
+ ASSERT_EQ("bar_value" +
+ std::to_string(options.level0_file_num_compaction_trigger - 1),
+ value);
+ Iterator* iter = db_secondary_->NewIterator(ropts);
+ ASSERT_NE(nullptr, iter);
+ iter->Seek("bar");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("bar", iter->key().ToString());
+ ASSERT_EQ("bar_value" +
+ std::to_string(options.level0_file_num_compaction_trigger - 1),
+ iter->value().ToString());
+ iter->Seek("foo");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("foo", iter->key().ToString());
+ ASSERT_EQ("foo_value" +
+ std::to_string(options.level0_file_num_compaction_trigger - 1),
+ iter->value().ToString());
+ size_t count = 0;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ ++count;
+ }
+ ASSERT_EQ(2, count);
+ delete iter;
+}
+
+TEST_F(DBSecondaryTest, MissingTableFile) {
+ Options options;
+ options.env = env_;
+ options.level0_file_num_compaction_trigger = 4;
+ Reopen(options);
+
+ Options options1;
+ options1.env = env_;
+ options1.max_open_files = -1;
+ OpenSecondary(options1);
+
+ for (int i = 0; i != options.level0_file_num_compaction_trigger; ++i) {
+ ASSERT_OK(Put("foo", "foo_value" + std::to_string(i)));
+ ASSERT_OK(Put("bar", "bar_value" + std::to_string(i)));
+ ASSERT_OK(dbfull()->Flush(FlushOptions()));
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ ASSERT_NE(nullptr, db_secondary_full());
+ ReadOptions ropts;
+ ropts.verify_checksums = true;
+ std::string value;
+ ASSERT_NOK(db_secondary_->Get(ropts, "foo", &value));
+ ASSERT_NOK(db_secondary_->Get(ropts, "bar", &value));
+
+ ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+ ASSERT_OK(db_secondary_->Get(ropts, "foo", &value));
+ ASSERT_EQ("foo_value" +
+ std::to_string(options.level0_file_num_compaction_trigger - 1),
+ value);
+ ASSERT_OK(db_secondary_->Get(ropts, "bar", &value));
+ ASSERT_EQ("bar_value" +
+ std::to_string(options.level0_file_num_compaction_trigger - 1),
+ value);
+ Iterator* iter = db_secondary_->NewIterator(ropts);
+ ASSERT_NE(nullptr, iter);
+ iter->Seek("bar");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("bar", iter->key().ToString());
+ ASSERT_EQ("bar_value" +
+ std::to_string(options.level0_file_num_compaction_trigger - 1),
+ iter->value().ToString());
+ iter->Seek("foo");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("foo", iter->key().ToString());
+ ASSERT_EQ("foo_value" +
+ std::to_string(options.level0_file_num_compaction_trigger - 1),
+ iter->value().ToString());
+ size_t count = 0;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ ++count;
+ }
+ ASSERT_EQ(2, count);
+ delete iter;
+}
+
+TEST_F(DBSecondaryTest, PrimaryDropColumnFamily) {
+ Options options;
+ options.env = env_;
+ const std::string kCfName1 = "pikachu";
+ CreateAndReopenWithCF({kCfName1}, options);
+
+ Options options1;
+ options1.env = env_;
+ options1.max_open_files = -1;
+ OpenSecondaryWithColumnFamilies({kCfName1}, options1);
+ ASSERT_EQ(2, handles_secondary_.size());
+
+ ASSERT_OK(Put(1 /*cf*/, "foo", "foo_val_1"));
+ ASSERT_OK(Flush(1 /*cf*/));
+
+ ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+ ReadOptions ropts;
+ ropts.verify_checksums = true;
+ std::string value;
+ ASSERT_OK(db_secondary_->Get(ropts, handles_secondary_[1], "foo", &value));
+ ASSERT_EQ("foo_val_1", value);
+
+ ASSERT_OK(dbfull()->DropColumnFamily(handles_[1]));
+ Close();
+ CheckFileTypeCounts(dbname_, 1, 0, 1);
+ ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+ value.clear();
+ ASSERT_OK(db_secondary_->Get(ropts, handles_secondary_[1], "foo", &value));
+ ASSERT_EQ("foo_val_1", value);
+}
+
+TEST_F(DBSecondaryTest, SwitchManifest) {
+ Options options;
+ options.env = env_;
+ options.level0_file_num_compaction_trigger = 4;
+ const std::string cf1_name("test_cf");
+ CreateAndReopenWithCF({cf1_name}, options);
+
+ Options options1;
+ options1.env = env_;
+ options1.max_open_files = -1;
+ OpenSecondaryWithColumnFamilies({kDefaultColumnFamilyName, cf1_name},
+ options1);
+
+ const int kNumFiles = options.level0_file_num_compaction_trigger - 1;
+ // Keep it smaller than 10 so that key0, key1, ..., key9 are sorted as 0, 1,
+ // ..., 9.
+ const int kNumKeys = 10;
+  // Create kNumFiles SST files.
+ for (int i = 0; i != kNumFiles; ++i) {
+ for (int j = 0; j != kNumKeys; ++j) {
+ ASSERT_OK(Put("key" + std::to_string(j), "value_" + std::to_string(i)));
+ }
+ ASSERT_OK(Flush());
+ }
+
+ ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+ const auto& range_scan_db = [&]() {
+ ReadOptions tmp_ropts;
+ tmp_ropts.total_order_seek = true;
+ tmp_ropts.verify_checksums = true;
+ std::unique_ptr<Iterator> iter(db_secondary_->NewIterator(tmp_ropts));
+ int cnt = 0;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next(), ++cnt) {
+ ASSERT_EQ("key" + std::to_string(cnt), iter->key().ToString());
+ ASSERT_EQ("value_" + std::to_string(kNumFiles - 1),
+ iter->value().ToString());
+ }
+ };
+
+ range_scan_db();
+
+  // While the secondary instance still keeps the old MANIFEST open, close the
+  // primary, restart it, perform a full compaction, close it again, and
+  // restart it once more, so that the next time the secondary tries to catch
+  // up with the primary it skips the MANIFEST in the middle.
+ ReopenWithColumnFamilies({kDefaultColumnFamilyName, cf1_name}, options);
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ ReopenWithColumnFamilies({kDefaultColumnFamilyName, cf1_name}, options);
+ ASSERT_OK(dbfull()->SetOptions({{"disable_auto_compactions", "false"}}));
+
+ ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+ range_scan_db();
+}
+
+TEST_F(DBSecondaryTest, SwitchManifestTwice) {
+ Options options;
+ options.env = env_;
+ options.disable_auto_compactions = true;
+ const std::string cf1_name("test_cf");
+ CreateAndReopenWithCF({cf1_name}, options);
+
+ Options options1;
+ options1.env = env_;
+ options1.max_open_files = -1;
+ OpenSecondaryWithColumnFamilies({kDefaultColumnFamilyName, cf1_name},
+ options1);
+
+ ASSERT_OK(Put("0", "value0"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+ std::string value;
+ ReadOptions ropts;
+ ropts.verify_checksums = true;
+ ASSERT_OK(db_secondary_->Get(ropts, "0", &value));
+ ASSERT_EQ("value0", value);
+
+ ReopenWithColumnFamilies({kDefaultColumnFamilyName, cf1_name}, options);
+ ASSERT_OK(dbfull()->SetOptions({{"disable_auto_compactions", "false"}}));
+ ReopenWithColumnFamilies({kDefaultColumnFamilyName, cf1_name}, options);
+ ASSERT_OK(Put("0", "value1"));
+ ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+
+ ASSERT_OK(db_secondary_->Get(ropts, "0", &value));
+ ASSERT_EQ("value1", value);
+}
+
+TEST_F(DBSecondaryTest, DISABLED_SwitchWAL) {
+ const int kNumKeysPerMemtable = 1;
+ Options options;
+ options.env = env_;
+ options.max_write_buffer_number = 4;
+ options.min_write_buffer_number_to_merge = 2;
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(kNumKeysPerMemtable));
+ Reopen(options);
+
+ Options options1;
+ options1.env = env_;
+ options1.max_open_files = -1;
+ OpenSecondary(options1);
+
+ const auto& verify_db = [](DB* db1, DB* db2) {
+ ASSERT_NE(nullptr, db1);
+ ASSERT_NE(nullptr, db2);
+ ReadOptions read_opts;
+ read_opts.verify_checksums = true;
+ std::unique_ptr<Iterator> it1(db1->NewIterator(read_opts));
+ std::unique_ptr<Iterator> it2(db2->NewIterator(read_opts));
+ it1->SeekToFirst();
+ it2->SeekToFirst();
+ for (; it1->Valid() && it2->Valid(); it1->Next(), it2->Next()) {
+ ASSERT_EQ(it1->key(), it2->key());
+ ASSERT_EQ(it1->value(), it2->value());
+ }
+ ASSERT_FALSE(it1->Valid());
+ ASSERT_FALSE(it2->Valid());
+
+ for (it1->SeekToFirst(); it1->Valid(); it1->Next()) {
+ std::string value;
+ ASSERT_OK(db2->Get(read_opts, it1->key(), &value));
+ ASSERT_EQ(it1->value(), value);
+ }
+ for (it2->SeekToFirst(); it2->Valid(); it2->Next()) {
+ std::string value;
+ ASSERT_OK(db1->Get(read_opts, it2->key(), &value));
+ ASSERT_EQ(it2->value(), value);
+ }
+ };
+ for (int k = 0; k != 16; ++k) {
+ ASSERT_OK(Put("key" + std::to_string(k), "value" + std::to_string(k)));
+ ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+ verify_db(dbfull(), db_secondary_);
+ }
+}
+
+TEST_F(DBSecondaryTest, DISABLED_SwitchWALMultiColumnFamilies) {
+ const int kNumKeysPerMemtable = 1;
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::BackgroundCallFlush:ContextCleanedUp",
+ "DBSecondaryTest::SwitchWALMultipleColumnFamilies:BeforeCatchUp"}});
+ SyncPoint::GetInstance()->EnableProcessing();
+ const std::string kCFName1 = "pikachu";
+ Options options;
+ options.env = env_;
+ options.max_write_buffer_number = 4;
+ options.min_write_buffer_number_to_merge = 2;
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(kNumKeysPerMemtable));
+ CreateAndReopenWithCF({kCFName1}, options);
+
+ Options options1;
+ options1.env = env_;
+ options1.max_open_files = -1;
+ OpenSecondaryWithColumnFamilies({kCFName1}, options1);
+ ASSERT_EQ(2, handles_secondary_.size());
+
+ const auto& verify_db = [](DB* db1,
+ const std::vector<ColumnFamilyHandle*>& handles1,
+ DB* db2,
+ const std::vector<ColumnFamilyHandle*>& handles2) {
+ ASSERT_NE(nullptr, db1);
+ ASSERT_NE(nullptr, db2);
+ ReadOptions read_opts;
+ read_opts.verify_checksums = true;
+ ASSERT_EQ(handles1.size(), handles2.size());
+ for (size_t i = 0; i != handles1.size(); ++i) {
+ std::unique_ptr<Iterator> it1(db1->NewIterator(read_opts, handles1[i]));
+ std::unique_ptr<Iterator> it2(db2->NewIterator(read_opts, handles2[i]));
+ it1->SeekToFirst();
+ it2->SeekToFirst();
+ for (; it1->Valid() && it2->Valid(); it1->Next(), it2->Next()) {
+ ASSERT_EQ(it1->key(), it2->key());
+ ASSERT_EQ(it1->value(), it2->value());
+ }
+ ASSERT_FALSE(it1->Valid());
+ ASSERT_FALSE(it2->Valid());
+
+ for (it1->SeekToFirst(); it1->Valid(); it1->Next()) {
+ std::string value;
+ ASSERT_OK(db2->Get(read_opts, handles2[i], it1->key(), &value));
+ ASSERT_EQ(it1->value(), value);
+ }
+ for (it2->SeekToFirst(); it2->Valid(); it2->Next()) {
+ std::string value;
+ ASSERT_OK(db1->Get(read_opts, handles1[i], it2->key(), &value));
+ ASSERT_EQ(it2->value(), value);
+ }
+ }
+ };
+ for (int k = 0; k != 8; ++k) {
+ for (int j = 0; j < 2; ++j) {
+ ASSERT_OK(Put(0 /*cf*/, "key" + std::to_string(k),
+ "value" + std::to_string(k)));
+ ASSERT_OK(Put(1 /*cf*/, "key" + std::to_string(k),
+ "value" + std::to_string(k)));
+ }
+ TEST_SYNC_POINT(
+ "DBSecondaryTest::SwitchWALMultipleColumnFamilies:BeforeCatchUp");
+ ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+ verify_db(dbfull(), handles_, db_secondary_, handles_secondary_);
+ SyncPoint::GetInstance()->ClearTrace();
+ }
+}
+
+TEST_F(DBSecondaryTest, CatchUpAfterFlush) {
+ const int kNumKeysPerMemtable = 16;
+ Options options;
+ options.env = env_;
+ options.max_write_buffer_number = 4;
+ options.min_write_buffer_number_to_merge = 2;
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(kNumKeysPerMemtable));
+ Reopen(options);
+
+ Options options1;
+ options1.env = env_;
+ options1.max_open_files = -1;
+ OpenSecondary(options1);
+
+ WriteOptions write_opts;
+ WriteBatch wb;
+ ASSERT_OK(wb.Put("key0", "value0"));
+ ASSERT_OK(wb.Put("key1", "value1"));
+ ASSERT_OK(dbfull()->Write(write_opts, &wb));
+ ReadOptions read_opts;
+ std::unique_ptr<Iterator> iter1(db_secondary_->NewIterator(read_opts));
+ iter1->Seek("key0");
+ ASSERT_FALSE(iter1->Valid());
+ iter1->Seek("key1");
+ ASSERT_FALSE(iter1->Valid());
+ ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+ iter1->Seek("key0");
+ ASSERT_FALSE(iter1->Valid());
+ iter1->Seek("key1");
+ ASSERT_FALSE(iter1->Valid());
+ ASSERT_OK(iter1->status());
+ std::unique_ptr<Iterator> iter2(db_secondary_->NewIterator(read_opts));
+ iter2->Seek("key0");
+ ASSERT_TRUE(iter2->Valid());
+ ASSERT_EQ("value0", iter2->value());
+ iter2->Seek("key1");
+ ASSERT_TRUE(iter2->Valid());
+ ASSERT_OK(iter2->status());
+ ASSERT_EQ("value1", iter2->value());
+
+ {
+ WriteBatch wb1;
+ ASSERT_OK(wb1.Put("key0", "value01"));
+ ASSERT_OK(wb1.Put("key1", "value11"));
+ ASSERT_OK(dbfull()->Write(write_opts, &wb1));
+ }
+
+ {
+ WriteBatch wb2;
+ ASSERT_OK(wb2.Put("key0", "new_value0"));
+ ASSERT_OK(wb2.Delete("key1"));
+ ASSERT_OK(dbfull()->Write(write_opts, &wb2));
+ }
+
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+ std::unique_ptr<Iterator> iter3(db_secondary_->NewIterator(read_opts));
+ // iter3 should not see value01 and value11 at all.
+ iter3->Seek("key0");
+ ASSERT_TRUE(iter3->Valid());
+ ASSERT_EQ("new_value0", iter3->value());
+ iter3->Seek("key1");
+ ASSERT_FALSE(iter3->Valid());
+ ASSERT_OK(iter3->status());
+}
+
+TEST_F(DBSecondaryTest, CheckConsistencyWhenOpen) {
+ bool called = false;
+ Options options;
+ options.env = env_;
+ options.disable_auto_compactions = true;
+ Reopen(options);
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImplSecondary::CheckConsistency:AfterFirstAttempt", [&](void* arg) {
+ ASSERT_NE(nullptr, arg);
+ called = true;
+ auto* s = reinterpret_cast<Status*>(arg);
+ ASSERT_NOK(*s);
+ });
+ SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::CheckConsistency:AfterGetLiveFilesMetaData",
+ "BackgroundCallCompaction:0"},
+ {"DBImpl::BackgroundCallCompaction:PurgedObsoleteFiles",
+ "DBImpl::CheckConsistency:BeforeGetFileSize"}});
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(Put("a", "value0"));
+ ASSERT_OK(Put("c", "value0"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("b", "value1"));
+ ASSERT_OK(Put("d", "value1"));
+ ASSERT_OK(Flush());
+ port::Thread thread([this]() {
+ Options opts;
+ opts.env = env_;
+ opts.max_open_files = -1;
+ OpenSecondary(opts);
+ });
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ thread.join();
+ ASSERT_TRUE(called);
+}
+
+TEST_F(DBSecondaryTest, StartFromInconsistent) {
+ Options options = CurrentOptions();
+ DestroyAndReopen(options);
+ ASSERT_OK(Put("foo", "value"));
+ ASSERT_OK(Flush());
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->SetCallBack(
+ "VersionBuilder::CheckConsistencyBeforeReturn", [&](void* arg) {
+ ASSERT_NE(nullptr, arg);
+ *(reinterpret_cast<Status*>(arg)) =
+ Status::Corruption("Inject corruption");
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ Options options1;
+ options1.env = env_;
+ Status s = TryOpenSecondary(options1);
+ ASSERT_TRUE(s.IsCorruption());
+}
+
+TEST_F(DBSecondaryTest, InconsistencyDuringCatchUp) {
+ Options options = CurrentOptions();
+ DestroyAndReopen(options);
+ ASSERT_OK(Put("foo", "value"));
+ ASSERT_OK(Flush());
+
+ Options options1;
+ options1.env = env_;
+ OpenSecondary(options1);
+
+ {
+ std::string value;
+ ASSERT_OK(db_secondary_->Get(ReadOptions(), "foo", &value));
+ ASSERT_EQ("value", value);
+ }
+
+ ASSERT_OK(Put("bar", "value1"));
+ ASSERT_OK(Flush());
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->SetCallBack(
+ "VersionBuilder::CheckConsistencyBeforeReturn", [&](void* arg) {
+ ASSERT_NE(nullptr, arg);
+ *(reinterpret_cast<Status*>(arg)) =
+ Status::Corruption("Inject corruption");
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ Status s = db_secondary_->TryCatchUpWithPrimary();
+ ASSERT_TRUE(s.IsCorruption());
+}
+
+TEST_F(DBSecondaryTest, OpenWithTransactionDB) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+
+ // Destroy the DB to recreate as a TransactionDB.
+ Close();
+ Destroy(options, true);
+
+ // Create a TransactionDB.
+ TransactionDB* txn_db = nullptr;
+ TransactionDBOptions txn_db_opts;
+ ASSERT_OK(TransactionDB::Open(options, txn_db_opts, dbname_, &txn_db));
+ ASSERT_NE(txn_db, nullptr);
+ db_ = txn_db;
+
+ std::vector<std::string> cfs = {"new_CF"};
+ CreateColumnFamilies(cfs, options);
+ ASSERT_EQ(handles_.size(), 1);
+
+ WriteOptions wopts;
+ TransactionOptions txn_opts;
+ Transaction* txn1 = txn_db->BeginTransaction(wopts, txn_opts, nullptr);
+ ASSERT_NE(txn1, nullptr);
+ ASSERT_OK(txn1->Put(handles_[0], "k1", "v1"));
+ ASSERT_OK(txn1->Commit());
+ delete txn1;
+
+ options = CurrentOptions();
+ options.max_open_files = -1;
+ ASSERT_OK(TryOpenSecondary(options));
+}
+
+class DBSecondaryTestWithTimestamp : public DBSecondaryTestBase {
+ public:
+ explicit DBSecondaryTestWithTimestamp()
+ : DBSecondaryTestBase("db_secondary_test_with_timestamp") {}
+};
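+
+// The timestamp tests below mirror the compacted read-only DB checks above:
+// malformed or unexpected read timestamps are expected to surface as
+// InvalidArgument through a secondary instance, while well-formed reads
+// return the written value and timestamp.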
+TEST_F(DBSecondaryTestWithTimestamp, IteratorAndGetReadTimestampSizeMismatch) {
+ const int kNumKeysPerFile = 128;
+ const uint64_t kMaxKey = 1024;
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(kNumKeysPerFile));
+ DestroyAndReopen(options);
+ const std::string write_timestamp = Timestamp(1, 0);
+ WriteOptions write_opts;
+ for (uint64_t key = 0; key <= kMaxKey; ++key) {
+ Status s = db_->Put(write_opts, Key1(key), write_timestamp,
+ "value" + std::to_string(key));
+ ASSERT_OK(s);
+ }
+
+  // Reopen the database as a secondary instance to test its timestamp
+  // support.
+ Close();
+ options.max_open_files = -1;
+ ASSERT_OK(ReopenAsSecondary(options));
+
+ ReadOptions read_opts;
+ std::string different_size_read_timestamp;
+ PutFixed32(&different_size_read_timestamp, 2);
+ Slice different_size_read_ts = different_size_read_timestamp;
+ read_opts.timestamp = &different_size_read_ts;
+ {
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts));
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_TRUE(iter->status().IsInvalidArgument());
+ }
+
+ for (uint64_t key = 0; key <= kMaxKey; ++key) {
+ std::string value_from_get;
+ std::string timestamp;
+ ASSERT_TRUE(db_->Get(read_opts, Key1(key), &value_from_get, &timestamp)
+ .IsInvalidArgument());
+ }
+
+ Close();
+}
+
+TEST_F(DBSecondaryTestWithTimestamp,
+ IteratorAndGetReadTimestampSpecifiedWithoutWriteTimestamp) {
+ const int kNumKeysPerFile = 128;
+ const uint64_t kMaxKey = 1024;
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(kNumKeysPerFile));
+ DestroyAndReopen(options);
+ WriteOptions write_opts;
+ for (uint64_t key = 0; key <= kMaxKey; ++key) {
+ Status s = db_->Put(write_opts, Key1(key), "value" + std::to_string(key));
+ ASSERT_OK(s);
+ }
+
+  // Reopen the database as a secondary instance to test its timestamp
+  // support.
+ Close();
+ options.max_open_files = -1;
+ ASSERT_OK(ReopenAsSecondary(options));
+
+ ReadOptions read_opts;
+ const std::string read_timestamp = Timestamp(2, 0);
+ Slice read_ts = read_timestamp;
+ read_opts.timestamp = &read_ts;
+ {
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts));
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_TRUE(iter->status().IsInvalidArgument());
+ }
+
+ for (uint64_t key = 0; key <= kMaxKey; ++key) {
+ std::string value_from_get;
+ std::string timestamp;
+ ASSERT_TRUE(db_->Get(read_opts, Key1(key), &value_from_get, &timestamp)
+ .IsInvalidArgument());
+ }
+
+ Close();
+}
+
+TEST_F(DBSecondaryTestWithTimestamp,
+ IteratorAndGetWriteWithTimestampReadWithoutTimestamp) {
+ const int kNumKeysPerFile = 128;
+ const uint64_t kMaxKey = 1024;
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(kNumKeysPerFile));
+ DestroyAndReopen(options);
+ const std::string write_timestamp = Timestamp(1, 0);
+ WriteOptions write_opts;
+ for (uint64_t key = 0; key <= kMaxKey; ++key) {
+ Status s = db_->Put(write_opts, Key1(key), write_timestamp,
+ "value" + std::to_string(key));
+ ASSERT_OK(s);
+ }
+
+  // Reopen the database as a secondary instance to test its timestamp
+  // support.
+ Close();
+ options.max_open_files = -1;
+ ASSERT_OK(ReopenAsSecondary(options));
+
+ ReadOptions read_opts;
+ {
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts));
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_TRUE(iter->status().IsInvalidArgument());
+ }
+
+ for (uint64_t key = 0; key <= kMaxKey; ++key) {
+ std::string value_from_get;
+ ASSERT_TRUE(
+ db_->Get(read_opts, Key1(key), &value_from_get).IsInvalidArgument());
+ }
+
+ Close();
+}
+
+TEST_F(DBSecondaryTestWithTimestamp, IteratorAndGet) {
+ const int kNumKeysPerFile = 128;
+ const uint64_t kMaxKey = 1024;
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(kNumKeysPerFile));
+ DestroyAndReopen(options);
+ const std::vector<uint64_t> start_keys = {1, 0};
+ const std::vector<std::string> write_timestamps = {Timestamp(1, 0),
+ Timestamp(3, 0)};
+ const std::vector<std::string> read_timestamps = {Timestamp(2, 0),
+ Timestamp(4, 0)};
+ for (size_t i = 0; i < write_timestamps.size(); ++i) {
+ WriteOptions write_opts;
+ for (uint64_t key = start_keys[i]; key <= kMaxKey; ++key) {
+ Status s = db_->Put(write_opts, Key1(key), write_timestamps[i],
+ "value" + std::to_string(i));
+ ASSERT_OK(s);
+ }
+ }
+
+ // Reopen the database as a secondary instance to test its timestamp support.
+ Close();
+ options.max_open_files = -1;
+ ASSERT_OK(ReopenAsSecondary(options));
+
+ auto get_value_and_check = [](DB* db, ReadOptions read_opts, Slice key,
+ Slice expected_value, std::string expected_ts) {
+ std::string value_from_get;
+ std::string timestamp;
+ ASSERT_OK(db->Get(read_opts, key.ToString(), &value_from_get, &timestamp));
+ ASSERT_EQ(expected_value, value_from_get);
+ ASSERT_EQ(expected_ts, timestamp);
+ };
+ for (size_t i = 0; i < read_timestamps.size(); ++i) {
+ ReadOptions read_opts;
+ Slice read_ts = read_timestamps[i];
+ read_opts.timestamp = &read_ts;
+ std::unique_ptr<Iterator> it(db_->NewIterator(read_opts));
+ int count = 0;
+ uint64_t key = 0;
+ // Forward iterate.
+ for (it->Seek(Key1(0)), key = start_keys[i]; it->Valid();
+ it->Next(), ++count, ++key) {
+ CheckIterUserEntry(it.get(), Key1(key), kTypeValue,
+ "value" + std::to_string(i), write_timestamps[i]);
+ get_value_and_check(db_, read_opts, it->key(), it->value(),
+ write_timestamps[i]);
+ }
+ size_t expected_count = kMaxKey - start_keys[i] + 1;
+ ASSERT_EQ(expected_count, count);
+
+ // Backward iterate.
+ count = 0;
+ for (it->SeekForPrev(Key1(kMaxKey)), key = kMaxKey; it->Valid();
+ it->Prev(), ++count, --key) {
+ CheckIterUserEntry(it.get(), Key1(key), kTypeValue,
+ "value" + std::to_string(i), write_timestamps[i]);
+ get_value_and_check(db_, read_opts, it->key(), it->value(),
+ write_timestamps[i]);
+ }
+ ASSERT_EQ(static_cast<size_t>(kMaxKey) - start_keys[i] + 1, count);
+
+ // SeekToFirst()/SeekToLast() with lower/upper bounds.
+ // Then iterate with lower and upper bounds.
+ uint64_t l = 0;
+ uint64_t r = kMaxKey + 1;
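+ // Tighten the [lb, ub) window from both ends by kMaxKey / 100 per iteration
+ // and re-verify SeekToFirst()/SeekToLast() within the bounds.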
+ while (l < r) {
+ std::string lb_str = Key1(l);
+ Slice lb = lb_str;
+ std::string ub_str = Key1(r);
+ Slice ub = ub_str;
+ read_opts.iterate_lower_bound = &lb;
+ read_opts.iterate_upper_bound = &ub;
+ it.reset(db_->NewIterator(read_opts));
+ for (it->SeekToFirst(), key = std::max(l, start_keys[i]), count = 0;
+ it->Valid(); it->Next(), ++key, ++count) {
+ CheckIterUserEntry(it.get(), Key1(key), kTypeValue,
+ "value" + std::to_string(i), write_timestamps[i]);
+ get_value_and_check(db_, read_opts, it->key(), it->value(),
+ write_timestamps[i]);
+ }
+ ASSERT_EQ(r - std::max(l, start_keys[i]), count);
+
+ for (it->SeekToLast(), key = std::min(r, kMaxKey + 1), count = 0;
+ it->Valid(); it->Prev(), --key, ++count) {
+ CheckIterUserEntry(it.get(), Key1(key - 1), kTypeValue,
+ "value" + std::to_string(i), write_timestamps[i]);
+ get_value_and_check(db_, read_opts, it->key(), it->value(),
+ write_timestamps[i]);
+ }
+ l += (kMaxKey / 100);
+ r -= (kMaxKey / 100);
+ }
+ }
+ Close();
+}
+
+TEST_F(DBSecondaryTestWithTimestamp, IteratorsReadTimestampSizeMismatch) {
+ const int kNumKeysPerFile = 128;
+ const uint64_t kMaxKey = 1024;
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(kNumKeysPerFile));
+ DestroyAndReopen(options);
+ const std::string write_timestamp = Timestamp(1, 0);
+ WriteOptions write_opts;
+ for (uint64_t key = 0; key <= kMaxKey; ++key) {
+ Status s = db_->Put(write_opts, Key1(key), write_timestamp,
+ "value" + std::to_string(key));
+ ASSERT_OK(s);
+ }
+
+ // Reopen the database as a secondary instance to test its timestamp support.
+ Close();
+ options.max_open_files = -1;
+ ASSERT_OK(ReopenAsSecondary(options));
+
+ ReadOptions read_opts;
+ std::string different_size_read_timestamp;
+ PutFixed32(&different_size_read_timestamp, 2);
+ Slice different_size_read_ts = different_size_read_timestamp;
+ read_opts.timestamp = &different_size_read_ts;
+ {
+ std::vector<Iterator*> iters;
+ ASSERT_TRUE(
+ db_->NewIterators(read_opts, {db_->DefaultColumnFamily()}, &iters)
+ .IsInvalidArgument());
+ }
+
+ Close();
+}
+
+TEST_F(DBSecondaryTestWithTimestamp,
+ IteratorsReadTimestampSpecifiedWithoutWriteTimestamp) {
+ const int kNumKeysPerFile = 128;
+ const uint64_t kMaxKey = 1024;
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(kNumKeysPerFile));
+ DestroyAndReopen(options);
+ WriteOptions write_opts;
+ for (uint64_t key = 0; key <= kMaxKey; ++key) {
+ Status s = db_->Put(write_opts, Key1(key), "value" + std::to_string(key));
+ ASSERT_OK(s);
+ }
+
+ // Reopen the database as a secondary instance to test its timestamp support.
+ Close();
+ options.max_open_files = -1;
+ ASSERT_OK(ReopenAsSecondary(options));
+
+ ReadOptions read_opts;
+ const std::string read_timestamp = Timestamp(2, 0);
+ Slice read_ts = read_timestamp;
+ read_opts.timestamp = &read_ts;
+ {
+ std::vector<Iterator*> iters;
+ ASSERT_TRUE(
+ db_->NewIterators(read_opts, {db_->DefaultColumnFamily()}, &iters)
+ .IsInvalidArgument());
+ }
+
+ Close();
+}
+
+TEST_F(DBSecondaryTestWithTimestamp,
+ IteratorsWriteWithTimestampReadWithoutTimestamp) {
+ const int kNumKeysPerFile = 128;
+ const uint64_t kMaxKey = 1024;
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(kNumKeysPerFile));
+ DestroyAndReopen(options);
+ const std::string write_timestamp = Timestamp(1, 0);
+ WriteOptions write_opts;
+ for (uint64_t key = 0; key <= kMaxKey; ++key) {
+ Status s = db_->Put(write_opts, Key1(key), write_timestamp,
+ "value" + std::to_string(key));
+ ASSERT_OK(s);
+ }
+
+ // Reopen the database as a secondary instance to test its timestamp support.
+ Close();
+ options.max_open_files = -1;
+ ASSERT_OK(ReopenAsSecondary(options));
+
+ ReadOptions read_opts;
+ {
+ std::vector<Iterator*> iters;
+ ASSERT_TRUE(
+ db_->NewIterators(read_opts, {db_->DefaultColumnFamily()}, &iters)
+ .IsInvalidArgument());
+ }
+
+ Close();
+}
+
+TEST_F(DBSecondaryTestWithTimestamp, Iterators) {
+ const int kNumKeysPerFile = 128;
+ const uint64_t kMaxKey = 1024;
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(kNumKeysPerFile));
+ DestroyAndReopen(options);
+ const std::string write_timestamp = Timestamp(1, 0);
+ const std::string read_timestamp = Timestamp(2, 0);
+ WriteOptions write_opts;
+ for (uint64_t key = 0; key <= kMaxKey; ++key) {
+ Status s = db_->Put(write_opts, Key1(key), write_timestamp,
+ "value" + std::to_string(key));
+ ASSERT_OK(s);
+ }
+
+ // Reopen the database as a secondary instance to test its timestamp support.
+ Close();
+ options.max_open_files = -1;
+ ASSERT_OK(ReopenAsSecondary(options));
+
+ ReadOptions read_opts;
+ Slice read_ts = read_timestamp;
+ read_opts.timestamp = &read_ts;
+ std::vector<Iterator*> iters;
+ ASSERT_OK(db_->NewIterators(read_opts, {db_->DefaultColumnFamily()}, &iters));
+ ASSERT_EQ(static_cast<uint64_t>(1), iters.size());
+
+ int count = 0;
+ uint64_t key = 0;
+ // Forward iterate.
+ for (iters[0]->Seek(Key1(0)), key = 0; iters[0]->Valid();
+ iters[0]->Next(), ++count, ++key) {
+ CheckIterUserEntry(iters[0], Key1(key), kTypeValue,
+ "value" + std::to_string(key), write_timestamp);
+ }
+
+ size_t expected_count = kMaxKey - 0 + 1;
+ ASSERT_EQ(expected_count, count);
+ delete iters[0];
+
+ Close();
+}
+#endif  // !ROCKSDB_LITE
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_sst_test.cc b/src/rocksdb/db/db_sst_test.cc
new file mode 100644
index 000000000..7f031444a
--- /dev/null
+++ b/src/rocksdb/db/db_sst_test.cc
@@ -0,0 +1,1868 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/db_test_util.h"
+#include "env/mock_env.h"
+#include "file/sst_file_manager_impl.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/sst_file_manager.h"
+#include "rocksdb/table.h"
+#include "util/random.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBSSTTest : public DBTestBase {
+ public:
+ DBSSTTest() : DBTestBase("db_sst_test", /*env_do_fsync=*/true) {}
+};
+
+#ifndef ROCKSDB_LITE
+// A class which remembers the name of each flushed file.
+class FlushedFileCollector : public EventListener {
+ public:
+ FlushedFileCollector() {}
+ ~FlushedFileCollector() override {}
+
+ void OnFlushCompleted(DB* /*db*/, const FlushJobInfo& info) override {
+ std::lock_guard<std::mutex> lock(mutex_);
+ flushed_files_.push_back(info.file_path);
+ }
+
+ std::vector<std::string> GetFlushedFiles() {
+ std::lock_guard<std::mutex> lock(mutex_);
+ std::vector<std::string> result;
+ for (auto fname : flushed_files_) {
+ result.push_back(fname);
+ }
+ return result;
+ }
+ void ClearFlushedFiles() {
+ std::lock_guard<std::mutex> lock(mutex_);
+ flushed_files_.clear();
+ }
+
+ private:
+ std::vector<std::string> flushed_files_;
+ std::mutex mutex_;
+};
+#endif // ROCKSDB_LITE
+
+TEST_F(DBSSTTest, DontDeletePendingOutputs) {
+ Options options;
+ options.env = env_;
+ options.create_if_missing = true;
+ DestroyAndReopen(options);
+
+ // Every time we write to a table file, call FOF/POF with a full DB scan. This
+ // will make sure our pending_outputs_ protection works correctly.
+ std::function<void()> purge_obsolete_files_function = [&]() {
+ JobContext job_context(0);
+ dbfull()->TEST_LockMutex();
+ dbfull()->FindObsoleteFiles(&job_context, true /*force*/);
+ dbfull()->TEST_UnlockMutex();
+ dbfull()->PurgeObsoleteFiles(job_context);
+ job_context.Clean();
+ };
+
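+ // FOF/POF above stands for FindObsoleteFiles()/PurgeObsoleteFiles(); the
+ // test Env's table_write_callback_ hook runs the purge on every table file
+ // write.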
+ env_->table_write_callback_ = &purge_obsolete_files_function;
+
+ for (int i = 0; i < 2; ++i) {
+ ASSERT_OK(Put("a", "begin"));
+ ASSERT_OK(Put("z", "end"));
+ ASSERT_OK(Flush());
+ }
+
+ // If pending output guard does not work correctly, PurgeObsoleteFiles() will
+ // delete the file that Compaction is trying to create, causing this: error
+ // db/db_test.cc:975: IO error:
+ // /tmp/rocksdbtest-1552237650/db_test/000009.sst: No such file or directory
+ Compact("a", "b");
+}
+
+// 1 Create some SST files by inserting K-V pairs into the DB
+// 2 Close the DB and change the suffix from ".sst" to ".ldb" for every other
+//   SST file
+// 3 Open the DB and check that all keys can be read
+TEST_F(DBSSTTest, SSTsWithLdbSuffixHandling) {
+ Options options = CurrentOptions();
+ options.write_buffer_size = 110 << 10; // 110KB
+ options.num_levels = 4;
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ int key_id = 0;
+ for (int i = 0; i < 10; ++i) {
+ GenerateNewFile(&rnd, &key_id, false);
+ }
+ ASSERT_OK(Flush());
+ Close();
+ int const num_files = GetSstFileCount(dbname_);
+ ASSERT_GT(num_files, 0);
+
+ Reopen(options);
+ std::vector<std::string> values;
+ values.reserve(key_id);
+ for (int k = 0; k < key_id; ++k) {
+ values.push_back(Get(Key(k)));
+ }
+ Close();
+
+ std::vector<std::string> filenames;
+ GetSstFiles(env_, dbname_, &filenames);
+ int num_ldb_files = 0;
+ for (size_t i = 0; i < filenames.size(); ++i) {
+ if (i & 1) {
+ continue;
+ }
+ std::string const rdb_name = dbname_ + "/" + filenames[i];
+ std::string const ldb_name = Rocks2LevelTableFileName(rdb_name);
+ ASSERT_TRUE(env_->RenameFile(rdb_name, ldb_name).ok());
+ ++num_ldb_files;
+ }
+ ASSERT_GT(num_ldb_files, 0);
+ ASSERT_EQ(num_files, GetSstFileCount(dbname_));
+
+ Reopen(options);
+ for (int k = 0; k < key_id; ++k) {
+ ASSERT_EQ(values[k], Get(Key(k)));
+ }
+ Destroy(options);
+}
+
+// Check that we don't crash when opening DB with
+// DBOptions::skip_checking_sst_file_sizes_on_db_open = true.
+TEST_F(DBSSTTest, SkipCheckingSSTFileSizesOnDBOpen) {
+ ASSERT_OK(Put("pika", "choo"));
+ ASSERT_OK(Flush());
+
+ // Just open the DB with the option set to true and check that we don't crash.
+ Options options;
+ options.env = env_;
+ options.skip_checking_sst_file_sizes_on_db_open = true;
+ Reopen(options);
+
+ ASSERT_EQ("choo", Get("pika"));
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBSSTTest, DontDeleteMovedFile) {
+ // This test triggers move compaction and verifies that the file is not
+ // deleted when it's part of move compaction
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ options.max_bytes_for_level_base = 1024 * 1024; // 1 MB
+ options.level0_file_num_compaction_trigger =
+ 2; // trigger compaction when we have 2 files
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ // Create two 1MB sst files
+ for (int i = 0; i < 2; ++i) {
+ // Create 1MB sst file
+ for (int j = 0; j < 100; ++j) {
+ ASSERT_OK(Put(Key(i * 50 + j), rnd.RandomString(10 * 1024)));
+ }
+ ASSERT_OK(Flush());
+ }
+ // this should execute both L0->L1 and L1->(move)->L2 compactions
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ("0,0,1", FilesPerLevel(0));
+
+ // If the moved file is actually deleted (the move-safeguard in
+ // ~Version::Version() is not there), we get this failure:
+ // Corruption: Can't access /000009.sst
+ Reopen(options);
+}
+
+// This reproduces a bug where we don't delete a file because when it was
+// supposed to be deleted, it was blocked by pending_outputs
+// Consider:
+// 1. current file_number is 13
+// 2. compaction (1) starts, blocks deletion of all files starting with 13
+// (pending outputs)
+// 3. file 13 is created by compaction (2)
+// 4. file 13 is consumed by compaction (3) and file 15 is created. Since file
+// 13 has no references, it is put into VersionSet::obsolete_files_
+// 5. FindObsoleteFiles() gets file 13 from VersionSet::obsolete_files_. File 13
+// is deleted from obsolete_files_ set.
+// 6. PurgeObsoleteFiles() tries to delete file 13, but this file is blocked by
+// pending outputs since compaction (1) is still running. It is not deleted and
+// it is not present in obsolete_files_ anymore. Therefore, we never delete it.
+TEST_F(DBSSTTest, DeleteObsoleteFilesPendingOutputs) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.write_buffer_size = 2 * 1024 * 1024; // 2 MB
+ options.max_bytes_for_level_base = 1024 * 1024; // 1 MB
+ options.level0_file_num_compaction_trigger =
+ 2; // trigger compaction when we have 2 files
+ options.max_background_flushes = 2;
+ options.max_background_compactions = 2;
+
+ OnFileDeletionListener* listener = new OnFileDeletionListener();
+ options.listeners.emplace_back(listener);
+
+ Reopen(options);
+
+ Random rnd(301);
+ // Create two 1MB sst files
+ for (int i = 0; i < 2; ++i) {
+ // Create 1MB sst file
+ for (int j = 0; j < 100; ++j) {
+ ASSERT_OK(Put(Key(i * 50 + j), rnd.RandomString(10 * 1024)));
+ }
+ ASSERT_OK(Flush());
+ }
+ // this should execute both L0->L1 and L1->(move)->L2 compactions
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ("0,0,1", FilesPerLevel(0));
+
+ test::SleepingBackgroundTask blocking_thread;
+ port::Mutex mutex_;
+ bool already_blocked(false);
+
+ // block the flush
+ std::function<void()> block_first_time = [&]() {
+ bool blocking = false;
+ {
+ MutexLock l(&mutex_);
+ if (!already_blocked) {
+ blocking = true;
+ already_blocked = true;
+ }
+ }
+ if (blocking) {
+ blocking_thread.DoSleep();
+ }
+ };
+ env_->table_write_callback_ = &block_first_time;
+ // Insert 2.5MB of data, which should trigger a flush because we exceed
+ // write_buffer_size. The flush will be blocked by block_first_time;
+ // the pending file number is protecting all the files created after it.
+ for (int j = 0; j < 256; ++j) {
+ ASSERT_OK(Put(Key(j), rnd.RandomString(10 * 1024)));
+ }
+ blocking_thread.WaitUntilSleeping();
+
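+ // While the flush is blocked, compact the single L2 file down to L3; the
+ // next assertion shows one file on level 3.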
+ ASSERT_OK(dbfull()->TEST_CompactRange(2, nullptr, nullptr));
+
+ ASSERT_EQ("0,0,0,1", FilesPerLevel(0));
+ std::vector<LiveFileMetaData> metadata;
+ db_->GetLiveFilesMetaData(&metadata);
+ ASSERT_EQ(metadata.size(), 1U);
+ auto file_on_L2 = metadata[0].name;
+ listener->SetExpectedFileName(dbname_ + file_on_L2);
+
+ ASSERT_OK(dbfull()->TEST_CompactRange(3, nullptr, nullptr, nullptr,
+ true /* disallow trivial move */));
+ ASSERT_EQ("0,0,0,0,1", FilesPerLevel(0));
+
+ // finish the flush!
+ blocking_thread.WakeUp();
+ blocking_thread.WaitUntilDone();
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ // The file just flushed is too big for L0 and L1, so it gets moved to L2.
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ("0,0,1,0,1", FilesPerLevel(0));
+
+ metadata.clear();
+ db_->GetLiveFilesMetaData(&metadata);
+ ASSERT_EQ(metadata.size(), 2U);
+
+ // This file should have been deleted during last compaction
+ ASSERT_EQ(Status::NotFound(), env_->FileExists(dbname_ + file_on_L2));
+ listener->VerifyMatchedCount(1);
+}
+
+// Test that producing an empty .sst file does not write it out to
+// disk, and that the DeleteFile() env method is not called for
+// removing the non-existing file later.
+TEST_F(DBSSTTest, DeleteFileNotCalledForNotCreatedSSTFile) {
+ Options options = CurrentOptions();
+ options.env = env_;
+
+ OnFileDeletionListener* listener = new OnFileDeletionListener();
+ options.listeners.emplace_back(listener);
+
+ Reopen(options);
+
+ // Flush the empty database.
+ ASSERT_OK(Flush());
+ ASSERT_EQ("", FilesPerLevel(0));
+
+ // We expect no .sst files.
+ std::vector<LiveFileMetaData> metadata;
+ db_->GetLiveFilesMetaData(&metadata);
+ ASSERT_EQ(metadata.size(), 0U);
+
+ // We expect no file deletions.
+ listener->VerifyMatchedCount(0);
+}
+
+// Test that producing a non-empty .sst file does write it out to
+// disk, and that the DeleteFile() env method is not called for removing
+// the file later.
+TEST_F(DBSSTTest, DeleteFileNotCalledForCreatedSSTFile) {
+ Options options = CurrentOptions();
+ options.env = env_;
+
+ OnFileDeletionListener* listener = new OnFileDeletionListener();
+ options.listeners.emplace_back(listener);
+
+ Reopen(options);
+
+ ASSERT_OK(Put("pika", "choo"));
+
+ // Flush the non-empty database.
+ ASSERT_OK(Flush());
+ ASSERT_EQ("1", FilesPerLevel(0));
+
+ // We expect 1 .sst file.
+ std::vector<LiveFileMetaData> metadata;
+ db_->GetLiveFilesMetaData(&metadata);
+ ASSERT_EQ(metadata.size(), 1U);
+
+ // We expect no file deletions.
+ listener->VerifyMatchedCount(0);
+}
+
+TEST_F(DBSSTTest, DBWithSstFileManager) {
+ std::shared_ptr<SstFileManager> sst_file_manager(NewSstFileManager(env_));
+ auto sfm = static_cast<SstFileManagerImpl*>(sst_file_manager.get());
+
+ int files_added = 0;
+ int files_deleted = 0;
+ int files_moved = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "SstFileManagerImpl::OnAddFile", [&](void* /*arg*/) { files_added++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "SstFileManagerImpl::OnDeleteFile",
+ [&](void* /*arg*/) { files_deleted++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "SstFileManagerImpl::OnMoveFile", [&](void* /*arg*/) { files_moved++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Options options = CurrentOptions();
+ options.sst_file_manager = sst_file_manager;
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ for (int i = 0; i < 25; i++) {
+ GenerateNewRandomFile(&rnd);
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ // Verify that we are tracking all sst files in dbname_
+ std::unordered_map<std::string, uint64_t> files_in_db;
+ ASSERT_OK(GetAllDataFiles(kTableFile, &files_in_db));
+ ASSERT_EQ(sfm->GetTrackedFiles(), files_in_db);
+ }
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ std::unordered_map<std::string, uint64_t> files_in_db;
+ ASSERT_OK(GetAllDataFiles(kTableFile, &files_in_db));
+ // Verify that we are tracking all sst files in dbname_
+ ASSERT_EQ(sfm->GetTrackedFiles(), files_in_db);
+ // Verify the total files size
+ uint64_t total_files_size = 0;
+ for (auto& file_to_size : files_in_db) {
+ total_files_size += file_to_size.second;
+ }
+ ASSERT_EQ(sfm->GetTotalSize(), total_files_size);
+ // We flushed at least 25 files
+ ASSERT_GE(files_added, 25);
+ // Compaction must have deleted some files
+ ASSERT_GT(files_deleted, 0);
+ // No files were moved
+ ASSERT_EQ(files_moved, 0);
+
+ Close();
+ Reopen(options);
+ ASSERT_EQ(sfm->GetTrackedFiles(), files_in_db);
+ ASSERT_EQ(sfm->GetTotalSize(), total_files_size);
+
+ // Verify that we track all the files again after the DB is closed and opened
+ Close();
+ sst_file_manager.reset(NewSstFileManager(env_));
+ options.sst_file_manager = sst_file_manager;
+ sfm = static_cast<SstFileManagerImpl*>(sst_file_manager.get());
+
+ Reopen(options);
+ ASSERT_EQ(sfm->GetTrackedFiles(), files_in_db);
+ ASSERT_EQ(sfm->GetTotalSize(), total_files_size);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBSSTTest, DBWithSstFileManagerForBlobFiles) {
+ std::shared_ptr<SstFileManager> sst_file_manager(NewSstFileManager(env_));
+ auto sfm = static_cast<SstFileManagerImpl*>(sst_file_manager.get());
+
+ int files_added = 0;
+ int files_deleted = 0;
+ int files_moved = 0;
+ int files_scheduled_to_delete = 0;
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "SstFileManagerImpl::OnAddFile", [&](void* arg) {
+ const std::string* const file_path =
+ static_cast<const std::string*>(arg);
+ if (file_path->find(".blob") != std::string::npos) {
+ files_added++;
+ }
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "SstFileManagerImpl::OnDeleteFile", [&](void* arg) {
+ const std::string* const file_path =
+ static_cast<const std::string*>(arg);
+ if (file_path->find(".blob") != std::string::npos) {
+ files_deleted++;
+ }
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "SstFileManagerImpl::ScheduleFileDeletion", [&](void* arg) {
+ assert(arg);
+ const std::string* const file_path =
+ static_cast<const std::string*>(arg);
+ if (file_path->find(".blob") != std::string::npos) {
+ ++files_scheduled_to_delete;
+ }
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "SstFileManagerImpl::OnMoveFile", [&](void* /*arg*/) { files_moved++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Options options = CurrentOptions();
+ options.sst_file_manager = sst_file_manager;
+ options.enable_blob_files = true;
+ options.blob_file_size = 32; // create one blob per file
+ DestroyAndReopen(options);
+ Random rnd(301);
+
+ for (int i = 0; i < 10; i++) {
+ ASSERT_OK(Put("Key_" + std::to_string(i), "Value_" + std::to_string(i)));
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ // Verify that we are tracking all sst and blob files in dbname_
+ std::unordered_map<std::string, uint64_t> files_in_db;
+ ASSERT_OK(GetAllDataFiles(kTableFile, &files_in_db));
+ ASSERT_OK(GetAllDataFiles(kBlobFile, &files_in_db));
+ ASSERT_EQ(sfm->GetTrackedFiles(), files_in_db);
+ }
+
+ std::vector<uint64_t> blob_files = GetBlobFileNumbers();
+ ASSERT_EQ(files_added, blob_files.size());
+ // No blob file is obsoleted.
+ ASSERT_EQ(files_deleted, 0);
+ ASSERT_EQ(files_scheduled_to_delete, 0);
+ // No files were moved.
+ ASSERT_EQ(files_moved, 0);
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ std::unordered_map<std::string, uint64_t> files_in_db;
+ ASSERT_OK(GetAllDataFiles(kTableFile, &files_in_db));
+ ASSERT_OK(GetAllDataFiles(kBlobFile, &files_in_db));
+
+ // Verify that we are tracking all sst and blob files in dbname_
+ ASSERT_EQ(sfm->GetTrackedFiles(), files_in_db);
+ // Verify the total files size
+ uint64_t total_files_size = 0;
+ for (auto& file_to_size : files_in_db) {
+ total_files_size += file_to_size.second;
+ }
+ ASSERT_EQ(sfm->GetTotalSize(), total_files_size);
+ Close();
+
+ Reopen(options);
+ ASSERT_EQ(sfm->GetTrackedFiles(), files_in_db);
+ ASSERT_EQ(sfm->GetTotalSize(), total_files_size);
+
+ // Verify that we track all the files again after the DB is closed and opened.
+ Close();
+
+ sst_file_manager.reset(NewSstFileManager(env_));
+ options.sst_file_manager = sst_file_manager;
+ sfm = static_cast<SstFileManagerImpl*>(sst_file_manager.get());
+
+ Reopen(options);
+
+ ASSERT_EQ(sfm->GetTrackedFiles(), files_in_db);
+ ASSERT_EQ(sfm->GetTotalSize(), total_files_size);
+
+ // Destroying the DB removes all the blob files from the SST file manager, and
+ // blob file deletions go through ScheduleFileDeletion.
+ ASSERT_EQ(files_deleted, 0);
+ ASSERT_EQ(files_scheduled_to_delete, 0);
+ Close();
+ ASSERT_OK(DestroyDB(dbname_, options));
+ ASSERT_EQ(files_deleted, blob_files.size());
+ ASSERT_EQ(files_scheduled_to_delete, blob_files.size());
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_F(DBSSTTest, DBWithSstFileManagerForBlobFilesWithGC) {
+ std::shared_ptr<SstFileManager> sst_file_manager(NewSstFileManager(env_));
+ auto sfm = static_cast<SstFileManagerImpl*>(sst_file_manager.get());
+ Options options = CurrentOptions();
+ options.sst_file_manager = sst_file_manager;
+ options.enable_blob_files = true;
+ options.blob_file_size = 32; // create one blob per file
+ options.disable_auto_compactions = true;
+ options.enable_blob_garbage_collection = true;
+ options.blob_garbage_collection_age_cutoff = 0.5;
+
+ int files_added = 0;
+ int files_deleted = 0;
+ int files_moved = 0;
+ int files_scheduled_to_delete = 0;
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "SstFileManagerImpl::OnAddFile", [&](void* arg) {
+ const std::string* const file_path =
+ static_cast<const std::string*>(arg);
+ if (file_path->find(".blob") != std::string::npos) {
+ files_added++;
+ }
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "SstFileManagerImpl::OnDeleteFile", [&](void* arg) {
+ const std::string* const file_path =
+ static_cast<const std::string*>(arg);
+ if (file_path->find(".blob") != std::string::npos) {
+ files_deleted++;
+ }
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "SstFileManagerImpl::ScheduleFileDeletion", [&](void* arg) {
+ assert(arg);
+ const std::string* const file_path =
+ static_cast<const std::string*>(arg);
+ if (file_path->find(".blob") != std::string::npos) {
+ ++files_scheduled_to_delete;
+ }
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "SstFileManagerImpl::OnMoveFile", [&](void* /*arg*/) { files_moved++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ DestroyAndReopen(options);
+ Random rnd(301);
+
+ constexpr char first_key[] = "first_key";
+ constexpr char first_value[] = "first_value";
+ constexpr char second_key[] = "second_key";
+ constexpr char second_value[] = "second_value";
+
+ ASSERT_OK(Put(first_key, first_value));
+ ASSERT_OK(Put(second_key, second_value));
+ ASSERT_OK(Flush());
+
+ constexpr char third_key[] = "third_key";
+ constexpr char third_value[] = "third_value";
+ constexpr char fourth_key[] = "fourth_key";
+ constexpr char fourth_value[] = "fourth_value";
+ constexpr char fifth_key[] = "fifth_key";
+ constexpr char fifth_value[] = "fifth_value";
+
+ ASSERT_OK(Put(third_key, third_value));
+ ASSERT_OK(Put(fourth_key, fourth_value));
+ ASSERT_OK(Put(fifth_key, fifth_value));
+ ASSERT_OK(Flush());
+
+ const std::vector<uint64_t> original_blob_files = GetBlobFileNumbers();
+
+ ASSERT_EQ(original_blob_files.size(), 5);
+ ASSERT_EQ(files_added, 5);
+ ASSERT_EQ(files_deleted, 0);
+ ASSERT_EQ(files_scheduled_to_delete, 0);
+ ASSERT_EQ(files_moved, 0);
+ {
+ // Verify that we are tracking all sst and blob files in dbname_
+ std::unordered_map<std::string, uint64_t> files_in_db;
+ ASSERT_OK(GetAllDataFiles(kTableFile, &files_in_db));
+ ASSERT_OK(GetAllDataFiles(kBlobFile, &files_in_db));
+ ASSERT_EQ(sfm->GetTrackedFiles(), files_in_db);
+ }
+
+ const size_t cutoff_index = static_cast<size_t>(
+ options.blob_garbage_collection_age_cutoff * original_blob_files.size());
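+ // With an age cutoff of 0.5 and 5 blob files, cutoff_index is 2, so the two
+ // oldest blob files are the ones eligible for garbage collection below.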
+
+ size_t expected_number_of_files = original_blob_files.size();
+ // Note: turning off enable_blob_files before the compaction results in
+ // garbage collected values getting inlined.
+ ASSERT_OK(db_->SetOptions({{"enable_blob_files", "false"}}));
+ expected_number_of_files -= cutoff_index;
+ files_added = 0;
+
+ constexpr Slice* begin = nullptr;
+ constexpr Slice* end = nullptr;
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), begin, end));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ sfm->WaitForEmptyTrash();
+
+ ASSERT_EQ(Get(first_key), first_value);
+ ASSERT_EQ(Get(second_key), second_value);
+ ASSERT_EQ(Get(third_key), third_value);
+ ASSERT_EQ(Get(fourth_key), fourth_value);
+ ASSERT_EQ(Get(fifth_key), fifth_value);
+
+ const std::vector<uint64_t> new_blob_files = GetBlobFileNumbers();
+
+ ASSERT_EQ(new_blob_files.size(), expected_number_of_files);
+ // No new file is added.
+ ASSERT_EQ(files_added, 0);
+ ASSERT_EQ(files_deleted, cutoff_index);
+ ASSERT_EQ(files_scheduled_to_delete, cutoff_index);
+ ASSERT_EQ(files_moved, 0);
+
+ // Original blob files below the cutoff should be gone; original blob files at
+ // or above the cutoff should still be there.
+ for (size_t i = cutoff_index; i < original_blob_files.size(); ++i) {
+ ASSERT_EQ(new_blob_files[i - cutoff_index], original_blob_files[i]);
+ }
+
+ {
+ // Verify that we are tracking all sst and blob files in dbname_
+ std::unordered_map<std::string, uint64_t> files_in_db;
+ ASSERT_OK(GetAllDataFiles(kTableFile, &files_in_db));
+ ASSERT_OK(GetAllDataFiles(kBlobFile, &files_in_db));
+ ASSERT_EQ(sfm->GetTrackedFiles(), files_in_db);
+ }
+
+ Close();
+ ASSERT_OK(DestroyDB(dbname_, options));
+ sfm->WaitForEmptyTrash();
+ ASSERT_EQ(files_deleted, 5);
+ ASSERT_EQ(files_scheduled_to_delete, 5);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+class DBSSTTestRateLimit : public DBSSTTest,
+ public ::testing::WithParamInterface<bool> {
+ public:
+ DBSSTTestRateLimit() : DBSSTTest() {}
+ ~DBSSTTestRateLimit() override {}
+};
+
+TEST_P(DBSSTTestRateLimit, RateLimitedDelete) {
+ Destroy(last_options_);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ {"DBSSTTest::RateLimitedDelete:1",
+ "DeleteScheduler::BackgroundEmptyTrash"},
+ });
+
+ std::vector<uint64_t> penalties;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DeleteScheduler::BackgroundEmptyTrash:Wait",
+ [&](void* arg) { penalties.push_back(*(static_cast<uint64_t*>(arg))); });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "InstrumentedCondVar::TimedWaitInternal", [&](void* arg) {
+ // Turn timed wait into a simulated sleep
+ uint64_t* abs_time_us = static_cast<uint64_t*>(arg);
+ uint64_t cur_time = env_->NowMicros();
+ if (*abs_time_us > cur_time) {
+ env_->MockSleepForMicroseconds(*abs_time_us - cur_time);
+ }
+
+ // Plus an additional short, random amount
+ env_->MockSleepForMicroseconds(Random::GetTLSInstance()->Uniform(10));
+
+ // Set the wait-until time to before the (actual) current time to force no
+ // sleep.
+ *abs_time_us = Env::Default()->NowMicros();
+ });
+
+ // Disable PeriodicTaskScheduler as it also has TimedWait, which could update
+ // the simulated sleep time
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::StartPeriodicTaskScheduler:DisableScheduler", [&](void* arg) {
+ bool* disable_scheduler = static_cast<bool*>(arg);
+ *disable_scheduler = true;
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ bool different_wal_dir = GetParam();
+ Options options = CurrentOptions();
+ SetTimeElapseOnlySleepOnReopen(&options);
+ options.disable_auto_compactions = true;
+ options.env = env_;
+ options.statistics = CreateDBStatistics();
+ if (different_wal_dir) {
+ options.wal_dir = alternative_wal_dir_;
+ }
+
+ int64_t rate_bytes_per_sec = 1024 * 10; // 10 KB/sec
+ Status s;
+ options.sst_file_manager.reset(
+ NewSstFileManager(env_, nullptr, "", 0, false, &s, 0));
+ ASSERT_OK(s);
+ options.sst_file_manager->SetDeleteRateBytesPerSecond(rate_bytes_per_sec);
+ auto sfm = static_cast<SstFileManagerImpl*>(options.sst_file_manager.get());
+ sfm->delete_scheduler()->SetMaxTrashDBRatio(1.1);
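+ // A trash/DB ratio above 1 keeps these deletions on the rate-limited trash
+ // path instead of deleting them immediately (checked via FILES_MARKED_TRASH
+ // below).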
+
+ WriteOptions wo;
+ if (!different_wal_dir) {
+ wo.disableWAL = true;
+ }
+ Reopen(options);
+ // Create 4 files in L0
+ for (char v = 'a'; v <= 'd'; v++) {
+ ASSERT_OK(Put("Key2", DummyString(1024, v), wo));
+ ASSERT_OK(Put("Key3", DummyString(1024, v), wo));
+ ASSERT_OK(Put("Key4", DummyString(1024, v), wo));
+ ASSERT_OK(Put("Key1", DummyString(1024, v), wo));
+ ASSERT_OK(Put("Key4", DummyString(1024, v), wo));
+ ASSERT_OK(Flush());
+ }
+ // We created 4 sst files in L0
+ ASSERT_EQ("4", FilesPerLevel(0));
+
+ std::vector<LiveFileMetaData> metadata;
+ db_->GetLiveFilesMetaData(&metadata);
+
+ // Compaction will move the 4 files in L0 to trash and create 1 L1 file
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+ ASSERT_EQ("0,1", FilesPerLevel(0));
+
+ uint64_t delete_start_time = env_->NowMicros();
+ // Hold BackgroundEmptyTrash
+ TEST_SYNC_POINT("DBSSTTest::RateLimitedDelete:1");
+ sfm->WaitForEmptyTrash();
+ uint64_t time_spent_deleting = env_->NowMicros() - delete_start_time;
+
+ uint64_t total_files_size = 0;
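+ // The expected penalty for each file is the cumulative trash size divided by
+ // the deletion rate, expressed in microseconds (hence the 1000000 factor).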
+ uint64_t expected_penalty = 0;
+ ASSERT_EQ(penalties.size(), metadata.size());
+ for (size_t i = 0; i < metadata.size(); i++) {
+ total_files_size += metadata[i].size;
+ expected_penalty = ((total_files_size * 1000000) / rate_bytes_per_sec);
+ ASSERT_EQ(expected_penalty, penalties[i]);
+ }
+ ASSERT_GT(time_spent_deleting, expected_penalty * 0.9);
+ ASSERT_LT(time_spent_deleting, expected_penalty * 1.1);
+ ASSERT_EQ(4, options.statistics->getAndResetTickerCount(FILES_MARKED_TRASH));
+ ASSERT_EQ(
+ 0, options.statistics->getAndResetTickerCount(FILES_DELETED_IMMEDIATELY));
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+INSTANTIATE_TEST_CASE_P(RateLimitedDelete, DBSSTTestRateLimit,
+ ::testing::Bool());
+
+TEST_F(DBSSTTest, RateLimitedWALDelete) {
+ Destroy(last_options_);
+
+ std::vector<uint64_t> penalties;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DeleteScheduler::BackgroundEmptyTrash:Wait",
+ [&](void* arg) { penalties.push_back(*(static_cast<uint64_t*>(arg))); });
+
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.compression = kNoCompression;
+ options.env = env_;
+
+ int64_t rate_bytes_per_sec = 1024 * 10; // 10 KB/sec
+ Status s;
+ options.sst_file_manager.reset(
+ NewSstFileManager(env_, nullptr, "", 0, false, &s, 0));
+ ASSERT_OK(s);
+ options.sst_file_manager->SetDeleteRateBytesPerSecond(rate_bytes_per_sec);
+ auto sfm = static_cast<SstFileManagerImpl*>(options.sst_file_manager.get());
+ sfm->delete_scheduler()->SetMaxTrashDBRatio(3.1);
+ SetTimeElapseOnlySleepOnReopen(&options);
+
+ ASSERT_OK(TryReopen(options));
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // Create 4 files in L0
+ for (char v = 'a'; v <= 'd'; v++) {
+ ASSERT_OK(Put("Key2", DummyString(1024, v)));
+ ASSERT_OK(Put("Key3", DummyString(1024, v)));
+ ASSERT_OK(Put("Key4", DummyString(1024, v)));
+ ASSERT_OK(Put("Key1", DummyString(1024, v)));
+ ASSERT_OK(Put("Key4", DummyString(1024, v)));
+ ASSERT_OK(Flush());
+ }
+ // We created 4 sst files in L0
+ ASSERT_EQ("4", FilesPerLevel(0));
+
+ // Compaction will move the 4 files in L0 to trash and create 1 L1 file
+ CompactRangeOptions cro;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+ ASSERT_EQ("0,1", FilesPerLevel(0));
+
+ sfm->WaitForEmptyTrash();
+ ASSERT_EQ(penalties.size(), 8);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+class DBWALTestWithParam
+ : public DBTestBase,
+ public testing::WithParamInterface<std::tuple<std::string, bool>> {
+ public:
+ explicit DBWALTestWithParam()
+ : DBTestBase("db_wal_test_with_params", /*env_do_fsync=*/true) {
+ wal_dir_ = std::get<0>(GetParam());
+ wal_dir_same_as_dbname_ = std::get<1>(GetParam());
+ }
+
+ std::string wal_dir_;
+ bool wal_dir_same_as_dbname_;
+};
+
+TEST_P(DBWALTestWithParam, WALTrashCleanupOnOpen) {
+ class MyEnv : public EnvWrapper {
+ public:
+ MyEnv(Env* t) : EnvWrapper(t), fake_log_delete(false) {}
+ const char* Name() const override { return "MyEnv"; }
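+ // When fake_log_delete is set, pretend deletions of ".log.trash" files
+ // succeeded so the trash WAL files remain on disk for the reopen check below.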
+ Status DeleteFile(const std::string& fname) override {
+ if (fname.find(".log.trash") != std::string::npos && fake_log_delete) {
+ return Status::OK();
+ }
+
+ return target()->DeleteFile(fname);
+ }
+
+ void set_fake_log_delete(bool fake) { fake_log_delete = fake; }
+
+ private:
+ bool fake_log_delete;
+ };
+
+ std::unique_ptr<MyEnv> env(new MyEnv(env_));
+ Destroy(last_options_);
+
+ env->set_fake_log_delete(true);
+
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.compression = kNoCompression;
+ options.env = env.get();
+ options.wal_dir = dbname_ + wal_dir_;
+
+ int64_t rate_bytes_per_sec = 1024 * 10; // 10 KB/sec
+ Status s;
+ options.sst_file_manager.reset(
+ NewSstFileManager(env_, nullptr, "", 0, false, &s, 0));
+ ASSERT_OK(s);
+ options.sst_file_manager->SetDeleteRateBytesPerSecond(rate_bytes_per_sec);
+ auto sfm = static_cast<SstFileManagerImpl*>(options.sst_file_manager.get());
+ sfm->delete_scheduler()->SetMaxTrashDBRatio(3.1);
+
+ Reopen(options);
+
+ // Create 4 files in L0
+ for (char v = 'a'; v <= 'd'; v++) {
+ if (v == 'c') {
+ // Maximize the chance that the last log file will be preserved in trash
+ // before restarting the DB.
+ // We have to set this on the 2nd to last file for it to delay deletion
+ // on the last file. (Quirk of DeleteScheduler::BackgroundEmptyTrash())
+ options.sst_file_manager->SetDeleteRateBytesPerSecond(1);
+ }
+ ASSERT_OK(Put("Key2", DummyString(1024, v)));
+ ASSERT_OK(Put("Key3", DummyString(1024, v)));
+ ASSERT_OK(Put("Key4", DummyString(1024, v)));
+ ASSERT_OK(Put("Key1", DummyString(1024, v)));
+ ASSERT_OK(Put("Key4", DummyString(1024, v)));
+ ASSERT_OK(Flush());
+ }
+ // We created 4 sst files in L0
+ ASSERT_EQ("4", FilesPerLevel(0));
+
+ Close();
+
+ options.sst_file_manager.reset();
+ std::vector<std::string> filenames;
+ int trash_log_count = 0;
+ if (!wal_dir_same_as_dbname_) {
+ // Forcibly create some trash log files
+ std::unique_ptr<WritableFile> result;
+ ASSERT_OK(env->NewWritableFile(options.wal_dir + "/1000.log.trash", &result,
+ EnvOptions()));
+ result.reset();
+ }
+ ASSERT_OK(env->GetChildren(options.wal_dir, &filenames));
+ for (const std::string& fname : filenames) {
+ if (fname.find(".log.trash") != std::string::npos) {
+ trash_log_count++;
+ }
+ }
+ ASSERT_GE(trash_log_count, 1);
+
+ env->set_fake_log_delete(false);
+ Reopen(options);
+
+ filenames.clear();
+ trash_log_count = 0;
+ ASSERT_OK(env->GetChildren(options.wal_dir, &filenames));
+ for (const std::string& fname : filenames) {
+ if (fname.find(".log.trash") != std::string::npos) {
+ trash_log_count++;
+ }
+ }
+ ASSERT_EQ(trash_log_count, 0);
+ Close();
+}
+
+INSTANTIATE_TEST_CASE_P(DBWALTestWithParam, DBWALTestWithParam,
+ ::testing::Values(std::make_tuple("", true),
+ std::make_tuple("_wal_dir", false)));
+
+TEST_F(DBSSTTest, OpenDBWithExistingTrash) {
+ Options options = CurrentOptions();
+
+ options.sst_file_manager.reset(
+ NewSstFileManager(env_, nullptr, "", 1024 * 1024 /* 1 MB/sec */));
+ auto sfm = static_cast<SstFileManagerImpl*>(options.sst_file_manager.get());
+
+ Destroy(last_options_);
+
+ // Add some trash files to the db directory so the DB can clean them up
+ ASSERT_OK(env_->CreateDirIfMissing(dbname_));
+ ASSERT_OK(WriteStringToFile(env_, "abc", dbname_ + "/" + "001.sst.trash"));
+ ASSERT_OK(WriteStringToFile(env_, "abc", dbname_ + "/" + "002.sst.trash"));
+ ASSERT_OK(WriteStringToFile(env_, "abc", dbname_ + "/" + "003.sst.trash"));
+
+ // Reopen the DB and verify that it deletes existing trash files
+ Reopen(options);
+ sfm->WaitForEmptyTrash();
+ ASSERT_NOK(env_->FileExists(dbname_ + "/" + "001.sst.trash"));
+ ASSERT_NOK(env_->FileExists(dbname_ + "/" + "002.sst.trash"));
+ ASSERT_NOK(env_->FileExists(dbname_ + "/" + "003.sst.trash"));
+}
+
+// Create a DB with 2 db_paths and generate multiple files in the 2 db_paths
+// using CompactRangeOptions. Make sure that files deleted from the first
+// db_path were deleted using DeleteScheduler and that files in the second
+// path were not.
+TEST_F(DBSSTTest, DeleteSchedulerMultipleDBPaths) {
+ std::atomic<int> bg_delete_file(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DeleteScheduler::DeleteTrashFile:DeleteFile",
+ [&](void* /*arg*/) { bg_delete_file++; });
+ // The deletion scheduler sometimes skips marking a file as trash according to
+ // a heuristic. In that case the deletion will go through the below SyncPoint.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DeleteScheduler::DeleteFile", [&](void* /*arg*/) { bg_delete_file++; });
+
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.db_paths.emplace_back(dbname_, 1024 * 100);
+ options.db_paths.emplace_back(dbname_ + "_2", 1024 * 100);
+ options.env = env_;
+
+ int64_t rate_bytes_per_sec = 1024 * 1024; // 1 MB/sec
+ Status s;
+ options.sst_file_manager.reset(
+ NewSstFileManager(env_, nullptr, "", rate_bytes_per_sec, false, &s,
+ /* max_trash_db_ratio= */ 1.1));
+
+ ASSERT_OK(s);
+ auto sfm = static_cast<SstFileManagerImpl*>(options.sst_file_manager.get());
+
+ DestroyAndReopen(options);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ WriteOptions wo;
+ wo.disableWAL = true;
+
+ // Create 4 files in L0
+ for (int i = 0; i < 4; i++) {
+ ASSERT_OK(Put("Key" + std::to_string(i), DummyString(1024, 'A'), wo));
+ ASSERT_OK(Flush());
+ }
+ // We created 4 sst files in L0
+ ASSERT_EQ("4", FilesPerLevel(0));
+ // Compaction will delete files from L0 in the first db path and generate a
+ // new file in L1 in the second db path
+ CompactRangeOptions compact_options;
+ compact_options.target_path_id = 1;
+ Slice begin("Key0");
+ Slice end("Key3");
+ ASSERT_OK(db_->CompactRange(compact_options, &begin, &end));
+ ASSERT_EQ("0,1", FilesPerLevel(0));
+
+ // Create 4 files in L0
+ for (int i = 4; i < 8; i++) {
+ ASSERT_OK(Put("Key" + std::to_string(i), DummyString(1024, 'B'), wo));
+ ASSERT_OK(Flush());
+ }
+ ASSERT_EQ("4,1", FilesPerLevel(0));
+
+ // Compaction will delete files from L0 in the first db path and generate a
+ // new file in L1 in the second db path
+ begin = "Key4";
+ end = "Key7";
+ ASSERT_OK(db_->CompactRange(compact_options, &begin, &end));
+ ASSERT_EQ("0,2", FilesPerLevel(0));
+
+ sfm->WaitForEmptyTrash();
+ ASSERT_EQ(bg_delete_file, 8);
+
+ // Compaction will delete both files and regenerate a file in L1 in the second
+ // db path. The deleted files should still be cleaned up via the delete
+ // scheduler.
+ compact_options.bottommost_level_compaction =
+ BottommostLevelCompaction::kForceOptimized;
+ ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
+ ASSERT_EQ("0,1", FilesPerLevel(0));
+
+ sfm->WaitForEmptyTrash();
+ ASSERT_EQ(bg_delete_file, 10);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBSSTTest, DestroyDBWithRateLimitedDelete) {
+ int bg_delete_file = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DeleteScheduler::DeleteTrashFile:DeleteFile",
+ [&](void* /*arg*/) { bg_delete_file++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Status s;
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.env = env_;
+ options.sst_file_manager.reset(
+ NewSstFileManager(env_, nullptr, "", 0, false, &s, 0));
+ ASSERT_OK(s);
+ DestroyAndReopen(options);
+
+ // Create 4 files in L0
+ for (int i = 0; i < 4; i++) {
+ ASSERT_OK(Put("Key" + std::to_string(i), DummyString(1024, 'A')));
+ ASSERT_OK(Flush());
+ }
+ // We created 4 sst files in L0
+ ASSERT_EQ("4", FilesPerLevel(0));
+
+ // Close DB and destroy it using DeleteScheduler
+ Close();
+
+ int num_sst_files = 0;
+ int num_wal_files = 0;
+ std::vector<std::string> db_files;
+ ASSERT_OK(env_->GetChildren(dbname_, &db_files));
+ for (std::string f : db_files) {
+ if (f.substr(f.find_last_of(".") + 1) == "sst") {
+ num_sst_files++;
+ } else if (f.substr(f.find_last_of(".") + 1) == "log") {
+ num_wal_files++;
+ }
+ }
+ ASSERT_GT(num_sst_files, 0);
+ ASSERT_GT(num_wal_files, 0);
+
+ auto sfm = static_cast<SstFileManagerImpl*>(options.sst_file_manager.get());
+
+ sfm->SetDeleteRateBytesPerSecond(1024 * 1024);
+ // Set an extra high trash ratio to prevent immediate/non-rate limited
+ // deletions
+ sfm->delete_scheduler()->SetMaxTrashDBRatio(1000.0);
+ ASSERT_OK(DestroyDB(dbname_, options));
+ sfm->WaitForEmptyTrash();
+ ASSERT_EQ(bg_delete_file, num_sst_files + num_wal_files);
+}
+
+TEST_F(DBSSTTest, DBWithMaxSpaceAllowed) {
+ std::shared_ptr<SstFileManager> sst_file_manager(NewSstFileManager(env_));
+ auto sfm = static_cast<SstFileManagerImpl*>(sst_file_manager.get());
+
+ Options options = CurrentOptions();
+ options.sst_file_manager = sst_file_manager;
+ options.disable_auto_compactions = true;
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+
+ // Generate a file containing 100 keys.
+ for (int i = 0; i < 100; i++) {
+ ASSERT_OK(Put(Key(i), rnd.RandomString(50)));
+ }
+ ASSERT_OK(Flush());
+
+ uint64_t first_file_size = 0;
+ std::unordered_map<std::string, uint64_t> files_in_db;
+ ASSERT_OK(GetAllDataFiles(kTableFile, &files_in_db, &first_file_size));
+ ASSERT_EQ(sfm->GetTotalSize(), first_file_size);
+
+ // Set the maximum allowed space usage to just above the current total size
+ sfm->SetMaxAllowedSpaceUsage(first_file_size + 1);
+
+ ASSERT_OK(Put("key1", "val1"));
+ // This flush will cause bg_error_ and will fail
+ ASSERT_NOK(Flush());
+}
+
+TEST_F(DBSSTTest, DBWithMaxSpaceAllowedWithBlobFiles) {
+ std::shared_ptr<SstFileManager> sst_file_manager(NewSstFileManager(env_));
+ auto sfm = static_cast<SstFileManagerImpl*>(sst_file_manager.get());
+
+ Options options = CurrentOptions();
+ options.sst_file_manager = sst_file_manager;
+ options.disable_auto_compactions = true;
+ options.enable_blob_files = true;
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+
+ // Generate a file containing 10 keys.
+ for (int i = 0; i < 10; i++) {
+ ASSERT_OK(Put(Key(i), rnd.RandomString(50)));
+ }
+ ASSERT_OK(Flush());
+
+ uint64_t files_size = 0;
+ uint64_t total_files_size = 0;
+ std::unordered_map<std::string, uint64_t> files_in_db;
+
+ ASSERT_OK(GetAllDataFiles(kBlobFile, &files_in_db, &files_size));
+ // Make sure blob files are considered by SstFileManager in size limits.
+ ASSERT_GT(files_size, 0);
+ total_files_size = files_size;
+ ASSERT_OK(GetAllDataFiles(kTableFile, &files_in_db, &files_size));
+ total_files_size += files_size;
+ ASSERT_EQ(sfm->GetTotalSize(), total_files_size);
+
+ // Set the maximum allowed space usage to just above the current total size.
+ sfm->SetMaxAllowedSpaceUsage(total_files_size + 1);
+
+ bool max_allowed_space_reached = false;
+ bool delete_blob_file = false;
+ // Sync point called after blob file is closed and max allowed space is
+ // checked.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "BlobFileCompletionCallback::CallBack::MaxAllowedSpaceReached",
+ [&](void* /*arg*/) { max_allowed_space_reached = true; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "BuildTable::AfterDeleteFile",
+ [&](void* /*arg*/) { delete_blob_file = true; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ {
+ "BuildTable::AfterDeleteFile",
+ "DBSSTTest::DBWithMaxSpaceAllowedWithBlobFiles:1",
+ },
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(Put("key1", "val1"));
+ // This flush will fail
+ ASSERT_NOK(Flush());
+ ASSERT_TRUE(max_allowed_space_reached);
+
+ TEST_SYNC_POINT("DBSSTTest::DBWithMaxSpaceAllowedWithBlobFiles:1");
+ ASSERT_TRUE(delete_blob_file);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBSSTTest, CancellingCompactionsWorks) {
+ std::shared_ptr<SstFileManager> sst_file_manager(NewSstFileManager(env_));
+ auto sfm = static_cast<SstFileManagerImpl*>(sst_file_manager.get());
+
+ Options options = CurrentOptions();
+ options.sst_file_manager = sst_file_manager;
+ options.level0_file_num_compaction_trigger = 2;
+ options.statistics = CreateDBStatistics();
+ DestroyAndReopen(options);
+
+ int completed_compactions = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction():CancelledCompaction", [&](void* /*arg*/) {
+ sfm->SetMaxAllowedSpaceUsage(0);
+ ASSERT_EQ(sfm->GetCompactionsReservedSize(), 0);
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:NonTrivial:AfterRun",
+ [&](void* /*arg*/) { completed_compactions++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Random rnd(301);
+
+ // Generate a file containing 10 keys.
+ for (int i = 0; i < 10; i++) {
+ ASSERT_OK(Put(Key(i), rnd.RandomString(50)));
+ }
+ ASSERT_OK(Flush());
+ uint64_t total_file_size = 0;
+ std::unordered_map<std::string, uint64_t> files_in_db;
+ ASSERT_OK(GetAllDataFiles(kTableFile, &files_in_db, &total_file_size));
+ // Set the maximum allowed space usage to just above twice the current total size
+ sfm->SetMaxAllowedSpaceUsage(2 * total_file_size + 1);
+
+ // Generate another file to trigger compaction.
+ for (int i = 0; i < 10; i++) {
+ ASSERT_OK(Put(Key(i), rnd.RandomString(50)));
+ }
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+
+ // Because we set a callback in CancelledCompaction, we actually
+ // let the compaction run
+ ASSERT_GT(completed_compactions, 0);
+ ASSERT_EQ(sfm->GetCompactionsReservedSize(), 0);
+ // Make sure the stat is bumped
+ ASSERT_GT(dbfull()->immutable_db_options().statistics.get()->getTickerCount(
+ COMPACTION_CANCELLED),
+ 0);
+ ASSERT_EQ(0,
+ dbfull()->immutable_db_options().statistics.get()->getTickerCount(
+ FILES_MARKED_TRASH));
+ ASSERT_EQ(4,
+ dbfull()->immutable_db_options().statistics.get()->getTickerCount(
+ FILES_DELETED_IMMEDIATELY));
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBSSTTest, CancellingManualCompactionsWorks) {
+ std::shared_ptr<SstFileManager> sst_file_manager(NewSstFileManager(env_));
+ auto sfm = static_cast<SstFileManagerImpl*>(sst_file_manager.get());
+
+ Options options = CurrentOptions();
+ options.sst_file_manager = sst_file_manager;
+ options.statistics = CreateDBStatistics();
+
+ FlushedFileCollector* collector = new FlushedFileCollector();
+ options.listeners.emplace_back(collector);
+
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+
+ // Generate a file containing 10 keys.
+ for (int i = 0; i < 10; i++) {
+ ASSERT_OK(Put(Key(i), rnd.RandomString(50)));
+ }
+ ASSERT_OK(Flush());
+ uint64_t total_file_size = 0;
+ std::unordered_map<std::string, uint64_t> files_in_db;
+ ASSERT_OK(GetAllDataFiles(kTableFile, &files_in_db, &total_file_size));
+ // Set the maximum allowed space usage to just above twice the current total size
+ sfm->SetMaxAllowedSpaceUsage(2 * total_file_size + 1);
+
+ // Generate another file to trigger compaction.
+ for (int i = 0; i < 10; i++) {
+ ASSERT_OK(Put(Key(i), rnd.RandomString(50)));
+ }
+ ASSERT_OK(Flush());
+
+ // OK, now trigger a manual compaction
+ ASSERT_TRUE(dbfull()
+ ->CompactRange(CompactRangeOptions(), nullptr, nullptr)
+ .IsCompactionTooLarge());
+
+ // Wait for manual compaction to get scheduled and finish
+ ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+
+ ASSERT_EQ(sfm->GetCompactionsReservedSize(), 0);
+ // Make sure the stat is bumped
+ ASSERT_EQ(dbfull()->immutable_db_options().statistics.get()->getTickerCount(
+ COMPACTION_CANCELLED),
+ 1);
+
+ // Now make sure CompactFiles also gets cancelled
+ auto l0_files = collector->GetFlushedFiles();
+ ASSERT_TRUE(
+ dbfull()
+ ->CompactFiles(ROCKSDB_NAMESPACE::CompactionOptions(), l0_files, 0)
+ .IsCompactionTooLarge());
+
+ // Wait for manual compaction to get scheduled and finish
+ ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+
+ ASSERT_EQ(dbfull()->immutable_db_options().statistics.get()->getTickerCount(
+ COMPACTION_CANCELLED),
+ 2);
+ ASSERT_EQ(sfm->GetCompactionsReservedSize(), 0);
+
+ // Now remove the space limit, let the compaction through, and make sure
+ // GetCompactionsReservedSize returns to normal
+ sfm->SetMaxAllowedSpaceUsage(0);
+ int completed_compactions = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "CompactFilesImpl:End", [&](void* /*arg*/) { completed_compactions++; });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ ASSERT_OK(dbfull()->CompactFiles(ROCKSDB_NAMESPACE::CompactionOptions(),
+ l0_files, 0));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+
+ ASSERT_EQ(sfm->GetCompactionsReservedSize(), 0);
+ ASSERT_GT(completed_compactions, 0);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBSSTTest, DBWithMaxSpaceAllowedRandomized) {
+ // This test will set a maximum allowed space for the DB, then it will
+ // keep filling the DB until the limit is reached and bg_error_ is set.
+ // When bg_error_ is set we will verify that the DB size is greater
+ // than the limit.
+
+ std::vector<int> max_space_limits_mbs = {1, 10};
+ std::atomic<bool> bg_error_set(false);
+
+ std::atomic<int> reached_max_space_on_flush(0);
+ std::atomic<int> reached_max_space_on_compaction(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::FlushMemTableToOutputFile:MaxAllowedSpaceReached",
+ [&](void* arg) {
+ Status* bg_error = static_cast<Status*>(arg);
+ bg_error_set = true;
+ reached_max_space_on_flush++;
+ // clear error to ensure compaction callback is called
+ *bg_error = Status::OK();
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction():CancelledCompaction", [&](void* arg) {
+ bool* enough_room = static_cast<bool*>(arg);
+ *enough_room = true;
+ });
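+ // Forcing enough_room to true lets compactions proceed despite the space
+ // limit, so the compaction-path MaxAllowedSpaceReached callback below can
+ // also fire.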
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "CompactionJob::FinishCompactionOutputFile:MaxAllowedSpaceReached",
+ [&](void* /*arg*/) {
+ bg_error_set = true;
+ reached_max_space_on_compaction++;
+ });
+
+ for (auto limit_mb : max_space_limits_mbs) {
+ bg_error_set = false;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearTrace();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ std::shared_ptr<SstFileManager> sst_file_manager(NewSstFileManager(env_));
+ auto sfm = static_cast<SstFileManagerImpl*>(sst_file_manager.get());
+
+ Options options = CurrentOptions();
+ options.sst_file_manager = sst_file_manager;
+ options.write_buffer_size = 1024 * 512; // 512 Kb
+ DestroyAndReopen(options);
+ Random rnd(301);
+
+ sfm->SetMaxAllowedSpaceUsage(limit_mb * 1024 * 1024);
+
+ // It is easy to detect if the test is stuck in a loop. No need for
+ // complex termination logic.
+ while (true) {
+ auto s = Put(rnd.RandomString(10), rnd.RandomString(50));
+ if (!s.ok()) {
+ break;
+ }
+ }
+ ASSERT_TRUE(bg_error_set);
+ uint64_t total_sst_files_size = 0;
+ std::unordered_map<std::string, uint64_t> files_in_db;
+ ASSERT_OK(GetAllDataFiles(kTableFile, &files_in_db, &total_sst_files_size));
+ ASSERT_GE(total_sst_files_size, limit_mb * 1024 * 1024);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ }
+
+ ASSERT_GT(reached_max_space_on_flush, 0);
+ ASSERT_GT(reached_max_space_on_compaction, 0);
+}
+
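+// For reference, a minimal sketch (not part of the tests above) of how an
+// application would wire up the space limit exercised here, using only the
+// public SstFileManager API; the 64 MB cap is an illustrative value:
+//
+//   std::shared_ptr<SstFileManager> sfm(NewSstFileManager(Env::Default()));
+//   sfm->SetMaxAllowedSpaceUsage(64ull << 20);  // 64 MB cap
+//   Options options;
+//   options.sst_file_manager = sfm;
+//   // Once a flush or compaction would push the DB past the cap, a
+//   // background error is raised or the compaction is cancelled, as the
+//   // tests above demonstrate.
+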
+TEST_F(DBSSTTest, OpenDBWithInfiniteMaxOpenFiles) {
+ // Open DB with infinite max open files
+ // - First iteration uses 1 thread to open files
+ // - Second iteration uses 5 threads to open files
+ for (int iter = 0; iter < 2; iter++) {
+ Options options;
+ options.create_if_missing = true;
+ options.write_buffer_size = 100000;
+ options.disable_auto_compactions = true;
+ options.max_open_files = -1;
+ if (iter == 0) {
+ options.max_file_opening_threads = 1;
+ } else {
+ options.max_file_opening_threads = 5;
+ }
+ options = CurrentOptions(options);
+ DestroyAndReopen(options);
+
+ // Create 12 files in L0 (then move them to L2)
+ for (int i = 0; i < 12; i++) {
+ std::string k = "L2_" + Key(i);
+ ASSERT_OK(Put(k, k + std::string(1000, 'a')));
+ ASSERT_OK(Flush());
+ }
+ CompactRangeOptions compact_options;
+ compact_options.change_level = true;
+ compact_options.target_level = 2;
+ ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
+
+ // Create 12 Files in L0
+ for (int i = 0; i < 12; i++) {
+ std::string k = "L0_" + Key(i);
+ ASSERT_OK(Put(k, k + std::string(1000, 'a')));
+ ASSERT_OK(Flush());
+ }
+ Close();
+
+ // Reopening the DB will load all existing files
+ Reopen(options);
+ ASSERT_EQ("12,0,12", FilesPerLevel(0));
+ std::vector<std::vector<FileMetaData>> files;
+ dbfull()->TEST_GetFilesMetaData(db_->DefaultColumnFamily(), &files);
+
+ for (const auto& level : files) {
+ for (const auto& file : level) {
+ ASSERT_TRUE(file.table_reader_handle != nullptr);
+ }
+ }
+
+ for (int i = 0; i < 12; i++) {
+ ASSERT_EQ(Get("L0_" + Key(i)), "L0_" + Key(i) + std::string(1000, 'a'));
+ ASSERT_EQ(Get("L2_" + Key(i)), "L2_" + Key(i) + std::string(1000, 'a'));
+ }
+ }
+}
+
+TEST_F(DBSSTTest, OpenDBWithInfiniteMaxOpenFilesSubjectToMemoryLimit) {
+ for (CacheEntryRoleOptions::Decision charge_table_reader :
+ {CacheEntryRoleOptions::Decision::kEnabled,
+ CacheEntryRoleOptions::Decision::kDisabled}) {
+ // Open DB with infinite max open files
+ // - First iteration uses 1 thread to open files
+ // - Second iteration uses 5 threads to open files
+ for (int iter = 0; iter < 2; iter++) {
+ Options options;
+ options.create_if_missing = true;
+ options.write_buffer_size = 100000;
+ options.disable_auto_compactions = true;
+ options.max_open_files = -1;
+
+ BlockBasedTableOptions table_options;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ if (iter == 0) {
+ options.max_file_opening_threads = 1;
+ } else {
+ options.max_file_opening_threads = 5;
+ }
+
+ DestroyAndReopen(options);
+
+ // Create 5 files in L0 (then move them to L2)
+ for (int i = 0; i < 5; i++) {
+ std::string k = "L2_" + Key(i);
+ ASSERT_OK(Put(k, k + std::string(1000, 'a')));
+ ASSERT_OK(Flush()) << i;
+ }
+ CompactRangeOptions compact_options;
+ compact_options.change_level = true;
+ compact_options.target_level = 2;
+ ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
+
+ // Create 5 Files in L0
+ for (int i = 0; i < 5; i++) {
+ std::string k = "L0_" + Key(i);
+ ASSERT_OK(Put(k, k + std::string(1000, 'a')));
+ ASSERT_OK(Flush());
+ }
+ Close();
+
+ table_options.cache_usage_options.options_overrides.insert(
+ {CacheEntryRole::kBlockBasedTableReader,
+ {/*.charged = */ charge_table_reader}});
+ table_options.block_cache =
+ NewLRUCache(1024 /* capacity */, 0 /* num_shard_bits */,
+ true /* strict_capacity_limit */);
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ // Reopening the DB will try to load all existing files, conditionally
+ // subject to memory limit
+ Status s = TryReopen(options);
+
+ if (charge_table_reader == CacheEntryRoleOptions::Decision::kEnabled) {
+ EXPECT_TRUE(s.IsMemoryLimit());
+ EXPECT_TRUE(s.ToString().find(
+ kCacheEntryRoleToCamelString[static_cast<std::uint32_t>(
+ CacheEntryRole::kBlockBasedTableReader)]) !=
+ std::string::npos);
+ EXPECT_TRUE(s.ToString().find("memory limit based on cache capacity") !=
+ std::string::npos);
+
+ } else {
+ EXPECT_TRUE(s.ok());
+ ASSERT_EQ("5,0,5", FilesPerLevel(0));
+ }
+ }
+ }
+}
+
+TEST_F(DBSSTTest, GetTotalSstFilesSize) {
+ // We don't propagate the oldest-key-time table property on compaction and
+ // just write 0 as the default value. This affects the exact table size,
+ // since we encode table properties as varint64. Force the time to be 0 to
+ // work around it. The workaround should be removed once we propagate the
+ // property on compaction.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "FlushJob::WriteLevel0Table:oldest_ancester_time", [&](void* arg) {
+ uint64_t* current_time = static_cast<uint64_t*>(arg);
+ *current_time = 0;
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.compression = kNoCompression;
+ DestroyAndReopen(options);
+ // Generate 5 files in L0
+ for (int i = 0; i < 5; i++) {
+ for (int j = 0; j < 10; j++) {
+ std::string val = "val_file_" + std::to_string(i);
+ ASSERT_OK(Put(Key(j), val));
+ }
+ ASSERT_OK(Flush());
+ }
+ ASSERT_EQ("5", FilesPerLevel(0));
+
+ std::vector<LiveFileMetaData> live_files_meta;
+ dbfull()->GetLiveFilesMetaData(&live_files_meta);
+ ASSERT_EQ(live_files_meta.size(), 5);
+ uint64_t single_file_size = live_files_meta[0].size;
+
+ uint64_t live_sst_files_size = 0;
+ uint64_t total_sst_files_size = 0;
+ for (const auto& file_meta : live_files_meta) {
+ live_sst_files_size += file_meta.size;
+ }
+
+ ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size",
+ &total_sst_files_size));
+ // Live SST files = 5
+ // Total SST files = 5
+ ASSERT_EQ(live_sst_files_size, 5 * single_file_size);
+ ASSERT_EQ(total_sst_files_size, 5 * single_file_size);
+
+ // hold current version
+ std::unique_ptr<Iterator> iter1(dbfull()->NewIterator(ReadOptions()));
+ ASSERT_OK(iter1->status());
+
+ // Compact 5 files into 1 file in L0
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ("0,1", FilesPerLevel(0));
+
+ live_files_meta.clear();
+ dbfull()->GetLiveFilesMetaData(&live_files_meta);
+ ASSERT_EQ(live_files_meta.size(), 1);
+
+ live_sst_files_size = 0;
+ total_sst_files_size = 0;
+ for (const auto& file_meta : live_files_meta) {
+ live_sst_files_size += file_meta.size;
+ }
+ ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size",
+ &total_sst_files_size));
+ // Live SST files = 1 (compacted file)
+ // Total SST files = 6 (5 original files + compacted file)
+ ASSERT_EQ(live_sst_files_size, 1 * single_file_size);
+ ASSERT_EQ(total_sst_files_size, 6 * single_file_size);
+
+ // hold current version
+ std::unique_ptr<Iterator> iter2(dbfull()->NewIterator(ReadOptions()));
+ ASSERT_OK(iter2->status());
+
+ // Delete all keys and compact, this will delete all live files
+ for (int i = 0; i < 10; i++) {
+ ASSERT_OK(Delete(Key(i)));
+ }
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ("", FilesPerLevel(0));
+
+ live_files_meta.clear();
+ dbfull()->GetLiveFilesMetaData(&live_files_meta);
+ ASSERT_EQ(live_files_meta.size(), 0);
+
+ ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size",
+ &total_sst_files_size));
+ // Live SST files = 0
+ // Total SST files = 6 (5 original files + compacted file)
+ ASSERT_EQ(total_sst_files_size, 6 * single_file_size);
+
+ ASSERT_OK(iter1->status());
+ iter1.reset();
+ ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size",
+ &total_sst_files_size));
+ // Live SST files = 0
+ // Total SST files = 1 (compacted file)
+ ASSERT_EQ(total_sst_files_size, 1 * single_file_size);
+
+ ASSERT_OK(iter2->status());
+ iter2.reset();
+ ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size",
+ &total_sst_files_size));
+ // Live SST files = 0
+ // Total SST files = 0
+ ASSERT_EQ(total_sst_files_size, 0);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBSSTTest, OpenDBWithoutGetFileSizeInvocations) {
+ Options options = CurrentOptions();
+ std::unique_ptr<MockEnv> env{MockEnv::Create(Env::Default())};
+ options.env = env.get();
+ options.disable_auto_compactions = true;
+ options.compression = kNoCompression;
+ options.enable_blob_files = true;
+ options.blob_file_size = 32; // create one blob per file
+ options.skip_checking_sst_file_sizes_on_db_open = true;
+
+ DestroyAndReopen(options);
+ // Generate 5 files in L0
+ for (int i = 0; i < 5; i++) {
+ for (int j = 0; j < 10; j++) {
+ std::string val = "val_file_" + std::to_string(i);
+ ASSERT_OK(Put(Key(j), val));
+ }
+ ASSERT_OK(Flush());
+ }
+ Close();
+
+ bool is_get_file_size_called = false;
+ SyncPoint::GetInstance()->SetCallBack(
+ "MockFileSystem::GetFileSize:CheckFileType", [&](void* arg) {
+ std::string* filename = reinterpret_cast<std::string*>(arg);
+ if (filename->find(".blob") != std::string::npos) {
+ is_get_file_size_called = true;
+ }
+ });
+
+ SyncPoint::GetInstance()->EnableProcessing();
+ Reopen(options);
+ ASSERT_FALSE(is_get_file_size_called);
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+
+ Destroy(options);
+}
+
+TEST_F(DBSSTTest, GetTotalSstFilesSizeVersionsFilesShared) {
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.compression = kNoCompression;
+ DestroyAndReopen(options);
+ // Generate 5 files in L0
+ for (int i = 0; i < 5; i++) {
+ ASSERT_OK(Put(Key(i), "val"));
+ ASSERT_OK(Flush());
+ }
+ ASSERT_EQ("5", FilesPerLevel(0));
+
+ std::vector<LiveFileMetaData> live_files_meta;
+ dbfull()->GetLiveFilesMetaData(&live_files_meta);
+ ASSERT_EQ(live_files_meta.size(), 5);
+ uint64_t single_file_size = live_files_meta[0].size;
+
+ uint64_t live_sst_files_size = 0;
+ uint64_t total_sst_files_size = 0;
+ for (const auto& file_meta : live_files_meta) {
+ live_sst_files_size += file_meta.size;
+ }
+
+ ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size",
+ &total_sst_files_size));
+
+ // Live SST files = 5
+ // Total SST files = 5
+ ASSERT_EQ(live_sst_files_size, 5 * single_file_size);
+ ASSERT_EQ(total_sst_files_size, 5 * single_file_size);
+
+ // hold current version
+ std::unique_ptr<Iterator> iter1(dbfull()->NewIterator(ReadOptions()));
+ ASSERT_OK(iter1->status());
+
+ // Compaction will do trivial move from L0 to L1
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ("0,5", FilesPerLevel(0));
+
+ live_files_meta.clear();
+ dbfull()->GetLiveFilesMetaData(&live_files_meta);
+ ASSERT_EQ(live_files_meta.size(), 5);
+
+ live_sst_files_size = 0;
+ total_sst_files_size = 0;
+ for (const auto& file_meta : live_files_meta) {
+ live_sst_files_size += file_meta.size;
+ }
+ ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size",
+ &total_sst_files_size));
+ // Live SST files = 5
+ // Total SST files = 5 (used in 2 version)
+ ASSERT_EQ(live_sst_files_size, 5 * single_file_size);
+ ASSERT_EQ(total_sst_files_size, 5 * single_file_size);
+
+ // hold current version
+ std::unique_ptr<Iterator> iter2(dbfull()->NewIterator(ReadOptions()));
+ ASSERT_OK(iter2->status());
+
+ // Delete all keys and compact, this will delete all live files
+ for (int i = 0; i < 5; i++) {
+ ASSERT_OK(Delete(Key(i)));
+ }
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ("", FilesPerLevel(0));
+
+ live_files_meta.clear();
+ dbfull()->GetLiveFilesMetaData(&live_files_meta);
+ ASSERT_EQ(live_files_meta.size(), 0);
+
+ ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size",
+ &total_sst_files_size));
+ // Live SST files = 0
+ // Total SST files = 5 (used in 2 version)
+ ASSERT_EQ(total_sst_files_size, 5 * single_file_size);
+
+ ASSERT_OK(iter1->status());
+ iter1.reset();
+ ASSERT_OK(iter2->status());
+ iter2.reset();
+
+ ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size",
+ &total_sst_files_size));
+ // Live SST files = 0
+ // Total SST files = 0
+ ASSERT_EQ(total_sst_files_size, 0);
+}
+
+// This test checks whether blob files are recorded by the SST File Manager
+// when the compaction job creates/deletes them, and in the case of atomic
+// flush.
+TEST_F(DBSSTTest, DBWithSFMForBlobFilesAtomicFlush) {
+ std::shared_ptr<SstFileManager> sst_file_manager(NewSstFileManager(env_));
+ auto sfm = static_cast<SstFileManagerImpl*>(sst_file_manager.get());
+ Options options = CurrentOptions();
+ options.sst_file_manager = sst_file_manager;
+ options.enable_blob_files = true;
+ options.min_blob_size = 0;
+ options.disable_auto_compactions = true;
+ options.enable_blob_garbage_collection = true;
+ options.blob_garbage_collection_age_cutoff = 0.5;
+ options.atomic_flush = true;
+
+ int files_added = 0;
+ int files_deleted = 0;
+ int files_scheduled_to_delete = 0;
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "SstFileManagerImpl::OnAddFile", [&](void* arg) {
+ const std::string* const file_path =
+ static_cast<const std::string*>(arg);
+ if (EndsWith(*file_path, ".blob")) {
+ files_added++;
+ }
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "SstFileManagerImpl::OnDeleteFile", [&](void* arg) {
+ const std::string* const file_path =
+ static_cast<const std::string*>(arg);
+ if (EndsWith(*file_path, ".blob")) {
+ files_deleted++;
+ }
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "SstFileManagerImpl::ScheduleFileDeletion", [&](void* arg) {
+ assert(arg);
+ const std::string* const file_path =
+ static_cast<const std::string*>(arg);
+ if (EndsWith(*file_path, ".blob")) {
+ ++files_scheduled_to_delete;
+ }
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ DestroyAndReopen(options);
+ Random rnd(301);
+
+ ASSERT_OK(Put("key_1", "value_1"));
+ ASSERT_OK(Put("key_2", "value_2"));
+ ASSERT_OK(Put("key_3", "value_3"));
+ ASSERT_OK(Put("key_4", "value_4"));
+ ASSERT_OK(Flush());
+
+ // Overwrite will create the garbage data.
+ ASSERT_OK(Put("key_3", "new_value_3"));
+ ASSERT_OK(Put("key_4", "new_value_4"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put("Key5", "blob_value5"));
+ ASSERT_OK(Put("Key6", "blob_value6"));
+ ASSERT_OK(Flush());
+
+ ASSERT_EQ(files_added, 3);
+ ASSERT_EQ(files_deleted, 0);
+ ASSERT_EQ(files_scheduled_to_delete, 0);
+ files_added = 0;
+
+ constexpr Slice* begin = nullptr;
+ constexpr Slice* end = nullptr;
+ // Compaction job will create a new file and delete the older files.
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), begin, end));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ ASSERT_EQ(files_added, 1);
+ ASSERT_EQ(files_scheduled_to_delete, 1);
+
+ sfm->WaitForEmptyTrash();
+
+ ASSERT_EQ(files_deleted, 1);
+
+ Close();
+ ASSERT_OK(DestroyDB(dbname_, options));
+
+ ASSERT_EQ(files_scheduled_to_delete, 4);
+
+ sfm->WaitForEmptyTrash();
+
+ ASSERT_EQ(files_deleted, 4);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
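+// The tests above rely on the SyncPoint test-hook pattern used throughout
+// this file. A minimal sketch of the lifecycle, assuming a sync point named
+// "Some::Marker" (a hypothetical name) exists in the code under test:
+//
+//   SyncPoint::GetInstance()->SetCallBack(
+//       "Some::Marker", [&](void* arg) { /* inspect or mutate *arg */ });
+//   SyncPoint::GetInstance()->EnableProcessing();
+//   // ... run the code that hits the marker ...
+//   SyncPoint::GetInstance()->DisableProcessing();
+//   SyncPoint::GetInstance()->ClearAllCallBacks();
+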
+#endif // ROCKSDB_LITE
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ RegisterCustomObjects(argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_statistics_test.cc b/src/rocksdb/db/db_statistics_test.cc
new file mode 100644
index 000000000..4d4655361
--- /dev/null
+++ b/src/rocksdb/db/db_statistics_test.cc
@@ -0,0 +1,215 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include <string>
+
+#include "db/db_test_util.h"
+#include "monitoring/thread_status_util.h"
+#include "port/stack_trace.h"
+#include "rocksdb/statistics.h"
+#include "util/random.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBStatisticsTest : public DBTestBase {
+ public:
+ DBStatisticsTest()
+ : DBTestBase("db_statistics_test", /*env_do_fsync=*/true) {}
+};
+
+TEST_F(DBStatisticsTest, CompressionStatsTest) {
+ CompressionType type;
+
+ if (Snappy_Supported()) {
+ type = kSnappyCompression;
+ fprintf(stderr, "using snappy\n");
+ } else if (Zlib_Supported()) {
+ type = kZlibCompression;
+ fprintf(stderr, "using zlib\n");
+ } else if (BZip2_Supported()) {
+ type = kBZip2Compression;
+ fprintf(stderr, "using bzip2\n");
+ } else if (LZ4_Supported()) {
+ type = kLZ4Compression;
+ fprintf(stderr, "using lz4\n");
+ } else if (XPRESS_Supported()) {
+ type = kXpressCompression;
+ fprintf(stderr, "using xpress\n");
+ } else if (ZSTD_Supported()) {
+ type = kZSTD;
+ fprintf(stderr, "using ZSTD\n");
+ } else {
+ fprintf(stderr, "skipping test, compression disabled\n");
+ return;
+ }
+
+ Options options = CurrentOptions();
+ options.compression = type;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ options.statistics->set_stats_level(StatsLevel::kExceptTimeForMutex);
+ DestroyAndReopen(options);
+
+ int kNumKeysWritten = 100000;
+
+ // Check that compressions occur and are counted when compression is turned on
+ Random rnd(301);
+ for (int i = 0; i < kNumKeysWritten; ++i) {
+ // compressible string
+ ASSERT_OK(Put(Key(i), rnd.RandomString(128) + std::string(128, 'a')));
+ }
+ ASSERT_OK(Flush());
+ ASSERT_GT(options.statistics->getTickerCount(NUMBER_BLOCK_COMPRESSED), 0);
+
+ for (int i = 0; i < kNumKeysWritten; ++i) {
+ auto r = Get(Key(i));
+ }
+ ASSERT_GT(options.statistics->getTickerCount(NUMBER_BLOCK_DECOMPRESSED), 0);
+
+ options.compression = kNoCompression;
+ DestroyAndReopen(options);
+ uint64_t currentCompressions =
+ options.statistics->getTickerCount(NUMBER_BLOCK_COMPRESSED);
+ uint64_t currentDecompressions =
+ options.statistics->getTickerCount(NUMBER_BLOCK_DECOMPRESSED);
+
+ // Check that compressions do not occur when turned off
+ for (int i = 0; i < kNumKeysWritten; ++i) {
+ // compressible string
+ ASSERT_OK(Put(Key(i), rnd.RandomString(128) + std::string(128, 'a')));
+ }
+ ASSERT_OK(Flush());
+ ASSERT_EQ(options.statistics->getTickerCount(NUMBER_BLOCK_COMPRESSED) -
+ currentCompressions,
+ 0);
+
+ for (int i = 0; i < kNumKeysWritten; ++i) {
+ auto r = Get(Key(i));
+ }
+ ASSERT_EQ(options.statistics->getTickerCount(NUMBER_BLOCK_DECOMPRESSED) -
+ currentDecompressions,
+ 0);
+}
+
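+// For reference, a minimal sketch of the Statistics wiring these tests depend
+// on (public API only; the chosen ticker and histogram are illustrative):
+//
+//   Options options;
+//   options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+//   options.statistics->set_stats_level(StatsLevel::kExceptTimeForMutex);
+//   // ... open the DB and do some work ...
+//   uint64_t n = options.statistics->getTickerCount(NUMBER_BLOCK_COMPRESSED);
+//   HistogramData h;
+//   options.statistics->histogramData(DB_WRITE, &h);
+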
+TEST_F(DBStatisticsTest, MutexWaitStatsDisabledByDefault) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ CreateAndReopenWithCF({"pikachu"}, options);
+ const uint64_t kMutexWaitDelay = 100;
+ ThreadStatusUtil::TEST_SetStateDelay(ThreadStatus::STATE_MUTEX_WAIT,
+ kMutexWaitDelay);
+ ASSERT_OK(Put("hello", "rocksdb"));
+ ASSERT_EQ(TestGetTickerCount(options, DB_MUTEX_WAIT_MICROS), 0);
+ ThreadStatusUtil::TEST_SetStateDelay(ThreadStatus::STATE_MUTEX_WAIT, 0);
+}
+
+TEST_F(DBStatisticsTest, MutexWaitStats) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ options.statistics->set_stats_level(StatsLevel::kAll);
+ CreateAndReopenWithCF({"pikachu"}, options);
+ const uint64_t kMutexWaitDelay = 100;
+ ThreadStatusUtil::TEST_SetStateDelay(ThreadStatus::STATE_MUTEX_WAIT,
+ kMutexWaitDelay);
+ ASSERT_OK(Put("hello", "rocksdb"));
+ ASSERT_GE(TestGetTickerCount(options, DB_MUTEX_WAIT_MICROS), kMutexWaitDelay);
+ ThreadStatusUtil::TEST_SetStateDelay(ThreadStatus::STATE_MUTEX_WAIT, 0);
+}
+
+TEST_F(DBStatisticsTest, ResetStats) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ DestroyAndReopen(options);
+ for (int i = 0; i < 2; ++i) {
+ // pick arbitrary ticker and histogram. On first iteration they're zero
+ // because db is unused. On second iteration they're zero due to Reset().
+ ASSERT_EQ(0, TestGetTickerCount(options, NUMBER_KEYS_WRITTEN));
+ HistogramData histogram_data;
+ options.statistics->histogramData(DB_WRITE, &histogram_data);
+ ASSERT_EQ(0.0, histogram_data.max);
+
+ if (i == 0) {
+ // The Put() makes some of the ticker/histogram stats nonzero until we
+ // Reset().
+ ASSERT_OK(Put("hello", "rocksdb"));
+ ASSERT_EQ(1, TestGetTickerCount(options, NUMBER_KEYS_WRITTEN));
+ options.statistics->histogramData(DB_WRITE, &histogram_data);
+ ASSERT_GT(histogram_data.max, 0.0);
+ ASSERT_OK(options.statistics->Reset());
+ }
+ }
+}
+
+TEST_F(DBStatisticsTest, ExcludeTickers) {
+ Options options = CurrentOptions();
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ DestroyAndReopen(options);
+ options.statistics->set_stats_level(StatsLevel::kExceptTickers);
+ ASSERT_OK(Put("foo", "value"));
+ ASSERT_EQ(0, options.statistics->getTickerCount(BYTES_WRITTEN));
+ options.statistics->set_stats_level(StatsLevel::kExceptHistogramOrTimers);
+ Reopen(options);
+ ASSERT_EQ("value", Get("foo"));
+ ASSERT_GT(options.statistics->getTickerCount(BYTES_READ), 0);
+}
+
+#ifndef ROCKSDB_LITE
+
+TEST_F(DBStatisticsTest, VerifyChecksumReadStat) {
+ Options options = CurrentOptions();
+ options.file_checksum_gen_factory = GetFileChecksumGenCrc32cFactory();
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ Reopen(options);
+
+ // Expected to be populated regardless of `PerfLevel` in user thread
+ SetPerfLevel(kDisable);
+
+ {
+ // Scenario 0: only WAL data. Not verified so require ticker to be zero.
+ ASSERT_OK(Put("foo", "value"));
+ ASSERT_OK(db_->VerifyFileChecksums(ReadOptions()));
+ ASSERT_OK(db_->VerifyChecksum());
+ ASSERT_EQ(0,
+ options.statistics->getTickerCount(VERIFY_CHECKSUM_READ_BYTES));
+ }
+
+ // Create one SST.
+ ASSERT_OK(Flush());
+ std::unordered_map<std::string, uint64_t> table_files;
+ uint64_t table_files_size = 0;
+ ASSERT_OK(GetAllDataFiles(kTableFile, &table_files, &table_files_size));
+
+ {
+ // Scenario 1: Table verified in `VerifyFileChecksums()`. This should read
+ // the whole file so we require the ticker stat exactly matches the file
+ // size.
+ ASSERT_OK(options.statistics->Reset());
+ ASSERT_OK(db_->VerifyFileChecksums(ReadOptions()));
+ ASSERT_EQ(table_files_size,
+ options.statistics->getTickerCount(VERIFY_CHECKSUM_READ_BYTES));
+ }
+
+ {
+ // Scenario 2: Table verified in `VerifyChecksum()`. This opens a
+ // `TableReader` to verify each block. It can involve duplicate reads of the
+ // same data so we set a lower-bound only.
+ ASSERT_OK(options.statistics->Reset());
+ ASSERT_OK(db_->VerifyChecksum());
+ ASSERT_GE(options.statistics->getTickerCount(VERIFY_CHECKSUM_READ_BYTES),
+ table_files_size);
+ }
+}
+
+#endif // !ROCKSDB_LITE
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_table_properties_test.cc b/src/rocksdb/db/db_table_properties_test.cc
new file mode 100644
index 000000000..981a514ad
--- /dev/null
+++ b/src/rocksdb/db/db_table_properties_test.cc
@@ -0,0 +1,625 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <memory>
+#include <unordered_set>
+#include <vector>
+
+#include "db/db_test_util.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "rocksdb/db.h"
+#include "rocksdb/types.h"
+#include "rocksdb/utilities/table_properties_collectors.h"
+#include "table/format.h"
+#include "table/meta_blocks.h"
+#include "table/table_properties_internal.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/random.h"
+
+#ifndef ROCKSDB_LITE
+
+namespace ROCKSDB_NAMESPACE {
+
+// A helper function that ensures the table properties returned by
+// `GetPropertiesOfAllTablesTest` are correct.
+// This test assumes the entry count differs across tables.
+namespace {
+
+void VerifyTableProperties(DB* db, uint64_t expected_entries_size) {
+ TablePropertiesCollection props;
+ ASSERT_OK(db->GetPropertiesOfAllTables(&props));
+
+ ASSERT_EQ(4U, props.size());
+ std::unordered_set<uint64_t> unique_entries;
+
+ // Indirect check: each table should contribute a distinct entry count
+ uint64_t sum = 0;
+ for (const auto& item : props) {
+ unique_entries.insert(item.second->num_entries);
+ sum += item.second->num_entries;
+ }
+
+ ASSERT_EQ(props.size(), unique_entries.size());
+ ASSERT_EQ(expected_entries_size, sum);
+
+ VerifySstUniqueIds(props);
+}
+} // anonymous namespace
+
+class DBTablePropertiesTest : public DBTestBase,
+ public testing::WithParamInterface<std::string> {
+ public:
+ DBTablePropertiesTest()
+ : DBTestBase("db_table_properties_test", /*env_do_fsync=*/false) {}
+ TablePropertiesCollection TestGetPropertiesOfTablesInRange(
+ std::vector<Range> ranges, std::size_t* num_properties = nullptr,
+ std::size_t* num_files = nullptr);
+};
+
+TEST_F(DBTablePropertiesTest, GetPropertiesOfAllTablesTest) {
+ Options options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = 8;
+ // Part of strategy to prevent pinning table files
+ options.max_open_files = 42;
+ Reopen(options);
+
+ // Create 4 tables
+ for (int table = 0; table < 4; ++table) {
+ // Use old meta name for table properties for one file
+ if (table == 3) {
+ SyncPoint::GetInstance()->SetCallBack(
+ "BlockBasedTableBuilder::WritePropertiesBlock:Meta", [&](void* meta) {
+ *reinterpret_cast<const std::string**>(meta) =
+ &kPropertiesBlockOldName;
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ }
+ // Build file
+ for (int i = 0; i < 10 + table; ++i) {
+ ASSERT_OK(
+ db_->Put(WriteOptions(), std::to_string(table * 100 + i), "val"));
+ }
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ }
+ SyncPoint::GetInstance()->DisableProcessing();
+ std::string original_session_id;
+ ASSERT_OK(db_->GetDbSessionId(original_session_id));
+
+ // Part of strategy to prevent pinning table files
+ SyncPoint::GetInstance()->SetCallBack(
+ "VersionEditHandler::LoadTables:skip_load_table_files",
+ [&](void* skip_load) { *reinterpret_cast<bool*>(skip_load) = true; });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ // 1. Read table properties directly from file
+ Reopen(options);
+ // Clear out auto-opened files
+ dbfull()->TEST_table_cache()->EraseUnRefEntries();
+ ASSERT_EQ(dbfull()->TEST_table_cache()->GetUsage(), 0U);
+ VerifyTableProperties(db_, 10 + 11 + 12 + 13);
+
+ // 2. Put two tables into the table cache, then read all table properties
+ Reopen(options);
+ // Clear out auto-opened files
+ dbfull()->TEST_table_cache()->EraseUnRefEntries();
+ ASSERT_EQ(dbfull()->TEST_table_cache()->GetUsage(), 0U);
+ // Fetch a key from the 1st and 2nd tables, which internally places those
+ // tables in the table cache.
+ for (int i = 0; i < 2; ++i) {
+ Get(std::to_string(i * 100 + 0));
+ }
+
+ VerifyTableProperties(db_, 10 + 11 + 12 + 13);
+
+ // 3. Put all tables to table cache
+ Reopen(options);
+ // Fetch a key from each table, which places all of them in the table cache.
+ for (int i = 0; i < 4; ++i) {
+ Get(std::to_string(i * 100 + 0));
+ }
+ VerifyTableProperties(db_, 10 + 11 + 12 + 13);
+
+ // 4. Try to read CORRUPT properties (a) directly from file, and (b)
+ // through reader on Get
+
+ // It's not practical to prevent table file read on Open, so we
+ // corrupt after open and after purging table cache.
+ for (bool direct : {true, false}) {
+ Reopen(options);
+ // Clear out auto-opened files
+ dbfull()->TEST_table_cache()->EraseUnRefEntries();
+ ASSERT_EQ(dbfull()->TEST_table_cache()->GetUsage(), 0U);
+
+ TablePropertiesCollection props;
+ ASSERT_OK(db_->GetPropertiesOfAllTables(&props));
+ std::string sst_file = props.begin()->first;
+
+ // Corrupt the file's TableProperties using session id
+ std::string contents;
+ ASSERT_OK(
+ ReadFileToString(env_->GetFileSystem().get(), sst_file, &contents));
+ size_t pos = contents.find(original_session_id);
+ ASSERT_NE(pos, std::string::npos);
+ ASSERT_OK(test::CorruptFile(env_, sst_file, static_cast<int>(pos), 1,
+ /*verify checksum fails*/ false));
+
+ // Try to read CORRUPT properties
+ if (direct) {
+ ASSERT_TRUE(db_->GetPropertiesOfAllTables(&props).IsCorruption());
+ } else {
+ bool found_corruption = false;
+ for (int i = 0; i < 4; ++i) {
+ std::string result = Get(std::to_string(i * 100 + 0));
+ if (result.find_first_of("Corruption: block checksum mismatch") !=
+ std::string::npos) {
+ found_corruption = true;
+ }
+ }
+ ASSERT_TRUE(found_corruption);
+ }
+
+ // UN-corrupt file for next iteration
+ ASSERT_OK(test::CorruptFile(env_, sst_file, static_cast<int>(pos), 1,
+ /*verify checksum fails*/ false));
+ }
+
+ SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBTablePropertiesTest, InvalidIgnored) {
+ // RocksDB versions 2.5 - 2.7 generate some properties that Block considers
+ // invalid in some way. This approximates that.
+
+ // Inject properties block data that Block considers invalid
+ SyncPoint::GetInstance()->SetCallBack(
+ "BlockBasedTableBuilder::WritePropertiesBlock:BlockData",
+ [&](void* block_data) {
+ *reinterpret_cast<Slice*>(block_data) = Slice("X");
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ // Corrupting the table properties corrupts the unique id.
+ // Ignore the unique id recorded in the manifest.
+ auto options = CurrentOptions();
+ options.verify_sst_unique_id_in_manifest = false;
+ Reopen(options);
+
+ // Build file
+ for (int i = 0; i < 10; ++i) {
+ ASSERT_OK(db_->Put(WriteOptions(), std::to_string(i), "val"));
+ }
+ ASSERT_OK(db_->Flush(FlushOptions()));
+
+ SyncPoint::GetInstance()->DisableProcessing();
+
+ // Not crashing is good enough
+ TablePropertiesCollection props;
+ ASSERT_OK(db_->GetPropertiesOfAllTables(&props));
+}
+
+TEST_F(DBTablePropertiesTest, CreateOnDeletionCollectorFactory) {
+ ConfigOptions options;
+ options.ignore_unsupported_options = false;
+
+ std::shared_ptr<TablePropertiesCollectorFactory> factory;
+ std::string id = CompactOnDeletionCollectorFactory::kClassName();
+ ASSERT_OK(
+ TablePropertiesCollectorFactory::CreateFromString(options, id, &factory));
+ auto del_factory = factory->CheckedCast<CompactOnDeletionCollectorFactory>();
+ ASSERT_NE(del_factory, nullptr);
+ ASSERT_EQ(0U, del_factory->GetWindowSize());
+ ASSERT_EQ(0U, del_factory->GetDeletionTrigger());
+ ASSERT_EQ(0.0, del_factory->GetDeletionRatio());
+ ASSERT_OK(TablePropertiesCollectorFactory::CreateFromString(
+ options, "window_size=100; deletion_trigger=90; id=" + id, &factory));
+ del_factory = factory->CheckedCast<CompactOnDeletionCollectorFactory>();
+ ASSERT_NE(del_factory, nullptr);
+ ASSERT_EQ(100U, del_factory->GetWindowSize());
+ ASSERT_EQ(90U, del_factory->GetDeletionTrigger());
+ ASSERT_EQ(0.0, del_factory->GetDeletionRatio());
+ ASSERT_OK(TablePropertiesCollectorFactory::CreateFromString(
+ options,
+ "window_size=100; deletion_trigger=90; deletion_ratio=0.5; id=" + id,
+ &factory));
+ del_factory = factory->CheckedCast<CompactOnDeletionCollectorFactory>();
+ ASSERT_NE(del_factory, nullptr);
+ ASSERT_EQ(100U, del_factory->GetWindowSize());
+ ASSERT_EQ(90U, del_factory->GetDeletionTrigger());
+ ASSERT_EQ(0.5, del_factory->GetDeletionRatio());
+}
+
+TablePropertiesCollection
+DBTablePropertiesTest::TestGetPropertiesOfTablesInRange(
+ std::vector<Range> ranges, std::size_t* num_properties,
+ std::size_t* num_files) {
+ // Since we dereference the first element of the vector, it cannot be empty;
+ // otherwise we would pass an address into random memory.
+ EXPECT_GT(ranges.size(), 0U);
+ // run the query
+ TablePropertiesCollection props;
+ EXPECT_OK(db_->GetPropertiesOfTablesInRange(
+ db_->DefaultColumnFamily(), &ranges[0], ranges.size(), &props));
+
+ // Make sure that we have received properties only for those files that fall
+ // within the requested ranges.
+ std::vector<LiveFileMetaData> vmd;
+ db_->GetLiveFilesMetaData(&vmd);
+ for (auto& md : vmd) {
+ std::string fn = md.db_path + md.name;
+ bool in_range = false;
+ for (auto& r : ranges) {
+ // smallestkey <= limit && largestkey >= start
+ if (r.limit.compare(md.smallestkey) >= 0 &&
+ r.start.compare(md.largestkey) <= 0) {
+ in_range = true;
+ EXPECT_GT(props.count(fn), 0);
+ }
+ }
+ if (!in_range) {
+ EXPECT_EQ(props.count(fn), 0);
+ }
+ }
+
+ if (num_properties) {
+ *num_properties = props.size();
+ }
+
+ if (num_files) {
+ *num_files = vmd.size();
+ }
+ return props;
+}
+
+TEST_F(DBTablePropertiesTest, GetPropertiesOfTablesInRange) {
+ // Fixed random seed
+ Random rnd(301);
+
+ Options options;
+ options.create_if_missing = true;
+ options.write_buffer_size = 4096;
+ options.max_write_buffer_number = 2;
+ options.level0_file_num_compaction_trigger = 2;
+ options.level0_slowdown_writes_trigger = 2;
+ options.level0_stop_writes_trigger = 2;
+ options.target_file_size_base = 2048;
+ options.max_bytes_for_level_base = 40960;
+ options.max_bytes_for_level_multiplier = 4;
+ options.hard_pending_compaction_bytes_limit = 16 * 1024;
+ options.num_levels = 8;
+ options.env = env_;
+
+ DestroyAndReopen(options);
+
+ // build a decent LSM
+ for (int i = 0; i < 10000; i++) {
+ ASSERT_OK(Put(test::RandomKey(&rnd, 5), rnd.RandomString(102)));
+ }
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ if (NumTableFilesAtLevel(0) == 0) {
+ ASSERT_OK(Put(test::RandomKey(&rnd, 5), rnd.RandomString(102)));
+ ASSERT_OK(Flush());
+ }
+
+ ASSERT_OK(db_->PauseBackgroundWork());
+
+ // Ensure that we have at least L0, L1 and L2
+ ASSERT_GT(NumTableFilesAtLevel(0), 0);
+ ASSERT_GT(NumTableFilesAtLevel(1), 0);
+ ASSERT_GT(NumTableFilesAtLevel(2), 0);
+
+ // Query the largest range
+ std::size_t num_properties, num_files;
+ TestGetPropertiesOfTablesInRange(
+ {Range(test::RandomKey(&rnd, 5, test::RandomKeyType::SMALLEST),
+ test::RandomKey(&rnd, 5, test::RandomKeyType::LARGEST))},
+ &num_properties, &num_files);
+ ASSERT_EQ(num_properties, num_files);
+
+ // Query the empty range
+ TestGetPropertiesOfTablesInRange(
+ {Range(test::RandomKey(&rnd, 5, test::RandomKeyType::LARGEST),
+ test::RandomKey(&rnd, 5, test::RandomKeyType::SMALLEST))},
+ &num_properties, &num_files);
+ ASSERT_GT(num_files, 0);
+ ASSERT_EQ(num_properties, 0);
+
+ // Query the middle range
+ TestGetPropertiesOfTablesInRange(
+ {Range(test::RandomKey(&rnd, 5, test::RandomKeyType::MIDDLE),
+ test::RandomKey(&rnd, 5, test::RandomKeyType::LARGEST))},
+ &num_properties, &num_files);
+ ASSERT_GT(num_files, 0);
+ ASSERT_GT(num_files, num_properties);
+ ASSERT_GT(num_properties, 0);
+
+ // Query a bunch of random ranges
+ for (int j = 0; j < 100; j++) {
+ // create a bunch of ranges
+ std::vector<std::string> random_keys;
+ // Random::Uniform() can return zero, and passing empty ranges would make
+ // TestGetPropertiesOfTablesInRange() dereference random memory via the
+ // empty ranges[0]. So keep n greater than zero, and even, since the loop
+ // below requires random_keys.size() to be even.
+ auto n = 2 * (rnd.Uniform(50) + 1);
+
+ for (uint32_t i = 0; i < n; ++i) {
+ random_keys.push_back(test::RandomKey(&rnd, 5));
+ }
+
+ ASSERT_GT(random_keys.size(), 0U);
+ ASSERT_EQ((random_keys.size() % 2), 0U);
+
+ std::vector<Range> ranges;
+ auto it = random_keys.begin();
+ while (it != random_keys.end()) {
+ ranges.push_back(Range(*it, *(it + 1)));
+ it += 2;
+ }
+
+ TestGetPropertiesOfTablesInRange(std::move(ranges));
+ }
+}
+
+TEST_F(DBTablePropertiesTest, GetColumnFamilyNameProperty) {
+ std::string kExtraCfName = "pikachu";
+ CreateAndReopenWithCF({kExtraCfName}, CurrentOptions());
+
+ // Create one table per CF, then verify it was created with the column family
+ // name property.
+ for (uint32_t cf = 0; cf < 2; ++cf) {
+ ASSERT_OK(Put(cf, "key", "val"));
+ ASSERT_OK(Flush(cf));
+
+ TablePropertiesCollection fname_to_props;
+ ASSERT_OK(db_->GetPropertiesOfAllTables(handles_[cf], &fname_to_props));
+ ASSERT_EQ(1U, fname_to_props.size());
+
+ std::string expected_cf_name;
+ if (cf > 0) {
+ expected_cf_name = kExtraCfName;
+ } else {
+ expected_cf_name = kDefaultColumnFamilyName;
+ }
+ ASSERT_EQ(expected_cf_name,
+ fname_to_props.begin()->second->column_family_name);
+ ASSERT_EQ(cf, static_cast<uint32_t>(
+ fname_to_props.begin()->second->column_family_id));
+ }
+}
+
+TEST_F(DBTablePropertiesTest, GetDbIdentifiersProperty) {
+ CreateAndReopenWithCF({"goku"}, CurrentOptions());
+
+ for (uint32_t cf = 0; cf < 2; ++cf) {
+ ASSERT_OK(Put(cf, "key", "val"));
+ ASSERT_OK(Put(cf, "foo", "bar"));
+ ASSERT_OK(Flush(cf));
+
+ TablePropertiesCollection fname_to_props;
+ ASSERT_OK(db_->GetPropertiesOfAllTables(handles_[cf], &fname_to_props));
+ ASSERT_EQ(1U, fname_to_props.size());
+
+ std::string id, sid;
+ ASSERT_OK(db_->GetDbIdentity(id));
+ ASSERT_OK(db_->GetDbSessionId(sid));
+ ASSERT_EQ(id, fname_to_props.begin()->second->db_id);
+ ASSERT_EQ(sid, fname_to_props.begin()->second->db_session_id);
+ }
+}
+
+class DBTableHostnamePropertyTest
+ : public DBTestBase,
+ public ::testing::WithParamInterface<std::tuple<int, std::string>> {
+ public:
+ DBTableHostnamePropertyTest()
+ : DBTestBase("db_table_hostname_property_test",
+ /*env_do_fsync=*/false) {}
+};
+
+TEST_P(DBTableHostnamePropertyTest, DbHostLocationProperty) {
+ option_config_ = std::get<0>(GetParam());
+ Options opts = CurrentOptions();
+ std::string expected_host_id = std::get<1>(GetParam());
+ if (expected_host_id == kHostnameForDbHostId) {
+ ASSERT_OK(env_->GetHostNameString(&expected_host_id));
+ } else {
+ opts.db_host_id = expected_host_id;
+ }
+ CreateAndReopenWithCF({"goku"}, opts);
+
+ for (uint32_t cf = 0; cf < 2; ++cf) {
+ ASSERT_OK(Put(cf, "key", "val"));
+ ASSERT_OK(Put(cf, "foo", "bar"));
+ ASSERT_OK(Flush(cf));
+
+ TablePropertiesCollection fname_to_props;
+ ASSERT_OK(db_->GetPropertiesOfAllTables(handles_[cf], &fname_to_props));
+ ASSERT_EQ(1U, fname_to_props.size());
+
+ ASSERT_EQ(fname_to_props.begin()->second->db_host_id, expected_host_id);
+ }
+}
+
+INSTANTIATE_TEST_CASE_P(
+ DBTableHostnamePropertyTest, DBTableHostnamePropertyTest,
+ ::testing::Values(
+ // OptionConfig, override db_host_location
+ std::make_tuple(DBTestBase::OptionConfig::kDefault,
+ kHostnameForDbHostId),
+ std::make_tuple(DBTestBase::OptionConfig::kDefault, "foobar"),
+ std::make_tuple(DBTestBase::OptionConfig::kDefault, ""),
+ std::make_tuple(DBTestBase::OptionConfig::kPlainTableFirstBytePrefix,
+ kHostnameForDbHostId),
+ std::make_tuple(DBTestBase::OptionConfig::kPlainTableFirstBytePrefix,
+ "foobar"),
+ std::make_tuple(DBTestBase::OptionConfig::kPlainTableFirstBytePrefix,
+ "")));
+
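+// The deletion-triggered compaction tests below configure the collector via
+// the public factory. A minimal sketch, assuming the usual three-argument
+// overload (the values shown are illustrative only):
+//
+//   auto factory = NewCompactOnDeletionCollectorFactory(
+//       /*sliding_window_size=*/100, /*deletion_trigger=*/90,
+//       /*deletion_ratio=*/0.0);
+//   Options opts;
+//   opts.table_properties_collector_factories.emplace_back(factory);
+//   // Files whose collector observes enough deletions are marked for
+//   // compaction (CompactionReason::kFilesMarkedForCompaction).
+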
+class DeletionTriggeredCompactionTestListener : public EventListener {
+ public:
+ void OnCompactionBegin(DB*, const CompactionJobInfo& ci) override {
+ ASSERT_EQ(ci.compaction_reason,
+ CompactionReason::kFilesMarkedForCompaction);
+ }
+
+ void OnCompactionCompleted(DB*, const CompactionJobInfo& ci) override {
+ ASSERT_EQ(ci.compaction_reason,
+ CompactionReason::kFilesMarkedForCompaction);
+ }
+};
+
+TEST_P(DBTablePropertiesTest, DeletionTriggeredCompactionMarking) {
+ int kNumKeys = 1000;
+ int kWindowSize = 100;
+ int kNumDelsTrigger = 90;
+ std::shared_ptr<TablePropertiesCollectorFactory> compact_on_del =
+ NewCompactOnDeletionCollectorFactory(kWindowSize, kNumDelsTrigger);
+
+ Options opts = CurrentOptions();
+ opts.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ opts.table_properties_collector_factories.emplace_back(compact_on_del);
+
+ if (GetParam() == "kCompactionStyleUniversal") {
+ opts.compaction_style = kCompactionStyleUniversal;
+ }
+ Reopen(opts);
+
+ // add an L1 file to prevent tombstones from dropping due to obsolescence
+ // during flush
+ ASSERT_OK(Put(Key(0), "val"));
+ ASSERT_OK(Flush());
+ MoveFilesToLevel(1);
+
+ DeletionTriggeredCompactionTestListener* listener =
+ new DeletionTriggeredCompactionTestListener();
+ opts.listeners.emplace_back(listener);
+ Reopen(opts);
+
+ for (int i = 0; i < kNumKeys; ++i) {
+ if (i >= kNumKeys - kWindowSize &&
+ i < kNumKeys - kWindowSize + kNumDelsTrigger) {
+ ASSERT_OK(Delete(Key(i)));
+ } else {
+ ASSERT_OK(Put(Key(i), "val"));
+ }
+ }
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+
+ // Change the window size and deletion trigger and ensure new values take
+ // effect
+ kWindowSize = 50;
+ kNumDelsTrigger = 40;
+ static_cast<CompactOnDeletionCollectorFactory*>(compact_on_del.get())
+ ->SetWindowSize(kWindowSize);
+ static_cast<CompactOnDeletionCollectorFactory*>(compact_on_del.get())
+ ->SetDeletionTrigger(kNumDelsTrigger);
+ for (int i = 0; i < kNumKeys; ++i) {
+ if (i >= kNumKeys - kWindowSize &&
+ i < kNumKeys - kWindowSize + kNumDelsTrigger) {
+ ASSERT_OK(Delete(Key(i)));
+ } else {
+ ASSERT_OK(Put(Key(i), "val"));
+ }
+ }
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+
+ // Change the window size to disable delete triggered compaction
+ kWindowSize = 0;
+ static_cast<CompactOnDeletionCollectorFactory*>(compact_on_del.get())
+ ->SetWindowSize(kWindowSize);
+ static_cast<CompactOnDeletionCollectorFactory*>(compact_on_del.get())
+ ->SetDeletionTrigger(kNumDelsTrigger);
+ for (int i = 0; i < kNumKeys; ++i) {
+ if (i >= kNumKeys - kWindowSize &&
+ i < kNumKeys - kWindowSize + kNumDelsTrigger) {
+ ASSERT_OK(Delete(Key(i)));
+ } else {
+ ASSERT_OK(Put(Key(i), "val"));
+ }
+ }
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(1, NumTableFilesAtLevel(0));
+ ASSERT_LT(0, opts.statistics->getTickerCount(COMPACT_WRITE_BYTES_MARKED));
+ ASSERT_LT(0, opts.statistics->getTickerCount(COMPACT_READ_BYTES_MARKED));
+}
+
+TEST_P(DBTablePropertiesTest, RatioBasedDeletionTriggeredCompactionMarking) {
+ constexpr int kNumKeys = 1000;
+ constexpr int kWindowSize = 0;
+ constexpr int kNumDelsTrigger = 0;
+ constexpr double kDeletionRatio = 0.1;
+ std::shared_ptr<TablePropertiesCollectorFactory> compact_on_del =
+ NewCompactOnDeletionCollectorFactory(kWindowSize, kNumDelsTrigger,
+ kDeletionRatio);
+
+ Options opts = CurrentOptions();
+ opts.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ opts.table_properties_collector_factories.emplace_back(compact_on_del);
+
+ Reopen(opts);
+
+ // Add an L2 file to prevent tombstones from dropping due to obsolescence
+ // during flush
+ ASSERT_OK(Put(Key(0), "val"));
+ ASSERT_OK(Flush());
+ MoveFilesToLevel(2);
+
+ auto* listener = new DeletionTriggeredCompactionTestListener();
+ opts.listeners.emplace_back(listener);
+ Reopen(opts);
+
+ // Generate one L0 file with kNumKeys Puts.
+ for (int i = 0; i < kNumKeys; ++i) {
+ ASSERT_OK(Put(Key(i), "not important"));
+ }
+ ASSERT_OK(Flush());
+
+ // Generate another L0 file with kNumKeys Deletes.
+ // This file, due to deletion ratio, will trigger compaction: 2@0 files to L1.
+ // The resulting L1 file has only one tombstone for user key 'Key(0)'.
+ // Again, due to deletion ratio, a compaction will be triggered: 1@1 + 1@2
+ // files to L2. However, the resulting file is empty because the tombstone
+ // and value are both dropped.
+ for (int i = 0; i < kNumKeys; ++i) {
+ ASSERT_OK(Delete(Key(i)));
+ }
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ for (int i = 0; i < 3; ++i) {
+ ASSERT_EQ(0, NumTableFilesAtLevel(i));
+ }
+}
+
+INSTANTIATE_TEST_CASE_P(DBTablePropertiesTest, DBTablePropertiesTest,
+ ::testing::Values("kCompactionStyleLevel",
+ "kCompactionStyleUniversal"));
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // ROCKSDB_LITE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_tailing_iter_test.cc b/src/rocksdb/db/db_tailing_iter_test.cc
new file mode 100644
index 000000000..af3194ac4
--- /dev/null
+++ b/src/rocksdb/db/db_tailing_iter_test.cc
@@ -0,0 +1,604 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+// Introduction of SyncPoint effectively disabled building and running this
+// test in Release builds, which is a pity because it is a good test.
+#if !defined(ROCKSDB_LITE)
+
+#include "db/db_test_util.h"
+#include "db/forward_iterator.h"
+#include "port/stack_trace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBTestTailingIterator : public DBTestBase,
+ public ::testing::WithParamInterface<bool> {
+ public:
+ DBTestTailingIterator()
+ : DBTestBase("db_tailing_iterator_test", /*env_do_fsync=*/true) {}
+};
+
+INSTANTIATE_TEST_CASE_P(DBTestTailingIterator, DBTestTailingIterator,
+ ::testing::Bool());
+
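+// A minimal sketch of the tailing-iterator usage pattern exercised below,
+// assuming `db` is an open DB* (public API only):
+//
+//   ReadOptions ro;
+//   ro.tailing = true;  // the iterator can see writes made after creation
+//   std::unique_ptr<Iterator> it(db->NewIterator(ro));
+//   it->SeekToFirst();          // may be !Valid() if the DB is still empty
+//   // ... writes happen ...
+//   it->Seek("some_key");       // re-seek to observe the new data
+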
+TEST_P(DBTestTailingIterator, TailingIteratorSingle) {
+ ReadOptions read_options;
+ read_options.tailing = true;
+ if (GetParam()) {
+ read_options.async_io = true;
+ }
+
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ iter->SeekToFirst();
+ ASSERT_TRUE(!iter->Valid());
+ ASSERT_OK(iter->status());
+
+ // add a record and check that iter can see it
+ ASSERT_OK(db_->Put(WriteOptions(), "mirko", "fodor"));
+ iter->SeekToFirst();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().ToString(), "mirko");
+
+ iter->Next();
+ ASSERT_TRUE(!iter->Valid());
+}
+
+TEST_P(DBTestTailingIterator, TailingIteratorKeepAdding) {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ ReadOptions read_options;
+ read_options.tailing = true;
+ if (GetParam()) {
+ read_options.async_io = true;
+ }
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options, handles_[1]));
+ ASSERT_OK(iter->status());
+ std::string value(1024, 'a');
+
+ const int num_records = 10000;
+ for (int i = 0; i < num_records; ++i) {
+ char buf[32];
+ snprintf(buf, sizeof(buf), "%016d", i);
+
+ Slice key(buf, 16);
+ ASSERT_OK(Put(1, key, value));
+
+ iter->Seek(key);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(key), 0);
+ }
+}
+
+TEST_P(DBTestTailingIterator, TailingIteratorSeekToNext) {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ ReadOptions read_options;
+ read_options.tailing = true;
+ if (GetParam()) {
+ read_options.async_io = true;
+ }
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options, handles_[1]));
+ ASSERT_OK(iter->status());
+ std::unique_ptr<Iterator> itern(db_->NewIterator(read_options, handles_[1]));
+ ASSERT_OK(itern->status());
+ std::string value(1024, 'a');
+
+ const int num_records = 1000;
+ for (int i = 1; i < num_records; ++i) {
+ char buf1[32];
+ char buf2[32];
+ snprintf(buf1, sizeof(buf1), "00a0%016d", i * 5);
+
+ Slice key(buf1, 20);
+ ASSERT_OK(Put(1, key, value));
+
+ if (i % 100 == 99) {
+ ASSERT_OK(Flush(1));
+ }
+
+ snprintf(buf2, sizeof(buf2), "00a0%016d", i * 5 - 2);
+ Slice target(buf2, 20);
+ iter->Seek(target);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(key), 0);
+ if (i == 1) {
+ itern->SeekToFirst();
+ } else {
+ itern->Next();
+ }
+ ASSERT_TRUE(itern->Valid());
+ ASSERT_EQ(itern->key().compare(key), 0);
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ for (int i = 2 * num_records; i > 0; --i) {
+ char buf1[32];
+ char buf2[32];
+ snprintf(buf1, sizeof(buf1), "00a0%016d", i * 5);
+
+ Slice key(buf1, 20);
+ ASSERT_OK(Put(1, key, value));
+
+ if (i % 100 == 99) {
+ ASSERT_OK(Flush(1));
+ }
+
+ snprintf(buf2, sizeof(buf2), "00a0%016d", i * 5 - 2);
+ Slice target(buf2, 20);
+ iter->Seek(target);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(key), 0);
+ }
+}
+
+TEST_P(DBTestTailingIterator, TailingIteratorTrimSeekToNext) {
+ const uint64_t k150KB = 150 * 1024;
+ Options options;
+ options.write_buffer_size = k150KB;
+ options.max_write_buffer_number = 3;
+ options.min_write_buffer_number_to_merge = 2;
+ options.env = env_;
+ CreateAndReopenWithCF({"pikachu"}, options);
+ ReadOptions read_options;
+ read_options.tailing = true;
+ if (GetParam()) {
+ read_options.async_io = true;
+ }
+ int num_iters, deleted_iters;
+
+ char bufe[32];
+ snprintf(bufe, sizeof(bufe), "00b0%016d", 0);
+ Slice keyu(bufe, 20);
+ read_options.iterate_upper_bound = &keyu;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options, handles_[1]));
+ ASSERT_OK(iter->status());
+ std::unique_ptr<Iterator> itern(db_->NewIterator(read_options, handles_[1]));
+ ASSERT_OK(itern->status());
+ std::unique_ptr<Iterator> iterh(db_->NewIterator(read_options, handles_[1]));
+ ASSERT_OK(iterh->status());
+ std::string value(1024, 'a');
+ bool file_iters_deleted = false;
+ bool file_iters_renewed_null = false;
+ bool file_iters_renewed_copy = false;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "ForwardIterator::SeekInternal:Return", [&](void* arg) {
+ ForwardIterator* fiter = reinterpret_cast<ForwardIterator*>(arg);
+ ASSERT_TRUE(!file_iters_deleted ||
+ fiter->TEST_CheckDeletedIters(&deleted_iters, &num_iters));
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "ForwardIterator::Next:Return", [&](void* arg) {
+ ForwardIterator* fiter = reinterpret_cast<ForwardIterator*>(arg);
+ ASSERT_TRUE(!file_iters_deleted ||
+ fiter->TEST_CheckDeletedIters(&deleted_iters, &num_iters));
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "ForwardIterator::RenewIterators:Null",
+ [&](void* /*arg*/) { file_iters_renewed_null = true; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "ForwardIterator::RenewIterators:Copy",
+ [&](void* /*arg*/) { file_iters_renewed_copy = true; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ const int num_records = 1000;
+ for (int i = 1; i < num_records; ++i) {
+ char buf1[32];
+ char buf2[32];
+ char buf3[32];
+ char buf4[32];
+ snprintf(buf1, sizeof(buf1), "00a0%016d", i * 5);
+ snprintf(buf3, sizeof(buf3), "00b0%016d", i * 5);
+
+ Slice key(buf1, 20);
+ ASSERT_OK(Put(1, key, value));
+ Slice keyn(buf3, 20);
+ ASSERT_OK(Put(1, keyn, value));
+
+ if (i % 100 == 99) {
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ if (i == 299) {
+ file_iters_deleted = true;
+ }
+ snprintf(buf4, sizeof(buf4), "00a0%016d", i * 5 / 2);
+ Slice target(buf4, 20);
+ iterh->Seek(target);
+ ASSERT_TRUE(iter->Valid());
+ for (int j = (i + 1) * 5 / 2; j < i * 5; j += 5) {
+ iterh->Next();
+ ASSERT_TRUE(iterh->Valid());
+ }
+ if (i == 299) {
+ file_iters_deleted = false;
+ }
+ }
+
+ file_iters_deleted = true;
+ snprintf(buf2, sizeof(buf2), "00a0%016d", i * 5 - 2);
+ Slice target(buf2, 20);
+ iter->Seek(target);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(key), 0);
+ ASSERT_LE(num_iters, 1);
+ if (i == 1) {
+ itern->SeekToFirst();
+ } else {
+ itern->Next();
+ }
+ ASSERT_TRUE(itern->Valid());
+ ASSERT_EQ(itern->key().compare(key), 0);
+ ASSERT_LE(num_iters, 1);
+ file_iters_deleted = false;
+ }
+ ASSERT_TRUE(file_iters_renewed_null);
+ ASSERT_TRUE(file_iters_renewed_copy);
+ iter = nullptr;
+ itern = nullptr;
+ iterh = nullptr;
+ BlockBasedTableOptions table_options;
+ table_options.no_block_cache = true;
+ table_options.block_cache_compressed = nullptr;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ read_options.read_tier = kBlockCacheTier;
+ std::unique_ptr<Iterator> iteri(db_->NewIterator(read_options, handles_[1]));
+ ASSERT_OK(iteri->status());
+ char buf5[32];
+ snprintf(buf5, sizeof(buf5), "00a0%016d", (num_records / 2) * 5 - 2);
+ Slice target1(buf5, 20);
+ iteri->Seek(target1);
+ ASSERT_TRUE(iteri->status().IsIncomplete());
+ iteri = nullptr;
+
+ read_options.read_tier = kReadAllTier;
+ options.table_factory.reset(NewBlockBasedTableFactory());
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ iter.reset(db_->NewIterator(read_options, handles_[1]));
+ ASSERT_OK(iter->status());
+ for (int i = 2 * num_records; i > 0; --i) {
+ char buf1[32];
+ char buf2[32];
+ snprintf(buf1, sizeof(buf1), "00a0%016d", i * 5);
+
+ Slice key(buf1, 20);
+ ASSERT_OK(Put(1, key, value));
+
+ if (i % 100 == 99) {
+ ASSERT_OK(Flush(1));
+ }
+
+ snprintf(buf2, sizeof(buf2), "00a0%016d", i * 5 - 2);
+ Slice target(buf2, 20);
+ iter->Seek(target);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(key), 0);
+ }
+}
+
+TEST_P(DBTestTailingIterator, TailingIteratorDeletes) {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ ReadOptions read_options;
+ read_options.tailing = true;
+ if (GetParam()) {
+ read_options.async_io = true;
+ }
+
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options, handles_[1]));
+ ASSERT_OK(iter->status());
+
+ // write a single record, read it using the iterator, then delete it
+ ASSERT_OK(Put(1, "0test", "test"));
+ iter->SeekToFirst();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().ToString(), "0test");
+ ASSERT_OK(Delete(1, "0test"));
+
+ // write many more records
+ const int num_records = 10000;
+ std::string value(1024, 'A');
+
+ for (int i = 0; i < num_records; ++i) {
+ char buf[32];
+ snprintf(buf, sizeof(buf), "1%015d", i);
+
+ Slice key(buf, 16);
+ ASSERT_OK(Put(1, key, value));
+ }
+
+ // force a flush to make sure that no records are read from memtable
+ ASSERT_OK(Flush(1));
+
+ // skip "0test"
+ iter->Next();
+
+ // make sure we can read all new records using the existing iterator
+ int count = 0;
+ for (; iter->Valid(); iter->Next(), ++count)
+ ;
+
+ ASSERT_EQ(count, num_records);
+}
+
+TEST_P(DBTestTailingIterator, TailingIteratorPrefixSeek) {
+ ReadOptions read_options;
+ read_options.tailing = true;
+ if (GetParam()) {
+ read_options.async_io = true;
+ }
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.disable_auto_compactions = true;
+ options.prefix_extractor.reset(NewFixedPrefixTransform(2));
+ options.memtable_factory.reset(NewHashSkipListRepFactory(16));
+ options.allow_concurrent_memtable_write = false;
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options, handles_[1]));
+ ASSERT_OK(iter->status());
+ ASSERT_OK(Put(1, "0101", "test"));
+
+ ASSERT_OK(Flush(1));
+
+ ASSERT_OK(Put(1, "0202", "test"));
+
+ // Seek(0102) shouldn't find any records since 0202 has a different prefix
+ iter->Seek("0102");
+ ASSERT_TRUE(!iter->Valid());
+
+ iter->Seek("0202");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().ToString(), "0202");
+
+ iter->Next();
+ ASSERT_TRUE(!iter->Valid());
+}
+
+TEST_P(DBTestTailingIterator, TailingIteratorIncomplete) {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ ReadOptions read_options;
+ read_options.tailing = true;
+ if (GetParam()) {
+ read_options.async_io = true;
+ }
+ read_options.read_tier = kBlockCacheTier;
+
+ std::string key("key");
+ std::string value("value");
+
+ ASSERT_OK(db_->Put(WriteOptions(), key, value));
+
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ ASSERT_OK(iter->status());
+ iter->SeekToFirst();
+ // we either see the entry or it's not in cache
+ ASSERT_TRUE(iter->Valid() || iter->status().IsIncomplete());
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ iter->SeekToFirst();
+ // should still be true after compaction
+ ASSERT_TRUE(iter->Valid() || iter->status().IsIncomplete());
+}
+
+TEST_P(DBTestTailingIterator, TailingIteratorSeekToSame) {
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.write_buffer_size = 1000;
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ ReadOptions read_options;
+ read_options.tailing = true;
+ if (GetParam()) {
+ read_options.async_io = true;
+ }
+ const int NROWS = 10000;
+ // Write rows with keys 00000, 00002, 00004 etc.
+ for (int i = 0; i < NROWS; ++i) {
+ char buf[100];
+ snprintf(buf, sizeof(buf), "%05d", 2 * i);
+ std::string key(buf);
+ std::string value("value");
+ ASSERT_OK(db_->Put(WriteOptions(), key, value));
+ }
+
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ ASSERT_OK(iter->status());
+ // Seek to 00001. We expect to find 00002.
+ std::string start_key = "00001";
+ iter->Seek(start_key);
+ ASSERT_TRUE(iter->Valid());
+
+ std::string found = iter->key().ToString();
+ ASSERT_EQ("00002", found);
+
+ // Now seek to the same key. The iterator should remain in the same
+ // position.
+ iter->Seek(found);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(found, iter->key().ToString());
+}
+
+// Sets iterate_upper_bound and verifies that ForwardIterator doesn't call
+// Seek() on immutable iterators when target key is >= prev_key and all
+// iterators, including the memtable iterator, are over the upper bound.
+TEST_P(DBTestTailingIterator, TailingIteratorUpperBound) {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+
+ const Slice upper_bound("20", 3);
+ ReadOptions read_options;
+ read_options.tailing = true;
+ read_options.iterate_upper_bound = &upper_bound;
+ if (GetParam()) {
+ read_options.async_io = true;
+ }
+ ASSERT_OK(Put(1, "11", "11"));
+ ASSERT_OK(Put(1, "12", "12"));
+ ASSERT_OK(Put(1, "22", "22"));
+ ASSERT_OK(Flush(1)); // flush all those keys to an immutable SST file
+
+ // Add another key to the memtable.
+ ASSERT_OK(Put(1, "21", "21"));
+
+ std::unique_ptr<Iterator> it(db_->NewIterator(read_options, handles_[1]));
+ ASSERT_OK(it->status());
+ it->Seek("12");
+ ASSERT_TRUE(it->Valid());
+ ASSERT_EQ("12", it->key().ToString());
+
+ it->Next();
+ // Not valid since "21" is over the upper bound.
+ ASSERT_FALSE(it->Valid());
+ ASSERT_OK(it->status());
+ // This keeps track of the number of times NeedToSeekImmutable() was true.
+ int immutable_seeks = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "ForwardIterator::SeekInternal:Immutable",
+ [&](void* /*arg*/) { ++immutable_seeks; });
+
+ // Seek to 13. This should not require any immutable seeks.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ it->Seek("13");
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+ ASSERT_FALSE(it->Valid());
+ ASSERT_OK(it->status());
+ if (GetParam()) {
+ ASSERT_EQ(1, immutable_seeks);
+ } else {
+ ASSERT_EQ(0, immutable_seeks);
+ }
+}
+
+TEST_P(DBTestTailingIterator, TailingIteratorGap) {
+ // level 1: [20, 25] [35, 40]
+ // level 2: [10 - 15] [45 - 50]
+ // level 3: [20, 30, 40]
+  // Previously there was a bug in the tailing iterator: if there was a gap in
+  // a lower level, a key would be skipped if it fell between the largest key
+  // of file n and the smallest key of file n+1 and both files fit in that
+  // gap. In this example, 25 < key < 35.
+ // https://github.com/facebook/rocksdb/issues/1372
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+
+ ReadOptions read_options;
+ read_options.tailing = true;
+ if (GetParam()) {
+ read_options.async_io = true;
+ }
+ ASSERT_OK(Put(1, "20", "20"));
+ ASSERT_OK(Put(1, "30", "30"));
+ ASSERT_OK(Put(1, "40", "40"));
+ ASSERT_OK(Flush(1));
+ MoveFilesToLevel(3, 1);
+
+ ASSERT_OK(Put(1, "10", "10"));
+ ASSERT_OK(Put(1, "15", "15"));
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(Put(1, "45", "45"));
+ ASSERT_OK(Put(1, "50", "50"));
+ ASSERT_OK(Flush(1));
+ MoveFilesToLevel(2, 1);
+
+ ASSERT_OK(Put(1, "20", "20"));
+ ASSERT_OK(Put(1, "25", "25"));
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(Put(1, "35", "35"));
+ ASSERT_OK(Put(1, "40", "40"));
+ ASSERT_OK(Flush(1));
+ MoveFilesToLevel(1, 1);
+
+ ColumnFamilyMetaData meta;
+ db_->GetColumnFamilyMetaData(handles_[1], &meta);
+
+ std::unique_ptr<Iterator> it(db_->NewIterator(read_options, handles_[1]));
+ it->Seek("30");
+ ASSERT_TRUE(it->Valid());
+ ASSERT_EQ("30", it->key().ToString());
+
+ it->Next();
+ ASSERT_TRUE(it->Valid());
+ ASSERT_EQ("35", it->key().ToString());
+
+ it->Next();
+ ASSERT_TRUE(it->Valid());
+ ASSERT_EQ("40", it->key().ToString());
+
+ ASSERT_OK(it->status());
+}
+
+TEST_P(DBTestTailingIterator, SeekWithUpperBoundBug) {
+ ReadOptions read_options;
+ read_options.tailing = true;
+ if (GetParam()) {
+ read_options.async_io = true;
+ }
+ const Slice upper_bound("cc", 3);
+ read_options.iterate_upper_bound = &upper_bound;
+
+ // 1st L0 file
+ ASSERT_OK(db_->Put(WriteOptions(), "aa", "SEEN"));
+ ASSERT_OK(Flush());
+
+ // 2nd L0 file
+ ASSERT_OK(db_->Put(WriteOptions(), "zz", "NOT-SEEN"));
+ ASSERT_OK(Flush());
+
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ ASSERT_OK(iter->status());
+
+ iter->Seek("aa");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().ToString(), "aa");
+}
+
+TEST_P(DBTestTailingIterator, SeekToFirstWithUpperBoundBug) {
+ ReadOptions read_options;
+ read_options.tailing = true;
+ if (GetParam()) {
+ read_options.async_io = true;
+ }
+ const Slice upper_bound("cc", 3);
+ read_options.iterate_upper_bound = &upper_bound;
+
+ // 1st L0 file
+ ASSERT_OK(db_->Put(WriteOptions(), "aa", "SEEN"));
+ ASSERT_OK(Flush());
+
+ // 2nd L0 file
+ ASSERT_OK(db_->Put(WriteOptions(), "zz", "NOT-SEEN"));
+ ASSERT_OK(Flush());
+
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ ASSERT_OK(iter->status());
+
+ iter->SeekToFirst();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().ToString(), "aa");
+
+ iter->Next();
+ ASSERT_FALSE(iter->Valid());
+
+ iter->SeekToFirst();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().ToString(), "aa");
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // !defined(ROCKSDB_LITE)
+
+int main(int argc, char** argv) {
+#if !defined(ROCKSDB_LITE)
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+#else
+ (void)argc;
+ (void)argv;
+ return 0;
+#endif
+}
diff --git a/src/rocksdb/db/db_test.cc b/src/rocksdb/db/db_test.cc
new file mode 100644
index 000000000..9575248b4
--- /dev/null
+++ b/src/rocksdb/db/db_test.cc
@@ -0,0 +1,7397 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+// The introduction of SyncPoint effectively disabled building and running
+// this test in Release builds, which is a pity, since it is a good test.
+#include <fcntl.h>
+
+#include <algorithm>
+#include <set>
+#include <thread>
+#include <unordered_set>
+#include <utility>
+
+#ifndef OS_WIN
+#include <unistd.h>
+#endif
+#ifdef OS_SOLARIS
+#include <alloca.h>
+#endif
+
+#include "cache/lru_cache.h"
+#include "db/blob/blob_index.h"
+#include "db/blob/blob_log_format.h"
+#include "db/db_impl/db_impl.h"
+#include "db/db_test_util.h"
+#include "db/dbformat.h"
+#include "db/job_context.h"
+#include "db/version_set.h"
+#include "db/write_batch_internal.h"
+#include "env/mock_env.h"
+#include "file/filename.h"
+#include "monitoring/thread_status_util.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/experimental.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/options.h"
+#include "rocksdb/perf_context.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/snapshot.h"
+#include "rocksdb/table.h"
+#include "rocksdb/table_properties.h"
+#include "rocksdb/thread_status.h"
+#include "rocksdb/types.h"
+#include "rocksdb/utilities/checkpoint.h"
+#include "rocksdb/utilities/optimistic_transaction_db.h"
+#include "rocksdb/utilities/write_batch_with_index.h"
+#include "table/mock_table.h"
+#include "table/scoped_arena_iterator.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/compression.h"
+#include "util/mutexlock.h"
+#include "util/random.h"
+#include "util/rate_limiter.h"
+#include "util/string_util.h"
+#include "utilities/merge_operators.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Note that DBTest and all of its child classes disable fsync on files
+// and directories for speed.
+// If fsync needs to be covered in a test, put that test elsewhere.
+class DBTest : public DBTestBase {
+ public:
+ DBTest() : DBTestBase("db_test", /*env_do_fsync=*/false) {}
+};
+
+class DBTestWithParam
+ : public DBTest,
+ public testing::WithParamInterface<std::tuple<uint32_t, bool>> {
+ public:
+ DBTestWithParam() {
+ max_subcompactions_ = std::get<0>(GetParam());
+ exclusive_manual_compaction_ = std::get<1>(GetParam());
+ }
+
+ // Required if inheriting from testing::WithParamInterface<>
+ static void SetUpTestCase() {}
+ static void TearDownTestCase() {}
+
+ uint32_t max_subcompactions_;
+ bool exclusive_manual_compaction_;
+};
+
+TEST_F(DBTest, MockEnvTest) {
+ std::unique_ptr<MockEnv> env{MockEnv::Create(Env::Default())};
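+  // MockEnv keeps all files in memory, so the "/dir/db" path below never
+  // touches the real filesystem.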
+ Options options;
+ options.create_if_missing = true;
+ options.env = env.get();
+ DB* db;
+
+ const Slice keys[] = {Slice("aaa"), Slice("bbb"), Slice("ccc")};
+ const Slice vals[] = {Slice("foo"), Slice("bar"), Slice("baz")};
+
+ ASSERT_OK(DB::Open(options, "/dir/db", &db));
+ for (size_t i = 0; i < 3; ++i) {
+ ASSERT_OK(db->Put(WriteOptions(), keys[i], vals[i]));
+ }
+
+ for (size_t i = 0; i < 3; ++i) {
+ std::string res;
+ ASSERT_OK(db->Get(ReadOptions(), keys[i], &res));
+ ASSERT_TRUE(res == vals[i]);
+ }
+
+ Iterator* iterator = db->NewIterator(ReadOptions());
+ iterator->SeekToFirst();
+ for (size_t i = 0; i < 3; ++i) {
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_TRUE(keys[i] == iterator->key());
+ ASSERT_TRUE(vals[i] == iterator->value());
+ iterator->Next();
+ }
+ ASSERT_TRUE(!iterator->Valid());
+ delete iterator;
+
+// TEST_FlushMemTable() is not supported in ROCKSDB_LITE
+#ifndef ROCKSDB_LITE
+ DBImpl* dbi = static_cast_with_check<DBImpl>(db);
+ ASSERT_OK(dbi->TEST_FlushMemTable());
+
+ for (size_t i = 0; i < 3; ++i) {
+ std::string res;
+ ASSERT_OK(db->Get(ReadOptions(), keys[i], &res));
+ ASSERT_TRUE(res == vals[i]);
+ }
+#endif // ROCKSDB_LITE
+
+ delete db;
+}
+
+// NewMemEnv returns nullptr in ROCKSDB_LITE since class InMemoryEnv isn't
+// defined.
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest, MemEnvTest) {
+ std::unique_ptr<Env> env{NewMemEnv(Env::Default())};
+ Options options;
+ options.create_if_missing = true;
+ options.env = env.get();
+ DB* db;
+
+ const Slice keys[] = {Slice("aaa"), Slice("bbb"), Slice("ccc")};
+ const Slice vals[] = {Slice("foo"), Slice("bar"), Slice("baz")};
+
+ ASSERT_OK(DB::Open(options, "/dir/db", &db));
+ for (size_t i = 0; i < 3; ++i) {
+ ASSERT_OK(db->Put(WriteOptions(), keys[i], vals[i]));
+ }
+
+ for (size_t i = 0; i < 3; ++i) {
+ std::string res;
+ ASSERT_OK(db->Get(ReadOptions(), keys[i], &res));
+ ASSERT_TRUE(res == vals[i]);
+ }
+
+ Iterator* iterator = db->NewIterator(ReadOptions());
+ iterator->SeekToFirst();
+ for (size_t i = 0; i < 3; ++i) {
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_TRUE(keys[i] == iterator->key());
+ ASSERT_TRUE(vals[i] == iterator->value());
+ iterator->Next();
+ }
+ ASSERT_TRUE(!iterator->Valid());
+ delete iterator;
+
+ DBImpl* dbi = static_cast_with_check<DBImpl>(db);
+ ASSERT_OK(dbi->TEST_FlushMemTable());
+
+ for (size_t i = 0; i < 3; ++i) {
+ std::string res;
+ ASSERT_OK(db->Get(ReadOptions(), keys[i], &res));
+ ASSERT_TRUE(res == vals[i]);
+ }
+
+ delete db;
+
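+  // Reopen against the same in-memory env; the data written above should
+  // still be readable because the env, not the DB, owns the file contents.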
+ options.create_if_missing = false;
+ ASSERT_OK(DB::Open(options, "/dir/db", &db));
+ for (size_t i = 0; i < 3; ++i) {
+ std::string res;
+ ASSERT_OK(db->Get(ReadOptions(), keys[i], &res));
+ ASSERT_TRUE(res == vals[i]);
+ }
+ delete db;
+}
+#endif // ROCKSDB_LITE
+
+TEST_F(DBTest, WriteEmptyBatch) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.write_buffer_size = 100000;
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ ASSERT_OK(Put(1, "foo", "bar"));
+ WriteOptions wo;
+ wo.sync = true;
+ wo.disableWAL = false;
+ WriteBatch empty_batch;
+ ASSERT_OK(dbfull()->Write(wo, &empty_batch));
+
+ // make sure we can re-open it.
+ ASSERT_OK(TryReopenWithColumnFamilies({"default", "pikachu"}, options));
+ ASSERT_EQ("bar", Get(1, "foo"));
+}
+
+TEST_F(DBTest, SkipDelay) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.write_buffer_size = 100000;
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ for (bool sync : {true, false}) {
+ for (bool disableWAL : {true, false}) {
+ if (sync && disableWAL) {
+ // sync and disableWAL is incompatible.
+ continue;
+ }
+ // Use a small number to ensure a large delay that is still effective
+ // when we do Put
+ // TODO(myabandeh): this is time dependent and could potentially make
+ // the test flaky
+ auto token = dbfull()->TEST_write_controler().GetDelayToken(1);
+ std::atomic<int> sleep_count(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::DelayWrite:Sleep",
+ [&](void* /*arg*/) { sleep_count.fetch_add(1); });
+ std::atomic<int> wait_count(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::DelayWrite:Wait",
+ [&](void* /*arg*/) { wait_count.fetch_add(1); });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ WriteOptions wo;
+ wo.sync = sync;
+ wo.disableWAL = disableWAL;
+ wo.no_slowdown = true;
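+      // With no_slowdown set, a write that would otherwise be delayed or
+      // stalled is expected to fail immediately instead of blocking.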
+ // Large enough to exceed allowance for one time interval
+ std::string large_value(1024, 'x');
+ // Perhaps ideally this first write would fail because of delay, but
+ // the current implementation does not guarantee that.
+ dbfull()->Put(wo, "foo", large_value).PermitUncheckedError();
+ // We need the 2nd write to trigger delay. This is because delay is
+ // estimated based on the last write size which is 0 for the first write.
+ ASSERT_NOK(dbfull()->Put(wo, "foo2", large_value));
+ ASSERT_GE(sleep_count.load(), 0);
+ ASSERT_GE(wait_count.load(), 0);
+ token.reset();
+
+ token = dbfull()->TEST_write_controler().GetDelayToken(1000000);
+ wo.no_slowdown = false;
+ ASSERT_OK(dbfull()->Put(wo, "foo3", large_value));
+ ASSERT_GE(sleep_count.load(), 1);
+ token.reset();
+ }
+ }
+}
+
+TEST_F(DBTest, MixedSlowdownOptions) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.write_buffer_size = 100000;
+ CreateAndReopenWithCF({"pikachu"}, options);
+ std::vector<port::Thread> threads;
+ std::atomic<int> thread_num(0);
+
+ std::function<void()> write_slowdown_func = [&]() {
+ int a = thread_num.fetch_add(1);
+ std::string key = "foo" + std::to_string(a);
+ WriteOptions wo;
+ wo.no_slowdown = false;
+ ASSERT_OK(dbfull()->Put(wo, key, "bar"));
+ };
+ std::function<void()> write_no_slowdown_func = [&]() {
+ int a = thread_num.fetch_add(1);
+ std::string key = "foo" + std::to_string(a);
+ WriteOptions wo;
+ wo.no_slowdown = true;
+ ASSERT_NOK(dbfull()->Put(wo, key, "bar"));
+ };
+ // Use a small number to ensure a large delay that is still effective
+ // when we do Put
+ // TODO(myabandeh): this is time dependent and could potentially make
+ // the test flaky
+ auto token = dbfull()->TEST_write_controler().GetDelayToken(1);
+ std::atomic<int> sleep_count(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::DelayWrite:BeginWriteStallDone", [&](void* /*arg*/) {
+ sleep_count.fetch_add(1);
+ if (threads.empty()) {
+ for (int i = 0; i < 2; ++i) {
+ threads.emplace_back(write_slowdown_func);
+ }
+ for (int i = 0; i < 2; ++i) {
+ threads.emplace_back(write_no_slowdown_func);
+ }
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ WriteOptions wo;
+ wo.sync = false;
+ wo.disableWAL = false;
+ wo.no_slowdown = false;
+ ASSERT_OK(dbfull()->Put(wo, "foo", "bar"));
+ // We need the 2nd write to trigger delay. This is because delay is
+ // estimated based on the last write size which is 0 for the first write.
+ ASSERT_OK(dbfull()->Put(wo, "foo2", "bar2"));
+ token.reset();
+
+ for (auto& t : threads) {
+ t.join();
+ }
+ ASSERT_GE(sleep_count.load(), 1);
+
+ wo.no_slowdown = true;
+ ASSERT_OK(dbfull()->Put(wo, "foo3", "bar"));
+}
+
+TEST_F(DBTest, MixedSlowdownOptionsInQueue) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.write_buffer_size = 100000;
+ CreateAndReopenWithCF({"pikachu"}, options);
+ std::vector<port::Thread> threads;
+ std::atomic<int> thread_num(0);
+
+ std::function<void()> write_no_slowdown_func = [&]() {
+ int a = thread_num.fetch_add(1);
+ std::string key = "foo" + std::to_string(a);
+ WriteOptions wo;
+ wo.no_slowdown = true;
+ ASSERT_NOK(dbfull()->Put(wo, key, "bar"));
+ };
+ // Use a small number to ensure a large delay that is still effective
+ // when we do Put
+ // TODO(myabandeh): this is time dependent and could potentially make
+ // the test flaky
+ auto token = dbfull()->TEST_write_controler().GetDelayToken(1);
+ std::atomic<int> sleep_count(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::DelayWrite:Sleep", [&](void* /*arg*/) {
+ sleep_count.fetch_add(1);
+ if (threads.empty()) {
+ for (int i = 0; i < 2; ++i) {
+ threads.emplace_back(write_no_slowdown_func);
+ }
+          // Sleep for 3s to allow the threads to insert themselves into the
+ // write queue
+ env_->SleepForMicroseconds(3000000ULL);
+ }
+ });
+ std::atomic<int> wait_count(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::DelayWrite:Wait",
+ [&](void* /*arg*/) { wait_count.fetch_add(1); });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ WriteOptions wo;
+ wo.sync = false;
+ wo.disableWAL = false;
+ wo.no_slowdown = false;
+ ASSERT_OK(dbfull()->Put(wo, "foo", "bar"));
+ // We need the 2nd write to trigger delay. This is because delay is
+ // estimated based on the last write size which is 0 for the first write.
+ ASSERT_OK(dbfull()->Put(wo, "foo2", "bar2"));
+ token.reset();
+
+ for (auto& t : threads) {
+ t.join();
+ }
+ ASSERT_EQ(sleep_count.load(), 1);
+ ASSERT_GE(wait_count.load(), 0);
+}
+
+TEST_F(DBTest, MixedSlowdownOptionsStop) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.write_buffer_size = 100000;
+ CreateAndReopenWithCF({"pikachu"}, options);
+ std::vector<port::Thread> threads;
+ std::atomic<int> thread_num(0);
+
+ std::function<void()> write_slowdown_func = [&]() {
+ int a = thread_num.fetch_add(1);
+ std::string key = "foo" + std::to_string(a);
+ WriteOptions wo;
+ wo.no_slowdown = false;
+ ASSERT_OK(dbfull()->Put(wo, key, "bar"));
+ };
+ std::function<void()> write_no_slowdown_func = [&]() {
+ int a = thread_num.fetch_add(1);
+ std::string key = "foo" + std::to_string(a);
+ WriteOptions wo;
+ wo.no_slowdown = true;
+ ASSERT_NOK(dbfull()->Put(wo, key, "bar"));
+ };
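+  // Helper that signals the DB's background condition variable so that a
+  // writer stalled on the stop token wakes up once the token is released.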
+ std::function<void()> wakeup_writer = [&]() {
+ dbfull()->mutex_.Lock();
+ dbfull()->bg_cv_.SignalAll();
+ dbfull()->mutex_.Unlock();
+ };
+ // Use a small number to ensure a large delay that is still effective
+ // when we do Put
+ // TODO(myabandeh): this is time dependent and could potentially make
+ // the test flaky
+ auto token = dbfull()->TEST_write_controler().GetStopToken();
+ std::atomic<int> wait_count(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::DelayWrite:Wait", [&](void* /*arg*/) {
+ wait_count.fetch_add(1);
+ if (threads.empty()) {
+ for (int i = 0; i < 2; ++i) {
+ threads.emplace_back(write_slowdown_func);
+ }
+ for (int i = 0; i < 2; ++i) {
+ threads.emplace_back(write_no_slowdown_func);
+ }
+          // Sleep for 3s to allow the threads to insert themselves into the
+ // write queue
+ env_->SleepForMicroseconds(3000000ULL);
+ }
+ token.reset();
+ threads.emplace_back(wakeup_writer);
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ WriteOptions wo;
+ wo.sync = false;
+ wo.disableWAL = false;
+ wo.no_slowdown = false;
+ ASSERT_OK(dbfull()->Put(wo, "foo", "bar"));
+ // We need the 2nd write to trigger delay. This is because delay is
+ // estimated based on the last write size which is 0 for the first write.
+ ASSERT_OK(dbfull()->Put(wo, "foo2", "bar2"));
+ token.reset();
+
+ for (auto& t : threads) {
+ t.join();
+ }
+ ASSERT_GE(wait_count.load(), 1);
+
+ wo.no_slowdown = true;
+ ASSERT_OK(dbfull()->Put(wo, "foo3", "bar"));
+}
+#ifndef ROCKSDB_LITE
+
+TEST_F(DBTest, LevelLimitReopen) {
+ Options options = CurrentOptions();
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ const std::string value(1024 * 1024, ' ');
+ int i = 0;
+ while (NumTableFilesAtLevel(2, 1) == 0) {
+ ASSERT_OK(Put(1, Key(i++), value));
+ }
+
+ options.num_levels = 1;
+ options.max_bytes_for_level_multiplier_additional.resize(1, 1);
+ Status s = TryReopenWithColumnFamilies({"default", "pikachu"}, options);
+ ASSERT_EQ(s.IsInvalidArgument(), true);
+ ASSERT_EQ(s.ToString(),
+ "Invalid argument: db has more levels than options.num_levels");
+
+ options.num_levels = 10;
+ options.max_bytes_for_level_multiplier_additional.resize(10, 1);
+ ASSERT_OK(TryReopenWithColumnFamilies({"default", "pikachu"}, options));
+}
+#endif // ROCKSDB_LITE
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest, LevelReopenWithFIFO) {
+ const int kLevelCount = 4;
+ const int kKeyCount = 5;
+ const int kTotalSstFileCount = kLevelCount * kKeyCount;
+ const int kCF = 1;
+
+ Options options = CurrentOptions();
+  // Configure level0_file_num_compaction_trigger to prevent L0 files from
+  // being automatically compacted while we are constructing an LSM tree
+  // structure to test multi-level FIFO compaction.
+ options.level0_file_num_compaction_trigger = kKeyCount + 1;
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // The expected number of files per level after each file creation.
+ const std::string expected_files_per_level[kLevelCount][kKeyCount] = {
+ {"0,0,0,1", "0,0,0,2", "0,0,0,3", "0,0,0,4", "0,0,0,5"},
+ {"0,0,1,5", "0,0,2,5", "0,0,3,5", "0,0,4,5", "0,0,5,5"},
+ {"0,1,5,5", "0,2,5,5", "0,3,5,5", "0,4,5,5", "0,5,5,5"},
+ {"1,5,5,5", "2,5,5,5", "3,5,5,5", "4,5,5,5", "5,5,5,5"},
+ };
+
+ const std::string expected_entries[kKeyCount][kLevelCount + 1] = {
+ {"[ ]", "[ a3 ]", "[ a2, a3 ]", "[ a1, a2, a3 ]", "[ a0, a1, a2, a3 ]"},
+ {"[ ]", "[ b3 ]", "[ b2, b3 ]", "[ b1, b2, b3 ]", "[ b0, b1, b2, b3 ]"},
+ {"[ ]", "[ c3 ]", "[ c2, c3 ]", "[ c1, c2, c3 ]", "[ c0, c1, c2, c3 ]"},
+ {"[ ]", "[ d3 ]", "[ d2, d3 ]", "[ d1, d2, d3 ]", "[ d0, d1, d2, d3 ]"},
+ {"[ ]", "[ e3 ]", "[ e2, e3 ]", "[ e1, e2, e3 ]", "[ e0, e1, e2, e3 ]"},
+ };
+
+  // The loop below creates the following LSM tree, where each (k, v) pair
+  // represents a file that contains that entry. Each time a file is created,
+  // the db is reopened with FIFO compaction and we verify that the LSM tree
+  // structure is still the same.
+  //
+  // The resulting LSM tree will contain 5 different keys. Each key has
+  // 4 different versions, each located in a different level.
+ //
+ // L0: (e, e0) (d, d0) (c, c0) (b, b0) (a, a0)
+ // L1: (a, a1) (b, b1) (c, c1) (d, d1) (e, e1)
+ // L2: (a, a2) (b, b2) (c, c2) (d, d2) (e, e2)
+ // L3: (a, a3) (b, b3) (c, c3) (d, d3) (e, e3)
+ for (int l = 0; l < kLevelCount; ++l) {
+ int level = kLevelCount - 1 - l;
+ for (int p = 0; p < kKeyCount; ++p) {
+ std::string put_key = std::string(1, char('a' + p));
+ ASSERT_OK(Put(kCF, put_key, put_key + std::to_string(level)));
+ ASSERT_OK(Flush(kCF));
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ for (int g = 0; g < kKeyCount; ++g) {
+ int entry_count = (p >= g) ? l + 1 : l;
+ std::string get_key = std::string(1, char('a' + g));
+ CheckAllEntriesWithFifoReopen(expected_entries[g][entry_count], get_key,
+ kCF, {"pikachu"}, options);
+ }
+ if (level != 0) {
+ MoveFilesToLevel(level, kCF);
+ for (int g = 0; g < kKeyCount; ++g) {
+ int entry_count = (p >= g) ? l + 1 : l;
+ std::string get_key = std::string(1, char('a' + g));
+ CheckAllEntriesWithFifoReopen(expected_entries[g][entry_count],
+ get_key, kCF, {"pikachu"}, options);
+ }
+ }
+ ASSERT_EQ(expected_files_per_level[l][p], FilesPerLevel(kCF));
+ }
+ }
+
+ // The expected number of sst files in each level after each FIFO compaction
+ // that deletes the oldest sst file.
+ const std::string expected_files_per_level_after_fifo[] = {
+ "5,5,5,4", "5,5,5,3", "5,5,5,2", "5,5,5,1", "5,5,5", "5,5,4", "5,5,3",
+ "5,5,2", "5,5,1", "5,5", "5,4", "5,3", "5,2", "5,1",
+ "5", "4", "3", "2", "1", "",
+ };
+
+  // The expected value entries of each key after each FIFO compaction.
+  // This verifies whether FIFO removes the files with the smallest keys in
+  // the non-L0 levels first, and then the oldest files in L0.
+ const std::string expected_entries_after_fifo[kKeyCount][kLevelCount + 1] = {
+ {"[ a0, a1, a2, a3 ]", "[ a0, a1, a2 ]", "[ a0, a1 ]", "[ a0 ]", "[ ]"},
+ {"[ b0, b1, b2, b3 ]", "[ b0, b1, b2 ]", "[ b0, b1 ]", "[ b0 ]", "[ ]"},
+ {"[ c0, c1, c2, c3 ]", "[ c0, c1, c2 ]", "[ c0, c1 ]", "[ c0 ]", "[ ]"},
+ {"[ d0, d1, d2, d3 ]", "[ d0, d1, d2 ]", "[ d0, d1 ]", "[ d0 ]", "[ ]"},
+ {"[ e0, e1, e2, e3 ]", "[ e0, e1, e2 ]", "[ e0, e1 ]", "[ e0 ]", "[ ]"},
+ };
+
+  // In the 2nd phase, we reopen the DB with FIFO compaction. On each reopen,
+  // we configure max_table_files_size so that FIFO will remove exactly one
+  // file at a time upon compaction, and we use this to verify whether the
+  // sst files are deleted in the correct order.
+ for (int i = 0; i < kTotalSstFileCount; ++i) {
+ uint64_t total_sst_files_size = 0;
+ ASSERT_TRUE(dbfull()->GetIntProperty(
+ handles_[1], "rocksdb.total-sst-files-size", &total_sst_files_size));
+ ASSERT_TRUE(total_sst_files_size > 0);
+
+ Options fifo_options(options);
+ fifo_options.compaction_style = kCompactionStyleFIFO;
+ options.create_if_missing = false;
+ fifo_options.max_open_files = -1;
+ fifo_options.disable_auto_compactions = false;
+ // Config max_table_files_size to be total_sst_files_size - 1 so that
+ // FIFO will delete one file.
+ fifo_options.compaction_options_fifo.max_table_files_size =
+ total_sst_files_size - 1;
+ ASSERT_OK(
+ TryReopenWithColumnFamilies({"default", "pikachu"}, fifo_options));
+ // For FIFO to pick a compaction
+ ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact(false));
+ for (int g = 0; g < kKeyCount; ++g) {
+ std::string get_key = std::string(1, char('a' + g));
+ int status_index = i / kKeyCount;
+ if ((i % kKeyCount) >= g) {
+        // The sst file containing get_key in the current level has already
+        // been deleted, so advance status_index to check the expected value.
+ status_index++;
+ }
+ CheckAllEntriesWithFifoReopen(
+ expected_entries_after_fifo[g][status_index], get_key, kCF,
+ {"pikachu"}, options);
+ }
+ ASSERT_EQ(expected_files_per_level_after_fifo[i], FilesPerLevel(kCF));
+ }
+}
+#endif // !ROCKSDB_LITE
+
+TEST_F(DBTest, PutSingleDeleteGet) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ ASSERT_OK(Put(1, "foo", "v1"));
+ ASSERT_EQ("v1", Get(1, "foo"));
+ ASSERT_OK(Put(1, "foo2", "v2"));
+ ASSERT_EQ("v2", Get(1, "foo2"));
+ ASSERT_OK(SingleDelete(1, "foo"));
+ ASSERT_EQ("NOT_FOUND", Get(1, "foo"));
+ // Skip FIFO and universal compaction because they do not apply to the test
+ // case. Skip MergePut because single delete does not get removed when it
+ // encounters a merge.
+ } while (ChangeOptions(kSkipFIFOCompaction | kSkipUniversalCompaction |
+ kSkipMergePut));
+}
+
+TEST_F(DBTest, ReadFromPersistedTier) {
+ do {
+ Random rnd(301);
+ Options options = CurrentOptions();
+ for (int disableWAL = 0; disableWAL <= 1; ++disableWAL) {
+ CreateAndReopenWithCF({"pikachu"}, options);
+ WriteOptions wopt;
+ wopt.disableWAL = (disableWAL == 1);
+ // 1st round: put but not flush
+ ASSERT_OK(db_->Put(wopt, handles_[1], "foo", "first"));
+ ASSERT_OK(db_->Put(wopt, handles_[1], "bar", "one"));
+ ASSERT_EQ("first", Get(1, "foo"));
+ ASSERT_EQ("one", Get(1, "bar"));
+
+      // Read directly from persisted data.
+ ReadOptions ropt;
+ ropt.read_tier = kPersistedTier;
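+      // kPersistedTier only returns data that has been persisted: entries
+      // written with the WAL enabled or already flushed to SST files.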
+ std::string value;
+ if (wopt.disableWAL) {
+        // As the data has not yet been flushed, we expect NotFound.
+ ASSERT_TRUE(db_->Get(ropt, handles_[1], "foo", &value).IsNotFound());
+ ASSERT_TRUE(db_->Get(ropt, handles_[1], "bar", &value).IsNotFound());
+ } else {
+ ASSERT_OK(db_->Get(ropt, handles_[1], "foo", &value));
+ ASSERT_OK(db_->Get(ropt, handles_[1], "bar", &value));
+ }
+
+ // Multiget
+ std::vector<ColumnFamilyHandle*> multiget_cfs;
+ multiget_cfs.push_back(handles_[1]);
+ multiget_cfs.push_back(handles_[1]);
+ std::vector<Slice> multiget_keys;
+ multiget_keys.push_back("foo");
+ multiget_keys.push_back("bar");
+ std::vector<std::string> multiget_values;
+ auto statuses =
+ db_->MultiGet(ropt, multiget_cfs, multiget_keys, &multiget_values);
+ if (wopt.disableWAL) {
+ ASSERT_TRUE(statuses[0].IsNotFound());
+ ASSERT_TRUE(statuses[1].IsNotFound());
+ } else {
+ ASSERT_OK(statuses[0]);
+ ASSERT_OK(statuses[1]);
+ }
+
+ // 2nd round: flush and put a new value in memtable.
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(db_->Put(wopt, handles_[1], "rocksdb", "hello"));
+
+ // once the data has been flushed, we are able to get the
+ // data when kPersistedTier is used.
+ ASSERT_TRUE(db_->Get(ropt, handles_[1], "foo", &value).ok());
+ ASSERT_EQ(value, "first");
+ ASSERT_TRUE(db_->Get(ropt, handles_[1], "bar", &value).ok());
+ ASSERT_EQ(value, "one");
+ if (wopt.disableWAL) {
+ ASSERT_TRUE(
+ db_->Get(ropt, handles_[1], "rocksdb", &value).IsNotFound());
+ } else {
+ ASSERT_OK(db_->Get(ropt, handles_[1], "rocksdb", &value));
+ ASSERT_EQ(value, "hello");
+ }
+
+ // Expect same result in multiget
+ multiget_cfs.push_back(handles_[1]);
+ multiget_keys.push_back("rocksdb");
+ statuses =
+ db_->MultiGet(ropt, multiget_cfs, multiget_keys, &multiget_values);
+ ASSERT_TRUE(statuses[0].ok());
+ ASSERT_EQ("first", multiget_values[0]);
+ ASSERT_TRUE(statuses[1].ok());
+ ASSERT_EQ("one", multiget_values[1]);
+ if (wopt.disableWAL) {
+ ASSERT_TRUE(statuses[2].IsNotFound());
+ } else {
+ ASSERT_OK(statuses[2]);
+ }
+
+ // 3rd round: delete and flush
+ ASSERT_OK(db_->Delete(wopt, handles_[1], "foo"));
+      ASSERT_OK(Flush(1));
+ ASSERT_OK(db_->Delete(wopt, handles_[1], "bar"));
+
+ ASSERT_TRUE(db_->Get(ropt, handles_[1], "foo", &value).IsNotFound());
+ if (wopt.disableWAL) {
+        // Still expect to find the value, as its delete has not yet been
+        // flushed.
+ ASSERT_TRUE(db_->Get(ropt, handles_[1], "bar", &value).ok());
+ ASSERT_EQ(value, "one");
+ } else {
+ ASSERT_TRUE(db_->Get(ropt, handles_[1], "bar", &value).IsNotFound());
+ }
+ ASSERT_TRUE(db_->Get(ropt, handles_[1], "rocksdb", &value).ok());
+ ASSERT_EQ(value, "hello");
+
+ statuses =
+ db_->MultiGet(ropt, multiget_cfs, multiget_keys, &multiget_values);
+ ASSERT_TRUE(statuses[0].IsNotFound());
+ if (wopt.disableWAL) {
+ ASSERT_TRUE(statuses[1].ok());
+ ASSERT_EQ("one", multiget_values[1]);
+ } else {
+ ASSERT_TRUE(statuses[1].IsNotFound());
+ }
+ ASSERT_TRUE(statuses[2].ok());
+ ASSERT_EQ("hello", multiget_values[2]);
+ if (wopt.disableWAL == 0) {
+ DestroyAndReopen(options);
+ }
+ }
+ } while (ChangeOptions());
+}
+
+TEST_F(DBTest, SingleDeleteFlush) {
+ // Test to check whether flushing preserves a single delete hidden
+ // behind a put.
+ do {
+ Random rnd(301);
+
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+    // Put values on the second level (so that they will not be in the same
+    // compaction as the other operations).
+ ASSERT_OK(Put(1, "foo", "first"));
+ ASSERT_OK(Put(1, "bar", "one"));
+ ASSERT_OK(Flush(1));
+ MoveFilesToLevel(2, 1);
+
+ // (Single) delete hidden by a put
+ ASSERT_OK(SingleDelete(1, "foo"));
+ ASSERT_OK(Put(1, "foo", "second"));
+ ASSERT_OK(Delete(1, "bar"));
+ ASSERT_OK(Put(1, "bar", "two"));
+ ASSERT_OK(Flush(1));
+
+ ASSERT_OK(SingleDelete(1, "foo"));
+ ASSERT_OK(Delete(1, "bar"));
+ ASSERT_OK(Flush(1));
+
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[1],
+ nullptr, nullptr));
+
+ ASSERT_EQ("NOT_FOUND", Get(1, "bar"));
+ ASSERT_EQ("NOT_FOUND", Get(1, "foo"));
+    // Skip FIFO and universal compaction because they do not apply to the test
+ // case. Skip MergePut because single delete does not get removed when it
+ // encounters a merge.
+ } while (ChangeOptions(kSkipFIFOCompaction | kSkipUniversalCompaction |
+ kSkipMergePut));
+}
+
+TEST_F(DBTest, SingleDeletePutFlush) {
+ // Single deletes that encounter the matching put in a flush should get
+ // removed.
+ do {
+ Random rnd(301);
+
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ ASSERT_OK(Put(1, "foo", Slice()));
+ ASSERT_OK(Put(1, "a", Slice()));
+ ASSERT_OK(SingleDelete(1, "a"));
+ ASSERT_OK(Flush(1));
+
+ ASSERT_EQ("[ ]", AllEntriesFor("a", 1));
+ // Skip FIFO and universal compaction because they do not apply to the test
+ // case. Skip MergePut because single delete does not get removed when it
+ // encounters a merge.
+ } while (ChangeOptions(kSkipFIFOCompaction | kSkipUniversalCompaction |
+ kSkipMergePut));
+}
+
+// Disabled because not all platforms can run it.
+// It requires more than 9GB of memory to run, with a single allocation
+// of more than 3GB.
+TEST_F(DBTest, DISABLED_SanitizeVeryVeryLargeValue) {
+ const size_t kValueSize = 4 * size_t{1024 * 1024 * 1024}; // 4GB value
+ std::string raw(kValueSize, 'v');
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.merge_operator = MergeOperators::CreatePutOperator();
+ options.write_buffer_size = 100000; // Small write buffer
+ options.paranoid_checks = true;
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put("boo", "v1"));
+ ASSERT_TRUE(Put("foo", raw).IsInvalidArgument());
+ ASSERT_TRUE(Merge("foo", raw).IsInvalidArgument());
+
+ WriteBatch wb;
+ ASSERT_TRUE(wb.Put("foo", raw).IsInvalidArgument());
+ ASSERT_TRUE(wb.Merge("foo", raw).IsInvalidArgument());
+
+ Slice value_slice = raw;
+ Slice key_slice = "foo";
+ SliceParts sp_key(&key_slice, 1);
+ SliceParts sp_value(&value_slice, 1);
+
+ ASSERT_TRUE(wb.Put(sp_key, sp_value).IsInvalidArgument());
+ ASSERT_TRUE(wb.Merge(sp_key, sp_value).IsInvalidArgument());
+}
+
+// Disabled because not all platforms can run it.
+// It requires more than 9GB of memory to run, with a single allocation
+// of more than 3GB.
+TEST_F(DBTest, DISABLED_VeryLargeValue) {
+ const size_t kValueSize = 3221225472u; // 3GB value
+ const size_t kKeySize = 8388608u; // 8MB key
+ std::string raw(kValueSize, 'v');
+ std::string key1(kKeySize, 'c');
+ std::string key2(kKeySize, 'd');
+
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.write_buffer_size = 100000; // Small write buffer
+ options.paranoid_checks = true;
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put("boo", "v1"));
+ ASSERT_OK(Put("foo", "v1"));
+ ASSERT_OK(Put(key1, raw));
+ raw[0] = 'w';
+ ASSERT_OK(Put(key2, raw));
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+
+#ifndef ROCKSDB_LITE
+ ASSERT_EQ(1, NumTableFilesAtLevel(0));
+#endif // !ROCKSDB_LITE
+
+ std::string value;
+ Status s = db_->Get(ReadOptions(), key1, &value);
+ ASSERT_OK(s);
+ ASSERT_EQ(kValueSize, value.size());
+ ASSERT_EQ('v', value[0]);
+
+ s = db_->Get(ReadOptions(), key2, &value);
+ ASSERT_OK(s);
+ ASSERT_EQ(kValueSize, value.size());
+ ASSERT_EQ('w', value[0]);
+
+ // Compact all files.
+  ASSERT_OK(Flush());
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+ // Check DB is not in read-only state.
+ ASSERT_OK(Put("boo", "v1"));
+
+ s = db_->Get(ReadOptions(), key1, &value);
+ ASSERT_OK(s);
+ ASSERT_EQ(kValueSize, value.size());
+ ASSERT_EQ('v', value[0]);
+
+ s = db_->Get(ReadOptions(), key2, &value);
+ ASSERT_OK(s);
+ ASSERT_EQ(kValueSize, value.size());
+ ASSERT_EQ('w', value[0]);
+}
+
+TEST_F(DBTest, GetFromImmutableLayer) {
+ do {
+ Options options = CurrentOptions();
+ options.env = env_;
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ ASSERT_OK(Put(1, "foo", "v1"));
+ ASSERT_EQ("v1", Get(1, "foo"));
+
+ // Block sync calls
+ env_->delay_sstable_sync_.store(true, std::memory_order_release);
+ ASSERT_OK(Put(1, "k1", std::string(100000, 'x'))); // Fill memtable
+ ASSERT_OK(Put(1, "k2", std::string(100000, 'y'))); // Trigger flush
+ ASSERT_EQ("v1", Get(1, "foo"));
+ ASSERT_EQ("NOT_FOUND", Get(0, "foo"));
+ // Release sync calls
+ env_->delay_sstable_sync_.store(false, std::memory_order_release);
+ } while (ChangeOptions());
+}
+
+TEST_F(DBTest, GetLevel0Ordering) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ // Check that we process level-0 files in correct order. The code
+ // below generates two level-0 files where the earlier one comes
+ // before the later one in the level-0 file list since the earlier
+ // one has a smaller "smallest" key.
+ ASSERT_OK(Put(1, "bar", "b"));
+ ASSERT_OK(Put(1, "foo", "v1"));
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(Put(1, "foo", "v2"));
+ ASSERT_OK(Flush(1));
+ ASSERT_EQ("v2", Get(1, "foo"));
+ } while (ChangeOptions());
+}
+
+TEST_F(DBTest, WrongLevel0Config) {
+ Options options = CurrentOptions();
+ Close();
+ ASSERT_OK(DestroyDB(dbname_, options));
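+  // The L0 triggers below are intentionally misordered (stop < slowdown <
+  // compaction trigger); opening is still expected to succeed, presumably
+  // because the options are sanitized internally.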
+ options.level0_stop_writes_trigger = 1;
+ options.level0_slowdown_writes_trigger = 2;
+ options.level0_file_num_compaction_trigger = 3;
+ ASSERT_OK(DB::Open(options, dbname_, &db_));
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest, GetOrderedByLevels) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ ASSERT_OK(Put(1, "foo", "v1"));
+ Compact(1, "a", "z");
+ ASSERT_EQ("v1", Get(1, "foo"));
+ ASSERT_OK(Put(1, "foo", "v2"));
+ ASSERT_EQ("v2", Get(1, "foo"));
+ ASSERT_OK(Flush(1));
+ ASSERT_EQ("v2", Get(1, "foo"));
+ } while (ChangeOptions());
+}
+
+TEST_F(DBTest, GetPicksCorrectFile) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ // Arrange to have multiple files in a non-level-0 level.
+ ASSERT_OK(Put(1, "a", "va"));
+ Compact(1, "a", "b");
+ ASSERT_OK(Put(1, "x", "vx"));
+ Compact(1, "x", "y");
+ ASSERT_OK(Put(1, "f", "vf"));
+ Compact(1, "f", "g");
+ ASSERT_EQ("va", Get(1, "a"));
+ ASSERT_EQ("vf", Get(1, "f"));
+ ASSERT_EQ("vx", Get(1, "x"));
+ } while (ChangeOptions());
+}
+
+TEST_F(DBTest, GetEncountersEmptyLevel) {
+ do {
+ Options options = CurrentOptions();
+ CreateAndReopenWithCF({"pikachu"}, options);
+ // Arrange for the following to happen:
+ // * sstable A in level 0
+ // * nothing in level 1
+ // * sstable B in level 2
+ // Then do enough Get() calls to arrange for an automatic compaction
+ // of sstable A. A bug would cause the compaction to be marked as
+ // occurring at level 1 (instead of the correct level 0).
+
+ // Step 1: First place sstables in levels 0 and 2
+ ASSERT_OK(Put(1, "a", "begin"));
+ ASSERT_OK(Put(1, "z", "end"));
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]));
+ ASSERT_OK(dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]));
+ ASSERT_OK(Put(1, "a", "begin"));
+ ASSERT_OK(Put(1, "z", "end"));
+ ASSERT_OK(Flush(1));
+ ASSERT_GT(NumTableFilesAtLevel(0, 1), 0);
+ ASSERT_GT(NumTableFilesAtLevel(2, 1), 0);
+
+ // Step 2: clear level 1 if necessary.
+ ASSERT_OK(dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]));
+ ASSERT_EQ(NumTableFilesAtLevel(0, 1), 1);
+ ASSERT_EQ(NumTableFilesAtLevel(1, 1), 0);
+ ASSERT_EQ(NumTableFilesAtLevel(2, 1), 1);
+
+ // Step 3: read a bunch of times
+ for (int i = 0; i < 1000; i++) {
+ ASSERT_EQ("NOT_FOUND", Get(1, "missing"));
+ }
+
+ // Step 4: Wait for compaction to finish
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ ASSERT_EQ(NumTableFilesAtLevel(0, 1), 1); // XXX
+ } while (ChangeOptions(kSkipUniversalCompaction | kSkipFIFOCompaction));
+}
+#endif // ROCKSDB_LITE
+
+TEST_F(DBTest, FlushMultipleMemtable) {
+ do {
+ Options options = CurrentOptions();
+ WriteOptions writeOpt = WriteOptions();
+ writeOpt.disableWAL = true;
+ options.max_write_buffer_number = 4;
+ options.min_write_buffer_number_to_merge = 3;
+ options.max_write_buffer_size_to_maintain = -1;
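+    // With min_write_buffer_number_to_merge = 3, several immutable memtables
+    // can be merged into a single L0 file when flushed.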
+ CreateAndReopenWithCF({"pikachu"}, options);
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v1"));
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v1"));
+
+ ASSERT_EQ("v1", Get(1, "foo"));
+ ASSERT_EQ("v1", Get(1, "bar"));
+ ASSERT_OK(Flush(1));
+ } while (ChangeCompactOptions());
+}
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest, FlushSchedule) {
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.level0_stop_writes_trigger = 1 << 10;
+ options.level0_slowdown_writes_trigger = 1 << 10;
+ options.min_write_buffer_number_to_merge = 1;
+ options.max_write_buffer_size_to_maintain =
+ static_cast<int64_t>(options.write_buffer_size);
+ options.max_write_buffer_number = 2;
+ options.write_buffer_size = 120 * 1024;
+ auto flush_listener = std::make_shared<FlushCounterListener>();
+ flush_listener->expected_flush_reason = FlushReason::kWriteBufferFull;
+ options.listeners.push_back(flush_listener);
+ CreateAndReopenWithCF({"pikachu"}, options);
+ std::vector<port::Thread> threads;
+
+ std::atomic<int> thread_num(0);
+  // Each column family will have 5 threads, each thread generating 2
+  // memtables. Each column family should end up with up to 10 table files.
+ std::function<void()> fill_memtable_func = [&]() {
+ int a = thread_num.fetch_add(1);
+ Random rnd(a);
+ WriteOptions wo;
+ // this should fill up 2 memtables
+ for (int k = 0; k < 5000; ++k) {
+ ASSERT_OK(db_->Put(wo, handles_[a & 1], rnd.RandomString(13), ""));
+ }
+ };
+
+ for (int i = 0; i < 10; ++i) {
+ threads.emplace_back(fill_memtable_func);
+ }
+
+ for (auto& t : threads) {
+ t.join();
+ }
+
+ auto default_tables = GetNumberOfSstFilesForColumnFamily(db_, "default");
+ auto pikachu_tables = GetNumberOfSstFilesForColumnFamily(db_, "pikachu");
+ ASSERT_LE(default_tables, static_cast<uint64_t>(10));
+ ASSERT_GT(default_tables, static_cast<uint64_t>(0));
+ ASSERT_LE(pikachu_tables, static_cast<uint64_t>(10));
+ ASSERT_GT(pikachu_tables, static_cast<uint64_t>(0));
+}
+#endif // ROCKSDB_LITE
+
+namespace {
+class KeepFilter : public CompactionFilter {
+ public:
+ bool Filter(int /*level*/, const Slice& /*key*/, const Slice& /*value*/,
+ std::string* /*new_value*/,
+ bool* /*value_changed*/) const override {
+ return false;
+ }
+
+ const char* Name() const override { return "KeepFilter"; }
+};
+
+class KeepFilterFactory : public CompactionFilterFactory {
+ public:
+ explicit KeepFilterFactory(bool check_context = false)
+ : check_context_(check_context) {}
+
+ std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+ const CompactionFilter::Context& context) override {
+ if (check_context_) {
+ EXPECT_EQ(expect_full_compaction_.load(), context.is_full_compaction);
+ EXPECT_EQ(expect_manual_compaction_.load(), context.is_manual_compaction);
+ }
+ return std::unique_ptr<CompactionFilter>(new KeepFilter());
+ }
+
+ const char* Name() const override { return "KeepFilterFactory"; }
+ bool check_context_;
+ std::atomic_bool expect_full_compaction_;
+ std::atomic_bool expect_manual_compaction_;
+};
+
+class DelayFilter : public CompactionFilter {
+ public:
+ explicit DelayFilter(DBTestBase* d) : db_test(d) {}
+ bool Filter(int /*level*/, const Slice& /*key*/, const Slice& /*value*/,
+ std::string* /*new_value*/,
+ bool* /*value_changed*/) const override {
+ db_test->env_->MockSleepForMicroseconds(1000);
+ return true;
+ }
+
+ const char* Name() const override { return "DelayFilter"; }
+
+ private:
+ DBTestBase* db_test;
+};
+
+class DelayFilterFactory : public CompactionFilterFactory {
+ public:
+ explicit DelayFilterFactory(DBTestBase* d) : db_test(d) {}
+ std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+ const CompactionFilter::Context& /*context*/) override {
+ return std::unique_ptr<CompactionFilter>(new DelayFilter(db_test));
+ }
+
+ const char* Name() const override { return "DelayFilterFactory"; }
+
+ private:
+ DBTestBase* db_test;
+};
+} // anonymous namespace
+
+#ifndef ROCKSDB_LITE
+
+static std::string CompressibleString(Random* rnd, int len) {
+ std::string r;
+ test::CompressibleString(rnd, 0.8, len, &r);
+ return r;
+}
+#endif // ROCKSDB_LITE
+
+TEST_F(DBTest, FailMoreDbPaths) {
+ Options options = CurrentOptions();
+ options.db_paths.emplace_back(dbname_, 10000000);
+ options.db_paths.emplace_back(dbname_ + "_2", 1000000);
+ options.db_paths.emplace_back(dbname_ + "_3", 1000000);
+ options.db_paths.emplace_back(dbname_ + "_4", 1000000);
+ options.db_paths.emplace_back(dbname_ + "_5", 1000000);
+ ASSERT_TRUE(TryReopen(options).IsNotSupported());
+}
+
+void CheckColumnFamilyMeta(
+ const ColumnFamilyMetaData& cf_meta, const std::string& cf_name,
+ const std::vector<std::vector<FileMetaData>>& files_by_level,
+ uint64_t start_time, uint64_t end_time) {
+ ASSERT_EQ(cf_meta.name, cf_name);
+ ASSERT_EQ(cf_meta.levels.size(), files_by_level.size());
+
+ uint64_t cf_size = 0;
+ size_t file_count = 0;
+
+ for (size_t i = 0; i < cf_meta.levels.size(); ++i) {
+ const auto& level_meta_from_cf = cf_meta.levels[i];
+ const auto& level_meta_from_files = files_by_level[i];
+
+ ASSERT_EQ(level_meta_from_cf.level, i);
+ ASSERT_EQ(level_meta_from_cf.files.size(), level_meta_from_files.size());
+
+ file_count += level_meta_from_cf.files.size();
+
+ uint64_t level_size = 0;
+ for (size_t j = 0; j < level_meta_from_cf.files.size(); ++j) {
+ const auto& file_meta_from_cf = level_meta_from_cf.files[j];
+ const auto& file_meta_from_files = level_meta_from_files[j];
+
+ level_size += file_meta_from_cf.size;
+
+ ASSERT_EQ(file_meta_from_cf.file_number,
+ file_meta_from_files.fd.GetNumber());
+ ASSERT_EQ(file_meta_from_cf.file_number,
+ TableFileNameToNumber(file_meta_from_cf.name));
+ ASSERT_EQ(file_meta_from_cf.size, file_meta_from_files.fd.file_size);
+ ASSERT_EQ(file_meta_from_cf.smallest_seqno,
+ file_meta_from_files.fd.smallest_seqno);
+ ASSERT_EQ(file_meta_from_cf.largest_seqno,
+ file_meta_from_files.fd.largest_seqno);
+ ASSERT_EQ(file_meta_from_cf.smallestkey,
+ file_meta_from_files.smallest.user_key().ToString());
+ ASSERT_EQ(file_meta_from_cf.largestkey,
+ file_meta_from_files.largest.user_key().ToString());
+ ASSERT_EQ(file_meta_from_cf.oldest_blob_file_number,
+ file_meta_from_files.oldest_blob_file_number);
+ ASSERT_EQ(file_meta_from_cf.oldest_ancester_time,
+ file_meta_from_files.oldest_ancester_time);
+ ASSERT_EQ(file_meta_from_cf.file_creation_time,
+ file_meta_from_files.file_creation_time);
+ ASSERT_GE(file_meta_from_cf.file_creation_time, start_time);
+ ASSERT_LE(file_meta_from_cf.file_creation_time, end_time);
+ ASSERT_GE(file_meta_from_cf.oldest_ancester_time, start_time);
+ ASSERT_LE(file_meta_from_cf.oldest_ancester_time, end_time);
+ // More from FileStorageInfo
+ ASSERT_EQ(file_meta_from_cf.file_type, kTableFile);
+ ASSERT_EQ(file_meta_from_cf.name,
+ "/" + file_meta_from_cf.relative_filename);
+ ASSERT_EQ(file_meta_from_cf.directory, file_meta_from_cf.db_path);
+ }
+
+ ASSERT_EQ(level_meta_from_cf.size, level_size);
+ cf_size += level_size;
+ }
+
+ ASSERT_EQ(cf_meta.file_count, file_count);
+ ASSERT_EQ(cf_meta.size, cf_size);
+}
+
+void CheckLiveFilesMeta(
+ const std::vector<LiveFileMetaData>& live_file_meta,
+ const std::vector<std::vector<FileMetaData>>& files_by_level) {
+ size_t total_file_count = 0;
+ for (const auto& f : files_by_level) {
+ total_file_count += f.size();
+ }
+
+ ASSERT_EQ(live_file_meta.size(), total_file_count);
+
+ int level = 0;
+ int i = 0;
+
+ for (const auto& meta : live_file_meta) {
+ if (level != meta.level) {
+ level = meta.level;
+ i = 0;
+ }
+
+ ASSERT_LT(i, files_by_level[level].size());
+
+ const auto& expected_meta = files_by_level[level][i];
+
+ ASSERT_EQ(meta.column_family_name, kDefaultColumnFamilyName);
+ ASSERT_EQ(meta.file_number, expected_meta.fd.GetNumber());
+ ASSERT_EQ(meta.file_number, TableFileNameToNumber(meta.name));
+ ASSERT_EQ(meta.size, expected_meta.fd.file_size);
+ ASSERT_EQ(meta.smallest_seqno, expected_meta.fd.smallest_seqno);
+ ASSERT_EQ(meta.largest_seqno, expected_meta.fd.largest_seqno);
+ ASSERT_EQ(meta.smallestkey, expected_meta.smallest.user_key().ToString());
+ ASSERT_EQ(meta.largestkey, expected_meta.largest.user_key().ToString());
+ ASSERT_EQ(meta.oldest_blob_file_number,
+ expected_meta.oldest_blob_file_number);
+
+ // More from FileStorageInfo
+ ASSERT_EQ(meta.file_type, kTableFile);
+ ASSERT_EQ(meta.name, "/" + meta.relative_filename);
+ ASSERT_EQ(meta.directory, meta.db_path);
+
+ ++i;
+ }
+}
+
+#ifndef ROCKSDB_LITE
+void AddBlobFile(const ColumnFamilyHandle* cfh, uint64_t blob_file_number,
+ uint64_t total_blob_count, uint64_t total_blob_bytes,
+ const std::string& checksum_method,
+ const std::string& checksum_value,
+ uint64_t garbage_blob_count = 0,
+ uint64_t garbage_blob_bytes = 0) {
+ ColumnFamilyData* cfd =
+ (static_cast<const ColumnFamilyHandleImpl*>(cfh))->cfd();
+ assert(cfd);
+
+ Version* const version = cfd->current();
+ assert(version);
+
+ VersionStorageInfo* const storage_info = version->storage_info();
+ assert(storage_info);
+
+ // Add a live blob file.
+
+ auto shared_meta = SharedBlobFileMetaData::Create(
+ blob_file_number, total_blob_count, total_blob_bytes, checksum_method,
+ checksum_value);
+
+ auto meta = BlobFileMetaData::Create(std::move(shared_meta),
+ BlobFileMetaData::LinkedSsts(),
+ garbage_blob_count, garbage_blob_bytes);
+
+ storage_info->AddBlobFile(std::move(meta));
+}
+
+static void CheckBlobMetaData(
+ const BlobMetaData& bmd, uint64_t blob_file_number,
+ uint64_t total_blob_count, uint64_t total_blob_bytes,
+ const std::string& checksum_method, const std::string& checksum_value,
+ uint64_t garbage_blob_count = 0, uint64_t garbage_blob_bytes = 0) {
+ ASSERT_EQ(bmd.blob_file_number, blob_file_number);
+ ASSERT_EQ(bmd.blob_file_name, BlobFileName("", blob_file_number));
+ ASSERT_EQ(bmd.blob_file_size,
+ total_blob_bytes + BlobLogHeader::kSize + BlobLogFooter::kSize);
+
+ ASSERT_EQ(bmd.total_blob_count, total_blob_count);
+ ASSERT_EQ(bmd.total_blob_bytes, total_blob_bytes);
+ ASSERT_EQ(bmd.garbage_blob_count, garbage_blob_count);
+ ASSERT_EQ(bmd.garbage_blob_bytes, garbage_blob_bytes);
+ ASSERT_EQ(bmd.checksum_method, checksum_method);
+ ASSERT_EQ(bmd.checksum_value, checksum_value);
+}
+
+TEST_F(DBTest, MetaDataTest) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.disable_auto_compactions = true;
+
+ int64_t temp_time = 0;
+ options.env->GetCurrentTime(&temp_time);
+ uint64_t start_time = static_cast<uint64_t>(temp_time);
+
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ int key_index = 0;
+ for (int i = 0; i < 100; ++i) {
+ // Add a single blob reference to each file
+ std::string blob_index;
+ BlobIndex::EncodeBlob(&blob_index, /* blob_file_number */ i + 1000,
+ /* offset */ 1234, /* size */ 5678, kNoCompression);
+
+ WriteBatch batch;
+ ASSERT_OK(WriteBatchInternal::PutBlobIndex(&batch, 0, Key(key_index),
+ blob_index));
+ ASSERT_OK(dbfull()->Write(WriteOptions(), &batch));
+
+ ++key_index;
+
+ // Fill up the rest of the file with random values.
+ GenerateNewFile(&rnd, &key_index, /* nowait */ true);
+
+ ASSERT_OK(Flush());
+ }
+
+ std::vector<std::vector<FileMetaData>> files_by_level;
+ dbfull()->TEST_GetFilesMetaData(db_->DefaultColumnFamily(), &files_by_level);
+
+ options.env->GetCurrentTime(&temp_time);
+ uint64_t end_time = static_cast<uint64_t>(temp_time);
+
+ ColumnFamilyMetaData cf_meta;
+ db_->GetColumnFamilyMetaData(&cf_meta);
+ CheckColumnFamilyMeta(cf_meta, kDefaultColumnFamilyName, files_by_level,
+ start_time, end_time);
+ std::vector<LiveFileMetaData> live_file_meta;
+ db_->GetLiveFilesMetaData(&live_file_meta);
+ CheckLiveFilesMeta(live_file_meta, files_by_level);
+}
+
+TEST_F(DBTest, AllMetaDataTest) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.disable_auto_compactions = true;
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ constexpr uint64_t blob_file_number = 234;
+ constexpr uint64_t total_blob_count = 555;
+ constexpr uint64_t total_blob_bytes = 66666;
+ constexpr char checksum_method[] = "CRC32";
+ constexpr char checksum_value[] = "\x3d\x87\xff\x57";
+
+ int64_t temp_time = 0;
+ options.env->GetCurrentTime(&temp_time).PermitUncheckedError();
+ uint64_t start_time = static_cast<uint64_t>(temp_time);
+
+ Random rnd(301);
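+  // AddBlobFile mutates the current Version's storage info, which is
+  // protected by the DB mutex, hence the explicit lock/unlock around it.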
+ dbfull()->TEST_LockMutex();
+ for (int cf = 0; cf < 2; cf++) {
+ AddBlobFile(handles_[cf], blob_file_number * (cf + 1),
+ total_blob_count * (cf + 1), total_blob_bytes * (cf + 1),
+ checksum_method, checksum_value);
+ }
+ dbfull()->TEST_UnlockMutex();
+
+ std::vector<ColumnFamilyMetaData> all_meta;
+ db_->GetAllColumnFamilyMetaData(&all_meta);
+
+ std::vector<std::vector<FileMetaData>> default_files_by_level;
+ std::vector<std::vector<FileMetaData>> pikachu_files_by_level;
+ dbfull()->TEST_GetFilesMetaData(handles_[0], &default_files_by_level);
+ dbfull()->TEST_GetFilesMetaData(handles_[1], &pikachu_files_by_level);
+
+ options.env->GetCurrentTime(&temp_time).PermitUncheckedError();
+ uint64_t end_time = static_cast<uint64_t>(temp_time);
+
+ ASSERT_EQ(all_meta.size(), 2);
+ for (int cf = 0; cf < 2; cf++) {
+ const auto& cfmd = all_meta[cf];
+ if (cf == 0) {
+ CheckColumnFamilyMeta(cfmd, "default", default_files_by_level, start_time,
+ end_time);
+ } else {
+ CheckColumnFamilyMeta(cfmd, "pikachu", pikachu_files_by_level, start_time,
+ end_time);
+ }
+ ASSERT_EQ(cfmd.blob_files.size(), 1U);
+ const auto& bmd = cfmd.blob_files[0];
+ ASSERT_EQ(cfmd.blob_file_count, 1U);
+ ASSERT_EQ(cfmd.blob_file_size, bmd.blob_file_size);
+ ASSERT_EQ(NormalizePath(bmd.blob_file_path), NormalizePath(dbname_));
+ CheckBlobMetaData(bmd, blob_file_number * (cf + 1),
+ total_blob_count * (cf + 1), total_blob_bytes * (cf + 1),
+ checksum_method, checksum_value);
+ }
+}
+
+namespace {
+void MinLevelHelper(DBTest* self, Options& options) {
+ Random rnd(301);
+
+ for (int num = 0; num < options.level0_file_num_compaction_trigger - 1;
+ num++) {
+ std::vector<std::string> values;
+ // Write 120KB (12 values, each 10K)
+ for (int i = 0; i < 12; i++) {
+ values.push_back(rnd.RandomString(10000));
+ ASSERT_OK(self->Put(DBTestBase::Key(i), values[i]));
+ }
+ ASSERT_OK(self->dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_EQ(self->NumTableFilesAtLevel(0), num + 1);
+ }
+
+  // Generate one more file in level-0, which should trigger a level-0
+  // compaction.
+ std::vector<std::string> values;
+ for (int i = 0; i < 12; i++) {
+ values.push_back(rnd.RandomString(10000));
+ ASSERT_OK(self->Put(DBTestBase::Key(i), values[i]));
+ }
+ ASSERT_OK(self->dbfull()->TEST_WaitForCompact());
+
+ ASSERT_EQ(self->NumTableFilesAtLevel(0), 0);
+ ASSERT_EQ(self->NumTableFilesAtLevel(1), 1);
+}
+
+// Returns false if the calling test should be skipped.
+bool MinLevelToCompress(CompressionType& type, Options& options, int wbits,
+ int lev, int strategy) {
+ fprintf(stderr,
+ "Test with compression options : window_bits = %d, level = %d, "
+          "strategy = %d\n",
+ wbits, lev, strategy);
+ options.write_buffer_size = 100 << 10; // 100KB
+ options.arena_block_size = 4096;
+ options.num_levels = 3;
+ options.level0_file_num_compaction_trigger = 3;
+ options.create_if_missing = true;
+
+ if (Snappy_Supported()) {
+ type = kSnappyCompression;
+ fprintf(stderr, "using snappy\n");
+ } else if (Zlib_Supported()) {
+ type = kZlibCompression;
+ fprintf(stderr, "using zlib\n");
+ } else if (BZip2_Supported()) {
+ type = kBZip2Compression;
+ fprintf(stderr, "using bzip2\n");
+ } else if (LZ4_Supported()) {
+ type = kLZ4Compression;
+ fprintf(stderr, "using lz4\n");
+ } else if (XPRESS_Supported()) {
+ type = kXpressCompression;
+ fprintf(stderr, "using xpress\n");
+ } else if (ZSTD_Supported()) {
+ type = kZSTD;
+ fprintf(stderr, "using ZSTD\n");
+ } else {
+ fprintf(stderr, "skipping test, compression disabled\n");
+ return false;
+ }
+ options.compression_per_level.resize(options.num_levels);
+
+ // do not compress L0
+ for (int i = 0; i < 1; i++) {
+ options.compression_per_level[i] = kNoCompression;
+ }
+ for (int i = 1; i < options.num_levels; i++) {
+ options.compression_per_level[i] = type;
+ }
+ return true;
+}
+} // anonymous namespace
+
+TEST_F(DBTest, MinLevelToCompress1) {
+ Options options = CurrentOptions();
+ CompressionType type = kSnappyCompression;
+ if (!MinLevelToCompress(type, options, -14, -1, 0)) {
+ return;
+ }
+ Reopen(options);
+ MinLevelHelper(this, options);
+
+ // do not compress L0 and L1
+ for (int i = 0; i < 2; i++) {
+ options.compression_per_level[i] = kNoCompression;
+ }
+ for (int i = 2; i < options.num_levels; i++) {
+ options.compression_per_level[i] = type;
+ }
+ DestroyAndReopen(options);
+ MinLevelHelper(this, options);
+}
+
+TEST_F(DBTest, MinLevelToCompress2) {
+ Options options = CurrentOptions();
+ CompressionType type = kSnappyCompression;
+ if (!MinLevelToCompress(type, options, 15, -1, 0)) {
+ return;
+ }
+ Reopen(options);
+ MinLevelHelper(this, options);
+
+ // do not compress L0 and L1
+ for (int i = 0; i < 2; i++) {
+ options.compression_per_level[i] = kNoCompression;
+ }
+ for (int i = 2; i < options.num_levels; i++) {
+ options.compression_per_level[i] = type;
+ }
+ DestroyAndReopen(options);
+ MinLevelHelper(this, options);
+}
+
+// This test may fail because of a legitimate case in which multiple L0 files
+// are trivially moved to L1.
+TEST_F(DBTest, DISABLED_RepeatedWritesToSameKey) {
+ do {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.write_buffer_size = 100000; // Small write buffer
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // We must have at most one file per level except for level-0,
+ // which may have up to kL0_StopWritesTrigger files.
+ const int kMaxFiles =
+ options.num_levels + options.level0_stop_writes_trigger;
+
+ Random rnd(301);
+ std::string value =
+ rnd.RandomString(static_cast<int>(2 * options.write_buffer_size));
+ for (int i = 0; i < 5 * kMaxFiles; i++) {
+ ASSERT_OK(Put(1, "key", value));
+ ASSERT_LE(TotalTableFiles(1), kMaxFiles);
+ }
+ } while (ChangeCompactOptions());
+}
+#endif // ROCKSDB_LITE
+
+#ifndef ROCKSDB_LITE
+static bool Between(uint64_t val, uint64_t low, uint64_t high) {
+ bool result = (val >= low) && (val <= high);
+ if (!result) {
+ fprintf(stderr, "Value %llu is not in range [%llu, %llu]\n",
+ (unsigned long long)(val), (unsigned long long)(low),
+ (unsigned long long)(high));
+ }
+ return result;
+}
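+// For example, Between(5, 1, 10) is true, while Between(0, 1, 10) prints a
+// diagnostic to stderr and returns false, which the ASSERT_TRUE callers below
+// turn into a test failure.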
+
+TEST_F(DBTest, ApproximateSizesMemTable) {
+ Options options = CurrentOptions();
+ options.write_buffer_size = 100000000; // Large write buffer
+ options.compression = kNoCompression;
+ options.create_if_missing = true;
+ DestroyAndReopen(options);
+ auto default_cf = db_->DefaultColumnFamily();
+
+ const int N = 128;
+ Random rnd(301);
+ for (int i = 0; i < N; i++) {
+ ASSERT_OK(Put(Key(i), rnd.RandomString(1024)));
+ }
+
+ uint64_t size;
+ std::string start = Key(50);
+ std::string end = Key(60);
+ Range r(start, end);
+ SizeApproximationOptions size_approx_options;
+ size_approx_options.include_memtables = true;
+ size_approx_options.include_files = true;
+ ASSERT_OK(
+ db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size));
+ ASSERT_GT(size, 6000);
+ ASSERT_LT(size, 204800);
+ // Zero if not including mem table
+ ASSERT_OK(db_->GetApproximateSizes(&r, 1, &size));
+ ASSERT_EQ(size, 0);
+
+ start = Key(500);
+ end = Key(600);
+ r = Range(start, end);
+ ASSERT_OK(
+ db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size));
+ ASSERT_EQ(size, 0);
+
+ for (int i = 0; i < N; i++) {
+ ASSERT_OK(Put(Key(1000 + i), rnd.RandomString(1024)));
+ }
+
+ start = Key(500);
+ end = Key(600);
+ r = Range(start, end);
+ ASSERT_OK(
+ db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size));
+ ASSERT_EQ(size, 0);
+
+ start = Key(100);
+ end = Key(1020);
+ r = Range(start, end);
+ ASSERT_OK(
+ db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size));
+ ASSERT_GT(size, 6000);
+
+ options.max_write_buffer_number = 8;
+ options.min_write_buffer_number_to_merge = 5;
+ options.write_buffer_size = 1024 * N; // Not very large
+ DestroyAndReopen(options);
+ default_cf = db_->DefaultColumnFamily();
+
+ int keys[N * 3];
+ for (int i = 0; i < N; i++) {
+ keys[i * 3] = i * 5;
+ keys[i * 3 + 1] = i * 5 + 1;
+ keys[i * 3 + 2] = i * 5 + 2;
+ }
+  // MemTable entry counting is estimated and can vary greatly depending on
+  // layout. Thus, use a deterministic seed for test stability.
+ RandomShuffle(std::begin(keys), std::end(keys), rnd.Next());
+
+ for (int i = 0; i < N * 3; i++) {
+ ASSERT_OK(Put(Key(keys[i] + 1000), rnd.RandomString(1024)));
+ }
+
+ start = Key(100);
+ end = Key(300);
+ r = Range(start, end);
+ ASSERT_OK(
+ db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size));
+ ASSERT_EQ(size, 0);
+
+ start = Key(1050);
+ end = Key(1080);
+ r = Range(start, end);
+ ASSERT_OK(
+ db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size));
+ ASSERT_GT(size, 6000);
+
+ start = Key(2100);
+ end = Key(2300);
+ r = Range(start, end);
+ ASSERT_OK(
+ db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size));
+ ASSERT_EQ(size, 0);
+
+ start = Key(1050);
+ end = Key(1080);
+ r = Range(start, end);
+ uint64_t size_with_mt, size_without_mt;
+ ASSERT_OK(db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1,
+ &size_with_mt));
+ ASSERT_GT(size_with_mt, 6000);
+ ASSERT_OK(db_->GetApproximateSizes(&r, 1, &size_without_mt));
+ ASSERT_EQ(size_without_mt, 0);
+
+ ASSERT_OK(Flush());
+
+ for (int i = 0; i < N; i++) {
+ ASSERT_OK(Put(Key(i + 1000), rnd.RandomString(1024)));
+ }
+
+ start = Key(1050);
+ end = Key(1080);
+ r = Range(start, end);
+ ASSERT_OK(db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1,
+ &size_with_mt));
+ ASSERT_OK(db_->GetApproximateSizes(&r, 1, &size_without_mt));
+ ASSERT_GT(size_with_mt, size_without_mt);
+ ASSERT_GT(size_without_mt, 6000);
+
+ // Check that include_memtables flag works as expected
+ size_approx_options.include_memtables = false;
+ ASSERT_OK(
+ db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size));
+ ASSERT_EQ(size, size_without_mt);
+
+ // Check that files_size_error_margin works as expected, when the heuristic
+ // conditions are not met
+ start = Key(1);
+ end = Key(1000 + N - 2);
+ r = Range(start, end);
+ size_approx_options.files_size_error_margin = -1.0; // disabled
+ ASSERT_OK(
+ db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size));
+ uint64_t size2;
+ size_approx_options.files_size_error_margin = 0.5; // enabled, but not used
+ ASSERT_OK(
+ db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size2));
+ ASSERT_EQ(size, size2);
+}
+
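+// A note on files_size_error_margin (based on the assertions in the next
+// test): when the margin is enabled and the size heuristic applies, the
+// approximate answer is only required to satisfy
+//   size * (1 - margin) < approx < size * (1 + margin)
+// relative to the precise, margin-disabled result.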
+TEST_F(DBTest, ApproximateSizesFilesWithErrorMargin) {
+ // Roughly 4 keys per data block, 1000 keys per file,
+ // with filter substantially larger than a data block
+ BlockBasedTableOptions table_options;
+ table_options.filter_policy.reset(NewBloomFilterPolicy(16));
+ table_options.block_size = 100;
+ Options options = CurrentOptions();
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ options.write_buffer_size = 24 * 1024;
+ options.compression = kNoCompression;
+ options.create_if_missing = true;
+ options.target_file_size_base = 24 * 1024;
+ DestroyAndReopen(options);
+ const auto default_cf = db_->DefaultColumnFamily();
+
+ const int N = 64000;
+ Random rnd(301);
+ for (int i = 0; i < N; i++) {
+ ASSERT_OK(Put(Key(i), rnd.RandomString(24)));
+ }
+ // Flush everything to files
+ ASSERT_OK(Flush());
+ // Compact the entire key space into the next level
+ ASSERT_OK(
+ db_->CompactRange(CompactRangeOptions(), default_cf, nullptr, nullptr));
+
+ // Write more keys
+ for (int i = N; i < (N + N / 4); i++) {
+ ASSERT_OK(Put(Key(i), rnd.RandomString(24)));
+ }
+ // Flush everything to files again
+ ASSERT_OK(Flush());
+
+ // Wait for compaction to finish
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ {
+ const std::string start = Key(0);
+ const std::string end = Key(2 * N);
+ const Range r(start, end);
+
+ SizeApproximationOptions size_approx_options;
+ size_approx_options.include_memtables = false;
+ size_approx_options.include_files = true;
+ size_approx_options.files_size_error_margin = -1.0; // disabled
+
+ // Get the precise size without any approximation heuristic
+ uint64_t size;
+ ASSERT_OK(db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1,
+ &size));
+ ASSERT_NE(size, 0);
+
+ // Get the size with an approximation heuristic
+ uint64_t size2;
+ const double error_margin = 0.2;
+ size_approx_options.files_size_error_margin = error_margin;
+ ASSERT_OK(db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1,
+ &size2));
+ ASSERT_LT(size2, size * (1 + error_margin));
+ ASSERT_GT(size2, size * (1 - error_margin));
+ }
+
+ {
+    // Ensure that metadata is not falsely attributed only to the last data in
+    // the file. (In some applications, filters can be a large portion of the
+    // data size.)
+    // Perform many queries over a small range, enough to ensure crossing a
+    // file boundary, and make sure we never see a spike for a large filter.
+ for (int i = 0; i < 3000; i += 10) {
+ const std::string start = Key(i);
+ const std::string end = Key(i + 11); // overlap by 1 key
+ const Range r(start, end);
+ uint64_t size;
+ ASSERT_OK(db_->GetApproximateSizes(&r, 1, &size));
+ ASSERT_LE(size, 11 * 100);
+ }
+ }
+}
+
+TEST_F(DBTest, GetApproximateMemTableStats) {
+ Options options = CurrentOptions();
+ options.write_buffer_size = 100000000;
+ options.compression = kNoCompression;
+ options.create_if_missing = true;
+ DestroyAndReopen(options);
+
+ const int N = 128;
+ Random rnd(301);
+ for (int i = 0; i < N; i++) {
+ ASSERT_OK(Put(Key(i), rnd.RandomString(1024)));
+ }
+
+ uint64_t count;
+ uint64_t size;
+
+ std::string start = Key(50);
+ std::string end = Key(60);
+ Range r(start, end);
+ db_->GetApproximateMemTableStats(r, &count, &size);
+ ASSERT_GT(count, 0);
+ ASSERT_LE(count, N);
+ ASSERT_GT(size, 6000);
+ ASSERT_LT(size, 204800);
+
+ start = Key(500);
+ end = Key(600);
+ r = Range(start, end);
+ db_->GetApproximateMemTableStats(r, &count, &size);
+ ASSERT_EQ(count, 0);
+ ASSERT_EQ(size, 0);
+
+ ASSERT_OK(Flush());
+
+ start = Key(50);
+ end = Key(60);
+ r = Range(start, end);
+ db_->GetApproximateMemTableStats(r, &count, &size);
+ ASSERT_EQ(count, 0);
+ ASSERT_EQ(size, 0);
+
+ for (int i = 0; i < N; i++) {
+ ASSERT_OK(Put(Key(1000 + i), rnd.RandomString(1024)));
+ }
+
+ start = Key(100);
+ end = Key(1020);
+ r = Range(start, end);
+ db_->GetApproximateMemTableStats(r, &count, &size);
+ ASSERT_GT(count, 20);
+ ASSERT_GT(size, 6000);
+}
+
+TEST_F(DBTest, ApproximateSizes) {
+ do {
+ Options options = CurrentOptions();
+ options.write_buffer_size = 100000000; // Large write buffer
+ options.compression = kNoCompression;
+ options.create_if_missing = true;
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ uint64_t size;
+ ASSERT_OK(Size("", "xyz", 1, &size));
+ ASSERT_TRUE(Between(size, 0, 0));
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ ASSERT_OK(Size("", "xyz", 1, &size));
+ ASSERT_TRUE(Between(size, 0, 0));
+
+ // Write 8MB (80 values, each 100K)
+ ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+ const int N = 80;
+ static const int S1 = 100000;
+ static const int S2 = 105000; // Allow some expansion from metadata
+ Random rnd(301);
+ for (int i = 0; i < N; i++) {
+ ASSERT_OK(Put(1, Key(i), rnd.RandomString(S1)));
+ }
+
+ // 0 because GetApproximateSizes() does not account for memtable space
+ ASSERT_OK(Size("", Key(50), 1, &size));
+ ASSERT_TRUE(Between(size, 0, 0));
+
+ // Check sizes across recovery by reopening a few times
+ for (int run = 0; run < 3; run++) {
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+ for (int compact_start = 0; compact_start < N; compact_start += 10) {
+ for (int i = 0; i < N; i += 10) {
+ ASSERT_OK(Size("", Key(i), 1, &size));
+ ASSERT_TRUE(Between(size, S1 * i, S2 * i));
+ ASSERT_OK(Size("", Key(i) + ".suffix", 1, &size));
+ ASSERT_TRUE(Between(size, S1 * (i + 1), S2 * (i + 1)));
+ ASSERT_OK(Size(Key(i), Key(i + 10), 1, &size));
+ ASSERT_TRUE(Between(size, S1 * 10, S2 * 10));
+ }
+ ASSERT_OK(Size("", Key(50), 1, &size));
+ ASSERT_TRUE(Between(size, S1 * 50, S2 * 50));
+ ASSERT_OK(Size("", Key(50) + ".suffix", 1, &size));
+ ASSERT_TRUE(Between(size, S1 * 50, S2 * 50));
+
+ std::string cstart_str = Key(compact_start);
+ std::string cend_str = Key(compact_start + 9);
+ Slice cstart = cstart_str;
+ Slice cend = cend_str;
+ ASSERT_OK(dbfull()->TEST_CompactRange(0, &cstart, &cend, handles_[1]));
+ }
+
+ ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+ ASSERT_GT(NumTableFilesAtLevel(1, 1), 0);
+ }
+ // ApproximateOffsetOf() is not yet implemented in plain table format.
+ } while (ChangeOptions(kSkipUniversalCompaction | kSkipFIFOCompaction |
+ kSkipPlainTable | kSkipHashIndex));
+}
+
+TEST_F(DBTest, ApproximateSizes_MixOfSmallAndLarge) {
+ do {
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ Random rnd(301);
+ std::string big1 = rnd.RandomString(100000);
+ ASSERT_OK(Put(1, Key(0), rnd.RandomString(10000)));
+ ASSERT_OK(Put(1, Key(1), rnd.RandomString(10000)));
+ ASSERT_OK(Put(1, Key(2), big1));
+ ASSERT_OK(Put(1, Key(3), rnd.RandomString(10000)));
+ ASSERT_OK(Put(1, Key(4), big1));
+ ASSERT_OK(Put(1, Key(5), rnd.RandomString(10000)));
+ ASSERT_OK(Put(1, Key(6), rnd.RandomString(300000)));
+ ASSERT_OK(Put(1, Key(7), rnd.RandomString(10000)));
+
+ // Check sizes across recovery by reopening a few times
+ uint64_t size;
+ for (int run = 0; run < 3; run++) {
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+ ASSERT_OK(Size("", Key(0), 1, &size));
+ ASSERT_TRUE(Between(size, 0, 0));
+ ASSERT_OK(Size("", Key(1), 1, &size));
+ ASSERT_TRUE(Between(size, 10000, 11000));
+ ASSERT_OK(Size("", Key(2), 1, &size));
+ ASSERT_TRUE(Between(size, 20000, 21000));
+ ASSERT_OK(Size("", Key(3), 1, &size));
+ ASSERT_TRUE(Between(size, 120000, 121000));
+ ASSERT_OK(Size("", Key(4), 1, &size));
+ ASSERT_TRUE(Between(size, 130000, 131000));
+ ASSERT_OK(Size("", Key(5), 1, &size));
+ ASSERT_TRUE(Between(size, 230000, 232000));
+ ASSERT_OK(Size("", Key(6), 1, &size));
+ ASSERT_TRUE(Between(size, 240000, 242000));
+ // Ensure some overhead is accounted for, even without including all
+ ASSERT_OK(Size("", Key(7), 1, &size));
+ ASSERT_TRUE(Between(size, 540500, 545000));
+ ASSERT_OK(Size("", Key(8), 1, &size));
+ ASSERT_TRUE(Between(size, 550500, 555000));
+
+ ASSERT_OK(Size(Key(3), Key(5), 1, &size));
+ ASSERT_TRUE(Between(size, 110100, 111000));
+
+ ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]));
+ }
+ // ApproximateOffsetOf() is not yet implemented in plain table format.
+ } while (ChangeOptions(kSkipPlainTable));
+}
+#endif // ROCKSDB_LITE
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest, Snapshot) {
+ env_->SetMockSleep();
+ anon::OptionsOverride options_override;
+ options_override.skip_policy = kSkipNoSnapshot;
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions(options_override));
+ ASSERT_OK(Put(0, "foo", "0v1"));
+ ASSERT_OK(Put(1, "foo", "1v1"));
+
+ const Snapshot* s1 = db_->GetSnapshot();
+ ASSERT_EQ(1U, GetNumSnapshots());
+ uint64_t time_snap1 = GetTimeOldestSnapshots();
+ ASSERT_GT(time_snap1, 0U);
+ ASSERT_EQ(GetSequenceOldestSnapshots(), s1->GetSequenceNumber());
+ ASSERT_EQ(GetTimeOldestSnapshots(),
+ static_cast<uint64_t>(s1->GetUnixTime()));
+ ASSERT_OK(Put(0, "foo", "0v2"));
+ ASSERT_OK(Put(1, "foo", "1v2"));
+
+ env_->MockSleepForSeconds(1);
+
+ const Snapshot* s2 = db_->GetSnapshot();
+ ASSERT_EQ(2U, GetNumSnapshots());
+ ASSERT_EQ(time_snap1, GetTimeOldestSnapshots());
+ ASSERT_EQ(GetSequenceOldestSnapshots(), s1->GetSequenceNumber());
+ ASSERT_EQ(GetTimeOldestSnapshots(),
+ static_cast<uint64_t>(s1->GetUnixTime()));
+ ASSERT_OK(Put(0, "foo", "0v3"));
+ ASSERT_OK(Put(1, "foo", "1v3"));
+
+ {
+ ManagedSnapshot s3(db_);
+ ASSERT_EQ(3U, GetNumSnapshots());
+ ASSERT_EQ(time_snap1, GetTimeOldestSnapshots());
+ ASSERT_EQ(GetSequenceOldestSnapshots(), s1->GetSequenceNumber());
+ ASSERT_EQ(GetTimeOldestSnapshots(),
+ static_cast<uint64_t>(s1->GetUnixTime()));
+
+ ASSERT_OK(Put(0, "foo", "0v4"));
+ ASSERT_OK(Put(1, "foo", "1v4"));
+ ASSERT_EQ("0v1", Get(0, "foo", s1));
+ ASSERT_EQ("1v1", Get(1, "foo", s1));
+ ASSERT_EQ("0v2", Get(0, "foo", s2));
+ ASSERT_EQ("1v2", Get(1, "foo", s2));
+ ASSERT_EQ("0v3", Get(0, "foo", s3.snapshot()));
+ ASSERT_EQ("1v3", Get(1, "foo", s3.snapshot()));
+ ASSERT_EQ("0v4", Get(0, "foo"));
+ ASSERT_EQ("1v4", Get(1, "foo"));
+ }
+
+ ASSERT_EQ(2U, GetNumSnapshots());
+ ASSERT_EQ(time_snap1, GetTimeOldestSnapshots());
+ ASSERT_EQ(GetSequenceOldestSnapshots(), s1->GetSequenceNumber());
+ ASSERT_EQ(GetTimeOldestSnapshots(),
+ static_cast<uint64_t>(s1->GetUnixTime()));
+ ASSERT_EQ("0v1", Get(0, "foo", s1));
+ ASSERT_EQ("1v1", Get(1, "foo", s1));
+ ASSERT_EQ("0v2", Get(0, "foo", s2));
+ ASSERT_EQ("1v2", Get(1, "foo", s2));
+ ASSERT_EQ("0v4", Get(0, "foo"));
+ ASSERT_EQ("1v4", Get(1, "foo"));
+
+ db_->ReleaseSnapshot(s1);
+ ASSERT_EQ("0v2", Get(0, "foo", s2));
+ ASSERT_EQ("1v2", Get(1, "foo", s2));
+ ASSERT_EQ("0v4", Get(0, "foo"));
+ ASSERT_EQ("1v4", Get(1, "foo"));
+ ASSERT_EQ(1U, GetNumSnapshots());
+ ASSERT_LT(time_snap1, GetTimeOldestSnapshots());
+ ASSERT_EQ(GetSequenceOldestSnapshots(), s2->GetSequenceNumber());
+ ASSERT_EQ(GetTimeOldestSnapshots(),
+ static_cast<uint64_t>(s2->GetUnixTime()));
+
+ db_->ReleaseSnapshot(s2);
+ ASSERT_EQ(0U, GetNumSnapshots());
+ ASSERT_EQ(GetSequenceOldestSnapshots(), 0);
+ ASSERT_EQ("0v4", Get(0, "foo"));
+ ASSERT_EQ("1v4", Get(1, "foo"));
+ } while (ChangeOptions());
+}
+
+TEST_F(DBTest, HiddenValuesAreRemoved) {
+ anon::OptionsOverride options_override;
+ options_override.skip_policy = kSkipNoSnapshot;
+ uint64_t size;
+ do {
+ Options options = CurrentOptions(options_override);
+ CreateAndReopenWithCF({"pikachu"}, options);
+ Random rnd(301);
+ FillLevels("a", "z", 1);
+
+ std::string big = rnd.RandomString(50000);
+ ASSERT_OK(Put(1, "foo", big));
+ ASSERT_OK(Put(1, "pastfoo", "v"));
+ const Snapshot* snapshot = db_->GetSnapshot();
+ ASSERT_OK(Put(1, "foo", "tiny"));
+ ASSERT_OK(Put(1, "pastfoo2", "v2")); // Advance sequence number one more
+
+ ASSERT_OK(Flush(1));
+ ASSERT_GT(NumTableFilesAtLevel(0, 1), 0);
+
+ ASSERT_EQ(big, Get(1, "foo", snapshot));
+ ASSERT_OK(Size("", "pastfoo", 1, &size));
+ ASSERT_TRUE(Between(size, 50000, 60000));
+ db_->ReleaseSnapshot(snapshot);
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ tiny, " + big + " ]");
+ Slice x("x");
+ ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, &x, handles_[1]));
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ tiny ]");
+ ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+ ASSERT_GE(NumTableFilesAtLevel(1, 1), 1);
+ ASSERT_OK(dbfull()->TEST_CompactRange(1, nullptr, &x, handles_[1]));
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ tiny ]");
+
+ ASSERT_OK(Size("", "pastfoo", 1, &size));
+ ASSERT_TRUE(Between(size, 0, 1000));
+    // ApproximateOffsetOf(), which is used by Size(), is not yet implemented
+    // in plain table format.
+ } while (ChangeOptions(kSkipUniversalCompaction | kSkipFIFOCompaction |
+ kSkipPlainTable));
+}
+#endif // ROCKSDB_LITE
+
+TEST_F(DBTest, UnremovableSingleDelete) {
+ // If we compact:
+ //
+ // Put(A, v1) Snapshot SingleDelete(A) Put(A, v2)
+ //
+ // We do not want to end up with:
+ //
+ // Put(A, v1) Snapshot Put(A, v2)
+ //
+  // A subsequent SingleDelete(A) would then delete the Put(A, v2) but not the
+  // Put(A, v1), so Get(A) would return v1.
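+  //
+  // Concretely (illustrative): if compaction dropped the SingleDelete while
+  // the snapshot still pinned v1, the entry stream for A would become
+  // "[ v2, v1 ]" instead of "[ v2, SDEL, v1 ]", and a later SingleDelete(A)
+  // would expose v1 again. The AllEntriesFor() assertion below checks for
+  // the "[ second, SDEL, first ]" form.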
+ anon::OptionsOverride options_override;
+ options_override.skip_policy = kSkipNoSnapshot;
+ do {
+ Options options = CurrentOptions(options_override);
+ options.disable_auto_compactions = true;
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ ASSERT_OK(Put(1, "foo", "first"));
+ const Snapshot* snapshot = db_->GetSnapshot();
+ ASSERT_OK(SingleDelete(1, "foo"));
+ ASSERT_OK(Put(1, "foo", "second"));
+ ASSERT_OK(Flush(1));
+
+ ASSERT_EQ("first", Get(1, "foo", snapshot));
+ ASSERT_EQ("second", Get(1, "foo"));
+
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[1],
+ nullptr, nullptr));
+ ASSERT_EQ("[ second, SDEL, first ]", AllEntriesFor("foo", 1));
+
+ ASSERT_OK(SingleDelete(1, "foo"));
+
+ ASSERT_EQ("first", Get(1, "foo", snapshot));
+ ASSERT_EQ("NOT_FOUND", Get(1, "foo"));
+
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[1],
+ nullptr, nullptr));
+
+ ASSERT_EQ("first", Get(1, "foo", snapshot));
+ ASSERT_EQ("NOT_FOUND", Get(1, "foo"));
+ db_->ReleaseSnapshot(snapshot);
+ // Skip FIFO and universal compaction because they do not apply to the test
+ // case. Skip MergePut because single delete does not get removed when it
+ // encounters a merge.
+ } while (ChangeOptions(kSkipFIFOCompaction | kSkipUniversalCompaction |
+ kSkipMergePut));
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest, DeletionMarkers1) {
+ Options options = CurrentOptions();
+ CreateAndReopenWithCF({"pikachu"}, options);
+ ASSERT_OK(Put(1, "foo", "v1"));
+ ASSERT_OK(Flush(1));
+ const int last = 2;
+ MoveFilesToLevel(last, 1);
+ // foo => v1 is now in last level
+ ASSERT_EQ(NumTableFilesAtLevel(last, 1), 1);
+
+ // Place a table at level last-1 to prevent merging with preceding mutation
+ ASSERT_OK(Put(1, "a", "begin"));
+ ASSERT_OK(Put(1, "z", "end"));
+ ASSERT_OK(Flush(1));
+ MoveFilesToLevel(last - 1, 1);
+ ASSERT_EQ(NumTableFilesAtLevel(last, 1), 1);
+ ASSERT_EQ(NumTableFilesAtLevel(last - 1, 1), 1);
+
+ ASSERT_OK(Delete(1, "foo"));
+ ASSERT_OK(Put(1, "foo", "v2"));
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2, DEL, v1 ]");
+ ASSERT_OK(Flush(1)); // Moves to level last-2
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2, v1 ]");
+ Slice z("z");
+ ASSERT_OK(dbfull()->TEST_CompactRange(last - 2, nullptr, &z, handles_[1]));
+ // DEL eliminated, but v1 remains because we aren't compacting that level
+ // (DEL can be eliminated because v2 hides v1).
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2, v1 ]");
+ ASSERT_OK(
+ dbfull()->TEST_CompactRange(last - 1, nullptr, nullptr, handles_[1]));
+  // Merging last-1 with last makes this the base level for "foo", so the
+  // DEL is removed (as is v1).
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2 ]");
+}
+
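+// Same setup as DeletionMarkers1, but "foo" is not re-inserted after the
+// Delete: the DEL marker must survive the intermediate compaction because the
+// "last"-level file still holds v1, and only disappears once the ranges merge.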
+TEST_F(DBTest, DeletionMarkers2) {
+ Options options = CurrentOptions();
+ CreateAndReopenWithCF({"pikachu"}, options);
+ ASSERT_OK(Put(1, "foo", "v1"));
+ ASSERT_OK(Flush(1));
+ const int last = 2;
+ MoveFilesToLevel(last, 1);
+ // foo => v1 is now in last level
+ ASSERT_EQ(NumTableFilesAtLevel(last, 1), 1);
+
+ // Place a table at level last-1 to prevent merging with preceding mutation
+ ASSERT_OK(Put(1, "a", "begin"));
+ ASSERT_OK(Put(1, "z", "end"));
+ ASSERT_OK(Flush(1));
+ MoveFilesToLevel(last - 1, 1);
+ ASSERT_EQ(NumTableFilesAtLevel(last, 1), 1);
+ ASSERT_EQ(NumTableFilesAtLevel(last - 1, 1), 1);
+
+ ASSERT_OK(Delete(1, "foo"));
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, v1 ]");
+ ASSERT_OK(Flush(1)); // Moves to level last-2
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, v1 ]");
+ ASSERT_OK(
+ dbfull()->TEST_CompactRange(last - 2, nullptr, nullptr, handles_[1]));
+ // DEL kept: "last" file overlaps
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, v1 ]");
+ ASSERT_OK(
+ dbfull()->TEST_CompactRange(last - 1, nullptr, nullptr, handles_[1]));
+  // Merging last-1 with last makes this the base level for "foo", so the
+  // DEL is removed (as is v1).
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ ]");
+}
+
+TEST_F(DBTest, OverlapInLevel0) {
+ do {
+ Options options = CurrentOptions();
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // Fill levels 1 and 2 to disable the pushing of new memtables to levels >
+ // 0.
+ ASSERT_OK(Put(1, "100", "v100"));
+ ASSERT_OK(Put(1, "999", "v999"));
+ ASSERT_OK(Flush(1));
+ MoveFilesToLevel(2, 1);
+ ASSERT_OK(Delete(1, "100"));
+ ASSERT_OK(Delete(1, "999"));
+ ASSERT_OK(Flush(1));
+ MoveFilesToLevel(1, 1);
+ ASSERT_EQ("0,1,1", FilesPerLevel(1));
+
+ // Make files spanning the following ranges in level-0:
+ // files[0] 200 .. 900
+ // files[1] 300 .. 500
+ // Note that files are sorted by smallest key.
+ ASSERT_OK(Put(1, "300", "v300"));
+ ASSERT_OK(Put(1, "500", "v500"));
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(Put(1, "200", "v200"));
+ ASSERT_OK(Put(1, "600", "v600"));
+ ASSERT_OK(Put(1, "900", "v900"));
+ ASSERT_OK(Flush(1));
+ ASSERT_EQ("2,1,1", FilesPerLevel(1));
+
+ // BEGIN addition to existing test
+ // Take this opportunity to verify SST unique ids (including Plain table)
+ TablePropertiesCollection tbc;
+ ASSERT_OK(db_->GetPropertiesOfAllTables(handles_[1], &tbc));
+ VerifySstUniqueIds(tbc);
+ // END addition to existing test
+
+ // Compact away the placeholder files we created initially
+ ASSERT_OK(dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]));
+ ASSERT_OK(dbfull()->TEST_CompactRange(2, nullptr, nullptr, handles_[1]));
+ ASSERT_EQ("2", FilesPerLevel(1));
+
+    // Do a memtable compaction. Before the bug fix, the compaction would
+ // not detect the overlap with level-0 files and would incorrectly place
+ // the deletion in a deeper level.
+ ASSERT_OK(Delete(1, "600"));
+ ASSERT_OK(Flush(1));
+ ASSERT_EQ("3", FilesPerLevel(1));
+ ASSERT_EQ("NOT_FOUND", Get(1, "600"));
+ } while (ChangeOptions(kSkipUniversalCompaction | kSkipFIFOCompaction));
+}
+#endif // ROCKSDB_LITE
+
+TEST_F(DBTest, ComparatorCheck) {
+ class NewComparator : public Comparator {
+ public:
+ const char* Name() const override { return "rocksdb.NewComparator"; }
+ int Compare(const Slice& a, const Slice& b) const override {
+ return BytewiseComparator()->Compare(a, b);
+ }
+ void FindShortestSeparator(std::string* s, const Slice& l) const override {
+ BytewiseComparator()->FindShortestSeparator(s, l);
+ }
+ void FindShortSuccessor(std::string* key) const override {
+ BytewiseComparator()->FindShortSuccessor(key);
+ }
+ };
+ Options new_options, options;
+ NewComparator cmp;
+ do {
+ options = CurrentOptions();
+ CreateAndReopenWithCF({"pikachu"}, options);
+ new_options = CurrentOptions();
+ new_options.comparator = &cmp;
+    // Only the non-default column family has a non-matching comparator.
+ Status s = TryReopenWithColumnFamilies(
+ {"default", "pikachu"}, std::vector<Options>({options, new_options}));
+ ASSERT_TRUE(!s.ok());
+ ASSERT_TRUE(s.ToString().find("comparator") != std::string::npos)
+ << s.ToString();
+ } while (ChangeCompactOptions());
+}
+
+TEST_F(DBTest, CustomComparator) {
+ class NumberComparator : public Comparator {
+ public:
+ const char* Name() const override { return "test.NumberComparator"; }
+ int Compare(const Slice& a, const Slice& b) const override {
+ return ToNumber(a) - ToNumber(b);
+ }
+ void FindShortestSeparator(std::string* s, const Slice& l) const override {
+ ToNumber(*s); // Check format
+ ToNumber(l); // Check format
+ }
+ void FindShortSuccessor(std::string* key) const override {
+ ToNumber(*key); // Check format
+ }
+
+ private:
+ static int ToNumber(const Slice& x) {
+ // Check that there are no extra characters.
+ EXPECT_TRUE(x.size() >= 2 && x[0] == '[' && x[x.size() - 1] == ']')
+ << EscapeString(x);
+ int val;
+ char ignored;
+ EXPECT_TRUE(sscanf(x.ToString().c_str(), "[%i]%c", &val, &ignored) == 1)
+ << EscapeString(x);
+ return val;
+ }
+ };
+ Options new_options;
+ NumberComparator cmp;
+ do {
+ new_options = CurrentOptions();
+ new_options.create_if_missing = true;
+ new_options.comparator = &cmp;
+ new_options.write_buffer_size = 4096; // Compact more often
+ new_options.arena_block_size = 4096;
+ new_options = CurrentOptions(new_options);
+ DestroyAndReopen(new_options);
+ CreateAndReopenWithCF({"pikachu"}, new_options);
+ ASSERT_OK(Put(1, "[10]", "ten"));
+ ASSERT_OK(Put(1, "[0x14]", "twenty"));
+ for (int i = 0; i < 2; i++) {
+ ASSERT_EQ("ten", Get(1, "[10]"));
+ ASSERT_EQ("ten", Get(1, "[0xa]"));
+ ASSERT_EQ("twenty", Get(1, "[20]"));
+ ASSERT_EQ("twenty", Get(1, "[0x14]"));
+ ASSERT_EQ("NOT_FOUND", Get(1, "[15]"));
+ ASSERT_EQ("NOT_FOUND", Get(1, "[0xf]"));
+ Compact(1, "[0]", "[9999]");
+ }
+
+ for (int run = 0; run < 2; run++) {
+ for (int i = 0; i < 1000; i++) {
+ char buf[100];
+ snprintf(buf, sizeof(buf), "[%d]", i * 10);
+ ASSERT_OK(Put(1, buf, buf));
+ }
+ Compact(1, "[0]", "[1000000]");
+ }
+ } while (ChangeCompactOptions());
+}
+
+TEST_F(DBTest, DBOpen_Options) {
+ Options options = CurrentOptions();
+ std::string dbname = test::PerThreadDBPath("db_options_test");
+ ASSERT_OK(DestroyDB(dbname, options));
+
+ // Does not exist, and create_if_missing == false: error
+ DB* db = nullptr;
+ options.create_if_missing = false;
+ Status s = DB::Open(options, dbname, &db);
+ ASSERT_TRUE(strstr(s.ToString().c_str(), "does not exist") != nullptr);
+ ASSERT_TRUE(db == nullptr);
+
+ // Does not exist, and create_if_missing == true: OK
+ options.create_if_missing = true;
+ s = DB::Open(options, dbname, &db);
+ ASSERT_OK(s);
+ ASSERT_TRUE(db != nullptr);
+
+ delete db;
+ db = nullptr;
+
+ // Does exist, and error_if_exists == true: error
+ options.create_if_missing = false;
+ options.error_if_exists = true;
+ s = DB::Open(options, dbname, &db);
+ ASSERT_TRUE(strstr(s.ToString().c_str(), "exists") != nullptr);
+ ASSERT_TRUE(db == nullptr);
+
+ // Does exist, and error_if_exists == false: OK
+ options.create_if_missing = true;
+ options.error_if_exists = false;
+ s = DB::Open(options, dbname, &db);
+ ASSERT_OK(s);
+ ASSERT_TRUE(db != nullptr);
+
+ delete db;
+ db = nullptr;
+}
+
+TEST_F(DBTest, DBOpen_Change_NumLevels) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ DestroyAndReopen(options);
+ ASSERT_TRUE(db_ != nullptr);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ ASSERT_OK(Put(1, "a", "123"));
+ ASSERT_OK(Put(1, "b", "234"));
+ ASSERT_OK(Flush(1));
+ MoveFilesToLevel(3, 1);
+ Close();
+
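+  // Reopening with fewer levels than the highest populated level (the file
+  // was moved to level 3 above) should fail with an InvalidArgument status.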
+ options.create_if_missing = false;
+ options.num_levels = 2;
+ Status s = TryReopenWithColumnFamilies({"default", "pikachu"}, options);
+ ASSERT_TRUE(strstr(s.ToString().c_str(), "Invalid argument") != nullptr);
+ ASSERT_TRUE(db_ == nullptr);
+}
+
+TEST_F(DBTest, DestroyDBMetaDatabase) {
+ std::string dbname = test::PerThreadDBPath("db_meta");
+ ASSERT_OK(env_->CreateDirIfMissing(dbname));
+ std::string metadbname = MetaDatabaseName(dbname, 0);
+ ASSERT_OK(env_->CreateDirIfMissing(metadbname));
+ std::string metametadbname = MetaDatabaseName(metadbname, 0);
+ ASSERT_OK(env_->CreateDirIfMissing(metametadbname));
+
+  // Destroy previous versions if they exist, using the long way.
+ Options options = CurrentOptions();
+ ASSERT_OK(DestroyDB(metametadbname, options));
+ ASSERT_OK(DestroyDB(metadbname, options));
+ ASSERT_OK(DestroyDB(dbname, options));
+
+ // Setup databases
+ DB* db = nullptr;
+ ASSERT_OK(DB::Open(options, dbname, &db));
+ delete db;
+ db = nullptr;
+ ASSERT_OK(DB::Open(options, metadbname, &db));
+ delete db;
+ db = nullptr;
+ ASSERT_OK(DB::Open(options, metametadbname, &db));
+ delete db;
+ db = nullptr;
+
+ // Delete databases
+ ASSERT_OK(DestroyDB(dbname, options));
+
+ // Check if deletion worked.
+ options.create_if_missing = false;
+ ASSERT_TRUE(!(DB::Open(options, dbname, &db)).ok());
+ ASSERT_TRUE(!(DB::Open(options, metadbname, &db)).ok());
+ ASSERT_TRUE(!(DB::Open(options, metametadbname, &db)).ok());
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest, SnapshotFiles) {
+ do {
+ Options options = CurrentOptions();
+ options.write_buffer_size = 100000000; // Large write buffer
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ Random rnd(301);
+
+ // Write 8MB (80 values, each 100K)
+ ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+ std::vector<std::string> values;
+ for (int i = 0; i < 80; i++) {
+ values.push_back(rnd.RandomString(100000));
+ ASSERT_OK(Put((i < 40), Key(i), values[i]));
+ }
+
+    // Assert that nothing has made it to disk yet.
+ ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+
+ // get a file snapshot
+ uint64_t manifest_number = 0;
+ uint64_t manifest_size = 0;
+ std::vector<std::string> files;
+ ASSERT_OK(dbfull()->DisableFileDeletions());
+ ASSERT_OK(dbfull()->GetLiveFiles(files, &manifest_size));
+
+ // CURRENT, MANIFEST, OPTIONS, *.sst files (one for each CF)
+ ASSERT_EQ(files.size(), 5U);
+
+ uint64_t number = 0;
+ FileType type;
+
+ // copy these files to a new snapshot directory
+ std::string snapdir = dbname_ + ".snapdir/";
+ if (env_->FileExists(snapdir).ok()) {
+ ASSERT_OK(DestroyDir(env_, snapdir));
+ }
+ ASSERT_OK(env_->CreateDir(snapdir));
+
+ for (size_t i = 0; i < files.size(); i++) {
+      // Our clients require that GetLiveFiles returns
+      // file names with "/" as the first character.
+ ASSERT_EQ(files[i][0], '/');
+ std::string src = dbname_ + files[i];
+ std::string dest = snapdir + files[i];
+
+ uint64_t size;
+ ASSERT_OK(env_->GetFileSize(src, &size));
+
+ // record the number and the size of the
+ // latest manifest file
+ if (ParseFileName(files[i].substr(1), &number, &type)) {
+ if (type == kDescriptorFile) {
+ ASSERT_EQ(manifest_number, 0);
+ manifest_number = number;
+ ASSERT_GE(size, manifest_size);
+ size = manifest_size; // copy only valid MANIFEST data
+ }
+ }
+ CopyFile(src, dest, size);
+ }
+
+ // release file snapshot
+ ASSERT_OK(dbfull()->EnableFileDeletions(/*force*/ false));
+    // Overwrite one key; this key should not appear in the snapshot.
+ std::vector<std::string> extras;
+ for (unsigned int i = 0; i < 1; i++) {
+ extras.push_back(rnd.RandomString(100000));
+ ASSERT_OK(Put(0, Key(i), extras[i]));
+ }
+
+    // Verify that the data in the snapshot is correct.
+ std::vector<ColumnFamilyDescriptor> column_families;
+ column_families.emplace_back("default", ColumnFamilyOptions());
+ column_families.emplace_back("pikachu", ColumnFamilyOptions());
+ std::vector<ColumnFamilyHandle*> cf_handles;
+ DB* snapdb;
+ DBOptions opts;
+ opts.env = env_;
+ opts.create_if_missing = false;
+ Status stat =
+ DB::Open(opts, snapdir, column_families, &cf_handles, &snapdb);
+ ASSERT_OK(stat);
+
+ ReadOptions roptions;
+ std::string val;
+ for (unsigned int i = 0; i < 80; i++) {
+ ASSERT_OK(snapdb->Get(roptions, cf_handles[i < 40], Key(i), &val));
+ ASSERT_EQ(values[i].compare(val), 0);
+ }
+ for (auto cfh : cf_handles) {
+ delete cfh;
+ }
+ delete snapdb;
+
+ // look at the new live files after we added an 'extra' key
+ // and after we took the first snapshot.
+ uint64_t new_manifest_number = 0;
+ uint64_t new_manifest_size = 0;
+ std::vector<std::string> newfiles;
+ ASSERT_OK(dbfull()->DisableFileDeletions());
+ ASSERT_OK(dbfull()->GetLiveFiles(newfiles, &new_manifest_size));
+
+    // Find the new manifest file. Assert that this manifest file is
+    // the same one as in the previous snapshot, but its size should be
+    // larger because we added an extra key after taking the
+    // previous snapshot.
+ for (size_t i = 0; i < newfiles.size(); i++) {
+ std::string src = dbname_ + "/" + newfiles[i];
+      // Record the number and the size of the
+      // latest manifest file.
+ if (ParseFileName(newfiles[i].substr(1), &number, &type)) {
+ if (type == kDescriptorFile) {
+ ASSERT_EQ(new_manifest_number, 0);
+ uint64_t size;
+ new_manifest_number = number;
+ ASSERT_OK(env_->GetFileSize(src, &size));
+ ASSERT_GE(size, new_manifest_size);
+ }
+ }
+ }
+ ASSERT_EQ(manifest_number, new_manifest_number);
+ ASSERT_GT(new_manifest_size, manifest_size);
+
+ // Also test GetLiveFilesStorageInfo
+ std::vector<LiveFileStorageInfo> new_infos;
+ ASSERT_OK(db_->GetLiveFilesStorageInfo(LiveFilesStorageInfoOptions(),
+ &new_infos));
+
+ // Close DB (while deletions disabled)
+ Close();
+
+ // Validate
+ for (auto& info : new_infos) {
+ std::string path = info.directory + "/" + info.relative_filename;
+ uint64_t size;
+ ASSERT_OK(env_->GetFileSize(path, &size));
+ if (info.trim_to_size) {
+ ASSERT_LE(info.size, size);
+ } else if (!info.replacement_contents.empty()) {
+ ASSERT_EQ(info.size, info.replacement_contents.size());
+ } else {
+ ASSERT_EQ(info.size, size);
+ }
+ if (info.file_type == kDescriptorFile) {
+ ASSERT_EQ(info.file_number, manifest_number);
+ }
+ }
+ } while (ChangeCompactOptions());
+}
+
+TEST_F(DBTest, ReadonlyDBGetLiveManifestSize) {
+ do {
+ Options options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = 2;
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ Close();
+ ASSERT_OK(ReadOnlyReopen(options));
+
+ uint64_t manifest_size = 0;
+ std::vector<std::string> files;
+ ASSERT_OK(dbfull()->GetLiveFiles(files, &manifest_size));
+
+ for (const std::string& f : files) {
+ uint64_t number = 0;
+ FileType type;
+ if (ParseFileName(f.substr(1), &number, &type)) {
+ if (type == kDescriptorFile) {
+ uint64_t size_on_disk;
+ ASSERT_OK(env_->GetFileSize(dbname_ + "/" + f, &size_on_disk));
+ ASSERT_EQ(manifest_size, size_on_disk);
+ break;
+ }
+ }
+ }
+ Close();
+ } while (ChangeCompactOptions());
+}
+
+TEST_F(DBTest, GetLiveBlobFiles) {
+ // Note: the following prevents an otherwise harmless data race between the
+ // test setup code (AddBlobFile) below and the periodic stat dumping thread.
+ Options options = CurrentOptions();
+ options.stats_dump_period_sec = 0;
+
+ constexpr uint64_t blob_file_number = 234;
+ constexpr uint64_t total_blob_count = 555;
+ constexpr uint64_t total_blob_bytes = 66666;
+ constexpr char checksum_method[] = "CRC32";
+ constexpr char checksum_value[] = "\x3d\x87\xff\x57";
+ constexpr uint64_t garbage_blob_count = 0;
+ constexpr uint64_t garbage_blob_bytes = 0;
+
+ Reopen(options);
+
+ AddBlobFile(db_->DefaultColumnFamily(), blob_file_number, total_blob_count,
+ total_blob_bytes, checksum_method, checksum_value,
+ garbage_blob_count, garbage_blob_bytes);
+ // Make sure it appears in the results returned by GetLiveFiles.
+ uint64_t manifest_size = 0;
+ std::vector<std::string> files;
+ ASSERT_OK(dbfull()->GetLiveFiles(files, &manifest_size));
+
+ ASSERT_FALSE(files.empty());
+ ASSERT_EQ(files[0], BlobFileName("", blob_file_number));
+
+ ColumnFamilyMetaData cfmd;
+
+ db_->GetColumnFamilyMetaData(&cfmd);
+ ASSERT_EQ(cfmd.blob_files.size(), 1);
+ const BlobMetaData& bmd = cfmd.blob_files[0];
+
+ CheckBlobMetaData(bmd, blob_file_number, total_blob_count, total_blob_bytes,
+ checksum_method, checksum_value, garbage_blob_count,
+ garbage_blob_bytes);
+ ASSERT_EQ(NormalizePath(bmd.blob_file_path), NormalizePath(dbname_));
+ ASSERT_EQ(cfmd.blob_file_count, 1U);
+ ASSERT_EQ(cfmd.blob_file_size, bmd.blob_file_size);
+}
+#endif // ROCKSDB_LITE
+
+TEST_F(DBTest, PurgeInfoLogs) {
+ Options options = CurrentOptions();
+ options.keep_log_file_num = 5;
+ options.create_if_missing = true;
+ options.env = env_;
+ for (int mode = 0; mode <= 1; mode++) {
+ if (mode == 1) {
+ options.db_log_dir = dbname_ + "_logs";
+ ASSERT_OK(env_->CreateDirIfMissing(options.db_log_dir));
+ } else {
+ options.db_log_dir = "";
+ }
+ for (int i = 0; i < 8; i++) {
+ Reopen(options);
+ }
+
+ std::vector<std::string> files;
+ ASSERT_OK(env_->GetChildren(
+ options.db_log_dir.empty() ? dbname_ : options.db_log_dir, &files));
+ int info_log_count = 0;
+ for (std::string file : files) {
+ if (file.find("LOG") != std::string::npos) {
+ info_log_count++;
+ }
+ }
+ ASSERT_EQ(5, info_log_count);
+
+ Destroy(options);
+    // For mode 0, test that DestroyDB() deletes all the logs under the DB dir.
+    // For mode 1, no info log file should have been put under the DB dir.
+    // Since dbname_ has no children, there is no need to loop over db_files.
+ std::vector<std::string> db_files;
+ ASSERT_TRUE(env_->GetChildren(dbname_, &db_files).IsNotFound());
+ ASSERT_TRUE(db_files.empty());
+
+ if (mode == 1) {
+ // Cleaning up
+ ASSERT_OK(env_->GetChildren(options.db_log_dir, &files));
+ for (std::string file : files) {
+ ASSERT_OK(env_->DeleteFile(options.db_log_dir + "/" + file));
+ }
+ ASSERT_OK(env_->DeleteDir(options.db_log_dir));
+ }
+ }
+}
+
+#ifndef ROCKSDB_LITE
+// Multi-threaded test:
+namespace {
+
+static const int kColumnFamilies = 10;
+static const int kNumThreads = 10;
+static const int kTestSeconds = 10;
+static const int kNumKeys = 1000;
+
+struct MTState {
+ DBTest* test;
+ std::atomic<int> counter[kNumThreads];
+};
+
+struct MTThread {
+ MTState* state;
+ int id;
+ bool multiget_batched;
+};
+
+static void MTThreadBody(void* arg) {
+ MTThread* t = reinterpret_cast<MTThread*>(arg);
+ int id = t->id;
+ DB* db = t->state->test->db_;
+ int counter = 0;
+ std::shared_ptr<SystemClock> clock = SystemClock::Default();
+ auto end_micros = clock->NowMicros() + kTestSeconds * 1000000U;
+
+ fprintf(stderr, "... starting thread %d\n", id);
+ Random rnd(1000 + id);
+ char valbuf[1500];
+ while (clock->NowMicros() < end_micros) {
+ t->state->counter[id].store(counter, std::memory_order_release);
+
+ int key = rnd.Uniform(kNumKeys);
+ char keybuf[20];
+ snprintf(keybuf, sizeof(keybuf), "%016d", key);
+
+ if (rnd.OneIn(2)) {
+      // Write values of the form <key, my id, counter, cf, unique_id>
+      // into each of the CFs.
+      // We add some padding to force compactions.
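+      // (Illustrative: key 7 written by thread 2 at counter 13 into cf 4 with
+      // unique_id 99 is stored as "7.2.13.4.99", the last field left-justified
+      // and space-padded to width 1000 by the "%-1000d" format below.)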
+ int unique_id = rnd.Uniform(1000000);
+
+ // Half of the time directly use WriteBatch. Half of the time use
+ // WriteBatchWithIndex.
+ if (rnd.OneIn(2)) {
+ WriteBatch batch;
+ for (int cf = 0; cf < kColumnFamilies; ++cf) {
+ snprintf(valbuf, sizeof(valbuf), "%d.%d.%d.%d.%-1000d", key, id,
+ static_cast<int>(counter), cf, unique_id);
+ ASSERT_OK(batch.Put(t->state->test->handles_[cf], Slice(keybuf),
+ Slice(valbuf)));
+ }
+ ASSERT_OK(db->Write(WriteOptions(), &batch));
+ } else {
+ WriteBatchWithIndex batch(db->GetOptions().comparator);
+ for (int cf = 0; cf < kColumnFamilies; ++cf) {
+ snprintf(valbuf, sizeof(valbuf), "%d.%d.%d.%d.%-1000d", key, id,
+ static_cast<int>(counter), cf, unique_id);
+ ASSERT_OK(batch.Put(t->state->test->handles_[cf], Slice(keybuf),
+ Slice(valbuf)));
+ }
+ ASSERT_OK(db->Write(WriteOptions(), batch.GetWriteBatch()));
+ }
+ } else {
+ // Read a value and verify that it matches the pattern written above
+ // and that writes to all column families were atomic (unique_id is the
+ // same)
+ std::vector<Slice> keys(kColumnFamilies, Slice(keybuf));
+ std::vector<std::string> values;
+ std::vector<Status> statuses;
+ if (!t->multiget_batched) {
+ statuses = db->MultiGet(ReadOptions(), t->state->test->handles_, keys,
+ &values);
+ } else {
+ std::vector<PinnableSlice> pin_values(keys.size());
+ statuses.resize(keys.size());
+ const Snapshot* snapshot = db->GetSnapshot();
+ ReadOptions ro;
+ ro.snapshot = snapshot;
+ for (int cf = 0; cf < kColumnFamilies; ++cf) {
+ db->MultiGet(ro, t->state->test->handles_[cf], 1, &keys[cf],
+ &pin_values[cf], &statuses[cf]);
+ }
+ db->ReleaseSnapshot(snapshot);
+ values.resize(keys.size());
+ for (int cf = 0; cf < kColumnFamilies; ++cf) {
+ if (statuses[cf].ok()) {
+ values[cf].assign(pin_values[cf].data(), pin_values[cf].size());
+ }
+ }
+ }
+ Status s = statuses[0];
+ // all statuses have to be the same
+ for (size_t i = 1; i < statuses.size(); ++i) {
+ // they are either both ok or both not-found
+ ASSERT_TRUE((s.ok() && statuses[i].ok()) ||
+ (s.IsNotFound() && statuses[i].IsNotFound()));
+ }
+ if (s.IsNotFound()) {
+ // Key has not yet been written
+ } else {
+ // Check that the writer thread counter is >= the counter in the value
+ ASSERT_OK(s);
+ int unique_id = -1;
+ for (int i = 0; i < kColumnFamilies; ++i) {
+ int k, w, c, cf, u;
+ ASSERT_EQ(5, sscanf(values[i].c_str(), "%d.%d.%d.%d.%d", &k, &w, &c,
+ &cf, &u))
+ << values[i];
+ ASSERT_EQ(k, key);
+ ASSERT_GE(w, 0);
+ ASSERT_LT(w, kNumThreads);
+ ASSERT_LE(c, t->state->counter[w].load(std::memory_order_acquire));
+ ASSERT_EQ(cf, i);
+ if (i == 0) {
+ unique_id = u;
+ } else {
+ // this checks that updates across column families happened
+ // atomically -- all unique ids are the same
+ ASSERT_EQ(u, unique_id);
+ }
+ }
+ }
+ }
+ counter++;
+ }
+ fprintf(stderr, "... stopping thread %d after %d ops\n", id, int(counter));
+}
+
+} // anonymous namespace
+
+class MultiThreadedDBTest
+ : public DBTest,
+ public ::testing::WithParamInterface<std::tuple<int, bool>> {
+ public:
+ void SetUp() override {
+ std::tie(option_config_, multiget_batched_) = GetParam();
+ }
+
+ static std::vector<int> GenerateOptionConfigs() {
+ std::vector<int> optionConfigs;
+ for (int optionConfig = kDefault; optionConfig < kEnd; ++optionConfig) {
+ optionConfigs.push_back(optionConfig);
+ }
+ return optionConfigs;
+ }
+
+ bool multiget_batched_;
+};
+
+TEST_P(MultiThreadedDBTest, MultiThreaded) {
+ if (option_config_ == kPipelinedWrite) return;
+ anon::OptionsOverride options_override;
+ options_override.skip_policy = kSkipNoSnapshot;
+ Options options = CurrentOptions(options_override);
+ std::vector<std::string> cfs;
+ for (int i = 1; i < kColumnFamilies; ++i) {
+ cfs.push_back(std::to_string(i));
+ }
+ Reopen(options);
+ CreateAndReopenWithCF(cfs, options);
+ // Initialize state
+ MTState mt;
+ mt.test = this;
+ for (int id = 0; id < kNumThreads; id++) {
+ mt.counter[id].store(0, std::memory_order_release);
+ }
+
+ // Start threads
+ MTThread thread[kNumThreads];
+ for (int id = 0; id < kNumThreads; id++) {
+ thread[id].state = &mt;
+ thread[id].id = id;
+ thread[id].multiget_batched = multiget_batched_;
+ env_->StartThread(MTThreadBody, &thread[id]);
+ }
+
+ env_->WaitForJoin();
+}
+
+INSTANTIATE_TEST_CASE_P(
+ MultiThreaded, MultiThreadedDBTest,
+ ::testing::Combine(
+ ::testing::ValuesIn(MultiThreadedDBTest::GenerateOptionConfigs()),
+ ::testing::Bool()));
+#endif // ROCKSDB_LITE
+
+// Group commit test:
+#if !defined(OS_WIN)
+// Disable this test temporarily on Travis and AppVeyor as it fails
+// intermittently. GitHub issue: #4151
+namespace {
+
+static const int kGCNumThreads = 4;
+static const int kGCNumKeys = 1000;
+
+struct GCThread {
+ DB* db;
+ int id;
+ std::atomic<bool> done;
+};
+
+static void GCThreadBody(void* arg) {
+ GCThread* t = reinterpret_cast<GCThread*>(arg);
+ int id = t->id;
+ DB* db = t->db;
+ WriteOptions wo;
+
+ for (int i = 0; i < kGCNumKeys; ++i) {
+ std::string kv(std::to_string(i + id * kGCNumKeys));
+ ASSERT_OK(db->Put(wo, kv, kv));
+ }
+ t->done = true;
+}
+
+} // anonymous namespace
+
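+// The sync-point dependencies registered in this test are intended to hold
+// the batch-group leader back until other writers are waiting, so that some
+// writes are completed on their behalf and WRITE_DONE_BY_OTHER is non-zero.
+// (This is a sketch of the intent; the exact interleaving is up to the write
+// thread internals.)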
+TEST_F(DBTest, GroupCommitTest) {
+ do {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ Reopen(options);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"WriteThread::JoinBatchGroup:BeganWaiting",
+ "DBImpl::WriteImpl:BeforeLeaderEnters"},
+ {"WriteThread::AwaitState:BlockingWaiting",
+ "WriteThread::EnterAsBatchGroupLeader:End"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // Start threads
+ GCThread thread[kGCNumThreads];
+ for (int id = 0; id < kGCNumThreads; id++) {
+ thread[id].id = id;
+ thread[id].db = db_;
+ thread[id].done = false;
+ env_->StartThread(GCThreadBody, &thread[id]);
+ }
+ env_->WaitForJoin();
+
+ ASSERT_GT(TestGetTickerCount(options, WRITE_DONE_BY_OTHER), 0);
+
+ std::vector<std::string> expected_db;
+ for (int i = 0; i < kGCNumThreads * kGCNumKeys; ++i) {
+ expected_db.push_back(std::to_string(i));
+ }
+ std::sort(expected_db.begin(), expected_db.end());
+
+ Iterator* itr = db_->NewIterator(ReadOptions());
+ itr->SeekToFirst();
+ for (auto x : expected_db) {
+ ASSERT_TRUE(itr->Valid());
+ ASSERT_EQ(itr->key().ToString(), x);
+ ASSERT_EQ(itr->value().ToString(), x);
+ itr->Next();
+ }
+ ASSERT_TRUE(!itr->Valid());
+ delete itr;
+
+ HistogramData hist_data;
+ options.statistics->histogramData(DB_WRITE, &hist_data);
+ ASSERT_GT(hist_data.average, 0.0);
+ } while (ChangeOptions(kSkipNoSeekToLast));
+}
+#endif // OS_WIN
+
+namespace {
+using KVMap = std::map<std::string, std::string>;
+}  // anonymous namespace
+
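+// ModelDB is a minimal in-memory reference implementation backed by a
+// std::map (KVMap): writes are applied through a WriteBatch::Handler,
+// snapshots and iterators copy the map, and most other operations return
+// NotSupported or trivial values.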
+class ModelDB : public DB {
+ public:
+ class ModelSnapshot : public Snapshot {
+ public:
+ KVMap map_;
+
+ SequenceNumber GetSequenceNumber() const override {
+ // no need to call this
+ assert(false);
+ return 0;
+ }
+
+ int64_t GetUnixTime() const override {
+ // no need to call this
+ assert(false);
+ return 0;
+ }
+
+ uint64_t GetTimestamp() const override {
+ // no need to call this
+ assert(false);
+ return 0;
+ }
+ };
+
+ explicit ModelDB(const Options& options) : options_(options) {}
+ using DB::Put;
+ Status Put(const WriteOptions& o, ColumnFamilyHandle* cf, const Slice& k,
+ const Slice& v) override {
+ WriteBatch batch;
+ Status s = batch.Put(cf, k, v);
+ if (!s.ok()) {
+ return s;
+ }
+ return Write(o, &batch);
+ }
+ Status Put(const WriteOptions& /*o*/, ColumnFamilyHandle* /*cf*/,
+ const Slice& /*k*/, const Slice& /*ts*/,
+ const Slice& /*v*/) override {
+ return Status::NotSupported();
+ }
+
+ using DB::PutEntity;
+ Status PutEntity(const WriteOptions& /* options */,
+ ColumnFamilyHandle* /* column_family */,
+ const Slice& /* key */,
+ const WideColumns& /* columns */) override {
+ return Status::NotSupported();
+ }
+
+ using DB::Close;
+ Status Close() override { return Status::OK(); }
+ using DB::Delete;
+ Status Delete(const WriteOptions& o, ColumnFamilyHandle* cf,
+ const Slice& key) override {
+ WriteBatch batch;
+ Status s = batch.Delete(cf, key);
+ if (!s.ok()) {
+ return s;
+ }
+ return Write(o, &batch);
+ }
+ Status Delete(const WriteOptions& /*o*/, ColumnFamilyHandle* /*cf*/,
+ const Slice& /*key*/, const Slice& /*ts*/) override {
+ return Status::NotSupported();
+ }
+ using DB::SingleDelete;
+ Status SingleDelete(const WriteOptions& o, ColumnFamilyHandle* cf,
+ const Slice& key) override {
+ WriteBatch batch;
+ Status s = batch.SingleDelete(cf, key);
+ if (!s.ok()) {
+ return s;
+ }
+ return Write(o, &batch);
+ }
+ Status SingleDelete(const WriteOptions& /*o*/, ColumnFamilyHandle* /*cf*/,
+ const Slice& /*key*/, const Slice& /*ts*/) override {
+ return Status::NotSupported();
+ }
+ using DB::Merge;
+ Status Merge(const WriteOptions& o, ColumnFamilyHandle* cf, const Slice& k,
+ const Slice& v) override {
+ WriteBatch batch;
+ Status s = batch.Merge(cf, k, v);
+ if (!s.ok()) {
+ return s;
+ }
+ return Write(o, &batch);
+ }
+ Status Merge(const WriteOptions& /*o*/, ColumnFamilyHandle* /*cf*/,
+ const Slice& /*k*/, const Slice& /*ts*/,
+ const Slice& /*value*/) override {
+ return Status::NotSupported();
+ }
+ using DB::Get;
+ Status Get(const ReadOptions& /*options*/, ColumnFamilyHandle* /*cf*/,
+ const Slice& key, PinnableSlice* /*value*/) override {
+ return Status::NotSupported(key);
+ }
+
+ using DB::GetMergeOperands;
+ virtual Status GetMergeOperands(
+ const ReadOptions& /*options*/, ColumnFamilyHandle* /*column_family*/,
+ const Slice& key, PinnableSlice* /*slice*/,
+ GetMergeOperandsOptions* /*merge_operands_options*/,
+ int* /*number_of_operands*/) override {
+ return Status::NotSupported(key);
+ }
+
+ using DB::MultiGet;
+ std::vector<Status> MultiGet(
+ const ReadOptions& /*options*/,
+ const std::vector<ColumnFamilyHandle*>& /*column_family*/,
+ const std::vector<Slice>& keys,
+ std::vector<std::string>* /*values*/) override {
+ std::vector<Status> s(keys.size(),
+ Status::NotSupported("Not implemented."));
+ return s;
+ }
+
+#ifndef ROCKSDB_LITE
+ using DB::IngestExternalFile;
+ Status IngestExternalFile(
+ ColumnFamilyHandle* /*column_family*/,
+ const std::vector<std::string>& /*external_files*/,
+ const IngestExternalFileOptions& /*options*/) override {
+ return Status::NotSupported("Not implemented.");
+ }
+
+ using DB::IngestExternalFiles;
+ Status IngestExternalFiles(
+ const std::vector<IngestExternalFileArg>& /*args*/) override {
+ return Status::NotSupported("Not implemented");
+ }
+
+ using DB::CreateColumnFamilyWithImport;
+ virtual Status CreateColumnFamilyWithImport(
+ const ColumnFamilyOptions& /*options*/,
+ const std::string& /*column_family_name*/,
+ const ImportColumnFamilyOptions& /*import_options*/,
+ const ExportImportFilesMetaData& /*metadata*/,
+ ColumnFamilyHandle** /*handle*/) override {
+ return Status::NotSupported("Not implemented.");
+ }
+
+ using DB::VerifyChecksum;
+ Status VerifyChecksum(const ReadOptions&) override {
+ return Status::NotSupported("Not implemented.");
+ }
+
+ using DB::GetPropertiesOfAllTables;
+ Status GetPropertiesOfAllTables(
+ ColumnFamilyHandle* /*column_family*/,
+ TablePropertiesCollection* /*props*/) override {
+ return Status();
+ }
+
+ Status GetPropertiesOfTablesInRange(
+ ColumnFamilyHandle* /*column_family*/, const Range* /*range*/,
+ std::size_t /*n*/, TablePropertiesCollection* /*props*/) override {
+ return Status();
+ }
+#endif // ROCKSDB_LITE
+
+ using DB::KeyMayExist;
+ bool KeyMayExist(const ReadOptions& /*options*/,
+ ColumnFamilyHandle* /*column_family*/, const Slice& /*key*/,
+ std::string* /*value*/,
+ bool* value_found = nullptr) override {
+ if (value_found != nullptr) {
+ *value_found = false;
+ }
+ return true; // Not Supported directly
+ }
+ using DB::NewIterator;
+ Iterator* NewIterator(const ReadOptions& options,
+ ColumnFamilyHandle* /*column_family*/) override {
+ if (options.snapshot == nullptr) {
+ KVMap* saved = new KVMap;
+ *saved = map_;
+ return new ModelIter(saved, true);
+ } else {
+ const KVMap* snapshot_state =
+ &(reinterpret_cast<const ModelSnapshot*>(options.snapshot)->map_);
+ return new ModelIter(snapshot_state, false);
+ }
+ }
+ Status NewIterators(const ReadOptions& /*options*/,
+ const std::vector<ColumnFamilyHandle*>& /*column_family*/,
+ std::vector<Iterator*>* /*iterators*/) override {
+ return Status::NotSupported("Not supported yet");
+ }
+ const Snapshot* GetSnapshot() override {
+ ModelSnapshot* snapshot = new ModelSnapshot;
+ snapshot->map_ = map_;
+ return snapshot;
+ }
+
+ void ReleaseSnapshot(const Snapshot* snapshot) override {
+ delete reinterpret_cast<const ModelSnapshot*>(snapshot);
+ }
+
+ Status Write(const WriteOptions& /*options*/, WriteBatch* batch) override {
+ class Handler : public WriteBatch::Handler {
+ public:
+ KVMap* map_;
+ void Put(const Slice& key, const Slice& value) override {
+ (*map_)[key.ToString()] = value.ToString();
+ }
+ void Merge(const Slice& /*key*/, const Slice& /*value*/) override {
+ // ignore merge for now
+ // (*map_)[key.ToString()] = value.ToString();
+ }
+ void Delete(const Slice& key) override { map_->erase(key.ToString()); }
+ };
+ Handler handler;
+ handler.map_ = &map_;
+ return batch->Iterate(&handler);
+ }
+
+ using DB::GetProperty;
+ bool GetProperty(ColumnFamilyHandle* /*column_family*/,
+ const Slice& /*property*/, std::string* /*value*/) override {
+ return false;
+ }
+ using DB::GetIntProperty;
+ bool GetIntProperty(ColumnFamilyHandle* /*column_family*/,
+ const Slice& /*property*/, uint64_t* /*value*/) override {
+ return false;
+ }
+ using DB::GetMapProperty;
+ bool GetMapProperty(ColumnFamilyHandle* /*column_family*/,
+ const Slice& /*property*/,
+ std::map<std::string, std::string>* /*value*/) override {
+ return false;
+ }
+ using DB::GetAggregatedIntProperty;
+ bool GetAggregatedIntProperty(const Slice& /*property*/,
+ uint64_t* /*value*/) override {
+ return false;
+ }
+ using DB::GetApproximateSizes;
+ Status GetApproximateSizes(const SizeApproximationOptions& /*options*/,
+ ColumnFamilyHandle* /*column_family*/,
+ const Range* /*range*/, int n,
+ uint64_t* sizes) override {
+ for (int i = 0; i < n; i++) {
+ sizes[i] = 0;
+ }
+ return Status::OK();
+ }
+ using DB::GetApproximateMemTableStats;
+ void GetApproximateMemTableStats(ColumnFamilyHandle* /*column_family*/,
+ const Range& /*range*/,
+ uint64_t* const count,
+ uint64_t* const size) override {
+ *count = 0;
+ *size = 0;
+ }
+ using DB::CompactRange;
+ Status CompactRange(const CompactRangeOptions& /*options*/,
+ ColumnFamilyHandle* /*column_family*/,
+ const Slice* /*start*/, const Slice* /*end*/) override {
+ return Status::NotSupported("Not supported operation.");
+ }
+
+ Status SetDBOptions(
+ const std::unordered_map<std::string, std::string>& /*new_options*/)
+ override {
+ return Status::NotSupported("Not supported operation.");
+ }
+
+ using DB::CompactFiles;
+ Status CompactFiles(
+ const CompactionOptions& /*compact_options*/,
+ ColumnFamilyHandle* /*column_family*/,
+ const std::vector<std::string>& /*input_file_names*/,
+ const int /*output_level*/, const int /*output_path_id*/ = -1,
+ std::vector<std::string>* const /*output_file_names*/ = nullptr,
+ CompactionJobInfo* /*compaction_job_info*/ = nullptr) override {
+ return Status::NotSupported("Not supported operation.");
+ }
+
+ Status PauseBackgroundWork() override {
+ return Status::NotSupported("Not supported operation.");
+ }
+
+ Status ContinueBackgroundWork() override {
+ return Status::NotSupported("Not supported operation.");
+ }
+
+ Status EnableAutoCompaction(
+ const std::vector<ColumnFamilyHandle*>& /*column_family_handles*/)
+ override {
+ return Status::NotSupported("Not supported operation.");
+ }
+
+ void EnableManualCompaction() override { return; }
+
+ void DisableManualCompaction() override { return; }
+
+ using DB::NumberLevels;
+ int NumberLevels(ColumnFamilyHandle* /*column_family*/) override { return 1; }
+
+ using DB::MaxMemCompactionLevel;
+ int MaxMemCompactionLevel(ColumnFamilyHandle* /*column_family*/) override {
+ return 1;
+ }
+
+ using DB::Level0StopWriteTrigger;
+ int Level0StopWriteTrigger(ColumnFamilyHandle* /*column_family*/) override {
+ return -1;
+ }
+
+ const std::string& GetName() const override { return name_; }
+
+ Env* GetEnv() const override { return nullptr; }
+
+ using DB::GetOptions;
+ Options GetOptions(ColumnFamilyHandle* /*column_family*/) const override {
+ return options_;
+ }
+
+ using DB::GetDBOptions;
+ DBOptions GetDBOptions() const override { return options_; }
+
+ using DB::Flush;
+ Status Flush(const ROCKSDB_NAMESPACE::FlushOptions& /*options*/,
+ ColumnFamilyHandle* /*column_family*/) override {
+ Status ret;
+ return ret;
+ }
+ Status Flush(
+ const ROCKSDB_NAMESPACE::FlushOptions& /*options*/,
+ const std::vector<ColumnFamilyHandle*>& /*column_families*/) override {
+ return Status::OK();
+ }
+
+ Status SyncWAL() override { return Status::OK(); }
+
+ Status DisableFileDeletions() override { return Status::OK(); }
+
+ Status EnableFileDeletions(bool /*force*/) override { return Status::OK(); }
+#ifndef ROCKSDB_LITE
+
+ Status GetLiveFiles(std::vector<std::string>&, uint64_t* /*size*/,
+ bool /*flush_memtable*/ = true) override {
+ return Status::OK();
+ }
+
+ Status GetLiveFilesChecksumInfo(
+ FileChecksumList* /*checksum_list*/) override {
+ return Status::OK();
+ }
+
+ Status GetLiveFilesStorageInfo(
+ const LiveFilesStorageInfoOptions& /*opts*/,
+ std::vector<LiveFileStorageInfo>* /*files*/) override {
+ return Status::OK();
+ }
+
+ Status GetSortedWalFiles(VectorLogPtr& /*files*/) override {
+ return Status::OK();
+ }
+
+ Status GetCurrentWalFile(
+ std::unique_ptr<LogFile>* /*current_log_file*/) override {
+ return Status::OK();
+ }
+
+ virtual Status GetCreationTimeOfOldestFile(
+ uint64_t* /*creation_time*/) override {
+ return Status::NotSupported();
+ }
+
+ Status DeleteFile(std::string /*name*/) override { return Status::OK(); }
+
+ Status GetUpdatesSince(
+ ROCKSDB_NAMESPACE::SequenceNumber,
+ std::unique_ptr<ROCKSDB_NAMESPACE::TransactionLogIterator>*,
+ const TransactionLogIterator::ReadOptions& /*read_options*/ =
+ TransactionLogIterator::ReadOptions()) override {
+ return Status::NotSupported("Not supported in Model DB");
+ }
+
+ void GetColumnFamilyMetaData(ColumnFamilyHandle* /*column_family*/,
+ ColumnFamilyMetaData* /*metadata*/) override {}
+#endif // ROCKSDB_LITE
+
+ Status GetDbIdentity(std::string& /*identity*/) const override {
+ return Status::OK();
+ }
+
+ Status GetDbSessionId(std::string& /*session_id*/) const override {
+ return Status::OK();
+ }
+
+ SequenceNumber GetLatestSequenceNumber() const override { return 0; }
+
+ Status IncreaseFullHistoryTsLow(ColumnFamilyHandle* /*cf*/,
+ std::string /*ts_low*/) override {
+ return Status::OK();
+ }
+
+ Status GetFullHistoryTsLow(ColumnFamilyHandle* /*cf*/,
+ std::string* /*ts_low*/) override {
+ return Status::OK();
+ }
+
+ ColumnFamilyHandle* DefaultColumnFamily() const override { return nullptr; }
+
+ private:
+ class ModelIter : public Iterator {
+ public:
+ ModelIter(const KVMap* map, bool owned)
+ : map_(map), owned_(owned), iter_(map_->end()) {}
+ ~ModelIter() override {
+ if (owned_) delete map_;
+ }
+ bool Valid() const override { return iter_ != map_->end(); }
+ void SeekToFirst() override { iter_ = map_->begin(); }
+ void SeekToLast() override {
+ if (map_->empty()) {
+ iter_ = map_->end();
+ } else {
+ iter_ = map_->find(map_->rbegin()->first);
+ }
+ }
+ void Seek(const Slice& k) override {
+ iter_ = map_->lower_bound(k.ToString());
+ }
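+    // Position at the last key <= k: take the first key greater than k, then
+    // step back once (Prev() wraps around to end() when no such key exists).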
+ void SeekForPrev(const Slice& k) override {
+ iter_ = map_->upper_bound(k.ToString());
+ Prev();
+ }
+ void Next() override { ++iter_; }
+ void Prev() override {
+ if (iter_ == map_->begin()) {
+ iter_ = map_->end();
+ return;
+ }
+ --iter_;
+ }
+
+ Slice key() const override { return iter_->first; }
+ Slice value() const override { return iter_->second; }
+ Status status() const override { return Status::OK(); }
+
+ private:
+ const KVMap* const map_;
+ const bool owned_; // Do we own map_
+ KVMap::const_iterator iter_;
+ };
+ const Options options_;
+ KVMap map_;
+ std::string name_ = "";
+};
+
+#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
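+// Returns a random key of length >= minimum: length 1 about a third of the
+// time (to encourage collisions), otherwise usually a uniform length below 10,
+// occasionally longer via the skewed distribution.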
+static std::string RandomKey(Random* rnd, int minimum = 0) {
+ int len;
+ do {
+ len = (rnd->OneIn(3)
+ ? 1 // Short sometimes to encourage collisions
+ : (rnd->OneIn(100) ? rnd->Skewed(10) : rnd->Uniform(10)));
+ } while (len < minimum);
+ return test::RandomKey(rnd, len);
+}
+
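+// Walks the model DB and the real DB in lockstep under the given snapshots and
+// reports the first key or value mismatch to stderr; returns true iff the two
+// iterators agree completely.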
+static bool CompareIterators(int step, DB* model, DB* db,
+ const Snapshot* model_snap,
+ const Snapshot* db_snap) {
+ ReadOptions options;
+ options.snapshot = model_snap;
+ Iterator* miter = model->NewIterator(options);
+ options.snapshot = db_snap;
+ Iterator* dbiter = db->NewIterator(options);
+ bool ok = true;
+ int count = 0;
+ for (miter->SeekToFirst(), dbiter->SeekToFirst();
+ ok && miter->Valid() && dbiter->Valid(); miter->Next(), dbiter->Next()) {
+ count++;
+ if (miter->key().compare(dbiter->key()) != 0) {
+ fprintf(stderr, "step %d: Key mismatch: '%s' vs. '%s'\n", step,
+ EscapeString(miter->key()).c_str(),
+ EscapeString(dbiter->key()).c_str());
+ ok = false;
+ break;
+ }
+
+ if (miter->value().compare(dbiter->value()) != 0) {
+ fprintf(stderr, "step %d: Value mismatch for key '%s': '%s' vs. '%s'\n",
+ step, EscapeString(miter->key()).c_str(),
+ EscapeString(miter->value()).c_str(),
+ EscapeString(dbiter->value()).c_str());
+ ok = false;
+ }
+ }
+
+ if (ok) {
+ if (miter->Valid() != dbiter->Valid()) {
+ fprintf(stderr, "step %d: Mismatch at end of iterators: %d vs. %d\n",
+ step, miter->Valid(), dbiter->Valid());
+ ok = false;
+ }
+ }
+ delete miter;
+ delete dbiter;
+ return ok;
+}
+
+class DBTestRandomized : public DBTest,
+ public ::testing::WithParamInterface<int> {
+ public:
+ void SetUp() override { option_config_ = GetParam(); }
+
+ static std::vector<int> GenerateOptionConfigs() {
+ std::vector<int> option_configs;
+ // skip cuckoo hash as it does not support snapshot.
+ for (int option_config = kDefault; option_config < kEnd; ++option_config) {
+ if (!ShouldSkipOptions(option_config,
+ kSkipDeletesFilterFirst | kSkipNoSeekToLast)) {
+ option_configs.push_back(option_config);
+ }
+ }
+ option_configs.push_back(kBlockBasedTableWithIndexRestartInterval);
+ return option_configs;
+ }
+};
+
+INSTANTIATE_TEST_CASE_P(
+ DBTestRandomized, DBTestRandomized,
+ ::testing::ValuesIn(DBTestRandomized::GenerateOptionConfigs()));
+
+TEST_P(DBTestRandomized, Randomized) {
+ anon::OptionsOverride options_override;
+ options_override.skip_policy = kSkipNoSnapshot;
+ Options options = CurrentOptions(options_override);
+ DestroyAndReopen(options);
+
+ Random rnd(test::RandomSeed() + GetParam());
+ ModelDB model(options);
+ const int N = 10000;
+ const Snapshot* model_snap = nullptr;
+ const Snapshot* db_snap = nullptr;
+ std::string k, v;
+ for (int step = 0; step < N; step++) {
+ // TODO(sanjay): Test Get() works
+ int p = rnd.Uniform(100);
+ int minimum = 0;
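+    // These configs use prefix extractors or hash indexes, which are assumed
+    // not to handle empty keys, so require at least one byte per key here.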
+ if (option_config_ == kHashSkipList || option_config_ == kHashLinkList ||
+ option_config_ == kPlainTableFirstBytePrefix ||
+ option_config_ == kBlockBasedTableWithWholeKeyHashIndex ||
+ option_config_ == kBlockBasedTableWithPrefixHashIndex) {
+ minimum = 1;
+ }
+ if (p < 45) { // Put
+ k = RandomKey(&rnd, minimum);
+ v = rnd.RandomString(rnd.OneIn(20) ? 100 + rnd.Uniform(100)
+ : rnd.Uniform(8));
+ ASSERT_OK(model.Put(WriteOptions(), k, v));
+ ASSERT_OK(db_->Put(WriteOptions(), k, v));
+ } else if (p < 90) { // Delete
+ k = RandomKey(&rnd, minimum);
+ ASSERT_OK(model.Delete(WriteOptions(), k));
+ ASSERT_OK(db_->Delete(WriteOptions(), k));
+ } else { // Multi-element batch
+ WriteBatch b;
+ const int num = rnd.Uniform(8);
+ for (int i = 0; i < num; i++) {
+ if (i == 0 || !rnd.OneIn(10)) {
+ k = RandomKey(&rnd, minimum);
+ } else {
+ // Periodically re-use the same key from the previous iter, so
+ // we have multiple entries in the write batch for the same key
+ }
+ if (rnd.OneIn(2)) {
+ v = rnd.RandomString(rnd.Uniform(10));
+ ASSERT_OK(b.Put(k, v));
+ } else {
+ ASSERT_OK(b.Delete(k));
+ }
+ }
+ ASSERT_OK(model.Write(WriteOptions(), &b));
+ ASSERT_OK(db_->Write(WriteOptions(), &b));
+ }
+
+ if ((step % 100) == 0) {
+      // For DB instances that use the hash index + block-based table, the
+      // iterator becomes invalid right away when seeking a non-existent key,
+      // rather than returning a key that is close to it.
+ if (option_config_ != kBlockBasedTableWithWholeKeyHashIndex &&
+ option_config_ != kBlockBasedTableWithPrefixHashIndex) {
+ ASSERT_TRUE(CompareIterators(step, &model, db_, nullptr, nullptr));
+ ASSERT_TRUE(CompareIterators(step, &model, db_, model_snap, db_snap));
+ }
+
+ // Save a snapshot from each DB this time that we'll use next
+ // time we compare things, to make sure the current state is
+ // preserved with the snapshot
+ if (model_snap != nullptr) model.ReleaseSnapshot(model_snap);
+ if (db_snap != nullptr) db_->ReleaseSnapshot(db_snap);
+
+ Reopen(options);
+ ASSERT_TRUE(CompareIterators(step, &model, db_, nullptr, nullptr));
+
+ model_snap = model.GetSnapshot();
+ db_snap = db_->GetSnapshot();
+ }
+ }
+ if (model_snap != nullptr) model.ReleaseSnapshot(model_snap);
+ if (db_snap != nullptr) db_->ReleaseSnapshot(db_snap);
+}
+#endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+
+TEST_F(DBTest, BlockBasedTablePrefixIndexTest) {
+ // create a DB with block prefix index
+ BlockBasedTableOptions table_options;
+ Options options = CurrentOptions();
+ table_options.index_type = BlockBasedTableOptions::kHashSearch;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+
+ Reopen(options);
+ ASSERT_OK(Put("k1", "v1"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("k2", "v2"));
+
+ // Reopen with different prefix extractor, make sure everything still works.
+ // RocksDB should just fall back to the binary index.
+ options.prefix_extractor.reset(NewFixedPrefixTransform(2));
+
+ Reopen(options);
+ ASSERT_EQ("v1", Get("k1"));
+ ASSERT_EQ("v2", Get("k2"));
+
+#ifndef ROCKSDB_LITE
+ // Back to original
+ ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "fixed:1"}}));
+ ASSERT_EQ("v1", Get("k1"));
+ ASSERT_EQ("v2", Get("k2"));
+#endif // !ROCKSDB_LITE
+
+  // Same if there's a problem initially loading the prefix transform
+ options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+ SyncPoint::GetInstance()->SetCallBack(
+ "BlockBasedTable::Open::ForceNullTablePrefixExtractor",
+ [&](void* arg) { *static_cast<bool*>(arg) = true; });
+ SyncPoint::GetInstance()->EnableProcessing();
+ Reopen(options);
+ ASSERT_EQ("v1", Get("k1"));
+ ASSERT_EQ("v2", Get("k2"));
+
+#ifndef ROCKSDB_LITE
+ // Change again
+ ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "fixed:2"}}));
+ ASSERT_EQ("v1", Get("k1"));
+ ASSERT_EQ("v2", Get("k2"));
+#endif // !ROCKSDB_LITE
+ SyncPoint::GetInstance()->DisableProcessing();
+
+ // Reopen with no prefix extractor, make sure everything still works.
+ // RocksDB should just fall back to the binary index.
+ table_options.index_type = BlockBasedTableOptions::kBinarySearch;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ options.prefix_extractor.reset();
+
+ Reopen(options);
+ ASSERT_EQ("v1", Get("k1"));
+ ASSERT_EQ("v2", Get("k2"));
+}
+
+TEST_F(DBTest, BlockBasedTablePrefixHashIndexTest) {
+ // create a DB with block prefix index
+ BlockBasedTableOptions table_options;
+ Options options = CurrentOptions();
+ table_options.index_type = BlockBasedTableOptions::kHashSearch;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ options.prefix_extractor.reset(NewCappedPrefixTransform(2));
+
+ Reopen(options);
+ ASSERT_OK(Put("kk1", "v1"));
+ ASSERT_OK(Put("kk2", "v2"));
+ ASSERT_OK(Put("kk", "v3"));
+ ASSERT_OK(Put("k", "v4"));
+ Flush();
+
+ ASSERT_EQ("v1", Get("kk1"));
+ ASSERT_EQ("v2", Get("kk2"));
+
+ ASSERT_EQ("v3", Get("kk"));
+ ASSERT_EQ("v4", Get("k"));
+}
+
+TEST_F(DBTest, BlockBasedTablePrefixIndexTotalOrderSeek) {
+ // create a DB with block prefix index
+ BlockBasedTableOptions table_options;
+ Options options = CurrentOptions();
+ options.max_open_files = 10;
+ table_options.index_type = BlockBasedTableOptions::kHashSearch;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+
+  // RocksDB sanitizes max_open_files to at least 20. Modify it back.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) {
+ int* max_open_files = static_cast<int*>(arg);
+ *max_open_files = 11;
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Reopen(options);
+ ASSERT_OK(Put("k1", "v1"));
+ ASSERT_OK(Flush());
+
+ CompactRangeOptions cro;
+ cro.change_level = true;
+ cro.target_level = 1;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+ // Force evict tables
+ dbfull()->TEST_table_cache()->SetCapacity(0);
+  // Make the table cache keep one entry.
+ dbfull()->TEST_table_cache()->SetCapacity(1);
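+  // The evicted table will be re-opened lazily on the next read, so the seeks
+  // below exercise the table reload path.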
+
+ ReadOptions read_options;
+ read_options.total_order_seek = true;
+ {
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ iter->Seek("k1");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("k1", iter->key().ToString());
+ }
+
+ // After total order seek, prefix index should still be used.
+ read_options.total_order_seek = false;
+ {
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ iter->Seek("k1");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("k1", iter->key().ToString());
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBTest, ChecksumTest) {
+ BlockBasedTableOptions table_options;
+ Options options = CurrentOptions();
+
+ table_options.checksum = kCRC32c;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ Reopen(options);
+ ASSERT_OK(Put("a", "b"));
+ ASSERT_OK(Put("c", "d"));
+ ASSERT_OK(Flush()); // table with crc checksum
+
+ table_options.checksum = kxxHash;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ Reopen(options);
+ ASSERT_OK(Put("e", "f"));
+ ASSERT_OK(Put("g", "h"));
+ ASSERT_OK(Flush()); // table with xxhash checksum
+
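+  // The checksum type is stored per SST file, so a DB reopened with kCRC32c
+  // must still be able to read the table written with kxxHash.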
+ table_options.checksum = kCRC32c;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ Reopen(options);
+ ASSERT_EQ("b", Get("a"));
+ ASSERT_EQ("d", Get("c"));
+ ASSERT_EQ("f", Get("e"));
+ ASSERT_EQ("h", Get("g"));
+
+ table_options.checksum = kCRC32c;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ Reopen(options);
+ ASSERT_EQ("b", Get("a"));
+ ASSERT_EQ("d", Get("c"));
+ ASSERT_EQ("f", Get("e"));
+ ASSERT_EQ("h", Get("g"));
+}
+
+#ifndef ROCKSDB_LITE
+TEST_P(DBTestWithParam, FIFOCompactionTest) {
+ for (int iter = 0; iter < 2; ++iter) {
+ // first iteration -- auto compaction
+ // second iteration -- manual compaction
+ Options options;
+ options.compaction_style = kCompactionStyleFIFO;
+ options.write_buffer_size = 100 << 10; // 100KB
+ options.arena_block_size = 4096;
+ options.compaction_options_fifo.max_table_files_size = 500 << 10; // 500KB
+ options.compression = kNoCompression;
+ options.create_if_missing = true;
+ options.max_subcompactions = max_subcompactions_;
+ if (iter == 1) {
+ options.disable_auto_compactions = true;
+ }
+ options = CurrentOptions(options);
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ for (int i = 0; i < 6; ++i) {
+ for (int j = 0; j < 110; ++j) {
+ ASSERT_OK(Put(std::to_string(i * 100 + j), rnd.RandomString(980)));
+ }
+ // flush should happen here
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ }
+ if (iter == 0) {
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ } else {
+ CompactRangeOptions cro;
+ cro.exclusive_manual_compaction = exclusive_manual_compaction_;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ }
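+    // Each flushed file is roughly the 100KB write buffer, so with the 500KB
+    // cap FIFO compaction drops the oldest files.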
+ // only 5 files should survive
+ ASSERT_EQ(NumTableFilesAtLevel(0), 5);
+ for (int i = 0; i < 50; ++i) {
+      // these keys should have been deleted in the previous compaction
+ ASSERT_EQ("NOT_FOUND", Get(std::to_string(i)));
+ }
+ }
+}
+
+TEST_F(DBTest, FIFOCompactionTestWithCompaction) {
+ Options options;
+ options.compaction_style = kCompactionStyleFIFO;
+ options.write_buffer_size = 20 << 10; // 20K
+ options.arena_block_size = 4096;
+  options.compaction_options_fifo.max_table_files_size = 1500 << 10;  // 1.5MB
+ options.compaction_options_fifo.allow_compaction = true;
+ options.level0_file_num_compaction_trigger = 6;
+ options.compression = kNoCompression;
+ options.create_if_missing = true;
+ options = CurrentOptions(options);
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ for (int i = 0; i < 60; i++) {
+ // Generate and flush a file about 20KB.
+ for (int j = 0; j < 20; j++) {
+ ASSERT_OK(Put(std::to_string(i * 20 + j), rnd.RandomString(980)));
+ }
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+ // It should be compacted to 10 files.
+ ASSERT_EQ(NumTableFilesAtLevel(0), 10);
+
+ for (int i = 0; i < 60; i++) {
+ // Generate and flush a file about 20KB.
+ for (int j = 0; j < 20; j++) {
+ ASSERT_OK(Put(std::to_string(i * 20 + j + 2000), rnd.RandomString(980)));
+ }
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+
+  // It should be compacted to more than 10 but fewer than 18 files.
+ ASSERT_GT(NumTableFilesAtLevel(0), 10);
+ ASSERT_LT(NumTableFilesAtLevel(0), 18);
+ // Size limit is still guaranteed.
+ ASSERT_LE(SizeAtLevel(0),
+ options.compaction_options_fifo.max_table_files_size);
+}
+
+TEST_F(DBTest, FIFOCompactionStyleWithCompactionAndDelete) {
+ Options options;
+ options.compaction_style = kCompactionStyleFIFO;
+ options.write_buffer_size = 20 << 10; // 20K
+ options.arena_block_size = 4096;
+  options.compaction_options_fifo.max_table_files_size = 1500 << 10;  // 1.5MB
+ options.compaction_options_fifo.allow_compaction = true;
+ options.level0_file_num_compaction_trigger = 3;
+ options.compression = kNoCompression;
+ options.create_if_missing = true;
+ options = CurrentOptions(options);
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ for (int i = 0; i < 3; i++) {
+ // Each file contains a different key which will be dropped later.
+ ASSERT_OK(Put("a" + std::to_string(i), rnd.RandomString(500)));
+ ASSERT_OK(Put("key" + std::to_string(i), ""));
+ ASSERT_OK(Put("z" + std::to_string(i), rnd.RandomString(500)));
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+ ASSERT_EQ(NumTableFilesAtLevel(0), 1);
+ for (int i = 0; i < 3; i++) {
+ ASSERT_EQ("", Get("key" + std::to_string(i)));
+ }
+ for (int i = 0; i < 3; i++) {
+ // Each file contains a different key which will be dropped later.
+ ASSERT_OK(Put("a" + std::to_string(i), rnd.RandomString(500)));
+ ASSERT_OK(Delete("key" + std::to_string(i)));
+ ASSERT_OK(Put("z" + std::to_string(i), rnd.RandomString(500)));
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+ ASSERT_EQ(NumTableFilesAtLevel(0), 2);
+ for (int i = 0; i < 3; i++) {
+ ASSERT_EQ("NOT_FOUND", Get("key" + std::to_string(i)));
+ }
+}
+
+// Check that FIFO-with-TTL is not supported with max_open_files != -1.
+// Github issue #8014
+TEST_F(DBTest, FIFOCompactionWithTTLAndMaxOpenFilesTest) {
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleFIFO;
+ options.create_if_missing = true;
+ options.ttl = 600; // seconds
+
+ // TTL is not supported with max_open_files != -1.
+ options.max_open_files = 0;
+ ASSERT_TRUE(TryReopen(options).IsNotSupported());
+
+ options.max_open_files = 100;
+ ASSERT_TRUE(TryReopen(options).IsNotSupported());
+
+ // TTL is supported with unlimited max_open_files
+ options.max_open_files = -1;
+ ASSERT_OK(TryReopen(options));
+}
+
+// Check that FIFO-with-TTL is supported only with BlockBasedTableFactory.
+TEST_F(DBTest, FIFOCompactionWithTTLAndVariousTableFormatsTest) {
+ Options options;
+ options.compaction_style = kCompactionStyleFIFO;
+ options.create_if_missing = true;
+ options.ttl = 600; // seconds
+
+ options = CurrentOptions(options);
+ options.table_factory.reset(NewBlockBasedTableFactory());
+ ASSERT_OK(TryReopen(options));
+
+ Destroy(options);
+ options.table_factory.reset(NewPlainTableFactory());
+ ASSERT_TRUE(TryReopen(options).IsNotSupported());
+
+ Destroy(options);
+ options.table_factory.reset(NewAdaptiveTableFactory());
+ ASSERT_TRUE(TryReopen(options).IsNotSupported());
+}
+
+TEST_F(DBTest, FIFOCompactionWithTTLTest) {
+ Options options;
+ options.compaction_style = kCompactionStyleFIFO;
+ options.write_buffer_size = 10 << 10; // 10KB
+ options.arena_block_size = 4096;
+ options.compression = kNoCompression;
+ options.create_if_missing = true;
+ env_->SetMockSleep();
+ options.env = env_;
+
+ // Test to make sure that all files with expired ttl are deleted on next
+ // manual compaction.
+ {
+ // NOTE: Presumed unnecessary and removed: resetting mock time in env
+
+ options.compaction_options_fifo.max_table_files_size = 150 << 10; // 150KB
+ options.compaction_options_fifo.allow_compaction = false;
+ options.ttl = 1 * 60 * 60; // 1 hour
+ options = CurrentOptions(options);
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ for (int i = 0; i < 10; i++) {
+ // Generate and flush a file about 10KB.
+ for (int j = 0; j < 10; j++) {
+ ASSERT_OK(Put(std::to_string(i * 20 + j), rnd.RandomString(980)));
+ }
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+ ASSERT_EQ(NumTableFilesAtLevel(0), 10);
+
+ // Sleep for 2 hours -- which is much greater than TTL.
+ env_->MockSleepForSeconds(2 * 60 * 60);
+
+ // Since no flushes and compactions have run, the db should still be in
+ // the same state even after considerable time has passed.
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(NumTableFilesAtLevel(0), 10);
+
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+ }
+
+ // Test to make sure that all files with expired ttl are deleted on next
+ // automatic compaction.
+ {
+ options.compaction_options_fifo.max_table_files_size = 150 << 10; // 150KB
+ options.compaction_options_fifo.allow_compaction = false;
+ options.ttl = 1 * 60 * 60; // 1 hour
+ options = CurrentOptions(options);
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ for (int i = 0; i < 10; i++) {
+ // Generate and flush a file about 10KB.
+ for (int j = 0; j < 10; j++) {
+ ASSERT_OK(Put(std::to_string(i * 20 + j), rnd.RandomString(980)));
+ }
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+ ASSERT_EQ(NumTableFilesAtLevel(0), 10);
+
+ // Sleep for 2 hours -- which is much greater than TTL.
+ env_->MockSleepForSeconds(2 * 60 * 60);
+ // Just to make sure that we are in the same state even after sleeping.
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(NumTableFilesAtLevel(0), 10);
+
+ // Create 1 more file to trigger TTL compaction. The old files are dropped.
+ for (int i = 0; i < 1; i++) {
+ for (int j = 0; j < 10; j++) {
+ ASSERT_OK(Put(std::to_string(i * 20 + j), rnd.RandomString(980)));
+ }
+ ASSERT_OK(Flush());
+ }
+
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+    // Only the new file remains.
+ ASSERT_EQ(NumTableFilesAtLevel(0), 1);
+ ASSERT_LE(SizeAtLevel(0),
+ options.compaction_options_fifo.max_table_files_size);
+ }
+
+  // Test that FIFO falls back to size-based compaction if TTL-based deletion
+  // doesn't bring the total size below max_table_files_size.
+ {
+ options.write_buffer_size = 10 << 10; // 10KB
+ options.compaction_options_fifo.max_table_files_size = 150 << 10; // 150KB
+ options.compaction_options_fifo.allow_compaction = false;
+ options.ttl = 1 * 60 * 60; // 1 hour
+ options = CurrentOptions(options);
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ for (int i = 0; i < 3; i++) {
+ // Generate and flush a file about 10KB.
+ for (int j = 0; j < 10; j++) {
+ ASSERT_OK(Put(std::to_string(i * 20 + j), rnd.RandomString(980)));
+ }
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+ ASSERT_EQ(NumTableFilesAtLevel(0), 3);
+
+ // Sleep for 2 hours -- which is much greater than TTL.
+ env_->MockSleepForSeconds(2 * 60 * 60);
+ // Just to make sure that we are in the same state even after sleeping.
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(NumTableFilesAtLevel(0), 3);
+
+ for (int i = 0; i < 5; i++) {
+ for (int j = 0; j < 140; j++) {
+ ASSERT_OK(Put(std::to_string(i * 20 + j), rnd.RandomString(980)));
+ }
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+ // Size limit is still guaranteed.
+ ASSERT_LE(SizeAtLevel(0),
+ options.compaction_options_fifo.max_table_files_size);
+ }
+
+ // Test with TTL + Intra-L0 compactions.
+ {
+ options.compaction_options_fifo.max_table_files_size = 150 << 10; // 150KB
+ options.compaction_options_fifo.allow_compaction = true;
+ options.ttl = 1 * 60 * 60; // 1 hour
+ options.level0_file_num_compaction_trigger = 6;
+ options = CurrentOptions(options);
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ for (int i = 0; i < 10; i++) {
+ // Generate and flush a file about 10KB.
+ for (int j = 0; j < 10; j++) {
+ ASSERT_OK(Put(std::to_string(i * 20 + j), rnd.RandomString(980)));
+ }
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+ // With Intra-L0 compaction, out of 10 files, 6 files will be compacted to 1
+ // (due to level0_file_num_compaction_trigger = 6).
+ // So total files = 1 + remaining 4 = 5.
+ ASSERT_EQ(NumTableFilesAtLevel(0), 5);
+
+ // Sleep for 2 hours -- which is much greater than TTL.
+ env_->MockSleepForSeconds(2 * 60 * 60);
+ // Just to make sure that we are in the same state even after sleeping.
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(NumTableFilesAtLevel(0), 5);
+
+ // Create 10 more files. The old 5 files are dropped as their ttl expired.
+ for (int i = 0; i < 10; i++) {
+ for (int j = 0; j < 10; j++) {
+ ASSERT_OK(Put(std::to_string(i * 20 + j), rnd.RandomString(980)));
+ }
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+ ASSERT_EQ(NumTableFilesAtLevel(0), 5);
+ ASSERT_LE(SizeAtLevel(0),
+ options.compaction_options_fifo.max_table_files_size);
+ }
+
+ // Test with large TTL + Intra-L0 compactions.
+ // Files dropped based on size, as ttl doesn't kick in.
+ {
+ options.write_buffer_size = 20 << 10; // 20K
+ options.compaction_options_fifo.max_table_files_size = 1500 << 10; // 1.5MB
+ options.compaction_options_fifo.allow_compaction = true;
+ options.ttl = 1 * 60 * 60; // 1 hour
+ options.level0_file_num_compaction_trigger = 6;
+ options = CurrentOptions(options);
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ for (int i = 0; i < 60; i++) {
+ // Generate and flush a file about 20KB.
+ for (int j = 0; j < 20; j++) {
+ ASSERT_OK(Put(std::to_string(i * 20 + j), rnd.RandomString(980)));
+ }
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+ // It should be compacted to 10 files.
+ ASSERT_EQ(NumTableFilesAtLevel(0), 10);
+
+ for (int i = 0; i < 60; i++) {
+ // Generate and flush a file about 20KB.
+ for (int j = 0; j < 20; j++) {
+ ASSERT_OK(
+ Put(std::to_string(i * 20 + j + 2000), rnd.RandomString(980)));
+ }
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+
+    // It should be compacted to more than 10 but fewer than 18 files.
+ ASSERT_GT(NumTableFilesAtLevel(0), 10);
+ ASSERT_LT(NumTableFilesAtLevel(0), 18);
+ // Size limit is still guaranteed.
+ ASSERT_LE(SizeAtLevel(0),
+ options.compaction_options_fifo.max_table_files_size);
+ }
+}
+#endif // ROCKSDB_LITE
+
+#ifndef ROCKSDB_LITE
+/*
+ * This test is not reliable enough as it heavily depends on disk behavior.
+ * Disabled as it is flaky.
+ */
+TEST_F(DBTest, DISABLED_RateLimitingTest) {
+ Options options = CurrentOptions();
+ options.write_buffer_size = 1 << 20; // 1MB
+ options.level0_file_num_compaction_trigger = 2;
+ options.target_file_size_base = 1 << 20; // 1MB
+ options.max_bytes_for_level_base = 4 << 20; // 4MB
+ options.max_bytes_for_level_multiplier = 4;
+ options.compression = kNoCompression;
+ options.create_if_missing = true;
+ options.env = env_;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ options.IncreaseParallelism(4);
+ DestroyAndReopen(options);
+
+ WriteOptions wo;
+ wo.disableWAL = true;
+
+ // # no rate limiting
+ Random rnd(301);
+ uint64_t start = env_->NowMicros();
+ // Write ~96M data
+ for (int64_t i = 0; i < (96 << 10); ++i) {
+ ASSERT_OK(Put(rnd.RandomString(32), rnd.RandomString((1 << 10) + 1), wo));
+ }
+ uint64_t elapsed = env_->NowMicros() - start;
+ double raw_rate = env_->bytes_written_ * 1000000.0 / elapsed;
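+  // raw_rate is the unthrottled write throughput in bytes/sec; the rate
+  // limiters below are configured as fractions of it.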
+ uint64_t rate_limiter_drains =
+ TestGetTickerCount(options, NUMBER_RATE_LIMITER_DRAINS);
+ ASSERT_EQ(0, rate_limiter_drains);
+ Close();
+
+  // # rate limiting with 0.7 x raw rate
+ options.rate_limiter.reset(
+ NewGenericRateLimiter(static_cast<int64_t>(0.7 * raw_rate)));
+ env_->bytes_written_ = 0;
+ DestroyAndReopen(options);
+
+ start = env_->NowMicros();
+ // Write ~96M data
+ for (int64_t i = 0; i < (96 << 10); ++i) {
+ ASSERT_OK(Put(rnd.RandomString(32), rnd.RandomString((1 << 10) + 1), wo));
+ }
+ rate_limiter_drains =
+ TestGetTickerCount(options, NUMBER_RATE_LIMITER_DRAINS) -
+ rate_limiter_drains;
+ elapsed = env_->NowMicros() - start;
+ Close();
+ ASSERT_EQ(options.rate_limiter->GetTotalBytesThrough(), env_->bytes_written_);
+ // Most intervals should've been drained (interval time is 100ms, elapsed is
+ // micros)
+ ASSERT_GT(rate_limiter_drains, 0);
+ ASSERT_LE(rate_limiter_drains, elapsed / 100000 + 1);
+ double ratio = env_->bytes_written_ * 1000000 / elapsed / raw_rate;
+ fprintf(stderr, "write rate ratio = %.2lf, expected 0.7\n", ratio);
+ ASSERT_TRUE(ratio < 0.8);
+
+ // # rate limiting with half of the raw_rate
+ options.rate_limiter.reset(
+ NewGenericRateLimiter(static_cast<int64_t>(raw_rate / 2)));
+ env_->bytes_written_ = 0;
+ DestroyAndReopen(options);
+
+ start = env_->NowMicros();
+ // Write ~96M data
+ for (int64_t i = 0; i < (96 << 10); ++i) {
+ ASSERT_OK(Put(rnd.RandomString(32), rnd.RandomString((1 << 10) + 1), wo));
+ }
+ elapsed = env_->NowMicros() - start;
+ rate_limiter_drains =
+ TestGetTickerCount(options, NUMBER_RATE_LIMITER_DRAINS) -
+ rate_limiter_drains;
+ Close();
+ ASSERT_EQ(options.rate_limiter->GetTotalBytesThrough(), env_->bytes_written_);
+ // Most intervals should've been drained (interval time is 100ms, elapsed is
+ // micros)
+ ASSERT_GT(rate_limiter_drains, elapsed / 100000 / 2);
+ ASSERT_LE(rate_limiter_drains, elapsed / 100000 + 1);
+ ratio = env_->bytes_written_ * 1000000 / elapsed / raw_rate;
+ fprintf(stderr, "write rate ratio = %.2lf, expected 0.5\n", ratio);
+ ASSERT_LT(ratio, 0.6);
+}
+
+// This is a mocked custom rate limiter that does not implement optional APIs
+// (e.g., RateLimiter::GetTotalPendingRequests()).
+class MockedRateLimiterWithNoOptionalAPIImpl : public RateLimiter {
+ public:
+ MockedRateLimiterWithNoOptionalAPIImpl() {}
+
+ ~MockedRateLimiterWithNoOptionalAPIImpl() override {}
+
+ void SetBytesPerSecond(int64_t bytes_per_second) override {
+ (void)bytes_per_second;
+ }
+
+ using RateLimiter::Request;
+ void Request(const int64_t bytes, const Env::IOPriority pri,
+ Statistics* stats) override {
+ (void)bytes;
+ (void)pri;
+ (void)stats;
+ }
+
+ int64_t GetSingleBurstBytes() const override { return 200; }
+
+ int64_t GetTotalBytesThrough(
+ const Env::IOPriority pri = Env::IO_TOTAL) const override {
+ (void)pri;
+ return 0;
+ }
+
+ int64_t GetTotalRequests(
+ const Env::IOPriority pri = Env::IO_TOTAL) const override {
+ (void)pri;
+ return 0;
+ }
+
+ int64_t GetBytesPerSecond() const override { return 0; }
+};
+
+// Tests that a custom rate limiter that does not implement optional APIs
+// (e.g., RateLimiter::GetTotalPendingRequests()) works fine with basic RocksDB
+// operations (e.g., Put, Get, Flush).
+TEST_F(DBTest, CustomedRateLimiterWithNoOptionalAPIImplTest) {
+ Options options = CurrentOptions();
+ options.rate_limiter.reset(new MockedRateLimiterWithNoOptionalAPIImpl());
+ DestroyAndReopen(options);
+ ASSERT_OK(Put("abc", "def"));
+ ASSERT_EQ(Get("abc"), "def");
+ ASSERT_OK(Flush());
+ ASSERT_EQ(Get("abc"), "def");
+}
+
+TEST_F(DBTest, TableOptionsSanitizeTest) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ DestroyAndReopen(options);
+ ASSERT_EQ(db_->GetOptions().allow_mmap_reads, false);
+
+ options.table_factory.reset(NewPlainTableFactory());
+ options.prefix_extractor.reset(NewNoopTransform());
+ Destroy(options);
+ ASSERT_TRUE(!TryReopen(options).IsNotSupported());
+
+  // Test the check of prefix_extractor when the hash index is used for
+  // block-based tables
+ BlockBasedTableOptions to;
+ to.index_type = BlockBasedTableOptions::kHashSearch;
+ options = CurrentOptions();
+ options.create_if_missing = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(to));
+ ASSERT_TRUE(TryReopen(options).IsInvalidArgument());
+ options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+ ASSERT_OK(TryReopen(options));
+}
+
+TEST_F(DBTest, ConcurrentMemtableNotSupported) {
+ Options options = CurrentOptions();
+ options.allow_concurrent_memtable_write = true;
+ options.soft_pending_compaction_bytes_limit = 0;
+ options.hard_pending_compaction_bytes_limit = 100;
+ options.create_if_missing = true;
+
+ DestroyDB(dbname_, options);
+ options.memtable_factory.reset(NewHashLinkListRepFactory(4, 0, 3, true, 4));
+ ASSERT_NOK(TryReopen(options));
+
+ options.memtable_factory.reset(new SkipListFactory);
+ ASSERT_OK(TryReopen(options));
+
+ ColumnFamilyOptions cf_options(options);
+ cf_options.memtable_factory.reset(
+ NewHashLinkListRepFactory(4, 0, 3, true, 4));
+ ColumnFamilyHandle* handle;
+ ASSERT_NOK(db_->CreateColumnFamily(cf_options, "name", &handle));
+}
+
+#endif // ROCKSDB_LITE
+
+TEST_F(DBTest, SanitizeNumThreads) {
+ for (int attempt = 0; attempt < 2; attempt++) {
+ const size_t kTotalTasks = 8;
+ test::SleepingBackgroundTask sleeping_tasks[kTotalTasks];
+
+ Options options = CurrentOptions();
+ if (attempt == 0) {
+ options.max_background_compactions = 3;
+ options.max_background_flushes = 2;
+ }
+ options.create_if_missing = true;
+ DestroyAndReopen(options);
+
+ for (size_t i = 0; i < kTotalTasks; i++) {
+      // Insert 4 tasks into the low priority queue and 4 tasks into the high
+      // priority queue
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
+ &sleeping_tasks[i],
+ (i < 4) ? Env::Priority::LOW : Env::Priority::HIGH);
+ }
+
+    // Wait up to 10s for them to be scheduled.
+ for (int i = 0; i < 10000; i++) {
+ if (options.env->GetThreadPoolQueueLen(Env::Priority::LOW) <= 1 &&
+ options.env->GetThreadPoolQueueLen(Env::Priority::HIGH) <= 2) {
+ break;
+ }
+ env_->SleepForMicroseconds(1000);
+ }
+
+    // Pool size is 3, total tasks 4, so the queue size should be 1.
+ ASSERT_EQ(1U, options.env->GetThreadPoolQueueLen(Env::Priority::LOW));
+ // pool size 2, total task 4. Queue size should be 2.
+ ASSERT_EQ(2U, options.env->GetThreadPoolQueueLen(Env::Priority::HIGH));
+
+ for (size_t i = 0; i < kTotalTasks; i++) {
+ sleeping_tasks[i].WakeUp();
+ sleeping_tasks[i].WaitUntilDone();
+ }
+
+ ASSERT_OK(Put("abc", "def"));
+ ASSERT_EQ("def", Get("abc"));
+ ASSERT_OK(Flush());
+ ASSERT_EQ("def", Get("abc"));
+ }
+}
+
+TEST_F(DBTest, WriteSingleThreadEntry) {
+ std::vector<port::Thread> threads;
+ dbfull()->TEST_LockMutex();
+ auto w = dbfull()->TEST_BeginWrite();
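+  // With the DB mutex held and a writer registered, the Put and Flush threads
+  // below should queue up until TEST_EndWrite() releases the writer.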
+ threads.emplace_back([&] { ASSERT_OK(Put("a", "b")); });
+ env_->SleepForMicroseconds(10000);
+ threads.emplace_back([&] { ASSERT_OK(Flush()); });
+ env_->SleepForMicroseconds(10000);
+ dbfull()->TEST_UnlockMutex();
+ dbfull()->TEST_LockMutex();
+ dbfull()->TEST_EndWrite(w);
+ dbfull()->TEST_UnlockMutex();
+
+ for (auto& t : threads) {
+ t.join();
+ }
+}
+
+TEST_F(DBTest, ConcurrentFlushWAL) {
+ const size_t cnt = 100;
+ Options options;
+ options.env = env_;
+ WriteOptions wopt;
+ ReadOptions ropt;
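+  // Run concurrent writers and FlushWAL(false) calls under every combination
+  // of two_write_queues and manual_wal_flush, then reopen and verify that the
+  // writes are recovered from the WAL without corruption.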
+ for (bool two_write_queues : {false, true}) {
+ for (bool manual_wal_flush : {false, true}) {
+ options.two_write_queues = two_write_queues;
+ options.manual_wal_flush = manual_wal_flush;
+ options.create_if_missing = true;
+ DestroyAndReopen(options);
+ std::vector<port::Thread> threads;
+ threads.emplace_back([&] {
+ for (size_t i = 0; i < cnt; i++) {
+ auto istr = std::to_string(i);
+ ASSERT_OK(db_->Put(wopt, db_->DefaultColumnFamily(), "a" + istr,
+ "b" + istr));
+ }
+ });
+ if (two_write_queues) {
+ threads.emplace_back([&] {
+ for (size_t i = cnt; i < 2 * cnt; i++) {
+ auto istr = std::to_string(i);
+ WriteBatch batch(0 /* reserved_bytes */, 0 /* max_bytes */,
+ wopt.protection_bytes_per_key,
+ 0 /* default_cf_ts_sz */);
+ ASSERT_OK(batch.Put("a" + istr, "b" + istr));
+ ASSERT_OK(
+ dbfull()->WriteImpl(wopt, &batch, nullptr, nullptr, 0, true));
+ }
+ });
+ }
+ threads.emplace_back([&] {
+ for (size_t i = 0; i < cnt * 100; i++) { // FlushWAL is faster than Put
+ ASSERT_OK(db_->FlushWAL(false));
+ }
+ });
+ for (auto& t : threads) {
+ t.join();
+ }
+ options.create_if_missing = false;
+ // Recover from the wal and make sure that it is not corrupted
+ Reopen(options);
+ for (size_t i = 0; i < cnt; i++) {
+ PinnableSlice pval;
+ auto istr = std::to_string(i);
+ ASSERT_OK(
+ db_->Get(ropt, db_->DefaultColumnFamily(), "a" + istr, &pval));
+ ASSERT_TRUE(pval == ("b" + istr));
+ }
+ }
+ }
+}
+
+// A failure in this test is only caught probabilistically, as it depends on
+// thread timing.
+TEST_F(DBTest, ManualFlushWalAndWriteRace) {
+ Options options;
+ options.env = env_;
+ options.manual_wal_flush = true;
+ options.create_if_missing = true;
+
+ DestroyAndReopen(options);
+
+ WriteOptions wopts;
+ wopts.sync = true;
+
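+  // Race FlushWAL(false) against synced writes; the reopen at the end verifies
+  // that the WAL was not corrupted by the race.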
+ port::Thread writeThread([&]() {
+ for (int i = 0; i < 100; i++) {
+ auto istr = std::to_string(i);
+ ASSERT_OK(dbfull()->Put(wopts, "key_" + istr, "value_" + istr));
+ }
+ });
+ port::Thread flushThread([&]() {
+ for (int i = 0; i < 100; i++) {
+ ASSERT_OK(dbfull()->FlushWAL(false));
+ }
+ });
+
+ writeThread.join();
+ flushThread.join();
+ ASSERT_OK(dbfull()->Put(wopts, "foo1", "value1"));
+ ASSERT_OK(dbfull()->Put(wopts, "foo2", "value2"));
+ Reopen(options);
+ ASSERT_EQ("value1", Get("foo1"));
+ ASSERT_EQ("value2", Get("foo2"));
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest, DynamicMemtableOptions) {
+ const uint64_t k64KB = 1 << 16;
+ const uint64_t k128KB = 1 << 17;
+ const uint64_t k5KB = 5 * 1024;
+ Options options;
+ options.env = env_;
+ options.create_if_missing = true;
+ options.compression = kNoCompression;
+ options.max_background_compactions = 1;
+ options.write_buffer_size = k64KB;
+ options.arena_block_size = 16 * 1024;
+ options.max_write_buffer_number = 2;
+ // Don't trigger compact/slowdown/stop
+ options.level0_file_num_compaction_trigger = 1024;
+ options.level0_slowdown_writes_trigger = 1024;
+ options.level0_stop_writes_trigger = 1024;
+ DestroyAndReopen(options);
+
+ auto gen_l0_kb = [this](int size) {
+ const int kNumPutsBeforeWaitForFlush = 64;
+ Random rnd(301);
+ for (int i = 0; i < size; i++) {
+ ASSERT_OK(Put(Key(i), rnd.RandomString(1024)));
+
+ // The following condition prevents a race condition between flush jobs
+ // acquiring work and this thread filling up multiple memtables. Without
+ // this, the flush might produce less files than expected because
+ // multiple memtables are flushed into a single L0 file. This race
+ // condition affects assertion (A).
+ if (i % kNumPutsBeforeWaitForFlush == kNumPutsBeforeWaitForFlush - 1) {
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ }
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ };
+
+ // Test write_buffer_size
+ gen_l0_kb(64);
+ ASSERT_EQ(NumTableFilesAtLevel(0), 1);
+ ASSERT_LT(SizeAtLevel(0), k64KB + k5KB);
+ ASSERT_GT(SizeAtLevel(0), k64KB - k5KB * 2);
+
+ // Clean up L0
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+
+ // Increase buffer size
+ ASSERT_OK(dbfull()->SetOptions({
+ {"write_buffer_size", "131072"},
+ }));
+
+ // The existing memtable inflated 64KB->128KB when we invoked SetOptions().
+ // Write 192KB, we should have a 128KB L0 file and a memtable with 64KB data.
+ gen_l0_kb(192);
+ ASSERT_EQ(NumTableFilesAtLevel(0), 1); // (A)
+ ASSERT_LT(SizeAtLevel(0), k128KB + 2 * k5KB);
+ ASSERT_GT(SizeAtLevel(0), k128KB - 4 * k5KB);
+
+ // Decrease buffer size below current usage
+ ASSERT_OK(dbfull()->SetOptions({
+ {"write_buffer_size", "65536"},
+ }));
+ // The existing memtable became eligible for flush when we reduced its
+ // capacity to 64KB. Two keys need to be added to trigger flush: first causes
+ // memtable to be marked full, second schedules the flush. Then we should have
+ // a 128KB L0 file, a 64KB L0 file, and a memtable with just one key.
+ gen_l0_kb(2);
+ ASSERT_EQ(NumTableFilesAtLevel(0), 2);
+ ASSERT_LT(SizeAtLevel(0), k128KB + k64KB + 2 * k5KB);
+ ASSERT_GT(SizeAtLevel(0), k128KB + k64KB - 4 * k5KB);
+
+ // Test max_write_buffer_number
+ // Block compaction thread, which will also block the flushes because
+ // max_background_flushes == 0, so flushes are getting executed by the
+ // compaction thread
+ env_->SetBackgroundThreads(1, Env::LOW);
+ test::SleepingBackgroundTask sleeping_task_low;
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+ Env::Priority::LOW);
+ // Start from scratch and disable compaction/flush. Flush can only happen
+ // during compaction but trigger is pretty high
+ options.disable_auto_compactions = true;
+ DestroyAndReopen(options);
+ env_->SetBackgroundThreads(0, Env::HIGH);
+
+ // Put until writes are stopped, bounded by 256 puts. We should see stop at
+ // ~128KB
+ int count = 0;
+ Random rnd(301);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::DelayWrite:Wait",
+ [&](void* /*arg*/) { sleeping_task_low.WakeUp(); });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
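+  // Once writes stall on full memtables, DelayWrite:Wait fires, the sleeping
+  // LOW-priority task is woken up, and the fill loop below exits.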
+ while (!sleeping_task_low.WokenUp() && count < 256) {
+ ASSERT_OK(Put(Key(count), rnd.RandomString(1024), WriteOptions()));
+ count++;
+ }
+ ASSERT_GT(static_cast<double>(count), 128 * 0.8);
+ ASSERT_LT(static_cast<double>(count), 128 * 1.2);
+
+ sleeping_task_low.WaitUntilDone();
+
+ // Increase
+ ASSERT_OK(dbfull()->SetOptions({
+ {"max_write_buffer_number", "8"},
+ }));
+ // Clean up memtable and L0
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+ sleeping_task_low.Reset();
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+ Env::Priority::LOW);
+ count = 0;
+ while (!sleeping_task_low.WokenUp() && count < 1024) {
+ ASSERT_OK(Put(Key(count), rnd.RandomString(1024), WriteOptions()));
+ count++;
+ }
+// Windows fails this test. Will tune in the future and figure out an
+// appropriate number.
+#ifndef OS_WIN
+ ASSERT_GT(static_cast<double>(count), 512 * 0.8);
+ ASSERT_LT(static_cast<double>(count), 512 * 1.2);
+#endif
+ sleeping_task_low.WaitUntilDone();
+
+ // Decrease
+ ASSERT_OK(dbfull()->SetOptions({
+ {"max_write_buffer_number", "4"},
+ }));
+ // Clean up memtable and L0
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+ sleeping_task_low.Reset();
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+ Env::Priority::LOW);
+
+ count = 0;
+ while (!sleeping_task_low.WokenUp() && count < 1024) {
+ ASSERT_OK(Put(Key(count), rnd.RandomString(1024), WriteOptions()));
+ count++;
+ }
+// Windows fails this test. Will tune in the future and figure out an
+// appropriate number.
+#ifndef OS_WIN
+ ASSERT_GT(static_cast<double>(count), 256 * 0.8);
+ ASSERT_LT(static_cast<double>(count), 266 * 1.2);
+#endif
+ sleeping_task_low.WaitUntilDone();
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+#endif // ROCKSDB_LITE
+
+#ifdef ROCKSDB_USING_THREAD_STATUS
+namespace {
+void VerifyOperationCount(Env* env, ThreadStatus::OperationType op_type,
+ int expected_count) {
+ int op_count = 0;
+ std::vector<ThreadStatus> thread_list;
+ ASSERT_OK(env->GetThreadList(&thread_list));
+ for (auto thread : thread_list) {
+ if (thread.operation_type == op_type) {
+ op_count++;
+ }
+ }
+ ASSERT_EQ(op_count, expected_count);
+}
+} // anonymous namespace
+
+TEST_F(DBTest, GetThreadStatus) {
+ Options options;
+ options.env = env_;
+ options.enable_thread_tracking = true;
+ TryReopen(options);
+
+ std::vector<ThreadStatus> thread_list;
+ Status s = env_->GetThreadList(&thread_list);
+
+ for (int i = 0; i < 2; ++i) {
+    // repeat the test with different numbers of high / low priority threads
+ const int kTestCount = 3;
+ const unsigned int kHighPriCounts[kTestCount] = {3, 2, 5};
+ const unsigned int kLowPriCounts[kTestCount] = {10, 15, 3};
+ const unsigned int kBottomPriCounts[kTestCount] = {2, 1, 4};
+ for (int test = 0; test < kTestCount; ++test) {
+ // Change the number of threads in high / low priority pool.
+ env_->SetBackgroundThreads(kHighPriCounts[test], Env::HIGH);
+ env_->SetBackgroundThreads(kLowPriCounts[test], Env::LOW);
+ env_->SetBackgroundThreads(kBottomPriCounts[test], Env::BOTTOM);
+      // Wait to ensure all threads have been registered
+ unsigned int thread_type_counts[ThreadStatus::NUM_THREAD_TYPES];
+ // TODO(ajkr): it'd be better if SetBackgroundThreads returned only after
+ // all threads have been registered.
+ // Try up to 60 seconds.
+ for (int num_try = 0; num_try < 60000; num_try++) {
+ env_->SleepForMicroseconds(1000);
+ thread_list.clear();
+ s = env_->GetThreadList(&thread_list);
+ ASSERT_OK(s);
+ memset(thread_type_counts, 0, sizeof(thread_type_counts));
+ for (auto thread : thread_list) {
+ ASSERT_LT(thread.thread_type, ThreadStatus::NUM_THREAD_TYPES);
+ thread_type_counts[thread.thread_type]++;
+ }
+ if (thread_type_counts[ThreadStatus::HIGH_PRIORITY] ==
+ kHighPriCounts[test] &&
+ thread_type_counts[ThreadStatus::LOW_PRIORITY] ==
+ kLowPriCounts[test] &&
+ thread_type_counts[ThreadStatus::BOTTOM_PRIORITY] ==
+ kBottomPriCounts[test]) {
+ break;
+ }
+ }
+ // Verify the number of high-priority threads
+ ASSERT_EQ(thread_type_counts[ThreadStatus::HIGH_PRIORITY],
+ kHighPriCounts[test]);
+ // Verify the number of low-priority threads
+ ASSERT_EQ(thread_type_counts[ThreadStatus::LOW_PRIORITY],
+ kLowPriCounts[test]);
+ // Verify the number of bottom-priority threads
+ ASSERT_EQ(thread_type_counts[ThreadStatus::BOTTOM_PRIORITY],
+ kBottomPriCounts[test]);
+ }
+ if (i == 0) {
+ // repeat the test with multiple column families
+ CreateAndReopenWithCF({"pikachu", "about-to-remove"}, options);
+ env_->GetThreadStatusUpdater()->TEST_VerifyColumnFamilyInfoMap(handles_,
+ true);
+ }
+ }
+ ASSERT_OK(db_->DropColumnFamily(handles_[2]));
+ delete handles_[2];
+ handles_.erase(handles_.begin() + 2);
+ env_->GetThreadStatusUpdater()->TEST_VerifyColumnFamilyInfoMap(handles_,
+ true);
+ Close();
+ env_->GetThreadStatusUpdater()->TEST_VerifyColumnFamilyInfoMap(handles_,
+ true);
+}
+
+TEST_F(DBTest, DisableThreadStatus) {
+ Options options;
+ options.env = env_;
+ options.enable_thread_tracking = false;
+ TryReopen(options);
+ CreateAndReopenWithCF({"pikachu", "about-to-remove"}, options);
+  // Verify none of the column family info exists
+ env_->GetThreadStatusUpdater()->TEST_VerifyColumnFamilyInfoMap(handles_,
+ false);
+}
+
+TEST_F(DBTest, ThreadStatusFlush) {
+ Options options;
+ options.env = env_;
+ options.write_buffer_size = 100000; // Small write buffer
+ options.enable_thread_tracking = true;
+ options = CurrentOptions(options);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ {"FlushJob::FlushJob()", "DBTest::ThreadStatusFlush:1"},
+ {"DBTest::ThreadStatusFlush:2", "FlushJob::WriteLevel0Table"},
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ CreateAndReopenWithCF({"pikachu"}, options);
+ VerifyOperationCount(env_, ThreadStatus::OP_FLUSH, 0);
+
+ ASSERT_OK(Put(1, "foo", "v1"));
+ ASSERT_EQ("v1", Get(1, "foo"));
+ VerifyOperationCount(env_, ThreadStatus::OP_FLUSH, 0);
+
+ uint64_t num_running_flushes = 0;
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kNumRunningFlushes,
+ &num_running_flushes));
+ ASSERT_EQ(num_running_flushes, 0);
+
+ ASSERT_OK(Put(1, "k1", std::string(100000, 'x'))); // Fill memtable
+ ASSERT_OK(Put(1, "k2", std::string(100000, 'y'))); // Trigger flush
+
+ // The first sync point is to make sure there's one flush job
+ // running when we perform VerifyOperationCount().
+ TEST_SYNC_POINT("DBTest::ThreadStatusFlush:1");
+ VerifyOperationCount(env_, ThreadStatus::OP_FLUSH, 1);
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kNumRunningFlushes,
+ &num_running_flushes));
+ ASSERT_EQ(num_running_flushes, 1);
+ // This second sync point is to ensure the flush job will not
+ // be completed until we already perform VerifyOperationCount().
+ TEST_SYNC_POINT("DBTest::ThreadStatusFlush:2");
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_P(DBTestWithParam, ThreadStatusSingleCompaction) {
+ const int kTestKeySize = 16;
+ const int kTestValueSize = 984;
+ const int kEntrySize = kTestKeySize + kTestValueSize;
+ const int kEntriesPerBuffer = 100;
+ Options options;
+ options.create_if_missing = true;
+ options.write_buffer_size = kEntrySize * kEntriesPerBuffer;
+ options.compaction_style = kCompactionStyleLevel;
+ options.target_file_size_base = options.write_buffer_size;
+ options.max_bytes_for_level_base = options.target_file_size_base * 2;
+ options.max_bytes_for_level_multiplier = 2;
+ options.compression = kNoCompression;
+ options = CurrentOptions(options);
+ options.env = env_;
+ options.enable_thread_tracking = true;
+ const int kNumL0Files = 4;
+ options.level0_file_num_compaction_trigger = kNumL0Files;
+ options.max_subcompactions = max_subcompactions_;
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ {"DBTest::ThreadStatusSingleCompaction:0", "DBImpl::BGWorkCompaction"},
+ {"CompactionJob::Run():Start", "DBTest::ThreadStatusSingleCompaction:1"},
+ {"DBTest::ThreadStatusSingleCompaction:2", "CompactionJob::Run():End"},
+ });
+ for (int tests = 0; tests < 2; ++tests) {
+ DestroyAndReopen(options);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearTrace();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Random rnd(301);
+ // The Put Phase.
+ for (int file = 0; file < kNumL0Files; ++file) {
+ for (int key = 0; key < kEntriesPerBuffer; ++key) {
+ ASSERT_OK(Put(std::to_string(key + file * kEntriesPerBuffer),
+ rnd.RandomString(kTestValueSize)));
+ }
+ ASSERT_OK(Flush());
+ }
+ // This makes sure a compaction won't be scheduled until
+    // we are done with the above Put phase.
+ uint64_t num_running_compactions = 0;
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kNumRunningCompactions,
+ &num_running_compactions));
+ ASSERT_EQ(num_running_compactions, 0);
+ TEST_SYNC_POINT("DBTest::ThreadStatusSingleCompaction:0");
+ ASSERT_GE(NumTableFilesAtLevel(0),
+ options.level0_file_num_compaction_trigger);
+
+ // This makes sure at least one compaction is running.
+ TEST_SYNC_POINT("DBTest::ThreadStatusSingleCompaction:1");
+
+ if (options.enable_thread_tracking) {
+ // expecting one single L0 to L1 compaction
+ VerifyOperationCount(env_, ThreadStatus::OP_COMPACTION, 1);
+ } else {
+ // If thread tracking is not enabled, compaction count should be 0.
+ VerifyOperationCount(env_, ThreadStatus::OP_COMPACTION, 0);
+ }
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kNumRunningCompactions,
+ &num_running_compactions));
+ ASSERT_EQ(num_running_compactions, 1);
+ // TODO(yhchiang): adding assert to verify each compaction stage.
+ TEST_SYNC_POINT("DBTest::ThreadStatusSingleCompaction:2");
+
+ // repeat the test with disabling thread tracking.
+ options.enable_thread_tracking = false;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ }
+}
+
+TEST_P(DBTestWithParam, PreShutdownManualCompaction) {
+ Options options = CurrentOptions();
+ options.max_subcompactions = max_subcompactions_;
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // iter - 0 with 7 levels
+ // iter - 1 with 3 levels
+ for (int iter = 0; iter < 2; ++iter) {
+ MakeTables(3, "p", "q", 1);
+ ASSERT_EQ("1,1,1", FilesPerLevel(1));
+
+ // Compaction range falls before files
+ Compact(1, "", "c");
+ ASSERT_EQ("1,1,1", FilesPerLevel(1));
+
+ // Compaction range falls after files
+ Compact(1, "r", "z");
+ ASSERT_EQ("1,1,1", FilesPerLevel(1));
+
+ // Compaction range overlaps files
+ Compact(1, "p", "q");
+ ASSERT_EQ("0,0,1", FilesPerLevel(1));
+
+ // Populate a different range
+ MakeTables(3, "c", "e", 1);
+ ASSERT_EQ("1,1,2", FilesPerLevel(1));
+
+ // Compact just the new range
+ Compact(1, "b", "f");
+ ASSERT_EQ("0,0,2", FilesPerLevel(1));
+
+ // Compact all
+ MakeTables(1, "a", "z", 1);
+ ASSERT_EQ("1,0,2", FilesPerLevel(1));
+ CancelAllBackgroundWork(db_);
+ ASSERT_TRUE(
+ db_->CompactRange(CompactRangeOptions(), handles_[1], nullptr, nullptr)
+ .IsShutdownInProgress());
+ ASSERT_EQ("1,0,2", FilesPerLevel(1));
+
+ if (iter == 0) {
+ options = CurrentOptions();
+ options.num_levels = 3;
+ options.create_if_missing = true;
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+ }
+ }
+}
+
+TEST_F(DBTest, PreShutdownFlush) {
+ Options options = CurrentOptions();
+ CreateAndReopenWithCF({"pikachu"}, options);
+ ASSERT_OK(Put(1, "key", "value"));
+ CancelAllBackgroundWork(db_);
+ Status s =
+ db_->CompactRange(CompactRangeOptions(), handles_[1], nullptr, nullptr);
+ ASSERT_TRUE(s.IsShutdownInProgress());
+}
+
+TEST_P(DBTestWithParam, PreShutdownMultipleCompaction) {
+ const int kTestKeySize = 16;
+ const int kTestValueSize = 984;
+ const int kEntrySize = kTestKeySize + kTestValueSize;
+ const int kEntriesPerBuffer = 40;
+ const int kNumL0Files = 4;
+
+ const int kHighPriCount = 3;
+ const int kLowPriCount = 5;
+ env_->SetBackgroundThreads(kHighPriCount, Env::HIGH);
+ env_->SetBackgroundThreads(kLowPriCount, Env::LOW);
+
+ Options options;
+ options.create_if_missing = true;
+ options.write_buffer_size = kEntrySize * kEntriesPerBuffer;
+ options.compaction_style = kCompactionStyleLevel;
+ options.target_file_size_base = options.write_buffer_size;
+ options.max_bytes_for_level_base =
+ options.target_file_size_base * kNumL0Files;
+ options.compression = kNoCompression;
+ options = CurrentOptions(options);
+ options.env = env_;
+ options.enable_thread_tracking = true;
+ options.level0_file_num_compaction_trigger = kNumL0Files;
+ options.max_bytes_for_level_multiplier = 2;
+ options.max_background_compactions = kLowPriCount;
+ options.level0_stop_writes_trigger = 1 << 10;
+ options.level0_slowdown_writes_trigger = 1 << 10;
+ options.max_subcompactions = max_subcompactions_;
+
+ TryReopen(options);
+ Random rnd(301);
+
+ std::vector<ThreadStatus> thread_list;
+ // Delay both flush and compaction
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"FlushJob::FlushJob()", "CompactionJob::Run():Start"},
+ {"CompactionJob::Run():Start",
+ "DBTest::PreShutdownMultipleCompaction:Preshutdown"},
+ {"CompactionJob::Run():Start",
+ "DBTest::PreShutdownMultipleCompaction:VerifyCompaction"},
+ {"DBTest::PreShutdownMultipleCompaction:Preshutdown",
+ "CompactionJob::Run():End"},
+ {"CompactionJob::Run():End",
+ "DBTest::PreShutdownMultipleCompaction:VerifyPreshutdown"}});
+
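+  // The dependencies ensure that at least one compaction has started, and
+  // cannot finish, before the test reaches the Preshutdown marker below.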
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // Make rocksdb busy
+ int key = 0;
+ // check how many threads are doing compaction using GetThreadList
+ int operation_count[ThreadStatus::NUM_OP_TYPES] = {0};
+ for (int file = 0; file < 16 * kNumL0Files; ++file) {
+ for (int k = 0; k < kEntriesPerBuffer; ++k) {
+ ASSERT_OK(Put(std::to_string(key++), rnd.RandomString(kTestValueSize)));
+ }
+
+ ASSERT_OK(env_->GetThreadList(&thread_list));
+ for (auto thread : thread_list) {
+ operation_count[thread.operation_type]++;
+ }
+
+ // Speed up the test
+ if (operation_count[ThreadStatus::OP_FLUSH] > 1 &&
+ operation_count[ThreadStatus::OP_COMPACTION] >
+ 0.6 * options.max_background_compactions) {
+ break;
+ }
+ if (file == 15 * kNumL0Files) {
+ TEST_SYNC_POINT("DBTest::PreShutdownMultipleCompaction:Preshutdown");
+ }
+ }
+
+ TEST_SYNC_POINT("DBTest::PreShutdownMultipleCompaction:Preshutdown");
+ ASSERT_GE(operation_count[ThreadStatus::OP_COMPACTION], 1);
+ CancelAllBackgroundWork(db_);
+ TEST_SYNC_POINT("DBTest::PreShutdownMultipleCompaction:VerifyPreshutdown");
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  // Count running operations; no compactions should remain after shutdown.
+ for (int i = 0; i < ThreadStatus::NUM_OP_TYPES; ++i) {
+ operation_count[i] = 0;
+ }
+ ASSERT_OK(env_->GetThreadList(&thread_list));
+ for (auto thread : thread_list) {
+ operation_count[thread.operation_type]++;
+ }
+ ASSERT_EQ(operation_count[ThreadStatus::OP_COMPACTION], 0);
+}
+
+TEST_P(DBTestWithParam, PreShutdownCompactionMiddle) {
+ const int kTestKeySize = 16;
+ const int kTestValueSize = 984;
+ const int kEntrySize = kTestKeySize + kTestValueSize;
+ const int kEntriesPerBuffer = 40;
+ const int kNumL0Files = 4;
+
+ const int kHighPriCount = 3;
+ const int kLowPriCount = 5;
+ env_->SetBackgroundThreads(kHighPriCount, Env::HIGH);
+ env_->SetBackgroundThreads(kLowPriCount, Env::LOW);
+
+ Options options;
+ options.create_if_missing = true;
+ options.write_buffer_size = kEntrySize * kEntriesPerBuffer;
+ options.compaction_style = kCompactionStyleLevel;
+ options.target_file_size_base = options.write_buffer_size;
+ options.max_bytes_for_level_base =
+ options.target_file_size_base * kNumL0Files;
+ options.compression = kNoCompression;
+ options = CurrentOptions(options);
+ options.env = env_;
+ options.enable_thread_tracking = true;
+ options.level0_file_num_compaction_trigger = kNumL0Files;
+ options.max_bytes_for_level_multiplier = 2;
+ options.max_background_compactions = kLowPriCount;
+ options.level0_stop_writes_trigger = 1 << 10;
+ options.level0_slowdown_writes_trigger = 1 << 10;
+ options.max_subcompactions = max_subcompactions_;
+
+ TryReopen(options);
+ Random rnd(301);
+
+ std::vector<ThreadStatus> thread_list;
+ // Delay both flush and compaction
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBTest::PreShutdownCompactionMiddle:Preshutdown",
+ "CompactionJob::Run():Inprogress"},
+ {"CompactionJob::Run():Start",
+ "DBTest::PreShutdownCompactionMiddle:VerifyCompaction"},
+ {"CompactionJob::Run():Inprogress", "CompactionJob::Run():End"},
+ {"CompactionJob::Run():End",
+ "DBTest::PreShutdownCompactionMiddle:VerifyPreshutdown"}});
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // Make rocksdb busy
+ int key = 0;
+ // check how many threads are doing compaction using GetThreadList
+ int operation_count[ThreadStatus::NUM_OP_TYPES] = {0};
+ for (int file = 0; file < 16 * kNumL0Files; ++file) {
+ for (int k = 0; k < kEntriesPerBuffer; ++k) {
+ ASSERT_OK(Put(std::to_string(key++), rnd.RandomString(kTestValueSize)));
+ }
+
+ ASSERT_OK(env_->GetThreadList(&thread_list));
+ for (auto thread : thread_list) {
+ operation_count[thread.operation_type]++;
+ }
+
+ // Speed up the test
+ if (operation_count[ThreadStatus::OP_FLUSH] > 1 &&
+ operation_count[ThreadStatus::OP_COMPACTION] >
+ 0.6 * options.max_background_compactions) {
+ break;
+ }
+ if (file == 15 * kNumL0Files) {
+ TEST_SYNC_POINT("DBTest::PreShutdownCompactionMiddle:VerifyCompaction");
+ }
+ }
+
+ ASSERT_GE(operation_count[ThreadStatus::OP_COMPACTION], 1);
+ CancelAllBackgroundWork(db_);
+ TEST_SYNC_POINT("DBTest::PreShutdownCompactionMiddle:Preshutdown");
+ TEST_SYNC_POINT("DBTest::PreShutdownCompactionMiddle:VerifyPreshutdown");
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  // Re-count operations now that background work has been canceled.
+ for (int i = 0; i < ThreadStatus::NUM_OP_TYPES; ++i) {
+ operation_count[i] = 0;
+ }
+ ASSERT_OK(env_->GetThreadList(&thread_list));
+ for (auto thread : thread_list) {
+ operation_count[thread.operation_type]++;
+ }
+ ASSERT_EQ(operation_count[ThreadStatus::OP_COMPACTION], 0);
+}
+
+#endif // ROCKSDB_USING_THREAD_STATUS
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest, FlushOnDestroy) {
+ WriteOptions wo;
+ wo.disableWAL = true;
+ ASSERT_OK(Put("foo", "v1", wo));
+ CancelAllBackgroundWork(db_);
+}
+
+TEST_F(DBTest, DynamicLevelCompressionPerLevel) {
+ if (!Snappy_Supported()) {
+ return;
+ }
+ const int kNKeys = 120;
+ int keys[kNKeys];
+ for (int i = 0; i < kNKeys; i++) {
+ keys[i] = i;
+ }
+ RandomShuffle(std::begin(keys), std::end(keys));
+
+ Random rnd(301);
+ Options options;
+ options.env = env_;
+ options.create_if_missing = true;
+ options.db_write_buffer_size = 20480;
+ options.write_buffer_size = 20480;
+ options.max_write_buffer_number = 2;
+ options.level0_file_num_compaction_trigger = 2;
+ options.level0_slowdown_writes_trigger = 2;
+ options.level0_stop_writes_trigger = 2;
+ options.target_file_size_base = 20480;
+ options.level_compaction_dynamic_level_bytes = true;
+ options.max_bytes_for_level_base = 102400;
+ options.max_bytes_for_level_multiplier = 4;
+ options.max_background_compactions = 1;
+ options.num_levels = 5;
+
+ options.compression_per_level.resize(3);
+ options.compression_per_level[0] = kNoCompression;
+ options.compression_per_level[1] = kNoCompression;
+ options.compression_per_level[2] = kSnappyCompression;
+
+ OnFileDeletionListener* listener = new OnFileDeletionListener();
+ options.listeners.emplace_back(listener);
+
+ DestroyAndReopen(options);
+
+ // Insert more than 80K. L4 should be base level. Neither L0 nor L4 should
+ // be compressed, so total data size should be more than 80K.
+ for (int i = 0; i < 20; i++) {
+ ASSERT_OK(Put(Key(keys[i]), CompressibleString(&rnd, 4000)));
+ }
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ ASSERT_EQ(NumTableFilesAtLevel(1), 0);
+ ASSERT_EQ(NumTableFilesAtLevel(2), 0);
+ ASSERT_EQ(NumTableFilesAtLevel(3), 0);
+  // Assuming each file's metadata is at least 50 bytes.
+ ASSERT_GT(SizeAtLevel(0) + SizeAtLevel(4), 20U * 4000U + 50U * 4);
+
+ // Insert 400KB. Some data will be compressed
+ for (int i = 21; i < 120; i++) {
+ ASSERT_OK(Put(Key(keys[i]), CompressibleString(&rnd, 4000)));
+ }
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(NumTableFilesAtLevel(1), 0);
+ ASSERT_EQ(NumTableFilesAtLevel(2), 0);
+
+ ASSERT_LT(SizeAtLevel(0) + SizeAtLevel(3) + SizeAtLevel(4),
+ 120U * 4000U + 50U * 24);
+  // Make sure data in the L3 files is not compacted, by removing all files in
+  // L4 and counting the number of remaining rows.
+ ASSERT_OK(dbfull()->SetOptions({
+ {"disable_auto_compactions", "true"},
+ }));
+ ColumnFamilyMetaData cf_meta;
+ db_->GetColumnFamilyMetaData(&cf_meta);
+ for (auto file : cf_meta.levels[4].files) {
+ listener->SetExpectedFileName(dbname_ + file.name);
+ ASSERT_OK(dbfull()->DeleteFile(file.name));
+ }
+ listener->VerifyMatchedCount(cf_meta.levels[4].files.size());
+
+ int num_keys = 0;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions()));
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ num_keys++;
+ }
+ ASSERT_OK(iter->status());
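+  // After deleting the L4 files, the remaining keys live in L0 and L3, where
+  // data is stored uncompressed in this test, so the total size should exceed
+  // the raw ~4000-byte values plus a small per-key allowance.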
+ ASSERT_GT(SizeAtLevel(0) + SizeAtLevel(3), num_keys * 4000U + num_keys * 10U);
+}
+
+TEST_F(DBTest, DynamicLevelCompressionPerLevel2) {
+ if (!Snappy_Supported() || !LZ4_Supported() || !Zlib_Supported()) {
+ return;
+ }
+ const int kNKeys = 500;
+ int keys[kNKeys];
+ for (int i = 0; i < kNKeys; i++) {
+ keys[i] = i;
+ }
+ RandomShuffle(std::begin(keys), std::end(keys));
+
+ Random rnd(301);
+ Options options;
+ options.create_if_missing = true;
+ options.db_write_buffer_size = 6000000;
+ options.write_buffer_size = 600000;
+ options.max_write_buffer_number = 2;
+ options.level0_file_num_compaction_trigger = 2;
+ options.level0_slowdown_writes_trigger = 2;
+ options.level0_stop_writes_trigger = 2;
+ options.soft_pending_compaction_bytes_limit = 1024 * 1024;
+ options.target_file_size_base = 20;
+ options.env = env_;
+ options.level_compaction_dynamic_level_bytes = true;
+ options.max_bytes_for_level_base = 200;
+ options.max_bytes_for_level_multiplier = 8;
+ options.max_background_compactions = 1;
+ options.num_levels = 5;
+ std::shared_ptr<mock::MockTableFactory> mtf(new mock::MockTableFactory);
+ options.table_factory = mtf;
+
+ options.compression_per_level.resize(3);
+ options.compression_per_level[0] = kNoCompression;
+ options.compression_per_level[1] = kLZ4Compression;
+ options.compression_per_level[2] = kZlibCompression;
+
+ DestroyAndReopen(options);
+ // When base level is L4, L4 is LZ4.
+ std::atomic<int> num_zlib(0);
+ std::atomic<int> num_lz4(0);
+ std::atomic<int> num_no(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
+ Compaction* compaction = reinterpret_cast<Compaction*>(arg);
+ if (compaction->output_level() == 4) {
+ ASSERT_TRUE(compaction->output_compression() == kLZ4Compression);
+ num_lz4.fetch_add(1);
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "FlushJob::WriteLevel0Table:output_compression", [&](void* arg) {
+ auto* compression = reinterpret_cast<CompressionType*>(arg);
+ ASSERT_TRUE(*compression == kNoCompression);
+ num_no.fetch_add(1);
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ for (int i = 0; i < 100; i++) {
+ std::string value = rnd.RandomString(200);
+ ASSERT_OK(Put(Key(keys[i]), value));
+ if (i % 25 == 24) {
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+ }
+
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+
+ ASSERT_EQ(NumTableFilesAtLevel(1), 0);
+ ASSERT_EQ(NumTableFilesAtLevel(2), 0);
+ ASSERT_EQ(NumTableFilesAtLevel(3), 0);
+ ASSERT_GT(NumTableFilesAtLevel(4), 0);
+ ASSERT_GT(num_no.load(), 2);
+ ASSERT_GT(num_lz4.load(), 0);
+ int prev_num_files_l4 = NumTableFilesAtLevel(4);
+
+ // After base level turn L4->L3, L3 becomes LZ4 and L4 becomes Zlib
+ num_lz4.store(0);
+ num_no.store(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
+ Compaction* compaction = reinterpret_cast<Compaction*>(arg);
+ if (compaction->output_level() == 4 && compaction->start_level() == 3) {
+ ASSERT_TRUE(compaction->output_compression() == kZlibCompression);
+ num_zlib.fetch_add(1);
+ } else {
+ ASSERT_TRUE(compaction->output_compression() == kLZ4Compression);
+ num_lz4.fetch_add(1);
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "FlushJob::WriteLevel0Table:output_compression", [&](void* arg) {
+ auto* compression = reinterpret_cast<CompressionType*>(arg);
+ ASSERT_TRUE(*compression == kNoCompression);
+ num_no.fetch_add(1);
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ for (int i = 101; i < 500; i++) {
+ std::string value = rnd.RandomString(200);
+ ASSERT_OK(Put(Key(keys[i]), value));
+ if (i % 100 == 99) {
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+ }
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ASSERT_EQ(NumTableFilesAtLevel(1), 0);
+ ASSERT_EQ(NumTableFilesAtLevel(2), 0);
+ ASSERT_GT(NumTableFilesAtLevel(3), 0);
+ ASSERT_GT(NumTableFilesAtLevel(4), prev_num_files_l4);
+ ASSERT_GT(num_no.load(), 2);
+ ASSERT_GT(num_lz4.load(), 0);
+ ASSERT_GT(num_zlib.load(), 0);
+}
+
+TEST_F(DBTest, DynamicCompactionOptions) {
+ // minimum write buffer size is enforced at 64KB
+ const uint64_t k32KB = 1 << 15;
+ const uint64_t k64KB = 1 << 16;
+ const uint64_t k128KB = 1 << 17;
+ const uint64_t k1MB = 1 << 20;
+ const uint64_t k4KB = 1 << 12;
+ Options options;
+ options.env = env_;
+ options.create_if_missing = true;
+ options.compression = kNoCompression;
+ options.soft_pending_compaction_bytes_limit = 1024 * 1024;
+ options.write_buffer_size = k64KB;
+ options.arena_block_size = 4 * k4KB;
+ options.max_write_buffer_number = 2;
+ // Compaction related options
+ options.level0_file_num_compaction_trigger = 3;
+ options.level0_slowdown_writes_trigger = 4;
+ options.level0_stop_writes_trigger = 8;
+ options.target_file_size_base = k64KB;
+ options.max_compaction_bytes = options.target_file_size_base * 10;
+ options.target_file_size_multiplier = 1;
+ options.max_bytes_for_level_base = k128KB;
+ options.max_bytes_for_level_multiplier = 4;
+
+ // Block flush thread and disable compaction thread
+ env_->SetBackgroundThreads(1, Env::LOW);
+ env_->SetBackgroundThreads(1, Env::HIGH);
+ DestroyAndReopen(options);
+
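+  // Helper: writes `size` entries of ~1KB each at keys start, start+stride,
+  // start+2*stride, ..., then waits for the memtable flush; each call is
+  // expected to add roughly one `size`-KB file to L0.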
+ auto gen_l0_kb = [this](int start, int size, int stride) {
+ Random rnd(301);
+ for (int i = 0; i < size; i++) {
+ ASSERT_OK(Put(Key(start + stride * i), rnd.RandomString(1024)));
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ };
+
+ // Write 3 files that have the same key range.
+ // Since level0_file_num_compaction_trigger is 3, compaction should be
+ // triggered. The compaction should result in one L1 file
+ gen_l0_kb(0, 64, 1);
+ ASSERT_EQ(NumTableFilesAtLevel(0), 1);
+ gen_l0_kb(0, 64, 1);
+ ASSERT_EQ(NumTableFilesAtLevel(0), 2);
+ gen_l0_kb(0, 64, 1);
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ("0,1", FilesPerLevel());
+ std::vector<LiveFileMetaData> metadata;
+ db_->GetLiveFilesMetaData(&metadata);
+ ASSERT_EQ(1U, metadata.size());
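+  // The single compacted L1 file should be close to target_file_size_base
+  // (64KB); the checks below allow about 4KB of slack.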
+ ASSERT_LE(metadata[0].size, k64KB + k4KB);
+ ASSERT_GE(metadata[0].size, k64KB - k4KB);
+
+ // Test compaction trigger and target_file_size_base
+  // Reduce the compaction trigger to 2, and reduce the L1 file size to 32KB.
+  // Writing two 64KB L0 files should trigger a compaction. Since these
+  // 2 L0 files have the same key range, the compaction merges them and should
+  // result in 2 32KB L1 files.
+ ASSERT_OK(
+ dbfull()->SetOptions({{"level0_file_num_compaction_trigger", "2"},
+ {"target_file_size_base", std::to_string(k32KB)}}));
+
+ gen_l0_kb(0, 64, 1);
+ ASSERT_EQ("1,1", FilesPerLevel());
+ gen_l0_kb(0, 64, 1);
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ("0,2", FilesPerLevel());
+ metadata.clear();
+ db_->GetLiveFilesMetaData(&metadata);
+ ASSERT_EQ(2U, metadata.size());
+ ASSERT_LE(metadata[0].size, k32KB + k4KB);
+ ASSERT_GE(metadata[0].size, k32KB - k4KB);
+ ASSERT_LE(metadata[1].size, k32KB + k4KB);
+ ASSERT_GE(metadata[1].size, k32KB - k4KB);
+
+ // Test max_bytes_for_level_base
+  // Increase the level base size to 1MB and write enough data to fill L1 and
+  // L2. L1 size should be around 1MB while L2 size should be around 1MB x 4.
+ ASSERT_OK(dbfull()->SetOptions(
+ {{"max_bytes_for_level_base", std::to_string(k1MB)}}));
+
+ // writing 96 x 64KB => 6 * 1024KB
+ // (L1 + L2) = (1 + 4) * 1024KB
+ for (int i = 0; i < 96; ++i) {
+ gen_l0_kb(i, 64, 96);
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_GT(SizeAtLevel(1), k1MB / 2);
+ ASSERT_LT(SizeAtLevel(1), k1MB + k1MB / 2);
+
+ // Within (0.5, 1.5) of 4MB.
+ ASSERT_GT(SizeAtLevel(2), 2 * k1MB);
+ ASSERT_LT(SizeAtLevel(2), 6 * k1MB);
+
+ // Test max_bytes_for_level_multiplier and
+  // max_bytes_for_level_base. Now reduce both the multiplier and the level
+  // base. After filling enough data to fit in L1 - L3, we should see L1's
+  // size drop to around 128KB from the ~1MB asserted previously; same for L2.
+ ASSERT_OK(dbfull()->SetOptions(
+ {{"max_bytes_for_level_multiplier", "2"},
+ {"max_bytes_for_level_base", std::to_string(k128KB)}}));
+
+ // writing 20 x 64KB = 10 x 128KB
+ // (L1 + L2 + L3) = (1 + 2 + 4) * 128KB
+ for (int i = 0; i < 20; ++i) {
+ gen_l0_kb(i, 64, 32);
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ uint64_t total_size = SizeAtLevel(1) + SizeAtLevel(2) + SizeAtLevel(3);
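+  // The combined L1-L3 target is (1 + 2 + 4) * 128KB = 7 * 128KB; the 1.5
+  // factor leaves some headroom.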
+ ASSERT_TRUE(total_size < k128KB * 7 * 1.5);
+
+ // Test level0_stop_writes_trigger.
+  // Clean up the memtable and L0. Block the compaction threads. If we keep
+  // writing and flushing memtables, puts should stop after 8 memtable
+  // flushes, since level0_stop_writes_trigger = 8.
+ ASSERT_OK(dbfull()->TEST_FlushMemTable(true, true));
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ // Block compaction
+ test::SleepingBackgroundTask sleeping_task_low;
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+ Env::Priority::LOW);
+ sleeping_task_low.WaitUntilSleeping();
+ ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+ int count = 0;
+ Random rnd(301);
+ WriteOptions wo;
+ while (count < 64) {
+ ASSERT_OK(Put(Key(count), rnd.RandomString(1024), wo));
+ ASSERT_OK(dbfull()->TEST_FlushMemTable(true, true));
+ count++;
+ if (dbfull()->TEST_write_controler().IsStopped()) {
+ sleeping_task_low.WakeUp();
+ break;
+ }
+ }
+ // Stop trigger = 8
+ ASSERT_EQ(count, 8);
+ // Unblock
+ sleeping_task_low.WaitUntilDone();
+
+  // Now reduce level0_stop_writes_trigger to 6. Clean up memtables and L0.
+  // Block the compaction thread again and keep doing puts and memtable
+  // flushes until we see the stop, after 6 memtable flushes.
+ ASSERT_OK(dbfull()->SetOptions({{"level0_stop_writes_trigger", "6"}}));
+ ASSERT_OK(dbfull()->TEST_FlushMemTable(true));
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+
+ // Block compaction again
+ sleeping_task_low.Reset();
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+ Env::Priority::LOW);
+ sleeping_task_low.WaitUntilSleeping();
+ count = 0;
+ while (count < 64) {
+ ASSERT_OK(Put(Key(count), rnd.RandomString(1024), wo));
+ ASSERT_OK(dbfull()->TEST_FlushMemTable(true, true));
+ count++;
+ if (dbfull()->TEST_write_controler().IsStopped()) {
+ sleeping_task_low.WakeUp();
+ break;
+ }
+ }
+ ASSERT_EQ(count, 6);
+ // Unblock
+ sleeping_task_low.WaitUntilDone();
+
+ // Test disable_auto_compactions
+  // The compaction thread is unblocked but auto compaction is disabled. Write
+  // 4 L0 files, which would normally trigger a compaction. Since auto
+  // compaction is disabled, TEST_WaitForCompact has nothing to wait for, and
+  // the number of L0 files does not change after the call.
+ ASSERT_OK(dbfull()->SetOptions({{"disable_auto_compactions", "true"}}));
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+
+ for (int i = 0; i < 4; ++i) {
+ ASSERT_OK(Put(Key(i), rnd.RandomString(1024)));
+ // Wait for compaction so that put won't stop
+ ASSERT_OK(dbfull()->TEST_FlushMemTable(true));
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(NumTableFilesAtLevel(0), 4);
+
+  // Enable auto compaction and repeat the same test; the number of L0 files
+  // should be reduced after compaction.
+ ASSERT_OK(dbfull()->SetOptions({{"disable_auto_compactions", "false"}}));
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+
+ for (int i = 0; i < 4; ++i) {
+ ASSERT_OK(Put(Key(i), rnd.RandomString(1024)));
+ // Wait for compaction so that put won't stop
+ ASSERT_OK(dbfull()->TEST_FlushMemTable(true));
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_LT(NumTableFilesAtLevel(0), 4);
+}
+
+// Test dynamic FIFO compaction options.
+// This test covers just option parsing and makes sure that the options are
+// correctly assigned. Also look at the DBOptionsTest.SetFIFOCompactionOptions
+// test, which makes sure that the FIFO compaction functionality works as
+// expected when the options are changed dynamically.
+// Even more FIFOCompactionTests are at DBTest.FIFOCompaction* .
+TEST_F(DBTest, DynamicFIFOCompactionOptions) {
+ Options options;
+ options.ttl = 0;
+ options.create_if_missing = true;
+ options.env = env_;
+ DestroyAndReopen(options);
+
+ // Initial defaults
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.max_table_files_size,
+ 1024 * 1024 * 1024);
+ ASSERT_EQ(dbfull()->GetOptions().ttl, 0);
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.allow_compaction,
+ false);
+
+ ASSERT_OK(dbfull()->SetOptions(
+ {{"compaction_options_fifo", "{max_table_files_size=23;}"}}));
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.max_table_files_size,
+ 23);
+ ASSERT_EQ(dbfull()->GetOptions().ttl, 0);
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.allow_compaction,
+ false);
+
+ ASSERT_OK(dbfull()->SetOptions({{"ttl", "97"}}));
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.max_table_files_size,
+ 23);
+ ASSERT_EQ(dbfull()->GetOptions().ttl, 97);
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.allow_compaction,
+ false);
+
+ ASSERT_OK(dbfull()->SetOptions({{"ttl", "203"}}));
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.max_table_files_size,
+ 23);
+ ASSERT_EQ(dbfull()->GetOptions().ttl, 203);
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.allow_compaction,
+ false);
+
+ ASSERT_OK(dbfull()->SetOptions(
+ {{"compaction_options_fifo", "{allow_compaction=true;}"}}));
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.max_table_files_size,
+ 23);
+ ASSERT_EQ(dbfull()->GetOptions().ttl, 203);
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.allow_compaction,
+ true);
+
+ ASSERT_OK(dbfull()->SetOptions(
+ {{"compaction_options_fifo", "{max_table_files_size=31;}"}}));
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.max_table_files_size,
+ 31);
+ ASSERT_EQ(dbfull()->GetOptions().ttl, 203);
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.allow_compaction,
+ true);
+
+ ASSERT_OK(dbfull()->SetOptions(
+ {{"compaction_options_fifo",
+ "{max_table_files_size=51;allow_compaction=true;}"}}));
+ ASSERT_OK(dbfull()->SetOptions({{"ttl", "49"}}));
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.max_table_files_size,
+ 51);
+ ASSERT_EQ(dbfull()->GetOptions().ttl, 49);
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.allow_compaction,
+ true);
+}
+
+TEST_F(DBTest, DynamicUniversalCompactionOptions) {
+ Options options;
+ options.create_if_missing = true;
+ options.env = env_;
+ DestroyAndReopen(options);
+
+ // Initial defaults
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.size_ratio, 1U);
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.min_merge_width,
+ 2u);
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.max_merge_width,
+ UINT_MAX);
+ ASSERT_EQ(dbfull()
+ ->GetOptions()
+ .compaction_options_universal.max_size_amplification_percent,
+ 200u);
+ ASSERT_EQ(dbfull()
+ ->GetOptions()
+ .compaction_options_universal.compression_size_percent,
+ -1);
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.stop_style,
+ kCompactionStopStyleTotalSize);
+ ASSERT_EQ(
+ dbfull()->GetOptions().compaction_options_universal.allow_trivial_move,
+ false);
+
+ ASSERT_OK(dbfull()->SetOptions(
+ {{"compaction_options_universal", "{size_ratio=7;}"}}));
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.size_ratio, 7u);
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.min_merge_width,
+ 2u);
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.max_merge_width,
+ UINT_MAX);
+ ASSERT_EQ(dbfull()
+ ->GetOptions()
+ .compaction_options_universal.max_size_amplification_percent,
+ 200u);
+ ASSERT_EQ(dbfull()
+ ->GetOptions()
+ .compaction_options_universal.compression_size_percent,
+ -1);
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.stop_style,
+ kCompactionStopStyleTotalSize);
+ ASSERT_EQ(
+ dbfull()->GetOptions().compaction_options_universal.allow_trivial_move,
+ false);
+
+ ASSERT_OK(dbfull()->SetOptions(
+ {{"compaction_options_universal", "{min_merge_width=11;}"}}));
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.size_ratio, 7u);
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.min_merge_width,
+ 11u);
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.max_merge_width,
+ UINT_MAX);
+ ASSERT_EQ(dbfull()
+ ->GetOptions()
+ .compaction_options_universal.max_size_amplification_percent,
+ 200u);
+ ASSERT_EQ(dbfull()
+ ->GetOptions()
+ .compaction_options_universal.compression_size_percent,
+ -1);
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.stop_style,
+ kCompactionStopStyleTotalSize);
+ ASSERT_EQ(
+ dbfull()->GetOptions().compaction_options_universal.allow_trivial_move,
+ false);
+}
+#endif // ROCKSDB_LITE
+
+TEST_F(DBTest, FileCreationRandomFailure) {
+ Options options;
+ options.env = env_;
+ options.create_if_missing = true;
+ options.write_buffer_size = 100000; // Small write buffer
+ options.target_file_size_base = 200000;
+ options.max_bytes_for_level_base = 1000000;
+ options.max_bytes_for_level_multiplier = 2;
+
+ DestroyAndReopen(options);
+ Random rnd(301);
+
+ constexpr int kCDTKeysPerBuffer = 4;
+ constexpr int kTestSize = kCDTKeysPerBuffer * 4096;
+ constexpr int kTotalIteration = 20;
+  // The second half of the test involves random failures of file creation.
+ constexpr int kRandomFailureTest = kTotalIteration / 2;
+
+ std::vector<std::string> values;
+ for (int i = 0; i < kTestSize; ++i) {
+ values.push_back("NOT_FOUND");
+ }
+ for (int j = 0; j < kTotalIteration; ++j) {
+ if (j == kRandomFailureTest) {
+ env_->non_writeable_rate_.store(90);
+ }
+ for (int k = 0; k < kTestSize; ++k) {
+      // Here we expect some of the Puts to fail.
+ std::string value = rnd.RandomString(100);
+ Status s = Put(Key(k), Slice(value));
+ if (s.ok()) {
+ // update the latest successful put
+ values[k] = value;
+ }
+      // But everything before the simulated failures begin should succeed.
+ if (j < kRandomFailureTest) {
+ ASSERT_OK(s);
+ }
+ }
+ }
+
+  // If RocksDB does not handle the failures correctly, an internal assert
+  // will fail here.
+ ASSERT_TRUE(dbfull()->TEST_WaitForFlushMemTable().IsIOError());
+ ASSERT_TRUE(dbfull()->TEST_WaitForCompact().IsIOError());
+
+ // verify we have the latest successful update
+ for (int k = 0; k < kTestSize; ++k) {
+ auto v = Get(Key(k));
+ ASSERT_EQ(v, values[k]);
+ }
+
+ // reopen and reverify we have the latest successful update
+ env_->non_writeable_rate_.store(0);
+ Reopen(options);
+ for (int k = 0; k < kTestSize; ++k) {
+ auto v = Get(Key(k));
+ ASSERT_EQ(v, values[k]);
+ }
+}
+
+#ifndef ROCKSDB_LITE
+
+TEST_F(DBTest, DynamicMiscOptions) {
+ // Test max_sequential_skip_in_iterations
+ Options options;
+ options.env = env_;
+ options.create_if_missing = true;
+ options.max_sequential_skip_in_iterations = 16;
+ options.compression = kNoCompression;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ DestroyAndReopen(options);
+
+ auto assert_reseek_count = [this, &options](int key_start, int num_reseek) {
+ int key0 = key_start;
+ int key1 = key_start + 1;
+ int key2 = key_start + 2;
+ Random rnd(301);
+ ASSERT_OK(Put(Key(key0), rnd.RandomString(8)));
+ for (int i = 0; i < 10; ++i) {
+ ASSERT_OK(Put(Key(key1), rnd.RandomString(8)));
+ }
+ ASSERT_OK(Put(Key(key2), rnd.RandomString(8)));
+ std::unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions()));
+ iter->Seek(Key(key1));
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Key(key1)), 0);
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Key(key2)), 0);
+ ASSERT_EQ(num_reseek,
+ TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION));
+ };
+ // No reseek
+ assert_reseek_count(100, 0);
+
+ ASSERT_OK(dbfull()->SetOptions({{"max_sequential_skip_in_iterations", "4"}}));
+ // Clear memtable and make new option effective
+ ASSERT_OK(dbfull()->TEST_FlushMemTable(true));
+ // Trigger reseek
+ assert_reseek_count(200, 1);
+
+ ASSERT_OK(
+ dbfull()->SetOptions({{"max_sequential_skip_in_iterations", "16"}}));
+ // Clear memtable and make new option effective
+ ASSERT_OK(dbfull()->TEST_FlushMemTable(true));
+  // No new reseek; the cumulative ticker count stays at 1
+ assert_reseek_count(300, 1);
+
+ MutableCFOptions mutable_cf_options;
+ CreateAndReopenWithCF({"pikachu"}, options);
+ // Test soft_pending_compaction_bytes_limit,
+ // hard_pending_compaction_bytes_limit
+ ASSERT_OK(dbfull()->SetOptions(
+ handles_[1], {{"soft_pending_compaction_bytes_limit", "200"},
+ {"hard_pending_compaction_bytes_limit", "300"}}));
+ ASSERT_OK(dbfull()->TEST_GetLatestMutableCFOptions(handles_[1],
+ &mutable_cf_options));
+ ASSERT_EQ(200, mutable_cf_options.soft_pending_compaction_bytes_limit);
+ ASSERT_EQ(300, mutable_cf_options.hard_pending_compaction_bytes_limit);
+ // Test report_bg_io_stats
+ ASSERT_OK(
+ dbfull()->SetOptions(handles_[1], {{"report_bg_io_stats", "true"}}));
+ // sanity check
+ ASSERT_OK(dbfull()->TEST_GetLatestMutableCFOptions(handles_[1],
+ &mutable_cf_options));
+ ASSERT_TRUE(mutable_cf_options.report_bg_io_stats);
+ // Test compression
+ // sanity check
+ ASSERT_OK(dbfull()->SetOptions({{"compression", "kNoCompression"}}));
+ ASSERT_OK(dbfull()->TEST_GetLatestMutableCFOptions(handles_[0],
+ &mutable_cf_options));
+ ASSERT_EQ(CompressionType::kNoCompression, mutable_cf_options.compression);
+
+ if (Snappy_Supported()) {
+ ASSERT_OK(dbfull()->SetOptions({{"compression", "kSnappyCompression"}}));
+ ASSERT_OK(dbfull()->TEST_GetLatestMutableCFOptions(handles_[0],
+ &mutable_cf_options));
+ ASSERT_EQ(CompressionType::kSnappyCompression,
+ mutable_cf_options.compression);
+ }
+
+ // Test paranoid_file_checks already done in db_block_cache_test
+ ASSERT_OK(
+ dbfull()->SetOptions(handles_[1], {{"paranoid_file_checks", "true"}}));
+ ASSERT_OK(dbfull()->TEST_GetLatestMutableCFOptions(handles_[1],
+ &mutable_cf_options));
+ ASSERT_TRUE(mutable_cf_options.report_bg_io_stats);
+ ASSERT_TRUE(mutable_cf_options.check_flush_compaction_key_order);
+
+ ASSERT_OK(dbfull()->SetOptions(
+ handles_[1], {{"check_flush_compaction_key_order", "false"}}));
+ ASSERT_OK(dbfull()->TEST_GetLatestMutableCFOptions(handles_[1],
+ &mutable_cf_options));
+ ASSERT_FALSE(mutable_cf_options.check_flush_compaction_key_order);
+}
+#endif // ROCKSDB_LITE
+
+TEST_F(DBTest, L0L1L2AndUpHitCounter) {
+ const int kNumLevels = 3;
+ const int kNumKeysPerLevel = 10000;
+ const int kNumKeysPerDb = kNumLevels * kNumKeysPerLevel;
+
+ Options options = CurrentOptions();
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ Reopen(options);
+
+ // After the below loop there will be one file on each of L0, L1, and L2.
+ int key = 0;
+ for (int output_level = kNumLevels - 1; output_level >= 0; --output_level) {
+ for (int i = 0; i < kNumKeysPerLevel; ++i) {
+ ASSERT_OK(Put(Key(key), "val"));
+ key++;
+ }
+ ASSERT_OK(Flush());
+ for (int input_level = 0; input_level < output_level; ++input_level) {
+ // `TEST_CompactRange(input_level, ...)` compacts from `input_level` to
+ // `input_level + 1`.
+ ASSERT_OK(dbfull()->TEST_CompactRange(input_level, nullptr, nullptr));
+ }
+ }
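+  // Each iteration above wrote a disjoint key range and compacted it down to
+  // `output_level`, starting with the deepest level, so L0, L1, and L2 each
+  // hold one batch of keys and every Get below hits exactly one level.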
+ assert(key == kNumKeysPerDb);
+
+ ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L0));
+ ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L1));
+ ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L2_AND_UP));
+
+ for (int i = 0; i < kNumKeysPerDb; i++) {
+ ASSERT_EQ(Get(Key(i)), "val");
+ }
+
+ ASSERT_EQ(kNumKeysPerLevel, TestGetTickerCount(options, GET_HIT_L0));
+ ASSERT_EQ(kNumKeysPerLevel, TestGetTickerCount(options, GET_HIT_L1));
+ ASSERT_EQ(kNumKeysPerLevel, TestGetTickerCount(options, GET_HIT_L2_AND_UP));
+
+ ASSERT_EQ(kNumKeysPerDb, TestGetTickerCount(options, GET_HIT_L0) +
+ TestGetTickerCount(options, GET_HIT_L1) +
+ TestGetTickerCount(options, GET_HIT_L2_AND_UP));
+}
+
+TEST_F(DBTest, EncodeDecompressedBlockSizeTest) {
+ // iter 0 -- zlib
+ // iter 1 -- bzip2
+ // iter 2 -- lz4
+ // iter 3 -- lz4HC
+ // iter 4 -- xpress
+ CompressionType compressions[] = {kZlibCompression, kBZip2Compression,
+ kLZ4Compression, kLZ4HCCompression,
+ kXpressCompression};
+ for (auto comp : compressions) {
+ if (!CompressionTypeSupported(comp)) {
+ continue;
+ }
+ // first_table_version 1 -- generate with table_version == 1, read with
+ // table_version == 2
+ // first_table_version 2 -- generate with table_version == 2, read with
+ // table_version == 1
+ for (int first_table_version = 1; first_table_version <= 2;
+ ++first_table_version) {
+ BlockBasedTableOptions table_options;
+ table_options.format_version = first_table_version;
+ table_options.filter_policy.reset(NewBloomFilterPolicy(10));
+ Options options = CurrentOptions();
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ options.create_if_missing = true;
+ options.compression = comp;
+ DestroyAndReopen(options);
+
+ int kNumKeysWritten = 1000;
+
+ Random rnd(301);
+ for (int i = 0; i < kNumKeysWritten; ++i) {
+ // compressible string
+ ASSERT_OK(Put(Key(i), rnd.RandomString(128) + std::string(128, 'a')));
+ }
+
+ table_options.format_version = first_table_version == 1 ? 2 : 1;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ Reopen(options);
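+      // Reads should still succeed after reopening with the other
+      // format_version; only the deterministic 'a' suffix is checked because
+      // the 128-byte prefix is random.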
+ for (int i = 0; i < kNumKeysWritten; ++i) {
+ auto r = Get(Key(i));
+ ASSERT_EQ(r.substr(128), std::string(128, 'a'));
+ }
+ }
+ }
+}
+
+TEST_F(DBTest, CloseSpeedup) {
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleLevel;
+ options.write_buffer_size = 110 << 10; // 110KB
+ options.arena_block_size = 4 << 10;
+ options.level0_file_num_compaction_trigger = 2;
+ options.num_levels = 4;
+ options.max_bytes_for_level_base = 400 * 1024;
+ options.max_write_buffer_number = 16;
+
+ // Block background threads
+ env_->SetBackgroundThreads(1, Env::LOW);
+ env_->SetBackgroundThreads(1, Env::HIGH);
+ test::SleepingBackgroundTask sleeping_task_low;
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+ Env::Priority::LOW);
+ test::SleepingBackgroundTask sleeping_task_high;
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
+ &sleeping_task_high, Env::Priority::HIGH);
+
+ std::vector<std::string> filenames;
+ ASSERT_OK(env_->GetChildren(dbname_, &filenames));
+  // On Windows, the LOCK file cannot be deleted because it is locked by
+  // db_test; after db_test closes the DB, the LOCK file is unlocked and can
+  // be deleted.
+  // Delete archival files.
+ bool deleteDir = true;
+ for (size_t i = 0; i < filenames.size(); ++i) {
+ Status s = env_->DeleteFile(dbname_ + "/" + filenames[i]);
+ if (!s.ok()) {
+ deleteDir = false;
+ }
+ }
+ if (deleteDir) {
+ ASSERT_OK(env_->DeleteDir(dbname_));
+ }
+ DestroyAndReopen(options);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ env_->SetBackgroundThreads(1, Env::LOW);
+ env_->SetBackgroundThreads(1, Env::HIGH);
+ Random rnd(301);
+ int key_idx = 0;
+
+ // First three 110KB files are not going to level 2
+ // After that, (100K, 200K)
+ for (int num = 0; num < 5; num++) {
+ GenerateNewFile(&rnd, &key_idx, true);
+ }
+
+ ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+ Close();
+ ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+ // Unblock background threads
+ sleeping_task_high.WakeUp();
+ sleeping_task_high.WaitUntilDone();
+ sleeping_task_low.WakeUp();
+ sleeping_task_low.WaitUntilDone();
+
+ Destroy(options);
+}
+
+class DelayedMergeOperator : public MergeOperator {
+ private:
+ DBTest* db_test_;
+
+ public:
+ explicit DelayedMergeOperator(DBTest* d) : db_test_(d) {}
+
+ bool FullMergeV2(const MergeOperationInput& merge_in,
+ MergeOperationOutput* merge_out) const override {
+ db_test_->env_->MockSleepForMicroseconds(1000 *
+ merge_in.operand_list.size());
+ merge_out->new_value = "";
+ return true;
+ }
+
+ const char* Name() const override { return "DelayedMergeOperator"; }
+};
+
+TEST_F(DBTest, MergeTestTime) {
+ std::string one, two, three;
+ PutFixed64(&one, 1);
+ PutFixed64(&two, 2);
+ PutFixed64(&three, 3);
+
+ // Enable time profiling
+ SetPerfLevel(kEnableTime);
+ Options options = CurrentOptions();
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ options.merge_operator.reset(new DelayedMergeOperator(this));
+ SetTimeElapseOnlySleepOnReopen(&options);
+ DestroyAndReopen(options);
+
+ // NOTE: Presumed unnecessary and removed: resetting mock time in env
+
+ ASSERT_EQ(TestGetTickerCount(options, MERGE_OPERATION_TOTAL_TIME), 0);
+ ASSERT_OK(db_->Put(WriteOptions(), "foo", one));
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->Merge(WriteOptions(), "foo", two));
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->Merge(WriteOptions(), "foo", three));
+ ASSERT_OK(Flush());
+
+ ReadOptions opt;
+ opt.verify_checksums = true;
+ opt.snapshot = nullptr;
+ std::string result;
+ ASSERT_OK(db_->Get(opt, "foo", &result));
+
+ ASSERT_EQ(2000000, TestGetTickerCount(options, MERGE_OPERATION_TOTAL_TIME));
+
+ ReadOptions read_options;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ int count = 0;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ ASSERT_OK(iter->status());
+ ++count;
+ }
+
+ ASSERT_EQ(1, count);
+ ASSERT_EQ(4000000, TestGetTickerCount(options, MERGE_OPERATION_TOTAL_TIME));
+#ifdef ROCKSDB_USING_THREAD_STATUS
+ ASSERT_GT(TestGetTickerCount(options, FLUSH_WRITE_BYTES), 0);
+#endif // ROCKSDB_USING_THREAD_STATUS
+}
+
+#ifndef ROCKSDB_LITE
+TEST_P(DBTestWithParam, MergeCompactionTimeTest) {
+ SetPerfLevel(kEnableTime);
+ Options options = CurrentOptions();
+ options.compaction_filter_factory = std::make_shared<KeepFilterFactory>();
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ options.merge_operator.reset(new DelayedMergeOperator(this));
+ options.disable_auto_compactions = true;
+ options.max_subcompactions = max_subcompactions_;
+ SetTimeElapseOnlySleepOnReopen(&options);
+ DestroyAndReopen(options);
+
+ constexpr unsigned n = 1000;
+ for (unsigned i = 0; i < n; i++) {
+ ASSERT_OK(db_->Merge(WriteOptions(), "foo", "TEST"));
+ ASSERT_OK(Flush());
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+
+ CompactRangeOptions cro;
+ cro.exclusive_manual_compaction = exclusive_manual_compaction_;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+ ASSERT_EQ(uint64_t{n} * 1000000U,
+ TestGetTickerCount(options, MERGE_OPERATION_TOTAL_TIME));
+}
+
+TEST_P(DBTestWithParam, FilterCompactionTimeTest) {
+ Options options = CurrentOptions();
+ options.compaction_filter_factory =
+ std::make_shared<DelayFilterFactory>(this);
+ options.disable_auto_compactions = true;
+ options.create_if_missing = true;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ options.statistics->set_stats_level(kExceptTimeForMutex);
+ options.max_subcompactions = max_subcompactions_;
+ SetTimeElapseOnlySleepOnReopen(&options);
+ DestroyAndReopen(options);
+
+ unsigned n = 0;
+ // put some data
+ for (int table = 0; table < 4; ++table) {
+ for (int i = 0; i < 10 + table; ++i) {
+ ASSERT_OK(Put(std::to_string(table * 100 + i), "val"));
+ ++n;
+ }
+ ASSERT_OK(Flush());
+ }
+
+ CompactRangeOptions cro;
+ cro.exclusive_manual_compaction = exclusive_manual_compaction_;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ ASSERT_EQ(0U, CountLiveFiles());
+
+ Reopen(options);
+
+ Iterator* itr = db_->NewIterator(ReadOptions());
+ itr->SeekToFirst();
+ ASSERT_OK(itr->status());
+ ASSERT_EQ(uint64_t{n} * 1000000U,
+ TestGetTickerCount(options, FILTER_OPERATION_TOTAL_TIME));
+ delete itr;
+}
+#endif // ROCKSDB_LITE
+
+TEST_F(DBTest, TestLogCleanup) {
+ Options options = CurrentOptions();
+ options.write_buffer_size = 64 * 1024; // very small
+ // only two memtables allowed ==> only two log files
+ options.max_write_buffer_number = 2;
+ Reopen(options);
+
+ for (int i = 0; i < 100000; ++i) {
+ ASSERT_OK(Put(Key(i), "val"));
+    // Only 2 memtables will be alive at a time, so the number of logs to free
+    // should never exceed 2.
+ ASSERT_LT(dbfull()->TEST_LogsToFreeSize(), static_cast<size_t>(3));
+ }
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest, EmptyCompactedDB) {
+ Options options = CurrentOptions();
+ options.max_open_files = -1;
+ Close();
+ ASSERT_OK(ReadOnlyReopen(options));
+ Status s = Put("new", "value");
+ ASSERT_TRUE(s.IsNotSupported());
+ Close();
+}
+#endif // ROCKSDB_LITE
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest, SuggestCompactRangeTest) {
+ class CompactionFilterFactoryGetContext : public CompactionFilterFactory {
+ public:
+ std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+ const CompactionFilter::Context& context) override {
+ saved_context = context;
+ std::unique_ptr<CompactionFilter> empty_filter;
+ return empty_filter;
+ }
+ const char* Name() const override {
+ return "CompactionFilterFactoryGetContext";
+ }
+ static bool IsManual(CompactionFilterFactory* compaction_filter_factory) {
+ return reinterpret_cast<CompactionFilterFactoryGetContext*>(
+ compaction_filter_factory)
+ ->saved_context.is_manual_compaction;
+ }
+ CompactionFilter::Context saved_context;
+ };
+
+ Options options = CurrentOptions();
+ options.memtable_factory.reset(test::NewSpecialSkipListFactory(
+ DBTestBase::kNumKeysByGenerateNewRandomFile));
+ options.compaction_style = kCompactionStyleLevel;
+ options.compaction_filter_factory.reset(
+ new CompactionFilterFactoryGetContext());
+ options.write_buffer_size = 200 << 10;
+ options.arena_block_size = 4 << 10;
+ options.level0_file_num_compaction_trigger = 4;
+ options.num_levels = 4;
+ options.compression = kNoCompression;
+ options.max_bytes_for_level_base = 450 << 10;
+ options.target_file_size_base = 98 << 10;
+ options.max_compaction_bytes = static_cast<uint64_t>(1) << 60; // inf
+
+ Reopen(options);
+
+ Random rnd(301);
+
+ for (int num = 0; num < 10; num++) {
+ GenerateNewRandomFile(&rnd);
+ }
+
+ ASSERT_TRUE(!CompactionFilterFactoryGetContext::IsManual(
+ options.compaction_filter_factory.get()));
+
+  // make sure either L0 or L1 has a file
+ while (NumTableFilesAtLevel(0) == 0 && NumTableFilesAtLevel(1) == 0) {
+ GenerateNewRandomFile(&rnd);
+ }
+
+ // compact it three times
+ for (int i = 0; i < 3; ++i) {
+ ASSERT_OK(experimental::SuggestCompactRange(db_, nullptr, nullptr));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+
+ // All files are compacted
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+ ASSERT_EQ(0, NumTableFilesAtLevel(1));
+
+ GenerateNewRandomFile(&rnd);
+ ASSERT_EQ(1, NumTableFilesAtLevel(0));
+
+ // nonoverlapping with the file on level 0
+ Slice start("a"), end("b");
+ ASSERT_OK(experimental::SuggestCompactRange(db_, &start, &end));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ // should not compact the level 0 file
+ ASSERT_EQ(1, NumTableFilesAtLevel(0));
+
+ start = Slice("j");
+ end = Slice("m");
+ ASSERT_OK(experimental::SuggestCompactRange(db_, &start, &end));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ // SuggestCompactRange() is not going to be reported as manual compaction
+ ASSERT_TRUE(!CompactionFilterFactoryGetContext::IsManual(
+ options.compaction_filter_factory.get()));
+
+  // Now it should compact the level 0 file; since that is a trivial move to
+  // L1, it triggers another compaction down to L2.
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+ ASSERT_EQ(0, NumTableFilesAtLevel(1));
+}
+
+TEST_F(DBTest, SuggestCompactRangeUniversal) {
+ Options options = CurrentOptions();
+ options.memtable_factory.reset(test::NewSpecialSkipListFactory(
+ DBTestBase::kNumKeysByGenerateNewRandomFile));
+ options.compaction_style = kCompactionStyleUniversal;
+ options.write_buffer_size = 200 << 10;
+ options.arena_block_size = 4 << 10;
+ options.level0_file_num_compaction_trigger = 4;
+ options.num_levels = 4;
+ options.compression = kNoCompression;
+ options.max_bytes_for_level_base = 450 << 10;
+ options.target_file_size_base = 98 << 10;
+ options.max_compaction_bytes = static_cast<uint64_t>(1) << 60; // inf
+
+ Reopen(options);
+
+ Random rnd(301);
+
+ for (int num = 0; num < 10; num++) {
+ GenerateNewRandomFile(&rnd);
+ }
+
+ ASSERT_EQ("1,2,3,4", FilesPerLevel());
+ for (int i = 0; i < 3; i++) {
+ ASSERT_OK(
+ db_->SuggestCompactRange(db_->DefaultColumnFamily(), nullptr, nullptr));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+
+ // All files are compacted
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+ ASSERT_EQ(0, NumTableFilesAtLevel(1));
+ ASSERT_EQ(0, NumTableFilesAtLevel(2));
+
+ GenerateNewRandomFile(&rnd);
+ ASSERT_EQ(1, NumTableFilesAtLevel(0));
+
+ // nonoverlapping with the file on level 0
+ Slice start("a"), end("b");
+ ASSERT_OK(experimental::SuggestCompactRange(db_, &start, &end));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ // should not compact the level 0 file
+ ASSERT_EQ(1, NumTableFilesAtLevel(0));
+
+ start = Slice("j");
+ end = Slice("m");
+ ASSERT_OK(experimental::SuggestCompactRange(db_, &start, &end));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ // now it should compact the level 0 file to the last level
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+ ASSERT_EQ(0, NumTableFilesAtLevel(1));
+}
+
+TEST_F(DBTest, PromoteL0) {
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.write_buffer_size = 10 * 1024 * 1024;
+ DestroyAndReopen(options);
+
+  // non-overlapping ranges
+ std::vector<std::pair<int32_t, int32_t>> ranges = {
+ {81, 160}, {0, 80}, {161, 240}, {241, 320}};
+
+ int32_t value_size = 10 * 1024; // 10 KB
+
+ Random rnd(301);
+ std::map<int32_t, std::string> values;
+ for (const auto& range : ranges) {
+ for (int32_t j = range.first; j < range.second; j++) {
+ values[j] = rnd.RandomString(value_size);
+ ASSERT_OK(Put(Key(j), values[j]));
+ }
+ ASSERT_OK(Flush());
+ }
+
+ int32_t level0_files = NumTableFilesAtLevel(0, 0);
+ ASSERT_EQ(level0_files, ranges.size());
+ ASSERT_EQ(NumTableFilesAtLevel(1, 0), 0); // No files in L1
+
+ // Promote L0 level to L2.
+ ASSERT_OK(experimental::PromoteL0(db_, db_->DefaultColumnFamily(), 2));
+ // We expect that all the files were trivially moved from L0 to L2
+ ASSERT_EQ(NumTableFilesAtLevel(0, 0), 0);
+ ASSERT_EQ(NumTableFilesAtLevel(2, 0), level0_files);
+
+ for (const auto& kv : values) {
+ ASSERT_EQ(Get(Key(kv.first)), kv.second);
+ }
+}
+
+TEST_F(DBTest, PromoteL0Failure) {
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.write_buffer_size = 10 * 1024 * 1024;
+ DestroyAndReopen(options);
+
+ // Produce two L0 files with overlapping ranges.
+ ASSERT_OK(Put(Key(0), ""));
+ ASSERT_OK(Put(Key(3), ""));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put(Key(1), ""));
+ ASSERT_OK(Flush());
+
+ Status status;
+ // Fails because L0 has overlapping files.
+ status = experimental::PromoteL0(db_, db_->DefaultColumnFamily());
+ ASSERT_TRUE(status.IsInvalidArgument());
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ // Now there is a file in L1.
+ ASSERT_GE(NumTableFilesAtLevel(1, 0), 1);
+
+ ASSERT_OK(Put(Key(5), ""));
+ ASSERT_OK(Flush());
+ // Fails because L1 is non-empty.
+ status = experimental::PromoteL0(db_, db_->DefaultColumnFamily());
+ ASSERT_TRUE(status.IsInvalidArgument());
+}
+
+// Github issue #596
+TEST_F(DBTest, CompactRangeWithEmptyBottomLevel) {
+ const int kNumLevels = 2;
+ const int kNumL0Files = 2;
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.num_levels = kNumLevels;
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ for (int i = 0; i < kNumL0Files; ++i) {
+ ASSERT_OK(Put(Key(0), rnd.RandomString(1024)));
+ ASSERT_OK(Flush());
+ }
+ ASSERT_EQ(NumTableFilesAtLevel(0), kNumL0Files);
+ ASSERT_EQ(NumTableFilesAtLevel(1), 0);
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+ ASSERT_EQ(NumTableFilesAtLevel(1), kNumL0Files);
+}
+#endif // ROCKSDB_LITE
+
+TEST_F(DBTest, AutomaticConflictsWithManualCompaction) {
+ const int kNumL0Files = 50;
+ Options options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = 4;
+ // never slowdown / stop
+ options.level0_slowdown_writes_trigger = 999999;
+ options.level0_stop_writes_trigger = 999999;
+ options.max_background_compactions = 10;
+ DestroyAndReopen(options);
+
+ // schedule automatic compactions after the manual one starts, but before it
+ // finishes to ensure conflict.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::BackgroundCompaction:Start",
+ "DBTest::AutomaticConflictsWithManualCompaction:PrePuts"},
+ {"DBTest::AutomaticConflictsWithManualCompaction:PostPuts",
+ "DBImpl::BackgroundCompaction:NonTrivial:AfterRun"}});
+ std::atomic<int> callback_count(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::MaybeScheduleFlushOrCompaction:Conflict",
+ [&](void* /*arg*/) { callback_count.fetch_add(1); });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Random rnd(301);
+ for (int i = 0; i < 2; ++i) {
+ // put two keys to ensure no trivial move
+ for (int j = 0; j < 2; ++j) {
+ ASSERT_OK(Put(Key(j), rnd.RandomString(1024)));
+ }
+ ASSERT_OK(Flush());
+ }
+ port::Thread manual_compaction_thread([this]() {
+ CompactRangeOptions croptions;
+ croptions.exclusive_manual_compaction = true;
+ ASSERT_OK(db_->CompactRange(croptions, nullptr, nullptr));
+ });
+
+ TEST_SYNC_POINT("DBTest::AutomaticConflictsWithManualCompaction:PrePuts");
+ for (int i = 0; i < kNumL0Files; ++i) {
+ // put two keys to ensure no trivial move
+ for (int j = 0; j < 2; ++j) {
+ ASSERT_OK(Put(Key(j), rnd.RandomString(1024)));
+ }
+ ASSERT_OK(Flush());
+ }
+ TEST_SYNC_POINT("DBTest::AutomaticConflictsWithManualCompaction:PostPuts");
+
+ ASSERT_GE(callback_count.load(), 1);
+ for (int i = 0; i < 2; ++i) {
+ ASSERT_NE("NOT_FOUND", Get(Key(i)));
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ manual_compaction_thread.join();
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest, CompactFilesShouldTriggerAutoCompaction) {
+ Options options = CurrentOptions();
+ options.max_background_compactions = 1;
+ options.level0_file_num_compaction_trigger = 4;
+ options.level0_slowdown_writes_trigger = 36;
+ options.level0_stop_writes_trigger = 36;
+ DestroyAndReopen(options);
+
+ // generate files for manual compaction
+ Random rnd(301);
+ for (int i = 0; i < 2; ++i) {
+ // put two keys to ensure no trivial move
+ for (int j = 0; j < 2; ++j) {
+ ASSERT_OK(Put(Key(j), rnd.RandomString(1024)));
+ }
+ ASSERT_OK(Flush());
+ }
+
+ ROCKSDB_NAMESPACE::ColumnFamilyMetaData cf_meta_data;
+ db_->GetColumnFamilyMetaData(db_->DefaultColumnFamily(), &cf_meta_data);
+
+ std::vector<std::string> input_files;
+ input_files.push_back(cf_meta_data.levels[0].files[0].name);
+
+ SyncPoint::GetInstance()->LoadDependency({
+ {"CompactFilesImpl:0",
+ "DBTest::CompactFilesShouldTriggerAutoCompaction:Begin"},
+ {"DBTest::CompactFilesShouldTriggerAutoCompaction:End",
+ "CompactFilesImpl:1"},
+ });
+
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ port::Thread manual_compaction_thread([&]() {
+ auto s = db_->CompactFiles(CompactionOptions(), db_->DefaultColumnFamily(),
+ input_files, 0);
+ ASSERT_OK(s);
+ });
+
+ TEST_SYNC_POINT("DBTest::CompactFilesShouldTriggerAutoCompaction:Begin");
+ // generate enough files to trigger compaction
+ for (int i = 0; i < 20; ++i) {
+ for (int j = 0; j < 2; ++j) {
+ ASSERT_OK(Put(Key(j), rnd.RandomString(1024)));
+ }
+ ASSERT_OK(Flush());
+ }
+ db_->GetColumnFamilyMetaData(db_->DefaultColumnFamily(), &cf_meta_data);
+ ASSERT_GT(cf_meta_data.levels[0].files.size(),
+ options.level0_file_num_compaction_trigger);
+ TEST_SYNC_POINT("DBTest::CompactFilesShouldTriggerAutoCompaction:End");
+
+ manual_compaction_thread.join();
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ db_->GetColumnFamilyMetaData(db_->DefaultColumnFamily(), &cf_meta_data);
+ ASSERT_LE(cf_meta_data.levels[0].files.size(),
+ options.level0_file_num_compaction_trigger);
+}
+#endif // ROCKSDB_LITE
+
+// Github issue #595
+// Large write batch with column families
+TEST_F(DBTest, LargeBatchWithColumnFamilies) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.write_buffer_size = 100000; // Small write buffer
+ CreateAndReopenWithCF({"pikachu"}, options);
+ int64_t j = 0;
+ for (int i = 0; i < 5; i++) {
+ for (int pass = 1; pass <= 3; pass++) {
+ WriteBatch batch;
+ size_t write_size = 1024 * 1024 * (5 + i);
+ fprintf(stderr, "prepare: %" ROCKSDB_PRIszt " MB, pass:%d\n",
+ (write_size / 1024 / 1024), pass);
+ for (;;) {
+ std::string data(3000, j++ % 127 + 20);
+ data += std::to_string(j);
+ ASSERT_OK(batch.Put(handles_[0], Slice(data), Slice(data)));
+ if (batch.GetDataSize() > write_size) {
+ break;
+ }
+ }
+ fprintf(stderr, "write: %" ROCKSDB_PRIszt " MB\n",
+ (batch.GetDataSize() / 1024 / 1024));
+ ASSERT_OK(dbfull()->Write(WriteOptions(), &batch));
+ fprintf(stderr, "done\n");
+ }
+ }
+ // make sure we can re-open it.
+ ASSERT_OK(TryReopenWithColumnFamilies({"default", "pikachu"}, options));
+}
+
+// Make sure that Flushes can proceed in parallel with CompactRange()
+TEST_F(DBTest, FlushesInParallelWithCompactRange) {
+ // iter == 0 -- leveled
+ // iter == 1 -- leveled, but throw in a flush between two levels compacting
+ // iter == 2 -- universal
+ for (int iter = 0; iter < 3; ++iter) {
+ Options options = CurrentOptions();
+ if (iter < 2) {
+ options.compaction_style = kCompactionStyleLevel;
+ } else {
+ options.compaction_style = kCompactionStyleUniversal;
+ }
+ options.write_buffer_size = 110 << 10;
+ options.level0_file_num_compaction_trigger = 4;
+ options.num_levels = 4;
+ options.compression = kNoCompression;
+ options.max_bytes_for_level_base = 450 << 10;
+ options.target_file_size_base = 98 << 10;
+ options.max_write_buffer_number = 2;
+
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ for (int num = 0; num < 14; num++) {
+ GenerateNewRandomFile(&rnd);
+ }
+
+ if (iter == 1) {
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::RunManualCompaction()::1",
+ "DBTest::FlushesInParallelWithCompactRange:1"},
+ {"DBTest::FlushesInParallelWithCompactRange:2",
+ "DBImpl::RunManualCompaction()::2"}});
+ } else {
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"CompactionJob::Run():Start",
+ "DBTest::FlushesInParallelWithCompactRange:1"},
+ {"DBTest::FlushesInParallelWithCompactRange:2",
+ "CompactionJob::Run():End"}});
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ std::vector<port::Thread> threads;
+ threads.emplace_back([&]() { Compact("a", "z"); });
+
+ TEST_SYNC_POINT("DBTest::FlushesInParallelWithCompactRange:1");
+
+    // This has to start a flush. If flushes are blocked, this will try to
+    // create 3 memtables, and that will fail because max_write_buffer_number
+    // is 2.
+ for (int num = 0; num < 3; num++) {
+ GenerateNewRandomFile(&rnd, /* nowait */ true);
+ }
+
+ TEST_SYNC_POINT("DBTest::FlushesInParallelWithCompactRange:2");
+
+ for (auto& t : threads) {
+ t.join();
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ }
+}
+
+TEST_F(DBTest, DelayedWriteRate) {
+ const int kEntriesPerMemTable = 100;
+ const int kTotalFlushes = 12;
+
+ Options options = CurrentOptions();
+ env_->SetBackgroundThreads(1, Env::LOW);
+ options.env = env_;
+ options.write_buffer_size = 100000000;
+ options.max_write_buffer_number = 256;
+ options.max_background_compactions = 1;
+ options.level0_file_num_compaction_trigger = 3;
+ options.level0_slowdown_writes_trigger = 3;
+ options.level0_stop_writes_trigger = 999999;
+  options.delayed_write_rate = 20000000;  // Start with 20MB/s
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(kEntriesPerMemTable));
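+  // Note: the special skiplist memtable factory is expected to make each
+  // memtable fill up after kEntriesPerMemTable entries, so flushes happen at
+  // predictable points.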
+
+ SetTimeElapseOnlySleepOnReopen(&options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // Block compactions
+ test::SleepingBackgroundTask sleeping_task_low;
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+ Env::Priority::LOW);
+
+ for (int i = 0; i < 3; i++) {
+ ASSERT_OK(Put(Key(i), std::string(10000, 'x')));
+ ASSERT_OK(Flush());
+ }
+
+ // These writes will be slowed down to 1KB/s
+ uint64_t estimated_sleep_time = 0;
+ Random rnd(301);
+ ASSERT_OK(Put("", ""));
+ uint64_t cur_rate = options.delayed_write_rate;
+ for (int i = 0; i < kTotalFlushes; i++) {
+ uint64_t size_memtable = 0;
+ for (int j = 0; j < kEntriesPerMemTable; j++) {
+ auto rand_num = rnd.Uniform(20);
+      // Spread the entry sizes over a wider range.
+ size_t entry_size = rand_num * rand_num * rand_num;
+ WriteOptions wo;
+ ASSERT_OK(Put(Key(i), std::string(entry_size, 'x'), wo));
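+      // Track an estimate of bytes added to the memtable; the extra 18 bytes
+      // per entry is a rough allowance for the key and write overhead.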
+ size_memtable += entry_size + 18;
+ // Occasionally sleep a while
+ if (rnd.Uniform(20) == 6) {
+ env_->SleepForMicroseconds(2666);
+ }
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
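+    // Writing size_memtable bytes at cur_rate bytes/sec should translate into
+    // roughly size_memtable * 1e6 / cur_rate microseconds of write delay.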
+ estimated_sleep_time += size_memtable * 1000000u / cur_rate;
+    // The write rate is slowed down twice: once for the memtable switch and
+    // once when the flush finishes.
+ cur_rate = static_cast<uint64_t>(static_cast<double>(cur_rate) *
+ kIncSlowdownRatio * kIncSlowdownRatio);
+ }
+  // Check that the total sleep time falls into the estimated rough range.
+ ASSERT_GT(env_->NowMicros(), estimated_sleep_time / 2);
+ ASSERT_LT(env_->NowMicros(), estimated_sleep_time * 2);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ sleeping_task_low.WakeUp();
+ sleeping_task_low.WaitUntilDone();
+}
+
+TEST_F(DBTest, HardLimit) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ env_->SetBackgroundThreads(1, Env::LOW);
+ options.max_write_buffer_number = 256;
+ options.write_buffer_size = 110 << 10; // 110KB
+ options.arena_block_size = 4 * 1024;
+ options.level0_file_num_compaction_trigger = 4;
+ options.level0_slowdown_writes_trigger = 999999;
+ options.level0_stop_writes_trigger = 999999;
+ options.hard_pending_compaction_bytes_limit = 800 << 10;
+ options.max_bytes_for_level_base = 10000000000u;
+ options.max_background_compactions = 1;
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(KNumKeysByGenerateNewFile - 1));
+
+ env_->SetBackgroundThreads(1, Env::LOW);
+ test::SleepingBackgroundTask sleeping_task_low;
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+ Env::Priority::LOW);
+
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ std::atomic<int> callback_count(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::DelayWrite:Wait", [&](void* /*arg*/) {
+ callback_count.fetch_add(1);
+ sleeping_task_low.WakeUp();
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Random rnd(301);
+ int key_idx = 0;
+ for (int num = 0; num < 5; num++) {
+ GenerateNewFile(&rnd, &key_idx, true);
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ }
+
+ ASSERT_EQ(0, callback_count.load());
+
+ for (int num = 0; num < 5; num++) {
+ GenerateNewFile(&rnd, &key_idx, true);
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ }
+ ASSERT_GE(callback_count.load(), 1);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ sleeping_task_low.WaitUntilDone();
+}
+
+#if !defined(ROCKSDB_LITE) && !defined(ROCKSDB_DISABLE_STALL_NOTIFICATION)
+class WriteStallListener : public EventListener {
+ public:
+ WriteStallListener() : condition_(WriteStallCondition::kNormal) {}
+ void OnStallConditionsChanged(const WriteStallInfo& info) override {
+ MutexLock l(&mutex_);
+ condition_ = info.condition.cur;
+ }
+ bool CheckCondition(WriteStallCondition expected) {
+ MutexLock l(&mutex_);
+ return expected == condition_;
+ }
+
+ private:
+ port::Mutex mutex_;
+ WriteStallCondition condition_;
+};
+
+TEST_F(DBTest, SoftLimit) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.write_buffer_size = 100000; // Small write buffer
+ options.max_write_buffer_number = 256;
+ options.level0_file_num_compaction_trigger = 1;
+ options.level0_slowdown_writes_trigger = 3;
+ options.level0_stop_writes_trigger = 999999;
+ options.delayed_write_rate = 20000; // About 200KB/s limited rate
+ options.soft_pending_compaction_bytes_limit = 160000;
+ options.target_file_size_base = 99999999; // All into one file
+ options.max_bytes_for_level_base = 50000;
+ options.max_bytes_for_level_multiplier = 10;
+ options.max_background_compactions = 1;
+ options.compression = kNoCompression;
+ WriteStallListener* listener = new WriteStallListener();
+ options.listeners.emplace_back(listener);
+
+ // FlushMemtable with opt.wait=true does not wait for
+ // `OnStallConditionsChanged` to be called. The event listener is triggered
+ // on `JobContext::Clean`, which happens after the flush result is installed.
+ // We use a sync point to build a custom WaitForFlush that waits for the
+ // context cleanup.
+ port::Mutex flush_mutex;
+ port::CondVar flush_cv(&flush_mutex);
+ bool flush_finished = false;
+ auto InstallFlushCallback = [&]() {
+ {
+ MutexLock l(&flush_mutex);
+ flush_finished = false;
+ }
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCallFlush:ContextCleanedUp", [&](void*) {
+ {
+ MutexLock l(&flush_mutex);
+ flush_finished = true;
+ }
+ flush_cv.SignalAll();
+ });
+ };
+ auto WaitForFlush = [&]() {
+ {
+ MutexLock l(&flush_mutex);
+ while (!flush_finished) {
+ flush_cv.Wait();
+ }
+ }
+ SyncPoint::GetInstance()->ClearCallBack(
+ "DBImpl::BackgroundCallFlush:ContextCleanedUp");
+ };
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Reopen(options);
+
+ // Generating 360KB in Level 3
+ for (int i = 0; i < 72; i++) {
+ ASSERT_OK(Put(Key(i), std::string(5000, 'x')));
+ if (i % 10 == 0) {
+ ASSERT_OK(dbfull()->TEST_FlushMemTable(true, true));
+ }
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ MoveFilesToLevel(3);
+
+ // Generating 360KB in Level 2
+ for (int i = 0; i < 72; i++) {
+ ASSERT_OK(Put(Key(i), std::string(5000, 'x')));
+ if (i % 10 == 0) {
+ ASSERT_OK(dbfull()->TEST_FlushMemTable(true, true));
+ }
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ MoveFilesToLevel(2);
+
+ ASSERT_OK(Put(Key(0), ""));
+
+ test::SleepingBackgroundTask sleeping_task_low;
+ // Block compactions
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+ Env::Priority::LOW);
+ sleeping_task_low.WaitUntilSleeping();
+
+ // Create 3 L0 files, making score of L0 to be 3.
+ for (int i = 0; i < 3; i++) {
+ ASSERT_OK(Put(Key(i), std::string(5000, 'x')));
+ ASSERT_OK(Put(Key(100 - i), std::string(5000, 'x')));
+ // Flush the file. File size is around 30KB.
+ InstallFlushCallback();
+ ASSERT_OK(dbfull()->TEST_FlushMemTable(true, true));
+ WaitForFlush();
+ }
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_TRUE(listener->CheckCondition(WriteStallCondition::kDelayed));
+
+ sleeping_task_low.WakeUp();
+ sleeping_task_low.WaitUntilDone();
+ sleeping_task_low.Reset();
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ // Now there is one L1 file, but it doesn't trigger soft_rate_limit
+ //
+ // TODO: soft_rate_limit is deprecated. If this test
+ // relies on soft_rate_limit, then we need to change the test.
+ //
+ // The L1 file size is around 30KB.
+ ASSERT_EQ(NumTableFilesAtLevel(1), 1);
+ ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_TRUE(listener->CheckCondition(WriteStallCondition::kNormal));
+
+ // Only allow one compaction to go through.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "BackgroundCallCompaction:0", [&](void* /*arg*/) {
+ // Schedule a sleeping task.
+ sleeping_task_low.Reset();
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
+ &sleeping_task_low, Env::Priority::LOW);
+ });
+
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+ Env::Priority::LOW);
+ sleeping_task_low.WaitUntilSleeping();
+ // Create 3 L0 files, making score of L0 to be 3
+ for (int i = 0; i < 3; i++) {
+ ASSERT_OK(Put(Key(10 + i), std::string(5000, 'x')));
+ ASSERT_OK(Put(Key(90 - i), std::string(5000, 'x')));
+ // Flush the file. File size is around 30KB.
+ InstallFlushCallback();
+ ASSERT_OK(dbfull()->TEST_FlushMemTable(true, true));
+ WaitForFlush();
+ }
+
+ // Wake up the sleeping task so compaction can run, and wait for it to go
+ // back to sleep to make sure one compaction goes through.
+ sleeping_task_low.WakeUp();
+ sleeping_task_low.WaitUntilSleeping();
+
+ // Now there is one L1 file (around 60KB) which exceeds the 50KB base by
+ // 10KB. Given the level multiplier of 10, the estimated pending compaction
+ // is around 100KB, which doesn't trigger soft_pending_compaction_bytes_limit.
+ ASSERT_EQ(NumTableFilesAtLevel(1), 1);
+ ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_TRUE(listener->CheckCondition(WriteStallCondition::kNormal));
+
+ // Create 3 L0 files, making the score of L0 3, higher than L1's.
+ for (int i = 0; i < 3; i++) {
+ ASSERT_OK(Put(Key(20 + i), std::string(5000, 'x')));
+ ASSERT_OK(Put(Key(80 - i), std::string(5000, 'x')));
+ // Flush the file. File size is around 30KB.
+ InstallFlushCallback();
+ ASSERT_OK(dbfull()->TEST_FlushMemTable(true, true));
+ WaitForFlush();
+ }
+ // Wake up the sleeping task so compaction can run, and wait for it to go
+ // back to sleep to make sure one compaction goes through.
+ sleeping_task_low.WakeUp();
+ sleeping_task_low.WaitUntilSleeping();
+
+ // Now there is one L1 file (around 90KB) which exceeds the 50KB base by
+ // 40KB. L2 size is 360KB, so the estimated level fanout is 4 and the
+ // estimated pending compaction is around 200KB, triggering
+ // soft_pending_compaction_bytes_limit.
+ ASSERT_EQ(NumTableFilesAtLevel(1), 1);
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_TRUE(listener->CheckCondition(WriteStallCondition::kDelayed));
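+
+ // Rough arithmetic behind the two estimates above: with a 50KB
+ // max_bytes_for_level_base, a ~60KB L1 is ~10KB over target, and with a
+ // level multiplier of 10 the estimated pending compaction bytes are about
+ // 10KB * 10 = 100KB, below the 160000-byte soft limit. A ~90KB L1 is ~40KB
+ // over target and the observed L1->L2 fanout is 360KB / 90KB = 4, giving
+ // roughly 40KB * (1 + 4) = 200KB, which exceeds the soft limit and turns
+ // the delay on.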
+
+ sleeping_task_low.WakeUp();
+ sleeping_task_low.WaitUntilSleeping();
+
+ ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_TRUE(listener->CheckCondition(WriteStallCondition::kNormal));
+
+ // Shrink the level base so L2 will hit the soft limit more easily.
+ ASSERT_OK(dbfull()->SetOptions({
+ {"max_bytes_for_level_base", "5000"},
+ }));
+
+ ASSERT_OK(Put("", ""));
+ ASSERT_OK(Flush());
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_TRUE(listener->CheckCondition(WriteStallCondition::kDelayed));
+
+ sleeping_task_low.WaitUntilSleeping();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ sleeping_task_low.WakeUp();
+ sleeping_task_low.WaitUntilDone();
+}
+
+TEST_F(DBTest, LastWriteBufferDelay) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.write_buffer_size = 100000;
+ options.max_write_buffer_number = 4;
+ options.delayed_write_rate = 20000;
+ options.compression = kNoCompression;
+ options.disable_auto_compactions = true;
+ int kNumKeysPerMemtable = 3;
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(kNumKeysPerMemtable));
+
+ Reopen(options);
+ test::SleepingBackgroundTask sleeping_task;
+ // Block flushes
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task,
+ Env::Priority::HIGH);
+ sleeping_task.WaitUntilSleeping();
+
+ // Create 3 L0 files, making score of L0 to be 3.
+ for (int i = 0; i < 3; i++) {
+ // Fill one mem table
+ for (int j = 0; j < kNumKeysPerMemtable; j++) {
+ ASSERT_OK(Put(Key(j), ""));
+ }
+ ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay());
+ }
+ // Inserting a new entry would create a new mem table, triggering slow down.
+ ASSERT_OK(Put(Key(0), ""));
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+
+ sleeping_task.WakeUp();
+ sleeping_task.WaitUntilDone();
+}
+#endif // !defined(ROCKSDB_LITE) &&
+ // !defined(ROCKSDB_DISABLE_STALL_NOTIFICATION)
+
+TEST_F(DBTest, FailWhenCompressionNotSupportedTest) {
+ CompressionType compressions[] = {kZlibCompression, kBZip2Compression,
+ kLZ4Compression, kLZ4HCCompression,
+ kXpressCompression};
+ for (auto comp : compressions) {
+ if (!CompressionTypeSupported(comp)) {
+ // Not supported, so Open() should fail.
+ Options options = CurrentOptions();
+ options.compression = comp;
+ ASSERT_TRUE(!TryReopen(options).ok());
+ // Check that CreateColumnFamily also fails.
+ options.compression = kNoCompression;
+ ASSERT_OK(TryReopen(options));
+ ColumnFamilyOptions cf_options(options);
+ cf_options.compression = comp;
+ ColumnFamilyHandle* handle;
+ ASSERT_TRUE(!db_->CreateColumnFamily(cf_options, "name", &handle).ok());
+ }
+ }
+}
+
+TEST_F(DBTest, CreateColumnFamilyShouldFailOnIncompatibleOptions) {
+ Options options = CurrentOptions();
+ options.max_open_files = 100;
+ Reopen(options);
+
+ ColumnFamilyOptions cf_options(options);
+ // ttl is now supported even when max_open_files is not -1.
+ cf_options.ttl = 3600;
+ ColumnFamilyHandle* handle;
+ ASSERT_OK(db_->CreateColumnFamily(cf_options, "pikachu", &handle));
+ delete handle;
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest, RowCache) {
+ Options options = CurrentOptions();
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ options.row_cache = NewLRUCache(8192);
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Flush());
+
+ ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 0);
+ ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 0);
+ ASSERT_EQ(Get("foo"), "bar");
+ ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 0);
+ ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 1);
+ ASSERT_EQ(Get("foo"), "bar");
+ ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 1);
+ ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 1);
+}
+
+TEST_F(DBTest, PinnableSliceAndRowCache) {
+ Options options = CurrentOptions();
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ options.row_cache = NewLRUCache(8192);
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Flush());
+
+ ASSERT_EQ(Get("foo"), "bar");
+ ASSERT_EQ(
+ reinterpret_cast<LRUCache*>(options.row_cache.get())->TEST_GetLRUSize(),
+ 1);
+
+ {
+ PinnableSlice pin_slice;
+ ASSERT_EQ(Get("foo", &pin_slice), Status::OK());
+ ASSERT_EQ(pin_slice.ToString(), "bar");
+ // The entry is already in the cache; the lookup removes it from the LRU list.
+ ASSERT_EQ(
+ reinterpret_cast<LRUCache*>(options.row_cache.get())->TEST_GetLRUSize(),
+ 0);
+ }
+ // After the PinnableSlice is destroyed, the element is added back to the LRU list.
+ ASSERT_EQ(
+ reinterpret_cast<LRUCache*>(options.row_cache.get())->TEST_GetLRUSize(),
+ 1);
+}
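+
+// A minimal sketch (not exercised by the tests here) of the pinning pattern
+// the assertions above rely on: while a PinnableSlice returned by Get() is
+// alive, the row-cache entry it references stays pinned and off the LRU list;
+// Reset() or destruction unpins it. The helper name is illustrative only.
+namespace {
+[[maybe_unused]] void RowCachePinningSketch(DB* db) {
+ PinnableSlice value;
+ Status s = db->Get(ReadOptions(), db->DefaultColumnFamily(), "foo", &value);
+ if (s.ok()) {
+ // While `value` is alive, the cached row remains pinned.
+ value.Reset(); // Unpin explicitly; going out of scope also unpins.
+ }
+}
+} // anonymous namespace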
+
+TEST_F(DBTest, ReusePinnableSlice) {
+ Options options = CurrentOptions();
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ options.row_cache = NewLRUCache(8192);
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Flush());
+
+ ASSERT_EQ(Get("foo"), "bar");
+ ASSERT_EQ(
+ reinterpret_cast<LRUCache*>(options.row_cache.get())->TEST_GetLRUSize(),
+ 1);
+
+ {
+ PinnableSlice pin_slice;
+ ASSERT_EQ(Get("foo", &pin_slice), Status::OK());
+ ASSERT_EQ(Get("foo", &pin_slice), Status::OK());
+ ASSERT_EQ(pin_slice.ToString(), "bar");
+
+ // The entry is already in the cache; the lookup removes it from the LRU list.
+ ASSERT_EQ(
+ reinterpret_cast<LRUCache*>(options.row_cache.get())->TEST_GetLRUSize(),
+ 0);
+ }
+ // After the PinnableSlice is destroyed, the element is added back to the LRU list.
+ ASSERT_EQ(
+ reinterpret_cast<LRUCache*>(options.row_cache.get())->TEST_GetLRUSize(),
+ 1);
+
+ {
+ std::vector<Slice> multiget_keys;
+ multiget_keys.push_back("foo");
+ std::vector<PinnableSlice> multiget_values(1);
+ std::vector<Status> statuses({Status::NotFound()});
+ ReadOptions ropt;
+ dbfull()->MultiGet(ropt, dbfull()->DefaultColumnFamily(),
+ multiget_keys.size(), multiget_keys.data(),
+ multiget_values.data(), statuses.data());
+ ASSERT_EQ(Status::OK(), statuses[0]);
+ dbfull()->MultiGet(ropt, dbfull()->DefaultColumnFamily(),
+ multiget_keys.size(), multiget_keys.data(),
+ multiget_values.data(), statuses.data());
+ ASSERT_EQ(Status::OK(), statuses[0]);
+
+ // The entry is already in the cache; the lookup removes it from the LRU list.
+ ASSERT_EQ(
+ reinterpret_cast<LRUCache*>(options.row_cache.get())->TEST_GetLRUSize(),
+ 0);
+ }
+ // After the PinnableSlice is destroyed, the element is added back to the LRU list.
+ ASSERT_EQ(
+ reinterpret_cast<LRUCache*>(options.row_cache.get())->TEST_GetLRUSize(),
+ 1);
+
+ {
+ std::vector<ColumnFamilyHandle*> multiget_cfs;
+ multiget_cfs.push_back(dbfull()->DefaultColumnFamily());
+ std::vector<Slice> multiget_keys;
+ multiget_keys.push_back("foo");
+ std::vector<PinnableSlice> multiget_values(1);
+ std::vector<Status> statuses({Status::NotFound()});
+ ReadOptions ropt;
+ dbfull()->MultiGet(ropt, multiget_keys.size(), multiget_cfs.data(),
+ multiget_keys.data(), multiget_values.data(),
+ statuses.data());
+ ASSERT_EQ(Status::OK(), statuses[0]);
+ dbfull()->MultiGet(ropt, multiget_keys.size(), multiget_cfs.data(),
+ multiget_keys.data(), multiget_values.data(),
+ statuses.data());
+ ASSERT_EQ(Status::OK(), statuses[0]);
+
+ // The entry is already in the cache; the lookup removes it from the LRU list.
+ ASSERT_EQ(
+ reinterpret_cast<LRUCache*>(options.row_cache.get())->TEST_GetLRUSize(),
+ 0);
+ }
+ // After the PinnableSlice is destroyed, the element is added back to the LRU list.
+ ASSERT_EQ(
+ reinterpret_cast<LRUCache*>(options.row_cache.get())->TEST_GetLRUSize(),
+ 1);
+}
+
+#endif // ROCKSDB_LITE
+
+TEST_F(DBTest, DeletingOldWalAfterDrop) {
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"Test:AllowFlushes", "DBImpl::BGWorkFlush"},
+ {"DBImpl::BGWorkFlush:done", "Test:WaitForFlush"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearTrace();
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ Options options = CurrentOptions();
+ options.max_total_wal_size = 8192;
+ options.compression = kNoCompression;
+ options.write_buffer_size = 1 << 20;
+ options.level0_file_num_compaction_trigger = (1 << 30);
+ options.level0_slowdown_writes_trigger = (1 << 30);
+ options.level0_stop_writes_trigger = (1 << 30);
+ options.disable_auto_compactions = true;
+ DestroyAndReopen(options);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ CreateColumnFamilies({"cf1", "cf2"}, options);
+ ASSERT_OK(Put(0, "key1", DummyString(8192)));
+ ASSERT_OK(Put(0, "key2", DummyString(8192)));
+ // the oldest wal should now be getting_flushed
+ ASSERT_OK(db_->DropColumnFamily(handles_[0]));
+ // all flushes should now do nothing because their CF is dropped
+ TEST_SYNC_POINT("Test:AllowFlushes");
+ TEST_SYNC_POINT("Test:WaitForFlush");
+ uint64_t lognum1 = dbfull()->TEST_LogfileNumber();
+ ASSERT_OK(Put(1, "key3", DummyString(8192)));
+ ASSERT_OK(Put(1, "key4", DummyString(8192)));
+ // new wal should have been created
+ uint64_t lognum2 = dbfull()->TEST_LogfileNumber();
+ EXPECT_GT(lognum2, lognum1);
+}
+
+TEST_F(DBTest, UnsupportedManualSync) {
+ DestroyAndReopen(CurrentOptions());
+ env_->is_wal_sync_thread_safe_.store(false);
+ Status s = db_->SyncWAL();
+ ASSERT_TRUE(s.IsNotSupported());
+}
+
+INSTANTIATE_TEST_CASE_P(DBTestWithParam, DBTestWithParam,
+ ::testing::Combine(::testing::Values(1, 4),
+ ::testing::Bool()));
+
+TEST_F(DBTest, PauseBackgroundWorkTest) {
+ Options options = CurrentOptions();
+ options.write_buffer_size = 100000; // Small write buffer
+ Reopen(options);
+
+ std::vector<port::Thread> threads;
+ std::atomic<bool> done(false);
+ ASSERT_OK(db_->PauseBackgroundWork());
+ threads.emplace_back([&]() {
+ Random rnd(301);
+ for (int i = 0; i < 10000; ++i) {
+ ASSERT_OK(Put(rnd.RandomString(10), rnd.RandomString(10)));
+ }
+ done.store(true);
+ });
+ env_->SleepForMicroseconds(200000);
+ // make sure the thread is not done
+ ASSERT_FALSE(done.load());
+ ASSERT_OK(db_->ContinueBackgroundWork());
+ for (auto& t : threads) {
+ t.join();
+ }
+ // now it's done
+ ASSERT_TRUE(done.load());
+}
+
+// Keep spawning short-living threads that create an iterator and quit.
+// Meanwhile in another thread keep flushing memtables.
+// This used to cause a deadlock.
+TEST_F(DBTest, ThreadLocalPtrDeadlock) {
+ std::atomic<int> flushes_done{0};
+ std::atomic<int> threads_destroyed{0};
+ auto done = [&] { return flushes_done.load() > 10; };
+
+ port::Thread flushing_thread([&] {
+ for (int i = 0; !done(); ++i) {
+ ASSERT_OK(db_->Put(WriteOptions(), Slice("hi"),
+ Slice(std::to_string(i).c_str())));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ int cnt = ++flushes_done;
+ fprintf(stderr, "Flushed %d times\n", cnt);
+ }
+ });
+
+ std::vector<port::Thread> thread_spawning_threads(10);
+ for (auto& t : thread_spawning_threads) {
+ t = port::Thread([&] {
+ while (!done()) {
+ {
+ port::Thread tmp_thread([&] {
+ auto it = db_->NewIterator(ReadOptions());
+ ASSERT_OK(it->status());
+ delete it;
+ });
+ tmp_thread.join();
+ }
+ ++threads_destroyed;
+ }
+ });
+ }
+
+ for (auto& t : thread_spawning_threads) {
+ t.join();
+ }
+ flushing_thread.join();
+ fprintf(stderr, "Done. Flushed %d times, destroyed %d threads\n",
+ flushes_done.load(), threads_destroyed.load());
+}
+
+TEST_F(DBTest, LargeBlockSizeTest) {
+ Options options = CurrentOptions();
+ CreateAndReopenWithCF({"pikachu"}, options);
+ ASSERT_OK(Put(0, "foo", "bar"));
+ BlockBasedTableOptions table_options;
+ table_options.block_size = 8LL * 1024 * 1024 * 1024LL;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ ASSERT_NOK(TryReopenWithColumnFamilies({"default", "pikachu"}, options));
+}
+
+#ifndef ROCKSDB_LITE
+
+TEST_F(DBTest, CreationTimeOfOldestFile) {
+ const int kNumKeysPerFile = 32;
+ const int kNumLevelFiles = 2;
+ const int kValueSize = 100;
+
+ Options options = CurrentOptions();
+ options.max_open_files = -1;
+ env_->SetMockSleep();
+ options.env = env_;
+
+ // NOTE: Presumed unnecessary and removed: resetting mock time in env
+
+ DestroyAndReopen(options);
+
+ bool set_file_creation_time_to_zero = true;
+ int idx = 0;
+
+ int64_t time_1 = 0;
+ env_->GetCurrentTime(&time_1);
+ const uint64_t uint_time_1 = static_cast<uint64_t>(time_1);
+
+ // Add 50 hours
+ env_->MockSleepForSeconds(50 * 60 * 60);
+
+ int64_t time_2 = 0;
+ env_->GetCurrentTime(&time_2);
+ const uint64_t uint_time_2 = static_cast<uint64_t>(time_2);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "PropertyBlockBuilder::AddTableProperty:Start", [&](void* arg) {
+ TableProperties* props = reinterpret_cast<TableProperties*>(arg);
+ if (set_file_creation_time_to_zero) {
+ if (idx == 0) {
+ props->file_creation_time = 0;
+ idx++;
+ } else if (idx == 1) {
+ props->file_creation_time = uint_time_1;
+ idx = 0;
+ }
+ } else {
+ if (idx == 0) {
+ props->file_creation_time = uint_time_1;
+ idx++;
+ } else if (idx == 1) {
+ props->file_creation_time = uint_time_2;
+ }
+ }
+ });
+ // Set all file creation times in the manifest to 0.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "FileMetaData::FileMetaData", [&](void* arg) {
+ FileMetaData* meta = static_cast<FileMetaData*>(arg);
+ meta->file_creation_time = 0;
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Random rnd(301);
+ for (int i = 0; i < kNumLevelFiles; ++i) {
+ for (int j = 0; j < kNumKeysPerFile; ++j) {
+ ASSERT_OK(
+ Put(Key(i * kNumKeysPerFile + j), rnd.RandomString(kValueSize)));
+ }
+ ASSERT_OK(Flush());
+ }
+
+ // At this point there should be 2 files, one with file_creation_time = 0 and
+ // the other non-zero. GetCreationTimeOfOldestFile API should return 0.
+ uint64_t creation_time;
+ Status s1 = dbfull()->GetCreationTimeOfOldestFile(&creation_time);
+ ASSERT_EQ(0, creation_time);
+ ASSERT_EQ(s1, Status::OK());
+
+ // Testing with non-zero file creation time.
+ set_file_creation_time_to_zero = false;
+ options = CurrentOptions();
+ options.max_open_files = -1;
+ options.env = env_;
+
+ // NOTE: Presumed unnecessary and removed: resetting mock time in env
+
+ DestroyAndReopen(options);
+
+ for (int i = 0; i < kNumLevelFiles; ++i) {
+ for (int j = 0; j < kNumKeysPerFile; ++j) {
+ ASSERT_OK(
+ Put(Key(i * kNumKeysPerFile + j), rnd.RandomString(kValueSize)));
+ }
+ ASSERT_OK(Flush());
+ }
+
+ // At this point there should be 2 files with non-zero file creation time.
+ // GetCreationTimeOfOldestFile API should return non-zero value.
+ uint64_t ctime;
+ Status s2 = dbfull()->GetCreationTimeOfOldestFile(&ctime);
+ ASSERT_EQ(uint_time_1, ctime);
+ ASSERT_EQ(s2, Status::OK());
+
+ // Testing with max_open_files != -1
+ options = CurrentOptions();
+ options.max_open_files = 10;
+ DestroyAndReopen(options);
+ Status s3 = dbfull()->GetCreationTimeOfOldestFile(&ctime);
+ ASSERT_EQ(s3, Status::NotSupported());
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBTest, MemoryUsageWithMaxWriteBufferSizeToMaintain) {
+ Options options = CurrentOptions();
+ options.max_write_buffer_size_to_maintain = 10000;
+ options.write_buffer_size = 160000;
+ Reopen(options);
+ Random rnd(301);
+ bool memory_limit_exceeded = false;
+
+ ColumnFamilyData* cfd =
+ static_cast<ColumnFamilyHandleImpl*>(db_->DefaultColumnFamily())->cfd();
+
+ for (int i = 0; i < 1000; i++) {
+ std::string value = rnd.RandomString(1000);
+ ASSERT_OK(Put("keykey_" + std::to_string(i), value));
+
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+
+ const uint64_t cur_active_mem = cfd->mem()->ApproximateMemoryUsage();
+ const uint64_t size_all_mem_table =
+ cur_active_mem + cfd->imm()->ApproximateMemoryUsage();
+
+ // Errors out if memory usage keeps increasing beyond the limit.
+ // Once the memory limit is exceeded, memory_limit_exceeded is set, and if
+ // size_all_mem_table doesn't drop back down on the next write the test
+ // errors out (not the expected behaviour). If memory usage drops,
+ // memory_limit_exceeded is reset to false.
+ if ((size_all_mem_table > cur_active_mem) &&
+ (cur_active_mem >=
+ static_cast<uint64_t>(options.max_write_buffer_size_to_maintain)) &&
+ (size_all_mem_table >
+ static_cast<uint64_t>(options.max_write_buffer_size_to_maintain) +
+ options.write_buffer_size)) {
+ ASSERT_FALSE(memory_limit_exceeded);
+ memory_limit_exceeded = true;
+ } else {
+ memory_limit_exceeded = false;
+ }
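+ // With the options in this test the check above flags the case where an
+ // immutable memtable exists, the active memtable alone has reached
+ // max_write_buffer_size_to_maintain (10000 bytes), and the total memtable
+ // footprint exceeds 10000 + 160000 = 170000 bytes; hitting that state on
+ // two consecutive iterations fails the test.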
+ }
+}
+
+TEST_F(DBTest, ShuttingDownNotBlockStalledWrites) {
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ Reopen(options);
+ Random rnd(403);
+
+ for (int i = 0; i < 20; i++) {
+ ASSERT_OK(Put("key_" + std::to_string(i), rnd.RandomString(10)));
+ ASSERT_OK(Flush());
+ }
+ ASSERT_EQ(GetSstFileCount(dbname_), 20);
+
+ // We need !disable_auto_compactions for writes to stall, but we also want to
+ // delay compaction so that stalled writes are unblocked due to
+ // kShutdownInProgress. The BG compaction first waits for the sync point
+ // DBTest::ShuttingDownNotBlockStalledWrites, then waits an extra 2 sec to
+ // allow CancelAllBackgroundWork() to set shutting_down_.
+ SyncPoint::GetInstance()->SetCallBack(
+ "BackgroundCallCompaction:0",
+ [&](void* /* arg */) { env_->SleepForMicroseconds(2 * 1000 * 1000); });
+ SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::DelayWrite:Wait", "DBTest::ShuttingDownNotBlockStalledWrites"},
+ {"DBTest::ShuttingDownNotBlockStalledWrites",
+ "BackgroundCallCompaction:0"}});
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ options.level0_stop_writes_trigger = 20;
+ options.disable_auto_compactions = false;
+ Reopen(options);
+
+ std::thread thd([&]() {
+ Status s = Put("key_" + std::to_string(101), "101");
+ ASSERT_EQ(s.code(), Status::kShutdownInProgress);
+ });
+
+ TEST_SYNC_POINT("DBTest::ShuttingDownNotBlockStalledWrites");
+ CancelAllBackgroundWork(db_, true);
+
+ thd.join();
+}
+#endif
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ RegisterCustomObjects(argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_test2.cc b/src/rocksdb/db/db_test2.cc
new file mode 100644
index 000000000..8adde3680
--- /dev/null
+++ b/src/rocksdb/db/db_test2.cc
@@ -0,0 +1,7652 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <atomic>
+#include <cstdlib>
+#include <functional>
+#include <memory>
+
+#include "db/db_test_util.h"
+#include "db/read_callback.h"
+#include "options/options_helper.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "rocksdb/experimental.h"
+#include "rocksdb/iostats_context.h"
+#include "rocksdb/persistent_cache.h"
+#include "rocksdb/trace_record.h"
+#include "rocksdb/trace_record_result.h"
+#include "rocksdb/utilities/replayer.h"
+#include "rocksdb/wal_filter.h"
+#include "test_util/testutil.h"
+#include "util/random.h"
+#include "utilities/fault_injection_env.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBTest2 : public DBTestBase {
+ public:
+ DBTest2() : DBTestBase("db_test2", /*env_do_fsync=*/true) {}
+};
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest2, OpenForReadOnly) {
+ DB* db_ptr = nullptr;
+ std::string dbname = test::PerThreadDBPath("db_readonly");
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ // OpenForReadOnly should fail but will create <dbname> in the file system
+ ASSERT_NOK(DB::OpenForReadOnly(options, dbname, &db_ptr));
+ // Since <dbname> is created, we should be able to delete the dir
+ // We first get the list of files under <dbname>
+ // There should not be any subdirectories -- this is not checked here
+ std::vector<std::string> files;
+ ASSERT_OK(env_->GetChildren(dbname, &files));
+ for (auto& f : files) {
+ ASSERT_OK(env_->DeleteFile(dbname + "/" + f));
+ }
+ // <dbname> should be empty now and we should be able to delete it
+ ASSERT_OK(env_->DeleteDir(dbname));
+ options.create_if_missing = false;
+ // OpenForReadOnly should fail since <dbname> was successfully deleted
+ ASSERT_NOK(DB::OpenForReadOnly(options, dbname, &db_ptr));
+ // With create_if_missing false, there should not be a dir in the file system
+ ASSERT_NOK(env_->FileExists(dbname));
+}
+
+TEST_F(DBTest2, OpenForReadOnlyWithColumnFamilies) {
+ DB* db_ptr = nullptr;
+ std::string dbname = test::PerThreadDBPath("db_readonly");
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+
+ ColumnFamilyOptions cf_options(options);
+ std::vector<ColumnFamilyDescriptor> column_families;
+ column_families.push_back(
+ ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options));
+ column_families.push_back(ColumnFamilyDescriptor("goku", cf_options));
+ std::vector<ColumnFamilyHandle*> handles;
+ // OpenForReadOnly should fail but will create <dbname> in the file system
+ ASSERT_NOK(
+ DB::OpenForReadOnly(options, dbname, column_families, &handles, &db_ptr));
+ // Since <dbname> is created, we should be able to delete the dir
+ // We first get the list of files under <dbname>
+ // There should not be any subdirectories -- this is not checked here
+ std::vector<std::string> files;
+ ASSERT_OK(env_->GetChildren(dbname, &files));
+ for (auto& f : files) {
+ ASSERT_OK(env_->DeleteFile(dbname + "/" + f));
+ }
+ // <dbname> should be empty now and we should be able to delete it
+ ASSERT_OK(env_->DeleteDir(dbname));
+ options.create_if_missing = false;
+ // OpenForReadOnly should fail since <dbname> was successfully deleted
+ ASSERT_NOK(
+ DB::OpenForReadOnly(options, dbname, column_families, &handles, &db_ptr));
+ // With create_if_missing false, there should not be a dir in the file system
+ ASSERT_NOK(env_->FileExists(dbname));
+}
+
+class TestReadOnlyWithCompressedCache
+ : public DBTestBase,
+ public testing::WithParamInterface<std::tuple<int, bool>> {
+ public:
+ TestReadOnlyWithCompressedCache()
+ : DBTestBase("test_readonly_with_compressed_cache",
+ /*env_do_fsync=*/true) {
+ max_open_files_ = std::get<0>(GetParam());
+ use_mmap_ = std::get<1>(GetParam());
+ }
+ int max_open_files_;
+ bool use_mmap_;
+};
+
+TEST_P(TestReadOnlyWithCompressedCache, ReadOnlyWithCompressedCache) {
+ if (use_mmap_ && !IsMemoryMappedAccessSupported()) {
+ ROCKSDB_GTEST_SKIP("Test requires MMAP support");
+ return;
+ }
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Put("foo2", "barbarbarbarbarbarbarbar"));
+ ASSERT_OK(Flush());
+
+ DB* db_ptr = nullptr;
+ Options options = CurrentOptions();
+ options.allow_mmap_reads = use_mmap_;
+ options.max_open_files = max_open_files_;
+ options.compression = kSnappyCompression;
+ BlockBasedTableOptions table_options;
+ table_options.block_cache_compressed = NewLRUCache(8 * 1024 * 1024);
+ table_options.no_block_cache = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ options.statistics = CreateDBStatistics();
+
+ ASSERT_OK(DB::OpenForReadOnly(options, dbname_, &db_ptr));
+
+ std::string v;
+ ASSERT_OK(db_ptr->Get(ReadOptions(), "foo", &v));
+ ASSERT_EQ("bar", v);
+ ASSERT_EQ(0, options.statistics->getTickerCount(BLOCK_CACHE_COMPRESSED_HIT));
+ ASSERT_OK(db_ptr->Get(ReadOptions(), "foo", &v));
+ ASSERT_EQ("bar", v);
+ if (Snappy_Supported()) {
+ if (use_mmap_) {
+ ASSERT_EQ(0,
+ options.statistics->getTickerCount(BLOCK_CACHE_COMPRESSED_HIT));
+ } else {
+ ASSERT_EQ(1,
+ options.statistics->getTickerCount(BLOCK_CACHE_COMPRESSED_HIT));
+ }
+ }
+
+ delete db_ptr;
+}
+
+INSTANTIATE_TEST_CASE_P(TestReadOnlyWithCompressedCache,
+ TestReadOnlyWithCompressedCache,
+ ::testing::Combine(::testing::Values(-1, 100),
+ ::testing::Bool()));
+
+class PartitionedIndexTestListener : public EventListener {
+ public:
+ void OnFlushCompleted(DB* /*db*/, const FlushJobInfo& info) override {
+ ASSERT_GT(info.table_properties.index_partitions, 1);
+ ASSERT_EQ(info.table_properties.index_key_is_user_key, 0);
+ }
+};
+
+TEST_F(DBTest2, PartitionedIndexUserToInternalKey) {
+ const int kValueSize = 10500;
+ const int kNumEntriesPerFile = 1000;
+ const int kNumFiles = 3;
+ const int kNumDistinctKeys = 30;
+
+ BlockBasedTableOptions table_options;
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ table_options.index_type = BlockBasedTableOptions::kTwoLevelIndexSearch;
+ PartitionedIndexTestListener* listener = new PartitionedIndexTestListener();
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ options.listeners.emplace_back(listener);
+ std::vector<const Snapshot*> snapshots;
+ Reopen(options);
+ Random rnd(301);
+
+ for (int i = 0; i < kNumFiles; i++) {
+ for (int j = 0; j < kNumEntriesPerFile; j++) {
+ int key_id = (i * kNumEntriesPerFile + j) % kNumDistinctKeys;
+ std::string value = rnd.RandomString(kValueSize);
+ ASSERT_OK(Put("keykey_" + std::to_string(key_id), value));
+ snapshots.push_back(db_->GetSnapshot());
+ }
+ ASSERT_OK(Flush());
+ }
+
+ for (auto s : snapshots) {
+ db_->ReleaseSnapshot(s);
+ }
+}
+
+#endif // ROCKSDB_LITE
+
+class PrefixFullBloomWithReverseComparator
+ : public DBTestBase,
+ public ::testing::WithParamInterface<bool> {
+ public:
+ PrefixFullBloomWithReverseComparator()
+ : DBTestBase("prefix_bloom_reverse", /*env_do_fsync=*/true) {}
+ void SetUp() override { if_cache_filter_ = GetParam(); }
+ bool if_cache_filter_;
+};
+
+TEST_P(PrefixFullBloomWithReverseComparator,
+ PrefixFullBloomWithReverseComparator) {
+ Options options = last_options_;
+ options.comparator = ReverseBytewiseComparator();
+ options.prefix_extractor.reset(NewCappedPrefixTransform(3));
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ BlockBasedTableOptions bbto;
+ if (if_cache_filter_) {
+ bbto.no_block_cache = false;
+ bbto.cache_index_and_filter_blocks = true;
+ bbto.block_cache = NewLRUCache(1);
+ }
+ bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
+ bbto.whole_key_filtering = false;
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ DestroyAndReopen(options);
+
+ ASSERT_OK(dbfull()->Put(WriteOptions(), "bar123", "foo"));
+ ASSERT_OK(dbfull()->Put(WriteOptions(), "bar234", "foo2"));
+ ASSERT_OK(dbfull()->Put(WriteOptions(), "foo123", "foo3"));
+
+ ASSERT_OK(dbfull()->Flush(FlushOptions()));
+
+ if (bbto.block_cache) {
+ bbto.block_cache->EraseUnRefEntries();
+ }
+
+ std::unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions()));
+ iter->Seek("bar345");
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("bar234", iter->key().ToString());
+ ASSERT_EQ("foo2", iter->value().ToString());
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("bar123", iter->key().ToString());
+ ASSERT_EQ("foo", iter->value().ToString());
+
+ iter->Seek("foo234");
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("foo123", iter->key().ToString());
+ ASSERT_EQ("foo3", iter->value().ToString());
+
+ iter->Seek("bar");
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(!iter->Valid());
+}
+
+INSTANTIATE_TEST_CASE_P(PrefixFullBloomWithReverseComparator,
+ PrefixFullBloomWithReverseComparator, testing::Bool());
+
+TEST_F(DBTest2, IteratorPropertyVersionNumber) {
+ ASSERT_OK(Put("", ""));
+ Iterator* iter1 = db_->NewIterator(ReadOptions());
+ ASSERT_OK(iter1->status());
+ std::string prop_value;
+ ASSERT_OK(
+ iter1->GetProperty("rocksdb.iterator.super-version-number", &prop_value));
+ uint64_t version_number1 =
+ static_cast<uint64_t>(std::atoi(prop_value.c_str()));
+
+ ASSERT_OK(Put("", ""));
+ ASSERT_OK(Flush());
+
+ Iterator* iter2 = db_->NewIterator(ReadOptions());
+ ASSERT_OK(iter2->status());
+ ASSERT_OK(
+ iter2->GetProperty("rocksdb.iterator.super-version-number", &prop_value));
+ uint64_t version_number2 =
+ static_cast<uint64_t>(std::atoi(prop_value.c_str()));
+
+ ASSERT_GT(version_number2, version_number1);
+
+ ASSERT_OK(Put("", ""));
+
+ Iterator* iter3 = db_->NewIterator(ReadOptions());
+ ASSERT_OK(iter3->status());
+ ASSERT_OK(
+ iter3->GetProperty("rocksdb.iterator.super-version-number", &prop_value));
+ uint64_t version_number3 =
+ static_cast<uint64_t>(std::atoi(prop_value.c_str()));
+
+ ASSERT_EQ(version_number2, version_number3);
+
+ iter1->SeekToFirst();
+ ASSERT_OK(
+ iter1->GetProperty("rocksdb.iterator.super-version-number", &prop_value));
+ uint64_t version_number1_new =
+ static_cast<uint64_t>(std::atoi(prop_value.c_str()));
+ ASSERT_EQ(version_number1, version_number1_new);
+
+ delete iter1;
+ delete iter2;
+ delete iter3;
+}
+
+TEST_F(DBTest2, CacheIndexAndFilterWithDBRestart) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ BlockBasedTableOptions table_options;
+ table_options.cache_index_and_filter_blocks = true;
+ table_options.filter_policy.reset(NewBloomFilterPolicy(20));
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ ASSERT_OK(Put(1, "a", "begin"));
+ ASSERT_OK(Put(1, "z", "end"));
+ ASSERT_OK(Flush(1));
+ TryReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+ std::string value;
+ value = Get(1, "a");
+}
+
+TEST_F(DBTest2, MaxSuccessiveMergesChangeWithDBRecovery) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ options.max_successive_merges = 3;
+ options.merge_operator = MergeOperators::CreatePutOperator();
+ options.disable_auto_compactions = true;
+ DestroyAndReopen(options);
+ ASSERT_OK(Put("poi", "Finch"));
+ ASSERT_OK(db_->Merge(WriteOptions(), "poi", "Reese"));
+ ASSERT_OK(db_->Merge(WriteOptions(), "poi", "Shaw"));
+ ASSERT_OK(db_->Merge(WriteOptions(), "poi", "Root"));
+ options.max_successive_merges = 2;
+ Reopen(options);
+}
+
+#ifndef ROCKSDB_LITE
+class DBTestSharedWriteBufferAcrossCFs
+ : public DBTestBase,
+ public testing::WithParamInterface<std::tuple<bool, bool>> {
+ public:
+ DBTestSharedWriteBufferAcrossCFs()
+ : DBTestBase("db_test_shared_write_buffer", /*env_do_fsync=*/true) {}
+ void SetUp() override {
+ use_old_interface_ = std::get<0>(GetParam());
+ cost_cache_ = std::get<1>(GetParam());
+ }
+ bool use_old_interface_;
+ bool cost_cache_;
+};
+
+TEST_P(DBTestSharedWriteBufferAcrossCFs, SharedWriteBufferAcrossCFs) {
+ Options options = CurrentOptions();
+ options.arena_block_size = 4096;
+ auto flush_listener = std::make_shared<FlushCounterListener>();
+ options.listeners.push_back(flush_listener);
+ // Don't trip the listener at shutdown.
+ options.avoid_flush_during_shutdown = true;
+
+ // Avoid non-deterministic values from malloc_usable_size();
+ // force the arena block size to 1.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "Arena::Arena:0", [&](void* arg) {
+ size_t* block_size = static_cast<size_t*>(arg);
+ *block_size = 1;
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "Arena::AllocateNewBlock:0", [&](void* arg) {
+ std::pair<size_t*, size_t*>* pair =
+ static_cast<std::pair<size_t*, size_t*>*>(arg);
+ *std::get<0>(*pair) = *std::get<1>(*pair);
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // The total soft write buffer size is about 105000
+ std::shared_ptr<Cache> cache = NewLRUCache(4 * 1024 * 1024, 2);
+ ASSERT_LT(cache->GetUsage(), 256 * 1024);
+
+ if (use_old_interface_) {
+ options.db_write_buffer_size = 120000; // this is the real limit
+ } else if (!cost_cache_) {
+ options.write_buffer_manager.reset(new WriteBufferManager(114285));
+ } else {
+ options.write_buffer_manager.reset(new WriteBufferManager(114285, cache));
+ }
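+ // Rough numbers behind the limits above (assuming the write buffer manager
+ // triggers flushes at about 7/8 of its configured size): 7/8 * 120000 is
+ // 105000, which matches the soft write buffer size mentioned above, and
+ // 7/8 * 114285 is about 100000.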
+ options.write_buffer_size = 500000; // this is never hit
+ CreateAndReopenWithCF({"pikachu", "dobrynia", "nikitich"}, options);
+
+ WriteOptions wo;
+ wo.disableWAL = true;
+
+ std::function<void()> wait_flush = [&]() {
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[0]));
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1]));
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[2]));
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[3]));
+ // Ensure background work is fully finished including listener callbacks
+ // before accessing listener state.
+ ASSERT_OK(dbfull()->TEST_WaitForBackgroundWork());
+ };
+
+ // Create some data and flush "default" and "nikitich" so that they are the
+ // more recently flushed CFs.
+ flush_listener->expected_flush_reason = FlushReason::kManualFlush;
+ ASSERT_OK(Put(3, Key(1), DummyString(1), wo));
+ Flush(3);
+ ASSERT_OK(Put(3, Key(1), DummyString(1), wo));
+ ASSERT_OK(Put(0, Key(1), DummyString(1), wo));
+ Flush(0);
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
+ static_cast<uint64_t>(1));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
+ static_cast<uint64_t>(1));
+
+ flush_listener->expected_flush_reason = FlushReason::kWriteBufferManager;
+ ASSERT_OK(Put(3, Key(1), DummyString(30000), wo));
+ if (cost_cache_) {
+ ASSERT_GE(cache->GetUsage(), 256 * 1024);
+ ASSERT_LE(cache->GetUsage(), 2 * 256 * 1024);
+ }
+ wait_flush();
+ ASSERT_OK(Put(0, Key(1), DummyString(60000), wo));
+ if (cost_cache_) {
+ ASSERT_GE(cache->GetUsage(), 256 * 1024);
+ ASSERT_LE(cache->GetUsage(), 2 * 256 * 1024);
+ }
+ wait_flush();
+ ASSERT_OK(Put(2, Key(1), DummyString(1), wo));
+ // No flush should trigger
+ wait_flush();
+ {
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
+ static_cast<uint64_t>(1));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"),
+ static_cast<uint64_t>(0));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"),
+ static_cast<uint64_t>(0));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
+ static_cast<uint64_t>(1));
+ }
+
+ // Trigger a flush. Flushing "nikitich".
+ ASSERT_OK(Put(3, Key(2), DummyString(30000), wo));
+ wait_flush();
+ ASSERT_OK(Put(0, Key(1), DummyString(1), wo));
+ wait_flush();
+ {
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
+ static_cast<uint64_t>(1));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"),
+ static_cast<uint64_t>(0));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"),
+ static_cast<uint64_t>(0));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
+ static_cast<uint64_t>(2));
+ }
+
+ // Without hitting the threshold, no flush should trigger.
+ ASSERT_OK(Put(2, Key(1), DummyString(30000), wo));
+ wait_flush();
+ ASSERT_OK(Put(2, Key(1), DummyString(1), wo));
+ wait_flush();
+ ASSERT_OK(Put(2, Key(1), DummyString(1), wo));
+ wait_flush();
+ {
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
+ static_cast<uint64_t>(1));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"),
+ static_cast<uint64_t>(0));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"),
+ static_cast<uint64_t>(0));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
+ static_cast<uint64_t>(2));
+ }
+
+ // Hit the write buffer limit again. "default"
+ // will have been flushed.
+ ASSERT_OK(Put(2, Key(2), DummyString(10000), wo));
+ wait_flush();
+ ASSERT_OK(Put(3, Key(1), DummyString(1), wo));
+ wait_flush();
+ ASSERT_OK(Put(0, Key(1), DummyString(1), wo));
+ wait_flush();
+ ASSERT_OK(Put(0, Key(1), DummyString(1), wo));
+ wait_flush();
+ ASSERT_OK(Put(0, Key(1), DummyString(1), wo));
+ wait_flush();
+ {
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
+ static_cast<uint64_t>(2));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"),
+ static_cast<uint64_t>(0));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"),
+ static_cast<uint64_t>(0));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
+ static_cast<uint64_t>(2));
+ }
+
+ // Trigger another flush. This time "dobrynia" gets flushed; "pikachu"
+ // should not be flushed, even though it has never been flushed.
+ ASSERT_OK(Put(1, Key(1), DummyString(1), wo));
+ wait_flush();
+ ASSERT_OK(Put(2, Key(1), DummyString(80000), wo));
+ wait_flush();
+ ASSERT_OK(Put(1, Key(1), DummyString(1), wo));
+ wait_flush();
+ ASSERT_OK(Put(2, Key(1), DummyString(1), wo));
+ wait_flush();
+
+ {
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
+ static_cast<uint64_t>(2));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"),
+ static_cast<uint64_t>(0));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"),
+ static_cast<uint64_t>(1));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
+ static_cast<uint64_t>(2));
+ }
+ if (cost_cache_) {
+ ASSERT_GE(cache->GetUsage(), 256 * 1024);
+ Close();
+ options.write_buffer_manager.reset();
+ last_options_.write_buffer_manager.reset();
+ ASSERT_LT(cache->GetUsage(), 256 * 1024);
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+INSTANTIATE_TEST_CASE_P(DBTestSharedWriteBufferAcrossCFs,
+ DBTestSharedWriteBufferAcrossCFs,
+ ::testing::Values(std::make_tuple(true, false),
+ std::make_tuple(false, false),
+ std::make_tuple(false, true)));
+
+TEST_F(DBTest2, SharedWriteBufferLimitAcrossDB) {
+ std::string dbname2 = test::PerThreadDBPath("db_shared_wb_db2");
+ Options options = CurrentOptions();
+ options.arena_block_size = 4096;
+ auto flush_listener = std::make_shared<FlushCounterListener>();
+ options.listeners.push_back(flush_listener);
+ // Don't trip the listener at shutdown.
+ options.avoid_flush_during_shutdown = true;
+ // Avoid non-deterministic values from malloc_usable_size();
+ // force the arena block size to 1.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "Arena::Arena:0", [&](void* arg) {
+ size_t* block_size = static_cast<size_t*>(arg);
+ *block_size = 1;
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "Arena::AllocateNewBlock:0", [&](void* arg) {
+ std::pair<size_t*, size_t*>* pair =
+ static_cast<std::pair<size_t*, size_t*>*>(arg);
+ *std::get<0>(*pair) = *std::get<1>(*pair);
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ options.write_buffer_size = 500000; // this is never hit
+ // Use a write buffer total size so that the soft limit is about
+ // 105000.
+ options.write_buffer_manager.reset(new WriteBufferManager(120000));
+ CreateAndReopenWithCF({"cf1", "cf2"}, options);
+
+ ASSERT_OK(DestroyDB(dbname2, options));
+ DB* db2 = nullptr;
+ ASSERT_OK(DB::Open(options, dbname2, &db2));
+
+ WriteOptions wo;
+ wo.disableWAL = true;
+
+ std::function<void()> wait_flush = [&]() {
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[0]));
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1]));
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[2]));
+ ASSERT_OK(static_cast<DBImpl*>(db2)->TEST_WaitForFlushMemTable());
+ // Ensure background work is fully finished including listener callbacks
+ // before accessing listener state.
+ ASSERT_OK(dbfull()->TEST_WaitForBackgroundWork());
+ ASSERT_OK(
+ static_cast_with_check<DBImpl>(db2)->TEST_WaitForBackgroundWork());
+ };
+
+ // Trigger a flush on cf2
+ flush_listener->expected_flush_reason = FlushReason::kWriteBufferManager;
+ ASSERT_OK(Put(2, Key(1), DummyString(70000), wo));
+ wait_flush();
+ ASSERT_OK(Put(0, Key(1), DummyString(20000), wo));
+ wait_flush();
+
+ // Insert to DB2
+ ASSERT_OK(db2->Put(wo, Key(2), DummyString(20000)));
+ wait_flush();
+
+ ASSERT_OK(Put(2, Key(1), DummyString(1), wo));
+ wait_flush();
+ ASSERT_OK(static_cast<DBImpl*>(db2)->TEST_WaitForFlushMemTable());
+ {
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default") +
+ GetNumberOfSstFilesForColumnFamily(db_, "cf1") +
+ GetNumberOfSstFilesForColumnFamily(db_, "cf2"),
+ static_cast<uint64_t>(1));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db2, "default"),
+ static_cast<uint64_t>(0));
+ }
+
+ // Trigger a flush of another CF in DB1.
+ ASSERT_OK(db2->Put(wo, Key(2), DummyString(70000)));
+ wait_flush();
+ ASSERT_OK(Put(2, Key(1), DummyString(1), wo));
+ wait_flush();
+ {
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
+ static_cast<uint64_t>(1));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "cf1"),
+ static_cast<uint64_t>(0));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "cf2"),
+ static_cast<uint64_t>(1));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db2, "default"),
+ static_cast<uint64_t>(0));
+ }
+
+ // Trigger a flush in DB2.
+ ASSERT_OK(db2->Put(wo, Key(3), DummyString(40000)));
+ wait_flush();
+ ASSERT_OK(db2->Put(wo, Key(1), DummyString(1)));
+ wait_flush();
+ ASSERT_OK(static_cast<DBImpl*>(db2)->TEST_WaitForFlushMemTable());
+ {
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
+ static_cast<uint64_t>(1));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "cf1"),
+ static_cast<uint64_t>(0));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "cf2"),
+ static_cast<uint64_t>(1));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db2, "default"),
+ static_cast<uint64_t>(1));
+ }
+
+ delete db2;
+ ASSERT_OK(DestroyDB(dbname2, options));
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBTest2, TestWriteBufferNoLimitWithCache) {
+ Options options = CurrentOptions();
+ options.arena_block_size = 4096;
+ std::shared_ptr<Cache> cache = NewLRUCache(LRUCacheOptions(
+ 10000000 /* capacity */, 1 /* num_shard_bits */,
+ false /* strict_capacity_limit */, 0.0 /* high_pri_pool_ratio */,
+ nullptr /* memory_allocator */, kDefaultToAdaptiveMutex,
+ kDontChargeCacheMetadata));
+
+ options.write_buffer_size = 50000; // this is never hit
+ // A write buffer manager created with size 0 imposes no total limit; it is
+ // used here only so that memtable memory is charged to the block cache.
+ options.write_buffer_manager.reset(new WriteBufferManager(0, cache));
+ Reopen(options);
+
+ ASSERT_OK(Put("foo", "bar"));
+ // One dummy entry is 256KB.
+ ASSERT_GT(cache->GetUsage(), 128000);
+}
+
+namespace {
+void ValidateKeyExistence(DB* db, const std::vector<Slice>& keys_must_exist,
+ const std::vector<Slice>& keys_must_not_exist) {
+ // Ensure that expected keys exist
+ std::vector<std::string> values;
+ if (keys_must_exist.size() > 0) {
+ std::vector<Status> status_list =
+ db->MultiGet(ReadOptions(), keys_must_exist, &values);
+ for (size_t i = 0; i < keys_must_exist.size(); i++) {
+ ASSERT_OK(status_list[i]);
+ }
+ }
+
+ // Ensure that given keys don't exist
+ if (keys_must_not_exist.size() > 0) {
+ std::vector<Status> status_list =
+ db->MultiGet(ReadOptions(), keys_must_not_exist, &values);
+ for (size_t i = 0; i < keys_must_not_exist.size(); i++) {
+ ASSERT_TRUE(status_list[i].IsNotFound());
+ }
+ }
+}
+
+} // anonymous namespace
+
+TEST_F(DBTest2, WalFilterTest) {
+ class TestWalFilter : public WalFilter {
+ private:
+ // Processing option that is requested to be applied at the given index
+ WalFilter::WalProcessingOption wal_processing_option_;
+ // Index at which to apply wal_processing_option_
+ // At other indexes default wal_processing_option::kContinueProcessing is
+ // returned.
+ size_t apply_option_at_record_index_;
+ // Current record index, incremented with each record encountered.
+ size_t current_record_index_;
+
+ public:
+ TestWalFilter(WalFilter::WalProcessingOption wal_processing_option,
+ size_t apply_option_for_record_index)
+ : wal_processing_option_(wal_processing_option),
+ apply_option_at_record_index_(apply_option_for_record_index),
+ current_record_index_(0) {}
+
+ WalProcessingOption LogRecord(const WriteBatch& /*batch*/,
+ WriteBatch* /*new_batch*/,
+ bool* /*batch_changed*/) const override {
+ WalFilter::WalProcessingOption option_to_return;
+
+ if (current_record_index_ == apply_option_at_record_index_) {
+ option_to_return = wal_processing_option_;
+ } else {
+ option_to_return = WalProcessingOption::kContinueProcessing;
+ }
+
+ // The filter is passed as a const object so that RocksDB does not modify
+ // it; however, we modify it for our own purposes here and hence cast the
+ // constness away.
+ (const_cast<TestWalFilter*>(this)->current_record_index_)++;
+
+ return option_to_return;
+ }
+
+ const char* Name() const override { return "TestWalFilter"; }
+ };
+
+ // Create 3 batches with two keys each
+ std::vector<std::vector<std::string>> batch_keys(3);
+
+ batch_keys[0].push_back("key1");
+ batch_keys[0].push_back("key2");
+ batch_keys[1].push_back("key3");
+ batch_keys[1].push_back("key4");
+ batch_keys[2].push_back("key5");
+ batch_keys[2].push_back("key6");
+
+ // Test with all WAL processing options
+ for (int option = 0;
+ option < static_cast<int>(
+ WalFilter::WalProcessingOption::kWalProcessingOptionMax);
+ option++) {
+ Options options = OptionsForLogIterTest();
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // Write given keys in given batches
+ for (size_t i = 0; i < batch_keys.size(); i++) {
+ WriteBatch batch;
+ for (size_t j = 0; j < batch_keys[i].size(); j++) {
+ ASSERT_OK(batch.Put(handles_[0], batch_keys[i][j], DummyString(1024)));
+ }
+ ASSERT_OK(dbfull()->Write(WriteOptions(), &batch));
+ }
+
+ WalFilter::WalProcessingOption wal_processing_option =
+ static_cast<WalFilter::WalProcessingOption>(option);
+
+ // Create a test filter that applies wal_processing_option at record
+ // index 1 (i.e. the second record).
+ size_t apply_option_for_record_index = 1;
+ TestWalFilter test_wal_filter(wal_processing_option,
+ apply_option_for_record_index);
+
+ // Reopen database with option to use WAL filter
+ options = OptionsForLogIterTest();
+ options.wal_filter = &test_wal_filter;
+ Status status =
+ TryReopenWithColumnFamilies({"default", "pikachu"}, options);
+ if (wal_processing_option ==
+ WalFilter::WalProcessingOption::kCorruptedRecord) {
+ ASSERT_NOK(status);
+ // In case of corruption we can turn off paranoid_checks to reopen
+ // the database.
+ options.paranoid_checks = false;
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ } else {
+ ASSERT_OK(status);
+ }
+
+ // Compute which keys we expect to be found
+ // and which we expect not to be found after recovery.
+ std::vector<Slice> keys_must_exist;
+ std::vector<Slice> keys_must_not_exist;
+ switch (wal_processing_option) {
+ case WalFilter::WalProcessingOption::kCorruptedRecord:
+ case WalFilter::WalProcessingOption::kContinueProcessing: {
+ fprintf(stderr, "Testing with complete WAL processing\n");
+ // we expect all records to be processed
+ for (size_t i = 0; i < batch_keys.size(); i++) {
+ for (size_t j = 0; j < batch_keys[i].size(); j++) {
+ keys_must_exist.push_back(Slice(batch_keys[i][j]));
+ }
+ }
+ break;
+ }
+ case WalFilter::WalProcessingOption::kIgnoreCurrentRecord: {
+ fprintf(stderr,
+ "Testing with ignoring record %" ROCKSDB_PRIszt " only\n",
+ apply_option_for_record_index);
+ // We expect the record with apply_option_for_record_index to be not
+ // found.
+ for (size_t i = 0; i < batch_keys.size(); i++) {
+ for (size_t j = 0; j < batch_keys[i].size(); j++) {
+ if (i == apply_option_for_record_index) {
+ keys_must_not_exist.push_back(Slice(batch_keys[i][j]));
+ } else {
+ keys_must_exist.push_back(Slice(batch_keys[i][j]));
+ }
+ }
+ }
+ break;
+ }
+ case WalFilter::WalProcessingOption::kStopReplay: {
+ fprintf(stderr,
+ "Testing with stopping replay from record %" ROCKSDB_PRIszt
+ "\n",
+ apply_option_for_record_index);
+ // We expect records beyond apply_option_for_record_index to be not
+ // found.
+ for (size_t i = 0; i < batch_keys.size(); i++) {
+ for (size_t j = 0; j < batch_keys[i].size(); j++) {
+ if (i >= apply_option_for_record_index) {
+ keys_must_not_exist.push_back(Slice(batch_keys[i][j]));
+ } else {
+ keys_must_exist.push_back(Slice(batch_keys[i][j]));
+ }
+ }
+ }
+ break;
+ }
+ default:
+ FAIL(); // unhandled case
+ }
+
+ bool checked_after_reopen = false;
+
+ while (true) {
+ // Ensure that the expected keys exist
+ // and the unexpected keys don't exist after recovery
+ ValidateKeyExistence(db_, keys_must_exist, keys_must_not_exist);
+
+ if (checked_after_reopen) {
+ break;
+ }
+
+ // Reopen the database again to make sure the previous log(s) are not used
+ // (even if they were skipped).
+ // Reopen the database with the option to use the WAL filter.
+ options = OptionsForLogIterTest();
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+ checked_after_reopen = true;
+ }
+ }
+}
+
+TEST_F(DBTest2, WalFilterTestWithChangeBatch) {
+ class ChangeBatchHandler : public WriteBatch::Handler {
+ private:
+ // Batch to insert keys in
+ WriteBatch* new_write_batch_;
+ // Number of keys to add in the new batch
+ size_t num_keys_to_add_in_new_batch_;
+ // Number of keys added to new batch
+ size_t num_keys_added_;
+
+ public:
+ ChangeBatchHandler(WriteBatch* new_write_batch,
+ size_t num_keys_to_add_in_new_batch)
+ : new_write_batch_(new_write_batch),
+ num_keys_to_add_in_new_batch_(num_keys_to_add_in_new_batch),
+ num_keys_added_(0) {}
+ void Put(const Slice& key, const Slice& value) override {
+ if (num_keys_added_ < num_keys_to_add_in_new_batch_) {
+ ASSERT_OK(new_write_batch_->Put(key, value));
+ ++num_keys_added_;
+ }
+ }
+ };
+
+ class TestWalFilterWithChangeBatch : public WalFilter {
+ private:
+ // Index at which to start changing records
+ size_t change_records_from_index_;
+ // Number of keys to add in the new batch
+ size_t num_keys_to_add_in_new_batch_;
+ // Current record index, incremented with each record encountered.
+ size_t current_record_index_;
+
+ public:
+ TestWalFilterWithChangeBatch(size_t change_records_from_index,
+ size_t num_keys_to_add_in_new_batch)
+ : change_records_from_index_(change_records_from_index),
+ num_keys_to_add_in_new_batch_(num_keys_to_add_in_new_batch),
+ current_record_index_(0) {}
+
+ WalProcessingOption LogRecord(const WriteBatch& batch,
+ WriteBatch* new_batch,
+ bool* batch_changed) const override {
+ if (current_record_index_ >= change_records_from_index_) {
+ ChangeBatchHandler handler(new_batch, num_keys_to_add_in_new_batch_);
+ Status s = batch.Iterate(&handler);
+ if (s.ok()) {
+ *batch_changed = true;
+ } else {
+ assert(false);
+ }
+ }
+
+ // The filter is passed as a const object so that RocksDB does not modify
+ // it; however, we modify it for our own purposes here and hence cast the
+ // constness away.
+ (const_cast<TestWalFilterWithChangeBatch*>(this)
+ ->current_record_index_)++;
+
+ return WalProcessingOption::kContinueProcessing;
+ }
+
+ const char* Name() const override { return "TestWalFilterWithChangeBatch"; }
+ };
+
+ std::vector<std::vector<std::string>> batch_keys(3);
+
+ batch_keys[0].push_back("key1");
+ batch_keys[0].push_back("key2");
+ batch_keys[1].push_back("key3");
+ batch_keys[1].push_back("key4");
+ batch_keys[2].push_back("key5");
+ batch_keys[2].push_back("key6");
+
+ Options options = OptionsForLogIterTest();
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // Write given keys in given batches
+ for (size_t i = 0; i < batch_keys.size(); i++) {
+ WriteBatch batch;
+ for (size_t j = 0; j < batch_keys[i].size(); j++) {
+ ASSERT_OK(batch.Put(handles_[0], batch_keys[i][j], DummyString(1024)));
+ }
+ ASSERT_OK(dbfull()->Write(WriteOptions(), &batch));
+ }
+
+  // Create a test filter that rewrites batches starting at
+  // change_records_from_index, keeping only num_keys_to_add_in_new_batch keys
+  // per rewritten batch
+ size_t change_records_from_index = 1;
+ size_t num_keys_to_add_in_new_batch = 1;
+ TestWalFilterWithChangeBatch test_wal_filter_with_change_batch(
+ change_records_from_index, num_keys_to_add_in_new_batch);
+
+ // Reopen database with option to use WAL filter
+ options = OptionsForLogIterTest();
+ options.wal_filter = &test_wal_filter_with_change_batch;
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+  // Ensure that all keys exist before change_records_from_index_, and that
+  // from that index onward only a single key per batch exists, since our
+  // filter keeps only one key for each rewritten batch
+ std::vector<Slice> keys_must_exist;
+ std::vector<Slice> keys_must_not_exist;
+
+ for (size_t i = 0; i < batch_keys.size(); i++) {
+ for (size_t j = 0; j < batch_keys[i].size(); j++) {
+ if (i >= change_records_from_index && j >= num_keys_to_add_in_new_batch) {
+ keys_must_not_exist.push_back(Slice(batch_keys[i][j]));
+ } else {
+ keys_must_exist.push_back(Slice(batch_keys[i][j]));
+ }
+ }
+ }
+
+ bool checked_after_reopen = false;
+
+ while (true) {
+    // Ensure that the expected keys exist and the unexpected keys do not
+    // exist after recovery
+ ValidateKeyExistence(db_, keys_must_exist, keys_must_not_exist);
+
+ if (checked_after_reopen) {
+ break;
+ }
+
+    // Reopen the database again, this time without the WAL filter, to make
+    // sure the previous log(s) are not reused (even if they were skipped)
+ options = OptionsForLogIterTest();
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+ checked_after_reopen = true;
+ }
+}
+
+TEST_F(DBTest2, WalFilterTestWithChangeBatchExtraKeys) {
+ class TestWalFilterWithChangeBatchAddExtraKeys : public WalFilter {
+ public:
+ WalProcessingOption LogRecord(const WriteBatch& batch,
+ WriteBatch* new_batch,
+ bool* batch_changed) const override {
+ *new_batch = batch;
+ Status s = new_batch->Put("key_extra", "value_extra");
+ if (s.ok()) {
+ *batch_changed = true;
+ } else {
+ assert(false);
+ }
+ return WalProcessingOption::kContinueProcessing;
+ }
+
+ const char* Name() const override {
+ return "WalFilterTestWithChangeBatchExtraKeys";
+ }
+ };
+
+ std::vector<std::vector<std::string>> batch_keys(3);
+
+ batch_keys[0].push_back("key1");
+ batch_keys[0].push_back("key2");
+ batch_keys[1].push_back("key3");
+ batch_keys[1].push_back("key4");
+ batch_keys[2].push_back("key5");
+ batch_keys[2].push_back("key6");
+
+ Options options = OptionsForLogIterTest();
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // Write given keys in given batches
+ for (size_t i = 0; i < batch_keys.size(); i++) {
+ WriteBatch batch;
+ for (size_t j = 0; j < batch_keys[i].size(); j++) {
+ ASSERT_OK(batch.Put(handles_[0], batch_keys[i][j], DummyString(1024)));
+ }
+ ASSERT_OK(dbfull()->Write(WriteOptions(), &batch));
+ }
+
+ // Create a test filter that would add extra keys
+ TestWalFilterWithChangeBatchAddExtraKeys test_wal_filter_extra_keys;
+
+ // Reopen database with option to use WAL filter
+ options = OptionsForLogIterTest();
+ options.wal_filter = &test_wal_filter_extra_keys;
+ Status status = TryReopenWithColumnFamilies({"default", "pikachu"}, options);
+ ASSERT_TRUE(status.IsNotSupported());
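+  // Recovery rejects WAL filters that add records (the new batch may not
+  // contain more entries than the original), so the reopen above fails with
+  // NotSupported and must leave the DB unchanged.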
+
+  // Reopen without the filter; this time the reopen should succeed, and the
+  // previous failed attempt must not have altered the db.
+ options = OptionsForLogIterTest();
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+ std::vector<Slice> keys_must_exist;
+ std::vector<Slice> keys_must_not_exist; // empty vector
+
+ for (size_t i = 0; i < batch_keys.size(); i++) {
+ for (size_t j = 0; j < batch_keys[i].size(); j++) {
+ keys_must_exist.push_back(Slice(batch_keys[i][j]));
+ }
+ }
+
+ ValidateKeyExistence(db_, keys_must_exist, keys_must_not_exist);
+}
+
+TEST_F(DBTest2, WalFilterTestWithColumnFamilies) {
+ class TestWalFilterWithColumnFamilies : public WalFilter {
+ private:
+ // column_family_id -> log_number map (provided to WALFilter)
+ std::map<uint32_t, uint64_t> cf_log_number_map_;
+ // column_family_name -> column_family_id map (provided to WALFilter)
+ std::map<std::string, uint32_t> cf_name_id_map_;
+    // column_family_id -> keys_found_in_wal map
+ // We store keys that are applicable to the column_family
+ // during recovery (i.e. aren't already flushed to SST file(s))
+ // for verification against the keys we expect.
+ std::map<uint32_t, std::vector<std::string>> cf_wal_keys_;
+
+ public:
+ void ColumnFamilyLogNumberMap(
+ const std::map<uint32_t, uint64_t>& cf_lognumber_map,
+ const std::map<std::string, uint32_t>& cf_name_id_map) override {
+ cf_log_number_map_ = cf_lognumber_map;
+ cf_name_id_map_ = cf_name_id_map;
+ }
+
+ WalProcessingOption LogRecordFound(unsigned long long log_number,
+ const std::string& /*log_file_name*/,
+ const WriteBatch& batch,
+ WriteBatch* /*new_batch*/,
+ bool* /*batch_changed*/) override {
+ class LogRecordBatchHandler : public WriteBatch::Handler {
+ private:
+ const std::map<uint32_t, uint64_t>& cf_log_number_map_;
+ std::map<uint32_t, std::vector<std::string>>& cf_wal_keys_;
+ unsigned long long log_number_;
+
+ public:
+ LogRecordBatchHandler(
+ unsigned long long current_log_number,
+ const std::map<uint32_t, uint64_t>& cf_log_number_map,
+ std::map<uint32_t, std::vector<std::string>>& cf_wal_keys)
+ : cf_log_number_map_(cf_log_number_map),
+ cf_wal_keys_(cf_wal_keys),
+ log_number_(current_log_number) {}
+
+ Status PutCF(uint32_t column_family_id, const Slice& key,
+ const Slice& /*value*/) override {
+ auto it = cf_log_number_map_.find(column_family_id);
+ assert(it != cf_log_number_map_.end());
+ unsigned long long log_number_for_cf = it->second;
+ // If the current record is applicable for column_family_id
+ // (i.e. isn't flushed to SST file(s) for column_family_id)
+ // add it to the cf_wal_keys_ map for verification.
+ if (log_number_ >= log_number_for_cf) {
+ cf_wal_keys_[column_family_id].push_back(
+ std::string(key.data(), key.size()));
+ }
+ return Status::OK();
+ }
+ } handler(log_number, cf_log_number_map_, cf_wal_keys_);
+
+ Status s = batch.Iterate(&handler);
+ if (!s.ok()) {
+ // TODO(AR) is this ok?
+ return WalProcessingOption::kCorruptedRecord;
+ }
+
+ return WalProcessingOption::kContinueProcessing;
+ }
+
+ const char* Name() const override {
+ return "WalFilterTestWithColumnFamilies";
+ }
+
+ const std::map<uint32_t, std::vector<std::string>>& GetColumnFamilyKeys() {
+ return cf_wal_keys_;
+ }
+
+ const std::map<std::string, uint32_t>& GetColumnFamilyNameIdMap() {
+ return cf_name_id_map_;
+ }
+ };
+
+ std::vector<std::vector<std::string>> batch_keys_pre_flush(3);
+
+ batch_keys_pre_flush[0].push_back("key1");
+ batch_keys_pre_flush[0].push_back("key2");
+ batch_keys_pre_flush[1].push_back("key3");
+ batch_keys_pre_flush[1].push_back("key4");
+ batch_keys_pre_flush[2].push_back("key5");
+ batch_keys_pre_flush[2].push_back("key6");
+
+ Options options = OptionsForLogIterTest();
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // Write given keys in given batches
+ for (size_t i = 0; i < batch_keys_pre_flush.size(); i++) {
+ WriteBatch batch;
+ for (size_t j = 0; j < batch_keys_pre_flush[i].size(); j++) {
+ ASSERT_OK(batch.Put(handles_[0], batch_keys_pre_flush[i][j],
+ DummyString(1024)));
+ ASSERT_OK(batch.Put(handles_[1], batch_keys_pre_flush[i][j],
+ DummyString(1024)));
+ }
+ ASSERT_OK(dbfull()->Write(WriteOptions(), &batch));
+ }
+
+ // Flush default column-family
+ ASSERT_OK(db_->Flush(FlushOptions(), handles_[0]));
+
+ // Do some more writes
+ std::vector<std::vector<std::string>> batch_keys_post_flush(3);
+
+ batch_keys_post_flush[0].push_back("key7");
+ batch_keys_post_flush[0].push_back("key8");
+ batch_keys_post_flush[1].push_back("key9");
+ batch_keys_post_flush[1].push_back("key10");
+ batch_keys_post_flush[2].push_back("key11");
+ batch_keys_post_flush[2].push_back("key12");
+
+ // Write given keys in given batches
+ for (size_t i = 0; i < batch_keys_post_flush.size(); i++) {
+ WriteBatch batch;
+ for (size_t j = 0; j < batch_keys_post_flush[i].size(); j++) {
+ ASSERT_OK(batch.Put(handles_[0], batch_keys_post_flush[i][j],
+ DummyString(1024)));
+ ASSERT_OK(batch.Put(handles_[1], batch_keys_post_flush[i][j],
+ DummyString(1024)));
+ }
+ ASSERT_OK(dbfull()->Write(WriteOptions(), &batch));
+ }
+
+  // On recovery, only the post-flush batches should be applicable to the
+  // default CF, while both pre-flush and post-flush batches should be
+  // applicable to the pikachu CF
+
+  // Create a test filter that records the WAL keys it sees per column family
+ TestWalFilterWithColumnFamilies test_wal_filter_column_families;
+
+ // Reopen database with option to use WAL filter
+ options = OptionsForLogIterTest();
+ options.wal_filter = &test_wal_filter_column_families;
+ Status status = TryReopenWithColumnFamilies({"default", "pikachu"}, options);
+ ASSERT_TRUE(status.ok());
+
+  // Verify that handles_[0] (default) only sees post-flush keys in the WAL,
+  // while handles_[1] (pikachu) sees both pre-flush and post-flush keys
+ auto cf_wal_keys = test_wal_filter_column_families.GetColumnFamilyKeys();
+ auto name_id_map = test_wal_filter_column_families.GetColumnFamilyNameIdMap();
+ size_t index = 0;
+ auto keys_cf = cf_wal_keys[name_id_map[kDefaultColumnFamilyName]];
+ // default column-family, only post_flush keys are expected
+ for (size_t i = 0; i < batch_keys_post_flush.size(); i++) {
+ for (size_t j = 0; j < batch_keys_post_flush[i].size(); j++) {
+ Slice key_from_the_log(keys_cf[index++]);
+ Slice batch_key(batch_keys_post_flush[i][j]);
+ ASSERT_EQ(key_from_the_log.compare(batch_key), 0);
+ }
+ }
+ ASSERT_EQ(index, keys_cf.size());
+
+ index = 0;
+ keys_cf = cf_wal_keys[name_id_map["pikachu"]];
+ // pikachu column-family, all keys are expected
+ for (size_t i = 0; i < batch_keys_pre_flush.size(); i++) {
+ for (size_t j = 0; j < batch_keys_pre_flush[i].size(); j++) {
+ Slice key_from_the_log(keys_cf[index++]);
+ Slice batch_key(batch_keys_pre_flush[i][j]);
+ ASSERT_EQ(key_from_the_log.compare(batch_key), 0);
+ }
+ }
+
+ for (size_t i = 0; i < batch_keys_post_flush.size(); i++) {
+ for (size_t j = 0; j < batch_keys_post_flush[i].size(); j++) {
+ Slice key_from_the_log(keys_cf[index++]);
+ Slice batch_key(batch_keys_post_flush[i][j]);
+ ASSERT_EQ(key_from_the_log.compare(batch_key), 0);
+ }
+ }
+ ASSERT_EQ(index, keys_cf.size());
+}
+
+TEST_F(DBTest2, PresetCompressionDict) {
+ // Verifies that compression ratio improves when dictionary is enabled, and
+ // improves even further when the dictionary is trained by ZSTD.
+ const size_t kBlockSizeBytes = 4 << 10;
+ const size_t kL0FileBytes = 128 << 10;
+ const size_t kApproxPerBlockOverheadBytes = 50;
+ const int kNumL0Files = 5;
+
+ Options options;
+ // Make sure to use any custom env that the test is configured with.
+ options.env = CurrentOptions().env;
+ options.allow_concurrent_memtable_write = false;
+ options.arena_block_size = kBlockSizeBytes;
+ options.create_if_missing = true;
+ options.disable_auto_compactions = true;
+ options.level0_file_num_compaction_trigger = kNumL0Files;
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(kL0FileBytes / kBlockSizeBytes));
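+  // NewSpecialSkipListFactory(N) forces the memtable to report itself full
+  // after roughly N entries, so each memtable flushes at about kL0FileBytes.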
+ options.num_levels = 2;
+ options.target_file_size_base = kL0FileBytes;
+ options.target_file_size_multiplier = 2;
+ options.write_buffer_size = kL0FileBytes;
+ BlockBasedTableOptions table_options;
+ table_options.block_size = kBlockSizeBytes;
+ std::vector<CompressionType> compression_types;
+ if (Zlib_Supported()) {
+ compression_types.push_back(kZlibCompression);
+ }
+#if LZ4_VERSION_NUMBER >= 10400 // r124+
+ compression_types.push_back(kLZ4Compression);
+ compression_types.push_back(kLZ4HCCompression);
+#endif // LZ4_VERSION_NUMBER >= 10400
+ if (ZSTD_Supported()) {
+ compression_types.push_back(kZSTD);
+ }
+
+ enum DictionaryTypes : int {
+ kWithoutDict,
+ kWithDict,
+ kWithZSTDfinalizeDict,
+ kWithZSTDTrainedDict,
+ kDictEnd,
+ };
+
+ for (auto compression_type : compression_types) {
+ options.compression = compression_type;
+ size_t bytes_without_dict = 0;
+ size_t bytes_with_dict = 0;
+ size_t bytes_with_zstd_finalize_dict = 0;
+ size_t bytes_with_zstd_trained_dict = 0;
+ for (int i = kWithoutDict; i < kDictEnd; i++) {
+      // First iteration: compress without preset dictionary
+      // Second iteration: compress with preset dictionary
+      // Third iteration (zstd only): compress with a finalized dictionary
+      // Fourth iteration (zstd only): compress with a zstd-trained dictionary
+      //
+      // To make sure the compression dictionary has the intended effect, we
+      // verify the compressed size is smaller in successive iterations. Also
+      // in the non-first iterations, verify the data we get out is the same
+      // data we put in.
+ switch (i) {
+ case kWithoutDict:
+ options.compression_opts.max_dict_bytes = 0;
+ options.compression_opts.zstd_max_train_bytes = 0;
+ break;
+ case kWithDict:
+ options.compression_opts.max_dict_bytes = kBlockSizeBytes;
+ options.compression_opts.zstd_max_train_bytes = 0;
+ break;
+ case kWithZSTDfinalizeDict:
+ if (compression_type != kZSTD ||
+ !ZSTD_FinalizeDictionarySupported()) {
+ continue;
+ }
+ options.compression_opts.max_dict_bytes = kBlockSizeBytes;
+ options.compression_opts.zstd_max_train_bytes = kL0FileBytes;
+ options.compression_opts.use_zstd_dict_trainer = false;
+ break;
+ case kWithZSTDTrainedDict:
+ if (compression_type != kZSTD || !ZSTD_TrainDictionarySupported()) {
+ continue;
+ }
+ options.compression_opts.max_dict_bytes = kBlockSizeBytes;
+ options.compression_opts.zstd_max_train_bytes = kL0FileBytes;
+ options.compression_opts.use_zstd_dict_trainer = true;
+ break;
+ default:
+ assert(false);
+ }
+
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ CreateAndReopenWithCF({"pikachu"}, options);
+ Random rnd(301);
+ std::string seq_datas[10];
+ for (int j = 0; j < 10; ++j) {
+ seq_datas[j] =
+ rnd.RandomString(kBlockSizeBytes - kApproxPerBlockOverheadBytes);
+ }
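+      // Each value is roughly one block long, and runs of ten consecutive
+      // keys share the same value, giving a preset dictionary repeated
+      // content to exploit.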
+
+ ASSERT_EQ(0, NumTableFilesAtLevel(0, 1));
+ for (int j = 0; j < kNumL0Files; ++j) {
+ for (size_t k = 0; k < kL0FileBytes / kBlockSizeBytes + 1; ++k) {
+ auto key_num = j * (kL0FileBytes / kBlockSizeBytes) + k;
+ ASSERT_OK(Put(1, Key(static_cast<int>(key_num)),
+ seq_datas[(key_num / 10) % 10]));
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1]));
+ ASSERT_EQ(j + 1, NumTableFilesAtLevel(0, 1));
+ }
+ ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1],
+ true /* disallow_trivial_move */));
+ ASSERT_EQ(0, NumTableFilesAtLevel(0, 1));
+ ASSERT_GT(NumTableFilesAtLevel(1, 1), 0);
+
+ // Get the live sst files size
+ size_t total_sst_bytes = TotalSize(1);
+ if (i == kWithoutDict) {
+ bytes_without_dict = total_sst_bytes;
+ } else if (i == kWithDict) {
+ bytes_with_dict = total_sst_bytes;
+ } else if (i == kWithZSTDfinalizeDict) {
+ bytes_with_zstd_finalize_dict = total_sst_bytes;
+ } else if (i == kWithZSTDTrainedDict) {
+ bytes_with_zstd_trained_dict = total_sst_bytes;
+ }
+
+ for (size_t j = 0; j < kNumL0Files * (kL0FileBytes / kBlockSizeBytes);
+ j++) {
+ ASSERT_EQ(seq_datas[(j / 10) % 10], Get(1, Key(static_cast<int>(j))));
+ }
+ if (i == kWithDict) {
+ ASSERT_GT(bytes_without_dict, bytes_with_dict);
+      } else if (i == kWithZSTDfinalizeDict) {
+ // In zstd compression, it is sometimes possible that using a finalized
+ // dictionary does not get as good a compression ratio as raw content
+ // dictionary. But using a dictionary should always get better
+ // compression ratio than not using one.
+ ASSERT_TRUE(bytes_with_dict > bytes_with_zstd_finalize_dict ||
+ bytes_without_dict > bytes_with_zstd_finalize_dict);
+ } else if (i == kWithZSTDTrainedDict) {
+ // In zstd compression, it is sometimes possible that using a trained
+ // dictionary does not get as good a compression ratio as without
+ // training.
+ // But using a dictionary (with or without training) should always get
+ // better compression ratio than not using one.
+ ASSERT_TRUE(bytes_with_dict > bytes_with_zstd_trained_dict ||
+ bytes_without_dict > bytes_with_zstd_trained_dict);
+ }
+
+ DestroyAndReopen(options);
+ }
+ }
+}
+
+TEST_F(DBTest2, PresetCompressionDictLocality) {
+ if (!ZSTD_Supported()) {
+ return;
+ }
+ // Verifies that compression dictionary is generated from local data. The
+ // verification simply checks all output SSTs have different compression
+ // dictionaries. We do not verify effectiveness as that'd likely be flaky in
+ // the future.
+  const int kNumEntriesPerFile = 1 << 10;  // 1K entries
+ const int kNumBytesPerEntry = 1 << 10; // 1KB
+ const int kNumFiles = 4;
+ Options options = CurrentOptions();
+ options.compression = kZSTD;
+ options.compression_opts.max_dict_bytes = 1 << 14; // 16KB
+ options.compression_opts.zstd_max_train_bytes = 1 << 18; // 256KB
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ options.target_file_size_base = kNumEntriesPerFile * kNumBytesPerEntry;
+ BlockBasedTableOptions table_options;
+ table_options.cache_index_and_filter_blocks = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ Reopen(options);
+
+ Random rnd(301);
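+    // Once the accumulated size exceeds max_size_, evict the first entry in
+    // key order; a single eviction per insert is enough for this mock.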
+ for (int i = 0; i < kNumFiles; ++i) {
+ for (int j = 0; j < kNumEntriesPerFile; ++j) {
+ ASSERT_OK(Put(Key(i * kNumEntriesPerFile + j),
+ rnd.RandomString(kNumBytesPerEntry)));
+ }
+ ASSERT_OK(Flush());
+ MoveFilesToLevel(1);
+ ASSERT_EQ(NumTableFilesAtLevel(1), i + 1);
+ }
+
+ // Store all the dictionaries generated during a full compaction.
+ std::vector<std::string> compression_dicts;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "BlockBasedTableBuilder::WriteCompressionDictBlock:RawDict",
+ [&](void* arg) {
+ compression_dicts.emplace_back(static_cast<Slice*>(arg)->ToString());
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ CompactRangeOptions compact_range_opts;
+ compact_range_opts.bottommost_level_compaction =
+ BottommostLevelCompaction::kForceOptimized;
+ ASSERT_OK(db_->CompactRange(compact_range_opts, nullptr, nullptr));
+
+ // Dictionary compression should not be so good as to compress four totally
+ // random files into one. If it does then there's probably something wrong
+ // with the test.
+ ASSERT_GT(NumTableFilesAtLevel(1), 1);
+
+ // Furthermore, there should be one compression dictionary generated per file.
+ // And they should all be different from each other.
+ ASSERT_EQ(NumTableFilesAtLevel(1),
+ static_cast<int>(compression_dicts.size()));
+ for (size_t i = 1; i < compression_dicts.size(); ++i) {
+ std::string& a = compression_dicts[i - 1];
+ std::string& b = compression_dicts[i];
+ size_t alen = a.size();
+ size_t blen = b.size();
+ ASSERT_TRUE(alen != blen || memcmp(a.data(), b.data(), alen) != 0);
+ }
+}
+
+class PresetCompressionDictTest
+ : public DBTestBase,
+ public testing::WithParamInterface<std::tuple<CompressionType, bool>> {
+ public:
+ PresetCompressionDictTest()
+ : DBTestBase("db_test2", false /* env_do_fsync */),
+ compression_type_(std::get<0>(GetParam())),
+ bottommost_(std::get<1>(GetParam())) {}
+
+ protected:
+ const CompressionType compression_type_;
+ const bool bottommost_;
+};
+
+INSTANTIATE_TEST_CASE_P(
+ DBTest2, PresetCompressionDictTest,
+ ::testing::Combine(::testing::ValuesIn(GetSupportedDictCompressions()),
+ ::testing::Bool()));
+
+TEST_P(PresetCompressionDictTest, Flush) {
+ // Verifies that dictionary is generated and written during flush only when
+ // `ColumnFamilyOptions::compression` enables dictionary. Also verifies the
+ // size of the dictionary is within expectations according to the limit on
+ // buffering set by `CompressionOptions::max_dict_buffer_bytes`.
+ const size_t kValueLen = 256;
+ const size_t kKeysPerFile = 1 << 10;
+ const size_t kDictLen = 16 << 10;
+ const size_t kBlockLen = 4 << 10;
+
+ Options options = CurrentOptions();
+ if (bottommost_) {
+ options.bottommost_compression = compression_type_;
+ options.bottommost_compression_opts.enabled = true;
+ options.bottommost_compression_opts.max_dict_bytes = kDictLen;
+ options.bottommost_compression_opts.max_dict_buffer_bytes = kBlockLen;
+ } else {
+ options.compression = compression_type_;
+ options.compression_opts.max_dict_bytes = kDictLen;
+ options.compression_opts.max_dict_buffer_bytes = kBlockLen;
+ }
+ options.memtable_factory.reset(test::NewSpecialSkipListFactory(kKeysPerFile));
+ options.statistics = CreateDBStatistics();
+ BlockBasedTableOptions bbto;
+ bbto.block_size = kBlockLen;
+ bbto.cache_index_and_filter_blocks = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ Reopen(options);
+
+ Random rnd(301);
+ for (size_t i = 0; i <= kKeysPerFile; ++i) {
+ ASSERT_OK(Put(Key(static_cast<int>(i)), rnd.RandomString(kValueLen)));
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+
+ // We can use `BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT` to detect whether a
+ // compression dictionary exists since dictionaries would be preloaded when
+ // the flush finishes.
+ if (bottommost_) {
+ // Flush is never considered bottommost. This should change in the future
+ // since flushed files may have nothing underneath them, like the one in
+ // this test case.
+ ASSERT_EQ(
+ TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT),
+ 0);
+ } else {
+ ASSERT_GT(
+ TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT),
+ 0);
+ // TODO(ajkr): fix the below assertion to work with ZSTD. The expectation on
+ // number of bytes needs to be adjusted in case the cached block is in
+ // ZSTD's digested dictionary format.
+ if (compression_type_ != kZSTD &&
+ compression_type_ != kZSTDNotFinalCompression) {
+ // Although we limited buffering to `kBlockLen`, there may be up to two
+ // blocks of data included in the dictionary since we only check limit
+ // after each block is built.
+ ASSERT_LE(TestGetTickerCount(options,
+ BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT),
+ 2 * kBlockLen);
+ }
+ }
+}
+
+TEST_P(PresetCompressionDictTest, CompactNonBottommost) {
+ // Verifies that dictionary is generated and written during compaction to
+ // non-bottommost level only when `ColumnFamilyOptions::compression` enables
+ // dictionary. Also verifies the size of the dictionary is within expectations
+ // according to the limit on buffering set by
+ // `CompressionOptions::max_dict_buffer_bytes`.
+ const size_t kValueLen = 256;
+ const size_t kKeysPerFile = 1 << 10;
+ const size_t kDictLen = 16 << 10;
+ const size_t kBlockLen = 4 << 10;
+
+ Options options = CurrentOptions();
+ if (bottommost_) {
+ options.bottommost_compression = compression_type_;
+ options.bottommost_compression_opts.enabled = true;
+ options.bottommost_compression_opts.max_dict_bytes = kDictLen;
+ options.bottommost_compression_opts.max_dict_buffer_bytes = kBlockLen;
+ } else {
+ options.compression = compression_type_;
+ options.compression_opts.max_dict_bytes = kDictLen;
+ options.compression_opts.max_dict_buffer_bytes = kBlockLen;
+ }
+ options.disable_auto_compactions = true;
+ options.statistics = CreateDBStatistics();
+ BlockBasedTableOptions bbto;
+ bbto.block_size = kBlockLen;
+ bbto.cache_index_and_filter_blocks = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ Reopen(options);
+
+ Random rnd(301);
+ for (size_t j = 0; j <= kKeysPerFile; ++j) {
+ ASSERT_OK(Put(Key(static_cast<int>(j)), rnd.RandomString(kValueLen)));
+ }
+ ASSERT_OK(Flush());
+ MoveFilesToLevel(2);
+
+ for (int i = 0; i < 2; ++i) {
+ for (size_t j = 0; j <= kKeysPerFile; ++j) {
+ ASSERT_OK(Put(Key(static_cast<int>(j)), rnd.RandomString(kValueLen)));
+ }
+ ASSERT_OK(Flush());
+ }
+#ifndef ROCKSDB_LITE
+ ASSERT_EQ("2,0,1", FilesPerLevel(0));
+#endif // ROCKSDB_LITE
+
+ uint64_t prev_compression_dict_bytes_inserted =
+ TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT);
+ // This L0->L1 compaction merges the two L0 files into L1. The produced L1
+ // file is not bottommost due to the existing L2 file covering the same key-
+ // range.
+ ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr));
+#ifndef ROCKSDB_LITE
+ ASSERT_EQ("0,1,1", FilesPerLevel(0));
+#endif // ROCKSDB_LITE
+ // We can use `BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT` to detect whether a
+ // compression dictionary exists since dictionaries would be preloaded when
+ // the compaction finishes.
+ if (bottommost_) {
+ ASSERT_EQ(
+ TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT),
+ prev_compression_dict_bytes_inserted);
+ } else {
+ ASSERT_GT(
+ TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT),
+ prev_compression_dict_bytes_inserted);
+ // TODO(ajkr): fix the below assertion to work with ZSTD. The expectation on
+ // number of bytes needs to be adjusted in case the cached block is in
+ // ZSTD's digested dictionary format.
+ if (compression_type_ != kZSTD &&
+ compression_type_ != kZSTDNotFinalCompression) {
+ // Although we limited buffering to `kBlockLen`, there may be up to two
+ // blocks of data included in the dictionary since we only check limit
+ // after each block is built.
+ ASSERT_LE(TestGetTickerCount(options,
+ BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT),
+ prev_compression_dict_bytes_inserted + 2 * kBlockLen);
+ }
+ }
+}
+
+TEST_P(PresetCompressionDictTest, CompactBottommost) {
+  // Verifies that dictionary is generated and written during compaction to the
+  // bottommost level when either `ColumnFamilyOptions::compression` or
+ // `ColumnFamilyOptions::bottommost_compression` enables dictionary. Also
+ // verifies the size of the dictionary is within expectations according to the
+ // limit on buffering set by `CompressionOptions::max_dict_buffer_bytes`.
+ const size_t kValueLen = 256;
+ const size_t kKeysPerFile = 1 << 10;
+ const size_t kDictLen = 16 << 10;
+ const size_t kBlockLen = 4 << 10;
+
+ Options options = CurrentOptions();
+ if (bottommost_) {
+ options.bottommost_compression = compression_type_;
+ options.bottommost_compression_opts.enabled = true;
+ options.bottommost_compression_opts.max_dict_bytes = kDictLen;
+ options.bottommost_compression_opts.max_dict_buffer_bytes = kBlockLen;
+ } else {
+ options.compression = compression_type_;
+ options.compression_opts.max_dict_bytes = kDictLen;
+ options.compression_opts.max_dict_buffer_bytes = kBlockLen;
+ }
+ options.disable_auto_compactions = true;
+ options.statistics = CreateDBStatistics();
+ BlockBasedTableOptions bbto;
+ bbto.block_size = kBlockLen;
+ bbto.cache_index_and_filter_blocks = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ Reopen(options);
+
+ Random rnd(301);
+ for (int i = 0; i < 2; ++i) {
+ for (size_t j = 0; j <= kKeysPerFile; ++j) {
+ ASSERT_OK(Put(Key(static_cast<int>(j)), rnd.RandomString(kValueLen)));
+ }
+ ASSERT_OK(Flush());
+ }
+#ifndef ROCKSDB_LITE
+ ASSERT_EQ("2", FilesPerLevel(0));
+#endif // ROCKSDB_LITE
+
+ uint64_t prev_compression_dict_bytes_inserted =
+ TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT);
+ CompactRangeOptions cro;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+#ifndef ROCKSDB_LITE
+ ASSERT_EQ("0,1", FilesPerLevel(0));
+#endif // ROCKSDB_LITE
+ ASSERT_GT(
+ TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT),
+ prev_compression_dict_bytes_inserted);
+ // TODO(ajkr): fix the below assertion to work with ZSTD. The expectation on
+ // number of bytes needs to be adjusted in case the cached block is in ZSTD's
+ // digested dictionary format.
+ if (compression_type_ != kZSTD &&
+ compression_type_ != kZSTDNotFinalCompression) {
+ // Although we limited buffering to `kBlockLen`, there may be up to two
+ // blocks of data included in the dictionary since we only check limit after
+ // each block is built.
+ ASSERT_LE(
+ TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT),
+ prev_compression_dict_bytes_inserted + 2 * kBlockLen);
+ }
+}
+
+class CompactionCompressionListener : public EventListener {
+ public:
+ explicit CompactionCompressionListener(Options* db_options)
+ : db_options_(db_options) {}
+
+ void OnCompactionCompleted(DB* db, const CompactionJobInfo& ci) override {
+ // Figure out last level with files
+ int bottommost_level = 0;
+ for (int level = 0; level < db->NumberLevels(); level++) {
+ std::string files_at_level;
+ ASSERT_TRUE(
+ db->GetProperty("rocksdb.num-files-at-level" + std::to_string(level),
+ &files_at_level));
+ if (files_at_level != "0") {
+ bottommost_level = level;
+ }
+ }
+
+ if (db_options_->bottommost_compression != kDisableCompressionOption &&
+ ci.output_level == bottommost_level) {
+ ASSERT_EQ(ci.compression, db_options_->bottommost_compression);
+ } else if (db_options_->compression_per_level.size() != 0) {
+ ASSERT_EQ(ci.compression,
+ db_options_->compression_per_level[ci.output_level]);
+ } else {
+ ASSERT_EQ(ci.compression, db_options_->compression);
+ }
+ max_level_checked = std::max(max_level_checked, ci.output_level);
+ }
+
+ int max_level_checked = 0;
+ const Options* db_options_;
+};
+
+enum CompressionFailureType {
+ kTestCompressionFail,
+ kTestDecompressionFail,
+ kTestDecompressionCorruption
+};
+
+class CompressionFailuresTest
+ : public DBTest2,
+ public testing::WithParamInterface<std::tuple<
+ CompressionFailureType, CompressionType, uint32_t, uint32_t>> {
+ public:
+ CompressionFailuresTest() {
+ std::tie(compression_failure_type_, compression_type_,
+ compression_max_dict_bytes_, compression_parallel_threads_) =
+ GetParam();
+ }
+
+ CompressionFailureType compression_failure_type_ = kTestCompressionFail;
+ CompressionType compression_type_ = kNoCompression;
+ uint32_t compression_max_dict_bytes_ = 0;
+ uint32_t compression_parallel_threads_ = 0;
+};
+
+INSTANTIATE_TEST_CASE_P(
+ DBTest2, CompressionFailuresTest,
+ ::testing::Combine(::testing::Values(kTestCompressionFail,
+ kTestDecompressionFail,
+ kTestDecompressionCorruption),
+ ::testing::ValuesIn(GetSupportedCompressions()),
+ ::testing::Values(0, 10), ::testing::Values(1, 4)));
+
+TEST_P(CompressionFailuresTest, CompressionFailures) {
+ if (compression_type_ == kNoCompression) {
+ return;
+ }
+
+ Options options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = 2;
+ options.max_bytes_for_level_base = 1024;
+ options.max_bytes_for_level_multiplier = 2;
+ options.num_levels = 7;
+ options.max_background_compactions = 1;
+ options.target_file_size_base = 512;
+
+ BlockBasedTableOptions table_options;
+ table_options.block_size = 512;
+ table_options.verify_compression = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ options.compression = compression_type_;
+ options.compression_opts.parallel_threads = compression_parallel_threads_;
+ options.compression_opts.max_dict_bytes = compression_max_dict_bytes_;
+ options.bottommost_compression_opts.parallel_threads =
+ compression_parallel_threads_;
+ options.bottommost_compression_opts.max_dict_bytes =
+ compression_max_dict_bytes_;
+
+ if (compression_failure_type_ == kTestCompressionFail) {
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "CompressData:TamperWithReturnValue", [](void* arg) {
+ bool* ret = static_cast<bool*>(arg);
+ *ret = false;
+ });
+ } else if (compression_failure_type_ == kTestDecompressionFail) {
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "UncompressBlockData:TamperWithReturnValue", [](void* arg) {
+ Status* ret = static_cast<Status*>(arg);
+ ASSERT_OK(*ret);
+ *ret = Status::Corruption("kTestDecompressionFail");
+ });
+ } else if (compression_failure_type_ == kTestDecompressionCorruption) {
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "UncompressBlockData:"
+ "TamperWithDecompressionOutput",
+ [](void* arg) {
+ BlockContents* contents = static_cast<BlockContents*>(arg);
+ // Ensure uncompressed data != original data
+ const size_t len = contents->data.size() + 1;
+ std::unique_ptr<char[]> fake_data(new char[len]());
+ *contents = BlockContents(std::move(fake_data), len);
+ });
+ }
+
+ std::map<std::string, std::string> key_value_written;
+
+ const int kKeySize = 5;
+ const int kValUnitSize = 16;
+ const int kValSize = 256;
+ Random rnd(405);
+
+ Status s = Status::OK();
+
+ DestroyAndReopen(options);
+ // Write 10 random files
+ for (int i = 0; i < 10; i++) {
+ for (int j = 0; j < 5; j++) {
+ std::string key = rnd.RandomString(kKeySize);
+ // Ensure good compression ratio
+ std::string valueUnit = rnd.RandomString(kValUnitSize);
+ std::string value;
+ for (int k = 0; k < kValSize; k += kValUnitSize) {
+ value += valueUnit;
+ }
+ s = Put(key, value);
+ if (compression_failure_type_ == kTestCompressionFail) {
+ key_value_written[key] = value;
+ ASSERT_OK(s);
+ }
+ }
+ s = Flush();
+ if (compression_failure_type_ == kTestCompressionFail) {
+ ASSERT_OK(s);
+ }
+ s = dbfull()->TEST_WaitForCompact();
+ if (compression_failure_type_ == kTestCompressionFail) {
+ ASSERT_OK(s);
+ }
+ if (i == 4) {
+      // Start injecting failures midway through writing the files
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ }
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+ if (compression_failure_type_ == kTestCompressionFail) {
+    // Compression failures fall back to storing blocks uncompressed; check
+    // content consistency
+ std::unique_ptr<Iterator> db_iter(db_->NewIterator(ReadOptions()));
+ for (db_iter->SeekToFirst(); db_iter->Valid(); db_iter->Next()) {
+ std::string key = db_iter->key().ToString();
+ std::string value = db_iter->value().ToString();
+ ASSERT_NE(key_value_written.find(key), key_value_written.end());
+ ASSERT_EQ(key_value_written[key], value);
+ key_value_written.erase(key);
+ }
+ ASSERT_EQ(0, key_value_written.size());
+ } else if (compression_failure_type_ == kTestDecompressionFail) {
+ ASSERT_EQ(std::string(s.getState()),
+ "Could not decompress: kTestDecompressionFail");
+ } else if (compression_failure_type_ == kTestDecompressionCorruption) {
+ ASSERT_EQ(std::string(s.getState()),
+ "Decompressed block did not match pre-compression block");
+ }
+}
+
+TEST_F(DBTest2, CompressionOptions) {
+ if (!Zlib_Supported() || !Snappy_Supported()) {
+ return;
+ }
+
+ Options options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = 2;
+ options.max_bytes_for_level_base = 100;
+ options.max_bytes_for_level_multiplier = 2;
+ options.num_levels = 7;
+ options.max_background_compactions = 1;
+
+ CompactionCompressionListener* listener =
+ new CompactionCompressionListener(&options);
+ options.listeners.emplace_back(listener);
+
+ const int kKeySize = 5;
+ const int kValSize = 20;
+ Random rnd(301);
+
+ std::vector<uint32_t> compression_parallel_threads = {1, 4};
+
+ std::map<std::string, std::string> key_value_written;
+
+ for (int iter = 0; iter <= 2; iter++) {
+ listener->max_level_checked = 0;
+
+ if (iter == 0) {
+ // Use different compression algorithms for different levels but
+ // always use Zlib for bottommost level
+ options.compression_per_level = {kNoCompression, kNoCompression,
+ kNoCompression, kSnappyCompression,
+ kSnappyCompression, kSnappyCompression,
+ kZlibCompression};
+ options.compression = kNoCompression;
+ options.bottommost_compression = kZlibCompression;
+ } else if (iter == 1) {
+      // Use Snappy everywhere except the bottommost level, which uses Zlib
+ options.compression_per_level = {};
+ options.compression = kSnappyCompression;
+ options.bottommost_compression = kZlibCompression;
+ } else if (iter == 2) {
+ // Use Snappy everywhere
+ options.compression_per_level = {};
+ options.compression = kSnappyCompression;
+ options.bottommost_compression = kDisableCompressionOption;
+ }
+
+ for (auto num_threads : compression_parallel_threads) {
+ options.compression_opts.parallel_threads = num_threads;
+ options.bottommost_compression_opts.parallel_threads = num_threads;
+
+ DestroyAndReopen(options);
+ // Write 10 random files
+ for (int i = 0; i < 10; i++) {
+ for (int j = 0; j < 5; j++) {
+ std::string key = rnd.RandomString(kKeySize);
+ std::string value = rnd.RandomString(kValSize);
+ key_value_written[key] = value;
+ ASSERT_OK(Put(key, value));
+ }
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+
+ // Make sure that we wrote enough to check all 7 levels
+ ASSERT_EQ(listener->max_level_checked, 6);
+
+ // Make sure database content is the same as key_value_written
+ std::unique_ptr<Iterator> db_iter(db_->NewIterator(ReadOptions()));
+ for (db_iter->SeekToFirst(); db_iter->Valid(); db_iter->Next()) {
+ std::string key = db_iter->key().ToString();
+ std::string value = db_iter->value().ToString();
+ ASSERT_NE(key_value_written.find(key), key_value_written.end());
+ ASSERT_EQ(key_value_written[key], value);
+ key_value_written.erase(key);
+ }
+ ASSERT_OK(db_iter->status());
+ ASSERT_EQ(0, key_value_written.size());
+ }
+ }
+}
+
+class CompactionStallTestListener : public EventListener {
+ public:
+ CompactionStallTestListener()
+ : compacting_files_cnt_(0), compacted_files_cnt_(0) {}
+
+ void OnCompactionBegin(DB* /*db*/, const CompactionJobInfo& ci) override {
+ ASSERT_EQ(ci.cf_name, "default");
+ ASSERT_EQ(ci.base_input_level, 0);
+ ASSERT_EQ(ci.compaction_reason, CompactionReason::kLevelL0FilesNum);
+ compacting_files_cnt_ += ci.input_files.size();
+ }
+
+ void OnCompactionCompleted(DB* /*db*/, const CompactionJobInfo& ci) override {
+ ASSERT_EQ(ci.cf_name, "default");
+ ASSERT_EQ(ci.base_input_level, 0);
+ ASSERT_EQ(ci.compaction_reason, CompactionReason::kLevelL0FilesNum);
+ compacted_files_cnt_ += ci.input_files.size();
+ }
+
+ std::atomic<size_t> compacting_files_cnt_;
+ std::atomic<size_t> compacted_files_cnt_;
+};
+
+TEST_F(DBTest2, CompactionStall) {
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::BGWorkCompaction", "DBTest2::CompactionStall:0"},
+ {"DBImpl::BGWorkCompaction", "DBTest2::CompactionStall:1"},
+ {"DBTest2::CompactionStall:2",
+ "DBImpl::NotifyOnCompactionBegin::UnlockMutex"},
+ {"DBTest2::CompactionStall:3",
+ "DBImpl::NotifyOnCompactionCompleted::UnlockMutex"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Options options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = 4;
+ options.max_background_compactions = 40;
+ CompactionStallTestListener* listener = new CompactionStallTestListener();
+ options.listeners.emplace_back(listener);
+ DestroyAndReopen(options);
+ // make sure all background compaction jobs can be scheduled
+ auto stop_token =
+ dbfull()->TEST_write_controler().GetCompactionPressureToken();
+
+ Random rnd(301);
+
+ // 4 Files in L0
+ for (int i = 0; i < 4; i++) {
+ for (int j = 0; j < 10; j++) {
+ ASSERT_OK(Put(rnd.RandomString(10), rnd.RandomString(10)));
+ }
+ ASSERT_OK(Flush());
+ }
+
+ // Wait for compaction to be triggered
+ TEST_SYNC_POINT("DBTest2::CompactionStall:0");
+
+  // Clear the "DBImpl::BGWorkCompaction" sync point trace since we want to
+  // hold it again at DBTest2::CompactionStall:1
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearTrace();
+
+ // Another 6 L0 files to trigger compaction again
+ for (int i = 0; i < 6; i++) {
+ for (int j = 0; j < 10; j++) {
+ ASSERT_OK(Put(rnd.RandomString(10), rnd.RandomString(10)));
+ }
+ ASSERT_OK(Flush());
+ }
+
+ // Wait for another compaction to be triggered
+ TEST_SYNC_POINT("DBTest2::CompactionStall:1");
+
+ // Hold NotifyOnCompactionBegin in the unlock mutex section
+ TEST_SYNC_POINT("DBTest2::CompactionStall:2");
+
+ // Hold NotifyOnCompactionCompleted in the unlock mutex section
+ TEST_SYNC_POINT("DBTest2::CompactionStall:3");
+
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_LT(NumTableFilesAtLevel(0),
+ options.level0_file_num_compaction_trigger);
+ ASSERT_GT(listener->compacted_files_cnt_.load(),
+ 10 - options.level0_file_num_compaction_trigger);
+ ASSERT_EQ(listener->compacting_files_cnt_.load(),
+ listener->compacted_files_cnt_.load());
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+#endif // ROCKSDB_LITE
+
+TEST_F(DBTest2, FirstSnapshotTest) {
+ Options options;
+ options.write_buffer_size = 100000; // Small write buffer
+ options = CurrentOptions(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+  // This snapshot will have sequence number 0, which is the expected
+  // behaviour.
+ const Snapshot* s1 = db_->GetSnapshot();
+
+ ASSERT_OK(Put(1, "k1", std::string(100000, 'x'))); // Fill memtable
+ ASSERT_OK(Put(1, "k2", std::string(100000, 'y'))); // Trigger flush
+
+ db_->ReleaseSnapshot(s1);
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest2, DuplicateSnapshot) {
+ Options options;
+ options = CurrentOptions(options);
+ std::vector<const Snapshot*> snapshots;
+ DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
+ SequenceNumber oldest_ww_snap, first_ww_snap;
+
+ ASSERT_OK(Put("k", "v")); // inc seq
+ snapshots.push_back(db_->GetSnapshot());
+ snapshots.push_back(db_->GetSnapshot());
+ ASSERT_OK(Put("k", "v")); // inc seq
+ snapshots.push_back(db_->GetSnapshot());
+ snapshots.push_back(dbi->GetSnapshotForWriteConflictBoundary());
+ first_ww_snap = snapshots.back()->GetSequenceNumber();
+ ASSERT_OK(Put("k", "v")); // inc seq
+ snapshots.push_back(dbi->GetSnapshotForWriteConflictBoundary());
+ snapshots.push_back(db_->GetSnapshot());
+ ASSERT_OK(Put("k", "v")); // inc seq
+ snapshots.push_back(db_->GetSnapshot());
+
+ {
+ InstrumentedMutexLock l(dbi->mutex());
+ auto seqs = dbi->snapshots().GetAll(&oldest_ww_snap);
+ ASSERT_EQ(seqs.size(), 4); // duplicates are not counted
+ ASSERT_EQ(oldest_ww_snap, first_ww_snap);
+ }
+
+ for (auto s : snapshots) {
+ db_->ReleaseSnapshot(s);
+ }
+}
+#endif // ROCKSDB_LITE
+
+class PinL0IndexAndFilterBlocksTest
+ : public DBTestBase,
+ public testing::WithParamInterface<std::tuple<bool, bool>> {
+ public:
+ PinL0IndexAndFilterBlocksTest()
+ : DBTestBase("db_pin_l0_index_bloom_test", /*env_do_fsync=*/true) {}
+ void SetUp() override {
+ infinite_max_files_ = std::get<0>(GetParam());
+ disallow_preload_ = std::get<1>(GetParam());
+ }
+
+ void CreateTwoLevels(Options* options, bool close_afterwards) {
+ if (infinite_max_files_) {
+ options->max_open_files = -1;
+ }
+ options->create_if_missing = true;
+ options->statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ BlockBasedTableOptions table_options;
+ table_options.cache_index_and_filter_blocks = true;
+ table_options.pin_l0_filter_and_index_blocks_in_cache = true;
+ table_options.filter_policy.reset(NewBloomFilterPolicy(20));
+ options->table_factory.reset(NewBlockBasedTableFactory(table_options));
+ CreateAndReopenWithCF({"pikachu"}, *options);
+
+ ASSERT_OK(Put(1, "a", "begin"));
+ ASSERT_OK(Put(1, "z", "end"));
+ ASSERT_OK(Flush(1));
+ // move this table to L1
+ ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]));
+
+ // reset block cache
+ table_options.block_cache = NewLRUCache(64 * 1024);
+ options->table_factory.reset(NewBlockBasedTableFactory(table_options));
+ TryReopenWithColumnFamilies({"default", "pikachu"}, *options);
+ // create new table at L0
+ ASSERT_OK(Put(1, "a2", "begin2"));
+ ASSERT_OK(Put(1, "z2", "end2"));
+ ASSERT_OK(Flush(1));
+
+ if (close_afterwards) {
+ Close(); // This ensures that there is no ref to block cache entries
+ }
+ table_options.block_cache->EraseUnRefEntries();
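+    // Drop all entries not currently referenced so that only blocks pinned by
+    // open table readers (if any) remain before the test takes its ticker
+    // baselines.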
+ }
+
+ bool infinite_max_files_;
+ bool disallow_preload_;
+};
+
+TEST_P(PinL0IndexAndFilterBlocksTest,
+ IndexAndFilterBlocksOfNewTableAddedToCacheWithPinning) {
+ Options options = CurrentOptions();
+ if (infinite_max_files_) {
+ options.max_open_files = -1;
+ }
+ options.create_if_missing = true;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ BlockBasedTableOptions table_options;
+ table_options.cache_index_and_filter_blocks = true;
+ table_options.pin_l0_filter_and_index_blocks_in_cache = true;
+ table_options.filter_policy.reset(NewBloomFilterPolicy(20));
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ ASSERT_OK(Put(1, "key", "val"));
+ // Create a new table.
+ ASSERT_OK(Flush(1));
+
+ // index/filter blocks added to block cache right after table creation.
+ ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+ ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+ ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+ ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+
+ // only index/filter were added
+ ASSERT_EQ(2, TestGetTickerCount(options, BLOCK_CACHE_ADD));
+ ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_DATA_MISS));
+
+ std::string value;
+  // Miss and hit counts should remain the same; these blocks are all pinned.
+ ASSERT_TRUE(db_->KeyMayExist(ReadOptions(), handles_[1], "key", &value));
+ ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+ ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+ ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+ ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+
+  // Miss and hit counts should remain the same; these blocks are all pinned.
+ value = Get(1, "key");
+ ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+ ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+ ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+ ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+}
+
+TEST_P(PinL0IndexAndFilterBlocksTest,
+ MultiLevelIndexAndFilterBlocksCachedWithPinning) {
+ Options options = CurrentOptions();
+ PinL0IndexAndFilterBlocksTest::CreateTwoLevels(&options, false);
+ // get base cache values
+ uint64_t fm = TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS);
+ uint64_t fh = TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT);
+ uint64_t im = TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS);
+ uint64_t ih = TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT);
+
+ std::string value;
+ // this should be read from L0
+ // so cache values don't change
+ value = Get(1, "a2");
+ ASSERT_EQ(fm, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+ ASSERT_EQ(fh, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+ ASSERT_EQ(im, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+ ASSERT_EQ(ih, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+
+  // This should be read from L1: the file is opened and prefetching results
+  // in a cache filter miss, the block is loaded and added to the cache, and
+  // then the Get results in a cache hit for L1. Even with infinite
+  // max_open_files there is still a cache miss because we have reset the
+  // block cache.
+ value = Get(1, "a");
+ ASSERT_EQ(fm + 1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+ ASSERT_EQ(im + 1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+}
+
+TEST_P(PinL0IndexAndFilterBlocksTest, DisablePrefetchingNonL0IndexAndFilter) {
+ Options options = CurrentOptions();
+ // This ensures that db does not ref anything in the block cache, so
+ // EraseUnRefEntries could clear them up.
+ bool close_afterwards = true;
+ PinL0IndexAndFilterBlocksTest::CreateTwoLevels(&options, close_afterwards);
+
+ // Get base cache values
+ uint64_t fm = TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS);
+ uint64_t fh = TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT);
+ uint64_t im = TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS);
+ uint64_t ih = TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT);
+
+ if (disallow_preload_) {
+ // Now we have two files. We narrow the max open files to allow 3 entries
+ // so that preloading SST files won't happen.
+ options.max_open_files = 13;
+    // RocksDB sanitizes max_open_files to at least 20; modify it back.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) {
+ int* max_open_files = static_cast<int*>(arg);
+ *max_open_files = 13;
+ });
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // Reopen database. If max_open_files is set as -1, table readers will be
+ // preloaded. This will trigger a BlockBasedTable::Open() and prefetch
+ // L0 index and filter. Level 1's prefetching is disabled in DB::Open()
+ TryReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+ if (!disallow_preload_) {
+    // After reopen, cache misses are increased by one because we read (and
+    // only read) filter and index on L0
+ ASSERT_EQ(fm + 1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+ ASSERT_EQ(fh, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+ ASSERT_EQ(im + 1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+ ASSERT_EQ(ih, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+ } else {
+ // If max_open_files is not -1, we do not preload table readers, so there is
+ // no change.
+ ASSERT_EQ(fm, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+ ASSERT_EQ(fh, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+ ASSERT_EQ(im, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+ ASSERT_EQ(ih, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+ }
+ std::string value;
+ // this should be read from L0
+ value = Get(1, "a2");
+ // If max_open_files is -1, we have pinned index and filter in Rep, so there
+ // will not be changes in index and filter misses or hits. If max_open_files
+ // is not -1, Get() will open a TableReader and prefetch index and filter.
+ ASSERT_EQ(fm + 1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+ ASSERT_EQ(fh, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+ ASSERT_EQ(im + 1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+ ASSERT_EQ(ih, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+
+ // this should be read from L1
+ value = Get(1, "a");
+ if (!disallow_preload_) {
+    // In the infinite max-open-files case, there's a cache miss when executing
+    // Get() because index and filter were not prefetched before.
+ ASSERT_EQ(fm + 2, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+ ASSERT_EQ(fh, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+ ASSERT_EQ(im + 2, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+ ASSERT_EQ(ih, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+ } else {
+    // In this case, cache misses increase by one in BlockBasedTable::Open()
+    // because this is not the DB::Open() code path, so L1's index and filter
+    // are prefetched here. Cache hits also increase by one because Get() reads
+    // the index and filter from the block cache prefetched by the previous
+    // Open() call.
+ ASSERT_EQ(fm + 2, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+ ASSERT_EQ(fh + 1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+ ASSERT_EQ(im + 2, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+ ASSERT_EQ(ih + 1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+ }
+
+  // Force a full compaction into one single file. There will be a block cache
+  // read for both the index and the filter. If prefetching doesn't happen
+  // explicitly, it will happen when verifying the file.
+ Compact(1, "a", "zzzzz");
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ if (!disallow_preload_) {
+ ASSERT_EQ(fm + 3, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+ ASSERT_EQ(fh, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+ ASSERT_EQ(im + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+ ASSERT_EQ(ih + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+ } else {
+ ASSERT_EQ(fm + 3, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+ ASSERT_EQ(fh + 1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+ ASSERT_EQ(im + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+ ASSERT_EQ(ih + 4, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+ }
+
+ // Bloom and index hit will happen when a Get() happens.
+ value = Get(1, "a");
+ if (!disallow_preload_) {
+ ASSERT_EQ(fm + 3, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+ ASSERT_EQ(fh + 1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+ ASSERT_EQ(im + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+ ASSERT_EQ(ih + 4, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+ } else {
+ ASSERT_EQ(fm + 3, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+ ASSERT_EQ(fh + 2, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+ ASSERT_EQ(im + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+ ASSERT_EQ(ih + 5, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+ }
+}
+
+INSTANTIATE_TEST_CASE_P(PinL0IndexAndFilterBlocksTest,
+ PinL0IndexAndFilterBlocksTest,
+ ::testing::Values(std::make_tuple(true, false),
+ std::make_tuple(false, false),
+ std::make_tuple(false, true)));
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest2, MaxCompactionBytesTest) {
+ Options options = CurrentOptions();
+ options.memtable_factory.reset(test::NewSpecialSkipListFactory(
+ DBTestBase::kNumKeysByGenerateNewRandomFile));
+ options.compaction_style = kCompactionStyleLevel;
+ options.write_buffer_size = 200 << 10;
+ options.arena_block_size = 4 << 10;
+ options.level0_file_num_compaction_trigger = 4;
+ options.num_levels = 4;
+ options.compression = kNoCompression;
+ options.max_bytes_for_level_base = 450 << 10;
+ options.target_file_size_base = 100 << 10;
+ // Infinite for full compaction.
+ options.max_compaction_bytes = options.target_file_size_base * 100;
+
+ Reopen(options);
+
+ Random rnd(301);
+
+ for (int num = 0; num < 8; num++) {
+ GenerateNewRandomFile(&rnd);
+ }
+ CompactRangeOptions cro;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ ASSERT_EQ("0,0,8", FilesPerLevel(0));
+
+ // When compact from Ln -> Ln+1, cut a file if the file overlaps with
+ // more than three files in Ln+1.
+ options.max_compaction_bytes = options.target_file_size_base * 3;
+ Reopen(options);
+
+ GenerateNewRandomFile(&rnd);
+ // Add three more small files that overlap with the previous file
+ for (int i = 0; i < 3; i++) {
+ ASSERT_OK(Put("a", "z"));
+ ASSERT_OK(Flush());
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  // Output files to L1 are cut into 4 pieces according to
+  // options.max_compaction_bytes (300K).
+  // There are 8 files on L2 (the grandparents level), each one 100K. The first
+  // output file overlaps with a and b, which keeps the compaction under the
+  // 300K max_compaction_bytes; the second overlaps with d and e, which is also
+  // under 300K. Including any extra grandparent file would make a future
+  // compaction larger than 300K.
+ // L1: [ 1 ] [ 2 ] [ 3 ] [ 4 ]
+ // L2: [a] [b] [c] [d] [e] [f] [g] [h]
+ ASSERT_EQ("0,4,8", FilesPerLevel(0));
+}
+
+static void UniqueIdCallback(void* arg) {
+ int* result = reinterpret_cast<int*>(arg);
+ if (*result == -1) {
+ *result = 0;
+ }
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearTrace();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "GetUniqueIdFromFile:FS_IOC_GETVERSION", UniqueIdCallback);
+}
+
+class MockPersistentCache : public PersistentCache {
+ public:
+ explicit MockPersistentCache(const bool is_compressed, const size_t max_size)
+ : is_compressed_(is_compressed), max_size_(max_size) {
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "GetUniqueIdFromFile:FS_IOC_GETVERSION", UniqueIdCallback);
+ }
+
+ ~MockPersistentCache() override {}
+
+ PersistentCache::StatsType Stats() override {
+ return PersistentCache::StatsType();
+ }
+
+ uint64_t NewId() override {
+ return last_id_.fetch_add(1, std::memory_order_relaxed);
+ }
+
+ Status Insert(const Slice& page_key, const char* data,
+ const size_t size) override {
+ MutexLock _(&lock_);
+
+ if (size_ > max_size_) {
+ size_ -= data_.begin()->second.size();
+ data_.erase(data_.begin());
+ }
+
+ data_.insert(std::make_pair(page_key.ToString(), std::string(data, size)));
+ size_ += size;
+ return Status::OK();
+ }
+
+ Status Lookup(const Slice& page_key, std::unique_ptr<char[]>* data,
+ size_t* size) override {
+ MutexLock _(&lock_);
+ auto it = data_.find(page_key.ToString());
+ if (it == data_.end()) {
+ return Status::NotFound();
+ }
+
+ assert(page_key.ToString() == it->first);
+ data->reset(new char[it->second.size()]);
+ memcpy(data->get(), it->second.c_str(), it->second.size());
+ *size = it->second.size();
+ return Status::OK();
+ }
+
+ bool IsCompressed() override { return is_compressed_; }
+
+ std::string GetPrintableOptions() const override {
+ return "MockPersistentCache";
+ }
+
+ port::Mutex lock_;
+ std::map<std::string, std::string> data_;
+ const bool is_compressed_ = true;
+ size_t size_ = 0;
+ const size_t max_size_ = 10 * 1024; // 10KiB
+ std::atomic<uint64_t> last_id_{1};
+};
+
+#ifdef OS_LINUX
+ // Make sure that the CPU time perf context counters use Env::NowCPUNanos()
+ // rather than the wall-clock Env::NowNanos().
+TEST_F(DBTest2, TestPerfContextGetCpuTime) {
+ // Force resizing of the table cache so the table handle is not preloaded,
+ // letting us measure find_table_nanos during Get().
+ dbfull()->TEST_table_cache()->SetCapacity(0);
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Flush());
+ env_->now_cpu_count_.store(0);
+ env_->SetMockSleep();
+
+ // NOTE: Presumed unnecessary and removed: resetting mock time in env
+
+ // CPU timing is not enabled with kEnableTimeExceptForMutex
+ SetPerfLevel(PerfLevel::kEnableTimeExceptForMutex);
+ ASSERT_EQ("bar", Get("foo"));
+ ASSERT_EQ(0, get_perf_context()->get_cpu_nanos);
+ ASSERT_EQ(0, env_->now_cpu_count_.load());
+
+ constexpr uint64_t kDummyAddonSeconds = uint64_t{1000000};
+ constexpr uint64_t kDummyAddonNanos = 1000000000U * kDummyAddonSeconds;
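+
+ // MockSleepForSeconds() below advances only the mocked wall clock, so
+ // NowNanos()-based counters such as find_table_nanos should exceed
+ // kDummyAddonNanos while the CPU-time counters should stay far below it.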
+
+ // Add time to NowNanos() reading.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "TableCache::FindTable:0",
+ [&](void* /*arg*/) { env_->MockSleepForSeconds(kDummyAddonSeconds); });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ SetPerfLevel(PerfLevel::kEnableTimeAndCPUTimeExceptForMutex);
+ ASSERT_EQ("bar", Get("foo"));
+ ASSERT_GT(env_->now_cpu_count_.load(), 2);
+ ASSERT_LT(get_perf_context()->get_cpu_nanos, kDummyAddonNanos);
+ ASSERT_GT(get_perf_context()->find_table_nanos, kDummyAddonNanos);
+
+ SetPerfLevel(PerfLevel::kDisable);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBTest2, TestPerfContextIterCpuTime) {
+ DestroyAndReopen(CurrentOptions());
+ // Force resizing of the table cache so the table handle is not preloaded,
+ // letting us measure find_table_nanos during iteration.
+ dbfull()->TEST_table_cache()->SetCapacity(0);
+
+ const size_t kNumEntries = 10;
+ for (size_t i = 0; i < kNumEntries; ++i) {
+ ASSERT_OK(Put("k" + std::to_string(i), "v" + std::to_string(i)));
+ }
+ ASSERT_OK(Flush());
+ for (size_t i = 0; i < kNumEntries; ++i) {
+ ASSERT_EQ("v" + std::to_string(i), Get("k" + std::to_string(i)));
+ }
+ std::string last_key = "k" + std::to_string(kNumEntries - 1);
+ std::string last_value = "v" + std::to_string(kNumEntries - 1);
+ env_->now_cpu_count_.store(0);
+ env_->SetMockSleep();
+
+ // NOTE: Presumed unnecessary and removed: resetting mock time in env
+
+ // CPU timing is not enabled with kEnableTimeExceptForMutex
+ SetPerfLevel(PerfLevel::kEnableTimeExceptForMutex);
+ Iterator* iter = db_->NewIterator(ReadOptions());
+ iter->Seek("k0");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("v0", iter->value().ToString());
+ iter->SeekForPrev(last_key);
+ ASSERT_TRUE(iter->Valid());
+ iter->SeekToLast();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(last_value, iter->value().ToString());
+ iter->SeekToFirst();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("v0", iter->value().ToString());
+ ASSERT_EQ(0, get_perf_context()->iter_seek_cpu_nanos);
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("v1", iter->value().ToString());
+ ASSERT_EQ(0, get_perf_context()->iter_next_cpu_nanos);
+ iter->Prev();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ("v0", iter->value().ToString());
+ ASSERT_EQ(0, get_perf_context()->iter_prev_cpu_nanos);
+ ASSERT_EQ(0, env_->now_cpu_count_.load());
+ delete iter;
+
+ constexpr uint64_t kDummyAddonSeconds = uint64_t{1000000};
+ constexpr uint64_t kDummyAddonNanos = 1000000000U * kDummyAddonSeconds;
+
+ // Add time to NowNanos() reading.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "TableCache::FindTable:0",
+ [&](void* /*arg*/) { env_->MockSleepForSeconds(kDummyAddonSeconds); });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ SetPerfLevel(PerfLevel::kEnableTimeAndCPUTimeExceptForMutex);
+ iter = db_->NewIterator(ReadOptions());
+ iter->Seek("k0");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("v0", iter->value().ToString());
+ iter->SeekForPrev(last_key);
+ ASSERT_TRUE(iter->Valid());
+ iter->SeekToLast();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(last_value, iter->value().ToString());
+ iter->SeekToFirst();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("v0", iter->value().ToString());
+ ASSERT_GT(get_perf_context()->iter_seek_cpu_nanos, 0);
+ ASSERT_LT(get_perf_context()->iter_seek_cpu_nanos, kDummyAddonNanos);
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("v1", iter->value().ToString());
+ ASSERT_GT(get_perf_context()->iter_next_cpu_nanos, 0);
+ ASSERT_LT(get_perf_context()->iter_next_cpu_nanos, kDummyAddonNanos);
+ iter->Prev();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ("v0", iter->value().ToString());
+ ASSERT_GT(get_perf_context()->iter_prev_cpu_nanos, 0);
+ ASSERT_LT(get_perf_context()->iter_prev_cpu_nanos, kDummyAddonNanos);
+ ASSERT_GE(env_->now_cpu_count_.load(), 12);
+ ASSERT_GT(get_perf_context()->find_table_nanos, kDummyAddonNanos);
+
+ SetPerfLevel(PerfLevel::kDisable);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ delete iter;
+}
+#endif // OS_LINUX
+
+#if !defined OS_SOLARIS
+TEST_F(DBTest2, PersistentCache) {
+ int num_iter = 80;
+
+ Options options;
+ options.write_buffer_size = 64 * 1024; // small write buffer
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ options = CurrentOptions(options);
+
+ auto bsizes = {/*no block cache*/ 0, /*1M*/ 1 * 1024 * 1024};
+ auto types = {/*compressed*/ 1, /*uncompressed*/ 0};
+ for (auto bsize : bsizes) {
+ for (auto type : types) {
+ BlockBasedTableOptions table_options;
+ table_options.persistent_cache.reset(
+ new MockPersistentCache(type, 10 * 1024));
+ table_options.no_block_cache = true;
+ table_options.block_cache = bsize ? NewLRUCache(bsize) : nullptr;
+ table_options.block_cache_compressed = nullptr;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+ // default column family doesn't have block cache
+ Options no_block_cache_opts;
+ no_block_cache_opts.statistics = options.statistics;
+ no_block_cache_opts = CurrentOptions(no_block_cache_opts);
+ BlockBasedTableOptions table_options_no_bc;
+ table_options_no_bc.no_block_cache = true;
+ no_block_cache_opts.table_factory.reset(
+ NewBlockBasedTableFactory(table_options_no_bc));
+ ReopenWithColumnFamilies(
+ {"default", "pikachu"},
+ std::vector<Options>({no_block_cache_opts, options}));
+
+ Random rnd(301);
+
+ // Write 80 values of ~1KB each (every group of 4 shares the same payload)
+ ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+ std::vector<std::string> values;
+ std::string str;
+ for (int i = 0; i < num_iter; i++) {
+ if (i % 4 == 0) { // high compression ratio
+ str = rnd.RandomString(1000);
+ }
+ values.push_back(str);
+ ASSERT_OK(Put(1, Key(i), values[i]));
+ }
+
+ // Flush all data from the memtable so that reads go through the table files
+ // (and thus the persistent cache)
+ ASSERT_OK(Flush(1));
+
+ for (int i = 0; i < num_iter; i++) {
+ ASSERT_EQ(Get(1, Key(i)), values[i]);
+ }
+
+ auto hit = options.statistics->getTickerCount(PERSISTENT_CACHE_HIT);
+ auto miss = options.statistics->getTickerCount(PERSISTENT_CACHE_MISS);
+
+ ASSERT_GT(hit, 0);
+ ASSERT_GT(miss, 0);
+ }
+ }
+}
+#endif // !defined OS_SOLARIS
+
+namespace {
+void CountSyncPoint() {
+ TEST_SYNC_POINT_CALLBACK("DBTest2::MarkedPoint", nullptr /* arg */);
+}
+} // anonymous namespace
+
+TEST_F(DBTest2, SyncPointMarker) {
+ std::atomic<int> sync_point_called(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBTest2::MarkedPoint",
+ [&](void* /*arg*/) { sync_point_called.fetch_add(1); });
+
+ // The first dependency enforces that Marker is processed before MarkedPoint.
+ // The second (the marker) ensures that thread 1's MarkedPoint is disabled here.
+ // Execution order:
+ // | Thread 1 | Thread 2 |
+ // | | Marker |
+ // | MarkedPoint | |
+ // | Thread1First | |
+ // | | MarkedPoint |
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependencyAndMarkers(
+ {{"DBTest2::SyncPointMarker:Thread1First", "DBTest2::MarkedPoint"}},
+ {{"DBTest2::SyncPointMarker:Marker", "DBTest2::MarkedPoint"}});
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ std::function<void()> func1 = [&]() {
+ CountSyncPoint();
+ TEST_SYNC_POINT("DBTest2::SyncPointMarker:Thread1First");
+ };
+
+ std::function<void()> func2 = [&]() {
+ TEST_SYNC_POINT("DBTest2::SyncPointMarker:Marker");
+ CountSyncPoint();
+ };
+
+ auto thread1 = port::Thread(func1);
+ auto thread2 = port::Thread(func2);
+ thread1.join();
+ thread2.join();
+
+ // Callback is only executed once
+ ASSERT_EQ(sync_point_called.load(), 1);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+#endif
+
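+ // Approximate encoded size of one data-block entry:
+ // <shared=0><key_size><value_size> varints plus the key and value bytes.
+ // This mirrors the block layout only because the tests below disable delta
+ // encoding (so the shared-key length is always 0).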
+size_t GetEncodedEntrySize(size_t key_size, size_t value_size) {
+ std::string buffer;
+
+ PutVarint32(&buffer, static_cast<uint32_t>(0));
+ PutVarint32(&buffer, static_cast<uint32_t>(key_size));
+ PutVarint32(&buffer, static_cast<uint32_t>(value_size));
+
+ return buffer.size() + key_size + value_size;
+}
+
+TEST_F(DBTest2, ReadAmpBitmap) {
+ Options options = CurrentOptions();
+ BlockBasedTableOptions bbto;
+ uint32_t bytes_per_bit[2] = {1, 16};
+ for (size_t k = 0; k < 2; k++) {
+ // Disable delta encoding to make it easier to calculate read amplification
+ bbto.use_delta_encoding = false;
+ // Huge block cache to make it easier to calculate read amplification
+ bbto.block_cache = NewLRUCache(1024 * 1024 * 1024);
+ bbto.read_amp_bytes_per_bit = bytes_per_bit[k];
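+ // With 1 byte per bit the useful-bytes estimate should be exact; with 16
+ // bytes per bit it is an approximation, hence the exact vs. ASSERT_NEAR
+ // checks at the end of the loop.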
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ DestroyAndReopen(options);
+
+ const size_t kNumEntries = 10000;
+
+ Random rnd(301);
+ for (size_t i = 0; i < kNumEntries; i++) {
+ ASSERT_OK(Put(Key(static_cast<int>(i)), rnd.RandomString(100)));
+ }
+ ASSERT_OK(Flush());
+
+ Close();
+ Reopen(options);
+
+ // Read keys/values randomly and verify that reported read amp error
+ // is less than 2%
+ uint64_t total_useful_bytes = 0;
+ std::set<int> read_keys;
+ std::string value;
+ for (size_t i = 0; i < kNumEntries * 5; i++) {
+ int key_idx = rnd.Next() % kNumEntries;
+ std::string key = Key(key_idx);
+ ASSERT_OK(db_->Get(ReadOptions(), key, &value));
+
+ if (read_keys.find(key_idx) == read_keys.end()) {
+ auto internal_key = InternalKey(key, 0, ValueType::kTypeValue);
+ total_useful_bytes +=
+ GetEncodedEntrySize(internal_key.size(), value.size());
+ read_keys.insert(key_idx);
+ }
+
+ double expected_read_amp =
+ static_cast<double>(total_useful_bytes) /
+ options.statistics->getTickerCount(READ_AMP_TOTAL_READ_BYTES);
+
+ double read_amp =
+ static_cast<double>(options.statistics->getTickerCount(
+ READ_AMP_ESTIMATE_USEFUL_BYTES)) /
+ options.statistics->getTickerCount(READ_AMP_TOTAL_READ_BYTES);
+
+ double error_pct = fabs(expected_read_amp - read_amp) * 100;
+ // Error between reported read amp and real read amp should be less than
+ // 2%
+ EXPECT_LE(error_pct, 2);
+ }
+
+ // Make sure we read everything in the DB (which is smaller than our cache)
+ Iterator* iter = db_->NewIterator(ReadOptions());
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ ASSERT_EQ(iter->value().ToString(), Get(iter->key().ToString()));
+ }
+ ASSERT_OK(iter->status());
+ delete iter;
+
+ // Read amp is on average 100% since we read everything we loaded into memory
+ if (k == 0) {
+ ASSERT_EQ(
+ options.statistics->getTickerCount(READ_AMP_ESTIMATE_USEFUL_BYTES),
+ options.statistics->getTickerCount(READ_AMP_TOTAL_READ_BYTES));
+ } else {
+ ASSERT_NEAR(
+ options.statistics->getTickerCount(READ_AMP_ESTIMATE_USEFUL_BYTES) *
+ 1.0f /
+ options.statistics->getTickerCount(READ_AMP_TOTAL_READ_BYTES),
+ 1, .01);
+ }
+ }
+}
+
+#ifndef OS_SOLARIS // GetUniqueIdFromFile is not implemented
+TEST_F(DBTest2, ReadAmpBitmapLiveInCacheAfterDBClose) {
+ {
+ const int kIdBufLen = 100;
+ char id_buf[kIdBufLen];
+ Status s = Status::NotSupported();
+#ifndef OS_WIN
+ // A directory cannot be opened as a random access file on Windows
+ std::unique_ptr<RandomAccessFile> file;
+ s = env_->NewRandomAccessFile(dbname_, &file, EnvOptions());
+ if (s.ok()) {
+ if (file->GetUniqueId(id_buf, kIdBufLen) == 0) {
+ // The filesystem holding the db directory doesn't support unique file
+ // IDs, so this test would fail: lru_cache would load the blocks again
+ // even though they are already in the cache.
+ return;
+ }
+ }
+#endif
+ if (!s.ok()) {
+ std::unique_ptr<Directory> dir;
+ ASSERT_OK(env_->NewDirectory(dbname_, &dir));
+ if (dir->GetUniqueId(id_buf, kIdBufLen) == 0) {
+ // The filesystem holding the db directory doesn't support unique file
+ // IDs, so this test would fail: lru_cache would load the blocks again
+ // even though they are already in the cache.
+ return;
+ }
+ }
+ }
+ uint32_t bytes_per_bit[2] = {1, 16};
+ for (size_t k = 0; k < 2; k++) {
+ std::shared_ptr<Cache> lru_cache = NewLRUCache(1024 * 1024 * 1024);
+ std::shared_ptr<Statistics> stats = ROCKSDB_NAMESPACE::CreateDBStatistics();
+
+ Options options = CurrentOptions();
+ BlockBasedTableOptions bbto;
+ // Disable delta encoding to make it easier to calculate read amplification
+ bbto.use_delta_encoding = false;
+ // Huge block cache to make it easier to calculate read amplification
+ bbto.block_cache = lru_cache;
+ bbto.read_amp_bytes_per_bit = bytes_per_bit[k];
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ options.statistics = stats;
+ DestroyAndReopen(options);
+
+ const int kNumEntries = 10000;
+
+ Random rnd(301);
+ for (int i = 0; i < kNumEntries; i++) {
+ ASSERT_OK(Put(Key(i), rnd.RandomString(100)));
+ }
+ ASSERT_OK(Flush());
+
+ Close();
+ Reopen(options);
+
+ std::set<int> read_keys;
+ std::string value;
+ // Iter1: Read half the DB, Read even keys
+ // Key(0), Key(2), Key(4), Key(6), Key(8), ...
+ for (int i = 0; i < kNumEntries; i += 2) {
+ std::string key = Key(i);
+ ASSERT_OK(db_->Get(ReadOptions(), key, &value));
+
+ if (read_keys.find(i) == read_keys.end()) {
+ auto internal_key = InternalKey(key, 0, ValueType::kTypeValue);
+ read_keys.insert(i);
+ }
+ }
+
+ size_t total_useful_bytes_iter1 =
+ options.statistics->getTickerCount(READ_AMP_ESTIMATE_USEFUL_BYTES);
+ size_t total_loaded_bytes_iter1 =
+ options.statistics->getTickerCount(READ_AMP_TOTAL_READ_BYTES);
+
+ Close();
+ std::shared_ptr<Statistics> new_statistics =
+ ROCKSDB_NAMESPACE::CreateDBStatistics();
+ // Destroy the old statistics object that the blocks in lru_cache point to
+ options.statistics.reset();
+ // Use the statistics object that we just created
+ options.statistics = new_statistics;
+ Reopen(options);
+
+ // Iter2: Read half the DB, Read odd keys
+ // Key(1), Key(3), Key(5), Key(7), Key(9), ...
+ for (int i = 1; i < kNumEntries; i += 2) {
+ std::string key = Key(i);
+ ASSERT_OK(db_->Get(ReadOptions(), key, &value));
+
+ if (read_keys.find(i) == read_keys.end()) {
+ auto internal_key = InternalKey(key, 0, ValueType::kTypeValue);
+ read_keys.insert(i);
+ }
+ }
+
+ size_t total_useful_bytes_iter2 =
+ options.statistics->getTickerCount(READ_AMP_ESTIMATE_USEFUL_BYTES);
+ size_t total_loaded_bytes_iter2 =
+ options.statistics->getTickerCount(READ_AMP_TOTAL_READ_BYTES);
+
+ // Read amp is on average 100% since we read everything we loaded into memory
+ if (k == 0) {
+ ASSERT_EQ(total_useful_bytes_iter1 + total_useful_bytes_iter2,
+ total_loaded_bytes_iter1 + total_loaded_bytes_iter2);
+ } else {
+ ASSERT_NEAR((total_useful_bytes_iter1 + total_useful_bytes_iter2) * 1.0f /
+ (total_loaded_bytes_iter1 + total_loaded_bytes_iter2),
+ 1, .01);
+ }
+ }
+}
+#endif // !OS_SOLARIS
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest2, AutomaticCompactionOverlapManualCompaction) {
+ Options options = CurrentOptions();
+ options.num_levels = 3;
+ options.IncreaseParallelism(20);
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put(Key(0), "a"));
+ ASSERT_OK(Put(Key(5), "a"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put(Key(10), "a"));
+ ASSERT_OK(Put(Key(15), "a"));
+ ASSERT_OK(Flush());
+
+ CompactRangeOptions cro;
+ cro.change_level = true;
+ cro.target_level = 2;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+ auto get_stat = [](std::string level_str, LevelStatType type,
+ std::map<std::string, std::string> props) {
+ auto prop_str =
+ "compaction." + level_str + "." +
+ InternalStats::compaction_level_stats.at(type).property_name.c_str();
+ auto prop_item = props.find(prop_str);
+ return prop_item == props.end() ? 0 : std::stod(prop_item->second);
+ };
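+
+ // get_stat() extracts a single per-level counter (e.g. the NUM_FILES entry
+ // for level "L2") from the map returned by GetMapProperty("rocksdb.cfstats").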
+
+ // Trivial move 2 files to L2
+ ASSERT_EQ("0,0,2", FilesPerLevel());
+ // Also test that the stats GetMapProperty API reports the same result
+ {
+ std::map<std::string, std::string> prop;
+ ASSERT_TRUE(dbfull()->GetMapProperty("rocksdb.cfstats", &prop));
+ ASSERT_EQ(0, get_stat("L0", LevelStatType::NUM_FILES, prop));
+ ASSERT_EQ(0, get_stat("L1", LevelStatType::NUM_FILES, prop));
+ ASSERT_EQ(2, get_stat("L2", LevelStatType::NUM_FILES, prop));
+ ASSERT_EQ(2, get_stat("Sum", LevelStatType::NUM_FILES, prop));
+ }
+
+ // While the compaction is running, we will create 2 new files that
+ // can fit in L2; these 2 files will be moved to L2, overlap with
+ // the running compaction, and break the LSM consistency.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "CompactionJob::Run():Start", [&](void* /*arg*/) {
+ ASSERT_OK(
+ dbfull()->SetOptions({{"level0_file_num_compaction_trigger", "2"},
+ {"max_bytes_for_level_base", "1"}}));
+ ASSERT_OK(Put(Key(6), "a"));
+ ASSERT_OK(Put(Key(7), "a"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put(Key(8), "a"));
+ ASSERT_OK(Put(Key(9), "a"));
+ ASSERT_OK(Flush());
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // Run a manual compaction that will compact the 2 files in L2
+ // into 1 file in L2
+ cro.exclusive_manual_compaction = false;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForceOptimized;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+ // Test that the stats GetMapProperty API reports 1 file in L2
+ {
+ std::map<std::string, std::string> prop;
+ ASSERT_TRUE(dbfull()->GetMapProperty("rocksdb.cfstats", &prop));
+ ASSERT_EQ(1, get_stat("L2", LevelStatType::NUM_FILES, prop));
+ }
+}
+
+TEST_F(DBTest2, ManualCompactionOverlapManualCompaction) {
+ Options options = CurrentOptions();
+ options.num_levels = 2;
+ options.IncreaseParallelism(20);
+ options.disable_auto_compactions = true;
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put(Key(0), "a"));
+ ASSERT_OK(Put(Key(5), "a"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put(Key(10), "a"));
+ ASSERT_OK(Put(Key(15), "a"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+ // Trivial move 2 files to L1
+ ASSERT_EQ("0,2", FilesPerLevel());
+
+ std::function<void()> bg_manual_compact = [&]() {
+ std::string k1 = Key(6);
+ std::string k2 = Key(9);
+ Slice k1s(k1);
+ Slice k2s(k2);
+ CompactRangeOptions cro;
+ cro.exclusive_manual_compaction = false;
+ ASSERT_OK(db_->CompactRange(cro, &k1s, &k2s));
+ };
+ ROCKSDB_NAMESPACE::port::Thread bg_thread;
+
+ // While the compaction is running, we will create 2 new files that
+ // can fit in L1; these 2 files will be moved to L1, overlap with
+ // the running compaction, and break the LSM consistency.
+ std::atomic<bool> flag(false);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "CompactionJob::Run():Start", [&](void* /*arg*/) {
+ if (flag.exchange(true)) {
+ // We want to make sure to call this callback only once
+ return;
+ }
+ ASSERT_OK(Put(Key(6), "a"));
+ ASSERT_OK(Put(Key(7), "a"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put(Key(8), "a"));
+ ASSERT_OK(Put(Key(9), "a"));
+ ASSERT_OK(Flush());
+
+ // Start a non-exclusive manual compaction in a bg thread
+ bg_thread = port::Thread(bg_manual_compact);
+ // This manual compaction conflicts with the other manual compaction,
+ // so it should wait until the first compaction finishes
+ env_->SleepForMicroseconds(1000000);
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // Run a manual compaction that will compact the 2 files in L1
+ // into 1 file in L1
+ CompactRangeOptions cro;
+ cro.exclusive_manual_compaction = false;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForceOptimized;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ bg_thread.join();
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBTest2, PausingManualCompaction1) {
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.num_levels = 7;
+
+ DestroyAndReopen(options);
+ Random rnd(301);
+ // Generate a file containing 10 keys.
+ for (int i = 0; i < 10; i++) {
+ ASSERT_OK(Put(Key(i), rnd.RandomString(50)));
+ }
+ ASSERT_OK(Flush());
+
+ // Generate another file containing same keys
+ for (int i = 0; i < 10; i++) {
+ ASSERT_OK(Put(Key(i), rnd.RandomString(50)));
+ }
+ ASSERT_OK(Flush());
+
+ int manual_compactions_paused = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "CompactionJob::Run():PausingManualCompaction:1", [&](void* arg) {
+ auto canceled = static_cast<std::atomic<bool>*>(arg);
+ // CompactRange triggers a manual compaction; cancel the compaction by
+ // setting *canceled to true
+ if (canceled != nullptr) {
+ canceled->store(true, std::memory_order_release);
+ }
+ manual_compactions_paused += 1;
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "TestCompactFiles:PausingManualCompaction:3", [&](void* arg) {
+ auto paused = static_cast<std::atomic<int>*>(arg);
+ // CompactFiles() relies on manual_compactions_paused to
+ // determine whether this compaction should be paused or not
+ ASSERT_EQ(0, paused->load(std::memory_order_acquire));
+ paused->fetch_add(1, std::memory_order_release);
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ std::vector<std::string> files_before_compact, files_after_compact;
+ // Remember file names before compaction is triggered
+ std::vector<LiveFileMetaData> files_meta;
+ dbfull()->GetLiveFilesMetaData(&files_meta);
+ for (auto file : files_meta) {
+ files_before_compact.push_back(file.name);
+ }
+
+ // OK, now trigger a manual compaction
+ ASSERT_TRUE(dbfull()
+ ->CompactRange(CompactRangeOptions(), nullptr, nullptr)
+ .IsManualCompactionPaused());
+
+ // Wait for compactions to get scheduled and stopped
+ ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+
+ // Get file names after compaction is stopped
+ files_meta.clear();
+ dbfull()->GetLiveFilesMetaData(&files_meta);
+ for (auto file : files_meta) {
+ files_after_compact.push_back(file.name);
+ }
+
+ // Like nothing happened
+ ASSERT_EQ(files_before_compact, files_after_compact);
+ ASSERT_EQ(manual_compactions_paused, 1);
+
+ manual_compactions_paused = 0;
+ // Now make sure CompactFiles also does not run
+ ASSERT_TRUE(dbfull()
+ ->CompactFiles(ROCKSDB_NAMESPACE::CompactionOptions(),
+ files_before_compact, 0)
+ .IsManualCompactionPaused());
+ // Wait for manual compaction to get scheduled and finish
+ ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+
+ files_meta.clear();
+ files_after_compact.clear();
+ dbfull()->GetLiveFilesMetaData(&files_meta);
+ for (auto file : files_meta) {
+ files_after_compact.push_back(file.name);
+ }
+
+ ASSERT_EQ(files_before_compact, files_after_compact);
+ // CompactFiles returns at its entry point
+ ASSERT_EQ(manual_compactions_paused, 0);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+// PausingManualCompaction does not affect auto compaction
+TEST_F(DBTest2, PausingManualCompaction2) {
+ Options options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = 2;
+ options.disable_auto_compactions = false;
+
+ DestroyAndReopen(options);
+ dbfull()->DisableManualCompaction();
+
+ Random rnd(301);
+ for (int i = 0; i < 2; i++) {
+ // Generate a file containing 100 keys.
+ for (int j = 0; j < 100; j++) {
+ ASSERT_OK(Put(Key(j), rnd.RandomString(50)));
+ }
+ ASSERT_OK(Flush());
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+
+ std::vector<LiveFileMetaData> files_meta;
+ dbfull()->GetLiveFilesMetaData(&files_meta);
+ ASSERT_EQ(files_meta.size(), 1);
+}
+
+TEST_F(DBTest2, PausingManualCompaction3) {
+ CompactRangeOptions compact_options;
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.num_levels = 7;
+
+ Random rnd(301);
+ auto generate_files = [&]() {
+ for (int i = 0; i < options.num_levels; i++) {
+ for (int j = 0; j < options.num_levels - i + 1; j++) {
+ for (int k = 0; k < 1000; k++) {
+ ASSERT_OK(Put(Key(k + j * 1000), rnd.RandomString(50)));
+ }
+ ASSERT_OK(Flush());
+ }
+
+ for (int l = 1; l < options.num_levels - i; l++) {
+ MoveFilesToLevel(l);
+ }
+ }
+ };
+
+ DestroyAndReopen(options);
+ generate_files();
+#ifndef ROCKSDB_LITE
+ ASSERT_EQ("2,3,4,5,6,7,8", FilesPerLevel());
+#endif // !ROCKSDB_LITE
+ int run_manual_compactions = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "CompactionJob::Run():PausingManualCompaction:1",
+ [&](void* /*arg*/) { run_manual_compactions++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ dbfull()->DisableManualCompaction();
+ ASSERT_TRUE(dbfull()
+ ->CompactRange(compact_options, nullptr, nullptr)
+ .IsManualCompactionPaused());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+ // Since manual compaction is disabled, the sync point is not even reached
+ ASSERT_EQ(run_manual_compactions, 0);
+#ifndef ROCKSDB_LITE
+ ASSERT_EQ("2,3,4,5,6,7,8", FilesPerLevel());
+#endif // !ROCKSDB_LITE
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack(
+ "CompactionJob::Run():PausingManualCompaction:1");
+ dbfull()->EnableManualCompaction();
+ ASSERT_OK(dbfull()->CompactRange(compact_options, nullptr, nullptr));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+#ifndef ROCKSDB_LITE
+ ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel());
+#endif // !ROCKSDB_LITE
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBTest2, PausingManualCompaction4) {
+ CompactRangeOptions compact_options;
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.num_levels = 7;
+
+ Random rnd(301);
+ auto generate_files = [&]() {
+ for (int i = 0; i < options.num_levels; i++) {
+ for (int j = 0; j < options.num_levels - i + 1; j++) {
+ for (int k = 0; k < 1000; k++) {
+ ASSERT_OK(Put(Key(k + j * 1000), rnd.RandomString(50)));
+ }
+ ASSERT_OK(Flush());
+ }
+
+ for (int l = 1; l < options.num_levels - i; l++) {
+ MoveFilesToLevel(l);
+ }
+ }
+ };
+
+ DestroyAndReopen(options);
+ generate_files();
+#ifndef ROCKSDB_LITE
+ ASSERT_EQ("2,3,4,5,6,7,8", FilesPerLevel());
+#endif // !ROCKSDB_LITE
+ int run_manual_compactions = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "CompactionJob::Run():PausingManualCompaction:2", [&](void* arg) {
+ auto canceled = static_cast<std::atomic<bool>*>(arg);
+ // CompactRange triggers a manual compaction; cancel the compaction by
+ // setting *canceled to true
+ if (canceled != nullptr) {
+ canceled->store(true, std::memory_order_release);
+ }
+ run_manual_compactions++;
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "TestCompactFiles:PausingManualCompaction:3", [&](void* arg) {
+ auto paused = static_cast<std::atomic<int>*>(arg);
+ // CompactFiles() relies on manual_compactions_paused to
+ // determine whether this compaction should be paused or not
+ ASSERT_EQ(0, paused->load(std::memory_order_acquire));
+ paused->fetch_add(1, std::memory_order_release);
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_TRUE(dbfull()
+ ->CompactRange(compact_options, nullptr, nullptr)
+ .IsManualCompactionPaused());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+ ASSERT_EQ(run_manual_compactions, 1);
+#ifndef ROCKSDB_LITE
+ ASSERT_EQ("2,3,4,5,6,7,8", FilesPerLevel());
+#endif // !ROCKSDB_LITE
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack(
+ "CompactionJob::Run():PausingManualCompaction:2");
+ ASSERT_OK(dbfull()->CompactRange(compact_options, nullptr, nullptr));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+#ifndef ROCKSDB_LITE
+ ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel());
+#endif // !ROCKSDB_LITE
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBTest2, CancelManualCompaction1) {
+ CompactRangeOptions compact_options;
+ auto canceledPtr =
+ std::unique_ptr<std::atomic<bool>>(new std::atomic<bool>{true});
+ compact_options.canceled = canceledPtr.get();
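+ // canceled starts out true, so the first CompactRange() below is expected to
+ // return early with a manual-compaction-paused status without running any
+ // compaction (see the counters asserted below).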
+
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.num_levels = 7;
+
+ Random rnd(301);
+ auto generate_files = [&]() {
+ for (int i = 0; i < options.num_levels; i++) {
+ for (int j = 0; j < options.num_levels - i + 1; j++) {
+ for (int k = 0; k < 1000; k++) {
+ ASSERT_OK(Put(Key(k + j * 1000), rnd.RandomString(50)));
+ }
+ ASSERT_OK(Flush());
+ }
+
+ for (int l = 1; l < options.num_levels - i; l++) {
+ MoveFilesToLevel(l);
+ }
+ }
+ };
+
+ DestroyAndReopen(options);
+ generate_files();
+#ifndef ROCKSDB_LITE
+ ASSERT_EQ("2,3,4,5,6,7,8", FilesPerLevel());
+#endif // !ROCKSDB_LITE
+
+ int run_manual_compactions = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "CompactionJob::Run():PausingManualCompaction:1",
+ [&](void* /*arg*/) { run_manual_compactions++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // Set up a callback to disable compactions after a couple of levels have
+ // been compacted
+ int compactions_run = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::RunManualCompaction()::1",
+ [&](void* /*arg*/) { ++compactions_run; });
+
+ ASSERT_TRUE(dbfull()
+ ->CompactRange(compact_options, nullptr, nullptr)
+ .IsManualCompactionPaused());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+
+ // Since the manual compaction was canceled up front, we shouldn't start
+ // compacting at all, i.e. the compaction function should never be called.
+ ASSERT_EQ(compactions_run, 0);
+ ASSERT_EQ(run_manual_compactions, 0);
+#ifndef ROCKSDB_LITE
+ ASSERT_EQ("2,3,4,5,6,7,8", FilesPerLevel());
+#endif // !ROCKSDB_LITE
+
+ compactions_run = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack(
+ "DBImpl::RunManualCompaction()::1");
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::RunManualCompaction()::1", [&](void* /*arg*/) {
+ ++compactions_run;
+ // After 3 compactions, cancel the manual compaction
+ if (compactions_run == 3) {
+ compact_options.canceled->store(true, std::memory_order_release);
+ }
+ });
+
+ compact_options.canceled->store(false, std::memory_order_release);
+ ASSERT_TRUE(dbfull()
+ ->CompactRange(compact_options, nullptr, nullptr)
+ .IsManualCompactionPaused());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+
+ ASSERT_EQ(compactions_run, 3);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack(
+ "DBImpl::RunManualCompaction()::1");
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack(
+ "CompactionJob::Run():PausingManualCompaction:1");
+
+ // Compactions should work again if we re-enable them.
+ compact_options.canceled->store(false, std::memory_order_relaxed);
+ ASSERT_OK(dbfull()->CompactRange(compact_options, nullptr, nullptr));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+#ifndef ROCKSDB_LITE
+ ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel());
+#endif // !ROCKSDB_LITE
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBTest2, CancelManualCompaction2) {
+ CompactRangeOptions compact_options;
+ auto canceledPtr =
+ std::unique_ptr<std::atomic<bool>>(new std::atomic<bool>{true});
+ compact_options.canceled = canceledPtr.get();
+ compact_options.max_subcompactions = 1;
+
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.num_levels = 7;
+
+ Random rnd(301);
+ auto generate_files = [&]() {
+ for (int i = 0; i < options.num_levels; i++) {
+ for (int j = 0; j < options.num_levels - i + 1; j++) {
+ for (int k = 0; k < 1000; k++) {
+ ASSERT_OK(Put(Key(k + j * 1000), rnd.RandomString(50)));
+ }
+ ASSERT_OK(Flush());
+ }
+
+ for (int l = 1; l < options.num_levels - i; l++) {
+ MoveFilesToLevel(l);
+ }
+ }
+ };
+
+ DestroyAndReopen(options);
+ generate_files();
+#ifndef ROCKSDB_LITE
+ ASSERT_EQ("2,3,4,5,6,7,8", FilesPerLevel());
+#endif // !ROCKSDB_LITE
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ int compactions_run = 0;
+ std::atomic<int> kv_compactions{0};
+ int compactions_stopped_at = 0;
+ int kv_compactions_stopped_at = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::RunManualCompaction()::1", [&](void* /*arg*/) {
+ ++compactions_run;
+ // Just count here; the cancellation happens in the ProcessKV callback below
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "CompactionIterator:ProcessKV", [&](void* /*arg*/) {
+ int kv_compactions_run =
+ kv_compactions.fetch_add(1, std::memory_order_release);
+ if (kv_compactions_run == 5) {
+ compact_options.canceled->store(true, std::memory_order_release);
+ kv_compactions_stopped_at = kv_compactions_run;
+ compactions_stopped_at = compactions_run;
+ }
+ });
+
+ compact_options.canceled->store(false, std::memory_order_release);
+ ASSERT_TRUE(dbfull()
+ ->CompactRange(compact_options, nullptr, nullptr)
+ .IsManualCompactionPaused());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+
+ // NOTE: as we set compact_options.max_subcompactions = 1 and store true to
+ // the canceled variable from the single compacting thread (via the callback),
+ // this value is deterministically kv_compactions_stopped_at + 1.
+ ASSERT_EQ(kv_compactions, kv_compactions_stopped_at + 1);
+ ASSERT_EQ(compactions_run, compactions_stopped_at);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack(
+ "CompactionIterator::ProcessKV");
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack(
+ "DBImpl::RunManualCompaction()::1");
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack(
+ "CompactionJob::Run():PausingManualCompaction:1");
+
+ // Compactions should work again if we re-enable them.
+ compact_options.canceled->store(false, std::memory_order_relaxed);
+ ASSERT_OK(dbfull()->CompactRange(compact_options, nullptr, nullptr));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+#ifndef ROCKSDB_LITE
+ ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel());
+#endif // !ROCKSDB_LITE
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
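+ // Listener shared by the cancellation tests: it counts compaction begin/end
+ // notifications and verifies that each completed job reports the status
+ // code/subcode the test expects (code_ and subcode_ are set by the test
+ // before each run).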
+class CancelCompactionListener : public EventListener {
+ public:
+ CancelCompactionListener()
+ : num_compaction_started_(0), num_compaction_ended_(0) {}
+
+ void OnCompactionBegin(DB* /*db*/, const CompactionJobInfo& ci) override {
+ ASSERT_EQ(ci.cf_name, "default");
+ ASSERT_EQ(ci.base_input_level, 0);
+ num_compaction_started_++;
+ }
+
+ void OnCompactionCompleted(DB* /*db*/, const CompactionJobInfo& ci) override {
+ ASSERT_EQ(ci.cf_name, "default");
+ ASSERT_EQ(ci.base_input_level, 0);
+ ASSERT_EQ(ci.status.code(), code_);
+ ASSERT_EQ(ci.status.subcode(), subcode_);
+ num_compaction_ended_++;
+ }
+
+ std::atomic<size_t> num_compaction_started_;
+ std::atomic<size_t> num_compaction_ended_;
+ Status::Code code_;
+ Status::SubCode subcode_;
+};
+
+TEST_F(DBTest2, CancelManualCompactionWithListener) {
+ CompactRangeOptions compact_options;
+ auto canceledPtr =
+ std::unique_ptr<std::atomic<bool>>(new std::atomic<bool>{true});
+ compact_options.canceled = canceledPtr.get();
+ compact_options.max_subcompactions = 1;
+
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ CancelCompactionListener* listener = new CancelCompactionListener();
+ options.listeners.emplace_back(listener);
+
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ for (int i = 0; i < 10; i++) {
+ for (int j = 0; j < 10; j++) {
+ ASSERT_OK(Put(Key(i + j * 10), rnd.RandomString(50)));
+ }
+ ASSERT_OK(Flush());
+ }
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "CompactionIterator:ProcessKV", [&](void* /*arg*/) {
+ compact_options.canceled->store(true, std::memory_order_release);
+ });
+
+ int running_compaction = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "CompactionJob::FinishCompactionOutputFile1",
+ [&](void* /*arg*/) { running_compaction++; });
+
+ // Case I: 1) notify compaction begin, 2) the callback sets *canceled to true,
+ // disabling the manual compaction, 3) the compaction does not run,
+ // 4) notify compaction end.
+ listener->code_ = Status::kIncomplete;
+ listener->subcode_ = Status::SubCode::kManualCompactionPaused;
+
+ compact_options.canceled->store(false, std::memory_order_release);
+ ASSERT_TRUE(dbfull()
+ ->CompactRange(compact_options, nullptr, nullptr)
+ .IsManualCompactionPaused());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+
+ ASSERT_GT(listener->num_compaction_started_, 0);
+ ASSERT_EQ(listener->num_compaction_started_, listener->num_compaction_ended_);
+ ASSERT_EQ(running_compaction, 0);
+
+ listener->num_compaction_started_ = 0;
+ listener->num_compaction_ended_ = 0;
+
+ // Case II: 1) *canceled was already left set to true by the previous run's
+ // callback, disabling manual compaction, 2) the begin notification returns
+ // without notifying, 3) the end notification returns without notifying.
+ ASSERT_TRUE(dbfull()
+ ->CompactRange(compact_options, nullptr, nullptr)
+ .IsManualCompactionPaused());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+
+ ASSERT_EQ(listener->num_compaction_started_, 0);
+ ASSERT_EQ(listener->num_compaction_started_, listener->num_compaction_ended_);
+ ASSERT_EQ(running_compaction, 0);
+
+ // Case III: 1) notify compaction begin, 2) the compaction runs, 3) the
+ // callback sets *canceled to true only after the work is done (too late to
+ // disable it), 4) notify compaction end.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack(
+ "CompactionIterator:ProcessKV");
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "CompactionJob::Run:BeforeVerify", [&](void* /*arg*/) {
+ compact_options.canceled->store(true, std::memory_order_release);
+ });
+
+ listener->code_ = Status::kOk;
+ listener->subcode_ = Status::SubCode::kNone;
+
+ compact_options.canceled->store(false, std::memory_order_release);
+ ASSERT_OK(dbfull()->CompactRange(compact_options, nullptr, nullptr));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+
+ ASSERT_GT(listener->num_compaction_started_, 0);
+ ASSERT_EQ(listener->num_compaction_started_, listener->num_compaction_ended_);
+
+ // Compaction job will succeed.
+ ASSERT_GT(running_compaction, 0);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBTest2, CompactionOnBottomPriorityWithListener) {
+ int num_levels = 3;
+ const int kNumFilesTrigger = 4;
+
+ Options options = CurrentOptions();
+ env_->SetBackgroundThreads(0, Env::Priority::HIGH);
+ env_->SetBackgroundThreads(0, Env::Priority::LOW);
+ env_->SetBackgroundThreads(1, Env::Priority::BOTTOM);
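+ // Only the BOTTOM pool gets a worker thread, so the size-amplification
+ // compaction is expected to be forwarded to the bottom-priority pool
+ // (counted via the ForwardToBottomPriPool sync point below).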
+ options.env = env_;
+ options.compaction_style = kCompactionStyleUniversal;
+ options.num_levels = num_levels;
+ options.write_buffer_size = 100 << 10; // 100KB
+ options.target_file_size_base = 32 << 10; // 32KB
+ options.level0_file_num_compaction_trigger = kNumFilesTrigger;
+ // Trigger compaction if size amplification exceeds 110%
+ options.compaction_options_universal.max_size_amplification_percent = 110;
+
+ CancelCompactionListener* listener = new CancelCompactionListener();
+ options.listeners.emplace_back(listener);
+
+ DestroyAndReopen(options);
+
+ int num_bottom_thread_compaction_scheduled = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:ForwardToBottomPriPool",
+ [&](void* /*arg*/) { num_bottom_thread_compaction_scheduled++; });
+
+ int num_compaction_jobs = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "CompactionJob::Run():End",
+ [&](void* /*arg*/) { num_compaction_jobs++; });
+
+ listener->code_ = Status::kOk;
+ listener->subcode_ = Status::SubCode::kNone;
+
+ Random rnd(301);
+ for (int i = 0; i < 1; ++i) {
+ for (int num = 0; num < kNumFilesTrigger; num++) {
+ int key_idx = 0;
+ GenerateNewFile(&rnd, &key_idx, true /* no_wait */);
+ // Use no_wait above because the default variant waits for both flush and
+ // compaction. We don't want to wait for compaction here because the full
+ // compaction is intentionally blocked while more files are flushed.
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ }
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_GT(num_bottom_thread_compaction_scheduled, 0);
+ ASSERT_EQ(num_compaction_jobs, 1);
+ ASSERT_GT(listener->num_compaction_started_, 0);
+ ASSERT_EQ(listener->num_compaction_started_, listener->num_compaction_ended_);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBTest2, OptimizeForPointLookup) {
+ Options options = CurrentOptions();
+ Close();
+ options.OptimizeForPointLookup(2);
+ ASSERT_OK(DB::Open(options, dbname_, &db_));
+
+ ASSERT_OK(Put("foo", "v1"));
+ ASSERT_EQ("v1", Get("foo"));
+ ASSERT_OK(Flush());
+ ASSERT_EQ("v1", Get("foo"));
+}
+
+TEST_F(DBTest2, OptimizeForSmallDB) {
+ Options options = CurrentOptions();
+ Close();
+ options.OptimizeForSmallDb();
+
+ // Find the cache object
+ ASSERT_TRUE(options.table_factory->IsInstanceOf(
+ TableFactory::kBlockBasedTableName()));
+ auto table_options =
+ options.table_factory->GetOptions<BlockBasedTableOptions>();
+
+ ASSERT_TRUE(table_options != nullptr);
+ std::shared_ptr<Cache> cache = table_options->block_cache;
+
+ ASSERT_EQ(0, cache->GetUsage());
+ ASSERT_OK(DB::Open(options, dbname_, &db_));
+ ASSERT_OK(Put("foo", "v1"));
+
+ // memtable memory is charged to the block cache
+ ASSERT_NE(0, cache->GetUsage());
+
+ ASSERT_EQ("v1", Get("foo"));
+ ASSERT_OK(Flush());
+
+ size_t prev_size = cache->GetUsage();
+ // Remember the block cache usage so that we can verify it grows after Get().
+ // Use a PinnableSlice so that the block stays pinned and is not evicted
+ // before we check its size.
+ PinnableSlice value;
+ ASSERT_OK(db_->Get(ReadOptions(), db_->DefaultColumnFamily(), "foo", &value));
+ ASSERT_GT(cache->GetUsage(), prev_size);
+ value.Reset();
+}
+
+#endif // ROCKSDB_LITE
+
+TEST_F(DBTest2, IterRaceFlush1) {
+ ASSERT_OK(Put("foo", "v1"));
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::NewIterator:1", "DBTest2::IterRaceFlush:1"},
+ {"DBTest2::IterRaceFlush:2", "DBImpl::NewIterator:2"}});
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ ROCKSDB_NAMESPACE::port::Thread t1([&] {
+ TEST_SYNC_POINT("DBTest2::IterRaceFlush:1");
+ ASSERT_OK(Put("foo", "v2"));
+ ASSERT_OK(Flush());
+ TEST_SYNC_POINT("DBTest2::IterRaceFlush:2");
+ });
+
+ // The iterator is created after the first Put(), and its snapshot sequence is
+ // assigned after the second Put(), so it must see v2.
+ {
+ std::unique_ptr<Iterator> it(db_->NewIterator(ReadOptions()));
+ it->Seek("foo");
+ ASSERT_TRUE(it->Valid());
+ ASSERT_OK(it->status());
+ ASSERT_EQ("foo", it->key().ToString());
+ ASSERT_EQ("v2", it->value().ToString());
+ }
+
+ t1.join();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBTest2, IterRaceFlush2) {
+ ASSERT_OK(Put("foo", "v1"));
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::NewIterator:3", "DBTest2::IterRaceFlush2:1"},
+ {"DBTest2::IterRaceFlush2:2", "DBImpl::NewIterator:4"}});
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ ROCKSDB_NAMESPACE::port::Thread t1([&] {
+ TEST_SYNC_POINT("DBTest2::IterRaceFlush2:1");
+ ASSERT_OK(Put("foo", "v2"));
+ ASSERT_OK(Flush());
+ TEST_SYNC_POINT("DBTest2::IterRaceFlush2:2");
+ });
+
+ // The iterator is created after the first Put(), and its snapshot sequence is
+ // assigned before the second Put(), so it must see v1.
+ {
+ std::unique_ptr<Iterator> it(db_->NewIterator(ReadOptions()));
+ it->Seek("foo");
+ ASSERT_TRUE(it->Valid());
+ ASSERT_OK(it->status());
+ ASSERT_EQ("foo", it->key().ToString());
+ ASSERT_EQ("v1", it->value().ToString());
+ }
+
+ t1.join();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBTest2, IterRefreshRaceFlush) {
+ ASSERT_OK(Put("foo", "v1"));
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"ArenaWrappedDBIter::Refresh:1", "DBTest2::IterRefreshRaceFlush:1"},
+ {"DBTest2::IterRefreshRaceFlush:2", "ArenaWrappedDBIter::Refresh:2"}});
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ ROCKSDB_NAMESPACE::port::Thread t1([&] {
+ TEST_SYNC_POINT("DBTest2::IterRefreshRaceFlush:1");
+ ASSERT_OK(Put("foo", "v2"));
+ ASSERT_OK(Flush());
+ TEST_SYNC_POINT("DBTest2::IterRefreshRaceFlush:2");
+ });
+
+ // The iterator is refreshed after the first Put(), and its sequence number is
+ // assigned after the second Put(), so it must see v2.
+ {
+ std::unique_ptr<Iterator> it(db_->NewIterator(ReadOptions()));
+ ASSERT_OK(it->status());
+ ASSERT_OK(it->Refresh());
+ it->Seek("foo");
+ ASSERT_TRUE(it->Valid());
+ ASSERT_OK(it->status());
+ ASSERT_EQ("foo", it->key().ToString());
+ ASSERT_EQ("v2", it->value().ToString());
+ }
+
+ t1.join();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBTest2, GetRaceFlush1) {
+ ASSERT_OK(Put("foo", "v1"));
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::GetImpl:1", "DBTest2::GetRaceFlush:1"},
+ {"DBTest2::GetRaceFlush:2", "DBImpl::GetImpl:2"}});
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ ROCKSDB_NAMESPACE::port::Thread t1([&] {
+ TEST_SYNC_POINT("DBTest2::GetRaceFlush:1");
+ ASSERT_OK(Put("foo", "v2"));
+ ASSERT_OK(Flush());
+ TEST_SYNC_POINT("DBTest2::GetRaceFlush:2");
+ });
+
+ // Get() is issued after the first Put(), so it should see either
+ // "v1" or "v2".
+ ASSERT_NE("NOT_FOUND", Get("foo"));
+ t1.join();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBTest2, GetRaceFlush2) {
+ ASSERT_OK(Put("foo", "v1"));
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::GetImpl:3", "DBTest2::GetRaceFlush:1"},
+ {"DBTest2::GetRaceFlush:2", "DBImpl::GetImpl:4"}});
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ port::Thread t1([&] {
+ TEST_SYNC_POINT("DBTest2::GetRaceFlush:1");
+ ASSERT_OK(Put("foo", "v2"));
+ ASSERT_OK(Flush());
+ TEST_SYNC_POINT("DBTest2::GetRaceFlush:2");
+ });
+
+ // Get() is issued after the first Put(), so it should see either
+ // "v1" or "v2".
+ ASSERT_NE("NOT_FOUND", Get("foo"));
+ t1.join();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBTest2, DirectIO) {
+ if (!IsDirectIOSupported()) {
+ return;
+ }
+ Options options = CurrentOptions();
+ options.use_direct_reads = options.use_direct_io_for_flush_and_compaction =
+ true;
+ options.allow_mmap_reads = options.allow_mmap_writes = false;
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put(Key(0), "a"));
+ ASSERT_OK(Put(Key(5), "a"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put(Key(10), "a"));
+ ASSERT_OK(Put(Key(15), "a"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ Reopen(options);
+}
+
+TEST_F(DBTest2, MemtableOnlyIterator) {
+ Options options = CurrentOptions();
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ ASSERT_OK(Put(1, "foo", "first"));
+ ASSERT_OK(Put(1, "bar", "second"));
+
+ ReadOptions ropt;
+ ropt.read_tier = kMemtableTier;
+ std::string value;
+ Iterator* it = nullptr;
+
+ // Before flushing
+ // point lookups
+ ASSERT_OK(db_->Get(ropt, handles_[1], "foo", &value));
+ ASSERT_EQ("first", value);
+ ASSERT_OK(db_->Get(ropt, handles_[1], "bar", &value));
+ ASSERT_EQ("second", value);
+
+ // Memtable-only iterator (read_tier=kMemtableTier); data not flushed yet.
+ it = db_->NewIterator(ropt, handles_[1]);
+ int count = 0;
+ for (it->SeekToFirst(); it->Valid(); it->Next()) {
+ ASSERT_TRUE(it->Valid());
+ count++;
+ }
+ ASSERT_TRUE(!it->Valid());
+ ASSERT_EQ(2, count);
+ delete it;
+
+ ASSERT_OK(Flush(1));
+
+ // After flushing
+ // point lookups
+ ASSERT_OK(db_->Get(ropt, handles_[1], "foo", &value));
+ ASSERT_EQ("first", value);
+ ASSERT_OK(db_->Get(ropt, handles_[1], "bar", &value));
+ ASSERT_EQ("second", value);
+ // Nothing should be returned using the memtable-only iterator after flushing.
+ it = db_->NewIterator(ropt, handles_[1]);
+ ASSERT_OK(it->status());
+ count = 0;
+ for (it->SeekToFirst(); it->Valid(); it->Next()) {
+ ASSERT_TRUE(it->Valid());
+ count++;
+ }
+ ASSERT_TRUE(!it->Valid());
+ ASSERT_EQ(0, count);
+ ASSERT_OK(it->status());
+ delete it;
+
+ // Add a key to memtable
+ ASSERT_OK(Put(1, "foobar", "third"));
+ it = db_->NewIterator(ropt, handles_[1]);
+ ASSERT_OK(it->status());
+ count = 0;
+ for (it->SeekToFirst(); it->Valid(); it->Next()) {
+ ASSERT_TRUE(it->Valid());
+ ASSERT_EQ("foobar", it->key().ToString());
+ ASSERT_EQ("third", it->value().ToString());
+ count++;
+ }
+ ASSERT_TRUE(!it->Valid());
+ ASSERT_EQ(1, count);
+ ASSERT_OK(it->status());
+ delete it;
+}
+
+TEST_F(DBTest2, LowPriWrite) {
+ Options options = CurrentOptions();
+ // Compaction pressure should kick in once 6 L0 files have accumulated
+ options.level0_file_num_compaction_trigger = 4;
+ options.level0_slowdown_writes_trigger = 12;
+ options.level0_stop_writes_trigger = 30;
+ options.delayed_write_rate = 8 * 1024 * 1024;
+ Reopen(options);
+
+ std::atomic<int> rate_limit_count(0);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "GenericRateLimiter::Request:1", [&](void* arg) {
+ rate_limit_count.fetch_add(1);
+ int64_t* rate_bytes_per_sec = static_cast<int64_t*>(arg);
+ ASSERT_EQ(1024 * 1024, *rate_bytes_per_sec);
+ });
+ // Block compaction
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ {"DBTest.LowPriWrite:0", "DBImpl::BGWorkCompaction"},
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
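+ // With compaction blocked by the dependency above, L0 files pile up and
+ // build compaction pressure; only writes with low_pri set should then be
+ // throttled (see the rate_limit_count checks below).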
+ WriteOptions wo;
+ for (int i = 0; i < 6; i++) {
+ wo.low_pri = false;
+ ASSERT_OK(Put("", "", wo));
+ wo.low_pri = true;
+ ASSERT_OK(Put("", "", wo));
+ ASSERT_OK(Flush());
+ }
+ ASSERT_EQ(0, rate_limit_count.load());
+ wo.low_pri = true;
+ ASSERT_OK(Put("", "", wo));
+ ASSERT_EQ(1, rate_limit_count.load());
+ wo.low_pri = false;
+ ASSERT_OK(Put("", "", wo));
+ ASSERT_EQ(1, rate_limit_count.load());
+
+ TEST_SYNC_POINT("DBTest.LowPriWrite:0");
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ wo.low_pri = true;
+ ASSERT_OK(Put("", "", wo));
+ ASSERT_EQ(1, rate_limit_count.load());
+ wo.low_pri = false;
+ ASSERT_OK(Put("", "", wo));
+ ASSERT_EQ(1, rate_limit_count.load());
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest2, RateLimitedCompactionReads) {
+ // compaction input has 512KB data
+ const int kNumKeysPerFile = 128;
+ const int kBytesPerKey = 1024;
+ const int kNumL0Files = 4;
+
+ for (int compaction_readahead_size : {0, 32 << 10}) {
+ for (auto use_direct_io : {false, true}) {
+ if (use_direct_io && !IsDirectIOSupported()) {
+ continue;
+ }
+ Options options = CurrentOptions();
+ options.compaction_readahead_size = compaction_readahead_size;
+ options.compression = kNoCompression;
+ options.level0_file_num_compaction_trigger = kNumL0Files;
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(kNumKeysPerFile));
+ // takes roughly one second, split into 100 x 10ms intervals. Each
+ // interval permits 5.12KB, which is smaller than the block size, so this
+ // test exercises the code for chunking reads.
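+ // (Rate = kNumL0Files * kNumKeysPerFile * kBytesPerKey = 4 * 128 * 1 KB
+ // = 512 KB/s, i.e. about 5.12 KB per 10ms refill interval.)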
+ options.rate_limiter.reset(NewGenericRateLimiter(
+ static_cast<int64_t>(kNumL0Files * kNumKeysPerFile *
+ kBytesPerKey) /* rate_bytes_per_sec */,
+ 10 * 1000 /* refill_period_us */, 10 /* fairness */,
+ RateLimiter::Mode::kReadsOnly));
+ options.use_direct_reads =
+ options.use_direct_io_for_flush_and_compaction = use_direct_io;
+ BlockBasedTableOptions bbto;
+ bbto.block_size = 16384;
+ bbto.no_block_cache = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ DestroyAndReopen(options);
+
+ for (int i = 0; i < kNumL0Files; ++i) {
+ for (int j = 0; j <= kNumKeysPerFile; ++j) {
+ ASSERT_OK(Put(Key(j), DummyString(kBytesPerKey)));
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ if (i + 1 < kNumL0Files) {
+ ASSERT_EQ(i + 1, NumTableFilesAtLevel(0));
+ }
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+
+ // Should be slightly above 512KB due to non-data blocks read. 1MB was
+ // arbitrarily chosen as the upper bound on the total bytes read.
+ size_t rate_limited_bytes = static_cast<size_t>(
+ options.rate_limiter->GetTotalBytesThrough(Env::IO_TOTAL));
+ // The charges can exist for `IO_LOW` and `IO_USER` priorities.
+ size_t rate_limited_bytes_by_pri =
+ options.rate_limiter->GetTotalBytesThrough(Env::IO_LOW) +
+ options.rate_limiter->GetTotalBytesThrough(Env::IO_USER);
+ ASSERT_EQ(rate_limited_bytes,
+ static_cast<size_t>(rate_limited_bytes_by_pri));
+ // Include the explicit prefetch of the footer in direct I/O case.
+ size_t direct_io_extra = use_direct_io ? 512 * 1024 : 0;
+ ASSERT_GE(
+ rate_limited_bytes,
+ static_cast<size_t>(kNumKeysPerFile * kBytesPerKey * kNumL0Files));
+ ASSERT_LT(
+ rate_limited_bytes,
+ static_cast<size_t>(2 * kNumKeysPerFile * kBytesPerKey * kNumL0Files +
+ direct_io_extra));
+
+ Iterator* iter = db_->NewIterator(ReadOptions());
+ ASSERT_OK(iter->status());
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ ASSERT_EQ(iter->value().ToString(), DummyString(kBytesPerKey));
+ }
+ delete iter;
+ // Bytes read by the user iterator shouldn't count against the rate limit.
+ rate_limited_bytes_by_pri =
+ options.rate_limiter->GetTotalBytesThrough(Env::IO_LOW) +
+ options.rate_limiter->GetTotalBytesThrough(Env::IO_USER);
+ ASSERT_EQ(rate_limited_bytes,
+ static_cast<size_t>(rate_limited_bytes_by_pri));
+ }
+ }
+}
+#endif // ROCKSDB_LITE
+
+ // Make sure the DB can be reopened with a reduced number of levels, given
+ // that no file is on a level higher than the new num_levels.
+TEST_F(DBTest2, ReduceLevel) {
+ Options options;
+ options.env = env_;
+ options.disable_auto_compactions = true;
+ options.num_levels = 7;
+ Reopen(options);
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Flush());
+ MoveFilesToLevel(6);
+#ifndef ROCKSDB_LITE
+ ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel());
+#endif // !ROCKSDB_LITE
+ CompactRangeOptions compact_options;
+ compact_options.change_level = true;
+ compact_options.target_level = 1;
+ ASSERT_OK(dbfull()->CompactRange(compact_options, nullptr, nullptr));
+#ifndef ROCKSDB_LITE
+ ASSERT_EQ("0,1", FilesPerLevel());
+#endif // !ROCKSDB_LITE
+ options.num_levels = 3;
+ Reopen(options);
+#ifndef ROCKSDB_LITE
+ ASSERT_EQ("0,1", FilesPerLevel());
+#endif // !ROCKSDB_LITE
+}
+
+ // Test that ReadCallback is actually used in both the memtable and SST files
+TEST_F(DBTest2, ReadCallbackTest) {
+ Options options;
+ options.disable_auto_compactions = true;
+ options.num_levels = 7;
+ options.env = env_;
+ Reopen(options);
+ std::vector<const Snapshot*> snapshots;
+ // Try to create a db with multiple layers and a memtable
+ const std::string key = "foo";
+ const std::string value = "bar";
+ // This test assumes that the seq starts at 1 and is increased by 1 after
+ // each write batch of size 1. If that behavior changes, the test needs to
+ // be updated as well.
+ // TODO(myabandeh): update this test to use the seq number that is returned by
+ // the DB instead of assuming what seq the DB used.
+ int i = 1;
+ for (; i < 10; i++) {
+ ASSERT_OK(Put(key, value + std::to_string(i)));
+ // Take a snapshot to avoid the value being removed during compaction
+ auto snapshot = dbfull()->GetSnapshot();
+ snapshots.push_back(snapshot);
+ }
+ ASSERT_OK(Flush());
+ for (; i < 20; i++) {
+ ASSERT_OK(Put(key, value + std::to_string(i)));
+ // Take a snapshot to avoid the value being removed during compaction
+ auto snapshot = dbfull()->GetSnapshot();
+ snapshots.push_back(snapshot);
+ }
+ ASSERT_OK(Flush());
+ MoveFilesToLevel(6);
+#ifndef ROCKSDB_LITE
+ ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel());
+#endif // !ROCKSDB_LITE
+ for (; i < 30; i++) {
+ ASSERT_OK(Put(key, value + std::to_string(i)));
+ auto snapshot = dbfull()->GetSnapshot();
+ snapshots.push_back(snapshot);
+ }
+ ASSERT_OK(Flush());
+#ifndef ROCKSDB_LITE
+ ASSERT_EQ("1,0,0,0,0,0,2", FilesPerLevel());
+#endif // !ROCKSDB_LITE
+ // And also add some values to the memtable
+ for (; i < 40; i++) {
+ ASSERT_OK(Put(key, value + std::to_string(i)));
+ auto snapshot = dbfull()->GetSnapshot();
+ snapshots.push_back(snapshot);
+ }
+
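+  // A ReadCallback that makes only sequence numbers up to `snapshot_` visible,
+  // emulating a read at a fixed snapshot; the GetImpl() calls below use it to
+  // pin each lookup to a specific sequence number.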
+ class TestReadCallback : public ReadCallback {
+ public:
+ explicit TestReadCallback(SequenceNumber snapshot)
+ : ReadCallback(snapshot), snapshot_(snapshot) {}
+ bool IsVisibleFullCheck(SequenceNumber seq) override {
+ return seq <= snapshot_;
+ }
+
+ private:
+ SequenceNumber snapshot_;
+ };
+
+ for (int seq = 1; seq < i; seq++) {
+ PinnableSlice pinnable_val;
+ ReadOptions roptions;
+ TestReadCallback callback(seq);
+ bool dont_care = true;
+ DBImpl::GetImplOptions get_impl_options;
+ get_impl_options.column_family = dbfull()->DefaultColumnFamily();
+ get_impl_options.value = &pinnable_val;
+ get_impl_options.value_found = &dont_care;
+ get_impl_options.callback = &callback;
+ Status s = dbfull()->GetImpl(roptions, key, get_impl_options);
+ ASSERT_TRUE(s.ok());
+    // Assuming that the DB increases seq by one after each Put, the value
+    // suffix and the seq number must be equal since we also increment the
+    // value by 1 after each Put.
+ ASSERT_EQ(value + std::to_string(seq), pinnable_val.ToString());
+ }
+
+ for (auto snapshot : snapshots) {
+ dbfull()->ReleaseSnapshot(snapshot);
+ }
+}
+
+#ifndef ROCKSDB_LITE
+
+TEST_F(DBTest2, LiveFilesOmitObsoleteFiles) {
+  // Regression test for a race condition where an obsolete file is returned
+  // to the user as a "live file" but then deleted, all while file deletions
+  // are disabled.
+ //
+ // It happened like this:
+ //
+ // 1. [flush thread] Log file "x.log" found by FindObsoleteFiles
+  // 2. [user thread] DisableFileDeletions and GetSortedWalFiles were called,
+  //    and the latter returned "x.log"
+ // 3. [flush thread] PurgeObsoleteFiles deleted "x.log"
+ // 4. [user thread] Reading "x.log" failed
+ //
+ // Unfortunately the only regression test I can come up with involves sleep.
+ // We cannot set SyncPoints to repro since, once the fix is applied, the
+ // SyncPoints would cause a deadlock as the repro's sequence of events is now
+ // prohibited.
+ //
+ // Instead, if we sleep for a second between Find and Purge, and ensure the
+ // read attempt happens after purge, then the sequence of events will almost
+ // certainly happen on the old code.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ {"DBImpl::BackgroundCallFlush:FilesFound",
+ "DBTest2::LiveFilesOmitObsoleteFiles:FlushTriggered"},
+ {"DBImpl::PurgeObsoleteFiles:End",
+ "DBTest2::LiveFilesOmitObsoleteFiles:LiveFilesCaptured"},
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::PurgeObsoleteFiles:Begin",
+ [&](void* /*arg*/) { env_->SleepForMicroseconds(1000000); });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(Put("key", "val"));
+ FlushOptions flush_opts;
+ flush_opts.wait = false;
+ db_->Flush(flush_opts);
+ TEST_SYNC_POINT("DBTest2::LiveFilesOmitObsoleteFiles:FlushTriggered");
+
+ ASSERT_OK(db_->DisableFileDeletions());
+ VectorLogPtr log_files;
+ ASSERT_OK(db_->GetSortedWalFiles(log_files));
+ TEST_SYNC_POINT("DBTest2::LiveFilesOmitObsoleteFiles:LiveFilesCaptured");
+ for (const auto& log_file : log_files) {
+ ASSERT_OK(env_->FileExists(LogFileName(dbname_, log_file->LogNumber())));
+ }
+
+ ASSERT_OK(db_->EnableFileDeletions());
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBTest2, TestNumPread) {
+ Options options = CurrentOptions();
+ bool prefetch_supported =
+ test::IsPrefetchSupported(env_->GetFileSystem(), dbname_);
+ // disable block cache
+ BlockBasedTableOptions table_options;
+ table_options.no_block_cache = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ Reopen(options);
+ env_->count_random_reads_ = true;
+ env_->random_file_open_counter_.store(0);
+ ASSERT_OK(Put("bar", "foo"));
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Flush());
+ if (prefetch_supported) {
+ // After flush, we'll open the file and read footer, meta block,
+ // property block and index block.
+ ASSERT_EQ(4, env_->random_read_counter_.Read());
+ } else {
+ // With prefetch not supported, we will do a single read into a buffer
+ ASSERT_EQ(1, env_->random_read_counter_.Read());
+ }
+ ASSERT_EQ(1, env_->random_file_open_counter_.load());
+
+  // One pread per normal data block read
+ env_->random_file_open_counter_.store(0);
+ env_->random_read_counter_.Reset();
+ ASSERT_EQ("bar", Get("foo"));
+ ASSERT_EQ(1, env_->random_read_counter_.Read());
+ // All files are already opened.
+ ASSERT_EQ(0, env_->random_file_open_counter_.load());
+
+ env_->random_file_open_counter_.store(0);
+ env_->random_read_counter_.Reset();
+ ASSERT_OK(Put("bar2", "foo2"));
+ ASSERT_OK(Put("foo2", "bar2"));
+ ASSERT_OK(Flush());
+ if (prefetch_supported) {
+ // After flush, we'll open the file and read footer, meta block,
+ // property block and index block.
+ ASSERT_EQ(4, env_->random_read_counter_.Read());
+ } else {
+ // With prefetch not supported, we will do a single read into a buffer
+ ASSERT_EQ(1, env_->random_read_counter_.Read());
+ }
+ ASSERT_EQ(1, env_->random_file_open_counter_.load());
+
+ env_->random_file_open_counter_.store(0);
+ env_->random_read_counter_.Reset();
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ if (prefetch_supported) {
+ // Compaction needs two input blocks, which requires 2 preads, and
+ // generate a new SST file which needs 4 preads (footer, meta block,
+ // property block and index block). In total 6.
+ ASSERT_EQ(6, env_->random_read_counter_.Read());
+ } else {
+ // With prefetch off, compaction needs two input blocks,
+ // followed by a single buffered read. In total 3.
+ ASSERT_EQ(3, env_->random_read_counter_.Read());
+ }
+ // All compaction input files should have already been opened.
+ ASSERT_EQ(1, env_->random_file_open_counter_.load());
+
+  // One pread per normal data block read
+ env_->random_file_open_counter_.store(0);
+ env_->random_read_counter_.Reset();
+ ASSERT_EQ("foo2", Get("bar2"));
+ ASSERT_EQ(1, env_->random_read_counter_.Read());
+ // SST files are already opened.
+ ASSERT_EQ(0, env_->random_file_open_counter_.load());
+}
+
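+// Tallies the results produced while replaying a trace: Accept()ing a
+// TraceRecordResult dispatches to the Handle() overload for its record type,
+// which validates the timestamps, bumps the per-type counter and accumulates
+// the latency. The replay tests below use it to verify how many writes, gets,
+// seeks and multigets were actually executed.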
+class TraceExecutionResultHandler : public TraceRecordResult::Handler {
+ public:
+ TraceExecutionResultHandler() {}
+ ~TraceExecutionResultHandler() override {}
+
+ virtual Status Handle(const StatusOnlyTraceExecutionResult& result) override {
+ if (result.GetStartTimestamp() > result.GetEndTimestamp()) {
+ return Status::InvalidArgument("Invalid timestamps.");
+ }
+ result.GetStatus().PermitUncheckedError();
+ switch (result.GetTraceType()) {
+ case kTraceWrite: {
+ total_latency_ += result.GetLatency();
+ cnt_++;
+ writes_++;
+ break;
+ }
+ default:
+ return Status::Corruption("Type mismatch.");
+ }
+ return Status::OK();
+ }
+
+ virtual Status Handle(
+ const SingleValueTraceExecutionResult& result) override {
+ if (result.GetStartTimestamp() > result.GetEndTimestamp()) {
+ return Status::InvalidArgument("Invalid timestamps.");
+ }
+ result.GetStatus().PermitUncheckedError();
+ switch (result.GetTraceType()) {
+ case kTraceGet: {
+ total_latency_ += result.GetLatency();
+ cnt_++;
+ gets_++;
+ break;
+ }
+ default:
+ return Status::Corruption("Type mismatch.");
+ }
+ return Status::OK();
+ }
+
+ virtual Status Handle(
+ const MultiValuesTraceExecutionResult& result) override {
+ if (result.GetStartTimestamp() > result.GetEndTimestamp()) {
+ return Status::InvalidArgument("Invalid timestamps.");
+ }
+ for (const Status& s : result.GetMultiStatus()) {
+ s.PermitUncheckedError();
+ }
+ switch (result.GetTraceType()) {
+ case kTraceMultiGet: {
+ total_latency_ += result.GetLatency();
+ cnt_++;
+ multigets_++;
+ break;
+ }
+ default:
+ return Status::Corruption("Type mismatch.");
+ }
+ return Status::OK();
+ }
+
+ virtual Status Handle(const IteratorTraceExecutionResult& result) override {
+ if (result.GetStartTimestamp() > result.GetEndTimestamp()) {
+ return Status::InvalidArgument("Invalid timestamps.");
+ }
+ result.GetStatus().PermitUncheckedError();
+ switch (result.GetTraceType()) {
+ case kTraceIteratorSeek:
+ case kTraceIteratorSeekForPrev: {
+ total_latency_ += result.GetLatency();
+ cnt_++;
+ seeks_++;
+ break;
+ }
+ default:
+ return Status::Corruption("Type mismatch.");
+ }
+ return Status::OK();
+ }
+
+ void Reset() {
+ total_latency_ = 0;
+ cnt_ = 0;
+ writes_ = 0;
+ gets_ = 0;
+ seeks_ = 0;
+ multigets_ = 0;
+ }
+
+ double GetAvgLatency() const {
+ return cnt_ == 0 ? 0.0 : 1.0 * total_latency_ / cnt_;
+ }
+
+ int GetNumWrites() const { return writes_; }
+
+ int GetNumGets() const { return gets_; }
+
+ int GetNumIterSeeks() const { return seeks_; }
+
+ int GetNumMultiGets() const { return multigets_; }
+
+ private:
+ std::atomic<uint64_t> total_latency_{0};
+ std::atomic<uint32_t> cnt_{0};
+ std::atomic<int> writes_{0};
+ std::atomic<int> gets_{0};
+ std::atomic<int> seeks_{0};
+ std::atomic<int> multigets_{0};
+};
+
+TEST_F(DBTest2, TraceAndReplay) {
+ Options options = CurrentOptions();
+ options.merge_operator = MergeOperators::CreatePutOperator();
+ ReadOptions ro;
+ WriteOptions wo;
+ TraceOptions trace_opts;
+ EnvOptions env_opts;
+ CreateAndReopenWithCF({"pikachu"}, options);
+ Random rnd(301);
+ Iterator* single_iter = nullptr;
+
+ ASSERT_TRUE(db_->EndTrace().IsIOError());
+
+ std::string trace_filename = dbname_ + "/rocksdb.trace";
+ std::unique_ptr<TraceWriter> trace_writer;
+ ASSERT_OK(NewFileTraceWriter(env_, env_opts, trace_filename, &trace_writer));
+ ASSERT_OK(db_->StartTrace(trace_opts, std::move(trace_writer)));
+
+ // 5 Writes
+ ASSERT_OK(Put(0, "a", "1"));
+ ASSERT_OK(Merge(0, "b", "2"));
+ ASSERT_OK(Delete(0, "c"));
+ ASSERT_OK(SingleDelete(0, "d"));
+ ASSERT_OK(db_->DeleteRange(wo, dbfull()->DefaultColumnFamily(), "e", "f"));
+
+ // 6th Write
+ WriteBatch batch;
+ ASSERT_OK(batch.Put("f", "11"));
+ ASSERT_OK(batch.Merge("g", "12"));
+ ASSERT_OK(batch.Delete("h"));
+ ASSERT_OK(batch.SingleDelete("i"));
+ ASSERT_OK(batch.DeleteRange("j", "k"));
+ ASSERT_OK(db_->Write(wo, &batch));
+
+ // 2 Seek(ForPrev)s
+ single_iter = db_->NewIterator(ro);
+ single_iter->Seek("f"); // Seek 1
+ single_iter->SeekForPrev("g");
+ ASSERT_OK(single_iter->status());
+ delete single_iter;
+
+ // 2 Gets
+ ASSERT_EQ("1", Get(0, "a"));
+ ASSERT_EQ("12", Get(0, "g"));
+
+ // 7th and 8th Write, 3rd Get
+ ASSERT_OK(Put(1, "foo", "bar"));
+ ASSERT_OK(Put(1, "rocksdb", "rocks"));
+ ASSERT_EQ("NOT_FOUND", Get(1, "leveldb"));
+
+ // Total Write x 8, Get x 3, Seek x 2.
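+  // (The 8 writes are the 5 single operations + 1 WriteBatch + 2 Puts to
+  // "pikachu"; the 3 Gets include the NOT_FOUND lookup.)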
+ ASSERT_OK(db_->EndTrace());
+  // These should not get into the trace file since they come after EndTrace.
+ ASSERT_OK(Put("hello", "world"));
+ ASSERT_OK(Merge("foo", "bar"));
+
+ // Open another db, replay, and verify the data
+ std::string value;
+ std::string dbname2 = test::PerThreadDBPath(env_, "/db_replay");
+ ASSERT_OK(DestroyDB(dbname2, options));
+
+ // Using a different name than db2, to pacify infer's use-after-lifetime
+ // warnings (http://fbinfer.com).
+ DB* db2_init = nullptr;
+ options.create_if_missing = true;
+ ASSERT_OK(DB::Open(options, dbname2, &db2_init));
+ ColumnFamilyHandle* cf;
+ ASSERT_OK(
+ db2_init->CreateColumnFamily(ColumnFamilyOptions(), "pikachu", &cf));
+ delete cf;
+ delete db2_init;
+
+ DB* db2 = nullptr;
+ std::vector<ColumnFamilyDescriptor> column_families;
+ ColumnFamilyOptions cf_options;
+ cf_options.merge_operator = MergeOperators::CreatePutOperator();
+ column_families.push_back(ColumnFamilyDescriptor("default", cf_options));
+ column_families.push_back(
+ ColumnFamilyDescriptor("pikachu", ColumnFamilyOptions()));
+ std::vector<ColumnFamilyHandle*> handles;
+ DBOptions db_opts;
+ db_opts.env = env_;
+ ASSERT_OK(DB::Open(db_opts, dbname2, column_families, &handles, &db2));
+
+ env_->SleepForMicroseconds(100);
+ // Verify that the keys don't already exist
+ ASSERT_TRUE(db2->Get(ro, handles[0], "a", &value).IsNotFound());
+ ASSERT_TRUE(db2->Get(ro, handles[0], "g", &value).IsNotFound());
+
+ std::unique_ptr<TraceReader> trace_reader;
+ ASSERT_OK(NewFileTraceReader(env_, env_opts, trace_filename, &trace_reader));
+ std::unique_ptr<Replayer> replayer;
+ ASSERT_OK(
+ db2->NewDefaultReplayer(handles, std::move(trace_reader), &replayer));
+
+ TraceExecutionResultHandler res_handler;
+ std::function<void(Status, std::unique_ptr<TraceRecordResult> &&)> res_cb =
+ [&res_handler](Status exec_s, std::unique_ptr<TraceRecordResult>&& res) {
+ ASSERT_TRUE(exec_s.ok() || exec_s.IsNotSupported());
+ if (res != nullptr) {
+ ASSERT_OK(res->Accept(&res_handler));
+ res.reset();
+ }
+ };
+
+ // Unprepared replay should fail with Status::Incomplete()
+ ASSERT_TRUE(replayer->Replay(ReplayOptions(), nullptr).IsIncomplete());
+ ASSERT_OK(replayer->Prepare());
+ // Ok to repeatedly Prepare().
+ ASSERT_OK(replayer->Prepare());
+ // Replay using 1 thread, 1x speed.
+ ASSERT_OK(replayer->Replay(ReplayOptions(1, 1.0), res_cb));
+ ASSERT_GT(res_handler.GetAvgLatency(), 0.0);
+ ASSERT_EQ(res_handler.GetNumWrites(), 8);
+ ASSERT_EQ(res_handler.GetNumGets(), 3);
+ ASSERT_EQ(res_handler.GetNumIterSeeks(), 2);
+ ASSERT_EQ(res_handler.GetNumMultiGets(), 0);
+ res_handler.Reset();
+
+ ASSERT_OK(db2->Get(ro, handles[0], "a", &value));
+ ASSERT_EQ("1", value);
+ ASSERT_OK(db2->Get(ro, handles[0], "g", &value));
+ ASSERT_EQ("12", value);
+ ASSERT_TRUE(db2->Get(ro, handles[0], "hello", &value).IsNotFound());
+ ASSERT_TRUE(db2->Get(ro, handles[0], "world", &value).IsNotFound());
+
+ ASSERT_OK(db2->Get(ro, handles[1], "foo", &value));
+ ASSERT_EQ("bar", value);
+ ASSERT_OK(db2->Get(ro, handles[1], "rocksdb", &value));
+ ASSERT_EQ("rocks", value);
+
+ // Re-replay should fail with Status::Incomplete() if Prepare() was not
+ // called. Currently we don't distinguish between unprepared and trace end.
+ ASSERT_TRUE(replayer->Replay(ReplayOptions(), nullptr).IsIncomplete());
+
+ // Re-replay using 2 threads, 2x speed.
+ ASSERT_OK(replayer->Prepare());
+ ASSERT_OK(replayer->Replay(ReplayOptions(2, 2.0), res_cb));
+ ASSERT_GT(res_handler.GetAvgLatency(), 0.0);
+ ASSERT_EQ(res_handler.GetNumWrites(), 8);
+ ASSERT_EQ(res_handler.GetNumGets(), 3);
+ ASSERT_EQ(res_handler.GetNumIterSeeks(), 2);
+ ASSERT_EQ(res_handler.GetNumMultiGets(), 0);
+ res_handler.Reset();
+
+ // Re-replay using 2 threads, 1/2 speed.
+ ASSERT_OK(replayer->Prepare());
+ ASSERT_OK(replayer->Replay(ReplayOptions(2, 0.5), res_cb));
+ ASSERT_GT(res_handler.GetAvgLatency(), 0.0);
+ ASSERT_EQ(res_handler.GetNumWrites(), 8);
+ ASSERT_EQ(res_handler.GetNumGets(), 3);
+ ASSERT_EQ(res_handler.GetNumIterSeeks(), 2);
+ ASSERT_EQ(res_handler.GetNumMultiGets(), 0);
+ res_handler.Reset();
+
+ replayer.reset();
+
+ for (auto handle : handles) {
+ delete handle;
+ }
+ delete db2;
+ ASSERT_OK(DestroyDB(dbname2, options));
+}
+
+TEST_F(DBTest2, TraceAndManualReplay) {
+ Options options = CurrentOptions();
+ options.merge_operator = MergeOperators::CreatePutOperator();
+ ReadOptions ro;
+ WriteOptions wo;
+ TraceOptions trace_opts;
+ EnvOptions env_opts;
+ CreateAndReopenWithCF({"pikachu"}, options);
+ Random rnd(301);
+ Iterator* single_iter = nullptr;
+
+ ASSERT_TRUE(db_->EndTrace().IsIOError());
+
+ std::string trace_filename = dbname_ + "/rocksdb.trace";
+ std::unique_ptr<TraceWriter> trace_writer;
+ ASSERT_OK(NewFileTraceWriter(env_, env_opts, trace_filename, &trace_writer));
+ ASSERT_OK(db_->StartTrace(trace_opts, std::move(trace_writer)));
+
+ ASSERT_OK(Put(0, "a", "1"));
+ ASSERT_OK(Merge(0, "b", "2"));
+ ASSERT_OK(Delete(0, "c"));
+ ASSERT_OK(SingleDelete(0, "d"));
+ ASSERT_OK(db_->DeleteRange(wo, dbfull()->DefaultColumnFamily(), "e", "f"));
+
+ WriteBatch batch;
+ ASSERT_OK(batch.Put("f", "11"));
+ ASSERT_OK(batch.Merge("g", "12"));
+ ASSERT_OK(batch.Delete("h"));
+ ASSERT_OK(batch.SingleDelete("i"));
+ ASSERT_OK(batch.DeleteRange("j", "k"));
+ ASSERT_OK(db_->Write(wo, &batch));
+
+ single_iter = db_->NewIterator(ro);
+ single_iter->Seek("f");
+ single_iter->SeekForPrev("g");
+ ASSERT_OK(single_iter->status());
+ delete single_iter;
+
+ // Write some sequenced keys for testing lower/upper bounds of iterator.
+ batch.Clear();
+ ASSERT_OK(batch.Put("iter-0", "iter-0"));
+ ASSERT_OK(batch.Put("iter-1", "iter-1"));
+ ASSERT_OK(batch.Put("iter-2", "iter-2"));
+ ASSERT_OK(batch.Put("iter-3", "iter-3"));
+ ASSERT_OK(batch.Put("iter-4", "iter-4"));
+ ASSERT_OK(db_->Write(wo, &batch));
+
+ ReadOptions bounded_ro = ro;
+ Slice lower_bound("iter-1");
+ Slice upper_bound("iter-3");
+ bounded_ro.iterate_lower_bound = &lower_bound;
+ bounded_ro.iterate_upper_bound = &upper_bound;
+ single_iter = db_->NewIterator(bounded_ro);
+ single_iter->Seek("iter-0");
+ ASSERT_EQ(single_iter->key().ToString(), "iter-1");
+ single_iter->Seek("iter-2");
+ ASSERT_EQ(single_iter->key().ToString(), "iter-2");
+ single_iter->Seek("iter-4");
+ ASSERT_FALSE(single_iter->Valid());
+ single_iter->SeekForPrev("iter-0");
+ ASSERT_FALSE(single_iter->Valid());
+ single_iter->SeekForPrev("iter-2");
+ ASSERT_EQ(single_iter->key().ToString(), "iter-2");
+ single_iter->SeekForPrev("iter-4");
+ ASSERT_EQ(single_iter->key().ToString(), "iter-2");
+ ASSERT_OK(single_iter->status());
+ delete single_iter;
+
+ ASSERT_EQ("1", Get(0, "a"));
+ ASSERT_EQ("12", Get(0, "g"));
+
+ ASSERT_OK(Put(1, "foo", "bar"));
+ ASSERT_OK(Put(1, "rocksdb", "rocks"));
+ ASSERT_EQ("NOT_FOUND", Get(1, "leveldb"));
+
+ // Same as TraceAndReplay, Write x 8, Get x 3, Seek x 2.
+ // Plus 1 WriteBatch for iterator with lower/upper bounds, and 6
+ // Seek(ForPrev)s.
+ // Total Write x 9, Get x 3, Seek x 8
+ ASSERT_OK(db_->EndTrace());
+  // These should not get into the trace file since they come after EndTrace.
+ ASSERT_OK(Put("hello", "world"));
+ ASSERT_OK(Merge("foo", "bar"));
+
+ // Open another db, replay, and verify the data
+ std::string value;
+ std::string dbname2 = test::PerThreadDBPath(env_, "/db_replay");
+ ASSERT_OK(DestroyDB(dbname2, options));
+
+ // Using a different name than db2, to pacify infer's use-after-lifetime
+ // warnings (http://fbinfer.com).
+ DB* db2_init = nullptr;
+ options.create_if_missing = true;
+ ASSERT_OK(DB::Open(options, dbname2, &db2_init));
+ ColumnFamilyHandle* cf;
+ ASSERT_OK(
+ db2_init->CreateColumnFamily(ColumnFamilyOptions(), "pikachu", &cf));
+ delete cf;
+ delete db2_init;
+
+ DB* db2 = nullptr;
+ std::vector<ColumnFamilyDescriptor> column_families;
+ ColumnFamilyOptions cf_options;
+ cf_options.merge_operator = MergeOperators::CreatePutOperator();
+ column_families.push_back(ColumnFamilyDescriptor("default", cf_options));
+ column_families.push_back(
+ ColumnFamilyDescriptor("pikachu", ColumnFamilyOptions()));
+ std::vector<ColumnFamilyHandle*> handles;
+ DBOptions db_opts;
+ db_opts.env = env_;
+ ASSERT_OK(DB::Open(db_opts, dbname2, column_families, &handles, &db2));
+
+ env_->SleepForMicroseconds(100);
+ // Verify that the keys don't already exist
+ ASSERT_TRUE(db2->Get(ro, handles[0], "a", &value).IsNotFound());
+ ASSERT_TRUE(db2->Get(ro, handles[0], "g", &value).IsNotFound());
+
+ std::unique_ptr<TraceReader> trace_reader;
+ ASSERT_OK(NewFileTraceReader(env_, env_opts, trace_filename, &trace_reader));
+ std::unique_ptr<Replayer> replayer;
+ ASSERT_OK(
+ db2->NewDefaultReplayer(handles, std::move(trace_reader), &replayer));
+
+ TraceExecutionResultHandler res_handler;
+
+  // Manually replay the trace twice. The 2nd run checks that the replay can
+  // restart.
+ std::unique_ptr<TraceRecord> record;
+ std::unique_ptr<TraceRecordResult> result;
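+  // Drive the replay by hand: Prepare(), then pull records one at a time with
+  // Next() and run each through Execute(), skipping record types that are not
+  // supported. For iterator seek records, additionally verify that the
+  // lower/upper bounds recorded at trace time round-trip correctly.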
+ for (int i = 0; i < 2; i++) {
+ // Next should fail if unprepared.
+ ASSERT_TRUE(replayer->Next(nullptr).IsIncomplete());
+ ASSERT_OK(replayer->Prepare());
+ Status s = Status::OK();
+ // Looping until trace end.
+ while (s.ok()) {
+ s = replayer->Next(&record);
+ // Skip unsupported operations.
+ if (s.IsNotSupported()) {
+ continue;
+ }
+ if (s.ok()) {
+ ASSERT_OK(replayer->Execute(record, &result));
+ if (result != nullptr) {
+ ASSERT_OK(result->Accept(&res_handler));
+ if (record->GetTraceType() == kTraceIteratorSeek ||
+ record->GetTraceType() == kTraceIteratorSeekForPrev) {
+ IteratorSeekQueryTraceRecord* iter_rec =
+ dynamic_cast<IteratorSeekQueryTraceRecord*>(record.get());
+ IteratorTraceExecutionResult* iter_res =
+ dynamic_cast<IteratorTraceExecutionResult*>(result.get());
+ // Check if lower/upper bounds are correctly saved and decoded.
+ std::string lower_str = iter_rec->GetLowerBound().ToString();
+ std::string upper_str = iter_rec->GetUpperBound().ToString();
+ std::string iter_key = iter_res->GetKey().ToString();
+ std::string iter_value = iter_res->GetValue().ToString();
+ if (!lower_str.empty() && !upper_str.empty()) {
+ ASSERT_EQ(lower_str, "iter-1");
+ ASSERT_EQ(upper_str, "iter-3");
+ if (iter_res->GetValid()) {
+ // If iterator is valid, then lower_bound <= key < upper_bound.
+ ASSERT_GE(iter_key, lower_str);
+ ASSERT_LT(iter_key, upper_str);
+ } else {
+ // If iterator is invalid, then
+ // key < lower_bound or key >= upper_bound.
+ ASSERT_TRUE(iter_key < lower_str || iter_key >= upper_str);
+ }
+ }
+ // If iterator is invalid, the key and value should be empty.
+ if (!iter_res->GetValid()) {
+ ASSERT_TRUE(iter_key.empty());
+ ASSERT_TRUE(iter_value.empty());
+ }
+ }
+ result.reset();
+ }
+ }
+ }
+    // Status::Incomplete() will be returned when the trace end is reached
+    // during manual reading, or when Prepare() was not called.
+ ASSERT_TRUE(s.IsIncomplete());
+ ASSERT_TRUE(replayer->Next(nullptr).IsIncomplete());
+ ASSERT_GT(res_handler.GetAvgLatency(), 0.0);
+ ASSERT_EQ(res_handler.GetNumWrites(), 9);
+ ASSERT_EQ(res_handler.GetNumGets(), 3);
+ ASSERT_EQ(res_handler.GetNumIterSeeks(), 8);
+ ASSERT_EQ(res_handler.GetNumMultiGets(), 0);
+ res_handler.Reset();
+ }
+
+ ASSERT_OK(db2->Get(ro, handles[0], "a", &value));
+ ASSERT_EQ("1", value);
+ ASSERT_OK(db2->Get(ro, handles[0], "g", &value));
+ ASSERT_EQ("12", value);
+ ASSERT_TRUE(db2->Get(ro, handles[0], "hello", &value).IsNotFound());
+ ASSERT_TRUE(db2->Get(ro, handles[0], "world", &value).IsNotFound());
+
+ ASSERT_OK(db2->Get(ro, handles[1], "foo", &value));
+ ASSERT_EQ("bar", value);
+ ASSERT_OK(db2->Get(ro, handles[1], "rocksdb", &value));
+ ASSERT_EQ("rocks", value);
+
+ // Test execution of artificially created TraceRecords.
+ uint64_t fake_ts = 1U;
+ // Write
+ batch.Clear();
+ ASSERT_OK(batch.Put("trace-record-write1", "write1"));
+ ASSERT_OK(batch.Put("trace-record-write2", "write2"));
+ record.reset(new WriteQueryTraceRecord(batch.Data(), fake_ts++));
+ ASSERT_OK(replayer->Execute(record, &result));
+ ASSERT_TRUE(result != nullptr);
+ ASSERT_OK(result->Accept(&res_handler)); // Write x 1
+ ASSERT_OK(db2->Get(ro, handles[0], "trace-record-write1", &value));
+ ASSERT_EQ("write1", value);
+ ASSERT_OK(db2->Get(ro, handles[0], "trace-record-write2", &value));
+ ASSERT_EQ("write2", value);
+ ASSERT_GT(res_handler.GetAvgLatency(), 0.0);
+ ASSERT_EQ(res_handler.GetNumWrites(), 1);
+ ASSERT_EQ(res_handler.GetNumGets(), 0);
+ ASSERT_EQ(res_handler.GetNumIterSeeks(), 0);
+ ASSERT_EQ(res_handler.GetNumMultiGets(), 0);
+ res_handler.Reset();
+
+ // Get related
+ // Get an existing key.
+ record.reset(new GetQueryTraceRecord(handles[0]->GetID(),
+ "trace-record-write1", fake_ts++));
+ ASSERT_OK(replayer->Execute(record, &result));
+ ASSERT_TRUE(result != nullptr);
+ ASSERT_OK(result->Accept(&res_handler)); // Get x 1
+  // Get a non-existing key; it should still return Status::OK().
+ record.reset(new GetQueryTraceRecord(handles[0]->GetID(), "trace-record-get",
+ fake_ts++));
+ ASSERT_OK(replayer->Execute(record, &result));
+ ASSERT_TRUE(result != nullptr);
+ ASSERT_OK(result->Accept(&res_handler)); // Get x 2
+ // Get from an invalid (non-existing) cf_id.
+ uint32_t invalid_cf_id = handles[1]->GetID() + 1;
+ record.reset(new GetQueryTraceRecord(invalid_cf_id, "whatever", fake_ts++));
+ ASSERT_TRUE(replayer->Execute(record, &result).IsCorruption());
+ ASSERT_TRUE(result == nullptr);
+ ASSERT_GT(res_handler.GetAvgLatency(), 0.0);
+ ASSERT_EQ(res_handler.GetNumWrites(), 0);
+ ASSERT_EQ(res_handler.GetNumGets(), 2);
+ ASSERT_EQ(res_handler.GetNumIterSeeks(), 0);
+ ASSERT_EQ(res_handler.GetNumMultiGets(), 0);
+ res_handler.Reset();
+
+ // Iteration related
+ for (IteratorSeekQueryTraceRecord::SeekType seekType :
+ {IteratorSeekQueryTraceRecord::kSeek,
+ IteratorSeekQueryTraceRecord::kSeekForPrev}) {
+ // Seek to an existing key.
+ record.reset(new IteratorSeekQueryTraceRecord(
+ seekType, handles[0]->GetID(), "trace-record-write1", fake_ts++));
+ ASSERT_OK(replayer->Execute(record, &result));
+ ASSERT_TRUE(result != nullptr);
+ ASSERT_OK(result->Accept(&res_handler)); // Seek x 1 in one iteration
+    // Seek to a non-existing key; it should still return Status::OK().
+ record.reset(new IteratorSeekQueryTraceRecord(
+ seekType, handles[0]->GetID(), "trace-record-get", fake_ts++));
+ ASSERT_OK(replayer->Execute(record, &result));
+ ASSERT_TRUE(result != nullptr);
+ ASSERT_OK(result->Accept(&res_handler)); // Seek x 2 in one iteration
+ // Seek from an invalid cf_id.
+ record.reset(new IteratorSeekQueryTraceRecord(seekType, invalid_cf_id,
+ "whatever", fake_ts++));
+ ASSERT_TRUE(replayer->Execute(record, &result).IsCorruption());
+ ASSERT_TRUE(result == nullptr);
+ }
+ ASSERT_GT(res_handler.GetAvgLatency(), 0.0);
+ ASSERT_EQ(res_handler.GetNumWrites(), 0);
+ ASSERT_EQ(res_handler.GetNumGets(), 0);
+ ASSERT_EQ(res_handler.GetNumIterSeeks(), 4); // Seek x 2 in two iterations
+ ASSERT_EQ(res_handler.GetNumMultiGets(), 0);
+ res_handler.Reset();
+
+ // MultiGet related
+ // Get existing keys.
+ record.reset(new MultiGetQueryTraceRecord(
+ std::vector<uint32_t>({handles[0]->GetID(), handles[1]->GetID()}),
+ std::vector<std::string>({"a", "foo"}), fake_ts++));
+ ASSERT_OK(replayer->Execute(record, &result));
+ ASSERT_TRUE(result != nullptr);
+ ASSERT_OK(result->Accept(&res_handler)); // MultiGet x 1
+ // Get all non-existing keys, should still return Status::OK().
+ record.reset(new MultiGetQueryTraceRecord(
+ std::vector<uint32_t>({handles[0]->GetID(), handles[1]->GetID()}),
+ std::vector<std::string>({"no1", "no2"}), fake_ts++));
+ ASSERT_OK(replayer->Execute(record, &result));
+ ASSERT_TRUE(result != nullptr);
+ ASSERT_OK(result->Accept(&res_handler)); // MultiGet x 2
+  // Get a mix of existing and non-existing keys; it should still return
+  // Status::OK().
+ record.reset(new MultiGetQueryTraceRecord(
+ std::vector<uint32_t>({handles[0]->GetID(), handles[1]->GetID()}),
+ std::vector<std::string>({"a", "no2"}), fake_ts++));
+ ASSERT_OK(replayer->Execute(record, &result));
+ ASSERT_TRUE(result != nullptr);
+ MultiValuesTraceExecutionResult* mvr =
+ dynamic_cast<MultiValuesTraceExecutionResult*>(result.get());
+ ASSERT_TRUE(mvr != nullptr);
+ ASSERT_OK(mvr->GetMultiStatus()[0]);
+ ASSERT_TRUE(mvr->GetMultiStatus()[1].IsNotFound());
+ ASSERT_EQ(mvr->GetValues()[0], "1");
+ ASSERT_EQ(mvr->GetValues()[1], "");
+ ASSERT_OK(result->Accept(&res_handler)); // MultiGet x 3
+ // Get from an invalid (non-existing) cf_id.
+ record.reset(new MultiGetQueryTraceRecord(
+ std::vector<uint32_t>(
+ {handles[0]->GetID(), handles[1]->GetID(), invalid_cf_id}),
+ std::vector<std::string>({"a", "foo", "whatever"}), fake_ts++));
+ ASSERT_TRUE(replayer->Execute(record, &result).IsCorruption());
+ ASSERT_TRUE(result == nullptr);
+ // Empty MultiGet
+ record.reset(new MultiGetQueryTraceRecord(
+ std::vector<uint32_t>(), std::vector<std::string>(), fake_ts++));
+ ASSERT_TRUE(replayer->Execute(record, &result).IsInvalidArgument());
+ ASSERT_TRUE(result == nullptr);
+ // MultiGet size mismatch
+ record.reset(new MultiGetQueryTraceRecord(
+ std::vector<uint32_t>({handles[0]->GetID(), handles[1]->GetID()}),
+ std::vector<std::string>({"a"}), fake_ts++));
+ ASSERT_TRUE(replayer->Execute(record, &result).IsInvalidArgument());
+ ASSERT_TRUE(result == nullptr);
+ ASSERT_GT(res_handler.GetAvgLatency(), 0.0);
+ ASSERT_EQ(res_handler.GetNumWrites(), 0);
+ ASSERT_EQ(res_handler.GetNumGets(), 0);
+ ASSERT_EQ(res_handler.GetNumIterSeeks(), 0);
+ ASSERT_EQ(res_handler.GetNumMultiGets(), 3);
+ res_handler.Reset();
+
+ replayer.reset();
+
+ for (auto handle : handles) {
+ delete handle;
+ }
+ delete db2;
+ ASSERT_OK(DestroyDB(dbname2, options));
+}
+
+TEST_F(DBTest2, TraceWithLimit) {
+ Options options = CurrentOptions();
+ options.merge_operator = MergeOperators::CreatePutOperator();
+ ReadOptions ro;
+ WriteOptions wo;
+ TraceOptions trace_opts;
+ EnvOptions env_opts;
+ CreateAndReopenWithCF({"pikachu"}, options);
+ Random rnd(301);
+
+ // test the max trace file size options
+ trace_opts.max_trace_file_size = 5;
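+  // With a 5-byte cap, the trace file reaches its size limit right away, so
+  // the Puts below are not recorded; the checks after Replay() further down
+  // verify that nothing gets restored into db2.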
+ std::string trace_filename = dbname_ + "/rocksdb.trace1";
+ std::unique_ptr<TraceWriter> trace_writer;
+ ASSERT_OK(NewFileTraceWriter(env_, env_opts, trace_filename, &trace_writer));
+ ASSERT_OK(db_->StartTrace(trace_opts, std::move(trace_writer)));
+ ASSERT_OK(Put(0, "a", "1"));
+ ASSERT_OK(Put(0, "b", "1"));
+ ASSERT_OK(Put(0, "c", "1"));
+ ASSERT_OK(db_->EndTrace());
+
+ std::string dbname2 = test::PerThreadDBPath(env_, "/db_replay2");
+ std::string value;
+ ASSERT_OK(DestroyDB(dbname2, options));
+
+ // Using a different name than db2, to pacify infer's use-after-lifetime
+ // warnings (http://fbinfer.com).
+ DB* db2_init = nullptr;
+ options.create_if_missing = true;
+ ASSERT_OK(DB::Open(options, dbname2, &db2_init));
+ ColumnFamilyHandle* cf;
+ ASSERT_OK(
+ db2_init->CreateColumnFamily(ColumnFamilyOptions(), "pikachu", &cf));
+ delete cf;
+ delete db2_init;
+
+ DB* db2 = nullptr;
+ std::vector<ColumnFamilyDescriptor> column_families;
+ ColumnFamilyOptions cf_options;
+ cf_options.merge_operator = MergeOperators::CreatePutOperator();
+ column_families.push_back(ColumnFamilyDescriptor("default", cf_options));
+ column_families.push_back(
+ ColumnFamilyDescriptor("pikachu", ColumnFamilyOptions()));
+ std::vector<ColumnFamilyHandle*> handles;
+ DBOptions db_opts;
+ db_opts.env = env_;
+ ASSERT_OK(DB::Open(db_opts, dbname2, column_families, &handles, &db2));
+
+ env_->SleepForMicroseconds(100);
+ // Verify that the keys don't already exist
+ ASSERT_TRUE(db2->Get(ro, handles[0], "a", &value).IsNotFound());
+ ASSERT_TRUE(db2->Get(ro, handles[0], "b", &value).IsNotFound());
+ ASSERT_TRUE(db2->Get(ro, handles[0], "c", &value).IsNotFound());
+
+ std::unique_ptr<TraceReader> trace_reader;
+ ASSERT_OK(NewFileTraceReader(env_, env_opts, trace_filename, &trace_reader));
+ std::unique_ptr<Replayer> replayer;
+ ASSERT_OK(
+ db2->NewDefaultReplayer(handles, std::move(trace_reader), &replayer));
+ ASSERT_OK(replayer->Prepare());
+ ASSERT_OK(replayer->Replay(ReplayOptions(), nullptr));
+ replayer.reset();
+
+ ASSERT_TRUE(db2->Get(ro, handles[0], "a", &value).IsNotFound());
+ ASSERT_TRUE(db2->Get(ro, handles[0], "b", &value).IsNotFound());
+ ASSERT_TRUE(db2->Get(ro, handles[0], "c", &value).IsNotFound());
+
+ for (auto handle : handles) {
+ delete handle;
+ }
+ delete db2;
+ ASSERT_OK(DestroyDB(dbname2, options));
+}
+
+TEST_F(DBTest2, TraceWithSampling) {
+ Options options = CurrentOptions();
+ ReadOptions ro;
+ WriteOptions wo;
+ TraceOptions trace_opts;
+ EnvOptions env_opts;
+ CreateAndReopenWithCF({"pikachu"}, options);
+ Random rnd(301);
+
+ // test the trace file sampling options
+ trace_opts.sampling_frequency = 2;
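+  // A sampling frequency of 2 records only every other operation; the checks
+  // after Replay() further down expect the sampled Puts ("b" and "d") to be
+  // restored while the skipped ones remain NotFound.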
+ std::string trace_filename = dbname_ + "/rocksdb.trace_sampling";
+ std::unique_ptr<TraceWriter> trace_writer;
+ ASSERT_OK(NewFileTraceWriter(env_, env_opts, trace_filename, &trace_writer));
+ ASSERT_OK(db_->StartTrace(trace_opts, std::move(trace_writer)));
+ ASSERT_OK(Put(0, "a", "1"));
+ ASSERT_OK(Put(0, "b", "2"));
+ ASSERT_OK(Put(0, "c", "3"));
+ ASSERT_OK(Put(0, "d", "4"));
+ ASSERT_OK(Put(0, "e", "5"));
+ ASSERT_OK(db_->EndTrace());
+
+ std::string dbname2 = test::PerThreadDBPath(env_, "/db_replay_sampling");
+ std::string value;
+ ASSERT_OK(DestroyDB(dbname2, options));
+
+ // Using a different name than db2, to pacify infer's use-after-lifetime
+ // warnings (http://fbinfer.com).
+ DB* db2_init = nullptr;
+ options.create_if_missing = true;
+ ASSERT_OK(DB::Open(options, dbname2, &db2_init));
+ ColumnFamilyHandle* cf;
+ ASSERT_OK(
+ db2_init->CreateColumnFamily(ColumnFamilyOptions(), "pikachu", &cf));
+ delete cf;
+ delete db2_init;
+
+ DB* db2 = nullptr;
+ std::vector<ColumnFamilyDescriptor> column_families;
+ ColumnFamilyOptions cf_options;
+ column_families.push_back(ColumnFamilyDescriptor("default", cf_options));
+ column_families.push_back(
+ ColumnFamilyDescriptor("pikachu", ColumnFamilyOptions()));
+ std::vector<ColumnFamilyHandle*> handles;
+ DBOptions db_opts;
+ db_opts.env = env_;
+ ASSERT_OK(DB::Open(db_opts, dbname2, column_families, &handles, &db2));
+
+ env_->SleepForMicroseconds(100);
+ ASSERT_TRUE(db2->Get(ro, handles[0], "a", &value).IsNotFound());
+ ASSERT_TRUE(db2->Get(ro, handles[0], "b", &value).IsNotFound());
+ ASSERT_TRUE(db2->Get(ro, handles[0], "c", &value).IsNotFound());
+ ASSERT_TRUE(db2->Get(ro, handles[0], "d", &value).IsNotFound());
+ ASSERT_TRUE(db2->Get(ro, handles[0], "e", &value).IsNotFound());
+
+ std::unique_ptr<TraceReader> trace_reader;
+ ASSERT_OK(NewFileTraceReader(env_, env_opts, trace_filename, &trace_reader));
+ std::unique_ptr<Replayer> replayer;
+ ASSERT_OK(
+ db2->NewDefaultReplayer(handles, std::move(trace_reader), &replayer));
+ ASSERT_OK(replayer->Prepare());
+ ASSERT_OK(replayer->Replay(ReplayOptions(), nullptr));
+ replayer.reset();
+
+ ASSERT_TRUE(db2->Get(ro, handles[0], "a", &value).IsNotFound());
+ ASSERT_FALSE(db2->Get(ro, handles[0], "b", &value).IsNotFound());
+ ASSERT_TRUE(db2->Get(ro, handles[0], "c", &value).IsNotFound());
+ ASSERT_FALSE(db2->Get(ro, handles[0], "d", &value).IsNotFound());
+ ASSERT_TRUE(db2->Get(ro, handles[0], "e", &value).IsNotFound());
+
+ for (auto handle : handles) {
+ delete handle;
+ }
+ delete db2;
+ ASSERT_OK(DestroyDB(dbname2, options));
+}
+
+TEST_F(DBTest2, TraceWithFilter) {
+ Options options = CurrentOptions();
+ options.merge_operator = MergeOperators::CreatePutOperator();
+ ReadOptions ro;
+ WriteOptions wo;
+ TraceOptions trace_opts;
+ EnvOptions env_opts;
+ CreateAndReopenWithCF({"pikachu"}, options);
+ Random rnd(301);
+ Iterator* single_iter = nullptr;
+
+ trace_opts.filter = TraceFilterType::kTraceFilterWrite;
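+  // Filter write operations out of the trace. Only the reads below get
+  // recorded, so replaying into db2 is expected to restore none of the
+  // key-values.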
+
+ std::string trace_filename = dbname_ + "/rocksdb.trace";
+ std::unique_ptr<TraceWriter> trace_writer;
+ ASSERT_OK(NewFileTraceWriter(env_, env_opts, trace_filename, &trace_writer));
+ ASSERT_OK(db_->StartTrace(trace_opts, std::move(trace_writer)));
+
+ ASSERT_OK(Put(0, "a", "1"));
+ ASSERT_OK(Merge(0, "b", "2"));
+ ASSERT_OK(Delete(0, "c"));
+ ASSERT_OK(SingleDelete(0, "d"));
+ ASSERT_OK(db_->DeleteRange(wo, dbfull()->DefaultColumnFamily(), "e", "f"));
+
+ WriteBatch batch;
+ ASSERT_OK(batch.Put("f", "11"));
+ ASSERT_OK(batch.Merge("g", "12"));
+ ASSERT_OK(batch.Delete("h"));
+ ASSERT_OK(batch.SingleDelete("i"));
+ ASSERT_OK(batch.DeleteRange("j", "k"));
+ ASSERT_OK(db_->Write(wo, &batch));
+
+ single_iter = db_->NewIterator(ro);
+ single_iter->Seek("f");
+ single_iter->SeekForPrev("g");
+ delete single_iter;
+
+ ASSERT_EQ("1", Get(0, "a"));
+ ASSERT_EQ("12", Get(0, "g"));
+
+ ASSERT_OK(Put(1, "foo", "bar"));
+ ASSERT_OK(Put(1, "rocksdb", "rocks"));
+ ASSERT_EQ("NOT_FOUND", Get(1, "leveldb"));
+
+ ASSERT_OK(db_->EndTrace());
+  // These should not get into the trace file since they come after EndTrace.
+ ASSERT_OK(Put("hello", "world"));
+ ASSERT_OK(Merge("foo", "bar"));
+
+ // Open another db, replay, and verify the data
+ std::string value;
+ std::string dbname2 = test::PerThreadDBPath(env_, "db_replay");
+ ASSERT_OK(DestroyDB(dbname2, options));
+
+ // Using a different name than db2, to pacify infer's use-after-lifetime
+ // warnings (http://fbinfer.com).
+ DB* db2_init = nullptr;
+ options.create_if_missing = true;
+ ASSERT_OK(DB::Open(options, dbname2, &db2_init));
+ ColumnFamilyHandle* cf;
+ ASSERT_OK(
+ db2_init->CreateColumnFamily(ColumnFamilyOptions(), "pikachu", &cf));
+ delete cf;
+ delete db2_init;
+
+ DB* db2 = nullptr;
+ std::vector<ColumnFamilyDescriptor> column_families;
+ ColumnFamilyOptions cf_options;
+ cf_options.merge_operator = MergeOperators::CreatePutOperator();
+ column_families.push_back(ColumnFamilyDescriptor("default", cf_options));
+ column_families.push_back(
+ ColumnFamilyDescriptor("pikachu", ColumnFamilyOptions()));
+ std::vector<ColumnFamilyHandle*> handles;
+ DBOptions db_opts;
+ db_opts.env = env_;
+ ASSERT_OK(DB::Open(db_opts, dbname2, column_families, &handles, &db2));
+
+ env_->SleepForMicroseconds(100);
+ // Verify that the keys don't already exist
+ ASSERT_TRUE(db2->Get(ro, handles[0], "a", &value).IsNotFound());
+ ASSERT_TRUE(db2->Get(ro, handles[0], "g", &value).IsNotFound());
+
+ std::unique_ptr<TraceReader> trace_reader;
+ ASSERT_OK(NewFileTraceReader(env_, env_opts, trace_filename, &trace_reader));
+ std::unique_ptr<Replayer> replayer;
+ ASSERT_OK(
+ db2->NewDefaultReplayer(handles, std::move(trace_reader), &replayer));
+ ASSERT_OK(replayer->Prepare());
+ ASSERT_OK(replayer->Replay(ReplayOptions(), nullptr));
+ replayer.reset();
+
+  // None of the key-values should be present since we filtered out the
+  // WRITE ops.
+ ASSERT_TRUE(db2->Get(ro, handles[0], "a", &value).IsNotFound());
+ ASSERT_TRUE(db2->Get(ro, handles[0], "g", &value).IsNotFound());
+ ASSERT_TRUE(db2->Get(ro, handles[0], "hello", &value).IsNotFound());
+ ASSERT_TRUE(db2->Get(ro, handles[0], "world", &value).IsNotFound());
+ ASSERT_TRUE(db2->Get(ro, handles[0], "foo", &value).IsNotFound());
+ ASSERT_TRUE(db2->Get(ro, handles[0], "rocksdb", &value).IsNotFound());
+
+ for (auto handle : handles) {
+ delete handle;
+ }
+ delete db2;
+ ASSERT_OK(DestroyDB(dbname2, options));
+
+ // Set up a new db.
+ std::string dbname3 = test::PerThreadDBPath(env_, "db_not_trace_read");
+ ASSERT_OK(DestroyDB(dbname3, options));
+
+ DB* db3_init = nullptr;
+ options.create_if_missing = true;
+ ColumnFamilyHandle* cf3;
+ ASSERT_OK(DB::Open(options, dbname3, &db3_init));
+ ASSERT_OK(
+ db3_init->CreateColumnFamily(ColumnFamilyOptions(), "pikachu", &cf3));
+ delete cf3;
+ delete db3_init;
+
+ column_families.clear();
+ column_families.push_back(ColumnFamilyDescriptor("default", cf_options));
+ column_families.push_back(
+ ColumnFamilyDescriptor("pikachu", ColumnFamilyOptions()));
+ handles.clear();
+
+ DB* db3 = nullptr;
+ ASSERT_OK(DB::Open(db_opts, dbname3, column_families, &handles, &db3));
+
+ env_->SleepForMicroseconds(100);
+ // Verify that the keys don't already exist
+ ASSERT_TRUE(db3->Get(ro, handles[0], "a", &value).IsNotFound());
+ ASSERT_TRUE(db3->Get(ro, handles[0], "g", &value).IsNotFound());
+
+ // The tracer will not record the READ ops.
+ trace_opts.filter = TraceFilterType::kTraceFilterGet;
+ std::string trace_filename3 = dbname_ + "/rocksdb.trace_3";
+ std::unique_ptr<TraceWriter> trace_writer3;
+ ASSERT_OK(
+ NewFileTraceWriter(env_, env_opts, trace_filename3, &trace_writer3));
+ ASSERT_OK(db3->StartTrace(trace_opts, std::move(trace_writer3)));
+
+ ASSERT_OK(db3->Put(wo, handles[0], "a", "1"));
+ ASSERT_OK(db3->Merge(wo, handles[0], "b", "2"));
+ ASSERT_OK(db3->Delete(wo, handles[0], "c"));
+ ASSERT_OK(db3->SingleDelete(wo, handles[0], "d"));
+
+ ASSERT_OK(db3->Get(ro, handles[0], "a", &value));
+ ASSERT_EQ(value, "1");
+ ASSERT_TRUE(db3->Get(ro, handles[0], "c", &value).IsNotFound());
+
+ ASSERT_OK(db3->EndTrace());
+
+ for (auto handle : handles) {
+ delete handle;
+ }
+ delete db3;
+ ASSERT_OK(DestroyDB(dbname3, options));
+
+ std::unique_ptr<TraceReader> trace_reader3;
+ ASSERT_OK(
+ NewFileTraceReader(env_, env_opts, trace_filename3, &trace_reader3));
+
+  // Count the number of records in the trace file.
+ int count = 0;
+ std::string data;
+ Status s;
+ while (true) {
+ s = trace_reader3->Read(&data);
+ if (!s.ok()) {
+ break;
+ }
+ count += 1;
+ }
+ // We also need to count the header and footer
+ // 4 WRITE + HEADER + FOOTER = 6
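+  // (The 4 writes are the Put/Merge/Delete/SingleDelete issued on db3 above;
+  // the two Gets were dropped by kTraceFilterGet.)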
+ ASSERT_EQ(count, 6);
+}
+
+#endif // ROCKSDB_LITE
+
+TEST_F(DBTest2, PinnableSliceAndMmapReads) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ if (!IsMemoryMappedAccessSupported()) {
+ ROCKSDB_GTEST_SKIP("Test requires default environment");
+ return;
+ }
+ options.allow_mmap_reads = true;
+ options.max_open_files = 100;
+ options.compression = kNoCompression;
+ Reopen(options);
+
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Flush());
+
+ PinnableSlice pinned_value;
+ ASSERT_EQ(Get("foo", &pinned_value), Status::OK());
+ // It is not safe to pin mmap files as they might disappear by compaction
+ ASSERT_FALSE(pinned_value.IsPinned());
+ ASSERT_EQ(pinned_value.ToString(), "bar");
+
+ ASSERT_OK(dbfull()->TEST_CompactRange(
+ 0 /* level */, nullptr /* begin */, nullptr /* end */,
+ nullptr /* column_family */, true /* disallow_trivial_move */));
+
+ // Ensure pinned_value doesn't rely on memory munmap'd by the above
+ // compaction. It crashes if it does.
+ ASSERT_EQ(pinned_value.ToString(), "bar");
+
+#ifndef ROCKSDB_LITE
+ pinned_value.Reset();
+ // Unsafe to pin mmap files when they could be kicked out of table cache
+ Close();
+ ASSERT_OK(ReadOnlyReopen(options));
+ ASSERT_EQ(Get("foo", &pinned_value), Status::OK());
+ ASSERT_FALSE(pinned_value.IsPinned());
+ ASSERT_EQ(pinned_value.ToString(), "bar");
+
+ pinned_value.Reset();
+  // In read-only mode with infinite table cache capacity, it should pin the
+  // value and avoid the memcpy.
+ Close();
+ options.max_open_files = -1;
+ ASSERT_OK(ReadOnlyReopen(options));
+ ASSERT_EQ(Get("foo", &pinned_value), Status::OK());
+ ASSERT_TRUE(pinned_value.IsPinned());
+ ASSERT_EQ(pinned_value.ToString(), "bar");
+#endif
+}
+
+TEST_F(DBTest2, DISABLED_IteratorPinnedMemory) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ BlockBasedTableOptions bbto;
+ bbto.no_block_cache = false;
+ bbto.cache_index_and_filter_blocks = false;
+ bbto.block_cache = NewLRUCache(100000);
+ bbto.block_size = 400; // small block size
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ Reopen(options);
+
+ Random rnd(301);
+ std::string v = rnd.RandomString(400);
+
+ // Since v is the size of a block, each key should take a block
+ // of 400+ bytes.
+ ASSERT_OK(Put("1", v));
+ ASSERT_OK(Put("3", v));
+ ASSERT_OK(Put("5", v));
+ ASSERT_OK(Put("7", v));
+ ASSERT_OK(Flush());
+
+ ASSERT_EQ(0, bbto.block_cache->GetPinnedUsage());
+
+  // Verify that iterators don't pin more than one data block in the block
+  // cache at a time.
+ {
+ std::unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions()));
+ iter->SeekToFirst();
+
+ for (int i = 0; i < 4; i++) {
+ ASSERT_TRUE(iter->Valid());
+ // Block cache should contain exactly one block.
+ ASSERT_GT(bbto.block_cache->GetPinnedUsage(), 0);
+ ASSERT_LT(bbto.block_cache->GetPinnedUsage(), 800);
+ iter->Next();
+ }
+ ASSERT_FALSE(iter->Valid());
+
+ iter->Seek("4");
+ ASSERT_TRUE(iter->Valid());
+
+ ASSERT_GT(bbto.block_cache->GetPinnedUsage(), 0);
+ ASSERT_LT(bbto.block_cache->GetPinnedUsage(), 800);
+
+ iter->Seek("3");
+ ASSERT_TRUE(iter->Valid());
+
+ ASSERT_OK(iter->status());
+
+ ASSERT_GT(bbto.block_cache->GetPinnedUsage(), 0);
+ ASSERT_LT(bbto.block_cache->GetPinnedUsage(), 800);
+ }
+ ASSERT_EQ(0, bbto.block_cache->GetPinnedUsage());
+
+ // Test compaction case
+ ASSERT_OK(Put("2", v));
+ ASSERT_OK(Put("5", v));
+ ASSERT_OK(Put("6", v));
+ ASSERT_OK(Put("8", v));
+ ASSERT_OK(Flush());
+
+ // Clear existing data in block cache
+ bbto.block_cache->SetCapacity(0);
+ bbto.block_cache->SetCapacity(100000);
+
+  // Verify that compaction input iterators don't hold more than one data
+  // block at a time.
+ std::atomic<bool> finished(false);
+ std::atomic<int> block_newed(0);
+ std::atomic<int> block_destroyed(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "Block::Block:0", [&](void* /*arg*/) {
+ if (finished) {
+ return;
+ }
+ // Two iterators. At most 2 outstanding blocks.
+ EXPECT_GE(block_newed.load(), block_destroyed.load());
+ EXPECT_LE(block_newed.load(), block_destroyed.load() + 1);
+ block_newed.fetch_add(1);
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "Block::~Block", [&](void* /*arg*/) {
+ if (finished) {
+ return;
+ }
+ // Two iterators. At most 2 outstanding blocks.
+ EXPECT_GE(block_newed.load(), block_destroyed.load() + 1);
+ EXPECT_LE(block_newed.load(), block_destroyed.load() + 2);
+ block_destroyed.fetch_add(1);
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "CompactionJob::Run:BeforeVerify",
+ [&](void* /*arg*/) { finished = true; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+ // Two input files. Each of them has 4 data blocks.
+ ASSERT_EQ(8, block_newed.load());
+ ASSERT_EQ(8, block_destroyed.load());
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBTest2, TestBBTTailPrefetch) {
+ std::atomic<bool> called(false);
+ size_t expected_lower_bound = 512 * 1024;
+ size_t expected_higher_bound = 512 * 1024;
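+  // Bounds for the tail prefetch size reported by BlockBasedTable::Open. The
+  // first table open below has no prefetch history, so the test expects the
+  // fixed 512KB default; later opens are expected to shrink toward the small
+  // tails actually observed (0..8KB for these tiny files).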
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "BlockBasedTable::Open::TailPrefetchLen", [&](void* arg) {
+ size_t* prefetch_size = static_cast<size_t*>(arg);
+ EXPECT_LE(expected_lower_bound, *prefetch_size);
+ EXPECT_GE(expected_higher_bound, *prefetch_size);
+ called = true;
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(Put("1", "1"));
+ ASSERT_OK(Put("9", "1"));
+ ASSERT_OK(Flush());
+
+ expected_lower_bound = 0;
+ expected_higher_bound = 8 * 1024;
+
+ ASSERT_OK(Put("1", "1"));
+ ASSERT_OK(Put("9", "1"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put("1", "1"));
+ ASSERT_OK(Put("9", "1"));
+ ASSERT_OK(Flush());
+
+ // Full compaction to make sure there is no L0 file after the open.
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+ ASSERT_TRUE(called.load());
+ called = false;
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+
+ std::atomic<bool> first_call(true);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "BlockBasedTable::Open::TailPrefetchLen", [&](void* arg) {
+ size_t* prefetch_size = static_cast<size_t*>(arg);
+ if (first_call) {
+ EXPECT_EQ(4 * 1024, *prefetch_size);
+ first_call = false;
+ } else {
+ EXPECT_GE(4 * 1024, *prefetch_size);
+ }
+ called = true;
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Options options = CurrentOptions();
+ options.max_file_opening_threads = 1; // one thread
+ BlockBasedTableOptions table_options;
+ table_options.cache_index_and_filter_blocks = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ options.max_open_files = -1;
+ Reopen(options);
+
+ ASSERT_OK(Put("1", "1"));
+ ASSERT_OK(Put("9", "1"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put("1", "1"));
+ ASSERT_OK(Put("9", "1"));
+ ASSERT_OK(Flush());
+
+ ASSERT_TRUE(called.load());
+ called = false;
+
+ // Parallel loading SST files
+ options.max_file_opening_threads = 16;
+ Reopen(options);
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+ ASSERT_TRUE(called.load());
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_F(DBTest2, TestGetColumnFamilyHandleUnlocked) {
+ // Setup sync point dependency to reproduce the race condition of
+ // DBImpl::GetColumnFamilyHandleUnlocked
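+  // Enforced ordering: thread 1 looks up handle 0, then thread 2 looks up
+  // handle 1, and only then does thread 1 re-read the ID of the handle it was
+  // given. If the unlocked lookup handed out state that the second lookup
+  // overwrote, the final check would observe the wrong ID.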
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ {"TestGetColumnFamilyHandleUnlocked::GetColumnFamilyHandleUnlocked1",
+ "TestGetColumnFamilyHandleUnlocked::PreGetColumnFamilyHandleUnlocked2"},
+ {"TestGetColumnFamilyHandleUnlocked::GetColumnFamilyHandleUnlocked2",
+ "TestGetColumnFamilyHandleUnlocked::ReadColumnFamilyHandle1"},
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ CreateColumnFamilies({"test1", "test2"}, Options());
+ ASSERT_EQ(handles_.size(), 2);
+
+ DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
+ port::Thread user_thread1([&]() {
+ auto cfh = dbi->GetColumnFamilyHandleUnlocked(handles_[0]->GetID());
+ ASSERT_EQ(cfh->GetID(), handles_[0]->GetID());
+ TEST_SYNC_POINT(
+ "TestGetColumnFamilyHandleUnlocked::GetColumnFamilyHandleUnlocked1");
+ TEST_SYNC_POINT(
+ "TestGetColumnFamilyHandleUnlocked::ReadColumnFamilyHandle1");
+ ASSERT_EQ(cfh->GetID(), handles_[0]->GetID());
+ });
+
+ port::Thread user_thread2([&]() {
+ TEST_SYNC_POINT(
+ "TestGetColumnFamilyHandleUnlocked::PreGetColumnFamilyHandleUnlocked2");
+ auto cfh = dbi->GetColumnFamilyHandleUnlocked(handles_[1]->GetID());
+ ASSERT_EQ(cfh->GetID(), handles_[1]->GetID());
+ TEST_SYNC_POINT(
+ "TestGetColumnFamilyHandleUnlocked::GetColumnFamilyHandleUnlocked2");
+ ASSERT_EQ(cfh->GetID(), handles_[1]->GetID());
+ });
+
+ user_thread1.join();
+ user_thread2.join();
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest2, TestCompactFiles) {
+  // Setup sync point dependency to reproduce the race condition between
+  // CompactFiles() and IngestExternalFile().
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ {"TestCompactFiles::IngestExternalFile1",
+ "TestCompactFiles::IngestExternalFile2"},
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ Options options;
+ options.env = env_;
+ options.num_levels = 2;
+ options.disable_auto_compactions = true;
+ Reopen(options);
+ auto* handle = db_->DefaultColumnFamily();
+ ASSERT_EQ(db_->NumberLevels(handle), 2);
+
+ ROCKSDB_NAMESPACE::SstFileWriter sst_file_writer{
+ ROCKSDB_NAMESPACE::EnvOptions(), options};
+ std::string external_file1 = dbname_ + "/test_compact_files1.sst_t";
+ std::string external_file2 = dbname_ + "/test_compact_files2.sst_t";
+ std::string external_file3 = dbname_ + "/test_compact_files3.sst_t";
+
+ ASSERT_OK(sst_file_writer.Open(external_file1));
+ ASSERT_OK(sst_file_writer.Put("1", "1"));
+ ASSERT_OK(sst_file_writer.Put("2", "2"));
+ ASSERT_OK(sst_file_writer.Finish());
+
+ ASSERT_OK(sst_file_writer.Open(external_file2));
+ ASSERT_OK(sst_file_writer.Put("3", "3"));
+ ASSERT_OK(sst_file_writer.Put("4", "4"));
+ ASSERT_OK(sst_file_writer.Finish());
+
+ ASSERT_OK(sst_file_writer.Open(external_file3));
+ ASSERT_OK(sst_file_writer.Put("5", "5"));
+ ASSERT_OK(sst_file_writer.Put("6", "6"));
+ ASSERT_OK(sst_file_writer.Finish());
+
+ ASSERT_OK(db_->IngestExternalFile(handle, {external_file1, external_file3},
+ IngestExternalFileOptions()));
+ ASSERT_EQ(NumTableFilesAtLevel(1, 0), 2);
+ std::vector<std::string> files;
+ GetSstFiles(env_, dbname_, &files);
+ ASSERT_EQ(files.size(), 2);
+
+ Status user_thread1_status;
+ port::Thread user_thread1([&]() {
+ user_thread1_status =
+ db_->CompactFiles(CompactionOptions(), handle, files, 1);
+ });
+
+ Status user_thread2_status;
+ port::Thread user_thread2([&]() {
+ user_thread2_status = db_->IngestExternalFile(handle, {external_file2},
+ IngestExternalFileOptions());
+ TEST_SYNC_POINT("TestCompactFiles::IngestExternalFile1");
+ });
+
+ user_thread1.join();
+ user_thread2.join();
+
+ ASSERT_OK(user_thread1_status);
+ ASSERT_OK(user_thread2_status);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+#endif // ROCKSDB_LITE
+
+TEST_F(DBTest2, MultiDBParallelOpenTest) {
+ const int kNumDbs = 2;
+ Options options = CurrentOptions();
+ std::vector<std::string> dbnames;
+ for (int i = 0; i < kNumDbs; ++i) {
+ dbnames.emplace_back(test::PerThreadDBPath(env_, "db" + std::to_string(i)));
+ ASSERT_OK(DestroyDB(dbnames.back(), options));
+ }
+
+ // Verify empty DBs can be created in parallel
+ std::vector<std::thread> open_threads;
+ std::vector<DB*> dbs{static_cast<unsigned int>(kNumDbs), nullptr};
+ options.create_if_missing = true;
+ for (int i = 0; i < kNumDbs; ++i) {
+ open_threads.emplace_back(
+ [&](int dbnum) {
+ ASSERT_OK(DB::Open(options, dbnames[dbnum], &dbs[dbnum]));
+ },
+ i);
+ }
+
+ // Now add some data and close, so next we can verify non-empty DBs can be
+ // recovered in parallel
+ for (int i = 0; i < kNumDbs; ++i) {
+ open_threads[i].join();
+ ASSERT_OK(dbs[i]->Put(WriteOptions(), "xi", "gua"));
+ delete dbs[i];
+ }
+
+ // Verify non-empty DBs can be recovered in parallel
+ open_threads.clear();
+ for (int i = 0; i < kNumDbs; ++i) {
+ open_threads.emplace_back(
+ [&](int dbnum) {
+ ASSERT_OK(DB::Open(options, dbnames[dbnum], &dbs[dbnum]));
+ },
+ i);
+ }
+
+ // Wait and cleanup
+ for (int i = 0; i < kNumDbs; ++i) {
+ open_threads[i].join();
+ delete dbs[i];
+ ASSERT_OK(DestroyDB(dbnames[i], options));
+ }
+}
+
+namespace {
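+// A Statistics implementation that counts calls through the old virtual
+// interface (recordTick() / measureTime()); OldStatsInterface below checks
+// that DB operations still reach these entry points.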
+class DummyOldStats : public Statistics {
+ public:
+ const char* Name() const override { return "DummyOldStats"; }
+ uint64_t getTickerCount(uint32_t /*ticker_type*/) const override { return 0; }
+ void recordTick(uint32_t /* ticker_type */, uint64_t /* count */) override {
+ num_rt++;
+ }
+ void setTickerCount(uint32_t /*ticker_type*/, uint64_t /*count*/) override {}
+ uint64_t getAndResetTickerCount(uint32_t /*ticker_type*/) override {
+ return 0;
+ }
+ void measureTime(uint32_t /*histogram_type*/, uint64_t /*count*/) override {
+ num_mt++;
+ }
+ void histogramData(
+ uint32_t /*histogram_type*/,
+ ROCKSDB_NAMESPACE::HistogramData* const /*data*/) const override {}
+ std::string getHistogramString(uint32_t /*type*/) const override {
+ return "";
+ }
+ bool HistEnabledForType(uint32_t /*type*/) const override { return false; }
+ std::string ToString() const override { return ""; }
+ std::atomic<int> num_rt{0};
+ std::atomic<int> num_mt{0};
+};
+} // anonymous namespace
+
+TEST_F(DBTest2, OldStatsInterface) {
+ DummyOldStats* dos = new DummyOldStats();
+ std::shared_ptr<Statistics> stats(dos);
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.statistics = stats;
+ Reopen(options);
+
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_EQ("bar", Get("foo"));
+ ASSERT_OK(Flush());
+ ASSERT_EQ("bar", Get("foo"));
+
+ ASSERT_GT(dos->num_rt, 0);
+ ASSERT_GT(dos->num_mt, 0);
+}
+
+TEST_F(DBTest2, CloseWithUnreleasedSnapshot) {
+ const Snapshot* ss = db_->GetSnapshot();
+
+ for (auto h : handles_) {
+ db_->DestroyColumnFamilyHandle(h);
+ }
+ handles_.clear();
+
+ ASSERT_NOK(db_->Close());
+ db_->ReleaseSnapshot(ss);
+ ASSERT_OK(db_->Close());
+ delete db_;
+ db_ = nullptr;
+}
+
+TEST_F(DBTest2, PrefixBloomReseek) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.prefix_extractor.reset(NewCappedPrefixTransform(3));
+ BlockBasedTableOptions bbto;
+ bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
+ bbto.whole_key_filtering = false;
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ DestroyAndReopen(options);
+
+ // Construct two L1 files with keys:
+ // f1:[aaa1 ccc1] f2:[ddd0]
+ ASSERT_OK(Put("aaa1", ""));
+ ASSERT_OK(Put("ccc1", ""));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("ddd0", ""));
+ ASSERT_OK(Flush());
+ CompactRangeOptions cro;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kSkip;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+ ASSERT_OK(Put("bbb1", ""));
+
+ Iterator* iter = db_->NewIterator(ReadOptions());
+ ASSERT_OK(iter->status());
+
+  // Seeking into f1, the iterator checks the bloom filter, which causes the
+  // file iterator to be invalidated, so the cursor is placed into f2, with
+  // the next key being "ddd0".
+ iter->Seek("bbb1");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("bbb1", iter->key().ToString());
+
+  // Reseek to ccc1; the L1 iterator needs to go back to f1 and reseek.
+ iter->Seek("ccc1");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("ccc1", iter->key().ToString());
+
+ delete iter;
+}
+
+TEST_F(DBTest2, PrefixBloomFilteredOut) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.prefix_extractor.reset(NewCappedPrefixTransform(3));
+ BlockBasedTableOptions bbto;
+ bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
+ bbto.whole_key_filtering = false;
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ DestroyAndReopen(options);
+
+ // Construct two L1 files with keys:
+ // f1:[aaa1 ccc1] f2:[ddd0]
+ ASSERT_OK(Put("aaa1", ""));
+ ASSERT_OK(Put("ccc1", ""));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("ddd0", ""));
+ ASSERT_OK(Flush());
+ CompactRangeOptions cro;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kSkip;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+ Iterator* iter = db_->NewIterator(ReadOptions());
+ ASSERT_OK(iter->status());
+
+  // The seek key is filtered out by f1's bloom filter.
+  // This is just one of several valid positions following the contract.
+  // Positioning to ccc1 or ddd0 is also valid. This is just to validate
+  // the behavior of the current implementation. If the underlying
+  // implementation changes, the test might fail here.
+ iter->Seek("bbb1");
+ ASSERT_OK(iter->status());
+ ASSERT_FALSE(iter->Valid());
+
+ delete iter;
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest2, RowCacheSnapshot) {
+ Options options = CurrentOptions();
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ options.row_cache = NewLRUCache(8 * 8192);
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put("foo", "bar1"));
+
+ const Snapshot* s1 = db_->GetSnapshot();
+
+ ASSERT_OK(Put("foo", "bar2"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put("foo2", "bar"));
+ const Snapshot* s2 = db_->GetSnapshot();
+ ASSERT_OK(Put("foo3", "bar"));
+ const Snapshot* s3 = db_->GetSnapshot();
+
+ ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 0);
+ ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 0);
+ ASSERT_EQ(Get("foo"), "bar2");
+ ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 0);
+ ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 1);
+ ASSERT_EQ(Get("foo"), "bar2");
+ ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 1);
+ ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 1);
+ ASSERT_EQ(Get("foo", s1), "bar1");
+ ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 1);
+ ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 2);
+ ASSERT_EQ(Get("foo", s2), "bar2");
+ ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 2);
+ ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 2);
+ ASSERT_EQ(Get("foo", s1), "bar1");
+ ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 3);
+ ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 2);
+ ASSERT_EQ(Get("foo", s3), "bar2");
+ ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 4);
+ ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 2);
+
+ db_->ReleaseSnapshot(s1);
+ db_->ReleaseSnapshot(s2);
+ db_->ReleaseSnapshot(s3);
+}
+#endif // ROCKSDB_LITE
+
+// When the DB is reopened with multiple column families, the manifest file
+// is written after the first CF is flushed, and again after each subsequent
+// flush. If the DB crashes between those flushes, the already-flushed CF has
+// advanced past the latest log file, and recovery then requires that log not
+// to be corrupted, otherwise it triggers a corruption report.
+// We need to fix the bug and enable the test.
+TEST_F(DBTest2, CrashInRecoveryMultipleCF) {
+ const std::vector<std::string> sync_points = {
+ "DBImpl::RecoverLogFiles:BeforeFlushFinalMemtable",
+ "VersionSet::ProcessManifestWrites:BeforeWriteLastVersionEdit:0"};
+ for (const auto& test_sync_point : sync_points) {
+ Options options = CurrentOptions();
+ // First destroy original db to ensure a clean start.
+ DestroyAndReopen(options);
+ options.create_if_missing = true;
+ options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;
+ CreateAndReopenWithCF({"pikachu"}, options);
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put(1, "foo", "bar"));
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Put(1, "foo", "bar"));
+ // The value is large enough to be split across two blocks.
+ std::string large_value(400, ' ');
+ ASSERT_OK(Put("foo1", large_value));
+ ASSERT_OK(Put("foo2", large_value));
+ Close();
+
+ // Corrupt the log file in the middle, so that the corruption is not in
+ // the tail.
+ std::vector<std::string> filenames;
+ ASSERT_OK(env_->GetChildren(dbname_, &filenames));
+ for (const auto& f : filenames) {
+ uint64_t number;
+ FileType type;
+ if (ParseFileName(f, &number, &type) && type == FileType::kWalFile) {
+ std::string fname = dbname_ + "/" + f;
+ std::string file_content;
+ ASSERT_OK(ReadFileToString(env_, fname, &file_content));
+ file_content[400] = 'h';
+ file_content[401] = 'a';
+ ASSERT_OK(WriteStringToFile(env_, file_content, fname));
+ break;
+ }
+ }
+
+ // Reopen and freeze the file system after the first manifest write.
+ FaultInjectionTestEnv fit_env(options.env);
+ options.env = &fit_env;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ test_sync_point,
+ [&](void* /*arg*/) { fit_env.SetFilesystemActive(false); });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ ASSERT_NOK(TryReopenWithColumnFamilies(
+ {kDefaultColumnFamilyName, "pikachu"}, options));
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+ fit_env.SetFilesystemActive(true);
+ // If we keep using the fault-injection Env, it complains about something
+ // when renaming the CURRENT file, which is not expected. Need to
+ // investigate why.
+ options.env = env_;
+ ASSERT_OK(TryReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"},
+ options));
+ }
+}
+
+TEST_F(DBTest2, SeekFileRangeDeleteTail) {
+ Options options = CurrentOptions();
+ options.prefix_extractor.reset(NewCappedPrefixTransform(1));
+ options.num_levels = 3;
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put("a", "a"));
+ const Snapshot* s1 = db_->GetSnapshot();
+ ASSERT_OK(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "f"));
+ ASSERT_OK(Put("b", "a"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put("x", "a"));
+ ASSERT_OK(Put("z", "a"));
+ ASSERT_OK(Flush());
+
+ CompactRangeOptions cro;
+ cro.change_level = true;
+ cro.target_level = 2;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+ {
+ ReadOptions ro;
+ ro.total_order_seek = true;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(ro));
+ ASSERT_OK(iter->status());
+ iter->Seek("e");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("x", iter->key().ToString());
+ }
+ db_->ReleaseSnapshot(s1);
+}
+
+TEST_F(DBTest2, BackgroundPurgeTest) {
+ Options options = CurrentOptions();
+ options.write_buffer_manager =
+ std::make_shared<ROCKSDB_NAMESPACE::WriteBufferManager>(1 << 20);
+ options.avoid_unnecessary_blocking_io = true;
+ DestroyAndReopen(options);
+ size_t base_value = options.write_buffer_manager->memory_usage();
+
+ ASSERT_OK(Put("a", "a"));
+ Iterator* iter = db_->NewIterator(ReadOptions());
+ ASSERT_OK(iter->status());
+ ASSERT_OK(Flush());
+ size_t value = options.write_buffer_manager->memory_usage();
+ ASSERT_GT(value, base_value);
+
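+ // With avoid_unnecessary_blocking_io set, deleting the iterator defers the
+ // memtable/superversion cleanup to a background purge on the HIGH-priority
+ // pool; occupy that single thread with a sleeping task so the purge cannot
+ // run yet.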
+ db_->GetEnv()->SetBackgroundThreads(1, Env::Priority::HIGH);
+ test::SleepingBackgroundTask sleeping_task_after;
+ db_->GetEnv()->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
+ &sleeping_task_after, Env::Priority::HIGH);
+ delete iter;
+
+ Env::Default()->SleepForMicroseconds(100000);
+ value = options.write_buffer_manager->memory_usage();
+ ASSERT_GT(value, base_value);
+
+ sleeping_task_after.WakeUp();
+ sleeping_task_after.WaitUntilDone();
+
+ test::SleepingBackgroundTask sleeping_task_after2;
+ db_->GetEnv()->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
+ &sleeping_task_after2, Env::Priority::HIGH);
+ sleeping_task_after2.WakeUp();
+ sleeping_task_after2.WaitUntilDone();
+
+ value = options.write_buffer_manager->memory_usage();
+ ASSERT_EQ(base_value, value);
+}
+
+TEST_F(DBTest2, SwitchMemtableRaceWithNewManifest) {
+ Options options = CurrentOptions();
+ DestroyAndReopen(options);
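+ // A tiny max_manifest_file_size forces a new manifest file to be created on
+ // almost every version edit, so the concurrent default-CF flush can race
+ // with the creation of a new manifest.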
+ options.max_manifest_file_size = 10;
+ options.create_if_missing = true;
+ CreateAndReopenWithCF({"pikachu"}, options);
+ ASSERT_EQ(2, handles_.size());
+
+ ASSERT_OK(Put("foo", "value"));
+ const int kL0Files = options.level0_file_num_compaction_trigger;
+ for (int i = 0; i < kL0Files; ++i) {
+ ASSERT_OK(Put(/*cf=*/1, "a", std::to_string(i)));
+ ASSERT_OK(Flush(/*cf=*/1));
+ }
+
+ port::Thread thread([&]() { ASSERT_OK(Flush()); });
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ thread.join();
+}
+
+TEST_F(DBTest2, SameSmallestInSameLevel) {
+ // This test validates the fractional cascading logic when several files at
+ // one level contain only the same user key.
+ Options options = CurrentOptions();
+ options.merge_operator = MergeOperators::CreateStringAppendOperator();
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put("key", "1"));
+ ASSERT_OK(Put("key", "2"));
+ ASSERT_OK(db_->Merge(WriteOptions(), "key", "3"));
+ ASSERT_OK(db_->Merge(WriteOptions(), "key", "4"));
+ ASSERT_OK(Flush());
+ CompactRangeOptions cro;
+ cro.change_level = true;
+ cro.target_level = 2;
+ ASSERT_OK(dbfull()->CompactRange(cro, db_->DefaultColumnFamily(), nullptr,
+ nullptr));
+
+ ASSERT_OK(db_->Merge(WriteOptions(), "key", "5"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->Merge(WriteOptions(), "key", "6"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->Merge(WriteOptions(), "key", "7"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->Merge(WriteOptions(), "key", "8"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+#ifndef ROCKSDB_LITE
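+ // Expect four L1 files, each containing only the user key "key", plus the
+ // single L2 file created by the earlier CompactRange.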
+ ASSERT_EQ("0,4,1", FilesPerLevel());
+#endif // ROCKSDB_LITE
+
+ ASSERT_EQ("2,3,4,5,6,7,8", Get("key"));
+}
+
+TEST_F(DBTest2, FileConsistencyCheckInOpen) {
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Flush());
+
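+ // Inject a failure into the version builder's consistency check so that
+ // reopening with force_consistency_checks must fail.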
+ SyncPoint::GetInstance()->SetCallBack(
+ "VersionBuilder::CheckConsistencyBeforeReturn", [&](void* arg) {
+ Status* ret_s = static_cast<Status*>(arg);
+ *ret_s = Status::Corruption("fcc");
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ Options options = CurrentOptions();
+ options.force_consistency_checks = true;
+ ASSERT_NOK(TryReopen(options));
+
+ SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBTest2, BlockBasedTablePrefixIndexSeekForPrev) {
+ // create a DB with block prefix index
+ BlockBasedTableOptions table_options;
+ Options options = CurrentOptions();
+ table_options.block_size = 300;
+ table_options.index_type = BlockBasedTableOptions::kHashSearch;
+ table_options.index_shortening =
+ BlockBasedTableOptions::IndexShorteningMode::kNoShortening;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+
+ Reopen(options);
+
+ Random rnd(301);
+ std::string large_value = rnd.RandomString(500);
+
+ ASSERT_OK(Put("a1", large_value));
+ ASSERT_OK(Put("x1", large_value));
+ ASSERT_OK(Put("y1", large_value));
+ ASSERT_OK(Flush());
+
+ {
+ std::unique_ptr<Iterator> iterator(db_->NewIterator(ReadOptions()));
+ ASSERT_OK(iterator->status());
+ iterator->SeekForPrev("x3");
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_EQ("x1", iterator->key().ToString());
+
+ iterator->SeekForPrev("a3");
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_EQ("a1", iterator->key().ToString());
+
+ iterator->SeekForPrev("y3");
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_EQ("y1", iterator->key().ToString());
+
+ // Query more than one non-existing prefix to cover both the empty hash
+ // bucket case and the hash bucket conflict case.
+ iterator->SeekForPrev("b1");
+ // Result should be not valid or "a1".
+ if (iterator->Valid()) {
+ ASSERT_EQ("a1", iterator->key().ToString());
+ }
+
+ iterator->SeekForPrev("c1");
+ // Result should be not valid or "a1".
+ if (iterator->Valid()) {
+ ASSERT_EQ("a1", iterator->key().ToString());
+ }
+
+ iterator->SeekForPrev("d1");
+ // Result should be not valid or "a1".
+ if (iterator->Valid()) {
+ ASSERT_EQ("a1", iterator->key().ToString());
+ }
+
+ iterator->SeekForPrev("y3");
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_EQ("y1", iterator->key().ToString());
+ }
+}
+
+TEST_F(DBTest2, PartitionedIndexPrefetchFailure) {
+ Options options = last_options_;
+ options.env = env_;
+ options.max_open_files = 20;
+ BlockBasedTableOptions bbto;
+ bbto.index_type = BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+ bbto.metadata_block_size = 128;
+ bbto.block_size = 128;
+ bbto.block_cache = NewLRUCache(16777216);
+ bbto.cache_index_and_filter_blocks = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ DestroyAndReopen(options);
+
+ // Force no table cache so every read will preload the SST file.
+ dbfull()->TEST_table_cache()->SetCapacity(0);
+ bbto.block_cache->SetCapacity(0);
+
+ Random rnd(301);
+ for (int i = 0; i < 4096; i++) {
+ ASSERT_OK(Put(Key(i), rnd.RandomString(32)));
+ }
+ ASSERT_OK(Flush());
+
+ // Try different random failures in table open for 300 times.
+ for (int i = 0; i < 300; i++) {
+ env_->num_reads_fails_ = 0;
+ env_->rand_reads_fail_odd_ = 8;
+
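+ // The test Env now fails roughly one in eight random reads, so Get() is
+ // expected to fail exactly when a read failure was injected.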
+ std::string value;
+ Status s = dbfull()->Get(ReadOptions(), Key(1), &value);
+ if (env_->num_reads_fails_ > 0) {
+ ASSERT_NOK(s);
+ } else {
+ ASSERT_OK(s);
+ }
+ }
+
+ env_->rand_reads_fail_odd_ = 0;
+}
+
+TEST_F(DBTest2, ChangePrefixExtractor) {
+ for (bool use_partitioned_filter : {true, false}) {
+ // create a DB with block prefix index
+ BlockBasedTableOptions table_options;
+ Options options = CurrentOptions();
+
+ // Sometimes the filter is checked based on the upper bound; assert the
+ // counters for that case. Otherwise, only check data correctness.
+#ifndef ROCKSDB_LITE
+ bool expect_filter_check = !use_partitioned_filter;
+#else
+ bool expect_filter_check = false;
+#endif
+ table_options.partition_filters = use_partitioned_filter;
+ if (use_partitioned_filter) {
+ table_options.index_type =
+ BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+ }
+ table_options.filter_policy.reset(NewBloomFilterPolicy(10, false));
+
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ options.statistics = CreateDBStatistics();
+
+ options.prefix_extractor.reset(NewFixedPrefixTransform(2));
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+
+ ASSERT_OK(Put("aa", ""));
+ ASSERT_OK(Put("xb", ""));
+ ASSERT_OK(Put("xx1", ""));
+ ASSERT_OK(Put("xz1", ""));
+ ASSERT_OK(Put("zz", ""));
+ ASSERT_OK(Flush());
+
+ // After reopening the DB with the prefix size changed from 2 to 1, the
+ // prefix extractor won't take effect unless using it cannot change the
+ // result, given the upper bound and the seek key.
+ options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+ Reopen(options);
+
+ {
+ std::unique_ptr<Iterator> iterator(db_->NewIterator(ReadOptions()));
+ ASSERT_OK(iterator->status());
+ iterator->Seek("xa");
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_EQ("xb", iterator->key().ToString());
+ // It's a bug that the counter BLOOM_FILTER_PREFIX_CHECKED is not correct
+ // in the partitioned-filter case, so counters are only checked when
+ // expect_filter_check is true.
+ if (expect_filter_check) {
+ ASSERT_EQ(0, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
+ }
+
+ iterator->Seek("xz");
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_EQ("xz1", iterator->key().ToString());
+ if (expect_filter_check) {
+ ASSERT_EQ(0, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
+ }
+ }
+
+ std::string ub_str = "xg9";
+ Slice ub(ub_str);
+ ReadOptions ro;
+ ro.iterate_upper_bound = &ub;
+
+ {
+ std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
+ ASSERT_OK(iterator->status());
+
+ // SeekForPrev() never uses prefix bloom if it is changed.
+ iterator->SeekForPrev("xg0");
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_EQ("xb", iterator->key().ToString());
+ if (expect_filter_check) {
+ ASSERT_EQ(0, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
+ }
+ }
+
+ ub_str = "xx9";
+ ub = Slice(ub_str);
+ {
+ std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
+ ASSERT_OK(iterator->status());
+
+ iterator->Seek("x");
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_EQ("xb", iterator->key().ToString());
+ if (expect_filter_check) {
+ ASSERT_EQ(0, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
+ }
+
+ iterator->Seek("xx0");
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_EQ("xx1", iterator->key().ToString());
+ if (expect_filter_check) {
+ ASSERT_EQ(1, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
+ }
+ }
+
+ CompactRangeOptions compact_range_opts;
+ compact_range_opts.bottommost_level_compaction =
+ BottommostLevelCompaction::kForce;
+ ASSERT_OK(db_->CompactRange(compact_range_opts, nullptr, nullptr));
+ ASSERT_OK(db_->CompactRange(compact_range_opts, nullptr, nullptr));
+
+ // Re-execute similar queries after a full compaction
+ {
+ std::unique_ptr<Iterator> iterator(db_->NewIterator(ReadOptions()));
+
+ iterator->Seek("x");
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_EQ("xb", iterator->key().ToString());
+ if (expect_filter_check) {
+ ASSERT_EQ(2, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
+ }
+
+ iterator->Seek("xg");
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_EQ("xx1", iterator->key().ToString());
+ if (expect_filter_check) {
+ ASSERT_EQ(3, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
+ }
+
+ iterator->Seek("xz");
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_EQ("xz1", iterator->key().ToString());
+ if (expect_filter_check) {
+ ASSERT_EQ(4, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
+ }
+
+ ASSERT_OK(iterator->status());
+ }
+ {
+ std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
+
+ iterator->SeekForPrev("xx0");
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_EQ("xb", iterator->key().ToString());
+ if (expect_filter_check) {
+ ASSERT_EQ(5, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
+ }
+
+ iterator->Seek("xx0");
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_EQ("xx1", iterator->key().ToString());
+ if (expect_filter_check) {
+ ASSERT_EQ(6, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
+ }
+
+ ASSERT_OK(iterator->status());
+ }
+
+ ub_str = "xg9";
+ ub = Slice(ub_str);
+ {
+ std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
+ iterator->SeekForPrev("xg0");
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_EQ("xb", iterator->key().ToString());
+ if (expect_filter_check) {
+ ASSERT_EQ(7, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
+ }
+ ASSERT_OK(iterator->status());
+ }
+ }
+}
+
+TEST_F(DBTest2, BlockBasedTablePrefixGetIndexNotFound) {
+ // create a DB with block prefix index
+ BlockBasedTableOptions table_options;
+ Options options = CurrentOptions();
+ table_options.block_size = 300;
+ table_options.index_type = BlockBasedTableOptions::kHashSearch;
+ table_options.index_shortening =
+ BlockBasedTableOptions::IndexShorteningMode::kNoShortening;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+ options.level0_file_num_compaction_trigger = 8;
+
+ Reopen(options);
+
+ ASSERT_OK(Put("b1", "ok"));
+ ASSERT_OK(Flush());
+
+ // Flush several files so that the chance that the hash bucket for "b" is
+ // empty in at least one of the files is high.
+ ASSERT_OK(Put("a1", ""));
+ ASSERT_OK(Put("c1", ""));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put("a2", ""));
+ ASSERT_OK(Put("c2", ""));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put("a3", ""));
+ ASSERT_OK(Put("c3", ""));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put("a4", ""));
+ ASSERT_OK(Put("c4", ""));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put("a5", ""));
+ ASSERT_OK(Put("c5", ""));
+ ASSERT_OK(Flush());
+
+ ASSERT_EQ("ok", Get("b1"));
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest2, AutoPrefixMode1) {
+ do {
+ // create a DB with block prefix index
+ Options options = CurrentOptions();
+ BlockBasedTableOptions table_options =
+ *options.table_factory->GetOptions<BlockBasedTableOptions>();
+ table_options.filter_policy.reset(NewBloomFilterPolicy(10, false));
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+ options.statistics = CreateDBStatistics();
+
+ Reopen(options);
+
+ Random rnd(301);
+ std::string large_value = rnd.RandomString(500);
+
+ ASSERT_OK(Put("a1", large_value));
+ ASSERT_OK(Put("x1", large_value));
+ ASSERT_OK(Put("y1", large_value));
+ ASSERT_OK(Flush());
+
+ ReadOptions ro;
+ ro.total_order_seek = false;
+ ro.auto_prefix_mode = true;
+
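+ // auto_prefix_mode lets the iterator use the prefix bloom filter only when
+ // doing so cannot change the result, given the iterate_upper_bound.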
+ const auto stat = BLOOM_FILTER_PREFIX_CHECKED;
+ {
+ std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
+ iterator->Seek("b1");
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_EQ("x1", iterator->key().ToString());
+ EXPECT_EQ(0, TestGetAndResetTickerCount(options, stat));
+ ASSERT_OK(iterator->status());
+ }
+
+ Slice ub;
+ ro.iterate_upper_bound = &ub;
+
+ ub = "b9";
+ {
+ std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
+ iterator->Seek("b1");
+ ASSERT_FALSE(iterator->Valid());
+ EXPECT_EQ(1, TestGetAndResetTickerCount(options, stat));
+ ASSERT_OK(iterator->status());
+ }
+
+ ub = "z";
+ {
+ std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
+ iterator->Seek("b1");
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_EQ("x1", iterator->key().ToString());
+ EXPECT_EQ(0, TestGetAndResetTickerCount(options, stat));
+ ASSERT_OK(iterator->status());
+ }
+
+ ub = "c";
+ {
+ std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
+ iterator->Seek("b1");
+ ASSERT_FALSE(iterator->Valid());
+ EXPECT_EQ(1, TestGetAndResetTickerCount(options, stat));
+ ASSERT_OK(iterator->status());
+ }
+
+ ub = "c1";
+ {
+ std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
+ iterator->Seek("b1");
+ ASSERT_FALSE(iterator->Valid());
+ EXPECT_EQ(0, TestGetAndResetTickerCount(options, stat));
+ ASSERT_OK(iterator->status());
+ }
+
+ // The same queries without recreating iterator
+ {
+ std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
+
+ ub = "b9";
+ iterator->Seek("b1");
+ ASSERT_FALSE(iterator->Valid());
+ EXPECT_EQ(1, TestGetAndResetTickerCount(options, stat));
+ ASSERT_OK(iterator->status());
+
+ ub = "z";
+ iterator->Seek("b1");
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_EQ("x1", iterator->key().ToString());
+ EXPECT_EQ(0, TestGetAndResetTickerCount(options, stat));
+
+ ub = "c";
+ iterator->Seek("b1");
+ ASSERT_FALSE(iterator->Valid());
+ EXPECT_EQ(1, TestGetAndResetTickerCount(options, stat));
+
+ ub = "b9";
+ iterator->SeekForPrev("b1");
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_EQ("a1", iterator->key().ToString());
+ EXPECT_EQ(0, TestGetAndResetTickerCount(options, stat));
+
+ ub = "zz";
+ iterator->SeekToLast();
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_EQ("y1", iterator->key().ToString());
+
+ iterator->SeekToFirst();
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_EQ("a1", iterator->key().ToString());
+ }
+
+ // Similar, now with reverse comparator
+ // Technically, we are violating axiom 2 of prefix_extractors, but
+ // it should be revised because of major use-cases using
+ // ReverseBytewiseComparator with capped/fixed prefix Seek. (FIXME)
+ options.comparator = ReverseBytewiseComparator();
+ options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put("a1", large_value));
+ ASSERT_OK(Put("x1", large_value));
+ ASSERT_OK(Put("y1", large_value));
+ ASSERT_OK(Flush());
+
+ {
+ std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
+
+ ub = "b1";
+ iterator->Seek("b9");
+ ASSERT_FALSE(iterator->Valid());
+ EXPECT_EQ(1, TestGetAndResetTickerCount(options, stat));
+ ASSERT_OK(iterator->status());
+
+ ub = "b1";
+ iterator->Seek("z");
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_EQ("y1", iterator->key().ToString());
+ EXPECT_EQ(0, TestGetAndResetTickerCount(options, stat));
+
+ ub = "b1";
+ iterator->Seek("c");
+ ASSERT_FALSE(iterator->Valid());
+ EXPECT_EQ(0, TestGetAndResetTickerCount(options, stat));
+
+ ub = "b";
+ iterator->Seek("c9");
+ ASSERT_FALSE(iterator->Valid());
+ // Fails if ReverseBytewiseComparator::IsSameLengthImmediateSuccessor
+ // is "correctly" implemented.
+ EXPECT_EQ(0, TestGetAndResetTickerCount(options, stat));
+
+ ub = "a";
+ iterator->Seek("b9");
+ // Fails if ReverseBytewiseComparator::IsSameLengthImmediateSuccessor
+ // is "correctly" implemented.
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_EQ("a1", iterator->key().ToString());
+ EXPECT_EQ(0, TestGetAndResetTickerCount(options, stat));
+
+ ub = "b";
+ iterator->Seek("a");
+ ASSERT_FALSE(iterator->Valid());
+ // Fails if ReverseBytewiseComparator::IsSameLengthImmediateSuccessor
+ // matches BytewiseComparator::IsSameLengthImmediateSuccessor. The upper
+ // bound comparing before the seek key prevents a real bug from surfacing.
+ EXPECT_EQ(0, TestGetAndResetTickerCount(options, stat));
+
+ ub = "b1";
+ iterator->SeekForPrev("b9");
+ ASSERT_TRUE(iterator->Valid());
+ // Fails if ReverseBytewiseComparator::IsSameLengthImmediateSuccessor
+ // is "correctly" implemented.
+ ASSERT_EQ("x1", iterator->key().ToString());
+ EXPECT_EQ(0, TestGetAndResetTickerCount(options, stat));
+
+ ub = "a";
+ iterator->SeekToLast();
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_EQ("a1", iterator->key().ToString());
+
+ iterator->SeekToFirst();
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_EQ("y1", iterator->key().ToString());
+ }
+
+ // Now something a bit different, related to "short" keys that
+ // auto_prefix_mode can omit. See "BUG" section of auto_prefix_mode.
+ options.comparator = BytewiseComparator();
+ for (const auto config : {"fixed:2", "capped:2"}) {
+ ASSERT_OK(SliceTransform::CreateFromString(ConfigOptions(), config,
+ &options.prefix_extractor));
+
+ // FIXME: kHashSearch, etc. requires all keys be InDomain
+ if (StartsWith(config, "fixed") &&
+ (table_options.index_type == BlockBasedTableOptions::kHashSearch ||
+ StartsWith(options.memtable_factory->Name(), "Hash"))) {
+ continue;
+ }
+ DestroyAndReopen(options);
+
+ const char* a_end_stuff = "a\xffXYZ";
+ const char* b_begin_stuff = "b\x00XYZ";
+ ASSERT_OK(Put("a", large_value));
+ ASSERT_OK(Put("b", large_value));
+ ASSERT_OK(Put(Slice(b_begin_stuff, 3), large_value));
+ ASSERT_OK(Put("c", large_value));
+ ASSERT_OK(Flush());
+
+ // Control case showing a valid optimization with auto_prefix_mode
+ ub = Slice(a_end_stuff, 4);
+ ro.iterate_upper_bound = &ub;
+
+ std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
+ iterator->Seek(Slice(a_end_stuff, 2));
+ ASSERT_FALSE(iterator->Valid());
+ EXPECT_EQ(1, TestGetAndResetTickerCount(options, stat));
+ ASSERT_OK(iterator->status());
+
+ // Test case that cannot be validly optimized with auto_prefix_mode
+ ub = Slice(b_begin_stuff, 2);
+ ro.iterate_upper_bound = &ub;
+
+ iterator->Seek(Slice(a_end_stuff, 2));
+ // !!! BUG !!! See "BUG" section of auto_prefix_mode.
+ ASSERT_FALSE(iterator->Valid());
+ EXPECT_EQ(1, TestGetAndResetTickerCount(options, stat));
+ ASSERT_OK(iterator->status());
+
+ // To prove that is the wrong result, now use total order seek
+ ReadOptions tos_ro = ro;
+ tos_ro.total_order_seek = true;
+ tos_ro.auto_prefix_mode = false;
+ iterator.reset(db_->NewIterator(tos_ro));
+ iterator->Seek(Slice(a_end_stuff, 2));
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_EQ("b", iterator->key().ToString());
+ EXPECT_EQ(0, TestGetAndResetTickerCount(options, stat));
+ ASSERT_OK(iterator->status());
+ }
+ } while (ChangeOptions(kSkipPlainTable));
+}
+
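+// Simulates failures while renaming the CURRENT file (as can happen on a
+// distributed file system) by injecting an IO error either before or after
+// the rename, depending on the test parameter.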
+class RenameCurrentTest : public DBTestBase,
+ public testing::WithParamInterface<std::string> {
+ public:
+ RenameCurrentTest()
+ : DBTestBase("rename_current_test", /*env_do_fsync=*/true),
+ sync_point_(GetParam()) {}
+
+ ~RenameCurrentTest() override {}
+
+ void SetUp() override {
+ env_->no_file_overwrite_.store(true, std::memory_order_release);
+ }
+
+ void TearDown() override {
+ env_->no_file_overwrite_.store(false, std::memory_order_release);
+ }
+
+ void SetupSyncPoints() {
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->SetCallBack(sync_point_, [&](void* arg) {
+ Status* s = reinterpret_cast<Status*>(arg);
+ assert(s);
+ *s = Status::IOError("Injected IO error.");
+ });
+ }
+
+ const std::string sync_point_;
+};
+
+INSTANTIATE_TEST_CASE_P(DistributedFS, RenameCurrentTest,
+ ::testing::Values("SetCurrentFile:BeforeRename",
+ "SetCurrentFile:AfterRename"));
+
+TEST_P(RenameCurrentTest, Open) {
+ Destroy(last_options_);
+ Options options = GetDefaultOptions();
+ options.create_if_missing = true;
+ SetupSyncPoints();
+ SyncPoint::GetInstance()->EnableProcessing();
+ Status s = TryReopen(options);
+ ASSERT_NOK(s);
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ Reopen(options);
+}
+
+TEST_P(RenameCurrentTest, Flush) {
+ Destroy(last_options_);
+ Options options = GetDefaultOptions();
+ options.max_manifest_file_size = 1;
+ options.create_if_missing = true;
+ Reopen(options);
+ ASSERT_OK(Put("key", "value"));
+ SetupSyncPoints();
+ SyncPoint::GetInstance()->EnableProcessing();
+ ASSERT_NOK(Flush());
+
+ ASSERT_NOK(Put("foo", "value"));
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ Reopen(options);
+ ASSERT_EQ("value", Get("key"));
+ ASSERT_EQ("NOT_FOUND", Get("foo"));
+}
+
+TEST_P(RenameCurrentTest, Compaction) {
+ Destroy(last_options_);
+ Options options = GetDefaultOptions();
+ options.max_manifest_file_size = 1;
+ options.create_if_missing = true;
+ Reopen(options);
+ ASSERT_OK(Put("a", "a_value"));
+ ASSERT_OK(Put("c", "c_value"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put("b", "b_value"));
+ ASSERT_OK(Put("d", "d_value"));
+ ASSERT_OK(Flush());
+
+ SetupSyncPoints();
+ SyncPoint::GetInstance()->EnableProcessing();
+ ASSERT_NOK(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr,
+ /*end=*/nullptr));
+
+ ASSERT_NOK(Put("foo", "value"));
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ Reopen(options);
+ ASSERT_EQ("NOT_FOUND", Get("foo"));
+ ASSERT_EQ("d_value", Get("d"));
+}
+
+TEST_F(DBTest2, LastLevelTemperature) {
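+ // Listener that records the temperature reported for each table file by
+ // file IO events, for comparison against the column family metadata.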
+ class TestListener : public EventListener {
+ public:
+ void OnFileReadFinish(const FileOperationInfo& info) override {
+ UpdateFileTemperature(info);
+ }
+
+ void OnFileWriteFinish(const FileOperationInfo& info) override {
+ UpdateFileTemperature(info);
+ }
+
+ void OnFileFlushFinish(const FileOperationInfo& info) override {
+ UpdateFileTemperature(info);
+ }
+
+ void OnFileSyncFinish(const FileOperationInfo& info) override {
+ UpdateFileTemperature(info);
+ }
+
+ void OnFileCloseFinish(const FileOperationInfo& info) override {
+ UpdateFileTemperature(info);
+ }
+
+ bool ShouldBeNotifiedOnFileIO() override { return true; }
+
+ std::unordered_map<uint64_t, Temperature> file_temperatures;
+
+ private:
+ void UpdateFileTemperature(const FileOperationInfo& info) {
+ auto filename = GetFileName(info.path);
+ uint64_t number;
+ FileType type;
+ ASSERT_TRUE(ParseFileName(filename, &number, &type));
+ if (type == kTableFile) {
+ MutexLock l(&mutex_);
+ auto ret = file_temperatures.insert({number, info.temperature});
+ if (!ret.second) {
+ // the temperature of a given file should be the same across all events
+ ASSERT_TRUE(ret.first->second == info.temperature);
+ }
+ }
+ }
+
+ std::string GetFileName(const std::string& fname) {
+ auto filename = fname.substr(fname.find_last_of(kFilePathSeparator) + 1);
+ // Workaround for Windows only: the file path could contain both the
+ // Windows FilePathSeparator and '/'
+ filename = filename.substr(filename.find_last_of('/') + 1);
+ return filename;
+ }
+
+ port::Mutex mutex_;
+ };
+
+ const int kNumLevels = 7;
+ const int kLastLevel = kNumLevels - 1;
+
+ auto* listener = new TestListener();
+
+ Options options = CurrentOptions();
+ options.bottommost_temperature = Temperature::kWarm;
+ options.level0_file_num_compaction_trigger = 2;
+ options.level_compaction_dynamic_level_bytes = true;
+ options.num_levels = kNumLevels;
+ options.statistics = CreateDBStatistics();
+ options.listeners.emplace_back(listener);
+ Reopen(options);
+
+ auto size = GetSstSizeHelper(Temperature::kUnknown);
+ ASSERT_EQ(size, 0);
+ size = GetSstSizeHelper(Temperature::kWarm);
+ ASSERT_EQ(size, 0);
+ size = GetSstSizeHelper(Temperature::kHot);
+ ASSERT_EQ(size, 0);
+
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Put("bar", "bar"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Put("bar", "bar"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ get_iostats_context()->Reset();
+ IOStatsContext* iostats = get_iostats_context();
+
+ ColumnFamilyMetaData metadata;
+ db_->GetColumnFamilyMetaData(&metadata);
+ ASSERT_EQ(1, metadata.file_count);
+ SstFileMetaData meta = metadata.levels[kLastLevel].files[0];
+ ASSERT_EQ(Temperature::kWarm, meta.temperature);
+ uint64_t number;
+ FileType type;
+ ASSERT_TRUE(ParseFileName(meta.name, &number, &type));
+ ASSERT_EQ(listener->file_temperatures.at(number), meta.temperature);
+
+ size = GetSstSizeHelper(Temperature::kUnknown);
+ ASSERT_EQ(size, 0);
+ size = GetSstSizeHelper(Temperature::kWarm);
+ ASSERT_GT(size, 0);
+ ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_read_count, 0);
+ ASSERT_EQ(iostats->file_io_stats_by_temperature.warm_file_read_count, 0);
+ ASSERT_EQ(iostats->file_io_stats_by_temperature.cold_file_read_count, 0);
+ ASSERT_EQ(options.statistics->getTickerCount(HOT_FILE_READ_BYTES), 0);
+ ASSERT_GT(options.statistics->getTickerCount(WARM_FILE_READ_BYTES), 0);
+ ASSERT_EQ(options.statistics->getTickerCount(COLD_FILE_READ_BYTES), 0);
+
+ ASSERT_EQ("bar", Get("foo"));
+
+ ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_read_count, 0);
+ ASSERT_EQ(iostats->file_io_stats_by_temperature.warm_file_read_count, 1);
+ ASSERT_EQ(iostats->file_io_stats_by_temperature.cold_file_read_count, 0);
+ ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_bytes_read, 0);
+ ASSERT_GT(iostats->file_io_stats_by_temperature.warm_file_bytes_read, 0);
+ ASSERT_EQ(iostats->file_io_stats_by_temperature.cold_file_bytes_read, 0);
+ ASSERT_EQ(options.statistics->getTickerCount(HOT_FILE_READ_BYTES), 0);
+ ASSERT_GT(options.statistics->getTickerCount(WARM_FILE_READ_BYTES), 0);
+ ASSERT_EQ(options.statistics->getTickerCount(COLD_FILE_READ_BYTES), 0);
+ ASSERT_EQ(options.statistics->getTickerCount(HOT_FILE_READ_COUNT), 0);
+ ASSERT_GT(options.statistics->getTickerCount(WARM_FILE_READ_COUNT), 0);
+ ASSERT_EQ(options.statistics->getTickerCount(COLD_FILE_READ_COUNT), 0);
+
+ // non-bottommost file still has unknown temperature
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Put("bar", "bar"));
+ ASSERT_OK(Flush());
+ ASSERT_EQ("bar", Get("bar"));
+ ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_read_count, 0);
+ ASSERT_EQ(iostats->file_io_stats_by_temperature.warm_file_read_count, 1);
+ ASSERT_EQ(iostats->file_io_stats_by_temperature.cold_file_read_count, 0);
+ ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_bytes_read, 0);
+ ASSERT_GT(iostats->file_io_stats_by_temperature.warm_file_bytes_read, 0);
+ ASSERT_EQ(iostats->file_io_stats_by_temperature.cold_file_bytes_read, 0);
+ ASSERT_EQ(options.statistics->getTickerCount(HOT_FILE_READ_BYTES), 0);
+ ASSERT_GT(options.statistics->getTickerCount(WARM_FILE_READ_BYTES), 0);
+ ASSERT_EQ(options.statistics->getTickerCount(COLD_FILE_READ_BYTES), 0);
+ ASSERT_EQ(options.statistics->getTickerCount(HOT_FILE_READ_COUNT), 0);
+ ASSERT_GT(options.statistics->getTickerCount(WARM_FILE_READ_COUNT), 0);
+ ASSERT_EQ(options.statistics->getTickerCount(COLD_FILE_READ_COUNT), 0);
+
+ db_->GetColumnFamilyMetaData(&metadata);
+ ASSERT_EQ(2, metadata.file_count);
+ meta = metadata.levels[0].files[0];
+ ASSERT_EQ(Temperature::kUnknown, meta.temperature);
+ ASSERT_TRUE(ParseFileName(meta.name, &number, &type));
+ ASSERT_EQ(listener->file_temperatures.at(number), meta.temperature);
+
+ meta = metadata.levels[kLastLevel].files[0];
+ ASSERT_EQ(Temperature::kWarm, meta.temperature);
+ ASSERT_TRUE(ParseFileName(meta.name, &number, &type));
+ ASSERT_EQ(listener->file_temperatures.at(number), meta.temperature);
+
+ size = GetSstSizeHelper(Temperature::kUnknown);
+ ASSERT_GT(size, 0);
+ size = GetSstSizeHelper(Temperature::kWarm);
+ ASSERT_GT(size, 0);
+
+ // reopen and check the information is persisted
+ Reopen(options);
+ db_->GetColumnFamilyMetaData(&metadata);
+ ASSERT_EQ(2, metadata.file_count);
+ meta = metadata.levels[0].files[0];
+ ASSERT_EQ(Temperature::kUnknown, meta.temperature);
+ ASSERT_TRUE(ParseFileName(meta.name, &number, &type));
+ ASSERT_EQ(listener->file_temperatures.at(number), meta.temperature);
+
+ meta = metadata.levels[kLastLevel].files[0];
+ ASSERT_EQ(Temperature::kWarm, meta.temperature);
+ ASSERT_TRUE(ParseFileName(meta.name, &number, &type));
+ ASSERT_EQ(listener->file_temperatures.at(number), meta.temperature);
+ size = GetSstSizeHelper(Temperature::kUnknown);
+ ASSERT_GT(size, 0);
+ size = GetSstSizeHelper(Temperature::kWarm);
+ ASSERT_GT(size, 0);
+
+ // check other non-exist temperatures
+ size = GetSstSizeHelper(Temperature::kHot);
+ ASSERT_EQ(size, 0);
+ size = GetSstSizeHelper(Temperature::kCold);
+ ASSERT_EQ(size, 0);
+ std::string prop;
+ ASSERT_TRUE(dbfull()->GetProperty(
+ DB::Properties::kLiveSstFilesSizeAtTemperature + std::to_string(22),
+ &prop));
+ ASSERT_EQ(std::atoi(prop.c_str()), 0);
+
+ Reopen(options);
+ db_->GetColumnFamilyMetaData(&metadata);
+ ASSERT_EQ(2, metadata.file_count);
+ meta = metadata.levels[0].files[0];
+ ASSERT_EQ(Temperature::kUnknown, meta.temperature);
+ ASSERT_TRUE(ParseFileName(meta.name, &number, &type));
+ ASSERT_EQ(listener->file_temperatures.at(number), meta.temperature);
+
+ meta = metadata.levels[kLastLevel].files[0];
+ ASSERT_EQ(Temperature::kWarm, meta.temperature);
+ ASSERT_TRUE(ParseFileName(meta.name, &number, &type));
+ ASSERT_EQ(listener->file_temperatures.at(number), meta.temperature);
+}
+
+TEST_F(DBTest2, LastLevelTemperatureUniversal) {
+ const int kTriggerNum = 3;
+ const int kNumLevels = 5;
+ const int kBottommostLevel = kNumLevels - 1;
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.level0_file_num_compaction_trigger = kTriggerNum;
+ options.num_levels = kNumLevels;
+ options.statistics = CreateDBStatistics();
+ DestroyAndReopen(options);
+
+ auto size = GetSstSizeHelper(Temperature::kUnknown);
+ ASSERT_EQ(size, 0);
+ size = GetSstSizeHelper(Temperature::kWarm);
+ ASSERT_EQ(size, 0);
+ size = GetSstSizeHelper(Temperature::kHot);
+ ASSERT_EQ(size, 0);
+ get_iostats_context()->Reset();
+ IOStatsContext* iostats = get_iostats_context();
+
+ for (int i = 0; i < kTriggerNum; i++) {
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Put("bar", "bar"));
+ ASSERT_OK(Flush());
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ ColumnFamilyMetaData metadata;
+ db_->GetColumnFamilyMetaData(&metadata);
+ ASSERT_EQ(1, metadata.file_count);
+ ASSERT_EQ(Temperature::kUnknown,
+ metadata.levels[kBottommostLevel].files[0].temperature);
+ size = GetSstSizeHelper(Temperature::kUnknown);
+ ASSERT_GT(size, 0);
+ size = GetSstSizeHelper(Temperature::kWarm);
+ ASSERT_EQ(size, 0);
+ ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_read_count, 0);
+ ASSERT_EQ(iostats->file_io_stats_by_temperature.warm_file_read_count, 0);
+ ASSERT_EQ(iostats->file_io_stats_by_temperature.cold_file_read_count, 0);
+ ASSERT_EQ(options.statistics->getTickerCount(HOT_FILE_READ_BYTES), 0);
+ ASSERT_EQ(options.statistics->getTickerCount(WARM_FILE_READ_BYTES), 0);
+ ASSERT_EQ(options.statistics->getTickerCount(COLD_FILE_READ_BYTES), 0);
+ ASSERT_EQ(options.statistics->getTickerCount(HOT_FILE_READ_COUNT), 0);
+ ASSERT_EQ(options.statistics->getTickerCount(WARM_FILE_READ_COUNT), 0);
+ ASSERT_EQ(options.statistics->getTickerCount(COLD_FILE_READ_COUNT), 0);
+ ASSERT_EQ("bar", Get("foo"));
+
+ ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_read_count, 0);
+ ASSERT_EQ(iostats->file_io_stats_by_temperature.warm_file_read_count, 0);
+ ASSERT_EQ(iostats->file_io_stats_by_temperature.cold_file_read_count, 0);
+ ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_bytes_read, 0);
+ ASSERT_EQ(iostats->file_io_stats_by_temperature.warm_file_bytes_read, 0);
+ ASSERT_EQ(iostats->file_io_stats_by_temperature.cold_file_bytes_read, 0);
+ ASSERT_EQ(options.statistics->getTickerCount(HOT_FILE_READ_BYTES), 0);
+ ASSERT_EQ(options.statistics->getTickerCount(WARM_FILE_READ_BYTES), 0);
+ ASSERT_EQ(options.statistics->getTickerCount(COLD_FILE_READ_BYTES), 0);
+ ASSERT_EQ(options.statistics->getTickerCount(HOT_FILE_READ_COUNT), 0);
+ ASSERT_EQ(options.statistics->getTickerCount(WARM_FILE_READ_COUNT), 0);
+ ASSERT_EQ(options.statistics->getTickerCount(COLD_FILE_READ_COUNT), 0);
+
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Put("bar", "bar"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ db_->GetColumnFamilyMetaData(&metadata);
+ ASSERT_EQ(2, metadata.file_count);
+ ASSERT_EQ(Temperature::kUnknown, metadata.levels[0].files[0].temperature);
+ size = GetSstSizeHelper(Temperature::kUnknown);
+ ASSERT_GT(size, 0);
+ size = GetSstSizeHelper(Temperature::kWarm);
+ ASSERT_EQ(size, 0);
+
+ // Update bottommost temperature
+ options.bottommost_temperature = Temperature::kWarm;
+ Reopen(options);
+ db_->GetColumnFamilyMetaData(&metadata);
+ // Should not impact existing ones
+ ASSERT_EQ(Temperature::kUnknown,
+ metadata.levels[kBottommostLevel].files[0].temperature);
+ size = GetSstSizeHelper(Temperature::kUnknown);
+ ASSERT_GT(size, 0);
+ size = GetSstSizeHelper(Temperature::kWarm);
+ ASSERT_EQ(size, 0);
+
+ // new generated file should have the new settings
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ db_->GetColumnFamilyMetaData(&metadata);
+ ASSERT_EQ(1, metadata.file_count);
+ ASSERT_EQ(Temperature::kWarm,
+ metadata.levels[kBottommostLevel].files[0].temperature);
+ size = GetSstSizeHelper(Temperature::kUnknown);
+ ASSERT_EQ(size, 0);
+ size = GetSstSizeHelper(Temperature::kWarm);
+ ASSERT_GT(size, 0);
+ ASSERT_EQ(options.statistics->getTickerCount(HOT_FILE_READ_BYTES), 0);
+ ASSERT_GT(options.statistics->getTickerCount(WARM_FILE_READ_BYTES), 0);
+ ASSERT_EQ(options.statistics->getTickerCount(COLD_FILE_READ_BYTES), 0);
+ ASSERT_EQ(options.statistics->getTickerCount(HOT_FILE_READ_COUNT), 0);
+ ASSERT_GT(options.statistics->getTickerCount(WARM_FILE_READ_COUNT), 0);
+ ASSERT_EQ(options.statistics->getTickerCount(COLD_FILE_READ_COUNT), 0);
+
+ // non-bottommost file still has unknown temperature
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Put("bar", "bar"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ db_->GetColumnFamilyMetaData(&metadata);
+ ASSERT_EQ(2, metadata.file_count);
+ ASSERT_EQ(Temperature::kUnknown, metadata.levels[0].files[0].temperature);
+ size = GetSstSizeHelper(Temperature::kUnknown);
+ ASSERT_GT(size, 0);
+ size = GetSstSizeHelper(Temperature::kWarm);
+ ASSERT_GT(size, 0);
+
+ // check other non-exist temperatures
+ size = GetSstSizeHelper(Temperature::kHot);
+ ASSERT_EQ(size, 0);
+ size = GetSstSizeHelper(Temperature::kCold);
+ ASSERT_EQ(size, 0);
+ std::string prop;
+ ASSERT_TRUE(dbfull()->GetProperty(
+ DB::Properties::kLiveSstFilesSizeAtTemperature + std::to_string(22),
+ &prop));
+ ASSERT_EQ(std::atoi(prop.c_str()), 0);
+
+ // Update bottommost temperature dynamically with SetOptions
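+ // ("last_level_temperature" is the newer name for this option, so reading
+ // back bottommost_temperature should reflect the update.)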
+ auto s = db_->SetOptions({{"last_level_temperature", "kCold"}});
+ ASSERT_OK(s);
+ ASSERT_EQ(db_->GetOptions().bottommost_temperature, Temperature::kCold);
+ db_->GetColumnFamilyMetaData(&metadata);
+ // Should not impact the existing files
+ ASSERT_EQ(Temperature::kWarm,
+ metadata.levels[kBottommostLevel].files[0].temperature);
+ size = GetSstSizeHelper(Temperature::kUnknown);
+ ASSERT_GT(size, 0);
+ size = GetSstSizeHelper(Temperature::kWarm);
+ ASSERT_GT(size, 0);
+ size = GetSstSizeHelper(Temperature::kCold);
+ ASSERT_EQ(size, 0);
+
+ // new generated files should have the new settings
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ db_->GetColumnFamilyMetaData(&metadata);
+ ASSERT_EQ(1, metadata.file_count);
+ ASSERT_EQ(Temperature::kCold,
+ metadata.levels[kBottommostLevel].files[0].temperature);
+ size = GetSstSizeHelper(Temperature::kUnknown);
+ ASSERT_EQ(size, 0);
+ size = GetSstSizeHelper(Temperature::kWarm);
+ ASSERT_EQ(size, 0);
+ size = GetSstSizeHelper(Temperature::kCold);
+ ASSERT_GT(size, 0);
+
+ // kLastTemperature is an invalid temperature
+ options.bottommost_temperature = Temperature::kLastTemperature;
+ s = TryReopen(options);
+ ASSERT_TRUE(s.IsIOError());
+}
+
+TEST_F(DBTest2, LastLevelStatistics) {
+ Options options = CurrentOptions();
+ options.bottommost_temperature = Temperature::kWarm;
+ options.level0_file_num_compaction_trigger = 2;
+ options.level_compaction_dynamic_level_bytes = true;
+ options.statistics = CreateDBStatistics();
+ Reopen(options);
+
+ // generate 1 sst on level 0
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Put("bar", "bar"));
+ ASSERT_OK(Flush());
+ ASSERT_EQ("bar", Get("bar"));
+
+ ASSERT_GT(options.statistics->getTickerCount(NON_LAST_LEVEL_READ_BYTES), 0);
+ ASSERT_GT(options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT), 0);
+ ASSERT_EQ(options.statistics->getTickerCount(LAST_LEVEL_READ_BYTES), 0);
+ ASSERT_EQ(options.statistics->getTickerCount(LAST_LEVEL_READ_COUNT), 0);
+
+ // 2nd flush to trigger compaction
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Put("bar", "bar"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ("bar", Get("bar"));
+
+ ASSERT_EQ(options.statistics->getTickerCount(LAST_LEVEL_READ_BYTES),
+ options.statistics->getTickerCount(WARM_FILE_READ_BYTES));
+ ASSERT_EQ(options.statistics->getTickerCount(LAST_LEVEL_READ_COUNT),
+ options.statistics->getTickerCount(WARM_FILE_READ_COUNT));
+
+ auto pre_bytes =
+ options.statistics->getTickerCount(NON_LAST_LEVEL_READ_BYTES);
+ auto pre_count =
+ options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT);
+
+ // 3rd flush to generate 1 sst on level 0
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Put("bar", "bar"));
+ ASSERT_OK(Flush());
+ ASSERT_EQ("bar", Get("bar"));
+
+ ASSERT_GT(options.statistics->getTickerCount(NON_LAST_LEVEL_READ_BYTES),
+ pre_bytes);
+ ASSERT_GT(options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT),
+ pre_count);
+ ASSERT_EQ(options.statistics->getTickerCount(LAST_LEVEL_READ_BYTES),
+ options.statistics->getTickerCount(WARM_FILE_READ_BYTES));
+ ASSERT_EQ(options.statistics->getTickerCount(LAST_LEVEL_READ_COUNT),
+ options.statistics->getTickerCount(WARM_FILE_READ_COUNT));
+}
+
+TEST_F(DBTest2, CheckpointFileTemperature) {
+ class NoLinkTestFS : public FileTemperatureTestFS {
+ using FileTemperatureTestFS::FileTemperatureTestFS;
+
+ IOStatus LinkFile(const std::string&, const std::string&, const IOOptions&,
+ IODebugContext*) override {
+ // Return NotSupported to force the checkpoint to copy the file instead
+ // of just linking it.
+ return IOStatus::NotSupported();
+ }
+ };
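+ // Wrap the default Env with a temperature-tracking FS that refuses hard
+ // links, so the checkpoint must copy the SST files and therefore request
+ // them with a source temperature hint.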
+ auto test_fs = std::make_shared<NoLinkTestFS>(env_->GetFileSystem());
+ std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, test_fs));
+ Options options = CurrentOptions();
+ options.bottommost_temperature = Temperature::kWarm;
+ // Set level_compaction_dynamic_level_bytes to true so compaction writes
+ // the data directly to the last level, which will have the
+ // last_level_temperature
+ options.level_compaction_dynamic_level_bytes = true;
+ options.level0_file_num_compaction_trigger = 2;
+ options.env = env.get();
+ Reopen(options);
+
+ // generate a bottommost file and a non-bottommost file
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Put("bar", "bar"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Put("bar", "bar"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Put("bar", "bar"));
+ ASSERT_OK(Flush());
+ auto size = GetSstSizeHelper(Temperature::kWarm);
+ ASSERT_GT(size, 0);
+
+ std::map<uint64_t, Temperature> temperatures;
+ std::vector<LiveFileStorageInfo> infos;
+ ASSERT_OK(
+ dbfull()->GetLiveFilesStorageInfo(LiveFilesStorageInfoOptions(), &infos));
+ for (auto info : infos) {
+ temperatures.emplace(info.file_number, info.temperature);
+ }
+
+ test_fs->PopRequestedSstFileTemperatures();
+ Checkpoint* checkpoint;
+ ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+ ASSERT_OK(
+ checkpoint->CreateCheckpoint(dbname_ + kFilePathSeparator + "tempcp"));
+
+ // Check the source-file temperature hints: 2 SST files, one kWarm and the
+ // other kUnknown.
+ std::vector<std::pair<uint64_t, Temperature>> requested_temps;
+ test_fs->PopRequestedSstFileTemperatures(&requested_temps);
+ // Two requests
+ ASSERT_EQ(requested_temps.size(), 2);
+ std::set<uint64_t> distinct_requests;
+ for (const auto& requested_temp : requested_temps) {
+ // Matching manifest temperatures
+ ASSERT_EQ(temperatures.at(requested_temp.first), requested_temp.second);
+ distinct_requests.insert(requested_temp.first);
+ }
+ // Each request to distinct file
+ ASSERT_EQ(distinct_requests.size(), requested_temps.size());
+
+ delete checkpoint;
+ Close();
+}
+
+TEST_F(DBTest2, FileTemperatureManifestFixup) {
+ auto test_fs = std::make_shared<FileTemperatureTestFS>(env_->GetFileSystem());
+ std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, test_fs));
+ Options options = CurrentOptions();
+ options.bottommost_temperature = Temperature::kWarm;
+ // Set level_compaction_dynamic_level_bytes to true so compaction writes
+ // the data directly to the last level, which will have the
+ // last_level_temperature
+ options.level_compaction_dynamic_level_bytes = true;
+ options.level0_file_num_compaction_trigger = 2;
+ options.env = env.get();
+ std::vector<std::string> cfs = {/*"default",*/ "test1", "test2"};
+ CreateAndReopenWithCF(cfs, options);
+ // Needed for later re-opens (weird)
+ cfs.insert(cfs.begin(), kDefaultColumnFamilyName);
+
+ // Generate a bottommost file in all CFs
+ for (int cf = 0; cf < 3; ++cf) {
+ ASSERT_OK(Put(cf, "a", "val"));
+ ASSERT_OK(Put(cf, "c", "val"));
+ ASSERT_OK(Flush(cf));
+ ASSERT_OK(Put(cf, "b", "val"));
+ ASSERT_OK(Put(cf, "d", "val"));
+ ASSERT_OK(Flush(cf));
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ // verify
+ ASSERT_GT(GetSstSizeHelper(Temperature::kWarm), 0);
+ ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
+ ASSERT_EQ(GetSstSizeHelper(Temperature::kHot), 0);
+
+ // Generate a non-bottommost file in all CFs
+ for (int cf = 0; cf < 3; ++cf) {
+ ASSERT_OK(Put(cf, "e", "val"));
+ ASSERT_OK(Flush(cf));
+ }
+
+ // re-verify
+ ASSERT_GT(GetSstSizeHelper(Temperature::kWarm), 0);
+ // Not supported: ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
+ ASSERT_EQ(GetSstSizeHelper(Temperature::kHot), 0);
+
+ // Now change FS temperature on bottommost file(s) to kCold
+ std::map<uint64_t, Temperature> current_temps;
+ test_fs->CopyCurrentSstFileTemperatures(&current_temps);
+ for (auto e : current_temps) {
+ if (e.second == Temperature::kWarm) {
+ test_fs->OverrideSstFileTemperature(e.first, Temperature::kCold);
+ }
+ }
+ // Metadata not yet updated
+ ASSERT_EQ(Get("a"), "val");
+ ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
+
+ // Update with Close and UpdateManifestForFilesState, but first save cf
+ // descriptors
+ std::vector<ColumnFamilyDescriptor> column_families;
+ for (size_t i = 0; i < handles_.size(); ++i) {
+ ColumnFamilyDescriptor cfdescriptor;
+ // GetDescriptor is not implemented for ROCKSDB_LITE
+ handles_[i]->GetDescriptor(&cfdescriptor).PermitUncheckedError();
+ column_families.push_back(cfdescriptor);
+ }
+ Close();
+ experimental::UpdateManifestForFilesStateOptions update_opts;
+ update_opts.update_temperatures = true;
+
+ ASSERT_OK(experimental::UpdateManifestForFilesState(
+ options, dbname_, column_families, update_opts));
+
+ // Re-open and re-verify after update
+ ReopenWithColumnFamilies(cfs, options);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+ // Not supported: ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_EQ(GetSstSizeHelper(Temperature::kWarm), 0);
+ ASSERT_EQ(GetSstSizeHelper(Temperature::kHot), 0);
+
+ // Change kUnknown to kHot
+ test_fs->CopyCurrentSstFileTemperatures(&current_temps);
+ for (auto e : current_temps) {
+ if (e.second == Temperature::kUnknown) {
+ test_fs->OverrideSstFileTemperature(e.first, Temperature::kHot);
+ }
+ }
+
+ // Update with Close and UpdateManifestForFilesState
+ Close();
+ ASSERT_OK(experimental::UpdateManifestForFilesState(
+ options, dbname_, column_families, update_opts));
+
+ // Re-open and re-verify after update
+ ReopenWithColumnFamilies(cfs, options);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+ ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_EQ(GetSstSizeHelper(Temperature::kWarm), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kHot), 0);
+
+ Close();
+}
+#endif // ROCKSDB_LITE
+
+// WAL recovery mode is WALRecoveryMode::kPointInTimeRecovery.
+TEST_F(DBTest2, PointInTimeRecoveryWithIOErrorWhileReadingWal) {
+ Options options = CurrentOptions();
+ DestroyAndReopen(options);
+ ASSERT_OK(Put("foo", "value0"));
+ Close();
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ bool should_inject_error = false;
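+ // Only start injecting read errors once recovery begins reading the WAL,
+ // so that earlier file reads during open succeed.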
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::RecoverLogFiles:BeforeReadWal",
+ [&](void* /*arg*/) { should_inject_error = true; });
+ SyncPoint::GetInstance()->SetCallBack(
+ "LogReader::ReadMore:AfterReadFile", [&](void* arg) {
+ if (should_inject_error) {
+ ASSERT_NE(nullptr, arg);
+ *reinterpret_cast<Status*>(arg) = Status::IOError("Injected IOError");
+ }
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ options.avoid_flush_during_recovery = true;
+ options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;
+ Status s = TryReopen(options);
+ ASSERT_TRUE(s.IsIOError());
+}
+
+TEST_F(DBTest2, PointInTimeRecoveryWithSyncFailureInCFCreation) {
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::BackgroundCallFlush:Start:1",
+ "PointInTimeRecoveryWithSyncFailureInCFCreation:1"},
+ {"PointInTimeRecoveryWithSyncFailureInCFCreation:2",
+ "DBImpl::BackgroundCallFlush:Start:2"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ CreateColumnFamilies({"test1"}, Options());
+ ASSERT_OK(Put("foo", "bar"));
+
+ // Create a CF while a flush is in progress; the log is synced, but the
+ // closed log file is not synced and is corrupted.
+ port::Thread flush_thread([&]() { ASSERT_NOK(Flush()); });
+ TEST_SYNC_POINT("PointInTimeRecoveryWithSyncFailureInCFCreation:1");
+ CreateColumnFamilies({"test2"}, Options());
+ env_->corrupt_in_sync_ = true;
+ TEST_SYNC_POINT("PointInTimeRecoveryWithSyncFailureInCFCreation:2");
+ flush_thread.join();
+ env_->corrupt_in_sync_ = false;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+ // Reopening the DB should not corrupt anything
+ Options options = CurrentOptions();
+ options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;
+ ReopenWithColumnFamilies({"default", "test1", "test2"}, options);
+}
+
+TEST_F(DBTest2, RenameDirectory) {
+ Options options = CurrentOptions();
+ DestroyAndReopen(options);
+ ASSERT_OK(Put("foo", "value0"));
+ Close();
+ auto old_dbname = dbname_;
+ auto new_dbname = dbname_ + "_2";
+ EXPECT_OK(env_->RenameFile(dbname_, new_dbname));
+ options.create_if_missing = false;
+ dbname_ = new_dbname;
+ ASSERT_OK(TryReopen(options));
+ ASSERT_EQ("value0", Get("foo"));
+ Destroy(options);
+ dbname_ = old_dbname;
+}
+
+TEST_F(DBTest2, SstUniqueIdVerifyBackwardCompatible) {
+ const int kNumSst = 3;
+ const int kLevel0Trigger = 4;
+ auto options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = kLevel0Trigger;
+ options.statistics = CreateDBStatistics();
+ // Skip for now
+ options.verify_sst_unique_id_in_manifest = false;
+ Reopen(options);
+
+ std::atomic_int skipped = 0;
+ std::atomic_int passed = 0;
+ SyncPoint::GetInstance()->SetCallBack(
+ "BlockBasedTable::Open::SkippedVerifyUniqueId",
+ [&](void* /*arg*/) { skipped++; });
+ SyncPoint::GetInstance()->SetCallBack(
+ "BlockBasedTable::Open::PassedVerifyUniqueId",
+ [&](void* /*arg*/) { passed++; });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ // generate a few SSTs
+ for (int i = 0; i < kNumSst; i++) {
+ for (int j = 0; j < 100; j++) {
+ ASSERT_OK(Put(Key(i * 10 + j), "value"));
+ }
+ ASSERT_OK(Flush());
+ }
+
+ // Verification has been skipped on files so far
+ EXPECT_EQ(skipped, kNumSst);
+ EXPECT_EQ(passed, 0);
+
+ // Reopen with verification
+ options.verify_sst_unique_id_in_manifest = true;
+ skipped = 0;
+ passed = 0;
+ Reopen(options);
+ EXPECT_EQ(skipped, 0);
+ EXPECT_EQ(passed, kNumSst);
+
+ // Now simulate there being no unique ID in the manifest for the next file.
+ // NOTE: this only affects loading the manifest from disk, not the
+ // in-memory manifest, so we need to re-open below.
+ SyncPoint::GetInstance()->SetCallBack(
+ "VersionEdit::EncodeTo:UniqueId", [&](void* arg) {
+ auto unique_id = static_cast<UniqueId64x2*>(arg);
+ // remove id before writing it to manifest
+ (*unique_id)[0] = 0;
+ (*unique_id)[1] = 0;
+ });
+
+ // test compaction generated Sst
+ for (int i = kNumSst; i < kLevel0Trigger; i++) {
+ for (int j = 0; j < 100; j++) {
+ ASSERT_OK(Put(Key(i * 10 + j), "value"));
+ }
+ ASSERT_OK(Flush());
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+#ifndef ROCKSDB_LITE
+ ASSERT_EQ("0,1", FilesPerLevel(0));
+#endif // ROCKSDB_LITE
+
+ // Reopen (with verification)
+ ASSERT_TRUE(options.verify_sst_unique_id_in_manifest);
+ skipped = 0;
+ passed = 0;
+ Reopen(options);
+ EXPECT_EQ(skipped, 1);
+ EXPECT_EQ(passed, 0);
+}
+
+TEST_F(DBTest2, SstUniqueIdVerify) {
+ const int kNumSst = 3;
+ const int kLevel0Trigger = 4;
+ auto options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = kLevel0Trigger;
+ // Allow mismatch for now
+ options.verify_sst_unique_id_in_manifest = false;
+ Reopen(options);
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "PropertyBlockBuilder::AddTableProperty:Start", [&](void* props_vs) {
+ auto props = static_cast<TableProperties*>(props_vs);
+ // update table property session_id to a different one, which
+ // changes unique ID
+ props->db_session_id = DBImpl::GenerateDbSessionId(nullptr);
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ // generate a few SSTs
+ for (int i = 0; i < kNumSst; i++) {
+ for (int j = 0; j < 100; j++) {
+ ASSERT_OK(Put(Key(i * 10 + j), "value"));
+ }
+ ASSERT_OK(Flush());
+ }
+
+ // Reopen with verification should report corruption
+ options.verify_sst_unique_id_in_manifest = true;
+ auto s = TryReopen(options);
+ ASSERT_TRUE(s.IsCorruption());
+
+ // Reopen without verification should be fine
+ options.verify_sst_unique_id_in_manifest = false;
+ Reopen(options);
+
+  // Test an SST generated by compaction
+ for (int i = kNumSst; i < kLevel0Trigger; i++) {
+ for (int j = 0; j < 100; j++) {
+ ASSERT_OK(Put(Key(i * 10 + j), "value"));
+ }
+ ASSERT_OK(Flush());
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+#ifndef ROCKSDB_LITE
+ ASSERT_EQ("0,1", FilesPerLevel(0));
+#endif // ROCKSDB_LITE
+
+ // Reopen with verification should fail
+ options.verify_sst_unique_id_in_manifest = true;
+ s = TryReopen(options);
+ ASSERT_TRUE(s.IsCorruption());
+}
+
+TEST_F(DBTest2, SstUniqueIdVerifyMultiCFs) {
+ const int kNumSst = 3;
+ const int kLevel0Trigger = 4;
+ auto options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = kLevel0Trigger;
+ // Allow mismatch for now
+ options.verify_sst_unique_id_in_manifest = false;
+
+ CreateAndReopenWithCF({"one", "two"}, options);
+
+ // generate good SSTs
+ for (int cf_num : {0, 2}) {
+ for (int i = 0; i < kNumSst; i++) {
+ for (int j = 0; j < 100; j++) {
+ ASSERT_OK(Put(cf_num, Key(i * 10 + j), "value"));
+ }
+ ASSERT_OK(Flush(cf_num));
+ }
+ }
+
+ // generate SSTs with bad unique id
+ SyncPoint::GetInstance()->SetCallBack(
+ "PropertyBlockBuilder::AddTableProperty:Start", [&](void* props_vs) {
+ auto props = static_cast<TableProperties*>(props_vs);
+ // update table property session_id to a different one
+ props->db_session_id = DBImpl::GenerateDbSessionId(nullptr);
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ for (int i = 0; i < kNumSst; i++) {
+ for (int j = 0; j < 100; j++) {
+ ASSERT_OK(Put(1, Key(i * 10 + j), "value"));
+ }
+ ASSERT_OK(Flush(1));
+ }
+
+ // Reopen with verification should report corruption
+ options.verify_sst_unique_id_in_manifest = true;
+ auto s = TryReopenWithColumnFamilies({"default", "one", "two"}, options);
+ ASSERT_TRUE(s.IsCorruption());
+}
+
+TEST_F(DBTest2, BestEffortsRecoveryWithSstUniqueIdVerification) {
+ const auto tamper_with_uniq_id = [&](void* arg) {
+ auto props = static_cast<TableProperties*>(arg);
+ assert(props);
+ // update table property session_id to a different one
+ props->db_session_id = DBImpl::GenerateDbSessionId(nullptr);
+ };
+
+ const auto assert_db = [&](size_t expected_count,
+ const std::string& expected_v) {
+ std::unique_ptr<Iterator> it(db_->NewIterator(ReadOptions()));
+ size_t cnt = 0;
+ for (it->SeekToFirst(); it->Valid(); it->Next(), ++cnt) {
+ ASSERT_EQ(std::to_string(cnt), it->key());
+ ASSERT_EQ(expected_v, it->value());
+ }
+ ASSERT_EQ(expected_count, cnt);
+ };
+
+ const int num_l0_compaction_trigger = 8;
+ const int num_l0 = num_l0_compaction_trigger - 1;
+ Options options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = num_l0_compaction_trigger;
+
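+  // For each k, give the k-th L0 file (and, since the callback stays
+  // installed, every later file) a tampered unique ID, then verify that
+  // best-efforts recovery rolls the DB back to the last file with a good ID.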
+ for (int k = 0; k < num_l0; ++k) {
+ // Allow mismatch for now
+ options.verify_sst_unique_id_in_manifest = false;
+
+ DestroyAndReopen(options);
+
+ constexpr size_t num_keys_per_file = 10;
+ for (int i = 0; i < num_l0; ++i) {
+ for (size_t j = 0; j < num_keys_per_file; ++j) {
+ ASSERT_OK(Put(std::to_string(j), "v" + std::to_string(i)));
+ }
+ if (i == k) {
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->SetCallBack(
+ "PropertyBlockBuilder::AddTableProperty:Start",
+ tamper_with_uniq_id);
+ SyncPoint::GetInstance()->EnableProcessing();
+ }
+ ASSERT_OK(Flush());
+ }
+
+ options.verify_sst_unique_id_in_manifest = true;
+ Status s = TryReopen(options);
+ ASSERT_TRUE(s.IsCorruption());
+
+ options.best_efforts_recovery = true;
+ Reopen(options);
+ assert_db(k == 0 ? 0 : num_keys_per_file, "v" + std::to_string(k - 1));
+
+ // Reopen with regular recovery
+ options.best_efforts_recovery = false;
+ Reopen(options);
+ assert_db(k == 0 ? 0 : num_keys_per_file, "v" + std::to_string(k - 1));
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+
+ for (size_t i = 0; i < num_keys_per_file; ++i) {
+ ASSERT_OK(Put(std::to_string(i), "v"));
+ }
+ ASSERT_OK(Flush());
+ Reopen(options);
+ {
+ for (size_t i = 0; i < num_keys_per_file; ++i) {
+ ASSERT_EQ("v", Get(std::to_string(i)));
+ }
+ }
+ }
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest2, GetLatestSeqAndTsForKey) {
+ Destroy(last_options_);
+
+ Options options = CurrentOptions();
+ options.max_write_buffer_size_to_maintain = 64 << 10;
+ options.create_if_missing = true;
+ options.disable_auto_compactions = true;
+ options.comparator = test::BytewiseComparatorWithU64TsWrapper();
+ options.statistics = CreateDBStatistics();
+
+ Reopen(options);
+
+ constexpr uint64_t kTsU64Value = 12;
+
+ for (uint64_t key = 0; key < 100; ++key) {
+ std::string ts;
+ PutFixed64(&ts, kTsU64Value);
+
+ std::string key_str;
+ PutFixed64(&key_str, key);
+ std::reverse(key_str.begin(), key_str.end());
+ ASSERT_OK(db_->Put(WriteOptions(), key_str, ts, "value"));
+ }
+
+ ASSERT_OK(Flush());
+
+ constexpr bool cache_only = true;
+ constexpr SequenceNumber lower_bound_seq = 0;
+ auto* cfhi = static_cast_with_check<ColumnFamilyHandleImpl>(
+ dbfull()->DefaultColumnFamily());
+ assert(cfhi);
+ assert(cfhi->cfd());
+ SuperVersion* sv = cfhi->cfd()->GetSuperVersion();
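+  // Thanks to max_write_buffer_size_to_maintain, the flushed memtable is
+  // kept in memory, so a cache-only lookup can still find each key and its
+  // timestamp without touching any SST file.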
+ for (uint64_t key = 0; key < 100; ++key) {
+ std::string key_str;
+ PutFixed64(&key_str, key);
+ std::reverse(key_str.begin(), key_str.end());
+ std::string ts;
+ SequenceNumber seq = kMaxSequenceNumber;
+ bool found_record_for_key = false;
+ bool is_blob_index = false;
+
+ const Status s = dbfull()->GetLatestSequenceForKey(
+ sv, key_str, cache_only, lower_bound_seq, &seq, &ts,
+ &found_record_for_key, &is_blob_index);
+ ASSERT_OK(s);
+ std::string expected_ts;
+ PutFixed64(&expected_ts, kTsU64Value);
+ ASSERT_EQ(expected_ts, ts);
+ ASSERT_TRUE(found_record_for_key);
+ ASSERT_FALSE(is_blob_index);
+ }
+
+  // Verify that there were no reads from SST files.
+ ASSERT_EQ(0, options.statistics->getTickerCount(GET_HIT_L0));
+}
+#endif // ROCKSDB_LITE
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ RegisterCustomObjects(argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_test_util.cc b/src/rocksdb/db/db_test_util.cc
new file mode 100644
index 000000000..d53bca51a
--- /dev/null
+++ b/src/rocksdb/db/db_test_util.cc
@@ -0,0 +1,1773 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/db_test_util.h"
+
+#include "cache/cache_reservation_manager.h"
+#include "db/forward_iterator.h"
+#include "env/mock_env.h"
+#include "port/lang.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/env_encryption.h"
+#include "rocksdb/unique_id.h"
+#include "rocksdb/utilities/object_registry.h"
+#include "table/format.h"
+#include "util/random.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+int64_t MaybeCurrentTime(Env* env) {
+ int64_t time = 1337346000; // arbitrary fallback default
+ env->GetCurrentTime(&time).PermitUncheckedError();
+ return time;
+}
+} // anonymous namespace
+
+// Special Env used to delay background operations
+
+SpecialEnv::SpecialEnv(Env* base, bool time_elapse_only_sleep)
+ : EnvWrapper(base),
+ maybe_starting_time_(MaybeCurrentTime(base)),
+ rnd_(301),
+ sleep_counter_(this),
+ time_elapse_only_sleep_(time_elapse_only_sleep),
+ no_slowdown_(time_elapse_only_sleep) {
+ delay_sstable_sync_.store(false, std::memory_order_release);
+ drop_writes_.store(false, std::memory_order_release);
+ no_space_.store(false, std::memory_order_release);
+ non_writable_.store(false, std::memory_order_release);
+ count_random_reads_ = false;
+ count_sequential_reads_ = false;
+ manifest_sync_error_.store(false, std::memory_order_release);
+ manifest_write_error_.store(false, std::memory_order_release);
+ log_write_error_.store(false, std::memory_order_release);
+ no_file_overwrite_.store(false, std::memory_order_release);
+ random_file_open_counter_.store(0, std::memory_order_relaxed);
+ delete_count_.store(0, std::memory_order_relaxed);
+ num_open_wal_file_.store(0);
+ log_write_slowdown_ = 0;
+ bytes_written_ = 0;
+ sync_counter_ = 0;
+ non_writeable_rate_ = 0;
+ new_writable_count_ = 0;
+ non_writable_count_ = 0;
+ table_write_callback_ = nullptr;
+}
+
+DBTestBase::DBTestBase(const std::string path, bool env_do_fsync)
+ : mem_env_(nullptr), encrypted_env_(nullptr), option_config_(kDefault) {
+ Env* base_env = Env::Default();
+ ConfigOptions config_options;
+ EXPECT_OK(test::CreateEnvFromSystem(config_options, &base_env, &env_guard_));
+ EXPECT_NE(nullptr, base_env);
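+  // The MEM_ENV and ENCRYPTED_ENV environment variables optionally wrap the
+  // base Env with a mock in-memory Env and/or an encrypted Env.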
+ if (getenv("MEM_ENV")) {
+ mem_env_ = MockEnv::Create(base_env, base_env->GetSystemClock());
+ }
+#ifndef ROCKSDB_LITE
+ if (getenv("ENCRYPTED_ENV")) {
+ std::shared_ptr<EncryptionProvider> provider;
+ std::string provider_id = getenv("ENCRYPTED_ENV");
+ if (provider_id.find("=") == std::string::npos &&
+ !EndsWith(provider_id, "://test")) {
+ provider_id = provider_id + "://test";
+ }
+ EXPECT_OK(EncryptionProvider::CreateFromString(ConfigOptions(), provider_id,
+ &provider));
+ encrypted_env_ = NewEncryptedEnv(mem_env_ ? mem_env_ : base_env, provider);
+ }
+#endif // !ROCKSDB_LITE
+ env_ = new SpecialEnv(encrypted_env_ ? encrypted_env_
+ : (mem_env_ ? mem_env_ : base_env));
+ env_->SetBackgroundThreads(1, Env::LOW);
+ env_->SetBackgroundThreads(1, Env::HIGH);
+ env_->skip_fsync_ = !env_do_fsync;
+ dbname_ = test::PerThreadDBPath(env_, path);
+ alternative_wal_dir_ = dbname_ + "/wal";
+ alternative_db_log_dir_ = dbname_ + "/db_log_dir";
+ auto options = CurrentOptions();
+ options.env = env_;
+ auto delete_options = options;
+ delete_options.wal_dir = alternative_wal_dir_;
+ EXPECT_OK(DestroyDB(dbname_, delete_options));
+  // Destroy again in case the alternative WAL dir was not used.
+ EXPECT_OK(DestroyDB(dbname_, options));
+ db_ = nullptr;
+ Reopen(options);
+ Random::GetTLSInstance()->Reset(0xdeadbeef);
+}
+
+DBTestBase::~DBTestBase() {
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+ Close();
+ Options options;
+ options.db_paths.emplace_back(dbname_, 0);
+ options.db_paths.emplace_back(dbname_ + "_2", 0);
+ options.db_paths.emplace_back(dbname_ + "_3", 0);
+ options.db_paths.emplace_back(dbname_ + "_4", 0);
+ options.env = env_;
+
+ if (getenv("KEEP_DB")) {
+ printf("DB is still at %s\n", dbname_.c_str());
+ } else {
+ EXPECT_OK(DestroyDB(dbname_, options));
+ }
+ delete env_;
+}
+
+bool DBTestBase::ShouldSkipOptions(int option_config, int skip_mask) {
+#ifdef ROCKSDB_LITE
+ // These options are not supported in ROCKSDB_LITE
+ if (option_config == kHashSkipList ||
+ option_config == kPlainTableFirstBytePrefix ||
+ option_config == kPlainTableCappedPrefix ||
+ option_config == kPlainTableCappedPrefixNonMmap ||
+ option_config == kPlainTableAllBytesPrefix ||
+ option_config == kVectorRep || option_config == kHashLinkList ||
+ option_config == kUniversalCompaction ||
+ option_config == kUniversalCompactionMultiLevel ||
+ option_config == kUniversalSubcompactions ||
+ option_config == kFIFOCompaction ||
+ option_config == kConcurrentSkipList) {
+ return true;
+ }
+#endif
+
+ if ((skip_mask & kSkipUniversalCompaction) &&
+ (option_config == kUniversalCompaction ||
+ option_config == kUniversalCompactionMultiLevel ||
+ option_config == kUniversalSubcompactions)) {
+ return true;
+ }
+ if ((skip_mask & kSkipMergePut) && option_config == kMergePut) {
+ return true;
+ }
+ if ((skip_mask & kSkipNoSeekToLast) &&
+ (option_config == kHashLinkList || option_config == kHashSkipList)) {
+ return true;
+ }
+ if ((skip_mask & kSkipPlainTable) &&
+ (option_config == kPlainTableAllBytesPrefix ||
+ option_config == kPlainTableFirstBytePrefix ||
+ option_config == kPlainTableCappedPrefix ||
+ option_config == kPlainTableCappedPrefixNonMmap)) {
+ return true;
+ }
+ if ((skip_mask & kSkipHashIndex) &&
+ (option_config == kBlockBasedTableWithPrefixHashIndex ||
+ option_config == kBlockBasedTableWithWholeKeyHashIndex)) {
+ return true;
+ }
+ if ((skip_mask & kSkipFIFOCompaction) && option_config == kFIFOCompaction) {
+ return true;
+ }
+ if ((skip_mask & kSkipMmapReads) && option_config == kWalDirAndMmapReads) {
+ return true;
+ }
+ return false;
+}
+
+// Switch to a fresh database with the next option configuration to
+// test. Return false if there are no more configurations to test.
+bool DBTestBase::ChangeOptions(int skip_mask) {
+ for (option_config_++; option_config_ < kEnd; option_config_++) {
+ if (ShouldSkipOptions(option_config_, skip_mask)) {
+ continue;
+ }
+ break;
+ }
+
+ if (option_config_ >= kEnd) {
+ Destroy(last_options_);
+ return false;
+ } else {
+ auto options = CurrentOptions();
+ options.create_if_missing = true;
+ DestroyAndReopen(options);
+ return true;
+ }
+}
+
+// Switch between different compaction styles.
+bool DBTestBase::ChangeCompactOptions() {
+ if (option_config_ == kDefault) {
+ option_config_ = kUniversalCompaction;
+ Destroy(last_options_);
+ auto options = CurrentOptions();
+ options.create_if_missing = true;
+ Reopen(options);
+ return true;
+ } else if (option_config_ == kUniversalCompaction) {
+ option_config_ = kUniversalCompactionMultiLevel;
+ Destroy(last_options_);
+ auto options = CurrentOptions();
+ options.create_if_missing = true;
+ Reopen(options);
+ return true;
+ } else if (option_config_ == kUniversalCompactionMultiLevel) {
+ option_config_ = kLevelSubcompactions;
+ Destroy(last_options_);
+ auto options = CurrentOptions();
+ assert(options.max_subcompactions > 1);
+ Reopen(options);
+ return true;
+ } else if (option_config_ == kLevelSubcompactions) {
+ option_config_ = kUniversalSubcompactions;
+ Destroy(last_options_);
+ auto options = CurrentOptions();
+ assert(options.max_subcompactions > 1);
+ Reopen(options);
+ return true;
+ } else {
+ return false;
+ }
+}
+
+// Switch between different WAL settings
+bool DBTestBase::ChangeWalOptions() {
+ if (option_config_ == kDefault) {
+ option_config_ = kDBLogDir;
+ Destroy(last_options_);
+ auto options = CurrentOptions();
+ Destroy(options);
+ options.create_if_missing = true;
+ Reopen(options);
+ return true;
+ } else if (option_config_ == kDBLogDir) {
+ option_config_ = kWalDirAndMmapReads;
+ Destroy(last_options_);
+ auto options = CurrentOptions();
+ Destroy(options);
+ options.create_if_missing = true;
+ Reopen(options);
+ return true;
+ } else if (option_config_ == kWalDirAndMmapReads) {
+ option_config_ = kRecycleLogFiles;
+ Destroy(last_options_);
+ auto options = CurrentOptions();
+ Destroy(options);
+ Reopen(options);
+ return true;
+ } else {
+ return false;
+ }
+}
+
+// Switch between different filter policies.
+// Jumps from kDefault to kFilter, then to the full-filter and
+// partitioned-filter configurations.
+bool DBTestBase::ChangeFilterOptions() {
+ if (option_config_ == kDefault) {
+ option_config_ = kFilter;
+ } else if (option_config_ == kFilter) {
+ option_config_ = kFullFilterWithNewTableReaderForCompactions;
+ } else if (option_config_ == kFullFilterWithNewTableReaderForCompactions) {
+ option_config_ = kPartitionedFilterWithNewTableReaderForCompactions;
+ } else {
+ return false;
+ }
+ Destroy(last_options_);
+
+ auto options = CurrentOptions();
+ options.create_if_missing = true;
+ TryReopen(options);
+ return true;
+}
+
+// Switch between different DB options for file ingestion tests.
+bool DBTestBase::ChangeOptionsForFileIngestionTest() {
+ if (option_config_ == kDefault) {
+ option_config_ = kUniversalCompaction;
+ Destroy(last_options_);
+ auto options = CurrentOptions();
+ options.create_if_missing = true;
+ TryReopen(options);
+ return true;
+ } else if (option_config_ == kUniversalCompaction) {
+ option_config_ = kUniversalCompactionMultiLevel;
+ Destroy(last_options_);
+ auto options = CurrentOptions();
+ options.create_if_missing = true;
+ TryReopen(options);
+ return true;
+ } else if (option_config_ == kUniversalCompactionMultiLevel) {
+ option_config_ = kLevelSubcompactions;
+ Destroy(last_options_);
+ auto options = CurrentOptions();
+ assert(options.max_subcompactions > 1);
+ TryReopen(options);
+ return true;
+ } else if (option_config_ == kLevelSubcompactions) {
+ option_config_ = kUniversalSubcompactions;
+ Destroy(last_options_);
+ auto options = CurrentOptions();
+ assert(options.max_subcompactions > 1);
+ TryReopen(options);
+ return true;
+ } else if (option_config_ == kUniversalSubcompactions) {
+ option_config_ = kDirectIO;
+ Destroy(last_options_);
+ auto options = CurrentOptions();
+ TryReopen(options);
+ return true;
+ } else {
+ return false;
+ }
+}
+
+// Return the current option configuration.
+Options DBTestBase::CurrentOptions(
+ const anon::OptionsOverride& options_override) const {
+ return GetOptions(option_config_, GetDefaultOptions(), options_override);
+}
+
+Options DBTestBase::CurrentOptions(
+ const Options& default_options,
+ const anon::OptionsOverride& options_override) const {
+ return GetOptions(option_config_, default_options, options_override);
+}
+
+Options DBTestBase::GetDefaultOptions() const {
+ Options options;
+ options.write_buffer_size = 4090 * 4096;
+ options.target_file_size_base = 2 * 1024 * 1024;
+ options.max_bytes_for_level_base = 10 * 1024 * 1024;
+ options.max_open_files = 5000;
+ options.wal_recovery_mode = WALRecoveryMode::kTolerateCorruptedTailRecords;
+ options.compaction_pri = CompactionPri::kByCompensatedSize;
+ options.env = env_;
+ if (!env_->skip_fsync_) {
+ options.track_and_verify_wals_in_manifest = true;
+ }
+ return options;
+}
+
+Options DBTestBase::GetOptions(
+ int option_config, const Options& default_options,
+ const anon::OptionsOverride& options_override) const {
+  // This redundant copy minimizes code change without triggering a lint error.
+ Options options = default_options;
+ BlockBasedTableOptions table_options;
+ bool set_block_based_table_factory = true;
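+  // Clear any direct I/O mocking sync points left over from a previous
+  // kDirectIO option config.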
+#if !defined(OS_MACOSX) && !defined(OS_WIN) && !defined(OS_SOLARIS) && \
+ !defined(OS_AIX)
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack(
+ "NewRandomAccessFile:O_DIRECT");
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack(
+ "NewWritableFile:O_DIRECT");
+#endif
+ // kMustFreeHeapAllocations -> indicates ASAN build
+ if (kMustFreeHeapAllocations && !options_override.full_block_cache) {
+ // Detecting block cache use-after-free is normally difficult in unit
+ // tests, because as a cache, it tends to keep unreferenced entries in
+ // memory, and we normally want unit tests to take advantage of block
+ // cache for speed. However, we also want a strong chance of detecting
+ // block cache use-after-free in unit tests in ASAN builds, so for ASAN
+ // builds we use a trivially small block cache to which entries can be
+ // added but are immediately freed on no more references.
+ table_options.block_cache = NewLRUCache(/* too small */ 1);
+ }
+
+ bool can_allow_mmap = IsMemoryMappedAccessSupported();
+ switch (option_config) {
+#ifndef ROCKSDB_LITE
+ case kHashSkipList:
+ options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+ options.memtable_factory.reset(NewHashSkipListRepFactory(16));
+ options.allow_concurrent_memtable_write = false;
+ options.unordered_write = false;
+ break;
+ case kPlainTableFirstBytePrefix:
+ options.table_factory.reset(NewPlainTableFactory());
+ options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+ options.allow_mmap_reads = can_allow_mmap;
+ options.max_sequential_skip_in_iterations = 999999;
+ set_block_based_table_factory = false;
+ break;
+ case kPlainTableCappedPrefix:
+ options.table_factory.reset(NewPlainTableFactory());
+ options.prefix_extractor.reset(NewCappedPrefixTransform(8));
+ options.allow_mmap_reads = can_allow_mmap;
+ options.max_sequential_skip_in_iterations = 999999;
+ set_block_based_table_factory = false;
+ break;
+ case kPlainTableCappedPrefixNonMmap:
+ options.table_factory.reset(NewPlainTableFactory());
+ options.prefix_extractor.reset(NewCappedPrefixTransform(8));
+ options.allow_mmap_reads = false;
+ options.max_sequential_skip_in_iterations = 999999;
+ set_block_based_table_factory = false;
+ break;
+ case kPlainTableAllBytesPrefix:
+ options.table_factory.reset(NewPlainTableFactory());
+ options.prefix_extractor.reset(NewNoopTransform());
+ options.allow_mmap_reads = can_allow_mmap;
+ options.max_sequential_skip_in_iterations = 999999;
+ set_block_based_table_factory = false;
+ break;
+ case kVectorRep:
+ options.memtable_factory.reset(new VectorRepFactory(100));
+ options.allow_concurrent_memtable_write = false;
+ options.unordered_write = false;
+ break;
+ case kHashLinkList:
+ options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+ options.memtable_factory.reset(
+ NewHashLinkListRepFactory(4, 0, 3, true, 4));
+ options.allow_concurrent_memtable_write = false;
+ options.unordered_write = false;
+ break;
+ case kDirectIO: {
+ options.use_direct_reads = true;
+ options.use_direct_io_for_flush_and_compaction = true;
+ options.compaction_readahead_size = 2 * 1024 * 1024;
+ SetupSyncPointsToMockDirectIO();
+ break;
+ }
+#endif // ROCKSDB_LITE
+ case kMergePut:
+ options.merge_operator = MergeOperators::CreatePutOperator();
+ break;
+ case kFilter:
+ table_options.filter_policy.reset(NewBloomFilterPolicy(10, true));
+ break;
+ case kFullFilterWithNewTableReaderForCompactions:
+ table_options.filter_policy.reset(NewBloomFilterPolicy(10, false));
+ options.compaction_readahead_size = 10 * 1024 * 1024;
+ break;
+ case kPartitionedFilterWithNewTableReaderForCompactions:
+ table_options.filter_policy.reset(NewBloomFilterPolicy(10, false));
+ table_options.partition_filters = true;
+ table_options.index_type =
+ BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+ options.compaction_readahead_size = 10 * 1024 * 1024;
+ break;
+ case kUncompressed:
+ options.compression = kNoCompression;
+ break;
+ case kNumLevel_3:
+ options.num_levels = 3;
+ break;
+ case kDBLogDir:
+ options.db_log_dir = alternative_db_log_dir_;
+ break;
+ case kWalDirAndMmapReads:
+ options.wal_dir = alternative_wal_dir_;
+      // mmap reads should be orthogonal to the WAL dir setting, so we
+      // piggyback on this option config to test mmap reads as well.
+ options.allow_mmap_reads = can_allow_mmap;
+ break;
+ case kManifestFileSize:
+ options.max_manifest_file_size = 50; // 50 bytes
+ break;
+ case kPerfOptions:
+ options.delayed_write_rate = 8 * 1024 * 1024;
+ options.report_bg_io_stats = true;
+ // TODO(3.13) -- test more options
+ break;
+ case kUniversalCompaction:
+ options.compaction_style = kCompactionStyleUniversal;
+ options.num_levels = 1;
+ break;
+ case kUniversalCompactionMultiLevel:
+ options.compaction_style = kCompactionStyleUniversal;
+ options.num_levels = 8;
+ break;
+ case kCompressedBlockCache:
+ options.allow_mmap_writes = can_allow_mmap;
+ table_options.block_cache_compressed = NewLRUCache(8 * 1024 * 1024);
+ break;
+ case kInfiniteMaxOpenFiles:
+ options.max_open_files = -1;
+ break;
+ case kCRC32cChecksum: {
+ // Old default was CRC32c, but XXH3 (new default) is faster on common
+ // hardware
+ table_options.checksum = kCRC32c;
+ // Thrown in here for basic coverage:
+ options.DisableExtraChecks();
+ break;
+ }
+ case kFIFOCompaction: {
+ options.compaction_style = kCompactionStyleFIFO;
+ options.max_open_files = -1;
+ break;
+ }
+ case kBlockBasedTableWithPrefixHashIndex: {
+ table_options.index_type = BlockBasedTableOptions::kHashSearch;
+ options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+ break;
+ }
+ case kBlockBasedTableWithWholeKeyHashIndex: {
+ table_options.index_type = BlockBasedTableOptions::kHashSearch;
+ options.prefix_extractor.reset(NewNoopTransform());
+ break;
+ }
+ case kBlockBasedTableWithPartitionedIndex: {
+ table_options.format_version = 3;
+ table_options.index_type = BlockBasedTableOptions::kTwoLevelIndexSearch;
+ options.prefix_extractor.reset(NewNoopTransform());
+ break;
+ }
+ case kBlockBasedTableWithPartitionedIndexFormat4: {
+ table_options.format_version = 4;
+ // Format 4 changes the binary index format. Since partitioned index is a
+ // super-set of simple indexes, we are also using kTwoLevelIndexSearch to
+ // test this format.
+ table_options.index_type = BlockBasedTableOptions::kTwoLevelIndexSearch;
+      // The top-level index in partitioned filters is also affected by format 4.
+ table_options.filter_policy.reset(NewBloomFilterPolicy(10, false));
+ table_options.partition_filters = true;
+ table_options.index_block_restart_interval = 8;
+ break;
+ }
+ case kBlockBasedTableWithIndexRestartInterval: {
+ table_options.index_block_restart_interval = 8;
+ break;
+ }
+ case kBlockBasedTableWithLatestFormat: {
+      // Use the latest format version in case it differs from the default.
+ table_options.format_version = kLatestFormatVersion;
+ break;
+ }
+ case kOptimizeFiltersForHits: {
+ options.optimize_filters_for_hits = true;
+ set_block_based_table_factory = true;
+ break;
+ }
+ case kRowCache: {
+ options.row_cache = NewLRUCache(1024 * 1024);
+ break;
+ }
+ case kRecycleLogFiles: {
+ options.recycle_log_file_num = 2;
+ break;
+ }
+ case kLevelSubcompactions: {
+ options.max_subcompactions = 4;
+ break;
+ }
+ case kUniversalSubcompactions: {
+ options.compaction_style = kCompactionStyleUniversal;
+ options.num_levels = 8;
+ options.max_subcompactions = 4;
+ break;
+ }
+ case kConcurrentSkipList: {
+ options.allow_concurrent_memtable_write = true;
+ options.enable_write_thread_adaptive_yield = true;
+ break;
+ }
+ case kPipelinedWrite: {
+ options.enable_pipelined_write = true;
+ break;
+ }
+ case kConcurrentWALWrites: {
+      // These options optimize the 2PC commit path.
+ options.two_write_queues = true;
+ options.manual_wal_flush = true;
+ break;
+ }
+ case kUnorderedWrite: {
+ options.allow_concurrent_memtable_write = false;
+ options.unordered_write = false;
+ break;
+ }
+
+ default:
+ break;
+ }
+
+ if (options_override.filter_policy) {
+ table_options.filter_policy = options_override.filter_policy;
+ table_options.partition_filters = options_override.partition_filters;
+ table_options.metadata_block_size = options_override.metadata_block_size;
+ }
+ if (set_block_based_table_factory) {
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ }
+ options.env = env_;
+ options.create_if_missing = true;
+ options.fail_if_options_file_error = true;
+ return options;
+}
+
+void DBTestBase::CreateColumnFamilies(const std::vector<std::string>& cfs,
+ const Options& options) {
+ ColumnFamilyOptions cf_opts(options);
+ size_t cfi = handles_.size();
+ handles_.resize(cfi + cfs.size());
+ for (auto cf : cfs) {
+ Status s = db_->CreateColumnFamily(cf_opts, cf, &handles_[cfi++]);
+ ASSERT_OK(s);
+ }
+}
+
+void DBTestBase::CreateAndReopenWithCF(const std::vector<std::string>& cfs,
+ const Options& options) {
+ CreateColumnFamilies(cfs, options);
+ std::vector<std::string> cfs_plus_default = cfs;
+ cfs_plus_default.insert(cfs_plus_default.begin(), kDefaultColumnFamilyName);
+ ReopenWithColumnFamilies(cfs_plus_default, options);
+}
+
+void DBTestBase::ReopenWithColumnFamilies(const std::vector<std::string>& cfs,
+ const std::vector<Options>& options) {
+ ASSERT_OK(TryReopenWithColumnFamilies(cfs, options));
+}
+
+void DBTestBase::ReopenWithColumnFamilies(const std::vector<std::string>& cfs,
+ const Options& options) {
+ ASSERT_OK(TryReopenWithColumnFamilies(cfs, options));
+}
+
+void DBTestBase::SetTimeElapseOnlySleepOnReopen(DBOptions* options) {
+ time_elapse_only_sleep_on_reopen_ = true;
+
+ // Need to disable stats dumping and persisting which also use
+ // RepeatableThread, which uses InstrumentedCondVar::TimedWaitInternal.
+ // With time_elapse_only_sleep_, this can hang on some platforms (MacOS)
+ // because (a) on some platforms, pthread_cond_timedwait does not appear
+ // to release the lock for other threads to operate if the deadline time
+ // is already passed, and (b) TimedWait calls are currently a bad abstraction
+ // because the deadline parameter is usually computed from Env time,
+ // but is interpreted in real clock time.
+ options->stats_dump_period_sec = 0;
+ options->stats_persist_period_sec = 0;
+}
+
+void DBTestBase::MaybeInstallTimeElapseOnlySleep(const DBOptions& options) {
+ if (time_elapse_only_sleep_on_reopen_) {
+ assert(options.env == env_ ||
+ static_cast_with_check<CompositeEnvWrapper>(options.env)
+ ->env_target() == env_);
+ assert(options.stats_dump_period_sec == 0);
+ assert(options.stats_persist_period_sec == 0);
+ // We cannot set these before destroying the last DB because they might
+ // cause a deadlock or similar without the appropriate options set in
+ // the DB.
+ env_->time_elapse_only_sleep_ = true;
+ env_->no_slowdown_ = true;
+ } else {
+ // Going back in same test run is not yet supported, so no
+ // reset in this case.
+ }
+}
+
+Status DBTestBase::TryReopenWithColumnFamilies(
+ const std::vector<std::string>& cfs, const std::vector<Options>& options) {
+ Close();
+ EXPECT_EQ(cfs.size(), options.size());
+ std::vector<ColumnFamilyDescriptor> column_families;
+ for (size_t i = 0; i < cfs.size(); ++i) {
+ column_families.push_back(ColumnFamilyDescriptor(cfs[i], options[i]));
+ }
+ DBOptions db_opts = DBOptions(options[0]);
+ last_options_ = options[0];
+ MaybeInstallTimeElapseOnlySleep(db_opts);
+ return DB::Open(db_opts, dbname_, column_families, &handles_, &db_);
+}
+
+Status DBTestBase::TryReopenWithColumnFamilies(
+ const std::vector<std::string>& cfs, const Options& options) {
+ Close();
+ std::vector<Options> v_opts(cfs.size(), options);
+ return TryReopenWithColumnFamilies(cfs, v_opts);
+}
+
+void DBTestBase::Reopen(const Options& options) {
+ ASSERT_OK(TryReopen(options));
+}
+
+void DBTestBase::Close() {
+ for (auto h : handles_) {
+ EXPECT_OK(db_->DestroyColumnFamilyHandle(h));
+ }
+ handles_.clear();
+ delete db_;
+ db_ = nullptr;
+}
+
+void DBTestBase::DestroyAndReopen(const Options& options) {
+ // Destroy using last options
+ Destroy(last_options_);
+ Reopen(options);
+}
+
+void DBTestBase::Destroy(const Options& options, bool delete_cf_paths) {
+ std::vector<ColumnFamilyDescriptor> column_families;
+ if (delete_cf_paths) {
+ for (size_t i = 0; i < handles_.size(); ++i) {
+ ColumnFamilyDescriptor cfdescriptor;
+ // GetDescriptor is not implemented for ROCKSDB_LITE
+ handles_[i]->GetDescriptor(&cfdescriptor).PermitUncheckedError();
+ column_families.push_back(cfdescriptor);
+ }
+ }
+ Close();
+ ASSERT_OK(DestroyDB(dbname_, options, column_families));
+}
+
+Status DBTestBase::ReadOnlyReopen(const Options& options) {
+ MaybeInstallTimeElapseOnlySleep(options);
+ return DB::OpenForReadOnly(options, dbname_, &db_);
+}
+
+Status DBTestBase::TryReopen(const Options& options) {
+ Close();
+ last_options_.table_factory.reset();
+  // Note: operator= is an unsafe approach here since it destroys the
+  // std::shared_ptr members in the same order as their creation, in contrast
+  // to destructors, which destroy them in the opposite order of creation. One
+  // particular problem is that the cache destructor might invoke callback
+  // functions that use Option members such as statistics. To work around this
+  // problem, we manually reset table_factory, which eventually clears the
+  // block cache.
+ last_options_ = options;
+ MaybeInstallTimeElapseOnlySleep(options);
+ return DB::Open(options, dbname_, &db_);
+}
+
+bool DBTestBase::IsDirectIOSupported() {
+ return test::IsDirectIOSupported(env_, dbname_);
+}
+
+bool DBTestBase::IsMemoryMappedAccessSupported() const {
+ return (!encrypted_env_);
+}
+
+Status DBTestBase::Flush(int cf) {
+ if (cf == 0) {
+ return db_->Flush(FlushOptions());
+ } else {
+ return db_->Flush(FlushOptions(), handles_[cf]);
+ }
+}
+
+Status DBTestBase::Flush(const std::vector<int>& cf_ids) {
+ std::vector<ColumnFamilyHandle*> cfhs;
+ std::for_each(cf_ids.begin(), cf_ids.end(),
+ [&cfhs, this](int id) { cfhs.emplace_back(handles_[id]); });
+ return db_->Flush(FlushOptions(), cfhs);
+}
+
+Status DBTestBase::Put(const Slice& k, const Slice& v, WriteOptions wo) {
+ if (kMergePut == option_config_) {
+ return db_->Merge(wo, k, v);
+ } else {
+ return db_->Put(wo, k, v);
+ }
+}
+
+Status DBTestBase::Put(int cf, const Slice& k, const Slice& v,
+ WriteOptions wo) {
+ if (kMergePut == option_config_) {
+ return db_->Merge(wo, handles_[cf], k, v);
+ } else {
+ return db_->Put(wo, handles_[cf], k, v);
+ }
+}
+
+Status DBTestBase::Merge(const Slice& k, const Slice& v, WriteOptions wo) {
+ return db_->Merge(wo, k, v);
+}
+
+Status DBTestBase::Merge(int cf, const Slice& k, const Slice& v,
+ WriteOptions wo) {
+ return db_->Merge(wo, handles_[cf], k, v);
+}
+
+Status DBTestBase::Delete(const std::string& k) {
+ return db_->Delete(WriteOptions(), k);
+}
+
+Status DBTestBase::Delete(int cf, const std::string& k) {
+ return db_->Delete(WriteOptions(), handles_[cf], k);
+}
+
+Status DBTestBase::SingleDelete(const std::string& k) {
+ return db_->SingleDelete(WriteOptions(), k);
+}
+
+Status DBTestBase::SingleDelete(int cf, const std::string& k) {
+ return db_->SingleDelete(WriteOptions(), handles_[cf], k);
+}
+
+std::string DBTestBase::Get(const std::string& k, const Snapshot* snapshot) {
+ ReadOptions options;
+ options.verify_checksums = true;
+ options.snapshot = snapshot;
+ std::string result;
+ Status s = db_->Get(options, k, &result);
+ if (s.IsNotFound()) {
+ result = "NOT_FOUND";
+ } else if (!s.ok()) {
+ result = s.ToString();
+ }
+ return result;
+}
+
+std::string DBTestBase::Get(int cf, const std::string& k,
+ const Snapshot* snapshot) {
+ ReadOptions options;
+ options.verify_checksums = true;
+ options.snapshot = snapshot;
+ std::string result;
+ Status s = db_->Get(options, handles_[cf], k, &result);
+ if (s.IsNotFound()) {
+ result = "NOT_FOUND";
+ } else if (!s.ok()) {
+ result = s.ToString();
+ }
+ return result;
+}
+
+std::vector<std::string> DBTestBase::MultiGet(std::vector<int> cfs,
+ const std::vector<std::string>& k,
+ const Snapshot* snapshot,
+ const bool batched,
+ const bool async) {
+ ReadOptions options;
+ options.verify_checksums = true;
+ options.snapshot = snapshot;
+ options.async_io = async;
+ std::vector<ColumnFamilyHandle*> handles;
+ std::vector<Slice> keys;
+ std::vector<std::string> result;
+
+ for (unsigned int i = 0; i < cfs.size(); ++i) {
+ handles.push_back(handles_[cfs[i]]);
+ keys.push_back(k[i]);
+ }
+ std::vector<Status> s;
+ if (!batched) {
+ s = db_->MultiGet(options, handles, keys, &result);
+ for (size_t i = 0; i < s.size(); ++i) {
+ if (s[i].IsNotFound()) {
+ result[i] = "NOT_FOUND";
+ } else if (!s[i].ok()) {
+ result[i] = s[i].ToString();
+ }
+ }
+ } else {
+ std::vector<PinnableSlice> pin_values(cfs.size());
+ result.resize(cfs.size());
+ s.resize(cfs.size());
+ db_->MultiGet(options, cfs.size(), handles.data(), keys.data(),
+ pin_values.data(), s.data());
+ for (size_t i = 0; i < s.size(); ++i) {
+ if (s[i].IsNotFound()) {
+ result[i] = "NOT_FOUND";
+ } else if (!s[i].ok()) {
+ result[i] = s[i].ToString();
+ } else {
+ result[i].assign(pin_values[i].data(), pin_values[i].size());
+ // Increase likelihood of detecting potential use-after-free bugs with
+ // PinnableSlices tracking the same resource
+ pin_values[i].Reset();
+ }
+ }
+ }
+ return result;
+}
+
+std::vector<std::string> DBTestBase::MultiGet(const std::vector<std::string>& k,
+ const Snapshot* snapshot,
+ const bool async) {
+ ReadOptions options;
+ options.verify_checksums = true;
+ options.snapshot = snapshot;
+ options.async_io = async;
+ std::vector<Slice> keys;
+ std::vector<std::string> result(k.size());
+ std::vector<Status> statuses(k.size());
+ std::vector<PinnableSlice> pin_values(k.size());
+
+ for (size_t i = 0; i < k.size(); ++i) {
+ keys.push_back(k[i]);
+ }
+ db_->MultiGet(options, dbfull()->DefaultColumnFamily(), keys.size(),
+ keys.data(), pin_values.data(), statuses.data());
+ for (size_t i = 0; i < statuses.size(); ++i) {
+ if (statuses[i].IsNotFound()) {
+ result[i] = "NOT_FOUND";
+ } else if (!statuses[i].ok()) {
+ result[i] = statuses[i].ToString();
+ } else {
+ result[i].assign(pin_values[i].data(), pin_values[i].size());
+ // Increase likelihood of detecting potential use-after-free bugs with
+ // PinnableSlices tracking the same resource
+ pin_values[i].Reset();
+ }
+ }
+ return result;
+}
+
+Status DBTestBase::Get(const std::string& k, PinnableSlice* v) {
+ ReadOptions options;
+ options.verify_checksums = true;
+ Status s = dbfull()->Get(options, dbfull()->DefaultColumnFamily(), k, v);
+ return s;
+}
+
+uint64_t DBTestBase::GetNumSnapshots() {
+ uint64_t int_num;
+ EXPECT_TRUE(dbfull()->GetIntProperty("rocksdb.num-snapshots", &int_num));
+ return int_num;
+}
+
+uint64_t DBTestBase::GetTimeOldestSnapshots() {
+ uint64_t int_num;
+ EXPECT_TRUE(
+ dbfull()->GetIntProperty("rocksdb.oldest-snapshot-time", &int_num));
+ return int_num;
+}
+
+uint64_t DBTestBase::GetSequenceOldestSnapshots() {
+ uint64_t int_num;
+ EXPECT_TRUE(
+ dbfull()->GetIntProperty("rocksdb.oldest-snapshot-sequence", &int_num));
+ return int_num;
+}
+
+// Return a string that contains all key,value pairs in order,
+// formatted like "(k1->v1)(k2->v2)".
+std::string DBTestBase::Contents(int cf) {
+ std::vector<std::string> forward;
+ std::string result;
+ Iterator* iter = (cf == 0) ? db_->NewIterator(ReadOptions())
+ : db_->NewIterator(ReadOptions(), handles_[cf]);
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ std::string s = IterStatus(iter);
+ result.push_back('(');
+ result.append(s);
+ result.push_back(')');
+ forward.push_back(s);
+ }
+
+ // Check reverse iteration results are the reverse of forward results
+ unsigned int matched = 0;
+ for (iter->SeekToLast(); iter->Valid(); iter->Prev()) {
+ EXPECT_LT(matched, forward.size());
+ EXPECT_EQ(IterStatus(iter), forward[forward.size() - matched - 1]);
+ matched++;
+ }
+ EXPECT_EQ(matched, forward.size());
+
+ delete iter;
+ return result;
+}
+
+void DBTestBase::CheckAllEntriesWithFifoReopen(
+ const std::string& expected_value, const Slice& user_key, int cf,
+ const std::vector<std::string>& cfs, const Options& options) {
+ ASSERT_EQ(AllEntriesFor(user_key, cf), expected_value);
+
+ std::vector<std::string> cfs_plus_default = cfs;
+ cfs_plus_default.insert(cfs_plus_default.begin(), kDefaultColumnFamilyName);
+
+ Options fifo_options(options);
+ fifo_options.compaction_style = kCompactionStyleFIFO;
+ fifo_options.max_open_files = -1;
+ fifo_options.disable_auto_compactions = true;
+ ASSERT_OK(TryReopenWithColumnFamilies(cfs_plus_default, fifo_options));
+ ASSERT_EQ(AllEntriesFor(user_key, cf), expected_value);
+
+ ASSERT_OK(TryReopenWithColumnFamilies(cfs_plus_default, options));
+ ASSERT_EQ(AllEntriesFor(user_key, cf), expected_value);
+}
+
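+// Return a string listing every entry for user_key (values, merge operands,
+// DEL/SDEL markers), formatted like "[ v1, DEL, v2 ]".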
+std::string DBTestBase::AllEntriesFor(const Slice& user_key, int cf) {
+ Arena arena;
+ auto options = CurrentOptions();
+ InternalKeyComparator icmp(options.comparator);
+ ReadOptions read_options;
+ ScopedArenaIterator iter;
+ if (cf == 0) {
+ iter.set(dbfull()->NewInternalIterator(read_options, &arena,
+ kMaxSequenceNumber));
+ } else {
+ iter.set(dbfull()->NewInternalIterator(read_options, &arena,
+ kMaxSequenceNumber, handles_[cf]));
+ }
+ InternalKey target(user_key, kMaxSequenceNumber, kTypeValue);
+ iter->Seek(target.Encode());
+ std::string result;
+ if (!iter->status().ok()) {
+ result = iter->status().ToString();
+ } else {
+ result = "[ ";
+ bool first = true;
+ while (iter->Valid()) {
+ ParsedInternalKey ikey(Slice(), 0, kTypeValue);
+ if (ParseInternalKey(iter->key(), &ikey, true /* log_err_key */) !=
+ Status::OK()) {
+ result += "CORRUPTED";
+ } else {
+ if (!last_options_.comparator->Equal(ikey.user_key, user_key)) {
+ break;
+ }
+ if (!first) {
+ result += ", ";
+ }
+ first = false;
+ switch (ikey.type) {
+ case kTypeValue:
+ result += iter->value().ToString();
+ break;
+ case kTypeMerge:
+ // keep it the same as kTypeValue for testing kMergePut
+ result += iter->value().ToString();
+ break;
+ case kTypeDeletion:
+ result += "DEL";
+ break;
+ case kTypeSingleDeletion:
+ result += "SDEL";
+ break;
+ default:
+ assert(false);
+ break;
+ }
+ }
+ iter->Next();
+ }
+ if (!first) {
+ result += " ";
+ }
+ result += "]";
+ }
+ return result;
+}
+
+#ifndef ROCKSDB_LITE
+int DBTestBase::NumSortedRuns(int cf) {
+ ColumnFamilyMetaData cf_meta;
+ if (cf == 0) {
+ db_->GetColumnFamilyMetaData(&cf_meta);
+ } else {
+ db_->GetColumnFamilyMetaData(handles_[cf], &cf_meta);
+ }
+ int num_sr = static_cast<int>(cf_meta.levels[0].files.size());
+ for (size_t i = 1U; i < cf_meta.levels.size(); i++) {
+ if (cf_meta.levels[i].files.size() > 0) {
+ num_sr++;
+ }
+ }
+ return num_sr;
+}
+
+uint64_t DBTestBase::TotalSize(int cf) {
+ ColumnFamilyMetaData cf_meta;
+ if (cf == 0) {
+ db_->GetColumnFamilyMetaData(&cf_meta);
+ } else {
+ db_->GetColumnFamilyMetaData(handles_[cf], &cf_meta);
+ }
+ return cf_meta.size;
+}
+
+uint64_t DBTestBase::SizeAtLevel(int level) {
+ std::vector<LiveFileMetaData> metadata;
+ db_->GetLiveFilesMetaData(&metadata);
+ uint64_t sum = 0;
+ for (const auto& m : metadata) {
+ if (m.level == level) {
+ sum += m.size;
+ }
+ }
+ return sum;
+}
+
+size_t DBTestBase::TotalLiveFiles(int cf) {
+ ColumnFamilyMetaData cf_meta;
+ if (cf == 0) {
+ db_->GetColumnFamilyMetaData(&cf_meta);
+ } else {
+ db_->GetColumnFamilyMetaData(handles_[cf], &cf_meta);
+ }
+ size_t num_files = 0;
+ for (auto& level : cf_meta.levels) {
+ num_files += level.files.size();
+ }
+ return num_files;
+}
+
+size_t DBTestBase::CountLiveFiles() {
+ std::vector<LiveFileMetaData> metadata;
+ db_->GetLiveFilesMetaData(&metadata);
+ return metadata.size();
+}
+
+int DBTestBase::NumTableFilesAtLevel(int level, int cf) {
+ std::string property;
+ if (cf == 0) {
+ // default cfd
+ EXPECT_TRUE(db_->GetProperty(
+ "rocksdb.num-files-at-level" + std::to_string(level), &property));
+ } else {
+ EXPECT_TRUE(db_->GetProperty(
+ handles_[cf], "rocksdb.num-files-at-level" + std::to_string(level),
+ &property));
+ }
+ return atoi(property.c_str());
+}
+
+double DBTestBase::CompressionRatioAtLevel(int level, int cf) {
+ std::string property;
+ if (cf == 0) {
+ // default cfd
+ EXPECT_TRUE(db_->GetProperty(
+ "rocksdb.compression-ratio-at-level" + std::to_string(level),
+ &property));
+ } else {
+ EXPECT_TRUE(db_->GetProperty(
+ handles_[cf],
+ "rocksdb.compression-ratio-at-level" + std::to_string(level),
+ &property));
+ }
+ return std::stod(property);
+}
+
+int DBTestBase::TotalTableFiles(int cf, int levels) {
+ if (levels == -1) {
+ levels = (cf == 0) ? db_->NumberLevels() : db_->NumberLevels(handles_[1]);
+ }
+ int result = 0;
+ for (int level = 0; level < levels; level++) {
+ result += NumTableFilesAtLevel(level, cf);
+ }
+ return result;
+}
+
+// Return spread of files per level
+std::string DBTestBase::FilesPerLevel(int cf) {
+ int num_levels =
+ (cf == 0) ? db_->NumberLevels() : db_->NumberLevels(handles_[1]);
+ std::string result;
+ size_t last_non_zero_offset = 0;
+ for (int level = 0; level < num_levels; level++) {
+ int f = NumTableFilesAtLevel(level, cf);
+ char buf[100];
+ snprintf(buf, sizeof(buf), "%s%d", (level ? "," : ""), f);
+ result += buf;
+ if (f > 0) {
+ last_non_zero_offset = result.size();
+ }
+ }
+ result.resize(last_non_zero_offset);
+ return result;
+}
+
+#endif // !ROCKSDB_LITE
+
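+// Return the file numbers of all live blob files in the default column
+// family's current version.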
+std::vector<uint64_t> DBTestBase::GetBlobFileNumbers() {
+ VersionSet* const versions = dbfull()->GetVersionSet();
+ assert(versions);
+
+ ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault();
+ assert(cfd);
+
+ Version* const current = cfd->current();
+ assert(current);
+
+ const VersionStorageInfo* const storage_info = current->storage_info();
+ assert(storage_info);
+
+ const auto& blob_files = storage_info->GetBlobFiles();
+
+ std::vector<uint64_t> result;
+ result.reserve(blob_files.size());
+
+ for (const auto& blob_file : blob_files) {
+ assert(blob_file);
+ result.emplace_back(blob_file->GetBlobFileNumber());
+ }
+
+ return result;
+}
+
+size_t DBTestBase::CountFiles() {
+ size_t count = 0;
+ std::vector<std::string> files;
+ if (env_->GetChildren(dbname_, &files).ok()) {
+ count += files.size();
+ }
+
+ if (dbname_ != last_options_.wal_dir) {
+ if (env_->GetChildren(last_options_.wal_dir, &files).ok()) {
+ count += files.size();
+ }
+ }
+
+ return count;
+}
+
+Status DBTestBase::CountFiles(size_t* count) {
+ std::vector<std::string> files;
+ Status s = env_->GetChildren(dbname_, &files);
+ if (!s.ok()) {
+ return s;
+ }
+ size_t files_count = files.size();
+
+ if (dbname_ != last_options_.wal_dir) {
+ s = env_->GetChildren(last_options_.wal_dir, &files);
+ if (!s.ok()) {
+ return s;
+ }
+ *count = files_count + files.size();
+  } else {
+    *count = files_count;
+  }
+
+ return Status::OK();
+}
+
+Status DBTestBase::Size(const Slice& start, const Slice& limit, int cf,
+ uint64_t* size) {
+ Range r(start, limit);
+ if (cf == 0) {
+ return db_->GetApproximateSizes(&r, 1, size);
+ } else {
+ return db_->GetApproximateSizes(handles_[1], &r, 1, size);
+ }
+}
+
+void DBTestBase::Compact(int cf, const Slice& start, const Slice& limit,
+ uint32_t target_path_id) {
+ CompactRangeOptions compact_options;
+ compact_options.target_path_id = target_path_id;
+ ASSERT_OK(db_->CompactRange(compact_options, handles_[cf], &start, &limit));
+}
+
+void DBTestBase::Compact(int cf, const Slice& start, const Slice& limit) {
+ ASSERT_OK(
+ db_->CompactRange(CompactRangeOptions(), handles_[cf], &start, &limit));
+}
+
+void DBTestBase::Compact(const Slice& start, const Slice& limit) {
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &start, &limit));
+}
+
+// Do n memtable compactions, each of which produces an sstable
+// covering the range [small,large].
+void DBTestBase::MakeTables(int n, const std::string& small,
+ const std::string& large, int cf) {
+ for (int i = 0; i < n; i++) {
+ ASSERT_OK(Put(cf, small, "begin"));
+ ASSERT_OK(Put(cf, large, "end"));
+ ASSERT_OK(Flush(cf));
+ MoveFilesToLevel(n - i - 1, cf);
+ }
+}
+
+// Prevent pushing of new sstables into deeper levels by adding
+// tables that cover a specified range to all levels.
+void DBTestBase::FillLevels(const std::string& smallest,
+ const std::string& largest, int cf) {
+ MakeTables(db_->NumberLevels(handles_[cf]), smallest, largest, cf);
+}
+
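+// Compact each level from 0 up to `level` - 1 so that existing files end up
+// at `level`.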
+void DBTestBase::MoveFilesToLevel(int level, int cf) {
+ for (int l = 0; l < level; ++l) {
+ if (cf > 0) {
+ EXPECT_OK(dbfull()->TEST_CompactRange(l, nullptr, nullptr, handles_[cf]));
+ } else {
+ EXPECT_OK(dbfull()->TEST_CompactRange(l, nullptr, nullptr));
+ }
+ }
+}
+
+#ifndef ROCKSDB_LITE
+void DBTestBase::DumpFileCounts(const char* label) {
+ fprintf(stderr, "---\n%s:\n", label);
+ fprintf(stderr, "maxoverlap: %" PRIu64 "\n",
+ dbfull()->TEST_MaxNextLevelOverlappingBytes());
+ for (int level = 0; level < db_->NumberLevels(); level++) {
+ int num = NumTableFilesAtLevel(level);
+ if (num > 0) {
+ fprintf(stderr, " level %3d : %d files\n", level, num);
+ }
+ }
+}
+#endif // !ROCKSDB_LITE
+
+std::string DBTestBase::DumpSSTableList() {
+ std::string property;
+ db_->GetProperty("rocksdb.sstables", &property);
+ return property;
+}
+
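+// List the children of `path`, keeping only SST (table) file names.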
+void DBTestBase::GetSstFiles(Env* env, std::string path,
+ std::vector<std::string>* files) {
+ EXPECT_OK(env->GetChildren(path, files));
+
+ files->erase(std::remove_if(files->begin(), files->end(),
+ [](std::string name) {
+ uint64_t number;
+ FileType type;
+ return !(ParseFileName(name, &number, &type) &&
+ type == kTableFile);
+ }),
+ files->end());
+}
+
+int DBTestBase::GetSstFileCount(std::string path) {
+ std::vector<std::string> files;
+ DBTestBase::GetSstFiles(env_, path, &files);
+ return static_cast<int>(files.size());
+}
+
+// this will generate non-overlapping files since it keeps increasing key_idx
+void DBTestBase::GenerateNewFile(int cf, Random* rnd, int* key_idx,
+ bool nowait) {
+ for (int i = 0; i < KNumKeysByGenerateNewFile; i++) {
+ ASSERT_OK(Put(cf, Key(*key_idx), rnd->RandomString((i == 99) ? 1 : 990)));
+ (*key_idx)++;
+ }
+ if (!nowait) {
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+}
+
+// this will generate non-overlapping files since it keeps increasing key_idx
+void DBTestBase::GenerateNewFile(Random* rnd, int* key_idx, bool nowait) {
+ for (int i = 0; i < KNumKeysByGenerateNewFile; i++) {
+ ASSERT_OK(Put(Key(*key_idx), rnd->RandomString((i == 99) ? 1 : 990)));
+ (*key_idx)++;
+ }
+ if (!nowait) {
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+}
+
+const int DBTestBase::kNumKeysByGenerateNewRandomFile = 51;
+
+void DBTestBase::GenerateNewRandomFile(Random* rnd, bool nowait) {
+ for (int i = 0; i < kNumKeysByGenerateNewRandomFile; i++) {
+ ASSERT_OK(Put("key" + rnd->RandomString(7), rnd->RandomString(2000)));
+ }
+ ASSERT_OK(Put("key" + rnd->RandomString(7), rnd->RandomString(200)));
+ if (!nowait) {
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+}
+
+std::string DBTestBase::IterStatus(Iterator* iter) {
+ std::string result;
+ if (iter->Valid()) {
+ result = iter->key().ToString() + "->" + iter->value().ToString();
+ } else {
+ result = "(invalid)";
+ }
+ return result;
+}
+
+Options DBTestBase::OptionsForLogIterTest() {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.WAL_ttl_seconds = 1000;
+ return options;
+}
+
+std::string DBTestBase::DummyString(size_t len, char c) {
+ return std::string(len, c);
+}
+
+void DBTestBase::VerifyIterLast(std::string expected_key, int cf) {
+ Iterator* iter;
+ ReadOptions ro;
+ if (cf == 0) {
+ iter = db_->NewIterator(ro);
+ } else {
+ iter = db_->NewIterator(ro, handles_[cf]);
+ }
+ iter->SeekToLast();
+ ASSERT_EQ(IterStatus(iter), expected_key);
+ delete iter;
+}
+
+// Used to test InplaceUpdate
+
+// If there is no previous value, sets newValue to a string of 'c's of
+// delta's length and returns UPDATED.
+// Otherwise, shrinks the previous value in place to one byte fewer,
+// filled with 'b's, and returns UPDATED_INPLACE.
+UpdateStatus DBTestBase::updateInPlaceSmallerSize(char* prevValue,
+ uint32_t* prevSize,
+ Slice delta,
+ std::string* newValue) {
+ if (prevValue == nullptr) {
+ *newValue = std::string(delta.size(), 'c');
+ return UpdateStatus::UPDATED;
+ } else {
+ *prevSize = *prevSize - 1;
+ std::string str_b = std::string(*prevSize, 'b');
+ memcpy(prevValue, str_b.c_str(), str_b.size());
+ return UpdateStatus::UPDATED_INPLACE;
+ }
+}
+
+UpdateStatus DBTestBase::updateInPlaceSmallerVarintSize(char* prevValue,
+ uint32_t* prevSize,
+ Slice delta,
+ std::string* newValue) {
+ if (prevValue == nullptr) {
+ *newValue = std::string(delta.size(), 'c');
+ return UpdateStatus::UPDATED;
+ } else {
+ *prevSize = 1;
+ std::string str_b = std::string(*prevSize, 'b');
+ memcpy(prevValue, str_b.c_str(), str_b.size());
+ return UpdateStatus::UPDATED_INPLACE;
+ }
+}
+
+UpdateStatus DBTestBase::updateInPlaceLargerSize(char* /*prevValue*/,
+ uint32_t* /*prevSize*/,
+ Slice delta,
+ std::string* newValue) {
+ *newValue = std::string(delta.size(), 'c');
+ return UpdateStatus::UPDATED;
+}
+
+UpdateStatus DBTestBase::updateInPlaceNoAction(char* /*prevValue*/,
+ uint32_t* /*prevSize*/,
+ Slice /*delta*/,
+ std::string* /*newValue*/) {
+ return UpdateStatus::UPDATE_FAILED;
+}
+
+// Utility method to test InplaceUpdate
+void DBTestBase::validateNumberOfEntries(int numValues, int cf) {
+ Arena arena;
+ auto options = CurrentOptions();
+ InternalKeyComparator icmp(options.comparator);
+ ReadOptions read_options;
+ ScopedArenaIterator iter;
+ if (cf != 0) {
+ iter.set(dbfull()->NewInternalIterator(read_options, &arena,
+ kMaxSequenceNumber, handles_[cf]));
+ } else {
+ iter.set(dbfull()->NewInternalIterator(read_options, &arena,
+ kMaxSequenceNumber));
+ }
+ iter->SeekToFirst();
+ ASSERT_OK(iter->status());
+ int seq = numValues;
+ while (iter->Valid()) {
+ ParsedInternalKey ikey;
+ ikey.clear();
+ ASSERT_OK(ParseInternalKey(iter->key(), &ikey, true /* log_err_key */));
+
+ // checks sequence number for updates
+ ASSERT_EQ(ikey.sequence, (unsigned)seq--);
+ iter->Next();
+ }
+ ASSERT_EQ(0, seq);
+}
+
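+// Copy `size` bytes of `source` to `destination`; a size of 0 copies the
+// whole file.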
+void DBTestBase::CopyFile(const std::string& source,
+ const std::string& destination, uint64_t size) {
+ const EnvOptions soptions;
+ std::unique_ptr<SequentialFile> srcfile;
+ ASSERT_OK(env_->NewSequentialFile(source, &srcfile, soptions));
+ std::unique_ptr<WritableFile> destfile;
+ ASSERT_OK(env_->NewWritableFile(destination, &destfile, soptions));
+
+ if (size == 0) {
+ // default argument means copy everything
+ ASSERT_OK(env_->GetFileSize(source, &size));
+ }
+
+ char buffer[4096];
+ Slice slice;
+ while (size > 0) {
+ uint64_t one = std::min(uint64_t(sizeof(buffer)), size);
+ ASSERT_OK(srcfile->Read(one, &slice, buffer));
+ ASSERT_OK(destfile->Append(slice));
+ size -= slice.size();
+ }
+ ASSERT_OK(destfile->Close());
+}
+
+Status DBTestBase::GetAllDataFiles(
+ const FileType file_type, std::unordered_map<std::string, uint64_t>* files,
+ uint64_t* total_size /* = nullptr */) {
+ if (total_size) {
+ *total_size = 0;
+ }
+ std::vector<std::string> children;
+ Status s = env_->GetChildren(dbname_, &children);
+ if (s.ok()) {
+ for (auto& file_name : children) {
+ uint64_t number;
+ FileType type;
+ if (ParseFileName(file_name, &number, &type) && type == file_type) {
+ std::string file_path = dbname_ + "/" + file_name;
+ uint64_t file_size = 0;
+ s = env_->GetFileSize(file_path, &file_size);
+ if (!s.ok()) {
+ break;
+ }
+ (*files)[file_path] = file_size;
+ if (total_size) {
+ *total_size += file_size;
+ }
+ }
+ }
+ }
+ return s;
+}
+
+std::vector<std::uint64_t> DBTestBase::ListTableFiles(Env* env,
+ const std::string& path) {
+ std::vector<std::string> files;
+ std::vector<uint64_t> file_numbers;
+ EXPECT_OK(env->GetChildren(path, &files));
+ uint64_t number;
+ FileType type;
+ for (size_t i = 0; i < files.size(); ++i) {
+ if (ParseFileName(files[i], &number, &type)) {
+ if (type == kTableFile) {
+ file_numbers.push_back(number);
+ }
+ }
+ }
+ return file_numbers;
+}
+
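+// Verify the DB against `true_data` via point lookups, forward and reverse
+// iteration, and Seek(); optionally also with a tailing iterator. Expected
+// per-key statuses can be passed in via `status`.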
+void DBTestBase::VerifyDBFromMap(std::map<std::string, std::string> true_data,
+ size_t* total_reads_res, bool tailing_iter,
+ std::map<std::string, Status> status) {
+ size_t total_reads = 0;
+
+ for (auto& kv : true_data) {
+ Status s = status[kv.first];
+ if (s.ok()) {
+ ASSERT_EQ(Get(kv.first), kv.second);
+ } else {
+ std::string value;
+ ASSERT_EQ(s, db_->Get(ReadOptions(), kv.first, &value));
+ }
+ total_reads++;
+ }
+
+ // Normal Iterator
+ {
+ int iter_cnt = 0;
+ ReadOptions ro;
+ ro.total_order_seek = true;
+ Iterator* iter = db_->NewIterator(ro);
+ // Verify Iterator::Next()
+ iter_cnt = 0;
+ auto data_iter = true_data.begin();
+ Status s;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next(), data_iter++) {
+ ASSERT_EQ(iter->key().ToString(), data_iter->first);
+ Status current_status = status[data_iter->first];
+ if (!current_status.ok()) {
+ s = current_status;
+ }
+ ASSERT_EQ(iter->status(), s);
+ if (current_status.ok()) {
+ ASSERT_EQ(iter->value().ToString(), data_iter->second);
+ }
+ iter_cnt++;
+ total_reads++;
+ }
+ ASSERT_EQ(data_iter, true_data.end())
+ << iter_cnt << " / " << true_data.size();
+ delete iter;
+
+ // Verify Iterator::Prev()
+ // Use a new iterator to make sure its status is clean.
+ iter = db_->NewIterator(ro);
+ iter_cnt = 0;
+ s = Status::OK();
+ auto data_rev = true_data.rbegin();
+ for (iter->SeekToLast(); iter->Valid(); iter->Prev(), data_rev++) {
+ ASSERT_EQ(iter->key().ToString(), data_rev->first);
+ Status current_status = status[data_rev->first];
+ if (!current_status.ok()) {
+ s = current_status;
+ }
+ ASSERT_EQ(iter->status(), s);
+ if (current_status.ok()) {
+ ASSERT_EQ(iter->value().ToString(), data_rev->second);
+ }
+ iter_cnt++;
+ total_reads++;
+ }
+ ASSERT_EQ(data_rev, true_data.rend())
+ << iter_cnt << " / " << true_data.size();
+
+ // Verify Iterator::Seek()
+ for (auto kv : true_data) {
+ iter->Seek(kv.first);
+ ASSERT_EQ(kv.first, iter->key().ToString());
+ ASSERT_EQ(kv.second, iter->value().ToString());
+ total_reads++;
+ }
+ delete iter;
+ }
+
+ if (tailing_iter) {
+#ifndef ROCKSDB_LITE
+ // Tailing iterator
+ int iter_cnt = 0;
+ ReadOptions ro;
+ ro.tailing = true;
+ ro.total_order_seek = true;
+ Iterator* iter = db_->NewIterator(ro);
+
+ // Verify ForwardIterator::Next()
+ iter_cnt = 0;
+ auto data_iter = true_data.begin();
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next(), data_iter++) {
+ ASSERT_EQ(iter->key().ToString(), data_iter->first);
+ ASSERT_EQ(iter->value().ToString(), data_iter->second);
+ iter_cnt++;
+ total_reads++;
+ }
+ ASSERT_EQ(data_iter, true_data.end())
+ << iter_cnt << " / " << true_data.size();
+
+ // Verify ForwardIterator::Seek()
+ for (auto kv : true_data) {
+ iter->Seek(kv.first);
+ ASSERT_EQ(kv.first, iter->key().ToString());
+ ASSERT_EQ(kv.second, iter->value().ToString());
+ total_reads++;
+ }
+
+ delete iter;
+#endif // ROCKSDB_LITE
+ }
+
+ if (total_reads_res) {
+ *total_reads_res = total_reads;
+ }
+}
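
VerifyDBFromMap() cross-checks the DB against an expected key/value map three ways: point lookups, forward/backward/seek scans with a total-order iterator, and optionally a tailing iterator, honoring a per-key expected Status if one is supplied. A minimal sketch of the usual call pattern; the fixture name is an assumption.

class VerifyFromMapSketchTest : public DBTestBase {
 public:
  VerifyFromMapSketchTest()
      : DBTestBase("/verify_from_map_sketch", /*env_do_fsync=*/false) {}
};

TEST_F(VerifyFromMapSketchTest, RoundTrip) {
  Options options = CurrentOptions();
  DestroyAndReopen(options);

  std::map<std::string, std::string> true_data;
  Random rnd(301);
  for (int i = 0; i < 100; ++i) {
    std::string key = Key(i);
    std::string value = rnd.RandomString(64);
    ASSERT_OK(Put(key, value));
    true_data[key] = value;
  }
  ASSERT_OK(Flush());

  size_t total_reads = 0;
  // Point gets plus iterator scans; pass tailing_iter=true to also exercise
  // the ForwardIterator path (non-LITE builds only).
  VerifyDBFromMap(true_data, &total_reads, /*tailing_iter=*/false);
  ASSERT_GT(total_reads, true_data.size());
}
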
+
+void DBTestBase::VerifyDBInternal(
+ std::vector<std::pair<std::string, std::string>> true_data) {
+ Arena arena;
+ InternalKeyComparator icmp(last_options_.comparator);
+ ReadOptions read_options;
+ auto iter =
+ dbfull()->NewInternalIterator(read_options, &arena, kMaxSequenceNumber);
+ iter->SeekToFirst();
+ for (auto p : true_data) {
+ ASSERT_TRUE(iter->Valid());
+ ParsedInternalKey ikey;
+ ASSERT_OK(ParseInternalKey(iter->key(), &ikey, true /* log_err_key */));
+ ASSERT_EQ(p.first, ikey.user_key);
+ ASSERT_EQ(p.second, iter->value());
+ iter->Next();
+  }
+ ASSERT_FALSE(iter->Valid());
+ iter->~InternalIterator();
+}
+
+#ifndef ROCKSDB_LITE
+
+uint64_t DBTestBase::GetNumberOfSstFilesForColumnFamily(
+ DB* db, std::string column_family_name) {
+ std::vector<LiveFileMetaData> metadata;
+ db->GetLiveFilesMetaData(&metadata);
+ uint64_t result = 0;
+ for (auto& fileMetadata : metadata) {
+ result += (fileMetadata.column_family_name == column_family_name);
+ }
+ return result;
+}
+
+uint64_t DBTestBase::GetSstSizeHelper(Temperature temperature) {
+ std::string prop;
+ EXPECT_TRUE(dbfull()->GetProperty(
+ DB::Properties::kLiveSstFilesSizeAtTemperature +
+ std::to_string(static_cast<uint8_t>(temperature)),
+ &prop));
+ return static_cast<uint64_t>(std::atoi(prop.c_str()));
+}
+#endif // ROCKSDB_LITE
+
+void VerifySstUniqueIds(const TablePropertiesCollection& props) {
+ ASSERT_FALSE(props.empty()); // suspicious test if empty
+ std::unordered_set<std::string> seen;
+ for (auto& pair : props) {
+ std::string id;
+ ASSERT_OK(GetUniqueIdFromTableProperties(*pair.second, &id));
+ ASSERT_TRUE(seen.insert(id).second);
+ }
+}
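
VerifySstUniqueIds() asserts that GetUniqueIdFromTableProperties() produces a distinct id for every table in a TablePropertiesCollection. A minimal sketch of feeding it from DB::GetPropertiesOfAllTables() after a few flushes; the fixture name is an assumption.

class UniqueIdSketchTest : public DBTestBase {
 public:
  UniqueIdSketchTest()
      : DBTestBase("/unique_id_sketch", /*env_do_fsync=*/false) {}
};

TEST_F(UniqueIdSketchTest, AllSstsHaveUniqueIds) {
  Options options = CurrentOptions();
  DestroyAndReopen(options);
  for (int i = 0; i < 3; ++i) {
    ASSERT_OK(Put(Key(i), "value"));
    ASSERT_OK(Flush());  // one SST per iteration
  }
  TablePropertiesCollection props;
  ASSERT_OK(db_->GetPropertiesOfAllTables(&props));
  VerifySstUniqueIds(props);
}
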
+
+template <CacheEntryRole R>
+TargetCacheChargeTrackingCache<R>::TargetCacheChargeTrackingCache(
+ std::shared_ptr<Cache> target)
+ : CacheWrapper(std::move(target)),
+ cur_cache_charge_(0),
+ cache_charge_peak_(0),
+ cache_charge_increment_(0),
+ last_peak_tracked_(false),
+ cache_charge_increments_sum_(0) {}
+
+template <CacheEntryRole R>
+Status TargetCacheChargeTrackingCache<R>::Insert(
+ const Slice& key, void* value, size_t charge,
+ void (*deleter)(const Slice& key, void* value), Handle** handle,
+ Priority priority) {
+ Status s = target_->Insert(key, value, charge, deleter, handle, priority);
+ if (deleter == kNoopDeleter) {
+ if (last_peak_tracked_) {
+ cache_charge_peak_ = 0;
+ cache_charge_increment_ = 0;
+ last_peak_tracked_ = false;
+ }
+ if (s.ok()) {
+ cur_cache_charge_ += charge;
+ }
+ cache_charge_peak_ = std::max(cache_charge_peak_, cur_cache_charge_);
+ cache_charge_increment_ += charge;
+ }
+
+ return s;
+}
+
+template <CacheEntryRole R>
+bool TargetCacheChargeTrackingCache<R>::Release(Handle* handle,
+ bool erase_if_last_ref) {
+ auto deleter = GetDeleter(handle);
+ if (deleter == kNoopDeleter) {
+ if (!last_peak_tracked_) {
+ cache_charge_peaks_.push_back(cache_charge_peak_);
+ cache_charge_increments_sum_ += cache_charge_increment_;
+ last_peak_tracked_ = true;
+ }
+ cur_cache_charge_ -= GetCharge(handle);
+ }
+ bool is_successful = target_->Release(handle, erase_if_last_ref);
+ return is_successful;
+}
+
+template <CacheEntryRole R>
+const Cache::DeleterFn TargetCacheChargeTrackingCache<R>::kNoopDeleter =
+ CacheReservationManagerImpl<R>::TEST_GetNoopDeleterForRole();
+
+template class TargetCacheChargeTrackingCache<
+ CacheEntryRole::kFilterConstruction>;
+template class TargetCacheChargeTrackingCache<
+ CacheEntryRole::kBlockBasedTableReader>;
+template class TargetCacheChargeTrackingCache<CacheEntryRole::kFileMetadata>;
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/db_test_util.h b/src/rocksdb/db/db_test_util.h
new file mode 100644
index 000000000..29d5cd9d7
--- /dev/null
+++ b/src/rocksdb/db/db_test_util.h
@@ -0,0 +1,1402 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <fcntl.h>
+
+#include <algorithm>
+#include <cinttypes>
+#include <map>
+#include <memory>
+#include <set>
+#include <string>
+#include <thread>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+#include "db/db_impl/db_impl.h"
+#include "file/filename.h"
+#include "rocksdb/advanced_options.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/io_status.h"
+#include "rocksdb/options.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/sst_file_writer.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/table.h"
+#include "rocksdb/utilities/checkpoint.h"
+#include "table/mock_table.h"
+#include "table/scoped_arena_iterator.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "util/cast_util.h"
+#include "util/compression.h"
+#include "util/mutexlock.h"
+#include "util/string_util.h"
+#include "utilities/merge_operators.h"
+
+namespace ROCKSDB_NAMESPACE {
+class MockEnv;
+
+namespace anon {
+class AtomicCounter {
+ public:
+ explicit AtomicCounter(Env* env = NULL)
+ : env_(env), cond_count_(&mu_), count_(0) {}
+
+ void Increment() {
+ MutexLock l(&mu_);
+ count_++;
+ cond_count_.SignalAll();
+ }
+
+ int Read() {
+ MutexLock l(&mu_);
+ return count_;
+ }
+
+ bool WaitFor(int count) {
+ MutexLock l(&mu_);
+
+ uint64_t start = env_->NowMicros();
+ while (count_ < count) {
+ uint64_t now = env_->NowMicros();
+ cond_count_.TimedWait(now + /*1s*/ 1 * 1000 * 1000);
+ if (env_->NowMicros() - start > /*10s*/ 10 * 1000 * 1000) {
+ return false;
+ }
+ if (count_ < count) {
+ GTEST_LOG_(WARNING) << "WaitFor is taking more time than usual";
+ }
+ }
+
+ return true;
+ }
+
+ void Reset() {
+ MutexLock l(&mu_);
+ count_ = 0;
+ cond_count_.SignalAll();
+ }
+
+ private:
+ Env* env_;
+ port::Mutex mu_;
+ port::CondVar cond_count_;
+ int count_;
+};
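
anon::AtomicCounter is a small condition-variable-backed counter: Increment() wakes waiters, Read() takes a snapshot, and WaitFor() blocks for up to roughly ten seconds of Env time. A minimal sketch of counting an instrumented code path via a sync point callback; "MyComponent::DidWork" is a made-up marker, not a real RocksDB sync point.

// Sketch only, inside a test body (debug builds, where SyncPoint is active).
anon::AtomicCounter work_counter(Env::Default());
SyncPoint::GetInstance()->SetCallBack(
    "MyComponent::DidWork", [&](void* /*arg*/) { work_counter.Increment(); });
SyncPoint::GetInstance()->EnableProcessing();

// ... kick off the background work under test ...

ASSERT_TRUE(work_counter.WaitFor(1));  // blocks for up to ~10s of Env time
ASSERT_GE(work_counter.Read(), 1);
SyncPoint::GetInstance()->DisableProcessing();
SyncPoint::GetInstance()->ClearAllCallBacks();
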
+
+struct OptionsOverride {
+ std::shared_ptr<const FilterPolicy> filter_policy = nullptr;
+ // These will be used only if filter_policy is set
+ bool partition_filters = false;
+  // Force using a default block cache. (Setting to false allows ASAN builds
+  // to use a trivially small block cache for better UAF error detection.)
+ bool full_block_cache = false;
+ uint64_t metadata_block_size = 1024;
+
+  // Bit mask of SkipPolicy values selecting which XF test points to skip
+ int skip_policy = 0;
+};
+
+} // namespace anon
+
+enum SkipPolicy { kSkipNone = 0, kSkipNoSnapshot = 1, kSkipNoPrefix = 2 };
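
anon::OptionsOverride tweaks the table and filter setup that CurrentOptions() would otherwise produce, and skip_policy carries a bit mask of the SkipPolicy values above. A minimal sketch of requesting partitioned Bloom filters from inside a DBTestBase-derived test body; the 10-bits-per-key and 4 KB metadata block size are illustrative choices.

// Sketch only, inside a DBTestBase-derived test body.
anon::OptionsOverride options_override;
options_override.filter_policy.reset(NewBloomFilterPolicy(10));
options_override.partition_filters = true;       // only honored with a filter
options_override.metadata_block_size = 4096;     // filter partition size
options_override.skip_policy = kSkipNoSnapshot;  // XF test points to skip
Options options = CurrentOptions(options_override);
DestroyAndReopen(options);
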
+
+// Special Env used to delay background operations
+class SpecialEnv : public EnvWrapper {
+ public:
+ explicit SpecialEnv(Env* base, bool time_elapse_only_sleep = false);
+
+ static const char* kClassName() { return "SpecialEnv"; }
+ const char* Name() const override { return kClassName(); }
+
+ Status NewWritableFile(const std::string& f, std::unique_ptr<WritableFile>* r,
+ const EnvOptions& soptions) override {
+ class SSTableFile : public WritableFile {
+ private:
+ SpecialEnv* env_;
+ std::unique_ptr<WritableFile> base_;
+
+ public:
+ SSTableFile(SpecialEnv* env, std::unique_ptr<WritableFile>&& base)
+ : env_(env), base_(std::move(base)) {}
+ Status Append(const Slice& data) override {
+ if (env_->table_write_callback_) {
+ (*env_->table_write_callback_)();
+ }
+ if (env_->drop_writes_.load(std::memory_order_acquire)) {
+ // Drop writes on the floor
+ return Status::OK();
+ } else if (env_->no_space_.load(std::memory_order_acquire)) {
+ return Status::NoSpace("No space left on device");
+ } else {
+ env_->bytes_written_ += data.size();
+ return base_->Append(data);
+ }
+ }
+ Status Append(
+ const Slice& data,
+ const DataVerificationInfo& /* verification_info */) override {
+ return Append(data);
+ }
+ Status PositionedAppend(const Slice& data, uint64_t offset) override {
+ if (env_->table_write_callback_) {
+ (*env_->table_write_callback_)();
+ }
+ if (env_->drop_writes_.load(std::memory_order_acquire)) {
+ // Drop writes on the floor
+ return Status::OK();
+ } else if (env_->no_space_.load(std::memory_order_acquire)) {
+ return Status::NoSpace("No space left on device");
+ } else {
+ env_->bytes_written_ += data.size();
+ return base_->PositionedAppend(data, offset);
+ }
+ }
+ Status PositionedAppend(
+ const Slice& data, uint64_t offset,
+ const DataVerificationInfo& /* verification_info */) override {
+ return PositionedAppend(data, offset);
+ }
+ Status Truncate(uint64_t size) override { return base_->Truncate(size); }
+ Status RangeSync(uint64_t offset, uint64_t nbytes) override {
+ Status s = base_->RangeSync(offset, nbytes);
+#if !(defined NDEBUG) || !defined(OS_WIN)
+ TEST_SYNC_POINT_CALLBACK("SpecialEnv::SStableFile::RangeSync", &s);
+#endif // !(defined NDEBUG) || !defined(OS_WIN)
+ return s;
+ }
+ Status Close() override {
+// SyncPoint is not supported in Released Windows Mode.
+#if !(defined NDEBUG) || !defined(OS_WIN)
+ // Check preallocation size
+ // preallocation size is never passed to base file.
+ size_t preallocation_size = preallocation_block_size();
+ TEST_SYNC_POINT_CALLBACK("DBTestWritableFile.GetPreallocationStatus",
+ &preallocation_size);
+#endif // !(defined NDEBUG) || !defined(OS_WIN)
+ Status s = base_->Close();
+#if !(defined NDEBUG) || !defined(OS_WIN)
+ TEST_SYNC_POINT_CALLBACK("SpecialEnv::SStableFile::Close", &s);
+#endif // !(defined NDEBUG) || !defined(OS_WIN)
+ return s;
+ }
+ Status Flush() override { return base_->Flush(); }
+ Status Sync() override {
+ ++env_->sync_counter_;
+ while (env_->delay_sstable_sync_.load(std::memory_order_acquire)) {
+ env_->SleepForMicroseconds(100000);
+ }
+ Status s;
+ if (!env_->skip_fsync_) {
+ s = base_->Sync();
+ }
+#if !(defined NDEBUG) || !defined(OS_WIN)
+ TEST_SYNC_POINT_CALLBACK("SpecialEnv::SStableFile::Sync", &s);
+#endif // !(defined NDEBUG) || !defined(OS_WIN)
+ return s;
+ }
+ void SetIOPriority(Env::IOPriority pri) override {
+ base_->SetIOPriority(pri);
+ }
+ Env::IOPriority GetIOPriority() override {
+ return base_->GetIOPriority();
+ }
+ bool use_direct_io() const override { return base_->use_direct_io(); }
+ Status Allocate(uint64_t offset, uint64_t len) override {
+ return base_->Allocate(offset, len);
+ }
+ size_t GetUniqueId(char* id, size_t max_size) const override {
+ return base_->GetUniqueId(id, max_size);
+ }
+ };
+ class ManifestFile : public WritableFile {
+ public:
+ ManifestFile(SpecialEnv* env, std::unique_ptr<WritableFile>&& b)
+ : env_(env), base_(std::move(b)) {}
+ Status Append(const Slice& data) override {
+ if (env_->manifest_write_error_.load(std::memory_order_acquire)) {
+ return Status::IOError("simulated writer error");
+ } else {
+ return base_->Append(data);
+ }
+ }
+ Status Append(
+ const Slice& data,
+ const DataVerificationInfo& /*verification_info*/) override {
+ return Append(data);
+ }
+
+ Status Truncate(uint64_t size) override { return base_->Truncate(size); }
+ Status Close() override { return base_->Close(); }
+ Status Flush() override { return base_->Flush(); }
+ Status Sync() override {
+ ++env_->sync_counter_;
+ if (env_->manifest_sync_error_.load(std::memory_order_acquire)) {
+ return Status::IOError("simulated sync error");
+ } else {
+ if (env_->skip_fsync_) {
+ return Status::OK();
+ } else {
+ return base_->Sync();
+ }
+ }
+ }
+ uint64_t GetFileSize() override { return base_->GetFileSize(); }
+ Status Allocate(uint64_t offset, uint64_t len) override {
+ return base_->Allocate(offset, len);
+ }
+
+ private:
+ SpecialEnv* env_;
+ std::unique_ptr<WritableFile> base_;
+ };
+ class WalFile : public WritableFile {
+ public:
+ WalFile(SpecialEnv* env, std::unique_ptr<WritableFile>&& b)
+ : env_(env), base_(std::move(b)) {
+ env_->num_open_wal_file_.fetch_add(1);
+ }
+ virtual ~WalFile() { env_->num_open_wal_file_.fetch_add(-1); }
+ Status Append(const Slice& data) override {
+#if !(defined NDEBUG) || !defined(OS_WIN)
+ TEST_SYNC_POINT("SpecialEnv::WalFile::Append:1");
+#endif
+ Status s;
+ if (env_->log_write_error_.load(std::memory_order_acquire)) {
+ s = Status::IOError("simulated writer error");
+ } else {
+ int slowdown =
+ env_->log_write_slowdown_.load(std::memory_order_acquire);
+ if (slowdown > 0) {
+ env_->SleepForMicroseconds(slowdown);
+ }
+ s = base_->Append(data);
+ }
+#if !(defined NDEBUG) || !defined(OS_WIN)
+ TEST_SYNC_POINT("SpecialEnv::WalFile::Append:2");
+#endif
+ return s;
+ }
+ Status Append(
+ const Slice& data,
+ const DataVerificationInfo& /* verification_info */) override {
+ return Append(data);
+ }
+ Status Truncate(uint64_t size) override { return base_->Truncate(size); }
+ void PrepareWrite(size_t offset, size_t len) override {
+ base_->PrepareWrite(offset, len);
+ }
+ void SetPreallocationBlockSize(size_t size) override {
+ base_->SetPreallocationBlockSize(size);
+ }
+ Status Close() override {
+// SyncPoint is not supported in Released Windows Mode.
+#if !(defined NDEBUG) || !defined(OS_WIN)
+ // Check preallocation size
+ size_t block_size, last_allocated_block;
+ base_->GetPreallocationStatus(&block_size, &last_allocated_block);
+ TEST_SYNC_POINT_CALLBACK("DBTestWalFile.GetPreallocationStatus",
+ &block_size);
+#endif // !(defined NDEBUG) || !defined(OS_WIN)
+
+ return base_->Close();
+ }
+ Status Flush() override { return base_->Flush(); }
+ Status Sync() override {
+ ++env_->sync_counter_;
+ if (env_->corrupt_in_sync_) {
+ EXPECT_OK(Append(std::string(33000, ' ')));
+ return Status::IOError("Ingested Sync Failure");
+ }
+ if (env_->skip_fsync_) {
+ return Status::OK();
+ } else {
+ return base_->Sync();
+ }
+ }
+ bool IsSyncThreadSafe() const override {
+ return env_->is_wal_sync_thread_safe_.load();
+ }
+ Status Allocate(uint64_t offset, uint64_t len) override {
+ return base_->Allocate(offset, len);
+ }
+
+ private:
+ SpecialEnv* env_;
+ std::unique_ptr<WritableFile> base_;
+ };
+ class OtherFile : public WritableFile {
+ public:
+ OtherFile(SpecialEnv* env, std::unique_ptr<WritableFile>&& b)
+ : env_(env), base_(std::move(b)) {}
+ Status Append(const Slice& data) override { return base_->Append(data); }
+ Status Append(
+ const Slice& data,
+ const DataVerificationInfo& /*verification_info*/) override {
+ return Append(data);
+ }
+ Status Truncate(uint64_t size) override { return base_->Truncate(size); }
+ Status Close() override { return base_->Close(); }
+ Status Flush() override { return base_->Flush(); }
+ Status Sync() override {
+ if (env_->skip_fsync_) {
+ return Status::OK();
+ } else {
+ return base_->Sync();
+ }
+ }
+ uint64_t GetFileSize() override { return base_->GetFileSize(); }
+ Status Allocate(uint64_t offset, uint64_t len) override {
+ return base_->Allocate(offset, len);
+ }
+
+ private:
+ SpecialEnv* env_;
+ std::unique_ptr<WritableFile> base_;
+ };
+
+ if (no_file_overwrite_.load(std::memory_order_acquire) &&
+ target()->FileExists(f).ok()) {
+ return Status::NotSupported("SpecialEnv::no_file_overwrite_ is true.");
+ }
+
+ if (non_writeable_rate_.load(std::memory_order_acquire) > 0) {
+ uint32_t random_number;
+ {
+ MutexLock l(&rnd_mutex_);
+ random_number = rnd_.Uniform(100);
+ }
+ if (random_number < non_writeable_rate_.load()) {
+ return Status::IOError("simulated random write error");
+ }
+ }
+
+ new_writable_count_++;
+
+ if (non_writable_count_.load() > 0) {
+ non_writable_count_--;
+ return Status::IOError("simulated write error");
+ }
+
+ EnvOptions optimized = soptions;
+ if (strstr(f.c_str(), "MANIFEST") != nullptr ||
+ strstr(f.c_str(), "log") != nullptr) {
+ optimized.use_mmap_writes = false;
+ optimized.use_direct_writes = false;
+ }
+
+ Status s = target()->NewWritableFile(f, r, optimized);
+ if (s.ok()) {
+ if (strstr(f.c_str(), ".sst") != nullptr) {
+ r->reset(new SSTableFile(this, std::move(*r)));
+ } else if (strstr(f.c_str(), "MANIFEST") != nullptr) {
+ r->reset(new ManifestFile(this, std::move(*r)));
+ } else if (strstr(f.c_str(), "log") != nullptr) {
+ r->reset(new WalFile(this, std::move(*r)));
+ } else {
+ r->reset(new OtherFile(this, std::move(*r)));
+ }
+ }
+ return s;
+ }
+
+ Status NewRandomAccessFile(const std::string& f,
+ std::unique_ptr<RandomAccessFile>* r,
+ const EnvOptions& soptions) override {
+ class CountingFile : public RandomAccessFile {
+ public:
+ CountingFile(std::unique_ptr<RandomAccessFile>&& target,
+ anon::AtomicCounter* counter,
+ std::atomic<size_t>* bytes_read)
+ : target_(std::move(target)),
+ counter_(counter),
+ bytes_read_(bytes_read) {}
+ virtual Status Read(uint64_t offset, size_t n, Slice* result,
+ char* scratch) const override {
+ counter_->Increment();
+ Status s = target_->Read(offset, n, result, scratch);
+ *bytes_read_ += result->size();
+ return s;
+ }
+
+ virtual Status Prefetch(uint64_t offset, size_t n) override {
+ Status s = target_->Prefetch(offset, n);
+ *bytes_read_ += n;
+ return s;
+ }
+
+ private:
+ std::unique_ptr<RandomAccessFile> target_;
+ anon::AtomicCounter* counter_;
+ std::atomic<size_t>* bytes_read_;
+ };
+
+ class RandomFailureFile : public RandomAccessFile {
+ public:
+ RandomFailureFile(std::unique_ptr<RandomAccessFile>&& target,
+ std::atomic<uint64_t>* failure_cnt, uint32_t fail_odd)
+ : target_(std::move(target)),
+ fail_cnt_(failure_cnt),
+ fail_odd_(fail_odd) {}
+ virtual Status Read(uint64_t offset, size_t n, Slice* result,
+ char* scratch) const override {
+ if (Random::GetTLSInstance()->OneIn(fail_odd_)) {
+ fail_cnt_->fetch_add(1);
+ return Status::IOError("random error");
+ }
+ return target_->Read(offset, n, result, scratch);
+ }
+
+ virtual Status Prefetch(uint64_t offset, size_t n) override {
+ return target_->Prefetch(offset, n);
+ }
+
+ private:
+ std::unique_ptr<RandomAccessFile> target_;
+ std::atomic<uint64_t>* fail_cnt_;
+ uint32_t fail_odd_;
+ };
+
+ Status s = target()->NewRandomAccessFile(f, r, soptions);
+ random_file_open_counter_++;
+ if (s.ok()) {
+ if (count_random_reads_) {
+ r->reset(new CountingFile(std::move(*r), &random_read_counter_,
+ &random_read_bytes_counter_));
+ } else if (rand_reads_fail_odd_ > 0) {
+ r->reset(new RandomFailureFile(std::move(*r), &num_reads_fails_,
+ rand_reads_fail_odd_));
+ }
+ }
+
+ if (s.ok() && soptions.compaction_readahead_size > 0) {
+ compaction_readahead_size_ = soptions.compaction_readahead_size;
+ }
+ return s;
+ }
+
+ virtual Status NewSequentialFile(const std::string& f,
+ std::unique_ptr<SequentialFile>* r,
+ const EnvOptions& soptions) override {
+ class CountingFile : public SequentialFile {
+ public:
+ CountingFile(std::unique_ptr<SequentialFile>&& target,
+ anon::AtomicCounter* counter)
+ : target_(std::move(target)), counter_(counter) {}
+ virtual Status Read(size_t n, Slice* result, char* scratch) override {
+ counter_->Increment();
+ return target_->Read(n, result, scratch);
+ }
+ virtual Status Skip(uint64_t n) override { return target_->Skip(n); }
+
+ private:
+ std::unique_ptr<SequentialFile> target_;
+ anon::AtomicCounter* counter_;
+ };
+
+ Status s = target()->NewSequentialFile(f, r, soptions);
+ if (s.ok() && count_sequential_reads_) {
+ r->reset(new CountingFile(std::move(*r), &sequential_read_counter_));
+ }
+ return s;
+ }
+
+ virtual void SleepForMicroseconds(int micros) override {
+ sleep_counter_.Increment();
+ if (no_slowdown_ || time_elapse_only_sleep_) {
+ addon_microseconds_.fetch_add(micros);
+ }
+ if (!no_slowdown_) {
+ target()->SleepForMicroseconds(micros);
+ }
+ }
+
+ void MockSleepForMicroseconds(int64_t micros) {
+ sleep_counter_.Increment();
+ assert(no_slowdown_);
+ addon_microseconds_.fetch_add(micros);
+ }
+
+ void MockSleepForSeconds(int64_t seconds) {
+ sleep_counter_.Increment();
+ assert(no_slowdown_);
+ addon_microseconds_.fetch_add(seconds * 1000000);
+ }
+
+ virtual Status GetCurrentTime(int64_t* unix_time) override {
+ Status s;
+ if (time_elapse_only_sleep_) {
+ *unix_time = maybe_starting_time_;
+ } else {
+ s = target()->GetCurrentTime(unix_time);
+ }
+ if (s.ok()) {
+      // add the mocked elapsed microseconds, converted to seconds
+ *unix_time += addon_microseconds_.load() / 1000000;
+ }
+ return s;
+ }
+
+ virtual uint64_t NowCPUNanos() override {
+ now_cpu_count_.fetch_add(1);
+ return target()->NowCPUNanos();
+ }
+
+ virtual uint64_t NowNanos() override {
+ return (time_elapse_only_sleep_ ? 0 : target()->NowNanos()) +
+ addon_microseconds_.load() * 1000;
+ }
+
+ virtual uint64_t NowMicros() override {
+ return (time_elapse_only_sleep_ ? 0 : target()->NowMicros()) +
+ addon_microseconds_.load();
+ }
+
+ virtual Status DeleteFile(const std::string& fname) override {
+ delete_count_.fetch_add(1);
+ return target()->DeleteFile(fname);
+ }
+
+ void SetMockSleep(bool enabled = true) { no_slowdown_ = enabled; }
+
+ Status NewDirectory(const std::string& name,
+ std::unique_ptr<Directory>* result) override {
+ if (!skip_fsync_) {
+ return target()->NewDirectory(name, result);
+ } else {
+ class NoopDirectory : public Directory {
+ public:
+ NoopDirectory() {}
+ ~NoopDirectory() {}
+
+ Status Fsync() override { return Status::OK(); }
+ Status Close() override { return Status::OK(); }
+ };
+
+ result->reset(new NoopDirectory());
+ return Status::OK();
+ }
+ }
+
+ Status RenameFile(const std::string& src, const std::string& dest) override {
+ rename_count_.fetch_add(1);
+ if (rename_error_.load(std::memory_order_acquire)) {
+ return Status::NotSupported("Simulated `RenameFile()` error.");
+ }
+ return target()->RenameFile(src, dest);
+ }
+
+ // Something to return when mocking current time
+ const int64_t maybe_starting_time_;
+
+ Random rnd_;
+  port::Mutex rnd_mutex_;  // Lock to protect rnd_
+
+  // sstable Sync() calls are blocked while this flag is true.
+  std::atomic<bool> delay_sstable_sync_;
+
+  // Drop writes on the floor while this flag is true.
+  std::atomic<bool> drop_writes_;
+
+  // Simulate no-space errors while this flag is true.
+  std::atomic<bool> no_space_;
+
+  // Simulate a non-writable file system while this flag is true.
+  std::atomic<bool> non_writable_;
+
+  // Force sync of manifest files to fail while this flag is true.
+  std::atomic<bool> manifest_sync_error_;
+
+  // Force writes to manifest files to fail while this flag is true.
+  std::atomic<bool> manifest_write_error_;
+
+  // Force writes to log files to fail while this flag is true.
+  std::atomic<bool> log_write_error_;
+
+  // Force `RenameFile()` to fail while this flag is true.
+  std::atomic<bool> rename_error_{false};
+
+  // Slow down every log write by this many microseconds.
+  std::atomic<int> log_write_slowdown_;
+
+ // If true, returns Status::NotSupported for file overwrite.
+ std::atomic<bool> no_file_overwrite_;
+
+ // Number of WAL files that are still open for write.
+ std::atomic<int> num_open_wal_file_;
+
+ bool count_random_reads_;
+ uint32_t rand_reads_fail_odd_ = 0;
+ std::atomic<uint64_t> num_reads_fails_;
+ anon::AtomicCounter random_read_counter_;
+ std::atomic<size_t> random_read_bytes_counter_;
+ std::atomic<int> random_file_open_counter_;
+
+ bool count_sequential_reads_;
+ anon::AtomicCounter sequential_read_counter_;
+
+ anon::AtomicCounter sleep_counter_;
+
+ std::atomic<int64_t> bytes_written_;
+
+ std::atomic<int> sync_counter_;
+
+  // If true, all fsyncs to files and directories are skipped.
+  bool skip_fsync_ = false;
+
+  // If true, inject corruption into the file during Sync().
+  bool corrupt_in_sync_ = false;
+
+ std::atomic<uint32_t> non_writeable_rate_;
+
+ std::atomic<uint32_t> new_writable_count_;
+
+ std::atomic<uint32_t> non_writable_count_;
+
+ std::function<void()>* table_write_callback_;
+
+ std::atomic<int> now_cpu_count_;
+
+ std::atomic<int> delete_count_;
+
+ std::atomic<int> rename_count_{0};
+
+ std::atomic<bool> is_wal_sync_thread_safe_{true};
+
+ std::atomic<size_t> compaction_readahead_size_{};
+
+ private: // accessing these directly is prone to error
+ friend class DBTestBase;
+
+ std::atomic<int64_t> addon_microseconds_{0};
+
+  // Do not modify while this Env is in use by a running DB (could cause
+  // deadlock)
+ std::atomic<bool> time_elapse_only_sleep_;
+
+ bool no_slowdown_;
+};
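
SpecialEnv wraps every file the DB opens so tests can inject faults, count I/O, and fake the clock; the atomics above are the knobs, and DBTestBase exposes the instance as env_ and wires it into CurrentOptions(). A minimal sketch of counting the RandomAccessFile reads behind a point lookup; exact counts depend on format and cache options, so only a lower bound is asserted.

// Sketch only, inside a DBTestBase-derived test body.
Options options = CurrentOptions();
DestroyAndReopen(options);
ASSERT_OK(Put("key", "value"));
ASSERT_OK(Flush());  // push the value into an SST so Get() must hit a file

env_->count_random_reads_ = true;
env_->random_read_counter_.Reset();
Reopen(options);  // reopen so table files are wrapped by the counting reader
ASSERT_EQ("value", Get("key"));
ASSERT_GE(env_->random_read_counter_.Read(), 1);  // at least one file read
env_->count_random_reads_ = false;
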
+
+#ifndef ROCKSDB_LITE
+class FileTemperatureTestFS : public FileSystemWrapper {
+ public:
+ explicit FileTemperatureTestFS(const std::shared_ptr<FileSystem>& fs)
+ : FileSystemWrapper(fs) {}
+
+ static const char* kClassName() { return "FileTemperatureTestFS"; }
+ const char* Name() const override { return kClassName(); }
+
+ IOStatus NewSequentialFile(const std::string& fname, const FileOptions& opts,
+ std::unique_ptr<FSSequentialFile>* result,
+ IODebugContext* dbg) override {
+ IOStatus s = target()->NewSequentialFile(fname, opts, result, dbg);
+ uint64_t number;
+ FileType type;
+ if (ParseFileName(GetFileName(fname), &number, &type) &&
+ type == kTableFile) {
+ MutexLock lock(&mu_);
+ requested_sst_file_temperatures_.emplace_back(number, opts.temperature);
+ if (s.ok()) {
+ if (opts.temperature != Temperature::kUnknown) {
+ // Be extra picky and don't open if a wrong non-unknown temperature is
+ // provided
+ auto e = current_sst_file_temperatures_.find(number);
+ if (e != current_sst_file_temperatures_.end() &&
+ e->second != opts.temperature) {
+ result->reset();
+ return IOStatus::PathNotFound("Temperature mismatch on " + fname);
+ }
+ }
+ *result = WrapWithTemperature<FSSequentialFileOwnerWrapper>(
+ number, std::move(*result));
+ }
+ }
+ return s;
+ }
+
+ IOStatus NewRandomAccessFile(const std::string& fname,
+ const FileOptions& opts,
+ std::unique_ptr<FSRandomAccessFile>* result,
+ IODebugContext* dbg) override {
+ IOStatus s = target()->NewRandomAccessFile(fname, opts, result, dbg);
+ uint64_t number;
+ FileType type;
+ if (ParseFileName(GetFileName(fname), &number, &type) &&
+ type == kTableFile) {
+ MutexLock lock(&mu_);
+ requested_sst_file_temperatures_.emplace_back(number, opts.temperature);
+ if (s.ok()) {
+ if (opts.temperature != Temperature::kUnknown) {
+ // Be extra picky and don't open if a wrong non-unknown temperature is
+ // provided
+ auto e = current_sst_file_temperatures_.find(number);
+ if (e != current_sst_file_temperatures_.end() &&
+ e->second != opts.temperature) {
+ result->reset();
+ return IOStatus::PathNotFound("Temperature mismatch on " + fname);
+ }
+ }
+ *result = WrapWithTemperature<FSRandomAccessFileOwnerWrapper>(
+ number, std::move(*result));
+ }
+ }
+ return s;
+ }
+
+ void PopRequestedSstFileTemperatures(
+ std::vector<std::pair<uint64_t, Temperature>>* out = nullptr) {
+ MutexLock lock(&mu_);
+ if (out) {
+ *out = std::move(requested_sst_file_temperatures_);
+ assert(requested_sst_file_temperatures_.empty());
+ } else {
+ requested_sst_file_temperatures_.clear();
+ }
+ }
+
+ IOStatus NewWritableFile(const std::string& fname, const FileOptions& opts,
+ std::unique_ptr<FSWritableFile>* result,
+ IODebugContext* dbg) override {
+ uint64_t number;
+ FileType type;
+ if (ParseFileName(GetFileName(fname), &number, &type) &&
+ type == kTableFile) {
+ MutexLock lock(&mu_);
+ current_sst_file_temperatures_[number] = opts.temperature;
+ }
+ return target()->NewWritableFile(fname, opts, result, dbg);
+ }
+
+ void CopyCurrentSstFileTemperatures(std::map<uint64_t, Temperature>* out) {
+ MutexLock lock(&mu_);
+ *out = current_sst_file_temperatures_;
+ }
+
+ void OverrideSstFileTemperature(uint64_t number, Temperature temp) {
+ MutexLock lock(&mu_);
+ current_sst_file_temperatures_[number] = temp;
+ }
+
+ protected:
+ port::Mutex mu_;
+ std::vector<std::pair<uint64_t, Temperature>>
+ requested_sst_file_temperatures_;
+ std::map<uint64_t, Temperature> current_sst_file_temperatures_;
+
+ std::string GetFileName(const std::string& fname) {
+ auto filename = fname.substr(fname.find_last_of(kFilePathSeparator) + 1);
+    // Workaround for Windows, where the path may contain both the Windows
+    // FilePathSeparator and '/'.
+ filename = filename.substr(filename.find_last_of('/') + 1);
+ return filename;
+ }
+
+ template <class FileOwnerWrapperT, /*inferred*/ class FileT>
+ std::unique_ptr<FileT> WrapWithTemperature(uint64_t number,
+ std::unique_ptr<FileT>&& t) {
+ class FileWithTemp : public FileOwnerWrapperT {
+ public:
+ FileWithTemp(FileTemperatureTestFS* fs, uint64_t number,
+ std::unique_ptr<FileT>&& t)
+ : FileOwnerWrapperT(std::move(t)), fs_(fs), number_(number) {}
+
+ Temperature GetTemperature() const override {
+ MutexLock lock(&fs_->mu_);
+ return fs_->current_sst_file_temperatures_[number_];
+ }
+
+ private:
+ FileTemperatureTestFS* fs_;
+ uint64_t number_;
+ };
+ return std::make_unique<FileWithTemp>(this, number, std::move(t));
+ }
+};
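
FileTemperatureTestFS remembers the temperature each new SST was created with, lets a test override it, and records the temperature every read request asked for. A minimal sketch of plugging it in underneath a DBTestBase test; CompositeEnvWrapper is assumed here as the usual glue for turning a FileSystem wrapper back into an Env.

// Sketch only, inside a DBTestBase-derived test body (non-LITE builds).
auto test_fs =
    std::make_shared<FileTemperatureTestFS>(env_->GetFileSystem());
std::unique_ptr<Env> temp_env(new CompositeEnvWrapper(env_, test_fs));

Options options = CurrentOptions();
options.env = temp_env.get();
DestroyAndReopen(options);

ASSERT_OK(Put("key", "value"));
ASSERT_OK(Flush());

std::map<uint64_t, Temperature> temps;
test_fs->CopyCurrentSstFileTemperatures(&temps);
ASSERT_EQ(1u, temps.size());

// Pretend the file migrated to cold storage, then look at what readers
// requested when the table was reopened.
test_fs->OverrideSstFileTemperature(temps.begin()->first, Temperature::kCold);
Reopen(options);
ASSERT_EQ("value", Get("key"));

std::vector<std::pair<uint64_t, Temperature>> requested;
test_fs->PopRequestedSstFileTemperatures(&requested);
ASSERT_FALSE(requested.empty());

Close();  // close the DB before temp_env goes out of scope
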
+
+class OnFileDeletionListener : public EventListener {
+ public:
+ OnFileDeletionListener() : matched_count_(0), expected_file_name_("") {}
+ const char* Name() const override { return kClassName(); }
+ static const char* kClassName() { return "OnFileDeletionListener"; }
+
+ void SetExpectedFileName(const std::string file_name) {
+ expected_file_name_ = file_name;
+ }
+
+ void VerifyMatchedCount(size_t expected_value) {
+ ASSERT_EQ(matched_count_, expected_value);
+ }
+
+ void OnTableFileDeleted(const TableFileDeletionInfo& info) override {
+ if (expected_file_name_ != "") {
+ ASSERT_EQ(expected_file_name_, info.file_path);
+ expected_file_name_ = "";
+ matched_count_++;
+ }
+ }
+
+ private:
+ size_t matched_count_;
+ std::string expected_file_name_;
+};
+
+class FlushCounterListener : public EventListener {
+ public:
+ const char* Name() const override { return kClassName(); }
+ static const char* kClassName() { return "FlushCounterListener"; }
+ std::atomic<int> count{0};
+ std::atomic<FlushReason> expected_flush_reason{FlushReason::kOthers};
+
+ void OnFlushBegin(DB* /*db*/, const FlushJobInfo& flush_job_info) override {
+ count++;
+ ASSERT_EQ(expected_flush_reason.load(), flush_job_info.flush_reason);
+ }
+};
+#endif
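
FlushCounterListener counts flushes and asserts the FlushReason each one reports, while OnFileDeletionListener does the analogous check for table-file deletions. A minimal sketch of wiring the flush listener into a test; an explicit Flush() call is expected to report FlushReason::kManualFlush.

// Sketch only, inside a DBTestBase-derived test body (non-LITE builds).
auto listener = std::make_shared<FlushCounterListener>();
listener->expected_flush_reason = FlushReason::kManualFlush;

Options options = CurrentOptions();
options.listeners.push_back(listener);
DestroyAndReopen(options);

ASSERT_OK(Put("key", "value"));
ASSERT_OK(Flush());  // triggers OnFlushBegin with kManualFlush
ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
ASSERT_EQ(1, listener->count.load());
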
+
+// A test merge operator that mimics Put but fails if any merge operand (or
+// the existing value) is "corrupted".
+class TestPutOperator : public MergeOperator {
+ public:
+ virtual bool FullMergeV2(const MergeOperationInput& merge_in,
+ MergeOperationOutput* merge_out) const override {
+ if (merge_in.existing_value != nullptr &&
+ *(merge_in.existing_value) == "corrupted") {
+ return false;
+ }
+ for (auto value : merge_in.operand_list) {
+ if (value == "corrupted") {
+ return false;
+ }
+ }
+ merge_out->existing_operand = merge_in.operand_list.back();
+ return true;
+ }
+
+ virtual const char* Name() const override { return "TestPutOperator"; }
+};
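
TestPutOperator resolves a merge by keeping only the newest operand (so reads behave as if Put had been used), unless the base value or any operand is the literal string "corrupted", in which case the merge fails. A minimal sketch of the resulting read behavior; the failure is asserted only as a non-OK status.

// Sketch only, inside a DBTestBase-derived test body.
Options options = CurrentOptions();
options.merge_operator = std::make_shared<TestPutOperator>();
DestroyAndReopen(options);

ASSERT_OK(Merge("k", "v1"));
ASSERT_OK(Merge("k", "v2"));
ASSERT_EQ("v2", Get("k"));  // newest operand wins, as with Put

ASSERT_OK(Merge("bad", "corrupted"));
std::string value;
ASSERT_NOK(db_->Get(ReadOptions(), "bad", &value));  // merge failure surfaces
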
+
+// A wrapper around Cache that can easily be extended with instrumentation,
+// etc.
+class CacheWrapper : public Cache {
+ public:
+ explicit CacheWrapper(std::shared_ptr<Cache> target)
+ : target_(std::move(target)) {}
+
+ const char* Name() const override { return target_->Name(); }
+
+ using Cache::Insert;
+ Status Insert(const Slice& key, void* value, size_t charge,
+ void (*deleter)(const Slice& key, void* value),
+ Handle** handle = nullptr,
+ Priority priority = Priority::LOW) override {
+ return target_->Insert(key, value, charge, deleter, handle, priority);
+ }
+
+ using Cache::Lookup;
+ Handle* Lookup(const Slice& key, Statistics* stats = nullptr) override {
+ return target_->Lookup(key, stats);
+ }
+
+ bool Ref(Handle* handle) override { return target_->Ref(handle); }
+
+ using Cache::Release;
+ bool Release(Handle* handle, bool erase_if_last_ref = false) override {
+ return target_->Release(handle, erase_if_last_ref);
+ }
+
+ void* Value(Handle* handle) override { return target_->Value(handle); }
+
+ void Erase(const Slice& key) override { target_->Erase(key); }
+ uint64_t NewId() override { return target_->NewId(); }
+
+ void SetCapacity(size_t capacity) override { target_->SetCapacity(capacity); }
+
+ void SetStrictCapacityLimit(bool strict_capacity_limit) override {
+ target_->SetStrictCapacityLimit(strict_capacity_limit);
+ }
+
+ bool HasStrictCapacityLimit() const override {
+ return target_->HasStrictCapacityLimit();
+ }
+
+ size_t GetCapacity() const override { return target_->GetCapacity(); }
+
+ size_t GetUsage() const override { return target_->GetUsage(); }
+
+ size_t GetUsage(Handle* handle) const override {
+ return target_->GetUsage(handle);
+ }
+
+ size_t GetPinnedUsage() const override { return target_->GetPinnedUsage(); }
+
+ size_t GetCharge(Handle* handle) const override {
+ return target_->GetCharge(handle);
+ }
+
+ DeleterFn GetDeleter(Handle* handle) const override {
+ return target_->GetDeleter(handle);
+ }
+
+ void ApplyToAllCacheEntries(void (*callback)(void*, size_t),
+ bool thread_safe) override {
+ target_->ApplyToAllCacheEntries(callback, thread_safe);
+ }
+
+ void ApplyToAllEntries(
+ const std::function<void(const Slice& key, void* value, size_t charge,
+ DeleterFn deleter)>& callback,
+ const ApplyToAllEntriesOptions& opts) override {
+ target_->ApplyToAllEntries(callback, opts);
+ }
+
+ void EraseUnRefEntries() override { target_->EraseUnRefEntries(); }
+
+ protected:
+ std::shared_ptr<Cache> target_;
+};
+
+/*
+ * A cache wrapper that tracks certain CacheEntryRole's cache charge, its
+ * peaks and increments
+ *
+ * p0
+ * / \ p1
+ * / \ /\
+ * / \/ \
+ * a / b \
+ * peaks = {p0, p1}
+ * increments = {p0-a, p1-b}
+ */
+template <CacheEntryRole R>
+class TargetCacheChargeTrackingCache : public CacheWrapper {
+ public:
+ explicit TargetCacheChargeTrackingCache(std::shared_ptr<Cache> target);
+
+ using Cache::Insert;
+ Status Insert(const Slice& key, void* value, size_t charge,
+ void (*deleter)(const Slice& key, void* value),
+ Handle** handle = nullptr,
+ Priority priority = Priority::LOW) override;
+
+ using Cache::Release;
+ bool Release(Handle* handle, bool erase_if_last_ref = false) override;
+
+ std::size_t GetCacheCharge() { return cur_cache_charge_; }
+
+ std::deque<std::size_t> GetChargedCachePeaks() { return cache_charge_peaks_; }
+
+ std::size_t GetChargedCacheIncrementSum() {
+ return cache_charge_increments_sum_;
+ }
+
+ private:
+ static const Cache::DeleterFn kNoopDeleter;
+
+ std::size_t cur_cache_charge_;
+ std::size_t cache_charge_peak_;
+ std::size_t cache_charge_increment_;
+ bool last_peak_tracked_;
+ std::deque<std::size_t> cache_charge_peaks_;
+ std::size_t cache_charge_increments_sum_;
+};
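
TargetCacheChargeTrackingCache only counts insertions that carry the reservation manager's no-op deleter for role R, so it observes exactly the "cache charge" reservations and can report the peaks and increment sums sketched in the diagram above. A minimal sketch of wrapping a block cache with it; the options that make RocksDB actually charge kFilterConstruction against the block cache are elided and left as an assumption.

// Sketch only: wrap the block cache so a test can observe charge bookkeeping.
std::shared_ptr<Cache> base_cache = NewLRUCache(1 << 20 /* 1 MB */);
auto tracking_cache = std::make_shared<
    TargetCacheChargeTrackingCache<CacheEntryRole::kFilterConstruction>>(
    base_cache);

BlockBasedTableOptions table_options;
table_options.block_cache = tracking_cache;
// ... enable charging of filter construction in the table options (omitted),
// build the DB options from table_options, run the workload under test ...

std::size_t current = tracking_cache->GetCacheCharge();
std::deque<std::size_t> peaks = tracking_cache->GetChargedCachePeaks();
std::size_t increments = tracking_cache->GetChargedCacheIncrementSum();
(void)current;
(void)peaks;
(void)increments;
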
+
+class DBTestBase : public testing::Test {
+ public:
+ // Sequence of option configurations to try
+ enum OptionConfig : int {
+ kDefault = 0,
+ kBlockBasedTableWithPrefixHashIndex = 1,
+ kBlockBasedTableWithWholeKeyHashIndex = 2,
+ kPlainTableFirstBytePrefix = 3,
+ kPlainTableCappedPrefix = 4,
+ kPlainTableCappedPrefixNonMmap = 5,
+ kPlainTableAllBytesPrefix = 6,
+ kVectorRep = 7,
+ kHashLinkList = 8,
+ kMergePut = 9,
+ kFilter = 10,
+ kFullFilterWithNewTableReaderForCompactions = 11,
+ kUncompressed = 12,
+ kNumLevel_3 = 13,
+ kDBLogDir = 14,
+ kWalDirAndMmapReads = 15,
+ kManifestFileSize = 16,
+ kPerfOptions = 17,
+ kHashSkipList = 18,
+ kUniversalCompaction = 19,
+ kUniversalCompactionMultiLevel = 20,
+ kCompressedBlockCache = 21,
+ kInfiniteMaxOpenFiles = 22,
+ kCRC32cChecksum = 23,
+ kFIFOCompaction = 24,
+ kOptimizeFiltersForHits = 25,
+ kRowCache = 26,
+ kRecycleLogFiles = 27,
+ kConcurrentSkipList = 28,
+ kPipelinedWrite = 29,
+ kConcurrentWALWrites = 30,
+ kDirectIO,
+ kLevelSubcompactions,
+ kBlockBasedTableWithIndexRestartInterval,
+ kBlockBasedTableWithPartitionedIndex,
+ kBlockBasedTableWithPartitionedIndexFormat4,
+ kBlockBasedTableWithLatestFormat,
+ kPartitionedFilterWithNewTableReaderForCompactions,
+ kUniversalSubcompactions,
+ kUnorderedWrite,
+ // This must be the last line
+ kEnd,
+ };
+
+ public:
+ std::string dbname_;
+ std::string alternative_wal_dir_;
+ std::string alternative_db_log_dir_;
+ MockEnv* mem_env_;
+ Env* encrypted_env_;
+ SpecialEnv* env_;
+ std::shared_ptr<Env> env_guard_;
+ DB* db_;
+ std::vector<ColumnFamilyHandle*> handles_;
+
+ int option_config_;
+ Options last_options_;
+
+ // Skip some options, as they may not be applicable to a specific test.
+ // To add more skip constants, use values 4, 8, 16, etc.
+ enum OptionSkip {
+ kNoSkip = 0,
+ kSkipDeletesFilterFirst = 1,
+ kSkipUniversalCompaction = 2,
+ kSkipMergePut = 4,
+ kSkipPlainTable = 8,
+ kSkipHashIndex = 16,
+ kSkipNoSeekToLast = 32,
+ kSkipFIFOCompaction = 128,
+ kSkipMmapReads = 256,
+ };
+
+ const int kRangeDelSkipConfigs =
+ // Plain tables do not support range deletions.
+ kSkipPlainTable |
+ // MmapReads disables the iterator pinning that RangeDelAggregator
+ // requires.
+ kSkipMmapReads;
+
+ // `env_do_fsync` decides whether the special Env would do real
+ // fsync for files and directories. Skipping fsync can speed up
+ // tests, but won't cover the exact fsync logic.
+ DBTestBase(const std::string path, bool env_do_fsync);
+
+ ~DBTestBase();
+
+ static std::string Key(int i) {
+ char buf[100];
+ snprintf(buf, sizeof(buf), "key%06d", i);
+ return std::string(buf);
+ }
+
+ static bool ShouldSkipOptions(int option_config, int skip_mask = kNoSkip);
+
+ // Switch to a fresh database with the next option configuration to
+ // test. Return false if there are no more configurations to test.
+ bool ChangeOptions(int skip_mask = kNoSkip);
+
+ // Switch between different compaction styles.
+ bool ChangeCompactOptions();
+
+  // Switch between different WAL-related options.
+ bool ChangeWalOptions();
+
+  // Switch between different filter policies.
+  // Jump from kDefault to kFilter to kFullFilter.
+ bool ChangeFilterOptions();
+
+ // Switch between different DB options for file ingestion tests.
+ bool ChangeOptionsForFileIngestionTest();
+
+ // Return the current option configuration.
+ Options CurrentOptions(const anon::OptionsOverride& options_override =
+ anon::OptionsOverride()) const;
+
+ Options CurrentOptions(const Options& default_options,
+ const anon::OptionsOverride& options_override =
+ anon::OptionsOverride()) const;
+
+ Options GetDefaultOptions() const;
+
+ Options GetOptions(int option_config) const {
+ return GetOptions(option_config, GetDefaultOptions());
+ }
+
+ Options GetOptions(int option_config, const Options& default_options,
+ const anon::OptionsOverride& options_override =
+ anon::OptionsOverride()) const;
+
+ DBImpl* dbfull() { return static_cast_with_check<DBImpl>(db_); }
+
+ void CreateColumnFamilies(const std::vector<std::string>& cfs,
+ const Options& options);
+
+ void CreateAndReopenWithCF(const std::vector<std::string>& cfs,
+ const Options& options);
+
+ void ReopenWithColumnFamilies(const std::vector<std::string>& cfs,
+ const std::vector<Options>& options);
+
+ void ReopenWithColumnFamilies(const std::vector<std::string>& cfs,
+ const Options& options);
+
+ Status TryReopenWithColumnFamilies(const std::vector<std::string>& cfs,
+ const std::vector<Options>& options);
+
+ Status TryReopenWithColumnFamilies(const std::vector<std::string>& cfs,
+ const Options& options);
+
+ void Reopen(const Options& options);
+
+ void Close();
+
+ void DestroyAndReopen(const Options& options);
+
+ void Destroy(const Options& options, bool delete_cf_paths = false);
+
+ Status ReadOnlyReopen(const Options& options);
+
+ Status TryReopen(const Options& options);
+
+ bool IsDirectIOSupported();
+
+ bool IsMemoryMappedAccessSupported() const;
+
+ Status Flush(int cf = 0);
+
+ Status Flush(const std::vector<int>& cf_ids);
+
+ Status Put(const Slice& k, const Slice& v, WriteOptions wo = WriteOptions());
+
+ Status Put(int cf, const Slice& k, const Slice& v,
+ WriteOptions wo = WriteOptions());
+
+ Status Merge(const Slice& k, const Slice& v,
+ WriteOptions wo = WriteOptions());
+
+ Status Merge(int cf, const Slice& k, const Slice& v,
+ WriteOptions wo = WriteOptions());
+
+ Status Delete(const std::string& k);
+
+ Status Delete(int cf, const std::string& k);
+
+ Status SingleDelete(const std::string& k);
+
+ Status SingleDelete(int cf, const std::string& k);
+
+ std::string Get(const std::string& k, const Snapshot* snapshot = nullptr);
+
+ std::string Get(int cf, const std::string& k,
+ const Snapshot* snapshot = nullptr);
+
+ Status Get(const std::string& k, PinnableSlice* v);
+
+ std::vector<std::string> MultiGet(std::vector<int> cfs,
+ const std::vector<std::string>& k,
+ const Snapshot* snapshot,
+ const bool batched,
+ const bool async = false);
+
+ std::vector<std::string> MultiGet(const std::vector<std::string>& k,
+ const Snapshot* snapshot = nullptr,
+ const bool async = false);
+
+ uint64_t GetNumSnapshots();
+
+ uint64_t GetTimeOldestSnapshots();
+
+ uint64_t GetSequenceOldestSnapshots();
+
+ // Return a string that contains all key,value pairs in order,
+ // formatted like "(k1->v1)(k2->v2)".
+ std::string Contents(int cf = 0);
+
+ std::string AllEntriesFor(const Slice& user_key, int cf = 0);
+
+ // Similar to AllEntriesFor but this function also covers reopen with fifo.
+ // Note that test cases with snapshots or entries in memtable should simply
+  // use AllEntriesFor instead, as snapshots and entries in memtable will not
+  // survive a db reopen.
+ void CheckAllEntriesWithFifoReopen(const std::string& expected_value,
+ const Slice& user_key, int cf,
+ const std::vector<std::string>& cfs,
+ const Options& options);
+
+#ifndef ROCKSDB_LITE
+ int NumSortedRuns(int cf = 0);
+
+ uint64_t TotalSize(int cf = 0);
+
+ uint64_t SizeAtLevel(int level);
+
+ size_t TotalLiveFiles(int cf = 0);
+
+ size_t CountLiveFiles();
+
+ int NumTableFilesAtLevel(int level, int cf = 0);
+
+ double CompressionRatioAtLevel(int level, int cf = 0);
+
+ int TotalTableFiles(int cf = 0, int levels = -1);
+#endif // ROCKSDB_LITE
+
+ std::vector<uint64_t> GetBlobFileNumbers();
+
+ // Return spread of files per level
+ std::string FilesPerLevel(int cf = 0);
+
+ size_t CountFiles();
+
+ Status CountFiles(size_t* count);
+
+ Status Size(const Slice& start, const Slice& limit, uint64_t* size) {
+ return Size(start, limit, 0, size);
+ }
+
+ Status Size(const Slice& start, const Slice& limit, int cf, uint64_t* size);
+
+ void Compact(int cf, const Slice& start, const Slice& limit,
+ uint32_t target_path_id);
+
+ void Compact(int cf, const Slice& start, const Slice& limit);
+
+ void Compact(const Slice& start, const Slice& limit);
+
+ // Do n memtable compactions, each of which produces an sstable
+ // covering the range [small,large].
+ void MakeTables(int n, const std::string& small, const std::string& large,
+ int cf = 0);
+
+ // Prevent pushing of new sstables into deeper levels by adding
+ // tables that cover a specified range to all levels.
+ void FillLevels(const std::string& smallest, const std::string& largest,
+ int cf);
+
+ void MoveFilesToLevel(int level, int cf = 0);
+
+#ifndef ROCKSDB_LITE
+ void DumpFileCounts(const char* label);
+#endif // ROCKSDB_LITE
+
+ std::string DumpSSTableList();
+
+ static void GetSstFiles(Env* env, std::string path,
+ std::vector<std::string>* files);
+
+ int GetSstFileCount(std::string path);
+
+ // this will generate non-overlapping files since it keeps increasing key_idx
+ void GenerateNewFile(Random* rnd, int* key_idx, bool nowait = false);
+
+ void GenerateNewFile(int fd, Random* rnd, int* key_idx, bool nowait = false);
+
+ static const int kNumKeysByGenerateNewRandomFile;
+ static const int KNumKeysByGenerateNewFile = 100;
+
+ void GenerateNewRandomFile(Random* rnd, bool nowait = false);
+
+ std::string IterStatus(Iterator* iter);
+
+ Options OptionsForLogIterTest();
+
+ std::string DummyString(size_t len, char c = 'a');
+
+ void VerifyIterLast(std::string expected_key, int cf = 0);
+
+ // Used to test InplaceUpdate
+
+  // If the previous value is nullptr or delta is larger than the previous
+  //   value, sets newValue to delta.
+  // If the previous value is not empty, updates it in place with a string of
+  //   'b's of length (previous value size - 1).
+ static UpdateStatus updateInPlaceSmallerSize(char* prevValue,
+ uint32_t* prevSize, Slice delta,
+ std::string* newValue);
+
+ static UpdateStatus updateInPlaceSmallerVarintSize(char* prevValue,
+ uint32_t* prevSize,
+ Slice delta,
+ std::string* newValue);
+
+ static UpdateStatus updateInPlaceLargerSize(char* prevValue,
+ uint32_t* prevSize, Slice delta,
+ std::string* newValue);
+
+ static UpdateStatus updateInPlaceNoAction(char* prevValue, uint32_t* prevSize,
+ Slice delta, std::string* newValue);
+
+ // Utility method to test InplaceUpdate
+ void validateNumberOfEntries(int numValues, int cf = 0);
+
+ void CopyFile(const std::string& source, const std::string& destination,
+ uint64_t size = 0);
+
+ Status GetAllDataFiles(const FileType file_type,
+ std::unordered_map<std::string, uint64_t>* sst_files,
+ uint64_t* total_size = nullptr);
+
+ std::vector<std::uint64_t> ListTableFiles(Env* env, const std::string& path);
+
+ void VerifyDBFromMap(
+ std::map<std::string, std::string> true_data,
+ size_t* total_reads_res = nullptr, bool tailing_iter = false,
+ std::map<std::string, Status> status = std::map<std::string, Status>());
+
+ void VerifyDBInternal(
+ std::vector<std::pair<std::string, std::string>> true_data);
+
+#ifndef ROCKSDB_LITE
+ uint64_t GetNumberOfSstFilesForColumnFamily(DB* db,
+ std::string column_family_name);
+
+ uint64_t GetSstSizeHelper(Temperature temperature);
+#endif // ROCKSDB_LITE
+
+ uint64_t TestGetTickerCount(const Options& options, Tickers ticker_type) {
+ return options.statistics->getTickerCount(ticker_type);
+ }
+
+ uint64_t TestGetAndResetTickerCount(const Options& options,
+ Tickers ticker_type) {
+ return options.statistics->getAndResetTickerCount(ticker_type);
+ }
+
+ // Note: reverting this setting within the same test run is not yet
+ // supported
+ void SetTimeElapseOnlySleepOnReopen(DBOptions* options);
+
+ private: // Prone to error on direct use
+ void MaybeInstallTimeElapseOnlySleep(const DBOptions& options);
+
+ bool time_elapse_only_sleep_on_reopen_ = false;
+};
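
Everything above funnels into DBTestBase: a test suite derives from it, names a per-suite directory, and then drives the DB through the helpers (CurrentOptions, DestroyAndReopen, Put/Get/Flush, ChangeOptions, ...) rather than the raw DB API. A minimal end-to-end sketch; the fixture and test names are invented for illustration.

class ExampleDBTest : public DBTestBase {
 public:
  ExampleDBTest() : DBTestBase("/example_db_test", /*env_do_fsync=*/false) {}
};

TEST_F(ExampleDBTest, PutFlushGet) {
  // Run the same body under several option configurations.
  do {
    Options options = CurrentOptions();
    DestroyAndReopen(options);

    ASSERT_OK(Put("foo", "v1"));
    ASSERT_OK(Flush());
    ASSERT_OK(Put("foo", "v2"));  // newer value lives in the memtable

    ASSERT_EQ("v2", Get("foo"));
    Reopen(options);  // the value also survives a reopen
    ASSERT_EQ("v2", Get("foo"));
  } while (ChangeOptions());
}
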
+
+// For verifying that all files generated by the current version have SST
+// unique ids.
+void VerifySstUniqueIds(const TablePropertiesCollection& props);
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/db_universal_compaction_test.cc b/src/rocksdb/db/db_universal_compaction_test.cc
new file mode 100644
index 000000000..f53c36f22
--- /dev/null
+++ b/src/rocksdb/db/db_universal_compaction_test.cc
@@ -0,0 +1,2235 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/db_test_util.h"
+#include "port/stack_trace.h"
+#if !defined(ROCKSDB_LITE)
+#include "rocksdb/utilities/table_properties_collectors.h"
+#include "test_util/sync_point.h"
+#include "test_util/testutil.h"
+#include "util/random.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+static std::string CompressibleString(Random* rnd, int len) {
+ std::string r;
+ test::CompressibleString(rnd, 0.8, len, &r);
+ return r;
+}
+
+class DBTestUniversalCompactionBase
+ : public DBTestBase,
+ public ::testing::WithParamInterface<std::tuple<int, bool>> {
+ public:
+ explicit DBTestUniversalCompactionBase(const std::string& path)
+ : DBTestBase(path, /*env_do_fsync=*/false) {}
+ void SetUp() override {
+ num_levels_ = std::get<0>(GetParam());
+ exclusive_manual_compaction_ = std::get<1>(GetParam());
+ }
+ int num_levels_;
+ bool exclusive_manual_compaction_;
+};
+
+class DBTestUniversalCompaction : public DBTestUniversalCompactionBase {
+ public:
+ DBTestUniversalCompaction()
+ : DBTestUniversalCompactionBase("/db_universal_compaction_test") {}
+};
+
+class DBTestUniversalCompaction2 : public DBTestBase {
+ public:
+ DBTestUniversalCompaction2()
+ : DBTestBase("db_universal_compaction_test2", /*env_do_fsync=*/false) {}
+};
+
+namespace {
+void VerifyCompactionResult(
+ const ColumnFamilyMetaData& cf_meta,
+ const std::set<std::string>& overlapping_file_numbers) {
+#ifndef NDEBUG
+ for (auto& level : cf_meta.levels) {
+ for (auto& file : level.files) {
+ assert(overlapping_file_numbers.find(file.name) ==
+ overlapping_file_numbers.end());
+ }
+ }
+#endif
+}
+
+class KeepFilter : public CompactionFilter {
+ public:
+ bool Filter(int /*level*/, const Slice& /*key*/, const Slice& /*value*/,
+ std::string* /*new_value*/,
+ bool* /*value_changed*/) const override {
+ return false;
+ }
+
+ const char* Name() const override { return "KeepFilter"; }
+};
+
+class KeepFilterFactory : public CompactionFilterFactory {
+ public:
+ explicit KeepFilterFactory(bool check_context = false)
+ : check_context_(check_context) {}
+
+ std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+ const CompactionFilter::Context& context) override {
+ if (check_context_) {
+ EXPECT_EQ(expect_full_compaction_.load(), context.is_full_compaction);
+ EXPECT_EQ(expect_manual_compaction_.load(), context.is_manual_compaction);
+ }
+ return std::unique_ptr<CompactionFilter>(new KeepFilter());
+ }
+
+ const char* Name() const override { return "KeepFilterFactory"; }
+ bool check_context_;
+ std::atomic_bool expect_full_compaction_;
+ std::atomic_bool expect_manual_compaction_;
+};
+} // anonymous namespace
+
+// Make sure we don't hit a problem if the trigger condition is given as 0,
+// which is invalid.
+TEST_P(DBTestUniversalCompaction, UniversalCompactionSingleSortedRun) {
+ Options options = CurrentOptions();
+
+ options.compaction_style = kCompactionStyleUniversal;
+ options.num_levels = num_levels_;
+  // Configure universal compaction to always compact down to one single
+  // sorted run.
+ options.level0_file_num_compaction_trigger = 0;
+ options.compaction_options_universal.size_ratio = 10;
+ options.compaction_options_universal.min_merge_width = 2;
+ options.compaction_options_universal.max_size_amplification_percent = 0;
+
+ options.write_buffer_size = 105 << 10; // 105KB
+ options.arena_block_size = 4 << 10;
+ options.target_file_size_base = 32 << 10; // 32KB
+ KeepFilterFactory* filter = new KeepFilterFactory(true);
+ filter->expect_manual_compaction_.store(false);
+ options.compaction_filter_factory.reset(filter);
+
+ DestroyAndReopen(options);
+ ASSERT_EQ(1, db_->GetOptions().level0_file_num_compaction_trigger);
+
+ Random rnd(301);
+ int key_idx = 0;
+
+ filter->expect_full_compaction_.store(true);
+
+ for (int num = 0; num < 16; num++) {
+    // Write a 100KB file. It should immediately be compacted to one file.
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(NumSortedRuns(0), 1);
+ }
+ ASSERT_OK(Put(Key(key_idx), ""));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(NumSortedRuns(0), 1);
+}
+
+TEST_P(DBTestUniversalCompaction, OptimizeFiltersForHits) {
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.compaction_options_universal.size_ratio = 5;
+ options.num_levels = num_levels_;
+ options.write_buffer_size = 105 << 10; // 105KB
+ options.arena_block_size = 4 << 10;
+ options.target_file_size_base = 32 << 10; // 32KB
+ // trigger compaction if there are >= 4 files
+ options.level0_file_num_compaction_trigger = 4;
+ BlockBasedTableOptions bbto;
+ bbto.cache_index_and_filter_blocks = true;
+ bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
+ bbto.whole_key_filtering = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ options.optimize_filters_for_hits = true;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ options.memtable_factory.reset(test::NewSpecialSkipListFactory(3));
+
+ DestroyAndReopen(options);
+
+ // block compaction from happening
+ env_->SetBackgroundThreads(1, Env::LOW);
+ test::SleepingBackgroundTask sleeping_task_low;
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+ Env::Priority::LOW);
+
+ for (int num = 0; num < options.level0_file_num_compaction_trigger; num++) {
+ ASSERT_OK(Put(Key(num * 10), "val"));
+ if (num) {
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ }
+ ASSERT_OK(Put(Key(30 + num * 10), "val"));
+ ASSERT_OK(Put(Key(60 + num * 10), "val"));
+ }
+ ASSERT_OK(Put("", ""));
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+
+  // Query a set of non-existent keys.
+ for (int i = 5; i < 90; i += 10) {
+ ASSERT_EQ(Get(Key(i)), "NOT_FOUND");
+ }
+
+ // Make sure bloom filter is used at least once.
+ ASSERT_GT(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0);
+ auto prev_counter = TestGetTickerCount(options, BLOOM_FILTER_USEFUL);
+
+ // Make sure bloom filter is used for all but the last L0 file when looking
+ // up a non-existent key that's in the range of all L0 files.
+ ASSERT_EQ(Get(Key(35)), "NOT_FOUND");
+ ASSERT_EQ(prev_counter + NumTableFilesAtLevel(0) - 1,
+ TestGetTickerCount(options, BLOOM_FILTER_USEFUL));
+ prev_counter = TestGetTickerCount(options, BLOOM_FILTER_USEFUL);
+
+  // Unblock compaction and wait for it to happen.
+ sleeping_task_low.WakeUp();
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ // The same queries will not trigger bloom filter
+ for (int i = 5; i < 90; i += 10) {
+ ASSERT_EQ(Get(Key(i)), "NOT_FOUND");
+ }
+ ASSERT_EQ(prev_counter, TestGetTickerCount(options, BLOOM_FILTER_USEFUL));
+}
+
+// TODO(kailiu) The tests on UniversalCompaction have some issues:
+// 1. A lot of magic numbers ("11" or "12").
+// 2. Assumptions about the memtable flush conditions, which may change from
+//    time to time.
+TEST_P(DBTestUniversalCompaction, UniversalCompactionTrigger) {
+ Options options;
+ options.compaction_style = kCompactionStyleUniversal;
+ options.compaction_options_universal.size_ratio = 5;
+ options.num_levels = num_levels_;
+ options.write_buffer_size = 105 << 10; // 105KB
+ options.arena_block_size = 4 << 10;
+ options.target_file_size_base = 32 << 10; // 32KB
+ // trigger compaction if there are >= 4 files
+ options.level0_file_num_compaction_trigger = 4;
+ KeepFilterFactory* filter = new KeepFilterFactory(true);
+ filter->expect_manual_compaction_.store(false);
+ options.compaction_filter_factory.reset(filter);
+
+ options = CurrentOptions(options);
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBTestWritableFile.GetPreallocationStatus", [&](void* arg) {
+ ASSERT_TRUE(arg != nullptr);
+ size_t preallocation_size = *(static_cast<size_t*>(arg));
+ if (num_levels_ > 3) {
+ ASSERT_LE(preallocation_size, options.target_file_size_base * 1.1);
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Random rnd(301);
+ int key_idx = 0;
+
+ filter->expect_full_compaction_.store(true);
+ // Stage 1:
+ // Generate a set of files at level 0, but don't trigger level-0
+ // compaction.
+ for (int num = 0; num < options.level0_file_num_compaction_trigger - 1;
+ num++) {
+ // Write 100KB
+ GenerateNewFile(1, &rnd, &key_idx);
+ }
+
+ // Generate one more file at level-0, which should trigger level-0
+ // compaction.
+ GenerateNewFile(1, &rnd, &key_idx);
+  // Suppose each file flushed from the memtable has size 1. Now we compact
+  // level0_file_num_compaction_trigger = 4 files and should end up with one
+  // big file of size 4.
+ ASSERT_EQ(NumSortedRuns(1), 1);
+
+ // Stage 2:
+ // Now we have one file at level 0, with size 4. We also have some data in
+ // mem table. Let's continue generating new files at level 0, but don't
+ // trigger level-0 compaction.
+ // First, clean up memtable before inserting new data. This will generate
+ // a level-0 file, with size around 0.4 (according to previously written
+ // data amount).
+ filter->expect_full_compaction_.store(false);
+ ASSERT_OK(Flush(1));
+ for (int num = 0; num < options.level0_file_num_compaction_trigger - 3;
+ num++) {
+ GenerateNewFile(1, &rnd, &key_idx);
+ ASSERT_EQ(NumSortedRuns(1), num + 3);
+ }
+
+ // Generate one more file at level-0, which should trigger level-0
+ // compaction.
+ GenerateNewFile(1, &rnd, &key_idx);
+ // Before compaction, we have 4 files at level 0, with size 4, 0.4, 1, 1.
+ // After compaction, we should have 2 files, with size 4, 2.4.
+ ASSERT_EQ(NumSortedRuns(1), 2);
+
+ // Stage 3:
+ // Now we have 2 files at level 0, with size 4 and 2.4. Continue
+ // generating new files at level 0.
+ for (int num = 0; num < options.level0_file_num_compaction_trigger - 3;
+ num++) {
+ GenerateNewFile(1, &rnd, &key_idx);
+ ASSERT_EQ(NumSortedRuns(1), num + 3);
+ }
+
+ // Generate one more file at level-0, which should trigger level-0
+ // compaction.
+ GenerateNewFile(1, &rnd, &key_idx);
+ // Before compaction, we have 4 files at level 0, with size 4, 2.4, 1, 1.
+ // After compaction, we should have 3 files, with size 4, 2.4, 2.
+ ASSERT_EQ(NumSortedRuns(1), 3);
+
+ // Stage 4:
+ // Now we have 3 files at level 0, with size 4, 2.4, 2. Let's generate a
+ // new file of size 1.
+ GenerateNewFile(1, &rnd, &key_idx);
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ // Level-0 compaction is triggered, but no file will be picked up.
+ ASSERT_EQ(NumSortedRuns(1), 4);
+
+ // Stage 5:
+ // Now we have 4 files at level 0, with size 4, 2.4, 2, 1. Let's generate
+ // a new file of size 1.
+ filter->expect_full_compaction_.store(true);
+ GenerateNewFile(1, &rnd, &key_idx);
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ // All files at level 0 will be compacted into a single one.
+ ASSERT_EQ(NumSortedRuns(1), 1);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_P(DBTestUniversalCompaction, UniversalCompactionSizeAmplification) {
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.num_levels = num_levels_;
+ options.write_buffer_size = 100 << 10; // 100KB
+ options.target_file_size_base = 32 << 10; // 32KB
+ options.level0_file_num_compaction_trigger = 3;
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // Trigger compaction if size amplification exceeds 110%
+ options.compaction_options_universal.max_size_amplification_percent = 110;
+ options = CurrentOptions(options);
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+ Random rnd(301);
+ int key_idx = 0;
+
+ // Generate two files in Level 0. Both files are approx the same size.
+ for (int num = 0; num < options.level0_file_num_compaction_trigger - 1;
+ num++) {
+ // Write 110KB (11 values, each 10K)
+ for (int i = 0; i < 11; i++) {
+ ASSERT_OK(Put(1, Key(key_idx), rnd.RandomString(10000)));
+ key_idx++;
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1]));
+ ASSERT_EQ(NumSortedRuns(1), num + 1);
+ }
+ ASSERT_EQ(NumSortedRuns(1), 2);
+
+ // Flush whatever is remaining in memtable. This is typically
+ // small, which should not trigger size ratio based compaction
+ // but will instead trigger size amplification.
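+ // (Illustrative estimate: size amplification is approximated as the total
+ // size of all sorted runs except the oldest one, divided by the size of the
+ // oldest run. Here that is roughly (1 + small flush) / 1, which exceeds the
+ // 110% threshold, so a full compaction into one sorted run is expected.)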
+ ASSERT_OK(Flush(1));
+
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ // Verify that size amplification did occur
+ ASSERT_EQ(NumSortedRuns(1), 1);
+}
+
+TEST_P(DBTestUniversalCompaction, DynamicUniversalCompactionSizeAmplification) {
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.num_levels = 1;
+ options.write_buffer_size = 100 << 10; // 100KB
+ options.target_file_size_base = 32 << 10; // 32KB
+ options.level0_file_num_compaction_trigger = 3;
+ // Initial setup of compaction_options_universal will prevent universal
+ // compaction from happening
+ options.compaction_options_universal.size_ratio = 100;
+ options.compaction_options_universal.min_merge_width = 100;
+ DestroyAndReopen(options);
+
+ int total_picked_compactions = 0;
+ int total_size_amp_compactions = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "UniversalCompactionBuilder::PickCompaction:Return", [&](void* arg) {
+ if (arg) {
+ total_picked_compactions++;
+ Compaction* c = static_cast<Compaction*>(arg);
+ if (c->compaction_reason() ==
+ CompactionReason::kUniversalSizeAmplification) {
+ total_size_amp_compactions++;
+ }
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ MutableCFOptions mutable_cf_options;
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ Random rnd(301);
+ int key_idx = 0;
+
+ // Generate two files in Level 0. Both files are approx the same size.
+ for (int num = 0; num < options.level0_file_num_compaction_trigger - 1;
+ num++) {
+ // Write 110KB (11 values, each 10K)
+ for (int i = 0; i < 11; i++) {
+ ASSERT_OK(Put(1, Key(key_idx), rnd.RandomString(10000)));
+ key_idx++;
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1]));
+ ASSERT_EQ(NumSortedRuns(1), num + 1);
+ }
+ ASSERT_EQ(NumSortedRuns(1), 2);
+
+ // Flush whatever is remaining in memtable. This is typically
+ // small, which should not trigger size ratio based compaction
+ // but could instead trigger size amplification if it's set
+ // to 110.
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ // Verify compaction did not happen
+ ASSERT_EQ(NumSortedRuns(1), 3);
+
+ // Trigger compaction if size amplification exceeds 110% without reopening DB
+ ASSERT_EQ(dbfull()
+ ->GetOptions(handles_[1])
+ .compaction_options_universal.max_size_amplification_percent,
+ 200U);
+ ASSERT_OK(dbfull()->SetOptions(handles_[1],
+ {{"compaction_options_universal",
+ "{max_size_amplification_percent=110;}"}}));
+ ASSERT_EQ(dbfull()
+ ->GetOptions(handles_[1])
+ .compaction_options_universal.max_size_amplification_percent,
+ 110u);
+ ASSERT_OK(dbfull()->TEST_GetLatestMutableCFOptions(handles_[1],
+ &mutable_cf_options));
+ ASSERT_EQ(110u, mutable_cf_options.compaction_options_universal
+ .max_size_amplification_percent);
+
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ // Verify that size amplification did happen
+ ASSERT_EQ(NumSortedRuns(1), 1);
+ ASSERT_EQ(total_picked_compactions, 1);
+ ASSERT_EQ(total_size_amp_compactions, 1);
+}
+
+TEST_P(DBTestUniversalCompaction, DynamicUniversalCompactionReadAmplification) {
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.num_levels = 1;
+ options.write_buffer_size = 100 << 10; // 100KB
+ options.target_file_size_base = 32 << 10; // 32KB
+ options.level0_file_num_compaction_trigger = 3;
+ // Initial setup of compaction_options_universal will prevent universal
+ // compaction from happening
+ options.compaction_options_universal.max_size_amplification_percent = 2000;
+ options.compaction_options_universal.size_ratio = 0;
+ options.compaction_options_universal.min_merge_width = 100;
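+ // (min_merge_width=100 means a size-ratio compaction would need at least 100
+ // sorted runs, and the 2000% size-amplification limit is far above anything
+ // this test produces, so no automatic compaction is expected until these
+ // options are lowered below.)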
+ DestroyAndReopen(options);
+
+ int total_picked_compactions = 0;
+ int total_size_ratio_compactions = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "UniversalCompactionBuilder::PickCompaction:Return", [&](void* arg) {
+ if (arg) {
+ total_picked_compactions++;
+ Compaction* c = static_cast<Compaction*>(arg);
+ if (c->compaction_reason() == CompactionReason::kUniversalSizeRatio) {
+ total_size_ratio_compactions++;
+ }
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ MutableCFOptions mutable_cf_options;
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ Random rnd(301);
+ int key_idx = 0;
+
+ // Generate three files in Level 0. All files are approx the same size.
+ for (int num = 0; num < options.level0_file_num_compaction_trigger; num++) {
+ // Write 110KB (11 values, each 10K)
+ for (int i = 0; i < 11; i++) {
+ ASSERT_OK(Put(1, Key(key_idx), rnd.RandomString(10000)));
+ key_idx++;
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1]));
+ ASSERT_EQ(NumSortedRuns(1), num + 1);
+ }
+ ASSERT_EQ(NumSortedRuns(1), options.level0_file_num_compaction_trigger);
+
+ // Flush whatever is remaining in memtable. This is typically small, about
+ // 30KB.
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ // Verify compaction did not happen
+ ASSERT_EQ(NumSortedRuns(1), options.level0_file_num_compaction_trigger + 1);
+ ASSERT_EQ(total_picked_compactions, 0);
+
+ ASSERT_OK(dbfull()->SetOptions(
+ handles_[1],
+ {{"compaction_options_universal",
+ "{min_merge_width=2;max_merge_width=2;size_ratio=100;}"}}));
+ ASSERT_EQ(dbfull()
+ ->GetOptions(handles_[1])
+ .compaction_options_universal.min_merge_width,
+ 2u);
+ ASSERT_EQ(dbfull()
+ ->GetOptions(handles_[1])
+ .compaction_options_universal.max_merge_width,
+ 2u);
+ ASSERT_EQ(
+ dbfull()->GetOptions(handles_[1]).compaction_options_universal.size_ratio,
+ 100u);
+
+ ASSERT_OK(dbfull()->TEST_GetLatestMutableCFOptions(handles_[1],
+ &mutable_cf_options));
+ ASSERT_EQ(mutable_cf_options.compaction_options_universal.size_ratio, 100u);
+ ASSERT_EQ(mutable_cf_options.compaction_options_universal.min_merge_width,
+ 2u);
+ ASSERT_EQ(mutable_cf_options.compaction_options_universal.max_merge_width,
+ 2u);
+
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ // Files in L0 are approx: 0.3 (30KB), 1, 1, 1.
+ // On compaction: the files are below the size-amplification threshold, so we
+ // fall through to checking the read-amplification conditions. The configured
+ // size ratio is not big enough to take 0.3 into consideration. So the next
+ // two files of size 1 are compacted together first, as they satisfy the size
+ // ratio condition and the (min_merge_width, max_merge_width) condition,
+ // producing a file of size 2. Next, the newly generated 2 and the last file
+ // of size 1 are compacted together. In the end: #sorted_runs = 2,
+ // #picked_compactions = 2, and all picked compactions are size-ratio based.
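+ // (Illustrative check against the documented size-ratio rule: a run of size
+ // s joins the candidate set while accumulated * (100 + size_ratio) / 100 >= s.
+ // Starting at 0.3 with size_ratio=100: 0.3 * 2 = 0.6 < 1, so that candidate
+ // set stays below min_merge_width=2 and is skipped. Starting at the next run:
+ // 1 * 2 >= 1, and max_merge_width=2 stops the pick there, giving the first
+ // (1, 1) compaction described above.)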
+ ASSERT_EQ(NumSortedRuns(1), 2);
+ // If max_merge_width had not been changed dynamically above, and if it had
+ // kept its default value of UINT_MAX, total_picked_compactions would have
+ // been 1.
+ ASSERT_EQ(total_picked_compactions, 2);
+ ASSERT_EQ(total_size_ratio_compactions, 2);
+}
+
+TEST_P(DBTestUniversalCompaction, CompactFilesOnUniversalCompaction) {
+ const int kTestKeySize = 16;
+ const int kTestValueSize = 984;
+ const int kEntrySize = kTestKeySize + kTestValueSize;
+ const int kEntriesPerBuffer = 10;
+
+ ChangeCompactOptions();
+ Options options;
+ options.create_if_missing = true;
+ options.compaction_style = kCompactionStyleLevel;
+ options.num_levels = 1;
+ options.target_file_size_base = options.write_buffer_size;
+ options.compression = kNoCompression;
+ options = CurrentOptions(options);
+ options.write_buffer_size = kEntrySize * kEntriesPerBuffer;
+ CreateAndReopenWithCF({"pikachu"}, options);
+ ASSERT_EQ(options.compaction_style, kCompactionStyleUniversal);
+ Random rnd(301);
+ for (int key = 1024 * kEntriesPerBuffer; key >= 0; --key) {
+ ASSERT_OK(Put(1, std::to_string(key), rnd.RandomString(kTestValueSize)));
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1]));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ColumnFamilyMetaData cf_meta;
+ dbfull()->GetColumnFamilyMetaData(handles_[1], &cf_meta);
+ std::vector<std::string> compaction_input_file_names;
+ for (auto file : cf_meta.levels[0].files) {
+ if (rnd.OneIn(2)) {
+ compaction_input_file_names.push_back(file.name);
+ }
+ }
+
+ if (compaction_input_file_names.size() == 0) {
+ compaction_input_file_names.push_back(cf_meta.levels[0].files[0].name);
+ }
+
+ // Expect failure since universal compaction only allows L0 output
+ ASSERT_FALSE(dbfull()
+ ->CompactFiles(CompactionOptions(), handles_[1],
+ compaction_input_file_names, 1)
+ .ok());
+
+ // expect ok and verify the compacted files no longer exist.
+ ASSERT_OK(dbfull()->CompactFiles(CompactionOptions(), handles_[1],
+ compaction_input_file_names, 0));
+
+ dbfull()->GetColumnFamilyMetaData(handles_[1], &cf_meta);
+ VerifyCompactionResult(
+ cf_meta, std::set<std::string>(compaction_input_file_names.begin(),
+ compaction_input_file_names.end()));
+
+ compaction_input_file_names.clear();
+
+ // Pick the first and the last file, expect everything is
+ // compacted into one single file.
+ compaction_input_file_names.push_back(cf_meta.levels[0].files[0].name);
+ compaction_input_file_names.push_back(
+ cf_meta.levels[0].files[cf_meta.levels[0].files.size() - 1].name);
+ ASSERT_OK(dbfull()->CompactFiles(CompactionOptions(), handles_[1],
+ compaction_input_file_names, 0));
+
+ dbfull()->GetColumnFamilyMetaData(handles_[1], &cf_meta);
+ ASSERT_EQ(cf_meta.levels[0].files.size(), 1U);
+}
+
+TEST_P(DBTestUniversalCompaction, UniversalCompactionTargetLevel) {
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.write_buffer_size = 100 << 10; // 100KB
+ options.num_levels = 7;
+ options.disable_auto_compactions = true;
+ DestroyAndReopen(options);
+
+ // Generate 3 overlapping files
+ Random rnd(301);
+ for (int i = 0; i < 210; i++) {
+ ASSERT_OK(Put(Key(i), rnd.RandomString(100)));
+ }
+ ASSERT_OK(Flush());
+
+ for (int i = 200; i < 300; i++) {
+ ASSERT_OK(Put(Key(i), rnd.RandomString(100)));
+ }
+ ASSERT_OK(Flush());
+
+ for (int i = 250; i < 260; i++) {
+ ASSERT_OK(Put(Key(i), rnd.RandomString(100)));
+ }
+ ASSERT_OK(Flush());
+
+ ASSERT_EQ("3", FilesPerLevel(0));
+ // Compact all files into 1 file and put it in L4
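+ // (With change_level=true, the files produced by the range compaction are
+ // moved to target_level afterwards, which is how the single output file ends
+ // up in L4 below.)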
+ CompactRangeOptions compact_options;
+ compact_options.change_level = true;
+ compact_options.target_level = 4;
+ compact_options.exclusive_manual_compaction = exclusive_manual_compaction_;
+ ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
+ ASSERT_EQ("0,0,0,0,1", FilesPerLevel(0));
+}
+
+#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+class DBTestUniversalCompactionMultiLevels
+ : public DBTestUniversalCompactionBase {
+ public:
+ DBTestUniversalCompactionMultiLevels()
+ : DBTestUniversalCompactionBase(
+ "/db_universal_compaction_multi_levels_test") {}
+};
+
+TEST_P(DBTestUniversalCompactionMultiLevels, UniversalCompactionMultiLevels) {
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.num_levels = num_levels_;
+ options.write_buffer_size = 100 << 10; // 100KB
+ options.level0_file_num_compaction_trigger = 8;
+ options.max_background_compactions = 3;
+ options.target_file_size_base = 32 * 1024;
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // Trigger compaction if size amplification exceeds 110%
+ options.compaction_options_universal.max_size_amplification_percent = 110;
+ options = CurrentOptions(options);
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+ Random rnd(301);
+ int num_keys = 100000;
+ for (int i = 0; i < num_keys * 2; i++) {
+ ASSERT_OK(Put(1, Key(i % num_keys), Key(i)));
+ }
+
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ for (int i = num_keys; i < num_keys * 2; i++) {
+ ASSERT_EQ(Get(1, Key(i % num_keys)), Key(i));
+ }
+}
+
+// Tests universal compaction with trivial move enabled
+TEST_P(DBTestUniversalCompactionMultiLevels, UniversalCompactionTrivialMove) {
+ int32_t trivial_move = 0;
+ int32_t non_trivial_move = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:TrivialMove",
+ [&](void* /*arg*/) { trivial_move++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:NonTrivial", [&](void* arg) {
+ non_trivial_move++;
+ ASSERT_TRUE(arg != nullptr);
+ int output_level = *(static_cast<int*>(arg));
+ ASSERT_EQ(output_level, 0);
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.compaction_options_universal.allow_trivial_move = true;
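+ // (allow_trivial_move lets universal compaction move whole input files to
+ // the output level without rewriting them, roughly when the inputs do not
+ // overlap; the TrivialMove sync point above counts such moves.)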
+ options.num_levels = 3;
+ options.write_buffer_size = 100 << 10; // 100KB
+ options.level0_file_num_compaction_trigger = 3;
+ options.max_background_compactions = 2;
+ options.target_file_size_base = 32 * 1024;
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // Trigger compaction if size amplification exceeds 110%
+ options.compaction_options_universal.max_size_amplification_percent = 110;
+ options = CurrentOptions(options);
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+ Random rnd(301);
+ int num_keys = 150000;
+ for (int i = 0; i < num_keys; i++) {
+ ASSERT_OK(Put(1, Key(i), Key(i)));
+ }
+ std::vector<std::string> values;
+
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ ASSERT_GT(trivial_move, 0);
+ ASSERT_GT(non_trivial_move, 0);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+INSTANTIATE_TEST_CASE_P(MultiLevels, DBTestUniversalCompactionMultiLevels,
+ ::testing::Combine(::testing::Values(3, 20),
+ ::testing::Bool()));
+
+class DBTestUniversalCompactionParallel : public DBTestUniversalCompactionBase {
+ public:
+ DBTestUniversalCompactionParallel()
+ : DBTestUniversalCompactionBase("/db_universal_compaction_prallel_test") {
+ }
+};
+
+TEST_P(DBTestUniversalCompactionParallel, UniversalCompactionParallel) {
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.num_levels = num_levels_;
+ options.env = env_;
+ options.write_buffer_size = 1 << 10; // 1KB
+ options.level0_file_num_compaction_trigger = 3;
+ options.max_background_compactions = 3;
+ options.max_background_flushes = 3;
+ options.target_file_size_base = 1 * 1024;
+ options.compaction_options_universal.max_size_amplification_percent = 110;
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // Delay every compaction so multiple compactions will happen.
+ std::atomic<int> num_compactions_running(0);
+ std::atomic<bool> has_parallel(false);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "CompactionJob::Run():Start", [&](void* /*arg*/) {
+ if (num_compactions_running.fetch_add(1) > 0) {
+ has_parallel.store(true);
+ return;
+ }
+ for (int nwait = 0; nwait < 20000; nwait++) {
+ if (has_parallel.load() || num_compactions_running.load() > 1) {
+ has_parallel.store(true);
+ break;
+ }
+ env_->SleepForMicroseconds(1000);
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "CompactionJob::Run():End",
+ [&](void* /*arg*/) { num_compactions_running.fetch_add(-1); });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ options = CurrentOptions(options);
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+ Random rnd(301);
+ int num_keys = 30000;
+ for (int i = 0; i < num_keys * 2; i++) {
+ ASSERT_OK(Put(1, Key(i % num_keys), Key(i)));
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ASSERT_EQ(num_compactions_running.load(), 0);
+ ASSERT_TRUE(has_parallel.load());
+
+ for (int i = num_keys; i < num_keys * 2; i++) {
+ ASSERT_EQ(Get(1, Key(i % num_keys)), Key(i));
+ }
+
+ // Reopen and check.
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ for (int i = num_keys; i < num_keys * 2; i++) {
+ ASSERT_EQ(Get(1, Key(i % num_keys)), Key(i));
+ }
+}
+
+TEST_P(DBTestUniversalCompactionParallel, PickByFileNumberBug) {
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.num_levels = num_levels_;
+ options.write_buffer_size = 1 * 1024; // 1KB
+ options.level0_file_num_compaction_trigger = 7;
+ options.max_background_compactions = 2;
+ options.target_file_size_base = 1024 * 1024; // 1MB
+
+ // Disable size amplification compaction
+ options.compaction_options_universal.max_size_amplification_percent =
+ UINT_MAX;
+ DestroyAndReopen(options);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBTestUniversalCompactionParallel::PickByFileNumberBug:0",
+ "BackgroundCallCompaction:0"},
+ {"UniversalCompactionBuilder::PickCompaction:Return",
+ "DBTestUniversalCompactionParallel::PickByFileNumberBug:1"},
+ {"DBTestUniversalCompactionParallel::PickByFileNumberBug:2",
+ "CompactionJob::Run():Start"}});
+
+ int total_picked_compactions = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "UniversalCompactionBuilder::PickCompaction:Return", [&](void* arg) {
+ if (arg) {
+ total_picked_compactions++;
+ }
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // Write 7 files to trigger compaction
+ int key_idx = 1;
+ for (int i = 1; i <= 70; i++) {
+ std::string k = Key(key_idx++);
+ ASSERT_OK(Put(k, k));
+ if (i % 10 == 0) {
+ ASSERT_OK(Flush());
+ }
+ }
+
+ // Wait for the 1st background compaction process to start
+ TEST_SYNC_POINT("DBTestUniversalCompactionParallel::PickByFileNumberBug:0");
+ TEST_SYNC_POINT("DBTestUniversalCompactionParallel::PickByFileNumberBug:1");
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearTrace();
+
+ // Write 3 files while 1st compaction is held
+ // These 3 files have different sizes to avoid compacting based on size_ratio
+ int num_keys = 1000;
+ for (int i = 0; i < 3; i++) {
+ for (int j = 1; j <= num_keys; j++) {
+ std::string k = Key(key_idx++);
+ ASSERT_OK(Put(k, k));
+ }
+ ASSERT_OK(Flush());
+ num_keys -= 100;
+ }
+
+ // Hold the 1st compaction from finishing
+ TEST_SYNC_POINT("DBTestUniversalCompactionParallel::PickByFileNumberBug:2");
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ // There should only be one picked compaction as the score drops below one
+ // after the first one is picked.
+ EXPECT_EQ(total_picked_compactions, 1);
+ EXPECT_EQ(TotalTableFiles(), 4);
+
+ // Stop SyncPoint and destroy the DB and reopen it again
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearTrace();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ key_idx = 1;
+ total_picked_compactions = 0;
+ DestroyAndReopen(options);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // Write 7 files to trigger compaction
+ for (int i = 1; i <= 70; i++) {
+ std::string k = Key(key_idx++);
+ ASSERT_OK(Put(k, k));
+ if (i % 10 == 0) {
+ ASSERT_OK(Flush());
+ }
+ }
+
+ // Wait for the 1st background compaction process to start
+ TEST_SYNC_POINT("DBTestUniversalCompactionParallel::PickByFileNumberBug:0");
+ TEST_SYNC_POINT("DBTestUniversalCompactionParallel::PickByFileNumberBug:1");
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearTrace();
+
+ // Write 8 files while 1st compaction is held
+ // These 8 files have different sizes to avoid compacting based on size_ratio
+ num_keys = 1000;
+ for (int i = 0; i < 8; i++) {
+ for (int j = 1; j <= num_keys; j++) {
+ std::string k = Key(key_idx++);
+ ASSERT_OK(Put(k, k));
+ }
+ ASSERT_OK(Flush());
+ num_keys -= 100;
+ }
+
+ // Wait for the 2nd background compaction process to start
+ TEST_SYNC_POINT("DBTestUniversalCompactionParallel::PickByFileNumberBug:0");
+ TEST_SYNC_POINT("DBTestUniversalCompactionParallel::PickByFileNumberBug:1");
+
+ // Hold the 1st and 2nd compaction from finishing
+ TEST_SYNC_POINT("DBTestUniversalCompactionParallel::PickByFileNumberBug:2");
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ // This time we will trigger one compaction because of the size ratio and
+ // another compaction because the number of files that have not been
+ // compacted is greater than 7.
+ EXPECT_GE(total_picked_compactions, 2);
+}
+
+INSTANTIATE_TEST_CASE_P(Parallel, DBTestUniversalCompactionParallel,
+ ::testing::Combine(::testing::Values(1, 10),
+ ::testing::Values(false)));
+#endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+
+TEST_P(DBTestUniversalCompaction, UniversalCompactionOptions) {
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.write_buffer_size = 105 << 10; // 105KB
+ options.arena_block_size = 4 << 10; // 4KB
+ options.target_file_size_base = 32 << 10; // 32KB
+ options.level0_file_num_compaction_trigger = 4;
+ options.num_levels = num_levels_;
+ options.compaction_options_universal.compression_size_percent = -1;
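+ // (compression_size_percent = -1 disables the size-based compression split,
+ // so all compaction output simply follows the column family's compression
+ // setting.)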
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ Random rnd(301);
+ int key_idx = 0;
+
+ for (int num = 0; num < options.level0_file_num_compaction_trigger; num++) {
+ // Write 100KB (100 values, each 1K)
+ for (int i = 0; i < 100; i++) {
+ ASSERT_OK(Put(1, Key(key_idx), rnd.RandomString(990)));
+ key_idx++;
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1]));
+
+ if (num < options.level0_file_num_compaction_trigger - 1) {
+ ASSERT_EQ(NumSortedRuns(1), num + 1);
+ }
+ }
+
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(NumSortedRuns(1), 1);
+}
+
+TEST_P(DBTestUniversalCompaction, UniversalCompactionStopStyleSimilarSize) {
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.write_buffer_size = 105 << 10; // 105KB
+ options.arena_block_size = 4 << 10; // 4KB
+ options.target_file_size_base = 32 << 10; // 32KB
+ // trigger compaction if there are >= 4 files
+ options.level0_file_num_compaction_trigger = 4;
+ options.compaction_options_universal.size_ratio = 10;
+ options.compaction_options_universal.stop_style =
+ kCompactionStopStyleSimilarSize;
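+ // (kCompactionStopStyleSimilarSize stops adding sorted runs to a candidate
+ // compaction once the next run is no longer of similar size, instead of the
+ // default total-size based stop style.)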
+ options.num_levels = num_levels_;
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ int key_idx = 0;
+
+ // Stage 1:
+ // Generate a set of files at level 0, but don't trigger level-0
+ // compaction.
+ for (int num = 0; num < options.level0_file_num_compaction_trigger - 1;
+ num++) {
+ // Write 100KB (100 values, each 1K)
+ for (int i = 0; i < 100; i++) {
+ ASSERT_OK(Put(Key(key_idx), rnd.RandomString(990)));
+ key_idx++;
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_EQ(NumSortedRuns(), num + 1);
+ }
+
+ // Generate one more file at level-0, which should trigger level-0
+ // compaction.
+ for (int i = 0; i < 100; i++) {
+ ASSERT_OK(Put(Key(key_idx), rnd.RandomString(990)));
+ key_idx++;
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ // Suppose each file flushed from the memtable has size 1. Now we compact
+ // level0_file_num_compaction_trigger (= 4) files and should end up with one
+ // big file of size 4.
+ ASSERT_EQ(NumSortedRuns(), 1);
+
+ // Stage 2:
+ // Now we have one file at level 0, with size 4. We also have some data in
+ // mem table. Let's continue generating new files at level 0, but don't
+ // trigger level-0 compaction.
+ // First, clean up memtable before inserting new data. This will generate
+ // a level-0 file, with size around 0.4 (according to previously written
+ // data amount).
+ ASSERT_OK(dbfull()->Flush(FlushOptions()));
+ for (int num = 0; num < options.level0_file_num_compaction_trigger - 3;
+ num++) {
+ // Write 100KB (100 values, each 1K)
+ for (int i = 0; i < 100; i++) {
+ ASSERT_OK(Put(Key(key_idx), rnd.RandomString(990)));
+ key_idx++;
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_EQ(NumSortedRuns(), num + 3);
+ }
+
+ // Generate one more file at level-0, which should trigger level-0
+ // compaction.
+ for (int i = 0; i < 100; i++) {
+ ASSERT_OK(Put(Key(key_idx), rnd.RandomString(990)));
+ key_idx++;
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ // Before compaction, we have 4 files at level 0, with size 4, 0.4, 1, 1.
+ // After compaction, we should have 3 files, with size 4, 0.4, 2.
+ ASSERT_EQ(NumSortedRuns(), 3);
+ // Stage 3:
+ // Now we have 3 files at level 0, with size 4, 0.4, 2. Generate one
+ // more file at level-0, which should trigger level-0 compaction.
+ for (int i = 0; i < 100; i++) {
+ ASSERT_OK(Put(Key(key_idx), rnd.RandomString(990)));
+ key_idx++;
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ // Level-0 compaction is triggered, but no file will be picked up.
+ ASSERT_EQ(NumSortedRuns(), 4);
+}
+
+TEST_P(DBTestUniversalCompaction, UniversalCompactionCompressRatio1) {
+ if (!Snappy_Supported()) {
+ return;
+ }
+
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.write_buffer_size = 100 << 10; // 100KB
+ options.target_file_size_base = 32 << 10; // 32KB
+ options.level0_file_num_compaction_trigger = 2;
+ options.num_levels = num_levels_;
+ options.compaction_options_universal.compression_size_percent = 70;
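+ // (With compression_size_percent = 70, roughly the oldest 70% of the data is
+ // compressed and the newest 30% is left uncompressed; the size assertions
+ // below rely on that split.)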
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ int key_idx = 0;
+
+ // The first compaction (2) is compressed.
+ for (int num = 0; num < 2; num++) {
+ // Write 110KB (11 values, each 10K)
+ for (int i = 0; i < 11; i++) {
+ ASSERT_OK(Put(Key(key_idx), CompressibleString(&rnd, 10000)));
+ key_idx++;
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+ ASSERT_LT(TotalSize(), 110000U * 2 * 0.9);
+
+ // The second compaction (4) is compressed
+ for (int num = 0; num < 2; num++) {
+ // Write 110KB (11 values, each 10K)
+ for (int i = 0; i < 11; i++) {
+ ASSERT_OK(Put(Key(key_idx), CompressibleString(&rnd, 10000)));
+ key_idx++;
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+ ASSERT_LT(TotalSize(), 110000 * 4 * 0.9);
+
+ // The third compaction (2 4) is compressed since this time it is
+ // (1 1 3.2) and 3.2/5.2 doesn't reach ratio.
+ for (int num = 0; num < 2; num++) {
+ // Write 110KB (11 values, each 10K)
+ for (int i = 0; i < 11; i++) {
+ ASSERT_OK(Put(Key(key_idx), CompressibleString(&rnd, 10000)));
+ key_idx++;
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+ ASSERT_LT(TotalSize(), 110000 * 6 * 0.9);
+
+ // When compaction builds up to (2 4 8), the newest data is not compressed,
+ // as it falls into the uncompressed newest portion of the data.
+ for (int num = 0; num < 8; num++) {
+ // Write 110KB (11 values, each 10K)
+ for (int i = 0; i < 11; i++) {
+ ASSERT_OK(Put(Key(key_idx), CompressibleString(&rnd, 10000)));
+ key_idx++;
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+ ASSERT_GT(TotalSize(), 110000 * 11 * 0.8 + 110000 * 2);
+}
+
+TEST_P(DBTestUniversalCompaction, UniversalCompactionCompressRatio2) {
+ if (!Snappy_Supported()) {
+ return;
+ }
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.write_buffer_size = 100 << 10; // 100KB
+ options.target_file_size_base = 32 << 10; // 32KB
+ options.level0_file_num_compaction_trigger = 2;
+ options.num_levels = num_levels_;
+ options.compaction_options_universal.compression_size_percent = 95;
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ int key_idx = 0;
+
+ // When compaction builds up to (2 4 8), even the newest data is compressed,
+ // given the 95% compression_size_percent.
+ for (int num = 0; num < 14; num++) {
+ // Write 120KB (12 values, each 10K)
+ for (int i = 0; i < 12; i++) {
+ ASSERT_OK(Put(Key(key_idx), CompressibleString(&rnd, 10000)));
+ key_idx++;
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+ ASSERT_LT(TotalSize(), 120000U * 12 * 0.82 + 120000 * 2);
+}
+
+#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+// Test that checks trivial move in universal compaction
+TEST_P(DBTestUniversalCompaction, UniversalCompactionTrivialMoveTest1) {
+ int32_t trivial_move = 0;
+ int32_t non_trivial_move = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:TrivialMove",
+ [&](void* /*arg*/) { trivial_move++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:NonTrivial", [&](void* arg) {
+ non_trivial_move++;
+ ASSERT_TRUE(arg != nullptr);
+ int output_level = *(static_cast<int*>(arg));
+ ASSERT_EQ(output_level, 0);
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.compaction_options_universal.allow_trivial_move = true;
+ options.num_levels = 2;
+ options.write_buffer_size = 100 << 10; // 100KB
+ options.level0_file_num_compaction_trigger = 3;
+ options.max_background_compactions = 1;
+ options.target_file_size_base = 32 * 1024;
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // Trigger compaction if size amplification exceeds 110%
+ options.compaction_options_universal.max_size_amplification_percent = 110;
+ options = CurrentOptions(options);
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+ Random rnd(301);
+ int num_keys = 250000;
+ for (int i = 0; i < num_keys; i++) {
+ ASSERT_OK(Put(1, Key(i), Key(i)));
+ }
+ std::vector<std::string> values;
+
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ ASSERT_GT(trivial_move, 0);
+ ASSERT_GT(non_trivial_move, 0);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+// Test that checks trivial move in universal compaction
+TEST_P(DBTestUniversalCompaction, UniversalCompactionTrivialMoveTest2) {
+ int32_t trivial_move = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:TrivialMove",
+ [&](void* /*arg*/) { trivial_move++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:NonTrivial", [&](void* arg) {
+ ASSERT_TRUE(arg != nullptr);
+ int output_level = *(static_cast<int*>(arg));
+ ASSERT_EQ(output_level, 0);
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.compaction_options_universal.allow_trivial_move = true;
+ options.num_levels = 15;
+ options.write_buffer_size = 100 << 10; // 100KB
+ options.level0_file_num_compaction_trigger = 8;
+ options.max_background_compactions = 2;
+ options.target_file_size_base = 64 * 1024;
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // Trigger compaction if size amplification exceeds 110%
+ options.compaction_options_universal.max_size_amplification_percent = 110;
+ options = CurrentOptions(options);
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+ Random rnd(301);
+ int num_keys = 500000;
+ for (int i = 0; i < num_keys; i++) {
+ ASSERT_OK(Put(1, Key(i), Key(i)));
+ }
+ std::vector<std::string> values;
+
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ ASSERT_GT(trivial_move, 0);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+#endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+
+TEST_P(DBTestUniversalCompaction, UniversalCompactionFourPaths) {
+ Options options = CurrentOptions();
+ options.db_paths.emplace_back(dbname_, 300 * 1024);
+ options.db_paths.emplace_back(dbname_ + "_2", 300 * 1024);
+ options.db_paths.emplace_back(dbname_ + "_3", 500 * 1024);
+ options.db_paths.emplace_back(dbname_ + "_4", 1024 * 1024 * 1024);
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(KNumKeysByGenerateNewFile - 1));
+ options.compaction_style = kCompactionStyleUniversal;
+ options.compaction_options_universal.size_ratio = 5;
+ options.write_buffer_size = 111 << 10; // 111KB
+ options.arena_block_size = 4 << 10;
+ options.level0_file_num_compaction_trigger = 2;
+ options.num_levels = 1;
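+ // (With multiple db_paths, the compaction output path is chosen based on the
+ // estimated output size relative to the per-path target sizes, so the
+ // progressively larger compacted files below land in the later, larger paths;
+ // the GetSstFileCount assertions track that movement.)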
+
+ std::vector<std::string> filenames;
+ if (env_->GetChildren(options.db_paths[1].path, &filenames).ok()) {
+ // Delete archival files.
+ for (size_t i = 0; i < filenames.size(); ++i) {
+ ASSERT_OK(
+ env_->DeleteFile(options.db_paths[1].path + "/" + filenames[i]));
+ }
+ ASSERT_OK(env_->DeleteDir(options.db_paths[1].path));
+ }
+ Reopen(options);
+
+ Random rnd(301);
+ int key_idx = 0;
+
+ // First three 110KB files are not going to second path.
+ // After that, (100K, 200K)
+ for (int num = 0; num < 3; num++) {
+ GenerateNewFile(&rnd, &key_idx);
+ }
+
+ // Another 110KB file triggers a compaction to a 400K file in the second path
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[2].path));
+
+ // (1, 4)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[2].path));
+ ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+ // (1,1,4) -> (2, 4)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[2].path));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+ // (1, 2, 4) -> (3, 4)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[2].path));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+ // (1, 3, 4) -> (8)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[3].path));
+
+ // (1, 8)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[3].path));
+ ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+ // (1, 1, 8) -> (2, 8)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[3].path));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+
+ // (1, 2, 8) -> (3, 8)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[3].path));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+ // (1, 3, 8) -> (4, 8)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[2].path));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[3].path));
+
+ // (1, 4, 8) -> (5, 8)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[3].path));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[2].path));
+ ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+ for (int i = 0; i < key_idx; i++) {
+ auto v = Get(Key(i));
+ ASSERT_NE(v, "NOT_FOUND");
+ ASSERT_TRUE(v.size() == 1 || v.size() == 990);
+ }
+
+ Reopen(options);
+
+ for (int i = 0; i < key_idx; i++) {
+ auto v = Get(Key(i));
+ ASSERT_NE(v, "NOT_FOUND");
+ ASSERT_TRUE(v.size() == 1 || v.size() == 990);
+ }
+
+ Destroy(options);
+}
+
+TEST_P(DBTestUniversalCompaction, UniversalCompactionCFPathUse) {
+ Options options = CurrentOptions();
+ options.db_paths.emplace_back(dbname_, 300 * 1024);
+ options.db_paths.emplace_back(dbname_ + "_2", 300 * 1024);
+ options.db_paths.emplace_back(dbname_ + "_3", 500 * 1024);
+ options.db_paths.emplace_back(dbname_ + "_4", 1024 * 1024 * 1024);
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(KNumKeysByGenerateNewFile - 1));
+ options.compaction_style = kCompactionStyleUniversal;
+ options.compaction_options_universal.size_ratio = 10;
+ options.write_buffer_size = 111 << 10; // 111KB
+ options.arena_block_size = 4 << 10;
+ options.level0_file_num_compaction_trigger = 2;
+ options.num_levels = 1;
+
+ std::vector<Options> option_vector;
+ option_vector.emplace_back(options);
+ ColumnFamilyOptions cf_opt1(options), cf_opt2(options);
+ // Configure CF1 specific paths.
+ cf_opt1.cf_paths.emplace_back(dbname_ + "cf1", 300 * 1024);
+ cf_opt1.cf_paths.emplace_back(dbname_ + "cf1_2", 300 * 1024);
+ cf_opt1.cf_paths.emplace_back(dbname_ + "cf1_3", 500 * 1024);
+ cf_opt1.cf_paths.emplace_back(dbname_ + "cf1_4", 1024 * 1024 * 1024);
+ option_vector.emplace_back(DBOptions(options), cf_opt1);
+ CreateColumnFamilies({"one"}, option_vector[1]);
+
+ // Configure CF2 specific paths.
+ cf_opt2.cf_paths.emplace_back(dbname_ + "cf2", 300 * 1024);
+ cf_opt2.cf_paths.emplace_back(dbname_ + "cf2_2", 300 * 1024);
+ cf_opt2.cf_paths.emplace_back(dbname_ + "cf2_3", 500 * 1024);
+ cf_opt2.cf_paths.emplace_back(dbname_ + "cf2_4", 1024 * 1024 * 1024);
+ option_vector.emplace_back(DBOptions(options), cf_opt2);
+ CreateColumnFamilies({"two"}, option_vector[2]);
+
+ ReopenWithColumnFamilies({"default", "one", "two"}, option_vector);
+
+ Random rnd(301);
+ int key_idx = 0;
+ int key_idx1 = 0;
+ int key_idx2 = 0;
+
+ auto generate_file = [&]() {
+ GenerateNewFile(0, &rnd, &key_idx);
+ GenerateNewFile(1, &rnd, &key_idx1);
+ GenerateNewFile(2, &rnd, &key_idx2);
+ };
+
+ auto check_sstfilecount = [&](int path_id, int expected) {
+ ASSERT_EQ(expected, GetSstFileCount(options.db_paths[path_id].path));
+ ASSERT_EQ(expected, GetSstFileCount(cf_opt1.cf_paths[path_id].path));
+ ASSERT_EQ(expected, GetSstFileCount(cf_opt2.cf_paths[path_id].path));
+ };
+
+ auto check_getvalues = [&]() {
+ for (int i = 0; i < key_idx; i++) {
+ auto v = Get(0, Key(i));
+ ASSERT_NE(v, "NOT_FOUND");
+ ASSERT_TRUE(v.size() == 1 || v.size() == 990);
+ }
+
+ for (int i = 0; i < key_idx1; i++) {
+ auto v = Get(1, Key(i));
+ ASSERT_NE(v, "NOT_FOUND");
+ ASSERT_TRUE(v.size() == 1 || v.size() == 990);
+ }
+
+ for (int i = 0; i < key_idx2; i++) {
+ auto v = Get(2, Key(i));
+ ASSERT_NE(v, "NOT_FOUND");
+ ASSERT_TRUE(v.size() == 1 || v.size() == 990);
+ }
+ };
+
+ // First three 110KB files are not going to second path.
+ // After that, (100K, 200K)
+ for (int num = 0; num < 3; num++) {
+ generate_file();
+ }
+
+ // Another 110KB file triggers a compaction to a 400K file in the second path
+ generate_file();
+ check_sstfilecount(2, 1);
+
+ // (1, 4)
+ generate_file();
+ check_sstfilecount(2, 1);
+ check_sstfilecount(0, 1);
+
+ // (1,1,4) -> (2, 4)
+ generate_file();
+ check_sstfilecount(2, 1);
+ check_sstfilecount(1, 1);
+ check_sstfilecount(0, 0);
+
+ // (1, 2, 4) -> (3, 4)
+ generate_file();
+ check_sstfilecount(2, 1);
+ check_sstfilecount(1, 1);
+ check_sstfilecount(0, 0);
+
+ // (1, 3, 4) -> (8)
+ generate_file();
+ check_sstfilecount(3, 1);
+
+ // (1, 8)
+ generate_file();
+ check_sstfilecount(3, 1);
+ check_sstfilecount(0, 1);
+
+ // (1, 1, 8) -> (2, 8)
+ generate_file();
+ check_sstfilecount(3, 1);
+ check_sstfilecount(1, 1);
+
+ // (1, 2, 8) -> (3, 8)
+ generate_file();
+ check_sstfilecount(3, 1);
+ check_sstfilecount(1, 1);
+ check_sstfilecount(0, 0);
+
+ // (1, 3, 8) -> (4, 8)
+ generate_file();
+ check_sstfilecount(2, 1);
+ check_sstfilecount(3, 1);
+
+ // (1, 4, 8) -> (5, 8)
+ generate_file();
+ check_sstfilecount(3, 1);
+ check_sstfilecount(2, 1);
+ check_sstfilecount(0, 0);
+
+ check_getvalues();
+
+ ReopenWithColumnFamilies({"default", "one", "two"}, option_vector);
+
+ check_getvalues();
+
+ Destroy(options, true);
+}
+
+TEST_P(DBTestUniversalCompaction, IncreaseUniversalCompactionNumLevels) {
+ std::function<void(int)> verify_func = [&](int num_keys_in_db) {
+ std::string keys_in_db;
+ Iterator* iter = dbfull()->NewIterator(ReadOptions(), handles_[1]);
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ keys_in_db.append(iter->key().ToString());
+ keys_in_db.push_back(',');
+ }
+ delete iter;
+
+ std::string expected_keys;
+ for (int i = 0; i <= num_keys_in_db; i++) {
+ expected_keys.append(Key(i));
+ expected_keys.push_back(',');
+ }
+
+ ASSERT_EQ(keys_in_db, expected_keys);
+ };
+
+ Random rnd(301);
+ int max_key1 = 200;
+ int max_key2 = 600;
+ int max_key3 = 800;
+ const int KNumKeysPerFile = 10;
+
+ // Stage 1: open a DB with universal compaction, num_levels=1
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.num_levels = 1;
+ options.write_buffer_size = 200 << 10; // 200KB
+ options.level0_file_num_compaction_trigger = 3;
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(KNumKeysPerFile));
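+ // (test::NewSpecialSkipListFactory(N) produces memtables that report
+ // themselves full after roughly N entries, so each file here holds a
+ // predictable number of keys.)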
+ options = CurrentOptions(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ for (int i = 0; i <= max_key1; i++) {
+ // each value is 10K
+ ASSERT_OK(Put(1, Key(i), rnd.RandomString(10000)));
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1]));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ // Stage 2: reopen with universal compaction, num_levels=4
+ options.compaction_style = kCompactionStyleUniversal;
+ options.num_levels = 4;
+ options = CurrentOptions(options);
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+ verify_func(max_key1);
+
+ // Insert more keys
+ for (int i = max_key1 + 1; i <= max_key2; i++) {
+ // each value is 10K
+ ASSERT_OK(Put(1, Key(i), rnd.RandomString(10000)));
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1]));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ verify_func(max_key2);
+ // Compaction to non-L0 has happened.
+ ASSERT_GT(NumTableFilesAtLevel(options.num_levels - 1, 1), 0);
+
+ // Stage 3: Compact everything back into level 0, then revert to num_levels=1.
+ options.num_levels = 4;
+ options.target_file_size_base = INT_MAX;
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ // Compact all to level 0
+ CompactRangeOptions compact_options;
+ compact_options.change_level = true;
+ compact_options.target_level = 0;
+ compact_options.exclusive_manual_compaction = exclusive_manual_compaction_;
+ ASSERT_OK(
+ dbfull()->CompactRange(compact_options, handles_[1], nullptr, nullptr));
+ // Need to restart it once to remove higher level records in manifest.
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ // Final reopen
+ options.compaction_style = kCompactionStyleUniversal;
+ options.num_levels = 1;
+ options = CurrentOptions(options);
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+ // Insert more keys
+ for (int i = max_key2 + 1; i <= max_key3; i++) {
+ // each value is 10K
+ ASSERT_OK(Put(1, Key(i), rnd.RandomString(10000)));
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1]));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ verify_func(max_key3);
+}
+
+TEST_P(DBTestUniversalCompaction, UniversalCompactionSecondPathRatio) {
+ if (!Snappy_Supported()) {
+ return;
+ }
+ Options options = CurrentOptions();
+ options.db_paths.emplace_back(dbname_, 500 * 1024);
+ options.db_paths.emplace_back(dbname_ + "_2", 1024 * 1024 * 1024);
+ options.compaction_style = kCompactionStyleUniversal;
+ options.compaction_options_universal.size_ratio = 5;
+ options.write_buffer_size = 111 << 10; // 111KB
+ options.arena_block_size = 4 << 10;
+ options.level0_file_num_compaction_trigger = 2;
+ options.num_levels = 1;
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(KNumKeysByGenerateNewFile - 1));
+
+ std::vector<std::string> filenames;
+ if (env_->GetChildren(options.db_paths[1].path, &filenames).ok()) {
+ // Delete archival files.
+ for (size_t i = 0; i < filenames.size(); ++i) {
+ ASSERT_OK(
+ env_->DeleteFile(options.db_paths[1].path + "/" + filenames[i]));
+ }
+ ASSERT_OK(env_->DeleteDir(options.db_paths[1].path));
+ }
+ Reopen(options);
+
+ Random rnd(301);
+ int key_idx = 0;
+
+ // First three 110KB files are not going to second path.
+ // After that, (100K, 200K)
+ for (int num = 0; num < 3; num++) {
+ GenerateNewFile(&rnd, &key_idx);
+ }
+
+ // Another 110KB file triggers a compaction to a 400K file in the second path
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+
+ // (1, 4)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+ // (1,1,4) -> (2, 4)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+ // (1, 2, 4) -> (3, 4)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ(2, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+ // (1, 3, 4) -> (8)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+ // (1, 8)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+ // (1, 1, 8) -> (2, 8)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+ // (1, 2, 8) -> (3, 8)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ(2, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+ // (1, 3, 8) -> (4, 8)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ(2, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+ // (1, 4, 8) -> (5, 8)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ(2, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+ for (int i = 0; i < key_idx; i++) {
+ auto v = Get(Key(i));
+ ASSERT_NE(v, "NOT_FOUND");
+ ASSERT_TRUE(v.size() == 1 || v.size() == 990);
+ }
+
+ Reopen(options);
+
+ for (int i = 0; i < key_idx; i++) {
+ auto v = Get(Key(i));
+ ASSERT_NE(v, "NOT_FOUND");
+ ASSERT_TRUE(v.size() == 1 || v.size() == 990);
+ }
+
+ Destroy(options);
+}
+
+TEST_P(DBTestUniversalCompaction, ConcurrentBottomPriLowPriCompactions) {
+ if (num_levels_ == 1) {
+ // for single-level universal, everything's bottom level so nothing should
+ // be executed in bottom-pri thread pool.
+ return;
+ }
+ const int kNumFilesTrigger = 3;
+ Env::Default()->SetBackgroundThreads(1, Env::Priority::BOTTOM);
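+ // The bottom-priority pool now has one thread, so compactions that reach the
+ // bottommost level can be forwarded to it (see the ForwardToBottomPriPool
+ // sync point below) and run concurrently with low-priority compactions.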
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.max_background_compactions = 2;
+ options.num_levels = num_levels_;
+ options.write_buffer_size = 100 << 10; // 100KB
+ options.target_file_size_base = 32 << 10; // 32KB
+ options.level0_file_num_compaction_trigger = kNumFilesTrigger;
+ // Trigger compaction if size amplification exceeds 110%
+ options.compaction_options_universal.max_size_amplification_percent = 110;
+ DestroyAndReopen(options);
+
+ // Need to get a token to enable compaction parallelism up to
+ // `max_background_compactions` jobs.
+ auto pressure_token =
+ dbfull()->TEST_write_controler().GetCompactionPressureToken();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {// wait for the full compaction to be picked before adding files intended
+ // for the second one.
+ {"DBImpl::BackgroundCompaction:ForwardToBottomPriPool",
+ "DBTestUniversalCompaction:ConcurrentBottomPriLowPriCompactions:0"},
+ // the full (bottom-pri) compaction waits until a partial (low-pri)
+ // compaction has started to verify they can run in parallel.
+ {"DBImpl::BackgroundCompaction:NonTrivial",
+ "DBImpl::BGWorkBottomCompaction"}});
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ Random rnd(301);
+ for (int i = 0; i < 2; ++i) {
+ for (int num = 0; num < kNumFilesTrigger; num++) {
+ int key_idx = 0;
+ GenerateNewFile(&rnd, &key_idx, true /* no_wait */);
+ // use no_wait above because that one waits for flush and compaction. We
+ // don't want to wait for compaction because the full compaction is
+ // intentionally blocked while more files are flushed.
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ }
+ if (i == 0) {
+ TEST_SYNC_POINT(
+ "DBTestUniversalCompaction:ConcurrentBottomPriLowPriCompactions:0");
+ }
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ // First compaction should output to bottom level. Second should output to L0
+ // since older L0 files pending compaction prevent it from being placed lower.
+ ASSERT_EQ(NumSortedRuns(), 2);
+ ASSERT_GT(NumTableFilesAtLevel(0), 0);
+ ASSERT_GT(NumTableFilesAtLevel(num_levels_ - 1), 0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ Env::Default()->SetBackgroundThreads(0, Env::Priority::BOTTOM);
+}
+
+TEST_P(DBTestUniversalCompaction, RecalculateScoreAfterPicking) {
+ // Regression test for extra compactions scheduled. Once enough compactions
+ // have been scheduled to bring the score below one, we should stop
+ // scheduling more; otherwise, other CFs/DBs may be delayed unnecessarily.
+ const int kNumFilesTrigger = 8;
+ Options options = CurrentOptions();
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(KNumKeysByGenerateNewFile - 1));
+ options.compaction_options_universal.max_merge_width = kNumFilesTrigger / 2;
+ options.compaction_options_universal.max_size_amplification_percent =
+ static_cast<unsigned int>(-1);
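+ // (static_cast<unsigned int>(-1) is UINT_MAX, which effectively disables
+ // size-amplification compactions; combined with max_merge_width above, each
+ // picked compaction merges at most kNumFilesTrigger / 2 = 4 runs.)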
+ options.compaction_style = kCompactionStyleUniversal;
+ options.level0_file_num_compaction_trigger = kNumFilesTrigger;
+ options.num_levels = num_levels_;
+ Reopen(options);
+
+ std::atomic<int> num_compactions_attempted(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:Start",
+ [&](void* /*arg*/) { ++num_compactions_attempted; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Random rnd(301);
+ for (int num = 0; num < kNumFilesTrigger; num++) {
+ ASSERT_EQ(NumSortedRuns(), num);
+ int key_idx = 0;
+ GenerateNewFile(&rnd, &key_idx);
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ // Compacting the first four files was enough to bring the score below one so
+ // there's no need to schedule any more compactions.
+ ASSERT_EQ(1, num_compactions_attempted);
+ ASSERT_EQ(NumSortedRuns(), 5);
+}
+
+TEST_P(DBTestUniversalCompaction, FinalSortedRunCompactFilesConflict) {
+ // Regression test for conflict between:
+ // (1) Running CompactFiles including file in the final sorted run; and
+ // (2) Picking universal size-amp-triggered compaction, which always includes
+ // the final sorted run.
+ if (exclusive_manual_compaction_) {
+ return;
+ }
+
+ Options opts = CurrentOptions();
+ opts.compaction_style = kCompactionStyleUniversal;
+ opts.compaction_options_universal.max_size_amplification_percent = 50;
+ opts.compaction_options_universal.min_merge_width = 2;
+ opts.compression = kNoCompression;
+ opts.level0_file_num_compaction_trigger = 2;
+ opts.max_background_compactions = 2;
+ opts.num_levels = num_levels_;
+ Reopen(opts);
+
+ // make sure compaction jobs can be parallelized
+ auto stop_token =
+ dbfull()->TEST_write_controler().GetCompactionPressureToken();
+
+ ASSERT_OK(Put("key", "val"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ(NumTableFilesAtLevel(num_levels_ - 1), 1);
+ ColumnFamilyMetaData cf_meta;
+ ColumnFamilyHandle* default_cfh = db_->DefaultColumnFamily();
+ dbfull()->GetColumnFamilyMetaData(default_cfh, &cf_meta);
+ ASSERT_EQ(1, cf_meta.levels[num_levels_ - 1].files.size());
+ std::string first_sst_filename =
+ cf_meta.levels[num_levels_ - 1].files[0].name;
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"CompactFilesImpl:0",
+ "DBTestUniversalCompaction:FinalSortedRunCompactFilesConflict:0"},
+ {"DBImpl::BackgroundCompaction():AfterPickCompaction",
+ "CompactFilesImpl:1"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ port::Thread compact_files_thread([&]() {
+ ASSERT_OK(dbfull()->CompactFiles(CompactionOptions(), default_cfh,
+ {first_sst_filename}, num_levels_ - 1));
+ });
+
+ TEST_SYNC_POINT(
+ "DBTestUniversalCompaction:FinalSortedRunCompactFilesConflict:0");
+ for (int i = 0; i < 2; ++i) {
+ ASSERT_OK(Put("key", "val"));
+ ASSERT_OK(Flush());
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ compact_files_thread.join();
+}
+
+INSTANTIATE_TEST_CASE_P(NumLevels, DBTestUniversalCompaction,
+ ::testing::Combine(::testing::Values(1, 3, 5),
+ ::testing::Bool()));
+
+class DBTestUniversalManualCompactionOutputPathId
+ : public DBTestUniversalCompactionBase {
+ public:
+ DBTestUniversalManualCompactionOutputPathId()
+ : DBTestUniversalCompactionBase(
+ "/db_universal_compaction_manual_pid_test") {}
+};
+
+TEST_P(DBTestUniversalManualCompactionOutputPathId,
+ ManualCompactionOutputPathId) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.db_paths.emplace_back(dbname_, 1000000000);
+ options.db_paths.emplace_back(dbname_ + "_2", 1000000000);
+ options.compaction_style = kCompactionStyleUniversal;
+ options.num_levels = num_levels_;
+ options.target_file_size_base = 1 << 30; // Big size
+ options.level0_file_num_compaction_trigger = 10;
+ Destroy(options);
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+ MakeTables(3, "p", "q", 1);
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(2, TotalLiveFiles(1));
+ ASSERT_EQ(2, GetSstFileCount(options.db_paths[0].path));
+ ASSERT_EQ(0, GetSstFileCount(options.db_paths[1].path));
+
+ // Full compaction to DB path 1
+ CompactRangeOptions compact_options;
+ compact_options.target_path_id = 1;
+ compact_options.exclusive_manual_compaction = exclusive_manual_compaction_;
+ ASSERT_OK(db_->CompactRange(compact_options, handles_[1], nullptr, nullptr));
+ ASSERT_EQ(1, TotalLiveFiles(1));
+ ASSERT_EQ(0, GetSstFileCount(options.db_paths[0].path));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+
+ ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"}, options);
+ ASSERT_EQ(1, TotalLiveFiles(1));
+ ASSERT_EQ(0, GetSstFileCount(options.db_paths[0].path));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+
+ MakeTables(1, "p", "q", 1);
+ ASSERT_EQ(2, TotalLiveFiles(1));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[0].path));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+
+ ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"}, options);
+ ASSERT_EQ(2, TotalLiveFiles(1));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[0].path));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+
+ // Full compaction to DB path 0
+ compact_options.target_path_id = 0;
+ compact_options.exclusive_manual_compaction = exclusive_manual_compaction_;
+ ASSERT_OK(db_->CompactRange(compact_options, handles_[1], nullptr, nullptr));
+ ASSERT_EQ(1, TotalLiveFiles(1));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[0].path));
+ ASSERT_EQ(0, GetSstFileCount(options.db_paths[1].path));
+
+ // Fail when compacting to an invalid path ID
+ compact_options.target_path_id = 2;
+ compact_options.exclusive_manual_compaction = exclusive_manual_compaction_;
+ ASSERT_TRUE(db_->CompactRange(compact_options, handles_[1], nullptr, nullptr)
+ .IsInvalidArgument());
+}
+
+INSTANTIATE_TEST_CASE_P(OutputPathId,
+ DBTestUniversalManualCompactionOutputPathId,
+ ::testing::Combine(::testing::Values(1, 8),
+ ::testing::Bool()));
+
+TEST_F(DBTestUniversalCompaction2, BasicL0toL1) {
+ const int kNumKeys = 3000;
+ const int kWindowSize = 100;
+ const int kNumDelsTrigger = 90;
+
+ Options opts = CurrentOptions();
+ opts.table_properties_collector_factories.emplace_back(
+ NewCompactOnDeletionCollectorFactory(kWindowSize, kNumDelsTrigger));
+ opts.compaction_style = kCompactionStyleUniversal;
+ opts.level0_file_num_compaction_trigger = 2;
+ opts.compression = kNoCompression;
+ opts.compaction_options_universal.size_ratio = 10;
+ opts.compaction_options_universal.min_merge_width = 2;
+ opts.compaction_options_universal.max_size_amplification_percent = 200;
+ Reopen(opts);
+
+ // add an L1 file to prevent tombstones from dropping due to obsolescence
+ // during flush
+ int i;
+ for (i = 0; i < 2000; ++i) {
+ ASSERT_OK(Put(Key(i), "val"));
+ }
+ ASSERT_OK(Flush());
+ // MoveFilesToLevel(6);
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+ for (i = 1999; i < kNumKeys; ++i) {
+ if (i >= kNumKeys - kWindowSize &&
+ i < kNumKeys - kWindowSize + kNumDelsTrigger) {
+ ASSERT_OK(Delete(Key(i)));
+ } else {
+ ASSERT_OK(Put(Key(i), "val"));
+ }
+ }
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+ ASSERT_GT(NumTableFilesAtLevel(6), 0);
+}
+
+#if defined(ENABLE_SINGLE_LEVEL_DTC)
+TEST_F(DBTestUniversalCompaction2, SingleLevel) {
+ const int kNumKeys = 3000;
+ const int kWindowSize = 100;
+ const int kNumDelsTrigger = 90;
+
+ Options opts = CurrentOptions();
+ opts.table_properties_collector_factories.emplace_back(
+ NewCompactOnDeletionCollectorFactory(kWindowSize, kNumDelsTrigger));
+ opts.compaction_style = kCompactionStyleUniversal;
+ opts.level0_file_num_compaction_trigger = 2;
+ opts.compression = kNoCompression;
+ opts.num_levels = 1;
+ opts.compaction_options_universal.size_ratio = 10;
+ opts.compaction_options_universal.min_merge_width = 2;
+ opts.compaction_options_universal.max_size_amplification_percent = 200;
+ Reopen(opts);
+
+ // add an L1 file to prevent tombstones from dropping due to obsolescence
+ // during flush
+ int i;
+ for (i = 0; i < 2000; ++i) {
+ ASSERT_OK(Put(Key(i), "val"));
+ }
+ ASSERT_OK(Flush());
+
+ for (i = 1999; i < kNumKeys; ++i) {
+ if (i >= kNumKeys - kWindowSize &&
+ i < kNumKeys - kWindowSize + kNumDelsTrigger) {
+ ASSERT_OK(Delete(Key(i)));
+ } else {
+ ASSERT_OK(Put(Key(i), "val"));
+ }
+ }
+  ASSERT_OK(Flush());
+
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(1, NumTableFilesAtLevel(0));
+}
+#endif // ENABLE_SINGLE_LEVEL_DTC
+
+TEST_F(DBTestUniversalCompaction2, MultipleLevels) {
+ const int kWindowSize = 100;
+ const int kNumDelsTrigger = 90;
+
+ Options opts = CurrentOptions();
+ opts.table_properties_collector_factories.emplace_back(
+ NewCompactOnDeletionCollectorFactory(kWindowSize, kNumDelsTrigger));
+ opts.compaction_style = kCompactionStyleUniversal;
+ opts.level0_file_num_compaction_trigger = 4;
+ opts.compression = kNoCompression;
+ opts.compaction_options_universal.size_ratio = 10;
+ opts.compaction_options_universal.min_merge_width = 2;
+ opts.compaction_options_universal.max_size_amplification_percent = 200;
+ Reopen(opts);
+
+ // add an L1 file to prevent tombstones from dropping due to obsolescence
+ // during flush
+ int i;
+ for (i = 0; i < 500; ++i) {
+ ASSERT_OK(Put(Key(i), "val"));
+ }
+ ASSERT_OK(Flush());
+ for (i = 500; i < 1000; ++i) {
+ ASSERT_OK(Put(Key(i), "val"));
+ }
+ ASSERT_OK(Flush());
+ for (i = 1000; i < 1500; ++i) {
+ ASSERT_OK(Put(Key(i), "val"));
+ }
+ ASSERT_OK(Flush());
+ for (i = 1500; i < 2000; ++i) {
+ ASSERT_OK(Put(Key(i), "val"));
+ }
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+ ASSERT_GT(NumTableFilesAtLevel(6), 0);
+
+ for (i = 1999; i < 2333; ++i) {
+ ASSERT_OK(Put(Key(i), "val"));
+ }
+ ASSERT_OK(Flush());
+ for (i = 2333; i < 2666; ++i) {
+ ASSERT_OK(Put(Key(i), "val"));
+ }
+ ASSERT_OK(Flush());
+ for (i = 2666; i < 2999; ++i) {
+ ASSERT_OK(Put(Key(i), "val"));
+ }
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+ ASSERT_GT(NumTableFilesAtLevel(6), 0);
+ ASSERT_GT(NumTableFilesAtLevel(5), 0);
+
+ for (i = 1900; i < 2100; ++i) {
+ ASSERT_OK(Delete(Key(i)));
+ }
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+ ASSERT_EQ(0, NumTableFilesAtLevel(1));
+ ASSERT_EQ(0, NumTableFilesAtLevel(2));
+ ASSERT_EQ(0, NumTableFilesAtLevel(3));
+ ASSERT_EQ(0, NumTableFilesAtLevel(4));
+ ASSERT_EQ(0, NumTableFilesAtLevel(5));
+ ASSERT_GT(NumTableFilesAtLevel(6), 0);
+}
+
+TEST_F(DBTestUniversalCompaction2, OverlappingL0) {
+ const int kWindowSize = 100;
+ const int kNumDelsTrigger = 90;
+
+ Options opts = CurrentOptions();
+ opts.table_properties_collector_factories.emplace_back(
+ NewCompactOnDeletionCollectorFactory(kWindowSize, kNumDelsTrigger));
+ opts.compaction_style = kCompactionStyleUniversal;
+ opts.level0_file_num_compaction_trigger = 5;
+ opts.compression = kNoCompression;
+ opts.compaction_options_universal.size_ratio = 10;
+ opts.compaction_options_universal.min_merge_width = 2;
+ opts.compaction_options_universal.max_size_amplification_percent = 200;
+ Reopen(opts);
+
+ // add an L1 file to prevent tombstones from dropping due to obsolescence
+ // during flush
+ int i;
+ for (i = 0; i < 2000; ++i) {
+ ASSERT_OK(Put(Key(i), "val"));
+ }
+ ASSERT_OK(Flush());
+ for (i = 2000; i < 3000; ++i) {
+ ASSERT_OK(Put(Key(i), "val"));
+ }
+ ASSERT_OK(Flush());
+ for (i = 3500; i < 4000; ++i) {
+ ASSERT_OK(Put(Key(i), "val"));
+ }
+ ASSERT_OK(Flush());
+ for (i = 2900; i < 3100; ++i) {
+ ASSERT_OK(Delete(Key(i)));
+ }
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(2, NumTableFilesAtLevel(0));
+ ASSERT_GT(NumTableFilesAtLevel(6), 0);
+}
+
+TEST_F(DBTestUniversalCompaction2, IngestBehind) {
+ const int kNumKeys = 3000;
+ const int kWindowSize = 100;
+ const int kNumDelsTrigger = 90;
+
+ Options opts = CurrentOptions();
+ opts.table_properties_collector_factories.emplace_back(
+ NewCompactOnDeletionCollectorFactory(kWindowSize, kNumDelsTrigger));
+ opts.compaction_style = kCompactionStyleUniversal;
+ opts.level0_file_num_compaction_trigger = 2;
+ opts.compression = kNoCompression;
+ opts.allow_ingest_behind = true;
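+  // With allow_ingest_behind, the bottommost level is reserved for ingested
+  // files, so compaction output is expected to stop at the second-to-last
+  // level (verified below).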
+ opts.compaction_options_universal.size_ratio = 10;
+ opts.compaction_options_universal.min_merge_width = 2;
+ opts.compaction_options_universal.max_size_amplification_percent = 200;
+ Reopen(opts);
+
+ // add an L1 file to prevent tombstones from dropping due to obsolescence
+ // during flush
+ int i;
+ for (i = 0; i < 2000; ++i) {
+ ASSERT_OK(Put(Key(i), "val"));
+ }
+ ASSERT_OK(Flush());
+ // MoveFilesToLevel(6);
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+ for (i = 1999; i < kNumKeys; ++i) {
+ if (i >= kNumKeys - kWindowSize &&
+ i < kNumKeys - kWindowSize + kNumDelsTrigger) {
+ ASSERT_OK(Delete(Key(i)));
+ } else {
+ ASSERT_OK(Put(Key(i), "val"));
+ }
+ }
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+ ASSERT_EQ(0, NumTableFilesAtLevel(6));
+ ASSERT_GT(NumTableFilesAtLevel(5), 0);
+}
+
+TEST_F(DBTestUniversalCompaction2, PeriodicCompactionDefault) {
+ Options options;
+ options.compaction_style = kCompactionStyleUniversal;
+ options.env = env_;
+ KeepFilterFactory* filter = new KeepFilterFactory(true);
+ options.compaction_filter_factory.reset(filter);
+ Reopen(options);
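+  // With a compaction filter factory configured, periodic compaction is
+  // expected to default to 30 days.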
+ ASSERT_EQ(30 * 24 * 60 * 60,
+ dbfull()->GetOptions().periodic_compaction_seconds);
+
+ KeepFilter df;
+ options.compaction_filter_factory.reset();
+ options.compaction_filter = &df;
+ Reopen(options);
+ ASSERT_EQ(30 * 24 * 60 * 60,
+ dbfull()->GetOptions().periodic_compaction_seconds);
+
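+  // With a TTL configured and no compaction filter, the default is expected
+  // to follow the TTL instead.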
+ options.ttl = 60 * 24 * 60 * 60;
+ options.compaction_filter = nullptr;
+ Reopen(options);
+ ASSERT_EQ(60 * 24 * 60 * 60,
+ dbfull()->GetOptions().periodic_compaction_seconds);
+}
+
+TEST_F(DBTestUniversalCompaction2, PeriodicCompaction) {
+ Options opts = CurrentOptions();
+ opts.env = env_;
+ opts.compaction_style = kCompactionStyleUniversal;
+ opts.level0_file_num_compaction_trigger = 10;
+ opts.max_open_files = -1;
+ opts.compaction_options_universal.size_ratio = 10;
+ opts.compaction_options_universal.min_merge_width = 2;
+ opts.compaction_options_universal.max_size_amplification_percent = 200;
+ opts.periodic_compaction_seconds = 48 * 60 * 60; // 2 days
+ opts.num_levels = 5;
+ env_->SetMockSleep();
+ Reopen(opts);
+
+ // NOTE: Presumed unnecessary and removed: resetting mock time in env
+
+ int periodic_compactions = 0;
+ int start_level = -1;
+ int output_level = -1;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "UniversalCompactionPicker::PickPeriodicCompaction:Return",
+ [&](void* arg) {
+ Compaction* compaction = reinterpret_cast<Compaction*>(arg);
+ ASSERT_TRUE(arg != nullptr);
+ ASSERT_TRUE(compaction->compaction_reason() ==
+ CompactionReason::kPeriodicCompaction);
+ start_level = compaction->start_level();
+ output_level = compaction->output_level();
+ periodic_compactions++;
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  // Case 1: Oldest flushed file exceeds periodic compaction threshold.
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Flush());
+ ASSERT_EQ(0, periodic_compactions);
+ // Move clock forward so that the flushed file would qualify periodic
+ // compaction.
+ env_->MockSleepForSeconds(48 * 60 * 60 + 100);
+
+  // Another flush will trigger a compaction of the oldest file.
+ ASSERT_OK(Put("foo", "bar2"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ ASSERT_EQ(1, periodic_compactions);
+ ASSERT_EQ(0, start_level);
+ ASSERT_EQ(4, output_level);
+
+  // Case 2: Oldest compacted file exceeds periodic compaction threshold
+ periodic_compactions = 0;
+  // A flush doesn't trigger a periodic compaction before the threshold is hit
+ ASSERT_OK(Put("foo", "bar2"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(0, periodic_compactions);
+
+  // After the periodic compaction threshold is hit, a flush will trigger
+  // a compaction
+ ASSERT_OK(Put("foo", "bar2"));
+ env_->MockSleepForSeconds(48 * 60 * 60 + 100);
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(1, periodic_compactions);
+ ASSERT_EQ(0, start_level);
+ ASSERT_EQ(4, output_level);
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // !defined(ROCKSDB_LITE)
+
+int main(int argc, char** argv) {
+#if !defined(ROCKSDB_LITE)
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+#else
+ (void)argc;
+ (void)argv;
+ return 0;
+#endif
+}
diff --git a/src/rocksdb/db/db_wal_test.cc b/src/rocksdb/db/db_wal_test.cc
new file mode 100644
index 000000000..5b5ec76af
--- /dev/null
+++ b/src/rocksdb/db/db_wal_test.cc
@@ -0,0 +1,2314 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/db_test_util.h"
+#include "options/options_helper.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "rocksdb/file_system.h"
+#include "test_util/sync_point.h"
+#include "utilities/fault_injection_env.h"
+#include "utilities/fault_injection_fs.h"
+
+namespace ROCKSDB_NAMESPACE {
+class DBWALTestBase : public DBTestBase {
+ protected:
+ explicit DBWALTestBase(const std::string& dir_name)
+ : DBTestBase(dir_name, /*env_do_fsync=*/true) {}
+
+#if defined(ROCKSDB_PLATFORM_POSIX)
+ public:
+#if defined(ROCKSDB_FALLOCATE_PRESENT)
+ bool IsFallocateSupported() {
+    // Test fallocate support of the running file system.
+    // Skip this test if fallocate is not supported.
+ std::string fname_test_fallocate = dbname_ + "/preallocate_testfile";
+ int fd = -1;
+ do {
+ fd = open(fname_test_fallocate.c_str(), O_CREAT | O_RDWR | O_TRUNC, 0644);
+ } while (fd < 0 && errno == EINTR);
+ assert(fd > 0);
+ int alloc_status = fallocate(fd, 0, 0, 1);
+ int err_number = errno;
+ close(fd);
+ assert(env_->DeleteFile(fname_test_fallocate) == Status::OK());
+ if (err_number == ENOSYS || err_number == EOPNOTSUPP) {
+ fprintf(stderr, "Skipped preallocated space check: %s\n",
+ errnoStr(err_number).c_str());
+ return false;
+ }
+ assert(alloc_status == 0);
+ return true;
+ }
+#endif // ROCKSDB_FALLOCATE_PRESENT
+
+ uint64_t GetAllocatedFileSize(std::string file_name) {
+ struct stat sbuf;
+ int err = stat(file_name.c_str(), &sbuf);
+ assert(err == 0);
+ return sbuf.st_blocks * 512;
+ }
+#endif // ROCKSDB_PLATFORM_POSIX
+};
+
+class DBWALTest : public DBWALTestBase {
+ public:
+ DBWALTest() : DBWALTestBase("/db_wal_test") {}
+};
+
+// A SpecialEnv enriched to give more insight about deleted files
+class EnrichedSpecialEnv : public SpecialEnv {
+ public:
+ explicit EnrichedSpecialEnv(Env* base) : SpecialEnv(base) {}
+ Status NewSequentialFile(const std::string& f,
+ std::unique_ptr<SequentialFile>* r,
+ const EnvOptions& soptions) override {
+ InstrumentedMutexLock l(&env_mutex_);
+ if (f == skipped_wal) {
+ deleted_wal_reopened = true;
+ if (IsWAL(f) && largest_deleted_wal.size() != 0 &&
+ f.compare(largest_deleted_wal) <= 0) {
+ gap_in_wals = true;
+ }
+ }
+ return SpecialEnv::NewSequentialFile(f, r, soptions);
+ }
+ Status DeleteFile(const std::string& fname) override {
+ if (IsWAL(fname)) {
+ deleted_wal_cnt++;
+ InstrumentedMutexLock l(&env_mutex_);
+ // If this is the first WAL, remember its name and skip deleting it. We
+ // remember its name partly because the application might attempt to
+ // delete the file again.
+ if (skipped_wal.size() != 0 && skipped_wal != fname) {
+ if (largest_deleted_wal.size() == 0 ||
+ largest_deleted_wal.compare(fname) < 0) {
+ largest_deleted_wal = fname;
+ }
+ } else {
+ skipped_wal = fname;
+ return Status::OK();
+ }
+ }
+ return SpecialEnv::DeleteFile(fname);
+ }
+ bool IsWAL(const std::string& fname) {
+ // printf("iswal %s\n", fname.c_str());
+ return fname.compare(fname.size() - 3, 3, "log") == 0;
+ }
+
+ InstrumentedMutex env_mutex_;
+ // the wal whose actual delete was skipped by the env
+ std::string skipped_wal = "";
+ // the largest WAL that was requested to be deleted
+ std::string largest_deleted_wal = "";
+ // number of WALs that were successfully deleted
+ std::atomic<size_t> deleted_wal_cnt = {0};
+ // the WAL whose delete from fs was skipped is reopened during recovery
+ std::atomic<bool> deleted_wal_reopened = {false};
+ // whether a gap in the WALs was detected during recovery
+ std::atomic<bool> gap_in_wals = {false};
+};
+
+class DBWALTestWithEnrichedEnv : public DBTestBase {
+ public:
+ DBWALTestWithEnrichedEnv()
+ : DBTestBase("db_wal_test", /*env_do_fsync=*/true) {
+ enriched_env_ = new EnrichedSpecialEnv(env_->target());
+ auto options = CurrentOptions();
+ options.env = enriched_env_;
+ options.allow_2pc = true;
+ Reopen(options);
+ delete env_;
+ // to be deleted by the parent class
+ env_ = enriched_env_;
+ }
+
+ protected:
+ EnrichedSpecialEnv* enriched_env_;
+};
+
+// Test that recovery successfully avoids gaps between the logs. One known
+// scenario that could cause a gap is the application issuing WAL deletions
+// out of order. For the sake of simplicity in the test, here we create the
+// gap by manipulating the env to skip deletion of the first WAL but not the
+// ones after it.
+TEST_F(DBWALTestWithEnrichedEnv, SkipDeletedWALs) {
+ auto options = last_options_;
+ // To cause frequent WAL deletion
+ options.write_buffer_size = 128;
+ Reopen(options);
+
+ WriteOptions writeOpt = WriteOptions();
+ for (int i = 0; i < 128 * 5; i++) {
+ ASSERT_OK(dbfull()->Put(writeOpt, "foo", "v1"));
+ }
+ FlushOptions fo;
+ fo.wait = true;
+ ASSERT_OK(db_->Flush(fo));
+
+ // some wals are deleted
+ ASSERT_NE(0, enriched_env_->deleted_wal_cnt);
+ // but not the first one
+ ASSERT_NE(0, enriched_env_->skipped_wal.size());
+
+ // Test that the WAL that was not deleted will be skipped during recovery
+ options = last_options_;
+ Reopen(options);
+ ASSERT_FALSE(enriched_env_->deleted_wal_reopened);
+ ASSERT_FALSE(enriched_env_->gap_in_wals);
+}
+
+TEST_F(DBWALTest, WAL) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ WriteOptions writeOpt = WriteOptions();
+ writeOpt.disableWAL = true;
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v1"));
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v1"));
+
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ ASSERT_EQ("v1", Get(1, "foo"));
+ ASSERT_EQ("v1", Get(1, "bar"));
+
+ writeOpt.disableWAL = false;
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v2"));
+ writeOpt.disableWAL = true;
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v2"));
+
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+    // Both values should be present.
+ ASSERT_EQ("v2", Get(1, "bar"));
+ ASSERT_EQ("v2", Get(1, "foo"));
+
+ writeOpt.disableWAL = true;
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v3"));
+ writeOpt.disableWAL = false;
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v3"));
+
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+    // Again, both values should be present.
+ ASSERT_EQ("v3", Get(1, "foo"));
+ ASSERT_EQ("v3", Get(1, "bar"));
+ } while (ChangeWalOptions());
+}
+
+TEST_F(DBWALTest, RollLog) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ ASSERT_OK(Put(1, "foo", "v1"));
+ ASSERT_OK(Put(1, "baz", "v5"));
+
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ for (int i = 0; i < 10; i++) {
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ }
+ ASSERT_OK(Put(1, "foo", "v4"));
+ for (int i = 0; i < 10; i++) {
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ }
+ } while (ChangeWalOptions());
+}
+
+TEST_F(DBWALTest, SyncWALNotBlockWrite) {
+ Options options = CurrentOptions();
+ options.max_write_buffer_number = 4;
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put("foo1", "bar1"));
+ ASSERT_OK(Put("foo5", "bar5"));
+
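+  // The dependencies below hold SyncWAL inside SyncWithoutFlush while the
+  // main thread performs writes and a flush, verifying that SyncWAL does not
+  // block them.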
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ {"WritableFileWriter::SyncWithoutFlush:1",
+ "DBWALTest::SyncWALNotBlockWrite:1"},
+ {"DBWALTest::SyncWALNotBlockWrite:2",
+ "WritableFileWriter::SyncWithoutFlush:2"},
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ ROCKSDB_NAMESPACE::port::Thread thread([&]() { ASSERT_OK(db_->SyncWAL()); });
+
+ TEST_SYNC_POINT("DBWALTest::SyncWALNotBlockWrite:1");
+ ASSERT_OK(Put("foo2", "bar2"));
+ ASSERT_OK(Put("foo3", "bar3"));
+ FlushOptions fo;
+ fo.wait = false;
+ ASSERT_OK(db_->Flush(fo));
+ ASSERT_OK(Put("foo4", "bar4"));
+
+ TEST_SYNC_POINT("DBWALTest::SyncWALNotBlockWrite:2");
+
+ thread.join();
+
+ ASSERT_EQ(Get("foo1"), "bar1");
+ ASSERT_EQ(Get("foo2"), "bar2");
+ ASSERT_EQ(Get("foo3"), "bar3");
+ ASSERT_EQ(Get("foo4"), "bar4");
+ ASSERT_EQ(Get("foo5"), "bar5");
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBWALTest, SyncWALNotWaitWrite) {
+ ASSERT_OK(Put("foo1", "bar1"));
+ ASSERT_OK(Put("foo3", "bar3"));
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ {"SpecialEnv::WalFile::Append:1", "DBWALTest::SyncWALNotWaitWrite:1"},
+ {"DBWALTest::SyncWALNotWaitWrite:2", "SpecialEnv::WalFile::Append:2"},
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ ROCKSDB_NAMESPACE::port::Thread thread(
+ [&]() { ASSERT_OK(Put("foo2", "bar2")); });
+ // Moving this to SyncWAL before the actual fsync
+ // TEST_SYNC_POINT("DBWALTest::SyncWALNotWaitWrite:1");
+ ASSERT_OK(db_->SyncWAL());
+ // Moving this to SyncWAL after actual fsync
+ // TEST_SYNC_POINT("DBWALTest::SyncWALNotWaitWrite:2");
+
+ thread.join();
+
+ ASSERT_EQ(Get("foo1"), "bar1");
+ ASSERT_EQ(Get("foo2"), "bar2");
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBWALTest, Recover) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ ASSERT_OK(Put(1, "foo", "v1"));
+ ASSERT_OK(Put(1, "baz", "v5"));
+
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ ASSERT_EQ("v1", Get(1, "foo"));
+ ASSERT_EQ("v1", Get(1, "foo"));
+ ASSERT_EQ("v5", Get(1, "baz"));
+ ASSERT_OK(Put(1, "bar", "v2"));
+ ASSERT_OK(Put(1, "foo", "v3"));
+
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ ASSERT_EQ("v3", Get(1, "foo"));
+ ASSERT_OK(Put(1, "foo", "v4"));
+ ASSERT_EQ("v4", Get(1, "foo"));
+ ASSERT_EQ("v2", Get(1, "bar"));
+ ASSERT_EQ("v5", Get(1, "baz"));
+ } while (ChangeWalOptions());
+}
+
+TEST_F(DBWALTest, RecoverWithTableHandle) {
+ do {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.disable_auto_compactions = true;
+ options.avoid_flush_during_recovery = false;
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ ASSERT_OK(Put(1, "foo", "v1"));
+ ASSERT_OK(Put(1, "bar", "v2"));
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(Put(1, "foo", "v3"));
+ ASSERT_OK(Put(1, "bar", "v4"));
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(Put(1, "big", std::string(100, 'a')));
+
+ options = CurrentOptions();
+ const int kSmallMaxOpenFiles = 13;
+ if (option_config_ == kDBLogDir) {
+      // Use this option to check that files are not preloaded: set
+      // max_open_files small enough that no preload will happen.
+      options.max_open_files = kSmallMaxOpenFiles;
+      // RocksDB sanitizes max_open_files to at least 20. Modify it back.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) {
+ int* max_open_files = static_cast<int*>(arg);
+ *max_open_files = kSmallMaxOpenFiles;
+ });
+
+ } else if (option_config_ == kWalDirAndMmapReads) {
+      // Use this option to check that all files are always loaded.
+ options.max_open_files = 100;
+ } else {
+ options.max_open_files = -1;
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+
+ std::vector<std::vector<FileMetaData>> files;
+ dbfull()->TEST_GetFilesMetaData(handles_[1], &files);
+ size_t total_files = 0;
+ for (const auto& level : files) {
+ total_files += level.size();
+ }
+ ASSERT_EQ(total_files, 3);
+ for (const auto& level : files) {
+ for (const auto& file : level) {
+ if (options.max_open_files == kSmallMaxOpenFiles) {
+ ASSERT_TRUE(file.table_reader_handle == nullptr);
+ } else {
+ ASSERT_TRUE(file.table_reader_handle != nullptr);
+ }
+ }
+ }
+ } while (ChangeWalOptions());
+}
+
+TEST_F(DBWALTest, RecoverWithBlob) {
+ // Write a value that's below the prospective size limit for blobs and another
+ // one that's above. Note that blob files are not actually enabled at this
+ // point.
+ constexpr uint64_t min_blob_size = 10;
+
+ constexpr char short_value[] = "short";
+ static_assert(sizeof(short_value) - 1 < min_blob_size,
+ "short_value too long");
+
+ constexpr char long_value[] = "long_value";
+ static_assert(sizeof(long_value) - 1 >= min_blob_size,
+ "long_value too short");
+
+ ASSERT_OK(Put("key1", short_value));
+ ASSERT_OK(Put("key2", long_value));
+
+ // There should be no files just yet since we haven't flushed.
+ {
+ VersionSet* const versions = dbfull()->GetVersionSet();
+ ASSERT_NE(versions, nullptr);
+
+ ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault();
+ ASSERT_NE(cfd, nullptr);
+
+ Version* const current = cfd->current();
+ ASSERT_NE(current, nullptr);
+
+ const VersionStorageInfo* const storage_info = current->storage_info();
+ ASSERT_NE(storage_info, nullptr);
+
+ ASSERT_EQ(storage_info->num_non_empty_levels(), 0);
+ ASSERT_TRUE(storage_info->GetBlobFiles().empty());
+ }
+
+ // Reopen the database with blob files enabled. A new table file/blob file
+ // pair should be written during recovery.
+ Options options;
+ options.enable_blob_files = true;
+ options.min_blob_size = min_blob_size;
+ options.avoid_flush_during_recovery = false;
+ options.disable_auto_compactions = true;
+ options.env = env_;
+
+ Reopen(options);
+
+ ASSERT_EQ(Get("key1"), short_value);
+ ASSERT_EQ(Get("key2"), long_value);
+
+ VersionSet* const versions = dbfull()->GetVersionSet();
+ ASSERT_NE(versions, nullptr);
+
+ ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault();
+ ASSERT_NE(cfd, nullptr);
+
+ Version* const current = cfd->current();
+ ASSERT_NE(current, nullptr);
+
+ const VersionStorageInfo* const storage_info = current->storage_info();
+ ASSERT_NE(storage_info, nullptr);
+
+ const auto& l0_files = storage_info->LevelFiles(0);
+ ASSERT_EQ(l0_files.size(), 1);
+
+ const FileMetaData* const table_file = l0_files[0];
+ ASSERT_NE(table_file, nullptr);
+
+ const auto& blob_files = storage_info->GetBlobFiles();
+ ASSERT_EQ(blob_files.size(), 1);
+
+ const auto& blob_file = blob_files.front();
+ ASSERT_NE(blob_file, nullptr);
+
+ ASSERT_EQ(table_file->smallest.user_key(), "key1");
+ ASSERT_EQ(table_file->largest.user_key(), "key2");
+ ASSERT_EQ(table_file->fd.smallest_seqno, 1);
+ ASSERT_EQ(table_file->fd.largest_seqno, 2);
+ ASSERT_EQ(table_file->oldest_blob_file_number,
+ blob_file->GetBlobFileNumber());
+
+ ASSERT_EQ(blob_file->GetTotalBlobCount(), 1);
+
+#ifndef ROCKSDB_LITE
+ const InternalStats* const internal_stats = cfd->internal_stats();
+ ASSERT_NE(internal_stats, nullptr);
+
+ const auto& compaction_stats = internal_stats->TEST_GetCompactionStats();
+ ASSERT_FALSE(compaction_stats.empty());
+ ASSERT_EQ(compaction_stats[0].bytes_written, table_file->fd.GetFileSize());
+ ASSERT_EQ(compaction_stats[0].bytes_written_blob,
+ blob_file->GetTotalBlobBytes());
+ ASSERT_EQ(compaction_stats[0].num_output_files, 1);
+ ASSERT_EQ(compaction_stats[0].num_output_files_blob, 1);
+
+ const uint64_t* const cf_stats_value = internal_stats->TEST_GetCFStatsValue();
+ ASSERT_EQ(cf_stats_value[InternalStats::BYTES_FLUSHED],
+ compaction_stats[0].bytes_written +
+ compaction_stats[0].bytes_written_blob);
+#endif // ROCKSDB_LITE
+}
+
+TEST_F(DBWALTest, RecoverWithBlobMultiSST) {
+ // Write several large (4 KB) values without flushing. Note that blob files
+ // are not actually enabled at this point.
+ std::string large_value(1 << 12, 'a');
+
+ constexpr int num_keys = 64;
+
+ for (int i = 0; i < num_keys; ++i) {
+ ASSERT_OK(Put(Key(i), large_value));
+ }
+
+ // There should be no files just yet since we haven't flushed.
+ {
+ VersionSet* const versions = dbfull()->GetVersionSet();
+ ASSERT_NE(versions, nullptr);
+
+ ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault();
+ ASSERT_NE(cfd, nullptr);
+
+ Version* const current = cfd->current();
+ ASSERT_NE(current, nullptr);
+
+ const VersionStorageInfo* const storage_info = current->storage_info();
+ ASSERT_NE(storage_info, nullptr);
+
+ ASSERT_EQ(storage_info->num_non_empty_levels(), 0);
+ ASSERT_TRUE(storage_info->GetBlobFiles().empty());
+ }
+
+ // Reopen the database with blob files enabled and write buffer size set to a
+ // smaller value. Multiple table files+blob files should be written and added
+ // to the Version during recovery.
+ Options options;
+ options.write_buffer_size = 1 << 16; // 64 KB
+ options.enable_blob_files = true;
+ options.avoid_flush_during_recovery = false;
+ options.disable_auto_compactions = true;
+ options.env = env_;
+
+ Reopen(options);
+
+ for (int i = 0; i < num_keys; ++i) {
+ ASSERT_EQ(Get(Key(i)), large_value);
+ }
+
+ VersionSet* const versions = dbfull()->GetVersionSet();
+ ASSERT_NE(versions, nullptr);
+
+ ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault();
+ ASSERT_NE(cfd, nullptr);
+
+ Version* const current = cfd->current();
+ ASSERT_NE(current, nullptr);
+
+ const VersionStorageInfo* const storage_info = current->storage_info();
+ ASSERT_NE(storage_info, nullptr);
+
+ const auto& l0_files = storage_info->LevelFiles(0);
+ ASSERT_GT(l0_files.size(), 1);
+
+ const auto& blob_files = storage_info->GetBlobFiles();
+ ASSERT_GT(blob_files.size(), 1);
+
+ ASSERT_EQ(l0_files.size(), blob_files.size());
+}
+
+TEST_F(DBWALTest, WALWithChecksumHandoff) {
+#ifndef ROCKSDB_ASSERT_STATUS_CHECKED
+ if (mem_env_ || encrypted_env_) {
+ ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment");
+ return;
+ }
+ std::shared_ptr<FaultInjectionTestFS> fault_fs(
+ new FaultInjectionTestFS(FileSystem::Default()));
+ std::unique_ptr<Env> fault_fs_env(NewCompositeEnv(fault_fs));
+ do {
+ Options options = CurrentOptions();
+
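+    // Checksum handoff: the DB computes a checksum for each WAL write and
+    // hands it down to the file system, where FaultInjectionTestFS verifies
+    // it using the configured checksum type.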
+ options.checksum_handoff_file_types.Add(FileType::kWalFile);
+ options.env = fault_fs_env.get();
+ fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c);
+
+ CreateAndReopenWithCF({"pikachu"}, options);
+ WriteOptions writeOpt = WriteOptions();
+ writeOpt.disableWAL = true;
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v1"));
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v1"));
+
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ ASSERT_EQ("v1", Get(1, "foo"));
+ ASSERT_EQ("v1", Get(1, "bar"));
+
+ writeOpt.disableWAL = false;
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v2"));
+ writeOpt.disableWAL = true;
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v2"));
+
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+    // Both values should be present.
+ ASSERT_EQ("v2", Get(1, "bar"));
+ ASSERT_EQ("v2", Get(1, "foo"));
+
+ writeOpt.disableWAL = true;
+    // For this put, the data is persisted by a flush rather than by the WAL
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v3"));
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ writeOpt.disableWAL = false;
+ // Data is persisted in the WAL
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "zoo", "v3"));
+    // The checksum type does not match, so the write fails
+ fault_fs->SetChecksumHandoffFuncType(ChecksumType::kxxHash);
+ writeOpt.disableWAL = false;
+ ASSERT_NOK(dbfull()->Put(writeOpt, handles_[1], "foo", "v3"));
+
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+    // Due to the write failure, Get should not find the value
+ ASSERT_NE("v3", Get(1, "foo"));
+ ASSERT_EQ("v3", Get(1, "zoo"));
+ ASSERT_EQ("v3", Get(1, "bar"));
+
+ fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c);
+    // Each write will be simulated as corrupted.
+ fault_fs->IngestDataCorruptionBeforeWrite();
+ writeOpt.disableWAL = true;
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v4"));
+ writeOpt.disableWAL = false;
+ ASSERT_NOK(dbfull()->Put(writeOpt, handles_[1], "foo", "v4"));
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ ASSERT_NE("v4", Get(1, "foo"));
+ ASSERT_NE("v4", Get(1, "bar"));
+ fault_fs->NoDataCorruptionBeforeWrite();
+
+ fault_fs->SetChecksumHandoffFuncType(ChecksumType::kNoChecksum);
+    // The file system does not provide a checksum method or verification.
+ writeOpt.disableWAL = true;
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v5"));
+ writeOpt.disableWAL = false;
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v5"));
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ ASSERT_EQ("v5", Get(1, "foo"));
+ ASSERT_EQ("v5", Get(1, "bar"));
+
+ Destroy(options);
+ } while (ChangeWalOptions());
+#endif // ROCKSDB_ASSERT_STATUS_CHECKED
+}
+
+class DBRecoveryTestBlobError
+ : public DBWALTest,
+ public testing::WithParamInterface<std::string> {
+ public:
+ DBRecoveryTestBlobError() : sync_point_(GetParam()) {}
+
+ std::string sync_point_;
+};
+
+INSTANTIATE_TEST_CASE_P(DBRecoveryTestBlobError, DBRecoveryTestBlobError,
+ ::testing::ValuesIn(std::vector<std::string>{
+ "BlobFileBuilder::WriteBlobToFile:AddRecord",
+ "BlobFileBuilder::WriteBlobToFile:AppendFooter"}));
+
+TEST_P(DBRecoveryTestBlobError, RecoverWithBlobError) {
+ // Write a value. Note that blob files are not actually enabled at this point.
+ ASSERT_OK(Put("key", "blob"));
+
+ // Reopen with blob files enabled but make blob file writing fail during
+ // recovery.
+ SyncPoint::GetInstance()->SetCallBack(sync_point_, [this](void* arg) {
+ Status* const s = static_cast<Status*>(arg);
+ assert(s);
+
+ (*s) = Status::IOError(sync_point_);
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ Options options;
+ options.enable_blob_files = true;
+ options.avoid_flush_during_recovery = false;
+ options.disable_auto_compactions = true;
+ options.env = env_;
+
+ ASSERT_NOK(TryReopen(options));
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+
+ // Make sure the files generated by the failed recovery have been deleted.
+ std::vector<std::string> files;
+ ASSERT_OK(env_->GetChildren(dbname_, &files));
+ for (const auto& file : files) {
+ uint64_t number = 0;
+ FileType type = kTableFile;
+
+ if (!ParseFileName(file, &number, &type)) {
+ continue;
+ }
+
+ ASSERT_NE(type, kTableFile);
+ ASSERT_NE(type, kBlobFile);
+ }
+}
+
+TEST_F(DBWALTest, IgnoreRecoveredLog) {
+ std::string backup_logs = dbname_ + "/backup_logs";
+
+ do {
+ // delete old files in backup_logs directory
+ ASSERT_OK(env_->CreateDirIfMissing(backup_logs));
+ std::vector<std::string> old_files;
+ ASSERT_OK(env_->GetChildren(backup_logs, &old_files));
+ for (auto& file : old_files) {
+ ASSERT_OK(env_->DeleteFile(backup_logs + "/" + file));
+ }
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.merge_operator = MergeOperators::CreateUInt64AddOperator();
+ options.wal_dir = dbname_ + "/logs";
+ DestroyAndReopen(options);
+
+ // fill up the DB
+ std::string one, two;
+ PutFixed64(&one, 1);
+ PutFixed64(&two, 2);
+ ASSERT_OK(db_->Merge(WriteOptions(), Slice("foo"), Slice(one)));
+ ASSERT_OK(db_->Merge(WriteOptions(), Slice("foo"), Slice(one)));
+ ASSERT_OK(db_->Merge(WriteOptions(), Slice("bar"), Slice(one)));
+
+ // copy the logs to backup
+ std::vector<std::string> logs;
+ ASSERT_OK(env_->GetChildren(options.wal_dir, &logs));
+ for (auto& log : logs) {
+ CopyFile(options.wal_dir + "/" + log, backup_logs + "/" + log);
+ }
+
+ // recover the DB
+ Reopen(options);
+ ASSERT_EQ(two, Get("foo"));
+ ASSERT_EQ(one, Get("bar"));
+ Close();
+
+ // copy the logs from backup back to wal dir
+ for (auto& log : logs) {
+ CopyFile(backup_logs + "/" + log, options.wal_dir + "/" + log);
+ }
+ // this should ignore the log files, recovery should not happen again
+ // if the recovery happens, the same merge operator would be called twice,
+ // leading to incorrect results
+ Reopen(options);
+ ASSERT_EQ(two, Get("foo"));
+ ASSERT_EQ(one, Get("bar"));
+ Close();
+ Destroy(options);
+ Reopen(options);
+ Close();
+
+ // copy the logs from backup back to wal dir
+ ASSERT_OK(env_->CreateDirIfMissing(options.wal_dir));
+ for (auto& log : logs) {
+ CopyFile(backup_logs + "/" + log, options.wal_dir + "/" + log);
+ }
+ // assert that we successfully recovered only from logs, even though we
+ // destroyed the DB
+ Reopen(options);
+ ASSERT_EQ(two, Get("foo"));
+ ASSERT_EQ(one, Get("bar"));
+
+ // Recovery will fail if DB directory doesn't exist.
+ Destroy(options);
+ // copy the logs from backup back to wal dir
+ ASSERT_OK(env_->CreateDirIfMissing(options.wal_dir));
+ for (auto& log : logs) {
+ CopyFile(backup_logs + "/" + log, options.wal_dir + "/" + log);
+      // we won't be needing this file anymore
+ ASSERT_OK(env_->DeleteFile(backup_logs + "/" + log));
+ }
+ Status s = TryReopen(options);
+ ASSERT_NOK(s);
+ Destroy(options);
+ } while (ChangeWalOptions());
+}
+
+TEST_F(DBWALTest, RecoveryWithEmptyLog) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ ASSERT_OK(Put(1, "foo", "v1"));
+ ASSERT_OK(Put(1, "foo", "v2"));
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ ASSERT_OK(Put(1, "foo", "v3"));
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ ASSERT_EQ("v3", Get(1, "foo"));
+ } while (ChangeWalOptions());
+}
+
+#if !(defined NDEBUG) || !defined(OS_WIN)
+TEST_F(DBWALTest, PreallocateBlock) {
+ Options options = CurrentOptions();
+ options.write_buffer_size = 10 * 1000 * 1000;
+ options.max_total_wal_size = 0;
+
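+  // The expected WAL preallocation size follows the smallest relevant write
+  // limit; with only write_buffer_size set, it is write_buffer_size plus 10%.
+  // The later cases switch to max_total_wal_size, db_write_buffer_size, and
+  // the write buffer manager size, respectively.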
+ size_t expected_preallocation_size = static_cast<size_t>(
+ options.write_buffer_size + options.write_buffer_size / 10);
+
+ DestroyAndReopen(options);
+
+ std::atomic<int> called(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBTestWalFile.GetPreallocationStatus", [&](void* arg) {
+ ASSERT_TRUE(arg != nullptr);
+ size_t preallocation_size = *(static_cast<size_t*>(arg));
+ ASSERT_EQ(expected_preallocation_size, preallocation_size);
+ called.fetch_add(1);
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ ASSERT_OK(Put("", ""));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("", ""));
+ Close();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ASSERT_EQ(2, called.load());
+
+ options.max_total_wal_size = 1000 * 1000;
+ expected_preallocation_size = static_cast<size_t>(options.max_total_wal_size);
+ Reopen(options);
+ called.store(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBTestWalFile.GetPreallocationStatus", [&](void* arg) {
+ ASSERT_TRUE(arg != nullptr);
+ size_t preallocation_size = *(static_cast<size_t*>(arg));
+ ASSERT_EQ(expected_preallocation_size, preallocation_size);
+ called.fetch_add(1);
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ ASSERT_OK(Put("", ""));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("", ""));
+ Close();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ASSERT_EQ(2, called.load());
+
+ options.db_write_buffer_size = 800 * 1000;
+ expected_preallocation_size =
+ static_cast<size_t>(options.db_write_buffer_size);
+ Reopen(options);
+ called.store(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBTestWalFile.GetPreallocationStatus", [&](void* arg) {
+ ASSERT_TRUE(arg != nullptr);
+ size_t preallocation_size = *(static_cast<size_t*>(arg));
+ ASSERT_EQ(expected_preallocation_size, preallocation_size);
+ called.fetch_add(1);
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ ASSERT_OK(Put("", ""));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("", ""));
+ Close();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ASSERT_EQ(2, called.load());
+
+ expected_preallocation_size = 700 * 1000;
+ std::shared_ptr<WriteBufferManager> write_buffer_manager =
+ std::make_shared<WriteBufferManager>(static_cast<uint64_t>(700 * 1000));
+ options.write_buffer_manager = write_buffer_manager;
+ Reopen(options);
+ called.store(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBTestWalFile.GetPreallocationStatus", [&](void* arg) {
+ ASSERT_TRUE(arg != nullptr);
+ size_t preallocation_size = *(static_cast<size_t*>(arg));
+ ASSERT_EQ(expected_preallocation_size, preallocation_size);
+ called.fetch_add(1);
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ ASSERT_OK(Put("", ""));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("", ""));
+ Close();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ASSERT_EQ(2, called.load());
+}
+#endif // !(defined NDEBUG) || !defined(OS_WIN)
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBWALTest, DISABLED_FullPurgePreservesRecycledLog) {
+ // TODO(ajkr): Disabled until WAL recycling is fixed for
+ // `kPointInTimeRecovery`.
+
+ // For github issue #1303
+ for (int i = 0; i < 2; ++i) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.recycle_log_file_num = 2;
+ if (i != 0) {
+ options.wal_dir = alternative_wal_dir_;
+ }
+
+ DestroyAndReopen(options);
+ ASSERT_OK(Put("foo", "v1"));
+ VectorLogPtr log_files;
+ ASSERT_OK(dbfull()->GetSortedWalFiles(log_files));
+ ASSERT_GT(log_files.size(), 0);
+ ASSERT_OK(Flush());
+
+ // Now the original WAL is in log_files[0] and should be marked for
+ // recycling.
+ // Verify full purge cannot remove this file.
+ JobContext job_context(0);
+ dbfull()->TEST_LockMutex();
+ dbfull()->FindObsoleteFiles(&job_context, true /* force */);
+ dbfull()->TEST_UnlockMutex();
+ dbfull()->PurgeObsoleteFiles(job_context);
+
+ if (i == 0) {
+ ASSERT_OK(
+ env_->FileExists(LogFileName(dbname_, log_files[0]->LogNumber())));
+ } else {
+ ASSERT_OK(env_->FileExists(
+ LogFileName(alternative_wal_dir_, log_files[0]->LogNumber())));
+ }
+ }
+}
+
+TEST_F(DBWALTest, DISABLED_FullPurgePreservesLogPendingReuse) {
+ // TODO(ajkr): Disabled until WAL recycling is fixed for
+ // `kPointInTimeRecovery`.
+
+ // Ensures full purge cannot delete a WAL while it's in the process of being
+ // recycled. In particular, we force the full purge after a file has been
+ // chosen for reuse, but before it has been renamed.
+ for (int i = 0; i < 2; ++i) {
+ Options options = CurrentOptions();
+ options.recycle_log_file_num = 1;
+ if (i != 0) {
+ options.wal_dir = alternative_wal_dir_;
+ }
+ DestroyAndReopen(options);
+
+ // The first flush creates a second log so writes can continue before the
+ // flush finishes.
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Flush());
+
+    // The second flush can recycle the first log. Sync points enforce that
+    // the full purge happens after choosing the log to recycle and before it
+    // is renamed.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ {"DBImpl::CreateWAL:BeforeReuseWritableFile1",
+ "DBWALTest::FullPurgePreservesLogPendingReuse:PreFullPurge"},
+ {"DBWALTest::FullPurgePreservesLogPendingReuse:PostFullPurge",
+ "DBImpl::CreateWAL:BeforeReuseWritableFile2"},
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ ROCKSDB_NAMESPACE::port::Thread thread([&]() {
+ TEST_SYNC_POINT(
+ "DBWALTest::FullPurgePreservesLogPendingReuse:PreFullPurge");
+ ASSERT_OK(db_->EnableFileDeletions(true));
+ TEST_SYNC_POINT(
+ "DBWALTest::FullPurgePreservesLogPendingReuse:PostFullPurge");
+ });
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Flush());
+ thread.join();
+ }
+}
+
+TEST_F(DBWALTest, GetSortedWalFiles) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ VectorLogPtr log_files;
+ ASSERT_OK(dbfull()->GetSortedWalFiles(log_files));
+ ASSERT_EQ(0, log_files.size());
+
+ ASSERT_OK(Put(1, "foo", "v1"));
+ ASSERT_OK(dbfull()->GetSortedWalFiles(log_files));
+ ASSERT_EQ(1, log_files.size());
+ } while (ChangeWalOptions());
+}
+
+TEST_F(DBWALTest, GetCurrentWalFile) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+
+ std::unique_ptr<LogFile>* bad_log_file = nullptr;
+ ASSERT_NOK(dbfull()->GetCurrentWalFile(bad_log_file));
+
+ std::unique_ptr<LogFile> log_file;
+ ASSERT_OK(dbfull()->GetCurrentWalFile(&log_file));
+
+ // nothing has been written to the log yet
+ ASSERT_EQ(log_file->StartSequence(), 0);
+ ASSERT_EQ(log_file->SizeFileBytes(), 0);
+ ASSERT_EQ(log_file->Type(), kAliveLogFile);
+ ASSERT_GT(log_file->LogNumber(), 0);
+
+    // add some data and verify that the file size actually moves forward
+ ASSERT_OK(Put(0, "foo", "v1"));
+ ASSERT_OK(Put(0, "foo2", "v2"));
+ ASSERT_OK(Put(0, "foo3", "v3"));
+
+ ASSERT_OK(dbfull()->GetCurrentWalFile(&log_file));
+
+ ASSERT_EQ(log_file->StartSequence(), 0);
+ ASSERT_GT(log_file->SizeFileBytes(), 0);
+ ASSERT_EQ(log_file->Type(), kAliveLogFile);
+ ASSERT_GT(log_file->LogNumber(), 0);
+
+ // force log files to cycle and add some more data, then check if
+ // log number moves forward
+
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ for (int i = 0; i < 10; i++) {
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ }
+
+ ASSERT_OK(Put(0, "foo4", "v4"));
+ ASSERT_OK(Put(0, "foo5", "v5"));
+ ASSERT_OK(Put(0, "foo6", "v6"));
+
+ ASSERT_OK(dbfull()->GetCurrentWalFile(&log_file));
+
+ ASSERT_EQ(log_file->StartSequence(), 0);
+ ASSERT_GT(log_file->SizeFileBytes(), 0);
+ ASSERT_EQ(log_file->Type(), kAliveLogFile);
+ ASSERT_GT(log_file->LogNumber(), 0);
+
+ } while (ChangeWalOptions());
+}
+
+TEST_F(DBWALTest, RecoveryWithLogDataForSomeCFs) {
+ // Test for regression of WAL cleanup missing files that don't contain data
+ // for every column family.
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ ASSERT_OK(Put(1, "foo", "v1"));
+ ASSERT_OK(Put(1, "foo", "v2"));
+ uint64_t earliest_log_nums[2];
+ for (int i = 0; i < 2; ++i) {
+ if (i > 0) {
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ }
+ VectorLogPtr log_files;
+ ASSERT_OK(dbfull()->GetSortedWalFiles(log_files));
+ if (log_files.size() > 0) {
+ earliest_log_nums[i] = log_files[0]->LogNumber();
+ } else {
+ earliest_log_nums[i] = std::numeric_limits<uint64_t>::max();
+ }
+ }
+ // Check at least the first WAL was cleaned up during the recovery.
+ ASSERT_LT(earliest_log_nums[0], earliest_log_nums[1]);
+ } while (ChangeWalOptions());
+}
+
+TEST_F(DBWALTest, RecoverWithLargeLog) {
+ do {
+ {
+ Options options = CurrentOptions();
+ CreateAndReopenWithCF({"pikachu"}, options);
+ ASSERT_OK(Put(1, "big1", std::string(200000, '1')));
+ ASSERT_OK(Put(1, "big2", std::string(200000, '2')));
+ ASSERT_OK(Put(1, "small3", std::string(10, '3')));
+ ASSERT_OK(Put(1, "small4", std::string(10, '4')));
+ ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+ }
+
+    // Make sure that if we re-open with a small write buffer size, we flush
+    // table files in the middle of a large log file.
+ Options options;
+ options.write_buffer_size = 100000;
+ options = CurrentOptions(options);
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ ASSERT_EQ(NumTableFilesAtLevel(0, 1), 3);
+ ASSERT_EQ(std::string(200000, '1'), Get(1, "big1"));
+ ASSERT_EQ(std::string(200000, '2'), Get(1, "big2"));
+ ASSERT_EQ(std::string(10, '3'), Get(1, "small3"));
+ ASSERT_EQ(std::string(10, '4'), Get(1, "small4"));
+ ASSERT_GT(NumTableFilesAtLevel(0, 1), 1);
+ } while (ChangeWalOptions());
+}
+
+// In https://reviews.facebook.net/D20661 we changed the recovery behavior:
+// previously, for each log file, each column family's memtable was flushed
+// even if it was empty. Now we try to create the smallest number of table
+// files by merging updates from multiple logs.
+TEST_F(DBWALTest, RecoverCheckFileAmountWithSmallWriteBuffer) {
+ Options options = CurrentOptions();
+ options.write_buffer_size = 5000000;
+ CreateAndReopenWithCF({"pikachu", "dobrynia", "nikitich"}, options);
+
+  // Since we will reopen the DB with a smaller write_buffer_size,
+  // each key will go to a new SST file
+ ASSERT_OK(Put(1, Key(10), DummyString(1000000)));
+ ASSERT_OK(Put(1, Key(10), DummyString(1000000)));
+ ASSERT_OK(Put(1, Key(10), DummyString(1000000)));
+ ASSERT_OK(Put(1, Key(10), DummyString(1000000)));
+
+ ASSERT_OK(Put(3, Key(10), DummyString(1)));
+  // Make 'dobrynia' flush and a new WAL file be created
+ ASSERT_OK(Put(2, Key(10), DummyString(7500000)));
+ ASSERT_OK(Put(2, Key(1), DummyString(1)));
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[2]));
+ {
+ auto tables = ListTableFiles(env_, dbname_);
+ ASSERT_EQ(tables.size(), static_cast<size_t>(1));
+    // Make sure 'dobrynia' was flushed: check the SST file count
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"),
+ static_cast<uint64_t>(1));
+ }
+ // New WAL file
+ ASSERT_OK(Put(1, Key(1), DummyString(1)));
+ ASSERT_OK(Put(1, Key(1), DummyString(1)));
+ ASSERT_OK(Put(3, Key(10), DummyString(1)));
+ ASSERT_OK(Put(3, Key(10), DummyString(1)));
+ ASSERT_OK(Put(3, Key(10), DummyString(1)));
+
+ options.write_buffer_size = 4096;
+ options.arena_block_size = 4096;
+ ReopenWithColumnFamilies({"default", "pikachu", "dobrynia", "nikitich"},
+ options);
+ {
+ // No inserts => default is empty
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
+ static_cast<uint64_t>(0));
+    // The first 4 keys go to separate SSTs + 1 more SST for the 2 smaller keys
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"),
+ static_cast<uint64_t>(5));
+ // 1 SST for big key + 1 SST for small one
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"),
+ static_cast<uint64_t>(2));
+ // 1 SST for all keys
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
+ static_cast<uint64_t>(1));
+ }
+}
+
+// In https://reviews.facebook.net/D20661 we changed the recovery behavior:
+// previously, for each log file, each column family's memtable was flushed
+// even if it wasn't empty. Now we try to create the smallest number of table
+// files by merging updates from multiple logs.
+TEST_F(DBWALTest, RecoverCheckFileAmount) {
+ Options options = CurrentOptions();
+ options.write_buffer_size = 100000;
+ options.arena_block_size = 4 * 1024;
+ options.avoid_flush_during_recovery = false;
+ CreateAndReopenWithCF({"pikachu", "dobrynia", "nikitich"}, options);
+
+ ASSERT_OK(Put(0, Key(1), DummyString(1)));
+ ASSERT_OK(Put(1, Key(1), DummyString(1)));
+ ASSERT_OK(Put(2, Key(1), DummyString(1)));
+
+  // Make the 'nikitich' memtable flush
+ ASSERT_OK(Put(3, Key(10), DummyString(1002400)));
+ ASSERT_OK(Put(3, Key(1), DummyString(1)));
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[3]));
+  // 4 memtables are not flushed, 1 SST file
+ {
+ auto tables = ListTableFiles(env_, dbname_);
+ ASSERT_EQ(tables.size(), static_cast<size_t>(1));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
+ static_cast<uint64_t>(1));
+ }
+  // The memtable for 'nikitich' has been flushed and a new WAL file has been
+  // opened; 4 memtables are still not flushed
+
+ // Write to new WAL file
+ ASSERT_OK(Put(0, Key(1), DummyString(1)));
+ ASSERT_OK(Put(1, Key(1), DummyString(1)));
+ ASSERT_OK(Put(2, Key(1), DummyString(1)));
+
+ // Fill up 'nikitich' one more time
+ ASSERT_OK(Put(3, Key(10), DummyString(1002400)));
+ // make it flush
+ ASSERT_OK(Put(3, Key(1), DummyString(1)));
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[3]));
+  // There are still 4 memtables not flushed, and 2 SST files
+ ASSERT_OK(Put(0, Key(1), DummyString(1)));
+ ASSERT_OK(Put(1, Key(1), DummyString(1)));
+ ASSERT_OK(Put(2, Key(1), DummyString(1)));
+
+ {
+ auto tables = ListTableFiles(env_, dbname_);
+ ASSERT_EQ(tables.size(), static_cast<size_t>(2));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
+ static_cast<uint64_t>(2));
+ }
+
+ ReopenWithColumnFamilies({"default", "pikachu", "dobrynia", "nikitich"},
+ options);
+ {
+ std::vector<uint64_t> table_files = ListTableFiles(env_, dbname_);
+    // Check that records for 'default', 'dobrynia' and 'pikachu' from the
+    // first, second and third WALs went to the same SST.
+    // So there are 6 SSTs: three for 'nikitich', one for 'default', one for
+    // 'dobrynia', one for 'pikachu'
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
+ static_cast<uint64_t>(1));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
+ static_cast<uint64_t>(3));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"),
+ static_cast<uint64_t>(1));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"),
+ static_cast<uint64_t>(1));
+ }
+}
+
+TEST_F(DBWALTest, SyncMultipleLogs) {
+ const uint64_t kNumBatches = 2;
+ const int kBatchSize = 1000;
+
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.write_buffer_size = 4096;
+ Reopen(options);
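+  // The small write buffer forces memtable switches between batches, so more
+  // than one WAL can be alive when SyncWAL runs.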
+
+ WriteBatch batch;
+ WriteOptions wo;
+ wo.sync = true;
+
+ for (uint64_t b = 0; b < kNumBatches; b++) {
+ batch.Clear();
+ for (int i = 0; i < kBatchSize; i++) {
+ ASSERT_OK(batch.Put(Key(i), DummyString(128)));
+ }
+
+ ASSERT_OK(dbfull()->Write(wo, &batch));
+ }
+
+ ASSERT_OK(dbfull()->SyncWAL());
+}
+
+// Github issue 1339. Prior to the fix, we read the sequence id from the first
+// log into a local variable, then kept increasing the variable as we replayed
+// the logs, ignoring the actual sequence ids of the records. This is incorrect
+// if some writes come with the WAL disabled.
+TEST_F(DBWALTest, PartOfWritesWithWALDisabled) {
+ std::unique_ptr<FaultInjectionTestEnv> fault_env(
+ new FaultInjectionTestEnv(env_));
+ Options options = CurrentOptions();
+ options.env = fault_env.get();
+ options.disable_auto_compactions = true;
+ WriteOptions wal_on, wal_off;
+ wal_on.sync = true;
+ wal_on.disableWAL = false;
+ wal_off.disableWAL = true;
+ CreateAndReopenWithCF({"dummy"}, options);
+ ASSERT_OK(Put(1, "dummy", "d1", wal_on)); // seq id 1
+ ASSERT_OK(Put(1, "dummy", "d2", wal_off));
+ ASSERT_OK(Put(1, "dummy", "d3", wal_off));
+ ASSERT_OK(Put(0, "key", "v4", wal_on)); // seq id 4
+ ASSERT_OK(Flush(0));
+ ASSERT_OK(Put(0, "key", "v5", wal_on)); // seq id 5
+ ASSERT_EQ("v5", Get(0, "key"));
+ ASSERT_OK(dbfull()->FlushWAL(false));
+ // Simulate a crash.
+ fault_env->SetFilesystemActive(false);
+ Close();
+ fault_env->ResetState();
+ ReopenWithColumnFamilies({"default", "dummy"}, options);
+ // Prior to the fix, we may incorrectly recover "v5" with sequence id = 3.
+ ASSERT_EQ("v5", Get(0, "key"));
+ // Destroy DB before destruct fault_env.
+ Destroy(options);
+}
+
+//
+// Test WAL recovery for the various modes available
+//
+class RecoveryTestHelper {
+ public:
+ // Number of WAL files to generate
+ static constexpr int kWALFilesCount = 10;
+ // Starting number for the WAL file name like 00010.log
+ static constexpr int kWALFileOffset = 10;
+ // Keys to be written per WAL file
+ static constexpr int kKeysPerWALFile = 133;
+ // Size of the value
+ static constexpr int kValueSize = 96;
+
+ // Create WAL files with values filled in
+ static void FillData(DBWALTestBase* test, const Options& options,
+ const size_t wal_count, size_t* count) {
+ // Calling internal functions requires sanitized options.
+ Options sanitized_options = SanitizeOptions(test->dbname_, options);
+ const ImmutableDBOptions db_options(sanitized_options);
+
+ *count = 0;
+
+ std::shared_ptr<Cache> table_cache = NewLRUCache(50, 0);
+ FileOptions file_options;
+ WriteBufferManager write_buffer_manager(db_options.db_write_buffer_size);
+
+ std::unique_ptr<VersionSet> versions;
+ std::unique_ptr<WalManager> wal_manager;
+ WriteController write_controller;
+
+ versions.reset(new VersionSet(
+ test->dbname_, &db_options, file_options, table_cache.get(),
+ &write_buffer_manager, &write_controller,
+ /*block_cache_tracer=*/nullptr,
+ /*io_tracer=*/nullptr, /*db_id*/ "", /*db_session_id*/ ""));
+
+ wal_manager.reset(
+ new WalManager(db_options, file_options, /*io_tracer=*/nullptr));
+
+ std::unique_ptr<log::Writer> current_log_writer;
+
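+    // Write each WAL directly with log::Writer (bypassing the DB), assigning
+    // consecutive sequence numbers, so tests can later corrupt specific WAL
+    // files at specific offsets.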
+ for (size_t j = kWALFileOffset; j < wal_count + kWALFileOffset; j++) {
+ uint64_t current_log_number = j;
+ std::string fname = LogFileName(test->dbname_, current_log_number);
+ std::unique_ptr<WritableFileWriter> file_writer;
+ ASSERT_OK(WritableFileWriter::Create(db_options.env->GetFileSystem(),
+ fname, file_options, &file_writer,
+ nullptr));
+ log::Writer* log_writer =
+ new log::Writer(std::move(file_writer), current_log_number,
+ db_options.recycle_log_file_num > 0, false,
+ db_options.wal_compression);
+ ASSERT_OK(log_writer->AddCompressionTypeRecord());
+ current_log_writer.reset(log_writer);
+
+ WriteBatch batch;
+ for (int i = 0; i < kKeysPerWALFile; i++) {
+ std::string key = "key" + std::to_string((*count)++);
+ std::string value = test->DummyString(kValueSize);
+ ASSERT_NE(current_log_writer.get(), nullptr);
+ uint64_t seq = versions->LastSequence() + 1;
+ batch.Clear();
+ ASSERT_OK(batch.Put(key, value));
+ WriteBatchInternal::SetSequence(&batch, seq);
+ ASSERT_OK(current_log_writer->AddRecord(
+ WriteBatchInternal::Contents(&batch)));
+ versions->SetLastAllocatedSequence(seq);
+ versions->SetLastPublishedSequence(seq);
+ versions->SetLastSequence(seq);
+ }
+ }
+ }
+
+ // Recreate and fill the store with some data
+ static size_t FillData(DBWALTestBase* test, Options* options) {
+ options->create_if_missing = true;
+ test->DestroyAndReopen(*options);
+ test->Close();
+
+ size_t count = 0;
+ FillData(test, *options, kWALFilesCount, &count);
+ return count;
+ }
+
+ // Read back all the keys we wrote and return the number of keys found
+ static size_t GetData(DBWALTestBase* test) {
+ size_t count = 0;
+ for (size_t i = 0; i < kWALFilesCount * kKeysPerWALFile; i++) {
+ if (test->Get("key" + std::to_string(i)) != "NOT_FOUND") {
+ ++count;
+ }
+ }
+ return count;
+ }
+
+ // Manually corrupt the specified WAL file
+ static void CorruptWAL(DBWALTestBase* test, const Options& options,
+ const double off, const double len,
+ const int wal_file_id, const bool trunc = false) {
+ Env* env = options.env;
+ std::string fname = LogFileName(test->dbname_, wal_file_id);
+ uint64_t size;
+ ASSERT_OK(env->GetFileSize(fname, &size));
+ ASSERT_GT(size, 0);
+#ifdef OS_WIN
+ // The Windows disk cache behaves differently: after truncation, the
+ // original content is still in the cache because the original handle is
+ // still open. On Windows, shared access to files is generally prohibited;
+ // the WAL does not need it, but we allow it here so that tests can induce
+ // corruption.
+ test->Close();
+#endif
+ if (trunc) {
+ ASSERT_OK(
+ test::TruncateFile(env, fname, static_cast<uint64_t>(size * off)));
+ } else {
+ ASSERT_OK(test::CorruptFile(env, fname, static_cast<int>(size * off + 8),
+ static_cast<int>(size * len), false));
+ }
+ }
+};
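+
+// A minimal usage sketch for RecoveryTestHelper (illustrative only; it
+// mirrors the parameterized tests below and assumes a DBWALTestBase fixture):
+//
+//   Options options = CurrentOptions();
+//   const size_t written = RecoveryTestHelper::FillData(this, &options);
+//   RecoveryTestHelper::CorruptWAL(this, options, /*off=*/0.3, /*len%=*/0.1,
+//                                  RecoveryTestHelper::kWALFileOffset,
+//                                  /*trunc=*/false);
+//   options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;
+//   ASSERT_OK(TryReopen(options));
+//   ASSERT_LE(RecoveryTestHelper::GetData(this), written);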
+
+class DBWALTestWithParams : public DBWALTestBase,
+ public ::testing::WithParamInterface<
+ std::tuple<bool, int, int, CompressionType>> {
+ public:
+ DBWALTestWithParams() : DBWALTestBase("/db_wal_test_with_params") {}
+};
+
+INSTANTIATE_TEST_CASE_P(
+ Wal, DBWALTestWithParams,
+ ::testing::Combine(::testing::Bool(), ::testing::Range(0, 4, 1),
+ ::testing::Range(RecoveryTestHelper::kWALFileOffset,
+ RecoveryTestHelper::kWALFileOffset +
+ RecoveryTestHelper::kWALFilesCount,
+ 1),
+ ::testing::Values(CompressionType::kNoCompression,
+ CompressionType::kZSTD)));
+
+class DBWALTestWithParamsVaryingRecoveryMode
+ : public DBWALTestBase,
+ public ::testing::WithParamInterface<
+ std::tuple<bool, int, int, WALRecoveryMode, CompressionType>> {
+ public:
+ DBWALTestWithParamsVaryingRecoveryMode()
+ : DBWALTestBase("/db_wal_test_with_params_mode") {}
+};
+
+INSTANTIATE_TEST_CASE_P(
+ Wal, DBWALTestWithParamsVaryingRecoveryMode,
+ ::testing::Combine(
+ ::testing::Bool(), ::testing::Range(0, 4, 1),
+ ::testing::Range(RecoveryTestHelper::kWALFileOffset,
+ RecoveryTestHelper::kWALFileOffset +
+ RecoveryTestHelper::kWALFilesCount,
+ 1),
+ ::testing::Values(WALRecoveryMode::kTolerateCorruptedTailRecords,
+ WALRecoveryMode::kAbsoluteConsistency,
+ WALRecoveryMode::kPointInTimeRecovery,
+ WALRecoveryMode::kSkipAnyCorruptedRecords),
+ ::testing::Values(CompressionType::kNoCompression,
+ CompressionType::kZSTD)));
+
+// Test scope:
+// - We expect to open the data store when there are incomplete trailing
+//   writes at the end of any of the logs
+// - We do not expect to open the data store when records are corrupted in
+//   place
+TEST_P(DBWALTestWithParams, kTolerateCorruptedTailRecords) {
+ bool trunc = std::get<0>(GetParam()); // Corruption style
+ // Corruption offset position
+ int corrupt_offset = std::get<1>(GetParam());
+ int wal_file_id = std::get<2>(GetParam()); // WAL file
+
+ // Fill data for testing
+ Options options = CurrentOptions();
+ const size_t row_count = RecoveryTestHelper::FillData(this, &options);
+ // Induce a checksum failure or parsing error
+ RecoveryTestHelper::CorruptWAL(this, options, corrupt_offset * .3,
+ /*len%=*/.1, wal_file_id, trunc);
+
+ options.wal_recovery_mode = WALRecoveryMode::kTolerateCorruptedTailRecords;
+ if (trunc) {
+ options.create_if_missing = false;
+ ASSERT_OK(TryReopen(options));
+ const size_t recovered_row_count = RecoveryTestHelper::GetData(this);
+ ASSERT_TRUE(corrupt_offset == 0 || recovered_row_count > 0);
+ ASSERT_LT(recovered_row_count, row_count);
+ } else {
+ ASSERT_NOK(TryReopen(options));
+ }
+}
+
+// Test scope:
+// We don't expect the data store to be opened if there is any corruption
+// (leading, middle, or trailing -- whether from incomplete writes or
+// in-place corruption)
+TEST_P(DBWALTestWithParams, kAbsoluteConsistency) {
+ // Verify clean slate behavior
+ Options options = CurrentOptions();
+ const size_t row_count = RecoveryTestHelper::FillData(this, &options);
+ options.create_if_missing = false;
+ ASSERT_OK(TryReopen(options));
+ ASSERT_EQ(RecoveryTestHelper::GetData(this), row_count);
+
+ bool trunc = std::get<0>(GetParam()); // Corruption style
+ // Corruption offset position
+ int corrupt_offset = std::get<1>(GetParam());
+ int wal_file_id = std::get<2>(GetParam()); // WAL file
+ // WAL compression type
+ CompressionType compression_type = std::get<3>(GetParam());
+ options.wal_compression = compression_type;
+
+ if (trunc && corrupt_offset == 0) {
+ return;
+ }
+
+ // Fill with new data
+ RecoveryTestHelper::FillData(this, &options);
+ // Corrupt the WAL
+ RecoveryTestHelper::CorruptWAL(this, options, corrupt_offset * .33,
+ /*len%=*/.1, wal_file_id, trunc);
+ // Verify
+ options.wal_recovery_mode = WALRecoveryMode::kAbsoluteConsistency;
+ options.create_if_missing = false;
+ ASSERT_NOK(TryReopen(options));
+}
+
+// Test scope:
+// We don't expect the data store to be opened if there is any inconsistency
+// between WAL and SST files
+TEST_F(DBWALTest, kPointInTimeRecoveryCFConsistency) {
+ Options options = CurrentOptions();
+ options.avoid_flush_during_recovery = true;
+
+ // Create DB with multiple column families.
+ CreateAndReopenWithCF({"one", "two"}, options);
+ ASSERT_OK(Put(1, "key1", "val1"));
+ ASSERT_OK(Put(2, "key2", "val2"));
+
+ // Record the offset at this point
+ Env* env = options.env;
+ uint64_t wal_file_id = dbfull()->TEST_LogfileNumber();
+ std::string fname = LogFileName(dbname_, wal_file_id);
+ uint64_t offset_to_corrupt;
+ ASSERT_OK(env->GetFileSize(fname, &offset_to_corrupt));
+ ASSERT_GT(offset_to_corrupt, 0);
+
+ ASSERT_OK(Put(1, "key3", "val3"));
+ // Corrupt WAL at location of key3
+ ASSERT_OK(test::CorruptFile(env, fname, static_cast<int>(offset_to_corrupt),
+ 4, false));
+ ASSERT_OK(Put(2, "key4", "val4"));
+ ASSERT_OK(Put(1, "key5", "val5"));
+ ASSERT_OK(Flush(2));
+
+ // PIT recovery & verify
+ options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;
+ ASSERT_NOK(TryReopenWithColumnFamilies({"default", "one", "two"}, options));
+}
+
+TEST_F(DBWALTest, RaceInstallFlushResultsWithWalObsoletion) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.track_and_verify_wals_in_manifest = true;
+ // The following make sure there are two bg flush threads.
+ options.max_background_jobs = 8;
+
+ DestroyAndReopen(options);
+
+ const std::string cf1_name("cf1");
+ CreateAndReopenWithCF({cf1_name}, options);
+ assert(handles_.size() == 2);
+
+ {
+ dbfull()->TEST_LockMutex();
+ ASSERT_LE(2, dbfull()->GetBGJobLimits().max_flushes);
+ dbfull()->TEST_UnlockMutex();
+ }
+
+ ASSERT_OK(dbfull()->PauseBackgroundWork());
+
+ ASSERT_OK(db_->Put(WriteOptions(), handles_[1], "foo", "value"));
+ ASSERT_OK(db_->Put(WriteOptions(), "foo", "value"));
+
+ ASSERT_OK(dbfull()->TEST_FlushMemTable(
+ /*wait=*/false, /*allow_write_stall=*/true, handles_[1]));
+
+ ASSERT_OK(db_->Put(WriteOptions(), "foo", "value"));
+
+ ASSERT_OK(dbfull()->TEST_FlushMemTable(
+ /*wait=*/false, /*allow_write_stall=*/true, handles_[0]));
+
+ bool called = false;
+ std::atomic<int> bg_flush_threads{0};
+ std::atomic<bool> wal_synced{false};
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCallFlush:start", [&](void* /*arg*/) {
+ int cur = bg_flush_threads.load();
+ int desired = cur + 1;
+ if (cur > 0 ||
+ !bg_flush_threads.compare_exchange_strong(cur, desired)) {
+ while (!wal_synced.load()) {
+ // Wait until the other bg flush thread finishes committing WAL sync
+ // operation to the MANIFEST.
+ }
+ }
+ });
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::FlushMemTableToOutputFile:CommitWal:1",
+ [&](void* /*arg*/) { wal_synced.store(true); });
+ // This callback will be called when the first bg flush thread reaches the
+ // point before entering the MANIFEST write queue after flushing the SST
+ // file.
+ // The purpose of the sync points here is to ensure both bg flush threads
+ // finish computing `min_wal_number_to_keep` before any of them updates the
+ // `log_number` for the column family that's being flushed.
+ SyncPoint::GetInstance()->SetCallBack(
+ "MemTableList::TryInstallMemtableFlushResults:AfterComputeMinWalToKeep",
+ [&](void* /*arg*/) {
+ dbfull()->mutex()->AssertHeld();
+ if (!called) {
+ // We are the first bg flush thread in the MANIFEST write queue.
+ // We set up the dependency between sync points for two threads that
+ // will be executing the same code.
+ // For the interleaving of events, see
+ // https://github.com/facebook/rocksdb/pull/9715.
+ // bg flush thread1 will release the db mutex while in the MANIFEST
+ // write queue. In the meantime, bg flush thread2 locks db mutex and
+ // computes the min_wal_number_to_keep (before thread1 writes to
+ // MANIFEST thus before cf1->log_number is updated). Bg thread2 joins
+ // the MANIFEST write queue afterwards and bg flush thread1 proceeds
+ // with writing to MANIFEST.
+ called = true;
+ SyncPoint::GetInstance()->LoadDependency({
+ {"VersionSet::LogAndApply:WriteManifestStart",
+ "DBWALTest::RaceInstallFlushResultsWithWalObsoletion:BgFlush2"},
+ {"DBWALTest::RaceInstallFlushResultsWithWalObsoletion:BgFlush2",
+ "VersionSet::LogAndApply:WriteManifest"},
+ });
+ } else {
+ // The other bg flush thread has already been in the MANIFEST write
+ // queue, and we are after.
+ TEST_SYNC_POINT(
+ "DBWALTest::RaceInstallFlushResultsWithWalObsoletion:BgFlush2");
+ }
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(dbfull()->ContinueBackgroundWork());
+
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[0]));
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1]));
+
+ ASSERT_TRUE(called);
+
+ Close();
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+
+ DB* db1 = nullptr;
+ Status s = DB::OpenForReadOnly(options, dbname_, &db1);
+ ASSERT_OK(s);
+ assert(db1);
+ delete db1;
+}
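+
+// General shape of the sync-point pattern used above (illustrative only;
+// "PointA"/"PointB" are placeholder names, not real sync points):
+//
+//   SyncPoint::GetInstance()->LoadDependency(
+//       {{"PointA", "PointB"}});  // "PointB" waits until "PointA" is reached
+//   SyncPoint::GetInstance()->EnableProcessing();
+//   // ... start the threads under test ...
+//   TEST_SYNC_POINT("PointB");  // blocks until some thread passes "PointA"
+//   SyncPoint::GetInstance()->DisableProcessing();
+//   SyncPoint::GetInstance()->ClearAllCallBacks();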
+
+// Test scope:
+// - We expect to open the data store under all circumstances
+// - We expect to recover only data up to the point where the first error was
+//   encountered
+TEST_P(DBWALTestWithParams, kPointInTimeRecovery) {
+ const int maxkeys =
+ RecoveryTestHelper::kWALFilesCount * RecoveryTestHelper::kKeysPerWALFile;
+
+ bool trunc = std::get<0>(GetParam()); // Corruption style
+ // Corruption offset position
+ int corrupt_offset = std::get<1>(GetParam());
+ int wal_file_id = std::get<2>(GetParam()); // WAL file
+ // WAL compression type
+ CompressionType compression_type = std::get<3>(GetParam());
+
+ // Fill data for testing
+ Options options = CurrentOptions();
+ options.wal_compression = compression_type;
+ const size_t row_count = RecoveryTestHelper::FillData(this, &options);
+
+ // Corrupt the WAL
+ // The offset was previously 0.3, which cut off right at the end of a valid
+ // fragment once the WAL ZSTD compression checksum was enabled, so the value
+ // was changed to 0.33.
+ RecoveryTestHelper::CorruptWAL(this, options, corrupt_offset * .33,
+ /*len%=*/.1, wal_file_id, trunc);
+
+ // Verify
+ options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;
+ options.create_if_missing = false;
+ ASSERT_OK(TryReopen(options));
+
+ // Probe data for invariants
+ size_t recovered_row_count = RecoveryTestHelper::GetData(this);
+ ASSERT_LT(recovered_row_count, row_count);
+
+ // Verify that a prefix of the keys was recovered, except in the case of
+ // full WAL truncation: we have no way to know a corruption occurred when
+ // truncation happened on record boundaries (preventing recovery holes in
+ // that case requires using `track_and_verify_wals_in_manifest`).
+ if (!trunc || corrupt_offset != 0) {
+ bool expect_data = true;
+ for (size_t k = 0; k < maxkeys; ++k) {
+ bool found = Get("key" + std::to_string(k)) != "NOT_FOUND";
+ if (expect_data && !found) {
+ expect_data = false;
+ }
+ ASSERT_EQ(found, expect_data);
+ }
+ }
+
+ const size_t min = RecoveryTestHelper::kKeysPerWALFile *
+ (wal_file_id - RecoveryTestHelper::kWALFileOffset);
+ ASSERT_GE(recovered_row_count, min);
+ if (!trunc && corrupt_offset != 0) {
+ const size_t max = RecoveryTestHelper::kKeysPerWALFile *
+ (wal_file_id - RecoveryTestHelper::kWALFileOffset + 1);
+ ASSERT_LE(recovered_row_count, max);
+ }
+}
+
+// Test scope:
+// - We expect to open the data store under all scenarios
+// - We expect to have recovered records past the corruption zone
+TEST_P(DBWALTestWithParams, kSkipAnyCorruptedRecords) {
+ bool trunc = std::get<0>(GetParam()); // Corruption style
+ // Corruption offset position
+ int corrupt_offset = std::get<1>(GetParam());
+ int wal_file_id = std::get<2>(GetParam()); // WAL file
+ // WAL compression type
+ CompressionType compression_type = std::get<3>(GetParam());
+
+ // Fill data for testing
+ Options options = CurrentOptions();
+ options.wal_compression = compression_type;
+ const size_t row_count = RecoveryTestHelper::FillData(this, &options);
+
+ // Corrupt the WAL
+ RecoveryTestHelper::CorruptWAL(this, options, corrupt_offset * .3,
+ /*len%=*/.1, wal_file_id, trunc);
+
+ // Verify behavior
+ options.wal_recovery_mode = WALRecoveryMode::kSkipAnyCorruptedRecords;
+ options.create_if_missing = false;
+ ASSERT_OK(TryReopen(options));
+
+ // Probe data for invariants
+ size_t recovered_row_count = RecoveryTestHelper::GetData(this);
+ ASSERT_LT(recovered_row_count, row_count);
+
+ if (!trunc) {
+ ASSERT_TRUE(corrupt_offset != 0 || recovered_row_count > 0);
+ }
+}
+
+TEST_F(DBWALTest, AvoidFlushDuringRecovery) {
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.avoid_flush_during_recovery = false;
+
+ // Test with flush after recovery.
+ Reopen(options);
+ ASSERT_OK(Put("foo", "v1"));
+ ASSERT_OK(Put("bar", "v2"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("foo", "v3"));
+ ASSERT_OK(Put("bar", "v4"));
+ ASSERT_EQ(1, TotalTableFiles());
+ // Reopen DB. Check that WAL logs were flushed.
+ Reopen(options);
+ ASSERT_EQ("v3", Get("foo"));
+ ASSERT_EQ("v4", Get("bar"));
+ ASSERT_EQ(2, TotalTableFiles());
+
+ // Test without flush after recovery.
+ options.avoid_flush_during_recovery = true;
+ DestroyAndReopen(options);
+ ASSERT_OK(Put("foo", "v5"));
+ ASSERT_OK(Put("bar", "v6"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("foo", "v7"));
+ ASSERT_OK(Put("bar", "v8"));
+ ASSERT_EQ(1, TotalTableFiles());
+ // Reopen DB. WAL logs should not be flushed this time.
+ Reopen(options);
+ ASSERT_EQ("v7", Get("foo"));
+ ASSERT_EQ("v8", Get("bar"));
+ ASSERT_EQ(1, TotalTableFiles());
+
+ // Force flush with allow_2pc.
+ options.avoid_flush_during_recovery = true;
+ options.allow_2pc = true;
+ ASSERT_OK(Put("foo", "v9"));
+ ASSERT_OK(Put("bar", "v10"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("foo", "v11"));
+ ASSERT_OK(Put("bar", "v12"));
+ Reopen(options);
+ ASSERT_EQ("v11", Get("foo"));
+ ASSERT_EQ("v12", Get("bar"));
+ ASSERT_EQ(3, TotalTableFiles());
+}
+
+TEST_F(DBWALTest, WalCleanupAfterAvoidFlushDuringRecovery) {
+ // Verifies that WAL files that were present during recovery, but not
+ // flushed due to avoid_flush_during_recovery, are considered for deletion
+ // at a later stage. We check that at least one such file is deleted during
+ // Flush().
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.avoid_flush_during_recovery = true;
+ Reopen(options);
+
+ ASSERT_OK(Put("foo", "v1"));
+ Reopen(options);
+ for (int i = 0; i < 2; ++i) {
+ if (i > 0) {
+ // Flush() triggers deletion of obsolete tracked files
+ ASSERT_OK(Flush());
+ }
+ VectorLogPtr log_files;
+ ASSERT_OK(dbfull()->GetSortedWalFiles(log_files));
+ if (i == 0) {
+ ASSERT_GT(log_files.size(), 0);
+ } else {
+ ASSERT_EQ(0, log_files.size());
+ }
+ }
+}
+
+TEST_F(DBWALTest, RecoverWithoutFlush) {
+ Options options = CurrentOptions();
+ options.avoid_flush_during_recovery = true;
+ options.create_if_missing = false;
+ options.disable_auto_compactions = true;
+ options.write_buffer_size = 64 * 1024 * 1024;
+
+ size_t count = RecoveryTestHelper::FillData(this, &options);
+ auto validateData = [this, count]() {
+ for (size_t i = 0; i < count; i++) {
+ ASSERT_NE(Get("key" + std::to_string(i)), "NOT_FOUND");
+ }
+ };
+ Reopen(options);
+ validateData();
+ // Insert some data without flush
+ ASSERT_OK(Put("foo", "foo_v1"));
+ ASSERT_OK(Put("bar", "bar_v1"));
+ Reopen(options);
+ validateData();
+ ASSERT_EQ(Get("foo"), "foo_v1");
+ ASSERT_EQ(Get("bar"), "bar_v1");
+ // Insert again and reopen
+ ASSERT_OK(Put("foo", "foo_v2"));
+ ASSERT_OK(Put("bar", "bar_v2"));
+ Reopen(options);
+ validateData();
+ ASSERT_EQ(Get("foo"), "foo_v2");
+ ASSERT_EQ(Get("bar"), "bar_v2");
+ // manual flush and insert again
+ ASSERT_OK(Flush());
+ ASSERT_EQ(Get("foo"), "foo_v2");
+ ASSERT_EQ(Get("bar"), "bar_v2");
+ ASSERT_OK(Put("foo", "foo_v3"));
+ ASSERT_OK(Put("bar", "bar_v3"));
+ Reopen(options);
+ validateData();
+ ASSERT_EQ(Get("foo"), "foo_v3");
+ ASSERT_EQ(Get("bar"), "bar_v3");
+}
+
+TEST_F(DBWALTest, RecoverWithoutFlushMultipleCF) {
+ const std::string kSmallValue = "v";
+ const std::string kLargeValue = DummyString(1024);
+ Options options = CurrentOptions();
+ options.avoid_flush_during_recovery = true;
+ options.create_if_missing = false;
+ options.disable_auto_compactions = true;
+
+ auto countWalFiles = [this]() {
+ VectorLogPtr log_files;
+ if (!dbfull()->GetSortedWalFiles(log_files).ok()) {
+ return size_t{0};
+ }
+ return log_files.size();
+ };
+
+ // Create DB with multiple column families and multiple log files.
+ CreateAndReopenWithCF({"one", "two"}, options);
+ ASSERT_OK(Put(0, "key1", kSmallValue));
+ ASSERT_OK(Put(1, "key2", kLargeValue));
+ ASSERT_OK(Flush(1));
+ ASSERT_EQ(1, countWalFiles());
+ ASSERT_OK(Put(0, "key3", kSmallValue));
+ ASSERT_OK(Put(2, "key4", kLargeValue));
+ ASSERT_OK(Flush(2));
+ ASSERT_EQ(2, countWalFiles());
+
+ // Reopen, insert and flush.
+ options.db_write_buffer_size = 64 * 1024 * 1024;
+ ReopenWithColumnFamilies({"default", "one", "two"}, options);
+ ASSERT_EQ(Get(0, "key1"), kSmallValue);
+ ASSERT_EQ(Get(1, "key2"), kLargeValue);
+ ASSERT_EQ(Get(0, "key3"), kSmallValue);
+ ASSERT_EQ(Get(2, "key4"), kLargeValue);
+ // Insert more data.
+ ASSERT_OK(Put(0, "key5", kLargeValue));
+ ASSERT_OK(Put(1, "key6", kLargeValue));
+ ASSERT_EQ(3, countWalFiles());
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(Put(2, "key7", kLargeValue));
+ ASSERT_OK(dbfull()->FlushWAL(false));
+ ASSERT_EQ(4, countWalFiles());
+
+ // Reopen twice and validate.
+ for (int i = 0; i < 2; i++) {
+ ReopenWithColumnFamilies({"default", "one", "two"}, options);
+ ASSERT_EQ(Get(0, "key1"), kSmallValue);
+ ASSERT_EQ(Get(1, "key2"), kLargeValue);
+ ASSERT_EQ(Get(0, "key3"), kSmallValue);
+ ASSERT_EQ(Get(2, "key4"), kLargeValue);
+ ASSERT_EQ(Get(0, "key5"), kLargeValue);
+ ASSERT_EQ(Get(1, "key6"), kLargeValue);
+ ASSERT_EQ(Get(2, "key7"), kLargeValue);
+ ASSERT_EQ(4, countWalFiles());
+ }
+}
+
+// In this test we are trying to do the following:
+// 1. Create a DB with a corrupted WAL;
+// 2. Open it with avoid_flush_during_recovery = true;
+// 3. Append more data without flushing, which creates a new WAL;
+// 4. Open again and see if it correctly handles the previous corruption.
+TEST_P(DBWALTestWithParamsVaryingRecoveryMode,
+ RecoverFromCorruptedWALWithoutFlush) {
+ const int kAppendKeys = 100;
+ Options options = CurrentOptions();
+ options.avoid_flush_during_recovery = true;
+ options.create_if_missing = false;
+ options.disable_auto_compactions = true;
+ options.write_buffer_size = 64 * 1024 * 1024;
+
+ auto getAll = [this]() {
+ std::vector<std::pair<std::string, std::string>> data;
+ ReadOptions ropt;
+ Iterator* iter = dbfull()->NewIterator(ropt);
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ data.push_back(
+ std::make_pair(iter->key().ToString(), iter->value().ToString()));
+ }
+ delete iter;
+ return data;
+ };
+
+ bool trunc = std::get<0>(GetParam()); // Corruption style
+ // Corruption offset position
+ int corrupt_offset = std::get<1>(GetParam());
+ int wal_file_id = std::get<2>(GetParam()); // WAL file
+ WALRecoveryMode recovery_mode = std::get<3>(GetParam());
+ // WAL compression type
+ CompressionType compression_type = std::get<4>(GetParam());
+
+ options.wal_recovery_mode = recovery_mode;
+ options.wal_compression = compression_type;
+ // Create corrupted WAL
+ RecoveryTestHelper::FillData(this, &options);
+ RecoveryTestHelper::CorruptWAL(this, options, corrupt_offset * .3,
+ /*len%=*/.1, wal_file_id, trunc);
+ // Skip the test if DB won't open.
+ if (!TryReopen(options).ok()) {
+ ASSERT_TRUE(options.wal_recovery_mode ==
+ WALRecoveryMode::kAbsoluteConsistency ||
+ (!trunc && options.wal_recovery_mode ==
+ WALRecoveryMode::kTolerateCorruptedTailRecords));
+ return;
+ }
+ ASSERT_OK(TryReopen(options));
+ // Append some more data.
+ for (int k = 0; k < kAppendKeys; k++) {
+ std::string key = "extra_key" + std::to_string(k);
+ std::string value = DummyString(RecoveryTestHelper::kValueSize);
+ ASSERT_OK(Put(key, value));
+ }
+ // Save data for comparison.
+ auto data = getAll();
+ // Reopen. Verify data.
+ ASSERT_OK(TryReopen(options));
+ auto actual_data = getAll();
+ ASSERT_EQ(data, actual_data);
+}
+
+// Tests that total log size is recovered if we set
+// avoid_flush_during_recovery=true.
+// Flush should trigger if max_total_wal_size is reached.
+TEST_F(DBWALTest, RestoreTotalLogSizeAfterRecoverWithoutFlush) {
+ auto test_listener = std::make_shared<FlushCounterListener>();
+ test_listener->expected_flush_reason = FlushReason::kWalFull;
+
+ constexpr size_t kKB = 1024;
+ constexpr size_t kMB = 1024 * 1024;
+ Options options = CurrentOptions();
+ options.avoid_flush_during_recovery = true;
+ options.max_total_wal_size = 1 * kMB;
+ options.listeners.push_back(test_listener);
+ // Have to open DB in multi-CF mode to trigger flush when
+ // max_total_wal_size is reached.
+ CreateAndReopenWithCF({"one"}, options);
+ // Write some keys so that we end up with one log file slightly smaller
+ // than 1MB.
+ std::string value_100k(100 * kKB, 'v');
+ std::string value_300k(300 * kKB, 'v');
+ ASSERT_OK(Put(0, "foo", "v1"));
+ for (int i = 0; i < 9; i++) {
+ ASSERT_OK(Put(1, "key" + std::to_string(i), value_100k));
+ }
+ // Get log files before reopen.
+ VectorLogPtr log_files_before;
+ ASSERT_OK(dbfull()->GetSortedWalFiles(log_files_before));
+ ASSERT_EQ(1, log_files_before.size());
+ uint64_t log_size_before = log_files_before[0]->SizeFileBytes();
+ ASSERT_GT(log_size_before, 900 * kKB);
+ ASSERT_LT(log_size_before, 1 * kMB);
+ ReopenWithColumnFamilies({"default", "one"}, options);
+ // Write one more value to make log larger than 1MB.
+ ASSERT_OK(Put(1, "bar", value_300k));
+ // Get log files again. A new log file will be opened.
+ VectorLogPtr log_files_after_reopen;
+ ASSERT_OK(dbfull()->GetSortedWalFiles(log_files_after_reopen));
+ ASSERT_EQ(2, log_files_after_reopen.size());
+ ASSERT_EQ(log_files_before[0]->LogNumber(),
+ log_files_after_reopen[0]->LogNumber());
+ ASSERT_GT(log_files_after_reopen[0]->SizeFileBytes() +
+ log_files_after_reopen[1]->SizeFileBytes(),
+ 1 * kMB);
+ // Write one more key to trigger flush.
+ ASSERT_OK(Put(0, "foo", "v2"));
+ for (auto* h : handles_) {
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(h));
+ }
+ // Flushed two column families.
+ ASSERT_EQ(2, test_listener->count.load());
+}
+
+#if defined(ROCKSDB_PLATFORM_POSIX)
+#if defined(ROCKSDB_FALLOCATE_PRESENT)
+// Tests that we will truncate the preallocated space of the last log left
+// over from the previous DB session.
+TEST_F(DBWALTest, TruncateLastLogAfterRecoverWithoutFlush) {
+ constexpr size_t kKB = 1024;
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.avoid_flush_during_recovery = true;
+ if (mem_env_) {
+ ROCKSDB_GTEST_SKIP("Test requires non-mem environment");
+ return;
+ }
+ if (!IsFallocateSupported()) {
+ return;
+ }
+
+ DestroyAndReopen(options);
+ size_t preallocated_size =
+ dbfull()->TEST_GetWalPreallocateBlockSize(options.write_buffer_size);
+ ASSERT_OK(Put("foo", "v1"));
+ VectorLogPtr log_files_before;
+ ASSERT_OK(dbfull()->GetSortedWalFiles(log_files_before));
+ ASSERT_EQ(1, log_files_before.size());
+ auto& file_before = log_files_before[0];
+ ASSERT_LT(file_before->SizeFileBytes(), 1 * kKB);
+ // The log file has preallocated space.
+ ASSERT_GE(GetAllocatedFileSize(dbname_ + file_before->PathName()),
+ preallocated_size);
+ Reopen(options);
+ VectorLogPtr log_files_after;
+ ASSERT_OK(dbfull()->GetSortedWalFiles(log_files_after));
+ ASSERT_EQ(1, log_files_after.size());
+ ASSERT_LT(log_files_after[0]->SizeFileBytes(), 1 * kKB);
+ // The preallocated space should be truncated.
+ ASSERT_LT(GetAllocatedFileSize(dbname_ + file_before->PathName()),
+ preallocated_size);
+}
+// Tests that we will truncate the preallocated space of the last log left
+// over from the previous DB session.
+TEST_F(DBWALTest, TruncateLastLogAfterRecoverWithFlush) {
+ constexpr size_t kKB = 1024;
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.avoid_flush_during_recovery = false;
+ options.avoid_flush_during_shutdown = true;
+ if (mem_env_) {
+ ROCKSDB_GTEST_SKIP("Test requires non-mem environment");
+ return;
+ }
+ if (!IsFallocateSupported()) {
+ return;
+ }
+
+ DestroyAndReopen(options);
+ size_t preallocated_size =
+ dbfull()->TEST_GetWalPreallocateBlockSize(options.write_buffer_size);
+ ASSERT_OK(Put("foo", "v1"));
+ VectorLogPtr log_files_before;
+ ASSERT_OK(dbfull()->GetSortedWalFiles(log_files_before));
+ ASSERT_EQ(1, log_files_before.size());
+ auto& file_before = log_files_before[0];
+ ASSERT_LT(file_before->SizeFileBytes(), 1 * kKB);
+ // The log file has preallocated space.
+ ASSERT_GE(GetAllocatedFileSize(dbname_ + file_before->PathName()),
+ preallocated_size);
+ Close();
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::PurgeObsoleteFiles:Begin",
+ "DBWALTest::TruncateLastLogAfterRecoverWithFlush:AfterRecover"},
+ {"DBWALTest::TruncateLastLogAfterRecoverWithFlush:AfterTruncate",
+ "DBImpl::DeleteObsoleteFileImpl::BeforeDeletion"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ port::Thread reopen_thread([&]() { Reopen(options); });
+
+ TEST_SYNC_POINT(
+ "DBWALTest::TruncateLastLogAfterRecoverWithFlush:AfterRecover");
+ // After the flush during Open, the log file should get deleted. However,
+ // if the process is in a crash loop, the log file may not get
+ // deleted and the preallocated space will keep accumulating. So we need
+ // to ensure it gets truncated.
+ EXPECT_LT(GetAllocatedFileSize(dbname_ + file_before->PathName()),
+ preallocated_size);
+ TEST_SYNC_POINT(
+ "DBWALTest::TruncateLastLogAfterRecoverWithFlush:AfterTruncate");
+ reopen_thread.join();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBWALTest, TruncateLastLogAfterRecoverWALEmpty) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.avoid_flush_during_recovery = false;
+ if (mem_env_ || encrypted_env_) {
+ ROCKSDB_GTEST_SKIP("Test requires non-mem/non-encrypted environment");
+ return;
+ }
+ if (!IsFallocateSupported()) {
+ return;
+ }
+
+ DestroyAndReopen(options);
+ size_t preallocated_size =
+ dbfull()->TEST_GetWalPreallocateBlockSize(options.write_buffer_size);
+ Close();
+ std::vector<std::string> filenames;
+ std::string last_log;
+ uint64_t last_log_num = 0;
+ ASSERT_OK(env_->GetChildren(dbname_, &filenames));
+ for (auto fname : filenames) {
+ uint64_t number;
+ FileType type;
+ if (ParseFileName(fname, &number, &type, nullptr)) {
+ if (type == kWalFile && number > last_log_num) {
+ last_log = fname;
+ }
+ }
+ }
+ ASSERT_NE(last_log, "");
+ last_log = dbname_ + '/' + last_log;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::PurgeObsoleteFiles:Begin",
+ "DBWALTest::TruncateLastLogAfterRecoverWithFlush:AfterRecover"},
+ {"DBWALTest::TruncateLastLogAfterRecoverWithFlush:AfterTruncate",
+ "DBImpl::DeleteObsoleteFileImpl::BeforeDeletion"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "PosixWritableFile::Close",
+ [](void* arg) { *(reinterpret_cast<size_t*>(arg)) = 0; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ // Preallocate space for the empty log file. This could happen if WAL data
+ // was buffered in memory and the process crashed.
+ std::unique_ptr<WritableFile> log_file;
+ ASSERT_OK(env_->ReopenWritableFile(last_log, &log_file, EnvOptions()));
+ log_file->SetPreallocationBlockSize(preallocated_size);
+ log_file->PrepareWrite(0, 4096);
+ log_file.reset();
+
+ ASSERT_GE(GetAllocatedFileSize(last_log), preallocated_size);
+
+ port::Thread reopen_thread([&]() { Reopen(options); });
+
+ TEST_SYNC_POINT(
+ "DBWALTest::TruncateLastLogAfterRecoverWithFlush:AfterRecover");
+ // The preallocated space should be truncated.
+ EXPECT_LT(GetAllocatedFileSize(last_log), preallocated_size);
+ TEST_SYNC_POINT(
+ "DBWALTest::TruncateLastLogAfterRecoverWithFlush:AfterTruncate");
+ reopen_thread.join();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_F(DBWALTest, ReadOnlyRecoveryNoTruncate) {
+ constexpr size_t kKB = 1024;
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.avoid_flush_during_recovery = true;
+ if (mem_env_) {
+ ROCKSDB_GTEST_SKIP("Test requires non-mem environment");
+ return;
+ }
+ if (!IsFallocateSupported()) {
+ return;
+ }
+
+ // Create the DB and close it with file truncation disabled.
+ std::atomic_bool enable_truncate{false};
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "PosixWritableFile::Close", [&](void* arg) {
+ if (!enable_truncate) {
+ *(reinterpret_cast<size_t*>(arg)) = 0;
+ }
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ DestroyAndReopen(options);
+ size_t preallocated_size =
+ dbfull()->TEST_GetWalPreallocateBlockSize(options.write_buffer_size);
+ ASSERT_OK(Put("foo", "v1"));
+ VectorLogPtr log_files_before;
+ ASSERT_OK(dbfull()->GetSortedWalFiles(log_files_before));
+ ASSERT_EQ(1, log_files_before.size());
+ auto& file_before = log_files_before[0];
+ ASSERT_LT(file_before->SizeFileBytes(), 1 * kKB);
+ // The log file has preallocated space.
+ auto db_size = GetAllocatedFileSize(dbname_ + file_before->PathName());
+ ASSERT_GE(db_size, preallocated_size);
+ Close();
+
+ // Enable truncation and open the DB as read-only; the file should not be
+ // truncated and the DB size should not change.
+ enable_truncate = true;
+ ASSERT_OK(ReadOnlyReopen(options));
+ VectorLogPtr log_files_after;
+ ASSERT_OK(dbfull()->GetSortedWalFiles(log_files_after));
+ ASSERT_EQ(1, log_files_after.size());
+ ASSERT_LT(log_files_after[0]->SizeFileBytes(), 1 * kKB);
+ ASSERT_EQ(log_files_after[0]->PathName(), file_before->PathName());
+ // The preallocated space should NOT be truncated;
+ // the DB size should be almost the same.
+ ASSERT_NEAR(GetAllocatedFileSize(dbname_ + file_before->PathName()), db_size,
+ db_size / 100);
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+#endif // ROCKSDB_FALLOCATE_PRESENT
+#endif // ROCKSDB_PLATFORM_POSIX
+
+TEST_F(DBWALTest, WalInManifestButNotInSortedWals) {
+ Options options = CurrentOptions();
+ options.track_and_verify_wals_in_manifest = true;
+ options.wal_recovery_mode = WALRecoveryMode::kAbsoluteConsistency;
+
+ // Build a way to make WAL files selectively go missing.
+ bool wals_go_missing = false;
+ struct MissingWalFs : public FileSystemWrapper {
+ MissingWalFs(const std::shared_ptr<FileSystem>& t,
+ bool* _wals_go_missing_flag)
+ : FileSystemWrapper(t), wals_go_missing_flag(_wals_go_missing_flag) {}
+ bool* wals_go_missing_flag;
+ IOStatus GetChildren(const std::string& dir, const IOOptions& io_opts,
+ std::vector<std::string>* r,
+ IODebugContext* dbg) override {
+ IOStatus s = target_->GetChildren(dir, io_opts, r, dbg);
+ if (s.ok() && *wals_go_missing_flag) {
+ for (size_t i = 0; i < r->size();) {
+ if (EndsWith(r->at(i), ".log")) {
+ r->erase(r->begin() + i);
+ } else {
+ ++i;
+ }
+ }
+ }
+ return s;
+ }
+ const char* Name() const override { return "MissingWalFs"; }
+ };
+ auto my_fs =
+ std::make_shared<MissingWalFs>(env_->GetFileSystem(), &wals_go_missing);
+ std::unique_ptr<Env> my_env(NewCompositeEnv(my_fs));
+ options.env = my_env.get();
+
+ CreateAndReopenWithCF({"blah"}, options);
+
+ // Currently necessary to get a WAL tracked in manifest; see
+ // https://github.com/facebook/rocksdb/issues/10080
+ ASSERT_OK(Put(0, "x", "y"));
+ ASSERT_OK(db_->SyncWAL());
+ ASSERT_OK(Put(1, "x", "y"));
+ ASSERT_OK(db_->SyncWAL());
+ ASSERT_OK(Flush(1));
+
+ ASSERT_FALSE(dbfull()->GetVersionSet()->GetWalSet().GetWals().empty());
+ std::vector<std::unique_ptr<LogFile>> wals;
+ ASSERT_OK(db_->GetSortedWalFiles(wals));
+ wals_go_missing = true;
+ ASSERT_NOK(db_->GetSortedWalFiles(wals));
+ wals_go_missing = false;
+ Close();
+}
+
+#endif // ROCKSDB_LITE
+
+TEST_F(DBWALTest, WalTermTest) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ ASSERT_OK(Put(1, "foo", "bar"));
+
+ WriteOptions wo;
+ wo.sync = true;
+ wo.disableWAL = false;
+
+ WriteBatch batch;
+ ASSERT_OK(batch.Put("foo", "bar"));
+ batch.MarkWalTerminationPoint();
+ ASSERT_OK(batch.Put("foo2", "bar2"));
+
+ ASSERT_OK(dbfull()->Write(wo, &batch));
+
+ // make sure we can re-open it.
+ ASSERT_OK(TryReopenWithColumnFamilies({"default", "pikachu"}, options));
+ ASSERT_EQ("bar", Get(1, "foo"));
+ ASSERT_EQ("NOT_FOUND", Get(1, "foo2"));
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBWALTest, GetCompressedWalsAfterSync) {
+ if (db_->GetOptions().wal_compression == kNoCompression) {
+ ROCKSDB_GTEST_BYPASS("stream compression not present");
+ return;
+ }
+ Options options = GetDefaultOptions();
+ options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;
+ options.create_if_missing = true;
+ options.env = env_;
+ options.avoid_flush_during_recovery = true;
+ options.track_and_verify_wals_in_manifest = true;
+ // Enable WAL compression so that the newly-created WAL will be non-empty
+ // after DB open, even if point-in-time WAL recovery encounters no
+ // corruption.
+ options.wal_compression = kZSTD;
+ DestroyAndReopen(options);
+
+ // Write something to memtable and WAL so that log_empty_ will be false after
+ // next DB::Open().
+ ASSERT_OK(Put("a", "v"));
+
+ Reopen(options);
+
+ // New WAL is created, thanks to !log_empty_.
+ ASSERT_OK(dbfull()->TEST_SwitchWAL());
+
+ ASSERT_OK(Put("b", "v"));
+
+ ASSERT_OK(db_->SyncWAL());
+
+ VectorLogPtr wals;
+ Status s = dbfull()->GetSortedWalFiles(wals);
+ ASSERT_OK(s);
+}
+#endif // ROCKSDB_LITE
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_with_timestamp_basic_test.cc b/src/rocksdb/db/db_with_timestamp_basic_test.cc
new file mode 100644
index 000000000..6ea1aaf46
--- /dev/null
+++ b/src/rocksdb/db/db_with_timestamp_basic_test.cc
@@ -0,0 +1,3880 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/db_with_timestamp_test_util.h"
+#include "port/stack_trace.h"
+#include "rocksdb/perf_context.h"
+#include "rocksdb/utilities/debug.h"
+#include "table/block_based/block_based_table_reader.h"
+#include "table/block_based/block_builder.h"
+#if !defined(ROCKSDB_LITE)
+#include "test_util/sync_point.h"
+#endif
+#include "test_util/testutil.h"
+#include "utilities/fault_injection_env.h"
+#include "utilities/merge_operators/string_append/stringappend2.h"
+
+namespace ROCKSDB_NAMESPACE {
+class DBBasicTestWithTimestamp : public DBBasicTestWithTimestampBase {
+ public:
+ DBBasicTestWithTimestamp()
+ : DBBasicTestWithTimestampBase("db_basic_test_with_timestamp") {}
+};
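+
+// A minimal sketch of the user-defined timestamp read/write pattern exercised
+// throughout this file (illustrative only; db_ and Timestamp() come from the
+// test fixture, and the column family must use a timestamp-aware comparator):
+//
+//   std::string write_ts = Timestamp(1, 0);
+//   ASSERT_OK(db_->Put(WriteOptions(), "key", write_ts, "value"));
+//
+//   std::string read_ts_str = Timestamp(2, 0);
+//   Slice read_ts = read_ts_str;
+//   ReadOptions read_opts;
+//   read_opts.timestamp = &read_ts;  // read the newest version with ts <= 2
+//   std::string value;
+//   ASSERT_OK(db_->Get(read_opts, "key", &value));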
+
+TEST_F(DBBasicTestWithTimestamp, SanityChecks) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ options.avoid_flush_during_shutdown = true;
+ options.merge_operator = MergeOperators::CreateStringAppendTESTOperator();
+ DestroyAndReopen(options);
+
+ Options options1 = CurrentOptions();
+ options1.env = env_;
+ options1.comparator = test::BytewiseComparatorWithU64TsWrapper();
+ options1.merge_operator = MergeOperators::CreateStringAppendTESTOperator();
+ assert(options1.comparator &&
+ options1.comparator->timestamp_size() == sizeof(uint64_t));
+ ColumnFamilyHandle* handle = nullptr;
+ Status s = db_->CreateColumnFamily(options1, "data", &handle);
+ ASSERT_OK(s);
+
+ std::string dummy_ts(sizeof(uint64_t), '\0');
+ // Perform timestamp operations on default cf.
+ ASSERT_TRUE(
+ db_->Put(WriteOptions(), "key", dummy_ts, "value").IsInvalidArgument());
+ ASSERT_TRUE(db_->Merge(WriteOptions(), db_->DefaultColumnFamily(), "key",
+ dummy_ts, "value")
+ .IsInvalidArgument());
+ ASSERT_TRUE(db_->Delete(WriteOptions(), "key", dummy_ts).IsInvalidArgument());
+ ASSERT_TRUE(
+ db_->SingleDelete(WriteOptions(), "key", dummy_ts).IsInvalidArgument());
+ ASSERT_TRUE(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ "begin_key", "end_key", dummy_ts)
+ .IsInvalidArgument());
+
+ // Perform non-timestamp operations on "data" cf.
+ ASSERT_TRUE(
+ db_->Put(WriteOptions(), handle, "key", "value").IsInvalidArgument());
+ ASSERT_TRUE(db_->Delete(WriteOptions(), handle, "key").IsInvalidArgument());
+ ASSERT_TRUE(
+ db_->SingleDelete(WriteOptions(), handle, "key").IsInvalidArgument());
+
+ ASSERT_TRUE(
+ db_->Merge(WriteOptions(), handle, "key", "value").IsInvalidArgument());
+ ASSERT_TRUE(db_->DeleteRange(WriteOptions(), handle, "begin_key", "end_key")
+ .IsInvalidArgument());
+
+ {
+ WriteBatch wb;
+ ASSERT_OK(wb.Put(handle, "key", "value"));
+ ASSERT_TRUE(db_->Write(WriteOptions(), &wb).IsInvalidArgument());
+ }
+ {
+ WriteBatch wb;
+ ASSERT_OK(wb.Delete(handle, "key"));
+ ASSERT_TRUE(db_->Write(WriteOptions(), &wb).IsInvalidArgument());
+ }
+ {
+ WriteBatch wb;
+ ASSERT_OK(wb.SingleDelete(handle, "key"));
+ ASSERT_TRUE(db_->Write(WriteOptions(), &wb).IsInvalidArgument());
+ }
+ {
+ WriteBatch wb;
+ ASSERT_OK(wb.DeleteRange(handle, "begin_key", "end_key"));
+ ASSERT_TRUE(db_->Write(WriteOptions(), &wb).IsInvalidArgument());
+ }
+
+ // Perform timestamp operations with timestamps of incorrect size.
+ const std::string wrong_ts(sizeof(uint32_t), '\0');
+ ASSERT_TRUE(db_->Put(WriteOptions(), handle, "key", wrong_ts, "value")
+ .IsInvalidArgument());
+ ASSERT_TRUE(db_->Merge(WriteOptions(), handle, "key", wrong_ts, "value")
+ .IsInvalidArgument());
+ ASSERT_TRUE(
+ db_->Delete(WriteOptions(), handle, "key", wrong_ts).IsInvalidArgument());
+ ASSERT_TRUE(db_->SingleDelete(WriteOptions(), handle, "key", wrong_ts)
+ .IsInvalidArgument());
+ ASSERT_TRUE(
+ db_->DeleteRange(WriteOptions(), handle, "begin_key", "end_key", wrong_ts)
+ .IsInvalidArgument());
+
+ delete handle;
+}
+
+TEST_F(DBBasicTestWithTimestamp, MixedCfs) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ options.avoid_flush_during_shutdown = true;
+ DestroyAndReopen(options);
+
+ Options options1 = CurrentOptions();
+ options1.env = env_;
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options1.comparator = &test_cmp;
+ ColumnFamilyHandle* handle = nullptr;
+ Status s = db_->CreateColumnFamily(options1, "data", &handle);
+ ASSERT_OK(s);
+
+ WriteBatch wb;
+ ASSERT_OK(wb.Put("a", "value"));
+ ASSERT_OK(wb.Put(handle, "a", "value"));
+ {
+ std::string ts = Timestamp(1, 0);
+ const auto ts_sz_func = [kTimestampSize, handle](uint32_t cf_id) {
+ assert(handle);
+ if (cf_id == 0) {
+ return static_cast<size_t>(0);
+ } else if (cf_id == handle->GetID()) {
+ return kTimestampSize;
+ } else {
+ assert(false);
+ return std::numeric_limits<size_t>::max();
+ }
+ };
+ ASSERT_OK(wb.UpdateTimestamps(ts, ts_sz_func));
+ ASSERT_OK(db_->Write(WriteOptions(), &wb));
+ }
+
+ const auto verify_db = [this](ColumnFamilyHandle* h, const std::string& key,
+ const std::string& ts,
+ const std::string& expected_value) {
+ ASSERT_EQ(expected_value, Get(key));
+ Slice read_ts_slice(ts);
+ ReadOptions read_opts;
+ read_opts.timestamp = &read_ts_slice;
+ std::string value;
+ ASSERT_OK(db_->Get(read_opts, h, key, &value));
+ ASSERT_EQ(expected_value, value);
+ };
+
+ verify_db(handle, "a", Timestamp(1, 0), "value");
+
+ delete handle;
+ Close();
+
+ std::vector<ColumnFamilyDescriptor> cf_descs;
+ cf_descs.emplace_back(kDefaultColumnFamilyName, options);
+ cf_descs.emplace_back("data", options1);
+ options.create_if_missing = false;
+ s = DB::Open(options, dbname_, cf_descs, &handles_, &db_);
+ ASSERT_OK(s);
+
+ verify_db(handles_[1], "a", Timestamp(1, 0), "value");
+
+ Close();
+}
+
+TEST_F(DBBasicTestWithTimestamp, CompactRangeWithSpecifiedRange) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ DestroyAndReopen(options);
+
+ WriteOptions write_opts;
+ std::string ts = Timestamp(1, 0);
+
+ ASSERT_OK(db_->Put(write_opts, "foo1", ts, "bar"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(db_->Put(write_opts, "foo2", ts, "bar"));
+ ASSERT_OK(Flush());
+
+ std::string start_str = "foo";
+ std::string end_str = "foo2";
+ Slice start(start_str), end(end_str);
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &start, &end));
+
+ Close();
+}
+
+TEST_F(DBBasicTestWithTimestamp, GcPreserveLatestVersionBelowFullHistoryLow) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ DestroyAndReopen(options);
+
+ std::string ts_str = Timestamp(1, 0);
+ WriteOptions wopts;
+ ASSERT_OK(db_->Put(wopts, "k1", ts_str, "v1"));
+ ASSERT_OK(db_->Put(wopts, "k2", ts_str, "v2"));
+ ASSERT_OK(db_->Put(wopts, "k3", ts_str, "v3"));
+
+ ts_str = Timestamp(2, 0);
+ ASSERT_OK(db_->Delete(wopts, "k3", ts_str));
+
+ ts_str = Timestamp(4, 0);
+ ASSERT_OK(db_->Put(wopts, "k1", ts_str, "v5"));
+
+ ts_str = Timestamp(5, 0);
+ ASSERT_OK(
+ db_->DeleteRange(wopts, db_->DefaultColumnFamily(), "k0", "k9", ts_str));
+
+ ts_str = Timestamp(3, 0);
+ Slice ts = ts_str;
+ CompactRangeOptions cro;
+ cro.full_history_ts_low = &ts;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+ ASSERT_OK(Flush());
+
+ ReadOptions ropts;
+ ropts.timestamp = &ts;
+ std::string value;
+ Status s = db_->Get(ropts, "k1", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ("v1", value);
+
+ std::string key_ts;
+ ASSERT_TRUE(db_->Get(ropts, "k3", &value, &key_ts).IsNotFound());
+ ASSERT_EQ(Timestamp(2, 0), key_ts);
+
+ ts_str = Timestamp(5, 0);
+ ts = ts_str;
+ ropts.timestamp = &ts;
+ ASSERT_TRUE(db_->Get(ropts, "k2", &value, &key_ts).IsNotFound());
+ ASSERT_EQ(Timestamp(5, 0), key_ts);
+ ASSERT_TRUE(db_->Get(ropts, "k2", &value).IsNotFound());
+
+ Close();
+}
+
+TEST_F(DBBasicTestWithTimestamp, UpdateFullHistoryTsLow) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ DestroyAndReopen(options);
+
+ const std::string kKey = "test kKey";
+
+ // Test setting ts_low first and then Flush()
+ int current_ts_low = 5;
+ std::string ts_low_str = Timestamp(current_ts_low, 0);
+ Slice ts_low = ts_low_str;
+ CompactRangeOptions comp_opts;
+ comp_opts.full_history_ts_low = &ts_low;
+ comp_opts.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+
+ ASSERT_OK(db_->CompactRange(comp_opts, nullptr, nullptr));
+
+ auto* cfd =
+ static_cast_with_check<ColumnFamilyHandleImpl>(db_->DefaultColumnFamily())
+ ->cfd();
+ auto result_ts_low = cfd->GetFullHistoryTsLow();
+
+ ASSERT_TRUE(test_cmp.CompareTimestamp(ts_low, result_ts_low) == 0);
+
+ for (int i = 0; i < 10; i++) {
+ WriteOptions write_opts;
+ std::string ts = Timestamp(i, 0);
+ ASSERT_OK(db_->Put(write_opts, kKey, ts, Key(i)));
+ }
+ ASSERT_OK(Flush());
+
+ for (int i = 0; i < 10; i++) {
+ ReadOptions read_opts;
+ std::string ts_str = Timestamp(i, 0);
+ Slice ts = ts_str;
+ read_opts.timestamp = &ts;
+ std::string value;
+ Status status = db_->Get(read_opts, kKey, &value);
+ if (i < current_ts_low) {
+ ASSERT_TRUE(status.IsInvalidArgument());
+ } else {
+ ASSERT_OK(status);
+ ASSERT_TRUE(value.compare(Key(i)) == 0);
+ }
+ }
+
+ // Test setting ts_low and then triggering compaction
+ for (int i = 10; i < 20; i++) {
+ WriteOptions write_opts;
+ std::string ts = Timestamp(i, 0);
+ ASSERT_OK(db_->Put(write_opts, kKey, ts, Key(i)));
+ }
+
+ ASSERT_OK(Flush());
+
+ current_ts_low = 15;
+ ts_low_str = Timestamp(current_ts_low, 0);
+ ts_low = ts_low_str;
+ comp_opts.full_history_ts_low = &ts_low;
+ ASSERT_OK(db_->CompactRange(comp_opts, nullptr, nullptr));
+ result_ts_low = cfd->GetFullHistoryTsLow();
+ ASSERT_TRUE(test_cmp.CompareTimestamp(ts_low, result_ts_low) == 0);
+
+ for (int i = current_ts_low; i < 20; i++) {
+ ReadOptions read_opts;
+ std::string ts_str = Timestamp(i, 0);
+ Slice ts = ts_str;
+ read_opts.timestamp = &ts;
+ std::string value;
+ Status status = db_->Get(read_opts, kKey, &value);
+ ASSERT_OK(status);
+ ASSERT_TRUE(value.compare(Key(i)) == 0);
+ }
+
+ // Test invalid compaction with range
+ Slice start(kKey), end(kKey);
+ Status s = db_->CompactRange(comp_opts, &start, &end);
+ ASSERT_TRUE(s.IsInvalidArgument());
+ s = db_->CompactRange(comp_opts, &start, nullptr);
+ ASSERT_TRUE(s.IsInvalidArgument());
+ s = db_->CompactRange(comp_opts, nullptr, &end);
+ ASSERT_TRUE(s.IsInvalidArgument());
+
+ // Test invalid compaction with a decreasing ts_low
+ ts_low_str = Timestamp(current_ts_low - 1, 0);
+ ts_low = ts_low_str;
+ comp_opts.full_history_ts_low = &ts_low;
+ s = db_->CompactRange(comp_opts, nullptr, nullptr);
+ ASSERT_TRUE(s.IsInvalidArgument());
+
+ Close();
+}
+
+TEST_F(DBBasicTestWithTimestamp, UpdateFullHistoryTsLowWithPublicAPI) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ DestroyAndReopen(options);
+ std::string ts_low_str = Timestamp(9, 0);
+ ASSERT_OK(
+ db_->IncreaseFullHistoryTsLow(db_->DefaultColumnFamily(), ts_low_str));
+ std::string result_ts_low;
+ ASSERT_OK(db_->GetFullHistoryTsLow(nullptr, &result_ts_low));
+ ASSERT_TRUE(test_cmp.CompareTimestamp(ts_low_str, result_ts_low) == 0);
+ // Test moving full_history_ts_low backward
+ std::string ts_low_str_back = Timestamp(8, 0);
+ auto s = db_->IncreaseFullHistoryTsLow(db_->DefaultColumnFamily(),
+ ts_low_str_back);
+ ASSERT_EQ(s, Status::InvalidArgument());
+ // Test IncreaseFullHistoryTsLow with a timestamp whose length is longer
+ // than the cf's timestamp size
+ std::string ts_low_str_long(Timestamp(0, 0).size() + 1, 'a');
+ s = db_->IncreaseFullHistoryTsLow(db_->DefaultColumnFamily(),
+ ts_low_str_long);
+ ASSERT_EQ(s, Status::InvalidArgument());
+ // Test IncreaseFullHistoryTsLow with an empty timestamp
+ std::string ts_low_str_null = "";
+ s = db_->IncreaseFullHistoryTsLow(db_->DefaultColumnFamily(),
+ ts_low_str_null);
+ ASSERT_EQ(s, Status::InvalidArgument());
+ // Test IncreaseFullHistoryTsLow for a column family that does not enable
+ // user-defined timestamps
+ options.comparator = BytewiseComparator();
+ DestroyAndReopen(options);
+ ts_low_str = Timestamp(10, 0);
+ s = db_->IncreaseFullHistoryTsLow(db_->DefaultColumnFamily(), ts_low_str);
+ ASSERT_EQ(s, Status::InvalidArgument());
+ // Test GetFullHistoryTsLow for a column family that does not enable
+ // user-defined timestamps
+ std::string current_ts_low;
+ s = db_->GetFullHistoryTsLow(db_->DefaultColumnFamily(), &current_ts_low);
+ ASSERT_EQ(s, Status::InvalidArgument());
+ Close();
+}
+
+TEST_F(DBBasicTestWithTimestamp, GetApproximateSizes) {
+ Options options = CurrentOptions();
+ options.write_buffer_size = 100000000; // Large write buffer
+ options.compression = kNoCompression;
+ options.create_if_missing = true;
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ DestroyAndReopen(options);
+ auto default_cf = db_->DefaultColumnFamily();
+
+ WriteOptions write_opts;
+ std::string ts = Timestamp(1, 0);
+
+ const int N = 128;
+ Random rnd(301);
+ for (int i = 0; i < N; i++) {
+ ASSERT_OK(db_->Put(write_opts, Key(i), ts, rnd.RandomString(1024)));
+ }
+
+ uint64_t size;
+ std::string start = Key(50);
+ std::string end = Key(60);
+ Range r(start, end);
+ SizeApproximationOptions size_approx_options;
+ size_approx_options.include_memtables = true;
+ size_approx_options.include_files = true;
+ ASSERT_OK(
+ db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size));
+ ASSERT_GT(size, 6000);
+ ASSERT_LT(size, 204800);
+
+ // test multiple ranges
+ std::vector<Range> ranges;
+ std::string start_tmp = Key(10);
+ std::string end_tmp = Key(20);
+ ranges.emplace_back(Range(start_tmp, end_tmp));
+ ranges.emplace_back(Range(start, end));
+ uint64_t range_sizes[2];
+ ASSERT_OK(db_->GetApproximateSizes(size_approx_options, default_cf,
+ ranges.data(), 2, range_sizes));
+
+ ASSERT_EQ(range_sizes[1], size);
+
+ // Zero if not including mem table
+ ASSERT_OK(db_->GetApproximateSizes(&r, 1, &size));
+ ASSERT_EQ(size, 0);
+
+ start = Key(500);
+ end = Key(600);
+ r = Range(start, end);
+ ASSERT_OK(
+ db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size));
+ ASSERT_EQ(size, 0);
+
+ // Test range boundaries
+ ASSERT_OK(db_->Put(write_opts, Key(1000), ts, rnd.RandomString(1024)));
+ // Should include start key
+ start = Key(1000);
+ end = Key(1100);
+ r = Range(start, end);
+ ASSERT_OK(
+ db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size));
+ ASSERT_GT(size, 0);
+
+ // Should exclude end key
+ start = Key(900);
+ end = Key(1000);
+ r = Range(start, end);
+ ASSERT_OK(
+ db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size));
+ ASSERT_EQ(size, 0);
+
+ Close();
+}
+
+TEST_F(DBBasicTestWithTimestamp, SimpleIterate) {
+ const int kNumKeysPerFile = 128;
+ const uint64_t kMaxKey = 1024;
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(kNumKeysPerFile));
+ DestroyAndReopen(options);
+ const std::vector<uint64_t> start_keys = {1, 0};
+ const std::vector<std::string> write_timestamps = {Timestamp(1, 0),
+ Timestamp(3, 0)};
+ const std::vector<std::string> read_timestamps = {Timestamp(2, 0),
+ Timestamp(4, 0)};
+ for (size_t i = 0; i < write_timestamps.size(); ++i) {
+ WriteOptions write_opts;
+ for (uint64_t key = start_keys[i]; key <= kMaxKey; ++key) {
+ Status s = db_->Put(write_opts, Key1(key), write_timestamps[i],
+ "value" + std::to_string(i));
+ ASSERT_OK(s);
+ }
+ }
+ for (size_t i = 0; i < read_timestamps.size(); ++i) {
+ ReadOptions read_opts;
+ Slice read_ts = read_timestamps[i];
+ read_opts.timestamp = &read_ts;
+ std::unique_ptr<Iterator> it(db_->NewIterator(read_opts));
+ int count = 0;
+ uint64_t key = 0;
+ // Forward iterate.
+ for (it->Seek(Key1(0)), key = start_keys[i]; it->Valid();
+ it->Next(), ++count, ++key) {
+ CheckIterUserEntry(it.get(), Key1(key), kTypeValue,
+ "value" + std::to_string(i), write_timestamps[i]);
+ }
+ size_t expected_count = kMaxKey - start_keys[i] + 1;
+ ASSERT_EQ(expected_count, count);
+
+ // Backward iterate.
+ count = 0;
+ for (it->SeekForPrev(Key1(kMaxKey)), key = kMaxKey; it->Valid();
+ it->Prev(), ++count, --key) {
+ CheckIterUserEntry(it.get(), Key1(key), kTypeValue,
+ "value" + std::to_string(i), write_timestamps[i]);
+ }
+ ASSERT_EQ(static_cast<size_t>(kMaxKey) - start_keys[i] + 1, count);
+
+ // SeekToFirst()/SeekToLast() with lower/upper bounds.
+ // Then iterate with lower and upper bounds.
+ uint64_t l = 0;
+ uint64_t r = kMaxKey + 1;
+ while (l < r) {
+ std::string lb_str = Key1(l);
+ Slice lb = lb_str;
+ std::string ub_str = Key1(r);
+ Slice ub = ub_str;
+ read_opts.iterate_lower_bound = &lb;
+ read_opts.iterate_upper_bound = &ub;
+ it.reset(db_->NewIterator(read_opts));
+ for (it->SeekToFirst(), key = std::max(l, start_keys[i]), count = 0;
+ it->Valid(); it->Next(), ++key, ++count) {
+ CheckIterUserEntry(it.get(), Key1(key), kTypeValue,
+ "value" + std::to_string(i), write_timestamps[i]);
+ }
+ ASSERT_EQ(r - std::max(l, start_keys[i]), count);
+
+ for (it->SeekToLast(), key = std::min(r, kMaxKey + 1), count = 0;
+ it->Valid(); it->Prev(), --key, ++count) {
+ CheckIterUserEntry(it.get(), Key1(key - 1), kTypeValue,
+ "value" + std::to_string(i), write_timestamps[i]);
+ }
+ l += (kMaxKey / 100);
+ r -= (kMaxKey / 100);
+ }
+ }
+ Close();
+}
+
+TEST_F(DBBasicTestWithTimestamp, TrimHistoryTest) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ DestroyAndReopen(options);
+ auto check_value_by_ts = [](DB* db, Slice key, std::string readTs,
+ Status status, std::string checkValue,
+ std::string expected_ts) {
+ ReadOptions ropts;
+ Slice ts = readTs;
+ ropts.timestamp = &ts;
+ std::string value;
+ std::string key_ts;
+ Status s = db->Get(ropts, key, &value, &key_ts);
+ ASSERT_TRUE(s == status);
+ if (s.ok()) {
+ ASSERT_EQ(checkValue, value);
+ }
+ if (s.ok() || s.IsNotFound()) {
+ ASSERT_EQ(expected_ts, key_ts);
+ }
+ };
+ // Construct data of different versions with different ts
+ ASSERT_OK(db_->Put(WriteOptions(), "k1", Timestamp(2, 0), "v1"));
+ ASSERT_OK(db_->Put(WriteOptions(), "k1", Timestamp(4, 0), "v2"));
+ ASSERT_OK(db_->Delete(WriteOptions(), "k1", Timestamp(5, 0)));
+ ASSERT_OK(db_->Put(WriteOptions(), "k1", Timestamp(6, 0), "v3"));
+ check_value_by_ts(db_, "k1", Timestamp(7, 0), Status::OK(), "v3",
+ Timestamp(6, 0));
+ ASSERT_OK(Flush());
+ Close();
+
+ ColumnFamilyOptions cf_options(options);
+ std::vector<ColumnFamilyDescriptor> column_families;
+ column_families.push_back(
+ ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options));
+ DBOptions db_options(options);
+
+ // Trim data whose version > Timestamp(5, 0), read(k1, ts(7)) <- NOT_FOUND.
+ ASSERT_OK(DB::OpenAndTrimHistory(db_options, dbname_, column_families,
+ &handles_, &db_, Timestamp(5, 0)));
+ check_value_by_ts(db_, "k1", Timestamp(7, 0), Status::NotFound(), "",
+ Timestamp(5, 0));
+ Close();
+
+ // Trim data whose timestamp > Timestamp(4, 0), read(k1, ts(7)) <- v2
+ ASSERT_OK(DB::OpenAndTrimHistory(db_options, dbname_, column_families,
+ &handles_, &db_, Timestamp(4, 0)));
+ check_value_by_ts(db_, "k1", Timestamp(7, 0), Status::OK(), "v2",
+ Timestamp(4, 0));
+ Close();
+
+ Reopen(options);
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "k1",
+ "k3", Timestamp(7, 0)));
+ check_value_by_ts(db_, "k1", Timestamp(8, 0), Status::NotFound(), "",
+ Timestamp(7, 0));
+ Close();
+ // Trim data whose timestamp > Timestamp(6, 0), read(k1, ts(8)) <- v2
+ ASSERT_OK(DB::OpenAndTrimHistory(db_options, dbname_, column_families,
+ &handles_, &db_, Timestamp(6, 0)));
+ check_value_by_ts(db_, "k1", Timestamp(8, 0), Status::OK(), "v2",
+ Timestamp(4, 0));
+ Close();
+}
+
+TEST_F(DBBasicTestWithTimestamp, OpenAndTrimHistoryInvalidOptionTest) {
+ Destroy(last_options_);
+
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+
+ ColumnFamilyOptions cf_options(options);
+ std::vector<ColumnFamilyDescriptor> column_families;
+ column_families.push_back(
+ ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options));
+ DBOptions db_options(options);
+
+ // OpenAndTrimHistory should not work with avoid_flush_during_recovery
+ db_options.avoid_flush_during_recovery = true;
+ ASSERT_TRUE(DB::OpenAndTrimHistory(db_options, dbname_, column_families,
+ &handles_, &db_, Timestamp(0, 0))
+ .IsInvalidArgument());
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBBasicTestWithTimestamp, GetTimestampTableProperties) {
+ Options options = CurrentOptions();
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ DestroyAndReopen(options);
+ // Create 2 tables
+ for (int table = 0; table < 2; ++table) {
+ for (int i = 0; i < 10; i++) {
+ std::string ts = Timestamp(i, 0);
+ ASSERT_OK(db_->Put(WriteOptions(), "key", ts, Key(i)));
+ }
+ ASSERT_OK(Flush());
+ }
+
+ TablePropertiesCollection props;
+ ASSERT_OK(db_->GetPropertiesOfAllTables(&props));
+ ASSERT_EQ(2U, props.size());
+ for (const auto& item : props) {
+ auto& user_collected = item.second->user_collected_properties;
+ ASSERT_TRUE(user_collected.find("rocksdb.timestamp_min") !=
+ user_collected.end());
+ ASSERT_TRUE(user_collected.find("rocksdb.timestamp_max") !=
+ user_collected.end());
+ ASSERT_EQ(user_collected.at("rocksdb.timestamp_min"), Timestamp(0, 0));
+ ASSERT_EQ(user_collected.at("rocksdb.timestamp_max"), Timestamp(9, 0));
+ }
+ Close();
+}
+#endif // !ROCKSDB_LITE
+
+class DBBasicTestWithTimestampTableOptions
+ : public DBBasicTestWithTimestampBase,
+ public testing::WithParamInterface<BlockBasedTableOptions::IndexType> {
+ public:
+ explicit DBBasicTestWithTimestampTableOptions()
+ : DBBasicTestWithTimestampBase(
+ "db_basic_test_with_timestamp_table_options") {}
+};
+
+INSTANTIATE_TEST_CASE_P(
+ Timestamp, DBBasicTestWithTimestampTableOptions,
+ testing::Values(
+ BlockBasedTableOptions::IndexType::kBinarySearch,
+ BlockBasedTableOptions::IndexType::kHashSearch,
+ BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch,
+ BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey));
+
+TEST_P(DBBasicTestWithTimestampTableOptions, GetAndMultiGet) {
+ Options options = GetDefaultOptions();
+ options.create_if_missing = true;
+ options.prefix_extractor.reset(NewFixedPrefixTransform(3));
+ options.compression = kNoCompression;
+ BlockBasedTableOptions bbto;
+ bbto.index_type = GetParam();
+ bbto.block_size = 100;
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator cmp(kTimestampSize);
+ options.comparator = &cmp;
+ DestroyAndReopen(options);
+ constexpr uint64_t kNumKeys = 1024;
+ for (uint64_t k = 0; k < kNumKeys; ++k) {
+ WriteOptions write_opts;
+ ASSERT_OK(db_->Put(write_opts, Key1(k), Timestamp(1, 0),
+ "value" + std::to_string(k)));
+ }
+ ASSERT_OK(Flush());
+ {
+ ReadOptions read_opts;
+ read_opts.total_order_seek = true;
+ std::string ts_str = Timestamp(2, 0);
+ Slice ts = ts_str;
+ read_opts.timestamp = &ts;
+ std::unique_ptr<Iterator> it(db_->NewIterator(read_opts));
+ // verify Get()
+ for (it->SeekToFirst(); it->Valid(); it->Next()) {
+ std::string value_from_get;
+ std::string key_str(it->key().data(), it->key().size());
+ std::string timestamp;
+ ASSERT_OK(db_->Get(read_opts, key_str, &value_from_get, &timestamp));
+ ASSERT_EQ(it->value(), value_from_get);
+ ASSERT_EQ(Timestamp(1, 0), timestamp);
+ }
+
+ // verify MultiGet()
+ constexpr uint64_t step = 2;
+ static_assert(0 == (kNumKeys % step),
+ "kNumKeys must be a multiple of step");
+ for (uint64_t k = 0; k < kNumKeys; k += step) {
+ std::vector<std::string> key_strs;
+ std::vector<Slice> keys;
+ for (size_t i = 0; i < step; ++i) {
+ key_strs.push_back(Key1(k + i));
+ }
+ for (size_t i = 0; i < step; ++i) {
+ keys.emplace_back(key_strs[i]);
+ }
+ std::vector<std::string> values;
+ std::vector<std::string> timestamps;
+ std::vector<Status> statuses =
+ db_->MultiGet(read_opts, keys, &values, &timestamps);
+ ASSERT_EQ(step, statuses.size());
+ ASSERT_EQ(step, values.size());
+ ASSERT_EQ(step, timestamps.size());
+ for (uint64_t i = 0; i < step; ++i) {
+ ASSERT_OK(statuses[i]);
+ ASSERT_EQ("value" + std::to_string(k + i), values[i]);
+ ASSERT_EQ(Timestamp(1, 0), timestamps[i]);
+ }
+ }
+ }
+ Close();
+}
+
+TEST_P(DBBasicTestWithTimestampTableOptions, SeekWithPrefixLessThanKey) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ options.prefix_extractor.reset(NewFixedPrefixTransform(3));
+ options.memtable_whole_key_filtering = true;
+ options.memtable_prefix_bloom_size_ratio = 0.1;
+ BlockBasedTableOptions bbto;
+ bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
+ bbto.cache_index_and_filter_blocks = true;
+ bbto.whole_key_filtering = true;
+ bbto.index_type = GetParam();
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ DestroyAndReopen(options);
+
+ WriteOptions write_opts;
+ std::string ts = Timestamp(1, 0);
+
+ ASSERT_OK(db_->Put(write_opts, "foo1", ts, "bar"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(db_->Put(write_opts, "foo2", ts, "bar"));
+ ASSERT_OK(Flush());
+
+ // Move sst file to next level
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+ ASSERT_OK(db_->Put(write_opts, "foo3", ts, "bar"));
+ ASSERT_OK(Flush());
+
+ ReadOptions read_opts;
+ Slice read_ts = ts;
+ read_opts.timestamp = &read_ts;
+ {
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts));
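+ // The seek target "foo" equals the 3-byte prefix and sorts before every stored key,
+ // so the iterator should land on "foo1" and still be valid after one Next().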
+ iter->Seek("foo");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+
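+ // No stored key has prefix "bbb", so this prefix-mode seek finds nothing.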
+ iter->Seek("bbb");
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_OK(iter->status());
+ }
+
+ Close();
+}
+
+TEST_P(DBBasicTestWithTimestampTableOptions, SeekWithCappedPrefix) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ // All of the keys in this test must be longer than 3 characters
+ constexpr int kMinKeyLen = 3;
+ options.prefix_extractor.reset(NewCappedPrefixTransform(kMinKeyLen));
+ options.memtable_whole_key_filtering = true;
+ options.memtable_prefix_bloom_size_ratio = 0.1;
+ BlockBasedTableOptions bbto;
+ bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
+ bbto.cache_index_and_filter_blocks = true;
+ bbto.whole_key_filtering = true;
+ bbto.index_type = GetParam();
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ DestroyAndReopen(options);
+
+ WriteOptions write_opts;
+ std::string ts = Timestamp(1, 0);
+
+ ASSERT_OK(db_->Put(write_opts, "foo1", ts, "bar"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(db_->Put(write_opts, "foo2", ts, "bar"));
+ ASSERT_OK(Flush());
+
+ // Move sst file to next level
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+ ASSERT_OK(db_->Put(write_opts, "foo3", ts, "bar"));
+ ASSERT_OK(Flush());
+
+ ReadOptions read_opts;
+ ts = Timestamp(2, 0);
+ Slice read_ts = ts;
+ read_opts.timestamp = &read_ts;
+ {
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts));
+ // Make sure the prefix extractor doesn't include the timestamp; otherwise it
+ // may return an invalid result.
+ iter->Seek("foo");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ }
+
+ Close();
+}
+
+TEST_P(DBBasicTestWithTimestampTableOptions, SeekWithBound) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ options.prefix_extractor.reset(NewFixedPrefixTransform(2));
+ BlockBasedTableOptions bbto;
+ bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
+ bbto.cache_index_and_filter_blocks = true;
+ bbto.whole_key_filtering = true;
+ bbto.index_type = GetParam();
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ DestroyAndReopen(options);
+
+ WriteOptions write_opts;
+ std::string ts = Timestamp(1, 0);
+
+ ASSERT_OK(db_->Put(write_opts, "foo1", ts, "bar1"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(db_->Put(write_opts, "foo2", ts, "bar2"));
+ ASSERT_OK(Flush());
+
+ // Move sst file to next level
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+ for (int i = 3; i < 9; ++i) {
+ ASSERT_OK(db_->Put(write_opts, "foo" + std::to_string(i), ts,
+ "bar" + std::to_string(i)));
+ }
+ ASSERT_OK(Flush());
+
+ ReadOptions read_opts;
+ ts = Timestamp(2, 0);
+ Slice read_ts = ts;
+ read_opts.timestamp = &read_ts;
+ std::string up_bound = "foo5"; // exclusive
+ Slice up_bound_slice = up_bound;
+ std::string lo_bound = "foo2"; // inclusive
+ Slice lo_bound_slice = lo_bound;
+ read_opts.iterate_upper_bound = &up_bound_slice;
+ read_opts.iterate_lower_bound = &lo_bound_slice;
+ read_opts.auto_prefix_mode = true;
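+ // With bounds ["foo2", "foo5"), forward positioning clamps to "foo2" and reverse
+ // positioning stops at "foo4".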
+ {
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts));
+ // Make sure the prefix extractor doesn't include the timestamp; otherwise it
+ // may return an invalid result.
+ iter->Seek("foo");
+ CheckIterUserEntry(iter.get(), lo_bound, kTypeValue, "bar2",
+ Timestamp(1, 0));
+ iter->SeekToFirst();
+ CheckIterUserEntry(iter.get(), lo_bound, kTypeValue, "bar2",
+ Timestamp(1, 0));
+ iter->SeekForPrev("g");
+ CheckIterUserEntry(iter.get(), "foo4", kTypeValue, "bar4", Timestamp(1, 0));
+ iter->SeekToLast();
+ CheckIterUserEntry(iter.get(), "foo4", kTypeValue, "bar4", Timestamp(1, 0));
+ }
+
+ Close();
+}
+
+TEST_F(DBBasicTestWithTimestamp, ChangeIterationDirection) {
+ Options options = GetDefaultOptions();
+ options.create_if_missing = true;
+ options.env = env_;
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ DestroyAndReopen(options);
+ const std::vector<std::string> timestamps = {Timestamp(1, 1), Timestamp(0, 2),
+ Timestamp(4, 3)};
+ const std::vector<std::tuple<std::string, std::string>> kvs = {
+ std::make_tuple("aa", "value1"), std::make_tuple("ab", "value2")};
+ for (const auto& ts : timestamps) {
+ WriteBatch wb(0, 0, 0, kTimestampSize);
+ for (const auto& kv : kvs) {
+ const std::string& key = std::get<0>(kv);
+ const std::string& value = std::get<1>(kv);
+ ASSERT_OK(wb.Put(key, value));
+ }
+
+ ASSERT_OK(wb.UpdateTimestamps(
+ ts, [kTimestampSize](uint32_t) { return kTimestampSize; }));
+ ASSERT_OK(db_->Write(WriteOptions(), &wb));
+ }
+ std::string read_ts_str = Timestamp(5, 3);
+ Slice read_ts = read_ts_str;
+ ReadOptions read_opts;
+ read_opts.timestamp = &read_ts;
+ std::unique_ptr<Iterator> it(db_->NewIterator(read_opts));
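+ // The direction changes below are expected to trigger internal iterator reseeks,
+ // which the test observes through the NUMBER_OF_RESEEKS_IN_ITERATION ticker.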
+
+ it->SeekToFirst();
+ ASSERT_TRUE(it->Valid());
+ it->Prev();
+ ASSERT_FALSE(it->Valid());
+
+ it->SeekToLast();
+ ASSERT_TRUE(it->Valid());
+ uint64_t prev_reseek_count =
+ options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION);
+ ASSERT_EQ(0, prev_reseek_count);
+ it->Next();
+ ASSERT_FALSE(it->Valid());
+ ASSERT_EQ(1 + prev_reseek_count,
+ options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION));
+
+ it->Seek(std::get<0>(kvs[0]));
+ CheckIterUserEntry(it.get(), std::get<0>(kvs[0]), kTypeValue,
+ std::get<1>(kvs[0]), Timestamp(4, 3));
+ it->Next();
+ CheckIterUserEntry(it.get(), std::get<0>(kvs[1]), kTypeValue,
+ std::get<1>(kvs[1]), Timestamp(4, 3));
+ it->Prev();
+ CheckIterUserEntry(it.get(), std::get<0>(kvs[0]), kTypeValue,
+ std::get<1>(kvs[0]), Timestamp(4, 3));
+
+ prev_reseek_count =
+ options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION);
+ ASSERT_EQ(1, prev_reseek_count);
+ it->Next();
+ CheckIterUserEntry(it.get(), std::get<0>(kvs[1]), kTypeValue,
+ std::get<1>(kvs[1]), Timestamp(4, 3));
+ ASSERT_EQ(1 + prev_reseek_count,
+ options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION));
+
+ it->SeekForPrev(std::get<0>(kvs[1]));
+ CheckIterUserEntry(it.get(), std::get<0>(kvs[1]), kTypeValue,
+ std::get<1>(kvs[1]), Timestamp(4, 3));
+ it->Prev();
+ CheckIterUserEntry(it.get(), std::get<0>(kvs[0]), kTypeValue,
+ std::get<1>(kvs[0]), Timestamp(4, 3));
+
+ prev_reseek_count =
+ options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION);
+ it->Next();
+ CheckIterUserEntry(it.get(), std::get<0>(kvs[1]), kTypeValue,
+ std::get<1>(kvs[1]), Timestamp(4, 3));
+ ASSERT_EQ(1 + prev_reseek_count,
+ options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION));
+
+ it.reset();
+ Close();
+}
+
+TEST_F(DBBasicTestWithTimestamp, SimpleForwardIterateLowerTsBound) {
+ constexpr int kNumKeysPerFile = 128;
+ constexpr uint64_t kMaxKey = 1024;
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(kNumKeysPerFile));
+ DestroyAndReopen(options);
+ const std::vector<std::string> write_timestamps = {Timestamp(1, 0),
+ Timestamp(3, 0)};
+ const std::vector<std::string> read_timestamps = {Timestamp(2, 0),
+ Timestamp(4, 0)};
+ const std::vector<std::string> read_timestamps_lb = {Timestamp(1, 0),
+ Timestamp(1, 0)};
+ for (size_t i = 0; i < write_timestamps.size(); ++i) {
+ WriteOptions write_opts;
+ for (uint64_t key = 0; key <= kMaxKey; ++key) {
+ Status s = db_->Put(write_opts, Key1(key), write_timestamps[i],
+ "value" + std::to_string(i));
+ ASSERT_OK(s);
+ }
+ }
+ for (size_t i = 0; i < read_timestamps.size(); ++i) {
+ ReadOptions read_opts;
+ Slice read_ts = read_timestamps[i];
+ Slice read_ts_lb = read_timestamps_lb[i];
+ read_opts.timestamp = &read_ts;
+ read_opts.iter_start_ts = &read_ts_lb;
+ std::unique_ptr<Iterator> it(db_->NewIterator(read_opts));
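+ // With iter_start_ts set, the iterator returns all versions in
+ // [iter_start_ts, read_ts] for each key, newest first; when both writes are visible
+ // the extra Next() below steps over the older one.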
+ int count = 0;
+ uint64_t key = 0;
+ for (it->Seek(Key1(0)), key = 0; it->Valid(); it->Next(), ++count, ++key) {
+ CheckIterEntry(it.get(), Key1(key), kTypeValue,
+ "value" + std::to_string(i), write_timestamps[i]);
+ if (i > 0) {
+ it->Next();
+ CheckIterEntry(it.get(), Key1(key), kTypeValue,
+ "value" + std::to_string(i - 1),
+ write_timestamps[i - 1]);
+ }
+ }
+ size_t expected_count = kMaxKey + 1;
+ ASSERT_EQ(expected_count, count);
+ }
+ // Delete all keys@ts=5 and check iteration result with start ts set
+ {
+ std::string write_timestamp = Timestamp(5, 0);
+ WriteOptions write_opts;
+ for (uint64_t key = 0; key < kMaxKey + 1; ++key) {
+ Status s = db_->Delete(write_opts, Key1(key), write_timestamp);
+ ASSERT_OK(s);
+ }
+
+ std::string read_timestamp = Timestamp(6, 0);
+ ReadOptions read_opts;
+ Slice read_ts = read_timestamp;
+ read_opts.timestamp = &read_ts;
+ std::string read_timestamp_lb = Timestamp(2, 0);
+ Slice read_ts_lb = read_timestamp_lb;
+ read_opts.iter_start_ts = &read_ts_lb;
+ std::unique_ptr<Iterator> it(db_->NewIterator(read_opts));
+ int count = 0;
+ uint64_t key = 0;
+ for (it->Seek(Key1(0)), key = 0; it->Valid(); it->Next(), ++count, ++key) {
+ CheckIterEntry(it.get(), Key1(key), kTypeDeletionWithTimestamp, Slice(),
+ write_timestamp);
+ // Skip this key's older version at ts=3 so the loop's Next() lands on the next
+ // key's tombstone at ts=5.
+ it->Next();
+ }
+ ASSERT_EQ(kMaxKey + 1, count);
+ }
+ Close();
+}
+
+TEST_F(DBBasicTestWithTimestamp, BackwardIterateLowerTsBound) {
+ constexpr int kNumKeysPerFile = 128;
+ constexpr uint64_t kMaxKey = 1024;
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(kNumKeysPerFile));
+ DestroyAndReopen(options);
+ const std::vector<std::string> write_timestamps = {Timestamp(1, 0),
+ Timestamp(3, 0)};
+ const std::vector<std::string> read_timestamps = {Timestamp(2, 0),
+ Timestamp(4, 0)};
+ const std::vector<std::string> read_timestamps_lb = {Timestamp(1, 0),
+ Timestamp(1, 0)};
+ for (size_t i = 0; i < write_timestamps.size(); ++i) {
+ WriteOptions write_opts;
+ for (uint64_t key = 0; key <= kMaxKey; ++key) {
+ Status s = db_->Put(write_opts, Key1(key), write_timestamps[i],
+ "value" + std::to_string(i));
+ ASSERT_OK(s);
+ }
+ }
+ for (size_t i = 0; i < read_timestamps.size(); ++i) {
+ ReadOptions read_opts;
+ Slice read_ts = read_timestamps[i];
+ Slice read_ts_lb = read_timestamps_lb[i];
+ read_opts.timestamp = &read_ts;
+ read_opts.iter_start_ts = &read_ts_lb;
+ std::unique_ptr<Iterator> it(db_->NewIterator(read_opts));
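+ // In reverse iteration with iter_start_ts, the versions of a key surface oldest
+ // first, so the extra Prev() steps over the newer write when both are visible.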
+ int count = 0;
+ uint64_t key = 0;
+ for (it->SeekForPrev(Key1(kMaxKey)), key = kMaxKey; it->Valid();
+ it->Prev(), ++count, --key) {
+ CheckIterEntry(it.get(), Key1(key), kTypeValue, "value0",
+ write_timestamps[0]);
+ if (i > 0) {
+ it->Prev();
+ CheckIterEntry(it.get(), Key1(key), kTypeValue, "value1",
+ write_timestamps[1]);
+ }
+ }
+ size_t expected_count = kMaxKey + 1;
+ ASSERT_EQ(expected_count, count);
+ }
+ // Delete all keys@ts=5 and check iteration result with start ts set
+ {
+ std::string write_timestamp = Timestamp(5, 0);
+ WriteOptions write_opts;
+ for (uint64_t key = 0; key < kMaxKey + 1; ++key) {
+ Status s = db_->Delete(write_opts, Key1(key), write_timestamp);
+ ASSERT_OK(s);
+ }
+
+ std::string read_timestamp = Timestamp(6, 0);
+ ReadOptions read_opts;
+ Slice read_ts = read_timestamp;
+ read_opts.timestamp = &read_ts;
+ std::string read_timestamp_lb = Timestamp(2, 0);
+ Slice read_ts_lb = read_timestamp_lb;
+ read_opts.iter_start_ts = &read_ts_lb;
+ std::unique_ptr<Iterator> it(db_->NewIterator(read_opts));
+ int count = 0;
+ uint64_t key = kMaxKey;
+ for (it->SeekForPrev(Key1(key)), key = kMaxKey; it->Valid();
+ it->Prev(), ++count, --key) {
+ CheckIterEntry(it.get(), Key1(key), kTypeValue, "value1",
+ Timestamp(3, 0));
+ it->Prev();
+ CheckIterEntry(it.get(), Key1(key), kTypeDeletionWithTimestamp, Slice(),
+ write_timestamp);
+ }
+ ASSERT_EQ(kMaxKey + 1, count);
+ }
+ Close();
+}
+
+TEST_F(DBBasicTestWithTimestamp, SimpleBackwardIterateLowerTsBound) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ DestroyAndReopen(options);
+
+ std::string ts_ub_buf = Timestamp(5, 0);
+ Slice ts_ub = ts_ub_buf;
+ std::string ts_lb_buf = Timestamp(1, 0);
+ Slice ts_lb = ts_lb_buf;
+
+ {
+ ReadOptions read_opts;
+ read_opts.timestamp = &ts_ub;
+ read_opts.iter_start_ts = &ts_lb;
+ std::unique_ptr<Iterator> it(db_->NewIterator(read_opts));
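+ // Nothing has been written yet, so both reverse seeks should find nothing.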
+ it->SeekToLast();
+ ASSERT_FALSE(it->Valid());
+ ASSERT_OK(it->status());
+
+ it->SeekForPrev("foo");
+ ASSERT_FALSE(it->Valid());
+ ASSERT_OK(it->status());
+ }
+
+ // Test iterate_upper_bound
+ ASSERT_OK(db_->Put(WriteOptions(), "a", Timestamp(0, 0), "v0"));
+ ASSERT_OK(db_->SingleDelete(WriteOptions(), "a", Timestamp(1, 0)));
+
+ for (int i = 0; i < 5; ++i) {
+ ASSERT_OK(db_->Put(WriteOptions(), "b", Timestamp(i, 0),
+ "v" + std::to_string(i)));
+ }
+
+ {
+ ReadOptions read_opts;
+ read_opts.timestamp = &ts_ub;
+ read_opts.iter_start_ts = &ts_lb;
+ std::string key_ub_str = "b"; // exclusive
+ Slice key_ub = key_ub_str;
+ read_opts.iterate_upper_bound = &key_ub;
+ std::unique_ptr<Iterator> it(db_->NewIterator(read_opts));
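+ // With upper bound "b" (exclusive), SeekToLast lands on "a", whose only entry in
+ // [iter_start_ts, read_ts] is the SingleDelete tombstone at ts=1.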
+ it->SeekToLast();
+ CheckIterEntry(it.get(), "a", kTypeSingleDeletion, Slice(),
+ Timestamp(1, 0));
+
+ key_ub_str = "a"; // exclusive
+ key_ub = key_ub_str;
+ read_opts.iterate_upper_bound = &key_ub;
+ it.reset(db_->NewIterator(read_opts));
+ it->SeekToLast();
+ ASSERT_FALSE(it->Valid());
+ ASSERT_OK(it->status());
+ }
+
+ Close();
+}
+
+TEST_F(DBBasicTestWithTimestamp, BackwardIterateLowerTsBound_Reseek) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ options.max_sequential_skip_in_iterations = 2;
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ DestroyAndReopen(options);
+
+ for (int i = 0; i < 10; ++i) {
+ ASSERT_OK(db_->Put(WriteOptions(), "a", Timestamp(i, 0),
+ "v" + std::to_string(i)));
+ }
+
+ for (int i = 0; i < 10; ++i) {
+ ASSERT_OK(db_->Put(WriteOptions(), "b", Timestamp(i, 0),
+ "v" + std::to_string(i)));
+ }
+
+ {
+ std::string ts_ub_buf = Timestamp(6, 0);
+ Slice ts_ub = ts_ub_buf;
+ std::string ts_lb_buf = Timestamp(4, 0);
+ Slice ts_lb = ts_lb_buf;
+
+ ReadOptions read_opts;
+ read_opts.timestamp = &ts_ub;
+ read_opts.iter_start_ts = &ts_lb;
+ std::unique_ptr<Iterator> it(db_->NewIterator(read_opts));
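+ // The small max_sequential_skip_in_iterations (2) is meant to force internal
+ // reseeks while stepping through the visible versions (ts=4..6) of each key.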
+ it->SeekToLast();
+ for (int i = 0; i < 3 && it->Valid(); it->Prev(), ++i) {
+ CheckIterEntry(it.get(), "b", kTypeValue, "v" + std::to_string(4 + i),
+ Timestamp(4 + i, 0));
+ }
+ for (int i = 0; i < 3 && it->Valid(); it->Prev(), ++i) {
+ CheckIterEntry(it.get(), "a", kTypeValue, "v" + std::to_string(4 + i),
+ Timestamp(4 + i, 0));
+ }
+ }
+
+ Close();
+}
+
+TEST_F(DBBasicTestWithTimestamp, ReseekToTargetTimestamp) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ constexpr size_t kNumKeys = 16;
+ options.max_sequential_skip_in_iterations = kNumKeys / 2;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ DestroyAndReopen(options);
+ // Insert kNumKeys versions of "foo"
+ WriteOptions write_opts;
+ Status s;
+ for (size_t i = 0; i != kNumKeys; ++i) {
+ std::string ts = Timestamp(static_cast<uint64_t>(i + 1), 0);
+ s = db_->Put(write_opts, "foo", ts, "value" + std::to_string(i));
+ ASSERT_OK(s);
+ }
+ {
+ ReadOptions read_opts;
+ std::string ts_str = Timestamp(1, 0);
+ Slice ts = ts_str;
+ read_opts.timestamp = &ts;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts));
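+ // Reading at ts=1 must skip the 15 newer versions of "foo", which exceeds
+ // max_sequential_skip_in_iterations and triggers the reseek counted below.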
+ iter->SeekToFirst();
+ CheckIterUserEntry(iter.get(), "foo", kTypeValue, "value0", ts_str);
+ ASSERT_EQ(
+ 1, options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION));
+
+ ts_str = Timestamp(kNumKeys, 0);
+ ts = ts_str;
+ read_opts.timestamp = &ts;
+ iter.reset(db_->NewIterator(read_opts));
+ iter->SeekToLast();
+ CheckIterUserEntry(iter.get(), "foo", kTypeValue,
+ "value" + std::to_string(kNumKeys - 1), ts_str);
+ ASSERT_EQ(
+ 2, options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION));
+ }
+ Close();
+}
+
+TEST_F(DBBasicTestWithTimestamp, ReseekToNextUserKey) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ constexpr size_t kNumKeys = 16;
+ options.max_sequential_skip_in_iterations = kNumKeys / 2;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ DestroyAndReopen(options);
+ // Write kNumKeys versions of "a", then one more version of "a" and a new key "b"
+ // in a single batch.
+ WriteOptions write_opts;
+ Status s;
+ for (size_t i = 0; i != kNumKeys; ++i) {
+ std::string ts = Timestamp(static_cast<uint64_t>(i + 1), 0);
+ s = db_->Put(write_opts, "a", ts, "value" + std::to_string(i));
+ ASSERT_OK(s);
+ }
+ {
+ std::string ts_str = Timestamp(static_cast<uint64_t>(kNumKeys + 1), 0);
+ WriteBatch batch(0, 0, 0, kTimestampSize);
+ { ASSERT_OK(batch.Put("a", "new_value")); }
+ { ASSERT_OK(batch.Put("b", "new_value")); }
+ s = batch.UpdateTimestamps(
+ ts_str, [kTimestampSize](uint32_t) { return kTimestampSize; });
+ ASSERT_OK(s);
+ s = db_->Write(write_opts, &batch);
+ ASSERT_OK(s);
+ }
+ {
+ ReadOptions read_opts;
+ std::string ts_str = Timestamp(static_cast<uint64_t>(kNumKeys + 1), 0);
+ Slice ts = ts_str;
+ read_opts.timestamp = &ts;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts));
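+ // Next() from "a" must step over its 16 older versions before reaching "b",
+ // exceeding max_sequential_skip_in_iterations and triggering the reseek counted below.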
+ iter->Seek("a");
+ iter->Next();
+ CheckIterUserEntry(iter.get(), "b", kTypeValue, "new_value", ts_str);
+ ASSERT_EQ(
+ 1, options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION));
+ }
+ Close();
+}
+
+TEST_F(DBBasicTestWithTimestamp, ReseekToUserKeyBeforeSavedKey) {
+ Options options = GetDefaultOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ constexpr size_t kNumKeys = 16;
+ options.max_sequential_skip_in_iterations = kNumKeys / 2;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ DestroyAndReopen(options);
+ for (size_t i = 0; i < kNumKeys; ++i) {
+ std::string ts = Timestamp(static_cast<uint64_t>(i + 1), 0);
+ WriteOptions write_opts;
+ Status s = db_->Put(write_opts, "b", ts, "value" + std::to_string(i));
+ ASSERT_OK(s);
+ }
+ {
+ std::string ts = Timestamp(1, 0);
+ WriteOptions write_opts;
+ ASSERT_OK(db_->Put(write_opts, "a", ts, "value"));
+ }
+ {
+ ReadOptions read_opts;
+ std::string ts_str = Timestamp(1, 0);
+ Slice ts = ts_str;
+ read_opts.timestamp = &ts;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts));
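+ // At read ts=1, Prev() from "b" must step back over its 15 newer (invisible)
+ // versions before reaching "a", exceeding max_sequential_skip_in_iterations and
+ // triggering a reseek.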
+ iter->SeekToLast();
+ iter->Prev();
+ CheckIterUserEntry(iter.get(), "a", kTypeValue, "value", ts_str);
+ ASSERT_EQ(
+ 1, options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION));
+ }
+ Close();
+}
+
+TEST_F(DBBasicTestWithTimestamp, MultiGetWithFastLocalBloom) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ BlockBasedTableOptions bbto;
+ bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
+ bbto.cache_index_and_filter_blocks = true;
+ bbto.whole_key_filtering = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ DestroyAndReopen(options);
+
+ // Write any value
+ WriteOptions write_opts;
+ std::string ts = Timestamp(1, 0);
+
+ ASSERT_OK(db_->Put(write_opts, "foo", ts, "bar"));
+
+ ASSERT_OK(Flush());
+
+ // Read with MultiGet
+ ReadOptions read_opts;
+ Slice read_ts = ts;
+ read_opts.timestamp = &read_ts;
+ size_t batch_size = 1;
+ std::vector<Slice> keys(batch_size);
+ std::vector<PinnableSlice> values(batch_size);
+ std::vector<Status> statuses(batch_size);
+ std::vector<std::string> timestamps(batch_size);
+ keys[0] = "foo";
+ ColumnFamilyHandle* cfh = db_->DefaultColumnFamily();
+ db_->MultiGet(read_opts, cfh, batch_size, keys.data(), values.data(),
+ timestamps.data(), statuses.data(), true);
+
+ ASSERT_OK(statuses[0]);
+ ASSERT_EQ(Timestamp(1, 0), timestamps[0]);
+ for (auto& elem : values) {
+ elem.Reset();
+ }
+
+ ASSERT_OK(db_->SingleDelete(WriteOptions(), "foo", Timestamp(2, 0)));
+ ts = Timestamp(3, 0);
+ read_ts = ts;
+ read_opts.timestamp = &read_ts;
+ db_->MultiGet(read_opts, cfh, batch_size, keys.data(), values.data(),
+ timestamps.data(), statuses.data(), true);
+ ASSERT_TRUE(statuses[0].IsNotFound());
+ ASSERT_EQ(Timestamp(2, 0), timestamps[0]);
+
+ Close();
+}
+
+TEST_P(DBBasicTestWithTimestampTableOptions, MultiGetWithPrefix) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ options.prefix_extractor.reset(NewCappedPrefixTransform(5));
+ BlockBasedTableOptions bbto;
+ bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
+ bbto.cache_index_and_filter_blocks = true;
+ bbto.whole_key_filtering = false;
+ bbto.index_type = GetParam();
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ DestroyAndReopen(options);
+
+ // Write any value
+ WriteOptions write_opts;
+ std::string ts = Timestamp(1, 0);
+
+ ASSERT_OK(db_->Put(write_opts, "foo", ts, "bar"));
+
+ ASSERT_OK(Flush());
+
+ // Read with MultiGet
+ ReadOptions read_opts;
+ Slice read_ts = ts;
+ read_opts.timestamp = &read_ts;
+ size_t batch_size = 1;
+ std::vector<Slice> keys(batch_size);
+ std::vector<PinnableSlice> values(batch_size);
+ std::vector<Status> statuses(batch_size);
+ std::vector<std::string> timestamps(batch_size);
+ keys[0] = "foo";
+ ColumnFamilyHandle* cfh = db_->DefaultColumnFamily();
+ db_->MultiGet(read_opts, cfh, batch_size, keys.data(), values.data(),
+ timestamps.data(), statuses.data(), true);
+
+ ASSERT_OK(statuses[0]);
+ ASSERT_EQ(Timestamp(1, 0), timestamps[0]);
+ for (auto& elem : values) {
+ elem.Reset();
+ }
+
+ ASSERT_OK(db_->SingleDelete(WriteOptions(), "foo", Timestamp(2, 0)));
+ // TODO: also Flush() for kHashSearch once its bug is fixed.
+ if (GetParam() != BlockBasedTableOptions::IndexType::kHashSearch) {
+ ASSERT_OK(Flush());
+ }
+
+ ts = Timestamp(3, 0);
+ read_ts = ts;
+ db_->MultiGet(read_opts, cfh, batch_size, keys.data(), values.data(),
+ timestamps.data(), statuses.data(), true);
+ ASSERT_TRUE(statuses[0].IsNotFound());
+ ASSERT_EQ(Timestamp(2, 0), timestamps[0]);
+
+ Close();
+}
+
+TEST_P(DBBasicTestWithTimestampTableOptions, MultiGetWithMemBloomFilter) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ options.prefix_extractor.reset(NewCappedPrefixTransform(5));
+ BlockBasedTableOptions bbto;
+ bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
+ bbto.cache_index_and_filter_blocks = true;
+ bbto.whole_key_filtering = false;
+ bbto.index_type = GetParam();
+ options.memtable_prefix_bloom_size_ratio = 0.1;
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ DestroyAndReopen(options);
+
+ // Write any value
+ WriteOptions write_opts;
+ std::string ts = Timestamp(1, 0);
+
+ ASSERT_OK(db_->Put(write_opts, "foo", ts, "bar"));
+
+ // Read with MultiGet
+ ts = Timestamp(2, 0);
+ Slice read_ts = ts;
+ ReadOptions read_opts;
+ read_opts.timestamp = &read_ts;
+ size_t batch_size = 1;
+ std::vector<Slice> keys(batch_size);
+ std::vector<PinnableSlice> values(batch_size);
+ std::vector<Status> statuses(batch_size);
+ keys[0] = "foo";
+ ColumnFamilyHandle* cfh = db_->DefaultColumnFamily();
+ db_->MultiGet(read_opts, cfh, batch_size, keys.data(), values.data(),
+ statuses.data());
+
+ ASSERT_OK(statuses[0]);
+ Close();
+}
+
+TEST_F(DBBasicTestWithTimestamp, MultiGetRangeFiltering) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ BlockBasedTableOptions bbto;
+ bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
+ bbto.cache_index_and_filter_blocks = true;
+ bbto.whole_key_filtering = false;
+ options.memtable_prefix_bloom_size_ratio = 0.1;
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ DestroyAndReopen(options);
+
+ // Write any value
+ WriteOptions write_opts;
+ std::string ts = Timestamp(1, 0);
+
+ // random data
+ for (int i = 0; i < 3; i++) {
+ auto key = std::to_string(i * 10);
+ auto value = std::to_string(i * 10);
+ Slice key_slice = key;
+ Slice value_slice = value;
+ ASSERT_OK(db_->Put(write_opts, key_slice, ts, value_slice));
+ ASSERT_OK(Flush());
+ }
+
+ // Compact the files to the next level so key-range filtering of SST files kicks in.
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+ ASSERT_OK(db_->Put(write_opts, "foo", ts, "bar"));
+
+ ASSERT_OK(Flush());
+
+ // Read with MultiGet
+ ts = Timestamp(2, 0);
+ Slice read_ts = ts;
+ ReadOptions read_opts;
+ read_opts.timestamp = &read_ts;
+ size_t batch_size = 1;
+ std::vector<Slice> keys(batch_size);
+ std::vector<PinnableSlice> values(batch_size);
+ std::vector<Status> statuses(batch_size);
+ keys[0] = "foo";
+ ColumnFamilyHandle* cfh = db_->DefaultColumnFamily();
+ db_->MultiGet(read_opts, cfh, batch_size, keys.data(), values.data(),
+ statuses.data());
+
+ ASSERT_OK(statuses[0]);
+ Close();
+}
+
+TEST_P(DBBasicTestWithTimestampTableOptions, MultiGetPrefixFilter) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ options.prefix_extractor.reset(NewCappedPrefixTransform(3));
+ BlockBasedTableOptions bbto;
+ bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
+ bbto.cache_index_and_filter_blocks = true;
+ bbto.whole_key_filtering = false;
+ bbto.index_type = GetParam();
+ options.memtable_prefix_bloom_size_ratio = 0.1;
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ DestroyAndReopen(options);
+
+ WriteOptions write_opts;
+ std::string ts = Timestamp(1, 0);
+
+ ASSERT_OK(db_->Put(write_opts, "foo", ts, "bar"));
+
+ ASSERT_OK(Flush());
+ // Read with MultiGet
+ ts = Timestamp(2, 0);
+ Slice read_ts = ts;
+ ReadOptions read_opts;
+ read_opts.timestamp = &read_ts;
+ size_t batch_size = 1;
+ std::vector<Slice> keys(batch_size);
+ std::vector<std::string> values(batch_size);
+ std::vector<std::string> timestamps(batch_size);
+ keys[0] = "foo";
+ ColumnFamilyHandle* cfh = db_->DefaultColumnFamily();
+ std::vector<ColumnFamilyHandle*> cfhs(keys.size(), cfh);
+ std::vector<Status> statuses =
+ db_->MultiGet(read_opts, cfhs, keys, &values, &timestamps);
+
+ ASSERT_OK(statuses[0]);
+ Close();
+}
+
+TEST_F(DBBasicTestWithTimestamp, MaxKeysSkippedDuringNext) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ DestroyAndReopen(options);
+ constexpr size_t max_skippable_internal_keys = 2;
+ const size_t kNumKeys = max_skippable_internal_keys + 2;
+ WriteOptions write_opts;
+ Status s;
+ {
+ std::string ts = Timestamp(1, 0);
+ ASSERT_OK(db_->Put(write_opts, "a", ts, "value"));
+ }
+ for (size_t i = 0; i < kNumKeys; ++i) {
+ std::string ts = Timestamp(static_cast<uint64_t>(i + 1), 0);
+ s = db_->Put(write_opts, "b", ts, "value" + std::to_string(i));
+ ASSERT_OK(s);
+ }
+ {
+ ReadOptions read_opts;
+ read_opts.max_skippable_internal_keys = max_skippable_internal_keys;
+ std::string ts_str = Timestamp(1, 0);
+ Slice ts = ts_str;
+ read_opts.timestamp = &ts;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts));
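+ // "b" has three versions newer than the read timestamp, exceeding
+ // max_skippable_internal_keys, so Next() must report Status::Incomplete().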
+ iter->SeekToFirst();
+ iter->Next();
+ ASSERT_TRUE(iter->status().IsIncomplete());
+ }
+ Close();
+}
+
+TEST_F(DBBasicTestWithTimestamp, MaxKeysSkippedDuringPrev) {
+ Options options = GetDefaultOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ DestroyAndReopen(options);
+ constexpr size_t max_skippable_internal_keys = 2;
+ const size_t kNumKeys = max_skippable_internal_keys + 2;
+ WriteOptions write_opts;
+ Status s;
+ {
+ std::string ts = Timestamp(1, 0);
+ ASSERT_OK(db_->Put(write_opts, "b", ts, "value"));
+ }
+ for (size_t i = 0; i < kNumKeys; ++i) {
+ std::string ts = Timestamp(static_cast<uint64_t>(i + 1), 0);
+ s = db_->Put(write_opts, "a", ts, "value" + std::to_string(i));
+ ASSERT_OK(s);
+ }
+ {
+ ReadOptions read_opts;
+ read_opts.max_skippable_internal_keys = max_skippable_internal_keys;
+ std::string ts_str = Timestamp(1, 0);
+ Slice ts = ts_str;
+ read_opts.timestamp = &ts;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts));
+ iter->SeekToLast();
+ iter->Prev();
+ ASSERT_TRUE(iter->status().IsIncomplete());
+ }
+ Close();
+}
+
+ // Create two L0 files and compact them into a new L1 file. In this test, L1 is the
+ // bottommost level.
+ // The two L0 files:
+ // f1: <a, 1, kTypeValue>
+ // f2: <a, 3, kTypeDeletionWithTimestamp> ... <b, 2, kTypeValue>
+ // Since f2.smallest < f1.largest < f2.largest, f1 and f2 become the inputs of a real
+ // compaction instead of a trivial move.
+TEST_F(DBBasicTestWithTimestamp, CompactDeletionWithTimestampMarkerToBottom) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ options.num_levels = 2;
+ options.level0_file_num_compaction_trigger = 2;
+ DestroyAndReopen(options);
+ WriteOptions write_opts;
+ std::string ts = Timestamp(1, 0);
+ ASSERT_OK(db_->Put(write_opts, "a", ts, "value0"));
+ ASSERT_OK(Flush());
+
+ ts = Timestamp(2, 0);
+ ASSERT_OK(db_->Put(write_opts, "b", ts, "value0"));
+ ts = Timestamp(3, 0);
+ ASSERT_OK(db_->Delete(write_opts, "a", ts));
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ ReadOptions read_opts;
+ ts = Timestamp(1, 0);
+ Slice read_ts = ts;
+ read_opts.timestamp = &read_ts;
+ std::string value;
+ Status s = db_->Get(read_opts, "a", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ("value0", value);
+
+ ts = Timestamp(3, 0);
+ read_ts = ts;
+ read_opts.timestamp = &read_ts;
+ std::string key_ts;
+ s = db_->Get(read_opts, "a", &value, &key_ts);
+ ASSERT_TRUE(s.IsNotFound());
+ ASSERT_EQ(Timestamp(3, 0), key_ts);
+
+ // Time-travel to the past before deletion
+ ts = Timestamp(2, 0);
+ read_ts = ts;
+ read_opts.timestamp = &read_ts;
+ s = db_->Get(read_opts, "a", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ("value0", value);
+ Close();
+}
+
+#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+class DBBasicTestWithTimestampFilterPrefixSettings
+ : public DBBasicTestWithTimestampBase,
+ public testing::WithParamInterface<
+ std::tuple<std::shared_ptr<const FilterPolicy>, bool, bool,
+ std::shared_ptr<const SliceTransform>, bool, double,
+ BlockBasedTableOptions::IndexType>> {
+ public:
+ DBBasicTestWithTimestampFilterPrefixSettings()
+ : DBBasicTestWithTimestampBase(
+ "db_basic_test_with_timestamp_filter_prefix") {}
+};
+
+TEST_P(DBBasicTestWithTimestampFilterPrefixSettings, GetAndMultiGet) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ BlockBasedTableOptions bbto;
+ bbto.filter_policy = std::get<0>(GetParam());
+ bbto.whole_key_filtering = std::get<1>(GetParam());
+ bbto.cache_index_and_filter_blocks = std::get<2>(GetParam());
+ bbto.index_type = std::get<6>(GetParam());
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ options.prefix_extractor = std::get<3>(GetParam());
+ options.memtable_whole_key_filtering = std::get<4>(GetParam());
+ options.memtable_prefix_bloom_size_ratio = std::get<5>(GetParam());
+
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ DestroyAndReopen(options);
+ const int kMaxKey = 1000;
+
+ // Write any value
+ WriteOptions write_opts;
+ std::string ts = Timestamp(1, 0);
+
+ int idx = 0;
+ for (; idx < kMaxKey / 4; idx++) {
+ ASSERT_OK(db_->Put(write_opts, Key1(idx), ts, "bar"));
+ ASSERT_OK(db_->Put(write_opts, KeyWithPrefix("foo", idx), ts, "bar"));
+ }
+
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+ for (; idx < kMaxKey / 2; idx++) {
+ ASSERT_OK(db_->Put(write_opts, Key1(idx), ts, "bar"));
+ ASSERT_OK(db_->Put(write_opts, KeyWithPrefix("foo", idx), ts, "bar"));
+ }
+
+ ASSERT_OK(Flush());
+
+ for (; idx < kMaxKey; idx++) {
+ ASSERT_OK(db_->Put(write_opts, Key1(idx), ts, "bar"));
+ ASSERT_OK(db_->Put(write_opts, KeyWithPrefix("foo", idx), ts, "bar"));
+ }
+
+ // Read with MultiGet
+ ReadOptions read_opts;
+ Slice read_ts = ts;
+ read_opts.timestamp = &read_ts;
+
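+ // The first two keys in each batch below were written above; the two offset by
+ // kMaxKey were not, so they must be NotFound regardless of the filter and prefix
+ // settings.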
+ for (idx = 0; idx < kMaxKey; idx++) {
+ size_t batch_size = 4;
+ std::vector<std::string> keys_str(batch_size);
+ std::vector<PinnableSlice> values(batch_size);
+ std::vector<Status> statuses(batch_size);
+ ColumnFamilyHandle* cfh = db_->DefaultColumnFamily();
+
+ keys_str[0] = Key1(idx);
+ keys_str[1] = KeyWithPrefix("foo", idx);
+ keys_str[2] = Key1(kMaxKey + idx);
+ keys_str[3] = KeyWithPrefix("foo", kMaxKey + idx);
+
+ auto keys = ConvertStrToSlice(keys_str);
+
+ db_->MultiGet(read_opts, cfh, batch_size, keys.data(), values.data(),
+ statuses.data());
+
+ for (int i = 0; i < 2; i++) {
+ ASSERT_OK(statuses[i]);
+ }
+ for (int i = 2; i < 4; i++) {
+ ASSERT_TRUE(statuses[i].IsNotFound());
+ }
+
+ for (int i = 0; i < 2; i++) {
+ std::string value;
+ ASSERT_OK(db_->Get(read_opts, keys[i], &value));
+ std::unique_ptr<Iterator> it1(db_->NewIterator(read_opts));
+ ASSERT_NE(nullptr, it1);
+ ASSERT_OK(it1->status());
+ it1->Seek(keys[i]);
+ ASSERT_TRUE(it1->Valid());
+ }
+
+ for (int i = 2; i < 4; i++) {
+ std::string value;
+ Status s = db_->Get(read_opts, keys[i], &value);
+ ASSERT_TRUE(s.IsNotFound());
+ }
+ }
+ Close();
+}
+
+INSTANTIATE_TEST_CASE_P(
+ Timestamp, DBBasicTestWithTimestampFilterPrefixSettings,
+ ::testing::Combine(
+ ::testing::Values(
+ std::shared_ptr<const FilterPolicy>(nullptr),
+ std::shared_ptr<const FilterPolicy>(NewBloomFilterPolicy(10, true)),
+ std::shared_ptr<const FilterPolicy>(NewBloomFilterPolicy(10,
+ false))),
+ ::testing::Bool(), ::testing::Bool(),
+ ::testing::Values(
+ std::shared_ptr<const SliceTransform>(NewFixedPrefixTransform(1)),
+ std::shared_ptr<const SliceTransform>(NewFixedPrefixTransform(4)),
+ std::shared_ptr<const SliceTransform>(NewFixedPrefixTransform(7)),
+ std::shared_ptr<const SliceTransform>(NewFixedPrefixTransform(8))),
+ ::testing::Bool(), ::testing::Values(0, 0.1),
+ ::testing::Values(
+ BlockBasedTableOptions::IndexType::kBinarySearch,
+ BlockBasedTableOptions::IndexType::kHashSearch,
+ BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch,
+ BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey)));
+#endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+
+class DataVisibilityTest : public DBBasicTestWithTimestampBase {
+ public:
+ DataVisibilityTest() : DBBasicTestWithTimestampBase("data_visibility_test") {
+ // Initialize test data
+ for (int i = 0; i < kTestDataSize; i++) {
+ test_data_[i].key = "key" + std::to_string(i);
+ test_data_[i].value = "value" + std::to_string(i);
+ test_data_[i].timestamp = Timestamp(i, 0);
+ test_data_[i].ts = i;
+ test_data_[i].seq_num = kMaxSequenceNumber;
+ }
+ }
+
+ protected:
+ struct TestData {
+ std::string key;
+ std::string value;
+ int ts;
+ std::string timestamp;
+ SequenceNumber seq_num;
+ };
+
+ constexpr static int kTestDataSize = 3;
+ TestData test_data_[kTestDataSize];
+
+ void PutTestData(int index, ColumnFamilyHandle* cfh = nullptr) {
+ ASSERT_LT(index, kTestDataSize);
+ WriteOptions write_opts;
+
+ if (cfh == nullptr) {
+ ASSERT_OK(db_->Put(write_opts, test_data_[index].key,
+ test_data_[index].timestamp, test_data_[index].value));
+ const Snapshot* snap = db_->GetSnapshot();
+ test_data_[index].seq_num = snap->GetSequenceNumber();
+ if (index > 0) {
+ ASSERT_GT(test_data_[index].seq_num, test_data_[index - 1].seq_num);
+ }
+ db_->ReleaseSnapshot(snap);
+ } else {
+ ASSERT_OK(db_->Put(write_opts, cfh, test_data_[index].key,
+ test_data_[index].timestamp, test_data_[index].value));
+ }
+ }
+
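+ // A key is expected to be visible only if both its sequence number and its
+ // timestamp are covered by the read's snapshot and read timestamp.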
+ void AssertVisibility(int ts, SequenceNumber seq,
+ std::vector<Status> statuses) {
+ ASSERT_EQ(kTestDataSize, statuses.size());
+ for (int i = 0; i < kTestDataSize; i++) {
+ if (test_data_[i].seq_num <= seq && test_data_[i].ts <= ts) {
+ ASSERT_OK(statuses[i]);
+ } else {
+ ASSERT_TRUE(statuses[i].IsNotFound());
+ }
+ }
+ }
+
+ std::vector<Slice> GetKeys() {
+ std::vector<Slice> ret(kTestDataSize);
+ for (int i = 0; i < kTestDataSize; i++) {
+ ret[i] = test_data_[i].key;
+ }
+ return ret;
+ }
+
+ void VerifyDefaultCF(int ts, const Snapshot* snap = nullptr) {
+ ReadOptions read_opts;
+ std::string read_ts = Timestamp(ts, 0);
+ Slice read_ts_slice = read_ts;
+ read_opts.timestamp = &read_ts_slice;
+ read_opts.snapshot = snap;
+
+ ColumnFamilyHandle* cfh = db_->DefaultColumnFamily();
+ std::vector<ColumnFamilyHandle*> cfs(kTestDataSize, cfh);
+ SequenceNumber seq =
+ snap ? snap->GetSequenceNumber() : kMaxSequenceNumber - 1;
+
+ // There are several MultiGet interfaces whose implementations are not exactly the
+ // same; query the data with all of them.
+ auto keys = GetKeys();
+ std::vector<std::string> values;
+ auto s1 = db_->MultiGet(read_opts, cfs, keys, &values);
+ AssertVisibility(ts, seq, s1);
+
+ auto s2 = db_->MultiGet(read_opts, keys, &values);
+ AssertVisibility(ts, seq, s2);
+
+ std::vector<std::string> timestamps;
+ auto s3 = db_->MultiGet(read_opts, cfs, keys, &values, &timestamps);
+ AssertVisibility(ts, seq, s3);
+
+ auto s4 = db_->MultiGet(read_opts, keys, &values, &timestamps);
+ AssertVisibility(ts, seq, s4);
+
+ std::vector<PinnableSlice> values_ps5(kTestDataSize);
+ std::vector<Status> s5(kTestDataSize);
+ db_->MultiGet(read_opts, cfh, kTestDataSize, keys.data(), values_ps5.data(),
+ s5.data());
+ AssertVisibility(ts, seq, s5);
+
+ std::vector<PinnableSlice> values_ps6(kTestDataSize);
+ std::vector<Status> s6(kTestDataSize);
+ std::vector<std::string> timestamps_array(kTestDataSize);
+ db_->MultiGet(read_opts, cfh, kTestDataSize, keys.data(), values_ps6.data(),
+ timestamps_array.data(), s6.data());
+ AssertVisibility(ts, seq, s6);
+
+ std::vector<PinnableSlice> values_ps7(kTestDataSize);
+ std::vector<Status> s7(kTestDataSize);
+ db_->MultiGet(read_opts, kTestDataSize, cfs.data(), keys.data(),
+ values_ps7.data(), s7.data());
+ AssertVisibility(ts, seq, s7);
+
+ std::vector<PinnableSlice> values_ps8(kTestDataSize);
+ std::vector<Status> s8(kTestDataSize);
+ db_->MultiGet(read_opts, kTestDataSize, cfs.data(), keys.data(),
+ values_ps8.data(), timestamps_array.data(), s8.data());
+ AssertVisibility(ts, seq, s8);
+ }
+
+ void VerifyDefaultCF(const Snapshot* snap = nullptr) {
+ for (int i = 0; i <= kTestDataSize; i++) {
+ VerifyDefaultCF(i, snap);
+ }
+ }
+};
+constexpr int DataVisibilityTest::kTestDataSize;
+
+// Application specifies timestamp but not snapshot.
+// reader writer
+// ts'=90
+// ts=100
+// seq=10
+// seq'=11
+// write finishes
+// GetImpl(ts,seq)
+// It is OK to return <k, t1, s1> if ts>=t1 AND seq>=s1. If ts>=t1 but seq<s1,
+// the key should not be returned.
+TEST_F(DataVisibilityTest, PointLookupWithoutSnapshot1) {
+ Options options = CurrentOptions();
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ DestroyAndReopen(options);
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->LoadDependency({
+ {"DBImpl::GetImpl:3",
+ "DataVisibilityTest::PointLookupWithoutSnapshot1:BeforePut"},
+ {"DataVisibilityTest::PointLookupWithoutSnapshot1:AfterPut",
+ "DBImpl::GetImpl:4"},
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
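+ // The sync point dependencies pause GetImpl so the Put completes only after the
+ // read has already chosen its sequence number; the key must therefore stay
+ // invisible to this read even though its timestamp qualifies.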
+ port::Thread writer_thread([this]() {
+ std::string write_ts = Timestamp(1, 0);
+ WriteOptions write_opts;
+ TEST_SYNC_POINT(
+ "DataVisibilityTest::PointLookupWithoutSnapshot1:BeforePut");
+ Status s = db_->Put(write_opts, "foo", write_ts, "value");
+ ASSERT_OK(s);
+ TEST_SYNC_POINT("DataVisibilityTest::PointLookupWithoutSnapshot1:AfterPut");
+ });
+ ReadOptions read_opts;
+ std::string read_ts_str = Timestamp(3, 0);
+ Slice read_ts = read_ts_str;
+ read_opts.timestamp = &read_ts;
+ std::string value;
+ Status s = db_->Get(read_opts, "foo", &value);
+
+ writer_thread.join();
+ ASSERT_TRUE(s.IsNotFound());
+ Close();
+}
+
+// Application specifies timestamp but not snapshot.
+// reader writer
+// ts'=90
+// ts=100
+// seq=10
+// seq'=11
+// write finishes
+// Flush
+// GetImpl(ts,seq)
+// It is OK to return <k, t1, s1> if ts>=t1 AND seq>=s1. If ts>=t1 but seq<s1,
+// the key should not be returned.
+TEST_F(DataVisibilityTest, PointLookupWithoutSnapshot2) {
+ Options options = CurrentOptions();
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ DestroyAndReopen(options);
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->LoadDependency({
+ {"DBImpl::GetImpl:3",
+ "DataVisibilityTest::PointLookupWithoutSnapshot2:BeforePut"},
+ {"DataVisibilityTest::PointLookupWithoutSnapshot2:AfterPut",
+ "DBImpl::GetImpl:4"},
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ port::Thread writer_thread([this]() {
+ std::string write_ts = Timestamp(1, 0);
+ WriteOptions write_opts;
+ TEST_SYNC_POINT(
+ "DataVisibilityTest::PointLookupWithoutSnapshot2:BeforePut");
+ Status s = db_->Put(write_opts, "foo", write_ts, "value");
+ ASSERT_OK(s);
+ ASSERT_OK(Flush());
+
+ write_ts = Timestamp(2, 0);
+ s = db_->Put(write_opts, "bar", write_ts, "value");
+ ASSERT_OK(s);
+ TEST_SYNC_POINT("DataVisibilityTest::PointLookupWithoutSnapshot2:AfterPut");
+ });
+ ReadOptions read_opts;
+ std::string read_ts_str = Timestamp(3, 0);
+ Slice read_ts = read_ts_str;
+ read_opts.timestamp = &read_ts;
+ std::string value;
+ Status s = db_->Get(read_opts, "foo", &value);
+ writer_thread.join();
+ ASSERT_TRUE(s.IsNotFound());
+ Close();
+}
+
+// Application specifies both timestamp and snapshot.
+// reader writer
+// seq=10
+// ts'=90
+// ts=100
+// seq'=11
+// write finishes
+// GetImpl(ts,seq)
+ // Since the application specifies both timestamp and snapshot, it expects to see
+ // data that is visible at BOTH the timestamp and the sequence number. Therefore,
+// <k, t1, s1> can be returned only if t1<=ts AND s1<=seq.
+TEST_F(DataVisibilityTest, PointLookupWithSnapshot1) {
+ Options options = CurrentOptions();
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ DestroyAndReopen(options);
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->LoadDependency({
+ {"DataVisibilityTest::PointLookupWithSnapshot1:AfterTakingSnap",
+ "DataVisibilityTest::PointLookupWithSnapshot1:BeforePut"},
+ {"DataVisibilityTest::PointLookupWithSnapshot1:AfterPut",
+ "DBImpl::GetImpl:1"},
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ port::Thread writer_thread([this]() {
+ std::string write_ts = Timestamp(1, 0);
+ WriteOptions write_opts;
+ TEST_SYNC_POINT("DataVisibilityTest::PointLookupWithSnapshot1:BeforePut");
+ Status s = db_->Put(write_opts, "foo", write_ts, "value");
+ TEST_SYNC_POINT("DataVisibilityTest::PointLookupWithSnapshot1:AfterPut");
+ ASSERT_OK(s);
+ });
+ ReadOptions read_opts;
+ const Snapshot* snap = db_->GetSnapshot();
+ TEST_SYNC_POINT(
+ "DataVisibilityTest::PointLookupWithSnapshot1:AfterTakingSnap");
+ read_opts.snapshot = snap;
+ std::string read_ts_str = Timestamp(3, 0);
+ Slice read_ts = read_ts_str;
+ read_opts.timestamp = &read_ts;
+ std::string value;
+ Status s = db_->Get(read_opts, "foo", &value);
+ writer_thread.join();
+
+ ASSERT_TRUE(s.IsNotFound());
+
+ db_->ReleaseSnapshot(snap);
+ Close();
+}
+
+// Application specifies both timestamp and snapshot.
+// reader writer
+// seq=10
+// ts'=90
+// ts=100
+// seq'=11
+// write finishes
+// Flush
+// GetImpl(ts,seq)
+ // Since the application specifies both timestamp and snapshot, it expects to see
+ // data that is visible at BOTH the timestamp and the sequence number. Therefore,
+// <k, t1, s1> can be returned only if t1<=ts AND s1<=seq.
+TEST_F(DataVisibilityTest, PointLookupWithSnapshot2) {
+ Options options = CurrentOptions();
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ DestroyAndReopen(options);
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->LoadDependency({
+ {"DataVisibilityTest::PointLookupWithSnapshot2:AfterTakingSnap",
+ "DataVisibilityTest::PointLookupWithSnapshot2:BeforePut"},
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ port::Thread writer_thread([this]() {
+ std::string write_ts = Timestamp(1, 0);
+ WriteOptions write_opts;
+ TEST_SYNC_POINT("DataVisibilityTest::PointLookupWithSnapshot2:BeforePut");
+ Status s = db_->Put(write_opts, "foo", write_ts, "value1");
+ ASSERT_OK(s);
+ ASSERT_OK(Flush());
+
+ write_ts = Timestamp(2, 0);
+ s = db_->Put(write_opts, "bar", write_ts, "value2");
+ ASSERT_OK(s);
+ });
+ const Snapshot* snap = db_->GetSnapshot();
+ TEST_SYNC_POINT(
+ "DataVisibilityTest::PointLookupWithSnapshot2:AfterTakingSnap");
+ writer_thread.join();
+ std::string read_ts_str = Timestamp(3, 0);
+ Slice read_ts = read_ts_str;
+ ReadOptions read_opts;
+ read_opts.snapshot = snap;
+ read_opts.timestamp = &read_ts;
+ std::string value;
+ Status s = db_->Get(read_opts, "foo", &value);
+ ASSERT_TRUE(s.IsNotFound());
+ db_->ReleaseSnapshot(snap);
+ Close();
+}
+
+// Application specifies timestamp but not snapshot.
+// reader writer
+// ts'=90
+// ts=100
+// seq=10
+// seq'=11
+// write finishes
+// scan(ts,seq)
+// <k, t1, s1> can be seen in scan as long as ts>=t1 AND seq>=s1. If ts>=t1 but
+// seq<s1, then the key should not be returned.
+TEST_F(DataVisibilityTest, RangeScanWithoutSnapshot) {
+ Options options = CurrentOptions();
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ DestroyAndReopen(options);
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->LoadDependency({
+ {"DBImpl::NewIterator:3",
+ "DataVisibilityTest::RangeScanWithoutSnapshot:BeforePut"},
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ port::Thread writer_thread([this]() {
+ WriteOptions write_opts;
+ TEST_SYNC_POINT("DataVisibilityTest::RangeScanWithoutSnapshot:BeforePut");
+ for (int i = 0; i < 3; ++i) {
+ std::string write_ts = Timestamp(i + 1, 0);
+ Status s = db_->Put(write_opts, "key" + std::to_string(i), write_ts,
+ "value" + std::to_string(i));
+ ASSERT_OK(s);
+ }
+ });
+ std::string read_ts_str = Timestamp(10, 0);
+ Slice read_ts = read_ts_str;
+ ReadOptions read_opts;
+ read_opts.total_order_seek = true;
+ read_opts.timestamp = &read_ts;
+ Iterator* it = db_->NewIterator(read_opts);
+ ASSERT_NE(nullptr, it);
+ writer_thread.join();
+ it->SeekToFirst();
+ ASSERT_FALSE(it->Valid());
+ delete it;
+ Close();
+}
+
+// Application specifies both timestamp and snapshot.
+// reader writer
+// seq=10
+// ts'=90
+// ts=100 seq'=11
+// write finishes
+// scan(ts,seq)
+// <k, t1, s1> can be seen by the scan only if t1<=ts AND s1<=seq. If t1<=ts
+// but s1>seq, then the key should not be returned.
+TEST_F(DataVisibilityTest, RangeScanWithSnapshot) {
+ Options options = CurrentOptions();
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ DestroyAndReopen(options);
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->LoadDependency({
+ {"DataVisibilityTest::RangeScanWithSnapshot:AfterTakingSnapshot",
+ "DataVisibilityTest::RangeScanWithSnapshot:BeforePut"},
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ port::Thread writer_thread([this]() {
+ WriteOptions write_opts;
+ TEST_SYNC_POINT("DataVisibilityTest::RangeScanWithSnapshot:BeforePut");
+ for (int i = 0; i < 3; ++i) {
+ std::string write_ts = Timestamp(i + 1, 0);
+ Status s = db_->Put(write_opts, "key" + std::to_string(i), write_ts,
+ "value" + std::to_string(i));
+ ASSERT_OK(s);
+ }
+ });
+ const Snapshot* snap = db_->GetSnapshot();
+ TEST_SYNC_POINT(
+ "DataVisibilityTest::RangeScanWithSnapshot:AfterTakingSnapshot");
+
+ writer_thread.join();
+
+ std::string read_ts_str = Timestamp(10, 0);
+ Slice read_ts = read_ts_str;
+ ReadOptions read_opts;
+ read_opts.snapshot = snap;
+ read_opts.total_order_seek = true;
+ read_opts.timestamp = &read_ts;
+ Iterator* it = db_->NewIterator(read_opts);
+ ASSERT_NE(nullptr, it);
+ it->Seek("key0");
+ ASSERT_FALSE(it->Valid());
+
+ delete it;
+ db_->ReleaseSnapshot(snap);
+ Close();
+}
+
+// Application specifies both timestamp and snapshot.
+ // Query each combination and make sure MultiGet only returns a key <k, t1, s1> when
+ // ts >= t1 AND seq >= s1.
+TEST_F(DataVisibilityTest, MultiGetWithTimestamp) {
+ Options options = CurrentOptions();
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ DestroyAndReopen(options);
+
+ const Snapshot* snap0 = db_->GetSnapshot();
+ PutTestData(0);
+ VerifyDefaultCF();
+ VerifyDefaultCF(snap0);
+
+ const Snapshot* snap1 = db_->GetSnapshot();
+ PutTestData(1);
+ VerifyDefaultCF();
+ VerifyDefaultCF(snap0);
+ VerifyDefaultCF(snap1);
+
+ ASSERT_OK(Flush());
+
+ const Snapshot* snap2 = db_->GetSnapshot();
+ PutTestData(2);
+ VerifyDefaultCF();
+ VerifyDefaultCF(snap0);
+ VerifyDefaultCF(snap1);
+ VerifyDefaultCF(snap2);
+
+ db_->ReleaseSnapshot(snap0);
+ db_->ReleaseSnapshot(snap1);
+ db_->ReleaseSnapshot(snap2);
+
+ Close();
+}
+
+// Application specifies timestamp but not snapshot.
+// reader writer
+// ts'=0, 1
+// ts=3
+// seq=10
+// seq'=11, 12
+// write finishes
+// MultiGet(ts,seq)
+// MultiGet should only return keys <k, t1, s1> that satisfy t1<=ts AND s1<=seq.
+TEST_F(DataVisibilityTest, MultiGetWithoutSnapshot) {
+ Options options = CurrentOptions();
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ DestroyAndReopen(options);
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->LoadDependency({
+ {"DBImpl::MultiGet:AfterGetSeqNum1",
+ "DataVisibilityTest::MultiGetWithoutSnapshot:BeforePut"},
+ {"DataVisibilityTest::MultiGetWithoutSnapshot:AfterPut",
+ "DBImpl::MultiGet:AfterGetSeqNum2"},
+ });
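+ // The first dependency delays the puts until MultiGet has latched its
+ // sequence number; the second keeps MultiGet from proceeding until the puts
+ // finish. The puts therefore carry sequence numbers larger than MultiGet's,
+ // so they remain invisible even though their timestamps satisfy the read
+ // timestamp bound.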
+ SyncPoint::GetInstance()->EnableProcessing();
+ port::Thread writer_thread([this]() {
+ TEST_SYNC_POINT("DataVisibilityTest::MultiGetWithoutSnapshot:BeforePut");
+ PutTestData(0);
+ PutTestData(1);
+ TEST_SYNC_POINT("DataVisibilityTest::MultiGetWithoutSnapshot:AfterPut");
+ });
+
+ ReadOptions read_opts;
+ std::string read_ts = Timestamp(kTestDataSize, 0);
+ Slice read_ts_slice = read_ts;
+ read_opts.timestamp = &read_ts_slice;
+ auto keys = GetKeys();
+ std::vector<std::string> values;
+ auto ss = db_->MultiGet(read_opts, keys, &values);
+
+ writer_thread.join();
+ for (auto s : ss) {
+ ASSERT_TRUE(s.IsNotFound());
+ }
+ VerifyDefaultCF();
+ Close();
+}
+
+TEST_F(DataVisibilityTest, MultiGetCrossCF) {
+ Options options = CurrentOptions();
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ DestroyAndReopen(options);
+
+ CreateAndReopenWithCF({"second"}, options);
+ ColumnFamilyHandle* second_cf = handles_[1];
+
+ const Snapshot* snap0 = db_->GetSnapshot();
+ PutTestData(0);
+ PutTestData(0, second_cf);
+ VerifyDefaultCF();
+ VerifyDefaultCF(snap0);
+
+ const Snapshot* snap1 = db_->GetSnapshot();
+ PutTestData(1);
+ PutTestData(1, second_cf);
+ VerifyDefaultCF();
+ VerifyDefaultCF(snap0);
+ VerifyDefaultCF(snap1);
+
+ ASSERT_OK(Flush());
+
+ const Snapshot* snap2 = db_->GetSnapshot();
+ PutTestData(2);
+ PutTestData(2, second_cf);
+ VerifyDefaultCF();
+ VerifyDefaultCF(snap0);
+ VerifyDefaultCF(snap1);
+ VerifyDefaultCF(snap2);
+
+ ReadOptions read_opts;
+ std::string read_ts = Timestamp(kTestDataSize, 0);
+ Slice read_ts_slice = read_ts;
+ read_opts.timestamp = &read_ts_slice;
+ read_opts.snapshot = snap1;
+ auto keys = GetKeys();
+ auto keys2 = GetKeys();
+ keys.insert(keys.end(), keys2.begin(), keys2.end());
+ std::vector<ColumnFamilyHandle*> cfs(kTestDataSize,
+ db_->DefaultColumnFamily());
+ std::vector<ColumnFamilyHandle*> cfs2(kTestDataSize, second_cf);
+ cfs.insert(cfs.end(), cfs2.begin(), cfs2.end());
+
+ std::vector<std::string> values;
+ auto ss = db_->MultiGet(read_opts, cfs, keys, &values);
+ for (int i = 0; i < 2 * kTestDataSize; i++) {
+ if (i % 3 == 0) {
+ // only the first key for each column family should be returned
+ ASSERT_OK(ss[i]);
+ } else {
+ ASSERT_TRUE(ss[i].IsNotFound());
+ }
+ }
+
+ db_->ReleaseSnapshot(snap0);
+ db_->ReleaseSnapshot(snap1);
+ db_->ReleaseSnapshot(snap2);
+ Close();
+}
+
+#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+class DBBasicTestWithTimestampCompressionSettings
+ : public DBBasicTestWithTimestampBase,
+ public testing::WithParamInterface<
+ std::tuple<std::shared_ptr<const FilterPolicy>, CompressionType,
+ uint32_t, uint32_t>> {
+ public:
+ DBBasicTestWithTimestampCompressionSettings()
+ : DBBasicTestWithTimestampBase(
+ "db_basic_test_with_timestamp_compression") {}
+};
+
+TEST_P(DBBasicTestWithTimestampCompressionSettings, PutAndGet) {
+ const int kNumKeysPerFile = 1024;
+ const size_t kNumTimestamps = 4;
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.env = env_;
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(kNumKeysPerFile));
+ size_t ts_sz = Timestamp(0, 0).size();
+ TestComparator test_cmp(ts_sz);
+ options.comparator = &test_cmp;
+ BlockBasedTableOptions bbto;
+ bbto.filter_policy = std::get<0>(GetParam());
+ bbto.whole_key_filtering = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+
+ const CompressionType comp_type = std::get<1>(GetParam());
+#if LZ4_VERSION_NUMBER < 10400 // i.e. before LZ4 r124
+ if (comp_type == kLZ4Compression || comp_type == kLZ4HCCompression) {
+ return;
+ }
+#endif // LZ4_VERSION_NUMBER < 10400
+ if (!ZSTD_Supported() && comp_type == kZSTD) {
+ return;
+ }
+ if (!Zlib_Supported() && comp_type == kZlibCompression) {
+ return;
+ }
+
+ options.compression = comp_type;
+ options.compression_opts.max_dict_bytes = std::get<2>(GetParam());
+ if (comp_type == kZSTD) {
+ options.compression_opts.zstd_max_train_bytes = std::get<2>(GetParam());
+ }
+ options.compression_opts.parallel_threads = std::get<3>(GetParam());
+ options.target_file_size_base = 1 << 26; // 64MB
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+ size_t num_cfs = handles_.size();
+ ASSERT_EQ(2, num_cfs);
+ std::vector<std::string> write_ts_list;
+ std::vector<std::string> read_ts_list;
+
+ for (size_t i = 0; i != kNumTimestamps; ++i) {
+ write_ts_list.push_back(Timestamp(i * 2, 0));
+ read_ts_list.push_back(Timestamp(1 + i * 2, 0));
+ const Slice write_ts = write_ts_list.back();
+ WriteOptions wopts;
+ for (int cf = 0; cf != static_cast<int>(num_cfs); ++cf) {
+ for (size_t j = 0; j != (kNumKeysPerFile - 1) / kNumTimestamps; ++j) {
+ ASSERT_OK(
+ db_->Put(wopts, handles_[cf], Key1(j), write_ts,
+ "value_" + std::to_string(j) + "_" + std::to_string(i)));
+ }
+ }
+ }
+ const auto& verify_db_func = [&]() {
+ for (size_t i = 0; i != kNumTimestamps; ++i) {
+ ReadOptions ropts;
+ const Slice read_ts = read_ts_list[i];
+ ropts.timestamp = &read_ts;
+ for (int cf = 0; cf != static_cast<int>(num_cfs); ++cf) {
+ ColumnFamilyHandle* cfh = handles_[cf];
+ for (size_t j = 0; j != (kNumKeysPerFile - 1) / kNumTimestamps; ++j) {
+ std::string value;
+ ASSERT_OK(db_->Get(ropts, cfh, Key1(j), &value));
+ ASSERT_EQ("value_" + std::to_string(j) + "_" + std::to_string(i),
+ value);
+ }
+ }
+ }
+ };
+ verify_db_func();
+ Close();
+}
+
+TEST_P(DBBasicTestWithTimestampCompressionSettings, PutDeleteGet) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ const int kNumKeysPerFile = 1024;
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(kNumKeysPerFile));
+ BlockBasedTableOptions bbto;
+ bbto.filter_policy = std::get<0>(GetParam());
+ bbto.whole_key_filtering = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+
+ const CompressionType comp_type = std::get<1>(GetParam());
+#if LZ4_VERSION_NUMBER < 10400 // i.e. before LZ4 r124
+ if (comp_type == kLZ4Compression || comp_type == kLZ4HCCompression) {
+ return;
+ }
+#endif // LZ4_VERSION_NUMBER < 10400
+ if (!ZSTD_Supported() && comp_type == kZSTD) {
+ return;
+ }
+ if (!Zlib_Supported() && comp_type == kZlibCompression) {
+ return;
+ }
+
+ options.compression = comp_type;
+ options.compression_opts.max_dict_bytes = std::get<2>(GetParam());
+ if (comp_type == kZSTD) {
+ options.compression_opts.zstd_max_train_bytes = std::get<2>(GetParam());
+ }
+ options.compression_opts.parallel_threads = std::get<3>(GetParam());
+ options.target_file_size_base = 1 << 26; // 64MB
+
+ DestroyAndReopen(options);
+
+ const size_t kNumL0Files =
+ static_cast<size_t>(Options().level0_file_num_compaction_trigger);
+ {
+ // Half of the keys will go through Deletion and the remaining half through
+ // SingleDeletion. Generate enough L0 files with ts=1 to trigger compaction
+ // to L1.
+ std::string ts = Timestamp(1, 0);
+ WriteOptions wopts;
+ for (size_t i = 0; i < kNumL0Files; ++i) {
+ for (int j = 0; j < kNumKeysPerFile; ++j) {
+ ASSERT_OK(db_->Put(wopts, Key1(j), ts, "value" + std::to_string(i)));
+ }
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ // Generate another L0 at ts=3
+ ts = Timestamp(3, 0);
+ for (int i = 0; i < kNumKeysPerFile; ++i) {
+ std::string key_str = Key1(i);
+ Slice key(key_str);
+ if ((i % 3) == 0) {
+ if (i < kNumKeysPerFile / 2) {
+ ASSERT_OK(db_->Delete(wopts, key, ts));
+ } else {
+ ASSERT_OK(db_->SingleDelete(wopts, key, ts));
+ }
+ } else {
+ ASSERT_OK(db_->Put(wopts, key, ts, "new_value"));
+ }
+ }
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ // Populate memtable at ts=5
+ ts = Timestamp(5, 0);
+ for (int i = 0; i != kNumKeysPerFile; ++i) {
+ std::string key_str = Key1(i);
+ Slice key(key_str);
+ if ((i % 3) == 1) {
+ if (i < kNumKeysPerFile / 2) {
+ ASSERT_OK(db_->Delete(wopts, key, ts));
+ } else {
+ ASSERT_OK(db_->SingleDelete(wopts, key, ts));
+ }
+ } else if ((i % 3) == 2) {
+ ASSERT_OK(db_->Put(wopts, key, ts, "new_value_2"));
+ }
+ }
+ }
+ {
+ std::string ts_str = Timestamp(6, 0);
+ Slice ts = ts_str;
+ ReadOptions ropts;
+ ropts.timestamp = &ts;
+ for (uint64_t i = 0; i != static_cast<uint64_t>(kNumKeysPerFile); ++i) {
+ std::string value;
+ std::string key_ts;
+ Status s = db_->Get(ropts, Key1(i), &value, &key_ts);
+ if ((i % 3) == 2) {
+ ASSERT_OK(s);
+ ASSERT_EQ("new_value_2", value);
+ ASSERT_EQ(Timestamp(5, 0), key_ts);
+ } else if ((i % 3) == 1) {
+ ASSERT_TRUE(s.IsNotFound());
+ ASSERT_EQ(Timestamp(5, 0), key_ts);
+ } else {
+ ASSERT_TRUE(s.IsNotFound());
+ ASSERT_EQ(Timestamp(3, 0), key_ts);
+ }
+ }
+ }
+}
+
+#ifndef ROCKSDB_LITE
+// A class which remembers the name of each flushed file.
+class FlushedFileCollector : public EventListener {
+ public:
+ FlushedFileCollector() {}
+ ~FlushedFileCollector() override {}
+
+ void OnFlushCompleted(DB* /*db*/, const FlushJobInfo& info) override {
+ InstrumentedMutexLock lock(&mutex_);
+ flushed_files_.push_back(info.file_path);
+ }
+
+ std::vector<std::string> GetFlushedFiles() {
+ std::vector<std::string> result;
+ {
+ InstrumentedMutexLock lock(&mutex_);
+ result = flushed_files_;
+ }
+ return result;
+ }
+
+ void ClearFlushedFiles() {
+ InstrumentedMutexLock lock(&mutex_);
+ flushed_files_.clear();
+ }
+
+ private:
+ std::vector<std::string> flushed_files_;
+ InstrumentedMutex mutex_;
+};
+
+TEST_P(DBBasicTestWithTimestampCompressionSettings, PutAndGetWithCompaction) {
+ const int kNumKeysPerFile = 1024;
+ const size_t kNumTimestamps = 2;
+ const size_t kNumKeysPerTimestamp = (kNumKeysPerFile - 1) / kNumTimestamps;
+ const size_t kSplitPosBase = kNumKeysPerTimestamp / 2;
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.env = env_;
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(kNumKeysPerFile));
+
+ FlushedFileCollector* collector = new FlushedFileCollector();
+ options.listeners.emplace_back(collector);
+
+ size_t ts_sz = Timestamp(0, 0).size();
+ TestComparator test_cmp(ts_sz);
+ options.comparator = &test_cmp;
+ BlockBasedTableOptions bbto;
+ bbto.filter_policy = std::get<0>(GetParam());
+ bbto.whole_key_filtering = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+
+ const CompressionType comp_type = std::get<1>(GetParam());
+#if LZ4_VERSION_NUMBER < 10400 // i.e. before LZ4 r124
+ if (comp_type == kLZ4Compression || comp_type == kLZ4HCCompression) {
+ return;
+ }
+#endif // LZ4_VERSION_NUMBER < 10400
+ if (!ZSTD_Supported() && comp_type == kZSTD) {
+ return;
+ }
+ if (!Zlib_Supported() && comp_type == kZlibCompression) {
+ return;
+ }
+
+ options.compression = comp_type;
+ options.compression_opts.max_dict_bytes = std::get<2>(GetParam());
+ if (comp_type == kZSTD) {
+ options.compression_opts.zstd_max_train_bytes = std::get<2>(GetParam());
+ }
+ options.compression_opts.parallel_threads = std::get<3>(GetParam());
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ size_t num_cfs = handles_.size();
+ ASSERT_EQ(2, num_cfs);
+ std::vector<std::string> write_ts_list;
+ std::vector<std::string> read_ts_list;
+
+ const auto& verify_records_func = [&](size_t i, size_t begin, size_t end,
+ ColumnFamilyHandle* cfh) {
+ std::string value;
+ std::string timestamp;
+
+ ReadOptions ropts;
+ const Slice read_ts = read_ts_list[i];
+ ropts.timestamp = &read_ts;
+ std::string expected_timestamp =
+ std::string(write_ts_list[i].data(), write_ts_list[i].size());
+
+ for (size_t j = begin; j <= end; ++j) {
+ ASSERT_OK(db_->Get(ropts, cfh, Key1(j), &value, &timestamp));
+ ASSERT_EQ("value_" + std::to_string(j) + "_" + std::to_string(i), value);
+ ASSERT_EQ(expected_timestamp, timestamp);
+ }
+ };
+
+ for (size_t i = 0; i != kNumTimestamps; ++i) {
+ write_ts_list.push_back(Timestamp(i * 2, 0));
+ read_ts_list.push_back(Timestamp(1 + i * 2, 0));
+ const Slice write_ts = write_ts_list.back();
+ WriteOptions wopts;
+ for (int cf = 0; cf != static_cast<int>(num_cfs); ++cf) {
+ size_t memtable_get_start = 0;
+ for (size_t j = 0; j != kNumKeysPerTimestamp; ++j) {
+ ASSERT_OK(
+ db_->Put(wopts, handles_[cf], Key1(j), write_ts,
+ "value_" + std::to_string(j) + "_" + std::to_string(i)));
+ if (j == kSplitPosBase + i || j == kNumKeysPerTimestamp - 1) {
+ verify_records_func(i, memtable_get_start, j, handles_[cf]);
+ memtable_get_start = j + 1;
+
+ // flush all keys with the same timestamp to two sst files, split at
+ // incremental positions such that lowerlevel[1].smallest.userkey ==
+ // higherlevel[0].largest.userkey
+ ASSERT_OK(Flush(cf));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact()); // wait for flush (which
+ // is also a compaction)
+
+ // Compact files (2 at each level) to a lower level such that all
+ // keys with the same timestamp are at one level, with newer versions
+ // at higher levels.
+ CompactionOptions compact_opt;
+ compact_opt.compression = kNoCompression;
+ ASSERT_OK(db_->CompactFiles(compact_opt, handles_[cf],
+ collector->GetFlushedFiles(),
+ static_cast<int>(kNumTimestamps - i)));
+ collector->ClearFlushedFiles();
+ }
+ }
+ }
+ }
+ const auto& verify_db_func = [&]() {
+ for (size_t i = 0; i != kNumTimestamps; ++i) {
+ ReadOptions ropts;
+ const Slice read_ts = read_ts_list[i];
+ ropts.timestamp = &read_ts;
+ std::string expected_timestamp(write_ts_list[i].data(),
+ write_ts_list[i].size());
+ for (int cf = 0; cf != static_cast<int>(num_cfs); ++cf) {
+ ColumnFamilyHandle* cfh = handles_[cf];
+ verify_records_func(i, 0, kNumKeysPerTimestamp - 1, cfh);
+ }
+ }
+ };
+ verify_db_func();
+ Close();
+}
+
+TEST_F(DBBasicTestWithTimestamp, BatchWriteAndMultiGet) {
+ const int kNumKeysPerFile = 8192;
+ const size_t kNumTimestamps = 2;
+ const size_t kNumKeysPerTimestamp = (kNumKeysPerFile - 1) / kNumTimestamps;
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.env = env_;
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(kNumKeysPerFile));
+ options.memtable_prefix_bloom_size_ratio = 0.1;
+ options.memtable_whole_key_filtering = true;
+
+ size_t ts_sz = Timestamp(0, 0).size();
+ TestComparator test_cmp(ts_sz);
+ options.comparator = &test_cmp;
+ BlockBasedTableOptions bbto;
+ bbto.filter_policy.reset(NewBloomFilterPolicy(
+ 10 /*bits_per_key*/, false /*use_block_based_builder*/));
+ bbto.whole_key_filtering = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+ size_t num_cfs = handles_.size();
+ ASSERT_EQ(2, num_cfs);
+ std::vector<std::string> write_ts_list;
+ std::vector<std::string> read_ts_list;
+
+ const auto& verify_records_func = [&](size_t i, ColumnFamilyHandle* cfh) {
+ std::vector<Slice> keys;
+ std::vector<std::string> key_vals;
+ std::vector<std::string> values;
+ std::vector<std::string> timestamps;
+
+ for (size_t j = 0; j != kNumKeysPerTimestamp; ++j) {
+ key_vals.push_back(Key1(j));
+ }
+ for (size_t j = 0; j != kNumKeysPerTimestamp; ++j) {
+ keys.push_back(key_vals[j]);
+ }
+
+ ReadOptions ropts;
+ const Slice read_ts = read_ts_list[i];
+ ropts.timestamp = &read_ts;
+ std::string expected_timestamp(write_ts_list[i].data(),
+ write_ts_list[i].size());
+
+ std::vector<ColumnFamilyHandle*> cfhs(keys.size(), cfh);
+ std::vector<Status> statuses =
+ db_->MultiGet(ropts, cfhs, keys, &values, &timestamps);
+ for (size_t j = 0; j != kNumKeysPerTimestamp; ++j) {
+ ASSERT_OK(statuses[j]);
+ ASSERT_EQ("value_" + std::to_string(j) + "_" + std::to_string(i),
+ values[j]);
+ ASSERT_EQ(expected_timestamp, timestamps[j]);
+ }
+ };
+
+ const std::string dummy_ts(ts_sz, '\0');
+ for (size_t i = 0; i != kNumTimestamps; ++i) {
+ write_ts_list.push_back(Timestamp(i * 2, 0));
+ read_ts_list.push_back(Timestamp(1 + i * 2, 0));
+ const Slice& write_ts = write_ts_list.back();
+ for (int cf = 0; cf != static_cast<int>(num_cfs); ++cf) {
+ WriteOptions wopts;
+ WriteBatch batch(0, 0, 0, ts_sz);
+ for (size_t j = 0; j != kNumKeysPerTimestamp; ++j) {
+ const std::string key = Key1(j);
+ const std::string value =
+ "value_" + std::to_string(j) + "_" + std::to_string(i);
+ ASSERT_OK(batch.Put(handles_[cf], key, value));
+ }
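+ // UpdateTimestamps stamps every key already in the batch with write_ts;
+ // the callback maps a column family ID to its timestamp size (always
+ // ts_sz in this test).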
+ ASSERT_OK(batch.UpdateTimestamps(write_ts,
+ [ts_sz](uint32_t) { return ts_sz; }));
+ ASSERT_OK(db_->Write(wopts, &batch));
+
+ verify_records_func(i, handles_[cf]);
+
+ ASSERT_OK(Flush(cf));
+ }
+ }
+
+ const auto& verify_db_func = [&]() {
+ for (size_t i = 0; i != kNumTimestamps; ++i) {
+ ReadOptions ropts;
+ const Slice read_ts = read_ts_list[i];
+ ropts.timestamp = &read_ts;
+ for (int cf = 0; cf != static_cast<int>(num_cfs); ++cf) {
+ ColumnFamilyHandle* cfh = handles_[cf];
+ verify_records_func(i, cfh);
+ }
+ }
+ };
+ verify_db_func();
+ Close();
+}
+
+TEST_F(DBBasicTestWithTimestamp, MultiGetNoReturnTs) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ DestroyAndReopen(options);
+ WriteOptions write_opts;
+ std::string ts = Timestamp(1, 0);
+ ASSERT_OK(db_->Put(write_opts, "foo", ts, "value"));
+ ASSERT_OK(db_->Put(write_opts, "bar", ts, "value"));
+ ASSERT_OK(db_->Put(write_opts, "fooxxxxxxxxxxxxxxxx", ts, "value"));
+ ASSERT_OK(db_->Put(write_opts, "barxxxxxxxxxxxxxxxx", ts, "value"));
+ ColumnFamilyHandle* cfh = dbfull()->DefaultColumnFamily();
+ ts = Timestamp(2, 0);
+ Slice read_ts = ts;
+ ReadOptions read_opts;
+ read_opts.timestamp = &read_ts;
+ {
+ ColumnFamilyHandle* column_families[] = {cfh, cfh};
+ Slice keys[] = {"foo", "bar"};
+ PinnableSlice values[] = {PinnableSlice(), PinnableSlice()};
+ Status statuses[] = {Status::OK(), Status::OK()};
+ dbfull()->MultiGet(read_opts, /*num_keys=*/2, &column_families[0], &keys[0],
+ &values[0], &statuses[0], /*sorted_input=*/false);
+ for (const auto& s : statuses) {
+ ASSERT_OK(s);
+ }
+ }
+ {
+ ColumnFamilyHandle* column_families[] = {cfh, cfh, cfh, cfh};
+ // Make user keys longer than configured timestamp size (16 bytes) to
+ // verify RocksDB does not use the trailing bytes 'x' as timestamp.
+ Slice keys[] = {"fooxxxxxxxxxxxxxxxx", "barxxxxxxxxxxxxxxxx", "foo", "bar"};
+ PinnableSlice values[] = {PinnableSlice(), PinnableSlice(), PinnableSlice(),
+ PinnableSlice()};
+ Status statuses[] = {Status::OK(), Status::OK(), Status::OK(),
+ Status::OK()};
+ dbfull()->MultiGet(read_opts, /*num_keys=*/4, &column_families[0], &keys[0],
+ &values[0], &statuses[0], /*sorted_input=*/false);
+ for (const auto& s : statuses) {
+ ASSERT_OK(s);
+ }
+ }
+ Close();
+}
+
+#endif // !ROCKSDB_LITE
+
+INSTANTIATE_TEST_CASE_P(
+ Timestamp, DBBasicTestWithTimestampCompressionSettings,
+ ::testing::Combine(
+ ::testing::Values(std::shared_ptr<const FilterPolicy>(nullptr),
+ std::shared_ptr<const FilterPolicy>(
+ NewBloomFilterPolicy(10, false))),
+ ::testing::Values(kNoCompression, kZlibCompression, kLZ4Compression,
+ kLZ4HCCompression, kZSTD),
+ ::testing::Values(0, 1 << 14), ::testing::Values(1, 4)));
+
+class DBBasicTestWithTimestampPrefixSeek
+ : public DBBasicTestWithTimestampBase,
+ public testing::WithParamInterface<
+ std::tuple<std::shared_ptr<const SliceTransform>,
+ std::shared_ptr<const FilterPolicy>, bool,
+ BlockBasedTableOptions::IndexType>> {
+ public:
+ DBBasicTestWithTimestampPrefixSeek()
+ : DBBasicTestWithTimestampBase(
+ "/db_basic_test_with_timestamp_prefix_seek") {}
+};
+
+TEST_P(DBBasicTestWithTimestampPrefixSeek, IterateWithPrefix) {
+ const size_t kNumKeysPerFile = 128;
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ options.prefix_extractor = std::get<0>(GetParam());
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(kNumKeysPerFile));
+ BlockBasedTableOptions bbto;
+ bbto.filter_policy = std::get<1>(GetParam());
+ bbto.index_type = std::get<3>(GetParam());
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ DestroyAndReopen(options);
+
+ const uint64_t kMaxKey = 0xffffffffffffffff;
+ const uint64_t kMinKey = 0xfffffffffffff000;
+ const std::vector<std::string> write_ts_list = {Timestamp(3, 0xffffffff),
+ Timestamp(6, 0xffffffff)};
+ WriteOptions write_opts;
+ {
+ for (size_t i = 0; i != write_ts_list.size(); ++i) {
+ for (uint64_t key = kMaxKey; key >= kMinKey; --key) {
+ Status s = db_->Put(write_opts, Key1(key), write_ts_list[i],
+ "value" + std::to_string(i));
+ ASSERT_OK(s);
+ }
+ }
+ }
+ const std::vector<std::string> read_ts_list = {Timestamp(5, 0xffffffff),
+ Timestamp(9, 0xffffffff)};
+ {
+ ReadOptions read_opts;
+ read_opts.total_order_seek = false;
+ read_opts.prefix_same_as_start = std::get<2>(GetParam());
+ fprintf(stdout, "%s %s %d\n", options.prefix_extractor->Name(),
+ bbto.filter_policy ? bbto.filter_policy->Name() : "null",
+ static_cast<int>(read_opts.prefix_same_as_start));
+ for (size_t i = 0; i != read_ts_list.size(); ++i) {
+ Slice read_ts = read_ts_list[i];
+ read_opts.timestamp = &read_ts;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts));
+
+ // Seek to kMaxKey
+ iter->Seek(Key1(kMaxKey));
+ CheckIterUserEntry(iter.get(), Key1(kMaxKey), kTypeValue,
+ "value" + std::to_string(i), write_ts_list[i]);
+ iter->Next();
+ ASSERT_FALSE(iter->Valid());
+
+ // Seek to kMinKey
+ iter->Seek(Key1(kMinKey));
+ CheckIterUserEntry(iter.get(), Key1(kMinKey), kTypeValue,
+ "value" + std::to_string(i), write_ts_list[i]);
+ iter->Prev();
+ ASSERT_FALSE(iter->Valid());
+ }
+ const std::vector<uint64_t> targets = {kMinKey, kMinKey + 0x10,
+ kMinKey + 0x100, kMaxKey};
+ const SliceTransform* const pe = options.prefix_extractor.get();
+ ASSERT_NE(nullptr, pe);
+ const size_t kPrefixShift =
+ 8 * (Key1(0).size() - pe->Transform(Key1(0)).size());
+ const uint64_t kPrefixMask =
+ ~((static_cast<uint64_t>(1) << kPrefixShift) - 1);
+ const uint64_t kNumKeysWithinPrefix =
+ (static_cast<uint64_t>(1) << kPrefixShift);
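+ // Worked example (illustrative): Key1() encodes a uint64_t into an 8-byte
+ // key, so with e.g. the 7-byte fixed prefix extractor kPrefixShift =
+ // 8 * (8 - 7) = 8, kPrefixMask = 0xffffffffffffff00, and
+ // kNumKeysWithinPrefix = 256, i.e. 256 consecutive keys share one prefix.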
+ for (size_t i = 0; i != read_ts_list.size(); ++i) {
+ Slice read_ts = read_ts_list[i];
+ read_opts.timestamp = &read_ts;
+ std::unique_ptr<Iterator> it(db_->NewIterator(read_opts));
+ // Forward and backward iterate.
+ for (size_t j = 0; j != targets.size(); ++j) {
+ std::string start_key = Key1(targets[j]);
+ uint64_t expected_ub =
+ (targets[j] & kPrefixMask) - 1 + kNumKeysWithinPrefix;
+ uint64_t expected_key = targets[j];
+ size_t count = 0;
+ it->Seek(Key1(targets[j]));
+ while (it->Valid()) {
+ std::string saved_prev_key;
+ saved_prev_key.assign(it->key().data(), it->key().size());
+
+ // Out of prefix
+ if (!read_opts.prefix_same_as_start &&
+ pe->Transform(saved_prev_key) != pe->Transform(start_key)) {
+ break;
+ }
+ CheckIterUserEntry(it.get(), Key1(expected_key), kTypeValue,
+ "value" + std::to_string(i), write_ts_list[i]);
+ ++count;
+ ++expected_key;
+ it->Next();
+ }
+ ASSERT_EQ(expected_ub - targets[j] + 1, count);
+
+ count = 0;
+ expected_key = targets[j];
+ it->SeekForPrev(start_key);
+ uint64_t expected_lb = (targets[j] & kPrefixMask);
+ while (it->Valid()) {
+ // Out of prefix
+ if (!read_opts.prefix_same_as_start &&
+ pe->Transform(it->key()) != pe->Transform(start_key)) {
+ break;
+ }
+ CheckIterUserEntry(it.get(), Key1(expected_key), kTypeValue,
+ "value" + std::to_string(i), write_ts_list[i]);
+ ++count;
+ --expected_key;
+ it->Prev();
+ }
+ ASSERT_EQ(targets[j] - std::max(expected_lb, kMinKey) + 1, count);
+ }
+ }
+ }
+ Close();
+}
+
+// TODO(yanqin): consider handling non-fixed-length prefix extractors, e.g.
+// NoopTransform.
+INSTANTIATE_TEST_CASE_P(
+ Timestamp, DBBasicTestWithTimestampPrefixSeek,
+ ::testing::Combine(
+ ::testing::Values(
+ std::shared_ptr<const SliceTransform>(NewFixedPrefixTransform(1)),
+ std::shared_ptr<const SliceTransform>(NewFixedPrefixTransform(4)),
+ std::shared_ptr<const SliceTransform>(NewFixedPrefixTransform(7)),
+ std::shared_ptr<const SliceTransform>(NewFixedPrefixTransform(8))),
+ ::testing::Values(std::shared_ptr<const FilterPolicy>(nullptr),
+ std::shared_ptr<const FilterPolicy>(
+ NewBloomFilterPolicy(10 /*bits_per_key*/, false)),
+ std::shared_ptr<const FilterPolicy>(
+ NewBloomFilterPolicy(20 /*bits_per_key*/,
+ false))),
+ ::testing::Bool(),
+ ::testing::Values(
+ BlockBasedTableOptions::IndexType::kBinarySearch,
+ BlockBasedTableOptions::IndexType::kHashSearch,
+ BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch,
+ BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey)));
+
+class DBBasicTestWithTsIterTombstones
+ : public DBBasicTestWithTimestampBase,
+ public testing::WithParamInterface<
+ std::tuple<std::shared_ptr<const SliceTransform>,
+ std::shared_ptr<const FilterPolicy>, int,
+ BlockBasedTableOptions::IndexType>> {
+ public:
+ DBBasicTestWithTsIterTombstones()
+ : DBBasicTestWithTimestampBase("/db_basic_ts_iter_tombstones") {}
+};
+
+TEST_P(DBBasicTestWithTsIterTombstones, IterWithDelete) {
+ constexpr size_t kNumKeysPerFile = 128;
+ Options options = CurrentOptions();
+ options.env = env_;
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ options.prefix_extractor = std::get<0>(GetParam());
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(kNumKeysPerFile));
+ BlockBasedTableOptions bbto;
+ bbto.filter_policy = std::get<1>(GetParam());
+ bbto.index_type = std::get<3>(GetParam());
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ options.num_levels = std::get<2>(GetParam());
+ DestroyAndReopen(options);
+ std::vector<std::string> write_ts_strs = {Timestamp(2, 0), Timestamp(4, 0)};
+ constexpr uint64_t kMaxKey = 0xffffffffffffffff;
+ constexpr uint64_t kMinKey = 0xfffffffffffff000;
+ // Insert kMinKey...kMaxKey
+ uint64_t key = kMinKey;
+ WriteOptions write_opts;
+ Slice ts = write_ts_strs[0];
+ do {
+ Status s = db_->Put(write_opts, Key1(key), write_ts_strs[0],
+ "value" + std::to_string(key));
+ ASSERT_OK(s);
+ if (kMaxKey == key) {
+ break;
+ }
+ ++key;
+ } while (true);
+
+ for (key = kMaxKey; key >= kMinKey; --key) {
+ Status s;
+ if (0 != (key % 2)) {
+ s = db_->Put(write_opts, Key1(key), write_ts_strs[1],
+ "value1" + std::to_string(key));
+ } else {
+ s = db_->Delete(write_opts, Key1(key), write_ts_strs[1]);
+ }
+ ASSERT_OK(s);
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ {
+ std::string read_ts = Timestamp(4, 0);
+ ts = read_ts;
+ ReadOptions read_opts;
+ read_opts.total_order_seek = true;
+ read_opts.timestamp = &ts;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts));
+ size_t count = 0;
+ key = kMinKey + 1;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next(), ++count, key += 2) {
+ ASSERT_EQ(Key1(key), iter->key());
+ ASSERT_EQ("value1" + std::to_string(key), iter->value());
+ }
+ ASSERT_EQ((kMaxKey - kMinKey + 1) / 2, count);
+
+ for (iter->SeekToLast(), count = 0, key = kMaxKey; iter->Valid();
+ key -= 2, ++count, iter->Prev()) {
+ ASSERT_EQ(Key1(key), iter->key());
+ ASSERT_EQ("value1" + std::to_string(key), iter->value());
+ }
+ ASSERT_EQ((kMaxKey - kMinKey + 1) / 2, count);
+ }
+ Close();
+}
+
+INSTANTIATE_TEST_CASE_P(
+ Timestamp, DBBasicTestWithTsIterTombstones,
+ ::testing::Combine(
+ ::testing::Values(
+ std::shared_ptr<const SliceTransform>(NewFixedPrefixTransform(7)),
+ std::shared_ptr<const SliceTransform>(NewFixedPrefixTransform(8))),
+ ::testing::Values(std::shared_ptr<const FilterPolicy>(nullptr),
+ std::shared_ptr<const FilterPolicy>(
+ NewBloomFilterPolicy(10, false)),
+ std::shared_ptr<const FilterPolicy>(
+ NewBloomFilterPolicy(20, false))),
+ ::testing::Values(2, 6),
+ ::testing::Values(
+ BlockBasedTableOptions::IndexType::kBinarySearch,
+ BlockBasedTableOptions::IndexType::kHashSearch,
+ BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch,
+ BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey)));
+#endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+
+class UpdateFullHistoryTsLowTest : public DBBasicTestWithTimestampBase {
+ public:
+ UpdateFullHistoryTsLowTest()
+ : DBBasicTestWithTimestampBase("/update_full_history_ts_low_test") {}
+};
+
+TEST_F(UpdateFullHistoryTsLowTest, ConcurrentUpdate) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ std::string lower_ts_low = Timestamp(10, 0);
+ std::string higher_ts_low = Timestamp(25, 0);
+ const size_t kTimestampSize = lower_ts_low.size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+
+ DestroyAndReopen(options);
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ // This workaround replaces `lower_ts_low`, originally passed in by the
+ // caller, with `higher_ts_low` after the caller's writer is queued, so
+ // that the caller always gets a TryAgain error.
+ // It mimics two threads updating full_history_ts_low concurrently, one
+ // writing a higher ts_low and the other writing a lower ts_low.
+ VersionEdit* version_edit;
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::IncreaseFullHistoryTsLowImpl:BeforeEdit",
+ [&](void* arg) { version_edit = reinterpret_cast<VersionEdit*>(arg); });
+ SyncPoint::GetInstance()->SetCallBack(
+ "VersionSet::LogAndApply:BeforeWriterWaiting",
+ [&](void* /*arg*/) { version_edit->SetFullHistoryTsLow(higher_ts_low); });
+ SyncPoint::GetInstance()->EnableProcessing();
+ ASSERT_TRUE(
+ db_->IncreaseFullHistoryTsLow(db_->DefaultColumnFamily(), lower_ts_low)
+ .IsTryAgain());
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+
+ Close();
+}
+
+TEST_F(DBBasicTestWithTimestamp,
+ GCPreserveRangeTombstoneWhenNoOrSmallFullHistoryLow) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ DestroyAndReopen(options);
+
+ std::string ts_str = Timestamp(1, 0);
+ WriteOptions wopts;
+ ASSERT_OK(db_->Put(wopts, "k1", ts_str, "v1"));
+ ASSERT_OK(db_->Put(wopts, "k2", ts_str, "v2"));
+ ASSERT_OK(db_->Put(wopts, "k3", ts_str, "v3"));
+ ts_str = Timestamp(2, 0);
+ ASSERT_OK(
+ db_->DeleteRange(wopts, db_->DefaultColumnFamily(), "k1", "k3", ts_str));
+
+ ts_str = Timestamp(3, 0);
+ Slice ts = ts_str;
+ ReadOptions ropts;
+ ropts.timestamp = &ts;
+ CompactRangeOptions cro;
+ cro.full_history_ts_low = nullptr;
+ std::string value, key_ts;
+ Status s;
+ auto verify = [&] {
+ s = db_->Get(ropts, "k1", &value);
+ ASSERT_TRUE(s.IsNotFound());
+
+ s = db_->Get(ropts, "k2", &value, &key_ts);
+ ASSERT_TRUE(s.IsNotFound());
+ ASSERT_EQ(key_ts, Timestamp(2, 0));
+
+ ASSERT_OK(db_->Get(ropts, "k3", &value, &key_ts));
+ ASSERT_EQ(value, "v3");
+ ASSERT_EQ(Timestamp(1, 0), key_ts);
+
+ size_t batch_size = 3;
+ std::vector<std::string> key_strs = {"k1", "k2", "k3"};
+ std::vector<Slice> keys{key_strs.begin(), key_strs.end()};
+ std::vector<PinnableSlice> values(batch_size);
+ std::vector<Status> statuses(batch_size);
+ db_->MultiGet(ropts, db_->DefaultColumnFamily(), batch_size, keys.data(),
+ values.data(), statuses.data(), true /* sorted_input */);
+ ASSERT_TRUE(statuses[0].IsNotFound());
+ ASSERT_TRUE(statuses[1].IsNotFound());
+ ASSERT_OK(statuses[2]);
+ ASSERT_EQ(values[2], "v3");
+ };
+ verify();
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ verify();
+ std::string lb = Timestamp(0, 0);
+ Slice lb_slice = lb;
+ cro.full_history_ts_low = &lb_slice;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ verify();
+ Close();
+}
+
+TEST_F(DBBasicTestWithTimestamp,
+ GCRangeTombstonesAndCoveredKeysRespectingTslow) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ BlockBasedTableOptions bbto;
+ bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
+ bbto.cache_index_and_filter_blocks = true;
+ bbto.whole_key_filtering = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ options.num_levels = 2;
+ DestroyAndReopen(options);
+
+ WriteOptions wopts;
+ ASSERT_OK(db_->Put(wopts, "k1", Timestamp(1, 0), "v1"));
+ ASSERT_OK(db_->Delete(wopts, "k2", Timestamp(2, 0)));
+ ASSERT_OK(db_->DeleteRange(wopts, db_->DefaultColumnFamily(), "k1", "k3",
+ Timestamp(3, 0)));
+ ASSERT_OK(db_->Put(wopts, "k3", Timestamp(4, 0), "v3"));
+
+ ReadOptions ropts;
+ std::string read_ts = Timestamp(5, 0);
+ Slice read_ts_slice = read_ts;
+ ropts.timestamp = &read_ts_slice;
+ size_t batch_size = 3;
+ std::vector<std::string> key_strs = {"k1", "k2", "k3"};
+ std::vector<Slice> keys = {key_strs.begin(), key_strs.end()};
+ std::vector<PinnableSlice> values(batch_size);
+ std::vector<Status> statuses(batch_size);
+ std::vector<std::string> timestamps(batch_size);
+ db_->MultiGet(ropts, db_->DefaultColumnFamily(), batch_size, keys.data(),
+ values.data(), timestamps.data(), statuses.data(),
+ true /* sorted_input */);
+ ASSERT_TRUE(statuses[0].IsNotFound());
+ ASSERT_EQ(timestamps[0], Timestamp(3, 0));
+ ASSERT_TRUE(statuses[1].IsNotFound());
+ // DeleteRange has a higher timestamp than Delete for "k2"
+ ASSERT_EQ(timestamps[1], Timestamp(3, 0));
+ ASSERT_OK(statuses[2]);
+ ASSERT_EQ(values[2], "v3");
+ ASSERT_EQ(timestamps[2], Timestamp(4, 0));
+
+ CompactRangeOptions cro;
+ // The range tombstone has a timestamp >= full_history_ts_low, so the keys
+ // it covers are not dropped.
+ std::string compaction_ts_str = Timestamp(2, 0);
+ Slice compaction_ts = compaction_ts_str;
+ cro.full_history_ts_low = &compaction_ts;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ ropts.timestamp = &compaction_ts;
+ std::string value, ts;
+ ASSERT_OK(db_->Get(ropts, "k1", &value, &ts));
+ ASSERT_EQ(value, "v1");
+ // k1's timestamp is below full_history_ts_low, so it is zeroed out as the
+ // key moves into the bottommost level.
+ ASSERT_EQ(ts, Timestamp(0, 0));
+ ASSERT_TRUE(db_->Get(ropts, "k2", &value, &ts).IsNotFound());
+ ASSERT_EQ(ts, Timestamp(2, 0));
+
+ compaction_ts_str = Timestamp(4, 0);
+ compaction_ts = compaction_ts_str;
+ cro.full_history_ts_low = &compaction_ts;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ ropts.timestamp = &read_ts_slice;
+ // k1, k2 and the range tombstone should be dropped
+ // k3 should still exist
+ db_->MultiGet(ropts, db_->DefaultColumnFamily(), batch_size, keys.data(),
+ values.data(), timestamps.data(), statuses.data(),
+ true /* sorted_input */);
+ ASSERT_TRUE(statuses[0].IsNotFound());
+ ASSERT_TRUE(timestamps[0].empty());
+ ASSERT_TRUE(statuses[1].IsNotFound());
+ ASSERT_TRUE(timestamps[1].empty());
+ ASSERT_OK(statuses[2]);
+ ASSERT_EQ(values[2], "v3");
+ ASSERT_EQ(timestamps[2], Timestamp(4, 0));
+
+ Close();
+}
+
+TEST_P(DBBasicTestWithTimestampTableOptions, DeleteRangeBasicReadAndIterate) {
+ const int kNum = 200, kRangeBegin = 50, kRangeEnd = 150, kNumPerFile = 25;
+ Options options = CurrentOptions();
+ options.prefix_extractor.reset(NewFixedPrefixTransform(3));
+ options.compression = kNoCompression;
+ BlockBasedTableOptions bbto;
+ bbto.index_type = GetParam();
+ bbto.block_size = 100;
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ options.env = env_;
+ options.create_if_missing = true;
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ options.memtable_factory.reset(test::NewSpecialSkipListFactory(kNumPerFile));
+ DestroyAndReopen(options);
+
+ // Write half of the keys before the tombstone and half after the tombstone.
+ // Only covered keys (i.e., within the range and older than the tombstone)
+ // should be deleted.
+ for (int i = 0; i < kNum; ++i) {
+ if (i == kNum / 2) {
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ Key1(kRangeBegin), Key1(kRangeEnd),
+ Timestamp(i, 0)));
+ }
+ ASSERT_OK(db_->Put(WriteOptions(), Key1(i), Timestamp(i, 0),
+ "val" + std::to_string(i)));
+ if (i == kNum - kNumPerFile) {
+ ASSERT_OK(Flush());
+ }
+ }
+
+ ReadOptions read_opts;
+ read_opts.total_order_seek = true;
+ std::string read_ts = Timestamp(kNum, 0);
+ Slice read_ts_slice = read_ts;
+ read_opts.timestamp = &read_ts_slice;
+ {
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts));
+ ASSERT_OK(iter->status());
+
+ int expected = 0;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ ASSERT_EQ(Key1(expected), iter->key());
+ if (expected == kRangeBegin - 1) {
+ expected = kNum / 2;
+ } else {
+ ++expected;
+ }
+ }
+ ASSERT_EQ(kNum, expected);
+
+ expected = kNum / 2;
+ for (iter->Seek(Key1(kNum / 2)); iter->Valid(); iter->Next()) {
+ ASSERT_EQ(Key1(expected), iter->key());
+ ++expected;
+ }
+ ASSERT_EQ(kNum, expected);
+
+ expected = kRangeBegin - 1;
+ for (iter->SeekForPrev(Key1(kNum / 2 - 1)); iter->Valid(); iter->Prev()) {
+ ASSERT_EQ(Key1(expected), iter->key());
+ --expected;
+ }
+ ASSERT_EQ(-1, expected);
+
+ read_ts = Timestamp(0, 0);
+ read_ts_slice = read_ts;
+ read_opts.timestamp = &read_ts_slice;
+ iter.reset(db_->NewIterator(read_opts));
+ iter->SeekToFirst();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key(), Key1(0));
+ iter->Next();
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_OK(iter->status());
+ }
+
+ read_ts = Timestamp(kNum, 0);
+ read_ts_slice = read_ts;
+ read_opts.timestamp = &read_ts_slice;
+ std::string value, timestamp;
+ Status s;
+ for (int i = 0; i < kNum; ++i) {
+ s = db_->Get(read_opts, Key1(i), &value, &timestamp);
+ if (i >= kRangeBegin && i < kNum / 2) {
+ ASSERT_TRUE(s.IsNotFound());
+ ASSERT_EQ(timestamp, Timestamp(kNum / 2, 0));
+ } else {
+ ASSERT_OK(s);
+ ASSERT_EQ(value, "val" + std::to_string(i));
+ ASSERT_EQ(timestamp, Timestamp(i, 0));
+ }
+ }
+
+ size_t batch_size = kNum;
+ std::vector<std::string> key_strs(batch_size);
+ std::vector<Slice> keys(batch_size);
+ std::vector<PinnableSlice> values(batch_size);
+ std::vector<Status> statuses(batch_size);
+ std::vector<std::string> timestamps(batch_size);
+ for (int i = 0; i < kNum; ++i) {
+ key_strs[i] = Key1(i);
+ keys[i] = key_strs[i];
+ }
+ db_->MultiGet(read_opts, db_->DefaultColumnFamily(), batch_size, keys.data(),
+ values.data(), timestamps.data(), statuses.data(),
+ true /* sorted_input */);
+ for (int i = 0; i < kNum; ++i) {
+ if (i >= kRangeBegin && i < kNum / 2) {
+ ASSERT_TRUE(statuses[i].IsNotFound());
+ ASSERT_EQ(timestamps[i], Timestamp(kNum / 2, 0));
+ } else {
+ ASSERT_OK(statuses[i]);
+ ASSERT_EQ(values[i], "val" + std::to_string(i));
+ ASSERT_EQ(timestamps[i], Timestamp(i, 0));
+ }
+ }
+ Close();
+}
+
+TEST_F(DBBasicTestWithTimestamp, DeleteRangeGetIteratorWithSnapshot) {
+ // 4 keys 0, 1, 2, 3 at timestamps 0, 1, 2, 3 respectively.
+ // A range tombstone over [1, 3) is written at timestamp 1, with a sequence
+ // number between those of key 1 and key 2.
+ Options options = CurrentOptions();
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ DestroyAndReopen(options);
+ WriteOptions write_opts;
+ std::string put_ts = Timestamp(0, 0);
+ const int kNum = 4, kNumPerFile = 1, kRangeBegin = 1, kRangeEnd = 3;
+ options.memtable_factory.reset(test::NewSpecialSkipListFactory(kNumPerFile));
+ const Snapshot* before_tombstone = nullptr;
+ const Snapshot* after_tombstone = nullptr;
+ for (int i = 0; i < kNum; ++i) {
+ ASSERT_OK(db_->Put(WriteOptions(), Key1(i), Timestamp(i, 0),
+ "val" + std::to_string(i)));
+ if (i == kRangeBegin) {
+ before_tombstone = db_->GetSnapshot();
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ Key1(kRangeBegin), Key1(kRangeEnd),
+ Timestamp(kRangeBegin, 0)));
+ }
+ if (i == kNum / 2) {
+ ASSERT_OK(Flush());
+ }
+ }
+ assert(before_tombstone);
+ after_tombstone = db_->GetSnapshot();
+ // snapshot and ts before tombstone
+ std::string read_ts_str = Timestamp(kRangeBegin - 1, 0); // (0, 0)
+ Slice read_ts = read_ts_str;
+ ReadOptions read_opts;
+ read_opts.timestamp = &read_ts;
+ read_opts.snapshot = before_tombstone;
+ std::vector<Status> expected_status = {
+ Status::OK(), Status::NotFound(), Status::NotFound(), Status::NotFound()};
+ std::vector<std::string> expected_values(kNum);
+ expected_values[0] = "val" + std::to_string(0);
+ std::vector<std::string> expected_timestamps(kNum);
+ expected_timestamps[0] = Timestamp(0, 0);
+
+ size_t batch_size = kNum;
+ std::vector<std::string> key_strs(batch_size);
+ std::vector<Slice> keys(batch_size);
+ std::vector<PinnableSlice> values(batch_size);
+ std::vector<Status> statuses(batch_size);
+ std::vector<std::string> timestamps(batch_size);
+ for (int i = 0; i < kNum; ++i) {
+ key_strs[i] = Key1(i);
+ keys[i] = key_strs[i];
+ }
+
+ auto verify = [&] {
+ db_->MultiGet(read_opts, db_->DefaultColumnFamily(), batch_size,
+ keys.data(), values.data(), timestamps.data(),
+ statuses.data(), true /* sorted_input */);
+ std::string value, timestamp;
+ Status s;
+ for (int i = 0; i < kNum; ++i) {
+ s = db_->Get(read_opts, Key1(i), &value, &timestamp);
+ ASSERT_EQ(s, expected_status[i]);
+ ASSERT_EQ(statuses[i], expected_status[i]);
+ if (s.ok()) {
+ ASSERT_EQ(value, expected_values[i]);
+ ASSERT_EQ(values[i], expected_values[i]);
+ }
+ if (!timestamp.empty()) {
+ ASSERT_EQ(timestamp, expected_timestamps[i]);
+ ASSERT_EQ(timestamps[i], expected_timestamps[i]);
+ } else {
+ ASSERT_TRUE(timestamps[i].empty());
+ }
+ }
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts));
+ std::unique_ptr<Iterator> iter_for_seek(db_->NewIterator(read_opts));
+ iter->SeekToFirst();
+ for (int i = 0; i < kNum; ++i) {
+ if (expected_status[i].ok()) {
+ auto verify_iter = [&](Iterator* iter_ptr) {
+ ASSERT_TRUE(iter_ptr->Valid());
+ ASSERT_EQ(iter_ptr->key(), keys[i]);
+ ASSERT_EQ(iter_ptr->value(), expected_values[i]);
+ ASSERT_EQ(iter_ptr->timestamp(), expected_timestamps[i]);
+ };
+ verify_iter(iter.get());
+ iter->Next();
+
+ iter_for_seek->Seek(keys[i]);
+ verify_iter(iter_for_seek.get());
+
+ iter_for_seek->SeekForPrev(keys[i]);
+ verify_iter(iter_for_seek.get());
+ }
+ }
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_OK(iter->status());
+ };
+
+ verify();
+
+ // snapshot before tombstone and ts after tombstone
+ read_ts_str = Timestamp(kNum, 0); // (4, 0)
+ read_ts = read_ts_str;
+ read_opts.timestamp = &read_ts;
+ read_opts.snapshot = before_tombstone;
+ expected_status[1] = Status::OK();
+ expected_timestamps[1] = Timestamp(1, 0);
+ expected_values[1] = "val" + std::to_string(1);
+ verify();
+
+ // snapshot after tombstone and ts before tombstone
+ read_ts_str = Timestamp(kRangeBegin - 1, 0); // (0, 0)
+ read_ts = read_ts_str;
+ read_opts.timestamp = &read_ts;
+ read_opts.snapshot = after_tombstone;
+ expected_status[1] = Status::NotFound();
+ expected_timestamps[1].clear();
+ expected_values[1].clear();
+ verify();
+
+ // snapshot and ts after tombstone
+ read_ts_str = Timestamp(kNum, 0); // (4, 0)
+ read_ts = read_ts_str;
+ read_opts.timestamp = &read_ts;
+ read_opts.snapshot = after_tombstone;
+ for (int i = 0; i < kNum; ++i) {
+ if (i == kRangeBegin) {
+ expected_status[i] = Status::NotFound();
+ expected_values[i].clear();
+ } else {
+ expected_status[i] = Status::OK();
+ expected_values[i] = "val" + std::to_string(i);
+ }
+ expected_timestamps[i] = Timestamp(i, 0);
+ }
+ verify();
+
+ db_->ReleaseSnapshot(before_tombstone);
+ db_->ReleaseSnapshot(after_tombstone);
+ Close();
+}
+
+TEST_F(DBBasicTestWithTimestamp, MergeBasic) {
+ Options options = GetDefaultOptions();
+ options.create_if_missing = true;
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ options.merge_operator = std::make_shared<StringAppendTESTOperator>('.');
+ DestroyAndReopen(options);
+
+ const std::array<std::string, 3> write_ts_strs = {
+ Timestamp(100, 0), Timestamp(200, 0), Timestamp(300, 0)};
+ constexpr size_t kNumOfUniqKeys = 100;
+ ColumnFamilyHandle* default_cf = db_->DefaultColumnFamily();
+
+ for (size_t i = 0; i < write_ts_strs.size(); ++i) {
+ for (size_t j = 0; j < kNumOfUniqKeys; ++j) {
+ Status s;
+ if (i == 0) {
+ const std::string val = "v" + std::to_string(j) + "_0";
+ s = db_->Put(WriteOptions(), Key1(j), write_ts_strs[i], val);
+ } else {
+ const std::string merge_op = std::to_string(i);
+ s = db_->Merge(WriteOptions(), default_cf, Key1(j), write_ts_strs[i],
+ merge_op);
+ }
+ ASSERT_OK(s);
+ }
+ }
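+ // With the '.' delimiter, key 0 (for example) carries "v0_0" as of ts=100,
+ // "v0_0.1" as of ts=200 and "v0_0.1.2" as of ts=300, which is what the
+ // verification helpers below check at read timestamps 150, 250 and 350.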
+
+ std::array<std::string, 3> read_ts_strs = {
+ Timestamp(150, 0), Timestamp(250, 0), Timestamp(350, 0)};
+
+ const auto verify_db_with_get = [&]() {
+ for (size_t i = 0; i < kNumOfUniqKeys; ++i) {
+ const std::string base_val = "v" + std::to_string(i) + "_0";
+ const std::array<std::string, 3> expected_values = {
+ base_val, base_val + ".1", base_val + ".1.2"};
+ const std::array<std::string, 3>& expected_ts = write_ts_strs;
+ ReadOptions read_opts;
+ for (size_t j = 0; j < read_ts_strs.size(); ++j) {
+ Slice read_ts = read_ts_strs[j];
+ read_opts.timestamp = &read_ts;
+ std::string value;
+ std::string ts;
+ const Status s = db_->Get(read_opts, Key1(i), &value, &ts);
+ ASSERT_OK(s);
+ ASSERT_EQ(expected_values[j], value);
+ ASSERT_EQ(expected_ts[j], ts);
+
+ // Do Seek/SeekForPrev
+ std::unique_ptr<Iterator> it(db_->NewIterator(read_opts));
+ it->Seek(Key1(i));
+ ASSERT_TRUE(it->Valid());
+ ASSERT_EQ(expected_values[j], it->value());
+ ASSERT_EQ(expected_ts[j], it->timestamp());
+
+ it->SeekForPrev(Key1(i));
+ ASSERT_TRUE(it->Valid());
+ ASSERT_EQ(expected_values[j], it->value());
+ ASSERT_EQ(expected_ts[j], it->timestamp());
+ }
+ }
+ };
+
+ const auto verify_db_with_iterator = [&]() {
+ std::string value_suffix;
+ for (size_t i = 0; i < read_ts_strs.size(); ++i) {
+ ReadOptions read_opts;
+ Slice read_ts = read_ts_strs[i];
+ read_opts.timestamp = &read_ts;
+ std::unique_ptr<Iterator> it(db_->NewIterator(read_opts));
+ size_t key_int_val = 0;
+ for (it->SeekToFirst(); it->Valid(); it->Next(), ++key_int_val) {
+ const std::string key = Key1(key_int_val);
+ const std::string value =
+ "v" + std::to_string(key_int_val) + "_0" + value_suffix;
+ ASSERT_EQ(key, it->key());
+ ASSERT_EQ(value, it->value());
+ ASSERT_EQ(write_ts_strs[i], it->timestamp());
+ }
+ ASSERT_EQ(kNumOfUniqKeys, key_int_val);
+
+ key_int_val = kNumOfUniqKeys - 1;
+ for (it->SeekToLast(); it->Valid(); it->Prev(), --key_int_val) {
+ const std::string key = Key1(key_int_val);
+ const std::string value =
+ "v" + std::to_string(key_int_val) + "_0" + value_suffix;
+ ASSERT_EQ(key, it->key());
+ ASSERT_EQ(value, it->value());
+ ASSERT_EQ(write_ts_strs[i], it->timestamp());
+ }
+ ASSERT_EQ(std::numeric_limits<size_t>::max(), key_int_val);
+
+ value_suffix = value_suffix + "." + std::to_string(i + 1);
+ }
+ };
+
+ verify_db_with_get();
+ verify_db_with_iterator();
+
+ ASSERT_OK(db_->Flush(FlushOptions()));
+
+ verify_db_with_get();
+ verify_db_with_iterator();
+
+ Close();
+}
+
+TEST_F(DBBasicTestWithTimestamp, MergeAfterDeletion) {
+ Options options = GetDefaultOptions();
+ options.create_if_missing = true;
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ options.merge_operator = std::make_shared<StringAppendTESTOperator>('.');
+ DestroyAndReopen(options);
+
+ ColumnFamilyHandle* const column_family = db_->DefaultColumnFamily();
+
+ const size_t num_keys_per_file = 10;
+ const size_t num_merges_per_key = 2;
+ for (size_t i = 0; i < num_keys_per_file; ++i) {
+ std::string ts = Timestamp(i + 10000, 0);
+ Status s = db_->Delete(WriteOptions(), Key1(i), ts);
+ ASSERT_OK(s);
+ for (size_t j = 1; j <= num_merges_per_key; ++j) {
+ ts = Timestamp(i + 10000 + j, 0);
+ s = db_->Merge(WriteOptions(), column_family, Key1(i), ts,
+ std::to_string(j));
+ ASSERT_OK(s);
+ }
+ }
+
+ const auto verify_db = [&]() {
+ ReadOptions read_opts;
+ std::string read_ts_str = Timestamp(20000, 0);
+ Slice ts = read_ts_str;
+ read_opts.timestamp = &ts;
+ std::unique_ptr<Iterator> it(db_->NewIterator(read_opts));
+ size_t count = 0;
+ for (it->SeekToFirst(); it->Valid(); it->Next(), ++count) {
+ std::string key = Key1(count);
+ ASSERT_EQ(key, it->key());
+ std::string value;
+ for (size_t j = 1; j <= num_merges_per_key; ++j) {
+ value.append(std::to_string(j));
+ if (j < num_merges_per_key) {
+ value.push_back('.');
+ }
+ }
+ ASSERT_EQ(value, it->value());
+ std::string ts1 = Timestamp(count + 10000 + num_merges_per_key, 0);
+ ASSERT_EQ(ts1, it->timestamp());
+ }
+ ASSERT_OK(it->status());
+ ASSERT_EQ(num_keys_per_file, count);
+ for (it->SeekToLast(); it->Valid(); it->Prev(), --count) {
+ std::string key = Key1(count - 1);
+ ASSERT_EQ(key, it->key());
+ std::string value;
+ for (size_t j = 1; j <= num_merges_per_key; ++j) {
+ value.append(std::to_string(j));
+ if (j < num_merges_per_key) {
+ value.push_back('.');
+ }
+ }
+ ASSERT_EQ(value, it->value());
+ std::string ts1 = Timestamp(count - 1 + 10000 + num_merges_per_key, 0);
+ ASSERT_EQ(ts1, it->timestamp());
+ }
+ ASSERT_OK(it->status());
+ ASSERT_EQ(0, count);
+ };
+
+ verify_db();
+
+ Close();
+}
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ RegisterCustomObjects(argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_with_timestamp_compaction_test.cc b/src/rocksdb/db/db_with_timestamp_compaction_test.cc
new file mode 100644
index 000000000..d28f67e05
--- /dev/null
+++ b/src/rocksdb/db/db_with_timestamp_compaction_test.cc
@@ -0,0 +1,334 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/compaction/compaction.h"
+#include "db/db_test_util.h"
+#include "port/stack_trace.h"
+#include "test_util/testutil.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
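+// Key1() reverses the little-endian PutFixed64() encoding into a big-endian
+// byte string so that keys compare in numeric order under the bytewise user
+// comparator.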
+std::string Key1(uint64_t key) {
+ std::string ret;
+ PutFixed64(&ret, key);
+ std::reverse(ret.begin(), ret.end());
+ return ret;
+}
+
+std::string Timestamp(uint64_t ts) {
+ std::string ret;
+ PutFixed64(&ret, ts);
+ return ret;
+}
+} // anonymous namespace
+
+class TimestampCompatibleCompactionTest : public DBTestBase {
+ public:
+ TimestampCompatibleCompactionTest()
+ : DBTestBase("ts_compatible_compaction_test", /*env_do_fsync=*/true) {}
+
+ std::string Get(const std::string& key, uint64_t ts) {
+ ReadOptions read_opts;
+ std::string ts_str = Timestamp(ts);
+ Slice ts_slice = ts_str;
+ read_opts.timestamp = &ts_slice;
+ std::string value;
+ Status s = db_->Get(read_opts, key, &value);
+ if (s.IsNotFound()) {
+ value.assign("NOT_FOUND");
+ } else if (!s.ok()) {
+ value.assign(s.ToString());
+ }
+ return value;
+ }
+};
+
+TEST_F(TimestampCompatibleCompactionTest, UserKeyCrossFileBoundary) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.compaction_style = kCompactionStyleLevel;
+ options.comparator = test::BytewiseComparatorWithU64TsWrapper();
+ options.level0_file_num_compaction_trigger = 3;
+ constexpr size_t kNumKeysPerFile = 101;
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(kNumKeysPerFile));
+ DestroyAndReopen(options);
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->SetCallBack(
+ "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
+ const auto* compaction = reinterpret_cast<Compaction*>(arg);
+ ASSERT_NE(nullptr, compaction);
+ ASSERT_EQ(0, compaction->start_level());
+ ASSERT_EQ(1, compaction->num_input_levels());
+ // Check that all 3 L0 ssts are picked for level compaction.
+ ASSERT_EQ(3, compaction->num_input_files(0));
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ // Write an L0 file with keys 0, 1, ..., 99 and ts from 100 to 199.
+ uint64_t ts = 100;
+ uint64_t key = 0;
+ WriteOptions write_opts;
+ for (; key < kNumKeysPerFile - 1; ++key, ++ts) {
+ std::string ts_str = Timestamp(ts);
+ ASSERT_OK(
+ db_->Put(write_opts, Key1(key), ts_str, "foo_" + std::to_string(key)));
+ }
+ // Write another L0 file with key 99 at newer timestamps.
+ ASSERT_OK(Flush());
+ uint64_t saved_read_ts1 = ts++;
+ key = 99;
+ for (int i = 0; i < 4; ++i, ++ts) {
+ std::string ts_str = Timestamp(ts);
+ ASSERT_OK(
+ db_->Put(write_opts, Key1(key), ts_str, "bar_" + std::to_string(key)));
+ }
+ ASSERT_OK(Flush());
+ uint64_t saved_read_ts2 = ts++;
+ // Write another L0 with keys 99, 100, 101, ..., 150
+ for (; key <= 150; ++key, ++ts) {
+ std::string ts_str = Timestamp(ts);
+ ASSERT_OK(
+ db_->Put(write_opts, Key1(key), ts_str, "foo1_" + std::to_string(key)));
+ }
+ ASSERT_OK(Flush());
+ // Wait for compaction to finish
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ uint64_t read_ts = ts;
+ ASSERT_EQ("foo_99", Get(Key1(99), saved_read_ts1));
+ ASSERT_EQ("bar_99", Get(Key1(99), saved_read_ts2));
+ ASSERT_EQ("foo1_99", Get(Key1(99), read_ts));
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(TimestampCompatibleCompactionTest, MultipleSubCompactions) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.compaction_style = kCompactionStyleUniversal;
+ options.comparator = test::BytewiseComparatorWithU64TsWrapper();
+ options.level0_file_num_compaction_trigger = 3;
+ options.max_subcompactions = 3;
+ options.target_file_size_base = 1024;
+ options.statistics = CreateDBStatistics();
+ DestroyAndReopen(options);
+
+ uint64_t ts = 100;
+ uint64_t key = 0;
+ WriteOptions write_opts;
+
+ // Write keys 0, 1, ..., 499 with ts from 100 to 599.
+ {
+ for (; key <= 499; ++key, ++ts) {
+ std::string ts_str = Timestamp(ts);
+ ASSERT_OK(db_->Put(write_opts, Key1(key), ts_str,
+ "foo_" + std::to_string(key)));
+ }
+ }
+
+ // Write keys 500, ..., 999 with ts from 600 to 1099.
+ {
+ for (; key <= 999; ++key, ++ts) {
+ std::string ts_str = Timestamp(ts);
+ ASSERT_OK(db_->Put(write_opts, Key1(key), ts_str,
+ "foo_" + std::to_string(key)));
+ }
+ ASSERT_OK(Flush());
+ }
+
+ // Run a manual L0->L1 compaction and wait for it to finish.
+ {
+ ASSERT_OK(dbfull()->RunManualCompaction(
+ static_cast_with_check<ColumnFamilyHandleImpl>(
+ db_->DefaultColumnFamily())
+ ->cfd(),
+ 0 /* input_level */, 1 /* output_level */, CompactRangeOptions(),
+ nullptr /* begin */, nullptr /* end */, true /* exclusive */,
+ true /* disallow_trivial_move */,
+ std::numeric_limits<uint64_t>::max() /* max_file_num_to_ignore */,
+ "" /*trim_ts*/));
+ }
+
+ // Check stats to make sure multiple subcompactions were scheduled, which
+ // can only happen when the subcompaction boundaries are not nullptr.
+ {
+ HistogramData num_sub_compactions;
+ options.statistics->histogramData(NUM_SUBCOMPACTIONS_SCHEDULED,
+ &num_sub_compactions);
+ ASSERT_GT(num_sub_compactions.sum, 1);
+ }
+
+ for (key = 0; key <= 999; ++key) {
+ ASSERT_EQ("foo_" + std::to_string(key), Get(Key1(key), ts));
+ }
+}
+
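+// A partitioner that requests a new output file for every key, so each
+// compaction output file ends up holding a single (user key, timestamp)
+// entry, and trivial moves are disabled.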
+class TestFilePartitioner : public SstPartitioner {
+ public:
+ explicit TestFilePartitioner() {}
+ ~TestFilePartitioner() override {}
+
+ const char* Name() const override { return "TestFilePartitioner"; }
+ PartitionerResult ShouldPartition(
+ const PartitionerRequest& /*request*/) override {
+ return PartitionerResult::kRequired;
+ }
+ bool CanDoTrivialMove(const Slice& /*smallest_user_key*/,
+ const Slice& /*largest_user_key*/) override {
+ return false;
+ }
+};
+
+class TestFilePartitionerFactory : public SstPartitionerFactory {
+ public:
+ explicit TestFilePartitionerFactory() {}
+ std::unique_ptr<SstPartitioner> CreatePartitioner(
+ const SstPartitioner::Context& /*context*/) const override {
+ std::unique_ptr<SstPartitioner> ret =
+ std::make_unique<TestFilePartitioner>();
+ return ret;
+ }
+ const char* Name() const override { return "TestFilePartitionerFactory"; }
+};
+
+#ifndef ROCKSDB_LITE
+TEST_F(TimestampCompatibleCompactionTest, CompactFilesRangeCheckL0) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.sst_partitioner_factory =
+ std::make_shared<TestFilePartitionerFactory>();
+ options.comparator = test::BytewiseComparatorWithU64TsWrapper();
+ options.disable_auto_compactions = true;
+ DestroyAndReopen(options);
+
+ constexpr int kNumFiles = 10;
+ constexpr int kKeysPerFile = 2;
+ const std::string user_key = "foo";
+ constexpr uint64_t start_ts = 10000;
+
+ uint64_t cur_ts = start_ts;
+ for (int k = 0; k < kNumFiles; ++k) {
+ for (int i = 0; i < kKeysPerFile; ++i) {
+ ASSERT_OK(db_->Put(WriteOptions(), user_key, Timestamp(cur_ts),
+ "v" + std::to_string(i)));
+ ++cur_ts;
+ }
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ }
+
+ std::vector<std::string> input_files{};
+ {
+ std::vector<std::string> files;
+ ASSERT_OK(env_->GetChildren(dbname_, &files));
+ for (const auto& f : files) {
+ uint64_t file_num = 0;
+ FileType file_type = FileType::kWalFile;
+ if (!ParseFileName(f, &file_num, &file_type) ||
+ file_type != FileType::kTableFile) {
+ continue;
+ }
+ input_files.emplace_back(f);
+ }
+ // Sorting by file name here also happens to sort the files by generation
+ // order.
+ std::sort(input_files.begin(), input_files.end());
+ assert(kNumFiles == input_files.size());
+ std::vector<std::string> tmp;
+ tmp.emplace_back(input_files[input_files.size() / 2]);
+ input_files.swap(tmp);
+ }
+
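+ // Pass only the middle L0 file to CompactFiles; the expectation is that all
+ // older L0 files get pulled into the compaction as well.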
+ {
+ std::vector<std::string> output_file_names;
+ CompactionJobInfo compaction_job_info;
+ ASSERT_OK(db_->CompactFiles(CompactionOptions(), input_files,
+ /*output_level=*/1, /*output_path_id=*/-1,
+ &output_file_names, &compaction_job_info));
+ // We expect that all L0 files older than the provided input file were
+ // included in the compaction.
+ ASSERT_EQ(static_cast<size_t>(kNumFiles / 2 + 1),
+ compaction_job_info.input_files.size());
+ }
+}
+
+TEST_F(TimestampCompatibleCompactionTest, CompactFilesRangeCheckL1) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.sst_partitioner_factory =
+ std::make_shared<TestFilePartitionerFactory>();
+ options.comparator = test::BytewiseComparatorWithU64TsWrapper();
+
+ constexpr int kNumFiles = 4;
+ options.level0_file_num_compaction_trigger = kNumFiles;
+
+ DestroyAndReopen(options);
+
+ constexpr int kKeysPerFile = 2;
+ const std::string user_key = "foo";
+ constexpr uint64_t start_ts = 10000;
+
+ uint64_t cur_ts = start_ts;
+ // Generate some initial L0 files; auto compaction then moves them to L1.
+ for (int k = 0; k < kNumFiles; ++k) {
+ for (int i = 0; i < kKeysPerFile; ++i) {
+ ASSERT_OK(db_->Put(WriteOptions(), user_key, Timestamp(cur_ts),
+ "v" + std::to_string(i)));
+ ++cur_ts;
+ }
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ ASSERT_EQ(0, NumTableFilesAtLevel(/*level=*/0, /*cf=*/0));
+ ASSERT_EQ(kNumFiles * kKeysPerFile,
+ NumTableFilesAtLevel(/*level=*/1, /*cf=*/0));
+
+ constexpr int additional_l0s = 2;
+ for (int i = 0; i < additional_l0s; ++i, ++cur_ts) {
+ ASSERT_OK(db_->Put(WriteOptions(), user_key, Timestamp(cur_ts), "v"));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ }
+ ASSERT_EQ(additional_l0s, NumTableFilesAtLevel(/*level=*/0, /*cf=*/0));
+
+ std::vector<std::string> inputs;
+ {
+ std::vector<LiveFileMetaData> fmetas;
+ db_->GetLiveFilesMetaData(&fmetas);
+ bool included_one_l1 = false;
+ for (const auto& meta : fmetas) {
+ if (meta.level == 0) {
+ inputs.emplace_back(meta.relative_filename);
+ } else if (!included_one_l1) {
+ inputs.emplace_back(meta.relative_filename);
+ included_one_l1 = true;
+ }
+ }
+ }
+ ASSERT_EQ(static_cast<size_t>(3), inputs.size());
+ {
+ std::vector<std::string> output_file_names;
+ CompactionJobInfo compaction_job_info;
+
+ ASSERT_OK(db_->CompactFiles(CompactionOptions(), inputs, /*output_level=*/1,
+ /*output_path_id=*/-1, &output_file_names,
+ &compaction_job_info));
+ ASSERT_EQ(kNumFiles * kKeysPerFile + 2, output_file_names.size());
+ ASSERT_EQ(kNumFiles * kKeysPerFile + 2,
+ static_cast<int>(compaction_job_info.input_files.size()));
+ }
+}
+#endif // !ROCKSDB_LITE
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_with_timestamp_test_util.cc b/src/rocksdb/db/db_with_timestamp_test_util.cc
new file mode 100644
index 000000000..f562bcb48
--- /dev/null
+++ b/src/rocksdb/db/db_with_timestamp_test_util.cc
@@ -0,0 +1,96 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/db_with_timestamp_test_util.h"
+
+namespace ROCKSDB_NAMESPACE {
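+// Encodes k with PutFixed64 (little-endian) and reverses the bytes, so the
+// resulting keys sort in ascending numeric order under a bytewise comparator.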
+std::string DBBasicTestWithTimestampBase::Key1(uint64_t k) {
+ std::string ret;
+ PutFixed64(&ret, k);
+ std::reverse(ret.begin(), ret.end());
+ return ret;
+}
+
+std::string DBBasicTestWithTimestampBase::KeyWithPrefix(std::string prefix,
+ uint64_t k) {
+ std::string ret;
+ PutFixed64(&ret, k);
+ std::reverse(ret.begin(), ret.end());
+ return prefix + ret;
+}
+
+std::vector<Slice> DBBasicTestWithTimestampBase::ConvertStrToSlice(
+ std::vector<std::string>& strings) {
+ std::vector<Slice> ret;
+ for (const auto& s : strings) {
+ ret.emplace_back(s);
+ }
+ return ret;
+}
+
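+// Builds a 16-byte timestamp: the low 64 bits followed by the high 64 bits,
+// each encoded with PutFixed64.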
+std::string DBBasicTestWithTimestampBase::Timestamp(uint64_t low,
+ uint64_t high) {
+ std::string ts;
+ PutFixed64(&ts, low);
+ PutFixed64(&ts, high);
+ return ts;
+}
+
+void DBBasicTestWithTimestampBase::CheckIterUserEntry(
+ const Iterator* it, const Slice& expected_key,
+ ValueType expected_value_type, const Slice& expected_value,
+ const Slice& expected_ts) const {
+ ASSERT_TRUE(it->Valid());
+ ASSERT_OK(it->status());
+ ASSERT_EQ(expected_key, it->key());
+ if (kTypeValue == expected_value_type) {
+ ASSERT_EQ(expected_value, it->value());
+ }
+ ASSERT_EQ(expected_ts, it->timestamp());
+}
+
+void DBBasicTestWithTimestampBase::CheckIterEntry(
+ const Iterator* it, const Slice& expected_ukey, SequenceNumber expected_seq,
+ ValueType expected_val_type, const Slice& expected_value,
+ const Slice& expected_ts) const {
+ ASSERT_TRUE(it->Valid());
+ ASSERT_OK(it->status());
+ std::string ukey_and_ts;
+ ukey_and_ts.assign(expected_ukey.data(), expected_ukey.size());
+ ukey_and_ts.append(expected_ts.data(), expected_ts.size());
+ ParsedInternalKey parsed_ikey;
+ ASSERT_OK(ParseInternalKey(it->key(), &parsed_ikey, true /* log_err_key */));
+ ASSERT_EQ(ukey_and_ts, parsed_ikey.user_key);
+ ASSERT_EQ(expected_val_type, parsed_ikey.type);
+ ASSERT_EQ(expected_seq, parsed_ikey.sequence);
+ if (expected_val_type == kTypeValue) {
+ ASSERT_EQ(expected_value, it->value());
+ }
+ ASSERT_EQ(expected_ts, it->timestamp());
+}
+
+void DBBasicTestWithTimestampBase::CheckIterEntry(
+ const Iterator* it, const Slice& expected_ukey, ValueType expected_val_type,
+ const Slice& expected_value, const Slice& expected_ts) const {
+ ASSERT_TRUE(it->Valid());
+ ASSERT_OK(it->status());
+ std::string ukey_and_ts;
+ ukey_and_ts.assign(expected_ukey.data(), expected_ukey.size());
+ ukey_and_ts.append(expected_ts.data(), expected_ts.size());
+
+ ParsedInternalKey parsed_ikey;
+ ASSERT_OK(ParseInternalKey(it->key(), &parsed_ikey, true /* log_err_key */));
+ ASSERT_EQ(expected_val_type, parsed_ikey.type);
+ ASSERT_EQ(Slice(ukey_and_ts), parsed_ikey.user_key);
+ if (expected_val_type == kTypeValue) {
+ ASSERT_EQ(expected_value, it->value());
+ }
+ ASSERT_EQ(expected_ts, it->timestamp());
+}
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/db_with_timestamp_test_util.h b/src/rocksdb/db/db_with_timestamp_test_util.h
new file mode 100644
index 000000000..8a0d8e4e3
--- /dev/null
+++ b/src/rocksdb/db/db_with_timestamp_test_util.h
@@ -0,0 +1,126 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include "db/db_test_util.h"
+#include "port/stack_trace.h"
+#include "test_util/testutil.h"
+
+namespace ROCKSDB_NAMESPACE {
+class DBBasicTestWithTimestampBase : public DBTestBase {
+ public:
+ explicit DBBasicTestWithTimestampBase(const std::string& dbname)
+ : DBTestBase(dbname, /*env_do_fsync=*/true) {}
+
+ protected:
+ static std::string Key1(uint64_t k);
+
+ static std::string KeyWithPrefix(std::string prefix, uint64_t k);
+
+ static std::vector<Slice> ConvertStrToSlice(
+ std::vector<std::string>& strings);
+
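+ // Comparator that treats the last 16 bytes of a user key as a (low, high)
+ // timestamp. Keys are ordered ascending by the key prefix; for equal
+ // prefixes, newer (larger) timestamps sort first.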
+ class TestComparator : public Comparator {
+ private:
+ const Comparator* cmp_without_ts_;
+
+ public:
+ explicit TestComparator(size_t ts_sz)
+ : Comparator(ts_sz), cmp_without_ts_(nullptr) {
+ cmp_without_ts_ = BytewiseComparator();
+ }
+
+ const char* Name() const override { return "TestComparator"; }
+
+ void FindShortSuccessor(std::string*) const override {}
+
+ void FindShortestSeparator(std::string*, const Slice&) const override {}
+
+ int Compare(const Slice& a, const Slice& b) const override {
+ int r = CompareWithoutTimestamp(a, b);
+ if (r != 0 || 0 == timestamp_size()) {
+ return r;
+ }
+ return -CompareTimestamp(
+ Slice(a.data() + a.size() - timestamp_size(), timestamp_size()),
+ Slice(b.data() + b.size() - timestamp_size(), timestamp_size()));
+ }
+
+ using Comparator::CompareWithoutTimestamp;
+ int CompareWithoutTimestamp(const Slice& a, bool a_has_ts, const Slice& b,
+ bool b_has_ts) const override {
+ if (a_has_ts) {
+ assert(a.size() >= timestamp_size());
+ }
+ if (b_has_ts) {
+ assert(b.size() >= timestamp_size());
+ }
+ Slice lhs = a_has_ts ? StripTimestampFromUserKey(a, timestamp_size()) : a;
+ Slice rhs = b_has_ts ? StripTimestampFromUserKey(b, timestamp_size()) : b;
+ return cmp_without_ts_->Compare(lhs, rhs);
+ }
+
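+ // Compares two 16-byte timestamps by their high 64 bits first, then by
+ // their low 64 bits.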
+ int CompareTimestamp(const Slice& ts1, const Slice& ts2) const override {
+ if (!ts1.data() && !ts2.data()) {
+ return 0;
+ } else if (ts1.data() && !ts2.data()) {
+ return 1;
+ } else if (!ts1.data() && ts2.data()) {
+ return -1;
+ }
+ assert(ts1.size() == ts2.size());
+ uint64_t low1 = 0;
+ uint64_t low2 = 0;
+ uint64_t high1 = 0;
+ uint64_t high2 = 0;
+ const size_t kSize = ts1.size();
+ std::unique_ptr<char[]> ts1_buf(new char[kSize]);
+ memcpy(ts1_buf.get(), ts1.data(), ts1.size());
+ std::unique_ptr<char[]> ts2_buf(new char[kSize]);
+ memcpy(ts2_buf.get(), ts2.data(), ts2.size());
+ Slice ts1_copy = Slice(ts1_buf.get(), kSize);
+ Slice ts2_copy = Slice(ts2_buf.get(), kSize);
+ auto* ptr1 = const_cast<Slice*>(&ts1_copy);
+ auto* ptr2 = const_cast<Slice*>(&ts2_copy);
+ if (!GetFixed64(ptr1, &low1) || !GetFixed64(ptr1, &high1) ||
+ !GetFixed64(ptr2, &low2) || !GetFixed64(ptr2, &high2)) {
+ assert(false);
+ }
+ if (high1 < high2) {
+ return -1;
+ } else if (high1 > high2) {
+ return 1;
+ }
+ if (low1 < low2) {
+ return -1;
+ } else if (low1 > low2) {
+ return 1;
+ }
+ return 0;
+ }
+ };
+
+ std::string Timestamp(uint64_t low, uint64_t high);
+
+ void CheckIterUserEntry(const Iterator* it, const Slice& expected_key,
+ ValueType expected_value_type,
+ const Slice& expected_value,
+ const Slice& expected_ts) const;
+
+ void CheckIterEntry(const Iterator* it, const Slice& expected_ukey,
+ SequenceNumber expected_seq, ValueType expected_val_type,
+ const Slice& expected_value,
+ const Slice& expected_ts) const;
+
+ void CheckIterEntry(const Iterator* it, const Slice& expected_ukey,
+ ValueType expected_val_type, const Slice& expected_value,
+ const Slice& expected_ts) const;
+};
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/db_write_buffer_manager_test.cc b/src/rocksdb/db/db_write_buffer_manager_test.cc
new file mode 100644
index 000000000..4c31a7824
--- /dev/null
+++ b/src/rocksdb/db/db_write_buffer_manager_test.cc
@@ -0,0 +1,862 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/db_test_util.h"
+#include "db/write_thread.h"
+#include "port/stack_trace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBWriteBufferManagerTest : public DBTestBase,
+ public testing::WithParamInterface<bool> {
+ public:
+ DBWriteBufferManagerTest()
+ : DBTestBase("db_write_buffer_manager_test", /*env_do_fsync=*/false) {}
+ bool cost_cache_;
+};
+
+TEST_P(DBWriteBufferManagerTest, SharedBufferAcrossCFs1) {
+ Options options = CurrentOptions();
+ options.arena_block_size = 4096;
+ options.write_buffer_size = 500000; // this is never hit
+ std::shared_ptr<Cache> cache = NewLRUCache(4 * 1024 * 1024, 2);
+ ASSERT_LT(cache->GetUsage(), 256 * 1024);
+ cost_cache_ = GetParam();
+
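+ // WriteBufferManager(buffer_size, cache, allow_stall): a 100000-byte limit
+ // shared by every CF (and DB) using this manager; when cost_cache_ is set,
+ // memtable memory is also charged against the block cache. The final `true`
+ // enables write stalls once the limit is exceeded.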
+ if (cost_cache_) {
+ options.write_buffer_manager.reset(
+ new WriteBufferManager(100000, cache, true));
+ } else {
+ options.write_buffer_manager.reset(
+ new WriteBufferManager(100000, nullptr, true));
+ }
+
+ WriteOptions wo;
+ wo.disableWAL = true;
+
+ CreateAndReopenWithCF({"cf1", "cf2", "cf3"}, options);
+ ASSERT_OK(Put(3, Key(1), DummyString(1), wo));
+ ASSERT_OK(Flush(3));
+ ASSERT_OK(Put(3, Key(1), DummyString(1), wo));
+ ASSERT_OK(Put(0, Key(1), DummyString(1), wo));
+ ASSERT_OK(Flush(0));
+
+ // Write to "Default", "cf2" and "cf3".
+ ASSERT_OK(Put(3, Key(1), DummyString(30000), wo));
+ ASSERT_OK(Put(0, Key(1), DummyString(40000), wo));
+ ASSERT_OK(Put(2, Key(1), DummyString(1), wo));
+
+ ASSERT_OK(Put(3, Key(2), DummyString(40000), wo));
+ // WriteBufferManager::buffer_size_ has been exceeded after the previous
+ // write completed.
+
+ // This makes sure the write goes through and, if a stall was in effect,
+ // that it ends.
+ ASSERT_OK(Put(0, Key(2), DummyString(1), wo));
+}
+
+// Test that a single DB with multiple writer threads gets blocked when
+// WriteBufferManager exceeds buffer_size_ and a flush is waiting to be
+// finished.
+TEST_P(DBWriteBufferManagerTest, SharedWriteBufferAcrossCFs2) {
+ Options options = CurrentOptions();
+ options.arena_block_size = 4096;
+ options.write_buffer_size = 500000; // this is never hit
+ std::shared_ptr<Cache> cache = NewLRUCache(4 * 1024 * 1024, 2);
+ ASSERT_LT(cache->GetUsage(), 256 * 1024);
+ cost_cache_ = GetParam();
+
+ if (cost_cache_) {
+ options.write_buffer_manager.reset(
+ new WriteBufferManager(100000, cache, true));
+ } else {
+ options.write_buffer_manager.reset(
+ new WriteBufferManager(100000, nullptr, true));
+ }
+ WriteOptions wo;
+ wo.disableWAL = true;
+
+ CreateAndReopenWithCF({"cf1", "cf2", "cf3"}, options);
+ ASSERT_OK(Put(3, Key(1), DummyString(1), wo));
+ ASSERT_OK(Flush(3));
+ ASSERT_OK(Put(3, Key(1), DummyString(1), wo));
+ ASSERT_OK(Put(0, Key(1), DummyString(1), wo));
+ ASSERT_OK(Flush(0));
+
+ // Write to "Default", "cf2" and "cf3". No flush will be triggered.
+ ASSERT_OK(Put(3, Key(1), DummyString(30000), wo));
+ ASSERT_OK(Put(0, Key(1), DummyString(40000), wo));
+ ASSERT_OK(Put(2, Key(1), DummyString(1), wo));
+
+ ASSERT_OK(Put(3, Key(2), DummyString(40000), wo));
+ // WriteBufferManager::buffer_size_ has been exceeded after the previous
+ // write completed.
+
+ std::unordered_set<WriteThread::Writer*> w_set;
+ std::vector<port::Thread> threads;
+ int wait_count_db = 0;
+ int num_writers = 4;
+ InstrumentedMutex mutex;
+ InstrumentedCondVar cv(&mutex);
+ std::atomic<int> thread_num(0);
+
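+ // The dependency below keeps the background flush from starting until the
+ // test hits the SharedWriteBufferAcrossCFs:0 sync point, i.e. until all
+ // writer threads are blocked.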
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0",
+ "DBImpl::BackgroundCallFlush:start"}});
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "WBMStallInterface::BlockDB", [&](void*) {
+ InstrumentedMutexLock lock(&mutex);
+ wait_count_db++;
+ cv.SignalAll();
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "WriteThread::WriteStall::Wait", [&](void* arg) {
+ InstrumentedMutexLock lock(&mutex);
+ WriteThread::Writer* w = reinterpret_cast<WriteThread::Writer*>(arg);
+ w_set.insert(w);
+ // Allow the flush to continue if all writer threads are blocked.
+ if (w_set.size() == (unsigned long)num_writers) {
+ TEST_SYNC_POINT(
+ "DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0");
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ bool s = true;
+
+ std::function<void(int)> writer = [&](int cf) {
+ int a = thread_num.fetch_add(1);
+ std::string key = "foo" + std::to_string(a);
+ Status tmp = Put(cf, Slice(key), DummyString(1), wo);
+ InstrumentedMutexLock lock(&mutex);
+ s = s && tmp.ok();
+ };
+
+ // Flow:
+ // The main_writer thread will write but will be blocked (the flush is on
+ // hold and buffer_size_ has been exceeded, so a stall is in effect).
+ // |
+ // |
+ // multiple writer threads will be created to write across multiple columns
+ // and they will be blocked.
+ // |
+ // |
+ // The last writer thread will write, and once it is blocked it will signal
+ // the flush to continue and clear the stall.
+
+ threads.emplace_back(writer, 1);
+ // Wait until the first thread (main_writer) writing to the DB is blocked,
+ // then create the other writers, which will be blocked from getting added to
+ // the queue because a stall is in effect.
+ {
+ InstrumentedMutexLock lock(&mutex);
+ while (wait_count_db != 1) {
+ cv.Wait();
+ }
+ }
+ for (int i = 0; i < num_writers; i++) {
+ threads.emplace_back(writer, i % 4);
+ }
+ for (auto& t : threads) {
+ t.join();
+ }
+
+ ASSERT_TRUE(s);
+
+ // Number of DBs blocked.
+ ASSERT_EQ(wait_count_db, 1);
+ // Number of Writer threads blocked.
+ ASSERT_EQ(w_set.size(), num_writers);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+// Test that multiple DBs get blocked when the WriteBufferManager limit is
+// exceeded and a flush is waiting to finish while the DBs keep writing.
+TEST_P(DBWriteBufferManagerTest, SharedWriteBufferLimitAcrossDB) {
+ std::vector<std::string> dbnames;
+ std::vector<DB*> dbs;
+ int num_dbs = 3;
+
+ for (int i = 0; i < num_dbs; i++) {
+ dbs.push_back(nullptr);
+ dbnames.push_back(
+ test::PerThreadDBPath("db_shared_wb_db" + std::to_string(i)));
+ }
+
+ Options options = CurrentOptions();
+ options.arena_block_size = 4096;
+ options.write_buffer_size = 500000; // this is never hit
+ std::shared_ptr<Cache> cache = NewLRUCache(4 * 1024 * 1024, 2);
+ ASSERT_LT(cache->GetUsage(), 256 * 1024);
+ cost_cache_ = GetParam();
+
+ if (cost_cache_) {
+ options.write_buffer_manager.reset(
+ new WriteBufferManager(100000, cache, true));
+ } else {
+ options.write_buffer_manager.reset(
+ new WriteBufferManager(100000, nullptr, true));
+ }
+ CreateAndReopenWithCF({"cf1", "cf2"}, options);
+
+ for (int i = 0; i < num_dbs; i++) {
+ ASSERT_OK(DestroyDB(dbnames[i], options));
+ ASSERT_OK(DB::Open(options, dbnames[i], &(dbs[i])));
+ }
+ WriteOptions wo;
+ wo.disableWAL = true;
+
+ for (int i = 0; i < num_dbs; i++) {
+ ASSERT_OK(dbs[i]->Put(wo, Key(1), DummyString(20000)));
+ }
+ // Insert to db_.
+ ASSERT_OK(Put(0, Key(1), DummyString(30000), wo));
+
+ // WriteBufferManager Limit exceeded.
+ std::vector<port::Thread> threads;
+ int wait_count_db = 0;
+ InstrumentedMutex mutex;
+ InstrumentedCondVar cv(&mutex);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0",
+ "DBImpl::BackgroundCallFlush:start"}});
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "WBMStallInterface::BlockDB", [&](void*) {
+ {
+ InstrumentedMutexLock lock(&mutex);
+ wait_count_db++;
+ cv.Signal();
+ // Once the last DB is blocked, signal the flush to continue.
+ if (wait_count_db == num_dbs + 1) {
+ TEST_SYNC_POINT(
+ "DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0");
+ }
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ bool s = true;
+
+ // Write to DB.
+ std::function<void(DB*)> write_db = [&](DB* db) {
+ Status tmp = db->Put(wo, Key(3), DummyString(1));
+ InstrumentedMutexLock lock(&mutex);
+ s = s && tmp.ok();
+ };
+
+ // Flow:
+ // db_ will write and will be blocked (the flush is on hold, which keeps the
+ // stall in effect).
+ // |
+ // writer threads will be created to write to the other DBs, and they will
+ // be blocked.
+ // |
+ // |
+ // The last writer will write, and once it is blocked it will signal the
+ // flush to continue and clear the stall.
+
+ threads.emplace_back(write_db, db_);
+ // Wait until the first DB is blocked, then create the writers for the other
+ // DBs, which will be blocked from getting added to the queue because a stall
+ // is in effect.
+ {
+ InstrumentedMutexLock lock(&mutex);
+ while (wait_count_db != 1) {
+ cv.Wait();
+ }
+ }
+ for (int i = 0; i < num_dbs; i++) {
+ threads.emplace_back(write_db, dbs[i]);
+ }
+ for (auto& t : threads) {
+ t.join();
+ }
+
+ ASSERT_TRUE(s);
+ ASSERT_EQ(num_dbs + 1, wait_count_db);
+ // Clean up DBs.
+ for (int i = 0; i < num_dbs; i++) {
+ ASSERT_OK(dbs[i]->Close());
+ ASSERT_OK(DestroyDB(dbnames[i], options));
+ delete dbs[i];
+ }
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+// Test that multiple threads writing across multiple DBs and multiple column
+// families get blocked when a stall by the WriteBufferManager is in effect.
+TEST_P(DBWriteBufferManagerTest, SharedWriteBufferLimitAcrossDB1) {
+ std::vector<std::string> dbnames;
+ std::vector<DB*> dbs;
+ int num_dbs = 3;
+
+ for (int i = 0; i < num_dbs; i++) {
+ dbs.push_back(nullptr);
+ dbnames.push_back(
+ test::PerThreadDBPath("db_shared_wb_db" + std::to_string(i)));
+ }
+
+ Options options = CurrentOptions();
+ options.arena_block_size = 4096;
+ options.write_buffer_size = 500000; // this is never hit
+ std::shared_ptr<Cache> cache = NewLRUCache(4 * 1024 * 1024, 2);
+ ASSERT_LT(cache->GetUsage(), 256 * 1024);
+ cost_cache_ = GetParam();
+
+ if (cost_cache_) {
+ options.write_buffer_manager.reset(
+ new WriteBufferManager(100000, cache, true));
+ } else {
+ options.write_buffer_manager.reset(
+ new WriteBufferManager(100000, nullptr, true));
+ }
+ CreateAndReopenWithCF({"cf1", "cf2"}, options);
+
+ for (int i = 0; i < num_dbs; i++) {
+ ASSERT_OK(DestroyDB(dbnames[i], options));
+ ASSERT_OK(DB::Open(options, dbnames[i], &(dbs[i])));
+ }
+ WriteOptions wo;
+ wo.disableWAL = true;
+
+ for (int i = 0; i < num_dbs; i++) {
+ ASSERT_OK(dbs[i]->Put(wo, Key(1), DummyString(20000)));
+ }
+ // Insert to db_.
+ ASSERT_OK(Put(0, Key(1), DummyString(30000), wo));
+
+ // WriteBufferManager::buffer_size_ has been exceeded after the previous
+ // write to db_ completed.
+ std::vector<port::Thread> threads;
+ int wait_count_db = 0;
+ InstrumentedMutex mutex;
+ InstrumentedCondVar cv(&mutex);
+ std::unordered_set<WriteThread::Writer*> w_set;
+ std::vector<port::Thread> writer_threads;
+ std::atomic<int> thread_num(0);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0",
+ "DBImpl::BackgroundCallFlush:start"}});
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "WBMStallInterface::BlockDB", [&](void*) {
+ {
+ InstrumentedMutexLock lock(&mutex);
+ wait_count_db++;
+ thread_num.fetch_add(1);
+ cv.Signal();
+ // Allow the flush to continue if all writer threads are blocked.
+ if (thread_num.load(std::memory_order_relaxed) == 2 * num_dbs + 1) {
+ TEST_SYNC_POINT(
+ "DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0");
+ }
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "WriteThread::WriteStall::Wait", [&](void* arg) {
+ WriteThread::Writer* w = reinterpret_cast<WriteThread::Writer*>(arg);
+ {
+ InstrumentedMutexLock lock(&mutex);
+ w_set.insert(w);
+ thread_num.fetch_add(1);
+ // Allow the flush to continue if all writer threads are blocked.
+ if (thread_num.load(std::memory_order_relaxed) == 2 * num_dbs + 1) {
+ TEST_SYNC_POINT(
+ "DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0");
+ }
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ bool s1 = true, s2 = true;
+ // Write to multiple columns of db_.
+ std::function<void(int)> write_cf = [&](int cf) {
+ Status tmp = Put(cf, Key(3), DummyString(1), wo);
+ InstrumentedMutexLock lock(&mutex);
+ s1 = s1 && tmp.ok();
+ };
+ // Write to multiple DBs.
+ std::function<void(DB*)> write_db = [&](DB* db) {
+ Status tmp = db->Put(wo, Key(3), DummyString(1));
+ InstrumentedMutexLock lock(&mutex);
+ s2 = s2 && tmp.ok();
+ };
+
+ // Flow:
+ // A thread will write to db_ and will be blocked (the flush is on hold and
+ // buffer_size_ has been exceeded, so a stall is in effect).
+ // |
+ // |
+ // multiple writer threads writing to different DBs and to db_ across
+ // multiple column families will be created, and they will be blocked by the
+ // stall.
+ // |
+ // |
+ // The last writer thread will write, and once it is blocked it will signal
+ // the flush to continue and clear the stall.
+ threads.emplace_back(write_db, db_);
+ // Wait until the first thread is blocked, then create the remaining writer
+ // threads.
+ {
+ InstrumentedMutexLock lock(&mutex);
+ while (wait_count_db != 1) {
+ cv.Wait();
+ }
+ }
+
+ for (int i = 0; i < num_dbs; i++) {
+ // Write to multiple columns of db_.
+ writer_threads.emplace_back(write_cf, i % 3);
+ // Write to different dbs.
+ threads.emplace_back(write_db, dbs[i]);
+ }
+ for (auto& t : threads) {
+ t.join();
+ }
+ for (auto& t : writer_threads) {
+ t.join();
+ }
+
+ ASSERT_TRUE(s1);
+ ASSERT_TRUE(s2);
+
+ // Number of DBs blocked.
+ ASSERT_EQ(num_dbs + 1, wait_count_db);
+ // Number of Writer threads blocked.
+ ASSERT_EQ(w_set.size(), num_dbs);
+ // Clean up DBs.
+ for (int i = 0; i < num_dbs; i++) {
+ ASSERT_OK(dbs[i]->Close());
+ ASSERT_OK(DestroyDB(dbnames[i], options));
+ delete dbs[i];
+ }
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+// Test multiple threads writing across multiple column families of db_ with
+// different values of WriteOptions.no_slowdown.
+TEST_P(DBWriteBufferManagerTest, MixedSlowDownOptionsSingleDB) {
+ Options options = CurrentOptions();
+ options.arena_block_size = 4096;
+ options.write_buffer_size = 500000; // this is never hit
+ std::shared_ptr<Cache> cache = NewLRUCache(4 * 1024 * 1024, 2);
+ ASSERT_LT(cache->GetUsage(), 256 * 1024);
+ cost_cache_ = GetParam();
+
+ if (cost_cache_) {
+ options.write_buffer_manager.reset(
+ new WriteBufferManager(100000, cache, true));
+ } else {
+ options.write_buffer_manager.reset(
+ new WriteBufferManager(100000, nullptr, true));
+ }
+ WriteOptions wo;
+ wo.disableWAL = true;
+
+ CreateAndReopenWithCF({"cf1", "cf2", "cf3"}, options);
+
+ ASSERT_OK(Put(3, Key(1), DummyString(1), wo));
+ ASSERT_OK(Flush(3));
+ ASSERT_OK(Put(3, Key(1), DummyString(1), wo));
+ ASSERT_OK(Put(0, Key(1), DummyString(1), wo));
+ ASSERT_OK(Flush(0));
+
+ // Write to "Default", "cf2" and "cf3". No flush will be triggered.
+ ASSERT_OK(Put(3, Key(1), DummyString(30000), wo));
+ ASSERT_OK(Put(0, Key(1), DummyString(40000), wo));
+ ASSERT_OK(Put(2, Key(1), DummyString(1), wo));
+ ASSERT_OK(Put(3, Key(2), DummyString(40000), wo));
+
+ // WriteBufferManager::buffer_size_ has been exceeded after the previous
+ // write to db_ completed.
+
+ std::unordered_set<WriteThread::Writer*> w_slowdown_set;
+ std::vector<port::Thread> threads;
+ int wait_count_db = 0;
+ int num_writers = 4;
+ InstrumentedMutex mutex;
+ InstrumentedCondVar cv(&mutex);
+ std::atomic<int> thread_num(0);
+ std::atomic<int> w_no_slowdown(0);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0",
+ "DBImpl::BackgroundCallFlush:start"}});
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "WBMStallInterface::BlockDB", [&](void*) {
+ {
+ InstrumentedMutexLock lock(&mutex);
+ wait_count_db++;
+ cv.SignalAll();
+ }
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "WriteThread::WriteStall::Wait", [&](void* arg) {
+ {
+ InstrumentedMutexLock lock(&mutex);
+ WriteThread::Writer* w = reinterpret_cast<WriteThread::Writer*>(arg);
+ w_slowdown_set.insert(w);
+ // Allow the flush to continue if all writer threads are blocked.
+ if (w_slowdown_set.size() + (unsigned long)w_no_slowdown.load(
+ std::memory_order_relaxed) ==
+ (unsigned long)num_writers) {
+ TEST_SYNC_POINT(
+ "DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0");
+ }
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ bool s1 = true, s2 = true;
+
+ std::function<void(int)> write_slow_down = [&](int cf) {
+ int a = thread_num.fetch_add(1);
+ std::string key = "foo" + std::to_string(a);
+ WriteOptions write_op;
+ write_op.no_slowdown = false;
+ Status tmp = Put(cf, Slice(key), DummyString(1), write_op);
+ InstrumentedMutexLock lock(&mutex);
+ s1 = s1 && tmp.ok();
+ };
+
+ std::function<void(int)> write_no_slow_down = [&](int cf) {
+ int a = thread_num.fetch_add(1);
+ std::string key = "foo" + std::to_string(a);
+ WriteOptions write_op;
+ write_op.no_slowdown = true;
+ Status tmp = Put(cf, Slice(key), DummyString(1), write_op);
+ {
+ InstrumentedMutexLock lock(&mutex);
+ s2 = s2 && !tmp.ok();
+ w_no_slowdown.fetch_add(1);
+ // Allow the flush to continue if all writer threads are blocked.
+ if (w_slowdown_set.size() +
+ (unsigned long)w_no_slowdown.load(std::memory_order_relaxed) ==
+ (unsigned long)num_writers) {
+ TEST_SYNC_POINT(
+ "DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0");
+ }
+ }
+ };
+
+ // Flow:
+ // The main_writer thread will write but will be blocked (the flush is on
+ // hold and buffer_size_ has been exceeded, so a stall is in effect).
+ // |
+ // |
+ // multiple writer threads will be created to write across multiple columns
+ // with different values of WriteOptions.no_slowdown. Some of them will
+ // be blocked and some of them will return with Incomplete status.
+ // |
+ // |
+ // The last writer thread will write, and once it is blocked or has returned
+ // it will signal the flush to continue and clear the stall.
+ threads.emplace_back(write_slow_down, 1);
+ // Wait until the first thread (main_writer) writing to the DB is blocked,
+ // then create the other writers, which will be blocked from getting added to
+ // the queue because a stall is in effect.
+ {
+ InstrumentedMutexLock lock(&mutex);
+ while (wait_count_db != 1) {
+ cv.Wait();
+ }
+ }
+
+ for (int i = 0; i < num_writers; i += 2) {
+ threads.emplace_back(write_no_slow_down, (i) % 4);
+ threads.emplace_back(write_slow_down, (i + 1) % 4);
+ }
+ for (auto& t : threads) {
+ t.join();
+ }
+
+ ASSERT_TRUE(s1);
+ ASSERT_TRUE(s2);
+ // Number of DBs blocked.
+ ASSERT_EQ(wait_count_db, 1);
+ // Number of Writer threads blocked.
+ ASSERT_EQ(w_slowdown_set.size(), num_writers / 2);
+ // Number of Writer threads with WriteOptions.no_slowdown = true.
+ ASSERT_EQ(w_no_slowdown.load(std::memory_order_relaxed), num_writers / 2);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+// Test multiple threads writing across multiple column families of db_ and
+// different DBs with different values of WriteOptions.no_slowdown.
+TEST_P(DBWriteBufferManagerTest, MixedSlowDownOptionsMultipleDB) {
+ std::vector<std::string> dbnames;
+ std::vector<DB*> dbs;
+ int num_dbs = 4;
+
+ for (int i = 0; i < num_dbs; i++) {
+ dbs.push_back(nullptr);
+ dbnames.push_back(
+ test::PerThreadDBPath("db_shared_wb_db" + std::to_string(i)));
+ }
+
+ Options options = CurrentOptions();
+ options.arena_block_size = 4096;
+ options.write_buffer_size = 500000; // this is never hit
+ std::shared_ptr<Cache> cache = NewLRUCache(4 * 1024 * 1024, 2);
+ ASSERT_LT(cache->GetUsage(), 256 * 1024);
+ cost_cache_ = GetParam();
+
+ if (cost_cache_) {
+ options.write_buffer_manager.reset(
+ new WriteBufferManager(100000, cache, true));
+ } else {
+ options.write_buffer_manager.reset(
+ new WriteBufferManager(100000, nullptr, true));
+ }
+ CreateAndReopenWithCF({"cf1", "cf2"}, options);
+
+ for (int i = 0; i < num_dbs; i++) {
+ ASSERT_OK(DestroyDB(dbnames[i], options));
+ ASSERT_OK(DB::Open(options, dbnames[i], &(dbs[i])));
+ }
+ WriteOptions wo;
+ wo.disableWAL = true;
+
+ for (int i = 0; i < num_dbs; i++) {
+ ASSERT_OK(dbs[i]->Put(wo, Key(1), DummyString(20000)));
+ }
+ // Insert to db_.
+ ASSERT_OK(Put(0, Key(1), DummyString(30000), wo));
+
+ // WriteBufferManager::buffer_size_ has been exceeded after the previous
+ // write to db_ completed.
+ std::vector<port::Thread> threads;
+ int wait_count_db = 0;
+ InstrumentedMutex mutex;
+ InstrumentedCondVar cv(&mutex);
+ std::unordered_set<WriteThread::Writer*> w_slowdown_set;
+ std::vector<port::Thread> writer_threads;
+ std::atomic<int> thread_num(0);
+ std::atomic<int> w_no_slowdown(0);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0",
+ "DBImpl::BackgroundCallFlush:start"}});
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "WBMStallInterface::BlockDB", [&](void*) {
+ InstrumentedMutexLock lock(&mutex);
+ wait_count_db++;
+ cv.Signal();
+ // Allow the flush to continue if all writer threads are blocked.
+ if (w_slowdown_set.size() +
+ (unsigned long)(w_no_slowdown.load(std::memory_order_relaxed) +
+ wait_count_db) ==
+ (unsigned long)(2 * num_dbs + 1)) {
+ TEST_SYNC_POINT(
+ "DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0");
+ }
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "WriteThread::WriteStall::Wait", [&](void* arg) {
+ WriteThread::Writer* w = reinterpret_cast<WriteThread::Writer*>(arg);
+ InstrumentedMutexLock lock(&mutex);
+ w_slowdown_set.insert(w);
+ // Allow the flush to continue if all writer threads are blocked.
+ if (w_slowdown_set.size() +
+ (unsigned long)(w_no_slowdown.load(std::memory_order_relaxed) +
+ wait_count_db) ==
+ (unsigned long)(2 * num_dbs + 1)) {
+ TEST_SYNC_POINT(
+ "DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0");
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ bool s1 = true, s2 = true;
+ std::function<void(DB*)> write_slow_down = [&](DB* db) {
+ int a = thread_num.fetch_add(1);
+ std::string key = "foo" + std::to_string(a);
+ WriteOptions write_op;
+ write_op.no_slowdown = false;
+ Status tmp = db->Put(write_op, Slice(key), DummyString(1));
+ InstrumentedMutexLock lock(&mutex);
+ s1 = s1 && tmp.ok();
+ };
+
+ std::function<void(DB*)> write_no_slow_down = [&](DB* db) {
+ int a = thread_num.fetch_add(1);
+ std::string key = "foo" + std::to_string(a);
+ WriteOptions write_op;
+ write_op.no_slowdown = true;
+ Status tmp = db->Put(write_op, Slice(key), DummyString(1));
+ {
+ InstrumentedMutexLock lock(&mutex);
+ s2 = s2 && !tmp.ok();
+ w_no_slowdown.fetch_add(1);
+ if (w_slowdown_set.size() +
+ (unsigned long)(w_no_slowdown.load(std::memory_order_relaxed) +
+ wait_count_db) ==
+ (unsigned long)(2 * num_dbs + 1)) {
+ TEST_SYNC_POINT(
+ "DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0");
+ }
+ }
+ };
+
+ // Flow:
+ // The first thread will write but will be blocked (the flush is on hold and
+ // buffer_size_ has been exceeded, so a stall is in effect).
+ // |
+ // |
+ // multiple writer threads will be created to write across multiple columns
+ // of db_ and different DBs with different values of
+ // WriteOptions.no_slowdown. Some of them will be blocked and some of them
+ // will return with Incomplete status.
+ // |
+ // |
+ // The last writer thread will write, and once it is blocked or has returned
+ // it will signal the flush to continue and clear the stall.
+ threads.emplace_back(write_slow_down, db_);
+ // Wait until the first thread writing to the DB is blocked, then create the
+ // remaining writers.
+ {
+ InstrumentedMutexLock lock(&mutex);
+ while (wait_count_db != 1) {
+ cv.Wait();
+ }
+ }
+
+ for (int i = 0; i < num_dbs; i += 2) {
+ // Write to multiple columns of db_.
+ writer_threads.emplace_back(write_slow_down, db_);
+ writer_threads.emplace_back(write_no_slow_down, db_);
+ // Write to different DBs.
+ threads.emplace_back(write_slow_down, dbs[i]);
+ threads.emplace_back(write_no_slow_down, dbs[i + 1]);
+ }
+
+ for (auto& t : threads) {
+ t.join();
+ }
+
+ for (auto& t : writer_threads) {
+ t.join();
+ }
+
+ ASSERT_TRUE(s1);
+ ASSERT_TRUE(s2);
+ // Number of DBs blocked.
+ ASSERT_EQ((num_dbs / 2) + 1, wait_count_db);
+ // Number of writer threads writing to db_ blocked from getting added to the
+ // queue.
+ ASSERT_EQ(w_slowdown_set.size(), num_dbs / 2);
+ // Number of threads with WriteOptions.no_slowdown = true.
+ ASSERT_EQ(w_no_slowdown.load(std::memory_order_relaxed), num_dbs);
+
+ // Clean up DBs.
+ for (int i = 0; i < num_dbs; i++) {
+ ASSERT_OK(dbs[i]->Close());
+ ASSERT_OK(DestroyDB(dbnames[i], options));
+ delete dbs[i];
+ }
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+#ifndef ROCKSDB_LITE
+
+// Tests that a `WriteBufferManager` constructed with `allow_stall == false`
+// does not thrash memtable switching when full and a CF receives multiple
+// writes. Instead, we expect a CF's memtable to be switched for flush only
+// when that CF does not have any pending or running flush.
+//
+// This test uses multiple DBs each with a single CF instead of a single DB
+// with multiple CFs. That way we can control which CF is considered for switch
+// by writing to that CF's DB.
+//
+// Not supported in LITE mode because `GetProperty()` is unavailable.
+TEST_P(DBWriteBufferManagerTest, StopSwitchingMemTablesOnceFlushing) {
+ Options options = CurrentOptions();
+ options.arena_block_size = 4 << 10; // 4KB
+ options.write_buffer_size = 1 << 20; // 1MB
+ std::shared_ptr<Cache> cache =
+ NewLRUCache(4 << 20 /* capacity (4MB) */, 2 /* num_shard_bits */);
+ ASSERT_LT(cache->GetUsage(), 256 << 10 /* 256KB */);
+ cost_cache_ = GetParam();
+ if (cost_cache_) {
+ options.write_buffer_manager.reset(new WriteBufferManager(
+ 512 << 10 /* buffer_size (512KB) */, cache, false /* allow_stall */));
+ } else {
+ options.write_buffer_manager.reset(
+ new WriteBufferManager(512 << 10 /* buffer_size (512KB) */,
+ nullptr /* cache */, false /* allow_stall */));
+ }
+
+ Reopen(options);
+ std::string dbname = test::PerThreadDBPath("db_shared_wbm_db");
+ DB* shared_wbm_db = nullptr;
+
+ ASSERT_OK(DestroyDB(dbname, options));
+ ASSERT_OK(DB::Open(options, dbname, &shared_wbm_db));
+
+ // The last write will make WBM need flush, but it won't flush yet.
+ ASSERT_OK(Put(Key(1), DummyString(256 << 10 /* 256KB */), WriteOptions()));
+ ASSERT_FALSE(options.write_buffer_manager->ShouldFlush());
+ ASSERT_OK(Put(Key(1), DummyString(256 << 10 /* 256KB */), WriteOptions()));
+ ASSERT_TRUE(options.write_buffer_manager->ShouldFlush());
+
+ // Flushes will be pending, not running because flush threads are blocked.
+ test::SleepingBackgroundTask sleeping_task_high;
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
+ &sleeping_task_high, Env::Priority::HIGH);
+
+ for (int i = 0; i < 3; ++i) {
+ ASSERT_OK(
+ shared_wbm_db->Put(WriteOptions(), Key(1), DummyString(1 /* len */)));
+ std::string prop;
+ ASSERT_TRUE(
+ shared_wbm_db->GetProperty("rocksdb.num-immutable-mem-table", &prop));
+ ASSERT_EQ(std::to_string(i > 0 ? 1 : 0), prop);
+ ASSERT_TRUE(
+ shared_wbm_db->GetProperty("rocksdb.mem-table-flush-pending", &prop));
+ ASSERT_EQ(std::to_string(i > 0 ? 1 : 0), prop);
+ }
+
+ // Clean up DBs.
+ sleeping_task_high.WakeUp();
+ sleeping_task_high.WaitUntilDone();
+ ASSERT_OK(shared_wbm_db->Close());
+ ASSERT_OK(DestroyDB(dbname, options));
+ delete shared_wbm_db;
+}
+
+#endif // ROCKSDB_LITE
+
+INSTANTIATE_TEST_CASE_P(DBWriteBufferManagerTest, DBWriteBufferManagerTest,
+ testing::Bool());
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ RegisterCustomObjects(argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_write_test.cc b/src/rocksdb/db/db_write_test.cc
new file mode 100644
index 000000000..1011d5c9e
--- /dev/null
+++ b/src/rocksdb/db/db_write_test.cc
@@ -0,0 +1,679 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include <atomic>
+#include <fstream>
+#include <memory>
+#include <thread>
+#include <vector>
+
+#include "db/db_test_util.h"
+#include "db/write_batch_internal.h"
+#include "db/write_thread.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "test_util/sync_point.h"
+#include "util/random.h"
+#include "util/string_util.h"
+#include "utilities/fault_injection_env.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Test variations of WriteImpl.
+class DBWriteTest : public DBTestBase, public testing::WithParamInterface<int> {
+ public:
+ DBWriteTest() : DBTestBase("db_write_test", /*env_do_fsync=*/true) {}
+
+ Options GetOptions() { return DBTestBase::GetOptions(GetParam()); }
+
+ void Open() { DBTestBase::Reopen(GetOptions()); }
+};
+
+class DBWriteTestUnparameterized : public DBTestBase {
+ public:
+ explicit DBWriteTestUnparameterized()
+ : DBTestBase("pipelined_write_test", /*env_do_fsync=*/false) {}
+};
+
+// It is invalid to do a sync write while the WAL is disabled.
+TEST_P(DBWriteTest, SyncAndDisableWAL) {
+ WriteOptions write_options;
+ write_options.sync = true;
+ write_options.disableWAL = true;
+ ASSERT_TRUE(dbfull()->Put(write_options, "foo", "bar").IsInvalidArgument());
+ WriteBatch batch;
+ ASSERT_OK(batch.Put("foo", "bar"));
+ ASSERT_TRUE(dbfull()->Write(write_options, &batch).IsInvalidArgument());
+}
+
+TEST_P(DBWriteTest, WriteStallRemoveNoSlowdownWrite) {
+ Options options = GetOptions();
+ options.level0_stop_writes_trigger = options.level0_slowdown_writes_trigger =
+ 4;
+ std::vector<port::Thread> threads;
+ std::atomic<int> thread_num(0);
+ port::Mutex mutex;
+ port::CondVar cv(&mutex);
+ // Guarded by mutex
+ int writers = 0;
+
+ Reopen(options);
+
+ std::function<void()> write_slowdown_func = [&]() {
+ int a = thread_num.fetch_add(1);
+ std::string key = "foo" + std::to_string(a);
+ WriteOptions wo;
+ wo.no_slowdown = false;
+ ASSERT_OK(dbfull()->Put(wo, key, "bar"));
+ };
+ std::function<void()> write_no_slowdown_func = [&]() {
+ int a = thread_num.fetch_add(1);
+ std::string key = "foo" + std::to_string(a);
+ WriteOptions wo;
+ wo.no_slowdown = true;
+ Status s = dbfull()->Put(wo, key, "bar");
+ ASSERT_TRUE(s.ok() || s.IsIncomplete());
+ };
+ std::function<void(void*)> unblock_main_thread_func = [&](void*) {
+ mutex.Lock();
+ ++writers;
+ cv.SignalAll();
+ mutex.Unlock();
+ };
+
+ // Create 3 L0 files and schedule 4th without waiting
+ ASSERT_OK(Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar"));
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "WriteThread::JoinBatchGroup:Start", unblock_main_thread_func);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBWriteTest::WriteStallRemoveNoSlowdownWrite:1",
+ "DBImpl::BackgroundCallFlush:start"},
+ {"DBWriteTest::WriteStallRemoveNoSlowdownWrite:2",
+ "DBImplWrite::PipelinedWriteImpl:AfterJoinBatchGroup"},
+ // Make compaction start wait for the write stall to be detected and
+ // implemented by a write group leader
+ {"DBWriteTest::WriteStallRemoveNoSlowdownWrite:3",
+ "BackgroundCallCompaction:0"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // Schedule creation of 4th L0 file without waiting. This will seal the
+ // memtable and then wait for a sync point before writing the file. We need
+ // to do it this way because SwitchMemtable() needs to enter the
+ // write_thread
+ FlushOptions fopt;
+ fopt.wait = false;
+ ASSERT_OK(dbfull()->Flush(fopt));
+
+ // Create a mix of slowdown/no_slowdown write threads
+ mutex.Lock();
+ // First leader
+ threads.emplace_back(write_slowdown_func);
+ while (writers != 1) {
+ cv.Wait();
+ }
+
+ // Second leader. Will stall writes
+ // Build a writers list with no slowdown in the middle:
+ // +-------------+
+ // | slowdown +<----+ newest
+ // +--+----------+
+ // |
+ // v
+ // +--+----------+
+ // | no slowdown |
+ // +--+----------+
+ // |
+ // v
+ // +--+----------+
+ // | slowdown +
+ // +-------------+
+ threads.emplace_back(write_slowdown_func);
+ while (writers != 2) {
+ cv.Wait();
+ }
+ threads.emplace_back(write_no_slowdown_func);
+ while (writers != 3) {
+ cv.Wait();
+ }
+ threads.emplace_back(write_slowdown_func);
+ while (writers != 4) {
+ cv.Wait();
+ }
+
+ mutex.Unlock();
+
+ TEST_SYNC_POINT("DBWriteTest::WriteStallRemoveNoSlowdownWrite:1");
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(nullptr));
+ // This would have triggered a write stall. Unblock the write group leader
+ TEST_SYNC_POINT("DBWriteTest::WriteStallRemoveNoSlowdownWrite:2");
+ // The leader is going to create missing newer links. When the leader
+ // finishes, the next leader is going to delay writes and fail writers with
+ // no_slowdown
+
+ TEST_SYNC_POINT("DBWriteTest::WriteStallRemoveNoSlowdownWrite:3");
+ for (auto& t : threads) {
+ t.join();
+ }
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_P(DBWriteTest, WriteThreadHangOnWriteStall) {
+ Options options = GetOptions();
+ options.level0_stop_writes_trigger = options.level0_slowdown_writes_trigger =
+ 4;
+ std::vector<port::Thread> threads;
+ std::atomic<int> thread_num(0);
+ port::Mutex mutex;
+ port::CondVar cv(&mutex);
+ // Guarded by mutex
+ int writers = 0;
+
+ Reopen(options);
+
+ std::function<void()> write_slowdown_func = [&]() {
+ int a = thread_num.fetch_add(1);
+ std::string key = "foo" + std::to_string(a);
+ WriteOptions wo;
+ wo.no_slowdown = false;
+ ASSERT_OK(dbfull()->Put(wo, key, "bar"));
+ };
+ std::function<void()> write_no_slowdown_func = [&]() {
+ int a = thread_num.fetch_add(1);
+ std::string key = "foo" + std::to_string(a);
+ WriteOptions wo;
+ wo.no_slowdown = true;
+ Status s = dbfull()->Put(wo, key, "bar");
+ ASSERT_TRUE(s.ok() || s.IsIncomplete());
+ };
+ std::function<void(void*)> unblock_main_thread_func = [&](void*) {
+ mutex.Lock();
+ ++writers;
+ cv.SignalAll();
+ mutex.Unlock();
+ };
+
+ // Create 3 L0 files and schedule 4th without waiting
+ ASSERT_OK(Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar"));
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "WriteThread::JoinBatchGroup:Start", unblock_main_thread_func);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBWriteTest::WriteThreadHangOnWriteStall:1",
+ "DBImpl::BackgroundCallFlush:start"},
+ {"DBWriteTest::WriteThreadHangOnWriteStall:2",
+ "DBImpl::WriteImpl:BeforeLeaderEnters"},
+ // Make compaction start wait for the write stall to be detected and
+ // implemented by a write group leader
+ {"DBWriteTest::WriteThreadHangOnWriteStall:3",
+ "BackgroundCallCompaction:0"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // Schedule creation of 4th L0 file without waiting. This will seal the
+ // memtable and then wait for a sync point before writing the file. We need
+ // to do it this way because SwitchMemtable() needs to enter the
+ // write_thread
+ FlushOptions fopt;
+ fopt.wait = false;
+ ASSERT_OK(dbfull()->Flush(fopt));
+
+ // Create a mix of slowdown/no_slowdown write threads
+ mutex.Lock();
+ // First leader
+ threads.emplace_back(write_slowdown_func);
+ while (writers != 1) {
+ cv.Wait();
+ }
+ // Second leader. Will stall writes
+ threads.emplace_back(write_slowdown_func);
+ threads.emplace_back(write_no_slowdown_func);
+ threads.emplace_back(write_slowdown_func);
+ threads.emplace_back(write_no_slowdown_func);
+ threads.emplace_back(write_slowdown_func);
+ while (writers != 6) {
+ cv.Wait();
+ }
+ mutex.Unlock();
+
+ TEST_SYNC_POINT("DBWriteTest::WriteThreadHangOnWriteStall:1");
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(nullptr));
+ // This would have triggered a write stall. Unblock the write group leader
+ TEST_SYNC_POINT("DBWriteTest::WriteThreadHangOnWriteStall:2");
+ // The leader is going to create missing newer links. When the leader
+ // finishes, the next leader is going to delay writes and fail writers with
+ // no_slowdown
+
+ TEST_SYNC_POINT("DBWriteTest::WriteThreadHangOnWriteStall:3");
+ for (auto& t : threads) {
+ t.join();
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_P(DBWriteTest, IOErrorOnWALWritePropagateToWriteThreadFollower) {
+ constexpr int kNumThreads = 5;
+ std::unique_ptr<FaultInjectionTestEnv> mock_env(
+ new FaultInjectionTestEnv(env_));
+ Options options = GetOptions();
+ options.env = mock_env.get();
+ Reopen(options);
+ std::atomic<int> ready_count{0};
+ std::atomic<int> leader_count{0};
+ std::vector<port::Thread> threads;
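+ // Deactivate the filesystem so that every subsequent WAL write fails with an
+ // injected IO error.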
+ mock_env->SetFilesystemActive(false);
+
+ // Wait until all threads are linked into the write thread, to make sure
+ // they all join the same batch group.
+ SyncPoint::GetInstance()->SetCallBack(
+ "WriteThread::JoinBatchGroup:Wait", [&](void* arg) {
+ ready_count++;
+ auto* w = reinterpret_cast<WriteThread::Writer*>(arg);
+ if (w->state == WriteThread::STATE_GROUP_LEADER) {
+ leader_count++;
+ while (ready_count < kNumThreads) {
+ // busy waiting
+ }
+ }
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ for (int i = 0; i < kNumThreads; i++) {
+ threads.push_back(port::Thread(
+ [&](int index) {
+ // All threads should fail.
+ auto res = Put("key" + std::to_string(index), "value");
+ if (options.manual_wal_flush) {
+ ASSERT_TRUE(res.ok());
+ // we should see fs error when we do the flush
+
+ // TSAN reports a false alarm for lock-order-inversion but Open and
+ // FlushWAL are not run concurrently. Disabling this until TSAN is
+ // fixed.
+ // res = dbfull()->FlushWAL(false);
+ // ASSERT_FALSE(res.ok());
+ } else {
+ ASSERT_FALSE(res.ok());
+ }
+ },
+ i));
+ }
+ for (int i = 0; i < kNumThreads; i++) {
+ threads[i].join();
+ }
+ ASSERT_EQ(1, leader_count);
+
+ // The failed Put operations can cause a BG error to be set.
+ // Mark it as checked for ASSERT_STATUS_CHECKED builds.
+ dbfull()->Resume().PermitUncheckedError();
+
+ // Close before mock_env destruct.
+ Close();
+}
+
+TEST_F(DBWriteTestUnparameterized, PipelinedWriteRace) {
+ // This test was written to trigger a race in ExitAsBatchGroupLeader in case
+ // enable_pipelined_write_ was true.
+ // Writers for which ShouldWriteToMemtable() evaluates to false are removed
+ // from the write_group via CompleteFollower/ CompleteLeader. Writers in the
+ // middle of the group are fully unlinked, but if that writer is the
+ // last_writer, then we did not update the predecessor's link_older, i.e.,
+ // this writer was still reachable via newest_writer_.
+ //
+ // But the problem was that CompleteFollower already wakes up the thread
+ // owning that writer before the writer has been removed. This resulted in a
+ // race - if the leader thread was fast enough, then everything was fine.
+ // However, if the woken up thread finished the current write operation and
+ // then performed yet another write, then a new writer instance was added
+ // to newest_writer_. It is possible that the new writer is located on the
+ // same address on stack, and if this happened, then we had a problem,
+ // because the old code tried to find the last_writer in the list to unlink
+ // it, which in this case produced a cycle in the list.
+ // Whether two invocations of PipelinedWriteImpl() by the same thread actually
+ // allocate the writer on the same address depends on the OS and/or compiler,
+ // so it is rather hard to create a deterministic test for this.
+
+ Options options = GetDefaultOptions();
+ options.create_if_missing = true;
+ options.enable_pipelined_write = true;
+ std::vector<port::Thread> threads;
+
+ std::atomic<int> write_counter{0};
+ std::atomic<int> active_writers{0};
+ std::atomic<bool> second_write_starting{false};
+ std::atomic<bool> second_write_in_progress{false};
+ std::atomic<WriteThread::Writer*> leader{nullptr};
+ std::atomic<bool> finished_WAL_write{false};
+
+ DestroyAndReopen(options);
+
+ auto write_one_doc = [&]() {
+ int a = write_counter.fetch_add(1);
+ std::string key = "foo" + std::to_string(a);
+ WriteOptions wo;
+ ASSERT_OK(dbfull()->Put(wo, key, "bar"));
+ --active_writers;
+ };
+
+ auto write_two_docs = [&]() {
+ write_one_doc();
+ second_write_starting = true;
+ write_one_doc();
+ };
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "WriteThread::JoinBatchGroup:Wait", [&](void* arg) {
+ if (second_write_starting.load()) {
+ second_write_in_progress = true;
+ return;
+ }
+ auto* w = reinterpret_cast<WriteThread::Writer*>(arg);
+ if (w->state == WriteThread::STATE_GROUP_LEADER) {
+ active_writers++;
+ if (leader.load() == nullptr) {
+ leader.store(w);
+ while (active_writers.load() < 2) {
+ // wait for another thread to join the write_group
+ }
+ }
+ } else {
+ // we disable the memtable for all followers so that they are removed
+ // from the write_group before it is enqueued for the memtable write
+ w->disable_memtable = true;
+ active_writers++;
+ }
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "WriteThread::ExitAsBatchGroupLeader:Start", [&](void* arg) {
+ auto* wg = reinterpret_cast<WriteThread::WriteGroup*>(arg);
+ if (wg->leader == leader && !finished_WAL_write) {
+ finished_WAL_write = true;
+ while (active_writers.load() < 3) {
+ // wait for the new writer to be enqueued
+ }
+ }
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "WriteThread::ExitAsBatchGroupLeader:AfterCompleteWriters",
+ [&](void* arg) {
+ auto* wg = reinterpret_cast<WriteThread::WriteGroup*>(arg);
+ if (wg->leader == leader) {
+ while (!second_write_in_progress.load()) {
+ // wait for the old follower thread to start the next write
+ }
+ }
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // start leader + one follower
+ threads.emplace_back(write_one_doc);
+ while (leader.load() == nullptr) {
+ // wait for leader
+ }
+
+ // we perform two writes in the follower, so that for the second write
+ // the thread reinserts a Writer with the same address
+ threads.emplace_back(write_two_docs);
+
+ // wait for the leader to enter ExitAsBatchGroupLeader
+ while (!finished_WAL_write.load()) {
+ // wait for write_group to have finished the WAL writes
+ }
+
+ // start another writer thread to be enqueued before the leader can
+ // complete the writers from its write_group
+ threads.emplace_back(write_one_doc);
+
+ for (auto& t : threads) {
+ t.join();
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_P(DBWriteTest, ManualWalFlushInEffect) {
+ Options options = GetOptions();
+ Reopen(options);
+ // try the 1st WAL created during open
+ ASSERT_TRUE(Put("key" + std::to_string(0), "value").ok());
+ ASSERT_TRUE(options.manual_wal_flush != dbfull()->WALBufferIsEmpty());
+ ASSERT_TRUE(dbfull()->FlushWAL(false).ok());
+ ASSERT_TRUE(dbfull()->WALBufferIsEmpty());
+ // try the 2nd wal created during SwitchWAL
+ ASSERT_OK(dbfull()->TEST_SwitchWAL());
+ ASSERT_TRUE(Put("key" + std::to_string(0), "value").ok());
+ ASSERT_TRUE(options.manual_wal_flush != dbfull()->WALBufferIsEmpty());
+ ASSERT_TRUE(dbfull()->FlushWAL(false).ok());
+ ASSERT_TRUE(dbfull()->WALBufferIsEmpty());
+}
+
+TEST_P(DBWriteTest, UnflushedPutRaceWithTrackedWalSync) {
+ // Repro race condition bug where unflushed WAL data extended the synced size
+ // recorded to MANIFEST despite being unrecoverable.
+ Options options = GetOptions();
+ std::unique_ptr<FaultInjectionTestEnv> fault_env(
+ new FaultInjectionTestEnv(env_));
+ options.env = fault_env.get();
+ options.manual_wal_flush = true;
+ options.track_and_verify_wals_in_manifest = true;
+ Reopen(options);
+
+ ASSERT_OK(Put("key1", "val1"));
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::SyncWAL:Begin",
+ [this](void* /* arg */) { ASSERT_OK(Put("key2", "val2")); });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(db_->FlushWAL(true /* sync */));
+
+ // Ensure callback ran.
+ ASSERT_EQ("val2", Get("key2"));
+
+ Close();
+
+ // Simulate full loss of unsynced data. This drops "key2" -> "val2" from the
+ // DB WAL.
+ fault_env->DropUnsyncedFileData();
+
+ Reopen(options);
+
+ // Need to close before `fault_env` goes out of scope.
+ Close();
+}
+
+TEST_P(DBWriteTest, InactiveWalFullySyncedBeforeUntracked) {
+ // Repro bug where a WAL is appended and switched after
+ // `FlushWAL(true /* sync */)`'s sync finishes and before it untracks fully
+ // synced inactive logs. Previously such a WAL would be wrongly untracked
+ // so the final append would never be synced.
+ Options options = GetOptions();
+ std::unique_ptr<FaultInjectionTestEnv> fault_env(
+ new FaultInjectionTestEnv(env_));
+ options.env = fault_env.get();
+ Reopen(options);
+
+ ASSERT_OK(Put("key1", "val1"));
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::SyncWAL:BeforeMarkLogsSynced:1", [this](void* /* arg */) {
+ ASSERT_OK(Put("key2", "val2"));
+ ASSERT_OK(dbfull()->TEST_SwitchMemtable());
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(db_->FlushWAL(true /* sync */));
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+
+ ASSERT_OK(Put("key3", "val3"));
+
+ ASSERT_OK(db_->FlushWAL(true /* sync */));
+
+ Close();
+
+ // Simulate full loss of unsynced data. This should drop nothing since we did
+ // `FlushWAL(true /* sync */)` before `Close()`.
+ fault_env->DropUnsyncedFileData();
+
+ Reopen(options);
+
+ ASSERT_EQ("val1", Get("key1"));
+ ASSERT_EQ("val2", Get("key2"));
+ ASSERT_EQ("val3", Get("key3"));
+
+ // Need to close before `fault_env` goes out of scope.
+ Close();
+}
+
+TEST_P(DBWriteTest, IOErrorOnWALWriteTriggersReadOnlyMode) {
+ std::unique_ptr<FaultInjectionTestEnv> mock_env(
+ new FaultInjectionTestEnv(env_));
+ Options options = GetOptions();
+ options.env = mock_env.get();
+ Reopen(options);
+ for (int i = 0; i < 2; i++) {
+ // Forcibly fail WAL write for the first Put only. Subsequent Puts should
+ // fail due to read-only mode
+ mock_env->SetFilesystemActive(i != 0);
+ auto res = Put("key" + std::to_string(i), "value");
+ // TSAN reports a false alarm for lock-order-inversion but Open and
+ // FlushWAL are not run concurrently. Disabling this until TSAN is
+ // fixed.
+ /*
+ if (options.manual_wal_flush && i == 0) {
+ // even with manual_wal_flush the 2nd Put should return error because of
+ // the read-only mode
+ ASSERT_TRUE(res.ok());
+ // we should see fs error when we do the flush
+ res = dbfull()->FlushWAL(false);
+ }
+ */
+ if (!options.manual_wal_flush) {
+ ASSERT_NOK(res);
+ } else {
+ ASSERT_OK(res);
+ }
+ }
+ // Close before mock_env destruct.
+ Close();
+}
+
+TEST_P(DBWriteTest, IOErrorOnSwitchMemtable) {
+ Random rnd(301);
+ std::unique_ptr<FaultInjectionTestEnv> mock_env(
+ new FaultInjectionTestEnv(env_));
+ Options options = GetOptions();
+ options.env = mock_env.get();
+ options.writable_file_max_buffer_size = 4 * 1024 * 1024;
+ options.write_buffer_size = 3 * 512 * 1024;
+ options.wal_bytes_per_sync = 256 * 1024;
+ options.manual_wal_flush = true;
+ Reopen(options);
+ mock_env->SetFilesystemActive(false, Status::IOError("Not active"));
+ Status s;
+ for (int i = 0; i < 4 * 512; ++i) {
+ s = Put(Key(i), rnd.RandomString(1024));
+ if (!s.ok()) {
+ break;
+ }
+ }
+ ASSERT_EQ(s.severity(), Status::Severity::kFatalError);
+
+ mock_env->SetFilesystemActive(true);
+ // Close before mock_env destruct.
+ Close();
+}
+
+// Test that db->LockWAL() flushes the WAL after locking.
+TEST_P(DBWriteTest, LockWalInEffect) {
+ Options options = GetOptions();
+ Reopen(options);
+ // try the 1st WAL created during open
+ ASSERT_OK(Put("key" + std::to_string(0), "value"));
+ ASSERT_TRUE(options.manual_wal_flush != dbfull()->WALBufferIsEmpty());
+ ASSERT_OK(dbfull()->LockWAL());
+ ASSERT_TRUE(dbfull()->WALBufferIsEmpty(false));
+ ASSERT_OK(dbfull()->UnlockWAL());
+ // try the 2nd wal created during SwitchWAL
+ ASSERT_OK(dbfull()->TEST_SwitchWAL());
+ ASSERT_OK(Put("key" + std::to_string(0), "value"));
+ ASSERT_TRUE(options.manual_wal_flush != dbfull()->WALBufferIsEmpty());
+ ASSERT_OK(dbfull()->LockWAL());
+ ASSERT_TRUE(dbfull()->WALBufferIsEmpty(false));
+ ASSERT_OK(dbfull()->UnlockWAL());
+}
+
+TEST_P(DBWriteTest, ConcurrentlyDisabledWAL) {
+ Options options = GetOptions();
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ options.statistics->set_stats_level(StatsLevel::kAll);
+ Reopen(options);
+ std::string wal_key_prefix = "WAL_KEY_";
+ std::string no_wal_key_prefix = "K_";
+ // 100 KB value each for NO-WAL operation
+ std::string no_wal_value(1024 * 100, 'X');
+ // 1B value each for WAL operation
+ std::string wal_value = "0";
+ std::thread threads[10];
+ for (int t = 0; t < 10; t++) {
+ threads[t] = std::thread([t, wal_key_prefix, wal_value, no_wal_key_prefix,
+ no_wal_value, this] {
+ for (int i = 0; i < 10; i++) {
+ ROCKSDB_NAMESPACE::WriteOptions write_option_disable;
+ write_option_disable.disableWAL = true;
+ ROCKSDB_NAMESPACE::WriteOptions write_option_default;
+ std::string no_wal_key =
+ no_wal_key_prefix + std::to_string(t) + "_" + std::to_string(i);
+ ASSERT_OK(this->Put(no_wal_key, no_wal_value, write_option_disable));
+ std::string wal_key =
+ wal_key_prefix + std::to_string(i) + "_" + std::to_string(i);
+ ASSERT_OK(this->Put(wal_key, wal_value, write_option_default));
+ ASSERT_OK(dbfull()->SyncWAL());
+ }
+ return;
+ });
+ }
+ for (auto& t : threads) {
+ t.join();
+ }
+ uint64_t bytes_num = options.statistics->getTickerCount(
+ ROCKSDB_NAMESPACE::Tickers::WAL_FILE_BYTES);
+  // written WAL size should be less than 100KB (even including HEADER &
+  // FOOTER overhead)
+ ASSERT_LE(bytes_num, 1024 * 100);
+}
+
+INSTANTIATE_TEST_CASE_P(DBWriteTestInstance, DBWriteTest,
+ testing::Values(DBTestBase::kDefault,
+ DBTestBase::kConcurrentWALWrites,
+ DBTestBase::kPipelinedWrite));
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ RegisterCustomObjects(argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/dbformat.cc b/src/rocksdb/db/dbformat.cc
new file mode 100644
index 000000000..b0ac6c339
--- /dev/null
+++ b/src/rocksdb/db/dbformat.cc
@@ -0,0 +1,188 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#include "db/dbformat.h"
+
+#include <stdio.h>
+
+#include <cinttypes>
+
+#include "db/lookup_key.h"
+#include "monitoring/perf_context_imp.h"
+#include "port/port.h"
+#include "util/coding.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// kValueTypeForSeek defines the ValueType that should be passed when
+// constructing a ParsedInternalKey object for seeking to a particular
+// sequence number (since we sort sequence numbers in decreasing order
+// and the value type is embedded as the low 8 bits in the sequence
+// number in internal keys, we need to use the highest-numbered
+// ValueType, not the lowest).
+const ValueType kValueTypeForSeek = kTypeWideColumnEntity;
+const ValueType kValueTypeForSeekForPrev = kTypeDeletion;
+const std::string kDisableUserTimestamp("");
+
+EntryType GetEntryType(ValueType value_type) {
+ switch (value_type) {
+ case kTypeValue:
+ return kEntryPut;
+ case kTypeDeletion:
+ return kEntryDelete;
+ case kTypeDeletionWithTimestamp:
+ return kEntryDeleteWithTimestamp;
+ case kTypeSingleDeletion:
+ return kEntrySingleDelete;
+ case kTypeMerge:
+ return kEntryMerge;
+ case kTypeRangeDeletion:
+ return kEntryRangeDeletion;
+ case kTypeBlobIndex:
+ return kEntryBlobIndex;
+ case kTypeWideColumnEntity:
+ return kEntryWideColumnEntity;
+ default:
+ return kEntryOther;
+ }
+}
+
+void AppendInternalKey(std::string* result, const ParsedInternalKey& key) {
+ result->append(key.user_key.data(), key.user_key.size());
+ PutFixed64(result, PackSequenceAndType(key.sequence, key.type));
+}
+
+void AppendInternalKeyWithDifferentTimestamp(std::string* result,
+ const ParsedInternalKey& key,
+ const Slice& ts) {
+ assert(key.user_key.size() >= ts.size());
+ result->append(key.user_key.data(), key.user_key.size() - ts.size());
+ result->append(ts.data(), ts.size());
+ PutFixed64(result, PackSequenceAndType(key.sequence, key.type));
+}
+
+void AppendInternalKeyFooter(std::string* result, SequenceNumber s,
+ ValueType t) {
+ PutFixed64(result, PackSequenceAndType(s, t));
+}
+
+void AppendKeyWithMinTimestamp(std::string* result, const Slice& key,
+ size_t ts_sz) {
+ assert(ts_sz > 0);
+ const std::string kTsMin(ts_sz, static_cast<unsigned char>(0));
+ result->append(key.data(), key.size());
+ result->append(kTsMin.data(), ts_sz);
+}
+
+void AppendKeyWithMaxTimestamp(std::string* result, const Slice& key,
+ size_t ts_sz) {
+ assert(ts_sz > 0);
+ const std::string kTsMax(ts_sz, static_cast<unsigned char>(0xff));
+ result->append(key.data(), key.size());
+ result->append(kTsMax.data(), ts_sz);
+}
+
+void AppendUserKeyWithMaxTimestamp(std::string* result, const Slice& key,
+ size_t ts_sz) {
+ assert(ts_sz > 0);
+ result->append(key.data(), key.size() - ts_sz);
+
+ static constexpr char kTsMax[] = "\xff\xff\xff\xff\xff\xff\xff\xff\xff";
+ if (ts_sz < strlen(kTsMax)) {
+ result->append(kTsMax, ts_sz);
+ } else {
+ result->append(std::string(ts_sz, '\xff'));
+ }
+}
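+
+// Illustrative behavior of the three helpers above (a sketch derived from the
+// code, not an additional API): with a std::string r and ts_sz == 2,
+//   AppendKeyWithMinTimestamp(&r, "k", 2) appends "k\x00\x00" to r,
+//   AppendKeyWithMaxTimestamp(&r, "k", 2) appends "k\xff\xff" to r, and
+//   AppendUserKeyWithMaxTimestamp(&r, "k\x01\x02", 2) strips the existing
+//   2-byte timestamp and appends "k\xff\xff" to r.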
+
+std::string ParsedInternalKey::DebugString(bool log_err_key, bool hex) const {
+ std::string result = "'";
+ if (log_err_key) {
+ result += user_key.ToString(hex);
+ } else {
+ result += "<redacted>";
+ }
+
+ char buf[50];
+ snprintf(buf, sizeof(buf), "' seq:%" PRIu64 ", type:%d", sequence,
+ static_cast<int>(type));
+
+ result += buf;
+ return result;
+}
+
+std::string InternalKey::DebugString(bool hex) const {
+ std::string result;
+ ParsedInternalKey parsed;
+ if (ParseInternalKey(rep_, &parsed, false /* log_err_key */).ok()) {
+ result = parsed.DebugString(true /* log_err_key */, hex); // TODO
+ } else {
+ result = "(bad)";
+ result.append(EscapeString(rep_));
+ }
+ return result;
+}
+
+int InternalKeyComparator::Compare(const ParsedInternalKey& a,
+ const ParsedInternalKey& b) const {
+ // Order by:
+ // increasing user key (according to user-supplied comparator)
+ // decreasing sequence number
+ // decreasing type (though sequence# should be enough to disambiguate)
+ int r = user_comparator_.Compare(a.user_key, b.user_key);
+ if (r == 0) {
+ if (a.sequence > b.sequence) {
+ r = -1;
+ } else if (a.sequence < b.sequence) {
+ r = +1;
+ } else if (a.type > b.type) {
+ r = -1;
+ } else if (a.type < b.type) {
+ r = +1;
+ }
+ }
+ return r;
+}
+
+LookupKey::LookupKey(const Slice& _user_key, SequenceNumber s,
+ const Slice* ts) {
+ size_t usize = _user_key.size();
+ size_t ts_sz = (nullptr == ts) ? 0 : ts->size();
+ size_t needed = usize + ts_sz + 13; // A conservative estimate
+ char* dst;
+ if (needed <= sizeof(space_)) {
+ dst = space_;
+ } else {
+ dst = new char[needed];
+ }
+ start_ = dst;
+  // NOTE: We don't support user keys of more than 2GB :)
+ dst = EncodeVarint32(dst, static_cast<uint32_t>(usize + ts_sz + 8));
+ kstart_ = dst;
+ memcpy(dst, _user_key.data(), usize);
+ dst += usize;
+ if (nullptr != ts) {
+ memcpy(dst, ts->data(), ts_sz);
+ dst += ts_sz;
+ }
+ EncodeFixed64(dst, PackSequenceAndType(s, kValueTypeForSeek));
+ dst += 8;
+ end_ = dst;
+}
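+
+// For reference, the buffer assembled above is laid out as follows (a sketch
+// based on the code; the `13` in `needed` is a conservative bound covering a
+// 5-byte varint32 plus the 8-byte footer):
+//
+//   start_  -> varint32(usize + ts_sz + 8)
+//   kstart_ -> user key bytes (usize), then optional timestamp (ts_sz),
+//              then fixed64((s << 8) | kValueTypeForSeek)
+//   end_    -> one past the footer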
+
+void IterKey::EnlargeBuffer(size_t key_size) {
+ // If size is smaller than buffer size, continue using current buffer,
+ // or the static allocated one, as default
+ assert(key_size > buf_size_);
+ // Need to enlarge the buffer.
+ ResetBuffer();
+ buf_ = new char[key_size];
+ buf_size_ = key_size;
+}
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/dbformat.h b/src/rocksdb/db/dbformat.h
new file mode 100644
index 000000000..8c1fc7055
--- /dev/null
+++ b/src/rocksdb/db/dbformat.h
@@ -0,0 +1,865 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <stdio.h>
+
+#include <memory>
+#include <string>
+#include <utility>
+
+#include "rocksdb/comparator.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/types.h"
+#include "util/coding.h"
+#include "util/user_comparator_wrapper.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// This file declares data structures and functions that deal with internal
+// keys.
+// Each internal key contains a user key, a sequence number (SequenceNumber)
+// and a type (ValueType); these are usually encoded together.
+// There are also some related helper classes here.
+
+class InternalKey;
+
+// Value types encoded as the last component of internal keys.
+// DO NOT CHANGE THESE ENUM VALUES: they are embedded in the on-disk
+// data structures.
+// The highest bit of the value type needs to be reserved to SST tables
+// for them to do more flexible encoding.
+enum ValueType : unsigned char {
+ kTypeDeletion = 0x0,
+ kTypeValue = 0x1,
+ kTypeMerge = 0x2,
+ kTypeLogData = 0x3, // WAL only.
+ kTypeColumnFamilyDeletion = 0x4, // WAL only.
+ kTypeColumnFamilyValue = 0x5, // WAL only.
+ kTypeColumnFamilyMerge = 0x6, // WAL only.
+ kTypeSingleDeletion = 0x7,
+ kTypeColumnFamilySingleDeletion = 0x8, // WAL only.
+ kTypeBeginPrepareXID = 0x9, // WAL only.
+ kTypeEndPrepareXID = 0xA, // WAL only.
+ kTypeCommitXID = 0xB, // WAL only.
+ kTypeRollbackXID = 0xC, // WAL only.
+ kTypeNoop = 0xD, // WAL only.
+ kTypeColumnFamilyRangeDeletion = 0xE, // WAL only.
+ kTypeRangeDeletion = 0xF, // meta block
+ kTypeColumnFamilyBlobIndex = 0x10, // Blob DB only
+ kTypeBlobIndex = 0x11, // Blob DB only
+ // When the prepared record is also persisted in db, we use a different
+ // record. This is to ensure that the WAL that is generated by a WritePolicy
+  // is not mistakenly read by another, which would result in data
+  // inconsistency.
+ kTypeBeginPersistedPrepareXID = 0x12, // WAL only.
+ // Similar to kTypeBeginPersistedPrepareXID, this is to ensure that WAL
+ // generated by WriteUnprepared write policy is not mistakenly read by
+ // another.
+ kTypeBeginUnprepareXID = 0x13, // WAL only.
+ kTypeDeletionWithTimestamp = 0x14,
+ kTypeCommitXIDAndTimestamp = 0x15, // WAL only
+ kTypeWideColumnEntity = 0x16,
+ kTypeColumnFamilyWideColumnEntity = 0x17, // WAL only
+ kTypeMaxValid, // Should be after the last valid type, only used for
+ // validation
+ kMaxValue = 0x7F // Not used for storing records.
+};
+
+// Defined in dbformat.cc
+extern const ValueType kValueTypeForSeek;
+extern const ValueType kValueTypeForSeekForPrev;
+
+// Checks whether a type is an inline value type
+// (i.e. a type used in memtable skiplist and sst file datablock).
+inline bool IsValueType(ValueType t) {
+ return t <= kTypeMerge || kTypeSingleDeletion == t || kTypeBlobIndex == t ||
+ kTypeDeletionWithTimestamp == t || kTypeWideColumnEntity == t;
+}
+
+// Checks whether a type is from user operation
+// kTypeRangeDeletion is in meta block so this API is separated from above
+inline bool IsExtendedValueType(ValueType t) {
+ return IsValueType(t) || t == kTypeRangeDeletion;
+}
+
+// We leave eight bits empty at the bottom so a type and sequence#
+// can be packed together into 64-bits.
+static const SequenceNumber kMaxSequenceNumber = ((0x1ull << 56) - 1);
+
+static const SequenceNumber kDisableGlobalSequenceNumber =
+ std::numeric_limits<uint64_t>::max();
+
+constexpr uint64_t kNumInternalBytes = 8;
+
+// Defined in dbformat.cc
+extern const std::string kDisableUserTimestamp;
+
+// The data structure that represents an internal key with the user_key,
+// sequence number and type stored as separate fields.
+struct ParsedInternalKey {
+ Slice user_key;
+ SequenceNumber sequence;
+ ValueType type;
+
+ ParsedInternalKey()
+ : sequence(kMaxSequenceNumber),
+ type(kTypeDeletion) // Make code analyzer happy
+ {} // Intentionally left uninitialized (for speed)
+ // u contains timestamp if user timestamp feature is enabled.
+ ParsedInternalKey(const Slice& u, const SequenceNumber& seq, ValueType t)
+ : user_key(u), sequence(seq), type(t) {}
+ std::string DebugString(bool log_err_key, bool hex) const;
+
+ void clear() {
+ user_key.clear();
+ sequence = 0;
+ type = kTypeDeletion;
+ }
+
+ void SetTimestamp(const Slice& ts) {
+ assert(ts.size() <= user_key.size());
+ const char* addr = user_key.data() + user_key.size() - ts.size();
+ memcpy(const_cast<char*>(addr), ts.data(), ts.size());
+ }
+
+ Slice GetTimestamp(size_t ts_sz) {
+ assert(ts_sz <= user_key.size());
+ const char* addr = user_key.data() + user_key.size() - ts_sz;
+ return Slice(const_cast<char*>(addr), ts_sz);
+ }
+};
+
+// Return the length of the encoding of "key".
+inline size_t InternalKeyEncodingLength(const ParsedInternalKey& key) {
+ return key.user_key.size() + kNumInternalBytes;
+}
+
+// Pack a sequence number and a ValueType into a uint64_t
+inline uint64_t PackSequenceAndType(uint64_t seq, ValueType t) {
+ assert(seq <= kMaxSequenceNumber);
+ // kTypeMaxValid is used in TruncatedRangeDelIterator, see its constructor.
+ assert(IsExtendedValueType(t) || t == kTypeMaxValid);
+ return (seq << 8) | t;
+}
+
+// Given the result of PackSequenceAndType, store the sequence number in *seq
+// and the ValueType in *t.
+inline void UnPackSequenceAndType(uint64_t packed, uint64_t* seq,
+ ValueType* t) {
+ *seq = packed >> 8;
+ *t = static_cast<ValueType>(packed & 0xff);
+
+ // Commented the following two assertions in order to test key-value checksum
+ // on corrupted keys without crashing ("DbKvChecksumTest").
+ // assert(*seq <= kMaxSequenceNumber);
+ // assert(IsExtendedValueType(*t));
+}
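+
+// Minimal round-trip sketch of the packing above (illustrative comment only):
+//
+//   uint64_t packed = PackSequenceAndType(/*seq=*/42, kTypeValue);
+//   // packed == (42ull << 8) | 0x1
+//   uint64_t seq = 0;
+//   ValueType type = kTypeDeletion;
+//   UnPackSequenceAndType(packed, &seq, &type);
+//   // seq == 42, type == kTypeValue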
+
+EntryType GetEntryType(ValueType value_type);
+
+// Append the serialization of "key" to *result.
+extern void AppendInternalKey(std::string* result,
+ const ParsedInternalKey& key);
+
+// Append the serialization of "key" to *result, replacing the original
+// timestamp with argument ts.
+extern void AppendInternalKeyWithDifferentTimestamp(
+ std::string* result, const ParsedInternalKey& key, const Slice& ts);
+
+// Serialized internal key consists of user key followed by footer.
+// This function appends the footer to *result, assuming that *result already
+// contains the user key at the end.
+extern void AppendInternalKeyFooter(std::string* result, SequenceNumber s,
+ ValueType t);
+
+// Append the key and a minimal timestamp to *result
+extern void AppendKeyWithMinTimestamp(std::string* result, const Slice& key,
+ size_t ts_sz);
+
+// Append the key and a maximal timestamp to *result
+extern void AppendKeyWithMaxTimestamp(std::string* result, const Slice& key,
+ size_t ts_sz);
+
+// `key` is a user key with timestamp. Append the user key without timestamp
+// and the maximal timestamp to *result.
+extern void AppendUserKeyWithMaxTimestamp(std::string* result, const Slice& key,
+ size_t ts_sz);
+
+// Attempt to parse an internal key from "internal_key". On success,
+// stores the parsed data in "*result" and returns Status::OK().
+//
+// On error, returns a non-OK status and leaves "*result" in an undefined
+// state.
+extern Status ParseInternalKey(const Slice& internal_key,
+ ParsedInternalKey* result, bool log_err_key);
+
+// Returns the user key portion of an internal key.
+inline Slice ExtractUserKey(const Slice& internal_key) {
+ assert(internal_key.size() >= kNumInternalBytes);
+ return Slice(internal_key.data(), internal_key.size() - kNumInternalBytes);
+}
+
+inline Slice ExtractUserKeyAndStripTimestamp(const Slice& internal_key,
+ size_t ts_sz) {
+ Slice ret = internal_key;
+ ret.remove_suffix(kNumInternalBytes + ts_sz);
+ return ret;
+}
+
+inline Slice StripTimestampFromUserKey(const Slice& user_key, size_t ts_sz) {
+ Slice ret = user_key;
+ ret.remove_suffix(ts_sz);
+ return ret;
+}
+
+inline Slice ExtractTimestampFromUserKey(const Slice& user_key, size_t ts_sz) {
+ assert(user_key.size() >= ts_sz);
+ return Slice(user_key.data() + user_key.size() - ts_sz, ts_sz);
+}
+
+inline Slice ExtractTimestampFromKey(const Slice& internal_key, size_t ts_sz) {
+ const size_t key_size = internal_key.size();
+ assert(key_size >= kNumInternalBytes + ts_sz);
+ return Slice(internal_key.data() + key_size - ts_sz - kNumInternalBytes,
+ ts_sz);
+}
+
+inline uint64_t ExtractInternalKeyFooter(const Slice& internal_key) {
+ assert(internal_key.size() >= kNumInternalBytes);
+ const size_t n = internal_key.size();
+ return DecodeFixed64(internal_key.data() + n - kNumInternalBytes);
+}
+
+inline ValueType ExtractValueType(const Slice& internal_key) {
+ uint64_t num = ExtractInternalKeyFooter(internal_key);
+ unsigned char c = num & 0xff;
+ return static_cast<ValueType>(c);
+}
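+
+// For reference, the byte layout assumed by the extractors above (and produced
+// by AppendInternalKey()) is:
+//
+//   user_key (n - 8 bytes, possibly ending in a timestamp),
+//   followed by an 8-byte little-endian fixed64 of (sequence << 8) | type.
+//
+// Illustrative use (a sketch, not part of the build):
+//
+//   std::string ikey;
+//   AppendInternalKey(&ikey, ParsedInternalKey("foo", /*seq=*/7, kTypeValue));
+//   Slice user = ExtractUserKey(ikey);     // "foo"
+//   ValueType t = ExtractValueType(ikey);  // kTypeValue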
+
+// A comparator for internal keys that uses a specified comparator for
+// the user key portion and breaks ties by decreasing sequence number.
+class InternalKeyComparator
+#ifdef NDEBUG
+ final
+#endif
+ : public CompareInterface {
+ private:
+ UserComparatorWrapper user_comparator_;
+
+ public:
+ // `InternalKeyComparator`s constructed with the default constructor are not
+ // usable and will segfault on any attempt to use them for comparisons.
+ InternalKeyComparator() = default;
+
+  // Wraps the user comparator `c`, which must remain valid for the lifetime
+  // of this object.
+ explicit InternalKeyComparator(const Comparator* c) : user_comparator_(c) {}
+ virtual ~InternalKeyComparator() {}
+
+ int Compare(const Slice& a, const Slice& b) const override;
+
+ bool Equal(const Slice& a, const Slice& b) const {
+ // TODO Use user_comparator_.Equal(). Perhaps compare seqno before
+ // comparing the user key too.
+ return Compare(a, b) == 0;
+ }
+
+ // Same as Compare except that it excludes the value type from comparison
+ int CompareKeySeq(const Slice& a, const Slice& b) const;
+
+ const Comparator* user_comparator() const {
+ return user_comparator_.user_comparator();
+ }
+
+ int Compare(const InternalKey& a, const InternalKey& b) const;
+ int Compare(const ParsedInternalKey& a, const ParsedInternalKey& b) const;
+ // In this `Compare()` overload, the sequence numbers provided in
+ // `a_global_seqno` and `b_global_seqno` override the sequence numbers in `a`
+ // and `b`, respectively. To disable sequence number override(s), provide the
+ // value `kDisableGlobalSequenceNumber`.
+ int Compare(const Slice& a, SequenceNumber a_global_seqno, const Slice& b,
+ SequenceNumber b_global_seqno) const;
+};
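+
+// Ordering sketch (illustrative): with equal user keys, a larger sequence
+// number compares *smaller*, so the newest entry for a key comes first:
+//
+//   InternalKeyComparator icmp(BytewiseComparator());
+//   InternalKey newer("foo", /*s=*/200, kTypeValue);
+//   InternalKey older("foo", /*s=*/100, kTypeValue);
+//   assert(icmp.Compare(newer, older) < 0);  // newer sorts before older
+//   InternalKey other("goo", /*s=*/1, kTypeValue);
+//   assert(icmp.Compare(older, other) < 0);  // user key order dominates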
+
+// This class represents the internal key in encoded form.
+class InternalKey {
+ private:
+ std::string rep_;
+
+ public:
+ InternalKey() {} // Leave rep_ as empty to indicate it is invalid
+ InternalKey(const Slice& _user_key, SequenceNumber s, ValueType t) {
+ AppendInternalKey(&rep_, ParsedInternalKey(_user_key, s, t));
+ }
+ InternalKey(const Slice& _user_key, SequenceNumber s, ValueType t, Slice ts) {
+ AppendInternalKeyWithDifferentTimestamp(
+ &rep_, ParsedInternalKey(_user_key, s, t), ts);
+ }
+
+  // Sets the internal key to be bigger than or equal to all internal keys
+  // with this user key.
+ void SetMaxPossibleForUserKey(const Slice& _user_key) {
+ AppendInternalKey(
+ &rep_, ParsedInternalKey(_user_key, 0, static_cast<ValueType>(0)));
+ }
+
+  // Sets the internal key to be smaller than or equal to all internal keys
+  // with this user key.
+ void SetMinPossibleForUserKey(const Slice& _user_key) {
+ AppendInternalKey(&rep_, ParsedInternalKey(_user_key, kMaxSequenceNumber,
+ kValueTypeForSeek));
+ }
+
+ bool Valid() const {
+ ParsedInternalKey parsed;
+ return (ParseInternalKey(Slice(rep_), &parsed, false /* log_err_key */)
+ .ok()); // TODO
+ }
+
+ void DecodeFrom(const Slice& s) { rep_.assign(s.data(), s.size()); }
+ Slice Encode() const {
+ assert(!rep_.empty());
+ return rep_;
+ }
+
+ Slice user_key() const { return ExtractUserKey(rep_); }
+ size_t size() const { return rep_.size(); }
+
+ void Set(const Slice& _user_key, SequenceNumber s, ValueType t) {
+ SetFrom(ParsedInternalKey(_user_key, s, t));
+ }
+
+ void Set(const Slice& _user_key_with_ts, SequenceNumber s, ValueType t,
+ const Slice& ts) {
+ ParsedInternalKey pik = ParsedInternalKey(_user_key_with_ts, s, t);
+ // Should not call pik.SetTimestamp() directly as it overwrites the buffer
+ // containing _user_key.
+ SetFrom(pik, ts);
+ }
+
+ void SetFrom(const ParsedInternalKey& p) {
+ rep_.clear();
+ AppendInternalKey(&rep_, p);
+ }
+
+ void SetFrom(const ParsedInternalKey& p, const Slice& ts) {
+ rep_.clear();
+ AppendInternalKeyWithDifferentTimestamp(&rep_, p, ts);
+ }
+
+ void Clear() { rep_.clear(); }
+
+ // The underlying representation.
+ // Intended only to be used together with ConvertFromUserKey().
+ std::string* rep() { return &rep_; }
+
+ // Assuming that *rep() contains a user key, this method makes internal key
+ // out of it in-place. This saves a memcpy compared to Set()/SetFrom().
+ void ConvertFromUserKey(SequenceNumber s, ValueType t) {
+ AppendInternalKeyFooter(&rep_, s, t);
+ }
+
+ std::string DebugString(bool hex) const;
+};
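+
+// Note how the seek bounds above exploit the inverted sequence ordering
+// (illustrative; `icmp` as in the ordering sketch following
+// InternalKeyComparator):
+//
+//   InternalKey lo, hi;
+//   lo.SetMinPossibleForUserKey("foo");  // seq = kMaxSequenceNumber
+//   hi.SetMaxPossibleForUserKey("foo");  // seq = 0, type = 0
+//   // For any InternalKey k("foo", s, t):
+//   //   icmp.Compare(lo, k) <= 0 and icmp.Compare(k, hi) <= 0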
+
+inline int InternalKeyComparator::Compare(const InternalKey& a,
+ const InternalKey& b) const {
+ return Compare(a.Encode(), b.Encode());
+}
+
+inline Status ParseInternalKey(const Slice& internal_key,
+ ParsedInternalKey* result, bool log_err_key) {
+ const size_t n = internal_key.size();
+
+ if (n < kNumInternalBytes) {
+ return Status::Corruption("Corrupted Key: Internal Key too small. Size=" +
+ std::to_string(n) + ". ");
+ }
+
+ uint64_t num = DecodeFixed64(internal_key.data() + n - kNumInternalBytes);
+ unsigned char c = num & 0xff;
+ result->sequence = num >> 8;
+ result->type = static_cast<ValueType>(c);
+ assert(result->type <= ValueType::kMaxValue);
+ result->user_key = Slice(internal_key.data(), n - kNumInternalBytes);
+
+ if (IsExtendedValueType(result->type)) {
+ return Status::OK();
+ } else {
+ return Status::Corruption("Corrupted Key",
+ result->DebugString(log_err_key, true));
+ }
+}
+
+// Update the sequence number in the internal key.
+// Guarantees not to invalidate ikey.data().
+inline void UpdateInternalKey(std::string* ikey, uint64_t seq, ValueType t) {
+ size_t ikey_sz = ikey->size();
+ assert(ikey_sz >= kNumInternalBytes);
+ uint64_t newval = (seq << 8) | t;
+
+ // Note: Since C++11, strings are guaranteed to be stored contiguously and
+ // string::operator[]() is guaranteed not to change ikey.data().
+ EncodeFixed64(&(*ikey)[ikey_sz - kNumInternalBytes], newval);
+}
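+
+// Illustrative use of UpdateInternalKey() (mirrors the encoding above; also
+// exercised by FormatTest.UpdateInternalKey):
+//
+//   std::string ikey;
+//   AppendInternalKey(&ikey, ParsedInternalKey("k", /*seq=*/100, kTypeValue));
+//   UpdateInternalKey(&ikey, /*seq=*/200, kTypeDeletion);
+//   // GetInternalKeySeqno(ikey) == 200, ExtractValueType(ikey) ==
+//   // kTypeDeletion, and ikey.size() is unchanged.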
+
+// Get the sequence number from the internal key
+inline uint64_t GetInternalKeySeqno(const Slice& internal_key) {
+ const size_t n = internal_key.size();
+ assert(n >= kNumInternalBytes);
+ uint64_t num = DecodeFixed64(internal_key.data() + n - kNumInternalBytes);
+ return num >> 8;
+}
+
+// A class to store keys in an efficient way. It allows:
+// 1. Users to either copy the key into it, or have it point to an unowned
+//    address.
+// 2. A short inline buffer for copied keys, to reduce memory allocation for
+//    smaller keys.
+// 3. Tracking whether it holds a user key or an internal key, and conversion
+//    between them.
+class IterKey {
+ public:
+ IterKey()
+ : buf_(space_),
+ key_(buf_),
+ key_size_(0),
+ buf_size_(sizeof(space_)),
+ is_user_key_(true) {}
+ // No copying allowed
+ IterKey(const IterKey&) = delete;
+ void operator=(const IterKey&) = delete;
+
+ ~IterKey() { ResetBuffer(); }
+
+ // The bool will be picked up by the next calls to SetKey
+ void SetIsUserKey(bool is_user_key) { is_user_key_ = is_user_key; }
+
+  // Returns the key in whichever format it was provided to IterKey.
+  // If user-defined timestamp is enabled, then the timestamp is included in
+  // the returned result.
+ Slice GetKey() const { return Slice(key_, key_size_); }
+
+ Slice GetInternalKey() const {
+ assert(!IsUserKey());
+ return Slice(key_, key_size_);
+ }
+
+ // If user-defined timestamp is enabled, then timestamp is included in the
+ // return result of GetUserKey();
+ Slice GetUserKey() const {
+ if (IsUserKey()) {
+ return Slice(key_, key_size_);
+ } else {
+ assert(key_size_ >= kNumInternalBytes);
+ return Slice(key_, key_size_ - kNumInternalBytes);
+ }
+ }
+
+ size_t Size() const { return key_size_; }
+
+ void Clear() { key_size_ = 0; }
+
+  // Append "non_shared_data" after the first "shared_len" bytes of the
+  // existing key. This function is used in Block::Iter::ParseNextKey.
+  // shared_len: bytes in [0, shared_len-1] of the existing key are retained
+  // non_shared_data: data to be appended; its length must be >= non_shared_len
+ void TrimAppend(const size_t shared_len, const char* non_shared_data,
+ const size_t non_shared_len) {
+ assert(shared_len <= key_size_);
+ size_t total_size = shared_len + non_shared_len;
+
+ if (IsKeyPinned() /* key is not in buf_ */) {
+ // Copy the key from external memory to buf_ (copy shared_len bytes)
+ EnlargeBufferIfNeeded(total_size);
+ memcpy(buf_, key_, shared_len);
+ } else if (total_size > buf_size_) {
+ // Need to allocate space, delete previous space
+ char* p = new char[total_size];
+ memcpy(p, key_, shared_len);
+
+ if (buf_ != space_) {
+ delete[] buf_;
+ }
+
+ buf_ = p;
+ buf_size_ = total_size;
+ }
+
+ memcpy(buf_ + shared_len, non_shared_data, non_shared_len);
+ key_ = buf_;
+ key_size_ = total_size;
+ }
+
+ Slice SetKey(const Slice& key, bool copy = true) {
+ // is_user_key_ expected to be set already via SetIsUserKey
+ return SetKeyImpl(key, copy);
+ }
+
+ // If user-defined timestamp is enabled, then `key` includes timestamp.
+  // TODO(yanqin) this is also used to set prefixes, which do not include
+  // timestamps. Should be handled.
+ Slice SetUserKey(const Slice& key, bool copy = true) {
+ is_user_key_ = true;
+ return SetKeyImpl(key, copy);
+ }
+
+ Slice SetInternalKey(const Slice& key, bool copy = true) {
+ is_user_key_ = false;
+ return SetKeyImpl(key, copy);
+ }
+
+ // Copies the content of key, updates the reference to the user key in ikey
+ // and returns a Slice referencing the new copy.
+ Slice SetInternalKey(const Slice& key, ParsedInternalKey* ikey) {
+ size_t key_n = key.size();
+ assert(key_n >= kNumInternalBytes);
+ SetInternalKey(key);
+ ikey->user_key = Slice(key_, key_n - kNumInternalBytes);
+ return Slice(key_, key_n);
+ }
+
+  // Copy the key into IterKey's own buf_
+ void OwnKey() {
+ assert(IsKeyPinned() == true);
+
+ Reserve(key_size_);
+ memcpy(buf_, key_, key_size_);
+ key_ = buf_;
+ }
+
+ // Update the sequence number in the internal key. Guarantees not to
+ // invalidate slices to the key (and the user key).
+ void UpdateInternalKey(uint64_t seq, ValueType t, const Slice* ts = nullptr) {
+ assert(!IsKeyPinned());
+ assert(key_size_ >= kNumInternalBytes);
+ if (ts) {
+ assert(key_size_ >= kNumInternalBytes + ts->size());
+ memcpy(&buf_[key_size_ - kNumInternalBytes - ts->size()], ts->data(),
+ ts->size());
+ }
+ uint64_t newval = (seq << 8) | t;
+ EncodeFixed64(&buf_[key_size_ - kNumInternalBytes], newval);
+ }
+
+ bool IsKeyPinned() const { return (key_ != buf_); }
+
+ // If `ts` is provided, user_key should not contain timestamp,
+ // and `ts` is appended after user_key.
+ // TODO: more efficient storage for timestamp.
+ void SetInternalKey(const Slice& key_prefix, const Slice& user_key,
+ SequenceNumber s,
+ ValueType value_type = kValueTypeForSeek,
+ const Slice* ts = nullptr) {
+ size_t psize = key_prefix.size();
+ size_t usize = user_key.size();
+ size_t ts_sz = (ts != nullptr ? ts->size() : 0);
+ EnlargeBufferIfNeeded(psize + usize + sizeof(uint64_t) + ts_sz);
+ if (psize > 0) {
+ memcpy(buf_, key_prefix.data(), psize);
+ }
+ memcpy(buf_ + psize, user_key.data(), usize);
+ if (ts) {
+ memcpy(buf_ + psize + usize, ts->data(), ts_sz);
+ }
+ EncodeFixed64(buf_ + usize + psize + ts_sz,
+ PackSequenceAndType(s, value_type));
+
+ key_ = buf_;
+ key_size_ = psize + usize + sizeof(uint64_t) + ts_sz;
+ is_user_key_ = false;
+ }
+
+ void SetInternalKey(const Slice& user_key, SequenceNumber s,
+ ValueType value_type = kValueTypeForSeek,
+ const Slice* ts = nullptr) {
+ SetInternalKey(Slice(), user_key, s, value_type, ts);
+ }
+
+ void Reserve(size_t size) {
+ EnlargeBufferIfNeeded(size);
+ key_size_ = size;
+ }
+
+ void SetInternalKey(const ParsedInternalKey& parsed_key) {
+ SetInternalKey(Slice(), parsed_key);
+ }
+
+ void SetInternalKey(const Slice& key_prefix,
+ const ParsedInternalKey& parsed_key_suffix) {
+ SetInternalKey(key_prefix, parsed_key_suffix.user_key,
+ parsed_key_suffix.sequence, parsed_key_suffix.type);
+ }
+
+ void EncodeLengthPrefixedKey(const Slice& key) {
+ auto size = key.size();
+ EnlargeBufferIfNeeded(size + static_cast<size_t>(VarintLength(size)));
+ char* ptr = EncodeVarint32(buf_, static_cast<uint32_t>(size));
+ memcpy(ptr, key.data(), size);
+ key_ = buf_;
+ is_user_key_ = true;
+ }
+
+ bool IsUserKey() const { return is_user_key_; }
+
+ private:
+ char* buf_;
+ const char* key_;
+ size_t key_size_;
+ size_t buf_size_;
+ char space_[32]; // Avoid allocation for short keys
+ bool is_user_key_;
+
+ Slice SetKeyImpl(const Slice& key, bool copy) {
+ size_t size = key.size();
+ if (copy) {
+ // Copy key to buf_
+ EnlargeBufferIfNeeded(size);
+ memcpy(buf_, key.data(), size);
+ key_ = buf_;
+ } else {
+ // Update key_ to point to external memory
+ key_ = key.data();
+ }
+ key_size_ = size;
+ return Slice(key_, key_size_);
+ }
+
+ void ResetBuffer() {
+ if (buf_ != space_) {
+ delete[] buf_;
+ buf_ = space_;
+ }
+ buf_size_ = sizeof(space_);
+ key_size_ = 0;
+ }
+
+ // Enlarge the buffer size if needed based on key_size.
+  // By default, the statically allocated buffer is used. Once a key larger
+  // than the static buffer is seen, another buffer is dynamically allocated
+  // and kept until an even larger key buffer is requested. In that case, we
+  // reallocate the buffer and delete the old one.
+ void EnlargeBufferIfNeeded(size_t key_size) {
+ // If size is smaller than buffer size, continue using current buffer,
+ // or the static allocated one, as default
+ if (key_size > buf_size_) {
+ EnlargeBuffer(key_size);
+ }
+ }
+
+ void EnlargeBuffer(size_t key_size);
+};
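+
+// Minimal IterKey usage sketch (illustrative): keys whose encoded size fits in
+// the 32-byte space_ buffer stay inline; larger keys trigger EnlargeBuffer().
+//
+//   IterKey ik;
+//   ik.SetInternalKey("user_key", /*s=*/5, kTypeValue);
+//   Slice ukey = ik.GetUserKey();      // "user_key"
+//   Slice ikey = ik.GetInternalKey();  // "user_key" plus the 8-byte footer
+//   ik.UpdateInternalKey(/*seq=*/6, kTypeValue);  // rewrites footer in place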
+
+// Convert from a SliceTransform of user keys, to a SliceTransform of
+// internal keys.
+class InternalKeySliceTransform : public SliceTransform {
+ public:
+ explicit InternalKeySliceTransform(const SliceTransform* transform)
+ : transform_(transform) {}
+
+ virtual const char* Name() const override { return transform_->Name(); }
+
+ virtual Slice Transform(const Slice& src) const override {
+ auto user_key = ExtractUserKey(src);
+ return transform_->Transform(user_key);
+ }
+
+ virtual bool InDomain(const Slice& src) const override {
+ auto user_key = ExtractUserKey(src);
+ return transform_->InDomain(user_key);
+ }
+
+ virtual bool InRange(const Slice& dst) const override {
+ auto user_key = ExtractUserKey(dst);
+ return transform_->InRange(user_key);
+ }
+
+ const SliceTransform* user_prefix_extractor() const { return transform_; }
+
+ private:
+  // Like the comparator, InternalKeySliceTransform does not take care of the
+  // deletion of transform_
+ const SliceTransform* const transform_;
+};
+
+// Read the key of a record from a write batch.
+// if this record represent the default column family then cf_record
+// must be passed as false, otherwise it must be passed as true.
+extern bool ReadKeyFromWriteBatchEntry(Slice* input, Slice* key,
+ bool cf_record);
+
+// Read record from a write batch piece from input.
+// tag, column_family, key, value and blob are return values. Callers own the
+// slice they point to.
+// Tag is defined as ValueType.
+// input will be advanced to after the record.
+// If user-defined timestamp is enabled for a column family, then the `key`
+// resulting from this call will include timestamp.
+extern Status ReadRecordFromWriteBatch(Slice* input, char* tag,
+ uint32_t* column_family, Slice* key,
+ Slice* value, Slice* blob, Slice* xid);
+
+// When a user calls DeleteRange() to delete a range of keys,
+// we store a serialized RangeTombstone in MemTable and SST.
+// The struct here is an easier-to-understand form of that record:
+// start_key_/end_key_ are the start/end user keys of the range to be deleted.
+struct RangeTombstone {
+ Slice start_key_;
+ Slice end_key_;
+ SequenceNumber seq_;
+ // TODO: we should optimize the storage here when user-defined timestamp
+ // is NOT enabled: they currently take up (16 + 32 + 32) bytes per tombstone.
+ Slice ts_;
+ std::string pinned_start_key_;
+ std::string pinned_end_key_;
+
+ RangeTombstone() = default;
+ RangeTombstone(Slice sk, Slice ek, SequenceNumber sn)
+ : start_key_(sk), end_key_(ek), seq_(sn) {}
+
+  // When user-defined timestamp is enabled, `sk` and `ek` should be user keys
+  // with timestamp, and `ts` will replace the timestamps in `sk` and
+  // `ek`.
+ RangeTombstone(Slice sk, Slice ek, SequenceNumber sn, Slice ts)
+ : seq_(sn), ts_(ts) {
+ assert(!ts.empty());
+ pinned_start_key_.reserve(sk.size());
+ pinned_start_key_.append(sk.data(), sk.size() - ts.size());
+ pinned_start_key_.append(ts.data(), ts.size());
+ pinned_end_key_.reserve(ek.size());
+ pinned_end_key_.append(ek.data(), ek.size() - ts.size());
+ pinned_end_key_.append(ts.data(), ts.size());
+ start_key_ = pinned_start_key_;
+ end_key_ = pinned_end_key_;
+ }
+
+ RangeTombstone(ParsedInternalKey parsed_key, Slice value) {
+ start_key_ = parsed_key.user_key;
+ seq_ = parsed_key.sequence;
+ end_key_ = value;
+ }
+
+  // Be careful when using Serialize(): it allocates new memory.
+ std::pair<InternalKey, Slice> Serialize() const {
+ auto key = InternalKey(start_key_, seq_, kTypeRangeDeletion);
+ return std::make_pair(std::move(key), end_key_);
+ }
+
+  // Be careful when using SerializeKey(): it allocates new memory.
+ InternalKey SerializeKey() const {
+ return InternalKey(start_key_, seq_, kTypeRangeDeletion);
+ }
+
+ // The tombstone end-key is exclusive, so we generate an internal-key here
+ // which has a similar property. Using kMaxSequenceNumber guarantees that
+ // the returned internal-key will compare less than any other internal-key
+ // with the same user-key. This in turn guarantees that the serialized
+ // end-key for a tombstone such as [a-b] will compare less than the key "b".
+ //
+  // Be careful when using SerializeEndKey(): it allocates new memory.
+ InternalKey SerializeEndKey() const {
+ if (!ts_.empty()) {
+ static constexpr char kTsMax[] = "\xff\xff\xff\xff\xff\xff\xff\xff\xff";
+ if (ts_.size() <= strlen(kTsMax)) {
+ return InternalKey(end_key_, kMaxSequenceNumber, kTypeRangeDeletion,
+ Slice(kTsMax, ts_.size()));
+ } else {
+ return InternalKey(end_key_, kMaxSequenceNumber, kTypeRangeDeletion,
+ std::string(ts_.size(), '\xff'));
+ }
+ }
+ return InternalKey(end_key_, kMaxSequenceNumber, kTypeRangeDeletion);
+ }
+};
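+
+// Illustrative consequence of the SerializeEndKey() comment above (also
+// exercised by FormatTest.RangeTombstoneSerializeEndKey): for a tombstone
+// covering ["a", "b"), the serialized end key sorts before every real "b" key.
+//
+//   RangeTombstone t("a", "b", /*sn=*/2);
+//   InternalKey k("b", /*s=*/3, kTypeValue);
+//   InternalKeyComparator icmp(BytewiseComparator());
+//   assert(icmp.Compare(t.SerializeEndKey(), k) < 0);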
+
+inline int InternalKeyComparator::Compare(const Slice& akey,
+ const Slice& bkey) const {
+ // Order by:
+ // increasing user key (according to user-supplied comparator)
+ // decreasing sequence number
+ // decreasing type (though sequence# should be enough to disambiguate)
+ int r = user_comparator_.Compare(ExtractUserKey(akey), ExtractUserKey(bkey));
+ if (r == 0) {
+ const uint64_t anum =
+ DecodeFixed64(akey.data() + akey.size() - kNumInternalBytes);
+ const uint64_t bnum =
+ DecodeFixed64(bkey.data() + bkey.size() - kNumInternalBytes);
+ if (anum > bnum) {
+ r = -1;
+ } else if (anum < bnum) {
+ r = +1;
+ }
+ }
+ return r;
+}
+
+inline int InternalKeyComparator::CompareKeySeq(const Slice& akey,
+ const Slice& bkey) const {
+ // Order by:
+ // increasing user key (according to user-supplied comparator)
+ // decreasing sequence number
+ int r = user_comparator_.Compare(ExtractUserKey(akey), ExtractUserKey(bkey));
+ if (r == 0) {
+ // Shift the number to exclude the last byte which contains the value type
+ const uint64_t anum =
+ DecodeFixed64(akey.data() + akey.size() - kNumInternalBytes) >> 8;
+ const uint64_t bnum =
+ DecodeFixed64(bkey.data() + bkey.size() - kNumInternalBytes) >> 8;
+ if (anum > bnum) {
+ r = -1;
+ } else if (anum < bnum) {
+ r = +1;
+ }
+ }
+ return r;
+}
+
+inline int InternalKeyComparator::Compare(const Slice& a,
+ SequenceNumber a_global_seqno,
+ const Slice& b,
+ SequenceNumber b_global_seqno) const {
+ int r = user_comparator_.Compare(ExtractUserKey(a), ExtractUserKey(b));
+ if (r == 0) {
+ uint64_t a_footer, b_footer;
+ if (a_global_seqno == kDisableGlobalSequenceNumber) {
+ a_footer = ExtractInternalKeyFooter(a);
+ } else {
+ a_footer = PackSequenceAndType(a_global_seqno, ExtractValueType(a));
+ }
+ if (b_global_seqno == kDisableGlobalSequenceNumber) {
+ b_footer = ExtractInternalKeyFooter(b);
+ } else {
+ b_footer = PackSequenceAndType(b_global_seqno, ExtractValueType(b));
+ }
+ if (a_footer > b_footer) {
+ r = -1;
+ } else if (a_footer < b_footer) {
+ r = +1;
+ }
+ }
+ return r;
+}
+
+// Wrap InternalKeyComparator as a comparator class for ParsedInternalKey.
+struct ParsedInternalKeyComparator {
+ explicit ParsedInternalKeyComparator(const InternalKeyComparator* c)
+ : cmp(c) {}
+
+ bool operator()(const ParsedInternalKey& a,
+ const ParsedInternalKey& b) const {
+ return cmp->Compare(a, b) < 0;
+ }
+
+ const InternalKeyComparator* cmp;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/dbformat_test.cc b/src/rocksdb/db/dbformat_test.cc
new file mode 100644
index 000000000..8dc3387df
--- /dev/null
+++ b/src/rocksdb/db/dbformat_test.cc
@@ -0,0 +1,214 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/dbformat.h"
+
+#include "table/block_based/index_builder.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+static std::string IKey(const std::string& user_key, uint64_t seq,
+ ValueType vt) {
+ std::string encoded;
+ AppendInternalKey(&encoded, ParsedInternalKey(user_key, seq, vt));
+ return encoded;
+}
+
+static std::string Shorten(const std::string& s, const std::string& l) {
+ std::string result = s;
+ ShortenedIndexBuilder::FindShortestInternalKeySeparator(*BytewiseComparator(),
+ &result, l);
+ return result;
+}
+
+static std::string ShortSuccessor(const std::string& s) {
+ std::string result = s;
+ ShortenedIndexBuilder::FindShortInternalKeySuccessor(*BytewiseComparator(),
+ &result);
+ return result;
+}
+
+static void TestKey(const std::string& key, uint64_t seq, ValueType vt) {
+ std::string encoded = IKey(key, seq, vt);
+
+ Slice in(encoded);
+ ParsedInternalKey decoded("", 0, kTypeValue);
+
+ ASSERT_OK(ParseInternalKey(in, &decoded, true /* log_err_key */));
+ ASSERT_EQ(key, decoded.user_key.ToString());
+ ASSERT_EQ(seq, decoded.sequence);
+ ASSERT_EQ(vt, decoded.type);
+
+ ASSERT_NOK(ParseInternalKey(Slice("bar"), &decoded, true /* log_err_key */));
+}
+
+class FormatTest : public testing::Test {};
+
+TEST_F(FormatTest, InternalKey_EncodeDecode) {
+ const char* keys[] = {"", "k", "hello", "longggggggggggggggggggggg"};
+ const uint64_t seq[] = {1,
+ 2,
+ 3,
+ (1ull << 8) - 1,
+ 1ull << 8,
+ (1ull << 8) + 1,
+ (1ull << 16) - 1,
+ 1ull << 16,
+ (1ull << 16) + 1,
+ (1ull << 32) - 1,
+ 1ull << 32,
+ (1ull << 32) + 1};
+ for (unsigned int k = 0; k < sizeof(keys) / sizeof(keys[0]); k++) {
+ for (unsigned int s = 0; s < sizeof(seq) / sizeof(seq[0]); s++) {
+ TestKey(keys[k], seq[s], kTypeValue);
+ TestKey("hello", 1, kTypeDeletion);
+ }
+ }
+}
+
+TEST_F(FormatTest, InternalKeyShortSeparator) {
+ // When user keys are same
+ ASSERT_EQ(IKey("foo", 100, kTypeValue),
+ Shorten(IKey("foo", 100, kTypeValue), IKey("foo", 99, kTypeValue)));
+ ASSERT_EQ(
+ IKey("foo", 100, kTypeValue),
+ Shorten(IKey("foo", 100, kTypeValue), IKey("foo", 101, kTypeValue)));
+ ASSERT_EQ(
+ IKey("foo", 100, kTypeValue),
+ Shorten(IKey("foo", 100, kTypeValue), IKey("foo", 100, kTypeValue)));
+ ASSERT_EQ(
+ IKey("foo", 100, kTypeValue),
+ Shorten(IKey("foo", 100, kTypeValue), IKey("foo", 100, kTypeDeletion)));
+
+ // When user keys are misordered
+ ASSERT_EQ(IKey("foo", 100, kTypeValue),
+ Shorten(IKey("foo", 100, kTypeValue), IKey("bar", 99, kTypeValue)));
+
+ // When user keys are different, but correctly ordered
+ ASSERT_EQ(
+ IKey("g", kMaxSequenceNumber, kValueTypeForSeek),
+ Shorten(IKey("foo", 100, kTypeValue), IKey("hello", 200, kTypeValue)));
+
+ ASSERT_EQ(IKey("ABC2", kMaxSequenceNumber, kValueTypeForSeek),
+ Shorten(IKey("ABC1AAAAA", 100, kTypeValue),
+ IKey("ABC2ABB", 200, kTypeValue)));
+
+ ASSERT_EQ(IKey("AAA2", kMaxSequenceNumber, kValueTypeForSeek),
+ Shorten(IKey("AAA1AAA", 100, kTypeValue),
+ IKey("AAA2AA", 200, kTypeValue)));
+
+ ASSERT_EQ(
+ IKey("AAA2", kMaxSequenceNumber, kValueTypeForSeek),
+ Shorten(IKey("AAA1AAA", 100, kTypeValue), IKey("AAA4", 200, kTypeValue)));
+
+ ASSERT_EQ(
+ IKey("AAA1B", kMaxSequenceNumber, kValueTypeForSeek),
+ Shorten(IKey("AAA1AAA", 100, kTypeValue), IKey("AAA2", 200, kTypeValue)));
+
+ ASSERT_EQ(IKey("AAA2", kMaxSequenceNumber, kValueTypeForSeek),
+ Shorten(IKey("AAA1AAA", 100, kTypeValue),
+ IKey("AAA2A", 200, kTypeValue)));
+
+ ASSERT_EQ(
+ IKey("AAA1", 100, kTypeValue),
+ Shorten(IKey("AAA1", 100, kTypeValue), IKey("AAA2", 200, kTypeValue)));
+
+ // When start user key is prefix of limit user key
+ ASSERT_EQ(
+ IKey("foo", 100, kTypeValue),
+ Shorten(IKey("foo", 100, kTypeValue), IKey("foobar", 200, kTypeValue)));
+
+ // When limit user key is prefix of start user key
+ ASSERT_EQ(
+ IKey("foobar", 100, kTypeValue),
+ Shorten(IKey("foobar", 100, kTypeValue), IKey("foo", 200, kTypeValue)));
+}
+
+TEST_F(FormatTest, InternalKeyShortestSuccessor) {
+ ASSERT_EQ(IKey("g", kMaxSequenceNumber, kValueTypeForSeek),
+ ShortSuccessor(IKey("foo", 100, kTypeValue)));
+ ASSERT_EQ(IKey("\xff\xff", 100, kTypeValue),
+ ShortSuccessor(IKey("\xff\xff", 100, kTypeValue)));
+}
+
+TEST_F(FormatTest, IterKeyOperation) {
+ IterKey k;
+ const char p[] = "abcdefghijklmnopqrstuvwxyz";
+ const char q[] = "0123456789";
+
+ ASSERT_EQ(std::string(k.GetUserKey().data(), k.GetUserKey().size()),
+ std::string(""));
+
+ k.TrimAppend(0, p, 3);
+ ASSERT_EQ(std::string(k.GetUserKey().data(), k.GetUserKey().size()),
+ std::string("abc"));
+
+ k.TrimAppend(1, p, 3);
+ ASSERT_EQ(std::string(k.GetUserKey().data(), k.GetUserKey().size()),
+ std::string("aabc"));
+
+ k.TrimAppend(0, p, 26);
+ ASSERT_EQ(std::string(k.GetUserKey().data(), k.GetUserKey().size()),
+ std::string("abcdefghijklmnopqrstuvwxyz"));
+
+ k.TrimAppend(26, q, 10);
+ ASSERT_EQ(std::string(k.GetUserKey().data(), k.GetUserKey().size()),
+ std::string("abcdefghijklmnopqrstuvwxyz0123456789"));
+
+ k.TrimAppend(36, q, 1);
+ ASSERT_EQ(std::string(k.GetUserKey().data(), k.GetUserKey().size()),
+ std::string("abcdefghijklmnopqrstuvwxyz01234567890"));
+
+ k.TrimAppend(26, q, 1);
+ ASSERT_EQ(std::string(k.GetUserKey().data(), k.GetUserKey().size()),
+ std::string("abcdefghijklmnopqrstuvwxyz0"));
+
+ // Size going up, memory allocation is triggered
+ k.TrimAppend(27, p, 26);
+ ASSERT_EQ(std::string(k.GetUserKey().data(), k.GetUserKey().size()),
+ std::string("abcdefghijklmnopqrstuvwxyz0"
+ "abcdefghijklmnopqrstuvwxyz"));
+}
+
+TEST_F(FormatTest, UpdateInternalKey) {
+ std::string user_key("abcdefghijklmnopqrstuvwxyz");
+ uint64_t new_seq = 0x123456;
+ ValueType new_val_type = kTypeDeletion;
+
+ std::string ikey;
+ AppendInternalKey(&ikey, ParsedInternalKey(user_key, 100U, kTypeValue));
+ size_t ikey_size = ikey.size();
+ UpdateInternalKey(&ikey, new_seq, new_val_type);
+ ASSERT_EQ(ikey_size, ikey.size());
+
+ Slice in(ikey);
+ ParsedInternalKey decoded;
+ ASSERT_OK(ParseInternalKey(in, &decoded, true /* log_err_key */));
+ ASSERT_EQ(user_key, decoded.user_key.ToString());
+ ASSERT_EQ(new_seq, decoded.sequence);
+ ASSERT_EQ(new_val_type, decoded.type);
+}
+
+TEST_F(FormatTest, RangeTombstoneSerializeEndKey) {
+ RangeTombstone t("a", "b", 2);
+ InternalKey k("b", 3, kTypeValue);
+ const InternalKeyComparator cmp(BytewiseComparator());
+ ASSERT_LT(cmp.Compare(t.SerializeEndKey(), k), 0);
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ RegisterCustomObjects(argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/deletefile_test.cc b/src/rocksdb/db/deletefile_test.cc
new file mode 100644
index 000000000..34925e828
--- /dev/null
+++ b/src/rocksdb/db/deletefile_test.cc
@@ -0,0 +1,614 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef ROCKSDB_LITE
+
+#include <stdlib.h>
+
+#include <map>
+#include <string>
+#include <vector>
+
+#include "db/db_impl/db_impl.h"
+#include "db/db_test_util.h"
+#include "db/version_set.h"
+#include "db/write_batch_internal.h"
+#include "file/filename.h"
+#include "port/stack_trace.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/transaction_log.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DeleteFileTest : public DBTestBase {
+ public:
+ const int numlevels_;
+ const std::string wal_dir_;
+
+ DeleteFileTest()
+ : DBTestBase("deletefile_test", /*env_do_fsync=*/true),
+ numlevels_(7),
+ wal_dir_(dbname_ + "/wal_files") {}
+
+ void SetOptions(Options* options) {
+ ASSERT_NE(options, nullptr);
+ options->delete_obsolete_files_period_micros = 0; // always do full purge
+ options->enable_thread_tracking = true;
+ options->write_buffer_size = 1024 * 1024 * 1000;
+ options->target_file_size_base = 1024 * 1024 * 1000;
+ options->max_bytes_for_level_base = 1024 * 1024 * 1000;
+ options->WAL_ttl_seconds = 300; // Used to test log files
+ options->WAL_size_limit_MB = 1024; // Used to test log files
+ options->wal_dir = wal_dir_;
+ }
+
+ void AddKeys(int numkeys, int startkey = 0) {
+ WriteOptions options;
+ options.sync = false;
+ ReadOptions roptions;
+ for (int i = startkey; i < (numkeys + startkey); i++) {
+ std::string temp = std::to_string(i);
+ Slice key(temp);
+ Slice value(temp);
+ ASSERT_OK(db_->Put(options, key, value));
+ }
+ }
+
+ int numKeysInLevels(std::vector<LiveFileMetaData>& metadata,
+ std::vector<int>* keysperlevel = nullptr) {
+ if (keysperlevel != nullptr) {
+ keysperlevel->resize(numlevels_);
+ }
+
+ int numKeys = 0;
+ for (size_t i = 0; i < metadata.size(); i++) {
+ int startkey = atoi(metadata[i].smallestkey.c_str());
+ int endkey = atoi(metadata[i].largestkey.c_str());
+ int numkeysinfile = (endkey - startkey + 1);
+ numKeys += numkeysinfile;
+ if (keysperlevel != nullptr) {
+ (*keysperlevel)[(int)metadata[i].level] += numkeysinfile;
+ }
+ fprintf(stderr, "level %d name %s smallest %s largest %s\n",
+ metadata[i].level, metadata[i].name.c_str(),
+ metadata[i].smallestkey.c_str(), metadata[i].largestkey.c_str());
+ }
+ return numKeys;
+ }
+
+ void CreateTwoLevels() {
+ AddKeys(50000, 10000);
+ ASSERT_OK(dbfull()->TEST_FlushMemTable());
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ for (int i = 0; i < 2; ++i) {
+ ASSERT_OK(dbfull()->TEST_CompactRange(i, nullptr, nullptr));
+ }
+
+ AddKeys(50000, 10000);
+ ASSERT_OK(dbfull()->TEST_FlushMemTable());
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr));
+ }
+
+ void CheckFileTypeCounts(const std::string& dir, int required_log,
+ int required_sst, int required_manifest) {
+ std::vector<std::string> filenames;
+ ASSERT_OK(env_->GetChildren(dir, &filenames));
+
+ int log_cnt = 0, sst_cnt = 0, manifest_cnt = 0;
+ for (auto file : filenames) {
+ uint64_t number;
+ FileType type;
+ if (ParseFileName(file, &number, &type)) {
+ log_cnt += (type == kWalFile);
+ sst_cnt += (type == kTableFile);
+ manifest_cnt += (type == kDescriptorFile);
+ }
+ }
+ if (required_log >= 0) {
+ ASSERT_EQ(required_log, log_cnt);
+ }
+ if (required_sst >= 0) {
+ ASSERT_EQ(required_sst, sst_cnt);
+ }
+ if (required_manifest >= 0) {
+ ASSERT_EQ(required_manifest, manifest_cnt);
+ }
+ }
+
+ static void DoSleep(void* arg) {
+ auto test = reinterpret_cast<DeleteFileTest*>(arg);
+ test->env_->SleepForMicroseconds(2 * 1000 * 1000);
+ }
+
+  // An empty job used to ensure that all preceding jobs have been processed
+ static void GuardFinish(void* /*arg*/) {
+ TEST_SYNC_POINT("DeleteFileTest::GuardFinish");
+ }
+};
+
+TEST_F(DeleteFileTest, AddKeysAndQueryLevels) {
+ Options options = CurrentOptions();
+ SetOptions(&options);
+ Destroy(options);
+ options.create_if_missing = true;
+ Reopen(options);
+
+ CreateTwoLevels();
+ std::vector<LiveFileMetaData> metadata;
+ db_->GetLiveFilesMetaData(&metadata);
+
+ std::string level1file = "";
+ int level1keycount = 0;
+ std::string level2file = "";
+ int level2keycount = 0;
+ int level1index = 0;
+ int level2index = 1;
+
+ ASSERT_EQ((int)metadata.size(), 2);
+ if (metadata[0].level == 2) {
+ level1index = 1;
+ level2index = 0;
+ }
+
+ level1file = metadata[level1index].name;
+ int startkey = atoi(metadata[level1index].smallestkey.c_str());
+ int endkey = atoi(metadata[level1index].largestkey.c_str());
+ level1keycount = (endkey - startkey + 1);
+ level2file = metadata[level2index].name;
+ startkey = atoi(metadata[level2index].smallestkey.c_str());
+ endkey = atoi(metadata[level2index].largestkey.c_str());
+ level2keycount = (endkey - startkey + 1);
+
+  // Controlled setup. Levels 1 and 2 should both have 50K keys.
+ // This is a little fragile as it depends on the current
+ // compaction heuristics.
+ ASSERT_EQ(level1keycount, 50000);
+ ASSERT_EQ(level2keycount, 50000);
+
+ Status status = db_->DeleteFile("0.sst");
+ ASSERT_TRUE(status.IsInvalidArgument());
+
+ // intermediate level files cannot be deleted.
+ status = db_->DeleteFile(level1file);
+ ASSERT_TRUE(status.IsInvalidArgument());
+
+ // Lowest level file deletion should succeed.
+ status = db_->DeleteFile(level2file);
+ ASSERT_OK(status);
+}
+
+TEST_F(DeleteFileTest, PurgeObsoleteFilesTest) {
+ Options options = CurrentOptions();
+ SetOptions(&options);
+ Destroy(options);
+ options.create_if_missing = true;
+ Reopen(options);
+
+ CreateTwoLevels();
+ // there should be only one (empty) log file because CreateTwoLevels()
+ // flushes the memtables to disk
+ CheckFileTypeCounts(wal_dir_, 1, 0, 0);
+ // 2 ssts, 1 manifest
+ CheckFileTypeCounts(dbname_, 0, 2, 1);
+ std::string first("0"), last("999999");
+ CompactRangeOptions compact_options;
+ compact_options.change_level = true;
+ compact_options.target_level = 2;
+ Slice first_slice(first), last_slice(last);
+ ASSERT_OK(db_->CompactRange(compact_options, &first_slice, &last_slice));
+ // 1 sst after compaction
+ CheckFileTypeCounts(dbname_, 0, 1, 1);
+
+ // this time, we keep an iterator alive
+ Reopen(options);
+ Iterator* itr = nullptr;
+ CreateTwoLevels();
+ itr = db_->NewIterator(ReadOptions());
+ ASSERT_OK(itr->status());
+ ASSERT_OK(db_->CompactRange(compact_options, &first_slice, &last_slice));
+ ASSERT_OK(itr->status());
+ // 3 sst after compaction with live iterator
+ CheckFileTypeCounts(dbname_, 0, 3, 1);
+ delete itr;
+ // 1 sst after iterator deletion
+ CheckFileTypeCounts(dbname_, 0, 1, 1);
+}
+
+TEST_F(DeleteFileTest, BackgroundPurgeIteratorTest) {
+ Options options = CurrentOptions();
+ SetOptions(&options);
+ Destroy(options);
+ options.create_if_missing = true;
+ Reopen(options);
+
+ std::string first("0"), last("999999");
+ CompactRangeOptions compact_options;
+ compact_options.change_level = true;
+ compact_options.target_level = 2;
+ Slice first_slice(first), last_slice(last);
+
+ // We keep an iterator alive
+ Iterator* itr = nullptr;
+ CreateTwoLevels();
+ ReadOptions read_options;
+ read_options.background_purge_on_iterator_cleanup = true;
+ itr = db_->NewIterator(read_options);
+ ASSERT_OK(itr->status());
+ ASSERT_OK(db_->CompactRange(compact_options, &first_slice, &last_slice));
+ // 3 sst after compaction with live iterator
+ CheckFileTypeCounts(dbname_, 0, 3, 1);
+ test::SleepingBackgroundTask sleeping_task_before;
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
+ &sleeping_task_before, Env::Priority::HIGH);
+ delete itr;
+ test::SleepingBackgroundTask sleeping_task_after;
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
+ &sleeping_task_after, Env::Priority::HIGH);
+
+  // Make sure no purges are executed in the foreground
+ CheckFileTypeCounts(dbname_, 0, 3, 1);
+ sleeping_task_before.WakeUp();
+ sleeping_task_before.WaitUntilDone();
+
+ // Make sure all background purges are executed
+ sleeping_task_after.WakeUp();
+ sleeping_task_after.WaitUntilDone();
+ // 1 sst after iterator deletion
+ CheckFileTypeCounts(dbname_, 0, 1, 1);
+}
+
+TEST_F(DeleteFileTest, PurgeDuringOpen) {
+ Options options = CurrentOptions();
+ CheckFileTypeCounts(dbname_, -1, 0, -1);
+ Close();
+ std::unique_ptr<WritableFile> file;
+ ASSERT_OK(options.env->NewWritableFile(dbname_ + "/000002.sst", &file,
+ EnvOptions()));
+ ASSERT_OK(file->Close());
+ CheckFileTypeCounts(dbname_, -1, 1, -1);
+ options.avoid_unnecessary_blocking_io = false;
+ options.create_if_missing = false;
+ Reopen(options);
+ CheckFileTypeCounts(dbname_, -1, 0, -1);
+ Close();
+
+ // test background purge
+ options.avoid_unnecessary_blocking_io = true;
+ options.create_if_missing = false;
+ ASSERT_OK(options.env->NewWritableFile(dbname_ + "/000002.sst", &file,
+ EnvOptions()));
+ ASSERT_OK(file->Close());
+ CheckFileTypeCounts(dbname_, -1, 1, -1);
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->LoadDependency(
+ {{"DeleteFileTest::PurgeDuringOpen:1", "DBImpl::BGWorkPurge:start"}});
+ SyncPoint::GetInstance()->EnableProcessing();
+ Reopen(options);
+  // The obsolete file is not deleted until the background purge job has run
+ CheckFileTypeCounts(dbname_, -1, 1, -1);
+ TEST_SYNC_POINT("DeleteFileTest::PurgeDuringOpen:1");
+ ASSERT_OK(dbfull()->TEST_WaitForPurge());
+ CheckFileTypeCounts(dbname_, -1, 0, -1);
+}
+
+TEST_F(DeleteFileTest, BackgroundPurgeCFDropTest) {
+ Options options = CurrentOptions();
+ SetOptions(&options);
+ Destroy(options);
+ options.create_if_missing = true;
+ Reopen(options);
+
+ auto do_test = [&](bool bg_purge) {
+ ColumnFamilyOptions co;
+ co.max_write_buffer_size_to_maintain =
+ static_cast<int64_t>(co.write_buffer_size);
+ WriteOptions wo;
+ FlushOptions fo;
+ ColumnFamilyHandle* cfh = nullptr;
+
+ ASSERT_OK(db_->CreateColumnFamily(co, "dropme", &cfh));
+
+ ASSERT_OK(db_->Put(wo, cfh, "pika", "chu"));
+ ASSERT_OK(db_->Flush(fo, cfh));
+ // Expect 1 sst file.
+ CheckFileTypeCounts(dbname_, 0, 1, 1);
+
+ ASSERT_OK(db_->DropColumnFamily(cfh));
+ // Still 1 file, it won't be deleted while ColumnFamilyHandle is alive.
+ CheckFileTypeCounts(dbname_, 0, 1, 1);
+
+ delete cfh;
+ test::SleepingBackgroundTask sleeping_task_after;
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
+ &sleeping_task_after, Env::Priority::HIGH);
+ // If background purge is enabled, the file should still be there.
+ CheckFileTypeCounts(dbname_, 0, bg_purge ? 1 : 0, 1);
+ TEST_SYNC_POINT("DeleteFileTest::BackgroundPurgeCFDropTest:1");
+
+ // Execute background purges.
+ sleeping_task_after.WakeUp();
+ sleeping_task_after.WaitUntilDone();
+ // The file should have been deleted.
+ CheckFileTypeCounts(dbname_, 0, 0, 1);
+ };
+
+ {
+ SCOPED_TRACE("avoid_unnecessary_blocking_io = false");
+ do_test(false);
+ }
+
+ options.avoid_unnecessary_blocking_io = true;
+ options.create_if_missing = false;
+ Reopen(options);
+ ASSERT_OK(dbfull()->TEST_WaitForPurge());
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->LoadDependency(
+ {{"DeleteFileTest::BackgroundPurgeCFDropTest:1",
+ "DBImpl::BGWorkPurge:start"}});
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ {
+ SCOPED_TRACE("avoid_unnecessary_blocking_io = true");
+ do_test(true);
+ }
+}
+
+// This test reproduces a bug where the iterator cleanup function read an
+// invalid (already destroyed) ReadOptions
+TEST_F(DeleteFileTest, BackgroundPurgeCopyOptions) {
+ Options options = CurrentOptions();
+ SetOptions(&options);
+ Destroy(options);
+ options.create_if_missing = true;
+ Reopen(options);
+
+ std::string first("0"), last("999999");
+ CompactRangeOptions compact_options;
+ compact_options.change_level = true;
+ compact_options.target_level = 2;
+ Slice first_slice(first), last_slice(last);
+
+ // We keep an iterator alive
+ Iterator* itr = nullptr;
+ CreateTwoLevels();
+ {
+ ReadOptions read_options;
+ read_options.background_purge_on_iterator_cleanup = true;
+ itr = db_->NewIterator(read_options);
+ ASSERT_OK(itr->status());
+    // The ReadOptions object goes out of scope here, but the iterator cleanup
+    // function should not be affected
+ }
+
+ ASSERT_OK(db_->CompactRange(compact_options, &first_slice, &last_slice));
+ // 3 sst after compaction with live iterator
+ CheckFileTypeCounts(dbname_, 0, 3, 1);
+ delete itr;
+
+ test::SleepingBackgroundTask sleeping_task_after;
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
+ &sleeping_task_after, Env::Priority::HIGH);
+
+ // Make sure all background purges are executed
+ sleeping_task_after.WakeUp();
+ sleeping_task_after.WaitUntilDone();
+ // 1 sst after iterator deletion
+ CheckFileTypeCounts(dbname_, 0, 1, 1);
+}
+
+TEST_F(DeleteFileTest, BackgroundPurgeTestMultipleJobs) {
+ Options options = CurrentOptions();
+ SetOptions(&options);
+ Destroy(options);
+ options.create_if_missing = true;
+ Reopen(options);
+
+ std::string first("0"), last("999999");
+ CompactRangeOptions compact_options;
+ compact_options.change_level = true;
+ compact_options.target_level = 2;
+ Slice first_slice(first), last_slice(last);
+
+ // We keep an iterator alive
+ CreateTwoLevels();
+ ReadOptions read_options;
+ read_options.background_purge_on_iterator_cleanup = true;
+ Iterator* itr1 = db_->NewIterator(read_options);
+ ASSERT_OK(itr1->status());
+ CreateTwoLevels();
+ Iterator* itr2 = db_->NewIterator(read_options);
+ ASSERT_OK(itr2->status());
+ ASSERT_OK(db_->CompactRange(compact_options, &first_slice, &last_slice));
+ // 5 sst files after 2 compactions with 2 live iterators
+ CheckFileTypeCounts(dbname_, 0, 5, 1);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  // ~DBImpl should wait until all BGWorkPurge jobs are finished
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::~DBImpl:WaitJob", "DBImpl::BGWorkPurge"},
+ {"DeleteFileTest::GuardFinish",
+ "DeleteFileTest::BackgroundPurgeTestMultipleJobs:DBClose"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ delete itr1;
+ env_->Schedule(&DeleteFileTest::DoSleep, this, Env::Priority::HIGH);
+ delete itr2;
+ env_->Schedule(&DeleteFileTest::GuardFinish, nullptr, Env::Priority::HIGH);
+ Close();
+
+ TEST_SYNC_POINT("DeleteFileTest::BackgroundPurgeTestMultipleJobs:DBClose");
+ // 1 sst after iterator deletion
+ CheckFileTypeCounts(dbname_, 0, 1, 1);
+}
+
+TEST_F(DeleteFileTest, DeleteFileWithIterator) {
+ Options options = CurrentOptions();
+ SetOptions(&options);
+ Destroy(options);
+ options.create_if_missing = true;
+ Reopen(options);
+
+ CreateTwoLevels();
+ ReadOptions read_options;
+ Iterator* it = db_->NewIterator(read_options);
+ ASSERT_OK(it->status());
+ std::vector<LiveFileMetaData> metadata;
+ db_->GetLiveFilesMetaData(&metadata);
+
+ std::string level2file;
+
+ ASSERT_EQ(metadata.size(), static_cast<size_t>(2));
+ if (metadata[0].level == 1) {
+ level2file = metadata[1].name;
+ } else {
+ level2file = metadata[0].name;
+ }
+
+ Status status = db_->DeleteFile(level2file);
+ fprintf(stdout, "Deletion status %s: %s\n", level2file.c_str(),
+ status.ToString().c_str());
+ ASSERT_OK(status);
+ it->SeekToFirst();
+ int numKeysIterated = 0;
+ while (it->Valid()) {
+ numKeysIterated++;
+ it->Next();
+ }
+ ASSERT_EQ(numKeysIterated, 50000);
+ delete it;
+}
+
+TEST_F(DeleteFileTest, DeleteLogFiles) {
+ Options options = CurrentOptions();
+ SetOptions(&options);
+ Destroy(options);
+ options.create_if_missing = true;
+ Reopen(options);
+
+ AddKeys(10, 0);
+ VectorLogPtr logfiles;
+ ASSERT_OK(db_->GetSortedWalFiles(logfiles));
+ ASSERT_GT(logfiles.size(), 0UL);
+ // Take the last log file which is expected to be alive and try to delete it
+ // Should not succeed because live logs are not allowed to be deleted
+ std::unique_ptr<LogFile> alive_log = std::move(logfiles.back());
+ ASSERT_EQ(alive_log->Type(), kAliveLogFile);
+ ASSERT_OK(env_->FileExists(wal_dir_ + "/" + alive_log->PathName()));
+ fprintf(stdout, "Deleting alive log file %s\n",
+ alive_log->PathName().c_str());
+ ASSERT_NOK(db_->DeleteFile(alive_log->PathName()));
+ ASSERT_OK(env_->FileExists(wal_dir_ + "/" + alive_log->PathName()));
+ logfiles.clear();
+
+  // Call Flush to bring about a new working log file and add more keys.
+  // Call Flush again to flush out the memtable and move the alive log to the
+  // archived log, then try to delete the archived log file.
+ FlushOptions fopts;
+ ASSERT_OK(db_->Flush(fopts));
+ AddKeys(10, 0);
+ ASSERT_OK(db_->Flush(fopts));
+ ASSERT_OK(db_->GetSortedWalFiles(logfiles));
+ ASSERT_GT(logfiles.size(), 0UL);
+ std::unique_ptr<LogFile> archived_log = std::move(logfiles.front());
+ ASSERT_EQ(archived_log->Type(), kArchivedLogFile);
+ ASSERT_OK(env_->FileExists(wal_dir_ + "/" + archived_log->PathName()));
+ fprintf(stdout, "Deleting archived log file %s\n",
+ archived_log->PathName().c_str());
+ ASSERT_OK(db_->DeleteFile(archived_log->PathName()));
+ ASSERT_TRUE(
+ env_->FileExists(wal_dir_ + "/" + archived_log->PathName()).IsNotFound());
+}
+
+TEST_F(DeleteFileTest, DeleteNonDefaultColumnFamily) {
+ Options options = CurrentOptions();
+ SetOptions(&options);
+ Destroy(options);
+ options.create_if_missing = true;
+ Reopen(options);
+ CreateAndReopenWithCF({"new_cf"}, options);
+
+ Random rnd(5);
+ for (int i = 0; i < 1000; ++i) {
+ ASSERT_OK(db_->Put(WriteOptions(), handles_[1], test::RandomKey(&rnd, 10),
+ test::RandomKey(&rnd, 10)));
+ }
+ ASSERT_OK(db_->Flush(FlushOptions(), handles_[1]));
+ for (int i = 0; i < 1000; ++i) {
+ ASSERT_OK(db_->Put(WriteOptions(), handles_[1], test::RandomKey(&rnd, 10),
+ test::RandomKey(&rnd, 10)));
+ }
+ ASSERT_OK(db_->Flush(FlushOptions(), handles_[1]));
+
+ std::vector<LiveFileMetaData> metadata;
+ db_->GetLiveFilesMetaData(&metadata);
+ ASSERT_EQ(2U, metadata.size());
+ ASSERT_EQ("new_cf", metadata[0].column_family_name);
+ ASSERT_EQ("new_cf", metadata[1].column_family_name);
+ auto old_file = metadata[0].smallest_seqno < metadata[1].smallest_seqno
+ ? metadata[0].name
+ : metadata[1].name;
+ auto new_file = metadata[0].smallest_seqno > metadata[1].smallest_seqno
+ ? metadata[0].name
+ : metadata[1].name;
+ ASSERT_TRUE(db_->DeleteFile(new_file).IsInvalidArgument());
+ ASSERT_OK(db_->DeleteFile(old_file));
+
+ {
+ std::unique_ptr<Iterator> itr(db_->NewIterator(ReadOptions(), handles_[1]));
+ ASSERT_OK(itr->status());
+ int count = 0;
+ for (itr->SeekToFirst(); itr->Valid(); itr->Next()) {
+ ASSERT_OK(itr->status());
+ ++count;
+ }
+ ASSERT_EQ(count, 1000);
+ }
+
+ Close();
+ ReopenWithColumnFamilies({kDefaultColumnFamilyName, "new_cf"}, options);
+
+ {
+ std::unique_ptr<Iterator> itr(db_->NewIterator(ReadOptions(), handles_[1]));
+ int count = 0;
+ for (itr->SeekToFirst(); itr->Valid(); itr->Next()) {
+ ASSERT_OK(itr->status());
+ ++count;
+ }
+ ASSERT_EQ(count, 1000);
+ }
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ RegisterCustomObjects(argc, argv);
+ return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+ fprintf(stderr,
+ "SKIPPED as DBImpl::DeleteFile is not supported in ROCKSDB_LITE\n");
+ return 0;
+}
+
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/error_handler.cc b/src/rocksdb/db/error_handler.cc
new file mode 100644
index 000000000..7f68bb026
--- /dev/null
+++ b/src/rocksdb/db/error_handler.cc
@@ -0,0 +1,819 @@
+// Copyright (c) 2018-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#include "db/error_handler.h"
+
+#include "db/db_impl/db_impl.h"
+#include "db/event_helpers.h"
+#include "file/sst_file_manager_impl.h"
+#include "logging/logging.h"
+#include "port/lang.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Maps to help decide the severity of an error based on the
+// BackgroundErrorReason, Code, SubCode and whether db_options.paranoid_checks
+// is set or not. There are 3 maps, going from most specific to least specific
+// (i.e. from all 4 fields in a tuple down to only the BackgroundErrorReason
+// and paranoid_checks). The less specific maps serve as a catch-all in case we
+// miss a specific error code or subcode.
+std::map<std::tuple<BackgroundErrorReason, Status::Code, Status::SubCode, bool>,
+ Status::Severity>
+ ErrorSeverityMap = {
+ // Errors during BG compaction
+ {std::make_tuple(BackgroundErrorReason::kCompaction,
+ Status::Code::kIOError, Status::SubCode::kNoSpace,
+ true),
+ Status::Severity::kSoftError},
+ {std::make_tuple(BackgroundErrorReason::kCompaction,
+ Status::Code::kIOError, Status::SubCode::kNoSpace,
+ false),
+ Status::Severity::kNoError},
+ {std::make_tuple(BackgroundErrorReason::kCompaction,
+ Status::Code::kIOError, Status::SubCode::kSpaceLimit,
+ true),
+ Status::Severity::kHardError},
+ {std::make_tuple(BackgroundErrorReason::kCompaction,
+ Status::Code::kIOError, Status::SubCode::kIOFenced,
+ true),
+ Status::Severity::kFatalError},
+ {std::make_tuple(BackgroundErrorReason::kCompaction,
+ Status::Code::kIOError, Status::SubCode::kIOFenced,
+ false),
+ Status::Severity::kFatalError},
+ // Errors during BG flush
+ {std::make_tuple(BackgroundErrorReason::kFlush, Status::Code::kIOError,
+ Status::SubCode::kNoSpace, true),
+ Status::Severity::kHardError},
+ {std::make_tuple(BackgroundErrorReason::kFlush, Status::Code::kIOError,
+ Status::SubCode::kNoSpace, false),
+ Status::Severity::kNoError},
+ {std::make_tuple(BackgroundErrorReason::kFlush, Status::Code::kIOError,
+ Status::SubCode::kSpaceLimit, true),
+ Status::Severity::kHardError},
+ {std::make_tuple(BackgroundErrorReason::kFlush, Status::Code::kIOError,
+ Status::SubCode::kIOFenced, true),
+ Status::Severity::kFatalError},
+ {std::make_tuple(BackgroundErrorReason::kFlush, Status::Code::kIOError,
+ Status::SubCode::kIOFenced, false),
+ Status::Severity::kFatalError},
+ // Errors during Write
+ {std::make_tuple(BackgroundErrorReason::kWriteCallback,
+ Status::Code::kIOError, Status::SubCode::kNoSpace,
+ true),
+ Status::Severity::kHardError},
+ {std::make_tuple(BackgroundErrorReason::kWriteCallback,
+ Status::Code::kIOError, Status::SubCode::kNoSpace,
+ false),
+ Status::Severity::kHardError},
+ {std::make_tuple(BackgroundErrorReason::kWriteCallback,
+ Status::Code::kIOError, Status::SubCode::kIOFenced,
+ true),
+ Status::Severity::kFatalError},
+ {std::make_tuple(BackgroundErrorReason::kWriteCallback,
+ Status::Code::kIOError, Status::SubCode::kIOFenced,
+ false),
+ Status::Severity::kFatalError},
+ // Errors during MANIFEST write
+ {std::make_tuple(BackgroundErrorReason::kManifestWrite,
+ Status::Code::kIOError, Status::SubCode::kNoSpace,
+ true),
+ Status::Severity::kHardError},
+ {std::make_tuple(BackgroundErrorReason::kManifestWrite,
+ Status::Code::kIOError, Status::SubCode::kNoSpace,
+ false),
+ Status::Severity::kHardError},
+ {std::make_tuple(BackgroundErrorReason::kManifestWrite,
+ Status::Code::kIOError, Status::SubCode::kIOFenced,
+ true),
+ Status::Severity::kFatalError},
+ {std::make_tuple(BackgroundErrorReason::kManifestWrite,
+ Status::Code::kIOError, Status::SubCode::kIOFenced,
+ false),
+ Status::Severity::kFatalError},
+ // Errors during BG flush with WAL disabled
+ {std::make_tuple(BackgroundErrorReason::kFlushNoWAL,
+ Status::Code::kIOError, Status::SubCode::kNoSpace,
+ true),
+ Status::Severity::kHardError},
+ {std::make_tuple(BackgroundErrorReason::kFlushNoWAL,
+ Status::Code::kIOError, Status::SubCode::kNoSpace,
+ false),
+ Status::Severity::kNoError},
+ {std::make_tuple(BackgroundErrorReason::kFlushNoWAL,
+ Status::Code::kIOError, Status::SubCode::kSpaceLimit,
+ true),
+ Status::Severity::kHardError},
+ {std::make_tuple(BackgroundErrorReason::kFlushNoWAL,
+ Status::Code::kIOError, Status::SubCode::kIOFenced,
+ true),
+ Status::Severity::kFatalError},
+ {std::make_tuple(BackgroundErrorReason::kFlushNoWAL,
+ Status::Code::kIOError, Status::SubCode::kIOFenced,
+ false),
+ Status::Severity::kFatalError},
+ // Errors during MANIFEST write when WAL is disabled
+ {std::make_tuple(BackgroundErrorReason::kManifestWriteNoWAL,
+ Status::Code::kIOError, Status::SubCode::kNoSpace,
+ true),
+ Status::Severity::kHardError},
+ {std::make_tuple(BackgroundErrorReason::kManifestWriteNoWAL,
+ Status::Code::kIOError, Status::SubCode::kNoSpace,
+ false),
+ Status::Severity::kHardError},
+ {std::make_tuple(BackgroundErrorReason::kManifestWriteNoWAL,
+ Status::Code::kIOError, Status::SubCode::kIOFenced,
+ true),
+ Status::Severity::kFatalError},
+ {std::make_tuple(BackgroundErrorReason::kManifestWriteNoWAL,
+ Status::Code::kIOError, Status::SubCode::kIOFenced,
+ false),
+ Status::Severity::kFatalError},
+
+};
+
+std::map<std::tuple<BackgroundErrorReason, Status::Code, bool>,
+ Status::Severity>
+ DefaultErrorSeverityMap = {
+ // Errors during BG compaction
+ {std::make_tuple(BackgroundErrorReason::kCompaction,
+ Status::Code::kCorruption, true),
+ Status::Severity::kUnrecoverableError},
+ {std::make_tuple(BackgroundErrorReason::kCompaction,
+ Status::Code::kCorruption, false),
+ Status::Severity::kNoError},
+ {std::make_tuple(BackgroundErrorReason::kCompaction,
+ Status::Code::kIOError, true),
+ Status::Severity::kFatalError},
+ {std::make_tuple(BackgroundErrorReason::kCompaction,
+ Status::Code::kIOError, false),
+ Status::Severity::kNoError},
+ // Errors during BG flush
+ {std::make_tuple(BackgroundErrorReason::kFlush,
+ Status::Code::kCorruption, true),
+ Status::Severity::kUnrecoverableError},
+ {std::make_tuple(BackgroundErrorReason::kFlush,
+ Status::Code::kCorruption, false),
+ Status::Severity::kNoError},
+ {std::make_tuple(BackgroundErrorReason::kFlush, Status::Code::kIOError,
+ true),
+ Status::Severity::kFatalError},
+ {std::make_tuple(BackgroundErrorReason::kFlush, Status::Code::kIOError,
+ false),
+ Status::Severity::kNoError},
+ // Errors during Write
+ {std::make_tuple(BackgroundErrorReason::kWriteCallback,
+ Status::Code::kCorruption, true),
+ Status::Severity::kUnrecoverableError},
+ {std::make_tuple(BackgroundErrorReason::kWriteCallback,
+ Status::Code::kCorruption, false),
+ Status::Severity::kNoError},
+ {std::make_tuple(BackgroundErrorReason::kWriteCallback,
+ Status::Code::kIOError, true),
+ Status::Severity::kFatalError},
+ {std::make_tuple(BackgroundErrorReason::kWriteCallback,
+ Status::Code::kIOError, false),
+ Status::Severity::kNoError},
+ {std::make_tuple(BackgroundErrorReason::kManifestWrite,
+ Status::Code::kIOError, true),
+ Status::Severity::kFatalError},
+ {std::make_tuple(BackgroundErrorReason::kManifestWrite,
+ Status::Code::kIOError, false),
+ Status::Severity::kFatalError},
+ // Errors during BG flush with WAL disabled
+ {std::make_tuple(BackgroundErrorReason::kFlushNoWAL,
+ Status::Code::kCorruption, true),
+ Status::Severity::kUnrecoverableError},
+ {std::make_tuple(BackgroundErrorReason::kFlushNoWAL,
+ Status::Code::kCorruption, false),
+ Status::Severity::kNoError},
+ {std::make_tuple(BackgroundErrorReason::kFlushNoWAL,
+ Status::Code::kIOError, true),
+ Status::Severity::kFatalError},
+ {std::make_tuple(BackgroundErrorReason::kFlushNoWAL,
+ Status::Code::kIOError, false),
+ Status::Severity::kNoError},
+ {std::make_tuple(BackgroundErrorReason::kManifestWriteNoWAL,
+ Status::Code::kIOError, true),
+ Status::Severity::kFatalError},
+ {std::make_tuple(BackgroundErrorReason::kManifestWriteNoWAL,
+ Status::Code::kIOError, false),
+ Status::Severity::kFatalError},
+};
+
+std::map<std::tuple<BackgroundErrorReason, bool>, Status::Severity>
+ DefaultReasonMap = {
+ // Errors during BG compaction
+ {std::make_tuple(BackgroundErrorReason::kCompaction, true),
+ Status::Severity::kFatalError},
+ {std::make_tuple(BackgroundErrorReason::kCompaction, false),
+ Status::Severity::kNoError},
+ // Errors during BG flush
+ {std::make_tuple(BackgroundErrorReason::kFlush, true),
+ Status::Severity::kFatalError},
+ {std::make_tuple(BackgroundErrorReason::kFlush, false),
+ Status::Severity::kNoError},
+ // Errors during Write
+ {std::make_tuple(BackgroundErrorReason::kWriteCallback, true),
+ Status::Severity::kFatalError},
+ {std::make_tuple(BackgroundErrorReason::kWriteCallback, false),
+ Status::Severity::kFatalError},
+ // Errors during Memtable update
+ {std::make_tuple(BackgroundErrorReason::kMemTable, true),
+ Status::Severity::kFatalError},
+ {std::make_tuple(BackgroundErrorReason::kMemTable, false),
+ Status::Severity::kFatalError},
+};
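+
+// A minimal sketch of the lookup cascade implied by the three maps above.
+// This is an illustration only (LookupSeverity is a hypothetical helper, not
+// part of this file); HandleKnownErrors() below performs the equivalent
+// lookups inline, defaulting to kFatalError when no map has an entry.
+//
+//   Status::Severity LookupSeverity(BackgroundErrorReason reason,
+//                                   const Status& s, bool paranoid) {
+//     auto it = ErrorSeverityMap.find(
+//         std::make_tuple(reason, s.code(), s.subcode(), paranoid));
+//     if (it != ErrorSeverityMap.end()) return it->second;
+//     auto def = DefaultErrorSeverityMap.find(
+//         std::make_tuple(reason, s.code(), paranoid));
+//     if (def != DefaultErrorSeverityMap.end()) return def->second;
+//     auto by_reason = DefaultReasonMap.find(std::make_tuple(reason, paranoid));
+//     return by_reason != DefaultReasonMap.end()
+//                ? by_reason->second
+//                : Status::Severity::kFatalError;
+//   }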
+
+void ErrorHandler::CancelErrorRecovery() {
+#ifndef ROCKSDB_LITE
+ db_mutex_->AssertHeld();
+
+ // We'll release the lock before calling sfm, so make sure no new
+ // recovery gets scheduled at that point
+ auto_recovery_ = false;
+ SstFileManagerImpl* sfm =
+ reinterpret_cast<SstFileManagerImpl*>(db_options_.sst_file_manager.get());
+ if (sfm) {
+ // This may or may not cancel a pending recovery
+ db_mutex_->Unlock();
+ bool cancelled = sfm->CancelErrorRecovery(this);
+ db_mutex_->Lock();
+ if (cancelled) {
+ recovery_in_prog_ = false;
+ }
+ }
+
+  // If auto recovery is also running to resume from the retryable error,
+  // we should wait for it to finish and end the auto recovery.
+ EndAutoRecovery();
+#endif
+}
+
+STATIC_AVOID_DESTRUCTION(const Status, kOkStatus){Status::OK()};
+
+// This is the main function for looking at an error during a background
+// operation and deciding the severity and the error recovery strategy. The
+// high level algorithm is as follows -
+// 1. Classify the severity of the error based on the ErrorSeverityMap,
+// DefaultErrorSeverityMap and DefaultReasonMap defined earlier
+// 2. Call a Status code specific override function to adjust the severity
+// if needed. The reason for this is that our ability to recover may depend
+// on the exact options enabled in DBOptions
+// 3. Determine if auto recovery is possible. A listener notification callback
+// is called, which can disable the auto recovery even if we decide it is
+// feasible
+// 4. For Status::NoSpace() errors, rely on SstFileManagerImpl to control
+// the actual recovery. If no sst file manager is specified in DBOptions,
+// a default one is allocated during DB::Open(), so there will always be
+// one.
+// This can also get called as part of a recovery operation. In that case, we
+// also track the error separately in recovery_error_ so we can tell at the
+// end whether recovery succeeded or not.
+const Status& ErrorHandler::HandleKnownErrors(const Status& bg_err,
+ BackgroundErrorReason reason) {
+ db_mutex_->AssertHeld();
+ if (bg_err.ok()) {
+ return kOkStatus;
+ }
+
+ if (bg_error_stats_ != nullptr) {
+ RecordTick(bg_error_stats_.get(), ERROR_HANDLER_BG_ERROR_COUNT);
+ }
+ ROCKS_LOG_INFO(db_options_.info_log,
+ "ErrorHandler: Set regular background error\n");
+
+ bool paranoid = db_options_.paranoid_checks;
+ Status::Severity sev = Status::Severity::kFatalError;
+ Status new_bg_err;
+ DBRecoverContext context;
+ bool found = false;
+
+ {
+ auto entry = ErrorSeverityMap.find(
+ std::make_tuple(reason, bg_err.code(), bg_err.subcode(), paranoid));
+ if (entry != ErrorSeverityMap.end()) {
+ sev = entry->second;
+ found = true;
+ }
+ }
+
+ if (!found) {
+ auto entry = DefaultErrorSeverityMap.find(
+ std::make_tuple(reason, bg_err.code(), paranoid));
+ if (entry != DefaultErrorSeverityMap.end()) {
+ sev = entry->second;
+ found = true;
+ }
+ }
+
+ if (!found) {
+ auto entry = DefaultReasonMap.find(std::make_tuple(reason, paranoid));
+ if (entry != DefaultReasonMap.end()) {
+ sev = entry->second;
+ }
+ }
+
+ new_bg_err = Status(bg_err, sev);
+
+ // Check if recovery is currently in progress. If it is, we will save this
+ // error so we can check it at the end to see if recovery succeeded or not
+ if (recovery_in_prog_ && recovery_error_.ok()) {
+ recovery_error_ = new_bg_err;
+ }
+
+ bool auto_recovery = auto_recovery_;
+ if (new_bg_err.severity() >= Status::Severity::kFatalError && auto_recovery) {
+ auto_recovery = false;
+ }
+
+ // Allow some error specific overrides
+ if (new_bg_err.subcode() == IOStatus::SubCode::kNoSpace ||
+ new_bg_err.subcode() == IOStatus::SubCode::kSpaceLimit) {
+ new_bg_err = OverrideNoSpaceError(new_bg_err, &auto_recovery);
+ }
+
+ if (!new_bg_err.ok()) {
+ Status s = new_bg_err;
+ EventHelpers::NotifyOnBackgroundError(db_options_.listeners, reason, &s,
+ db_mutex_, &auto_recovery);
+ if (!s.ok() && (s.severity() > bg_error_.severity())) {
+ bg_error_ = s;
+ } else {
+      // This error is less severe than a previously encountered error. Don't
+      // take any further action
+ return bg_error_;
+ }
+ }
+
+ recover_context_ = context;
+ if (auto_recovery) {
+ recovery_in_prog_ = true;
+
+ // Kick-off error specific recovery
+ if (new_bg_err.subcode() == IOStatus::SubCode::kNoSpace ||
+ new_bg_err.subcode() == IOStatus::SubCode::kSpaceLimit) {
+ RecoverFromNoSpace();
+ }
+ }
+ if (bg_error_.severity() >= Status::Severity::kHardError) {
+ is_db_stopped_.store(true, std::memory_order_release);
+ }
+ return bg_error_;
+}
+
+// This is the main function for looking at an IO related error during
+// background operations. The main logic is:
+// Note: a file scope IO error is treated as a retryable IO error in the write
+// path. In RocksDB, if a file has a write IO error at file scope, RocksDB
+// never writes to the same file again. RocksDB will create a new file and
+// rewrite the whole content. Thus, it is retryable.
+// 1) If the error is caused by data loss, the error is mapped to an
+// unrecoverable error. The application/user must take action to handle
+// this situation (the file scope case is excluded).
+// 2) If the error is a retryable IO error (i.e., it is a file scope IO error,
+// or its retryable flag is set and it is not a data loss error), auto resume
+// will be called and the auto resume can be controlled by the resume count
+// and resume interval options. There are three sub-cases:
+// a) If the error happens during compaction, it is mapped to a soft error.
+// The compaction thread will reschedule a new compaction.
+// b) If the error happens during flush and the WAL is empty, it is mapped
+// to a soft error. Note that this includes the case where the IO error
+// happens in the SST or manifest write during flush.
+// c) All other errors are mapped to hard error.
+// 3) For other cases, HandleKnownErrors(const Status& bg_err,
+// BackgroundErrorReason reason) is called to handle them.
+const Status& ErrorHandler::SetBGError(const Status& bg_status,
+ BackgroundErrorReason reason) {
+ db_mutex_->AssertHeld();
+ Status tmp_status = bg_status;
+ IOStatus bg_io_err = status_to_io_status(std::move(tmp_status));
+
+ if (bg_io_err.ok()) {
+ return kOkStatus;
+ }
+ ROCKS_LOG_WARN(db_options_.info_log, "Background IO error %s",
+ bg_io_err.ToString().c_str());
+
+ if (recovery_in_prog_ && recovery_io_error_.ok()) {
+ recovery_io_error_ = bg_io_err;
+ }
+ if (BackgroundErrorReason::kManifestWrite == reason ||
+ BackgroundErrorReason::kManifestWriteNoWAL == reason) {
+ // Always returns ok
+ ROCKS_LOG_INFO(db_options_.info_log, "Disabling File Deletions");
+ db_->DisableFileDeletionsWithLock().PermitUncheckedError();
+ }
+
+ Status new_bg_io_err = bg_io_err;
+ DBRecoverContext context;
+ if (bg_io_err.GetScope() != IOStatus::IOErrorScope::kIOErrorScopeFile &&
+ bg_io_err.GetDataLoss()) {
+    // First, data loss (non file scope) is treated as an unrecoverable error,
+    // so it can directly overwrite any existing bg_error_.
+ bool auto_recovery = false;
+ Status bg_err(new_bg_io_err, Status::Severity::kUnrecoverableError);
+ CheckAndSetRecoveryAndBGError(bg_err);
+ if (bg_error_stats_ != nullptr) {
+ RecordTick(bg_error_stats_.get(), ERROR_HANDLER_BG_ERROR_COUNT);
+ RecordTick(bg_error_stats_.get(), ERROR_HANDLER_BG_IO_ERROR_COUNT);
+ }
+ ROCKS_LOG_INFO(
+ db_options_.info_log,
+ "ErrorHandler: Set background IO error as unrecoverable error\n");
+ EventHelpers::NotifyOnBackgroundError(db_options_.listeners, reason,
+ &bg_err, db_mutex_, &auto_recovery);
+ recover_context_ = context;
+ return bg_error_;
+ } else if (bg_io_err.subcode() != IOStatus::SubCode::kNoSpace &&
+ (bg_io_err.GetScope() ==
+ IOStatus::IOErrorScope::kIOErrorScopeFile ||
+ bg_io_err.GetRetryable())) {
+    // Second, check if the error is a retryable IO error (a file scope IO
+    // error is also treated as a retryable IO error in the RocksDB write
+    // path). If it is a retryable error and its severity is higher than
+    // bg_error_, overwrite bg_error_ with the new error. At the current stage,
+    // a retryable IO error from compaction is treated as a soft error; in
+    // other cases, the retryable IO error is treated as a hard error. Note
+    // that all NoSpace errors should be handled by
+    // SstFileManager::StartErrorRecovery(); therefore, whether they are
+    // retryable or file scope, this logic is bypassed for them.
+ bool auto_recovery = false;
+ EventHelpers::NotifyOnBackgroundError(db_options_.listeners, reason,
+ &new_bg_io_err, db_mutex_,
+ &auto_recovery);
+ if (bg_error_stats_ != nullptr) {
+ RecordTick(bg_error_stats_.get(), ERROR_HANDLER_BG_ERROR_COUNT);
+ RecordTick(bg_error_stats_.get(), ERROR_HANDLER_BG_IO_ERROR_COUNT);
+ RecordTick(bg_error_stats_.get(),
+ ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT);
+ }
+ ROCKS_LOG_INFO(db_options_.info_log,
+ "ErrorHandler: Set background retryable IO error\n");
+ if (BackgroundErrorReason::kCompaction == reason) {
+      // We map the retryable IO error during compaction to a soft error, since
+      // compaction can reschedule itself. We will not set the BG error in
+      // this case.
+      // TODO: find a better way to set or clear the retryable IO error which
+      // happens during the compaction SST file write.
+ if (bg_error_stats_ != nullptr) {
+ RecordTick(bg_error_stats_.get(), ERROR_HANDLER_AUTORESUME_COUNT);
+ }
+ ROCKS_LOG_INFO(
+ db_options_.info_log,
+ "ErrorHandler: Compaction will schedule by itself to resume\n");
+ return bg_error_;
+ } else if (BackgroundErrorReason::kFlushNoWAL == reason ||
+ BackgroundErrorReason::kManifestWriteNoWAL == reason) {
+      // When the BG retryable IO error reason is flush without WAL,
+      // we map it to a soft error. At the same time, all background work
+      // should be stopped except the BG work from recovery. Therefore, we
+      // set soft_error_no_bg_work_ to true. Also, since the DB continues to
+      // receive writes while the BG error is a soft error, to avoid too many
+      // small memtables being generated during auto resume, the flush reason
+      // is set to kErrorRecoveryRetryFlush.
+ Status bg_err(new_bg_io_err, Status::Severity::kSoftError);
+ CheckAndSetRecoveryAndBGError(bg_err);
+ soft_error_no_bg_work_ = true;
+ context.flush_reason = FlushReason::kErrorRecoveryRetryFlush;
+ recover_context_ = context;
+ return StartRecoverFromRetryableBGIOError(bg_io_err);
+ } else {
+ Status bg_err(new_bg_io_err, Status::Severity::kHardError);
+ CheckAndSetRecoveryAndBGError(bg_err);
+ recover_context_ = context;
+ return StartRecoverFromRetryableBGIOError(bg_io_err);
+ }
+ } else {
+ if (bg_error_stats_ != nullptr) {
+ RecordTick(bg_error_stats_.get(), ERROR_HANDLER_BG_IO_ERROR_COUNT);
+ }
+ // HandleKnownErrors() will use recovery_error_, so ignore
+ // recovery_io_error_.
+ // TODO: Do some refactoring and use only one recovery_error_
+ recovery_io_error_.PermitUncheckedError();
+ return HandleKnownErrors(new_bg_io_err, reason);
+ }
+}
+
+Status ErrorHandler::OverrideNoSpaceError(const Status& bg_error,
+ bool* auto_recovery) {
+#ifndef ROCKSDB_LITE
+ if (bg_error.severity() >= Status::Severity::kFatalError) {
+ return bg_error;
+ }
+
+ if (db_options_.sst_file_manager.get() == nullptr) {
+    // We rely on SFM to poll for enough disk space and recover; without one,
+    // auto recovery is not possible.
+ *auto_recovery = false;
+ return bg_error;
+ }
+
+ if (db_options_.allow_2pc &&
+ (bg_error.severity() <= Status::Severity::kSoftError)) {
+ // Don't know how to recover, as the contents of the current WAL file may
+ // be inconsistent, and it may be needed for 2PC. If 2PC is not enabled,
+ // we can just flush the memtable and discard the log
+ *auto_recovery = false;
+ return Status(bg_error, Status::Severity::kFatalError);
+ }
+
+ {
+ uint64_t free_space;
+ if (db_options_.env->GetFreeSpace(db_options_.db_paths[0].path,
+ &free_space) == Status::NotSupported()) {
+ *auto_recovery = false;
+ }
+ }
+
+ return bg_error;
+#else
+ (void)auto_recovery;
+ return Status(bg_error, Status::Severity::kFatalError);
+#endif
+}
+
+void ErrorHandler::RecoverFromNoSpace() {
+#ifndef ROCKSDB_LITE
+ SstFileManagerImpl* sfm =
+ reinterpret_cast<SstFileManagerImpl*>(db_options_.sst_file_manager.get());
+
+ // Inform SFM of the error, so it can kick-off the recovery
+ if (sfm) {
+ sfm->StartErrorRecovery(this, bg_error_);
+ }
+#endif
+}
+
+Status ErrorHandler::ClearBGError() {
+#ifndef ROCKSDB_LITE
+ db_mutex_->AssertHeld();
+
+ // Signal that recovery succeeded
+ if (recovery_error_.ok()) {
+ Status old_bg_error = bg_error_;
+ // old_bg_error is only for notifying listeners, so may not be checked
+ old_bg_error.PermitUncheckedError();
+ // Clear and check the recovery IO and BG error
+ bg_error_ = Status::OK();
+ recovery_io_error_ = IOStatus::OK();
+ bg_error_.PermitUncheckedError();
+ recovery_io_error_.PermitUncheckedError();
+ recovery_in_prog_ = false;
+ soft_error_no_bg_work_ = false;
+ EventHelpers::NotifyOnErrorRecoveryEnd(db_options_.listeners, old_bg_error,
+ bg_error_, db_mutex_);
+ }
+ return recovery_error_;
+#else
+ return bg_error_;
+#endif
+}
+
+Status ErrorHandler::RecoverFromBGError(bool is_manual) {
+#ifndef ROCKSDB_LITE
+ InstrumentedMutexLock l(db_mutex_);
+ bool no_bg_work_original_flag = soft_error_no_bg_work_;
+ if (is_manual) {
+    // If it is a manual recovery and there's a background recovery in
+    // progress, return a busy status.
+ if (recovery_in_prog_) {
+ return Status::Busy();
+ }
+ recovery_in_prog_ = true;
+
+    // In manual resume, we allow the bg work to run. If it is an auto resume,
+    // the bg work should respect this flag.
+ soft_error_no_bg_work_ = false;
+
+    // In manual resume, if the bg error is a soft error that also requires
+    // no bg work, the error must be recovered by calling flush with
+    // flush reason kErrorRecoveryRetryFlush. Otherwise, the flush
+    // reason is set to kErrorRecovery.
+ if (no_bg_work_original_flag) {
+ recover_context_.flush_reason = FlushReason::kErrorRecoveryRetryFlush;
+ } else {
+ recover_context_.flush_reason = FlushReason::kErrorRecovery;
+ }
+ }
+
+ if (bg_error_.severity() == Status::Severity::kSoftError &&
+ recover_context_.flush_reason == FlushReason::kErrorRecovery) {
+ // Simply clear the background error and return
+ recovery_error_ = Status::OK();
+ return ClearBGError();
+ }
+
+ // Reset recovery_error_. We will use this to record any errors that happen
+ // during the recovery process. While recovering, the only operations that
+ // can generate background errors should be the flush operations
+ recovery_error_ = Status::OK();
+ recovery_error_.PermitUncheckedError();
+ Status s = db_->ResumeImpl(recover_context_);
+ if (s.ok()) {
+ soft_error_no_bg_work_ = false;
+ } else {
+ soft_error_no_bg_work_ = no_bg_work_original_flag;
+ }
+
+  // For manual recovery, shutdown, and fatal error cases, set
+  // recovery_in_prog_ to false. For automatic background recovery, leave it
+  // as is regardless of success or failure, as it will be retried.
+ if (is_manual || s.IsShutdownInProgress() ||
+ bg_error_.severity() >= Status::Severity::kFatalError) {
+ recovery_in_prog_ = false;
+ }
+ return s;
+#else
+ (void)is_manual;
+ return bg_error_;
+#endif
+}
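+
+// From an application's point of view, the manual path above is typically
+// reached via DB::Resume() (see dbfull()->Resume() in the tests); exactly how
+// DBImpl forwards that call is outside this file, so the snippet below is
+// only a hedged usage sketch.
+//
+//   Status s = db->Resume();   // attempt a manual recovery from bg_error_
+//   if (s.IsBusy()) {
+//     // a background recovery is already in progress (see the check above)
+//   }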
+
+const Status& ErrorHandler::StartRecoverFromRetryableBGIOError(
+ const IOStatus& io_error) {
+#ifndef ROCKSDB_LITE
+ db_mutex_->AssertHeld();
+ if (bg_error_.ok()) {
+ return bg_error_;
+ } else if (io_error.ok()) {
+ return kOkStatus;
+ } else if (db_options_.max_bgerror_resume_count <= 0 || recovery_in_prog_) {
+    // Auto resume is not enabled, or a recovery is already in progress;
+    // directly return bg_error_.
+ return bg_error_;
+ }
+ if (bg_error_stats_ != nullptr) {
+ RecordTick(bg_error_stats_.get(), ERROR_HANDLER_AUTORESUME_COUNT);
+ }
+ ROCKS_LOG_INFO(
+ db_options_.info_log,
+ "ErrorHandler: Call StartRecoverFromRetryableBGIOError to resume\n");
+ if (recovery_thread_) {
+    // In this case, if recovery_in_prog_ is false, the current thread should
+    // wait for the previous recovery thread to finish and create a new thread
+    // to recover from the bg error.
+ db_mutex_->Unlock();
+ recovery_thread_->join();
+ db_mutex_->Lock();
+ }
+
+ recovery_in_prog_ = true;
+ recovery_thread_.reset(
+ new port::Thread(&ErrorHandler::RecoverFromRetryableBGIOError, this));
+
+ if (recovery_io_error_.ok() && recovery_error_.ok()) {
+ return recovery_error_;
+ } else {
+ return bg_error_;
+ }
+#else
+ (void)io_error;
+ return bg_error_;
+#endif
+}
+
+// Automatic recovery from a retryable BG IO error. Must be called after the
+// db mutex is released.
+void ErrorHandler::RecoverFromRetryableBGIOError() {
+#ifndef ROCKSDB_LITE
+ TEST_SYNC_POINT("RecoverFromRetryableBGIOError:BeforeStart");
+ InstrumentedMutexLock l(db_mutex_);
+ if (end_recovery_) {
+ EventHelpers::NotifyOnErrorRecoveryEnd(db_options_.listeners, bg_error_,
+ Status::ShutdownInProgress(),
+ db_mutex_);
+ return;
+ }
+ DBRecoverContext context = recover_context_;
+ int resume_count = db_options_.max_bgerror_resume_count;
+ uint64_t wait_interval = db_options_.bgerror_resume_retry_interval;
+ uint64_t retry_count = 0;
+  // Recover from the retryable error in the loop below. This function runs
+  // on the dedicated thread created by StartRecoverFromRetryableBGIOError().
+ while (resume_count > 0) {
+ if (end_recovery_) {
+ EventHelpers::NotifyOnErrorRecoveryEnd(db_options_.listeners, bg_error_,
+ Status::ShutdownInProgress(),
+ db_mutex_);
+ return;
+ }
+ TEST_SYNC_POINT("RecoverFromRetryableBGIOError:BeforeResume0");
+ TEST_SYNC_POINT("RecoverFromRetryableBGIOError:BeforeResume1");
+ recovery_io_error_ = IOStatus::OK();
+ recovery_error_ = Status::OK();
+ retry_count++;
+ Status s = db_->ResumeImpl(context);
+ if (bg_error_stats_ != nullptr) {
+ RecordTick(bg_error_stats_.get(),
+ ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT);
+ }
+ if (s.IsShutdownInProgress() ||
+ bg_error_.severity() >= Status::Severity::kFatalError) {
+      // If DB shutdown is in progress or the error severity is higher than
+      // hard error, stop auto resume and return.
+ recovery_in_prog_ = false;
+ if (bg_error_stats_ != nullptr) {
+ RecordInHistogram(bg_error_stats_.get(),
+ ERROR_HANDLER_AUTORESUME_RETRY_COUNT, retry_count);
+ }
+ EventHelpers::NotifyOnErrorRecoveryEnd(db_options_.listeners, bg_error_,
+ bg_error_, db_mutex_);
+ return;
+ }
+ if (!recovery_io_error_.ok() &&
+ recovery_error_.severity() <= Status::Severity::kHardError &&
+ recovery_io_error_.GetRetryable()) {
+      // If a new BG IO error happens during auto recovery and it is retryable
+      // and its severity is hard error or lower, auto resume sleeps for
+      // a period of time and retries if allowed.
+ TEST_SYNC_POINT("RecoverFromRetryableBGIOError:BeforeWait0");
+ TEST_SYNC_POINT("RecoverFromRetryableBGIOError:BeforeWait1");
+ int64_t wait_until = db_options_.clock->NowMicros() + wait_interval;
+ cv_.TimedWait(wait_until);
+ } else {
+      // There are three possibilities: 1) recovery_io_error_ is set during
+      // resume and the error is not retryable, 2) recovery is successful, 3)
+      // another error happens during resume and cannot be resumed here.
+ if (recovery_io_error_.ok() && recovery_error_.ok() && s.ok()) {
+        // Recovered from the retryable IO error with no other BG errors. Clear
+        // bg_error_ and notify the user.
+ TEST_SYNC_POINT("RecoverFromRetryableBGIOError:RecoverSuccess");
+ Status old_bg_error = bg_error_;
+ is_db_stopped_.store(false, std::memory_order_release);
+ bg_error_ = Status::OK();
+ bg_error_.PermitUncheckedError();
+ EventHelpers::NotifyOnErrorRecoveryEnd(
+ db_options_.listeners, old_bg_error, bg_error_, db_mutex_);
+ if (bg_error_stats_ != nullptr) {
+ RecordTick(bg_error_stats_.get(),
+ ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT);
+ RecordInHistogram(bg_error_stats_.get(),
+ ERROR_HANDLER_AUTORESUME_RETRY_COUNT, retry_count);
+ }
+ recovery_in_prog_ = false;
+ if (soft_error_no_bg_work_) {
+ soft_error_no_bg_work_ = false;
+ }
+ return;
+ } else {
+        // In this case, either 1) recovery_io_error_ is more serious or not
+        // retryable, or 2) another non-IO recovery_error_ happens. The auto
+        // recovery stops.
+ recovery_in_prog_ = false;
+ if (bg_error_stats_ != nullptr) {
+ RecordInHistogram(bg_error_stats_.get(),
+ ERROR_HANDLER_AUTORESUME_RETRY_COUNT, retry_count);
+ }
+ EventHelpers::NotifyOnErrorRecoveryEnd(
+ db_options_.listeners, bg_error_,
+ !recovery_io_error_.ok()
+ ? recovery_io_error_
+ : (!recovery_error_.ok() ? recovery_error_ : s),
+ db_mutex_);
+ return;
+ }
+ }
+ resume_count--;
+ }
+ recovery_in_prog_ = false;
+ EventHelpers::NotifyOnErrorRecoveryEnd(
+ db_options_.listeners, bg_error_,
+ Status::Aborted("Exceeded resume retry count"), db_mutex_);
+ TEST_SYNC_POINT("RecoverFromRetryableBGIOError:LoopOut");
+ if (bg_error_stats_ != nullptr) {
+ RecordInHistogram(bg_error_stats_.get(),
+ ERROR_HANDLER_AUTORESUME_RETRY_COUNT, retry_count);
+ }
+ return;
+#else
+ return;
+#endif
+}
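+
+// The retry loop above is bounded by two DBOptions. A small configuration
+// sketch (the values mirror those used in error_handler_fs_test.cc and are
+// illustrative rather than recommendations):
+//
+//   Options options;
+//   options.max_bgerror_resume_count = 2;            // at most 2 auto-resume attempts
+//   options.bgerror_resume_retry_interval = 100000;  // wait 0.1 second between attempts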
+
+void ErrorHandler::CheckAndSetRecoveryAndBGError(const Status& bg_err) {
+ if (recovery_in_prog_ && recovery_error_.ok()) {
+ recovery_error_ = bg_err;
+ }
+ if (bg_err.severity() > bg_error_.severity()) {
+ bg_error_ = bg_err;
+ }
+ if (bg_error_.severity() >= Status::Severity::kHardError) {
+ is_db_stopped_.store(true, std::memory_order_release);
+ }
+ return;
+}
+
+void ErrorHandler::EndAutoRecovery() {
+ db_mutex_->AssertHeld();
+ if (!end_recovery_) {
+ end_recovery_ = true;
+ }
+ cv_.SignalAll();
+ db_mutex_->Unlock();
+ if (recovery_thread_) {
+ recovery_thread_->join();
+ }
+ db_mutex_->Lock();
+ return;
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/error_handler.h b/src/rocksdb/db/error_handler.h
new file mode 100644
index 000000000..34e08a525
--- /dev/null
+++ b/src/rocksdb/db/error_handler.h
@@ -0,0 +1,124 @@
+// Copyright (c) 2018-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include "monitoring/instrumented_mutex.h"
+#include "options/db_options.h"
+#include "rocksdb/io_status.h"
+#include "rocksdb/listener.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBImpl;
+
+// This structure is used to store the DB recovery context. The context is
+// the information related to the recovery actions. For example, it contains
+// FlushReason, which tells the flush job why the flush is being called.
+struct DBRecoverContext {
+ FlushReason flush_reason;
+
+ DBRecoverContext() : flush_reason(FlushReason::kErrorRecovery) {}
+
+ DBRecoverContext(FlushReason reason) : flush_reason(reason) {}
+};
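+
+// A minimal usage sketch (illustrative only): the context simply carries the
+// flush reason that a recovery-triggered flush should report.
+//
+//   DBRecoverContext ctx;  // defaults to FlushReason::kErrorRecovery
+//   ctx.flush_reason = FlushReason::kErrorRecoveryRetryFlush;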
+
+class ErrorHandler {
+ public:
+ ErrorHandler(DBImpl* db, const ImmutableDBOptions& db_options,
+ InstrumentedMutex* db_mutex)
+ : db_(db),
+ db_options_(db_options),
+ cv_(db_mutex),
+ end_recovery_(false),
+ recovery_thread_(nullptr),
+ db_mutex_(db_mutex),
+ auto_recovery_(false),
+ recovery_in_prog_(false),
+ soft_error_no_bg_work_(false),
+ is_db_stopped_(false),
+ bg_error_stats_(db_options.statistics) {
+ // Clear the checked flag for uninitialized errors
+ bg_error_.PermitUncheckedError();
+ recovery_error_.PermitUncheckedError();
+ recovery_io_error_.PermitUncheckedError();
+ }
+
+ void EnableAutoRecovery() { auto_recovery_ = true; }
+
+ Status::Severity GetErrorSeverity(BackgroundErrorReason reason,
+ Status::Code code, Status::SubCode subcode);
+
+ const Status& SetBGError(const Status& bg_err, BackgroundErrorReason reason);
+
+ Status GetBGError() const { return bg_error_; }
+
+ Status GetRecoveryError() const { return recovery_error_; }
+
+ Status ClearBGError();
+
+ bool IsDBStopped() { return is_db_stopped_.load(std::memory_order_acquire); }
+
+ bool IsBGWorkStopped() {
+ assert(db_mutex_);
+ db_mutex_->AssertHeld();
+ return !bg_error_.ok() &&
+ (bg_error_.severity() >= Status::Severity::kHardError ||
+ !auto_recovery_ || soft_error_no_bg_work_);
+ }
+
+ bool IsSoftErrorNoBGWork() { return soft_error_no_bg_work_; }
+
+ bool IsRecoveryInProgress() { return recovery_in_prog_; }
+
+ Status RecoverFromBGError(bool is_manual = false);
+ void CancelErrorRecovery();
+
+ void EndAutoRecovery();
+
+ private:
+ DBImpl* db_;
+ const ImmutableDBOptions& db_options_;
+ Status bg_error_;
+  // A separate Status variable used to record any errors that occur
+  // during the process of recovering from hard errors
+ Status recovery_error_;
+ // A separate IO Status variable used to record any IO errors during
+ // the recovery process. At the same time, recovery_error_ is also set.
+ IOStatus recovery_io_error_;
+  // The condition variable used with db_mutex_ during auto resume for
+  // timed waits.
+ InstrumentedCondVar cv_;
+ bool end_recovery_;
+ std::unique_ptr<port::Thread> recovery_thread_;
+
+ InstrumentedMutex* db_mutex_;
+ // A flag indicating whether automatic recovery from errors is enabled
+ bool auto_recovery_;
+ bool recovery_in_prog_;
+  // A flag to indicate that, for a soft error, we should not allow any
+  // background work except work that is part of recovery.
+ bool soft_error_no_bg_work_;
+
+  // Used to store the recovery context, such as the flush reason.
+ DBRecoverContext recover_context_;
+ std::atomic<bool> is_db_stopped_;
+
+  // Pointer to the DB statistics.
+ std::shared_ptr<Statistics> bg_error_stats_;
+
+ const Status& HandleKnownErrors(const Status& bg_err,
+ BackgroundErrorReason reason);
+ Status OverrideNoSpaceError(const Status& bg_error, bool* auto_recovery);
+ void RecoverFromNoSpace();
+ const Status& StartRecoverFromRetryableBGIOError(const IOStatus& io_error);
+ void RecoverFromRetryableBGIOError();
+  // First, if recovery is in progress and recovery_error_ is ok, set
+  // recovery_error_ to bg_err. Second, if the severity of bg_err is higher
+  // than that of the current bg_error_, overwrite bg_error_ with it.
+ void CheckAndSetRecoveryAndBGError(const Status& bg_err);
+};
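+
+// A rough sketch of how DBImpl is expected to drive this class. The call
+// sites below are hypothetical (the real wiring lives in db_impl*.cc, not
+// shown here); they only illustrate the public surface declared above.
+//
+//   // A background job hits an error while holding the DB mutex:
+//   error_handler_.SetBGError(io_status, BackgroundErrorReason::kFlush);
+//
+//   // Scheduling checks before starting new background work or writes:
+//   if (error_handler_.IsDBStopped() || error_handler_.IsBGWorkStopped()) {
+//     // back off until recovery clears bg_error_
+//   }
+//
+//   // Manual resume path:
+//   Status s = error_handler_.RecoverFromBGError(/*is_manual=*/true);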
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/error_handler_fs_test.cc b/src/rocksdb/db/error_handler_fs_test.cc
new file mode 100644
index 000000000..153f3b79e
--- /dev/null
+++ b/src/rocksdb/db/error_handler_fs_test.cc
@@ -0,0 +1,2875 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#ifndef ROCKSDB_LITE
+
+#include "db/db_test_util.h"
+#include "file/sst_file_manager_impl.h"
+#include "port/stack_trace.h"
+#include "rocksdb/io_status.h"
+#include "rocksdb/sst_file_manager.h"
+#if !defined(ROCKSDB_LITE)
+#include "test_util/sync_point.h"
+#endif
+#include "util/random.h"
+#include "utilities/fault_injection_env.h"
+#include "utilities/fault_injection_fs.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBErrorHandlingFSTest : public DBTestBase {
+ public:
+ DBErrorHandlingFSTest()
+ : DBTestBase("db_error_handling_fs_test", /*env_do_fsync=*/true) {
+ fault_fs_.reset(new FaultInjectionTestFS(env_->GetFileSystem()));
+ fault_env_.reset(new CompositeEnvWrapper(env_, fault_fs_));
+ }
+
+ std::string GetManifestNameFromLiveFiles() {
+ std::vector<std::string> live_files;
+ uint64_t manifest_size;
+
+ Status s = dbfull()->GetLiveFiles(live_files, &manifest_size, false);
+ if (!s.ok()) {
+ return "";
+ }
+ for (auto& file : live_files) {
+ uint64_t num = 0;
+ FileType type;
+ if (ParseFileName(file, &num, &type) && type == kDescriptorFile) {
+ return file;
+ }
+ }
+ return "";
+ }
+
+ std::shared_ptr<FaultInjectionTestFS> fault_fs_;
+ std::unique_ptr<Env> fault_env_;
+};
+
+class ErrorHandlerFSListener : public EventListener {
+ public:
+ ErrorHandlerFSListener()
+ : mutex_(),
+ cv_(&mutex_),
+ no_auto_recovery_(false),
+ recovery_complete_(false),
+ file_creation_started_(false),
+ override_bg_error_(false),
+ file_count_(0),
+ fault_fs_(nullptr) {}
+ ~ErrorHandlerFSListener() {
+ file_creation_error_.PermitUncheckedError();
+ bg_error_.PermitUncheckedError();
+ new_bg_error_.PermitUncheckedError();
+ }
+
+ void OnTableFileCreationStarted(
+ const TableFileCreationBriefInfo& /*ti*/) override {
+ InstrumentedMutexLock l(&mutex_);
+ file_creation_started_ = true;
+ if (file_count_ > 0) {
+ if (--file_count_ == 0) {
+ fault_fs_->SetFilesystemActive(false, file_creation_error_);
+ file_creation_error_ = IOStatus::OK();
+ }
+ }
+ cv_.SignalAll();
+ }
+
+ void OnErrorRecoveryBegin(BackgroundErrorReason /*reason*/, Status bg_error,
+ bool* auto_recovery) override {
+ bg_error.PermitUncheckedError();
+ if (*auto_recovery && no_auto_recovery_) {
+ *auto_recovery = false;
+ }
+ }
+
+ void OnErrorRecoveryEnd(const BackgroundErrorRecoveryInfo& info) override {
+ InstrumentedMutexLock l(&mutex_);
+ recovery_complete_ = true;
+ cv_.SignalAll();
+ new_bg_error_ = info.new_bg_error;
+ }
+
+ bool WaitForRecovery(uint64_t /*abs_time_us*/) {
+ InstrumentedMutexLock l(&mutex_);
+ while (!recovery_complete_) {
+ cv_.Wait(/*abs_time_us*/);
+ }
+ if (recovery_complete_) {
+ recovery_complete_ = false;
+ return true;
+ }
+ return false;
+ }
+
+ void WaitForTableFileCreationStarted(uint64_t /*abs_time_us*/) {
+ InstrumentedMutexLock l(&mutex_);
+ while (!file_creation_started_) {
+ cv_.Wait(/*abs_time_us*/);
+ }
+ file_creation_started_ = false;
+ }
+
+ void OnBackgroundError(BackgroundErrorReason /*reason*/,
+ Status* bg_error) override {
+ if (override_bg_error_) {
+ *bg_error = bg_error_;
+ override_bg_error_ = false;
+ }
+ }
+
+ void EnableAutoRecovery(bool enable = true) { no_auto_recovery_ = !enable; }
+
+ void OverrideBGError(Status bg_err) {
+ bg_error_ = bg_err;
+ override_bg_error_ = true;
+ }
+
+ void InjectFileCreationError(FaultInjectionTestFS* fs, int file_count,
+ IOStatus io_s) {
+ fault_fs_ = fs;
+ file_count_ = file_count;
+ file_creation_error_ = io_s;
+ }
+
+ Status new_bg_error() { return new_bg_error_; }
+
+ private:
+ InstrumentedMutex mutex_;
+ InstrumentedCondVar cv_;
+ bool no_auto_recovery_;
+ bool recovery_complete_;
+ bool file_creation_started_;
+ bool override_bg_error_;
+ int file_count_;
+ IOStatus file_creation_error_;
+ Status bg_error_;
+ Status new_bg_error_;
+ FaultInjectionTestFS* fault_fs_;
+};
+
+TEST_F(DBErrorHandlingFSTest, FLushWriteError) {
+ std::shared_ptr<ErrorHandlerFSListener> listener(
+ new ErrorHandlerFSListener());
+ Options options = GetDefaultOptions();
+ options.env = fault_env_.get();
+ options.create_if_missing = true;
+ options.listeners.emplace_back(listener);
+ options.statistics = CreateDBStatistics();
+ Status s;
+
+ listener->EnableAutoRecovery(false);
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put(Key(0), "val"));
+ SyncPoint::GetInstance()->SetCallBack("FlushJob::Start", [&](void*) {
+ fault_fs_->SetFilesystemActive(false, IOStatus::NoSpace("Out of space"));
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ s = Flush();
+ ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError);
+ SyncPoint::GetInstance()->DisableProcessing();
+ fault_fs_->SetFilesystemActive(true);
+ s = dbfull()->Resume();
+ ASSERT_OK(s);
+ ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
+ ERROR_HANDLER_BG_ERROR_COUNT));
+ ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
+ ERROR_HANDLER_BG_IO_ERROR_COUNT));
+ ASSERT_EQ(0, options.statistics->getAndResetTickerCount(
+ ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT));
+ ASSERT_EQ(0, options.statistics->getAndResetTickerCount(
+ ERROR_HANDLER_AUTORESUME_COUNT));
+ ASSERT_EQ(0, options.statistics->getAndResetTickerCount(
+ ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT));
+ ASSERT_EQ(0, options.statistics->getAndResetTickerCount(
+ ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT));
+
+ Reopen(options);
+ ASSERT_EQ("val", Get(Key(0)));
+ Destroy(options);
+}
+
+// All NoSpace IOErrors are handled as regular BG errors regardless of whether
+// the retryable flag is set or not, so auto resume for retryable IO errors
+// will not be triggered. Also, the error is mapped to a hard error.
+TEST_F(DBErrorHandlingFSTest, FLushWriteNoSpaceError) {
+ std::shared_ptr<ErrorHandlerFSListener> listener(
+ new ErrorHandlerFSListener());
+ Options options = GetDefaultOptions();
+ options.env = fault_env_.get();
+ options.create_if_missing = true;
+ options.listeners.emplace_back(listener);
+ options.max_bgerror_resume_count = 2;
+ options.bgerror_resume_retry_interval = 100000; // 0.1 second
+ options.statistics = CreateDBStatistics();
+ Status s;
+
+ listener->EnableAutoRecovery(false);
+ DestroyAndReopen(options);
+
+ IOStatus error_msg = IOStatus::NoSpace("Retryable IO Error");
+ error_msg.SetRetryable(true);
+
+ ASSERT_OK(Put(Key(1), "val1"));
+ SyncPoint::GetInstance()->SetCallBack(
+ "BuildTable:BeforeFinishBuildTable",
+ [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); });
+ SyncPoint::GetInstance()->EnableProcessing();
+ s = Flush();
+ ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError);
+ SyncPoint::GetInstance()->DisableProcessing();
+ fault_fs_->SetFilesystemActive(true);
+ s = dbfull()->Resume();
+ ASSERT_OK(s);
+ ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
+ ERROR_HANDLER_BG_ERROR_COUNT));
+ ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
+ ERROR_HANDLER_BG_IO_ERROR_COUNT));
+ ASSERT_EQ(0, options.statistics->getAndResetTickerCount(
+ ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT));
+ ASSERT_EQ(0, options.statistics->getAndResetTickerCount(
+ ERROR_HANDLER_AUTORESUME_COUNT));
+ ASSERT_EQ(0, options.statistics->getAndResetTickerCount(
+ ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT));
+ ASSERT_EQ(0, options.statistics->getAndResetTickerCount(
+ ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT));
+ Destroy(options);
+}
+
+TEST_F(DBErrorHandlingFSTest, FLushWriteRetryableError) {
+ std::shared_ptr<ErrorHandlerFSListener> listener(
+ new ErrorHandlerFSListener());
+ Options options = GetDefaultOptions();
+ options.env = fault_env_.get();
+ options.create_if_missing = true;
+ options.listeners.emplace_back(listener);
+ options.max_bgerror_resume_count = 0;
+ options.statistics = CreateDBStatistics();
+ Status s;
+
+ listener->EnableAutoRecovery(false);
+ DestroyAndReopen(options);
+
+ IOStatus error_msg = IOStatus::IOError("Retryable IO Error");
+ error_msg.SetRetryable(true);
+
+ ASSERT_OK(Put(Key(1), "val1"));
+ SyncPoint::GetInstance()->SetCallBack(
+ "BuildTable:BeforeFinishBuildTable",
+ [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); });
+ SyncPoint::GetInstance()->EnableProcessing();
+ s = Flush();
+ ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError);
+ SyncPoint::GetInstance()->DisableProcessing();
+ fault_fs_->SetFilesystemActive(true);
+ s = dbfull()->Resume();
+ ASSERT_OK(s);
+ ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
+ ERROR_HANDLER_BG_ERROR_COUNT));
+ ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
+ ERROR_HANDLER_BG_IO_ERROR_COUNT));
+ ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
+ ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT));
+ ASSERT_EQ(0, options.statistics->getAndResetTickerCount(
+ ERROR_HANDLER_AUTORESUME_COUNT));
+ ASSERT_EQ(0, options.statistics->getAndResetTickerCount(
+ ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT));
+ ASSERT_EQ(0, options.statistics->getAndResetTickerCount(
+ ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT));
+ Reopen(options);
+ ASSERT_EQ("val1", Get(Key(1)));
+
+ ASSERT_OK(Put(Key(2), "val2"));
+ SyncPoint::GetInstance()->SetCallBack(
+ "BuildTable:BeforeSyncTable",
+ [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); });
+ SyncPoint::GetInstance()->EnableProcessing();
+ s = Flush();
+ ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError);
+ SyncPoint::GetInstance()->DisableProcessing();
+ fault_fs_->SetFilesystemActive(true);
+ s = dbfull()->Resume();
+ ASSERT_OK(s);
+ Reopen(options);
+ ASSERT_EQ("val2", Get(Key(2)));
+
+ ASSERT_OK(Put(Key(3), "val3"));
+ SyncPoint::GetInstance()->SetCallBack(
+ "BuildTable:BeforeCloseTableFile",
+ [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); });
+ SyncPoint::GetInstance()->EnableProcessing();
+ s = Flush();
+ ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError);
+ SyncPoint::GetInstance()->DisableProcessing();
+ fault_fs_->SetFilesystemActive(true);
+ s = dbfull()->Resume();
+ ASSERT_OK(s);
+ Reopen(options);
+ ASSERT_EQ("val3", Get(Key(3)));
+
+ Destroy(options);
+}
+
+TEST_F(DBErrorHandlingFSTest, FLushWriteFileScopeError) {
+ std::shared_ptr<ErrorHandlerFSListener> listener(
+ new ErrorHandlerFSListener());
+ Options options = GetDefaultOptions();
+ options.env = fault_env_.get();
+ options.create_if_missing = true;
+ options.listeners.emplace_back(listener);
+ options.max_bgerror_resume_count = 0;
+ Status s;
+
+ listener->EnableAutoRecovery(false);
+ DestroyAndReopen(options);
+
+ IOStatus error_msg = IOStatus::IOError("File Scope Data Loss Error");
+ error_msg.SetDataLoss(true);
+ error_msg.SetScope(
+ ROCKSDB_NAMESPACE::IOStatus::IOErrorScope::kIOErrorScopeFile);
+ error_msg.SetRetryable(false);
+
+ ASSERT_OK(Put(Key(1), "val1"));
+ SyncPoint::GetInstance()->SetCallBack(
+ "BuildTable:BeforeFinishBuildTable",
+ [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); });
+ SyncPoint::GetInstance()->EnableProcessing();
+ s = Flush();
+ ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError);
+ SyncPoint::GetInstance()->DisableProcessing();
+ fault_fs_->SetFilesystemActive(true);
+ s = dbfull()->Resume();
+ ASSERT_OK(s);
+ Reopen(options);
+ ASSERT_EQ("val1", Get(Key(1)));
+
+ ASSERT_OK(Put(Key(2), "val2"));
+ SyncPoint::GetInstance()->SetCallBack(
+ "BuildTable:BeforeSyncTable",
+ [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); });
+ SyncPoint::GetInstance()->EnableProcessing();
+ s = Flush();
+ ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError);
+ SyncPoint::GetInstance()->DisableProcessing();
+ fault_fs_->SetFilesystemActive(true);
+ s = dbfull()->Resume();
+ ASSERT_OK(s);
+ Reopen(options);
+ ASSERT_EQ("val2", Get(Key(2)));
+
+ ASSERT_OK(Put(Key(3), "val3"));
+ SyncPoint::GetInstance()->SetCallBack(
+ "BuildTable:BeforeCloseTableFile",
+ [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); });
+ SyncPoint::GetInstance()->EnableProcessing();
+ s = Flush();
+ ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError);
+ SyncPoint::GetInstance()->DisableProcessing();
+ fault_fs_->SetFilesystemActive(true);
+ s = dbfull()->Resume();
+ ASSERT_OK(s);
+ Reopen(options);
+ ASSERT_EQ("val3", Get(Key(3)));
+
+  // Not file scope, but the retryable flag is set
+ error_msg.SetDataLoss(false);
+ error_msg.SetScope(
+ ROCKSDB_NAMESPACE::IOStatus::IOErrorScope::kIOErrorScopeFileSystem);
+ error_msg.SetRetryable(true);
+
+ ASSERT_OK(Put(Key(3), "val3"));
+ SyncPoint::GetInstance()->SetCallBack(
+ "BuildTable:BeforeCloseTableFile",
+ [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); });
+ SyncPoint::GetInstance()->EnableProcessing();
+ s = Flush();
+ ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError);
+ SyncPoint::GetInstance()->DisableProcessing();
+ fault_fs_->SetFilesystemActive(true);
+ s = dbfull()->Resume();
+ ASSERT_OK(s);
+ Reopen(options);
+ ASSERT_EQ("val3", Get(Key(3)));
+
+ Destroy(options);
+}
+
+TEST_F(DBErrorHandlingFSTest, FLushWALWriteRetryableError) {
+ std::shared_ptr<ErrorHandlerFSListener> listener(
+ new ErrorHandlerFSListener());
+ Options options = GetDefaultOptions();
+ options.env = fault_env_.get();
+ options.create_if_missing = true;
+ options.listeners.emplace_back(listener);
+ options.max_bgerror_resume_count = 0;
+ Status s;
+
+ IOStatus error_msg = IOStatus::IOError("Retryable IO Error");
+ error_msg.SetRetryable(true);
+
+ listener->EnableAutoRecovery(false);
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::SyncClosedLogs:Start",
+ [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ CreateAndReopenWithCF({"pikachu, sdfsdfsdf"}, options);
+
+ WriteOptions wo = WriteOptions();
+ wo.disableWAL = false;
+ ASSERT_OK(Put(Key(1), "val1", wo));
+
+ s = Flush();
+ ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError);
+ SyncPoint::GetInstance()->DisableProcessing();
+ fault_fs_->SetFilesystemActive(true);
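+  // Drop the extra column family before calling Resume(); the drop status is
+  // not checked.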
+ auto cfh = dbfull()->GetColumnFamilyHandle(1);
+ s = dbfull()->DropColumnFamily(cfh);
+
+ s = dbfull()->Resume();
+ ASSERT_OK(s);
+ ASSERT_EQ("val1", Get(Key(1)));
+ ASSERT_OK(Put(Key(3), "val3", wo));
+ ASSERT_EQ("val3", Get(Key(3)));
+ s = Flush();
+ ASSERT_OK(s);
+ ASSERT_EQ("val3", Get(Key(3)));
+
+ Destroy(options);
+}
+
+TEST_F(DBErrorHandlingFSTest, FLushWALAtomicWriteRetryableError) {
+ std::shared_ptr<ErrorHandlerFSListener> listener(
+ new ErrorHandlerFSListener());
+ Options options = GetDefaultOptions();
+ options.env = fault_env_.get();
+ options.create_if_missing = true;
+ options.listeners.emplace_back(listener);
+ options.max_bgerror_resume_count = 0;
+ options.atomic_flush = true;
+ Status s;
+
+ IOStatus error_msg = IOStatus::IOError("Retryable IO Error");
+ error_msg.SetRetryable(true);
+
+ listener->EnableAutoRecovery(false);
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::SyncClosedLogs:Start",
+ [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ CreateAndReopenWithCF({"pikachu, sdfsdfsdf"}, options);
+
+ WriteOptions wo = WriteOptions();
+ wo.disableWAL = false;
+ ASSERT_OK(Put(Key(1), "val1", wo));
+
+ s = Flush();
+ ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError);
+ SyncPoint::GetInstance()->DisableProcessing();
+ fault_fs_->SetFilesystemActive(true);
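+  // As in the non-atomic test above, drop the extra column family before
+  // calling Resume().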
+ auto cfh = dbfull()->GetColumnFamilyHandle(1);
+ s = dbfull()->DropColumnFamily(cfh);
+
+ s = dbfull()->Resume();
+ ASSERT_OK(s);
+ ASSERT_EQ("val1", Get(Key(1)));
+ ASSERT_OK(Put(Key(3), "val3", wo));
+ ASSERT_EQ("val3", Get(Key(3)));
+ s = Flush();
+ ASSERT_OK(s);
+ ASSERT_EQ("val3", Get(Key(3)));
+
+ Destroy(options);
+}
+
+// The flush error is injected before we finish the table build
+TEST_F(DBErrorHandlingFSTest, FLushWritNoWALRetryableError1) {
+ std::shared_ptr<ErrorHandlerFSListener> listener(
+ new ErrorHandlerFSListener());
+ Options options = GetDefaultOptions();
+ options.env = fault_env_.get();
+ options.create_if_missing = true;
+ options.listeners.emplace_back(listener);
+ options.max_bgerror_resume_count = 0;
+ options.statistics = CreateDBStatistics();
+ Status s;
+
+ listener->EnableAutoRecovery(false);
+ DestroyAndReopen(options);
+
+ IOStatus error_msg = IOStatus::IOError("Retryable IO Error");
+ error_msg.SetRetryable(true);
+
+ WriteOptions wo = WriteOptions();
+ wo.disableWAL = true;
+ ASSERT_OK(Put(Key(1), "val1", wo));
+ SyncPoint::GetInstance()->SetCallBack(
+ "BuildTable:BeforeFinishBuildTable",
+ [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); });
+ SyncPoint::GetInstance()->EnableProcessing();
+ s = Flush();
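+  // The flush fails with a soft error (asserted below), so foreground writes
+  // are still accepted before Resume().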
+ ASSERT_OK(Put(Key(2), "val2", wo));
+ ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError);
+ ASSERT_EQ("val2", Get(Key(2)));
+ SyncPoint::GetInstance()->DisableProcessing();
+ fault_fs_->SetFilesystemActive(true);
+ s = dbfull()->Resume();
+ ASSERT_OK(s);
+ ASSERT_EQ("val1", Get(Key(1)));
+ ASSERT_EQ("val2", Get(Key(2)));
+ ASSERT_OK(Put(Key(3), "val3", wo));
+ ASSERT_EQ("val3", Get(Key(3)));
+ s = Flush();
+ ASSERT_OK(s);
+ ASSERT_EQ("val3", Get(Key(3)));
+ ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
+ ERROR_HANDLER_BG_ERROR_COUNT));
+ ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
+ ERROR_HANDLER_BG_IO_ERROR_COUNT));
+ ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
+ ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT));
+ ASSERT_EQ(0, options.statistics->getAndResetTickerCount(
+ ERROR_HANDLER_AUTORESUME_COUNT));
+ ASSERT_EQ(0, options.statistics->getAndResetTickerCount(
+ ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT));
+ ASSERT_EQ(0, options.statistics->getAndResetTickerCount(
+ ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT));
+
+ Destroy(options);
+}
+
+// The retryable IO error is injected before we sync the table
+TEST_F(DBErrorHandlingFSTest, FLushWriteNoWALRetryableError2) {
+ std::shared_ptr<ErrorHandlerFSListener> listener(
+ new ErrorHandlerFSListener());
+ Options options = GetDefaultOptions();
+ options.env = fault_env_.get();
+ options.create_if_missing = true;
+ options.listeners.emplace_back(listener);
+ options.max_bgerror_resume_count = 0;
+ Status s;
+
+ listener->EnableAutoRecovery(false);
+ DestroyAndReopen(options);
+
+ IOStatus error_msg = IOStatus::IOError("Retryable IO Error");
+ error_msg.SetRetryable(true);
+
+ WriteOptions wo = WriteOptions();
+ wo.disableWAL = true;
+
+ ASSERT_OK(Put(Key(1), "val1", wo));
+ SyncPoint::GetInstance()->SetCallBack(
+ "BuildTable:BeforeSyncTable",
+ [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); });
+ SyncPoint::GetInstance()->EnableProcessing();
+ s = Flush();
+ ASSERT_OK(Put(Key(2), "val2", wo));
+ ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError);
+ ASSERT_EQ("val2", Get(Key(2)));
+ SyncPoint::GetInstance()->DisableProcessing();
+ fault_fs_->SetFilesystemActive(true);
+ s = dbfull()->Resume();
+ ASSERT_OK(s);
+ ASSERT_EQ("val1", Get(Key(1)));
+ ASSERT_EQ("val2", Get(Key(2)));
+ ASSERT_OK(Put(Key(3), "val3", wo));
+ ASSERT_EQ("val3", Get(Key(3)));
+ s = Flush();
+ ASSERT_OK(s);
+ ASSERT_EQ("val3", Get(Key(3)));
+
+ Destroy(options);
+}
+
+// The retryable IO error is injected before we close the table file
+TEST_F(DBErrorHandlingFSTest, FLushWriteNoWALRetryableError3) {
+ std::shared_ptr<ErrorHandlerFSListener> listener(
+ new ErrorHandlerFSListener());
+ Options options = GetDefaultOptions();
+ options.env = fault_env_.get();
+ options.create_if_missing = true;
+ options.listeners.emplace_back(listener);
+ options.max_bgerror_resume_count = 0;
+ Status s;
+
+ listener->EnableAutoRecovery(false);
+ DestroyAndReopen(options);
+
+ IOStatus error_msg = IOStatus::IOError("Retryable IO Error");
+ error_msg.SetRetryable(true);
+
+ WriteOptions wo = WriteOptions();
+ wo.disableWAL = true;
+
+ ASSERT_OK(Put(Key(1), "val1", wo));
+ SyncPoint::GetInstance()->SetCallBack(
+ "BuildTable:BeforeCloseTableFile",
+ [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); });
+ SyncPoint::GetInstance()->EnableProcessing();
+ s = Flush();
+ ASSERT_OK(Put(Key(2), "val2", wo));
+ ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError);
+ ASSERT_EQ("val2", Get(Key(2)));
+ SyncPoint::GetInstance()->DisableProcessing();
+ fault_fs_->SetFilesystemActive(true);
+ s = dbfull()->Resume();
+ ASSERT_OK(s);
+ ASSERT_EQ("val1", Get(Key(1)));
+ ASSERT_EQ("val2", Get(Key(2)));
+ ASSERT_OK(Put(Key(3), "val3", wo));
+ ASSERT_EQ("val3", Get(Key(3)));
+ s = Flush();
+ ASSERT_OK(s);
+ ASSERT_EQ("val3", Get(Key(3)));
+
+ Destroy(options);
+}
+
+TEST_F(DBErrorHandlingFSTest, ManifestWriteError) {
+ std::shared_ptr<ErrorHandlerFSListener> listener(
+ new ErrorHandlerFSListener());
+ Options options = GetDefaultOptions();
+ options.env = fault_env_.get();
+ options.create_if_missing = true;
+ options.listeners.emplace_back(listener);
+ Status s;
+ std::string old_manifest;
+ std::string new_manifest;
+
+ listener->EnableAutoRecovery(false);
+ DestroyAndReopen(options);
+ old_manifest = GetManifestNameFromLiveFiles();
+
+ ASSERT_OK(Put(Key(0), "val"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put(Key(1), "val"));
+ SyncPoint::GetInstance()->SetCallBack(
+ "VersionSet::LogAndApply:WriteManifest", [&](void*) {
+ fault_fs_->SetFilesystemActive(false,
+ IOStatus::NoSpace("Out of space"));
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ s = Flush();
+ ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError);
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->DisableProcessing();
+ fault_fs_->SetFilesystemActive(true);
+ s = dbfull()->Resume();
+ ASSERT_OK(s);
+
+ new_manifest = GetManifestNameFromLiveFiles();
+ ASSERT_NE(new_manifest, old_manifest);
+
+ Reopen(options);
+ ASSERT_EQ("val", Get(Key(0)));
+ ASSERT_EQ("val", Get(Key(1)));
+ Close();
+}
+
+TEST_F(DBErrorHandlingFSTest, ManifestWriteRetryableError) {
+ std::shared_ptr<ErrorHandlerFSListener> listener(
+ new ErrorHandlerFSListener());
+ Options options = GetDefaultOptions();
+ options.env = fault_env_.get();
+ options.create_if_missing = true;
+ options.listeners.emplace_back(listener);
+ options.max_bgerror_resume_count = 0;
+ Status s;
+ std::string old_manifest;
+ std::string new_manifest;
+
+ listener->EnableAutoRecovery(false);
+ DestroyAndReopen(options);
+ old_manifest = GetManifestNameFromLiveFiles();
+
+ IOStatus error_msg = IOStatus::IOError("Retryable IO Error");
+ error_msg.SetRetryable(true);
+
+ ASSERT_OK(Put(Key(0), "val"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put(Key(1), "val"));
+ SyncPoint::GetInstance()->SetCallBack(
+ "VersionSet::LogAndApply:WriteManifest",
+ [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); });
+ SyncPoint::GetInstance()->EnableProcessing();
+ s = Flush();
+ ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError);
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->DisableProcessing();
+ fault_fs_->SetFilesystemActive(true);
+ s = dbfull()->Resume();
+ ASSERT_OK(s);
+
+ new_manifest = GetManifestNameFromLiveFiles();
+ ASSERT_NE(new_manifest, old_manifest);
+
+ Reopen(options);
+ ASSERT_EQ("val", Get(Key(0)));
+ ASSERT_EQ("val", Get(Key(1)));
+ Close();
+}
+
+TEST_F(DBErrorHandlingFSTest, ManifestWriteFileScopeError) {
+ std::shared_ptr<ErrorHandlerFSListener> listener(
+ new ErrorHandlerFSListener());
+ Options options = GetDefaultOptions();
+ options.env = fault_env_.get();
+ options.create_if_missing = true;
+ options.listeners.emplace_back(listener);
+ options.max_bgerror_resume_count = 0;
+ Status s;
+ std::string old_manifest;
+ std::string new_manifest;
+
+ listener->EnableAutoRecovery(false);
+ DestroyAndReopen(options);
+ old_manifest = GetManifestNameFromLiveFiles();
+
+ IOStatus error_msg = IOStatus::IOError("File Scope Data Loss Error");
+ error_msg.SetDataLoss(true);
+ error_msg.SetScope(
+ ROCKSDB_NAMESPACE::IOStatus::IOErrorScope::kIOErrorScopeFile);
+ error_msg.SetRetryable(false);
+
+ ASSERT_OK(Put(Key(0), "val"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put(Key(1), "val"));
+ SyncPoint::GetInstance()->SetCallBack(
+ "VersionSet::LogAndApply:WriteManifest",
+ [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); });
+ SyncPoint::GetInstance()->EnableProcessing();
+ s = Flush();
+ ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError);
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->DisableProcessing();
+ fault_fs_->SetFilesystemActive(true);
+ s = dbfull()->Resume();
+ ASSERT_OK(s);
+
+ new_manifest = GetManifestNameFromLiveFiles();
+ ASSERT_NE(new_manifest, old_manifest);
+
+ Reopen(options);
+ ASSERT_EQ("val", Get(Key(0)));
+ ASSERT_EQ("val", Get(Key(1)));
+ Close();
+}
+
+TEST_F(DBErrorHandlingFSTest, ManifestWriteNoWALRetryableError) {
+ std::shared_ptr<ErrorHandlerFSListener> listener(
+ new ErrorHandlerFSListener());
+ Options options = GetDefaultOptions();
+ options.env = fault_env_.get();
+ options.create_if_missing = true;
+ options.listeners.emplace_back(listener);
+ options.max_bgerror_resume_count = 0;
+ Status s;
+ std::string old_manifest;
+ std::string new_manifest;
+
+ listener->EnableAutoRecovery(false);
+ DestroyAndReopen(options);
+ old_manifest = GetManifestNameFromLiveFiles();
+
+ IOStatus error_msg = IOStatus::IOError("Retryable IO Error");
+ error_msg.SetRetryable(true);
+
+ WriteOptions wo = WriteOptions();
+ wo.disableWAL = true;
+ ASSERT_OK(Put(Key(0), "val", wo));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put(Key(1), "val", wo));
+ SyncPoint::GetInstance()->SetCallBack(
+ "VersionSet::LogAndApply:WriteManifest",
+ [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); });
+ SyncPoint::GetInstance()->EnableProcessing();
+ s = Flush();
+ ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError);
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->DisableProcessing();
+ fault_fs_->SetFilesystemActive(true);
+ s = dbfull()->Resume();
+ ASSERT_OK(s);
+
+ new_manifest = GetManifestNameFromLiveFiles();
+ ASSERT_NE(new_manifest, old_manifest);
+
+ Reopen(options);
+ ASSERT_EQ("val", Get(Key(0)));
+ ASSERT_EQ("val", Get(Key(1)));
+ Close();
+}
+
+TEST_F(DBErrorHandlingFSTest, DoubleManifestWriteError) {
+ std::shared_ptr<ErrorHandlerFSListener> listener(
+ new ErrorHandlerFSListener());
+ Options options = GetDefaultOptions();
+ options.env = fault_env_.get();
+ options.create_if_missing = true;
+ options.listeners.emplace_back(listener);
+ Status s;
+ std::string old_manifest;
+ std::string new_manifest;
+
+ listener->EnableAutoRecovery(false);
+ DestroyAndReopen(options);
+ old_manifest = GetManifestNameFromLiveFiles();
+
+ ASSERT_OK(Put(Key(0), "val"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put(Key(1), "val"));
+ SyncPoint::GetInstance()->SetCallBack(
+ "VersionSet::LogAndApply:WriteManifest", [&](void*) {
+ fault_fs_->SetFilesystemActive(false,
+ IOStatus::NoSpace("Out of space"));
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ s = Flush();
+ ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError);
+ fault_fs_->SetFilesystemActive(true);
+
+ // This Resume() will attempt to create a new manifest file and fail again
+ s = dbfull()->Resume();
+ ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError);
+ fault_fs_->SetFilesystemActive(true);
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->DisableProcessing();
+
+ // A successful Resume() will create a new manifest file
+ s = dbfull()->Resume();
+ ASSERT_OK(s);
+
+ new_manifest = GetManifestNameFromLiveFiles();
+ ASSERT_NE(new_manifest, old_manifest);
+
+ Reopen(options);
+ ASSERT_EQ("val", Get(Key(0)));
+ ASSERT_EQ("val", Get(Key(1)));
+ Close();
+}
+
+TEST_F(DBErrorHandlingFSTest, CompactionManifestWriteError) {
+ if (mem_env_ != nullptr) {
+ ROCKSDB_GTEST_SKIP("Test requires non-mock environment");
+ return;
+ }
+ std::shared_ptr<ErrorHandlerFSListener> listener(
+ new ErrorHandlerFSListener());
+ Options options = GetDefaultOptions();
+ options.env = fault_env_.get();
+ options.create_if_missing = true;
+ options.level0_file_num_compaction_trigger = 2;
+ options.listeners.emplace_back(listener);
+ Status s;
+ std::string old_manifest;
+ std::string new_manifest;
+ std::atomic<bool> fail_manifest(false);
+ DestroyAndReopen(options);
+ old_manifest = GetManifestNameFromLiveFiles();
+
+ ASSERT_OK(Put(Key(0), "val"));
+ ASSERT_OK(Put(Key(2), "val"));
+ s = Flush();
+ ASSERT_OK(s);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ // Wait for flush of 2nd L0 file before starting compaction
+ {{"DBImpl::FlushMemTable:FlushMemTableFinished",
+ "BackgroundCallCompaction:0"},
+ // Wait for compaction to detect manifest write error
+ {"BackgroundCallCompaction:1", "CompactionManifestWriteError:0"},
+ // Make compaction thread wait for error to be cleared
+ {"CompactionManifestWriteError:1",
+ "DBImpl::BackgroundCallCompaction:FoundObsoleteFiles"},
+ // Wait for DB instance to clear bg_error before calling
+ // TEST_WaitForCompact
+ {"SstFileManagerImpl::ErrorCleared", "CompactionManifestWriteError:2"}});
+ // trigger manifest write failure in compaction thread
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "BackgroundCallCompaction:0", [&](void*) { fail_manifest.store(true); });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "VersionSet::LogAndApply:WriteManifest", [&](void*) {
+ if (fail_manifest.load()) {
+ fault_fs_->SetFilesystemActive(false,
+ IOStatus::NoSpace("Out of space"));
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(Put(Key(1), "val"));
+ // This Flush will trigger a compaction, which will fail when appending to
+ // the manifest
+ s = Flush();
+ ASSERT_OK(s);
+
+ TEST_SYNC_POINT("CompactionManifestWriteError:0");
+ // Clear all errors so when the compaction is retried, it will succeed
+ fault_fs_->SetFilesystemActive(true);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+ TEST_SYNC_POINT("CompactionManifestWriteError:1");
+ TEST_SYNC_POINT("CompactionManifestWriteError:2");
+
+ s = dbfull()->TEST_WaitForCompact();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ASSERT_OK(s);
+
+ new_manifest = GetManifestNameFromLiveFiles();
+ ASSERT_NE(new_manifest, old_manifest);
+ Reopen(options);
+ ASSERT_EQ("val", Get(Key(0)));
+ ASSERT_EQ("val", Get(Key(1)));
+ ASSERT_EQ("val", Get(Key(2)));
+ Close();
+}
+
+TEST_F(DBErrorHandlingFSTest, CompactionManifestWriteRetryableError) {
+ std::shared_ptr<ErrorHandlerFSListener> listener(
+ new ErrorHandlerFSListener());
+ Options options = GetDefaultOptions();
+ options.env = fault_env_.get();
+ options.create_if_missing = true;
+ options.level0_file_num_compaction_trigger = 2;
+ options.listeners.emplace_back(listener);
+ options.max_bgerror_resume_count = 0;
+ Status s;
+ std::string old_manifest;
+ std::string new_manifest;
+ std::atomic<bool> fail_manifest(false);
+ DestroyAndReopen(options);
+ old_manifest = GetManifestNameFromLiveFiles();
+
+ IOStatus error_msg = IOStatus::IOError("Retryable IO Error");
+ error_msg.SetRetryable(true);
+
+ ASSERT_OK(Put(Key(0), "val"));
+ ASSERT_OK(Put(Key(2), "val"));
+ s = Flush();
+ ASSERT_OK(s);
+
+ listener->OverrideBGError(Status(error_msg, Status::Severity::kHardError));
+ listener->EnableAutoRecovery(false);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ // Wait for flush of 2nd L0 file before starting compaction
+ {{"DBImpl::FlushMemTable:FlushMemTableFinished",
+ "BackgroundCallCompaction:0"},
+ // Wait for compaction to detect manifest write error
+ {"BackgroundCallCompaction:1", "CompactionManifestWriteError:0"},
+ // Make compaction thread wait for error to be cleared
+ {"CompactionManifestWriteError:1",
+ "DBImpl::BackgroundCallCompaction:FoundObsoleteFiles"}});
+ // trigger manifest write failure in compaction thread
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "BackgroundCallCompaction:0", [&](void*) { fail_manifest.store(true); });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "VersionSet::LogAndApply:WriteManifest", [&](void*) {
+ if (fail_manifest.load()) {
+ fault_fs_->SetFilesystemActive(false, error_msg);
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(Put(Key(1), "val"));
+ s = Flush();
+ ASSERT_OK(s);
+
+ TEST_SYNC_POINT("CompactionManifestWriteError:0");
+ TEST_SYNC_POINT("CompactionManifestWriteError:1");
+
+ s = dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError);
+
+ fault_fs_->SetFilesystemActive(true);
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->DisableProcessing();
+ s = dbfull()->Resume();
+ ASSERT_OK(s);
+
+ new_manifest = GetManifestNameFromLiveFiles();
+ ASSERT_NE(new_manifest, old_manifest);
+
+ Reopen(options);
+ ASSERT_EQ("val", Get(Key(0)));
+ ASSERT_EQ("val", Get(Key(1)));
+ ASSERT_EQ("val", Get(Key(2)));
+ Close();
+}
+
+TEST_F(DBErrorHandlingFSTest, CompactionWriteError) {
+ std::shared_ptr<ErrorHandlerFSListener> listener(
+ new ErrorHandlerFSListener());
+ Options options = GetDefaultOptions();
+ options.env = fault_env_.get();
+ options.create_if_missing = true;
+ options.level0_file_num_compaction_trigger = 2;
+ options.listeners.emplace_back(listener);
+ Status s;
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put(Key(0), "va;"));
+ ASSERT_OK(Put(Key(2), "va;"));
+ s = Flush();
+ ASSERT_OK(s);
+
+ listener->OverrideBGError(
+ Status(Status::NoSpace(), Status::Severity::kHardError));
+ listener->EnableAutoRecovery(false);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::FlushMemTable:FlushMemTableFinished",
+ "BackgroundCallCompaction:0"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "BackgroundCallCompaction:0", [&](void*) {
+ fault_fs_->SetFilesystemActive(false,
+ IOStatus::NoSpace("Out of space"));
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(Put(Key(1), "val"));
+ s = Flush();
+ ASSERT_OK(s);
+
+ s = dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError);
+
+ fault_fs_->SetFilesystemActive(true);
+ s = dbfull()->Resume();
+ ASSERT_OK(s);
+ Destroy(options);
+}
+
+TEST_F(DBErrorHandlingFSTest, DISABLED_CompactionWriteRetryableError) {
+ std::shared_ptr<ErrorHandlerFSListener> listener(
+ new ErrorHandlerFSListener());
+ Options options = GetDefaultOptions();
+ options.env = fault_env_.get();
+ options.create_if_missing = true;
+ options.level0_file_num_compaction_trigger = 2;
+ options.listeners.emplace_back(listener);
+ options.max_bgerror_resume_count = 0;
+ Status s;
+ DestroyAndReopen(options);
+
+ IOStatus error_msg = IOStatus::IOError("Retryable IO Error");
+ error_msg.SetRetryable(true);
+
+ ASSERT_OK(Put(Key(0), "va;"));
+ ASSERT_OK(Put(Key(2), "va;"));
+ s = Flush();
+ ASSERT_OK(s);
+
+ listener->OverrideBGError(Status(error_msg, Status::Severity::kHardError));
+ listener->EnableAutoRecovery(false);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::FlushMemTable:FlushMemTableFinished",
+ "BackgroundCallCompaction:0"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "CompactionJob::OpenCompactionOutputFile",
+ [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:Finish",
+ [&](void*) { CancelAllBackgroundWork(dbfull()); });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(Put(Key(1), "val"));
+ s = Flush();
+ ASSERT_OK(s);
+
+ s = dbfull()->TEST_GetBGError();
+ ASSERT_OK(s);
+ fault_fs_->SetFilesystemActive(true);
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->DisableProcessing();
+ s = dbfull()->Resume();
+ ASSERT_OK(s);
+ Destroy(options);
+}
+
+TEST_F(DBErrorHandlingFSTest, DISABLED_CompactionWriteFileScopeError) {
+ std::shared_ptr<ErrorHandlerFSListener> listener(
+ new ErrorHandlerFSListener());
+ Options options = GetDefaultOptions();
+ options.env = fault_env_.get();
+ options.create_if_missing = true;
+ options.level0_file_num_compaction_trigger = 2;
+ options.listeners.emplace_back(listener);
+ options.max_bgerror_resume_count = 0;
+ Status s;
+ DestroyAndReopen(options);
+
+ IOStatus error_msg = IOStatus::IOError("File Scope Data Loss Error");
+ error_msg.SetDataLoss(true);
+ error_msg.SetScope(
+ ROCKSDB_NAMESPACE::IOStatus::IOErrorScope::kIOErrorScopeFile);
+ error_msg.SetRetryable(false);
+
+ ASSERT_OK(Put(Key(0), "va;"));
+ ASSERT_OK(Put(Key(2), "va;"));
+ s = Flush();
+ ASSERT_OK(s);
+
+ listener->OverrideBGError(Status(error_msg, Status::Severity::kHardError));
+ listener->EnableAutoRecovery(false);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::FlushMemTable:FlushMemTableFinished",
+ "BackgroundCallCompaction:0"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "CompactionJob::OpenCompactionOutputFile",
+ [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:Finish",
+ [&](void*) { CancelAllBackgroundWork(dbfull()); });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(Put(Key(1), "val"));
+ s = Flush();
+ ASSERT_OK(s);
+
+ s = dbfull()->TEST_GetBGError();
+ ASSERT_OK(s);
+
+ fault_fs_->SetFilesystemActive(true);
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->DisableProcessing();
+ s = dbfull()->Resume();
+ ASSERT_OK(s);
+ Destroy(options);
+}
+
+TEST_F(DBErrorHandlingFSTest, CorruptionError) {
+ Options options = GetDefaultOptions();
+ options.env = fault_env_.get();
+ options.create_if_missing = true;
+ options.level0_file_num_compaction_trigger = 2;
+ Status s;
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put(Key(0), "va;"));
+ ASSERT_OK(Put(Key(2), "va;"));
+ s = Flush();
+ ASSERT_OK(s);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::FlushMemTable:FlushMemTableFinished",
+ "BackgroundCallCompaction:0"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "BackgroundCallCompaction:0", [&](void*) {
+ fault_fs_->SetFilesystemActive(false,
+ IOStatus::Corruption("Corruption"));
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(Put(Key(1), "val"));
+ s = Flush();
+ ASSERT_OK(s);
+
+ s = dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ(s.severity(),
+ ROCKSDB_NAMESPACE::Status::Severity::kUnrecoverableError);
+
+ fault_fs_->SetFilesystemActive(true);
+ s = dbfull()->Resume();
+ ASSERT_NOK(s);
+ Destroy(options);
+}
+
+TEST_F(DBErrorHandlingFSTest, AutoRecoverFlushError) {
+ if (mem_env_ != nullptr) {
+ ROCKSDB_GTEST_SKIP("Test requires non-mock environment");
+ return;
+ }
+ std::shared_ptr<ErrorHandlerFSListener> listener(
+ new ErrorHandlerFSListener());
+ Options options = GetDefaultOptions();
+ options.env = fault_env_.get();
+ options.create_if_missing = true;
+ options.listeners.emplace_back(listener);
+ options.statistics = CreateDBStatistics();
+ Status s;
+
+ listener->EnableAutoRecovery();
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put(Key(0), "val"));
+ SyncPoint::GetInstance()->SetCallBack("FlushJob::Start", [&](void*) {
+ fault_fs_->SetFilesystemActive(false, IOStatus::NoSpace("Out of space"));
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ s = Flush();
+ ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError);
+ SyncPoint::GetInstance()->DisableProcessing();
+ fault_fs_->SetFilesystemActive(true);
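+  // Block until the background auto recovery completes.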
+ ASSERT_EQ(listener->WaitForRecovery(5000000), true);
+
+ s = Put(Key(1), "val");
+ ASSERT_OK(s);
+ ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
+ ERROR_HANDLER_BG_ERROR_COUNT));
+ ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
+ ERROR_HANDLER_BG_IO_ERROR_COUNT));
+ ASSERT_EQ(0, options.statistics->getAndResetTickerCount(
+ ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT));
+ ASSERT_EQ(0, options.statistics->getAndResetTickerCount(
+ ERROR_HANDLER_AUTORESUME_COUNT));
+ ASSERT_EQ(0, options.statistics->getAndResetTickerCount(
+ ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT));
+ ASSERT_EQ(0, options.statistics->getAndResetTickerCount(
+ ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT));
+
+ Reopen(options);
+ ASSERT_EQ("val", Get(Key(0)));
+ ASSERT_EQ("val", Get(Key(1)));
+ Destroy(options);
+}
+
+TEST_F(DBErrorHandlingFSTest, FailRecoverFlushError) {
+ std::shared_ptr<ErrorHandlerFSListener> listener(
+ new ErrorHandlerFSListener());
+ Options options = GetDefaultOptions();
+ options.env = fault_env_.get();
+ options.create_if_missing = true;
+ options.listeners.emplace_back(listener);
+ Status s;
+
+ listener->EnableAutoRecovery();
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put(Key(0), "val"));
+ SyncPoint::GetInstance()->SetCallBack("FlushJob::Start", [&](void*) {
+ fault_fs_->SetFilesystemActive(false, IOStatus::NoSpace("Out of space"));
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ s = Flush();
+ ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError);
+  // We should be able to shut down the database while auto recovery is going
+  // on in the background
+ Close();
+ DestroyDB(dbname_, options).PermitUncheckedError();
+}
+
+TEST_F(DBErrorHandlingFSTest, WALWriteError) {
+ if (mem_env_ != nullptr) {
+ ROCKSDB_GTEST_SKIP("Test requires non-mock environment");
+ return;
+ }
+ std::shared_ptr<ErrorHandlerFSListener> listener(
+ new ErrorHandlerFSListener());
+ Options options = GetDefaultOptions();
+ options.env = fault_env_.get();
+ options.create_if_missing = true;
+ options.writable_file_max_buffer_size = 32768;
+ options.listeners.emplace_back(listener);
+ Status s;
+ Random rnd(301);
+
+ listener->EnableAutoRecovery();
+ DestroyAndReopen(options);
+
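+  // First batch: all writes succeed and are synced before any fault is
+  // injected.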
+ {
+ WriteBatch batch;
+
+ for (auto i = 0; i < 100; ++i) {
+ ASSERT_OK(batch.Put(Key(i), rnd.RandomString(1024)));
+ }
+
+ WriteOptions wopts;
+ wopts.sync = true;
+ ASSERT_OK(dbfull()->Write(wopts, &batch));
+ };
+
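+  // Second batch: let the first two WAL appends go through, then fail
+  // subsequent appends with NoSpace.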
+ {
+ WriteBatch batch;
+ int write_error = 0;
+
+ for (auto i = 100; i < 199; ++i) {
+ ASSERT_OK(batch.Put(Key(i), rnd.RandomString(1024)));
+ }
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "WritableFileWriter::Append:BeforePrepareWrite", [&](void*) {
+ write_error++;
+ if (write_error > 2) {
+ fault_fs_->SetFilesystemActive(false,
+ IOStatus::NoSpace("Out of space"));
+ }
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ WriteOptions wopts;
+ wopts.sync = true;
+ s = dbfull()->Write(wopts, &batch);
+ ASSERT_EQ(s, s.NoSpace());
+ }
+ SyncPoint::GetInstance()->DisableProcessing();
+ // `ClearAllCallBacks()` is needed in addition to `DisableProcessing()` to
+ // drain all callbacks. Otherwise, a pending callback in the background
+ // could re-disable `fault_fs_` after we enable it below.
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ fault_fs_->SetFilesystemActive(true);
+ ASSERT_EQ(listener->WaitForRecovery(5000000), true);
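+  // Keys from the first batch should be present; keys from the failed second
+  // batch should not.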
+ for (auto i = 0; i < 199; ++i) {
+ if (i < 100) {
+ ASSERT_NE(Get(Key(i)), "NOT_FOUND");
+ } else {
+ ASSERT_EQ(Get(Key(i)), "NOT_FOUND");
+ }
+ }
+ Reopen(options);
+ for (auto i = 0; i < 199; ++i) {
+ if (i < 100) {
+ ASSERT_NE(Get(Key(i)), "NOT_FOUND");
+ } else {
+ ASSERT_EQ(Get(Key(i)), "NOT_FOUND");
+ }
+ }
+ Close();
+}
+
+TEST_F(DBErrorHandlingFSTest, WALWriteRetryableError) {
+ std::shared_ptr<ErrorHandlerFSListener> listener(
+ new ErrorHandlerFSListener());
+ Options options = GetDefaultOptions();
+ options.env = fault_env_.get();
+ options.create_if_missing = true;
+ options.writable_file_max_buffer_size = 32768;
+ options.listeners.emplace_back(listener);
+ options.paranoid_checks = true;
+ options.max_bgerror_resume_count = 0;
+ Random rnd(301);
+
+ DestroyAndReopen(options);
+
+ IOStatus error_msg = IOStatus::IOError("Retryable IO Error");
+ error_msg.SetRetryable(true);
+
+  // For the first batch, the write succeeds; sync is requested
+ {
+ WriteBatch batch;
+
+ for (auto i = 0; i < 100; ++i) {
+ ASSERT_OK(batch.Put(Key(i), rnd.RandomString(1024)));
+ }
+
+ WriteOptions wopts;
+ wopts.sync = true;
+ ASSERT_OK(dbfull()->Write(wopts, &batch));
+ };
+
+  // For the second batch, the first 2 file Appends succeed, then the
+  // following Appends fail due to a retryable file system IOError.
+ {
+ WriteBatch batch;
+ int write_error = 0;
+
+ for (auto i = 100; i < 200; ++i) {
+ ASSERT_OK(batch.Put(Key(i), rnd.RandomString(1024)));
+ }
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "WritableFileWriter::Append:BeforePrepareWrite", [&](void*) {
+ write_error++;
+ if (write_error > 2) {
+ fault_fs_->SetFilesystemActive(false, error_msg);
+ }
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ WriteOptions wopts;
+ wopts.sync = true;
+ Status s = dbfull()->Write(wopts, &batch);
+ ASSERT_TRUE(s.IsIOError());
+ }
+ fault_fs_->SetFilesystemActive(true);
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->DisableProcessing();
+
+  // Data in the corrupted WAL is not stored
+ for (auto i = 0; i < 199; ++i) {
+ if (i < 100) {
+ ASSERT_NE(Get(Key(i)), "NOT_FOUND");
+ } else {
+ ASSERT_EQ(Get(Key(i)), "NOT_FOUND");
+ }
+ }
+
+  // Resume and write a new batch; it should be persisted in the WAL
+ ASSERT_OK(dbfull()->Resume());
+ {
+ WriteBatch batch;
+
+ for (auto i = 200; i < 300; ++i) {
+ ASSERT_OK(batch.Put(Key(i), rnd.RandomString(1024)));
+ }
+
+ WriteOptions wopts;
+ wopts.sync = true;
+ ASSERT_OK(dbfull()->Write(wopts, &batch));
+ };
+
+ Reopen(options);
+ for (auto i = 0; i < 300; ++i) {
+ if (i < 100 || i >= 200) {
+ ASSERT_NE(Get(Key(i)), "NOT_FOUND");
+ } else {
+ ASSERT_EQ(Get(Key(i)), "NOT_FOUND");
+ }
+ }
+ Close();
+}
+
+TEST_F(DBErrorHandlingFSTest, MultiCFWALWriteError) {
+ if (mem_env_ != nullptr) {
+ ROCKSDB_GTEST_SKIP("Test requires non-mock environment");
+ return;
+ }
+ std::shared_ptr<ErrorHandlerFSListener> listener(
+ new ErrorHandlerFSListener());
+ Options options = GetDefaultOptions();
+ options.env = fault_env_.get();
+ options.create_if_missing = true;
+ options.writable_file_max_buffer_size = 32768;
+ options.listeners.emplace_back(listener);
+ Random rnd(301);
+
+ listener->EnableAutoRecovery();
+ CreateAndReopenWithCF({"one", "two", "three"}, options);
+
+ {
+ WriteBatch batch;
+
+ for (auto i = 1; i < 4; ++i) {
+ for (auto j = 0; j < 100; ++j) {
+ ASSERT_OK(batch.Put(handles_[i], Key(j), rnd.RandomString(1024)));
+ }
+ }
+
+ WriteOptions wopts;
+ wopts.sync = true;
+ ASSERT_OK(dbfull()->Write(wopts, &batch));
+ };
+
+ {
+ WriteBatch batch;
+ int write_error = 0;
+
+ // Write to one CF
+ for (auto i = 100; i < 199; ++i) {
+ ASSERT_OK(batch.Put(handles_[2], Key(i), rnd.RandomString(1024)));
+ }
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "WritableFileWriter::Append:BeforePrepareWrite", [&](void*) {
+ write_error++;
+ if (write_error > 2) {
+ fault_fs_->SetFilesystemActive(false,
+ IOStatus::NoSpace("Out of space"));
+ }
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ WriteOptions wopts;
+ wopts.sync = true;
+ Status s = dbfull()->Write(wopts, &batch);
+ ASSERT_TRUE(s.IsNoSpace());
+ }
+ SyncPoint::GetInstance()->DisableProcessing();
+ // `ClearAllCallBacks()` is needed in addition to `DisableProcessing()` to
+ // drain all callbacks. Otherwise, a pending callback in the background
+ // could re-disable `fault_fs_` after we enable it below.
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ fault_fs_->SetFilesystemActive(true);
+ ASSERT_EQ(listener->WaitForRecovery(5000000), true);
+
+ for (auto i = 1; i < 4; ++i) {
+ // Every CF should have been flushed
+ ASSERT_EQ(NumTableFilesAtLevel(0, i), 1);
+ }
+
+ for (auto i = 1; i < 4; ++i) {
+ for (auto j = 0; j < 199; ++j) {
+ if (j < 100) {
+ ASSERT_NE(Get(i, Key(j)), "NOT_FOUND");
+ } else {
+ ASSERT_EQ(Get(i, Key(j)), "NOT_FOUND");
+ }
+ }
+ }
+ ReopenWithColumnFamilies({"default", "one", "two", "three"}, options);
+ for (auto i = 1; i < 4; ++i) {
+ for (auto j = 0; j < 199; ++j) {
+ if (j < 100) {
+ ASSERT_NE(Get(i, Key(j)), "NOT_FOUND");
+ } else {
+ ASSERT_EQ(Get(i, Key(j)), "NOT_FOUND");
+ }
+ }
+ }
+ Close();
+}
+
+TEST_F(DBErrorHandlingFSTest, MultiDBCompactionError) {
+ if (mem_env_ != nullptr) {
+ ROCKSDB_GTEST_SKIP("Test requires non-mock environment");
+ return;
+ }
+ FaultInjectionTestEnv* def_env = new FaultInjectionTestEnv(env_);
+ std::vector<std::unique_ptr<Env>> fault_envs;
+ std::vector<FaultInjectionTestFS*> fault_fs;
+ std::vector<Options> options;
+ std::vector<std::shared_ptr<ErrorHandlerFSListener>> listener;
+ std::vector<DB*> db;
+ std::shared_ptr<SstFileManager> sfm(NewSstFileManager(def_env));
+ int kNumDbInstances = 3;
+ Random rnd(301);
+
+ for (auto i = 0; i < kNumDbInstances; ++i) {
+ listener.emplace_back(new ErrorHandlerFSListener());
+ options.emplace_back(GetDefaultOptions());
+ fault_fs.emplace_back(new FaultInjectionTestFS(env_->GetFileSystem()));
+ std::shared_ptr<FileSystem> fs(fault_fs.back());
+ fault_envs.emplace_back(new CompositeEnvWrapper(def_env, fs));
+ options[i].env = fault_envs.back().get();
+ options[i].create_if_missing = true;
+ options[i].level0_file_num_compaction_trigger = 2;
+ options[i].writable_file_max_buffer_size = 32768;
+ options[i].listeners.emplace_back(listener[i]);
+ options[i].sst_file_manager = sfm;
+ DB* dbptr;
+ char buf[16];
+
+ listener[i]->EnableAutoRecovery();
+ // Setup for returning error for the 3rd SST, which would be level 1
+ listener[i]->InjectFileCreationError(fault_fs[i], 3,
+ IOStatus::NoSpace("Out of space"));
+ snprintf(buf, sizeof(buf), "_%d", i);
+ ASSERT_OK(DestroyDB(dbname_ + std::string(buf), options[i]));
+ ASSERT_OK(DB::Open(options[i], dbname_ + std::string(buf), &dbptr));
+ db.emplace_back(dbptr);
+ }
+
+ for (auto i = 0; i < kNumDbInstances; ++i) {
+ WriteBatch batch;
+
+ for (auto j = 0; j <= 100; ++j) {
+ ASSERT_OK(batch.Put(Key(j), rnd.RandomString(1024)));
+ }
+
+ WriteOptions wopts;
+ wopts.sync = true;
+ ASSERT_OK(db[i]->Write(wopts, &batch));
+ ASSERT_OK(db[i]->Flush(FlushOptions()));
+ }
+
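+  // Make the shared base Env, which also backs the SstFileManager, return
+  // NoSpace as well.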
+ def_env->SetFilesystemActive(false, Status::NoSpace("Out of space"));
+ for (auto i = 0; i < kNumDbInstances; ++i) {
+ WriteBatch batch;
+
+    // Write a second batch of keys
+ for (auto j = 100; j < 199; ++j) {
+ ASSERT_OK(batch.Put(Key(j), rnd.RandomString(1024)));
+ }
+
+ WriteOptions wopts;
+ wopts.sync = true;
+ ASSERT_OK(db[i]->Write(wopts, &batch));
+ ASSERT_OK(db[i]->Flush(FlushOptions()));
+ }
+
+ for (auto i = 0; i < kNumDbInstances; ++i) {
+ Status s = static_cast<DBImpl*>(db[i])->TEST_WaitForCompact(true);
+ ASSERT_EQ(s.severity(), Status::Severity::kSoftError);
+ fault_fs[i]->SetFilesystemActive(true);
+ }
+
+ def_env->SetFilesystemActive(true);
+ for (auto i = 0; i < kNumDbInstances; ++i) {
+ std::string prop;
+ ASSERT_EQ(listener[i]->WaitForRecovery(5000000), true);
+ ASSERT_OK(static_cast<DBImpl*>(db[i])->TEST_WaitForCompact(true));
+ EXPECT_TRUE(db[i]->GetProperty(
+ "rocksdb.num-files-at-level" + std::to_string(0), &prop));
+ EXPECT_EQ(atoi(prop.c_str()), 0);
+ EXPECT_TRUE(db[i]->GetProperty(
+ "rocksdb.num-files-at-level" + std::to_string(1), &prop));
+ EXPECT_EQ(atoi(prop.c_str()), 1);
+ }
+
+ SstFileManagerImpl* sfmImpl =
+ static_cast_with_check<SstFileManagerImpl>(sfm.get());
+ sfmImpl->Close();
+
+ for (auto i = 0; i < kNumDbInstances; ++i) {
+ char buf[16];
+ snprintf(buf, sizeof(buf), "_%d", i);
+ delete db[i];
+ fault_fs[i]->SetFilesystemActive(true);
+ if (getenv("KEEP_DB")) {
+ printf("DB is still at %s%s\n", dbname_.c_str(), buf);
+ } else {
+ ASSERT_OK(DestroyDB(dbname_ + std::string(buf), options[i]));
+ }
+ }
+ options.clear();
+ sfm.reset();
+ delete def_env;
+}
+
+TEST_F(DBErrorHandlingFSTest, MultiDBVariousErrors) {
+ if (mem_env_ != nullptr) {
+ ROCKSDB_GTEST_SKIP("Test requires non-mock environment");
+ return;
+ }
+ FaultInjectionTestEnv* def_env = new FaultInjectionTestEnv(env_);
+ std::vector<std::unique_ptr<Env>> fault_envs;
+ std::vector<FaultInjectionTestFS*> fault_fs;
+ std::vector<Options> options;
+ std::vector<std::shared_ptr<ErrorHandlerFSListener>> listener;
+ std::vector<DB*> db;
+ std::shared_ptr<SstFileManager> sfm(NewSstFileManager(def_env));
+ int kNumDbInstances = 3;
+ Random rnd(301);
+
+ for (auto i = 0; i < kNumDbInstances; ++i) {
+ listener.emplace_back(new ErrorHandlerFSListener());
+ options.emplace_back(GetDefaultOptions());
+ fault_fs.emplace_back(new FaultInjectionTestFS(env_->GetFileSystem()));
+ std::shared_ptr<FileSystem> fs(fault_fs.back());
+ fault_envs.emplace_back(new CompositeEnvWrapper(def_env, fs));
+ options[i].env = fault_envs.back().get();
+ options[i].create_if_missing = true;
+ options[i].level0_file_num_compaction_trigger = 2;
+ options[i].writable_file_max_buffer_size = 32768;
+ options[i].listeners.emplace_back(listener[i]);
+ options[i].sst_file_manager = sfm;
+ DB* dbptr;
+ char buf[16];
+
+ listener[i]->EnableAutoRecovery();
+ switch (i) {
+ case 0:
+ // Setup for returning error for the 3rd SST, which would be level 1
+ listener[i]->InjectFileCreationError(fault_fs[i], 3,
+ IOStatus::NoSpace("Out of space"));
+ break;
+ case 1:
+ // Setup for returning error after the 1st SST, which would result
+ // in a hard error
+ listener[i]->InjectFileCreationError(fault_fs[i], 2,
+ IOStatus::NoSpace("Out of space"));
+ break;
+ default:
+ break;
+ }
+ snprintf(buf, sizeof(buf), "_%d", i);
+ ASSERT_OK(DestroyDB(dbname_ + std::string(buf), options[i]));
+ ASSERT_OK(DB::Open(options[i], dbname_ + std::string(buf), &dbptr));
+ db.emplace_back(dbptr);
+ }
+
+ for (auto i = 0; i < kNumDbInstances; ++i) {
+ WriteBatch batch;
+
+ for (auto j = 0; j <= 100; ++j) {
+ ASSERT_OK(batch.Put(Key(j), rnd.RandomString(1024)));
+ }
+
+ WriteOptions wopts;
+ wopts.sync = true;
+ ASSERT_OK(db[i]->Write(wopts, &batch));
+ ASSERT_OK(db[i]->Flush(FlushOptions()));
+ }
+
+ def_env->SetFilesystemActive(false, Status::NoSpace("Out of space"));
+ for (auto i = 0; i < kNumDbInstances; ++i) {
+ WriteBatch batch;
+
+    // Write a second batch of keys
+ for (auto j = 100; j < 199; ++j) {
+ ASSERT_OK(batch.Put(Key(j), rnd.RandomString(1024)));
+ }
+
+ WriteOptions wopts;
+ wopts.sync = true;
+ ASSERT_OK(db[i]->Write(wopts, &batch));
+ if (i != 1) {
+ ASSERT_OK(db[i]->Flush(FlushOptions()));
+ } else {
+ ASSERT_TRUE(db[i]->Flush(FlushOptions()).IsNoSpace());
+ }
+ }
+
+ for (auto i = 0; i < kNumDbInstances; ++i) {
+ Status s = static_cast<DBImpl*>(db[i])->TEST_WaitForCompact(true);
+ switch (i) {
+ case 0:
+ ASSERT_EQ(s.severity(), Status::Severity::kSoftError);
+ break;
+ case 1:
+ ASSERT_EQ(s.severity(), Status::Severity::kHardError);
+ break;
+ case 2:
+ ASSERT_OK(s);
+ break;
+ }
+ fault_fs[i]->SetFilesystemActive(true);
+ }
+
+ def_env->SetFilesystemActive(true);
+ for (auto i = 0; i < kNumDbInstances; ++i) {
+ std::string prop;
+ if (i < 2) {
+ ASSERT_EQ(listener[i]->WaitForRecovery(5000000), true);
+ }
+ if (i == 1) {
+ ASSERT_OK(static_cast<DBImpl*>(db[i])->TEST_WaitForCompact(true));
+ }
+ EXPECT_TRUE(db[i]->GetProperty(
+ "rocksdb.num-files-at-level" + std::to_string(0), &prop));
+ EXPECT_EQ(atoi(prop.c_str()), 0);
+ EXPECT_TRUE(db[i]->GetProperty(
+ "rocksdb.num-files-at-level" + std::to_string(1), &prop));
+ EXPECT_EQ(atoi(prop.c_str()), 1);
+ }
+
+ SstFileManagerImpl* sfmImpl =
+ static_cast_with_check<SstFileManagerImpl>(sfm.get());
+ sfmImpl->Close();
+
+ for (auto i = 0; i < kNumDbInstances; ++i) {
+ char buf[16];
+ snprintf(buf, sizeof(buf), "_%d", i);
+ fault_fs[i]->SetFilesystemActive(true);
+ delete db[i];
+ if (getenv("KEEP_DB")) {
+ printf("DB is still at %s%s\n", dbname_.c_str(), buf);
+ } else {
+ EXPECT_OK(DestroyDB(dbname_ + std::string(buf), options[i]));
+ }
+ }
+ options.clear();
+ delete def_env;
+}
+
+// The KV pairs are put with the write option set to disable the WAL.
+// If a retryable error happens under this condition, the bg error is mapped
+// to a soft error and auto resume is triggered. During auto resume,
+// SwitchMemtable is disabled to avoid small SST files. Writes can still be
+// applied before the bg error is cleared, unless the memtable is full.
+TEST_F(DBErrorHandlingFSTest, FLushWritNoWALRetryableErrorAutoRecover1) {
+  // Keep the FS inactive while auto resume retries, so auto resume fails; the
+  // FS is reactivated only after the retry loop has exited.
+ std::shared_ptr<ErrorHandlerFSListener> listener(
+ new ErrorHandlerFSListener());
+ Options options = GetDefaultOptions();
+ options.env = fault_env_.get();
+ options.create_if_missing = true;
+ options.listeners.emplace_back(listener);
+ options.max_bgerror_resume_count = 2;
+ options.bgerror_resume_retry_interval = 100000; // 0.1 second
+ options.statistics = CreateDBStatistics();
+ Status s;
+
+ listener->EnableAutoRecovery(false);
+ DestroyAndReopen(options);
+
+ IOStatus error_msg = IOStatus::IOError("Retryable IO Error");
+ error_msg.SetRetryable(true);
+
+ WriteOptions wo = WriteOptions();
+ wo.disableWAL = true;
+ ASSERT_OK(Put(Key(1), "val1", wo));
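+  // Block the test at sync point 1 below until the auto resume retry loop has
+  // exited.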
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"RecoverFromRetryableBGIOError:LoopOut",
+ "FLushWritNoWALRetryableeErrorAutoRecover1:1"}});
+ SyncPoint::GetInstance()->SetCallBack(
+ "BuildTable:BeforeFinishBuildTable",
+ [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); });
+
+ SyncPoint::GetInstance()->EnableProcessing();
+ s = Flush();
+ ASSERT_EQ("val1", Get(Key(1)));
+ ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError);
+ TEST_SYNC_POINT("FLushWritNoWALRetryableeErrorAutoRecover1:1");
+ ASSERT_EQ("val1", Get(Key(1)));
+ ASSERT_EQ("val1", Get(Key(1)));
+ SyncPoint::GetInstance()->DisableProcessing();
+ fault_fs_->SetFilesystemActive(true);
+ ASSERT_EQ(3, options.statistics->getAndResetTickerCount(
+ ERROR_HANDLER_BG_ERROR_COUNT));
+ ASSERT_EQ(3, options.statistics->getAndResetTickerCount(
+ ERROR_HANDLER_BG_IO_ERROR_COUNT));
+ ASSERT_EQ(3, options.statistics->getAndResetTickerCount(
+ ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT));
+ ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
+ ERROR_HANDLER_AUTORESUME_COUNT));
+ ASSERT_LE(0, options.statistics->getAndResetTickerCount(
+ ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT));
+ ASSERT_LE(0, options.statistics->getAndResetTickerCount(
+ ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT));
+ HistogramData autoresume_retry;
+ options.statistics->histogramData(ERROR_HANDLER_AUTORESUME_RETRY_COUNT,
+ &autoresume_retry);
+ ASSERT_GE(autoresume_retry.max, 0);
+ ASSERT_OK(Put(Key(2), "val2", wo));
+ s = Flush();
+  // Since auto resume fails, the bg error is not cleared; Flush will return
+  // the previously set bg_error.
+ ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError);
+ ASSERT_EQ("val2", Get(Key(2)));
+
+  // Manually call Resume() to clear the bg error
+ ASSERT_OK(dbfull()->Resume());
+ ASSERT_OK(Put(Key(3), "val3", wo));
+ // After resume is successful, the flush should be ok.
+ ASSERT_OK(Flush());
+ ASSERT_EQ("val3", Get(Key(3)));
+ Destroy(options);
+}
+
+TEST_F(DBErrorHandlingFSTest, FLushWritNoWALRetryableErrorAutoRecover2) {
+ // Activate the FS before the first resume
+ std::shared_ptr<ErrorHandlerFSListener> listener(
+ new ErrorHandlerFSListener());
+ Options options = GetDefaultOptions();
+ options.env = fault_env_.get();
+ options.create_if_missing = true;
+ options.listeners.emplace_back(listener);
+ options.max_bgerror_resume_count = 2;
+ options.bgerror_resume_retry_interval = 100000; // 0.1 second
+ options.statistics = CreateDBStatistics();
+ Status s;
+
+ listener->EnableAutoRecovery(false);
+ DestroyAndReopen(options);
+
+ IOStatus error_msg = IOStatus::IOError("Retryable IO Error");
+ error_msg.SetRetryable(true);
+
+ WriteOptions wo = WriteOptions();
+ wo.disableWAL = true;
+ ASSERT_OK(Put(Key(1), "val1", wo));
+ SyncPoint::GetInstance()->SetCallBack(
+ "BuildTable:BeforeFinishBuildTable",
+ [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); });
+
+ SyncPoint::GetInstance()->EnableProcessing();
+ s = Flush();
+ ASSERT_EQ("val1", Get(Key(1)));
+ ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError);
+ SyncPoint::GetInstance()->DisableProcessing();
+ fault_fs_->SetFilesystemActive(true);
+ ASSERT_EQ(listener->WaitForRecovery(5000000), true);
+ ASSERT_EQ("val1", Get(Key(1)));
+ ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
+ ERROR_HANDLER_BG_ERROR_COUNT));
+ ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
+ ERROR_HANDLER_BG_IO_ERROR_COUNT));
+ ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
+ ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT));
+ ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
+ ERROR_HANDLER_AUTORESUME_COUNT));
+ ASSERT_LE(0, options.statistics->getAndResetTickerCount(
+ ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT));
+ ASSERT_LE(0, options.statistics->getAndResetTickerCount(
+ ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT));
+ HistogramData autoresume_retry;
+ options.statistics->histogramData(ERROR_HANDLER_AUTORESUME_RETRY_COUNT,
+ &autoresume_retry);
+ ASSERT_GE(autoresume_retry.max, 0);
+ ASSERT_OK(Put(Key(2), "val2", wo));
+ s = Flush();
+  // Since auto resume succeeds, the bg error is cleared and the flush
+  // succeeds.
+ ASSERT_OK(s);
+ ASSERT_EQ("val2", Get(Key(2)));
+ Destroy(options);
+}
+
+// Auto resume from the flush retryable IO error. Activate the FS before the
+// first resume. Resume is successful.
+TEST_F(DBErrorHandlingFSTest, FLushWritRetryableErrorAutoRecover1) {
+ // Activate the FS before the first resume
+ std::shared_ptr<ErrorHandlerFSListener> listener(
+ new ErrorHandlerFSListener());
+ Options options = GetDefaultOptions();
+ options.env = fault_env_.get();
+ options.create_if_missing = true;
+ options.listeners.emplace_back(listener);
+ options.max_bgerror_resume_count = 2;
+ options.bgerror_resume_retry_interval = 100000; // 0.1 second
+ Status s;
+
+ listener->EnableAutoRecovery(false);
+ DestroyAndReopen(options);
+
+ IOStatus error_msg = IOStatus::IOError("Retryable IO Error");
+ error_msg.SetRetryable(true);
+
+ ASSERT_OK(Put(Key(1), "val1"));
+ SyncPoint::GetInstance()->SetCallBack(
+ "BuildTable:BeforeFinishBuildTable",
+ [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); });
+
+ SyncPoint::GetInstance()->EnableProcessing();
+ s = Flush();
+ ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError);
+ SyncPoint::GetInstance()->DisableProcessing();
+ fault_fs_->SetFilesystemActive(true);
+ ASSERT_EQ(listener->WaitForRecovery(5000000), true);
+
+ ASSERT_EQ("val1", Get(Key(1)));
+ Reopen(options);
+ ASSERT_EQ("val1", Get(Key(1)));
+ ASSERT_OK(Put(Key(2), "val2"));
+ ASSERT_OK(Flush());
+ ASSERT_EQ("val2", Get(Key(2)));
+
+ Destroy(options);
+}
+
+// Auto resume from the flush retryable IO error with the retry limit count
+// set. Never activate the FS, so auto resume should fail in the end.
+TEST_F(DBErrorHandlingFSTest, FLushWritRetryableErrorAutoRecover2) {
+  // Fail all the resume attempts and let the user resume manually
+ std::shared_ptr<ErrorHandlerFSListener> listener(
+ new ErrorHandlerFSListener());
+ Options options = GetDefaultOptions();
+ options.env = fault_env_.get();
+ options.create_if_missing = true;
+ options.listeners.emplace_back(listener);
+ options.max_bgerror_resume_count = 2;
+ options.bgerror_resume_retry_interval = 100000; // 0.1 second
+ Status s;
+
+ listener->EnableAutoRecovery(false);
+ DestroyAndReopen(options);
+
+ IOStatus error_msg = IOStatus::IOError("Retryable IO Error");
+ error_msg.SetRetryable(true);
+
+ ASSERT_OK(Put(Key(1), "val1"));
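+  // Hold background recovery until the test reaches sync point 0, and block
+  // the test at sync point 1 until the recovery loop has exited.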
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"FLushWritRetryableeErrorAutoRecover2:0",
+ "RecoverFromRetryableBGIOError:BeforeStart"},
+ {"RecoverFromRetryableBGIOError:LoopOut",
+ "FLushWritRetryableeErrorAutoRecover2:1"}});
+ SyncPoint::GetInstance()->SetCallBack(
+ "BuildTable:BeforeFinishBuildTable",
+ [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); });
+ SyncPoint::GetInstance()->EnableProcessing();
+ s = Flush();
+ ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError);
+ TEST_SYNC_POINT("FLushWritRetryableeErrorAutoRecover2:0");
+ TEST_SYNC_POINT("FLushWritRetryableeErrorAutoRecover2:1");
+ fault_fs_->SetFilesystemActive(true);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->DisableProcessing();
+
+ ASSERT_EQ("val1", Get(Key(1)));
+  // Auto resume fails because the FS does not recover during resume. The user
+  // calls Resume() manually here.
+ s = dbfull()->Resume();
+ ASSERT_EQ("val1", Get(Key(1)));
+ ASSERT_OK(s);
+ ASSERT_OK(Put(Key(2), "val2"));
+ ASSERT_OK(Flush());
+ ASSERT_EQ("val2", Get(Key(2)));
+
+ Destroy(options);
+}
+
+// Auto resume from the retryable IO error hit during the manifest write, with
+// the retry limit count set. Fail the first resume and let the second resume
+// succeed.
+TEST_F(DBErrorHandlingFSTest, ManifestWriteRetryableErrorAutoRecover) {
+ // Fail the first resume and let the second resume be successful
+ std::shared_ptr<ErrorHandlerFSListener> listener(
+ new ErrorHandlerFSListener());
+ Options options = GetDefaultOptions();
+ options.env = fault_env_.get();
+ options.create_if_missing = true;
+ options.listeners.emplace_back(listener);
+ options.max_bgerror_resume_count = 2;
+ options.bgerror_resume_retry_interval = 100000; // 0.1 second
+ Status s;
+ std::string old_manifest;
+ std::string new_manifest;
+
+ listener->EnableAutoRecovery(false);
+ DestroyAndReopen(options);
+ old_manifest = GetManifestNameFromLiveFiles();
+
+ IOStatus error_msg = IOStatus::IOError("Retryable IO Error");
+ error_msg.SetRetryable(true);
+
+ ASSERT_OK(Put(Key(0), "val"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put(Key(1), "val"));
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"RecoverFromRetryableBGIOError:BeforeStart",
+ "ManifestWriteRetryableErrorAutoRecover:0"},
+ {"ManifestWriteRetryableErrorAutoRecover:1",
+ "RecoverFromRetryableBGIOError:BeforeWait1"},
+ {"RecoverFromRetryableBGIOError:RecoverSuccess",
+ "ManifestWriteRetryableErrorAutoRecover:2"}});
+ SyncPoint::GetInstance()->SetCallBack(
+ "VersionSet::LogAndApply:WriteManifest",
+ [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); });
+ SyncPoint::GetInstance()->EnableProcessing();
+ s = Flush();
+ ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError);
+ TEST_SYNC_POINT("ManifestWriteRetryableErrorAutoRecover:0");
+ fault_fs_->SetFilesystemActive(true);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+ TEST_SYNC_POINT("ManifestWriteRetryableErrorAutoRecover:1");
+ TEST_SYNC_POINT("ManifestWriteRetryableErrorAutoRecover:2");
+ SyncPoint::GetInstance()->DisableProcessing();
+
+ new_manifest = GetManifestNameFromLiveFiles();
+ ASSERT_NE(new_manifest, old_manifest);
+
+ Reopen(options);
+ ASSERT_EQ("val", Get(Key(0)));
+ ASSERT_EQ("val", Get(Key(1)));
+ Close();
+}
+
+TEST_F(DBErrorHandlingFSTest, ManifestWriteNoWALRetryableErrorAutoRecover) {
+ // Fail the first resume and let the second resume be successful
+ std::shared_ptr<ErrorHandlerFSListener> listener(
+ new ErrorHandlerFSListener());
+ Options options = GetDefaultOptions();
+ options.env = fault_env_.get();
+ options.create_if_missing = true;
+ options.listeners.emplace_back(listener);
+ options.max_bgerror_resume_count = 2;
+ options.bgerror_resume_retry_interval = 100000; // 0.1 second
+ Status s;
+ std::string old_manifest;
+ std::string new_manifest;
+
+ listener->EnableAutoRecovery(false);
+ DestroyAndReopen(options);
+ old_manifest = GetManifestNameFromLiveFiles();
+
+ IOStatus error_msg = IOStatus::IOError("Retryable IO Error");
+ error_msg.SetRetryable(true);
+
+ WriteOptions wo = WriteOptions();
+ wo.disableWAL = true;
+ ASSERT_OK(Put(Key(0), "val", wo));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put(Key(1), "val", wo));
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"RecoverFromRetryableBGIOError:BeforeStart",
+ "ManifestWriteNoWALRetryableErrorAutoRecover:0"},
+ {"ManifestWriteNoWALRetryableErrorAutoRecover:1",
+ "RecoverFromRetryableBGIOError:BeforeWait1"},
+ {"RecoverFromRetryableBGIOError:RecoverSuccess",
+ "ManifestWriteNoWALRetryableErrorAutoRecover:2"}});
+ SyncPoint::GetInstance()->SetCallBack(
+ "VersionSet::LogAndApply:WriteManifest",
+ [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); });
+ SyncPoint::GetInstance()->EnableProcessing();
+ s = Flush();
+ ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError);
+ TEST_SYNC_POINT("ManifestWriteNoWALRetryableErrorAutoRecover:0");
+ fault_fs_->SetFilesystemActive(true);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+ TEST_SYNC_POINT("ManifestWriteNoWALRetryableErrorAutoRecover:1");
+ TEST_SYNC_POINT("ManifestWriteNoWALRetryableErrorAutoRecover:2");
+ SyncPoint::GetInstance()->DisableProcessing();
+
+ new_manifest = GetManifestNameFromLiveFiles();
+ ASSERT_NE(new_manifest, old_manifest);
+
+ Reopen(options);
+ ASSERT_EQ("val", Get(Key(0)));
+ ASSERT_EQ("val", Get(Key(1)));
+ Close();
+}
+
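+// A rough note on the ErrorHandlerFSListener helpers used in these tests (the
+// listener is defined earlier in this file): OverrideBGError() makes the
+// listener's OnBackgroundError() callback substitute the given status and
+// severity for the background error, EnableAutoRecovery() controls whether
+// the listener permits the legacy auto-recovery path in
+// OnErrorRecoveryBegin() (the retryable-error resume loop is driven
+// separately by max_bgerror_resume_count), and WaitForRecovery() blocks until
+// the recovery-end callback fires.
+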
+TEST_F(DBErrorHandlingFSTest,
+ CompactionManifestWriteRetryableErrorAutoRecover) {
+ std::shared_ptr<ErrorHandlerFSListener> listener(
+ new ErrorHandlerFSListener());
+ Options options = GetDefaultOptions();
+ options.env = fault_env_.get();
+ options.create_if_missing = true;
+ options.level0_file_num_compaction_trigger = 2;
+ options.listeners.emplace_back(listener);
+ options.max_bgerror_resume_count = 2;
+ options.bgerror_resume_retry_interval = 100000; // 0.1 second
+ Status s;
+ std::string old_manifest;
+ std::string new_manifest;
+ std::atomic<bool> fail_manifest(false);
+ DestroyAndReopen(options);
+ old_manifest = GetManifestNameFromLiveFiles();
+
+ IOStatus error_msg = IOStatus::IOError("Retryable IO Error");
+ error_msg.SetRetryable(true);
+
+ ASSERT_OK(Put(Key(0), "val"));
+ ASSERT_OK(Put(Key(2), "val"));
+ ASSERT_OK(Flush());
+
+ listener->OverrideBGError(Status(error_msg, Status::Severity::kHardError));
+ listener->EnableAutoRecovery(false);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ // Wait for flush of 2nd L0 file before starting compaction
+ {{"DBImpl::FlushMemTable:FlushMemTableFinished",
+ "BackgroundCallCompaction:0"},
+ // Wait for compaction to detect manifest write error
+ {"BackgroundCallCompaction:1", "CompactionManifestWriteErrorAR:0"},
+ // Make compaction thread wait for error to be cleared
+ {"CompactionManifestWriteErrorAR:1",
+ "DBImpl::BackgroundCallCompaction:FoundObsoleteFiles"},
+ {"CompactionManifestWriteErrorAR:2",
+ "RecoverFromRetryableBGIOError:BeforeStart"},
+ // Fail the first resume, before the wait in resume
+ {"RecoverFromRetryableBGIOError:BeforeResume0",
+ "CompactionManifestWriteErrorAR:3"},
+ // Activate the FS before the second resume
+ {"CompactionManifestWriteErrorAR:4",
+ "RecoverFromRetryableBGIOError:BeforeResume1"},
+ // Wait for the auto resume to be successful
+ {"RecoverFromRetryableBGIOError:RecoverSuccess",
+ "CompactionManifestWriteErrorAR:5"}});
+ // Trigger the manifest write failure in the compaction thread
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "BackgroundCallCompaction:0", [&](void*) { fail_manifest.store(true); });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "VersionSet::LogAndApply:WriteManifest", [&](void*) {
+ if (fail_manifest.load()) {
+ fault_fs_->SetFilesystemActive(false, error_msg);
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(Put(Key(1), "val"));
+ s = Flush();
+ ASSERT_OK(s);
+
+ TEST_SYNC_POINT("CompactionManifestWriteErrorAR:0");
+ TEST_SYNC_POINT("CompactionManifestWriteErrorAR:1");
+
+ s = dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError);
+ TEST_SYNC_POINT("CompactionManifestWriteErrorAR:2");
+ TEST_SYNC_POINT("CompactionManifestWriteErrorAR:3");
+ fault_fs_->SetFilesystemActive(true);
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ TEST_SYNC_POINT("CompactionManifestWriteErrorAR:4");
+ TEST_SYNC_POINT("CompactionManifestWriteErrorAR:5");
+ SyncPoint::GetInstance()->DisableProcessing();
+
+ new_manifest = GetManifestNameFromLiveFiles();
+ ASSERT_NE(new_manifest, old_manifest);
+
+ Reopen(options);
+ ASSERT_EQ("val", Get(Key(0)));
+ ASSERT_EQ("val", Get(Key(1)));
+ ASSERT_EQ("val", Get(Key(2)));
+ Close();
+}
+
+TEST_F(DBErrorHandlingFSTest, CompactionWriteRetryableErrorAutoRecover) {
+ // In this test, the FS is set to error in the first round of compaction, so
+ // the first compaction fails with a retryable IO error that is mapped to a
+ // soft error. Compaction is then rescheduled; in the second round the FS is
+ // set to active again and the compaction succeeds, so the test hits the
+ // CompactionJob::FinishCompactionOutputFile1 sync point.
+ std::shared_ptr<ErrorHandlerFSListener> listener(
+ new ErrorHandlerFSListener());
+ Options options = GetDefaultOptions();
+ options.env = fault_env_.get();
+ options.create_if_missing = true;
+ options.level0_file_num_compaction_trigger = 2;
+ options.listeners.emplace_back(listener);
+ Status s;
+ std::atomic<bool> fail_first(false);
+ std::atomic<bool> fail_second(true);
+ DestroyAndReopen(options);
+
+ IOStatus error_msg = IOStatus::IOError("Retryable IO Error");
+ error_msg.SetRetryable(true);
+
+ ASSERT_OK(Put(Key(0), "va;"));
+ ASSERT_OK(Put(Key(2), "va;"));
+ s = Flush();
+ ASSERT_OK(s);
+
+ listener->OverrideBGError(Status(error_msg, Status::Severity::kHardError));
+ listener->EnableAutoRecovery(false);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::FlushMemTable:FlushMemTableFinished",
+ "BackgroundCallCompaction:0"},
+ {"CompactionJob::FinishCompactionOutputFile1",
+ "CompactionWriteRetryableErrorAutoRecover0"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:Start",
+ [&](void*) { fault_fs_->SetFilesystemActive(true); });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "BackgroundCallCompaction:0", [&](void*) { fail_first.store(true); });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "CompactionJob::OpenCompactionOutputFile", [&](void*) {
+ if (fail_first.load() && fail_second.load()) {
+ fault_fs_->SetFilesystemActive(false, error_msg);
+ fail_second.store(false);
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(Put(Key(1), "val"));
+ s = Flush();
+ ASSERT_OK(s);
+
+ s = dbfull()->TEST_WaitForCompact();
+ ASSERT_OK(s);
+ TEST_SYNC_POINT("CompactionWriteRetryableErrorAutoRecover0");
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->DisableProcessing();
+ Destroy(options);
+}
+
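+// The two WAL auto-recovery tests below follow the same pattern: a first
+// batch (keys 0..99) is written and synced successfully; for a second batch
+// (keys 100..199) the "WritableFileWriter::Append:BeforePrepareWrite"
+// callback counts Append calls and fails the file system after the first two,
+// so the WAL write fails partway through and those keys are never stored;
+// after the error handler recovers, a third batch (keys 200..299) is written
+// and must survive a Reopen().
+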
+TEST_F(DBErrorHandlingFSTest, WALWriteRetryableErrorAutoRecover1) {
+ std::shared_ptr<ErrorHandlerFSListener> listener(
+ new ErrorHandlerFSListener());
+ Options options = GetDefaultOptions();
+ options.env = fault_env_.get();
+ options.create_if_missing = true;
+ options.writable_file_max_buffer_size = 32768;
+ options.listeners.emplace_back(listener);
+ options.paranoid_checks = true;
+ options.max_bgerror_resume_count = 2;
+ options.bgerror_resume_retry_interval = 100000; // 0.1 second
+ Status s;
+ Random rnd(301);
+
+ DestroyAndReopen(options);
+
+ IOStatus error_msg = IOStatus::IOError("Retryable IO Error");
+ error_msg.SetRetryable(true);
+
+ // The first batch is written successfully, with sync required.
+ {
+ WriteBatch batch;
+
+ for (auto i = 0; i < 100; ++i) {
+ ASSERT_OK(batch.Put(Key(i), rnd.RandomString(1024)));
+ }
+
+ WriteOptions wopts;
+ wopts.sync = true;
+ ASSERT_OK(dbfull()->Write(wopts, &batch));
+ };
+
+ // For the second batch, the first two file Appends succeed, then the
+ // following Appends fail with a retryable file system IOError, so this
+ // batch is never persisted.
+ {
+ WriteBatch batch;
+ int write_error = 0;
+
+ for (auto i = 100; i < 200; ++i) {
+ ASSERT_OK(batch.Put(Key(i), rnd.RandomString(1024)));
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"WALWriteErrorDone", "RecoverFromRetryableBGIOError:BeforeStart"},
+ {"RecoverFromRetryableBGIOError:BeforeResume0", "WALWriteError1:0"},
+ {"WALWriteError1:1", "RecoverFromRetryableBGIOError:BeforeResume1"},
+ {"RecoverFromRetryableBGIOError:RecoverSuccess", "WALWriteError1:2"}});
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "WritableFileWriter::Append:BeforePrepareWrite", [&](void*) {
+ write_error++;
+ if (write_error > 2) {
+ fault_fs_->SetFilesystemActive(false, error_msg);
+ }
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ WriteOptions wopts;
+ wopts.sync = true;
+ s = dbfull()->Write(wopts, &batch);
+ ASSERT_EQ(true, s.IsIOError());
+ TEST_SYNC_POINT("WALWriteErrorDone");
+
+ TEST_SYNC_POINT("WALWriteError1:0");
+ fault_fs_->SetFilesystemActive(true);
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ TEST_SYNC_POINT("WALWriteError1:1");
+ TEST_SYNC_POINT("WALWriteError1:2");
+ }
+ SyncPoint::GetInstance()->DisableProcessing();
+
+ // Data in the corrupted WAL is not stored
+ for (auto i = 0; i < 199; ++i) {
+ if (i < 100) {
+ ASSERT_NE(Get(Key(i)), "NOT_FOUND");
+ } else {
+ ASSERT_EQ(Get(Key(i)), "NOT_FOUND");
+ }
+ }
+
+ // Resume and write a new batch; it should be persisted in the WAL
+ {
+ WriteBatch batch;
+
+ for (auto i = 200; i < 300; ++i) {
+ ASSERT_OK(batch.Put(Key(i), rnd.RandomString(1024)));
+ }
+
+ WriteOptions wopts;
+ wopts.sync = true;
+ ASSERT_OK(dbfull()->Write(wopts, &batch));
+ };
+
+ Reopen(options);
+ for (auto i = 0; i < 300; ++i) {
+ if (i < 100 || i >= 200) {
+ ASSERT_NE(Get(Key(i)), "NOT_FOUND");
+ } else {
+ ASSERT_EQ(Get(Key(i)), "NOT_FOUND");
+ }
+ }
+ Close();
+}
+
+TEST_F(DBErrorHandlingFSTest, WALWriteRetryableErrorAutoRecover2) {
+ // Fail the first recovery attempt and try a second time.
+ std::shared_ptr<ErrorHandlerFSListener> listener(
+ new ErrorHandlerFSListener());
+ Options options = GetDefaultOptions();
+ options.env = fault_env_.get();
+ options.create_if_missing = true;
+ options.writable_file_max_buffer_size = 32768;
+ options.listeners.emplace_back(listener);
+ options.paranoid_checks = true;
+ options.max_bgerror_resume_count = 2;
+ options.bgerror_resume_retry_interval = 100000; // 0.1 second
+ Status s;
+ Random rnd(301);
+
+ DestroyAndReopen(options);
+
+ IOStatus error_msg = IOStatus::IOError("Retryable IO Error");
+ error_msg.SetRetryable(true);
+
+ // The first batch is written successfully, with sync required.
+ {
+ WriteBatch batch;
+
+ for (auto i = 0; i < 100; ++i) {
+ ASSERT_OK(batch.Put(Key(i), rnd.RandomString(1024)));
+ }
+
+ WriteOptions wopts;
+ wopts.sync = true;
+ ASSERT_OK(dbfull()->Write(wopts, &batch));
+ };
+
+ // For the second batch, the first two file Appends succeed, then the
+ // following Appends fail with a retryable file system IOError.
+ {
+ WriteBatch batch;
+ int write_error = 0;
+
+ for (auto i = 100; i < 200; ++i) {
+ ASSERT_OK(batch.Put(Key(i), rnd.RandomString(1024)));
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"RecoverFromRetryableBGIOError:BeforeWait0", "WALWriteError2:0"},
+ {"WALWriteError2:1", "RecoverFromRetryableBGIOError:BeforeWait1"},
+ {"RecoverFromRetryableBGIOError:RecoverSuccess", "WALWriteError2:2"}});
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "WritableFileWriter::Append:BeforePrepareWrite", [&](void*) {
+ write_error++;
+ if (write_error > 2) {
+ fault_fs_->SetFilesystemActive(false, error_msg);
+ }
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ WriteOptions wopts;
+ wopts.sync = true;
+ s = dbfull()->Write(wopts, &batch);
+ ASSERT_EQ(true, s.IsIOError());
+
+ TEST_SYNC_POINT("WALWriteError2:0");
+ fault_fs_->SetFilesystemActive(true);
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ TEST_SYNC_POINT("WALWriteError2:1");
+ TEST_SYNC_POINT("WALWriteError2:2");
+ }
+ SyncPoint::GetInstance()->DisableProcessing();
+
+ // Data in the corrupted WAL is not stored
+ for (auto i = 0; i < 199; ++i) {
+ if (i < 100) {
+ ASSERT_NE(Get(Key(i)), "NOT_FOUND");
+ } else {
+ ASSERT_EQ(Get(Key(i)), "NOT_FOUND");
+ }
+ }
+
+ // Resume and write a new batch; it should be persisted in the WAL
+ {
+ WriteBatch batch;
+
+ for (auto i = 200; i < 300; ++i) {
+ ASSERT_OK(batch.Put(Key(i), rnd.RandomString(1024)));
+ }
+
+ WriteOptions wopts;
+ wopts.sync = true;
+ ASSERT_OK(dbfull()->Write(wopts, &batch));
+ };
+
+ Reopen(options);
+ for (auto i = 0; i < 300; ++i) {
+ if (i < 100 || i >= 200) {
+ ASSERT_NE(Get(Key(i)), "NOT_FOUND");
+ } else {
+ ASSERT_EQ(Get(Key(i)), "NOT_FOUND");
+ }
+ }
+ Close();
+}
+
+// Fail auto resume from a flush retryable error and verify that the
+// OnErrorRecoveryEnd listener callback is called
+TEST_F(DBErrorHandlingFSTest, FLushWritRetryableErrorAbortRecovery) {
+ // Never activate the FS, so every resume attempt fails and recovery aborts
+ std::shared_ptr<ErrorHandlerFSListener> listener(
+ new ErrorHandlerFSListener());
+ Options options = GetDefaultOptions();
+ options.env = fault_env_.get();
+ options.create_if_missing = true;
+ options.listeners.emplace_back(listener);
+ options.max_bgerror_resume_count = 2;
+ options.bgerror_resume_retry_interval = 100000; // 0.1 second
+ Status s;
+
+ listener->EnableAutoRecovery(false);
+ DestroyAndReopen(options);
+
+ IOStatus error_msg = IOStatus::IOError("Retryable IO Error");
+ error_msg.SetRetryable(true);
+
+ ASSERT_OK(Put(Key(1), "val1"));
+ SyncPoint::GetInstance()->SetCallBack(
+ "BuildTable:BeforeFinishBuildTable",
+ [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); });
+
+ SyncPoint::GetInstance()->EnableProcessing();
+ s = Flush();
+ ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError);
+ ASSERT_EQ(listener->WaitForRecovery(5000000), true);
+ ASSERT_EQ(listener->new_bg_error(), Status::Aborted());
+ SyncPoint::GetInstance()->DisableProcessing();
+ fault_fs_->SetFilesystemActive(true);
+
+ Destroy(options);
+}
+
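+// The flush read-error tests below additionally assert on the error-handler
+// statistics tickers (ERROR_HANDLER_BG_ERROR_COUNT,
+// ERROR_HANDLER_BG_IO_ERROR_COUNT, ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT,
+// ERROR_HANDLER_AUTORESUME_COUNT and
+// ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT), read and cleared through
+// Statistics::getAndResetTickerCount().
+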
+TEST_F(DBErrorHandlingFSTest, FlushReadError) {
+ std::shared_ptr<ErrorHandlerFSListener> listener =
+ std::make_shared<ErrorHandlerFSListener>();
+ Options options = GetDefaultOptions();
+ options.env = fault_env_.get();
+ options.create_if_missing = true;
+ options.listeners.emplace_back(listener);
+ options.statistics = CreateDBStatistics();
+ Status s;
+
+ listener->EnableAutoRecovery(false);
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put(Key(0), "val"));
+ SyncPoint::GetInstance()->SetCallBack(
+ "BuildTable:BeforeOutputValidation", [&](void*) {
+ IOStatus st = IOStatus::IOError();
+ st.SetRetryable(true);
+ st.SetScope(IOStatus::IOErrorScope::kIOErrorScopeFile);
+ fault_fs_->SetFilesystemActive(false, st);
+ });
+ SyncPoint::GetInstance()->SetCallBack(
+ "BuildTable:BeforeDeleteFile",
+ [&](void*) { fault_fs_->SetFilesystemActive(true, IOStatus::OK()); });
+ SyncPoint::GetInstance()->EnableProcessing();
+ s = Flush();
+ ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError);
+ SyncPoint::GetInstance()->DisableProcessing();
+ fault_fs_->SetFilesystemActive(true);
+ ASSERT_EQ(listener->WaitForRecovery(5000000), true);
+ ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
+ ERROR_HANDLER_BG_ERROR_COUNT));
+ ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
+ ERROR_HANDLER_BG_IO_ERROR_COUNT));
+ ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
+ ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT));
+ ASSERT_LE(1, options.statistics->getAndResetTickerCount(
+ ERROR_HANDLER_AUTORESUME_COUNT));
+ ASSERT_LE(0, options.statistics->getAndResetTickerCount(
+ ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT));
+ s = dbfull()->TEST_GetBGError();
+ ASSERT_OK(s);
+
+ Reopen(GetDefaultOptions());
+ ASSERT_EQ("val", Get(Key(0)));
+}
+
+TEST_F(DBErrorHandlingFSTest, AtomicFlushReadError) {
+ std::shared_ptr<ErrorHandlerFSListener> listener =
+ std::make_shared<ErrorHandlerFSListener>();
+ Options options = GetDefaultOptions();
+ options.env = fault_env_.get();
+ options.create_if_missing = true;
+ options.listeners.emplace_back(listener);
+ options.statistics = CreateDBStatistics();
+ Status s;
+
+ listener->EnableAutoRecovery(false);
+ options.atomic_flush = true;
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ ASSERT_OK(Put(0, Key(0), "val"));
+ ASSERT_OK(Put(1, Key(0), "val"));
+ SyncPoint::GetInstance()->SetCallBack(
+ "BuildTable:BeforeOutputValidation", [&](void*) {
+ IOStatus st = IOStatus::IOError();
+ st.SetRetryable(true);
+ st.SetScope(IOStatus::IOErrorScope::kIOErrorScopeFile);
+ fault_fs_->SetFilesystemActive(false, st);
+ });
+ SyncPoint::GetInstance()->SetCallBack(
+ "BuildTable:BeforeDeleteFile",
+ [&](void*) { fault_fs_->SetFilesystemActive(true, IOStatus::OK()); });
+ SyncPoint::GetInstance()->EnableProcessing();
+ s = Flush({0, 1});
+ ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError);
+ SyncPoint::GetInstance()->DisableProcessing();
+ fault_fs_->SetFilesystemActive(true);
+ ASSERT_EQ(listener->WaitForRecovery(5000000), true);
+ ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
+ ERROR_HANDLER_BG_ERROR_COUNT));
+ ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
+ ERROR_HANDLER_BG_IO_ERROR_COUNT));
+ ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
+ ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT));
+ ASSERT_LE(1, options.statistics->getAndResetTickerCount(
+ ERROR_HANDLER_AUTORESUME_COUNT));
+ ASSERT_LE(0, options.statistics->getAndResetTickerCount(
+ ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT));
+ s = dbfull()->TEST_GetBGError();
+ ASSERT_OK(s);
+
+ TryReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"},
+ GetDefaultOptions());
+ ASSERT_EQ("val", Get(Key(0)));
+}
+
+TEST_F(DBErrorHandlingFSTest, AtomicFlushNoSpaceError) {
+ std::shared_ptr<ErrorHandlerFSListener> listener =
+ std::make_shared<ErrorHandlerFSListener>();
+ Options options = GetDefaultOptions();
+ options.env = fault_env_.get();
+ options.create_if_missing = true;
+ options.listeners.emplace_back(listener);
+ options.statistics = CreateDBStatistics();
+ Status s;
+
+ listener->EnableAutoRecovery(true);
+ options.atomic_flush = true;
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ ASSERT_OK(Put(0, Key(0), "val"));
+ ASSERT_OK(Put(1, Key(0), "val"));
+ SyncPoint::GetInstance()->SetCallBack("BuildTable:create_file", [&](void*) {
+ IOStatus st = IOStatus::NoSpace();
+ fault_fs_->SetFilesystemActive(false, st);
+ });
+ SyncPoint::GetInstance()->SetCallBack(
+ "BuildTable:BeforeDeleteFile",
+ [&](void*) { fault_fs_->SetFilesystemActive(true, IOStatus::OK()); });
+ SyncPoint::GetInstance()->EnableProcessing();
+ s = Flush({0, 1});
+ ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError);
+ SyncPoint::GetInstance()->DisableProcessing();
+ fault_fs_->SetFilesystemActive(true);
+ ASSERT_EQ(listener->WaitForRecovery(5000000), true);
+ ASSERT_LE(1, options.statistics->getAndResetTickerCount(
+ ERROR_HANDLER_BG_ERROR_COUNT));
+ ASSERT_LE(1, options.statistics->getAndResetTickerCount(
+ ERROR_HANDLER_BG_IO_ERROR_COUNT));
+ s = dbfull()->TEST_GetBGError();
+ ASSERT_OK(s);
+
+ TryReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"},
+ GetDefaultOptions());
+ ASSERT_EQ("val", Get(Key(0)));
+}
+
+TEST_F(DBErrorHandlingFSTest, CompactionReadRetryableErrorAutoRecover) {
+ // In this test, the FS is set to error in the first round of compaction, so
+ // the first compaction fails with a retryable IO error that is mapped to a
+ // soft error. Compaction is then rescheduled; in the second round the FS is
+ // set to active again and the compaction succeeds, so the test hits the
+ // CompactionJob::FinishCompactionOutputFile1 sync point.
+ std::shared_ptr<ErrorHandlerFSListener> listener =
+ std::make_shared<ErrorHandlerFSListener>();
+ Options options = GetDefaultOptions();
+ options.env = fault_env_.get();
+ options.create_if_missing = true;
+ options.level0_file_num_compaction_trigger = 2;
+ options.listeners.emplace_back(listener);
+ BlockBasedTableOptions table_options;
+ table_options.no_block_cache = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ Status s;
+ std::atomic<bool> fail_first(false);
+ std::atomic<bool> fail_second(true);
+ Random rnd(301);
+ DestroyAndReopen(options);
+
+ IOStatus error_msg = IOStatus::IOError("Retryable IO Error");
+ error_msg.SetRetryable(true);
+
+ for (int i = 0; i < 100; ++i) {
+ ASSERT_OK(Put(Key(i), rnd.RandomString(1024)));
+ }
+ s = Flush();
+ ASSERT_OK(s);
+
+ listener->OverrideBGError(Status(error_msg, Status::Severity::kHardError));
+ listener->EnableAutoRecovery(false);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::FlushMemTable:FlushMemTableFinished",
+ "BackgroundCallCompaction:0"},
+ {"CompactionJob::FinishCompactionOutputFile1",
+ "CompactionWriteRetryableErrorAutoRecover0"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:Start",
+ [&](void*) { fault_fs_->SetFilesystemActive(true); });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "BackgroundCallCompaction:0", [&](void*) { fail_first.store(true); });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "CompactionJob::Run():PausingManualCompaction:2", [&](void*) {
+ if (fail_first.load() && fail_second.load()) {
+ fault_fs_->SetFilesystemActive(false, error_msg);
+ fail_second.store(false);
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(Put(Key(1), "val"));
+ s = Flush();
+ ASSERT_OK(s);
+
+ s = dbfull()->TEST_WaitForCompact();
+ ASSERT_OK(s);
+ TEST_SYNC_POINT("CompactionWriteRetryableErrorAutoRecover0");
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->DisableProcessing();
+
+ Reopen(GetDefaultOptions());
+}
+
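+// The parameterized tests below inject IOStatus::IOFenced() errors. A fenced
+// IO error is treated as unrecoverable: the background error is raised to
+// kFatalError regardless of the paranoid_checks test parameter, and Resume()
+// keeps returning an IOFenced status even after the file system is
+// reactivated.
+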
+class DBErrorHandlingFencingTest : public DBErrorHandlingFSTest,
+ public testing::WithParamInterface<bool> {};
+
+TEST_P(DBErrorHandlingFencingTest, FLushWriteFenced) {
+ std::shared_ptr<ErrorHandlerFSListener> listener(
+ new ErrorHandlerFSListener());
+ Options options = GetDefaultOptions();
+ options.env = fault_env_.get();
+ options.create_if_missing = true;
+ options.listeners.emplace_back(listener);
+ options.paranoid_checks = GetParam();
+ Status s;
+
+ listener->EnableAutoRecovery(true);
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put(Key(0), "val"));
+ SyncPoint::GetInstance()->SetCallBack("FlushJob::Start", [&](void*) {
+ fault_fs_->SetFilesystemActive(false, IOStatus::IOFenced("IO fenced"));
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ s = Flush();
+ ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kFatalError);
+ ASSERT_TRUE(s.IsIOFenced());
+ SyncPoint::GetInstance()->DisableProcessing();
+ fault_fs_->SetFilesystemActive(true);
+ s = dbfull()->Resume();
+ ASSERT_TRUE(s.IsIOFenced());
+ Destroy(options);
+}
+
+TEST_P(DBErrorHandlingFencingTest, ManifestWriteFenced) {
+ std::shared_ptr<ErrorHandlerFSListener> listener(
+ new ErrorHandlerFSListener());
+ Options options = GetDefaultOptions();
+ options.env = fault_env_.get();
+ options.create_if_missing = true;
+ options.listeners.emplace_back(listener);
+ options.paranoid_checks = GetParam();
+ Status s;
+ std::string old_manifest;
+ std::string new_manifest;
+
+ listener->EnableAutoRecovery(true);
+ DestroyAndReopen(options);
+ old_manifest = GetManifestNameFromLiveFiles();
+
+ ASSERT_OK(Put(Key(0), "val"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put(Key(1), "val"));
+ SyncPoint::GetInstance()->SetCallBack(
+ "VersionSet::LogAndApply:WriteManifest", [&](void*) {
+ fault_fs_->SetFilesystemActive(false, IOStatus::IOFenced("IO fenced"));
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ s = Flush();
+ ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kFatalError);
+ ASSERT_TRUE(s.IsIOFenced());
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->DisableProcessing();
+ fault_fs_->SetFilesystemActive(true);
+ s = dbfull()->Resume();
+ ASSERT_TRUE(s.IsIOFenced());
+ Close();
+}
+
+TEST_P(DBErrorHandlingFencingTest, CompactionWriteFenced) {
+ std::shared_ptr<ErrorHandlerFSListener> listener(
+ new ErrorHandlerFSListener());
+ Options options = GetDefaultOptions();
+ options.env = fault_env_.get();
+ options.create_if_missing = true;
+ options.level0_file_num_compaction_trigger = 2;
+ options.listeners.emplace_back(listener);
+ options.paranoid_checks = GetParam();
+ Status s;
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put(Key(0), "va;"));
+ ASSERT_OK(Put(Key(2), "va;"));
+ s = Flush();
+ ASSERT_OK(s);
+
+ listener->EnableAutoRecovery(true);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::FlushMemTable:FlushMemTableFinished",
+ "BackgroundCallCompaction:0"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "BackgroundCallCompaction:0", [&](void*) {
+ fault_fs_->SetFilesystemActive(false, IOStatus::IOFenced("IO fenced"));
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(Put(Key(1), "val"));
+ s = Flush();
+ ASSERT_OK(s);
+
+ s = dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kFatalError);
+ ASSERT_TRUE(s.IsIOFenced());
+
+ fault_fs_->SetFilesystemActive(true);
+ s = dbfull()->Resume();
+ ASSERT_TRUE(s.IsIOFenced());
+ Destroy(options);
+}
+
+TEST_P(DBErrorHandlingFencingTest, WALWriteFenced) {
+ std::shared_ptr<ErrorHandlerFSListener> listener(
+ new ErrorHandlerFSListener());
+ Options options = GetDefaultOptions();
+ options.env = fault_env_.get();
+ options.create_if_missing = true;
+ options.writable_file_max_buffer_size = 32768;
+ options.listeners.emplace_back(listener);
+ options.paranoid_checks = GetParam();
+ Status s;
+ Random rnd(301);
+
+ listener->EnableAutoRecovery(true);
+ DestroyAndReopen(options);
+
+ {
+ WriteBatch batch;
+
+ for (auto i = 0; i < 100; ++i) {
+ ASSERT_OK(batch.Put(Key(i), rnd.RandomString(1024)));
+ }
+
+ WriteOptions wopts;
+ wopts.sync = true;
+ ASSERT_OK(dbfull()->Write(wopts, &batch));
+ };
+
+ {
+ WriteBatch batch;
+ int write_error = 0;
+
+ for (auto i = 100; i < 199; ++i) {
+ ASSERT_OK(batch.Put(Key(i), rnd.RandomString(1024)));
+ }
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "WritableFileWriter::Append:BeforePrepareWrite", [&](void*) {
+ write_error++;
+ if (write_error > 2) {
+ fault_fs_->SetFilesystemActive(false,
+ IOStatus::IOFenced("IO fenced"));
+ }
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ WriteOptions wopts;
+ wopts.sync = true;
+ s = dbfull()->Write(wopts, &batch);
+ ASSERT_TRUE(s.IsIOFenced());
+ }
+ SyncPoint::GetInstance()->DisableProcessing();
+ fault_fs_->SetFilesystemActive(true);
+ {
+ WriteBatch batch;
+
+ for (auto i = 0; i < 100; ++i) {
+ ASSERT_OK(batch.Put(Key(i), rnd.RandomString(1024)));
+ }
+
+ WriteOptions wopts;
+ wopts.sync = true;
+ s = dbfull()->Write(wopts, &batch);
+ ASSERT_TRUE(s.IsIOFenced());
+ }
+ Close();
+}
+
+INSTANTIATE_TEST_CASE_P(DBErrorHandlingFSTest, DBErrorHandlingFencingTest,
+ ::testing::Bool());
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+ fprintf(stderr, "SKIPPED as error handling is not supported in ROCKSDB_LITE\n");
+ return 0;
+}
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/db/event_helpers.cc b/src/rocksdb/db/event_helpers.cc
new file mode 100644
index 000000000..7987b8ec6
--- /dev/null
+++ b/src/rocksdb/db/event_helpers.cc
@@ -0,0 +1,371 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/event_helpers.h"
+
+#include "rocksdb/convenience.h"
+#include "rocksdb/listener.h"
+#include "rocksdb/utilities/customizable_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+#ifndef ROCKSDB_LITE
+Status EventListener::CreateFromString(const ConfigOptions& config_options,
+ const std::string& id,
+ std::shared_ptr<EventListener>* result) {
+ return LoadSharedObject<EventListener>(config_options, id, nullptr, result);
+}
+#endif // ROCKSDB_LITE
+
+namespace {
+template <class T>
+inline T SafeDivide(T a, T b) {
+ return b == 0 ? 0 : a / b;
+}
+} // anonymous namespace
+
+void EventHelpers::AppendCurrentTime(JSONWriter* jwriter) {
+ *jwriter << "time_micros"
+ << std::chrono::duration_cast<std::chrono::microseconds>(
+ std::chrono::system_clock::now().time_since_epoch())
+ .count();
+}
+
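+// The Log*Finished and *Deletion helpers below build one JSON record per
+// event by streaming alternating key/value tokens into a JSONWriter and
+// handing the finished record to EventLogger::Log(). In the info log this
+// typically appears as a single line, roughly of the form (abbreviated,
+// illustrative values):
+//
+//   EVENT_LOG_v1 {"time_micros": 1617..., "cf_name": "default", "job": 7,
+//                 "event": "table_file_creation", "file_number": 12, ...}
+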
+#ifndef ROCKSDB_LITE
+void EventHelpers::NotifyTableFileCreationStarted(
+ const std::vector<std::shared_ptr<EventListener>>& listeners,
+ const std::string& db_name, const std::string& cf_name,
+ const std::string& file_path, int job_id, TableFileCreationReason reason) {
+ if (listeners.empty()) {
+ return;
+ }
+ TableFileCreationBriefInfo info;
+ info.db_name = db_name;
+ info.cf_name = cf_name;
+ info.file_path = file_path;
+ info.job_id = job_id;
+ info.reason = reason;
+ for (auto& listener : listeners) {
+ listener->OnTableFileCreationStarted(info);
+ }
+}
+#endif // !ROCKSDB_LITE
+
+void EventHelpers::NotifyOnBackgroundError(
+ const std::vector<std::shared_ptr<EventListener>>& listeners,
+ BackgroundErrorReason reason, Status* bg_error, InstrumentedMutex* db_mutex,
+ bool* auto_recovery) {
+#ifndef ROCKSDB_LITE
+ if (listeners.empty()) {
+ return;
+ }
+ db_mutex->AssertHeld();
+ // release lock while notifying events
+ db_mutex->Unlock();
+ for (auto& listener : listeners) {
+ listener->OnBackgroundError(reason, bg_error);
+ bg_error->PermitUncheckedError();
+ if (*auto_recovery) {
+ listener->OnErrorRecoveryBegin(reason, *bg_error, auto_recovery);
+ }
+ }
+ db_mutex->Lock();
+#else
+ (void)listeners;
+ (void)reason;
+ (void)bg_error;
+ (void)db_mutex;
+ (void)auto_recovery;
+#endif // ROCKSDB_LITE
+}
+
+void EventHelpers::LogAndNotifyTableFileCreationFinished(
+ EventLogger* event_logger,
+ const std::vector<std::shared_ptr<EventListener>>& listeners,
+ const std::string& db_name, const std::string& cf_name,
+ const std::string& file_path, int job_id, const FileDescriptor& fd,
+ uint64_t oldest_blob_file_number, const TableProperties& table_properties,
+ TableFileCreationReason reason, const Status& s,
+ const std::string& file_checksum,
+ const std::string& file_checksum_func_name) {
+ if (s.ok() && event_logger) {
+ JSONWriter jwriter;
+ AppendCurrentTime(&jwriter);
+ jwriter << "cf_name" << cf_name << "job" << job_id << "event"
+ << "table_file_creation"
+ << "file_number" << fd.GetNumber() << "file_size"
+ << fd.GetFileSize() << "file_checksum"
+ << Slice(file_checksum).ToString(true) << "file_checksum_func_name"
+ << file_checksum_func_name << "smallest_seqno" << fd.smallest_seqno
+ << "largest_seqno" << fd.largest_seqno;
+
+ // table_properties
+ {
+ jwriter << "table_properties";
+ jwriter.StartObject();
+
+ // basic properties:
+ jwriter << "data_size" << table_properties.data_size << "index_size"
+ << table_properties.index_size << "index_partitions"
+ << table_properties.index_partitions << "top_level_index_size"
+ << table_properties.top_level_index_size
+ << "index_key_is_user_key"
+ << table_properties.index_key_is_user_key
+ << "index_value_is_delta_encoded"
+ << table_properties.index_value_is_delta_encoded << "filter_size"
+ << table_properties.filter_size << "raw_key_size"
+ << table_properties.raw_key_size << "raw_average_key_size"
+ << SafeDivide(table_properties.raw_key_size,
+ table_properties.num_entries)
+ << "raw_value_size" << table_properties.raw_value_size
+ << "raw_average_value_size"
+ << SafeDivide(table_properties.raw_value_size,
+ table_properties.num_entries)
+ << "num_data_blocks" << table_properties.num_data_blocks
+ << "num_entries" << table_properties.num_entries
+ << "num_filter_entries" << table_properties.num_filter_entries
+ << "num_deletions" << table_properties.num_deletions
+ << "num_merge_operands" << table_properties.num_merge_operands
+ << "num_range_deletions" << table_properties.num_range_deletions
+ << "format_version" << table_properties.format_version
+ << "fixed_key_len" << table_properties.fixed_key_len
+ << "filter_policy" << table_properties.filter_policy_name
+ << "column_family_name" << table_properties.column_family_name
+ << "column_family_id" << table_properties.column_family_id
+ << "comparator" << table_properties.comparator_name
+ << "merge_operator" << table_properties.merge_operator_name
+ << "prefix_extractor_name"
+ << table_properties.prefix_extractor_name << "property_collectors"
+ << table_properties.property_collectors_names << "compression"
+ << table_properties.compression_name << "compression_options"
+ << table_properties.compression_options << "creation_time"
+ << table_properties.creation_time << "oldest_key_time"
+ << table_properties.oldest_key_time << "file_creation_time"
+ << table_properties.file_creation_time
+ << "slow_compression_estimated_data_size"
+ << table_properties.slow_compression_estimated_data_size
+ << "fast_compression_estimated_data_size"
+ << table_properties.fast_compression_estimated_data_size
+ << "db_id" << table_properties.db_id << "db_session_id"
+ << table_properties.db_session_id << "orig_file_number"
+ << table_properties.orig_file_number << "seqno_to_time_mapping";
+
+ if (table_properties.seqno_to_time_mapping.empty()) {
+ jwriter << "N/A";
+ } else {
+ SeqnoToTimeMapping tmp;
+ Status status = tmp.Add(table_properties.seqno_to_time_mapping);
+ if (status.ok()) {
+ jwriter << tmp.ToHumanString();
+ } else {
+ jwriter << "Invalid";
+ }
+ }
+
+ // user collected properties
+ for (const auto& prop : table_properties.readable_properties) {
+ jwriter << prop.first << prop.second;
+ }
+ jwriter.EndObject();
+ }
+
+ if (oldest_blob_file_number != kInvalidBlobFileNumber) {
+ jwriter << "oldest_blob_file_number" << oldest_blob_file_number;
+ }
+
+ jwriter.EndObject();
+
+ event_logger->Log(jwriter);
+ }
+
+#ifndef ROCKSDB_LITE
+ if (listeners.empty()) {
+ return;
+ }
+ TableFileCreationInfo info;
+ info.db_name = db_name;
+ info.cf_name = cf_name;
+ info.file_path = file_path;
+ info.file_size = fd.file_size;
+ info.job_id = job_id;
+ info.table_properties = table_properties;
+ info.reason = reason;
+ info.status = s;
+ info.file_checksum = file_checksum;
+ info.file_checksum_func_name = file_checksum_func_name;
+ for (auto& listener : listeners) {
+ listener->OnTableFileCreated(info);
+ }
+ info.status.PermitUncheckedError();
+#else
+ (void)listeners;
+ (void)db_name;
+ (void)cf_name;
+ (void)file_path;
+ (void)reason;
+#endif // !ROCKSDB_LITE
+}
+
+void EventHelpers::LogAndNotifyTableFileDeletion(
+ EventLogger* event_logger, int job_id, uint64_t file_number,
+ const std::string& file_path, const Status& status,
+ const std::string& dbname,
+ const std::vector<std::shared_ptr<EventListener>>& listeners) {
+ JSONWriter jwriter;
+ AppendCurrentTime(&jwriter);
+
+ jwriter << "job" << job_id << "event"
+ << "table_file_deletion"
+ << "file_number" << file_number;
+ if (!status.ok()) {
+ jwriter << "status" << status.ToString();
+ }
+
+ jwriter.EndObject();
+
+ event_logger->Log(jwriter);
+
+#ifndef ROCKSDB_LITE
+ if (listeners.empty()) {
+ return;
+ }
+ TableFileDeletionInfo info;
+ info.db_name = dbname;
+ info.job_id = job_id;
+ info.file_path = file_path;
+ info.status = status;
+ for (auto& listener : listeners) {
+ listener->OnTableFileDeleted(info);
+ }
+ info.status.PermitUncheckedError();
+#else
+ (void)file_path;
+ (void)dbname;
+ (void)listeners;
+#endif // !ROCKSDB_LITE
+}
+
+void EventHelpers::NotifyOnErrorRecoveryEnd(
+ const std::vector<std::shared_ptr<EventListener>>& listeners,
+ const Status& old_bg_error, const Status& new_bg_error,
+ InstrumentedMutex* db_mutex) {
+#ifndef ROCKSDB_LITE
+ if (!listeners.empty()) {
+ db_mutex->AssertHeld();
+ // release lock while notifying events
+ db_mutex->Unlock();
+ for (auto& listener : listeners) {
+ BackgroundErrorRecoveryInfo info;
+ info.old_bg_error = old_bg_error;
+ info.new_bg_error = new_bg_error;
+ listener->OnErrorRecoveryCompleted(old_bg_error);
+ listener->OnErrorRecoveryEnd(info);
+ info.old_bg_error.PermitUncheckedError();
+ info.new_bg_error.PermitUncheckedError();
+ }
+ db_mutex->Lock();
+ }
+#else
+ (void)listeners;
+ (void)old_bg_error;
+ (void)new_bg_error;
+ (void)db_mutex;
+#endif // ROCKSDB_LITE
+}
+
+#ifndef ROCKSDB_LITE
+void EventHelpers::NotifyBlobFileCreationStarted(
+ const std::vector<std::shared_ptr<EventListener>>& listeners,
+ const std::string& db_name, const std::string& cf_name,
+ const std::string& file_path, int job_id,
+ BlobFileCreationReason creation_reason) {
+ if (listeners.empty()) {
+ return;
+ }
+ BlobFileCreationBriefInfo info(db_name, cf_name, file_path, job_id,
+ creation_reason);
+ for (const auto& listener : listeners) {
+ listener->OnBlobFileCreationStarted(info);
+ }
+}
+#endif // !ROCKSDB_LITE
+
+void EventHelpers::LogAndNotifyBlobFileCreationFinished(
+ EventLogger* event_logger,
+ const std::vector<std::shared_ptr<EventListener>>& listeners,
+ const std::string& db_name, const std::string& cf_name,
+ const std::string& file_path, int job_id, uint64_t file_number,
+ BlobFileCreationReason creation_reason, const Status& s,
+ const std::string& file_checksum,
+ const std::string& file_checksum_func_name, uint64_t total_blob_count,
+ uint64_t total_blob_bytes) {
+ if (s.ok() && event_logger) {
+ JSONWriter jwriter;
+ AppendCurrentTime(&jwriter);
+ jwriter << "cf_name" << cf_name << "job" << job_id << "event"
+ << "blob_file_creation"
+ << "file_number" << file_number << "total_blob_count"
+ << total_blob_count << "total_blob_bytes" << total_blob_bytes
+ << "file_checksum" << file_checksum << "file_checksum_func_name"
+ << file_checksum_func_name << "status" << s.ToString();
+
+ jwriter.EndObject();
+ event_logger->Log(jwriter);
+ }
+
+#ifndef ROCKSDB_LITE
+ if (listeners.empty()) {
+ return;
+ }
+ BlobFileCreationInfo info(db_name, cf_name, file_path, job_id,
+ creation_reason, total_blob_count, total_blob_bytes,
+ s, file_checksum, file_checksum_func_name);
+ for (const auto& listener : listeners) {
+ listener->OnBlobFileCreated(info);
+ }
+ info.status.PermitUncheckedError();
+#else
+ (void)listeners;
+ (void)db_name;
+ (void)file_path;
+ (void)creation_reason;
+#endif
+}
+
+void EventHelpers::LogAndNotifyBlobFileDeletion(
+ EventLogger* event_logger,
+ const std::vector<std::shared_ptr<EventListener>>& listeners, int job_id,
+ uint64_t file_number, const std::string& file_path, const Status& status,
+ const std::string& dbname) {
+ if (event_logger) {
+ JSONWriter jwriter;
+ AppendCurrentTime(&jwriter);
+
+ jwriter << "job" << job_id << "event"
+ << "blob_file_deletion"
+ << "file_number" << file_number;
+ if (!status.ok()) {
+ jwriter << "status" << status.ToString();
+ }
+
+ jwriter.EndObject();
+ event_logger->Log(jwriter);
+ }
+#ifndef ROCKSDB_LITE
+ if (listeners.empty()) {
+ return;
+ }
+ BlobFileDeletionInfo info(dbname, file_path, job_id, status);
+ for (const auto& listener : listeners) {
+ listener->OnBlobFileDeleted(info);
+ }
+ info.status.PermitUncheckedError();
+#else
+ (void)listeners;
+ (void)dbname;
+ (void)file_path;
+#endif // !ROCKSDB_LITE
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/event_helpers.h b/src/rocksdb/db/event_helpers.h
new file mode 100644
index 000000000..68d819fe6
--- /dev/null
+++ b/src/rocksdb/db/event_helpers.h
@@ -0,0 +1,82 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "db/column_family.h"
+#include "db/version_edit.h"
+#include "logging/event_logger.h"
+#include "rocksdb/listener.h"
+#include "rocksdb/table_properties.h"
+
+namespace ROCKSDB_NAMESPACE {
+
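+// Static helpers that emit structured event-log records (via EventLogger and
+// JSONWriter) and fan the corresponding notifications out to the registered
+// EventListeners, covering table/blob file creation and deletion, background
+// errors, and error-recovery completion.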
+class EventHelpers {
+ public:
+ static void AppendCurrentTime(JSONWriter* json_writer);
+#ifndef ROCKSDB_LITE
+ static void NotifyTableFileCreationStarted(
+ const std::vector<std::shared_ptr<EventListener>>& listeners,
+ const std::string& db_name, const std::string& cf_name,
+ const std::string& file_path, int job_id, TableFileCreationReason reason);
+#endif // !ROCKSDB_LITE
+ static void NotifyOnBackgroundError(
+ const std::vector<std::shared_ptr<EventListener>>& listeners,
+ BackgroundErrorReason reason, Status* bg_error,
+ InstrumentedMutex* db_mutex, bool* auto_recovery);
+ static void LogAndNotifyTableFileCreationFinished(
+ EventLogger* event_logger,
+ const std::vector<std::shared_ptr<EventListener>>& listeners,
+ const std::string& db_name, const std::string& cf_name,
+ const std::string& file_path, int job_id, const FileDescriptor& fd,
+ uint64_t oldest_blob_file_number, const TableProperties& table_properties,
+ TableFileCreationReason reason, const Status& s,
+ const std::string& file_checksum,
+ const std::string& file_checksum_func_name);
+ static void LogAndNotifyTableFileDeletion(
+ EventLogger* event_logger, int job_id, uint64_t file_number,
+ const std::string& file_path, const Status& status,
+ const std::string& db_name,
+ const std::vector<std::shared_ptr<EventListener>>& listeners);
+ static void NotifyOnErrorRecoveryEnd(
+ const std::vector<std::shared_ptr<EventListener>>& listeners,
+ const Status& old_bg_error, const Status& new_bg_error,
+ InstrumentedMutex* db_mutex);
+
+#ifndef ROCKSDB_LITE
+ static void NotifyBlobFileCreationStarted(
+ const std::vector<std::shared_ptr<EventListener>>& listeners,
+ const std::string& db_name, const std::string& cf_name,
+ const std::string& file_path, int job_id,
+ BlobFileCreationReason creation_reason);
+#endif // !ROCKSDB_LITE
+
+ static void LogAndNotifyBlobFileCreationFinished(
+ EventLogger* event_logger,
+ const std::vector<std::shared_ptr<EventListener>>& listeners,
+ const std::string& db_name, const std::string& cf_name,
+ const std::string& file_path, int job_id, uint64_t file_number,
+ BlobFileCreationReason creation_reason, const Status& s,
+ const std::string& file_checksum,
+ const std::string& file_checksum_func_name, uint64_t total_blob_count,
+ uint64_t total_blob_bytes);
+
+ static void LogAndNotifyBlobFileDeletion(
+ EventLogger* event_logger,
+ const std::vector<std::shared_ptr<EventListener>>& listeners, int job_id,
+ uint64_t file_number, const std::string& file_path, const Status& status,
+ const std::string& db_name);
+
+ private:
+ static void LogAndNotifyTableFileCreation(
+ EventLogger* event_logger,
+ const std::vector<std::shared_ptr<EventListener>>& listeners,
+ const FileDescriptor& fd, const TableFileCreationInfo& info);
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/experimental.cc b/src/rocksdb/db/experimental.cc
new file mode 100644
index 000000000..d838ebde5
--- /dev/null
+++ b/src/rocksdb/db/experimental.cc
@@ -0,0 +1,155 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "rocksdb/experimental.h"
+
+#include "db/db_impl/db_impl.h"
+#include "db/version_util.h"
+#include "logging/logging.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace experimental {
+
+#ifndef ROCKSDB_LITE
+
+Status SuggestCompactRange(DB* db, ColumnFamilyHandle* column_family,
+ const Slice* begin, const Slice* end) {
+ if (db == nullptr) {
+ return Status::InvalidArgument("DB is empty");
+ }
+
+ return db->SuggestCompactRange(column_family, begin, end);
+}
+
+Status PromoteL0(DB* db, ColumnFamilyHandle* column_family, int target_level) {
+ if (db == nullptr) {
+ return Status::InvalidArgument("Didn't recognize DB object");
+ }
+ return db->PromoteL0(column_family, target_level);
+}
+
+#else // ROCKSDB_LITE
+
+Status SuggestCompactRange(DB* /*db*/, ColumnFamilyHandle* /*column_family*/,
+ const Slice* /*begin*/, const Slice* /*end*/) {
+ return Status::NotSupported("Not supported in RocksDB LITE");
+}
+
+Status PromoteL0(DB* /*db*/, ColumnFamilyHandle* /*column_family*/,
+ int /*target_level*/) {
+ return Status::NotSupported("Not supported in RocksDB LITE");
+}
+
+#endif // ROCKSDB_LITE
+
+Status SuggestCompactRange(DB* db, const Slice* begin, const Slice* end) {
+ return SuggestCompactRange(db, db->DefaultColumnFamily(), begin, end);
+}
+
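+// Bring the manifest in line with the current on-disk state of the live SST
+// files: recover the version set offline, walk every level of each
+// initialized, non-dropped column family, open each table file through the
+// file system with Temperature::kUnknown (so all tiers are searched), and,
+// when opts.update_temperatures is set and the reported temperature disagrees
+// with the manifest, re-add the file entry with the corrected temperature via
+// a per-column-family VersionEdit.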
+Status UpdateManifestForFilesState(
+ const DBOptions& db_opts, const std::string& db_name,
+ const std::vector<ColumnFamilyDescriptor>& column_families,
+ const UpdateManifestForFilesStateOptions& opts) {
+ OfflineManifestWriter w(db_opts, db_name);
+ Status s = w.Recover(column_families);
+
+ size_t files_updated = 0;
+ size_t cfs_updated = 0;
+ auto fs = db_opts.env->GetFileSystem();
+
+ for (auto cfd : *w.Versions().GetColumnFamilySet()) {
+ if (!s.ok()) {
+ break;
+ }
+ assert(cfd);
+
+ if (cfd->IsDropped() || !cfd->initialized()) {
+ continue;
+ }
+
+ const auto* current = cfd->current();
+ assert(current);
+
+ const auto* vstorage = current->storage_info();
+ assert(vstorage);
+
+ VersionEdit edit;
+ edit.SetColumnFamily(cfd->GetID());
+
+ /* SST files */
+ for (int level = 0; level < cfd->NumberLevels(); level++) {
+ if (!s.ok()) {
+ break;
+ }
+ const auto& level_files = vstorage->LevelFiles(level);
+
+ for (const auto& lf : level_files) {
+ assert(lf);
+
+ uint64_t number = lf->fd.GetNumber();
+ std::string fname =
+ TableFileName(w.IOptions().db_paths, number, lf->fd.GetPathId());
+
+ std::unique_ptr<FSSequentialFile> f;
+ FileOptions fopts;
+ // Use kUnknown to signal the FileSystem to search all tiers for the
+ // file.
+ fopts.temperature = Temperature::kUnknown;
+
+ IOStatus file_ios =
+ fs->NewSequentialFile(fname, fopts, &f, /*dbg*/ nullptr);
+ if (file_ios.ok()) {
+ if (opts.update_temperatures) {
+ Temperature temp = f->GetTemperature();
+ if (temp != Temperature::kUnknown && temp != lf->temperature) {
+ // Current state inconsistent with manifest
+ ++files_updated;
+ edit.DeleteFile(level, number);
+ edit.AddFile(
+ level, number, lf->fd.GetPathId(), lf->fd.GetFileSize(),
+ lf->smallest, lf->largest, lf->fd.smallest_seqno,
+ lf->fd.largest_seqno, lf->marked_for_compaction, temp,
+ lf->oldest_blob_file_number, lf->oldest_ancester_time,
+ lf->file_creation_time, lf->file_checksum,
+ lf->file_checksum_func_name, lf->unique_id);
+ }
+ }
+ } else {
+ s = file_ios;
+ break;
+ }
+ }
+ }
+
+ if (s.ok() && edit.NumEntries() > 0) {
+ std::unique_ptr<FSDirectory> db_dir;
+ s = fs->NewDirectory(db_name, IOOptions(), &db_dir, nullptr);
+ if (s.ok()) {
+ s = w.LogAndApply(cfd, &edit, db_dir.get());
+ }
+ if (s.ok()) {
+ ++cfs_updated;
+ }
+ }
+ }
+
+ if (cfs_updated > 0) {
+ ROCKS_LOG_INFO(db_opts.info_log,
+ "UpdateManifestForFilesState: updated %zu files in %zu CFs",
+ files_updated, cfs_updated);
+ } else if (s.ok()) {
+ ROCKS_LOG_INFO(db_opts.info_log,
+ "UpdateManifestForFilesState: no updates needed");
+ }
+ if (!s.ok()) {
+ ROCKS_LOG_ERROR(db_opts.info_log, "UpdateManifestForFilesState failed: %s",
+ s.ToString().c_str());
+ }
+
+ return s;
+}
+
+} // namespace experimental
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/external_sst_file_basic_test.cc b/src/rocksdb/db/external_sst_file_basic_test.cc
new file mode 100644
index 000000000..665c89869
--- /dev/null
+++ b/src/rocksdb/db/external_sst_file_basic_test.cc
@@ -0,0 +1,1997 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include <functional>
+
+#include "db/db_test_util.h"
+#include "db/version_edit.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "rocksdb/sst_file_writer.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/random.h"
+#include "utilities/fault_injection_env.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#ifndef ROCKSDB_LITE
+class ExternalSSTFileBasicTest
+ : public DBTestBase,
+ public ::testing::WithParamInterface<std::tuple<bool, bool>> {
+ public:
+ ExternalSSTFileBasicTest()
+ : DBTestBase("external_sst_file_basic_test", /*env_do_fsync=*/true) {
+ sst_files_dir_ = dbname_ + "_sst_files/";
+ fault_injection_test_env_.reset(new FaultInjectionTestEnv(env_));
+ DestroyAndRecreateExternalSSTFilesDir();
+
+ // Check if the Env supports RandomRWFile
+ std::string file_path = sst_files_dir_ + "test_random_rw_file";
+ std::unique_ptr<WritableFile> wfile;
+ assert(env_->NewWritableFile(file_path, &wfile, EnvOptions()).ok());
+ wfile.reset();
+ std::unique_ptr<RandomRWFile> rwfile;
+ Status s = env_->NewRandomRWFile(file_path, &rwfile, EnvOptions());
+ if (s.IsNotSupported()) {
+ random_rwfile_supported_ = false;
+ } else {
+ EXPECT_OK(s);
+ random_rwfile_supported_ = true;
+ }
+ rwfile.reset();
+ EXPECT_OK(env_->DeleteFile(file_path));
+ }
+
+ void DestroyAndRecreateExternalSSTFilesDir() {
+ ASSERT_OK(DestroyDir(env_, sst_files_dir_));
+ ASSERT_OK(env_->CreateDir(sst_files_dir_));
+ }
+
+ Status DeprecatedAddFile(const std::vector<std::string>& files,
+ bool move_files = false,
+ bool skip_snapshot_check = false) {
+ IngestExternalFileOptions opts;
+ opts.move_files = move_files;
+ opts.snapshot_consistency = !skip_snapshot_check;
+ opts.allow_global_seqno = false;
+ opts.allow_blocking_flush = false;
+ return db_->IngestExternalFile(files, opts);
+ }
+
+ Status AddFileWithFileChecksum(
+ const std::vector<std::string>& files,
+ const std::vector<std::string>& files_checksums,
+ const std::vector<std::string>& files_checksum_func_names,
+ bool verify_file_checksum = true, bool move_files = false,
+ bool skip_snapshot_check = false, bool write_global_seqno = true) {
+ IngestExternalFileOptions opts;
+ opts.move_files = move_files;
+ opts.snapshot_consistency = !skip_snapshot_check;
+ opts.allow_global_seqno = false;
+ opts.allow_blocking_flush = false;
+ opts.write_global_seqno = write_global_seqno;
+ opts.verify_file_checksum = verify_file_checksum;
+
+ IngestExternalFileArg arg;
+ arg.column_family = db_->DefaultColumnFamily();
+ arg.external_files = files;
+ arg.options = opts;
+ arg.files_checksums = files_checksums;
+ arg.files_checksum_func_names = files_checksum_func_names;
+ return db_->IngestExternalFiles({arg});
+ }
+
+ Status GenerateAndAddExternalFile(
+ const Options options, std::vector<int> keys,
+ const std::vector<ValueType>& value_types,
+ std::vector<std::pair<int, int>> range_deletions, int file_id,
+ bool write_global_seqno, bool verify_checksums_before_ingest,
+ std::map<std::string, std::string>* true_data) {
+ assert(value_types.size() == 1 || keys.size() == value_types.size());
+ std::string file_path = sst_files_dir_ + std::to_string(file_id);
+ SstFileWriter sst_file_writer(EnvOptions(), options);
+
+ Status s = sst_file_writer.Open(file_path);
+ if (!s.ok()) {
+ return s;
+ }
+ for (size_t i = 0; i < range_deletions.size(); i++) {
+ // Account for the effect of range deletions on true_data before all
+ // point operations, even though sst_file_writer.DeleteRange must be
+ // called before the other sst_file_writer methods. This is because
+ // point writes take precedence over range deletions in the same
+ // ingested sst.
+ std::string start_key = Key(range_deletions[i].first);
+ std::string end_key = Key(range_deletions[i].second);
+ s = sst_file_writer.DeleteRange(start_key, end_key);
+ if (!s.ok()) {
+ sst_file_writer.Finish();
+ return s;
+ }
+ auto start_key_it = true_data->find(start_key);
+ if (start_key_it == true_data->end()) {
+ start_key_it = true_data->upper_bound(start_key);
+ }
+ auto end_key_it = true_data->find(end_key);
+ if (end_key_it == true_data->end()) {
+ end_key_it = true_data->upper_bound(end_key);
+ }
+ true_data->erase(start_key_it, end_key_it);
+ }
+ for (size_t i = 0; i < keys.size(); i++) {
+ std::string key = Key(keys[i]);
+ std::string value = Key(keys[i]) + std::to_string(file_id);
+ ValueType value_type =
+ (value_types.size() == 1 ? value_types[0] : value_types[i]);
+ switch (value_type) {
+ case ValueType::kTypeValue:
+ s = sst_file_writer.Put(key, value);
+ (*true_data)[key] = value;
+ break;
+ case ValueType::kTypeMerge:
+ s = sst_file_writer.Merge(key, value);
+ // we only use TestPutOperator in this test
+ (*true_data)[key] = value;
+ break;
+ case ValueType::kTypeDeletion:
+ s = sst_file_writer.Delete(key);
+ true_data->erase(key);
+ break;
+ default:
+ return Status::InvalidArgument("Value type is not supported");
+ }
+ if (!s.ok()) {
+ sst_file_writer.Finish();
+ return s;
+ }
+ }
+ s = sst_file_writer.Finish();
+
+ if (s.ok()) {
+ IngestExternalFileOptions ifo;
+ ifo.allow_global_seqno = true;
+ ifo.write_global_seqno = write_global_seqno;
+ ifo.verify_checksums_before_ingest = verify_checksums_before_ingest;
+ s = db_->IngestExternalFile({file_path}, ifo);
+ }
+ return s;
+ }
+
+ Status GenerateAndAddExternalFile(
+ const Options options, std::vector<int> keys,
+ const std::vector<ValueType>& value_types, int file_id,
+ bool write_global_seqno, bool verify_checksums_before_ingest,
+ std::map<std::string, std::string>* true_data) {
+ return GenerateAndAddExternalFile(
+ options, keys, value_types, {}, file_id, write_global_seqno,
+ verify_checksums_before_ingest, true_data);
+ }
+
+ Status GenerateAndAddExternalFile(
+ const Options options, std::vector<int> keys, const ValueType value_type,
+ int file_id, bool write_global_seqno, bool verify_checksums_before_ingest,
+ std::map<std::string, std::string>* true_data) {
+ return GenerateAndAddExternalFile(
+ options, keys, std::vector<ValueType>(1, value_type), file_id,
+ write_global_seqno, verify_checksums_before_ingest, true_data);
+ }
+
+ ~ExternalSSTFileBasicTest() override {
+ DestroyDir(env_, sst_files_dir_).PermitUncheckedError();
+ }
+
+ protected:
+ std::string sst_files_dir_;
+ std::unique_ptr<FaultInjectionTestEnv> fault_injection_test_env_;
+ bool random_rwfile_supported_;
+};
+
+TEST_F(ExternalSSTFileBasicTest, Basic) {
+ Options options = CurrentOptions();
+
+ SstFileWriter sst_file_writer(EnvOptions(), options);
+
+ // Current file size should be 0 after sst_file_writer is initialized and
+ // before a file is opened.
+ ASSERT_EQ(sst_file_writer.FileSize(), 0);
+
+ // file1.sst (0 => 99)
+ std::string file1 = sst_files_dir_ + "file1.sst";
+ ASSERT_OK(sst_file_writer.Open(file1));
+ for (int k = 0; k < 100; k++) {
+ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+ }
+ ExternalSstFileInfo file1_info;
+ Status s = sst_file_writer.Finish(&file1_info);
+ ASSERT_OK(s) << s.ToString();
+
+ // Current file size should be non-zero after a successful write.
+ ASSERT_GT(sst_file_writer.FileSize(), 0);
+
+ ASSERT_EQ(file1_info.file_path, file1);
+ ASSERT_EQ(file1_info.num_entries, 100);
+ ASSERT_EQ(file1_info.smallest_key, Key(0));
+ ASSERT_EQ(file1_info.largest_key, Key(99));
+ ASSERT_EQ(file1_info.num_range_del_entries, 0);
+ ASSERT_EQ(file1_info.smallest_range_del_key, "");
+ ASSERT_EQ(file1_info.largest_range_del_key, "");
+ ASSERT_EQ(file1_info.file_checksum, kUnknownFileChecksum);
+ ASSERT_EQ(file1_info.file_checksum_func_name, kUnknownFileChecksumFuncName);
+ // sst_file_writer has already finished; no more entries can be added.
+ s = sst_file_writer.Put(Key(100), "bad_val");
+ ASSERT_NOK(s) << s.ToString();
+ s = sst_file_writer.DeleteRange(Key(100), Key(200));
+ ASSERT_NOK(s) << s.ToString();
+
+ DestroyAndReopen(options);
+ // Add file using file path
+ s = DeprecatedAddFile({file1});
+ ASSERT_OK(s) << s.ToString();
+ ASSERT_EQ(db_->GetLatestSequenceNumber(), 0U);
+ for (int k = 0; k < 100; k++) {
+ ASSERT_EQ(Get(Key(k)), Key(k) + "_val");
+ }
+
+ DestroyAndRecreateExternalSSTFilesDir();
+}
+
+class ChecksumVerifyHelper {
+ private:
+ Options options_;
+
+ public:
+ ChecksumVerifyHelper(Options& options) : options_(options) {}
+ ~ChecksumVerifyHelper() {}
+
+ Status GetSingleFileChecksumAndFuncName(
+ const std::string& file_path, std::string* file_checksum,
+ std::string* file_checksum_func_name) {
+ Status s;
+ EnvOptions soptions;
+ std::unique_ptr<SequentialFile> file_reader;
+ s = options_.env->NewSequentialFile(file_path, &file_reader, soptions);
+ if (!s.ok()) {
+ return s;
+ }
+ std::unique_ptr<char[]> scratch(new char[2048]);
+ Slice result;
+ FileChecksumGenFactory* file_checksum_gen_factory =
+ options_.file_checksum_gen_factory.get();
+ if (file_checksum_gen_factory == nullptr) {
+ *file_checksum = kUnknownFileChecksum;
+ *file_checksum_func_name = kUnknownFileChecksumFuncName;
+ return Status::OK();
+ } else {
+ FileChecksumGenContext gen_context;
+ std::unique_ptr<FileChecksumGenerator> file_checksum_gen =
+ file_checksum_gen_factory->CreateFileChecksumGenerator(gen_context);
+ *file_checksum_func_name = file_checksum_gen->Name();
+ s = file_reader->Read(2048, &result, scratch.get());
+ if (!s.ok()) {
+ return s;
+ }
+ while (result.size() != 0) {
+ file_checksum_gen->Update(scratch.get(), result.size());
+ s = file_reader->Read(2048, &result, scratch.get());
+ if (!s.ok()) {
+ return s;
+ }
+ }
+ file_checksum_gen->Finalize();
+ *file_checksum = file_checksum_gen->GetChecksum();
+ }
+ return Status::OK();
+ }
+};
+
+TEST_F(ExternalSSTFileBasicTest, BasicWithFileChecksumCrc32c) {
+ Options options = CurrentOptions();
+ options.file_checksum_gen_factory = GetFileChecksumGenCrc32cFactory();
+ ChecksumVerifyHelper checksum_helper(options);
+
+ SstFileWriter sst_file_writer(EnvOptions(), options);
+
+ // Current file size should be 0 after sst_file_writer is initialized and
+ // before a file is opened.
+ ASSERT_EQ(sst_file_writer.FileSize(), 0);
+
+ // file1.sst (0 => 99)
+ std::string file1 = sst_files_dir_ + "file1.sst";
+ ASSERT_OK(sst_file_writer.Open(file1));
+ for (int k = 0; k < 100; k++) {
+ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+ }
+ ExternalSstFileInfo file1_info;
+ Status s = sst_file_writer.Finish(&file1_info);
+ ASSERT_OK(s) << s.ToString();
+ std::string file_checksum, file_checksum_func_name;
+ ASSERT_OK(checksum_helper.GetSingleFileChecksumAndFuncName(
+ file1, &file_checksum, &file_checksum_func_name));
+
+ // Current file size should be non-zero after a successful write.
+ ASSERT_GT(sst_file_writer.FileSize(), 0);
+
+ ASSERT_EQ(file1_info.file_path, file1);
+ ASSERT_EQ(file1_info.num_entries, 100);
+ ASSERT_EQ(file1_info.smallest_key, Key(0));
+ ASSERT_EQ(file1_info.largest_key, Key(99));
+ ASSERT_EQ(file1_info.num_range_del_entries, 0);
+ ASSERT_EQ(file1_info.smallest_range_del_key, "");
+ ASSERT_EQ(file1_info.largest_range_del_key, "");
+ ASSERT_EQ(file1_info.file_checksum, file_checksum);
+ ASSERT_EQ(file1_info.file_checksum_func_name, file_checksum_func_name);
+ // sst_file_writer has already finished; no more entries can be added.
+ s = sst_file_writer.Put(Key(100), "bad_val");
+ ASSERT_NOK(s) << s.ToString();
+ s = sst_file_writer.DeleteRange(Key(100), Key(200));
+ ASSERT_NOK(s) << s.ToString();
+
+ DestroyAndReopen(options);
+ // Add file using file path
+ s = DeprecatedAddFile({file1});
+ ASSERT_OK(s) << s.ToString();
+ ASSERT_EQ(db_->GetLatestSequenceNumber(), 0U);
+ for (int k = 0; k < 100; k++) {
+ ASSERT_EQ(Get(Key(k)), Key(k) + "_val");
+ }
+
+ DestroyAndRecreateExternalSSTFilesDir();
+}
+
+TEST_F(ExternalSSTFileBasicTest, IngestFileWithFileChecksum) {
+ Options old_options = CurrentOptions();
+ Options options = CurrentOptions();
+ options.file_checksum_gen_factory = GetFileChecksumGenCrc32cFactory();
+ const ImmutableCFOptions ioptions(options);
+ ChecksumVerifyHelper checksum_helper(options);
+
+ SstFileWriter sst_file_writer(EnvOptions(), options);
+
+ // file01.sst (1000 => 1099)
+ std::string file1 = sst_files_dir_ + "file01.sst";
+ ASSERT_OK(sst_file_writer.Open(file1));
+ for (int k = 1000; k < 1100; k++) {
+ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+ }
+ ExternalSstFileInfo file1_info;
+ Status s = sst_file_writer.Finish(&file1_info);
+ ASSERT_OK(s) << s.ToString();
+ ASSERT_EQ(file1_info.file_path, file1);
+ ASSERT_EQ(file1_info.num_entries, 100);
+ ASSERT_EQ(file1_info.smallest_key, Key(1000));
+ ASSERT_EQ(file1_info.largest_key, Key(1099));
+ std::string file_checksum1, file_checksum_func_name1;
+ ASSERT_OK(checksum_helper.GetSingleFileChecksumAndFuncName(
+ file1, &file_checksum1, &file_checksum_func_name1));
+ ASSERT_EQ(file1_info.file_checksum, file_checksum1);
+ ASSERT_EQ(file1_info.file_checksum_func_name, file_checksum_func_name1);
+
+ // file02.sst (1100 => 1299)
+ std::string file2 = sst_files_dir_ + "file02.sst";
+ ASSERT_OK(sst_file_writer.Open(file2));
+ for (int k = 1100; k < 1300; k++) {
+ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+ }
+ ExternalSstFileInfo file2_info;
+ s = sst_file_writer.Finish(&file2_info);
+ ASSERT_OK(s) << s.ToString();
+ ASSERT_EQ(file2_info.file_path, file2);
+ ASSERT_EQ(file2_info.num_entries, 200);
+ ASSERT_EQ(file2_info.smallest_key, Key(1100));
+ ASSERT_EQ(file2_info.largest_key, Key(1299));
+ std::string file_checksum2, file_checksum_func_name2;
+ ASSERT_OK(checksum_helper.GetSingleFileChecksumAndFuncName(
+ file2, &file_checksum2, &file_checksum_func_name2));
+ ASSERT_EQ(file2_info.file_checksum, file_checksum2);
+ ASSERT_EQ(file2_info.file_checksum_func_name, file_checksum_func_name2);
+
+ // file03.sst (1300 => 1499)
+ std::string file3 = sst_files_dir_ + "file03.sst";
+ ASSERT_OK(sst_file_writer.Open(file3));
+ for (int k = 1300; k < 1500; k++) {
+ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val_overlap"));
+ }
+ ExternalSstFileInfo file3_info;
+ s = sst_file_writer.Finish(&file3_info);
+ ASSERT_OK(s) << s.ToString();
+ ASSERT_EQ(file3_info.file_path, file3);
+ ASSERT_EQ(file3_info.num_entries, 200);
+ ASSERT_EQ(file3_info.smallest_key, Key(1300));
+ ASSERT_EQ(file3_info.largest_key, Key(1499));
+ std::string file_checksum3, file_checksum_func_name3;
+ ASSERT_OK(checksum_helper.GetSingleFileChecksumAndFuncName(
+ file3, &file_checksum3, &file_checksum_func_name3));
+ ASSERT_EQ(file3_info.file_checksum, file_checksum3);
+ ASSERT_EQ(file3_info.file_checksum_func_name, file_checksum_func_name3);
+
+ // file04.sst (1500 => 1799)
+ std::string file4 = sst_files_dir_ + "file04.sst";
+ ASSERT_OK(sst_file_writer.Open(file4));
+ for (int k = 1500; k < 1800; k++) {
+ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val_overlap"));
+ }
+ ExternalSstFileInfo file4_info;
+ s = sst_file_writer.Finish(&file4_info);
+ ASSERT_OK(s) << s.ToString();
+ ASSERT_EQ(file4_info.file_path, file4);
+ ASSERT_EQ(file4_info.num_entries, 300);
+ ASSERT_EQ(file4_info.smallest_key, Key(1500));
+ ASSERT_EQ(file4_info.largest_key, Key(1799));
+ std::string file_checksum4, file_checksum_func_name4;
+ ASSERT_OK(checksum_helper.GetSingleFileChecksumAndFuncName(
+ file4, &file_checksum4, &file_checksum_func_name4));
+ ASSERT_EQ(file4_info.file_checksum, file_checksum4);
+ ASSERT_EQ(file4_info.file_checksum_func_name, file_checksum_func_name4);
+
+ // file05.sst (1800 => 1999)
+ std::string file5 = sst_files_dir_ + "file05.sst";
+ ASSERT_OK(sst_file_writer.Open(file5));
+ for (int k = 1800; k < 2000; k++) {
+ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val_overlap"));
+ }
+ ExternalSstFileInfo file5_info;
+ s = sst_file_writer.Finish(&file5_info);
+ ASSERT_OK(s) << s.ToString();
+ ASSERT_EQ(file5_info.file_path, file5);
+ ASSERT_EQ(file5_info.num_entries, 200);
+ ASSERT_EQ(file5_info.smallest_key, Key(1800));
+ ASSERT_EQ(file5_info.largest_key, Key(1999));
+ std::string file_checksum5, file_checksum_func_name5;
+ ASSERT_OK(checksum_helper.GetSingleFileChecksumAndFuncName(
+ file5, &file_checksum5, &file_checksum_func_name5));
+ ASSERT_EQ(file5_info.file_checksum, file_checksum5);
+ ASSERT_EQ(file5_info.file_checksum_func_name, file_checksum_func_name5);
+
+ // file06.sst (2000 => 2199)
+ std::string file6 = sst_files_dir_ + "file06.sst";
+ ASSERT_OK(sst_file_writer.Open(file6));
+ for (int k = 2000; k < 2200; k++) {
+ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val_overlap"));
+ }
+ ExternalSstFileInfo file6_info;
+ s = sst_file_writer.Finish(&file6_info);
+ ASSERT_OK(s) << s.ToString();
+ ASSERT_EQ(file6_info.file_path, file6);
+ ASSERT_EQ(file6_info.num_entries, 200);
+ ASSERT_EQ(file6_info.smallest_key, Key(2000));
+ ASSERT_EQ(file6_info.largest_key, Key(2199));
+ std::string file_checksum6, file_checksum_func_name6;
+ ASSERT_OK(checksum_helper.GetSingleFileChecksumAndFuncName(
+ file6, &file_checksum6, &file_checksum_func_name6));
+ ASSERT_EQ(file6_info.file_checksum, file_checksum6);
+ ASSERT_EQ(file6_info.file_checksum_func_name, file_checksum_func_name6);
+
+ s = AddFileWithFileChecksum({file1}, {file_checksum1, "xyz"},
+ {file_checksum1}, true, false, false, false);
+ // The checksum input is ignored since the DB does not have file checksums
+ // enabled.
+ ASSERT_OK(s) << s.ToString();
+ ASSERT_OK(env_->FileExists(file1));
+ std::vector<LiveFileMetaData> live_files;
+ dbfull()->GetLiveFilesMetaData(&live_files);
+ std::set<std::string> set1;
+ for (auto f : live_files) {
+ set1.insert(f.name);
+ ASSERT_EQ(f.file_checksum, kUnknownFileChecksum);
+ ASSERT_EQ(f.file_checksum_func_name, kUnknownFileChecksumFuncName);
+ }
+
+ // check the temperature of the file being ingested
+ ColumnFamilyMetaData metadata;
+ db_->GetColumnFamilyMetaData(&metadata);
+ ASSERT_EQ(1, metadata.file_count);
+ ASSERT_EQ(Temperature::kUnknown, metadata.levels[6].files[0].temperature);
+ auto size = GetSstSizeHelper(Temperature::kUnknown);
+ ASSERT_GT(size, 0);
+ size = GetSstSizeHelper(Temperature::kWarm);
+ ASSERT_EQ(size, 0);
+ size = GetSstSizeHelper(Temperature::kHot);
+ ASSERT_EQ(size, 0);
+ size = GetSstSizeHelper(Temperature::kCold);
+ ASSERT_EQ(size, 0);
+
+ // Reopen the DB with file checksums enabled (options carries the crc32c
+ // checksum factory set above).
+ Reopen(options);
+ // verify_file_checksum is enabled.
+ // The number of checksums does not match the number of files, so the
+ // ingestion fails.
+ s = AddFileWithFileChecksum({file2}, {file_checksum2, "xyz"},
+ {file_checksum_func_name2}, true, false, false,
+ false);
+ ASSERT_NOK(s) << s.ToString();
+
+ // verify_file_checksum is enabled.
+ // The checksum function name does not match, so the ingestion fails.
+ s = AddFileWithFileChecksum({file2}, {file_checksum2}, {"xyz"}, true, false,
+ false, false);
+ ASSERT_NOK(s) << s.ToString();
+
+ // verify_file_checksum is enabled.
+ // The checksum value does not match, so the ingestion fails.
+ s = AddFileWithFileChecksum({file2}, {"xyz"}, {file_checksum_func_name2},
+ true, false, false, false);
+ ASSERT_NOK(s) << s.ToString();
+
+ // verify_file_checksum is enabled.
+ // Everything matches, so the ingestion succeeds.
+ s = AddFileWithFileChecksum({file2}, {file_checksum2},
+ {file_checksum_func_name2}, true, false, false,
+ false);
+ ASSERT_OK(s) << s.ToString();
+ std::vector<LiveFileMetaData> live_files1;
+ dbfull()->GetLiveFilesMetaData(&live_files1);
+ for (auto f : live_files1) {
+ if (set1.find(f.name) == set1.end()) {
+ ASSERT_EQ(f.file_checksum, file_checksum2);
+ ASSERT_EQ(f.file_checksum_func_name, file_checksum_func_name2);
+ set1.insert(f.name);
+ }
+ }
+ ASSERT_OK(env_->FileExists(file2));
+
+ // verify_file_checksum is enabled.
+ // No checksum information is provided, so it is generated during ingestion.
+ std::vector<std::string> checksum, checksum_func;
+ s = AddFileWithFileChecksum({file3}, checksum, checksum_func, true, false,
+ false, false);
+ ASSERT_OK(s) << s.ToString();
+ std::vector<LiveFileMetaData> live_files2;
+ dbfull()->GetLiveFilesMetaData(&live_files2);
+ for (auto f : live_files2) {
+ if (set1.find(f.name) == set1.end()) {
+ ASSERT_EQ(f.file_checksum, file_checksum3);
+ ASSERT_EQ(f.file_checksum_func_name, file_checksum_func_name3);
+ set1.insert(f.name);
+ }
+ }
+ ASSERT_OK(s) << s.ToString();
+ ASSERT_OK(env_->FileExists(file3));
+
+ // verify_file_checksum is disabled.
+ // The checksum function name does not match, so the ingestion fails.
+ s = AddFileWithFileChecksum({file4}, {file_checksum4}, {"xyz"}, false, false,
+ false, false);
+ ASSERT_NOK(s) << s.ToString();
+
+ // verify_file_checksum is disabled.
+ // The checksum function name matches, so the checksum passed in is stored.
+ s = AddFileWithFileChecksum({file4}, {"asd"}, {file_checksum_func_name4},
+ false, false, false, false);
+ ASSERT_OK(s) << s.ToString();
+ std::vector<LiveFileMetaData> live_files3;
+ dbfull()->GetLiveFilesMetaData(&live_files3);
+ for (auto f : live_files3) {
+ if (set1.find(f.name) == set1.end()) {
+ ASSERT_FALSE(f.file_checksum == file_checksum4);
+ ASSERT_EQ(f.file_checksum, "asd");
+ ASSERT_EQ(f.file_checksum_func_name, file_checksum_func_name4);
+ set1.insert(f.name);
+ }
+ }
+ ASSERT_OK(s) << s.ToString();
+ ASSERT_OK(env_->FileExists(file4));
+
+ // verify_file_checksum is enabled, the DB has file checksums enabled, and
+ // write_global_seqno is enabled, so the stored checksum differs from the
+ // ingested one because the global sequence number is rewritten in the file.
+ s = AddFileWithFileChecksum({file5}, {file_checksum5},
+ {file_checksum_func_name5}, true, false, false,
+ true);
+ ASSERT_OK(s) << s.ToString();
+ std::vector<LiveFileMetaData> live_files4;
+ dbfull()->GetLiveFilesMetaData(&live_files4);
+ for (auto f : live_files4) {
+ if (set1.find(f.name) == set1.end()) {
+ std::string cur_checksum5, cur_checksum_func_name5;
+ ASSERT_OK(checksum_helper.GetSingleFileChecksumAndFuncName(
+ dbname_ + f.name, &cur_checksum5, &cur_checksum_func_name5));
+ ASSERT_EQ(f.file_checksum, cur_checksum5);
+ ASSERT_EQ(f.file_checksum_func_name, file_checksum_func_name5);
+ set1.insert(f.name);
+ }
+ }
+ ASSERT_OK(s) << s.ToString();
+ ASSERT_OK(env_->FileExists(file5));
+
+ // verify_file_checksum is disabled and no checksum information is provided
+ // for the ingested file, so the DB generates the checksum and stores it in
+ // the MANIFEST.
+ std::vector<std::string> files_c6, files_name6;
+ s = AddFileWithFileChecksum({file6}, files_c6, files_name6, false, false,
+ false, false);
+ ASSERT_OK(s) << s.ToString();
+ std::vector<LiveFileMetaData> live_files6;
+ dbfull()->GetLiveFilesMetaData(&live_files6);
+ for (auto f : live_files6) {
+ if (set1.find(f.name) == set1.end()) {
+ ASSERT_EQ(f.file_checksum, file_checksum6);
+ ASSERT_EQ(f.file_checksum_func_name, file_checksum_func_name6);
+ set1.insert(f.name);
+ }
+ }
+ ASSERT_OK(s) << s.ToString();
+ ASSERT_OK(env_->FileExists(file6));
+ db_->GetColumnFamilyMetaData(&metadata);
+ size = GetSstSizeHelper(Temperature::kUnknown);
+ ASSERT_GT(size, 0);
+ size = GetSstSizeHelper(Temperature::kWarm);
+ ASSERT_EQ(size, 0);
+ size = GetSstSizeHelper(Temperature::kHot);
+ ASSERT_EQ(size, 0);
+ size = GetSstSizeHelper(Temperature::kCold);
+ ASSERT_EQ(size, 0);
+}
+
+TEST_F(ExternalSSTFileBasicTest, NoCopy) {
+ Options options = CurrentOptions();
+ const ImmutableCFOptions ioptions(options);
+
+ SstFileWriter sst_file_writer(EnvOptions(), options);
+
+ // file1.sst (0 => 99)
+ std::string file1 = sst_files_dir_ + "file1.sst";
+ ASSERT_OK(sst_file_writer.Open(file1));
+ for (int k = 0; k < 100; k++) {
+ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+ }
+ ExternalSstFileInfo file1_info;
+ Status s = sst_file_writer.Finish(&file1_info);
+ ASSERT_OK(s) << s.ToString();
+ ASSERT_EQ(file1_info.file_path, file1);
+ ASSERT_EQ(file1_info.num_entries, 100);
+ ASSERT_EQ(file1_info.smallest_key, Key(0));
+ ASSERT_EQ(file1_info.largest_key, Key(99));
+
+ // file2.sst (100 => 299)
+ std::string file2 = sst_files_dir_ + "file2.sst";
+ ASSERT_OK(sst_file_writer.Open(file2));
+ for (int k = 100; k < 300; k++) {
+ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+ }
+ ExternalSstFileInfo file2_info;
+ s = sst_file_writer.Finish(&file2_info);
+ ASSERT_OK(s) << s.ToString();
+ ASSERT_EQ(file2_info.file_path, file2);
+ ASSERT_EQ(file2_info.num_entries, 200);
+ ASSERT_EQ(file2_info.smallest_key, Key(100));
+ ASSERT_EQ(file2_info.largest_key, Key(299));
+
+ // file3.sst (110 => 124) .. overlaps with file2.sst
+ std::string file3 = sst_files_dir_ + "file3.sst";
+ ASSERT_OK(sst_file_writer.Open(file3));
+ for (int k = 110; k < 125; k++) {
+ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val_overlap"));
+ }
+ ExternalSstFileInfo file3_info;
+ s = sst_file_writer.Finish(&file3_info);
+ ASSERT_OK(s) << s.ToString();
+ ASSERT_EQ(file3_info.file_path, file3);
+ ASSERT_EQ(file3_info.num_entries, 15);
+ ASSERT_EQ(file3_info.smallest_key, Key(110));
+ ASSERT_EQ(file3_info.largest_key, Key(124));
+
+ s = DeprecatedAddFile({file1}, true /* move file */);
+ ASSERT_OK(s) << s.ToString();
+ ASSERT_EQ(Status::NotFound(), env_->FileExists(file1));
+
+ s = DeprecatedAddFile({file2}, false /* copy file */);
+ ASSERT_OK(s) << s.ToString();
+ ASSERT_OK(env_->FileExists(file2));
+
+ // This file overlaps with existing data, so adding it fails.
+ s = DeprecatedAddFile({file3}, true /* move file */);
+ ASSERT_NOK(s) << s.ToString();
+ ASSERT_OK(env_->FileExists(file3));
+
+ for (int k = 0; k < 300; k++) {
+ ASSERT_EQ(Get(Key(k)), Key(k) + "_val");
+ }
+}
+
+TEST_P(ExternalSSTFileBasicTest, IngestFileWithGlobalSeqnoPickedSeqno) {
+ bool write_global_seqno = std::get<0>(GetParam());
+ bool verify_checksums_before_ingest = std::get<1>(GetParam());
+ do {
+ Options options = CurrentOptions();
+ DestroyAndReopen(options);
+ std::map<std::string, std::string> true_data;
+
+ int file_id = 1;
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {1, 2, 3, 4, 5, 6}, ValueType::kTypeValue, file_id++,
+ write_global_seqno, verify_checksums_before_ingest, &true_data));
+ // File doesn't overwrite any keys, no seqno needed
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 0);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {10, 11, 12, 13}, ValueType::kTypeValue, file_id++,
+ write_global_seqno, verify_checksums_before_ingest, &true_data));
+ // File doesn't overwrite any keys, no seqno needed
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 0);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {1, 4, 6}, ValueType::kTypeValue, file_id++,
+ write_global_seqno, verify_checksums_before_ingest, &true_data));
+ // File overwrites some keys, a seqno will be assigned
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 1);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {11, 15, 19}, ValueType::kTypeValue, file_id++,
+ write_global_seqno, verify_checksums_before_ingest, &true_data));
+ // File overwrites some keys, a seqno will be assigned
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 2);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {120, 130}, ValueType::kTypeValue, file_id++,
+ write_global_seqno, verify_checksums_before_ingest, &true_data));
+ // File doesn't overwrite any keys, no seqno needed
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 2);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {1, 130}, ValueType::kTypeValue, file_id++, write_global_seqno,
+ verify_checksums_before_ingest, &true_data));
+ // File overwrites some keys, a seqno will be assigned
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 3);
+
+ // Write some keys through normal write path
+ for (int i = 0; i < 50; i++) {
+ ASSERT_OK(Put(Key(i), "memtable"));
+ true_data[Key(i)] = "memtable";
+ }
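+ // Each Put above consumed a sequence number; capture the current value as
+ // a baseline for the seqno checks below.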
+ SequenceNumber last_seqno = dbfull()->GetLatestSequenceNumber();
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {60, 61, 62}, ValueType::kTypeValue, file_id++,
+ write_global_seqno, verify_checksums_before_ingest, &true_data));
+ // File doesn't overwrite any keys, no seqno needed
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {40, 41, 42}, ValueType::kTypeValue, file_id++,
+ write_global_seqno, verify_checksums_before_ingest, &true_data));
+ // File overwrites some keys, a seqno will be assigned
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 1);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {20, 30, 40}, ValueType::kTypeValue, file_id++,
+ write_global_seqno, verify_checksums_before_ingest, &true_data));
+ // File overwrites some keys, a seqno will be assigned
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 2);
+
+ const Snapshot* snapshot = db_->GetSnapshot();
+
+ // A seqno is needed for the file regardless of whether it overwrites
+ // keys in the DB, because a snapshot is held.
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {1000, 1002}, ValueType::kTypeValue, file_id++,
+ write_global_seqno, verify_checksums_before_ingest, &true_data));
+ // A global seqno will be assigned anyway because of the snapshot
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 3);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {2000, 3002}, ValueType::kTypeValue, file_id++,
+ write_global_seqno, verify_checksums_before_ingest, &true_data));
+ // A global seqno will be assigned anyway because of the snapshot
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 4);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {1, 20, 40, 100, 150}, ValueType::kTypeValue, file_id++,
+ write_global_seqno, verify_checksums_before_ingest, &true_data));
+ // A global seqno will be assigned anyway because of the snapshot
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 5);
+
+ db_->ReleaseSnapshot(snapshot);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {5000, 5001}, ValueType::kTypeValue, file_id++,
+ write_global_seqno, verify_checksums_before_ingest, &true_data));
+ // No snapshot anymore, no need to assign a seqno
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 5);
+
+ size_t kcnt = 0;
+ VerifyDBFromMap(true_data, &kcnt, false);
+ } while (ChangeOptionsForFileIngestionTest());
+}
+
+TEST_P(ExternalSSTFileBasicTest, IngestFileWithMultipleValueType) {
+ bool write_global_seqno = std::get<0>(GetParam());
+ bool verify_checksums_before_ingest = std::get<1>(GetParam());
+ do {
+ Options options = CurrentOptions();
+ options.merge_operator.reset(new TestPutOperator());
+ DestroyAndReopen(options);
+ std::map<std::string, std::string> true_data;
+
+ int file_id = 1;
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {1, 2, 3, 4, 5, 6}, ValueType::kTypeValue, file_id++,
+ write_global_seqno, verify_checksums_before_ingest, &true_data));
+ // File doesn't overwrite any keys, no seqno needed
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 0);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {10, 11, 12, 13}, ValueType::kTypeValue, file_id++,
+ write_global_seqno, verify_checksums_before_ingest, &true_data));
+ // File doesn't overwrite any keys, no seqno needed
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 0);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {1, 4, 6}, ValueType::kTypeMerge, file_id++,
+ write_global_seqno, verify_checksums_before_ingest, &true_data));
+ // File overwrites some keys, a seqno will be assigned
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 1);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {11, 15, 19}, ValueType::kTypeDeletion, file_id++,
+ write_global_seqno, verify_checksums_before_ingest, &true_data));
+ // File overwrites some keys, a seqno will be assigned
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 2);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {120, 130}, ValueType::kTypeMerge, file_id++,
+ write_global_seqno, verify_checksums_before_ingest, &true_data));
+ // File doesn't overwrite any keys, no seqno needed
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 2);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {1, 130}, ValueType::kTypeDeletion, file_id++,
+ write_global_seqno, verify_checksums_before_ingest, &true_data));
+ // File overwrites some keys, a seqno will be assigned
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 3);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {120}, {ValueType::kTypeValue}, {{120, 135}}, file_id++,
+ write_global_seqno, verify_checksums_before_ingest, &true_data));
+ // File overwrites some keys, a seqno will be assigned
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 4);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {}, {}, {{110, 120}}, file_id++, write_global_seqno,
+ verify_checksums_before_ingest, &true_data));
+ // The range deletion ends on an existing key, but it doesn't actually
+ // delete that key because the end key of the range is exclusive. It
+ // still counts as an overlap, so a new seqno is assigned.
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 5);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {}, {}, {{100, 109}}, file_id++, write_global_seqno,
+ verify_checksums_before_ingest, &true_data));
+ // File doesn't overwrite any keys, no seqno needed
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 5);
+
+ // Write some keys through normal write path
+ for (int i = 0; i < 50; i++) {
+ ASSERT_OK(Put(Key(i), "memtable"));
+ true_data[Key(i)] = "memtable";
+ }
+ SequenceNumber last_seqno = dbfull()->GetLatestSequenceNumber();
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {60, 61, 62}, ValueType::kTypeValue, file_id++,
+ write_global_seqno, verify_checksums_before_ingest, &true_data));
+ // File doesn't overwrite any keys, no seqno needed
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {40, 41, 42}, ValueType::kTypeMerge, file_id++,
+ write_global_seqno, verify_checksums_before_ingest, &true_data));
+ // File overwrites some keys, a seqno will be assigned
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 1);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {20, 30, 40}, ValueType::kTypeDeletion, file_id++,
+ write_global_seqno, verify_checksums_before_ingest, &true_data));
+ // File overwrites some keys, a seqno will be assigned
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 2);
+
+ const Snapshot* snapshot = db_->GetSnapshot();
+
+ // A seqno is needed for the file regardless of whether it overwrites
+ // keys in the DB, because a snapshot is held.
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {1000, 1002}, ValueType::kTypeMerge, file_id++,
+ write_global_seqno, verify_checksums_before_ingest, &true_data));
+ // A global seqno will be assigned anyway because of the snapshot
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 3);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {2000, 3002}, ValueType::kTypeMerge, file_id++,
+ write_global_seqno, verify_checksums_before_ingest, &true_data));
+ // A global seqno will be assigned anyway because of the snapshot
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 4);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {1, 20, 40, 100, 150}, ValueType::kTypeMerge, file_id++,
+ write_global_seqno, verify_checksums_before_ingest, &true_data));
+ // A global seqno will be assigned anyway because of the snapshot
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 5);
+
+ db_->ReleaseSnapshot(snapshot);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {5000, 5001}, ValueType::kTypeValue, file_id++,
+ write_global_seqno, verify_checksums_before_ingest, &true_data));
+ // No snapshot anymore, no need to assign a seqno
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 5);
+
+ size_t kcnt = 0;
+ VerifyDBFromMap(true_data, &kcnt, false);
+ } while (ChangeOptionsForFileIngestionTest());
+}
+
+TEST_P(ExternalSSTFileBasicTest, IngestFileWithMixedValueType) {
+ bool write_global_seqno = std::get<0>(GetParam());
+ bool verify_checksums_before_ingest = std::get<1>(GetParam());
+ do {
+ Options options = CurrentOptions();
+ options.merge_operator.reset(new TestPutOperator());
+ DestroyAndReopen(options);
+ std::map<std::string, std::string> true_data;
+
+ int file_id = 1;
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {1, 2, 3, 4, 5, 6},
+ {ValueType::kTypeValue, ValueType::kTypeMerge, ValueType::kTypeValue,
+ ValueType::kTypeMerge, ValueType::kTypeValue, ValueType::kTypeMerge},
+ file_id++, write_global_seqno, verify_checksums_before_ingest,
+ &true_data));
+ // File doesn't overwrite any keys, no seqno needed
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 0);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {10, 11, 12, 13},
+ {ValueType::kTypeValue, ValueType::kTypeMerge, ValueType::kTypeValue,
+ ValueType::kTypeMerge},
+ file_id++, write_global_seqno, verify_checksums_before_ingest,
+ &true_data));
+ // File doesn't overwrite any keys, no seqno needed
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 0);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {1, 4, 6},
+ {ValueType::kTypeDeletion, ValueType::kTypeValue,
+ ValueType::kTypeMerge},
+ file_id++, write_global_seqno, verify_checksums_before_ingest,
+ &true_data));
+ // File overwrites some keys, a seqno will be assigned
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 1);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {11, 15, 19},
+ {ValueType::kTypeDeletion, ValueType::kTypeMerge,
+ ValueType::kTypeValue},
+ file_id++, write_global_seqno, verify_checksums_before_ingest,
+ &true_data));
+ // File overwrites some keys, a seqno will be assigned
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 2);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {120, 130}, {ValueType::kTypeValue, ValueType::kTypeMerge},
+ file_id++, write_global_seqno, verify_checksums_before_ingest,
+ &true_data));
+ // File doesn't overwrite any keys, no seqno needed
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 2);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {1, 130}, {ValueType::kTypeMerge, ValueType::kTypeDeletion},
+ file_id++, write_global_seqno, verify_checksums_before_ingest,
+ &true_data));
+ // File overwrites some keys, a seqno will be assigned
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 3);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {150, 151, 152},
+ {ValueType::kTypeValue, ValueType::kTypeMerge,
+ ValueType::kTypeDeletion},
+ {{150, 160}, {180, 190}}, file_id++, write_global_seqno,
+ verify_checksums_before_ingest, &true_data));
+ // File doesn't overwrite any keys, no seqno needed
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 3);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {150, 151, 152},
+ {ValueType::kTypeValue, ValueType::kTypeMerge, ValueType::kTypeValue},
+ {{200, 250}}, file_id++, write_global_seqno,
+ verify_checksums_before_ingest, &true_data));
+ // File overwrites some keys, a seqno will be assigned
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 4);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {300, 301, 302},
+ {ValueType::kTypeValue, ValueType::kTypeMerge,
+ ValueType::kTypeDeletion},
+ {{1, 2}, {152, 154}}, file_id++, write_global_seqno,
+ verify_checksums_before_ingest, &true_data));
+ // File overwrites some keys, a seqno will be assigned
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 5);
+
+ // Write some keys through normal write path
+ for (int i = 0; i < 50; i++) {
+ ASSERT_OK(Put(Key(i), "memtable"));
+ true_data[Key(i)] = "memtable";
+ }
+ SequenceNumber last_seqno = dbfull()->GetLatestSequenceNumber();
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {60, 61, 62},
+ {ValueType::kTypeValue, ValueType::kTypeMerge, ValueType::kTypeValue},
+ file_id++, write_global_seqno, verify_checksums_before_ingest,
+ &true_data));
+ // File doesn't overwrite any keys, no seqno needed
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {40, 41, 42},
+ {ValueType::kTypeValue, ValueType::kTypeDeletion,
+ ValueType::kTypeDeletion},
+ file_id++, write_global_seqno, verify_checksums_before_ingest,
+ &true_data));
+ // File overwrites some keys, a seqno will be assigned
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 1);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {20, 30, 40},
+ {ValueType::kTypeDeletion, ValueType::kTypeDeletion,
+ ValueType::kTypeDeletion},
+ file_id++, write_global_seqno, verify_checksums_before_ingest,
+ &true_data));
+ // File overwrites some keys, a seqno will be assigned
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 2);
+
+ const Snapshot* snapshot = db_->GetSnapshot();
+
+ // A seqno is needed for the file regardless of whether it overwrites
+ // keys in the DB, because a snapshot is held.
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {1000, 1002}, {ValueType::kTypeValue, ValueType::kTypeMerge},
+ file_id++, write_global_seqno, verify_checksums_before_ingest,
+ &true_data));
+ // A global seqno will be assigned anyway because of the snapshot
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 3);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {2000, 3002}, {ValueType::kTypeValue, ValueType::kTypeMerge},
+ file_id++, write_global_seqno, verify_checksums_before_ingest,
+ &true_data));
+ // A global seqno will be assigned anyway because of the snapshot
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 4);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {1, 20, 40, 100, 150},
+ {ValueType::kTypeDeletion, ValueType::kTypeDeletion,
+ ValueType::kTypeValue, ValueType::kTypeMerge, ValueType::kTypeMerge},
+ file_id++, write_global_seqno, verify_checksums_before_ingest,
+ &true_data));
+ // A global seqno will be assigned anyway because of the snapshot
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 5);
+
+ db_->ReleaseSnapshot(snapshot);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {5000, 5001}, {ValueType::kTypeValue, ValueType::kTypeMerge},
+ file_id++, write_global_seqno, verify_checksums_before_ingest,
+ &true_data));
+ // No snapshot anymore, no need to assign a seqno
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 5);
+
+ size_t kcnt = 0;
+ VerifyDBFromMap(true_data, &kcnt, false);
+ } while (ChangeOptionsForFileIngestionTest());
+}
+
+TEST_F(ExternalSSTFileBasicTest, FadviseTrigger) {
+ Options options = CurrentOptions();
+ const int kNumKeys = 10000;
+
+ size_t total_fadvised_bytes = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "SstFileWriter::Rep::InvalidatePageCache", [&](void* arg) {
+ size_t fadvise_size = *(reinterpret_cast<size_t*>(arg));
+ total_fadvised_bytes += fadvise_size;
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ std::unique_ptr<SstFileWriter> sst_file_writer;
+
+ std::string sst_file_path = sst_files_dir_ + "file_fadvise_disable.sst";
+ sst_file_writer.reset(
+ new SstFileWriter(EnvOptions(), options, nullptr, false));
+ ASSERT_OK(sst_file_writer->Open(sst_file_path));
+ for (int i = 0; i < kNumKeys; i++) {
+ ASSERT_OK(sst_file_writer->Put(Key(i), Key(i)));
+ }
+ ASSERT_OK(sst_file_writer->Finish());
+ // fadvise disabled
+ ASSERT_EQ(total_fadvised_bytes, 0);
+
+ sst_file_path = sst_files_dir_ + "file_fadvise_enable.sst";
+ sst_file_writer.reset(
+ new SstFileWriter(EnvOptions(), options, nullptr, true));
+ ASSERT_OK(sst_file_writer->Open(sst_file_path));
+ for (int i = 0; i < kNumKeys; i++) {
+ ASSERT_OK(sst_file_writer->Put(Key(i), Key(i)));
+ }
+ ASSERT_OK(sst_file_writer->Finish());
+ // fadvise enabled
+ ASSERT_EQ(total_fadvised_bytes, sst_file_writer->FileSize());
+ ASSERT_GT(total_fadvised_bytes, 0);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(ExternalSSTFileBasicTest, SyncFailure) {
+ Options options;
+ options.create_if_missing = true;
+ options.env = fault_injection_test_env_.get();
+
+ std::vector<std::pair<std::string, std::string>> test_cases = {
+ {"ExternalSstFileIngestionJob::BeforeSyncIngestedFile",
+ "ExternalSstFileIngestionJob::AfterSyncIngestedFile"},
+ {"ExternalSstFileIngestionJob::BeforeSyncDir",
+ "ExternalSstFileIngestionJob::AfterSyncDir"},
+ {"ExternalSstFileIngestionJob::BeforeSyncGlobalSeqno",
+ "ExternalSstFileIngestionJob::AfterSyncGlobalSeqno"}};
+
+ for (size_t i = 0; i < test_cases.size(); i++) {
+ bool no_sync = false;
+ SyncPoint::GetInstance()->SetCallBack(test_cases[i].first, [&](void*) {
+ fault_injection_test_env_->SetFilesystemActive(false);
+ });
+ SyncPoint::GetInstance()->SetCallBack(test_cases[i].second, [&](void*) {
+ fault_injection_test_env_->SetFilesystemActive(true);
+ });
+ if (i == 0) {
+ SyncPoint::GetInstance()->SetCallBack(
+ "ExternalSstFileIngestionJob::Prepare:Reopen", [&](void* s) {
+ Status* status = static_cast<Status*>(s);
+ if (status->IsNotSupported()) {
+ no_sync = true;
+ }
+ });
+ }
+ if (i == 2) {
+ SyncPoint::GetInstance()->SetCallBack(
+ "ExternalSstFileIngestionJob::NewRandomRWFile", [&](void* s) {
+ Status* status = static_cast<Status*>(s);
+ if (status->IsNotSupported()) {
+ no_sync = true;
+ }
+ });
+ }
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ DestroyAndReopen(options);
+ if (i == 2) {
+ ASSERT_OK(Put("foo", "v1"));
+ }
+
+ Options sst_file_writer_options;
+ sst_file_writer_options.env = fault_injection_test_env_.get();
+ std::unique_ptr<SstFileWriter> sst_file_writer(
+ new SstFileWriter(EnvOptions(), sst_file_writer_options));
+ std::string file_name =
+ sst_files_dir_ + "sync_failure_test_" + std::to_string(i) + ".sst";
+ ASSERT_OK(sst_file_writer->Open(file_name));
+ ASSERT_OK(sst_file_writer->Put("bar", "v2"));
+ ASSERT_OK(sst_file_writer->Finish());
+
+ IngestExternalFileOptions ingest_opt;
+ if (i == 0) {
+ ingest_opt.move_files = true;
+ }
+ const Snapshot* snapshot = db_->GetSnapshot();
+ if (i == 2) {
+ ingest_opt.write_global_seqno = true;
+ }
+ Status s = db_->IngestExternalFile({file_name}, ingest_opt);
+ if (no_sync) {
+ ASSERT_OK(s);
+ } else {
+ ASSERT_NOK(s);
+ }
+ db_->ReleaseSnapshot(snapshot);
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ Destroy(options);
+ }
+}
+
+TEST_F(ExternalSSTFileBasicTest, ReopenNotSupported) {
+ Options options;
+ options.create_if_missing = true;
+ options.env = env_;
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "ExternalSstFileIngestionJob::Prepare:Reopen", [&](void* arg) {
+ Status* s = static_cast<Status*>(arg);
+ *s = Status::NotSupported();
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ DestroyAndReopen(options);
+
+ Options sst_file_writer_options;
+ sst_file_writer_options.env = env_;
+ std::unique_ptr<SstFileWriter> sst_file_writer(
+ new SstFileWriter(EnvOptions(), sst_file_writer_options));
+ std::string file_name =
+ sst_files_dir_ + "reopen_not_supported_test_" + ".sst";
+ ASSERT_OK(sst_file_writer->Open(file_name));
+ ASSERT_OK(sst_file_writer->Put("bar", "v2"));
+ ASSERT_OK(sst_file_writer->Finish());
+
+ IngestExternalFileOptions ingest_opt;
+ ingest_opt.move_files = true;
+ const Snapshot* snapshot = db_->GetSnapshot();
+ ASSERT_OK(db_->IngestExternalFile({file_name}, ingest_opt));
+ db_->ReleaseSnapshot(snapshot);
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ Destroy(options);
+}
+
+TEST_F(ExternalSSTFileBasicTest, VerifyChecksumReadahead) {
+ Options options;
+ options.create_if_missing = true;
+ SpecialEnv senv(env_);
+ options.env = &senv;
+ DestroyAndReopen(options);
+
+ Options sst_file_writer_options;
+ sst_file_writer_options.env = env_;
+ std::unique_ptr<SstFileWriter> sst_file_writer(
+ new SstFileWriter(EnvOptions(), sst_file_writer_options));
+ std::string file_name = sst_files_dir_ + "verify_checksum_readahead_test.sst";
+ ASSERT_OK(sst_file_writer->Open(file_name));
+ Random rnd(301);
+ std::string value = rnd.RandomString(4000);
+ for (int i = 0; i < 5000; i++) {
+ ASSERT_OK(sst_file_writer->Put(DBTestBase::Key(i), value));
+ }
+ ASSERT_OK(sst_file_writer->Finish());
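+ // 5000 values of ~4000 bytes each make the file roughly 20 MB.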
+
+ // Ingest it once without verifying checksums to see the baseline
+ // preads.
+ IngestExternalFileOptions ingest_opt;
+ ingest_opt.move_files = false;
+ senv.count_random_reads_ = true;
+ senv.random_read_bytes_counter_ = 0;
+ ASSERT_OK(db_->IngestExternalFile({file_name}, ingest_opt));
+
+ auto base_num_reads = senv.random_read_counter_.Read();
+ // Make sure the counter is enabled.
+ ASSERT_GT(base_num_reads, 0);
+
+ // Ingest again and observe the reads made for readahead.
+ ingest_opt.move_files = false;
+ ingest_opt.verify_checksums_before_ingest = true;
+ ingest_opt.verify_checksums_readahead_size = size_t{2 * 1024 * 1024};
+
+ senv.count_random_reads_ = true;
+ senv.random_read_bytes_counter_ = 0;
+ ASSERT_OK(db_->IngestExternalFile({file_name}, ingest_opt));
+
+ // Make sure the counter is enabled.
+ ASSERT_GT(senv.random_read_counter_.Read() - base_num_reads, 0);
+
+ // The SST file is about 20 MB and the readahead size is 2 MB.
+ // Allowing a conservative 15 reads for metadata blocks, the number of
+ // random reads should be around 20 MB / 2 MB + 15 = 25; the assertion
+ // below leaves extra headroom.
+ ASSERT_LE(senv.random_read_counter_.Read() - base_num_reads, 40);
+
+ Destroy(options);
+}
+
+TEST_F(ExternalSSTFileBasicTest, IngestRangeDeletionTombstoneWithGlobalSeqno) {
+ for (int i = 5; i < 25; i++) {
+ ASSERT_OK(db_->Put(WriteOptions(), db_->DefaultColumnFamily(), Key(i),
+ Key(i) + "_val"));
+ }
+
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ Reopen(options);
+ SstFileWriter sst_file_writer(EnvOptions(), options);
+
+ // file.sst (delete 0 => 30)
+ std::string file = sst_files_dir_ + "file.sst";
+ ASSERT_OK(sst_file_writer.Open(file));
+ ASSERT_OK(sst_file_writer.DeleteRange(Key(0), Key(30)));
+ ExternalSstFileInfo file_info;
+ ASSERT_OK(sst_file_writer.Finish(&file_info));
+ ASSERT_EQ(file_info.file_path, file);
+ ASSERT_EQ(file_info.num_entries, 0);
+ ASSERT_EQ(file_info.smallest_key, "");
+ ASSERT_EQ(file_info.largest_key, "");
+ ASSERT_EQ(file_info.num_range_del_entries, 1);
+ ASSERT_EQ(file_info.smallest_range_del_key, Key(0));
+ ASSERT_EQ(file_info.largest_range_del_key, Key(30));
+
+ IngestExternalFileOptions ifo;
+ ifo.move_files = true;
+ ifo.snapshot_consistency = true;
+ ifo.allow_global_seqno = true;
+ ifo.write_global_seqno = true;
+ ifo.verify_checksums_before_ingest = false;
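+ // The tombstone [0, 30) overlaps the existing keys 5..24, so the file needs
+ // a global seqno; write_global_seqno makes ingestion write it into the file.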
+ ASSERT_OK(db_->IngestExternalFile({file}, ifo));
+
+ for (int i = 5; i < 25; i++) {
+ std::string res;
+ ASSERT_TRUE(db_->Get(ReadOptions(), Key(i), &res).IsNotFound());
+ }
+}
+
+TEST_P(ExternalSSTFileBasicTest, IngestionWithRangeDeletions) {
+ int kNumLevels = 7;
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.num_levels = kNumLevels;
+ Reopen(options);
+
+ std::map<std::string, std::string> true_data;
+ int file_id = 1;
+ // prevent range deletions from being dropped due to becoming obsolete.
+ const Snapshot* snapshot = db_->GetSnapshot();
+
+ // range del [0, 50) in L6 file, [50, 100) in L0 file, [100, 150) in memtable
+ for (int i = 0; i < 3; i++) {
+ if (i != 0) {
+ db_->Flush(FlushOptions());
+ if (i == 1) {
+ MoveFilesToLevel(kNumLevels - 1);
+ }
+ }
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ Key(50 * i), Key(50 * (i + 1))));
+ }
+ ASSERT_EQ(1, NumTableFilesAtLevel(0));
+ ASSERT_EQ(0, NumTableFilesAtLevel(kNumLevels - 2));
+ ASSERT_EQ(1, NumTableFilesAtLevel(kNumLevels - 1));
+
+ bool write_global_seqno = std::get<0>(GetParam());
+ bool verify_checksums_before_ingest = std::get<1>(GetParam());
+ // overlaps with L0 file but not memtable, so flush is skipped and file is
+ // ingested into L0
+ SequenceNumber last_seqno = dbfull()->GetLatestSequenceNumber();
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {60, 90}, {ValueType::kTypeValue, ValueType::kTypeValue},
+ {{65, 70}, {70, 85}}, file_id++, write_global_seqno,
+ verify_checksums_before_ingest, &true_data));
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), ++last_seqno);
+ ASSERT_EQ(2, NumTableFilesAtLevel(0));
+ ASSERT_EQ(0, NumTableFilesAtLevel(kNumLevels - 2));
+ ASSERT_EQ(1, NumTableFilesAtLevel(options.num_levels - 1));
+
+ // overlaps with L6 file but not memtable or L0 file, so flush is skipped and
+ // file is ingested into L5
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {10, 40}, {ValueType::kTypeValue, ValueType::kTypeValue},
+ file_id++, write_global_seqno, verify_checksums_before_ingest,
+ &true_data));
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), ++last_seqno);
+ ASSERT_EQ(2, NumTableFilesAtLevel(0));
+ ASSERT_EQ(1, NumTableFilesAtLevel(kNumLevels - 2));
+ ASSERT_EQ(1, NumTableFilesAtLevel(options.num_levels - 1));
+
+ // overlaps with L5 file but not memtable or L0 file, so flush is skipped and
+ // file is ingested into L4
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {}, {}, {{5, 15}}, file_id++, write_global_seqno,
+ verify_checksums_before_ingest, &true_data));
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), ++last_seqno);
+ ASSERT_EQ(2, NumTableFilesAtLevel(0));
+ ASSERT_EQ(1, NumTableFilesAtLevel(kNumLevels - 2));
+ ASSERT_EQ(1, NumTableFilesAtLevel(options.num_levels - 2));
+ ASSERT_EQ(1, NumTableFilesAtLevel(options.num_levels - 1));
+
+ // The ingested file overlaps with the memtable, so a flush is triggered
+ // before the file is ingested so that the ingested data is considered the
+ // newest. The L0 file count therefore increases by two.
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {100, 140}, {ValueType::kTypeValue, ValueType::kTypeValue},
+ file_id++, write_global_seqno, verify_checksums_before_ingest,
+ &true_data));
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), ++last_seqno);
+ ASSERT_EQ(4, NumTableFilesAtLevel(0));
+ ASSERT_EQ(1, NumTableFilesAtLevel(kNumLevels - 2));
+ ASSERT_EQ(1, NumTableFilesAtLevel(options.num_levels - 1));
+
+ // snapshot unneeded now that all range deletions are persisted
+ db_->ReleaseSnapshot(snapshot);
+
+ // Overlaps with nothing, so the file is placed at the bottom level and no
+ // seqnum is consumed.
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {151, 175}, {ValueType::kTypeValue, ValueType::kTypeValue},
+ {{160, 200}}, file_id++, write_global_seqno,
+ verify_checksums_before_ingest, &true_data));
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno);
+ ASSERT_EQ(4, NumTableFilesAtLevel(0));
+ ASSERT_EQ(1, NumTableFilesAtLevel(kNumLevels - 2));
+ ASSERT_EQ(2, NumTableFilesAtLevel(options.num_levels - 1));
+}
+
+TEST_F(ExternalSSTFileBasicTest, AdjacentRangeDeletionTombstones) {
+ Options options = CurrentOptions();
+ SstFileWriter sst_file_writer(EnvOptions(), options);
+
+ // file8.sst (delete 300 => 400)
+ std::string file8 = sst_files_dir_ + "file8.sst";
+ ASSERT_OK(sst_file_writer.Open(file8));
+ ASSERT_OK(sst_file_writer.DeleteRange(Key(300), Key(400)));
+ ExternalSstFileInfo file8_info;
+ Status s = sst_file_writer.Finish(&file8_info);
+ ASSERT_OK(s) << s.ToString();
+ ASSERT_EQ(file8_info.file_path, file8);
+ ASSERT_EQ(file8_info.num_entries, 0);
+ ASSERT_EQ(file8_info.smallest_key, "");
+ ASSERT_EQ(file8_info.largest_key, "");
+ ASSERT_EQ(file8_info.num_range_del_entries, 1);
+ ASSERT_EQ(file8_info.smallest_range_del_key, Key(300));
+ ASSERT_EQ(file8_info.largest_range_del_key, Key(400));
+
+ // file9.sst (delete 400 => 500)
+ std::string file9 = sst_files_dir_ + "file9.sst";
+ ASSERT_OK(sst_file_writer.Open(file9));
+ ASSERT_OK(sst_file_writer.DeleteRange(Key(400), Key(500)));
+ ExternalSstFileInfo file9_info;
+ s = sst_file_writer.Finish(&file9_info);
+ ASSERT_OK(s) << s.ToString();
+ ASSERT_EQ(file9_info.file_path, file9);
+ ASSERT_EQ(file9_info.num_entries, 0);
+ ASSERT_EQ(file9_info.smallest_key, "");
+ ASSERT_EQ(file9_info.largest_key, "");
+ ASSERT_EQ(file9_info.num_range_del_entries, 1);
+ ASSERT_EQ(file9_info.smallest_range_del_key, Key(400));
+ ASSERT_EQ(file9_info.largest_range_del_key, Key(500));
+
+ // Range deletion tombstones are exclusive on their end key, so these SSTs
+ // should not be considered overlapping.
+ s = DeprecatedAddFile({file8, file9});
+ ASSERT_OK(s) << s.ToString();
+ ASSERT_EQ(db_->GetLatestSequenceNumber(), 0U);
+ DestroyAndRecreateExternalSSTFilesDir();
+}
+
+TEST_P(ExternalSSTFileBasicTest, IngestFileWithBadBlockChecksum) {
+ bool change_checksum_called = false;
+ const auto& change_checksum = [&](void* arg) {
+ if (!change_checksum_called) {
+ char* buf = reinterpret_cast<char*>(arg);
+ assert(nullptr != buf);
+ buf[0] ^= 0x1;
+ change_checksum_called = true;
+ }
+ };
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->SetCallBack(
+ "BlockBasedTableBuilder::WriteMaybeCompressedBlock:TamperWithChecksum",
+ change_checksum);
+ SyncPoint::GetInstance()->EnableProcessing();
+ int file_id = 0;
+ bool write_global_seqno = std::get<0>(GetParam());
+ bool verify_checksums_before_ingest = std::get<1>(GetParam());
+ do {
+ Options options = CurrentOptions();
+ DestroyAndReopen(options);
+ std::map<std::string, std::string> true_data;
+ Status s = GenerateAndAddExternalFile(
+ options, {1, 2, 3, 4, 5, 6}, ValueType::kTypeValue, file_id++,
+ write_global_seqno, verify_checksums_before_ingest, &true_data);
+ if (verify_checksums_before_ingest) {
+ ASSERT_NOK(s);
+ } else {
+ ASSERT_OK(s);
+ }
+ change_checksum_called = false;
+ } while (ChangeOptionsForFileIngestionTest());
+}
+
+TEST_P(ExternalSSTFileBasicTest, IngestFileWithFirstByteTampered) {
+ if (!random_rwfile_supported_) {
+ ROCKSDB_GTEST_SKIP("Test requires NewRandomRWFile support");
+ return;
+ }
+ SyncPoint::GetInstance()->DisableProcessing();
+ int file_id = 0;
+ EnvOptions env_options;
+ do {
+ Options options = CurrentOptions();
+ std::string file_path = sst_files_dir_ + std::to_string(file_id++);
+ SstFileWriter sst_file_writer(env_options, options);
+ Status s = sst_file_writer.Open(file_path);
+ ASSERT_OK(s);
+ for (int i = 0; i != 100; ++i) {
+ std::string key = Key(i);
+ std::string value = Key(i) + std::to_string(0);
+ ASSERT_OK(sst_file_writer.Put(key, value));
+ }
+ ASSERT_OK(sst_file_writer.Finish());
+ {
+ // Get file size
+ uint64_t file_size = 0;
+ ASSERT_OK(env_->GetFileSize(file_path, &file_size));
+ ASSERT_GT(file_size, 8);
+ std::unique_ptr<RandomRWFile> rwfile;
+ ASSERT_OK(env_->NewRandomRWFile(file_path, &rwfile, EnvOptions()));
+ // Manually corrupt the file.
+ // We deterministically corrupt the first byte because we currently cannot
+ // choose a random offset: the properties block is not checksummed at
+ // present, so a corruption landing there would go undetected.
+ const uint64_t offset = 0;
+ char scratch[8] = {0};
+ Slice buf;
+ ASSERT_OK(rwfile->Read(offset, sizeof(scratch), &buf, scratch));
+ scratch[0] ^= 0xff; // flip one bit
+ ASSERT_OK(rwfile->Write(offset, buf));
+ }
+ // Ingest file.
+ IngestExternalFileOptions ifo;
+ ifo.write_global_seqno = std::get<0>(GetParam());
+ ifo.verify_checksums_before_ingest = std::get<1>(GetParam());
+ s = db_->IngestExternalFile({file_path}, ifo);
+ if (ifo.verify_checksums_before_ingest) {
+ ASSERT_NOK(s);
+ } else {
+ ASSERT_OK(s);
+ }
+ } while (ChangeOptionsForFileIngestionTest());
+}
+
+TEST_P(ExternalSSTFileBasicTest, IngestExternalFileWithCorruptedPropsBlock) {
+ bool verify_checksums_before_ingest = std::get<1>(GetParam());
+ if (!verify_checksums_before_ingest) {
+ ROCKSDB_GTEST_BYPASS("Bypassing test when !verify_checksums_before_ingest");
+ return;
+ }
+ if (!random_rwfile_supported_) {
+ ROCKSDB_GTEST_SKIP("Test requires NewRandomRWFile support");
+ return;
+ }
+ uint64_t props_block_offset = 0;
+ size_t props_block_size = 0;
+ const auto& get_props_block_offset = [&](void* arg) {
+ props_block_offset = *reinterpret_cast<uint64_t*>(arg);
+ };
+ const auto& get_props_block_size = [&](void* arg) {
+ props_block_size = *reinterpret_cast<uint64_t*>(arg);
+ };
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->SetCallBack(
+ "BlockBasedTableBuilder::WritePropertiesBlock:GetPropsBlockOffset",
+ get_props_block_offset);
+ SyncPoint::GetInstance()->SetCallBack(
+ "BlockBasedTableBuilder::WritePropertiesBlock:GetPropsBlockSize",
+ get_props_block_size);
+ SyncPoint::GetInstance()->EnableProcessing();
+ int file_id = 0;
+ Random64 rand(time(nullptr));
+ do {
+ std::string file_path = sst_files_dir_ + std::to_string(file_id++);
+ Options options = CurrentOptions();
+ SstFileWriter sst_file_writer(EnvOptions(), options);
+ Status s = sst_file_writer.Open(file_path);
+ ASSERT_OK(s);
+ for (int i = 0; i != 100; ++i) {
+ std::string key = Key(i);
+ std::string value = Key(i) + std::to_string(0);
+ ASSERT_OK(sst_file_writer.Put(key, value));
+ }
+ ASSERT_OK(sst_file_writer.Finish());
+
+ {
+ std::unique_ptr<RandomRWFile> rwfile;
+ ASSERT_OK(env_->NewRandomRWFile(file_path, &rwfile, EnvOptions()));
+ // Manually corrupt the file
+ ASSERT_GT(props_block_size, 8);
+ uint64_t offset =
+ props_block_offset + rand.Next() % (props_block_size - 8);
+ char scratch[8] = {0};
+ Slice buf;
+ ASSERT_OK(rwfile->Read(offset, sizeof(scratch), &buf, scratch));
+ scratch[0] ^= 0xff; // flip one bit
+ ASSERT_OK(rwfile->Write(offset, buf));
+ }
+
+ // Ingest file.
+ IngestExternalFileOptions ifo;
+ ifo.write_global_seqno = std::get<0>(GetParam());
+ ifo.verify_checksums_before_ingest = true;
+ s = db_->IngestExternalFile({file_path}, ifo);
+ ASSERT_NOK(s);
+ } while (ChangeOptionsForFileIngestionTest());
+}
+
+TEST_F(ExternalSSTFileBasicTest, OverlappingFiles) {
+ Options options = CurrentOptions();
+
+ std::vector<std::string> files;
+ {
+ SstFileWriter sst_file_writer(EnvOptions(), options);
+ std::string file1 = sst_files_dir_ + "file1.sst";
+ ASSERT_OK(sst_file_writer.Open(file1));
+ ASSERT_OK(sst_file_writer.Put("a", "z"));
+ ASSERT_OK(sst_file_writer.Put("i", "m"));
+ ExternalSstFileInfo file1_info;
+ ASSERT_OK(sst_file_writer.Finish(&file1_info));
+ files.push_back(std::move(file1));
+ }
+ {
+ SstFileWriter sst_file_writer(EnvOptions(), options);
+ std::string file2 = sst_files_dir_ + "file2.sst";
+ ASSERT_OK(sst_file_writer.Open(file2));
+ ASSERT_OK(sst_file_writer.Put("i", "k"));
+ ExternalSstFileInfo file2_info;
+ ASSERT_OK(sst_file_writer.Finish(&file2_info));
+ files.push_back(std::move(file2));
+ }
+
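+ // The two files overlap on key "i". Ingesting them together, file2's value
+ // wins (Get("i") returns "k") and both land as separate L0 files.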
+ IngestExternalFileOptions ifo;
+ ASSERT_OK(db_->IngestExternalFile(files, ifo));
+ ASSERT_EQ(Get("a"), "z");
+ ASSERT_EQ(Get("i"), "k");
+
+ int total_keys = 0;
+ Iterator* iter = db_->NewIterator(ReadOptions());
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ ASSERT_OK(iter->status());
+ total_keys++;
+ }
+ delete iter;
+ ASSERT_EQ(total_keys, 2);
+
+ ASSERT_EQ(2, NumTableFilesAtLevel(0));
+}
+
+TEST_F(ExternalSSTFileBasicTest, IngestFileAfterDBPut) {
+ // Repro for https://github.com/facebook/rocksdb/issues/6245.
+ // Flush three files to L0, then ingest one more file to trigger an L0->L1
+ // compaction via trivial move. The bug occurred when L1 files were
+ // incorrectly sorted, causing `Get()` to return an old value for "k".
+ Options options = CurrentOptions();
+
+ ASSERT_OK(Put("k", "a"));
+ Flush();
+ ASSERT_OK(Put("k", "a"));
+ Flush();
+ ASSERT_OK(Put("k", "a"));
+ Flush();
+ SstFileWriter sst_file_writer(EnvOptions(), options);
+
+ // Current file size should be 0 after sst_file_writer init and before
+ // opening a file.
+ ASSERT_EQ(sst_file_writer.FileSize(), 0);
+
+ std::string file1 = sst_files_dir_ + "file1.sst";
+ ASSERT_OK(sst_file_writer.Open(file1));
+ ASSERT_OK(sst_file_writer.Put("k", "b"));
+
+ ExternalSstFileInfo file1_info;
+ Status s = sst_file_writer.Finish(&file1_info);
+ ASSERT_OK(s) << s.ToString();
+
+ // Current file size should be non-zero after a successful write.
+ ASSERT_GT(sst_file_writer.FileSize(), 0);
+
+ IngestExternalFileOptions ifo;
+ s = db_->IngestExternalFile({file1}, ifo);
+ ASSERT_OK(s);
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ ASSERT_EQ(Get("k"), "b");
+}
+
+TEST_F(ExternalSSTFileBasicTest, IngestWithTemperature) {
+ Options options = CurrentOptions();
+ const ImmutableCFOptions ioptions(options);
+ options.bottommost_temperature = Temperature::kWarm;
+ SstFileWriter sst_file_writer(EnvOptions(), options);
+ options.level0_file_num_compaction_trigger = 2;
+ Reopen(options);
+
+ auto size = GetSstSizeHelper(Temperature::kUnknown);
+ ASSERT_EQ(size, 0);
+ size = GetSstSizeHelper(Temperature::kWarm);
+ ASSERT_EQ(size, 0);
+ size = GetSstSizeHelper(Temperature::kHot);
+ ASSERT_EQ(size, 0);
+
+ // create file01.sst (1000 => 1099) and ingest it
+ std::string file1 = sst_files_dir_ + "file01.sst";
+ ASSERT_OK(sst_file_writer.Open(file1));
+ for (int k = 1000; k < 1100; k++) {
+ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+ }
+ ExternalSstFileInfo file1_info;
+ Status s = sst_file_writer.Finish(&file1_info);
+ ASSERT_OK(s);
+ ASSERT_EQ(file1_info.file_path, file1);
+ ASSERT_EQ(file1_info.num_entries, 100);
+ ASSERT_EQ(file1_info.smallest_key, Key(1000));
+ ASSERT_EQ(file1_info.largest_key, Key(1099));
+
+ std::vector<std::string> files;
+ std::vector<std::string> files_checksums;
+ std::vector<std::string> files_checksum_func_names;
+ Temperature file_temperature = Temperature::kWarm;
+
+ files.push_back(file1);
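+ // Ingest through IngestExternalFiles() so the request can carry per-file
+ // checksums and an explicit file temperature.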
+ IngestExternalFileOptions in_opts;
+ in_opts.move_files = false;
+ in_opts.snapshot_consistency = true;
+ in_opts.allow_global_seqno = false;
+ in_opts.allow_blocking_flush = false;
+ in_opts.write_global_seqno = true;
+ in_opts.verify_file_checksum = false;
+ IngestExternalFileArg arg;
+ arg.column_family = db_->DefaultColumnFamily();
+ arg.external_files = files;
+ arg.options = in_opts;
+ arg.files_checksums = files_checksums;
+ arg.files_checksum_func_names = files_checksum_func_names;
+ arg.file_temperature = file_temperature;
+ s = db_->IngestExternalFiles({arg});
+ ASSERT_OK(s);
+
+ // check the temperature of the file being ingested
+ ColumnFamilyMetaData metadata;
+ db_->GetColumnFamilyMetaData(&metadata);
+ ASSERT_EQ(1, metadata.file_count);
+ ASSERT_EQ(Temperature::kWarm, metadata.levels[6].files[0].temperature);
+ size = GetSstSizeHelper(Temperature::kUnknown);
+ ASSERT_EQ(size, 0);
+ size = GetSstSizeHelper(Temperature::kWarm);
+ ASSERT_GT(size, 1);
+
+ // non-bottommost file still has unknown temperature
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Put("bar", "bar"));
+ ASSERT_OK(Flush());
+ db_->GetColumnFamilyMetaData(&metadata);
+ ASSERT_EQ(2, metadata.file_count);
+ ASSERT_EQ(Temperature::kUnknown, metadata.levels[0].files[0].temperature);
+ size = GetSstSizeHelper(Temperature::kUnknown);
+ ASSERT_GT(size, 0);
+ size = GetSstSizeHelper(Temperature::kWarm);
+ ASSERT_GT(size, 0);
+
+ // reopen and check the information is persisted
+ Reopen(options);
+ db_->GetColumnFamilyMetaData(&metadata);
+ ASSERT_EQ(2, metadata.file_count);
+ ASSERT_EQ(Temperature::kUnknown, metadata.levels[0].files[0].temperature);
+ ASSERT_EQ(Temperature::kWarm, metadata.levels[6].files[0].temperature);
+ size = GetSstSizeHelper(Temperature::kUnknown);
+ ASSERT_GT(size, 0);
+ size = GetSstSizeHelper(Temperature::kWarm);
+ ASSERT_GT(size, 0);
+
+ // check other non-existent temperatures
+ size = GetSstSizeHelper(Temperature::kHot);
+ ASSERT_EQ(size, 0);
+ size = GetSstSizeHelper(Temperature::kCold);
+ ASSERT_EQ(size, 0);
+ std::string prop;
+ ASSERT_TRUE(dbfull()->GetProperty(
+ DB::Properties::kLiveSstFilesSizeAtTemperature + std::to_string(22),
+ &prop));
+ ASSERT_EQ(std::atoi(prop.c_str()), 0);
+}
+
+TEST_F(ExternalSSTFileBasicTest, FailIfNotBottommostLevel) {
+ Options options = GetDefaultOptions();
+
+ std::string file_path = sst_files_dir_ + std::to_string(1);
+ SstFileWriter sfw(EnvOptions(), options);
+
+ ASSERT_OK(sfw.Open(file_path));
+ ASSERT_OK(sfw.Put("b", "dontcare"));
+ ASSERT_OK(sfw.Finish());
+
+ // Test universal compaction + ingest with snapshot consistency
+ options.create_if_missing = true;
+ options.compaction_style = CompactionStyle::kCompactionStyleUniversal;
+ DestroyAndReopen(options);
+ {
+ const Snapshot* snapshot = db_->GetSnapshot();
+ ManagedSnapshot snapshot_guard(db_, snapshot);
+ IngestExternalFileOptions ifo;
+ ifo.fail_if_not_bottommost_level = true;
+ ifo.snapshot_consistency = true;
+ const Status s = db_->IngestExternalFile({file_path}, ifo);
+ ASSERT_TRUE(s.IsTryAgain());
+ }
+
+ // Test level compaction
+ options.compaction_style = CompactionStyle::kCompactionStyleLevel;
+ options.num_levels = 2;
+ DestroyAndReopen(options);
+ ASSERT_OK(db_->Put(WriteOptions(), "a", "dontcare"));
+ ASSERT_OK(db_->Put(WriteOptions(), "c", "dontcare"));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+
+ ASSERT_OK(db_->Put(WriteOptions(), "b", "dontcare"));
+ ASSERT_OK(db_->Put(WriteOptions(), "d", "dontcare"));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+
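+ // After forcing everything into the bottommost level, the file keyed "b"
+ // overlaps that level, so it cannot be placed there and ingestion must
+ // return TryAgain.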
+ {
+ CompactRangeOptions cro;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+ IngestExternalFileOptions ifo;
+ ifo.fail_if_not_bottommost_level = true;
+ const Status s = db_->IngestExternalFile({file_path}, ifo);
+ ASSERT_TRUE(s.IsTryAgain());
+ }
+}
+
+TEST_F(ExternalSSTFileBasicTest, VerifyChecksum) {
+ const std::string kPutVal = "put_val";
+ const std::string kIngestedVal = "ingested_val";
+
+ ASSERT_OK(Put("k", kPutVal, WriteOptions()));
+ ASSERT_OK(Flush());
+
+ std::string external_file = sst_files_dir_ + "/file_to_ingest.sst";
+ {
+ SstFileWriter sst_file_writer{EnvOptions(), CurrentOptions()};
+
+ ASSERT_OK(sst_file_writer.Open(external_file));
+ ASSERT_OK(sst_file_writer.Put("k", kIngestedVal));
+ ASSERT_OK(sst_file_writer.Finish());
+ }
+
+ ASSERT_OK(db_->IngestExternalFile(db_->DefaultColumnFamily(), {external_file},
+ IngestExternalFileOptions()));
+
+ ASSERT_OK(db_->VerifyChecksum());
+}
+
+TEST_F(ExternalSSTFileBasicTest, VerifySstUniqueId) {
+ const std::string kPutVal = "put_val";
+ const std::string kIngestedVal = "ingested_val";
+
+ ASSERT_OK(Put("k", kPutVal, WriteOptions()));
+ ASSERT_OK(Flush());
+
+ std::string external_file = sst_files_dir_ + "/file_to_ingest.sst";
+ {
+ SstFileWriter sst_file_writer{EnvOptions(), CurrentOptions()};
+
+ ASSERT_OK(sst_file_writer.Open(external_file));
+ ASSERT_OK(sst_file_writer.Put("k", kIngestedVal));
+ ASSERT_OK(sst_file_writer.Finish());
+ }
+
+ ASSERT_OK(db_->IngestExternalFile(db_->DefaultColumnFamily(), {external_file},
+ IngestExternalFileOptions()));
+
+ // Test ingesting a file without session_id and db_id (for example, one
+ // generated by an older version of the sst writer)
+ SyncPoint::GetInstance()->SetCallBack(
+ "PropertyBlockBuilder::AddTableProperty:Start", [&](void* props_vs) {
+ auto props = static_cast<TableProperties*>(props_vs);
+ // clear the table properties' session_id and db_id
+ props->db_session_id = "";
+ props->db_id = "";
+ });
+ std::atomic_int skipped = 0, passed = 0;
+ SyncPoint::GetInstance()->SetCallBack(
+ "BlockBasedTable::Open::SkippedVerifyUniqueId",
+ [&](void* /*arg*/) { skipped++; });
+ SyncPoint::GetInstance()->SetCallBack(
+ "BlockBasedTable::Open::PassedVerifyUniqueId",
+ [&](void* /*arg*/) { passed++; });
+ SyncPoint::GetInstance()->EnableProcessing();
+
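+ // Reopen so both existing table files (one flushed, one ingested) are
+ // opened again and their unique ids verified against the manifest.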
+ auto options = CurrentOptions();
+ ASSERT_TRUE(options.verify_sst_unique_id_in_manifest);
+ Reopen(options);
+ ASSERT_EQ(skipped, 0);
+ ASSERT_EQ(passed, 2); // one flushed + one ingested
+
+ external_file = sst_files_dir_ + "/file_to_ingest2.sst";
+ {
+ SstFileWriter sst_file_writer{EnvOptions(), CurrentOptions()};
+
+ ASSERT_OK(sst_file_writer.Open(external_file));
+ ASSERT_OK(sst_file_writer.Put("k", kIngestedVal));
+ ASSERT_OK(sst_file_writer.Finish());
+ }
+
+ ASSERT_OK(db_->IngestExternalFile(db_->DefaultColumnFamily(), {external_file},
+ IngestExternalFileOptions()));
+
+ // Two table file opens skipping verification:
+ // * ExternalSstFileIngestionJob::GetIngestedFileInfo
+ // * TableCache::GetTableReader
+ ASSERT_EQ(skipped, 2);
+ ASSERT_EQ(passed, 2);
+
+ // Check same after re-open (except no GetIngestedFileInfo)
+ skipped = 0;
+ passed = 0;
+ Reopen(options);
+ ASSERT_EQ(skipped, 1);
+ ASSERT_EQ(passed, 2);
+}
+
+TEST_F(ExternalSSTFileBasicTest, StableSnapshotWhileLoggingToManifest) {
+ const std::string kPutVal = "put_val";
+ const std::string kIngestedVal = "ingested_val";
+
+ ASSERT_OK(Put("k", kPutVal, WriteOptions()));
+ ASSERT_OK(Flush());
+
+ std::string external_file = sst_files_dir_ + "/file_to_ingest.sst";
+ {
+ SstFileWriter sst_file_writer{EnvOptions(), CurrentOptions()};
+ ASSERT_OK(sst_file_writer.Open(external_file));
+ ASSERT_OK(sst_file_writer.Put("k", kIngestedVal));
+ ASSERT_OK(sst_file_writer.Finish());
+ }
+
+ const Snapshot* snapshot = nullptr;
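+ // Capture a snapshot from inside the ingestion's manifest write, before the
+ // ingested file becomes visible, and verify it still sees the pre-ingestion
+ // value.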
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "VersionSet::LogAndApply:WriteManifest", [&](void* /* arg */) {
+ // prevent the background compaction job from calling this callback
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ snapshot = db_->GetSnapshot();
+ ReadOptions read_opts;
+ read_opts.snapshot = snapshot;
+ std::string value;
+ ASSERT_OK(db_->Get(read_opts, "k", &value));
+ ASSERT_EQ(kPutVal, value);
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(db_->IngestExternalFile(db_->DefaultColumnFamily(), {external_file},
+ IngestExternalFileOptions()));
+ auto ingested_file_seqno = db_->GetLatestSequenceNumber();
+ ASSERT_NE(nullptr, snapshot);
+ // snapshot is taken before SST ingestion is done
+ ASSERT_EQ(ingested_file_seqno, snapshot->GetSequenceNumber() + 1);
+
+ ReadOptions read_opts;
+ read_opts.snapshot = snapshot;
+ std::string value;
+ ASSERT_OK(db_->Get(read_opts, "k", &value));
+ ASSERT_EQ(kPutVal, value);
+ db_->ReleaseSnapshot(snapshot);
+
+ // After reopen, the sequence number should be caught up such that the
+ // ingested value is read
+ Reopen(CurrentOptions());
+ ASSERT_OK(db_->Get(ReadOptions(), "k", &value));
+ ASSERT_EQ(kIngestedVal, value);
+
+ // A new write should get a higher seqno than the ingested file
+ ASSERT_OK(Put("k", kPutVal, WriteOptions()));
+ ASSERT_EQ(db_->GetLatestSequenceNumber(), ingested_file_seqno + 1);
+}
+
+INSTANTIATE_TEST_CASE_P(ExternalSSTFileBasicTest, ExternalSSTFileBasicTest,
+ testing::Values(std::make_tuple(true, true),
+ std::make_tuple(true, false),
+ std::make_tuple(false, true),
+ std::make_tuple(false, false)));
+
+#endif // ROCKSDB_LITE
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ RegisterCustomObjects(argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/external_sst_file_ingestion_job.cc b/src/rocksdb/db/external_sst_file_ingestion_job.cc
new file mode 100644
index 000000000..ba1277eab
--- /dev/null
+++ b/src/rocksdb/db/external_sst_file_ingestion_job.cc
@@ -0,0 +1,1020 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "db/external_sst_file_ingestion_job.h"
+
+#include <algorithm>
+#include <cinttypes>
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+#include "db/db_impl/db_impl.h"
+#include "db/version_edit.h"
+#include "file/file_util.h"
+#include "file/random_access_file_reader.h"
+#include "logging/logging.h"
+#include "table/merging_iterator.h"
+#include "table/scoped_arena_iterator.h"
+#include "table/sst_file_writer_collectors.h"
+#include "table/table_builder.h"
+#include "table/unique_id_impl.h"
+#include "test_util/sync_point.h"
+#include "util/stop_watch.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+Status ExternalSstFileIngestionJob::Prepare(
+ const std::vector<std::string>& external_files_paths,
+ const std::vector<std::string>& files_checksums,
+ const std::vector<std::string>& files_checksum_func_names,
+ const Temperature& file_temperature, uint64_t next_file_number,
+ SuperVersion* sv) {
+ Status status;
+
+ // Read the information of files we are ingesting
+ for (const std::string& file_path : external_files_paths) {
+ IngestedFileInfo file_to_ingest;
+ status =
+ GetIngestedFileInfo(file_path, next_file_number++, &file_to_ingest, sv);
+ if (!status.ok()) {
+ return status;
+ }
+
+ if (file_to_ingest.cf_id !=
+ TablePropertiesCollectorFactory::Context::kUnknownColumnFamily &&
+ file_to_ingest.cf_id != cfd_->GetID()) {
+ return Status::InvalidArgument(
+ "External file column family id don't match");
+ }
+
+ if (file_to_ingest.num_entries == 0 &&
+ file_to_ingest.num_range_deletions == 0) {
+ return Status::InvalidArgument("File contain no entries");
+ }
+
+ if (!file_to_ingest.smallest_internal_key.Valid() ||
+ !file_to_ingest.largest_internal_key.Valid()) {
+ return Status::Corruption("Generated table have corrupted keys");
+ }
+
+ files_to_ingest_.emplace_back(std::move(file_to_ingest));
+ }
+
+ const Comparator* ucmp = cfd_->internal_comparator().user_comparator();
+ auto num_files = files_to_ingest_.size();
+ if (num_files == 0) {
+ return Status::InvalidArgument("The list of files is empty");
+ } else if (num_files > 1) {
+ // Verify that passed files don't have overlapping ranges
+ autovector<const IngestedFileInfo*> sorted_files;
+ for (size_t i = 0; i < num_files; i++) {
+ sorted_files.push_back(&files_to_ingest_[i]);
+ }
+
+ std::sort(
+ sorted_files.begin(), sorted_files.end(),
+ [&ucmp](const IngestedFileInfo* info1, const IngestedFileInfo* info2) {
+ return sstableKeyCompare(ucmp, info1->smallest_internal_key,
+ info2->smallest_internal_key) < 0;
+ });
+
+ for (size_t i = 0; i + 1 < num_files; i++) {
+ if (sstableKeyCompare(ucmp, sorted_files[i]->largest_internal_key,
+ sorted_files[i + 1]->smallest_internal_key) >= 0) {
+ files_overlap_ = true;
+ break;
+ }
+ }
+ }
+
+ // Handle the file temperature
+ for (size_t i = 0; i < num_files; i++) {
+ files_to_ingest_[i].file_temperature = file_temperature;
+ }
+
+ if (ingestion_options_.ingest_behind && files_overlap_) {
+ return Status::NotSupported("Files have overlapping ranges");
+ }
+
+ // Copy/Move external files into DB
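+ // Track which cf_paths ids receive new files so each data directory can be
+ // fsynced once afterwards.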
+ std::unordered_set<size_t> ingestion_path_ids;
+ for (IngestedFileInfo& f : files_to_ingest_) {
+ f.copy_file = false;
+ const std::string path_outside_db = f.external_file_path;
+ const std::string path_inside_db = TableFileName(
+ cfd_->ioptions()->cf_paths, f.fd.GetNumber(), f.fd.GetPathId());
+ if (ingestion_options_.move_files) {
+ status =
+ fs_->LinkFile(path_outside_db, path_inside_db, IOOptions(), nullptr);
+ if (status.ok()) {
+ // It is unsafe to assume the application has synced the file and its
+ // directory before ingesting it. For the integrity of RocksDB we need
+ // to sync the file.
+ std::unique_ptr<FSWritableFile> file_to_sync;
+ Status s = fs_->ReopenWritableFile(path_inside_db, env_options_,
+ &file_to_sync, nullptr);
+ TEST_SYNC_POINT_CALLBACK("ExternalSstFileIngestionJob::Prepare:Reopen",
+ &s);
+ // Some file systems (especially remote/distributed) don't support
+ // reopening a file for writing and don't require reopening and
+ // syncing the file. Ignore the NotSupported error in that case.
+ if (!s.IsNotSupported()) {
+ status = s;
+ if (status.ok()) {
+ TEST_SYNC_POINT(
+ "ExternalSstFileIngestionJob::BeforeSyncIngestedFile");
+ status = SyncIngestedFile(file_to_sync.get());
+ TEST_SYNC_POINT(
+ "ExternalSstFileIngestionJob::AfterSyncIngestedFile");
+ if (!status.ok()) {
+ ROCKS_LOG_WARN(db_options_.info_log,
+ "Failed to sync ingested file %s: %s",
+ path_inside_db.c_str(), status.ToString().c_str());
+ }
+ }
+ }
+ } else if (status.IsNotSupported() &&
+ ingestion_options_.failed_move_fall_back_to_copy) {
+ // Original file is on a different FS, use copy instead of hard linking.
+ f.copy_file = true;
+ ROCKS_LOG_INFO(db_options_.info_log,
+ "Triy to link file %s but it's not supported : %s",
+ path_outside_db.c_str(), status.ToString().c_str());
+ }
+ } else {
+ f.copy_file = true;
+ }
+
+ if (f.copy_file) {
+ TEST_SYNC_POINT_CALLBACK("ExternalSstFileIngestionJob::Prepare:CopyFile",
+ nullptr);
+ // CopyFile also syncs the new file.
+ status =
+ CopyFile(fs_.get(), path_outside_db, path_inside_db, 0,
+ db_options_.use_fsync, io_tracer_, Temperature::kUnknown);
+ }
+ TEST_SYNC_POINT("ExternalSstFileIngestionJob::Prepare:FileAdded");
+ if (!status.ok()) {
+ break;
+ }
+ f.internal_file_path = path_inside_db;
+ // Initialize the checksum information of ingested files.
+ f.file_checksum = kUnknownFileChecksum;
+ f.file_checksum_func_name = kUnknownFileChecksumFuncName;
+ ingestion_path_ids.insert(f.fd.GetPathId());
+ }
+
+ TEST_SYNC_POINT("ExternalSstFileIngestionJob::BeforeSyncDir");
+ if (status.ok()) {
+ for (auto path_id : ingestion_path_ids) {
+ status = directories_->GetDataDir(path_id)->FsyncWithDirOptions(
+ IOOptions(), nullptr,
+ DirFsyncOptions(DirFsyncOptions::FsyncReason::kNewFileSynced));
+ if (!status.ok()) {
+ ROCKS_LOG_WARN(db_options_.info_log,
+ "Failed to sync directory %" ROCKSDB_PRIszt
+ " while ingest file: %s",
+ path_id, status.ToString().c_str());
+ break;
+ }
+ }
+ }
+ TEST_SYNC_POINT("ExternalSstFileIngestionJob::AfterSyncDir");
+
+ // Generate and check the sst file checksum. Note that, if
+ // IngestExternalFileOptions::write_global_seqno is true, we will not update
+ // the checksum information in files_to_ingest_ here, since the file is
+ // updated with the new global_seqno. After the global_seqno is updated, DB
+ // will generate the new checksum and store it in the Manifest. Additionally,
+ // if ingestion_options_.write_global_seqno == true and
+ // verify_file_checksum is false, we only check the checksum function name.
+ if (status.ok() && db_options_.file_checksum_gen_factory != nullptr) {
+ if (ingestion_options_.verify_file_checksum == false &&
+ files_checksums.size() == files_to_ingest_.size() &&
+ files_checksum_func_names.size() == files_to_ingest_.size()) {
+ // Only when verify_file_checksum == false and checksums for the ingested
+ // files are provided does DB use the provided checksums instead of
+ // generating checksums for the ingested files.
+ need_generate_file_checksum_ = false;
+ } else {
+ need_generate_file_checksum_ = true;
+ }
+ FileChecksumGenContext gen_context;
+ std::unique_ptr<FileChecksumGenerator> file_checksum_gen =
+ db_options_.file_checksum_gen_factory->CreateFileChecksumGenerator(
+ gen_context);
+ std::vector<std::string> generated_checksums;
+ std::vector<std::string> generated_checksum_func_names;
+ // Step 1: generate the checksum for the ingested sst files.
+ if (need_generate_file_checksum_) {
+ for (size_t i = 0; i < files_to_ingest_.size(); i++) {
+ std::string generated_checksum;
+ std::string generated_checksum_func_name;
+ std::string requested_checksum_func_name;
+ // TODO: rate limit file reads for checksum calculation during file
+ // ingestion.
+ IOStatus io_s = GenerateOneFileChecksum(
+ fs_.get(), files_to_ingest_[i].internal_file_path,
+ db_options_.file_checksum_gen_factory.get(),
+ requested_checksum_func_name, &generated_checksum,
+ &generated_checksum_func_name,
+ ingestion_options_.verify_checksums_readahead_size,
+ db_options_.allow_mmap_reads, io_tracer_,
+ db_options_.rate_limiter.get(),
+ Env::IO_TOTAL /* rate_limiter_priority */);
+ if (!io_s.ok()) {
+ status = io_s;
+ ROCKS_LOG_WARN(db_options_.info_log,
+ "Sst file checksum generation of file: %s failed: %s",
+ files_to_ingest_[i].internal_file_path.c_str(),
+ status.ToString().c_str());
+ break;
+ }
+ if (ingestion_options_.write_global_seqno == false) {
+ files_to_ingest_[i].file_checksum = generated_checksum;
+ files_to_ingest_[i].file_checksum_func_name =
+ generated_checksum_func_name;
+ }
+ generated_checksums.push_back(generated_checksum);
+ generated_checksum_func_names.push_back(generated_checksum_func_name);
+ }
+ }
+
+ // Step 2: based on the verify_file_checksum and ingested checksum
+ // information, do the verification.
+ if (status.ok()) {
+ if (files_checksums.size() == files_to_ingest_.size() &&
+ files_checksum_func_names.size() == files_to_ingest_.size()) {
+ // Verify the checksum and checksum function name.
+ if (ingestion_options_.verify_file_checksum) {
+ for (size_t i = 0; i < files_to_ingest_.size(); i++) {
+ if (files_checksum_func_names[i] !=
+ generated_checksum_func_names[i]) {
+ status = Status::InvalidArgument(
+ "Checksum function name does not match with the checksum "
+ "function name of this DB");
+ ROCKS_LOG_WARN(
+ db_options_.info_log,
+ "Sst file checksum verification of file: %s failed: %s",
+ external_files_paths[i].c_str(), status.ToString().c_str());
+ break;
+ }
+ if (files_checksums[i] != generated_checksums[i]) {
+ status = Status::Corruption(
+ "Ingested checksum does not match with the generated "
+ "checksum");
+ ROCKS_LOG_WARN(
+ db_options_.info_log,
+ "Sst file checksum verification of file: %s failed: %s",
+ files_to_ingest_[i].internal_file_path.c_str(),
+ status.ToString().c_str());
+ break;
+ }
+ }
+ } else {
+ // If verify_file_checksum is not enabled, we only verify the
+ // checksum function name. If it does not match, fail the ingestion.
+ // If it matches, we trust the ingested checksum information and store
+ // it in the Manifest.
+ for (size_t i = 0; i < files_to_ingest_.size(); i++) {
+ if (files_checksum_func_names[i] != file_checksum_gen->Name()) {
+ status = Status::InvalidArgument(
+ "Checksum function name does not match with the checksum "
+ "function name of this DB");
+ ROCKS_LOG_WARN(
+ db_options_.info_log,
+ "Sst file checksum verification of file: %s failed: %s",
+ external_files_paths[i].c_str(), status.ToString().c_str());
+ break;
+ }
+ files_to_ingest_[i].file_checksum = files_checksums[i];
+ files_to_ingest_[i].file_checksum_func_name =
+ files_checksum_func_names[i];
+ }
+ }
+ } else if (files_checksums.size() != files_checksum_func_names.size() ||
+ (files_checksums.size() == files_checksum_func_names.size() &&
+ files_checksums.size() != 0)) {
+ // The checksum and checksum function name vectors are not both empty,
+ // yet they are incomplete.
+ status = Status::InvalidArgument(
+ "The checksum information of ingested sst files are nonempty and "
+ "the size of checksums or the size of the checksum function "
+ "names "
+ "does not match with the number of ingested sst files");
+ ROCKS_LOG_WARN(
+ db_options_.info_log,
+ "The ingested sst files checksum information is incomplete: %s",
+ status.ToString().c_str());
+ }
+ }
+ }
+
+ // TODO: The following is duplicated with Cleanup().
+ if (!status.ok()) {
+ IOOptions io_opts;
+ // We failed, remove all files that we copied into the db
+ for (IngestedFileInfo& f : files_to_ingest_) {
+ if (f.internal_file_path.empty()) {
+ continue;
+ }
+ Status s = fs_->DeleteFile(f.internal_file_path, io_opts, nullptr);
+ if (!s.ok()) {
+ ROCKS_LOG_WARN(db_options_.info_log,
+ "AddFile() clean up for file %s failed : %s",
+ f.internal_file_path.c_str(), s.ToString().c_str());
+ }
+ }
+ }
+
+ return status;
+}
+
+Status ExternalSstFileIngestionJob::NeedsFlush(bool* flush_needed,
+ SuperVersion* super_version) {
+ autovector<Range> ranges;
+ autovector<std::string> keys;
+ size_t ts_sz = cfd_->user_comparator()->timestamp_size();
+ if (ts_sz) {
+ // Check all ranges [begin, end] inclusively. Add maximum
+ // timestamp to include all `begin` keys, and add minimal timestamp to
+ // include all `end` keys.
+ for (const IngestedFileInfo& file_to_ingest : files_to_ingest_) {
+ std::string begin_str;
+ std::string end_str;
+ AppendUserKeyWithMaxTimestamp(
+ &begin_str, file_to_ingest.smallest_internal_key.user_key(), ts_sz);
+ AppendKeyWithMinTimestamp(
+ &end_str, file_to_ingest.largest_internal_key.user_key(), ts_sz);
+ keys.emplace_back(std::move(begin_str));
+ keys.emplace_back(std::move(end_str));
+ }
+ for (size_t i = 0; i < files_to_ingest_.size(); ++i) {
+ ranges.emplace_back(keys[2 * i], keys[2 * i + 1]);
+ }
+ } else {
+ for (const IngestedFileInfo& file_to_ingest : files_to_ingest_) {
+ ranges.emplace_back(file_to_ingest.smallest_internal_key.user_key(),
+ file_to_ingest.largest_internal_key.user_key());
+ }
+ }
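+ // Check the assembled ranges against the memtables; needing a flush is an
+ // error unless blocking flush is allowed.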
+ Status status = cfd_->RangesOverlapWithMemtables(
+ ranges, super_version, db_options_.allow_data_in_errors, flush_needed);
+ if (status.ok() && *flush_needed &&
+ !ingestion_options_.allow_blocking_flush) {
+ status = Status::InvalidArgument("External file requires flush");
+ }
+ return status;
+}
+
+// REQUIRES: we have become the only writer by entering both write_thread_ and
+// nonmem_write_thread_
+Status ExternalSstFileIngestionJob::Run() {
+ Status status;
+ SuperVersion* super_version = cfd_->GetSuperVersion();
+#ifndef NDEBUG
+ // We should never run the job with a memtable that is overlapping
+ // with the files we are ingesting
+ bool need_flush = false;
+ status = NeedsFlush(&need_flush, super_version);
+ if (!status.ok()) {
+ return status;
+ }
+ if (need_flush) {
+ return Status::TryAgain();
+ }
+ assert(status.ok() && need_flush == false);
+#endif
+
+ bool force_global_seqno = false;
+
+ if (ingestion_options_.snapshot_consistency && !db_snapshots_->empty()) {
+ // We need to assign a global sequence number to all the files even
+ // if they don't overlap with any ranges, since we have snapshots
+ force_global_seqno = true;
+ }
+ // It is safe to use this instead of LastAllocatedSequence since we are
+ // the only active writer, and hence they are equal
+ SequenceNumber last_seqno = versions_->LastSequence();
+ edit_.SetColumnFamily(cfd_->GetID());
+ // The levels that the files will be ingested into
+ for (IngestedFileInfo& f : files_to_ingest_) {
+ SequenceNumber assigned_seqno = 0;
+ if (ingestion_options_.ingest_behind) {
+ status = CheckLevelForIngestedBehindFile(&f);
+ } else {
+ status = AssignLevelAndSeqnoForIngestedFile(
+ super_version, force_global_seqno, cfd_->ioptions()->compaction_style,
+ last_seqno, &f, &assigned_seqno);
+ }
+
+ // Modify the smallest/largest internal key to include the sequence number
+ // that we just learned. Only overwrite sequence number zero. There could
+ // be a nonzero sequence number already to indicate a range tombstone's
+ // exclusive endpoint.
+ ParsedInternalKey smallest_parsed, largest_parsed;
+ if (status.ok()) {
+ status = ParseInternalKey(*f.smallest_internal_key.rep(),
+ &smallest_parsed, false /* log_err_key */);
+ }
+ if (status.ok()) {
+ status = ParseInternalKey(*f.largest_internal_key.rep(), &largest_parsed,
+ false /* log_err_key */);
+ }
+ if (!status.ok()) {
+ return status;
+ }
+ if (smallest_parsed.sequence == 0) {
+ UpdateInternalKey(f.smallest_internal_key.rep(), assigned_seqno,
+ smallest_parsed.type);
+ }
+ if (largest_parsed.sequence == 0) {
+ UpdateInternalKey(f.largest_internal_key.rep(), assigned_seqno,
+ largest_parsed.type);
+ }
+
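+ // Write the chosen global seqno into the file when write_global_seqno is
+ // set; otherwise just record it on the file info.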
+ status = AssignGlobalSeqnoForIngestedFile(&f, assigned_seqno);
+ TEST_SYNC_POINT_CALLBACK("ExternalSstFileIngestionJob::Run",
+ &assigned_seqno);
+ if (assigned_seqno > last_seqno) {
+ assert(assigned_seqno == last_seqno + 1);
+ last_seqno = assigned_seqno;
+ ++consumed_seqno_count_;
+ }
+ if (!status.ok()) {
+ return status;
+ }
+
+ status = GenerateChecksumForIngestedFile(&f);
+ if (!status.ok()) {
+ return status;
+ }
+
+ // We use the import time as the ancester time. This is the time the data
+ // is written to the database.
+ int64_t temp_current_time = 0;
+ uint64_t current_time = kUnknownFileCreationTime;
+ uint64_t oldest_ancester_time = kUnknownOldestAncesterTime;
+ if (clock_->GetCurrentTime(&temp_current_time).ok()) {
+ current_time = oldest_ancester_time =
+ static_cast<uint64_t>(temp_current_time);
+ }
+ FileMetaData f_metadata(
+ f.fd.GetNumber(), f.fd.GetPathId(), f.fd.GetFileSize(),
+ f.smallest_internal_key, f.largest_internal_key, f.assigned_seqno,
+ f.assigned_seqno, false, f.file_temperature, kInvalidBlobFileNumber,
+ oldest_ancester_time, current_time, f.file_checksum,
+ f.file_checksum_func_name, f.unique_id);
+ f_metadata.temperature = f.file_temperature;
+ edit_.AddFile(f.picked_level, f_metadata);
+ }
+ return status;
+}
+
+void ExternalSstFileIngestionJob::UpdateStats() {
+ // Update internal stats for new ingested files
+ uint64_t total_keys = 0;
+ uint64_t total_l0_files = 0;
+ uint64_t total_time = clock_->NowMicros() - job_start_time_;
+
+ EventLoggerStream stream = event_logger_->Log();
+ stream << "event"
+ << "ingest_finished";
+ stream << "files_ingested";
+ stream.StartArray();
+
+ for (IngestedFileInfo& f : files_to_ingest_) {
+ InternalStats::CompactionStats stats(
+ CompactionReason::kExternalSstIngestion, 1);
+ stats.micros = total_time;
+ // If actual copy occurred for this file, then we need to count the file
+ // size as the actual bytes written. If the file was linked, then we ignore
+ // the bytes written for file metadata.
+ // TODO (yanqin) maybe account for file metadata bytes for exact accuracy?
+ if (f.copy_file) {
+ stats.bytes_written = f.fd.GetFileSize();
+ } else {
+ stats.bytes_moved = f.fd.GetFileSize();
+ }
+ stats.num_output_files = 1;
+ cfd_->internal_stats()->AddCompactionStats(f.picked_level,
+ Env::Priority::USER, stats);
+ cfd_->internal_stats()->AddCFStats(InternalStats::BYTES_INGESTED_ADD_FILE,
+ f.fd.GetFileSize());
+ total_keys += f.num_entries;
+ if (f.picked_level == 0) {
+ total_l0_files += 1;
+ }
+ ROCKS_LOG_INFO(
+ db_options_.info_log,
+ "[AddFile] External SST file %s was ingested in L%d with path %s "
+ "(global_seqno=%" PRIu64 ")\n",
+ f.external_file_path.c_str(), f.picked_level,
+ f.internal_file_path.c_str(), f.assigned_seqno);
+ stream << "file" << f.internal_file_path << "level" << f.picked_level;
+ }
+ stream.EndArray();
+
+ stream << "lsm_state";
+ stream.StartArray();
+ auto vstorage = cfd_->current()->storage_info();
+ for (int level = 0; level < vstorage->num_levels(); ++level) {
+ stream << vstorage->NumLevelFiles(level);
+ }
+ stream.EndArray();
+
+ cfd_->internal_stats()->AddCFStats(InternalStats::INGESTED_NUM_KEYS_TOTAL,
+ total_keys);
+ cfd_->internal_stats()->AddCFStats(InternalStats::INGESTED_NUM_FILES_TOTAL,
+ files_to_ingest_.size());
+ cfd_->internal_stats()->AddCFStats(
+ InternalStats::INGESTED_LEVEL0_NUM_FILES_TOTAL, total_l0_files);
+}
+
+void ExternalSstFileIngestionJob::Cleanup(const Status& status) {
+ IOOptions io_opts;
+ if (!status.ok()) {
+ // We failed to add the files to the database
+ // remove all the files we copied
+ for (IngestedFileInfo& f : files_to_ingest_) {
+ if (f.internal_file_path.empty()) {
+ continue;
+ }
+ Status s = fs_->DeleteFile(f.internal_file_path, io_opts, nullptr);
+ if (!s.ok()) {
+ ROCKS_LOG_WARN(db_options_.info_log,
+ "AddFile() clean up for file %s failed : %s",
+ f.internal_file_path.c_str(), s.ToString().c_str());
+ }
+ }
+ consumed_seqno_count_ = 0;
+ files_overlap_ = false;
+ } else if (status.ok() && ingestion_options_.move_files) {
+ // The files were moved and added successfully, remove original file links
+ for (IngestedFileInfo& f : files_to_ingest_) {
+ Status s = fs_->DeleteFile(f.external_file_path, io_opts, nullptr);
+ if (!s.ok()) {
+ ROCKS_LOG_WARN(
+ db_options_.info_log,
+ "%s was added to DB successfully but failed to remove original "
+ "file link : %s",
+ f.external_file_path.c_str(), s.ToString().c_str());
+ }
+ }
+ }
+}
+
+Status ExternalSstFileIngestionJob::GetIngestedFileInfo(
+ const std::string& external_file, uint64_t new_file_number,
+ IngestedFileInfo* file_to_ingest, SuperVersion* sv) {
+ file_to_ingest->external_file_path = external_file;
+
+ // Get external file size
+ Status status = fs_->GetFileSize(external_file, IOOptions(),
+ &file_to_ingest->file_size, nullptr);
+ if (!status.ok()) {
+ return status;
+ }
+
+ // Assign FD with number
+ file_to_ingest->fd =
+ FileDescriptor(new_file_number, 0, file_to_ingest->file_size);
+
+ // Create TableReader for external file
+ std::unique_ptr<TableReader> table_reader;
+ std::unique_ptr<FSRandomAccessFile> sst_file;
+ std::unique_ptr<RandomAccessFileReader> sst_file_reader;
+
+ status =
+ fs_->NewRandomAccessFile(external_file, env_options_, &sst_file, nullptr);
+ if (!status.ok()) {
+ return status;
+ }
+ sst_file_reader.reset(new RandomAccessFileReader(
+ std::move(sst_file), external_file, nullptr /*Env*/, io_tracer_));
+
+ status = cfd_->ioptions()->table_factory->NewTableReader(
+ TableReaderOptions(
+ *cfd_->ioptions(), sv->mutable_cf_options.prefix_extractor,
+ env_options_, cfd_->internal_comparator(),
+ /*skip_filters*/ false, /*immortal*/ false,
+ /*force_direct_prefetch*/ false, /*level*/ -1,
+ /*block_cache_tracer*/ nullptr,
+ /*max_file_size_for_l0_meta_pin*/ 0, versions_->DbSessionId(),
+ /*cur_file_num*/ new_file_number),
+ std::move(sst_file_reader), file_to_ingest->file_size, &table_reader);
+ if (!status.ok()) {
+ return status;
+ }
+
+ if (ingestion_options_.verify_checksums_before_ingest) {
+ // If customized readahead size is needed, we can pass a user option
+ // all the way to here. Right now we just rely on the default readahead
+ // to keep things simple.
+ ReadOptions ro;
+ ro.readahead_size = ingestion_options_.verify_checksums_readahead_size;
+ status = table_reader->VerifyChecksum(
+ ro, TableReaderCaller::kExternalSSTIngestion);
+ }
+ if (!status.ok()) {
+ return status;
+ }
+
+ // Get the external file properties
+ auto props = table_reader->GetTableProperties();
+ const auto& uprops = props->user_collected_properties;
+
+ // Get table version
+ auto version_iter = uprops.find(ExternalSstFilePropertyNames::kVersion);
+ if (version_iter == uprops.end()) {
+ return Status::Corruption("External file version not found");
+ }
+ file_to_ingest->version = DecodeFixed32(version_iter->second.c_str());
+
+ auto seqno_iter = uprops.find(ExternalSstFilePropertyNames::kGlobalSeqno);
+ if (file_to_ingest->version == 2) {
+ // version 2 implies that we have a global sequence number
+ if (seqno_iter == uprops.end()) {
+ return Status::Corruption(
+ "External file global sequence number not found");
+ }
+
+ // Set the global sequence number
+ file_to_ingest->original_seqno = DecodeFixed64(seqno_iter->second.c_str());
+ if (props->external_sst_file_global_seqno_offset == 0) {
+ file_to_ingest->global_seqno_offset = 0;
+ return Status::Corruption("Was not able to find file global seqno field");
+ }
+ file_to_ingest->global_seqno_offset =
+ static_cast<size_t>(props->external_sst_file_global_seqno_offset);
+ } else if (file_to_ingest->version == 1) {
+ // SST file V1 should not have global seqno field
+ assert(seqno_iter == uprops.end());
+ file_to_ingest->original_seqno = 0;
+ if (ingestion_options_.allow_blocking_flush ||
+ ingestion_options_.allow_global_seqno) {
+ return Status::InvalidArgument(
+ "External SST file V1 does not support global seqno");
+ }
+ } else {
+ return Status::InvalidArgument("External file version is not supported");
+ }
+ // Get number of entries in table
+ file_to_ingest->num_entries = props->num_entries;
+ file_to_ingest->num_range_deletions = props->num_range_deletions;
+
+ ParsedInternalKey key;
+ ReadOptions ro;
+ // While reading the external file we could cache the blocks we read in the
+ // block cache; if we later change the global seqno of this file, the cache
+ // would hold blocks whose keys carry the wrong seqno.
+ // We need to disable fill_cache so that we read from the file without
+ // updating the block cache.
+ ro.fill_cache = false;
+ std::unique_ptr<InternalIterator> iter(table_reader->NewIterator(
+ ro, sv->mutable_cf_options.prefix_extractor.get(), /*arena=*/nullptr,
+ /*skip_filters=*/false, TableReaderCaller::kExternalSSTIngestion));
+ std::unique_ptr<InternalIterator> range_del_iter(
+ table_reader->NewRangeTombstoneIterator(ro));
+
+ // Get first (smallest) and last (largest) key from file.
+ file_to_ingest->smallest_internal_key =
+ InternalKey("", 0, ValueType::kTypeValue);
+ file_to_ingest->largest_internal_key =
+ InternalKey("", 0, ValueType::kTypeValue);
+ bool bounds_set = false;
+ bool allow_data_in_errors = db_options_.allow_data_in_errors;
+ iter->SeekToFirst();
+ if (iter->Valid()) {
+ Status pik_status =
+ ParseInternalKey(iter->key(), &key, allow_data_in_errors);
+ if (!pik_status.ok()) {
+ return Status::Corruption("Corrupted key in external file. ",
+ pik_status.getState());
+ }
+ if (key.sequence != 0) {
+ return Status::Corruption("External file has non zero sequence number");
+ }
+ file_to_ingest->smallest_internal_key.SetFrom(key);
+
+ iter->SeekToLast();
+ pik_status = ParseInternalKey(iter->key(), &key, allow_data_in_errors);
+ if (!pik_status.ok()) {
+ return Status::Corruption("Corrupted key in external file. ",
+ pik_status.getState());
+ }
+ if (key.sequence != 0) {
+ return Status::Corruption("External file has non zero sequence number");
+ }
+ file_to_ingest->largest_internal_key.SetFrom(key);
+
+ bounds_set = true;
+ }
+
+ // We may need to adjust these key bounds, depending on whether any range
+ // deletion tombstones extend past them.
+ const Comparator* ucmp = cfd_->internal_comparator().user_comparator();
+ if (range_del_iter != nullptr) {
+ for (range_del_iter->SeekToFirst(); range_del_iter->Valid();
+ range_del_iter->Next()) {
+ Status pik_status =
+ ParseInternalKey(range_del_iter->key(), &key, allow_data_in_errors);
+ if (!pik_status.ok()) {
+ return Status::Corruption("Corrupted key in external file. ",
+ pik_status.getState());
+ }
+ RangeTombstone tombstone(key, range_del_iter->value());
+
+ InternalKey start_key = tombstone.SerializeKey();
+ if (!bounds_set ||
+ sstableKeyCompare(ucmp, start_key,
+ file_to_ingest->smallest_internal_key) < 0) {
+ file_to_ingest->smallest_internal_key = start_key;
+ }
+ InternalKey end_key = tombstone.SerializeEndKey();
+ if (!bounds_set ||
+ sstableKeyCompare(ucmp, end_key,
+ file_to_ingest->largest_internal_key) > 0) {
+ file_to_ingest->largest_internal_key = end_key;
+ }
+ bounds_set = true;
+ }
+ }
+
+ file_to_ingest->cf_id = static_cast<uint32_t>(props->column_family_id);
+
+ file_to_ingest->table_properties = *props;
+
+ auto s = GetSstInternalUniqueId(props->db_id, props->db_session_id,
+ props->orig_file_number,
+ &(file_to_ingest->unique_id));
+ if (!s.ok()) {
+ ROCKS_LOG_WARN(db_options_.info_log,
+ "Failed to get SST unique id for file %s",
+ file_to_ingest->internal_file_path.c_str());
+ file_to_ingest->unique_id = kNullUniqueId64x2;
+ }
+
+ return status;
+}
+
+Status ExternalSstFileIngestionJob::AssignLevelAndSeqnoForIngestedFile(
+ SuperVersion* sv, bool force_global_seqno, CompactionStyle compaction_style,
+ SequenceNumber last_seqno, IngestedFileInfo* file_to_ingest,
+ SequenceNumber* assigned_seqno) {
+ Status status;
+ *assigned_seqno = 0;
+ if (force_global_seqno) {
+ *assigned_seqno = last_seqno + 1;
+ if (compaction_style == kCompactionStyleUniversal || files_overlap_) {
+ if (ingestion_options_.fail_if_not_bottommost_level) {
+ status = Status::TryAgain(
+ "Files cannot be ingested to Lmax. Please make sure key range of "
+ "Lmax does not overlap with files to ingest.");
+ return status;
+ }
+ file_to_ingest->picked_level = 0;
+ return status;
+ }
+ }
+
+ bool overlap_with_db = false;
+ Arena arena;
+ ReadOptions ro;
+ ro.total_order_seek = true;
+ int target_level = 0;
+ auto* vstorage = cfd_->current()->storage_info();
+
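+ // Walk levels from the top down, skipping empty levels between L0 and the
+ // base level. Stop at the first level that overlaps the file's key range;
+ // otherwise remember the deepest level the file fits into.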
+ for (int lvl = 0; lvl < cfd_->NumberLevels(); lvl++) {
+ if (lvl > 0 && lvl < vstorage->base_level()) {
+ continue;
+ }
+
+ if (vstorage->NumLevelFiles(lvl) > 0) {
+ bool overlap_with_level = false;
+ status = sv->current->OverlapWithLevelIterator(
+ ro, env_options_, file_to_ingest->smallest_internal_key.user_key(),
+ file_to_ingest->largest_internal_key.user_key(), lvl,
+ &overlap_with_level);
+ if (!status.ok()) {
+ return status;
+ }
+ if (overlap_with_level) {
+ // We must use L0 or any level higher than `lvl` to be able to overwrite
+ // the keys that we overlap with in this level. We also need to assign
+ // this file a seqno to overwrite the existing keys in level `lvl`.
+ overlap_with_db = true;
+ break;
+ }
+
+ if (compaction_style == kCompactionStyleUniversal && lvl != 0) {
+ const std::vector<FileMetaData*>& level_files =
+ vstorage->LevelFiles(lvl);
+ const SequenceNumber level_largest_seqno =
+ (*std::max_element(level_files.begin(), level_files.end(),
+ [](FileMetaData* f1, FileMetaData* f2) {
+ return f1->fd.largest_seqno <
+ f2->fd.largest_seqno;
+ }))
+ ->fd.largest_seqno;
+ // Only assign the current level's largest seqno to the file when
+ // the file fits in this level.
+ if (level_largest_seqno != 0 &&
+ IngestedFileFitInLevel(file_to_ingest, lvl)) {
+ *assigned_seqno = level_largest_seqno;
+ } else {
+ continue;
+ }
+ }
+ } else if (compaction_style == kCompactionStyleUniversal) {
+ continue;
+ }
+
+ // We don't overlap with any keys in this level, but we still need to check
+ // if our file can fit in it
+ if (IngestedFileFitInLevel(file_to_ingest, lvl)) {
+ target_level = lvl;
+ }
+ }
+ // If files overlap, we have to ingest them at level 0 and assign the newest
+ // sequence number
+ if (files_overlap_) {
+ target_level = 0;
+ *assigned_seqno = last_seqno + 1;
+ }
+
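+ // With fail_if_not_bottommost_level, any placement above the last level is
+ // rejected with TryAgain.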
+ if (ingestion_options_.fail_if_not_bottommost_level &&
+ target_level < cfd_->NumberLevels() - 1) {
+ status = Status::TryAgain(
+ "Files cannot be ingested to Lmax. Please make sure key range of Lmax "
+ "does not overlap with files to ingest.");
+ return status;
+ }
+
+ TEST_SYNC_POINT_CALLBACK(
+ "ExternalSstFileIngestionJob::AssignLevelAndSeqnoForIngestedFile",
+ &overlap_with_db);
+ file_to_ingest->picked_level = target_level;
+ if (overlap_with_db && *assigned_seqno == 0) {
+ *assigned_seqno = last_seqno + 1;
+ }
+ return status;
+}
+
+Status ExternalSstFileIngestionJob::CheckLevelForIngestedBehindFile(
+ IngestedFileInfo* file_to_ingest) {
+ auto* vstorage = cfd_->current()->storage_info();
+ // First, check whether the new file fits in the bottommost level
+ int bottom_lvl = cfd_->NumberLevels() - 1;
+ if (!IngestedFileFitInLevel(file_to_ingest, bottom_lvl)) {
+ return Status::InvalidArgument(
+ "Can't ingest_behind file as it doesn't fit "
+ "at the bottommost level!");
+ }
+
+ // Second, check whether, despite allow_ingest_behind=true, there are still
+ // files with 0 seqnums at some upper level
+ for (int lvl = 0; lvl < cfd_->NumberLevels() - 1; lvl++) {
+ for (auto file : vstorage->LevelFiles(lvl)) {
+ if (file->fd.smallest_seqno == 0) {
+ return Status::InvalidArgument(
+ "Can't ingest_behind file as despite allow_ingest_behind=true "
+ "there are files with 0 seqno in database at upper levels!");
+ }
+ }
+ }
+
+ file_to_ingest->picked_level = bottom_lvl;
+ return Status::OK();
+}
+
+Status ExternalSstFileIngestionJob::AssignGlobalSeqnoForIngestedFile(
+ IngestedFileInfo* file_to_ingest, SequenceNumber seqno) {
+ if (file_to_ingest->original_seqno == seqno) {
+ // This file already has the correct global seqno
+ return Status::OK();
+ } else if (!ingestion_options_.allow_global_seqno) {
+ return Status::InvalidArgument("Global seqno is required, but disabled");
+ } else if (file_to_ingest->global_seqno_offset == 0) {
+ return Status::InvalidArgument(
+ "Trying to set global seqno for a file that don't have a global seqno "
+ "field");
+ }
+
+ if (ingestion_options_.write_global_seqno) {
+ // Determine if we can write global_seqno to a given offset of file.
+ // If the file system does not support random write, then we should not.
+ // Otherwise we should.
+ std::unique_ptr<FSRandomRWFile> rwfile;
+ Status status = fs_->NewRandomRWFile(file_to_ingest->internal_file_path,
+ env_options_, &rwfile, nullptr);
+ TEST_SYNC_POINT_CALLBACK("ExternalSstFileIngestionJob::NewRandomRWFile",
+ &status);
+ if (status.ok()) {
+ FSRandomRWFilePtr fsptr(std::move(rwfile), io_tracer_,
+ file_to_ingest->internal_file_path);
+ std::string seqno_val;
+ PutFixed64(&seqno_val, seqno);
+ status = fsptr->Write(file_to_ingest->global_seqno_offset, seqno_val,
+ IOOptions(), nullptr);
+ if (status.ok()) {
+ TEST_SYNC_POINT("ExternalSstFileIngestionJob::BeforeSyncGlobalSeqno");
+ status = SyncIngestedFile(fsptr.get());
+ TEST_SYNC_POINT("ExternalSstFileIngestionJob::AfterSyncGlobalSeqno");
+ if (!status.ok()) {
+ ROCKS_LOG_WARN(db_options_.info_log,
+ "Failed to sync ingested file %s after writing global "
+ "sequence number: %s",
+ file_to_ingest->internal_file_path.c_str(),
+ status.ToString().c_str());
+ }
+ }
+ if (!status.ok()) {
+ return status;
+ }
+ } else if (!status.IsNotSupported()) {
+ return status;
+ }
+ }
+
+ file_to_ingest->assigned_seqno = seqno;
+ return Status::OK();
+}
+
+IOStatus ExternalSstFileIngestionJob::GenerateChecksumForIngestedFile(
+ IngestedFileInfo* file_to_ingest) {
+ if (db_options_.file_checksum_gen_factory == nullptr ||
+ need_generate_file_checksum_ == false ||
+ ingestion_options_.write_global_seqno == false) {
+ // If file_checksum_gen_factory is not set, we are not able to generate
+ // the checksum. If write_global_seqno is false, we will use the file
+ // checksum generated during Prepare(), so this step is skipped.
+ return IOStatus::OK();
+ }
+ std::string file_checksum;
+ std::string file_checksum_func_name;
+ std::string requested_checksum_func_name;
+ // TODO: rate limit file reads for checksum calculation during file ingestion.
+ IOStatus io_s = GenerateOneFileChecksum(
+ fs_.get(), file_to_ingest->internal_file_path,
+ db_options_.file_checksum_gen_factory.get(), requested_checksum_func_name,
+ &file_checksum, &file_checksum_func_name,
+ ingestion_options_.verify_checksums_readahead_size,
+ db_options_.allow_mmap_reads, io_tracer_, db_options_.rate_limiter.get(),
+ Env::IO_TOTAL /* rate_limiter_priority */);
+ if (!io_s.ok()) {
+ return io_s;
+ }
+ file_to_ingest->file_checksum = file_checksum;
+ file_to_ingest->file_checksum_func_name = file_checksum_func_name;
+ return IOStatus::OK();
+}
+
+bool ExternalSstFileIngestionJob::IngestedFileFitInLevel(
+ const IngestedFileInfo* file_to_ingest, int level) {
+ if (level == 0) {
+ // Files can always fit in L0
+ return true;
+ }
+
+ auto* vstorage = cfd_->current()->storage_info();
+ Slice file_smallest_user_key(
+ file_to_ingest->smallest_internal_key.user_key());
+ Slice file_largest_user_key(file_to_ingest->largest_internal_key.user_key());
+
+ if (vstorage->OverlapInLevel(level, &file_smallest_user_key,
+ &file_largest_user_key)) {
+ // The file overlaps with other files in this level; we cannot
+ // add it to this level
+ return false;
+ }
+ if (cfd_->RangeOverlapWithCompaction(file_smallest_user_key,
+ file_largest_user_key, level)) {
+ // The file overlaps with a running compaction's output that will be
+ // stored in this level; we cannot add this file to this level
+ return false;
+ }
+
+ // The file does not overlap with this level's files or any running
+ // compaction's output
+ return true;
+}
+
+template <typename TWritableFile>
+Status ExternalSstFileIngestionJob::SyncIngestedFile(TWritableFile* file) {
+ assert(file != nullptr);
+ if (db_options_.use_fsync) {
+ return file->Fsync(IOOptions(), nullptr);
+ } else {
+ return file->Sync(IOOptions(), nullptr);
+ }
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/external_sst_file_ingestion_job.h b/src/rocksdb/db/external_sst_file_ingestion_job.h
new file mode 100644
index 000000000..ce50ae86d
--- /dev/null
+++ b/src/rocksdb/db/external_sst_file_ingestion_job.h
@@ -0,0 +1,201 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+#include "db/column_family.h"
+#include "db/internal_stats.h"
+#include "db/snapshot_impl.h"
+#include "env/file_system_tracer.h"
+#include "logging/event_logger.h"
+#include "options/db_options.h"
+#include "rocksdb/db.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/sst_file_writer.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Directories;
+class SystemClock;
+
+struct IngestedFileInfo {
+ // External file path
+ std::string external_file_path;
+ // Smallest internal key in external file
+ InternalKey smallest_internal_key;
+ // Largest internal key in external file
+ InternalKey largest_internal_key;
+ // Sequence number for keys in external file
+ SequenceNumber original_seqno;
+ // Offset of the global sequence number field in the file, will
+ // be zero if version is 1 (global seqno is not supported)
+ size_t global_seqno_offset;
+ // External file size
+ uint64_t file_size;
+ // total number of keys in external file
+ uint64_t num_entries;
+ // total number of range deletions in external file
+ uint64_t num_range_deletions;
+ // Id of the column family this file should be ingested into
+ uint32_t cf_id;
+ // TableProperties read from external file
+ TableProperties table_properties;
+ // Version of external file
+ int version;
+
+ // FileDescriptor for the file inside the DB
+ FileDescriptor fd;
+ // File path that we picked for the file inside the DB
+ std::string internal_file_path;
+ // Global sequence number that we picked for the file inside the DB
+ SequenceNumber assigned_seqno = 0;
+ // Level inside the DB we picked for the external file.
+ int picked_level = 0;
+ // Whether to copy or link the external sst file. copy_file will be set to
+ // false if ingestion_options.move_files is true and the underlying FS
+ // supports the link operation. A default value is needed to keep llvm's
+ // undefined-behavior sanity check happy. Since
+ // ingestion_options.move_files is false by default, copy_file is true
+ // by default.
+ bool copy_file = true;
+ // The checksum of the ingested file
+ std::string file_checksum;
+ // The name of the checksum function that generated the checksum
+ std::string file_checksum_func_name;
+ // The temperature of the file to be ingested
+ Temperature file_temperature = Temperature::kUnknown;
+ // Unique id of the file to be ingested
+ UniqueId64x2 unique_id{};
+};
+
+class ExternalSstFileIngestionJob {
+ public:
+ ExternalSstFileIngestionJob(
+ VersionSet* versions, ColumnFamilyData* cfd,
+ const ImmutableDBOptions& db_options, const EnvOptions& env_options,
+ SnapshotList* db_snapshots,
+ const IngestExternalFileOptions& ingestion_options,
+ Directories* directories, EventLogger* event_logger,
+ const std::shared_ptr<IOTracer>& io_tracer)
+ : clock_(db_options.clock),
+ fs_(db_options.fs, io_tracer),
+ versions_(versions),
+ cfd_(cfd),
+ db_options_(db_options),
+ env_options_(env_options),
+ db_snapshots_(db_snapshots),
+ ingestion_options_(ingestion_options),
+ directories_(directories),
+ event_logger_(event_logger),
+ job_start_time_(clock_->NowMicros()),
+ consumed_seqno_count_(0),
+ io_tracer_(io_tracer) {
+ assert(directories != nullptr);
+ }
+
+ // Prepare the job by copying external files into the DB.
+ Status Prepare(const std::vector<std::string>& external_files_paths,
+ const std::vector<std::string>& files_checksums,
+ const std::vector<std::string>& files_checksum_func_names,
+ const Temperature& file_temperature, uint64_t next_file_number,
+ SuperVersion* sv);
+
+ // Check if we need to flush the memtable before running the ingestion job
+ // This will be true if the files we are ingesting are overlapping with any
+ // key range in the memtable.
+ //
+ // @param super_version A referenced SuperVersion that will be held for the
+ // duration of this function.
+ //
+ // Thread-safe
+ Status NeedsFlush(bool* flush_needed, SuperVersion* super_version);
+
+ // Will execute the ingestion job and prepare edit() to be applied.
+ // REQUIRES: Mutex held
+ Status Run();
+
+ // Update column family stats.
+ // REQUIRES: Mutex held
+ void UpdateStats();
+
+ // Cleanup after successful/failed job
+ void Cleanup(const Status& status);
+
+ VersionEdit* edit() { return &edit_; }
+
+ const autovector<IngestedFileInfo>& files_to_ingest() const {
+ return files_to_ingest_;
+ }
+
+ // How many sequence numbers did we consume as part of the ingest job?
+ int ConsumedSequenceNumbersCount() const { return consumed_seqno_count_; }
+
+ private:
+ // Open the external file and populate `file_to_ingest` with all the
+ // external information we need to ingest this file.
+ Status GetIngestedFileInfo(const std::string& external_file,
+ uint64_t new_file_number,
+ IngestedFileInfo* file_to_ingest,
+ SuperVersion* sv);
+
+ // Assign `file_to_ingest` the appropriate sequence number and the lowest
+ // possible level that it can be ingested to according to compaction_style.
+ // REQUIRES: Mutex held
+ Status AssignLevelAndSeqnoForIngestedFile(SuperVersion* sv,
+ bool force_global_seqno,
+ CompactionStyle compaction_style,
+ SequenceNumber last_seqno,
+ IngestedFileInfo* file_to_ingest,
+ SequenceNumber* assigned_seqno);
+
+ // File that we want to ingest behind always goes to the lowest level;
+ // we just check that it fits in the level, that DB allows ingest_behind,
+ // and that we don't have 0 seqnums at the upper levels.
+ // REQUIRES: Mutex held
+ Status CheckLevelForIngestedBehindFile(IngestedFileInfo* file_to_ingest);
+
+ // Set the file global sequence number to `seqno`
+ Status AssignGlobalSeqnoForIngestedFile(IngestedFileInfo* file_to_ingest,
+ SequenceNumber seqno);
+ // Generate the file checksum and store in the IngestedFileInfo
+ IOStatus GenerateChecksumForIngestedFile(IngestedFileInfo* file_to_ingest);
+
+ // Check if `file_to_ingest` can fit in level `level`
+ // REQUIRES: Mutex held
+ bool IngestedFileFitInLevel(const IngestedFileInfo* file_to_ingest,
+ int level);
+
+ // Helper method to sync given file.
+ template <typename TWritableFile>
+ Status SyncIngestedFile(TWritableFile* file);
+
+ SystemClock* clock_;
+ FileSystemPtr fs_;
+ VersionSet* versions_;
+ ColumnFamilyData* cfd_;
+ const ImmutableDBOptions& db_options_;
+ const EnvOptions& env_options_;
+ SnapshotList* db_snapshots_;
+ autovector<IngestedFileInfo> files_to_ingest_;
+ const IngestExternalFileOptions& ingestion_options_;
+ Directories* directories_;
+ EventLogger* event_logger_;
+ VersionEdit edit_;
+ uint64_t job_start_time_;
+ int consumed_seqno_count_;
+ // Set in ExternalSstFileIngestionJob::Prepare(); if true, all files are
+ // ingested into L0
+ bool files_overlap_{false};
+ // Set in ExternalSstFileIngestionJob::Prepare(); if true and DB
+ // file_checksum_gen_factory is set, DB will generate a checksum for each
+ // file.
+ bool need_generate_file_checksum_{true};
+ std::shared_ptr<IOTracer> io_tracer_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/external_sst_file_test.cc b/src/rocksdb/db/external_sst_file_test.cc
new file mode 100644
index 000000000..d16f6a58c
--- /dev/null
+++ b/src/rocksdb/db/external_sst_file_test.cc
@@ -0,0 +1,2967 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include <functional>
+
+#include "db/db_test_util.h"
+#include "db/dbformat.h"
+#include "file/filename.h"
+#include "options/options_helper.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "rocksdb/sst_file_reader.h"
+#include "rocksdb/sst_file_writer.h"
+#include "test_util/testutil.h"
+#include "util/random.h"
+#include "util/thread_guard.h"
+#include "utilities/fault_injection_env.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// A test environment that can be configured to fail the Link operation.
+class ExternalSSTTestEnv : public EnvWrapper {
+ public:
+ ExternalSSTTestEnv(Env* t, bool fail_link)
+ : EnvWrapper(t), fail_link_(fail_link) {}
+ static const char* kClassName() { return "ExternalSSTTestEnv"; }
+ const char* Name() const override { return kClassName(); }
+
+ Status LinkFile(const std::string& s, const std::string& t) override {
+ if (fail_link_) {
+ return Status::NotSupported("Link failed");
+ }
+ return target()->LinkFile(s, t);
+ }
+
+ void set_fail_link(bool fail_link) { fail_link_ = fail_link; }
+
+ private:
+ bool fail_link_;
+};
+
+class ExternalSSTFileTestBase : public DBTestBase {
+ public:
+ ExternalSSTFileTestBase()
+ : DBTestBase("external_sst_file_test", /*env_do_fsync=*/true) {
+ sst_files_dir_ = dbname_ + "/sst_files/";
+ DestroyAndRecreateExternalSSTFilesDir();
+ }
+
+ void DestroyAndRecreateExternalSSTFilesDir() {
+ ASSERT_OK(DestroyDir(env_, sst_files_dir_));
+ ASSERT_OK(env_->CreateDir(sst_files_dir_));
+ }
+
+ ~ExternalSSTFileTestBase() override {
+ DestroyDir(env_, sst_files_dir_).PermitUncheckedError();
+ }
+
+ protected:
+ std::string sst_files_dir_;
+};
+
+class ExternSSTFileLinkFailFallbackTest
+ : public ExternalSSTFileTestBase,
+ public ::testing::WithParamInterface<std::tuple<bool, bool>> {
+ public:
+ ExternSSTFileLinkFailFallbackTest()
+ : test_env_(new ExternalSSTTestEnv(env_, true)) {
+ options_ = CurrentOptions();
+ options_.disable_auto_compactions = true;
+ options_.env = test_env_;
+ }
+
+ void TearDown() override {
+ delete db_;
+ db_ = nullptr;
+ ASSERT_OK(DestroyDB(dbname_, options_));
+ delete test_env_;
+ test_env_ = nullptr;
+ }
+
+ protected:
+ Options options_;
+ ExternalSSTTestEnv* test_env_;
+};
+
+class ExternalSSTFileTest
+ : public ExternalSSTFileTestBase,
+ public ::testing::WithParamInterface<std::tuple<bool, bool>> {
+ public:
+ ExternalSSTFileTest() {}
+
+ Status GenerateOneExternalFile(
+ const Options& options, ColumnFamilyHandle* cfh,
+ std::vector<std::pair<std::string, std::string>>& data, int file_id,
+ bool sort_data, std::string* external_file_path,
+ std::map<std::string, std::string>* true_data) {
+ // Generate a file id if not provided
+ if (-1 == file_id) {
+ file_id = (++last_file_id_);
+ }
+ // Sort data if asked to do so
+ if (sort_data) {
+ std::sort(data.begin(), data.end(),
+ [&](const std::pair<std::string, std::string>& e1,
+ const std::pair<std::string, std::string>& e2) {
+ return options.comparator->Compare(e1.first, e2.first) < 0;
+ });
+ auto uniq_iter = std::unique(
+ data.begin(), data.end(),
+ [&](const std::pair<std::string, std::string>& e1,
+ const std::pair<std::string, std::string>& e2) {
+ return options.comparator->Compare(e1.first, e2.first) == 0;
+ });
+ data.resize(uniq_iter - data.begin());
+ }
+ std::string file_path = sst_files_dir_ + std::to_string(file_id);
+ SstFileWriter sst_file_writer(EnvOptions(), options, cfh);
+ Status s = sst_file_writer.Open(file_path);
+ if (!s.ok()) {
+ return s;
+ }
+ for (const auto& entry : data) {
+ s = sst_file_writer.Put(entry.first, entry.second);
+ if (!s.ok()) {
+ sst_file_writer.Finish().PermitUncheckedError();
+ return s;
+ }
+ }
+ s = sst_file_writer.Finish();
+ if (s.ok() && external_file_path != nullptr) {
+ *external_file_path = file_path;
+ }
+ if (s.ok() && nullptr != true_data) {
+ for (const auto& entry : data) {
+ true_data->insert({entry.first, entry.second});
+ }
+ }
+ return s;
+ }
+
+ Status GenerateAndAddExternalFile(
+ const Options options,
+ std::vector<std::pair<std::string, std::string>> data, int file_id = -1,
+ bool allow_global_seqno = false, bool write_global_seqno = false,
+ bool verify_checksums_before_ingest = true, bool ingest_behind = false,
+ bool sort_data = false,
+ std::map<std::string, std::string>* true_data = nullptr,
+ ColumnFamilyHandle* cfh = nullptr) {
+ // Generate a file id if not provided
+ if (file_id == -1) {
+ file_id = last_file_id_ + 1;
+ last_file_id_++;
+ }
+
+ // Sort data if asked to do so
+ if (sort_data) {
+ std::sort(data.begin(), data.end(),
+ [&](const std::pair<std::string, std::string>& e1,
+ const std::pair<std::string, std::string>& e2) {
+ return options.comparator->Compare(e1.first, e2.first) < 0;
+ });
+ auto uniq_iter = std::unique(
+ data.begin(), data.end(),
+ [&](const std::pair<std::string, std::string>& e1,
+ const std::pair<std::string, std::string>& e2) {
+ return options.comparator->Compare(e1.first, e2.first) == 0;
+ });
+ data.resize(uniq_iter - data.begin());
+ }
+ std::string file_path = sst_files_dir_ + std::to_string(file_id);
+ SstFileWriter sst_file_writer(EnvOptions(), options, cfh);
+
+ Status s = sst_file_writer.Open(file_path);
+ if (!s.ok()) {
+ return s;
+ }
+ for (auto& entry : data) {
+ s = sst_file_writer.Put(entry.first, entry.second);
+ if (!s.ok()) {
+ sst_file_writer.Finish().PermitUncheckedError();
+ return s;
+ }
+ }
+ s = sst_file_writer.Finish();
+
+ if (s.ok()) {
+ IngestExternalFileOptions ifo;
+ ifo.allow_global_seqno = allow_global_seqno;
+ ifo.write_global_seqno = allow_global_seqno ? write_global_seqno : false;
+ ifo.verify_checksums_before_ingest = verify_checksums_before_ingest;
+ ifo.ingest_behind = ingest_behind;
+ if (cfh) {
+ s = db_->IngestExternalFile(cfh, {file_path}, ifo);
+ } else {
+ s = db_->IngestExternalFile({file_path}, ifo);
+ }
+ }
+
+ if (s.ok() && true_data) {
+ for (auto& entry : data) {
+ (*true_data)[entry.first] = entry.second;
+ }
+ }
+
+ return s;
+ }
+
+ Status GenerateAndAddExternalFiles(
+ const Options& options,
+ const std::vector<ColumnFamilyHandle*>& column_families,
+ const std::vector<IngestExternalFileOptions>& ifos,
+ std::vector<std::vector<std::pair<std::string, std::string>>>& data,
+ int file_id, bool sort_data,
+ std::vector<std::map<std::string, std::string>>& true_data) {
+ if (-1 == file_id) {
+ file_id = (++last_file_id_);
+ }
+ // Generate external SST files, one for each column family
+ size_t num_cfs = column_families.size();
+ assert(ifos.size() == num_cfs);
+ assert(data.size() == num_cfs);
+ std::vector<IngestExternalFileArg> args(num_cfs);
+ for (size_t i = 0; i != num_cfs; ++i) {
+ std::string external_file_path;
+ Status s = GenerateOneExternalFile(
+ options, column_families[i], data[i], file_id, sort_data,
+ &external_file_path,
+ true_data.size() == num_cfs ? &true_data[i] : nullptr);
+ if (!s.ok()) {
+ return s;
+ }
+ ++file_id;
+
+ args[i].column_family = column_families[i];
+ args[i].external_files.push_back(external_file_path);
+ args[i].options = ifos[i];
+ }
+ return db_->IngestExternalFiles(args);
+ }
+
+ Status GenerateAndAddExternalFile(
+ const Options options, std::vector<std::pair<int, std::string>> data,
+ int file_id = -1, bool allow_global_seqno = false,
+ bool write_global_seqno = false,
+ bool verify_checksums_before_ingest = true, bool ingest_behind = false,
+ bool sort_data = false,
+ std::map<std::string, std::string>* true_data = nullptr,
+ ColumnFamilyHandle* cfh = nullptr) {
+ std::vector<std::pair<std::string, std::string>> file_data;
+ for (auto& entry : data) {
+ file_data.emplace_back(Key(entry.first), entry.second);
+ }
+ return GenerateAndAddExternalFile(options, file_data, file_id,
+ allow_global_seqno, write_global_seqno,
+ verify_checksums_before_ingest,
+ ingest_behind, sort_data, true_data, cfh);
+ }
+
+ Status GenerateAndAddExternalFile(
+ const Options options, std::vector<int> keys, int file_id = -1,
+ bool allow_global_seqno = false, bool write_global_seqno = false,
+ bool verify_checksums_before_ingest = true, bool ingest_behind = false,
+ bool sort_data = false,
+ std::map<std::string, std::string>* true_data = nullptr,
+ ColumnFamilyHandle* cfh = nullptr) {
+ std::vector<std::pair<std::string, std::string>> file_data;
+ for (auto& k : keys) {
+ file_data.emplace_back(Key(k), Key(k) + std::to_string(file_id));
+ }
+ return GenerateAndAddExternalFile(options, file_data, file_id,
+ allow_global_seqno, write_global_seqno,
+ verify_checksums_before_ingest,
+ ingest_behind, sort_data, true_data, cfh);
+ }
+
+ Status DeprecatedAddFile(const std::vector<std::string>& files,
+ bool move_files = false,
+ bool skip_snapshot_check = false,
+ bool skip_write_global_seqno = false) {
+ IngestExternalFileOptions opts;
+ opts.move_files = move_files;
+ opts.snapshot_consistency = !skip_snapshot_check;
+ opts.allow_global_seqno = false;
+ opts.allow_blocking_flush = false;
+ opts.write_global_seqno = !skip_write_global_seqno;
+ return db_->IngestExternalFile(files, opts);
+ }
+
+ protected:
+ int last_file_id_ = 0;
+};
+
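The helpers above thread an ingest_behind flag through to IngestExternalFileOptions. As a hedged caller-side sketch (not part of the patch; the file path is hypothetical, and the DB is assumed to have been opened with DBOptions::allow_ingest_behind = true so the bottommost level is reserved), ingest-behind looks like this:

  #include <rocksdb/db.h>

  rocksdb::Status IngestBehindExample(rocksdb::DB* db) {
    rocksdb::IngestExternalFileOptions ifo;
    ifo.ingest_behind = true;  // place the file behind all existing data
    return db->IngestExternalFile({"/tmp/old_data.sst"}, ifo);
  }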
+TEST_F(ExternalSSTFileTest, Basic) {
+ do {
+ Options options = CurrentOptions();
+
+ SstFileWriter sst_file_writer(EnvOptions(), options);
+
+ // Current file size should be 0 after sst_file_writer init and before
+ // opening a file.
+ ASSERT_EQ(sst_file_writer.FileSize(), 0);
+
+ // file1.sst (0 => 99)
+ std::string file1 = sst_files_dir_ + "file1.sst";
+ ASSERT_OK(sst_file_writer.Open(file1));
+ for (int k = 0; k < 100; k++) {
+ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+ }
+ ExternalSstFileInfo file1_info;
+ ASSERT_OK(sst_file_writer.Finish(&file1_info));
+
+ // Current file size should be non-zero after a successful write.
+ ASSERT_GT(sst_file_writer.FileSize(), 0);
+
+ ASSERT_EQ(file1_info.file_path, file1);
+ ASSERT_EQ(file1_info.num_entries, 100);
+ ASSERT_EQ(file1_info.smallest_key, Key(0));
+ ASSERT_EQ(file1_info.largest_key, Key(99));
+ ASSERT_EQ(file1_info.num_range_del_entries, 0);
+ ASSERT_EQ(file1_info.smallest_range_del_key, "");
+ ASSERT_EQ(file1_info.largest_range_del_key, "");
+ // sst_file_writer already finished, cannot add this value
+ ASSERT_NOK(sst_file_writer.Put(Key(100), "bad_val"));
+
+ // file2.sst (100 => 199)
+ std::string file2 = sst_files_dir_ + "file2.sst";
+ ASSERT_OK(sst_file_writer.Open(file2));
+ for (int k = 100; k < 200; k++) {
+ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+ }
+ // Cannot add this key because it's not after the last added key
+ ASSERT_NOK(sst_file_writer.Put(Key(99), "bad_val"));
+ ExternalSstFileInfo file2_info;
+ ASSERT_OK(sst_file_writer.Finish(&file2_info));
+ ASSERT_EQ(file2_info.file_path, file2);
+ ASSERT_EQ(file2_info.num_entries, 100);
+ ASSERT_EQ(file2_info.smallest_key, Key(100));
+ ASSERT_EQ(file2_info.largest_key, Key(199));
+
+ // file3.sst (195 => 299)
+ // This file's values overlap with file2's values
+ std::string file3 = sst_files_dir_ + "file3.sst";
+ ASSERT_OK(sst_file_writer.Open(file3));
+ for (int k = 195; k < 300; k++) {
+ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val_overlap"));
+ }
+ ExternalSstFileInfo file3_info;
+ ASSERT_OK(sst_file_writer.Finish(&file3_info));
+
+ // Current file size should be non-zero after a successful Finish().
+ ASSERT_GT(sst_file_writer.FileSize(), 0);
+ ASSERT_EQ(file3_info.file_path, file3);
+ ASSERT_EQ(file3_info.num_entries, 105);
+ ASSERT_EQ(file3_info.smallest_key, Key(195));
+ ASSERT_EQ(file3_info.largest_key, Key(299));
+
+ // file4.sst (30 => 39)
+ // This file's values overlap with file1's values
+ std::string file4 = sst_files_dir_ + "file4.sst";
+ ASSERT_OK(sst_file_writer.Open(file4));
+ for (int k = 30; k < 40; k++) {
+ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val_overlap"));
+ }
+ ExternalSstFileInfo file4_info;
+ ASSERT_OK(sst_file_writer.Finish(&file4_info));
+ ASSERT_EQ(file4_info.file_path, file4);
+ ASSERT_EQ(file4_info.num_entries, 10);
+ ASSERT_EQ(file4_info.smallest_key, Key(30));
+ ASSERT_EQ(file4_info.largest_key, Key(39));
+
+ // file5.sst (400 => 499)
+ std::string file5 = sst_files_dir_ + "file5.sst";
+ ASSERT_OK(sst_file_writer.Open(file5));
+ for (int k = 400; k < 500; k++) {
+ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+ }
+ ExternalSstFileInfo file5_info;
+ ASSERT_OK(sst_file_writer.Finish(&file5_info));
+ ASSERT_EQ(file5_info.file_path, file5);
+ ASSERT_EQ(file5_info.num_entries, 100);
+ ASSERT_EQ(file5_info.smallest_key, Key(400));
+ ASSERT_EQ(file5_info.largest_key, Key(499));
+
+ // file6.sst (delete 400 => 500)
+ std::string file6 = sst_files_dir_ + "file6.sst";
+ ASSERT_OK(sst_file_writer.Open(file6));
+ ASSERT_OK(sst_file_writer.DeleteRange(Key(400), Key(500)));
+ ExternalSstFileInfo file6_info;
+ ASSERT_OK(sst_file_writer.Finish(&file6_info));
+ ASSERT_EQ(file6_info.file_path, file6);
+ ASSERT_EQ(file6_info.num_entries, 0);
+ ASSERT_EQ(file6_info.smallest_key, "");
+ ASSERT_EQ(file6_info.largest_key, "");
+ ASSERT_EQ(file6_info.num_range_del_entries, 1);
+ ASSERT_EQ(file6_info.smallest_range_del_key, Key(400));
+ ASSERT_EQ(file6_info.largest_range_del_key, Key(500));
+
+ // file7.sst (delete 500 => 550 and 525 => 575, put even keys 520 => 598)
+ std::string file7 = sst_files_dir_ + "file7.sst";
+ ASSERT_OK(sst_file_writer.Open(file7));
+ ASSERT_OK(sst_file_writer.DeleteRange(Key(500), Key(550)));
+ for (int k = 520; k < 560; k += 2) {
+ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+ }
+ ASSERT_OK(sst_file_writer.DeleteRange(Key(525), Key(575)));
+ for (int k = 560; k < 600; k += 2) {
+ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+ }
+ ExternalSstFileInfo file7_info;
+ ASSERT_OK(sst_file_writer.Finish(&file7_info));
+ ASSERT_EQ(file7_info.file_path, file7);
+ ASSERT_EQ(file7_info.num_entries, 40);
+ ASSERT_EQ(file7_info.smallest_key, Key(520));
+ ASSERT_EQ(file7_info.largest_key, Key(598));
+ ASSERT_EQ(file7_info.num_range_del_entries, 2);
+ ASSERT_EQ(file7_info.smallest_range_del_key, Key(500));
+ ASSERT_EQ(file7_info.largest_range_del_key, Key(575));
+
+ // file8.sst (delete 600 => 700)
+ std::string file8 = sst_files_dir_ + "file8.sst";
+ ASSERT_OK(sst_file_writer.Open(file8));
+ ASSERT_OK(sst_file_writer.DeleteRange(Key(600), Key(700)));
+ ExternalSstFileInfo file8_info;
+ ASSERT_OK(sst_file_writer.Finish(&file8_info));
+ ASSERT_EQ(file8_info.file_path, file8);
+ ASSERT_EQ(file8_info.num_entries, 0);
+ ASSERT_EQ(file8_info.smallest_key, "");
+ ASSERT_EQ(file8_info.largest_key, "");
+ ASSERT_EQ(file8_info.num_range_del_entries, 1);
+ ASSERT_EQ(file8_info.smallest_range_del_key, Key(600));
+ ASSERT_EQ(file8_info.largest_range_del_key, Key(700));
+
+ // Cannot create an empty sst file
+ std::string file_empty = sst_files_dir_ + "file_empty.sst";
+ ExternalSstFileInfo file_empty_info;
+ ASSERT_NOK(sst_file_writer.Finish(&file_empty_info));
+
+ DestroyAndReopen(options);
+ // Add file using file path
+ ASSERT_OK(DeprecatedAddFile({file1}));
+ ASSERT_EQ(db_->GetLatestSequenceNumber(), 0U);
+ for (int k = 0; k < 100; k++) {
+ ASSERT_EQ(Get(Key(k)), Key(k) + "_val");
+ }
+
+ // Add file while holding a snapshot will fail
+ const Snapshot* s1 = db_->GetSnapshot();
+ if (s1 != nullptr) {
+ ASSERT_NOK(DeprecatedAddFile({file2}));
+ db_->ReleaseSnapshot(s1);
+ }
+ // We can add the file after releasing the snapshot
+ ASSERT_OK(DeprecatedAddFile({file2}));
+
+ ASSERT_EQ(db_->GetLatestSequenceNumber(), 0U);
+ for (int k = 0; k < 200; k++) {
+ ASSERT_EQ(Get(Key(k)), Key(k) + "_val");
+ }
+
+ // This file has overlapping values with the existing data
+ ASSERT_NOK(DeprecatedAddFile({file3}));
+
+ // This file has overlapping values with the existing data
+ ASSERT_NOK(DeprecatedAddFile({file4}));
+
+ // Overwrite values of keys divisible by 5
+ for (int k = 0; k < 200; k += 5) {
+ ASSERT_OK(Put(Key(k), Key(k) + "_val_new"));
+ }
+ ASSERT_NE(db_->GetLatestSequenceNumber(), 0U);
+
+ // Key range of file5 (400 => 499) doesn't overlap with any keys in the DB
+ ASSERT_OK(DeprecatedAddFile({file5}));
+
+ // This file's range tombstone overlaps with the existing data
+ ASSERT_NOK(DeprecatedAddFile({file6}));
+
+ // Key range of file7 (500 => 598) doesn't overlap with any keys in the DB
+ ASSERT_OK(DeprecatedAddFile({file7}));
+
+ // Key range of file8 (600 => 700) doesn't overlap with any keys in the DB
+ ASSERT_OK(DeprecatedAddFile({file8}));
+
+ // Make sure values are correct before and after flush/compaction
+ for (int i = 0; i < 2; i++) {
+ for (int k = 0; k < 200; k++) {
+ std::string value = Key(k) + "_val";
+ if (k % 5 == 0) {
+ value += "_new";
+ }
+ ASSERT_EQ(Get(Key(k)), value);
+ }
+ for (int k = 400; k < 500; k++) {
+ std::string value = Key(k) + "_val";
+ ASSERT_EQ(Get(Key(k)), value);
+ }
+ for (int k = 500; k < 600; k++) {
+ std::string value = Key(k) + "_val";
+ if (k < 520 || k % 2 == 1) {
+ value = "NOT_FOUND";
+ }
+ ASSERT_EQ(Get(Key(k)), value);
+ }
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ }
+
+ Close();
+ options.disable_auto_compactions = true;
+ Reopen(options);
+
+ // Delete keys in range (400 => 499)
+ for (int k = 400; k < 500; k++) {
+ ASSERT_OK(Delete(Key(k)));
+ }
+ // We deleted the keys in range (400 => 499) but still cannot add file5
+ // because of the deletion tombstones
+ ASSERT_NOK(DeprecatedAddFile({file5}));
+
+ // Compacting the DB will remove the tombstones
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+ // Now we can add the file
+ ASSERT_OK(DeprecatedAddFile({file5}));
+
+ // Verify values of file5 in DB
+ for (int k = 400; k < 500; k++) {
+ std::string value = Key(k) + "_val";
+ ASSERT_EQ(Get(Key(k)), value);
+ }
+ DestroyAndRecreateExternalSSTFilesDir();
+ } while (ChangeOptions(kSkipPlainTable | kSkipFIFOCompaction |
+ kRangeDelSkipConfigs));
+}
+
+class SstFileWriterCollector : public TablePropertiesCollector {
+ public:
+ explicit SstFileWriterCollector(const std::string prefix) : prefix_(prefix) {
+ name_ = prefix_ + "_SstFileWriterCollector";
+ }
+
+ const char* Name() const override { return name_.c_str(); }
+
+ Status Finish(UserCollectedProperties* properties) override {
+ std::string count = std::to_string(count_);
+ *properties = UserCollectedProperties{
+ {prefix_ + "_SstFileWriterCollector", "YES"},
+ {prefix_ + "_Count", count},
+ };
+ return Status::OK();
+ }
+
+ Status AddUserKey(const Slice& /*user_key*/, const Slice& /*value*/,
+ EntryType /*type*/, SequenceNumber /*seq*/,
+ uint64_t /*file_size*/) override {
+ ++count_;
+ return Status::OK();
+ }
+
+ UserCollectedProperties GetReadableProperties() const override {
+ return UserCollectedProperties{};
+ }
+
+ private:
+ uint32_t count_ = 0;
+ std::string prefix_;
+ std::string name_;
+};
+
+class SstFileWriterCollectorFactory : public TablePropertiesCollectorFactory {
+ public:
+ explicit SstFileWriterCollectorFactory(std::string prefix)
+ : prefix_(prefix), num_created_(0) {}
+ TablePropertiesCollector* CreateTablePropertiesCollector(
+ TablePropertiesCollectorFactory::Context /*context*/) override {
+ num_created_++;
+ return new SstFileWriterCollector(prefix_);
+ }
+ const char* Name() const override { return "SstFileWriterCollectorFactory"; }
+
+ std::string prefix_;
+ uint32_t num_created_;
+};
+
+TEST_F(ExternalSSTFileTest, AddList) {
+ do {
+ Options options = CurrentOptions();
+
+ auto abc_collector = std::make_shared<SstFileWriterCollectorFactory>("abc");
+ auto xyz_collector = std::make_shared<SstFileWriterCollectorFactory>("xyz");
+
+ options.table_properties_collector_factories.emplace_back(abc_collector);
+ options.table_properties_collector_factories.emplace_back(xyz_collector);
+
+ SstFileWriter sst_file_writer(EnvOptions(), options);
+
+ // file1.sst (0 => 99)
+ std::string file1 = sst_files_dir_ + "file1.sst";
+ ASSERT_OK(sst_file_writer.Open(file1));
+ for (int k = 0; k < 100; k++) {
+ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+ }
+ ExternalSstFileInfo file1_info;
+ ASSERT_OK(sst_file_writer.Finish(&file1_info));
+ ASSERT_EQ(file1_info.file_path, file1);
+ ASSERT_EQ(file1_info.num_entries, 100);
+ ASSERT_EQ(file1_info.smallest_key, Key(0));
+ ASSERT_EQ(file1_info.largest_key, Key(99));
+ // sst_file_writer already finished, cannot add this value
+ ASSERT_NOK(sst_file_writer.Put(Key(100), "bad_val"));
+
+ // file2.sst (100 => 199)
+ std::string file2 = sst_files_dir_ + "file2.sst";
+ ASSERT_OK(sst_file_writer.Open(file2));
+ for (int k = 100; k < 200; k++) {
+ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+ }
+ // Cannot add this key because it's not after the last added key
+ ASSERT_NOK(sst_file_writer.Put(Key(99), "bad_val"));
+ ExternalSstFileInfo file2_info;
+ ASSERT_OK(sst_file_writer.Finish(&file2_info));
+ ASSERT_EQ(file2_info.file_path, file2);
+ ASSERT_EQ(file2_info.num_entries, 100);
+ ASSERT_EQ(file2_info.smallest_key, Key(100));
+ ASSERT_EQ(file2_info.largest_key, Key(199));
+
+ // file3.sst (195 => 199)
+ // This file's values overlap with file2's values
+ std::string file3 = sst_files_dir_ + "file3.sst";
+ ASSERT_OK(sst_file_writer.Open(file3));
+ for (int k = 195; k < 200; k++) {
+ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val_overlap"));
+ }
+ ExternalSstFileInfo file3_info;
+ ASSERT_OK(sst_file_writer.Finish(&file3_info));
+ ASSERT_EQ(file3_info.file_path, file3);
+ ASSERT_EQ(file3_info.num_entries, 5);
+ ASSERT_EQ(file3_info.smallest_key, Key(195));
+ ASSERT_EQ(file3_info.largest_key, Key(199));
+
+ // file4.sst (30 => 39)
+ // This file's values overlap with file1's values
+ std::string file4 = sst_files_dir_ + "file4.sst";
+ ASSERT_OK(sst_file_writer.Open(file4));
+ for (int k = 30; k < 40; k++) {
+ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val_overlap"));
+ }
+ ExternalSstFileInfo file4_info;
+ ASSERT_OK(sst_file_writer.Finish(&file4_info));
+ ASSERT_EQ(file4_info.file_path, file4);
+ ASSERT_EQ(file4_info.num_entries, 10);
+ ASSERT_EQ(file4_info.smallest_key, Key(30));
+ ASSERT_EQ(file4_info.largest_key, Key(39));
+
+ // file5.sst (200 => 299)
+ std::string file5 = sst_files_dir_ + "file5.sst";
+ ASSERT_OK(sst_file_writer.Open(file5));
+ for (int k = 200; k < 300; k++) {
+ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+ }
+ ExternalSstFileInfo file5_info;
+ ASSERT_OK(sst_file_writer.Finish(&file5_info));
+ ASSERT_EQ(file5_info.file_path, file5);
+ ASSERT_EQ(file5_info.num_entries, 100);
+ ASSERT_EQ(file5_info.smallest_key, Key(200));
+ ASSERT_EQ(file5_info.largest_key, Key(299));
+
+ // file6.sst (delete 0 => 100)
+ std::string file6 = sst_files_dir_ + "file6.sst";
+ ASSERT_OK(sst_file_writer.Open(file6));
+ ASSERT_OK(sst_file_writer.DeleteRange(Key(0), Key(75)));
+ ASSERT_OK(sst_file_writer.DeleteRange(Key(25), Key(100)));
+ ExternalSstFileInfo file6_info;
+ ASSERT_OK(sst_file_writer.Finish(&file6_info));
+ ASSERT_EQ(file6_info.file_path, file6);
+ ASSERT_EQ(file6_info.num_entries, 0);
+ ASSERT_EQ(file6_info.smallest_key, "");
+ ASSERT_EQ(file6_info.largest_key, "");
+ ASSERT_EQ(file6_info.num_range_del_entries, 2);
+ ASSERT_EQ(file6_info.smallest_range_del_key, Key(0));
+ ASSERT_EQ(file6_info.largest_range_del_key, Key(100));
+
+ // file7.sst (delete 99 => 201)
+ std::string file7 = sst_files_dir_ + "file7.sst";
+ ASSERT_OK(sst_file_writer.Open(file7));
+ ASSERT_OK(sst_file_writer.DeleteRange(Key(99), Key(201)));
+ ExternalSstFileInfo file7_info;
+ ASSERT_OK(sst_file_writer.Finish(&file7_info));
+ ASSERT_EQ(file7_info.file_path, file7);
+ ASSERT_EQ(file7_info.num_entries, 0);
+ ASSERT_EQ(file7_info.smallest_key, "");
+ ASSERT_EQ(file7_info.largest_key, "");
+ ASSERT_EQ(file7_info.num_range_del_entries, 1);
+ ASSERT_EQ(file7_info.smallest_range_del_key, Key(99));
+ ASSERT_EQ(file7_info.largest_range_del_key, Key(201));
+
+ // file_list1 has an internal key range conflict
+ std::vector<std::string> file_list0({file1, file2});
+ std::vector<std::string> file_list1({file3, file2, file1});
+ std::vector<std::string> file_list2({file5});
+ std::vector<std::string> file_list3({file3, file4});
+ std::vector<std::string> file_list4({file5, file7});
+ std::vector<std::string> file_list5({file6, file7});
+
+ DestroyAndReopen(options);
+
+ // The files in this list have key ranges that overlap with each other
+ ASSERT_NOK(DeprecatedAddFile(file_list1));
+ // Both of the following lists overlap with a range deletion tombstone.
+ ASSERT_NOK(DeprecatedAddFile(file_list4));
+ ASSERT_NOK(DeprecatedAddFile(file_list5));
+
+ // Add files using file path list
+ ASSERT_OK(DeprecatedAddFile(file_list0));
+ ASSERT_EQ(db_->GetLatestSequenceNumber(), 0U);
+ for (int k = 0; k < 200; k++) {
+ ASSERT_EQ(Get(Key(k)), Key(k) + "_val");
+ }
+
+ TablePropertiesCollection props;
+ ASSERT_OK(db_->GetPropertiesOfAllTables(&props));
+ ASSERT_EQ(props.size(), 2);
+ for (auto file_props : props) {
+ auto user_props = file_props.second->user_collected_properties;
+ ASSERT_EQ(user_props["abc_SstFileWriterCollector"], "YES");
+ ASSERT_EQ(user_props["xyz_SstFileWriterCollector"], "YES");
+ ASSERT_EQ(user_props["abc_Count"], "100");
+ ASSERT_EQ(user_props["xyz_Count"], "100");
+ }
+
+ // Add file while holding a snapshot will fail
+ const Snapshot* s1 = db_->GetSnapshot();
+ if (s1 != nullptr) {
+ ASSERT_NOK(DeprecatedAddFile(file_list2));
+ db_->ReleaseSnapshot(s1);
+ }
+ // We can add the file after releasing the snapshot
+ ASSERT_OK(DeprecatedAddFile(file_list2));
+ ASSERT_EQ(db_->GetLatestSequenceNumber(), 0U);
+ for (int k = 0; k < 300; k++) {
+ ASSERT_EQ(Get(Key(k)), Key(k) + "_val");
+ }
+
+ ASSERT_OK(db_->GetPropertiesOfAllTables(&props));
+ ASSERT_EQ(props.size(), 3);
+ for (auto file_props : props) {
+ auto user_props = file_props.second->user_collected_properties;
+ ASSERT_EQ(user_props["abc_SstFileWriterCollector"], "YES");
+ ASSERT_EQ(user_props["xyz_SstFileWriterCollector"], "YES");
+ ASSERT_EQ(user_props["abc_Count"], "100");
+ ASSERT_EQ(user_props["xyz_Count"], "100");
+ }
+
+ // This file list has overlapping values with the existing data
+ ASSERT_NOK(DeprecatedAddFile(file_list3));
+
+ // Overwrite values of keys divisible by 5
+ for (int k = 0; k < 200; k += 5) {
+ ASSERT_OK(Put(Key(k), Key(k) + "_val_new"));
+ }
+ ASSERT_NE(db_->GetLatestSequenceNumber(), 0U);
+
+ // Make sure values are correct before and after flush/compaction
+ for (int i = 0; i < 2; i++) {
+ for (int k = 0; k < 200; k++) {
+ std::string value = Key(k) + "_val";
+ if (k % 5 == 0) {
+ value += "_new";
+ }
+ ASSERT_EQ(Get(Key(k)), value);
+ }
+ for (int k = 200; k < 300; k++) {
+ std::string value = Key(k) + "_val";
+ ASSERT_EQ(Get(Key(k)), value);
+ }
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ }
+
+ // Delete keys in range (200 => 299)
+ for (int k = 200; k < 300; k++) {
+ ASSERT_OK(Delete(Key(k)));
+ }
+ // We deleted the keys in range (200 => 299) but still cannot add file5
+ // because of the deletion tombstones
+ ASSERT_NOK(DeprecatedAddFile(file_list2));
+
+ // Compacting the DB will remove the tombstones
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+ // Now we can add the file
+ ASSERT_OK(DeprecatedAddFile(file_list2));
+
+ // Verify values of file5 in DB
+ for (int k = 200; k < 300; k++) {
+ std::string value = Key(k) + "_val";
+ ASSERT_EQ(Get(Key(k)), value);
+ }
+ DestroyAndRecreateExternalSSTFilesDir();
+ } while (ChangeOptions(kSkipPlainTable | kSkipFIFOCompaction |
+ kRangeDelSkipConfigs));
+}
+
+TEST_F(ExternalSSTFileTest, AddListAtomicity) {
+ do {
+ Options options = CurrentOptions();
+
+ SstFileWriter sst_file_writer(EnvOptions(), options);
+
+ // files[0].sst (0 => 99)
+ // files[1].sst (100 => 199)
+ // ...
+ // file[8].sst (800 => 899)
+ int n = 9;
+ std::vector<std::string> files(n);
+ std::vector<ExternalSstFileInfo> files_info(n);
+ for (int i = 0; i < n; i++) {
+ files[i] = sst_files_dir_ + "file" + std::to_string(i) + ".sst";
+ ASSERT_OK(sst_file_writer.Open(files[i]));
+ for (int k = i * 100; k < (i + 1) * 100; k++) {
+ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+ }
+ ASSERT_OK(sst_file_writer.Finish(&files_info[i]));
+ ASSERT_EQ(files_info[i].file_path, files[i]);
+ ASSERT_EQ(files_info[i].num_entries, 100);
+ ASSERT_EQ(files_info[i].smallest_key, Key(i * 100));
+ ASSERT_EQ(files_info[i].largest_key, Key((i + 1) * 100 - 1));
+ }
+ files.push_back(sst_files_dir_ + "file" + std::to_string(n) + ".sst");
+ ASSERT_NOK(DeprecatedAddFile(files));
+ for (int k = 0; k < n * 100; k++) {
+ ASSERT_EQ("NOT_FOUND", Get(Key(k)));
+ }
+ files.pop_back();
+ ASSERT_OK(DeprecatedAddFile(files));
+ for (int k = 0; k < n * 100; k++) {
+ std::string value = Key(k) + "_val";
+ ASSERT_EQ(Get(Key(k)), value);
+ }
+ DestroyAndRecreateExternalSSTFilesDir();
+ } while (ChangeOptions(kSkipPlainTable | kSkipFIFOCompaction));
+}
+
+// This test reproduces a bug that could happen in some cases if the DB
+// started purging obsolete files while we were adding an external SST file.
+// This could result in the file being deleted while it is being added.
+TEST_F(ExternalSSTFileTest, PurgeObsoleteFilesBug) {
+ Options options = CurrentOptions();
+ SstFileWriter sst_file_writer(EnvOptions(), options);
+
+ // file1.sst (0 => 500)
+ std::string sst_file_path = sst_files_dir_ + "file1.sst";
+ ASSERT_OK(sst_file_writer.Open(sst_file_path));
+ for (int i = 0; i < 500; i++) {
+ std::string k = Key(i);
+ ASSERT_OK(sst_file_writer.Put(k, k + "_val"));
+ }
+
+ ExternalSstFileInfo sst_file_info;
+ ASSERT_OK(sst_file_writer.Finish(&sst_file_info));
+
+ options.delete_obsolete_files_period_micros = 0;
+ options.disable_auto_compactions = true;
+ DestroyAndReopen(options);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "ExternalSstFileIngestionJob::Prepare:FileAdded", [&](void* /* arg */) {
+ ASSERT_OK(Put("aaa", "bbb"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("aaa", "xxx"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(DeprecatedAddFile({sst_file_path}));
+
+ for (int i = 0; i < 500; i++) {
+ std::string k = Key(i);
+ std::string v = k + "_val";
+ ASSERT_EQ(Get(k), v);
+ }
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(ExternalSSTFileTest, SkipSnapshot) {
+ Options options = CurrentOptions();
+
+ SstFileWriter sst_file_writer(EnvOptions(), options);
+
+ // file1.sst (0 => 99)
+ std::string file1 = sst_files_dir_ + "file1.sst";
+ ASSERT_OK(sst_file_writer.Open(file1));
+ for (int k = 0; k < 100; k++) {
+ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+ }
+ ExternalSstFileInfo file1_info;
+ ASSERT_OK(sst_file_writer.Finish(&file1_info));
+ ASSERT_EQ(file1_info.file_path, file1);
+ ASSERT_EQ(file1_info.num_entries, 100);
+ ASSERT_EQ(file1_info.smallest_key, Key(0));
+ ASSERT_EQ(file1_info.largest_key, Key(99));
+
+ // file2.sst (100 => 299)
+ std::string file2 = sst_files_dir_ + "file2.sst";
+ ASSERT_OK(sst_file_writer.Open(file2));
+ for (int k = 100; k < 300; k++) {
+ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+ }
+ ExternalSstFileInfo file2_info;
+ ASSERT_OK(sst_file_writer.Finish(&file2_info));
+ ASSERT_EQ(file2_info.file_path, file2);
+ ASSERT_EQ(file2_info.num_entries, 200);
+ ASSERT_EQ(file2_info.smallest_key, Key(100));
+ ASSERT_EQ(file2_info.largest_key, Key(299));
+
+ ASSERT_OK(DeprecatedAddFile({file1}));
+
+ // Adding a file will fail while holding a snapshot and using the default
+ // skip_snapshot_check (false)
+ const Snapshot* s1 = db_->GetSnapshot();
+ if (s1 != nullptr) {
+ ASSERT_NOK(DeprecatedAddFile({file2}));
+ }
+
+ // Adding a file will succeed with skip_snapshot_check set to true, even
+ // while the DB is holding a snapshot
+ if (s1 != nullptr) {
+ ASSERT_OK(DeprecatedAddFile({file2}, false, true));
+ db_->ReleaseSnapshot(s1);
+ }
+
+ // file3.sst (300 => 399)
+ std::string file3 = sst_files_dir_ + "file3.sst";
+ ASSERT_OK(sst_file_writer.Open(file3));
+ for (int k = 300; k < 400; k++) {
+ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+ }
+ ExternalSstFileInfo file3_info;
+ ASSERT_OK(sst_file_writer.Finish(&file3_info));
+ ASSERT_EQ(file3_info.file_path, file3);
+ ASSERT_EQ(file3_info.num_entries, 100);
+ ASSERT_EQ(file3_info.smallest_key, Key(300));
+ ASSERT_EQ(file3_info.largest_key, Key(399));
+
+ // Key(300) does not exist yet; after ingesting file3 with
+ // skip_snapshot_check, it becomes visible even through the earlier snapshot
+ ASSERT_EQ(Get(Key(300)), "NOT_FOUND");
+ const Snapshot* s2 = db_->GetSnapshot();
+ ASSERT_OK(DeprecatedAddFile({file3}, false, true));
+ ASSERT_EQ(Get(Key(300)), Key(300) + ("_val"));
+ ASSERT_EQ(Get(Key(300), s2), Key(300) + ("_val"));
+
+ db_->ReleaseSnapshot(s2);
+}
+
+TEST_F(ExternalSSTFileTest, MultiThreaded) {
+ env_->skip_fsync_ = true;
+ // Bulk load 10 files, each containing 1000 keys
+ int num_files = 10;
+ int keys_per_file = 1000;
+
+ // Generate file names
+ std::vector<std::string> file_names;
+ for (int i = 0; i < num_files; i++) {
+ std::string file_name = "file_" + std::to_string(i) + ".sst";
+ file_names.push_back(sst_files_dir_ + file_name);
+ }
+
+ do {
+ Options options = CurrentOptions();
+
+ std::atomic<int> thread_num(0);
+ std::function<void()> write_file_func = [&]() {
+ int file_idx = thread_num.fetch_add(1);
+ int range_start = file_idx * keys_per_file;
+ int range_end = range_start + keys_per_file;
+
+ SstFileWriter sst_file_writer(EnvOptions(), options);
+
+ ASSERT_OK(sst_file_writer.Open(file_names[file_idx]));
+
+ for (int k = range_start; k < range_end; k++) {
+ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k)));
+ }
+
+ ASSERT_OK(sst_file_writer.Finish());
+ };
+ // Write num_files files in parallel
+ std::vector<port::Thread> sst_writer_threads;
+ for (int i = 0; i < num_files; ++i) {
+ sst_writer_threads.emplace_back(write_file_func);
+ }
+
+ for (auto& t : sst_writer_threads) {
+ t.join();
+ }
+
+ fprintf(stderr, "Wrote %d files (%d keys)\n", num_files,
+ num_files * keys_per_file);
+
+ thread_num.store(0);
+ std::atomic<int> files_added(0);
+ // Thread 0 -> Load {f0,f1}
+ // Thread 1 -> Load {f0,f1}
+ // Thread 2 -> Load {f2,f3}
+ // Thread 3 -> Load {f2,f3}
+ // Thread 4 -> Load {f4,f5}
+ // Thread 5 -> Load {f4,f5}
+ // ...
+ std::function<void()> load_file_func = [&]() {
+ // We intentionally add every file twice, and assert that it was added
+ // only once and the other add failed
+ int thread_id = thread_num.fetch_add(1);
+ int file_idx = (thread_id / 2) * 2;
+ // Sometimes we copy the files and sometimes we move (link) them;
+ // the result should be the same
+ bool move_file = (thread_id % 3 == 0);
+
+ std::vector<std::string> files_to_add;
+
+ files_to_add = {file_names[file_idx]};
+ if (static_cast<size_t>(file_idx + 1) < file_names.size()) {
+ files_to_add.push_back(file_names[file_idx + 1]);
+ }
+
+ Status s = DeprecatedAddFile(files_to_add, move_file);
+ if (s.ok()) {
+ files_added += static_cast<int>(files_to_add.size());
+ }
+ };
+
+ // Bulk load num_files files in parallel
+ std::vector<port::Thread> add_file_threads;
+ DestroyAndReopen(options);
+ for (int i = 0; i < num_files; ++i) {
+ add_file_threads.emplace_back(load_file_func);
+ }
+
+ for (auto& t : add_file_threads) {
+ t.join();
+ }
+ ASSERT_EQ(files_added.load(), num_files);
+ fprintf(stderr, "Loaded %d files (%d keys)\n", num_files,
+ num_files * keys_per_file);
+
+ // Overwrite values of keys divisible by 100
+ for (int k = 0; k < num_files * keys_per_file; k += 100) {
+ std::string key = Key(k);
+ ASSERT_OK(Put(key, key + "_new"));
+ }
+
+ for (int i = 0; i < 2; i++) {
+ // Make sure the values are correct before and after flush/compaction
+ for (int k = 0; k < num_files * keys_per_file; ++k) {
+ std::string key = Key(k);
+ std::string value = (k % 100 == 0) ? (key + "_new") : key;
+ ASSERT_EQ(Get(key), value);
+ }
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ }
+
+ fprintf(stderr, "Verified %d values\n", num_files * keys_per_file);
+ DestroyAndRecreateExternalSSTFilesDir();
+ } while (ChangeOptions(kSkipPlainTable | kSkipFIFOCompaction));
+}
+
+TEST_F(ExternalSSTFileTest, OverlappingRanges) {
+ env_->skip_fsync_ = true;
+ Random rnd(301);
+ SequenceNumber assigned_seqno = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "ExternalSstFileIngestionJob::Run", [&assigned_seqno](void* arg) {
+ ASSERT_TRUE(arg != nullptr);
+ assigned_seqno = *(static_cast<SequenceNumber*>(arg));
+ });
+ bool need_flush = false;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::IngestExternalFile:NeedFlush", [&need_flush](void* arg) {
+ ASSERT_TRUE(arg != nullptr);
+ need_flush = *(static_cast<bool*>(arg));
+ });
+ bool overlap_with_db = false;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "ExternalSstFileIngestionJob::AssignLevelAndSeqnoForIngestedFile",
+ [&overlap_with_db](void* arg) {
+ ASSERT_TRUE(arg != nullptr);
+ overlap_with_db = *(static_cast<bool*>(arg));
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ do {
+ Options options = CurrentOptions();
+ env_->skip_fsync_ = true;
+ DestroyAndReopen(options);
+
+ SstFileWriter sst_file_writer(EnvOptions(), options);
+
+ printf("Option config = %d\n", option_config_);
+ std::vector<std::pair<int, int>> key_ranges;
+ for (int i = 0; i < 100; i++) {
+ int range_start = rnd.Uniform(20000);
+ int keys_per_range = 10 + rnd.Uniform(41);
+
+ key_ranges.emplace_back(range_start, range_start + keys_per_range);
+ }
+
+ int memtable_add = 0;
+ int success_add_file = 0;
+ int failed_add_file = 0;
+ std::map<std::string, std::string> true_data;
+ for (size_t i = 0; i < key_ranges.size(); i++) {
+ int range_start = key_ranges[i].first;
+ int range_end = key_ranges[i].second;
+
+ Status s;
+ std::string range_val = "range_" + std::to_string(i);
+
+ // For 20% of ranges we use DB::Put, for 80% we use DB::AddFile
+ if (i && i % 5 == 0) {
+ // Use DB::Put to insert range (insert into memtable)
+ range_val += "_put";
+ for (int k = range_start; k <= range_end; k++) {
+ s = Put(Key(k), range_val);
+ ASSERT_OK(s);
+ }
+ memtable_add++;
+ } else {
+ // Use DB::AddFile to insert range
+ range_val += "_add_file";
+
+ // Generate the file containing the range
+ std::string file_name = sst_files_dir_ + env_->GenerateUniqueId();
+ s = sst_file_writer.Open(file_name);
+ ASSERT_OK(s);
+ for (int k = range_start; k <= range_end; k++) {
+ s = sst_file_writer.Put(Key(k), range_val);
+ ASSERT_OK(s);
+ }
+ ExternalSstFileInfo file_info;
+ s = sst_file_writer.Finish(&file_info);
+ ASSERT_OK(s);
+
+ // Insert the generated file
+ s = DeprecatedAddFile({file_name});
+ auto it = true_data.lower_bound(Key(range_start));
+ if (option_config_ != kUniversalCompaction &&
+ option_config_ != kUniversalCompactionMultiLevel &&
+ option_config_ != kUniversalSubcompactions) {
+ if (it != true_data.end() && it->first <= Key(range_end)) {
+ // This range overlaps with data that already exists in the DB
+ ASSERT_NOK(s);
+ failed_add_file++;
+ } else {
+ ASSERT_OK(s);
+ success_add_file++;
+ }
+ } else {
+ if ((it != true_data.end() && it->first <= Key(range_end)) ||
+ need_flush || assigned_seqno > 0 || overlap_with_db) {
+ // This range overlaps with data that already exists in the DB
+ ASSERT_NOK(s);
+ failed_add_file++;
+ } else {
+ ASSERT_OK(s);
+ success_add_file++;
+ }
+ }
+ }
+
+ if (s.ok()) {
+ // Update true_data map to include the new inserted data
+ for (int k = range_start; k <= range_end; k++) {
+ true_data[Key(k)] = range_val;
+ }
+ }
+
+ // Flush / Compact the DB
+ if (i && i % 50 == 0) {
+ ASSERT_OK(Flush());
+ }
+ if (i && i % 75 == 0) {
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ }
+ }
+
+ printf("Total: %" ROCKSDB_PRIszt
+ " ranges\n"
+ "AddFile()|Success: %d ranges\n"
+ "AddFile()|RangeConflict: %d ranges\n"
+ "Put(): %d ranges\n",
+ key_ranges.size(), success_add_file, failed_add_file, memtable_add);
+
+ // Verify the correctness of the data
+ for (const auto& kv : true_data) {
+ ASSERT_EQ(Get(kv.first), kv.second);
+ }
+ printf("keys/values verified\n");
+ DestroyAndRecreateExternalSSTFilesDir();
+ } while (ChangeOptions(kSkipPlainTable | kSkipFIFOCompaction));
+}
+
+TEST_P(ExternalSSTFileTest, PickedLevel) {
+ env_->skip_fsync_ = true;
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = false;
+ options.level0_file_num_compaction_trigger = 4;
+ options.num_levels = 4;
+ DestroyAndReopen(options);
+
+ std::map<std::string, std::string> true_data;
+
+ // File 0 will go to the last level (L3)
+ ASSERT_OK(GenerateAndAddExternalFile(options, {1, 10}, -1, false, false, true,
+ false, false, &true_data));
+ EXPECT_EQ(FilesPerLevel(), "0,0,0,1");
+
+ // File 1 will go to level L2 (since it overlaps with file 0 in L3)
+ ASSERT_OK(GenerateAndAddExternalFile(options, {2, 9}, -1, false, false, true,
+ false, false, &true_data));
+ EXPECT_EQ(FilesPerLevel(), "0,0,1,1");
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ {"ExternalSSTFileTest::PickedLevel:0", "BackgroundCallCompaction:0"},
+ {"DBImpl::BackgroundCompaction:Start",
+ "ExternalSSTFileTest::PickedLevel:1"},
+ {"ExternalSSTFileTest::PickedLevel:2",
+ "DBImpl::BackgroundCompaction:NonTrivial:AfterRun"},
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // Flush 4 files containing the same keys
+ for (int i = 0; i < 4; i++) {
+ ASSERT_OK(Put(Key(3), Key(3) + "put"));
+ ASSERT_OK(Put(Key(8), Key(8) + "put"));
+ true_data[Key(3)] = Key(3) + "put";
+ true_data[Key(8)] = Key(8) + "put";
+ ASSERT_OK(Flush());
+ }
+
+ // Wait for BackgroundCompaction() to be called
+ TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevel:0");
+ TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevel:1");
+
+ EXPECT_EQ(FilesPerLevel(), "4,0,1,1");
+
+ // This file overlaps with file 0 (L3), file 1 (L2) and the
+ // output of compaction going to L1
+ ASSERT_OK(GenerateAndAddExternalFile(options, {4, 7}, -1, false, false, true,
+ false, false, &true_data));
+ EXPECT_EQ(FilesPerLevel(), "5,0,1,1");
+
+ // This file does not overlap with any file or with the running compaction
+ ASSERT_OK(GenerateAndAddExternalFile(options, {9000, 9001}, -1, false, false,
+ false, false, false, &true_data));
+ EXPECT_EQ(FilesPerLevel(), "5,0,1,2");
+
+ // Hold compaction from finishing
+ TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevel:2");
+
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ EXPECT_EQ(FilesPerLevel(), "1,1,1,2");
+
+ size_t kcnt = 0;
+ VerifyDBFromMap(true_data, &kcnt, false);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(ExternalSSTFileTest, PickedLevelBug) {
+ env_->skip_fsync_ = true;
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = false;
+ options.level0_file_num_compaction_trigger = 3;
+ options.num_levels = 2;
+ DestroyAndReopen(options);
+
+ std::vector<int> file_keys;
+
+ // file #1 in L0
+ file_keys = {0, 5, 7};
+ for (int k : file_keys) {
+ ASSERT_OK(Put(Key(k), Key(k)));
+ }
+ ASSERT_OK(Flush());
+
+ // file #2 in L0
+ file_keys = {4, 6, 8, 9};
+ for (int k : file_keys) {
+ ASSERT_OK(Put(Key(k), Key(k)));
+ }
+ ASSERT_OK(Flush());
+
+ // We have 2 overlapping files in L0
+ EXPECT_EQ(FilesPerLevel(), "2");
+
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::IngestExternalFile:AfterIncIngestFileCounter",
+ "ExternalSSTFileTest::PickedLevelBug:0"},
+ {"ExternalSSTFileTest::PickedLevelBug:1", "DBImpl::AddFile:MutexUnlock"},
+ {"ExternalSSTFileTest::PickedLevelBug:2",
+ "DBImpl::RunManualCompaction:0"},
+ {"ExternalSSTFileTest::PickedLevelBug:3",
+ "DBImpl::RunManualCompaction:1"}});
+
+ std::atomic<bool> bg_compact_started(false);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:Start",
+ [&](void* /*arg*/) { bg_compact_started.store(true); });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Status bg_compact_status;
+ Status bg_addfile_status;
+
+ {
+ // While writing the MANIFEST start a thread that will ask for compaction
+ ThreadGuard bg_compact(port::Thread([&]() {
+ bg_compact_status =
+ db_->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+ }));
+ TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevelBug:2");
+
+ // Start a thread that will ingest a new file
+ ThreadGuard bg_addfile(port::Thread([&]() {
+ file_keys = {1, 2, 3};
+ bg_addfile_status = GenerateAndAddExternalFile(options, file_keys, 1);
+ }));
+
+ // Wait for AddFile to start picking levels and writing MANIFEST
+ TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevelBug:0");
+
+ TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevelBug:3");
+
+ // We need to verify that no compactions can run while AddFile is
+ // ingesting the files into the levels it finds suitable. So we will
+ // wait for 2 seconds to give compactions a chance to run during
+ // this period, and then make sure that no compactions were able to run
+ env_->SleepForMicroseconds(1000000 * 2);
+ bool bg_compact_started_tmp = bg_compact_started.load();
+
+ // Hold AddFile from finishing writing the MANIFEST
+ TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevelBug:1");
+
+ // Check the statuses at the end so that, even if an ASSERT fails, the
+ // threads can still be joined before returning.
+ ASSERT_FALSE(bg_compact_started_tmp);
+ }
+
+ ASSERT_OK(bg_addfile_status);
+ ASSERT_OK(bg_compact_status);
+
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ int total_keys = 0;
+ Iterator* iter = db_->NewIterator(ReadOptions());
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ ASSERT_OK(iter->status());
+ total_keys++;
+ }
+ ASSERT_EQ(total_keys, 10);
+
+ delete iter;
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(ExternalSSTFileTest, IngestNonExistingFile) {
+ Options options = CurrentOptions();
+ DestroyAndReopen(options);
+
+ Status s = db_->IngestExternalFile({"non_existing_file"},
+ IngestExternalFileOptions());
+ ASSERT_NOK(s);
+
+ // Verify file deletion is not impacted (verify a bug fix)
+ ASSERT_OK(Put(Key(1), Key(1)));
+ ASSERT_OK(Put(Key(9), Key(9)));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put(Key(1), Key(1)));
+ ASSERT_OK(Put(Key(9), Key(9)));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+
+ // After full compaction, there should be only 1 file.
+ std::vector<std::string> files;
+ ASSERT_OK(env_->GetChildren(dbname_, &files));
+ int num_sst_files = 0;
+ for (auto& f : files) {
+ uint64_t number;
+ FileType type;
+ if (ParseFileName(f, &number, &type) && type == kTableFile) {
+ num_sst_files++;
+ }
+ }
+ ASSERT_EQ(1, num_sst_files);
+}
+
+#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+TEST_F(ExternalSSTFileTest, CompactDuringAddFileRandom) {
+ env_->skip_fsync_ = true;
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = false;
+ options.level0_file_num_compaction_trigger = 2;
+ options.num_levels = 2;
+ DestroyAndReopen(options);
+
+ std::function<void()> bg_compact = [&]() {
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ };
+
+ int range_id = 0;
+ std::vector<int> file_keys;
+ std::function<void()> bg_addfile = [&]() {
+ ASSERT_OK(GenerateAndAddExternalFile(options, file_keys, range_id));
+ };
+
+ const int num_of_ranges = 1000;
+ std::vector<port::Thread> threads;
+ while (range_id < num_of_ranges) {
+ int range_start = range_id * 10;
+ int range_end = range_start + 10;
+
+ file_keys.clear();
+ for (int k = range_start + 1; k < range_end; k++) {
+ file_keys.push_back(k);
+ }
+ ASSERT_OK(Put(Key(range_start), Key(range_start)));
+ ASSERT_OK(Put(Key(range_end), Key(range_end)));
+ ASSERT_OK(Flush());
+
+ if (range_id % 10 == 0) {
+ threads.emplace_back(bg_compact);
+ }
+ threads.emplace_back(bg_addfile);
+
+ for (auto& t : threads) {
+ t.join();
+ }
+ threads.clear();
+
+ range_id++;
+ }
+
+ for (int rid = 0; rid < num_of_ranges; rid++) {
+ int range_start = rid * 10;
+ int range_end = range_start + 10;
+
+ ASSERT_EQ(Get(Key(range_start)), Key(range_start)) << rid;
+ ASSERT_EQ(Get(Key(range_end)), Key(range_end)) << rid;
+ for (int k = range_start + 1; k < range_end; k++) {
+ std::string v = Key(k) + std::to_string(rid);
+ ASSERT_EQ(Get(Key(k)), v) << rid;
+ }
+ }
+}
+#endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+
+TEST_F(ExternalSSTFileTest, PickedLevelDynamic) {
+ env_->skip_fsync_ = true;
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = false;
+ options.level0_file_num_compaction_trigger = 4;
+ options.level_compaction_dynamic_level_bytes = true;
+ options.num_levels = 4;
+ DestroyAndReopen(options);
+ std::map<std::string, std::string> true_data;
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ {"ExternalSSTFileTest::PickedLevelDynamic:0",
+ "BackgroundCallCompaction:0"},
+ {"DBImpl::BackgroundCompaction:Start",
+ "ExternalSSTFileTest::PickedLevelDynamic:1"},
+ {"ExternalSSTFileTest::PickedLevelDynamic:2",
+ "DBImpl::BackgroundCompaction:NonTrivial:AfterRun"},
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // Flush 4 files containing the same keys
+ for (int i = 0; i < 4; i++) {
+ for (int k = 20; k <= 30; k++) {
+ ASSERT_OK(Put(Key(k), Key(k) + "put"));
+ true_data[Key(k)] = Key(k) + "put";
+ }
+ for (int k = 50; k <= 60; k++) {
+ ASSERT_OK(Put(Key(k), Key(k) + "put"));
+ true_data[Key(k)] = Key(k) + "put";
+ }
+ ASSERT_OK(Flush());
+ }
+
+ // Wait for BackgroundCompaction() to be called
+ TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevelDynamic:0");
+ TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevelDynamic:1");
+
+ // This file overlaps with the output of the compaction (going to L3)
+ // so the file will be added to L0 since L3 is the base level
+ ASSERT_OK(GenerateAndAddExternalFile(options, {31, 32, 33, 34}, -1, false,
+ false, true, false, false, &true_data));
+ EXPECT_EQ(FilesPerLevel(), "5");
+
+ // This file does not overlap with the currently running compaction
+ ASSERT_OK(GenerateAndAddExternalFile(options, {9000, 9001}, -1, false, false,
+ true, false, false, &true_data));
+ EXPECT_EQ(FilesPerLevel(), "5,0,0,1");
+
+ // Hold compaction from finishing
+ TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevelDynamic:2");
+
+ // Output of the compaction will go to L3
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ EXPECT_EQ(FilesPerLevel(), "1,0,0,2");
+
+ Close();
+ options.disable_auto_compactions = true;
+ Reopen(options);
+
+ ASSERT_OK(GenerateAndAddExternalFile(options, {1, 15, 19}, -1, false, false,
+ true, false, false, &true_data));
+ ASSERT_EQ(FilesPerLevel(), "1,0,0,3");
+
+ ASSERT_OK(GenerateAndAddExternalFile(options, {1000, 1001, 1002}, -1, false,
+ false, true, false, false, &true_data));
+ ASSERT_EQ(FilesPerLevel(), "1,0,0,4");
+
+ ASSERT_OK(GenerateAndAddExternalFile(options, {500, 600, 700}, -1, false,
+ false, true, false, false, &true_data));
+ ASSERT_EQ(FilesPerLevel(), "1,0,0,5");
+
+ // File 5 overlaps with file 2 (L3 / base level)
+ ASSERT_OK(GenerateAndAddExternalFile(options, {2, 10}, -1, false, false, true,
+ false, false, &true_data));
+ ASSERT_EQ(FilesPerLevel(), "2,0,0,5");
+
+ // File 6 overlaps with file 2 (L3 / base level) and file 5 (L0)
+ ASSERT_OK(GenerateAndAddExternalFile(options, {3, 9}, -1, false, false, true,
+ false, false, &true_data));
+ ASSERT_EQ(FilesPerLevel(), "3,0,0,5");
+
+ // Verify data in files
+ size_t kcnt = 0;
+ VerifyDBFromMap(true_data, &kcnt, false);
+
+ // Write range [5 => 10] to L0
+ for (int i = 5; i <= 10; i++) {
+ std::string k = Key(i);
+ std::string v = k + "put";
+ ASSERT_OK(Put(k, v));
+ true_data[k] = v;
+ }
+ ASSERT_OK(Flush());
+ ASSERT_EQ(FilesPerLevel(), "4,0,0,5");
+
+ // File 7 overlaps with file 4 (L3)
+ ASSERT_OK(GenerateAndAddExternalFile(options, {650, 651, 652}, -1, false,
+ false, true, false, false, &true_data));
+ ASSERT_EQ(FilesPerLevel(), "5,0,0,5");
+
+ VerifyDBFromMap(true_data, &kcnt, false);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(ExternalSSTFileTest, AddExternalSstFileWithCustomCompartor) {
+ Options options = CurrentOptions();
+ options.comparator = ReverseBytewiseComparator();
+ DestroyAndReopen(options);
+
+ SstFileWriter sst_file_writer(EnvOptions(), options);
+
+ // Generate files with these key ranges
+ // {14 -> 0}
+ // {24 -> 10}
+ // {34 -> 20}
+ // {44 -> 30}
+ // ..
+ std::vector<std::string> generated_files;
+ for (int i = 0; i < 10; i++) {
+ std::string file_name = sst_files_dir_ + env_->GenerateUniqueId();
+ ASSERT_OK(sst_file_writer.Open(file_name));
+
+ int range_end = i * 10;
+ int range_start = range_end + 15;
+ for (int k = (range_start - 1); k >= range_end; k--) {
+ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k)));
+ }
+ ExternalSstFileInfo file_info;
+ ASSERT_OK(sst_file_writer.Finish(&file_info));
+ generated_files.push_back(file_name);
+ }
+
+ std::vector<std::string> in_files;
+
+ // The 2nd and 3rd files in this list overlap with each other
+ in_files = {generated_files[0], generated_files[4], generated_files[5],
+ generated_files[7]};
+ ASSERT_NOK(DeprecatedAddFile(in_files));
+
+ // These 2 files don't overlap with each other
+ in_files = {generated_files[0], generated_files[2]};
+ ASSERT_OK(DeprecatedAddFile(in_files));
+
+ // These 2 files don't overlap with each other but overlap with keys in the DB
+ in_files = {generated_files[3], generated_files[7]};
+ ASSERT_NOK(DeprecatedAddFile(in_files));
+
+ // These files don't overlap with each other or with the DB key range
+ in_files = {generated_files[4], generated_files[6], generated_files[8]};
+ ASSERT_OK(DeprecatedAddFile(in_files));
+
+ for (int i = 0; i < 100; i++) {
+ if (i % 20 <= 14) {
+ ASSERT_EQ(Get(Key(i)), Key(i));
+ } else {
+ ASSERT_EQ(Get(Key(i)), "NOT_FOUND");
+ }
+ }
+}
+
+TEST_F(ExternalSSTFileTest, AddFileTrivialMoveBug) {
+ Options options = CurrentOptions();
+ options.num_levels = 3;
+ options.IncreaseParallelism(20);
+ DestroyAndReopen(options);
+
+ ASSERT_OK(GenerateAndAddExternalFile(options, {1, 4}, 1)); // L3
+ ASSERT_OK(GenerateAndAddExternalFile(options, {2, 3}, 2)); // L2
+
+ ASSERT_OK(GenerateAndAddExternalFile(options, {10, 14}, 3)); // L3
+ ASSERT_OK(GenerateAndAddExternalFile(options, {12, 13}, 4)); // L2
+
+ ASSERT_OK(GenerateAndAddExternalFile(options, {20, 24}, 5)); // L3
+ ASSERT_OK(GenerateAndAddExternalFile(options, {22, 23}, 6)); // L2
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "CompactionJob::Run():Start", [&](void* /*arg*/) {
+ // The file fits in L3 but overlaps with the running compaction, so it
+ // will be added to L2; a compaction could then trivially move it to L3
+ // and break LSM consistency
+ static std::atomic<bool> called = {false};
+ if (!called) {
+ called = true;
+ ASSERT_OK(dbfull()->SetOptions({{"max_bytes_for_level_base", "1"}}));
+ ASSERT_OK(GenerateAndAddExternalFile(options, {15, 16}, 7));
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ CompactRangeOptions cro;
+ cro.exclusive_manual_compaction = false;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(ExternalSSTFileTest, CompactAddedFiles) {
+ Options options = CurrentOptions();
+ options.num_levels = 3;
+ DestroyAndReopen(options);
+
+ ASSERT_OK(GenerateAndAddExternalFile(options, {1, 10}, 1)); // L3
+ ASSERT_OK(GenerateAndAddExternalFile(options, {2, 9}, 2)); // L2
+ ASSERT_OK(GenerateAndAddExternalFile(options, {3, 8}, 3)); // L1
+ ASSERT_OK(GenerateAndAddExternalFile(options, {4, 7}, 4)); // L0
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+}
+
+TEST_F(ExternalSSTFileTest, SstFileWriterNonSharedKeys) {
+ Options options = CurrentOptions();
+ DestroyAndReopen(options);
+ std::string file_path = sst_files_dir_ + "/not_shared";
+ SstFileWriter sst_file_writer(EnvOptions(), options);
+
+ std::string suffix(100, 'X');
+ ASSERT_OK(sst_file_writer.Open(file_path));
+ ASSERT_OK(sst_file_writer.Put("A" + suffix, "VAL"));
+ ASSERT_OK(sst_file_writer.Put("BB" + suffix, "VAL"));
+ ASSERT_OK(sst_file_writer.Put("CC" + suffix, "VAL"));
+ ASSERT_OK(sst_file_writer.Put("CXD" + suffix, "VAL"));
+ ASSERT_OK(sst_file_writer.Put("CZZZ" + suffix, "VAL"));
+ ASSERT_OK(sst_file_writer.Put("ZAAAX" + suffix, "VAL"));
+
+ ASSERT_OK(sst_file_writer.Finish());
+ ASSERT_OK(DeprecatedAddFile({file_path}));
+}
+
+TEST_F(ExternalSSTFileTest, WithUnorderedWrite) {
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::WriteImpl:UnorderedWriteAfterWriteWAL",
+ "ExternalSSTFileTest::WithUnorderedWrite:WaitWriteWAL"},
+ {"DBImpl::WaitForPendingWrites:BeforeBlock",
+ "DBImpl::WriteImpl:BeforeUnorderedWriteMemtable"}});
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::IngestExternalFile:NeedFlush", [&](void* need_flush) {
+ ASSERT_TRUE(*reinterpret_cast<bool*>(need_flush));
+ });
+
+ Options options = CurrentOptions();
+ options.unordered_write = true;
+ DestroyAndReopen(options);
+ ASSERT_OK(Put("foo", "v1"));
+ SyncPoint::GetInstance()->EnableProcessing();
+ port::Thread writer([&]() { ASSERT_OK(Put("bar", "v2")); });
+
+ TEST_SYNC_POINT("ExternalSSTFileTest::WithUnorderedWrite:WaitWriteWAL");
+ ASSERT_OK(GenerateAndAddExternalFile(options, {{"bar", "v3"}}, -1,
+ true /* allow_global_seqno */));
+ ASSERT_EQ(Get("bar"), "v3");
+
+ writer.join();
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+TEST_P(ExternalSSTFileTest, IngestFileWithGlobalSeqnoRandomized) {
+ env_->skip_fsync_ = true;
+ Options options = CurrentOptions();
+ options.IncreaseParallelism(20);
+ options.level0_slowdown_writes_trigger = 256;
+ options.level0_stop_writes_trigger = 256;
+
+ bool write_global_seqno = std::get<0>(GetParam());
+ bool verify_checksums_before_ingest = std::get<1>(GetParam());
+ for (int iter = 0; iter < 2; iter++) {
+ bool write_to_memtable = (iter == 0);
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ std::map<std::string, std::string> true_data;
+ for (int i = 0; i < 500; i++) {
+ std::vector<std::pair<std::string, std::string>> random_data;
+ for (int j = 0; j < 100; j++) {
+ std::string k = rnd.RandomString(rnd.Next() % 20);
+ std::string v = rnd.RandomString(rnd.Next() % 50);
+ random_data.emplace_back(k, v);
+ }
+
+ if (write_to_memtable && rnd.OneIn(4)) {
+ // 25% of writes go through memtable
+ for (auto& entry : random_data) {
+ ASSERT_OK(Put(entry.first, entry.second));
+ true_data[entry.first] = entry.second;
+ }
+ } else {
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, random_data, -1, true, write_global_seqno,
+ verify_checksums_before_ingest, false, true, &true_data));
+ }
+ }
+ size_t kcnt = 0;
+ VerifyDBFromMap(true_data, &kcnt, false);
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ VerifyDBFromMap(true_data, &kcnt, false);
+ }
+}
+#endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+
+TEST_P(ExternalSSTFileTest, IngestFileWithGlobalSeqnoAssignedLevel) {
+ Options options = CurrentOptions();
+ options.num_levels = 5;
+ options.disable_auto_compactions = true;
+ DestroyAndReopen(options);
+ std::vector<std::pair<std::string, std::string>> file_data;
+ std::map<std::string, std::string> true_data;
+
+ // Insert 100 -> 200 into the memtable
+ for (int i = 100; i <= 200; i++) {
+ ASSERT_OK(Put(Key(i), "memtable"));
+ true_data[Key(i)] = "memtable";
+ }
+
+ // Insert 0 -> 20 using AddFile
+ file_data.clear();
+ for (int i = 0; i <= 20; i++) {
+ file_data.emplace_back(Key(i), "L4");
+ }
+ bool write_global_seqno = std::get<0>(GetParam());
+ bool verify_checksums_before_ingest = std::get<1>(GetParam());
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, file_data, -1, true, write_global_seqno,
+ verify_checksums_before_ingest, false, false, &true_data));
+
+  // This file doesn't overlap with anything in the DB, so it will go to L4
+ ASSERT_EQ("0,0,0,0,1", FilesPerLevel());
+
+ // Insert 80 -> 130 using AddFile
+ file_data.clear();
+ for (int i = 80; i <= 130; i++) {
+ file_data.emplace_back(Key(i), "L0");
+ }
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, file_data, -1, true, write_global_seqno,
+ verify_checksums_before_ingest, false, false, &true_data));
+
+  // This file overlaps with the memtable, so ingestion will flush the
+  // memtable and add the file to L0
+ ASSERT_EQ("2,0,0,0,1", FilesPerLevel());
+
+ // Insert 30 -> 50 using AddFile
+ file_data.clear();
+ for (int i = 30; i <= 50; i++) {
+ file_data.emplace_back(Key(i), "L4");
+ }
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, file_data, -1, true, write_global_seqno,
+ verify_checksums_before_ingest, false, false, &true_data));
+
+  // This file doesn't overlap with anything in the DB and fits in L4 as well
+ ASSERT_EQ("2,0,0,0,2", FilesPerLevel());
+
+ // Insert 10 -> 40 using AddFile
+ file_data.clear();
+ for (int i = 10; i <= 40; i++) {
+ file_data.emplace_back(Key(i), "L3");
+ }
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, file_data, -1, true, write_global_seqno,
+ verify_checksums_before_ingest, false, false, &true_data));
+
+  // This file overlaps with files in L4, so we will ingest it into L3
+ ASSERT_EQ("2,0,0,1,2", FilesPerLevel());
+
+ size_t kcnt = 0;
+ VerifyDBFromMap(true_data, &kcnt, false);
+}
+
+TEST_P(ExternalSSTFileTest, IngestFileWithGlobalSeqnoMemtableFlush) {
+ Options options = CurrentOptions();
+ DestroyAndReopen(options);
+ uint64_t entries_in_memtable;
+ std::map<std::string, std::string> true_data;
+
+ for (int k : {10, 20, 40, 80}) {
+ ASSERT_OK(Put(Key(k), "memtable"));
+ true_data[Key(k)] = "memtable";
+ }
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kNumEntriesActiveMemTable,
+ &entries_in_memtable));
+ ASSERT_GE(entries_in_memtable, 1);
+
+ bool write_global_seqno = std::get<0>(GetParam());
+ bool verify_checksums_before_ingest = std::get<1>(GetParam());
+  // No need for a flush
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {90, 100, 110}, -1, true, write_global_seqno,
+ verify_checksums_before_ingest, false, false, &true_data));
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kNumEntriesActiveMemTable,
+ &entries_in_memtable));
+ ASSERT_GE(entries_in_memtable, 1);
+
+ // This file will flush the memtable
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {19, 20, 21}, -1, true, write_global_seqno,
+ verify_checksums_before_ingest, false, false, &true_data));
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kNumEntriesActiveMemTable,
+ &entries_in_memtable));
+ ASSERT_EQ(entries_in_memtable, 0);
+
+ for (int k : {200, 201, 205, 206}) {
+ ASSERT_OK(Put(Key(k), "memtable"));
+ true_data[Key(k)] = "memtable";
+ }
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kNumEntriesActiveMemTable,
+ &entries_in_memtable));
+ ASSERT_GE(entries_in_memtable, 1);
+
+  // No need for a flush; this file's keys fit between the memtable keys
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {202, 203, 204}, -1, true, write_global_seqno,
+ verify_checksums_before_ingest, false, false, &true_data));
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kNumEntriesActiveMemTable,
+ &entries_in_memtable));
+ ASSERT_GE(entries_in_memtable, 1);
+
+ // This file will flush the memtable
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {206, 207}, -1, true, write_global_seqno,
+ verify_checksums_before_ingest, false, false, &true_data));
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kNumEntriesActiveMemTable,
+ &entries_in_memtable));
+ ASSERT_EQ(entries_in_memtable, 0);
+
+ size_t kcnt = 0;
+ VerifyDBFromMap(true_data, &kcnt, false);
+}
+
+TEST_P(ExternalSSTFileTest, L0SortingIssue) {
+ Options options = CurrentOptions();
+ options.num_levels = 2;
+ DestroyAndReopen(options);
+ std::map<std::string, std::string> true_data;
+
+ ASSERT_OK(Put(Key(1), "memtable"));
+ ASSERT_OK(Put(Key(10), "memtable"));
+
+ bool write_global_seqno = std::get<0>(GetParam());
+ bool verify_checksums_before_ingest = std::get<1>(GetParam());
+  // No flush needed, no global seqno needed; ingest into L1
+ ASSERT_OK(
+ GenerateAndAddExternalFile(options, {7, 8}, -1, true, write_global_seqno,
+ verify_checksums_before_ingest, false, false));
+  // No flush needed, but a global seqno is needed; ingest into L0
+ ASSERT_OK(
+ GenerateAndAddExternalFile(options, {7, 8}, -1, true, write_global_seqno,
+ verify_checksums_before_ingest, false, false));
+ printf("%s\n", FilesPerLevel().c_str());
+
+ // Overwrite what we added using external files
+ ASSERT_OK(Put(Key(7), "memtable"));
+ ASSERT_OK(Put(Key(8), "memtable"));
+
+ // Read values from memtable
+ ASSERT_EQ(Get(Key(7)), "memtable");
+ ASSERT_EQ(Get(Key(8)), "memtable");
+
+ // Flush and read from L0
+ ASSERT_OK(Flush());
+ printf("%s\n", FilesPerLevel().c_str());
+ ASSERT_EQ(Get(Key(7)), "memtable");
+ ASSERT_EQ(Get(Key(8)), "memtable");
+}
+
+TEST_F(ExternalSSTFileTest, CompactionDeadlock) {
+ Options options = CurrentOptions();
+ options.num_levels = 2;
+ options.level0_file_num_compaction_trigger = 4;
+ options.level0_slowdown_writes_trigger = 4;
+ options.level0_stop_writes_trigger = 4;
+ DestroyAndReopen(options);
+
+  // Atomic counter of currently running bg threads
+ std::atomic<int> running_threads(0);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ {"DBImpl::DelayWrite:Wait", "ExternalSSTFileTest::DeadLock:0"},
+ {"ExternalSSTFileTest::DeadLock:1", "DBImpl::AddFile:Start"},
+ {"DBImpl::AddFile:MutexLock", "ExternalSSTFileTest::DeadLock:2"},
+ {"ExternalSSTFileTest::DeadLock:3", "BackgroundCallCompaction:0"},
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  // Start ingesting an external file in the background
+ ROCKSDB_NAMESPACE::port::Thread bg_ingest_file([&]() {
+ running_threads += 1;
+ ASSERT_OK(GenerateAndAddExternalFile(options, {5, 6}));
+ running_threads -= 1;
+ });
+
+ ASSERT_OK(Put(Key(1), "memtable"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put(Key(2), "memtable"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put(Key(3), "memtable"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put(Key(4), "memtable"));
+ ASSERT_OK(Flush());
+
+  // This thread will try to insert into the memtable, but since we have 4 L0
+  // files it will be blocked and will hold the writer thread
+ ROCKSDB_NAMESPACE::port::Thread bg_block_put([&]() {
+ running_threads += 1;
+ ASSERT_OK(Put(Key(10), "memtable"));
+ running_threads -= 1;
+ });
+
+ // Make sure DelayWrite is called first
+ TEST_SYNC_POINT("ExternalSSTFileTest::DeadLock:0");
+
+  // `DBImpl::AddFile:Start` will wait until we reach this point (the
+  // sync-point ordering used here is sketched after this test)
+ TEST_SYNC_POINT("ExternalSSTFileTest::DeadLock:1");
+
+  // Wait for IngestExternalFile() to start and acquire the mutex
+ TEST_SYNC_POINT("ExternalSSTFileTest::DeadLock:2");
+
+ // Now let compaction start
+ TEST_SYNC_POINT("ExternalSSTFileTest::DeadLock:3");
+
+  // Wait for at most 5 seconds; if the bg threads have not finished by then,
+  // we have hit the deadlock bug
+ for (int i = 0; i < 10; i++) {
+ if (running_threads.load() == 0) {
+ break;
+ }
+ // Make sure we do a "real sleep", not a mock one.
+ SystemClock::Default()->SleepForMicroseconds(500000);
+ }
+
+ ASSERT_EQ(running_threads.load(), 0);
+
+ bg_ingest_file.join();
+ bg_block_put.join();
+}
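+
+// The sync-point dependencies used throughout this file order independent
+// threads: LoadDependency({{"A", "B"}}) makes any thread that reaches
+// TEST_SYNC_POINT("B") block until some thread has passed
+// TEST_SYNC_POINT("A"). The test below is only an illustrative sketch of
+// that mechanism (it is not part of the original test plan and is disabled
+// on purpose); the sync-point names are made up for the example.
+TEST_F(ExternalSSTFileTest, DISABLED_SyncPointOrderingSketch) {
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"SyncPointOrderingSketch:IngestDone",
+        "SyncPointOrderingSketch:Verify"}});
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  Options options = CurrentOptions();
+  DestroyAndReopen(options);
+
+  // The ingesting thread signals the dependency once the file is added.
+  ROCKSDB_NAMESPACE::port::Thread ingest([&]() {
+    ASSERT_OK(GenerateAndAddExternalFile(options, {1, 2}));
+    TEST_SYNC_POINT("SyncPointOrderingSketch:IngestDone");
+  });
+
+  // Blocks until the ingesting thread has passed "IngestDone".
+  TEST_SYNC_POINT("SyncPointOrderingSketch:Verify");
+  ASSERT_NE(Get(Key(1)), "NOT_FOUND");
+
+  ingest.join();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}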
+
+TEST_F(ExternalSSTFileTest, DirtyExit) {
+ Options options = CurrentOptions();
+ DestroyAndReopen(options);
+ std::string file_path = sst_files_dir_ + "/dirty_exit";
+ std::unique_ptr<SstFileWriter> sst_file_writer;
+
+ // Destruct SstFileWriter without calling Finish()
+ sst_file_writer.reset(new SstFileWriter(EnvOptions(), options));
+ ASSERT_OK(sst_file_writer->Open(file_path));
+ sst_file_writer.reset();
+
+ // Destruct SstFileWriter with a failing Finish
+ sst_file_writer.reset(new SstFileWriter(EnvOptions(), options));
+ ASSERT_OK(sst_file_writer->Open(file_path));
+ ASSERT_NOK(sst_file_writer->Finish());
+}
+
+TEST_F(ExternalSSTFileTest, FileWithCFInfo) {
+ Options options = CurrentOptions();
+ CreateAndReopenWithCF({"koko", "toto"}, options);
+
+ SstFileWriter sfw_default(EnvOptions(), options, handles_[0]);
+ SstFileWriter sfw_cf1(EnvOptions(), options, handles_[1]);
+ SstFileWriter sfw_cf2(EnvOptions(), options, handles_[2]);
+ SstFileWriter sfw_unknown(EnvOptions(), options);
+
+ // default_cf.sst
+ const std::string cf_default_sst = sst_files_dir_ + "/default_cf.sst";
+ ASSERT_OK(sfw_default.Open(cf_default_sst));
+ ASSERT_OK(sfw_default.Put("K1", "V1"));
+ ASSERT_OK(sfw_default.Put("K2", "V2"));
+ ASSERT_OK(sfw_default.Finish());
+
+ // cf1.sst
+ const std::string cf1_sst = sst_files_dir_ + "/cf1.sst";
+ ASSERT_OK(sfw_cf1.Open(cf1_sst));
+ ASSERT_OK(sfw_cf1.Put("K3", "V1"));
+ ASSERT_OK(sfw_cf1.Put("K4", "V2"));
+ ASSERT_OK(sfw_cf1.Finish());
+
+ // cf_unknown.sst
+ const std::string unknown_sst = sst_files_dir_ + "/cf_unknown.sst";
+ ASSERT_OK(sfw_unknown.Open(unknown_sst));
+ ASSERT_OK(sfw_unknown.Put("K5", "V1"));
+ ASSERT_OK(sfw_unknown.Put("K6", "V2"));
+ ASSERT_OK(sfw_unknown.Finish());
+
+ IngestExternalFileOptions ifo;
+
+ // SST CF don't match
+ ASSERT_NOK(db_->IngestExternalFile(handles_[0], {cf1_sst}, ifo));
+ // SST CF don't match
+ ASSERT_NOK(db_->IngestExternalFile(handles_[2], {cf1_sst}, ifo));
+ // SST CF match
+ ASSERT_OK(db_->IngestExternalFile(handles_[1], {cf1_sst}, ifo));
+
+ // SST CF don't match
+ ASSERT_NOK(db_->IngestExternalFile(handles_[1], {cf_default_sst}, ifo));
+ // SST CF don't match
+ ASSERT_NOK(db_->IngestExternalFile(handles_[2], {cf_default_sst}, ifo));
+ // SST CF match
+ ASSERT_OK(db_->IngestExternalFile(handles_[0], {cf_default_sst}, ifo));
+
+ // SST CF unknown
+ ASSERT_OK(db_->IngestExternalFile(handles_[1], {unknown_sst}, ifo));
+ // SST CF unknown
+ ASSERT_OK(db_->IngestExternalFile(handles_[2], {unknown_sst}, ifo));
+ // SST CF unknown
+ ASSERT_OK(db_->IngestExternalFile(handles_[0], {unknown_sst}, ifo));
+
+ // Cannot ingest a file into a dropped CF
+ ASSERT_OK(db_->DropColumnFamily(handles_[1]));
+ ASSERT_NOK(db_->IngestExternalFile(handles_[1], {unknown_sst}, ifo));
+
+ // CF was not dropped, ok to Ingest
+ ASSERT_OK(db_->IngestExternalFile(handles_[2], {unknown_sst}, ifo));
+}
+
+/*
+ * Test and verify the functionality of ingestion_options.move_files and
+ * ingestion_options.failed_move_fall_back_to_copy
+ */
+TEST_P(ExternSSTFileLinkFailFallbackTest, LinkFailFallBackExternalSst) {
+ const bool fail_link = std::get<0>(GetParam());
+ const bool failed_move_fall_back_to_copy = std::get<1>(GetParam());
+ test_env_->set_fail_link(fail_link);
+ const EnvOptions env_options;
+ DestroyAndReopen(options_);
+ const int kNumKeys = 10000;
+ IngestExternalFileOptions ifo;
+ ifo.move_files = true;
+ ifo.failed_move_fall_back_to_copy = failed_move_fall_back_to_copy;
+
+ std::string file_path = sst_files_dir_ + "file1.sst";
+ // Create SstFileWriter for default column family
+ SstFileWriter sst_file_writer(env_options, options_);
+ ASSERT_OK(sst_file_writer.Open(file_path));
+ for (int i = 0; i < kNumKeys; i++) {
+ ASSERT_OK(sst_file_writer.Put(Key(i), Key(i) + "_value"));
+ }
+ ASSERT_OK(sst_file_writer.Finish());
+ uint64_t file_size = 0;
+ ASSERT_OK(env_->GetFileSize(file_path, &file_size));
+
+ bool copyfile = false;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "ExternalSstFileIngestionJob::Prepare:CopyFile",
+ [&](void* /* arg */) { copyfile = true; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ const Status s = db_->IngestExternalFile({file_path}, ifo);
+
+ ColumnFamilyHandleImpl* cfh =
+ static_cast<ColumnFamilyHandleImpl*>(dbfull()->DefaultColumnFamily());
+ ColumnFamilyData* cfd = cfh->cfd();
+ const InternalStats* internal_stats_ptr = cfd->internal_stats();
+ const std::vector<InternalStats::CompactionStats>& comp_stats =
+ internal_stats_ptr->TEST_GetCompactionStats();
+ uint64_t bytes_copied = 0;
+ uint64_t bytes_moved = 0;
+ for (const auto& stats : comp_stats) {
+ bytes_copied += stats.bytes_written;
+ bytes_moved += stats.bytes_moved;
+ }
+
+ if (!fail_link) {
+ // Link operation succeeds. External SST should be moved.
+ ASSERT_OK(s);
+ ASSERT_EQ(0, bytes_copied);
+ ASSERT_EQ(file_size, bytes_moved);
+ ASSERT_FALSE(copyfile);
+ } else {
+ // Link operation fails.
+ ASSERT_EQ(0, bytes_moved);
+ if (failed_move_fall_back_to_copy) {
+ ASSERT_OK(s);
+ // Copy file is true since a failed link falls back to copy file.
+ ASSERT_TRUE(copyfile);
+ ASSERT_EQ(file_size, bytes_copied);
+ } else {
+ ASSERT_TRUE(s.IsNotSupported());
+ // Copy file is false since a failed link does not fall back to copy file.
+ ASSERT_FALSE(copyfile);
+ ASSERT_EQ(0, bytes_copied);
+ }
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+class TestIngestExternalFileListener : public EventListener {
+ public:
+ void OnExternalFileIngested(DB* /*db*/,
+ const ExternalFileIngestionInfo& info) override {
+ ingested_files.push_back(info);
+ }
+
+ std::vector<ExternalFileIngestionInfo> ingested_files;
+};
+
+TEST_P(ExternalSSTFileTest, IngestionListener) {
+ Options options = CurrentOptions();
+ TestIngestExternalFileListener* listener =
+ new TestIngestExternalFileListener();
+ options.listeners.emplace_back(listener);
+ CreateAndReopenWithCF({"koko", "toto"}, options);
+
+ bool write_global_seqno = std::get<0>(GetParam());
+ bool verify_checksums_before_ingest = std::get<1>(GetParam());
+ // Ingest into default cf
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {1, 2}, -1, true, write_global_seqno,
+ verify_checksums_before_ingest, false, true, nullptr, handles_[0]));
+ ASSERT_EQ(listener->ingested_files.size(), 1);
+ ASSERT_EQ(listener->ingested_files.back().cf_name, "default");
+ ASSERT_EQ(listener->ingested_files.back().global_seqno, 0);
+ ASSERT_EQ(listener->ingested_files.back().table_properties.column_family_id,
+ 0);
+ ASSERT_EQ(listener->ingested_files.back().table_properties.column_family_name,
+ "default");
+
+ // Ingest into cf1
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {1, 2}, -1, true, write_global_seqno,
+ verify_checksums_before_ingest, false, true, nullptr, handles_[1]));
+ ASSERT_EQ(listener->ingested_files.size(), 2);
+ ASSERT_EQ(listener->ingested_files.back().cf_name, "koko");
+ ASSERT_EQ(listener->ingested_files.back().global_seqno, 0);
+ ASSERT_EQ(listener->ingested_files.back().table_properties.column_family_id,
+ 1);
+ ASSERT_EQ(listener->ingested_files.back().table_properties.column_family_name,
+ "koko");
+
+ // Ingest into cf2
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {1, 2}, -1, true, write_global_seqno,
+ verify_checksums_before_ingest, false, true, nullptr, handles_[2]));
+ ASSERT_EQ(listener->ingested_files.size(), 3);
+ ASSERT_EQ(listener->ingested_files.back().cf_name, "toto");
+ ASSERT_EQ(listener->ingested_files.back().global_seqno, 0);
+ ASSERT_EQ(listener->ingested_files.back().table_properties.column_family_id,
+ 2);
+ ASSERT_EQ(listener->ingested_files.back().table_properties.column_family_name,
+ "toto");
+}
+
+TEST_F(ExternalSSTFileTest, SnapshotInconsistencyBug) {
+ Options options = CurrentOptions();
+ DestroyAndReopen(options);
+ const int kNumKeys = 10000;
+
+ // Insert keys using normal path and take a snapshot
+ for (int i = 0; i < kNumKeys; i++) {
+ ASSERT_OK(Put(Key(i), Key(i) + "_V1"));
+ }
+ const Snapshot* snap = db_->GetSnapshot();
+
+ // Overwrite all keys using IngestExternalFile
+ std::string sst_file_path = sst_files_dir_ + "file1.sst";
+ SstFileWriter sst_file_writer(EnvOptions(), options);
+ ASSERT_OK(sst_file_writer.Open(sst_file_path));
+ for (int i = 0; i < kNumKeys; i++) {
+ ASSERT_OK(sst_file_writer.Put(Key(i), Key(i) + "_V2"));
+ }
+ ASSERT_OK(sst_file_writer.Finish());
+
+ IngestExternalFileOptions ifo;
+ ifo.move_files = true;
+ ASSERT_OK(db_->IngestExternalFile({sst_file_path}, ifo));
+
+ for (int i = 0; i < kNumKeys; i++) {
+ ASSERT_EQ(Get(Key(i), snap), Key(i) + "_V1");
+ ASSERT_EQ(Get(Key(i)), Key(i) + "_V2");
+ }
+
+ db_->ReleaseSnapshot(snap);
+}
+
+TEST_P(ExternalSSTFileTest, IngestBehind) {
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.num_levels = 3;
+ options.disable_auto_compactions = false;
+ DestroyAndReopen(options);
+ std::vector<std::pair<std::string, std::string>> file_data;
+ std::map<std::string, std::string> true_data;
+
+ // Insert 100 -> 200 into the memtable
+ for (int i = 100; i <= 200; i++) {
+ ASSERT_OK(Put(Key(i), "memtable"));
+ true_data[Key(i)] = "memtable";
+ }
+
+  // Insert 0 -> 20 using IngestExternalFile
+ file_data.clear();
+ for (int i = 0; i <= 20; i++) {
+ file_data.emplace_back(Key(i), "ingest_behind");
+ }
+
+ bool allow_global_seqno = true;
+ bool ingest_behind = true;
+ bool write_global_seqno = std::get<0>(GetParam());
+ bool verify_checksums_before_ingest = std::get<1>(GetParam());
+
+ // Can't ingest behind since allow_ingest_behind isn't set to true
+ ASSERT_NOK(GenerateAndAddExternalFile(
+ options, file_data, -1, allow_global_seqno, write_global_seqno,
+ verify_checksums_before_ingest, ingest_behind, false /*sort_data*/,
+ &true_data));
+
+ options.allow_ingest_behind = true;
+  // Check that we can still open the DB; num_levels should be sanitized
+  // to 3
+ options.num_levels = 2;
+ DestroyAndReopen(options);
+
+ options.num_levels = 3;
+ DestroyAndReopen(options);
+ // Insert 100 -> 200 into the memtable
+ for (int i = 100; i <= 200; i++) {
+ ASSERT_OK(Put(Key(i), "memtable"));
+ true_data[Key(i)] = "memtable";
+ }
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  // The universal compaction picker should place the output at the second
+  // level from the bottom (the bottom level is reserved for ingest-behind)
+ ASSERT_EQ("0,1", FilesPerLevel());
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, file_data, -1, allow_global_seqno, write_global_seqno,
+ verify_checksums_before_ingest, true /*ingest_behind*/,
+ false /*sort_data*/, &true_data));
+ ASSERT_EQ("0,1,1", FilesPerLevel());
+  // This time the ingest should fail as the file doesn't fit in the bottom
+  // level (the raw ingest-behind API is sketched after this test)
+ ASSERT_NOK(GenerateAndAddExternalFile(
+ options, file_data, -1, allow_global_seqno, write_global_seqno,
+ verify_checksums_before_ingest, true /*ingest_behind*/,
+ false /*sort_data*/, &true_data));
+ ASSERT_EQ("0,1,1", FilesPerLevel());
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ // bottom level should be empty
+ ASSERT_EQ("0,1", FilesPerLevel());
+
+ size_t kcnt = 0;
+ VerifyDBFromMap(true_data, &kcnt, false);
+}
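+
+// A minimal sketch of the raw ingest-behind API that the helper above wraps:
+// the DB must be opened with allow_ingest_behind=true (which reserves the
+// bottommost level), and the file is ingested with
+// IngestExternalFileOptions::ingest_behind=true so it lands beneath all
+// existing data with sequence number zero. This is only an illustration, not
+// part of the original test plan, and is disabled on purpose; the file name
+// is made up for the example.
+TEST_F(ExternalSSTFileTest, DISABLED_IngestBehindApiSketch) {
+  Options options = CurrentOptions();
+  options.allow_ingest_behind = true;  // reserve the bottom level
+  DestroyAndReopen(options);
+
+  std::string file_path = sst_files_dir_ + "/ingest_behind_sketch.sst";
+  SstFileWriter sst_file_writer(EnvOptions(), options);
+  ASSERT_OK(sst_file_writer.Open(file_path));
+  ASSERT_OK(sst_file_writer.Put(Key(1), "behind"));
+  ASSERT_OK(sst_file_writer.Finish());
+
+  IngestExternalFileOptions ifo;
+  ifo.ingest_behind = true;  // place the file below all existing data
+  ASSERT_OK(db_->IngestExternalFile({file_path}, ifo));
+  ASSERT_EQ(Get(Key(1)), "behind");
+}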
+
+TEST_F(ExternalSSTFileTest, SkipBloomFilter) {
+ Options options = CurrentOptions();
+
+ BlockBasedTableOptions table_options;
+ table_options.filter_policy.reset(NewBloomFilterPolicy(10));
+ table_options.cache_index_and_filter_blocks = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ // Create external SST file and include bloom filters
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ DestroyAndReopen(options);
+ {
+ std::string file_path = sst_files_dir_ + "sst_with_bloom.sst";
+ SstFileWriter sst_file_writer(EnvOptions(), options);
+ ASSERT_OK(sst_file_writer.Open(file_path));
+ ASSERT_OK(sst_file_writer.Put("Key1", "Value1"));
+ ASSERT_OK(sst_file_writer.Finish());
+
+ ASSERT_OK(
+ db_->IngestExternalFile({file_path}, IngestExternalFileOptions()));
+
+ ASSERT_EQ(Get("Key1"), "Value1");
+ ASSERT_GE(
+ options.statistics->getTickerCount(Tickers::BLOCK_CACHE_FILTER_ADD), 1);
+ }
+
+ // Create external SST file but skip bloom filters
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ DestroyAndReopen(options);
+ {
+ std::string file_path = sst_files_dir_ + "sst_with_no_bloom.sst";
+ SstFileWriter sst_file_writer(EnvOptions(), options, nullptr, true,
+ Env::IOPriority::IO_TOTAL,
+ true /* skip_filters */);
+ ASSERT_OK(sst_file_writer.Open(file_path));
+ ASSERT_OK(sst_file_writer.Put("Key1", "Value1"));
+ ASSERT_OK(sst_file_writer.Finish());
+
+ ASSERT_OK(
+ db_->IngestExternalFile({file_path}, IngestExternalFileOptions()));
+
+ ASSERT_EQ(Get("Key1"), "Value1");
+ ASSERT_EQ(
+ options.statistics->getTickerCount(Tickers::BLOCK_CACHE_FILTER_ADD), 0);
+ }
+}
+
+TEST_F(ExternalSSTFileTest, IngestFileWrittenWithCompressionDictionary) {
+ if (!ZSTD_Supported()) {
+ return;
+ }
+ const int kNumEntries = 1 << 10;
+ const int kNumBytesPerEntry = 1 << 10;
+ Options options = CurrentOptions();
+ options.compression = kZSTD;
+ options.compression_opts.max_dict_bytes = 1 << 14; // 16KB
+ options.compression_opts.zstd_max_train_bytes = 1 << 18; // 256KB
+ DestroyAndReopen(options);
+
+ std::atomic<int> num_compression_dicts(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "BlockBasedTableBuilder::WriteCompressionDictBlock:RawDict",
+ [&](void* /* arg */) { ++num_compression_dicts; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Random rnd(301);
+ std::vector<std::pair<std::string, std::string>> random_data;
+ for (int i = 0; i < kNumEntries; i++) {
+ std::string val = rnd.RandomString(kNumBytesPerEntry);
+ random_data.emplace_back(Key(i), std::move(val));
+ }
+ ASSERT_OK(GenerateAndAddExternalFile(options, std::move(random_data)));
+ ASSERT_EQ(1, num_compression_dicts);
+}
+
+class ExternalSSTBlockChecksumTest
+ : public ExternalSSTFileTestBase,
+ public testing::WithParamInterface<uint32_t> {};
+
+INSTANTIATE_TEST_CASE_P(FormatVersions, ExternalSSTBlockChecksumTest,
+ testing::ValuesIn(test::kFooterFormatVersionsToTest));
+
+// Very slow, not worth the cost to run regularly
+TEST_P(ExternalSSTBlockChecksumTest, DISABLED_HugeBlockChecksum) {
+ BlockBasedTableOptions table_options;
+ table_options.format_version = GetParam();
+ for (auto t : GetSupportedChecksums()) {
+ table_options.checksum = t;
+ Options options = CurrentOptions();
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ SstFileWriter sst_file_writer(EnvOptions(), options);
+
+    // 2^32 - 1; this will lead to a data block with more than 2^32 bytes
+ size_t huge_size = std::numeric_limits<uint32_t>::max();
+
+ std::string f = sst_files_dir_ + "f.sst";
+ ASSERT_OK(sst_file_writer.Open(f));
+ {
+ Random64 r(123);
+ std::string huge(huge_size, 0);
+ for (size_t j = 0; j + 7 < huge_size; j += 8) {
+ EncodeFixed64(&huge[j], r.Next());
+ }
+ ASSERT_OK(sst_file_writer.Put("Huge", huge));
+ }
+
+ ExternalSstFileInfo f_info;
+ ASSERT_OK(sst_file_writer.Finish(&f_info));
+ ASSERT_GT(f_info.file_size, uint64_t{huge_size} + 10);
+
+ SstFileReader sst_file_reader(options);
+ ASSERT_OK(sst_file_reader.Open(f));
+ ASSERT_OK(sst_file_reader.VerifyChecksum());
+ }
+}
+
+TEST_P(ExternalSSTFileTest, IngestFilesIntoMultipleColumnFamilies_Success) {
+ std::unique_ptr<FaultInjectionTestEnv> fault_injection_env(
+ new FaultInjectionTestEnv(env_));
+ Options options = CurrentOptions();
+ options.env = fault_injection_env.get();
+ CreateAndReopenWithCF({"pikachu", "eevee"}, options);
+
+ // Exercise different situations in different column families: two are empty
+ // (so no new sequence number is needed), but at least one overlaps with the
+ // DB and needs to bump the sequence number.
+ ASSERT_OK(db_->Put(WriteOptions(), "foo1", "oldvalue"));
+
+ std::vector<ColumnFamilyHandle*> column_families;
+ column_families.push_back(handles_[0]);
+ column_families.push_back(handles_[1]);
+ column_families.push_back(handles_[2]);
+ std::vector<IngestExternalFileOptions> ifos(column_families.size());
+ for (auto& ifo : ifos) {
+ ifo.allow_global_seqno = true; // Always allow global_seqno
+ // May or may not write global_seqno
+ ifo.write_global_seqno = std::get<0>(GetParam());
+ // Whether to verify checksums before ingestion
+ ifo.verify_checksums_before_ingest = std::get<1>(GetParam());
+ }
+ std::vector<std::vector<std::pair<std::string, std::string>>> data;
+ data.push_back(
+ {std::make_pair("foo1", "fv1"), std::make_pair("foo2", "fv2")});
+ data.push_back(
+ {std::make_pair("bar1", "bv1"), std::make_pair("bar2", "bv2")});
+ data.push_back(
+ {std::make_pair("bar3", "bv3"), std::make_pair("bar4", "bv4")});
+
+ // Resize the true_data vector upon construction to avoid re-alloc
+ std::vector<std::map<std::string, std::string>> true_data(
+ column_families.size());
+ ASSERT_OK(GenerateAndAddExternalFiles(options, column_families, ifos, data,
+ -1, true, true_data));
+ Close();
+ ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu", "eevee"},
+ options);
+ ASSERT_EQ(3, handles_.size());
+ int cf = 0;
+ for (const auto& verify_map : true_data) {
+ for (const auto& elem : verify_map) {
+ const std::string& key = elem.first;
+ const std::string& value = elem.second;
+ ASSERT_EQ(value, Get(cf, key));
+ }
+ ++cf;
+ }
+ Close();
+ Destroy(options, true /* delete_cf_paths */);
+}
+
+TEST_P(ExternalSSTFileTest,
+ IngestFilesIntoMultipleColumnFamilies_NoMixedStateWithSnapshot) {
+ std::unique_ptr<FaultInjectionTestEnv> fault_injection_env(
+ new FaultInjectionTestEnv(env_));
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->LoadDependency({
+ {"DBImpl::IngestExternalFiles:InstallSVForFirstCF:0",
+ "ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_MixedState:"
+ "BeforeRead"},
+ {"ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_MixedState:"
+ "AfterRead",
+ "DBImpl::IngestExternalFiles:InstallSVForFirstCF:1"},
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ Options options = CurrentOptions();
+ options.env = fault_injection_env.get();
+ CreateAndReopenWithCF({"pikachu", "eevee"}, options);
+ const std::vector<std::map<std::string, std::string>> data_before_ingestion =
+ {{{"foo1", "fv1_0"}, {"foo2", "fv2_0"}, {"foo3", "fv3_0"}},
+ {{"bar1", "bv1_0"}, {"bar2", "bv2_0"}, {"bar3", "bv3_0"}},
+ {{"bar4", "bv4_0"}, {"bar5", "bv5_0"}, {"bar6", "bv6_0"}}};
+ for (size_t i = 0; i != handles_.size(); ++i) {
+ int cf = static_cast<int>(i);
+ const auto& orig_data = data_before_ingestion[i];
+ for (const auto& kv : orig_data) {
+ ASSERT_OK(Put(cf, kv.first, kv.second));
+ }
+ ASSERT_OK(Flush(cf));
+ }
+
+ std::vector<ColumnFamilyHandle*> column_families;
+ column_families.push_back(handles_[0]);
+ column_families.push_back(handles_[1]);
+ column_families.push_back(handles_[2]);
+ std::vector<IngestExternalFileOptions> ifos(column_families.size());
+ for (auto& ifo : ifos) {
+ ifo.allow_global_seqno = true; // Always allow global_seqno
+ // May or may not write global_seqno
+ ifo.write_global_seqno = std::get<0>(GetParam());
+ // Whether to verify checksums before ingestion
+ ifo.verify_checksums_before_ingest = std::get<1>(GetParam());
+ }
+ std::vector<std::vector<std::pair<std::string, std::string>>> data;
+ data.push_back(
+ {std::make_pair("foo1", "fv1"), std::make_pair("foo2", "fv2")});
+ data.push_back(
+ {std::make_pair("bar1", "bv1"), std::make_pair("bar2", "bv2")});
+ data.push_back(
+ {std::make_pair("bar3", "bv3"), std::make_pair("bar4", "bv4")});
+ // Resize the true_data vector upon construction to avoid re-alloc
+ std::vector<std::map<std::string, std::string>> true_data(
+ column_families.size());
+ // Take snapshot before ingestion starts
+ ReadOptions read_opts;
+ read_opts.total_order_seek = true;
+ read_opts.snapshot = dbfull()->GetSnapshot();
+ std::vector<Iterator*> iters(handles_.size());
+
+ // Range scan checks first kv of each CF before ingestion starts.
+ for (size_t i = 0; i != handles_.size(); ++i) {
+ iters[i] = dbfull()->NewIterator(read_opts, handles_[i]);
+ iters[i]->SeekToFirst();
+ ASSERT_TRUE(iters[i]->Valid());
+ const std::string& key = iters[i]->key().ToString();
+ const std::string& value = iters[i]->value().ToString();
+ const std::map<std::string, std::string>& orig_data =
+ data_before_ingestion[i];
+ std::map<std::string, std::string>::const_iterator it = orig_data.find(key);
+ ASSERT_NE(orig_data.end(), it);
+ ASSERT_EQ(it->second, value);
+ iters[i]->Next();
+ }
+ port::Thread ingest_thread([&]() {
+ ASSERT_OK(GenerateAndAddExternalFiles(options, column_families, ifos, data,
+ -1, true, true_data));
+ });
+ TEST_SYNC_POINT(
+ "ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_MixedState:"
+ "BeforeRead");
+ // Should see only data before ingestion
+ for (size_t i = 0; i != handles_.size(); ++i) {
+ const auto& orig_data = data_before_ingestion[i];
+ for (; iters[i]->Valid(); iters[i]->Next()) {
+ const std::string& key = iters[i]->key().ToString();
+ const std::string& value = iters[i]->value().ToString();
+ std::map<std::string, std::string>::const_iterator it =
+ orig_data.find(key);
+ ASSERT_NE(orig_data.end(), it);
+ ASSERT_EQ(it->second, value);
+ }
+ }
+ TEST_SYNC_POINT(
+ "ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_MixedState:"
+ "AfterRead");
+ ingest_thread.join();
+ for (auto* iter : iters) {
+ delete iter;
+ }
+ iters.clear();
+ dbfull()->ReleaseSnapshot(read_opts.snapshot);
+
+ Close();
+ ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu", "eevee"},
+ options);
+ // Should see consistent state after ingestion for all column families even
+ // without snapshot.
+ ASSERT_EQ(3, handles_.size());
+ int cf = 0;
+ for (const auto& verify_map : true_data) {
+ for (const auto& elem : verify_map) {
+ const std::string& key = elem.first;
+ const std::string& value = elem.second;
+ ASSERT_EQ(value, Get(cf, key));
+ }
+ ++cf;
+ }
+ Close();
+ Destroy(options, true /* delete_cf_paths */);
+}
+
+TEST_P(ExternalSSTFileTest, IngestFilesIntoMultipleColumnFamilies_PrepareFail) {
+ std::unique_ptr<FaultInjectionTestEnv> fault_injection_env(
+ new FaultInjectionTestEnv(env_));
+ Options options = CurrentOptions();
+ options.env = fault_injection_env.get();
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->LoadDependency({
+ {"DBImpl::IngestExternalFiles:BeforeLastJobPrepare:0",
+ "ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_PrepareFail:"
+ "0"},
+ {"ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies:PrepareFail:"
+ "1",
+ "DBImpl::IngestExternalFiles:BeforeLastJobPrepare:1"},
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ CreateAndReopenWithCF({"pikachu", "eevee"}, options);
+ std::vector<ColumnFamilyHandle*> column_families;
+ column_families.push_back(handles_[0]);
+ column_families.push_back(handles_[1]);
+ column_families.push_back(handles_[2]);
+ std::vector<IngestExternalFileOptions> ifos(column_families.size());
+ for (auto& ifo : ifos) {
+ ifo.allow_global_seqno = true; // Always allow global_seqno
+ // May or may not write global_seqno
+ ifo.write_global_seqno = std::get<0>(GetParam());
+ // Whether to verify block checksums before ingest
+ ifo.verify_checksums_before_ingest = std::get<1>(GetParam());
+ }
+ std::vector<std::vector<std::pair<std::string, std::string>>> data;
+ data.push_back(
+ {std::make_pair("foo1", "fv1"), std::make_pair("foo2", "fv2")});
+ data.push_back(
+ {std::make_pair("bar1", "bv1"), std::make_pair("bar2", "bv2")});
+ data.push_back(
+ {std::make_pair("bar3", "bv3"), std::make_pair("bar4", "bv4")});
+
+ // Resize the true_data vector upon construction to avoid re-alloc
+ std::vector<std::map<std::string, std::string>> true_data(
+ column_families.size());
+ port::Thread ingest_thread([&]() {
+ ASSERT_NOK(GenerateAndAddExternalFiles(options, column_families, ifos, data,
+ -1, true, true_data));
+ });
+ TEST_SYNC_POINT(
+ "ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_PrepareFail:"
+ "0");
+ fault_injection_env->SetFilesystemActive(false);
+ TEST_SYNC_POINT(
+ "ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies:PrepareFail:"
+ "1");
+ ingest_thread.join();
+
+ fault_injection_env->SetFilesystemActive(true);
+ Close();
+ ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu", "eevee"},
+ options);
+ ASSERT_EQ(3, handles_.size());
+ int cf = 0;
+ for (const auto& verify_map : true_data) {
+ for (const auto& elem : verify_map) {
+ const std::string& key = elem.first;
+ ASSERT_EQ("NOT_FOUND", Get(cf, key));
+ }
+ ++cf;
+ }
+ Close();
+ Destroy(options, true /* delete_cf_paths */);
+}
+
+TEST_P(ExternalSSTFileTest, IngestFilesIntoMultipleColumnFamilies_CommitFail) {
+ std::unique_ptr<FaultInjectionTestEnv> fault_injection_env(
+ new FaultInjectionTestEnv(env_));
+ Options options = CurrentOptions();
+ options.env = fault_injection_env.get();
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->LoadDependency({
+ {"DBImpl::IngestExternalFiles:BeforeJobsRun:0",
+ "ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_CommitFail:"
+ "0"},
+ {"ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_CommitFail:"
+ "1",
+ "DBImpl::IngestExternalFiles:BeforeJobsRun:1"},
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ CreateAndReopenWithCF({"pikachu", "eevee"}, options);
+ std::vector<ColumnFamilyHandle*> column_families;
+ column_families.push_back(handles_[0]);
+ column_families.push_back(handles_[1]);
+ column_families.push_back(handles_[2]);
+ std::vector<IngestExternalFileOptions> ifos(column_families.size());
+ for (auto& ifo : ifos) {
+ ifo.allow_global_seqno = true; // Always allow global_seqno
+ // May or may not write global_seqno
+ ifo.write_global_seqno = std::get<0>(GetParam());
+ // Whether to verify block checksums before ingestion
+ ifo.verify_checksums_before_ingest = std::get<1>(GetParam());
+ }
+ std::vector<std::vector<std::pair<std::string, std::string>>> data;
+ data.push_back(
+ {std::make_pair("foo1", "fv1"), std::make_pair("foo2", "fv2")});
+ data.push_back(
+ {std::make_pair("bar1", "bv1"), std::make_pair("bar2", "bv2")});
+ data.push_back(
+ {std::make_pair("bar3", "bv3"), std::make_pair("bar4", "bv4")});
+ // Resize the true_data vector upon construction to avoid re-alloc
+ std::vector<std::map<std::string, std::string>> true_data(
+ column_families.size());
+ port::Thread ingest_thread([&]() {
+ ASSERT_NOK(GenerateAndAddExternalFiles(options, column_families, ifos, data,
+ -1, true, true_data));
+ });
+ TEST_SYNC_POINT(
+ "ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_CommitFail:"
+ "0");
+ fault_injection_env->SetFilesystemActive(false);
+ TEST_SYNC_POINT(
+ "ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_CommitFail:"
+ "1");
+ ingest_thread.join();
+
+ fault_injection_env->SetFilesystemActive(true);
+ Close();
+ ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu", "eevee"},
+ options);
+ ASSERT_EQ(3, handles_.size());
+ int cf = 0;
+ for (const auto& verify_map : true_data) {
+ for (const auto& elem : verify_map) {
+ const std::string& key = elem.first;
+ ASSERT_EQ("NOT_FOUND", Get(cf, key));
+ }
+ ++cf;
+ }
+ Close();
+ Destroy(options, true /* delete_cf_paths */);
+}
+
+TEST_P(ExternalSSTFileTest,
+ IngestFilesIntoMultipleColumnFamilies_PartialManifestWriteFail) {
+ std::unique_ptr<FaultInjectionTestEnv> fault_injection_env(
+ new FaultInjectionTestEnv(env_));
+ Options options = CurrentOptions();
+ options.env = fault_injection_env.get();
+
+ CreateAndReopenWithCF({"pikachu", "eevee"}, options);
+
+ SyncPoint::GetInstance()->ClearTrace();
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->LoadDependency({
+ {"VersionSet::ProcessManifestWrites:BeforeWriteLastVersionEdit:0",
+ "ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_"
+ "PartialManifestWriteFail:0"},
+ {"ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_"
+ "PartialManifestWriteFail:1",
+ "VersionSet::ProcessManifestWrites:BeforeWriteLastVersionEdit:1"},
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ std::vector<ColumnFamilyHandle*> column_families;
+ column_families.push_back(handles_[0]);
+ column_families.push_back(handles_[1]);
+ column_families.push_back(handles_[2]);
+ std::vector<IngestExternalFileOptions> ifos(column_families.size());
+ for (auto& ifo : ifos) {
+ ifo.allow_global_seqno = true; // Always allow global_seqno
+ // May or may not write global_seqno
+ ifo.write_global_seqno = std::get<0>(GetParam());
+ // Whether to verify block checksums before ingestion
+ ifo.verify_checksums_before_ingest = std::get<1>(GetParam());
+ }
+ std::vector<std::vector<std::pair<std::string, std::string>>> data;
+ data.push_back(
+ {std::make_pair("foo1", "fv1"), std::make_pair("foo2", "fv2")});
+ data.push_back(
+ {std::make_pair("bar1", "bv1"), std::make_pair("bar2", "bv2")});
+ data.push_back(
+ {std::make_pair("bar3", "bv3"), std::make_pair("bar4", "bv4")});
+ // Resize the true_data vector upon construction to avoid re-alloc
+ std::vector<std::map<std::string, std::string>> true_data(
+ column_families.size());
+ port::Thread ingest_thread([&]() {
+ ASSERT_NOK(GenerateAndAddExternalFiles(options, column_families, ifos, data,
+ -1, true, true_data));
+ });
+ TEST_SYNC_POINT(
+ "ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_"
+ "PartialManifestWriteFail:0");
+ fault_injection_env->SetFilesystemActive(false);
+ TEST_SYNC_POINT(
+ "ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_"
+ "PartialManifestWriteFail:1");
+ ingest_thread.join();
+
+ ASSERT_OK(fault_injection_env->DropUnsyncedFileData());
+ fault_injection_env->SetFilesystemActive(true);
+ Close();
+ ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu", "eevee"},
+ options);
+ ASSERT_EQ(3, handles_.size());
+ int cf = 0;
+ for (const auto& verify_map : true_data) {
+ for (const auto& elem : verify_map) {
+ const std::string& key = elem.first;
+ ASSERT_EQ("NOT_FOUND", Get(cf, key));
+ }
+ ++cf;
+ }
+ Close();
+ Destroy(options, true /* delete_cf_paths */);
+}
+
+TEST_P(ExternalSSTFileTest, IngestFilesTriggerFlushingWithTwoWriteQueue) {
+ Options options = CurrentOptions();
+ // Use large buffer to avoid memtable flush
+ options.write_buffer_size = 1024 * 1024;
+ options.two_write_queues = true;
+ DestroyAndReopen(options);
+
+ ASSERT_OK(dbfull()->Put(WriteOptions(), "1000", "v1"));
+ ASSERT_OK(dbfull()->Put(WriteOptions(), "1001", "v1"));
+ ASSERT_OK(dbfull()->Put(WriteOptions(), "9999", "v1"));
+
+  // Put one key that overlaps with keys in the memtable. It will trigger a
+  // memtable flush, which requires this thread to currently be at the front
+  // of the 2nd write queue. We must make sure that it won't enter the 2nd
+  // write queue a second time.
+ std::vector<std::pair<std::string, std::string>> data;
+ data.push_back(std::make_pair("1001", "v2"));
+ ASSERT_OK(GenerateAndAddExternalFile(options, data, -1, true));
+}
+
+TEST_P(ExternalSSTFileTest, DeltaEncodingWhileGlobalSeqnoPresent) {
+ Options options = CurrentOptions();
+ DestroyAndReopen(options);
+ constexpr size_t kValueSize = 8;
+ Random rnd(301);
+ std::string value = rnd.RandomString(kValueSize);
+
+  // Write some keys to make the global seqno larger than zero
+ for (int i = 0; i < 10; i++) {
+ ASSERT_OK(Put("ab" + Key(i), value));
+ }
+ // Get a Snapshot to make RocksDB assign global seqno to ingested sst files.
+ auto snap = dbfull()->GetSnapshot();
+
+ std::string fname = sst_files_dir_ + "test_file";
+ ROCKSDB_NAMESPACE::SstFileWriter writer(EnvOptions(), options);
+ ASSERT_OK(writer.Open(fname));
+ std::string key1 = "ab";
+ std::string key2 = "ab";
+
+  // Make the prefix of key2 the same as key1 with a packed zero seqno
+  // appended. The tail of every internal key is composed as
+  // (seqno << 8 | value_type); here the packed value is 1, since kTypeValue
+  // is 1 and the seqno is 0.
+
+ PutFixed64(&key2, PackSequenceAndType(0, kTypeValue));
+ key2 += "cdefghijkl";
+
+ ASSERT_OK(writer.Put(key1, value));
+ ASSERT_OK(writer.Put(key2, value));
+
+ ExternalSstFileInfo info;
+ ASSERT_OK(writer.Finish(&info));
+
+ ASSERT_OK(dbfull()->IngestExternalFile({info.file_path},
+ IngestExternalFileOptions()));
+ dbfull()->ReleaseSnapshot(snap);
+ ASSERT_EQ(value, Get(key1));
+  // Without the fix for the delta-encoding bug, this lookup would fail here
+ ASSERT_EQ(value, Get(key2));
+}
+
+TEST_P(ExternalSSTFileTest,
+ DeltaEncodingWhileGlobalSeqnoPresentIteratorSwitch) {
+ // Regression test for bug where global seqno corrupted the shared bytes
+ // buffer when switching from reverse iteration to forward iteration.
+ constexpr size_t kValueSize = 8;
+ Options options = CurrentOptions();
+
+ Random rnd(301);
+ std::string value = rnd.RandomString(kValueSize);
+
+ std::string key0 = "aa";
+ std::string key1 = "ab";
+  // Make the prefix of key2 the same as key1 with a packed zero seqno
+  // appended. The tail of every internal key is composed as
+  // (seqno << 8 | value_type); here the packed value is 1, since kTypeValue
+  // is 1 and the seqno is 0. A worked example of this packing follows this
+  // test.
+ std::string key2 = "ab";
+ PutFixed64(&key2, PackSequenceAndType(0, kTypeValue));
+ key2 += "cdefghijkl";
+ std::string key3 = key2 + "_";
+
+  // Write a key to make the global seqno larger than zero
+ ASSERT_OK(Put(key0, value));
+
+ std::string fname = sst_files_dir_ + "test_file";
+ ROCKSDB_NAMESPACE::SstFileWriter writer(EnvOptions(), options);
+ ASSERT_OK(writer.Open(fname));
+
+  // key0 is a dummy to ensure the turnaround point (key1) comes from the
+  // Prev() cache rather than from the block (restart keys are pinned in the
+  // block).
+ ASSERT_OK(writer.Put(key0, value));
+ ASSERT_OK(writer.Put(key1, value));
+ ASSERT_OK(writer.Put(key2, value));
+ ASSERT_OK(writer.Put(key3, value));
+
+ ExternalSstFileInfo info;
+ ASSERT_OK(writer.Finish(&info));
+
+ ASSERT_OK(dbfull()->IngestExternalFile({info.file_path},
+ IngestExternalFileOptions()));
+ ReadOptions read_opts;
+  // Prevents a Seek() when switching directions; such a Seek() would
+  // circumvent the bug.
+ read_opts.total_order_seek = true;
+ Iterator* iter = db_->NewIterator(read_opts);
+ // Scan backwards to key2. File iterator will then be positioned at key1.
+ iter->Seek(key3);
+ ASSERT_EQ(key3, iter->key());
+ iter->Prev();
+ ASSERT_EQ(key2, iter->key());
+ // Scan forwards and make sure key3 is present. Previously key3 would be
+ // corrupted by the global seqno from key1.
+ iter->Next();
+ ASSERT_EQ(key3, iter->key());
+ delete iter;
+}
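+
+// A small worked example of the internal-key tail packing referenced in the
+// two tests above: the 8-byte tail is (seqno << 8) | value_type, so a zero
+// seqno combined with kTypeValue (which is 1) packs to the value 1. This is
+// only an illustration, not part of the original test plan, and is disabled
+// on purpose.
+TEST_F(ExternalSSTFileTest, DISABLED_PackedKeyTailSketch) {
+  std::string tail;
+  PutFixed64(&tail, PackSequenceAndType(0, kTypeValue));
+
+  std::string expected;
+  PutFixed64(&expected, 1);  // (0 << 8) | kTypeValue == 1
+  ASSERT_EQ(expected, tail);
+}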
+
+INSTANTIATE_TEST_CASE_P(ExternalSSTFileTest, ExternalSSTFileTest,
+ testing::Values(std::make_tuple(false, false),
+ std::make_tuple(false, true),
+ std::make_tuple(true, false),
+ std::make_tuple(true, true)));
+
+INSTANTIATE_TEST_CASE_P(ExternSSTFileLinkFailFallbackTest,
+ ExternSSTFileLinkFailFallbackTest,
+ testing::Values(std::make_tuple(true, false),
+ std::make_tuple(true, true),
+ std::make_tuple(false, false)));
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+ fprintf(stderr,
+ "SKIPPED as External SST File Writer and Ingestion are not supported "
+ "in ROCKSDB_LITE\n");
+ return 0;
+}
+
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/fault_injection_test.cc b/src/rocksdb/db/fault_injection_test.cc
new file mode 100644
index 000000000..ddd4b47cc
--- /dev/null
+++ b/src/rocksdb/db/fault_injection_test.cc
@@ -0,0 +1,637 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright 2014 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+// This test uses a custom Env to keep track of the state of a filesystem as of
+// the last "sync". It then checks for data loss errors by purposely dropping
+// file data (or entire files) not protected by a "sync".
+
+#include "db/db_impl/db_impl.h"
+#include "db/log_format.h"
+#include "db/version_set.h"
+#include "env/mock_env.h"
+#include "file/filename.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/table.h"
+#include "rocksdb/write_batch.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/mutexlock.h"
+#include "util/random.h"
+#include "utilities/fault_injection_env.h"
+#ifndef NDEBUG
+#include "utilities/fault_injection_fs.h"
+#endif
+
+namespace ROCKSDB_NAMESPACE {
+
+static const int kValueSize = 1000;
+static const int kMaxNumValues = 2000;
+static const size_t kNumIterations = 3;
+
+enum FaultInjectionOptionConfig {
+ kDefault,
+ kDifferentDataDir,
+ kWalDir,
+ kSyncWal,
+ kWalDirSyncWal,
+ kMultiLevels,
+ kEnd,
+};
+class FaultInjectionTest
+ : public testing::Test,
+ public testing::WithParamInterface<std::tuple<
+ bool, FaultInjectionOptionConfig, FaultInjectionOptionConfig>> {
+ protected:
+ int option_config_;
+ int non_inclusive_end_range_; // kEnd or equivalent to that
+  // When we need to make sure data is persistent, sync the WAL
+ bool sync_use_wal_;
+  // When we need to make sure data is persistent, call DB::CompactRange()
+ bool sync_use_compact_;
+
+ bool sequential_order_;
+
+ public:
+ enum ExpectedVerifResult { kValExpectFound, kValExpectNoError };
+ enum ResetMethod {
+ kResetDropUnsyncedData,
+ kResetDropRandomUnsyncedData,
+ kResetDeleteUnsyncedFiles,
+ kResetDropAndDeleteUnsynced
+ };
+
+ std::unique_ptr<Env> base_env_;
+ FaultInjectionTestEnv* env_;
+ std::string dbname_;
+ std::shared_ptr<Cache> tiny_cache_;
+ Options options_;
+ DB* db_;
+
+ FaultInjectionTest()
+ : option_config_(std::get<1>(GetParam())),
+ non_inclusive_end_range_(std::get<2>(GetParam())),
+ sync_use_wal_(false),
+ sync_use_compact_(true),
+ base_env_(nullptr),
+ env_(nullptr),
+ db_(nullptr) {
+ EXPECT_OK(
+ test::CreateEnvFromSystem(ConfigOptions(), &system_env_, &env_guard_));
+ EXPECT_NE(system_env_, nullptr);
+ }
+
+ ~FaultInjectionTest() override {
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+ }
+
+ bool ChangeOptions() {
+ option_config_++;
+ if (option_config_ >= non_inclusive_end_range_) {
+ return false;
+ } else {
+ if (option_config_ == kMultiLevels) {
+ base_env_.reset(MockEnv::Create(system_env_));
+ }
+ return true;
+ }
+ }
+
+ // Return the current option configuration.
+ Options CurrentOptions() {
+ sync_use_wal_ = false;
+ sync_use_compact_ = true;
+ Options options;
+ switch (option_config_) {
+ case kWalDir:
+ options.wal_dir = test::PerThreadDBPath(env_, "fault_test_wal");
+ break;
+ case kDifferentDataDir:
+ options.db_paths.emplace_back(
+ test::PerThreadDBPath(env_, "fault_test_data"), 1000000U);
+ break;
+ case kSyncWal:
+ sync_use_wal_ = true;
+ sync_use_compact_ = false;
+ break;
+ case kWalDirSyncWal:
+ options.wal_dir = test::PerThreadDBPath(env_, "/fault_test_wal");
+ sync_use_wal_ = true;
+ sync_use_compact_ = false;
+ break;
+ case kMultiLevels:
+ options.write_buffer_size = 64 * 1024;
+ options.target_file_size_base = 64 * 1024;
+ options.level0_file_num_compaction_trigger = 2;
+ options.level0_slowdown_writes_trigger = 2;
+ options.level0_stop_writes_trigger = 4;
+ options.max_bytes_for_level_base = 128 * 1024;
+ options.max_write_buffer_number = 2;
+ options.max_background_compactions = 8;
+ options.max_background_flushes = 8;
+ sync_use_wal_ = true;
+ sync_use_compact_ = false;
+ break;
+ default:
+ break;
+ }
+ return options;
+ }
+
+ Status NewDB() {
+ assert(db_ == nullptr);
+ assert(tiny_cache_ == nullptr);
+ assert(env_ == nullptr);
+
+ env_ = new FaultInjectionTestEnv(base_env_ ? base_env_.get() : system_env_);
+
+ options_ = CurrentOptions();
+ options_.env = env_;
+ options_.paranoid_checks = true;
+
+ BlockBasedTableOptions table_options;
+ tiny_cache_ = NewLRUCache(100);
+ table_options.block_cache = tiny_cache_;
+ options_.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ dbname_ = test::PerThreadDBPath("fault_test");
+
+ EXPECT_OK(DestroyDB(dbname_, options_));
+
+ options_.create_if_missing = true;
+ Status s = OpenDB();
+ options_.create_if_missing = false;
+ return s;
+ }
+
+ void SetUp() override {
+ sequential_order_ = std::get<0>(GetParam());
+ ASSERT_OK(NewDB());
+ }
+
+ void TearDown() override {
+ CloseDB();
+
+ Status s = DestroyDB(dbname_, options_);
+
+ delete env_;
+ env_ = nullptr;
+
+ tiny_cache_.reset();
+
+ ASSERT_OK(s);
+ }
+
+ void Build(const WriteOptions& write_options, int start_idx, int num_vals) {
+ std::string key_space, value_space;
+ WriteBatch batch;
+ for (int i = start_idx; i < start_idx + num_vals; i++) {
+ Slice key = Key(i, &key_space);
+ batch.Clear();
+ ASSERT_OK(batch.Put(key, Value(i, &value_space)));
+ ASSERT_OK(db_->Write(write_options, &batch));
+ }
+ }
+
+ Status ReadValue(int i, std::string* val) const {
+ std::string key_space, value_space;
+ Slice key = Key(i, &key_space);
+ Value(i, &value_space);
+ ReadOptions options;
+ return db_->Get(options, key, val);
+ }
+
+ Status Verify(int start_idx, int num_vals,
+ ExpectedVerifResult expected) const {
+ std::string val;
+ std::string value_space;
+ Status s;
+ for (int i = start_idx; i < start_idx + num_vals && s.ok(); i++) {
+ Value(i, &value_space);
+ s = ReadValue(i, &val);
+ if (s.ok()) {
+ EXPECT_EQ(value_space, val);
+ }
+ if (expected == kValExpectFound) {
+ if (!s.ok()) {
+          fprintf(stderr, "Error reading %dth record (expect found): %s\n", i,
+ s.ToString().c_str());
+ return s;
+ }
+ } else if (!s.ok() && !s.IsNotFound()) {
+        fprintf(stderr, "Error reading %dth record: %s\n", i,
+ s.ToString().c_str());
+ return s;
+ }
+ }
+ return Status::OK();
+ }
+
+ // Return the ith key
+ Slice Key(int i, std::string* storage) const {
+ unsigned long long num = i;
+ if (!sequential_order_) {
+      // pseudo-random transformation of the index
+ const int m = 0x5bd1e995;
+ num *= m;
+ num ^= num << 24;
+ }
+ char buf[100];
+ snprintf(buf, sizeof(buf), "%016d", static_cast<int>(num));
+ storage->assign(buf, strlen(buf));
+ return Slice(*storage);
+ }
+
+ // Return the value to associate with the specified key
+ Slice Value(int k, std::string* storage) const {
+ Random r(k);
+ *storage = r.RandomString(kValueSize);
+ return Slice(*storage);
+ }
+
+ void CloseDB() {
+ delete db_;
+ db_ = nullptr;
+ }
+
+ Status OpenDB() {
+ CloseDB();
+ env_->ResetState();
+ Status s = DB::Open(options_, dbname_, &db_);
+ assert(db_ != nullptr);
+ return s;
+ }
+
+ void DeleteAllData() {
+ Iterator* iter = db_->NewIterator(ReadOptions());
+ WriteOptions options;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ ASSERT_OK(db_->Delete(WriteOptions(), iter->key()));
+ }
+ ASSERT_OK(iter->status());
+ delete iter;
+
+ FlushOptions flush_options;
+ flush_options.wait = true;
+ ASSERT_OK(db_->Flush(flush_options));
+ }
+
+ // rnd cannot be null for kResetDropRandomUnsyncedData
+ void ResetDBState(ResetMethod reset_method, Random* rnd = nullptr) {
+ env_->AssertNoOpenFile();
+ switch (reset_method) {
+ case kResetDropUnsyncedData:
+ ASSERT_OK(env_->DropUnsyncedFileData());
+ break;
+ case kResetDropRandomUnsyncedData:
+ ASSERT_OK(env_->DropRandomUnsyncedFileData(rnd));
+ break;
+ case kResetDeleteUnsyncedFiles:
+ ASSERT_OK(env_->DeleteFilesCreatedAfterLastDirSync());
+ break;
+ case kResetDropAndDeleteUnsynced:
+ ASSERT_OK(env_->DropUnsyncedFileData());
+ ASSERT_OK(env_->DeleteFilesCreatedAfterLastDirSync());
+ break;
+ default:
+ assert(false);
+ }
+ }
+
+ void PartialCompactTestPreFault(int num_pre_sync, int num_post_sync) {
+ DeleteAllData();
+
+ WriteOptions write_options;
+ write_options.sync = sync_use_wal_;
+
+ Build(write_options, 0, num_pre_sync);
+ if (sync_use_compact_) {
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ }
+ write_options.sync = false;
+ Build(write_options, num_pre_sync, num_post_sync);
+ }
+
+ void PartialCompactTestReopenWithFault(ResetMethod reset_method,
+ int num_pre_sync, int num_post_sync,
+ Random* rnd = nullptr) {
+ env_->SetFilesystemActive(false);
+ CloseDB();
+ ResetDBState(reset_method, rnd);
+ ASSERT_OK(OpenDB());
+ ASSERT_OK(Verify(0, num_pre_sync, FaultInjectionTest::kValExpectFound));
+ ASSERT_OK(Verify(num_pre_sync, num_post_sync,
+ FaultInjectionTest::kValExpectNoError));
+ WaitCompactionFinish();
+ ASSERT_OK(Verify(0, num_pre_sync, FaultInjectionTest::kValExpectFound));
+ ASSERT_OK(Verify(num_pre_sync, num_post_sync,
+ FaultInjectionTest::kValExpectNoError));
+ }
+
+ void NoWriteTestPreFault() {}
+
+ void NoWriteTestReopenWithFault(ResetMethod reset_method) {
+ CloseDB();
+ ResetDBState(reset_method);
+ ASSERT_OK(OpenDB());
+ }
+
+ void WaitCompactionFinish() {
+ ASSERT_OK(static_cast<DBImpl*>(db_->GetRootDB())->TEST_WaitForCompact());
+ ASSERT_OK(db_->Put(WriteOptions(), "", ""));
+ }
+
+ private:
+ Env* system_env_;
+ std::shared_ptr<Env> env_guard_;
+};
+
+class FaultInjectionTestSplitted : public FaultInjectionTest {};
+
+TEST_P(FaultInjectionTestSplitted, FaultTest) {
+ do {
+ Random rnd(301);
+
+ for (size_t idx = 0; idx < kNumIterations; idx++) {
+ int num_pre_sync = rnd.Uniform(kMaxNumValues);
+ int num_post_sync = rnd.Uniform(kMaxNumValues);
+
+ PartialCompactTestPreFault(num_pre_sync, num_post_sync);
+ PartialCompactTestReopenWithFault(kResetDropUnsyncedData, num_pre_sync,
+ num_post_sync);
+ NoWriteTestPreFault();
+ NoWriteTestReopenWithFault(kResetDropUnsyncedData);
+
+ PartialCompactTestPreFault(num_pre_sync, num_post_sync);
+ PartialCompactTestReopenWithFault(kResetDropRandomUnsyncedData,
+ num_pre_sync, num_post_sync, &rnd);
+ NoWriteTestPreFault();
+ NoWriteTestReopenWithFault(kResetDropUnsyncedData);
+
+      // Setting a separate data path won't pass this test, as we don't sync
+      // the directory after creating new files in it.
+ PartialCompactTestPreFault(num_pre_sync, num_post_sync);
+ PartialCompactTestReopenWithFault(kResetDropAndDeleteUnsynced,
+ num_pre_sync, num_post_sync);
+ NoWriteTestPreFault();
+ NoWriteTestReopenWithFault(kResetDropAndDeleteUnsynced);
+
+ PartialCompactTestPreFault(num_pre_sync, num_post_sync);
+      // No new files were created, so we expect all values to be present
+      // since no files will be dropped.
+ PartialCompactTestReopenWithFault(kResetDeleteUnsyncedFiles, num_pre_sync,
+ num_post_sync);
+ NoWriteTestPreFault();
+ NoWriteTestReopenWithFault(kResetDeleteUnsyncedFiles);
+ }
+ } while (ChangeOptions());
+}
+
+// The previous log file is not fsynced if sync is forced after log rolling.
+TEST_P(FaultInjectionTest, WriteOptionSyncTest) {
+ test::SleepingBackgroundTask sleeping_task_low;
+ env_->SetBackgroundThreads(1, Env::HIGH);
+  // Block the job queue to prevent the flush job from running.
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+ Env::Priority::HIGH);
+ sleeping_task_low.WaitUntilSleeping();
+
+ WriteOptions write_options;
+ write_options.sync = false;
+
+ std::string key_space, value_space;
+ ASSERT_OK(
+ db_->Put(write_options, Key(1, &key_space), Value(1, &value_space)));
+ FlushOptions flush_options;
+ flush_options.wait = false;
+ ASSERT_OK(db_->Flush(flush_options));
+ write_options.sync = true;
+ ASSERT_OK(
+ db_->Put(write_options, Key(2, &key_space), Value(2, &value_space)));
+ ASSERT_OK(db_->FlushWAL(false));
+
+ env_->SetFilesystemActive(false);
+ NoWriteTestReopenWithFault(kResetDropAndDeleteUnsynced);
+ sleeping_task_low.WakeUp();
+ sleeping_task_low.WaitUntilDone();
+
+ ASSERT_OK(OpenDB());
+ std::string val;
+ Value(2, &value_space);
+ ASSERT_OK(ReadValue(2, &val));
+ ASSERT_EQ(value_space, val);
+
+ Value(1, &value_space);
+ ASSERT_OK(ReadValue(1, &val));
+ ASSERT_EQ(value_space, val);
+}
+
+TEST_P(FaultInjectionTest, UninstalledCompaction) {
+ options_.target_file_size_base = 32 * 1024;
+ options_.write_buffer_size = 100 << 10; // 100KB
+ options_.level0_file_num_compaction_trigger = 6;
+ options_.level0_stop_writes_trigger = 1 << 10;
+ options_.level0_slowdown_writes_trigger = 1 << 10;
+ options_.max_background_compactions = 1;
+ ASSERT_OK(OpenDB());
+
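+ // Sync-point choreography for the non-sequential case: compaction may start
+ // only after FaultTest:0 fires, FaultTest:1 waits for CompactionJob::Run()
+ // to finish writing its output, and the result is installed only after
+ // FaultTest:2. The filesystem is deactivated between :1 and :2, so the
+ // unsynced compaction output is dropped by the simulated crash below.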
+ if (!sequential_order_) {
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ {"FaultInjectionTest::FaultTest:0", "DBImpl::BGWorkCompaction"},
+ {"CompactionJob::Run():End", "FaultInjectionTest::FaultTest:1"},
+ {"FaultInjectionTest::FaultTest:2",
+ "DBImpl::BackgroundCompaction:NonTrivial:AfterRun"},
+ });
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ int kNumKeys = 1000;
+ Build(WriteOptions(), 0, kNumKeys);
+ FlushOptions flush_options;
+ flush_options.wait = true;
+ ASSERT_OK(db_->Flush(flush_options));
+ ASSERT_OK(db_->Put(WriteOptions(), "", ""));
+ TEST_SYNC_POINT("FaultInjectionTest::FaultTest:0");
+ TEST_SYNC_POINT("FaultInjectionTest::FaultTest:1");
+ env_->SetFilesystemActive(false);
+ TEST_SYNC_POINT("FaultInjectionTest::FaultTest:2");
+ CloseDB();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ResetDBState(kResetDropUnsyncedData);
+
+ std::atomic<bool> opened(false);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::Open:Opened", [&](void* /*arg*/) { opened.store(true); });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BGWorkCompaction",
+ [&](void* /*arg*/) { ASSERT_TRUE(opened.load()); });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ ASSERT_OK(OpenDB());
+ ASSERT_OK(Verify(0, kNumKeys, FaultInjectionTest::kValExpectFound));
+ WaitCompactionFinish();
+ ASSERT_OK(Verify(0, kNumKeys, FaultInjectionTest::kValExpectFound));
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_P(FaultInjectionTest, ManualLogSyncTest) {
+ test::SleepingBackgroundTask sleeping_task_low;
+ env_->SetBackgroundThreads(1, Env::HIGH);
+ // Block the job queue to prevent flush job from running.
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+ Env::Priority::HIGH);
+ sleeping_task_low.WaitUntilSleeping();
+
+ WriteOptions write_options;
+ write_options.sync = false;
+
+ std::string key_space, value_space;
+ ASSERT_OK(
+ db_->Put(write_options, Key(1, &key_space), Value(1, &value_space)));
+ FlushOptions flush_options;
+ flush_options.wait = false;
+ ASSERT_OK(db_->Flush(flush_options));
+ ASSERT_OK(
+ db_->Put(write_options, Key(2, &key_space), Value(2, &value_space)));
+ ASSERT_OK(db_->FlushWAL(true));
+
+ env_->SetFilesystemActive(false);
+ NoWriteTestReopenWithFault(kResetDropAndDeleteUnsynced);
+ sleeping_task_low.WakeUp();
+ sleeping_task_low.WaitUntilDone();
+
+ ASSERT_OK(OpenDB());
+ std::string val;
+ Value(2, &value_space);
+ ASSERT_OK(ReadValue(2, &val));
+ ASSERT_EQ(value_space, val);
+
+ Value(1, &value_space);
+ ASSERT_OK(ReadValue(1, &val));
+ ASSERT_EQ(value_space, val);
+}
+
+TEST_P(FaultInjectionTest, WriteBatchWalTerminationTest) {
+ ReadOptions ro;
+ Options options = CurrentOptions();
+ options.env = env_;
+
+ WriteOptions wo;
+ wo.sync = true;
+ wo.disableWAL = false;
+ WriteBatch batch;
+ ASSERT_OK(batch.Put("cats", "dogs"));
+ batch.MarkWalTerminationPoint();
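+ // Entries added after the WAL termination point are applied to the memtable
+ // but not written to the WAL, so "boys" should be gone after the simulated
+ // crash while the synced "cats" entry survives.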
+ ASSERT_OK(batch.Put("boys", "girls"));
+ ASSERT_OK(db_->Write(wo, &batch));
+
+ env_->SetFilesystemActive(false);
+ NoWriteTestReopenWithFault(kResetDropAndDeleteUnsynced);
+ ASSERT_OK(OpenDB());
+
+ std::string val;
+ ASSERT_OK(db_->Get(ro, "cats", &val));
+ ASSERT_EQ("dogs", val);
+ ASSERT_EQ(db_->Get(ro, "boys", &val), Status::NotFound());
+}
+
+TEST_P(FaultInjectionTest, NoDuplicateTrailingEntries) {
+ auto fault_fs = std::make_shared<FaultInjectionTestFS>(FileSystem::Default());
+ fault_fs->EnableWriteErrorInjection();
+ fault_fs->SetFilesystemDirectWritable(false);
+ const std::string file_name = NormalizePath(dbname_ + "/test_file");
+ std::unique_ptr<log::Writer> log_writer = nullptr;
+ constexpr uint64_t log_number = 0;
+ {
+ std::unique_ptr<FSWritableFile> file;
+ const Status s =
+ fault_fs->NewWritableFile(file_name, FileOptions(), &file, nullptr);
+ ASSERT_OK(s);
+ std::unique_ptr<WritableFileWriter> fwriter(
+ new WritableFileWriter(std::move(file), file_name, FileOptions()));
+ log_writer.reset(new log::Writer(std::move(fwriter), log_number,
+ /*recycle_log_files=*/false));
+ }
+
+ fault_fs->SetRandomWriteError(
+ 0xdeadbeef, /*one_in=*/1, IOStatus::IOError("Injected IOError"),
+ /*inject_for_all_file_types=*/true, /*types=*/{});
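+ // With one_in == 1, every write fails while error injection is enabled, so
+ // the AddRecord() below is expected to fail.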
+
+ {
+ VersionEdit edit;
+ edit.SetColumnFamily(0);
+ std::string buf;
+ ASSERT_TRUE(edit.EncodeTo(&buf));
+ const Status s = log_writer->AddRecord(buf);
+ ASSERT_NOK(s);
+ }
+
+ fault_fs->DisableWriteErrorInjection();
+
+ // Closing the log writer will cause WritableFileWriter::Close() and flush
+ // remaining data from its buffer to underlying file.
+ log_writer.reset();
+
+ {
+ std::unique_ptr<FSSequentialFile> file;
+ Status s =
+ fault_fs->NewSequentialFile(file_name, FileOptions(), &file, nullptr);
+ ASSERT_OK(s);
+ std::unique_ptr<SequentialFileReader> freader(
+ new SequentialFileReader(std::move(file), file_name));
+ Status log_read_s;
+ class LogReporter : public log::Reader::Reporter {
+ public:
+ Status* status_;
+ explicit LogReporter(Status* _s) : status_(_s) {}
+ void Corruption(size_t /*bytes*/, const Status& _s) override {
+ if (status_->ok()) {
+ *status_ = _s;
+ }
+ }
+ } reporter(&log_read_s);
+ std::unique_ptr<log::Reader> log_reader(new log::Reader(
+ nullptr, std::move(freader), &reporter, /*checksum=*/true, log_number));
+ Slice record;
+ std::string data;
+ size_t count = 0;
+ while (log_reader->ReadRecord(&record, &data) && log_read_s.ok()) {
+ VersionEdit edit;
+ ASSERT_OK(edit.DecodeFrom(data));
+ ++count;
+ }
+ // Verify that only one version edit exists in the file.
+ ASSERT_EQ(1, count);
+ }
+}
+
+INSTANTIATE_TEST_CASE_P(
+ FaultTest, FaultInjectionTest,
+ ::testing::Values(std::make_tuple(false, kDefault, kEnd),
+ std::make_tuple(true, kDefault, kEnd)));
+
+INSTANTIATE_TEST_CASE_P(
+ FaultTest, FaultInjectionTestSplitted,
+ ::testing::Values(std::make_tuple(false, kDefault, kSyncWal),
+ std::make_tuple(true, kDefault, kSyncWal),
+ std::make_tuple(false, kSyncWal, kEnd),
+ std::make_tuple(true, kSyncWal, kEnd)));
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ RegisterCustomObjects(argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/file_indexer.cc b/src/rocksdb/db/file_indexer.cc
new file mode 100644
index 000000000..608f1cb28
--- /dev/null
+++ b/src/rocksdb/db/file_indexer.cc
@@ -0,0 +1,218 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/file_indexer.h"
+
+#include <algorithm>
+#include <functional>
+
+#include "db/version_edit.h"
+#include "rocksdb/comparator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+FileIndexer::FileIndexer(const Comparator* ucmp)
+ : num_levels_(0), ucmp_(ucmp), level_rb_(nullptr) {}
+
+size_t FileIndexer::NumLevelIndex() const { return next_level_index_.size(); }
+
+size_t FileIndexer::LevelIndexSize(size_t level) const {
+ if (level >= next_level_index_.size()) {
+ return 0;
+ }
+ return next_level_index_[level].num_index;
+}
+
+void FileIndexer::GetNextLevelIndex(const size_t level, const size_t file_index,
+ const int cmp_smallest,
+ const int cmp_largest, int32_t* left_bound,
+ int32_t* right_bound) const {
+ assert(level > 0);
+
+ // Last level, no hint
+ if (level == num_levels_ - 1) {
+ *left_bound = 0;
+ *right_bound = -1;
+ return;
+ }
+
+ assert(level < num_levels_ - 1);
+ assert(static_cast<int32_t>(file_index) <= level_rb_[level]);
+
+ const IndexUnit* index_units = next_level_index_[level].index_units;
+ const auto& index = index_units[file_index];
+
+ if (cmp_smallest < 0) {
+ *left_bound = (level > 0 && file_index > 0)
+ ? index_units[file_index - 1].largest_lb
+ : 0;
+ *right_bound = index.smallest_rb;
+ } else if (cmp_smallest == 0) {
+ *left_bound = index.smallest_lb;
+ *right_bound = index.smallest_rb;
+ } else if (cmp_smallest > 0 && cmp_largest < 0) {
+ *left_bound = index.smallest_lb;
+ *right_bound = index.largest_rb;
+ } else if (cmp_largest == 0) {
+ *left_bound = index.largest_lb;
+ *right_bound = index.largest_rb;
+ } else if (cmp_largest > 0) {
+ *left_bound = index.largest_lb;
+ *right_bound = level_rb_[level + 1];
+ } else {
+ assert(false);
+ }
+
+ assert(*left_bound >= 0);
+ assert(*left_bound <= *right_bound + 1);
+ assert(*right_bound <= level_rb_[level + 1]);
+}
+
+void FileIndexer::UpdateIndex(Arena* arena, const size_t num_levels,
+ std::vector<FileMetaData*>* const files) {
+ if (files == nullptr) {
+ return;
+ }
+ if (num_levels == 0) { // unsigned 0 - 1 below would wrap and misbehave
+ num_levels_ = num_levels;
+ return;
+ }
+ assert(level_rb_ == nullptr); // level_rb_ should be initialized here
+
+ num_levels_ = num_levels;
+ next_level_index_.resize(num_levels);
+
+ char* mem = arena->AllocateAligned(num_levels_ * sizeof(int32_t));
+ level_rb_ = new (mem) int32_t[num_levels_];
+ for (size_t i = 0; i < num_levels_; i++) {
+ level_rb_[i] = -1;
+ }
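+ // level_rb_[i] ends up holding the index of the last file on level i (and
+ // stays -1 for level 0, which GetNextLevelIndex() never consults); it serves
+ // as the right bound when a key compares greater than a file's largest key.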
+
+ // L1 - Ln-1
+ for (size_t level = 1; level < num_levels_ - 1; ++level) {
+ const auto& upper_files = files[level];
+ const int32_t upper_size = static_cast<int32_t>(upper_files.size());
+ const auto& lower_files = files[level + 1];
+ level_rb_[level] = static_cast<int32_t>(upper_files.size()) - 1;
+ if (upper_size == 0) {
+ continue;
+ }
+ IndexLevel& index_level = next_level_index_[level];
+ index_level.num_index = upper_size;
+ mem = arena->AllocateAligned(upper_size * sizeof(IndexUnit));
+ index_level.index_units = new (mem) IndexUnit[upper_size];
+
+ CalculateLB(
+ upper_files, lower_files, &index_level,
+ [this](const FileMetaData* a, const FileMetaData* b) -> int {
+ return ucmp_->CompareWithoutTimestamp(a->smallest.user_key(),
+ b->largest.user_key());
+ },
+ [](IndexUnit* index, int32_t f_idx) { index->smallest_lb = f_idx; });
+ CalculateLB(
+ upper_files, lower_files, &index_level,
+ [this](const FileMetaData* a, const FileMetaData* b) -> int {
+ return ucmp_->CompareWithoutTimestamp(a->largest.user_key(),
+ b->largest.user_key());
+ },
+ [](IndexUnit* index, int32_t f_idx) { index->largest_lb = f_idx; });
+ CalculateRB(
+ upper_files, lower_files, &index_level,
+ [this](const FileMetaData* a, const FileMetaData* b) -> int {
+ return ucmp_->CompareWithoutTimestamp(a->smallest.user_key(),
+ b->smallest.user_key());
+ },
+ [](IndexUnit* index, int32_t f_idx) { index->smallest_rb = f_idx; });
+ CalculateRB(
+ upper_files, lower_files, &index_level,
+ [this](const FileMetaData* a, const FileMetaData* b) -> int {
+ return ucmp_->CompareWithoutTimestamp(a->largest.user_key(),
+ b->smallest.user_key());
+ },
+ [](IndexUnit* index, int32_t f_idx) { index->largest_rb = f_idx; });
+ }
+
+ level_rb_[num_levels_ - 1] =
+ static_cast<int32_t>(files[num_levels_ - 1].size()) - 1;
+}
+
+void FileIndexer::CalculateLB(
+ const std::vector<FileMetaData*>& upper_files,
+ const std::vector<FileMetaData*>& lower_files, IndexLevel* index_level,
+ std::function<int(const FileMetaData*, const FileMetaData*)> cmp_op,
+ std::function<void(IndexUnit*, int32_t)> set_index) {
+ const int32_t upper_size = static_cast<int32_t>(upper_files.size());
+ const int32_t lower_size = static_cast<int32_t>(lower_files.size());
+ int32_t upper_idx = 0;
+ int32_t lower_idx = 0;
+
+ IndexUnit* index = index_level->index_units;
+ while (upper_idx < upper_size && lower_idx < lower_size) {
+ int cmp = cmp_op(upper_files[upper_idx], lower_files[lower_idx]);
+
+ if (cmp == 0) {
+ set_index(&index[upper_idx], lower_idx);
+ ++upper_idx;
+ } else if (cmp > 0) {
+ // Lower level's file (largest) is smaller, so a key won't hit that
+ // file. Move to the next lower file.
+ ++lower_idx;
+ } else {
+ // Lower level's file becomes larger, update the index, and
+ // move to the next upper file
+ set_index(&index[upper_idx], lower_idx);
+ ++upper_idx;
+ }
+ }
+
+ while (upper_idx < upper_size) {
+ // Lower files are exhausted, that means the remaining upper files are
+ // greater than any lower files. Set the index to be the lower level size.
+ set_index(&index[upper_idx], lower_size);
+ ++upper_idx;
+ }
+}
+
+void FileIndexer::CalculateRB(
+ const std::vector<FileMetaData*>& upper_files,
+ const std::vector<FileMetaData*>& lower_files, IndexLevel* index_level,
+ std::function<int(const FileMetaData*, const FileMetaData*)> cmp_op,
+ std::function<void(IndexUnit*, int32_t)> set_index) {
+ const int32_t upper_size = static_cast<int32_t>(upper_files.size());
+ const int32_t lower_size = static_cast<int32_t>(lower_files.size());
+ int32_t upper_idx = upper_size - 1;
+ int32_t lower_idx = lower_size - 1;
+
+ IndexUnit* index = index_level->index_units;
+ while (upper_idx >= 0 && lower_idx >= 0) {
+ int cmp = cmp_op(upper_files[upper_idx], lower_files[lower_idx]);
+
+ if (cmp == 0) {
+ set_index(&index[upper_idx], lower_idx);
+ --upper_idx;
+ } else if (cmp < 0) {
+ // Lower level's file (smallest) is larger, a key won't hit in that
+ // file. Move to next lower file.
+ --lower_idx;
+ } else {
+ // Lower level's file becomes smaller, update the index, and move to
+ // the next upper file.
+ set_index(&index[upper_idx], lower_idx);
+ --upper_idx;
+ }
+ }
+ while (upper_idx >= 0) {
+ // Lower files are exhausted, that means the remaining upper files are
+ // smaller than any lower files. Set it to -1.
+ set_index(&index[upper_idx], -1);
+ --upper_idx;
+ }
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/file_indexer.h b/src/rocksdb/db/file_indexer.h
new file mode 100644
index 000000000..45cb13615
--- /dev/null
+++ b/src/rocksdb/db/file_indexer.h
@@ -0,0 +1,140 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <cstdint>
+#include <functional>
+#include <limits>
+#include <vector>
+
+#include "memory/arena.h"
+#include "port/port.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Comparator;
+struct FileMetaData;
+struct FdWithKeyRange;
+struct FileLevel;
+
+// The file tree structure in Version is prebuilt and the range of each file
+// is known. On Version::Get(), it uses binary search to find a potential file
+// and then checks if a target key can be found in the file by comparing the
+// key to the file's smallest and largest key. The results of these comparisons
+// can be reused beyond checking whether a key falls into a file's range.
+// With some pre-calculated knowledge, each key comparison that has been done
+// can serve as a hint to narrow down further searches: if a key compares
+// smaller than a file's smallest or largest key, that comparison can be used
+// to find the right bound of the next binary search. Similarly, if a key
+// compares larger than a file's smallest or largest key, it can be used to
+// find the left bound of the next binary search.
+// These hints can greatly reduce the range of the binary search, especially
+// for bottom levels, given that one file most likely overlaps with only N
+// files from the level below (where N is max_bytes_for_level_multiplier).
+// So on level L, we will only look at ~N files instead of N^L files with the
+// naive approach.
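+//
+// A minimal sketch of how a caller might consume these hints (hypothetical
+// snippet; Version::Get() is the real consumer):
+//
+//   int32_t left = 0;
+//   int32_t right = -1;
+//   indexer.GetNextLevelIndex(level, file_index, cmp_smallest, cmp_largest,
+//                             &left, &right);
+//   // Binary-search only files[level + 1][left .. right]; an empty range
+//   // (left > right) means no file on the next level needs to be examined.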
+class FileIndexer {
+ public:
+ explicit FileIndexer(const Comparator* ucmp);
+
+ size_t NumLevelIndex() const;
+
+ size_t LevelIndexSize(size_t level) const;
+
+ // Return a file index range in the next level to search for a key based on
+ // smallest and largest key comparison for the current file specified by
+ // level and file_index. When *left_bound <= *right_bound, both bounds are
+ // valid indices into the next level's file vector.
+ void GetNextLevelIndex(const size_t level, const size_t file_index,
+ const int cmp_smallest, const int cmp_largest,
+ int32_t* left_bound, int32_t* right_bound) const;
+
+ void UpdateIndex(Arena* arena, const size_t num_levels,
+ std::vector<FileMetaData*>* const files);
+
+ enum { kLevelMaxIndex = std::numeric_limits<int32_t>::max() };
+
+ private:
+ size_t num_levels_;
+ const Comparator* ucmp_;
+
+ struct IndexUnit {
+ IndexUnit()
+ : smallest_lb(0), largest_lb(0), smallest_rb(-1), largest_rb(-1) {}
+ // During file search, a key is compared against smallest and largest
+ // from a FileMetaData. It can have 3 possible outcomes:
+ // (1) key is smaller than smallest, implying it is also smaller than
+ // largest. The precalculated index based on "smallest < smallest"
+ // (smallest_rb) can be used to provide the right bound.
+ // (2) key is in between smallest and largest.
+ // (2)-a: the precalculated index based on "smallest > largest"
+ // (smallest_lb) can be used to provide the left bound.
+ // (2)-b: the precalculated index based on "largest < smallest"
+ // (largest_rb) can be used to provide the right bound.
+ // (3) key is larger than largest, implying it is also larger than smallest.
+ // The precalculated index based on "largest > largest" (largest_lb) can be
+ // used to provide the left bound.
+ //
+ // As a result, we will need to do:
+ // Compare smallest (<=) and largest keys from upper level file with
+ // smallest key from lower level to get a right bound.
+ // Compare smallest (>=) and largest keys from upper level file with
+ // largest key from lower level to get a left bound.
+ //
+ // Example:
+ // level 1: [50 - 60]
+ // level 2: [1 - 40], [45 - 55], [58 - 80]
+ // A key 35 compares less than 50, so the 3rd file on level 2 can be
+ // skipped according to rule (1). LB = 0, RB = 1.
+ // A key 53 sits between 50 and 60. The 1st file on level 2 can be
+ // skipped according to rule (2)-a, but the 3rd file cannot be skipped
+ // because 60 is greater than 58. LB = 1, RB = 2.
+ // A key 70 compares larger than 60, so the 1st and 2nd files can be
+ // skipped according to rule (3). LB = 2, RB = 2.
+ //
+ // Points to the left-most file in the lower level that may contain a key
+ // which compares greater than the smallest key of the FileMetaData (upper
+ // level).
+ int32_t smallest_lb;
+ // Points to the left-most file in the lower level that may contain a key
+ // which compares greater than the largest key of the FileMetaData (upper
+ // level).
+ int32_t largest_lb;
+ // Points to the right-most file in the lower level that may contain a key
+ // which compares smaller than the smallest key of the FileMetaData (upper
+ // level).
+ int32_t smallest_rb;
+ // Points to the right-most file in the lower level that may contain a key
+ // which compares smaller than the largest key of the FileMetaData (upper
+ // level).
+ int32_t largest_rb;
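+ // For the example above ([50 - 60] over [1 - 40], [45 - 55], [58 - 80]),
+ // these fields work out to smallest_lb = 1, largest_lb = 2, smallest_rb = 1
+ // and largest_rb = 2, which yields the LB/RB pairs quoted for keys 35, 53
+ // and 70.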
+ };
+
+ // Data structure to store IndexUnits in a whole level
+ struct IndexLevel {
+ size_t num_index;
+ IndexUnit* index_units;
+
+ IndexLevel() : num_index(0), index_units(nullptr) {}
+ };
+
+ void CalculateLB(
+ const std::vector<FileMetaData*>& upper_files,
+ const std::vector<FileMetaData*>& lower_files, IndexLevel* index_level,
+ std::function<int(const FileMetaData*, const FileMetaData*)> cmp_op,
+ std::function<void(IndexUnit*, int32_t)> set_index);
+
+ void CalculateRB(
+ const std::vector<FileMetaData*>& upper_files,
+ const std::vector<FileMetaData*>& lower_files, IndexLevel* index_level,
+ std::function<int(const FileMetaData*, const FileMetaData*)> cmp_op,
+ std::function<void(IndexUnit*, int32_t)> set_index);
+
+ autovector<IndexLevel> next_level_index_;
+ int32_t* level_rb_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/file_indexer_test.cc b/src/rocksdb/db/file_indexer_test.cc
new file mode 100644
index 000000000..5c82189ef
--- /dev/null
+++ b/src/rocksdb/db/file_indexer_test.cc
@@ -0,0 +1,352 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/file_indexer.h"
+
+#include <string>
+
+#include "db/dbformat.h"
+#include "db/version_edit.h"
+#include "port/stack_trace.h"
+#include "rocksdb/comparator.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class IntComparator : public Comparator {
+ public:
+ int Compare(const Slice& a, const Slice& b) const override {
+ assert(a.size() == 8);
+ assert(b.size() == 8);
+ int64_t diff = *reinterpret_cast<const int64_t*>(a.data()) -
+ *reinterpret_cast<const int64_t*>(b.data());
+ if (diff < 0) {
+ return -1;
+ } else if (diff == 0) {
+ return 0;
+ } else {
+ return 1;
+ }
+ }
+
+ const char* Name() const override { return "IntComparator"; }
+
+ void FindShortestSeparator(std::string* /*start*/,
+ const Slice& /*limit*/) const override {}
+
+ void FindShortSuccessor(std::string* /*key*/) const override {}
+};
+
+class FileIndexerTest : public testing::Test {
+ public:
+ FileIndexerTest()
+ : kNumLevels(4), files(new std::vector<FileMetaData*>[kNumLevels]) {}
+
+ ~FileIndexerTest() override {
+ ClearFiles();
+ delete[] files;
+ }
+
+ void AddFile(int level, int64_t smallest, int64_t largest) {
+ auto* f = new FileMetaData();
+ f->smallest = IntKey(smallest);
+ f->largest = IntKey(largest);
+ files[level].push_back(f);
+ }
+
+ InternalKey IntKey(int64_t v) {
+ return InternalKey(Slice(reinterpret_cast<char*>(&v), 8), 0, kTypeValue);
+ }
+
+ void ClearFiles() {
+ for (uint32_t i = 0; i < kNumLevels; ++i) {
+ for (auto* f : files[i]) {
+ delete f;
+ }
+ files[i].clear();
+ }
+ }
+
+ void GetNextLevelIndex(const uint32_t level, const uint32_t file_index,
+ const int cmp_smallest, const int cmp_largest,
+ int32_t* left_index, int32_t* right_index) {
+ *left_index = 100;
+ *right_index = 100;
+ indexer->GetNextLevelIndex(level, file_index, cmp_smallest, cmp_largest,
+ left_index, right_index);
+ }
+
+ int32_t left = 100;
+ int32_t right = 100;
+ const uint32_t kNumLevels;
+ IntComparator ucmp;
+ FileIndexer* indexer;
+
+ std::vector<FileMetaData*>* files;
+};
+
+// Case 0: Empty
+TEST_F(FileIndexerTest, Empty) {
+ Arena arena;
+ indexer = new FileIndexer(&ucmp);
+ indexer->UpdateIndex(&arena, 0, files);
+ delete indexer;
+}
+
+// Case 1: no overlap, files are on the left of next level files
+TEST_F(FileIndexerTest, no_overlap_left) {
+ Arena arena;
+ indexer = new FileIndexer(&ucmp);
+ // level 1
+ AddFile(1, 100, 200);
+ AddFile(1, 300, 400);
+ AddFile(1, 500, 600);
+ // level 2
+ AddFile(2, 1500, 1600);
+ AddFile(2, 1601, 1699);
+ AddFile(2, 1700, 1800);
+ // level 3
+ AddFile(3, 2500, 2600);
+ AddFile(3, 2601, 2699);
+ AddFile(3, 2700, 2800);
+ indexer->UpdateIndex(&arena, kNumLevels, files);
+ for (uint32_t level = 1; level < 3; ++level) {
+ for (uint32_t f = 0; f < 3; ++f) {
+ GetNextLevelIndex(level, f, -1, -1, &left, &right);
+ ASSERT_EQ(0, left);
+ ASSERT_EQ(-1, right);
+ GetNextLevelIndex(level, f, 0, -1, &left, &right);
+ ASSERT_EQ(0, left);
+ ASSERT_EQ(-1, right);
+ GetNextLevelIndex(level, f, 1, -1, &left, &right);
+ ASSERT_EQ(0, left);
+ ASSERT_EQ(-1, right);
+ GetNextLevelIndex(level, f, 1, 0, &left, &right);
+ ASSERT_EQ(0, left);
+ ASSERT_EQ(-1, right);
+ GetNextLevelIndex(level, f, 1, 1, &left, &right);
+ ASSERT_EQ(0, left);
+ ASSERT_EQ(2, right);
+ }
+ }
+ delete indexer;
+ ClearFiles();
+}
+
+// Case 2: no overlap, files are on the right of next level files
+TEST_F(FileIndexerTest, no_overlap_right) {
+ Arena arena;
+ indexer = new FileIndexer(&ucmp);
+ // level 1
+ AddFile(1, 2100, 2200);
+ AddFile(1, 2300, 2400);
+ AddFile(1, 2500, 2600);
+ // level 2
+ AddFile(2, 1500, 1600);
+ AddFile(2, 1501, 1699);
+ AddFile(2, 1700, 1800);
+ // level 3
+ AddFile(3, 500, 600);
+ AddFile(3, 501, 699);
+ AddFile(3, 700, 800);
+ indexer->UpdateIndex(&arena, kNumLevels, files);
+ for (uint32_t level = 1; level < 3; ++level) {
+ for (uint32_t f = 0; f < 3; ++f) {
+ GetNextLevelIndex(level, f, -1, -1, &left, &right);
+ ASSERT_EQ(f == 0 ? 0 : 3, left);
+ ASSERT_EQ(2, right);
+ GetNextLevelIndex(level, f, 0, -1, &left, &right);
+ ASSERT_EQ(3, left);
+ ASSERT_EQ(2, right);
+ GetNextLevelIndex(level, f, 1, -1, &left, &right);
+ ASSERT_EQ(3, left);
+ ASSERT_EQ(2, right);
+ GetNextLevelIndex(level, f, 1, -1, &left, &right);
+ ASSERT_EQ(3, left);
+ ASSERT_EQ(2, right);
+ GetNextLevelIndex(level, f, 1, 0, &left, &right);
+ ASSERT_EQ(3, left);
+ ASSERT_EQ(2, right);
+ GetNextLevelIndex(level, f, 1, 1, &left, &right);
+ ASSERT_EQ(3, left);
+ ASSERT_EQ(2, right);
+ }
+ }
+ delete indexer;
+}
+
+// Case 3: empty L2
+TEST_F(FileIndexerTest, empty_L2) {
+ Arena arena;
+ indexer = new FileIndexer(&ucmp);
+ for (uint32_t i = 1; i < kNumLevels; ++i) {
+ ASSERT_EQ(0U, indexer->LevelIndexSize(i));
+ }
+ // level 1
+ AddFile(1, 2100, 2200);
+ AddFile(1, 2300, 2400);
+ AddFile(1, 2500, 2600);
+ // level 3
+ AddFile(3, 500, 600);
+ AddFile(3, 501, 699);
+ AddFile(3, 700, 800);
+ indexer->UpdateIndex(&arena, kNumLevels, files);
+ for (uint32_t f = 0; f < 3; ++f) {
+ GetNextLevelIndex(1, f, -1, -1, &left, &right);
+ ASSERT_EQ(0, left);
+ ASSERT_EQ(-1, right);
+ GetNextLevelIndex(1, f, 0, -1, &left, &right);
+ ASSERT_EQ(0, left);
+ ASSERT_EQ(-1, right);
+ GetNextLevelIndex(1, f, 1, -1, &left, &right);
+ ASSERT_EQ(0, left);
+ ASSERT_EQ(-1, right);
+ GetNextLevelIndex(1, f, 1, -1, &left, &right);
+ ASSERT_EQ(0, left);
+ ASSERT_EQ(-1, right);
+ GetNextLevelIndex(1, f, 1, 0, &left, &right);
+ ASSERT_EQ(0, left);
+ ASSERT_EQ(-1, right);
+ GetNextLevelIndex(1, f, 1, 1, &left, &right);
+ ASSERT_EQ(0, left);
+ ASSERT_EQ(-1, right);
+ }
+ delete indexer;
+ ClearFiles();
+}
+
+// Case 4: mixed
+TEST_F(FileIndexerTest, mixed) {
+ Arena arena;
+ indexer = new FileIndexer(&ucmp);
+ // level 1
+ AddFile(1, 100, 200);
+ AddFile(1, 250, 400);
+ AddFile(1, 450, 500);
+ // level 2
+ AddFile(2, 100, 150); // 0
+ AddFile(2, 200, 250); // 1
+ AddFile(2, 251, 300); // 2
+ AddFile(2, 301, 350); // 3
+ AddFile(2, 500, 600); // 4
+ // level 3
+ AddFile(3, 0, 50);
+ AddFile(3, 100, 200);
+ AddFile(3, 201, 250);
+ indexer->UpdateIndex(&arena, kNumLevels, files);
+ // level 1, 0
+ GetNextLevelIndex(1, 0, -1, -1, &left, &right);
+ ASSERT_EQ(0, left);
+ ASSERT_EQ(0, right);
+ GetNextLevelIndex(1, 0, 0, -1, &left, &right);
+ ASSERT_EQ(0, left);
+ ASSERT_EQ(0, right);
+ GetNextLevelIndex(1, 0, 1, -1, &left, &right);
+ ASSERT_EQ(0, left);
+ ASSERT_EQ(1, right);
+ GetNextLevelIndex(1, 0, 1, 0, &left, &right);
+ ASSERT_EQ(1, left);
+ ASSERT_EQ(1, right);
+ GetNextLevelIndex(1, 0, 1, 1, &left, &right);
+ ASSERT_EQ(1, left);
+ ASSERT_EQ(4, right);
+ // level 1, 1
+ GetNextLevelIndex(1, 1, -1, -1, &left, &right);
+ ASSERT_EQ(1, left);
+ ASSERT_EQ(1, right);
+ GetNextLevelIndex(1, 1, 0, -1, &left, &right);
+ ASSERT_EQ(1, left);
+ ASSERT_EQ(1, right);
+ GetNextLevelIndex(1, 1, 1, -1, &left, &right);
+ ASSERT_EQ(1, left);
+ ASSERT_EQ(3, right);
+ GetNextLevelIndex(1, 1, 1, 0, &left, &right);
+ ASSERT_EQ(4, left);
+ ASSERT_EQ(3, right);
+ GetNextLevelIndex(1, 1, 1, 1, &left, &right);
+ ASSERT_EQ(4, left);
+ ASSERT_EQ(4, right);
+ // level 1, 2
+ GetNextLevelIndex(1, 2, -1, -1, &left, &right);
+ ASSERT_EQ(4, left);
+ ASSERT_EQ(3, right);
+ GetNextLevelIndex(1, 2, 0, -1, &left, &right);
+ ASSERT_EQ(4, left);
+ ASSERT_EQ(3, right);
+ GetNextLevelIndex(1, 2, 1, -1, &left, &right);
+ ASSERT_EQ(4, left);
+ ASSERT_EQ(4, right);
+ GetNextLevelIndex(1, 2, 1, 0, &left, &right);
+ ASSERT_EQ(4, left);
+ ASSERT_EQ(4, right);
+ GetNextLevelIndex(1, 2, 1, 1, &left, &right);
+ ASSERT_EQ(4, left);
+ ASSERT_EQ(4, right);
+ // level 2, 0
+ GetNextLevelIndex(2, 0, -1, -1, &left, &right);
+ ASSERT_EQ(0, left);
+ ASSERT_EQ(1, right);
+ GetNextLevelIndex(2, 0, 0, -1, &left, &right);
+ ASSERT_EQ(1, left);
+ ASSERT_EQ(1, right);
+ GetNextLevelIndex(2, 0, 1, -1, &left, &right);
+ ASSERT_EQ(1, left);
+ ASSERT_EQ(1, right);
+ GetNextLevelIndex(2, 0, 1, 0, &left, &right);
+ ASSERT_EQ(1, left);
+ ASSERT_EQ(1, right);
+ GetNextLevelIndex(2, 0, 1, 1, &left, &right);
+ ASSERT_EQ(1, left);
+ ASSERT_EQ(2, right);
+ // level 2, 1
+ GetNextLevelIndex(2, 1, -1, -1, &left, &right);
+ ASSERT_EQ(1, left);
+ ASSERT_EQ(1, right);
+ GetNextLevelIndex(2, 1, 0, -1, &left, &right);
+ ASSERT_EQ(1, left);
+ ASSERT_EQ(1, right);
+ GetNextLevelIndex(2, 1, 1, -1, &left, &right);
+ ASSERT_EQ(1, left);
+ ASSERT_EQ(2, right);
+ GetNextLevelIndex(2, 1, 1, 0, &left, &right);
+ ASSERT_EQ(2, left);
+ ASSERT_EQ(2, right);
+ GetNextLevelIndex(2, 1, 1, 1, &left, &right);
+ ASSERT_EQ(2, left);
+ ASSERT_EQ(2, right);
+ // level 2, [2 - 4], no overlap
+ for (uint32_t f = 2; f <= 4; ++f) {
+ GetNextLevelIndex(2, f, -1, -1, &left, &right);
+ ASSERT_EQ(f == 2 ? 2 : 3, left);
+ ASSERT_EQ(2, right);
+ GetNextLevelIndex(2, f, 0, -1, &left, &right);
+ ASSERT_EQ(3, left);
+ ASSERT_EQ(2, right);
+ GetNextLevelIndex(2, f, 1, -1, &left, &right);
+ ASSERT_EQ(3, left);
+ ASSERT_EQ(2, right);
+ GetNextLevelIndex(2, f, 1, 0, &left, &right);
+ ASSERT_EQ(3, left);
+ ASSERT_EQ(2, right);
+ GetNextLevelIndex(2, f, 1, 1, &left, &right);
+ ASSERT_EQ(3, left);
+ ASSERT_EQ(2, right);
+ }
+ delete indexer;
+ ClearFiles();
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/filename_test.cc b/src/rocksdb/db/filename_test.cc
new file mode 100644
index 000000000..04c81b333
--- /dev/null
+++ b/src/rocksdb/db/filename_test.cc
@@ -0,0 +1,241 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "file/filename.h"
+
+#include "db/dbformat.h"
+#include "port/port.h"
+#include "test_util/testharness.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class FileNameTest : public testing::Test {};
+
+TEST_F(FileNameTest, Parse) {
+ Slice db;
+ FileType type;
+ uint64_t number;
+
+ char kDefautInfoLogDir = 1;
+ char kDifferentInfoLogDir = 2;
+ char kNoCheckLogDir = 4;
+ char kAllMode = kDefautInfoLogDir | kDifferentInfoLogDir | kNoCheckLogDir;
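+ // Each entry in `cases` below carries a bitmask of the info-log directory
+ // modes under which the file name is expected to parse; kAllMode covers all
+ // three modes.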
+
+ // Successful parses
+ static struct {
+ const char* fname;
+ uint64_t number;
+ FileType type;
+ char mode;
+ } cases[] = {
+ {"100.log", 100, kWalFile, kAllMode},
+ {"0.log", 0, kWalFile, kAllMode},
+ {"0.sst", 0, kTableFile, kAllMode},
+ {"CURRENT", 0, kCurrentFile, kAllMode},
+ {"LOCK", 0, kDBLockFile, kAllMode},
+ {"MANIFEST-2", 2, kDescriptorFile, kAllMode},
+ {"MANIFEST-7", 7, kDescriptorFile, kAllMode},
+ {"METADB-2", 2, kMetaDatabase, kAllMode},
+ {"METADB-7", 7, kMetaDatabase, kAllMode},
+ {"LOG", 0, kInfoLogFile, kDefautInfoLogDir},
+ {"LOG.old", 0, kInfoLogFile, kDefautInfoLogDir},
+ {"LOG.old.6688", 6688, kInfoLogFile, kDefautInfoLogDir},
+ {"rocksdb_dir_LOG", 0, kInfoLogFile, kDifferentInfoLogDir},
+ {"rocksdb_dir_LOG.old", 0, kInfoLogFile, kDifferentInfoLogDir},
+ {"rocksdb_dir_LOG.old.6688", 6688, kInfoLogFile, kDifferentInfoLogDir},
+ {"18446744073709551615.log", 18446744073709551615ull, kWalFile, kAllMode},
+ };
+ for (char mode : {kDifferentInfoLogDir, kDefautInfoLogDir, kNoCheckLogDir}) {
+ for (unsigned int i = 0; i < sizeof(cases) / sizeof(cases[0]); i++) {
+ InfoLogPrefix info_log_prefix(mode != kDefautInfoLogDir, "/rocksdb/dir");
+ if (cases[i].mode & mode) {
+ std::string f = cases[i].fname;
+ if (mode == kNoCheckLogDir) {
+ ASSERT_TRUE(ParseFileName(f, &number, &type)) << f;
+ } else {
+ ASSERT_TRUE(ParseFileName(f, &number, info_log_prefix.prefix, &type))
+ << f;
+ }
+ ASSERT_EQ(cases[i].type, type) << f;
+ ASSERT_EQ(cases[i].number, number) << f;
+ }
+ }
+ }
+
+ // Errors
+ static const char* errors[] = {"",
+ "foo",
+ "foo-dx-100.log",
+ ".log",
+ "",
+ "manifest",
+ "CURREN",
+ "CURRENTX",
+ "MANIFES",
+ "MANIFEST",
+ "MANIFEST-",
+ "XMANIFEST-3",
+ "MANIFEST-3x",
+ "META",
+ "METADB",
+ "METADB-",
+ "XMETADB-3",
+ "METADB-3x",
+ "LOC",
+ "LOCKx",
+ "LO",
+ "LOGx",
+ "18446744073709551616.log",
+ "184467440737095516150.log",
+ "100",
+ "100.",
+ "100.lop"};
+ for (unsigned int i = 0; i < sizeof(errors) / sizeof(errors[0]); i++) {
+ std::string f = errors[i];
+ ASSERT_TRUE(!ParseFileName(f, &number, &type)) << f;
+ };
+}
+
+TEST_F(FileNameTest, InfoLogFileName) {
+ std::string dbname = ("/data/rocksdb");
+ std::string db_absolute_path;
+ ASSERT_OK(Env::Default()->GetAbsolutePath(dbname, &db_absolute_path));
+
+ ASSERT_EQ("/data/rocksdb/LOG", InfoLogFileName(dbname, db_absolute_path, ""));
+ ASSERT_EQ("/data/rocksdb/LOG.old.666",
+ OldInfoLogFileName(dbname, 666u, db_absolute_path, ""));
+
+ ASSERT_EQ("/data/rocksdb_log/data_rocksdb_LOG",
+ InfoLogFileName(dbname, db_absolute_path, "/data/rocksdb_log"));
+ ASSERT_EQ(
+ "/data/rocksdb_log/data_rocksdb_LOG.old.666",
+ OldInfoLogFileName(dbname, 666u, db_absolute_path, "/data/rocksdb_log"));
+}
+
+TEST_F(FileNameTest, Construction) {
+ uint64_t number;
+ FileType type;
+ std::string fname;
+
+ fname = CurrentFileName("foo");
+ ASSERT_EQ("foo/", std::string(fname.data(), 4));
+ ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
+ ASSERT_EQ(0U, number);
+ ASSERT_EQ(kCurrentFile, type);
+
+ fname = LockFileName("foo");
+ ASSERT_EQ("foo/", std::string(fname.data(), 4));
+ ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
+ ASSERT_EQ(0U, number);
+ ASSERT_EQ(kDBLockFile, type);
+
+ fname = LogFileName("foo", 192);
+ ASSERT_EQ("foo/", std::string(fname.data(), 4));
+ ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
+ ASSERT_EQ(192U, number);
+ ASSERT_EQ(kWalFile, type);
+
+ fname = TableFileName({DbPath("bar", 0)}, 200, 0);
+ std::string fname1 =
+ TableFileName({DbPath("foo", 0), DbPath("bar", 0)}, 200, 1);
+ ASSERT_EQ(fname, fname1);
+ ASSERT_EQ("bar/", std::string(fname.data(), 4));
+ ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
+ ASSERT_EQ(200U, number);
+ ASSERT_EQ(kTableFile, type);
+
+ fname = DescriptorFileName("bar", 100);
+ ASSERT_EQ("bar/", std::string(fname.data(), 4));
+ ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
+ ASSERT_EQ(100U, number);
+ ASSERT_EQ(kDescriptorFile, type);
+
+ fname = TempFileName("tmp", 999);
+ ASSERT_EQ("tmp/", std::string(fname.data(), 4));
+ ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
+ ASSERT_EQ(999U, number);
+ ASSERT_EQ(kTempFile, type);
+
+ fname = MetaDatabaseName("met", 100);
+ ASSERT_EQ("met/", std::string(fname.data(), 4));
+ ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
+ ASSERT_EQ(100U, number);
+ ASSERT_EQ(kMetaDatabase, type);
+}
+
+TEST_F(FileNameTest, NormalizePath) {
+ // No leading slash
+ const std::string sep = std::string(1, kFilePathSeparator);
+
+ std::string expected = "FOLDER" + sep + "filename.ext";
+ std::string given = "FOLDER" + sep + "filename.ext";
+
+ ASSERT_EQ(expected, NormalizePath(given));
+
+ // Two chars /a
+
+ expected = sep + "a";
+ given = expected;
+ ASSERT_EQ(expected, NormalizePath(given));
+
+ // Two chars a/
+ expected = "a" + sep;
+ given = expected;
+ ASSERT_EQ(expected, NormalizePath(given));
+
+ // Server only
+ expected = sep + sep + "a";
+ given = expected;
+ ASSERT_EQ(expected, NormalizePath(given));
+
+ // Two slashes after character
+ expected = "a" + sep;
+ given = "a" + sep + sep;
+
+ ASSERT_EQ(expected, NormalizePath(given));
+
+ // slash only /
+ expected = sep;
+ given = expected;
+ ASSERT_EQ(expected, NormalizePath(given));
+
+ // UNC only //
+ expected = sep;
+ given = sep + sep;
+
+ ASSERT_EQ(expected, NormalizePath(given));
+
+ // 3 slashes only
+ expected = sep + sep;
+ given = sep + sep + sep;
+ ASSERT_EQ(expected, NormalizePath(given));
+
+ // 3 slashes //
+ expected = sep + sep + "a" + sep;
+ given = sep + sep + sep + "a" + sep;
+ ASSERT_EQ(expected, NormalizePath(given));
+
+ // 2 separators in the middle
+ expected = "a" + sep + "b";
+ given = "a" + sep + sep + "b";
+ ASSERT_EQ(expected, NormalizePath(given));
+
+ // UNC with duplicate slashes
+ expected = sep + sep + "SERVER" + sep + "a" + sep + "b" + sep + "c";
+ given = sep + sep + "SERVER" + sep + "a" + sep + sep + "b" + sep + "c";
+ ASSERT_EQ(expected, NormalizePath(given));
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/flush_job.cc b/src/rocksdb/db/flush_job.cc
new file mode 100644
index 000000000..645e42f44
--- /dev/null
+++ b/src/rocksdb/db/flush_job.cc
@@ -0,0 +1,1094 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/flush_job.h"
+
+#include <algorithm>
+#include <cinttypes>
+#include <vector>
+
+#include "db/builder.h"
+#include "db/db_iter.h"
+#include "db/dbformat.h"
+#include "db/event_helpers.h"
+#include "db/log_reader.h"
+#include "db/log_writer.h"
+#include "db/memtable.h"
+#include "db/memtable_list.h"
+#include "db/merge_context.h"
+#include "db/range_tombstone_fragmenter.h"
+#include "db/version_set.h"
+#include "file/file_util.h"
+#include "file/filename.h"
+#include "logging/event_logger.h"
+#include "logging/log_buffer.h"
+#include "logging/logging.h"
+#include "monitoring/iostats_context_imp.h"
+#include "monitoring/perf_context_imp.h"
+#include "monitoring/thread_status_util.h"
+#include "port/port.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/status.h"
+#include "rocksdb/table.h"
+#include "table/merging_iterator.h"
+#include "table/table_builder.h"
+#include "table/two_level_iterator.h"
+#include "test_util/sync_point.h"
+#include "util/coding.h"
+#include "util/mutexlock.h"
+#include "util/stop_watch.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+const char* GetFlushReasonString(FlushReason flush_reason) {
+ switch (flush_reason) {
+ case FlushReason::kOthers:
+ return "Other Reasons";
+ case FlushReason::kGetLiveFiles:
+ return "Get Live Files";
+ case FlushReason::kShutDown:
+ return "Shut down";
+ case FlushReason::kExternalFileIngestion:
+ return "External File Ingestion";
+ case FlushReason::kManualCompaction:
+ return "Manual Compaction";
+ case FlushReason::kWriteBufferManager:
+ return "Write Buffer Manager";
+ case FlushReason::kWriteBufferFull:
+ return "Write Buffer Full";
+ case FlushReason::kTest:
+ return "Test";
+ case FlushReason::kDeleteFiles:
+ return "Delete Files";
+ case FlushReason::kAutoCompaction:
+ return "Auto Compaction";
+ case FlushReason::kManualFlush:
+ return "Manual Flush";
+ case FlushReason::kErrorRecovery:
+ return "Error Recovery";
+ case FlushReason::kWalFull:
+ return "WAL Full";
+ default:
+ return "Invalid";
+ }
+}
+
+FlushJob::FlushJob(
+ const std::string& dbname, ColumnFamilyData* cfd,
+ const ImmutableDBOptions& db_options,
+ const MutableCFOptions& mutable_cf_options, uint64_t max_memtable_id,
+ const FileOptions& file_options, VersionSet* versions,
+ InstrumentedMutex* db_mutex, std::atomic<bool>* shutting_down,
+ std::vector<SequenceNumber> existing_snapshots,
+ SequenceNumber earliest_write_conflict_snapshot,
+ SnapshotChecker* snapshot_checker, JobContext* job_context,
+ LogBuffer* log_buffer, FSDirectory* db_directory,
+ FSDirectory* output_file_directory, CompressionType output_compression,
+ Statistics* stats, EventLogger* event_logger, bool measure_io_stats,
+ const bool sync_output_directory, const bool write_manifest,
+ Env::Priority thread_pri, const std::shared_ptr<IOTracer>& io_tracer,
+ const SeqnoToTimeMapping& seqno_time_mapping, const std::string& db_id,
+ const std::string& db_session_id, std::string full_history_ts_low,
+ BlobFileCompletionCallback* blob_callback)
+ : dbname_(dbname),
+ db_id_(db_id),
+ db_session_id_(db_session_id),
+ cfd_(cfd),
+ db_options_(db_options),
+ mutable_cf_options_(mutable_cf_options),
+ max_memtable_id_(max_memtable_id),
+ file_options_(file_options),
+ versions_(versions),
+ db_mutex_(db_mutex),
+ shutting_down_(shutting_down),
+ existing_snapshots_(std::move(existing_snapshots)),
+ earliest_write_conflict_snapshot_(earliest_write_conflict_snapshot),
+ snapshot_checker_(snapshot_checker),
+ job_context_(job_context),
+ log_buffer_(log_buffer),
+ db_directory_(db_directory),
+ output_file_directory_(output_file_directory),
+ output_compression_(output_compression),
+ stats_(stats),
+ event_logger_(event_logger),
+ measure_io_stats_(measure_io_stats),
+ sync_output_directory_(sync_output_directory),
+ write_manifest_(write_manifest),
+ edit_(nullptr),
+ base_(nullptr),
+ pick_memtable_called(false),
+ thread_pri_(thread_pri),
+ io_tracer_(io_tracer),
+ clock_(db_options_.clock),
+ full_history_ts_low_(std::move(full_history_ts_low)),
+ blob_callback_(blob_callback),
+ db_impl_seqno_time_mapping_(seqno_time_mapping) {
+ // Update the thread status to indicate flush.
+ ReportStartedFlush();
+ TEST_SYNC_POINT("FlushJob::FlushJob()");
+}
+
+FlushJob::~FlushJob() { ThreadStatusUtil::ResetThreadStatus(); }
+
+void FlushJob::ReportStartedFlush() {
+ ThreadStatusUtil::SetColumnFamily(cfd_, cfd_->ioptions()->env,
+ db_options_.enable_thread_tracking);
+ ThreadStatusUtil::SetThreadOperation(ThreadStatus::OP_FLUSH);
+ ThreadStatusUtil::SetThreadOperationProperty(ThreadStatus::COMPACTION_JOB_ID,
+ job_context_->job_id);
+ IOSTATS_RESET(bytes_written);
+}
+
+void FlushJob::ReportFlushInputSize(const autovector<MemTable*>& mems) {
+ uint64_t input_size = 0;
+ for (auto* mem : mems) {
+ input_size += mem->ApproximateMemoryUsage();
+ }
+ ThreadStatusUtil::IncreaseThreadOperationProperty(
+ ThreadStatus::FLUSH_BYTES_MEMTABLES, input_size);
+}
+
+void FlushJob::RecordFlushIOStats() {
+ RecordTick(stats_, FLUSH_WRITE_BYTES, IOSTATS(bytes_written));
+ ThreadStatusUtil::IncreaseThreadOperationProperty(
+ ThreadStatus::FLUSH_BYTES_WRITTEN, IOSTATS(bytes_written));
+ IOSTATS_RESET(bytes_written);
+}
+void FlushJob::PickMemTable() {
+ db_mutex_->AssertHeld();
+ assert(!pick_memtable_called);
+ pick_memtable_called = true;
+
+ // Maximum "NextLogNumber" of the memtables to flush.
+ // When the mempurge feature is turned off, this variable is redundant:
+ // the memtables are implicitly sorted by increasing creation time, so
+ // mems_->back()->GetNextLogNumber() already equals max_next_log_number.
+ // When mempurge is on, however, the memtables are no longer sorted by
+ // creation time, so mems_->back()->GetNextLogNumber() is not necessarily
+ // equal to max_next_log_number and this variable becomes necessary.
+ uint64_t max_next_log_number = 0;
+
+ // Save the contents of the earliest memtable as a new Table
+ cfd_->imm()->PickMemtablesToFlush(max_memtable_id_, &mems_,
+ &max_next_log_number);
+ if (mems_.empty()) {
+ return;
+ }
+
+ ReportFlushInputSize(mems_);
+
+ // The entries in mems_ are (implicitly) sorted in ascending order by their
+ // creation time. We will use the first memtable's `edit` to keep the meta
+ // info for this flush.
+ MemTable* m = mems_[0];
+ edit_ = m->GetEdits();
+ edit_->SetPrevLogNumber(0);
+ // SetLogNumber(log_num) indicates logs with number smaller than log_num
+ // will no longer be picked up for recovery.
+ edit_->SetLogNumber(max_next_log_number);
+ edit_->SetColumnFamily(cfd_->GetID());
+
+ // path 0 for level 0 file.
+ meta_.fd = FileDescriptor(versions_->NewFileNumber(), 0, 0);
+
+ base_ = cfd_->current();
+ base_->Ref(); // it is likely that we do not need this reference
+}
+
+Status FlushJob::Run(LogsWithPrepTracker* prep_tracker, FileMetaData* file_meta,
+ bool* switched_to_mempurge) {
+ TEST_SYNC_POINT("FlushJob::Start");
+ db_mutex_->AssertHeld();
+ assert(pick_memtable_called);
+ // The mempurge threshold can be changed dynamically; it is saved to a local
+ // variable so that a single, consistent value is used throughout this
+ // FlushJob::Run call.
+ double mempurge_threshold =
+ mutable_cf_options_.experimental_mempurge_threshold;
+
+ AutoThreadOperationStageUpdater stage_run(ThreadStatus::STAGE_FLUSH_RUN);
+ if (mems_.empty()) {
+ ROCKS_LOG_BUFFER(log_buffer_, "[%s] Nothing in memtable to flush",
+ cfd_->GetName().c_str());
+ return Status::OK();
+ }
+
+ // I/O measurement variables
+ PerfLevel prev_perf_level = PerfLevel::kEnableTime;
+ uint64_t prev_write_nanos = 0;
+ uint64_t prev_fsync_nanos = 0;
+ uint64_t prev_range_sync_nanos = 0;
+ uint64_t prev_prepare_write_nanos = 0;
+ uint64_t prev_cpu_write_nanos = 0;
+ uint64_t prev_cpu_read_nanos = 0;
+ if (measure_io_stats_) {
+ prev_perf_level = GetPerfLevel();
+ SetPerfLevel(PerfLevel::kEnableTime);
+ prev_write_nanos = IOSTATS(write_nanos);
+ prev_fsync_nanos = IOSTATS(fsync_nanos);
+ prev_range_sync_nanos = IOSTATS(range_sync_nanos);
+ prev_prepare_write_nanos = IOSTATS(prepare_write_nanos);
+ prev_cpu_write_nanos = IOSTATS(cpu_write_nanos);
+ prev_cpu_read_nanos = IOSTATS(cpu_read_nanos);
+ }
+ Status mempurge_s = Status::NotFound("No MemPurge.");
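+ // MemPurge is attempted only when the threshold is strictly positive, the
+ // flush was triggered by a full write buffer, there is at least one memtable
+ // to purge, MemPurgeDecider() approves, and atomic flush is disabled.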
+ if ((mempurge_threshold > 0.0) &&
+ (cfd_->GetFlushReason() == FlushReason::kWriteBufferFull) &&
+ (!mems_.empty()) && MemPurgeDecider(mempurge_threshold) &&
+ !(db_options_.atomic_flush)) {
+ cfd_->SetMempurgeUsed();
+ mempurge_s = MemPurge();
+ if (!mempurge_s.ok()) {
+ // Mempurge is typically aborted when the output
+ // bytes cannot be contained onto a single output memtable.
+ if (mempurge_s.IsAborted()) {
+ ROCKS_LOG_INFO(db_options_.info_log, "Mempurge process aborted: %s\n",
+ mempurge_s.ToString().c_str());
+ } else {
+ // However the mempurge process can also fail for
+ // other reasons (eg: new_mem->Add() fails).
+ ROCKS_LOG_WARN(db_options_.info_log, "Mempurge process failed: %s\n",
+ mempurge_s.ToString().c_str());
+ }
+ } else {
+ if (switched_to_mempurge) {
+ *switched_to_mempurge = true;
+ } else {
+ // The mempurge process was successful, but no switch_to_mempurge
+ // pointer provided so no way to propagate the state of flush job.
+ ROCKS_LOG_WARN(db_options_.info_log,
+ "Mempurge process succeeded"
+ "but no 'switched_to_mempurge' ptr provided.\n");
+ }
+ }
+ }
+ Status s;
+ if (mempurge_s.ok()) {
+ base_->Unref();
+ s = Status::OK();
+ } else {
+ // This will release and re-acquire the mutex.
+ s = WriteLevel0Table();
+ }
+
+ if (s.ok() && cfd_->IsDropped()) {
+ s = Status::ColumnFamilyDropped("Column family dropped during compaction");
+ }
+ if ((s.ok() || s.IsColumnFamilyDropped()) &&
+ shutting_down_->load(std::memory_order_acquire)) {
+ s = Status::ShutdownInProgress("Database shutdown");
+ }
+
+ if (!s.ok()) {
+ cfd_->imm()->RollbackMemtableFlush(mems_, meta_.fd.GetNumber());
+ } else if (write_manifest_) {
+ TEST_SYNC_POINT("FlushJob::InstallResults");
+ // Replace immutable memtable with the generated Table
+ s = cfd_->imm()->TryInstallMemtableFlushResults(
+ cfd_, mutable_cf_options_, mems_, prep_tracker, versions_, db_mutex_,
+ meta_.fd.GetNumber(), &job_context_->memtables_to_free, db_directory_,
+ log_buffer_, &committed_flush_jobs_info_,
+ !(mempurge_s.ok()) /* write_edit : true if no mempurge happened (or if aborted),
+ but 'false' if mempurge successful: no new min log number
+ or new level 0 file path to write to manifest. */);
+ }
+
+ if (s.ok() && file_meta != nullptr) {
+ *file_meta = meta_;
+ }
+ RecordFlushIOStats();
+
+ // When measure_io_stats_ is true, the default 512 bytes is not enough.
+ auto stream = event_logger_->LogToBuffer(log_buffer_, 1024);
+ stream << "job" << job_context_->job_id << "event"
+ << "flush_finished";
+ stream << "output_compression"
+ << CompressionTypeToString(output_compression_);
+ stream << "lsm_state";
+ stream.StartArray();
+ auto vstorage = cfd_->current()->storage_info();
+ for (int level = 0; level < vstorage->num_levels(); ++level) {
+ stream << vstorage->NumLevelFiles(level);
+ }
+ stream.EndArray();
+
+ const auto& blob_files = vstorage->GetBlobFiles();
+ if (!blob_files.empty()) {
+ assert(blob_files.front());
+ stream << "blob_file_head" << blob_files.front()->GetBlobFileNumber();
+
+ assert(blob_files.back());
+ stream << "blob_file_tail" << blob_files.back()->GetBlobFileNumber();
+ }
+
+ stream << "immutable_memtables" << cfd_->imm()->NumNotFlushed();
+
+ if (measure_io_stats_) {
+ if (prev_perf_level != PerfLevel::kEnableTime) {
+ SetPerfLevel(prev_perf_level);
+ }
+ stream << "file_write_nanos" << (IOSTATS(write_nanos) - prev_write_nanos);
+ stream << "file_range_sync_nanos"
+ << (IOSTATS(range_sync_nanos) - prev_range_sync_nanos);
+ stream << "file_fsync_nanos" << (IOSTATS(fsync_nanos) - prev_fsync_nanos);
+ stream << "file_prepare_write_nanos"
+ << (IOSTATS(prepare_write_nanos) - prev_prepare_write_nanos);
+ stream << "file_cpu_write_nanos"
+ << (IOSTATS(cpu_write_nanos) - prev_cpu_write_nanos);
+ stream << "file_cpu_read_nanos"
+ << (IOSTATS(cpu_read_nanos) - prev_cpu_read_nanos);
+ }
+
+ return s;
+}
+
+void FlushJob::Cancel() {
+ db_mutex_->AssertHeld();
+ assert(base_ != nullptr);
+ base_->Unref();
+}
+
+Status FlushJob::MemPurge() {
+ Status s;
+ db_mutex_->AssertHeld();
+ db_mutex_->Unlock();
+ assert(!mems_.empty());
+
+ // Measure purging time.
+ const uint64_t start_micros = clock_->NowMicros();
+ const uint64_t start_cpu_micros = clock_->CPUMicros();
+
+ MemTable* new_mem = nullptr;
+ // For performance/log investigation purposes:
+ // look at how much useful payload we harvest in the new_mem.
+ // This value is then printed to the DB log.
+ double new_mem_capacity = 0.0;
+
+ // Create two iterators, one for the memtable data (contains
+ // info from puts + deletes), and one for the memtable
+ // Range Tombstones (from DeleteRanges).
+ ReadOptions ro;
+ ro.total_order_seek = true;
+ Arena arena;
+ std::vector<InternalIterator*> memtables;
+ std::vector<std::unique_ptr<FragmentedRangeTombstoneIterator>>
+ range_del_iters;
+ for (MemTable* m : mems_) {
+ memtables.push_back(m->NewIterator(ro, &arena));
+ auto* range_del_iter = m->NewRangeTombstoneIterator(
+ ro, kMaxSequenceNumber, true /* immutable_memtable */);
+ if (range_del_iter != nullptr) {
+ range_del_iters.emplace_back(range_del_iter);
+ }
+ }
+
+ assert(!memtables.empty());
+ SequenceNumber first_seqno = kMaxSequenceNumber;
+ SequenceNumber earliest_seqno = kMaxSequenceNumber;
+ // Pick first and earliest seqno as min of all first_seqno
+ // and earliest_seqno of the mempurged memtables.
+ for (const auto& mem : mems_) {
+ first_seqno = mem->GetFirstSequenceNumber() < first_seqno
+ ? mem->GetFirstSequenceNumber()
+ : first_seqno;
+ earliest_seqno = mem->GetEarliestSequenceNumber() < earliest_seqno
+ ? mem->GetEarliestSequenceNumber()
+ : earliest_seqno;
+ }
+
+ ScopedArenaIterator iter(
+ NewMergingIterator(&(cfd_->internal_comparator()), memtables.data(),
+ static_cast<int>(memtables.size()), &arena));
+
+ auto* ioptions = cfd_->ioptions();
+
+ // Place iterator at the First (meaning most recent) key node.
+ iter->SeekToFirst();
+
+ const std::string* const full_history_ts_low = &(cfd_->GetFullHistoryTsLow());
+ std::unique_ptr<CompactionRangeDelAggregator> range_del_agg(
+ new CompactionRangeDelAggregator(&(cfd_->internal_comparator()),
+ existing_snapshots_,
+ full_history_ts_low));
+ for (auto& rd_iter : range_del_iters) {
+ range_del_agg->AddTombstones(std::move(rd_iter));
+ }
+
+ // If there is valid data in the memtable,
+ // or at least range tombstones, copy over the info
+ // to the new memtable.
+ if (iter->Valid() || !range_del_agg->IsEmpty()) {
+ // MaxSize is the size of a memtable.
+ size_t maxSize = mutable_cf_options_.write_buffer_size;
+ std::unique_ptr<CompactionFilter> compaction_filter;
+ if (ioptions->compaction_filter_factory != nullptr &&
+ ioptions->compaction_filter_factory->ShouldFilterTableFileCreation(
+ TableFileCreationReason::kFlush)) {
+ CompactionFilter::Context ctx;
+ ctx.is_full_compaction = false;
+ ctx.is_manual_compaction = false;
+ ctx.column_family_id = cfd_->GetID();
+ ctx.reason = TableFileCreationReason::kFlush;
+ compaction_filter =
+ ioptions->compaction_filter_factory->CreateCompactionFilter(ctx);
+ if (compaction_filter != nullptr &&
+ !compaction_filter->IgnoreSnapshots()) {
+ s = Status::NotSupported(
+ "CompactionFilter::IgnoreSnapshots() = false is not supported "
+ "anymore.");
+ return s;
+ }
+ }
+
+ new_mem = new MemTable((cfd_->internal_comparator()), *(cfd_->ioptions()),
+ mutable_cf_options_, cfd_->write_buffer_mgr(),
+ earliest_seqno, cfd_->GetID());
+ assert(new_mem != nullptr);
+
+ Env* env = db_options_.env;
+ assert(env);
+ MergeHelper merge(
+ env, (cfd_->internal_comparator()).user_comparator(),
+ (ioptions->merge_operator).get(), compaction_filter.get(),
+ ioptions->logger, true /* internal key corruption is not ok */,
+ existing_snapshots_.empty() ? 0 : existing_snapshots_.back(),
+ snapshot_checker_);
+ assert(job_context_);
+ SequenceNumber job_snapshot_seq = job_context_->GetJobSnapshotSequence();
+ const std::atomic<bool> kManualCompactionCanceledFalse{false};
+ CompactionIterator c_iter(
+ iter.get(), (cfd_->internal_comparator()).user_comparator(), &merge,
+ kMaxSequenceNumber, &existing_snapshots_,
+ earliest_write_conflict_snapshot_, job_snapshot_seq, snapshot_checker_,
+ env, ShouldReportDetailedTime(env, ioptions->stats),
+ true /* internal key corruption is not ok */, range_del_agg.get(),
+ nullptr, ioptions->allow_data_in_errors,
+ ioptions->enforce_single_del_contracts,
+ /*manual_compaction_canceled=*/kManualCompactionCanceledFalse,
+ /*compaction=*/nullptr, compaction_filter.get(),
+ /*shutting_down=*/nullptr, ioptions->info_log, full_history_ts_low);
+
+ // Set earliest sequence number in the new memtable
+ // to be equal to the earliest sequence number of the
+ // memtable being flushed (See later if there is a need
+ // to update this number!).
+ new_mem->SetEarliestSequenceNumber(earliest_seqno);
+ // Likewise for first seq number.
+ new_mem->SetFirstSequenceNumber(first_seqno);
+ SequenceNumber new_first_seqno = kMaxSequenceNumber;
+
+ c_iter.SeekToFirst();
+
+ // Key transfer
+ for (; c_iter.Valid(); c_iter.Next()) {
+ const ParsedInternalKey ikey = c_iter.ikey();
+ const Slice value = c_iter.value();
+ new_first_seqno =
+ ikey.sequence < new_first_seqno ? ikey.sequence : new_first_seqno;
+
+ // Should we update "OldestKeyTime"? Timestamps appear
+ // to still be an "experimental" feature.
+ s = new_mem->Add(
+ ikey.sequence, ikey.type, ikey.user_key, value,
+ nullptr, // KV protection info set as nullptr since it
+ // should only be useful for the first add to
+ // the original memtable.
+ false, // : allow concurrent_memtable_writes_
+ // Not seen as necessary for now.
+ nullptr, // get_post_process_info(m) must be nullptr
+ // when concurrent_memtable_writes is switched off.
+ nullptr); // hint, only used when concurrent_memtable_writes_
+ // is switched on.
+ if (!s.ok()) {
+ break;
+ }
+
+ // If new_mem has size greater than maxSize,
+        // then roll back to a regular flush operation,
+ // and destroy new_mem.
+ if (new_mem->ApproximateMemoryUsage() > maxSize) {
+ s = Status::Aborted("Mempurge filled more than one memtable.");
+ new_mem_capacity = 1.0;
+ break;
+ }
+ }
+
+ // Check status and propagate
+ // potential error status from c_iter
+ if (!s.ok()) {
+ c_iter.status().PermitUncheckedError();
+ } else if (!c_iter.status().ok()) {
+ s = c_iter.status();
+ }
+
+ // Range tombstone transfer.
+ if (s.ok()) {
+ auto range_del_it = range_del_agg->NewIterator();
+ for (range_del_it->SeekToFirst(); range_del_it->Valid();
+ range_del_it->Next()) {
+ auto tombstone = range_del_it->Tombstone();
+ new_first_seqno =
+ tombstone.seq_ < new_first_seqno ? tombstone.seq_ : new_first_seqno;
+ s = new_mem->Add(
+ tombstone.seq_, // Sequence number
+ kTypeRangeDeletion, // KV type
+ tombstone.start_key_, // Key is start key.
+ tombstone.end_key_, // Value is end key.
+ nullptr, // KV protection info set as nullptr since it
+ // should only be useful for the first add to
+ // the original memtable.
+ false, // : allow concurrent_memtable_writes_
+ // Not seen as necessary for now.
+ nullptr, // get_post_process_info(m) must be nullptr
+ // when concurrent_memtable_writes is switched off.
+ nullptr); // hint, only used when concurrent_memtable_writes_
+ // is switched on.
+
+ if (!s.ok()) {
+ break;
+ }
+
+ // If new_mem has size greater than maxSize,
+          // then roll back to a regular flush operation,
+ // and destroy new_mem.
+ if (new_mem->ApproximateMemoryUsage() > maxSize) {
+ s = Status::Aborted(Slice("Mempurge filled more than one memtable."));
+ new_mem_capacity = 1.0;
+ break;
+ }
+ }
+ }
+
+ // If everything happened smoothly and new_mem contains valid data,
+ // decide if it is flushed to storage or kept in the imm()
+ // memtable list (memory).
+ if (s.ok() && (new_first_seqno != kMaxSequenceNumber)) {
+ // Rectify the first sequence number, which (unlike the earliest seq
+ // number) needs to be present in the new memtable.
+ new_mem->SetFirstSequenceNumber(new_first_seqno);
+
+ // The new_mem is added to the list of immutable memtables
+      // only if it is filled to less than 100% capacity and isn't flagged
+ // as in need of being flushed.
+ if (new_mem->ApproximateMemoryUsage() < maxSize &&
+ !(new_mem->ShouldFlushNow())) {
+ // Construct fragmented memtable range tombstones without mutex
+ new_mem->ConstructFragmentedRangeTombstones();
+ db_mutex_->Lock();
+ uint64_t new_mem_id = mems_[0]->GetID();
+
+ new_mem->SetID(new_mem_id);
+ new_mem->SetNextLogNumber(mems_[0]->GetNextLogNumber());
+
+ // This addition will not trigger another flush, because
+ // we do not call SchedulePendingFlush().
+ cfd_->imm()->Add(new_mem, &job_context_->memtables_to_free);
+ new_mem->Ref();
+#ifndef ROCKSDB_LITE
+ // Piggyback FlushJobInfo on the first flushed memtable.
+ db_mutex_->AssertHeld();
+ meta_.fd.file_size = 0;
+ mems_[0]->SetFlushJobInfo(GetFlushJobInfo());
+#endif // !ROCKSDB_LITE
+ db_mutex_->Unlock();
+ } else {
+ s = Status::Aborted(Slice("Mempurge filled more than one memtable."));
+ new_mem_capacity = 1.0;
+ if (new_mem) {
+ job_context_->memtables_to_free.push_back(new_mem);
+ }
+ }
+ } else {
+ // In this case, the newly allocated new_mem is empty.
+ assert(new_mem != nullptr);
+ job_context_->memtables_to_free.push_back(new_mem);
+ }
+ }
+
+ // Reacquire the mutex for WriteLevel0 function.
+ db_mutex_->Lock();
+
+ // If mempurge successful, don't write input tables to level0,
+ // but write any full output table to level0.
+ if (s.ok()) {
+ TEST_SYNC_POINT("DBImpl::FlushJob:MemPurgeSuccessful");
+ } else {
+ TEST_SYNC_POINT("DBImpl::FlushJob:MemPurgeUnsuccessful");
+ }
+ const uint64_t micros = clock_->NowMicros() - start_micros;
+ const uint64_t cpu_micros = clock_->CPUMicros() - start_cpu_micros;
+ ROCKS_LOG_INFO(db_options_.info_log,
+ "[%s] [JOB %d] Mempurge lasted %" PRIu64
+ " microseconds, and %" PRIu64
+ " cpu "
+ "microseconds. Status is %s ok. Perc capacity: %f\n",
+ cfd_->GetName().c_str(), job_context_->job_id, micros,
+ cpu_micros, s.ok() ? "" : "not", new_mem_capacity);
+
+ return s;
+}
+
+bool FlushJob::MemPurgeDecider(double threshold) {
+ // Never trigger mempurge if threshold is not a strictly positive value.
+ if (!(threshold > 0.0)) {
+ return false;
+ }
+ if (threshold > (1.0 * mems_.size())) {
+ return true;
+ }
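+  // Illustrative examples of the two early returns above (numbers are
+  // assumed, not taken from any particular workload): with two immutable
+  // memtables, a threshold of 4.0 satisfies threshold > mems_.size(), so
+  // the decider always opts for mempurge without sampling, while any
+  // threshold <= 0.0 disables mempurge entirely.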
+ // Payload and useful_payload (in bytes).
+ // The useful payload ratio of a given MemTable
+ // is estimated to be useful_payload/payload.
+ uint64_t payload = 0, useful_payload = 0, entry_size = 0;
+
+ // Local variables used repetitively inside the for-loop
+ // when iterating over the sampled entries.
+ Slice key_slice, value_slice;
+ ParsedInternalKey res;
+ SnapshotImpl min_snapshot;
+ std::string vget;
+ Status mget_s, parse_s;
+ MergeContext merge_context;
+ SequenceNumber max_covering_tombstone_seq = 0, sqno = 0,
+ min_seqno_snapshot = 0;
+ bool get_res, can_be_useful_payload, not_in_next_mems;
+
+  // If the estimated useful payload, as a fraction of the write buffer
+  // size, is above the threshold, then flush to storage; else MemPurge.
+ double estimated_useful_payload = 0.0;
+ // Cochran formula for determining sample size.
+ // 95% confidence interval, 7% precision.
+ // n0 = (1.96*1.96)*0.25/(0.07*0.07) = 196.0
+ double n0 = 196.0;
+ ReadOptions ro;
+ ro.total_order_seek = true;
+
+ // Iterate over each memtable of the set.
+ for (auto mem_iter = std::begin(mems_); mem_iter != std::end(mems_);
+ mem_iter++) {
+ MemTable* mt = *mem_iter;
+
+    // Sample entries from this memtable.
+ uint64_t nentries = mt->num_entries();
+ // Corrected Cochran formula for small populations
+ // (converges to n0 for large populations).
+ uint64_t target_sample_size =
+ static_cast<uint64_t>(ceil(n0 / (1.0 + (n0 / nentries))));
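+    // Worked example of the corrected formula (illustrative numbers): with
+    // nentries = 500, target_sample_size = ceil(196 / (1 + 196/500)) =
+    // ceil(140.8) = 141; with nentries = 50000 it is ceil(195.2) = 196,
+    // so the correction only matters for small memtables.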
+ std::unordered_set<const char*> sentries = {};
+ // Populate sample entries set.
+ mt->UniqueRandomSample(target_sample_size, &sentries);
+
+ // Estimate the garbage ratio by comparing if
+ // each sample corresponds to a valid entry.
+ for (const char* ss : sentries) {
+ key_slice = GetLengthPrefixedSlice(ss);
+ parse_s = ParseInternalKey(key_slice, &res, true /*log_err_key*/);
+ if (!parse_s.ok()) {
+ ROCKS_LOG_WARN(db_options_.info_log,
+ "Memtable Decider: ParseInternalKey did not parse "
+ "key_slice %s successfully.",
+ key_slice.data());
+ }
+
+ // Size of the entry is "key size (+ value size if KV entry)"
+ entry_size = key_slice.size();
+ if (res.type == kTypeValue) {
+ value_slice =
+ GetLengthPrefixedSlice(key_slice.data() + key_slice.size());
+ entry_size += value_slice.size();
+ }
+
+ // Count entry bytes as payload.
+ payload += entry_size;
+
+ LookupKey lkey(res.user_key, kMaxSequenceNumber);
+
+ // Paranoia: zero out these values just in case.
+ max_covering_tombstone_seq = 0;
+ sqno = 0;
+
+ // Pick the oldest existing snapshot that is more recent
+ // than the sequence number of the sampled entry.
+ min_seqno_snapshot = kMaxSequenceNumber;
+ for (SequenceNumber seq_num : existing_snapshots_) {
+ if (seq_num > res.sequence && seq_num < min_seqno_snapshot) {
+ min_seqno_snapshot = seq_num;
+ }
+ }
+ min_snapshot.number_ = min_seqno_snapshot;
+ ro.snapshot =
+ min_seqno_snapshot < kMaxSequenceNumber ? &min_snapshot : nullptr;
+
+ // Estimate if the sample entry is valid or not.
+ get_res = mt->Get(lkey, &vget, /*columns=*/nullptr, /*timestamp=*/nullptr,
+ &mget_s, &merge_context, &max_covering_tombstone_seq,
+ &sqno, ro, true /* immutable_memtable */);
+ if (!get_res) {
+ ROCKS_LOG_WARN(
+ db_options_.info_log,
+ "Memtable Get returned false when Get(sampled entry). "
+ "Yet each sample entry should exist somewhere in the memtable, "
+ "unrelated to whether it has been deleted or not.");
+ }
+
+ // TODO(bjlemaire): evaluate typeMerge.
+ // This is where the sampled entry is estimated to be
+ // garbage or not. Note that this is a garbage *estimation*
+ // because we do not include certain items such as
+      // CompactionFilters triggered at flush, or if the same delete
+ // has been inserted twice or more in the memtable.
+
+ // Evaluate if the entry can be useful payload
+ // Situation #1: entry is a KV entry, was found in the memtable mt
+ // and the sequence numbers match.
+ can_be_useful_payload = (res.type == kTypeValue) && get_res &&
+ mget_s.ok() && (sqno == res.sequence);
+
+ // Situation #2: entry is a delete entry, was found in the memtable mt
+      // (because get_res == true) and no valid KV entry is found.
+ // (note: duplicate delete entries are also taken into
+ // account here, because the sequence number 'sqno'
+ // in memtable->Get(&sqno) operation is set to be equal
+ // to the most recent delete entry as well).
+ can_be_useful_payload |=
+ ((res.type == kTypeDeletion) || (res.type == kTypeSingleDeletion)) &&
+ mget_s.IsNotFound() && get_res && (sqno == res.sequence);
+
+ // If there is a chance that the entry is useful payload
+ // Verify that the entry does not appear in the following memtables
+ // (memtables with greater memtable ID/larger sequence numbers).
+ if (can_be_useful_payload) {
+ not_in_next_mems = true;
+ for (auto next_mem_iter = mem_iter + 1;
+ next_mem_iter != std::end(mems_); next_mem_iter++) {
+ if ((*next_mem_iter)
+ ->Get(lkey, &vget, /*columns=*/nullptr, /*timestamp=*/nullptr,
+ &mget_s, &merge_context, &max_covering_tombstone_seq,
+ &sqno, ro, true /* immutable_memtable */)) {
+ not_in_next_mems = false;
+ break;
+ }
+ }
+ if (not_in_next_mems) {
+ useful_payload += entry_size;
+ }
+ }
+ }
+ if (payload > 0) {
+ // We use the estimated useful payload ratio to
+ // evaluate how many of the memtable bytes are useful bytes.
+ estimated_useful_payload +=
+ (mt->ApproximateMemoryUsage()) * (useful_payload * 1.0 / payload);
+
+ ROCKS_LOG_INFO(db_options_.info_log,
+ "Mempurge sampling [CF %s] - found garbage ratio from "
+ "sampling: %f. Threshold is %f\n",
+ cfd_->GetName().c_str(),
+ (payload - useful_payload) * 1.0 / payload, threshold);
+ } else {
+      ROCKS_LOG_WARN(db_options_.info_log,
+                     "Mempurge sampling: null payload measured, and collected "
+                     "sample size is %zu.\n",
+                     sentries.size());
+ }
+ }
+ // We convert the total number of useful payload bytes
+ // into the proportion of memtable necessary to store all these bytes.
+ // We compare this proportion with the threshold value.
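+  // Worked example (assumed numbers): with write_buffer_size = 64 MB, an
+  // estimated_useful_payload of 16 MB and a threshold of 1.0, the ratio is
+  // 0.25 < 1.0, so the decider returns true and a mempurge is attempted.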
+ return ((estimated_useful_payload / mutable_cf_options_.write_buffer_size) <
+ threshold);
+}
+
+Status FlushJob::WriteLevel0Table() {
+ AutoThreadOperationStageUpdater stage_updater(
+ ThreadStatus::STAGE_FLUSH_WRITE_L0);
+ db_mutex_->AssertHeld();
+ const uint64_t start_micros = clock_->NowMicros();
+ const uint64_t start_cpu_micros = clock_->CPUMicros();
+ Status s;
+
+ SequenceNumber smallest_seqno = mems_.front()->GetEarliestSequenceNumber();
+ if (!db_impl_seqno_time_mapping_.Empty()) {
+    // Make a local copy, as the seqno_time_mapping from db_impl is not
+    // thread safe and will be used while not holding the db_mutex.
+ seqno_to_time_mapping_ = db_impl_seqno_time_mapping_.Copy(smallest_seqno);
+ }
+
+ std::vector<BlobFileAddition> blob_file_additions;
+
+ {
+ auto write_hint = cfd_->CalculateSSTWriteHint(0);
+ Env::IOPriority io_priority = GetRateLimiterPriorityForWrite();
+ db_mutex_->Unlock();
+ if (log_buffer_) {
+ log_buffer_->FlushBufferToLog();
+ }
+ // memtables and range_del_iters store internal iterators over each data
+ // memtable and its associated range deletion memtable, respectively, at
+ // corresponding indexes.
+ std::vector<InternalIterator*> memtables;
+ std::vector<std::unique_ptr<FragmentedRangeTombstoneIterator>>
+ range_del_iters;
+ ReadOptions ro;
+ ro.total_order_seek = true;
+ Arena arena;
+ uint64_t total_num_entries = 0, total_num_deletes = 0;
+ uint64_t total_data_size = 0;
+ size_t total_memory_usage = 0;
+ // Used for testing:
+ uint64_t mems_size = mems_.size();
+ (void)mems_size; // avoids unused variable error when
+ // TEST_SYNC_POINT_CALLBACK not used.
+ TEST_SYNC_POINT_CALLBACK("FlushJob::WriteLevel0Table:num_memtables",
+ &mems_size);
+ assert(job_context_);
+ for (MemTable* m : mems_) {
+ ROCKS_LOG_INFO(
+ db_options_.info_log,
+ "[%s] [JOB %d] Flushing memtable with next log file: %" PRIu64 "\n",
+ cfd_->GetName().c_str(), job_context_->job_id, m->GetNextLogNumber());
+ memtables.push_back(m->NewIterator(ro, &arena));
+ auto* range_del_iter = m->NewRangeTombstoneIterator(
+ ro, kMaxSequenceNumber, true /* immutable_memtable */);
+ if (range_del_iter != nullptr) {
+ range_del_iters.emplace_back(range_del_iter);
+ }
+ total_num_entries += m->num_entries();
+ total_num_deletes += m->num_deletes();
+ total_data_size += m->get_data_size();
+ total_memory_usage += m->ApproximateMemoryUsage();
+ }
+
+ event_logger_->Log() << "job" << job_context_->job_id << "event"
+ << "flush_started"
+ << "num_memtables" << mems_.size() << "num_entries"
+ << total_num_entries << "num_deletes"
+ << total_num_deletes << "total_data_size"
+ << total_data_size << "memory_usage"
+ << total_memory_usage << "flush_reason"
+ << GetFlushReasonString(cfd_->GetFlushReason());
+
+ {
+ ScopedArenaIterator iter(
+ NewMergingIterator(&cfd_->internal_comparator(), memtables.data(),
+ static_cast<int>(memtables.size()), &arena));
+ ROCKS_LOG_INFO(db_options_.info_log,
+ "[%s] [JOB %d] Level-0 flush table #%" PRIu64 ": started",
+ cfd_->GetName().c_str(), job_context_->job_id,
+ meta_.fd.GetNumber());
+
+ TEST_SYNC_POINT_CALLBACK("FlushJob::WriteLevel0Table:output_compression",
+ &output_compression_);
+ int64_t _current_time = 0;
+ auto status = clock_->GetCurrentTime(&_current_time);
+ // Safe to proceed even if GetCurrentTime fails. So, log and proceed.
+ if (!status.ok()) {
+ ROCKS_LOG_WARN(
+ db_options_.info_log,
+ "Failed to get current time to populate creation_time property. "
+ "Status: %s",
+ status.ToString().c_str());
+ }
+ const uint64_t current_time = static_cast<uint64_t>(_current_time);
+
+ uint64_t oldest_key_time = mems_.front()->ApproximateOldestKeyTime();
+
+ // It's not clear whether oldest_key_time is always available. In case
+ // it is not available, use current_time.
+ uint64_t oldest_ancester_time = std::min(current_time, oldest_key_time);
+
+ TEST_SYNC_POINT_CALLBACK(
+ "FlushJob::WriteLevel0Table:oldest_ancester_time",
+ &oldest_ancester_time);
+ meta_.oldest_ancester_time = oldest_ancester_time;
+ meta_.file_creation_time = current_time;
+
+ uint64_t num_input_entries = 0;
+ uint64_t memtable_payload_bytes = 0;
+ uint64_t memtable_garbage_bytes = 0;
+ IOStatus io_s;
+
+ const std::string* const full_history_ts_low =
+ (full_history_ts_low_.empty()) ? nullptr : &full_history_ts_low_;
+ TableBuilderOptions tboptions(
+ *cfd_->ioptions(), mutable_cf_options_, cfd_->internal_comparator(),
+ cfd_->int_tbl_prop_collector_factories(), output_compression_,
+ mutable_cf_options_.compression_opts, cfd_->GetID(), cfd_->GetName(),
+ 0 /* level */, false /* is_bottommost */,
+ TableFileCreationReason::kFlush, oldest_key_time, current_time,
+ db_id_, db_session_id_, 0 /* target_file_size */,
+ meta_.fd.GetNumber());
+ const SequenceNumber job_snapshot_seq =
+ job_context_->GetJobSnapshotSequence();
+ s = BuildTable(
+ dbname_, versions_, db_options_, tboptions, file_options_,
+ cfd_->table_cache(), iter.get(), std::move(range_del_iters), &meta_,
+ &blob_file_additions, existing_snapshots_,
+ earliest_write_conflict_snapshot_, job_snapshot_seq,
+ snapshot_checker_, mutable_cf_options_.paranoid_file_checks,
+ cfd_->internal_stats(), &io_s, io_tracer_,
+ BlobFileCreationReason::kFlush, seqno_to_time_mapping_, event_logger_,
+ job_context_->job_id, io_priority, &table_properties_, write_hint,
+ full_history_ts_low, blob_callback_, &num_input_entries,
+ &memtable_payload_bytes, &memtable_garbage_bytes);
+ // TODO: Cleanup io_status in BuildTable and table builders
+ assert(!s.ok() || io_s.ok());
+ io_s.PermitUncheckedError();
+ if (num_input_entries != total_num_entries && s.ok()) {
+ std::string msg = "Expected " + std::to_string(total_num_entries) +
+ " entries in memtables, but read " +
+ std::to_string(num_input_entries);
+ ROCKS_LOG_WARN(db_options_.info_log, "[%s] [JOB %d] Level-0 flush %s",
+ cfd_->GetName().c_str(), job_context_->job_id,
+ msg.c_str());
+ if (db_options_.flush_verify_memtable_count) {
+ s = Status::Corruption(msg);
+ }
+ }
+ if (tboptions.reason == TableFileCreationReason::kFlush) {
+ TEST_SYNC_POINT("DBImpl::FlushJob:Flush");
+ RecordTick(stats_, MEMTABLE_PAYLOAD_BYTES_AT_FLUSH,
+ memtable_payload_bytes);
+ RecordTick(stats_, MEMTABLE_GARBAGE_BYTES_AT_FLUSH,
+ memtable_garbage_bytes);
+ }
+ LogFlush(db_options_.info_log);
+ }
+ ROCKS_LOG_BUFFER(log_buffer_,
+ "[%s] [JOB %d] Level-0 flush table #%" PRIu64 ": %" PRIu64
+ " bytes %s"
+ "%s",
+ cfd_->GetName().c_str(), job_context_->job_id,
+ meta_.fd.GetNumber(), meta_.fd.GetFileSize(),
+ s.ToString().c_str(),
+ meta_.marked_for_compaction ? " (needs compaction)" : "");
+
+ if (s.ok() && output_file_directory_ != nullptr && sync_output_directory_) {
+ s = output_file_directory_->FsyncWithDirOptions(
+ IOOptions(), nullptr,
+ DirFsyncOptions(DirFsyncOptions::FsyncReason::kNewFileSynced));
+ }
+ TEST_SYNC_POINT_CALLBACK("FlushJob::WriteLevel0Table", &mems_);
+ db_mutex_->Lock();
+ }
+ base_->Unref();
+
+ // Note that if file_size is zero, the file has been deleted and
+ // should not be added to the manifest.
+ const bool has_output = meta_.fd.GetFileSize() > 0;
+
+ if (s.ok() && has_output) {
+ TEST_SYNC_POINT("DBImpl::FlushJob:SSTFileCreated");
+ // if we have more than 1 background thread, then we cannot
+ // insert files directly into higher levels because some other
+ // threads could be concurrently producing compacted files for
+ // that key range.
+ // Add file to L0
+ edit_->AddFile(0 /* level */, meta_.fd.GetNumber(), meta_.fd.GetPathId(),
+ meta_.fd.GetFileSize(), meta_.smallest, meta_.largest,
+ meta_.fd.smallest_seqno, meta_.fd.largest_seqno,
+ meta_.marked_for_compaction, meta_.temperature,
+ meta_.oldest_blob_file_number, meta_.oldest_ancester_time,
+ meta_.file_creation_time, meta_.file_checksum,
+ meta_.file_checksum_func_name, meta_.unique_id);
+
+ edit_->SetBlobFileAdditions(std::move(blob_file_additions));
+ }
+#ifndef ROCKSDB_LITE
+  // Piggyback FlushJobInfo on the first flushed memtable.
+ mems_[0]->SetFlushJobInfo(GetFlushJobInfo());
+#endif // !ROCKSDB_LITE
+
+ // Note that here we treat flush as level 0 compaction in internal stats
+ InternalStats::CompactionStats stats(CompactionReason::kFlush, 1);
+ const uint64_t micros = clock_->NowMicros() - start_micros;
+ const uint64_t cpu_micros = clock_->CPUMicros() - start_cpu_micros;
+ stats.micros = micros;
+ stats.cpu_micros = cpu_micros;
+
+ ROCKS_LOG_INFO(db_options_.info_log,
+ "[%s] [JOB %d] Flush lasted %" PRIu64
+ " microseconds, and %" PRIu64 " cpu microseconds.\n",
+ cfd_->GetName().c_str(), job_context_->job_id, micros,
+ cpu_micros);
+
+ if (has_output) {
+ stats.bytes_written = meta_.fd.GetFileSize();
+ stats.num_output_files = 1;
+ }
+
+ const auto& blobs = edit_->GetBlobFileAdditions();
+ for (const auto& blob : blobs) {
+ stats.bytes_written_blob += blob.GetTotalBlobBytes();
+ }
+
+ stats.num_output_files_blob = static_cast<int>(blobs.size());
+
+ RecordTimeToHistogram(stats_, FLUSH_TIME, stats.micros);
+ cfd_->internal_stats()->AddCompactionStats(0 /* level */, thread_pri_, stats);
+ cfd_->internal_stats()->AddCFStats(
+ InternalStats::BYTES_FLUSHED,
+ stats.bytes_written + stats.bytes_written_blob);
+ RecordFlushIOStats();
+
+ return s;
+}
+
+Env::IOPriority FlushJob::GetRateLimiterPriorityForWrite() {
+ if (versions_ && versions_->GetColumnFamilySet() &&
+ versions_->GetColumnFamilySet()->write_controller()) {
+ WriteController* write_controller =
+ versions_->GetColumnFamilySet()->write_controller();
+ if (write_controller->IsStopped() || write_controller->NeedsDelay()) {
+ return Env::IO_USER;
+ }
+ }
+
+ return Env::IO_HIGH;
+}
+
+#ifndef ROCKSDB_LITE
+std::unique_ptr<FlushJobInfo> FlushJob::GetFlushJobInfo() const {
+ db_mutex_->AssertHeld();
+ std::unique_ptr<FlushJobInfo> info(new FlushJobInfo{});
+ info->cf_id = cfd_->GetID();
+ info->cf_name = cfd_->GetName();
+
+ const uint64_t file_number = meta_.fd.GetNumber();
+ info->file_path =
+ MakeTableFileName(cfd_->ioptions()->cf_paths[0].path, file_number);
+ info->file_number = file_number;
+ info->oldest_blob_file_number = meta_.oldest_blob_file_number;
+ info->thread_id = db_options_.env->GetThreadID();
+ info->job_id = job_context_->job_id;
+ info->smallest_seqno = meta_.fd.smallest_seqno;
+ info->largest_seqno = meta_.fd.largest_seqno;
+ info->table_properties = table_properties_;
+ info->flush_reason = cfd_->GetFlushReason();
+ info->blob_compression_type = mutable_cf_options_.blob_compression_type;
+
+ // Update BlobFilesInfo.
+ for (const auto& blob_file : edit_->GetBlobFileAdditions()) {
+ BlobFileAdditionInfo blob_file_addition_info(
+ BlobFileName(cfd_->ioptions()->cf_paths.front().path,
+ blob_file.GetBlobFileNumber()) /*blob_file_path*/,
+ blob_file.GetBlobFileNumber(), blob_file.GetTotalBlobCount(),
+ blob_file.GetTotalBlobBytes());
+ info->blob_file_addition_infos.emplace_back(
+ std::move(blob_file_addition_info));
+ }
+ return info;
+}
+#endif // !ROCKSDB_LITE
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/flush_job.h b/src/rocksdb/db/flush_job.h
new file mode 100644
index 000000000..60c272aec
--- /dev/null
+++ b/src/rocksdb/db/flush_job.h
@@ -0,0 +1,203 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+
+#include <atomic>
+#include <deque>
+#include <limits>
+#include <list>
+#include <set>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "db/blob/blob_file_completion_callback.h"
+#include "db/column_family.h"
+#include "db/flush_scheduler.h"
+#include "db/internal_stats.h"
+#include "db/job_context.h"
+#include "db/log_writer.h"
+#include "db/logs_with_prep_tracker.h"
+#include "db/memtable_list.h"
+#include "db/seqno_to_time_mapping.h"
+#include "db/snapshot_impl.h"
+#include "db/version_edit.h"
+#include "db/write_controller.h"
+#include "db/write_thread.h"
+#include "logging/event_logger.h"
+#include "monitoring/instrumented_mutex.h"
+#include "options/db_options.h"
+#include "port/port.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/listener.h"
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/transaction_log.h"
+#include "table/scoped_arena_iterator.h"
+#include "util/autovector.h"
+#include "util/stop_watch.h"
+#include "util/thread_local.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBImpl;
+class MemTable;
+class SnapshotChecker;
+class TableCache;
+class Version;
+class VersionEdit;
+class VersionSet;
+class Arena;
+
+class FlushJob {
+ public:
+ // TODO(icanadi) make effort to reduce number of parameters here
+ // IMPORTANT: mutable_cf_options needs to be alive while FlushJob is alive
+ FlushJob(const std::string& dbname, ColumnFamilyData* cfd,
+ const ImmutableDBOptions& db_options,
+ const MutableCFOptions& mutable_cf_options, uint64_t max_memtable_id,
+ const FileOptions& file_options, VersionSet* versions,
+ InstrumentedMutex* db_mutex, std::atomic<bool>* shutting_down,
+ std::vector<SequenceNumber> existing_snapshots,
+ SequenceNumber earliest_write_conflict_snapshot,
+ SnapshotChecker* snapshot_checker, JobContext* job_context,
+ LogBuffer* log_buffer, FSDirectory* db_directory,
+ FSDirectory* output_file_directory,
+ CompressionType output_compression, Statistics* stats,
+ EventLogger* event_logger, bool measure_io_stats,
+ const bool sync_output_directory, const bool write_manifest,
+ Env::Priority thread_pri, const std::shared_ptr<IOTracer>& io_tracer,
+ const SeqnoToTimeMapping& seq_time_mapping,
+ const std::string& db_id = "", const std::string& db_session_id = "",
+ std::string full_history_ts_low = "",
+ BlobFileCompletionCallback* blob_callback = nullptr);
+
+ ~FlushJob();
+
+  // Requires db_mutex to be held.
+ // Once PickMemTable() is called, either Run() or Cancel() has to be called.
+ void PickMemTable();
+ Status Run(LogsWithPrepTracker* prep_tracker = nullptr,
+ FileMetaData* file_meta = nullptr,
+ bool* switched_to_mempurge = nullptr);
+ void Cancel();
+ const autovector<MemTable*>& GetMemTables() const { return mems_; }
+
+#ifndef ROCKSDB_LITE
+ std::list<std::unique_ptr<FlushJobInfo>>* GetCommittedFlushJobsInfo() {
+ return &committed_flush_jobs_info_;
+ }
+#endif // !ROCKSDB_LITE
+
+ private:
+ friend class FlushJobTest_GetRateLimiterPriorityForWrite_Test;
+
+ void ReportStartedFlush();
+ void ReportFlushInputSize(const autovector<MemTable*>& mems);
+ void RecordFlushIOStats();
+ Status WriteLevel0Table();
+
+  // Memtable Garbage Collection algorithm: a MemPurge takes the list
+  // of immutable memtables and filters out (or "purges") their outdated
+  // bytes. The output (the remaining bytes, or "useful payload") is then
+  // transferred into a new memtable. If this new memtable fills up, the
+  // mempurge is aborted and rerouted to a regular flush process. Otherwise,
+  // depending on the heuristics, the new memtable is placed onto the
+  // immutable memtable list. The addition to the imm list will not trigger
+  // a flush operation. The flush of the imm list will instead be triggered
+  // once the mutable memtable is added to the imm list.
+  // This process is typically intended for workloads with heavy overwrites,
+  // where we want to avoid SSD writes (and reads) as much as possible.
+  // "MemPurge" is an experimental feature still at a very early stage
+  // of development. At the moment it is only compatible with the Get, Put
+  // and Delete operations, as well as Iterators and CompactionFilters.
+  // For this early version, "MemPurge" is enabled by setting the
+  // options.experimental_mempurge_threshold value to > 0.0. When this is
+  // the case, ALL automatic flush operations (kWriteBufferManagerFull) will
+  // first go through the MemPurge process. Therefore, we strongly
+  // recommend users not to enable this feature yet, given that the MemPurge
+  // process has not matured.
+ Status MemPurge();
+ bool MemPurgeDecider(double threshold);
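+  // As a rough usage sketch (illustrative; the option name comes from the
+  // comment above, everything else is assumed), a user would opt in with:
+  //   Options options;
+  //   options.experimental_mempurge_threshold = 1.0;  // > 0.0 enables it
+  //   DB* db = nullptr;
+  //   Status s = DB::Open(options, "/path/to/db", &db);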
+ // The rate limiter priority (io_priority) is determined dynamically here.
+ Env::IOPriority GetRateLimiterPriorityForWrite();
+#ifndef ROCKSDB_LITE
+ std::unique_ptr<FlushJobInfo> GetFlushJobInfo() const;
+#endif // !ROCKSDB_LITE
+
+ const std::string& dbname_;
+ const std::string db_id_;
+ const std::string db_session_id_;
+ ColumnFamilyData* cfd_;
+ const ImmutableDBOptions& db_options_;
+ const MutableCFOptions& mutable_cf_options_;
+ // A variable storing the largest memtable id to flush in this
+ // flush job. RocksDB uses this variable to select the memtables to flush in
+ // this job. All memtables in this column family with an ID smaller than or
+ // equal to max_memtable_id_ will be selected for flush.
+ uint64_t max_memtable_id_;
+ const FileOptions file_options_;
+ VersionSet* versions_;
+ InstrumentedMutex* db_mutex_;
+ std::atomic<bool>* shutting_down_;
+ std::vector<SequenceNumber> existing_snapshots_;
+ SequenceNumber earliest_write_conflict_snapshot_;
+ SnapshotChecker* snapshot_checker_;
+ JobContext* job_context_;
+ LogBuffer* log_buffer_;
+ FSDirectory* db_directory_;
+ FSDirectory* output_file_directory_;
+ CompressionType output_compression_;
+ Statistics* stats_;
+ EventLogger* event_logger_;
+ TableProperties table_properties_;
+ bool measure_io_stats_;
+ // True if this flush job should call fsync on the output directory. False
+ // otherwise.
+ // Usually sync_output_directory_ is true. A flush job needs to call sync on
+ // the output directory before committing to the MANIFEST.
+ // However, an individual flush job does not have to call sync on the output
+ // directory if it is part of an atomic flush. After all flush jobs in the
+ // atomic flush succeed, call sync once on each distinct output directory.
+ const bool sync_output_directory_;
+ // True if this flush job should write to MANIFEST after successfully
+ // flushing memtables. False otherwise.
+ // Usually write_manifest_ is true. A flush job commits to the MANIFEST after
+ // flushing the memtables.
+ // However, an individual flush job cannot rashly write to the MANIFEST
+ // immediately after it finishes the flush if it is part of an atomic flush.
+ // In this case, only after all flush jobs succeed in flush can RocksDB
+ // commit to the MANIFEST.
+ const bool write_manifest_;
+ // The current flush job can commit flush result of a concurrent flush job.
+ // We collect FlushJobInfo of all jobs committed by current job and fire
+ // OnFlushCompleted for them.
+ std::list<std::unique_ptr<FlushJobInfo>> committed_flush_jobs_info_;
+
+ // Variables below are set by PickMemTable():
+ FileMetaData meta_;
+ autovector<MemTable*> mems_;
+ VersionEdit* edit_;
+ Version* base_;
+ bool pick_memtable_called;
+ Env::Priority thread_pri_;
+
+ const std::shared_ptr<IOTracer> io_tracer_;
+ SystemClock* clock_;
+
+ const std::string full_history_ts_low_;
+ BlobFileCompletionCallback* blob_callback_;
+
+ // reference to the seqno_time_mapping_ in db_impl.h, not safe to read without
+ // db mutex
+ const SeqnoToTimeMapping& db_impl_seqno_time_mapping_;
+ SeqnoToTimeMapping seqno_to_time_mapping_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/flush_job_test.cc b/src/rocksdb/db/flush_job_test.cc
new file mode 100644
index 000000000..f994b4e9b
--- /dev/null
+++ b/src/rocksdb/db/flush_job_test.cc
@@ -0,0 +1,745 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/flush_job.h"
+
+#include <algorithm>
+#include <array>
+#include <map>
+#include <string>
+
+#include "db/blob/blob_index.h"
+#include "db/column_family.h"
+#include "db/db_impl/db_impl.h"
+#include "db/version_set.h"
+#include "file/writable_file_writer.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/write_buffer_manager.h"
+#include "table/mock_table.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/random.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// TODO(icanadi) Mock out everything else:
+// 1. VersionSet
+// 2. Memtable
+class FlushJobTestBase : public testing::Test {
+ protected:
+ FlushJobTestBase(std::string dbname, const Comparator* ucmp)
+ : env_(Env::Default()),
+ fs_(env_->GetFileSystem()),
+ dbname_(std::move(dbname)),
+ ucmp_(ucmp),
+ options_(),
+ db_options_(options_),
+ column_family_names_({kDefaultColumnFamilyName, "foo", "bar"}),
+ table_cache_(NewLRUCache(50000, 16)),
+ write_buffer_manager_(db_options_.db_write_buffer_size),
+ shutting_down_(false),
+ mock_table_factory_(new mock::MockTableFactory()) {}
+
+ virtual ~FlushJobTestBase() {
+ if (getenv("KEEP_DB")) {
+ fprintf(stdout, "db is still in %s\n", dbname_.c_str());
+ } else {
+ // destroy versions_ to release all file handles
+ versions_.reset();
+ EXPECT_OK(DestroyDir(env_, dbname_));
+ }
+ }
+
+ void NewDB() {
+ ASSERT_OK(SetIdentityFile(env_, dbname_));
+ VersionEdit new_db;
+
+ new_db.SetLogNumber(0);
+ new_db.SetNextFile(2);
+ new_db.SetLastSequence(0);
+
+ autovector<VersionEdit> new_cfs;
+ SequenceNumber last_seq = 1;
+ uint32_t cf_id = 1;
+ for (size_t i = 1; i != column_family_names_.size(); ++i) {
+ VersionEdit new_cf;
+ new_cf.AddColumnFamily(column_family_names_[i]);
+ new_cf.SetColumnFamily(cf_id++);
+ new_cf.SetComparatorName(ucmp_->Name());
+ new_cf.SetLogNumber(0);
+ new_cf.SetNextFile(2);
+ new_cf.SetLastSequence(last_seq++);
+ new_cfs.emplace_back(new_cf);
+ }
+
+ const std::string manifest = DescriptorFileName(dbname_, 1);
+ const auto& fs = env_->GetFileSystem();
+ std::unique_ptr<WritableFileWriter> file_writer;
+ Status s = WritableFileWriter::Create(
+ fs, manifest, fs->OptimizeForManifestWrite(env_options_), &file_writer,
+ nullptr);
+ ASSERT_OK(s);
+
+ {
+ log::Writer log(std::move(file_writer), 0, false);
+ std::string record;
+ new_db.EncodeTo(&record);
+ s = log.AddRecord(record);
+ ASSERT_OK(s);
+
+ for (const auto& e : new_cfs) {
+ record.clear();
+ e.EncodeTo(&record);
+ s = log.AddRecord(record);
+ ASSERT_OK(s);
+ }
+ }
+ ASSERT_OK(s);
+ // Make "CURRENT" file that points to the new manifest file.
+ s = SetCurrentFile(fs_.get(), dbname_, 1, nullptr);
+ ASSERT_OK(s);
+ }
+
+ void SetUp() override {
+ EXPECT_OK(env_->CreateDirIfMissing(dbname_));
+
+ // TODO(icanadi) Remove this once we mock out VersionSet
+ NewDB();
+
+ db_options_.env = env_;
+ db_options_.fs = fs_;
+ db_options_.db_paths.emplace_back(dbname_,
+ std::numeric_limits<uint64_t>::max());
+ db_options_.statistics = CreateDBStatistics();
+
+ cf_options_.comparator = ucmp_;
+
+ std::vector<ColumnFamilyDescriptor> column_families;
+ cf_options_.table_factory = mock_table_factory_;
+ for (const auto& cf_name : column_family_names_) {
+ column_families.emplace_back(cf_name, cf_options_);
+ }
+
+ versions_.reset(
+ new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(),
+ &write_buffer_manager_, &write_controller_,
+ /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
+ /*db_id*/ "", /*db_session_id*/ ""));
+ EXPECT_OK(versions_->Recover(column_families, false));
+ }
+
+ Env* env_;
+ std::shared_ptr<FileSystem> fs_;
+ std::string dbname_;
+ const Comparator* const ucmp_;
+ EnvOptions env_options_;
+ Options options_;
+ ImmutableDBOptions db_options_;
+ const std::vector<std::string> column_family_names_;
+ std::shared_ptr<Cache> table_cache_;
+ WriteController write_controller_;
+ WriteBufferManager write_buffer_manager_;
+ ColumnFamilyOptions cf_options_;
+ std::unique_ptr<VersionSet> versions_;
+ InstrumentedMutex mutex_;
+ std::atomic<bool> shutting_down_;
+ std::shared_ptr<mock::MockTableFactory> mock_table_factory_;
+
+ SeqnoToTimeMapping empty_seqno_to_time_mapping_;
+};
+
+class FlushJobTest : public FlushJobTestBase {
+ public:
+ FlushJobTest()
+ : FlushJobTestBase(test::PerThreadDBPath("flush_job_test"),
+ BytewiseComparator()) {}
+};
+
+TEST_F(FlushJobTest, Empty) {
+ JobContext job_context(0);
+ auto cfd = versions_->GetColumnFamilySet()->GetDefault();
+ EventLogger event_logger(db_options_.info_log.get());
+  SnapshotChecker* snapshot_checker = nullptr;  // not relevant
+ FlushJob flush_job(dbname_, versions_->GetColumnFamilySet()->GetDefault(),
+ db_options_, *cfd->GetLatestMutableCFOptions(),
+ std::numeric_limits<uint64_t>::max() /* memtable_id */,
+ env_options_, versions_.get(), &mutex_, &shutting_down_,
+ {}, kMaxSequenceNumber, snapshot_checker, &job_context,
+ nullptr, nullptr, nullptr, kNoCompression, nullptr,
+ &event_logger, false, true /* sync_output_directory */,
+ true /* write_manifest */, Env::Priority::USER,
+ nullptr /*IOTracer*/, empty_seqno_to_time_mapping_);
+ {
+ InstrumentedMutexLock l(&mutex_);
+ flush_job.PickMemTable();
+ ASSERT_OK(flush_job.Run());
+ }
+ job_context.Clean();
+}
+
+TEST_F(FlushJobTest, NonEmpty) {
+ JobContext job_context(0);
+ auto cfd = versions_->GetColumnFamilySet()->GetDefault();
+ auto new_mem = cfd->ConstructNewMemtable(*cfd->GetLatestMutableCFOptions(),
+ kMaxSequenceNumber);
+ new_mem->Ref();
+ auto inserted_keys = mock::MakeMockFile();
+ // Test data:
+ // seqno [ 1, 2 ... 8998, 8999, 9000, 9001, 9002 ... 9999 ]
+ // key [ 1001, 1002 ... 9998, 9999, 0, 1, 2 ... 999 ]
+ // range-delete "9995" -> "9999" at seqno 10000
+ // blob references with seqnos 10001..10006
+ for (int i = 1; i < 10000; ++i) {
+ std::string key(std::to_string((i + 1000) % 10000));
+ std::string value("value" + key);
+ ASSERT_OK(new_mem->Add(SequenceNumber(i), kTypeValue, key, value,
+ nullptr /* kv_prot_info */));
+ if ((i + 1000) % 10000 < 9995) {
+ InternalKey internal_key(key, SequenceNumber(i), kTypeValue);
+ inserted_keys.push_back({internal_key.Encode().ToString(), value});
+ }
+ }
+
+ {
+ ASSERT_OK(new_mem->Add(SequenceNumber(10000), kTypeRangeDeletion, "9995",
+ "9999a", nullptr /* kv_prot_info */));
+ InternalKey internal_key("9995", SequenceNumber(10000), kTypeRangeDeletion);
+ inserted_keys.push_back({internal_key.Encode().ToString(), "9999a"});
+ }
+
+ // Note: the first two blob references will not be considered when resolving
+ // the oldest blob file referenced (the first one is inlined TTL, while the
+ // second one is TTL and thus points to a TTL blob file).
+ constexpr std::array<uint64_t, 6> blob_file_numbers{
+ {kInvalidBlobFileNumber, 5, 103, 17, 102, 101}};
+ for (size_t i = 0; i < blob_file_numbers.size(); ++i) {
+ std::string key(std::to_string(i + 10001));
+ std::string blob_index;
+ if (i == 0) {
+ BlobIndex::EncodeInlinedTTL(&blob_index, /* expiration */ 1234567890ULL,
+ "foo");
+ } else if (i == 1) {
+ BlobIndex::EncodeBlobTTL(&blob_index, /* expiration */ 1234567890ULL,
+ blob_file_numbers[i], /* offset */ i << 10,
+ /* size */ i << 20, kNoCompression);
+ } else {
+ BlobIndex::EncodeBlob(&blob_index, blob_file_numbers[i],
+ /* offset */ i << 10, /* size */ i << 20,
+ kNoCompression);
+ }
+
+ const SequenceNumber seq(i + 10001);
+ ASSERT_OK(new_mem->Add(seq, kTypeBlobIndex, key, blob_index,
+ nullptr /* kv_prot_info */));
+
+ InternalKey internal_key(key, seq, kTypeBlobIndex);
+ inserted_keys.push_back({internal_key.Encode().ToString(), blob_index});
+ }
+ mock::SortKVVector(&inserted_keys);
+
+ autovector<MemTable*> to_delete;
+ new_mem->ConstructFragmentedRangeTombstones();
+ cfd->imm()->Add(new_mem, &to_delete);
+ for (auto& m : to_delete) {
+ delete m;
+ }
+
+ EventLogger event_logger(db_options_.info_log.get());
+  SnapshotChecker* snapshot_checker = nullptr;  // not relevant
+ FlushJob flush_job(
+ dbname_, versions_->GetColumnFamilySet()->GetDefault(), db_options_,
+ *cfd->GetLatestMutableCFOptions(),
+ std::numeric_limits<uint64_t>::max() /* memtable_id */, env_options_,
+ versions_.get(), &mutex_, &shutting_down_, {}, kMaxSequenceNumber,
+ snapshot_checker, &job_context, nullptr, nullptr, nullptr, kNoCompression,
+ db_options_.statistics.get(), &event_logger, true,
+ true /* sync_output_directory */, true /* write_manifest */,
+ Env::Priority::USER, nullptr /*IOTracer*/, empty_seqno_to_time_mapping_);
+
+ HistogramData hist;
+ FileMetaData file_meta;
+ mutex_.Lock();
+ flush_job.PickMemTable();
+ ASSERT_OK(flush_job.Run(nullptr, &file_meta));
+ mutex_.Unlock();
+ db_options_.statistics->histogramData(FLUSH_TIME, &hist);
+ ASSERT_GT(hist.average, 0.0);
+
+ ASSERT_EQ(std::to_string(0), file_meta.smallest.user_key().ToString());
+ ASSERT_EQ("9999a", file_meta.largest.user_key().ToString());
+ ASSERT_EQ(1, file_meta.fd.smallest_seqno);
+ ASSERT_EQ(10006, file_meta.fd.largest_seqno);
+ ASSERT_EQ(17, file_meta.oldest_blob_file_number);
+ mock_table_factory_->AssertSingleFile(inserted_keys);
+ job_context.Clean();
+}
+
+TEST_F(FlushJobTest, FlushMemTablesSingleColumnFamily) {
+ const size_t num_mems = 2;
+ const size_t num_mems_to_flush = 1;
+ const size_t num_keys_per_table = 100;
+ JobContext job_context(0);
+ ColumnFamilyData* cfd = versions_->GetColumnFamilySet()->GetDefault();
+ std::vector<uint64_t> memtable_ids;
+ std::vector<MemTable*> new_mems;
+ for (size_t i = 0; i != num_mems; ++i) {
+ MemTable* mem = cfd->ConstructNewMemtable(*cfd->GetLatestMutableCFOptions(),
+ kMaxSequenceNumber);
+ mem->SetID(i);
+ mem->Ref();
+ new_mems.emplace_back(mem);
+ memtable_ids.push_back(mem->GetID());
+
+ for (size_t j = 0; j < num_keys_per_table; ++j) {
+ std::string key(std::to_string(j + i * num_keys_per_table));
+ std::string value("value" + key);
+ ASSERT_OK(mem->Add(SequenceNumber(j + i * num_keys_per_table), kTypeValue,
+ key, value, nullptr /* kv_prot_info */));
+ }
+ }
+
+ autovector<MemTable*> to_delete;
+ for (auto mem : new_mems) {
+ mem->ConstructFragmentedRangeTombstones();
+ cfd->imm()->Add(mem, &to_delete);
+ }
+
+ EventLogger event_logger(db_options_.info_log.get());
+  SnapshotChecker* snapshot_checker = nullptr;  // not relevant
+
+ assert(memtable_ids.size() == num_mems);
+ uint64_t smallest_memtable_id = memtable_ids.front();
+ uint64_t flush_memtable_id = smallest_memtable_id + num_mems_to_flush - 1;
+ FlushJob flush_job(
+ dbname_, versions_->GetColumnFamilySet()->GetDefault(), db_options_,
+ *cfd->GetLatestMutableCFOptions(), flush_memtable_id, env_options_,
+ versions_.get(), &mutex_, &shutting_down_, {}, kMaxSequenceNumber,
+ snapshot_checker, &job_context, nullptr, nullptr, nullptr, kNoCompression,
+ db_options_.statistics.get(), &event_logger, true,
+ true /* sync_output_directory */, true /* write_manifest */,
+ Env::Priority::USER, nullptr /*IOTracer*/, empty_seqno_to_time_mapping_);
+ HistogramData hist;
+ FileMetaData file_meta;
+ mutex_.Lock();
+ flush_job.PickMemTable();
+ ASSERT_OK(flush_job.Run(nullptr /* prep_tracker */, &file_meta));
+ mutex_.Unlock();
+ db_options_.statistics->histogramData(FLUSH_TIME, &hist);
+ ASSERT_GT(hist.average, 0.0);
+
+ ASSERT_EQ(std::to_string(0), file_meta.smallest.user_key().ToString());
+ ASSERT_EQ("99", file_meta.largest.user_key().ToString());
+ ASSERT_EQ(0, file_meta.fd.smallest_seqno);
+ ASSERT_EQ(SequenceNumber(num_mems_to_flush * num_keys_per_table - 1),
+ file_meta.fd.largest_seqno);
+ ASSERT_EQ(kInvalidBlobFileNumber, file_meta.oldest_blob_file_number);
+
+ for (auto m : to_delete) {
+ delete m;
+ }
+ to_delete.clear();
+ job_context.Clean();
+}
+
+TEST_F(FlushJobTest, FlushMemtablesMultipleColumnFamilies) {
+ autovector<ColumnFamilyData*> all_cfds;
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ all_cfds.push_back(cfd);
+ }
+ const std::vector<size_t> num_memtables = {2, 1, 3};
+ assert(num_memtables.size() == column_family_names_.size());
+ const size_t num_keys_per_memtable = 1000;
+ JobContext job_context(0);
+ std::vector<uint64_t> memtable_ids;
+ std::vector<SequenceNumber> smallest_seqs;
+ std::vector<SequenceNumber> largest_seqs;
+ autovector<MemTable*> to_delete;
+ SequenceNumber curr_seqno = 0;
+ size_t k = 0;
+ for (auto cfd : all_cfds) {
+ smallest_seqs.push_back(curr_seqno);
+ for (size_t i = 0; i != num_memtables[k]; ++i) {
+ MemTable* mem = cfd->ConstructNewMemtable(
+ *cfd->GetLatestMutableCFOptions(), kMaxSequenceNumber);
+ mem->SetID(i);
+ mem->Ref();
+
+ for (size_t j = 0; j != num_keys_per_memtable; ++j) {
+ std::string key(std::to_string(j + i * num_keys_per_memtable));
+ std::string value("value" + key);
+ ASSERT_OK(mem->Add(curr_seqno++, kTypeValue, key, value,
+ nullptr /* kv_prot_info */));
+ }
+ mem->ConstructFragmentedRangeTombstones();
+ cfd->imm()->Add(mem, &to_delete);
+ }
+ largest_seqs.push_back(curr_seqno - 1);
+ memtable_ids.push_back(num_memtables[k++] - 1);
+ }
+
+ EventLogger event_logger(db_options_.info_log.get());
+ SnapshotChecker* snapshot_checker = nullptr; // not relevant
+ std::vector<std::unique_ptr<FlushJob>> flush_jobs;
+ k = 0;
+ for (auto cfd : all_cfds) {
+ std::vector<SequenceNumber> snapshot_seqs;
+ flush_jobs.emplace_back(new FlushJob(
+ dbname_, cfd, db_options_, *cfd->GetLatestMutableCFOptions(),
+ memtable_ids[k], env_options_, versions_.get(), &mutex_,
+ &shutting_down_, snapshot_seqs, kMaxSequenceNumber, snapshot_checker,
+ &job_context, nullptr, nullptr, nullptr, kNoCompression,
+ db_options_.statistics.get(), &event_logger, true,
+ false /* sync_output_directory */, false /* write_manifest */,
+ Env::Priority::USER, nullptr /*IOTracer*/,
+ empty_seqno_to_time_mapping_));
+ k++;
+ }
+ HistogramData hist;
+ std::vector<FileMetaData> file_metas;
+ // Call reserve to avoid auto-resizing
+ file_metas.reserve(flush_jobs.size());
+ mutex_.Lock();
+ for (auto& job : flush_jobs) {
+ job->PickMemTable();
+ }
+ for (auto& job : flush_jobs) {
+ FileMetaData meta;
+ // Run will release and re-acquire mutex
+    ASSERT_OK(job->Run(nullptr /* prep_tracker */, &meta));
+ file_metas.emplace_back(meta);
+ }
+ autovector<FileMetaData*> file_meta_ptrs;
+ for (auto& meta : file_metas) {
+ file_meta_ptrs.push_back(&meta);
+ }
+ autovector<const autovector<MemTable*>*> mems_list;
+ for (size_t i = 0; i != all_cfds.size(); ++i) {
+ const auto& mems = flush_jobs[i]->GetMemTables();
+ mems_list.push_back(&mems);
+ }
+ autovector<const MutableCFOptions*> mutable_cf_options_list;
+ for (auto cfd : all_cfds) {
+ mutable_cf_options_list.push_back(cfd->GetLatestMutableCFOptions());
+ }
+ autovector<std::list<std::unique_ptr<FlushJobInfo>>*>
+ committed_flush_jobs_info;
+#ifndef ROCKSDB_LITE
+ for (auto& job : flush_jobs) {
+ committed_flush_jobs_info.push_back(job->GetCommittedFlushJobsInfo());
+ }
+#endif  // !ROCKSDB_LITE
+
+ Status s = InstallMemtableAtomicFlushResults(
+ nullptr /* imm_lists */, all_cfds, mutable_cf_options_list, mems_list,
+ versions_.get(), nullptr /* prep_tracker */, &mutex_, file_meta_ptrs,
+ committed_flush_jobs_info, &job_context.memtables_to_free,
+ nullptr /* db_directory */, nullptr /* log_buffer */);
+ ASSERT_OK(s);
+
+ mutex_.Unlock();
+ db_options_.statistics->histogramData(FLUSH_TIME, &hist);
+ ASSERT_GT(hist.average, 0.0);
+ k = 0;
+ for (const auto& file_meta : file_metas) {
+ ASSERT_EQ(std::to_string(0), file_meta.smallest.user_key().ToString());
+ ASSERT_EQ("999", file_meta.largest.user_key()
+ .ToString()); // max key by bytewise comparator
+ ASSERT_EQ(smallest_seqs[k], file_meta.fd.smallest_seqno);
+ ASSERT_EQ(largest_seqs[k], file_meta.fd.largest_seqno);
+ // Verify that imm is empty
+ ASSERT_EQ(std::numeric_limits<uint64_t>::max(),
+ all_cfds[k]->imm()->GetEarliestMemTableID());
+ ASSERT_EQ(0, all_cfds[k]->imm()->GetLatestMemTableID());
+ ++k;
+ }
+
+ for (auto m : to_delete) {
+ delete m;
+ }
+ to_delete.clear();
+ job_context.Clean();
+}
+
+TEST_F(FlushJobTest, Snapshots) {
+ JobContext job_context(0);
+ auto cfd = versions_->GetColumnFamilySet()->GetDefault();
+ auto new_mem = cfd->ConstructNewMemtable(*cfd->GetLatestMutableCFOptions(),
+ kMaxSequenceNumber);
+
+ std::set<SequenceNumber> snapshots_set;
+ int keys = 10000;
+ int max_inserts_per_keys = 8;
+
+ Random rnd(301);
+ for (int i = 0; i < keys / 2; ++i) {
+ snapshots_set.insert(rnd.Uniform(keys * (max_inserts_per_keys / 2)) + 1);
+ }
+ // set has already removed the duplicate snapshots
+ std::vector<SequenceNumber> snapshots(snapshots_set.begin(),
+ snapshots_set.end());
+
+ new_mem->Ref();
+ SequenceNumber current_seqno = 0;
+ auto inserted_keys = mock::MakeMockFile();
+ for (int i = 1; i < keys; ++i) {
+ std::string key(std::to_string(i));
+ int insertions = rnd.Uniform(max_inserts_per_keys);
+ for (int j = 0; j < insertions; ++j) {
+ std::string value(rnd.HumanReadableString(10));
+ auto seqno = ++current_seqno;
+ ASSERT_OK(new_mem->Add(SequenceNumber(seqno), kTypeValue, key, value,
+ nullptr /* kv_prot_info */));
+ // a key is visible only if:
+ // 1. it's the last one written (j == insertions - 1)
+ // 2. there's a snapshot pointing at it
+ bool visible = (j == insertions - 1) ||
+ (snapshots_set.find(seqno) != snapshots_set.end());
+ if (visible) {
+ InternalKey internal_key(key, seqno, kTypeValue);
+ inserted_keys.push_back({internal_key.Encode().ToString(), value});
+ }
+ }
+ }
+ mock::SortKVVector(&inserted_keys);
+
+ autovector<MemTable*> to_delete;
+ new_mem->ConstructFragmentedRangeTombstones();
+ cfd->imm()->Add(new_mem, &to_delete);
+ for (auto& m : to_delete) {
+ delete m;
+ }
+
+ EventLogger event_logger(db_options_.info_log.get());
+  SnapshotChecker* snapshot_checker = nullptr;  // not relevant
+ FlushJob flush_job(
+ dbname_, versions_->GetColumnFamilySet()->GetDefault(), db_options_,
+ *cfd->GetLatestMutableCFOptions(),
+ std::numeric_limits<uint64_t>::max() /* memtable_id */, env_options_,
+ versions_.get(), &mutex_, &shutting_down_, snapshots, kMaxSequenceNumber,
+ snapshot_checker, &job_context, nullptr, nullptr, nullptr, kNoCompression,
+ db_options_.statistics.get(), &event_logger, true,
+ true /* sync_output_directory */, true /* write_manifest */,
+ Env::Priority::USER, nullptr /*IOTracer*/, empty_seqno_to_time_mapping_);
+ mutex_.Lock();
+ flush_job.PickMemTable();
+ ASSERT_OK(flush_job.Run());
+ mutex_.Unlock();
+ mock_table_factory_->AssertSingleFile(inserted_keys);
+ HistogramData hist;
+ db_options_.statistics->histogramData(FLUSH_TIME, &hist);
+ ASSERT_GT(hist.average, 0.0);
+ job_context.Clean();
+}
+
+TEST_F(FlushJobTest, GetRateLimiterPriorityForWrite) {
+  // Prepare a FlushJob that flushes MemTables of a single column family.
+ const size_t num_mems = 2;
+ const size_t num_mems_to_flush = 1;
+ const size_t num_keys_per_table = 100;
+ JobContext job_context(0);
+ ColumnFamilyData* cfd = versions_->GetColumnFamilySet()->GetDefault();
+ std::vector<uint64_t> memtable_ids;
+ std::vector<MemTable*> new_mems;
+ for (size_t i = 0; i != num_mems; ++i) {
+ MemTable* mem = cfd->ConstructNewMemtable(*cfd->GetLatestMutableCFOptions(),
+ kMaxSequenceNumber);
+ mem->SetID(i);
+ mem->Ref();
+ new_mems.emplace_back(mem);
+ memtable_ids.push_back(mem->GetID());
+
+ for (size_t j = 0; j < num_keys_per_table; ++j) {
+ std::string key(std::to_string(j + i * num_keys_per_table));
+ std::string value("value" + key);
+ ASSERT_OK(mem->Add(SequenceNumber(j + i * num_keys_per_table), kTypeValue,
+ key, value, nullptr /* kv_prot_info */));
+ }
+ }
+
+ autovector<MemTable*> to_delete;
+ for (auto mem : new_mems) {
+ mem->ConstructFragmentedRangeTombstones();
+ cfd->imm()->Add(mem, &to_delete);
+ }
+
+ EventLogger event_logger(db_options_.info_log.get());
+  SnapshotChecker* snapshot_checker = nullptr;  // not relevant
+
+ assert(memtable_ids.size() == num_mems);
+ uint64_t smallest_memtable_id = memtable_ids.front();
+ uint64_t flush_memtable_id = smallest_memtable_id + num_mems_to_flush - 1;
+ FlushJob flush_job(
+ dbname_, versions_->GetColumnFamilySet()->GetDefault(), db_options_,
+ *cfd->GetLatestMutableCFOptions(), flush_memtable_id, env_options_,
+ versions_.get(), &mutex_, &shutting_down_, {}, kMaxSequenceNumber,
+ snapshot_checker, &job_context, nullptr, nullptr, nullptr, kNoCompression,
+ db_options_.statistics.get(), &event_logger, true,
+ true /* sync_output_directory */, true /* write_manifest */,
+ Env::Priority::USER, nullptr /*IOTracer*/, empty_seqno_to_time_mapping_);
+
+ // When the state from WriteController is normal.
+ ASSERT_EQ(flush_job.GetRateLimiterPriorityForWrite(), Env::IO_HIGH);
+
+ WriteController* write_controller =
+ flush_job.versions_->GetColumnFamilySet()->write_controller();
+
+ {
+ // When the state from WriteController is Delayed.
+ std::unique_ptr<WriteControllerToken> delay_token =
+ write_controller->GetDelayToken(1000000);
+ ASSERT_EQ(flush_job.GetRateLimiterPriorityForWrite(), Env::IO_USER);
+ }
+
+ {
+ // When the state from WriteController is Stopped.
+ std::unique_ptr<WriteControllerToken> stop_token =
+ write_controller->GetStopToken();
+ ASSERT_EQ(flush_job.GetRateLimiterPriorityForWrite(), Env::IO_USER);
+ }
+}
+
+class FlushJobTimestampTest : public FlushJobTestBase {
+ public:
+ FlushJobTimestampTest()
+ : FlushJobTestBase(test::PerThreadDBPath("flush_job_ts_gc_test"),
+ test::BytewiseComparatorWithU64TsWrapper()) {}
+
+ void AddKeyValueToMemtable(MemTable* memtable, std::string key, uint64_t ts,
+ SequenceNumber seq, ValueType value_type,
+ Slice value) {
+ std::string key_str(std::move(key));
+ PutFixed64(&key_str, ts);
+ ASSERT_OK(memtable->Add(seq, value_type, key_str, value,
+ nullptr /* kv_prot_info */));
+ }
+
+ protected:
+ static constexpr uint64_t kStartTs = 10;
+ static constexpr SequenceNumber kStartSeq = 0;
+ SequenceNumber curr_seq_{kStartSeq};
+ std::atomic<uint64_t> curr_ts_{kStartTs};
+};
+
+TEST_F(FlushJobTimestampTest, AllKeysExpired) {
+ ColumnFamilyData* cfd = versions_->GetColumnFamilySet()->GetDefault();
+ autovector<MemTable*> to_delete;
+
+ {
+ MemTable* new_mem = cfd->ConstructNewMemtable(
+ *cfd->GetLatestMutableCFOptions(), kMaxSequenceNumber);
+ new_mem->Ref();
+ for (int i = 0; i < 100; ++i) {
+ uint64_t ts = curr_ts_.fetch_add(1);
+ SequenceNumber seq = (curr_seq_++);
+ AddKeyValueToMemtable(new_mem, test::EncodeInt(0), ts, seq,
+ ValueType::kTypeValue, "0_value");
+ }
+ uint64_t ts = curr_ts_.fetch_add(1);
+ SequenceNumber seq = (curr_seq_++);
+ AddKeyValueToMemtable(new_mem, test::EncodeInt(0), ts, seq,
+ ValueType::kTypeDeletionWithTimestamp, "");
+ new_mem->ConstructFragmentedRangeTombstones();
+ cfd->imm()->Add(new_mem, &to_delete);
+ }
+
+ std::vector<SequenceNumber> snapshots;
+ constexpr SnapshotChecker* const snapshot_checker = nullptr;
+ JobContext job_context(0);
+ EventLogger event_logger(db_options_.info_log.get());
+ std::string full_history_ts_low;
+ PutFixed64(&full_history_ts_low, std::numeric_limits<uint64_t>::max());
+ FlushJob flush_job(
+ dbname_, cfd, db_options_, *cfd->GetLatestMutableCFOptions(),
+ std::numeric_limits<uint64_t>::max() /* memtable_id */, env_options_,
+ versions_.get(), &mutex_, &shutting_down_, snapshots, kMaxSequenceNumber,
+ snapshot_checker, &job_context, nullptr, nullptr, nullptr, kNoCompression,
+ db_options_.statistics.get(), &event_logger, true,
+ true /* sync_output_directory */, true /* write_manifest */,
+ Env::Priority::USER, nullptr /*IOTracer*/, empty_seqno_to_time_mapping_,
+ /*db_id=*/"",
+ /*db_session_id=*/"", full_history_ts_low);
+
+ FileMetaData fmeta;
+ mutex_.Lock();
+ flush_job.PickMemTable();
+ ASSERT_OK(flush_job.Run(/*prep_tracker=*/nullptr, &fmeta));
+ mutex_.Unlock();
+
+ {
+ std::string key = test::EncodeInt(0);
+ key.append(test::EncodeInt(curr_ts_.load(std::memory_order_relaxed) - 1));
+ InternalKey ikey(key, curr_seq_ - 1, ValueType::kTypeDeletionWithTimestamp);
+ ASSERT_EQ(ikey.Encode(), fmeta.smallest.Encode());
+ ASSERT_EQ(ikey.Encode(), fmeta.largest.Encode());
+ }
+
+ job_context.Clean();
+ ASSERT_TRUE(to_delete.empty());
+}
+
+TEST_F(FlushJobTimestampTest, NoKeyExpired) {
+ ColumnFamilyData* cfd = versions_->GetColumnFamilySet()->GetDefault();
+ autovector<MemTable*> to_delete;
+
+ {
+ MemTable* new_mem = cfd->ConstructNewMemtable(
+ *cfd->GetLatestMutableCFOptions(), kMaxSequenceNumber);
+ new_mem->Ref();
+ for (int i = 0; i < 100; ++i) {
+ uint64_t ts = curr_ts_.fetch_add(1);
+ SequenceNumber seq = (curr_seq_++);
+ AddKeyValueToMemtable(new_mem, test::EncodeInt(0), ts, seq,
+ ValueType::kTypeValue, "0_value");
+ }
+ new_mem->ConstructFragmentedRangeTombstones();
+ cfd->imm()->Add(new_mem, &to_delete);
+ }
+
+ std::vector<SequenceNumber> snapshots;
+ SnapshotChecker* const snapshot_checker = nullptr;
+ JobContext job_context(0);
+ EventLogger event_logger(db_options_.info_log.get());
+ std::string full_history_ts_low;
+ PutFixed64(&full_history_ts_low, 0);
+ FlushJob flush_job(
+ dbname_, cfd, db_options_, *cfd->GetLatestMutableCFOptions(),
+ std::numeric_limits<uint64_t>::max() /* memtable_id */, env_options_,
+ versions_.get(), &mutex_, &shutting_down_, snapshots, kMaxSequenceNumber,
+ snapshot_checker, &job_context, nullptr, nullptr, nullptr, kNoCompression,
+ db_options_.statistics.get(), &event_logger, true,
+ true /* sync_output_directory */, true /* write_manifest */,
+ Env::Priority::USER, nullptr /*IOTracer*/, empty_seqno_to_time_mapping_,
+ /*db_id=*/"",
+ /*db_session_id=*/"", full_history_ts_low);
+
+ FileMetaData fmeta;
+ mutex_.Lock();
+ flush_job.PickMemTable();
+ ASSERT_OK(flush_job.Run(/*prep_tracker=*/nullptr, &fmeta));
+ mutex_.Unlock();
+
+ {
+ std::string ukey = test::EncodeInt(0);
+ std::string smallest_key =
+ ukey + test::EncodeInt(curr_ts_.load(std::memory_order_relaxed) - 1);
+ std::string largest_key = ukey + test::EncodeInt(kStartTs);
+ InternalKey smallest(smallest_key, curr_seq_ - 1, ValueType::kTypeValue);
+ InternalKey largest(largest_key, kStartSeq, ValueType::kTypeValue);
+ ASSERT_EQ(smallest.Encode(), fmeta.smallest.Encode());
+ ASSERT_EQ(largest.Encode(), fmeta.largest.Encode());
+ }
+ job_context.Clean();
+ ASSERT_TRUE(to_delete.empty());
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/flush_scheduler.cc b/src/rocksdb/db/flush_scheduler.cc
new file mode 100644
index 000000000..6f4d3e1a5
--- /dev/null
+++ b/src/rocksdb/db/flush_scheduler.cc
@@ -0,0 +1,86 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/flush_scheduler.h"
+
+#include <cassert>
+
+#include "db/column_family.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+void FlushScheduler::ScheduleWork(ColumnFamilyData* cfd) {
+#ifndef NDEBUG
+ {
+ std::lock_guard<std::mutex> lock(checking_mutex_);
+ assert(checking_set_.count(cfd) == 0);
+ checking_set_.insert(cfd);
+ }
+#endif // NDEBUG
+ cfd->Ref();
+// Suppress false positive clang analyzer warnings.
+#ifndef __clang_analyzer__
+ Node* node = new Node{cfd, head_.load(std::memory_order_relaxed)};
+ while (!head_.compare_exchange_strong(
+ node->next, node, std::memory_order_relaxed, std::memory_order_relaxed)) {
+ // A failing CAS updates the first parameter, so we are already set up for
+ // a retry. TakeNextColumnFamily won't happen until after another
+ // inter-thread synchronization, so we don't even need release
+ // semantics for this CAS.
+ }
+#endif // __clang_analyzer__
+}
+
+ColumnFamilyData* FlushScheduler::TakeNextColumnFamily() {
+ while (true) {
+ if (head_.load(std::memory_order_relaxed) == nullptr) {
+ return nullptr;
+ }
+
+ // dequeue the head
+ Node* node = head_.load(std::memory_order_relaxed);
+ head_.store(node->next, std::memory_order_relaxed);
+ ColumnFamilyData* cfd = node->column_family;
+ delete node;
+
+#ifndef NDEBUG
+ {
+ std::lock_guard<std::mutex> lock(checking_mutex_);
+ auto iter = checking_set_.find(cfd);
+ assert(iter != checking_set_.end());
+ checking_set_.erase(iter);
+ }
+#endif // NDEBUG
+
+ if (!cfd->IsDropped()) {
+ // success
+ return cfd;
+ }
+
+ // no longer relevant, retry
+ cfd->UnrefAndTryDelete();
+ }
+}
+
+bool FlushScheduler::Empty() {
+ auto rv = head_.load(std::memory_order_relaxed) == nullptr;
+#ifndef NDEBUG
+ std::lock_guard<std::mutex> lock(checking_mutex_);
+ // Empty is allowed to be called concurrently with ScheduleWork. It may only
+ // miss the most recently scheduled flushes.
+ assert((rv == checking_set_.empty()) || rv);
+#endif // NDEBUG
+ return rv;
+}
+
+void FlushScheduler::Clear() {
+ ColumnFamilyData* cfd;
+ while ((cfd = TakeNextColumnFamily()) != nullptr) {
+ cfd->UnrefAndTryDelete();
+ }
+ assert(head_.load(std::memory_order_relaxed) == nullptr);
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/flush_scheduler.h b/src/rocksdb/db/flush_scheduler.h
new file mode 100644
index 000000000..eb03f3e11
--- /dev/null
+++ b/src/rocksdb/db/flush_scheduler.h
@@ -0,0 +1,55 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <atomic>
+#include <cstdint>
+#include <mutex>
+#include <set>
+
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class ColumnFamilyData;
+
+// FlushScheduler keeps track of all column families whose memtables may be
+// full and require flushing. Unless otherwise noted, all methods on
+// FlushScheduler should be called only with the DB mutex held or from a
+// single-threaded recovery context.
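+//
+// Rough usage sketch (illustrative only; FlushMemTable() stands in for
+// whatever the caller does with the returned column family):
+//
+//   FlushScheduler scheduler;
+//   scheduler.ScheduleWork(cfd);  // takes a reference on cfd
+//   while (ColumnFamilyData* picked = scheduler.TakeNextColumnFamily()) {
+//     FlushMemTable(picked);          // hypothetical caller-side flush
+//     picked->UnrefAndTryDelete();    // caller must release the reference
+//   }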
+class FlushScheduler {
+ public:
+ FlushScheduler() : head_(nullptr) {}
+
+ // May be called from multiple threads at once, but not concurrent with
+ // any other method calls on this instance
+ void ScheduleWork(ColumnFamilyData* cfd);
+
+ // Removes and returns Ref()-ed column family. Client needs to Unref().
+ // Filters column families that have been dropped.
+ ColumnFamilyData* TakeNextColumnFamily();
+
+ // This can be called concurrently with ScheduleWork, but it may miss any
+ // flushes scheduled after the last synchronization. This results in less
+ // precise enforcement of memtable sizes but should not matter much.
+ bool Empty();
+
+ void Clear();
+
+ private:
+ struct Node {
+ ColumnFamilyData* column_family;
+ Node* next;
+ };
+
+ std::atomic<Node*> head_;
+#ifndef NDEBUG
+ std::mutex checking_mutex_;
+ std::set<ColumnFamilyData*> checking_set_;
+#endif // NDEBUG
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/forward_iterator.cc b/src/rocksdb/db/forward_iterator.cc
new file mode 100644
index 000000000..3fbc2cf47
--- /dev/null
+++ b/src/rocksdb/db/forward_iterator.cc
@@ -0,0 +1,1062 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+#include "db/forward_iterator.h"
+
+#include <limits>
+#include <string>
+#include <utility>
+
+#include "db/column_family.h"
+#include "db/db_impl/db_impl.h"
+#include "db/db_iter.h"
+#include "db/dbformat.h"
+#include "db/job_context.h"
+#include "db/range_del_aggregator.h"
+#include "db/range_tombstone_fragmenter.h"
+#include "rocksdb/env.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/slice_transform.h"
+#include "table/merging_iterator.h"
+#include "test_util/sync_point.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Usage:
+// ForwardLevelIterator iter;
+// iter.SetFileIndex(file_index);
+// iter.Seek(target); // or iter.SeekToFirst();
+// iter.Next()
+class ForwardLevelIterator : public InternalIterator {
+ public:
+ ForwardLevelIterator(
+ const ColumnFamilyData* const cfd, const ReadOptions& read_options,
+ const std::vector<FileMetaData*>& files,
+ const std::shared_ptr<const SliceTransform>& prefix_extractor,
+ bool allow_unprepared_value)
+ : cfd_(cfd),
+ read_options_(read_options),
+ files_(files),
+ valid_(false),
+ file_index_(std::numeric_limits<uint32_t>::max()),
+ file_iter_(nullptr),
+ pinned_iters_mgr_(nullptr),
+ prefix_extractor_(prefix_extractor),
+ allow_unprepared_value_(allow_unprepared_value) {
+ status_.PermitUncheckedError(); // Allow uninitialized status through
+ }
+
+ ~ForwardLevelIterator() override {
+ // Reset current pointer
+ if (pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled()) {
+ pinned_iters_mgr_->PinIterator(file_iter_);
+ } else {
+ delete file_iter_;
+ }
+ }
+
+ void SetFileIndex(uint32_t file_index) {
+ assert(file_index < files_.size());
+ status_ = Status::OK();
+ if (file_index != file_index_) {
+ file_index_ = file_index;
+ Reset();
+ }
+ }
+ void Reset() {
+ assert(file_index_ < files_.size());
+
+ // Reset current pointer
+ if (pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled()) {
+ pinned_iters_mgr_->PinIterator(file_iter_);
+ } else {
+ delete file_iter_;
+ }
+
+ ReadRangeDelAggregator range_del_agg(&cfd_->internal_comparator(),
+ kMaxSequenceNumber /* upper_bound */);
+ file_iter_ = cfd_->table_cache()->NewIterator(
+ read_options_, *(cfd_->soptions()), cfd_->internal_comparator(),
+ *files_[file_index_],
+ read_options_.ignore_range_deletions ? nullptr : &range_del_agg,
+ prefix_extractor_, /*table_reader_ptr=*/nullptr,
+ /*file_read_hist=*/nullptr, TableReaderCaller::kUserIterator,
+ /*arena=*/nullptr, /*skip_filters=*/false, /*level=*/-1,
+ /*max_file_size_for_l0_meta_pin=*/0,
+ /*smallest_compaction_key=*/nullptr,
+ /*largest_compaction_key=*/nullptr, allow_unprepared_value_);
+ file_iter_->SetPinnedItersMgr(pinned_iters_mgr_);
+ valid_ = false;
+ if (!range_del_agg.IsEmpty()) {
+ status_ = Status::NotSupported(
+ "Range tombstones unsupported with ForwardIterator");
+ }
+ }
+ void SeekToLast() override {
+ status_ = Status::NotSupported("ForwardLevelIterator::SeekToLast()");
+ valid_ = false;
+ }
+ void Prev() override {
+ status_ = Status::NotSupported("ForwardLevelIterator::Prev()");
+ valid_ = false;
+ }
+ bool Valid() const override { return valid_; }
+ void SeekToFirst() override {
+ assert(file_iter_ != nullptr);
+ if (!status_.ok()) {
+ assert(!valid_);
+ return;
+ }
+ file_iter_->SeekToFirst();
+ valid_ = file_iter_->Valid();
+ }
+ void Seek(const Slice& internal_key) override {
+ assert(file_iter_ != nullptr);
+
+ // This deviates from the usual convention for InternalIterator::Seek() in
+ // that it doesn't discard pre-existing error status. That's because this
+ // Seek() is only supposed to be called immediately after SetFileIndex()
+ // (which discards pre-existing error status), and SetFileIndex() may set
+ // an error status, which we shouldn't discard.
+ if (!status_.ok()) {
+ assert(!valid_);
+ return;
+ }
+
+ file_iter_->Seek(internal_key);
+ valid_ = file_iter_->Valid();
+ }
+ void SeekForPrev(const Slice& /*internal_key*/) override {
+ status_ = Status::NotSupported("ForwardLevelIterator::SeekForPrev()");
+ valid_ = false;
+ }
+ void Next() override {
+ assert(valid_);
+ file_iter_->Next();
+ for (;;) {
+ valid_ = file_iter_->Valid();
+ if (!file_iter_->status().ok()) {
+ assert(!valid_);
+ return;
+ }
+ if (valid_) {
+ return;
+ }
+ if (file_index_ + 1 >= files_.size()) {
+ valid_ = false;
+ return;
+ }
+ SetFileIndex(file_index_ + 1);
+ if (!status_.ok()) {
+ assert(!valid_);
+ return;
+ }
+ file_iter_->SeekToFirst();
+ }
+ }
+ Slice key() const override {
+ assert(valid_);
+ return file_iter_->key();
+ }
+ Slice value() const override {
+ assert(valid_);
+ return file_iter_->value();
+ }
+ Status status() const override {
+ if (!status_.ok()) {
+ return status_;
+ } else if (file_iter_) {
+ return file_iter_->status();
+ }
+ return Status::OK();
+ }
+ bool PrepareValue() override {
+ assert(valid_);
+ if (file_iter_->PrepareValue()) {
+ return true;
+ }
+
+ assert(!file_iter_->Valid());
+ valid_ = false;
+ return false;
+ }
+ bool IsKeyPinned() const override {
+ return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() &&
+ file_iter_->IsKeyPinned();
+ }
+ bool IsValuePinned() const override {
+ return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() &&
+ file_iter_->IsValuePinned();
+ }
+ void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override {
+ pinned_iters_mgr_ = pinned_iters_mgr;
+ if (file_iter_) {
+ file_iter_->SetPinnedItersMgr(pinned_iters_mgr_);
+ }
+ }
+
+ private:
+ const ColumnFamilyData* const cfd_;
+ const ReadOptions& read_options_;
+ const std::vector<FileMetaData*>& files_;
+
+ bool valid_;
+ uint32_t file_index_;
+ Status status_;
+ InternalIterator* file_iter_;
+ PinnedIteratorsManager* pinned_iters_mgr_;
+ // Kept alive by ForwardIterator::sv_->mutable_cf_options
+ const std::shared_ptr<const SliceTransform>& prefix_extractor_;
+ const bool allow_unprepared_value_;
+};
+
+ForwardIterator::ForwardIterator(DBImpl* db, const ReadOptions& read_options,
+ ColumnFamilyData* cfd,
+ SuperVersion* current_sv,
+ bool allow_unprepared_value)
+ : db_(db),
+ read_options_(read_options),
+ cfd_(cfd),
+ prefix_extractor_(current_sv->mutable_cf_options.prefix_extractor.get()),
+ user_comparator_(cfd->user_comparator()),
+ allow_unprepared_value_(allow_unprepared_value),
+ immutable_min_heap_(MinIterComparator(&cfd_->internal_comparator())),
+ sv_(current_sv),
+ mutable_iter_(nullptr),
+ current_(nullptr),
+ valid_(false),
+ status_(Status::OK()),
+ immutable_status_(Status::OK()),
+ has_iter_trimmed_for_upper_bound_(false),
+ current_over_upper_bound_(false),
+ is_prev_set_(false),
+ is_prev_inclusive_(false),
+ pinned_iters_mgr_(nullptr) {
+ if (sv_) {
+ RebuildIterators(false);
+ }
+
+ // immutable_status_ is a local aggregation of the
+ // status of the immutable Iterators.
+ // We have to PermitUncheckedError in case it is never
+ // used, otherwise it will fail ASSERT_STATUS_CHECKED.
+ immutable_status_.PermitUncheckedError();
+}
+
+ForwardIterator::~ForwardIterator() { Cleanup(true); }
+
+void ForwardIterator::SVCleanup(DBImpl* db, SuperVersion* sv,
+ bool background_purge_on_iterator_cleanup) {
+ if (sv->Unref()) {
+ // Job id == 0 means that this is not our background process, but rather
+ // a user thread.
+ JobContext job_context(0);
+ db->mutex_.Lock();
+ sv->Cleanup();
+ db->FindObsoleteFiles(&job_context, false, true);
+ if (background_purge_on_iterator_cleanup) {
+ db->ScheduleBgLogWriterClose(&job_context);
+ db->AddSuperVersionsToFreeQueue(sv);
+ db->SchedulePurge();
+ }
+ db->mutex_.Unlock();
+ if (!background_purge_on_iterator_cleanup) {
+ delete sv;
+ }
+ if (job_context.HaveSomethingToDelete()) {
+ db->PurgeObsoleteFiles(job_context, background_purge_on_iterator_cleanup);
+ }
+ job_context.Clean();
+ }
+}
+
+namespace {
+struct SVCleanupParams {
+ DBImpl* db;
+ SuperVersion* sv;
+ bool background_purge_on_iterator_cleanup;
+};
+} // anonymous namespace
+
+// Used in PinnedIteratorsManager to release pinned SuperVersion
+void ForwardIterator::DeferredSVCleanup(void* arg) {
+ auto d = reinterpret_cast<SVCleanupParams*>(arg);
+ ForwardIterator::SVCleanup(d->db, d->sv,
+ d->background_purge_on_iterator_cleanup);
+ delete d;
+}
+
+void ForwardIterator::SVCleanup() {
+ if (sv_ == nullptr) {
+ return;
+ }
+ bool background_purge =
+ read_options_.background_purge_on_iterator_cleanup ||
+ db_->immutable_db_options().avoid_unnecessary_blocking_io;
+ if (pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled()) {
+ // pinned_iters_mgr_ tells us to make sure that all visited key-value slices
+ // are alive until pinned_iters_mgr_->ReleasePinnedData() is called.
+ // The slices may point into some memtables owned by sv_, so we need to keep
+ // sv_ referenced until pinned_iters_mgr_ unpins everything.
+ auto p = new SVCleanupParams{db_, sv_, background_purge};
+ pinned_iters_mgr_->PinPtr(p, &ForwardIterator::DeferredSVCleanup);
+ } else {
+ SVCleanup(db_, sv_, background_purge);
+ }
+}
+
+void ForwardIterator::Cleanup(bool release_sv) {
+ if (mutable_iter_ != nullptr) {
+ DeleteIterator(mutable_iter_, true /* is_arena */);
+ }
+
+ for (auto* m : imm_iters_) {
+ DeleteIterator(m, true /* is_arena */);
+ }
+ imm_iters_.clear();
+
+ for (auto* f : l0_iters_) {
+ DeleteIterator(f);
+ }
+ l0_iters_.clear();
+
+ for (auto* l : level_iters_) {
+ DeleteIterator(l);
+ }
+ level_iters_.clear();
+
+ if (release_sv) {
+ SVCleanup();
+ }
+}
+
+bool ForwardIterator::Valid() const {
+ // See UpdateCurrent().
+ return valid_ ? !current_over_upper_bound_ : false;
+}
+
+void ForwardIterator::SeekToFirst() {
+ if (sv_ == nullptr) {
+ RebuildIterators(true);
+ } else if (sv_->version_number != cfd_->GetSuperVersionNumber()) {
+ RenewIterators();
+ } else if (immutable_status_.IsIncomplete()) {
+ ResetIncompleteIterators();
+ }
+ SeekInternal(Slice(), true, false);
+}
+
+bool ForwardIterator::IsOverUpperBound(const Slice& internal_key) const {
+ return !(read_options_.iterate_upper_bound == nullptr ||
+ cfd_->internal_comparator().user_comparator()->Compare(
+ ExtractUserKey(internal_key),
+ *read_options_.iterate_upper_bound) < 0);
+}
+
+void ForwardIterator::Seek(const Slice& internal_key) {
+ if (sv_ == nullptr) {
+ RebuildIterators(true);
+ } else if (sv_->version_number != cfd_->GetSuperVersionNumber()) {
+ RenewIterators();
+ } else if (immutable_status_.IsIncomplete()) {
+ ResetIncompleteIterators();
+ }
+
+ SeekInternal(internal_key, false, false);
+ if (read_options_.async_io) {
+ SeekInternal(internal_key, false, true);
+ }
+}
+
+// With async_io enabled, SeekInternal is called twice; the second call has
+// seek_after_async_io set and only performs the seek part, once the
+// asynchronously requested blocks have been retrieved.
+void ForwardIterator::SeekInternal(const Slice& internal_key,
+ bool seek_to_first,
+ bool seek_after_async_io) {
+ assert(mutable_iter_);
+ // mutable
+ if (!seek_after_async_io) {
+ seek_to_first ? mutable_iter_->SeekToFirst()
+ : mutable_iter_->Seek(internal_key);
+ }
+
+ // immutable
+ // TODO(ljin): NeedToSeekImmutable has a negative impact on performance
+ // if the immutable iterators turn out to need frequent seeks. We probably
+ // want to have an option to turn it off.
+ if (seek_to_first || seek_after_async_io ||
+ NeedToSeekImmutable(internal_key)) {
+ if (!seek_after_async_io) {
+ immutable_status_ = Status::OK();
+ if (has_iter_trimmed_for_upper_bound_ &&
+ (
+ // prev_ is not set yet
+ is_prev_set_ == false ||
+ // We are doing SeekToFirst() and internal_key.size() = 0
+ seek_to_first ||
+ // prev_key_ > internal_key
+ cfd_->internal_comparator().InternalKeyComparator::Compare(
+ prev_key_.GetInternalKey(), internal_key) > 0)) {
+ // Some iterators are trimmed. Need to rebuild.
+ RebuildIterators(true);
+ // Already seeked mutable iter, so seek again
+ seek_to_first ? mutable_iter_->SeekToFirst()
+ : mutable_iter_->Seek(internal_key);
+ }
+ {
+ auto tmp = MinIterHeap(MinIterComparator(&cfd_->internal_comparator()));
+ immutable_min_heap_.swap(tmp);
+ }
+ for (size_t i = 0; i < imm_iters_.size(); i++) {
+ auto* m = imm_iters_[i];
+ seek_to_first ? m->SeekToFirst() : m->Seek(internal_key);
+ if (!m->status().ok()) {
+ immutable_status_ = m->status();
+ } else if (m->Valid()) {
+ immutable_min_heap_.push(m);
+ }
+ }
+ }
+
+ Slice target_user_key;
+ if (!seek_to_first) {
+ target_user_key = ExtractUserKey(internal_key);
+ }
+ const VersionStorageInfo* vstorage = sv_->current->storage_info();
+ const std::vector<FileMetaData*>& l0 = vstorage->LevelFiles(0);
+ for (size_t i = 0; i < l0.size(); ++i) {
+ if (!l0_iters_[i]) {
+ continue;
+ }
+ if (seek_after_async_io) {
+ if (!l0_iters_[i]->status().IsTryAgain()) {
+ continue;
+ }
+ }
+
+ if (seek_to_first) {
+ l0_iters_[i]->SeekToFirst();
+ } else {
+ // If the target key passes over the largest key, we are sure Next()
+ // won't go over this file.
+ if (seek_after_async_io == false &&
+ user_comparator_->Compare(target_user_key,
+ l0[i]->largest.user_key()) > 0) {
+ if (read_options_.iterate_upper_bound != nullptr) {
+ has_iter_trimmed_for_upper_bound_ = true;
+ DeleteIterator(l0_iters_[i]);
+ l0_iters_[i] = nullptr;
+ }
+ continue;
+ }
+ l0_iters_[i]->Seek(internal_key);
+ }
+
+ if (l0_iters_[i]->status().IsTryAgain()) {
+ assert(!seek_after_async_io);
+ continue;
+ } else if (!l0_iters_[i]->status().ok()) {
+ immutable_status_ = l0_iters_[i]->status();
+ } else if (l0_iters_[i]->Valid() &&
+ !IsOverUpperBound(l0_iters_[i]->key())) {
+ immutable_min_heap_.push(l0_iters_[i]);
+ } else {
+ has_iter_trimmed_for_upper_bound_ = true;
+ DeleteIterator(l0_iters_[i]);
+ l0_iters_[i] = nullptr;
+ }
+ }
+
+ for (int32_t level = 1; level < vstorage->num_levels(); ++level) {
+ const std::vector<FileMetaData*>& level_files =
+ vstorage->LevelFiles(level);
+ if (level_files.empty()) {
+ continue;
+ }
+ if (level_iters_[level - 1] == nullptr) {
+ continue;
+ }
+
+ if (seek_after_async_io) {
+ if (!level_iters_[level - 1]->status().IsTryAgain()) {
+ continue;
+ }
+ }
+ uint32_t f_idx = 0;
+ if (!seek_to_first && !seek_after_async_io) {
+ f_idx = FindFileInRange(level_files, internal_key, 0,
+ static_cast<uint32_t>(level_files.size()));
+ }
+
+ // Seek
+ if (seek_after_async_io || f_idx < level_files.size()) {
+ if (!seek_after_async_io) {
+ level_iters_[level - 1]->SetFileIndex(f_idx);
+ }
+ seek_to_first ? level_iters_[level - 1]->SeekToFirst()
+ : level_iters_[level - 1]->Seek(internal_key);
+
+ if (level_iters_[level - 1]->status().IsTryAgain()) {
+ assert(!seek_after_async_io);
+ continue;
+ } else if (!level_iters_[level - 1]->status().ok()) {
+ immutable_status_ = level_iters_[level - 1]->status();
+ } else if (level_iters_[level - 1]->Valid() &&
+ !IsOverUpperBound(level_iters_[level - 1]->key())) {
+ immutable_min_heap_.push(level_iters_[level - 1]);
+ } else {
+ // Nothing in this level is interesting. Remove.
+ has_iter_trimmed_for_upper_bound_ = true;
+ DeleteIterator(level_iters_[level - 1]);
+ level_iters_[level - 1] = nullptr;
+ }
+ }
+ }
+
+ if (seek_to_first) {
+ is_prev_set_ = false;
+ } else {
+ prev_key_.SetInternalKey(internal_key);
+ is_prev_set_ = true;
+ is_prev_inclusive_ = true;
+ }
+
+ TEST_SYNC_POINT_CALLBACK("ForwardIterator::SeekInternal:Immutable", this);
+ } else if (current_ && current_ != mutable_iter_) {
+ // current_ is one of immutable iterators, push it back to the heap
+ immutable_min_heap_.push(current_);
+ }
+
+ // For async_io, current_ should only be updated when seek_after_async_io is
+ // true (i.e. in the second call).
+ if (seek_to_first || !read_options_.async_io || seek_after_async_io) {
+ UpdateCurrent();
+ }
+ TEST_SYNC_POINT_CALLBACK("ForwardIterator::SeekInternal:Return", this);
+}
+
+void ForwardIterator::Next() {
+ assert(valid_);
+ bool update_prev_key = false;
+
+ if (sv_ == nullptr || sv_->version_number != cfd_->GetSuperVersionNumber()) {
+ std::string current_key = key().ToString();
+ Slice old_key(current_key.data(), current_key.size());
+
+ if (sv_ == nullptr) {
+ RebuildIterators(true);
+ } else {
+ RenewIterators();
+ }
+
+ SeekInternal(old_key, false, false);
+ if (read_options_.async_io) {
+ SeekInternal(old_key, false, true);
+ }
+
+ if (!valid_ || key().compare(old_key) != 0) {
+ return;
+ }
+ } else if (current_ != mutable_iter_) {
+ // It is going to advance immutable iterator
+
+ if (is_prev_set_ && prefix_extractor_) {
+ // advance prev_key_ to current_ only if they share the same prefix
+ update_prev_key =
+ prefix_extractor_->Transform(prev_key_.GetUserKey())
+ .compare(prefix_extractor_->Transform(current_->key())) == 0;
+ } else {
+ update_prev_key = true;
+ }
+
+ if (update_prev_key) {
+ prev_key_.SetInternalKey(current_->key());
+ is_prev_set_ = true;
+ is_prev_inclusive_ = false;
+ }
+ }
+
+ current_->Next();
+ if (current_ != mutable_iter_) {
+ if (!current_->status().ok()) {
+ immutable_status_ = current_->status();
+ } else if ((current_->Valid()) && (!IsOverUpperBound(current_->key()))) {
+ immutable_min_heap_.push(current_);
+ } else {
+ if ((current_->Valid()) && (IsOverUpperBound(current_->key()))) {
+ // remove the current iterator
+ DeleteCurrentIter();
+ current_ = nullptr;
+ }
+ if (update_prev_key) {
+ mutable_iter_->Seek(prev_key_.GetInternalKey());
+ }
+ }
+ }
+ UpdateCurrent();
+ TEST_SYNC_POINT_CALLBACK("ForwardIterator::Next:Return", this);
+}
+
+Slice ForwardIterator::key() const {
+ assert(valid_);
+ return current_->key();
+}
+
+Slice ForwardIterator::value() const {
+ assert(valid_);
+ return current_->value();
+}
+
+Status ForwardIterator::status() const {
+ if (!status_.ok()) {
+ return status_;
+ } else if (!mutable_iter_->status().ok()) {
+ return mutable_iter_->status();
+ }
+
+ return immutable_status_;
+}
+
+bool ForwardIterator::PrepareValue() {
+ assert(valid_);
+ if (current_->PrepareValue()) {
+ return true;
+ }
+
+ assert(!current_->Valid());
+ assert(!current_->status().ok());
+ assert(current_ != mutable_iter_); // memtable iterator can't fail
+ assert(immutable_status_.ok());
+
+ valid_ = false;
+ immutable_status_ = current_->status();
+ return false;
+}
+
+Status ForwardIterator::GetProperty(std::string prop_name, std::string* prop) {
+ assert(prop != nullptr);
+ if (prop_name == "rocksdb.iterator.super-version-number") {
+ *prop = std::to_string(sv_->version_number);
+ return Status::OK();
+ }
+ return Status::InvalidArgument();
+}
+
+void ForwardIterator::SetPinnedItersMgr(
+ PinnedIteratorsManager* pinned_iters_mgr) {
+ pinned_iters_mgr_ = pinned_iters_mgr;
+ UpdateChildrenPinnedItersMgr();
+}
+
+void ForwardIterator::UpdateChildrenPinnedItersMgr() {
+ // Set PinnedIteratorsManager for mutable memtable iterator.
+ if (mutable_iter_) {
+ mutable_iter_->SetPinnedItersMgr(pinned_iters_mgr_);
+ }
+
+ // Set PinnedIteratorsManager for immutable memtable iterators.
+ for (InternalIterator* child_iter : imm_iters_) {
+ if (child_iter) {
+ child_iter->SetPinnedItersMgr(pinned_iters_mgr_);
+ }
+ }
+
+ // Set PinnedIteratorsManager for L0 files iterators.
+ for (InternalIterator* child_iter : l0_iters_) {
+ if (child_iter) {
+ child_iter->SetPinnedItersMgr(pinned_iters_mgr_);
+ }
+ }
+
+ // Set PinnedIteratorsManager for L1+ levels iterators.
+ for (ForwardLevelIterator* child_iter : level_iters_) {
+ if (child_iter) {
+ child_iter->SetPinnedItersMgr(pinned_iters_mgr_);
+ }
+ }
+}
+
+bool ForwardIterator::IsKeyPinned() const {
+ return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() &&
+ current_->IsKeyPinned();
+}
+
+bool ForwardIterator::IsValuePinned() const {
+ return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() &&
+ current_->IsValuePinned();
+}
+
+void ForwardIterator::RebuildIterators(bool refresh_sv) {
+ // Clean up
+ Cleanup(refresh_sv);
+ if (refresh_sv) {
+ // New
+ sv_ = cfd_->GetReferencedSuperVersion(db_);
+ }
+ ReadRangeDelAggregator range_del_agg(&cfd_->internal_comparator(),
+ kMaxSequenceNumber /* upper_bound */);
+ mutable_iter_ = sv_->mem->NewIterator(read_options_, &arena_);
+ sv_->imm->AddIterators(read_options_, &imm_iters_, &arena_);
+ if (!read_options_.ignore_range_deletions) {
+ std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter(
+ sv_->mem->NewRangeTombstoneIterator(
+ read_options_, sv_->current->version_set()->LastSequence(),
+ false /* immutable_memtable */));
+ range_del_agg.AddTombstones(std::move(range_del_iter));
+ // imm->AddRangeTombstoneIterators() always returns Status::OK().
+ Status temp_s = sv_->imm->AddRangeTombstoneIterators(read_options_, &arena_,
+ &range_del_agg);
+ assert(temp_s.ok());
+ }
+ has_iter_trimmed_for_upper_bound_ = false;
+
+ const auto* vstorage = sv_->current->storage_info();
+ const auto& l0_files = vstorage->LevelFiles(0);
+ l0_iters_.reserve(l0_files.size());
+ for (const auto* l0 : l0_files) {
+ if ((read_options_.iterate_upper_bound != nullptr) &&
+ cfd_->internal_comparator().user_comparator()->Compare(
+ l0->smallest.user_key(), *read_options_.iterate_upper_bound) > 0) {
+ // No need to set has_iter_trimmed_for_upper_bound_: this ForwardIterator
+ // will never be interested in files with smallest key above
+ // iterate_upper_bound, since iterate_upper_bound can't be changed.
+ l0_iters_.push_back(nullptr);
+ continue;
+ }
+ l0_iters_.push_back(cfd_->table_cache()->NewIterator(
+ read_options_, *cfd_->soptions(), cfd_->internal_comparator(), *l0,
+ read_options_.ignore_range_deletions ? nullptr : &range_del_agg,
+ sv_->mutable_cf_options.prefix_extractor,
+ /*table_reader_ptr=*/nullptr, /*file_read_hist=*/nullptr,
+ TableReaderCaller::kUserIterator, /*arena=*/nullptr,
+ /*skip_filters=*/false, /*level=*/-1,
+ MaxFileSizeForL0MetaPin(sv_->mutable_cf_options),
+ /*smallest_compaction_key=*/nullptr,
+ /*largest_compaction_key=*/nullptr, allow_unprepared_value_));
+ }
+ BuildLevelIterators(vstorage, sv_);
+ current_ = nullptr;
+ is_prev_set_ = false;
+
+ UpdateChildrenPinnedItersMgr();
+ if (!range_del_agg.IsEmpty()) {
+ status_ = Status::NotSupported(
+ "Range tombstones unsupported with ForwardIterator");
+ valid_ = false;
+ }
+}
+
+void ForwardIterator::RenewIterators() {
+ SuperVersion* svnew;
+ assert(sv_);
+ svnew = cfd_->GetReferencedSuperVersion(db_);
+
+ if (mutable_iter_ != nullptr) {
+ DeleteIterator(mutable_iter_, true /* is_arena */);
+ }
+ for (auto* m : imm_iters_) {
+ DeleteIterator(m, true /* is_arena */);
+ }
+ imm_iters_.clear();
+
+ mutable_iter_ = svnew->mem->NewIterator(read_options_, &arena_);
+ svnew->imm->AddIterators(read_options_, &imm_iters_, &arena_);
+ ReadRangeDelAggregator range_del_agg(&cfd_->internal_comparator(),
+ kMaxSequenceNumber /* upper_bound */);
+ if (!read_options_.ignore_range_deletions) {
+ std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter(
+ svnew->mem->NewRangeTombstoneIterator(
+ read_options_, sv_->current->version_set()->LastSequence(),
+ false /* immutable_memtable */));
+ range_del_agg.AddTombstones(std::move(range_del_iter));
+ // imm->AddRangeTombstoneIterators() always returns Status::OK().
+ Status temp_s = svnew->imm->AddRangeTombstoneIterators(
+ read_options_, &arena_, &range_del_agg);
+ assert(temp_s.ok());
+ }
+
+ const auto* vstorage = sv_->current->storage_info();
+ const auto& l0_files = vstorage->LevelFiles(0);
+ const auto* vstorage_new = svnew->current->storage_info();
+ const auto& l0_files_new = vstorage_new->LevelFiles(0);
+ size_t iold, inew;
+ bool found;
+ std::vector<InternalIterator*> l0_iters_new;
+ l0_iters_new.reserve(l0_files_new.size());
+
+ for (inew = 0; inew < l0_files_new.size(); inew++) {
+ found = false;
+ for (iold = 0; iold < l0_files.size(); iold++) {
+ if (l0_files[iold] == l0_files_new[inew]) {
+ found = true;
+ break;
+ }
+ }
+ if (found) {
+ if (l0_iters_[iold] == nullptr) {
+ l0_iters_new.push_back(nullptr);
+ TEST_SYNC_POINT_CALLBACK("ForwardIterator::RenewIterators:Null", this);
+ } else {
+ l0_iters_new.push_back(l0_iters_[iold]);
+ l0_iters_[iold] = nullptr;
+ TEST_SYNC_POINT_CALLBACK("ForwardIterator::RenewIterators:Copy", this);
+ }
+ continue;
+ }
+ l0_iters_new.push_back(cfd_->table_cache()->NewIterator(
+ read_options_, *cfd_->soptions(), cfd_->internal_comparator(),
+ *l0_files_new[inew],
+ read_options_.ignore_range_deletions ? nullptr : &range_del_agg,
+ svnew->mutable_cf_options.prefix_extractor,
+ /*table_reader_ptr=*/nullptr, /*file_read_hist=*/nullptr,
+ TableReaderCaller::kUserIterator, /*arena=*/nullptr,
+ /*skip_filters=*/false, /*level=*/-1,
+ MaxFileSizeForL0MetaPin(svnew->mutable_cf_options),
+ /*smallest_compaction_key=*/nullptr,
+ /*largest_compaction_key=*/nullptr, allow_unprepared_value_));
+ }
+
+ for (auto* f : l0_iters_) {
+ DeleteIterator(f);
+ }
+ l0_iters_.clear();
+ l0_iters_ = l0_iters_new;
+
+ for (auto* l : level_iters_) {
+ DeleteIterator(l);
+ }
+ level_iters_.clear();
+ BuildLevelIterators(vstorage_new, svnew);
+ current_ = nullptr;
+ is_prev_set_ = false;
+ SVCleanup();
+ sv_ = svnew;
+
+ UpdateChildrenPinnedItersMgr();
+ if (!range_del_agg.IsEmpty()) {
+ status_ = Status::NotSupported(
+ "Range tombstones unsupported with ForwardIterator");
+ valid_ = false;
+ }
+}
+
+void ForwardIterator::BuildLevelIterators(const VersionStorageInfo* vstorage,
+ SuperVersion* sv) {
+ level_iters_.reserve(vstorage->num_levels() - 1);
+ for (int32_t level = 1; level < vstorage->num_levels(); ++level) {
+ const auto& level_files = vstorage->LevelFiles(level);
+ if ((level_files.empty()) ||
+ ((read_options_.iterate_upper_bound != nullptr) &&
+ (user_comparator_->Compare(*read_options_.iterate_upper_bound,
+ level_files[0]->smallest.user_key()) <
+ 0))) {
+ level_iters_.push_back(nullptr);
+ if (!level_files.empty()) {
+ has_iter_trimmed_for_upper_bound_ = true;
+ }
+ } else {
+ level_iters_.push_back(new ForwardLevelIterator(
+ cfd_, read_options_, level_files,
+ sv->mutable_cf_options.prefix_extractor, allow_unprepared_value_));
+ }
+ }
+}
+
+void ForwardIterator::ResetIncompleteIterators() {
+ const auto& l0_files = sv_->current->storage_info()->LevelFiles(0);
+ for (size_t i = 0; i < l0_iters_.size(); ++i) {
+ assert(i < l0_files.size());
+ if (!l0_iters_[i] || !l0_iters_[i]->status().IsIncomplete()) {
+ continue;
+ }
+ DeleteIterator(l0_iters_[i]);
+ l0_iters_[i] = cfd_->table_cache()->NewIterator(
+ read_options_, *cfd_->soptions(), cfd_->internal_comparator(),
+ *l0_files[i], /*range_del_agg=*/nullptr,
+ sv_->mutable_cf_options.prefix_extractor,
+ /*table_reader_ptr=*/nullptr, /*file_read_hist=*/nullptr,
+ TableReaderCaller::kUserIterator, /*arena=*/nullptr,
+ /*skip_filters=*/false, /*level=*/-1,
+ MaxFileSizeForL0MetaPin(sv_->mutable_cf_options),
+ /*smallest_compaction_key=*/nullptr,
+ /*largest_compaction_key=*/nullptr, allow_unprepared_value_);
+ l0_iters_[i]->SetPinnedItersMgr(pinned_iters_mgr_);
+ }
+
+ for (auto* level_iter : level_iters_) {
+ if (level_iter && level_iter->status().IsIncomplete()) {
+ level_iter->Reset();
+ }
+ }
+
+ current_ = nullptr;
+ is_prev_set_ = false;
+}
+
+void ForwardIterator::UpdateCurrent() {
+ if (immutable_min_heap_.empty() && !mutable_iter_->Valid()) {
+ current_ = nullptr;
+ } else if (immutable_min_heap_.empty()) {
+ current_ = mutable_iter_;
+ } else if (!mutable_iter_->Valid()) {
+ current_ = immutable_min_heap_.top();
+ immutable_min_heap_.pop();
+ } else {
+ current_ = immutable_min_heap_.top();
+ assert(current_ != nullptr);
+ assert(current_->Valid());
+ int cmp = cfd_->internal_comparator().InternalKeyComparator::Compare(
+ mutable_iter_->key(), current_->key());
+ assert(cmp != 0);
+ if (cmp > 0) {
+ immutable_min_heap_.pop();
+ } else {
+ current_ = mutable_iter_;
+ }
+ }
+ valid_ = current_ != nullptr && immutable_status_.ok();
+ if (!status_.ok()) {
+ status_ = Status::OK();
+ }
+
+ // Upper bound doesn't apply to the memtable iterator. We want Valid() to
+ // return false when all iterators are over iterate_upper_bound, but can't
+ // just set valid_ to false, as that would effectively disable the tailing
+ // optimization (Seek() would be called on all immutable iterators regardless
+ // of whether the target key is greater than prev_key_).
+ current_over_upper_bound_ = valid_ && IsOverUpperBound(current_->key());
+}
+
+bool ForwardIterator::NeedToSeekImmutable(const Slice& target) {
+ // We maintain the interval (prev_key_, immutable_min_heap_.top()->key())
+ // such that there are no records with keys within that range in
+ // immutable_min_heap_. Since immutable structures (SST files and immutable
+ // memtables) can't change in this version, we don't need to do a seek if
+ // 'target' belongs to that interval (immutable_min_heap_.top() is already
+ // at the correct position).
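+ // For example, if a previous Seek("a") left the heap top at "f", a later
+ // Seek("c") falls inside (a, f) and the immutable iterators can be reused
+ // as-is (keys shown here are simplified user keys).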
+
+ if (!valid_ || !current_ || !is_prev_set_ || !immutable_status_.ok()) {
+ return true;
+ }
+ Slice prev_key = prev_key_.GetInternalKey();
+ if (prefix_extractor_ && prefix_extractor_->Transform(target).compare(
+ prefix_extractor_->Transform(prev_key)) != 0) {
+ return true;
+ }
+ if (cfd_->internal_comparator().InternalKeyComparator::Compare(
+ prev_key, target) >= (is_prev_inclusive_ ? 1 : 0)) {
+ return true;
+ }
+
+ if (immutable_min_heap_.empty() && current_ == mutable_iter_) {
+ // Nothing to seek on.
+ return false;
+ }
+ if (cfd_->internal_comparator().InternalKeyComparator::Compare(
+ target, current_ == mutable_iter_ ? immutable_min_heap_.top()->key()
+ : current_->key()) > 0) {
+ return true;
+ }
+ return false;
+}
+
+void ForwardIterator::DeleteCurrentIter() {
+ const VersionStorageInfo* vstorage = sv_->current->storage_info();
+ const std::vector<FileMetaData*>& l0 = vstorage->LevelFiles(0);
+ for (size_t i = 0; i < l0.size(); ++i) {
+ if (!l0_iters_[i]) {
+ continue;
+ }
+ if (l0_iters_[i] == current_) {
+ has_iter_trimmed_for_upper_bound_ = true;
+ DeleteIterator(l0_iters_[i]);
+ l0_iters_[i] = nullptr;
+ return;
+ }
+ }
+
+ for (int32_t level = 1; level < vstorage->num_levels(); ++level) {
+ if (level_iters_[level - 1] == nullptr) {
+ continue;
+ }
+ if (level_iters_[level - 1] == current_) {
+ has_iter_trimmed_for_upper_bound_ = true;
+ DeleteIterator(level_iters_[level - 1]);
+ level_iters_[level - 1] = nullptr;
+ }
+ }
+}
+
+bool ForwardIterator::TEST_CheckDeletedIters(int* pdeleted_iters,
+ int* pnum_iters) {
+ bool retval = false;
+ int deleted_iters = 0;
+ int num_iters = 0;
+
+ const VersionStorageInfo* vstorage = sv_->current->storage_info();
+ const std::vector<FileMetaData*>& l0 = vstorage->LevelFiles(0);
+ for (size_t i = 0; i < l0.size(); ++i) {
+ if (!l0_iters_[i]) {
+ retval = true;
+ deleted_iters++;
+ } else {
+ num_iters++;
+ }
+ }
+
+ for (int32_t level = 1; level < vstorage->num_levels(); ++level) {
+ if ((level_iters_[level - 1] == nullptr) &&
+ (!vstorage->LevelFiles(level).empty())) {
+ retval = true;
+ deleted_iters++;
+ } else if (!vstorage->LevelFiles(level).empty()) {
+ num_iters++;
+ }
+ }
+ if ((!retval) && num_iters <= 1) {
+ retval = true;
+ }
+ if (pdeleted_iters) {
+ *pdeleted_iters = deleted_iters;
+ }
+ if (pnum_iters) {
+ *pnum_iters = num_iters;
+ }
+ return retval;
+}
+
+uint32_t ForwardIterator::FindFileInRange(
+ const std::vector<FileMetaData*>& files, const Slice& internal_key,
+ uint32_t left, uint32_t right) {
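+ // Binary search for the first file whose largest key is >= internal_key.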
+ auto cmp = [&](const FileMetaData* f, const Slice& k) -> bool {
+ return cfd_->internal_comparator().InternalKeyComparator::Compare(
+ f->largest.Encode(), k) < 0;
+ };
+ const auto& b = files.begin();
+ return static_cast<uint32_t>(
+ std::lower_bound(b + left, b + right, internal_key, cmp) - b);
+}
+
+void ForwardIterator::DeleteIterator(InternalIterator* iter, bool is_arena) {
+ if (iter == nullptr) {
+ return;
+ }
+
+ if (pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled()) {
+ pinned_iters_mgr_->PinIterator(iter, is_arena);
+ } else {
+ if (is_arena) {
+ iter->~InternalIterator();
+ } else {
+ delete iter;
+ }
+ }
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/db/forward_iterator.h b/src/rocksdb/db/forward_iterator.h
new file mode 100644
index 000000000..5a5c6f0f3
--- /dev/null
+++ b/src/rocksdb/db/forward_iterator.h
@@ -0,0 +1,168 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include "rocksdb/comparator.h"
+#ifndef ROCKSDB_LITE
+
+#include <queue>
+#include <string>
+#include <vector>
+
+#include "memory/arena.h"
+#include "rocksdb/db.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/options.h"
+#include "table/internal_iterator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBImpl;
+class Env;
+struct SuperVersion;
+class ColumnFamilyData;
+class ForwardLevelIterator;
+class VersionStorageInfo;
+struct FileMetaData;
+
+class MinIterComparator {
+ public:
+ explicit MinIterComparator(const CompareInterface* comparator)
+ : comparator_(comparator) {}
+
+ bool operator()(InternalIterator* a, InternalIterator* b) {
+ return comparator_->Compare(a->key(), b->key()) > 0;
+ }
+
+ private:
+ const CompareInterface* comparator_;
+};
+
+using MinIterHeap =
+ std::priority_queue<InternalIterator*, std::vector<InternalIterator*>,
+ MinIterComparator>;
+
+/**
+ * ForwardIterator is a special type of iterator that only supports Seek()
+ * and Next(). It is expected to perform better than TailingIterator by
+ * removing the encapsulation and making all information accessible within
+ * the iterator. In the current implementation, a snapshot is taken at the
+ * time Seek() is called; subsequent Next() calls do not see values written
+ * after that point.
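+ *
+ * Rough usage sketch (illustrative only; ForwardIterator normally backs
+ * tailing iterators created through the public NewIterator() API):
+ *
+ *   ReadOptions ro;
+ *   ro.tailing = true;
+ *   std::unique_ptr<Iterator> it(db->NewIterator(ro));
+ *   for (it->Seek(start_key); it->Valid(); it->Next()) {
+ *     // consume it->key() and it->value()
+ *   }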
+ */
+class ForwardIterator : public InternalIterator {
+ public:
+ ForwardIterator(DBImpl* db, const ReadOptions& read_options,
+ ColumnFamilyData* cfd, SuperVersion* current_sv = nullptr,
+ bool allow_unprepared_value = false);
+ virtual ~ForwardIterator();
+
+ void SeekForPrev(const Slice& /*target*/) override {
+ status_ = Status::NotSupported("ForwardIterator::SeekForPrev()");
+ valid_ = false;
+ }
+ void SeekToLast() override {
+ status_ = Status::NotSupported("ForwardIterator::SeekToLast()");
+ valid_ = false;
+ }
+ void Prev() override {
+ status_ = Status::NotSupported("ForwardIterator::Prev");
+ valid_ = false;
+ }
+
+ virtual bool Valid() const override;
+ void SeekToFirst() override;
+ virtual void Seek(const Slice& target) override;
+ virtual void Next() override;
+ virtual Slice key() const override;
+ virtual Slice value() const override;
+ virtual Status status() const override;
+ virtual bool PrepareValue() override;
+ virtual Status GetProperty(std::string prop_name, std::string* prop) override;
+ virtual void SetPinnedItersMgr(
+ PinnedIteratorsManager* pinned_iters_mgr) override;
+ virtual bool IsKeyPinned() const override;
+ virtual bool IsValuePinned() const override;
+
+ bool TEST_CheckDeletedIters(int* deleted_iters, int* num_iters);
+
+ private:
+ void Cleanup(bool release_sv);
+ // Unreference and, if needed, clean up the current SuperVersion. This is
+ // either done immediately or deferred until this iterator is unpinned by
+ // PinnedIteratorsManager.
+ void SVCleanup();
+ static void SVCleanup(DBImpl* db, SuperVersion* sv,
+ bool background_purge_on_iterator_cleanup);
+ static void DeferredSVCleanup(void* arg);
+
+ void RebuildIterators(bool refresh_sv);
+ void RenewIterators();
+ void BuildLevelIterators(const VersionStorageInfo* vstorage,
+ SuperVersion* sv);
+ void ResetIncompleteIterators();
+ void SeekInternal(const Slice& internal_key, bool seek_to_first,
+ bool seek_after_async_io);
+
+ void UpdateCurrent();
+ bool NeedToSeekImmutable(const Slice& internal_key);
+ void DeleteCurrentIter();
+ uint32_t FindFileInRange(const std::vector<FileMetaData*>& files,
+ const Slice& internal_key, uint32_t left,
+ uint32_t right);
+
+ bool IsOverUpperBound(const Slice& internal_key) const;
+
+ // Set PinnedIteratorsManager for all children Iterators, this function should
+ // be called whenever we update children Iterators or pinned_iters_mgr_.
+ void UpdateChildrenPinnedItersMgr();
+
+ // A helper function that will release iter in the proper manner, or pass it
+ // to pinned_iters_mgr_ to release it later if pinning is enabled.
+ void DeleteIterator(InternalIterator* iter, bool is_arena = false);
+
+ DBImpl* const db_;
+ const ReadOptions read_options_;
+ ColumnFamilyData* const cfd_;
+ const SliceTransform* const prefix_extractor_;
+ const Comparator* user_comparator_;
+ const bool allow_unprepared_value_;
+ MinIterHeap immutable_min_heap_;
+
+ SuperVersion* sv_;
+ InternalIterator* mutable_iter_;
+ std::vector<InternalIterator*> imm_iters_;
+ std::vector<InternalIterator*> l0_iters_;
+ std::vector<ForwardLevelIterator*> level_iters_;
+ InternalIterator* current_;
+ bool valid_;
+
+ // Internal iterator status; set only by one of the unsupported methods.
+ Status status_;
+ // Status of immutable iterators, maintained here to avoid iterating over
+ // all of them in status().
+ Status immutable_status_;
+ // Indicates that at least one of the immutable iterators pointed to a key
+ // larger than iterate_upper_bound and was therefore destroyed. Seek() may
+ // need to rebuild such iterators.
+ bool has_iter_trimmed_for_upper_bound_;
+ // Is current key larger than iterate_upper_bound? If so, makes Valid()
+ // return false.
+ bool current_over_upper_bound_;
+
+ // Left endpoint of the range of keys that immutable iterators currently
+ // cover. When Seek() is called with a key that's within that range, immutable
+ // iterators don't need to be moved; see NeedToSeekImmutable(). This key is
+ // included in the range after a Seek(), but excluded when advancing the
+ // iterator using Next().
+ IterKey prev_key_;
+ bool is_prev_set_;
+ bool is_prev_inclusive_;
+
+ PinnedIteratorsManager* pinned_iters_mgr_;
+ Arena arena_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/db/forward_iterator_bench.cc b/src/rocksdb/db/forward_iterator_bench.cc
new file mode 100644
index 000000000..325661cef
--- /dev/null
+++ b/src/rocksdb/db/forward_iterator_bench.cc
@@ -0,0 +1,378 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#if !defined(GFLAGS) || defined(ROCKSDB_LITE)
+#include <cstdio>
+int main() {
+ fprintf(stderr, "Please install gflags to run rocksdb tools\n");
+ return 1;
+}
+#elif defined(OS_MACOSX) || defined(OS_WIN)
+// Block forward_iterator_bench under MAC and Windows
+int main() { return 0; }
+#else
+#include <semaphore.h>
+
+#include <atomic>
+#include <bitset>
+#include <chrono>
+#include <climits>
+#include <condition_variable>
+#include <limits>
+#include <mutex>
+#include <queue>
+#include <random>
+#include <thread>
+
+#include "port/port.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/db.h"
+#include "rocksdb/status.h"
+#include "rocksdb/table.h"
+#include "test_util/testharness.h"
+#include "util/gflags_compat.h"
+
+const int MAX_SHARDS = 100000;
+
+DEFINE_int32(writers, 8, "");
+DEFINE_int32(readers, 8, "");
+DEFINE_int64(rate, 100000, "");
+DEFINE_int64(value_size, 300, "");
+DEFINE_int64(shards, 1000, "");
+DEFINE_int64(memtable_size, 500000000, "");
+DEFINE_int64(block_cache_size, 300000000, "");
+DEFINE_int64(block_size, 65536, "");
+DEFINE_double(runtime, 300.0, "");
+DEFINE_bool(cache_only_first, true, "");
+DEFINE_bool(iterate_upper_bound, true, "");
+
+struct Stats {
+ char pad1[128] __attribute__((__unused__));
+ std::atomic<uint64_t> written{0};
+ char pad2[128] __attribute__((__unused__));
+ std::atomic<uint64_t> read{0};
+ std::atomic<uint64_t> cache_misses{0};
+ char pad3[128] __attribute__((__unused__));
+} stats;
+
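+// Shard and sequence number are stored big-endian so that the default
+// bytewise comparator orders keys numerically by (shard, seqno).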
+struct Key {
+ Key() {}
+ Key(uint64_t shard_in, uint64_t seqno_in)
+ : shard_be(htobe64(shard_in)), seqno_be(htobe64(seqno_in)) {}
+
+ uint64_t shard() const { return be64toh(shard_be); }
+ uint64_t seqno() const { return be64toh(seqno_be); }
+
+ private:
+ uint64_t shard_be;
+ uint64_t seqno_be;
+} __attribute__((__packed__));
+
+struct Reader;
+struct Writer;
+
+struct ShardState {
+ char pad1[128] __attribute__((__unused__));
+ std::atomic<uint64_t> last_written{0};
+ Writer* writer;
+ Reader* reader;
+ char pad2[128] __attribute__((__unused__));
+ std::atomic<uint64_t> last_read{0};
+ std::unique_ptr<ROCKSDB_NAMESPACE::Iterator> it;
+ std::unique_ptr<ROCKSDB_NAMESPACE::Iterator> it_cacheonly;
+ Key upper_bound;
+ ROCKSDB_NAMESPACE::Slice upper_bound_slice;
+ char pad3[128] __attribute__((__unused__));
+};
+
+struct Reader {
+ public:
+ explicit Reader(std::vector<ShardState>* shard_states,
+ ROCKSDB_NAMESPACE::DB* db)
+ : shard_states_(shard_states), db_(db) {
+ sem_init(&sem_, 0, 0);
+ thread_ = port::Thread(&Reader::run, this);
+ }
+
+ void run() {
+ while (1) {
+ sem_wait(&sem_);
+ if (done_.load()) {
+ break;
+ }
+
+ uint64_t shard;
+ {
+ std::lock_guard<std::mutex> guard(queue_mutex_);
+ assert(!shards_pending_queue_.empty());
+ shard = shards_pending_queue_.front();
+ shards_pending_queue_.pop();
+ shards_pending_set_.reset(shard);
+ }
+ readOnceFromShard(shard);
+ }
+ }
+
+ void readOnceFromShard(uint64_t shard) {
+ ShardState& state = (*shard_states_)[shard];
+ if (!state.it) {
+ // Initialize iterators
+ ROCKSDB_NAMESPACE::ReadOptions options;
+ options.tailing = true;
+ if (FLAGS_iterate_upper_bound) {
+ state.upper_bound = Key(shard, std::numeric_limits<uint64_t>::max());
+ state.upper_bound_slice = ROCKSDB_NAMESPACE::Slice(
+ (const char*)&state.upper_bound, sizeof(state.upper_bound));
+ options.iterate_upper_bound = &state.upper_bound_slice;
+ }
+
+ state.it.reset(db_->NewIterator(options));
+
+ if (FLAGS_cache_only_first) {
+ options.read_tier = ROCKSDB_NAMESPACE::ReadTier::kBlockCacheTier;
+ state.it_cacheonly.reset(db_->NewIterator(options));
+ }
+ }
+
+ const uint64_t upto = state.last_written.load();
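+ // Try the block-cache-only iterator first (when enabled); a cache miss
+ // surfaces as an Incomplete status, after which we fall back to the
+ // regular iterator below.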
+ for (ROCKSDB_NAMESPACE::Iterator* it :
+ {state.it_cacheonly.get(), state.it.get()}) {
+ if (it == nullptr) {
+ continue;
+ }
+ if (state.last_read.load() >= upto) {
+ break;
+ }
+ bool need_seek = true;
+ for (uint64_t seq = state.last_read.load() + 1; seq <= upto; ++seq) {
+ if (need_seek) {
+ Key from(shard, state.last_read.load() + 1);
+ it->Seek(ROCKSDB_NAMESPACE::Slice((const char*)&from, sizeof(from)));
+ need_seek = false;
+ } else {
+ it->Next();
+ }
+ if (it->status().IsIncomplete()) {
+ ++::stats.cache_misses;
+ break;
+ }
+ assert(it->Valid());
+ assert(it->key().size() == sizeof(Key));
+ Key key;
+ memcpy(&key, it->key().data(), it->key().size());
+ // fprintf(stderr, "Expecting (%ld, %ld) read (%ld, %ld)\n",
+ // shard, seq, key.shard(), key.seqno());
+ assert(key.shard() == shard);
+ assert(key.seqno() == seq);
+ state.last_read.store(seq);
+ ++::stats.read;
+ }
+ }
+ }
+
+ void onWrite(uint64_t shard) {
+ {
+ std::lock_guard<std::mutex> guard(queue_mutex_);
+ if (!shards_pending_set_.test(shard)) {
+ shards_pending_queue_.push(shard);
+ shards_pending_set_.set(shard);
+ sem_post(&sem_);
+ }
+ }
+ }
+
+ ~Reader() {
+ done_.store(true);
+ sem_post(&sem_);
+ thread_.join();
+ }
+
+ private:
+ char pad1[128] __attribute__((__unused__));
+ std::vector<ShardState>* shard_states_;
+ ROCKSDB_NAMESPACE::DB* db_;
+ ROCKSDB_NAMESPACE::port::Thread thread_;
+ sem_t sem_;
+ std::mutex queue_mutex_;
+ std::bitset<MAX_SHARDS + 1> shards_pending_set_;
+ std::queue<uint64_t> shards_pending_queue_;
+ std::atomic<bool> done_{false};
+ char pad2[128] __attribute__((__unused__));
+};
+
+struct Writer {
+ explicit Writer(std::vector<ShardState>* shard_states,
+ ROCKSDB_NAMESPACE::DB* db)
+ : shard_states_(shard_states), db_(db) {}
+
+ void start() { thread_ = port::Thread(&Writer::run, this); }
+
+ void run() {
+ std::queue<std::chrono::steady_clock::time_point> workq;
+ std::chrono::steady_clock::time_point deadline(
+ std::chrono::steady_clock::now() +
+ std::chrono::nanoseconds((uint64_t)(1000000000 * FLAGS_runtime)));
+ std::vector<uint64_t> my_shards;
+ for (int i = 1; i <= FLAGS_shards; ++i) {
+ if ((*shard_states_)[i].writer == this) {
+ my_shards.push_back(i);
+ }
+ }
+
+ std::mt19937 rng{std::random_device()()};
+ std::uniform_int_distribution<int> shard_dist(
+ 0, static_cast<int>(my_shards.size()) - 1);
+ std::string value(FLAGS_value_size, '*');
+
+ while (1) {
+ auto now = std::chrono::steady_clock::now();
+ if (FLAGS_runtime >= 0 && now >= deadline) {
+ break;
+ }
+ if (workq.empty()) {
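+ // Refill the queue with evenly spaced deadlines so this writer issues
+ // roughly FLAGS_rate / FLAGS_writers puts per second.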
+ for (int i = 0; i < FLAGS_rate; i += FLAGS_writers) {
+ std::chrono::nanoseconds offset(1000000000LL * i / FLAGS_rate);
+ workq.push(now + offset);
+ }
+ }
+ while (!workq.empty() && workq.front() < now) {
+ workq.pop();
+ uint64_t shard = my_shards[shard_dist(rng)];
+ ShardState& state = (*shard_states_)[shard];
+ uint64_t seqno = state.last_written.load() + 1;
+ Key key(shard, seqno);
+ // fprintf(stderr, "Writing (%ld, %ld)\n", shard, seqno);
+ ROCKSDB_NAMESPACE::Status status =
+ db_->Put(ROCKSDB_NAMESPACE::WriteOptions(),
+ ROCKSDB_NAMESPACE::Slice((const char*)&key, sizeof(key)),
+ ROCKSDB_NAMESPACE::Slice(value));
+ assert(status.ok());
+ state.last_written.store(seqno);
+ state.reader->onWrite(shard);
+ ++::stats.written;
+ }
+ std::this_thread::sleep_for(std::chrono::milliseconds(1));
+ }
+ // fprintf(stderr, "Writer done\n");
+ }
+
+ ~Writer() { thread_.join(); }
+
+ private:
+ char pad1[128] __attribute__((__unused__));
+ std::vector<ShardState>* shard_states_;
+ ROCKSDB_NAMESPACE::DB* db_;
+ ROCKSDB_NAMESPACE::port::Thread thread_;
+ char pad2[128] __attribute__((__unused__));
+};
+
+struct StatsThread {
+ explicit StatsThread(ROCKSDB_NAMESPACE::DB* db)
+ : db_(db), thread_(&StatsThread::run, this) {}
+
+ void run() {
+ auto tstart = std::chrono::steady_clock::now(), tlast = tstart;
+ uint64_t wlast = 0, rlast = 0;
+ while (!done_.load()) {
+ {
+ std::unique_lock<std::mutex> lock(cvm_);
+ cv_.wait_for(lock, std::chrono::seconds(1));
+ }
+ auto now = std::chrono::steady_clock::now();
+ double elapsed =
+ std::chrono::duration_cast<std::chrono::duration<double> >(now -
+ tlast)
+ .count();
+ uint64_t w = ::stats.written.load();
+ uint64_t r = ::stats.read.load();
+ fprintf(stderr,
+ "%s elapsed %4lds | written %10ld | w/s %10.0f | read %10ld | "
+ "r/s %10.0f | cache misses %10ld\n",
+ db_->GetEnv()->TimeToString(time(nullptr)).c_str(),
+ std::chrono::duration_cast<std::chrono::seconds>(now - tstart)
+ .count(),
+ w, (w - wlast) / elapsed, r, (r - rlast) / elapsed,
+ ::stats.cache_misses.load());
+ wlast = w;
+ rlast = r;
+ tlast = now;
+ }
+ }
+
+ ~StatsThread() {
+ {
+ std::lock_guard<std::mutex> guard(cvm_);
+ done_.store(true);
+ }
+ cv_.notify_all();
+ thread_.join();
+ }
+
+ private:
+ ROCKSDB_NAMESPACE::DB* db_;
+ std::mutex cvm_;
+ std::condition_variable cv_;
+ ROCKSDB_NAMESPACE::port::Thread thread_;
+ std::atomic<bool> done_{false};
+};
+
+int main(int argc, char** argv) {
+ GFLAGS_NAMESPACE::ParseCommandLineFlags(&argc, &argv, true);
+
+ std::mt19937 rng{std::random_device()()};
+ ROCKSDB_NAMESPACE::Status status;
+ std::string path =
+ ROCKSDB_NAMESPACE::test::PerThreadDBPath("forward_iterator_test");
+ fprintf(stderr, "db path is %s\n", path.c_str());
+ ROCKSDB_NAMESPACE::Options options;
+ options.create_if_missing = true;
+ options.compression = ROCKSDB_NAMESPACE::CompressionType::kNoCompression;
+ options.compaction_style =
+ ROCKSDB_NAMESPACE::CompactionStyle::kCompactionStyleNone;
+ options.level0_slowdown_writes_trigger = 99999;
+ options.level0_stop_writes_trigger = 99999;
+ options.use_direct_io_for_flush_and_compaction = true;
+ options.write_buffer_size = FLAGS_memtable_size;
+ ROCKSDB_NAMESPACE::BlockBasedTableOptions table_options;
+ table_options.block_cache =
+ ROCKSDB_NAMESPACE::NewLRUCache(FLAGS_block_cache_size);
+ table_options.block_size = FLAGS_block_size;
+ options.table_factory.reset(
+ ROCKSDB_NAMESPACE::NewBlockBasedTableFactory(table_options));
+
+ status = ROCKSDB_NAMESPACE::DestroyDB(path, options);
+ assert(status.ok());
+ ROCKSDB_NAMESPACE::DB* db_raw;
+ status = ROCKSDB_NAMESPACE::DB::Open(options, path, &db_raw);
+ assert(status.ok());
+ std::unique_ptr<ROCKSDB_NAMESPACE::DB> db(db_raw);
+
+ std::vector<ShardState> shard_states(FLAGS_shards + 1);
+ std::deque<Reader> readers;
+ while (static_cast<int>(readers.size()) < FLAGS_readers) {
+ readers.emplace_back(&shard_states, db_raw);
+ }
+ std::deque<Writer> writers;
+ while (static_cast<int>(writers.size()) < FLAGS_writers) {
+ writers.emplace_back(&shard_states, db_raw);
+ }
+
+ // Each shard gets a random reader and random writer assigned to it
+ for (int i = 1; i <= FLAGS_shards; ++i) {
+ std::uniform_int_distribution<int> reader_dist(0, FLAGS_readers - 1);
+ std::uniform_int_distribution<int> writer_dist(0, FLAGS_writers - 1);
+ shard_states[i].reader = &readers[reader_dist(rng)];
+ shard_states[i].writer = &writers[writer_dist(rng)];
+ }
+
+ StatsThread stats_thread(db_raw);
+ for (Writer& w : writers) {
+ w.start();
+ }
+
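+ // Clearing the deques joins the writer threads first, then the reader
+ // threads; StatsThread keeps reporting until it goes out of scope.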
+ writers.clear();
+ readers.clear();
+}
+#endif // !defined(GFLAGS) || defined(ROCKSDB_LITE)
diff --git a/src/rocksdb/db/history_trimming_iterator.h b/src/rocksdb/db/history_trimming_iterator.h
new file mode 100644
index 000000000..b445ced33
--- /dev/null
+++ b/src/rocksdb/db/history_trimming_iterator.h
@@ -0,0 +1,91 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include <string>
+#include <vector>
+
+#include "db/dbformat.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/slice.h"
+#include "table/internal_iterator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
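+// Wraps an InternalIterator and skips entries whose user-defined timestamp is
+// newer than `ts`, so only versions with timestamp <= ts are exposed.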
+class HistoryTrimmingIterator : public InternalIterator {
+ public:
+ explicit HistoryTrimmingIterator(InternalIterator* input,
+ const Comparator* cmp, const std::string& ts)
+ : input_(input), filter_ts_(ts), cmp_(cmp) {
+ assert(cmp_->timestamp_size() > 0 && !ts.empty());
+ }
+
+ bool filter() const {
+ if (!input_->Valid()) {
+ return true;
+ }
+ Slice current_ts = ExtractTimestampFromKey(key(), cmp_->timestamp_size());
+ return cmp_->CompareTimestamp(current_ts, Slice(filter_ts_)) <= 0;
+ }
+
+ bool Valid() const override { return input_->Valid(); }
+
+ void SeekToFirst() override {
+ input_->SeekToFirst();
+ while (!filter()) {
+ input_->Next();
+ }
+ }
+
+ void SeekToLast() override {
+ input_->SeekToLast();
+ while (!filter()) {
+ input_->Prev();
+ }
+ }
+
+ void Seek(const Slice& target) override {
+ input_->Seek(target);
+ while (!filter()) {
+ input_->Next();
+ }
+ }
+
+ void SeekForPrev(const Slice& target) override {
+ input_->SeekForPrev(target);
+ while (!filter()) {
+ input_->Prev();
+ }
+ }
+
+ void Next() override {
+ do {
+ input_->Next();
+ } while (!filter());
+ }
+
+ void Prev() override {
+ do {
+ input_->Prev();
+ } while (!filter());
+ }
+
+ Slice key() const override { return input_->key(); }
+
+ Slice value() const override { return input_->value(); }
+
+ Status status() const override { return input_->status(); }
+
+ bool IsKeyPinned() const override { return input_->IsKeyPinned(); }
+
+ bool IsValuePinned() const override { return input_->IsValuePinned(); }
+
+ private:
+ InternalIterator* input_;
+ const std::string filter_ts_;
+ const Comparator* const cmp_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/import_column_family_job.cc b/src/rocksdb/db/import_column_family_job.cc
new file mode 100644
index 000000000..34985666a
--- /dev/null
+++ b/src/rocksdb/db/import_column_family_job.cc
@@ -0,0 +1,312 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "db/import_column_family_job.h"
+
+#include <algorithm>
+#include <cinttypes>
+#include <string>
+#include <vector>
+
+#include "db/version_edit.h"
+#include "file/file_util.h"
+#include "file/random_access_file_reader.h"
+#include "logging/logging.h"
+#include "table/merging_iterator.h"
+#include "table/scoped_arena_iterator.h"
+#include "table/sst_file_writer_collectors.h"
+#include "table/table_builder.h"
+#include "table/unique_id_impl.h"
+#include "util/stop_watch.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+Status ImportColumnFamilyJob::Prepare(uint64_t next_file_number,
+ SuperVersion* sv) {
+ Status status;
+
+ // Read the information of files we are importing
+ for (const auto& file_metadata : metadata_) {
+ const auto file_path = file_metadata.db_path + "/" + file_metadata.name;
+ IngestedFileInfo file_to_import;
+ status =
+ GetIngestedFileInfo(file_path, next_file_number++, &file_to_import, sv);
+ if (!status.ok()) {
+ return status;
+ }
+ files_to_import_.push_back(file_to_import);
+ }
+
+ auto num_files = files_to_import_.size();
+ if (num_files == 0) {
+ return Status::InvalidArgument("The list of files is empty");
+ } else if (num_files > 1) {
+ // Verify that passed files don't have overlapping ranges in any particular
+ // level.
+ int min_level = 1; // Check for overlaps in Level 1 and above.
+ int max_level = -1;
+ for (const auto& file_metadata : metadata_) {
+ if (file_metadata.level > max_level) {
+ max_level = file_metadata.level;
+ }
+ }
+ for (int level = min_level; level <= max_level; ++level) {
+ autovector<const IngestedFileInfo*> sorted_files;
+ for (size_t i = 0; i < num_files; i++) {
+ if (metadata_[i].level == level) {
+ sorted_files.push_back(&files_to_import_[i]);
+ }
+ }
+
+ std::sort(
+ sorted_files.begin(), sorted_files.end(),
+ [this](const IngestedFileInfo* info1, const IngestedFileInfo* info2) {
+ return cfd_->internal_comparator().Compare(
+ info1->smallest_internal_key,
+ info2->smallest_internal_key) < 0;
+ });
+
+ for (size_t i = 0; i + 1 < sorted_files.size(); i++) {
+ if (cfd_->internal_comparator().Compare(
+ sorted_files[i]->largest_internal_key,
+ sorted_files[i + 1]->smallest_internal_key) >= 0) {
+ return Status::InvalidArgument("Files have overlapping ranges");
+ }
+ }
+ }
+ }
+
+ for (const auto& f : files_to_import_) {
+ if (f.num_entries == 0) {
+ return Status::InvalidArgument("File contain no entries");
+ }
+
+ if (!f.smallest_internal_key.Valid() || !f.largest_internal_key.Valid()) {
+ return Status::Corruption("File has corrupted keys");
+ }
+ }
+
+ // Copy/Move external files into DB
+ auto hardlink_files = import_options_.move_files;
+ for (auto& f : files_to_import_) {
+ const auto path_outside_db = f.external_file_path;
+ const auto path_inside_db = TableFileName(
+ cfd_->ioptions()->cf_paths, f.fd.GetNumber(), f.fd.GetPathId());
+
+ if (hardlink_files) {
+ status =
+ fs_->LinkFile(path_outside_db, path_inside_db, IOOptions(), nullptr);
+ if (status.IsNotSupported()) {
+ // Original file is on a different FS, use copy instead of hard linking
+ hardlink_files = false;
+        ROCKS_LOG_INFO(db_options_.info_log,
+                       "Tried to link file %s but it's not supported: %s",
+                       path_outside_db.c_str(), status.ToString().c_str());
+ }
+ }
+ if (!hardlink_files) {
+ status =
+ CopyFile(fs_.get(), path_outside_db, path_inside_db, 0,
+ db_options_.use_fsync, io_tracer_, Temperature::kUnknown);
+ }
+ if (!status.ok()) {
+ break;
+ }
+ f.copy_file = !hardlink_files;
+ f.internal_file_path = path_inside_db;
+ }
+
+ if (!status.ok()) {
+    // We failed, so remove all the files that we copied into the DB.
+ for (const auto& f : files_to_import_) {
+ if (f.internal_file_path.empty()) {
+ break;
+ }
+ const auto s =
+ fs_->DeleteFile(f.internal_file_path, IOOptions(), nullptr);
+ if (!s.ok()) {
+ ROCKS_LOG_WARN(db_options_.info_log,
+ "AddFile() clean up for file %s failed : %s",
+ f.internal_file_path.c_str(), s.ToString().c_str());
+ }
+ }
+ }
+
+ return status;
+}
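+
+// Editorial note (not in the upstream source): the overlap check in Prepare()
+// only covers levels >= 1. Within each such level the files are sorted by
+// smallest key, and two neighbours conflict when largest(i) >=
+// smallest(i + 1); e.g. key ranges [a..k] and [k..z] on the same non-zero
+// level are rejected with "Files have overlapping ranges". Level 0 is exempt
+// from this check (the ImportSSTFileWriterFilesWithOverlap test relies on
+// that).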
+
+// REQUIRES: we have become the only writer by entering both write_thread_ and
+// nonmem_write_thread_
+Status ImportColumnFamilyJob::Run() {
+ Status status;
+ edit_.SetColumnFamily(cfd_->GetID());
+
+  // We use the import time as the ancestor time. This is the time the data
+  // is written to the database.
+ int64_t temp_current_time = 0;
+ uint64_t oldest_ancester_time = kUnknownOldestAncesterTime;
+ uint64_t current_time = kUnknownOldestAncesterTime;
+ if (clock_->GetCurrentTime(&temp_current_time).ok()) {
+ current_time = oldest_ancester_time =
+ static_cast<uint64_t>(temp_current_time);
+ }
+
+ for (size_t i = 0; i < files_to_import_.size(); ++i) {
+ const auto& f = files_to_import_[i];
+ const auto& file_metadata = metadata_[i];
+
+ edit_.AddFile(file_metadata.level, f.fd.GetNumber(), f.fd.GetPathId(),
+ f.fd.GetFileSize(), f.smallest_internal_key,
+ f.largest_internal_key, file_metadata.smallest_seqno,
+ file_metadata.largest_seqno, false, file_metadata.temperature,
+ kInvalidBlobFileNumber, oldest_ancester_time, current_time,
+ kUnknownFileChecksum, kUnknownFileChecksumFuncName,
+ f.unique_id);
+
+ // If incoming sequence number is higher, update local sequence number.
+ if (file_metadata.largest_seqno > versions_->LastSequence()) {
+ versions_->SetLastAllocatedSequence(file_metadata.largest_seqno);
+ versions_->SetLastPublishedSequence(file_metadata.largest_seqno);
+ versions_->SetLastSequence(file_metadata.largest_seqno);
+ }
+ }
+
+ return status;
+}
+
+void ImportColumnFamilyJob::Cleanup(const Status& status) {
+ if (!status.ok()) {
+    // We failed to add the files to the database, so remove all the files we
+    // copied.
+ for (const auto& f : files_to_import_) {
+ const auto s =
+ fs_->DeleteFile(f.internal_file_path, IOOptions(), nullptr);
+ if (!s.ok()) {
+ ROCKS_LOG_WARN(db_options_.info_log,
+ "AddFile() clean up for file %s failed : %s",
+ f.internal_file_path.c_str(), s.ToString().c_str());
+ }
+ }
+ } else if (status.ok() && import_options_.move_files) {
+    // The files were moved and added successfully, so remove the original
+    // file links.
+ for (IngestedFileInfo& f : files_to_import_) {
+ const auto s =
+ fs_->DeleteFile(f.external_file_path, IOOptions(), nullptr);
+ if (!s.ok()) {
+ ROCKS_LOG_WARN(
+ db_options_.info_log,
+ "%s was added to DB successfully but failed to remove original "
+ "file link : %s",
+ f.external_file_path.c_str(), s.ToString().c_str());
+ }
+ }
+ }
+}
+
+Status ImportColumnFamilyJob::GetIngestedFileInfo(
+ const std::string& external_file, uint64_t new_file_number,
+ IngestedFileInfo* file_to_import, SuperVersion* sv) {
+ file_to_import->external_file_path = external_file;
+
+ // Get external file size
+ Status status = fs_->GetFileSize(external_file, IOOptions(),
+ &file_to_import->file_size, nullptr);
+ if (!status.ok()) {
+ return status;
+ }
+
+ // Assign FD with number
+ file_to_import->fd =
+ FileDescriptor(new_file_number, 0, file_to_import->file_size);
+
+ // Create TableReader for external file
+ std::unique_ptr<TableReader> table_reader;
+ std::unique_ptr<FSRandomAccessFile> sst_file;
+ std::unique_ptr<RandomAccessFileReader> sst_file_reader;
+
+ status =
+ fs_->NewRandomAccessFile(external_file, env_options_, &sst_file, nullptr);
+ if (!status.ok()) {
+ return status;
+ }
+ sst_file_reader.reset(new RandomAccessFileReader(
+ std::move(sst_file), external_file, nullptr /*Env*/, io_tracer_));
+
+ status = cfd_->ioptions()->table_factory->NewTableReader(
+ TableReaderOptions(
+ *cfd_->ioptions(), sv->mutable_cf_options.prefix_extractor,
+ env_options_, cfd_->internal_comparator(),
+ /*skip_filters*/ false, /*immortal*/ false,
+ /*force_direct_prefetch*/ false, /*level*/ -1,
+ /*block_cache_tracer*/ nullptr,
+ /*max_file_size_for_l0_meta_pin*/ 0, versions_->DbSessionId(),
+ /*cur_file_num*/ new_file_number),
+ std::move(sst_file_reader), file_to_import->file_size, &table_reader);
+ if (!status.ok()) {
+ return status;
+ }
+
+ // Get the external file properties
+ auto props = table_reader->GetTableProperties();
+
+ // Set original_seqno to 0.
+ file_to_import->original_seqno = 0;
+
+ // Get number of entries in table
+ file_to_import->num_entries = props->num_entries;
+
+ ParsedInternalKey key;
+ ReadOptions ro;
+  // While reading the external file we may cache the blocks we read in the
+  // block cache. If we later change the global seqno of this file, the cache
+  // would contain blocks whose keys carry the wrong seqno.
+  // Disable fill_cache so that we read from the file without updating the
+  // block cache.
+ ro.fill_cache = false;
+ std::unique_ptr<InternalIterator> iter(table_reader->NewIterator(
+ ro, sv->mutable_cf_options.prefix_extractor.get(), /*arena=*/nullptr,
+ /*skip_filters=*/false, TableReaderCaller::kExternalSSTIngestion));
+
+ // Get first (smallest) key from file
+ iter->SeekToFirst();
+ Status pik_status =
+ ParseInternalKey(iter->key(), &key, db_options_.allow_data_in_errors);
+ if (!pik_status.ok()) {
+ return Status::Corruption("Corrupted Key in external file. ",
+ pik_status.getState());
+ }
+ file_to_import->smallest_internal_key.SetFrom(key);
+
+ // Get last (largest) key from file
+ iter->SeekToLast();
+ pik_status =
+ ParseInternalKey(iter->key(), &key, db_options_.allow_data_in_errors);
+ if (!pik_status.ok()) {
+ return Status::Corruption("Corrupted Key in external file. ",
+ pik_status.getState());
+ }
+ file_to_import->largest_internal_key.SetFrom(key);
+
+ file_to_import->cf_id = static_cast<uint32_t>(props->column_family_id);
+
+ file_to_import->table_properties = *props;
+
+ auto s = GetSstInternalUniqueId(props->db_id, props->db_session_id,
+ props->orig_file_number,
+ &(file_to_import->unique_id));
+ if (!s.ok()) {
+    ROCKS_LOG_WARN(db_options_.info_log,
+                   "Failed to get SST unique id for file %s",
+                   external_file.c_str());
+ }
+
+ return status;
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/import_column_family_job.h b/src/rocksdb/db/import_column_family_job.h
new file mode 100644
index 000000000..57c49c67f
--- /dev/null
+++ b/src/rocksdb/db/import_column_family_job.h
@@ -0,0 +1,82 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+#include "db/column_family.h"
+#include "db/external_sst_file_ingestion_job.h"
+#include "db/snapshot_impl.h"
+#include "options/db_options.h"
+#include "rocksdb/db.h"
+#include "rocksdb/metadata.h"
+#include "rocksdb/sst_file_writer.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+struct EnvOptions;
+class SystemClock;
+
+// Imports a set of sst files as is into a new column family. Logic is similar
+// to ExternalSstFileIngestionJob.
+class ImportColumnFamilyJob {
+ public:
+ ImportColumnFamilyJob(VersionSet* versions, ColumnFamilyData* cfd,
+ const ImmutableDBOptions& db_options,
+ const EnvOptions& env_options,
+ const ImportColumnFamilyOptions& import_options,
+ const std::vector<LiveFileMetaData>& metadata,
+ const std::shared_ptr<IOTracer>& io_tracer)
+ : clock_(db_options.clock),
+ versions_(versions),
+ cfd_(cfd),
+ db_options_(db_options),
+ fs_(db_options_.fs, io_tracer),
+ env_options_(env_options),
+ import_options_(import_options),
+ metadata_(metadata),
+ io_tracer_(io_tracer) {}
+
+ // Prepare the job by copying external files into the DB.
+ Status Prepare(uint64_t next_file_number, SuperVersion* sv);
+
+ // Will execute the import job and prepare edit() to be applied.
+ // REQUIRES: Mutex held
+ Status Run();
+
+ // Cleanup after successful/failed job
+ void Cleanup(const Status& status);
+
+ VersionEdit* edit() { return &edit_; }
+
+ const autovector<IngestedFileInfo>& files_to_import() const {
+ return files_to_import_;
+ }
+
+ private:
+ // Open the external file and populate `file_to_import` with all the
+ // external information we need to import this file.
+ Status GetIngestedFileInfo(const std::string& external_file,
+ uint64_t new_file_number,
+ IngestedFileInfo* file_to_import,
+ SuperVersion* sv);
+
+ SystemClock* clock_;
+ VersionSet* versions_;
+ ColumnFamilyData* cfd_;
+ const ImmutableDBOptions& db_options_;
+ const FileSystemPtr fs_;
+ const EnvOptions& env_options_;
+ autovector<IngestedFileInfo> files_to_import_;
+ VersionEdit edit_;
+ const ImportColumnFamilyOptions& import_options_;
+ std::vector<LiveFileMetaData> metadata_;
+ const std::shared_ptr<IOTracer> io_tracer_;
+};
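+
+// Illustrative driving sequence (editorial sketch, not part of the upstream
+// header). The DB-level entry point is DB::CreateColumnFamilyWithImport(),
+// exercised by import_column_family_test.cc; the owner of the VersionSet and
+// the DB mutex is assumed here.
+//
+//   ImportColumnFamilyJob job(versions, cfd, db_options, env_options,
+//                             import_options, metadata, io_tracer);
+//   Status s = job.Prepare(next_file_number, sv);  // copy or hard-link files
+//   if (s.ok()) {
+//     // DB mutex held, single writer
+//     s = job.Run();  // fills job.edit() with the AddFile records
+//     // ... apply job.edit() to the VersionSet ...
+//   }
+//   job.Cleanup(s);  // failure: delete the copied files; success with
+//                    // move_files: remove the original file links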
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/import_column_family_test.cc b/src/rocksdb/db/import_column_family_test.cc
new file mode 100644
index 000000000..2847ea8da
--- /dev/null
+++ b/src/rocksdb/db/import_column_family_test.cc
@@ -0,0 +1,644 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include <functional>
+
+#include "db/db_test_util.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "rocksdb/sst_file_writer.h"
+#include "test_util/testutil.h"
+#include "util/random.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class ImportColumnFamilyTest : public DBTestBase {
+ public:
+ ImportColumnFamilyTest()
+ : DBTestBase("import_column_family_test", /*env_do_fsync=*/true) {
+ sst_files_dir_ = dbname_ + "/sst_files/";
+ export_files_dir_ = test::PerThreadDBPath(env_, "export");
+ DestroyAndRecreateExternalSSTFilesDir();
+ import_cfh_ = nullptr;
+ import_cfh2_ = nullptr;
+ metadata_ptr_ = nullptr;
+ }
+
+ ~ImportColumnFamilyTest() {
+ if (import_cfh_) {
+ EXPECT_OK(db_->DropColumnFamily(import_cfh_));
+ EXPECT_OK(db_->DestroyColumnFamilyHandle(import_cfh_));
+ import_cfh_ = nullptr;
+ }
+ if (import_cfh2_) {
+ EXPECT_OK(db_->DropColumnFamily(import_cfh2_));
+ EXPECT_OK(db_->DestroyColumnFamilyHandle(import_cfh2_));
+ import_cfh2_ = nullptr;
+ }
+ if (metadata_ptr_) {
+ delete metadata_ptr_;
+ metadata_ptr_ = nullptr;
+ }
+ EXPECT_OK(DestroyDir(env_, sst_files_dir_));
+ EXPECT_OK(DestroyDir(env_, export_files_dir_));
+ }
+
+ void DestroyAndRecreateExternalSSTFilesDir() {
+ EXPECT_OK(DestroyDir(env_, sst_files_dir_));
+ EXPECT_OK(env_->CreateDir(sst_files_dir_));
+ EXPECT_OK(DestroyDir(env_, export_files_dir_));
+ }
+
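+  // Editorial note: this helper fills in only the metadata fields the import
+  // job consumes -- the file name and db_path used to locate the SST, the
+  // target level, and the sequence number range. The key range is not filled
+  // in; ImportColumnFamilyJob re-derives the smallest/largest keys by opening
+  // and iterating the file itself.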
+ LiveFileMetaData LiveFileMetaDataInit(std::string name, std::string path,
+ int level,
+ SequenceNumber smallest_seqno,
+ SequenceNumber largest_seqno) {
+ LiveFileMetaData metadata;
+ metadata.name = name;
+ metadata.db_path = path;
+ metadata.smallest_seqno = smallest_seqno;
+ metadata.largest_seqno = largest_seqno;
+ metadata.level = level;
+ return metadata;
+ }
+
+ protected:
+ std::string sst_files_dir_;
+ std::string export_files_dir_;
+ ColumnFamilyHandle* import_cfh_;
+ ColumnFamilyHandle* import_cfh2_;
+ ExportImportFilesMetaData* metadata_ptr_;
+};
+
+TEST_F(ImportColumnFamilyTest, ImportSSTFileWriterFiles) {
+ Options options = CurrentOptions();
+ CreateAndReopenWithCF({"koko"}, options);
+
+ SstFileWriter sfw_cf1(EnvOptions(), options, handles_[1]);
+ SstFileWriter sfw_unknown(EnvOptions(), options);
+
+ // cf1.sst
+ const std::string cf1_sst_name = "cf1.sst";
+ const std::string cf1_sst = sst_files_dir_ + cf1_sst_name;
+ ASSERT_OK(sfw_cf1.Open(cf1_sst));
+ ASSERT_OK(sfw_cf1.Put("K1", "V1"));
+ ASSERT_OK(sfw_cf1.Put("K2", "V2"));
+ ASSERT_OK(sfw_cf1.Finish());
+
+ // cf_unknown.sst
+ const std::string unknown_sst_name = "cf_unknown.sst";
+ const std::string unknown_sst = sst_files_dir_ + unknown_sst_name;
+ ASSERT_OK(sfw_unknown.Open(unknown_sst));
+ ASSERT_OK(sfw_unknown.Put("K3", "V1"));
+ ASSERT_OK(sfw_unknown.Put("K4", "V2"));
+ ASSERT_OK(sfw_unknown.Finish());
+
+ {
+ // Import sst file corresponding to cf1 onto a new cf and verify
+ ExportImportFilesMetaData metadata;
+ metadata.files.push_back(
+ LiveFileMetaDataInit(cf1_sst_name, sst_files_dir_, 0, 10, 19));
+ metadata.db_comparator_name = options.comparator->Name();
+
+ ASSERT_OK(db_->CreateColumnFamilyWithImport(
+ options, "toto", ImportColumnFamilyOptions(), metadata, &import_cfh_));
+ ASSERT_NE(import_cfh_, nullptr);
+
+ std::string value;
+ ASSERT_OK(db_->Get(ReadOptions(), import_cfh_, "K1", &value));
+ ASSERT_EQ(value, "V1");
+ ASSERT_OK(db_->Get(ReadOptions(), import_cfh_, "K2", &value));
+ ASSERT_EQ(value, "V2");
+ ASSERT_OK(db_->DropColumnFamily(import_cfh_));
+ ASSERT_OK(db_->DestroyColumnFamilyHandle(import_cfh_));
+ import_cfh_ = nullptr;
+ }
+
+ {
+ // Import sst file corresponding to unknown cf onto a new cf and verify
+ ExportImportFilesMetaData metadata;
+ metadata.files.push_back(
+ LiveFileMetaDataInit(unknown_sst_name, sst_files_dir_, 0, 20, 29));
+ metadata.db_comparator_name = options.comparator->Name();
+
+ ASSERT_OK(db_->CreateColumnFamilyWithImport(
+ options, "yoyo", ImportColumnFamilyOptions(), metadata, &import_cfh_));
+ ASSERT_NE(import_cfh_, nullptr);
+
+ std::string value;
+ ASSERT_OK(db_->Get(ReadOptions(), import_cfh_, "K3", &value));
+ ASSERT_EQ(value, "V1");
+ ASSERT_OK(db_->Get(ReadOptions(), import_cfh_, "K4", &value));
+ ASSERT_EQ(value, "V2");
+ }
+ EXPECT_OK(db_->DestroyColumnFamilyHandle(import_cfh_));
+ import_cfh_ = nullptr;
+
+ // verify sst unique id during reopen
+ options.verify_sst_unique_id_in_manifest = true;
+ ReopenWithColumnFamilies({"default", "koko", "yoyo"}, options);
+}
+
+TEST_F(ImportColumnFamilyTest, ImportSSTFileWriterFilesWithOverlap) {
+ Options options = CurrentOptions();
+ CreateAndReopenWithCF({"koko"}, options);
+
+ SstFileWriter sfw_cf1(EnvOptions(), options, handles_[1]);
+
+ // file3.sst
+ const std::string file3_sst_name = "file3.sst";
+ const std::string file3_sst = sst_files_dir_ + file3_sst_name;
+ ASSERT_OK(sfw_cf1.Open(file3_sst));
+ for (int i = 0; i < 100; ++i) {
+ ASSERT_OK(sfw_cf1.Put(Key(i), Key(i) + "_val"));
+ }
+ ASSERT_OK(sfw_cf1.Finish());
+
+ // file2.sst
+ const std::string file2_sst_name = "file2.sst";
+ const std::string file2_sst = sst_files_dir_ + file2_sst_name;
+ ASSERT_OK(sfw_cf1.Open(file2_sst));
+ for (int i = 0; i < 100; i += 2) {
+ ASSERT_OK(sfw_cf1.Put(Key(i), Key(i) + "_overwrite1"));
+ }
+ ASSERT_OK(sfw_cf1.Finish());
+
+ // file1a.sst
+ const std::string file1a_sst_name = "file1a.sst";
+ const std::string file1a_sst = sst_files_dir_ + file1a_sst_name;
+ ASSERT_OK(sfw_cf1.Open(file1a_sst));
+ for (int i = 0; i < 52; i += 4) {
+ ASSERT_OK(sfw_cf1.Put(Key(i), Key(i) + "_overwrite2"));
+ }
+ ASSERT_OK(sfw_cf1.Finish());
+
+ // file1b.sst
+ const std::string file1b_sst_name = "file1b.sst";
+ const std::string file1b_sst = sst_files_dir_ + file1b_sst_name;
+ ASSERT_OK(sfw_cf1.Open(file1b_sst));
+ for (int i = 52; i < 100; i += 4) {
+ ASSERT_OK(sfw_cf1.Put(Key(i), Key(i) + "_overwrite2"));
+ }
+ ASSERT_OK(sfw_cf1.Finish());
+
+ // file0a.sst
+ const std::string file0a_sst_name = "file0a.sst";
+ const std::string file0a_sst = sst_files_dir_ + file0a_sst_name;
+ ASSERT_OK(sfw_cf1.Open(file0a_sst));
+ for (int i = 0; i < 100; i += 16) {
+ ASSERT_OK(sfw_cf1.Put(Key(i), Key(i) + "_overwrite3"));
+ }
+ ASSERT_OK(sfw_cf1.Finish());
+
+ // file0b.sst
+ const std::string file0b_sst_name = "file0b.sst";
+ const std::string file0b_sst = sst_files_dir_ + file0b_sst_name;
+ ASSERT_OK(sfw_cf1.Open(file0b_sst));
+ for (int i = 0; i < 100; i += 16) {
+ ASSERT_OK(sfw_cf1.Put(Key(i), Key(i) + "_overwrite4"));
+ }
+ ASSERT_OK(sfw_cf1.Finish());
+
+ // Import sst files and verify
+ ExportImportFilesMetaData metadata;
+ metadata.files.push_back(
+ LiveFileMetaDataInit(file3_sst_name, sst_files_dir_, 3, 10, 19));
+ metadata.files.push_back(
+ LiveFileMetaDataInit(file2_sst_name, sst_files_dir_, 2, 20, 29));
+ metadata.files.push_back(
+ LiveFileMetaDataInit(file1a_sst_name, sst_files_dir_, 1, 30, 34));
+ metadata.files.push_back(
+ LiveFileMetaDataInit(file1b_sst_name, sst_files_dir_, 1, 35, 39));
+ metadata.files.push_back(
+ LiveFileMetaDataInit(file0a_sst_name, sst_files_dir_, 0, 40, 49));
+ metadata.files.push_back(
+ LiveFileMetaDataInit(file0b_sst_name, sst_files_dir_, 0, 50, 59));
+ metadata.db_comparator_name = options.comparator->Name();
+
+ ASSERT_OK(db_->CreateColumnFamilyWithImport(
+ options, "toto", ImportColumnFamilyOptions(), metadata, &import_cfh_));
+ ASSERT_NE(import_cfh_, nullptr);
+
+ for (int i = 0; i < 100; i++) {
+ std::string value;
+ ASSERT_OK(db_->Get(ReadOptions(), import_cfh_, Key(i), &value));
+ if (i % 16 == 0) {
+ ASSERT_EQ(value, Key(i) + "_overwrite4");
+ } else if (i % 4 == 0) {
+ ASSERT_EQ(value, Key(i) + "_overwrite2");
+ } else if (i % 2 == 0) {
+ ASSERT_EQ(value, Key(i) + "_overwrite1");
+ } else {
+ ASSERT_EQ(value, Key(i) + "_val");
+ }
+ }
+
+ for (int i = 0; i < 100; i += 5) {
+ ASSERT_OK(
+ db_->Put(WriteOptions(), import_cfh_, Key(i), Key(i) + "_overwrite5"));
+ }
+
+ // Flush and check again
+ ASSERT_OK(db_->Flush(FlushOptions(), import_cfh_));
+ for (int i = 0; i < 100; i++) {
+ std::string value;
+ ASSERT_OK(db_->Get(ReadOptions(), import_cfh_, Key(i), &value));
+ if (i % 5 == 0) {
+ ASSERT_EQ(value, Key(i) + "_overwrite5");
+ } else if (i % 16 == 0) {
+ ASSERT_EQ(value, Key(i) + "_overwrite4");
+ } else if (i % 4 == 0) {
+ ASSERT_EQ(value, Key(i) + "_overwrite2");
+ } else if (i % 2 == 0) {
+ ASSERT_EQ(value, Key(i) + "_overwrite1");
+ } else {
+ ASSERT_EQ(value, Key(i) + "_val");
+ }
+ }
+
+ // Compact and check again.
+ ASSERT_OK(
+ db_->CompactRange(CompactRangeOptions(), import_cfh_, nullptr, nullptr));
+ for (int i = 0; i < 100; i++) {
+ std::string value;
+ ASSERT_OK(db_->Get(ReadOptions(), import_cfh_, Key(i), &value));
+ if (i % 5 == 0) {
+ ASSERT_EQ(value, Key(i) + "_overwrite5");
+ } else if (i % 16 == 0) {
+ ASSERT_EQ(value, Key(i) + "_overwrite4");
+ } else if (i % 4 == 0) {
+ ASSERT_EQ(value, Key(i) + "_overwrite2");
+ } else if (i % 2 == 0) {
+ ASSERT_EQ(value, Key(i) + "_overwrite1");
+ } else {
+ ASSERT_EQ(value, Key(i) + "_val");
+ }
+ }
+}
+
+TEST_F(ImportColumnFamilyTest, ImportExportedSSTFromAnotherCF) {
+ Options options = CurrentOptions();
+ CreateAndReopenWithCF({"koko"}, options);
+
+ for (int i = 0; i < 100; ++i) {
+ ASSERT_OK(Put(1, Key(i), Key(i) + "_val"));
+ }
+ ASSERT_OK(Flush(1));
+
+ ASSERT_OK(
+ db_->CompactRange(CompactRangeOptions(), handles_[1], nullptr, nullptr));
+
+ // Overwrite the value in the same set of keys.
+ for (int i = 0; i < 100; ++i) {
+ ASSERT_OK(Put(1, Key(i), Key(i) + "_overwrite"));
+ }
+
+ // Flush to create L0 file.
+ ASSERT_OK(Flush(1));
+ for (int i = 0; i < 100; ++i) {
+ ASSERT_OK(Put(1, Key(i), Key(i) + "_overwrite2"));
+ }
+
+  // Flush again to create another L0 file. It should have higher sequence
+  // numbers.
+ ASSERT_OK(Flush(1));
+
+ Checkpoint* checkpoint;
+ ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+ ASSERT_OK(checkpoint->ExportColumnFamily(handles_[1], export_files_dir_,
+ &metadata_ptr_));
+ ASSERT_NE(metadata_ptr_, nullptr);
+ delete checkpoint;
+
+ ImportColumnFamilyOptions import_options;
+ import_options.move_files = false;
+ ASSERT_OK(db_->CreateColumnFamilyWithImport(options, "toto", import_options,
+ *metadata_ptr_, &import_cfh_));
+ ASSERT_NE(import_cfh_, nullptr);
+
+ import_options.move_files = true;
+ ASSERT_OK(db_->CreateColumnFamilyWithImport(options, "yoyo", import_options,
+ *metadata_ptr_, &import_cfh2_));
+ ASSERT_NE(import_cfh2_, nullptr);
+ delete metadata_ptr_;
+ metadata_ptr_ = NULL;
+
+ std::string value1, value2;
+
+ for (int i = 0; i < 100; ++i) {
+ ASSERT_OK(db_->Get(ReadOptions(), import_cfh_, Key(i), &value1));
+ ASSERT_EQ(Get(1, Key(i)), value1);
+ }
+
+ for (int i = 0; i < 100; ++i) {
+ ASSERT_OK(db_->Get(ReadOptions(), import_cfh2_, Key(i), &value2));
+ ASSERT_EQ(Get(1, Key(i)), value2);
+ }
+
+ // Modify keys in cf1 and verify.
+ for (int i = 0; i < 25; i++) {
+ ASSERT_OK(db_->Delete(WriteOptions(), import_cfh_, Key(i)));
+ }
+ for (int i = 25; i < 50; i++) {
+ ASSERT_OK(
+ db_->Put(WriteOptions(), import_cfh_, Key(i), Key(i) + "_overwrite3"));
+ }
+ for (int i = 0; i < 25; ++i) {
+ ASSERT_TRUE(
+ db_->Get(ReadOptions(), import_cfh_, Key(i), &value1).IsNotFound());
+ }
+ for (int i = 25; i < 50; ++i) {
+ ASSERT_OK(db_->Get(ReadOptions(), import_cfh_, Key(i), &value1));
+ ASSERT_EQ(Key(i) + "_overwrite3", value1);
+ }
+ for (int i = 50; i < 100; ++i) {
+ ASSERT_OK(db_->Get(ReadOptions(), import_cfh_, Key(i), &value1));
+ ASSERT_EQ(Key(i) + "_overwrite2", value1);
+ }
+
+ for (int i = 0; i < 100; ++i) {
+ ASSERT_OK(db_->Get(ReadOptions(), import_cfh2_, Key(i), &value2));
+ ASSERT_EQ(Get(1, Key(i)), value2);
+ }
+
+ // Compact and check again.
+ ASSERT_OK(db_->Flush(FlushOptions(), import_cfh_));
+ ASSERT_OK(
+ db_->CompactRange(CompactRangeOptions(), import_cfh_, nullptr, nullptr));
+
+ for (int i = 0; i < 25; ++i) {
+ ASSERT_TRUE(
+ db_->Get(ReadOptions(), import_cfh_, Key(i), &value1).IsNotFound());
+ }
+ for (int i = 25; i < 50; ++i) {
+ ASSERT_OK(db_->Get(ReadOptions(), import_cfh_, Key(i), &value1));
+ ASSERT_EQ(Key(i) + "_overwrite3", value1);
+ }
+ for (int i = 50; i < 100; ++i) {
+ ASSERT_OK(db_->Get(ReadOptions(), import_cfh_, Key(i), &value1));
+ ASSERT_EQ(Key(i) + "_overwrite2", value1);
+ }
+
+ for (int i = 0; i < 100; ++i) {
+ ASSERT_OK(db_->Get(ReadOptions(), import_cfh2_, Key(i), &value2));
+ ASSERT_EQ(Get(1, Key(i)), value2);
+ }
+}
+
+TEST_F(ImportColumnFamilyTest, ImportExportedSSTFromAnotherDB) {
+ Options options = CurrentOptions();
+ CreateAndReopenWithCF({"koko"}, options);
+
+ for (int i = 0; i < 100; ++i) {
+ ASSERT_OK(Put(1, Key(i), Key(i) + "_val"));
+ }
+ ASSERT_OK(Flush(1));
+
+ // Compact to create a L1 file.
+ ASSERT_OK(
+ db_->CompactRange(CompactRangeOptions(), handles_[1], nullptr, nullptr));
+
+ // Overwrite the value in the same set of keys.
+ for (int i = 0; i < 50; ++i) {
+ ASSERT_OK(Put(1, Key(i), Key(i) + "_overwrite"));
+ }
+
+ // Flush to create L0 file.
+ ASSERT_OK(Flush(1));
+
+ for (int i = 0; i < 25; ++i) {
+ ASSERT_OK(Put(1, Key(i), Key(i) + "_overwrite2"));
+ }
+
+  // Flush again to create another L0 file. It should have higher sequence
+  // numbers.
+ ASSERT_OK(Flush(1));
+
+ Checkpoint* checkpoint;
+ ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+ ASSERT_OK(checkpoint->ExportColumnFamily(handles_[1], export_files_dir_,
+ &metadata_ptr_));
+ ASSERT_NE(metadata_ptr_, nullptr);
+ delete checkpoint;
+
+ // Create a new db and import the files.
+ DB* db_copy;
+ ASSERT_OK(DestroyDir(env_, dbname_ + "/db_copy"));
+ ASSERT_OK(DB::Open(options, dbname_ + "/db_copy", &db_copy));
+ ColumnFamilyHandle* cfh = nullptr;
+ ASSERT_OK(db_copy->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "yoyo",
+ ImportColumnFamilyOptions(),
+ *metadata_ptr_, &cfh));
+ ASSERT_NE(cfh, nullptr);
+
+ for (int i = 0; i < 100; ++i) {
+ std::string value;
+ ASSERT_OK(db_copy->Get(ReadOptions(), cfh, Key(i), &value));
+ ASSERT_EQ(Get(1, Key(i)), value);
+ }
+ ASSERT_OK(db_copy->DropColumnFamily(cfh));
+ ASSERT_OK(db_copy->DestroyColumnFamilyHandle(cfh));
+ delete db_copy;
+ ASSERT_OK(DestroyDir(env_, dbname_ + "/db_copy"));
+}
+
+TEST_F(ImportColumnFamilyTest, LevelFilesOverlappingAtEndpoints) {
+ // Imports a column family containing a level where two files overlap at their
+ // endpoints. "Overlap" means the largest user key in one file is the same as
+ // the smallest user key in the second file.
+ const int kFileBytes = 128 << 10; // 128KB
+ const int kValueBytes = 1 << 10; // 1KB
+ const int kNumFiles = 4;
+
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.num_levels = 2;
+ CreateAndReopenWithCF({"koko"}, options);
+
+ Random rnd(301);
+ // Every key is snapshot protected to ensure older versions will not be
+ // dropped during compaction.
+ std::vector<const Snapshot*> snapshots;
+ snapshots.reserve(kFileBytes / kValueBytes * kNumFiles);
+ for (int i = 0; i < kNumFiles; ++i) {
+ for (int j = 0; j < kFileBytes / kValueBytes; ++j) {
+ auto value = rnd.RandomString(kValueBytes);
+ ASSERT_OK(Put(1, "key", value));
+ snapshots.push_back(db_->GetSnapshot());
+ }
+ ASSERT_OK(Flush(1));
+ }
+
+ // Compact to create overlapping L1 files.
+ ASSERT_OK(
+ db_->CompactRange(CompactRangeOptions(), handles_[1], nullptr, nullptr));
+ ASSERT_GT(NumTableFilesAtLevel(1, 1), 1);
+
+ Checkpoint* checkpoint;
+ ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+ ASSERT_OK(checkpoint->ExportColumnFamily(handles_[1], export_files_dir_,
+ &metadata_ptr_));
+ ASSERT_NE(metadata_ptr_, nullptr);
+ delete checkpoint;
+
+ // Create a new db and import the files.
+ DB* db_copy;
+ ASSERT_OK(DestroyDir(env_, dbname_ + "/db_copy"));
+ ASSERT_OK(DB::Open(options, dbname_ + "/db_copy", &db_copy));
+ ColumnFamilyHandle* cfh = nullptr;
+ ASSERT_OK(db_copy->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "yoyo",
+ ImportColumnFamilyOptions(),
+ *metadata_ptr_, &cfh));
+ ASSERT_NE(cfh, nullptr);
+
+ {
+ std::string value;
+ ASSERT_OK(db_copy->Get(ReadOptions(), cfh, "key", &value));
+ }
+ ASSERT_OK(db_copy->DropColumnFamily(cfh));
+ ASSERT_OK(db_copy->DestroyColumnFamilyHandle(cfh));
+ delete db_copy;
+ ASSERT_OK(DestroyDir(env_, dbname_ + "/db_copy"));
+ for (const Snapshot* snapshot : snapshots) {
+ db_->ReleaseSnapshot(snapshot);
+ }
+}
+
+TEST_F(ImportColumnFamilyTest, ImportColumnFamilyNegativeTest) {
+ Options options = CurrentOptions();
+ CreateAndReopenWithCF({"koko"}, options);
+
+ {
+ // Create column family with existing cf name.
+ ExportImportFilesMetaData metadata;
+
+ ASSERT_EQ(db_->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "koko",
+ ImportColumnFamilyOptions(),
+ metadata, &import_cfh_),
+ Status::InvalidArgument("Column family already exists"));
+ ASSERT_EQ(import_cfh_, nullptr);
+ }
+
+ {
+ // Import with no files specified.
+ ExportImportFilesMetaData metadata;
+
+ ASSERT_EQ(db_->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "yoyo",
+ ImportColumnFamilyOptions(),
+ metadata, &import_cfh_),
+ Status::InvalidArgument("The list of files is empty"));
+ ASSERT_EQ(import_cfh_, nullptr);
+ }
+
+ {
+ // Import with overlapping keys in sst files.
+ ExportImportFilesMetaData metadata;
+ SstFileWriter sfw_cf1(EnvOptions(), options, handles_[1]);
+ const std::string file1_sst_name = "file1.sst";
+ const std::string file1_sst = sst_files_dir_ + file1_sst_name;
+ ASSERT_OK(sfw_cf1.Open(file1_sst));
+ ASSERT_OK(sfw_cf1.Put("K1", "V1"));
+ ASSERT_OK(sfw_cf1.Put("K2", "V2"));
+ ASSERT_OK(sfw_cf1.Finish());
+ const std::string file2_sst_name = "file2.sst";
+ const std::string file2_sst = sst_files_dir_ + file2_sst_name;
+ ASSERT_OK(sfw_cf1.Open(file2_sst));
+ ASSERT_OK(sfw_cf1.Put("K2", "V2"));
+ ASSERT_OK(sfw_cf1.Put("K3", "V3"));
+ ASSERT_OK(sfw_cf1.Finish());
+
+ metadata.files.push_back(
+ LiveFileMetaDataInit(file1_sst_name, sst_files_dir_, 1, 10, 19));
+ metadata.files.push_back(
+ LiveFileMetaDataInit(file2_sst_name, sst_files_dir_, 1, 10, 19));
+ metadata.db_comparator_name = options.comparator->Name();
+
+ ASSERT_EQ(db_->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "yoyo",
+ ImportColumnFamilyOptions(),
+ metadata, &import_cfh_),
+ Status::InvalidArgument("Files have overlapping ranges"));
+ ASSERT_EQ(import_cfh_, nullptr);
+ }
+
+ {
+    // Import with a mismatching comparator; this should fail with an
+    // appropriate error.
+ ExportImportFilesMetaData metadata;
+ Options mismatch_options = CurrentOptions();
+ mismatch_options.comparator = ReverseBytewiseComparator();
+ SstFileWriter sfw_cf1(EnvOptions(), mismatch_options, handles_[1]);
+ const std::string file1_sst_name = "file1.sst";
+ const std::string file1_sst = sst_files_dir_ + file1_sst_name;
+ ASSERT_OK(sfw_cf1.Open(file1_sst));
+ ASSERT_OK(sfw_cf1.Put("K2", "V2"));
+ ASSERT_OK(sfw_cf1.Put("K1", "V1"));
+ ASSERT_OK(sfw_cf1.Finish());
+
+ metadata.files.push_back(
+ LiveFileMetaDataInit(file1_sst_name, sst_files_dir_, 1, 10, 19));
+ metadata.db_comparator_name = mismatch_options.comparator->Name();
+
+ ASSERT_EQ(db_->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "coco",
+ ImportColumnFamilyOptions(),
+ metadata, &import_cfh_),
+ Status::InvalidArgument("Comparator name mismatch"));
+ ASSERT_EQ(import_cfh_, nullptr);
+ }
+
+ {
+    // Importing a non-existent sst file should fail with an appropriate
+    // error.
+ ExportImportFilesMetaData metadata;
+ SstFileWriter sfw_cf1(EnvOptions(), options, handles_[1]);
+ const std::string file1_sst_name = "file1.sst";
+ const std::string file1_sst = sst_files_dir_ + file1_sst_name;
+ ASSERT_OK(sfw_cf1.Open(file1_sst));
+ ASSERT_OK(sfw_cf1.Put("K1", "V1"));
+ ASSERT_OK(sfw_cf1.Put("K2", "V2"));
+ ASSERT_OK(sfw_cf1.Finish());
+ const std::string file3_sst_name = "file3.sst";
+
+ metadata.files.push_back(
+ LiveFileMetaDataInit(file1_sst_name, sst_files_dir_, 1, 10, 19));
+ metadata.files.push_back(
+ LiveFileMetaDataInit(file3_sst_name, sst_files_dir_, 1, 10, 19));
+ metadata.db_comparator_name = options.comparator->Name();
+
+ ASSERT_EQ(db_->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "yoyo",
+ ImportColumnFamilyOptions(),
+ metadata, &import_cfh_),
+ Status::IOError("No such file or directory"));
+ ASSERT_EQ(import_cfh_, nullptr);
+
+    // Test a successful import after a failure with the same CF name. This
+    // ensures that a failed import leaves no side effects on the CF.
+ metadata.files.pop_back();
+ metadata.db_comparator_name = options.comparator->Name();
+
+ ASSERT_OK(db_->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "yoyo",
+ ImportColumnFamilyOptions(),
+ metadata, &import_cfh_));
+ ASSERT_NE(import_cfh_, nullptr);
+ }
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+ fprintf(stderr,
+ "SKIPPED as External SST File Writer and Import are not supported "
+ "in ROCKSDB_LITE\n");
+ return 0;
+}
+
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/internal_stats.cc b/src/rocksdb/db/internal_stats.cc
new file mode 100644
index 000000000..ac5b81f3e
--- /dev/null
+++ b/src/rocksdb/db/internal_stats.cc
@@ -0,0 +1,2002 @@
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/internal_stats.h"
+
+#include <algorithm>
+#include <cinttypes>
+#include <cstddef>
+#include <limits>
+#include <sstream>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "cache/cache_entry_roles.h"
+#include "cache/cache_entry_stats.h"
+#include "db/column_family.h"
+#include "db/db_impl/db_impl.h"
+#include "port/port.h"
+#include "rocksdb/system_clock.h"
+#include "rocksdb/table.h"
+#include "table/block_based/cachable_entry.h"
+#include "util/hash_containers.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#ifndef ROCKSDB_LITE
+
+const std::map<LevelStatType, LevelStat> InternalStats::compaction_level_stats =
+ {
+ {LevelStatType::NUM_FILES, LevelStat{"NumFiles", "Files"}},
+ {LevelStatType::COMPACTED_FILES,
+ LevelStat{"CompactedFiles", "CompactedFiles"}},
+ {LevelStatType::SIZE_BYTES, LevelStat{"SizeBytes", "Size"}},
+ {LevelStatType::SCORE, LevelStat{"Score", "Score"}},
+ {LevelStatType::READ_GB, LevelStat{"ReadGB", "Read(GB)"}},
+ {LevelStatType::RN_GB, LevelStat{"RnGB", "Rn(GB)"}},
+ {LevelStatType::RNP1_GB, LevelStat{"Rnp1GB", "Rnp1(GB)"}},
+ {LevelStatType::WRITE_GB, LevelStat{"WriteGB", "Write(GB)"}},
+ {LevelStatType::W_NEW_GB, LevelStat{"WnewGB", "Wnew(GB)"}},
+ {LevelStatType::MOVED_GB, LevelStat{"MovedGB", "Moved(GB)"}},
+ {LevelStatType::WRITE_AMP, LevelStat{"WriteAmp", "W-Amp"}},
+ {LevelStatType::READ_MBPS, LevelStat{"ReadMBps", "Rd(MB/s)"}},
+ {LevelStatType::WRITE_MBPS, LevelStat{"WriteMBps", "Wr(MB/s)"}},
+ {LevelStatType::COMP_SEC, LevelStat{"CompSec", "Comp(sec)"}},
+ {LevelStatType::COMP_CPU_SEC,
+ LevelStat{"CompMergeCPU", "CompMergeCPU(sec)"}},
+ {LevelStatType::COMP_COUNT, LevelStat{"CompCount", "Comp(cnt)"}},
+ {LevelStatType::AVG_SEC, LevelStat{"AvgSec", "Avg(sec)"}},
+ {LevelStatType::KEY_IN, LevelStat{"KeyIn", "KeyIn"}},
+ {LevelStatType::KEY_DROP, LevelStat{"KeyDrop", "KeyDrop"}},
+ {LevelStatType::R_BLOB_GB, LevelStat{"RblobGB", "Rblob(GB)"}},
+ {LevelStatType::W_BLOB_GB, LevelStat{"WblobGB", "Wblob(GB)"}},
+};
+
+const std::map<InternalStats::InternalDBStatsType, DBStatInfo>
+ InternalStats::db_stats_type_to_info = {
+ {InternalStats::kIntStatsWalFileBytes,
+ DBStatInfo{"db.wal_bytes_written"}},
+ {InternalStats::kIntStatsWalFileSynced, DBStatInfo{"db.wal_syncs"}},
+ {InternalStats::kIntStatsBytesWritten,
+ DBStatInfo{"db.user_bytes_written"}},
+ {InternalStats::kIntStatsNumKeysWritten,
+ DBStatInfo{"db.user_keys_written"}},
+ {InternalStats::kIntStatsWriteDoneByOther,
+ DBStatInfo{"db.user_writes_by_other"}},
+ {InternalStats::kIntStatsWriteDoneBySelf,
+ DBStatInfo{"db.user_writes_by_self"}},
+ {InternalStats::kIntStatsWriteWithWal,
+ DBStatInfo{"db.user_writes_with_wal"}},
+ {InternalStats::kIntStatsWriteStallMicros,
+ DBStatInfo{"db.user_write_stall_micros"}},
+};
+
+namespace {
+const double kMB = 1048576.0;
+const double kGB = kMB * 1024;
+const double kMicrosInSec = 1000000.0;
+
+void PrintLevelStatsHeader(char* buf, size_t len, const std::string& cf_name,
+ const std::string& group_by) {
+ int written_size =
+ snprintf(buf, len, "\n** Compaction Stats [%s] **\n", cf_name.c_str());
+ written_size = std::min(written_size, static_cast<int>(len));
+ auto hdr = [](LevelStatType t) {
+ return InternalStats::compaction_level_stats.at(t).header_name.c_str();
+ };
+ int line_size = snprintf(
+ buf + written_size, len - written_size,
+ "%s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s "
+ "%s\n",
+ // Note that we skip COMPACTED_FILES and merge it with Files column
+ group_by.c_str(), hdr(LevelStatType::NUM_FILES),
+ hdr(LevelStatType::SIZE_BYTES), hdr(LevelStatType::SCORE),
+ hdr(LevelStatType::READ_GB), hdr(LevelStatType::RN_GB),
+ hdr(LevelStatType::RNP1_GB), hdr(LevelStatType::WRITE_GB),
+ hdr(LevelStatType::W_NEW_GB), hdr(LevelStatType::MOVED_GB),
+ hdr(LevelStatType::WRITE_AMP), hdr(LevelStatType::READ_MBPS),
+ hdr(LevelStatType::WRITE_MBPS), hdr(LevelStatType::COMP_SEC),
+ hdr(LevelStatType::COMP_CPU_SEC), hdr(LevelStatType::COMP_COUNT),
+ hdr(LevelStatType::AVG_SEC), hdr(LevelStatType::KEY_IN),
+ hdr(LevelStatType::KEY_DROP), hdr(LevelStatType::R_BLOB_GB),
+ hdr(LevelStatType::W_BLOB_GB));
+
+ written_size += line_size;
+ written_size = std::min(written_size, static_cast<int>(len));
+ snprintf(buf + written_size, len - written_size, "%s\n",
+ std::string(line_size, '-').c_str());
+}
+
+void PrepareLevelStats(std::map<LevelStatType, double>* level_stats,
+ int num_files, int being_compacted,
+ double total_file_size, double score, double w_amp,
+ const InternalStats::CompactionStats& stats) {
+ const uint64_t bytes_read = stats.bytes_read_non_output_levels +
+ stats.bytes_read_output_level +
+ stats.bytes_read_blob;
+ const uint64_t bytes_written = stats.bytes_written + stats.bytes_written_blob;
+ const int64_t bytes_new = stats.bytes_written - stats.bytes_read_output_level;
+ const double elapsed = (stats.micros + 1) / kMicrosInSec;
+
+ (*level_stats)[LevelStatType::NUM_FILES] = num_files;
+ (*level_stats)[LevelStatType::COMPACTED_FILES] = being_compacted;
+ (*level_stats)[LevelStatType::SIZE_BYTES] = total_file_size;
+ (*level_stats)[LevelStatType::SCORE] = score;
+ (*level_stats)[LevelStatType::READ_GB] = bytes_read / kGB;
+ (*level_stats)[LevelStatType::RN_GB] =
+ stats.bytes_read_non_output_levels / kGB;
+ (*level_stats)[LevelStatType::RNP1_GB] = stats.bytes_read_output_level / kGB;
+ (*level_stats)[LevelStatType::WRITE_GB] = stats.bytes_written / kGB;
+ (*level_stats)[LevelStatType::W_NEW_GB] = bytes_new / kGB;
+ (*level_stats)[LevelStatType::MOVED_GB] = stats.bytes_moved / kGB;
+ (*level_stats)[LevelStatType::WRITE_AMP] = w_amp;
+ (*level_stats)[LevelStatType::READ_MBPS] = bytes_read / kMB / elapsed;
+ (*level_stats)[LevelStatType::WRITE_MBPS] = bytes_written / kMB / elapsed;
+ (*level_stats)[LevelStatType::COMP_SEC] = stats.micros / kMicrosInSec;
+ (*level_stats)[LevelStatType::COMP_CPU_SEC] = stats.cpu_micros / kMicrosInSec;
+ (*level_stats)[LevelStatType::COMP_COUNT] = stats.count;
+ (*level_stats)[LevelStatType::AVG_SEC] =
+ stats.count == 0 ? 0 : stats.micros / kMicrosInSec / stats.count;
+ (*level_stats)[LevelStatType::KEY_IN] =
+ static_cast<double>(stats.num_input_records);
+ (*level_stats)[LevelStatType::KEY_DROP] =
+ static_cast<double>(stats.num_dropped_records);
+ (*level_stats)[LevelStatType::R_BLOB_GB] = stats.bytes_read_blob / kGB;
+ (*level_stats)[LevelStatType::W_BLOB_GB] = stats.bytes_written_blob / kGB;
+}
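+
+// Editorial worked example (not in the upstream source): with
+// bytes_read_non_output_levels = 4 GB, bytes_read_output_level = 6 GB, no blob
+// reads or writes, bytes_written = 9 GB and micros covering ~100 seconds, the
+// row reports Read(GB)=10, Rn(GB)=4, Rnp1(GB)=6, Write(GB)=9, Wnew(GB)=3,
+// Rd(MB/s)~=102.4 and Wr(MB/s)~=92.2. W-Amp is not derived here; it is passed
+// in by the caller.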
+
+void PrintLevelStats(char* buf, size_t len, const std::string& name,
+ const std::map<LevelStatType, double>& stat_value) {
+ snprintf(
+ buf, len,
+ "%4s " /* Level */
+ "%6d/%-3d " /* Files */
+ "%8s " /* Size */
+ "%5.1f " /* Score */
+ "%8.1f " /* Read(GB) */
+ "%7.1f " /* Rn(GB) */
+ "%8.1f " /* Rnp1(GB) */
+ "%9.1f " /* Write(GB) */
+ "%8.1f " /* Wnew(GB) */
+ "%9.1f " /* Moved(GB) */
+ "%5.1f " /* W-Amp */
+ "%8.1f " /* Rd(MB/s) */
+ "%8.1f " /* Wr(MB/s) */
+ "%9.2f " /* Comp(sec) */
+ "%17.2f " /* CompMergeCPU(sec) */
+ "%9d " /* Comp(cnt) */
+ "%8.3f " /* Avg(sec) */
+ "%7s " /* KeyIn */
+ "%6s " /* KeyDrop */
+ "%9.1f " /* Rblob(GB) */
+ "%9.1f\n", /* Wblob(GB) */
+ name.c_str(), static_cast<int>(stat_value.at(LevelStatType::NUM_FILES)),
+ static_cast<int>(stat_value.at(LevelStatType::COMPACTED_FILES)),
+ BytesToHumanString(
+ static_cast<uint64_t>(stat_value.at(LevelStatType::SIZE_BYTES)))
+ .c_str(),
+ stat_value.at(LevelStatType::SCORE),
+ stat_value.at(LevelStatType::READ_GB),
+ stat_value.at(LevelStatType::RN_GB),
+ stat_value.at(LevelStatType::RNP1_GB),
+ stat_value.at(LevelStatType::WRITE_GB),
+ stat_value.at(LevelStatType::W_NEW_GB),
+ stat_value.at(LevelStatType::MOVED_GB),
+ stat_value.at(LevelStatType::WRITE_AMP),
+ stat_value.at(LevelStatType::READ_MBPS),
+ stat_value.at(LevelStatType::WRITE_MBPS),
+ stat_value.at(LevelStatType::COMP_SEC),
+ stat_value.at(LevelStatType::COMP_CPU_SEC),
+ static_cast<int>(stat_value.at(LevelStatType::COMP_COUNT)),
+ stat_value.at(LevelStatType::AVG_SEC),
+ NumberToHumanString(
+ static_cast<std::int64_t>(stat_value.at(LevelStatType::KEY_IN)))
+ .c_str(),
+ NumberToHumanString(
+ static_cast<std::int64_t>(stat_value.at(LevelStatType::KEY_DROP)))
+ .c_str(),
+ stat_value.at(LevelStatType::R_BLOB_GB),
+ stat_value.at(LevelStatType::W_BLOB_GB));
+}
+
+void PrintLevelStats(char* buf, size_t len, const std::string& name,
+ int num_files, int being_compacted, double total_file_size,
+ double score, double w_amp,
+ const InternalStats::CompactionStats& stats) {
+ std::map<LevelStatType, double> level_stats;
+ PrepareLevelStats(&level_stats, num_files, being_compacted, total_file_size,
+ score, w_amp, stats);
+ PrintLevelStats(buf, len, name, level_stats);
+}
+
+// Assumes that trailing numbers represent an optional argument. This requires
+// property names to not end with numbers.
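+// Editorial example: GetPropertyNameAndArg("rocksdb.num-files-at-level5")
+// yields {"rocksdb.num-files-at-level", "5"}, letting the handler treat the
+// trailing digits as the level argument.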
+std::pair<Slice, Slice> GetPropertyNameAndArg(const Slice& property) {
+ Slice name = property, arg = property;
+ size_t sfx_len = 0;
+ while (sfx_len < property.size() &&
+ isdigit(property[property.size() - sfx_len - 1])) {
+ ++sfx_len;
+ }
+ name.remove_suffix(sfx_len);
+ arg.remove_prefix(property.size() - sfx_len);
+ return {name, arg};
+}
+} // anonymous namespace
+
+static const std::string rocksdb_prefix = "rocksdb.";
+
+static const std::string num_files_at_level_prefix = "num-files-at-level";
+static const std::string compression_ratio_at_level_prefix =
+ "compression-ratio-at-level";
+static const std::string allstats = "stats";
+static const std::string sstables = "sstables";
+static const std::string cfstats = "cfstats";
+static const std::string cfstats_no_file_histogram =
+ "cfstats-no-file-histogram";
+static const std::string cf_file_histogram = "cf-file-histogram";
+static const std::string dbstats = "dbstats";
+static const std::string levelstats = "levelstats";
+static const std::string block_cache_entry_stats = "block-cache-entry-stats";
+static const std::string fast_block_cache_entry_stats =
+ "fast-block-cache-entry-stats";
+static const std::string num_immutable_mem_table = "num-immutable-mem-table";
+static const std::string num_immutable_mem_table_flushed =
+ "num-immutable-mem-table-flushed";
+static const std::string mem_table_flush_pending = "mem-table-flush-pending";
+static const std::string compaction_pending = "compaction-pending";
+static const std::string background_errors = "background-errors";
+static const std::string cur_size_active_mem_table =
+ "cur-size-active-mem-table";
+static const std::string cur_size_all_mem_tables = "cur-size-all-mem-tables";
+static const std::string size_all_mem_tables = "size-all-mem-tables";
+static const std::string num_entries_active_mem_table =
+ "num-entries-active-mem-table";
+static const std::string num_entries_imm_mem_tables =
+ "num-entries-imm-mem-tables";
+static const std::string num_deletes_active_mem_table =
+ "num-deletes-active-mem-table";
+static const std::string num_deletes_imm_mem_tables =
+ "num-deletes-imm-mem-tables";
+static const std::string estimate_num_keys = "estimate-num-keys";
+static const std::string estimate_table_readers_mem =
+ "estimate-table-readers-mem";
+static const std::string is_file_deletions_enabled =
+ "is-file-deletions-enabled";
+static const std::string num_snapshots = "num-snapshots";
+static const std::string oldest_snapshot_time = "oldest-snapshot-time";
+static const std::string oldest_snapshot_sequence = "oldest-snapshot-sequence";
+static const std::string num_live_versions = "num-live-versions";
+static const std::string current_version_number =
+ "current-super-version-number";
+static const std::string estimate_live_data_size = "estimate-live-data-size";
+static const std::string min_log_number_to_keep_str = "min-log-number-to-keep";
+static const std::string min_obsolete_sst_number_to_keep_str =
+ "min-obsolete-sst-number-to-keep";
+static const std::string base_level_str = "base-level";
+static const std::string total_sst_files_size = "total-sst-files-size";
+static const std::string live_sst_files_size = "live-sst-files-size";
+static const std::string live_sst_files_size_at_temperature =
+ "live-sst-files-size-at-temperature";
+static const std::string estimate_pending_comp_bytes =
+ "estimate-pending-compaction-bytes";
+static const std::string aggregated_table_properties =
+ "aggregated-table-properties";
+static const std::string aggregated_table_properties_at_level =
+ aggregated_table_properties + "-at-level";
+static const std::string num_running_compactions = "num-running-compactions";
+static const std::string num_running_flushes = "num-running-flushes";
+static const std::string actual_delayed_write_rate =
+ "actual-delayed-write-rate";
+static const std::string is_write_stopped = "is-write-stopped";
+static const std::string estimate_oldest_key_time = "estimate-oldest-key-time";
+static const std::string block_cache_capacity = "block-cache-capacity";
+static const std::string block_cache_usage = "block-cache-usage";
+static const std::string block_cache_pinned_usage = "block-cache-pinned-usage";
+static const std::string options_statistics = "options-statistics";
+static const std::string num_blob_files = "num-blob-files";
+static const std::string blob_stats = "blob-stats";
+static const std::string total_blob_file_size = "total-blob-file-size";
+static const std::string live_blob_file_size = "live-blob-file-size";
+static const std::string live_blob_file_garbage_size =
+ "live-blob-file-garbage-size";
+static const std::string blob_cache_capacity = "blob-cache-capacity";
+static const std::string blob_cache_usage = "blob-cache-usage";
+static const std::string blob_cache_pinned_usage = "blob-cache-pinned-usage";
+
+const std::string DB::Properties::kNumFilesAtLevelPrefix =
+ rocksdb_prefix + num_files_at_level_prefix;
+const std::string DB::Properties::kCompressionRatioAtLevelPrefix =
+ rocksdb_prefix + compression_ratio_at_level_prefix;
+const std::string DB::Properties::kStats = rocksdb_prefix + allstats;
+const std::string DB::Properties::kSSTables = rocksdb_prefix + sstables;
+const std::string DB::Properties::kCFStats = rocksdb_prefix + cfstats;
+const std::string DB::Properties::kCFStatsNoFileHistogram =
+ rocksdb_prefix + cfstats_no_file_histogram;
+const std::string DB::Properties::kCFFileHistogram =
+ rocksdb_prefix + cf_file_histogram;
+const std::string DB::Properties::kDBStats = rocksdb_prefix + dbstats;
+const std::string DB::Properties::kLevelStats = rocksdb_prefix + levelstats;
+const std::string DB::Properties::kBlockCacheEntryStats =
+ rocksdb_prefix + block_cache_entry_stats;
+const std::string DB::Properties::kFastBlockCacheEntryStats =
+ rocksdb_prefix + fast_block_cache_entry_stats;
+const std::string DB::Properties::kNumImmutableMemTable =
+ rocksdb_prefix + num_immutable_mem_table;
+const std::string DB::Properties::kNumImmutableMemTableFlushed =
+ rocksdb_prefix + num_immutable_mem_table_flushed;
+const std::string DB::Properties::kMemTableFlushPending =
+ rocksdb_prefix + mem_table_flush_pending;
+const std::string DB::Properties::kCompactionPending =
+ rocksdb_prefix + compaction_pending;
+const std::string DB::Properties::kNumRunningCompactions =
+ rocksdb_prefix + num_running_compactions;
+const std::string DB::Properties::kNumRunningFlushes =
+ rocksdb_prefix + num_running_flushes;
+const std::string DB::Properties::kBackgroundErrors =
+ rocksdb_prefix + background_errors;
+const std::string DB::Properties::kCurSizeActiveMemTable =
+ rocksdb_prefix + cur_size_active_mem_table;
+const std::string DB::Properties::kCurSizeAllMemTables =
+ rocksdb_prefix + cur_size_all_mem_tables;
+const std::string DB::Properties::kSizeAllMemTables =
+ rocksdb_prefix + size_all_mem_tables;
+const std::string DB::Properties::kNumEntriesActiveMemTable =
+ rocksdb_prefix + num_entries_active_mem_table;
+const std::string DB::Properties::kNumEntriesImmMemTables =
+ rocksdb_prefix + num_entries_imm_mem_tables;
+const std::string DB::Properties::kNumDeletesActiveMemTable =
+ rocksdb_prefix + num_deletes_active_mem_table;
+const std::string DB::Properties::kNumDeletesImmMemTables =
+ rocksdb_prefix + num_deletes_imm_mem_tables;
+const std::string DB::Properties::kEstimateNumKeys =
+ rocksdb_prefix + estimate_num_keys;
+const std::string DB::Properties::kEstimateTableReadersMem =
+ rocksdb_prefix + estimate_table_readers_mem;
+const std::string DB::Properties::kIsFileDeletionsEnabled =
+ rocksdb_prefix + is_file_deletions_enabled;
+const std::string DB::Properties::kNumSnapshots =
+ rocksdb_prefix + num_snapshots;
+const std::string DB::Properties::kOldestSnapshotTime =
+ rocksdb_prefix + oldest_snapshot_time;
+const std::string DB::Properties::kOldestSnapshotSequence =
+ rocksdb_prefix + oldest_snapshot_sequence;
+const std::string DB::Properties::kNumLiveVersions =
+ rocksdb_prefix + num_live_versions;
+const std::string DB::Properties::kCurrentSuperVersionNumber =
+ rocksdb_prefix + current_version_number;
+const std::string DB::Properties::kEstimateLiveDataSize =
+ rocksdb_prefix + estimate_live_data_size;
+const std::string DB::Properties::kMinLogNumberToKeep =
+ rocksdb_prefix + min_log_number_to_keep_str;
+const std::string DB::Properties::kMinObsoleteSstNumberToKeep =
+ rocksdb_prefix + min_obsolete_sst_number_to_keep_str;
+const std::string DB::Properties::kTotalSstFilesSize =
+ rocksdb_prefix + total_sst_files_size;
+const std::string DB::Properties::kLiveSstFilesSize =
+ rocksdb_prefix + live_sst_files_size;
+const std::string DB::Properties::kBaseLevel = rocksdb_prefix + base_level_str;
+const std::string DB::Properties::kEstimatePendingCompactionBytes =
+ rocksdb_prefix + estimate_pending_comp_bytes;
+const std::string DB::Properties::kAggregatedTableProperties =
+ rocksdb_prefix + aggregated_table_properties;
+const std::string DB::Properties::kAggregatedTablePropertiesAtLevel =
+ rocksdb_prefix + aggregated_table_properties_at_level;
+const std::string DB::Properties::kActualDelayedWriteRate =
+ rocksdb_prefix + actual_delayed_write_rate;
+const std::string DB::Properties::kIsWriteStopped =
+ rocksdb_prefix + is_write_stopped;
+const std::string DB::Properties::kEstimateOldestKeyTime =
+ rocksdb_prefix + estimate_oldest_key_time;
+const std::string DB::Properties::kBlockCacheCapacity =
+ rocksdb_prefix + block_cache_capacity;
+const std::string DB::Properties::kBlockCacheUsage =
+ rocksdb_prefix + block_cache_usage;
+const std::string DB::Properties::kBlockCachePinnedUsage =
+ rocksdb_prefix + block_cache_pinned_usage;
+const std::string DB::Properties::kOptionsStatistics =
+ rocksdb_prefix + options_statistics;
+const std::string DB::Properties::kLiveSstFilesSizeAtTemperature =
+ rocksdb_prefix + live_sst_files_size_at_temperature;
+const std::string DB::Properties::kNumBlobFiles =
+ rocksdb_prefix + num_blob_files;
+const std::string DB::Properties::kBlobStats = rocksdb_prefix + blob_stats;
+const std::string DB::Properties::kTotalBlobFileSize =
+ rocksdb_prefix + total_blob_file_size;
+const std::string DB::Properties::kLiveBlobFileSize =
+ rocksdb_prefix + live_blob_file_size;
+const std::string DB::Properties::kLiveBlobFileGarbageSize =
+ rocksdb_prefix + live_blob_file_garbage_size;
+const std::string DB::Properties::kBlobCacheCapacity =
+ rocksdb_prefix + blob_cache_capacity;
+const std::string DB::Properties::kBlobCacheUsage =
+ rocksdb_prefix + blob_cache_usage;
+const std::string DB::Properties::kBlobCachePinnedUsage =
+ rocksdb_prefix + blob_cache_pinned_usage;
+
+const std::string InternalStats::kPeriodicCFStats =
+ DB::Properties::kCFStats + ".periodic";
+const int InternalStats::kMaxNoChangePeriodSinceDump = 8;
+
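+// Editorial note: each entry maps a property name to a DBPropertyInfo whose
+// leading bool is set only for the more expensive properties (the block cache
+// entry stats, estimate-table-readers-mem and estimate-live-data-size below),
+// followed by up to three member-function handlers -- string, int and map --
+// with unused slots left as nullptr. The struct definition itself is not part
+// of this diff, so the interpretation of the bool (computed without holding
+// the DB mutex in upstream RocksDB) is an assumption.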
+const UnorderedMap<std::string, DBPropertyInfo>
+ InternalStats::ppt_name_to_info = {
+ {DB::Properties::kNumFilesAtLevelPrefix,
+ {false, &InternalStats::HandleNumFilesAtLevel, nullptr, nullptr,
+ nullptr}},
+ {DB::Properties::kCompressionRatioAtLevelPrefix,
+ {false, &InternalStats::HandleCompressionRatioAtLevelPrefix, nullptr,
+ nullptr, nullptr}},
+ {DB::Properties::kLevelStats,
+ {false, &InternalStats::HandleLevelStats, nullptr, nullptr, nullptr}},
+ {DB::Properties::kStats,
+ {false, &InternalStats::HandleStats, nullptr, nullptr, nullptr}},
+ {DB::Properties::kCFStats,
+ {false, &InternalStats::HandleCFStats, nullptr,
+ &InternalStats::HandleCFMapStats, nullptr}},
+ {InternalStats::kPeriodicCFStats,
+ {false, &InternalStats::HandleCFStatsPeriodic, nullptr, nullptr,
+ nullptr}},
+ {DB::Properties::kCFStatsNoFileHistogram,
+ {false, &InternalStats::HandleCFStatsNoFileHistogram, nullptr, nullptr,
+ nullptr}},
+ {DB::Properties::kCFFileHistogram,
+ {false, &InternalStats::HandleCFFileHistogram, nullptr, nullptr,
+ nullptr}},
+ {DB::Properties::kDBStats,
+ {false, &InternalStats::HandleDBStats, nullptr,
+ &InternalStats::HandleDBMapStats, nullptr}},
+ {DB::Properties::kBlockCacheEntryStats,
+ {true, &InternalStats::HandleBlockCacheEntryStats, nullptr,
+ &InternalStats::HandleBlockCacheEntryStatsMap, nullptr}},
+ {DB::Properties::kFastBlockCacheEntryStats,
+ {true, &InternalStats::HandleFastBlockCacheEntryStats, nullptr,
+ &InternalStats::HandleFastBlockCacheEntryStatsMap, nullptr}},
+ {DB::Properties::kSSTables,
+ {false, &InternalStats::HandleSsTables, nullptr, nullptr, nullptr}},
+ {DB::Properties::kAggregatedTableProperties,
+ {false, &InternalStats::HandleAggregatedTableProperties, nullptr,
+ &InternalStats::HandleAggregatedTablePropertiesMap, nullptr}},
+ {DB::Properties::kAggregatedTablePropertiesAtLevel,
+ {false, &InternalStats::HandleAggregatedTablePropertiesAtLevel,
+ nullptr, &InternalStats::HandleAggregatedTablePropertiesAtLevelMap,
+ nullptr}},
+ {DB::Properties::kNumImmutableMemTable,
+ {false, nullptr, &InternalStats::HandleNumImmutableMemTable, nullptr,
+ nullptr}},
+ {DB::Properties::kNumImmutableMemTableFlushed,
+ {false, nullptr, &InternalStats::HandleNumImmutableMemTableFlushed,
+ nullptr, nullptr}},
+ {DB::Properties::kMemTableFlushPending,
+ {false, nullptr, &InternalStats::HandleMemTableFlushPending, nullptr,
+ nullptr}},
+ {DB::Properties::kCompactionPending,
+ {false, nullptr, &InternalStats::HandleCompactionPending, nullptr,
+ nullptr}},
+ {DB::Properties::kBackgroundErrors,
+ {false, nullptr, &InternalStats::HandleBackgroundErrors, nullptr,
+ nullptr}},
+ {DB::Properties::kCurSizeActiveMemTable,
+ {false, nullptr, &InternalStats::HandleCurSizeActiveMemTable, nullptr,
+ nullptr}},
+ {DB::Properties::kCurSizeAllMemTables,
+ {false, nullptr, &InternalStats::HandleCurSizeAllMemTables, nullptr,
+ nullptr}},
+ {DB::Properties::kSizeAllMemTables,
+ {false, nullptr, &InternalStats::HandleSizeAllMemTables, nullptr,
+ nullptr}},
+ {DB::Properties::kNumEntriesActiveMemTable,
+ {false, nullptr, &InternalStats::HandleNumEntriesActiveMemTable,
+ nullptr, nullptr}},
+ {DB::Properties::kNumEntriesImmMemTables,
+ {false, nullptr, &InternalStats::HandleNumEntriesImmMemTables, nullptr,
+ nullptr}},
+ {DB::Properties::kNumDeletesActiveMemTable,
+ {false, nullptr, &InternalStats::HandleNumDeletesActiveMemTable,
+ nullptr, nullptr}},
+ {DB::Properties::kNumDeletesImmMemTables,
+ {false, nullptr, &InternalStats::HandleNumDeletesImmMemTables, nullptr,
+ nullptr}},
+ {DB::Properties::kEstimateNumKeys,
+ {false, nullptr, &InternalStats::HandleEstimateNumKeys, nullptr,
+ nullptr}},
+ {DB::Properties::kEstimateTableReadersMem,
+ {true, nullptr, &InternalStats::HandleEstimateTableReadersMem, nullptr,
+ nullptr}},
+ {DB::Properties::kIsFileDeletionsEnabled,
+ {false, nullptr, &InternalStats::HandleIsFileDeletionsEnabled, nullptr,
+ nullptr}},
+ {DB::Properties::kNumSnapshots,
+ {false, nullptr, &InternalStats::HandleNumSnapshots, nullptr,
+ nullptr}},
+ {DB::Properties::kOldestSnapshotTime,
+ {false, nullptr, &InternalStats::HandleOldestSnapshotTime, nullptr,
+ nullptr}},
+ {DB::Properties::kOldestSnapshotSequence,
+ {false, nullptr, &InternalStats::HandleOldestSnapshotSequence, nullptr,
+ nullptr}},
+ {DB::Properties::kNumLiveVersions,
+ {false, nullptr, &InternalStats::HandleNumLiveVersions, nullptr,
+ nullptr}},
+ {DB::Properties::kCurrentSuperVersionNumber,
+ {false, nullptr, &InternalStats::HandleCurrentSuperVersionNumber,
+ nullptr, nullptr}},
+ {DB::Properties::kEstimateLiveDataSize,
+ {true, nullptr, &InternalStats::HandleEstimateLiveDataSize, nullptr,
+ nullptr}},
+ {DB::Properties::kMinLogNumberToKeep,
+ {false, nullptr, &InternalStats::HandleMinLogNumberToKeep, nullptr,
+ nullptr}},
+ {DB::Properties::kMinObsoleteSstNumberToKeep,
+ {false, nullptr, &InternalStats::HandleMinObsoleteSstNumberToKeep,
+ nullptr, nullptr}},
+ {DB::Properties::kBaseLevel,
+ {false, nullptr, &InternalStats::HandleBaseLevel, nullptr, nullptr}},
+ {DB::Properties::kTotalSstFilesSize,
+ {false, nullptr, &InternalStats::HandleTotalSstFilesSize, nullptr,
+ nullptr}},
+ {DB::Properties::kLiveSstFilesSize,
+ {false, nullptr, &InternalStats::HandleLiveSstFilesSize, nullptr,
+ nullptr}},
+ {DB::Properties::kLiveSstFilesSizeAtTemperature,
+ {false, &InternalStats::HandleLiveSstFilesSizeAtTemperature, nullptr,
+ nullptr, nullptr}},
+ {DB::Properties::kEstimatePendingCompactionBytes,
+ {false, nullptr, &InternalStats::HandleEstimatePendingCompactionBytes,
+ nullptr, nullptr}},
+ {DB::Properties::kNumRunningFlushes,
+ {false, nullptr, &InternalStats::HandleNumRunningFlushes, nullptr,
+ nullptr}},
+ {DB::Properties::kNumRunningCompactions,
+ {false, nullptr, &InternalStats::HandleNumRunningCompactions, nullptr,
+ nullptr}},
+ {DB::Properties::kActualDelayedWriteRate,
+ {false, nullptr, &InternalStats::HandleActualDelayedWriteRate, nullptr,
+ nullptr}},
+ {DB::Properties::kIsWriteStopped,
+ {false, nullptr, &InternalStats::HandleIsWriteStopped, nullptr,
+ nullptr}},
+ {DB::Properties::kEstimateOldestKeyTime,
+ {false, nullptr, &InternalStats::HandleEstimateOldestKeyTime, nullptr,
+ nullptr}},
+ {DB::Properties::kBlockCacheCapacity,
+ {false, nullptr, &InternalStats::HandleBlockCacheCapacity, nullptr,
+ nullptr}},
+ {DB::Properties::kBlockCacheUsage,
+ {false, nullptr, &InternalStats::HandleBlockCacheUsage, nullptr,
+ nullptr}},
+ {DB::Properties::kBlockCachePinnedUsage,
+ {false, nullptr, &InternalStats::HandleBlockCachePinnedUsage, nullptr,
+ nullptr}},
+ {DB::Properties::kOptionsStatistics,
+ {true, nullptr, nullptr, nullptr,
+ &DBImpl::GetPropertyHandleOptionsStatistics}},
+ {DB::Properties::kNumBlobFiles,
+ {false, nullptr, &InternalStats::HandleNumBlobFiles, nullptr,
+ nullptr}},
+ {DB::Properties::kBlobStats,
+ {false, &InternalStats::HandleBlobStats, nullptr, nullptr, nullptr}},
+ {DB::Properties::kTotalBlobFileSize,
+ {false, nullptr, &InternalStats::HandleTotalBlobFileSize, nullptr,
+ nullptr}},
+ {DB::Properties::kLiveBlobFileSize,
+ {false, nullptr, &InternalStats::HandleLiveBlobFileSize, nullptr,
+ nullptr}},
+ {DB::Properties::kLiveBlobFileGarbageSize,
+ {false, nullptr, &InternalStats::HandleLiveBlobFileGarbageSize,
+ nullptr, nullptr}},
+ {DB::Properties::kBlobCacheCapacity,
+ {false, nullptr, &InternalStats::HandleBlobCacheCapacity, nullptr,
+ nullptr}},
+ {DB::Properties::kBlobCacheUsage,
+ {false, nullptr, &InternalStats::HandleBlobCacheUsage, nullptr,
+ nullptr}},
+ {DB::Properties::kBlobCachePinnedUsage,
+ {false, nullptr, &InternalStats::HandleBlobCachePinnedUsage, nullptr,
+ nullptr}},
+};
+
+InternalStats::InternalStats(int num_levels, SystemClock* clock,
+ ColumnFamilyData* cfd)
+ : db_stats_{},
+ cf_stats_value_{},
+ cf_stats_count_{},
+ comp_stats_(num_levels),
+ comp_stats_by_pri_(Env::Priority::TOTAL),
+ file_read_latency_(num_levels),
+ has_cf_change_since_dump_(true),
+ bg_error_count_(0),
+ number_levels_(num_levels),
+ clock_(clock),
+ cfd_(cfd),
+ started_at_(clock->NowMicros()) {
+ Cache* block_cache = GetBlockCacheForStats();
+ if (block_cache) {
+ // Extract or create stats collector. Could fail in rare cases.
+ Status s = CacheEntryStatsCollector<CacheEntryRoleStats>::GetShared(
+ block_cache, clock_, &cache_entry_stats_collector_);
+ if (s.ok()) {
+ assert(cache_entry_stats_collector_);
+ } else {
+ assert(!cache_entry_stats_collector_);
+ }
+ }
+}
+
+void InternalStats::TEST_GetCacheEntryRoleStats(CacheEntryRoleStats* stats,
+ bool foreground) {
+ CollectCacheEntryStats(foreground);
+ if (cache_entry_stats_collector_) {
+ cache_entry_stats_collector_->GetStats(stats);
+ }
+}
+
+void InternalStats::CollectCacheEntryStats(bool foreground) {
+ // This function is safe to call from any thread because
+ // cache_entry_stats_collector_ field is const after constructor
+ // and ->GetStats does its own synchronization, which also suffices for
+ // cache_entry_stats_.
+
+ if (!cache_entry_stats_collector_) {
+ return; // nothing to do (e.g. no block cache)
+ }
+
+ // For "background" collections, strictly cap the collection time by
+ // expanding effective cache TTL. For foreground, be more aggressive about
+ // getting latest data.
+ int min_interval_seconds = foreground ? 10 : 180;
+ // 1/500 = max of 0.2% of one CPU thread
+ int min_interval_factor = foreground ? 10 : 500;
+ cache_entry_stats_collector_->CollectStats(min_interval_seconds,
+ min_interval_factor);
+}
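+
+// Rough sketch of the throttling arithmetic above (an interpretation,
+// assuming CollectStats() skips a new scan unless at least
+// max(min_interval_seconds, last_scan_duration * min_interval_factor) has
+// elapsed): a background scan that took 2 seconds would not repeat for at
+// least max(180, 2 * 500) = 1000 seconds, i.e. roughly 0.2% of one CPU
+// thread, while a foreground request would wait only max(10, 2 * 10) = 20
+// seconds.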
+
+std::function<void(const Slice&, void*, size_t, Cache::DeleterFn)>
+InternalStats::CacheEntryRoleStats::GetEntryCallback() {
+ return [&](const Slice& /*key*/, void* /*value*/, size_t charge,
+ Cache::DeleterFn deleter) {
+ auto e = role_map_.find(deleter);
+ size_t role_idx;
+ if (e == role_map_.end()) {
+ role_idx = static_cast<size_t>(CacheEntryRole::kMisc);
+ } else {
+ role_idx = static_cast<size_t>(e->second);
+ }
+ entry_counts[role_idx]++;
+ total_charges[role_idx] += charge;
+ };
+}
+
+void InternalStats::CacheEntryRoleStats::BeginCollection(
+ Cache* cache, SystemClock*, uint64_t start_time_micros) {
+ Clear();
+ last_start_time_micros_ = start_time_micros;
+ ++collection_count;
+ role_map_ = CopyCacheDeleterRoleMap();
+ std::ostringstream str;
+ str << cache->Name() << "@" << static_cast<void*>(cache) << "#"
+ << port::GetProcessID();
+ cache_id = str.str();
+ cache_capacity = cache->GetCapacity();
+ cache_usage = cache->GetUsage();
+ table_size = cache->GetTableAddressCount();
+ occupancy = cache->GetOccupancyCount();
+}
+
+void InternalStats::CacheEntryRoleStats::EndCollection(
+ Cache*, SystemClock*, uint64_t end_time_micros) {
+ last_end_time_micros_ = end_time_micros;
+}
+
+void InternalStats::CacheEntryRoleStats::SkippedCollection() {
+ ++copies_of_last_collection;
+}
+
+uint64_t InternalStats::CacheEntryRoleStats::GetLastDurationMicros() const {
+ if (last_end_time_micros_ > last_start_time_micros_) {
+ return last_end_time_micros_ - last_start_time_micros_;
+ } else {
+ return 0U;
+ }
+}
+
+std::string InternalStats::CacheEntryRoleStats::ToString(
+ SystemClock* clock) const {
+ std::ostringstream str;
+ str << "Block cache " << cache_id
+ << " capacity: " << BytesToHumanString(cache_capacity)
+ << " usage: " << BytesToHumanString(cache_usage)
+ << " table_size: " << table_size << " occupancy: " << occupancy
+ << " collections: " << collection_count
+ << " last_copies: " << copies_of_last_collection
+ << " last_secs: " << (GetLastDurationMicros() / 1000000.0)
+ << " secs_since: "
+ << ((clock->NowMicros() - last_end_time_micros_) / 1000000U) << "\n";
+ str << "Block cache entry stats(count,size,portion):";
+ for (size_t i = 0; i < kNumCacheEntryRoles; ++i) {
+ if (entry_counts[i] > 0) {
+ str << " " << kCacheEntryRoleToCamelString[i] << "(" << entry_counts[i]
+ << "," << BytesToHumanString(total_charges[i]) << ","
+ << (100.0 * total_charges[i] / cache_capacity) << "%)";
+ }
+ }
+ str << "\n";
+ return str.str();
+}
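+
+// Illustrative shape of the ToString() output (all values hypothetical; only
+// the format strings above are authoritative):
+//   Block cache <name>@<address>#<pid> capacity: 8.0 GB usage: 5.3 GB
+//   table_size: 262144 occupancy: 120000 collections: 3 last_copies: 1
+//   last_secs: 0.12 secs_since: 45
+//   Block cache entry stats(count,size,portion): DataBlock(100000,4.5 GB,56.2%) ...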
+
+void InternalStats::CacheEntryRoleStats::ToMap(
+ std::map<std::string, std::string>* values, SystemClock* clock) const {
+ values->clear();
+ auto& v = *values;
+ v[BlockCacheEntryStatsMapKeys::CacheId()] = cache_id;
+ v[BlockCacheEntryStatsMapKeys::CacheCapacityBytes()] =
+ std::to_string(cache_capacity);
+ v[BlockCacheEntryStatsMapKeys::LastCollectionDurationSeconds()] =
+ std::to_string(GetLastDurationMicros() / 1000000.0);
+ v[BlockCacheEntryStatsMapKeys::LastCollectionAgeSeconds()] =
+ std::to_string((clock->NowMicros() - last_end_time_micros_) / 1000000U);
+ for (size_t i = 0; i < kNumCacheEntryRoles; ++i) {
+ auto role = static_cast<CacheEntryRole>(i);
+ v[BlockCacheEntryStatsMapKeys::EntryCount(role)] =
+ std::to_string(entry_counts[i]);
+ v[BlockCacheEntryStatsMapKeys::UsedBytes(role)] =
+ std::to_string(total_charges[i]);
+ v[BlockCacheEntryStatsMapKeys::UsedPercent(role)] =
+ std::to_string(100.0 * total_charges[i] / cache_capacity);
+ }
+}
+
+bool InternalStats::HandleBlockCacheEntryStatsInternal(std::string* value,
+ bool fast) {
+ if (!cache_entry_stats_collector_) {
+ return false;
+ }
+ CollectCacheEntryStats(!fast /* foreground */);
+ CacheEntryRoleStats stats;
+ cache_entry_stats_collector_->GetStats(&stats);
+ *value = stats.ToString(clock_);
+ return true;
+}
+
+bool InternalStats::HandleBlockCacheEntryStatsMapInternal(
+ std::map<std::string, std::string>* values, bool fast) {
+ if (!cache_entry_stats_collector_) {
+ return false;
+ }
+ CollectCacheEntryStats(!fast /* foreground */);
+ CacheEntryRoleStats stats;
+ cache_entry_stats_collector_->GetStats(&stats);
+ stats.ToMap(values, clock_);
+ return true;
+}
+
+bool InternalStats::HandleBlockCacheEntryStats(std::string* value,
+ Slice /*suffix*/) {
+ return HandleBlockCacheEntryStatsInternal(value, false /* fast */);
+}
+
+bool InternalStats::HandleBlockCacheEntryStatsMap(
+ std::map<std::string, std::string>* values, Slice /*suffix*/) {
+ return HandleBlockCacheEntryStatsMapInternal(values, false /* fast */);
+}
+
+bool InternalStats::HandleFastBlockCacheEntryStats(std::string* value,
+ Slice /*suffix*/) {
+ return HandleBlockCacheEntryStatsInternal(value, true /* fast */);
+}
+
+bool InternalStats::HandleFastBlockCacheEntryStatsMap(
+ std::map<std::string, std::string>* values, Slice /*suffix*/) {
+ return HandleBlockCacheEntryStatsMapInternal(values, true /* fast */);
+}
+
+bool InternalStats::HandleLiveSstFilesSizeAtTemperature(std::string* value,
+ Slice suffix) {
+ uint64_t temperature;
+ bool ok = ConsumeDecimalNumber(&suffix, &temperature) && suffix.empty();
+ if (!ok) {
+ return false;
+ }
+
+ uint64_t size = 0;
+ const auto* vstorage = cfd_->current()->storage_info();
+ for (int level = 0; level < vstorage->num_levels(); level++) {
+ for (const auto& file_meta : vstorage->LevelFiles(level)) {
+ if (static_cast<uint8_t>(file_meta->temperature) == temperature) {
+ size += file_meta->fd.GetFileSize();
+ }
+ }
+ }
+
+ *value = std::to_string(size);
+ return true;
+}
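+
+// Hypothetical caller-side usage of the suffix parsed above (the numeric
+// suffix is the Temperature enum value cast to an integer, matching the
+// uint8_t cast in the loop; error handling omitted):
+//   std::string size;
+//   db->GetProperty(DB::Properties::kLiveSstFilesSizeAtTemperature +
+//                       std::to_string(static_cast<int>(Temperature::kWarm)),
+//                   &size);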
+
+bool InternalStats::HandleNumBlobFiles(uint64_t* value, DBImpl* /*db*/,
+ Version* /*version*/) {
+ assert(value);
+ assert(cfd_);
+
+ const auto* current = cfd_->current();
+ assert(current);
+
+ const auto* vstorage = current->storage_info();
+ assert(vstorage);
+
+ const auto& blob_files = vstorage->GetBlobFiles();
+
+ *value = blob_files.size();
+
+ return true;
+}
+
+bool InternalStats::HandleBlobStats(std::string* value, Slice /*suffix*/) {
+ assert(value);
+ assert(cfd_);
+
+ const auto* current = cfd_->current();
+ assert(current);
+
+ const auto* vstorage = current->storage_info();
+ assert(vstorage);
+
+ const auto blob_st = vstorage->GetBlobStats();
+
+ std::ostringstream oss;
+
+ oss << "Number of blob files: " << vstorage->GetBlobFiles().size()
+ << "\nTotal size of blob files: " << blob_st.total_file_size
+ << "\nTotal size of garbage in blob files: " << blob_st.total_garbage_size
+ << "\nBlob file space amplification: " << blob_st.space_amp << '\n';
+
+ value->append(oss.str());
+
+ return true;
+}
+
+bool InternalStats::HandleTotalBlobFileSize(uint64_t* value, DBImpl* /*db*/,
+ Version* /*version*/) {
+ assert(value);
+ assert(cfd_);
+
+ *value = cfd_->GetTotalBlobFileSize();
+
+ return true;
+}
+
+bool InternalStats::HandleLiveBlobFileSize(uint64_t* value, DBImpl* /*db*/,
+ Version* /*version*/) {
+ assert(value);
+ assert(cfd_);
+
+ const auto* current = cfd_->current();
+ assert(current);
+
+ const auto* vstorage = current->storage_info();
+ assert(vstorage);
+
+ *value = vstorage->GetBlobStats().total_file_size;
+
+ return true;
+}
+
+bool InternalStats::HandleLiveBlobFileGarbageSize(uint64_t* value,
+ DBImpl* /*db*/,
+ Version* /*version*/) {
+ assert(value);
+ assert(cfd_);
+
+ const auto* current = cfd_->current();
+ assert(current);
+
+ const auto* vstorage = current->storage_info();
+ assert(vstorage);
+
+ *value = vstorage->GetBlobStats().total_garbage_size;
+
+ return true;
+}
+
+Cache* InternalStats::GetBlobCacheForStats() {
+ return cfd_->ioptions()->blob_cache.get();
+}
+
+bool InternalStats::HandleBlobCacheCapacity(uint64_t* value, DBImpl* /*db*/,
+ Version* /*version*/) {
+ Cache* blob_cache = GetBlobCacheForStats();
+ if (blob_cache) {
+ *value = static_cast<uint64_t>(blob_cache->GetCapacity());
+ return true;
+ }
+ return false;
+}
+
+bool InternalStats::HandleBlobCacheUsage(uint64_t* value, DBImpl* /*db*/,
+ Version* /*version*/) {
+ Cache* blob_cache = GetBlobCacheForStats();
+ if (blob_cache) {
+ *value = static_cast<uint64_t>(blob_cache->GetUsage());
+ return true;
+ }
+ return false;
+}
+
+bool InternalStats::HandleBlobCachePinnedUsage(uint64_t* value, DBImpl* /*db*/,
+ Version* /*version*/) {
+ Cache* blob_cache = GetBlobCacheForStats();
+ if (blob_cache) {
+ *value = static_cast<uint64_t>(blob_cache->GetPinnedUsage());
+ return true;
+ }
+ return false;
+}
+
+const DBPropertyInfo* GetPropertyInfo(const Slice& property) {
+ std::string ppt_name = GetPropertyNameAndArg(property).first.ToString();
+ auto ppt_info_iter = InternalStats::ppt_name_to_info.find(ppt_name);
+ if (ppt_info_iter == InternalStats::ppt_name_to_info.end()) {
+ return nullptr;
+ }
+ return &ppt_info_iter->second;
+}
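+
+// Example of the dispatch above, based on the "rocksdb.num-files-at-level5"
+// case documented in internal_stats.h: GetPropertyNameAndArg() is assumed to
+// split the property into the registered name "rocksdb.num-files-at-level"
+// and the argument "5"; the lookup then hits the kNumFilesAtLevelPrefix
+// entry, whose handle_string handler (HandleNumFilesAtLevel) parses the "5".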
+
+bool InternalStats::GetStringProperty(const DBPropertyInfo& property_info,
+ const Slice& property,
+ std::string* value) {
+ assert(value != nullptr);
+ assert(property_info.handle_string != nullptr);
+ Slice arg = GetPropertyNameAndArg(property).second;
+ return (this->*(property_info.handle_string))(value, arg);
+}
+
+bool InternalStats::GetMapProperty(const DBPropertyInfo& property_info,
+ const Slice& property,
+ std::map<std::string, std::string>* value) {
+ assert(value != nullptr);
+ assert(property_info.handle_map != nullptr);
+ Slice arg = GetPropertyNameAndArg(property).second;
+ return (this->*(property_info.handle_map))(value, arg);
+}
+
+bool InternalStats::GetIntProperty(const DBPropertyInfo& property_info,
+ uint64_t* value, DBImpl* db) {
+ assert(value != nullptr);
+ assert(property_info.handle_int != nullptr &&
+ !property_info.need_out_of_mutex);
+ db->mutex_.AssertHeld();
+ return (this->*(property_info.handle_int))(value, db, nullptr /* version */);
+}
+
+bool InternalStats::GetIntPropertyOutOfMutex(
+ const DBPropertyInfo& property_info, Version* version, uint64_t* value) {
+ assert(value != nullptr);
+ assert(property_info.handle_int != nullptr &&
+ property_info.need_out_of_mutex);
+ return (this->*(property_info.handle_int))(value, nullptr /* db */, version);
+}
+
+bool InternalStats::HandleNumFilesAtLevel(std::string* value, Slice suffix) {
+ uint64_t level;
+ const auto* vstorage = cfd_->current()->storage_info();
+ bool ok = ConsumeDecimalNumber(&suffix, &level) && suffix.empty();
+ if (!ok || static_cast<int>(level) >= number_levels_) {
+ return false;
+ } else {
+ char buf[100];
+ snprintf(buf, sizeof(buf), "%d",
+ vstorage->NumLevelFiles(static_cast<int>(level)));
+ *value = buf;
+ return true;
+ }
+}
+
+bool InternalStats::HandleCompressionRatioAtLevelPrefix(std::string* value,
+ Slice suffix) {
+ uint64_t level;
+ const auto* vstorage = cfd_->current()->storage_info();
+ bool ok = ConsumeDecimalNumber(&suffix, &level) && suffix.empty();
+ if (!ok || level >= static_cast<uint64_t>(number_levels_)) {
+ return false;
+ }
+ *value = std::to_string(
+ vstorage->GetEstimatedCompressionRatioAtLevel(static_cast<int>(level)));
+ return true;
+}
+
+bool InternalStats::HandleLevelStats(std::string* value, Slice /*suffix*/) {
+ char buf[1000];
+ const auto* vstorage = cfd_->current()->storage_info();
+ snprintf(buf, sizeof(buf),
+ "Level Files Size(MB)\n"
+ "--------------------\n");
+ value->append(buf);
+
+ for (int level = 0; level < number_levels_; level++) {
+ snprintf(buf, sizeof(buf), "%3d %8d %8.0f\n", level,
+ vstorage->NumLevelFiles(level),
+ vstorage->NumLevelBytes(level) / kMB);
+ value->append(buf);
+ }
+ return true;
+}
+
+bool InternalStats::HandleStats(std::string* value, Slice suffix) {
+ if (!HandleCFStats(value, suffix)) {
+ return false;
+ }
+ if (!HandleDBStats(value, suffix)) {
+ return false;
+ }
+ return true;
+}
+
+bool InternalStats::HandleCFMapStats(
+ std::map<std::string, std::string>* cf_stats, Slice /*suffix*/) {
+ DumpCFMapStats(cf_stats);
+ return true;
+}
+
+bool InternalStats::HandleCFStats(std::string* value, Slice /*suffix*/) {
+ DumpCFStats(value);
+ return true;
+}
+
+bool InternalStats::HandleCFStatsPeriodic(std::string* value,
+ Slice /*suffix*/) {
+ bool has_change = has_cf_change_since_dump_;
+ if (!has_change) {
+ // If file histogram changes, there is activity in this period too.
+ uint64_t new_histogram_num = 0;
+ for (int level = 0; level < number_levels_; level++) {
+ new_histogram_num += file_read_latency_[level].num();
+ }
+ new_histogram_num += blob_file_read_latency_.num();
+ if (new_histogram_num != last_histogram_num) {
+ has_change = true;
+ last_histogram_num = new_histogram_num;
+ }
+ }
+ if (has_change) {
+ no_cf_change_period_since_dump_ = 0;
+ has_cf_change_since_dump_ = false;
+ } else if (no_cf_change_period_since_dump_++ > 0) {
+ // Not ready to sync
+ if (no_cf_change_period_since_dump_ == kMaxNoChangePeriodSinceDump) {
+ // Next periodic, we need to dump stats even if there is no change.
+ no_cf_change_period_since_dump_ = 0;
+ }
+ return true;
+ }
+
+ DumpCFStatsNoFileHistogram(/*is_periodic=*/true, value);
+ DumpCFFileHistogram(value);
+ return true;
+}
+
+bool InternalStats::HandleCFStatsNoFileHistogram(std::string* value,
+ Slice /*suffix*/) {
+ DumpCFStatsNoFileHistogram(/*is_periodic=*/false, value);
+ return true;
+}
+
+bool InternalStats::HandleCFFileHistogram(std::string* value,
+ Slice /*suffix*/) {
+ DumpCFFileHistogram(value);
+ return true;
+}
+
+bool InternalStats::HandleDBMapStats(
+ std::map<std::string, std::string>* db_stats, Slice /*suffix*/) {
+ DumpDBMapStats(db_stats);
+ return true;
+}
+
+bool InternalStats::HandleDBStats(std::string* value, Slice /*suffix*/) {
+ DumpDBStats(value);
+ return true;
+}
+
+bool InternalStats::HandleSsTables(std::string* value, Slice /*suffix*/) {
+ auto* current = cfd_->current();
+ *value = current->DebugString(true, true);
+ return true;
+}
+
+bool InternalStats::HandleAggregatedTableProperties(std::string* value,
+ Slice /*suffix*/) {
+ std::shared_ptr<const TableProperties> tp;
+ auto s = cfd_->current()->GetAggregatedTableProperties(&tp);
+ if (!s.ok()) {
+ return false;
+ }
+ *value = tp->ToString();
+ return true;
+}
+
+static std::map<std::string, std::string> MapUint64ValuesToString(
+ const std::map<std::string, uint64_t>& from) {
+ std::map<std::string, std::string> to;
+ for (const auto& e : from) {
+ to[e.first] = std::to_string(e.second);
+ }
+ return to;
+}
+
+bool InternalStats::HandleAggregatedTablePropertiesMap(
+ std::map<std::string, std::string>* values, Slice /*suffix*/) {
+ std::shared_ptr<const TableProperties> tp;
+ auto s = cfd_->current()->GetAggregatedTableProperties(&tp);
+ if (!s.ok()) {
+ return false;
+ }
+ *values = MapUint64ValuesToString(tp->GetAggregatablePropertiesAsMap());
+ return true;
+}
+
+bool InternalStats::HandleAggregatedTablePropertiesAtLevel(std::string* values,
+ Slice suffix) {
+ uint64_t level;
+ bool ok = ConsumeDecimalNumber(&suffix, &level) && suffix.empty();
+ if (!ok || static_cast<int>(level) >= number_levels_) {
+ return false;
+ }
+ std::shared_ptr<const TableProperties> tp;
+ auto s = cfd_->current()->GetAggregatedTableProperties(
+ &tp, static_cast<int>(level));
+ if (!s.ok()) {
+ return false;
+ }
+ *values = tp->ToString();
+ return true;
+}
+
+bool InternalStats::HandleAggregatedTablePropertiesAtLevelMap(
+ std::map<std::string, std::string>* values, Slice suffix) {
+ uint64_t level;
+ bool ok = ConsumeDecimalNumber(&suffix, &level) && suffix.empty();
+ if (!ok || static_cast<int>(level) >= number_levels_) {
+ return false;
+ }
+ std::shared_ptr<const TableProperties> tp;
+ auto s = cfd_->current()->GetAggregatedTableProperties(
+ &tp, static_cast<int>(level));
+ if (!s.ok()) {
+ return false;
+ }
+ *values = MapUint64ValuesToString(tp->GetAggregatablePropertiesAsMap());
+ return true;
+}
+
+bool InternalStats::HandleNumImmutableMemTable(uint64_t* value, DBImpl* /*db*/,
+ Version* /*version*/) {
+ *value = cfd_->imm()->NumNotFlushed();
+ return true;
+}
+
+bool InternalStats::HandleNumImmutableMemTableFlushed(uint64_t* value,
+ DBImpl* /*db*/,
+ Version* /*version*/) {
+ *value = cfd_->imm()->NumFlushed();
+ return true;
+}
+
+bool InternalStats::HandleMemTableFlushPending(uint64_t* value, DBImpl* /*db*/,
+ Version* /*version*/) {
+ *value = (cfd_->imm()->IsFlushPending() ? 1 : 0);
+ return true;
+}
+
+bool InternalStats::HandleNumRunningFlushes(uint64_t* value, DBImpl* db,
+ Version* /*version*/) {
+ *value = db->num_running_flushes();
+ return true;
+}
+
+bool InternalStats::HandleCompactionPending(uint64_t* value, DBImpl* /*db*/,
+ Version* /*version*/) {
+  // 1 if the system has already determined that at least one compaction is
+  // needed; 0 otherwise.
+ const auto* vstorage = cfd_->current()->storage_info();
+ *value = (cfd_->compaction_picker()->NeedsCompaction(vstorage) ? 1 : 0);
+ return true;
+}
+
+bool InternalStats::HandleNumRunningCompactions(uint64_t* value, DBImpl* db,
+ Version* /*version*/) {
+ *value = db->num_running_compactions_;
+ return true;
+}
+
+bool InternalStats::HandleBackgroundErrors(uint64_t* value, DBImpl* /*db*/,
+ Version* /*version*/) {
+ // Accumulated number of errors in background flushes or compactions.
+ *value = GetBackgroundErrorCount();
+ return true;
+}
+
+bool InternalStats::HandleCurSizeActiveMemTable(uint64_t* value, DBImpl* /*db*/,
+ Version* /*version*/) {
+ // Current size of the active memtable
+ // Using ApproximateMemoryUsageFast to avoid the need for synchronization
+ *value = cfd_->mem()->ApproximateMemoryUsageFast();
+ return true;
+}
+
+bool InternalStats::HandleCurSizeAllMemTables(uint64_t* value, DBImpl* /*db*/,
+ Version* /*version*/) {
+ // Current size of the active memtable + immutable memtables
+ // Using ApproximateMemoryUsageFast to avoid the need for synchronization
+ *value = cfd_->mem()->ApproximateMemoryUsageFast() +
+ cfd_->imm()->ApproximateUnflushedMemTablesMemoryUsage();
+ return true;
+}
+
+bool InternalStats::HandleSizeAllMemTables(uint64_t* value, DBImpl* /*db*/,
+ Version* /*version*/) {
+ // Using ApproximateMemoryUsageFast to avoid the need for synchronization
+ *value = cfd_->mem()->ApproximateMemoryUsageFast() +
+ cfd_->imm()->ApproximateMemoryUsage();
+ return true;
+}
+
+bool InternalStats::HandleNumEntriesActiveMemTable(uint64_t* value,
+ DBImpl* /*db*/,
+ Version* /*version*/) {
+  // Current number of entries in the active memtable
+ *value = cfd_->mem()->num_entries();
+ return true;
+}
+
+bool InternalStats::HandleNumEntriesImmMemTables(uint64_t* value,
+ DBImpl* /*db*/,
+ Version* /*version*/) {
+ // Current number of entries in the immutable memtables
+ *value = cfd_->imm()->current()->GetTotalNumEntries();
+ return true;
+}
+
+bool InternalStats::HandleNumDeletesActiveMemTable(uint64_t* value,
+ DBImpl* /*db*/,
+ Version* /*version*/) {
+  // Current number of deletes in the active memtable
+ *value = cfd_->mem()->num_deletes();
+ return true;
+}
+
+bool InternalStats::HandleNumDeletesImmMemTables(uint64_t* value,
+ DBImpl* /*db*/,
+ Version* /*version*/) {
+  // Current number of deletes in the immutable memtables
+ *value = cfd_->imm()->current()->GetTotalNumDeletes();
+ return true;
+}
+
+bool InternalStats::HandleEstimateNumKeys(uint64_t* value, DBImpl* /*db*/,
+ Version* /*version*/) {
+ // Estimate number of entries in the column family:
+ // Use estimated entries in tables + total entries in memtables.
+ const auto* vstorage = cfd_->current()->storage_info();
+ uint64_t estimate_keys = cfd_->mem()->num_entries() +
+ cfd_->imm()->current()->GetTotalNumEntries() +
+ vstorage->GetEstimatedActiveKeys();
+ uint64_t estimate_deletes =
+ cfd_->mem()->num_deletes() + cfd_->imm()->current()->GetTotalNumDeletes();
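+  // Each delete is itself counted as an entry above and is also expected to
+  // cancel roughly one existing key, hence the 2x subtraction below (a
+  // heuristic, clamped at zero).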
+ *value = estimate_keys > estimate_deletes * 2
+ ? estimate_keys - (estimate_deletes * 2)
+ : 0;
+ return true;
+}
+
+bool InternalStats::HandleNumSnapshots(uint64_t* value, DBImpl* db,
+ Version* /*version*/) {
+ *value = db->snapshots().count();
+ return true;
+}
+
+bool InternalStats::HandleOldestSnapshotTime(uint64_t* value, DBImpl* db,
+ Version* /*version*/) {
+ *value = static_cast<uint64_t>(db->snapshots().GetOldestSnapshotTime());
+ return true;
+}
+
+bool InternalStats::HandleOldestSnapshotSequence(uint64_t* value, DBImpl* db,
+ Version* /*version*/) {
+ *value = static_cast<uint64_t>(db->snapshots().GetOldestSnapshotSequence());
+ return true;
+}
+
+bool InternalStats::HandleNumLiveVersions(uint64_t* value, DBImpl* /*db*/,
+ Version* /*version*/) {
+ *value = cfd_->GetNumLiveVersions();
+ return true;
+}
+
+bool InternalStats::HandleCurrentSuperVersionNumber(uint64_t* value,
+ DBImpl* /*db*/,
+ Version* /*version*/) {
+ *value = cfd_->GetSuperVersionNumber();
+ return true;
+}
+
+bool InternalStats::HandleIsFileDeletionsEnabled(uint64_t* value, DBImpl* db,
+ Version* /*version*/) {
+ *value = db->IsFileDeletionsEnabled() ? 1 : 0;
+ return true;
+}
+
+bool InternalStats::HandleBaseLevel(uint64_t* value, DBImpl* /*db*/,
+ Version* /*version*/) {
+ const auto* vstorage = cfd_->current()->storage_info();
+ *value = vstorage->base_level();
+ return true;
+}
+
+bool InternalStats::HandleTotalSstFilesSize(uint64_t* value, DBImpl* /*db*/,
+ Version* /*version*/) {
+ *value = cfd_->GetTotalSstFilesSize();
+ return true;
+}
+
+bool InternalStats::HandleLiveSstFilesSize(uint64_t* value, DBImpl* /*db*/,
+ Version* /*version*/) {
+ *value = cfd_->GetLiveSstFilesSize();
+ return true;
+}
+
+bool InternalStats::HandleEstimatePendingCompactionBytes(uint64_t* value,
+ DBImpl* /*db*/,
+ Version* /*version*/) {
+ const auto* vstorage = cfd_->current()->storage_info();
+ *value = vstorage->estimated_compaction_needed_bytes();
+ return true;
+}
+
+bool InternalStats::HandleEstimateTableReadersMem(uint64_t* value,
+ DBImpl* /*db*/,
+ Version* version) {
+ *value = (version == nullptr) ? 0 : version->GetMemoryUsageByTableReaders();
+ return true;
+}
+
+bool InternalStats::HandleEstimateLiveDataSize(uint64_t* value, DBImpl* /*db*/,
+ Version* version) {
+ const auto* vstorage = version->storage_info();
+ *value = vstorage->EstimateLiveDataSize();
+ return true;
+}
+
+bool InternalStats::HandleMinLogNumberToKeep(uint64_t* value, DBImpl* db,
+ Version* /*version*/) {
+ *value = db->MinLogNumberToKeep();
+ return true;
+}
+
+bool InternalStats::HandleMinObsoleteSstNumberToKeep(uint64_t* value,
+ DBImpl* db,
+ Version* /*version*/) {
+ *value = db->MinObsoleteSstNumberToKeep();
+ return true;
+}
+
+bool InternalStats::HandleActualDelayedWriteRate(uint64_t* value, DBImpl* db,
+ Version* /*version*/) {
+ const WriteController& wc = db->write_controller();
+ if (!wc.NeedsDelay()) {
+ *value = 0;
+ } else {
+ *value = wc.delayed_write_rate();
+ }
+ return true;
+}
+
+bool InternalStats::HandleIsWriteStopped(uint64_t* value, DBImpl* db,
+ Version* /*version*/) {
+ *value = db->write_controller().IsStopped() ? 1 : 0;
+ return true;
+}
+
+bool InternalStats::HandleEstimateOldestKeyTime(uint64_t* value, DBImpl* /*db*/,
+ Version* /*version*/) {
+  // TODO(yiwu): The property is currently only available for FIFO compaction
+ // with allow_compaction = false. This is because we don't propagate
+ // oldest_key_time on compaction.
+ if (cfd_->ioptions()->compaction_style != kCompactionStyleFIFO ||
+ cfd_->GetCurrentMutableCFOptions()
+ ->compaction_options_fifo.allow_compaction) {
+ return false;
+ }
+
+ TablePropertiesCollection collection;
+ auto s = cfd_->current()->GetPropertiesOfAllTables(&collection);
+ if (!s.ok()) {
+ return false;
+ }
+ *value = std::numeric_limits<uint64_t>::max();
+ for (auto& p : collection) {
+ *value = std::min(*value, p.second->oldest_key_time);
+ if (*value == 0) {
+ break;
+ }
+ }
+ if (*value > 0) {
+ *value = std::min({cfd_->mem()->ApproximateOldestKeyTime(),
+ cfd_->imm()->ApproximateOldestKeyTime(), *value});
+ }
+ return *value > 0 && *value < std::numeric_limits<uint64_t>::max();
+}
+
+Cache* InternalStats::GetBlockCacheForStats() {
+ auto* table_factory = cfd_->ioptions()->table_factory.get();
+ assert(table_factory != nullptr);
+ return table_factory->GetOptions<Cache>(TableFactory::kBlockCacheOpts());
+}
+
+bool InternalStats::HandleBlockCacheCapacity(uint64_t* value, DBImpl* /*db*/,
+ Version* /*version*/) {
+ Cache* block_cache = GetBlockCacheForStats();
+ if (block_cache) {
+ *value = static_cast<uint64_t>(block_cache->GetCapacity());
+ return true;
+ }
+ return false;
+}
+
+bool InternalStats::HandleBlockCacheUsage(uint64_t* value, DBImpl* /*db*/,
+ Version* /*version*/) {
+ Cache* block_cache = GetBlockCacheForStats();
+ if (block_cache) {
+ *value = static_cast<uint64_t>(block_cache->GetUsage());
+ return true;
+ }
+ return false;
+}
+
+bool InternalStats::HandleBlockCachePinnedUsage(uint64_t* value, DBImpl* /*db*/,
+ Version* /*version*/) {
+ Cache* block_cache = GetBlockCacheForStats();
+ if (block_cache) {
+ *value = static_cast<uint64_t>(block_cache->GetPinnedUsage());
+ return true;
+ }
+ return false;
+}
+
+void InternalStats::DumpDBMapStats(
+ std::map<std::string, std::string>* db_stats) {
+ for (int i = 0; i < static_cast<int>(kIntStatsNumMax); ++i) {
+ InternalDBStatsType type = static_cast<InternalDBStatsType>(i);
+ (*db_stats)[db_stats_type_to_info.at(type).property_name] =
+ std::to_string(GetDBStats(type));
+ }
+ double seconds_up = (clock_->NowMicros() - started_at_) / kMicrosInSec;
+ (*db_stats)["db.uptime"] = std::to_string(seconds_up);
+}
+
+void InternalStats::DumpDBStats(std::string* value) {
+ char buf[1000];
+ // DB-level stats, only available from default column family
+ double seconds_up = (clock_->NowMicros() - started_at_) / kMicrosInSec;
+ double interval_seconds_up = seconds_up - db_stats_snapshot_.seconds_up;
+ snprintf(buf, sizeof(buf),
+ "\n** DB Stats **\nUptime(secs): %.1f total, %.1f interval\n",
+ seconds_up, interval_seconds_up);
+ value->append(buf);
+ // Cumulative
+ uint64_t user_bytes_written =
+ GetDBStats(InternalStats::kIntStatsBytesWritten);
+ uint64_t num_keys_written =
+ GetDBStats(InternalStats::kIntStatsNumKeysWritten);
+ uint64_t write_other = GetDBStats(InternalStats::kIntStatsWriteDoneByOther);
+ uint64_t write_self = GetDBStats(InternalStats::kIntStatsWriteDoneBySelf);
+ uint64_t wal_bytes = GetDBStats(InternalStats::kIntStatsWalFileBytes);
+ uint64_t wal_synced = GetDBStats(InternalStats::kIntStatsWalFileSynced);
+ uint64_t write_with_wal = GetDBStats(InternalStats::kIntStatsWriteWithWal);
+ uint64_t write_stall_micros =
+ GetDBStats(InternalStats::kIntStatsWriteStallMicros);
+
+ const int kHumanMicrosLen = 32;
+ char human_micros[kHumanMicrosLen];
+
+ // Data
+ // writes: total number of write requests.
+ // keys: total number of key updates issued by all the write requests
+ // commit groups: number of group commits issued to the DB. Each group can
+ // contain one or more writes.
+  // so keys/writes is the average number of puts per write request (single
+  // put or multi-put), and writes/groups is the average group commit size.
+ //
+ // The format is the same for interval stats.
+ snprintf(buf, sizeof(buf),
+ "Cumulative writes: %s writes, %s keys, %s commit groups, "
+ "%.1f writes per commit group, ingest: %.2f GB, %.2f MB/s\n",
+ NumberToHumanString(write_other + write_self).c_str(),
+ NumberToHumanString(num_keys_written).c_str(),
+ NumberToHumanString(write_self).c_str(),
+ (write_other + write_self) /
+ std::max(1.0, static_cast<double>(write_self)),
+ user_bytes_written / kGB,
+ user_bytes_written / kMB / std::max(seconds_up, 0.001));
+ value->append(buf);
+ // WAL
+ snprintf(buf, sizeof(buf),
+ "Cumulative WAL: %s writes, %s syncs, "
+ "%.2f writes per sync, written: %.2f GB, %.2f MB/s\n",
+ NumberToHumanString(write_with_wal).c_str(),
+ NumberToHumanString(wal_synced).c_str(),
+ write_with_wal / std::max(1.0, static_cast<double>(wal_synced)),
+ wal_bytes / kGB, wal_bytes / kMB / std::max(seconds_up, 0.001));
+ value->append(buf);
+ // Stall
+ AppendHumanMicros(write_stall_micros, human_micros, kHumanMicrosLen, true);
+ snprintf(buf, sizeof(buf), "Cumulative stall: %s, %.1f percent\n",
+ human_micros,
+ // 10000 = divide by 1M to get secs, then multiply by 100 for pct
+ write_stall_micros / 10000.0 / std::max(seconds_up, 0.001));
+ value->append(buf);
+
+ // Interval
+ uint64_t interval_write_other = write_other - db_stats_snapshot_.write_other;
+ uint64_t interval_write_self = write_self - db_stats_snapshot_.write_self;
+ uint64_t interval_num_keys_written =
+ num_keys_written - db_stats_snapshot_.num_keys_written;
+ snprintf(
+ buf, sizeof(buf),
+ "Interval writes: %s writes, %s keys, %s commit groups, "
+ "%.1f writes per commit group, ingest: %.2f MB, %.2f MB/s\n",
+ NumberToHumanString(interval_write_other + interval_write_self).c_str(),
+ NumberToHumanString(interval_num_keys_written).c_str(),
+ NumberToHumanString(interval_write_self).c_str(),
+ static_cast<double>(interval_write_other + interval_write_self) /
+ std::max(1.0, static_cast<double>(interval_write_self)),
+ (user_bytes_written - db_stats_snapshot_.ingest_bytes) / kMB,
+ (user_bytes_written - db_stats_snapshot_.ingest_bytes) / kMB /
+          std::max(interval_seconds_up, 0.001));
+  value->append(buf);
+
+ uint64_t interval_write_with_wal =
+ write_with_wal - db_stats_snapshot_.write_with_wal;
+ uint64_t interval_wal_synced = wal_synced - db_stats_snapshot_.wal_synced;
+ uint64_t interval_wal_bytes = wal_bytes - db_stats_snapshot_.wal_bytes;
+
+ snprintf(buf, sizeof(buf),
+ "Interval WAL: %s writes, %s syncs, "
+ "%.2f writes per sync, written: %.2f GB, %.2f MB/s\n",
+ NumberToHumanString(interval_write_with_wal).c_str(),
+ NumberToHumanString(interval_wal_synced).c_str(),
+ interval_write_with_wal /
+ std::max(1.0, static_cast<double>(interval_wal_synced)),
+ interval_wal_bytes / kGB,
+ interval_wal_bytes / kMB / std::max(interval_seconds_up, 0.001));
+ value->append(buf);
+
+ // Stall
+ AppendHumanMicros(write_stall_micros - db_stats_snapshot_.write_stall_micros,
+ human_micros, kHumanMicrosLen, true);
+ snprintf(buf, sizeof(buf), "Interval stall: %s, %.1f percent\n", human_micros,
+ // 10000 = divide by 1M to get secs, then multiply by 100 for pct
+ (write_stall_micros - db_stats_snapshot_.write_stall_micros) /
+ 10000.0 / std::max(interval_seconds_up, 0.001));
+ value->append(buf);
+
+ db_stats_snapshot_.seconds_up = seconds_up;
+ db_stats_snapshot_.ingest_bytes = user_bytes_written;
+ db_stats_snapshot_.write_other = write_other;
+ db_stats_snapshot_.write_self = write_self;
+ db_stats_snapshot_.num_keys_written = num_keys_written;
+ db_stats_snapshot_.wal_bytes = wal_bytes;
+ db_stats_snapshot_.wal_synced = wal_synced;
+ db_stats_snapshot_.write_with_wal = write_with_wal;
+ db_stats_snapshot_.write_stall_micros = write_stall_micros;
+}
+
+/**
+ * Dump compaction level stats to a map from stat name, with a "compaction."
+ * prefix, to the stat's value as a double encoded as a string. The level in
+ * the stat name is represented by a prefix "Lx", where "x" is the level
+ * number; a special level "Sum" represents the sum of a stat across all
+ * levels.
+ * The result also contains IO stall counters, whose keys start with
+ * "io_stalls." and whose values are uint64 counters encoded as strings.
+ */
+void InternalStats::DumpCFMapStats(
+ std::map<std::string, std::string>* cf_stats) {
+ const VersionStorageInfo* vstorage = cfd_->current()->storage_info();
+ CompactionStats compaction_stats_sum;
+ std::map<int, std::map<LevelStatType, double>> levels_stats;
+ DumpCFMapStats(vstorage, &levels_stats, &compaction_stats_sum);
+ for (auto const& level_ent : levels_stats) {
+ auto level_str =
+ level_ent.first == -1 ? "Sum" : "L" + std::to_string(level_ent.first);
+ for (auto const& stat_ent : level_ent.second) {
+ auto stat_type = stat_ent.first;
+ auto key_str =
+ "compaction." + level_str + "." +
+ InternalStats::compaction_level_stats.at(stat_type).property_name;
+ (*cf_stats)[key_str] = std::to_string(stat_ent.second);
+ }
+ }
+
+ DumpCFMapStatsIOStalls(cf_stats);
+}
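+
+// Illustrative keys produced above (the per-level stat names come from
+// compaction_level_stats and are elided here as <stat>):
+//   "compaction.L0.<stat>"  -> per-level value, a double encoded as a string
+//   "compaction.Sum.<stat>" -> the same stat summed across all levels
+//   "io_stalls.total_stop"  -> a uint64 counter encoded as a string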
+
+void InternalStats::DumpCFMapStats(
+ const VersionStorageInfo* vstorage,
+ std::map<int, std::map<LevelStatType, double>>* levels_stats,
+ CompactionStats* compaction_stats_sum) {
+ assert(vstorage);
+
+ int num_levels_to_check =
+ (cfd_->ioptions()->compaction_style != kCompactionStyleFIFO)
+ ? vstorage->num_levels() - 1
+ : 1;
+
+  // Compaction scores are sorted by value. Restore them to level order.
+ std::vector<double> compaction_score(number_levels_, 0);
+ for (int i = 0; i < num_levels_to_check; ++i) {
+ compaction_score[vstorage->CompactionScoreLevel(i)] =
+ vstorage->CompactionScore(i);
+ }
+ // Count # of files being compacted for each level
+ std::vector<int> files_being_compacted(number_levels_, 0);
+ for (int level = 0; level < number_levels_; ++level) {
+ for (auto* f : vstorage->LevelFiles(level)) {
+ if (f->being_compacted) {
+ ++files_being_compacted[level];
+ }
+ }
+ }
+
+ int total_files = 0;
+ int total_files_being_compacted = 0;
+ double total_file_size = 0;
+ uint64_t flush_ingest = cf_stats_value_[BYTES_FLUSHED];
+ uint64_t add_file_ingest = cf_stats_value_[BYTES_INGESTED_ADD_FILE];
+ uint64_t curr_ingest = flush_ingest + add_file_ingest;
+ for (int level = 0; level < number_levels_; level++) {
+ int files = vstorage->NumLevelFiles(level);
+ total_files += files;
+ total_files_being_compacted += files_being_compacted[level];
+ if (comp_stats_[level].micros > 0 || comp_stats_[level].cpu_micros > 0 ||
+ files > 0) {
+ compaction_stats_sum->Add(comp_stats_[level]);
+ total_file_size += vstorage->NumLevelBytes(level);
+ uint64_t input_bytes;
+ if (level == 0) {
+ input_bytes = curr_ingest;
+ } else {
+ input_bytes = comp_stats_[level].bytes_read_non_output_levels +
+ comp_stats_[level].bytes_read_blob;
+ }
+ double w_amp =
+ (input_bytes == 0)
+ ? 0.0
+ : static_cast<double>(comp_stats_[level].bytes_written +
+ comp_stats_[level].bytes_written_blob) /
+ input_bytes;
+ std::map<LevelStatType, double> level_stats;
+ PrepareLevelStats(&level_stats, files, files_being_compacted[level],
+ static_cast<double>(vstorage->NumLevelBytes(level)),
+ compaction_score[level], w_amp, comp_stats_[level]);
+ (*levels_stats)[level] = level_stats;
+ }
+ }
+ // Cumulative summary
+ double w_amp = (0 == curr_ingest)
+ ? 0.0
+ : (compaction_stats_sum->bytes_written +
+ compaction_stats_sum->bytes_written_blob) /
+ static_cast<double>(curr_ingest);
+ // Stats summary across levels
+ std::map<LevelStatType, double> sum_stats;
+ PrepareLevelStats(&sum_stats, total_files, total_files_being_compacted,
+ total_file_size, 0, w_amp, *compaction_stats_sum);
+ (*levels_stats)[-1] = sum_stats; // -1 is for the Sum level
+}
+
+void InternalStats::DumpCFMapStatsByPriority(
+ std::map<int, std::map<LevelStatType, double>>* priorities_stats) {
+ for (size_t priority = 0; priority < comp_stats_by_pri_.size(); priority++) {
+ if (comp_stats_by_pri_[priority].micros > 0) {
+ std::map<LevelStatType, double> priority_stats;
+ PrepareLevelStats(&priority_stats, 0 /* num_files */,
+ 0 /* being_compacted */, 0 /* total_file_size */,
+ 0 /* compaction_score */, 0 /* w_amp */,
+ comp_stats_by_pri_[priority]);
+ (*priorities_stats)[static_cast<int>(priority)] = priority_stats;
+ }
+ }
+}
+
+void InternalStats::DumpCFMapStatsIOStalls(
+ std::map<std::string, std::string>* cf_stats) {
+ (*cf_stats)["io_stalls.level0_slowdown"] =
+ std::to_string(cf_stats_count_[L0_FILE_COUNT_LIMIT_SLOWDOWNS]);
+ (*cf_stats)["io_stalls.level0_slowdown_with_compaction"] =
+ std::to_string(cf_stats_count_[LOCKED_L0_FILE_COUNT_LIMIT_SLOWDOWNS]);
+ (*cf_stats)["io_stalls.level0_numfiles"] =
+ std::to_string(cf_stats_count_[L0_FILE_COUNT_LIMIT_STOPS]);
+ (*cf_stats)["io_stalls.level0_numfiles_with_compaction"] =
+ std::to_string(cf_stats_count_[LOCKED_L0_FILE_COUNT_LIMIT_STOPS]);
+ (*cf_stats)["io_stalls.stop_for_pending_compaction_bytes"] =
+ std::to_string(cf_stats_count_[PENDING_COMPACTION_BYTES_LIMIT_STOPS]);
+ (*cf_stats)["io_stalls.slowdown_for_pending_compaction_bytes"] =
+ std::to_string(cf_stats_count_[PENDING_COMPACTION_BYTES_LIMIT_SLOWDOWNS]);
+ (*cf_stats)["io_stalls.memtable_compaction"] =
+ std::to_string(cf_stats_count_[MEMTABLE_LIMIT_STOPS]);
+ (*cf_stats)["io_stalls.memtable_slowdown"] =
+ std::to_string(cf_stats_count_[MEMTABLE_LIMIT_SLOWDOWNS]);
+
+ uint64_t total_stop = cf_stats_count_[L0_FILE_COUNT_LIMIT_STOPS] +
+ cf_stats_count_[PENDING_COMPACTION_BYTES_LIMIT_STOPS] +
+ cf_stats_count_[MEMTABLE_LIMIT_STOPS];
+
+ uint64_t total_slowdown =
+ cf_stats_count_[L0_FILE_COUNT_LIMIT_SLOWDOWNS] +
+ cf_stats_count_[PENDING_COMPACTION_BYTES_LIMIT_SLOWDOWNS] +
+ cf_stats_count_[MEMTABLE_LIMIT_SLOWDOWNS];
+
+ (*cf_stats)["io_stalls.total_stop"] = std::to_string(total_stop);
+ (*cf_stats)["io_stalls.total_slowdown"] = std::to_string(total_slowdown);
+}
+
+void InternalStats::DumpCFStats(std::string* value) {
+ DumpCFStatsNoFileHistogram(/*is_periodic=*/false, value);
+ DumpCFFileHistogram(value);
+}
+
+void InternalStats::DumpCFStatsNoFileHistogram(bool is_periodic,
+ std::string* value) {
+ char buf[2000];
+ // Per-ColumnFamily stats
+ PrintLevelStatsHeader(buf, sizeof(buf), cfd_->GetName(), "Level");
+ value->append(buf);
+
+ // Print stats for each level
+ const VersionStorageInfo* vstorage = cfd_->current()->storage_info();
+ std::map<int, std::map<LevelStatType, double>> levels_stats;
+ CompactionStats compaction_stats_sum;
+ DumpCFMapStats(vstorage, &levels_stats, &compaction_stats_sum);
+ for (int l = 0; l < number_levels_; ++l) {
+ if (levels_stats.find(l) != levels_stats.end()) {
+ PrintLevelStats(buf, sizeof(buf), "L" + std::to_string(l),
+ levels_stats[l]);
+ value->append(buf);
+ }
+ }
+
+ // Print sum of level stats
+ PrintLevelStats(buf, sizeof(buf), "Sum", levels_stats[-1]);
+ value->append(buf);
+
+ uint64_t flush_ingest = cf_stats_value_[BYTES_FLUSHED];
+ uint64_t add_file_ingest = cf_stats_value_[BYTES_INGESTED_ADD_FILE];
+ uint64_t ingest_files_addfile = cf_stats_value_[INGESTED_NUM_FILES_TOTAL];
+ uint64_t ingest_l0_files_addfile =
+ cf_stats_value_[INGESTED_LEVEL0_NUM_FILES_TOTAL];
+ uint64_t ingest_keys_addfile = cf_stats_value_[INGESTED_NUM_KEYS_TOTAL];
+ // Cumulative summary
+ uint64_t total_stall_count =
+ cf_stats_count_[L0_FILE_COUNT_LIMIT_SLOWDOWNS] +
+ cf_stats_count_[L0_FILE_COUNT_LIMIT_STOPS] +
+ cf_stats_count_[PENDING_COMPACTION_BYTES_LIMIT_SLOWDOWNS] +
+ cf_stats_count_[PENDING_COMPACTION_BYTES_LIMIT_STOPS] +
+ cf_stats_count_[MEMTABLE_LIMIT_STOPS] +
+ cf_stats_count_[MEMTABLE_LIMIT_SLOWDOWNS];
+ // Interval summary
+ uint64_t interval_flush_ingest =
+ flush_ingest - cf_stats_snapshot_.ingest_bytes_flush;
+  uint64_t interval_add_file_ingest =
+      add_file_ingest - cf_stats_snapshot_.ingest_bytes_addfile;
+  uint64_t interval_ingest =
+      interval_flush_ingest + interval_add_file_ingest + 1;
+ CompactionStats interval_stats(compaction_stats_sum);
+ interval_stats.Subtract(cf_stats_snapshot_.comp_stats);
+ double w_amp =
+ (interval_stats.bytes_written + interval_stats.bytes_written_blob) /
+ static_cast<double>(interval_ingest);
+ PrintLevelStats(buf, sizeof(buf), "Int", 0, 0, 0, 0, w_amp, interval_stats);
+ value->append(buf);
+
+ PrintLevelStatsHeader(buf, sizeof(buf), cfd_->GetName(), "Priority");
+ value->append(buf);
+ std::map<int, std::map<LevelStatType, double>> priorities_stats;
+ DumpCFMapStatsByPriority(&priorities_stats);
+ for (size_t priority = 0; priority < comp_stats_by_pri_.size(); ++priority) {
+ if (priorities_stats.find(static_cast<int>(priority)) !=
+ priorities_stats.end()) {
+ PrintLevelStats(
+ buf, sizeof(buf),
+ Env::PriorityToString(static_cast<Env::Priority>(priority)),
+ priorities_stats[static_cast<int>(priority)]);
+ value->append(buf);
+ }
+ }
+
+ const auto blob_st = vstorage->GetBlobStats();
+
+ snprintf(buf, sizeof(buf),
+ "\nBlob file count: %" ROCKSDB_PRIszt
+ ", total size: %.1f GB, garbage size: %.1f GB, space amp: %.1f\n\n",
+ vstorage->GetBlobFiles().size(), blob_st.total_file_size / kGB,
+ blob_st.total_garbage_size / kGB, blob_st.space_amp);
+ value->append(buf);
+
+ uint64_t now_micros = clock_->NowMicros();
+ double seconds_up = (now_micros - started_at_) / kMicrosInSec;
+ double interval_seconds_up = seconds_up - cf_stats_snapshot_.seconds_up;
+ snprintf(buf, sizeof(buf), "Uptime(secs): %.1f total, %.1f interval\n",
+ seconds_up, interval_seconds_up);
+ value->append(buf);
+ snprintf(buf, sizeof(buf), "Flush(GB): cumulative %.3f, interval %.3f\n",
+ flush_ingest / kGB, interval_flush_ingest / kGB);
+ value->append(buf);
+ snprintf(buf, sizeof(buf), "AddFile(GB): cumulative %.3f, interval %.3f\n",
+           add_file_ingest / kGB, interval_add_file_ingest / kGB);
+ value->append(buf);
+
+ uint64_t interval_ingest_files_addfile =
+ ingest_files_addfile - cf_stats_snapshot_.ingest_files_addfile;
+ snprintf(buf, sizeof(buf),
+ "AddFile(Total Files): cumulative %" PRIu64 ", interval %" PRIu64
+ "\n",
+ ingest_files_addfile, interval_ingest_files_addfile);
+ value->append(buf);
+
+ uint64_t interval_ingest_l0_files_addfile =
+ ingest_l0_files_addfile - cf_stats_snapshot_.ingest_l0_files_addfile;
+ snprintf(buf, sizeof(buf),
+ "AddFile(L0 Files): cumulative %" PRIu64 ", interval %" PRIu64 "\n",
+ ingest_l0_files_addfile, interval_ingest_l0_files_addfile);
+ value->append(buf);
+
+ uint64_t interval_ingest_keys_addfile =
+ ingest_keys_addfile - cf_stats_snapshot_.ingest_keys_addfile;
+ snprintf(buf, sizeof(buf),
+ "AddFile(Keys): cumulative %" PRIu64 ", interval %" PRIu64 "\n",
+ ingest_keys_addfile, interval_ingest_keys_addfile);
+ value->append(buf);
+
+ // Compact
+ uint64_t compact_bytes_read = 0;
+ uint64_t compact_bytes_write = 0;
+ uint64_t compact_micros = 0;
+ for (int level = 0; level < number_levels_; level++) {
+ compact_bytes_read += comp_stats_[level].bytes_read_output_level +
+ comp_stats_[level].bytes_read_non_output_levels +
+ comp_stats_[level].bytes_read_blob;
+ compact_bytes_write += comp_stats_[level].bytes_written +
+ comp_stats_[level].bytes_written_blob;
+ compact_micros += comp_stats_[level].micros;
+ }
+
+ snprintf(buf, sizeof(buf),
+ "Cumulative compaction: %.2f GB write, %.2f MB/s write, "
+ "%.2f GB read, %.2f MB/s read, %.1f seconds\n",
+ compact_bytes_write / kGB,
+ compact_bytes_write / kMB / std::max(seconds_up, 0.001),
+ compact_bytes_read / kGB,
+ compact_bytes_read / kMB / std::max(seconds_up, 0.001),
+ compact_micros / kMicrosInSec);
+ value->append(buf);
+
+ // Compaction interval
+ uint64_t interval_compact_bytes_write =
+ compact_bytes_write - cf_stats_snapshot_.compact_bytes_write;
+ uint64_t interval_compact_bytes_read =
+ compact_bytes_read - cf_stats_snapshot_.compact_bytes_read;
+ uint64_t interval_compact_micros =
+ compact_micros - cf_stats_snapshot_.compact_micros;
+
+ snprintf(
+ buf, sizeof(buf),
+ "Interval compaction: %.2f GB write, %.2f MB/s write, "
+ "%.2f GB read, %.2f MB/s read, %.1f seconds\n",
+ interval_compact_bytes_write / kGB,
+ interval_compact_bytes_write / kMB / std::max(interval_seconds_up, 0.001),
+ interval_compact_bytes_read / kGB,
+ interval_compact_bytes_read / kMB / std::max(interval_seconds_up, 0.001),
+ interval_compact_micros / kMicrosInSec);
+ value->append(buf);
+ if (is_periodic) {
+ cf_stats_snapshot_.compact_bytes_write = compact_bytes_write;
+ cf_stats_snapshot_.compact_bytes_read = compact_bytes_read;
+ cf_stats_snapshot_.compact_micros = compact_micros;
+ }
+
+ snprintf(buf, sizeof(buf),
+ "Stalls(count): %" PRIu64
+ " level0_slowdown, "
+ "%" PRIu64
+ " level0_slowdown_with_compaction, "
+ "%" PRIu64
+ " level0_numfiles, "
+ "%" PRIu64
+ " level0_numfiles_with_compaction, "
+ "%" PRIu64
+ " stop for pending_compaction_bytes, "
+ "%" PRIu64
+ " slowdown for pending_compaction_bytes, "
+ "%" PRIu64
+ " memtable_compaction, "
+ "%" PRIu64
+ " memtable_slowdown, "
+ "interval %" PRIu64 " total count\n",
+ cf_stats_count_[L0_FILE_COUNT_LIMIT_SLOWDOWNS],
+ cf_stats_count_[LOCKED_L0_FILE_COUNT_LIMIT_SLOWDOWNS],
+ cf_stats_count_[L0_FILE_COUNT_LIMIT_STOPS],
+ cf_stats_count_[LOCKED_L0_FILE_COUNT_LIMIT_STOPS],
+ cf_stats_count_[PENDING_COMPACTION_BYTES_LIMIT_STOPS],
+ cf_stats_count_[PENDING_COMPACTION_BYTES_LIMIT_SLOWDOWNS],
+ cf_stats_count_[MEMTABLE_LIMIT_STOPS],
+ cf_stats_count_[MEMTABLE_LIMIT_SLOWDOWNS],
+ total_stall_count - cf_stats_snapshot_.stall_count);
+ value->append(buf);
+
+ if (is_periodic) {
+ cf_stats_snapshot_.seconds_up = seconds_up;
+ cf_stats_snapshot_.ingest_bytes_flush = flush_ingest;
+ cf_stats_snapshot_.ingest_bytes_addfile = add_file_ingest;
+ cf_stats_snapshot_.ingest_files_addfile = ingest_files_addfile;
+ cf_stats_snapshot_.ingest_l0_files_addfile = ingest_l0_files_addfile;
+ cf_stats_snapshot_.ingest_keys_addfile = ingest_keys_addfile;
+ cf_stats_snapshot_.comp_stats = compaction_stats_sum;
+ cf_stats_snapshot_.stall_count = total_stall_count;
+ }
+
+  // Do not gather cache entry stats during CFStats because the DB mutex is
+  // held. Only dump the last cached collection (rely on the DB periodic
+  // stats dump to keep it updated).
+ if (cache_entry_stats_collector_) {
+ CacheEntryRoleStats stats;
+ // thread safe
+ cache_entry_stats_collector_->GetStats(&stats);
+
+ constexpr uint64_t kDayInMicros = uint64_t{86400} * 1000000U;
+
+ // Skip if stats are extremely old (> 1 day, incl not yet populated)
+ if (now_micros - stats.last_end_time_micros_ < kDayInMicros) {
+ value->append(stats.ToString(clock_));
+ }
+ }
+}
+
+void InternalStats::DumpCFFileHistogram(std::string* value) {
+ assert(value);
+ assert(cfd_);
+
+ std::ostringstream oss;
+ oss << "\n** File Read Latency Histogram By Level [" << cfd_->GetName()
+ << "] **\n";
+
+ for (int level = 0; level < number_levels_; level++) {
+ if (!file_read_latency_[level].Empty()) {
+ oss << "** Level " << level << " read latency histogram (micros):\n"
+ << file_read_latency_[level].ToString() << '\n';
+ }
+ }
+
+ if (!blob_file_read_latency_.Empty()) {
+ oss << "** Blob file read latency histogram (micros):\n"
+ << blob_file_read_latency_.ToString() << '\n';
+ }
+
+ value->append(oss.str());
+}
+
+#else
+
+const DBPropertyInfo* GetPropertyInfo(const Slice& /*property*/) {
+ return nullptr;
+}
+
+#endif // !ROCKSDB_LITE
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/internal_stats.h b/src/rocksdb/db/internal_stats.h
new file mode 100644
index 000000000..b0cd5899b
--- /dev/null
+++ b/src/rocksdb/db/internal_stats.h
@@ -0,0 +1,996 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+
+#pragma once
+
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "cache/cache_entry_roles.h"
+#include "db/version_set.h"
+#include "rocksdb/system_clock.h"
+#include "util/hash_containers.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+template <class Stats>
+class CacheEntryStatsCollector;
+class DBImpl;
+class MemTableList;
+
+// Config for retrieving a property's value.
+struct DBPropertyInfo {
+ bool need_out_of_mutex;
+
+  // gcc had an internal error when initializing a union of pointer-to-member-
+  // functions, so separate members are used instead; only the handler(s)
+  // applicable to a property are populated with non-nullptr values.
+
+ // @param value Value-result argument for storing the property's string value
+ // @param suffix Argument portion of the property. For example, suffix would
+ // be "5" for the property "rocksdb.num-files-at-level5". So far, only
+ // certain string properties take an argument.
+ bool (InternalStats::*handle_string)(std::string* value, Slice suffix);
+
+ // @param value Value-result argument for storing the property's uint64 value
+ // @param db Many of the int properties rely on DBImpl methods.
+ // @param version Version is needed in case the property is retrieved without
+ // holding db mutex, which is only supported for int properties.
+ bool (InternalStats::*handle_int)(uint64_t* value, DBImpl* db,
+ Version* version);
+
+ // @param props Map of general properties to populate
+ // @param suffix Argument portion of the property. (see handle_string)
+ bool (InternalStats::*handle_map)(std::map<std::string, std::string>* props,
+ Slice suffix);
+
+  // Handles string-type properties that rely on DBImpl methods.
+ // @param value Value-result argument for storing the property's string value
+ bool (DBImpl::*handle_string_dbimpl)(std::string* value);
+};
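+
+// A minimal usage sketch of the property machinery from the public API side
+// (assuming the standard DB::GetProperty / DB::GetIntProperty /
+// DB::GetMapProperty entry points; error handling omitted):
+//
+//   std::string files;
+//   db->GetProperty("rocksdb.num-files-at-level0", &files);            // handle_string
+//   uint64_t num_keys = 0;
+//   db->GetIntProperty(DB::Properties::kEstimateNumKeys, &num_keys);   // handle_int
+//   std::map<std::string, std::string> cf_stats;
+//   db->GetMapProperty(DB::Properties::kCFStats, &cf_stats);           // handle_map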
+
+extern const DBPropertyInfo* GetPropertyInfo(const Slice& property);
+
+#ifndef ROCKSDB_LITE
+#undef SCORE
+enum class LevelStatType {
+ INVALID = 0,
+ NUM_FILES,
+ COMPACTED_FILES,
+ SIZE_BYTES,
+ SCORE,
+ READ_GB,
+ RN_GB,
+ RNP1_GB,
+ WRITE_GB,
+ W_NEW_GB,
+ MOVED_GB,
+ WRITE_AMP,
+ READ_MBPS,
+ WRITE_MBPS,
+ COMP_SEC,
+ COMP_CPU_SEC,
+ COMP_COUNT,
+ AVG_SEC,
+ KEY_IN,
+ KEY_DROP,
+ R_BLOB_GB,
+ W_BLOB_GB,
+ TOTAL // total number of types
+};
+
+struct LevelStat {
+  // This is what will appear as L?.property_name in the flat map returned to
+  // the user
+  std::string property_name;
+  // This is what will be printed as the header in the CLI output
+ std::string header_name;
+};
+
+struct DBStatInfo {
+  // This is what will appear as property_name in the flat map returned to the user
+ std::string property_name;
+};
+
+class InternalStats {
+ public:
+ static const std::map<LevelStatType, LevelStat> compaction_level_stats;
+
+ enum InternalCFStatsType {
+ L0_FILE_COUNT_LIMIT_SLOWDOWNS,
+ LOCKED_L0_FILE_COUNT_LIMIT_SLOWDOWNS,
+ MEMTABLE_LIMIT_STOPS,
+ MEMTABLE_LIMIT_SLOWDOWNS,
+ L0_FILE_COUNT_LIMIT_STOPS,
+ LOCKED_L0_FILE_COUNT_LIMIT_STOPS,
+ PENDING_COMPACTION_BYTES_LIMIT_SLOWDOWNS,
+ PENDING_COMPACTION_BYTES_LIMIT_STOPS,
+ WRITE_STALLS_ENUM_MAX,
+ BYTES_FLUSHED,
+ BYTES_INGESTED_ADD_FILE,
+ INGESTED_NUM_FILES_TOTAL,
+ INGESTED_LEVEL0_NUM_FILES_TOTAL,
+ INGESTED_NUM_KEYS_TOTAL,
+ INTERNAL_CF_STATS_ENUM_MAX,
+ };
+
+ enum InternalDBStatsType {
+ kIntStatsWalFileBytes,
+ kIntStatsWalFileSynced,
+ kIntStatsBytesWritten,
+ kIntStatsNumKeysWritten,
+ kIntStatsWriteDoneByOther,
+ kIntStatsWriteDoneBySelf,
+ kIntStatsWriteWithWal,
+ kIntStatsWriteStallMicros,
+ kIntStatsNumMax,
+ };
+
+ static const std::map<InternalDBStatsType, DBStatInfo> db_stats_type_to_info;
+
+ InternalStats(int num_levels, SystemClock* clock, ColumnFamilyData* cfd);
+
+ // Per level compaction stats
+ struct CompactionOutputsStats {
+ uint64_t num_output_records = 0;
+ uint64_t bytes_written = 0;
+ uint64_t bytes_written_blob = 0;
+ uint64_t num_output_files = 0;
+ uint64_t num_output_files_blob = 0;
+
+ void Add(const CompactionOutputsStats& stats) {
+ this->num_output_records += stats.num_output_records;
+ this->bytes_written += stats.bytes_written;
+ this->bytes_written_blob += stats.bytes_written_blob;
+ this->num_output_files += stats.num_output_files;
+ this->num_output_files_blob += stats.num_output_files_blob;
+ }
+ };
+
+ // Per level compaction stats. comp_stats_[level] stores the stats for
+ // compactions that produced data for the specified "level".
+ struct CompactionStats {
+ uint64_t micros;
+ uint64_t cpu_micros;
+
+ // The number of bytes read from all non-output levels (table files)
+ uint64_t bytes_read_non_output_levels;
+
+ // The number of bytes read from the compaction output level (table files)
+ uint64_t bytes_read_output_level;
+
+ // The number of bytes read from blob files
+ uint64_t bytes_read_blob;
+
+ // Total number of bytes written to table files during compaction
+ uint64_t bytes_written;
+
+ // Total number of bytes written to blob files during compaction
+ uint64_t bytes_written_blob;
+
+ // Total number of bytes moved to the output level (table files)
+ uint64_t bytes_moved;
+
+ // The number of compaction input files in all non-output levels (table
+ // files)
+ int num_input_files_in_non_output_levels;
+
+ // The number of compaction input files in the output level (table files)
+ int num_input_files_in_output_level;
+
+ // The number of compaction output files (table files)
+ int num_output_files;
+
+ // The number of compaction output files (blob files)
+ int num_output_files_blob;
+
+ // Total incoming entries during compaction between levels N and N+1
+ uint64_t num_input_records;
+
+ // Accumulated difference in the number of entries
+ // (num input entries - num output entries) for compactions between levels N and N+1
+ uint64_t num_dropped_records;
+
+ // Total output entries from compaction
+ uint64_t num_output_records;
+
+ // Number of compactions done
+ int count;
+
+ // Number of compactions done per CompactionReason
+ int counts[static_cast<int>(CompactionReason::kNumOfReasons)]{};
+
+ explicit CompactionStats()
+ : micros(0),
+ cpu_micros(0),
+ bytes_read_non_output_levels(0),
+ bytes_read_output_level(0),
+ bytes_read_blob(0),
+ bytes_written(0),
+ bytes_written_blob(0),
+ bytes_moved(0),
+ num_input_files_in_non_output_levels(0),
+ num_input_files_in_output_level(0),
+ num_output_files(0),
+ num_output_files_blob(0),
+ num_input_records(0),
+ num_dropped_records(0),
+ num_output_records(0),
+ count(0) {
+ int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
+ for (int i = 0; i < num_of_reasons; i++) {
+ counts[i] = 0;
+ }
+ }
+
+ explicit CompactionStats(CompactionReason reason, int c)
+ : micros(0),
+ cpu_micros(0),
+ bytes_read_non_output_levels(0),
+ bytes_read_output_level(0),
+ bytes_read_blob(0),
+ bytes_written(0),
+ bytes_written_blob(0),
+ bytes_moved(0),
+ num_input_files_in_non_output_levels(0),
+ num_input_files_in_output_level(0),
+ num_output_files(0),
+ num_output_files_blob(0),
+ num_input_records(0),
+ num_dropped_records(0),
+ num_output_records(0),
+ count(c) {
+ int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
+ for (int i = 0; i < num_of_reasons; i++) {
+ counts[i] = 0;
+ }
+ int r = static_cast<int>(reason);
+ if (r >= 0 && r < num_of_reasons) {
+ counts[r] = c;
+ } else {
+ count = 0;
+ }
+ }
+
+ CompactionStats(const CompactionStats& c)
+ : micros(c.micros),
+ cpu_micros(c.cpu_micros),
+ bytes_read_non_output_levels(c.bytes_read_non_output_levels),
+ bytes_read_output_level(c.bytes_read_output_level),
+ bytes_read_blob(c.bytes_read_blob),
+ bytes_written(c.bytes_written),
+ bytes_written_blob(c.bytes_written_blob),
+ bytes_moved(c.bytes_moved),
+ num_input_files_in_non_output_levels(
+ c.num_input_files_in_non_output_levels),
+ num_input_files_in_output_level(c.num_input_files_in_output_level),
+ num_output_files(c.num_output_files),
+ num_output_files_blob(c.num_output_files_blob),
+ num_input_records(c.num_input_records),
+ num_dropped_records(c.num_dropped_records),
+ num_output_records(c.num_output_records),
+ count(c.count) {
+ int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
+ for (int i = 0; i < num_of_reasons; i++) {
+ counts[i] = c.counts[i];
+ }
+ }
+
+ CompactionStats& operator=(const CompactionStats& c) {
+ micros = c.micros;
+ cpu_micros = c.cpu_micros;
+ bytes_read_non_output_levels = c.bytes_read_non_output_levels;
+ bytes_read_output_level = c.bytes_read_output_level;
+ bytes_read_blob = c.bytes_read_blob;
+ bytes_written = c.bytes_written;
+ bytes_written_blob = c.bytes_written_blob;
+ bytes_moved = c.bytes_moved;
+ num_input_files_in_non_output_levels =
+ c.num_input_files_in_non_output_levels;
+ num_input_files_in_output_level = c.num_input_files_in_output_level;
+ num_output_files = c.num_output_files;
+ num_output_files_blob = c.num_output_files_blob;
+ num_input_records = c.num_input_records;
+ num_dropped_records = c.num_dropped_records;
+ num_output_records = c.num_output_records;
+ count = c.count;
+
+ int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
+ for (int i = 0; i < num_of_reasons; i++) {
+ counts[i] = c.counts[i];
+ }
+ return *this;
+ }
+
+ void Clear() {
+ this->micros = 0;
+ this->cpu_micros = 0;
+ this->bytes_read_non_output_levels = 0;
+ this->bytes_read_output_level = 0;
+ this->bytes_read_blob = 0;
+ this->bytes_written = 0;
+ this->bytes_written_blob = 0;
+ this->bytes_moved = 0;
+ this->num_input_files_in_non_output_levels = 0;
+ this->num_input_files_in_output_level = 0;
+ this->num_output_files = 0;
+ this->num_output_files_blob = 0;
+ this->num_input_records = 0;
+ this->num_dropped_records = 0;
+ this->num_output_records = 0;
+ this->count = 0;
+ int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
+ for (int i = 0; i < num_of_reasons; i++) {
+ counts[i] = 0;
+ }
+ }
+
+ void Add(const CompactionStats& c) {
+ this->micros += c.micros;
+ this->cpu_micros += c.cpu_micros;
+ this->bytes_read_non_output_levels += c.bytes_read_non_output_levels;
+ this->bytes_read_output_level += c.bytes_read_output_level;
+ this->bytes_read_blob += c.bytes_read_blob;
+ this->bytes_written += c.bytes_written;
+ this->bytes_written_blob += c.bytes_written_blob;
+ this->bytes_moved += c.bytes_moved;
+ this->num_input_files_in_non_output_levels +=
+ c.num_input_files_in_non_output_levels;
+ this->num_input_files_in_output_level +=
+ c.num_input_files_in_output_level;
+ this->num_output_files += c.num_output_files;
+ this->num_output_files_blob += c.num_output_files_blob;
+ this->num_input_records += c.num_input_records;
+ this->num_dropped_records += c.num_dropped_records;
+ this->num_output_records += c.num_output_records;
+ this->count += c.count;
+ int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
+ for (int i = 0; i < num_of_reasons; i++) {
+ counts[i] += c.counts[i];
+ }
+ }
+
+ void Add(const CompactionOutputsStats& stats) {
+ this->num_output_files += static_cast<int>(stats.num_output_files);
+ this->num_output_records += stats.num_output_records;
+ this->bytes_written += stats.bytes_written;
+ this->bytes_written_blob += stats.bytes_written_blob;
+ this->num_output_files_blob +=
+ static_cast<int>(stats.num_output_files_blob);
+ }
+
+ void Subtract(const CompactionStats& c) {
+ this->micros -= c.micros;
+ this->cpu_micros -= c.cpu_micros;
+ this->bytes_read_non_output_levels -= c.bytes_read_non_output_levels;
+ this->bytes_read_output_level -= c.bytes_read_output_level;
+ this->bytes_read_blob -= c.bytes_read_blob;
+ this->bytes_written -= c.bytes_written;
+ this->bytes_written_blob -= c.bytes_written_blob;
+ this->bytes_moved -= c.bytes_moved;
+ this->num_input_files_in_non_output_levels -=
+ c.num_input_files_in_non_output_levels;
+ this->num_input_files_in_output_level -=
+ c.num_input_files_in_output_level;
+ this->num_output_files -= c.num_output_files;
+ this->num_output_files_blob -= c.num_output_files_blob;
+ this->num_input_records -= c.num_input_records;
+ this->num_dropped_records -= c.num_dropped_records;
+ this->num_output_records -= c.num_output_records;
+ this->count -= c.count;
+ int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
+ for (int i = 0; i < num_of_reasons; i++) {
+ counts[i] -= c.counts[i];
+ }
+ }
+
+ void ResetCompactionReason(CompactionReason reason) {
+ int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
+ assert(count == 1); // only supports updating one compaction reason
+ for (int i = 0; i < num_of_reasons; i++) {
+ counts[i] = 0;
+ }
+ int r = static_cast<int>(reason);
+ assert(r >= 0 && r < num_of_reasons);
+ counts[r] = 1;
+ }
+ };
+
+ // Compaction stats, for per_key_placement compaction, it includes 2 levels
+ // stats: the last level and the penultimate level.
+ struct CompactionStatsFull {
+ // the stats for the target primary output level
+ CompactionStats stats;
+
+ // stats for the penultimate level output, if it exists
+ bool has_penultimate_level_output = false;
+ CompactionStats penultimate_level_stats;
+
+ explicit CompactionStatsFull() : stats(), penultimate_level_stats() {}
+
+ explicit CompactionStatsFull(CompactionReason reason, int c)
+ : stats(reason, c), penultimate_level_stats(reason, c){};
+
+ uint64_t TotalBytesWritten() const {
+ uint64_t bytes_written = stats.bytes_written + stats.bytes_written_blob;
+ if (has_penultimate_level_output) {
+ bytes_written += penultimate_level_stats.bytes_written +
+ penultimate_level_stats.bytes_written_blob;
+ }
+ return bytes_written;
+ }
+
+ uint64_t DroppedRecords() {
+ uint64_t output_records = stats.num_output_records;
+ if (has_penultimate_level_output) {
+ output_records += penultimate_level_stats.num_output_records;
+ }
+ if (stats.num_input_records > output_records) {
+ return stats.num_input_records - output_records;
+ }
+ return 0;
+ }
+
+ void SetMicros(uint64_t val) {
+ stats.micros = val;
+ penultimate_level_stats.micros = val;
+ }
+
+ void AddCpuMicros(uint64_t val) {
+ stats.cpu_micros += val;
+ penultimate_level_stats.cpu_micros += val;
+ }
+ };
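+
+ // Worked example for DroppedRecords() above (illustrative numbers only):
+ // with 100 input records, 70 primary-output records and 20 penultimate-level
+ // output records (has_penultimate_level_output == true), the dropped count
+ // is 100 - (70 + 20) = 10; if outputs exceeded inputs, it would clamp to 0.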
+
+ // For use with CacheEntryStatsCollector
+ struct CacheEntryRoleStats {
+ uint64_t cache_capacity = 0;
+ uint64_t cache_usage = 0;
+ size_t table_size = 0;
+ size_t occupancy = 0;
+ std::string cache_id;
+ std::array<uint64_t, kNumCacheEntryRoles> total_charges;
+ std::array<size_t, kNumCacheEntryRoles> entry_counts;
+ uint32_t collection_count = 0;
+ uint32_t copies_of_last_collection = 0;
+ uint64_t last_start_time_micros_ = 0;
+ uint64_t last_end_time_micros_ = 0;
+
+ void Clear() {
+ // Wipe everything except collection_count
+ uint32_t saved_collection_count = collection_count;
+ *this = CacheEntryRoleStats();
+ collection_count = saved_collection_count;
+ }
+
+ void BeginCollection(Cache*, SystemClock*, uint64_t start_time_micros);
+ std::function<void(const Slice&, void*, size_t, Cache::DeleterFn)>
+ GetEntryCallback();
+ void EndCollection(Cache*, SystemClock*, uint64_t end_time_micros);
+ void SkippedCollection();
+
+ std::string ToString(SystemClock* clock) const;
+ void ToMap(std::map<std::string, std::string>* values,
+ SystemClock* clock) const;
+
+ private:
+ UnorderedMap<Cache::DeleterFn, CacheEntryRole> role_map_;
+ uint64_t GetLastDurationMicros() const;
+ };
+
+ void Clear() {
+ for (int i = 0; i < kIntStatsNumMax; i++) {
+ db_stats_[i].store(0);
+ }
+ for (int i = 0; i < INTERNAL_CF_STATS_ENUM_MAX; i++) {
+ cf_stats_count_[i] = 0;
+ cf_stats_value_[i] = 0;
+ }
+ for (auto& comp_stat : comp_stats_) {
+ comp_stat.Clear();
+ }
+ per_key_placement_comp_stats_.Clear();
+ for (auto& h : file_read_latency_) {
+ h.Clear();
+ }
+ blob_file_read_latency_.Clear();
+ cf_stats_snapshot_.Clear();
+ db_stats_snapshot_.Clear();
+ bg_error_count_ = 0;
+ started_at_ = clock_->NowMicros();
+ has_cf_change_since_dump_ = true;
+ }
+
+ void AddCompactionStats(int level, Env::Priority thread_pri,
+ const CompactionStats& stats) {
+ comp_stats_[level].Add(stats);
+ comp_stats_by_pri_[thread_pri].Add(stats);
+ }
+
+ void AddCompactionStats(int level, Env::Priority thread_pri,
+ const CompactionStatsFull& comp_stats_full) {
+ AddCompactionStats(level, thread_pri, comp_stats_full.stats);
+ if (comp_stats_full.has_penultimate_level_output) {
+ per_key_placement_comp_stats_.Add(
+ comp_stats_full.penultimate_level_stats);
+ }
+ }
+
+ void IncBytesMoved(int level, uint64_t amount) {
+ comp_stats_[level].bytes_moved += amount;
+ }
+
+ void AddCFStats(InternalCFStatsType type, uint64_t value) {
+ has_cf_change_since_dump_ = true;
+ cf_stats_value_[type] += value;
+ ++cf_stats_count_[type];
+ }
+
+ void AddDBStats(InternalDBStatsType type, uint64_t value,
+ bool concurrent = false) {
+ auto& v = db_stats_[type];
+ if (concurrent) {
+ v.fetch_add(value, std::memory_order_relaxed);
+ } else {
+ v.store(v.load(std::memory_order_relaxed) + value,
+ std::memory_order_relaxed);
+ }
+ }
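+
+ // Illustrative usage of the `concurrent` flag above (the call site is
+ // hypothetical): pass concurrent=true when several threads may bump the
+ // same counter at once, e.g.
+ //   AddDBStats(InternalStats::kIntStatsBytesWritten, n, /*concurrent=*/true);
+ // The default (false) is only safe when updates are already serialized,
+ // since it performs a non-atomic load-then-store of the relaxed atomic.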
+
+ uint64_t GetDBStats(InternalDBStatsType type) {
+ return db_stats_[type].load(std::memory_order_relaxed);
+ }
+
+ HistogramImpl* GetFileReadHist(int level) {
+ return &file_read_latency_[level];
+ }
+
+ HistogramImpl* GetBlobFileReadHist() { return &blob_file_read_latency_; }
+
+ uint64_t GetBackgroundErrorCount() const { return bg_error_count_; }
+
+ uint64_t BumpAndGetBackgroundErrorCount() { return ++bg_error_count_; }
+
+ bool GetStringProperty(const DBPropertyInfo& property_info,
+ const Slice& property, std::string* value);
+
+ bool GetMapProperty(const DBPropertyInfo& property_info,
+ const Slice& property,
+ std::map<std::string, std::string>* value);
+
+ bool GetIntProperty(const DBPropertyInfo& property_info, uint64_t* value,
+ DBImpl* db);
+
+ bool GetIntPropertyOutOfMutex(const DBPropertyInfo& property_info,
+ Version* version, uint64_t* value);
+
+ // Unless there is a recent enough collection of the stats, collect and
+ // save new cache entry stats. If `foreground`, require data to be more
+ // recent to skip re-collection.
+ //
+ // This should only be called while NOT holding the DB mutex.
+ void CollectCacheEntryStats(bool foreground);
+
+ const uint64_t* TEST_GetCFStatsValue() const { return cf_stats_value_; }
+
+ const std::vector<CompactionStats>& TEST_GetCompactionStats() const {
+ return comp_stats_;
+ }
+
+ const CompactionStats& TEST_GetPerKeyPlacementCompactionStats() const {
+ return per_key_placement_comp_stats_;
+ }
+
+ void TEST_GetCacheEntryRoleStats(CacheEntryRoleStats* stats, bool foreground);
+
+ // Store a mapping from the user-facing DB::Properties string to our
+ // DBPropertyInfo struct used internally for retrieving properties.
+ static const UnorderedMap<std::string, DBPropertyInfo> ppt_name_to_info;
+
+ static const std::string kPeriodicCFStats;
+
+ private:
+ void DumpDBMapStats(std::map<std::string, std::string>* db_stats);
+ void DumpDBStats(std::string* value);
+ void DumpCFMapStats(std::map<std::string, std::string>* cf_stats);
+ void DumpCFMapStats(
+ const VersionStorageInfo* vstorage,
+ std::map<int, std::map<LevelStatType, double>>* level_stats,
+ CompactionStats* compaction_stats_sum);
+ void DumpCFMapStatsByPriority(
+ std::map<int, std::map<LevelStatType, double>>* priorities_stats);
+ void DumpCFMapStatsIOStalls(std::map<std::string, std::string>* cf_stats);
+ void DumpCFStats(std::string* value);
+ // if is_periodic = true, it is an internal call by RocksDB periodically to
+ // dump the status.
+ void DumpCFStatsNoFileHistogram(bool is_periodic, std::string* value);
+ // if is_periodic = true, it is an internal call by RocksDB periodically to
+ // dump the status.
+ void DumpCFFileHistogram(std::string* value);
+
+ Cache* GetBlockCacheForStats();
+ Cache* GetBlobCacheForStats();
+
+ // Per-DB stats
+ std::atomic<uint64_t> db_stats_[kIntStatsNumMax];
+ // Per-ColumnFamily stats
+ uint64_t cf_stats_value_[INTERNAL_CF_STATS_ENUM_MAX];
+ uint64_t cf_stats_count_[INTERNAL_CF_STATS_ENUM_MAX];
+ // Initialize/reference the collector in constructor so that we don't need
+ // additional synchronization in InternalStats, relying on synchronization
+ // in CacheEntryStatsCollector::GetStats. This collector is pinned in cache
+ // (through a shared_ptr) so that it does not get immediately ejected from
+ // a full cache, which would force a re-scan on the next GetStats.
+ std::shared_ptr<CacheEntryStatsCollector<CacheEntryRoleStats>>
+ cache_entry_stats_collector_;
+ // Per-ColumnFamily/level compaction stats
+ std::vector<CompactionStats> comp_stats_;
+ std::vector<CompactionStats> comp_stats_by_pri_;
+ CompactionStats per_key_placement_comp_stats_;
+ std::vector<HistogramImpl> file_read_latency_;
+ HistogramImpl blob_file_read_latency_;
+ bool has_cf_change_since_dump_;
+ // Number of consecutive periods with no change since the last time stats
+ // were dumped by a periodic dump.
+ int no_cf_change_period_since_dump_ = 0;
+ uint64_t last_histogram_num = std::numeric_limits<uint64_t>::max();
+ static const int kMaxNoChangePeriodSinceDump;
+
+ // Used to compute per-interval statistics
+ struct CFStatsSnapshot {
+ // ColumnFamily-level stats
+ CompactionStats comp_stats;
+ uint64_t ingest_bytes_flush; // Bytes written to L0 (Flush)
+ uint64_t stall_count; // Stall count
+ // Stats from compaction jobs - bytes written, bytes read, duration.
+ uint64_t compact_bytes_write;
+ uint64_t compact_bytes_read;
+ uint64_t compact_micros;
+ double seconds_up;
+
+ // AddFile specific stats
+ uint64_t ingest_bytes_addfile; // Total Bytes ingested
+ uint64_t ingest_files_addfile; // Total number of files ingested
+ uint64_t ingest_l0_files_addfile; // Total number of files ingested to L0
+ uint64_t ingest_keys_addfile; // Total number of keys ingested
+
+ CFStatsSnapshot()
+ : ingest_bytes_flush(0),
+ stall_count(0),
+ compact_bytes_write(0),
+ compact_bytes_read(0),
+ compact_micros(0),
+ seconds_up(0),
+ ingest_bytes_addfile(0),
+ ingest_files_addfile(0),
+ ingest_l0_files_addfile(0),
+ ingest_keys_addfile(0) {}
+
+ void Clear() {
+ comp_stats.Clear();
+ ingest_bytes_flush = 0;
+ stall_count = 0;
+ compact_bytes_write = 0;
+ compact_bytes_read = 0;
+ compact_micros = 0;
+ seconds_up = 0;
+ ingest_bytes_addfile = 0;
+ ingest_files_addfile = 0;
+ ingest_l0_files_addfile = 0;
+ ingest_keys_addfile = 0;
+ }
+ } cf_stats_snapshot_;
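+
+ // Illustrative note (not from the original header): per-interval values in
+ // the dumped stats are derived as "cumulative - snapshot", e.g. interval
+ // compaction write bytes = the compact_bytes_write accumulated in
+ // comp_stats_ minus cf_stats_snapshot_.compact_bytes_write, after which the
+ // snapshot is presumably refreshed to the current cumulative values.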
+
+ struct DBStatsSnapshot {
+ // DB-level stats
+ uint64_t ingest_bytes; // Bytes written by user
+ uint64_t wal_bytes; // Bytes written to WAL
+ uint64_t wal_synced; // Number of times WAL is synced
+ uint64_t write_with_wal; // Number of writes that request WAL
+ // These count the number of writes processed by the calling thread or
+ // another thread.
+ uint64_t write_other;
+ uint64_t write_self;
+ // Total number of keys written. write_self and write_other measure the
+ // number of write requests written; each write request can contain updates
+ // to multiple keys. num_keys_written is the total number of keys updated by
+ // all those writes.
+ uint64_t num_keys_written;
+ // Total time writes delayed by stalls.
+ uint64_t write_stall_micros;
+ double seconds_up;
+
+ DBStatsSnapshot()
+ : ingest_bytes(0),
+ wal_bytes(0),
+ wal_synced(0),
+ write_with_wal(0),
+ write_other(0),
+ write_self(0),
+ num_keys_written(0),
+ write_stall_micros(0),
+ seconds_up(0) {}
+
+ void Clear() {
+ ingest_bytes = 0;
+ wal_bytes = 0;
+ wal_synced = 0;
+ write_with_wal = 0;
+ write_other = 0;
+ write_self = 0;
+ num_keys_written = 0;
+ write_stall_micros = 0;
+ seconds_up = 0;
+ }
+ } db_stats_snapshot_;
+
+ // Handler functions for getting property values. They use "value" as a value-
+ // result argument, and return true upon successfully setting "value".
+ bool HandleNumFilesAtLevel(std::string* value, Slice suffix);
+ bool HandleCompressionRatioAtLevelPrefix(std::string* value, Slice suffix);
+ bool HandleLevelStats(std::string* value, Slice suffix);
+ bool HandleStats(std::string* value, Slice suffix);
+ bool HandleCFMapStats(std::map<std::string, std::string>* compaction_stats,
+ Slice suffix);
+ bool HandleCFStats(std::string* value, Slice suffix);
+ bool HandleCFStatsNoFileHistogram(std::string* value, Slice suffix);
+ bool HandleCFFileHistogram(std::string* value, Slice suffix);
+ bool HandleCFStatsPeriodic(std::string* value, Slice suffix);
+ bool HandleDBMapStats(std::map<std::string, std::string>* compaction_stats,
+ Slice suffix);
+ bool HandleDBStats(std::string* value, Slice suffix);
+ bool HandleSsTables(std::string* value, Slice suffix);
+ bool HandleAggregatedTableProperties(std::string* value, Slice suffix);
+ bool HandleAggregatedTablePropertiesAtLevel(std::string* value, Slice suffix);
+ bool HandleAggregatedTablePropertiesMap(
+ std::map<std::string, std::string>* values, Slice suffix);
+ bool HandleAggregatedTablePropertiesAtLevelMap(
+ std::map<std::string, std::string>* values, Slice suffix);
+ bool HandleNumImmutableMemTable(uint64_t* value, DBImpl* db,
+ Version* version);
+ bool HandleNumImmutableMemTableFlushed(uint64_t* value, DBImpl* db,
+ Version* version);
+ bool HandleMemTableFlushPending(uint64_t* value, DBImpl* db,
+ Version* version);
+ bool HandleNumRunningFlushes(uint64_t* value, DBImpl* db, Version* version);
+ bool HandleCompactionPending(uint64_t* value, DBImpl* db, Version* version);
+ bool HandleNumRunningCompactions(uint64_t* value, DBImpl* db,
+ Version* version);
+ bool HandleBackgroundErrors(uint64_t* value, DBImpl* db, Version* version);
+ bool HandleCurSizeActiveMemTable(uint64_t* value, DBImpl* db,
+ Version* version);
+ bool HandleCurSizeAllMemTables(uint64_t* value, DBImpl* db, Version* version);
+ bool HandleSizeAllMemTables(uint64_t* value, DBImpl* db, Version* version);
+ bool HandleNumEntriesActiveMemTable(uint64_t* value, DBImpl* db,
+ Version* version);
+ bool HandleNumEntriesImmMemTables(uint64_t* value, DBImpl* db,
+ Version* version);
+ bool HandleNumDeletesActiveMemTable(uint64_t* value, DBImpl* db,
+ Version* version);
+ bool HandleNumDeletesImmMemTables(uint64_t* value, DBImpl* db,
+ Version* version);
+ bool HandleEstimateNumKeys(uint64_t* value, DBImpl* db, Version* version);
+ bool HandleNumSnapshots(uint64_t* value, DBImpl* db, Version* version);
+ bool HandleOldestSnapshotTime(uint64_t* value, DBImpl* db, Version* version);
+ bool HandleOldestSnapshotSequence(uint64_t* value, DBImpl* db,
+ Version* version);
+ bool HandleNumLiveVersions(uint64_t* value, DBImpl* db, Version* version);
+ bool HandleCurrentSuperVersionNumber(uint64_t* value, DBImpl* db,
+ Version* version);
+ bool HandleIsFileDeletionsEnabled(uint64_t* value, DBImpl* db,
+ Version* version);
+ bool HandleBaseLevel(uint64_t* value, DBImpl* db, Version* version);
+ bool HandleTotalSstFilesSize(uint64_t* value, DBImpl* db, Version* version);
+ bool HandleLiveSstFilesSize(uint64_t* value, DBImpl* db, Version* version);
+ bool HandleEstimatePendingCompactionBytes(uint64_t* value, DBImpl* db,
+ Version* version);
+ bool HandleEstimateTableReadersMem(uint64_t* value, DBImpl* db,
+ Version* version);
+ bool HandleEstimateLiveDataSize(uint64_t* value, DBImpl* db,
+ Version* version);
+ bool HandleMinLogNumberToKeep(uint64_t* value, DBImpl* db, Version* version);
+ bool HandleMinObsoleteSstNumberToKeep(uint64_t* value, DBImpl* db,
+ Version* version);
+ bool HandleActualDelayedWriteRate(uint64_t* value, DBImpl* db,
+ Version* version);
+ bool HandleIsWriteStopped(uint64_t* value, DBImpl* db, Version* version);
+ bool HandleEstimateOldestKeyTime(uint64_t* value, DBImpl* db,
+ Version* version);
+ bool HandleBlockCacheCapacity(uint64_t* value, DBImpl* db, Version* version);
+ bool HandleBlockCacheUsage(uint64_t* value, DBImpl* db, Version* version);
+ bool HandleBlockCachePinnedUsage(uint64_t* value, DBImpl* db,
+ Version* version);
+ bool HandleBlockCacheEntryStatsInternal(std::string* value, bool fast);
+ bool HandleBlockCacheEntryStatsMapInternal(
+ std::map<std::string, std::string>* values, bool fast);
+ bool HandleBlockCacheEntryStats(std::string* value, Slice suffix);
+ bool HandleBlockCacheEntryStatsMap(std::map<std::string, std::string>* values,
+ Slice suffix);
+ bool HandleFastBlockCacheEntryStats(std::string* value, Slice suffix);
+ bool HandleFastBlockCacheEntryStatsMap(
+ std::map<std::string, std::string>* values, Slice suffix);
+ bool HandleLiveSstFilesSizeAtTemperature(std::string* value, Slice suffix);
+ bool HandleNumBlobFiles(uint64_t* value, DBImpl* db, Version* version);
+ bool HandleBlobStats(std::string* value, Slice suffix);
+ bool HandleTotalBlobFileSize(uint64_t* value, DBImpl* db, Version* version);
+ bool HandleLiveBlobFileSize(uint64_t* value, DBImpl* db, Version* version);
+ bool HandleLiveBlobFileGarbageSize(uint64_t* value, DBImpl* db,
+ Version* version);
+ bool HandleBlobCacheCapacity(uint64_t* value, DBImpl* db, Version* version);
+ bool HandleBlobCacheUsage(uint64_t* value, DBImpl* db, Version* version);
+ bool HandleBlobCachePinnedUsage(uint64_t* value, DBImpl* db,
+ Version* version);
+
+ // Total number of background errors encountered. Every time a flush task
+ // or compaction task fails, this counter is incremented. The failure can
+ // have any cause, including file system errors, resource exhaustion, or
+ // input file corruption. A failed retry of the same flush or compaction
+ // also increments the counter.
+ uint64_t bg_error_count_;
+
+ const int number_levels_;
+ SystemClock* clock_;
+ ColumnFamilyData* cfd_;
+ uint64_t started_at_;
+};
+
+#else
+
+class InternalStats {
+ public:
+ enum InternalCFStatsType {
+ L0_FILE_COUNT_LIMIT_SLOWDOWNS,
+ LOCKED_L0_FILE_COUNT_LIMIT_SLOWDOWNS,
+ MEMTABLE_LIMIT_STOPS,
+ MEMTABLE_LIMIT_SLOWDOWNS,
+ L0_FILE_COUNT_LIMIT_STOPS,
+ LOCKED_L0_FILE_COUNT_LIMIT_STOPS,
+ PENDING_COMPACTION_BYTES_LIMIT_SLOWDOWNS,
+ PENDING_COMPACTION_BYTES_LIMIT_STOPS,
+ WRITE_STALLS_ENUM_MAX,
+ BYTES_FLUSHED,
+ BYTES_INGESTED_ADD_FILE,
+ INGESTED_NUM_FILES_TOTAL,
+ INGESTED_LEVEL0_NUM_FILES_TOTAL,
+ INGESTED_NUM_KEYS_TOTAL,
+ INTERNAL_CF_STATS_ENUM_MAX,
+ };
+
+ enum InternalDBStatsType {
+ kIntStatsWalFileBytes,
+ kIntStatsWalFileSynced,
+ kIntStatsBytesWritten,
+ kIntStatsNumKeysWritten,
+ kIntStatsWriteDoneByOther,
+ kIntStatsWriteDoneBySelf,
+ kIntStatsWriteWithWal,
+ kIntStatsWriteStallMicros,
+ kIntStatsNumMax,
+ };
+
+ InternalStats(int /*num_levels*/, SystemClock* /*clock*/,
+ ColumnFamilyData* /*cfd*/) {}
+
+ // Per level compaction stats
+ struct CompactionOutputsStats {
+ uint64_t num_output_records = 0;
+ uint64_t bytes_written = 0;
+ uint64_t bytes_written_blob = 0;
+ uint64_t num_output_files = 0;
+ uint64_t num_output_files_blob = 0;
+
+ void Add(const CompactionOutputsStats& stats) {
+ this->num_output_records += stats.num_output_records;
+ this->bytes_written += stats.bytes_written;
+ this->bytes_written_blob += stats.bytes_written_blob;
+ this->num_output_files += stats.num_output_files;
+ this->num_output_files_blob += stats.num_output_files_blob;
+ }
+ };
+
+ struct CompactionStats {
+ uint64_t micros;
+ uint64_t cpu_micros;
+ uint64_t bytes_read_non_output_levels;
+ uint64_t bytes_read_output_level;
+ uint64_t bytes_read_blob;
+ uint64_t bytes_written;
+ uint64_t bytes_written_blob;
+ uint64_t bytes_moved;
+ int num_input_files_in_non_output_levels;
+ int num_input_files_in_output_level;
+ int num_output_files;
+ int num_output_files_blob;
+ uint64_t num_input_records;
+ uint64_t num_dropped_records;
+ uint64_t num_output_records;
+ int count;
+
+ explicit CompactionStats() {}
+
+ explicit CompactionStats(CompactionReason /*reason*/, int /*c*/) {}
+
+ explicit CompactionStats(const CompactionStats& /*c*/) {}
+
+ void Add(const CompactionStats& /*c*/) {}
+
+ void Add(const CompactionOutputsStats& /*c*/) {}
+
+ void Subtract(const CompactionStats& /*c*/) {}
+ };
+
+ struct CompactionStatsFull {
+ // the stats for the target primary output level (per level stats)
+ CompactionStats stats;
+
+ // stats for output_to_penultimate_level level (per level stats)
+ bool has_penultimate_level_output = false;
+ CompactionStats penultimate_level_stats;
+
+ explicit CompactionStatsFull(){};
+
+ explicit CompactionStatsFull(CompactionReason /*reason*/, int /*c*/){};
+
+ uint64_t TotalBytesWritten() const { return 0; }
+
+ uint64_t DroppedRecords() { return 0; }
+
+ void SetMicros(uint64_t /*val*/){};
+
+ void AddCpuMicros(uint64_t /*val*/){};
+ };
+
+ void AddCompactionStats(int /*level*/, Env::Priority /*thread_pri*/,
+ const CompactionStats& /*stats*/) {}
+
+ void AddCompactionStats(int /*level*/, Env::Priority /*thread_pri*/,
+ const CompactionStatsFull& /*unmerged_stats*/) {}
+
+ void IncBytesMoved(int /*level*/, uint64_t /*amount*/) {}
+
+ void AddCFStats(InternalCFStatsType /*type*/, uint64_t /*value*/) {}
+
+ void AddDBStats(InternalDBStatsType /*type*/, uint64_t /*value*/,
+ bool /*concurrent */ = false) {}
+
+ HistogramImpl* GetFileReadHist(int /*level*/) { return nullptr; }
+
+ HistogramImpl* GetBlobFileReadHist() { return nullptr; }
+
+ uint64_t GetBackgroundErrorCount() const { return 0; }
+
+ uint64_t BumpAndGetBackgroundErrorCount() { return 0; }
+
+ bool GetStringProperty(const DBPropertyInfo& /*property_info*/,
+ const Slice& /*property*/, std::string* /*value*/) {
+ return false;
+ }
+
+ bool GetMapProperty(const DBPropertyInfo& /*property_info*/,
+ const Slice& /*property*/,
+ std::map<std::string, std::string>* /*value*/) {
+ return false;
+ }
+
+ bool GetIntProperty(const DBPropertyInfo& /*property_info*/,
+ uint64_t* /*value*/, DBImpl* /*db*/) const {
+ return false;
+ }
+
+ bool GetIntPropertyOutOfMutex(const DBPropertyInfo& /*property_info*/,
+ Version* /*version*/,
+ uint64_t* /*value*/) const {
+ return false;
+ }
+};
+#endif // !ROCKSDB_LITE
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/job_context.h b/src/rocksdb/db/job_context.h
new file mode 100644
index 000000000..352c58e82
--- /dev/null
+++ b/src/rocksdb/db/job_context.h
@@ -0,0 +1,238 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <string>
+#include <vector>
+
+#include "db/column_family.h"
+#include "db/log_writer.h"
+#include "db/version_set.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class MemTable;
+struct SuperVersion;
+
+struct SuperVersionContext {
+ struct WriteStallNotification {
+ WriteStallInfo write_stall_info;
+ const ImmutableOptions* immutable_options;
+ };
+
+ autovector<SuperVersion*> superversions_to_free;
+#ifndef ROCKSDB_DISABLE_STALL_NOTIFICATION
+ autovector<WriteStallNotification> write_stall_notifications;
+#endif
+ std::unique_ptr<SuperVersion>
+ new_superversion; // if nullptr no new superversion
+
+ explicit SuperVersionContext(bool create_superversion = false)
+ : new_superversion(create_superversion ? new SuperVersion() : nullptr) {}
+
+ explicit SuperVersionContext(SuperVersionContext&& other) noexcept
+ : superversions_to_free(std::move(other.superversions_to_free)),
+#ifndef ROCKSDB_DISABLE_STALL_NOTIFICATION
+ write_stall_notifications(std::move(other.write_stall_notifications)),
+#endif
+ new_superversion(std::move(other.new_superversion)) {
+ }
+ // No copies
+ SuperVersionContext(const SuperVersionContext& other) = delete;
+ void operator=(const SuperVersionContext& other) = delete;
+
+ void NewSuperVersion() {
+ new_superversion = std::unique_ptr<SuperVersion>(new SuperVersion());
+ }
+
+ inline bool HaveSomethingToDelete() const {
+#ifndef ROCKSDB_DISABLE_STALL_NOTIFICATION
+ return !superversions_to_free.empty() || !write_stall_notifications.empty();
+#else
+ return !superversions_to_free.empty();
+#endif
+ }
+
+ void PushWriteStallNotification(WriteStallCondition old_cond,
+ WriteStallCondition new_cond,
+ const std::string& name,
+ const ImmutableOptions* ioptions) {
+#if !defined(ROCKSDB_LITE) && !defined(ROCKSDB_DISABLE_STALL_NOTIFICATION)
+ WriteStallNotification notif;
+ notif.write_stall_info.cf_name = name;
+ notif.write_stall_info.condition.prev = old_cond;
+ notif.write_stall_info.condition.cur = new_cond;
+ notif.immutable_options = ioptions;
+ write_stall_notifications.push_back(notif);
+#else
+ (void)old_cond;
+ (void)new_cond;
+ (void)name;
+ (void)ioptions;
+#endif // !defined(ROCKSDB_LITE) &&
+ // !defined(ROCKSDB_DISABLE_STALL_NOTIFICATION)
+ }
+
+ void Clean() {
+#if !defined(ROCKSDB_LITE) && !defined(ROCKSDB_DISABLE_STALL_NOTIFICATION)
+ // notify listeners on changed write stall conditions
+ for (auto& notif : write_stall_notifications) {
+ for (auto& listener : notif.immutable_options->listeners) {
+ listener->OnStallConditionsChanged(notif.write_stall_info);
+ }
+ }
+ write_stall_notifications.clear();
+#endif // !defined(ROCKSDB_LITE) && !defined(ROCKSDB_DISABLE_STALL_NOTIFICATION)
+ // free superversions
+ for (auto s : superversions_to_free) {
+ delete s;
+ }
+ superversions_to_free.clear();
+ }
+
+ ~SuperVersionContext() {
+#ifndef ROCKSDB_DISABLE_STALL_NOTIFICATION
+ assert(write_stall_notifications.empty());
+#endif
+ assert(superversions_to_free.empty());
+ }
+};
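+
+// Illustrative usage sketch for SuperVersionContext (the surrounding code is
+// hypothetical): notifications queued via PushWriteStallNotification() while
+// holding the DB mutex are only delivered to listeners when Clean() is called,
+// which is expected to happen after the mutex has been released, e.g.
+//
+//   SuperVersionContext sv_context(/*create_superversion=*/true);
+//   {
+//     // ... mutate state under the DB mutex, possibly calling
+//     // sv_context.PushWriteStallNotification(old_cond, new_cond, name, opts);
+//   }
+//   sv_context.Clean();  // notify listeners and free superversions, unlocked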
+
+struct JobContext {
+ inline bool HaveSomethingToDelete() const {
+ return !(full_scan_candidate_files.empty() && sst_delete_files.empty() &&
+ blob_delete_files.empty() && log_delete_files.empty() &&
+ manifest_delete_files.empty());
+ }
+
+ inline bool HaveSomethingToClean() const {
+ bool sv_have_sth = false;
+ for (const auto& sv_ctx : superversion_contexts) {
+ if (sv_ctx.HaveSomethingToDelete()) {
+ sv_have_sth = true;
+ break;
+ }
+ }
+ return memtables_to_free.size() > 0 || logs_to_free.size() > 0 ||
+ job_snapshot != nullptr || sv_have_sth;
+ }
+
+ SequenceNumber GetJobSnapshotSequence() const {
+ if (job_snapshot) {
+ assert(job_snapshot->snapshot());
+ return job_snapshot->snapshot()->GetSequenceNumber();
+ }
+ return kMaxSequenceNumber;
+ }
+
+ // Structure to store information for candidate files to delete.
+ struct CandidateFileInfo {
+ std::string file_name;
+ std::string file_path;
+ CandidateFileInfo(std::string name, std::string path)
+ : file_name(std::move(name)), file_path(std::move(path)) {}
+ bool operator==(const CandidateFileInfo& other) const {
+ return file_name == other.file_name && file_path == other.file_path;
+ }
+ };
+
+ // Unique job id
+ int job_id;
+
+ // a list of all files that we'll consider deleting
+ // (every once in a while this is filled up with all files
+ // in the DB directory)
+ // (filled only if we're doing full scan)
+ std::vector<CandidateFileInfo> full_scan_candidate_files;
+
+ // the list of all live sst files that cannot be deleted
+ std::vector<uint64_t> sst_live;
+
+ // the list of sst files that we need to delete
+ std::vector<ObsoleteFileInfo> sst_delete_files;
+
+ // the list of all live blob files that cannot be deleted
+ std::vector<uint64_t> blob_live;
+
+ // the list of blob files that we need to delete
+ std::vector<ObsoleteBlobFileInfo> blob_delete_files;
+
+ // a list of log files that we need to delete
+ std::vector<uint64_t> log_delete_files;
+
+ // a list of log files that we need to preserve during full purge since they
+ // will be reused later
+ std::vector<uint64_t> log_recycle_files;
+
+ // a list of manifest files that we need to delete
+ std::vector<std::string> manifest_delete_files;
+
+ // a list of memtables to be freed
+ autovector<MemTable*> memtables_to_free;
+
+ // contexts for installing superversions for multiple column families
+ std::vector<SuperVersionContext> superversion_contexts;
+
+ autovector<log::Writer*> logs_to_free;
+
+ // the current manifest_file_number, log_number and prev_log_number
+ // that corresponds to the set of files in 'live'.
+ uint64_t manifest_file_number;
+ uint64_t pending_manifest_file_number;
+ uint64_t log_number;
+ uint64_t prev_log_number;
+
+ uint64_t min_pending_output = 0;
+ uint64_t prev_total_log_size = 0;
+ size_t num_alive_log_files = 0;
+ uint64_t size_log_to_delete = 0;
+
+ // Snapshot taken before flush/compaction job.
+ std::unique_ptr<ManagedSnapshot> job_snapshot;
+
+ explicit JobContext(int _job_id, bool create_superversion = false) {
+ job_id = _job_id;
+ manifest_file_number = 0;
+ pending_manifest_file_number = 0;
+ log_number = 0;
+ prev_log_number = 0;
+ superversion_contexts.emplace_back(
+ SuperVersionContext(create_superversion));
+ }
+
+ // For a non-empty JobContext, Clean() has to be called at least once
+ // before destruction (see asserts in ~JobContext()). It should be called
+ // with the DB mutex unlocked. The destructor doesn't call Clean() to avoid
+ // accidentally doing a potentially slow Clean() with the DB mutex locked.
+ // (A usage sketch follows this struct.)
+ void Clean() {
+ // free superversions
+ for (auto& sv_context : superversion_contexts) {
+ sv_context.Clean();
+ }
+ // free pending memtables
+ for (auto m : memtables_to_free) {
+ delete m;
+ }
+ for (auto l : logs_to_free) {
+ delete l;
+ }
+
+ memtables_to_free.clear();
+ logs_to_free.clear();
+ job_snapshot.reset();
+ }
+
+ ~JobContext() {
+ assert(memtables_to_free.size() == 0);
+ assert(logs_to_free.size() == 0);
+ }
+};
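+
+// Usage sketch for JobContext (illustrative only; the surrounding code is
+// hypothetical): the owner gathers obsolete files while holding the DB mutex,
+// then cleans up after releasing it, matching the contract documented above.
+//
+//   JobContext job_context(/*job_id=*/1, /*create_superversion=*/false);
+//   {
+//     // ... under the DB mutex: fill sst_delete_files, log_delete_files, ...
+//   }
+//   // DB mutex released:
+//   if (job_context.HaveSomethingToDelete()) {
+//     // delete the obsolete files collected above
+//   }
+//   job_context.Clean();   // must run before ~JobContext() if non-empty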
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/kv_checksum.h b/src/rocksdb/db/kv_checksum.h
new file mode 100644
index 000000000..bce507fcf
--- /dev/null
+++ b/src/rocksdb/db/kv_checksum.h
@@ -0,0 +1,398 @@
+// Copyright (c) 2020-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// This file contains classes containing fields to protect individual entries.
+// The classes are named "ProtectionInfo<suffix>", where <suffix> indicates the
+// combination of fields that are covered. Each field has a single letter
+// abbreviation as follows.
+//
+// K = key
+// V = value
+// O = optype aka value type
+// S = seqno
+// C = CF ID
+//
+// Then, for example, a class that protects an entry consisting of key, value,
+// optype, and CF ID (i.e., a `WriteBatch` entry) would be named
+// `ProtectionInfoKVOC`.
+//
+// The `ProtectionInfo.*` classes are templated on the integer type used to hold
+// the XOR of hashes for each field. Only unsigned integer types are supported,
+// and the maximum supported integer width is 64 bits. When the integer type is
+// narrower than the hash values, we lop off the most significant bits to make
+// them fit.
+//
+// The `ProtectionInfo.*` classes are all intended to be non-persistent. We do
+// not currently make the byte order consistent for integer fields before
+// hashing them, so the resulting values are endianness-dependent.
+
+#pragma once
+
+#include <type_traits>
+
+#include "db/dbformat.h"
+#include "rocksdb/types.h"
+#include "util/hash.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+template <typename T>
+class ProtectionInfo;
+template <typename T>
+class ProtectionInfoKVO;
+template <typename T>
+class ProtectionInfoKVOC;
+template <typename T>
+class ProtectionInfoKVOS;
+
+// Aliases for 64-bit protection infos.
+using ProtectionInfo64 = ProtectionInfo<uint64_t>;
+using ProtectionInfoKVO64 = ProtectionInfoKVO<uint64_t>;
+using ProtectionInfoKVOC64 = ProtectionInfoKVOC<uint64_t>;
+using ProtectionInfoKVOS64 = ProtectionInfoKVOS<uint64_t>;
+
+template <typename T>
+class ProtectionInfo {
+ public:
+ ProtectionInfo() = default;
+
+ Status GetStatus() const;
+ ProtectionInfoKVO<T> ProtectKVO(const Slice& key, const Slice& value,
+ ValueType op_type) const;
+ ProtectionInfoKVO<T> ProtectKVO(const SliceParts& key,
+ const SliceParts& value,
+ ValueType op_type) const;
+
+ T GetVal() const { return val_; }
+
+ private:
+ friend class ProtectionInfoKVO<T>;
+ friend class ProtectionInfoKVOS<T>;
+ friend class ProtectionInfoKVOC<T>;
+
+ // Each field is hashed with an independent value so we can catch fields being
+ // swapped. Per the `NPHash64()` docs, using consecutive seeds is a pitfall,
+ // and we should instead vary our seeds by a large odd number. This value by
+ // which we increment (0xD28AAD72F49BD50B) was taken from
+ // `head -c8 /dev/urandom | hexdump`, run repeatedly until it yielded an odd
+ // number. The values are computed manually since the Windows C++ compiler
+ // complains about the overflow when adding constants.
+ static const uint64_t kSeedK = 0;
+ static const uint64_t kSeedV = 0xD28AAD72F49BD50B;
+ static const uint64_t kSeedO = 0xA5155AE5E937AA16;
+ static const uint64_t kSeedS = 0x77A00858DDD37F21;
+ static const uint64_t kSeedC = 0x4A2AB5CBD26F542C;
+
+ ProtectionInfo(T val) : val_(val) {
+ static_assert(sizeof(ProtectionInfo<T>) == sizeof(T), "");
+ }
+
+ void SetVal(T val) { val_ = val; }
+
+ T val_ = 0;
+};
+
+template <typename T>
+class ProtectionInfoKVO {
+ public:
+ ProtectionInfoKVO() = default;
+
+ ProtectionInfo<T> StripKVO(const Slice& key, const Slice& value,
+ ValueType op_type) const;
+ ProtectionInfo<T> StripKVO(const SliceParts& key, const SliceParts& value,
+ ValueType op_type) const;
+
+ ProtectionInfoKVOC<T> ProtectC(ColumnFamilyId column_family_id) const;
+ ProtectionInfoKVOS<T> ProtectS(SequenceNumber sequence_number) const;
+
+ void UpdateK(const Slice& old_key, const Slice& new_key);
+ void UpdateK(const SliceParts& old_key, const SliceParts& new_key);
+ void UpdateV(const Slice& old_value, const Slice& new_value);
+ void UpdateV(const SliceParts& old_value, const SliceParts& new_value);
+ void UpdateO(ValueType old_op_type, ValueType new_op_type);
+
+ T GetVal() const { return info_.GetVal(); }
+
+ private:
+ friend class ProtectionInfo<T>;
+ friend class ProtectionInfoKVOS<T>;
+ friend class ProtectionInfoKVOC<T>;
+
+ explicit ProtectionInfoKVO(T val) : info_(val) {
+ static_assert(sizeof(ProtectionInfoKVO<T>) == sizeof(T), "");
+ }
+
+ void SetVal(T val) { info_.SetVal(val); }
+
+ ProtectionInfo<T> info_;
+};
+
+template <typename T>
+class ProtectionInfoKVOC {
+ public:
+ ProtectionInfoKVOC() = default;
+
+ ProtectionInfoKVO<T> StripC(ColumnFamilyId column_family_id) const;
+
+ void UpdateK(const Slice& old_key, const Slice& new_key) {
+ kvo_.UpdateK(old_key, new_key);
+ }
+ void UpdateK(const SliceParts& old_key, const SliceParts& new_key) {
+ kvo_.UpdateK(old_key, new_key);
+ }
+ void UpdateV(const Slice& old_value, const Slice& new_value) {
+ kvo_.UpdateV(old_value, new_value);
+ }
+ void UpdateV(const SliceParts& old_value, const SliceParts& new_value) {
+ kvo_.UpdateV(old_value, new_value);
+ }
+ void UpdateO(ValueType old_op_type, ValueType new_op_type) {
+ kvo_.UpdateO(old_op_type, new_op_type);
+ }
+ void UpdateC(ColumnFamilyId old_column_family_id,
+ ColumnFamilyId new_column_family_id);
+
+ T GetVal() const { return kvo_.GetVal(); }
+
+ private:
+ friend class ProtectionInfoKVO<T>;
+
+ explicit ProtectionInfoKVOC(T val) : kvo_(val) {
+ static_assert(sizeof(ProtectionInfoKVOC<T>) == sizeof(T), "");
+ }
+
+ void SetVal(T val) { kvo_.SetVal(val); }
+
+ ProtectionInfoKVO<T> kvo_;
+};
+
+template <typename T>
+class ProtectionInfoKVOS {
+ public:
+ ProtectionInfoKVOS() = default;
+
+ ProtectionInfoKVO<T> StripS(SequenceNumber sequence_number) const;
+
+ void UpdateK(const Slice& old_key, const Slice& new_key) {
+ kvo_.UpdateK(old_key, new_key);
+ }
+ void UpdateK(const SliceParts& old_key, const SliceParts& new_key) {
+ kvo_.UpdateK(old_key, new_key);
+ }
+ void UpdateV(const Slice& old_value, const Slice& new_value) {
+ kvo_.UpdateV(old_value, new_value);
+ }
+ void UpdateV(const SliceParts& old_value, const SliceParts& new_value) {
+ kvo_.UpdateV(old_value, new_value);
+ }
+ void UpdateO(ValueType old_op_type, ValueType new_op_type) {
+ kvo_.UpdateO(old_op_type, new_op_type);
+ }
+ void UpdateS(SequenceNumber old_sequence_number,
+ SequenceNumber new_sequence_number);
+
+ T GetVal() const { return kvo_.GetVal(); }
+
+ private:
+ friend class ProtectionInfoKVO<T>;
+
+ explicit ProtectionInfoKVOS(T val) : kvo_(val) {
+ static_assert(sizeof(ProtectionInfoKVOS<T>) == sizeof(T), "");
+ }
+
+ void SetVal(T val) { kvo_.SetVal(val); }
+
+ ProtectionInfoKVO<T> kvo_;
+};
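+
+// Illustrative round-trip example (not part of the original header), assuming
+// Slices `key` and `value`, a ValueType `op`, and a ColumnFamilyId `cf`:
+//
+//   ProtectionInfoKVO64 kvo = ProtectionInfo64().ProtectKVO(key, value, op);
+//   ProtectionInfoKVOC64 kvoc = kvo.ProtectC(cf);            // add CF ID
+//   ProtectionInfoKVO64 kvo2 = kvoc.StripC(cf);              // remove CF ID
+//   Status s = kvo2.StripKVO(key, value, op).GetStatus();    // OK if intact
+//
+// Because each field is folded in with XOR, stripping with the same inputs
+// cancels the hashes and GetStatus() returns OK; any mismatch leaves a
+// non-zero residue and yields a Corruption status.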
+
+template <typename T>
+Status ProtectionInfo<T>::GetStatus() const {
+ if (val_ != 0) {
+ return Status::Corruption("ProtectionInfo mismatch");
+ }
+ return Status::OK();
+}
+
+template <typename T>
+ProtectionInfoKVO<T> ProtectionInfo<T>::ProtectKVO(const Slice& key,
+ const Slice& value,
+ ValueType op_type) const {
+ T val = GetVal();
+ val = val ^ static_cast<T>(GetSliceNPHash64(key, ProtectionInfo<T>::kSeedK));
+ val =
+ val ^ static_cast<T>(GetSliceNPHash64(value, ProtectionInfo<T>::kSeedV));
+ val = val ^
+ static_cast<T>(NPHash64(reinterpret_cast<char*>(&op_type),
+ sizeof(op_type), ProtectionInfo<T>::kSeedO));
+ return ProtectionInfoKVO<T>(val);
+}
+
+template <typename T>
+ProtectionInfoKVO<T> ProtectionInfo<T>::ProtectKVO(const SliceParts& key,
+ const SliceParts& value,
+ ValueType op_type) const {
+ T val = GetVal();
+ val = val ^
+ static_cast<T>(GetSlicePartsNPHash64(key, ProtectionInfo<T>::kSeedK));
+ val = val ^
+ static_cast<T>(GetSlicePartsNPHash64(value, ProtectionInfo<T>::kSeedV));
+ val = val ^
+ static_cast<T>(NPHash64(reinterpret_cast<char*>(&op_type),
+ sizeof(op_type), ProtectionInfo<T>::kSeedO));
+ return ProtectionInfoKVO<T>(val);
+}
+
+template <typename T>
+void ProtectionInfoKVO<T>::UpdateK(const Slice& old_key, const Slice& new_key) {
+ T val = GetVal();
+ val = val ^
+ static_cast<T>(GetSliceNPHash64(old_key, ProtectionInfo<T>::kSeedK));
+ val = val ^
+ static_cast<T>(GetSliceNPHash64(new_key, ProtectionInfo<T>::kSeedK));
+ SetVal(val);
+}
+
+template <typename T>
+void ProtectionInfoKVO<T>::UpdateK(const SliceParts& old_key,
+ const SliceParts& new_key) {
+ T val = GetVal();
+ val = val ^ static_cast<T>(
+ GetSlicePartsNPHash64(old_key, ProtectionInfo<T>::kSeedK));
+ val = val ^ static_cast<T>(
+ GetSlicePartsNPHash64(new_key, ProtectionInfo<T>::kSeedK));
+ SetVal(val);
+}
+
+template <typename T>
+void ProtectionInfoKVO<T>::UpdateV(const Slice& old_value,
+ const Slice& new_value) {
+ T val = GetVal();
+ val = val ^
+ static_cast<T>(GetSliceNPHash64(old_value, ProtectionInfo<T>::kSeedV));
+ val = val ^
+ static_cast<T>(GetSliceNPHash64(new_value, ProtectionInfo<T>::kSeedV));
+ SetVal(val);
+}
+
+template <typename T>
+void ProtectionInfoKVO<T>::UpdateV(const SliceParts& old_value,
+ const SliceParts& new_value) {
+ T val = GetVal();
+ val = val ^ static_cast<T>(
+ GetSlicePartsNPHash64(old_value, ProtectionInfo<T>::kSeedV));
+ val = val ^ static_cast<T>(
+ GetSlicePartsNPHash64(new_value, ProtectionInfo<T>::kSeedV));
+ SetVal(val);
+}
+
+template <typename T>
+void ProtectionInfoKVO<T>::UpdateO(ValueType old_op_type,
+ ValueType new_op_type) {
+ T val = GetVal();
+ val = val ^ static_cast<T>(NPHash64(reinterpret_cast<char*>(&old_op_type),
+ sizeof(old_op_type),
+ ProtectionInfo<T>::kSeedO));
+ val = val ^ static_cast<T>(NPHash64(reinterpret_cast<char*>(&new_op_type),
+ sizeof(new_op_type),
+ ProtectionInfo<T>::kSeedO));
+ SetVal(val);
+}
+
+template <typename T>
+ProtectionInfo<T> ProtectionInfoKVO<T>::StripKVO(const Slice& key,
+ const Slice& value,
+ ValueType op_type) const {
+ T val = GetVal();
+ val = val ^ static_cast<T>(GetSliceNPHash64(key, ProtectionInfo<T>::kSeedK));
+ val =
+ val ^ static_cast<T>(GetSliceNPHash64(value, ProtectionInfo<T>::kSeedV));
+ val = val ^
+ static_cast<T>(NPHash64(reinterpret_cast<char*>(&op_type),
+ sizeof(op_type), ProtectionInfo<T>::kSeedO));
+ return ProtectionInfo<T>(val);
+}
+
+template <typename T>
+ProtectionInfo<T> ProtectionInfoKVO<T>::StripKVO(const SliceParts& key,
+ const SliceParts& value,
+ ValueType op_type) const {
+ T val = GetVal();
+ val = val ^
+ static_cast<T>(GetSlicePartsNPHash64(key, ProtectionInfo<T>::kSeedK));
+ val = val ^
+ static_cast<T>(GetSlicePartsNPHash64(value, ProtectionInfo<T>::kSeedV));
+ val = val ^
+ static_cast<T>(NPHash64(reinterpret_cast<char*>(&op_type),
+ sizeof(op_type), ProtectionInfo<T>::kSeedO));
+ return ProtectionInfo<T>(val);
+}
+
+template <typename T>
+ProtectionInfoKVOC<T> ProtectionInfoKVO<T>::ProtectC(
+ ColumnFamilyId column_family_id) const {
+ T val = GetVal();
+ val = val ^ static_cast<T>(NPHash64(
+ reinterpret_cast<char*>(&column_family_id),
+ sizeof(column_family_id), ProtectionInfo<T>::kSeedC));
+ return ProtectionInfoKVOC<T>(val);
+}
+
+template <typename T>
+ProtectionInfoKVO<T> ProtectionInfoKVOC<T>::StripC(
+ ColumnFamilyId column_family_id) const {
+ T val = GetVal();
+ val = val ^ static_cast<T>(NPHash64(
+ reinterpret_cast<char*>(&column_family_id),
+ sizeof(column_family_id), ProtectionInfo<T>::kSeedC));
+ return ProtectionInfoKVO<T>(val);
+}
+
+template <typename T>
+void ProtectionInfoKVOC<T>::UpdateC(ColumnFamilyId old_column_family_id,
+ ColumnFamilyId new_column_family_id) {
+ T val = GetVal();
+ val = val ^ static_cast<T>(NPHash64(
+ reinterpret_cast<char*>(&old_column_family_id),
+ sizeof(old_column_family_id), ProtectionInfo<T>::kSeedC));
+ val = val ^ static_cast<T>(NPHash64(
+ reinterpret_cast<char*>(&new_column_family_id),
+ sizeof(new_column_family_id), ProtectionInfo<T>::kSeedC));
+ SetVal(val);
+}
+
+template <typename T>
+ProtectionInfoKVOS<T> ProtectionInfoKVO<T>::ProtectS(
+ SequenceNumber sequence_number) const {
+ T val = GetVal();
+ val = val ^ static_cast<T>(NPHash64(reinterpret_cast<char*>(&sequence_number),
+ sizeof(sequence_number),
+ ProtectionInfo<T>::kSeedS));
+ return ProtectionInfoKVOS<T>(val);
+}
+
+template <typename T>
+ProtectionInfoKVO<T> ProtectionInfoKVOS<T>::StripS(
+ SequenceNumber sequence_number) const {
+ T val = GetVal();
+ val = val ^ static_cast<T>(NPHash64(reinterpret_cast<char*>(&sequence_number),
+ sizeof(sequence_number),
+ ProtectionInfo<T>::kSeedS));
+ return ProtectionInfoKVO<T>(val);
+}
+
+template <typename T>
+void ProtectionInfoKVOS<T>::UpdateS(SequenceNumber old_sequence_number,
+ SequenceNumber new_sequence_number) {
+ T val = GetVal();
+ val = val ^ static_cast<T>(NPHash64(
+ reinterpret_cast<char*>(&old_sequence_number),
+ sizeof(old_sequence_number), ProtectionInfo<T>::kSeedS));
+ val = val ^ static_cast<T>(NPHash64(
+ reinterpret_cast<char*>(&new_sequence_number),
+ sizeof(new_sequence_number), ProtectionInfo<T>::kSeedS));
+ SetVal(val);
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/listener_test.cc b/src/rocksdb/db/listener_test.cc
new file mode 100644
index 000000000..160866bb7
--- /dev/null
+++ b/src/rocksdb/db/listener_test.cc
@@ -0,0 +1,1595 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/blob/blob_index.h"
+#include "db/db_impl/db_impl.h"
+#include "db/db_test_util.h"
+#include "db/dbformat.h"
+#include "db/version_set.h"
+#include "db/write_batch_internal.h"
+#include "file/filename.h"
+#include "monitoring/statistics.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/options.h"
+#include "rocksdb/perf_context.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/table.h"
+#include "rocksdb/table_properties.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/hash.h"
+#include "util/mutexlock.h"
+#include "util/rate_limiter.h"
+#include "util/string_util.h"
+#include "utilities/merge_operators.h"
+
+#ifndef ROCKSDB_LITE
+
+namespace ROCKSDB_NAMESPACE {
+
+class EventListenerTest : public DBTestBase {
+ public:
+ EventListenerTest() : DBTestBase("listener_test", /*env_do_fsync=*/true) {}
+
+ static std::string BlobStr(uint64_t blob_file_number, uint64_t offset,
+ uint64_t size) {
+ std::string blob_index;
+ BlobIndex::EncodeBlob(&blob_index, blob_file_number, offset, size,
+ kNoCompression);
+ return blob_index;
+ }
+
+ const size_t k110KB = 110 << 10;
+};
+
+struct TestPropertiesCollector
+ : public ROCKSDB_NAMESPACE::TablePropertiesCollector {
+ ROCKSDB_NAMESPACE::Status AddUserKey(
+ const ROCKSDB_NAMESPACE::Slice& /*key*/,
+ const ROCKSDB_NAMESPACE::Slice& /*value*/,
+ ROCKSDB_NAMESPACE::EntryType /*type*/,
+ ROCKSDB_NAMESPACE::SequenceNumber /*seq*/,
+ uint64_t /*file_size*/) override {
+ return Status::OK();
+ }
+ ROCKSDB_NAMESPACE::Status Finish(
+ ROCKSDB_NAMESPACE::UserCollectedProperties* properties) override {
+ properties->insert({"0", "1"});
+ return Status::OK();
+ }
+
+ const char* Name() const override { return "TestTablePropertiesCollector"; }
+
+ ROCKSDB_NAMESPACE::UserCollectedProperties GetReadableProperties()
+ const override {
+ ROCKSDB_NAMESPACE::UserCollectedProperties ret;
+ ret["2"] = "3";
+ return ret;
+ }
+};
+
+class TestPropertiesCollectorFactory : public TablePropertiesCollectorFactory {
+ public:
+ TablePropertiesCollector* CreateTablePropertiesCollector(
+ TablePropertiesCollectorFactory::Context /*context*/) override {
+ return new TestPropertiesCollector;
+ }
+ const char* Name() const override { return "TestTablePropertiesCollector"; }
+};
+
+class TestCompactionListener : public EventListener {
+ public:
+ explicit TestCompactionListener(EventListenerTest* test) : test_(test) {}
+
+ void OnCompactionCompleted(DB* db, const CompactionJobInfo& ci) override {
+ std::lock_guard<std::mutex> lock(mutex_);
+ compacted_dbs_.push_back(db);
+ ASSERT_GT(ci.input_files.size(), 0U);
+ ASSERT_EQ(ci.input_files.size(), ci.input_file_infos.size());
+
+ for (size_t i = 0; i < ci.input_file_infos.size(); ++i) {
+ ASSERT_EQ(ci.input_file_infos[i].level, ci.base_input_level);
+ ASSERT_EQ(ci.input_file_infos[i].file_number,
+ TableFileNameToNumber(ci.input_files[i]));
+ }
+
+ ASSERT_GT(ci.output_files.size(), 0U);
+ ASSERT_EQ(ci.output_files.size(), ci.output_file_infos.size());
+
+ ASSERT_TRUE(test_);
+ ASSERT_EQ(test_->db_, db);
+
+ std::vector<std::vector<FileMetaData>> files_by_level;
+ test_->dbfull()->TEST_GetFilesMetaData(test_->handles_[ci.cf_id],
+ &files_by_level);
+ ASSERT_GT(files_by_level.size(), ci.output_level);
+
+ for (size_t i = 0; i < ci.output_file_infos.size(); ++i) {
+ ASSERT_EQ(ci.output_file_infos[i].level, ci.output_level);
+ ASSERT_EQ(ci.output_file_infos[i].file_number,
+ TableFileNameToNumber(ci.output_files[i]));
+
+ auto it = std::find_if(
+ files_by_level[ci.output_level].begin(),
+ files_by_level[ci.output_level].end(), [&](const FileMetaData& meta) {
+ return meta.fd.GetNumber() == ci.output_file_infos[i].file_number;
+ });
+ ASSERT_NE(it, files_by_level[ci.output_level].end());
+
+ ASSERT_EQ(ci.output_file_infos[i].oldest_blob_file_number,
+ it->oldest_blob_file_number);
+ }
+
+ ASSERT_EQ(db->GetEnv()->GetThreadID(), ci.thread_id);
+ ASSERT_GT(ci.thread_id, 0U);
+
+ for (auto fl : {ci.input_files, ci.output_files}) {
+ for (auto fn : fl) {
+ auto it = ci.table_properties.find(fn);
+ ASSERT_NE(it, ci.table_properties.end());
+ auto tp = it->second;
+ ASSERT_TRUE(tp != nullptr);
+ ASSERT_EQ(tp->user_collected_properties.find("0")->second, "1");
+ }
+ }
+ }
+
+ EventListenerTest* test_;
+ std::vector<DB*> compacted_dbs_;
+ std::mutex mutex_;
+};
+
+TEST_F(EventListenerTest, OnSingleDBCompactionTest) {
+ const int kTestKeySize = 16;
+ const int kTestValueSize = 984;
+ const int kEntrySize = kTestKeySize + kTestValueSize;
+ const int kEntriesPerBuffer = 100;
+ const int kNumL0Files = 4;
+
+ Options options;
+ options.env = CurrentOptions().env;
+ options.create_if_missing = true;
+ options.write_buffer_size = kEntrySize * kEntriesPerBuffer;
+ options.compaction_style = kCompactionStyleLevel;
+ options.target_file_size_base = options.write_buffer_size;
+ options.max_bytes_for_level_base = options.target_file_size_base * 2;
+ options.max_bytes_for_level_multiplier = 2;
+ options.compression = kNoCompression;
+#ifdef ROCKSDB_USING_THREAD_STATUS
+ options.enable_thread_tracking = true;
+#endif // ROCKSDB_USING_THREAD_STATUS
+ options.level0_file_num_compaction_trigger = kNumL0Files;
+ options.table_properties_collector_factories.push_back(
+ std::make_shared<TestPropertiesCollectorFactory>());
+
+ TestCompactionListener* listener = new TestCompactionListener(this);
+ options.listeners.emplace_back(listener);
+ std::vector<std::string> cf_names = {"pikachu", "ilya", "muromec",
+ "dobrynia", "nikitich", "alyosha",
+ "popovich"};
+ CreateAndReopenWithCF(cf_names, options);
+ ASSERT_OK(Put(1, "pikachu", std::string(90000, 'p')));
+
+ WriteBatch batch;
+ ASSERT_OK(WriteBatchInternal::PutBlobIndex(&batch, 1, "ditto",
+ BlobStr(123, 0, 1 << 10)));
+ ASSERT_OK(dbfull()->Write(WriteOptions(), &batch));
+
+ ASSERT_OK(Put(2, "ilya", std::string(90000, 'i')));
+ ASSERT_OK(Put(3, "muromec", std::string(90000, 'm')));
+ ASSERT_OK(Put(4, "dobrynia", std::string(90000, 'd')));
+ ASSERT_OK(Put(5, "nikitich", std::string(90000, 'n')));
+ ASSERT_OK(Put(6, "alyosha", std::string(90000, 'a')));
+ ASSERT_OK(Put(7, "popovich", std::string(90000, 'p')));
+ for (int i = 1; i < 8; ++i) {
+ ASSERT_OK(Flush(i));
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[i],
+ nullptr, nullptr));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+
+ ASSERT_EQ(listener->compacted_dbs_.size(), cf_names.size());
+ for (size_t i = 0; i < cf_names.size(); ++i) {
+ ASSERT_EQ(listener->compacted_dbs_[i], db_);
+ }
+}
+
+// This simple Listener can only handle one flush at a time.
+class TestFlushListener : public EventListener {
+ public:
+ TestFlushListener(Env* env, EventListenerTest* test)
+ : slowdown_count(0), stop_count(0), db_closed(), env_(env), test_(test) {
+ db_closed = false;
+ }
+
+ virtual ~TestFlushListener() {
+ prev_fc_info_.status.PermitUncheckedError(); // Ignore the status
+ }
+ void OnTableFileCreated(const TableFileCreationInfo& info) override {
+ // remember the info for later checking the FlushJobInfo.
+ prev_fc_info_ = info;
+ ASSERT_GT(info.db_name.size(), 0U);
+ ASSERT_GT(info.cf_name.size(), 0U);
+ ASSERT_GT(info.file_path.size(), 0U);
+ ASSERT_GT(info.job_id, 0);
+ ASSERT_GT(info.table_properties.data_size, 0U);
+ ASSERT_GT(info.table_properties.raw_key_size, 0U);
+ ASSERT_GT(info.table_properties.raw_value_size, 0U);
+ ASSERT_GT(info.table_properties.num_data_blocks, 0U);
+ ASSERT_GT(info.table_properties.num_entries, 0U);
+ ASSERT_EQ(info.file_checksum, kUnknownFileChecksum);
+ ASSERT_EQ(info.file_checksum_func_name, kUnknownFileChecksumFuncName);
+
+#ifdef ROCKSDB_USING_THREAD_STATUS
+ // Verify the id of the current thread that created this table
+ // file matches the id of any active flush or compaction thread.
+ uint64_t thread_id = env_->GetThreadID();
+ std::vector<ThreadStatus> thread_list;
+ ASSERT_OK(env_->GetThreadList(&thread_list));
+ bool found_match = false;
+ for (auto thread_status : thread_list) {
+ if (thread_status.operation_type == ThreadStatus::OP_FLUSH ||
+ thread_status.operation_type == ThreadStatus::OP_COMPACTION) {
+ if (thread_id == thread_status.thread_id) {
+ found_match = true;
+ break;
+ }
+ }
+ }
+ ASSERT_TRUE(found_match);
+#endif // ROCKSDB_USING_THREAD_STATUS
+ }
+
+ void OnFlushCompleted(DB* db, const FlushJobInfo& info) override {
+ flushed_dbs_.push_back(db);
+ flushed_column_family_names_.push_back(info.cf_name);
+ if (info.triggered_writes_slowdown) {
+ slowdown_count++;
+ }
+ if (info.triggered_writes_stop) {
+ stop_count++;
+ }
+ // verify whether the previously created file matches the flushed file.
+ ASSERT_EQ(prev_fc_info_.db_name, db->GetName());
+ ASSERT_EQ(prev_fc_info_.cf_name, info.cf_name);
+ ASSERT_EQ(prev_fc_info_.job_id, info.job_id);
+ ASSERT_EQ(prev_fc_info_.file_path, info.file_path);
+ ASSERT_EQ(TableFileNameToNumber(info.file_path), info.file_number);
+
+ // Note: the following chunk relies on the notification pertaining to the
+ // database pointed to by DBTestBase::db_, and is thus bypassed when
+ // that assumption does not hold (see the test case MultiDBMultiListeners
+ // below).
+ ASSERT_TRUE(test_);
+ if (db == test_->db_) {
+ std::vector<std::vector<FileMetaData>> files_by_level;
+ ASSERT_LT(info.cf_id, test_->handles_.size());
+ ASSERT_GE(info.cf_id, 0u);
+ ASSERT_NE(test_->handles_[info.cf_id], nullptr);
+ test_->dbfull()->TEST_GetFilesMetaData(test_->handles_[info.cf_id],
+ &files_by_level);
+
+ ASSERT_FALSE(files_by_level.empty());
+ auto it = std::find_if(files_by_level[0].begin(), files_by_level[0].end(),
+ [&](const FileMetaData& meta) {
+ return meta.fd.GetNumber() == info.file_number;
+ });
+ ASSERT_NE(it, files_by_level[0].end());
+ ASSERT_EQ(info.oldest_blob_file_number, it->oldest_blob_file_number);
+ }
+
+ ASSERT_EQ(db->GetEnv()->GetThreadID(), info.thread_id);
+ ASSERT_GT(info.thread_id, 0U);
+ ASSERT_EQ(info.table_properties.user_collected_properties.find("0")->second,
+ "1");
+ }
+
+ std::vector<std::string> flushed_column_family_names_;
+ std::vector<DB*> flushed_dbs_;
+ int slowdown_count;
+ int stop_count;
+ bool db_closing;
+ std::atomic_bool db_closed;
+ TableFileCreationInfo prev_fc_info_;
+
+ protected:
+ Env* env_;
+ EventListenerTest* test_;
+};
+
+TEST_F(EventListenerTest, OnSingleDBFlushTest) {
+ Options options;
+ options.env = CurrentOptions().env;
+ options.write_buffer_size = k110KB;
+#ifdef ROCKSDB_USING_THREAD_STATUS
+ options.enable_thread_tracking = true;
+#endif // ROCKSDB_USING_THREAD_STATUS
+ TestFlushListener* listener = new TestFlushListener(options.env, this);
+ options.listeners.emplace_back(listener);
+ std::vector<std::string> cf_names = {"pikachu", "ilya", "muromec",
+ "dobrynia", "nikitich", "alyosha",
+ "popovich"};
+ options.table_properties_collector_factories.push_back(
+ std::make_shared<TestPropertiesCollectorFactory>());
+ CreateAndReopenWithCF(cf_names, options);
+
+ ASSERT_OK(Put(1, "pikachu", std::string(90000, 'p')));
+
+ WriteBatch batch;
+ ASSERT_OK(WriteBatchInternal::PutBlobIndex(&batch, 1, "ditto",
+ BlobStr(456, 0, 1 << 10)));
+ ASSERT_OK(dbfull()->Write(WriteOptions(), &batch));
+
+ ASSERT_OK(Put(2, "ilya", std::string(90000, 'i')));
+ ASSERT_OK(Put(3, "muromec", std::string(90000, 'm')));
+ ASSERT_OK(Put(4, "dobrynia", std::string(90000, 'd')));
+ ASSERT_OK(Put(5, "nikitich", std::string(90000, 'n')));
+ ASSERT_OK(Put(6, "alyosha", std::string(90000, 'a')));
+ ASSERT_OK(Put(7, "popovich", std::string(90000, 'p')));
+ for (int i = 1; i < 8; ++i) {
+ ASSERT_OK(Flush(i));
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ // Ensure background work is fully finished including listener callbacks
+ // before accessing listener state.
+ ASSERT_OK(dbfull()->TEST_WaitForBackgroundWork());
+ ASSERT_EQ(listener->flushed_dbs_.size(), i);
+ ASSERT_EQ(listener->flushed_column_family_names_.size(), i);
+ }
+
+ // make sure callback functions are called in the right order
+ for (size_t i = 0; i < cf_names.size(); ++i) {
+ ASSERT_EQ(listener->flushed_dbs_[i], db_);
+ ASSERT_EQ(listener->flushed_column_family_names_[i], cf_names[i]);
+ }
+}
+
+TEST_F(EventListenerTest, MultiCF) {
+ Options options;
+ options.env = CurrentOptions().env;
+ options.write_buffer_size = k110KB;
+#ifdef ROCKSDB_USING_THREAD_STATUS
+ options.enable_thread_tracking = true;
+#endif // ROCKSDB_USING_THREAD_STATUS
+ for (auto atomic_flush : {false, true}) {
+ options.atomic_flush = atomic_flush;
+ options.create_if_missing = true;
+ DestroyAndReopen(options);
+ TestFlushListener* listener = new TestFlushListener(options.env, this);
+ options.listeners.emplace_back(listener);
+ options.table_properties_collector_factories.push_back(
+ std::make_shared<TestPropertiesCollectorFactory>());
+ std::vector<std::string> cf_names = {"pikachu", "ilya", "muromec",
+ "dobrynia", "nikitich", "alyosha",
+ "popovich"};
+ CreateAndReopenWithCF(cf_names, options);
+
+ ASSERT_OK(Put(1, "pikachu", std::string(90000, 'p')));
+ ASSERT_OK(Put(2, "ilya", std::string(90000, 'i')));
+ ASSERT_OK(Put(3, "muromec", std::string(90000, 'm')));
+ ASSERT_OK(Put(4, "dobrynia", std::string(90000, 'd')));
+ ASSERT_OK(Put(5, "nikitich", std::string(90000, 'n')));
+ ASSERT_OK(Put(6, "alyosha", std::string(90000, 'a')));
+ ASSERT_OK(Put(7, "popovich", std::string(90000, 'p')));
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ for (int i = 1; i < 8; ++i) {
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::NotifyOnFlushCompleted::PostAllOnFlushCompleted",
+ "EventListenerTest.MultiCF:PreVerifyListener"}});
+ ASSERT_OK(Flush(i));
+ TEST_SYNC_POINT("EventListenerTest.MultiCF:PreVerifyListener");
+ ASSERT_EQ(listener->flushed_dbs_.size(), i);
+ ASSERT_EQ(listener->flushed_column_family_names_.size(), i);
+ // make sure callback functions are called in the right order
+ if (i == 7) {
+ for (size_t j = 0; j < cf_names.size(); j++) {
+ ASSERT_EQ(listener->flushed_dbs_[j], db_);
+ ASSERT_EQ(listener->flushed_column_family_names_[j], cf_names[j]);
+ }
+ }
+ }
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ Close();
+ }
+}
+
+TEST_F(EventListenerTest, MultiDBMultiListeners) {
+ Options options;
+ options.env = CurrentOptions().env;
+#ifdef ROCKSDB_USING_THREAD_STATUS
+ options.enable_thread_tracking = true;
+#endif // ROCKSDB_USING_THREAD_STATUS
+ options.table_properties_collector_factories.push_back(
+ std::make_shared<TestPropertiesCollectorFactory>());
+ std::vector<TestFlushListener*> listeners;
+ const int kNumDBs = 5;
+ const int kNumListeners = 10;
+ for (int i = 0; i < kNumListeners; ++i) {
+ listeners.emplace_back(new TestFlushListener(options.env, this));
+ }
+
+ std::vector<std::string> cf_names = {"pikachu", "ilya", "muromec",
+ "dobrynia", "nikitich", "alyosha",
+ "popovich"};
+
+ options.create_if_missing = true;
+ for (int i = 0; i < kNumListeners; ++i) {
+ options.listeners.emplace_back(listeners[i]);
+ }
+ DBOptions db_opts(options);
+ ColumnFamilyOptions cf_opts(options);
+
+ std::vector<DB*> dbs;
+ std::vector<std::vector<ColumnFamilyHandle*>> vec_handles;
+
+ for (int d = 0; d < kNumDBs; ++d) {
+ ASSERT_OK(DestroyDB(dbname_ + std::to_string(d), options));
+ DB* db;
+ std::vector<ColumnFamilyHandle*> handles;
+ ASSERT_OK(DB::Open(options, dbname_ + std::to_string(d), &db));
+ for (size_t c = 0; c < cf_names.size(); ++c) {
+ ColumnFamilyHandle* handle;
+ ASSERT_OK(db->CreateColumnFamily(cf_opts, cf_names[c], &handle));
+ handles.push_back(handle);
+ }
+
+ vec_handles.push_back(std::move(handles));
+ dbs.push_back(db);
+ }
+
+ for (int d = 0; d < kNumDBs; ++d) {
+ for (size_t c = 0; c < cf_names.size(); ++c) {
+ ASSERT_OK(dbs[d]->Put(WriteOptions(), vec_handles[d][c], cf_names[c],
+ cf_names[c]));
+ }
+ }
+
+ for (size_t c = 0; c < cf_names.size(); ++c) {
+ for (int d = 0; d < kNumDBs; ++d) {
+ ASSERT_OK(dbs[d]->Flush(FlushOptions(), vec_handles[d][c]));
+ ASSERT_OK(
+ static_cast_with_check<DBImpl>(dbs[d])->TEST_WaitForFlushMemTable());
+ }
+ }
+
+ for (int d = 0; d < kNumDBs; ++d) {
+ // Ensure background work is fully finished including listener callbacks
+ // before accessing listener state.
+ ASSERT_OK(
+ static_cast_with_check<DBImpl>(dbs[d])->TEST_WaitForBackgroundWork());
+ }
+
+ for (auto* listener : listeners) {
+ int pos = 0;
+ for (size_t c = 0; c < cf_names.size(); ++c) {
+ for (int d = 0; d < kNumDBs; ++d) {
+ ASSERT_EQ(listener->flushed_dbs_[pos], dbs[d]);
+ ASSERT_EQ(listener->flushed_column_family_names_[pos], cf_names[c]);
+ pos++;
+ }
+ }
+ }
+
+ for (auto handles : vec_handles) {
+ for (auto h : handles) {
+ delete h;
+ }
+ handles.clear();
+ }
+ vec_handles.clear();
+
+ for (auto db : dbs) {
+ delete db;
+ }
+}
+
+TEST_F(EventListenerTest, DisableBGCompaction) {
+ Options options;
+ options.env = CurrentOptions().env;
+#ifdef ROCKSDB_USING_THREAD_STATUS
+ options.enable_thread_tracking = true;
+#endif // ROCKSDB_USING_THREAD_STATUS
+ TestFlushListener* listener = new TestFlushListener(options.env, this);
+ const int kCompactionTrigger = 1;
+ const int kSlowdownTrigger = 5;
+ const int kStopTrigger = 100;
+ options.level0_file_num_compaction_trigger = kCompactionTrigger;
+ options.level0_slowdown_writes_trigger = kSlowdownTrigger;
+ options.level0_stop_writes_trigger = kStopTrigger;
+ options.max_write_buffer_number = 10;
+ options.listeners.emplace_back(listener);
+ // BG compaction is disabled. The number of L0 files will simply keep
+ // increasing in this test.
+ options.compaction_style = kCompactionStyleNone;
+ options.compression = kNoCompression;
+ options.write_buffer_size = 100000; // Small write buffer
+ options.table_properties_collector_factories.push_back(
+ std::make_shared<TestPropertiesCollectorFactory>());
+
+ CreateAndReopenWithCF({"pikachu"}, options);
+ ColumnFamilyMetaData cf_meta;
+ db_->GetColumnFamilyMetaData(handles_[1], &cf_meta);
+
+ // Keep writing until enough L0 files accumulate to trigger write slowdowns.
+ for (int i = 0; static_cast<int>(cf_meta.file_count) < kSlowdownTrigger * 10;
+ ++i) {
+ ASSERT_OK(
+ Put(1, std::to_string(i), std::string(10000, 'x'), WriteOptions()));
+ FlushOptions fo;
+ fo.allow_write_stall = true;
+ ASSERT_OK(db_->Flush(fo, handles_[1]));
+ db_->GetColumnFamilyMetaData(handles_[1], &cf_meta);
+ }
+ // Ensure background work is fully finished including listener callbacks
+ // before accessing listener state.
+ ASSERT_OK(dbfull()->TEST_WaitForBackgroundWork());
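+ // Illustrative arithmetic (explanatory note, not part of the original test):
+ // the loop above flushes until roughly kSlowdownTrigger * 10 (= 50) L0 files
+ // exist, and every flush performed once the L0 count reaches kSlowdownTrigger
+ // (= 5) is expected to report triggered_writes_slowdown, hence at least
+ // kSlowdownTrigger * 9 (= 45) slowdown notifications below.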
+ ASSERT_GE(listener->slowdown_count, kSlowdownTrigger * 9);
+}
+
+class TestCompactionReasonListener : public EventListener {
+ public:
+ void OnCompactionCompleted(DB* /*db*/, const CompactionJobInfo& ci) override {
+ std::lock_guard<std::mutex> lock(mutex_);
+ compaction_reasons_.push_back(ci.compaction_reason);
+ }
+
+ std::vector<CompactionReason> compaction_reasons_;
+ std::mutex mutex_;
+};
+
+TEST_F(EventListenerTest, CompactionReasonLevel) {
+ Options options;
+ options.env = CurrentOptions().env;
+ options.create_if_missing = true;
+ options.memtable_factory.reset(test::NewSpecialSkipListFactory(
+ DBTestBase::kNumKeysByGenerateNewRandomFile));
+
+ TestCompactionReasonListener* listener = new TestCompactionReasonListener();
+ options.listeners.emplace_back(listener);
+
+ options.level0_file_num_compaction_trigger = 4;
+ options.compaction_style = kCompactionStyleLevel;
+
+ DestroyAndReopen(options);
+ Random rnd(301);
+
+ // Write 4 files in L0
+ for (int i = 0; i < 4; i++) {
+ GenerateNewRandomFile(&rnd);
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ ASSERT_EQ(listener->compaction_reasons_.size(), 1);
+ ASSERT_EQ(listener->compaction_reasons_[0],
+ CompactionReason::kLevelL0FilesNum);
+
+ DestroyAndReopen(options);
+
+ // Write 3 non-overlapping files in L0
+ for (int k = 1; k <= 30; k++) {
+ ASSERT_OK(Put(Key(k), Key(k)));
+ if (k % 10 == 0) {
+ ASSERT_OK(Flush());
+ }
+ }
+
+ // Do a trivial move from L0 -> L1
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+ options.max_bytes_for_level_base = 1;
+ Close();
+ listener->compaction_reasons_.clear();
+ Reopen(options);
+
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_GT(listener->compaction_reasons_.size(), 1);
+
+ for (auto compaction_reason : listener->compaction_reasons_) {
+ ASSERT_EQ(compaction_reason, CompactionReason::kLevelMaxLevelSize);
+ }
+
+ options.disable_auto_compactions = true;
+ Close();
+ listener->compaction_reasons_.clear();
+ Reopen(options);
+
+ ASSERT_OK(Put("key", "value"));
+ CompactRangeOptions cro;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForceOptimized;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ ASSERT_GT(listener->compaction_reasons_.size(), 0);
+ for (auto compaction_reason : listener->compaction_reasons_) {
+ ASSERT_EQ(compaction_reason, CompactionReason::kManualCompaction);
+ }
+}
+
+TEST_F(EventListenerTest, CompactionReasonUniversal) {
+ Options options;
+ options.env = CurrentOptions().env;
+ options.create_if_missing = true;
+ options.memtable_factory.reset(test::NewSpecialSkipListFactory(
+ DBTestBase::kNumKeysByGenerateNewRandomFile));
+
+ TestCompactionReasonListener* listener = new TestCompactionReasonListener();
+ options.listeners.emplace_back(listener);
+
+ options.compaction_style = kCompactionStyleUniversal;
+
+ Random rnd(301);
+
+ options.level0_file_num_compaction_trigger = 8;
+ options.compaction_options_universal.max_size_amplification_percent = 100000;
+ options.compaction_options_universal.size_ratio = 100000;
+ DestroyAndReopen(options);
+ listener->compaction_reasons_.clear();
+
+ // Write 8 files in L0
+ for (int i = 0; i < 8; i++) {
+ GenerateNewRandomFile(&rnd);
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ ASSERT_GT(listener->compaction_reasons_.size(), 0);
+ for (auto compaction_reason : listener->compaction_reasons_) {
+ ASSERT_EQ(compaction_reason, CompactionReason::kUniversalSizeRatio);
+ }
+
+ options.level0_file_num_compaction_trigger = 8;
+ options.compaction_options_universal.max_size_amplification_percent = 1;
+ options.compaction_options_universal.size_ratio = 100000;
+
+ DestroyAndReopen(options);
+ listener->compaction_reasons_.clear();
+
+ // Write 8 files in L0
+ for (int i = 0; i < 8; i++) {
+ GenerateNewRandomFile(&rnd);
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ ASSERT_GT(listener->compaction_reasons_.size(), 0);
+ for (auto compaction_reason : listener->compaction_reasons_) {
+ ASSERT_EQ(compaction_reason, CompactionReason::kUniversalSizeAmplification);
+ }
+
+ options.disable_auto_compactions = true;
+ Close();
+ listener->compaction_reasons_.clear();
+ Reopen(options);
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+ ASSERT_GT(listener->compaction_reasons_.size(), 0);
+ for (auto compaction_reason : listener->compaction_reasons_) {
+ ASSERT_EQ(compaction_reason, CompactionReason::kManualCompaction);
+ }
+}
+
+TEST_F(EventListenerTest, CompactionReasonFIFO) {
+ Options options;
+ options.env = CurrentOptions().env;
+ options.create_if_missing = true;
+ options.memtable_factory.reset(test::NewSpecialSkipListFactory(
+ DBTestBase::kNumKeysByGenerateNewRandomFile));
+
+ TestCompactionReasonListener* listener = new TestCompactionReasonListener();
+ options.listeners.emplace_back(listener);
+
+ options.level0_file_num_compaction_trigger = 4;
+ options.compaction_style = kCompactionStyleFIFO;
+ options.compaction_options_fifo.max_table_files_size = 1;
+
+ DestroyAndReopen(options);
+ Random rnd(301);
+
+ // Write 4 files in L0
+ for (int i = 0; i < 4; i++) {
+ GenerateNewRandomFile(&rnd);
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ ASSERT_GT(listener->compaction_reasons_.size(), 0);
+ for (auto compaction_reason : listener->compaction_reasons_) {
+ ASSERT_EQ(compaction_reason, CompactionReason::kFIFOMaxSize);
+ }
+}
+
+class TableFileCreationListener : public EventListener {
+ public:
+ class TestEnv : public EnvWrapper {
+ public:
+ explicit TestEnv(Env* t) : EnvWrapper(t) {}
+ static const char* kClassName() { return "TestEnv"; }
+ const char* Name() const override { return kClassName(); }
+
+ void SetStatus(Status s) { status_ = s; }
+
+ Status NewWritableFile(const std::string& fname,
+ std::unique_ptr<WritableFile>* result,
+ const EnvOptions& options) override {
+ if (fname.size() > 4 && fname.substr(fname.size() - 4) == ".sst") {
+ if (!status_.ok()) {
+ return status_;
+ }
+ }
+ return target()->NewWritableFile(fname, result, options);
+ }
+
+ private:
+ Status status_;
+ };
+
+ TableFileCreationListener() {
+ for (int i = 0; i < 2; i++) {
+ started_[i] = finished_[i] = failure_[i] = 0;
+ }
+ }
+
+ int Index(TableFileCreationReason reason) {
+ int idx;
+ switch (reason) {
+ case TableFileCreationReason::kFlush:
+ idx = 0;
+ break;
+ case TableFileCreationReason::kCompaction:
+ idx = 1;
+ break;
+ default:
+ idx = -1;
+ }
+ return idx;
+ }
+
+ void CheckAndResetCounters(int flush_started, int flush_finished,
+ int flush_failure, int compaction_started,
+ int compaction_finished, int compaction_failure) {
+ ASSERT_EQ(started_[0], flush_started);
+ ASSERT_EQ(finished_[0], flush_finished);
+ ASSERT_EQ(failure_[0], flush_failure);
+ ASSERT_EQ(started_[1], compaction_started);
+ ASSERT_EQ(finished_[1], compaction_finished);
+ ASSERT_EQ(failure_[1], compaction_failure);
+ for (int i = 0; i < 2; i++) {
+ started_[i] = finished_[i] = failure_[i] = 0;
+ }
+ }
+
+ void OnTableFileCreationStarted(
+ const TableFileCreationBriefInfo& info) override {
+ int idx = Index(info.reason);
+ if (idx >= 0) {
+ started_[idx]++;
+ }
+ ASSERT_GT(info.db_name.size(), 0U);
+ ASSERT_GT(info.cf_name.size(), 0U);
+ ASSERT_GT(info.file_path.size(), 0U);
+ ASSERT_GT(info.job_id, 0);
+ }
+
+ void OnTableFileCreated(const TableFileCreationInfo& info) override {
+ int idx = Index(info.reason);
+ if (idx >= 0) {
+ finished_[idx]++;
+ }
+ ASSERT_GT(info.db_name.size(), 0U);
+ ASSERT_GT(info.cf_name.size(), 0U);
+ ASSERT_GT(info.file_path.size(), 0U);
+ ASSERT_GT(info.job_id, 0);
+ ASSERT_EQ(info.file_checksum, kUnknownFileChecksum);
+ ASSERT_EQ(info.file_checksum_func_name, kUnknownFileChecksumFuncName);
+ if (info.status.ok()) {
+ if (info.table_properties.num_range_deletions == 0U) {
+ ASSERT_GT(info.table_properties.data_size, 0U);
+ ASSERT_GT(info.table_properties.raw_key_size, 0U);
+ ASSERT_GT(info.table_properties.raw_value_size, 0U);
+ ASSERT_GT(info.table_properties.num_data_blocks, 0U);
+ ASSERT_GT(info.table_properties.num_entries, 0U);
+ }
+ } else {
+ if (idx >= 0) {
+ failure_[idx]++;
+ last_failure_ = info.status;
+ }
+ }
+ }
+
+ int started_[2];
+ int finished_[2];
+ int failure_[2];
+ Status last_failure_;
+};
+
+TEST_F(EventListenerTest, TableFileCreationListenersTest) {
+ auto listener = std::make_shared<TableFileCreationListener>();
+ Options options;
+ std::unique_ptr<TableFileCreationListener::TestEnv> test_env(
+ new TableFileCreationListener::TestEnv(CurrentOptions().env));
+ options.create_if_missing = true;
+ options.listeners.push_back(listener);
+ options.env = test_env.get();
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put("foo", "aaa"));
+ ASSERT_OK(Put("bar", "bbb"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ listener->CheckAndResetCounters(1, 1, 0, 0, 0, 0);
+ ASSERT_OK(Put("foo", "aaa1"));
+ ASSERT_OK(Put("bar", "bbb1"));
+ test_env->SetStatus(Status::NotSupported("not supported"));
+ ASSERT_NOK(Flush());
+ listener->CheckAndResetCounters(1, 1, 1, 0, 0, 0);
+ ASSERT_TRUE(listener->last_failure_.IsNotSupported());
+ test_env->SetStatus(Status::OK());
+
+ Reopen(options);
+ ASSERT_OK(Put("foo", "aaa2"));
+ ASSERT_OK(Put("bar", "bbb2"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ listener->CheckAndResetCounters(1, 1, 0, 0, 0, 0);
+
+ const Slice kRangeStart = "a";
+ const Slice kRangeEnd = "z";
+ ASSERT_OK(
+ dbfull()->CompactRange(CompactRangeOptions(), &kRangeStart, &kRangeEnd));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ listener->CheckAndResetCounters(0, 0, 0, 1, 1, 0);
+
+ ASSERT_OK(Put("foo", "aaa3"));
+ ASSERT_OK(Put("bar", "bbb3"));
+ ASSERT_OK(Flush());
+ test_env->SetStatus(Status::NotSupported("not supported"));
+ ASSERT_NOK(
+ dbfull()->CompactRange(CompactRangeOptions(), &kRangeStart, &kRangeEnd));
+ ASSERT_NOK(dbfull()->TEST_WaitForCompact());
+ listener->CheckAndResetCounters(1, 1, 0, 1, 1, 1);
+ ASSERT_TRUE(listener->last_failure_.IsNotSupported());
+
+ // Reset
+ test_env->SetStatus(Status::OK());
+ DestroyAndReopen(options);
+
+ // Verify that an empty table file that is immediately deleted gives Aborted
+ // status to listener.
+ ASSERT_OK(Put("baz", "z"));
+ ASSERT_OK(SingleDelete("baz"));
+ ASSERT_OK(Flush());
+ listener->CheckAndResetCounters(1, 1, 1, 0, 0, 0);
+ ASSERT_TRUE(listener->last_failure_.IsAborted());
+
+ // Also in compaction
+ ASSERT_OK(Put("baz", "z"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ kRangeStart, kRangeEnd));
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ listener->CheckAndResetCounters(2, 2, 0, 1, 1, 1);
+ ASSERT_TRUE(listener->last_failure_.IsAborted());
+
+ Close(); // Avoid UAF on listener
+}
+
+class MemTableSealedListener : public EventListener {
+ private:
+ SequenceNumber latest_seq_number_;
+
+ public:
+ MemTableSealedListener() {}
+ void OnMemTableSealed(const MemTableInfo& info) override {
+ latest_seq_number_ = info.first_seqno;
+ }
+
+ void OnFlushCompleted(DB* /*db*/,
+ const FlushJobInfo& flush_job_info) override {
+ ASSERT_LE(flush_job_info.smallest_seqno, latest_seq_number_);
+ }
+};
+
+TEST_F(EventListenerTest, MemTableSealedListenerTest) {
+ auto listener = std::make_shared<MemTableSealedListener>();
+ Options options;
+ options.env = CurrentOptions().env;
+ options.create_if_missing = true;
+ options.listeners.push_back(listener);
+ DestroyAndReopen(options);
+
+ for (unsigned int i = 0; i < 10; i++) {
+ std::string tag = std::to_string(i);
+ ASSERT_OK(Put("foo" + tag, "aaa"));
+ ASSERT_OK(Put("bar" + tag, "bbb"));
+
+ ASSERT_OK(Flush());
+ }
+}
+
+class ColumnFamilyHandleDeletionStartedListener : public EventListener {
+ private:
+ std::vector<std::string> cfs_;
+ int counter;
+
+ public:
+ explicit ColumnFamilyHandleDeletionStartedListener(
+ const std::vector<std::string>& cfs)
+ : cfs_(cfs), counter(0) {
+ cfs_.insert(cfs_.begin(), kDefaultColumnFamilyName);
+ }
+ void OnColumnFamilyHandleDeletionStarted(
+ ColumnFamilyHandle* handle) override {
+ ASSERT_EQ(cfs_[handle->GetID()], handle->GetName());
+ counter++;
+ }
+ int getCounter() { return counter; }
+};
+
+TEST_F(EventListenerTest, ColumnFamilyHandleDeletionStartedListenerTest) {
+ std::vector<std::string> cfs{"pikachu", "eevee", "Mewtwo"};
+ auto listener =
+ std::make_shared<ColumnFamilyHandleDeletionStartedListener>(cfs);
+ Options options;
+ options.env = CurrentOptions().env;
+ options.create_if_missing = true;
+ options.listeners.push_back(listener);
+ CreateAndReopenWithCF(cfs, options);
+ ASSERT_EQ(handles_.size(), 4);
+ delete handles_[3];
+ delete handles_[2];
+ delete handles_[1];
+ handles_.resize(1);
+ ASSERT_EQ(listener->getCounter(), 3);
+}
+
+class BackgroundErrorListener : public EventListener {
+ private:
+ SpecialEnv* env_;
+ int counter_;
+
+ public:
+ BackgroundErrorListener(SpecialEnv* env) : env_(env), counter_(0) {}
+
+ void OnBackgroundError(BackgroundErrorReason /*reason*/,
+ Status* bg_error) override {
+ if (counter_ == 0) {
+ // suppress the first error and disable write-dropping such that a retry
+ // can succeed.
+ *bg_error = Status::OK();
+ env_->drop_writes_.store(false, std::memory_order_release);
+ env_->SetMockSleep(false);
+ }
+ ++counter_;
+ }
+
+ int counter() { return counter_; }
+};
+
+TEST_F(EventListenerTest, BackgroundErrorListenerFailedFlushTest) {
+ auto listener = std::make_shared<BackgroundErrorListener>(env_);
+ Options options;
+ options.create_if_missing = true;
+ options.env = env_;
+ options.listeners.push_back(listener);
+ options.memtable_factory.reset(test::NewSpecialSkipListFactory(1));
+ options.paranoid_checks = true;
+ DestroyAndReopen(options);
+
+ // the usual TEST_WaitForFlushMemTable() doesn't work for failed flushes, so
+ // forge a custom one for the failed flush case.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::BGWorkFlush:done",
+ "EventListenerTest:BackgroundErrorListenerFailedFlushTest:1"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ env_->drop_writes_.store(true, std::memory_order_release);
+ env_->SetMockSleep();
+
+ ASSERT_OK(Put("key0", "val"));
+ ASSERT_OK(Put("key1", "val"));
+ TEST_SYNC_POINT("EventListenerTest:BackgroundErrorListenerFailedFlushTest:1");
+ ASSERT_EQ(1, listener->counter());
+ ASSERT_OK(Put("key2", "val"));
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_EQ(1, NumTableFilesAtLevel(0));
+}
+
+TEST_F(EventListenerTest, BackgroundErrorListenerFailedCompactionTest) {
+ auto listener = std::make_shared<BackgroundErrorListener>(env_);
+ Options options;
+ options.create_if_missing = true;
+ options.disable_auto_compactions = true;
+ options.env = env_;
+ options.level0_file_num_compaction_trigger = 2;
+ options.listeners.push_back(listener);
+ options.memtable_factory.reset(test::NewSpecialSkipListFactory(2));
+ options.paranoid_checks = true;
+ DestroyAndReopen(options);
+
+ // third iteration triggers the second memtable's flush
+ for (int i = 0; i < 3; ++i) {
+ ASSERT_OK(Put("key0", "val"));
+ if (i > 0) {
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ }
+ ASSERT_OK(Put("key1", "val"));
+ }
+ ASSERT_EQ(2, NumTableFilesAtLevel(0));
+
+ env_->drop_writes_.store(true, std::memory_order_release);
+ env_->SetMockSleep();
+ ASSERT_OK(dbfull()->SetOptions({{"disable_auto_compactions", "false"}}));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(1, listener->counter());
+
+ // trigger flush so compaction is triggered again; this time it succeeds
+ // The previous failed compaction may get retried automatically, so we may
+ // be left with 0 or 1 files in level 1, depending on when the retry gets
+ // scheduled
+ ASSERT_OK(Put("key0", "val"));
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_LE(1, NumTableFilesAtLevel(0));
+}
+
+class TestFileOperationListener : public EventListener {
+ public:
+ TestFileOperationListener() {
+ file_reads_.store(0);
+ file_reads_success_.store(0);
+ file_writes_.store(0);
+ file_writes_success_.store(0);
+ file_flushes_.store(0);
+ file_flushes_success_.store(0);
+ file_closes_.store(0);
+ file_closes_success_.store(0);
+ file_syncs_.store(0);
+ file_syncs_success_.store(0);
+ file_truncates_.store(0);
+ file_truncates_success_.store(0);
+ file_seq_reads_.store(0);
+ blob_file_reads_.store(0);
+ blob_file_writes_.store(0);
+ blob_file_flushes_.store(0);
+ blob_file_closes_.store(0);
+ blob_file_syncs_.store(0);
+ blob_file_truncates_.store(0);
+ }
+
+ void OnFileReadFinish(const FileOperationInfo& info) override {
+ ++file_reads_;
+ if (info.status.ok()) {
+ ++file_reads_success_;
+ }
+ if (info.path.find("MANIFEST") != std::string::npos) {
+ ++file_seq_reads_;
+ }
+ if (EndsWith(info.path, ".blob")) {
+ ++blob_file_reads_;
+ }
+ ReportDuration(info);
+ }
+
+ void OnFileWriteFinish(const FileOperationInfo& info) override {
+ ++file_writes_;
+ if (info.status.ok()) {
+ ++file_writes_success_;
+ }
+ if (EndsWith(info.path, ".blob")) {
+ ++blob_file_writes_;
+ }
+ ReportDuration(info);
+ }
+
+ void OnFileFlushFinish(const FileOperationInfo& info) override {
+ ++file_flushes_;
+ if (info.status.ok()) {
+ ++file_flushes_success_;
+ }
+ if (EndsWith(info.path, ".blob")) {
+ ++blob_file_flushes_;
+ }
+ ReportDuration(info);
+ }
+
+ void OnFileCloseFinish(const FileOperationInfo& info) override {
+ ++file_closes_;
+ if (info.status.ok()) {
+ ++file_closes_success_;
+ }
+ if (EndsWith(info.path, ".blob")) {
+ ++blob_file_closes_;
+ }
+ ReportDuration(info);
+ }
+
+ void OnFileSyncFinish(const FileOperationInfo& info) override {
+ ++file_syncs_;
+ if (info.status.ok()) {
+ ++file_syncs_success_;
+ }
+ if (EndsWith(info.path, ".blob")) {
+ ++blob_file_syncs_;
+ }
+ ReportDuration(info);
+ }
+
+ void OnFileTruncateFinish(const FileOperationInfo& info) override {
+ ++file_truncates_;
+ if (info.status.ok()) {
+ ++file_truncates_success_;
+ }
+ if (EndsWith(info.path, ".blob")) {
+ ++blob_file_truncates_;
+ }
+ ReportDuration(info);
+ }
+
+ bool ShouldBeNotifiedOnFileIO() override { return true; }
+
+ std::atomic<size_t> file_reads_;
+ std::atomic<size_t> file_reads_success_;
+ std::atomic<size_t> file_writes_;
+ std::atomic<size_t> file_writes_success_;
+ std::atomic<size_t> file_flushes_;
+ std::atomic<size_t> file_flushes_success_;
+ std::atomic<size_t> file_closes_;
+ std::atomic<size_t> file_closes_success_;
+ std::atomic<size_t> file_syncs_;
+ std::atomic<size_t> file_syncs_success_;
+ std::atomic<size_t> file_truncates_;
+ std::atomic<size_t> file_truncates_success_;
+ std::atomic<size_t> file_seq_reads_;
+ std::atomic<size_t> blob_file_reads_;
+ std::atomic<size_t> blob_file_writes_;
+ std::atomic<size_t> blob_file_flushes_;
+ std::atomic<size_t> blob_file_closes_;
+ std::atomic<size_t> blob_file_syncs_;
+ std::atomic<size_t> blob_file_truncates_;
+
+ private:
+ void ReportDuration(const FileOperationInfo& info) const {
+ ASSERT_GT(info.duration.count(), 0);
+ }
+};
+
+TEST_F(EventListenerTest, OnFileOperationTest) {
+ Options options;
+ options.env = CurrentOptions().env;
+ options.create_if_missing = true;
+
+ TestFileOperationListener* listener = new TestFileOperationListener();
+ options.listeners.emplace_back(listener);
+
+ options.use_direct_io_for_flush_and_compaction = false;
+ Status s = TryReopen(options);
+ if (s.IsInvalidArgument()) {
+ options.use_direct_io_for_flush_and_compaction = false;
+ } else {
+ ASSERT_OK(s);
+ }
+ DestroyAndReopen(options);
+ ASSERT_OK(Put("foo", "aaa"));
+ ASSERT_OK(dbfull()->Flush(FlushOptions()));
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_GE(listener->file_writes_.load(),
+ listener->file_writes_success_.load());
+ ASSERT_GT(listener->file_writes_.load(), 0);
+ ASSERT_GE(listener->file_flushes_.load(),
+ listener->file_flushes_success_.load());
+ ASSERT_GT(listener->file_flushes_.load(), 0);
+ Close();
+
+ Reopen(options);
+ ASSERT_GE(listener->file_reads_.load(), listener->file_reads_success_.load());
+ ASSERT_GT(listener->file_reads_.load(), 0);
+ ASSERT_GE(listener->file_closes_.load(),
+ listener->file_closes_success_.load());
+ ASSERT_GT(listener->file_closes_.load(), 0);
+ ASSERT_GE(listener->file_syncs_.load(), listener->file_syncs_success_.load());
+ ASSERT_GT(listener->file_syncs_.load(), 0);
+ if (true == options.use_direct_io_for_flush_and_compaction) {
+ ASSERT_GE(listener->file_truncates_.load(),
+ listener->file_truncates_success_.load());
+ ASSERT_GT(listener->file_truncates_.load(), 0);
+ }
+}
+
+TEST_F(EventListenerTest, OnBlobFileOperationTest) {
+ Options options;
+ options.env = CurrentOptions().env;
+ options.create_if_missing = true;
+ TestFileOperationListener* listener = new TestFileOperationListener();
+ options.listeners.emplace_back(listener);
+ options.disable_auto_compactions = true;
+ options.enable_blob_files = true;
+ options.min_blob_size = 0;
+ options.enable_blob_garbage_collection = true;
+ options.blob_garbage_collection_age_cutoff = 0.5;
+
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put("Key1", "blob_value1"));
+ ASSERT_OK(Put("Key2", "blob_value2"));
+ ASSERT_OK(Put("Key3", "blob_value3"));
+ ASSERT_OK(Put("Key4", "blob_value4"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put("Key3", "new_blob_value3"));
+ ASSERT_OK(Put("Key4", "new_blob_value4"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put("Key5", "blob_value5"));
+ ASSERT_OK(Put("Key6", "blob_value6"));
+ ASSERT_OK(Flush());
+
+ ASSERT_GT(listener->blob_file_writes_.load(), 0U);
+ ASSERT_GT(listener->blob_file_flushes_.load(), 0U);
+ Close();
+
+ Reopen(options);
+ ASSERT_GT(listener->blob_file_closes_.load(), 0U);
+ ASSERT_GT(listener->blob_file_syncs_.load(), 0U);
+ if (true == options.use_direct_io_for_flush_and_compaction) {
+ ASSERT_GT(listener->blob_file_truncates_.load(), 0U);
+ }
+}
+
+TEST_F(EventListenerTest, ReadManifestAndWALOnRecovery) {
+ Options options;
+ options.env = CurrentOptions().env;
+ options.create_if_missing = true;
+
+ TestFileOperationListener* listener = new TestFileOperationListener();
+ options.listeners.emplace_back(listener);
+
+ options.use_direct_io_for_flush_and_compaction = false;
+ Status s = TryReopen(options);
+ if (s.IsInvalidArgument()) {
+ options.use_direct_io_for_flush_and_compaction = false;
+ } else {
+ ASSERT_OK(s);
+ }
+ DestroyAndReopen(options);
+ ASSERT_OK(Put("foo", "aaa"));
+ Close();
+
+ size_t seq_reads = listener->file_seq_reads_.load();
+ Reopen(options);
+ ASSERT_GT(listener->file_seq_reads_.load(), seq_reads);
+}
+
+class BlobDBJobLevelEventListenerTest : public EventListener {
+ public:
+ explicit BlobDBJobLevelEventListenerTest(EventListenerTest* test)
+ : test_(test), call_count_(0) {}
+
+ const VersionStorageInfo* GetVersionStorageInfo() const {
+ VersionSet* const versions = test_->dbfull()->GetVersionSet();
+ assert(versions);
+
+ ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault();
+ EXPECT_NE(cfd, nullptr);
+
+ Version* const current = cfd->current();
+ EXPECT_NE(current, nullptr);
+
+ const VersionStorageInfo* const storage_info = current->storage_info();
+ EXPECT_NE(storage_info, nullptr);
+
+ return storage_info;
+ }
+
+ void CheckBlobFileAdditions(
+ const std::vector<BlobFileAdditionInfo>& blob_file_addition_infos) const {
+ const auto* vstorage = GetVersionStorageInfo();
+
+ EXPECT_FALSE(blob_file_addition_infos.empty());
+
+ for (const auto& blob_file_addition_info : blob_file_addition_infos) {
+ const auto meta = vstorage->GetBlobFileMetaData(
+ blob_file_addition_info.blob_file_number);
+
+ EXPECT_NE(meta, nullptr);
+ EXPECT_EQ(meta->GetBlobFileNumber(),
+ blob_file_addition_info.blob_file_number);
+ EXPECT_EQ(meta->GetTotalBlobBytes(),
+ blob_file_addition_info.total_blob_bytes);
+ EXPECT_EQ(meta->GetTotalBlobCount(),
+ blob_file_addition_info.total_blob_count);
+ EXPECT_FALSE(blob_file_addition_info.blob_file_path.empty());
+ }
+ }
+
+ std::vector<std::string> GetFlushedFiles() {
+ std::lock_guard<std::mutex> lock(mutex_);
+ std::vector<std::string> result;
+ for (const auto& fname : flushed_files_) {
+ result.push_back(fname);
+ }
+ return result;
+ }
+
+ void OnFlushCompleted(DB* /*db*/, const FlushJobInfo& info) override {
+ call_count_++;
+
+ {
+ std::lock_guard<std::mutex> lock(mutex_);
+ flushed_files_.push_back(info.file_path);
+ }
+
+ EXPECT_EQ(info.blob_compression_type, kNoCompression);
+
+ CheckBlobFileAdditions(info.blob_file_addition_infos);
+ }
+
+ void OnCompactionCompleted(DB* /*db*/,
+ const CompactionJobInfo& info) override {
+ call_count_++;
+
+ EXPECT_EQ(info.blob_compression_type, kNoCompression);
+
+ CheckBlobFileAdditions(info.blob_file_addition_infos);
+
+ EXPECT_FALSE(info.blob_file_garbage_infos.empty());
+
+ for (const auto& blob_file_garbage_info : info.blob_file_garbage_infos) {
+ EXPECT_GT(blob_file_garbage_info.blob_file_number, 0U);
+ EXPECT_GT(blob_file_garbage_info.garbage_blob_count, 0U);
+ EXPECT_GT(blob_file_garbage_info.garbage_blob_bytes, 0U);
+ EXPECT_FALSE(blob_file_garbage_info.blob_file_path.empty());
+ }
+ }
+
+ EventListenerTest* test_;
+ uint32_t call_count_;
+
+ private:
+ std::vector<std::string> flushed_files_;
+ std::mutex mutex_;
+};
+
+// Test OnFlushCompleted EventListener called for blob files
+TEST_F(EventListenerTest, BlobDBOnFlushCompleted) {
+ Options options;
+ options.env = CurrentOptions().env;
+ options.enable_blob_files = true;
+ options.create_if_missing = true;
+ options.disable_auto_compactions = true;
+
+ options.min_blob_size = 0;
+ BlobDBJobLevelEventListenerTest* blob_event_listener =
+ new BlobDBJobLevelEventListenerTest(this);
+ options.listeners.emplace_back(blob_event_listener);
+
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put("Key1", "blob_value1"));
+ ASSERT_OK(Put("Key2", "blob_value2"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put("Key3", "blob_value3"));
+ ASSERT_OK(Flush());
+
+ ASSERT_EQ(Get("Key1"), "blob_value1");
+ ASSERT_EQ(Get("Key2"), "blob_value2");
+ ASSERT_EQ(Get("Key3"), "blob_value3");
+
+ ASSERT_GT(blob_event_listener->call_count_, 0U);
+}
+
+// Test OnCompactionCompleted EventListener called for blob files
+TEST_F(EventListenerTest, BlobDBOnCompactionCompleted) {
+ Options options;
+ options.env = CurrentOptions().env;
+ options.enable_blob_files = true;
+ options.create_if_missing = true;
+ options.disable_auto_compactions = true;
+ options.min_blob_size = 0;
+ BlobDBJobLevelEventListenerTest* blob_event_listener =
+ new BlobDBJobLevelEventListenerTest(this);
+ options.listeners.emplace_back(blob_event_listener);
+
+ options.enable_blob_garbage_collection = true;
+ options.blob_garbage_collection_age_cutoff = 0.5;
+
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put("Key1", "blob_value1"));
+ ASSERT_OK(Put("Key2", "blob_value2"));
+ ASSERT_OK(Put("Key3", "blob_value3"));
+ ASSERT_OK(Put("Key4", "blob_value4"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put("Key3", "new_blob_value3"));
+ ASSERT_OK(Put("Key4", "new_blob_value4"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put("Key5", "blob_value5"));
+ ASSERT_OK(Put("Key6", "blob_value6"));
+ ASSERT_OK(Flush());
+
+ blob_event_listener->call_count_ = 0;
+ constexpr Slice* begin = nullptr;
+ constexpr Slice* end = nullptr;
+
+ // On compaction, because of blob_garbage_collection_age_cutoff, it will
+ // delete the oldest blob file and create a new blob file during compaction.
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), begin, end));
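+ // Illustrative note (not part of the original test): with
+ // blob_garbage_collection_age_cutoff = 0.5 and the three blob files written
+ // above, roughly the oldest half of the blob files (here, the oldest one) is
+ // eligible for garbage collection, so this compaction relocates that file's
+ // surviving blobs into a newly written blob file.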
+
+ // Make sure OnCompactionCompleted is called.
+ ASSERT_GT(blob_event_listener->call_count_, 0U);
+}
+
+// Test CompactFiles calls OnCompactionCompleted EventListener for blob files
+// and populate the blob files info.
+TEST_F(EventListenerTest, BlobDBCompactFiles) {
+ Options options;
+ options.env = CurrentOptions().env;
+ options.enable_blob_files = true;
+ options.create_if_missing = true;
+ options.disable_auto_compactions = true;
+ options.min_blob_size = 0;
+ options.enable_blob_garbage_collection = true;
+ options.blob_garbage_collection_age_cutoff = 0.5;
+
+ BlobDBJobLevelEventListenerTest* blob_event_listener =
+ new BlobDBJobLevelEventListenerTest(this);
+ options.listeners.emplace_back(blob_event_listener);
+
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put("Key1", "blob_value1"));
+ ASSERT_OK(Put("Key2", "blob_value2"));
+ ASSERT_OK(Put("Key3", "blob_value3"));
+ ASSERT_OK(Put("Key4", "blob_value4"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put("Key3", "new_blob_value3"));
+ ASSERT_OK(Put("Key4", "new_blob_value4"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put("Key5", "blob_value5"));
+ ASSERT_OK(Put("Key6", "blob_value6"));
+ ASSERT_OK(Flush());
+
+ std::vector<std::string> output_file_names;
+ CompactionJobInfo compaction_job_info;
+
+ // On compaction, because of blob_garbage_collection_age_cutoff, it will
+ // delete the oldest blob file and create a new blob file during compaction,
+ // which will be reported in output_file_names.
+ ASSERT_OK(dbfull()->CompactFiles(
+ CompactionOptions(), blob_event_listener->GetFlushedFiles(), 1, -1,
+ &output_file_names, &compaction_job_info));
+
+ bool is_blob_in_output = false;
+ for (const auto& file : output_file_names) {
+ if (EndsWith(file, ".blob")) {
+ is_blob_in_output = true;
+ }
+ }
+ ASSERT_TRUE(is_blob_in_output);
+
+ for (const auto& blob_file_addition_info :
+ compaction_job_info.blob_file_addition_infos) {
+ EXPECT_GT(blob_file_addition_info.blob_file_number, 0U);
+ EXPECT_GT(blob_file_addition_info.total_blob_bytes, 0U);
+ EXPECT_GT(blob_file_addition_info.total_blob_count, 0U);
+ EXPECT_FALSE(blob_file_addition_info.blob_file_path.empty());
+ }
+
+ for (const auto& blob_file_garbage_info :
+ compaction_job_info.blob_file_garbage_infos) {
+ EXPECT_GT(blob_file_garbage_info.blob_file_number, 0U);
+ EXPECT_GT(blob_file_garbage_info.garbage_blob_count, 0U);
+ EXPECT_GT(blob_file_garbage_info.garbage_blob_bytes, 0U);
+ EXPECT_FALSE(blob_file_garbage_info.blob_file_path.empty());
+ }
+}
+
+class BlobDBFileLevelEventListener : public EventListener {
+ public:
+ void OnBlobFileCreationStarted(
+ const BlobFileCreationBriefInfo& info) override {
+ files_started_++;
+ EXPECT_FALSE(info.db_name.empty());
+ EXPECT_FALSE(info.cf_name.empty());
+ EXPECT_FALSE(info.file_path.empty());
+ EXPECT_GT(info.job_id, 0);
+ }
+
+ void OnBlobFileCreated(const BlobFileCreationInfo& info) override {
+ files_created_++;
+ EXPECT_FALSE(info.db_name.empty());
+ EXPECT_FALSE(info.cf_name.empty());
+ EXPECT_FALSE(info.file_path.empty());
+ EXPECT_GT(info.job_id, 0);
+ EXPECT_GT(info.total_blob_count, 0U);
+ EXPECT_GT(info.total_blob_bytes, 0U);
+ EXPECT_EQ(info.file_checksum, kUnknownFileChecksum);
+ EXPECT_EQ(info.file_checksum_func_name, kUnknownFileChecksumFuncName);
+ EXPECT_TRUE(info.status.ok());
+ }
+
+ void OnBlobFileDeleted(const BlobFileDeletionInfo& info) override {
+ files_deleted_++;
+ EXPECT_FALSE(info.db_name.empty());
+ EXPECT_FALSE(info.file_path.empty());
+ EXPECT_GT(info.job_id, 0);
+ EXPECT_TRUE(info.status.ok());
+ }
+
+ void CheckCounters() {
+ EXPECT_EQ(files_started_, files_created_);
+ EXPECT_GT(files_started_, 0U);
+ EXPECT_GT(files_deleted_, 0U);
+ EXPECT_LT(files_deleted_, files_created_);
+ }
+
+ private:
+ std::atomic<uint32_t> files_started_{};
+ std::atomic<uint32_t> files_created_{};
+ std::atomic<uint32_t> files_deleted_{};
+};
+
+TEST_F(EventListenerTest, BlobDBFileTest) {
+ Options options;
+ options.env = CurrentOptions().env;
+ options.enable_blob_files = true;
+ options.create_if_missing = true;
+ options.disable_auto_compactions = true;
+ options.min_blob_size = 0;
+ options.enable_blob_garbage_collection = true;
+ options.blob_garbage_collection_age_cutoff = 0.5;
+
+ BlobDBFileLevelEventListener* blob_event_listener =
+ new BlobDBFileLevelEventListener();
+ options.listeners.emplace_back(blob_event_listener);
+
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put("Key1", "blob_value1"));
+ ASSERT_OK(Put("Key2", "blob_value2"));
+ ASSERT_OK(Put("Key3", "blob_value3"));
+ ASSERT_OK(Put("Key4", "blob_value4"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put("Key3", "new_blob_value3"));
+ ASSERT_OK(Put("Key4", "new_blob_value4"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put("Key5", "blob_value5"));
+ ASSERT_OK(Put("Key6", "blob_value6"));
+ ASSERT_OK(Flush());
+
+ constexpr Slice* begin = nullptr;
+ constexpr Slice* end = nullptr;
+
+ // On compaction, because of blob_garbage_collection_age_cutoff, it will
+ // delete the oldest blob file and create a new blob file during compaction.
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), begin, end));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ blob_event_listener->CheckCounters();
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // ROCKSDB_LITE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/log_format.h b/src/rocksdb/db/log_format.h
new file mode 100644
index 000000000..d397372f4
--- /dev/null
+++ b/src/rocksdb/db/log_format.h
@@ -0,0 +1,51 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Log format information shared by reader and writer.
+// See ../doc/log_format.txt for more detail.
+
+#pragma once
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace log {
+
+enum RecordType {
+ // Zero is reserved for preallocated files
+ kZeroType = 0,
+ kFullType = 1,
+
+ // For fragments
+ kFirstType = 2,
+ kMiddleType = 3,
+ kLastType = 4,
+
+ // For recycled log files
+ kRecyclableFullType = 5,
+ kRecyclableFirstType = 6,
+ kRecyclableMiddleType = 7,
+ kRecyclableLastType = 8,
+
+ // Compression Type
+ kSetCompressionType = 9,
+};
+static const int kMaxRecordType = kSetCompressionType;
+
+static const unsigned int kBlockSize = 32768;
+
+// Header is checksum (4 bytes), length (2 bytes), type (1 byte)
+static const int kHeaderSize = 4 + 2 + 1;
+
+// Recyclable header is checksum (4 bytes), length (2 bytes), type (1 byte),
+// log number (4 bytes).
+static const int kRecyclableHeaderSize = 4 + 2 + 1 + 4;
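+
+// Illustrative arithmetic (explanatory note, not used by the implementation):
+// with kBlockSize = 32768 and kHeaderSize = 7, a single fragment can carry at
+// most 32768 - 7 = 32761 bytes of payload. A record that does not fit in the
+// space remaining in the current block is split into kFirstType / kMiddleType /
+// kLastType fragments, each carrying its own header.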
+
+} // namespace log
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/log_reader.cc b/src/rocksdb/db/log_reader.cc
new file mode 100644
index 000000000..a21868776
--- /dev/null
+++ b/src/rocksdb/db/log_reader.cc
@@ -0,0 +1,854 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/log_reader.h"
+
+#include <stdio.h>
+
+#include "file/sequence_file_reader.h"
+#include "port/lang.h"
+#include "rocksdb/env.h"
+#include "test_util/sync_point.h"
+#include "util/coding.h"
+#include "util/crc32c.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace log {
+
+Reader::Reporter::~Reporter() {}
+
+Reader::Reader(std::shared_ptr<Logger> info_log,
+ std::unique_ptr<SequentialFileReader>&& _file,
+ Reporter* reporter, bool checksum, uint64_t log_num)
+ : info_log_(info_log),
+ file_(std::move(_file)),
+ reporter_(reporter),
+ checksum_(checksum),
+ backing_store_(new char[kBlockSize]),
+ buffer_(),
+ eof_(false),
+ read_error_(false),
+ eof_offset_(0),
+ last_record_offset_(0),
+ end_of_buffer_offset_(0),
+ log_number_(log_num),
+ recycled_(false),
+ first_record_read_(false),
+ compression_type_(kNoCompression),
+ compression_type_record_read_(false),
+ uncompress_(nullptr),
+ hash_state_(nullptr),
+ uncompress_hash_state_(nullptr) {}
+
+Reader::~Reader() {
+ delete[] backing_store_;
+ if (uncompress_) {
+ delete uncompress_;
+ }
+ if (hash_state_) {
+ XXH3_freeState(hash_state_);
+ }
+ if (uncompress_hash_state_) {
+ XXH3_freeState(uncompress_hash_state_);
+ }
+}
+
+// For kAbsoluteConsistency, on clean shutdown we don't expect any error
+// in the log files. For other modes, we can ignore only incomplete records
+// in the last log file, which are presumably due to a write in progress
+// during restart (or from log recycling).
+//
+// TODO krad: Evaluate if we need to move to a more strict mode where we
+// restrict the inconsistency to only the last log
+bool Reader::ReadRecord(Slice* record, std::string* scratch,
+ WALRecoveryMode wal_recovery_mode,
+ uint64_t* record_checksum) {
+ scratch->clear();
+ record->clear();
+ if (record_checksum != nullptr) {
+ if (hash_state_ == nullptr) {
+ hash_state_ = XXH3_createState();
+ }
+ XXH3_64bits_reset(hash_state_);
+ }
+ if (uncompress_) {
+ uncompress_->Reset();
+ }
+ bool in_fragmented_record = false;
+ // Record offset of the logical record that we're reading
+ // 0 is a dummy value to make compilers happy
+ uint64_t prospective_record_offset = 0;
+
+ Slice fragment;
+ while (true) {
+ uint64_t physical_record_offset = end_of_buffer_offset_ - buffer_.size();
+ size_t drop_size = 0;
+ const unsigned int record_type =
+ ReadPhysicalRecord(&fragment, &drop_size, record_checksum);
+ switch (record_type) {
+ case kFullType:
+ case kRecyclableFullType:
+ if (in_fragmented_record && !scratch->empty()) {
+ // Handle bug in earlier versions of log::Writer where
+ // it could emit an empty kFirstType record at the tail end
+ // of a block followed by a kFullType or kFirstType record
+ // at the beginning of the next block.
+ ReportCorruption(scratch->size(), "partial record without end(1)");
+ }
+ // No need to compute record_checksum since the record
+ // consists of a single fragment and the checksum is computed
+ // in ReadPhysicalRecord() if WAL compression is enabled
+ if (record_checksum != nullptr && uncompress_ == nullptr) {
+ // No need to stream since the record is a single fragment
+ *record_checksum = XXH3_64bits(fragment.data(), fragment.size());
+ }
+ prospective_record_offset = physical_record_offset;
+ scratch->clear();
+ *record = fragment;
+ last_record_offset_ = prospective_record_offset;
+ first_record_read_ = true;
+ return true;
+
+ case kFirstType:
+ case kRecyclableFirstType:
+ if (in_fragmented_record && !scratch->empty()) {
+ // Handle bug in earlier versions of log::Writer where
+ // it could emit an empty kFirstType record at the tail end
+ // of a block followed by a kFullType or kFirstType record
+ // at the beginning of the next block.
+ ReportCorruption(scratch->size(), "partial record without end(2)");
+ XXH3_64bits_reset(hash_state_);
+ }
+ if (record_checksum != nullptr) {
+ XXH3_64bits_update(hash_state_, fragment.data(), fragment.size());
+ }
+ prospective_record_offset = physical_record_offset;
+ scratch->assign(fragment.data(), fragment.size());
+ in_fragmented_record = true;
+ break;
+
+ case kMiddleType:
+ case kRecyclableMiddleType:
+ if (!in_fragmented_record) {
+ ReportCorruption(fragment.size(),
+ "missing start of fragmented record(1)");
+ } else {
+ if (record_checksum != nullptr) {
+ XXH3_64bits_update(hash_state_, fragment.data(), fragment.size());
+ }
+ scratch->append(fragment.data(), fragment.size());
+ }
+ break;
+
+ case kLastType:
+ case kRecyclableLastType:
+ if (!in_fragmented_record) {
+ ReportCorruption(fragment.size(),
+ "missing start of fragmented record(2)");
+ } else {
+ if (record_checksum != nullptr) {
+ XXH3_64bits_update(hash_state_, fragment.data(), fragment.size());
+ *record_checksum = XXH3_64bits_digest(hash_state_);
+ }
+ scratch->append(fragment.data(), fragment.size());
+ *record = Slice(*scratch);
+ last_record_offset_ = prospective_record_offset;
+ first_record_read_ = true;
+ return true;
+ }
+ break;
+
+ case kBadHeader:
+ if (wal_recovery_mode == WALRecoveryMode::kAbsoluteConsistency ||
+ wal_recovery_mode == WALRecoveryMode::kPointInTimeRecovery) {
+ // In clean shutdown we don't expect any error in the log files.
+ // In point-in-time recovery an incomplete record at the end could
+ // produce a hole in the recovered data. Report an error here, which
+ // higher layers can choose to ignore when it's provable there is no
+ // hole.
+ ReportCorruption(drop_size, "truncated header");
+ }
+ FALLTHROUGH_INTENDED;
+
+ case kEof:
+ if (in_fragmented_record) {
+ if (wal_recovery_mode == WALRecoveryMode::kAbsoluteConsistency ||
+ wal_recovery_mode == WALRecoveryMode::kPointInTimeRecovery) {
+ // In clean shutdown we don't expect any error in the log files.
+ // In point-in-time recovery an incomplete record at the end could
+ // produce a hole in the recovered data. Report an error here, which
+ // higher layers can choose to ignore when it's provable there is no
+ // hole.
+ ReportCorruption(scratch->size(), "error reading trailing data");
+ }
+ // This can be caused by the writer dying immediately after
+ // writing a physical record but before completing the next; don't
+ // treat it as a corruption, just ignore the entire logical record.
+ scratch->clear();
+ }
+ return false;
+
+ case kOldRecord:
+ if (wal_recovery_mode != WALRecoveryMode::kSkipAnyCorruptedRecords) {
+ // Treat a record from a previous instance of the log as EOF.
+ if (in_fragmented_record) {
+ if (wal_recovery_mode == WALRecoveryMode::kAbsoluteConsistency ||
+ wal_recovery_mode == WALRecoveryMode::kPointInTimeRecovery) {
+ // In clean shutdown we don't expect any error in the log files.
+ // In point-in-time recovery an incomplete record at the end could
+ // produce a hole in the recovered data. Report an error here,
+ // which higher layers can choose to ignore when it's provable
+ // there is no hole.
+ ReportCorruption(scratch->size(), "error reading trailing data");
+ }
+ // This can be caused by the writer dying immediately after
+ // writing a physical record but before completing the next; don't
+ // treat it as a corruption, just ignore the entire logical record.
+ scratch->clear();
+ }
+ return false;
+ }
+ FALLTHROUGH_INTENDED;
+
+ case kBadRecord:
+ if (in_fragmented_record) {
+ ReportCorruption(scratch->size(), "error in middle of record");
+ in_fragmented_record = false;
+ scratch->clear();
+ }
+ break;
+
+ case kBadRecordLen:
+ if (eof_) {
+ if (wal_recovery_mode == WALRecoveryMode::kAbsoluteConsistency ||
+ wal_recovery_mode == WALRecoveryMode::kPointInTimeRecovery) {
+ // In clean shutdown we don't expect any error in the log files.
+ // In point-in-time recovery an incomplete record at the end could
+ // produce a hole in the recovered data. Report an error here, which
+ // higher layers can choose to ignore when it's provable there is no
+ // hole.
+ ReportCorruption(drop_size, "truncated record body");
+ }
+ return false;
+ }
+ FALLTHROUGH_INTENDED;
+
+ case kBadRecordChecksum:
+ if (recycled_ && wal_recovery_mode ==
+ WALRecoveryMode::kTolerateCorruptedTailRecords) {
+ scratch->clear();
+ return false;
+ }
+ if (record_type == kBadRecordLen) {
+ ReportCorruption(drop_size, "bad record length");
+ } else {
+ ReportCorruption(drop_size, "checksum mismatch");
+ }
+ if (in_fragmented_record) {
+ ReportCorruption(scratch->size(), "error in middle of record");
+ in_fragmented_record = false;
+ scratch->clear();
+ }
+ break;
+
+ case kSetCompressionType: {
+ if (compression_type_record_read_) {
+ ReportCorruption(fragment.size(),
+ "read multiple SetCompressionType records");
+ }
+ if (first_record_read_) {
+ ReportCorruption(fragment.size(),
+ "SetCompressionType not the first record");
+ }
+ prospective_record_offset = physical_record_offset;
+ scratch->clear();
+ last_record_offset_ = prospective_record_offset;
+ CompressionTypeRecord compression_record(kNoCompression);
+ Status s = compression_record.DecodeFrom(&fragment);
+ if (!s.ok()) {
+ ReportCorruption(fragment.size(),
+ "could not decode SetCompressionType record");
+ } else {
+ InitCompression(compression_record);
+ }
+ break;
+ }
+
+ default: {
+ char buf[40];
+ snprintf(buf, sizeof(buf), "unknown record type %u", record_type);
+ ReportCorruption(
+ (fragment.size() + (in_fragmented_record ? scratch->size() : 0)),
+ buf);
+ in_fragmented_record = false;
+ scratch->clear();
+ break;
+ }
+ }
+ }
+ return false;
+}
+
+uint64_t Reader::LastRecordOffset() { return last_record_offset_; }
+
+uint64_t Reader::LastRecordEnd() {
+ return end_of_buffer_offset_ - buffer_.size();
+}
+
+void Reader::UnmarkEOF() {
+ if (read_error_) {
+ return;
+ }
+ eof_ = false;
+ if (eof_offset_ == 0) {
+ return;
+ }
+ UnmarkEOFInternal();
+}
+
+void Reader::UnmarkEOFInternal() {
+ // If the EOF was in the middle of a block (a partial block was read) we have
+ // to read the rest of the block as ReadPhysicalRecord can only read full
+ // blocks and expects the file position indicator to be aligned to the start
+ // of a block.
+ //
+ // consumed_bytes + buffer_size() + remaining == kBlockSize
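+ //
+ // Worked example (illustrative; kBlockSize is 32768 in db/log_format.h):
+ // with eof_offset_ == 100 and buffer_.size() == 40, consumed_bytes == 60
+ // and remaining == 32668, so reading `remaining` more bytes re-aligns the
+ // file position to the next block boundary (60 + 40 + 32668 == 32768).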
+
+ size_t consumed_bytes = eof_offset_ - buffer_.size();
+ size_t remaining = kBlockSize - eof_offset_;
+
+ // backing_store_ is used to concatenate what is left in buffer_ and
+ // the remainder of the block. If buffer_ already uses backing_store_,
+ // we just append the new data.
+ if (buffer_.data() != backing_store_ + consumed_bytes) {
+ // buffer_ does not use backing_store_ for storage;
+ // copy what is left in buffer_ to backing_store_.
+ memmove(backing_store_ + consumed_bytes, buffer_.data(), buffer_.size());
+ }
+
+ Slice read_buffer;
+ // TODO: rate limit log reader with appropriate priority.
+ // TODO: avoid overcharging rate limiter:
+ // Note that the Read here might overcharge SequentialFileReader's internal
+ // rate limiter if priority is not IO_TOTAL, e.g., when there is not enough
+ // content left until EOF to read.
+ Status status =
+ file_->Read(remaining, &read_buffer, backing_store_ + eof_offset_,
+ Env::IO_TOTAL /* rate_limiter_priority */);
+
+ size_t added = read_buffer.size();
+ end_of_buffer_offset_ += added;
+
+ if (!status.ok()) {
+ if (added > 0) {
+ ReportDrop(added, status);
+ }
+
+ read_error_ = true;
+ return;
+ }
+
+ if (read_buffer.data() != backing_store_ + eof_offset_) {
+ // Read did not write to backing_store_
+ memmove(backing_store_ + eof_offset_, read_buffer.data(),
+ read_buffer.size());
+ }
+
+ buffer_ = Slice(backing_store_ + consumed_bytes,
+ eof_offset_ + added - consumed_bytes);
+
+ if (added < remaining) {
+ eof_ = true;
+ eof_offset_ += added;
+ } else {
+ eof_offset_ = 0;
+ }
+}
+
+void Reader::ReportCorruption(size_t bytes, const char* reason) {
+ ReportDrop(bytes, Status::Corruption(reason));
+}
+
+void Reader::ReportDrop(size_t bytes, const Status& reason) {
+ if (reporter_ != nullptr) {
+ reporter_->Corruption(bytes, reason);
+ }
+}
+
+bool Reader::ReadMore(size_t* drop_size, int* error) {
+ if (!eof_ && !read_error_) {
+ // Last read was a full read, so this is a trailer to skip
+ buffer_.clear();
+ // TODO: rate limit log reader with appropriate priority.
+ // TODO: avoid overcharging rate limiter:
+ // Note that the Read here might overcharge SequentialFileReader's internal
+ // rate limiter if priority is not IO_TOTAL, e.g., when there is not enough
+ // content left until EOF to read.
+ Status status = file_->Read(kBlockSize, &buffer_, backing_store_,
+ Env::IO_TOTAL /* rate_limiter_priority */);
+ TEST_SYNC_POINT_CALLBACK("LogReader::ReadMore:AfterReadFile", &status);
+ end_of_buffer_offset_ += buffer_.size();
+ if (!status.ok()) {
+ buffer_.clear();
+ ReportDrop(kBlockSize, status);
+ read_error_ = true;
+ *error = kEof;
+ return false;
+ } else if (buffer_.size() < static_cast<size_t>(kBlockSize)) {
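+ // A short read means we reached the (current) end of the file partway
+ // into a block; remember how far into the block we got so that
+ // UnmarkEOFInternal() can re-align to a block boundary if the file grows.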
+ eof_ = true;
+ eof_offset_ = buffer_.size();
+ }
+ return true;
+ } else {
+ // Note that if buffer_ is non-empty, we have a truncated header at the
+ // end of the file, which can be caused by the writer crashing in the
+ // middle of writing the header. Unless explicitly requested, we don't
+ // consider this an error, just report EOF.
+ if (buffer_.size()) {
+ *drop_size = buffer_.size();
+ buffer_.clear();
+ *error = kBadHeader;
+ return false;
+ }
+ buffer_.clear();
+ *error = kEof;
+ return false;
+ }
+}
+
+unsigned int Reader::ReadPhysicalRecord(Slice* result, size_t* drop_size,
+ uint64_t* fragment_checksum) {
+ while (true) {
+ // We need at least the minimum header size
+ if (buffer_.size() < static_cast<size_t>(kHeaderSize)) {
+ // the default value of r is meaningless because ReadMore will overwrite
+ // it if it returns false; in case it returns true, the return value will
+ // not be used anyway
+ int r = kEof;
+ if (!ReadMore(drop_size, &r)) {
+ return r;
+ }
+ continue;
+ }
+
+ // Parse the header
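+ // Header layout, as decoded below: bytes [0, 4) hold the masked CRC32C,
+ // bytes 4-5 the little-endian payload length, byte 6 the record type, and,
+ // for recyclable records only, bytes [7, 11) the log number.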
+ const char* header = buffer_.data();
+ const uint32_t a = static_cast<uint32_t>(header[4]) & 0xff;
+ const uint32_t b = static_cast<uint32_t>(header[5]) & 0xff;
+ const unsigned int type = header[6];
+ const uint32_t length = a | (b << 8);
+ int header_size = kHeaderSize;
+ if (type >= kRecyclableFullType && type <= kRecyclableLastType) {
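+ // end_of_buffer_offset_ - buffer_.size() is the physical offset of this
+ // header; only a recyclable record at the very start of the file marks
+ // the whole file as recycled.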
+ if (end_of_buffer_offset_ - buffer_.size() == 0) {
+ recycled_ = true;
+ }
+ header_size = kRecyclableHeaderSize;
+ // We need enough for the larger header
+ if (buffer_.size() < static_cast<size_t>(kRecyclableHeaderSize)) {
+ int r = kEof;
+ if (!ReadMore(drop_size, &r)) {
+ return r;
+ }
+ continue;
+ }
+ const uint32_t log_num = DecodeFixed32(header + 7);
+ if (log_num != log_number_) {
+ return kOldRecord;
+ }
+ }
+ if (header_size + length > buffer_.size()) {
+ assert(buffer_.size() >= static_cast<size_t>(header_size));
+ *drop_size = buffer_.size();
+ buffer_.clear();
+ // If the end of the read has been reached without seeing
+ // `header_size + length` bytes of payload, report a corruption. The
+ // higher layers can decide how to handle it based on the recovery mode,
+ // whether this occurred at EOF, whether this is the final WAL, etc.
+ return kBadRecordLen;
+ }
+
+ if (type == kZeroType && length == 0) {
+ // Skip zero length record without reporting any drops since
+ // such records are produced by the mmap based writing code in
+ // env_posix.cc that preallocates file regions.
+ // NOTE: this should never happen in DB written by new RocksDB versions,
+ // since we turn off mmap writes to manifest and log files
+ buffer_.clear();
+ return kBadRecord;
+ }
+
+ // Check crc
+ if (checksum_) {
+ uint32_t expected_crc = crc32c::Unmask(DecodeFixed32(header));
+ uint32_t actual_crc = crc32c::Value(header + 6, length + header_size - 6);
+ if (actual_crc != expected_crc) {
+ // Drop the rest of the buffer since "length" itself may have
+ // been corrupted and if we trust it, we could find some
+ // fragment of a real log record that just happens to look
+ // like a valid log record.
+ *drop_size = buffer_.size();
+ buffer_.clear();
+ return kBadRecordChecksum;
+ }
+ }
+
+ buffer_.remove_prefix(header_size + length);
+
+ if (!uncompress_ || type == kSetCompressionType) {
+ *result = Slice(header + header_size, length);
+ return type;
+ } else {
+ // Uncompress compressed records
+ uncompressed_record_.clear();
+ if (fragment_checksum != nullptr) {
+ if (uncompress_hash_state_ == nullptr) {
+ uncompress_hash_state_ = XXH3_createState();
+ }
+ XXH3_64bits_reset(uncompress_hash_state_);
+ }
+
+ size_t uncompressed_size = 0;
+ int remaining = 0;
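+ // Drain the streaming uncompressor: keep calling Uncompress() while input
+ // remains, and also when the output buffer came back completely full
+ // (uncompressed_size == kBlockSize), since more output may still be
+ // pending even after all input has been consumed.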
+ do {
+ remaining = uncompress_->Uncompress(header + header_size, length,
+ uncompressed_buffer_.get(),
+ &uncompressed_size);
+ if (remaining < 0) {
+ buffer_.clear();
+ return kBadRecord;
+ }
+ if (uncompressed_size > 0) {
+ if (fragment_checksum != nullptr) {
+ XXH3_64bits_update(uncompress_hash_state_,
+ uncompressed_buffer_.get(), uncompressed_size);
+ }
+ uncompressed_record_.append(uncompressed_buffer_.get(),
+ uncompressed_size);
+ }
+ } while (remaining > 0 || uncompressed_size == kBlockSize);
+
+ if (fragment_checksum != nullptr) {
+ // We can remove this check by updating hash_state_ directly,
+ // but that requires resetting hash_state_ for full and first types
+ // for edge cases like consecutive first-type records.
+ // Leaving the check as is since it is cleaner; we can revert to the
+ // approach above if it causes a performance impact.
+ *fragment_checksum = XXH3_64bits_digest(uncompress_hash_state_);
+ uint64_t actual_checksum = XXH3_64bits(uncompressed_record_.data(),
+ uncompressed_record_.size());
+ if (*fragment_checksum != actual_checksum) {
+ // uncompressed_record_ contains bad content that does not match
+ // actual decompressed content
+ return kBadRecord;
+ }
+ }
+ *result = Slice(uncompressed_record_);
+ return type;
+ }
+ }
+}
+
+// Initialize uncompress related fields
+void Reader::InitCompression(const CompressionTypeRecord& compression_record) {
+ compression_type_ = compression_record.GetCompressionType();
+ compression_type_record_read_ = true;
+ constexpr uint32_t compression_format_version = 2;
+ uncompress_ = StreamingUncompress::Create(
+ compression_type_, compression_format_version, kBlockSize);
+ assert(uncompress_ != nullptr);
+ uncompressed_buffer_ = std::unique_ptr<char[]>(new char[kBlockSize]);
+ assert(uncompressed_buffer_);
+}
+
+bool FragmentBufferedReader::ReadRecord(Slice* record, std::string* scratch,
+ WALRecoveryMode /*unused*/,
+ uint64_t* /* checksum */) {
+ assert(record != nullptr);
+ assert(scratch != nullptr);
+ record->clear();
+ scratch->clear();
+ if (uncompress_) {
+ uncompress_->Reset();
+ }
+
+ uint64_t prospective_record_offset = 0;
+ uint64_t physical_record_offset = end_of_buffer_offset_ - buffer_.size();
+ size_t drop_size = 0;
+ unsigned int fragment_type_or_err = 0; // Initialize to make compiler happy
+ Slice fragment;
+ while (TryReadFragment(&fragment, &drop_size, &fragment_type_or_err)) {
+ switch (fragment_type_or_err) {
+ case kFullType:
+ case kRecyclableFullType:
+ if (in_fragmented_record_ && !fragments_.empty()) {
+ ReportCorruption(fragments_.size(), "partial record without end(1)");
+ }
+ fragments_.clear();
+ *record = fragment;
+ prospective_record_offset = physical_record_offset;
+ last_record_offset_ = prospective_record_offset;
+ first_record_read_ = true;
+ in_fragmented_record_ = false;
+ return true;
+
+ case kFirstType:
+ case kRecyclableFirstType:
+ if (in_fragmented_record_ || !fragments_.empty()) {
+ ReportCorruption(fragments_.size(), "partial record without end(2)");
+ }
+ prospective_record_offset = physical_record_offset;
+ fragments_.assign(fragment.data(), fragment.size());
+ in_fragmented_record_ = true;
+ break;
+
+ case kMiddleType:
+ case kRecyclableMiddleType:
+ if (!in_fragmented_record_) {
+ ReportCorruption(fragment.size(),
+ "missing start of fragmented record(1)");
+ } else {
+ fragments_.append(fragment.data(), fragment.size());
+ }
+ break;
+
+ case kLastType:
+ case kRecyclableLastType:
+ if (!in_fragmented_record_) {
+ ReportCorruption(fragment.size(),
+ "missing start of fragmented record(2)");
+ } else {
+ fragments_.append(fragment.data(), fragment.size());
+ scratch->assign(fragments_.data(), fragments_.size());
+ fragments_.clear();
+ *record = Slice(*scratch);
+ last_record_offset_ = prospective_record_offset;
+ first_record_read_ = true;
+ in_fragmented_record_ = false;
+ return true;
+ }
+ break;
+
+ case kBadHeader:
+ case kBadRecord:
+ case kEof:
+ case kOldRecord:
+ if (in_fragmented_record_) {
+ ReportCorruption(fragments_.size(), "error in middle of record");
+ in_fragmented_record_ = false;
+ fragments_.clear();
+ }
+ break;
+
+ case kBadRecordChecksum:
+ if (recycled_) {
+ fragments_.clear();
+ return false;
+ }
+ ReportCorruption(drop_size, "checksum mismatch");
+ if (in_fragmented_record_) {
+ ReportCorruption(fragments_.size(), "error in middle of record");
+ in_fragmented_record_ = false;
+ fragments_.clear();
+ }
+ break;
+
+ case kSetCompressionType: {
+ if (compression_type_record_read_) {
+ ReportCorruption(fragment.size(),
+ "read multiple SetCompressionType records");
+ }
+ if (first_record_read_) {
+ ReportCorruption(fragment.size(),
+ "SetCompressionType not the first record");
+ }
+ fragments_.clear();
+ prospective_record_offset = physical_record_offset;
+ last_record_offset_ = prospective_record_offset;
+ in_fragmented_record_ = false;
+ CompressionTypeRecord compression_record(kNoCompression);
+ Status s = compression_record.DecodeFrom(&fragment);
+ if (!s.ok()) {
+ ReportCorruption(fragment.size(),
+ "could not decode SetCompressionType record");
+ } else {
+ InitCompression(compression_record);
+ }
+ break;
+ }
+
+ default: {
+ char buf[40];
+ snprintf(buf, sizeof(buf), "unknown record type %u",
+ fragment_type_or_err);
+ ReportCorruption(
+ fragment.size() + (in_fragmented_record_ ? fragments_.size() : 0),
+ buf);
+ in_fragmented_record_ = false;
+ fragments_.clear();
+ break;
+ }
+ }
+ }
+ return false;
+}
+
+void FragmentBufferedReader::UnmarkEOF() {
+ if (read_error_) {
+ return;
+ }
+ eof_ = false;
+ UnmarkEOFInternal();
+}
+
+bool FragmentBufferedReader::TryReadMore(size_t* drop_size, int* error) {
+ if (!eof_ && !read_error_) {
+ // Last read was a full read, so this is a trailer to skip
+ buffer_.clear();
+ // TODO: rate limit log reader with appropriate priority.
+ // TODO: avoid overcharging rate limiter:
+ // Note that the Read here might overcharge SequentialFileReader's internal
+ // rate limiter if priority is not IO_TOTAL, e.g., when there is not enough
+ // content left until EOF to read.
+ Status status = file_->Read(kBlockSize, &buffer_, backing_store_,
+ Env::IO_TOTAL /* rate_limiter_priority */);
+ end_of_buffer_offset_ += buffer_.size();
+ if (!status.ok()) {
+ buffer_.clear();
+ ReportDrop(kBlockSize, status);
+ read_error_ = true;
+ *error = kEof;
+ return false;
+ } else if (buffer_.size() < static_cast<size_t>(kBlockSize)) {
+ eof_ = true;
+ eof_offset_ = buffer_.size();
+ TEST_SYNC_POINT_CALLBACK(
+ "FragmentBufferedLogReader::TryReadMore:FirstEOF", nullptr);
+ }
+ return true;
+ } else if (!read_error_) {
+ UnmarkEOF();
+ }
+ if (!read_error_) {
+ return true;
+ }
+ *error = kEof;
+ *drop_size = buffer_.size();
+ if (buffer_.size() > 0) {
+ *error = kBadHeader;
+ }
+ buffer_.clear();
+ return false;
+}
+
+// Returns true if the caller should process fragment_type_or_err.
+bool FragmentBufferedReader::TryReadFragment(
+ Slice* fragment, size_t* drop_size, unsigned int* fragment_type_or_err) {
+ assert(fragment != nullptr);
+ assert(drop_size != nullptr);
+ assert(fragment_type_or_err != nullptr);
+
+ while (buffer_.size() < static_cast<size_t>(kHeaderSize)) {
+ size_t old_size = buffer_.size();
+ int error = kEof;
+ if (!TryReadMore(drop_size, &error)) {
+ *fragment_type_or_err = error;
+ return false;
+ } else if (old_size == buffer_.size()) {
+ return false;
+ }
+ }
+ const char* header = buffer_.data();
+ const uint32_t a = static_cast<uint32_t>(header[4]) & 0xff;
+ const uint32_t b = static_cast<uint32_t>(header[5]) & 0xff;
+ const unsigned int type = header[6];
+ const uint32_t length = a | (b << 8);
+ int header_size = kHeaderSize;
+ if (type >= kRecyclableFullType && type <= kRecyclableLastType) {
+ if (end_of_buffer_offset_ - buffer_.size() == 0) {
+ recycled_ = true;
+ }
+ header_size = kRecyclableHeaderSize;
+ while (buffer_.size() < static_cast<size_t>(kRecyclableHeaderSize)) {
+ size_t old_size = buffer_.size();
+ int error = kEof;
+ if (!TryReadMore(drop_size, &error)) {
+ *fragment_type_or_err = error;
+ return false;
+ } else if (old_size == buffer_.size()) {
+ return false;
+ }
+ }
+ const uint32_t log_num = DecodeFixed32(header + 7);
+ if (log_num != log_number_) {
+ *fragment_type_or_err = kOldRecord;
+ return true;
+ }
+ }
+
+ while (header_size + length > buffer_.size()) {
+ size_t old_size = buffer_.size();
+ int error = kEof;
+ if (!TryReadMore(drop_size, &error)) {
+ *fragment_type_or_err = error;
+ return false;
+ } else if (old_size == buffer_.size()) {
+ return false;
+ }
+ }
+
+ if (type == kZeroType && length == 0) {
+ buffer_.clear();
+ *fragment_type_or_err = kBadRecord;
+ return true;
+ }
+
+ if (checksum_) {
+ uint32_t expected_crc = crc32c::Unmask(DecodeFixed32(header));
+ uint32_t actual_crc = crc32c::Value(header + 6, length + header_size - 6);
+ if (actual_crc != expected_crc) {
+ *drop_size = buffer_.size();
+ buffer_.clear();
+ *fragment_type_or_err = kBadRecordChecksum;
+ return true;
+ }
+ }
+
+ buffer_.remove_prefix(header_size + length);
+
+ if (!uncompress_ || type == kSetCompressionType) {
+ *fragment = Slice(header + header_size, length);
+ *fragment_type_or_err = type;
+ return true;
+ } else {
+ // Uncompress compressed records
+ uncompressed_record_.clear();
+ size_t uncompressed_size = 0;
+ int remaining = 0;
+ do {
+ remaining = uncompress_->Uncompress(header + header_size, length,
+ uncompressed_buffer_.get(),
+ &uncompressed_size);
+ if (remaining < 0) {
+ buffer_.clear();
+ *fragment_type_or_err = kBadRecord;
+ return true;
+ }
+ if (uncompressed_size > 0) {
+ uncompressed_record_.append(uncompressed_buffer_.get(),
+ uncompressed_size);
+ }
+ } while (remaining > 0 || uncompressed_size == kBlockSize);
+ *fragment = Slice(std::move(uncompressed_record_));
+ *fragment_type_or_err = type;
+ return true;
+ }
+}
+
+} // namespace log
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/log_reader.h b/src/rocksdb/db/log_reader.h
new file mode 100644
index 000000000..e3be1570e
--- /dev/null
+++ b/src/rocksdb/db/log_reader.h
@@ -0,0 +1,225 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <stdint.h>
+
+#include <memory>
+
+#include "db/log_format.h"
+#include "file/sequence_file_reader.h"
+#include "rocksdb/options.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+#include "util/compression.h"
+#include "util/xxhash.h"
+
+namespace ROCKSDB_NAMESPACE {
+class Logger;
+
+namespace log {
+
+/**
+ * Reader is a general purpose log stream reader implementation. The actual job
+ * of reading from the device is implemented by the SequentialFile interface.
+ *
+ * Please see Writer for details on the file and record layout.
+ */
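+//
+// A minimal usage sketch (illustrative only, not part of the original
+// source; "MyReporter" and the log number are hypothetical):
+//
+//   std::unique_ptr<SequentialFileReader> file = /* open the WAL */;
+//   MyReporter reporter;  // implements Reader::Reporter
+//   Reader reader(nullptr /* info_log */, std::move(file), &reporter,
+//                 true /* checksum */, 123 /* log_number */);
+//   Slice record;
+//   std::string scratch;
+//   while (reader.ReadRecord(&record, &scratch)) {
+//     // consume `record` before the next ReadRecord() or scratch mutation
+//   }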
+class Reader {
+ public:
+ // Interface for reporting errors.
+ class Reporter {
+ public:
+ virtual ~Reporter();
+
+ // Some corruption was detected. "size" is the approximate number
+ // of bytes dropped due to the corruption.
+ virtual void Corruption(size_t bytes, const Status& status) = 0;
+ };
+
+ // Create a reader that will return log records from "*file".
+ // "*file" must remain live while this Reader is in use.
+ //
+ // If "reporter" is non-nullptr, it is notified whenever some data is
+ // dropped due to a detected corruption. "*reporter" must remain
+ // live while this Reader is in use.
+ //
+ // If "checksum" is true, verify checksums if available.
+ Reader(std::shared_ptr<Logger> info_log,
+ std::unique_ptr<SequentialFileReader>&& file, Reporter* reporter,
+ bool checksum, uint64_t log_num);
+ // No copying allowed
+ Reader(const Reader&) = delete;
+ void operator=(const Reader&) = delete;
+
+ virtual ~Reader();
+
+ // Read the next record into *record. Returns true if read
+ // successfully, false if we hit end of the input. May use
+ // "*scratch" as temporary storage. The contents filled in *record
+ // will only be valid until the next mutating operation on this
+ // reader or the next mutation to *scratch.
+ // If record_checksum is not nullptr, then this function will calculate the
+ // checksum of the record read and set record_checksum to it. The checksum is
+ // calculated from the original buffers that contain the contents of the
+ // record.
+ virtual bool ReadRecord(Slice* record, std::string* scratch,
+ WALRecoveryMode wal_recovery_mode =
+ WALRecoveryMode::kTolerateCorruptedTailRecords,
+ uint64_t* record_checksum = nullptr);
+
+ // Returns the physical offset of the last record returned by ReadRecord.
+ //
+ // Undefined before the first call to ReadRecord.
+ uint64_t LastRecordOffset();
+
+ // Returns the first physical offset after the last record returned by
+ // ReadRecord, or zero before first call to ReadRecord. This can also be
+ // thought of as the "current" position in processing the file bytes.
+ uint64_t LastRecordEnd();
+
+ // Returns true if the reader has encountered an EOF condition.
+ bool IsEOF() { return eof_; }
+
+ // Returns true if the reader has encountered a read error.
+ bool hasReadError() const { return read_error_; }
+
+ // When we know more data has been written to the file, we can use this
+ // function to force the reader to look again in the file.
+ // Also aligns the file position indicator to the start of the next block
+ // by reading the rest of the data from the EOF position to the end of the
+ // block that was partially read.
+ virtual void UnmarkEOF();
+
+ SequentialFileReader* file() { return file_.get(); }
+
+ Reporter* GetReporter() const { return reporter_; }
+
+ uint64_t GetLogNumber() const { return log_number_; }
+
+ size_t GetReadOffset() const {
+ return static_cast<size_t>(end_of_buffer_offset_);
+ }
+
+ bool IsCompressedAndEmptyFile() {
+ return !first_record_read_ && compression_type_record_read_;
+ }
+
+ protected:
+ std::shared_ptr<Logger> info_log_;
+ const std::unique_ptr<SequentialFileReader> file_;
+ Reporter* const reporter_;
+ bool const checksum_;
+ char* const backing_store_;
+
+ // Internal state variables used for reading records
+ Slice buffer_;
+ bool eof_; // Last Read() indicated EOF by returning < kBlockSize
+ bool read_error_; // Error occurred while reading from file
+
+ // Offset of the file position indicator within the last block when an
+ // EOF was detected.
+ size_t eof_offset_;
+
+ // Offset of the last record returned by ReadRecord.
+ uint64_t last_record_offset_;
+ // Offset of the first location past the end of buffer_.
+ uint64_t end_of_buffer_offset_;
+
+ // which log number this is
+ uint64_t const log_number_;
+
+ // Whether this is a recycled log file
+ bool recycled_;
+
+ // Whether the first record has been read or not.
+ bool first_record_read_;
+ // Type of compression used
+ CompressionType compression_type_;
+ // Track whether the compression type record has been read or not.
+ bool compression_type_record_read_;
+ StreamingUncompress* uncompress_;
+ // Reusable uncompressed output buffer
+ std::unique_ptr<char[]> uncompressed_buffer_;
+ // Reusable uncompressed record
+ std::string uncompressed_record_;
+ // Used for stream hashing fragment content in ReadRecord()
+ XXH3_state_t* hash_state_;
+ // Used for stream hashing uncompressed buffer in ReadPhysicalRecord()
+ XXH3_state_t* uncompress_hash_state_;
+
+ // Extend record types with the following special values
+ enum {
+ kEof = kMaxRecordType + 1,
+ // Returned whenever we find an invalid physical record.
+ // Currently there are two situations in which this happens:
+ // * The record has an invalid CRC (ReadPhysicalRecord reports a drop)
+ // * The record is a 0-length record (No drop is reported)
+ kBadRecord = kMaxRecordType + 2,
+ // Returned when we fail to read a valid header.
+ kBadHeader = kMaxRecordType + 3,
+ // Returned when we read an old record from a previous user of the log.
+ kOldRecord = kMaxRecordType + 4,
+ // Returned when we get a bad record length
+ kBadRecordLen = kMaxRecordType + 5,
+ // Returned when we get a bad record checksum
+ kBadRecordChecksum = kMaxRecordType + 6,
+ };
+
+ // Returns the record type, or one of the preceding special values.
+ // If WAL compression is enabled, fragment_checksum is the checksum of the
+ // fragment computed from the original buffer containing the uncompressed
+ // fragment.
+ unsigned int ReadPhysicalRecord(Slice* result, size_t* drop_size,
+ uint64_t* fragment_checksum = nullptr);
+
+ // Read some more
+ bool ReadMore(size_t* drop_size, int* error);
+
+ void UnmarkEOFInternal();
+
+ // Reports dropped bytes to the reporter.
+ // buffer_ must be updated to remove the dropped bytes prior to invocation.
+ void ReportCorruption(size_t bytes, const char* reason);
+ void ReportDrop(size_t bytes, const Status& reason);
+
+ void InitCompression(const CompressionTypeRecord& compression_record);
+};
+
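+// FragmentBufferedReader buffers partial fragments across ReadRecord() calls:
+// when it reaches the current end of the file it returns false without
+// reporting a corruption or discarding the data accumulated in fragments_,
+// and it resumes from the same point once more bytes are appended
+// (TryReadMore() re-probes the file via UnmarkEOF()). This makes it suitable
+// for tailing a WAL that is still being written, as exercised by the
+// RetriableLogTest cases in db/log_test.cc.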
+class FragmentBufferedReader : public Reader {
+ public:
+ FragmentBufferedReader(std::shared_ptr<Logger> info_log,
+ std::unique_ptr<SequentialFileReader>&& _file,
+ Reporter* reporter, bool checksum, uint64_t log_num)
+ : Reader(info_log, std::move(_file), reporter, checksum, log_num),
+ fragments_(),
+ in_fragmented_record_(false) {}
+ ~FragmentBufferedReader() override {}
+ bool ReadRecord(Slice* record, std::string* scratch,
+ WALRecoveryMode wal_recovery_mode =
+ WALRecoveryMode::kTolerateCorruptedTailRecords,
+ uint64_t* record_checksum = nullptr) override;
+ void UnmarkEOF() override;
+
+ private:
+ std::string fragments_;
+ bool in_fragmented_record_;
+
+ bool TryReadFragment(Slice* result, size_t* drop_size,
+ unsigned int* fragment_type_or_err);
+
+ bool TryReadMore(size_t* drop_size, int* error);
+
+ // No copy allowed
+ FragmentBufferedReader(const FragmentBufferedReader&);
+ void operator=(const FragmentBufferedReader&);
+};
+
+} // namespace log
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/log_test.cc b/src/rocksdb/db/log_test.cc
new file mode 100644
index 000000000..2a43dc152
--- /dev/null
+++ b/src/rocksdb/db/log_test.cc
@@ -0,0 +1,1062 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/log_reader.h"
+#include "db/log_writer.h"
+#include "file/sequence_file_reader.h"
+#include "file/writable_file_writer.h"
+#include "rocksdb/env.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/coding.h"
+#include "util/crc32c.h"
+#include "util/random.h"
+#include "utilities/memory_allocators.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace log {
+
+// Construct a string of the specified length made out of the supplied
+// partial string.
+static std::string BigString(const std::string& partial_string, size_t n) {
+ std::string result;
+ while (result.size() < n) {
+ result.append(partial_string);
+ }
+ result.resize(n);
+ return result;
+}
+
+// Construct a string from a number
+static std::string NumberString(int n) {
+ char buf[50];
+ snprintf(buf, sizeof(buf), "%d.", n);
+ return std::string(buf);
+}
+
+// Return a skewed potentially long string
+static std::string RandomSkewedString(int i, Random* rnd) {
+ return BigString(NumberString(i), rnd->Skewed(17));
+}
+
+// Param type is tuple<int, bool, CompressionType>
+// get<0>(tuple): non-zero if recycling log, zero if regular log
+// get<1>(tuple): true if allow retry after read EOF, false otherwise
+// get<2>(tuple): compression type to use for the WAL
+class LogTest
+ : public ::testing::TestWithParam<std::tuple<int, bool, CompressionType>> {
+ private:
+ class StringSource : public FSSequentialFile {
+ public:
+ Slice& contents_;
+ bool force_error_;
+ size_t force_error_position_;
+ bool force_eof_;
+ size_t force_eof_position_;
+ bool returned_partial_;
+ bool fail_after_read_partial_;
+ explicit StringSource(Slice& contents, bool fail_after_read_partial)
+ : contents_(contents),
+ force_error_(false),
+ force_error_position_(0),
+ force_eof_(false),
+ force_eof_position_(0),
+ returned_partial_(false),
+ fail_after_read_partial_(fail_after_read_partial) {}
+
+ IOStatus Read(size_t n, const IOOptions& /*opts*/, Slice* result,
+ char* scratch, IODebugContext* /*dbg*/) override {
+ if (fail_after_read_partial_) {
+ EXPECT_TRUE(!returned_partial_) << "must not Read() after eof/error";
+ }
+
+ if (force_error_) {
+ if (force_error_position_ >= n) {
+ force_error_position_ -= n;
+ } else {
+ *result = Slice(contents_.data(), force_error_position_);
+ contents_.remove_prefix(force_error_position_);
+ force_error_ = false;
+ returned_partial_ = true;
+ return IOStatus::Corruption("read error");
+ }
+ }
+
+ if (contents_.size() < n) {
+ n = contents_.size();
+ returned_partial_ = true;
+ }
+
+ if (force_eof_) {
+ if (force_eof_position_ >= n) {
+ force_eof_position_ -= n;
+ } else {
+ force_eof_ = false;
+ n = force_eof_position_;
+ returned_partial_ = true;
+ }
+ }
+
+ // By using scratch we ensure that caller has control over the
+ // lifetime of result.data()
+ memcpy(scratch, contents_.data(), n);
+ *result = Slice(scratch, n);
+
+ contents_.remove_prefix(n);
+ return IOStatus::OK();
+ }
+
+ IOStatus Skip(uint64_t n) override {
+ if (n > contents_.size()) {
+ contents_.clear();
+ return IOStatus::NotFound("in-memory file skipped past end");
+ }
+
+ contents_.remove_prefix(n);
+
+ return IOStatus::OK();
+ }
+ };
+
+ class ReportCollector : public Reader::Reporter {
+ public:
+ size_t dropped_bytes_;
+ std::string message_;
+
+ ReportCollector() : dropped_bytes_(0) {}
+ void Corruption(size_t bytes, const Status& status) override {
+ dropped_bytes_ += bytes;
+ message_.append(status.ToString());
+ }
+ };
+
+ std::string& dest_contents() { return sink_->contents_; }
+
+ const std::string& dest_contents() const { return sink_->contents_; }
+
+ void reset_source_contents() { source_->contents_ = dest_contents(); }
+
+ Slice reader_contents_;
+ test::StringSink* sink_;
+ StringSource* source_;
+ ReportCollector report_;
+
+ protected:
+ std::unique_ptr<Writer> writer_;
+ std::unique_ptr<Reader> reader_;
+ bool allow_retry_read_;
+ CompressionType compression_type_;
+
+ public:
+ LogTest()
+ : reader_contents_(),
+ sink_(new test::StringSink(&reader_contents_)),
+ source_(new StringSource(reader_contents_, !std::get<1>(GetParam()))),
+ allow_retry_read_(std::get<1>(GetParam())),
+ compression_type_(std::get<2>(GetParam())) {
+ std::unique_ptr<FSWritableFile> sink_holder(sink_);
+ std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
+ std::move(sink_holder), "" /* don't care */, FileOptions()));
+ Writer* writer =
+ new Writer(std::move(file_writer), 123, std::get<0>(GetParam()), false,
+ compression_type_);
+ writer_.reset(writer);
+ std::unique_ptr<FSSequentialFile> source_holder(source_);
+ std::unique_ptr<SequentialFileReader> file_reader(
+ new SequentialFileReader(std::move(source_holder), "" /* file name */));
+ if (allow_retry_read_) {
+ reader_.reset(new FragmentBufferedReader(nullptr, std::move(file_reader),
+ &report_, true /* checksum */,
+ 123 /* log_number */));
+ } else {
+ reader_.reset(new Reader(nullptr, std::move(file_reader), &report_,
+ true /* checksum */, 123 /* log_number */));
+ }
+ }
+
+ Slice* get_reader_contents() { return &reader_contents_; }
+
+ void Write(const std::string& msg) {
+ ASSERT_OK(writer_->AddRecord(Slice(msg)));
+ }
+
+ size_t WrittenBytes() const { return dest_contents().size(); }
+
+ std::string Read(const WALRecoveryMode wal_recovery_mode =
+ WALRecoveryMode::kTolerateCorruptedTailRecords) {
+ std::string scratch;
+ Slice record;
+ bool ret = false;
+ uint64_t record_checksum;
+ ret = reader_->ReadRecord(&record, &scratch, wal_recovery_mode,
+ &record_checksum);
+ if (ret) {
+ if (!allow_retry_read_) {
+ // allow_retry_read_ means using FragmentBufferedReader which does not
+ // support record checksum yet.
+ uint64_t actual_record_checksum =
+ XXH3_64bits(record.data(), record.size());
+ assert(actual_record_checksum == record_checksum);
+ }
+ return record.ToString();
+ } else {
+ return "EOF";
+ }
+ }
+
+ void IncrementByte(int offset, char delta) {
+ dest_contents()[offset] += delta;
+ }
+
+ void SetByte(int offset, char new_byte) {
+ dest_contents()[offset] = new_byte;
+ }
+
+ void ShrinkSize(int bytes) { sink_->Drop(bytes); }
+
+ void FixChecksum(int header_offset, int len, bool recyclable) {
+ // Compute crc of type/len/data
+ int header_size = recyclable ? kRecyclableHeaderSize : kHeaderSize;
+ uint32_t crc = crc32c::Value(&dest_contents()[header_offset + 6],
+ header_size - 6 + len);
+ crc = crc32c::Mask(crc);
+ EncodeFixed32(&dest_contents()[header_offset], crc);
+ }
+
+ void ForceError(size_t position = 0) {
+ source_->force_error_ = true;
+ source_->force_error_position_ = position;
+ }
+
+ size_t DroppedBytes() const { return report_.dropped_bytes_; }
+
+ std::string ReportMessage() const { return report_.message_; }
+
+ void ForceEOF(size_t position = 0) {
+ source_->force_eof_ = true;
+ source_->force_eof_position_ = position;
+ }
+
+ void UnmarkEOF() {
+ source_->returned_partial_ = false;
+ reader_->UnmarkEOF();
+ }
+
+ bool IsEOF() { return reader_->IsEOF(); }
+
+ // Returns OK iff recorded error message contains "msg"
+ std::string MatchError(const std::string& msg) const {
+ if (report_.message_.find(msg) == std::string::npos) {
+ return report_.message_;
+ } else {
+ return "OK";
+ }
+ }
+};
+
+TEST_P(LogTest, Empty) { ASSERT_EQ("EOF", Read()); }
+
+TEST_P(LogTest, ReadWrite) {
+ Write("foo");
+ Write("bar");
+ Write("");
+ Write("xxxx");
+ ASSERT_EQ("foo", Read());
+ ASSERT_EQ("bar", Read());
+ ASSERT_EQ("", Read());
+ ASSERT_EQ("xxxx", Read());
+ ASSERT_EQ("EOF", Read());
+ ASSERT_EQ("EOF", Read()); // Make sure reads at eof work
+}
+
+TEST_P(LogTest, ManyBlocks) {
+ for (int i = 0; i < 100000; i++) {
+ Write(NumberString(i));
+ }
+ for (int i = 0; i < 100000; i++) {
+ ASSERT_EQ(NumberString(i), Read());
+ }
+ ASSERT_EQ("EOF", Read());
+}
+
+TEST_P(LogTest, Fragmentation) {
+ Write("small");
+ Write(BigString("medium", 50000));
+ Write(BigString("large", 100000));
+ ASSERT_EQ("small", Read());
+ ASSERT_EQ(BigString("medium", 50000), Read());
+ ASSERT_EQ(BigString("large", 100000), Read());
+ ASSERT_EQ("EOF", Read());
+}
+
+TEST_P(LogTest, MarginalTrailer) {
+ // Make a trailer that is exactly the same length as an empty record.
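+ // With the non-recyclable format (kHeaderSize == 7, kBlockSize == 32768 in
+ // db/log_format.h), n == 32754, so the first Write() fills the block up to
+ // its last 7 bytes: exactly one empty header's worth of trailer.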
+ int header_size =
+ std::get<0>(GetParam()) ? kRecyclableHeaderSize : kHeaderSize;
+ const int n = kBlockSize - 2 * header_size;
+ Write(BigString("foo", n));
+ ASSERT_EQ((unsigned int)(kBlockSize - header_size), WrittenBytes());
+ Write("");
+ Write("bar");
+ ASSERT_EQ(BigString("foo", n), Read());
+ ASSERT_EQ("", Read());
+ ASSERT_EQ("bar", Read());
+ ASSERT_EQ("EOF", Read());
+}
+
+TEST_P(LogTest, MarginalTrailer2) {
+ // Make a trailer that is exactly the same length as an empty record.
+ int header_size =
+ std::get<0>(GetParam()) ? kRecyclableHeaderSize : kHeaderSize;
+ const int n = kBlockSize - 2 * header_size;
+ Write(BigString("foo", n));
+ ASSERT_EQ((unsigned int)(kBlockSize - header_size), WrittenBytes());
+ Write("bar");
+ ASSERT_EQ(BigString("foo", n), Read());
+ ASSERT_EQ("bar", Read());
+ ASSERT_EQ("EOF", Read());
+ ASSERT_EQ(0U, DroppedBytes());
+ ASSERT_EQ("", ReportMessage());
+}
+
+TEST_P(LogTest, ShortTrailer) {
+ int header_size =
+ std::get<0>(GetParam()) ? kRecyclableHeaderSize : kHeaderSize;
+ const int n = kBlockSize - 2 * header_size + 4;
+ Write(BigString("foo", n));
+ ASSERT_EQ((unsigned int)(kBlockSize - header_size + 4), WrittenBytes());
+ Write("");
+ Write("bar");
+ ASSERT_EQ(BigString("foo", n), Read());
+ ASSERT_EQ("", Read());
+ ASSERT_EQ("bar", Read());
+ ASSERT_EQ("EOF", Read());
+}
+
+TEST_P(LogTest, AlignedEof) {
+ int header_size =
+ std::get<0>(GetParam()) ? kRecyclableHeaderSize : kHeaderSize;
+ const int n = kBlockSize - 2 * header_size + 4;
+ Write(BigString("foo", n));
+ ASSERT_EQ((unsigned int)(kBlockSize - header_size + 4), WrittenBytes());
+ ASSERT_EQ(BigString("foo", n), Read());
+ ASSERT_EQ("EOF", Read());
+}
+
+TEST_P(LogTest, RandomRead) {
+ const int N = 500;
+ Random write_rnd(301);
+ for (int i = 0; i < N; i++) {
+ Write(RandomSkewedString(i, &write_rnd));
+ }
+ Random read_rnd(301);
+ for (int i = 0; i < N; i++) {
+ ASSERT_EQ(RandomSkewedString(i, &read_rnd), Read());
+ }
+ ASSERT_EQ("EOF", Read());
+}
+
+// Tests of all the error paths in log_reader.cc follow:
+
+TEST_P(LogTest, ReadError) {
+ Write("foo");
+ ForceError();
+ ASSERT_EQ("EOF", Read());
+ ASSERT_EQ((unsigned int)kBlockSize, DroppedBytes());
+ ASSERT_EQ("OK", MatchError("read error"));
+}
+
+TEST_P(LogTest, BadRecordType) {
+ Write("foo");
+ // Type is stored in header[6]
+ IncrementByte(6, 100);
+ FixChecksum(0, 3, false);
+ ASSERT_EQ("EOF", Read());
+ ASSERT_EQ(3U, DroppedBytes());
+ ASSERT_EQ("OK", MatchError("unknown record type"));
+}
+
+TEST_P(LogTest, TruncatedTrailingRecordIsIgnored) {
+ Write("foo");
+ ShrinkSize(4); // Drop all payload as well as a header byte
+ ASSERT_EQ("EOF", Read());
+ // Truncated last record is ignored, not treated as an error
+ ASSERT_EQ(0U, DroppedBytes());
+ ASSERT_EQ("", ReportMessage());
+}
+
+TEST_P(LogTest, TruncatedTrailingRecordIsNotIgnored) {
+ if (allow_retry_read_) {
+ // If read retry is allowed, then truncated trailing record should not
+ // raise an error.
+ return;
+ }
+ Write("foo");
+ ShrinkSize(4); // Drop all payload as well as a header byte
+ ASSERT_EQ("EOF", Read(WALRecoveryMode::kAbsoluteConsistency));
+ // Under kAbsoluteConsistency the truncated last record is reported as an
+ // error rather than silently ignored
+ ASSERT_GT(DroppedBytes(), 0U);
+ ASSERT_EQ("OK", MatchError("Corruption: truncated header"));
+}
+
+TEST_P(LogTest, BadLength) {
+ if (allow_retry_read_) {
+ // If read retry is allowed, then we should not raise an error when the
+ // record length specified in header is longer than data currently
+ // available. It's possible that the body of the record is not written yet.
+ return;
+ }
+ bool recyclable_log = (std::get<0>(GetParam()) != 0);
+ int header_size = recyclable_log ? kRecyclableHeaderSize : kHeaderSize;
+ const int kPayloadSize = kBlockSize - header_size;
+ Write(BigString("bar", kPayloadSize));
+ Write("foo");
+ // Least significant size byte is stored in header[4].
+ IncrementByte(4, 1);
+ if (!recyclable_log) {
+ ASSERT_EQ("foo", Read());
+ ASSERT_EQ(kBlockSize, DroppedBytes());
+ ASSERT_EQ("OK", MatchError("bad record length"));
+ } else {
+ ASSERT_EQ("EOF", Read());
+ }
+}
+
+TEST_P(LogTest, BadLengthAtEndIsIgnored) {
+ if (allow_retry_read_) {
+ // If read retry is allowed, then we should not raise an error when the
+ // record length specified in header is longer than data currently
+ // available. It's possible that the body of the record is not written yet.
+ return;
+ }
+ Write("foo");
+ ShrinkSize(1);
+ ASSERT_EQ("EOF", Read());
+ ASSERT_EQ(0U, DroppedBytes());
+ ASSERT_EQ("", ReportMessage());
+}
+
+TEST_P(LogTest, BadLengthAtEndIsNotIgnored) {
+ if (allow_retry_read_) {
+ // If read retry is allowed, then we should not raise an error when the
+ // record length specified in header is longer than data currently
+ // available. It's possible that the body of the record is not written yet.
+ return;
+ }
+ Write("foo");
+ ShrinkSize(1);
+ ASSERT_EQ("EOF", Read(WALRecoveryMode::kAbsoluteConsistency));
+ ASSERT_GT(DroppedBytes(), 0U);
+ ASSERT_EQ("OK", MatchError("Corruption: truncated record body"));
+}
+
+TEST_P(LogTest, ChecksumMismatch) {
+ Write("foooooo");
+ IncrementByte(0, 14);
+ ASSERT_EQ("EOF", Read());
+ bool recyclable_log = (std::get<0>(GetParam()) != 0);
+ if (!recyclable_log) {
+ ASSERT_EQ(14U, DroppedBytes());
+ ASSERT_EQ("OK", MatchError("checksum mismatch"));
+ } else {
+ ASSERT_EQ(0U, DroppedBytes());
+ ASSERT_EQ("", ReportMessage());
+ }
+}
+
+TEST_P(LogTest, UnexpectedMiddleType) {
+ Write("foo");
+ bool recyclable_log = (std::get<0>(GetParam()) != 0);
+ SetByte(6, static_cast<char>(recyclable_log ? kRecyclableMiddleType
+ : kMiddleType));
+ FixChecksum(0, 3, !!recyclable_log);
+ ASSERT_EQ("EOF", Read());
+ ASSERT_EQ(3U, DroppedBytes());
+ ASSERT_EQ("OK", MatchError("missing start"));
+}
+
+TEST_P(LogTest, UnexpectedLastType) {
+ Write("foo");
+ bool recyclable_log = (std::get<0>(GetParam()) != 0);
+ SetByte(6,
+ static_cast<char>(recyclable_log ? kRecyclableLastType : kLastType));
+ FixChecksum(0, 3, !!recyclable_log);
+ ASSERT_EQ("EOF", Read());
+ ASSERT_EQ(3U, DroppedBytes());
+ ASSERT_EQ("OK", MatchError("missing start"));
+}
+
+TEST_P(LogTest, UnexpectedFullType) {
+ Write("foo");
+ Write("bar");
+ bool recyclable_log = (std::get<0>(GetParam()) != 0);
+ SetByte(
+ 6, static_cast<char>(recyclable_log ? kRecyclableFirstType : kFirstType));
+ FixChecksum(0, 3, !!recyclable_log);
+ ASSERT_EQ("bar", Read());
+ ASSERT_EQ("EOF", Read());
+ ASSERT_EQ(3U, DroppedBytes());
+ ASSERT_EQ("OK", MatchError("partial record without end"));
+}
+
+TEST_P(LogTest, UnexpectedFirstType) {
+ Write("foo");
+ Write(BigString("bar", 100000));
+ bool recyclable_log = (std::get<0>(GetParam()) != 0);
+ SetByte(
+ 6, static_cast<char>(recyclable_log ? kRecyclableFirstType : kFirstType));
+ FixChecksum(0, 3, !!recyclable_log);
+ ASSERT_EQ(BigString("bar", 100000), Read());
+ ASSERT_EQ("EOF", Read());
+ ASSERT_EQ(3U, DroppedBytes());
+ ASSERT_EQ("OK", MatchError("partial record without end"));
+}
+
+TEST_P(LogTest, MissingLastIsIgnored) {
+ Write(BigString("bar", kBlockSize));
+ // Remove the LAST block, including header.
+ ShrinkSize(14);
+ ASSERT_EQ("EOF", Read());
+ ASSERT_EQ("", ReportMessage());
+ ASSERT_EQ(0U, DroppedBytes());
+}
+
+TEST_P(LogTest, MissingLastIsNotIgnored) {
+ if (allow_retry_read_) {
+ // If read retry is allowed, then truncated trailing record should not
+ // raise an error.
+ return;
+ }
+ Write(BigString("bar", kBlockSize));
+ // Remove the LAST block, including header.
+ ShrinkSize(14);
+ ASSERT_EQ("EOF", Read(WALRecoveryMode::kAbsoluteConsistency));
+ ASSERT_GT(DroppedBytes(), 0U);
+ ASSERT_EQ("OK", MatchError("Corruption: error reading trailing data"));
+}
+
+TEST_P(LogTest, PartialLastIsIgnored) {
+ Write(BigString("bar", kBlockSize));
+ // Cause a bad record length in the LAST block.
+ ShrinkSize(1);
+ ASSERT_EQ("EOF", Read());
+ ASSERT_EQ("", ReportMessage());
+ ASSERT_EQ(0U, DroppedBytes());
+}
+
+TEST_P(LogTest, PartialLastIsNotIgnored) {
+ if (allow_retry_read_) {
+ // If read retry is allowed, then truncated trailing record should not
+ // raise an error.
+ return;
+ }
+ Write(BigString("bar", kBlockSize));
+ // Cause a bad record length in the LAST block.
+ ShrinkSize(1);
+ ASSERT_EQ("EOF", Read(WALRecoveryMode::kAbsoluteConsistency));
+ ASSERT_GT(DroppedBytes(), 0U);
+ ASSERT_EQ("OK", MatchError("Corruption: truncated record body"));
+}
+
+TEST_P(LogTest, ErrorJoinsRecords) {
+ // Consider two fragmented records:
+ // first(R1) last(R1) first(R2) last(R2)
+ // where the middle two fragments disappear. We do not want
+ // first(R1),last(R2) to get joined and returned as a valid record.
+
+ // Write records that span two blocks
+ Write(BigString("foo", kBlockSize));
+ Write(BigString("bar", kBlockSize));
+ Write("correct");
+
+ // Wipe the middle block
+ for (unsigned int offset = kBlockSize; offset < 2 * kBlockSize; offset++) {
+ SetByte(offset, 'x');
+ }
+
+ bool recyclable_log = (std::get<0>(GetParam()) != 0);
+ if (!recyclable_log) {
+ ASSERT_EQ("correct", Read());
+ ASSERT_EQ("EOF", Read());
+ size_t dropped = DroppedBytes();
+ ASSERT_LE(dropped, 2 * kBlockSize + 100);
+ ASSERT_GE(dropped, 2 * kBlockSize);
+ } else {
+ ASSERT_EQ("EOF", Read());
+ }
+}
+
+TEST_P(LogTest, ClearEofSingleBlock) {
+ Write("foo");
+ Write("bar");
+ bool recyclable_log = (std::get<0>(GetParam()) != 0);
+ int header_size = recyclable_log ? kRecyclableHeaderSize : kHeaderSize;
+ ForceEOF(3 + header_size + 2);
+ ASSERT_EQ("foo", Read());
+ UnmarkEOF();
+ ASSERT_EQ("bar", Read());
+ ASSERT_TRUE(IsEOF());
+ ASSERT_EQ("EOF", Read());
+ Write("xxx");
+ UnmarkEOF();
+ ASSERT_EQ("xxx", Read());
+ ASSERT_TRUE(IsEOF());
+}
+
+TEST_P(LogTest, ClearEofMultiBlock) {
+ size_t num_full_blocks = 5;
+ bool recyclable_log = (std::get<0>(GetParam()) != 0);
+ int header_size = recyclable_log ? kRecyclableHeaderSize : kHeaderSize;
+ size_t n = (kBlockSize - header_size) * num_full_blocks + 25;
+ Write(BigString("foo", n));
+ Write(BigString("bar", n));
+ ForceEOF(n + num_full_blocks * header_size + header_size + 3);
+ ASSERT_EQ(BigString("foo", n), Read());
+ ASSERT_TRUE(IsEOF());
+ UnmarkEOF();
+ ASSERT_EQ(BigString("bar", n), Read());
+ ASSERT_TRUE(IsEOF());
+ Write(BigString("xxx", n));
+ UnmarkEOF();
+ ASSERT_EQ(BigString("xxx", n), Read());
+ ASSERT_TRUE(IsEOF());
+}
+
+TEST_P(LogTest, ClearEofError) {
+ // If an error occurs during Read() in UnmarkEOF(), the records contained
+ // in the buffer should be returned on subsequent calls of ReadRecord()
+ // until no more full records are left, whereafter ReadRecord() should return
+ // false to indicate that it cannot read any further.
+
+ Write("foo");
+ Write("bar");
+ UnmarkEOF();
+ ASSERT_EQ("foo", Read());
+ ASSERT_TRUE(IsEOF());
+ Write("xxx");
+ ForceError(0);
+ UnmarkEOF();
+ ASSERT_EQ("bar", Read());
+ ASSERT_EQ("EOF", Read());
+}
+
+TEST_P(LogTest, ClearEofError2) {
+ Write("foo");
+ Write("bar");
+ UnmarkEOF();
+ ASSERT_EQ("foo", Read());
+ Write("xxx");
+ ForceError(3);
+ UnmarkEOF();
+ ASSERT_EQ("bar", Read());
+ ASSERT_EQ("EOF", Read());
+ ASSERT_EQ(3U, DroppedBytes());
+ ASSERT_EQ("OK", MatchError("read error"));
+}
+
+TEST_P(LogTest, Recycle) {
+ bool recyclable_log = (std::get<0>(GetParam()) != 0);
+ if (!recyclable_log) {
+ return; // test is only valid for recycled logs
+ }
+ Write("foo");
+ Write("bar");
+ Write("baz");
+ Write("bif");
+ Write("blitz");
+ while (get_reader_contents()->size() < log::kBlockSize * 2) {
+ Write("xxxxxxxxxxxxxxxx");
+ }
+ std::unique_ptr<FSWritableFile> sink(
+ new test::OverwritingStringSink(get_reader_contents()));
+ std::unique_ptr<WritableFileWriter> dest_holder(new WritableFileWriter(
+ std::move(sink), "" /* don't care */, FileOptions()));
+ Writer recycle_writer(std::move(dest_holder), 123, true);
+ ASSERT_OK(recycle_writer.AddRecord(Slice("foooo")));
+ ASSERT_OK(recycle_writer.AddRecord(Slice("bar")));
+ ASSERT_GE(get_reader_contents()->size(), log::kBlockSize * 2);
+ ASSERT_EQ("foooo", Read());
+ ASSERT_EQ("bar", Read());
+ ASSERT_EQ("EOF", Read());
+}
+
+// Do NOT enable compression for this instantiation.
+INSTANTIATE_TEST_CASE_P(
+ Log, LogTest,
+ ::testing::Combine(::testing::Values(0, 1), ::testing::Bool(),
+ ::testing::Values(CompressionType::kNoCompression)));
+
+class RetriableLogTest : public ::testing::TestWithParam<int> {
+ private:
+ class ReportCollector : public Reader::Reporter {
+ public:
+ size_t dropped_bytes_;
+ std::string message_;
+
+ ReportCollector() : dropped_bytes_(0) {}
+ void Corruption(size_t bytes, const Status& status) override {
+ dropped_bytes_ += bytes;
+ message_.append(status.ToString());
+ }
+ };
+
+ Slice contents_;
+ test::StringSink* sink_;
+ std::unique_ptr<Writer> log_writer_;
+ Env* env_;
+ const std::string test_dir_;
+ const std::string log_file_;
+ std::unique_ptr<WritableFileWriter> writer_;
+ std::unique_ptr<SequentialFileReader> reader_;
+ ReportCollector report_;
+ std::unique_ptr<FragmentBufferedReader> log_reader_;
+
+ public:
+ RetriableLogTest()
+ : contents_(),
+ sink_(new test::StringSink(&contents_)),
+ log_writer_(nullptr),
+ env_(Env::Default()),
+ test_dir_(test::PerThreadDBPath("retriable_log_test")),
+ log_file_(test_dir_ + "/log"),
+ writer_(nullptr),
+ reader_(nullptr),
+ log_reader_(nullptr) {
+ std::unique_ptr<FSWritableFile> sink_holder(sink_);
+ std::unique_ptr<WritableFileWriter> wfw(new WritableFileWriter(
+ std::move(sink_holder), "" /* file name */, FileOptions()));
+ log_writer_.reset(new Writer(std::move(wfw), 123, GetParam()));
+ }
+
+ Status SetupTestEnv() {
+ Status s;
+ FileOptions fopts;
+ auto fs = env_->GetFileSystem();
+ s = fs->CreateDirIfMissing(test_dir_, IOOptions(), nullptr);
+ std::unique_ptr<FSWritableFile> writable_file;
+ if (s.ok()) {
+ s = fs->NewWritableFile(log_file_, fopts, &writable_file, nullptr);
+ }
+ if (s.ok()) {
+ writer_.reset(
+ new WritableFileWriter(std::move(writable_file), log_file_, fopts));
+ EXPECT_NE(writer_, nullptr);
+ }
+ std::unique_ptr<FSSequentialFile> seq_file;
+ if (s.ok()) {
+ s = fs->NewSequentialFile(log_file_, fopts, &seq_file, nullptr);
+ }
+ if (s.ok()) {
+ reader_.reset(new SequentialFileReader(std::move(seq_file), log_file_));
+ EXPECT_NE(reader_, nullptr);
+ log_reader_.reset(new FragmentBufferedReader(
+ nullptr, std::move(reader_), &report_, true /* checksum */,
+ 123 /* log_number */));
+ EXPECT_NE(log_reader_, nullptr);
+ }
+ return s;
+ }
+
+ std::string contents() { return sink_->contents_; }
+
+ void Encode(const std::string& msg) {
+ ASSERT_OK(log_writer_->AddRecord(Slice(msg)));
+ }
+
+ void Write(const Slice& data) {
+ ASSERT_OK(writer_->Append(data));
+ ASSERT_OK(writer_->Sync(true));
+ }
+
+ bool TryRead(std::string* result) {
+ assert(result != nullptr);
+ result->clear();
+ std::string scratch;
+ Slice record;
+ bool r = log_reader_->ReadRecord(&record, &scratch);
+ if (r) {
+ result->assign(record.data(), record.size());
+ return true;
+ } else {
+ return false;
+ }
+ }
+};
+
+TEST_P(RetriableLogTest, TailLog_PartialHeader) {
+ ASSERT_OK(SetupTestEnv());
+ std::vector<int> remaining_bytes_in_last_record;
+ size_t header_size = GetParam() ? kRecyclableHeaderSize : kHeaderSize;
+ bool eof = false;
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->LoadDependency(
+ {{"RetriableLogTest::TailLog:AfterPart1",
+ "RetriableLogTest::TailLog:BeforeReadRecord"},
+ {"FragmentBufferedLogReader::TryReadMore:FirstEOF",
+ "RetriableLogTest::TailLog:BeforePart2"}});
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->SetCallBack(
+ "FragmentBufferedLogReader::TryReadMore:FirstEOF",
+ [&](void* /*arg*/) { eof = true; });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ size_t delta = header_size - 1;
+ port::Thread log_writer_thread([&]() {
+ size_t old_sz = contents().size();
+ Encode("foo");
+ size_t new_sz = contents().size();
+ std::string part1 = contents().substr(old_sz, delta);
+ std::string part2 =
+ contents().substr(old_sz + delta, new_sz - old_sz - delta);
+ Write(Slice(part1));
+ TEST_SYNC_POINT("RetriableLogTest::TailLog:AfterPart1");
+ TEST_SYNC_POINT("RetriableLogTest::TailLog:BeforePart2");
+ Write(Slice(part2));
+ });
+
+ std::string record;
+ port::Thread log_reader_thread([&]() {
+ TEST_SYNC_POINT("RetriableLogTest::TailLog:BeforeReadRecord");
+ while (!TryRead(&record)) {
+ }
+ });
+ log_reader_thread.join();
+ log_writer_thread.join();
+ ASSERT_EQ("foo", record);
+ ASSERT_TRUE(eof);
+}
+
+TEST_P(RetriableLogTest, TailLog_FullHeader) {
+ ASSERT_OK(SetupTestEnv());
+ std::vector<int> remaining_bytes_in_last_record;
+ size_t header_size = GetParam() ? kRecyclableHeaderSize : kHeaderSize;
+ bool eof = false;
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->LoadDependency(
+ {{"RetriableLogTest::TailLog:AfterPart1",
+ "RetriableLogTest::TailLog:BeforeReadRecord"},
+ {"FragmentBufferedLogReader::TryReadMore:FirstEOF",
+ "RetriableLogTest::TailLog:BeforePart2"}});
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->SetCallBack(
+ "FragmentBufferedLogReader::TryReadMore:FirstEOF",
+ [&](void* /*arg*/) { eof = true; });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ size_t delta = header_size + 1;
+ port::Thread log_writer_thread([&]() {
+ size_t old_sz = contents().size();
+ Encode("foo");
+ size_t new_sz = contents().size();
+ std::string part1 = contents().substr(old_sz, delta);
+ std::string part2 =
+ contents().substr(old_sz + delta, new_sz - old_sz - delta);
+ Write(Slice(part1));
+ TEST_SYNC_POINT("RetriableLogTest::TailLog:AfterPart1");
+ TEST_SYNC_POINT("RetriableLogTest::TailLog:BeforePart2");
+ Write(Slice(part2));
+ ASSERT_TRUE(eof);
+ });
+
+ std::string record;
+ port::Thread log_reader_thread([&]() {
+ TEST_SYNC_POINT("RetriableLogTest::TailLog:BeforeReadRecord");
+ while (!TryRead(&record)) {
+ }
+ });
+ log_reader_thread.join();
+ log_writer_thread.join();
+ ASSERT_EQ("foo", record);
+}
+
+TEST_P(RetriableLogTest, NonBlockingReadFullRecord) {
+ // Clear all sync point callbacks even though this test does not use sync
+ // points itself. This is necessary; otherwise the execution of this test
+ // may hit a sync point with which a callback is registered. The registered
+ // callback may access some dead variable, causing a segfault.
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ ASSERT_OK(SetupTestEnv());
+ size_t header_size = GetParam() ? kRecyclableHeaderSize : kHeaderSize;
+ size_t delta = header_size - 1;
+ size_t old_sz = contents().size();
+ Encode("foo-bar");
+ size_t new_sz = contents().size();
+ std::string part1 = contents().substr(old_sz, delta);
+ std::string part2 =
+ contents().substr(old_sz + delta, new_sz - old_sz - delta);
+ Write(Slice(part1));
+ std::string record;
+ ASSERT_FALSE(TryRead(&record));
+ ASSERT_TRUE(record.empty());
+ Write(Slice(part2));
+ ASSERT_TRUE(TryRead(&record));
+ ASSERT_EQ("foo-bar", record);
+}
+
+INSTANTIATE_TEST_CASE_P(bool, RetriableLogTest, ::testing::Values(0, 2));
+
+class CompressionLogTest : public LogTest {
+ public:
+ Status SetupTestEnv() { return writer_->AddCompressionTypeRecord(); }
+};
+
+TEST_P(CompressionLogTest, Empty) {
+ CompressionType compression_type = std::get<2>(GetParam());
+ if (!StreamingCompressionTypeSupported(compression_type)) {
+ ROCKSDB_GTEST_SKIP("Test requires support for compression type");
+ return;
+ }
+ ASSERT_OK(SetupTestEnv());
+ const bool compression_enabled =
+ std::get<2>(GetParam()) == kNoCompression ? false : true;
+ // If WAL compression is enabled, a record is added for the compression type
+ const int compression_record_size = compression_enabled ? kHeaderSize + 4 : 0;
+ ASSERT_EQ(compression_record_size, WrittenBytes());
+ ASSERT_EQ("EOF", Read());
+}
+
+TEST_P(CompressionLogTest, ReadWrite) {
+ CompressionType compression_type = std::get<2>(GetParam());
+ if (!StreamingCompressionTypeSupported(compression_type)) {
+ ROCKSDB_GTEST_SKIP("Test requires support for compression type");
+ return;
+ }
+ ASSERT_OK(SetupTestEnv());
+ Write("foo");
+ Write("bar");
+ Write("");
+ Write("xxxx");
+ ASSERT_EQ("foo", Read());
+ ASSERT_EQ("bar", Read());
+ ASSERT_EQ("", Read());
+ ASSERT_EQ("xxxx", Read());
+ ASSERT_EQ("EOF", Read());
+ ASSERT_EQ("EOF", Read()); // Make sure reads at eof work
+}
+
+TEST_P(CompressionLogTest, ManyBlocks) {
+ CompressionType compression_type = std::get<2>(GetParam());
+ if (!StreamingCompressionTypeSupported(compression_type)) {
+ ROCKSDB_GTEST_SKIP("Test requires support for compression type");
+ return;
+ }
+ ASSERT_OK(SetupTestEnv());
+ for (int i = 0; i < 100000; i++) {
+ Write(NumberString(i));
+ }
+ for (int i = 0; i < 100000; i++) {
+ ASSERT_EQ(NumberString(i), Read());
+ }
+ ASSERT_EQ("EOF", Read());
+}
+
+TEST_P(CompressionLogTest, Fragmentation) {
+ CompressionType compression_type = std::get<2>(GetParam());
+ if (!StreamingCompressionTypeSupported(compression_type)) {
+ ROCKSDB_GTEST_SKIP("Test requires support for compression type");
+ return;
+ }
+ ASSERT_OK(SetupTestEnv());
+ Random rnd(301);
+ const std::vector<std::string> wal_entries = {
+ "small",
+ rnd.RandomBinaryString(3 * kBlockSize / 2), // Spans into block 2
+ rnd.RandomBinaryString(3 * kBlockSize), // Spans into block 5
+ };
+ for (const std::string& wal_entry : wal_entries) {
+ Write(wal_entry);
+ }
+
+ for (const std::string& wal_entry : wal_entries) {
+ ASSERT_EQ(wal_entry, Read());
+ }
+ ASSERT_EQ("EOF", Read());
+}
+
+INSTANTIATE_TEST_CASE_P(
+ Compression, CompressionLogTest,
+ ::testing::Combine(::testing::Values(0, 1), ::testing::Bool(),
+ ::testing::Values(CompressionType::kNoCompression,
+ CompressionType::kZSTD)));
+
+class StreamingCompressionTest
+ : public ::testing::TestWithParam<std::tuple<int, CompressionType>> {};
+
+TEST_P(StreamingCompressionTest, Basic) {
+ size_t input_size = std::get<0>(GetParam());
+ CompressionType compression_type = std::get<1>(GetParam());
+ if (!StreamingCompressionTypeSupported(compression_type)) {
+ ROCKSDB_GTEST_SKIP("Test requires support for compression type");
+ return;
+ }
+ CompressionOptions opts;
+ constexpr uint32_t compression_format_version = 2;
+ StreamingCompress* compress = StreamingCompress::Create(
+ compression_type, opts, compression_format_version, kBlockSize);
+ StreamingUncompress* uncompress = StreamingUncompress::Create(
+ compression_type, compression_format_version, kBlockSize);
+ MemoryAllocator* allocator = new DefaultMemoryAllocator();
+ std::string input_buffer = BigString("abc", input_size);
+ std::vector<std::string> compressed_buffers;
+ size_t remaining;
+ // Call compress till the entire input is consumed
+ do {
+ char* output_buffer = (char*)allocator->Allocate(kBlockSize);
+ size_t output_pos;
+ remaining = compress->Compress(input_buffer.c_str(), input_size,
+ output_buffer, &output_pos);
+ if (output_pos > 0) {
+ std::string compressed_buffer;
+ compressed_buffer.assign(output_buffer, output_pos);
+ compressed_buffers.emplace_back(std::move(compressed_buffer));
+ }
+ allocator->Deallocate((void*)output_buffer);
+ } while (remaining > 0);
+ std::string uncompressed_buffer = "";
+ int ret_val = 0;
+ size_t output_pos;
+ char* uncompressed_output_buffer = (char*)allocator->Allocate(kBlockSize);
+ // Uncompress the fragments and concatenate them.
+ for (int i = 0; i < (int)compressed_buffers.size(); i++) {
+ // Call uncompress till either the entire input is consumed or the output
+ // buffer size is equal to the allocated output buffer size.
+ do {
+ ret_val = uncompress->Uncompress(compressed_buffers[i].c_str(),
+ compressed_buffers[i].size(),
+ uncompressed_output_buffer, &output_pos);
+ if (output_pos > 0) {
+ std::string uncompressed_fragment;
+ uncompressed_fragment.assign(uncompressed_output_buffer, output_pos);
+ uncompressed_buffer += uncompressed_fragment;
+ }
+ } while (ret_val > 0 || output_pos == kBlockSize);
+ }
+ allocator->Deallocate((void*)uncompressed_output_buffer);
+ delete allocator;
+ delete compress;
+ delete uncompress;
+ // The final return value from uncompress() should be 0.
+ ASSERT_EQ(ret_val, 0);
+ ASSERT_EQ(input_buffer, uncompressed_buffer);
+}
+
+INSTANTIATE_TEST_CASE_P(
+ StreamingCompression, StreamingCompressionTest,
+ ::testing::Combine(::testing::Values(10, 100, 1000, kBlockSize,
+ kBlockSize * 2),
+ ::testing::Values(CompressionType::kZSTD)));
+
+} // namespace log
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/log_writer.cc b/src/rocksdb/db/log_writer.cc
new file mode 100644
index 000000000..56f58543e
--- /dev/null
+++ b/src/rocksdb/db/log_writer.cc
@@ -0,0 +1,249 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/log_writer.h"
+
+#include <stdint.h>
+
+#include "file/writable_file_writer.h"
+#include "rocksdb/env.h"
+#include "rocksdb/io_status.h"
+#include "util/coding.h"
+#include "util/crc32c.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace log {
+
+Writer::Writer(std::unique_ptr<WritableFileWriter>&& dest, uint64_t log_number,
+ bool recycle_log_files, bool manual_flush,
+ CompressionType compression_type)
+ : dest_(std::move(dest)),
+ block_offset_(0),
+ log_number_(log_number),
+ recycle_log_files_(recycle_log_files),
+ manual_flush_(manual_flush),
+ compression_type_(compression_type),
+ compress_(nullptr) {
+ for (int i = 0; i <= kMaxRecordType; i++) {
+ char t = static_cast<char>(i);
+ type_crc_[i] = crc32c::Value(&t, 1);
+ }
+}
+
+Writer::~Writer() {
+ if (dest_) {
+ WriteBuffer().PermitUncheckedError();
+ }
+ if (compress_) {
+ delete compress_;
+ }
+}
+
+IOStatus Writer::WriteBuffer() {
+ if (dest_->seen_error()) {
+ return IOStatus::IOError("Seen error. Skip writing buffer.");
+ }
+ return dest_->Flush();
+}
+
+IOStatus Writer::Close() {
+ IOStatus s;
+ if (dest_) {
+ s = dest_->Close();
+ dest_.reset();
+ }
+ return s;
+}
+
+IOStatus Writer::AddRecord(const Slice& slice,
+ Env::IOPriority rate_limiter_priority) {
+ const char* ptr = slice.data();
+ size_t left = slice.size();
+
+ // Header size varies depending on whether we are recycling or not.
+ const int header_size =
+ recycle_log_files_ ? kRecyclableHeaderSize : kHeaderSize;
+
+ // Fragment the record if necessary and emit it. Note that if slice
+ // is empty, we still want to iterate once to emit a single
+ // zero-length record
+ IOStatus s;
+ bool begin = true;
+ int compress_remaining = 0;
+ bool compress_start = false;
+ if (compress_) {
+ compress_->Reset();
+ compress_start = true;
+ }
+ do {
+ const int64_t leftover = kBlockSize - block_offset_;
+ assert(leftover >= 0);
+ if (leftover < header_size) {
+ // Switch to a new block
+ if (leftover > 0) {
+ // Fill the trailer (literal below relies on kHeaderSize and
+ // kRecyclableHeaderSize being <= 11)
+ assert(header_size <= 11);
+ s = dest_->Append(Slice("\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
+ static_cast<size_t>(leftover)),
+ 0 /* crc32c_checksum */, rate_limiter_priority);
+ if (!s.ok()) {
+ break;
+ }
+ }
+ block_offset_ = 0;
+ }
+
+ // Invariant: we never leave < header_size bytes in a block.
+ assert(static_cast<int64_t>(kBlockSize - block_offset_) >= header_size);
+
+ const size_t avail = kBlockSize - block_offset_ - header_size;
+
+ // Compress the record if compression is enabled.
+ // Compress() is called at least once (compress_start=true) and after the
+ // previously generated compressed chunk has been written out as one or more
+ // physical records (left=0).
+ if (compress_ && (compress_start || left == 0)) {
+ compress_remaining = compress_->Compress(slice.data(), slice.size(),
+ compressed_buffer_.get(), &left);
+
+ if (compress_remaining < 0) {
+ // Set failure status
+ s = IOStatus::IOError("Unexpected WAL compression error");
+ s.SetDataLoss(true);
+ break;
+ } else if (left == 0) {
+ // Nothing left to compress
+ if (!compress_start) {
+ break;
+ }
+ }
+ compress_start = false;
+ ptr = compressed_buffer_.get();
+ }
+
+ const size_t fragment_length = (left < avail) ? left : avail;
+
+ RecordType type;
+ const bool end = (left == fragment_length && compress_remaining == 0);
+ if (begin && end) {
+ type = recycle_log_files_ ? kRecyclableFullType : kFullType;
+ } else if (begin) {
+ type = recycle_log_files_ ? kRecyclableFirstType : kFirstType;
+ } else if (end) {
+ type = recycle_log_files_ ? kRecyclableLastType : kLastType;
+ } else {
+ type = recycle_log_files_ ? kRecyclableMiddleType : kMiddleType;
+ }
+
+ s = EmitPhysicalRecord(type, ptr, fragment_length, rate_limiter_priority);
+ ptr += fragment_length;
+ left -= fragment_length;
+ begin = false;
+ } while (s.ok() && (left > 0 || compress_remaining > 0));
+
+ if (s.ok()) {
+ if (!manual_flush_) {
+ s = dest_->Flush(rate_limiter_priority);
+ }
+ }
+
+ return s;
+}
+
+IOStatus Writer::AddCompressionTypeRecord() {
+ // Should be the first record
+ assert(block_offset_ == 0);
+
+ if (compression_type_ == kNoCompression) {
+ // No need to add a record
+ return IOStatus::OK();
+ }
+
+ CompressionTypeRecord record(compression_type_);
+ std::string encode;
+ record.EncodeTo(&encode);
+ IOStatus s =
+ EmitPhysicalRecord(kSetCompressionType, encode.data(), encode.size());
+ if (s.ok()) {
+ if (!manual_flush_) {
+ s = dest_->Flush();
+ }
+ // Initialize fields required for compression
+ const size_t max_output_buffer_len =
+ kBlockSize - (recycle_log_files_ ? kRecyclableHeaderSize : kHeaderSize);
+ CompressionOptions opts;
+ constexpr uint32_t compression_format_version = 2;
+ compress_ = StreamingCompress::Create(compression_type_, opts,
+ compression_format_version,
+ max_output_buffer_len);
+ assert(compress_ != nullptr);
+ compressed_buffer_ =
+ std::unique_ptr<char[]>(new char[max_output_buffer_len]);
+ assert(compressed_buffer_);
+ } else {
+ // Disable compression if the record could not be added.
+ compression_type_ = kNoCompression;
+ }
+ return s;
+}
+
+bool Writer::BufferIsEmpty() { return dest_->BufferIsEmpty(); }
+
+IOStatus Writer::EmitPhysicalRecord(RecordType t, const char* ptr, size_t n,
+ Env::IOPriority rate_limiter_priority) {
+ assert(n <= 0xffff); // Must fit in two bytes
+
+ size_t header_size;
+ char buf[kRecyclableHeaderSize];
+
+ // Format the header
+ buf[4] = static_cast<char>(n & 0xff);
+ buf[5] = static_cast<char>(n >> 8);
+ buf[6] = static_cast<char>(t);
+
+ uint32_t crc = type_crc_[t];
+ if (t < kRecyclableFullType || t == kSetCompressionType) {
+ // Legacy record format
+ assert(block_offset_ + kHeaderSize + n <= kBlockSize);
+ header_size = kHeaderSize;
+ } else {
+ // Recyclable record format
+ assert(block_offset_ + kRecyclableHeaderSize + n <= kBlockSize);
+ header_size = kRecyclableHeaderSize;
+
+ // Only encode low 32-bits of the 64-bit log number. This means
+ // we will fail to detect an old record if we recycled a log from
+ // ~4 billion logs ago, but that is effectively impossible, and
+ // even if it were we'd be far more likely to see a false positive
+ // on the 32-bit CRC.
+ EncodeFixed32(buf + 7, static_cast<uint32_t>(log_number_));
+ crc = crc32c::Extend(crc, buf + 7, 4);
+ }
+
+ // Compute the crc of the record type and the payload.
+ uint32_t payload_crc = crc32c::Value(ptr, n);
+ crc = crc32c::Crc32cCombine(crc, payload_crc, n);
+ crc = crc32c::Mask(crc); // Adjust for storage
+ TEST_SYNC_POINT_CALLBACK("LogWriter::EmitPhysicalRecord:BeforeEncodeChecksum",
+ &crc);
+ EncodeFixed32(buf, crc);
+
+ // Write the header and the payload
+ IOStatus s = dest_->Append(Slice(buf, header_size), 0 /* crc32c_checksum */,
+ rate_limiter_priority);
+ if (s.ok()) {
+ s = dest_->Append(Slice(ptr, n), payload_crc, rate_limiter_priority);
+ }
+ block_offset_ += header_size + n;
+ return s;
+}
+
+} // namespace log
+} // namespace ROCKSDB_NAMESPACE
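The fragmentation loop in AddRecord() above is easiest to follow with a small arithmetic sketch. The helper below is illustrative only and not part of the RocksDB sources; it assumes kBlockSize is 32768 and the 7-byte legacy / 11-byte recyclable header sizes implied by the record formats, and it ignores WAL compression.

#include <cstddef>

// Counts how many physical records AddRecord() would emit for a payload of
// `payload` bytes, starting at `block_offset` within the current block.
size_t NumPhysicalRecords(size_t payload, size_t block_offset,
                          bool recyclable) {
  const size_t kBlock = 32768;                // kBlockSize (assumed)
  const size_t header = recyclable ? 11 : 7;  // kRecyclableHeaderSize / kHeaderSize
  size_t records = 0;
  size_t offset = block_offset;
  size_t left = payload;
  do {
    if (kBlock - offset < header) {
      offset = 0;  // leftover bytes are zero-padded; switch to a new block
    }
    const size_t avail = kBlock - offset - header;
    const size_t fragment = left < avail ? left : avail;
    offset += header + fragment;
    left -= fragment;
    ++records;  // kFullType, or a kFirstType/kMiddleType.../kLastType chain
  } while (left > 0);
  return records;
}

// For example, NumPhysicalRecords(3 * 32768 / 2, 0, false) == 2: a kFirstType
// fragment filling block 1 and a kLastType fragment spilling into block 2,
// matching the Fragmentation test in log_test.cc above.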
diff --git a/src/rocksdb/db/log_writer.h b/src/rocksdb/db/log_writer.h
new file mode 100644
index 000000000..5d266e434
--- /dev/null
+++ b/src/rocksdb/db/log_writer.h
@@ -0,0 +1,128 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+
+#include <cstdint>
+#include <memory>
+
+#include "db/log_format.h"
+#include "rocksdb/compression_type.h"
+#include "rocksdb/env.h"
+#include "rocksdb/io_status.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+#include "util/compression.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class WritableFileWriter;
+
+namespace log {
+
+/**
+ * Writer is a general purpose log stream writer. It provides an append-only
+ * abstraction for writing data. The details of how the data is written are
+ * handled by the WritableFile sub-class implementation.
+ *
+ * File format:
+ *
+ * File is broken down into variable sized records. The format of each record
+ * is described below.
+ * +-----+-------------+--+----+----------+------+-- ... ----+
+ * File | r0 | r1 |P | r2 | r3 | r4 | |
+ * +-----+-------------+--+----+----------+------+-- ... ----+
+ * <--- kBlockSize ------>|<-- kBlockSize ------>|
+ * rn = variable size records
+ * P = Padding
+ *
+ * Data is written out in kBlockSize chunks. If the next record does not fit
+ * into the space left, the leftover space will be padded with \0.
+ *
+ * Legacy record format:
+ *
+ * +---------+-----------+-----------+--- ... ---+
+ * |CRC (4B) | Size (2B) | Type (1B) | Payload |
+ * +---------+-----------+-----------+--- ... ---+
+ *
+ * CRC = 32bit hash computed over the record type and payload using CRC32C
+ * Size = Length of the payload data
+ * Type = Type of record
+ * (kZeroType, kFullType, kFirstType, kLastType, kMiddleType)
+ * The type is used to group the fragments of a single logical record that
+ * is larger than kBlockSize
+ * Payload = Byte stream as long as specified by the payload size
+ *
+ * Recyclable record format:
+ *
+ * +---------+-----------+-----------+----------------+--- ... ---+
+ * |CRC (4B) | Size (2B) | Type (1B) | Log number (4B)| Payload |
+ * +---------+-----------+-----------+----------------+--- ... ---+
+ *
+ * Same as above, with the addition of
+ * Log number = 32bit log file number, so that we can distinguish between
+ * records written by the most recent log writer vs a previous one.
+ */
+class Writer {
+ public:
+ // Create a writer that will append data to "*dest".
+ // "*dest" must be initially empty.
+ // "*dest" must remain live while this Writer is in use.
+ explicit Writer(std::unique_ptr<WritableFileWriter>&& dest,
+ uint64_t log_number, bool recycle_log_files,
+ bool manual_flush = false,
+ CompressionType compressionType = kNoCompression);
+ // No copying allowed
+ Writer(const Writer&) = delete;
+ void operator=(const Writer&) = delete;
+
+ ~Writer();
+
+ IOStatus AddRecord(const Slice& slice,
+ Env::IOPriority rate_limiter_priority = Env::IO_TOTAL);
+ IOStatus AddCompressionTypeRecord();
+
+ WritableFileWriter* file() { return dest_.get(); }
+ const WritableFileWriter* file() const { return dest_.get(); }
+
+ uint64_t get_log_number() const { return log_number_; }
+
+ IOStatus WriteBuffer();
+
+ IOStatus Close();
+
+ bool BufferIsEmpty();
+
+ private:
+ std::unique_ptr<WritableFileWriter> dest_;
+ size_t block_offset_; // Current offset in block
+ uint64_t log_number_;
+ bool recycle_log_files_;
+
+ // crc32c values for all supported record types. These are
+ // pre-computed to reduce the overhead of computing the crc of the
+ // record type stored in the header.
+ uint32_t type_crc_[kMaxRecordType + 1];
+
+ IOStatus EmitPhysicalRecord(
+ RecordType type, const char* ptr, size_t length,
+ Env::IOPriority rate_limiter_priority = Env::IO_TOTAL);
+
+ // If true, it does not flush after each write. Instead it relies on the upper
+ // layer to manually do the flush by calling ::WriteBuffer()
+ bool manual_flush_;
+
+ // Compression Type
+ CompressionType compression_type_;
+ StreamingCompress* compress_;
+ // Reusable compressed output buffer
+ std::unique_ptr<char[]> compressed_buffer_;
+};
+
+} // namespace log
+} // namespace ROCKSDB_NAMESPACE
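A minimal usage sketch for the Writer declared above, not a definitive recipe: it assumes the caller has already wrapped the WAL file in a WritableFileWriter and that ZSTD support is compiled in (otherwise pass kNoCompression and skip the compression record).

#include <memory>

#include "db/log_writer.h"
#include "file/writable_file_writer.h"

ROCKSDB_NAMESPACE::IOStatus AppendTwoRecords(
    std::unique_ptr<ROCKSDB_NAMESPACE::WritableFileWriter> dest,
    uint64_t log_number) {
  using namespace ROCKSDB_NAMESPACE;
  log::Writer writer(std::move(dest), log_number,
                     /*recycle_log_files=*/false, /*manual_flush=*/false,
                     kZSTD);
  // When compression is enabled this must be the very first record,
  // while the writer is still at offset 0 in the first block.
  IOStatus s = writer.AddCompressionTypeRecord();
  if (s.ok()) {
    s = writer.AddRecord(Slice("first record"));
  }
  if (s.ok()) {
    s = writer.AddRecord(Slice("second record"));
  }
  if (s.ok()) {
    s = writer.Close();
  }
  return s;
}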
diff --git a/src/rocksdb/db/logs_with_prep_tracker.cc b/src/rocksdb/db/logs_with_prep_tracker.cc
new file mode 100644
index 000000000..ff98155c4
--- /dev/null
+++ b/src/rocksdb/db/logs_with_prep_tracker.cc
@@ -0,0 +1,67 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#include "db/logs_with_prep_tracker.h"
+
+#include "port/likely.h"
+
+namespace ROCKSDB_NAMESPACE {
+void LogsWithPrepTracker::MarkLogAsHavingPrepSectionFlushed(uint64_t log) {
+ assert(log != 0);
+ std::lock_guard<std::mutex> lock(prepared_section_completed_mutex_);
+ auto it = prepared_section_completed_.find(log);
+ if (UNLIKELY(it == prepared_section_completed_.end())) {
+ prepared_section_completed_[log] = 1;
+ } else {
+ it->second += 1;
+ }
+}
+
+void LogsWithPrepTracker::MarkLogAsContainingPrepSection(uint64_t log) {
+ assert(log != 0);
+ std::lock_guard<std::mutex> lock(logs_with_prep_mutex_);
+
+ auto rit = logs_with_prep_.rbegin();
+ bool updated = false;
+ // Most probably the last log is the one that is being marked for
+ // having a prepare section; so search from the end.
+ for (; rit != logs_with_prep_.rend() && rit->log >= log; ++rit) {
+ if (rit->log == log) {
+ rit->cnt++;
+ updated = true;
+ break;
+ }
+ }
+ if (!updated) {
+ // We are either at the start, or at a position with rit->log < log
+ logs_with_prep_.insert(rit.base(), {log, 1});
+ }
+}
+
+uint64_t LogsWithPrepTracker::FindMinLogContainingOutstandingPrep() {
+ std::lock_guard<std::mutex> lock(logs_with_prep_mutex_);
+ auto it = logs_with_prep_.begin();
+ // start with the smallest log
+ for (; it != logs_with_prep_.end();) {
+ auto min_log = it->log;
+ {
+ std::lock_guard<std::mutex> lock2(prepared_section_completed_mutex_);
+ auto completed_it = prepared_section_completed_.find(min_log);
+ if (completed_it == prepared_section_completed_.end() ||
+ completed_it->second < it->cnt) {
+ return min_log;
+ }
+ assert(completed_it != prepared_section_completed_.end() &&
+ completed_it->second == it->cnt);
+ prepared_section_completed_.erase(completed_it);
+ }
+ // Erasing from the beginning of a vector is not efficient, but this
+ // function is not on the fast path.
+ it = logs_with_prep_.erase(it);
+ }
+ // no such log found
+ return 0;
+}
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/logs_with_prep_tracker.h b/src/rocksdb/db/logs_with_prep_tracker.h
new file mode 100644
index 000000000..f72f0ca07
--- /dev/null
+++ b/src/rocksdb/db/logs_with_prep_tracker.h
@@ -0,0 +1,62 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#pragma once
+
+#include <cassert>
+#include <cstdint>
+#include <cstdlib>
+#include <mutex>
+#include <unordered_map>
+#include <vector>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// This class is used to track the log files with outstanding prepare entries.
+class LogsWithPrepTracker {
+ public:
+ // Called when a transaction prepared in `log` has been committed or aborted.
+ void MarkLogAsHavingPrepSectionFlushed(uint64_t log);
+ // Called when a transaction is prepared in `log`.
+ void MarkLogAsContainingPrepSection(uint64_t log);
+ // Return the earliest log file with outstanding prepare entries.
+ uint64_t FindMinLogContainingOutstandingPrep();
+ size_t TEST_PreparedSectionCompletedSize() {
+ return prepared_section_completed_.size();
+ }
+ size_t TEST_LogsWithPrepSize() { return logs_with_prep_.size(); }
+
+ private:
+ // REQUIRES: logs_with_prep_mutex_ held
+ //
+ // sorted list of log numbers still containing prepared data.
+ // this is used by FindObsoleteFiles to determine which
+ // flushed logs we must keep around because they still
+ // contain prepared data which has not been committed or rolled back
+ struct LogCnt {
+ uint64_t log; // the log number
+ uint64_t cnt; // number of prepared sections in the log
+ };
+ std::vector<LogCnt> logs_with_prep_;
+ std::mutex logs_with_prep_mutex_;
+
+ // REQUIRES: prepared_section_completed_mutex_ held
+ //
+ // to be used in conjunction with logs_with_prep_.
+ // Once a transaction with data in log L is committed or rolled back,
+ // rather than updating logs_with_prep_ directly we record that in
+ // prepared_section_completed_, which maps log number -> completion count.
+ // This helps avoid contention between a commit thread and the prepare
+ // threads.
+ //
+ // When trying to determine the minimum log still active we first consult
+ // logs_with_prep_. While its smallest entry maps to an equal count in
+ // prepared_section_completed_ we erase the log from both containers.
+ std::unordered_map<uint64_t, uint64_t> prepared_section_completed_;
+ std::mutex prepared_section_completed_mutex_;
+};
+} // namespace ROCKSDB_NAMESPACE
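A minimal sketch of the intended call pattern, using only the interface declared above: a prepare marks its WAL, a commit or rollback flushes that mark, and FindMinLogContainingOutstandingPrep() reports the oldest WAL that must still be retained (0 when none).

#include <cassert>

#include "db/logs_with_prep_tracker.h"

void TrackerExample() {
  ROCKSDB_NAMESPACE::LogsWithPrepTracker tracker;
  tracker.MarkLogAsContainingPrepSection(5);  // txn A prepared in WAL 5
  tracker.MarkLogAsContainingPrepSection(7);  // txn B prepared in WAL 7
  assert(tracker.FindMinLogContainingOutstandingPrep() == 5);

  tracker.MarkLogAsHavingPrepSectionFlushed(5);  // txn A committed
  assert(tracker.FindMinLogContainingOutstandingPrep() == 7);

  tracker.MarkLogAsHavingPrepSectionFlushed(7);  // txn B rolled back
  assert(tracker.FindMinLogContainingOutstandingPrep() == 0);  // nothing left
}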
diff --git a/src/rocksdb/db/lookup_key.h b/src/rocksdb/db/lookup_key.h
new file mode 100644
index 000000000..68851bddd
--- /dev/null
+++ b/src/rocksdb/db/lookup_key.h
@@ -0,0 +1,68 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <string>
+#include <utility>
+
+#include "rocksdb/slice.h"
+#include "rocksdb/types.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// A helper class useful for DBImpl::Get()
+class LookupKey {
+ public:
+ // Initialize *this for looking up user_key at a snapshot with
+ // the specified sequence number.
+ LookupKey(const Slice& _user_key, SequenceNumber sequence,
+ const Slice* ts = nullptr);
+
+ ~LookupKey();
+
+ // Return a key suitable for lookup in a MemTable.
+ Slice memtable_key() const {
+ return Slice(start_, static_cast<size_t>(end_ - start_));
+ }
+
+ // Return an internal key (suitable for passing to an internal iterator)
+ Slice internal_key() const {
+ return Slice(kstart_, static_cast<size_t>(end_ - kstart_));
+ }
+
+ // Return the user key.
+ // If user-defined timestamp is enabled, then timestamp is included in the
+ // result.
+ Slice user_key() const {
+ return Slice(kstart_, static_cast<size_t>(end_ - kstart_ - 8));
+ }
+
+ private:
+ // We construct a char array of the form:
+ // klength varint32 <-- start_
+ // userkey char[klength] <-- kstart_
+ // tag uint64
+ // <-- end_
+ // The array is a suitable MemTable key.
+ // The suffix starting with "userkey" can be used as an InternalKey.
+ const char* start_;
+ const char* kstart_;
+ const char* end_;
+ char space_[200]; // Avoid allocation for short keys
+
+ // No copying allowed
+ LookupKey(const LookupKey&);
+ void operator=(const LookupKey&);
+};
+
+inline LookupKey::~LookupKey() {
+ if (start_ != space_) delete[] start_;
+}
+
+} // namespace ROCKSDB_NAMESPACE
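A small sketch of the three views a LookupKey exposes, assuming no user-defined timestamp; the constructor is declared above and defined elsewhere in the db/ sources.

#include <cassert>

#include "db/lookup_key.h"

void LookupKeyViews() {
  using namespace ROCKSDB_NAMESPACE;
  LookupKey lkey(Slice("mykey"), /*sequence=*/100);
  Slice mem_key = lkey.memtable_key();  // varint32 length + user key + 8-byte tag
  Slice ikey = lkey.internal_key();     // user key + 8-byte (sequence, type) tag
  Slice ukey = lkey.user_key();         // just "mykey"
  assert(ukey.size() == 5);
  assert(ikey.size() == ukey.size() + 8);
  // For this short key the varint32 length prefix is a single byte.
  assert(mem_key.size() == ikey.size() + 1);
}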
diff --git a/src/rocksdb/db/malloc_stats.cc b/src/rocksdb/db/malloc_stats.cc
new file mode 100644
index 000000000..52f2e6e0f
--- /dev/null
+++ b/src/rocksdb/db/malloc_stats.cc
@@ -0,0 +1,55 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/malloc_stats.h"
+
+#ifndef ROCKSDB_LITE
+#include <string.h>
+
+#include <memory>
+
+#include "port/jemalloc_helper.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#ifdef ROCKSDB_JEMALLOC
+
+struct MallocStatus {
+ char* cur;
+ char* end;
+};
+
+static void GetJemallocStatus(void* mstat_arg, const char* status) {
+ MallocStatus* mstat = reinterpret_cast<MallocStatus*>(mstat_arg);
+ size_t status_len = status ? strlen(status) : 0;
+ size_t buf_size = (size_t)(mstat->end - mstat->cur);
+ if (!status_len || status_len > buf_size) {
+ return;
+ }
+
+ snprintf(mstat->cur, buf_size, "%s", status);
+ mstat->cur += status_len;
+}
+void DumpMallocStats(std::string* stats) {
+ if (!HasJemalloc()) {
+ return;
+ }
+ MallocStatus mstat;
+ const unsigned int kMallocStatusLen = 1000000;
+ std::unique_ptr<char[]> buf{new char[kMallocStatusLen + 1]};
+ mstat.cur = buf.get();
+ mstat.end = buf.get() + kMallocStatusLen;
+ malloc_stats_print(GetJemallocStatus, &mstat, "");
+ stats->append(buf.get());
+}
+#else
+void DumpMallocStats(std::string*) {}
+#endif // ROCKSDB_JEMALLOC
+} // namespace ROCKSDB_NAMESPACE
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/malloc_stats.h b/src/rocksdb/db/malloc_stats.h
new file mode 100644
index 000000000..18aff3ad0
--- /dev/null
+++ b/src/rocksdb/db/malloc_stats.h
@@ -0,0 +1,24 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <string>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+void DumpMallocStats(std::string*);
+
+}
+
+#endif // !ROCKSDB_LITE
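A trivial usage sketch, assuming a non-LITE build: DumpMallocStats() appends a jemalloc report to the given string and is a no-op when RocksDB was not built with jemalloc.

#include <string>

#include "db/malloc_stats.h"

std::string GetAllocatorReport() {
  std::string stats;
  ROCKSDB_NAMESPACE::DumpMallocStats(&stats);  // stays empty without jemalloc
  return stats;
}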
diff --git a/src/rocksdb/db/manual_compaction_test.cc b/src/rocksdb/db/manual_compaction_test.cc
new file mode 100644
index 000000000..b92cb794b
--- /dev/null
+++ b/src/rocksdb/db/manual_compaction_test.cc
@@ -0,0 +1,308 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Test for issue 178: a manual compaction causes deleted data to reappear.
+#include <cstdlib>
+
+#include "port/port.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/db.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/write_batch.h"
+#include "test_util/testharness.h"
+
+using ROCKSDB_NAMESPACE::CompactionFilter;
+using ROCKSDB_NAMESPACE::CompactionStyle;
+using ROCKSDB_NAMESPACE::CompactRangeOptions;
+using ROCKSDB_NAMESPACE::CompressionType;
+using ROCKSDB_NAMESPACE::DB;
+using ROCKSDB_NAMESPACE::DestroyDB;
+using ROCKSDB_NAMESPACE::FlushOptions;
+using ROCKSDB_NAMESPACE::Iterator;
+using ROCKSDB_NAMESPACE::Options;
+using ROCKSDB_NAMESPACE::ReadOptions;
+using ROCKSDB_NAMESPACE::Slice;
+using ROCKSDB_NAMESPACE::WriteBatch;
+using ROCKSDB_NAMESPACE::WriteOptions;
+
+namespace {
+
+// Reasoning: previously the number was 1100000. Since the keys are written to
+// the batch in one write, each write will result in one SST file. We reduced
+// the write_buffer_size to 1K to get basically the same effect with far fewer
+// keys, which results in a shorter test runtime.
+const int kNumKeys = 1100;
+
+std::string Key1(int i) {
+ char buf[100];
+ snprintf(buf, sizeof(buf), "my_key_%d", i);
+ return buf;
+}
+
+std::string Key2(int i) { return Key1(i) + "_xxx"; }
+
+class ManualCompactionTest : public testing::Test {
+ public:
+ ManualCompactionTest() {
+ // Get rid of any state from an old run.
+ dbname_ = ROCKSDB_NAMESPACE::test::PerThreadDBPath(
+ "rocksdb_manual_compaction_test");
+ EXPECT_OK(DestroyDB(dbname_, Options()));
+ }
+
+ std::string dbname_;
+};
+
+class DestroyAllCompactionFilter : public CompactionFilter {
+ public:
+ DestroyAllCompactionFilter() {}
+
+ bool Filter(int /*level*/, const Slice& /*key*/, const Slice& existing_value,
+ std::string* /*new_value*/,
+ bool* /*value_changed*/) const override {
+ return existing_value.ToString() == "destroy";
+ }
+
+ const char* Name() const override { return "DestroyAllCompactionFilter"; }
+};
+
+class LogCompactionFilter : public CompactionFilter {
+ public:
+ const char* Name() const override { return "LogCompactionFilter"; }
+
+ bool Filter(int level, const Slice& key, const Slice& /*existing_value*/,
+ std::string* /*new_value*/,
+ bool* /*value_changed*/) const override {
+ key_level_[key.ToString()] = level;
+ return false;
+ }
+
+ void Reset() { key_level_.clear(); }
+
+ size_t NumKeys() const { return key_level_.size(); }
+
+ int KeyLevel(const Slice& key) {
+ auto it = key_level_.find(key.ToString());
+ if (it == key_level_.end()) {
+ return -1;
+ }
+ return it->second;
+ }
+
+ private:
+ mutable std::map<std::string, int> key_level_;
+};
+
+TEST_F(ManualCompactionTest, CompactTouchesAllKeys) {
+ for (int iter = 0; iter < 2; ++iter) {
+ DB* db;
+ Options options;
+ if (iter == 0) { // level compaction
+ options.num_levels = 3;
+ options.compaction_style = CompactionStyle::kCompactionStyleLevel;
+ } else { // universal compaction
+ options.compaction_style = CompactionStyle::kCompactionStyleUniversal;
+ }
+ options.create_if_missing = true;
+ options.compression = CompressionType::kNoCompression;
+ options.compaction_filter = new DestroyAllCompactionFilter();
+ ASSERT_OK(DB::Open(options, dbname_, &db));
+
+ ASSERT_OK(db->Put(WriteOptions(), Slice("key1"), Slice("destroy")));
+ ASSERT_OK(db->Put(WriteOptions(), Slice("key2"), Slice("destroy")));
+ ASSERT_OK(db->Put(WriteOptions(), Slice("key3"), Slice("value3")));
+ ASSERT_OK(db->Put(WriteOptions(), Slice("key4"), Slice("destroy")));
+
+ Slice key4("key4");
+ ASSERT_OK(db->CompactRange(CompactRangeOptions(), nullptr, &key4));
+ Iterator* itr = db->NewIterator(ReadOptions());
+ itr->SeekToFirst();
+ ASSERT_TRUE(itr->Valid());
+ ASSERT_EQ("key3", itr->key().ToString());
+ itr->Next();
+ ASSERT_TRUE(!itr->Valid());
+ delete itr;
+
+ delete options.compaction_filter;
+ delete db;
+ ASSERT_OK(DestroyDB(dbname_, options));
+ }
+}
+
+TEST_F(ManualCompactionTest, Test) {
+ // Open database. Disable compression since it affects the creation
+ // of layers and the code below is trying to test against a very
+ // specific scenario.
+ DB* db;
+ Options db_options;
+ db_options.write_buffer_size = 1024;
+ db_options.create_if_missing = true;
+ db_options.compression = CompressionType::kNoCompression;
+ ASSERT_OK(DB::Open(db_options, dbname_, &db));
+
+ // create first key range
+ WriteBatch batch;
+ for (int i = 0; i < kNumKeys; i++) {
+ ASSERT_OK(batch.Put(Key1(i), "value for range 1 key"));
+ }
+ ASSERT_OK(db->Write(WriteOptions(), &batch));
+
+ // create second key range
+ batch.Clear();
+ for (int i = 0; i < kNumKeys; i++) {
+ ASSERT_OK(batch.Put(Key2(i), "value for range 2 key"));
+ }
+ ASSERT_OK(db->Write(WriteOptions(), &batch));
+
+ // delete second key range
+ batch.Clear();
+ for (int i = 0; i < kNumKeys; i++) {
+ ASSERT_OK(batch.Delete(Key2(i)));
+ }
+ ASSERT_OK(db->Write(WriteOptions(), &batch));
+
+ // compact database
+ std::string start_key = Key1(0);
+ std::string end_key = Key1(kNumKeys - 1);
+ Slice least(start_key.data(), start_key.size());
+ Slice greatest(end_key.data(), end_key.size());
+
+ // commenting out the line below causes the example to work correctly
+ ASSERT_OK(db->CompactRange(CompactRangeOptions(), &least, &greatest));
+
+ // count the keys
+ Iterator* iter = db->NewIterator(ReadOptions());
+ int num_keys = 0;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ num_keys++;
+ }
+ delete iter;
+ ASSERT_EQ(kNumKeys, num_keys) << "Bad number of keys";
+
+ // close database
+ delete db;
+ ASSERT_OK(DestroyDB(dbname_, Options()));
+}
+
+TEST_F(ManualCompactionTest, SkipLevel) {
+ DB* db;
+ Options options;
+ options.num_levels = 3;
+ // Initially, flushed L0 files won't exceed 100.
+ options.level0_file_num_compaction_trigger = 100;
+ options.compaction_style = CompactionStyle::kCompactionStyleLevel;
+ options.create_if_missing = true;
+ options.compression = CompressionType::kNoCompression;
+ LogCompactionFilter* filter = new LogCompactionFilter();
+ options.compaction_filter = filter;
+ ASSERT_OK(DB::Open(options, dbname_, &db));
+
+ WriteOptions wo;
+ FlushOptions fo;
+ ASSERT_OK(db->Put(wo, "1", ""));
+ ASSERT_OK(db->Flush(fo));
+ ASSERT_OK(db->Put(wo, "2", ""));
+ ASSERT_OK(db->Flush(fo));
+ ASSERT_OK(db->Put(wo, "4", ""));
+ ASSERT_OK(db->Put(wo, "8", ""));
+ ASSERT_OK(db->Flush(fo));
+
+ {
+ // L0: 1, 2, [4, 8]
+ // no file has keys in range [5, 7]
+ Slice start("5");
+ Slice end("7");
+ filter->Reset();
+ ASSERT_OK(db->CompactRange(CompactRangeOptions(), &start, &end));
+ ASSERT_EQ(0, filter->NumKeys());
+ }
+
+ {
+ // L0: 1, 2, [4, 8]
+ // [3, 7] overlaps with 4 in L0
+ Slice start("3");
+ Slice end("7");
+ filter->Reset();
+ ASSERT_OK(db->CompactRange(CompactRangeOptions(), &start, &end));
+ ASSERT_EQ(2, filter->NumKeys());
+ ASSERT_EQ(0, filter->KeyLevel("4"));
+ ASSERT_EQ(0, filter->KeyLevel("8"));
+ }
+
+ {
+ // L0: 1, 2
+ // L1: [4, 8]
+ // no file has keys in range (-inf, 0]
+ Slice end("0");
+ filter->Reset();
+ ASSERT_OK(db->CompactRange(CompactRangeOptions(), nullptr, &end));
+ ASSERT_EQ(0, filter->NumKeys());
+ }
+
+ {
+ // L0: 1, 2
+ // L1: [4, 8]
+ // no file has keys in range [9, inf)
+ Slice start("9");
+ filter->Reset();
+ ASSERT_OK(db->CompactRange(CompactRangeOptions(), &start, nullptr));
+ ASSERT_EQ(0, filter->NumKeys());
+ }
+
+ {
+ // L0: 1, 2
+ // L1: [4, 8]
+ // [2, 2] overlaps with 2 in L0
+ Slice start("2");
+ Slice end("2");
+ filter->Reset();
+ ASSERT_OK(db->CompactRange(CompactRangeOptions(), &start, &end));
+ ASSERT_EQ(1, filter->NumKeys());
+ ASSERT_EQ(0, filter->KeyLevel("2"));
+ }
+
+ {
+ // L0: 1
+ // L1: 2, [4, 8]
+ // [2, 5] overlaps with 2 and [4, 8) in L1, skip L0
+ Slice start("2");
+ Slice end("5");
+ filter->Reset();
+ ASSERT_OK(db->CompactRange(CompactRangeOptions(), &start, &end));
+ ASSERT_EQ(3, filter->NumKeys());
+ ASSERT_EQ(1, filter->KeyLevel("2"));
+ ASSERT_EQ(1, filter->KeyLevel("4"));
+ ASSERT_EQ(1, filter->KeyLevel("8"));
+ }
+
+ {
+ // L0: 1
+ // L1: [2, 4, 8]
+ // [0, inf) overlaps all files
+ Slice start("0");
+ filter->Reset();
+ ASSERT_OK(db->CompactRange(CompactRangeOptions(), &start, nullptr));
+ ASSERT_EQ(4, filter->NumKeys());
+ // 1 is first compacted to L1 and then further compacted into [2, 4, 8],
+ // so finally the logged level for 1 is L1.
+ ASSERT_EQ(1, filter->KeyLevel("1"));
+ ASSERT_EQ(1, filter->KeyLevel("2"));
+ ASSERT_EQ(1, filter->KeyLevel("4"));
+ ASSERT_EQ(1, filter->KeyLevel("8"));
+ }
+
+ delete filter;
+ delete db;
+ ASSERT_OK(DestroyDB(dbname_, options));
+}
+
+} // anonymous namespace
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/memtable.cc b/src/rocksdb/db/memtable.cc
new file mode 100644
index 000000000..45b139e80
--- /dev/null
+++ b/src/rocksdb/db/memtable.cc
@@ -0,0 +1,1675 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/memtable.h"
+
+#include <algorithm>
+#include <array>
+#include <limits>
+#include <memory>
+
+#include "db/dbformat.h"
+#include "db/kv_checksum.h"
+#include "db/merge_context.h"
+#include "db/merge_helper.h"
+#include "db/pinned_iterators_manager.h"
+#include "db/range_tombstone_fragmenter.h"
+#include "db/read_callback.h"
+#include "db/wide/wide_column_serialization.h"
+#include "logging/logging.h"
+#include "memory/arena.h"
+#include "memory/memory_usage.h"
+#include "monitoring/perf_context_imp.h"
+#include "monitoring/statistics.h"
+#include "port/lang.h"
+#include "port/port.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/env.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/types.h"
+#include "rocksdb/write_buffer_manager.h"
+#include "table/internal_iterator.h"
+#include "table/iterator_wrapper.h"
+#include "table/merging_iterator.h"
+#include "util/autovector.h"
+#include "util/coding.h"
+#include "util/mutexlock.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+ImmutableMemTableOptions::ImmutableMemTableOptions(
+ const ImmutableOptions& ioptions,
+ const MutableCFOptions& mutable_cf_options)
+ : arena_block_size(mutable_cf_options.arena_block_size),
+ memtable_prefix_bloom_bits(
+ static_cast<uint32_t>(
+ static_cast<double>(mutable_cf_options.write_buffer_size) *
+ mutable_cf_options.memtable_prefix_bloom_size_ratio) *
+ 8u),
+ memtable_huge_page_size(mutable_cf_options.memtable_huge_page_size),
+ memtable_whole_key_filtering(
+ mutable_cf_options.memtable_whole_key_filtering),
+ inplace_update_support(ioptions.inplace_update_support),
+ inplace_update_num_locks(mutable_cf_options.inplace_update_num_locks),
+ inplace_callback(ioptions.inplace_callback),
+ max_successive_merges(mutable_cf_options.max_successive_merges),
+ statistics(ioptions.stats),
+ merge_operator(ioptions.merge_operator.get()),
+ info_log(ioptions.logger),
+ allow_data_in_errors(ioptions.allow_data_in_errors),
+ protection_bytes_per_key(
+ mutable_cf_options.memtable_protection_bytes_per_key) {}
+
+MemTable::MemTable(const InternalKeyComparator& cmp,
+ const ImmutableOptions& ioptions,
+ const MutableCFOptions& mutable_cf_options,
+ WriteBufferManager* write_buffer_manager,
+ SequenceNumber latest_seq, uint32_t column_family_id)
+ : comparator_(cmp),
+ moptions_(ioptions, mutable_cf_options),
+ refs_(0),
+ kArenaBlockSize(OptimizeBlockSize(moptions_.arena_block_size)),
+ mem_tracker_(write_buffer_manager),
+ arena_(moptions_.arena_block_size,
+ (write_buffer_manager != nullptr &&
+ (write_buffer_manager->enabled() ||
+ write_buffer_manager->cost_to_cache()))
+ ? &mem_tracker_
+ : nullptr,
+ mutable_cf_options.memtable_huge_page_size),
+ table_(ioptions.memtable_factory->CreateMemTableRep(
+ comparator_, &arena_, mutable_cf_options.prefix_extractor.get(),
+ ioptions.logger, column_family_id)),
+ range_del_table_(SkipListFactory().CreateMemTableRep(
+ comparator_, &arena_, nullptr /* transform */, ioptions.logger,
+ column_family_id)),
+ is_range_del_table_empty_(true),
+ data_size_(0),
+ num_entries_(0),
+ num_deletes_(0),
+ write_buffer_size_(mutable_cf_options.write_buffer_size),
+ flush_in_progress_(false),
+ flush_completed_(false),
+ file_number_(0),
+ first_seqno_(0),
+ earliest_seqno_(latest_seq),
+ creation_seq_(latest_seq),
+ mem_next_logfile_number_(0),
+ min_prep_log_referenced_(0),
+ locks_(moptions_.inplace_update_support
+ ? moptions_.inplace_update_num_locks
+ : 0),
+ prefix_extractor_(mutable_cf_options.prefix_extractor.get()),
+ flush_state_(FLUSH_NOT_REQUESTED),
+ clock_(ioptions.clock),
+ insert_with_hint_prefix_extractor_(
+ ioptions.memtable_insert_with_hint_prefix_extractor.get()),
+ oldest_key_time_(std::numeric_limits<uint64_t>::max()),
+ atomic_flush_seqno_(kMaxSequenceNumber),
+ approximate_memory_usage_(0) {
+ UpdateFlushState();
+ // something went wrong if we need to flush before inserting anything
+ assert(!ShouldScheduleFlush());
+
+ // use bloom_filter_ for both whole key and prefix bloom filter
+ if ((prefix_extractor_ || moptions_.memtable_whole_key_filtering) &&
+ moptions_.memtable_prefix_bloom_bits > 0) {
+ bloom_filter_.reset(
+ new DynamicBloom(&arena_, moptions_.memtable_prefix_bloom_bits,
+ 6 /* hard coded 6 probes */,
+ moptions_.memtable_huge_page_size, ioptions.logger));
+ }
+ // Initialize cached_range_tombstone_ here since it could
+ // be read before it is constructed in MemTable::Add(), which could also lead
+ // to a data race on the global mutex table backing atomic shared_ptr.
+ auto new_cache = std::make_shared<FragmentedRangeTombstoneListCache>();
+ size_t size = cached_range_tombstone_.Size();
+ for (size_t i = 0; i < size; ++i) {
+ std::shared_ptr<FragmentedRangeTombstoneListCache>* local_cache_ref_ptr =
+ cached_range_tombstone_.AccessAtCore(i);
+ auto new_local_cache_ref = std::make_shared<
+ const std::shared_ptr<FragmentedRangeTombstoneListCache>>(new_cache);
+ std::atomic_store_explicit(
+ local_cache_ref_ptr,
+ std::shared_ptr<FragmentedRangeTombstoneListCache>(new_local_cache_ref,
+ new_cache.get()),
+ std::memory_order_relaxed);
+ }
+}
+
+MemTable::~MemTable() {
+ mem_tracker_.FreeMem();
+ assert(refs_ == 0);
+}
+
+size_t MemTable::ApproximateMemoryUsage() {
+ autovector<size_t> usages = {
+ arena_.ApproximateMemoryUsage(), table_->ApproximateMemoryUsage(),
+ range_del_table_->ApproximateMemoryUsage(),
+ ROCKSDB_NAMESPACE::ApproximateMemoryUsage(insert_hints_)};
+ size_t total_usage = 0;
+ for (size_t usage : usages) {
+ // If usage + total_usage would exceed the maximum size_t value, return that
+ // maximum. The comparison below is written this way to avoid numeric overflow.
+ if (usage >= std::numeric_limits<size_t>::max() - total_usage) {
+ return std::numeric_limits<size_t>::max();
+ }
+ total_usage += usage;
+ }
+ approximate_memory_usage_.store(total_usage, std::memory_order_relaxed);
+ // otherwise, return the actual usage
+ return total_usage;
+}
+
+bool MemTable::ShouldFlushNow() {
+ size_t write_buffer_size = write_buffer_size_.load(std::memory_order_relaxed);
+ // Often we cannot allocate arena blocks that exactly match the buffer size.
+ // Thus we have to decide whether to over-allocate or
+ // under-allocate.
+ // This constant variable can be interpreted as: if we still have more than
+ // "kAllowOverAllocationRatio * kArenaBlockSize" space left, we'd try to over
+ // allocate one more block.
+ const double kAllowOverAllocationRatio = 0.6;
+
+ // If the arena still has room for a new block allocation, we can safely say
+ // it shouldn't flush.
+ auto allocated_memory = table_->ApproximateMemoryUsage() +
+ range_del_table_->ApproximateMemoryUsage() +
+ arena_.MemoryAllocatedBytes();
+
+ approximate_memory_usage_.store(allocated_memory, std::memory_order_relaxed);
+
+ // if we can still allocate one more block without exceeding the
+ // over-allocation ratio, then we should not flush.
+ if (allocated_memory + kArenaBlockSize <
+ write_buffer_size + kArenaBlockSize * kAllowOverAllocationRatio) {
+ return false;
+ }
+
+ // If the user keeps adding entries so that the total exceeds
+ // write_buffer_size, we need to flush earlier even though we still have much
+ // available memory left.
+ if (allocated_memory >
+ write_buffer_size + kArenaBlockSize * kAllowOverAllocationRatio) {
+ return true;
+ }
+
+ // In this code path, Arena has already allocated its "last block", which
+ // means the total allocated memory size is either:
+ //  (1) "moderately" over-allocated (by no more than `0.6 * arena
+ //  block size`). Or,
+ // (2) the allocated memory is less than write buffer size, but we'll stop
+ // here since if we allocate a new arena block, we'll over allocate too much
+ // more (half of the arena block size) memory.
+ //
+ // In either case, to avoid over-allocating, the last block will stop
+ // allocation when its usage reaches a certain ratio, which we carefully
+ // choose "0.75
+ // full" as the stop condition because it addresses the following issue with
+ // great simplicity: What if the next inserted entry's size is
+ // bigger than AllocatedAndUnused()?
+ //
+ // The answer is: if the entry size is also bigger than 0.25 *
+ // kArenaBlockSize, a dedicated block will be allocated for it; otherwise
+ // arena will anyway skip the AllocatedAndUnused() and allocate a new, empty
+ // and regular block. In either case, we *overly* over-allocated.
+ //
+ // Therefore, setting the last block to be at most "0.75 full" avoids both
+ // cases.
+ //
+ // NOTE: the average fraction of wasted space with this approach can be
+ // estimated as "arena block size * 0.25 / write buffer size". Users who
+ // specify a small write buffer size and/or a big arena block size may suffer.
+ return arena_.AllocatedAndUnused() < kArenaBlockSize / 4;
+}
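A worked example of the thresholds above, with assumed sizes rather than RocksDB defaults: a 64 MB write buffer and 8 MB arena blocks, so the slack term kAllowOverAllocationRatio * kArenaBlockSize is 4.8 MB.

constexpr double kMB = 1024.0 * 1024.0;
constexpr double kWriteBuffer = 64 * kMB;     // assumed write_buffer_size
constexpr double kArenaBlock = 8 * kMB;       // assumed kArenaBlockSize
constexpr double kSlack = 0.6 * kArenaBlock;  // kAllowOverAllocationRatio term

// 55 MB allocated: another block still fits under the budget -> no flush.
static_assert(55 * kMB + kArenaBlock < kWriteBuffer + kSlack, "");
// 70 MB allocated: clearly over budget -> flush now.
static_assert(70 * kMB > kWriteBuffer + kSlack, "");
// Anything in between falls through to the "last block is 3/4 full" check,
// i.e. flush once AllocatedAndUnused() < kArenaBlock / 4 (2 MB here).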
+
+void MemTable::UpdateFlushState() {
+ auto state = flush_state_.load(std::memory_order_relaxed);
+ if (state == FLUSH_NOT_REQUESTED && ShouldFlushNow()) {
+ // ignore CAS failure, because that means somebody else requested
+ // a flush
+ flush_state_.compare_exchange_strong(state, FLUSH_REQUESTED,
+ std::memory_order_relaxed,
+ std::memory_order_relaxed);
+ }
+}
+
+void MemTable::UpdateOldestKeyTime() {
+ uint64_t oldest_key_time = oldest_key_time_.load(std::memory_order_relaxed);
+ if (oldest_key_time == std::numeric_limits<uint64_t>::max()) {
+ int64_t current_time = 0;
+ auto s = clock_->GetCurrentTime(&current_time);
+ if (s.ok()) {
+ assert(current_time >= 0);
+ // If fail, the timestamp is already set.
+ oldest_key_time_.compare_exchange_strong(
+ oldest_key_time, static_cast<uint64_t>(current_time),
+ std::memory_order_relaxed, std::memory_order_relaxed);
+ }
+ }
+}
+
+Status MemTable::VerifyEntryChecksum(const char* entry,
+ size_t protection_bytes_per_key,
+ bool allow_data_in_errors) {
+ if (protection_bytes_per_key == 0) {
+ return Status::OK();
+ }
+ uint32_t key_length;
+ const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length);
+ if (key_ptr == nullptr) {
+ return Status::Corruption("Unable to parse internal key length");
+ }
+ if (key_length < 8) {
+ return Status::Corruption("Memtable entry internal key length too short.");
+ }
+ Slice user_key = Slice(key_ptr, key_length - 8);
+
+ const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8);
+ ValueType type;
+ SequenceNumber seq;
+ UnPackSequenceAndType(tag, &seq, &type);
+
+ uint32_t value_length = 0;
+ const char* value_ptr = GetVarint32Ptr(
+ key_ptr + key_length, key_ptr + key_length + 5, &value_length);
+ if (value_ptr == nullptr) {
+ return Status::Corruption("Unable to parse internal key value");
+ }
+ Slice value = Slice(value_ptr, value_length);
+
+ const char* checksum_ptr = value_ptr + value_length;
+ uint64_t expected = ProtectionInfo64()
+ .ProtectKVO(user_key, value, type)
+ .ProtectS(seq)
+ .GetVal();
+ bool match = true;
+ switch (protection_bytes_per_key) {
+ case 1:
+ match = static_cast<uint8_t>(checksum_ptr[0]) ==
+ static_cast<uint8_t>(expected);
+ break;
+ case 2:
+ match = DecodeFixed16(checksum_ptr) == static_cast<uint16_t>(expected);
+ break;
+ case 4:
+ match = DecodeFixed32(checksum_ptr) == static_cast<uint32_t>(expected);
+ break;
+ case 8:
+ match = DecodeFixed64(checksum_ptr) == expected;
+ break;
+ default:
+ assert(false);
+ }
+ if (!match) {
+ std::string msg(
+ "Corrupted memtable entry, per key-value checksum verification "
+ "failed.");
+ if (allow_data_in_errors) {
+ msg.append("Unrecognized value type: " +
+ std::to_string(static_cast<int>(type)) + ". ");
+ msg.append("User key: " + user_key.ToString(/*hex=*/true) + ". ");
+ msg.append("seq: " + std::to_string(seq) + ".");
+ }
+ return Status::Corruption(msg.c_str());
+ }
+ return Status::OK();
+}
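A sketch of the entry layout that Add() writes and VerifyEntryChecksum() parses, reusing only helpers already pulled in by this file (util/coding.h, db/dbformat.h, db/kv_checksum.h); BuildEntry is an illustrative name, not part of the sources.

std::string BuildEntry(const Slice& user_key, const Slice& value,
                       SequenceNumber seq) {
  std::string entry;
  PutVarint32(&entry, static_cast<uint32_t>(user_key.size()) + 8);  // key_size
  entry.append(user_key.data(), user_key.size());                   // user key
  PutFixed64(&entry, PackSequenceAndType(seq, kTypeValue));         // 8-byte tag
  PutVarint32(&entry, static_cast<uint32_t>(value.size()));         // value_size
  entry.append(value.data(), value.size());                         // value
  // Eight protection bytes covering (key, value, type) and the sequence.
  PutFixed64(&entry, ProtectionInfo64()
                         .ProtectKVO(user_key, value, kTypeValue)
                         .ProtectS(seq)
                         .GetVal());
  return entry;
}

// With protection_bytes_per_key == 8, VerifyEntryChecksum(entry.data(), 8)
// is expected to return OK, while flipping any payload byte should yield a
// Corruption status.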
+
+int MemTable::KeyComparator::operator()(const char* prefix_len_key1,
+ const char* prefix_len_key2) const {
+ // Internal keys are encoded as length-prefixed strings.
+ Slice k1 = GetLengthPrefixedSlice(prefix_len_key1);
+ Slice k2 = GetLengthPrefixedSlice(prefix_len_key2);
+ return comparator.CompareKeySeq(k1, k2);
+}
+
+int MemTable::KeyComparator::operator()(
+ const char* prefix_len_key, const KeyComparator::DecodedType& key) const {
+ // Internal keys are encoded as length-prefixed strings.
+ Slice a = GetLengthPrefixedSlice(prefix_len_key);
+ return comparator.CompareKeySeq(a, key);
+}
+
+void MemTableRep::InsertConcurrently(KeyHandle /*handle*/) {
+#ifndef ROCKSDB_LITE
+ throw std::runtime_error("concurrent insert not supported");
+#else
+ abort();
+#endif
+}
+
+Slice MemTableRep::UserKey(const char* key) const {
+ Slice slice = GetLengthPrefixedSlice(key);
+ return Slice(slice.data(), slice.size() - 8);
+}
+
+KeyHandle MemTableRep::Allocate(const size_t len, char** buf) {
+ *buf = allocator_->Allocate(len);
+ return static_cast<KeyHandle>(*buf);
+}
+
+// Encode a suitable internal key target for "target" and return it.
+// Uses *scratch as scratch space, and the returned pointer will point
+// into this scratch space.
+const char* EncodeKey(std::string* scratch, const Slice& target) {
+ scratch->clear();
+ PutVarint32(scratch, static_cast<uint32_t>(target.size()));
+ scratch->append(target.data(), target.size());
+ return scratch->data();
+}
+
+class MemTableIterator : public InternalIterator {
+ public:
+ MemTableIterator(const MemTable& mem, const ReadOptions& read_options,
+ Arena* arena, bool use_range_del_table = false)
+ : bloom_(nullptr),
+ prefix_extractor_(mem.prefix_extractor_),
+ comparator_(mem.comparator_),
+ valid_(false),
+ arena_mode_(arena != nullptr),
+ value_pinned_(
+ !mem.GetImmutableMemTableOptions()->inplace_update_support),
+ protection_bytes_per_key_(mem.moptions_.protection_bytes_per_key),
+ status_(Status::OK()),
+ logger_(mem.moptions_.info_log) {
+ if (use_range_del_table) {
+ iter_ = mem.range_del_table_->GetIterator(arena);
+ } else if (prefix_extractor_ != nullptr && !read_options.total_order_seek &&
+ !read_options.auto_prefix_mode) {
+ // Auto prefix mode is not implemented in memtable yet.
+ bloom_ = mem.bloom_filter_.get();
+ iter_ = mem.table_->GetDynamicPrefixIterator(arena);
+ } else {
+ iter_ = mem.table_->GetIterator(arena);
+ }
+ status_.PermitUncheckedError();
+ }
+ // No copying allowed
+ MemTableIterator(const MemTableIterator&) = delete;
+ void operator=(const MemTableIterator&) = delete;
+
+ ~MemTableIterator() override {
+#ifndef NDEBUG
+ // Assert that the MemTableIterator is never deleted while
+ // Pinning is Enabled.
+ assert(!pinned_iters_mgr_ || !pinned_iters_mgr_->PinningEnabled());
+#endif
+ if (arena_mode_) {
+ iter_->~Iterator();
+ } else {
+ delete iter_;
+ }
+ }
+
+#ifndef NDEBUG
+ void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override {
+ pinned_iters_mgr_ = pinned_iters_mgr;
+ }
+ PinnedIteratorsManager* pinned_iters_mgr_ = nullptr;
+#endif
+
+ bool Valid() const override { return valid_ && status_.ok(); }
+ void Seek(const Slice& k) override {
+ PERF_TIMER_GUARD(seek_on_memtable_time);
+ PERF_COUNTER_ADD(seek_on_memtable_count, 1);
+ if (bloom_) {
+ // iterator should only use prefix bloom filter
+ auto ts_sz = comparator_.comparator.user_comparator()->timestamp_size();
+ Slice user_k_without_ts(ExtractUserKeyAndStripTimestamp(k, ts_sz));
+ if (prefix_extractor_->InDomain(user_k_without_ts)) {
+ if (!bloom_->MayContain(
+ prefix_extractor_->Transform(user_k_without_ts))) {
+ PERF_COUNTER_ADD(bloom_memtable_miss_count, 1);
+ valid_ = false;
+ return;
+ } else {
+ PERF_COUNTER_ADD(bloom_memtable_hit_count, 1);
+ }
+ }
+ }
+ iter_->Seek(k, nullptr);
+ valid_ = iter_->Valid();
+ VerifyEntryChecksum();
+ }
+ void SeekForPrev(const Slice& k) override {
+ PERF_TIMER_GUARD(seek_on_memtable_time);
+ PERF_COUNTER_ADD(seek_on_memtable_count, 1);
+ if (bloom_) {
+ auto ts_sz = comparator_.comparator.user_comparator()->timestamp_size();
+ Slice user_k_without_ts(ExtractUserKeyAndStripTimestamp(k, ts_sz));
+ if (prefix_extractor_->InDomain(user_k_without_ts)) {
+ if (!bloom_->MayContain(
+ prefix_extractor_->Transform(user_k_without_ts))) {
+ PERF_COUNTER_ADD(bloom_memtable_miss_count, 1);
+ valid_ = false;
+ return;
+ } else {
+ PERF_COUNTER_ADD(bloom_memtable_hit_count, 1);
+ }
+ }
+ }
+ iter_->Seek(k, nullptr);
+ valid_ = iter_->Valid();
+ VerifyEntryChecksum();
+ if (!Valid() && status().ok()) {
+ SeekToLast();
+ }
+ while (Valid() && comparator_.comparator.Compare(k, key()) < 0) {
+ Prev();
+ }
+ }
+ void SeekToFirst() override {
+ iter_->SeekToFirst();
+ valid_ = iter_->Valid();
+ VerifyEntryChecksum();
+ }
+ void SeekToLast() override {
+ iter_->SeekToLast();
+ valid_ = iter_->Valid();
+ VerifyEntryChecksum();
+ }
+ void Next() override {
+ PERF_COUNTER_ADD(next_on_memtable_count, 1);
+ assert(Valid());
+ iter_->Next();
+ TEST_SYNC_POINT_CALLBACK("MemTableIterator::Next:0", iter_);
+ valid_ = iter_->Valid();
+ VerifyEntryChecksum();
+ }
+ bool NextAndGetResult(IterateResult* result) override {
+ Next();
+ bool is_valid = Valid();
+ if (is_valid) {
+ result->key = key();
+ result->bound_check_result = IterBoundCheck::kUnknown;
+ result->value_prepared = true;
+ }
+ return is_valid;
+ }
+ void Prev() override {
+ PERF_COUNTER_ADD(prev_on_memtable_count, 1);
+ assert(Valid());
+ iter_->Prev();
+ valid_ = iter_->Valid();
+ VerifyEntryChecksum();
+ }
+ Slice key() const override {
+ assert(Valid());
+ return GetLengthPrefixedSlice(iter_->key());
+ }
+ Slice value() const override {
+ assert(Valid());
+ Slice key_slice = GetLengthPrefixedSlice(iter_->key());
+ return GetLengthPrefixedSlice(key_slice.data() + key_slice.size());
+ }
+
+ Status status() const override { return status_; }
+
+ bool IsKeyPinned() const override {
+ // memtable data is always pinned
+ return true;
+ }
+
+ bool IsValuePinned() const override {
+ // memtable value is always pinned, except if we allow inplace update.
+ return value_pinned_;
+ }
+
+ private:
+ DynamicBloom* bloom_;
+ const SliceTransform* const prefix_extractor_;
+ const MemTable::KeyComparator comparator_;
+ MemTableRep::Iterator* iter_;
+ bool valid_;
+ bool arena_mode_;
+ bool value_pinned_;
+ size_t protection_bytes_per_key_;
+ Status status_;
+ Logger* logger_;
+
+ void VerifyEntryChecksum() {
+ if (protection_bytes_per_key_ > 0 && Valid()) {
+ status_ = MemTable::VerifyEntryChecksum(iter_->key(),
+ protection_bytes_per_key_);
+ if (!status_.ok()) {
+ ROCKS_LOG_ERROR(logger_, "In MemtableIterator: %s", status_.getState());
+ }
+ }
+ }
+};
+
+InternalIterator* MemTable::NewIterator(const ReadOptions& read_options,
+ Arena* arena) {
+ assert(arena != nullptr);
+ auto mem = arena->AllocateAligned(sizeof(MemTableIterator));
+ return new (mem) MemTableIterator(*this, read_options, arena);
+}
+
+FragmentedRangeTombstoneIterator* MemTable::NewRangeTombstoneIterator(
+ const ReadOptions& read_options, SequenceNumber read_seq,
+ bool immutable_memtable) {
+ if (read_options.ignore_range_deletions ||
+ is_range_del_table_empty_.load(std::memory_order_relaxed)) {
+ return nullptr;
+ }
+ return NewRangeTombstoneIteratorInternal(read_options, read_seq,
+ immutable_memtable);
+}
+
+FragmentedRangeTombstoneIterator* MemTable::NewRangeTombstoneIteratorInternal(
+ const ReadOptions& read_options, SequenceNumber read_seq,
+ bool immutable_memtable) {
+ if (immutable_memtable) {
+ // Note that caller should already have verified that
+ // !is_range_del_table_empty_
+ assert(IsFragmentedRangeTombstonesConstructed());
+ return new FragmentedRangeTombstoneIterator(
+ fragmented_range_tombstone_list_.get(), comparator_.comparator,
+ read_seq, read_options.timestamp);
+ }
+
+ // takes current cache
+ std::shared_ptr<FragmentedRangeTombstoneListCache> cache =
+ std::atomic_load_explicit(cached_range_tombstone_.Access(),
+ std::memory_order_relaxed);
+ // construct fragmented tombstone list if necessary
+ if (!cache->initialized.load(std::memory_order_acquire)) {
+ cache->reader_mutex.lock();
+ if (!cache->tombstones) {
+ auto* unfragmented_iter =
+ new MemTableIterator(*this, read_options, nullptr /* arena */,
+ true /* use_range_del_table */);
+ cache->tombstones.reset(new FragmentedRangeTombstoneList(
+ std::unique_ptr<InternalIterator>(unfragmented_iter),
+ comparator_.comparator));
+ cache->initialized.store(true, std::memory_order_release);
+ }
+ cache->reader_mutex.unlock();
+ }
+
+ auto* fragmented_iter = new FragmentedRangeTombstoneIterator(
+ cache, comparator_.comparator, read_seq, read_options.timestamp);
+ return fragmented_iter;
+}
+
+void MemTable::ConstructFragmentedRangeTombstones() {
+ assert(!IsFragmentedRangeTombstonesConstructed(false));
+ // There should be no concurrent Construction
+ if (!is_range_del_table_empty_.load(std::memory_order_relaxed)) {
+ auto* unfragmented_iter =
+ new MemTableIterator(*this, ReadOptions(), nullptr /* arena */,
+ true /* use_range_del_table */);
+
+ fragmented_range_tombstone_list_ =
+ std::make_unique<FragmentedRangeTombstoneList>(
+ std::unique_ptr<InternalIterator>(unfragmented_iter),
+ comparator_.comparator);
+ }
+}
+
+port::RWMutex* MemTable::GetLock(const Slice& key) {
+ return &locks_[GetSliceRangedNPHash(key, locks_.size())];
+}
+
+MemTable::MemTableStats MemTable::ApproximateStats(const Slice& start_ikey,
+ const Slice& end_ikey) {
+ uint64_t entry_count = table_->ApproximateNumEntries(start_ikey, end_ikey);
+ entry_count += range_del_table_->ApproximateNumEntries(start_ikey, end_ikey);
+ if (entry_count == 0) {
+ return {0, 0};
+ }
+ uint64_t n = num_entries_.load(std::memory_order_relaxed);
+ if (n == 0) {
+ return {0, 0};
+ }
+ if (entry_count > n) {
+ // (range_del_)table_->ApproximateNumEntries() is just an estimate so it can
+ // be larger than actual entries we have. Cap it to entries we have to limit
+ // the inaccuracy.
+ entry_count = n;
+ }
+ uint64_t data_size = data_size_.load(std::memory_order_relaxed);
+ return {entry_count * (data_size / n), entry_count};
+}
+
+Status MemTable::VerifyEncodedEntry(Slice encoded,
+ const ProtectionInfoKVOS64& kv_prot_info) {
+ uint32_t ikey_len = 0;
+ if (!GetVarint32(&encoded, &ikey_len)) {
+ return Status::Corruption("Unable to parse internal key length");
+ }
+ size_t ts_sz = GetInternalKeyComparator().user_comparator()->timestamp_size();
+ if (ikey_len < 8 + ts_sz) {
+ return Status::Corruption("Internal key length too short");
+ }
+ if (ikey_len > encoded.size()) {
+ return Status::Corruption("Internal key length too long");
+ }
+ uint32_t value_len = 0;
+ const size_t user_key_len = ikey_len - 8;
+ Slice key(encoded.data(), user_key_len);
+ encoded.remove_prefix(user_key_len);
+
+ uint64_t packed = DecodeFixed64(encoded.data());
+ ValueType value_type = kMaxValue;
+ SequenceNumber sequence_number = kMaxSequenceNumber;
+ UnPackSequenceAndType(packed, &sequence_number, &value_type);
+ encoded.remove_prefix(8);
+
+ if (!GetVarint32(&encoded, &value_len)) {
+ return Status::Corruption("Unable to parse value length");
+ }
+ if (value_len < encoded.size()) {
+ return Status::Corruption("Value length too short");
+ }
+ if (value_len > encoded.size()) {
+ return Status::Corruption("Value length too long");
+ }
+ Slice value(encoded.data(), value_len);
+
+ return kv_prot_info.StripS(sequence_number)
+ .StripKVO(key, value, value_type)
+ .GetStatus();
+}
+
+void MemTable::UpdateEntryChecksum(const ProtectionInfoKVOS64* kv_prot_info,
+ const Slice& key, const Slice& value,
+ ValueType type, SequenceNumber s,
+ char* checksum_ptr) {
+ if (moptions_.protection_bytes_per_key == 0) {
+ return;
+ }
+
+ uint64_t checksum = 0;
+ if (kv_prot_info == nullptr) {
+ checksum =
+ ProtectionInfo64().ProtectKVO(key, value, type).ProtectS(s).GetVal();
+ } else {
+ checksum = kv_prot_info->GetVal();
+ }
+ switch (moptions_.protection_bytes_per_key) {
+ case 1:
+ checksum_ptr[0] = static_cast<uint8_t>(checksum);
+ break;
+ case 2:
+ EncodeFixed16(checksum_ptr, static_cast<uint16_t>(checksum));
+ break;
+ case 4:
+ EncodeFixed32(checksum_ptr, static_cast<uint32_t>(checksum));
+ break;
+ case 8:
+ EncodeFixed64(checksum_ptr, checksum);
+ break;
+ default:
+ assert(false);
+ }
+}
+
+Status MemTable::Add(SequenceNumber s, ValueType type,
+ const Slice& key, /* user key */
+ const Slice& value,
+ const ProtectionInfoKVOS64* kv_prot_info,
+ bool allow_concurrent,
+ MemTablePostProcessInfo* post_process_info, void** hint) {
+ // Format of an entry is concatenation of:
+ // key_size : varint32 of internal_key.size()
+ // key bytes : char[internal_key.size()]
+ // value_size : varint32 of value.size()
+ // value bytes : char[value.size()]
+ // checksum : char[moptions_.protection_bytes_per_key]
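+ //
+ // Illustrative example (not from the original comment): with
+ // protection_bytes_per_key == 0, user key "abc", value "v", sequence
+ // number 100 and type kTypeValue, the encoded entry would be
+ //   varint32(11) | "abc" | fixed64((100 << 8) | kTypeValue) | varint32(1) | "v"
+ // for an encoded_len of 1 + 11 + 1 + 1 = 14 bytes.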
+ uint32_t key_size = static_cast<uint32_t>(key.size());
+ uint32_t val_size = static_cast<uint32_t>(value.size());
+ uint32_t internal_key_size = key_size + 8;
+ const uint32_t encoded_len = VarintLength(internal_key_size) +
+ internal_key_size + VarintLength(val_size) +
+ val_size + moptions_.protection_bytes_per_key;
+ char* buf = nullptr;
+ std::unique_ptr<MemTableRep>& table =
+ type == kTypeRangeDeletion ? range_del_table_ : table_;
+ KeyHandle handle = table->Allocate(encoded_len, &buf);
+
+ char* p = EncodeVarint32(buf, internal_key_size);
+ memcpy(p, key.data(), key_size);
+ Slice key_slice(p, key_size);
+ p += key_size;
+ uint64_t packed = PackSequenceAndType(s, type);
+ EncodeFixed64(p, packed);
+ p += 8;
+ p = EncodeVarint32(p, val_size);
+ memcpy(p, value.data(), val_size);
+ assert((unsigned)(p + val_size - buf + moptions_.protection_bytes_per_key) ==
+ (unsigned)encoded_len);
+
+ UpdateEntryChecksum(kv_prot_info, key, value, type, s,
+ buf + encoded_len - moptions_.protection_bytes_per_key);
+ Slice encoded(buf, encoded_len - moptions_.protection_bytes_per_key);
+ if (kv_prot_info != nullptr) {
+ TEST_SYNC_POINT_CALLBACK("MemTable::Add:Encoded", &encoded);
+ Status status = VerifyEncodedEntry(encoded, *kv_prot_info);
+ if (!status.ok()) {
+ return status;
+ }
+ }
+
+ size_t ts_sz = GetInternalKeyComparator().user_comparator()->timestamp_size();
+ Slice key_without_ts = StripTimestampFromUserKey(key, ts_sz);
+
+ if (!allow_concurrent) {
+ // Extract prefix for insert with hint.
+ if (insert_with_hint_prefix_extractor_ != nullptr &&
+ insert_with_hint_prefix_extractor_->InDomain(key_slice)) {
+ Slice prefix = insert_with_hint_prefix_extractor_->Transform(key_slice);
+ bool res = table->InsertKeyWithHint(handle, &insert_hints_[prefix]);
+ if (UNLIKELY(!res)) {
+ return Status::TryAgain("key+seq exists");
+ }
+ } else {
+ bool res = table->InsertKey(handle);
+ if (UNLIKELY(!res)) {
+ return Status::TryAgain("key+seq exists");
+ }
+ }
+
+ // This is a bit ugly, but it avoids locked read-modify-write instructions
+ // when incrementing an atomic.
+ num_entries_.store(num_entries_.load(std::memory_order_relaxed) + 1,
+ std::memory_order_relaxed);
+ data_size_.store(data_size_.load(std::memory_order_relaxed) + encoded_len,
+ std::memory_order_relaxed);
+ if (type == kTypeDeletion || type == kTypeSingleDeletion ||
+ type == kTypeDeletionWithTimestamp) {
+ num_deletes_.store(num_deletes_.load(std::memory_order_relaxed) + 1,
+ std::memory_order_relaxed);
+ }
+
+ if (bloom_filter_ && prefix_extractor_ &&
+ prefix_extractor_->InDomain(key_without_ts)) {
+ bloom_filter_->Add(prefix_extractor_->Transform(key_without_ts));
+ }
+ if (bloom_filter_ && moptions_.memtable_whole_key_filtering) {
+ bloom_filter_->Add(key_without_ts);
+ }
+
+ // The first sequence number inserted into the memtable
+ assert(first_seqno_ == 0 || s >= first_seqno_);
+ if (first_seqno_ == 0) {
+ first_seqno_.store(s, std::memory_order_relaxed);
+
+ if (earliest_seqno_ == kMaxSequenceNumber) {
+ earliest_seqno_.store(GetFirstSequenceNumber(),
+ std::memory_order_relaxed);
+ }
+ assert(first_seqno_.load() >= earliest_seqno_.load());
+ }
+ assert(post_process_info == nullptr);
+ UpdateFlushState();
+ } else {
+ bool res = (hint == nullptr)
+ ? table->InsertKeyConcurrently(handle)
+ : table->InsertKeyWithHintConcurrently(handle, hint);
+ if (UNLIKELY(!res)) {
+ return Status::TryAgain("key+seq exists");
+ }
+
+ assert(post_process_info != nullptr);
+ post_process_info->num_entries++;
+ post_process_info->data_size += encoded_len;
+ if (type == kTypeDeletion) {
+ post_process_info->num_deletes++;
+ }
+
+ if (bloom_filter_ && prefix_extractor_ &&
+ prefix_extractor_->InDomain(key_without_ts)) {
+ bloom_filter_->AddConcurrently(
+ prefix_extractor_->Transform(key_without_ts));
+ }
+ if (bloom_filter_ && moptions_.memtable_whole_key_filtering) {
+ bloom_filter_->AddConcurrently(key_without_ts);
+ }
+
+ // atomically update first_seqno_ and earliest_seqno_.
+ uint64_t cur_seq_num = first_seqno_.load(std::memory_order_relaxed);
+ while ((cur_seq_num == 0 || s < cur_seq_num) &&
+ !first_seqno_.compare_exchange_weak(cur_seq_num, s)) {
+ }
+ uint64_t cur_earliest_seqno =
+ earliest_seqno_.load(std::memory_order_relaxed);
+ while (
+ (cur_earliest_seqno == kMaxSequenceNumber || s < cur_earliest_seqno) &&
+ !earliest_seqno_.compare_exchange_weak(cur_earliest_seqno, s)) {
+ }
+ }
+ if (type == kTypeRangeDeletion) {
+ auto new_cache = std::make_shared<FragmentedRangeTombstoneListCache>();
+ size_t size = cached_range_tombstone_.Size();
+ if (allow_concurrent) {
+ range_del_mutex_.lock();
+ }
+ for (size_t i = 0; i < size; ++i) {
+ std::shared_ptr<FragmentedRangeTombstoneListCache>* local_cache_ref_ptr =
+ cached_range_tombstone_.AccessAtCore(i);
+ auto new_local_cache_ref = std::make_shared<
+ const std::shared_ptr<FragmentedRangeTombstoneListCache>>(new_cache);
+ // It is okay for a reader to load the old cache during invalidation, as
+ // the new sequence number has not been published yet.
+ // Each core holds a shared_ptr to a shared_ptr to the cached fragmented
+ // range tombstones, so the reference count is maintained locally per core
+ // via the per-core shared_ptr.
+ std::atomic_store_explicit(
+ local_cache_ref_ptr,
+ std::shared_ptr<FragmentedRangeTombstoneListCache>(
+ new_local_cache_ref, new_cache.get()),
+ std::memory_order_relaxed);
+ }
+ if (allow_concurrent) {
+ range_del_mutex_.unlock();
+ }
+ is_range_del_table_empty_.store(false, std::memory_order_relaxed);
+ }
+ UpdateOldestKeyTime();
+
+ TEST_SYNC_POINT_CALLBACK("MemTable::Add:BeforeReturn:Encoded", &encoded);
+ return Status::OK();
+}
+
+// Callback from MemTable::Get()
+namespace {
+
+struct Saver {
+ Status* status;
+ const LookupKey* key;
+ bool* found_final_value; // Is value set correctly? Used by KeyMayExist
+ bool* merge_in_progress;
+ std::string* value;
+ PinnableWideColumns* columns;
+ SequenceNumber seq;
+ std::string* timestamp;
+ const MergeOperator* merge_operator;
+ // The merge operations encountered so far.
+ MergeContext* merge_context;
+ SequenceNumber max_covering_tombstone_seq;
+ MemTable* mem;
+ Logger* logger;
+ Statistics* statistics;
+ bool inplace_update_support;
+ bool do_merge;
+ SystemClock* clock;
+
+ ReadCallback* callback_;
+ bool* is_blob_index;
+ bool allow_data_in_errors;
+ size_t protection_bytes_per_key;
+ bool CheckCallback(SequenceNumber _seq) {
+ if (callback_) {
+ return callback_->IsVisible(_seq);
+ }
+ return true;
+ }
+};
+} // anonymous namespace
+
+static bool SaveValue(void* arg, const char* entry) {
+ TEST_SYNC_POINT_CALLBACK("Memtable::SaveValue:Begin:entry", &entry);
+ Saver* s = reinterpret_cast<Saver*>(arg);
+ assert(s != nullptr);
+ assert(!s->value || !s->columns);
+
+ if (s->protection_bytes_per_key > 0) {
+ *(s->status) = MemTable::VerifyEntryChecksum(
+ entry, s->protection_bytes_per_key, s->allow_data_in_errors);
+ if (!s->status->ok()) {
+ ROCKS_LOG_ERROR(s->logger, "In SaveValue: %s", s->status->getState());
+ // Memtable entry corrupted
+ return false;
+ }
+ }
+
+ MergeContext* merge_context = s->merge_context;
+ SequenceNumber max_covering_tombstone_seq = s->max_covering_tombstone_seq;
+ const MergeOperator* merge_operator = s->merge_operator;
+
+ assert(merge_context != nullptr);
+
+ // Refer to comments under MemTable::Add() for entry format.
+ // Check that it belongs to same user key.
+ uint32_t key_length = 0;
+ const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length);
+ assert(key_length >= 8);
+ Slice user_key_slice = Slice(key_ptr, key_length - 8);
+ const Comparator* user_comparator =
+ s->mem->GetInternalKeyComparator().user_comparator();
+ size_t ts_sz = user_comparator->timestamp_size();
+ if (ts_sz && s->timestamp && max_covering_tombstone_seq > 0) {
+ // timestamp should already be set to range tombstone timestamp
+ assert(s->timestamp->size() == ts_sz);
+ }
+ if (user_comparator->EqualWithoutTimestamp(user_key_slice,
+ s->key->user_key())) {
+ // Correct user key
+ const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8);
+ ValueType type;
+ SequenceNumber seq;
+ UnPackSequenceAndType(tag, &seq, &type);
+ // If the value is not in the snapshot, skip it
+ if (!s->CheckCallback(seq)) {
+ return true; // to continue to the next seq
+ }
+
+ if (s->seq == kMaxSequenceNumber) {
+ s->seq = seq;
+ if (s->seq > max_covering_tombstone_seq) {
+ if (ts_sz && s->timestamp != nullptr) {
+ // `timestamp` was set to range tombstone's timestamp before
+ // `SaveValue` is ever called. This key has a higher sequence number
+ // than range tombstone, and is the key with the highest seqno across
+ // all keys with this user_key, so we update timestamp here.
+ Slice ts = ExtractTimestampFromUserKey(user_key_slice, ts_sz);
+ s->timestamp->assign(ts.data(), ts_sz);
+ }
+ } else {
+ s->seq = max_covering_tombstone_seq;
+ }
+ }
+
+ if (ts_sz > 0 && s->timestamp != nullptr) {
+ if (!s->timestamp->empty()) {
+ assert(ts_sz == s->timestamp->size());
+ }
+ // TODO optimize for smaller size ts
+ const std::string kMaxTs(ts_sz, '\xff');
+ if (s->timestamp->empty() ||
+ user_comparator->CompareTimestamp(*(s->timestamp), kMaxTs) == 0) {
+ Slice ts = ExtractTimestampFromUserKey(user_key_slice, ts_sz);
+ s->timestamp->assign(ts.data(), ts_sz);
+ }
+ }
+
+ if ((type == kTypeValue || type == kTypeMerge || type == kTypeBlobIndex ||
+ type == kTypeWideColumnEntity || type == kTypeDeletion ||
+ type == kTypeSingleDeletion || type == kTypeDeletionWithTimestamp) &&
+ max_covering_tombstone_seq > seq) {
+ type = kTypeRangeDeletion;
+ }
+ switch (type) {
+ case kTypeBlobIndex: {
+ if (!s->do_merge) {
+ *(s->status) = Status::NotSupported(
+ "GetMergeOperands not supported by stacked BlobDB");
+ *(s->found_final_value) = true;
+ return false;
+ }
+
+ if (*(s->merge_in_progress)) {
+ *(s->status) = Status::NotSupported(
+ "Merge operator not supported by stacked BlobDB");
+ *(s->found_final_value) = true;
+ return false;
+ }
+
+ if (s->is_blob_index == nullptr) {
+ ROCKS_LOG_ERROR(s->logger, "Encountered unexpected blob index.");
+ *(s->status) = Status::NotSupported(
+ "Encountered unexpected blob index. Please open DB with "
+ "ROCKSDB_NAMESPACE::blob_db::BlobDB.");
+ *(s->found_final_value) = true;
+ return false;
+ }
+
+ if (s->inplace_update_support) {
+ s->mem->GetLock(s->key->user_key())->ReadLock();
+ }
+
+ Slice v = GetLengthPrefixedSlice(key_ptr + key_length);
+
+ *(s->status) = Status::OK();
+
+ if (s->value) {
+ s->value->assign(v.data(), v.size());
+ } else if (s->columns) {
+ s->columns->SetPlainValue(v);
+ }
+
+ if (s->inplace_update_support) {
+ s->mem->GetLock(s->key->user_key())->ReadUnlock();
+ }
+
+ *(s->found_final_value) = true;
+ *(s->is_blob_index) = true;
+
+ return false;
+ }
+ case kTypeValue: {
+ if (s->inplace_update_support) {
+ s->mem->GetLock(s->key->user_key())->ReadLock();
+ }
+
+ Slice v = GetLengthPrefixedSlice(key_ptr + key_length);
+
+ *(s->status) = Status::OK();
+
+ if (!s->do_merge) {
+ // Preserve the value with the goal of returning it as part of
+ // raw merge operands to the user
+ // TODO(yanqin) update MergeContext so that timestamp information
+ // can also be retained.
+
+ merge_context->PushOperand(
+ v, s->inplace_update_support == false /* operand_pinned */);
+ } else if (*(s->merge_in_progress)) {
+ assert(s->do_merge);
+
+ if (s->value || s->columns) {
+ std::string result;
+ *(s->status) = MergeHelper::TimedFullMerge(
+ merge_operator, s->key->user_key(), &v,
+ merge_context->GetOperands(), &result, s->logger, s->statistics,
+ s->clock, /* result_operand */ nullptr,
+ /* update_num_ops_stats */ true);
+
+ if (s->status->ok()) {
+ if (s->value) {
+ *(s->value) = std::move(result);
+ } else {
+ assert(s->columns);
+ s->columns->SetPlainValue(result);
+ }
+ }
+ }
+ } else if (s->value) {
+ s->value->assign(v.data(), v.size());
+ } else if (s->columns) {
+ s->columns->SetPlainValue(v);
+ }
+
+ if (s->inplace_update_support) {
+ s->mem->GetLock(s->key->user_key())->ReadUnlock();
+ }
+
+ *(s->found_final_value) = true;
+
+ if (s->is_blob_index != nullptr) {
+ *(s->is_blob_index) = false;
+ }
+
+ return false;
+ }
+ case kTypeWideColumnEntity: {
+ if (s->inplace_update_support) {
+ s->mem->GetLock(s->key->user_key())->ReadLock();
+ }
+
+ Slice v = GetLengthPrefixedSlice(key_ptr + key_length);
+
+ *(s->status) = Status::OK();
+
+ if (!s->do_merge) {
+ // Preserve the value with the goal of returning it as part of
+ // raw merge operands to the user
+
+ Slice value_of_default;
+ *(s->status) = WideColumnSerialization::GetValueOfDefaultColumn(
+ v, value_of_default);
+
+ if (s->status->ok()) {
+ merge_context->PushOperand(
+ value_of_default,
+ s->inplace_update_support == false /* operand_pinned */);
+ }
+ } else if (*(s->merge_in_progress)) {
+ assert(s->do_merge);
+
+ if (s->value) {
+ Slice value_of_default;
+ *(s->status) = WideColumnSerialization::GetValueOfDefaultColumn(
+ v, value_of_default);
+ if (s->status->ok()) {
+ *(s->status) = MergeHelper::TimedFullMerge(
+ merge_operator, s->key->user_key(), &value_of_default,
+ merge_context->GetOperands(), s->value, s->logger,
+ s->statistics, s->clock, /* result_operand */ nullptr,
+ /* update_num_ops_stats */ true);
+ }
+ } else if (s->columns) {
+ std::string result;
+ *(s->status) = MergeHelper::TimedFullMergeWithEntity(
+ merge_operator, s->key->user_key(), v,
+ merge_context->GetOperands(), &result, s->logger, s->statistics,
+ s->clock, /* update_num_ops_stats */ true);
+
+ if (s->status->ok()) {
+ *(s->status) = s->columns->SetWideColumnValue(result);
+ }
+ }
+ } else if (s->value) {
+ Slice value_of_default;
+ *(s->status) = WideColumnSerialization::GetValueOfDefaultColumn(
+ v, value_of_default);
+ if (s->status->ok()) {
+ s->value->assign(value_of_default.data(), value_of_default.size());
+ }
+ } else if (s->columns) {
+ *(s->status) = s->columns->SetWideColumnValue(v);
+ }
+
+ if (s->inplace_update_support) {
+ s->mem->GetLock(s->key->user_key())->ReadUnlock();
+ }
+
+ *(s->found_final_value) = true;
+
+ if (s->is_blob_index != nullptr) {
+ *(s->is_blob_index) = false;
+ }
+
+ return false;
+ }
+ case kTypeDeletion:
+ case kTypeDeletionWithTimestamp:
+ case kTypeSingleDeletion:
+ case kTypeRangeDeletion: {
+ if (*(s->merge_in_progress)) {
+ if (s->value || s->columns) {
+ std::string result;
+ *(s->status) = MergeHelper::TimedFullMerge(
+ merge_operator, s->key->user_key(), nullptr,
+ merge_context->GetOperands(), &result, s->logger, s->statistics,
+ s->clock, /* result_operand */ nullptr,
+ /* update_num_ops_stats */ true);
+
+ if (s->status->ok()) {
+ if (s->value) {
+ *(s->value) = std::move(result);
+ } else {
+ assert(s->columns);
+ s->columns->SetPlainValue(result);
+ }
+ }
+ }
+ } else {
+ *(s->status) = Status::NotFound();
+ }
+ *(s->found_final_value) = true;
+ return false;
+ }
+ case kTypeMerge: {
+ if (!merge_operator) {
+ *(s->status) = Status::InvalidArgument(
+ "merge_operator is not properly initialized.");
+ // Normally we continue the loop (return true) when we see a merge
+ // operand. But in case of an error, we should stop the loop
+ // immediately and pretend we have found the value to stop further
+ // seek. Otherwise, the later call will override this error status.
+ *(s->found_final_value) = true;
+ return false;
+ }
+ Slice v = GetLengthPrefixedSlice(key_ptr + key_length);
+ *(s->merge_in_progress) = true;
+ merge_context->PushOperand(
+ v, s->inplace_update_support == false /* operand_pinned */);
+ if (s->do_merge && merge_operator->ShouldMerge(
+ merge_context->GetOperandsDirectionBackward())) {
+ if (s->value || s->columns) {
+ std::string result;
+ *(s->status) = MergeHelper::TimedFullMerge(
+ merge_operator, s->key->user_key(), nullptr,
+ merge_context->GetOperands(), &result, s->logger, s->statistics,
+ s->clock, /* result_operand */ nullptr,
+ /* update_num_ops_stats */ true);
+
+ if (s->status->ok()) {
+ if (s->value) {
+ *(s->value) = std::move(result);
+ } else {
+ assert(s->columns);
+ s->columns->SetPlainValue(result);
+ }
+ }
+ }
+
+ *(s->found_final_value) = true;
+ return false;
+ }
+ return true;
+ }
+ default: {
+ std::string msg("Corrupted value not expected.");
+ if (s->allow_data_in_errors) {
+ msg.append("Unrecognized value type: " +
+ std::to_string(static_cast<int>(type)) + ". ");
+ msg.append("User key: " + user_key_slice.ToString(/*hex=*/true) +
+ ". ");
+ msg.append("seq: " + std::to_string(seq) + ".");
+ }
+ *(s->status) = Status::Corruption(msg.c_str());
+ return false;
+ }
+ }
+ }
+
+ // s->status could be Corruption, MergeInProgress, or NotFound.
+ return false;
+}
+
+bool MemTable::Get(const LookupKey& key, std::string* value,
+ PinnableWideColumns* columns, std::string* timestamp,
+ Status* s, MergeContext* merge_context,
+ SequenceNumber* max_covering_tombstone_seq,
+ SequenceNumber* seq, const ReadOptions& read_opts,
+ bool immutable_memtable, ReadCallback* callback,
+ bool* is_blob_index, bool do_merge) {
+ // The sequence number is updated synchronously in version_set.h
+ if (IsEmpty()) {
+ // Avoid recording stats, for speed.
+ return false;
+ }
+ PERF_TIMER_GUARD(get_from_memtable_time);
+
+ std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter(
+ NewRangeTombstoneIterator(read_opts,
+ GetInternalKeySeqno(key.internal_key()),
+ immutable_memtable));
+ if (range_del_iter != nullptr) {
+ SequenceNumber covering_seq =
+ range_del_iter->MaxCoveringTombstoneSeqnum(key.user_key());
+ if (covering_seq > *max_covering_tombstone_seq) {
+ *max_covering_tombstone_seq = covering_seq;
+ if (timestamp) {
+ // Will be overwritten in SaveValue() if there is a point key with
+ // a higher seqno.
+ timestamp->assign(range_del_iter->timestamp().data(),
+ range_del_iter->timestamp().size());
+ }
+ }
+ }
+
+ bool found_final_value = false;
+ bool merge_in_progress = s->IsMergeInProgress();
+ bool may_contain = true;
+ size_t ts_sz = GetInternalKeyComparator().user_comparator()->timestamp_size();
+ Slice user_key_without_ts = StripTimestampFromUserKey(key.user_key(), ts_sz);
+ bool bloom_checked = false;
+ if (bloom_filter_) {
+ // when both memtable_whole_key_filtering and prefix_extractor_ are set,
+ // only do whole key filtering for Get() to save CPU
+ if (moptions_.memtable_whole_key_filtering) {
+ may_contain = bloom_filter_->MayContain(user_key_without_ts);
+ bloom_checked = true;
+ } else {
+ assert(prefix_extractor_);
+ if (prefix_extractor_->InDomain(user_key_without_ts)) {
+ may_contain = bloom_filter_->MayContain(
+ prefix_extractor_->Transform(user_key_without_ts));
+ bloom_checked = true;
+ }
+ }
+ }
+
+ if (bloom_filter_ && !may_contain) {
+ // The memtable Bloom filter said the key does not exist; skip the lookup.
+ PERF_COUNTER_ADD(bloom_memtable_miss_count, 1);
+ *seq = kMaxSequenceNumber;
+ } else {
+ if (bloom_checked) {
+ PERF_COUNTER_ADD(bloom_memtable_hit_count, 1);
+ }
+ GetFromTable(key, *max_covering_tombstone_seq, do_merge, callback,
+ is_blob_index, value, columns, timestamp, s, merge_context,
+ seq, &found_final_value, &merge_in_progress);
+ }
+
+ // No change to value, since we have not yet found a Put/Delete
+ // Propagate corruption error
+ if (!found_final_value && merge_in_progress && !s->IsCorruption()) {
+ *s = Status::MergeInProgress();
+ }
+ PERF_COUNTER_ADD(get_from_memtable_count, 1);
+ return found_final_value;
+}
+
+void MemTable::GetFromTable(const LookupKey& key,
+ SequenceNumber max_covering_tombstone_seq,
+ bool do_merge, ReadCallback* callback,
+ bool* is_blob_index, std::string* value,
+ PinnableWideColumns* columns,
+ std::string* timestamp, Status* s,
+ MergeContext* merge_context, SequenceNumber* seq,
+ bool* found_final_value, bool* merge_in_progress) {
+ Saver saver;
+ saver.status = s;
+ saver.found_final_value = found_final_value;
+ saver.merge_in_progress = merge_in_progress;
+ saver.key = &key;
+ saver.value = value;
+ saver.columns = columns;
+ saver.timestamp = timestamp;
+ saver.seq = kMaxSequenceNumber;
+ saver.mem = this;
+ saver.merge_context = merge_context;
+ saver.max_covering_tombstone_seq = max_covering_tombstone_seq;
+ saver.merge_operator = moptions_.merge_operator;
+ saver.logger = moptions_.info_log;
+ saver.inplace_update_support = moptions_.inplace_update_support;
+ saver.statistics = moptions_.statistics;
+ saver.clock = clock_;
+ saver.callback_ = callback;
+ saver.is_blob_index = is_blob_index;
+ saver.do_merge = do_merge;
+ saver.allow_data_in_errors = moptions_.allow_data_in_errors;
+ saver.protection_bytes_per_key = moptions_.protection_bytes_per_key;
+ table_->Get(key, &saver, SaveValue);
+ *seq = saver.seq;
+}
+
+void MemTable::MultiGet(const ReadOptions& read_options, MultiGetRange* range,
+ ReadCallback* callback, bool immutable_memtable) {
+ // The sequence number is updated synchronously in version_set.h
+ if (IsEmpty()) {
+ // Avoid recording stats, for speed.
+ return;
+ }
+ PERF_TIMER_GUARD(get_from_memtable_time);
+
+ // For now, memtable Bloom filter is effectively disabled if there are any
+ // range tombstones. This is the simplest way to ensure range tombstones are
+ // handled. TODO: allow Bloom checks where max_covering_tombstone_seq==0
+ bool no_range_del = read_options.ignore_range_deletions ||
+ is_range_del_table_empty_.load(std::memory_order_relaxed);
+ MultiGetRange temp_range(*range, range->begin(), range->end());
+ if (bloom_filter_ && no_range_del) {
+ bool whole_key =
+ !prefix_extractor_ || moptions_.memtable_whole_key_filtering;
+ std::array<Slice, MultiGetContext::MAX_BATCH_SIZE> bloom_keys;
+ std::array<bool, MultiGetContext::MAX_BATCH_SIZE> may_match;
+ std::array<size_t, MultiGetContext::MAX_BATCH_SIZE> range_indexes;
+ int num_keys = 0;
+ for (auto iter = temp_range.begin(); iter != temp_range.end(); ++iter) {
+ if (whole_key) {
+ bloom_keys[num_keys] = iter->ukey_without_ts;
+ range_indexes[num_keys++] = iter.index();
+ } else if (prefix_extractor_->InDomain(iter->ukey_without_ts)) {
+ bloom_keys[num_keys] =
+ prefix_extractor_->Transform(iter->ukey_without_ts);
+ range_indexes[num_keys++] = iter.index();
+ }
+ }
+ bloom_filter_->MayContain(num_keys, &bloom_keys[0], &may_match[0]);
+ for (int i = 0; i < num_keys; ++i) {
+ if (!may_match[i]) {
+ temp_range.SkipIndex(range_indexes[i]);
+ PERF_COUNTER_ADD(bloom_memtable_miss_count, 1);
+ } else {
+ PERF_COUNTER_ADD(bloom_memtable_hit_count, 1);
+ }
+ }
+ }
+ for (auto iter = temp_range.begin(); iter != temp_range.end(); ++iter) {
+ bool found_final_value{false};
+ bool merge_in_progress = iter->s->IsMergeInProgress();
+ if (!no_range_del) {
+ std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter(
+ NewRangeTombstoneIteratorInternal(
+ read_options, GetInternalKeySeqno(iter->lkey->internal_key()),
+ immutable_memtable));
+ SequenceNumber covering_seq =
+ range_del_iter->MaxCoveringTombstoneSeqnum(iter->lkey->user_key());
+ if (covering_seq > iter->max_covering_tombstone_seq) {
+ iter->max_covering_tombstone_seq = covering_seq;
+ if (iter->timestamp) {
+ // Will be overwritten in SaveValue() if there is a point key with
+ // a higher seqno.
+ iter->timestamp->assign(range_del_iter->timestamp().data(),
+ range_del_iter->timestamp().size());
+ }
+ }
+ }
+ SequenceNumber dummy_seq;
+ GetFromTable(*(iter->lkey), iter->max_covering_tombstone_seq, true,
+ callback, &iter->is_blob_index, iter->value->GetSelf(),
+ /*columns=*/nullptr, iter->timestamp, iter->s,
+ &(iter->merge_context), &dummy_seq, &found_final_value,
+ &merge_in_progress);
+
+ if (!found_final_value && merge_in_progress) {
+ *(iter->s) = Status::MergeInProgress();
+ }
+
+ if (found_final_value) {
+ iter->value->PinSelf();
+ range->AddValueSize(iter->value->size());
+ range->MarkKeyDone(iter);
+ RecordTick(moptions_.statistics, MEMTABLE_HIT);
+ if (range->GetValueSize() > read_options.value_size_soft_limit) {
+ // Set all remaining keys in range to Abort
+ for (auto range_iter = range->begin(); range_iter != range->end();
+ ++range_iter) {
+ range->MarkKeyDone(range_iter);
+ *(range_iter->s) = Status::Aborted();
+ }
+ break;
+ }
+ }
+ }
+ PERF_COUNTER_ADD(get_from_memtable_count, 1);
+}
+
+Status MemTable::Update(SequenceNumber seq, ValueType value_type,
+ const Slice& key, const Slice& value,
+ const ProtectionInfoKVOS64* kv_prot_info) {
+ LookupKey lkey(key, seq);
+ Slice mem_key = lkey.memtable_key();
+
+ std::unique_ptr<MemTableRep::Iterator> iter(
+ table_->GetDynamicPrefixIterator());
+ iter->Seek(lkey.internal_key(), mem_key.data());
+
+ if (iter->Valid()) {
+ // Refer to comments under MemTable::Add() for entry format.
+ // Check that it belongs to same user key. We do not check the
+ // sequence number since the Seek() call above should have skipped
+ // all entries with overly large sequence numbers.
+ const char* entry = iter->key();
+ uint32_t key_length = 0;
+ const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length);
+ if (comparator_.comparator.user_comparator()->Equal(
+ Slice(key_ptr, key_length - 8), lkey.user_key())) {
+ // Correct user key
+ const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8);
+ ValueType type;
+ SequenceNumber existing_seq;
+ UnPackSequenceAndType(tag, &existing_seq, &type);
+ assert(existing_seq != seq);
+ if (type == value_type) {
+ Slice prev_value = GetLengthPrefixedSlice(key_ptr + key_length);
+ uint32_t prev_size = static_cast<uint32_t>(prev_value.size());
+ uint32_t new_size = static_cast<uint32_t>(value.size());
+
+ // Update value, if new value size <= previous value size
+ if (new_size <= prev_size) {
+ char* p =
+ EncodeVarint32(const_cast<char*>(key_ptr) + key_length, new_size);
+ WriteLock wl(GetLock(lkey.user_key()));
+ memcpy(p, value.data(), value.size());
+ assert((unsigned)((p + value.size()) - entry) ==
+ (unsigned)(VarintLength(key_length) + key_length +
+ VarintLength(value.size()) + value.size()));
+ RecordTick(moptions_.statistics, NUMBER_KEYS_UPDATED);
+ if (kv_prot_info != nullptr) {
+ ProtectionInfoKVOS64 updated_kv_prot_info(*kv_prot_info);
+ // `seq` is swallowed and `existing_seq` prevails.
+ updated_kv_prot_info.UpdateS(seq, existing_seq);
+ UpdateEntryChecksum(&updated_kv_prot_info, key, value, type,
+ existing_seq, p + value.size());
+ Slice encoded(entry, p + value.size() - entry);
+ return VerifyEncodedEntry(encoded, updated_kv_prot_info);
+ } else {
+ UpdateEntryChecksum(nullptr, key, value, type, existing_seq,
+ p + value.size());
+ }
+ return Status::OK();
+ }
+ }
+ }
+ }
+
+ // The latest value is not value_type or key doesn't exist
+ return Add(seq, value_type, key, value, kv_prot_info);
+}
+
+Status MemTable::UpdateCallback(SequenceNumber seq, const Slice& key,
+ const Slice& delta,
+ const ProtectionInfoKVOS64* kv_prot_info) {
+ LookupKey lkey(key, seq);
+ Slice memkey = lkey.memtable_key();
+
+ std::unique_ptr<MemTableRep::Iterator> iter(
+ table_->GetDynamicPrefixIterator());
+ iter->Seek(lkey.internal_key(), memkey.data());
+
+ if (iter->Valid()) {
+ // Refer to comments under MemTable::Add() for entry format.
+ // Check that it belongs to same user key. We do not check the
+ // sequence number since the Seek() call above should have skipped
+ // all entries with overly large sequence numbers.
+ const char* entry = iter->key();
+ uint32_t key_length = 0;
+ const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length);
+ if (comparator_.comparator.user_comparator()->Equal(
+ Slice(key_ptr, key_length - 8), lkey.user_key())) {
+ // Correct user key
+ const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8);
+ ValueType type;
+ uint64_t existing_seq;
+ UnPackSequenceAndType(tag, &existing_seq, &type);
+ if (type == kTypeValue) {
+ Slice prev_value = GetLengthPrefixedSlice(key_ptr + key_length);
+ uint32_t prev_size = static_cast<uint32_t>(prev_value.size());
+
+ char* prev_buffer = const_cast<char*>(prev_value.data());
+ uint32_t new_prev_size = prev_size;
+
+ std::string str_value;
+ WriteLock wl(GetLock(lkey.user_key()));
+ auto status = moptions_.inplace_callback(prev_buffer, &new_prev_size,
+ delta, &str_value);
+ if (status == UpdateStatus::UPDATED_INPLACE) {
+ // Value already updated by callback.
+ assert(new_prev_size <= prev_size);
+ if (new_prev_size < prev_size) {
+ // overwrite the new prev_size
+ char* p = EncodeVarint32(const_cast<char*>(key_ptr) + key_length,
+ new_prev_size);
+ if (VarintLength(new_prev_size) < VarintLength(prev_size)) {
+ // shift the value buffer as well.
+ memcpy(p, prev_buffer, new_prev_size);
+ prev_buffer = p;
+ }
+ }
+ RecordTick(moptions_.statistics, NUMBER_KEYS_UPDATED);
+ UpdateFlushState();
+ Slice new_value(prev_buffer, new_prev_size);
+ if (kv_prot_info != nullptr) {
+ ProtectionInfoKVOS64 updated_kv_prot_info(*kv_prot_info);
+ // `seq` is swallowed and `existing_seq` prevails.
+ updated_kv_prot_info.UpdateS(seq, existing_seq);
+ updated_kv_prot_info.UpdateV(delta, new_value);
+ Slice encoded(entry, prev_buffer + new_prev_size - entry);
+ UpdateEntryChecksum(&updated_kv_prot_info, key, new_value, type,
+ existing_seq, prev_buffer + new_prev_size);
+ return VerifyEncodedEntry(encoded, updated_kv_prot_info);
+ } else {
+ UpdateEntryChecksum(nullptr, key, new_value, type, existing_seq,
+ prev_buffer + new_prev_size);
+ }
+ return Status::OK();
+ } else if (status == UpdateStatus::UPDATED) {
+ Status s;
+ if (kv_prot_info != nullptr) {
+ ProtectionInfoKVOS64 updated_kv_prot_info(*kv_prot_info);
+ updated_kv_prot_info.UpdateV(delta, str_value);
+ s = Add(seq, kTypeValue, key, Slice(str_value),
+ &updated_kv_prot_info);
+ } else {
+ s = Add(seq, kTypeValue, key, Slice(str_value),
+ nullptr /* kv_prot_info */);
+ }
+ RecordTick(moptions_.statistics, NUMBER_KEYS_WRITTEN);
+ UpdateFlushState();
+ return s;
+ } else if (status == UpdateStatus::UPDATE_FAILED) {
+ // `UPDATE_FAILED` is named incorrectly: it indicates that no update
+ // happened, not that a failure occurred.
+ UpdateFlushState();
+ return Status::OK();
+ }
+ }
+ }
+ }
+ // The latest value is not `kTypeValue` or key doesn't exist
+ return Status::NotFound();
+}
+
+size_t MemTable::CountSuccessiveMergeEntries(const LookupKey& key) {
+ Slice memkey = key.memtable_key();
+
+ // A totally ordered iterator is costly for some memtable representations
+ // (prefix-aware reps). By passing in the user key, we allow efficient
+ // iterator creation. The iterator only needs to be ordered within the same
+ // user key.
+ std::unique_ptr<MemTableRep::Iterator> iter(
+ table_->GetDynamicPrefixIterator());
+ iter->Seek(key.internal_key(), memkey.data());
+
+ size_t num_successive_merges = 0;
+
+ for (; iter->Valid(); iter->Next()) {
+ const char* entry = iter->key();
+ uint32_t key_length = 0;
+ const char* iter_key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length);
+ if (!comparator_.comparator.user_comparator()->Equal(
+ Slice(iter_key_ptr, key_length - 8), key.user_key())) {
+ break;
+ }
+
+ const uint64_t tag = DecodeFixed64(iter_key_ptr + key_length - 8);
+ ValueType type;
+ uint64_t unused;
+ UnPackSequenceAndType(tag, &unused, &type);
+ if (type != kTypeMerge) {
+ break;
+ }
+
+ ++num_successive_merges;
+ }
+
+ return num_successive_merges;
+}
+
+void MemTableRep::Get(const LookupKey& k, void* callback_args,
+ bool (*callback_func)(void* arg, const char* entry)) {
+ auto iter = GetDynamicPrefixIterator();
+ for (iter->Seek(k.internal_key(), k.memtable_key().data());
+ iter->Valid() && callback_func(callback_args, iter->key());
+ iter->Next()) {
+ }
+}
+
+void MemTable::RefLogContainingPrepSection(uint64_t log) {
+ assert(log > 0);
+ auto cur = min_prep_log_referenced_.load();
+ while ((log < cur || cur == 0) &&
+ !min_prep_log_referenced_.compare_exchange_strong(cur, log)) {
+ cur = min_prep_log_referenced_.load();
+ }
+}
+
+uint64_t MemTable::GetMinLogContainingPrepSection() {
+ return min_prep_log_referenced_.load();
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/memtable.h b/src/rocksdb/db/memtable.h
new file mode 100644
index 000000000..6db2721e4
--- /dev/null
+++ b/src/rocksdb/db/memtable.h
@@ -0,0 +1,664 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <atomic>
+#include <deque>
+#include <functional>
+#include <memory>
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+#include "db/dbformat.h"
+#include "db/kv_checksum.h"
+#include "db/range_tombstone_fragmenter.h"
+#include "db/read_callback.h"
+#include "db/version_edit.h"
+#include "memory/allocator.h"
+#include "memory/concurrent_arena.h"
+#include "monitoring/instrumented_mutex.h"
+#include "options/cf_options.h"
+#include "rocksdb/db.h"
+#include "rocksdb/memtablerep.h"
+#include "table/multiget_context.h"
+#include "util/dynamic_bloom.h"
+#include "util/hash.h"
+#include "util/hash_containers.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+struct FlushJobInfo;
+class Mutex;
+class MemTableIterator;
+class MergeContext;
+class SystemClock;
+
+struct ImmutableMemTableOptions {
+ explicit ImmutableMemTableOptions(const ImmutableOptions& ioptions,
+ const MutableCFOptions& mutable_cf_options);
+ size_t arena_block_size;
+ uint32_t memtable_prefix_bloom_bits;
+ size_t memtable_huge_page_size;
+ bool memtable_whole_key_filtering;
+ bool inplace_update_support;
+ size_t inplace_update_num_locks;
+ UpdateStatus (*inplace_callback)(char* existing_value,
+ uint32_t* existing_value_size,
+ Slice delta_value,
+ std::string* merged_value);
+ size_t max_successive_merges;
+ Statistics* statistics;
+ MergeOperator* merge_operator;
+ Logger* info_log;
+ bool allow_data_in_errors;
+ uint32_t protection_bytes_per_key;
+};
+
+// Batched counters to be updated when inserting keys in one write batch.
+// In post-processing of the write batch, these can be applied together.
+// Only used in the concurrent memtable insert case.
+struct MemTablePostProcessInfo {
+ uint64_t data_size = 0;
+ uint64_t num_entries = 0;
+ uint64_t num_deletes = 0;
+};
+
+using MultiGetRange = MultiGetContext::Range;
+// Note: Many of the methods in this class have comments indicating that
+// external synchronization is required as these methods are not thread-safe.
+// It is up to higher layers of code to decide how to prevent concurrent
+// invocation of these methods. This is usually done by acquiring either
+// the db mutex or the single writer thread.
+//
+// Some of these methods are documented to only require external
+// synchronization if this memtable is immutable. Calling MarkImmutable() is
+// not sufficient to guarantee immutability. It is up to higher layers of
+// code to determine if this MemTable can still be modified by other threads.
+// Eg: The Superversion stores a pointer to the current MemTable (that can
+// be modified) and a separate list of the MemTables that can no longer be
+// written to (aka the 'immutable memtables').
+class MemTable {
+ public:
+ struct KeyComparator : public MemTableRep::KeyComparator {
+ const InternalKeyComparator comparator;
+ explicit KeyComparator(const InternalKeyComparator& c) : comparator(c) {}
+ virtual int operator()(const char* prefix_len_key1,
+ const char* prefix_len_key2) const override;
+ virtual int operator()(const char* prefix_len_key,
+ const DecodedType& key) const override;
+ };
+
+ // MemTables are reference counted. The initial reference count
+ // is zero and the caller must call Ref() at least once.
+ //
+ // earliest_seq should be the current SequenceNumber in the db such that any
+ // key inserted into this memtable will have an equal or larger seq number.
+ // (When a db is first created, the earliest sequence number will be 0).
+ // If the earliest sequence number is not known, kMaxSequenceNumber may be
+ // used, but this may prevent some transactions from succeeding until the
+ // first key is inserted into the memtable.
+ explicit MemTable(const InternalKeyComparator& comparator,
+ const ImmutableOptions& ioptions,
+ const MutableCFOptions& mutable_cf_options,
+ WriteBufferManager* write_buffer_manager,
+ SequenceNumber earliest_seq, uint32_t column_family_id);
+ // No copying allowed
+ MemTable(const MemTable&) = delete;
+ MemTable& operator=(const MemTable&) = delete;
+
+ // Do not delete this MemTable unless Unref() indicates it is not in use.
+ ~MemTable();
+
+ // Increase reference count.
+ // REQUIRES: external synchronization to prevent simultaneous
+ // operations on the same MemTable.
+ void Ref() { ++refs_; }
+
+ // Drop reference count.
+ // If the refcount goes to zero return this memtable, otherwise return null.
+ // REQUIRES: external synchronization to prevent simultaneous
+ // operations on the same MemTable.
+ MemTable* Unref() {
+ --refs_;
+ assert(refs_ >= 0);
+ if (refs_ <= 0) {
+ return this;
+ }
+ return nullptr;
+ }
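+
+ // Illustrative lifecycle sketch (not part of the original documentation),
+ // assuming the caller holds the required external synchronization and
+ // supplies the constructor arguments shown:
+ //   MemTable* mem = new MemTable(cmp, ioptions, mutable_cf_options,
+ //                                write_buffer_manager, earliest_seq,
+ //                                column_family_id);
+ //   mem->Ref();                          // initial refcount is zero
+ //   ...                                  // use the memtable
+ //   if (MemTable* to_delete = mem->Unref()) {
+ //     delete to_delete;                  // refcount dropped to zero
+ //   }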
+
+ // Returns an estimate of the number of bytes of data in use by this
+ // data structure.
+ //
+ // REQUIRES: external synchronization to prevent simultaneous
+ // operations on the same MemTable (unless this Memtable is immutable).
+ size_t ApproximateMemoryUsage();
+
+ // As a cheap version of `ApproximateMemoryUsage()`, this function doesn't
+ // require external synchronization. The value may be less accurate, though.
+ size_t ApproximateMemoryUsageFast() const {
+ return approximate_memory_usage_.load(std::memory_order_relaxed);
+ }
+
+ // used by MemTableListVersion::MemoryAllocatedBytesExcludingLast
+ size_t MemoryAllocatedBytes() const {
+ return table_->ApproximateMemoryUsage() +
+ range_del_table_->ApproximateMemoryUsage() +
+ arena_.MemoryAllocatedBytes();
+ }
+
+ // Fills 'entries' with a set of unique random memtable entries of
+ // approximately 'target_sample_size' size.
+ //
+ // Note: the entries are stored in the unordered_set as length-prefixed keys,
+ // hence their representation in the set as "const char*".
+ // Note2: the size of the output set 'entries' is not enforced to be strictly
+ // equal to 'target_sample_size'. Its final size might be slightly
+ // greater or slightly less than 'target_sample_size'.
+ //
+ // REQUIRES: external synchronization to prevent simultaneous
+ // operations on the same MemTable (unless this Memtable is immutable).
+ // REQUIRES: SkipList memtable representation. This function is not
+ // implemented for any other type of memtable representation (vectorrep,
+ // hashskiplist,...).
+ void UniqueRandomSample(const uint64_t& target_sample_size,
+ std::unordered_set<const char*>* entries) {
+ // TODO(bjlemaire): at the moment, only supported by skiplistrep.
+ // Extend it to all other memtable representations.
+ table_->UniqueRandomSample(num_entries(), target_sample_size, entries);
+ }
+
+ // This method heuristically determines if the memtable should continue to
+ // host more data.
+ bool ShouldScheduleFlush() const {
+ return flush_state_.load(std::memory_order_relaxed) == FLUSH_REQUESTED;
+ }
+
+ // Returns true if a flush should be scheduled and the caller should
+ // be the one to schedule it
+ bool MarkFlushScheduled() {
+ auto before = FLUSH_REQUESTED;
+ return flush_state_.compare_exchange_strong(before, FLUSH_SCHEDULED,
+ std::memory_order_relaxed,
+ std::memory_order_relaxed);
+ }
+
+ // Return an iterator that yields the contents of the memtable.
+ //
+ // The caller must ensure that the underlying MemTable remains live
+ // while the returned iterator is live. The keys returned by this
+ // iterator are internal keys encoded by AppendInternalKey in the
+ // db/dbformat.{h,cc} module.
+ //
+ // By default, it returns an iterator for prefix seek if prefix_extractor
+ // is configured in Options.
+ // arena: If not null, the arena needs to be used to allocate the Iterator.
+ // Calling ~Iterator of the iterator will destroy all the states but
+ // those allocated in arena.
+ InternalIterator* NewIterator(const ReadOptions& read_options, Arena* arena);
+
+ // Returns an iterator that yields the range tombstones of the memtable.
+ // The caller must ensure that the underlying MemTable remains live
+ // while the returned iterator is live.
+ // @param immutable_memtable Whether this memtable is an immutable memtable.
+ // This information is not stored in memtable itself, so it needs to be
+ // specified by the caller. This flag is used internally to decide whether a
+ // cached fragmented range tombstone list can be returned. This cached version
+ // is constructed when a memtable becomes immutable. Setting the flag to false
+ // will always yield correct result, but may incur performance penalty as it
+ // always creates a new fragmented range tombstone list.
+ FragmentedRangeTombstoneIterator* NewRangeTombstoneIterator(
+ const ReadOptions& read_options, SequenceNumber read_seq,
+ bool immutable_memtable);
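+
+ // Illustrative usage sketch (not part of the original documentation),
+ // mirroring how the read path consumes this iterator; `mem`, `read_opts`,
+ // `snapshot_seq` and `user_key` are assumed to be provided by the caller:
+ //   std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter(
+ //       mem->NewRangeTombstoneIterator(read_opts, snapshot_seq,
+ //                                      /*immutable_memtable=*/false));
+ //   if (range_del_iter != nullptr) {
+ //     SequenceNumber covering_seq =
+ //         range_del_iter->MaxCoveringTombstoneSeqnum(user_key);
+ //     // covering_seq > 0 means a range tombstone covers `user_key`.
+ //   }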
+
+ Status VerifyEncodedEntry(Slice encoded,
+ const ProtectionInfoKVOS64& kv_prot_info);
+
+ // Add an entry into memtable that maps key to value at the
+ // specified sequence number and with the specified type.
+ // Typically value will be empty if type==kTypeDeletion.
+ //
+ // REQUIRES: if allow_concurrent = false, external synchronization to prevent
+ // simultaneous operations on the same MemTable.
+ //
+ // Returns `Status::TryAgain` if the `seq`, `key` combination already exists
+ // in the memtable and `MemTableRepFactory::CanHandleDuplicatedKey()` is true.
+ // The next attempt should try a larger value for `seq`.
+ Status Add(SequenceNumber seq, ValueType type, const Slice& key,
+ const Slice& value, const ProtectionInfoKVOS64* kv_prot_info,
+ bool allow_concurrent = false,
+ MemTablePostProcessInfo* post_process_info = nullptr,
+ void** hint = nullptr);
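+
+ // Illustrative insertion sketch (not part of the original documentation)
+ // for the single-writer case (allow_concurrent == false); `mem`, `seq`,
+ // `user_key` and `value` are assumed to come from the write path:
+ //   Status s = mem->Add(seq, kTypeValue, user_key, value,
+ //                       /*kv_prot_info=*/nullptr);
+ //   if (s.IsTryAgain()) {
+ //     // (seq, key) already exists; retry with a larger seq.
+ //   }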
+
+ // Used to get the value associated with key, or the Merge Operands
+ // associated with key.
+ // If do_merge = true, the default behavior, which is to get the value for
+ // key, is executed. The expected behavior is described right below.
+ // If the memtable contains a value for key, store it in *value and return
+ // true.
+ // If the memtable contains a deletion for key, store a NotFound() error
+ // in *status and return true.
+ // If the memtable contains a Merge operation as the most recent entry for a
+ // key, and the merge process does not stop (i.e., it does not reach a value
+ // or a delete), prepend the current merge operand to *operands, store
+ // MergeInProgress in s, and return false.
+ // Else, return false.
+ // If any operation was found, its most recent sequence number
+ // will be stored in *seq on success (regardless of whether true/false is
+ // returned). Otherwise, *seq will be set to kMaxSequenceNumber.
+ // On success, *s may be set to OK, NotFound, or MergeInProgress. Any other
+ // status returned indicates a corruption or other unexpected error.
+ // If do_merge = false then any Merge Operands encountered for key are simply
+ // stored in merge_context.operands_list and never actually merged to get a
+ // final value. The raw Merge Operands are eventually returned to the user.
+ // @param immutable_memtable Whether this memtable is immutable. Used
+ // internally by NewRangeTombstoneIterator(). See comment above
+ // NewRangeTombstoneIterator() for more detail.
+ bool Get(const LookupKey& key, std::string* value,
+ PinnableWideColumns* columns, std::string* timestamp, Status* s,
+ MergeContext* merge_context,
+ SequenceNumber* max_covering_tombstone_seq, SequenceNumber* seq,
+ const ReadOptions& read_opts, bool immutable_memtable,
+ ReadCallback* callback = nullptr, bool* is_blob_index = nullptr,
+ bool do_merge = true);
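+
+ // Illustrative point-lookup sketch (not part of the original
+ // documentation); `mem`, `user_key` and `snapshot_seq` are assumed to be
+ // provided by the caller:
+ //   LookupKey lkey(user_key, snapshot_seq);
+ //   std::string value, timestamp;
+ //   Status s;
+ //   MergeContext merge_context;
+ //   SequenceNumber max_covering_tombstone_seq = 0;
+ //   SequenceNumber seq;
+ //   bool found = mem->Get(lkey, &value, /*columns=*/nullptr, &timestamp, &s,
+ //                         &merge_context, &max_covering_tombstone_seq, &seq,
+ //                         ReadOptions(), /*immutable_memtable=*/false);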
+
+ bool Get(const LookupKey& key, std::string* value,
+ PinnableWideColumns* columns, std::string* timestamp, Status* s,
+ MergeContext* merge_context,
+ SequenceNumber* max_covering_tombstone_seq,
+ const ReadOptions& read_opts, bool immutable_memtable,
+ ReadCallback* callback = nullptr, bool* is_blob_index = nullptr,
+ bool do_merge = true) {
+ SequenceNumber seq;
+ return Get(key, value, columns, timestamp, s, merge_context,
+ max_covering_tombstone_seq, &seq, read_opts, immutable_memtable,
+ callback, is_blob_index, do_merge);
+ }
+
+ // @param immutable_memtable Whether this memtable is immutable. Used
+ // internally by NewRangeTombstoneIterator(). See comment above
+ // NewRangeTombstoneIterator() for more detail.
+ void MultiGet(const ReadOptions& read_options, MultiGetRange* range,
+ ReadCallback* callback, bool immutable_memtable);
+
+ // If `key` exists in current memtable with type value_type and the existing
+ // value is at least as large as the new value, updates it in-place. Otherwise
+ // adds the new value to the memtable out-of-place.
+ //
+ // Returns `Status::TryAgain` if the `seq`, `key` combination already exists
+ // in the memtable and `MemTableRepFactory::CanHandleDuplicatedKey()` is true.
+ // The next attempt should try a larger value for `seq`.
+ //
+ // REQUIRES: external synchronization to prevent simultaneous
+ // operations on the same MemTable.
+ Status Update(SequenceNumber seq, ValueType value_type, const Slice& key,
+ const Slice& value, const ProtectionInfoKVOS64* kv_prot_info);
+
+ // If `key` exists in current memtable with type `kTypeValue` and the existing
+ // value is at least as large as the new value, updates it in-place. Otherwise
+ // if `key` exists in current memtable with type `kTypeValue`, adds the new
+ // value to the memtable out-of-place.
+ //
+ // Returns `Status::NotFound` if `key` does not exist in current memtable or
+ // the latest version of `key` does not have `kTypeValue`.
+ //
+ // Returns `Status::TryAgain` if the `seq`, `key` combination already exists
+ // in the memtable and `MemTableRepFactory::CanHandleDuplicatedKey()` is true.
+ // The next attempt should try a larger value for `seq`.
+ //
+ // REQUIRES: external synchronization to prevent simultaneous
+ // operations on the same MemTable.
+ Status UpdateCallback(SequenceNumber seq, const Slice& key,
+ const Slice& delta,
+ const ProtectionInfoKVOS64* kv_prot_info);
+
+ // Returns the number of successive merge entries starting from the newest
+ // entry for the key up to the last non-merge entry or last entry for the
+ // key in the memtable.
+ size_t CountSuccessiveMergeEntries(const LookupKey& key);
+
+ // Update counters and flush status after inserting a whole write batch
+ // Used in concurrent memtable inserts.
+ void BatchPostProcess(const MemTablePostProcessInfo& update_counters) {
+ num_entries_.fetch_add(update_counters.num_entries,
+ std::memory_order_relaxed);
+ data_size_.fetch_add(update_counters.data_size, std::memory_order_relaxed);
+ if (update_counters.num_deletes != 0) {
+ num_deletes_.fetch_add(update_counters.num_deletes,
+ std::memory_order_relaxed);
+ }
+ UpdateFlushState();
+ }
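+
+ // Illustrative concurrent-insert sketch (not part of the original
+ // documentation): each writer thread passes its own MemTablePostProcessInfo
+ // to Add() with allow_concurrent == true, then publishes the batched
+ // counters once; `mem`, `seq`, `user_key` and `value` are assumed to come
+ // from the write path:
+ //   MemTablePostProcessInfo info;
+ //   Status s = mem->Add(seq, kTypeValue, user_key, value,
+ //                       /*kv_prot_info=*/nullptr, /*allow_concurrent=*/true,
+ //                       &info);
+ //   ...                    // more Add() calls from the same write batch
+ //   mem->BatchPostProcess(info);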
+
+ // Get total number of entries in the mem table.
+ // REQUIRES: external synchronization to prevent simultaneous
+ // operations on the same MemTable (unless this Memtable is immutable).
+ uint64_t num_entries() const {
+ return num_entries_.load(std::memory_order_relaxed);
+ }
+
+ // Get total number of deletes in the mem table.
+ // REQUIRES: external synchronization to prevent simultaneous
+ // operations on the same MemTable (unless this Memtable is immutable).
+ uint64_t num_deletes() const {
+ return num_deletes_.load(std::memory_order_relaxed);
+ }
+
+ uint64_t get_data_size() const {
+ return data_size_.load(std::memory_order_relaxed);
+ }
+
+ // Dynamically change the memtable's capacity. If set below the current usage,
+ // the next key added will trigger a flush. Can only increase size when
+ // memtable prefix bloom is disabled, since we can't easily allocate more
+ // space.
+ void UpdateWriteBufferSize(size_t new_write_buffer_size) {
+ if (bloom_filter_ == nullptr ||
+ new_write_buffer_size < write_buffer_size_) {
+ write_buffer_size_.store(new_write_buffer_size,
+ std::memory_order_relaxed);
+ }
+ }
+
+ // Returns the edits area that is needed for flushing the memtable
+ VersionEdit* GetEdits() { return &edit_; }
+
+ // Returns true if no entry has been inserted into the memtable.
+ // REQUIRES: external synchronization to prevent simultaneous
+ // operations on the same MemTable (unless this Memtable is immutable).
+ bool IsEmpty() const { return first_seqno_ == 0; }
+
+ // Returns the sequence number of the first element that was inserted
+ // into the memtable.
+ // REQUIRES: external synchronization to prevent simultaneous
+ // operations on the same MemTable (unless this Memtable is immutable).
+ SequenceNumber GetFirstSequenceNumber() {
+ return first_seqno_.load(std::memory_order_relaxed);
+ }
+
+ // Sets the sequence number of the first element that was inserted
+ // into the memtable.
+ // REQUIRES: external synchronization to prevent simultaneous
+ // operations on the same MemTable (unless this Memtable is immutable).
+ void SetFirstSequenceNumber(SequenceNumber first_seqno) {
+ return first_seqno_.store(first_seqno, std::memory_order_relaxed);
+ }
+
+ // Returns the sequence number that is guaranteed to be smaller than or equal
+ // to the sequence number of any key that could be inserted into this
+ // memtable. It can then be assumed that any write with a larger(or equal)
+ // sequence number will be present in this memtable or a later memtable.
+ //
+ // If the earliest sequence number could not be determined,
+ // kMaxSequenceNumber will be returned.
+ SequenceNumber GetEarliestSequenceNumber() {
+ return earliest_seqno_.load(std::memory_order_relaxed);
+ }
+
+ // Sets the sequence number that is guaranteed to be smaller than or equal
+ // to the sequence number of any key that could be inserted into this
+ // memtable. It can then be assumed that any write with a larger(or equal)
+ // sequence number will be present in this memtable or a later memtable.
+ // Used only for MemPurge operation
+ void SetEarliestSequenceNumber(SequenceNumber earliest_seqno) {
+ return earliest_seqno_.store(earliest_seqno, std::memory_order_relaxed);
+ }
+
+ // DB's latest sequence ID when the memtable is created. This number
+ // may be updated to a more recent one before any key is inserted.
+ SequenceNumber GetCreationSeq() const { return creation_seq_; }
+
+ void SetCreationSeq(SequenceNumber sn) { creation_seq_ = sn; }
+
+ // Returns the next active logfile number when this memtable is about to
+ // be flushed to storage
+ // REQUIRES: external synchronization to prevent simultaneous
+ // operations on the same MemTable.
+ uint64_t GetNextLogNumber() { return mem_next_logfile_number_; }
+
+ // Sets the next active logfile number when this memtable is about to
+ // be flushed to storage
+ // REQUIRES: external synchronization to prevent simultaneous
+ // operations on the same MemTable.
+ void SetNextLogNumber(uint64_t num) { mem_next_logfile_number_ = num; }
+
+ // If this memtable contains data from a committed two-phase transaction,
+ // we must take note of the log which contains that data so we know when
+ // to release that log.
+ void RefLogContainingPrepSection(uint64_t log);
+ uint64_t GetMinLogContainingPrepSection();
+
+ // Notify the underlying storage that no more items will be added.
+ // REQUIRES: external synchronization to prevent simultaneous
+ // operations on the same MemTable.
+ // After MarkImmutable() is called, you should not attempt to
+ // write anything to this MemTable (i.e., do not call Add() or Update()).
+ void MarkImmutable() {
+ table_->MarkReadOnly();
+ mem_tracker_.DoneAllocating();
+ }
+
+ // Notify the underlying storage that all data it contained has been
+ // persisted.
+ // REQUIRES: external synchronization to prevent simultaneous
+ // operations on the same MemTable.
+ void MarkFlushed() { table_->MarkFlushed(); }
+
+ // return true if the current MemTableRep supports merge operator.
+ bool IsMergeOperatorSupported() const {
+ return table_->IsMergeOperatorSupported();
+ }
+
+ // return true if the current MemTableRep supports snapshots.
+ // Inplace update prevents snapshots.
+ bool IsSnapshotSupported() const {
+ return table_->IsSnapshotSupported() && !moptions_.inplace_update_support;
+ }
+
+ struct MemTableStats {
+ uint64_t size;
+ uint64_t count;
+ };
+
+ MemTableStats ApproximateStats(const Slice& start_ikey,
+ const Slice& end_ikey);
+
+ // Get the lock associated for the key
+ port::RWMutex* GetLock(const Slice& key);
+
+ const InternalKeyComparator& GetInternalKeyComparator() const {
+ return comparator_.comparator;
+ }
+
+ const ImmutableMemTableOptions* GetImmutableMemTableOptions() const {
+ return &moptions_;
+ }
+
+ uint64_t ApproximateOldestKeyTime() const {
+ return oldest_key_time_.load(std::memory_order_relaxed);
+ }
+
+ // REQUIRES: db_mutex held.
+ void SetID(uint64_t id) { id_ = id; }
+
+ uint64_t GetID() const { return id_; }
+
+ void SetFlushCompleted(bool completed) { flush_completed_ = completed; }
+
+ uint64_t GetFileNumber() const { return file_number_; }
+
+ void SetFileNumber(uint64_t file_num) { file_number_ = file_num; }
+
+ void SetFlushInProgress(bool in_progress) {
+ flush_in_progress_ = in_progress;
+ }
+
+#ifndef ROCKSDB_LITE
+ void SetFlushJobInfo(std::unique_ptr<FlushJobInfo>&& info) {
+ flush_job_info_ = std::move(info);
+ }
+
+ std::unique_ptr<FlushJobInfo> ReleaseFlushJobInfo() {
+ return std::move(flush_job_info_);
+ }
+#endif // !ROCKSDB_LITE
+
+ // Returns a heuristic flush decision
+ bool ShouldFlushNow();
+
+ void ConstructFragmentedRangeTombstones();
+
+ // Returns whether a fragmented range tombstone list is already constructed
+ // for this memtable. It should be constructed right before a memtable is
+ // added to an immutable memtable list. Note that if a memtable does not have
+ // any range tombstone, then no range tombstone list will ever be constructed.
+ // @param allow_empty Specifies whether a memtable with no range tombstone is
+ // considered to have its fragmented range tombstone list constructed.
+ bool IsFragmentedRangeTombstonesConstructed(bool allow_empty = true) const {
+ if (allow_empty) {
+ return fragmented_range_tombstone_list_.get() != nullptr ||
+ is_range_del_table_empty_;
+ } else {
+ return fragmented_range_tombstone_list_.get() != nullptr;
+ }
+ }
+
+ // Returns Corruption status if verification fails.
+ static Status VerifyEntryChecksum(const char* entry,
+ size_t protection_bytes_per_key,
+ bool allow_data_in_errors = false);
+
+ private:
+ enum FlushStateEnum { FLUSH_NOT_REQUESTED, FLUSH_REQUESTED, FLUSH_SCHEDULED };
+
+ friend class MemTableIterator;
+ friend class MemTableBackwardIterator;
+ friend class MemTableList;
+
+ KeyComparator comparator_;
+ const ImmutableMemTableOptions moptions_;
+ int refs_;
+ const size_t kArenaBlockSize;
+ AllocTracker mem_tracker_;
+ ConcurrentArena arena_;
+ std::unique_ptr<MemTableRep> table_;
+ std::unique_ptr<MemTableRep> range_del_table_;
+ std::atomic_bool is_range_del_table_empty_;
+
+ // Total data size of all data inserted
+ std::atomic<uint64_t> data_size_;
+ std::atomic<uint64_t> num_entries_;
+ std::atomic<uint64_t> num_deletes_;
+
+ // Dynamically changeable memtable option
+ std::atomic<size_t> write_buffer_size_;
+
+ // These are used to manage memtable flushes to storage
+ bool flush_in_progress_; // started the flush
+ bool flush_completed_; // finished the flush
+ uint64_t file_number_; // filled up after flush is complete
+
+ // The updates to be applied to the transaction log when this
+ // memtable is flushed to storage.
+ VersionEdit edit_;
+
+ // The sequence number of the kv that was inserted first
+ std::atomic<SequenceNumber> first_seqno_;
+
+ // The db sequence number at the time of creation or kMaxSequenceNumber
+ // if not set.
+ std::atomic<SequenceNumber> earliest_seqno_;
+
+ SequenceNumber creation_seq_;
+
+ // The log files earlier than this number can be deleted.
+ uint64_t mem_next_logfile_number_;
+
+ // the earliest log containing a prepared section
+ // which has been inserted into this memtable.
+ std::atomic<uint64_t> min_prep_log_referenced_;
+
+ // rw locks for inplace updates
+ std::vector<port::RWMutex> locks_;
+
+ const SliceTransform* const prefix_extractor_;
+ std::unique_ptr<DynamicBloom> bloom_filter_;
+
+ std::atomic<FlushStateEnum> flush_state_;
+
+ SystemClock* clock_;
+
+ // Extract sequential insert prefixes.
+ const SliceTransform* insert_with_hint_prefix_extractor_;
+
+ // Insert hints for each prefix.
+ UnorderedMapH<Slice, void*, SliceHasher> insert_hints_;
+
+ // Timestamp of oldest key
+ std::atomic<uint64_t> oldest_key_time_;
+
+ // Memtable id to track flush.
+ uint64_t id_ = 0;
+
+ // Sequence number of the atomic flush that is responsible for this memtable.
+ // The sequence number of atomic flush is a seq, such that no writes with
+ // sequence numbers greater than or equal to seq are flushed, while all
+ // writes with sequence number smaller than seq are flushed.
+ SequenceNumber atomic_flush_seqno_;
+
+ // keep track of memory usage in table_, arena_, and range_del_table_.
+ // Gets refreshed inside `ApproximateMemoryUsage()` or `ShouldFlushNow`
+ std::atomic<uint64_t> approximate_memory_usage_;
+
+#ifndef ROCKSDB_LITE
+ // Flush job info of the current memtable.
+ std::unique_ptr<FlushJobInfo> flush_job_info_;
+#endif // !ROCKSDB_LITE
+
+ // Updates flush_state_ using ShouldFlushNow()
+ void UpdateFlushState();
+
+ void UpdateOldestKeyTime();
+
+ void GetFromTable(const LookupKey& key,
+ SequenceNumber max_covering_tombstone_seq, bool do_merge,
+ ReadCallback* callback, bool* is_blob_index,
+ std::string* value, PinnableWideColumns* columns,
+ std::string* timestamp, Status* s,
+ MergeContext* merge_context, SequenceNumber* seq,
+ bool* found_final_value, bool* merge_in_progress);
+
+ // Always returns non-null and assumes certain pre-checks (e.g.,
+ // is_range_del_table_empty_) are done. This is only valid during the lifetime
+ // of the underlying memtable.
+ // read_seq and read_options.timestamp will be used as the upper bound
+ // for range tombstones.
+ FragmentedRangeTombstoneIterator* NewRangeTombstoneIteratorInternal(
+ const ReadOptions& read_options, SequenceNumber read_seq,
+ bool immutable_memtable);
+
+ // The fragmented range tombstones of this memtable.
+ // This is constructed when this memtable becomes immutable
+ // if !is_range_del_table_empty_.
+ std::unique_ptr<FragmentedRangeTombstoneList>
+ fragmented_range_tombstone_list_;
+
+ // makes sure there is a single range tombstone writer to invalidate cache
+ std::mutex range_del_mutex_;
+ CoreLocalArray<std::shared_ptr<FragmentedRangeTombstoneListCache>>
+ cached_range_tombstone_;
+
+ void UpdateEntryChecksum(const ProtectionInfoKVOS64* kv_prot_info,
+ const Slice& key, const Slice& value, ValueType type,
+ SequenceNumber s, char* checksum_ptr);
+};
+
+extern const char* EncodeKey(std::string* scratch, const Slice& target);
+
+} // namespace ROCKSDB_NAMESPACE
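
The class above moves each memtable through a fixed lifecycle: writable while mutable, read-only after MarkImmutable(), then flush_in_progress_, flush_completed_ and file_number_ are set while it is flushed, and MarkFlushed() once the data is persisted. The following is a minimal standalone sketch of that lifecycle; ToyMemTable, MemState and the state names are invented stand-ins, not the real MemTable API.

// Sketch only: ToyMemTable and MemState are invented stand-ins, not RocksDB types.
#include <cassert>
#include <cstdint>
#include <iostream>

enum class MemState { kMutable, kImmutable, kFlushInProgress, kFlushed };

struct ToyMemTable {
  MemState state = MemState::kMutable;
  uint64_t file_number = 0;  // assigned once the flush result is committed

  void MarkImmutable() {  // no more Add()/Update() after this
    assert(state == MemState::kMutable);
    state = MemState::kImmutable;
  }
  void PickForFlush() {  // corresponds to flush_in_progress_ = true
    assert(state == MemState::kImmutable);
    state = MemState::kFlushInProgress;
  }
  void MarkFlushed(uint64_t file_num) {  // flush_completed_ + file_number_
    assert(state == MemState::kFlushInProgress);
    file_number = file_num;
    state = MemState::kFlushed;
  }
};

int main() {
  ToyMemTable m;
  m.MarkImmutable();
  m.PickForFlush();
  m.MarkFlushed(42);
  std::cout << "flushed to table #" << m.file_number << "\n";
  return 0;
}
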
diff --git a/src/rocksdb/db/memtable_list.cc b/src/rocksdb/db/memtable_list.cc
new file mode 100644
index 000000000..1545003ad
--- /dev/null
+++ b/src/rocksdb/db/memtable_list.cc
@@ -0,0 +1,991 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#include "db/memtable_list.h"
+
+#include <algorithm>
+#include <cinttypes>
+#include <limits>
+#include <queue>
+#include <string>
+
+#include "db/db_impl/db_impl.h"
+#include "db/memtable.h"
+#include "db/range_tombstone_fragmenter.h"
+#include "db/version_set.h"
+#include "logging/log_buffer.h"
+#include "logging/logging.h"
+#include "monitoring/thread_status_util.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/iterator.h"
+#include "table/merging_iterator.h"
+#include "test_util/sync_point.h"
+#include "util/coding.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class InternalKeyComparator;
+class Mutex;
+class VersionSet;
+
+void MemTableListVersion::AddMemTable(MemTable* m) {
+ memlist_.push_front(m);
+ *parent_memtable_list_memory_usage_ += m->ApproximateMemoryUsage();
+}
+
+void MemTableListVersion::UnrefMemTable(autovector<MemTable*>* to_delete,
+ MemTable* m) {
+ if (m->Unref()) {
+ to_delete->push_back(m);
+ assert(*parent_memtable_list_memory_usage_ >= m->ApproximateMemoryUsage());
+ *parent_memtable_list_memory_usage_ -= m->ApproximateMemoryUsage();
+ }
+}
+
+MemTableListVersion::MemTableListVersion(
+ size_t* parent_memtable_list_memory_usage, const MemTableListVersion& old)
+ : max_write_buffer_number_to_maintain_(
+ old.max_write_buffer_number_to_maintain_),
+ max_write_buffer_size_to_maintain_(
+ old.max_write_buffer_size_to_maintain_),
+ parent_memtable_list_memory_usage_(parent_memtable_list_memory_usage) {
+ memlist_ = old.memlist_;
+ for (auto& m : memlist_) {
+ m->Ref();
+ }
+
+ memlist_history_ = old.memlist_history_;
+ for (auto& m : memlist_history_) {
+ m->Ref();
+ }
+}
+
+MemTableListVersion::MemTableListVersion(
+ size_t* parent_memtable_list_memory_usage,
+ int max_write_buffer_number_to_maintain,
+ int64_t max_write_buffer_size_to_maintain)
+ : max_write_buffer_number_to_maintain_(max_write_buffer_number_to_maintain),
+ max_write_buffer_size_to_maintain_(max_write_buffer_size_to_maintain),
+ parent_memtable_list_memory_usage_(parent_memtable_list_memory_usage) {}
+
+void MemTableListVersion::Ref() { ++refs_; }
+
+// called by superversion::clean()
+void MemTableListVersion::Unref(autovector<MemTable*>* to_delete) {
+ assert(refs_ >= 1);
+ --refs_;
+ if (refs_ == 0) {
+ // A null to_delete means the caller is confident that refs_ will not
+ // drop to zero here.
+ assert(to_delete != nullptr);
+ for (const auto& m : memlist_) {
+ UnrefMemTable(to_delete, m);
+ }
+ for (const auto& m : memlist_history_) {
+ UnrefMemTable(to_delete, m);
+ }
+ delete this;
+ }
+}
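
Ref()/Unref() never delete a memtable inline: a table whose refcount reaches zero is pushed onto to_delete so the caller can free it once the DB mutex is released. A small self-contained sketch of that pattern follows; ToyMem and UnrefIntoToDelete are invented stand-ins for MemTable and UnrefMemTable(), and std::vector replaces autovector.

// Sketch only: ToyMem stands in for MemTable, std::vector for autovector.
#include <iostream>
#include <list>
#include <vector>

struct ToyMem {
  int refs = 0;
  void Ref() { ++refs; }
  bool Unref() { return --refs == 0; }  // true => caller owns the deletion
};

// Mirrors UnrefMemTable(): never delete inline, hand the table back to the
// caller so it can be freed after the mutex is released.
void UnrefIntoToDelete(std::vector<ToyMem*>* to_delete, ToyMem* m) {
  if (m->Unref()) {
    to_delete->push_back(m);
  }
}

int main() {
  std::list<ToyMem*> memlist{new ToyMem, new ToyMem};
  for (auto* m : memlist) m->Ref();

  std::vector<ToyMem*> to_delete;
  for (auto* m : memlist) UnrefIntoToDelete(&to_delete, m);
  // ...the DB mutex would be released here in the real code...
  for (auto* m : to_delete) delete m;
  std::cout << "freed " << to_delete.size() << " memtables\n";
  return 0;
}
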
+
+int MemTableList::NumNotFlushed() const {
+ int size = static_cast<int>(current_->memlist_.size());
+ assert(num_flush_not_started_ <= size);
+ return size;
+}
+
+int MemTableList::NumFlushed() const {
+ return static_cast<int>(current_->memlist_history_.size());
+}
+
+// Search all the memtables starting from the most recent one.
+// Return the most recent value found, if any.
+// Operands stores the list of merge operations to apply, so far.
+bool MemTableListVersion::Get(const LookupKey& key, std::string* value,
+ PinnableWideColumns* columns,
+ std::string* timestamp, Status* s,
+ MergeContext* merge_context,
+ SequenceNumber* max_covering_tombstone_seq,
+ SequenceNumber* seq, const ReadOptions& read_opts,
+ ReadCallback* callback, bool* is_blob_index) {
+ return GetFromList(&memlist_, key, value, columns, timestamp, s,
+ merge_context, max_covering_tombstone_seq, seq, read_opts,
+ callback, is_blob_index);
+}
+
+void MemTableListVersion::MultiGet(const ReadOptions& read_options,
+ MultiGetRange* range,
+ ReadCallback* callback) {
+ for (auto memtable : memlist_) {
+ memtable->MultiGet(read_options, range, callback,
+ true /* immutable_memtable */);
+ if (range->empty()) {
+ return;
+ }
+ }
+}
+
+bool MemTableListVersion::GetMergeOperands(
+ const LookupKey& key, Status* s, MergeContext* merge_context,
+ SequenceNumber* max_covering_tombstone_seq, const ReadOptions& read_opts) {
+ for (MemTable* memtable : memlist_) {
+ bool done = memtable->Get(
+ key, /*value=*/nullptr, /*columns=*/nullptr, /*timestamp=*/nullptr, s,
+ merge_context, max_covering_tombstone_seq, read_opts,
+ true /* immutable_memtable */, nullptr, nullptr, false);
+ if (done) {
+ return true;
+ }
+ }
+ return false;
+}
+
+bool MemTableListVersion::GetFromHistory(
+ const LookupKey& key, std::string* value, PinnableWideColumns* columns,
+ std::string* timestamp, Status* s, MergeContext* merge_context,
+ SequenceNumber* max_covering_tombstone_seq, SequenceNumber* seq,
+ const ReadOptions& read_opts, bool* is_blob_index) {
+ return GetFromList(&memlist_history_, key, value, columns, timestamp, s,
+ merge_context, max_covering_tombstone_seq, seq, read_opts,
+ nullptr /*read_callback*/, is_blob_index);
+}
+
+bool MemTableListVersion::GetFromList(
+ std::list<MemTable*>* list, const LookupKey& key, std::string* value,
+ PinnableWideColumns* columns, std::string* timestamp, Status* s,
+ MergeContext* merge_context, SequenceNumber* max_covering_tombstone_seq,
+ SequenceNumber* seq, const ReadOptions& read_opts, ReadCallback* callback,
+ bool* is_blob_index) {
+ *seq = kMaxSequenceNumber;
+
+ for (auto& memtable : *list) {
+ assert(memtable->IsFragmentedRangeTombstonesConstructed());
+ SequenceNumber current_seq = kMaxSequenceNumber;
+
+ bool done =
+ memtable->Get(key, value, columns, timestamp, s, merge_context,
+ max_covering_tombstone_seq, &current_seq, read_opts,
+ true /* immutable_memtable */, callback, is_blob_index);
+ if (*seq == kMaxSequenceNumber) {
+ // Store the most recent sequence number of any operation on this key.
+ // Since we only care about the most recent change, we only need to
+ // return the first operation found when searching memtables in
+ // reverse-chronological order.
+ // current_seq would be equal to kMaxSequenceNumber if the value was to be
+ // skipped. This allows seq to be assigned again when the next value is
+ // read.
+ *seq = current_seq;
+ }
+
+ if (done) {
+ assert(*seq != kMaxSequenceNumber || s->IsNotFound());
+ return true;
+ }
+ if (!done && !s->ok() && !s->IsMergeInProgress() && !s->IsNotFound()) {
+ return false;
+ }
+ }
+ return false;
+}
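
GetFromList() scans the memtables from newest to oldest, stops at the first table that knows about the key, and records that operation's sequence number in *seq. Below is a simplified, runnable sketch of the same search order; ToyTable (a std::map keyed by user key) and GetFromToyList are invented stand-ins and ignore merges, tombstones and callbacks.

// Sketch only: ToyTable is a std::map standing in for an immutable memtable.
#include <cstdint>
#include <iostream>
#include <list>
#include <map>
#include <optional>
#include <string>
#include <utility>

constexpr uint64_t kMaxSeq = UINT64_MAX;

using ToyTable = std::map<std::string, std::pair<uint64_t, std::string>>;

// Same search order as GetFromList(): newest table first, first hit wins,
// and *seq captures the sequence number of the most recent operation seen.
std::optional<std::string> GetFromToyList(
    const std::list<ToyTable>& newest_first, const std::string& key,
    uint64_t* seq) {
  *seq = kMaxSeq;
  for (const auto& table : newest_first) {
    auto it = table.find(key);
    if (it != table.end()) {
      if (*seq == kMaxSeq) *seq = it->second.first;
      return it->second.second;
    }
  }
  return std::nullopt;
}

int main() {
  std::list<ToyTable> mems;               // front == newest
  mems.push_back({{"k", {7, "old"}}});    // older memtable
  mems.push_front({{"k", {12, "new"}}});  // newer memtable
  uint64_t seq = 0;
  auto v = GetFromToyList(mems, "k", &seq);
  std::cout << *v << " @ seq " << seq << "\n";  // prints "new @ seq 12"
  return 0;
}
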
+
+Status MemTableListVersion::AddRangeTombstoneIterators(
+ const ReadOptions& read_opts, Arena* /*arena*/,
+ RangeDelAggregator* range_del_agg) {
+ assert(range_del_agg != nullptr);
+ // Except for snapshot read, using kMaxSequenceNumber is OK because these
+ // are immutable memtables.
+ SequenceNumber read_seq = read_opts.snapshot != nullptr
+ ? read_opts.snapshot->GetSequenceNumber()
+ : kMaxSequenceNumber;
+ for (auto& m : memlist_) {
+ assert(m->IsFragmentedRangeTombstonesConstructed());
+ std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter(
+ m->NewRangeTombstoneIterator(read_opts, read_seq,
+ true /* immutable_memtable */));
+ range_del_agg->AddTombstones(std::move(range_del_iter));
+ }
+ return Status::OK();
+}
+
+void MemTableListVersion::AddIterators(
+ const ReadOptions& options, std::vector<InternalIterator*>* iterator_list,
+ Arena* arena) {
+ for (auto& m : memlist_) {
+ iterator_list->push_back(m->NewIterator(options, arena));
+ }
+}
+
+void MemTableListVersion::AddIterators(const ReadOptions& options,
+ MergeIteratorBuilder* merge_iter_builder,
+ bool add_range_tombstone_iter) {
+ for (auto& m : memlist_) {
+ auto mem_iter = m->NewIterator(options, merge_iter_builder->GetArena());
+ if (!add_range_tombstone_iter || options.ignore_range_deletions) {
+ merge_iter_builder->AddIterator(mem_iter);
+ } else {
+ // Except for snapshot read, using kMaxSequenceNumber is OK because these
+ // are immutable memtables.
+ SequenceNumber read_seq = options.snapshot != nullptr
+ ? options.snapshot->GetSequenceNumber()
+ : kMaxSequenceNumber;
+ TruncatedRangeDelIterator* mem_tombstone_iter = nullptr;
+ auto range_del_iter = m->NewRangeTombstoneIterator(
+ options, read_seq, true /* immutable_memtable */);
+ if (range_del_iter == nullptr || range_del_iter->empty()) {
+ delete range_del_iter;
+ } else {
+ mem_tombstone_iter = new TruncatedRangeDelIterator(
+ std::unique_ptr<FragmentedRangeTombstoneIterator>(range_del_iter),
+ &m->GetInternalKeyComparator(), nullptr /* smallest */,
+ nullptr /* largest */);
+ }
+ merge_iter_builder->AddPointAndTombstoneIterator(mem_iter,
+ mem_tombstone_iter);
+ }
+ }
+}
+
+uint64_t MemTableListVersion::GetTotalNumEntries() const {
+ uint64_t total_num = 0;
+ for (auto& m : memlist_) {
+ total_num += m->num_entries();
+ }
+ return total_num;
+}
+
+MemTable::MemTableStats MemTableListVersion::ApproximateStats(
+ const Slice& start_ikey, const Slice& end_ikey) {
+ MemTable::MemTableStats total_stats = {0, 0};
+ for (auto& m : memlist_) {
+ auto mStats = m->ApproximateStats(start_ikey, end_ikey);
+ total_stats.size += mStats.size;
+ total_stats.count += mStats.count;
+ }
+ return total_stats;
+}
+
+uint64_t MemTableListVersion::GetTotalNumDeletes() const {
+ uint64_t total_num = 0;
+ for (auto& m : memlist_) {
+ total_num += m->num_deletes();
+ }
+ return total_num;
+}
+
+SequenceNumber MemTableListVersion::GetEarliestSequenceNumber(
+ bool include_history) const {
+ if (include_history && !memlist_history_.empty()) {
+ return memlist_history_.back()->GetEarliestSequenceNumber();
+ } else if (!memlist_.empty()) {
+ return memlist_.back()->GetEarliestSequenceNumber();
+ } else {
+ return kMaxSequenceNumber;
+ }
+}
+
+SequenceNumber MemTableListVersion::GetFirstSequenceNumber() const {
+ SequenceNumber min_first_seqno = kMaxSequenceNumber;
+ // The first memtable in the list might not be the oldest one with mempurge
+ for (const auto& m : memlist_) {
+ min_first_seqno = std::min(m->GetFirstSequenceNumber(), min_first_seqno);
+ }
+ return min_first_seqno;
+}
+
+// caller is responsible for referencing m
+void MemTableListVersion::Add(MemTable* m, autovector<MemTable*>* to_delete) {
+ assert(refs_ == 1); // only when refs_ == 1 is MemTableListVersion mutable
+ AddMemTable(m);
+ // m->MemoryAllocatedBytes() is added in MemoryAllocatedBytesExcludingLast
+ TrimHistory(to_delete, 0);
+}
+
+// Removes m from list of memtables not flushed. Caller should NOT Unref m.
+void MemTableListVersion::Remove(MemTable* m,
+ autovector<MemTable*>* to_delete) {
+ assert(refs_ == 1); // only when refs_ == 1 is MemTableListVersion mutable
+ memlist_.remove(m);
+
+ m->MarkFlushed();
+ if (max_write_buffer_size_to_maintain_ > 0 ||
+ max_write_buffer_number_to_maintain_ > 0) {
+ memlist_history_.push_front(m);
+ // Unable to get size of mutable memtable at this point, pass 0 to
+ // TrimHistory as a best effort.
+ TrimHistory(to_delete, 0);
+ } else {
+ UnrefMemTable(to_delete, m);
+ }
+}
+
+// return the total memory usage assuming the oldest flushed memtable is dropped
+size_t MemTableListVersion::MemoryAllocatedBytesExcludingLast() const {
+ size_t total_memtable_size = 0;
+ for (auto& memtable : memlist_) {
+ total_memtable_size += memtable->MemoryAllocatedBytes();
+ }
+ for (auto& memtable : memlist_history_) {
+ total_memtable_size += memtable->MemoryAllocatedBytes();
+ }
+ if (!memlist_history_.empty()) {
+ total_memtable_size -= memlist_history_.back()->MemoryAllocatedBytes();
+ }
+ return total_memtable_size;
+}
+
+bool MemTableListVersion::MemtableLimitExceeded(size_t usage) {
+ if (max_write_buffer_size_to_maintain_ > 0) {
+ // calculate the total memory usage after dropping the oldest flushed
+ // memtable, compare with max_write_buffer_size_to_maintain_ to decide
+ // whether to trim history
+ return MemoryAllocatedBytesExcludingLast() + usage >=
+ static_cast<size_t>(max_write_buffer_size_to_maintain_);
+ } else if (max_write_buffer_number_to_maintain_ > 0) {
+ return memlist_.size() + memlist_history_.size() >
+ static_cast<size_t>(max_write_buffer_number_to_maintain_);
+ } else {
+ return false;
+ }
+}
+
+// Make sure we don't use up too much space in history
+bool MemTableListVersion::TrimHistory(autovector<MemTable*>* to_delete,
+ size_t usage) {
+ bool ret = false;
+ while (MemtableLimitExceeded(usage) && !memlist_history_.empty()) {
+ MemTable* x = memlist_history_.back();
+ memlist_history_.pop_back();
+
+ UnrefMemTable(to_delete, x);
+ ret = true;
+ }
+ return ret;
+}
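
TrimHistory() keeps dropping the oldest flushed memtable while MemtableLimitExceeded() reports that the memory budget is still exceeded. The sketch below condenses the size-based branch of that policy; ToyHistory and its byte figures are invented, and unlike the real check it only counts the history memtables.

// Sketch only: ToyHistory and its byte counts are invented; the real check in
// MemtableLimitExceeded() also includes the unflushed memtables.
#include <cstddef>
#include <deque>
#include <iostream>

struct ToyHistory {
  std::deque<size_t> history_bytes;  // per-memtable sizes, back == oldest
  size_t budget_bytes;

  size_t TotalExcludingOldest() const {
    size_t total = 0;
    for (size_t b : history_bytes) total += b;
    if (!history_bytes.empty()) total -= history_bytes.back();
    return total;
  }

  // `usage` plays the role of the mutable memtable's current size.
  size_t Trim(size_t usage) {
    size_t dropped = 0;
    while (!history_bytes.empty() &&
           TotalExcludingOldest() + usage >= budget_bytes) {
      history_bytes.pop_back();  // drop the oldest flushed memtable
      ++dropped;
    }
    return dropped;
  }
};

int main() {
  ToyHistory h{{4 << 20, 4 << 20, 4 << 20}, 8u << 20};
  std::cout << "dropped " << h.Trim(/*usage=*/2 << 20)
            << " flushed memtable(s) from history\n";
  return 0;
}
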
+
+// Returns true if there is at least one memtable on which flush has
+// not yet started.
+bool MemTableList::IsFlushPending() const {
+ if ((flush_requested_ && num_flush_not_started_ > 0) ||
+ (num_flush_not_started_ >= min_write_buffer_number_to_merge_)) {
+ assert(imm_flush_needed.load(std::memory_order_relaxed));
+ return true;
+ }
+ return false;
+}
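
The predicate above reduces to two conditions: an explicit flush request, or enough not-yet-started immutable memtables to reach min_write_buffer_number_to_merge. A standalone restatement of just that logic (the assert on imm_flush_needed is omitted):

// Sketch only: a restatement of the predicate with plain parameters.
#include <iostream>

bool IsFlushPending(bool flush_requested, int num_flush_not_started,
                    int min_write_buffer_number_to_merge) {
  return (flush_requested && num_flush_not_started > 0) ||
         (num_flush_not_started >= min_write_buffer_number_to_merge);
}

int main() {
  std::cout << IsFlushPending(false, 1, 2) << "\n";  // 0: not enough memtables
  std::cout << IsFlushPending(true, 1, 2) << "\n";   // 1: explicit request
  std::cout << IsFlushPending(false, 2, 2) << "\n";  // 1: threshold reached
  return 0;
}
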
+
+bool MemTableList::IsFlushPendingOrRunning() const {
+ if (current_->memlist_.size() - num_flush_not_started_ > 0) {
+ // Flush is already running on at least one memtable
+ return true;
+ }
+ return IsFlushPending();
+}
+
+// Returns the memtables that need to be flushed.
+void MemTableList::PickMemtablesToFlush(uint64_t max_memtable_id,
+ autovector<MemTable*>* ret,
+ uint64_t* max_next_log_number) {
+ AutoThreadOperationStageUpdater stage_updater(
+ ThreadStatus::STAGE_PICK_MEMTABLES_TO_FLUSH);
+ const auto& memlist = current_->memlist_;
+ bool atomic_flush = false;
+
+ // Note: every time MemTableList::Add(mem) is called, it adds the new mem
+ // at the FRONT of the memlist (memlist.push_front(mem)). Therefore, by
+ // iterating through the memlist starting at the end, the vector<MemTable*>
+ // ret is filled with memtables already sorted in increasing MemTable ID.
+ // However, when the mempurge feature is activated, new memtables with older
+ // IDs will be added to the memlist.
+ for (auto it = memlist.rbegin(); it != memlist.rend(); ++it) {
+ MemTable* m = *it;
+ if (!atomic_flush && m->atomic_flush_seqno_ != kMaxSequenceNumber) {
+ atomic_flush = true;
+ }
+ if (m->GetID() > max_memtable_id) {
+ break;
+ }
+ if (!m->flush_in_progress_) {
+ assert(!m->flush_completed_);
+ num_flush_not_started_--;
+ if (num_flush_not_started_ == 0) {
+ imm_flush_needed.store(false, std::memory_order_release);
+ }
+ m->flush_in_progress_ = true; // flushing will start very soon
+ if (max_next_log_number) {
+ *max_next_log_number =
+ std::max(m->GetNextLogNumber(), *max_next_log_number);
+ }
+ ret->push_back(m);
+ } else if (!ret->empty()) {
+ // This `break` is necessary to prevent picking non-consecutive memtables
+ // in case `memlist` has one or more entries with
+ // `flush_in_progress_ == true` sandwiched between entries with
+ // `flush_in_progress_ == false`. This could happen after parallel flushes
+ // are picked and the one flushing older memtables is rolled back.
+ break;
+ }
+ }
+ if (!atomic_flush || num_flush_not_started_ == 0) {
+ flush_requested_ = false; // start-flush request is complete
+ }
+}
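
The picking loop walks from the oldest memtable, respects max_memtable_id, and breaks as soon as the selected run would become non-consecutive. A toy version of that loop follows; ToyMem is invented, and the atomic-flush and log-number bookkeeping of the real function is left out.

// Sketch only: ToyMem is invented; atomic-flush and log-number bookkeeping
// from the real loop are omitted.
#include <cstdint>
#include <iostream>
#include <list>
#include <vector>

struct ToyMem {
  uint64_t id;
  bool flush_in_progress;
};

// Walk from the oldest memtable (back of the list), respect the ID limit, and
// never let the picked run become non-consecutive.
std::vector<ToyMem*> Pick(std::list<ToyMem>& newest_first,
                          uint64_t max_memtable_id) {
  std::vector<ToyMem*> picked;
  for (auto it = newest_first.rbegin(); it != newest_first.rend(); ++it) {
    ToyMem& m = *it;
    if (m.id > max_memtable_id) break;
    if (!m.flush_in_progress) {
      m.flush_in_progress = true;
      picked.push_back(&m);
    } else if (!picked.empty()) {
      break;  // keep the picked run consecutive
    }
  }
  return picked;
}

int main() {
  // front == newest; memtable #2 is already being flushed.
  std::list<ToyMem> mems{{3, false}, {2, true}, {1, false}};
  auto picked = Pick(mems, /*max_memtable_id=*/3);
  std::cout << "picked " << picked.size() << " memtable(s), first id "
            << picked.front()->id << "\n";  // picks only #1
  return 0;
}
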
+
+void MemTableList::RollbackMemtableFlush(const autovector<MemTable*>& mems,
+ uint64_t /*file_number*/) {
+ AutoThreadOperationStageUpdater stage_updater(
+ ThreadStatus::STAGE_MEMTABLE_ROLLBACK);
+ assert(!mems.empty());
+
+ // If the flush was not successful, then just reset state.
+ // Maybe a succeeding attempt to flush will be successful.
+ for (MemTable* m : mems) {
+ assert(m->flush_in_progress_);
+ assert(m->file_number_ == 0);
+
+ m->flush_in_progress_ = false;
+ m->flush_completed_ = false;
+ m->edit_.Clear();
+ num_flush_not_started_++;
+ }
+ imm_flush_needed.store(true, std::memory_order_release);
+}
+
+// Try to record a successful flush in the manifest file. It might just return
+// Status::OK, letting a concurrent flush do the actual recording.
+Status MemTableList::TryInstallMemtableFlushResults(
+ ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options,
+ const autovector<MemTable*>& mems, LogsWithPrepTracker* prep_tracker,
+ VersionSet* vset, InstrumentedMutex* mu, uint64_t file_number,
+ autovector<MemTable*>* to_delete, FSDirectory* db_directory,
+ LogBuffer* log_buffer,
+ std::list<std::unique_ptr<FlushJobInfo>>* committed_flush_jobs_info,
+ bool write_edits) {
+ AutoThreadOperationStageUpdater stage_updater(
+ ThreadStatus::STAGE_MEMTABLE_INSTALL_FLUSH_RESULTS);
+ mu->AssertHeld();
+
+ // Flush was successful
+ // Record the status on the memtable object. Either this call or a call by a
+ // concurrent flush thread will read the status and write it to manifest.
+ for (size_t i = 0; i < mems.size(); ++i) {
+ // All the edits are associated with the first memtable of this batch.
+ assert(i == 0 || mems[i]->GetEdits()->NumEntries() == 0);
+
+ mems[i]->flush_completed_ = true;
+ mems[i]->file_number_ = file_number;
+ }
+
+ // if some other thread is already committing, then return
+ Status s;
+ if (commit_in_progress_) {
+ TEST_SYNC_POINT("MemTableList::TryInstallMemtableFlushResults:InProgress");
+ return s;
+ }
+
+ // Only a single thread can be executing this piece of code
+ commit_in_progress_ = true;
+
+ // Retry until all completed flushes are committed. New flushes can finish
+ // while the current thread is writing to the manifest, during which the
+ // mutex is released.
+ while (s.ok()) {
+ auto& memlist = current_->memlist_;
+ // The back is the oldest; if flush_completed_ is not set on it, this thread
+ // was assigned a more recent memtable. The memtables' flushes must be
+ // recorded in the manifest in order. The concurrent flush thread that was
+ // assigned to flush the oldest memtable will later wake up and do all the
+ // pending manifest writes, in order.
+ if (memlist.empty() || !memlist.back()->flush_completed_) {
+ break;
+ }
+ // scan all memtables from the earliest, and commit those
+ // (in that order) that have finished flushing. Memtables
+ // are always committed in the order that they were created.
+ uint64_t batch_file_number = 0;
+ size_t batch_count = 0;
+ autovector<VersionEdit*> edit_list;
+ autovector<MemTable*> memtables_to_flush;
+ // Enumerate from the last (earliest) element to see how many batches have
+ // finished.
+ for (auto it = memlist.rbegin(); it != memlist.rend(); ++it) {
+ MemTable* m = *it;
+ if (!m->flush_completed_) {
+ break;
+ }
+ if (it == memlist.rbegin() || batch_file_number != m->file_number_) {
+ batch_file_number = m->file_number_;
+ if (m->edit_.GetBlobFileAdditions().empty()) {
+ ROCKS_LOG_BUFFER(log_buffer,
+ "[%s] Level-0 commit table #%" PRIu64 " started",
+ cfd->GetName().c_str(), m->file_number_);
+ } else {
+ ROCKS_LOG_BUFFER(log_buffer,
+ "[%s] Level-0 commit table #%" PRIu64
+ " (+%zu blob files) started",
+ cfd->GetName().c_str(), m->file_number_,
+ m->edit_.GetBlobFileAdditions().size());
+ }
+
+ edit_list.push_back(&m->edit_);
+ memtables_to_flush.push_back(m);
+#ifndef ROCKSDB_LITE
+ std::unique_ptr<FlushJobInfo> info = m->ReleaseFlushJobInfo();
+ if (info != nullptr) {
+ committed_flush_jobs_info->push_back(std::move(info));
+ }
+#else
+ (void)committed_flush_jobs_info;
+#endif // !ROCKSDB_LITE
+ }
+ batch_count++;
+ }
+
+ // TODO(myabandeh): Not sure how batch_count could be 0 here.
+ if (batch_count > 0) {
+ uint64_t min_wal_number_to_keep = 0;
+ assert(edit_list.size() > 0);
+ if (vset->db_options()->allow_2pc) {
+ // Note that if mempurge is successful, the edit_list will not be
+ // applicable (it contains the new min_log number to keep and the level-0
+ // file path of the SST file created during a normal flush, both of which
+ // are irrelevant after a successful mempurge operation).
+ min_wal_number_to_keep = PrecomputeMinLogNumberToKeep2PC(
+ vset, *cfd, edit_list, memtables_to_flush, prep_tracker);
+
+ // We piggyback the information of earliest log file to keep in the
+ // manifest entry for the last file flushed.
+ } else {
+ min_wal_number_to_keep =
+ PrecomputeMinLogNumberToKeepNon2PC(vset, *cfd, edit_list);
+ }
+
+ VersionEdit wal_deletion;
+ wal_deletion.SetMinLogNumberToKeep(min_wal_number_to_keep);
+ if (vset->db_options()->track_and_verify_wals_in_manifest) {
+ if (min_wal_number_to_keep >
+ vset->GetWalSet().GetMinWalNumberToKeep()) {
+ wal_deletion.DeleteWalsBefore(min_wal_number_to_keep);
+ }
+ TEST_SYNC_POINT_CALLBACK(
+ "MemTableList::TryInstallMemtableFlushResults:"
+ "AfterComputeMinWalToKeep",
+ nullptr);
+ }
+ edit_list.push_back(&wal_deletion);
+
+ const auto manifest_write_cb = [this, cfd, batch_count, log_buffer,
+ to_delete, mu](const Status& status) {
+ RemoveMemTablesOrRestoreFlags(status, cfd, batch_count, log_buffer,
+ to_delete, mu);
+ };
+ if (write_edits) {
+ // this can release and reacquire the mutex.
+ s = vset->LogAndApply(cfd, mutable_cf_options, edit_list, mu,
+ db_directory, /*new_descriptor_log=*/false,
+ /*column_family_options=*/nullptr,
+ manifest_write_cb);
+ } else {
+ // If write_edits is false (e.g., after a successful mempurge),
+ // then remove old memtables, wake up manifest write queue threads,
+ // and don't commit anything to the manifest file.
+ RemoveMemTablesOrRestoreFlags(s, cfd, batch_count, log_buffer,
+ to_delete, mu);
+ // Note: cfd->SetLogNumber is only called when a VersionEdit
+ // is written to MANIFEST. When mempurge is successful, we skip
+ // this step, so cfd->GetLogNumber always points to the
+ // earliest log with unflushed data.
+ // Notify the new head of the manifest write queue and
+ // wake up all the waiting writers.
+ // TODO(bjlemaire): explain the full reason WakeUpWaitingManifestWriters
+ // is needed, or investigate further.
+ vset->WakeUpWaitingManifestWriters();
+ }
+ }
+ }
+ commit_in_progress_ = false;
+ return s;
+}
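
The heart of the commit loop is counting how many memtables at the oldest end have completed their flush and how many distinct L0 files (hence VersionEdits) the batch covers, while commit_in_progress_ keeps other threads out. A reduced sketch of that counting step; ToyMem is an invented stand-in, and the real loop also collects the edits and FlushJobInfo per file.

// Sketch only: ToyMem is invented; the real loop also collects the
// VersionEdits and FlushJobInfo for each new L0 file.
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <list>

struct ToyMem {
  uint64_t file_number;
  bool flush_completed;
};

// Count how many memtables at the oldest end are ready to commit, and how
// many distinct L0 files (one edit per file) that batch covers.
void CountCommittableBatch(const std::list<ToyMem>& newest_first,
                           size_t* batch_count, size_t* num_edits) {
  *batch_count = 0;
  *num_edits = 0;
  uint64_t last_file = 0;
  for (auto it = newest_first.rbegin(); it != newest_first.rend(); ++it) {
    if (!it->flush_completed) break;  // commits must stay in FIFO order
    if (*batch_count == 0 || it->file_number != last_file) {
      last_file = it->file_number;
      ++(*num_edits);
    }
    ++(*batch_count);
  }
}

int main() {
  // The two oldest memtables were flushed together into table #9; the newest
  // one has not finished flushing yet.
  std::list<ToyMem> mems{{0, false}, {9, true}, {9, true}};
  size_t batch = 0, edits = 0;
  CountCommittableBatch(mems, &batch, &edits);
  std::cout << batch << " memtable(s) ready, " << edits << " edit(s)\n";
  return 0;
}
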
+
+// New memtables are inserted at the front of the list.
+void MemTableList::Add(MemTable* m, autovector<MemTable*>* to_delete) {
+ assert(static_cast<int>(current_->memlist_.size()) >= num_flush_not_started_);
+ InstallNewVersion();
+ // This method is used to move a mutable memtable into the immutable list.
+ // Since the mutable memtable is already refcounted by the DBImpl, and we
+ // don't unref it when moving it to the immutable list, we don't have to
+ // ref the memtable here: we simply take over the reference from the DBImpl.
+ current_->Add(m, to_delete);
+ m->MarkImmutable();
+ num_flush_not_started_++;
+ if (num_flush_not_started_ == 1) {
+ imm_flush_needed.store(true, std::memory_order_release);
+ }
+ UpdateCachedValuesFromMemTableListVersion();
+ ResetTrimHistoryNeeded();
+}
+
+bool MemTableList::TrimHistory(autovector<MemTable*>* to_delete, size_t usage) {
+ InstallNewVersion();
+ bool ret = current_->TrimHistory(to_delete, usage);
+ UpdateCachedValuesFromMemTableListVersion();
+ ResetTrimHistoryNeeded();
+ return ret;
+}
+
+// Returns an estimate of the number of bytes of data in use.
+size_t MemTableList::ApproximateUnflushedMemTablesMemoryUsage() {
+ size_t total_size = 0;
+ for (auto& memtable : current_->memlist_) {
+ total_size += memtable->ApproximateMemoryUsage();
+ }
+ return total_size;
+}
+
+size_t MemTableList::ApproximateMemoryUsage() { return current_memory_usage_; }
+
+size_t MemTableList::MemoryAllocatedBytesExcludingLast() const {
+ const size_t usage = current_memory_allocted_bytes_excluding_last_.load(
+ std::memory_order_relaxed);
+ return usage;
+}
+
+bool MemTableList::HasHistory() const {
+ const bool has_history = current_has_history_.load(std::memory_order_relaxed);
+ return has_history;
+}
+
+void MemTableList::UpdateCachedValuesFromMemTableListVersion() {
+ const size_t total_memtable_size =
+ current_->MemoryAllocatedBytesExcludingLast();
+ current_memory_allocted_bytes_excluding_last_.store(
+ total_memtable_size, std::memory_order_relaxed);
+
+ const bool has_history = current_->HasHistory();
+ current_has_history_.store(has_history, std::memory_order_relaxed);
+}
+
+uint64_t MemTableList::ApproximateOldestKeyTime() const {
+ if (!current_->memlist_.empty()) {
+ return current_->memlist_.back()->ApproximateOldestKeyTime();
+ }
+ return std::numeric_limits<uint64_t>::max();
+}
+
+void MemTableList::InstallNewVersion() {
+ if (current_->refs_ == 1) {
+ // we're the only one using the version, just keep using it
+ } else {
+ // somebody else holds the current version, we need to create new one
+ MemTableListVersion* version = current_;
+ current_ = new MemTableListVersion(&current_memory_usage_, *version);
+ current_->Ref();
+ version->Unref();
+ }
+}
+
+void MemTableList::RemoveMemTablesOrRestoreFlags(
+ const Status& s, ColumnFamilyData* cfd, size_t batch_count,
+ LogBuffer* log_buffer, autovector<MemTable*>* to_delete,
+ InstrumentedMutex* mu) {
+ assert(mu);
+ mu->AssertHeld();
+ assert(to_delete);
+ // we will be changing the version in the next code path,
+ // so we better create a new one, since versions are immutable
+ InstallNewVersion();
+
+ // All the later memtables that have the same filenum
+ // are part of the same batch. They can be committed now.
+ uint64_t mem_id = 1; // how many memtables have been flushed.
+
+ // commit new state only if the column family is NOT dropped.
+ // The reason is as follows (refer to
+ // ColumnFamilyTest.FlushAndDropRaceCondition).
+ // If the column family is dropped, then according to LogAndApply, its
+ // corresponding flush operation is NOT written to the MANIFEST. This
+ // means the DB is not aware of the L0 files generated from the flush.
+ // By committing the new state, we remove the memtable from the memtable
+ // list. Creating an iterator on this column family will not be able to
+ // read full data since the memtable is removed, and the DB is not aware
+ // of the L0 files, leaving the MergingIterator unable to build child
+ // iterators. RocksDB contract requires that the iterator can be created
+ // on a dropped column family, and we must be able to
+ // read full data as long as column family handle is not deleted, even if
+ // the column family is dropped.
+ if (s.ok() && !cfd->IsDropped()) { // commit new state
+ while (batch_count-- > 0) {
+ MemTable* m = current_->memlist_.back();
+ if (m->edit_.GetBlobFileAdditions().empty()) {
+ ROCKS_LOG_BUFFER(log_buffer,
+ "[%s] Level-0 commit table #%" PRIu64
+ ": memtable #%" PRIu64 " done",
+ cfd->GetName().c_str(), m->file_number_, mem_id);
+ } else {
+ ROCKS_LOG_BUFFER(log_buffer,
+ "[%s] Level-0 commit table #%" PRIu64
+ " (+%zu blob files)"
+ ": memtable #%" PRIu64 " done",
+ cfd->GetName().c_str(), m->file_number_,
+ m->edit_.GetBlobFileAdditions().size(), mem_id);
+ }
+
+ assert(m->file_number_ > 0);
+ current_->Remove(m, to_delete);
+ UpdateCachedValuesFromMemTableListVersion();
+ ResetTrimHistoryNeeded();
+ ++mem_id;
+ }
+ } else {
+ for (auto it = current_->memlist_.rbegin(); batch_count-- > 0; ++it) {
+ MemTable* m = *it;
+ // commit failed. setup state so that we can flush again.
+ if (m->edit_.GetBlobFileAdditions().empty()) {
+ ROCKS_LOG_BUFFER(log_buffer,
+ "Level-0 commit table #%" PRIu64 ": memtable #%" PRIu64
+ " failed",
+ m->file_number_, mem_id);
+ } else {
+ ROCKS_LOG_BUFFER(log_buffer,
+ "Level-0 commit table #%" PRIu64
+ " (+%zu blob files)"
+ ": memtable #%" PRIu64 " failed",
+ m->file_number_,
+ m->edit_.GetBlobFileAdditions().size(), mem_id);
+ }
+
+ m->flush_completed_ = false;
+ m->flush_in_progress_ = false;
+ m->edit_.Clear();
+ num_flush_not_started_++;
+ m->file_number_ = 0;
+ imm_flush_needed.store(true, std::memory_order_release);
+ ++mem_id;
+ }
+ }
+}
+
+uint64_t MemTableList::PrecomputeMinLogContainingPrepSection(
+ const std::unordered_set<MemTable*>* memtables_to_flush) {
+ uint64_t min_log = 0;
+
+ for (auto& m : current_->memlist_) {
+ if (memtables_to_flush && memtables_to_flush->count(m)) {
+ continue;
+ }
+
+ auto log = m->GetMinLogContainingPrepSection();
+
+ if (log > 0 && (min_log == 0 || log < min_log)) {
+ min_log = log;
+ }
+ }
+
+ return min_log;
+}
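
The function takes the minimum, over every memtable that is not about to be flushed, of the earliest WAL holding one of its prepared sections, with 0 meaning "none". A standalone restatement using plain vectors and indexes instead of MemTable pointers:

// Sketch only: memtables are represented by the WAL number of their earliest
// prepared section (0 == none), indexed by position instead of pointer.
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <unordered_set>
#include <vector>

uint64_t MinPrepLog(const std::vector<uint64_t>& prep_logs,
                    const std::unordered_set<size_t>& being_flushed) {
  uint64_t min_log = 0;
  for (size_t i = 0; i < prep_logs.size(); ++i) {
    if (being_flushed.count(i)) continue;  // its prep sections get persisted
    uint64_t log = prep_logs[i];
    if (log > 0 && (min_log == 0 || log < min_log)) {
      min_log = log;
    }
  }
  return min_log;
}

int main() {
  // Memtable 0 references WAL 5, memtable 1 references WAL 3 but is being
  // flushed, memtable 2 has no prepared section.
  std::cout << MinPrepLog({5, 3, 0}, {1}) << "\n";  // prints 5
  return 0;
}
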
+
+// Commit a successful atomic flush in the manifest file.
+Status InstallMemtableAtomicFlushResults(
+ const autovector<MemTableList*>* imm_lists,
+ const autovector<ColumnFamilyData*>& cfds,
+ const autovector<const MutableCFOptions*>& mutable_cf_options_list,
+ const autovector<const autovector<MemTable*>*>& mems_list, VersionSet* vset,
+ LogsWithPrepTracker* prep_tracker, InstrumentedMutex* mu,
+ const autovector<FileMetaData*>& file_metas,
+ const autovector<std::list<std::unique_ptr<FlushJobInfo>>*>&
+ committed_flush_jobs_info,
+ autovector<MemTable*>* to_delete, FSDirectory* db_directory,
+ LogBuffer* log_buffer) {
+ AutoThreadOperationStageUpdater stage_updater(
+ ThreadStatus::STAGE_MEMTABLE_INSTALL_FLUSH_RESULTS);
+ mu->AssertHeld();
+
+ size_t num = mems_list.size();
+ assert(cfds.size() == num);
+ if (imm_lists != nullptr) {
+ assert(imm_lists->size() == num);
+ }
+ if (num == 0) {
+ return Status::OK();
+ }
+
+ for (size_t k = 0; k != num; ++k) {
+#ifndef NDEBUG
+ const auto* imm =
+ (imm_lists == nullptr) ? cfds[k]->imm() : imm_lists->at(k);
+ if (!mems_list[k]->empty()) {
+ assert((*mems_list[k])[0]->GetID() == imm->GetEarliestMemTableID());
+ }
+#endif
+ assert(nullptr != file_metas[k]);
+ for (size_t i = 0; i != mems_list[k]->size(); ++i) {
+ assert(i == 0 || (*mems_list[k])[i]->GetEdits()->NumEntries() == 0);
+ (*mems_list[k])[i]->SetFlushCompleted(true);
+ (*mems_list[k])[i]->SetFileNumber(file_metas[k]->fd.GetNumber());
+ }
+#ifndef ROCKSDB_LITE
+ if (committed_flush_jobs_info[k]) {
+ assert(!mems_list[k]->empty());
+ assert((*mems_list[k])[0]);
+ std::unique_ptr<FlushJobInfo> flush_job_info =
+ (*mems_list[k])[0]->ReleaseFlushJobInfo();
+ committed_flush_jobs_info[k]->push_back(std::move(flush_job_info));
+ }
+#else //! ROCKSDB_LITE
+ (void)committed_flush_jobs_info;
+#endif // ROCKSDB_LITE
+ }
+
+ Status s;
+
+ autovector<autovector<VersionEdit*>> edit_lists;
+ uint32_t num_entries = 0;
+ for (const auto mems : mems_list) {
+ assert(mems != nullptr);
+ autovector<VersionEdit*> edits;
+ assert(!mems->empty());
+ edits.emplace_back((*mems)[0]->GetEdits());
+ ++num_entries;
+ edit_lists.emplace_back(edits);
+ }
+
+ WalNumber min_wal_number_to_keep = 0;
+ if (vset->db_options()->allow_2pc) {
+ min_wal_number_to_keep = PrecomputeMinLogNumberToKeep2PC(
+ vset, cfds, edit_lists, mems_list, prep_tracker);
+ } else {
+ min_wal_number_to_keep =
+ PrecomputeMinLogNumberToKeepNon2PC(vset, cfds, edit_lists);
+ }
+
+ VersionEdit wal_deletion;
+ wal_deletion.SetMinLogNumberToKeep(min_wal_number_to_keep);
+ if (vset->db_options()->track_and_verify_wals_in_manifest &&
+ min_wal_number_to_keep > vset->GetWalSet().GetMinWalNumberToKeep()) {
+ wal_deletion.DeleteWalsBefore(min_wal_number_to_keep);
+ }
+ edit_lists.back().push_back(&wal_deletion);
+ ++num_entries;
+
+ // Mark the version edits as an atomic group if the number of version edits
+ // exceeds 1.
+ if (cfds.size() > 1) {
+ for (size_t i = 0; i < edit_lists.size(); i++) {
+ assert((edit_lists[i].size() == 1) ||
+ ((edit_lists[i].size() == 2) && (i == edit_lists.size() - 1)));
+ for (auto& e : edit_lists[i]) {
+ e->MarkAtomicGroup(--num_entries);
+ }
+ }
+ assert(0 == num_entries);
+ }
+
+ // this can release and reacquire the mutex.
+ s = vset->LogAndApply(cfds, mutable_cf_options_list, edit_lists, mu,
+ db_directory);
+
+ for (size_t k = 0; k != cfds.size(); ++k) {
+ auto* imm = (imm_lists == nullptr) ? cfds[k]->imm() : imm_lists->at(k);
+ imm->InstallNewVersion();
+ }
+
+ if (s.ok() || s.IsColumnFamilyDropped()) {
+ for (size_t i = 0; i != cfds.size(); ++i) {
+ if (cfds[i]->IsDropped()) {
+ continue;
+ }
+ auto* imm = (imm_lists == nullptr) ? cfds[i]->imm() : imm_lists->at(i);
+ for (auto m : *mems_list[i]) {
+ assert(m->GetFileNumber() > 0);
+ uint64_t mem_id = m->GetID();
+
+ const VersionEdit* const edit = m->GetEdits();
+ assert(edit);
+
+ if (edit->GetBlobFileAdditions().empty()) {
+ ROCKS_LOG_BUFFER(log_buffer,
+ "[%s] Level-0 commit table #%" PRIu64
+ ": memtable #%" PRIu64 " done",
+ cfds[i]->GetName().c_str(), m->GetFileNumber(),
+ mem_id);
+ } else {
+ ROCKS_LOG_BUFFER(log_buffer,
+ "[%s] Level-0 commit table #%" PRIu64
+ " (+%zu blob files)"
+ ": memtable #%" PRIu64 " done",
+ cfds[i]->GetName().c_str(), m->GetFileNumber(),
+ edit->GetBlobFileAdditions().size(), mem_id);
+ }
+
+ imm->current_->Remove(m, to_delete);
+ imm->UpdateCachedValuesFromMemTableListVersion();
+ imm->ResetTrimHistoryNeeded();
+ }
+ }
+ } else {
+ for (size_t i = 0; i != cfds.size(); ++i) {
+ auto* imm = (imm_lists == nullptr) ? cfds[i]->imm() : imm_lists->at(i);
+ for (auto m : *mems_list[i]) {
+ uint64_t mem_id = m->GetID();
+
+ const VersionEdit* const edit = m->GetEdits();
+ assert(edit);
+
+ if (edit->GetBlobFileAdditions().empty()) {
+ ROCKS_LOG_BUFFER(log_buffer,
+ "[%s] Level-0 commit table #%" PRIu64
+ ": memtable #%" PRIu64 " failed",
+ cfds[i]->GetName().c_str(), m->GetFileNumber(),
+ mem_id);
+ } else {
+ ROCKS_LOG_BUFFER(log_buffer,
+ "[%s] Level-0 commit table #%" PRIu64
+ " (+%zu blob files)"
+ ": memtable #%" PRIu64 " failed",
+ cfds[i]->GetName().c_str(), m->GetFileNumber(),
+ edit->GetBlobFileAdditions().size(), mem_id);
+ }
+
+ m->SetFlushCompleted(false);
+ m->SetFlushInProgress(false);
+ m->GetEdits()->Clear();
+ m->SetFileNumber(0);
+ imm->num_flush_not_started_++;
+ }
+ imm->imm_flush_needed.store(true, std::memory_order_release);
+ }
+ }
+
+ return s;
+}
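
When several column families flush atomically, each VersionEdit is tagged with the number of edits still to follow, so recovery can detect a partially persisted group. The sketch below reproduces the MarkAtomicGroup(--num_entries) countdown from the code above; ToyEdit is an invented stand-in for VersionEdit.

// Sketch only: ToyEdit stands in for VersionEdit; the countdown mirrors the
// MarkAtomicGroup(--num_entries) calls above.
#include <cstdint>
#include <iostream>
#include <vector>

struct ToyEdit {
  uint32_t remaining_entries = 0;  // edits still to come in this atomic group
};

void MarkAtomicGroup(std::vector<std::vector<ToyEdit>>& edit_lists) {
  uint32_t num_entries = 0;
  for (const auto& edits : edit_lists) {
    num_entries += static_cast<uint32_t>(edits.size());
  }
  // The first edit written says N-1 entries follow, the last one says 0, so
  // recovery can spot a truncated group.
  for (auto& edits : edit_lists) {
    for (auto& e : edits) {
      e.remaining_entries = --num_entries;
    }
  }
}

int main() {
  std::vector<std::vector<ToyEdit>> edit_lists{{ToyEdit{}},
                                               {ToyEdit{}, ToyEdit{}}};
  MarkAtomicGroup(edit_lists);
  for (const auto& edits : edit_lists) {
    for (const auto& e : edits) {
      std::cout << e.remaining_entries << " ";  // prints "2 1 0"
    }
  }
  std::cout << "\n";
  return 0;
}
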
+
+void MemTableList::RemoveOldMemTables(uint64_t log_number,
+ autovector<MemTable*>* to_delete) {
+ assert(to_delete != nullptr);
+ InstallNewVersion();
+ auto& memlist = current_->memlist_;
+ autovector<MemTable*> old_memtables;
+ for (auto it = memlist.rbegin(); it != memlist.rend(); ++it) {
+ MemTable* mem = *it;
+ if (mem->GetNextLogNumber() > log_number) {
+ break;
+ }
+ old_memtables.push_back(mem);
+ }
+
+ for (auto it = old_memtables.begin(); it != old_memtables.end(); ++it) {
+ MemTable* mem = *it;
+ current_->Remove(mem, to_delete);
+ --num_flush_not_started_;
+ if (0 == num_flush_not_started_) {
+ imm_flush_needed.store(false, std::memory_order_release);
+ }
+ }
+
+ UpdateCachedValuesFromMemTableListVersion();
+ ResetTrimHistoryNeeded();
+}
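
RemoveOldMemTables() drops, starting from the oldest end, every memtable whose data is fully covered by WALs up to log_number (mem->GetNextLogNumber() <= log_number). A minimal sketch of just that selection, representing each memtable by its next-log number:

// Sketch only: each memtable is represented by its next-log number.
#include <cstdint>
#include <iostream>
#include <list>

// During secondary log replay, a memtable can be dropped once its data is
// entirely covered by WALs up to `log_number`.
int CountRemovable(const std::list<uint64_t>& next_logs_newest_first,
                   uint64_t log_number) {
  int removable = 0;
  for (auto it = next_logs_newest_first.rbegin();
       it != next_logs_newest_first.rend(); ++it) {
    if (*it > log_number) break;  // this and all newer memtables must stay
    ++removable;
  }
  return removable;
}

int main() {
  // Oldest memtable needs WAL 5, then 7; the newest needs WAL 9.
  std::list<uint64_t> next_logs{9, 7, 5};  // front == newest
  std::cout << CountRemovable(next_logs, /*log_number=*/7) << "\n";  // prints 2
  return 0;
}
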
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/memtable_list.h b/src/rocksdb/db/memtable_list.h
new file mode 100644
index 000000000..1ad28a59e
--- /dev/null
+++ b/src/rocksdb/db/memtable_list.h
@@ -0,0 +1,471 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#pragma once
+
+#include <deque>
+#include <limits>
+#include <list>
+#include <set>
+#include <string>
+#include <vector>
+
+#include "db/logs_with_prep_tracker.h"
+#include "db/memtable.h"
+#include "db/range_del_aggregator.h"
+#include "file/filename.h"
+#include "logging/log_buffer.h"
+#include "monitoring/instrumented_mutex.h"
+#include "rocksdb/db.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/options.h"
+#include "rocksdb/types.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class ColumnFamilyData;
+class InternalKeyComparator;
+class InstrumentedMutex;
+class MergeIteratorBuilder;
+class MemTableList;
+
+struct FlushJobInfo;
+
+// Keeps a list of immutable memtables. The list is immutable if the refcount
+// is bigger than one. It is used as a state for the Get() and iterator code
+// paths.
+//
+// This class is not thread-safe. External synchronization is required
+// (such as holding the db mutex or being on the write thread).
+class MemTableListVersion {
+ public:
+ explicit MemTableListVersion(size_t* parent_memtable_list_memory_usage,
+ const MemTableListVersion& old);
+ explicit MemTableListVersion(size_t* parent_memtable_list_memory_usage,
+ int max_write_buffer_number_to_maintain,
+ int64_t max_write_buffer_size_to_maintain);
+
+ void Ref();
+ void Unref(autovector<MemTable*>* to_delete = nullptr);
+
+ // Search all the memtables starting from the most recent one.
+ // Return the most recent value found, if any.
+ //
+ // If any operation was found for this key, its most recent sequence number
+ // will be stored in *seq on success (regardless of whether true/false is
+ // returned). Otherwise, *seq will be set to kMaxSequenceNumber.
+ bool Get(const LookupKey& key, std::string* value,
+ PinnableWideColumns* columns, std::string* timestamp, Status* s,
+ MergeContext* merge_context,
+ SequenceNumber* max_covering_tombstone_seq, SequenceNumber* seq,
+ const ReadOptions& read_opts, ReadCallback* callback = nullptr,
+ bool* is_blob_index = nullptr);
+
+ bool Get(const LookupKey& key, std::string* value,
+ PinnableWideColumns* columns, std::string* timestamp, Status* s,
+ MergeContext* merge_context,
+ SequenceNumber* max_covering_tombstone_seq,
+ const ReadOptions& read_opts, ReadCallback* callback = nullptr,
+ bool* is_blob_index = nullptr) {
+ SequenceNumber seq;
+ return Get(key, value, columns, timestamp, s, merge_context,
+ max_covering_tombstone_seq, &seq, read_opts, callback,
+ is_blob_index);
+ }
+
+ void MultiGet(const ReadOptions& read_options, MultiGetRange* range,
+ ReadCallback* callback);
+
+ // Returns all the merge operands corresponding to the key by searching all
+ // memtables starting from the most recent one.
+ bool GetMergeOperands(const LookupKey& key, Status* s,
+ MergeContext* merge_context,
+ SequenceNumber* max_covering_tombstone_seq,
+ const ReadOptions& read_opts);
+
+ // Similar to Get(), but searches the Memtable history of memtables that
+ // have already been flushed. Should only be used from in-memory only
+ // queries (such as Transaction validation) as the history may contain
+ // writes that are also present in the SST files.
+ bool GetFromHistory(const LookupKey& key, std::string* value,
+ PinnableWideColumns* columns, std::string* timestamp,
+ Status* s, MergeContext* merge_context,
+ SequenceNumber* max_covering_tombstone_seq,
+ SequenceNumber* seq, const ReadOptions& read_opts,
+ bool* is_blob_index = nullptr);
+ bool GetFromHistory(const LookupKey& key, std::string* value,
+ PinnableWideColumns* columns, std::string* timestamp,
+ Status* s, MergeContext* merge_context,
+ SequenceNumber* max_covering_tombstone_seq,
+ const ReadOptions& read_opts,
+ bool* is_blob_index = nullptr) {
+ SequenceNumber seq;
+ return GetFromHistory(key, value, columns, timestamp, s, merge_context,
+ max_covering_tombstone_seq, &seq, read_opts,
+ is_blob_index);
+ }
+
+ Status AddRangeTombstoneIterators(const ReadOptions& read_opts, Arena* arena,
+ RangeDelAggregator* range_del_agg);
+
+ void AddIterators(const ReadOptions& options,
+ std::vector<InternalIterator*>* iterator_list,
+ Arena* arena);
+
+ void AddIterators(const ReadOptions& options,
+ MergeIteratorBuilder* merge_iter_builder,
+ bool add_range_tombstone_iter);
+
+ uint64_t GetTotalNumEntries() const;
+
+ uint64_t GetTotalNumDeletes() const;
+
+ MemTable::MemTableStats ApproximateStats(const Slice& start_ikey,
+ const Slice& end_ikey);
+
+ // Returns the value of MemTable::GetEarliestSequenceNumber() on the most
+ // recent MemTable in this list or kMaxSequenceNumber if the list is empty.
+ // If include_history=true, will also search Memtables in MemTableList
+ // History.
+ SequenceNumber GetEarliestSequenceNumber(bool include_history = false) const;
+
+ // Return the first sequence number from the memtable list, which is the
+ // smallest sequence number of all FirstSequenceNumber.
+ // Return kMaxSequenceNumber if the list is empty.
+ SequenceNumber GetFirstSequenceNumber() const;
+
+ private:
+ friend class MemTableList;
+
+ friend Status InstallMemtableAtomicFlushResults(
+ const autovector<MemTableList*>* imm_lists,
+ const autovector<ColumnFamilyData*>& cfds,
+ const autovector<const MutableCFOptions*>& mutable_cf_options_list,
+ const autovector<const autovector<MemTable*>*>& mems_list,
+ VersionSet* vset, LogsWithPrepTracker* prep_tracker,
+ InstrumentedMutex* mu, const autovector<FileMetaData*>& file_meta,
+ const autovector<std::list<std::unique_ptr<FlushJobInfo>>*>&
+ committed_flush_jobs_info,
+ autovector<MemTable*>* to_delete, FSDirectory* db_directory,
+ LogBuffer* log_buffer);
+
+ // REQUIRE: m is an immutable memtable
+ void Add(MemTable* m, autovector<MemTable*>* to_delete);
+ // REQUIRE: m is an immutable memtable
+ void Remove(MemTable* m, autovector<MemTable*>* to_delete);
+
+ // Return true if memtable is trimmed
+ bool TrimHistory(autovector<MemTable*>* to_delete, size_t usage);
+
+ bool GetFromList(std::list<MemTable*>* list, const LookupKey& key,
+ std::string* value, PinnableWideColumns* columns,
+ std::string* timestamp, Status* s,
+ MergeContext* merge_context,
+ SequenceNumber* max_covering_tombstone_seq,
+ SequenceNumber* seq, const ReadOptions& read_opts,
+ ReadCallback* callback = nullptr,
+ bool* is_blob_index = nullptr);
+
+ void AddMemTable(MemTable* m);
+
+ void UnrefMemTable(autovector<MemTable*>* to_delete, MemTable* m);
+
+ // Calculate the total amount of memory used by memlist_ and memlist_history_
+ // excluding the last MemTable in memlist_history_. The reason for excluding
+ // the last MemTable is to see if dropping the last MemTable will keep total
+ // memory usage above or equal to max_write_buffer_size_to_maintain_
+ size_t MemoryAllocatedBytesExcludingLast() const;
+
+ // Whether this version contains flushed memtables that are only kept around
+ // for transaction conflict checking.
+ bool HasHistory() const { return !memlist_history_.empty(); }
+
+ bool MemtableLimitExceeded(size_t usage);
+
+ // Immutable MemTables that have not yet been flushed.
+ std::list<MemTable*> memlist_;
+
+ // MemTables that have already been flushed
+ // (used during Transaction validation)
+ std::list<MemTable*> memlist_history_;
+
+ // Maximum number of MemTables to keep in memory (including both flushed
+ // and not-yet-flushed tables).
+ const int max_write_buffer_number_to_maintain_;
+ // Maximum size of MemTables to keep in memory (including both flushed
+ // and not-yet-flushed tables).
+ const int64_t max_write_buffer_size_to_maintain_;
+
+ int refs_ = 0;
+
+ size_t* parent_memtable_list_memory_usage_;
+};
+
+// This class stores references to all the immutable memtables.
+// The memtables are flushed to L0 as soon as possible and in
+// any order. If there is more than one immutable memtable, their
+// flushes can occur concurrently. However, they are 'committed'
+// to the manifest in FIFO order to maintain correctness and
+// recoverability from a crash.
+//
+//
+// Other than imm_flush_needed and imm_trim_needed, this class is not
+// thread-safe and requires external synchronization (such as holding the db
+// mutex or being on the write thread).
+class MemTableList {
+ public:
+ // A list of memtables.
+ explicit MemTableList(int min_write_buffer_number_to_merge,
+ int max_write_buffer_number_to_maintain,
+ int64_t max_write_buffer_size_to_maintain)
+ : imm_flush_needed(false),
+ imm_trim_needed(false),
+ min_write_buffer_number_to_merge_(min_write_buffer_number_to_merge),
+ current_(new MemTableListVersion(&current_memory_usage_,
+ max_write_buffer_number_to_maintain,
+ max_write_buffer_size_to_maintain)),
+ num_flush_not_started_(0),
+ commit_in_progress_(false),
+ flush_requested_(false),
+ current_memory_usage_(0),
+ current_memory_allocted_bytes_excluding_last_(0),
+ current_has_history_(false) {
+ current_->Ref();
+ }
+
+ // Should not delete MemTableList without making sure MemTableList::current()
+ // is Unref()'d.
+ ~MemTableList() {}
+
+ MemTableListVersion* current() const { return current_; }
+
+ // Atomic flag that background threads can check to determine whether
+ // there is anything more to start flushing.
+ std::atomic<bool> imm_flush_needed;
+
+ std::atomic<bool> imm_trim_needed;
+
+ // Returns the total number of memtables in the list that haven't yet
+ // been flushed and logged.
+ int NumNotFlushed() const;
+
+ // Returns total number of memtables in the list that have been
+ // completely flushed and logged.
+ int NumFlushed() const;
+
+ // Returns true if there is at least one memtable on which flush has
+ // not yet started.
+ bool IsFlushPending() const;
+
+ // Returns true if there is at least one memtable that is pending flush or
+ // flushing.
+ bool IsFlushPendingOrRunning() const;
+
+ // Returns the earliest memtables that need to be flushed. The returned
+ // memtables are guaranteed to be in the ascending order of created time.
+ void PickMemtablesToFlush(uint64_t max_memtable_id,
+ autovector<MemTable*>* mems,
+ uint64_t* max_next_log_number = nullptr);
+
+ // Reset the flush status of the given memtables back to the pending state
+ // so that they can get picked up again in the next round of flush.
+ void RollbackMemtableFlush(const autovector<MemTable*>& mems,
+ uint64_t file_number);
+
+ // Try to commit a successful flush in the manifest file. It might just
+ // return Status::OK, letting a concurrent flush do the actual recording.
+ Status TryInstallMemtableFlushResults(
+ ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options,
+ const autovector<MemTable*>& m, LogsWithPrepTracker* prep_tracker,
+ VersionSet* vset, InstrumentedMutex* mu, uint64_t file_number,
+ autovector<MemTable*>* to_delete, FSDirectory* db_directory,
+ LogBuffer* log_buffer,
+ std::list<std::unique_ptr<FlushJobInfo>>* committed_flush_jobs_info,
+ bool write_edits = true);
+
+ // New memtables are inserted at the front of the list.
+ // Takes ownership of the reference held on *m by the caller of Add().
+ // By default, adding memtables will flag that the memtable list needs to be
+ // flushed, but in certain situations, like after a mempurge, we may want to
+ // avoid flushing the memtable list upon addition of a memtable.
+ void Add(MemTable* m, autovector<MemTable*>* to_delete);
+
+ // Returns an estimate of the number of bytes of data in use.
+ size_t ApproximateMemoryUsage();
+
+ // Returns the cached current_memory_allocted_bytes_excluding_last_ value.
+ size_t MemoryAllocatedBytesExcludingLast() const;
+
+ // Returns the cached current_has_history_ value.
+ bool HasHistory() const;
+
+ // Updates current_memory_allocted_bytes_excluding_last_ and
+ // current_has_history_ from MemTableListVersion. Must be called whenever
+ // InstallNewVersion is called.
+ void UpdateCachedValuesFromMemTableListVersion();
+
+ // `usage` is the current size of the mutable Memtable. When
+ // max_write_buffer_size_to_maintain is used, total size of mutable and
+ // immutable memtables is checked against it to decide whether to trim
+ // memtable list.
+ //
+ // Return true if memtable is trimmed
+ bool TrimHistory(autovector<MemTable*>* to_delete, size_t usage);
+
+ // Returns an estimate of the number of bytes of data used by
+ // the unflushed mem-tables.
+ size_t ApproximateUnflushedMemTablesMemoryUsage();
+
+ // Returns an estimate of the timestamp of the earliest key.
+ uint64_t ApproximateOldestKeyTime() const;
+
+ // Request a flush of all existing memtables to storage. This will
+ // cause future calls to IsFlushPending() to return true if this list is
+ // non-empty (regardless of the min_write_buffer_number_to_merge
+ // parameter). This flush request will persist until the next time
+ // PickMemtablesToFlush() is called.
+ void FlushRequested() {
+ flush_requested_ = true;
+ // If there are some memtables stored in imm() that don't trigger
+ // flush (e.g., mempurge output memtables), then update imm_flush_needed.
+ // Note: if there is a race and imm_flush_needed is set to true
+ // while num_flush_not_started_ == 0, there is no impact whatsoever;
+ // imm_flush_needed is only used in an assert in IsFlushPending().
+ if (num_flush_not_started_ > 0) {
+ imm_flush_needed.store(true, std::memory_order_release);
+ }
+ }
+
+ bool HasFlushRequested() { return flush_requested_; }
+
+ // Returns true if a trim history should be scheduled and the caller should
+ // be the one to schedule it
+ bool MarkTrimHistoryNeeded() {
+ auto expected = false;
+ return imm_trim_needed.compare_exchange_strong(
+ expected, true, std::memory_order_relaxed, std::memory_order_relaxed);
+ }
+
+ void ResetTrimHistoryNeeded() {
+ auto expected = true;
+ imm_trim_needed.compare_exchange_strong(
+ expected, false, std::memory_order_relaxed, std::memory_order_relaxed);
+ }
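
MarkTrimHistoryNeeded() and ResetTrimHistoryNeeded() use a single compare-and-swap so that exactly one caller wins the right to schedule a trim until it is reset. The same handshake, extracted into a runnable snippet on a bare std::atomic<bool>:

// Sketch only: the handshake above, isolated on a plain atomic flag.
#include <atomic>
#include <iostream>

std::atomic<bool> trim_needed{false};

bool MarkTrimHistoryNeeded() {  // true => this caller schedules the trim
  bool expected = false;
  return trim_needed.compare_exchange_strong(expected, true,
                                             std::memory_order_relaxed,
                                             std::memory_order_relaxed);
}

void ResetTrimHistoryNeeded() {  // trim done; allow the next scheduling
  bool expected = true;
  trim_needed.compare_exchange_strong(expected, false,
                                      std::memory_order_relaxed,
                                      std::memory_order_relaxed);
}

int main() {
  std::cout << MarkTrimHistoryNeeded() << " "    // 1: first caller wins
            << MarkTrimHistoryNeeded() << "\n";  // 0: already scheduled
  ResetTrimHistoryNeeded();
  std::cout << MarkTrimHistoryNeeded() << "\n";  // 1: can be scheduled again
  return 0;
}
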
+
+ // Copying allowed
+ // MemTableList(const MemTableList&);
+ // void operator=(const MemTableList&);
+
+ size_t* current_memory_usage() { return &current_memory_usage_; }
+
+ // Returns the min log containing the prep section after memtables listed in
+ // `memtables_to_flush` are flushed and their status is persisted in the
+ // manifest.
+ uint64_t PrecomputeMinLogContainingPrepSection(
+ const std::unordered_set<MemTable*>* memtables_to_flush = nullptr);
+
+ uint64_t GetEarliestMemTableID() const {
+ auto& memlist = current_->memlist_;
+ if (memlist.empty()) {
+ return std::numeric_limits<uint64_t>::max();
+ }
+ return memlist.back()->GetID();
+ }
+
+ uint64_t GetLatestMemTableID() const {
+ auto& memlist = current_->memlist_;
+ if (memlist.empty()) {
+ return 0;
+ }
+ return memlist.front()->GetID();
+ }
+
+ void AssignAtomicFlushSeq(const SequenceNumber& seq) {
+ const auto& memlist = current_->memlist_;
+ // Scan the memtable list from new to old
+ for (auto it = memlist.begin(); it != memlist.end(); ++it) {
+ MemTable* mem = *it;
+ if (mem->atomic_flush_seqno_ == kMaxSequenceNumber) {
+ mem->atomic_flush_seqno_ = seq;
+ } else {
+ // Earlier memtables must have been assigned an atomic flush seq, no
+ // need to continue scan.
+ break;
+ }
+ }
+ }
+
+ // Used only by DBImplSecondary during log replay.
+ // Remove memtables whose data were written before the WAL with log_number
+ // was created, i.e. mem->GetNextLogNumber() <= log_number. The memtables are
+ // not freed, but put into a vector for future deref and reclamation.
+ void RemoveOldMemTables(uint64_t log_number,
+ autovector<MemTable*>* to_delete);
+
+ private:
+ friend Status InstallMemtableAtomicFlushResults(
+ const autovector<MemTableList*>* imm_lists,
+ const autovector<ColumnFamilyData*>& cfds,
+ const autovector<const MutableCFOptions*>& mutable_cf_options_list,
+ const autovector<const autovector<MemTable*>*>& mems_list,
+ VersionSet* vset, LogsWithPrepTracker* prep_tracker,
+ InstrumentedMutex* mu, const autovector<FileMetaData*>& file_meta,
+ const autovector<std::list<std::unique_ptr<FlushJobInfo>>*>&
+ committed_flush_jobs_info,
+ autovector<MemTable*>* to_delete, FSDirectory* db_directory,
+ LogBuffer* log_buffer);
+
+ // DB mutex held
+ void InstallNewVersion();
+
+ // DB mutex held
+ // Called after writing to MANIFEST
+ void RemoveMemTablesOrRestoreFlags(const Status& s, ColumnFamilyData* cfd,
+ size_t batch_count, LogBuffer* log_buffer,
+ autovector<MemTable*>* to_delete,
+ InstrumentedMutex* mu);
+
+ const int min_write_buffer_number_to_merge_;
+
+ MemTableListVersion* current_;
+
+ // the number of elements that still need flushing
+ int num_flush_not_started_;
+
+ // committing in progress
+ bool commit_in_progress_;
+
+ // Requested a flush of memtables to storage. It's possible to request that
+ // a subset of memtables be flushed.
+ bool flush_requested_;
+
+ // The current memory usage.
+ size_t current_memory_usage_;
+
+ // Cached value of current_->MemoryAllocatedBytesExcludingLast().
+ std::atomic<size_t> current_memory_allocted_bytes_excluding_last_;
+
+ // Cached value of current_->HasHistory().
+ std::atomic<bool> current_has_history_;
+};
+
+// Installs memtable atomic flush results.
+// In most cases, imm_lists is nullptr, and the function simply uses the
+// immutable memtable lists associated with the cfds. There are unit tests that
+// install flush results for external immutable memtable lists other than the
+// cfds' own immutable memtable lists, e.g. MemTableListTest. In that case, the
+// imm_lists parameter is not nullptr.
+extern Status InstallMemtableAtomicFlushResults(
+ const autovector<MemTableList*>* imm_lists,
+ const autovector<ColumnFamilyData*>& cfds,
+ const autovector<const MutableCFOptions*>& mutable_cf_options_list,
+ const autovector<const autovector<MemTable*>*>& mems_list, VersionSet* vset,
+ LogsWithPrepTracker* prep_tracker, InstrumentedMutex* mu,
+ const autovector<FileMetaData*>& file_meta,
+ const autovector<std::list<std::unique_ptr<FlushJobInfo>>*>&
+ committed_flush_jobs_info,
+ autovector<MemTable*>* to_delete, FSDirectory* db_directory,
+ LogBuffer* log_buffer);
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/memtable_list_test.cc b/src/rocksdb/db/memtable_list_test.cc
new file mode 100644
index 000000000..8242061af
--- /dev/null
+++ b/src/rocksdb/db/memtable_list_test.cc
@@ -0,0 +1,1039 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/memtable_list.h"
+
+#include <algorithm>
+#include <string>
+#include <vector>
+
+#include "db/merge_context.h"
+#include "db/version_set.h"
+#include "db/write_controller.h"
+#include "rocksdb/db.h"
+#include "rocksdb/status.h"
+#include "rocksdb/write_buffer_manager.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class MemTableListTest : public testing::Test {
+ public:
+ std::string dbname;
+ DB* db;
+ Options options;
+ std::vector<ColumnFamilyHandle*> handles;
+ std::atomic<uint64_t> file_number;
+
+ MemTableListTest() : db(nullptr), file_number(1) {
+ dbname = test::PerThreadDBPath("memtable_list_test");
+ options.create_if_missing = true;
+ EXPECT_OK(DestroyDB(dbname, options));
+ }
+
+ // Create a test db if not yet created
+ void CreateDB() {
+ if (db == nullptr) {
+ options.create_if_missing = true;
+ EXPECT_OK(DestroyDB(dbname, options));
+ // Open DB only with default column family
+ ColumnFamilyOptions cf_options;
+ std::vector<ColumnFamilyDescriptor> cf_descs;
+ cf_descs.emplace_back(kDefaultColumnFamilyName, cf_options);
+ Status s = DB::Open(options, dbname, cf_descs, &handles, &db);
+ EXPECT_OK(s);
+
+ ColumnFamilyOptions cf_opt1, cf_opt2;
+ cf_opt1.cf_paths.emplace_back(dbname + "_one_1",
+ std::numeric_limits<uint64_t>::max());
+ cf_opt2.cf_paths.emplace_back(dbname + "_two_1",
+ std::numeric_limits<uint64_t>::max());
+ int sz = static_cast<int>(handles.size());
+ handles.resize(sz + 2);
+ s = db->CreateColumnFamily(cf_opt1, "one", &handles[1]);
+ EXPECT_OK(s);
+ s = db->CreateColumnFamily(cf_opt2, "two", &handles[2]);
+ EXPECT_OK(s);
+
+ cf_descs.emplace_back("one", cf_options);
+ cf_descs.emplace_back("two", cf_options);
+ }
+ }
+
+ ~MemTableListTest() override {
+ if (db) {
+ std::vector<ColumnFamilyDescriptor> cf_descs(handles.size());
+#ifndef ROCKSDB_LITE
+ for (int i = 0; i != static_cast<int>(handles.size()); ++i) {
+ EXPECT_OK(handles[i]->GetDescriptor(&cf_descs[i]));
+ }
+#endif // !ROCKSDB_LITE
+ for (auto h : handles) {
+ if (h) {
+ EXPECT_OK(db->DestroyColumnFamilyHandle(h));
+ }
+ }
+ handles.clear();
+ delete db;
+ db = nullptr;
+ EXPECT_OK(DestroyDB(dbname, options, cf_descs));
+ }
+ }
+
+ // Calls MemTableList::TryInstallMemtableFlushResults() and sets up all
+ // structures needed to call this function.
+ Status Mock_InstallMemtableFlushResults(
+ MemTableList* list, const MutableCFOptions& mutable_cf_options,
+ const autovector<MemTable*>& m, autovector<MemTable*>* to_delete) {
+ // Create a mock Logger
+ test::NullLogger logger;
+ LogBuffer log_buffer(DEBUG_LEVEL, &logger);
+
+ CreateDB();
+ // Create a mock VersionSet
+ DBOptions db_options;
+ ImmutableDBOptions immutable_db_options(db_options);
+ EnvOptions env_options;
+ std::shared_ptr<Cache> table_cache(NewLRUCache(50000, 16));
+ WriteBufferManager write_buffer_manager(db_options.db_write_buffer_size);
+ WriteController write_controller(10000000u);
+
+ VersionSet versions(dbname, &immutable_db_options, env_options,
+ table_cache.get(), &write_buffer_manager,
+ &write_controller, /*block_cache_tracer=*/nullptr,
+ /*io_tracer=*/nullptr, /*db_id*/ "",
+ /*db_session_id*/ "");
+ std::vector<ColumnFamilyDescriptor> cf_descs;
+ cf_descs.emplace_back(kDefaultColumnFamilyName, ColumnFamilyOptions());
+ cf_descs.emplace_back("one", ColumnFamilyOptions());
+ cf_descs.emplace_back("two", ColumnFamilyOptions());
+
+ EXPECT_OK(versions.Recover(cf_descs, false));
+
+ // Create mock default ColumnFamilyData
+ auto column_family_set = versions.GetColumnFamilySet();
+ LogsWithPrepTracker dummy_prep_tracker;
+ auto cfd = column_family_set->GetDefault();
+ EXPECT_TRUE(nullptr != cfd);
+ uint64_t file_num = file_number.fetch_add(1);
+ IOStatus io_s;
+ // Create dummy mutex.
+ InstrumentedMutex mutex;
+ InstrumentedMutexLock l(&mutex);
+ std::list<std::unique_ptr<FlushJobInfo>> flush_jobs_info;
+ Status s = list->TryInstallMemtableFlushResults(
+ cfd, mutable_cf_options, m, &dummy_prep_tracker, &versions, &mutex,
+ file_num, to_delete, nullptr, &log_buffer, &flush_jobs_info);
+ EXPECT_OK(io_s);
+ return s;
+ }
+
+  // Calls InstallMemtableAtomicFlushResults() and sets up all structures
+  // needed to call this function.
+ Status Mock_InstallMemtableAtomicFlushResults(
+ autovector<MemTableList*>& lists, const autovector<uint32_t>& cf_ids,
+ const autovector<const MutableCFOptions*>& mutable_cf_options_list,
+ const autovector<const autovector<MemTable*>*>& mems_list,
+ autovector<MemTable*>* to_delete) {
+ // Create a mock Logger
+ test::NullLogger logger;
+ LogBuffer log_buffer(DEBUG_LEVEL, &logger);
+
+ CreateDB();
+ // Create a mock VersionSet
+ DBOptions db_options;
+
+ ImmutableDBOptions immutable_db_options(db_options);
+ EnvOptions env_options;
+ std::shared_ptr<Cache> table_cache(NewLRUCache(50000, 16));
+ WriteBufferManager write_buffer_manager(db_options.db_write_buffer_size);
+ WriteController write_controller(10000000u);
+
+ VersionSet versions(dbname, &immutable_db_options, env_options,
+ table_cache.get(), &write_buffer_manager,
+ &write_controller, /*block_cache_tracer=*/nullptr,
+ /*io_tracer=*/nullptr, /*db_id*/ "",
+ /*db_session_id*/ "");
+ std::vector<ColumnFamilyDescriptor> cf_descs;
+ cf_descs.emplace_back(kDefaultColumnFamilyName, ColumnFamilyOptions());
+ cf_descs.emplace_back("one", ColumnFamilyOptions());
+ cf_descs.emplace_back("two", ColumnFamilyOptions());
+ EXPECT_OK(versions.Recover(cf_descs, false));
+
+ // Create mock default ColumnFamilyData
+
+ auto column_family_set = versions.GetColumnFamilySet();
+
+ LogsWithPrepTracker dummy_prep_tracker;
+ autovector<ColumnFamilyData*> cfds;
+ for (int i = 0; i != static_cast<int>(cf_ids.size()); ++i) {
+ cfds.emplace_back(column_family_set->GetColumnFamily(cf_ids[i]));
+ EXPECT_NE(nullptr, cfds[i]);
+ }
+ std::vector<FileMetaData> file_metas;
+ file_metas.reserve(cf_ids.size());
+ for (size_t i = 0; i != cf_ids.size(); ++i) {
+ FileMetaData meta;
+ uint64_t file_num = file_number.fetch_add(1);
+ meta.fd = FileDescriptor(file_num, 0, 0);
+ file_metas.emplace_back(meta);
+ }
+ autovector<FileMetaData*> file_meta_ptrs;
+ for (auto& meta : file_metas) {
+ file_meta_ptrs.push_back(&meta);
+ }
+ std::vector<std::list<std::unique_ptr<FlushJobInfo>>>
+ committed_flush_jobs_info_storage(cf_ids.size());
+ autovector<std::list<std::unique_ptr<FlushJobInfo>>*>
+ committed_flush_jobs_info;
+ for (int i = 0; i < static_cast<int>(cf_ids.size()); ++i) {
+ committed_flush_jobs_info.push_back(
+ &committed_flush_jobs_info_storage[i]);
+ }
+
+ InstrumentedMutex mutex;
+ InstrumentedMutexLock l(&mutex);
+ return InstallMemtableAtomicFlushResults(
+ &lists, cfds, mutable_cf_options_list, mems_list, &versions,
+ nullptr /* prep_tracker */, &mutex, file_meta_ptrs,
+ committed_flush_jobs_info, to_delete, nullptr, &log_buffer);
+ }
+};
+
+TEST_F(MemTableListTest, Empty) {
+ // Create an empty MemTableList and validate basic functions.
+ MemTableList list(1, 0, 0);
+
+ ASSERT_EQ(0, list.NumNotFlushed());
+ ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
+ ASSERT_FALSE(list.IsFlushPending());
+
+ autovector<MemTable*> mems;
+ list.PickMemtablesToFlush(
+ std::numeric_limits<uint64_t>::max() /* memtable_id */, &mems);
+ ASSERT_EQ(0, mems.size());
+
+ autovector<MemTable*> to_delete;
+ list.current()->Unref(&to_delete);
+ ASSERT_EQ(0, to_delete.size());
+}
+
+TEST_F(MemTableListTest, GetTest) {
+ // Create MemTableList
+ int min_write_buffer_number_to_merge = 2;
+ int max_write_buffer_number_to_maintain = 0;
+ int64_t max_write_buffer_size_to_maintain = 0;
+ MemTableList list(min_write_buffer_number_to_merge,
+ max_write_buffer_number_to_maintain,
+ max_write_buffer_size_to_maintain);
+
+ SequenceNumber seq = 1;
+ std::string value;
+ Status s;
+ MergeContext merge_context;
+ InternalKeyComparator ikey_cmp(options.comparator);
+ SequenceNumber max_covering_tombstone_seq = 0;
+ autovector<MemTable*> to_delete;
+
+ LookupKey lkey("key1", seq);
+ bool found = list.current()->Get(lkey, &value, /*columns=*/nullptr,
+ /*timestamp=*/nullptr, &s, &merge_context,
+ &max_covering_tombstone_seq, ReadOptions());
+ ASSERT_FALSE(found);
+
+ // Create a MemTable
+ InternalKeyComparator cmp(BytewiseComparator());
+ auto factory = std::make_shared<SkipListFactory>();
+ options.memtable_factory = factory;
+ ImmutableOptions ioptions(options);
+
+ WriteBufferManager wb(options.db_write_buffer_size);
+ MemTable* mem = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb,
+ kMaxSequenceNumber, 0 /* column_family_id */);
+ mem->Ref();
+
+ // Write some keys to this memtable.
+ ASSERT_OK(
+ mem->Add(++seq, kTypeDeletion, "key1", "", nullptr /* kv_prot_info */));
+ ASSERT_OK(mem->Add(++seq, kTypeValue, "key2", "value2",
+ nullptr /* kv_prot_info */));
+ ASSERT_OK(mem->Add(++seq, kTypeValue, "key1", "value1",
+ nullptr /* kv_prot_info */));
+ ASSERT_OK(mem->Add(++seq, kTypeValue, "key2", "value2.2",
+ nullptr /* kv_prot_info */));
+
+ // Fetch the newly written keys
+ merge_context.Clear();
+ found = mem->Get(LookupKey("key1", seq), &value, /*columns*/ nullptr,
+ /*timestamp*/ nullptr, &s, &merge_context,
+ &max_covering_tombstone_seq, ReadOptions(),
+ false /* immutable_memtable */);
+ ASSERT_TRUE(s.ok() && found);
+ ASSERT_EQ(value, "value1");
+
+ merge_context.Clear();
+ found = mem->Get(LookupKey("key1", 2), &value, /*columns*/ nullptr,
+ /*timestamp*/ nullptr, &s, &merge_context,
+ &max_covering_tombstone_seq, ReadOptions(),
+ false /* immutable_memtable */);
+ // MemTable found out that this key is *not* found (at this sequence#)
+ ASSERT_TRUE(found && s.IsNotFound());
+
+ merge_context.Clear();
+ found = mem->Get(LookupKey("key2", seq), &value, /*columns*/ nullptr,
+ /*timestamp*/ nullptr, &s, &merge_context,
+ &max_covering_tombstone_seq, ReadOptions(),
+ false /* immutable_memtable */);
+ ASSERT_TRUE(s.ok() && found);
+ ASSERT_EQ(value, "value2.2");
+
+ ASSERT_EQ(4, mem->num_entries());
+ ASSERT_EQ(1, mem->num_deletes());
+
+ // Add memtable to list
+ // This is to make assert(memtable->IsFragmentedRangeTombstonesConstructed())
+ // in MemTableListVersion::GetFromList work.
+ mem->ConstructFragmentedRangeTombstones();
+ list.Add(mem, &to_delete);
+
+ SequenceNumber saved_seq = seq;
+
+ // Create another memtable and write some keys to it
+ WriteBufferManager wb2(options.db_write_buffer_size);
+ MemTable* mem2 = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb2,
+ kMaxSequenceNumber, 0 /* column_family_id */);
+ mem2->Ref();
+
+ ASSERT_OK(
+ mem2->Add(++seq, kTypeDeletion, "key1", "", nullptr /* kv_prot_info */));
+ ASSERT_OK(mem2->Add(++seq, kTypeValue, "key2", "value2.3",
+ nullptr /* kv_prot_info */));
+
+ // Add second memtable to list
+ // This is to make assert(memtable->IsFragmentedRangeTombstonesConstructed())
+ // in MemTableListVersion::GetFromList work.
+ mem2->ConstructFragmentedRangeTombstones();
+ list.Add(mem2, &to_delete);
+
+ // Fetch keys via MemTableList
+ merge_context.Clear();
+ found =
+ list.current()->Get(LookupKey("key1", seq), &value, /*columns=*/nullptr,
+ /*timestamp=*/nullptr, &s, &merge_context,
+ &max_covering_tombstone_seq, ReadOptions());
+ ASSERT_TRUE(found && s.IsNotFound());
+
+ merge_context.Clear();
+ found = list.current()->Get(LookupKey("key1", saved_seq), &value,
+ /*columns=*/nullptr, /*timestamp=*/nullptr, &s,
+ &merge_context, &max_covering_tombstone_seq,
+ ReadOptions());
+ ASSERT_TRUE(s.ok() && found);
+ ASSERT_EQ("value1", value);
+
+ merge_context.Clear();
+ found =
+ list.current()->Get(LookupKey("key2", seq), &value, /*columns=*/nullptr,
+ /*timestamp=*/nullptr, &s, &merge_context,
+ &max_covering_tombstone_seq, ReadOptions());
+ ASSERT_TRUE(s.ok() && found);
+ ASSERT_EQ(value, "value2.3");
+
+ merge_context.Clear();
+ found = list.current()->Get(LookupKey("key2", 1), &value, /*columns=*/nullptr,
+ /*timestamp=*/nullptr, &s, &merge_context,
+ &max_covering_tombstone_seq, ReadOptions());
+ ASSERT_FALSE(found);
+
+ ASSERT_EQ(2, list.NumNotFlushed());
+
+ list.current()->Unref(&to_delete);
+ for (MemTable* m : to_delete) {
+ delete m;
+ }
+}
+
+TEST_F(MemTableListTest, GetFromHistoryTest) {
+ // Create MemTableList
+ int min_write_buffer_number_to_merge = 2;
+ int max_write_buffer_number_to_maintain = 2;
+ int64_t max_write_buffer_size_to_maintain = 2 * Arena::kInlineSize;
+ MemTableList list(min_write_buffer_number_to_merge,
+ max_write_buffer_number_to_maintain,
+ max_write_buffer_size_to_maintain);
+
+ SequenceNumber seq = 1;
+ std::string value;
+ Status s;
+ MergeContext merge_context;
+ InternalKeyComparator ikey_cmp(options.comparator);
+ SequenceNumber max_covering_tombstone_seq = 0;
+ autovector<MemTable*> to_delete;
+
+ LookupKey lkey("key1", seq);
+ bool found = list.current()->Get(lkey, &value, /*columns=*/nullptr,
+ /*timestamp=*/nullptr, &s, &merge_context,
+ &max_covering_tombstone_seq, ReadOptions());
+ ASSERT_FALSE(found);
+
+ // Create a MemTable
+ InternalKeyComparator cmp(BytewiseComparator());
+ auto factory = std::make_shared<SkipListFactory>();
+ options.memtable_factory = factory;
+ ImmutableOptions ioptions(options);
+
+ WriteBufferManager wb(options.db_write_buffer_size);
+ MemTable* mem = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb,
+ kMaxSequenceNumber, 0 /* column_family_id */);
+ mem->Ref();
+
+ // Write some keys to this memtable.
+ ASSERT_OK(
+ mem->Add(++seq, kTypeDeletion, "key1", "", nullptr /* kv_prot_info */));
+ ASSERT_OK(mem->Add(++seq, kTypeValue, "key2", "value2",
+ nullptr /* kv_prot_info */));
+ ASSERT_OK(mem->Add(++seq, kTypeValue, "key2", "value2.2",
+ nullptr /* kv_prot_info */));
+
+ // Fetch the newly written keys
+ merge_context.Clear();
+ found = mem->Get(LookupKey("key1", seq), &value, /*columns*/ nullptr,
+ /*timestamp*/ nullptr, &s, &merge_context,
+ &max_covering_tombstone_seq, ReadOptions(),
+ false /* immutable_memtable */);
+ // MemTable found out that this key is *not* found (at this sequence#)
+ ASSERT_TRUE(found && s.IsNotFound());
+
+ merge_context.Clear();
+ found = mem->Get(LookupKey("key2", seq), &value, /*columns*/ nullptr,
+ /*timestamp*/ nullptr, &s, &merge_context,
+ &max_covering_tombstone_seq, ReadOptions(),
+ false /* immutable_memtable */);
+ ASSERT_TRUE(s.ok() && found);
+ ASSERT_EQ(value, "value2.2");
+
+ // Add memtable to list
+ // This is to make assert(memtable->IsFragmentedRangeTombstonesConstructed())
+ // in MemTableListVersion::GetFromList work.
+ mem->ConstructFragmentedRangeTombstones();
+ list.Add(mem, &to_delete);
+ ASSERT_EQ(0, to_delete.size());
+
+ // Fetch keys via MemTableList
+ merge_context.Clear();
+ found =
+ list.current()->Get(LookupKey("key1", seq), &value, /*columns=*/nullptr,
+ /*timestamp=*/nullptr, &s, &merge_context,
+ &max_covering_tombstone_seq, ReadOptions());
+ ASSERT_TRUE(found && s.IsNotFound());
+
+ merge_context.Clear();
+ found =
+ list.current()->Get(LookupKey("key2", seq), &value, /*columns=*/nullptr,
+ /*timestamp=*/nullptr, &s, &merge_context,
+ &max_covering_tombstone_seq, ReadOptions());
+ ASSERT_TRUE(s.ok() && found);
+ ASSERT_EQ("value2.2", value);
+
+ // Flush this memtable from the list.
+ // (It will then be a part of the memtable history).
+ autovector<MemTable*> to_flush;
+ list.PickMemtablesToFlush(
+ std::numeric_limits<uint64_t>::max() /* memtable_id */, &to_flush);
+ ASSERT_EQ(1, to_flush.size());
+
+ MutableCFOptions mutable_cf_options(options);
+ s = Mock_InstallMemtableFlushResults(&list, mutable_cf_options, to_flush,
+ &to_delete);
+ ASSERT_OK(s);
+ ASSERT_EQ(0, list.NumNotFlushed());
+ ASSERT_EQ(1, list.NumFlushed());
+ ASSERT_EQ(0, to_delete.size());
+
+ // Verify keys are no longer in MemTableList
+ merge_context.Clear();
+ found =
+ list.current()->Get(LookupKey("key1", seq), &value, /*columns=*/nullptr,
+ /*timestamp=*/nullptr, &s, &merge_context,
+ &max_covering_tombstone_seq, ReadOptions());
+ ASSERT_FALSE(found);
+
+ merge_context.Clear();
+ found =
+ list.current()->Get(LookupKey("key2", seq), &value, /*columns=*/nullptr,
+ /*timestamp=*/nullptr, &s, &merge_context,
+ &max_covering_tombstone_seq, ReadOptions());
+ ASSERT_FALSE(found);
+
+ // Verify keys are present in history
+ merge_context.Clear();
+ found = list.current()->GetFromHistory(
+ LookupKey("key1", seq), &value, /*columns=*/nullptr,
+ /*timestamp=*/nullptr, &s, &merge_context, &max_covering_tombstone_seq,
+ ReadOptions());
+ ASSERT_TRUE(found && s.IsNotFound());
+
+ merge_context.Clear();
+ found = list.current()->GetFromHistory(
+ LookupKey("key2", seq), &value, /*columns=*/nullptr,
+ /*timestamp=*/nullptr, &s, &merge_context, &max_covering_tombstone_seq,
+ ReadOptions());
+ ASSERT_TRUE(found);
+ ASSERT_EQ("value2.2", value);
+
+ // Create another memtable and write some keys to it
+ WriteBufferManager wb2(options.db_write_buffer_size);
+ MemTable* mem2 = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb2,
+ kMaxSequenceNumber, 0 /* column_family_id */);
+ mem2->Ref();
+
+ ASSERT_OK(
+ mem2->Add(++seq, kTypeDeletion, "key1", "", nullptr /* kv_prot_info */));
+ ASSERT_OK(mem2->Add(++seq, kTypeValue, "key3", "value3",
+ nullptr /* kv_prot_info */));
+
+ // Add second memtable to list
+ // This is to make assert(memtable->IsFragmentedRangeTombstonesConstructed())
+ // in MemTableListVersion::GetFromList work.
+ mem2->ConstructFragmentedRangeTombstones();
+ list.Add(mem2, &to_delete);
+ ASSERT_EQ(0, to_delete.size());
+
+ to_flush.clear();
+ list.PickMemtablesToFlush(
+ std::numeric_limits<uint64_t>::max() /* memtable_id */, &to_flush);
+ ASSERT_EQ(1, to_flush.size());
+
+ // Flush second memtable
+ s = Mock_InstallMemtableFlushResults(&list, mutable_cf_options, to_flush,
+ &to_delete);
+ ASSERT_OK(s);
+ ASSERT_EQ(0, list.NumNotFlushed());
+ ASSERT_EQ(2, list.NumFlushed());
+ ASSERT_EQ(0, to_delete.size());
+
+ // Add a third memtable to push the first memtable out of the history
+ WriteBufferManager wb3(options.db_write_buffer_size);
+ MemTable* mem3 = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb3,
+ kMaxSequenceNumber, 0 /* column_family_id */);
+ mem3->Ref();
+ // This is to make assert(memtable->IsFragmentedRangeTombstonesConstructed())
+ // in MemTableListVersion::GetFromList work.
+ mem3->ConstructFragmentedRangeTombstones();
+ list.Add(mem3, &to_delete);
+ ASSERT_EQ(1, list.NumNotFlushed());
+ ASSERT_EQ(1, list.NumFlushed());
+ ASSERT_EQ(1, to_delete.size());
+
+ // Verify keys are no longer in MemTableList
+ merge_context.Clear();
+ found =
+ list.current()->Get(LookupKey("key1", seq), &value, /*columns=*/nullptr,
+ /*timestamp=*/nullptr, &s, &merge_context,
+ &max_covering_tombstone_seq, ReadOptions());
+ ASSERT_FALSE(found);
+
+ merge_context.Clear();
+ found =
+ list.current()->Get(LookupKey("key2", seq), &value, /*columns=*/nullptr,
+ /*timestamp=*/nullptr, &s, &merge_context,
+ &max_covering_tombstone_seq, ReadOptions());
+ ASSERT_FALSE(found);
+
+ merge_context.Clear();
+ found =
+ list.current()->Get(LookupKey("key3", seq), &value, /*columns=*/nullptr,
+ /*timestamp=*/nullptr, &s, &merge_context,
+ &max_covering_tombstone_seq, ReadOptions());
+ ASSERT_FALSE(found);
+
+ // Verify that the second memtable's keys are in the history
+ merge_context.Clear();
+ found = list.current()->GetFromHistory(
+ LookupKey("key1", seq), &value, /*columns=*/nullptr,
+ /*timestamp=*/nullptr, &s, &merge_context, &max_covering_tombstone_seq,
+ ReadOptions());
+ ASSERT_TRUE(found && s.IsNotFound());
+
+ merge_context.Clear();
+ found = list.current()->GetFromHistory(
+ LookupKey("key3", seq), &value, /*columns=*/nullptr,
+ /*timestamp=*/nullptr, &s, &merge_context, &max_covering_tombstone_seq,
+ ReadOptions());
+ ASSERT_TRUE(found);
+ ASSERT_EQ("value3", value);
+
+ // Verify that key2 from the first memtable is no longer in the history
+ merge_context.Clear();
+ found =
+ list.current()->Get(LookupKey("key2", seq), &value, /*columns=*/nullptr,
+ /*timestamp=*/nullptr, &s, &merge_context,
+ &max_covering_tombstone_seq, ReadOptions());
+ ASSERT_FALSE(found);
+
+ // Cleanup
+ list.current()->Unref(&to_delete);
+ ASSERT_EQ(3, to_delete.size());
+ for (MemTable* m : to_delete) {
+ delete m;
+ }
+}
+
+TEST_F(MemTableListTest, FlushPendingTest) {
+ const int num_tables = 6;
+ SequenceNumber seq = 1;
+ Status s;
+
+ auto factory = std::make_shared<SkipListFactory>();
+ options.memtable_factory = factory;
+ ImmutableOptions ioptions(options);
+ InternalKeyComparator cmp(BytewiseComparator());
+ WriteBufferManager wb(options.db_write_buffer_size);
+ autovector<MemTable*> to_delete;
+
+ // Create MemTableList
+ int min_write_buffer_number_to_merge = 3;
+ int max_write_buffer_number_to_maintain = 7;
+ int64_t max_write_buffer_size_to_maintain =
+ 7 * static_cast<int>(options.write_buffer_size);
+ MemTableList list(min_write_buffer_number_to_merge,
+ max_write_buffer_number_to_maintain,
+ max_write_buffer_size_to_maintain);
+
+ // Create some MemTables
+ uint64_t memtable_id = 0;
+ std::vector<MemTable*> tables;
+ MutableCFOptions mutable_cf_options(options);
+ for (int i = 0; i < num_tables; i++) {
+ MemTable* mem = new MemTable(cmp, ioptions, mutable_cf_options, &wb,
+ kMaxSequenceNumber, 0 /* column_family_id */);
+ mem->SetID(memtable_id++);
+ mem->Ref();
+
+ std::string value;
+ MergeContext merge_context;
+
+ ASSERT_OK(mem->Add(++seq, kTypeValue, "key1", std::to_string(i),
+ nullptr /* kv_prot_info */));
+ ASSERT_OK(mem->Add(++seq, kTypeValue, "keyN" + std::to_string(i), "valueN",
+ nullptr /* kv_prot_info */));
+ ASSERT_OK(mem->Add(++seq, kTypeValue, "keyX" + std::to_string(i), "value",
+ nullptr /* kv_prot_info */));
+ ASSERT_OK(mem->Add(++seq, kTypeValue, "keyM" + std::to_string(i), "valueM",
+ nullptr /* kv_prot_info */));
+ ASSERT_OK(mem->Add(++seq, kTypeDeletion, "keyX" + std::to_string(i), "",
+ nullptr /* kv_prot_info */));
+
+ tables.push_back(mem);
+ }
+
+ // Nothing to flush
+ ASSERT_FALSE(list.IsFlushPending());
+ ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
+ autovector<MemTable*> to_flush;
+ list.PickMemtablesToFlush(
+ std::numeric_limits<uint64_t>::max() /* memtable_id */, &to_flush);
+ ASSERT_EQ(0, to_flush.size());
+
+ // Request a flush even though there is nothing to flush
+ list.FlushRequested();
+ ASSERT_FALSE(list.IsFlushPending());
+ ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
+
+ // Attempt to 'flush' to clear request for flush
+ list.PickMemtablesToFlush(
+ std::numeric_limits<uint64_t>::max() /* memtable_id */, &to_flush);
+ ASSERT_EQ(0, to_flush.size());
+ ASSERT_FALSE(list.IsFlushPending());
+ ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
+
+ // Request a flush again
+ list.FlushRequested();
+ // No flush pending since the list is empty.
+ ASSERT_FALSE(list.IsFlushPending());
+ ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
+
+ // Add 2 tables
+ list.Add(tables[0], &to_delete);
+ list.Add(tables[1], &to_delete);
+ ASSERT_EQ(2, list.NumNotFlushed());
+ ASSERT_EQ(0, to_delete.size());
+
+  // Even though we have fewer than the minimum number of memtables to flush,
+  // a flush is still pending because we previously requested a flush and never
+  // called PickMemtablesToFlush() to clear the request.
+ ASSERT_TRUE(list.IsFlushPending());
+ ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire));
+
+ // Pick tables to flush
+ list.PickMemtablesToFlush(
+ std::numeric_limits<uint64_t>::max() /* memtable_id */, &to_flush);
+ ASSERT_EQ(2, to_flush.size());
+ ASSERT_EQ(2, list.NumNotFlushed());
+ ASSERT_FALSE(list.IsFlushPending());
+ ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
+
+ // Revert flush
+ list.RollbackMemtableFlush(to_flush, 0);
+ ASSERT_FALSE(list.IsFlushPending());
+ ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire));
+ to_flush.clear();
+
+ // Add another table
+ list.Add(tables[2], &to_delete);
+  // We now have the minimum number of memtables to flush, regardless of
+  // whether FlushRequested() was called.
+ ASSERT_TRUE(list.IsFlushPending());
+ ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire));
+ ASSERT_EQ(0, to_delete.size());
+
+ // Pick tables to flush
+ list.PickMemtablesToFlush(
+ std::numeric_limits<uint64_t>::max() /* memtable_id */, &to_flush);
+ ASSERT_EQ(3, to_flush.size());
+ ASSERT_EQ(3, list.NumNotFlushed());
+ ASSERT_FALSE(list.IsFlushPending());
+ ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
+
+ // Pick tables to flush again
+ autovector<MemTable*> to_flush2;
+ list.PickMemtablesToFlush(
+ std::numeric_limits<uint64_t>::max() /* memtable_id */, &to_flush2);
+ ASSERT_EQ(0, to_flush2.size());
+ ASSERT_EQ(3, list.NumNotFlushed());
+ ASSERT_FALSE(list.IsFlushPending());
+ ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
+
+ // Add another table
+ list.Add(tables[3], &to_delete);
+ ASSERT_FALSE(list.IsFlushPending());
+ ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire));
+ ASSERT_EQ(0, to_delete.size());
+
+ // Request a flush again
+ list.FlushRequested();
+ ASSERT_TRUE(list.IsFlushPending());
+ ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire));
+
+ // Pick tables to flush again
+ list.PickMemtablesToFlush(
+ std::numeric_limits<uint64_t>::max() /* memtable_id */, &to_flush2);
+ ASSERT_EQ(1, to_flush2.size());
+ ASSERT_EQ(4, list.NumNotFlushed());
+ ASSERT_FALSE(list.IsFlushPending());
+ ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
+
+ // Rollback first pick of tables
+ list.RollbackMemtableFlush(to_flush, 0);
+ ASSERT_TRUE(list.IsFlushPending());
+ ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire));
+ to_flush.clear();
+
+  // Add another table
+ list.Add(tables[4], &to_delete);
+ ASSERT_EQ(5, list.NumNotFlushed());
+  // We now have the minimum number of memtables to flush, regardless of
+  // whether FlushRequested() was called.
+ ASSERT_TRUE(list.IsFlushPending());
+ ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire));
+ ASSERT_EQ(0, to_delete.size());
+
+ // Pick tables to flush
+ list.PickMemtablesToFlush(
+ std::numeric_limits<uint64_t>::max() /* memtable_id */, &to_flush);
+ // Picks three oldest memtables. The fourth oldest is picked in `to_flush2` so
+ // must be excluded. The newest (fifth oldest) is non-consecutive with the
+ // three oldest due to omitting the fourth oldest so must not be picked.
+ ASSERT_EQ(3, to_flush.size());
+ ASSERT_EQ(5, list.NumNotFlushed());
+ ASSERT_FALSE(list.IsFlushPending());
+ ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire));
+
+ // Pick tables to flush again
+ autovector<MemTable*> to_flush3;
+ list.PickMemtablesToFlush(
+ std::numeric_limits<uint64_t>::max() /* memtable_id */, &to_flush3);
+ // Picks newest (fifth oldest)
+ ASSERT_EQ(1, to_flush3.size());
+ ASSERT_EQ(5, list.NumNotFlushed());
+ ASSERT_FALSE(list.IsFlushPending());
+ ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
+
+ // Nothing left to flush
+ autovector<MemTable*> to_flush4;
+ list.PickMemtablesToFlush(
+ std::numeric_limits<uint64_t>::max() /* memtable_id */, &to_flush4);
+ ASSERT_EQ(0, to_flush4.size());
+ ASSERT_EQ(5, list.NumNotFlushed());
+ ASSERT_FALSE(list.IsFlushPending());
+ ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
+
+ // Flush the 3 memtables that were picked in to_flush
+ s = Mock_InstallMemtableFlushResults(&list, mutable_cf_options, to_flush,
+ &to_delete);
+ ASSERT_OK(s);
+
+ // Note: now to_flush contains tables[0,1,2]. to_flush2 contains
+ // tables[3]. to_flush3 contains tables[4].
+ // Current implementation will only commit memtables in the order they were
+ // created. So TryInstallMemtableFlushResults will install the first 3 tables
+ // in to_flush and stop when it encounters a table not yet flushed.
+ ASSERT_EQ(2, list.NumNotFlushed());
+ int num_in_history =
+ std::min(3, static_cast<int>(max_write_buffer_size_to_maintain) /
+ static_cast<int>(options.write_buffer_size));
+ ASSERT_EQ(num_in_history, list.NumFlushed());
+ ASSERT_EQ(5 - list.NumNotFlushed() - num_in_history, to_delete.size());
+
+ // Request a flush again. Should be nothing to flush
+ list.FlushRequested();
+ ASSERT_FALSE(list.IsFlushPending());
+ ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
+
+ // Flush the 1 memtable (tables[4]) that was picked in to_flush3
+ s = MemTableListTest::Mock_InstallMemtableFlushResults(
+ &list, mutable_cf_options, to_flush3, &to_delete);
+ ASSERT_OK(s);
+
+  // This will install 0 tables since tables[4] was flushed while tables[3] has
+  // not yet been flushed.
+ ASSERT_EQ(2, list.NumNotFlushed());
+ ASSERT_EQ(0, to_delete.size());
+
+ // Flush the 1 memtable (tables[3]) that was picked in to_flush2
+ s = MemTableListTest::Mock_InstallMemtableFlushResults(
+ &list, mutable_cf_options, to_flush2, &to_delete);
+ ASSERT_OK(s);
+
+ // This will actually install 2 tables. The 1 we told it to flush, and also
+ // tables[4] which has been waiting for tables[3] to commit.
+ ASSERT_EQ(0, list.NumNotFlushed());
+ num_in_history =
+ std::min(5, static_cast<int>(max_write_buffer_size_to_maintain) /
+ static_cast<int>(options.write_buffer_size));
+ ASSERT_EQ(num_in_history, list.NumFlushed());
+ ASSERT_EQ(5 - list.NumNotFlushed() - num_in_history, to_delete.size());
+
+ for (const auto& m : to_delete) {
+ // Refcount should be 0 after calling TryInstallMemtableFlushResults.
+    // Verify this by Ref'ing and then Unref'ing:
+ m->Ref();
+ ASSERT_EQ(m, m->Unref());
+ delete m;
+ }
+ to_delete.clear();
+
+ // Add another table
+ list.Add(tables[5], &to_delete);
+ ASSERT_EQ(1, list.NumNotFlushed());
+ ASSERT_EQ(5, list.GetLatestMemTableID());
+ memtable_id = 4;
+ // Pick tables to flush. The tables to pick must have ID smaller than or
+ // equal to 4. Therefore, no table will be selected in this case.
+ autovector<MemTable*> to_flush5;
+ list.FlushRequested();
+ ASSERT_TRUE(list.HasFlushRequested());
+ list.PickMemtablesToFlush(memtable_id, &to_flush5);
+ ASSERT_TRUE(to_flush5.empty());
+ ASSERT_EQ(1, list.NumNotFlushed());
+ ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire));
+ ASSERT_FALSE(list.IsFlushPending());
+ ASSERT_FALSE(list.HasFlushRequested());
+
+ // Pick tables to flush. The tables to pick must have ID smaller than or
+ // equal to 5. Therefore, only tables[5] will be selected.
+ memtable_id = 5;
+ list.FlushRequested();
+ list.PickMemtablesToFlush(memtable_id, &to_flush5);
+ ASSERT_EQ(1, static_cast<int>(to_flush5.size()));
+ ASSERT_EQ(1, list.NumNotFlushed());
+ ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
+ ASSERT_FALSE(list.IsFlushPending());
+ to_delete.clear();
+
+ list.current()->Unref(&to_delete);
+ int to_delete_size =
+ std::min(num_tables, static_cast<int>(max_write_buffer_size_to_maintain) /
+ static_cast<int>(options.write_buffer_size));
+ ASSERT_EQ(to_delete_size, to_delete.size());
+
+ for (const auto& m : to_delete) {
+ // Refcount should be 0 after calling TryInstallMemtableFlushResults.
+ // Verify this, by Ref'ing then UnRef'ing:
+    // Verify this by Ref'ing and then Unref'ing:
+ ASSERT_EQ(m, m->Unref());
+ delete m;
+ }
+ to_delete.clear();
+}
+
+TEST_F(MemTableListTest, EmptyAtomicFlushTest) {
+ autovector<MemTableList*> lists;
+ autovector<uint32_t> cf_ids;
+ autovector<const MutableCFOptions*> options_list;
+ autovector<const autovector<MemTable*>*> to_flush;
+ autovector<MemTable*> to_delete;
+ Status s = Mock_InstallMemtableAtomicFlushResults(lists, cf_ids, options_list,
+ to_flush, &to_delete);
+ ASSERT_OK(s);
+ ASSERT_TRUE(to_delete.empty());
+}
+
+TEST_F(MemTableListTest, AtomicFlushTest) {
+ const int num_cfs = 3;
+ const int num_tables_per_cf = 2;
+ SequenceNumber seq = 1;
+
+ auto factory = std::make_shared<SkipListFactory>();
+ options.memtable_factory = factory;
+ ImmutableOptions ioptions(options);
+ InternalKeyComparator cmp(BytewiseComparator());
+ WriteBufferManager wb(options.db_write_buffer_size);
+
+ // Create MemTableLists
+ int min_write_buffer_number_to_merge = 3;
+ int max_write_buffer_number_to_maintain = 7;
+ int64_t max_write_buffer_size_to_maintain =
+ 7 * static_cast<int64_t>(options.write_buffer_size);
+ autovector<MemTableList*> lists;
+ for (int i = 0; i != num_cfs; ++i) {
+ lists.emplace_back(new MemTableList(min_write_buffer_number_to_merge,
+ max_write_buffer_number_to_maintain,
+ max_write_buffer_size_to_maintain));
+ }
+
+ autovector<uint32_t> cf_ids;
+ std::vector<std::vector<MemTable*>> tables(num_cfs);
+ autovector<const MutableCFOptions*> mutable_cf_options_list;
+ uint32_t cf_id = 0;
+ for (auto& elem : tables) {
+ mutable_cf_options_list.emplace_back(new MutableCFOptions(options));
+ uint64_t memtable_id = 0;
+ for (int i = 0; i != num_tables_per_cf; ++i) {
+ MemTable* mem =
+ new MemTable(cmp, ioptions, *(mutable_cf_options_list.back()), &wb,
+ kMaxSequenceNumber, cf_id);
+ mem->SetID(memtable_id++);
+ mem->Ref();
+
+ std::string value;
+
+ ASSERT_OK(mem->Add(++seq, kTypeValue, "key1", std::to_string(i),
+ nullptr /* kv_prot_info */));
+ ASSERT_OK(mem->Add(++seq, kTypeValue, "keyN" + std::to_string(i),
+ "valueN", nullptr /* kv_prot_info */));
+ ASSERT_OK(mem->Add(++seq, kTypeValue, "keyX" + std::to_string(i), "value",
+ nullptr /* kv_prot_info */));
+ ASSERT_OK(mem->Add(++seq, kTypeValue, "keyM" + std::to_string(i),
+ "valueM", nullptr /* kv_prot_info */));
+ ASSERT_OK(mem->Add(++seq, kTypeDeletion, "keyX" + std::to_string(i), "",
+ nullptr /* kv_prot_info */));
+
+ elem.push_back(mem);
+ }
+ cf_ids.push_back(cf_id++);
+ }
+
+ std::vector<autovector<MemTable*>> flush_candidates(num_cfs);
+
+ // Nothing to flush
+ for (auto i = 0; i != num_cfs; ++i) {
+ auto* list = lists[i];
+ ASSERT_FALSE(list->IsFlushPending());
+ ASSERT_FALSE(list->imm_flush_needed.load(std::memory_order_acquire));
+ list->PickMemtablesToFlush(
+ std::numeric_limits<uint64_t>::max() /* memtable_id */,
+ &flush_candidates[i]);
+ ASSERT_EQ(0, flush_candidates[i].size());
+ }
+ // Request flush even though there is nothing to flush
+ for (auto i = 0; i != num_cfs; ++i) {
+ auto* list = lists[i];
+ list->FlushRequested();
+ ASSERT_FALSE(list->IsFlushPending());
+ ASSERT_FALSE(list->imm_flush_needed.load(std::memory_order_acquire));
+ }
+ autovector<MemTable*> to_delete;
+  // Add tables to the immutable memtable lists associated with column families
+ for (auto i = 0; i != num_cfs; ++i) {
+ for (auto j = 0; j != num_tables_per_cf; ++j) {
+ lists[i]->Add(tables[i][j], &to_delete);
+ }
+ ASSERT_EQ(num_tables_per_cf, lists[i]->NumNotFlushed());
+ ASSERT_TRUE(lists[i]->IsFlushPending());
+ ASSERT_TRUE(lists[i]->imm_flush_needed.load(std::memory_order_acquire));
+ }
+ std::vector<uint64_t> flush_memtable_ids = {1, 1, 0};
+ // +----+
+ // list[0]: |0 1|
+ // list[1]: |0 1|
+ // | +--+
+ // list[2]: |0| 1
+ // +-+
+ // Pick memtables to flush
+ for (auto i = 0; i != num_cfs; ++i) {
+ flush_candidates[i].clear();
+ lists[i]->PickMemtablesToFlush(flush_memtable_ids[i], &flush_candidates[i]);
+ ASSERT_EQ(flush_memtable_ids[i] - 0 + 1,
+ static_cast<uint64_t>(flush_candidates[i].size()));
+ }
+ autovector<MemTableList*> tmp_lists;
+ autovector<uint32_t> tmp_cf_ids;
+ autovector<const MutableCFOptions*> tmp_options_list;
+ autovector<const autovector<MemTable*>*> to_flush;
+ for (auto i = 0; i != num_cfs; ++i) {
+ if (!flush_candidates[i].empty()) {
+ to_flush.push_back(&flush_candidates[i]);
+ tmp_lists.push_back(lists[i]);
+ tmp_cf_ids.push_back(i);
+ tmp_options_list.push_back(mutable_cf_options_list[i]);
+ }
+ }
+ Status s = Mock_InstallMemtableAtomicFlushResults(
+ tmp_lists, tmp_cf_ids, tmp_options_list, to_flush, &to_delete);
+ ASSERT_OK(s);
+
+ for (auto i = 0; i != num_cfs; ++i) {
+ for (auto j = 0; j != num_tables_per_cf; ++j) {
+ if (static_cast<uint64_t>(j) <= flush_memtable_ids[i]) {
+ ASSERT_LT(0, tables[i][j]->GetFileNumber());
+ }
+ }
+ ASSERT_EQ(
+ static_cast<size_t>(num_tables_per_cf) - flush_candidates[i].size(),
+ lists[i]->NumNotFlushed());
+ }
+
+ to_delete.clear();
+ for (auto list : lists) {
+ list->current()->Unref(&to_delete);
+ delete list;
+ }
+ for (auto& mutable_cf_options : mutable_cf_options_list) {
+ if (mutable_cf_options != nullptr) {
+ delete mutable_cf_options;
+ mutable_cf_options = nullptr;
+ }
+ }
+ // All memtables in tables array must have been flushed, thus ready to be
+ // deleted.
+ ASSERT_EQ(to_delete.size(), tables.size() * tables.front().size());
+ for (const auto& m : to_delete) {
+ // Refcount should be 0 after calling InstallMemtableFlushResults.
+ // Verify this by Ref'ing and then Unref'ing.
+ m->Ref();
+ ASSERT_EQ(m, m->Unref());
+ delete m;
+ }
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
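FlushPendingTest above leans on the commit-in-creation-order rule spelled out in its comments: TryInstallMemtableFlushResults only installs the oldest consecutive run of flush-completed memtables and stops at the first memtable whose flush has not finished, which is why installing to_flush3 (tables[4]) before to_flush2 (tables[3]) commits nothing. A minimal standalone sketch of that prefix rule, with FakeMemTable as an illustrative stand-in rather than the RocksDB type:

#include <cstddef>
#include <iostream>
#include <vector>

struct FakeMemTable {
  int id;
  bool flush_completed;
};

// Returns how many memtables can be committed, scanning oldest first and
// stopping at the first memtable whose flush has not completed yet.
std::size_t CommittablePrefix(const std::vector<FakeMemTable>& oldest_first) {
  std::size_t n = 0;
  for (const FakeMemTable& m : oldest_first) {
    if (!m.flush_completed) {
      break;
    }
    ++n;
  }
  return n;
}

int main() {
  // tables[0..2] flushed, tables[3] still pending, tables[4] already flushed.
  std::vector<FakeMemTable> imm = {
      {0, true}, {1, true}, {2, true}, {3, false}, {4, true}};
  std::cout << CommittablePrefix(imm) << "\n";  // prints: 3
  imm[3].flush_completed = true;  // tables[3] finishes flushing later
  std::cout << CommittablePrefix(imm) << "\n";  // prints: 5
  return 0;
}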
diff --git a/src/rocksdb/db/merge_context.h b/src/rocksdb/db/merge_context.h
new file mode 100644
index 000000000..8a7b07290
--- /dev/null
+++ b/src/rocksdb/db/merge_context.h
@@ -0,0 +1,147 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#pragma once
+#include <algorithm>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "rocksdb/slice.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+const std::vector<Slice> empty_operand_list;
+
+// The merge context for merging a user key.
+// When doing a Get(), the DB creates an instance of this class and passes it
+// to the memtables and version_set as part of the Get() operation. The
+// operands are then fetched from the context when issuing a partial or full
+// merge.
+class MergeContext {
+ public:
+ // Clear all the operands
+ void Clear() {
+ if (operand_list_) {
+ operand_list_->clear();
+ copied_operands_->clear();
+ }
+ }
+
+ // Push a merge operand
+ void PushOperand(const Slice& operand_slice, bool operand_pinned = false) {
+ Initialize();
+ SetDirectionBackward();
+
+ if (operand_pinned) {
+ operand_list_->push_back(operand_slice);
+ } else {
+ // We need to have our own copy of the operand since it's not pinned
+ copied_operands_->emplace_back(
+ new std::string(operand_slice.data(), operand_slice.size()));
+ operand_list_->push_back(*copied_operands_->back());
+ }
+ }
+
+ // Push back a merge operand
+ void PushOperandBack(const Slice& operand_slice,
+ bool operand_pinned = false) {
+ Initialize();
+ SetDirectionForward();
+
+ if (operand_pinned) {
+ operand_list_->push_back(operand_slice);
+ } else {
+ // We need to have our own copy of the operand since it's not pinned
+ copied_operands_->emplace_back(
+ new std::string(operand_slice.data(), operand_slice.size()));
+ operand_list_->push_back(*copied_operands_->back());
+ }
+ }
+
+ // return total number of operands in the list
+ size_t GetNumOperands() const {
+ if (!operand_list_) {
+ return 0;
+ }
+ return operand_list_->size();
+ }
+
+ // Get the operand at the index.
+ Slice GetOperand(int index) const {
+ assert(operand_list_);
+
+ SetDirectionForward();
+ return (*operand_list_)[index];
+ }
+
+ // Same as GetOperandsDirectionForward
+ //
+ // Note that the returned reference is only good until another call
+ // to this MergeContext. If the returned value is needed for longer,
+ // a copy must be made.
+ const std::vector<Slice>& GetOperands() const {
+ return GetOperandsDirectionForward();
+ }
+
+ // Return all the operands in the order as they were merged (passed to
+ // FullMerge or FullMergeV2)
+ //
+ // Note that the returned reference is only good until another call
+ // to this MergeContext. If the returned value is needed for longer,
+ // a copy must be made.
+ const std::vector<Slice>& GetOperandsDirectionForward() const {
+ if (!operand_list_) {
+ return empty_operand_list;
+ }
+
+ SetDirectionForward();
+ return *operand_list_;
+ }
+
+ // Return all the operands in the reversed order relative to how they were
+ // merged (passed to FullMerge or FullMergeV2)
+ //
+ // Note that the returned reference is only good until another call
+ // to this MergeContext. If the returned value is needed for longer,
+ // a copy must be made.
+ const std::vector<Slice>& GetOperandsDirectionBackward() const {
+ if (!operand_list_) {
+ return empty_operand_list;
+ }
+
+ SetDirectionBackward();
+ return *operand_list_;
+ }
+
+ private:
+ void Initialize() {
+ if (!operand_list_) {
+ operand_list_.reset(new std::vector<Slice>());
+ copied_operands_.reset(new std::vector<std::unique_ptr<std::string>>());
+ }
+ }
+
+ void SetDirectionForward() const {
+ if (operands_reversed_ == true) {
+ std::reverse(operand_list_->begin(), operand_list_->end());
+ operands_reversed_ = false;
+ }
+ }
+
+ void SetDirectionBackward() const {
+ if (operands_reversed_ == false) {
+ std::reverse(operand_list_->begin(), operand_list_->end());
+ operands_reversed_ = true;
+ }
+ }
+
+ // List of operands
+ mutable std::unique_ptr<std::vector<Slice>> operand_list_;
+ // Copy of operands that are not pinned.
+ std::unique_ptr<std::vector<std::unique_ptr<std::string>>> copied_operands_;
+ mutable bool operands_reversed_ = true;
+};
+
+} // namespace ROCKSDB_NAMESPACE
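MergeContext above keeps a single operand vector and flips it lazily: operands are appended in whichever direction they arrive, the mutable operands_reversed_ flag records the current direction, and std::reverse runs only when the caller asks for the opposite order. A minimal standalone sketch of the same lazy-flip idea, assuming operands are pushed newest-first as a Get() traversal sees them; TinyMergeContext uses std::string instead of Slice and is not the RocksDB class:

#include <algorithm>
#include <iostream>
#include <string>
#include <vector>

class TinyMergeContext {
 public:
  // Callers push operands newest-first (the order a Get() traversal sees
  // them), so the stored order is "backward" relative to merge order.
  void PushOperand(std::string operand) {
    SetBackward();
    operands_.push_back(std::move(operand));
  }

  // Operands in merge order (oldest first); reverses in place only if needed.
  const std::vector<std::string>& GetOperandsForward() {
    SetForward();
    return operands_;
  }

 private:
  void SetForward() {
    if (reversed_) {
      std::reverse(operands_.begin(), operands_.end());
      reversed_ = false;
    }
  }
  void SetBackward() {
    if (!reversed_) {
      std::reverse(operands_.begin(), operands_.end());
      reversed_ = true;
    }
  }

  std::vector<std::string> operands_;
  bool reversed_ = true;
};

int main() {
  TinyMergeContext ctx;
  ctx.PushOperand("v3");  // newest operand, seen first by the traversal
  ctx.PushOperand("v2");
  ctx.PushOperand("v1");  // oldest operand, seen last
  for (const std::string& op : ctx.GetOperandsForward()) {
    std::cout << op << " ";  // prints: v1 v2 v3
  }
  std::cout << "\n";
  return 0;
}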
diff --git a/src/rocksdb/db/merge_helper.cc b/src/rocksdb/db/merge_helper.cc
new file mode 100644
index 000000000..6df841012
--- /dev/null
+++ b/src/rocksdb/db/merge_helper.cc
@@ -0,0 +1,583 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/merge_helper.h"
+
+#include <string>
+
+#include "db/blob/blob_fetcher.h"
+#include "db/blob/blob_index.h"
+#include "db/blob/prefetch_buffer_collection.h"
+#include "db/compaction/compaction_iteration_stats.h"
+#include "db/dbformat.h"
+#include "db/wide/wide_column_serialization.h"
+#include "logging/logging.h"
+#include "monitoring/perf_context_imp.h"
+#include "monitoring/statistics.h"
+#include "port/likely.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/db.h"
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/system_clock.h"
+#include "table/format.h"
+#include "table/internal_iterator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+MergeHelper::MergeHelper(Env* env, const Comparator* user_comparator,
+ const MergeOperator* user_merge_operator,
+ const CompactionFilter* compaction_filter,
+ Logger* logger, bool assert_valid_internal_key,
+ SequenceNumber latest_snapshot,
+ const SnapshotChecker* snapshot_checker, int level,
+ Statistics* stats,
+ const std::atomic<bool>* shutting_down)
+ : env_(env),
+ clock_(env->GetSystemClock().get()),
+ user_comparator_(user_comparator),
+ user_merge_operator_(user_merge_operator),
+ compaction_filter_(compaction_filter),
+ shutting_down_(shutting_down),
+ logger_(logger),
+ assert_valid_internal_key_(assert_valid_internal_key),
+ allow_single_operand_(false),
+ latest_snapshot_(latest_snapshot),
+ snapshot_checker_(snapshot_checker),
+ level_(level),
+ keys_(),
+ filter_timer_(clock_),
+ total_filter_time_(0U),
+ stats_(stats) {
+ assert(user_comparator_ != nullptr);
+ if (user_merge_operator_) {
+ allow_single_operand_ = user_merge_operator_->AllowSingleOperand();
+ }
+}
+
+Status MergeHelper::TimedFullMerge(const MergeOperator* merge_operator,
+ const Slice& key, const Slice* value,
+ const std::vector<Slice>& operands,
+ std::string* result, Logger* logger,
+ Statistics* statistics, SystemClock* clock,
+ Slice* result_operand,
+ bool update_num_ops_stats) {
+ assert(merge_operator != nullptr);
+
+ if (operands.empty()) {
+ assert(value != nullptr && result != nullptr);
+ result->assign(value->data(), value->size());
+ return Status::OK();
+ }
+
+ if (update_num_ops_stats) {
+ RecordInHistogram(statistics, READ_NUM_MERGE_OPERANDS,
+ static_cast<uint64_t>(operands.size()));
+ }
+
+ bool success = false;
+ Slice tmp_result_operand(nullptr, 0);
+ const MergeOperator::MergeOperationInput merge_in(key, value, operands,
+ logger);
+ MergeOperator::MergeOperationOutput merge_out(*result, tmp_result_operand);
+ {
+ // Setup to time the merge
+ StopWatchNano timer(clock, statistics != nullptr);
+ PERF_TIMER_GUARD(merge_operator_time_nanos);
+
+ // Do the merge
+ success = merge_operator->FullMergeV2(merge_in, &merge_out);
+
+ if (tmp_result_operand.data()) {
+ // FullMergeV2 result is an existing operand
+ if (result_operand != nullptr) {
+ *result_operand = tmp_result_operand;
+ } else {
+ result->assign(tmp_result_operand.data(), tmp_result_operand.size());
+ }
+ } else if (result_operand) {
+ *result_operand = Slice(nullptr, 0);
+ }
+
+ RecordTick(statistics, MERGE_OPERATION_TOTAL_TIME,
+ statistics ? timer.ElapsedNanos() : 0);
+ }
+
+ if (!success) {
+ RecordTick(statistics, NUMBER_MERGE_FAILURES);
+ return Status::Corruption("Error: Could not perform merge.");
+ }
+
+ return Status::OK();
+}
+
+Status MergeHelper::TimedFullMergeWithEntity(
+ const MergeOperator* merge_operator, const Slice& key, Slice base_entity,
+ const std::vector<Slice>& operands, std::string* result, Logger* logger,
+ Statistics* statistics, SystemClock* clock, bool update_num_ops_stats) {
+ WideColumns base_columns;
+
+ {
+ const Status s =
+ WideColumnSerialization::Deserialize(base_entity, base_columns);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ const bool has_default_column =
+ !base_columns.empty() && base_columns[0].name() == kDefaultWideColumnName;
+
+ Slice value_of_default;
+ if (has_default_column) {
+ value_of_default = base_columns[0].value();
+ }
+
+ std::string merge_result;
+
+ {
+ constexpr Slice* result_operand = nullptr;
+
+ const Status s = TimedFullMerge(
+ merge_operator, key, &value_of_default, operands, &merge_result, logger,
+ statistics, clock, result_operand, update_num_ops_stats);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ if (has_default_column) {
+ base_columns[0].value() = merge_result;
+
+ const Status s = WideColumnSerialization::Serialize(base_columns, *result);
+ if (!s.ok()) {
+ return s;
+ }
+ } else {
+ const Status s =
+ WideColumnSerialization::Serialize(merge_result, base_columns, *result);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ return Status::OK();
+}
+
+// PRE: iter points to the first merge type entry
+// POST: iter points to the first entry beyond the merge process (or the end)
+// keys_, operands_ are updated to reflect the merge result.
+// keys_ stores the list of keys encountered while merging.
+// operands_ stores the list of merge operands encountered while merging.
+// keys_[i] corresponds to operands_[i] for each i.
+//
+// TODO: Avoid the snapshot stripe map lookup in CompactionRangeDelAggregator
+// and just pass the StripeRep corresponding to the stripe being merged.
+Status MergeHelper::MergeUntil(InternalIterator* iter,
+ CompactionRangeDelAggregator* range_del_agg,
+ const SequenceNumber stop_before,
+ const bool at_bottom,
+ const bool allow_data_in_errors,
+ const BlobFetcher* blob_fetcher,
+ const std::string* const full_history_ts_low,
+ PrefetchBufferCollection* prefetch_buffers,
+ CompactionIterationStats* c_iter_stats) {
+ // Get a copy of the internal key, before it's invalidated by iter->Next()
+ // Also maintain the list of merge operands seen.
+ assert(HasOperator());
+ keys_.clear();
+ merge_context_.Clear();
+ has_compaction_filter_skip_until_ = false;
+ assert(user_merge_operator_);
+ assert(user_comparator_);
+ const size_t ts_sz = user_comparator_->timestamp_size();
+ if (full_history_ts_low) {
+ assert(ts_sz > 0);
+ assert(ts_sz == full_history_ts_low->size());
+ }
+ bool first_key = true;
+
+ // We need to parse the internal key again as the parsed key is
+ // backed by the internal key!
+ // Assume no internal key corruption as it has been successfully parsed
+ // by the caller.
+ // original_key_is_iter variable is just caching the information:
+ // original_key_is_iter == (iter->key().ToString() == original_key)
+ bool original_key_is_iter = true;
+ std::string original_key = iter->key().ToString();
+ // Important:
+ // orig_ikey is backed by original_key if keys_.empty()
+ // orig_ikey is backed by keys_.back() if !keys_.empty()
+ ParsedInternalKey orig_ikey;
+
+ Status s = ParseInternalKey(original_key, &orig_ikey, allow_data_in_errors);
+ assert(s.ok());
+ if (!s.ok()) return s;
+
+ assert(kTypeMerge == orig_ikey.type);
+
+ bool hit_the_next_user_key = false;
+ int cmp_with_full_history_ts_low = 0;
+ for (; iter->Valid(); iter->Next(), original_key_is_iter = false) {
+ if (IsShuttingDown()) {
+ s = Status::ShutdownInProgress();
+ return s;
+ }
+
+ ParsedInternalKey ikey;
+ assert(keys_.size() == merge_context_.GetNumOperands());
+
+ Status pik_status =
+ ParseInternalKey(iter->key(), &ikey, allow_data_in_errors);
+ Slice ts;
+ if (pik_status.ok()) {
+ ts = ExtractTimestampFromUserKey(ikey.user_key, ts_sz);
+ if (full_history_ts_low) {
+ cmp_with_full_history_ts_low =
+ user_comparator_->CompareTimestamp(ts, *full_history_ts_low);
+ }
+ }
+ if (!pik_status.ok()) {
+ // stop at corrupted key
+ if (assert_valid_internal_key_) {
+ return pik_status;
+ }
+ break;
+ } else if (first_key) {
+ // If user-defined timestamp is enabled, we expect both user key and
+ // timestamps are equal, as a sanity check.
+ assert(user_comparator_->Equal(ikey.user_key, orig_ikey.user_key));
+ first_key = false;
+ } else if (!user_comparator_->EqualWithoutTimestamp(ikey.user_key,
+ orig_ikey.user_key) ||
+ (ts_sz > 0 &&
+ !user_comparator_->Equal(ikey.user_key, orig_ikey.user_key) &&
+ cmp_with_full_history_ts_low >= 0)) {
+      // Stop right here if we
+      // 1) hit a different user key, or
+      // 2) have user-defined timestamps enabled and hit a version of the user
+      //    key that is NOT eligible for GC.
+ hit_the_next_user_key = true;
+ break;
+ } else if (stop_before > 0 && ikey.sequence <= stop_before &&
+ LIKELY(snapshot_checker_ == nullptr ||
+ snapshot_checker_->CheckInSnapshot(ikey.sequence,
+ stop_before) !=
+ SnapshotCheckerResult::kNotInSnapshot)) {
+ // hit an entry that's possibly visible by the previous snapshot, can't
+ // touch that
+ break;
+ }
+
+ // At this point we are guaranteed that we need to process this key.
+
+ assert(IsValueType(ikey.type));
+ if (ikey.type != kTypeMerge) {
+ // hit a put/delete/single delete
+ // => merge the put value or a nullptr with operands_
+ // => store result in operands_.back() (and update keys_.back())
+ // => change the entry type to kTypeValue for keys_.back()
+ // We are done! Success!
+
+      // If there are no operands, just return Status::OK(). That will cause
+      // the compaction iterator to write out the key we're currently at, which
+      // is the put/delete we just encountered.
+ if (keys_.empty()) {
+ return s;
+ }
+
+ // TODO(noetzli) If the merge operator returns false, we are currently
+ // (almost) silently dropping the put/delete. That's probably not what we
+ // want. Also if we're in compaction and it's a put, it would be nice to
+ // run compaction filter on it.
+ std::string merge_result;
+
+ if (range_del_agg &&
+ range_del_agg->ShouldDelete(
+ ikey, RangeDelPositioningMode::kForwardTraversal)) {
+ s = TimedFullMerge(user_merge_operator_, ikey.user_key, nullptr,
+ merge_context_.GetOperands(), &merge_result, logger_,
+ stats_, clock_,
+ /* result_operand */ nullptr,
+ /* update_num_ops_stats */ false);
+ } else if (ikey.type == kTypeValue) {
+ const Slice val = iter->value();
+
+ s = TimedFullMerge(user_merge_operator_, ikey.user_key, &val,
+ merge_context_.GetOperands(), &merge_result, logger_,
+ stats_, clock_,
+ /* result_operand */ nullptr,
+ /* update_num_ops_stats */ false);
+ } else if (ikey.type == kTypeBlobIndex) {
+ BlobIndex blob_index;
+
+ s = blob_index.DecodeFrom(iter->value());
+ if (!s.ok()) {
+ return s;
+ }
+
+ FilePrefetchBuffer* prefetch_buffer =
+ prefetch_buffers ? prefetch_buffers->GetOrCreatePrefetchBuffer(
+ blob_index.file_number())
+ : nullptr;
+
+ uint64_t bytes_read = 0;
+
+ assert(blob_fetcher);
+
+ PinnableSlice blob_value;
+ s = blob_fetcher->FetchBlob(ikey.user_key, blob_index, prefetch_buffer,
+ &blob_value, &bytes_read);
+ if (!s.ok()) {
+ return s;
+ }
+
+ if (c_iter_stats) {
+ ++c_iter_stats->num_blobs_read;
+ c_iter_stats->total_blob_bytes_read += bytes_read;
+ }
+
+ s = TimedFullMerge(user_merge_operator_, ikey.user_key, &blob_value,
+ merge_context_.GetOperands(), &merge_result, logger_,
+ stats_, clock_,
+ /* result_operand */ nullptr,
+ /* update_num_ops_stats */ false);
+ } else if (ikey.type == kTypeWideColumnEntity) {
+ s = TimedFullMergeWithEntity(
+ user_merge_operator_, ikey.user_key, iter->value(),
+ merge_context_.GetOperands(), &merge_result, logger_, stats_,
+ clock_, /* update_num_ops_stats */ false);
+ } else {
+ s = TimedFullMerge(user_merge_operator_, ikey.user_key, nullptr,
+ merge_context_.GetOperands(), &merge_result, logger_,
+ stats_, clock_,
+ /* result_operand */ nullptr,
+ /* update_num_ops_stats */ false);
+ }
+
+ // We store the result in keys_.back() and operands_.back()
+ // if nothing went wrong (i.e.: no operand corruption on disk)
+ if (s.ok()) {
+ // The original key encountered
+ original_key = std::move(keys_.back());
+ orig_ikey.type = ikey.type == kTypeWideColumnEntity
+ ? kTypeWideColumnEntity
+ : kTypeValue;
+ UpdateInternalKey(&original_key, orig_ikey.sequence, orig_ikey.type);
+ keys_.clear();
+ merge_context_.Clear();
+ keys_.emplace_front(std::move(original_key));
+ merge_context_.PushOperand(merge_result);
+ }
+
+ // move iter to the next entry
+ iter->Next();
+ return s;
+ } else {
+ // hit a merge
+ // => if there is a compaction filter, apply it.
+ // => check for range tombstones covering the operand
+ // => merge the operand into the front of the operands_ list
+ // if not filtered
+ // => then continue because we haven't yet seen a Put/Delete.
+ //
+      // Keep queuing keys and operands until we either meet a put/delete
+      // request or reach the point where we attempt a partial merge.
+
+ Slice value_slice = iter->value();
+ // add an operand to the list if:
+ // 1) it's included in one of the snapshots. in that case we *must* write
+ // it out, no matter what compaction filter says
+ // 2) it's not filtered by a compaction filter
+ CompactionFilter::Decision filter =
+ ikey.sequence <= latest_snapshot_
+ ? CompactionFilter::Decision::kKeep
+ : FilterMerge(orig_ikey.user_key, value_slice);
+ if (filter != CompactionFilter::Decision::kRemoveAndSkipUntil &&
+ range_del_agg != nullptr &&
+ range_del_agg->ShouldDelete(
+ iter->key(), RangeDelPositioningMode::kForwardTraversal)) {
+ filter = CompactionFilter::Decision::kRemove;
+ }
+ if (filter == CompactionFilter::Decision::kKeep ||
+ filter == CompactionFilter::Decision::kChangeValue) {
+ if (original_key_is_iter) {
+ // this is just an optimization that saves us one memcpy
+ keys_.emplace_front(original_key);
+ } else {
+ keys_.emplace_front(iter->key().ToString());
+ }
+ if (keys_.size() == 1) {
+ // we need to re-anchor the orig_ikey because it was anchored by
+ // original_key before
+ pik_status =
+ ParseInternalKey(keys_.back(), &orig_ikey, allow_data_in_errors);
+ pik_status.PermitUncheckedError();
+ assert(pik_status.ok());
+ }
+ if (filter == CompactionFilter::Decision::kKeep) {
+ merge_context_.PushOperand(
+ value_slice, iter->IsValuePinned() /* operand_pinned */);
+ } else {
+ assert(filter == CompactionFilter::Decision::kChangeValue);
+ // Compaction filter asked us to change the operand from value_slice
+ // to compaction_filter_value_.
+ merge_context_.PushOperand(compaction_filter_value_, false);
+ }
+ } else if (filter == CompactionFilter::Decision::kRemoveAndSkipUntil) {
+ // Compaction filter asked us to remove this key altogether
+ // (not just this operand), along with some keys following it.
+ keys_.clear();
+ merge_context_.Clear();
+ has_compaction_filter_skip_until_ = true;
+ return s;
+ }
+ }
+ }
+
+ if (cmp_with_full_history_ts_low >= 0) {
+ size_t num_merge_operands = merge_context_.GetNumOperands();
+ if (ts_sz && num_merge_operands > 1) {
+ // We do not merge merge operands with different timestamps if they are
+ // not eligible for GC.
+      ROCKS_LOG_ERROR(logger_, "ts_sz=%d, %d merge operands",
+ static_cast<int>(ts_sz),
+ static_cast<int>(num_merge_operands));
+ assert(false);
+ }
+ }
+
+ if (merge_context_.GetNumOperands() == 0) {
+ // we filtered out all the merge operands
+ return s;
+ }
+
+ // We are sure we have seen this key's entire history if:
+  // at_bottom == true (this does not necessarily mean it is the bottommost
+  // layer, but rather that we are confident the key does not appear on any of
+  // the lower layers; at_bottom == false doesn't mean it does appear, just
+  // that we can't be sure; see Compaction::IsBottommostLevel for details)
+ // AND
+ // we have either encountered another key or end of key history on this
+ // layer.
+ // Note that if user-defined timestamp is enabled, we need some extra caution
+ // here: if full_history_ts_low is nullptr, or it's not null but the key's
+ // timestamp is greater than or equal to full_history_ts_low, it means this
+ // key cannot be dropped. We may not have seen the beginning of the key.
+ //
+ // When these conditions are true we are able to merge all the keys
+ // using full merge.
+ //
+ // For these cases we are not sure about, we simply miss the opportunity
+ // to combine the keys. Since VersionSet::SetupOtherInputs() always makes
+ // sure that all merge-operands on the same level get compacted together,
+ // this will simply lead to these merge operands moving to the next level.
+ bool surely_seen_the_beginning =
+ (hit_the_next_user_key || !iter->Valid()) && at_bottom &&
+ (ts_sz == 0 || cmp_with_full_history_ts_low < 0);
+ if (surely_seen_the_beginning) {
+ // do a final merge with nullptr as the existing value and say
+ // bye to the merge type (it's now converted to a Put)
+ assert(kTypeMerge == orig_ikey.type);
+ assert(merge_context_.GetNumOperands() >= 1);
+ assert(merge_context_.GetNumOperands() == keys_.size());
+ std::string merge_result;
+ s = TimedFullMerge(
+ user_merge_operator_, orig_ikey.user_key, nullptr,
+ merge_context_.GetOperands(), &merge_result, logger_, stats_, clock_,
+ /* result_operand */ nullptr, /* update_num_ops_stats */ false);
+ if (s.ok()) {
+ // The original key encountered
+    // We are certain that keys_ is not empty here (see the assertions a
+    // couple of lines above).
+ original_key = std::move(keys_.back());
+ orig_ikey.type = kTypeValue;
+ UpdateInternalKey(&original_key, orig_ikey.sequence, orig_ikey.type);
+ keys_.clear();
+ merge_context_.Clear();
+ keys_.emplace_front(std::move(original_key));
+ merge_context_.PushOperand(merge_result);
+ }
+ } else {
+ // We haven't seen the beginning of the key nor a Put/Delete.
+ // Attempt to use the user's associative merge function to
+ // merge the stacked merge operands into a single operand.
+ s = Status::MergeInProgress();
+ if (merge_context_.GetNumOperands() >= 2 ||
+ (allow_single_operand_ && merge_context_.GetNumOperands() == 1)) {
+ bool merge_success = false;
+ std::string merge_result;
+ {
+ StopWatchNano timer(clock_, stats_ != nullptr);
+ PERF_TIMER_GUARD(merge_operator_time_nanos);
+ merge_success = user_merge_operator_->PartialMergeMulti(
+ orig_ikey.user_key,
+ std::deque<Slice>(merge_context_.GetOperands().begin(),
+ merge_context_.GetOperands().end()),
+ &merge_result, logger_);
+ RecordTick(stats_, MERGE_OPERATION_TOTAL_TIME,
+ stats_ ? timer.ElapsedNanosSafe() : 0);
+ }
+ if (merge_success) {
+ // Merging of operands (associative merge) was successful.
+ // Replace operands with the merge result
+ merge_context_.Clear();
+ merge_context_.PushOperand(merge_result);
+ keys_.erase(keys_.begin(), keys_.end() - 1);
+ }
+ }
+ }
+
+ return s;
+}
+
+MergeOutputIterator::MergeOutputIterator(const MergeHelper* merge_helper)
+ : merge_helper_(merge_helper) {
+ it_keys_ = merge_helper_->keys().rend();
+ it_values_ = merge_helper_->values().rend();
+}
+
+void MergeOutputIterator::SeekToFirst() {
+ const auto& keys = merge_helper_->keys();
+ const auto& values = merge_helper_->values();
+ assert(keys.size() == values.size());
+ it_keys_ = keys.rbegin();
+ it_values_ = values.rbegin();
+}
+
+void MergeOutputIterator::Next() {
+ ++it_keys_;
+ ++it_values_;
+}
+
+CompactionFilter::Decision MergeHelper::FilterMerge(const Slice& user_key,
+ const Slice& value_slice) {
+ if (compaction_filter_ == nullptr) {
+ return CompactionFilter::Decision::kKeep;
+ }
+ if (stats_ != nullptr && ShouldReportDetailedTime(env_, stats_)) {
+ filter_timer_.Start();
+ }
+ compaction_filter_value_.clear();
+ compaction_filter_skip_until_.Clear();
+ auto ret = compaction_filter_->FilterV2(
+ level_, user_key, CompactionFilter::ValueType::kMergeOperand, value_slice,
+ &compaction_filter_value_, compaction_filter_skip_until_.rep());
+ if (ret == CompactionFilter::Decision::kRemoveAndSkipUntil) {
+ if (user_comparator_->Compare(*compaction_filter_skip_until_.rep(),
+ user_key) <= 0) {
+ // Invalid skip_until returned from compaction filter.
+ // Keep the key as per FilterV2 documentation.
+ ret = CompactionFilter::Decision::kKeep;
+ } else {
+ compaction_filter_skip_until_.ConvertFromUserKey(kMaxSequenceNumber,
+ kValueTypeForSeek);
+ }
+ }
+ if (stats_ != nullptr && ShouldReportDetailedTime(env_, stats_)) {
+ total_filter_time_ += filter_timer_.ElapsedNanosSafe();
+ }
+ return ret;
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/merge_helper.h b/src/rocksdb/db/merge_helper.h
new file mode 100644
index 000000000..790ec6239
--- /dev/null
+++ b/src/rocksdb/db/merge_helper.h
@@ -0,0 +1,216 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#pragma once
+
+#include <deque>
+#include <string>
+#include <vector>
+
+#include "db/merge_context.h"
+#include "db/range_del_aggregator.h"
+#include "db/snapshot_checker.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/env.h"
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/wide_columns.h"
+#include "util/stop_watch.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Comparator;
+class Iterator;
+class Logger;
+class MergeOperator;
+class Statistics;
+class SystemClock;
+class BlobFetcher;
+class PrefetchBufferCollection;
+struct CompactionIterationStats;
+
+class MergeHelper {
+ public:
+ MergeHelper(Env* env, const Comparator* user_comparator,
+ const MergeOperator* user_merge_operator,
+ const CompactionFilter* compaction_filter, Logger* logger,
+ bool assert_valid_internal_key, SequenceNumber latest_snapshot,
+ const SnapshotChecker* snapshot_checker = nullptr, int level = 0,
+ Statistics* stats = nullptr,
+ const std::atomic<bool>* shutting_down = nullptr);
+
+ // Wrapper around MergeOperator::FullMergeV2() that records perf statistics.
+ // Result of merge will be written to result if status returned is OK.
+ // If operands is empty, the value will simply be copied to result.
+  // Set `update_num_ops_stats` to true if the merge is performed on behalf of
+  // a user read, so that the number-of-merge-operands statistics are updated.
+ // Returns one of the following statuses:
+ // - OK: Entries were successfully merged.
+ // - Corruption: Merge operator reported unsuccessful merge.
+ static Status TimedFullMerge(const MergeOperator* merge_operator,
+ const Slice& key, const Slice* value,
+ const std::vector<Slice>& operands,
+ std::string* result, Logger* logger,
+ Statistics* statistics, SystemClock* clock,
+ Slice* result_operand,
+ bool update_num_ops_stats);
+
+ static Status TimedFullMergeWithEntity(
+ const MergeOperator* merge_operator, const Slice& key, Slice base_entity,
+ const std::vector<Slice>& operands, std::string* result, Logger* logger,
+ Statistics* statistics, SystemClock* clock, bool update_num_ops_stats);
+
+ // During compaction, merge entries until we hit
+ // - a corrupted key
+ // - a Put/Delete,
+ // - a different user key,
+ // - a specific sequence number (snapshot boundary),
+ // - REMOVE_AND_SKIP_UNTIL returned from compaction filter,
+ // or - the end of iteration
+ // iter: (IN) points to the first merge type entry
+ // (OUT) points to the first entry not included in the merge process
+ // range_del_agg: (IN) filters merge operands covered by range tombstones.
+ // stop_before: (IN) a sequence number that merge should not cross.
+ // 0 means no restriction
+  // at_bottom: (IN) true if the iterator covers the bottom level, which means
+ // we could reach the start of the history of this user key.
+ // allow_data_in_errors: (IN) if true, data details will be displayed in
+ // error/log messages.
+ // blob_fetcher: (IN) blob fetcher object for the compaction's input version.
+ // prefetch_buffers: (IN/OUT) a collection of blob file prefetch buffers
+ // used for compaction readahead.
+ // c_iter_stats: (OUT) compaction iteration statistics.
+ //
+ // Returns one of the following statuses:
+ // - OK: Entries were successfully merged.
+ // - MergeInProgress: Put/Delete not encountered, and didn't reach the start
+ // of key's history. Output consists of merge operands only.
+ // - Corruption: Merge operator reported unsuccessful merge or a corrupted
+ // key has been encountered and not expected (applies only when compiling
+ // with asserts removed).
+ // - ShutdownInProgress: interrupted by shutdown (*shutting_down == true).
+ //
+ // REQUIRED: The first key in the input is not corrupted.
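+  //
+  // Illustrative call (a sketch only; the variable names and argument values
+  // shown are assumptions for the example, not the actual compaction code):
+  //
+  //   Status s = merge.MergeUntil(input_iter, range_del_agg,
+  //                               /* stop_before */ 0, /* at_bottom */ true,
+  //                               /* allow_data_in_errors */ false,
+  //                               /* blob_fetcher */ nullptr,
+  //                               /* full_history_ts_low */ nullptr,
+  //                               /* prefetch_buffers */ nullptr,
+  //                               /* c_iter_stats */ nullptr);
+  //   if (s.ok() || s.IsMergeInProgress()) {
+  //     // consume merge.keys() / merge.values() (see below)
+  //   }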
+ Status MergeUntil(InternalIterator* iter,
+ CompactionRangeDelAggregator* range_del_agg,
+ const SequenceNumber stop_before, const bool at_bottom,
+ const bool allow_data_in_errors,
+ const BlobFetcher* blob_fetcher,
+ const std::string* const full_history_ts_low,
+ PrefetchBufferCollection* prefetch_buffers,
+ CompactionIterationStats* c_iter_stats);
+
+ // Filters a merge operand using the compaction filter specified
+ // in the constructor. Returns the decision that the filter made.
+ // Uses compaction_filter_value_ and compaction_filter_skip_until_ for the
+ // optional outputs of compaction filter.
+ // user_key includes timestamp if user-defined timestamp is enabled.
+ CompactionFilter::Decision FilterMerge(const Slice& user_key,
+ const Slice& value_slice);
+
+ // Query the merge result
+ // These are valid until the next MergeUntil call
+ // If the merging was successful:
+ // - keys() contains a single element with the latest sequence number of
+ // the merges. The type will be Put or Merge. See IMPORTANT 1 note, below.
+ // - values() contains a single element with the result of merging all the
+ // operands together
+ //
+ // IMPORTANT 1: the key type could change after the MergeUntil call.
+ // Put/Delete + Merge + ... + Merge => Put
+ // Merge + ... + Merge => Merge
+ //
+ // If the merge operator is not associative, and if a Put/Delete is not found
+ // then the merging will be unsuccessful. In this case:
+ // - keys() contains the list of internal keys seen in order of iteration.
+ // - values() contains the list of values (merges) seen in the same order.
+ // values() is parallel to keys() so that the first entry in
+ // keys() is the key associated with the first entry in values()
+ // and so on. These lists will be the same length.
+ // All of these pairs will be merges over the same user key.
+ // See IMPORTANT 2 note below.
+ //
+ // IMPORTANT 2: The entries were traversed in order from BACK to FRONT.
+ // So keys().back() was the first key seen by iterator.
+ // TODO: Re-style this comment to be like the first one
+ const std::deque<std::string>& keys() const { return keys_; }
+ const std::vector<Slice>& values() const {
+ return merge_context_.GetOperands();
+ }
+ uint64_t TotalFilterTime() const { return total_filter_time_; }
+ bool HasOperator() const { return user_merge_operator_ != nullptr; }
+
+ // If compaction filter returned REMOVE_AND_SKIP_UNTIL, this method will
+ // return true and fill *until with the key to which we should skip.
+ // If true, keys() and values() are empty.
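+  //
+  // Typical caller pattern (a sketch; `merge` and the skipping logic are
+  // illustrative only):
+  //   Slice skip_until;
+  //   if (merge.FilteredUntil(&skip_until)) {
+  //     // advance the compaction input past `skip_until`
+  //   }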
+ bool FilteredUntil(Slice* skip_until) const {
+ if (!has_compaction_filter_skip_until_) {
+ return false;
+ }
+ assert(compaction_filter_ != nullptr);
+ assert(skip_until != nullptr);
+ assert(compaction_filter_skip_until_.Valid());
+ *skip_until = compaction_filter_skip_until_.Encode();
+ return true;
+ }
+
+ private:
+ Env* env_;
+ SystemClock* clock_;
+ const Comparator* user_comparator_;
+ const MergeOperator* user_merge_operator_;
+ const CompactionFilter* compaction_filter_;
+ const std::atomic<bool>* shutting_down_;
+ Logger* logger_;
+ bool assert_valid_internal_key_; // enforce no internal key corruption?
+ bool allow_single_operand_;
+ SequenceNumber latest_snapshot_;
+ const SnapshotChecker* const snapshot_checker_;
+ int level_;
+
+ // the scratch area that holds the result of MergeUntil
+ // valid up to the next MergeUntil call
+
+ // Keeps track of the sequence of keys seen
+ std::deque<std::string> keys_;
+ // Parallel with keys_; stores the operands
+ mutable MergeContext merge_context_;
+
+ StopWatchNano filter_timer_;
+ uint64_t total_filter_time_;
+ Statistics* stats_;
+
+ bool has_compaction_filter_skip_until_ = false;
+ std::string compaction_filter_value_;
+ InternalKey compaction_filter_skip_until_;
+
+ bool IsShuttingDown() {
+ // This is a best-effort facility, so memory_order_relaxed is sufficient.
+ return shutting_down_ && shutting_down_->load(std::memory_order_relaxed);
+ }
+};
+
+// MergeOutputIterator can be used to iterate over the result of a merge.
+class MergeOutputIterator {
+ public:
+ // The MergeOutputIterator is bound to a MergeHelper instance.
+ explicit MergeOutputIterator(const MergeHelper* merge_helper);
+
+ // Seeks to the first record in the output.
+ void SeekToFirst();
+ // Advances to the next record in the output.
+ void Next();
+
+ Slice key() { return Slice(*it_keys_); }
+ Slice value() { return Slice(*it_values_); }
+ bool Valid() { return it_keys_ != merge_helper_->keys().rend(); }
+
+ private:
+ const MergeHelper* merge_helper_;
+ std::deque<std::string>::const_reverse_iterator it_keys_;
+ std::vector<Slice>::const_reverse_iterator it_values_;
+};
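+
+// Example usage (an illustrative sketch; `helper` is assumed to be a
+// MergeHelper on which MergeUntil() has already been called):
+//
+//   MergeOutputIterator out(&helper);
+//   for (out.SeekToFirst(); out.Valid(); out.Next()) {
+//     // Entries come out in the order they were first seen, i.e. highest
+//     // sequence number first.
+//     Slice k = out.key();
+//     Slice v = out.value();
+//   }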
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/merge_helper_test.cc b/src/rocksdb/db/merge_helper_test.cc
new file mode 100644
index 000000000..05408d5b9
--- /dev/null
+++ b/src/rocksdb/db/merge_helper_test.cc
@@ -0,0 +1,298 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/merge_helper.h"
+
+#include <algorithm>
+#include <string>
+#include <vector>
+
+#include "db/dbformat.h"
+#include "rocksdb/comparator.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/coding.h"
+#include "util/vector_iterator.h"
+#include "utilities/merge_operators.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class MergeHelperTest : public testing::Test {
+ public:
+ MergeHelperTest() : icmp_(BytewiseComparator()) { env_ = Env::Default(); }
+
+ ~MergeHelperTest() override = default;
+
+ Status Run(SequenceNumber stop_before, bool at_bottom,
+ SequenceNumber latest_snapshot = 0) {
+ iter_.reset(new VectorIterator(ks_, vs_, &icmp_));
+ iter_->SeekToFirst();
+ merge_helper_.reset(new MergeHelper(env_, icmp_.user_comparator(),
+ merge_op_.get(), filter_.get(), nullptr,
+ false, latest_snapshot));
+ return merge_helper_->MergeUntil(
+ iter_.get(), nullptr /* range_del_agg */, stop_before, at_bottom,
+ false /* allow_data_in_errors */, nullptr /* blob_fetcher */,
+ nullptr /* full_history_ts_low */, nullptr /* prefetch_buffers */,
+ nullptr /* c_iter_stats */);
+ }
+
+ void AddKeyVal(const std::string& user_key, const SequenceNumber& seq,
+ const ValueType& t, const std::string& val,
+ bool corrupt = false) {
+ InternalKey ikey(user_key, seq, t);
+ if (corrupt) {
+ test::CorruptKeyType(&ikey);
+ }
+ ks_.push_back(ikey.Encode().ToString());
+ vs_.push_back(val);
+ }
+
+ Env* env_;
+ InternalKeyComparator icmp_;
+ std::unique_ptr<VectorIterator> iter_;
+ std::shared_ptr<MergeOperator> merge_op_;
+ std::unique_ptr<MergeHelper> merge_helper_;
+ std::vector<std::string> ks_;
+ std::vector<std::string> vs_;
+ std::unique_ptr<test::FilterNumber> filter_;
+};
+
+// If MergeHelper encounters a new key on the last level, we know that
+// the key has no more history and the operands can be fully merged.
+TEST_F(MergeHelperTest, MergeAtBottomSuccess) {
+ merge_op_ = MergeOperators::CreateUInt64AddOperator();
+
+ AddKeyVal("a", 20, kTypeMerge, test::EncodeInt(1U));
+ AddKeyVal("a", 10, kTypeMerge, test::EncodeInt(3U));
+ AddKeyVal("b", 10, kTypeMerge, test::EncodeInt(4U)); // <- iter_ after merge
+
+ ASSERT_TRUE(Run(0, true).ok());
+ ASSERT_EQ(ks_[2], iter_->key());
+ ASSERT_EQ(test::KeyStr("a", 20, kTypeValue), merge_helper_->keys()[0]);
+ ASSERT_EQ(test::EncodeInt(4U), merge_helper_->values()[0]);
+ ASSERT_EQ(1U, merge_helper_->keys().size());
+ ASSERT_EQ(1U, merge_helper_->values().size());
+}
+
+// Merging with a value results in a successful merge.
+TEST_F(MergeHelperTest, MergeValue) {
+ merge_op_ = MergeOperators::CreateUInt64AddOperator();
+
+ AddKeyVal("a", 40, kTypeMerge, test::EncodeInt(1U));
+ AddKeyVal("a", 30, kTypeMerge, test::EncodeInt(3U));
+ AddKeyVal("a", 20, kTypeValue, test::EncodeInt(4U)); // <- iter_ after merge
+ AddKeyVal("a", 10, kTypeMerge, test::EncodeInt(1U));
+
+ ASSERT_TRUE(Run(0, false).ok());
+ ASSERT_EQ(ks_[3], iter_->key());
+ ASSERT_EQ(test::KeyStr("a", 40, kTypeValue), merge_helper_->keys()[0]);
+ ASSERT_EQ(test::EncodeInt(8U), merge_helper_->values()[0]);
+ ASSERT_EQ(1U, merge_helper_->keys().size());
+ ASSERT_EQ(1U, merge_helper_->values().size());
+}
+
+// Merging stops before a snapshot.
+TEST_F(MergeHelperTest, SnapshotBeforeValue) {
+ merge_op_ = MergeOperators::CreateUInt64AddOperator();
+
+ AddKeyVal("a", 50, kTypeMerge, test::EncodeInt(1U));
+ AddKeyVal("a", 40, kTypeMerge, test::EncodeInt(3U)); // <- iter_ after merge
+ AddKeyVal("a", 30, kTypeMerge, test::EncodeInt(1U));
+ AddKeyVal("a", 20, kTypeValue, test::EncodeInt(4U));
+ AddKeyVal("a", 10, kTypeMerge, test::EncodeInt(1U));
+
+ ASSERT_TRUE(Run(31, true).IsMergeInProgress());
+ ASSERT_EQ(ks_[2], iter_->key());
+ ASSERT_EQ(test::KeyStr("a", 50, kTypeMerge), merge_helper_->keys()[0]);
+ ASSERT_EQ(test::EncodeInt(4U), merge_helper_->values()[0]);
+ ASSERT_EQ(1U, merge_helper_->keys().size());
+ ASSERT_EQ(1U, merge_helper_->values().size());
+}
+
+// MergeHelper preserves the operand stack for merge operators that
+// cannot do a partial merge.
+TEST_F(MergeHelperTest, NoPartialMerge) {
+ merge_op_ = MergeOperators::CreateStringAppendTESTOperator();
+
+ AddKeyVal("a", 50, kTypeMerge, "v2");
+ AddKeyVal("a", 40, kTypeMerge, "v"); // <- iter_ after merge
+ AddKeyVal("a", 30, kTypeMerge, "v");
+
+ ASSERT_TRUE(Run(31, true).IsMergeInProgress());
+ ASSERT_EQ(ks_[2], iter_->key());
+ ASSERT_EQ(test::KeyStr("a", 40, kTypeMerge), merge_helper_->keys()[0]);
+ ASSERT_EQ("v", merge_helper_->values()[0]);
+ ASSERT_EQ(test::KeyStr("a", 50, kTypeMerge), merge_helper_->keys()[1]);
+ ASSERT_EQ("v2", merge_helper_->values()[1]);
+ ASSERT_EQ(2U, merge_helper_->keys().size());
+ ASSERT_EQ(2U, merge_helper_->values().size());
+}
+
+// A single operand cannot be merged.
+TEST_F(MergeHelperTest, SingleOperand) {
+ merge_op_ = MergeOperators::CreateUInt64AddOperator();
+
+ AddKeyVal("a", 50, kTypeMerge, test::EncodeInt(1U));
+
+ ASSERT_TRUE(Run(31, false).IsMergeInProgress());
+ ASSERT_FALSE(iter_->Valid());
+ ASSERT_EQ(test::KeyStr("a", 50, kTypeMerge), merge_helper_->keys()[0]);
+ ASSERT_EQ(test::EncodeInt(1U), merge_helper_->values()[0]);
+ ASSERT_EQ(1U, merge_helper_->keys().size());
+ ASSERT_EQ(1U, merge_helper_->values().size());
+}
+
+// Merging with a deletion turns the deletion into a value
+TEST_F(MergeHelperTest, MergeDeletion) {
+ merge_op_ = MergeOperators::CreateUInt64AddOperator();
+
+ AddKeyVal("a", 30, kTypeMerge, test::EncodeInt(3U));
+ AddKeyVal("a", 20, kTypeDeletion, "");
+
+ ASSERT_TRUE(Run(15, false).ok());
+ ASSERT_FALSE(iter_->Valid());
+ ASSERT_EQ(test::KeyStr("a", 30, kTypeValue), merge_helper_->keys()[0]);
+ ASSERT_EQ(test::EncodeInt(3U), merge_helper_->values()[0]);
+ ASSERT_EQ(1U, merge_helper_->keys().size());
+ ASSERT_EQ(1U, merge_helper_->values().size());
+}
+
+// The merge helper stops upon encountering a corrupt key
+TEST_F(MergeHelperTest, CorruptKey) {
+ merge_op_ = MergeOperators::CreateUInt64AddOperator();
+
+ AddKeyVal("a", 30, kTypeMerge, test::EncodeInt(3U));
+ AddKeyVal("a", 25, kTypeMerge, test::EncodeInt(1U));
+ // Corrupt key
+ AddKeyVal("a", 20, kTypeDeletion, "", true); // <- iter_ after merge
+
+ ASSERT_TRUE(Run(15, false).IsMergeInProgress());
+ ASSERT_EQ(ks_[2], iter_->key());
+ ASSERT_EQ(test::KeyStr("a", 30, kTypeMerge), merge_helper_->keys()[0]);
+ ASSERT_EQ(test::EncodeInt(4U), merge_helper_->values()[0]);
+ ASSERT_EQ(1U, merge_helper_->keys().size());
+ ASSERT_EQ(1U, merge_helper_->values().size());
+}
+
+// The compaction filter is called on every merge operand
+TEST_F(MergeHelperTest, FilterMergeOperands) {
+ merge_op_ = MergeOperators::CreateUInt64AddOperator();
+ filter_.reset(new test::FilterNumber(5U));
+
+ AddKeyVal("a", 30, kTypeMerge, test::EncodeInt(3U));
+ AddKeyVal("a", 29, kTypeMerge, test::EncodeInt(5U)); // Filtered
+ AddKeyVal("a", 28, kTypeMerge, test::EncodeInt(3U));
+ AddKeyVal("a", 27, kTypeMerge, test::EncodeInt(1U));
+ AddKeyVal("a", 26, kTypeMerge, test::EncodeInt(5U)); // Filtered
+ AddKeyVal("a", 25, kTypeValue, test::EncodeInt(1U));
+
+ ASSERT_TRUE(Run(15, false).ok());
+ ASSERT_FALSE(iter_->Valid());
+ MergeOutputIterator merge_output_iter(merge_helper_.get());
+ merge_output_iter.SeekToFirst();
+ ASSERT_EQ(test::KeyStr("a", 30, kTypeValue),
+ merge_output_iter.key().ToString());
+ ASSERT_EQ(test::EncodeInt(8U), merge_output_iter.value().ToString());
+ merge_output_iter.Next();
+ ASSERT_FALSE(merge_output_iter.Valid());
+}
+
+TEST_F(MergeHelperTest, FilterAllMergeOperands) {
+ merge_op_ = MergeOperators::CreateUInt64AddOperator();
+ filter_.reset(new test::FilterNumber(5U));
+
+ AddKeyVal("a", 30, kTypeMerge, test::EncodeInt(5U));
+ AddKeyVal("a", 29, kTypeMerge, test::EncodeInt(5U));
+ AddKeyVal("a", 28, kTypeMerge, test::EncodeInt(5U));
+ AddKeyVal("a", 27, kTypeMerge, test::EncodeInt(5U));
+ AddKeyVal("a", 26, kTypeMerge, test::EncodeInt(5U));
+ AddKeyVal("a", 25, kTypeMerge, test::EncodeInt(5U));
+
+ // filtered out all
+ ASSERT_TRUE(Run(15, false).ok());
+ ASSERT_FALSE(iter_->Valid());
+ MergeOutputIterator merge_output_iter(merge_helper_.get());
+ merge_output_iter.SeekToFirst();
+ ASSERT_FALSE(merge_output_iter.Valid());
+
+ // we have one operand that will survive because it's a delete
+ AddKeyVal("a", 24, kTypeDeletion, test::EncodeInt(5U));
+ AddKeyVal("b", 23, kTypeValue, test::EncodeInt(5U));
+ ASSERT_TRUE(Run(15, true).ok());
+ merge_output_iter = MergeOutputIterator(merge_helper_.get());
+ ASSERT_TRUE(iter_->Valid());
+ merge_output_iter.SeekToFirst();
+ ASSERT_FALSE(merge_output_iter.Valid());
+
+ // when all merge operands are filtered out, we leave the iterator pointing to
+ // the Put/Delete that survived
+ ASSERT_EQ(test::KeyStr("a", 24, kTypeDeletion), iter_->key().ToString());
+ ASSERT_EQ(test::EncodeInt(5U), iter_->value().ToString());
+}
+
+// Make sure that merge operands are filtered at the beginning
+TEST_F(MergeHelperTest, FilterFirstMergeOperand) {
+ merge_op_ = MergeOperators::CreateUInt64AddOperator();
+ filter_.reset(new test::FilterNumber(5U));
+
+ AddKeyVal("a", 31, kTypeMerge, test::EncodeInt(5U)); // Filtered
+ AddKeyVal("a", 30, kTypeMerge, test::EncodeInt(5U)); // Filtered
+ AddKeyVal("a", 29, kTypeMerge, test::EncodeInt(2U));
+ AddKeyVal("a", 28, kTypeMerge, test::EncodeInt(1U));
+ AddKeyVal("a", 27, kTypeMerge, test::EncodeInt(3U));
+ AddKeyVal("a", 26, kTypeMerge, test::EncodeInt(5U)); // Filtered
+ AddKeyVal("a", 25, kTypeMerge, test::EncodeInt(5U)); // Filtered
+ AddKeyVal("b", 24, kTypeValue, test::EncodeInt(5U)); // next user key
+
+ ASSERT_OK(Run(15, true));
+ ASSERT_TRUE(iter_->Valid());
+ MergeOutputIterator merge_output_iter(merge_helper_.get());
+ merge_output_iter.SeekToFirst();
+ // sequence number is 29 here, because the first merge operand got filtered
+ // out
+ ASSERT_EQ(test::KeyStr("a", 29, kTypeValue),
+ merge_output_iter.key().ToString());
+ ASSERT_EQ(test::EncodeInt(6U), merge_output_iter.value().ToString());
+ merge_output_iter.Next();
+ ASSERT_FALSE(merge_output_iter.Valid());
+
+ // make sure that we're passing user keys into the filter
+ ASSERT_EQ("a", filter_->last_merge_operand_key());
+}
+
+// Make sure that merge operands are not filtered out if there's a snapshot
+// pointing at them
+TEST_F(MergeHelperTest, DontFilterMergeOperandsBeforeSnapshotTest) {
+ merge_op_ = MergeOperators::CreateUInt64AddOperator();
+ filter_.reset(new test::FilterNumber(5U));
+
+ AddKeyVal("a", 31, kTypeMerge, test::EncodeInt(5U));
+ AddKeyVal("a", 30, kTypeMerge, test::EncodeInt(5U));
+ AddKeyVal("a", 29, kTypeMerge, test::EncodeInt(2U));
+ AddKeyVal("a", 28, kTypeMerge, test::EncodeInt(1U));
+ AddKeyVal("a", 27, kTypeMerge, test::EncodeInt(3U));
+ AddKeyVal("a", 26, kTypeMerge, test::EncodeInt(5U));
+ AddKeyVal("a", 25, kTypeMerge, test::EncodeInt(5U));
+ AddKeyVal("b", 24, kTypeValue, test::EncodeInt(5U));
+
+ ASSERT_OK(Run(15, true, 32));
+ ASSERT_TRUE(iter_->Valid());
+ MergeOutputIterator merge_output_iter(merge_helper_.get());
+ merge_output_iter.SeekToFirst();
+ ASSERT_EQ(test::KeyStr("a", 31, kTypeValue),
+ merge_output_iter.key().ToString());
+ ASSERT_EQ(test::EncodeInt(26U), merge_output_iter.value().ToString());
+ merge_output_iter.Next();
+ ASSERT_FALSE(merge_output_iter.Valid());
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/merge_operator.cc b/src/rocksdb/db/merge_operator.cc
new file mode 100644
index 000000000..d32585640
--- /dev/null
+++ b/src/rocksdb/db/merge_operator.cc
@@ -0,0 +1,85 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+/**
+ * Back-end implementation details specific to the Merge Operator.
+ */
+
+#include "rocksdb/merge_operator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+bool MergeOperator::FullMergeV2(const MergeOperationInput& merge_in,
+ MergeOperationOutput* merge_out) const {
+ // If FullMergeV2 is not implemented, we convert the operand_list to
+ // std::deque<std::string> and pass it to FullMerge
+ std::deque<std::string> operand_list_str;
+ for (auto& op : merge_in.operand_list) {
+ operand_list_str.emplace_back(op.data(), op.size());
+ }
+ return FullMerge(merge_in.key, merge_in.existing_value, operand_list_str,
+ &merge_out->new_value, merge_in.logger);
+}
+
+// The default implementation of PartialMergeMulti, which invokes
+// PartialMerge multiple times internally and merges two operands at
+// a time.
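+//
+// For example (a sketch): for operands [A, B, C] this computes
+// PartialMerge(key, PartialMerge(key, A, B), C), failing as soon as any
+// pairwise PartialMerge() call fails.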
+bool MergeOperator::PartialMergeMulti(const Slice& key,
+ const std::deque<Slice>& operand_list,
+ std::string* new_value,
+ Logger* logger) const {
+ assert(operand_list.size() >= 2);
+ // Simply loop through the operands
+ Slice temp_slice(operand_list[0]);
+
+ for (size_t i = 1; i < operand_list.size(); ++i) {
+ auto& operand = operand_list[i];
+ std::string temp_value;
+ if (!PartialMerge(key, temp_slice, operand, &temp_value, logger)) {
+ return false;
+ }
+ swap(temp_value, *new_value);
+ temp_slice = Slice(*new_value);
+ }
+
+ // The result will be in *new_value. All merges succeeded.
+ return true;
+}
+
+// Given a "real" merge from the library, call the user's
+// associative merge function one-by-one on each of the operands.
+// NOTE: It is assumed that the client's merge-operator will handle any errors.
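+//
+// For example (a sketch): with existing value E and operands [A, B] this
+// computes Merge(key, Merge(key, E, A), B); if there is no existing value,
+// the first call receives nullptr instead of E.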
+bool AssociativeMergeOperator::FullMergeV2(
+ const MergeOperationInput& merge_in,
+ MergeOperationOutput* merge_out) const {
+ // Simply loop through the operands
+ Slice temp_existing;
+ const Slice* existing_value = merge_in.existing_value;
+ for (const auto& operand : merge_in.operand_list) {
+ std::string temp_value;
+ if (!Merge(merge_in.key, existing_value, operand, &temp_value,
+ merge_in.logger)) {
+ return false;
+ }
+ swap(temp_value, merge_out->new_value);
+ temp_existing = Slice(merge_out->new_value);
+ existing_value = &temp_existing;
+ }
+
+ // The result will be in *new_value. All merges succeeded.
+ return true;
+}
+
+// Call the user defined simple merge on the operands;
+// NOTE: It is assumed that the client's merge-operator will handle any errors.
+bool AssociativeMergeOperator::PartialMerge(const Slice& key,
+ const Slice& left_operand,
+ const Slice& right_operand,
+ std::string* new_value,
+ Logger* logger) const {
+ return Merge(key, &left_operand, right_operand, new_value, logger);
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/merge_test.cc b/src/rocksdb/db/merge_test.cc
new file mode 100644
index 000000000..0d373d41e
--- /dev/null
+++ b/src/rocksdb/db/merge_test.cc
@@ -0,0 +1,629 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#include <assert.h>
+
+#include <iostream>
+#include <memory>
+
+#include "db/db_impl/db_impl.h"
+#include "db/dbformat.h"
+#include "db/write_batch_internal.h"
+#include "port/stack_trace.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/utilities/db_ttl.h"
+#include "test_util/testharness.h"
+#include "util/coding.h"
+#include "utilities/merge_operators.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+bool use_compression;
+
+class MergeTest : public testing::Test {};
+
+size_t num_merge_operator_calls;
+void resetNumMergeOperatorCalls() { num_merge_operator_calls = 0; }
+
+size_t num_partial_merge_calls;
+void resetNumPartialMergeCalls() { num_partial_merge_calls = 0; }
+
+class CountMergeOperator : public AssociativeMergeOperator {
+ public:
+ CountMergeOperator() {
+ mergeOperator_ = MergeOperators::CreateUInt64AddOperator();
+ }
+
+ bool Merge(const Slice& key, const Slice* existing_value, const Slice& value,
+ std::string* new_value, Logger* logger) const override {
+ assert(new_value->empty());
+ ++num_merge_operator_calls;
+ if (existing_value == nullptr) {
+ new_value->assign(value.data(), value.size());
+ return true;
+ }
+
+ return mergeOperator_->PartialMerge(key, *existing_value, value, new_value,
+ logger);
+ }
+
+ bool PartialMergeMulti(const Slice& key,
+ const std::deque<Slice>& operand_list,
+ std::string* new_value,
+ Logger* logger) const override {
+ assert(new_value->empty());
+ ++num_partial_merge_calls;
+ return mergeOperator_->PartialMergeMulti(key, operand_list, new_value,
+ logger);
+ }
+
+ const char* Name() const override { return "UInt64AddOperator"; }
+
+ private:
+ std::shared_ptr<MergeOperator> mergeOperator_;
+};
+
+class EnvMergeTest : public EnvWrapper {
+ public:
+ EnvMergeTest() : EnvWrapper(Env::Default()) {}
+ static const char* kClassName() { return "MergeEnv"; }
+ const char* Name() const override { return kClassName(); }
+ // ~EnvMergeTest() override {}
+
+ uint64_t NowNanos() override {
+ ++now_nanos_count_;
+ return target()->NowNanos();
+ }
+
+ static uint64_t now_nanos_count_;
+
+ static std::unique_ptr<EnvMergeTest> singleton_;
+
+ static EnvMergeTest* GetInstance() {
+ if (nullptr == singleton_) singleton_.reset(new EnvMergeTest);
+ return singleton_.get();
+ }
+};
+
+uint64_t EnvMergeTest::now_nanos_count_{0};
+std::unique_ptr<EnvMergeTest> EnvMergeTest::singleton_;
+
+std::shared_ptr<DB> OpenDb(const std::string& dbname, const bool ttl = false,
+ const size_t max_successive_merges = 0) {
+ DB* db;
+ Options options;
+ options.create_if_missing = true;
+ options.merge_operator = std::make_shared<CountMergeOperator>();
+ options.max_successive_merges = max_successive_merges;
+ options.env = EnvMergeTest::GetInstance();
+ EXPECT_OK(DestroyDB(dbname, Options()));
+ Status s;
+// DBWithTTL is not supported in ROCKSDB_LITE
+#ifndef ROCKSDB_LITE
+ if (ttl) {
+ DBWithTTL* db_with_ttl;
+ s = DBWithTTL::Open(options, dbname, &db_with_ttl);
+ db = db_with_ttl;
+ } else {
+ s = DB::Open(options, dbname, &db);
+ }
+#else
+ assert(!ttl);
+ s = DB::Open(options, dbname, &db);
+#endif // !ROCKSDB_LITE
+ EXPECT_OK(s);
+ assert(s.ok());
+ // Allowed to call NowNanos during DB creation (in GenerateRawUniqueId() for
+ // session ID)
+ EnvMergeTest::now_nanos_count_ = 0;
+ return std::shared_ptr<DB>(db);
+}
+
+// Imagine we are maintaining a set of uint64 counters.
+// Each counter has a distinct name. And we would like
+// to support four high level operations:
+// set, add, get and remove
+// This is a quick implementation without a Merge operation.
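+//
+// Usage sketch (the key name is illustrative only):
+//
+//   Counters counters(db);
+//   counters.add("hits", 1);
+//   uint64_t hits = 0;
+//   counters.get("hits", &hits);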
+class Counters {
+ protected:
+ std::shared_ptr<DB> db_;
+
+ WriteOptions put_option_;
+ ReadOptions get_option_;
+ WriteOptions delete_option_;
+
+ uint64_t default_;
+
+ public:
+ explicit Counters(std::shared_ptr<DB> db, uint64_t defaultCount = 0)
+ : db_(db),
+ put_option_(),
+ get_option_(),
+ delete_option_(),
+ default_(defaultCount) {
+ assert(db_);
+ }
+
+ virtual ~Counters() {}
+
+ // public interface of Counters.
+  // All four functions return false
+  // if the underlying rocksdb operation failed.
+
+  // mapped to a rocksdb Put
+  bool set(const std::string& key, uint64_t value) {
+    // just treat the internal rep of the uint64 as the string
+ char buf[sizeof(value)];
+ EncodeFixed64(buf, value);
+ Slice slice(buf, sizeof(value));
+ auto s = db_->Put(put_option_, key, slice);
+
+ if (s.ok()) {
+ return true;
+ } else {
+ std::cerr << s.ToString() << std::endl;
+ return false;
+ }
+ }
+
+ // mapped to a rocksdb Delete
+ bool remove(const std::string& key) {
+ auto s = db_->Delete(delete_option_, key);
+
+ if (s.ok()) {
+ return true;
+ } else {
+ std::cerr << s.ToString() << std::endl;
+ return false;
+ }
+ }
+
+ // mapped to a rocksdb Get
+ bool get(const std::string& key, uint64_t* value) {
+ std::string str;
+ auto s = db_->Get(get_option_, key, &str);
+
+ if (s.IsNotFound()) {
+ // return default value if not found;
+ *value = default_;
+ return true;
+ } else if (s.ok()) {
+ // deserialization
+ if (str.size() != sizeof(uint64_t)) {
+ std::cerr << "value corruption\n";
+ return false;
+ }
+ *value = DecodeFixed64(&str[0]);
+ return true;
+ } else {
+ std::cerr << s.ToString() << std::endl;
+ return false;
+ }
+ }
+
+ // 'add' is implemented as get -> modify -> set
+ // An alternative is a single merge operation, see MergeBasedCounters
+ virtual bool add(const std::string& key, uint64_t value) {
+ uint64_t base = default_;
+ return get(key, &base) && set(key, base + value);
+ }
+
+ // convenience functions for testing
+ void assert_set(const std::string& key, uint64_t value) {
+ assert(set(key, value));
+ }
+
+ void assert_remove(const std::string& key) { assert(remove(key)); }
+
+ uint64_t assert_get(const std::string& key) {
+ uint64_t value = default_;
+ int result = get(key, &value);
+ assert(result);
+ if (result == 0) exit(1); // Disable unused variable warning.
+ return value;
+ }
+
+ void assert_add(const std::string& key, uint64_t value) {
+ int result = add(key, value);
+ assert(result);
+ if (result == 0) exit(1); // Disable unused variable warning.
+ }
+};
+
+// Implement 'add' directly with the new Merge operation
+class MergeBasedCounters : public Counters {
+ private:
+ WriteOptions merge_option_; // for merge
+
+ public:
+ explicit MergeBasedCounters(std::shared_ptr<DB> db, uint64_t defaultCount = 0)
+ : Counters(db, defaultCount), merge_option_() {}
+
+ // mapped to a rocksdb Merge operation
+ bool add(const std::string& key, uint64_t value) override {
+ char encoded[sizeof(uint64_t)];
+ EncodeFixed64(encoded, value);
+ Slice slice(encoded, sizeof(uint64_t));
+ auto s = db_->Merge(merge_option_, key, slice);
+
+ if (s.ok()) {
+ return true;
+ } else {
+ std::cerr << s.ToString() << std::endl;
+ return false;
+ }
+ }
+};
+
+void dumpDb(DB* db) {
+ auto it = std::unique_ptr<Iterator>(db->NewIterator(ReadOptions()));
+ for (it->SeekToFirst(); it->Valid(); it->Next()) {
+ // uint64_t value = DecodeFixed64(it->value().data());
+ // std::cout << it->key().ToString() << ": " << value << std::endl;
+ }
+ assert(it->status().ok()); // Check for any errors found during the scan
+}
+
+void testCounters(Counters& counters, DB* db, bool test_compaction) {
+ FlushOptions o;
+ o.wait = true;
+
+ counters.assert_set("a", 1);
+
+ if (test_compaction) {
+ ASSERT_OK(db->Flush(o));
+ }
+
+ ASSERT_EQ(counters.assert_get("a"), 1);
+
+ counters.assert_remove("b");
+
+  // default value is 0 if non-existent
+ ASSERT_EQ(counters.assert_get("b"), 0);
+
+ counters.assert_add("a", 2);
+
+ if (test_compaction) {
+ ASSERT_OK(db->Flush(o));
+ }
+
+ // 1+2 = 3
+ ASSERT_EQ(counters.assert_get("a"), 3);
+
+ dumpDb(db);
+
+ // 1+...+49 = ?
+ uint64_t sum = 0;
+ for (int i = 1; i < 50; i++) {
+ counters.assert_add("b", i);
+ sum += i;
+ }
+ ASSERT_EQ(counters.assert_get("b"), sum);
+
+ dumpDb(db);
+
+ if (test_compaction) {
+ ASSERT_OK(db->Flush(o));
+
+ ASSERT_OK(db->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+ dumpDb(db);
+
+ ASSERT_EQ(counters.assert_get("a"), 3);
+ ASSERT_EQ(counters.assert_get("b"), sum);
+ }
+}
+
+void testCountersWithFlushAndCompaction(Counters& counters, DB* db) {
+ ASSERT_OK(db->Put({}, "1", "1"));
+ ASSERT_OK(db->Flush(FlushOptions()));
+
+ std::atomic<int> cnt{0};
+ const auto get_thread_id = [&cnt]() {
+ thread_local int thread_id{cnt++};
+ return thread_id;
+ };
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->SetCallBack(
+ "VersionSet::LogAndApply:BeforeWriterWaiting", [&](void* /*arg*/) {
+ int thread_id = get_thread_id();
+ if (1 == thread_id) {
+ TEST_SYNC_POINT(
+ "testCountersWithFlushAndCompaction::bg_compact_thread:0");
+ } else if (2 == thread_id) {
+ TEST_SYNC_POINT(
+ "testCountersWithFlushAndCompaction::bg_flush_thread:0");
+ }
+ });
+ SyncPoint::GetInstance()->SetCallBack(
+ "VersionSet::LogAndApply:WriteManifest", [&](void* /*arg*/) {
+ int thread_id = get_thread_id();
+ if (0 == thread_id) {
+ TEST_SYNC_POINT(
+ "testCountersWithFlushAndCompaction::set_options_thread:0");
+ TEST_SYNC_POINT(
+ "testCountersWithFlushAndCompaction::set_options_thread:1");
+ }
+ });
+ SyncPoint::GetInstance()->SetCallBack(
+ "VersionSet::LogAndApply:WakeUpAndDone", [&](void* arg) {
+ auto* mutex = reinterpret_cast<InstrumentedMutex*>(arg);
+ mutex->AssertHeld();
+ int thread_id = get_thread_id();
+ ASSERT_EQ(2, thread_id);
+ mutex->Unlock();
+ TEST_SYNC_POINT(
+ "testCountersWithFlushAndCompaction::bg_flush_thread:1");
+ TEST_SYNC_POINT(
+ "testCountersWithFlushAndCompaction::bg_flush_thread:2");
+ mutex->Lock();
+ });
+ SyncPoint::GetInstance()->LoadDependency({
+ {"testCountersWithFlushAndCompaction::set_options_thread:0",
+ "testCountersWithCompactionAndFlush:BeforeCompact"},
+ {"testCountersWithFlushAndCompaction::bg_compact_thread:0",
+ "testCountersWithFlushAndCompaction:BeforeIncCounters"},
+ {"testCountersWithFlushAndCompaction::bg_flush_thread:0",
+ "testCountersWithFlushAndCompaction::set_options_thread:1"},
+ {"testCountersWithFlushAndCompaction::bg_flush_thread:1",
+ "testCountersWithFlushAndCompaction:BeforeVerification"},
+ {"testCountersWithFlushAndCompaction:AfterGet",
+ "testCountersWithFlushAndCompaction::bg_flush_thread:2"},
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ port::Thread set_options_thread([&]() {
+ ASSERT_OK(reinterpret_cast<DBImpl*>(db)->SetOptions(
+ {{"disable_auto_compactions", "false"}}));
+ });
+ TEST_SYNC_POINT("testCountersWithCompactionAndFlush:BeforeCompact");
+ port::Thread compact_thread([&]() {
+ ASSERT_OK(reinterpret_cast<DBImpl*>(db)->CompactRange(
+ CompactRangeOptions(), db->DefaultColumnFamily(), nullptr, nullptr));
+ });
+
+ TEST_SYNC_POINT("testCountersWithFlushAndCompaction:BeforeIncCounters");
+ counters.add("test-key", 1);
+
+ FlushOptions flush_opts;
+ flush_opts.wait = false;
+ ASSERT_OK(db->Flush(flush_opts));
+
+ TEST_SYNC_POINT("testCountersWithFlushAndCompaction:BeforeVerification");
+ std::string expected;
+ PutFixed64(&expected, 1);
+ std::string actual;
+ Status s = db->Get(ReadOptions(), "test-key", &actual);
+ TEST_SYNC_POINT("testCountersWithFlushAndCompaction:AfterGet");
+ set_options_thread.join();
+ compact_thread.join();
+ ASSERT_OK(s);
+ ASSERT_EQ(expected, actual);
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+void testSuccessiveMerge(Counters& counters, size_t max_num_merges,
+ size_t num_merges) {
+ counters.assert_remove("z");
+ uint64_t sum = 0;
+
+ for (size_t i = 1; i <= num_merges; ++i) {
+ resetNumMergeOperatorCalls();
+ counters.assert_add("z", i);
+ sum += i;
+
+ if (i % (max_num_merges + 1) == 0) {
+ ASSERT_EQ(num_merge_operator_calls, max_num_merges + 1);
+ } else {
+ ASSERT_EQ(num_merge_operator_calls, 0);
+ }
+
+ resetNumMergeOperatorCalls();
+ ASSERT_EQ(counters.assert_get("z"), sum);
+ ASSERT_EQ(num_merge_operator_calls, i % (max_num_merges + 1));
+ }
+}
+
+void testPartialMerge(Counters* counters, DB* db, size_t max_merge,
+ size_t min_merge, size_t count) {
+ FlushOptions o;
+ o.wait = true;
+
+ // Test case 1: partial merge should be called when the number of merge
+ // operands exceeds the threshold.
+ uint64_t tmp_sum = 0;
+ resetNumPartialMergeCalls();
+ for (size_t i = 1; i <= count; i++) {
+ counters->assert_add("b", i);
+ tmp_sum += i;
+ }
+ ASSERT_OK(db->Flush(o));
+ ASSERT_OK(db->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ(tmp_sum, counters->assert_get("b"));
+ if (count > max_merge) {
+ // in this case, FullMerge should be called instead.
+ ASSERT_EQ(num_partial_merge_calls, 0U);
+ } else {
+ // if count >= min_merge, then partial merge should be called once.
+ ASSERT_EQ((count >= min_merge), (num_partial_merge_calls == 1));
+ }
+
+ // Test case 2: partial merge should not be called when a put is found.
+ resetNumPartialMergeCalls();
+ tmp_sum = 0;
+ ASSERT_OK(db->Put(ROCKSDB_NAMESPACE::WriteOptions(), "c", "10"));
+ for (size_t i = 1; i <= count; i++) {
+ counters->assert_add("c", i);
+ tmp_sum += i;
+ }
+ ASSERT_OK(db->Flush(o));
+ ASSERT_OK(db->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ(tmp_sum, counters->assert_get("c"));
+ ASSERT_EQ(num_partial_merge_calls, 0U);
+ // NowNanos was previously called in MergeHelper::FilterMerge(), which
+ // harmed performance.
+ ASSERT_EQ(EnvMergeTest::now_nanos_count_, 0U);
+}
+
+void testSingleBatchSuccessiveMerge(DB* db, size_t max_num_merges,
+ size_t num_merges) {
+ ASSERT_GT(num_merges, max_num_merges);
+
+ Slice key("BatchSuccessiveMerge");
+ uint64_t merge_value = 1;
+ char buf[sizeof(merge_value)];
+ EncodeFixed64(buf, merge_value);
+ Slice merge_value_slice(buf, sizeof(merge_value));
+
+ // Create the batch
+ WriteBatch batch;
+ for (size_t i = 0; i < num_merges; ++i) {
+ ASSERT_OK(batch.Merge(key, merge_value_slice));
+ }
+
+ // Apply to memtable and count the number of merges
+ resetNumMergeOperatorCalls();
+ ASSERT_OK(db->Write(WriteOptions(), &batch));
+ ASSERT_EQ(
+ num_merge_operator_calls,
+ static_cast<size_t>(num_merges - (num_merges % (max_num_merges + 1))));
+
+ // Get the value
+ resetNumMergeOperatorCalls();
+ std::string get_value_str;
+ ASSERT_OK(db->Get(ReadOptions(), key, &get_value_str));
+ assert(get_value_str.size() == sizeof(uint64_t));
+ uint64_t get_value = DecodeFixed64(&get_value_str[0]);
+ ASSERT_EQ(get_value, num_merges * merge_value);
+ ASSERT_EQ(num_merge_operator_calls,
+ static_cast<size_t>((num_merges % (max_num_merges + 1))));
+}
+
+void runTest(const std::string& dbname, const bool use_ttl = false) {
+ {
+ auto db = OpenDb(dbname, use_ttl);
+
+ {
+ Counters counters(db, 0);
+ testCounters(counters, db.get(), true);
+ }
+
+ {
+ MergeBasedCounters counters(db, 0);
+ testCounters(counters, db.get(), use_compression);
+ }
+ }
+
+ ASSERT_OK(DestroyDB(dbname, Options()));
+
+ {
+ size_t max_merge = 5;
+ auto db = OpenDb(dbname, use_ttl, max_merge);
+ MergeBasedCounters counters(db, 0);
+ testCounters(counters, db.get(), use_compression);
+ testSuccessiveMerge(counters, max_merge, max_merge * 2);
+ testSingleBatchSuccessiveMerge(db.get(), 5, 7);
+ ASSERT_OK(db->Close());
+ ASSERT_OK(DestroyDB(dbname, Options()));
+ }
+
+ {
+ size_t max_merge = 100;
+ // Min merge is hard-coded to 2.
+ uint32_t min_merge = 2;
+ for (uint32_t count = min_merge - 1; count <= min_merge + 1; count++) {
+ auto db = OpenDb(dbname, use_ttl, max_merge);
+ MergeBasedCounters counters(db, 0);
+ testPartialMerge(&counters, db.get(), max_merge, min_merge, count);
+ ASSERT_OK(db->Close());
+ ASSERT_OK(DestroyDB(dbname, Options()));
+ }
+ {
+ auto db = OpenDb(dbname, use_ttl, max_merge);
+ MergeBasedCounters counters(db, 0);
+ testPartialMerge(&counters, db.get(), max_merge, min_merge,
+ min_merge * 10);
+ ASSERT_OK(db->Close());
+ ASSERT_OK(DestroyDB(dbname, Options()));
+ }
+ }
+
+ {
+ {
+ auto db = OpenDb(dbname);
+ MergeBasedCounters counters(db, 0);
+ counters.add("test-key", 1);
+ counters.add("test-key", 1);
+ counters.add("test-key", 1);
+ ASSERT_OK(db->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ }
+
+ DB* reopen_db;
+ ASSERT_OK(DB::Open(Options(), dbname, &reopen_db));
+ std::string value;
+ ASSERT_NOK(reopen_db->Get(ReadOptions(), "test-key", &value));
+ delete reopen_db;
+ ASSERT_OK(DestroyDB(dbname, Options()));
+ }
+
+  /* Temporarily remove this test
+ {
+ std::cout << "Test merge-operator not set after reopen (recovery case)\n";
+ {
+ auto db = OpenDb(dbname);
+ MergeBasedCounters counters(db, 0);
+ counters.add("test-key", 1);
+ counters.add("test-key", 1);
+ counters.add("test-key", 1);
+ }
+
+ DB* reopen_db;
+ ASSERT_TRUE(DB::Open(Options(), dbname, &reopen_db).IsInvalidArgument());
+ }
+ */
+}
+
+TEST_F(MergeTest, MergeDbTest) {
+ runTest(test::PerThreadDBPath("merge_testdb"));
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(MergeTest, MergeDbTtlTest) {
+ runTest(test::PerThreadDBPath("merge_testdbttl"),
+ true); // Run test on TTL database
+}
+
+TEST_F(MergeTest, MergeWithCompactionAndFlush) {
+ const std::string dbname =
+ test::PerThreadDBPath("merge_with_compaction_and_flush");
+ {
+ auto db = OpenDb(dbname);
+ {
+ MergeBasedCounters counters(db, 0);
+ testCountersWithFlushAndCompaction(counters, db.get());
+ }
+ }
+ ASSERT_OK(DestroyDB(dbname, Options()));
+}
+#endif // !ROCKSDB_LITE
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::use_compression = false;
+ if (argc > 1) {
+ ROCKSDB_NAMESPACE::use_compression = true;
+ }
+
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/obsolete_files_test.cc b/src/rocksdb/db/obsolete_files_test.cc
new file mode 100644
index 000000000..8e9f28f65
--- /dev/null
+++ b/src/rocksdb/db/obsolete_files_test.cc
@@ -0,0 +1,328 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef ROCKSDB_LITE
+
+#include <stdlib.h>
+
+#include <algorithm>
+#include <map>
+#include <string>
+#include <vector>
+
+#include "db/db_impl/db_impl.h"
+#include "db/db_test_util.h"
+#include "db/version_set.h"
+#include "db/write_batch_internal.h"
+#include "file/filename.h"
+#include "port/stack_trace.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/transaction_log.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class ObsoleteFilesTest : public DBTestBase {
+ public:
+ ObsoleteFilesTest()
+ : DBTestBase("obsolete_files_test", /*env_do_fsync=*/true),
+ wal_dir_(dbname_ + "/wal_files") {}
+
+ void AddKeys(int numkeys, int startkey) {
+ WriteOptions options;
+ options.sync = false;
+ for (int i = startkey; i < (numkeys + startkey); i++) {
+ std::string temp = std::to_string(i);
+ Slice key(temp);
+ Slice value(temp);
+ ASSERT_OK(db_->Put(options, key, value));
+ }
+ }
+
+ void createLevel0Files(int numFiles, int numKeysPerFile) {
+ int startKey = 0;
+ for (int i = 0; i < numFiles; i++) {
+ AddKeys(numKeysPerFile, startKey);
+ startKey += numKeysPerFile;
+ ASSERT_OK(dbfull()->TEST_FlushMemTable());
+ ASSERT_OK(
+ dbfull()->TEST_WaitForCompact()); // wait for background flush (flush
+ // is also a kind of compaction).
+ }
+ }
+
+ void CheckFileTypeCounts(const std::string& dir, int required_log,
+ int required_sst, int required_manifest) {
+ std::vector<std::string> filenames;
+ ASSERT_OK(env_->GetChildren(dir, &filenames));
+
+ int log_cnt = 0;
+ int sst_cnt = 0;
+ int manifest_cnt = 0;
+ for (auto file : filenames) {
+ uint64_t number;
+ FileType type;
+ if (ParseFileName(file, &number, &type)) {
+ log_cnt += (type == kWalFile);
+ sst_cnt += (type == kTableFile);
+ manifest_cnt += (type == kDescriptorFile);
+ }
+ }
+ ASSERT_EQ(required_log, log_cnt);
+ ASSERT_EQ(required_sst, sst_cnt);
+ ASSERT_EQ(required_manifest, manifest_cnt);
+ }
+
+ void ReopenDB() {
+ Options options = CurrentOptions();
+ // Trigger compaction when the number of level 0 files reaches 2.
+ options.create_if_missing = true;
+ options.level0_file_num_compaction_trigger = 2;
+ options.disable_auto_compactions = false;
+ options.delete_obsolete_files_period_micros = 0; // always do full purge
+ options.enable_thread_tracking = true;
+ options.write_buffer_size = 1024 * 1024 * 1000;
+ options.target_file_size_base = 1024 * 1024 * 1000;
+ options.max_bytes_for_level_base = 1024 * 1024 * 1000;
+ options.WAL_ttl_seconds = 300; // Used to test log files
+ options.WAL_size_limit_MB = 1024; // Used to test log files
+ options.wal_dir = wal_dir_;
+
+ // Note: the following prevents an otherwise harmless data race between the
+ // test setup code (AddBlobFile) in ObsoleteFilesTest.BlobFiles and the
+ // periodic stat dumping thread.
+ options.stats_dump_period_sec = 0;
+
+ Destroy(options);
+ Reopen(options);
+ }
+
+ const std::string wal_dir_;
+};
+
+TEST_F(ObsoleteFilesTest, RaceForObsoleteFileDeletion) {
+ ReopenDB();
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->LoadDependency({
+ {"DBImpl::BackgroundCallCompaction:FoundObsoleteFiles",
+ "ObsoleteFilesTest::RaceForObsoleteFileDeletion:1"},
+ {"DBImpl::BackgroundCallCompaction:PurgedObsoleteFiles",
+ "ObsoleteFilesTest::RaceForObsoleteFileDeletion:2"},
+ });
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::DeleteObsoleteFileImpl:AfterDeletion", [&](void* arg) {
+ Status* p_status = reinterpret_cast<Status*>(arg);
+ ASSERT_OK(*p_status);
+ });
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::CloseHelper:PendingPurgeFinished", [&](void* arg) {
+ std::unordered_set<uint64_t>* files_grabbed_for_purge_ptr =
+ reinterpret_cast<std::unordered_set<uint64_t>*>(arg);
+ ASSERT_TRUE(files_grabbed_for_purge_ptr->empty());
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ createLevel0Files(2, 50000);
+ CheckFileTypeCounts(wal_dir_, 1, 0, 0);
+
+ port::Thread user_thread([this]() {
+ JobContext jobCxt(0);
+ TEST_SYNC_POINT("ObsoleteFilesTest::RaceForObsoleteFileDeletion:1");
+ dbfull()->TEST_LockMutex();
+ dbfull()->FindObsoleteFiles(&jobCxt, true /* force=true */,
+ false /* no_full_scan=false */);
+ dbfull()->TEST_UnlockMutex();
+ TEST_SYNC_POINT("ObsoleteFilesTest::RaceForObsoleteFileDeletion:2");
+ dbfull()->PurgeObsoleteFiles(jobCxt);
+ jobCxt.Clean();
+ });
+
+ user_thread.join();
+}
+
+TEST_F(ObsoleteFilesTest, DeleteObsoleteOptionsFile) {
+ ReopenDB();
+
+ createLevel0Files(2, 50000);
+ CheckFileTypeCounts(wal_dir_, 1, 0, 0);
+
+ ASSERT_OK(dbfull()->DisableFileDeletions());
+ for (int i = 0; i != 4; ++i) {
+ if (i % 2) {
+ ASSERT_OK(dbfull()->SetOptions(dbfull()->DefaultColumnFamily(),
+ {{"paranoid_file_checks", "false"}}));
+ } else {
+ ASSERT_OK(dbfull()->SetOptions(dbfull()->DefaultColumnFamily(),
+ {{"paranoid_file_checks", "true"}}));
+ }
+ }
+ ASSERT_OK(dbfull()->EnableFileDeletions(true /* force */));
+
+ Close();
+
+ std::vector<std::string> files;
+ int opts_file_count = 0;
+ ASSERT_OK(env_->GetChildren(dbname_, &files));
+ for (const auto& file : files) {
+ uint64_t file_num;
+ Slice dummy_info_log_name_prefix;
+ FileType type;
+ WalFileType log_type;
+ if (ParseFileName(file, &file_num, dummy_info_log_name_prefix, &type,
+ &log_type) &&
+ type == kOptionsFile) {
+ opts_file_count++;
+ }
+ }
+ ASSERT_EQ(2, opts_file_count);
+}
+
+TEST_F(ObsoleteFilesTest, BlobFiles) {
+ ReopenDB();
+
+ VersionSet* const versions = dbfull()->GetVersionSet();
+ assert(versions);
+ assert(versions->GetColumnFamilySet());
+
+ ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault();
+ assert(cfd);
+
+ const ImmutableCFOptions* const ioptions = cfd->ioptions();
+ assert(ioptions);
+ assert(!ioptions->cf_paths.empty());
+
+ const std::string& path = ioptions->cf_paths.front().path;
+
+ // Add an obsolete blob file.
+ constexpr uint64_t first_blob_file_number = 234;
+ versions->AddObsoleteBlobFile(first_blob_file_number, path);
+
+ // Add a live blob file.
+ Version* const version = cfd->current();
+ assert(version);
+
+ VersionStorageInfo* const storage_info = version->storage_info();
+ assert(storage_info);
+
+ constexpr uint64_t second_blob_file_number = 456;
+ constexpr uint64_t second_total_blob_count = 100;
+ constexpr uint64_t second_total_blob_bytes = 2000000;
+ constexpr char second_checksum_method[] = "CRC32B";
+ constexpr char second_checksum_value[] = "\x6d\xbd\xf2\x3a";
+
+ auto shared_meta = SharedBlobFileMetaData::Create(
+ second_blob_file_number, second_total_blob_count, second_total_blob_bytes,
+ second_checksum_method, second_checksum_value);
+
+ constexpr uint64_t second_garbage_blob_count = 0;
+ constexpr uint64_t second_garbage_blob_bytes = 0;
+
+ auto meta = BlobFileMetaData::Create(
+ std::move(shared_meta), BlobFileMetaData::LinkedSsts(),
+ second_garbage_blob_count, second_garbage_blob_bytes);
+
+ storage_info->AddBlobFile(std::move(meta));
+
+ // Check for obsolete files and make sure the first blob file is picked up
+ // and grabbed for purge. The second blob file should be on the live list.
+ constexpr int job_id = 0;
+ JobContext job_context{job_id};
+
+ dbfull()->TEST_LockMutex();
+ constexpr bool force_full_scan = false;
+ dbfull()->FindObsoleteFiles(&job_context, force_full_scan);
+ dbfull()->TEST_UnlockMutex();
+
+ ASSERT_TRUE(job_context.HaveSomethingToDelete());
+ ASSERT_EQ(job_context.blob_delete_files.size(), 1);
+ ASSERT_EQ(job_context.blob_delete_files[0].GetBlobFileNumber(),
+ first_blob_file_number);
+
+ const auto& files_grabbed_for_purge =
+ dbfull()->TEST_GetFilesGrabbedForPurge();
+ ASSERT_NE(files_grabbed_for_purge.find(first_blob_file_number),
+ files_grabbed_for_purge.end());
+
+ ASSERT_EQ(job_context.blob_live.size(), 1);
+ ASSERT_EQ(job_context.blob_live[0], second_blob_file_number);
+
+ // Hack the job context a bit by adding a few files to the full scan
+ // list and adjusting the pending file number. We add the two files
+ // above as well as two additional ones, where one is old
+ // and should be cleaned up, and the other is still pending.
+ constexpr uint64_t old_blob_file_number = 123;
+ constexpr uint64_t pending_blob_file_number = 567;
+
+ job_context.full_scan_candidate_files.emplace_back(
+ BlobFileName(old_blob_file_number), path);
+ job_context.full_scan_candidate_files.emplace_back(
+ BlobFileName(first_blob_file_number), path);
+ job_context.full_scan_candidate_files.emplace_back(
+ BlobFileName(second_blob_file_number), path);
+ job_context.full_scan_candidate_files.emplace_back(
+ BlobFileName(pending_blob_file_number), path);
+
+ job_context.min_pending_output = pending_blob_file_number;
+
+ // Purge obsolete files and make sure we purge the old file and the first file
+ // (and keep the second file and the pending file).
+ std::vector<std::string> deleted_files;
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::DeleteObsoleteFileImpl::BeforeDeletion", [&](void* arg) {
+ const std::string* file = static_cast<std::string*>(arg);
+ assert(file);
+
+ constexpr char blob_extension[] = ".blob";
+
+ if (file->find(blob_extension) != std::string::npos) {
+ deleted_files.emplace_back(*file);
+ }
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ dbfull()->PurgeObsoleteFiles(job_context);
+ job_context.Clean();
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+
+ ASSERT_EQ(files_grabbed_for_purge.find(first_blob_file_number),
+ files_grabbed_for_purge.end());
+
+ std::sort(deleted_files.begin(), deleted_files.end());
+ const std::vector<std::string> expected_deleted_files{
+ BlobFileName(path, old_blob_file_number),
+ BlobFileName(path, first_blob_file_number)};
+
+ ASSERT_EQ(deleted_files, expected_deleted_files);
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ RegisterCustomObjects(argc, argv);
+ return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+ fprintf(stderr,
+ "SKIPPED as DBImpl::DeleteFile is not supported in ROCKSDB_LITE\n");
+ return 0;
+}
+
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/options_file_test.cc b/src/rocksdb/db/options_file_test.cc
new file mode 100644
index 000000000..eb02e6ca4
--- /dev/null
+++ b/src/rocksdb/db/options_file_test.cc
@@ -0,0 +1,120 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+#include <string>
+
+#include "db/db_impl/db_impl.h"
+#include "db/db_test_util.h"
+#include "rocksdb/options.h"
+#include "rocksdb/table.h"
+#include "test_util/testharness.h"
+
+namespace ROCKSDB_NAMESPACE {
+class OptionsFileTest : public testing::Test {
+ public:
+ OptionsFileTest() : dbname_(test::PerThreadDBPath("options_file_test")) {}
+
+ std::string dbname_;
+};
+
+namespace {
+void UpdateOptionsFiles(DB* db,
+ std::unordered_set<std::string>* filename_history,
+ int* options_files_count) {
+ std::vector<std::string> filenames;
+ EXPECT_OK(db->GetEnv()->GetChildren(db->GetName(), &filenames));
+ uint64_t number;
+ FileType type;
+ *options_files_count = 0;
+ for (auto filename : filenames) {
+ if (ParseFileName(filename, &number, &type) && type == kOptionsFile) {
+ filename_history->insert(filename);
+ (*options_files_count)++;
+ }
+ }
+}
+
+// Verify that the current options files are the latest ones.
+void VerifyOptionsFileName(
+ DB* db, const std::unordered_set<std::string>& past_filenames) {
+ std::vector<std::string> filenames;
+ std::unordered_set<std::string> current_filenames;
+ EXPECT_OK(db->GetEnv()->GetChildren(db->GetName(), &filenames));
+ uint64_t number;
+ FileType type;
+ for (auto filename : filenames) {
+ if (ParseFileName(filename, &number, &type) && type == kOptionsFile) {
+ current_filenames.insert(filename);
+ }
+ }
+ for (auto past_filename : past_filenames) {
+ if (current_filenames.find(past_filename) != current_filenames.end()) {
+ continue;
+ }
+ for (auto filename : current_filenames) {
+ ASSERT_GT(filename, past_filename);
+ }
+ }
+}
+} // anonymous namespace
+
+TEST_F(OptionsFileTest, NumberOfOptionsFiles) {
+ const int kReopenCount = 20;
+ Options opt;
+ opt.create_if_missing = true;
+ ASSERT_OK(DestroyDB(dbname_, opt));
+ std::unordered_set<std::string> filename_history;
+ DB* db;
+ for (int i = 0; i < kReopenCount; ++i) {
+ ASSERT_OK(DB::Open(opt, dbname_, &db));
+ int num_options_files = 0;
+ UpdateOptionsFiles(db, &filename_history, &num_options_files);
+ ASSERT_GT(num_options_files, 0);
+ ASSERT_LE(num_options_files, 2);
+ // Make sure we always keep the latest option files.
+ VerifyOptionsFileName(db, filename_history);
+ delete db;
+ }
+}
+
+TEST_F(OptionsFileTest, OptionsFileName) {
+ const uint64_t kOptionsFileNum = 12345;
+ uint64_t number;
+ FileType type;
+
+ auto options_file_name = OptionsFileName("", kOptionsFileNum);
+ ASSERT_TRUE(ParseFileName(options_file_name, &number, &type, nullptr));
+ ASSERT_EQ(type, kOptionsFile);
+ ASSERT_EQ(number, kOptionsFileNum);
+
+ const uint64_t kTempOptionsFileNum = 54352;
+ auto temp_options_file_name = TempOptionsFileName("", kTempOptionsFileNum);
+ ASSERT_TRUE(ParseFileName(temp_options_file_name, &number, &type, nullptr));
+ ASSERT_NE(temp_options_file_name.find(kTempFileNameSuffix),
+ std::string::npos);
+ ASSERT_EQ(type, kTempFile);
+ ASSERT_EQ(number, kTempOptionsFileNum);
+}
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+#if !(defined NDEBUG) || !defined(OS_WIN)
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+#else
+ return 0;
+#endif // !(defined NDEBUG) || !defined(OS_WIN)
+}
+#else
+
+#include <cstdio>
+
+int main(int /*argc*/, char** /*argv*/) {
+ printf("Skipped as Options file is not supported in RocksDBLite.\n");
+ return 0;
+}
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/output_validator.cc b/src/rocksdb/db/output_validator.cc
new file mode 100644
index 000000000..e93e2d68c
--- /dev/null
+++ b/src/rocksdb/db/output_validator.cc
@@ -0,0 +1,33 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#include "db/output_validator.h"
+
+#include "test_util/sync_point.h"
+#include "util/hash.h"
+
+namespace ROCKSDB_NAMESPACE {
+Status OutputValidator::Add(const Slice& key, const Slice& value) {
+ if (enable_hash_) {
+    // Generate a rolling 64-bit hash over all keys and values
+ paranoid_hash_ = NPHash64(key.data(), key.size(), paranoid_hash_);
+ paranoid_hash_ = NPHash64(value.data(), value.size(), paranoid_hash_);
+ }
+ if (enable_order_check_) {
+ TEST_SYNC_POINT_CALLBACK("OutputValidator::Add:order_check",
+ /*arg=*/nullptr);
+ if (key.size() < kNumInternalBytes) {
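+      // Internal keys carry a trailer of kNumInternalBytes (sequence number
+      // and value type) after the user key, so anything shorter cannot be a
+      // well-formed internal key.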
+ return Status::Corruption(
+ "Compaction tries to write a key without internal bytes.");
+ }
+    // prev_key_ starts out empty.
+ if (!prev_key_.empty() && icmp_.Compare(key, prev_key_) < 0) {
+ return Status::Corruption("Compaction sees out-of-order keys.");
+ }
+ prev_key_.assign(key.data(), key.size());
+ }
+ return Status::OK();
+}
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/output_validator.h b/src/rocksdb/db/output_validator.h
new file mode 100644
index 000000000..40635f9c4
--- /dev/null
+++ b/src/rocksdb/db/output_validator.h
@@ -0,0 +1,48 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#pragma once
+#include "db/dbformat.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+// A class that validates the key/value pairs inserted into an SST file.
+// Pass every key/value pair of the file through OutputValidator::Add();
+// the class validates the key order and optionally calculates a hash
+// of all the keys and values.
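+//
+// A minimal usage sketch (illustrative only; it assumes a
+// BytewiseComparator-based InternalKeyComparator and a caller-provided
+// container `kvs` of internal key/value pairs):
+//
+//   InternalKeyComparator icmp(BytewiseComparator());
+//   OutputValidator validator(icmp, /*enable_order_check=*/true,
+//                             /*enable_hash=*/true);
+//   for (const auto& kv : kvs) {
+//     Status s = validator.Add(kv.first, kv.second);  // check s.ok()
+//   }
+//   // After feeding what was read back into another validator `read_back`,
+//   // validator.CompareValidator(read_back) returns true on a match.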
+class OutputValidator {
+ public:
+ explicit OutputValidator(const InternalKeyComparator& icmp,
+ bool enable_order_check, bool enable_hash,
+ uint64_t precalculated_hash = 0)
+ : icmp_(icmp),
+ paranoid_hash_(precalculated_hash),
+ enable_order_check_(enable_order_check),
+ enable_hash_(enable_hash) {}
+
+  // Add a key/value pair to the KV sequence, and return whether the key
+  // meets the criteria, e.g. that the keys are in order.
+ Status Add(const Slice& key, const Slice& value);
+
+  // Compare whether the validation results of two validators are the same.
+  // It can be used to compare the keys inserted into a file against what is
+  // read back. Returns true if the validation passes.
+ bool CompareValidator(const OutputValidator& other_validator) {
+ return GetHash() == other_validator.GetHash();
+ }
+
+ // Not (yet) intended to be persisted, so subject to change
+ // without notice between releases.
+ uint64_t GetHash() const { return paranoid_hash_; }
+
+ private:
+ const InternalKeyComparator& icmp_;
+ std::string prev_key_;
+ uint64_t paranoid_hash_ = 0;
+ bool enable_order_check_;
+ bool enable_hash_;
+};
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/perf_context_test.cc b/src/rocksdb/db/perf_context_test.cc
new file mode 100644
index 000000000..454d12dc5
--- /dev/null
+++ b/src/rocksdb/db/perf_context_test.cc
@@ -0,0 +1,1010 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#include "rocksdb/perf_context.h"
+
+#include <algorithm>
+#include <iostream>
+#include <thread>
+#include <vector>
+
+#include "monitoring/histogram.h"
+#include "monitoring/instrumented_mutex.h"
+#include "monitoring/perf_context_imp.h"
+#include "monitoring/thread_status_util.h"
+#include "port/port.h"
+#include "rocksdb/db.h"
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/system_clock.h"
+#include "test_util/testharness.h"
+#include "util/stop_watch.h"
+#include "util/string_util.h"
+#include "utilities/merge_operators.h"
+
+bool FLAGS_random_key = false;
+bool FLAGS_use_set_based_memetable = false;
+int FLAGS_total_keys = 100;
+int FLAGS_write_buffer_size = 1000000000;
+int FLAGS_max_write_buffer_number = 8;
+int FLAGS_min_write_buffer_number_to_merge = 7;
+bool FLAGS_verbose = false;
+
+// Path to the database on file system
+const std::string kDbName =
+ ROCKSDB_NAMESPACE::test::PerThreadDBPath("perf_context_test");
+
+namespace ROCKSDB_NAMESPACE {
+
+std::shared_ptr<DB> OpenDb(bool read_only = false) {
+ DB* db;
+ Options options;
+ options.create_if_missing = true;
+ options.max_open_files = -1;
+ options.write_buffer_size = FLAGS_write_buffer_size;
+ options.max_write_buffer_number = FLAGS_max_write_buffer_number;
+ options.min_write_buffer_number_to_merge =
+ FLAGS_min_write_buffer_number_to_merge;
+
+ if (FLAGS_use_set_based_memetable) {
+#ifndef ROCKSDB_LITE
+ options.prefix_extractor.reset(
+ ROCKSDB_NAMESPACE::NewFixedPrefixTransform(0));
+ options.memtable_factory.reset(NewHashSkipListRepFactory());
+#endif // ROCKSDB_LITE
+ }
+
+ Status s;
+ if (!read_only) {
+ s = DB::Open(options, kDbName, &db);
+ } else {
+ s = DB::OpenForReadOnly(options, kDbName, &db);
+ }
+ EXPECT_OK(s);
+ return std::shared_ptr<DB>(db);
+}
+
+class PerfContextTest : public testing::Test {};
+
+TEST_F(PerfContextTest, SeekIntoDeletion) {
+ ASSERT_OK(DestroyDB(kDbName, Options()));
+ auto db = OpenDb();
+ WriteOptions write_options;
+ ReadOptions read_options;
+
+ for (int i = 0; i < FLAGS_total_keys; ++i) {
+ std::string key = "k" + std::to_string(i);
+ std::string value = "v" + std::to_string(i);
+
+ ASSERT_OK(db->Put(write_options, key, value));
+ }
+
+ for (int i = 0; i < FLAGS_total_keys - 1; ++i) {
+ std::string key = "k" + std::to_string(i);
+ ASSERT_OK(db->Delete(write_options, key));
+ }
+
+ HistogramImpl hist_get;
+ HistogramImpl hist_get_time;
+ for (int i = 0; i < FLAGS_total_keys - 1; ++i) {
+ std::string key = "k" + std::to_string(i);
+ std::string value;
+
+ get_perf_context()->Reset();
+ StopWatchNano timer(SystemClock::Default().get());
+ timer.Start();
+ auto status = db->Get(read_options, key, &value);
+ auto elapsed_nanos = timer.ElapsedNanos();
+ ASSERT_TRUE(status.IsNotFound());
+ hist_get.Add(get_perf_context()->user_key_comparison_count);
+ hist_get_time.Add(elapsed_nanos);
+ }
+
+ if (FLAGS_verbose) {
+ std::cout << "Get user key comparison: \n"
+ << hist_get.ToString() << "Get time: \n"
+ << hist_get_time.ToString();
+ }
+
+ {
+ HistogramImpl hist_seek_to_first;
+ std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
+
+ get_perf_context()->Reset();
+ StopWatchNano timer(SystemClock::Default().get(), true);
+ iter->SeekToFirst();
+ hist_seek_to_first.Add(get_perf_context()->user_key_comparison_count);
+ auto elapsed_nanos = timer.ElapsedNanos();
+
+ if (FLAGS_verbose) {
+ std::cout << "SeekToFirst user key comparison: \n"
+ << hist_seek_to_first.ToString() << "ikey skipped: "
+ << get_perf_context()->internal_key_skipped_count << "\n"
+ << "idelete skipped: "
+ << get_perf_context()->internal_delete_skipped_count << "\n"
+ << "elapsed: " << elapsed_nanos << "\n";
+ }
+ }
+
+ HistogramImpl hist_seek;
+ for (int i = 0; i < FLAGS_total_keys; ++i) {
+ std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
+ std::string key = "k" + std::to_string(i);
+
+ get_perf_context()->Reset();
+ StopWatchNano timer(SystemClock::Default().get(), true);
+ iter->Seek(key);
+ auto elapsed_nanos = timer.ElapsedNanos();
+ hist_seek.Add(get_perf_context()->user_key_comparison_count);
+ if (FLAGS_verbose) {
+ std::cout << "seek cmp: " << get_perf_context()->user_key_comparison_count
+ << " ikey skipped "
+ << get_perf_context()->internal_key_skipped_count
+ << " idelete skipped "
+ << get_perf_context()->internal_delete_skipped_count
+ << " elapsed: " << elapsed_nanos << "ns\n";
+ }
+
+ get_perf_context()->Reset();
+ ASSERT_TRUE(iter->Valid());
+ StopWatchNano timer2(SystemClock::Default().get(), true);
+ iter->Next();
+ auto elapsed_nanos2 = timer2.ElapsedNanos();
+ if (FLAGS_verbose) {
+ std::cout << "next cmp: " << get_perf_context()->user_key_comparison_count
+ << "elapsed: " << elapsed_nanos2 << "ns\n";
+ }
+ }
+
+ if (FLAGS_verbose) {
+ std::cout << "Seek user key comparison: \n" << hist_seek.ToString();
+ }
+}
+
+TEST_F(PerfContextTest, StopWatchNanoOverhead) {
+ // profile the timer cost by itself!
+ const int kTotalIterations = 1000000;
+ std::vector<uint64_t> timings(kTotalIterations);
+
+ StopWatchNano timer(SystemClock::Default().get(), true);
+ for (auto& timing : timings) {
+ timing = timer.ElapsedNanos(true /* reset */);
+ }
+
+ HistogramImpl histogram;
+ for (const auto timing : timings) {
+ histogram.Add(timing);
+ }
+
+ if (FLAGS_verbose) {
+ std::cout << histogram.ToString();
+ }
+}
+
+TEST_F(PerfContextTest, StopWatchOverhead) {
+ // profile the timer cost by itself!
+ const int kTotalIterations = 1000000;
+ uint64_t elapsed = 0;
+ std::vector<uint64_t> timings(kTotalIterations);
+
+ StopWatch timer(SystemClock::Default().get(), nullptr, 0, &elapsed);
+ for (auto& timing : timings) {
+ timing = elapsed;
+ }
+
+ HistogramImpl histogram;
+ uint64_t prev_timing = 0;
+ for (const auto timing : timings) {
+ histogram.Add(timing - prev_timing);
+ prev_timing = timing;
+ }
+
+ if (FLAGS_verbose) {
+ std::cout << histogram.ToString();
+ }
+}
+
+void ProfileQueries(bool enabled_time = false) {
+ ASSERT_OK(DestroyDB(kDbName, Options())); // Start this test with a fresh DB
+
+ auto db = OpenDb();
+
+ WriteOptions write_options;
+ ReadOptions read_options;
+
+ HistogramImpl hist_put;
+
+ HistogramImpl hist_get;
+ HistogramImpl hist_get_snapshot;
+ HistogramImpl hist_get_memtable;
+ HistogramImpl hist_get_files;
+ HistogramImpl hist_get_post_process;
+ HistogramImpl hist_num_memtable_checked;
+
+ HistogramImpl hist_mget;
+ HistogramImpl hist_mget_snapshot;
+ HistogramImpl hist_mget_memtable;
+ HistogramImpl hist_mget_files;
+ HistogramImpl hist_mget_post_process;
+ HistogramImpl hist_mget_num_memtable_checked;
+
+ HistogramImpl hist_write_pre_post;
+ HistogramImpl hist_write_wal_time;
+ HistogramImpl hist_write_memtable_time;
+ HistogramImpl hist_write_delay_time;
+ HistogramImpl hist_write_thread_wait_nanos;
+ HistogramImpl hist_write_scheduling_time;
+
+ uint64_t total_db_mutex_nanos = 0;
+
+ if (FLAGS_verbose) {
+ std::cout << "Inserting " << FLAGS_total_keys << " key/value pairs\n...\n";
+ }
+
+ std::vector<int> keys;
+ const int kFlushFlag = -1;
+ for (int i = 0; i < FLAGS_total_keys; ++i) {
+ keys.push_back(i);
+ if (i == FLAGS_total_keys / 2) {
+ // Issuing a flush in the middle.
+ keys.push_back(kFlushFlag);
+ }
+ }
+
+ if (FLAGS_random_key) {
+ RandomShuffle(std::begin(keys), std::end(keys));
+ }
+#ifndef NDEBUG
+ ThreadStatusUtil::TEST_SetStateDelay(ThreadStatus::STATE_MUTEX_WAIT, 1U);
+#endif
+ int num_mutex_waited = 0;
+ for (const int i : keys) {
+ if (i == kFlushFlag) {
+ FlushOptions fo;
+      ASSERT_OK(db->Flush(fo));
+ continue;
+ }
+
+ std::string key = "k" + std::to_string(i);
+ std::string value = "v" + std::to_string(i);
+
+ std::vector<std::string> values;
+
+ get_perf_context()->Reset();
+ ASSERT_OK(db->Put(write_options, key, value));
+ if (++num_mutex_waited > 3) {
+#ifndef NDEBUG
+ ThreadStatusUtil::TEST_SetStateDelay(ThreadStatus::STATE_MUTEX_WAIT, 0U);
+#endif
+ }
+ hist_write_pre_post.Add(
+ get_perf_context()->write_pre_and_post_process_time);
+ hist_write_wal_time.Add(get_perf_context()->write_wal_time);
+ hist_write_memtable_time.Add(get_perf_context()->write_memtable_time);
+ hist_write_delay_time.Add(get_perf_context()->write_delay_time);
+ hist_write_thread_wait_nanos.Add(
+ get_perf_context()->write_thread_wait_nanos);
+ hist_write_scheduling_time.Add(
+ get_perf_context()->write_scheduling_flushes_compactions_time);
+ hist_put.Add(get_perf_context()->user_key_comparison_count);
+ total_db_mutex_nanos += get_perf_context()->db_mutex_lock_nanos;
+ }
+#ifndef NDEBUG
+ ThreadStatusUtil::TEST_SetStateDelay(ThreadStatus::STATE_MUTEX_WAIT, 0U);
+#endif
+
+ for (const int i : keys) {
+ if (i == kFlushFlag) {
+ continue;
+ }
+ std::string key = "k" + std::to_string(i);
+ std::string expected_value = "v" + std::to_string(i);
+ std::string value;
+
+ std::vector<Slice> multiget_keys = {Slice(key)};
+ std::vector<std::string> values;
+
+ get_perf_context()->Reset();
+ ASSERT_OK(db->Get(read_options, key, &value));
+ ASSERT_EQ(expected_value, value);
+ hist_get_snapshot.Add(get_perf_context()->get_snapshot_time);
+ hist_get_memtable.Add(get_perf_context()->get_from_memtable_time);
+ hist_get_files.Add(get_perf_context()->get_from_output_files_time);
+ hist_num_memtable_checked.Add(get_perf_context()->get_from_memtable_count);
+ hist_get_post_process.Add(get_perf_context()->get_post_process_time);
+ hist_get.Add(get_perf_context()->user_key_comparison_count);
+
+ get_perf_context()->Reset();
+ auto statuses = db->MultiGet(read_options, multiget_keys, &values);
+ for (const auto& s : statuses) {
+ ASSERT_OK(s);
+ }
+ hist_mget_snapshot.Add(get_perf_context()->get_snapshot_time);
+ hist_mget_memtable.Add(get_perf_context()->get_from_memtable_time);
+ hist_mget_files.Add(get_perf_context()->get_from_output_files_time);
+ hist_mget_num_memtable_checked.Add(
+ get_perf_context()->get_from_memtable_count);
+ hist_mget_post_process.Add(get_perf_context()->get_post_process_time);
+ hist_mget.Add(get_perf_context()->user_key_comparison_count);
+ }
+
+ if (FLAGS_verbose) {
+    std::cout << "Put user key comparison: \n"
+              << hist_put.ToString() << "Get user key comparison: \n"
+              << hist_get.ToString() << "MultiGet user key comparison: \n"
+              << hist_mget.ToString();
+ std::cout << "Put(): Pre and Post Process Time: \n"
+ << hist_write_pre_post.ToString() << " Writing WAL time: \n"
+ << hist_write_wal_time.ToString() << "\n"
+ << " Writing Mem Table time: \n"
+ << hist_write_memtable_time.ToString() << "\n"
+ << " Write Delay: \n"
+ << hist_write_delay_time.ToString() << "\n"
+ << " Waiting for Batch time: \n"
+ << hist_write_thread_wait_nanos.ToString() << "\n"
+ << " Scheduling Flushes and Compactions Time: \n"
+ << hist_write_scheduling_time.ToString() << "\n"
+ << " Total DB mutex nanos: \n"
+ << total_db_mutex_nanos << "\n";
+
+ std::cout << "Get(): Time to get snapshot: \n"
+ << hist_get_snapshot.ToString()
+ << " Time to get value from memtables: \n"
+ << hist_get_memtable.ToString() << "\n"
+ << " Time to get value from output files: \n"
+ << hist_get_files.ToString() << "\n"
+ << " Number of memtables checked: \n"
+ << hist_num_memtable_checked.ToString() << "\n"
+ << " Time to post process: \n"
+ << hist_get_post_process.ToString() << "\n";
+
+ std::cout << "MultiGet(): Time to get snapshot: \n"
+ << hist_mget_snapshot.ToString()
+ << " Time to get value from memtables: \n"
+ << hist_mget_memtable.ToString() << "\n"
+ << " Time to get value from output files: \n"
+ << hist_mget_files.ToString() << "\n"
+ << " Number of memtables checked: \n"
+ << hist_mget_num_memtable_checked.ToString() << "\n"
+ << " Time to post process: \n"
+ << hist_mget_post_process.ToString() << "\n";
+ }
+
+ if (enabled_time) {
+ ASSERT_GT(hist_get.Average(), 0);
+ ASSERT_GT(hist_get_snapshot.Average(), 0);
+ ASSERT_GT(hist_get_memtable.Average(), 0);
+ ASSERT_GT(hist_get_files.Average(), 0);
+ ASSERT_GT(hist_get_post_process.Average(), 0);
+ ASSERT_GT(hist_num_memtable_checked.Average(), 0);
+
+ ASSERT_GT(hist_mget.Average(), 0);
+ ASSERT_GT(hist_mget_snapshot.Average(), 0);
+ ASSERT_GT(hist_mget_memtable.Average(), 0);
+ ASSERT_GT(hist_mget_files.Average(), 0);
+ ASSERT_GT(hist_mget_post_process.Average(), 0);
+ ASSERT_GT(hist_mget_num_memtable_checked.Average(), 0);
+
+ EXPECT_GT(hist_write_pre_post.Average(), 0);
+ EXPECT_GT(hist_write_wal_time.Average(), 0);
+ EXPECT_GT(hist_write_memtable_time.Average(), 0);
+ EXPECT_EQ(hist_write_delay_time.Average(), 0);
+ EXPECT_EQ(hist_write_thread_wait_nanos.Average(), 0);
+ EXPECT_GT(hist_write_scheduling_time.Average(), 0);
+
+#ifndef NDEBUG
+ ASSERT_LT(total_db_mutex_nanos, 100U);
+#endif
+ }
+
+ db.reset();
+ db = OpenDb(true);
+
+ hist_get.Clear();
+ hist_get_snapshot.Clear();
+ hist_get_memtable.Clear();
+ hist_get_files.Clear();
+ hist_get_post_process.Clear();
+ hist_num_memtable_checked.Clear();
+
+ hist_mget.Clear();
+ hist_mget_snapshot.Clear();
+ hist_mget_memtable.Clear();
+ hist_mget_files.Clear();
+ hist_mget_post_process.Clear();
+ hist_mget_num_memtable_checked.Clear();
+
+ for (const int i : keys) {
+ if (i == kFlushFlag) {
+ continue;
+ }
+ std::string key = "k" + std::to_string(i);
+ std::string expected_value = "v" + std::to_string(i);
+ std::string value;
+
+ std::vector<Slice> multiget_keys = {Slice(key)};
+ std::vector<std::string> values;
+
+ get_perf_context()->Reset();
+ ASSERT_OK(db->Get(read_options, key, &value));
+ ASSERT_EQ(expected_value, value);
+ hist_get_snapshot.Add(get_perf_context()->get_snapshot_time);
+ hist_get_memtable.Add(get_perf_context()->get_from_memtable_time);
+ hist_get_files.Add(get_perf_context()->get_from_output_files_time);
+ hist_num_memtable_checked.Add(get_perf_context()->get_from_memtable_count);
+ hist_get_post_process.Add(get_perf_context()->get_post_process_time);
+ hist_get.Add(get_perf_context()->user_key_comparison_count);
+
+ get_perf_context()->Reset();
+ auto statuses = db->MultiGet(read_options, multiget_keys, &values);
+ for (const auto& s : statuses) {
+ ASSERT_OK(s);
+ }
+ hist_mget_snapshot.Add(get_perf_context()->get_snapshot_time);
+ hist_mget_memtable.Add(get_perf_context()->get_from_memtable_time);
+ hist_mget_files.Add(get_perf_context()->get_from_output_files_time);
+ hist_mget_num_memtable_checked.Add(
+ get_perf_context()->get_from_memtable_count);
+ hist_mget_post_process.Add(get_perf_context()->get_post_process_time);
+ hist_mget.Add(get_perf_context()->user_key_comparison_count);
+ }
+
+ if (FLAGS_verbose) {
+ std::cout << "ReadOnly Get user key comparison: \n"
+ << hist_get.ToString()
+ << "ReadOnly MultiGet user key comparison: \n"
+ << hist_mget.ToString();
+
+ std::cout << "ReadOnly Get(): Time to get snapshot: \n"
+ << hist_get_snapshot.ToString()
+ << " Time to get value from memtables: \n"
+ << hist_get_memtable.ToString() << "\n"
+ << " Time to get value from output files: \n"
+ << hist_get_files.ToString() << "\n"
+ << " Number of memtables checked: \n"
+ << hist_num_memtable_checked.ToString() << "\n"
+ << " Time to post process: \n"
+ << hist_get_post_process.ToString() << "\n";
+
+ std::cout << "ReadOnly MultiGet(): Time to get snapshot: \n"
+ << hist_mget_snapshot.ToString()
+ << " Time to get value from memtables: \n"
+ << hist_mget_memtable.ToString() << "\n"
+ << " Time to get value from output files: \n"
+ << hist_mget_files.ToString() << "\n"
+ << " Number of memtables checked: \n"
+ << hist_mget_num_memtable_checked.ToString() << "\n"
+ << " Time to post process: \n"
+ << hist_mget_post_process.ToString() << "\n";
+ }
+
+ if (enabled_time) {
+ ASSERT_GT(hist_get.Average(), 0);
+ ASSERT_GT(hist_get_memtable.Average(), 0);
+ ASSERT_GT(hist_get_files.Average(), 0);
+ ASSERT_GT(hist_num_memtable_checked.Average(), 0);
+ // In read-only mode Get(), no super version operation is needed
+ ASSERT_EQ(hist_get_post_process.Average(), 0);
+ ASSERT_GT(hist_get_snapshot.Average(), 0);
+
+ ASSERT_GT(hist_mget.Average(), 0);
+ ASSERT_GT(hist_mget_snapshot.Average(), 0);
+ ASSERT_GT(hist_mget_memtable.Average(), 0);
+ ASSERT_GT(hist_mget_files.Average(), 0);
+ ASSERT_GT(hist_mget_post_process.Average(), 0);
+ ASSERT_GT(hist_mget_num_memtable_checked.Average(), 0);
+ }
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(PerfContextTest, KeyComparisonCount) {
+ SetPerfLevel(kEnableCount);
+ ProfileQueries();
+
+ SetPerfLevel(kDisable);
+ ProfileQueries();
+
+ SetPerfLevel(kEnableTime);
+ ProfileQueries(true);
+}
+#endif // ROCKSDB_LITE
+
+// make perf_context_test
+// export ROCKSDB_TESTS=PerfContextTest.SeekKeyComparison
+// For one memtable:
+// ./perf_context_test --write_buffer_size=500000 --total_keys=10000
+// For two memtables:
+// ./perf_context_test --write_buffer_size=250000 --total_keys=10000
+// Specify --random_key=1 to shuffle the key before insertion
+// Results show that, for sequential insertion, the worst-case Seek key
+// comparison count is close to the total number of keys (linear) when there
+// is only one memtable. When there are two memtables, even the average Seek
+// key comparison count starts to grow linearly with the input size.
+
+TEST_F(PerfContextTest, SeekKeyComparison) {
+ ASSERT_OK(DestroyDB(kDbName, Options()));
+ auto db = OpenDb();
+ WriteOptions write_options;
+ ReadOptions read_options;
+
+ if (FLAGS_verbose) {
+ std::cout << "Inserting " << FLAGS_total_keys << " key/value pairs\n...\n";
+ }
+
+ std::vector<int> keys;
+ for (int i = 0; i < FLAGS_total_keys; ++i) {
+ keys.push_back(i);
+ }
+
+ if (FLAGS_random_key) {
+ RandomShuffle(std::begin(keys), std::end(keys));
+ }
+
+ HistogramImpl hist_put_time;
+ HistogramImpl hist_wal_time;
+ HistogramImpl hist_time_diff;
+
+ SetPerfLevel(kEnableTime);
+ StopWatchNano timer(SystemClock::Default().get());
+ for (const int i : keys) {
+ std::string key = "k" + std::to_string(i);
+ std::string value = "v" + std::to_string(i);
+
+ get_perf_context()->Reset();
+ timer.Start();
+ ASSERT_OK(db->Put(write_options, key, value));
+ auto put_time = timer.ElapsedNanos();
+ hist_put_time.Add(put_time);
+ hist_wal_time.Add(get_perf_context()->write_wal_time);
+ hist_time_diff.Add(put_time - get_perf_context()->write_wal_time);
+ }
+
+ if (FLAGS_verbose) {
+ std::cout << "Put time:\n"
+ << hist_put_time.ToString() << "WAL time:\n"
+ << hist_wal_time.ToString() << "time diff:\n"
+ << hist_time_diff.ToString();
+ }
+
+ HistogramImpl hist_seek;
+ HistogramImpl hist_next;
+
+ for (int i = 0; i < FLAGS_total_keys; ++i) {
+ std::string key = "k" + std::to_string(i);
+ std::string value = "v" + std::to_string(i);
+
+ std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
+ get_perf_context()->Reset();
+ iter->Seek(key);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->value().ToString(), value);
+ hist_seek.Add(get_perf_context()->user_key_comparison_count);
+ }
+
+ std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
+ for (iter->SeekToFirst(); iter->Valid();) {
+ get_perf_context()->Reset();
+ iter->Next();
+ hist_next.Add(get_perf_context()->user_key_comparison_count);
+ }
+ ASSERT_OK(iter->status());
+ if (FLAGS_verbose) {
+ std::cout << "Seek:\n"
+ << hist_seek.ToString() << "Next:\n"
+ << hist_next.ToString();
+ }
+}
+
+TEST_F(PerfContextTest, DBMutexLockCounter) {
+ int stats_code[] = {0, static_cast<int>(DB_MUTEX_WAIT_MICROS)};
+ for (PerfLevel perf_level_test :
+ {PerfLevel::kEnableTimeExceptForMutex, PerfLevel::kEnableTime}) {
+ for (int c = 0; c < 2; ++c) {
+ InstrumentedMutex mutex(nullptr, SystemClock::Default().get(),
+ stats_code[c]);
+ mutex.Lock();
+ ROCKSDB_NAMESPACE::port::Thread child_thread([&] {
+ SetPerfLevel(perf_level_test);
+ get_perf_context()->Reset();
+ ASSERT_EQ(get_perf_context()->db_mutex_lock_nanos, 0);
+ mutex.Lock();
+ mutex.Unlock();
+ if (perf_level_test == PerfLevel::kEnableTimeExceptForMutex ||
+ stats_code[c] != DB_MUTEX_WAIT_MICROS) {
+ ASSERT_EQ(get_perf_context()->db_mutex_lock_nanos, 0);
+ } else {
+ // increment the counter only when it's a DB Mutex
+ ASSERT_GT(get_perf_context()->db_mutex_lock_nanos, 0);
+ }
+ });
+ SystemClock::Default()->SleepForMicroseconds(100);
+ mutex.Unlock();
+ child_thread.join();
+ }
+ }
+}
+
+TEST_F(PerfContextTest, FalseDBMutexWait) {
+ SetPerfLevel(kEnableTime);
+ int stats_code[] = {0, static_cast<int>(DB_MUTEX_WAIT_MICROS)};
+ for (int c = 0; c < 2; ++c) {
+ InstrumentedMutex mutex(nullptr, SystemClock::Default().get(),
+ stats_code[c]);
+ InstrumentedCondVar lock(&mutex);
+ get_perf_context()->Reset();
+ mutex.Lock();
+ lock.TimedWait(100);
+ mutex.Unlock();
+ if (stats_code[c] == static_cast<int>(DB_MUTEX_WAIT_MICROS)) {
+ // increment the counter only when it's a DB Mutex
+ ASSERT_GT(get_perf_context()->db_condition_wait_nanos, 0);
+ } else {
+ ASSERT_EQ(get_perf_context()->db_condition_wait_nanos, 0);
+ }
+ }
+}
+
+TEST_F(PerfContextTest, ToString) {
+ get_perf_context()->Reset();
+ get_perf_context()->block_read_count = 12345;
+
+ std::string zero_included = get_perf_context()->ToString();
+ ASSERT_NE(std::string::npos, zero_included.find("= 0"));
+ ASSERT_NE(std::string::npos, zero_included.find("= 12345"));
+
+ std::string zero_excluded = get_perf_context()->ToString(true);
+ ASSERT_EQ(std::string::npos, zero_excluded.find("= 0"));
+ ASSERT_NE(std::string::npos, zero_excluded.find("= 12345"));
+}
+
+TEST_F(PerfContextTest, MergeOperatorTime) {
+ ASSERT_OK(DestroyDB(kDbName, Options()));
+ DB* db;
+ Options options;
+ options.create_if_missing = true;
+ options.merge_operator = MergeOperators::CreateStringAppendOperator();
+ Status s = DB::Open(options, kDbName, &db);
+ EXPECT_OK(s);
+
+ std::string val;
+ ASSERT_OK(db->Merge(WriteOptions(), "k1", "val1"));
+ ASSERT_OK(db->Merge(WriteOptions(), "k1", "val2"));
+ ASSERT_OK(db->Merge(WriteOptions(), "k1", "val3"));
+ ASSERT_OK(db->Merge(WriteOptions(), "k1", "val4"));
+
+ SetPerfLevel(kEnableTime);
+ get_perf_context()->Reset();
+ ASSERT_OK(db->Get(ReadOptions(), "k1", &val));
+#ifdef OS_SOLARIS
+ for (int i = 0; i < 100; i++) {
+ ASSERT_OK(db->Get(ReadOptions(), "k1", &val));
+ }
+#endif
+ EXPECT_GT(get_perf_context()->merge_operator_time_nanos, 0);
+
+ ASSERT_OK(db->Flush(FlushOptions()));
+
+ get_perf_context()->Reset();
+ ASSERT_OK(db->Get(ReadOptions(), "k1", &val));
+#ifdef OS_SOLARIS
+ for (int i = 0; i < 100; i++) {
+ ASSERT_OK(db->Get(ReadOptions(), "k1", &val));
+ }
+#endif
+ EXPECT_GT(get_perf_context()->merge_operator_time_nanos, 0);
+
+ ASSERT_OK(db->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+ get_perf_context()->Reset();
+ ASSERT_OK(db->Get(ReadOptions(), "k1", &val));
+#ifdef OS_SOLARIS
+ for (int i = 0; i < 100; i++) {
+ ASSERT_OK(db->Get(ReadOptions(), "k1", &val));
+ }
+#endif
+ EXPECT_GT(get_perf_context()->merge_operator_time_nanos, 0);
+
+ delete db;
+}
+
+TEST_F(PerfContextTest, CopyAndMove) {
+ // Assignment operator
+ {
+ get_perf_context()->Reset();
+ get_perf_context()->EnablePerLevelPerfContext();
+ PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, 5);
+ ASSERT_EQ(
+ 1,
+ (*(get_perf_context()->level_to_perf_context))[5].bloom_filter_useful);
+ PerfContext perf_context_assign;
+ perf_context_assign = *get_perf_context();
+ ASSERT_EQ(
+ 1,
+ (*(perf_context_assign.level_to_perf_context))[5].bloom_filter_useful);
+ get_perf_context()->ClearPerLevelPerfContext();
+ get_perf_context()->Reset();
+ ASSERT_EQ(
+ 1,
+ (*(perf_context_assign.level_to_perf_context))[5].bloom_filter_useful);
+ perf_context_assign.ClearPerLevelPerfContext();
+ perf_context_assign.Reset();
+ }
+ // Copy constructor
+ {
+ get_perf_context()->Reset();
+ get_perf_context()->EnablePerLevelPerfContext();
+ PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, 5);
+ ASSERT_EQ(
+ 1,
+ (*(get_perf_context()->level_to_perf_context))[5].bloom_filter_useful);
+ PerfContext perf_context_copy(*get_perf_context());
+ ASSERT_EQ(
+ 1, (*(perf_context_copy.level_to_perf_context))[5].bloom_filter_useful);
+ get_perf_context()->ClearPerLevelPerfContext();
+ get_perf_context()->Reset();
+ ASSERT_EQ(
+ 1, (*(perf_context_copy.level_to_perf_context))[5].bloom_filter_useful);
+ perf_context_copy.ClearPerLevelPerfContext();
+ perf_context_copy.Reset();
+ }
+ // Move constructor
+ {
+ get_perf_context()->Reset();
+ get_perf_context()->EnablePerLevelPerfContext();
+ PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, 5);
+ ASSERT_EQ(
+ 1,
+ (*(get_perf_context()->level_to_perf_context))[5].bloom_filter_useful);
+ PerfContext perf_context_move = std::move(*get_perf_context());
+ ASSERT_EQ(
+ 1, (*(perf_context_move.level_to_perf_context))[5].bloom_filter_useful);
+ get_perf_context()->ClearPerLevelPerfContext();
+ get_perf_context()->Reset();
+ ASSERT_EQ(
+ 1, (*(perf_context_move.level_to_perf_context))[5].bloom_filter_useful);
+ perf_context_move.ClearPerLevelPerfContext();
+ perf_context_move.Reset();
+ }
+}
+
+TEST_F(PerfContextTest, PerfContextDisableEnable) {
+ get_perf_context()->Reset();
+ get_perf_context()->EnablePerLevelPerfContext();
+ PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_positive, 1, 0);
+ get_perf_context()->DisablePerLevelPerfContext();
+ PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, 5);
+ get_perf_context()->EnablePerLevelPerfContext();
+ PERF_COUNTER_BY_LEVEL_ADD(block_cache_hit_count, 1, 0);
+ get_perf_context()->DisablePerLevelPerfContext();
+ PerfContext perf_context_copy(*get_perf_context());
+ ASSERT_EQ(1, (*(perf_context_copy.level_to_perf_context))[0]
+ .bloom_filter_full_positive);
+  // This was set while the per-level perf context was disabled, so it should
+  // not be copied.
+ ASSERT_NE(
+ 1, (*(perf_context_copy.level_to_perf_context))[5].bloom_filter_useful);
+ ASSERT_EQ(
+ 1, (*(perf_context_copy.level_to_perf_context))[0].block_cache_hit_count);
+ perf_context_copy.ClearPerLevelPerfContext();
+ perf_context_copy.Reset();
+ get_perf_context()->ClearPerLevelPerfContext();
+ get_perf_context()->Reset();
+}
+
+TEST_F(PerfContextTest, PerfContextByLevelGetSet) {
+ get_perf_context()->Reset();
+ get_perf_context()->EnablePerLevelPerfContext();
+ PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_positive, 1, 0);
+ PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, 5);
+ PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, 7);
+ PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, 7);
+ PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_true_positive, 1, 2);
+ PERF_COUNTER_BY_LEVEL_ADD(block_cache_hit_count, 1, 0);
+ PERF_COUNTER_BY_LEVEL_ADD(block_cache_hit_count, 5, 2);
+ PERF_COUNTER_BY_LEVEL_ADD(block_cache_miss_count, 2, 3);
+ PERF_COUNTER_BY_LEVEL_ADD(block_cache_miss_count, 4, 1);
+ ASSERT_EQ(
+ 0, (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful);
+ ASSERT_EQ(
+ 1, (*(get_perf_context()->level_to_perf_context))[5].bloom_filter_useful);
+ ASSERT_EQ(
+ 2, (*(get_perf_context()->level_to_perf_context))[7].bloom_filter_useful);
+ ASSERT_EQ(1, (*(get_perf_context()->level_to_perf_context))[0]
+ .bloom_filter_full_positive);
+ ASSERT_EQ(1, (*(get_perf_context()->level_to_perf_context))[2]
+ .bloom_filter_full_true_positive);
+ ASSERT_EQ(
+ 1,
+ (*(get_perf_context()->level_to_perf_context))[0].block_cache_hit_count);
+ ASSERT_EQ(
+ 5,
+ (*(get_perf_context()->level_to_perf_context))[2].block_cache_hit_count);
+ ASSERT_EQ(
+ 2,
+ (*(get_perf_context()->level_to_perf_context))[3].block_cache_miss_count);
+ ASSERT_EQ(
+ 4,
+ (*(get_perf_context()->level_to_perf_context))[1].block_cache_miss_count);
+ std::string zero_excluded = get_perf_context()->ToString(true);
+ ASSERT_NE(std::string::npos,
+ zero_excluded.find("bloom_filter_useful = 1@level5, 2@level7"));
+ ASSERT_NE(std::string::npos,
+ zero_excluded.find("bloom_filter_full_positive = 1@level0"));
+ ASSERT_NE(std::string::npos,
+ zero_excluded.find("bloom_filter_full_true_positive = 1@level2"));
+ ASSERT_NE(std::string::npos,
+ zero_excluded.find("block_cache_hit_count = 1@level0, 5@level2"));
+ ASSERT_NE(std::string::npos,
+ zero_excluded.find("block_cache_miss_count = 4@level1, 2@level3"));
+}
+
+TEST_F(PerfContextTest, CPUTimer) {
+ if (SystemClock::Default()->CPUNanos() == 0) {
+ ROCKSDB_GTEST_SKIP("Target without CPUNanos support");
+ return;
+ }
+
+ ASSERT_OK(DestroyDB(kDbName, Options()));
+ auto db = OpenDb();
+ WriteOptions write_options;
+ ReadOptions read_options;
+ SetPerfLevel(PerfLevel::kEnableTimeAndCPUTimeExceptForMutex);
+
+ std::string max_str = "0";
+ for (int i = 0; i < FLAGS_total_keys; ++i) {
+ std::string i_str = std::to_string(i);
+ std::string key = "k" + i_str;
+ std::string value = "v" + i_str;
+ max_str = max_str > i_str ? max_str : i_str;
+
+ ASSERT_OK(db->Put(write_options, key, value));
+ }
+ std::string last_key = "k" + max_str;
+ std::string last_value = "v" + max_str;
+
+ {
+ // Get
+ get_perf_context()->Reset();
+ std::string value;
+ ASSERT_OK(db->Get(read_options, "k0", &value));
+ ASSERT_EQ(value, "v0");
+
+ if (FLAGS_verbose) {
+ std::cout << "Get CPU time nanos: " << get_perf_context()->get_cpu_nanos
+ << "ns\n";
+ }
+
+ // Iter
+ std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
+
+ // Seek
+ get_perf_context()->Reset();
+ iter->Seek(last_key);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(last_value, iter->value().ToString());
+
+ if (FLAGS_verbose) {
+ std::cout << "Iter Seek CPU time nanos: "
+ << get_perf_context()->iter_seek_cpu_nanos << "ns\n";
+ }
+
+ // SeekForPrev
+ get_perf_context()->Reset();
+ iter->SeekForPrev(last_key);
+ ASSERT_TRUE(iter->Valid());
+
+ if (FLAGS_verbose) {
+ std::cout << "Iter SeekForPrev CPU time nanos: "
+ << get_perf_context()->iter_seek_cpu_nanos << "ns\n";
+ }
+
+ // SeekToLast
+ get_perf_context()->Reset();
+ iter->SeekToLast();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(last_value, iter->value().ToString());
+
+ if (FLAGS_verbose) {
+ std::cout << "Iter SeekToLast CPU time nanos: "
+ << get_perf_context()->iter_seek_cpu_nanos << "ns\n";
+ }
+
+ // SeekToFirst
+ get_perf_context()->Reset();
+ iter->SeekToFirst();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("v0", iter->value().ToString());
+
+ if (FLAGS_verbose) {
+ std::cout << "Iter SeekToFirst CPU time nanos: "
+ << get_perf_context()->iter_seek_cpu_nanos << "ns\n";
+ }
+
+ // Next
+ get_perf_context()->Reset();
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("v1", iter->value().ToString());
+
+ if (FLAGS_verbose) {
+ std::cout << "Iter Next CPU time nanos: "
+ << get_perf_context()->iter_next_cpu_nanos << "ns\n";
+ }
+
+ // Prev
+ get_perf_context()->Reset();
+ iter->Prev();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("v0", iter->value().ToString());
+
+ if (FLAGS_verbose) {
+ std::cout << "Iter Prev CPU time nanos: "
+ << get_perf_context()->iter_prev_cpu_nanos << "ns\n";
+ }
+
+ // monotonically increasing
+ get_perf_context()->Reset();
+ auto count = get_perf_context()->iter_seek_cpu_nanos;
+ for (int i = 0; i < FLAGS_total_keys; ++i) {
+ iter->Seek("k" + std::to_string(i));
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("v" + std::to_string(i), iter->value().ToString());
+ auto next_count = get_perf_context()->iter_seek_cpu_nanos;
+ ASSERT_GT(next_count, count);
+ count = next_count;
+ }
+
+ // iterator creation/destruction; multiple iterators
+ {
+ std::unique_ptr<Iterator> iter2(db->NewIterator(read_options));
+ ASSERT_EQ(count, get_perf_context()->iter_seek_cpu_nanos);
+ iter2->Seek(last_key);
+ ASSERT_TRUE(iter2->Valid());
+ ASSERT_EQ(last_value, iter2->value().ToString());
+ ASSERT_GT(get_perf_context()->iter_seek_cpu_nanos, count);
+ count = get_perf_context()->iter_seek_cpu_nanos;
+ }
+ ASSERT_EQ(count, get_perf_context()->iter_seek_cpu_nanos);
+ }
+}
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+
+ for (int i = 1; i < argc; i++) {
+ int n;
+ char junk;
+
+ if (sscanf(argv[i], "--write_buffer_size=%d%c", &n, &junk) == 1) {
+ FLAGS_write_buffer_size = n;
+ }
+
+ if (sscanf(argv[i], "--total_keys=%d%c", &n, &junk) == 1) {
+ FLAGS_total_keys = n;
+ }
+
+ if (sscanf(argv[i], "--random_key=%d%c", &n, &junk) == 1 &&
+ (n == 0 || n == 1)) {
+ FLAGS_random_key = n;
+ }
+
+ if (sscanf(argv[i], "--use_set_based_memetable=%d%c", &n, &junk) == 1 &&
+ (n == 0 || n == 1)) {
+ FLAGS_use_set_based_memetable = n;
+ }
+
+ if (sscanf(argv[i], "--verbose=%d%c", &n, &junk) == 1 &&
+ (n == 0 || n == 1)) {
+ FLAGS_verbose = n;
+ }
+ }
+
+ if (FLAGS_verbose) {
+ std::cout << kDbName << "\n";
+ }
+
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/periodic_task_scheduler.cc b/src/rocksdb/db/periodic_task_scheduler.cc
new file mode 100644
index 000000000..2024510dd
--- /dev/null
+++ b/src/rocksdb/db/periodic_task_scheduler.cc
@@ -0,0 +1,113 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/periodic_task_scheduler.h"
+
+#include "rocksdb/system_clock.h"
+
+#ifndef ROCKSDB_LITE
+namespace ROCKSDB_NAMESPACE {
+
+// `timer_mutex` is a global mutex that currently serves 3 purposes:
+// (1) to ensure calls to `Start()` and `Shutdown()` are serialized, as
+//     they are currently not implemented in a thread-safe way;
+// (2) to ensure the `Timer::Add()`s and `Timer::Start()` run atomically, and
+//     the `Timer::Cancel()`s and `Timer::Shutdown()` run atomically; and
+// (3) to protect tasks_map_ in PeriodicTaskScheduler.
+// Note: a static global mutex is not efficient, but for
+// PeriodicTaskScheduler it should be okay, as the operations are called
+// infrequently.
+static port::Mutex timer_mutex;
+
+static const std::map<PeriodicTaskType, uint64_t> kDefaultPeriodSeconds = {
+ {PeriodicTaskType::kDumpStats, kInvalidPeriodSec},
+ {PeriodicTaskType::kPersistStats, kInvalidPeriodSec},
+ {PeriodicTaskType::kFlushInfoLog, 10},
+ {PeriodicTaskType::kRecordSeqnoTime, kInvalidPeriodSec},
+};
+
+static const std::map<PeriodicTaskType, std::string> kPeriodicTaskTypeNames = {
+ {PeriodicTaskType::kDumpStats, "dump_st"},
+ {PeriodicTaskType::kPersistStats, "pst_st"},
+ {PeriodicTaskType::kFlushInfoLog, "flush_info_log"},
+ {PeriodicTaskType::kRecordSeqnoTime, "record_seq_time"},
+};
+
+Status PeriodicTaskScheduler::Register(PeriodicTaskType task_type,
+ const PeriodicTaskFunc& fn) {
+ return Register(task_type, fn, kDefaultPeriodSeconds.at(task_type));
+}
+
+Status PeriodicTaskScheduler::Register(PeriodicTaskType task_type,
+ const PeriodicTaskFunc& fn,
+ uint64_t repeat_period_seconds) {
+ MutexLock l(&timer_mutex);
+ static std::atomic<uint64_t> initial_delay(0);
+
+ if (repeat_period_seconds == kInvalidPeriodSec) {
+ return Status::InvalidArgument("Invalid task repeat period");
+ }
+ auto it = tasks_map_.find(task_type);
+ if (it != tasks_map_.end()) {
+    // The task already exists with the same period; no update is needed.
+ if (it->second.repeat_every_sec == repeat_period_seconds) {
+ return Status::OK();
+ }
+    // Cancel the existing task before registering the new one.
+ timer_->Cancel(it->second.name);
+ tasks_map_.erase(it);
+ }
+
+ timer_->Start();
+  // Put the task type name as a prefix for easier debugging.
+ std::string unique_id =
+ kPeriodicTaskTypeNames.at(task_type) + std::to_string(id_++);
+
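+  // Stagger the initial delay across registrations (modulo the repeat
+  // period) so that tasks registered around the same time, e.g. by multiple
+  // DB instances sharing the global timer, do not all fire at the same
+  // instant.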
+ bool succeeded = timer_->Add(
+ fn, unique_id,
+ (initial_delay.fetch_add(1) % repeat_period_seconds) * kMicrosInSecond,
+ repeat_period_seconds * kMicrosInSecond);
+ if (!succeeded) {
+ return Status::Aborted("Failed to register periodic task");
+ }
+ auto result = tasks_map_.try_emplace(
+ task_type, TaskInfo{unique_id, repeat_period_seconds});
+ if (!result.second) {
+ return Status::Aborted("Failed to add periodic task");
+  }
+ return Status::OK();
+}
+
+Status PeriodicTaskScheduler::Unregister(PeriodicTaskType task_type) {
+ MutexLock l(&timer_mutex);
+ auto it = tasks_map_.find(task_type);
+ if (it != tasks_map_.end()) {
+ timer_->Cancel(it->second.name);
+ tasks_map_.erase(it);
+ }
+ if (!timer_->HasPendingTask()) {
+ timer_->Shutdown();
+ }
+ return Status::OK();
+}
+
+Timer* PeriodicTaskScheduler::Default() {
+ static Timer timer(SystemClock::Default().get());
+ return &timer;
+}
+
+#ifndef NDEBUG
+void PeriodicTaskScheduler::TEST_OverrideTimer(SystemClock* clock) {
+ static Timer test_timer(clock);
+ test_timer.TEST_OverrideTimer(clock);
+ MutexLock l(&timer_mutex);
+ timer_ = &test_timer;
+}
+#endif // NDEBUG
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/db/periodic_task_scheduler.h b/src/rocksdb/db/periodic_task_scheduler.h
new file mode 100644
index 000000000..f45b80c4d
--- /dev/null
+++ b/src/rocksdb/db/periodic_task_scheduler.h
@@ -0,0 +1,110 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include "util/timer.h"
+
+namespace ROCKSDB_NAMESPACE {
+class SystemClock;
+
+using PeriodicTaskFunc = std::function<void()>;
+
+constexpr uint64_t kInvalidPeriodSec = 0;
+
+// List of task types
+enum class PeriodicTaskType : uint8_t {
+ kDumpStats = 0,
+ kPersistStats,
+ kFlushInfoLog,
+ kRecordSeqnoTime,
+ kMax,
+};
+
+// PeriodicTaskScheduler contains the periodic tasks scheduled from a DB
+// instance. It is used to schedule/unschedule DumpStats(), PersistStats(),
+// FlushInfoLog(), etc. Each task type can only have one instance;
+// re-registering the same task type only updates the repeat period.
+//
+// Internally, it uses a global single-threaded timer object to run the
+// periodic task functions. The timer thread is always started since info log
+// flushing cannot be disabled.
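+//
+// A minimal usage sketch (illustrative only; the lambda body and the 600s
+// repeat period are made-up example values):
+//
+//   PeriodicTaskScheduler scheduler;
+//   Status s = scheduler.Register(
+//       PeriodicTaskType::kDumpStats, [] { /* dump stats */ },
+//       /*repeat_period_seconds=*/600);
+//   // ... later, to stop the task:
+//   s = scheduler.Unregister(PeriodicTaskType::kDumpStats);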
+class PeriodicTaskScheduler {
+ public:
+ explicit PeriodicTaskScheduler() = default;
+
+ PeriodicTaskScheduler(const PeriodicTaskScheduler&) = delete;
+ PeriodicTaskScheduler(PeriodicTaskScheduler&&) = delete;
+ PeriodicTaskScheduler& operator=(const PeriodicTaskScheduler&) = delete;
+ PeriodicTaskScheduler& operator=(PeriodicTaskScheduler&&) = delete;
+
+ // Register a task with its default repeat period
+ Status Register(PeriodicTaskType task_type, const PeriodicTaskFunc& fn);
+
+  // Register a task with the specified repeat period. 0 (kInvalidPeriodSec)
+  // is an invalid argument. To stop a task, use Unregister() instead.
+ Status Register(PeriodicTaskType task_type, const PeriodicTaskFunc& fn,
+ uint64_t repeat_period_seconds);
+
+ // Unregister the task
+ Status Unregister(PeriodicTaskType task_type);
+
+#ifndef NDEBUG
+ // Override the timer for the unittest
+ void TEST_OverrideTimer(SystemClock* clock);
+
+  // Call Timer::TEST_WaitForRun(), which waits until the Timer starts waiting.
+ void TEST_WaitForRun(const std::function<void()>& callback) const {
+ if (timer_ != nullptr) {
+ timer_->TEST_WaitForRun(callback);
+ }
+ }
+
+  // Get the number of valid (pending) tasks in the global Timer
+ size_t TEST_GetValidTaskNum() const {
+ if (timer_ != nullptr) {
+ return timer_->TEST_GetPendingTaskNum();
+ }
+ return 0;
+ }
+
+  // Whether the specified task type is registered
+ bool TEST_HasTask(PeriodicTaskType task_type) const {
+ auto it = tasks_map_.find(task_type);
+ return it != tasks_map_.end();
+ }
+#endif // NDEBUG
+
+ private:
+ // default global Timer instance
+ static Timer* Default();
+
+ // Internal structure to store task information
+ struct TaskInfo {
+ TaskInfo(std::string _name, uint64_t _repeat_every_sec)
+ : name(std::move(_name)), repeat_every_sec(_repeat_every_sec) {}
+ std::string name;
+ uint64_t repeat_every_sec;
+ };
+
+ // Internal tasks map
+ std::map<PeriodicTaskType, TaskInfo> tasks_map_;
+
+  // Global timer pointer. The Timer doesn't support adding/canceling tasks
+  // synchronously, so the global `timer_mutex` is used when adding/canceling
+  // tasks.
+ Timer* timer_ = Default();
+
+ // Global task id, protected by the global `timer_mutex`
+ inline static uint64_t id_;
+
+ static constexpr uint64_t kMicrosInSecond = 1000U * 1000U;
+};
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/db/periodic_task_scheduler_test.cc b/src/rocksdb/db/periodic_task_scheduler_test.cc
new file mode 100644
index 000000000..4abea4d5e
--- /dev/null
+++ b/src/rocksdb/db/periodic_task_scheduler_test.cc
@@ -0,0 +1,231 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/periodic_task_scheduler.h"
+
+#include "db/db_test_util.h"
+#include "env/composite_env_wrapper.h"
+#include "test_util/mock_time_env.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#ifndef ROCKSDB_LITE
+class PeriodicTaskSchedulerTest : public DBTestBase {
+ public:
+ PeriodicTaskSchedulerTest()
+ : DBTestBase("periodic_task_scheduler_test", /*env_do_fsync=*/true) {
+ mock_clock_ = std::make_shared<MockSystemClock>(env_->GetSystemClock());
+ mock_env_.reset(new CompositeEnvWrapper(env_, mock_clock_));
+ }
+
+ protected:
+ std::unique_ptr<Env> mock_env_;
+ std::shared_ptr<MockSystemClock> mock_clock_;
+
+ void SetUp() override {
+ mock_clock_->InstallTimedWaitFixCallback();
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::StartPeriodicTaskScheduler:Init", [&](void* arg) {
+ auto periodic_task_scheduler_ptr =
+ reinterpret_cast<PeriodicTaskScheduler*>(arg);
+ periodic_task_scheduler_ptr->TEST_OverrideTimer(mock_clock_.get());
+ });
+ }
+};
+
+TEST_F(PeriodicTaskSchedulerTest, Basic) {
+ constexpr unsigned int kPeriodSec = 10;
+ Close();
+ Options options;
+ options.stats_dump_period_sec = kPeriodSec;
+ options.stats_persist_period_sec = kPeriodSec;
+ options.create_if_missing = true;
+ options.env = mock_env_.get();
+
+ int dump_st_counter = 0;
+ SyncPoint::GetInstance()->SetCallBack("DBImpl::DumpStats:StartRunning",
+ [&](void*) { dump_st_counter++; });
+
+ int pst_st_counter = 0;
+ SyncPoint::GetInstance()->SetCallBack("DBImpl::PersistStats:StartRunning",
+ [&](void*) { pst_st_counter++; });
+
+ int flush_info_log_counter = 0;
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::FlushInfoLog:StartRunning",
+ [&](void*) { flush_info_log_counter++; });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ Reopen(options);
+
+ ASSERT_EQ(kPeriodSec, dbfull()->GetDBOptions().stats_dump_period_sec);
+ ASSERT_EQ(kPeriodSec, dbfull()->GetDBOptions().stats_persist_period_sec);
+
+ ASSERT_GT(kPeriodSec, 1u);
+ dbfull()->TEST_WaitForPeridicTaskRun([&] {
+ mock_clock_->MockSleepForSeconds(static_cast<int>(kPeriodSec) - 1);
+ });
+
+ const PeriodicTaskScheduler& scheduler =
+ dbfull()->TEST_GetPeriodicTaskScheduler();
+ ASSERT_EQ(3, scheduler.TEST_GetValidTaskNum());
+
+ ASSERT_EQ(1, dump_st_counter);
+ ASSERT_EQ(1, pst_st_counter);
+ ASSERT_EQ(1, flush_info_log_counter);
+
+ dbfull()->TEST_WaitForPeridicTaskRun(
+ [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(kPeriodSec)); });
+
+ ASSERT_EQ(2, dump_st_counter);
+ ASSERT_EQ(2, pst_st_counter);
+ ASSERT_EQ(2, flush_info_log_counter);
+
+ dbfull()->TEST_WaitForPeridicTaskRun(
+ [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(kPeriodSec)); });
+
+ ASSERT_EQ(3, dump_st_counter);
+ ASSERT_EQ(3, pst_st_counter);
+ ASSERT_EQ(3, flush_info_log_counter);
+
+ // Disable scheduler with SetOption
+ ASSERT_OK(dbfull()->SetDBOptions(
+ {{"stats_dump_period_sec", "0"}, {"stats_persist_period_sec", "0"}}));
+ ASSERT_EQ(0u, dbfull()->GetDBOptions().stats_dump_period_sec);
+ ASSERT_EQ(0u, dbfull()->GetDBOptions().stats_persist_period_sec);
+
+ // Info log flush should still run.
+ dbfull()->TEST_WaitForPeridicTaskRun(
+ [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(kPeriodSec)); });
+ ASSERT_EQ(3, dump_st_counter);
+ ASSERT_EQ(3, pst_st_counter);
+ ASSERT_EQ(4, flush_info_log_counter);
+
+ ASSERT_EQ(1u, scheduler.TEST_GetValidTaskNum());
+
+ // Re-enable one task
+ ASSERT_OK(dbfull()->SetDBOptions({{"stats_dump_period_sec", "5"}}));
+ ASSERT_EQ(5u, dbfull()->GetDBOptions().stats_dump_period_sec);
+ ASSERT_EQ(0u, dbfull()->GetDBOptions().stats_persist_period_sec);
+
+ ASSERT_EQ(2, scheduler.TEST_GetValidTaskNum());
+
+ dbfull()->TEST_WaitForPeridicTaskRun(
+ [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(kPeriodSec)); });
+ ASSERT_EQ(4, dump_st_counter);
+ ASSERT_EQ(3, pst_st_counter);
+ ASSERT_EQ(5, flush_info_log_counter);
+
+ Close();
+}
+
+TEST_F(PeriodicTaskSchedulerTest, MultiInstances) {
+ constexpr int kPeriodSec = 5;
+ const int kInstanceNum = 10;
+
+ Close();
+ Options options;
+ options.stats_dump_period_sec = kPeriodSec;
+ options.stats_persist_period_sec = kPeriodSec;
+ options.create_if_missing = true;
+ options.env = mock_env_.get();
+
+ int dump_st_counter = 0;
+ SyncPoint::GetInstance()->SetCallBack("DBImpl::DumpStats:2",
+ [&](void*) { dump_st_counter++; });
+
+ int pst_st_counter = 0;
+ SyncPoint::GetInstance()->SetCallBack("DBImpl::PersistStats:StartRunning",
+ [&](void*) { pst_st_counter++; });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ auto dbs = std::vector<DB*>(kInstanceNum);
+ for (int i = 0; i < kInstanceNum; i++) {
+ ASSERT_OK(
+ DB::Open(options, test::PerThreadDBPath(std::to_string(i)), &(dbs[i])));
+ }
+
+ auto dbi = static_cast_with_check<DBImpl>(dbs[kInstanceNum - 1]);
+
+ const PeriodicTaskScheduler& scheduler = dbi->TEST_GetPeriodicTaskScheduler();
+ ASSERT_EQ(kInstanceNum * 3, scheduler.TEST_GetValidTaskNum());
+
+ int expected_run = kInstanceNum;
+ dbi->TEST_WaitForPeridicTaskRun(
+ [&] { mock_clock_->MockSleepForSeconds(kPeriodSec - 1); });
+ ASSERT_EQ(expected_run, dump_st_counter);
+ ASSERT_EQ(expected_run, pst_st_counter);
+
+ expected_run += kInstanceNum;
+ dbi->TEST_WaitForPeridicTaskRun(
+ [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); });
+ ASSERT_EQ(expected_run, dump_st_counter);
+ ASSERT_EQ(expected_run, pst_st_counter);
+
+ expected_run += kInstanceNum;
+ dbi->TEST_WaitForPeridicTaskRun(
+ [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); });
+ ASSERT_EQ(expected_run, dump_st_counter);
+ ASSERT_EQ(expected_run, pst_st_counter);
+
+ int half = kInstanceNum / 2;
+ for (int i = 0; i < half; i++) {
+ delete dbs[i];
+ }
+
+ expected_run += (kInstanceNum - half) * 2;
+
+ dbi->TEST_WaitForPeridicTaskRun(
+ [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); });
+ dbi->TEST_WaitForPeridicTaskRun(
+ [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); });
+ ASSERT_EQ(expected_run, dump_st_counter);
+ ASSERT_EQ(expected_run, pst_st_counter);
+
+ for (int i = half; i < kInstanceNum; i++) {
+ ASSERT_OK(dbs[i]->Close());
+ delete dbs[i];
+ }
+}
+
+TEST_F(PeriodicTaskSchedulerTest, MultiEnv) {
+ constexpr int kDumpPeriodSec = 5;
+ constexpr int kPersistPeriodSec = 10;
+ Close();
+ Options options1;
+ options1.stats_dump_period_sec = kDumpPeriodSec;
+ options1.stats_persist_period_sec = kPersistPeriodSec;
+ options1.create_if_missing = true;
+ options1.env = mock_env_.get();
+
+ Reopen(options1);
+
+ std::unique_ptr<Env> mock_env2(
+ new CompositeEnvWrapper(Env::Default(), mock_clock_));
+ Options options2;
+ options2.stats_dump_period_sec = kDumpPeriodSec;
+ options2.stats_persist_period_sec = kPersistPeriodSec;
+ options2.create_if_missing = true;
+  options2.env = mock_env2.get();
+
+ std::string dbname = test::PerThreadDBPath("multi_env_test");
+ DB* db;
+ ASSERT_OK(DB::Open(options2, dbname, &db));
+
+ ASSERT_OK(db->Close());
+ delete db;
+ Close();
+}
+
+#endif // !ROCKSDB_LITE
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/pinned_iterators_manager.h b/src/rocksdb/db/pinned_iterators_manager.h
new file mode 100644
index 000000000..0fcf231da
--- /dev/null
+++ b/src/rocksdb/db/pinned_iterators_manager.h
@@ -0,0 +1,92 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#pragma once
+#include <algorithm>
+#include <memory>
+#include <utility>
+#include <vector>
+
+#include "table/internal_iterator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// PinnedIteratorsManager will be notified whenever we need to pin an Iterator
+// and it will be responsible for deleting pinned Iterators when they are
+// not needed anymore.
+class PinnedIteratorsManager : public Cleanable {
+ public:
+ PinnedIteratorsManager() : pinning_enabled(false) {}
+ ~PinnedIteratorsManager() {
+ if (pinning_enabled) {
+ ReleasePinnedData();
+ }
+ }
+
+ // Move constructor and move assignment are allowed.
+ PinnedIteratorsManager(PinnedIteratorsManager&& other) noexcept = default;
+ PinnedIteratorsManager& operator=(PinnedIteratorsManager&& other) noexcept =
+ default;
+
+ // Enable Iterators pinning
+ void StartPinning() {
+ assert(pinning_enabled == false);
+ pinning_enabled = true;
+ }
+
+ // Is pinning enabled?
+ bool PinningEnabled() { return pinning_enabled; }
+
+ // Take ownership of iter and delete it when ReleasePinnedData() is called
+ void PinIterator(InternalIterator* iter, bool arena = false) {
+ if (arena) {
+ PinPtr(iter, &PinnedIteratorsManager::ReleaseArenaInternalIterator);
+ } else {
+ PinPtr(iter, &PinnedIteratorsManager::ReleaseInternalIterator);
+ }
+ }
+
+ using ReleaseFunction = void (*)(void* arg1);
+ void PinPtr(void* ptr, ReleaseFunction release_func) {
+ assert(pinning_enabled);
+ if (ptr == nullptr) {
+ return;
+ }
+ pinned_ptrs_.emplace_back(ptr, release_func);
+ }
+
+ // Release pinned Iterators
+ inline void ReleasePinnedData() {
+ assert(pinning_enabled == true);
+ pinning_enabled = false;
+
+ // Remove duplicate pointers
+ std::sort(pinned_ptrs_.begin(), pinned_ptrs_.end());
+ auto unique_end = std::unique(pinned_ptrs_.begin(), pinned_ptrs_.end());
+
+ for (auto i = pinned_ptrs_.begin(); i != unique_end; ++i) {
+ void* ptr = i->first;
+ ReleaseFunction release_func = i->second;
+ release_func(ptr);
+ }
+ pinned_ptrs_.clear();
+ // Also do cleanups from the base Cleanable
+ Cleanable::Reset();
+ }
+
+ private:
+ static void ReleaseInternalIterator(void* ptr) {
+ delete reinterpret_cast<InternalIterator*>(ptr);
+ }
+
+ static void ReleaseArenaInternalIterator(void* ptr) {
+ reinterpret_cast<InternalIterator*>(ptr)->~InternalIterator();
+ }
+
+ bool pinning_enabled;
+ std::vector<std::pair<void*, ReleaseFunction>> pinned_ptrs_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
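A minimal usage sketch for the class above (hypothetical caller PinExample; the real call sites such as DBIter are outside this patch): pinning must be enabled before pointers are handed over, heap-allocated iterators are deleted on release while arena-placed ones only have their destructor run, and ReleasePinnedData() deduplicates pointers so each one is released exactly once.

    #include "db/pinned_iterators_manager.h"
    #include "table/internal_iterator.h"

    namespace ROCKSDB_NAMESPACE {

    // Pin a heap-allocated iterator: ownership transfers to the manager and
    // the iterator is deleted when ReleasePinnedData() runs (or in the
    // manager's destructor if pinning is still enabled at that point).
    void PinExample(InternalIterator* heap_iter) {
      PinnedIteratorsManager pin_mgr;
      pin_mgr.StartPinning();
      pin_mgr.PinIterator(heap_iter, /*arena=*/false);
      // ... keep using Slices that point into heap_iter's pinned data ...
      pin_mgr.ReleasePinnedData();  // heap_iter is deleted here, exactly once
    }

    }  // namespace ROCKSDB_NAMESPACE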
diff --git a/src/rocksdb/db/plain_table_db_test.cc b/src/rocksdb/db/plain_table_db_test.cc
new file mode 100644
index 000000000..755b639b0
--- /dev/null
+++ b/src/rocksdb/db/plain_table_db_test.cc
@@ -0,0 +1,1357 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef ROCKSDB_LITE
+
+#include <algorithm>
+#include <set>
+
+#include "db/db_impl/db_impl.h"
+#include "db/version_set.h"
+#include "db/write_batch_internal.h"
+#include "file/filename.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/table.h"
+#include "table/meta_blocks.h"
+#include "table/plain/plain_table_bloom.h"
+#include "table/plain/plain_table_factory.h"
+#include "table/plain/plain_table_key_coding.h"
+#include "table/plain/plain_table_reader.h"
+#include "table/table_builder.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/cast_util.h"
+#include "util/hash.h"
+#include "util/mutexlock.h"
+#include "util/random.h"
+#include "util/string_util.h"
+#include "utilities/merge_operators.h"
+
+namespace ROCKSDB_NAMESPACE {
+class PlainTableKeyDecoderTest : public testing::Test {};
+
+TEST_F(PlainTableKeyDecoderTest, ReadNonMmap) {
+ Random rnd(301);
+ const uint32_t kLength = 2222;
+ std::string tmp = rnd.RandomString(kLength);
+ Slice contents(tmp);
+ test::StringSource* string_source =
+ new test::StringSource(contents, 0, false);
+ std::unique_ptr<FSRandomAccessFile> holder(string_source);
+ std::unique_ptr<RandomAccessFileReader> file_reader(
+ new RandomAccessFileReader(std::move(holder), "test"));
+ std::unique_ptr<PlainTableReaderFileInfo> file_info(
+ new PlainTableReaderFileInfo(std::move(file_reader), EnvOptions(),
+ kLength));
+
+ {
+ PlainTableFileReader reader(file_info.get());
+
+ const uint32_t kReadSize = 77;
+ for (uint32_t pos = 0; pos < kLength; pos += kReadSize) {
+ uint32_t read_size = std::min(kLength - pos, kReadSize);
+ Slice out;
+ ASSERT_TRUE(reader.Read(pos, read_size, &out));
+ ASSERT_EQ(0, out.compare(tmp.substr(pos, read_size)));
+ }
+
+ ASSERT_LT(uint32_t(string_source->total_reads()), kLength / kReadSize / 2);
+ }
+
+ std::vector<std::vector<std::pair<uint32_t, uint32_t>>> reads = {
+ {{600, 30}, {590, 30}, {600, 20}, {600, 40}},
+ {{800, 20}, {100, 20}, {500, 20}, {1500, 20}, {100, 20}, {80, 20}},
+ {{1000, 20}, {500, 20}, {1000, 50}},
+ {{1000, 20}, {500, 20}, {500, 20}},
+ {{1000, 20}, {500, 20}, {200, 20}, {500, 20}},
+ {{1000, 20}, {500, 20}, {200, 20}, {1000, 50}},
+ {{600, 500}, {610, 20}, {100, 20}},
+ {{500, 100}, {490, 100}, {550, 50}},
+ };
+
+ std::vector<int> num_file_reads = {2, 6, 2, 2, 4, 3, 2, 2};
+
+ for (size_t i = 0; i < reads.size(); i++) {
+ string_source->set_total_reads(0);
+ PlainTableFileReader reader(file_info.get());
+ for (auto p : reads[i]) {
+ Slice out;
+ ASSERT_TRUE(reader.Read(p.first, p.second, &out));
+ ASSERT_EQ(0, out.compare(tmp.substr(p.first, p.second)));
+ }
+ ASSERT_EQ(num_file_reads[i], string_source->total_reads());
+ }
+}
+
+class PlainTableDBTest : public testing::Test,
+ public testing::WithParamInterface<bool> {
+ protected:
+ private:
+ std::string dbname_;
+ Env* env_;
+ DB* db_;
+
+ bool mmap_mode_;
+ Options last_options_;
+
+ public:
+ PlainTableDBTest() : env_(Env::Default()) {}
+
+ ~PlainTableDBTest() override {
+ delete db_;
+ EXPECT_OK(DestroyDB(dbname_, Options()));
+ }
+
+ void SetUp() override {
+ mmap_mode_ = GetParam();
+ dbname_ = test::PerThreadDBPath("plain_table_db_test");
+ EXPECT_OK(DestroyDB(dbname_, Options()));
+ db_ = nullptr;
+ Reopen();
+ }
+
+ // Return the current option configuration.
+ Options CurrentOptions() {
+ Options options;
+
+ PlainTableOptions plain_table_options;
+ plain_table_options.user_key_len = 0;
+ plain_table_options.bloom_bits_per_key = 2;
+ plain_table_options.hash_table_ratio = 0.8;
+ plain_table_options.index_sparseness = 3;
+ plain_table_options.huge_page_tlb_size = 0;
+ plain_table_options.encoding_type = kPrefix;
+ plain_table_options.full_scan_mode = false;
+ plain_table_options.store_index_in_file = false;
+
+ options.table_factory.reset(NewPlainTableFactory(plain_table_options));
+ options.memtable_factory.reset(NewHashLinkListRepFactory(4, 0, 3, true));
+
+ options.prefix_extractor.reset(NewFixedPrefixTransform(8));
+ options.allow_mmap_reads = mmap_mode_;
+ options.allow_concurrent_memtable_write = false;
+ options.unordered_write = false;
+ return options;
+ }
+
+ DBImpl* dbfull() { return static_cast_with_check<DBImpl>(db_); }
+
+ void Reopen(Options* options = nullptr) { ASSERT_OK(TryReopen(options)); }
+
+ void Close() {
+ delete db_;
+ db_ = nullptr;
+ }
+
+ bool mmap_mode() const { return mmap_mode_; }
+
+ void DestroyAndReopen(Options* options = nullptr) {
+ // Destroy using last options
+ Destroy(&last_options_);
+ ASSERT_OK(TryReopen(options));
+ }
+
+ void Destroy(Options* options) {
+ delete db_;
+ db_ = nullptr;
+ ASSERT_OK(DestroyDB(dbname_, *options));
+ }
+
+ Status PureReopen(Options* options, DB** db) {
+ return DB::Open(*options, dbname_, db);
+ }
+
+ Status ReopenForReadOnly(Options* options) {
+ delete db_;
+ db_ = nullptr;
+ return DB::OpenForReadOnly(*options, dbname_, &db_);
+ }
+
+ Status TryReopen(Options* options = nullptr) {
+ delete db_;
+ db_ = nullptr;
+ Options opts;
+ if (options != nullptr) {
+ opts = *options;
+ } else {
+ opts = CurrentOptions();
+ opts.create_if_missing = true;
+ }
+ last_options_ = opts;
+
+ return DB::Open(opts, dbname_, &db_);
+ }
+
+ Status Put(const Slice& k, const Slice& v) {
+ return db_->Put(WriteOptions(), k, v);
+ }
+
+ Status Delete(const std::string& k) { return db_->Delete(WriteOptions(), k); }
+
+ std::string Get(const std::string& k, const Snapshot* snapshot = nullptr) {
+ ReadOptions options;
+ options.snapshot = snapshot;
+ std::string result;
+ Status s = db_->Get(options, k, &result);
+ if (s.IsNotFound()) {
+ result = "NOT_FOUND";
+ } else if (!s.ok()) {
+ result = s.ToString();
+ }
+ return result;
+ }
+
+ int NumTableFilesAtLevel(int level) {
+ std::string property;
+ EXPECT_TRUE(db_->GetProperty(
+ "rocksdb.num-files-at-level" + std::to_string(level), &property));
+ return atoi(property.c_str());
+ }
+
+ // Return spread of files per level
+ std::string FilesPerLevel() {
+ std::string result;
+ size_t last_non_zero_offset = 0;
+ for (int level = 0; level < db_->NumberLevels(); level++) {
+ int f = NumTableFilesAtLevel(level);
+ char buf[100];
+ snprintf(buf, sizeof(buf), "%s%d", (level ? "," : ""), f);
+ result += buf;
+ if (f > 0) {
+ last_non_zero_offset = result.size();
+ }
+ }
+ result.resize(last_non_zero_offset);
+ return result;
+ }
+
+ std::string IterStatus(Iterator* iter) {
+ std::string result;
+ if (iter->Valid()) {
+ result = iter->key().ToString() + "->" + iter->value().ToString();
+ } else {
+ result = "(invalid)";
+ }
+ return result;
+ }
+};
+
+TEST_P(PlainTableDBTest, Empty) {
+ ASSERT_TRUE(dbfull() != nullptr);
+ ASSERT_EQ("NOT_FOUND", Get("0000000000000foo"));
+}
+
+extern const uint64_t kPlainTableMagicNumber;
+
+class TestPlainTableReader : public PlainTableReader {
+ public:
+ TestPlainTableReader(
+ const EnvOptions& env_options, const InternalKeyComparator& icomparator,
+ EncodingType encoding_type, uint64_t file_size, int bloom_bits_per_key,
+ double hash_table_ratio, size_t index_sparseness,
+ std::unique_ptr<TableProperties>&& props,
+ std::unique_ptr<RandomAccessFileReader>&& file,
+ const ImmutableOptions& ioptions, const SliceTransform* prefix_extractor,
+ bool* expect_bloom_not_match, bool store_index_in_file,
+ uint32_t column_family_id, const std::string& column_family_name)
+ : PlainTableReader(ioptions, std::move(file), env_options, icomparator,
+ encoding_type, file_size, props.get(),
+ prefix_extractor),
+ expect_bloom_not_match_(expect_bloom_not_match) {
+ Status s = MmapDataIfNeeded();
+ EXPECT_TRUE(s.ok());
+
+ s = PopulateIndex(props.get(), bloom_bits_per_key, hash_table_ratio,
+ index_sparseness, 2 * 1024 * 1024);
+ EXPECT_TRUE(s.ok());
+
+ EXPECT_EQ(column_family_id, static_cast<uint32_t>(props->column_family_id));
+ EXPECT_EQ(column_family_name, props->column_family_name);
+ if (store_index_in_file) {
+ auto bloom_version_ptr = props->user_collected_properties.find(
+ PlainTablePropertyNames::kBloomVersion);
+ EXPECT_TRUE(bloom_version_ptr != props->user_collected_properties.end());
+ EXPECT_EQ(bloom_version_ptr->second, std::string("1"));
+ if (ioptions.bloom_locality > 0) {
+ auto num_blocks_ptr = props->user_collected_properties.find(
+ PlainTablePropertyNames::kNumBloomBlocks);
+ EXPECT_TRUE(num_blocks_ptr != props->user_collected_properties.end());
+ }
+ }
+ table_properties_ = std::move(props);
+ }
+
+ ~TestPlainTableReader() override {}
+
+ private:
+ bool MatchBloom(uint32_t hash) const override {
+ bool ret = PlainTableReader::MatchBloom(hash);
+ if (*expect_bloom_not_match_) {
+ EXPECT_TRUE(!ret);
+ } else {
+ EXPECT_TRUE(ret);
+ }
+ return ret;
+ }
+ bool* expect_bloom_not_match_;
+};
+
+extern const uint64_t kPlainTableMagicNumber;
+class TestPlainTableFactory : public PlainTableFactory {
+ public:
+ explicit TestPlainTableFactory(bool* expect_bloom_not_match,
+ const PlainTableOptions& options,
+ uint32_t column_family_id,
+ std::string column_family_name)
+ : PlainTableFactory(options),
+ bloom_bits_per_key_(options.bloom_bits_per_key),
+ hash_table_ratio_(options.hash_table_ratio),
+ index_sparseness_(options.index_sparseness),
+ store_index_in_file_(options.store_index_in_file),
+ expect_bloom_not_match_(expect_bloom_not_match),
+ column_family_id_(column_family_id),
+ column_family_name_(std::move(column_family_name)) {}
+
+ using PlainTableFactory::NewTableReader;
+ Status NewTableReader(
+ const ReadOptions& /*ro*/, const TableReaderOptions& table_reader_options,
+ std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
+ std::unique_ptr<TableReader>* table,
+ bool /*prefetch_index_and_filter_in_cache*/) const override {
+ std::unique_ptr<TableProperties> props;
+ auto s = ReadTableProperties(file.get(), file_size, kPlainTableMagicNumber,
+ table_reader_options.ioptions, &props);
+ EXPECT_TRUE(s.ok());
+
+ if (store_index_in_file_) {
+ BlockHandle bloom_block_handle;
+ s = FindMetaBlockInFile(file.get(), file_size, kPlainTableMagicNumber,
+ table_reader_options.ioptions,
+ BloomBlockBuilder::kBloomBlock,
+ &bloom_block_handle);
+ EXPECT_TRUE(s.ok());
+
+ BlockHandle index_block_handle;
+ s = FindMetaBlockInFile(file.get(), file_size, kPlainTableMagicNumber,
+ table_reader_options.ioptions,
+ PlainTableIndexBuilder::kPlainTableIndexBlock,
+ &index_block_handle);
+ EXPECT_TRUE(s.ok());
+ }
+
+ auto& user_props = props->user_collected_properties;
+ auto encoding_type_prop =
+ user_props.find(PlainTablePropertyNames::kEncodingType);
+ assert(encoding_type_prop != user_props.end());
+ EncodingType encoding_type = static_cast<EncodingType>(
+ DecodeFixed32(encoding_type_prop->second.c_str()));
+
+ std::unique_ptr<PlainTableReader> new_reader(new TestPlainTableReader(
+ table_reader_options.env_options,
+ table_reader_options.internal_comparator, encoding_type, file_size,
+ bloom_bits_per_key_, hash_table_ratio_, index_sparseness_,
+ std::move(props), std::move(file), table_reader_options.ioptions,
+ table_reader_options.prefix_extractor.get(), expect_bloom_not_match_,
+ store_index_in_file_, column_family_id_, column_family_name_));
+
+ *table = std::move(new_reader);
+ return s;
+ }
+
+ private:
+ int bloom_bits_per_key_;
+ double hash_table_ratio_;
+ size_t index_sparseness_;
+ bool store_index_in_file_;
+ bool* expect_bloom_not_match_;
+ const uint32_t column_family_id_;
+ const std::string column_family_name_;
+};
+
+TEST_P(PlainTableDBTest, BadOptions1) {
+ // Build with a prefix extractor
+ ASSERT_OK(Put("1000000000000foo", "v1"));
+ ASSERT_OK(dbfull()->TEST_FlushMemTable());
+
+ // Bad attempt to re-open without a prefix extractor
+ Options options = CurrentOptions();
+ options.prefix_extractor.reset();
+ ASSERT_EQ(
+ "Invalid argument: Prefix extractor is missing when opening a PlainTable "
+ "built using a prefix extractor",
+ TryReopen(&options).ToString());
+
+ // Bad attempt to re-open with different prefix extractor
+ options.prefix_extractor.reset(NewFixedPrefixTransform(6));
+ ASSERT_EQ(
+ "Invalid argument: Prefix extractor given doesn't match the one used to "
+ "build PlainTable",
+ TryReopen(&options).ToString());
+
+ // Correct prefix extractor
+ options.prefix_extractor.reset(NewFixedPrefixTransform(8));
+ Reopen(&options);
+ ASSERT_EQ("v1", Get("1000000000000foo"));
+}
+
+TEST_P(PlainTableDBTest, BadOptions2) {
+ Options options = CurrentOptions();
+ options.prefix_extractor.reset();
+ options.create_if_missing = true;
+ DestroyAndReopen(&options);
+ // Build without a prefix extractor
+ // (apparently works even if hash_table_ratio > 0)
+ ASSERT_OK(Put("1000000000000foo", "v1"));
+ // Building the table without a prefix extractor fails; this call returns
+ // the status of that bad attempt.
+ ASSERT_NOK(dbfull()->TEST_FlushMemTable());
+
+ // Bad attempt to re-open with hash_table_ratio > 0 and no prefix extractor
+ Status s = TryReopen(&options);
+ ASSERT_EQ(
+ "Not implemented: PlainTable requires a prefix extractor enable prefix "
+ "hash mode.",
+ s.ToString());
+
+ // OK to open with hash_table_ratio == 0 and no prefix extractor
+ PlainTableOptions plain_table_options;
+ plain_table_options.hash_table_ratio = 0;
+ options.table_factory.reset(NewPlainTableFactory(plain_table_options));
+ Reopen(&options);
+ ASSERT_EQ("v1", Get("1000000000000foo"));
+
+ // OK to open newly with a prefix_extractor and hash table; builds index
+ // in memory.
+ options = CurrentOptions();
+ Reopen(&options);
+ ASSERT_EQ("v1", Get("1000000000000foo"));
+}
+
+TEST_P(PlainTableDBTest, Flush) {
+ for (size_t huge_page_tlb_size = 0; huge_page_tlb_size <= 2 * 1024 * 1024;
+ huge_page_tlb_size += 2 * 1024 * 1024) {
+ for (EncodingType encoding_type : {kPlain, kPrefix}) {
+ for (int bloom = -1; bloom <= 117; bloom += 117) {
+ const int bloom_bits = std::max(bloom, 0);
+ const bool full_scan_mode = bloom < 0;
+ for (int total_order = 0; total_order <= 1; total_order++) {
+ for (int store_index_in_file = 0; store_index_in_file <= 1;
+ ++store_index_in_file) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ // Set only one bucket to force bucket conflict.
+ // Test index interval for the same prefix to be 1, 2 and 4
+ if (total_order) {
+ options.prefix_extractor.reset();
+
+ PlainTableOptions plain_table_options;
+ plain_table_options.user_key_len = 0;
+ plain_table_options.bloom_bits_per_key = bloom_bits;
+ plain_table_options.hash_table_ratio = 0;
+ plain_table_options.index_sparseness = 2;
+ plain_table_options.huge_page_tlb_size = huge_page_tlb_size;
+ plain_table_options.encoding_type = encoding_type;
+ plain_table_options.full_scan_mode = full_scan_mode;
+ plain_table_options.store_index_in_file = store_index_in_file;
+
+ options.table_factory.reset(
+ NewPlainTableFactory(plain_table_options));
+ } else {
+ PlainTableOptions plain_table_options;
+ plain_table_options.user_key_len = 0;
+ plain_table_options.bloom_bits_per_key = bloom_bits;
+ plain_table_options.hash_table_ratio = 0.75;
+ plain_table_options.index_sparseness = 16;
+ plain_table_options.huge_page_tlb_size = huge_page_tlb_size;
+ plain_table_options.encoding_type = encoding_type;
+ plain_table_options.full_scan_mode = full_scan_mode;
+ plain_table_options.store_index_in_file = store_index_in_file;
+
+ options.table_factory.reset(
+ NewPlainTableFactory(plain_table_options));
+ }
+ DestroyAndReopen(&options);
+ uint64_t int_num;
+ ASSERT_TRUE(dbfull()->GetIntProperty(
+ "rocksdb.estimate-table-readers-mem", &int_num));
+ ASSERT_EQ(int_num, 0U);
+
+ ASSERT_OK(Put("1000000000000foo", "v1"));
+ ASSERT_OK(Put("0000000000000bar", "v2"));
+ ASSERT_OK(Put("1000000000000foo", "v3"));
+ ASSERT_OK(dbfull()->TEST_FlushMemTable());
+
+ ASSERT_TRUE(dbfull()->GetIntProperty(
+ "rocksdb.estimate-table-readers-mem", &int_num));
+ ASSERT_GT(int_num, 0U);
+
+ TablePropertiesCollection ptc;
+ ASSERT_OK(reinterpret_cast<DB*>(dbfull())->GetPropertiesOfAllTables(
+ &ptc));
+ ASSERT_EQ(1U, ptc.size());
+ auto row = ptc.begin();
+ auto tp = row->second;
+
+ if (full_scan_mode) {
+ // Does not support Get/Seek
+ std::unique_ptr<Iterator> iter(
+ dbfull()->NewIterator(ReadOptions()));
+ iter->SeekToFirst();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("0000000000000bar", iter->key().ToString());
+ ASSERT_EQ("v2", iter->value().ToString());
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("1000000000000foo", iter->key().ToString());
+ ASSERT_EQ("v3", iter->value().ToString());
+ iter->Next();
+ ASSERT_TRUE(!iter->Valid());
+ ASSERT_TRUE(iter->status().ok());
+ } else {
+ if (!store_index_in_file) {
+ ASSERT_EQ(total_order ? "4" : "12",
+ (tp->user_collected_properties)
+ .at("plain_table_hash_table_size"));
+ ASSERT_EQ("0", (tp->user_collected_properties)
+ .at("plain_table_sub_index_size"));
+ } else {
+ ASSERT_EQ("0", (tp->user_collected_properties)
+ .at("plain_table_hash_table_size"));
+ ASSERT_EQ("0", (tp->user_collected_properties)
+ .at("plain_table_sub_index_size"));
+ }
+ ASSERT_EQ("v3", Get("1000000000000foo"));
+ ASSERT_EQ("v2", Get("0000000000000bar"));
+ }
+ }
+ }
+ }
+ }
+ }
+}
+
+TEST_P(PlainTableDBTest, Flush2) {
+ for (size_t huge_page_tlb_size = 0; huge_page_tlb_size <= 2 * 1024 * 1024;
+ huge_page_tlb_size += 2 * 1024 * 1024) {
+ for (EncodingType encoding_type : {kPlain, kPrefix}) {
+ for (int bloom_bits = 0; bloom_bits <= 117; bloom_bits += 117) {
+ for (int total_order = 0; total_order <= 1; total_order++) {
+ for (int store_index_in_file = 0; store_index_in_file <= 1;
+ ++store_index_in_file) {
+ if (encoding_type == kPrefix && total_order) {
+ continue;
+ }
+ if (!bloom_bits && store_index_in_file) {
+ continue;
+ }
+ if (total_order && store_index_in_file) {
+ continue;
+ }
+ bool expect_bloom_not_match = false;
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ // Set only one bucket to force bucket conflict.
+ // Test index interval for the same prefix to be 1, 2 and 4
+ PlainTableOptions plain_table_options;
+ if (total_order) {
+ options.prefix_extractor = nullptr;
+ plain_table_options.hash_table_ratio = 0;
+ plain_table_options.index_sparseness = 2;
+ } else {
+ plain_table_options.hash_table_ratio = 0.75;
+ plain_table_options.index_sparseness = 16;
+ }
+ plain_table_options.user_key_len = kPlainTableVariableLength;
+ plain_table_options.bloom_bits_per_key = bloom_bits;
+ plain_table_options.huge_page_tlb_size = huge_page_tlb_size;
+ plain_table_options.encoding_type = encoding_type;
+ plain_table_options.store_index_in_file = store_index_in_file;
+ options.table_factory.reset(new TestPlainTableFactory(
+ &expect_bloom_not_match, plain_table_options,
+ 0 /* column_family_id */, kDefaultColumnFamilyName));
+
+ DestroyAndReopen(&options);
+ ASSERT_OK(Put("0000000000000bar", "b"));
+ ASSERT_OK(Put("1000000000000foo", "v1"));
+ ASSERT_OK(dbfull()->TEST_FlushMemTable());
+
+ ASSERT_OK(Put("1000000000000foo", "v2"));
+ ASSERT_OK(dbfull()->TEST_FlushMemTable());
+ ASSERT_EQ("v2", Get("1000000000000foo"));
+
+ ASSERT_OK(Put("0000000000000eee", "v3"));
+ ASSERT_OK(dbfull()->TEST_FlushMemTable());
+ ASSERT_EQ("v3", Get("0000000000000eee"));
+
+ ASSERT_OK(Delete("0000000000000bar"));
+ ASSERT_OK(dbfull()->TEST_FlushMemTable());
+ ASSERT_EQ("NOT_FOUND", Get("0000000000000bar"));
+
+ ASSERT_OK(Put("0000000000000eee", "v5"));
+ ASSERT_OK(Put("9000000000000eee", "v5"));
+ ASSERT_OK(dbfull()->TEST_FlushMemTable());
+ ASSERT_EQ("v5", Get("0000000000000eee"));
+
+ // Test Bloom Filter
+ if (bloom_bits > 0) {
+ // Neither the key nor its prefix exists.
+ expect_bloom_not_match = true;
+ ASSERT_EQ("NOT_FOUND", Get("5_not00000000bar"));
+ // Key doesn't exist any more but prefix exists.
+ if (total_order) {
+ ASSERT_EQ("NOT_FOUND", Get("1000000000000not"));
+ ASSERT_EQ("NOT_FOUND", Get("0000000000000not"));
+ }
+ expect_bloom_not_match = false;
+ }
+ }
+ }
+ }
+ }
+ }
+}
+
+TEST_P(PlainTableDBTest, Immortal) {
+ for (EncodingType encoding_type : {kPlain, kPrefix}) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.max_open_files = -1;
+ // Set only one bucket to force bucket conflict.
+ // Test index interval for the same prefix to be 1, 2 and 4
+ PlainTableOptions plain_table_options;
+ plain_table_options.hash_table_ratio = 0.75;
+ plain_table_options.index_sparseness = 16;
+ plain_table_options.user_key_len = kPlainTableVariableLength;
+ plain_table_options.bloom_bits_per_key = 10;
+ plain_table_options.encoding_type = encoding_type;
+ options.table_factory.reset(NewPlainTableFactory(plain_table_options));
+
+ DestroyAndReopen(&options);
+ ASSERT_OK(Put("0000000000000bar", "b"));
+ ASSERT_OK(Put("1000000000000foo", "v1"));
+ ASSERT_OK(dbfull()->TEST_FlushMemTable());
+
+ int copied = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "GetContext::SaveValue::PinSelf", [&](void* /*arg*/) { copied++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ ASSERT_EQ("b", Get("0000000000000bar"));
+ ASSERT_EQ("v1", Get("1000000000000foo"));
+ ASSERT_EQ(2, copied);
+ copied = 0;
+
+ Close();
+ ASSERT_OK(ReopenForReadOnly(&options));
+
+ ASSERT_EQ("b", Get("0000000000000bar"));
+ ASSERT_EQ("v1", Get("1000000000000foo"));
+ ASSERT_EQ("NOT_FOUND", Get("1000000000000bar"));
+ if (mmap_mode()) {
+ ASSERT_EQ(0, copied);
+ } else {
+ ASSERT_EQ(2, copied);
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ }
+}
+
+TEST_P(PlainTableDBTest, Iterator) {
+ for (size_t huge_page_tlb_size = 0; huge_page_tlb_size <= 2 * 1024 * 1024;
+ huge_page_tlb_size += 2 * 1024 * 1024) {
+ for (EncodingType encoding_type : {kPlain, kPrefix}) {
+ for (int bloom_bits = 0; bloom_bits <= 117; bloom_bits += 117) {
+ for (int total_order = 0; total_order <= 1; total_order++) {
+ if (encoding_type == kPrefix && total_order == 1) {
+ continue;
+ }
+ bool expect_bloom_not_match = false;
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ // Set only one bucket to force bucket conflict.
+ // Test index interval for the same prefix to be 1, 2 and 4
+ if (total_order) {
+ options.prefix_extractor = nullptr;
+
+ PlainTableOptions plain_table_options;
+ plain_table_options.user_key_len = 16;
+ plain_table_options.bloom_bits_per_key = bloom_bits;
+ plain_table_options.hash_table_ratio = 0;
+ plain_table_options.index_sparseness = 2;
+ plain_table_options.huge_page_tlb_size = huge_page_tlb_size;
+ plain_table_options.encoding_type = encoding_type;
+
+ options.table_factory.reset(new TestPlainTableFactory(
+ &expect_bloom_not_match, plain_table_options,
+ 0 /* column_family_id */, kDefaultColumnFamilyName));
+ } else {
+ PlainTableOptions plain_table_options;
+ plain_table_options.user_key_len = 16;
+ plain_table_options.bloom_bits_per_key = bloom_bits;
+ plain_table_options.hash_table_ratio = 0.75;
+ plain_table_options.index_sparseness = 16;
+ plain_table_options.huge_page_tlb_size = huge_page_tlb_size;
+ plain_table_options.encoding_type = encoding_type;
+
+ options.table_factory.reset(new TestPlainTableFactory(
+ &expect_bloom_not_match, plain_table_options,
+ 0 /* column_family_id */, kDefaultColumnFamilyName));
+ }
+ DestroyAndReopen(&options);
+
+ ASSERT_OK(Put("1000000000foo002", "v_2"));
+ ASSERT_OK(Put("0000000000000bar", "random"));
+ ASSERT_OK(Put("1000000000foo001", "v1"));
+ ASSERT_OK(Put("3000000000000bar", "bar_v"));
+ ASSERT_OK(Put("1000000000foo003", "v__3"));
+ ASSERT_OK(Put("1000000000foo004", "v__4"));
+ ASSERT_OK(Put("1000000000foo005", "v__5"));
+ ASSERT_OK(Put("1000000000foo007", "v__7"));
+ ASSERT_OK(Put("1000000000foo008", "v__8"));
+ ASSERT_OK(dbfull()->TEST_FlushMemTable());
+ ASSERT_EQ("v1", Get("1000000000foo001"));
+ ASSERT_EQ("v__3", Get("1000000000foo003"));
+ Iterator* iter = dbfull()->NewIterator(ReadOptions());
+ iter->Seek("1000000000foo000");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("1000000000foo001", iter->key().ToString());
+ ASSERT_EQ("v1", iter->value().ToString());
+
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("1000000000foo002", iter->key().ToString());
+ ASSERT_EQ("v_2", iter->value().ToString());
+
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("1000000000foo003", iter->key().ToString());
+ ASSERT_EQ("v__3", iter->value().ToString());
+
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("1000000000foo004", iter->key().ToString());
+ ASSERT_EQ("v__4", iter->value().ToString());
+
+ iter->Seek("3000000000000bar");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("3000000000000bar", iter->key().ToString());
+ ASSERT_EQ("bar_v", iter->value().ToString());
+
+ iter->Seek("1000000000foo000");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("1000000000foo001", iter->key().ToString());
+ ASSERT_EQ("v1", iter->value().ToString());
+
+ iter->Seek("1000000000foo005");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("1000000000foo005", iter->key().ToString());
+ ASSERT_EQ("v__5", iter->value().ToString());
+
+ iter->Seek("1000000000foo006");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("1000000000foo007", iter->key().ToString());
+ ASSERT_EQ("v__7", iter->value().ToString());
+
+ iter->Seek("1000000000foo008");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("1000000000foo008", iter->key().ToString());
+ ASSERT_EQ("v__8", iter->value().ToString());
+
+ if (total_order == 0) {
+ iter->Seek("1000000000foo009");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("3000000000000bar", iter->key().ToString());
+ }
+
+ // Test Bloom Filter
+ if (bloom_bits > 0) {
+ if (!total_order) {
+ // Neither the key nor its prefix exists.
+ expect_bloom_not_match = true;
+ iter->Seek("2not000000000bar");
+ ASSERT_TRUE(!iter->Valid());
+ ASSERT_EQ("NOT_FOUND", Get("2not000000000bar"));
+ expect_bloom_not_match = false;
+ } else {
+ expect_bloom_not_match = true;
+ ASSERT_EQ("NOT_FOUND", Get("2not000000000bar"));
+ expect_bloom_not_match = false;
+ }
+ }
+ ASSERT_OK(iter->status());
+ delete iter;
+ }
+ }
+ }
+ }
+}
+
+namespace {
+std::string NthKey(size_t n, char filler) {
+ std::string rv(16, filler);
+ rv[0] = n % 10;
+ rv[1] = (n / 10) % 10;
+ rv[2] = (n / 100) % 10;
+ rv[3] = (n / 1000) % 10;
+ return rv;
+}
+} // anonymous namespace
+
+TEST_P(PlainTableDBTest, BloomSchema) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ for (int bloom_locality = 0; bloom_locality <= 1; bloom_locality++) {
+ options.bloom_locality = bloom_locality;
+ PlainTableOptions plain_table_options;
+ plain_table_options.user_key_len = 16;
+ plain_table_options.bloom_bits_per_key = 3; // high FP rate for test
+ plain_table_options.hash_table_ratio = 0.75;
+ plain_table_options.index_sparseness = 16;
+ plain_table_options.huge_page_tlb_size = 0;
+ plain_table_options.encoding_type = kPlain;
+
+ bool expect_bloom_not_match = false;
+ options.table_factory.reset(new TestPlainTableFactory(
+ &expect_bloom_not_match, plain_table_options, 0 /* column_family_id */,
+ kDefaultColumnFamilyName));
+ DestroyAndReopen(&options);
+
+ for (unsigned i = 0; i < 2345; ++i) {
+ ASSERT_OK(Put(NthKey(i, 'y'), "added"));
+ }
+ ASSERT_OK(dbfull()->TEST_FlushMemTable());
+ ASSERT_EQ("added", Get(NthKey(42, 'y')));
+
+ for (unsigned i = 0; i < 32; ++i) {
+ // A known pattern of Bloom filter false positives can detect a schema
+ // change with high probability. The known FPs are stuffed into bits:
+ uint32_t pattern;
+ if (!bloom_locality) {
+ pattern = 1785868347UL;
+ } else if (CACHE_LINE_SIZE == 64U) {
+ pattern = 2421694657UL;
+ } else if (CACHE_LINE_SIZE == 128U) {
+ pattern = 788710956UL;
+ } else {
+ ASSERT_EQ(CACHE_LINE_SIZE, 256U);
+ pattern = 163905UL;
+ }
+ bool expect_fp = pattern & (1UL << i);
+ // fprintf(stderr, "expect_fp@%u: %d\n", i, (int)expect_fp);
+ expect_bloom_not_match = !expect_fp;
+ ASSERT_EQ("NOT_FOUND", Get(NthKey(i, 'n')));
+ }
+ }
+}
+
+namespace {
+std::string MakeLongKey(size_t length, char c) {
+ return std::string(length, c);
+}
+} // anonymous namespace
+
+TEST_P(PlainTableDBTest, IteratorLargeKeys) {
+ Options options = CurrentOptions();
+
+ PlainTableOptions plain_table_options;
+ plain_table_options.user_key_len = 0;
+ plain_table_options.bloom_bits_per_key = 0;
+ plain_table_options.hash_table_ratio = 0;
+
+ options.table_factory.reset(NewPlainTableFactory(plain_table_options));
+ options.create_if_missing = true;
+ options.prefix_extractor.reset();
+ DestroyAndReopen(&options);
+
+ std::string key_list[] = {MakeLongKey(30, '0'), MakeLongKey(16, '1'),
+ MakeLongKey(32, '2'), MakeLongKey(60, '3'),
+ MakeLongKey(90, '4'), MakeLongKey(50, '5'),
+ MakeLongKey(26, '6')};
+
+ for (size_t i = 0; i < 7; i++) {
+ ASSERT_OK(Put(key_list[i], std::to_string(i)));
+ }
+
+ ASSERT_OK(dbfull()->TEST_FlushMemTable());
+
+ Iterator* iter = dbfull()->NewIterator(ReadOptions());
+ iter->Seek(key_list[0]);
+
+ for (size_t i = 0; i < 7; i++) {
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(key_list[i], iter->key().ToString());
+ ASSERT_EQ(std::to_string(i), iter->value().ToString());
+ iter->Next();
+ }
+
+ ASSERT_TRUE(!iter->Valid());
+
+ delete iter;
+}
+
+namespace {
+std::string MakeLongKeyWithPrefix(size_t length, char c) {
+ return "00000000" + std::string(length - 8, c);
+}
+} // anonymous namespace
+
+TEST_P(PlainTableDBTest, IteratorLargeKeysWithPrefix) {
+ Options options = CurrentOptions();
+
+ PlainTableOptions plain_table_options;
+ plain_table_options.user_key_len = 16;
+ plain_table_options.bloom_bits_per_key = 0;
+ plain_table_options.hash_table_ratio = 0.8;
+ plain_table_options.index_sparseness = 3;
+ plain_table_options.huge_page_tlb_size = 0;
+ plain_table_options.encoding_type = kPrefix;
+
+ options.table_factory.reset(NewPlainTableFactory(plain_table_options));
+ options.create_if_missing = true;
+ DestroyAndReopen(&options);
+
+ std::string key_list[] = {
+ MakeLongKeyWithPrefix(30, '0'), MakeLongKeyWithPrefix(16, '1'),
+ MakeLongKeyWithPrefix(32, '2'), MakeLongKeyWithPrefix(60, '3'),
+ MakeLongKeyWithPrefix(90, '4'), MakeLongKeyWithPrefix(50, '5'),
+ MakeLongKeyWithPrefix(26, '6')};
+
+ for (size_t i = 0; i < 7; i++) {
+ ASSERT_OK(Put(key_list[i], std::to_string(i)));
+ }
+
+ ASSERT_OK(dbfull()->TEST_FlushMemTable());
+
+ Iterator* iter = dbfull()->NewIterator(ReadOptions());
+ iter->Seek(key_list[0]);
+
+ for (size_t i = 0; i < 7; i++) {
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(key_list[i], iter->key().ToString());
+ ASSERT_EQ(std::to_string(i), iter->value().ToString());
+ iter->Next();
+ }
+
+ ASSERT_TRUE(!iter->Valid());
+
+ delete iter;
+}
+
+TEST_P(PlainTableDBTest, IteratorReverseSuffixComparator) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ // Set only one bucket to force bucket conflict.
+ // Test index interval for the same prefix to be 1, 2 and 4
+ test::SimpleSuffixReverseComparator comp;
+ options.comparator = &comp;
+ DestroyAndReopen(&options);
+
+ ASSERT_OK(Put("1000000000foo002", "v_2"));
+ ASSERT_OK(Put("0000000000000bar", "random"));
+ ASSERT_OK(Put("1000000000foo001", "v1"));
+ ASSERT_OK(Put("3000000000000bar", "bar_v"));
+ ASSERT_OK(Put("1000000000foo003", "v__3"));
+ ASSERT_OK(Put("1000000000foo004", "v__4"));
+ ASSERT_OK(Put("1000000000foo005", "v__5"));
+ ASSERT_OK(Put("1000000000foo007", "v__7"));
+ ASSERT_OK(Put("1000000000foo008", "v__8"));
+ ASSERT_OK(dbfull()->TEST_FlushMemTable());
+ ASSERT_EQ("v1", Get("1000000000foo001"));
+ ASSERT_EQ("v__3", Get("1000000000foo003"));
+ Iterator* iter = dbfull()->NewIterator(ReadOptions());
+ iter->Seek("1000000000foo009");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("1000000000foo008", iter->key().ToString());
+ ASSERT_EQ("v__8", iter->value().ToString());
+
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("1000000000foo007", iter->key().ToString());
+ ASSERT_EQ("v__7", iter->value().ToString());
+
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("1000000000foo005", iter->key().ToString());
+ ASSERT_EQ("v__5", iter->value().ToString());
+
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("1000000000foo004", iter->key().ToString());
+ ASSERT_EQ("v__4", iter->value().ToString());
+
+ iter->Seek("3000000000000bar");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("3000000000000bar", iter->key().ToString());
+ ASSERT_EQ("bar_v", iter->value().ToString());
+
+ iter->Seek("1000000000foo005");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("1000000000foo005", iter->key().ToString());
+ ASSERT_EQ("v__5", iter->value().ToString());
+
+ iter->Seek("1000000000foo006");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("1000000000foo005", iter->key().ToString());
+ ASSERT_EQ("v__5", iter->value().ToString());
+
+ iter->Seek("1000000000foo008");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("1000000000foo008", iter->key().ToString());
+ ASSERT_EQ("v__8", iter->value().ToString());
+
+ iter->Seek("1000000000foo000");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("3000000000000bar", iter->key().ToString());
+
+ delete iter;
+}
+
+TEST_P(PlainTableDBTest, HashBucketConflict) {
+ for (size_t huge_page_tlb_size = 0; huge_page_tlb_size <= 2 * 1024 * 1024;
+ huge_page_tlb_size += 2 * 1024 * 1024) {
+ for (unsigned char i = 1; i <= 3; i++) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ // Set only one bucket to force bucket conflict.
+ // Test index interval for the same prefix to be 1, 2 and 4
+
+ PlainTableOptions plain_table_options;
+ plain_table_options.user_key_len = 16;
+ plain_table_options.bloom_bits_per_key = 0;
+ plain_table_options.hash_table_ratio = 0;
+ plain_table_options.index_sparseness = 2 ^ i;
+ plain_table_options.huge_page_tlb_size = huge_page_tlb_size;
+
+ options.table_factory.reset(NewPlainTableFactory(plain_table_options));
+
+ DestroyAndReopen(&options);
+ ASSERT_OK(Put("5000000000000fo0", "v1"));
+ ASSERT_OK(Put("5000000000000fo1", "v2"));
+ ASSERT_OK(Put("5000000000000fo2", "v"));
+ ASSERT_OK(Put("2000000000000fo0", "v3"));
+ ASSERT_OK(Put("2000000000000fo1", "v4"));
+ ASSERT_OK(Put("2000000000000fo2", "v"));
+ ASSERT_OK(Put("2000000000000fo3", "v"));
+
+ ASSERT_OK(dbfull()->TEST_FlushMemTable());
+
+ ASSERT_EQ("v1", Get("5000000000000fo0"));
+ ASSERT_EQ("v2", Get("5000000000000fo1"));
+ ASSERT_EQ("v3", Get("2000000000000fo0"));
+ ASSERT_EQ("v4", Get("2000000000000fo1"));
+
+ ASSERT_EQ("NOT_FOUND", Get("5000000000000bar"));
+ ASSERT_EQ("NOT_FOUND", Get("2000000000000bar"));
+ ASSERT_EQ("NOT_FOUND", Get("5000000000000fo8"));
+ ASSERT_EQ("NOT_FOUND", Get("2000000000000fo8"));
+
+ ReadOptions ro;
+ Iterator* iter = dbfull()->NewIterator(ro);
+
+ iter->Seek("5000000000000fo0");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("5000000000000fo0", iter->key().ToString());
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("5000000000000fo1", iter->key().ToString());
+
+ iter->Seek("5000000000000fo1");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("5000000000000fo1", iter->key().ToString());
+
+ iter->Seek("2000000000000fo0");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("2000000000000fo0", iter->key().ToString());
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("2000000000000fo1", iter->key().ToString());
+
+ iter->Seek("2000000000000fo1");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("2000000000000fo1", iter->key().ToString());
+
+ iter->Seek("2000000000000bar");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("2000000000000fo0", iter->key().ToString());
+
+ iter->Seek("5000000000000bar");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("5000000000000fo0", iter->key().ToString());
+
+ iter->Seek("2000000000000fo8");
+ ASSERT_TRUE(!iter->Valid() ||
+ options.comparator->Compare(iter->key(), "20000001") > 0);
+
+ iter->Seek("5000000000000fo8");
+ ASSERT_TRUE(!iter->Valid());
+
+ iter->Seek("1000000000000fo2");
+ ASSERT_TRUE(!iter->Valid());
+
+ iter->Seek("3000000000000fo2");
+ ASSERT_TRUE(!iter->Valid());
+
+ iter->Seek("8000000000000fo2");
+ ASSERT_TRUE(!iter->Valid());
+
+ ASSERT_OK(iter->status());
+ delete iter;
+ }
+ }
+}
+
+TEST_P(PlainTableDBTest, HashBucketConflictReverseSuffixComparator) {
+ for (size_t huge_page_tlb_size = 0; huge_page_tlb_size <= 2 * 1024 * 1024;
+ huge_page_tlb_size += 2 * 1024 * 1024) {
+ for (unsigned char i = 1; i <= 3; i++) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ test::SimpleSuffixReverseComparator comp;
+ options.comparator = &comp;
+ // Set only one bucket to force bucket conflict.
+ // Test index interval for the same prefix to be 1, 2 and 4
+
+ PlainTableOptions plain_table_options;
+ plain_table_options.user_key_len = 16;
+ plain_table_options.bloom_bits_per_key = 0;
+ plain_table_options.hash_table_ratio = 0;
+ plain_table_options.index_sparseness = 2 ^ i;
+ plain_table_options.huge_page_tlb_size = huge_page_tlb_size;
+
+ options.table_factory.reset(NewPlainTableFactory(plain_table_options));
+ DestroyAndReopen(&options);
+ ASSERT_OK(Put("5000000000000fo0", "v1"));
+ ASSERT_OK(Put("5000000000000fo1", "v2"));
+ ASSERT_OK(Put("5000000000000fo2", "v"));
+ ASSERT_OK(Put("2000000000000fo0", "v3"));
+ ASSERT_OK(Put("2000000000000fo1", "v4"));
+ ASSERT_OK(Put("2000000000000fo2", "v"));
+ ASSERT_OK(Put("2000000000000fo3", "v"));
+
+ ASSERT_OK(dbfull()->TEST_FlushMemTable());
+
+ ASSERT_EQ("v1", Get("5000000000000fo0"));
+ ASSERT_EQ("v2", Get("5000000000000fo1"));
+ ASSERT_EQ("v3", Get("2000000000000fo0"));
+ ASSERT_EQ("v4", Get("2000000000000fo1"));
+
+ ASSERT_EQ("NOT_FOUND", Get("5000000000000bar"));
+ ASSERT_EQ("NOT_FOUND", Get("2000000000000bar"));
+ ASSERT_EQ("NOT_FOUND", Get("5000000000000fo8"));
+ ASSERT_EQ("NOT_FOUND", Get("2000000000000fo8"));
+
+ ReadOptions ro;
+ Iterator* iter = dbfull()->NewIterator(ro);
+
+ iter->Seek("5000000000000fo1");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("5000000000000fo1", iter->key().ToString());
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("5000000000000fo0", iter->key().ToString());
+
+ iter->Seek("5000000000000fo1");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("5000000000000fo1", iter->key().ToString());
+
+ iter->Seek("2000000000000fo1");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("2000000000000fo1", iter->key().ToString());
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("2000000000000fo0", iter->key().ToString());
+
+ iter->Seek("2000000000000fo1");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("2000000000000fo1", iter->key().ToString());
+
+ iter->Seek("2000000000000var");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("2000000000000fo3", iter->key().ToString());
+
+ iter->Seek("5000000000000var");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("5000000000000fo2", iter->key().ToString());
+
+ std::string seek_key = "2000000000000bar";
+ iter->Seek(seek_key);
+ ASSERT_TRUE(!iter->Valid() ||
+ options.prefix_extractor->Transform(iter->key()) !=
+ options.prefix_extractor->Transform(seek_key));
+
+ iter->Seek("1000000000000fo2");
+ ASSERT_TRUE(!iter->Valid());
+
+ iter->Seek("3000000000000fo2");
+ ASSERT_TRUE(!iter->Valid());
+
+ iter->Seek("8000000000000fo2");
+ ASSERT_TRUE(!iter->Valid());
+
+ ASSERT_OK(iter->status());
+ delete iter;
+ }
+ }
+}
+
+TEST_P(PlainTableDBTest, NonExistingKeyToNonEmptyBucket) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ // Set only one bucket to force bucket conflict.
+ // Test index interval for the same prefix to be 1, 2 and 4
+ PlainTableOptions plain_table_options;
+ plain_table_options.user_key_len = 16;
+ plain_table_options.bloom_bits_per_key = 0;
+ plain_table_options.hash_table_ratio = 0;
+ plain_table_options.index_sparseness = 5;
+
+ options.table_factory.reset(NewPlainTableFactory(plain_table_options));
+ DestroyAndReopen(&options);
+ ASSERT_OK(Put("5000000000000fo0", "v1"));
+ ASSERT_OK(Put("5000000000000fo1", "v2"));
+ ASSERT_OK(Put("5000000000000fo2", "v3"));
+
+ ASSERT_OK(dbfull()->TEST_FlushMemTable());
+
+ ASSERT_EQ("v1", Get("5000000000000fo0"));
+ ASSERT_EQ("v2", Get("5000000000000fo1"));
+ ASSERT_EQ("v3", Get("5000000000000fo2"));
+
+ ASSERT_EQ("NOT_FOUND", Get("8000000000000bar"));
+ ASSERT_EQ("NOT_FOUND", Get("1000000000000bar"));
+
+ Iterator* iter = dbfull()->NewIterator(ReadOptions());
+
+ iter->Seek("5000000000000bar");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("5000000000000fo0", iter->key().ToString());
+
+ iter->Seek("5000000000000fo8");
+ ASSERT_TRUE(!iter->Valid());
+
+ iter->Seek("1000000000000fo2");
+ ASSERT_TRUE(!iter->Valid());
+
+ iter->Seek("8000000000000fo2");
+ ASSERT_TRUE(!iter->Valid());
+
+ ASSERT_OK(iter->status());
+ delete iter;
+}
+
+static std::string Key(int i) {
+ char buf[100];
+ snprintf(buf, sizeof(buf), "key_______%06d", i);
+ return std::string(buf);
+}
+
+TEST_P(PlainTableDBTest, CompactionTrigger) {
+ Options options = CurrentOptions();
+ options.write_buffer_size = 120 << 10; // 120KB
+ options.num_levels = 3;
+ options.level0_file_num_compaction_trigger = 3;
+ Reopen(&options);
+
+ Random rnd(301);
+
+ for (int num = 0; num < options.level0_file_num_compaction_trigger - 1;
+ num++) {
+ std::vector<std::string> values;
+ // Write 120KB (10 values, each 12K)
+ for (int i = 0; i < 10; i++) {
+ values.push_back(rnd.RandomString(12 << 10));
+ ASSERT_OK(Put(Key(i), values[i]));
+ }
+ ASSERT_OK(Put(Key(999), ""));
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_EQ(NumTableFilesAtLevel(0), num + 1);
+ }
+
+ // generate one more file in level-0, and should trigger level-0 compaction
+ std::vector<std::string> values;
+ for (int i = 0; i < 12; i++) {
+ values.push_back(rnd.RandomString(10000));
+ ASSERT_OK(Put(Key(i), values[i]));
+ }
+ ASSERT_OK(Put(Key(999), ""));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+ ASSERT_EQ(NumTableFilesAtLevel(1), 1);
+}
+
+TEST_P(PlainTableDBTest, AdaptiveTable) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+
+ options.table_factory.reset(NewPlainTableFactory());
+ DestroyAndReopen(&options);
+
+ ASSERT_OK(Put("1000000000000foo", "v1"));
+ ASSERT_OK(Put("0000000000000bar", "v2"));
+ ASSERT_OK(Put("1000000000000foo", "v3"));
+ ASSERT_OK(dbfull()->TEST_FlushMemTable());
+
+ options.create_if_missing = false;
+ std::shared_ptr<TableFactory> block_based_factory(
+ NewBlockBasedTableFactory());
+ std::shared_ptr<TableFactory> plain_table_factory(NewPlainTableFactory());
+ std::shared_ptr<TableFactory> dummy_factory;
+ options.table_factory.reset(NewAdaptiveTableFactory(
+ block_based_factory, block_based_factory, plain_table_factory));
+ Reopen(&options);
+ ASSERT_EQ("v3", Get("1000000000000foo"));
+ ASSERT_EQ("v2", Get("0000000000000bar"));
+
+ ASSERT_OK(Put("2000000000000foo", "v4"));
+ ASSERT_OK(Put("3000000000000bar", "v5"));
+ ASSERT_OK(dbfull()->TEST_FlushMemTable());
+ ASSERT_EQ("v4", Get("2000000000000foo"));
+ ASSERT_EQ("v5", Get("3000000000000bar"));
+
+ Reopen(&options);
+ ASSERT_EQ("v3", Get("1000000000000foo"));
+ ASSERT_EQ("v2", Get("0000000000000bar"));
+ ASSERT_EQ("v4", Get("2000000000000foo"));
+ ASSERT_EQ("v5", Get("3000000000000bar"));
+
+ options.paranoid_checks = false;
+ options.table_factory.reset(NewBlockBasedTableFactory());
+ Reopen(&options);
+ ASSERT_NE("v3", Get("1000000000000foo"));
+
+ options.paranoid_checks = false;
+ options.table_factory.reset(NewPlainTableFactory());
+ Reopen(&options);
+ ASSERT_NE("v5", Get("3000000000000bar"));
+}
+
+INSTANTIATE_TEST_CASE_P(PlainTableDBTest, PlainTableDBTest, ::testing::Bool());
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+ fprintf(stderr, "SKIPPED as plain table is not supported in ROCKSDB_LITE\n");
+ return 0;
+}
+
+#endif // !ROCKSDB_LITE
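For context on the format exercised above, a standalone sketch of the PlainTable setup that the fixture's CurrentOptions() assembles (hypothetical helper OpenPlainTableDb; the bloom and sparseness values are illustrative rather than the test's): a PlainTable factory combined with a fixed-length prefix extractor, which the prefix-hash index requires whenever hash_table_ratio > 0 (see BadOptions1/BadOptions2).

    #include <string>

    #include "rocksdb/db.h"
    #include "rocksdb/options.h"
    #include "rocksdb/slice_transform.h"
    #include "rocksdb/table.h"

    ROCKSDB_NAMESPACE::Status OpenPlainTableDb(const std::string& path,
                                               ROCKSDB_NAMESPACE::DB** db) {
      using namespace ROCKSDB_NAMESPACE;

      Options options;
      options.create_if_missing = true;

      PlainTableOptions pto;
      pto.user_key_len = 0;         // 0 means variable-length user keys
      pto.bloom_bits_per_key = 10;  // illustrative value
      pto.hash_table_ratio = 0.75;  // > 0 enables the prefix hash index
      pto.index_sparseness = 16;
      options.table_factory.reset(NewPlainTableFactory(pto));

      // Required for prefix hash mode, as the BadOptions tests demonstrate.
      options.prefix_extractor.reset(NewFixedPrefixTransform(8));

      return DB::Open(options, path, db);
    }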
diff --git a/src/rocksdb/db/post_memtable_callback.h b/src/rocksdb/db/post_memtable_callback.h
new file mode 100644
index 000000000..fbf2fbe86
--- /dev/null
+++ b/src/rocksdb/db/post_memtable_callback.h
@@ -0,0 +1,25 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include "rocksdb/status.h"
+#include "rocksdb/types.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Callback invoked after finishing writing to the memtable but before
+// publishing the sequence number to readers.
+// Note that with write-prepared/write-unprepared transactions with
+// two-write-queues, PreReleaseCallback is called before publishing the
+// sequence numbers to readers.
+class PostMemTableCallback {
+ public:
+ virtual ~PostMemTableCallback() {}
+
+ virtual Status operator()(SequenceNumber seq, bool disable_memtable) = 0;
+};
+
+} // namespace ROCKSDB_NAMESPACE
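A sketch of what an implementation of this interface could look like (hypothetical RecordLastSeqCallback, not defined elsewhere in this patch): it records the last sequence number written to the memtable and returns OK so the write path proceeds to publish it.

    #include <atomic>

    #include "db/post_memtable_callback.h"

    namespace ROCKSDB_NAMESPACE {

    class RecordLastSeqCallback : public PostMemTableCallback {
     public:
      Status operator()(SequenceNumber seq, bool /*disable_memtable*/) override {
        last_seq_.store(seq, std::memory_order_relaxed);
        return Status::OK();
      }

      SequenceNumber LastSeq() const {
        return last_seq_.load(std::memory_order_relaxed);
      }

     private:
      std::atomic<SequenceNumber> last_seq_{0};
    };

    }  // namespace ROCKSDB_NAMESPACE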
diff --git a/src/rocksdb/db/pre_release_callback.h b/src/rocksdb/db/pre_release_callback.h
new file mode 100644
index 000000000..6b9039487
--- /dev/null
+++ b/src/rocksdb/db/pre_release_callback.h
@@ -0,0 +1,37 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include "rocksdb/status.h"
+#include "rocksdb/types.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class PreReleaseCallback {
+ public:
+ virtual ~PreReleaseCallback() {}
+
+ // Will be called while on the write thread after the write to the WAL and
+ // before the write to memtable. This is useful if any operation needs to be
+ // done before the write gets visible to the readers, or if we want to reduce
+ // the overhead of locking by updating something sequentially while we are on
+ // the write thread. If the callback fails, this function returns a non-OK
+ // status, the sequence number will not be released, and same status will be
+ // propagated to all the writers in the write group.
+ // seq is the sequence number that is used for this write and will be
+ // released.
+ // is_mem_disabled is currently used for debugging purposes to assert that
+ // the callback is done from the right write queue.
+ // If non-zero, log_number indicates the WAL log to which we wrote.
+ // index >= 0 specifies the order of callback in the same write thread.
+ // total > index specifies the total number of callbacks in the same write
+ // thread. Together with index, could be used to reduce the redundant
+ // operations among the callbacks.
+ virtual Status Callback(SequenceNumber seq, bool is_mem_disabled,
+ uint64_t log_number, size_t index, size_t total) = 0;
+};
+
+} // namespace ROCKSDB_NAMESPACE
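As a concrete illustration of the contract documented above (hypothetical TrackMaxSeqCallback, not part of this patch): the callback tracks the highest sequence number about to be released; returning a non-OK status from it would instead fail every writer in the write group.

    #include <atomic>
    #include <cassert>

    #include "db/pre_release_callback.h"

    namespace ROCKSDB_NAMESPACE {

    class TrackMaxSeqCallback : public PreReleaseCallback {
     public:
      Status Callback(SequenceNumber seq, bool /*is_mem_disabled*/,
                      uint64_t /*log_number*/, size_t index,
                      size_t total) override {
        assert(index < total);  // per the comment above, total > index
        (void)index;
        (void)total;
        SequenceNumber prev = max_seq_.load(std::memory_order_relaxed);
        while (prev < seq && !max_seq_.compare_exchange_weak(
                                 prev, seq, std::memory_order_relaxed)) {
          // prev is refreshed by compare_exchange_weak on failure.
        }
        return Status::OK();
      }

      SequenceNumber MaxSeq() const {
        return max_seq_.load(std::memory_order_relaxed);
      }

     private:
      std::atomic<SequenceNumber> max_seq_{0};
    };

    }  // namespace ROCKSDB_NAMESPACE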
diff --git a/src/rocksdb/db/prefix_test.cc b/src/rocksdb/db/prefix_test.cc
new file mode 100644
index 000000000..8592b8f31
--- /dev/null
+++ b/src/rocksdb/db/prefix_test.cc
@@ -0,0 +1,906 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#ifndef GFLAGS
+#include <cstdio>
+int main() {
+ fprintf(stderr, "Please install gflags to run this test... Skipping...\n");
+ return 0;
+}
+#else
+
+#include <algorithm>
+#include <iostream>
+#include <vector>
+
+#include "db/db_impl/db_impl.h"
+#include "monitoring/histogram.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/db.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/perf_context.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/system_clock.h"
+#include "rocksdb/table.h"
+#include "test_util/testharness.h"
+#include "util/cast_util.h"
+#include "util/coding.h"
+#include "util/gflags_compat.h"
+#include "util/random.h"
+#include "util/stop_watch.h"
+#include "util/string_util.h"
+#include "utilities/merge_operators.h"
+
+using GFLAGS_NAMESPACE::ParseCommandLineFlags;
+
+DEFINE_bool(trigger_deadlock, false,
+ "issue delete in range scan to trigger PrefixHashMap deadlock");
+DEFINE_int32(bucket_count, 100000, "number of buckets");
+DEFINE_uint64(num_locks, 10001, "number of locks");
+DEFINE_bool(random_prefix, false, "randomize prefix");
+DEFINE_uint64(total_prefixes, 100000, "total number of prefixes");
+DEFINE_uint64(items_per_prefix, 1, "total number of values per prefix");
+DEFINE_int64(write_buffer_size, 33554432, "");
+DEFINE_int32(max_write_buffer_number, 2, "");
+DEFINE_int32(min_write_buffer_number_to_merge, 1, "");
+DEFINE_int32(skiplist_height, 4, "");
+DEFINE_double(memtable_prefix_bloom_size_ratio, 0.1, "");
+DEFINE_int32(memtable_huge_page_size, 2 * 1024 * 1024, "");
+DEFINE_int32(value_size, 40, "");
+DEFINE_bool(enable_print, false, "Print options generated to console.");
+
+// Path to the database on file system
+const std::string kDbName =
+ ROCKSDB_NAMESPACE::test::PerThreadDBPath("prefix_test");
+
+namespace ROCKSDB_NAMESPACE {
+
+struct TestKey {
+ uint64_t prefix;
+ uint64_t sorted;
+
+ TestKey(uint64_t _prefix, uint64_t _sorted)
+ : prefix(_prefix), sorted(_sorted) {}
+};
+
+ // Return a Slice encoding test_key, backed by the caller-provided string s
+inline Slice TestKeyToSlice(std::string& s, const TestKey& test_key) {
+ s.clear();
+ PutFixed64(&s, test_key.prefix);
+ PutFixed64(&s, test_key.sorted);
+ return Slice(s.c_str(), s.size());
+}
+
+inline const TestKey SliceToTestKey(const Slice& slice) {
+ return TestKey(DecodeFixed64(slice.data()), DecodeFixed64(slice.data() + 8));
+}
+
+class TestKeyComparator : public Comparator {
+ public:
+ // Compare needs to be aware of the possibility that a and/or b may be
+ // prefix-only
+ int Compare(const Slice& a, const Slice& b) const override {
+ const TestKey kkey_a = SliceToTestKey(a);
+ const TestKey kkey_b = SliceToTestKey(b);
+ const TestKey* key_a = &kkey_a;
+ const TestKey* key_b = &kkey_b;
+ if (key_a->prefix != key_b->prefix) {
+ if (key_a->prefix < key_b->prefix) return -1;
+ if (key_a->prefix > key_b->prefix) return 1;
+ } else {
+ EXPECT_TRUE(key_a->prefix == key_b->prefix);
+ // note, both a and b could be prefix only
+ if (a.size() != b.size()) {
+ // one of them is prefix
+ EXPECT_TRUE(
+ (a.size() == sizeof(uint64_t) && b.size() == sizeof(TestKey)) ||
+ (b.size() == sizeof(uint64_t) && a.size() == sizeof(TestKey)));
+ if (a.size() < b.size()) return -1;
+ if (a.size() > b.size()) return 1;
+ } else {
+ // both a and b are prefix
+ if (a.size() == sizeof(uint64_t)) {
+ return 0;
+ }
+
+ // both a and b are whole key
+ EXPECT_TRUE(a.size() == sizeof(TestKey) && b.size() == sizeof(TestKey));
+ if (key_a->sorted < key_b->sorted) return -1;
+ if (key_a->sorted > key_b->sorted) return 1;
+ if (key_a->sorted == key_b->sorted) return 0;
+ }
+ }
+ return 0;
+ }
+
+ bool operator()(const TestKey& a, const TestKey& b) const {
+ std::string sa, sb;
+ return Compare(TestKeyToSlice(sa, a), TestKeyToSlice(sb, b)) < 0;
+ }
+
+ const char* Name() const override { return "TestKeyComparator"; }
+
+ void FindShortestSeparator(std::string* /*start*/,
+ const Slice& /*limit*/) const override {}
+
+ void FindShortSuccessor(std::string* /*key*/) const override {}
+};
+
+namespace {
+void PutKey(DB* db, WriteOptions write_options, uint64_t prefix,
+ uint64_t suffix, const Slice& value) {
+ TestKey test_key(prefix, suffix);
+ std::string s;
+ Slice key = TestKeyToSlice(s, test_key);
+ ASSERT_OK(db->Put(write_options, key, value));
+}
+
+void PutKey(DB* db, WriteOptions write_options, const TestKey& test_key,
+ const Slice& value) {
+ std::string s;
+ Slice key = TestKeyToSlice(s, test_key);
+ ASSERT_OK(db->Put(write_options, key, value));
+}
+
+void MergeKey(DB* db, WriteOptions write_options, const TestKey& test_key,
+ const Slice& value) {
+ std::string s;
+ Slice key = TestKeyToSlice(s, test_key);
+ ASSERT_OK(db->Merge(write_options, key, value));
+}
+
+void DeleteKey(DB* db, WriteOptions write_options, const TestKey& test_key) {
+ std::string s;
+ Slice key = TestKeyToSlice(s, test_key);
+ ASSERT_OK(db->Delete(write_options, key));
+}
+
+void SeekIterator(Iterator* iter, uint64_t prefix, uint64_t suffix) {
+ TestKey test_key(prefix, suffix);
+ std::string s;
+ Slice key = TestKeyToSlice(s, test_key);
+ iter->Seek(key);
+}
+
+const std::string kNotFoundResult = "NOT_FOUND";
+
+std::string Get(DB* db, const ReadOptions& read_options, uint64_t prefix,
+ uint64_t suffix) {
+ TestKey test_key(prefix, suffix);
+ std::string s2;
+ Slice key = TestKeyToSlice(s2, test_key);
+
+ std::string result;
+ Status s = db->Get(read_options, key, &result);
+ if (s.IsNotFound()) {
+ result = kNotFoundResult;
+ } else if (!s.ok()) {
+ result = s.ToString();
+ }
+ return result;
+}
+
+class SamePrefixTransform : public SliceTransform {
+ private:
+ const Slice prefix_;
+ std::string name_;
+
+ public:
+ explicit SamePrefixTransform(const Slice& prefix)
+ : prefix_(prefix), name_("rocksdb.SamePrefix." + prefix.ToString()) {}
+
+ const char* Name() const override { return name_.c_str(); }
+
+ Slice Transform(const Slice& src) const override {
+ assert(InDomain(src));
+ return prefix_;
+ }
+
+ bool InDomain(const Slice& src) const override {
+ if (src.size() >= prefix_.size()) {
+ return Slice(src.data(), prefix_.size()) == prefix_;
+ }
+ return false;
+ }
+
+ bool InRange(const Slice& dst) const override { return dst == prefix_; }
+
+ bool FullLengthEnabled(size_t* /*len*/) const override { return false; }
+};
+
+} // anonymous namespace
+
+class PrefixTest : public testing::Test {
+ public:
+ std::shared_ptr<DB> OpenDb() {
+ DB* db;
+
+ options.create_if_missing = true;
+ options.write_buffer_size = FLAGS_write_buffer_size;
+ options.max_write_buffer_number = FLAGS_max_write_buffer_number;
+ options.min_write_buffer_number_to_merge =
+ FLAGS_min_write_buffer_number_to_merge;
+
+ options.memtable_prefix_bloom_size_ratio =
+ FLAGS_memtable_prefix_bloom_size_ratio;
+ options.memtable_huge_page_size = FLAGS_memtable_huge_page_size;
+
+ options.prefix_extractor.reset(NewFixedPrefixTransform(8));
+ BlockBasedTableOptions bbto;
+ bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
+ bbto.whole_key_filtering = false;
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ options.allow_concurrent_memtable_write = false;
+
+ Status s = DB::Open(options, kDbName, &db);
+ EXPECT_OK(s);
+ return std::shared_ptr<DB>(db);
+ }
+
+ void FirstOption() { option_config_ = kBegin; }
+
+ bool NextOptions(int bucket_count) {
+ // Advance to the next memtable option configuration.
+ option_config_++;
+ if (option_config_ < kEnd) {
+ options.prefix_extractor.reset(NewFixedPrefixTransform(8));
+ switch (option_config_) {
+ case kHashSkipList:
+ options.memtable_factory.reset(
+ NewHashSkipListRepFactory(bucket_count, FLAGS_skiplist_height));
+ return true;
+ case kHashLinkList:
+ options.memtable_factory.reset(
+ NewHashLinkListRepFactory(bucket_count));
+ return true;
+ case kHashLinkListHugePageTlb:
+ options.memtable_factory.reset(
+ NewHashLinkListRepFactory(bucket_count, 2 * 1024 * 1024));
+ return true;
+ case kHashLinkListTriggerSkipList:
+ options.memtable_factory.reset(
+ NewHashLinkListRepFactory(bucket_count, 0, 3));
+ return true;
+ default:
+ return false;
+ }
+ }
+ return false;
+ }
+
+ PrefixTest() : option_config_(kBegin) {
+ options.comparator = new TestKeyComparator();
+ }
+ ~PrefixTest() override { delete options.comparator; }
+
+ protected:
+ enum OptionConfig {
+ kBegin,
+ kHashSkipList,
+ kHashLinkList,
+ kHashLinkListHugePageTlb,
+ kHashLinkListTriggerSkipList,
+ kEnd
+ };
+ int option_config_;
+ Options options;
+};
+
+TEST(SamePrefixTest, InDomainTest) {
+ DB* db;
+ Options options;
+ options.create_if_missing = true;
+ options.prefix_extractor.reset(new SamePrefixTransform("HHKB"));
+ BlockBasedTableOptions bbto;
+ bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
+ bbto.whole_key_filtering = false;
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ WriteOptions write_options;
+ ReadOptions read_options;
+ {
+ ASSERT_OK(DestroyDB(kDbName, Options()));
+ ASSERT_OK(DB::Open(options, kDbName, &db));
+ ASSERT_OK(db->Put(write_options, "HHKB pro2", "Mar 24, 2006"));
+ ASSERT_OK(db->Put(write_options, "HHKB pro2 Type-S", "June 29, 2011"));
+ ASSERT_OK(db->Put(write_options, "Realforce 87u", "idk"));
+ ASSERT_OK(db->Flush(FlushOptions()));
+ std::string result;
+ auto db_iter = db->NewIterator(ReadOptions());
+
+ db_iter->Seek("Realforce 87u");
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_OK(db_iter->status());
+ ASSERT_EQ(db_iter->key(), "Realforce 87u");
+ ASSERT_EQ(db_iter->value(), "idk");
+
+ delete db_iter;
+ delete db;
+ ASSERT_OK(DestroyDB(kDbName, Options()));
+ }
+
+ {
+ ASSERT_OK(DB::Open(options, kDbName, &db));
+ ASSERT_OK(db->Put(write_options, "pikachu", "1"));
+ ASSERT_OK(db->Put(write_options, "Meowth", "1"));
+ ASSERT_OK(db->Put(write_options, "Mewtwo", "idk"));
+ ASSERT_OK(db->Flush(FlushOptions()));
+ std::string result;
+ auto db_iter = db->NewIterator(ReadOptions());
+
+ db_iter->Seek("Mewtwo");
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_OK(db_iter->status());
+ delete db_iter;
+ delete db;
+ ASSERT_OK(DestroyDB(kDbName, Options()));
+ }
+}
+
+TEST_F(PrefixTest, TestResult) {
+ for (int num_buckets = 1; num_buckets <= 2; num_buckets++) {
+ FirstOption();
+ while (NextOptions(num_buckets)) {
+ std::cout << "*** Mem table: " << options.memtable_factory->Name()
+ << " number of buckets: " << num_buckets << std::endl;
+ ASSERT_OK(DestroyDB(kDbName, Options()));
+ auto db = OpenDb();
+ WriteOptions write_options;
+ ReadOptions read_options;
+
+ // 1. Insert one row.
+ Slice v16("v16");
+ PutKey(db.get(), write_options, 1, 6, v16);
+ std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
+ SeekIterator(iter.get(), 1, 6);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v16 == iter->value());
+ SeekIterator(iter.get(), 1, 5);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v16 == iter->value());
+ SeekIterator(iter.get(), 1, 5);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v16 == iter->value());
+ iter->Next();
+ ASSERT_TRUE(!iter->Valid());
+ ASSERT_OK(iter->status());
+
+ SeekIterator(iter.get(), 2, 0);
+ ASSERT_TRUE(!iter->Valid());
+ ASSERT_OK(iter->status());
+
+ ASSERT_EQ(v16.ToString(), Get(db.get(), read_options, 1, 6));
+ ASSERT_EQ(kNotFoundResult, Get(db.get(), read_options, 1, 5));
+ ASSERT_EQ(kNotFoundResult, Get(db.get(), read_options, 1, 7));
+ ASSERT_EQ(kNotFoundResult, Get(db.get(), read_options, 0, 6));
+ ASSERT_EQ(kNotFoundResult, Get(db.get(), read_options, 2, 6));
+
+ // 2. Insert an entry for the same prefix as the last entry in the bucket.
+ Slice v17("v17");
+ PutKey(db.get(), write_options, 1, 7, v17);
+ iter.reset(db->NewIterator(read_options));
+ SeekIterator(iter.get(), 1, 7);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v17 == iter->value());
+
+ SeekIterator(iter.get(), 1, 6);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v16 == iter->value());
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v17 == iter->value());
+ iter->Next();
+ ASSERT_TRUE(!iter->Valid());
+ ASSERT_OK(iter->status());
+
+ SeekIterator(iter.get(), 2, 0);
+ ASSERT_TRUE(!iter->Valid());
+ ASSERT_OK(iter->status());
+
+ // 3. Insert an entry for the same prefix as the head of the bucket.
+ Slice v15("v15");
+ PutKey(db.get(), write_options, 1, 5, v15);
+ iter.reset(db->NewIterator(read_options));
+
+ SeekIterator(iter.get(), 1, 7);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v17 == iter->value());
+
+ SeekIterator(iter.get(), 1, 5);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v15 == iter->value());
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v16 == iter->value());
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v17 == iter->value());
+
+ SeekIterator(iter.get(), 1, 5);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v15 == iter->value());
+
+ ASSERT_EQ(v15.ToString(), Get(db.get(), read_options, 1, 5));
+ ASSERT_EQ(v16.ToString(), Get(db.get(), read_options, 1, 6));
+ ASSERT_EQ(v17.ToString(), Get(db.get(), read_options, 1, 7));
+
+ // 4. Insert an entry with a larger prefix
+ Slice v22("v22");
+ PutKey(db.get(), write_options, 2, 2, v22);
+ iter.reset(db->NewIterator(read_options));
+
+ SeekIterator(iter.get(), 2, 2);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v22 == iter->value());
+ SeekIterator(iter.get(), 2, 0);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v22 == iter->value());
+
+ SeekIterator(iter.get(), 1, 5);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v15 == iter->value());
+
+ SeekIterator(iter.get(), 1, 7);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v17 == iter->value());
+
+ // 5. Insert an entry with a smaller prefix
+ Slice v02("v02");
+ PutKey(db.get(), write_options, 0, 2, v02);
+ iter.reset(db->NewIterator(read_options));
+
+ SeekIterator(iter.get(), 0, 2);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v02 == iter->value());
+ SeekIterator(iter.get(), 0, 0);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v02 == iter->value());
+
+ SeekIterator(iter.get(), 2, 0);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v22 == iter->value());
+
+ SeekIterator(iter.get(), 1, 5);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v15 == iter->value());
+
+ SeekIterator(iter.get(), 1, 7);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v17 == iter->value());
+
+ // 6. Insert to the beginning and the end of the first prefix
+ Slice v13("v13");
+ Slice v18("v18");
+ PutKey(db.get(), write_options, 1, 3, v13);
+ PutKey(db.get(), write_options, 1, 8, v18);
+ iter.reset(db->NewIterator(read_options));
+ SeekIterator(iter.get(), 1, 7);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v17 == iter->value());
+
+ SeekIterator(iter.get(), 1, 3);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v13 == iter->value());
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v15 == iter->value());
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v16 == iter->value());
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v17 == iter->value());
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v18 == iter->value());
+
+ SeekIterator(iter.get(), 0, 0);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v02 == iter->value());
+
+ SeekIterator(iter.get(), 2, 0);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v22 == iter->value());
+
+ ASSERT_EQ(v22.ToString(), Get(db.get(), read_options, 2, 2));
+ ASSERT_EQ(v02.ToString(), Get(db.get(), read_options, 0, 2));
+ ASSERT_EQ(v13.ToString(), Get(db.get(), read_options, 1, 3));
+ ASSERT_EQ(v15.ToString(), Get(db.get(), read_options, 1, 5));
+ ASSERT_EQ(v16.ToString(), Get(db.get(), read_options, 1, 6));
+ ASSERT_EQ(v17.ToString(), Get(db.get(), read_options, 1, 7));
+ ASSERT_EQ(v18.ToString(), Get(db.get(), read_options, 1, 8));
+ }
+ }
+}
+
+// Verify that iteration stays within the seek prefix (prefix_same_as_start).
+TEST_F(PrefixTest, PrefixValid) {
+ for (int num_buckets = 1; num_buckets <= 2; num_buckets++) {
+ FirstOption();
+ while (NextOptions(num_buckets)) {
+ std::cout << "*** Mem table: " << options.memtable_factory->Name()
+ << " number of buckets: " << num_buckets << std::endl;
+ ASSERT_OK(DestroyDB(kDbName, Options()));
+ auto db = OpenDb();
+ WriteOptions write_options;
+ ReadOptions read_options;
+
+ // Insert keys with a common prefix and one key with a different prefix.
+ Slice v16("v16");
+ Slice v17("v17");
+ Slice v18("v18");
+ Slice v19("v19");
+ PutKey(db.get(), write_options, 12345, 6, v16);
+ PutKey(db.get(), write_options, 12345, 7, v17);
+ PutKey(db.get(), write_options, 12345, 8, v18);
+ PutKey(db.get(), write_options, 12345, 9, v19);
+ PutKey(db.get(), write_options, 12346, 8, v16);
+ ASSERT_OK(db->Flush(FlushOptions()));
+ TestKey test_key(12346, 8);
+ std::string s;
+ ASSERT_OK(db->Delete(write_options, TestKeyToSlice(s, test_key)));
+ ASSERT_OK(db->Flush(FlushOptions()));
+ read_options.prefix_same_as_start = true;
+ std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
+ SeekIterator(iter.get(), 12345, 6);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v16 == iter->value());
+
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v17 == iter->value());
+
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v18 == iter->value());
+
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v19 == iter->value());
+ iter->Next();
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_EQ(kNotFoundResult, Get(db.get(), read_options, 12346, 8));
+
+ // Verify seeking past the prefix won't return a result.
+ SeekIterator(iter.get(), 12345, 10);
+ ASSERT_TRUE(!iter->Valid());
+ ASSERT_OK(iter->status());
+ }
+ }
+}
+
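+// Measures Put and Seek cost (wall time and user-key comparison counts) over
+// many prefixes, including seeks to prefixes that were never inserted.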
+TEST_F(PrefixTest, DynamicPrefixIterator) {
+ while (NextOptions(FLAGS_bucket_count)) {
+ std::cout << "*** Mem table: " << options.memtable_factory->Name()
+ << std::endl;
+ ASSERT_OK(DestroyDB(kDbName, Options()));
+ auto db = OpenDb();
+ WriteOptions write_options;
+ ReadOptions read_options;
+
+ std::vector<uint64_t> prefixes;
+ for (uint64_t i = 0; i < FLAGS_total_prefixes; ++i) {
+ prefixes.push_back(i);
+ }
+
+ if (FLAGS_random_prefix) {
+ RandomShuffle(prefixes.begin(), prefixes.end());
+ }
+
+ HistogramImpl hist_put_time;
+ HistogramImpl hist_put_comparison;
+ // Insert x random prefixes, each with y consecutive elements.
+ for (auto prefix : prefixes) {
+ for (uint64_t sorted = 0; sorted < FLAGS_items_per_prefix; sorted++) {
+ TestKey test_key(prefix, sorted);
+
+ std::string s;
+ Slice key = TestKeyToSlice(s, test_key);
+ std::string value(FLAGS_value_size, 0);
+
+ get_perf_context()->Reset();
+ StopWatchNano timer(SystemClock::Default().get(), true);
+ ASSERT_OK(db->Put(write_options, key, value));
+ hist_put_time.Add(timer.ElapsedNanos());
+ hist_put_comparison.Add(get_perf_context()->user_key_comparison_count);
+ }
+ }
+
+ std::cout << "Put key comparison: \n"
+ << hist_put_comparison.ToString() << "Put time: \n"
+ << hist_put_time.ToString();
+
+ // Test seeking to existing keys.
+ HistogramImpl hist_seek_time;
+ HistogramImpl hist_seek_comparison;
+
+ std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
+
+ for (auto prefix : prefixes) {
+ TestKey test_key(prefix, FLAGS_items_per_prefix / 2);
+ std::string s;
+ Slice key = TestKeyToSlice(s, test_key);
+ std::string value = "v" + std::to_string(0);
+
+ get_perf_context()->Reset();
+ StopWatchNano timer(SystemClock::Default().get(), true);
+ auto key_prefix = options.prefix_extractor->Transform(key);
+ uint64_t total_keys = 0;
+ for (iter->Seek(key);
+ iter->Valid() && iter->key().starts_with(key_prefix); iter->Next()) {
+ if (FLAGS_trigger_deadlock) {
+ std::cout << "Behold the deadlock!\n";
+ db->Delete(write_options, iter->key());
+ }
+ total_keys++;
+ }
+ hist_seek_time.Add(timer.ElapsedNanos());
+ hist_seek_comparison.Add(get_perf_context()->user_key_comparison_count);
+ ASSERT_EQ(total_keys,
+ FLAGS_items_per_prefix - FLAGS_items_per_prefix / 2);
+ }
+
+ std::cout << "Seek key comparison: \n"
+ << hist_seek_comparison.ToString() << "Seek time: \n"
+ << hist_seek_time.ToString();
+
+ // Test seeking to non-existing keys.
+ HistogramImpl hist_no_seek_time;
+ HistogramImpl hist_no_seek_comparison;
+
+ for (auto prefix = FLAGS_total_prefixes;
+ prefix < FLAGS_total_prefixes + 10000; prefix++) {
+ TestKey test_key(prefix, 0);
+ std::string s;
+ Slice key = TestKeyToSlice(s, test_key);
+
+ get_perf_context()->Reset();
+ StopWatchNano timer(SystemClock::Default().get(), true);
+ iter->Seek(key);
+ hist_no_seek_time.Add(timer.ElapsedNanos());
+ hist_no_seek_comparison.Add(
+ get_perf_context()->user_key_comparison_count);
+ ASSERT_TRUE(!iter->Valid());
+ ASSERT_OK(iter->status());
+ }
+
+ std::cout << "non-existing Seek key comparison: \n"
+ << hist_no_seek_comparison.ToString()
+ << "non-existing Seek time: \n"
+ << hist_no_seek_time.ToString();
+ }
+}
+
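+// Randomly mixes Put/Merge/Delete entries across two flushed files and the
+// memtable, then cross-checks Next()/Prev() within each prefix against an
+// in-memory std::map of the expected contents.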
+TEST_F(PrefixTest, PrefixSeekModePrev) {
+ // Only for SkipListFactory
+ options.memtable_factory.reset(new SkipListFactory);
+ options.merge_operator = MergeOperators::CreatePutOperator();
+ options.write_buffer_size = 1024 * 1024;
+ Random rnd(1);
+ for (size_t m = 1; m < 100; m++) {
+ std::cout << "[" + std::to_string(m) + "]" + "*** Mem table: "
+ << options.memtable_factory->Name() << std::endl;
+ ASSERT_OK(DestroyDB(kDbName, Options()));
+ auto db = OpenDb();
+ WriteOptions write_options;
+ ReadOptions read_options;
+ std::map<TestKey, std::string, TestKeyComparator> entry_maps[3], whole_map;
+ for (uint64_t i = 0; i < 10; i++) {
+ int div = i % 3 + 1;
+ for (uint64_t j = 0; j < 10; j++) {
+ whole_map[TestKey(i, j)] = entry_maps[rnd.Uniform(div)][TestKey(i, j)] =
+ 'v' + std::to_string(i) + std::to_string(j);
+ }
+ }
+
+ std::map<TestKey, std::string, TestKeyComparator> type_map;
+ for (size_t i = 0; i < 3; i++) {
+ for (auto& kv : entry_maps[i]) {
+ if (rnd.OneIn(3)) {
+ PutKey(db.get(), write_options, kv.first, kv.second);
+ type_map[kv.first] = "value";
+ } else {
+ MergeKey(db.get(), write_options, kv.first, kv.second);
+ type_map[kv.first] = "merge";
+ }
+ }
+ if (i < 2) {
+ ASSERT_OK(db->Flush(FlushOptions()));
+ }
+ }
+
+ for (size_t i = 0; i < 2; i++) {
+ for (auto& kv : entry_maps[i]) {
+ if (rnd.OneIn(10)) {
+ whole_map.erase(kv.first);
+ DeleteKey(db.get(), write_options, kv.first);
+ entry_maps[2][kv.first] = "delete";
+ }
+ }
+ }
+
+ if (FLAGS_enable_print) {
+ for (size_t i = 0; i < 3; i++) {
+ for (auto& kv : entry_maps[i]) {
+ std::cout << "[" << i << "]" << kv.first.prefix << kv.first.sorted
+ << " " << kv.second + " " + type_map[kv.first] << std::endl;
+ }
+ }
+ }
+
+ std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
+ for (uint64_t prefix = 0; prefix < 10; prefix++) {
+ uint64_t start_suffix = rnd.Uniform(9);
+ SeekIterator(iter.get(), prefix, start_suffix);
+ auto it = whole_map.find(TestKey(prefix, start_suffix));
+ if (it == whole_map.end()) {
+ continue;
+ }
+ ASSERT_NE(it, whole_map.end());
+ ASSERT_TRUE(iter->Valid());
+ if (FLAGS_enable_print) {
+ std::cout << "round " << prefix
+ << " iter: " << SliceToTestKey(iter->key()).prefix
+ << SliceToTestKey(iter->key()).sorted
+ << " | map: " << it->first.prefix << it->first.sorted << " | "
+ << iter->value().ToString() << " " << it->second << std::endl;
+ }
+ ASSERT_EQ(iter->value(), it->second);
+ uint64_t stored_prefix = prefix;
+ for (size_t k = 0; k < 9; k++) {
+ if (rnd.OneIn(2) || it == whole_map.begin()) {
+ iter->Next();
+ ++it;
+ if (FLAGS_enable_print) {
+ std::cout << "Next >> ";
+ }
+ } else {
+ iter->Prev();
+ it--;
+ if (FLAGS_enable_print) {
+ std::cout << "Prev >> ";
+ }
+ }
+ if (!iter->Valid() ||
+ SliceToTestKey(iter->key()).prefix != stored_prefix) {
+ break;
+ }
+ ASSERT_OK(iter->status());
+ stored_prefix = SliceToTestKey(iter->key()).prefix;
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_NE(it, whole_map.end());
+ ASSERT_EQ(iter->value(), it->second);
+ if (FLAGS_enable_print) {
+ std::cout << "iter: " << SliceToTestKey(iter->key()).prefix
+ << SliceToTestKey(iter->key()).sorted
+ << " | map: " << it->first.prefix << it->first.sorted
+ << " | " << iter->value().ToString() << " " << it->second
+ << std::endl;
+ }
+ }
+ }
+ }
+}
+
+TEST_F(PrefixTest, PrefixSeekModePrev2) {
+ // Only for SkipListFactory
+ // Test the following case:
+ // iter1 iter2
+ // | prefix | suffix | | prefix | suffix |
+ // | 1 | 1 | | 1 | 2 |
+ // | 1 | 3 | | 1 | 4 |
+ // | 2 | 1 | | 3 | 3 |
+ // | 2 | 2 | | 3 | 4 |
+ // After Seek(15), iter1 will be at 21 and iter2 will be at 33.
+ // Then, when Prev() is called in prefix mode, SeekForPrev(21) gets called and
+ // iter2 should become invalid because of the bloom filter.
+ options.memtable_factory.reset(new SkipListFactory);
+ options.write_buffer_size = 1024 * 1024;
+ std::string v13("v13");
+ ASSERT_OK(DestroyDB(kDbName, Options()));
+ auto db = OpenDb();
+ WriteOptions write_options;
+ ReadOptions read_options;
+ PutKey(db.get(), write_options, TestKey(1, 2), "v12");
+ PutKey(db.get(), write_options, TestKey(1, 4), "v14");
+ PutKey(db.get(), write_options, TestKey(3, 3), "v33");
+ PutKey(db.get(), write_options, TestKey(3, 4), "v34");
+ ASSERT_OK(db->Flush(FlushOptions()));
+ ASSERT_OK(
+ static_cast_with_check<DBImpl>(db.get())->TEST_WaitForFlushMemTable());
+ PutKey(db.get(), write_options, TestKey(1, 1), "v11");
+ PutKey(db.get(), write_options, TestKey(1, 3), "v13");
+ PutKey(db.get(), write_options, TestKey(2, 1), "v21");
+ PutKey(db.get(), write_options, TestKey(2, 2), "v22");
+ ASSERT_OK(db->Flush(FlushOptions()));
+ ASSERT_OK(
+ static_cast_with_check<DBImpl>(db.get())->TEST_WaitForFlushMemTable());
+ std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
+ SeekIterator(iter.get(), 1, 5);
+ iter->Prev();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->value(), v13);
+}
+
+TEST_F(PrefixTest, PrefixSeekModePrev3) {
+ // Only for SkipListFactory
+ // test SeekToLast() with iterate_upper_bound_ in prefix_seek_mode
+ options.memtable_factory.reset(new SkipListFactory);
+ options.write_buffer_size = 1024 * 1024;
+ std::string v14("v14");
+ TestKey upper_bound_key = TestKey(1, 5);
+ std::string s;
+ Slice upper_bound = TestKeyToSlice(s, upper_bound_key);
+
+ {
+ ASSERT_OK(DestroyDB(kDbName, Options()));
+ auto db = OpenDb();
+ WriteOptions write_options;
+ ReadOptions read_options;
+ read_options.iterate_upper_bound = &upper_bound;
+ PutKey(db.get(), write_options, TestKey(1, 2), "v12");
+ PutKey(db.get(), write_options, TestKey(1, 4), "v14");
+ ASSERT_OK(db->Flush(FlushOptions()));
+ ASSERT_OK(
+ static_cast_with_check<DBImpl>(db.get())->TEST_WaitForFlushMemTable());
+ PutKey(db.get(), write_options, TestKey(1, 1), "v11");
+ PutKey(db.get(), write_options, TestKey(1, 3), "v13");
+ PutKey(db.get(), write_options, TestKey(2, 1), "v21");
+ PutKey(db.get(), write_options, TestKey(2, 2), "v22");
+ ASSERT_OK(db->Flush(FlushOptions()));
+ ASSERT_OK(
+ static_cast_with_check<DBImpl>(db.get())->TEST_WaitForFlushMemTable());
+ std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
+ iter->SeekToLast();
+ ASSERT_EQ(iter->value(), v14);
+ }
+ {
+ ASSERT_OK(DestroyDB(kDbName, Options()));
+ auto db = OpenDb();
+ WriteOptions write_options;
+ ReadOptions read_options;
+ read_options.iterate_upper_bound = &upper_bound;
+ PutKey(db.get(), write_options, TestKey(1, 2), "v12");
+ PutKey(db.get(), write_options, TestKey(1, 4), "v14");
+ PutKey(db.get(), write_options, TestKey(3, 3), "v33");
+ PutKey(db.get(), write_options, TestKey(3, 4), "v34");
+ ASSERT_OK(db->Flush(FlushOptions()));
+ ASSERT_OK(
+ static_cast_with_check<DBImpl>(db.get())->TEST_WaitForFlushMemTable());
+ PutKey(db.get(), write_options, TestKey(1, 1), "v11");
+ PutKey(db.get(), write_options, TestKey(1, 3), "v13");
+ ASSERT_OK(db->Flush(FlushOptions()));
+ ASSERT_OK(
+ static_cast_with_check<DBImpl>(db.get())->TEST_WaitForFlushMemTable());
+ std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
+ iter->SeekToLast();
+ ASSERT_EQ(iter->value(), v14);
+ }
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ ParseCommandLineFlags(&argc, &argv, true);
+ return RUN_ALL_TESTS();
+}
+
+#endif // GFLAGS
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+ fprintf(stderr,
+ "SKIPPED as HashSkipList and HashLinkList are not supported in "
+ "ROCKSDB_LITE\n");
+ return 0;
+}
+
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/range_del_aggregator.cc b/src/rocksdb/db/range_del_aggregator.cc
new file mode 100644
index 000000000..c03efa11f
--- /dev/null
+++ b/src/rocksdb/db/range_del_aggregator.cc
@@ -0,0 +1,524 @@
+// Copyright (c) 2018-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/range_del_aggregator.h"
+
+#include "db/compaction/compaction_iteration_stats.h"
+#include "db/dbformat.h"
+#include "db/pinned_iterators_manager.h"
+#include "db/range_del_aggregator.h"
+#include "db/range_tombstone_fragmenter.h"
+#include "db/version_edit.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/types.h"
+#include "table/internal_iterator.h"
+#include "table/scoped_arena_iterator.h"
+#include "table/table_builder.h"
+#include "util/heap.h"
+#include "util/kv_map.h"
+#include "util/vector_iterator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+TruncatedRangeDelIterator::TruncatedRangeDelIterator(
+ std::unique_ptr<FragmentedRangeTombstoneIterator> iter,
+ const InternalKeyComparator* icmp, const InternalKey* smallest,
+ const InternalKey* largest)
+ : iter_(std::move(iter)),
+ icmp_(icmp),
+ smallest_ikey_(smallest),
+ largest_ikey_(largest) {
+ if (smallest != nullptr) {
+ pinned_bounds_.emplace_back();
+ auto& parsed_smallest = pinned_bounds_.back();
+ Status pik_status = ParseInternalKey(smallest->Encode(), &parsed_smallest,
+ false /* log_err_key */); // TODO
+ pik_status.PermitUncheckedError();
+ assert(pik_status.ok());
+ smallest_ = &parsed_smallest;
+ }
+ if (largest != nullptr) {
+ pinned_bounds_.emplace_back();
+ auto& parsed_largest = pinned_bounds_.back();
+
+ Status pik_status = ParseInternalKey(largest->Encode(), &parsed_largest,
+ false /* log_err_key */); // TODO
+ pik_status.PermitUncheckedError();
+ assert(pik_status.ok());
+
+ if (parsed_largest.type == kTypeRangeDeletion &&
+ parsed_largest.sequence == kMaxSequenceNumber) {
+ // The file boundary has been artificially extended by a range tombstone.
+ // We do not need to adjust largest to properly truncate range
+ // tombstones that extend past the boundary.
+ } else if (parsed_largest.sequence == 0) {
+ // The largest key in the sstable has a sequence number of 0. Since we
+ // guarantee that no internal keys with the same user key and sequence
+ // number can exist in a DB, we know that the largest key in this sstable
+ // cannot exist as the smallest key in the next sstable. This further
+ // implies that no range tombstone in this sstable covers largest;
+ // otherwise, the file boundary would have been artificially extended.
+ //
+ // Therefore, we will never truncate a range tombstone at largest, so we
+ // can leave it unchanged.
+ } else {
+ // The same user key may straddle two sstable boundaries. To ensure that
+ // the truncated end key can cover the largest key in this sstable, reduce
+ // its sequence number by 1.
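+ // For example, if largest is (k, 5, kTypeValue), the truncated end key
+ // becomes (k, 4, kValueTypeForSeek), which still sorts after (k, 5, ...)
+ // because sequence numbers are ordered descending within a user key.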
+ parsed_largest.sequence -= 1;
+ // This line is not needed for correctness, but it ensures that the
+ // truncated end key is not covering keys from the next SST file.
+ parsed_largest.type = kValueTypeForSeek;
+ }
+ largest_ = &parsed_largest;
+ }
+}
+
+bool TruncatedRangeDelIterator::Valid() const {
+ assert(iter_ != nullptr);
+ return iter_->Valid() &&
+ (smallest_ == nullptr ||
+ icmp_->Compare(*smallest_, iter_->parsed_end_key()) < 0) &&
+ (largest_ == nullptr ||
+ icmp_->Compare(iter_->parsed_start_key(), *largest_) < 0);
+}
+
+// NOTE: target is a user key, with timestamp if enabled.
+void TruncatedRangeDelIterator::Seek(const Slice& target) {
+ if (largest_ != nullptr &&
+ icmp_->Compare(*largest_, ParsedInternalKey(target, kMaxSequenceNumber,
+ kTypeRangeDeletion)) <= 0) {
+ iter_->Invalidate();
+ return;
+ }
+ if (smallest_ != nullptr &&
+ icmp_->user_comparator()->Compare(target, smallest_->user_key) < 0) {
+ iter_->Seek(smallest_->user_key);
+ return;
+ }
+ iter_->Seek(target);
+}
+
+// NOTE: target is a user key, with timestamp if enabled.
+void TruncatedRangeDelIterator::SeekForPrev(const Slice& target) {
+ if (smallest_ != nullptr &&
+ icmp_->Compare(ParsedInternalKey(target, 0, kTypeRangeDeletion),
+ *smallest_) < 0) {
+ iter_->Invalidate();
+ return;
+ }
+ if (largest_ != nullptr &&
+ icmp_->user_comparator()->Compare(largest_->user_key, target) < 0) {
+ iter_->SeekForPrev(largest_->user_key);
+ return;
+ }
+ iter_->SeekForPrev(target);
+}
+
+void TruncatedRangeDelIterator::SeekToFirst() {
+ if (smallest_ != nullptr) {
+ iter_->Seek(smallest_->user_key);
+ return;
+ }
+ iter_->SeekToTopFirst();
+}
+
+void TruncatedRangeDelIterator::SeekToLast() {
+ if (largest_ != nullptr) {
+ iter_->SeekForPrev(largest_->user_key);
+ return;
+ }
+ iter_->SeekToTopLast();
+}
+
+std::map<SequenceNumber, std::unique_ptr<TruncatedRangeDelIterator>>
+TruncatedRangeDelIterator::SplitBySnapshot(
+ const std::vector<SequenceNumber>& snapshots) {
+ using FragmentedIterPair =
+ std::pair<const SequenceNumber,
+ std::unique_ptr<FragmentedRangeTombstoneIterator>>;
+
+ auto split_untruncated_iters = iter_->SplitBySnapshot(snapshots);
+ std::map<SequenceNumber, std::unique_ptr<TruncatedRangeDelIterator>>
+ split_truncated_iters;
+ std::for_each(
+ split_untruncated_iters.begin(), split_untruncated_iters.end(),
+ [&](FragmentedIterPair& iter_pair) {
+ auto truncated_iter = std::make_unique<TruncatedRangeDelIterator>(
+ std::move(iter_pair.second), icmp_, smallest_ikey_, largest_ikey_);
+ split_truncated_iters.emplace(iter_pair.first,
+ std::move(truncated_iter));
+ });
+ return split_truncated_iters;
+}
+
+ForwardRangeDelIterator::ForwardRangeDelIterator(
+ const InternalKeyComparator* icmp)
+ : icmp_(icmp),
+ unused_idx_(0),
+ active_seqnums_(SeqMaxComparator()),
+ active_iters_(EndKeyMinComparator(icmp)),
+ inactive_iters_(StartKeyMinComparator(icmp)) {}
+
+bool ForwardRangeDelIterator::ShouldDelete(const ParsedInternalKey& parsed) {
+ // Move active iterators that end before parsed.
+ while (!active_iters_.empty() &&
+ icmp_->Compare((*active_iters_.top())->end_key(), parsed) <= 0) {
+ TruncatedRangeDelIterator* iter = PopActiveIter();
+ do {
+ iter->Next();
+ } while (iter->Valid() && icmp_->Compare(iter->end_key(), parsed) <= 0);
+ PushIter(iter, parsed);
+ assert(active_iters_.size() == active_seqnums_.size());
+ }
+
+ // Move inactive iterators that start before parsed.
+ while (!inactive_iters_.empty() &&
+ icmp_->Compare(inactive_iters_.top()->start_key(), parsed) <= 0) {
+ TruncatedRangeDelIterator* iter = PopInactiveIter();
+ while (iter->Valid() && icmp_->Compare(iter->end_key(), parsed) <= 0) {
+ iter->Next();
+ }
+ PushIter(iter, parsed);
+ assert(active_iters_.size() == active_seqnums_.size());
+ }
+
+ return active_seqnums_.empty()
+ ? false
+ : (*active_seqnums_.begin())->seq() > parsed.sequence;
+}
+
+void ForwardRangeDelIterator::Invalidate() {
+ unused_idx_ = 0;
+ active_iters_.clear();
+ active_seqnums_.clear();
+ inactive_iters_.clear();
+}
+
+ReverseRangeDelIterator::ReverseRangeDelIterator(
+ const InternalKeyComparator* icmp)
+ : icmp_(icmp),
+ unused_idx_(0),
+ active_seqnums_(SeqMaxComparator()),
+ active_iters_(StartKeyMaxComparator(icmp)),
+ inactive_iters_(EndKeyMaxComparator(icmp)) {}
+
+bool ReverseRangeDelIterator::ShouldDelete(const ParsedInternalKey& parsed) {
+ // Move active iterators that start after parsed.
+ while (!active_iters_.empty() &&
+ icmp_->Compare(parsed, (*active_iters_.top())->start_key()) < 0) {
+ TruncatedRangeDelIterator* iter = PopActiveIter();
+ do {
+ iter->Prev();
+ } while (iter->Valid() && icmp_->Compare(parsed, iter->start_key()) < 0);
+ PushIter(iter, parsed);
+ assert(active_iters_.size() == active_seqnums_.size());
+ }
+
+ // Move inactive iterators that end after parsed.
+ while (!inactive_iters_.empty() &&
+ icmp_->Compare(parsed, inactive_iters_.top()->end_key()) < 0) {
+ TruncatedRangeDelIterator* iter = PopInactiveIter();
+ while (iter->Valid() && icmp_->Compare(parsed, iter->start_key()) < 0) {
+ iter->Prev();
+ }
+ PushIter(iter, parsed);
+ assert(active_iters_.size() == active_seqnums_.size());
+ }
+
+ return active_seqnums_.empty()
+ ? false
+ : (*active_seqnums_.begin())->seq() > parsed.sequence;
+}
+
+void ReverseRangeDelIterator::Invalidate() {
+ unused_idx_ = 0;
+ active_iters_.clear();
+ active_seqnums_.clear();
+ inactive_iters_.clear();
+}
+
+bool RangeDelAggregator::StripeRep::ShouldDelete(
+ const ParsedInternalKey& parsed, RangeDelPositioningMode mode) {
+ if (!InStripe(parsed.sequence) || IsEmpty()) {
+ return false;
+ }
+ switch (mode) {
+ case RangeDelPositioningMode::kForwardTraversal:
+ InvalidateReverseIter();
+
+ // Pick up previously unseen iterators.
+ for (auto it = std::next(iters_.begin(), forward_iter_.UnusedIdx());
+ it != iters_.end(); ++it, forward_iter_.IncUnusedIdx()) {
+ auto& iter = *it;
+ forward_iter_.AddNewIter(iter.get(), parsed);
+ }
+
+ return forward_iter_.ShouldDelete(parsed);
+ case RangeDelPositioningMode::kBackwardTraversal:
+ InvalidateForwardIter();
+
+ // Pick up previously unseen iterators.
+ for (auto it = std::next(iters_.begin(), reverse_iter_.UnusedIdx());
+ it != iters_.end(); ++it, reverse_iter_.IncUnusedIdx()) {
+ auto& iter = *it;
+ reverse_iter_.AddNewIter(iter.get(), parsed);
+ }
+
+ return reverse_iter_.ShouldDelete(parsed);
+ default:
+ assert(false);
+ return false;
+ }
+}
+
+bool RangeDelAggregator::StripeRep::IsRangeOverlapped(const Slice& start,
+ const Slice& end) {
+ Invalidate();
+
+ // Set the internal start/end keys so that:
+ // - if start_ikey has the same user key and sequence number as the
+ // current end key, start_ikey will be considered greater; and
+ // - if end_ikey has the same user key and sequence number as the current
+ // start key, end_ikey will be considered greater.
+ ParsedInternalKey start_ikey(start, kMaxSequenceNumber,
+ static_cast<ValueType>(0));
+ ParsedInternalKey end_ikey(end, 0, static_cast<ValueType>(0));
+ for (auto& iter : iters_) {
+ bool checked_candidate_tombstones = false;
+ for (iter->SeekForPrev(start);
+ iter->Valid() && icmp_->Compare(iter->start_key(), end_ikey) <= 0;
+ iter->Next()) {
+ checked_candidate_tombstones = true;
+ if (icmp_->Compare(start_ikey, iter->end_key()) < 0 &&
+ icmp_->Compare(iter->start_key(), end_ikey) <= 0) {
+ return true;
+ }
+ }
+
+ if (!checked_candidate_tombstones) {
+ // Do an additional check for when the end of the range is the begin
+ // key of a tombstone, which we missed earlier since SeekForPrev'ing
+ // to the start was invalid.
+ iter->SeekForPrev(end);
+ if (iter->Valid() && icmp_->Compare(start_ikey, iter->end_key()) < 0 &&
+ icmp_->Compare(iter->start_key(), end_ikey) <= 0) {
+ return true;
+ }
+ }
+ }
+ return false;
+}
+
+void ReadRangeDelAggregator::AddTombstones(
+ std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter,
+ const InternalKey* smallest, const InternalKey* largest) {
+ if (input_iter == nullptr || input_iter->empty()) {
+ return;
+ }
+ rep_.AddTombstones(std::make_unique<TruncatedRangeDelIterator>(
+ std::move(input_iter), icmp_, smallest, largest));
+}
+
+bool ReadRangeDelAggregator::ShouldDeleteImpl(const ParsedInternalKey& parsed,
+ RangeDelPositioningMode mode) {
+ return rep_.ShouldDelete(parsed, mode);
+}
+
+bool ReadRangeDelAggregator::IsRangeOverlapped(const Slice& start,
+ const Slice& end) {
+ InvalidateRangeDelMapPositions();
+ return rep_.IsRangeOverlapped(start, end);
+}
+
+void CompactionRangeDelAggregator::AddTombstones(
+ std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter,
+ const InternalKey* smallest, const InternalKey* largest) {
+ if (input_iter == nullptr || input_iter->empty()) {
+ return;
+ }
+ // This bounds output of CompactionRangeDelAggregator::NewIterator.
+ if (!trim_ts_.empty()) {
+ assert(icmp_->user_comparator()->timestamp_size() > 0);
+ input_iter->SetTimestampUpperBound(&trim_ts_);
+ }
+
+ assert(input_iter->lower_bound() == 0);
+ assert(input_iter->upper_bound() == kMaxSequenceNumber);
+ parent_iters_.emplace_back(new TruncatedRangeDelIterator(
+ std::move(input_iter), icmp_, smallest, largest));
+
+ Slice* ts_upper_bound = nullptr;
+ if (!ts_upper_bound_.empty()) {
+ assert(icmp_->user_comparator()->timestamp_size() > 0);
+ ts_upper_bound = &ts_upper_bound_;
+ }
+ auto split_iters = parent_iters_.back()->SplitBySnapshot(*snapshots_);
+ for (auto& split_iter : split_iters) {
+ auto it = reps_.find(split_iter.first);
+ if (it == reps_.end()) {
+ bool inserted;
+ SequenceNumber upper_bound = split_iter.second->upper_bound();
+ SequenceNumber lower_bound = split_iter.second->lower_bound();
+ std::tie(it, inserted) = reps_.emplace(
+ split_iter.first, StripeRep(icmp_, upper_bound, lower_bound));
+ assert(inserted);
+ }
+ assert(it != reps_.end());
+ // ts_upper_bound is used to bound ShouldDelete() to only consider
+ // range tombstones under full_history_ts_low_ and trim_ts_. Keys covered by
+ // range tombstones that are above full_history_ts_low_ should not be
+ // dropped prematurely: user may read with a timestamp between the range
+ // tombstone and the covered key. Note that we cannot set timestamp
+ // upperbound on the original `input_iter` since `input_iter`s are later
+ // used in CompactionRangeDelAggregator::NewIterator to output range
+ // tombstones for persistence. We do not want to only persist range
+ // tombstones with timestamp lower than ts_upper_bound.
+ split_iter.second->SetTimestampUpperBound(ts_upper_bound);
+ it->second.AddTombstones(std::move(split_iter.second));
+ }
+}
+
+bool CompactionRangeDelAggregator::ShouldDelete(const ParsedInternalKey& parsed,
+ RangeDelPositioningMode mode) {
+ auto it = reps_.lower_bound(parsed.sequence);
+ if (it == reps_.end()) {
+ return false;
+ }
+ return it->second.ShouldDelete(parsed, mode);
+}
+
+namespace {
+
+// Produce a sorted (by start internal key) stream of range tombstones from
+// `children`. A lower_bound and an upper_bound on the user key can optionally
+// be specified. Range tombstones that end before lower_bound or start after
+// upper_bound are excluded.
+// If user-defined timestamps are enabled, lower_bound and upper_bound should
+// contain a timestamp, but comparisons are done ignoring timestamps.
+class TruncatedRangeDelMergingIter : public InternalIterator {
+ public:
+ TruncatedRangeDelMergingIter(
+ const InternalKeyComparator* icmp, const Slice* lower_bound,
+ const Slice* upper_bound, bool upper_bound_inclusive,
+ const std::vector<std::unique_ptr<TruncatedRangeDelIterator>>& children)
+ : icmp_(icmp),
+ lower_bound_(lower_bound),
+ upper_bound_(upper_bound),
+ upper_bound_inclusive_(upper_bound_inclusive),
+ heap_(StartKeyMinComparator(icmp)),
+ ts_sz_(icmp_->user_comparator()->timestamp_size()) {
+ for (auto& child : children) {
+ if (child != nullptr) {
+ assert(child->lower_bound() == 0);
+ assert(child->upper_bound() == kMaxSequenceNumber);
+ children_.push_back(child.get());
+ }
+ }
+ }
+
+ bool Valid() const override {
+ return !heap_.empty() && BeforeEndKey(heap_.top());
+ }
+ Status status() const override { return Status::OK(); }
+
+ void SeekToFirst() override {
+ heap_.clear();
+ for (auto& child : children_) {
+ if (lower_bound_ != nullptr) {
+ child->Seek(*lower_bound_);
+ } else {
+ child->SeekToFirst();
+ }
+ if (child->Valid()) {
+ heap_.push(child);
+ }
+ }
+ }
+
+ void Next() override {
+ auto* top = heap_.top();
+ top->InternalNext();
+ if (top->Valid()) {
+ heap_.replace_top(top);
+ } else {
+ heap_.pop();
+ }
+ }
+
+ Slice key() const override {
+ auto* top = heap_.top();
+ if (ts_sz_) {
+ cur_start_key_.Set(top->start_key().user_key, top->seq(),
+ kTypeRangeDeletion, top->timestamp());
+ } else {
+ cur_start_key_.Set(top->start_key().user_key, top->seq(),
+ kTypeRangeDeletion);
+ }
+ assert(top->start_key().user_key.size() >= ts_sz_);
+ return cur_start_key_.Encode();
+ }
+
+ Slice value() const override {
+ auto* top = heap_.top();
+ if (!ts_sz_) {
+ return top->end_key().user_key;
+ }
+ assert(top->timestamp().size() == ts_sz_);
+ cur_end_key_.clear();
+ cur_end_key_.append(top->end_key().user_key.data(),
+ top->end_key().user_key.size() - ts_sz_);
+ cur_end_key_.append(top->timestamp().data(), ts_sz_);
+ return cur_end_key_;
+ }
+
+ // Unused InternalIterator methods
+ void Prev() override { assert(false); }
+ void Seek(const Slice& /* target */) override { assert(false); }
+ void SeekForPrev(const Slice& /* target */) override { assert(false); }
+ void SeekToLast() override { assert(false); }
+
+ private:
+ bool BeforeEndKey(const TruncatedRangeDelIterator* iter) const {
+ if (upper_bound_ == nullptr) {
+ return true;
+ }
+ int cmp = icmp_->user_comparator()->CompareWithoutTimestamp(
+ iter->start_key().user_key, *upper_bound_);
+ return upper_bound_inclusive_ ? cmp <= 0 : cmp < 0;
+ }
+
+ const InternalKeyComparator* icmp_;
+ const Slice* lower_bound_;
+ const Slice* upper_bound_;
+ bool upper_bound_inclusive_;
+ BinaryHeap<TruncatedRangeDelIterator*, StartKeyMinComparator> heap_;
+ std::vector<TruncatedRangeDelIterator*> children_;
+
+ mutable InternalKey cur_start_key_;
+ mutable std::string cur_end_key_;
+ size_t ts_sz_;
+};
+
+} // anonymous namespace
+
+std::unique_ptr<FragmentedRangeTombstoneIterator>
+CompactionRangeDelAggregator::NewIterator(const Slice* lower_bound,
+ const Slice* upper_bound,
+ bool upper_bound_inclusive) {
+ InvalidateRangeDelMapPositions();
+ auto merging_iter = std::make_unique<TruncatedRangeDelMergingIter>(
+ icmp_, lower_bound, upper_bound, upper_bound_inclusive, parent_iters_);
+
+ auto fragmented_tombstone_list =
+ std::make_shared<FragmentedRangeTombstoneList>(
+ std::move(merging_iter), *icmp_, true /* for_compaction */,
+ *snapshots_);
+
+ return std::make_unique<FragmentedRangeTombstoneIterator>(
+ fragmented_tombstone_list, *icmp_, kMaxSequenceNumber /* upper_bound */);
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/range_del_aggregator.h b/src/rocksdb/db/range_del_aggregator.h
new file mode 100644
index 000000000..9bd40967d
--- /dev/null
+++ b/src/rocksdb/db/range_del_aggregator.h
@@ -0,0 +1,476 @@
+// Copyright (c) 2018-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <algorithm>
+#include <iterator>
+#include <list>
+#include <map>
+#include <set>
+#include <string>
+#include <vector>
+
+#include "db/compaction/compaction_iteration_stats.h"
+#include "db/dbformat.h"
+#include "db/pinned_iterators_manager.h"
+#include "db/range_del_aggregator.h"
+#include "db/range_tombstone_fragmenter.h"
+#include "db/version_edit.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/types.h"
+#include "table/internal_iterator.h"
+#include "table/scoped_arena_iterator.h"
+#include "table/table_builder.h"
+#include "util/heap.h"
+#include "util/kv_map.h"
+
+namespace ROCKSDB_NAMESPACE {
+
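+// Wraps a FragmentedRangeTombstoneIterator and truncates its tombstones to an
+// sstable's [smallest, largest] internal key range, so that tombstones split
+// across file boundaries do not appear to cover keys outside this file.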
+class TruncatedRangeDelIterator {
+ public:
+ TruncatedRangeDelIterator(
+ std::unique_ptr<FragmentedRangeTombstoneIterator> iter,
+ const InternalKeyComparator* icmp, const InternalKey* smallest,
+ const InternalKey* largest);
+
+ bool Valid() const;
+
+ void Next() { iter_->TopNext(); }
+ void Prev() { iter_->TopPrev(); }
+
+ void InternalNext() { iter_->Next(); }
+
+ // Seeks to the tombstone with the highest visible sequence number that covers
+ // target (a user key). If no such tombstone exists, the position will be at
+ // the earliest tombstone that ends after target.
+ // REQUIRES: target is a user key.
+ void Seek(const Slice& target);
+
+ // Seeks to the tombstone with the highest visible sequence number that covers
+ // target (a user key). If no such tombstone exists, the position will be at
+ // the latest tombstone that starts before target.
+ void SeekForPrev(const Slice& target);
+
+ void SeekToFirst();
+ void SeekToLast();
+
+ ParsedInternalKey start_key() const {
+ return (smallest_ == nullptr ||
+ icmp_->Compare(*smallest_, iter_->parsed_start_key()) <= 0)
+ ? iter_->parsed_start_key()
+ : *smallest_;
+ }
+
+ ParsedInternalKey end_key() const {
+ return (largest_ == nullptr ||
+ icmp_->Compare(iter_->parsed_end_key(), *largest_) <= 0)
+ ? iter_->parsed_end_key()
+ : *largest_;
+ }
+
+ SequenceNumber seq() const { return iter_->seq(); }
+ Slice timestamp() const {
+ assert(icmp_->user_comparator()->timestamp_size());
+ return iter_->timestamp();
+ }
+ void SetTimestampUpperBound(const Slice* ts_upper_bound) {
+ iter_->SetTimestampUpperBound(ts_upper_bound);
+ }
+
+ std::map<SequenceNumber, std::unique_ptr<TruncatedRangeDelIterator>>
+ SplitBySnapshot(const std::vector<SequenceNumber>& snapshots);
+
+ SequenceNumber upper_bound() const { return iter_->upper_bound(); }
+
+ SequenceNumber lower_bound() const { return iter_->lower_bound(); }
+
+ private:
+ std::unique_ptr<FragmentedRangeTombstoneIterator> iter_;
+ const InternalKeyComparator* icmp_;
+ const ParsedInternalKey* smallest_ = nullptr;
+ const ParsedInternalKey* largest_ = nullptr;
+ std::list<ParsedInternalKey> pinned_bounds_;
+
+ const InternalKey* smallest_ikey_;
+ const InternalKey* largest_ikey_;
+};
+
+struct SeqMaxComparator {
+ bool operator()(const TruncatedRangeDelIterator* a,
+ const TruncatedRangeDelIterator* b) const {
+ return a->seq() > b->seq();
+ }
+};
+
+struct StartKeyMinComparator {
+ explicit StartKeyMinComparator(const InternalKeyComparator* c) : icmp(c) {}
+
+ bool operator()(const TruncatedRangeDelIterator* a,
+ const TruncatedRangeDelIterator* b) const {
+ return icmp->Compare(a->start_key(), b->start_key()) > 0;
+ }
+
+ const InternalKeyComparator* icmp;
+};
+
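+// Answers ShouldDelete() queries for keys visited in ascending internal-key
+// order. Iterators whose current tombstone starts at or before the traversal
+// position are kept in the active heap/seqnum set; the rest wait in the
+// inactive heap until the traversal reaches their start key. ShouldDelete()
+// returns true when the largest active tombstone sequence number exceeds the
+// key's sequence number.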
+class ForwardRangeDelIterator {
+ public:
+ explicit ForwardRangeDelIterator(const InternalKeyComparator* icmp);
+
+ bool ShouldDelete(const ParsedInternalKey& parsed);
+ void Invalidate();
+
+ void AddNewIter(TruncatedRangeDelIterator* iter,
+ const ParsedInternalKey& parsed) {
+ iter->Seek(parsed.user_key);
+ PushIter(iter, parsed);
+ assert(active_iters_.size() == active_seqnums_.size());
+ }
+
+ size_t UnusedIdx() const { return unused_idx_; }
+ void IncUnusedIdx() { unused_idx_++; }
+
+ private:
+ using ActiveSeqSet =
+ std::multiset<TruncatedRangeDelIterator*, SeqMaxComparator>;
+
+ struct EndKeyMinComparator {
+ explicit EndKeyMinComparator(const InternalKeyComparator* c) : icmp(c) {}
+
+ bool operator()(const ActiveSeqSet::const_iterator& a,
+ const ActiveSeqSet::const_iterator& b) const {
+ return icmp->Compare((*a)->end_key(), (*b)->end_key()) > 0;
+ }
+
+ const InternalKeyComparator* icmp;
+ };
+
+ void PushIter(TruncatedRangeDelIterator* iter,
+ const ParsedInternalKey& parsed) {
+ if (!iter->Valid()) {
+ // The iterator has been fully consumed, so we don't need to add it to
+ // either of the heaps.
+ return;
+ }
+ int cmp = icmp_->Compare(parsed, iter->start_key());
+ if (cmp < 0) {
+ PushInactiveIter(iter);
+ } else {
+ PushActiveIter(iter);
+ }
+ }
+
+ void PushActiveIter(TruncatedRangeDelIterator* iter) {
+ auto seq_pos = active_seqnums_.insert(iter);
+ active_iters_.push(seq_pos);
+ }
+
+ TruncatedRangeDelIterator* PopActiveIter() {
+ auto active_top = active_iters_.top();
+ auto iter = *active_top;
+ active_iters_.pop();
+ active_seqnums_.erase(active_top);
+ return iter;
+ }
+
+ void PushInactiveIter(TruncatedRangeDelIterator* iter) {
+ inactive_iters_.push(iter);
+ }
+
+ TruncatedRangeDelIterator* PopInactiveIter() {
+ auto* iter = inactive_iters_.top();
+ inactive_iters_.pop();
+ return iter;
+ }
+
+ const InternalKeyComparator* icmp_;
+ size_t unused_idx_;
+ ActiveSeqSet active_seqnums_;
+ BinaryHeap<ActiveSeqSet::const_iterator, EndKeyMinComparator> active_iters_;
+ BinaryHeap<TruncatedRangeDelIterator*, StartKeyMinComparator> inactive_iters_;
+};
+
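+// Mirror of ForwardRangeDelIterator for keys visited in descending
+// internal-key order.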
+class ReverseRangeDelIterator {
+ public:
+ explicit ReverseRangeDelIterator(const InternalKeyComparator* icmp);
+
+ bool ShouldDelete(const ParsedInternalKey& parsed);
+ void Invalidate();
+
+ void AddNewIter(TruncatedRangeDelIterator* iter,
+ const ParsedInternalKey& parsed) {
+ iter->SeekForPrev(parsed.user_key);
+ PushIter(iter, parsed);
+ assert(active_iters_.size() == active_seqnums_.size());
+ }
+
+ size_t UnusedIdx() const { return unused_idx_; }
+ void IncUnusedIdx() { unused_idx_++; }
+
+ private:
+ using ActiveSeqSet =
+ std::multiset<TruncatedRangeDelIterator*, SeqMaxComparator>;
+
+ struct EndKeyMaxComparator {
+ explicit EndKeyMaxComparator(const InternalKeyComparator* c) : icmp(c) {}
+
+ bool operator()(const TruncatedRangeDelIterator* a,
+ const TruncatedRangeDelIterator* b) const {
+ return icmp->Compare(a->end_key(), b->end_key()) < 0;
+ }
+
+ const InternalKeyComparator* icmp;
+ };
+ struct StartKeyMaxComparator {
+ explicit StartKeyMaxComparator(const InternalKeyComparator* c) : icmp(c) {}
+
+ bool operator()(const ActiveSeqSet::const_iterator& a,
+ const ActiveSeqSet::const_iterator& b) const {
+ return icmp->Compare((*a)->start_key(), (*b)->start_key()) < 0;
+ }
+
+ const InternalKeyComparator* icmp;
+ };
+
+ void PushIter(TruncatedRangeDelIterator* iter,
+ const ParsedInternalKey& parsed) {
+ if (!iter->Valid()) {
+ // The iterator has been fully consumed, so we don't need to add it to
+ // either of the heaps.
+ } else if (icmp_->Compare(iter->end_key(), parsed) <= 0) {
+ PushInactiveIter(iter);
+ } else {
+ PushActiveIter(iter);
+ }
+ }
+
+ void PushActiveIter(TruncatedRangeDelIterator* iter) {
+ auto seq_pos = active_seqnums_.insert(iter);
+ active_iters_.push(seq_pos);
+ }
+
+ TruncatedRangeDelIterator* PopActiveIter() {
+ auto active_top = active_iters_.top();
+ auto iter = *active_top;
+ active_iters_.pop();
+ active_seqnums_.erase(active_top);
+ return iter;
+ }
+
+ void PushInactiveIter(TruncatedRangeDelIterator* iter) {
+ inactive_iters_.push(iter);
+ }
+
+ TruncatedRangeDelIterator* PopInactiveIter() {
+ auto* iter = inactive_iters_.top();
+ inactive_iters_.pop();
+ return iter;
+ }
+
+ const InternalKeyComparator* icmp_;
+ size_t unused_idx_;
+ ActiveSeqSet active_seqnums_;
+ BinaryHeap<ActiveSeqSet::const_iterator, StartKeyMaxComparator> active_iters_;
+ BinaryHeap<TruncatedRangeDelIterator*, EndKeyMaxComparator> inactive_iters_;
+};
+
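+// Indicates the order in which keys will be passed to ShouldDelete():
+// ascending internal-key order (forward) or descending order (backward).
+// Switching direction invalidates the positioning state of the other
+// direction.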
+enum class RangeDelPositioningMode { kForwardTraversal, kBackwardTraversal };
+class RangeDelAggregator {
+ public:
+ explicit RangeDelAggregator(const InternalKeyComparator* icmp)
+ : icmp_(icmp) {}
+ virtual ~RangeDelAggregator() {}
+
+ virtual void AddTombstones(
+ std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter,
+ const InternalKey* smallest = nullptr,
+ const InternalKey* largest = nullptr) = 0;
+
+ bool ShouldDelete(const Slice& ikey, RangeDelPositioningMode mode) {
+ ParsedInternalKey parsed;
+
+ Status pik_status =
+ ParseInternalKey(ikey, &parsed, false /* log_err_key */); // TODO
+ assert(pik_status.ok());
+ if (!pik_status.ok()) {
+ return false;
+ }
+
+ return ShouldDelete(parsed, mode);
+ }
+ virtual bool ShouldDelete(const ParsedInternalKey& parsed,
+ RangeDelPositioningMode mode) = 0;
+
+ virtual void InvalidateRangeDelMapPositions() = 0;
+
+ virtual bool IsEmpty() const = 0;
+
+ bool AddFile(uint64_t file_number) {
+ return files_seen_.insert(file_number).second;
+ }
+
+ protected:
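+ // A set of truncated tombstone iterators restricted to a single sequence
+ // number stripe [lower_bound, upper_bound], with forward and reverse
+ // positioning state for ShouldDelete().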
+ class StripeRep {
+ public:
+ StripeRep(const InternalKeyComparator* icmp, SequenceNumber upper_bound,
+ SequenceNumber lower_bound)
+ : icmp_(icmp),
+ forward_iter_(icmp),
+ reverse_iter_(icmp),
+ upper_bound_(upper_bound),
+ lower_bound_(lower_bound) {}
+
+ void AddTombstones(std::unique_ptr<TruncatedRangeDelIterator> input_iter) {
+ iters_.push_back(std::move(input_iter));
+ }
+
+ bool IsEmpty() const { return iters_.empty(); }
+
+ bool ShouldDelete(const ParsedInternalKey& parsed,
+ RangeDelPositioningMode mode);
+
+ void Invalidate() {
+ if (!IsEmpty()) {
+ InvalidateForwardIter();
+ InvalidateReverseIter();
+ }
+ }
+
+ // If user-defined timestamp is enabled, `start` and `end` are user keys
+ // with timestamp.
+ bool IsRangeOverlapped(const Slice& start, const Slice& end);
+
+ private:
+ bool InStripe(SequenceNumber seq) const {
+ return lower_bound_ <= seq && seq <= upper_bound_;
+ }
+
+ void InvalidateForwardIter() { forward_iter_.Invalidate(); }
+
+ void InvalidateReverseIter() { reverse_iter_.Invalidate(); }
+
+ const InternalKeyComparator* icmp_;
+ std::vector<std::unique_ptr<TruncatedRangeDelIterator>> iters_;
+ ForwardRangeDelIterator forward_iter_;
+ ReverseRangeDelIterator reverse_iter_;
+ SequenceNumber upper_bound_;
+ SequenceNumber lower_bound_;
+ };
+
+ const InternalKeyComparator* icmp_;
+
+ private:
+ std::set<uint64_t> files_seen_;
+};
+
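+// Aggregator for read paths: a single stripe covers sequence numbers
+// [0, upper_bound], i.e. every tombstone visible at the read sequence number.
+// Minimal usage sketch (placeholder names, not taken from this file):
+//
+//   ReadRangeDelAggregator agg(&icmp, read_seq);
+//   agg.AddTombstones(std::move(fragmented_tombstone_iter));
+//   if (agg.ShouldDelete(ikey, RangeDelPositioningMode::kForwardTraversal)) {
+//     // ikey is covered by a newer range tombstone and can be skipped.
+//   }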
+class ReadRangeDelAggregator final : public RangeDelAggregator {
+ public:
+ ReadRangeDelAggregator(const InternalKeyComparator* icmp,
+ SequenceNumber upper_bound)
+ : RangeDelAggregator(icmp),
+ rep_(icmp, upper_bound, 0 /* lower_bound */) {}
+ ~ReadRangeDelAggregator() override {}
+
+ using RangeDelAggregator::ShouldDelete;
+ void AddTombstones(
+ std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter,
+ const InternalKey* smallest = nullptr,
+ const InternalKey* largest = nullptr) override;
+
+ bool ShouldDelete(const ParsedInternalKey& parsed,
+ RangeDelPositioningMode mode) final override {
+ if (rep_.IsEmpty()) {
+ return false;
+ }
+ return ShouldDeleteImpl(parsed, mode);
+ }
+
+ bool IsRangeOverlapped(const Slice& start, const Slice& end);
+
+ void InvalidateRangeDelMapPositions() override { rep_.Invalidate(); }
+
+ bool IsEmpty() const override { return rep_.IsEmpty(); }
+
+ private:
+ StripeRep rep_;
+
+ bool ShouldDeleteImpl(const ParsedInternalKey& parsed,
+ RangeDelPositioningMode mode);
+};
+
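+// Aggregator for compactions: added tombstones are split by snapshot into
+// per-stripe StripeReps, so ShouldDelete() only consults tombstones in the
+// same snapshot stripe as the key. NewIterator() re-emits all tombstones for
+// persistence in the compaction output.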
+class CompactionRangeDelAggregator : public RangeDelAggregator {
+ public:
+ CompactionRangeDelAggregator(const InternalKeyComparator* icmp,
+ const std::vector<SequenceNumber>& snapshots,
+ const std::string* full_history_ts_low = nullptr,
+ const std::string* trim_ts = nullptr)
+ : RangeDelAggregator(icmp), snapshots_(&snapshots) {
+ if (full_history_ts_low) {
+ ts_upper_bound_ = *full_history_ts_low;
+ }
+ if (trim_ts) {
+ trim_ts_ = *trim_ts;
+ // Range tombstones newer than `trim_ts` or `full_history_ts_low` should
+ // not be considered in ShouldDelete().
+ if (ts_upper_bound_.empty()) {
+ ts_upper_bound_ = trim_ts_;
+ } else if (!trim_ts_.empty() && icmp->user_comparator()->CompareTimestamp(
+ trim_ts_, ts_upper_bound_) < 0) {
+ ts_upper_bound_ = trim_ts_;
+ }
+ }
+ }
+ ~CompactionRangeDelAggregator() override {}
+
+ void AddTombstones(
+ std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter,
+ const InternalKey* smallest = nullptr,
+ const InternalKey* largest = nullptr) override;
+
+ using RangeDelAggregator::ShouldDelete;
+ bool ShouldDelete(const ParsedInternalKey& parsed,
+ RangeDelPositioningMode mode) override;
+
+ bool IsRangeOverlapped(const Slice& start, const Slice& end);
+
+ void InvalidateRangeDelMapPositions() override {
+ for (auto& rep : reps_) {
+ rep.second.Invalidate();
+ }
+ }
+
+ bool IsEmpty() const override {
+ for (const auto& rep : reps_) {
+ if (!rep.second.IsEmpty()) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ // Creates an iterator over all the range tombstones in the aggregator, for
+ // use in compaction. Nullptr arguments indicate that the iterator range is
+ // unbounded.
+ // NOTE: the boundaries are used for optimization purposes to reduce the
+ // number of tombstones that are passed to the fragmenter; they do not
+ // guarantee that the resulting iterator only contains range tombstones that
+ // cover keys in the provided range. If required, these bounds must be
+ // enforced during iteration.
+ std::unique_ptr<FragmentedRangeTombstoneIterator> NewIterator(
+ const Slice* lower_bound = nullptr, const Slice* upper_bound = nullptr,
+ bool upper_bound_inclusive = false);
+
+ private:
+ std::vector<std::unique_ptr<TruncatedRangeDelIterator>> parent_iters_;
+ std::map<SequenceNumber, StripeRep> reps_;
+
+ const std::vector<SequenceNumber>* snapshots_;
+ // The minimum of full_history_ts_low and trim_ts_.
+ Slice ts_upper_bound_{};
+ Slice trim_ts_{};
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/range_del_aggregator_bench.cc b/src/rocksdb/db/range_del_aggregator_bench.cc
new file mode 100644
index 000000000..9dca707e5
--- /dev/null
+++ b/src/rocksdb/db/range_del_aggregator_bench.cc
@@ -0,0 +1,280 @@
+// Copyright (c) 2018-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef GFLAGS
+#include <cstdio>
+int main() {
+ fprintf(stderr, "Please install gflags to run rocksdb tools\n");
+ return 1;
+}
+#else
+
+#include <iomanip>
+#include <iostream>
+#include <memory>
+#include <random>
+#include <set>
+#include <string>
+#include <vector>
+
+#include "db/dbformat.h"
+#include "db/range_del_aggregator.h"
+#include "db/range_tombstone_fragmenter.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/system_clock.h"
+#include "util/coding.h"
+#include "util/gflags_compat.h"
+#include "util/random.h"
+#include "util/stop_watch.h"
+#include "util/vector_iterator.h"
+
+using GFLAGS_NAMESPACE::ParseCommandLineFlags;
+
+DEFINE_int32(num_range_tombstones, 1000, "number of range tombstones created");
+
+DEFINE_int32(num_runs, 1000, "number of test runs");
+
+DEFINE_int32(tombstone_start_upper_bound, 1000,
+ "exclusive upper bound on range tombstone start keys");
+
+DEFINE_int32(should_delete_upper_bound, 1000,
+ "exclusive upper bound on keys passed to ShouldDelete");
+
+DEFINE_double(tombstone_width_mean, 100.0, "average range tombstone width");
+
+DEFINE_double(tombstone_width_stddev, 0.0,
+ "standard deviation of range tombstone width");
+
+DEFINE_int32(seed, 0, "random number generator seed");
+
+DEFINE_int32(should_deletes_per_run, 1, "number of ShouldDelete calls per run");
+
+DEFINE_int32(add_tombstones_per_run, 1,
+ "number of AddTombstones calls per run");
+
+DEFINE_bool(use_compaction_range_del_aggregator, false,
+ "Whether to use CompactionRangeDelAggregator. Default is to use "
+ "ReadRangeDelAggregator.");
+
+namespace {
+
+struct Stats {
+ uint64_t time_add_tombstones = 0;
+ uint64_t time_first_should_delete = 0;
+ uint64_t time_rest_should_delete = 0;
+ uint64_t time_fragment_tombstones = 0;
+};
+
+std::ostream& operator<<(std::ostream& os, const Stats& s) {
+ std::ios fmt_holder(nullptr);
+ fmt_holder.copyfmt(os);
+
+ os << std::left;
+ os << std::setw(25) << "Fragment Tombstones: "
+ << s.time_fragment_tombstones /
+ (FLAGS_add_tombstones_per_run * FLAGS_num_runs * 1.0e3)
+ << " us\n";
+ os << std::setw(25) << "AddTombstones: "
+ << s.time_add_tombstones /
+ (FLAGS_add_tombstones_per_run * FLAGS_num_runs * 1.0e3)
+ << " us\n";
+ os << std::setw(25) << "ShouldDelete (first): "
+ << s.time_first_should_delete / (FLAGS_num_runs * 1.0e3) << " us\n";
+ if (FLAGS_should_deletes_per_run > 1) {
+ os << std::setw(25) << "ShouldDelete (rest): "
+ << s.time_rest_should_delete /
+ ((FLAGS_should_deletes_per_run - 1) * FLAGS_num_runs * 1.0e3)
+ << " us\n";
+ }
+
+ os.copyfmt(fmt_holder);
+ return os;
+}
+
+auto icmp = ROCKSDB_NAMESPACE::InternalKeyComparator(
+ ROCKSDB_NAMESPACE::BytewiseComparator());
+
+} // anonymous namespace
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+
+// A wrapper around a RangeTombstone that owns the underlying data of its start
+// and end keys.
+struct PersistentRangeTombstone {
+ std::string start_key;
+ std::string end_key;
+ RangeTombstone tombstone;
+
+ PersistentRangeTombstone(std::string start, std::string end,
+ SequenceNumber seq)
+ : start_key(std::move(start)), end_key(std::move(end)) {
+ tombstone = RangeTombstone(start_key, end_key, seq);
+ }
+
+ PersistentRangeTombstone() = default;
+
+ PersistentRangeTombstone(const PersistentRangeTombstone& t) { *this = t; }
+
+ PersistentRangeTombstone& operator=(const PersistentRangeTombstone& t) {
+ start_key = t.start_key;
+ end_key = t.end_key;
+ tombstone = RangeTombstone(start_key, end_key, t.tombstone.seq_);
+
+ return *this;
+ }
+
+ PersistentRangeTombstone(PersistentRangeTombstone&& t) noexcept { *this = std::move(t); }
+
+ PersistentRangeTombstone& operator=(PersistentRangeTombstone&& t) {
+ start_key = std::move(t.start_key);
+ end_key = std::move(t.end_key);
+ tombstone = RangeTombstone(start_key, end_key, t.tombstone.seq_);
+
+ return *this;
+ }
+};
+
+struct TombstoneStartKeyComparator {
+ explicit TombstoneStartKeyComparator(const Comparator* c) : cmp(c) {}
+
+ bool operator()(const RangeTombstone& a, const RangeTombstone& b) const {
+ return cmp->Compare(a.start_key_, b.start_key_) < 0;
+ }
+
+ const Comparator* cmp;
+};
+
+std::unique_ptr<InternalIterator> MakeRangeDelIterator(
+ const std::vector<PersistentRangeTombstone>& range_dels) {
+ std::vector<std::string> keys, values;
+ for (const auto& range_del : range_dels) {
+ auto key_and_value = range_del.tombstone.Serialize();
+ keys.push_back(key_and_value.first.Encode().ToString());
+ values.push_back(key_and_value.second.ToString());
+ }
+ return std::unique_ptr<VectorIterator>(
+ new VectorIterator(keys, values, &icmp));
+}
+
+// Converts an int64_t to a big-endian key so that bytewise key ordering
+// matches numeric ordering.
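+// For example, Key(1) yields the bytes 00 00 00 00 00 00 00 01 and Key(256)
+// yields 00 00 00 00 00 00 01 00, so comparing the returned keys with the
+// bytewise comparator agrees with comparing the original integers, which the
+// benchmark's tombstone bounds rely on.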
+static std::string Key(int64_t val) {
+ std::string little_endian_key;
+ std::string big_endian_key;
+ PutFixed64(&little_endian_key, val);
+ assert(little_endian_key.size() == sizeof(val));
+ big_endian_key.resize(sizeof(val));
+ for (size_t i = 0; i < sizeof(val); ++i) {
+ big_endian_key[i] = little_endian_key[sizeof(val) - 1 - i];
+ }
+ return big_endian_key;
+}
+
+} // anonymous namespace
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ParseCommandLineFlags(&argc, &argv, true);
+
+ Stats stats;
+ ROCKSDB_NAMESPACE::SystemClock* clock =
+ ROCKSDB_NAMESPACE::SystemClock::Default().get();
+ ROCKSDB_NAMESPACE::Random64 rnd(FLAGS_seed);
+ std::default_random_engine random_gen(FLAGS_seed);
+ std::normal_distribution<double> normal_dist(FLAGS_tombstone_width_mean,
+ FLAGS_tombstone_width_stddev);
+ std::vector<std::vector<ROCKSDB_NAMESPACE::PersistentRangeTombstone> >
+ all_persistent_range_tombstones(FLAGS_add_tombstones_per_run);
+ for (int i = 0; i < FLAGS_add_tombstones_per_run; i++) {
+ all_persistent_range_tombstones[i] =
+ std::vector<ROCKSDB_NAMESPACE::PersistentRangeTombstone>(
+ FLAGS_num_range_tombstones);
+ }
+ auto mode = ROCKSDB_NAMESPACE::RangeDelPositioningMode::kForwardTraversal;
+ std::vector<ROCKSDB_NAMESPACE::SequenceNumber> snapshots{0};
+ for (int i = 0; i < FLAGS_num_runs; i++) {
+ std::unique_ptr<ROCKSDB_NAMESPACE::RangeDelAggregator> range_del_agg =
+ nullptr;
+ if (FLAGS_use_compaction_range_del_aggregator) {
+ range_del_agg.reset(new ROCKSDB_NAMESPACE::CompactionRangeDelAggregator(
+ &icmp, snapshots));
+ } else {
+ range_del_agg.reset(new ROCKSDB_NAMESPACE::ReadRangeDelAggregator(
+ &icmp, ROCKSDB_NAMESPACE::kMaxSequenceNumber /* upper_bound */));
+ }
+
+ std::vector<
+ std::unique_ptr<ROCKSDB_NAMESPACE::FragmentedRangeTombstoneList> >
+ fragmented_range_tombstone_lists(FLAGS_add_tombstones_per_run);
+
+ for (auto& persistent_range_tombstones : all_persistent_range_tombstones) {
+ // TODO(abhimadan): consider whether creating the range tombstones right
+ // before AddTombstones is artificially warming the cache compared to
+ // real workloads.
+ for (int j = 0; j < FLAGS_num_range_tombstones; j++) {
+ uint64_t start = rnd.Uniform(FLAGS_tombstone_start_upper_bound);
+ uint64_t end = static_cast<uint64_t>(
+ std::round(start + std::max(1.0, normal_dist(random_gen))));
+ persistent_range_tombstones[j] =
+ ROCKSDB_NAMESPACE::PersistentRangeTombstone(
+ ROCKSDB_NAMESPACE::Key(start), ROCKSDB_NAMESPACE::Key(end), j);
+ }
+ auto iter =
+ ROCKSDB_NAMESPACE::MakeRangeDelIterator(persistent_range_tombstones);
+ ROCKSDB_NAMESPACE::StopWatchNano stop_watch_fragment_tombstones(
+ clock, true /* auto_start */);
+ fragmented_range_tombstone_lists.emplace_back(
+ new ROCKSDB_NAMESPACE::FragmentedRangeTombstoneList(
+ std::move(iter), icmp, FLAGS_use_compaction_range_del_aggregator,
+ snapshots));
+ stats.time_fragment_tombstones +=
+ stop_watch_fragment_tombstones.ElapsedNanos();
+ std::unique_ptr<ROCKSDB_NAMESPACE::FragmentedRangeTombstoneIterator>
+ fragmented_range_del_iter(
+ new ROCKSDB_NAMESPACE::FragmentedRangeTombstoneIterator(
+ fragmented_range_tombstone_lists.back().get(), icmp,
+ ROCKSDB_NAMESPACE::kMaxSequenceNumber));
+
+ ROCKSDB_NAMESPACE::StopWatchNano stop_watch_add_tombstones(
+ clock, true /* auto_start */);
+ range_del_agg->AddTombstones(std::move(fragmented_range_del_iter));
+ stats.time_add_tombstones += stop_watch_add_tombstones.ElapsedNanos();
+ }
+
+ ROCKSDB_NAMESPACE::ParsedInternalKey parsed_key;
+ parsed_key.sequence = FLAGS_num_range_tombstones / 2;
+ parsed_key.type = ROCKSDB_NAMESPACE::kTypeValue;
+
+ uint64_t first_key = rnd.Uniform(FLAGS_should_delete_upper_bound -
+ FLAGS_should_deletes_per_run + 1);
+
+ for (int j = 0; j < FLAGS_should_deletes_per_run; j++) {
+ std::string key_string = ROCKSDB_NAMESPACE::Key(first_key + j);
+ parsed_key.user_key = key_string;
+
+ ROCKSDB_NAMESPACE::StopWatchNano stop_watch_should_delete(
+ clock, true /* auto_start */);
+ range_del_agg->ShouldDelete(parsed_key, mode);
+ uint64_t call_time = stop_watch_should_delete.ElapsedNanos();
+
+ if (j == 0) {
+ stats.time_first_should_delete += call_time;
+ } else {
+ stats.time_rest_should_delete += call_time;
+ }
+ }
+ }
+
+ std::cout << "=========================\n"
+ << "Results:\n"
+ << "=========================\n"
+ << stats;
+
+ return 0;
+}
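+
+// Example invocation (the flag values are illustrative, and the binary name
+// assumes the usual convention of naming the benchmark target after this
+// file):
+//
+//   ./range_del_aggregator_bench --num_range_tombstones=10000 \
+//       --add_tombstones_per_run=4 --should_deletes_per_run=16 \
+//       --use_compaction_range_del_aggregator=true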
+
+#endif // GFLAGS
diff --git a/src/rocksdb/db/range_del_aggregator_test.cc b/src/rocksdb/db/range_del_aggregator_test.cc
new file mode 100644
index 000000000..7fe35276a
--- /dev/null
+++ b/src/rocksdb/db/range_del_aggregator_test.cc
@@ -0,0 +1,715 @@
+// Copyright (c) 2018-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/range_del_aggregator.h"
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "db/db_test_util.h"
+#include "db/dbformat.h"
+#include "db/range_tombstone_fragmenter.h"
+#include "test_util/testutil.h"
+#include "util/vector_iterator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class RangeDelAggregatorTest : public testing::Test {};
+
+namespace {
+
+static auto bytewise_icmp = InternalKeyComparator(BytewiseComparator());
+
+std::unique_ptr<InternalIterator> MakeRangeDelIter(
+ const std::vector<RangeTombstone>& range_dels) {
+ std::vector<std::string> keys, values;
+ for (const auto& range_del : range_dels) {
+ auto key_and_value = range_del.Serialize();
+ keys.push_back(key_and_value.first.Encode().ToString());
+ values.push_back(key_and_value.second.ToString());
+ }
+ return std::unique_ptr<VectorIterator>(
+ new VectorIterator(keys, values, &bytewise_icmp));
+}
+
+std::vector<std::unique_ptr<FragmentedRangeTombstoneList>>
+MakeFragmentedTombstoneLists(
+ const std::vector<std::vector<RangeTombstone>>& range_dels_list) {
+ std::vector<std::unique_ptr<FragmentedRangeTombstoneList>> fragment_lists;
+ for (const auto& range_dels : range_dels_list) {
+ auto range_del_iter = MakeRangeDelIter(range_dels);
+ fragment_lists.emplace_back(new FragmentedRangeTombstoneList(
+ std::move(range_del_iter), bytewise_icmp));
+ }
+ return fragment_lists;
+}
+
+struct TruncatedIterScanTestCase {
+ ParsedInternalKey start;
+ ParsedInternalKey end;
+ SequenceNumber seq;
+};
+
+struct TruncatedIterSeekTestCase {
+ Slice target;
+ ParsedInternalKey start;
+ ParsedInternalKey end;
+ SequenceNumber seq;
+ bool invalid;
+};
+
+struct ShouldDeleteTestCase {
+ ParsedInternalKey lookup_key;
+ bool result;
+};
+
+struct IsRangeOverlappedTestCase {
+ Slice start;
+ Slice end;
+ bool result;
+};
+
+ParsedInternalKey UncutEndpoint(const Slice& s) {
+ return ParsedInternalKey(s, kMaxSequenceNumber, kTypeRangeDeletion);
+}
+
+ParsedInternalKey InternalValue(const Slice& key, SequenceNumber seq,
+ ValueType type = kTypeValue) {
+ return ParsedInternalKey(key, seq, type);
+}
+
+void VerifyIterator(
+ TruncatedRangeDelIterator* iter, const InternalKeyComparator& icmp,
+ const std::vector<TruncatedIterScanTestCase>& expected_range_dels) {
+ // Test forward iteration.
+ iter->SeekToFirst();
+ for (size_t i = 0; i < expected_range_dels.size(); i++, iter->Next()) {
+ ASSERT_TRUE(iter->Valid());
+ EXPECT_EQ(0, icmp.Compare(iter->start_key(), expected_range_dels[i].start));
+ EXPECT_EQ(0, icmp.Compare(iter->end_key(), expected_range_dels[i].end));
+ EXPECT_EQ(expected_range_dels[i].seq, iter->seq());
+ }
+ EXPECT_FALSE(iter->Valid());
+
+ // Test reverse iteration.
+ iter->SeekToLast();
+ std::vector<TruncatedIterScanTestCase> reverse_expected_range_dels(
+ expected_range_dels.rbegin(), expected_range_dels.rend());
+ for (size_t i = 0; i < reverse_expected_range_dels.size();
+ i++, iter->Prev()) {
+ ASSERT_TRUE(iter->Valid());
+ EXPECT_EQ(0, icmp.Compare(iter->start_key(),
+ reverse_expected_range_dels[i].start));
+ EXPECT_EQ(
+ 0, icmp.Compare(iter->end_key(), reverse_expected_range_dels[i].end));
+ EXPECT_EQ(reverse_expected_range_dels[i].seq, iter->seq());
+ }
+ EXPECT_FALSE(iter->Valid());
+}
+
+void VerifySeek(TruncatedRangeDelIterator* iter,
+ const InternalKeyComparator& icmp,
+ const std::vector<TruncatedIterSeekTestCase>& test_cases) {
+ for (const auto& test_case : test_cases) {
+ iter->Seek(test_case.target);
+ if (test_case.invalid) {
+ ASSERT_FALSE(iter->Valid());
+ } else {
+ ASSERT_TRUE(iter->Valid());
+ EXPECT_EQ(0, icmp.Compare(iter->start_key(), test_case.start));
+ EXPECT_EQ(0, icmp.Compare(iter->end_key(), test_case.end));
+ EXPECT_EQ(test_case.seq, iter->seq());
+ }
+ }
+}
+
+void VerifySeekForPrev(
+ TruncatedRangeDelIterator* iter, const InternalKeyComparator& icmp,
+ const std::vector<TruncatedIterSeekTestCase>& test_cases) {
+ for (const auto& test_case : test_cases) {
+ iter->SeekForPrev(test_case.target);
+ if (test_case.invalid) {
+ ASSERT_FALSE(iter->Valid());
+ } else {
+ ASSERT_TRUE(iter->Valid());
+ EXPECT_EQ(0, icmp.Compare(iter->start_key(), test_case.start));
+ EXPECT_EQ(0, icmp.Compare(iter->end_key(), test_case.end));
+ EXPECT_EQ(test_case.seq, iter->seq());
+ }
+ }
+}
+
+void VerifyShouldDelete(RangeDelAggregator* range_del_agg,
+ const std::vector<ShouldDeleteTestCase>& test_cases) {
+ for (const auto& test_case : test_cases) {
+ EXPECT_EQ(
+ test_case.result,
+ range_del_agg->ShouldDelete(
+ test_case.lookup_key, RangeDelPositioningMode::kForwardTraversal));
+ }
+ for (auto it = test_cases.rbegin(); it != test_cases.rend(); ++it) {
+ const auto& test_case = *it;
+ EXPECT_EQ(
+ test_case.result,
+ range_del_agg->ShouldDelete(
+ test_case.lookup_key, RangeDelPositioningMode::kBackwardTraversal));
+ }
+}
+
+void VerifyIsRangeOverlapped(
+ ReadRangeDelAggregator* range_del_agg,
+ const std::vector<IsRangeOverlappedTestCase>& test_cases) {
+ for (const auto& test_case : test_cases) {
+ EXPECT_EQ(test_case.result,
+ range_del_agg->IsRangeOverlapped(test_case.start, test_case.end));
+ }
+}
+
+void CheckIterPosition(const RangeTombstone& tombstone,
+ const FragmentedRangeTombstoneIterator* iter) {
+ // Test InternalIterator interface.
+ EXPECT_EQ(tombstone.start_key_, ExtractUserKey(iter->key()));
+ EXPECT_EQ(tombstone.end_key_, iter->value());
+ EXPECT_EQ(tombstone.seq_, iter->seq());
+
+ // Test FragmentedRangeTombstoneIterator interface.
+ EXPECT_EQ(tombstone.start_key_, iter->start_key());
+ EXPECT_EQ(tombstone.end_key_, iter->end_key());
+ EXPECT_EQ(tombstone.seq_, GetInternalKeySeqno(iter->key()));
+}
+
+void VerifyFragmentedRangeDels(
+ FragmentedRangeTombstoneIterator* iter,
+ const std::vector<RangeTombstone>& expected_tombstones) {
+ iter->SeekToFirst();
+ for (size_t i = 0; i < expected_tombstones.size(); i++, iter->Next()) {
+ ASSERT_TRUE(iter->Valid());
+ CheckIterPosition(expected_tombstones[i], iter);
+ }
+ EXPECT_FALSE(iter->Valid());
+}
+
+} // anonymous namespace
+
+TEST_F(RangeDelAggregatorTest, EmptyTruncatedIter) {
+ auto range_del_iter = MakeRangeDelIter({});
+ FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+ bytewise_icmp);
+ std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
+ new FragmentedRangeTombstoneIterator(&fragment_list, bytewise_icmp,
+ kMaxSequenceNumber));
+
+ TruncatedRangeDelIterator iter(std::move(input_iter), &bytewise_icmp, nullptr,
+ nullptr);
+
+ iter.SeekToFirst();
+ ASSERT_FALSE(iter.Valid());
+
+ iter.SeekToLast();
+ ASSERT_FALSE(iter.Valid());
+}
+
+TEST_F(RangeDelAggregatorTest, UntruncatedIter) {
+ auto range_del_iter =
+ MakeRangeDelIter({{"a", "e", 10}, {"e", "g", 8}, {"j", "n", 4}});
+ FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+ bytewise_icmp);
+ std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
+ new FragmentedRangeTombstoneIterator(&fragment_list, bytewise_icmp,
+ kMaxSequenceNumber));
+
+ TruncatedRangeDelIterator iter(std::move(input_iter), &bytewise_icmp, nullptr,
+ nullptr);
+
+ VerifyIterator(&iter, bytewise_icmp,
+ {{UncutEndpoint("a"), UncutEndpoint("e"), 10},
+ {UncutEndpoint("e"), UncutEndpoint("g"), 8},
+ {UncutEndpoint("j"), UncutEndpoint("n"), 4}});
+
+ VerifySeek(
+ &iter, bytewise_icmp,
+ {{"d", UncutEndpoint("a"), UncutEndpoint("e"), 10},
+ {"e", UncutEndpoint("e"), UncutEndpoint("g"), 8},
+ {"ia", UncutEndpoint("j"), UncutEndpoint("n"), 4},
+ {"n", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */},
+ {"", UncutEndpoint("a"), UncutEndpoint("e"), 10}});
+
+ VerifySeekForPrev(
+ &iter, bytewise_icmp,
+ {{"d", UncutEndpoint("a"), UncutEndpoint("e"), 10},
+ {"e", UncutEndpoint("e"), UncutEndpoint("g"), 8},
+ {"ia", UncutEndpoint("e"), UncutEndpoint("g"), 8},
+ {"n", UncutEndpoint("j"), UncutEndpoint("n"), 4},
+ {"", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */}});
+}
+
+TEST_F(RangeDelAggregatorTest, UntruncatedIterWithSnapshot) {
+ auto range_del_iter =
+ MakeRangeDelIter({{"a", "e", 10}, {"e", "g", 8}, {"j", "n", 4}});
+ FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+ bytewise_icmp);
+ std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
+ new FragmentedRangeTombstoneIterator(&fragment_list, bytewise_icmp,
+ 9 /* snapshot */));
+
+ TruncatedRangeDelIterator iter(std::move(input_iter), &bytewise_icmp, nullptr,
+ nullptr);
+
+ VerifyIterator(&iter, bytewise_icmp,
+ {{UncutEndpoint("e"), UncutEndpoint("g"), 8},
+ {UncutEndpoint("j"), UncutEndpoint("n"), 4}});
+
+ VerifySeek(
+ &iter, bytewise_icmp,
+ {{"d", UncutEndpoint("e"), UncutEndpoint("g"), 8},
+ {"e", UncutEndpoint("e"), UncutEndpoint("g"), 8},
+ {"ia", UncutEndpoint("j"), UncutEndpoint("n"), 4},
+ {"n", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */},
+ {"", UncutEndpoint("e"), UncutEndpoint("g"), 8}});
+
+ VerifySeekForPrev(
+ &iter, bytewise_icmp,
+ {{"d", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */},
+ {"e", UncutEndpoint("e"), UncutEndpoint("g"), 8},
+ {"ia", UncutEndpoint("e"), UncutEndpoint("g"), 8},
+ {"n", UncutEndpoint("j"), UncutEndpoint("n"), 4},
+ {"", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */}});
+}
+
+TEST_F(RangeDelAggregatorTest, TruncatedIterPartiallyCutTombstones) {
+ auto range_del_iter =
+ MakeRangeDelIter({{"a", "e", 10}, {"e", "g", 8}, {"j", "n", 4}});
+ FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+ bytewise_icmp);
+ std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
+ new FragmentedRangeTombstoneIterator(&fragment_list, bytewise_icmp,
+ kMaxSequenceNumber));
+
+ InternalKey smallest("d", 7, kTypeValue);
+ InternalKey largest("m", 9, kTypeValue);
+ TruncatedRangeDelIterator iter(std::move(input_iter), &bytewise_icmp,
+ &smallest, &largest);
+
+ VerifyIterator(
+ &iter, bytewise_icmp,
+ {{InternalValue("d", 7), UncutEndpoint("e"), 10},
+ {UncutEndpoint("e"), UncutEndpoint("g"), 8},
+ {UncutEndpoint("j"), InternalValue("m", 8, kValueTypeForSeek), 4}});
+
+ VerifySeek(
+ &iter, bytewise_icmp,
+ {{"d", InternalValue("d", 7), UncutEndpoint("e"), 10},
+ {"e", UncutEndpoint("e"), UncutEndpoint("g"), 8},
+ {"ia", UncutEndpoint("j"), InternalValue("m", 8, kValueTypeForSeek), 4,
+ false /* invalid */},
+ {"n", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */},
+ {"", InternalValue("d", 7), UncutEndpoint("e"), 10}});
+
+ VerifySeekForPrev(
+ &iter, bytewise_icmp,
+ {{"d", InternalValue("d", 7), UncutEndpoint("e"), 10},
+ {"e", UncutEndpoint("e"), UncutEndpoint("g"), 8},
+ {"ia", UncutEndpoint("e"), UncutEndpoint("g"), 8},
+ {"n", UncutEndpoint("j"), InternalValue("m", 8, kValueTypeForSeek), 4,
+ false /* invalid */},
+ {"", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */}});
+}
+
+TEST_F(RangeDelAggregatorTest, TruncatedIterFullyCutTombstones) {
+ auto range_del_iter =
+ MakeRangeDelIter({{"a", "e", 10}, {"e", "g", 8}, {"j", "n", 4}});
+ FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+ bytewise_icmp);
+ std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
+ new FragmentedRangeTombstoneIterator(&fragment_list, bytewise_icmp,
+ kMaxSequenceNumber));
+
+ InternalKey smallest("f", 7, kTypeValue);
+ InternalKey largest("i", 9, kTypeValue);
+ TruncatedRangeDelIterator iter(std::move(input_iter), &bytewise_icmp,
+ &smallest, &largest);
+
+ VerifyIterator(&iter, bytewise_icmp,
+ {{InternalValue("f", 7), UncutEndpoint("g"), 8}});
+
+ VerifySeek(
+ &iter, bytewise_icmp,
+ {{"d", InternalValue("f", 7), UncutEndpoint("g"), 8},
+ {"f", InternalValue("f", 7), UncutEndpoint("g"), 8},
+ {"j", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */}});
+
+ VerifySeekForPrev(
+ &iter, bytewise_icmp,
+ {{"d", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */},
+ {"f", InternalValue("f", 7), UncutEndpoint("g"), 8},
+ {"j", InternalValue("f", 7), UncutEndpoint("g"), 8}});
+}
+
+TEST_F(RangeDelAggregatorTest, SingleIterInAggregator) {
+ auto range_del_iter = MakeRangeDelIter({{"a", "e", 10}, {"c", "g", 8}});
+ FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+ bytewise_icmp);
+ std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
+ new FragmentedRangeTombstoneIterator(&fragment_list, bytewise_icmp,
+ kMaxSequenceNumber));
+
+ ReadRangeDelAggregator range_del_agg(&bytewise_icmp, kMaxSequenceNumber);
+ range_del_agg.AddTombstones(std::move(input_iter));
+
+ VerifyShouldDelete(&range_del_agg, {{InternalValue("a", 19), false},
+ {InternalValue("b", 9), true},
+ {InternalValue("d", 9), true},
+ {InternalValue("e", 7), true},
+ {InternalValue("g", 7), false}});
+
+ VerifyIsRangeOverlapped(&range_del_agg, {{"", "_", false},
+ {"_", "a", true},
+ {"a", "c", true},
+ {"d", "f", true},
+ {"g", "l", false}});
+}
+
+TEST_F(RangeDelAggregatorTest, MultipleItersInAggregator) {
+ auto fragment_lists = MakeFragmentedTombstoneLists(
+ {{{"a", "e", 10}, {"c", "g", 8}},
+ {{"a", "b", 20}, {"h", "i", 25}, {"ii", "j", 15}}});
+
+ ReadRangeDelAggregator range_del_agg(&bytewise_icmp, kMaxSequenceNumber);
+ for (const auto& fragment_list : fragment_lists) {
+ std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
+ new FragmentedRangeTombstoneIterator(fragment_list.get(), bytewise_icmp,
+ kMaxSequenceNumber));
+ range_del_agg.AddTombstones(std::move(input_iter));
+ }
+
+ VerifyShouldDelete(&range_del_agg, {{InternalValue("a", 19), true},
+ {InternalValue("b", 19), false},
+ {InternalValue("b", 9), true},
+ {InternalValue("d", 9), true},
+ {InternalValue("e", 7), true},
+ {InternalValue("g", 7), false},
+ {InternalValue("h", 24), true},
+ {InternalValue("i", 24), false},
+ {InternalValue("ii", 14), true},
+ {InternalValue("j", 14), false}});
+
+ VerifyIsRangeOverlapped(&range_del_agg, {{"", "_", false},
+ {"_", "a", true},
+ {"a", "c", true},
+ {"d", "f", true},
+ {"g", "l", true},
+ {"x", "y", false}});
+}
+
+TEST_F(RangeDelAggregatorTest, MultipleItersInAggregatorWithUpperBound) {
+ auto fragment_lists = MakeFragmentedTombstoneLists(
+ {{{"a", "e", 10}, {"c", "g", 8}},
+ {{"a", "b", 20}, {"h", "i", 25}, {"ii", "j", 15}}});
+
+ ReadRangeDelAggregator range_del_agg(&bytewise_icmp, 19);
+ for (const auto& fragment_list : fragment_lists) {
+ std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
+ new FragmentedRangeTombstoneIterator(fragment_list.get(), bytewise_icmp,
+ 19 /* snapshot */));
+ range_del_agg.AddTombstones(std::move(input_iter));
+ }
+
+ VerifyShouldDelete(&range_del_agg, {{InternalValue("a", 19), false},
+ {InternalValue("a", 9), true},
+ {InternalValue("b", 9), true},
+ {InternalValue("d", 9), true},
+ {InternalValue("e", 7), true},
+ {InternalValue("g", 7), false},
+ {InternalValue("h", 24), false},
+ {InternalValue("i", 24), false},
+ {InternalValue("ii", 14), true},
+ {InternalValue("j", 14), false}});
+
+ VerifyIsRangeOverlapped(&range_del_agg, {{"", "_", false},
+ {"_", "a", true},
+ {"a", "c", true},
+ {"d", "f", true},
+ {"g", "l", true},
+ {"x", "y", false}});
+}
+
+TEST_F(RangeDelAggregatorTest, MultipleTruncatedItersInAggregator) {
+ auto fragment_lists = MakeFragmentedTombstoneLists(
+ {{{"a", "z", 10}}, {{"a", "z", 10}}, {{"a", "z", 10}}});
+ std::vector<std::pair<InternalKey, InternalKey>> iter_bounds = {
+ {InternalKey("a", 4, kTypeValue),
+ InternalKey("m", kMaxSequenceNumber, kTypeRangeDeletion)},
+ {InternalKey("m", 20, kTypeValue),
+ InternalKey("x", kMaxSequenceNumber, kTypeRangeDeletion)},
+ {InternalKey("x", 5, kTypeValue), InternalKey("zz", 30, kTypeValue)}};
+
+ ReadRangeDelAggregator range_del_agg(&bytewise_icmp, 19);
+ for (size_t i = 0; i < fragment_lists.size(); i++) {
+ const auto& fragment_list = fragment_lists[i];
+ const auto& bounds = iter_bounds[i];
+ std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
+ new FragmentedRangeTombstoneIterator(fragment_list.get(), bytewise_icmp,
+ 19 /* snapshot */));
+ range_del_agg.AddTombstones(std::move(input_iter), &bounds.first,
+ &bounds.second);
+ }
+
+ VerifyShouldDelete(&range_del_agg, {{InternalValue("a", 10), false},
+ {InternalValue("a", 9), false},
+ {InternalValue("a", 4), true},
+ {InternalValue("m", 10), false},
+ {InternalValue("m", 9), true},
+ {InternalValue("x", 10), false},
+ {InternalValue("x", 9), false},
+ {InternalValue("x", 5), true},
+ {InternalValue("z", 9), false}});
+
+ VerifyIsRangeOverlapped(&range_del_agg, {{"", "_", false},
+ {"_", "a", true},
+ {"a", "n", true},
+ {"l", "x", true},
+ {"w", "z", true},
+ {"zzz", "zz", false},
+ {"zz", "zzz", false}});
+}
+
+TEST_F(RangeDelAggregatorTest, MultipleTruncatedItersInAggregatorSameLevel) {
+ auto fragment_lists = MakeFragmentedTombstoneLists(
+ {{{"a", "z", 10}}, {{"a", "z", 10}}, {{"a", "z", 10}}});
+ std::vector<std::pair<InternalKey, InternalKey>> iter_bounds = {
+ {InternalKey("a", 4, kTypeValue),
+ InternalKey("m", kMaxSequenceNumber, kTypeRangeDeletion)},
+ {InternalKey("m", 20, kTypeValue),
+ InternalKey("x", kMaxSequenceNumber, kTypeRangeDeletion)},
+ {InternalKey("x", 5, kTypeValue), InternalKey("zz", 30, kTypeValue)}};
+
+ ReadRangeDelAggregator range_del_agg(&bytewise_icmp, 19);
+
+ auto add_iter_to_agg = [&](size_t i) {
+ std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
+ new FragmentedRangeTombstoneIterator(fragment_lists[i].get(),
+ bytewise_icmp, 19 /* snapshot */));
+ range_del_agg.AddTombstones(std::move(input_iter), &iter_bounds[i].first,
+ &iter_bounds[i].second);
+ };
+
+ add_iter_to_agg(0);
+ VerifyShouldDelete(&range_del_agg, {{InternalValue("a", 10), false},
+ {InternalValue("a", 9), false},
+ {InternalValue("a", 4), true}});
+
+ add_iter_to_agg(1);
+ VerifyShouldDelete(&range_del_agg, {{InternalValue("m", 10), false},
+ {InternalValue("m", 9), true}});
+
+ add_iter_to_agg(2);
+ VerifyShouldDelete(&range_del_agg, {{InternalValue("x", 10), false},
+ {InternalValue("x", 9), false},
+ {InternalValue("x", 5), true},
+ {InternalValue("z", 9), false}});
+
+ VerifyIsRangeOverlapped(&range_del_agg, {{"", "_", false},
+ {"_", "a", true},
+ {"a", "n", true},
+ {"l", "x", true},
+ {"w", "z", true},
+ {"zzz", "zz", false},
+ {"zz", "zzz", false}});
+}
+
+TEST_F(RangeDelAggregatorTest, CompactionAggregatorNoSnapshots) {
+ auto fragment_lists = MakeFragmentedTombstoneLists(
+ {{{"a", "e", 10}, {"c", "g", 8}},
+ {{"a", "b", 20}, {"h", "i", 25}, {"ii", "j", 15}}});
+
+ std::vector<SequenceNumber> snapshots;
+ CompactionRangeDelAggregator range_del_agg(&bytewise_icmp, snapshots);
+ for (const auto& fragment_list : fragment_lists) {
+ std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
+ new FragmentedRangeTombstoneIterator(fragment_list.get(), bytewise_icmp,
+ kMaxSequenceNumber));
+ range_del_agg.AddTombstones(std::move(input_iter));
+ }
+
+ VerifyShouldDelete(&range_del_agg, {{InternalValue("a", 19), true},
+ {InternalValue("b", 19), false},
+ {InternalValue("b", 9), true},
+ {InternalValue("d", 9), true},
+ {InternalValue("e", 7), true},
+ {InternalValue("g", 7), false},
+ {InternalValue("h", 24), true},
+ {InternalValue("i", 24), false},
+ {InternalValue("ii", 14), true},
+ {InternalValue("j", 14), false}});
+
+ auto range_del_compaction_iter = range_del_agg.NewIterator();
+ VerifyFragmentedRangeDels(range_del_compaction_iter.get(), {{"a", "b", 20},
+ {"b", "c", 10},
+ {"c", "e", 10},
+ {"e", "g", 8},
+ {"h", "i", 25},
+ {"ii", "j", 15}});
+}
+
+TEST_F(RangeDelAggregatorTest, CompactionAggregatorWithSnapshots) {
+ auto fragment_lists = MakeFragmentedTombstoneLists(
+ {{{"a", "e", 10}, {"c", "g", 8}},
+ {{"a", "b", 20}, {"h", "i", 25}, {"ii", "j", 15}}});
+
+ std::vector<SequenceNumber> snapshots{9, 19};
+ CompactionRangeDelAggregator range_del_agg(&bytewise_icmp, snapshots);
+ for (const auto& fragment_list : fragment_lists) {
+ std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
+ new FragmentedRangeTombstoneIterator(fragment_list.get(), bytewise_icmp,
+ kMaxSequenceNumber));
+ range_del_agg.AddTombstones(std::move(input_iter));
+ }
+
+ VerifyShouldDelete(
+ &range_del_agg,
+ {
+ {InternalValue("a", 19), false}, // [10, 19]
+ {InternalValue("a", 9), false}, // [0, 9]
+ {InternalValue("b", 9), false}, // [0, 9]
+ {InternalValue("d", 9), false}, // [0, 9]
+ {InternalValue("d", 7), true}, // [0, 9]
+ {InternalValue("e", 7), true}, // [0, 9]
+ {InternalValue("g", 7), false}, // [0, 9]
+ {InternalValue("h", 24), true}, // [20, kMaxSequenceNumber]
+ {InternalValue("i", 24), false}, // [20, kMaxSequenceNumber]
+ {InternalValue("ii", 14), true}, // [10, 19]
+ {InternalValue("j", 14), false} // [10, 19]
+ });
+
+ auto range_del_compaction_iter = range_del_agg.NewIterator();
+ VerifyFragmentedRangeDels(range_del_compaction_iter.get(), {{"a", "b", 20},
+ {"a", "b", 10},
+ {"b", "c", 10},
+ {"c", "e", 10},
+ {"c", "e", 8},
+ {"e", "g", 8},
+ {"h", "i", 25},
+ {"ii", "j", 15}});
+}
+
+TEST_F(RangeDelAggregatorTest, CompactionAggregatorEmptyIteratorLeft) {
+ auto fragment_lists = MakeFragmentedTombstoneLists(
+ {{{"a", "e", 10}, {"c", "g", 8}},
+ {{"a", "b", 20}, {"h", "i", 25}, {"ii", "j", 15}}});
+
+ std::vector<SequenceNumber> snapshots{9, 19};
+ CompactionRangeDelAggregator range_del_agg(&bytewise_icmp, snapshots);
+ for (const auto& fragment_list : fragment_lists) {
+ std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
+ new FragmentedRangeTombstoneIterator(fragment_list.get(), bytewise_icmp,
+ kMaxSequenceNumber));
+ range_del_agg.AddTombstones(std::move(input_iter));
+ }
+
+ Slice start("_");
+ Slice end("__");
+}
+
+TEST_F(RangeDelAggregatorTest, CompactionAggregatorEmptyIteratorRight) {
+ auto fragment_lists = MakeFragmentedTombstoneLists(
+ {{{"a", "e", 10}, {"c", "g", 8}},
+ {{"a", "b", 20}, {"h", "i", 25}, {"ii", "j", 15}}});
+
+ std::vector<SequenceNumber> snapshots{9, 19};
+ CompactionRangeDelAggregator range_del_agg(&bytewise_icmp, snapshots);
+ for (const auto& fragment_list : fragment_lists) {
+ std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
+ new FragmentedRangeTombstoneIterator(fragment_list.get(), bytewise_icmp,
+ kMaxSequenceNumber));
+ range_del_agg.AddTombstones(std::move(input_iter));
+ }
+
+ Slice start("p");
+ Slice end("q");
+ auto range_del_compaction_iter1 =
+ range_del_agg.NewIterator(&start, &end, false /* end_key_inclusive */);
+ VerifyFragmentedRangeDels(range_del_compaction_iter1.get(), {});
+
+ auto range_del_compaction_iter2 =
+ range_del_agg.NewIterator(&start, &end, true /* end_key_inclusive */);
+ VerifyFragmentedRangeDels(range_del_compaction_iter2.get(), {});
+}
+
+TEST_F(RangeDelAggregatorTest, CompactionAggregatorBoundedIterator) {
+ auto fragment_lists = MakeFragmentedTombstoneLists(
+ {{{"a", "e", 10}, {"c", "g", 8}},
+ {{"a", "b", 20}, {"h", "i", 25}, {"ii", "j", 15}}});
+
+ std::vector<SequenceNumber> snapshots{9, 19};
+ CompactionRangeDelAggregator range_del_agg(&bytewise_icmp, snapshots);
+ for (const auto& fragment_list : fragment_lists) {
+ std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
+ new FragmentedRangeTombstoneIterator(fragment_list.get(), bytewise_icmp,
+ kMaxSequenceNumber));
+ range_del_agg.AddTombstones(std::move(input_iter));
+ }
+
+ Slice start("bb");
+ Slice end("e");
+ auto range_del_compaction_iter1 =
+ range_del_agg.NewIterator(&start, &end, false /* end_key_inclusive */);
+ VerifyFragmentedRangeDels(range_del_compaction_iter1.get(),
+ {{"a", "c", 10}, {"c", "e", 10}, {"c", "e", 8}});
+
+ auto range_del_compaction_iter2 =
+ range_del_agg.NewIterator(&start, &end, true /* end_key_inclusive */);
+ VerifyFragmentedRangeDels(
+ range_del_compaction_iter2.get(),
+ {{"a", "c", 10}, {"c", "e", 10}, {"c", "e", 8}, {"e", "g", 8}});
+}
+
+TEST_F(RangeDelAggregatorTest,
+ CompactionAggregatorBoundedIteratorExtraFragments) {
+ auto fragment_lists = MakeFragmentedTombstoneLists(
+ {{{"a", "d", 10}, {"c", "g", 8}},
+ {{"b", "c", 20}, {"d", "f", 30}, {"h", "i", 25}, {"ii", "j", 15}}});
+
+ std::vector<SequenceNumber> snapshots{9, 19};
+ CompactionRangeDelAggregator range_del_agg(&bytewise_icmp, snapshots);
+ for (const auto& fragment_list : fragment_lists) {
+ std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
+ new FragmentedRangeTombstoneIterator(fragment_list.get(), bytewise_icmp,
+ kMaxSequenceNumber));
+ range_del_agg.AddTombstones(std::move(input_iter));
+ }
+
+ Slice start("bb");
+ Slice end("e");
+ auto range_del_compaction_iter1 =
+ range_del_agg.NewIterator(&start, &end, false /* end_key_inclusive */);
+ VerifyFragmentedRangeDels(range_del_compaction_iter1.get(), {{"a", "b", 10},
+ {"b", "c", 20},
+ {"b", "c", 10},
+ {"c", "d", 10},
+ {"c", "d", 8},
+ {"d", "f", 30},
+ {"d", "f", 8},
+ {"f", "g", 8}});
+
+ auto range_del_compaction_iter2 =
+ range_del_agg.NewIterator(&start, &end, true /* end_key_inclusive */);
+ VerifyFragmentedRangeDels(range_del_compaction_iter2.get(), {{"a", "b", 10},
+ {"b", "c", 20},
+ {"b", "c", 10},
+ {"c", "d", 10},
+ {"c", "d", 8},
+ {"d", "f", 30},
+ {"d", "f", 8},
+ {"f", "g", 8}});
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/range_tombstone_fragmenter.cc b/src/rocksdb/db/range_tombstone_fragmenter.cc
new file mode 100644
index 000000000..7e7cedeca
--- /dev/null
+++ b/src/rocksdb/db/range_tombstone_fragmenter.cc
@@ -0,0 +1,502 @@
+// Copyright (c) 2018-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/range_tombstone_fragmenter.h"
+
+#include <algorithm>
+#include <cinttypes>
+#include <cstdio>
+#include <functional>
+#include <set>
+
+#include "util/autovector.h"
+#include "util/kv_map.h"
+#include "util/vector_iterator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+FragmentedRangeTombstoneList::FragmentedRangeTombstoneList(
+ std::unique_ptr<InternalIterator> unfragmented_tombstones,
+ const InternalKeyComparator& icmp, bool for_compaction,
+ const std::vector<SequenceNumber>& snapshots) {
+ if (unfragmented_tombstones == nullptr) {
+ return;
+ }
+ bool is_sorted = true;
+ InternalKey pinned_last_start_key;
+ Slice last_start_key;
+ num_unfragmented_tombstones_ = 0;
+ total_tombstone_payload_bytes_ = 0;
+ for (unfragmented_tombstones->SeekToFirst(); unfragmented_tombstones->Valid();
+ unfragmented_tombstones->Next(), num_unfragmented_tombstones_++) {
+ total_tombstone_payload_bytes_ += unfragmented_tombstones->key().size() +
+ unfragmented_tombstones->value().size();
+ if (num_unfragmented_tombstones_ > 0 &&
+ icmp.Compare(last_start_key, unfragmented_tombstones->key()) > 0) {
+ is_sorted = false;
+ break;
+ }
+ if (unfragmented_tombstones->IsKeyPinned()) {
+ last_start_key = unfragmented_tombstones->key();
+ } else {
+ pinned_last_start_key.DecodeFrom(unfragmented_tombstones->key());
+ last_start_key = pinned_last_start_key.Encode();
+ }
+ }
+ if (is_sorted) {
+ FragmentTombstones(std::move(unfragmented_tombstones), icmp, for_compaction,
+ snapshots);
+ return;
+ }
+
+ // Sort the tombstones before fragmenting them.
+ std::vector<std::string> keys, values;
+ keys.reserve(num_unfragmented_tombstones_);
+ values.reserve(num_unfragmented_tombstones_);
+ // Reset the counter to zero for the next iteration over keys.
+ total_tombstone_payload_bytes_ = 0;
+ for (unfragmented_tombstones->SeekToFirst(); unfragmented_tombstones->Valid();
+ unfragmented_tombstones->Next()) {
+ total_tombstone_payload_bytes_ += unfragmented_tombstones->key().size() +
+ unfragmented_tombstones->value().size();
+ keys.emplace_back(unfragmented_tombstones->key().data(),
+ unfragmented_tombstones->key().size());
+ values.emplace_back(unfragmented_tombstones->value().data(),
+ unfragmented_tombstones->value().size());
+ }
+ // VectorIterator implicitly sorts by key during construction.
+ auto iter = std::make_unique<VectorIterator>(std::move(keys),
+ std::move(values), &icmp);
+ FragmentTombstones(std::move(iter), icmp, for_compaction, snapshots);
+}
+
+void FragmentedRangeTombstoneList::FragmentTombstones(
+ std::unique_ptr<InternalIterator> unfragmented_tombstones,
+ const InternalKeyComparator& icmp, bool for_compaction,
+ const std::vector<SequenceNumber>& snapshots) {
+ Slice cur_start_key(nullptr, 0);
+ auto cmp = ParsedInternalKeyComparator(&icmp);
+
+ // Stores the end keys and sequence numbers of range tombstones with a start
+ // key less than or equal to cur_start_key. Provides an ordering by end key
+ // for use in flush_current_tombstones.
+ std::set<ParsedInternalKey, ParsedInternalKeyComparator> cur_end_keys(cmp);
+
+ size_t ts_sz = icmp.user_comparator()->timestamp_size();
+ // Given the next start key in unfragmented_tombstones,
+ // flush_current_tombstones writes every tombstone fragment that starts
+ // and ends with a key before next_start_key, and starts with a key greater
+ // than or equal to cur_start_key.
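+ // For example (a worked illustration with arbitrary keys), fragmenting the
+ // overlapping tombstones [a, e) @ 10 and [c, g) @ 8 produces the
+ // non-overlapping fragments [a, c) @ {10}, [c, e) @ {10, 8}, and
+ // [e, g) @ {8}.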
+ auto flush_current_tombstones = [&](const Slice& next_start_key) {
+ auto it = cur_end_keys.begin();
+ bool reached_next_start_key = false;
+ for (; it != cur_end_keys.end() && !reached_next_start_key; ++it) {
+ Slice cur_end_key = it->user_key;
+ if (icmp.user_comparator()->CompareWithoutTimestamp(cur_start_key,
+ cur_end_key) == 0) {
+ // Empty tombstone.
+ continue;
+ }
+ if (icmp.user_comparator()->CompareWithoutTimestamp(next_start_key,
+ cur_end_key) <= 0) {
+ // All the end keys in [it, cur_end_keys.end()) are after
+ // next_start_key, so the tombstones they represent can be used in
+ // fragments that start with keys greater than or equal to
+ // next_start_key. However, the end keys we already passed will not be
+ // used in any more tombstone fragments.
+ //
+ // Remove the fully fragmented tombstones and stop iteration after a
+ // final round of flushing to preserve the tombstones we can create more
+ // fragments from.
+ reached_next_start_key = true;
+ cur_end_keys.erase(cur_end_keys.begin(), it);
+ cur_end_key = next_start_key;
+ }
+
+ // Flush a range tombstone fragment [cur_start_key, cur_end_key), which
+ // should not overlap with the last-flushed tombstone fragment.
+ assert(tombstones_.empty() ||
+ icmp.user_comparator()->CompareWithoutTimestamp(
+ tombstones_.back().end_key, cur_start_key) <= 0);
+
+ // Sort the sequence numbers of the tombstones being fragmented in
+ // descending order, and then flush them in that order.
+ autovector<SequenceNumber> seqnums_to_flush;
+ autovector<Slice> timestamps_to_flush;
+ for (auto flush_it = it; flush_it != cur_end_keys.end(); ++flush_it) {
+ seqnums_to_flush.push_back(flush_it->sequence);
+ if (ts_sz) {
+ timestamps_to_flush.push_back(
+ ExtractTimestampFromUserKey(flush_it->user_key, ts_sz));
+ }
+ }
+ // TODO: combine the two sorts to be more efficient
+ std::sort(seqnums_to_flush.begin(), seqnums_to_flush.end(),
+ std::greater<SequenceNumber>());
+ if (ts_sz) {
+ std::sort(timestamps_to_flush.begin(), timestamps_to_flush.end(),
+ [icmp](const Slice& ts1, const Slice& ts2) {
+ return icmp.user_comparator()->CompareTimestamp(ts1, ts2) >
+ 0;
+ });
+ }
+
+ size_t start_idx = tombstone_seqs_.size();
+ size_t end_idx = start_idx + seqnums_to_flush.size();
+
+ // If user-defined timestamp is enabled, we should not drop tombstones
+ // from any snapshot stripe. Garbage collection of range tombstones
+ // happens in CompactionOutputs::AddRangeDels().
+ if (for_compaction && ts_sz == 0) {
+ // Drop all tombstone seqnums that are not preserved by a snapshot.
+ SequenceNumber next_snapshot = kMaxSequenceNumber;
+ for (auto seq : seqnums_to_flush) {
+ if (seq <= next_snapshot) {
+ // This seqnum is visible to a lower snapshot.
+ tombstone_seqs_.push_back(seq);
+ auto upper_bound_it =
+ std::lower_bound(snapshots.begin(), snapshots.end(), seq);
+ if (upper_bound_it == snapshots.begin()) {
+ // This seqnum is the topmost one visible to the earliest
+ // snapshot. None of the seqnums below it will be visible, so we
+ // can skip them.
+ break;
+ }
+ next_snapshot = *std::prev(upper_bound_it);
+ }
+ }
+ end_idx = tombstone_seqs_.size();
+ } else {
+ // The fragmentation is being done for reads, so preserve all seqnums.
+ tombstone_seqs_.insert(tombstone_seqs_.end(), seqnums_to_flush.begin(),
+ seqnums_to_flush.end());
+ if (ts_sz) {
+ tombstone_timestamps_.insert(tombstone_timestamps_.end(),
+ timestamps_to_flush.begin(),
+ timestamps_to_flush.end());
+ }
+ }
+
+ assert(start_idx < end_idx);
+ if (ts_sz) {
+ std::string start_key_with_max_ts;
+ AppendUserKeyWithMaxTimestamp(&start_key_with_max_ts, cur_start_key,
+ ts_sz);
+ pinned_slices_.emplace_back(std::move(start_key_with_max_ts));
+ Slice start_key = pinned_slices_.back();
+
+ std::string end_key_with_max_ts;
+ AppendUserKeyWithMaxTimestamp(&end_key_with_max_ts, cur_end_key, ts_sz);
+ pinned_slices_.emplace_back(std::move(end_key_with_max_ts));
+ Slice end_key = pinned_slices_.back();
+
+ // RangeTombstoneStack expects start_key and end_key to have max
+ // timestamp.
+ tombstones_.emplace_back(start_key, end_key, start_idx, end_idx);
+ } else {
+ tombstones_.emplace_back(cur_start_key, cur_end_key, start_idx,
+ end_idx);
+ }
+
+ cur_start_key = cur_end_key;
+ }
+ if (!reached_next_start_key) {
+ // There is a gap between the last flushed tombstone fragment and
+ // the next tombstone's start key. Remove all the end keys in
+ // the working set, since we have fully fragmented their corresponding
+ // tombstones.
+ cur_end_keys.clear();
+ }
+ cur_start_key = next_start_key;
+ };
+
+ pinned_iters_mgr_.StartPinning();
+
+ bool no_tombstones = true;
+ for (unfragmented_tombstones->SeekToFirst(); unfragmented_tombstones->Valid();
+ unfragmented_tombstones->Next()) {
+ const Slice& ikey = unfragmented_tombstones->key();
+ Slice tombstone_start_key = ExtractUserKey(ikey);
+ SequenceNumber tombstone_seq = GetInternalKeySeqno(ikey);
+ if (!unfragmented_tombstones->IsKeyPinned()) {
+ pinned_slices_.emplace_back(tombstone_start_key.data(),
+ tombstone_start_key.size());
+ tombstone_start_key = pinned_slices_.back();
+ }
+ no_tombstones = false;
+
+ Slice tombstone_end_key = unfragmented_tombstones->value();
+ if (!unfragmented_tombstones->IsValuePinned()) {
+ pinned_slices_.emplace_back(tombstone_end_key.data(),
+ tombstone_end_key.size());
+ tombstone_end_key = pinned_slices_.back();
+ }
+ if (!cur_end_keys.empty() &&
+ icmp.user_comparator()->CompareWithoutTimestamp(
+ cur_start_key, tombstone_start_key) != 0) {
+ // The start key has changed. Flush all tombstones that start before
+ // this new start key.
+ flush_current_tombstones(tombstone_start_key);
+ }
+ cur_start_key = tombstone_start_key;
+
+ cur_end_keys.emplace(tombstone_end_key, tombstone_seq, kTypeRangeDeletion);
+ }
+ if (!cur_end_keys.empty()) {
+ ParsedInternalKey last_end_key = *std::prev(cur_end_keys.end());
+ flush_current_tombstones(last_end_key.user_key);
+ }
+
+ if (!no_tombstones) {
+ pinned_iters_mgr_.PinIterator(unfragmented_tombstones.release(),
+ false /* arena */);
+ }
+}
+
+bool FragmentedRangeTombstoneList::ContainsRange(SequenceNumber lower,
+ SequenceNumber upper) {
+ std::call_once(seq_set_init_once_flag_, [this]() {
+ for (auto s : tombstone_seqs_) {
+ seq_set_.insert(s);
+ }
+ });
+ auto seq_it = seq_set_.lower_bound(lower);
+ return seq_it != seq_set_.end() && *seq_it <= upper;
+}
+
+FragmentedRangeTombstoneIterator::FragmentedRangeTombstoneIterator(
+ FragmentedRangeTombstoneList* tombstones, const InternalKeyComparator& icmp,
+ SequenceNumber _upper_bound, const Slice* ts_upper_bound,
+ SequenceNumber _lower_bound)
+ : tombstone_start_cmp_(icmp.user_comparator()),
+ tombstone_end_cmp_(icmp.user_comparator()),
+ icmp_(&icmp),
+ ucmp_(icmp.user_comparator()),
+ tombstones_(tombstones),
+ upper_bound_(_upper_bound),
+ lower_bound_(_lower_bound),
+ ts_upper_bound_(ts_upper_bound) {
+ assert(tombstones_ != nullptr);
+ Invalidate();
+}
+
+FragmentedRangeTombstoneIterator::FragmentedRangeTombstoneIterator(
+ const std::shared_ptr<FragmentedRangeTombstoneList>& tombstones,
+ const InternalKeyComparator& icmp, SequenceNumber _upper_bound,
+ const Slice* ts_upper_bound, SequenceNumber _lower_bound)
+ : tombstone_start_cmp_(icmp.user_comparator()),
+ tombstone_end_cmp_(icmp.user_comparator()),
+ icmp_(&icmp),
+ ucmp_(icmp.user_comparator()),
+ tombstones_ref_(tombstones),
+ tombstones_(tombstones_ref_.get()),
+ upper_bound_(_upper_bound),
+ lower_bound_(_lower_bound),
+ ts_upper_bound_(ts_upper_bound) {
+ assert(tombstones_ != nullptr);
+ Invalidate();
+}
+
+FragmentedRangeTombstoneIterator::FragmentedRangeTombstoneIterator(
+ const std::shared_ptr<FragmentedRangeTombstoneListCache>& tombstones_cache,
+ const InternalKeyComparator& icmp, SequenceNumber _upper_bound,
+ const Slice* ts_upper_bound, SequenceNumber _lower_bound)
+ : tombstone_start_cmp_(icmp.user_comparator()),
+ tombstone_end_cmp_(icmp.user_comparator()),
+ icmp_(&icmp),
+ ucmp_(icmp.user_comparator()),
+ tombstones_cache_ref_(tombstones_cache),
+ tombstones_(tombstones_cache_ref_->tombstones.get()),
+ upper_bound_(_upper_bound),
+ lower_bound_(_lower_bound) {
+ assert(tombstones_ != nullptr);
+ if (!ts_upper_bound || ts_upper_bound->empty()) {
+ ts_upper_bound_ = nullptr;
+ } else {
+ ts_upper_bound_ = ts_upper_bound;
+ }
+ Invalidate();
+}
+
+void FragmentedRangeTombstoneIterator::SeekToFirst() {
+ pos_ = tombstones_->begin();
+ seq_pos_ = tombstones_->seq_begin();
+}
+
+void FragmentedRangeTombstoneIterator::SeekToTopFirst() {
+ if (tombstones_->empty()) {
+ Invalidate();
+ return;
+ }
+ pos_ = tombstones_->begin();
+ SetMaxVisibleSeqAndTimestamp();
+ ScanForwardToVisibleTombstone();
+}
+
+void FragmentedRangeTombstoneIterator::SeekToLast() {
+ pos_ = std::prev(tombstones_->end());
+ seq_pos_ = std::prev(tombstones_->seq_end());
+}
+
+void FragmentedRangeTombstoneIterator::SeekToTopLast() {
+ if (tombstones_->empty()) {
+ Invalidate();
+ return;
+ }
+ pos_ = std::prev(tombstones_->end());
+ SetMaxVisibleSeqAndTimestamp();
+ ScanBackwardToVisibleTombstone();
+}
+
+// @param `target` is a user key, with timestamp if user-defined timestamp is
+// enabled.
+void FragmentedRangeTombstoneIterator::Seek(const Slice& target) {
+ if (tombstones_->empty()) {
+ Invalidate();
+ return;
+ }
+ SeekToCoveringTombstone(target);
+ ScanForwardToVisibleTombstone();
+}
+
+void FragmentedRangeTombstoneIterator::SeekForPrev(const Slice& target) {
+ if (tombstones_->empty()) {
+ Invalidate();
+ return;
+ }
+ SeekForPrevToCoveringTombstone(target);
+ ScanBackwardToVisibleTombstone();
+}
+
+void FragmentedRangeTombstoneIterator::SeekToCoveringTombstone(
+ const Slice& target) {
+ pos_ = std::upper_bound(tombstones_->begin(), tombstones_->end(), target,
+ tombstone_end_cmp_);
+ if (pos_ == tombstones_->end()) {
+ // All tombstones end before target.
+ seq_pos_ = tombstones_->seq_end();
+ return;
+ }
+ SetMaxVisibleSeqAndTimestamp();
+}
+
+void FragmentedRangeTombstoneIterator::SeekForPrevToCoveringTombstone(
+ const Slice& target) {
+ if (tombstones_->empty()) {
+ Invalidate();
+ return;
+ }
+ pos_ = std::upper_bound(tombstones_->begin(), tombstones_->end(), target,
+ tombstone_start_cmp_);
+ if (pos_ == tombstones_->begin()) {
+ // All tombstones start after target.
+ Invalidate();
+ return;
+ }
+ --pos_;
+ SetMaxVisibleSeqAndTimestamp();
+}
+
+void FragmentedRangeTombstoneIterator::ScanForwardToVisibleTombstone() {
+ while (pos_ != tombstones_->end() &&
+ (seq_pos_ == tombstones_->seq_iter(pos_->seq_end_idx) ||
+ *seq_pos_ < lower_bound_)) {
+ ++pos_;
+ if (pos_ == tombstones_->end()) {
+ Invalidate();
+ return;
+ }
+ SetMaxVisibleSeqAndTimestamp();
+ }
+}
+
+void FragmentedRangeTombstoneIterator::ScanBackwardToVisibleTombstone() {
+ while (pos_ != tombstones_->end() &&
+ (seq_pos_ == tombstones_->seq_iter(pos_->seq_end_idx) ||
+ *seq_pos_ < lower_bound_)) {
+ if (pos_ == tombstones_->begin()) {
+ Invalidate();
+ return;
+ }
+ --pos_;
+ SetMaxVisibleSeqAndTimestamp();
+ }
+}
+
+void FragmentedRangeTombstoneIterator::Next() {
+ ++seq_pos_;
+ if (seq_pos_ == tombstones_->seq_iter(pos_->seq_end_idx)) {
+ ++pos_;
+ }
+}
+
+void FragmentedRangeTombstoneIterator::TopNext() {
+ ++pos_;
+ if (pos_ == tombstones_->end()) {
+ return;
+ }
+ SetMaxVisibleSeqAndTimestamp();
+ ScanForwardToVisibleTombstone();
+}
+
+void FragmentedRangeTombstoneIterator::Prev() {
+ if (seq_pos_ == tombstones_->seq_begin()) {
+ Invalidate();
+ return;
+ }
+ --seq_pos_;
+ if (pos_ == tombstones_->end() ||
+ seq_pos_ == tombstones_->seq_iter(pos_->seq_start_idx - 1)) {
+ --pos_;
+ }
+}
+
+void FragmentedRangeTombstoneIterator::TopPrev() {
+ if (pos_ == tombstones_->begin()) {
+ Invalidate();
+ return;
+ }
+ --pos_;
+ SetMaxVisibleSeqAndTimestamp();
+ ScanBackwardToVisibleTombstone();
+}
+
+bool FragmentedRangeTombstoneIterator::Valid() const {
+ return tombstones_ != nullptr && pos_ != tombstones_->end();
+}
+
+SequenceNumber FragmentedRangeTombstoneIterator::MaxCoveringTombstoneSeqnum(
+ const Slice& target_user_key) {
+ SeekToCoveringTombstone(target_user_key);
+ return ValidPos() && ucmp_->CompareWithoutTimestamp(start_key(),
+ target_user_key) <= 0
+ ? seq()
+ : 0;
+}
+
+std::map<SequenceNumber, std::unique_ptr<FragmentedRangeTombstoneIterator>>
+FragmentedRangeTombstoneIterator::SplitBySnapshot(
+ const std::vector<SequenceNumber>& snapshots) {
+ std::map<SequenceNumber, std::unique_ptr<FragmentedRangeTombstoneIterator>>
+ splits;
+ SequenceNumber lower = 0;
+ SequenceNumber upper;
+ for (size_t i = 0; i <= snapshots.size(); i++) {
+ if (i >= snapshots.size()) {
+ upper = kMaxSequenceNumber;
+ } else {
+ upper = snapshots[i];
+ }
+ if (tombstones_->ContainsRange(lower, upper)) {
+ splits.emplace(upper,
+ std::make_unique<FragmentedRangeTombstoneIterator>(
+ tombstones_, *icmp_, upper, ts_upper_bound_, lower));
+ }
+ lower = upper + 1;
+ }
+ return splits;
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/range_tombstone_fragmenter.h b/src/rocksdb/db/range_tombstone_fragmenter.h
new file mode 100644
index 000000000..df07fa894
--- /dev/null
+++ b/src/rocksdb/db/range_tombstone_fragmenter.h
@@ -0,0 +1,357 @@
+// Copyright (c) 2018-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <list>
+#include <memory>
+#include <set>
+#include <string>
+#include <vector>
+
+#include "db/dbformat.h"
+#include "db/pinned_iterators_manager.h"
+#include "rocksdb/status.h"
+#include "table/internal_iterator.h"
+
+namespace ROCKSDB_NAMESPACE {
+struct FragmentedRangeTombstoneList;
+
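+// A lazily-initialized, shareable cache entry for a fragmented tombstone
+// list. The sketch below illustrates the intended double-checked
+// initialization on its read path; it is only a sketch, and
+// BuildFragmentedList() stands in for whatever code actually builds the list:
+//
+//   if (!cache->initialized.load(std::memory_order_acquire)) {
+//     std::lock_guard<std::mutex> lock(cache->reader_mutex);
+//     if (!cache->tombstones) {
+//       cache->tombstones = BuildFragmentedList();  // hypothetical helper
+//     }
+//     cache->initialized.store(true, std::memory_order_release);
+//   }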
+struct FragmentedRangeTombstoneListCache {
+ // Ensures that only the first reader needs to initialize the tombstone list.
+ std::mutex reader_mutex;
+ std::unique_ptr<FragmentedRangeTombstoneList> tombstones = nullptr;
+ // Readers first check this flag to avoid acquiring reader_mutex once the
+ // list has already been initialized.
+ std::atomic<bool> initialized = false;
+};
+
+struct FragmentedRangeTombstoneList {
+ public:
+ // A compact representation of a "stack" of range tombstone fragments, which
+ // start and end at the same user keys but have different sequence numbers.
+ // The members seq_start_idx and seq_end_idx are intended to be parameters to
+ // seq_iter().
+ // If user-defined timestamp is enabled, `start` and `end` should be user
+ // keys with timestamp, and their timestamps are set to the max timestamp,
+ // which is what parsed_start_key()/parsed_end_key() return. seq_start_idx
+ // and seq_end_idx will also be used as parameters to ts_iter().
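+ // For example, a fragment [c, e) that is covered at seqnos 10 and 8 is
+ // stored as a single RangeTombstoneStack whose [seq_start_idx, seq_end_idx)
+ // range spans the two entries {10, 8} in tombstone_seqs_.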
+ struct RangeTombstoneStack {
+ RangeTombstoneStack(const Slice& start, const Slice& end, size_t start_idx,
+ size_t end_idx)
+ : start_key(start),
+ end_key(end),
+ seq_start_idx(start_idx),
+ seq_end_idx(end_idx) {}
+ Slice start_key;
+ Slice end_key;
+ size_t seq_start_idx;
+ size_t seq_end_idx;
+ };
+ // Assumes unfragmented_tombstones->key() and unfragmented_tombstones->value()
+ // both contain timestamp if enabled.
+ FragmentedRangeTombstoneList(
+ std::unique_ptr<InternalIterator> unfragmented_tombstones,
+ const InternalKeyComparator& icmp, bool for_compaction = false,
+ const std::vector<SequenceNumber>& snapshots = {});
+
+ std::vector<RangeTombstoneStack>::const_iterator begin() const {
+ return tombstones_.begin();
+ }
+
+ std::vector<RangeTombstoneStack>::const_iterator end() const {
+ return tombstones_.end();
+ }
+
+ std::vector<SequenceNumber>::const_iterator seq_iter(size_t idx) const {
+ return std::next(tombstone_seqs_.begin(), idx);
+ }
+
+ std::vector<Slice>::const_iterator ts_iter(size_t idx) const {
+ return std::next(tombstone_timestamps_.begin(), idx);
+ }
+
+ std::vector<SequenceNumber>::const_iterator seq_begin() const {
+ return tombstone_seqs_.begin();
+ }
+
+ std::vector<SequenceNumber>::const_iterator seq_end() const {
+ return tombstone_seqs_.end();
+ }
+
+ bool empty() const { return tombstones_.empty(); }
+
+ // Returns true if the stored tombstones contain one with a sequence
+ // number in [lower, upper].
+ // This method is not const because it lazily initializes a set of
+ // sequence numbers (`seq_set_`).
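+ // For example, if the stored fragments carry seqnos {4, 8, 10}, then
+ // ContainsRange(5, 9) is true (seqno 8 qualifies), while ContainsRange(5, 7)
+ // and ContainsRange(11, 12) are false.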
+ bool ContainsRange(SequenceNumber lower, SequenceNumber upper);
+
+ uint64_t num_unfragmented_tombstones() const {
+ return num_unfragmented_tombstones_;
+ }
+
+ uint64_t total_tombstone_payload_bytes() const {
+ return total_tombstone_payload_bytes_;
+ }
+
+ private:
+ // Given an ordered range tombstone iterator unfragmented_tombstones,
+ // "fragment" the tombstones into non-overlapping pieces. Each
+ // "non-overlapping piece" is a RangeTombstoneStack in tombstones_, which
+ // contains start_key, end_key, and indices that point to sequence numbers
+ // (in tombstone_seqs_) and timestamps (in tombstone_timestamps_). If
+ // for_compaction is true, then `snapshots` should be provided. When
+ // for_compaction is true and user-defined timestamp is disabled, range
+ // tombstone fragments that are not visible in any snapshot are dropped: for
+ // each snapshot stripe [lower, upper], only the fragment with the largest
+ // seqno in [lower, upper] is preserved, and all other range tombstones in
+ // that stripe are dropped.
+ void FragmentTombstones(
+ std::unique_ptr<InternalIterator> unfragmented_tombstones,
+ const InternalKeyComparator& icmp, bool for_compaction,
+ const std::vector<SequenceNumber>& snapshots);
+
+ std::vector<RangeTombstoneStack> tombstones_;
+ std::vector<SequenceNumber> tombstone_seqs_;
+ std::vector<Slice> tombstone_timestamps_;
+ std::once_flag seq_set_init_once_flag_;
+ std::set<SequenceNumber> seq_set_;
+ std::list<std::string> pinned_slices_;
+ PinnedIteratorsManager pinned_iters_mgr_;
+ uint64_t num_unfragmented_tombstones_;
+ uint64_t total_tombstone_payload_bytes_;
+};
+
+// FragmentedRangeTombstoneIterator converts an InternalIterator of a range-del
+// meta block into an iterator over non-overlapping tombstone fragments. The
+// tombstone fragmentation process should be more efficient than the range
+// tombstone collapsing algorithm in RangeDelAggregator because this leverages
+// the internal key ordering already provided by the input iterator, if
+// applicable (when the iterator is unsorted, a new sorted iterator is created
+// before proceeding). If there are few overlaps, creating a
+// FragmentedRangeTombstoneIterator should be O(n), while the RangeDelAggregator
+// tombstone collapsing is always O(n log n).
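+//
+// A minimal construction sketch, mirroring the unit tests (the variable names
+// here are illustrative only):
+//
+//   FragmentedRangeTombstoneList fragments(std::move(range_del_iter), icmp);
+//   FragmentedRangeTombstoneIterator iter(&fragments, icmp,
+//                                         kMaxSequenceNumber /* upper_bound */);
+//   for (iter.SeekToFirst(); iter.Valid(); iter.Next()) {
+//     // Each position is one fragment: iter.start_key(), iter.end_key(),
+//     // iter.seq().
+//   }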
+class FragmentedRangeTombstoneIterator : public InternalIterator {
+ public:
+ FragmentedRangeTombstoneIterator(FragmentedRangeTombstoneList* tombstones,
+ const InternalKeyComparator& icmp,
+ SequenceNumber upper_bound,
+ const Slice* ts_upper_bound = nullptr,
+ SequenceNumber lower_bound = 0);
+ FragmentedRangeTombstoneIterator(
+ const std::shared_ptr<FragmentedRangeTombstoneList>& tombstones,
+ const InternalKeyComparator& icmp, SequenceNumber upper_bound,
+ const Slice* ts_upper_bound = nullptr, SequenceNumber lower_bound = 0);
+ FragmentedRangeTombstoneIterator(
+ const std::shared_ptr<FragmentedRangeTombstoneListCache>& tombstones,
+ const InternalKeyComparator& icmp, SequenceNumber upper_bound,
+ const Slice* ts_upper_bound = nullptr, SequenceNumber lower_bound = 0);
+
+ void SeekToFirst() override;
+ void SeekToLast() override;
+
+ void SeekToTopFirst();
+ void SeekToTopLast();
+
+ // NOTE: Seek and SeekForPrev do not behave in the way InternalIterator
+ // seeking should behave. This is OK because they are not currently used, but
+ // eventually FragmentedRangeTombstoneIterator should no longer implement
+ // InternalIterator.
+ //
+ // Seeks to the range tombstone that covers target at a seqnum in the
+ // snapshot. If no such tombstone exists, seek to the earliest tombstone in
+ // the snapshot that ends after target.
+ void Seek(const Slice& target) override;
+ // Seeks to the range tombstone that covers target at a seqnum in the
+ // snapshot. If no such tombstone exists, seek to the latest tombstone in the
+ // snapshot that starts before target.
+ void SeekForPrev(const Slice& target) override;
+
+ void Next() override;
+ void Prev() override;
+
+ void TopNext();
+ void TopPrev();
+
+ bool Valid() const override;
+ // Note that key() and value() do not return the correct timestamp.
+ // Callers should call timestamp() to get the current timestamp.
+ Slice key() const override {
+ MaybePinKey();
+ return current_start_key_.Encode();
+ }
+ Slice value() const override { return pos_->end_key; }
+ bool IsKeyPinned() const override { return false; }
+ bool IsValuePinned() const override { return true; }
+ Status status() const override { return Status::OK(); }
+
+ bool empty() const { return tombstones_->empty(); }
+ void Invalidate() {
+ pos_ = tombstones_->end();
+ seq_pos_ = tombstones_->seq_end();
+ pinned_pos_ = tombstones_->end();
+ pinned_seq_pos_ = tombstones_->seq_end();
+ }
+
+ RangeTombstone Tombstone() const {
+ assert(Valid());
+ if (icmp_->user_comparator()->timestamp_size()) {
+ return RangeTombstone(start_key(), end_key(), seq(), timestamp());
+ }
+ return RangeTombstone(start_key(), end_key(), seq());
+ }
+ // Note that start_key() and end_key() are not guaranteed to have the
+ // correct timestamp. Callers can call timestamp() to get the correct
+ // timestamp.
+ Slice start_key() const { return pos_->start_key; }
+ Slice end_key() const { return pos_->end_key; }
+ SequenceNumber seq() const { return *seq_pos_; }
+ Slice timestamp() const {
+ // seqno and timestamp are stored in the same order.
+ return *tombstones_->ts_iter(seq_pos_ - tombstones_->seq_begin());
+ }
+ // Current use case is by CompactionRangeDelAggregator to set
+ // full_history_ts_low_.
+ void SetTimestampUpperBound(const Slice* ts_upper_bound) {
+ ts_upper_bound_ = ts_upper_bound;
+ }
+
+ ParsedInternalKey parsed_start_key() const {
+ return ParsedInternalKey(pos_->start_key, kMaxSequenceNumber,
+ kTypeRangeDeletion);
+ }
+ ParsedInternalKey parsed_end_key() const {
+ return ParsedInternalKey(pos_->end_key, kMaxSequenceNumber,
+ kTypeRangeDeletion);
+ }
+
+ // Return the max sequence number of a range tombstone that covers
+ // the given user key.
+ // If there is no covering tombstone, then 0 is returned.
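+ // For example, given fragments [a, b)@10 and [c, d)@5 (as in the
+ // NonOverlappingTombstones unit test), MaxCoveringTombstoneSeqnum("a")
+ // returns 10 and MaxCoveringTombstoneSeqnum("b") returns 0.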
+ SequenceNumber MaxCoveringTombstoneSeqnum(const Slice& user_key);
+
+ // Splits the iterator into n+1 iterators (where n is the number of
+ // snapshots), each providing a view over a "stripe" of sequence numbers. The
+ // iterators are keyed by the upper bound of their ranges (the provided
+ // snapshots + kMaxSequenceNumber).
+ //
+ // NOTE: the iterators in the returned map are no longer valid if their
+ // parent iterator is deleted, since they do not modify the refcount of the
+ // underlying tombstone list. Therefore, this map should be deleted before
+ // the parent iterator.
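+ //
+ // For example (mirroring the IteratorSplitWithSnapshots unit test), splitting
+ // with snapshots {3, 5, 7, 9} yields five iterators keyed 3, 5, 7, 9 and
+ // kMaxSequenceNumber, covering the seqnum stripes [0, 3], [4, 5], [6, 7],
+ // [8, 9] and [10, kMaxSequenceNumber] respectively.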
+ std::map<SequenceNumber, std::unique_ptr<FragmentedRangeTombstoneIterator>>
+ SplitBySnapshot(const std::vector<SequenceNumber>& snapshots);
+
+ SequenceNumber upper_bound() const { return upper_bound_; }
+ SequenceNumber lower_bound() const { return lower_bound_; }
+
+ uint64_t num_unfragmented_tombstones() const {
+ return tombstones_->num_unfragmented_tombstones();
+ }
+ uint64_t total_tombstone_payload_bytes() const {
+ return tombstones_->total_tombstone_payload_bytes();
+ }
+
+ private:
+ using RangeTombstoneStack = FragmentedRangeTombstoneList::RangeTombstoneStack;
+
+ struct RangeTombstoneStackStartComparator {
+ explicit RangeTombstoneStackStartComparator(const Comparator* c) : cmp(c) {}
+
+ bool operator()(const RangeTombstoneStack& a,
+ const RangeTombstoneStack& b) const {
+ return cmp->CompareWithoutTimestamp(a.start_key, b.start_key) < 0;
+ }
+
+ bool operator()(const RangeTombstoneStack& a, const Slice& b) const {
+ return cmp->CompareWithoutTimestamp(a.start_key, b) < 0;
+ }
+
+ bool operator()(const Slice& a, const RangeTombstoneStack& b) const {
+ return cmp->CompareWithoutTimestamp(a, b.start_key) < 0;
+ }
+
+ const Comparator* cmp;
+ };
+
+ struct RangeTombstoneStackEndComparator {
+ explicit RangeTombstoneStackEndComparator(const Comparator* c) : cmp(c) {}
+
+ bool operator()(const RangeTombstoneStack& a,
+ const RangeTombstoneStack& b) const {
+ return cmp->CompareWithoutTimestamp(a.end_key, b.end_key) < 0;
+ }
+
+ bool operator()(const RangeTombstoneStack& a, const Slice& b) const {
+ return cmp->CompareWithoutTimestamp(a.end_key, b) < 0;
+ }
+
+ bool operator()(const Slice& a, const RangeTombstoneStack& b) const {
+ return cmp->CompareWithoutTimestamp(a, b.end_key) < 0;
+ }
+
+ const Comparator* cmp;
+ };
+
+ void MaybePinKey() const {
+ if (pos_ != tombstones_->end() && seq_pos_ != tombstones_->seq_end() &&
+ (pinned_pos_ != pos_ || pinned_seq_pos_ != seq_pos_)) {
+ current_start_key_.Set(pos_->start_key, *seq_pos_, kTypeRangeDeletion);
+ pinned_pos_ = pos_;
+ pinned_seq_pos_ = seq_pos_;
+ }
+ }
+
+ void SeekToCoveringTombstone(const Slice& key);
+ void SeekForPrevToCoveringTombstone(const Slice& key);
+ void ScanForwardToVisibleTombstone();
+ void ScanBackwardToVisibleTombstone();
+ bool ValidPos() const {
+ return Valid() && seq_pos_ != tombstones_->seq_iter(pos_->seq_end_idx);
+ }
+
+ const RangeTombstoneStackStartComparator tombstone_start_cmp_;
+ const RangeTombstoneStackEndComparator tombstone_end_cmp_;
+ const InternalKeyComparator* icmp_;
+ const Comparator* ucmp_;
+ std::shared_ptr<FragmentedRangeTombstoneList> tombstones_ref_;
+ std::shared_ptr<FragmentedRangeTombstoneListCache> tombstones_cache_ref_;
+ FragmentedRangeTombstoneList* tombstones_;
+ SequenceNumber upper_bound_;
+ SequenceNumber lower_bound_;
+ // Only consider timestamps <= ts_upper_bound_.
+ const Slice* ts_upper_bound_;
+ std::vector<RangeTombstoneStack>::const_iterator pos_;
+ std::vector<SequenceNumber>::const_iterator seq_pos_;
+ mutable std::vector<RangeTombstoneStack>::const_iterator pinned_pos_;
+ mutable std::vector<SequenceNumber>::const_iterator pinned_seq_pos_;
+ mutable InternalKey current_start_key_;
+
+ // Check the current RangeTombstoneStack `pos_` against timestamp
+ // upper bound `ts_upper_bound_` and sequence number upper bound
+ // `upper_bound_`. Update the sequence number (and timestamp) pointer
+ // `seq_pos_` to the first valid position satisfying both bounds.
+ void SetMaxVisibleSeqAndTimestamp() {
+ seq_pos_ = std::lower_bound(tombstones_->seq_iter(pos_->seq_start_idx),
+ tombstones_->seq_iter(pos_->seq_end_idx),
+ upper_bound_, std::greater<SequenceNumber>());
+ if (ts_upper_bound_ && !ts_upper_bound_->empty()) {
+ auto ts_pos = std::lower_bound(
+ tombstones_->ts_iter(pos_->seq_start_idx),
+ tombstones_->ts_iter(pos_->seq_end_idx), *ts_upper_bound_,
+ [this](const Slice& s1, const Slice& s2) {
+ return ucmp_->CompareTimestamp(s1, s2) > 0;
+ });
+ auto ts_idx = ts_pos - tombstones_->ts_iter(pos_->seq_start_idx);
+ auto seq_idx = seq_pos_ - tombstones_->seq_iter(pos_->seq_start_idx);
+ if (seq_idx < ts_idx) {
+ // seq and ts are stored in non-increasing order. Only advance seq_pos_
+ // to a larger index, i.e. to a smaller sequence number and timestamp.
+ seq_pos_ = tombstones_->seq_iter(pos_->seq_start_idx + ts_idx);
+ }
+ }
+ }
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/range_tombstone_fragmenter_test.cc b/src/rocksdb/db/range_tombstone_fragmenter_test.cc
new file mode 100644
index 000000000..46b3c99b5
--- /dev/null
+++ b/src/rocksdb/db/range_tombstone_fragmenter_test.cc
@@ -0,0 +1,555 @@
+// Copyright (c) 2018-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/range_tombstone_fragmenter.h"
+
+#include "db/db_test_util.h"
+#include "db/dbformat.h"
+#include "rocksdb/comparator.h"
+#include "test_util/testutil.h"
+#include "util/vector_iterator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class RangeTombstoneFragmenterTest : public testing::Test {};
+
+namespace {
+
+static auto bytewise_icmp = InternalKeyComparator(BytewiseComparator());
+
+std::unique_ptr<InternalIterator> MakeRangeDelIter(
+ const std::vector<RangeTombstone>& range_dels) {
+ std::vector<std::string> keys, values;
+ for (const auto& range_del : range_dels) {
+ auto key_and_value = range_del.Serialize();
+ keys.push_back(key_and_value.first.Encode().ToString());
+ values.push_back(key_and_value.second.ToString());
+ }
+ return std::unique_ptr<VectorIterator>(
+ new VectorIterator(keys, values, &bytewise_icmp));
+}
+
+void CheckIterPosition(const RangeTombstone& tombstone,
+ const FragmentedRangeTombstoneIterator* iter) {
+ // Test InternalIterator interface.
+ EXPECT_EQ(tombstone.start_key_, ExtractUserKey(iter->key()));
+ EXPECT_EQ(tombstone.end_key_, iter->value());
+ EXPECT_EQ(tombstone.seq_, iter->seq());
+
+ // Test FragmentedRangeTombstoneIterator interface.
+ EXPECT_EQ(tombstone.start_key_, iter->start_key());
+ EXPECT_EQ(tombstone.end_key_, iter->end_key());
+ EXPECT_EQ(tombstone.seq_, GetInternalKeySeqno(iter->key()));
+}
+
+void VerifyFragmentedRangeDels(
+ FragmentedRangeTombstoneIterator* iter,
+ const std::vector<RangeTombstone>& expected_tombstones) {
+ iter->SeekToFirst();
+ for (size_t i = 0; i < expected_tombstones.size(); i++, iter->Next()) {
+ ASSERT_TRUE(iter->Valid());
+ CheckIterPosition(expected_tombstones[i], iter);
+ }
+ EXPECT_FALSE(iter->Valid());
+}
+
+void VerifyVisibleTombstones(
+ FragmentedRangeTombstoneIterator* iter,
+ const std::vector<RangeTombstone>& expected_tombstones) {
+ iter->SeekToTopFirst();
+ for (size_t i = 0; i < expected_tombstones.size(); i++, iter->TopNext()) {
+ ASSERT_TRUE(iter->Valid());
+ CheckIterPosition(expected_tombstones[i], iter);
+ }
+ EXPECT_FALSE(iter->Valid());
+}
+
+struct SeekTestCase {
+ Slice seek_target;
+ RangeTombstone expected_position;
+ bool out_of_range;
+};
+
+void VerifySeek(FragmentedRangeTombstoneIterator* iter,
+ const std::vector<SeekTestCase>& cases) {
+ for (const auto& testcase : cases) {
+ iter->Seek(testcase.seek_target);
+ if (testcase.out_of_range) {
+ ASSERT_FALSE(iter->Valid());
+ } else {
+ ASSERT_TRUE(iter->Valid());
+ CheckIterPosition(testcase.expected_position, iter);
+ }
+ }
+}
+
+void VerifySeekForPrev(FragmentedRangeTombstoneIterator* iter,
+ const std::vector<SeekTestCase>& cases) {
+ for (const auto& testcase : cases) {
+ iter->SeekForPrev(testcase.seek_target);
+ if (testcase.out_of_range) {
+ ASSERT_FALSE(iter->Valid());
+ } else {
+ ASSERT_TRUE(iter->Valid());
+ CheckIterPosition(testcase.expected_position, iter);
+ }
+ }
+}
+
+struct MaxCoveringTombstoneSeqnumTestCase {
+ Slice user_key;
+ SequenceNumber result;
+};
+
+void VerifyMaxCoveringTombstoneSeqnum(
+ FragmentedRangeTombstoneIterator* iter,
+ const std::vector<MaxCoveringTombstoneSeqnumTestCase>& cases) {
+ for (const auto& testcase : cases) {
+ EXPECT_EQ(testcase.result,
+ iter->MaxCoveringTombstoneSeqnum(testcase.user_key));
+ }
+}
+
+} // anonymous namespace
+
+TEST_F(RangeTombstoneFragmenterTest, NonOverlappingTombstones) {
+ auto range_del_iter = MakeRangeDelIter({{"a", "b", 10}, {"c", "d", 5}});
+
+ FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+ bytewise_icmp);
+ FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp,
+ kMaxSequenceNumber);
+ ASSERT_EQ(0, iter.lower_bound());
+ ASSERT_EQ(kMaxSequenceNumber, iter.upper_bound());
+ VerifyFragmentedRangeDels(&iter, {{"a", "b", 10}, {"c", "d", 5}});
+ VerifyMaxCoveringTombstoneSeqnum(&iter,
+ {{"", 0}, {"a", 10}, {"b", 0}, {"c", 5}});
+}
+
+TEST_F(RangeTombstoneFragmenterTest, OverlappingTombstones) {
+ auto range_del_iter = MakeRangeDelIter({{"a", "e", 10}, {"c", "g", 15}});
+
+ FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+ bytewise_icmp);
+ FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp,
+ kMaxSequenceNumber);
+ ASSERT_EQ(0, iter.lower_bound());
+ ASSERT_EQ(kMaxSequenceNumber, iter.upper_bound());
+ VerifyFragmentedRangeDels(
+ &iter, {{"a", "c", 10}, {"c", "e", 15}, {"c", "e", 10}, {"e", "g", 15}});
+ VerifyMaxCoveringTombstoneSeqnum(&iter,
+ {{"a", 10}, {"c", 15}, {"e", 15}, {"g", 0}});
+}
+
+TEST_F(RangeTombstoneFragmenterTest, ContiguousTombstones) {
+ auto range_del_iter = MakeRangeDelIter(
+ {{"a", "c", 10}, {"c", "e", 20}, {"c", "e", 5}, {"e", "g", 15}});
+
+ FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+ bytewise_icmp);
+ FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp,
+ kMaxSequenceNumber);
+ ASSERT_EQ(0, iter.lower_bound());
+ ASSERT_EQ(kMaxSequenceNumber, iter.upper_bound());
+ VerifyFragmentedRangeDels(
+ &iter, {{"a", "c", 10}, {"c", "e", 20}, {"c", "e", 5}, {"e", "g", 15}});
+ VerifyMaxCoveringTombstoneSeqnum(&iter,
+ {{"a", 10}, {"c", 20}, {"e", 15}, {"g", 0}});
+}
+
+TEST_F(RangeTombstoneFragmenterTest, RepeatedStartAndEndKey) {
+ auto range_del_iter =
+ MakeRangeDelIter({{"a", "c", 10}, {"a", "c", 7}, {"a", "c", 3}});
+
+ FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+ bytewise_icmp);
+ FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp,
+ kMaxSequenceNumber);
+ ASSERT_EQ(0, iter.lower_bound());
+ ASSERT_EQ(kMaxSequenceNumber, iter.upper_bound());
+ VerifyFragmentedRangeDels(&iter,
+ {{"a", "c", 10}, {"a", "c", 7}, {"a", "c", 3}});
+ VerifyMaxCoveringTombstoneSeqnum(&iter, {{"a", 10}, {"b", 10}, {"c", 0}});
+}
+
+TEST_F(RangeTombstoneFragmenterTest, RepeatedStartKeyDifferentEndKeys) {
+ auto range_del_iter =
+ MakeRangeDelIter({{"a", "e", 10}, {"a", "g", 7}, {"a", "c", 3}});
+
+ FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+ bytewise_icmp);
+ FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp,
+ kMaxSequenceNumber);
+ ASSERT_EQ(0, iter.lower_bound());
+ ASSERT_EQ(kMaxSequenceNumber, iter.upper_bound());
+ VerifyFragmentedRangeDels(&iter, {{"a", "c", 10},
+ {"a", "c", 7},
+ {"a", "c", 3},
+ {"c", "e", 10},
+ {"c", "e", 7},
+ {"e", "g", 7}});
+ VerifyMaxCoveringTombstoneSeqnum(&iter,
+ {{"a", 10}, {"c", 10}, {"e", 7}, {"g", 0}});
+}
+
+TEST_F(RangeTombstoneFragmenterTest, RepeatedStartKeyMixedEndKeys) {
+ auto range_del_iter = MakeRangeDelIter({{"a", "c", 30},
+ {"a", "g", 20},
+ {"a", "e", 10},
+ {"a", "g", 7},
+ {"a", "c", 3}});
+
+ FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+ bytewise_icmp);
+ FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp,
+ kMaxSequenceNumber);
+ ASSERT_EQ(0, iter.lower_bound());
+ ASSERT_EQ(kMaxSequenceNumber, iter.upper_bound());
+ VerifyFragmentedRangeDels(&iter, {{"a", "c", 30},
+ {"a", "c", 20},
+ {"a", "c", 10},
+ {"a", "c", 7},
+ {"a", "c", 3},
+ {"c", "e", 20},
+ {"c", "e", 10},
+ {"c", "e", 7},
+ {"e", "g", 20},
+ {"e", "g", 7}});
+ VerifyMaxCoveringTombstoneSeqnum(&iter,
+ {{"a", 30}, {"c", 20}, {"e", 20}, {"g", 0}});
+}
+
+TEST_F(RangeTombstoneFragmenterTest, OverlapAndRepeatedStartKey) {
+ auto range_del_iter = MakeRangeDelIter({{"a", "e", 10},
+ {"c", "g", 8},
+ {"c", "i", 6},
+ {"j", "n", 4},
+ {"j", "l", 2}});
+
+ FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+ bytewise_icmp);
+ FragmentedRangeTombstoneIterator iter1(&fragment_list, bytewise_icmp,
+ kMaxSequenceNumber);
+ FragmentedRangeTombstoneIterator iter2(&fragment_list, bytewise_icmp,
+ 9 /* upper_bound */);
+ FragmentedRangeTombstoneIterator iter3(&fragment_list, bytewise_icmp,
+ 7 /* upper_bound */);
+ FragmentedRangeTombstoneIterator iter4(&fragment_list, bytewise_icmp,
+ 5 /* upper_bound */);
+ FragmentedRangeTombstoneIterator iter5(&fragment_list, bytewise_icmp,
+ 3 /* upper_bound */);
+ for (auto* iter : {&iter1, &iter2, &iter3, &iter4, &iter5}) {
+ VerifyFragmentedRangeDels(iter, {{"a", "c", 10},
+ {"c", "e", 10},
+ {"c", "e", 8},
+ {"c", "e", 6},
+ {"e", "g", 8},
+ {"e", "g", 6},
+ {"g", "i", 6},
+ {"j", "l", 4},
+ {"j", "l", 2},
+ {"l", "n", 4}});
+ }
+
+ ASSERT_EQ(0, iter1.lower_bound());
+ ASSERT_EQ(kMaxSequenceNumber, iter1.upper_bound());
+ VerifyVisibleTombstones(&iter1, {{"a", "c", 10},
+ {"c", "e", 10},
+ {"e", "g", 8},
+ {"g", "i", 6},
+ {"j", "l", 4},
+ {"l", "n", 4}});
+ VerifyMaxCoveringTombstoneSeqnum(
+ &iter1, {{"a", 10}, {"c", 10}, {"e", 8}, {"i", 0}, {"j", 4}, {"m", 4}});
+
+ ASSERT_EQ(0, iter2.lower_bound());
+ ASSERT_EQ(9, iter2.upper_bound());
+ VerifyVisibleTombstones(&iter2, {{"c", "e", 8},
+ {"e", "g", 8},
+ {"g", "i", 6},
+ {"j", "l", 4},
+ {"l", "n", 4}});
+ VerifyMaxCoveringTombstoneSeqnum(
+ &iter2, {{"a", 0}, {"c", 8}, {"e", 8}, {"i", 0}, {"j", 4}, {"m", 4}});
+
+ ASSERT_EQ(0, iter3.lower_bound());
+ ASSERT_EQ(7, iter3.upper_bound());
+ VerifyVisibleTombstones(&iter3, {{"c", "e", 6},
+ {"e", "g", 6},
+ {"g", "i", 6},
+ {"j", "l", 4},
+ {"l", "n", 4}});
+ VerifyMaxCoveringTombstoneSeqnum(
+ &iter3, {{"a", 0}, {"c", 6}, {"e", 6}, {"i", 0}, {"j", 4}, {"m", 4}});
+
+ ASSERT_EQ(0, iter4.lower_bound());
+ ASSERT_EQ(5, iter4.upper_bound());
+ VerifyVisibleTombstones(&iter4, {{"j", "l", 4}, {"l", "n", 4}});
+ VerifyMaxCoveringTombstoneSeqnum(
+ &iter4, {{"a", 0}, {"c", 0}, {"e", 0}, {"i", 0}, {"j", 4}, {"m", 4}});
+
+ ASSERT_EQ(0, iter5.lower_bound());
+ ASSERT_EQ(3, iter5.upper_bound());
+ VerifyVisibleTombstones(&iter5, {{"j", "l", 2}});
+ VerifyMaxCoveringTombstoneSeqnum(
+ &iter5, {{"a", 0}, {"c", 0}, {"e", 0}, {"i", 0}, {"j", 2}, {"m", 0}});
+}
+
+TEST_F(RangeTombstoneFragmenterTest, OverlapAndRepeatedStartKeyUnordered) {
+ auto range_del_iter = MakeRangeDelIter({{"a", "e", 10},
+ {"j", "n", 4},
+ {"c", "i", 6},
+ {"c", "g", 8},
+ {"j", "l", 2}});
+
+ FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+ bytewise_icmp);
+ FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp,
+ 9 /* upper_bound */);
+ ASSERT_EQ(0, iter.lower_bound());
+ ASSERT_EQ(9, iter.upper_bound());
+ VerifyFragmentedRangeDels(&iter, {{"a", "c", 10},
+ {"c", "e", 10},
+ {"c", "e", 8},
+ {"c", "e", 6},
+ {"e", "g", 8},
+ {"e", "g", 6},
+ {"g", "i", 6},
+ {"j", "l", 4},
+ {"j", "l", 2},
+ {"l", "n", 4}});
+ VerifyMaxCoveringTombstoneSeqnum(
+ &iter, {{"a", 0}, {"c", 8}, {"e", 8}, {"i", 0}, {"j", 4}, {"m", 4}});
+}
+
+TEST_F(RangeTombstoneFragmenterTest, OverlapAndRepeatedStartKeyForCompaction) {
+ auto range_del_iter = MakeRangeDelIter({{"a", "e", 10},
+ {"j", "n", 4},
+ {"c", "i", 6},
+ {"c", "g", 8},
+ {"j", "l", 2}});
+
+ FragmentedRangeTombstoneList fragment_list(
+ std::move(range_del_iter), bytewise_icmp, true /* for_compaction */,
+ {} /* snapshots */);
+ FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp,
+ kMaxSequenceNumber /* upper_bound */);
+ VerifyFragmentedRangeDels(&iter, {{"a", "c", 10},
+ {"c", "e", 10},
+ {"e", "g", 8},
+ {"g", "i", 6},
+ {"j", "l", 4},
+ {"l", "n", 4}});
+}
+
+TEST_F(RangeTombstoneFragmenterTest,
+ OverlapAndRepeatedStartKeyForCompactionWithSnapshot) {
+ auto range_del_iter = MakeRangeDelIter({{"a", "e", 10},
+ {"j", "n", 4},
+ {"c", "i", 6},
+ {"c", "g", 8},
+ {"j", "l", 2}});
+
+ FragmentedRangeTombstoneList fragment_list(
+ std::move(range_del_iter), bytewise_icmp, true /* for_compaction */,
+ {20, 9} /* snapshots */);
+ FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp,
+ kMaxSequenceNumber /* upper_bound */);
+ VerifyFragmentedRangeDels(&iter, {{"a", "c", 10},
+ {"c", "e", 10},
+ {"c", "e", 8},
+ {"e", "g", 8},
+ {"g", "i", 6},
+ {"j", "l", 4},
+ {"l", "n", 4}});
+}
+
+TEST_F(RangeTombstoneFragmenterTest, IteratorSplitNoSnapshots) {
+ auto range_del_iter = MakeRangeDelIter({{"a", "e", 10},
+ {"j", "n", 4},
+ {"c", "i", 6},
+ {"c", "g", 8},
+ {"j", "l", 2}});
+
+ FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+ bytewise_icmp);
+ FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp,
+ kMaxSequenceNumber /* upper_bound */);
+
+ auto split_iters = iter.SplitBySnapshot({} /* snapshots */);
+ ASSERT_EQ(1, split_iters.size());
+
+ auto* split_iter = split_iters[kMaxSequenceNumber].get();
+ ASSERT_EQ(0, split_iter->lower_bound());
+ ASSERT_EQ(kMaxSequenceNumber, split_iter->upper_bound());
+ VerifyVisibleTombstones(split_iter, {{"a", "c", 10},
+ {"c", "e", 10},
+ {"e", "g", 8},
+ {"g", "i", 6},
+ {"j", "l", 4},
+ {"l", "n", 4}});
+}
+
+TEST_F(RangeTombstoneFragmenterTest, IteratorSplitWithSnapshots) {
+ auto range_del_iter = MakeRangeDelIter({{"a", "e", 10},
+ {"j", "n", 4},
+ {"c", "i", 6},
+ {"c", "g", 8},
+ {"j", "l", 2}});
+
+ FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+ bytewise_icmp);
+ FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp,
+ kMaxSequenceNumber /* upper_bound */);
+
+ auto split_iters = iter.SplitBySnapshot({3, 5, 7, 9} /* snapshots */);
+ ASSERT_EQ(5, split_iters.size());
+
+ auto* split_iter1 = split_iters[3].get();
+ ASSERT_EQ(0, split_iter1->lower_bound());
+ ASSERT_EQ(3, split_iter1->upper_bound());
+ VerifyVisibleTombstones(split_iter1, {{"j", "l", 2}});
+
+ auto* split_iter2 = split_iters[5].get();
+ ASSERT_EQ(4, split_iter2->lower_bound());
+ ASSERT_EQ(5, split_iter2->upper_bound());
+ VerifyVisibleTombstones(split_iter2, {{"j", "l", 4}, {"l", "n", 4}});
+
+ auto* split_iter3 = split_iters[7].get();
+ ASSERT_EQ(6, split_iter3->lower_bound());
+ ASSERT_EQ(7, split_iter3->upper_bound());
+ VerifyVisibleTombstones(split_iter3,
+ {{"c", "e", 6}, {"e", "g", 6}, {"g", "i", 6}});
+
+ auto* split_iter4 = split_iters[9].get();
+ ASSERT_EQ(8, split_iter4->lower_bound());
+ ASSERT_EQ(9, split_iter4->upper_bound());
+ VerifyVisibleTombstones(split_iter4, {{"c", "e", 8}, {"e", "g", 8}});
+
+ auto* split_iter5 = split_iters[kMaxSequenceNumber].get();
+ ASSERT_EQ(10, split_iter5->lower_bound());
+ ASSERT_EQ(kMaxSequenceNumber, split_iter5->upper_bound());
+ VerifyVisibleTombstones(split_iter5, {{"a", "c", 10}, {"c", "e", 10}});
+}
+
+TEST_F(RangeTombstoneFragmenterTest, SeekStartKey) {
+ // Same tombstones as OverlapAndRepeatedStartKey.
+ auto range_del_iter = MakeRangeDelIter({{"a", "e", 10},
+ {"c", "g", 8},
+ {"c", "i", 6},
+ {"j", "n", 4},
+ {"j", "l", 2}});
+
+ FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+ bytewise_icmp);
+
+ FragmentedRangeTombstoneIterator iter1(&fragment_list, bytewise_icmp,
+ kMaxSequenceNumber);
+ VerifySeek(
+ &iter1,
+ {{"a", {"a", "c", 10}}, {"e", {"e", "g", 8}}, {"l", {"l", "n", 4}}});
+ VerifySeekForPrev(
+ &iter1,
+ {{"a", {"a", "c", 10}}, {"e", {"e", "g", 8}}, {"l", {"l", "n", 4}}});
+
+ FragmentedRangeTombstoneIterator iter2(&fragment_list, bytewise_icmp,
+ 3 /* upper_bound */);
+ VerifySeek(&iter2, {{"a", {"j", "l", 2}},
+ {"e", {"j", "l", 2}},
+ {"l", {}, true /* out of range */}});
+ VerifySeekForPrev(&iter2, {{"a", {}, true /* out of range */},
+ {"e", {}, true /* out of range */},
+ {"l", {"j", "l", 2}}});
+}
+
+TEST_F(RangeTombstoneFragmenterTest, SeekCovered) {
+ // Same tombstones as OverlapAndRepeatedStartKey.
+ auto range_del_iter = MakeRangeDelIter({{"a", "e", 10},
+ {"c", "g", 8},
+ {"c", "i", 6},
+ {"j", "n", 4},
+ {"j", "l", 2}});
+
+ FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+ bytewise_icmp);
+
+ FragmentedRangeTombstoneIterator iter1(&fragment_list, bytewise_icmp,
+ kMaxSequenceNumber);
+ VerifySeek(
+ &iter1,
+ {{"b", {"a", "c", 10}}, {"f", {"e", "g", 8}}, {"m", {"l", "n", 4}}});
+ VerifySeekForPrev(
+ &iter1,
+ {{"b", {"a", "c", 10}}, {"f", {"e", "g", 8}}, {"m", {"l", "n", 4}}});
+
+ FragmentedRangeTombstoneIterator iter2(&fragment_list, bytewise_icmp,
+ 3 /* upper_bound */);
+ VerifySeek(&iter2, {{"b", {"j", "l", 2}},
+ {"f", {"j", "l", 2}},
+ {"m", {}, true /* out of range */}});
+ VerifySeekForPrev(&iter2, {{"b", {}, true /* out of range */},
+ {"f", {}, true /* out of range */},
+ {"m", {"j", "l", 2}}});
+}
+
+TEST_F(RangeTombstoneFragmenterTest, SeekEndKey) {
+ // Same tombstones as OverlapAndRepeatedStartKey.
+ auto range_del_iter = MakeRangeDelIter({{"a", "e", 10},
+ {"c", "g", 8},
+ {"c", "i", 6},
+ {"j", "n", 4},
+ {"j", "l", 2}});
+
+ FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+ bytewise_icmp);
+
+ FragmentedRangeTombstoneIterator iter1(&fragment_list, bytewise_icmp,
+ kMaxSequenceNumber);
+ VerifySeek(&iter1, {{"c", {"c", "e", 10}},
+ {"g", {"g", "i", 6}},
+ {"i", {"j", "l", 4}},
+ {"n", {}, true /* out of range */}});
+ VerifySeekForPrev(&iter1, {{"c", {"c", "e", 10}},
+ {"g", {"g", "i", 6}},
+ {"i", {"g", "i", 6}},
+ {"n", {"l", "n", 4}}});
+
+ FragmentedRangeTombstoneIterator iter2(&fragment_list, bytewise_icmp,
+ 3 /* upper_bound */);
+ VerifySeek(&iter2, {{"c", {"j", "l", 2}},
+ {"g", {"j", "l", 2}},
+ {"i", {"j", "l", 2}},
+ {"n", {}, true /* out of range */}});
+ VerifySeekForPrev(&iter2, {{"c", {}, true /* out of range */},
+ {"g", {}, true /* out of range */},
+ {"i", {}, true /* out of range */},
+ {"n", {"j", "l", 2}}});
+}
+
+TEST_F(RangeTombstoneFragmenterTest, SeekOutOfBounds) {
+ // Same tombstones as OverlapAndRepeatedStartKey.
+ auto range_del_iter = MakeRangeDelIter({{"a", "e", 10},
+ {"c", "g", 8},
+ {"c", "i", 6},
+ {"j", "n", 4},
+ {"j", "l", 2}});
+
+ FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+ bytewise_icmp);
+
+ FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp,
+ kMaxSequenceNumber);
+ VerifySeek(&iter, {{"", {"a", "c", 10}}, {"z", {}, true /* out of range */}});
+ VerifySeekForPrev(&iter,
+ {{"", {}, true /* out of range */}, {"z", {"l", "n", 4}}});
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/read_callback.h b/src/rocksdb/db/read_callback.h
new file mode 100644
index 000000000..c042352db
--- /dev/null
+++ b/src/rocksdb/db/read_callback.h
@@ -0,0 +1,54 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include "db/dbformat.h"
+#include "rocksdb/types.h"
+
+namespace ROCKSDB_NAMESPACE {
+
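+// Callback used during reads to decide whether a particular sequence number
+// is visible to the reader. A minimal illustrative subclass is sketched below
+// (hypothetical; real implementations, e.g. for write-prepared transactions,
+// apply their own visibility rules in IsVisibleFullCheck):
+//
+//   class SnapshotReadCallback : public ReadCallback {
+//    public:
+//     explicit SnapshotReadCallback(SequenceNumber snapshot_seq)
+//         : ReadCallback(snapshot_seq) {}
+//     // Every seqno at or below the snapshot is visible.
+//     bool IsVisibleFullCheck(SequenceNumber seq) override {
+//       return seq <= max_visible_seq();
+//     }
+//   };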
+class ReadCallback {
+ public:
+ explicit ReadCallback(SequenceNumber last_visible_seq)
+ : max_visible_seq_(last_visible_seq) {}
+ ReadCallback(SequenceNumber last_visible_seq, SequenceNumber min_uncommitted)
+ : max_visible_seq_(last_visible_seq), min_uncommitted_(min_uncommitted) {}
+
+ virtual ~ReadCallback() {}
+
+ // Will be called to see if the seq number is visible; if not, the caller
+ // moves on to the next seq number.
+ virtual bool IsVisibleFullCheck(SequenceNumber seq) = 0;
+
+ inline bool IsVisible(SequenceNumber seq) {
+ assert(min_uncommitted_ > 0);
+ assert(min_uncommitted_ >= kMinUnCommittedSeq);
+ if (seq < min_uncommitted_) { // handles seq == 0 as well
+ assert(seq <= max_visible_seq_);
+ return true;
+ } else if (max_visible_seq_ < seq) {
+ assert(seq != 0);
+ return false;
+ } else {
+ assert(seq != 0); // already handled in the first if-then clause
+ return IsVisibleFullCheck(seq);
+ }
+ }
+
+ inline SequenceNumber max_visible_seq() { return max_visible_seq_; }
+
+ // Refresh to a more recent visible seq
+ virtual void Refresh(SequenceNumber seq) { max_visible_seq_ = seq; }
+
+ protected:
+ // The max visible seq; it is usually the snapshot seq but could be larger
+ // if the transaction has its own writes written to the db.
+ SequenceNumber max_visible_seq_ = kMaxSequenceNumber;
+ // Any seq less than min_uncommitted_ is committed.
+ const SequenceNumber min_uncommitted_ = kMinUnCommittedSeq;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/repair.cc b/src/rocksdb/db/repair.cc
new file mode 100644
index 000000000..1829a79f2
--- /dev/null
+++ b/src/rocksdb/db/repair.cc
@@ -0,0 +1,771 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Repairer does best-effort recovery to salvage as much data as possible after
+// a disaster without compromising consistency. It does not guarantee bringing
+// the database to a time-consistent state.
+//
+// The repair process is broken into 4 phases:
+// (a) Find files
+// (b) Convert logs to tables
+// (c) Extract metadata
+// (d) Write Descriptor
+//
+// (a) Find files
+//
+// The repairer goes through all the files in the directory, and classifies them
+// based on their file name. Any file that cannot be identified by name will be
+// ignored.
+//
+// (b) Convert logs to tables
+//
+// Every active log file is replayed. All sections of the file where the
+// checksum does not match are skipped over. We intentionally give preference
+// to data consistency.
+//
+// (c) Extract metadata
+//
+// We scan every table to compute
+// (1) smallest/largest for the table
+// (2) largest sequence number in the table
+// (3) oldest blob file referred to by the table (if applicable)
+//
+// If we are unable to scan the file, then we ignore the table.
+//
+// (d) Write Descriptor
+//
+// We generate descriptor contents:
+// - log number is set to zero
+// - next-file-number is set to 1 + largest file number we found
+// - last-sequence-number is set to the largest sequence# found across
+// all tables (see item (2) under phase (c))
+// - compaction pointers are cleared
+// - every table file is added at level 0
+//
+// Possible optimization 1:
+// (a) Compute total size and use to pick appropriate max-level M
+// (b) Sort tables by largest sequence# in the table
+// (c) For each table: if it overlaps earlier table, place in level-0,
+// else place in level-M.
+// (d) We can provide options for time consistent recovery and unsafe recovery
+// (ignore checksum failure when applicable)
+// Possible optimization 2:
+// Store per-table metadata (smallest, largest, largest-seq#, ...)
+// in the table's meta section to speed up ScanTable.
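+//
+// Illustrative usage (the db path here is hypothetical):
+//
+//   Options options;
+//   Status s = RepairDB("/path/to/db", options);
+//   if (s.ok()) {
+//     DB* db = nullptr;
+//     s = DB::Open(options, "/path/to/db", &db);
+//     // ... use db; data that could not be recovered may be missing, and the
+//     // original damaged/unreferenced files are archived under "lost/".
+//     delete db;
+//   }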
+
+#ifndef ROCKSDB_LITE
+
+#include <cinttypes>
+
+#include "db/builder.h"
+#include "db/db_impl/db_impl.h"
+#include "db/dbformat.h"
+#include "db/log_reader.h"
+#include "db/log_writer.h"
+#include "db/memtable.h"
+#include "db/table_cache.h"
+#include "db/version_edit.h"
+#include "db/write_batch_internal.h"
+#include "file/filename.h"
+#include "file/writable_file_writer.h"
+#include "logging/logging.h"
+#include "options/cf_options.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/options.h"
+#include "rocksdb/write_buffer_manager.h"
+#include "table/scoped_arena_iterator.h"
+#include "table/unique_id_impl.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+
+class Repairer {
+ public:
+ Repairer(const std::string& dbname, const DBOptions& db_options,
+ const std::vector<ColumnFamilyDescriptor>& column_families,
+ const ColumnFamilyOptions& default_cf_opts,
+ const ColumnFamilyOptions& unknown_cf_opts, bool create_unknown_cfs)
+ : dbname_(dbname),
+ db_session_id_(DBImpl::GenerateDbSessionId(db_options.env)),
+ env_(db_options.env),
+ file_options_(),
+ db_options_(SanitizeOptions(dbname_, db_options)),
+ immutable_db_options_(ImmutableDBOptions(db_options_)),
+ icmp_(default_cf_opts.comparator),
+ default_cf_opts_(
+ SanitizeOptions(immutable_db_options_, default_cf_opts)),
+ default_iopts_(
+ ImmutableOptions(immutable_db_options_, default_cf_opts_)),
+ unknown_cf_opts_(
+ SanitizeOptions(immutable_db_options_, unknown_cf_opts)),
+ create_unknown_cfs_(create_unknown_cfs),
+ raw_table_cache_(
+ // TableCache can be small since we expect each table to be opened
+ // once.
+ NewLRUCache(10, db_options_.table_cache_numshardbits)),
+ table_cache_(new TableCache(default_iopts_, &file_options_,
+ raw_table_cache_.get(),
+ /*block_cache_tracer=*/nullptr,
+ /*io_tracer=*/nullptr, db_session_id_)),
+ wb_(db_options_.db_write_buffer_size),
+ wc_(db_options_.delayed_write_rate),
+ vset_(dbname_, &immutable_db_options_, file_options_,
+ raw_table_cache_.get(), &wb_, &wc_,
+ /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
+ /*db_id=*/"", db_session_id_),
+ next_file_number_(1),
+ db_lock_(nullptr),
+ closed_(false) {
+ for (const auto& cfd : column_families) {
+ cf_name_to_opts_[cfd.name] = cfd.options;
+ }
+ }
+
+ const ColumnFamilyOptions* GetColumnFamilyOptions(
+ const std::string& cf_name) {
+ if (cf_name_to_opts_.find(cf_name) == cf_name_to_opts_.end()) {
+ if (create_unknown_cfs_) {
+ return &unknown_cf_opts_;
+ }
+ return nullptr;
+ }
+ return &cf_name_to_opts_[cf_name];
+ }
+
+ // Adds a column family to the VersionSet using the options registered for
+ // cf_name and updates the manifest.
+ Status AddColumnFamily(const std::string& cf_name, uint32_t cf_id) {
+ const auto* cf_opts = GetColumnFamilyOptions(cf_name);
+ if (cf_opts == nullptr) {
+ return Status::Corruption("Encountered unknown column family with name=" +
+ cf_name + ", id=" + std::to_string(cf_id));
+ }
+ Options opts(db_options_, *cf_opts);
+ MutableCFOptions mut_cf_opts(opts);
+
+ VersionEdit edit;
+ edit.SetComparatorName(opts.comparator->Name());
+ edit.SetLogNumber(0);
+ edit.SetColumnFamily(cf_id);
+ ColumnFamilyData* cfd;
+ cfd = nullptr;
+ edit.AddColumnFamily(cf_name);
+
+ mutex_.Lock();
+ std::unique_ptr<FSDirectory> db_dir;
+ Status status = env_->GetFileSystem()->NewDirectory(dbname_, IOOptions(),
+ &db_dir, nullptr);
+ if (status.ok()) {
+ status = vset_.LogAndApply(cfd, mut_cf_opts, &edit, &mutex_, db_dir.get(),
+ false /* new_descriptor_log */, cf_opts);
+ }
+ mutex_.Unlock();
+ return status;
+ }
+
+ Status Close() {
+ Status s = Status::OK();
+ if (!closed_) {
+ if (db_lock_ != nullptr) {
+ s = env_->UnlockFile(db_lock_);
+ db_lock_ = nullptr;
+ }
+ closed_ = true;
+ }
+ return s;
+ }
+
+ ~Repairer() { Close().PermitUncheckedError(); }
+
+ Status Run() {
+ Status status = env_->LockFile(LockFileName(dbname_), &db_lock_);
+ if (!status.ok()) {
+ return status;
+ }
+ status = FindFiles();
+ DBImpl* db_impl = nullptr;
+ if (status.ok()) {
+ // Discard older manifests and start a fresh one
+ for (size_t i = 0; i < manifests_.size(); i++) {
+ ArchiveFile(dbname_ + "/" + manifests_[i]);
+ }
+ // Just create a DBImpl temporarily so we can reuse NewDB()
+ db_impl = new DBImpl(db_options_, dbname_);
+ status = db_impl->NewDB(/*new_filenames=*/nullptr);
+ }
+ delete db_impl;
+
+ if (status.ok()) {
+ // Recover using the fresh manifest created by NewDB()
+ status =
+ vset_.Recover({{kDefaultColumnFamilyName, default_cf_opts_}}, false);
+ }
+ if (status.ok()) {
+ // Need to scan existing SST files first so the column families are
+ // created before we process WAL files
+ ExtractMetaData();
+
+ // ExtractMetaData() uses table_fds_ to know which SST files' metadata to
+ // extract -- we need to clear it here since metadata for existing SST
+ // files has been extracted already
+ table_fds_.clear();
+ ConvertLogFilesToTables();
+ ExtractMetaData();
+ status = AddTables();
+ }
+ if (status.ok()) {
+ uint64_t bytes = 0;
+ for (size_t i = 0; i < tables_.size(); i++) {
+ bytes += tables_[i].meta.fd.GetFileSize();
+ }
+ ROCKS_LOG_WARN(db_options_.info_log,
+ "**** Repaired rocksdb %s; "
+ "recovered %" ROCKSDB_PRIszt " files; %" PRIu64
+ " bytes. "
+ "Some data may have been lost. "
+ "****",
+ dbname_.c_str(), tables_.size(), bytes);
+ }
+ return status;
+ }
+
+ private:
+ struct TableInfo {
+ FileMetaData meta;
+ uint32_t column_family_id;
+ std::string column_family_name;
+ };
+
+ std::string const dbname_;
+ std::string db_session_id_;
+ Env* const env_;
+ const FileOptions file_options_;
+ const DBOptions db_options_;
+ const ImmutableDBOptions immutable_db_options_;
+ const InternalKeyComparator icmp_;
+ const ColumnFamilyOptions default_cf_opts_;
+ const ImmutableOptions default_iopts_; // table_cache_ holds reference
+ const ColumnFamilyOptions unknown_cf_opts_;
+ const bool create_unknown_cfs_;
+ std::shared_ptr<Cache> raw_table_cache_;
+ std::unique_ptr<TableCache> table_cache_;
+ WriteBufferManager wb_;
+ WriteController wc_;
+ VersionSet vset_;
+ std::unordered_map<std::string, ColumnFamilyOptions> cf_name_to_opts_;
+ InstrumentedMutex mutex_;
+
+ std::vector<std::string> manifests_;
+ std::vector<FileDescriptor> table_fds_;
+ std::vector<uint64_t> logs_;
+ std::vector<TableInfo> tables_;
+ uint64_t next_file_number_;
+ // Lock over the persistent DB state. Non-nullptr iff successfully
+ // acquired.
+ FileLock* db_lock_;
+ bool closed_;
+
+ Status FindFiles() {
+ std::vector<std::string> filenames;
+ bool found_file = false;
+ std::vector<std::string> to_search_paths;
+
+ for (size_t path_id = 0; path_id < db_options_.db_paths.size(); path_id++) {
+ to_search_paths.push_back(db_options_.db_paths[path_id].path);
+ }
+
+ // Also search wal_dir if the user configured a custom wal_dir.
+ bool same = immutable_db_options_.IsWalDirSameAsDBPath(dbname_);
+ if (!same) {
+ to_search_paths.push_back(immutable_db_options_.wal_dir);
+ }
+
+ for (size_t path_id = 0; path_id < to_search_paths.size(); path_id++) {
+ ROCKS_LOG_INFO(db_options_.info_log, "Searching path %s\n",
+ to_search_paths[path_id].c_str());
+ Status status = env_->GetChildren(to_search_paths[path_id], &filenames);
+ if (!status.ok()) {
+ return status;
+ }
+ if (!filenames.empty()) {
+ found_file = true;
+ }
+
+ uint64_t number;
+ FileType type;
+ for (size_t i = 0; i < filenames.size(); i++) {
+ if (ParseFileName(filenames[i], &number, &type)) {
+ if (type == kDescriptorFile) {
+ manifests_.push_back(filenames[i]);
+ } else {
+ if (number + 1 > next_file_number_) {
+ next_file_number_ = number + 1;
+ }
+ if (type == kWalFile) {
+ logs_.push_back(number);
+ } else if (type == kTableFile) {
+ table_fds_.emplace_back(number, static_cast<uint32_t>(path_id),
+ 0);
+ } else {
+ // Ignore other files
+ }
+ }
+ }
+ }
+ }
+ if (!found_file) {
+ return Status::Corruption(dbname_, "repair found no files");
+ }
+ return Status::OK();
+ }
+
+ void ConvertLogFilesToTables() {
+ const auto& wal_dir = immutable_db_options_.GetWalDir();
+ for (size_t i = 0; i < logs_.size(); i++) {
+ // We should use LogFileName(wal_dir, logs_[i]) here; the user might have
+ // set the wal_dir option.
+ std::string logname = LogFileName(wal_dir, logs_[i]);
+ Status status = ConvertLogToTable(wal_dir, logs_[i]);
+ if (!status.ok()) {
+ ROCKS_LOG_WARN(db_options_.info_log,
+ "Log #%" PRIu64 ": ignoring conversion error: %s",
+ logs_[i], status.ToString().c_str());
+ }
+ ArchiveFile(logname);
+ }
+ }
+
+ Status ConvertLogToTable(const std::string& wal_dir, uint64_t log) {
+ struct LogReporter : public log::Reader::Reporter {
+ Env* env;
+ std::shared_ptr<Logger> info_log;
+ uint64_t lognum;
+ void Corruption(size_t bytes, const Status& s) override {
+ // We print error messages for corruption, but continue repairing.
+ ROCKS_LOG_ERROR(info_log, "Log #%" PRIu64 ": dropping %d bytes; %s",
+ lognum, static_cast<int>(bytes), s.ToString().c_str());
+ }
+ };
+
+ // Open the log file
+ std::string logname = LogFileName(wal_dir, log);
+ const auto& fs = env_->GetFileSystem();
+ std::unique_ptr<SequentialFileReader> lfile_reader;
+ Status status = SequentialFileReader::Create(
+ fs, logname, fs->OptimizeForLogRead(file_options_), &lfile_reader,
+ nullptr /* dbg */, nullptr /* rate limiter */);
+ if (!status.ok()) {
+ return status;
+ }
+
+ // Create the log reader.
+ LogReporter reporter;
+ reporter.env = env_;
+ reporter.info_log = db_options_.info_log;
+ reporter.lognum = log;
+ // We intentionally make log::Reader do checksumming so that
+ // corruptions cause entire commits to be skipped instead of
+ // propagating bad information (like overly large sequence
+ // numbers).
+ log::Reader reader(db_options_.info_log, std::move(lfile_reader), &reporter,
+ true /*enable checksum*/, log);
+
+ // Initialize per-column family memtables
+ for (auto* cfd : *vset_.GetColumnFamilySet()) {
+ cfd->CreateNewMemtable(*cfd->GetLatestMutableCFOptions(),
+ kMaxSequenceNumber);
+ }
+ auto cf_mems = new ColumnFamilyMemTablesImpl(vset_.GetColumnFamilySet());
+
+ // Read all the records and add to a memtable
+ std::string scratch;
+ Slice record;
+ WriteBatch batch;
+ int counter = 0;
+ while (reader.ReadRecord(&record, &scratch)) {
+ if (record.size() < WriteBatchInternal::kHeader) {
+ reporter.Corruption(record.size(),
+ Status::Corruption("log record too small"));
+ continue;
+ }
+ Status record_status = WriteBatchInternal::SetContents(&batch, record);
+ if (record_status.ok()) {
+ record_status =
+ WriteBatchInternal::InsertInto(&batch, cf_mems, nullptr, nullptr);
+ }
+ if (record_status.ok()) {
+ counter += WriteBatchInternal::Count(&batch);
+ } else {
+ ROCKS_LOG_WARN(db_options_.info_log, "Log #%" PRIu64 ": ignoring %s",
+ log, record_status.ToString().c_str());
+ }
+ }
+
+ // Dump a table for each column family with entries in this log file.
+ for (auto* cfd : *vset_.GetColumnFamilySet()) {
+ // Do not record a version edit for this conversion to a Table
+ // since ExtractMetaData() will also generate edits.
+ MemTable* mem = cfd->mem();
+ if (mem->IsEmpty()) {
+ continue;
+ }
+
+ FileMetaData meta;
+ meta.fd = FileDescriptor(next_file_number_++, 0, 0);
+ ReadOptions ro;
+ ro.total_order_seek = true;
+ Arena arena;
+ ScopedArenaIterator iter(mem->NewIterator(ro, &arena));
+ int64_t _current_time = 0;
+ immutable_db_options_.clock->GetCurrentTime(&_current_time)
+ .PermitUncheckedError(); // ignore error
+ const uint64_t current_time = static_cast<uint64_t>(_current_time);
+ meta.file_creation_time = current_time;
+ SnapshotChecker* snapshot_checker = DisableGCSnapshotChecker::Instance();
+
+ auto write_hint = cfd->CalculateSSTWriteHint(0);
+ std::vector<std::unique_ptr<FragmentedRangeTombstoneIterator>>
+ range_del_iters;
+ auto range_del_iter = mem->NewRangeTombstoneIterator(
+ ro, kMaxSequenceNumber, false /* immutable_memtable */);
+ if (range_del_iter != nullptr) {
+ range_del_iters.emplace_back(range_del_iter);
+ }
+
+ IOStatus io_s;
+ CompressionOptions default_compression;
+ TableBuilderOptions tboptions(
+ *cfd->ioptions(), *cfd->GetLatestMutableCFOptions(),
+ cfd->internal_comparator(), cfd->int_tbl_prop_collector_factories(),
+ kNoCompression, default_compression, cfd->GetID(), cfd->GetName(),
+ -1 /* level */, false /* is_bottommost */,
+ TableFileCreationReason::kRecovery, 0 /* oldest_key_time */,
+ 0 /* file_creation_time */, "DB Repairer" /* db_id */, db_session_id_,
+ 0 /*target_file_size*/, meta.fd.GetNumber());
+
+ SeqnoToTimeMapping empty_seqno_time_mapping;
+ status = BuildTable(
+ dbname_, /* versions */ nullptr, immutable_db_options_, tboptions,
+ file_options_, table_cache_.get(), iter.get(),
+ std::move(range_del_iters), &meta, nullptr /* blob_file_additions */,
+ {}, kMaxSequenceNumber, kMaxSequenceNumber, snapshot_checker,
+ false /* paranoid_file_checks*/, nullptr /* internal_stats */, &io_s,
+ nullptr /*IOTracer*/, BlobFileCreationReason::kRecovery,
+ empty_seqno_time_mapping, nullptr /* event_logger */, 0 /* job_id */,
+ Env::IO_HIGH, nullptr /* table_properties */, write_hint);
+ ROCKS_LOG_INFO(db_options_.info_log,
+ "Log #%" PRIu64 ": %d ops saved to Table #%" PRIu64 " %s",
+ log, counter, meta.fd.GetNumber(),
+ status.ToString().c_str());
+ if (status.ok()) {
+ if (meta.fd.GetFileSize() > 0) {
+ table_fds_.push_back(meta.fd);
+ }
+ } else {
+ break;
+ }
+ }
+ delete cf_mems;
+ return status;
+ }
+
+ void ExtractMetaData() {
+ for (size_t i = 0; i < table_fds_.size(); i++) {
+ TableInfo t;
+ t.meta.fd = table_fds_[i];
+ Status status = ScanTable(&t);
+ if (!status.ok()) {
+ std::string fname = TableFileName(
+ db_options_.db_paths, t.meta.fd.GetNumber(), t.meta.fd.GetPathId());
+ char file_num_buf[kFormatFileNumberBufSize];
+ FormatFileNumber(t.meta.fd.GetNumber(), t.meta.fd.GetPathId(),
+ file_num_buf, sizeof(file_num_buf));
+ ROCKS_LOG_WARN(db_options_.info_log, "Table #%s: ignoring %s",
+ file_num_buf, status.ToString().c_str());
+ ArchiveFile(fname);
+ } else {
+ tables_.push_back(t);
+ }
+ }
+ }
+
+ Status ScanTable(TableInfo* t) {
+ std::string fname = TableFileName(
+ db_options_.db_paths, t->meta.fd.GetNumber(), t->meta.fd.GetPathId());
+ int counter = 0;
+ uint64_t file_size;
+ Status status = env_->GetFileSize(fname, &file_size);
+ t->meta.fd = FileDescriptor(t->meta.fd.GetNumber(), t->meta.fd.GetPathId(),
+ file_size);
+ std::shared_ptr<const TableProperties> props;
+ if (status.ok()) {
+ status = table_cache_->GetTableProperties(file_options_, icmp_, t->meta,
+ &props);
+ }
+ if (status.ok()) {
+ auto s =
+ GetSstInternalUniqueId(props->db_id, props->db_session_id,
+ props->orig_file_number, &t->meta.unique_id);
+ if (!s.ok()) {
+ ROCKS_LOG_WARN(db_options_.info_log,
+ "Table #%" PRIu64
+ ": unable to get unique id, default to Unknown.",
+ t->meta.fd.GetNumber());
+ }
+ t->column_family_id = static_cast<uint32_t>(props->column_family_id);
+ if (t->column_family_id ==
+ TablePropertiesCollectorFactory::Context::kUnknownColumnFamily) {
+ ROCKS_LOG_WARN(
+ db_options_.info_log,
+ "Table #%" PRIu64
+ ": column family unknown (probably due to legacy format); "
+ "adding to default column family id 0.",
+ t->meta.fd.GetNumber());
+ t->column_family_id = 0;
+ }
+
+ if (vset_.GetColumnFamilySet()->GetColumnFamily(t->column_family_id) ==
+ nullptr) {
+ status =
+ AddColumnFamily(props->column_family_name, t->column_family_id);
+ }
+ t->meta.oldest_ancester_time = props->creation_time;
+ }
+ ColumnFamilyData* cfd = nullptr;
+ if (status.ok()) {
+ cfd = vset_.GetColumnFamilySet()->GetColumnFamily(t->column_family_id);
+ if (cfd->GetName() != props->column_family_name) {
+ ROCKS_LOG_ERROR(
+ db_options_.info_log,
+ "Table #%" PRIu64
+ ": inconsistent column family name '%s'; expected '%s' for column "
+ "family id %" PRIu32 ".",
+ t->meta.fd.GetNumber(), props->column_family_name.c_str(),
+ cfd->GetName().c_str(), t->column_family_id);
+ status = Status::Corruption(dbname_, "inconsistent column family name");
+ }
+ }
+ if (status.ok()) {
+ ReadOptions ropts;
+ ropts.total_order_seek = true;
+ InternalIterator* iter = table_cache_->NewIterator(
+ ropts, file_options_, cfd->internal_comparator(), t->meta,
+ nullptr /* range_del_agg */,
+ cfd->GetLatestMutableCFOptions()->prefix_extractor,
+ /*table_reader_ptr=*/nullptr, /*file_read_hist=*/nullptr,
+ TableReaderCaller::kRepair, /*arena=*/nullptr, /*skip_filters=*/false,
+ /*level=*/-1, /*max_file_size_for_l0_meta_pin=*/0,
+ /*smallest_compaction_key=*/nullptr,
+ /*largest_compaction_key=*/nullptr,
+ /*allow_unprepared_value=*/false);
+ ParsedInternalKey parsed;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ Slice key = iter->key();
+ Status pik_status =
+ ParseInternalKey(key, &parsed, db_options_.allow_data_in_errors);
+ if (!pik_status.ok()) {
+ ROCKS_LOG_ERROR(db_options_.info_log,
+ "Table #%" PRIu64 ": unparsable key - %s",
+ t->meta.fd.GetNumber(), pik_status.getState());
+ continue;
+ }
+
+ counter++;
+
+ status = t->meta.UpdateBoundaries(key, iter->value(), parsed.sequence,
+ parsed.type);
+ if (!status.ok()) {
+ break;
+ }
+ }
+ if (status.ok() && !iter->status().ok()) {
+ status = iter->status();
+ }
+ delete iter;
+
+ ROCKS_LOG_INFO(db_options_.info_log, "Table #%" PRIu64 ": %d entries %s",
+ t->meta.fd.GetNumber(), counter,
+ status.ToString().c_str());
+ }
+ if (status.ok()) {
+ // XXX/FIXME: This is just basic, naive handling of range tombstones,
+ // like the call to UpdateBoundariesForRange in builder.cc where we assume
+ // an SST file is a full sorted run. This probably needs the extra logic
+ // from compaction_job.cc around the call to UpdateBoundariesForRange (to
+ // handle range tombstones extending beyond the range of other entries).
+ ReadOptions ropts;
+ std::unique_ptr<FragmentedRangeTombstoneIterator> r_iter;
+ status = table_cache_->GetRangeTombstoneIterator(
+ ropts, cfd->internal_comparator(), t->meta, &r_iter);
+
+ if (r_iter) {
+ r_iter->SeekToFirst();
+
+ while (r_iter->Valid()) {
+ auto tombstone = r_iter->Tombstone();
+ auto kv = tombstone.Serialize();
+ t->meta.UpdateBoundariesForRange(
+ kv.first, tombstone.SerializeEndKey(), tombstone.seq_,
+ cfd->internal_comparator());
+ r_iter->Next();
+ }
+ }
+ }
+ return status;
+ }
+
+ Status AddTables() {
+ std::unordered_map<uint32_t, std::vector<const TableInfo*>> cf_id_to_tables;
+ SequenceNumber max_sequence = 0;
+ for (size_t i = 0; i < tables_.size(); i++) {
+ cf_id_to_tables[tables_[i].column_family_id].push_back(&tables_[i]);
+ if (max_sequence < tables_[i].meta.fd.largest_seqno) {
+ max_sequence = tables_[i].meta.fd.largest_seqno;
+ }
+ }
+ vset_.SetLastAllocatedSequence(max_sequence);
+ vset_.SetLastPublishedSequence(max_sequence);
+ vset_.SetLastSequence(max_sequence);
+
+ for (const auto& cf_id_and_tables : cf_id_to_tables) {
+ auto* cfd =
+ vset_.GetColumnFamilySet()->GetColumnFamily(cf_id_and_tables.first);
+ VersionEdit edit;
+ edit.SetComparatorName(cfd->user_comparator()->Name());
+ edit.SetLogNumber(0);
+ edit.SetNextFile(next_file_number_);
+ edit.SetColumnFamily(cfd->GetID());
+
+ // TODO(opt): separate out into multiple levels
+ for (const auto* table : cf_id_and_tables.second) {
+ edit.AddFile(
+ 0, table->meta.fd.GetNumber(), table->meta.fd.GetPathId(),
+ table->meta.fd.GetFileSize(), table->meta.smallest,
+ table->meta.largest, table->meta.fd.smallest_seqno,
+ table->meta.fd.largest_seqno, table->meta.marked_for_compaction,
+ table->meta.temperature, table->meta.oldest_blob_file_number,
+ table->meta.oldest_ancester_time, table->meta.file_creation_time,
+ table->meta.file_checksum, table->meta.file_checksum_func_name,
+ table->meta.unique_id);
+ }
+ assert(next_file_number_ > 0);
+ vset_.MarkFileNumberUsed(next_file_number_ - 1);
+ mutex_.Lock();
+ std::unique_ptr<FSDirectory> db_dir;
+ Status status = env_->GetFileSystem()->NewDirectory(dbname_, IOOptions(),
+ &db_dir, nullptr);
+ if (status.ok()) {
+ status = vset_.LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(),
+ &edit, &mutex_, db_dir.get(),
+ false /* new_descriptor_log */);
+ }
+ mutex_.Unlock();
+ if (!status.ok()) {
+ return status;
+ }
+ }
+ return Status::OK();
+ }
+
+ void ArchiveFile(const std::string& fname) {
+ // Move into another directory. E.g., for
+ // dir/foo
+ // rename to
+ // dir/lost/foo
+ const char* slash = strrchr(fname.c_str(), '/');
+ std::string new_dir;
+ if (slash != nullptr) {
+ new_dir.assign(fname.data(), slash - fname.data());
+ }
+ new_dir.append("/lost");
+ env_->CreateDir(new_dir).PermitUncheckedError(); // Ignore error
+ std::string new_file = new_dir;
+ new_file.append("/");
+ new_file.append((slash == nullptr) ? fname.c_str() : slash + 1);
+ Status s = env_->RenameFile(fname, new_file);
+ ROCKS_LOG_INFO(db_options_.info_log, "Archiving %s: %s\n", fname.c_str(),
+ s.ToString().c_str());
+ }
+};
+
+Status GetDefaultCFOptions(
+ const std::vector<ColumnFamilyDescriptor>& column_families,
+ ColumnFamilyOptions* res) {
+ assert(res != nullptr);
+ auto iter = std::find_if(column_families.begin(), column_families.end(),
+ [](const ColumnFamilyDescriptor& cfd) {
+ return cfd.name == kDefaultColumnFamilyName;
+ });
+ if (iter == column_families.end()) {
+ return Status::InvalidArgument(
+ "column_families", "Must contain entry for default column family");
+ }
+ *res = iter->options;
+ return Status::OK();
+}
+} // anonymous namespace
+
+Status RepairDB(const std::string& dbname, const DBOptions& db_options,
+ const std::vector<ColumnFamilyDescriptor>& column_families) {
+ ColumnFamilyOptions default_cf_opts;
+ Status status = GetDefaultCFOptions(column_families, &default_cf_opts);
+ if (!status.ok()) {
+ return status;
+ }
+
+ Repairer repairer(dbname, db_options, column_families, default_cf_opts,
+ ColumnFamilyOptions() /* unknown_cf_opts */,
+ false /* create_unknown_cfs */);
+ status = repairer.Run();
+ if (status.ok()) {
+ status = repairer.Close();
+ }
+ return status;
+}
+
+Status RepairDB(const std::string& dbname, const DBOptions& db_options,
+ const std::vector<ColumnFamilyDescriptor>& column_families,
+ const ColumnFamilyOptions& unknown_cf_opts) {
+ ColumnFamilyOptions default_cf_opts;
+ Status status = GetDefaultCFOptions(column_families, &default_cf_opts);
+ if (!status.ok()) {
+ return status;
+ }
+
+ Repairer repairer(dbname, db_options, column_families, default_cf_opts,
+ unknown_cf_opts, true /* create_unknown_cfs */);
+ status = repairer.Run();
+ if (status.ok()) {
+ status = repairer.Close();
+ }
+ return status;
+}
+
+Status RepairDB(const std::string& dbname, const Options& options) {
+ Options opts(options);
+ DBOptions db_options(opts);
+ ColumnFamilyOptions cf_options(opts);
+
+ Repairer repairer(dbname, db_options, {}, cf_options /* default_cf_opts */,
+ cf_options /* unknown_cf_opts */,
+ true /* create_unknown_cfs */);
+ Status status = repairer.Run();
+ if (status.ok()) {
+ status = repairer.Close();
+ }
+ return status;
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/db/repair_test.cc b/src/rocksdb/db/repair_test.cc
new file mode 100644
index 000000000..644a9270d
--- /dev/null
+++ b/src/rocksdb/db/repair_test.cc
@@ -0,0 +1,442 @@
+// Copyright (c) 2016-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "rocksdb/options.h"
+#ifndef ROCKSDB_LITE
+
+#include <algorithm>
+#include <string>
+#include <vector>
+
+#include "db/db_impl/db_impl.h"
+#include "db/db_test_util.h"
+#include "file/file_util.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/db.h"
+#include "rocksdb/transaction_log.h"
+#include "table/unique_id_impl.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#ifndef ROCKSDB_LITE
+class RepairTest : public DBTestBase {
+ public:
+ RepairTest() : DBTestBase("repair_test", /*env_do_fsync=*/true) {}
+
+ Status GetFirstSstPath(std::string* first_sst_path) {
+ assert(first_sst_path != nullptr);
+ first_sst_path->clear();
+ uint64_t manifest_size;
+ std::vector<std::string> files;
+ Status s = db_->GetLiveFiles(files, &manifest_size);
+ if (s.ok()) {
+ auto sst_iter =
+ std::find_if(files.begin(), files.end(), [](const std::string& file) {
+ uint64_t number;
+ FileType type;
+ bool ok = ParseFileName(file, &number, &type);
+ return ok && type == kTableFile;
+ });
+ *first_sst_path = sst_iter == files.end() ? "" : dbname_ + *sst_iter;
+ }
+ return s;
+ }
+
+ void ReopenWithSstIdVerify() {
+ std::atomic_int verify_passed{0};
+ SyncPoint::GetInstance()->SetCallBack(
+ "BlockBasedTable::Open::PassedVerifyUniqueId", [&](void* arg) {
+ // Count tables whose unique id was successfully read and verified.
+ auto id = static_cast<UniqueId64x2*>(arg);
+ assert(*id != kNullUniqueId64x2);
+ verify_passed++;
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ auto options = CurrentOptions();
+ options.verify_sst_unique_id_in_manifest = true;
+ Reopen(options);
+
+ ASSERT_GT(verify_passed, 0);
+ SyncPoint::GetInstance()->DisableProcessing();
+ }
+};
+
+TEST_F(RepairTest, LostManifest) {
+ // Add a couple SST files, delete the manifest, and verify RepairDB() saves
+ // the day.
+ ASSERT_OK(Put("key", "val"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("key2", "val2"));
+ ASSERT_OK(Flush());
+ // Need to get path before Close() deletes db_, but delete it after Close() to
+ // ensure Close() didn't change the manifest.
+ std::string manifest_path =
+ DescriptorFileName(dbname_, dbfull()->TEST_Current_Manifest_FileNo());
+
+ Close();
+ ASSERT_OK(env_->FileExists(manifest_path));
+ ASSERT_OK(env_->DeleteFile(manifest_path));
+ ASSERT_OK(RepairDB(dbname_, CurrentOptions()));
+ ReopenWithSstIdVerify();
+
+ ASSERT_EQ(Get("key"), "val");
+ ASSERT_EQ(Get("key2"), "val2");
+}
+
+TEST_F(RepairTest, LostManifestMoreDbFeatures) {
+ // Add a couple SST files, delete the manifest, and verify RepairDB() saves
+ // the day.
+ ASSERT_OK(Put("key", "val"));
+ ASSERT_OK(Put("key2", "val2"));
+ ASSERT_OK(Put("key3", "val3"));
+ ASSERT_OK(Put("key4", "val4"));
+ ASSERT_OK(Flush());
+ // Test an SST file containing only a range tombstone
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "key2",
+ "key3z"));
+ ASSERT_OK(Flush());
+ // Need to get path before Close() deletes db_, but delete it after Close() to
+ // ensure Close() didn't change the manifest.
+ std::string manifest_path =
+ DescriptorFileName(dbname_, dbfull()->TEST_Current_Manifest_FileNo());
+
+ Close();
+ ASSERT_OK(env_->FileExists(manifest_path));
+ ASSERT_OK(env_->DeleteFile(manifest_path));
+ ASSERT_OK(RepairDB(dbname_, CurrentOptions()));
+
+ // Repair from SST files should work with unique_id verification.
+ ReopenWithSstIdVerify();
+
+ ASSERT_EQ(Get("key"), "val");
+ ASSERT_EQ(Get("key2"), "NOT_FOUND");
+ ASSERT_EQ(Get("key3"), "NOT_FOUND");
+ ASSERT_EQ(Get("key4"), "val4");
+}
+
+TEST_F(RepairTest, CorruptManifest) {
+ // Manifest is in an invalid format. Expect a full recovery.
+ ASSERT_OK(Put("key", "val"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("key2", "val2"));
+ ASSERT_OK(Flush());
+ // Need to get path before Close() deletes db_, but overwrite it after Close()
+ // to ensure Close() didn't change the manifest.
+ std::string manifest_path =
+ DescriptorFileName(dbname_, dbfull()->TEST_Current_Manifest_FileNo());
+
+ Close();
+ ASSERT_OK(env_->FileExists(manifest_path));
+
+ ASSERT_OK(CreateFile(env_->GetFileSystem(), manifest_path, "blah",
+ false /* use_fsync */));
+ ASSERT_OK(RepairDB(dbname_, CurrentOptions()));
+
+ ReopenWithSstIdVerify();
+
+ ASSERT_EQ(Get("key"), "val");
+ ASSERT_EQ(Get("key2"), "val2");
+}
+
+TEST_F(RepairTest, IncompleteManifest) {
+ // In this case, the manifest is valid but does not reference all of the SST
+ // files. Expect a full recovery.
+ ASSERT_OK(Put("key", "val"));
+ ASSERT_OK(Flush());
+ std::string orig_manifest_path =
+ DescriptorFileName(dbname_, dbfull()->TEST_Current_Manifest_FileNo());
+ CopyFile(orig_manifest_path, orig_manifest_path + ".tmp");
+ ASSERT_OK(Put("key2", "val2"));
+ ASSERT_OK(Flush());
+ // Need to get path before Close() deletes db_, but overwrite it after Close()
+ // to ensure Close() didn't change the manifest.
+ std::string new_manifest_path =
+ DescriptorFileName(dbname_, dbfull()->TEST_Current_Manifest_FileNo());
+
+ Close();
+ ASSERT_OK(env_->FileExists(new_manifest_path));
+ // Replace the manifest with one that is only aware of the first SST file.
+ CopyFile(orig_manifest_path + ".tmp", new_manifest_path);
+ ASSERT_OK(RepairDB(dbname_, CurrentOptions()));
+
+ ReopenWithSstIdVerify();
+
+ ASSERT_EQ(Get("key"), "val");
+ ASSERT_EQ(Get("key2"), "val2");
+}
+
+TEST_F(RepairTest, PostRepairSstFileNumbering) {
+ // Verify after a DB is repaired, new files will be assigned higher numbers
+ // than old files.
+ ASSERT_OK(Put("key", "val"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("key2", "val2"));
+ ASSERT_OK(Flush());
+ uint64_t pre_repair_file_num = dbfull()->TEST_Current_Next_FileNo();
+ Close();
+
+ ASSERT_OK(RepairDB(dbname_, CurrentOptions()));
+
+ ReopenWithSstIdVerify();
+
+ uint64_t post_repair_file_num = dbfull()->TEST_Current_Next_FileNo();
+ ASSERT_GE(post_repair_file_num, pre_repair_file_num);
+}
+
+TEST_F(RepairTest, LostSst) {
+ // Delete one of the SST files but preserve the manifest that refers to it,
+ // then verify the DB is still usable for the intact SST.
+ ASSERT_OK(Put("key", "val"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("key2", "val2"));
+ ASSERT_OK(Flush());
+ std::string sst_path;
+ ASSERT_OK(GetFirstSstPath(&sst_path));
+ ASSERT_FALSE(sst_path.empty());
+ ASSERT_OK(env_->DeleteFile(sst_path));
+
+ Close();
+ ASSERT_OK(RepairDB(dbname_, CurrentOptions()));
+ ReopenWithSstIdVerify();
+
+ // Exactly one of the key-value pairs should be in the DB now.
+ ASSERT_TRUE((Get("key") == "val") != (Get("key2") == "val2"));
+}
+
+TEST_F(RepairTest, CorruptSst) {
+ // Corrupt one of the SST files but preserve the manifest that refers to it,
+ // then verify the DB is still usable for the intact SST.
+ ASSERT_OK(Put("key", "val"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("key2", "val2"));
+ ASSERT_OK(Flush());
+ std::string sst_path;
+ ASSERT_OK(GetFirstSstPath(&sst_path));
+ ASSERT_FALSE(sst_path.empty());
+
+ ASSERT_OK(CreateFile(env_->GetFileSystem(), sst_path, "blah",
+ false /* use_fsync */));
+
+ Close();
+ ASSERT_OK(RepairDB(dbname_, CurrentOptions()));
+ ReopenWithSstIdVerify();
+
+ // Exactly one of the key-value pairs should be in the DB now.
+ ASSERT_TRUE((Get("key") == "val") != (Get("key2") == "val2"));
+}
+
+TEST_F(RepairTest, UnflushedSst) {
+ // This test case invokes repair while some data is unflushed, then verifies
+ // that data is in the db.
+ ASSERT_OK(Put("key", "val"));
+ VectorLogPtr wal_files;
+ ASSERT_OK(dbfull()->GetSortedWalFiles(wal_files));
+ ASSERT_EQ(wal_files.size(), 1);
+ {
+ uint64_t total_ssts_size;
+ std::unordered_map<std::string, uint64_t> sst_files;
+ ASSERT_OK(GetAllDataFiles(kTableFile, &sst_files, &total_ssts_size));
+ ASSERT_EQ(total_ssts_size, 0);
+ }
+ // Need to get path before Close() deletes db_, but delete it after Close() to
+ // ensure Close() didn't change the manifest.
+ std::string manifest_path =
+ DescriptorFileName(dbname_, dbfull()->TEST_Current_Manifest_FileNo());
+
+ Close();
+ ASSERT_OK(env_->FileExists(manifest_path));
+ ASSERT_OK(env_->DeleteFile(manifest_path));
+ ASSERT_OK(RepairDB(dbname_, CurrentOptions()));
+ ReopenWithSstIdVerify();
+
+ ASSERT_OK(dbfull()->GetSortedWalFiles(wal_files));
+ ASSERT_EQ(wal_files.size(), 0);
+ {
+ uint64_t total_ssts_size;
+ std::unordered_map<std::string, uint64_t> sst_files;
+ ASSERT_OK(GetAllDataFiles(kTableFile, &sst_files, &total_ssts_size));
+ ASSERT_GT(total_ssts_size, 0);
+ }
+ ASSERT_EQ(Get("key"), "val");
+}
+
+TEST_F(RepairTest, SeparateWalDir) {
+ do {
+ Options options = CurrentOptions();
+ DestroyAndReopen(options);
+ ASSERT_OK(Put("key", "val"));
+ ASSERT_OK(Put("foo", "bar"));
+ VectorLogPtr wal_files;
+ ASSERT_OK(dbfull()->GetSortedWalFiles(wal_files));
+ ASSERT_EQ(wal_files.size(), 1);
+ {
+ uint64_t total_ssts_size;
+ std::unordered_map<std::string, uint64_t> sst_files;
+ ASSERT_OK(GetAllDataFiles(kTableFile, &sst_files, &total_ssts_size));
+ ASSERT_EQ(total_ssts_size, 0);
+ }
+ std::string manifest_path =
+ DescriptorFileName(dbname_, dbfull()->TEST_Current_Manifest_FileNo());
+
+ Close();
+ ASSERT_OK(env_->FileExists(manifest_path));
+ ASSERT_OK(env_->DeleteFile(manifest_path));
+ ASSERT_OK(RepairDB(dbname_, options));
+
+ // make sure that all WALs are converted to SSTables.
+ options.wal_dir = "";
+
+ ReopenWithSstIdVerify();
+ ASSERT_OK(dbfull()->GetSortedWalFiles(wal_files));
+ ASSERT_EQ(wal_files.size(), 0);
+ {
+ uint64_t total_ssts_size;
+ std::unordered_map<std::string, uint64_t> sst_files;
+ ASSERT_OK(GetAllDataFiles(kTableFile, &sst_files, &total_ssts_size));
+ ASSERT_GT(total_ssts_size, 0);
+ }
+ ASSERT_EQ(Get("key"), "val");
+ ASSERT_EQ(Get("foo"), "bar");
+
+ } while (ChangeWalOptions());
+}
+
+TEST_F(RepairTest, RepairMultipleColumnFamilies) {
+ // Verify repair logic associates SST files with their original column
+ // families.
+ const int kNumCfs = 3;
+ const int kEntriesPerCf = 2;
+ DestroyAndReopen(CurrentOptions());
+ CreateAndReopenWithCF({"pikachu1", "pikachu2"}, CurrentOptions());
+ for (int i = 0; i < kNumCfs; ++i) {
+ for (int j = 0; j < kEntriesPerCf; ++j) {
+ ASSERT_OK(Put(i, "key" + std::to_string(j), "val" + std::to_string(j)));
+ if (j == kEntriesPerCf - 1 && i == kNumCfs - 1) {
+ // Leave one unflushed so we can verify WAL entries are properly
+ // associated with column families.
+ continue;
+ }
+ ASSERT_OK(Flush(i));
+ }
+ }
+
+ // Need to get path before Close() deletes db_, but delete it after Close() to
+ // ensure Close() doesn't re-create the manifest.
+ std::string manifest_path =
+ DescriptorFileName(dbname_, dbfull()->TEST_Current_Manifest_FileNo());
+ Close();
+ ASSERT_OK(env_->FileExists(manifest_path));
+ ASSERT_OK(env_->DeleteFile(manifest_path));
+
+ ASSERT_OK(RepairDB(dbname_, CurrentOptions()));
+
+ ReopenWithColumnFamilies({"default", "pikachu1", "pikachu2"},
+ CurrentOptions());
+ for (int i = 0; i < kNumCfs; ++i) {
+ for (int j = 0; j < kEntriesPerCf; ++j) {
+ ASSERT_EQ(Get(i, "key" + std::to_string(j)), "val" + std::to_string(j));
+ }
+ }
+}
+
+TEST_F(RepairTest, RepairColumnFamilyOptions) {
+ // Verify repair logic uses correct ColumnFamilyOptions when repairing a
+ // database with different options for column families.
+ const int kNumCfs = 2;
+ const int kEntriesPerCf = 2;
+
+ Options opts(CurrentOptions()), rev_opts(CurrentOptions());
+ opts.comparator = BytewiseComparator();
+ rev_opts.comparator = ReverseBytewiseComparator();
+
+ DestroyAndReopen(opts);
+ CreateColumnFamilies({"reverse"}, rev_opts);
+ ReopenWithColumnFamilies({"default", "reverse"},
+ std::vector<Options>{opts, rev_opts});
+ for (int i = 0; i < kNumCfs; ++i) {
+ for (int j = 0; j < kEntriesPerCf; ++j) {
+ ASSERT_OK(Put(i, "key" + std::to_string(j), "val" + std::to_string(j)));
+ if (i == kNumCfs - 1 && j == kEntriesPerCf - 1) {
+ // Leave one unflushed so we can verify RepairDB's flush logic
+ continue;
+ }
+ ASSERT_OK(Flush(i));
+ }
+ }
+ Close();
+
+ // RepairDB() records the comparator in the manifest, and DB::Open would fail
+ // if a different comparator were used.
+ ASSERT_OK(RepairDB(dbname_, opts, {{"default", opts}, {"reverse", rev_opts}},
+ opts /* unknown_cf_opts */));
+ ASSERT_OK(TryReopenWithColumnFamilies({"default", "reverse"},
+ std::vector<Options>{opts, rev_opts}));
+ for (int i = 0; i < kNumCfs; ++i) {
+ for (int j = 0; j < kEntriesPerCf; ++j) {
+ ASSERT_EQ(Get(i, "key" + std::to_string(j)), "val" + std::to_string(j));
+ }
+ }
+
+ // Examine table properties to verify RepairDB() used the right options when
+ // converting WAL->SST
+ TablePropertiesCollection fname_to_props;
+ ASSERT_OK(db_->GetPropertiesOfAllTables(handles_[1], &fname_to_props));
+ ASSERT_EQ(fname_to_props.size(), 2U);
+ for (const auto& fname_and_props : fname_to_props) {
+ std::string comparator_name(rev_opts.comparator->Name());
+ ASSERT_EQ(comparator_name, fname_and_props.second->comparator_name);
+ }
+ Close();
+
+ // Also check comparator when it's provided via "unknown" CF options
+ ASSERT_OK(RepairDB(dbname_, opts, {{"default", opts}},
+ rev_opts /* unknown_cf_opts */));
+ ASSERT_OK(TryReopenWithColumnFamilies({"default", "reverse"},
+ std::vector<Options>{opts, rev_opts}));
+ for (int i = 0; i < kNumCfs; ++i) {
+ for (int j = 0; j < kEntriesPerCf; ++j) {
+ ASSERT_EQ(Get(i, "key" + std::to_string(j)), "val" + std::to_string(j));
+ }
+ }
+}
+
+TEST_F(RepairTest, DbNameContainsTrailingSlash) {
+ {
+ bool tmp;
+ if (env_->AreFilesSame("", "", &tmp).IsNotSupported()) {
+ fprintf(stderr,
+ "skipping RepairTest.DbNameContainsTrailingSlash due to "
+ "unsupported Env::AreFilesSame\n");
+ return;
+ }
+ }
+
+ ASSERT_OK(Put("key", "val"));
+ ASSERT_OK(Flush());
+ Close();
+
+ ASSERT_OK(RepairDB(dbname_ + "/", CurrentOptions()));
+ ReopenWithSstIdVerify();
+ ASSERT_EQ(Get("key"), "val");
+}
+#endif // ROCKSDB_LITE
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+ fprintf(stderr, "SKIPPED as RepairDB is not supported in ROCKSDB_LITE\n");
+ return 0;
+}
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/db/seqno_time_test.cc b/src/rocksdb/db/seqno_time_test.cc
new file mode 100644
index 000000000..12394a368
--- /dev/null
+++ b/src/rocksdb/db/seqno_time_test.cc
@@ -0,0 +1,996 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/db_test_util.h"
+#include "db/periodic_task_scheduler.h"
+#include "db/seqno_to_time_mapping.h"
+#include "port/stack_trace.h"
+#include "rocksdb/iostats_context.h"
+#include "rocksdb/utilities/debug.h"
+#include "test_util/mock_time_env.h"
+
+#ifndef ROCKSDB_LITE
+
+namespace ROCKSDB_NAMESPACE {
+
+class SeqnoTimeTest : public DBTestBase {
+ public:
+ SeqnoTimeTest() : DBTestBase("seqno_time_test", /*env_do_fsync=*/false) {
+ mock_clock_ = std::make_shared<MockSystemClock>(env_->GetSystemClock());
+ mock_env_ = std::make_unique<CompositeEnvWrapper>(env_, mock_clock_);
+ }
+
+ protected:
+ std::unique_ptr<Env> mock_env_;
+ std::shared_ptr<MockSystemClock> mock_clock_;
+
+ void SetUp() override {
+ mock_clock_->InstallTimedWaitFixCallback();
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::StartPeriodicTaskScheduler:Init", [&](void* arg) {
+ auto periodic_task_scheduler_ptr =
+ reinterpret_cast<PeriodicTaskScheduler*>(arg);
+ periodic_task_scheduler_ptr->TEST_OverrideTimer(mock_clock_.get());
+ });
+ }
+
+ // make sure the file is not in cache, otherwise it won't have IO info
+ void AssertKeyTemperature(int key_id, Temperature expected_temperature) {
+ get_iostats_context()->Reset();
+ IOStatsContext* iostats = get_iostats_context();
+ std::string result = Get(Key(key_id));
+ ASSERT_FALSE(result.empty());
+ ASSERT_GT(iostats->bytes_read, 0);
+ switch (expected_temperature) {
+ case Temperature::kUnknown:
+ ASSERT_EQ(iostats->file_io_stats_by_temperature.cold_file_read_count,
+ 0);
+ ASSERT_EQ(iostats->file_io_stats_by_temperature.cold_file_bytes_read,
+ 0);
+ break;
+ case Temperature::kCold:
+ ASSERT_GT(iostats->file_io_stats_by_temperature.cold_file_read_count,
+ 0);
+ ASSERT_GT(iostats->file_io_stats_by_temperature.cold_file_bytes_read,
+ 0);
+ break;
+ default:
+        // The test only supports kCold for the bottommost temperature for now
+ FAIL();
+ }
+ }
+};
+
+TEST_F(SeqnoTimeTest, TemperatureBasicUniversal) {
+ const int kNumTrigger = 4;
+ const int kNumLevels = 7;
+ const int kNumKeys = 100;
+ const int kKeyPerSec = 10;
+
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.preclude_last_level_data_seconds = 10000;
+ options.env = mock_env_.get();
+ options.bottommost_temperature = Temperature::kCold;
+ options.num_levels = kNumLevels;
+ DestroyAndReopen(options);
+
+  // Pass some time first, otherwise the write times of the first few keys are
+  // going to be zero, and internally zero has a special meaning:
+  // kUnknownSeqnoTime
+ dbfull()->TEST_WaitForPeridicTaskRun(
+ [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(kKeyPerSec)); });
+
+ int sst_num = 0;
+  // Write overlapping files, enough to trigger compaction
+ for (; sst_num < kNumTrigger; sst_num++) {
+ for (int i = 0; i < kNumKeys; i++) {
+ ASSERT_OK(Put(Key(sst_num * (kNumKeys - 1) + i), "value"));
+ dbfull()->TEST_WaitForPeridicTaskRun([&] {
+ mock_clock_->MockSleepForSeconds(static_cast<int>(kKeyPerSec));
+ });
+ }
+ ASSERT_OK(Flush());
+ }
+ ASSERT_OK(dbfull()->WaitForCompact(true));
+
+ // All data is hot, only output to penultimate level
+ ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel());
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
+
+ // read a random key, which should be hot (kUnknown)
+ AssertKeyTemperature(20, Temperature::kUnknown);
+
+  // Write more data, but it is still all hot until the 10th SST: we write a
+  // key every 10 seconds and 100 keys per SST, so each SST spans 1000 seconds,
+  // while preclude_last_level_data_seconds is 10k.
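+  // (Illustrative arithmetic: by the end of this loop roughly 8 SSTs * 1000
+  // seconds = 8000 seconds have passed since the first write, still within the
+  // 10k window, so nothing has turned cold yet.)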
+ for (; sst_num < kNumTrigger * 2; sst_num++) {
+ for (int i = 0; i < kNumKeys; i++) {
+ ASSERT_OK(Put(Key(sst_num * (kNumKeys - 1) + i), "value"));
+ dbfull()->TEST_WaitForPeridicTaskRun([&] {
+ mock_clock_->MockSleepForSeconds(static_cast<int>(kKeyPerSec));
+ });
+ }
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->WaitForCompact(true));
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
+ }
+
+ // Now we have both hot data and cold data
+ for (; sst_num < kNumTrigger * 3; sst_num++) {
+ for (int i = 0; i < kNumKeys; i++) {
+ ASSERT_OK(Put(Key(sst_num * (kNumKeys - 1) + i), "value"));
+ dbfull()->TEST_WaitForPeridicTaskRun([&] {
+ mock_clock_->MockSleepForSeconds(static_cast<int>(kKeyPerSec));
+ });
+ }
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->WaitForCompact(true));
+ }
+
+ CompactRangeOptions cro;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ uint64_t hot_data_size = GetSstSizeHelper(Temperature::kUnknown);
+ uint64_t cold_data_size = GetSstSizeHelper(Temperature::kCold);
+ ASSERT_GT(hot_data_size, 0);
+ ASSERT_GT(cold_data_size, 0);
+  // The first few keys should be cold
+ AssertKeyTemperature(20, Temperature::kCold);
+
+ for (int i = 0; i < 30; i++) {
+ dbfull()->TEST_WaitForPeridicTaskRun([&] {
+ mock_clock_->MockSleepForSeconds(static_cast<int>(20 * kKeyPerSec));
+ });
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+    // The hot/cold cut-off should fall between key i * 20 + 200 and 250
+ AssertKeyTemperature(i * 20 + 250, Temperature::kUnknown);
+ AssertKeyTemperature(i * 20 + 200, Temperature::kCold);
+ }
+
+ ASSERT_LT(GetSstSizeHelper(Temperature::kUnknown), hot_data_size);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kCold), cold_data_size);
+
+  // Wait again; most of the data should be cold after that. It may not all be
+  // cold, because if no new data is written to the SSTs, the compaction will
+  // not get new seqno->time samples to decide the age of the last few entries.
+ for (int i = 0; i < 5; i++) {
+ dbfull()->TEST_WaitForPeridicTaskRun(
+ [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(1000)); });
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ }
+
+ // any random data close to the end should be cold
+ AssertKeyTemperature(1000, Temperature::kCold);
+
+  // Close explicitly, because the env is a local variable which would be
+  // released first.
+ Close();
+}
+
+TEST_F(SeqnoTimeTest, TemperatureBasicLevel) {
+ const int kNumLevels = 7;
+ const int kNumKeys = 100;
+
+ Options options = CurrentOptions();
+ options.preclude_last_level_data_seconds = 10000;
+ options.env = mock_env_.get();
+ options.bottommost_temperature = Temperature::kCold;
+ options.num_levels = kNumLevels;
+ options.level_compaction_dynamic_level_bytes = true;
+  // TODO(zjay): for level compaction, auto-compaction may get stuck in a dead
+  // loop, if the penultimate level score is > 1 but the hot data is not cold
+  // enough to compact to the last level, which keeps triggering compaction.
+ options.disable_auto_compactions = true;
+ DestroyAndReopen(options);
+
+  // Pass some time first, otherwise the write times of the first few keys are
+  // going to be zero, and internally zero has a special meaning:
+  // kUnknownSeqnoTime
+ dbfull()->TEST_WaitForPeridicTaskRun(
+ [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(10)); });
+
+ int sst_num = 0;
+  // Write overlapping files
+ for (; sst_num < 4; sst_num++) {
+ for (int i = 0; i < kNumKeys; i++) {
+ ASSERT_OK(Put(Key(sst_num * (kNumKeys - 1) + i), "value"));
+ dbfull()->TEST_WaitForPeridicTaskRun(
+ [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(10)); });
+ }
+ ASSERT_OK(Flush());
+ }
+
+ CompactRangeOptions cro;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+ // All data is hot, only output to penultimate level
+ ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel());
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
+
+ // read a random key, which should be hot (kUnknown)
+ AssertKeyTemperature(20, Temperature::kUnknown);
+
+ // Adding more data to have mixed hot and cold data
+ for (; sst_num < 14; sst_num++) {
+ for (int i = 0; i < kNumKeys; i++) {
+ ASSERT_OK(Put(Key(sst_num * (kNumKeys - 1) + i), "value"));
+ dbfull()->TEST_WaitForPeridicTaskRun(
+ [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(10)); });
+ }
+ ASSERT_OK(Flush());
+ }
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
+
+ // Compact the files to the last level which should split the hot/cold data
+ MoveFilesToLevel(6);
+ uint64_t hot_data_size = GetSstSizeHelper(Temperature::kUnknown);
+ uint64_t cold_data_size = GetSstSizeHelper(Temperature::kCold);
+ ASSERT_GT(hot_data_size, 0);
+ ASSERT_GT(cold_data_size, 0);
+  // The first few keys should be cold
+ AssertKeyTemperature(20, Temperature::kCold);
+
+  // Wait some time; with each wait, the amount of cold data increases and the
+  // amount of hot data decreases
+ for (int i = 0; i < 30; i++) {
+ dbfull()->TEST_WaitForPeridicTaskRun(
+ [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(200)); });
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ uint64_t pre_hot = hot_data_size;
+ uint64_t pre_cold = cold_data_size;
+ hot_data_size = GetSstSizeHelper(Temperature::kUnknown);
+ cold_data_size = GetSstSizeHelper(Temperature::kCold);
+ ASSERT_LT(hot_data_size, pre_hot);
+ ASSERT_GT(cold_data_size, pre_cold);
+
+    // The hot/cold cut-off key should be around key i * 20 + 400 to 450
+ AssertKeyTemperature(i * 20 + 450, Temperature::kUnknown);
+ AssertKeyTemperature(i * 20 + 400, Temperature::kCold);
+ }
+
+  // Wait again; most of the data should be cold after that. The hot data might
+  // not be empty, because if we don't write new data, there's no seqno->time
+  // sampling available to the compaction
+ for (int i = 0; i < 5; i++) {
+ dbfull()->TEST_WaitForPeridicTaskRun(
+ [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(1000)); });
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ }
+
+ // any random data close to the end should be cold
+ AssertKeyTemperature(1000, Temperature::kCold);
+
+ Close();
+}
+
+enum class SeqnoTimeTestType : char {
+ kTrackInternalTimeSeconds = 0,
+ kPrecludeLastLevel = 1,
+ kBothSetTrackSmaller = 2,
+};
+
+class SeqnoTimeTablePropTest
+ : public SeqnoTimeTest,
+ public ::testing::WithParamInterface<SeqnoTimeTestType> {
+ public:
+ SeqnoTimeTablePropTest() : SeqnoTimeTest() {}
+
+ void SetTrackTimeDurationOptions(uint64_t track_time_duration,
+ Options& options) const {
+ // either option set will enable the time tracking feature
+ switch (GetParam()) {
+ case SeqnoTimeTestType::kTrackInternalTimeSeconds:
+ options.preclude_last_level_data_seconds = 0;
+ options.preserve_internal_time_seconds = track_time_duration;
+ break;
+ case SeqnoTimeTestType::kPrecludeLastLevel:
+ options.preclude_last_level_data_seconds = track_time_duration;
+ options.preserve_internal_time_seconds = 0;
+ break;
+ case SeqnoTimeTestType::kBothSetTrackSmaller:
+ options.preclude_last_level_data_seconds = track_time_duration;
+ options.preserve_internal_time_seconds = track_time_duration / 10;
+ break;
+ }
+ }
+};
+
+INSTANTIATE_TEST_CASE_P(
+ SeqnoTimeTablePropTest, SeqnoTimeTablePropTest,
+ ::testing::Values(SeqnoTimeTestType::kTrackInternalTimeSeconds,
+ SeqnoTimeTestType::kPrecludeLastLevel,
+ SeqnoTimeTestType::kBothSetTrackSmaller));
+
+TEST_P(SeqnoTimeTablePropTest, BasicSeqnoToTimeMapping) {
+ Options options = CurrentOptions();
+ SetTrackTimeDurationOptions(10000, options);
+
+ options.env = mock_env_.get();
+ options.disable_auto_compactions = true;
+ DestroyAndReopen(options);
+
+ std::set<uint64_t> checked_file_nums;
+ SequenceNumber start_seq = dbfull()->GetLatestSequenceNumber();
+ // Write a key every 10 seconds
+ for (int i = 0; i < 200; i++) {
+ ASSERT_OK(Put(Key(i), "value"));
+ dbfull()->TEST_WaitForPeridicTaskRun(
+ [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(10)); });
+ }
+ ASSERT_OK(Flush());
+ TablePropertiesCollection tables_props;
+ ASSERT_OK(dbfull()->GetPropertiesOfAllTables(&tables_props));
+ ASSERT_EQ(tables_props.size(), 1);
+ auto it = tables_props.begin();
+ SeqnoToTimeMapping tp_mapping;
+ ASSERT_OK(tp_mapping.Add(it->second->seqno_to_time_mapping));
+ ASSERT_OK(tp_mapping.Sort());
+ ASSERT_FALSE(tp_mapping.Empty());
+ auto seqs = tp_mapping.TEST_GetInternalMapping();
+  // About ~20 seqno->time entries, because the sample rate is 10000/100 = 100
+  // seconds and about 2k seconds have passed.
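+  // (The tracked duration is 10000 seconds and kMaxSeqnoTimePairsPerCF is 100,
+  // so one sample is taken roughly every 100 seconds; 200 keys * 10 seconds
+  // per key = ~2000 seconds elapsed, hence ~20 samples.)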
+ ASSERT_GE(seqs.size(), 19);
+ ASSERT_LE(seqs.size(), 21);
+ SequenceNumber seq_end = dbfull()->GetLatestSequenceNumber();
+ for (auto i = start_seq; i < start_seq + 10; i++) {
+ ASSERT_LE(tp_mapping.GetOldestApproximateTime(i), (i + 1) * 10);
+ }
+ start_seq += 10;
+ for (auto i = start_seq; i < seq_end; i++) {
+ // The result is within the range
+ ASSERT_GE(tp_mapping.GetOldestApproximateTime(i), (i - 10) * 10);
+ ASSERT_LE(tp_mapping.GetOldestApproximateTime(i), (i + 10) * 10);
+ }
+ checked_file_nums.insert(it->second->orig_file_number);
+ start_seq = seq_end;
+
+ // Write a key every 1 seconds
+ for (int i = 0; i < 200; i++) {
+ ASSERT_OK(Put(Key(i + 190), "value"));
+ dbfull()->TEST_WaitForPeridicTaskRun(
+ [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(1)); });
+ }
+ seq_end = dbfull()->GetLatestSequenceNumber();
+ ASSERT_OK(Flush());
+ tables_props.clear();
+ ASSERT_OK(dbfull()->GetPropertiesOfAllTables(&tables_props));
+ ASSERT_EQ(tables_props.size(), 2);
+ it = tables_props.begin();
+ while (it != tables_props.end()) {
+ if (!checked_file_nums.count(it->second->orig_file_number)) {
+ break;
+ }
+ it++;
+ }
+ ASSERT_TRUE(it != tables_props.end());
+
+ tp_mapping.Clear();
+ ASSERT_OK(tp_mapping.Add(it->second->seqno_to_time_mapping));
+ ASSERT_OK(tp_mapping.Sort());
+ seqs = tp_mapping.TEST_GetInternalMapping();
+  // There are only a few time samples
+ ASSERT_GE(seqs.size(), 1);
+ ASSERT_LE(seqs.size(), 3);
+ for (auto i = start_seq; i < seq_end; i++) {
+    // The result is not very accurate, as more data was written within a small
+    // range of time
+ ASSERT_GE(tp_mapping.GetOldestApproximateTime(i), (i - start_seq) + 1000);
+ ASSERT_LE(tp_mapping.GetOldestApproximateTime(i), (i - start_seq) + 3000);
+ }
+ checked_file_nums.insert(it->second->orig_file_number);
+ start_seq = seq_end;
+
+ // Write a key every 200 seconds
+ for (int i = 0; i < 200; i++) {
+ ASSERT_OK(Put(Key(i + 380), "value"));
+ dbfull()->TEST_WaitForPeridicTaskRun(
+ [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(200)); });
+ }
+ seq_end = dbfull()->GetLatestSequenceNumber();
+ ASSERT_OK(Flush());
+ tables_props.clear();
+ ASSERT_OK(dbfull()->GetPropertiesOfAllTables(&tables_props));
+ ASSERT_EQ(tables_props.size(), 3);
+ it = tables_props.begin();
+ while (it != tables_props.end()) {
+ if (!checked_file_nums.count(it->second->orig_file_number)) {
+ break;
+ }
+ it++;
+ }
+ ASSERT_TRUE(it != tables_props.end());
+
+ tp_mapping.Clear();
+ ASSERT_OK(tp_mapping.Add(it->second->seqno_to_time_mapping));
+ ASSERT_OK(tp_mapping.Sort());
+ seqs = tp_mapping.TEST_GetInternalMapping();
+  // The number of seqno -> time entries should be at the per-SST max
+ ASSERT_GE(seqs.size(), 99);
+ ASSERT_LE(seqs.size(), 101);
+ for (auto i = start_seq; i < seq_end - 99; i++) {
+    // likely the first 100 entries report 0
+ ASSERT_LE(tp_mapping.GetOldestApproximateTime(i), (i - start_seq) + 3000);
+ }
+ start_seq += 101;
+
+ for (auto i = start_seq; i < seq_end; i++) {
+ ASSERT_GE(tp_mapping.GetOldestApproximateTime(i),
+ (i - start_seq) * 200 + 22200);
+ ASSERT_LE(tp_mapping.GetOldestApproximateTime(i),
+ (i - start_seq) * 200 + 22600);
+ }
+ checked_file_nums.insert(it->second->orig_file_number);
+ start_seq = seq_end;
+
+ // Write a key every 100 seconds
+ for (int i = 0; i < 200; i++) {
+ ASSERT_OK(Put(Key(i + 570), "value"));
+ dbfull()->TEST_WaitForPeridicTaskRun(
+ [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(100)); });
+ }
+ seq_end = dbfull()->GetLatestSequenceNumber();
+ ASSERT_OK(Flush());
+ tables_props.clear();
+ ASSERT_OK(dbfull()->GetPropertiesOfAllTables(&tables_props));
+ ASSERT_EQ(tables_props.size(), 4);
+ it = tables_props.begin();
+ while (it != tables_props.end()) {
+ if (!checked_file_nums.count(it->second->orig_file_number)) {
+ break;
+ }
+ it++;
+ }
+ ASSERT_TRUE(it != tables_props.end());
+ tp_mapping.Clear();
+ ASSERT_OK(tp_mapping.Add(it->second->seqno_to_time_mapping));
+ ASSERT_OK(tp_mapping.Sort());
+ seqs = tp_mapping.TEST_GetInternalMapping();
+ ASSERT_GE(seqs.size(), 99);
+ ASSERT_LE(seqs.size(), 101);
+
+ checked_file_nums.insert(it->second->orig_file_number);
+
+ // re-enable compaction
+ ASSERT_OK(dbfull()->SetOptions({
+ {"disable_auto_compactions", "false"},
+ }));
+
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ tables_props.clear();
+ ASSERT_OK(dbfull()->GetPropertiesOfAllTables(&tables_props));
+ ASSERT_GE(tables_props.size(), 1);
+ it = tables_props.begin();
+ while (it != tables_props.end()) {
+ if (!checked_file_nums.count(it->second->orig_file_number)) {
+ break;
+ }
+ it++;
+ }
+ ASSERT_TRUE(it != tables_props.end());
+ tp_mapping.Clear();
+ ASSERT_OK(tp_mapping.Add(it->second->seqno_to_time_mapping));
+ ASSERT_OK(tp_mapping.Sort());
+ seqs = tp_mapping.TEST_GetInternalMapping();
+ ASSERT_GE(seqs.size(), 99);
+ ASSERT_LE(seqs.size(), 101);
+ for (auto i = start_seq; i < seq_end - 99; i++) {
+    // likely the first 100 entries report 0
+ ASSERT_LE(tp_mapping.GetOldestApproximateTime(i),
+ (i - start_seq) * 100 + 50000);
+ }
+ start_seq += 101;
+
+ for (auto i = start_seq; i < seq_end; i++) {
+ ASSERT_GE(tp_mapping.GetOldestApproximateTime(i),
+ (i - start_seq) * 100 + 52200);
+ ASSERT_LE(tp_mapping.GetOldestApproximateTime(i),
+ (i - start_seq) * 100 + 52400);
+ }
+ ASSERT_OK(db_->Close());
+}
+
+TEST_P(SeqnoTimeTablePropTest, MultiCFs) {
+ Options options = CurrentOptions();
+ options.preclude_last_level_data_seconds = 0;
+ options.preserve_internal_time_seconds = 0;
+ options.env = mock_env_.get();
+ options.stats_dump_period_sec = 0;
+ options.stats_persist_period_sec = 0;
+ ReopenWithColumnFamilies({"default"}, options);
+
+ const PeriodicTaskScheduler& scheduler =
+ dbfull()->TEST_GetPeriodicTaskScheduler();
+ ASSERT_FALSE(scheduler.TEST_HasTask(PeriodicTaskType::kRecordSeqnoTime));
+
+ // Write some data and increase the current time
+ for (int i = 0; i < 200; i++) {
+ ASSERT_OK(Put(Key(i), "value"));
+ dbfull()->TEST_WaitForPeridicTaskRun(
+ [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(100)); });
+ }
+ ASSERT_OK(Flush());
+ TablePropertiesCollection tables_props;
+ ASSERT_OK(dbfull()->GetPropertiesOfAllTables(&tables_props));
+ ASSERT_EQ(tables_props.size(), 1);
+ auto it = tables_props.begin();
+ ASSERT_TRUE(it->second->seqno_to_time_mapping.empty());
+
+ ASSERT_TRUE(dbfull()->TEST_GetSeqnoToTimeMapping().Empty());
+
+ Options options_1 = options;
+ SetTrackTimeDurationOptions(10000, options_1);
+ CreateColumnFamilies({"one"}, options_1);
+ ASSERT_TRUE(scheduler.TEST_HasTask(PeriodicTaskType::kRecordSeqnoTime));
+
+ // Write some data to the default CF (without preclude_last_level feature)
+ for (int i = 0; i < 200; i++) {
+ ASSERT_OK(Put(Key(i), "value"));
+ dbfull()->TEST_WaitForPeridicTaskRun(
+ [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(100)); });
+ }
+ ASSERT_OK(Flush());
+
+  // Write some data to CF "one"
+ for (int i = 0; i < 20; i++) {
+ ASSERT_OK(Put(1, Key(i), "value"));
+ dbfull()->TEST_WaitForPeridicTaskRun(
+ [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(10)); });
+ }
+ ASSERT_OK(Flush(1));
+ tables_props.clear();
+ ASSERT_OK(dbfull()->GetPropertiesOfAllTables(handles_[1], &tables_props));
+ ASSERT_EQ(tables_props.size(), 1);
+ it = tables_props.begin();
+ SeqnoToTimeMapping tp_mapping;
+ ASSERT_OK(tp_mapping.Add(it->second->seqno_to_time_mapping));
+ ASSERT_OK(tp_mapping.Sort());
+ ASSERT_FALSE(tp_mapping.Empty());
+ auto seqs = tp_mapping.TEST_GetInternalMapping();
+ ASSERT_GE(seqs.size(), 1);
+ ASSERT_LE(seqs.size(), 4);
+
+ // Create one more CF with larger preclude_last_level time
+ Options options_2 = options;
+ SetTrackTimeDurationOptions(1000000, options_2); // 1m
+ CreateColumnFamilies({"two"}, options_2);
+
+  // Add more data to CF "two" to fill the in-memory mapping
+ for (int i = 0; i < 2000; i++) {
+ ASSERT_OK(Put(2, Key(i), "value"));
+ dbfull()->TEST_WaitForPeridicTaskRun(
+ [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(100)); });
+ }
+ seqs = dbfull()->TEST_GetSeqnoToTimeMapping().TEST_GetInternalMapping();
+ ASSERT_GE(seqs.size(), 1000 - 1);
+ ASSERT_LE(seqs.size(), 1000 + 1);
+
+ ASSERT_OK(Flush(2));
+ tables_props.clear();
+ ASSERT_OK(dbfull()->GetPropertiesOfAllTables(handles_[2], &tables_props));
+ ASSERT_EQ(tables_props.size(), 1);
+ it = tables_props.begin();
+ tp_mapping.Clear();
+ ASSERT_OK(tp_mapping.Add(it->second->seqno_to_time_mapping));
+ ASSERT_OK(tp_mapping.Sort());
+ seqs = tp_mapping.TEST_GetInternalMapping();
+  // the max number of encoded entries is 100
+ ASSERT_GE(seqs.size(), 100 - 1);
+ ASSERT_LE(seqs.size(), 100 + 1);
+
+  // Write some data to the default CF. As all memtables with
+  // preclude_last_level enabled have been flushed, the in-memory seqno->time
+  // mapping should be cleared
+ for (int i = 0; i < 10; i++) {
+ ASSERT_OK(Put(0, Key(i), "value"));
+ dbfull()->TEST_WaitForPeridicTaskRun(
+ [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(100)); });
+ }
+ seqs = dbfull()->TEST_GetSeqnoToTimeMapping().TEST_GetInternalMapping();
+ ASSERT_OK(Flush(0));
+
+ // trigger compaction for CF "two" and make sure the compaction output has
+ // seqno_to_time_mapping
+ for (int j = 0; j < 3; j++) {
+ for (int i = 0; i < 200; i++) {
+ ASSERT_OK(Put(2, Key(i), "value"));
+ dbfull()->TEST_WaitForPeridicTaskRun(
+ [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(100)); });
+ }
+ ASSERT_OK(Flush(2));
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ tables_props.clear();
+ ASSERT_OK(dbfull()->GetPropertiesOfAllTables(handles_[2], &tables_props));
+ ASSERT_EQ(tables_props.size(), 1);
+ it = tables_props.begin();
+ tp_mapping.Clear();
+ ASSERT_OK(tp_mapping.Add(it->second->seqno_to_time_mapping));
+ ASSERT_OK(tp_mapping.Sort());
+ seqs = tp_mapping.TEST_GetInternalMapping();
+ ASSERT_GE(seqs.size(), 99);
+ ASSERT_LE(seqs.size(), 101);
+
+ for (int j = 0; j < 2; j++) {
+ for (int i = 0; i < 200; i++) {
+ ASSERT_OK(Put(0, Key(i), "value"));
+ dbfull()->TEST_WaitForPeridicTaskRun(
+ [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(100)); });
+ }
+ ASSERT_OK(Flush(0));
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ tables_props.clear();
+ ASSERT_OK(dbfull()->GetPropertiesOfAllTables(handles_[0], &tables_props));
+ ASSERT_EQ(tables_props.size(), 1);
+ it = tables_props.begin();
+ ASSERT_TRUE(it->second->seqno_to_time_mapping.empty());
+
+  // Write some data to CF "two", but don't flush, so the in-memory mapping
+  // keeps accumulating
+ for (int i = 0; i < 1000; i++) {
+ ASSERT_OK(Put(2, Key(i), "value"));
+ dbfull()->TEST_WaitForPeridicTaskRun(
+ [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(100)); });
+ }
+ ASSERT_GE(
+ dbfull()->TEST_GetSeqnoToTimeMapping().TEST_GetInternalMapping().size(),
+ 500);
+  // After dropping CF "one", the in-memory mapping will change to only follow
+  // the CF "two" options.
+ ASSERT_OK(db_->DropColumnFamily(handles_[1]));
+ ASSERT_LE(
+ dbfull()->TEST_GetSeqnoToTimeMapping().TEST_GetInternalMapping().size(),
+ 100 + 5);
+
+  // After dropping CF "two", the in-memory mapping is also cleared.
+ ASSERT_OK(db_->DropColumnFamily(handles_[2]));
+ ASSERT_EQ(
+ dbfull()->TEST_GetSeqnoToTimeMapping().TEST_GetInternalMapping().size(),
+ 0);
+
+ // And the timer worker is stopped
+ ASSERT_FALSE(scheduler.TEST_HasTask(PeriodicTaskType::kRecordSeqnoTime));
+ Close();
+}
+
+TEST_P(SeqnoTimeTablePropTest, MultiInstancesBasic) {
+ const int kInstanceNum = 2;
+
+ Options options = CurrentOptions();
+ SetTrackTimeDurationOptions(10000, options);
+ options.env = mock_env_.get();
+ options.stats_dump_period_sec = 0;
+ options.stats_persist_period_sec = 0;
+
+ auto dbs = std::vector<DB*>(kInstanceNum);
+ for (int i = 0; i < kInstanceNum; i++) {
+ ASSERT_OK(
+ DB::Open(options, test::PerThreadDBPath(std::to_string(i)), &(dbs[i])));
+ }
+
+ // Make sure the second instance has the worker enabled
+ auto dbi = static_cast_with_check<DBImpl>(dbs[1]);
+ WriteOptions wo;
+ for (int i = 0; i < 200; i++) {
+ ASSERT_OK(dbi->Put(wo, Key(i), "value"));
+ dbfull()->TEST_WaitForPeridicTaskRun(
+ [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(100)); });
+ }
+ SeqnoToTimeMapping seqno_to_time_mapping = dbi->TEST_GetSeqnoToTimeMapping();
+ ASSERT_GT(seqno_to_time_mapping.Size(), 10);
+
+ for (int i = 0; i < kInstanceNum; i++) {
+ ASSERT_OK(dbs[i]->Close());
+ delete dbs[i];
+ }
+}
+
+TEST_P(SeqnoTimeTablePropTest, SeqnoToTimeMappingUniversal) {
+ const int kNumTrigger = 4;
+ const int kNumLevels = 7;
+ const int kNumKeys = 100;
+
+ Options options = CurrentOptions();
+ SetTrackTimeDurationOptions(10000, options);
+ options.compaction_style = kCompactionStyleUniversal;
+ options.num_levels = kNumLevels;
+ options.env = mock_env_.get();
+
+ DestroyAndReopen(options);
+
+ std::atomic_uint64_t num_seqno_zeroing{0};
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->SetCallBack(
+ "CompactionIterator::PrepareOutput:ZeroingSeq",
+ [&](void* /*arg*/) { num_seqno_zeroing++; });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ int sst_num = 0;
+ for (; sst_num < kNumTrigger - 1; sst_num++) {
+ for (int i = 0; i < kNumKeys; i++) {
+ ASSERT_OK(Put(Key(sst_num * (kNumKeys - 1) + i), "value"));
+ dbfull()->TEST_WaitForPeridicTaskRun(
+ [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(10)); });
+ }
+ ASSERT_OK(Flush());
+ }
+ TablePropertiesCollection tables_props;
+ ASSERT_OK(dbfull()->GetPropertiesOfAllTables(&tables_props));
+ ASSERT_EQ(tables_props.size(), 3);
+ for (const auto& props : tables_props) {
+ ASSERT_FALSE(props.second->seqno_to_time_mapping.empty());
+ SeqnoToTimeMapping tp_mapping;
+ ASSERT_OK(tp_mapping.Add(props.second->seqno_to_time_mapping));
+ ASSERT_OK(tp_mapping.Sort());
+ ASSERT_FALSE(tp_mapping.Empty());
+ auto seqs = tp_mapping.TEST_GetInternalMapping();
+ ASSERT_GE(seqs.size(), 10 - 1);
+ ASSERT_LE(seqs.size(), 10 + 1);
+ }
+
+ // Trigger a compaction
+ for (int i = 0; i < kNumKeys; i++) {
+ ASSERT_OK(Put(Key(sst_num * (kNumKeys - 1) + i), "value"));
+ dbfull()->TEST_WaitForPeridicTaskRun(
+ [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(10)); });
+ }
+ sst_num++;
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ tables_props.clear();
+ ASSERT_OK(dbfull()->GetPropertiesOfAllTables(&tables_props));
+ ASSERT_EQ(tables_props.size(), 1);
+
+ auto it = tables_props.begin();
+ SeqnoToTimeMapping tp_mapping;
+ ASSERT_FALSE(it->second->seqno_to_time_mapping.empty());
+ ASSERT_OK(tp_mapping.Add(it->second->seqno_to_time_mapping));
+
+ // compact to the last level
+ CompactRangeOptions cro;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  // Make sure the data is all compacted to the penultimate level if the
+  // feature is on; otherwise, it is compacted to the last level.
+ if (options.preclude_last_level_data_seconds > 0) {
+ ASSERT_GT(NumTableFilesAtLevel(5), 0);
+ ASSERT_EQ(NumTableFilesAtLevel(6), 0);
+ } else {
+ ASSERT_EQ(NumTableFilesAtLevel(5), 0);
+ ASSERT_GT(NumTableFilesAtLevel(6), 0);
+ }
+
+  // Regardless of whether the file is on the last level or not, it should keep
+  // the time information, and the sequence numbers are not zeroed out
+ tables_props.clear();
+ tp_mapping.Clear();
+ ASSERT_OK(dbfull()->GetPropertiesOfAllTables(&tables_props));
+
+ ASSERT_EQ(tables_props.size(), 1);
+ ASSERT_EQ(num_seqno_zeroing, 0);
+
+ it = tables_props.begin();
+ ASSERT_FALSE(it->second->seqno_to_time_mapping.empty());
+ ASSERT_OK(tp_mapping.Add(it->second->seqno_to_time_mapping));
+
+ // make half of the data expired
+ mock_clock_->MockSleepForSeconds(static_cast<int>(8000));
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+ tables_props.clear();
+ tp_mapping.Clear();
+ ASSERT_OK(dbfull()->GetPropertiesOfAllTables(&tables_props));
+
+ if (options.preclude_last_level_data_seconds > 0) {
+ ASSERT_EQ(tables_props.size(), 2);
+ } else {
+ ASSERT_EQ(tables_props.size(), 1);
+ }
+ ASSERT_GT(num_seqno_zeroing, 0);
+ std::vector<KeyVersion> key_versions;
+ ASSERT_OK(GetAllKeyVersions(db_, Slice(), Slice(),
+ std::numeric_limits<size_t>::max(),
+ &key_versions));
+  // Make sure there are more than 300 keys; the first 100 keys have their
+  // seqnos zeroed out and the last 100 keys do not
+ ASSERT_GT(key_versions.size(), 300);
+ for (int i = 0; i < 100; i++) {
+ ASSERT_EQ(key_versions[i].sequence, 0);
+ }
+ auto rit = key_versions.rbegin();
+ for (int i = 0; i < 100; i++) {
+ ASSERT_GT(rit->sequence, 0);
+ rit++;
+ }
+
+  // Make all data expire and compact again to push it to the last level,
+  // regardless of whether the tiering feature is enabled or not
+ mock_clock_->MockSleepForSeconds(static_cast<int>(20000));
+
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+ ASSERT_GT(num_seqno_zeroing, 0);
+ ASSERT_GT(NumTableFilesAtLevel(6), 0);
+
+ Close();
+}
+
+TEST_F(SeqnoTimeTest, MappingAppend) {
+ SeqnoToTimeMapping test(/*max_time_duration=*/100, /*max_capacity=*/10);
+
+ // ignore seqno == 0, as it may mean the seqno is zeroed out
+ ASSERT_FALSE(test.Append(0, 9));
+
+ ASSERT_TRUE(test.Append(3, 10));
+ auto size = test.Size();
+ // normal add
+ ASSERT_TRUE(test.Append(10, 11));
+ size++;
+ ASSERT_EQ(size, test.Size());
+
+ // Append unsorted
+ ASSERT_FALSE(test.Append(8, 12));
+ ASSERT_EQ(size, test.Size());
+
+ // Append with the same seqno, newer time will be accepted
+ ASSERT_TRUE(test.Append(10, 12));
+ ASSERT_EQ(size, test.Size());
+ // older time will be ignored
+ ASSERT_FALSE(test.Append(10, 9));
+ ASSERT_EQ(size, test.Size());
+
+ // new seqno with old time will be ignored
+ ASSERT_FALSE(test.Append(12, 8));
+ ASSERT_EQ(size, test.Size());
+}
+
+TEST_F(SeqnoTimeTest, GetOldestApproximateTime) {
+ SeqnoToTimeMapping test(/*max_time_duration=*/100, /*max_capacity=*/10);
+
+ ASSERT_EQ(test.GetOldestApproximateTime(10), kUnknownSeqnoTime);
+
+ test.Append(3, 10);
+
+ ASSERT_EQ(test.GetOldestApproximateTime(2), kUnknownSeqnoTime);
+ ASSERT_EQ(test.GetOldestApproximateTime(3), 10);
+ ASSERT_EQ(test.GetOldestApproximateTime(10), 10);
+
+ test.Append(10, 100);
+
+ test.Append(100, 1000);
+ ASSERT_EQ(test.GetOldestApproximateTime(10), 100);
+ ASSERT_EQ(test.GetOldestApproximateTime(40), 100);
+ ASSERT_EQ(test.GetOldestApproximateTime(111), 1000);
+}
+
+TEST_F(SeqnoTimeTest, Sort) {
+ SeqnoToTimeMapping test;
+
+ // single entry
+ test.Add(10, 11);
+ ASSERT_OK(test.Sort());
+ ASSERT_EQ(test.Size(), 1);
+
+ // duplicate, should be removed by sort
+ test.Add(10, 11);
+ // same seqno, but older time, should be removed
+ test.Add(10, 9);
+
+  // useless ones, should be removed by sort
+ test.Add(11, 9);
+ test.Add(9, 8);
+
+ // Good ones
+ test.Add(1, 10);
+ test.Add(100, 100);
+
+ ASSERT_OK(test.Sort());
+
+ auto seqs = test.TEST_GetInternalMapping();
+
+ std::deque<SeqnoToTimeMapping::SeqnoTimePair> expected;
+ expected.emplace_back(1, 10);
+ expected.emplace_back(10, 11);
+ expected.emplace_back(100, 100);
+
+ ASSERT_EQ(expected, seqs);
+}
+
+TEST_F(SeqnoTimeTest, EncodeDecodeBasic) {
+ SeqnoToTimeMapping test(0, 1000);
+
+ std::string output;
+ test.Encode(output, 0, 1000, 100);
+ ASSERT_TRUE(output.empty());
+
+ for (int i = 1; i <= 1000; i++) {
+ ASSERT_TRUE(test.Append(i, i * 10));
+ }
+ test.Encode(output, 0, 1000, 100);
+
+ ASSERT_FALSE(output.empty());
+
+ SeqnoToTimeMapping decoded;
+ ASSERT_OK(decoded.Add(output));
+ ASSERT_OK(decoded.Sort());
+ ASSERT_EQ(decoded.Size(), SeqnoToTimeMapping::kMaxSeqnoTimePairsPerSST);
+ ASSERT_EQ(test.Size(), 1000);
+
+ for (SequenceNumber seq = 0; seq <= 1000; seq++) {
+    // `test` has the more accurate time mapping; Encode() only picks
+    // kMaxSeqnoTimePairsPerSST entries, which is less accurate
+ uint64_t target_time = test.GetOldestApproximateTime(seq);
+ ASSERT_GE(decoded.GetOldestApproximateTime(seq),
+ target_time < 200 ? 0 : target_time - 200);
+ ASSERT_LE(decoded.GetOldestApproximateTime(seq), target_time);
+ }
+}
+
+TEST_F(SeqnoTimeTest, EncodeDecodePerferNewTime) {
+ SeqnoToTimeMapping test(0, 10);
+
+ test.Append(1, 10);
+ test.Append(5, 17);
+ test.Append(6, 25);
+ test.Append(8, 30);
+
+ std::string output;
+ test.Encode(output, 1, 10, 0, 3);
+
+ SeqnoToTimeMapping decoded;
+ ASSERT_OK(decoded.Add(output));
+ ASSERT_OK(decoded.Sort());
+
+ ASSERT_EQ(decoded.Size(), 3);
+
+ auto seqs = decoded.TEST_GetInternalMapping();
+ std::deque<SeqnoToTimeMapping::SeqnoTimePair> expected;
+ expected.emplace_back(1, 10);
+ expected.emplace_back(6, 25);
+ expected.emplace_back(8, 30);
+ ASSERT_EQ(expected, seqs);
+
+  // Add a few entries with large time values
+ test.Append(10, 100);
+ test.Append(13, 200);
+ test.Append(16, 300);
+
+ output.clear();
+ test.Encode(output, 1, 20, 0, 4);
+ decoded.Clear();
+ ASSERT_OK(decoded.Add(output));
+ ASSERT_OK(decoded.Sort());
+ ASSERT_EQ(decoded.Size(), 4);
+
+ expected.clear();
+ expected.emplace_back(1, 10);
+  // Entries #6 and #8 are skipped as they are too close to #1.
+  // The entry with time 100 is also within the skip range, but if it were
+  // skipped there would not be enough entries left to fill 4, so it is selected.
+ expected.emplace_back(10, 100);
+ expected.emplace_back(13, 200);
+ expected.emplace_back(16, 300);
+ seqs = decoded.TEST_GetInternalMapping();
+ ASSERT_EQ(expected, seqs);
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // ROCKSDB_LITE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/seqno_to_time_mapping.cc b/src/rocksdb/db/seqno_to_time_mapping.cc
new file mode 100644
index 000000000..c69209929
--- /dev/null
+++ b/src/rocksdb/db/seqno_to_time_mapping.cc
@@ -0,0 +1,341 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/seqno_to_time_mapping.h"
+
+#include "db/version_edit.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+uint64_t SeqnoToTimeMapping::GetOldestApproximateTime(
+ const SequenceNumber seqno) const {
+ assert(is_sorted_);
+ auto it = std::upper_bound(seqno_time_mapping_.begin(),
+ seqno_time_mapping_.end(), seqno);
+ if (it == seqno_time_mapping_.begin()) {
+ return 0;
+ }
+ it--;
+ return it->time;
+}
+
+void SeqnoToTimeMapping::Add(SequenceNumber seqno, uint64_t time) {
+ if (seqno == 0) {
+ return;
+ }
+ is_sorted_ = false;
+ seqno_time_mapping_.emplace_back(seqno, time);
+}
+
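+// Drop entries whose time is older than (now - max_time_duration_), keeping
+// the newest entry at or below that cut-off so that older seqnos can still be
+// mapped to an approximate time.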
+void SeqnoToTimeMapping::TruncateOldEntries(const uint64_t now) {
+ assert(is_sorted_);
+
+ if (max_time_duration_ == 0) {
+ return;
+ }
+
+ const uint64_t cut_off_time =
+ now > max_time_duration_ ? now - max_time_duration_ : 0;
+ assert(cut_off_time <= now); // no overflow
+
+ auto it = std::upper_bound(
+ seqno_time_mapping_.begin(), seqno_time_mapping_.end(), cut_off_time,
+ [](uint64_t target, const SeqnoTimePair& other) -> bool {
+ return target < other.time;
+ });
+ if (it == seqno_time_mapping_.begin()) {
+ return;
+ }
+ it--;
+ seqno_time_mapping_.erase(seqno_time_mapping_.begin(), it);
+}
+
+SequenceNumber SeqnoToTimeMapping::GetOldestSequenceNum(uint64_t time) {
+ assert(is_sorted_);
+
+ auto it = std::upper_bound(
+ seqno_time_mapping_.begin(), seqno_time_mapping_.end(), time,
+ [](uint64_t target, const SeqnoTimePair& other) -> bool {
+ return target < other.time;
+ });
+ if (it == seqno_time_mapping_.begin()) {
+ return 0;
+ }
+ it--;
+ return it->seqno;
+}
+
+// The encoded format is:
+// [num_of_entries][[seqno][time],[seqno][time],...]
+// ^ ^
+// var_int delta_encoded (var_int)
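+// A small illustrative example (hypothetical values): encoding the two entries
+// (seqno 10, time 100) and (seqno 50, time 300) writes the varint count 2,
+// then the pairs (10, 100) and (40, 200), where the second pair is the delta
+// from the first.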
+void SeqnoToTimeMapping::Encode(std::string& dest, const SequenceNumber start,
+ const SequenceNumber end, const uint64_t now,
+ const uint64_t output_size) const {
+ assert(is_sorted_);
+ if (start > end) {
+    // This could happen when the SST file is empty: the initial value of the
+    // min sequence number is kMaxSequenceNumber and the max is 0.
+    // The empty output file will be removed in the final step of compaction.
+ return;
+ }
+
+ auto start_it = std::upper_bound(seqno_time_mapping_.begin(),
+ seqno_time_mapping_.end(), start);
+ if (start_it != seqno_time_mapping_.begin()) {
+ start_it--;
+ }
+
+ auto end_it = std::upper_bound(seqno_time_mapping_.begin(),
+ seqno_time_mapping_.end(), end);
+ if (end_it == seqno_time_mapping_.begin()) {
+ return;
+ }
+ if (start_it >= end_it) {
+ return;
+ }
+
+ // truncate old entries that are not needed
+ if (max_time_duration_ > 0) {
+ const uint64_t cut_off_time =
+ now > max_time_duration_ ? now - max_time_duration_ : 0;
+ while (start_it < end_it && start_it->time < cut_off_time) {
+ start_it++;
+ }
+ }
+ // to include the first element
+ if (start_it != seqno_time_mapping_.begin()) {
+ start_it--;
+ }
+
+  // If there is more data than needed, pick the entries for encoding. This is
+  // not the most optimized algorithm for selecting the best representative
+  // entries over time.
+  // It starts from the beginning and makes sure the time distance is larger
+  // than `(end - start) / size` before selecting the next entry. For example,
+  // for the following list, pick 3 entries (it will pick seqno #1, #6, #8):
+  // 1 -> 10
+  // 5 -> 17
+  // 6 -> 25
+  // 8 -> 30
+  // First, it always picks the first entry. Then there are 2 entries left to
+  // fill (num_entries_to_fill) and the time difference between the current
+  // entry and the last one is (30 - 10) = 20. 20 / 2 = 10, so it skips until
+  // time 10 + 10 = 20, which skips #5 and picks #6.
+  // The most optimized solution would pick #1, #5, #8, as that is more evenly
+  // distributed over time. Still, the following algorithm is simple and may
+  // over-select recent data, which is good: we do want more accurate time
+  // information for recent data.
+ std::deque<SeqnoTimePair> output_copy;
+ if (std::distance(start_it, end_it) > static_cast<int64_t>(output_size)) {
+ int64_t num_entries_to_fill = static_cast<int64_t>(output_size);
+ auto last_it = end_it;
+ last_it--;
+ uint64_t end_time = last_it->time;
+ uint64_t skip_until_time = 0;
+ for (auto it = start_it; it < end_it; it++) {
+      // skip if it has not reached skip_until_time yet
+ if (std::distance(it, end_it) > num_entries_to_fill &&
+ it->time < skip_until_time) {
+ continue;
+ }
+ output_copy.push_back(*it);
+ num_entries_to_fill--;
+ if (std::distance(it, end_it) > num_entries_to_fill &&
+ num_entries_to_fill > 0) {
+ // If there are more entries than we need, re-calculate the
+ // skip_until_time, which means skip until that time
+ skip_until_time =
+ it->time + ((end_time - it->time) / num_entries_to_fill);
+ }
+ }
+
+ // Make sure all entries are filled
+ assert(num_entries_to_fill == 0);
+ start_it = output_copy.begin();
+ end_it = output_copy.end();
+ }
+
+ // Delta encode the data
+ uint64_t size = std::distance(start_it, end_it);
+ PutVarint64(&dest, size);
+ SeqnoTimePair base;
+ for (auto it = start_it; it < end_it; it++) {
+ assert(base < *it);
+ SeqnoTimePair val = *it - base;
+ base = *it;
+ val.Encode(dest);
+ }
+}
+
+Status SeqnoToTimeMapping::Add(const std::string& seqno_time_mapping_str) {
+ Slice input(seqno_time_mapping_str);
+ if (input.empty()) {
+ return Status::OK();
+ }
+ uint64_t size;
+ if (!GetVarint64(&input, &size)) {
+ return Status::Corruption("Invalid sequence number time size");
+ }
+ is_sorted_ = false;
+ SeqnoTimePair base;
+ for (uint64_t i = 0; i < size; i++) {
+ SeqnoTimePair val;
+ Status s = val.Decode(input);
+ if (!s.ok()) {
+ return s;
+ }
+ val.Add(base);
+ seqno_time_mapping_.emplace_back(val);
+ base = val;
+ }
+ return Status::OK();
+}
+
+void SeqnoToTimeMapping::SeqnoTimePair::Encode(std::string& dest) const {
+ PutVarint64Varint64(&dest, seqno, time);
+}
+
+Status SeqnoToTimeMapping::SeqnoTimePair::Decode(Slice& input) {
+ if (!GetVarint64(&input, &seqno)) {
+ return Status::Corruption("Invalid sequence number");
+ }
+ if (!GetVarint64(&input, &time)) {
+ return Status::Corruption("Invalid time");
+ }
+ return Status::OK();
+}
+
+bool SeqnoToTimeMapping::Append(SequenceNumber seqno, uint64_t time) {
+ assert(is_sorted_);
+
+ // skip seq number 0, which may have special meaning, like zeroed out data
+ if (seqno == 0) {
+ return false;
+ }
+ if (!Empty()) {
+ if (seqno < Last().seqno || time < Last().time) {
+ return false;
+ }
+ if (seqno == Last().seqno) {
+ Last().time = time;
+ return true;
+ }
+ if (time == Last().time) {
+      // The new sequence number has the same time as the old one, no need to
+      // add a new mapping
+ return false;
+ }
+ }
+
+ seqno_time_mapping_.emplace_back(seqno, time);
+
+ if (seqno_time_mapping_.size() > max_capacity_) {
+ seqno_time_mapping_.pop_front();
+ }
+ return true;
+}
+
+bool SeqnoToTimeMapping::Resize(uint64_t min_time_duration,
+ uint64_t max_time_duration) {
+ uint64_t new_max_capacity =
+ CalculateMaxCapacity(min_time_duration, max_time_duration);
+ if (new_max_capacity == max_capacity_) {
+ return false;
+ } else if (new_max_capacity < seqno_time_mapping_.size()) {
+ uint64_t delta = seqno_time_mapping_.size() - new_max_capacity;
+ seqno_time_mapping_.erase(seqno_time_mapping_.begin(),
+ seqno_time_mapping_.begin() + delta);
+ }
+ max_capacity_ = new_max_capacity;
+ return true;
+}
+
+Status SeqnoToTimeMapping::Sort() {
+ if (is_sorted_) {
+ return Status::OK();
+ }
+ if (seqno_time_mapping_.empty()) {
+ is_sorted_ = true;
+ return Status::OK();
+ }
+
+ std::deque<SeqnoTimePair> copy = std::move(seqno_time_mapping_);
+
+ std::sort(copy.begin(), copy.end());
+
+ seqno_time_mapping_.clear();
+
+ // remove seqno = 0, which may have special meaning, like zeroed out data
+ while (copy.front().seqno == 0) {
+ copy.pop_front();
+ }
+
+ SeqnoTimePair prev = copy.front();
+ for (const auto& it : copy) {
+ // If sequence number is the same, pick the one with larger time, which is
+ // more accurate than the older time.
+ if (it.seqno == prev.seqno) {
+ assert(it.time >= prev.time);
+ prev.time = it.time;
+ } else {
+ assert(it.seqno > prev.seqno);
+      // If a larger sequence number has an older time, the entry is not
+      // useful, so skip it
+ if (it.time > prev.time) {
+ seqno_time_mapping_.push_back(prev);
+ prev = it;
+ }
+ }
+ }
+ seqno_time_mapping_.emplace_back(prev);
+
+ is_sorted_ = true;
+ return Status::OK();
+}
+
+std::string SeqnoToTimeMapping::ToHumanString() const {
+ std::string ret;
+ for (const auto& seq_time : seqno_time_mapping_) {
+ AppendNumberTo(&ret, seq_time.seqno);
+ ret.append("->");
+ AppendNumberTo(&ret, seq_time.time);
+ ret.append(",");
+ }
+ return ret;
+}
+
+SeqnoToTimeMapping SeqnoToTimeMapping::Copy(
+ SequenceNumber smallest_seqno) const {
+ SeqnoToTimeMapping ret;
+ auto it = std::upper_bound(seqno_time_mapping_.begin(),
+ seqno_time_mapping_.end(), smallest_seqno);
+ if (it != seqno_time_mapping_.begin()) {
+ it--;
+ }
+ std::copy(it, seqno_time_mapping_.end(),
+ std::back_inserter(ret.seqno_time_mapping_));
+ return ret;
+}
+
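+// A worked example (hypothetical values): with min_time_duration = 1000 and
+// max_time_duration = 2000, the capacity is
+// min(kMaxSeqnoToTimeEntries (1000), 2000 * 100 / 1000) = 200 entries.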
+uint64_t SeqnoToTimeMapping::CalculateMaxCapacity(uint64_t min_time_duration,
+ uint64_t max_time_duration) {
+ if (min_time_duration == 0) {
+ return 0;
+ }
+ return std::min(
+ kMaxSeqnoToTimeEntries,
+ max_time_duration * kMaxSeqnoTimePairsPerCF / min_time_duration);
+}
+
+SeqnoToTimeMapping::SeqnoTimePair SeqnoToTimeMapping::SeqnoTimePair::operator-(
+ const SeqnoTimePair& other) const {
+ SeqnoTimePair res;
+ res.seqno = seqno - other.seqno;
+ res.time = time - other.time;
+ return res;
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/seqno_to_time_mapping.h b/src/rocksdb/db/seqno_to_time_mapping.h
new file mode 100644
index 000000000..4ffc9c199
--- /dev/null
+++ b/src/rocksdb/db/seqno_to_time_mapping.h
@@ -0,0 +1,189 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <algorithm>
+#include <cinttypes>
+#include <deque>
+#include <functional>
+#include <iterator>
+#include <string>
+
+#include "rocksdb/status.h"
+#include "rocksdb/types.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+constexpr uint64_t kUnknownSeqnoTime = 0;
+
+// SeqnoToTimeMapping stores a sequence number to time mapping, so given a
+// sequence number it can estimate the oldest possible time for that sequence
+// number. For example:
+// 10 -> 100
+// 50 -> 300
+// then if a key has seqno 19, the OldestApproximateTime would be 100; for 51
+// it would be 300.
+// As it's a sorted list, new entries are appended at the back. Old entries are
+// popped from the front when they're no longer needed.
+//
+// Note: the data struct is not thread safe, both read and write need to be
+// synchronized by caller.
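+//
+// A minimal usage sketch (illustrative only; values are hypothetical):
+//   SeqnoToTimeMapping mapping(/*max_time_duration=*/100, /*max_capacity=*/10);
+//   mapping.Append(/*seqno=*/10, /*time=*/100);
+//   mapping.Append(/*seqno=*/50, /*time=*/300);
+//   mapping.GetOldestApproximateTime(19);  // returns 100
+//   mapping.GetOldestApproximateTime(51);  // returns 300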
+class SeqnoToTimeMapping {
+ public:
+  // Maximum number of entries that can be encoded into an SST. The data is
+  // delta encoded, so the maximum data usage for each SST is < 0.3KB
+ static constexpr uint64_t kMaxSeqnoTimePairsPerSST = 100;
+
+  // Maximum number of entries per CF. If only one CF has this feature on, the
+  // max duration is divided by this number to get the sampling cadence; for
+  // example, if preclude_last_level_data_seconds = 100000 (~1 day), it will
+  // sample the seqno -> time mapping every 1000 seconds (~17 minutes), and the
+  // maximum number of entries it needs is 100.
+  // When multiple CFs have this feature on, the sampling cadence is determined
+  // by the smallest setting and the capacity is determined by the largest
+  // setting; it's also capped at kMaxSeqnoTimePairsPerCF * 10.
+ static constexpr uint64_t kMaxSeqnoTimePairsPerCF = 100;
+
+ // A simple struct for sequence number to time pair
+ struct SeqnoTimePair {
+ SequenceNumber seqno = 0;
+ uint64_t time = 0;
+
+ SeqnoTimePair() = default;
+ SeqnoTimePair(SequenceNumber _seqno, uint64_t _time)
+ : seqno(_seqno), time(_time) {}
+
+ // Encode to dest string
+ void Encode(std::string& dest) const;
+
+ // Decode the value from input Slice and remove it from the input
+ Status Decode(Slice& input);
+
+    // Subtraction of 2 SeqnoTimePairs
+ SeqnoTimePair operator-(const SeqnoTimePair& other) const;
+
+ // Add 2 values together
+ void Add(const SeqnoTimePair& obj) {
+ seqno += obj.seqno;
+ time += obj.time;
+ }
+
+    // Compare a SeqnoTimePair with a sequence number, used for binary
+    // searching a sequence number in a list of SeqnoTimePairs
+ bool operator<(const SequenceNumber& other) const { return seqno < other; }
+
+ // Compare 2 SeqnoTimePair
+ bool operator<(const SeqnoTimePair& other) const {
+ return std::tie(seqno, time) < std::tie(other.seqno, other.time);
+ }
+
+    // Check if 2 SeqnoTimePairs are the same
+ bool operator==(const SeqnoTimePair& other) const {
+ return std::tie(seqno, time) == std::tie(other.seqno, other.time);
+ }
+ };
+
+  // Constructor of SeqnoToTimeMapping.
+  // max_time_duration is the maximum time span it should track. For example,
+  // if preclude_last_level_data_seconds is 1 day, then entries older than 1
+  // day can be removed.
+  // max_capacity is the maximum number of entries it can hold. For a single
+  // CF, it's capped at 100 (kMaxSeqnoTimePairsPerCF), otherwise at
+  // kMaxSeqnoTimePairsPerCF * 10.
+  // If set to 0, old data is never truncated.
+ explicit SeqnoToTimeMapping(uint64_t max_time_duration = 0,
+ uint64_t max_capacity = 0)
+ : max_time_duration_(max_time_duration), max_capacity_(max_capacity) {}
+
+ // Append a new entry to the list. The new entry should be newer than the
+ // existing ones. It maintains the internal sorted status.
+ bool Append(SequenceNumber seqno, uint64_t time);
+
+  // Given a sequence number, estimate its oldest possible time
+ uint64_t GetOldestApproximateTime(SequenceNumber seqno) const;
+
+ // Truncate the old entries based on the current time and max_time_duration_
+ void TruncateOldEntries(uint64_t now);
+
+  // Given a time, return its oldest possible sequence number
+ SequenceNumber GetOldestSequenceNum(uint64_t time);
+
+ // Encode to a binary string
+ void Encode(std::string& des, SequenceNumber start, SequenceNumber end,
+ uint64_t now,
+ uint64_t output_size = kMaxSeqnoTimePairsPerSST) const;
+
+  // Add a new entry. Unlike Append(), it accepts arbitrary data, but it also
+  // leaves the list unsorted.
+ void Add(SequenceNumber seqno, uint64_t time);
+
+  // Decode and add the entries to the current object. The list will be unsorted.
+ Status Add(const std::string& seqno_time_mapping_str);
+
+ // Return the number of entries
+ size_t Size() const { return seqno_time_mapping_.size(); }
+
+  // Reduce the size of the internal list
+ bool Resize(uint64_t min_time_duration, uint64_t max_time_duration);
+
+ // Override the max_time_duration_
+ void SetMaxTimeDuration(uint64_t max_time_duration) {
+ max_time_duration_ = max_time_duration;
+ }
+
+ uint64_t GetCapacity() const { return max_capacity_; }
+
+  // Sort the list and remove redundant or useless entries, making sure that
+  // both the seqnos and the times are sorted.
+ Status Sort();
+
+  // Copy the current object, starting from the given smallest_seqno.
+ SeqnoToTimeMapping Copy(SequenceNumber smallest_seqno) const;
+
+  // Returns true if the internal list is empty
+ bool Empty() const { return seqno_time_mapping_.empty(); }
+
+ // clear all entries
+ void Clear() { seqno_time_mapping_.clear(); }
+
+  // Return a human-readable string for user messages.
+  // Note: not efficient, but okay for printing.
+ std::string ToHumanString() const;
+
+#ifndef NDEBUG
+ const std::deque<SeqnoTimePair>& TEST_GetInternalMapping() const {
+ return seqno_time_mapping_;
+ }
+#endif
+
+ private:
+ static constexpr uint64_t kMaxSeqnoToTimeEntries =
+ kMaxSeqnoTimePairsPerCF * 10;
+
+ uint64_t max_time_duration_;
+ uint64_t max_capacity_;
+
+ std::deque<SeqnoTimePair> seqno_time_mapping_;
+
+ bool is_sorted_ = true;
+
+ static uint64_t CalculateMaxCapacity(uint64_t min_time_duration,
+ uint64_t max_time_duration);
+
+ SeqnoTimePair& Last() {
+ assert(!Empty());
+ return seqno_time_mapping_.back();
+ }
+};
+
+// For searching for a sequence number in a SeqnoToTimeMapping
+inline bool operator<(const SequenceNumber& seqno,
+ const SeqnoToTimeMapping::SeqnoTimePair& other) {
+ return seqno < other.seqno;
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/snapshot_checker.h b/src/rocksdb/db/snapshot_checker.h
new file mode 100644
index 000000000..0bfb1aa07
--- /dev/null
+++ b/src/rocksdb/db/snapshot_checker.h
@@ -0,0 +1,60 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#include "rocksdb/types.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+enum class SnapshotCheckerResult : int {
+ kInSnapshot = 0,
+ kNotInSnapshot = 1,
+ // In case snapshot is released and the checker has no clue whether
+ // the given sequence is visible to the snapshot.
+ kSnapshotReleased = 2,
+};
+
+// Callback class that controls GC of duplicate keys in flush/compaction.
+class SnapshotChecker {
+ public:
+ virtual ~SnapshotChecker() {}
+ virtual SnapshotCheckerResult CheckInSnapshot(
+ SequenceNumber sequence, SequenceNumber snapshot_sequence) const = 0;
+};
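+
+// Illustrative sketch (hypothetical caller code, not part of this header):
+// flush/compaction asks the checker whether a write at `value_seq` is visible
+// to a snapshot at `snapshot_seq` before deciding what it may garbage-collect.
+//
+//   SnapshotCheckerResult r =
+//       snapshot_checker->CheckInSnapshot(value_seq, snapshot_seq);
+//   if (r == SnapshotCheckerResult::kInSnapshot) {
+//     // visible to the snapshot
+//   } else if (r == SnapshotCheckerResult::kNotInSnapshot) {
+//     // not visible to the snapshot
+//   } else {
+//     // kSnapshotReleased: visibility unknown, be conservative
+//   }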
+
+class DisableGCSnapshotChecker : public SnapshotChecker {
+ public:
+ virtual ~DisableGCSnapshotChecker() {}
+ virtual SnapshotCheckerResult CheckInSnapshot(
+ SequenceNumber /*sequence*/,
+ SequenceNumber /*snapshot_sequence*/) const override {
+ // By returning kNotInSnapshot, we prevent all the values from being GCed
+ return SnapshotCheckerResult::kNotInSnapshot;
+ }
+ static DisableGCSnapshotChecker* Instance();
+
+ protected:
+ explicit DisableGCSnapshotChecker() {}
+};
+
+class WritePreparedTxnDB;
+
+// Callback class created by WritePreparedTxnDB to check if a key
+// is visible by a snapshot.
+class WritePreparedSnapshotChecker : public SnapshotChecker {
+ public:
+ explicit WritePreparedSnapshotChecker(WritePreparedTxnDB* txn_db);
+ virtual ~WritePreparedSnapshotChecker() {}
+
+ virtual SnapshotCheckerResult CheckInSnapshot(
+ SequenceNumber sequence, SequenceNumber snapshot_sequence) const override;
+
+ private:
+#ifndef ROCKSDB_LITE
+ const WritePreparedTxnDB* const txn_db_;
+#endif // !ROCKSDB_LITE
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/snapshot_impl.cc b/src/rocksdb/db/snapshot_impl.cc
new file mode 100644
index 000000000..98b475463
--- /dev/null
+++ b/src/rocksdb/db/snapshot_impl.cc
@@ -0,0 +1,25 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "rocksdb/db.h"
+#include "rocksdb/snapshot.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+ManagedSnapshot::ManagedSnapshot(DB* db)
+ : db_(db), snapshot_(db->GetSnapshot()) {}
+
+ManagedSnapshot::ManagedSnapshot(DB* db, const Snapshot* _snapshot)
+ : db_(db), snapshot_(_snapshot) {}
+
+ManagedSnapshot::~ManagedSnapshot() {
+ if (snapshot_) {
+ db_->ReleaseSnapshot(snapshot_);
+ }
+}
+
+const Snapshot* ManagedSnapshot::snapshot() { return snapshot_; }
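+
+// Example (illustrative; `db` is a hypothetical open DB instance): the
+// wrapper releases the snapshot automatically when it goes out of scope.
+//
+//   {
+//     ManagedSnapshot managed(db);
+//     ReadOptions read_options;
+//     read_options.snapshot = managed.snapshot();
+//     std::string value;
+//     Status s = db->Get(read_options, "key", &value);
+//   }  // ~ManagedSnapshot() calls db->ReleaseSnapshot()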
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/snapshot_impl.h b/src/rocksdb/db/snapshot_impl.h
new file mode 100644
index 000000000..23e5e98cd
--- /dev/null
+++ b/src/rocksdb/db/snapshot_impl.h
@@ -0,0 +1,239 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <vector>
+
+#include "db/dbformat.h"
+#include "rocksdb/db.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class SnapshotList;
+
+// Snapshots are kept in a doubly-linked list in the DB.
+// Each SnapshotImpl corresponds to a particular sequence number.
+class SnapshotImpl : public Snapshot {
+ public:
+ SequenceNumber number_; // const after creation
+ // It indicates the smallest uncommitted data at the time the snapshot was
+ // taken. This is currently used by WritePrepared transactions to limit the
+ // scope of queries to IsInSnapshot.
+ SequenceNumber min_uncommitted_ = kMinUnCommittedSeq;
+
+ SequenceNumber GetSequenceNumber() const override { return number_; }
+
+ int64_t GetUnixTime() const override { return unix_time_; }
+
+ uint64_t GetTimestamp() const override { return timestamp_; }
+
+ private:
+ friend class SnapshotList;
+
+ // SnapshotImpl is kept in a doubly-linked circular list
+ SnapshotImpl* prev_;
+ SnapshotImpl* next_;
+
+ SnapshotList* list_; // just for sanity checks
+
+ int64_t unix_time_;
+
+ uint64_t timestamp_;
+
+ // Will this snapshot be used by a Transaction to do write-conflict checking?
+ bool is_write_conflict_boundary_;
+};
+
+class SnapshotList {
+ public:
+ SnapshotList() {
+ list_.prev_ = &list_;
+ list_.next_ = &list_;
+ list_.number_ = 0xFFFFFFFFL; // placeholder marker, for debugging
+ // Set all the variables to make UBSAN happy.
+ list_.list_ = nullptr;
+ list_.unix_time_ = 0;
+ list_.timestamp_ = 0;
+ list_.is_write_conflict_boundary_ = false;
+ count_ = 0;
+ }
+
+ // No copy-construct.
+ SnapshotList(const SnapshotList&) = delete;
+
+ bool empty() const {
+ assert(list_.next_ != &list_ || 0 == count_);
+ return list_.next_ == &list_;
+ }
+ SnapshotImpl* oldest() const {
+ assert(!empty());
+ return list_.next_;
+ }
+ SnapshotImpl* newest() const {
+ assert(!empty());
+ return list_.prev_;
+ }
+
+ SnapshotImpl* New(SnapshotImpl* s, SequenceNumber seq, uint64_t unix_time,
+ bool is_write_conflict_boundary,
+ uint64_t ts = std::numeric_limits<uint64_t>::max()) {
+ s->number_ = seq;
+ s->unix_time_ = unix_time;
+ s->timestamp_ = ts;
+ s->is_write_conflict_boundary_ = is_write_conflict_boundary;
+ s->list_ = this;
+ s->next_ = &list_;
+ s->prev_ = list_.prev_;
+ s->prev_->next_ = s;
+ s->next_->prev_ = s;
+ count_++;
+ return s;
+ }
+
+  // Unlinks the snapshot from the list; not responsible for freeing the object.
+ void Delete(const SnapshotImpl* s) {
+ assert(s->list_ == this);
+ s->prev_->next_ = s->next_;
+ s->next_->prev_ = s->prev_;
+ count_--;
+ }
+
+ // retrieve all snapshot numbers up until max_seq. They are sorted in
+ // ascending order (with no duplicates).
+ std::vector<SequenceNumber> GetAll(
+ SequenceNumber* oldest_write_conflict_snapshot = nullptr,
+ const SequenceNumber& max_seq = kMaxSequenceNumber) const {
+ std::vector<SequenceNumber> ret;
+ GetAll(&ret, oldest_write_conflict_snapshot, max_seq);
+ return ret;
+ }
+
+ void GetAll(std::vector<SequenceNumber>* snap_vector,
+ SequenceNumber* oldest_write_conflict_snapshot = nullptr,
+ const SequenceNumber& max_seq = kMaxSequenceNumber) const {
+ std::vector<SequenceNumber>& ret = *snap_vector;
+ // So far we have no use case that would pass a non-empty vector
+ assert(ret.size() == 0);
+
+ if (oldest_write_conflict_snapshot != nullptr) {
+ *oldest_write_conflict_snapshot = kMaxSequenceNumber;
+ }
+
+ if (empty()) {
+ return;
+ }
+ const SnapshotImpl* s = &list_;
+ while (s->next_ != &list_) {
+ if (s->next_->number_ > max_seq) {
+ break;
+ }
+ // Avoid duplicates
+ if (ret.empty() || ret.back() != s->next_->number_) {
+ ret.push_back(s->next_->number_);
+ }
+
+ if (oldest_write_conflict_snapshot != nullptr &&
+ *oldest_write_conflict_snapshot == kMaxSequenceNumber &&
+ s->next_->is_write_conflict_boundary_) {
+ // If this is the first write-conflict boundary snapshot in the list,
+ // it is the oldest
+ *oldest_write_conflict_snapshot = s->next_->number_;
+ }
+
+ s = s->next_;
+ }
+ return;
+ }
+
+ // get the sequence number of the most recent snapshot
+ SequenceNumber GetNewest() {
+ if (empty()) {
+ return 0;
+ }
+ return newest()->number_;
+ }
+
+ int64_t GetOldestSnapshotTime() const {
+ if (empty()) {
+ return 0;
+ } else {
+ return oldest()->unix_time_;
+ }
+ }
+
+ int64_t GetOldestSnapshotSequence() const {
+ if (empty()) {
+ return 0;
+ } else {
+ return oldest()->GetSequenceNumber();
+ }
+ }
+
+ uint64_t count() const { return count_; }
+
+ private:
+ // Dummy head of doubly-linked list of snapshots
+ SnapshotImpl list_;
+ uint64_t count_;
+};
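+
+// Illustrative sketch (hypothetical, simplified from how the DB uses the
+// list): SnapshotList only links and unlinks SnapshotImpl objects; the caller
+// owns them and must free them after Delete().
+//
+//   SnapshotList snapshots;
+//   SnapshotImpl* s = new SnapshotImpl;
+//   snapshots.New(s, /*seq=*/100, /*unix_time=*/0,
+//                 /*is_write_conflict_boundary=*/false);
+//   std::vector<SequenceNumber> all = snapshots.GetAll();  // {100}
+//   snapshots.Delete(s);
+//   delete s;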
+
+// All operations on TimestampedSnapshotList must be protected by db mutex.
+class TimestampedSnapshotList {
+ public:
+ explicit TimestampedSnapshotList() = default;
+
+ std::shared_ptr<const SnapshotImpl> GetSnapshot(uint64_t ts) const {
+ if (ts == std::numeric_limits<uint64_t>::max() && !snapshots_.empty()) {
+ auto it = snapshots_.rbegin();
+ assert(it != snapshots_.rend());
+ return it->second;
+ }
+ auto it = snapshots_.find(ts);
+ if (it == snapshots_.end()) {
+ return std::shared_ptr<const SnapshotImpl>();
+ }
+ return it->second;
+ }
+
+ void GetSnapshots(
+ uint64_t ts_lb, uint64_t ts_ub,
+ std::vector<std::shared_ptr<const Snapshot>>& snapshots) const {
+ assert(ts_lb < ts_ub);
+ auto it_low = snapshots_.lower_bound(ts_lb);
+ auto it_high = snapshots_.lower_bound(ts_ub);
+ for (auto it = it_low; it != it_high; ++it) {
+ snapshots.emplace_back(it->second);
+ }
+ }
+
+ void AddSnapshot(const std::shared_ptr<const SnapshotImpl>& snapshot) {
+ assert(snapshot);
+ snapshots_.try_emplace(snapshot->GetTimestamp(), snapshot);
+ }
+
+  // snapshots_to_release: the container into which the timestamped snapshots
+  // will be moved, so that it retains the last reference to them and they are
+  // not actually released here (actual release requires the db mutex). The
+  // snapshots will be released by the caller of ReleaseSnapshotsOlderThan().
+ void ReleaseSnapshotsOlderThan(
+ uint64_t ts,
+ autovector<std::shared_ptr<const SnapshotImpl>>& snapshots_to_release) {
+ auto ub = snapshots_.lower_bound(ts);
+ for (auto it = snapshots_.begin(); it != ub; ++it) {
+ snapshots_to_release.emplace_back(it->second);
+ }
+ snapshots_.erase(snapshots_.begin(), ub);
+ }
+
+ private:
+ std::map<uint64_t, std::shared_ptr<const SnapshotImpl>> snapshots_;
+};
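+
+// Illustrative sketch (hypothetical caller code; per the class comment, the
+// db mutex protects this call): release all timestamped snapshots with
+// timestamp < 200. The shared_ptrs are handed back in `to_release` and the
+// caller is responsible for actually releasing them afterwards.
+//
+//   autovector<std::shared_ptr<const SnapshotImpl>> to_release;
+//   timestamped_snapshots.ReleaseSnapshotsOlderThan(/*ts=*/200, to_release);
+//   // `timestamped_snapshots` and `to_release` are hypothetical names.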
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/table_cache.cc b/src/rocksdb/db/table_cache.cc
new file mode 100644
index 000000000..c44c4bb84
--- /dev/null
+++ b/src/rocksdb/db/table_cache.cc
@@ -0,0 +1,753 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/table_cache.h"
+
+#include "db/dbformat.h"
+#include "db/range_tombstone_fragmenter.h"
+#include "db/snapshot_impl.h"
+#include "db/version_edit.h"
+#include "file/file_util.h"
+#include "file/filename.h"
+#include "file/random_access_file_reader.h"
+#include "monitoring/perf_context_imp.h"
+#include "rocksdb/advanced_options.h"
+#include "rocksdb/statistics.h"
+#include "table/block_based/block_based_table_reader.h"
+#include "table/get_context.h"
+#include "table/internal_iterator.h"
+#include "table/iterator_wrapper.h"
+#include "table/multiget_context.h"
+#include "table/table_builder.h"
+#include "table/table_reader.h"
+#include "test_util/sync_point.h"
+#include "util/cast_util.h"
+#include "util/coding.h"
+#include "util/stop_watch.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace {
+template <class T>
+static void DeleteEntry(const Slice& /*key*/, void* value) {
+ T* typed_value = reinterpret_cast<T*>(value);
+ delete typed_value;
+}
+} // anonymous namespace
+} // namespace ROCKSDB_NAMESPACE
+
+// Generate the regular and coroutine versions of some methods by
+// including table_cache_sync_and_async.h twice
+// Macros in the header will expand differently based on whether
+// WITH_COROUTINES or WITHOUT_COROUTINES is defined
+// clang-format off
+#define WITHOUT_COROUTINES
+#include "db/table_cache_sync_and_async.h"
+#undef WITHOUT_COROUTINES
+#define WITH_COROUTINES
+#include "db/table_cache_sync_and_async.h"
+#undef WITH_COROUTINES
+// clang-format on
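+// Net effect of the two includes above (an illustrative simplification; the
+// actual macro definitions live in util/coro_utils.h): the first pass, with
+// WITHOUT_COROUTINES defined, turns DEFINE_SYNC_AND_ASYNC(Status,
+// TableCache::MultiGet) into an ordinary synchronous definition returning
+// Status, with CO_AWAIT/CO_RETURN behaving like a plain call and return. The
+// second pass, with WITH_COROUTINES defined, emits a coroutine variant of the
+// same body that returns an awaitable and uses co_await/co_return at those
+// markers.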
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+
+static void UnrefEntry(void* arg1, void* arg2) {
+ Cache* cache = reinterpret_cast<Cache*>(arg1);
+ Cache::Handle* h = reinterpret_cast<Cache::Handle*>(arg2);
+ cache->Release(h);
+}
+
+static Slice GetSliceForFileNumber(const uint64_t* file_number) {
+ return Slice(reinterpret_cast<const char*>(file_number),
+ sizeof(*file_number));
+}
+
+#ifndef ROCKSDB_LITE
+
+void AppendVarint64(IterKey* key, uint64_t v) {
+ char buf[10];
+ auto ptr = EncodeVarint64(buf, v);
+ key->TrimAppend(key->Size(), buf, ptr - buf);
+}
+
+#endif // ROCKSDB_LITE
+
+} // anonymous namespace
+
+const int kLoadConcurency = 128;
+
+TableCache::TableCache(const ImmutableOptions& ioptions,
+ const FileOptions* file_options, Cache* const cache,
+ BlockCacheTracer* const block_cache_tracer,
+ const std::shared_ptr<IOTracer>& io_tracer,
+ const std::string& db_session_id)
+ : ioptions_(ioptions),
+ file_options_(*file_options),
+ cache_(cache),
+ immortal_tables_(false),
+ block_cache_tracer_(block_cache_tracer),
+ loader_mutex_(kLoadConcurency, kGetSliceNPHash64UnseededFnPtr),
+ io_tracer_(io_tracer),
+ db_session_id_(db_session_id) {
+ if (ioptions_.row_cache) {
+ // If the same cache is shared by multiple instances, we need to
+ // disambiguate its entries.
+ PutVarint64(&row_cache_id_, ioptions_.row_cache->NewId());
+ }
+}
+
+TableCache::~TableCache() {}
+
+TableReader* TableCache::GetTableReaderFromHandle(Cache::Handle* handle) {
+ return reinterpret_cast<TableReader*>(cache_->Value(handle));
+}
+
+void TableCache::ReleaseHandle(Cache::Handle* handle) {
+ cache_->Release(handle);
+}
+
+Status TableCache::GetTableReader(
+ const ReadOptions& ro, const FileOptions& file_options,
+ const InternalKeyComparator& internal_comparator,
+ const FileMetaData& file_meta, bool sequential_mode, bool record_read_stats,
+ HistogramImpl* file_read_hist, std::unique_ptr<TableReader>* table_reader,
+ const std::shared_ptr<const SliceTransform>& prefix_extractor,
+ bool skip_filters, int level, bool prefetch_index_and_filter_in_cache,
+ size_t max_file_size_for_l0_meta_pin, Temperature file_temperature) {
+ std::string fname = TableFileName(
+ ioptions_.cf_paths, file_meta.fd.GetNumber(), file_meta.fd.GetPathId());
+ std::unique_ptr<FSRandomAccessFile> file;
+ FileOptions fopts = file_options;
+ fopts.temperature = file_temperature;
+ Status s = PrepareIOFromReadOptions(ro, ioptions_.clock, fopts.io_options);
+ TEST_SYNC_POINT_CALLBACK("TableCache::GetTableReader:BeforeOpenFile",
+ const_cast<Status*>(&s));
+ if (s.ok()) {
+ s = ioptions_.fs->NewRandomAccessFile(fname, fopts, &file, nullptr);
+ }
+ if (s.ok()) {
+ RecordTick(ioptions_.stats, NO_FILE_OPENS);
+ } else if (s.IsPathNotFound()) {
+ fname = Rocks2LevelTableFileName(fname);
+ s = PrepareIOFromReadOptions(ro, ioptions_.clock, fopts.io_options);
+ if (s.ok()) {
+ s = ioptions_.fs->NewRandomAccessFile(fname, file_options, &file,
+ nullptr);
+ }
+ if (s.ok()) {
+ RecordTick(ioptions_.stats, NO_FILE_OPENS);
+ }
+ }
+
+ if (s.ok()) {
+ if (!sequential_mode && ioptions_.advise_random_on_open) {
+ file->Hint(FSRandomAccessFile::kRandom);
+ }
+ StopWatch sw(ioptions_.clock, ioptions_.stats, TABLE_OPEN_IO_MICROS);
+ std::unique_ptr<RandomAccessFileReader> file_reader(
+ new RandomAccessFileReader(
+ std::move(file), fname, ioptions_.clock, io_tracer_,
+ record_read_stats ? ioptions_.stats : nullptr, SST_READ_MICROS,
+ file_read_hist, ioptions_.rate_limiter.get(), ioptions_.listeners,
+ file_temperature, level == ioptions_.num_levels - 1));
+ UniqueId64x2 expected_unique_id;
+ if (ioptions_.verify_sst_unique_id_in_manifest) {
+ expected_unique_id = file_meta.unique_id;
+ } else {
+ expected_unique_id = kNullUniqueId64x2; // null ID == no verification
+ }
+ s = ioptions_.table_factory->NewTableReader(
+ ro,
+ TableReaderOptions(ioptions_, prefix_extractor, file_options,
+ internal_comparator, skip_filters, immortal_tables_,
+ false /* force_direct_prefetch */, level,
+ block_cache_tracer_, max_file_size_for_l0_meta_pin,
+ db_session_id_, file_meta.fd.GetNumber(),
+ expected_unique_id, file_meta.fd.largest_seqno),
+ std::move(file_reader), file_meta.fd.GetFileSize(), table_reader,
+ prefetch_index_and_filter_in_cache);
+ TEST_SYNC_POINT("TableCache::GetTableReader:0");
+ }
+ return s;
+}
+
+void TableCache::EraseHandle(const FileDescriptor& fd, Cache::Handle* handle) {
+ ReleaseHandle(handle);
+ uint64_t number = fd.GetNumber();
+ Slice key = GetSliceForFileNumber(&number);
+ cache_->Erase(key);
+}
+
+Status TableCache::FindTable(
+ const ReadOptions& ro, const FileOptions& file_options,
+ const InternalKeyComparator& internal_comparator,
+ const FileMetaData& file_meta, Cache::Handle** handle,
+ const std::shared_ptr<const SliceTransform>& prefix_extractor,
+ const bool no_io, bool record_read_stats, HistogramImpl* file_read_hist,
+ bool skip_filters, int level, bool prefetch_index_and_filter_in_cache,
+ size_t max_file_size_for_l0_meta_pin, Temperature file_temperature) {
+ PERF_TIMER_GUARD_WITH_CLOCK(find_table_nanos, ioptions_.clock);
+ uint64_t number = file_meta.fd.GetNumber();
+ Slice key = GetSliceForFileNumber(&number);
+ *handle = cache_->Lookup(key);
+ TEST_SYNC_POINT_CALLBACK("TableCache::FindTable:0",
+ const_cast<bool*>(&no_io));
+
+ if (*handle == nullptr) {
+ if (no_io) {
+ return Status::Incomplete("Table not found in table_cache, no_io is set");
+ }
+ MutexLock load_lock(loader_mutex_.get(key));
+ // We check the cache again under loading mutex
+ *handle = cache_->Lookup(key);
+ if (*handle != nullptr) {
+ return Status::OK();
+ }
+
+ std::unique_ptr<TableReader> table_reader;
+ Status s =
+ GetTableReader(ro, file_options, internal_comparator, file_meta,
+ false /* sequential mode */, record_read_stats,
+ file_read_hist, &table_reader, prefix_extractor,
+ skip_filters, level, prefetch_index_and_filter_in_cache,
+ max_file_size_for_l0_meta_pin, file_temperature);
+ if (!s.ok()) {
+ assert(table_reader == nullptr);
+ RecordTick(ioptions_.stats, NO_FILE_ERRORS);
+ // We do not cache error results so that if the error is transient,
+ // or somebody repairs the file, we recover automatically.
+ } else {
+ s = cache_->Insert(key, table_reader.get(), 1, &DeleteEntry<TableReader>,
+ handle);
+ if (s.ok()) {
+ // Release ownership of table reader.
+ table_reader.release();
+ }
+ }
+ return s;
+ }
+ return Status::OK();
+}
+
+InternalIterator* TableCache::NewIterator(
+ const ReadOptions& options, const FileOptions& file_options,
+ const InternalKeyComparator& icomparator, const FileMetaData& file_meta,
+ RangeDelAggregator* range_del_agg,
+ const std::shared_ptr<const SliceTransform>& prefix_extractor,
+ TableReader** table_reader_ptr, HistogramImpl* file_read_hist,
+ TableReaderCaller caller, Arena* arena, bool skip_filters, int level,
+ size_t max_file_size_for_l0_meta_pin,
+ const InternalKey* smallest_compaction_key,
+ const InternalKey* largest_compaction_key, bool allow_unprepared_value,
+ TruncatedRangeDelIterator** range_del_iter) {
+ PERF_TIMER_GUARD(new_table_iterator_nanos);
+
+ Status s;
+ TableReader* table_reader = nullptr;
+ Cache::Handle* handle = nullptr;
+ if (table_reader_ptr != nullptr) {
+ *table_reader_ptr = nullptr;
+ }
+ bool for_compaction = caller == TableReaderCaller::kCompaction;
+ auto& fd = file_meta.fd;
+ table_reader = fd.table_reader;
+ if (table_reader == nullptr) {
+ s = FindTable(
+ options, file_options, icomparator, file_meta, &handle,
+ prefix_extractor, options.read_tier == kBlockCacheTier /* no_io */,
+ !for_compaction /* record_read_stats */, file_read_hist, skip_filters,
+ level, true /* prefetch_index_and_filter_in_cache */,
+ max_file_size_for_l0_meta_pin, file_meta.temperature);
+ if (s.ok()) {
+ table_reader = GetTableReaderFromHandle(handle);
+ }
+ }
+ InternalIterator* result = nullptr;
+ if (s.ok()) {
+ if (options.table_filter &&
+ !options.table_filter(*table_reader->GetTableProperties())) {
+ result = NewEmptyInternalIterator<Slice>(arena);
+ } else {
+ result = table_reader->NewIterator(
+ options, prefix_extractor.get(), arena, skip_filters, caller,
+ file_options.compaction_readahead_size, allow_unprepared_value);
+ }
+ if (handle != nullptr) {
+ result->RegisterCleanup(&UnrefEntry, cache_, handle);
+ handle = nullptr; // prevent from releasing below
+ }
+
+ if (for_compaction) {
+ table_reader->SetupForCompaction();
+ }
+ if (table_reader_ptr != nullptr) {
+ *table_reader_ptr = table_reader;
+ }
+ }
+ if (s.ok() && !options.ignore_range_deletions) {
+ if (range_del_iter != nullptr) {
+ auto new_range_del_iter =
+ table_reader->NewRangeTombstoneIterator(options);
+ if (new_range_del_iter == nullptr || new_range_del_iter->empty()) {
+ delete new_range_del_iter;
+ *range_del_iter = nullptr;
+ } else {
+ *range_del_iter = new TruncatedRangeDelIterator(
+ std::unique_ptr<FragmentedRangeTombstoneIterator>(
+ new_range_del_iter),
+ &icomparator, &file_meta.smallest, &file_meta.largest);
+ }
+ }
+ if (range_del_agg != nullptr) {
+ if (range_del_agg->AddFile(fd.GetNumber())) {
+ std::unique_ptr<FragmentedRangeTombstoneIterator> new_range_del_iter(
+ static_cast<FragmentedRangeTombstoneIterator*>(
+ table_reader->NewRangeTombstoneIterator(options)));
+ if (new_range_del_iter != nullptr) {
+ s = new_range_del_iter->status();
+ }
+ if (s.ok()) {
+ const InternalKey* smallest = &file_meta.smallest;
+ const InternalKey* largest = &file_meta.largest;
+ if (smallest_compaction_key != nullptr) {
+ smallest = smallest_compaction_key;
+ }
+ if (largest_compaction_key != nullptr) {
+ largest = largest_compaction_key;
+ }
+ range_del_agg->AddTombstones(std::move(new_range_del_iter), smallest,
+ largest);
+ }
+ }
+ }
+ }
+
+ if (handle != nullptr) {
+ ReleaseHandle(handle);
+ }
+ if (!s.ok()) {
+ assert(result == nullptr);
+ result = NewErrorInternalIterator<Slice>(s, arena);
+ }
+ return result;
+}
+
+Status TableCache::GetRangeTombstoneIterator(
+ const ReadOptions& options,
+ const InternalKeyComparator& internal_comparator,
+ const FileMetaData& file_meta,
+ std::unique_ptr<FragmentedRangeTombstoneIterator>* out_iter) {
+ assert(out_iter);
+ const FileDescriptor& fd = file_meta.fd;
+ Status s;
+ TableReader* t = fd.table_reader;
+ Cache::Handle* handle = nullptr;
+ if (t == nullptr) {
+ s = FindTable(options, file_options_, internal_comparator, file_meta,
+ &handle);
+ if (s.ok()) {
+ t = GetTableReaderFromHandle(handle);
+ }
+ }
+ if (s.ok()) {
+ // Note: NewRangeTombstoneIterator could return nullptr
+ out_iter->reset(t->NewRangeTombstoneIterator(options));
+ }
+ if (handle) {
+ if (*out_iter) {
+ (*out_iter)->RegisterCleanup(&UnrefEntry, cache_, handle);
+ } else {
+ ReleaseHandle(handle);
+ }
+ }
+ return s;
+}
+
+#ifndef ROCKSDB_LITE
+void TableCache::CreateRowCacheKeyPrefix(const ReadOptions& options,
+ const FileDescriptor& fd,
+ const Slice& internal_key,
+ GetContext* get_context,
+ IterKey& row_cache_key) {
+ uint64_t fd_number = fd.GetNumber();
+ // We use the user key as cache key instead of the internal key,
+ // otherwise the whole cache would be invalidated every time the
+  // sequence number increases. However, to support caching snapshot
+ // reads, we append the sequence number (incremented by 1 to
+ // distinguish from 0) only in this case.
+ // If the snapshot is larger than the largest seqno in the file,
+ // all data should be exposed to the snapshot, so we treat it
+ // the same as there is no snapshot. The exception is that if
+ // a seq-checking callback is registered, some internal keys
+ // may still be filtered out.
+ uint64_t seq_no = 0;
+  // Maybe we can include the whole file if snapshot == fd.largest_seqno.
+ if (options.snapshot != nullptr &&
+ (get_context->has_callback() ||
+ static_cast_with_check<const SnapshotImpl>(options.snapshot)
+ ->GetSequenceNumber() <= fd.largest_seqno)) {
+    // We should consider using options.snapshot->GetSequenceNumber()
+    // instead of GetInternalKeySeqno(k), which would make the code
+    // easier to understand.
+ seq_no = 1 + GetInternalKeySeqno(internal_key);
+ }
+
+ // Compute row cache key.
+ row_cache_key.TrimAppend(row_cache_key.Size(), row_cache_id_.data(),
+ row_cache_id_.size());
+ AppendVarint64(&row_cache_key, fd_number);
+ AppendVarint64(&row_cache_key, seq_no);
+}
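+
+// Illustrative example of the resulting prefix (hypothetical values): with
+// row_cache_id_ = "\x01", fd_number = 7 and seq_no = 0, the prefix is
+// "\x01" + varint64(7) + varint64(0) = "\x01\x07\x00". GetFromRowCache()
+// later appends the user key at this offset to form the full row cache key.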
+
+bool TableCache::GetFromRowCache(const Slice& user_key, IterKey& row_cache_key,
+ size_t prefix_size, GetContext* get_context) {
+ bool found = false;
+
+ row_cache_key.TrimAppend(prefix_size, user_key.data(), user_key.size());
+ if (auto row_handle =
+ ioptions_.row_cache->Lookup(row_cache_key.GetUserKey())) {
+ // Cleanable routine to release the cache entry
+ Cleanable value_pinner;
+ auto release_cache_entry_func = [](void* cache_to_clean,
+ void* cache_handle) {
+ ((Cache*)cache_to_clean)->Release((Cache::Handle*)cache_handle);
+ };
+ auto found_row_cache_entry =
+ static_cast<const std::string*>(ioptions_.row_cache->Value(row_handle));
+    // If we get here, the value is located in the cache.
+    // found_row_cache_entry points to the value in the cache,
+    // and value_pinner has the cleanup procedure for the cached entry.
+ // After replayGetContextLog() returns, get_context.pinnable_slice_
+ // will point to cache entry buffer (or a copy based on that) and
+ // cleanup routine under value_pinner will be delegated to
+ // get_context.pinnable_slice_. Cache entry is released when
+ // get_context.pinnable_slice_ is reset.
+ value_pinner.RegisterCleanup(release_cache_entry_func,
+ ioptions_.row_cache.get(), row_handle);
+ replayGetContextLog(*found_row_cache_entry, user_key, get_context,
+ &value_pinner);
+ RecordTick(ioptions_.stats, ROW_CACHE_HIT);
+ found = true;
+ } else {
+ RecordTick(ioptions_.stats, ROW_CACHE_MISS);
+ }
+ return found;
+}
+#endif // ROCKSDB_LITE
+
+Status TableCache::Get(
+ const ReadOptions& options,
+ const InternalKeyComparator& internal_comparator,
+ const FileMetaData& file_meta, const Slice& k, GetContext* get_context,
+ const std::shared_ptr<const SliceTransform>& prefix_extractor,
+ HistogramImpl* file_read_hist, bool skip_filters, int level,
+ size_t max_file_size_for_l0_meta_pin) {
+ auto& fd = file_meta.fd;
+ std::string* row_cache_entry = nullptr;
+ bool done = false;
+#ifndef ROCKSDB_LITE
+ IterKey row_cache_key;
+ std::string row_cache_entry_buffer;
+
+ // Check row cache if enabled. Since row cache does not currently store
+ // sequence numbers, we cannot use it if we need to fetch the sequence.
+ if (ioptions_.row_cache && !get_context->NeedToReadSequence()) {
+ auto user_key = ExtractUserKey(k);
+ CreateRowCacheKeyPrefix(options, fd, k, get_context, row_cache_key);
+ done = GetFromRowCache(user_key, row_cache_key, row_cache_key.Size(),
+ get_context);
+ if (!done) {
+ row_cache_entry = &row_cache_entry_buffer;
+ }
+ }
+#endif // ROCKSDB_LITE
+ Status s;
+ TableReader* t = fd.table_reader;
+ Cache::Handle* handle = nullptr;
+ if (!done) {
+ assert(s.ok());
+ if (t == nullptr) {
+ s = FindTable(options, file_options_, internal_comparator, file_meta,
+ &handle, prefix_extractor,
+ options.read_tier == kBlockCacheTier /* no_io */,
+ true /* record_read_stats */, file_read_hist, skip_filters,
+ level, true /* prefetch_index_and_filter_in_cache */,
+ max_file_size_for_l0_meta_pin, file_meta.temperature);
+ if (s.ok()) {
+ t = GetTableReaderFromHandle(handle);
+ }
+ }
+ SequenceNumber* max_covering_tombstone_seq =
+ get_context->max_covering_tombstone_seq();
+ if (s.ok() && max_covering_tombstone_seq != nullptr &&
+ !options.ignore_range_deletions) {
+ std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter(
+ t->NewRangeTombstoneIterator(options));
+ if (range_del_iter != nullptr) {
+ SequenceNumber seq =
+ range_del_iter->MaxCoveringTombstoneSeqnum(ExtractUserKey(k));
+ if (seq > *max_covering_tombstone_seq) {
+ *max_covering_tombstone_seq = seq;
+ if (get_context->NeedTimestamp()) {
+ get_context->SetTimestampFromRangeTombstone(
+ range_del_iter->timestamp());
+ }
+ }
+ }
+ }
+ if (s.ok()) {
+ get_context->SetReplayLog(row_cache_entry); // nullptr if no cache.
+ s = t->Get(options, k, get_context, prefix_extractor.get(), skip_filters);
+ get_context->SetReplayLog(nullptr);
+ } else if (options.read_tier == kBlockCacheTier && s.IsIncomplete()) {
+ // Couldn't find Table in cache but treat as kFound if no_io set
+ get_context->MarkKeyMayExist();
+ s = Status::OK();
+ done = true;
+ }
+ }
+
+#ifndef ROCKSDB_LITE
+ // Put the replay log in row cache only if something was found.
+ if (!done && s.ok() && row_cache_entry && !row_cache_entry->empty()) {
+ size_t charge = row_cache_entry->capacity() + sizeof(std::string);
+ void* row_ptr = new std::string(std::move(*row_cache_entry));
+ // If row cache is full, it's OK to continue.
+ ioptions_.row_cache
+ ->Insert(row_cache_key.GetUserKey(), row_ptr, charge,
+ &DeleteEntry<std::string>)
+ .PermitUncheckedError();
+ }
+#endif // ROCKSDB_LITE
+
+ if (handle != nullptr) {
+ ReleaseHandle(handle);
+ }
+ return s;
+}
+
+void TableCache::UpdateRangeTombstoneSeqnums(
+ const ReadOptions& options, TableReader* t,
+ MultiGetContext::Range& table_range) {
+ std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter(
+ t->NewRangeTombstoneIterator(options));
+ if (range_del_iter != nullptr) {
+ for (auto iter = table_range.begin(); iter != table_range.end(); ++iter) {
+ SequenceNumber* max_covering_tombstone_seq =
+ iter->get_context->max_covering_tombstone_seq();
+ SequenceNumber seq =
+ range_del_iter->MaxCoveringTombstoneSeqnum(iter->ukey_with_ts);
+ if (seq > *max_covering_tombstone_seq) {
+ *max_covering_tombstone_seq = seq;
+ if (iter->get_context->NeedTimestamp()) {
+ iter->get_context->SetTimestampFromRangeTombstone(
+ range_del_iter->timestamp());
+ }
+ }
+ }
+ }
+}
+
+Status TableCache::MultiGetFilter(
+ const ReadOptions& options,
+ const InternalKeyComparator& internal_comparator,
+ const FileMetaData& file_meta,
+ const std::shared_ptr<const SliceTransform>& prefix_extractor,
+ HistogramImpl* file_read_hist, int level,
+ MultiGetContext::Range* mget_range, Cache::Handle** table_handle) {
+ auto& fd = file_meta.fd;
+#ifndef ROCKSDB_LITE
+ IterKey row_cache_key;
+ std::string row_cache_entry_buffer;
+
+ // Check if we need to use the row cache. If yes, then we cannot do the
+ // filtering here, since the filtering needs to happen after the row cache
+ // lookup.
+ KeyContext& first_key = *mget_range->begin();
+ if (ioptions_.row_cache && !first_key.get_context->NeedToReadSequence()) {
+ return Status::NotSupported();
+ }
+#endif // ROCKSDB_LITE
+ Status s;
+ TableReader* t = fd.table_reader;
+ Cache::Handle* handle = nullptr;
+ MultiGetContext::Range tombstone_range(*mget_range, mget_range->begin(),
+ mget_range->end());
+ if (t == nullptr) {
+ s = FindTable(
+ options, file_options_, internal_comparator, file_meta, &handle,
+ prefix_extractor, options.read_tier == kBlockCacheTier /* no_io */,
+ true /* record_read_stats */, file_read_hist, /*skip_filters=*/false,
+ level, true /* prefetch_index_and_filter_in_cache */,
+ /*max_file_size_for_l0_meta_pin=*/0, file_meta.temperature);
+ if (s.ok()) {
+ t = GetTableReaderFromHandle(handle);
+ }
+ *table_handle = handle;
+ }
+ if (s.ok()) {
+ s = t->MultiGetFilter(options, prefix_extractor.get(), mget_range);
+ }
+ if (s.ok() && !options.ignore_range_deletions) {
+ // Update the range tombstone sequence numbers for the keys here
+ // as TableCache::MultiGet may or may not be called, and even if it
+    // is, it may be called with fewer keys in the range due to filtering.
+ UpdateRangeTombstoneSeqnums(options, t, tombstone_range);
+ }
+ if (mget_range->empty() && handle) {
+ ReleaseHandle(handle);
+ *table_handle = nullptr;
+ }
+
+ return s;
+}
+
+Status TableCache::GetTableProperties(
+ const FileOptions& file_options,
+ const InternalKeyComparator& internal_comparator,
+ const FileMetaData& file_meta,
+ std::shared_ptr<const TableProperties>* properties,
+ const std::shared_ptr<const SliceTransform>& prefix_extractor, bool no_io) {
+ auto table_reader = file_meta.fd.table_reader;
+  // Has the table already been pre-loaded?
+ if (table_reader) {
+ *properties = table_reader->GetTableProperties();
+
+ return Status::OK();
+ }
+
+ Cache::Handle* table_handle = nullptr;
+ Status s = FindTable(ReadOptions(), file_options, internal_comparator,
+ file_meta, &table_handle, prefix_extractor, no_io);
+ if (!s.ok()) {
+ return s;
+ }
+ assert(table_handle);
+ auto table = GetTableReaderFromHandle(table_handle);
+ *properties = table->GetTableProperties();
+ ReleaseHandle(table_handle);
+ return s;
+}
+
+Status TableCache::ApproximateKeyAnchors(
+ const ReadOptions& ro, const InternalKeyComparator& internal_comparator,
+ const FileMetaData& file_meta, std::vector<TableReader::Anchor>& anchors) {
+ Status s;
+ TableReader* t = file_meta.fd.table_reader;
+ Cache::Handle* handle = nullptr;
+ if (t == nullptr) {
+ s = FindTable(ro, file_options_, internal_comparator, file_meta, &handle);
+ if (s.ok()) {
+ t = GetTableReaderFromHandle(handle);
+ }
+ }
+ if (s.ok() && t != nullptr) {
+ s = t->ApproximateKeyAnchors(ro, anchors);
+ }
+ if (handle != nullptr) {
+ ReleaseHandle(handle);
+ }
+ return s;
+}
+
+size_t TableCache::GetMemoryUsageByTableReader(
+ const FileOptions& file_options,
+ const InternalKeyComparator& internal_comparator,
+ const FileMetaData& file_meta,
+ const std::shared_ptr<const SliceTransform>& prefix_extractor) {
+ auto table_reader = file_meta.fd.table_reader;
+  // Has the table already been pre-loaded?
+ if (table_reader) {
+ return table_reader->ApproximateMemoryUsage();
+ }
+
+ Cache::Handle* table_handle = nullptr;
+ Status s = FindTable(ReadOptions(), file_options, internal_comparator,
+ file_meta, &table_handle, prefix_extractor, true);
+ if (!s.ok()) {
+ return 0;
+ }
+ assert(table_handle);
+ auto table = GetTableReaderFromHandle(table_handle);
+ auto ret = table->ApproximateMemoryUsage();
+ ReleaseHandle(table_handle);
+ return ret;
+}
+
+bool TableCache::HasEntry(Cache* cache, uint64_t file_number) {
+ Cache::Handle* handle = cache->Lookup(GetSliceForFileNumber(&file_number));
+ if (handle) {
+ cache->Release(handle);
+ return true;
+ } else {
+ return false;
+ }
+}
+
+void TableCache::Evict(Cache* cache, uint64_t file_number) {
+ cache->Erase(GetSliceForFileNumber(&file_number));
+}
+
+uint64_t TableCache::ApproximateOffsetOf(
+ const Slice& key, const FileMetaData& file_meta, TableReaderCaller caller,
+ const InternalKeyComparator& internal_comparator,
+ const std::shared_ptr<const SliceTransform>& prefix_extractor) {
+ uint64_t result = 0;
+ TableReader* table_reader = file_meta.fd.table_reader;
+ Cache::Handle* table_handle = nullptr;
+ if (table_reader == nullptr) {
+ const bool for_compaction = (caller == TableReaderCaller::kCompaction);
+ Status s =
+ FindTable(ReadOptions(), file_options_, internal_comparator, file_meta,
+ &table_handle, prefix_extractor, false /* no_io */,
+ !for_compaction /* record_read_stats */);
+ if (s.ok()) {
+ table_reader = GetTableReaderFromHandle(table_handle);
+ }
+ }
+
+ if (table_reader != nullptr) {
+ result = table_reader->ApproximateOffsetOf(key, caller);
+ }
+ if (table_handle != nullptr) {
+ ReleaseHandle(table_handle);
+ }
+
+ return result;
+}
+
+uint64_t TableCache::ApproximateSize(
+ const Slice& start, const Slice& end, const FileMetaData& file_meta,
+ TableReaderCaller caller, const InternalKeyComparator& internal_comparator,
+ const std::shared_ptr<const SliceTransform>& prefix_extractor) {
+ uint64_t result = 0;
+ TableReader* table_reader = file_meta.fd.table_reader;
+ Cache::Handle* table_handle = nullptr;
+ if (table_reader == nullptr) {
+ const bool for_compaction = (caller == TableReaderCaller::kCompaction);
+ Status s =
+ FindTable(ReadOptions(), file_options_, internal_comparator, file_meta,
+ &table_handle, prefix_extractor, false /* no_io */,
+ !for_compaction /* record_read_stats */);
+ if (s.ok()) {
+ table_reader = GetTableReaderFromHandle(table_handle);
+ }
+ }
+
+ if (table_reader != nullptr) {
+ result = table_reader->ApproximateSize(start, end, caller);
+ }
+ if (table_handle != nullptr) {
+ ReleaseHandle(table_handle);
+ }
+
+ return result;
+}
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/table_cache.h b/src/rocksdb/db/table_cache.h
new file mode 100644
index 000000000..2e50f2c77
--- /dev/null
+++ b/src/rocksdb/db/table_cache.h
@@ -0,0 +1,275 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Thread-safe (provides internal synchronization)
+
+#pragma once
+#include <cstdint>
+#include <string>
+#include <vector>
+
+#include "db/dbformat.h"
+#include "db/range_del_aggregator.h"
+#include "options/cf_options.h"
+#include "port/port.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/env.h"
+#include "rocksdb/options.h"
+#include "rocksdb/table.h"
+#include "table/table_reader.h"
+#include "trace_replay/block_cache_tracer.h"
+#include "util/coro_utils.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Env;
+class Arena;
+struct FileDescriptor;
+class GetContext;
+class HistogramImpl;
+
+// Manages caching for TableReader objects for a column family. The actual
+// cache is allocated separately and passed to the constructor. TableCache
+// wraps around the underlying SST file readers by providing Get(),
+// MultiGet() and NewIterator() methods that hide the instantiation,
+// caching and access to the TableReader. The main purpose of this is
+// performance - by caching the TableReader, it avoids unnecessary file opens
+// and object allocation and instantiation. One exception is compaction, where
+// a new TableReader may be instantiated - see the NewIterator() comments.
+//
+// Another service provided by TableCache is managing the row cache - if the
+// DB is configured with a row cache, and the lookup key is present in the row
+// cache, lookup is very fast. The row cache is obtained from
+// ioptions.row_cache.
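+//
+// Illustrative sketch of the low-level lookup flow (hypothetical caller code;
+// the Get()/MultiGet()/NewIterator() wrappers below do this internally):
+//
+//   Cache::Handle* handle = nullptr;
+//   Status s = table_cache->FindTable(ReadOptions(), file_options,
+//                                     internal_comparator, file_meta, &handle);
+//   if (s.ok()) {
+//     TableReader* reader = table_cache->GetTableReaderFromHandle(handle);
+//     // ... read from the table via `reader` ...
+//     table_cache->ReleaseHandle(handle);
+//   }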
+class TableCache {
+ public:
+ TableCache(const ImmutableOptions& ioptions,
+ const FileOptions* storage_options, Cache* cache,
+ BlockCacheTracer* const block_cache_tracer,
+ const std::shared_ptr<IOTracer>& io_tracer,
+ const std::string& db_session_id);
+ ~TableCache();
+
+ // Return an iterator for the specified file number (the corresponding
+ // file length must be exactly "file_size" bytes). If "table_reader_ptr"
+ // is non-nullptr, also sets "*table_reader_ptr" to point to the Table object
+ // underlying the returned iterator, or nullptr if no Table object underlies
+ // the returned iterator. The returned "*table_reader_ptr" object is owned
+ // by the cache and should not be deleted, and is valid for as long as the
+ // returned iterator is live.
+ // If !options.ignore_range_deletions, and range_del_iter is non-nullptr,
+ // then range_del_iter is set to a TruncatedRangeDelIterator for range
+ // tombstones in the SST file corresponding to the specified file number. The
+ // upper/lower bounds for the TruncatedRangeDelIterator are set to the SST
+ // file's boundary.
+ // @param options Must outlive the returned iterator.
+ // @param range_del_agg If non-nullptr, adds range deletions to the
+ // aggregator. If an error occurs, returns it in a NewErrorInternalIterator
+ // @param for_compaction If true, a new TableReader may be allocated (but
+ // not cached), depending on the CF options
+ // @param skip_filters Disables loading/accessing the filter block
+ // @param level The level this table is at, -1 for "not set / don't know"
+ InternalIterator* NewIterator(
+ const ReadOptions& options, const FileOptions& toptions,
+ const InternalKeyComparator& internal_comparator,
+ const FileMetaData& file_meta, RangeDelAggregator* range_del_agg,
+ const std::shared_ptr<const SliceTransform>& prefix_extractor,
+ TableReader** table_reader_ptr, HistogramImpl* file_read_hist,
+ TableReaderCaller caller, Arena* arena, bool skip_filters, int level,
+ size_t max_file_size_for_l0_meta_pin,
+ const InternalKey* smallest_compaction_key,
+ const InternalKey* largest_compaction_key, bool allow_unprepared_value,
+ TruncatedRangeDelIterator** range_del_iter = nullptr);
+
+ // If a seek to internal key "k" in specified file finds an entry,
+ // call get_context->SaveValue() repeatedly until
+ // it returns false. As a side effect, it will insert the TableReader
+ // into the cache and potentially evict another entry
+ // @param get_context Context for get operation. The result of the lookup
+ // can be retrieved by calling get_context->State()
+ // @param file_read_hist If non-nullptr, the file reader statistics are
+ // recorded
+ // @param skip_filters Disables loading/accessing the filter block
+ // @param level The level this table is at, -1 for "not set / don't know"
+ Status Get(
+ const ReadOptions& options,
+ const InternalKeyComparator& internal_comparator,
+ const FileMetaData& file_meta, const Slice& k, GetContext* get_context,
+ const std::shared_ptr<const SliceTransform>& prefix_extractor = nullptr,
+ HistogramImpl* file_read_hist = nullptr, bool skip_filters = false,
+ int level = -1, size_t max_file_size_for_l0_meta_pin = 0);
+
+ // Return the range delete tombstone iterator of the file specified by
+ // `file_meta`.
+ Status GetRangeTombstoneIterator(
+ const ReadOptions& options,
+ const InternalKeyComparator& internal_comparator,
+ const FileMetaData& file_meta,
+ std::unique_ptr<FragmentedRangeTombstoneIterator>* out_iter);
+
+ // Call table reader's MultiGetFilter to use the bloom filter to filter out
+ // keys. Returns Status::NotSupported() if row cache needs to be checked.
+ // If the table cache is looked up to get the table reader, the cache handle
+ // is returned in table_handle. This handle should be passed back to
+ // MultiGet() so it can be released.
+ Status MultiGetFilter(
+ const ReadOptions& options,
+ const InternalKeyComparator& internal_comparator,
+ const FileMetaData& file_meta,
+ const std::shared_ptr<const SliceTransform>& prefix_extractor,
+ HistogramImpl* file_read_hist, int level,
+ MultiGetContext::Range* mget_range, Cache::Handle** table_handle);
+
+ // If a seek to internal key "k" in specified file finds an entry,
+ // call get_context->SaveValue() repeatedly until
+ // it returns false. As a side effect, it will insert the TableReader
+ // into the cache and potentially evict another entry
+ // @param mget_range Pointer to the structure describing a batch of keys to
+ // be looked up in this table file. The result is stored
+ // in the embedded GetContext
+ // @param skip_filters Disables loading/accessing the filter block
+ // @param level The level this table is at, -1 for "not set / don't know"
+ DECLARE_SYNC_AND_ASYNC(
+ Status, MultiGet, const ReadOptions& options,
+ const InternalKeyComparator& internal_comparator,
+ const FileMetaData& file_meta, const MultiGetContext::Range* mget_range,
+ const std::shared_ptr<const SliceTransform>& prefix_extractor = nullptr,
+ HistogramImpl* file_read_hist = nullptr, bool skip_filters = false,
+ bool skip_range_deletions = false, int level = -1,
+ Cache::Handle* table_handle = nullptr);
+
+ // Evict any entry for the specified file number
+ static void Evict(Cache* cache, uint64_t file_number);
+
+ // Query whether specified file number is currently in cache
+ static bool HasEntry(Cache* cache, uint64_t file_number);
+
+  // Release the table handle and erase its entry from the table cache.
+  // Used on DB close, or when the file is no longer live.
+ void EraseHandle(const FileDescriptor& fd, Cache::Handle* handle);
+
+ // Find table reader
+ // @param skip_filters Disables loading/accessing the filter block
+ // @param level == -1 means not specified
+ Status FindTable(
+ const ReadOptions& ro, const FileOptions& toptions,
+ const InternalKeyComparator& internal_comparator,
+ const FileMetaData& file_meta, Cache::Handle**,
+ const std::shared_ptr<const SliceTransform>& prefix_extractor = nullptr,
+ const bool no_io = false, bool record_read_stats = true,
+ HistogramImpl* file_read_hist = nullptr, bool skip_filters = false,
+ int level = -1, bool prefetch_index_and_filter_in_cache = true,
+ size_t max_file_size_for_l0_meta_pin = 0,
+ Temperature file_temperature = Temperature::kUnknown);
+
+ // Get TableReader from a cache handle.
+ TableReader* GetTableReaderFromHandle(Cache::Handle* handle);
+
+ // Get the table properties of a given table.
+  // @no_io: if true, the table will not be loaded into the cache if it is not
+  // already present there.
+  // @returns: `properties` will be reset on success. Please note that we will
+  // return Status::Incomplete() if the table is not present in the cache and
+  // `no_io` is set to true.
+ Status GetTableProperties(
+ const FileOptions& toptions,
+ const InternalKeyComparator& internal_comparator,
+ const FileMetaData& file_meta,
+ std::shared_ptr<const TableProperties>* properties,
+ const std::shared_ptr<const SliceTransform>& prefix_extractor = nullptr,
+ bool no_io = false);
+
+ Status ApproximateKeyAnchors(const ReadOptions& ro,
+ const InternalKeyComparator& internal_comparator,
+ const FileMetaData& file_meta,
+ std::vector<TableReader::Anchor>& anchors);
+
+ // Return total memory usage of the table reader of the file.
+ // 0 if table reader of the file is not loaded.
+ size_t GetMemoryUsageByTableReader(
+ const FileOptions& toptions,
+ const InternalKeyComparator& internal_comparator,
+ const FileMetaData& file_meta,
+ const std::shared_ptr<const SliceTransform>& prefix_extractor = nullptr);
+
+ // Returns approximated offset of a key in a file represented by fd.
+ uint64_t ApproximateOffsetOf(
+ const Slice& key, const FileMetaData& file_meta, TableReaderCaller caller,
+ const InternalKeyComparator& internal_comparator,
+ const std::shared_ptr<const SliceTransform>& prefix_extractor = nullptr);
+
+ // Returns approximated data size between start and end keys in a file
+ // represented by fd (the start key must not be greater than the end key).
+ uint64_t ApproximateSize(
+ const Slice& start, const Slice& end, const FileMetaData& file_meta,
+ TableReaderCaller caller,
+ const InternalKeyComparator& internal_comparator,
+ const std::shared_ptr<const SliceTransform>& prefix_extractor = nullptr);
+
+ // Release the handle from a cache
+ void ReleaseHandle(Cache::Handle* handle);
+
+ Cache* get_cache() const { return cache_; }
+
+ // Capacity of the backing Cache that indicates infinite TableCache capacity.
+ // For example when max_open_files is -1 we set the backing Cache to this.
+ static const int kInfiniteCapacity = 0x400000;
+
+ // The tables opened with this TableCache will be immortal, i.e., their
+ // lifetime is as long as that of the DB.
+ void SetTablesAreImmortal() {
+ if (cache_->GetCapacity() >= kInfiniteCapacity) {
+ immortal_tables_ = true;
+ }
+ }
+
+ private:
+ // Build a table reader
+ Status GetTableReader(
+ const ReadOptions& ro, const FileOptions& file_options,
+ const InternalKeyComparator& internal_comparator,
+ const FileMetaData& file_meta, bool sequential_mode,
+ bool record_read_stats, HistogramImpl* file_read_hist,
+ std::unique_ptr<TableReader>* table_reader,
+ const std::shared_ptr<const SliceTransform>& prefix_extractor = nullptr,
+ bool skip_filters = false, int level = -1,
+ bool prefetch_index_and_filter_in_cache = true,
+ size_t max_file_size_for_l0_meta_pin = 0,
+ Temperature file_temperature = Temperature::kUnknown);
+
+ // Update the max_covering_tombstone_seq in the GetContext for each key based
+ // on the range deletions in the table
+ void UpdateRangeTombstoneSeqnums(const ReadOptions& options, TableReader* t,
+ MultiGetContext::Range& table_range);
+
+ // Create a key prefix for looking up the row cache. The prefix is of the
+ // format row_cache_id + fd_number + seq_no. Later, the user key can be
+ // appended to form the full key
+ void CreateRowCacheKeyPrefix(const ReadOptions& options,
+ const FileDescriptor& fd,
+ const Slice& internal_key,
+ GetContext* get_context, IterKey& row_cache_key);
+
+ // Helper function to lookup the row cache for a key. It appends the
+ // user key to row_cache_key at offset prefix_size
+ bool GetFromRowCache(const Slice& user_key, IterKey& row_cache_key,
+ size_t prefix_size, GetContext* get_context);
+
+ const ImmutableOptions& ioptions_;
+ const FileOptions& file_options_;
+ Cache* const cache_;
+ std::string row_cache_id_;
+ bool immortal_tables_;
+ BlockCacheTracer* const block_cache_tracer_;
+ Striped<port::Mutex, Slice> loader_mutex_;
+ std::shared_ptr<IOTracer> io_tracer_;
+ std::string db_session_id_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/table_cache_sync_and_async.h b/src/rocksdb/db/table_cache_sync_and_async.h
new file mode 100644
index 000000000..e72abdd45
--- /dev/null
+++ b/src/rocksdb/db/table_cache_sync_and_async.h
@@ -0,0 +1,135 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "util/coro_utils.h"
+
+#if defined(WITHOUT_COROUTINES) || \
+ (defined(USE_COROUTINES) && defined(WITH_COROUTINES))
+namespace ROCKSDB_NAMESPACE {
+
+#if defined(WITHOUT_COROUTINES)
+#endif
+
+// Batched version of TableCache::MultiGet.
+DEFINE_SYNC_AND_ASYNC(Status, TableCache::MultiGet)
+(const ReadOptions& options, const InternalKeyComparator& internal_comparator,
+ const FileMetaData& file_meta, const MultiGetContext::Range* mget_range,
+ const std::shared_ptr<const SliceTransform>& prefix_extractor,
+ HistogramImpl* file_read_hist, bool skip_filters, bool skip_range_deletions,
+ int level, Cache::Handle* table_handle) {
+ auto& fd = file_meta.fd;
+ Status s;
+ TableReader* t = fd.table_reader;
+ Cache::Handle* handle = table_handle;
+ MultiGetRange table_range(*mget_range, mget_range->begin(),
+ mget_range->end());
+ if (handle != nullptr && t == nullptr) {
+ t = GetTableReaderFromHandle(handle);
+ }
+#ifndef ROCKSDB_LITE
+ autovector<std::string, MultiGetContext::MAX_BATCH_SIZE> row_cache_entries;
+ IterKey row_cache_key;
+ size_t row_cache_key_prefix_size = 0;
+ KeyContext& first_key = *table_range.begin();
+ bool lookup_row_cache =
+ ioptions_.row_cache && !first_key.get_context->NeedToReadSequence();
+
+ // Check row cache if enabled. Since row cache does not currently store
+ // sequence numbers, we cannot use it if we need to fetch the sequence.
+ if (lookup_row_cache) {
+ GetContext* first_context = first_key.get_context;
+ CreateRowCacheKeyPrefix(options, fd, first_key.ikey, first_context,
+ row_cache_key);
+ row_cache_key_prefix_size = row_cache_key.Size();
+
+ for (auto miter = table_range.begin(); miter != table_range.end();
+ ++miter) {
+ const Slice& user_key = miter->ukey_with_ts;
+
+ GetContext* get_context = miter->get_context;
+
+ if (GetFromRowCache(user_key, row_cache_key, row_cache_key_prefix_size,
+ get_context)) {
+ table_range.SkipKey(miter);
+ } else {
+ row_cache_entries.emplace_back();
+ get_context->SetReplayLog(&(row_cache_entries.back()));
+ }
+ }
+ }
+#endif // ROCKSDB_LITE
+
+  // Check that table_range is not empty. It's possible that all keys were
+  // found in the row cache and thus the range may now be empty.
+ if (s.ok() && !table_range.empty()) {
+ if (t == nullptr) {
+ assert(handle == nullptr);
+ s = FindTable(options, file_options_, internal_comparator, file_meta,
+ &handle, prefix_extractor,
+ options.read_tier == kBlockCacheTier /* no_io */,
+ true /* record_read_stats */, file_read_hist, skip_filters,
+ level, true /* prefetch_index_and_filter_in_cache */,
+ 0 /*max_file_size_for_l0_meta_pin*/, file_meta.temperature);
+ TEST_SYNC_POINT_CALLBACK("TableCache::MultiGet:FindTable", &s);
+ if (s.ok()) {
+ t = GetTableReaderFromHandle(handle);
+ assert(t);
+ }
+ }
+ if (s.ok() && !options.ignore_range_deletions && !skip_range_deletions) {
+ UpdateRangeTombstoneSeqnums(options, t, table_range);
+ }
+ if (s.ok()) {
+ CO_AWAIT(t->MultiGet)
+ (options, &table_range, prefix_extractor.get(), skip_filters);
+ } else if (options.read_tier == kBlockCacheTier && s.IsIncomplete()) {
+ for (auto iter = table_range.begin(); iter != table_range.end(); ++iter) {
+ Status* status = iter->s;
+ if (status->IsIncomplete()) {
+ // Couldn't find Table in cache but treat as kFound if no_io set
+ iter->get_context->MarkKeyMayExist();
+ s = Status::OK();
+ }
+ }
+ }
+ }
+
+#ifndef ROCKSDB_LITE
+ if (lookup_row_cache) {
+ size_t row_idx = 0;
+
+ for (auto miter = table_range.begin(); miter != table_range.end();
+ ++miter) {
+ std::string& row_cache_entry = row_cache_entries[row_idx++];
+ const Slice& user_key = miter->ukey_with_ts;
+ GetContext* get_context = miter->get_context;
+
+ get_context->SetReplayLog(nullptr);
+ // Compute row cache key.
+ row_cache_key.TrimAppend(row_cache_key_prefix_size, user_key.data(),
+ user_key.size());
+ // Put the replay log in row cache only if something was found.
+ if (s.ok() && !row_cache_entry.empty()) {
+ size_t charge = row_cache_entry.capacity() + sizeof(std::string);
+ void* row_ptr = new std::string(std::move(row_cache_entry));
+ // If row cache is full, it's OK.
+ ioptions_.row_cache
+ ->Insert(row_cache_key.GetUserKey(), row_ptr, charge,
+ &DeleteEntry<std::string>)
+ .PermitUncheckedError();
+ }
+ }
+ }
+#endif // ROCKSDB_LITE
+
+ if (handle != nullptr) {
+ ReleaseHandle(handle);
+ }
+ CO_RETURN s;
+}
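+
+// Note on the structure above (descriptive only): via the macros in
+// util/coro_utils.h, this single body is meant to be compiled in two flavors:
+// a plain synchronous TableCache::MultiGet and, when coroutine support is
+// built in, an awaitable variant. CO_AWAIT and CO_RETURN expand to either
+// direct calls/returns or their coroutine counterparts depending on which
+// flavor is being generated.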
+} // namespace ROCKSDB_NAMESPACE
+#endif
diff --git a/src/rocksdb/db/table_properties_collector.cc b/src/rocksdb/db/table_properties_collector.cc
new file mode 100644
index 000000000..edb9a1b63
--- /dev/null
+++ b/src/rocksdb/db/table_properties_collector.cc
@@ -0,0 +1,74 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/table_properties_collector.h"
+
+#include "db/dbformat.h"
+#include "util/coding.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+
+uint64_t GetUint64Property(const UserCollectedProperties& props,
+ const std::string& property_name,
+ bool* property_present) {
+ auto pos = props.find(property_name);
+ if (pos == props.end()) {
+ *property_present = false;
+ return 0;
+ }
+ Slice raw = pos->second;
+ uint64_t val = 0;
+ *property_present = true;
+ return GetVarint64(&raw, &val) ? val : 0;
+}
+
+} // anonymous namespace
+
+Status UserKeyTablePropertiesCollector::InternalAdd(const Slice& key,
+ const Slice& value,
+ uint64_t file_size) {
+ ParsedInternalKey ikey;
+ Status s = ParseInternalKey(key, &ikey, false /* log_err_key */); // TODO
+ if (!s.ok()) {
+ return s;
+ }
+
+ return collector_->AddUserKey(ikey.user_key, value, GetEntryType(ikey.type),
+ ikey.sequence, file_size);
+}
+
+void UserKeyTablePropertiesCollector::BlockAdd(
+ uint64_t block_uncomp_bytes, uint64_t block_compressed_bytes_fast,
+ uint64_t block_compressed_bytes_slow) {
+ return collector_->BlockAdd(block_uncomp_bytes, block_compressed_bytes_fast,
+ block_compressed_bytes_slow);
+}
+
+Status UserKeyTablePropertiesCollector::Finish(
+ UserCollectedProperties* properties) {
+ return collector_->Finish(properties);
+}
+
+UserCollectedProperties UserKeyTablePropertiesCollector::GetReadableProperties()
+ const {
+ return collector_->GetReadableProperties();
+}
+
+uint64_t GetDeletedKeys(const UserCollectedProperties& props) {
+ bool property_present_ignored;
+ return GetUint64Property(props, TablePropertiesNames::kDeletedKeys,
+ &property_present_ignored);
+}
+
+uint64_t GetMergeOperands(const UserCollectedProperties& props,
+ bool* property_present) {
+ return GetUint64Property(props, TablePropertiesNames::kMergeOperands,
+ property_present);
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/table_properties_collector.h b/src/rocksdb/db/table_properties_collector.h
new file mode 100644
index 000000000..9035ba793
--- /dev/null
+++ b/src/rocksdb/db/table_properties_collector.h
@@ -0,0 +1,175 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// This file defines a collection of statistics collectors.
+#pragma once
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "db/dbformat.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/table_properties.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Base class for internal table properties collector.
+class IntTblPropCollector {
+ public:
+ virtual ~IntTblPropCollector() {}
+ virtual Status Finish(UserCollectedProperties* properties) = 0;
+
+ virtual const char* Name() const = 0;
+
+ // @param key the user key that is inserted into the table.
+ // @param value the value that is inserted into the table.
+ virtual Status InternalAdd(const Slice& key, const Slice& value,
+ uint64_t file_size) = 0;
+
+ virtual void BlockAdd(uint64_t block_uncomp_bytes,
+ uint64_t block_compressed_bytes_fast,
+ uint64_t block_compressed_bytes_slow) = 0;
+
+ virtual UserCollectedProperties GetReadableProperties() const = 0;
+
+ virtual bool NeedCompact() const { return false; }
+};
+
+// Factory for internal table properties collector.
+class IntTblPropCollectorFactory {
+ public:
+ virtual ~IntTblPropCollectorFactory() {}
+ // has to be thread-safe
+ virtual IntTblPropCollector* CreateIntTblPropCollector(
+ uint32_t column_family_id, int level_at_creation) = 0;
+
+ // The name of the properties collector can be used for debugging purposes.
+ virtual const char* Name() const = 0;
+};
+
+using IntTblPropCollectorFactories =
+ std::vector<std::unique_ptr<IntTblPropCollectorFactory>>;
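+
+// Typical lifecycle (illustrative; the actual call sites live in the table
+// builder code, not in this file): for each table file being built, every
+// factory in an IntTblPropCollectorFactories vector produces one collector
+// via CreateIntTblPropCollector(column_family_id, level_at_creation); the
+// builder then calls InternalAdd() once per entry and BlockAdd() once per
+// data block, and finally Finish() to emit the collected properties into the
+// table's properties block.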
+
+// When rocksdb creates a new table, it will encode all "user keys" into
+// "internal keys", which contain meta information about a given entry.
+//
+// This class extracts the user key from the encoded internal key when Add()
+// is invoked.
+class UserKeyTablePropertiesCollector : public IntTblPropCollector {
+ public:
+ // transfer of ownership
+ explicit UserKeyTablePropertiesCollector(TablePropertiesCollector* collector)
+ : collector_(collector) {}
+
+ virtual ~UserKeyTablePropertiesCollector() {}
+
+ virtual Status InternalAdd(const Slice& key, const Slice& value,
+ uint64_t file_size) override;
+
+ virtual void BlockAdd(uint64_t block_uncomp_bytes,
+ uint64_t block_compressed_bytes_fast,
+ uint64_t block_compressed_bytes_slow) override;
+
+ virtual Status Finish(UserCollectedProperties* properties) override;
+
+ virtual const char* Name() const override { return collector_->Name(); }
+
+ UserCollectedProperties GetReadableProperties() const override;
+
+ virtual bool NeedCompact() const override {
+ return collector_->NeedCompact();
+ }
+
+ protected:
+ std::unique_ptr<TablePropertiesCollector> collector_;
+};
+
+class UserKeyTablePropertiesCollectorFactory
+ : public IntTblPropCollectorFactory {
+ public:
+ explicit UserKeyTablePropertiesCollectorFactory(
+ std::shared_ptr<TablePropertiesCollectorFactory> user_collector_factory)
+ : user_collector_factory_(user_collector_factory) {}
+ virtual IntTblPropCollector* CreateIntTblPropCollector(
+ uint32_t column_family_id, int level_at_creation) override {
+ TablePropertiesCollectorFactory::Context context;
+ context.column_family_id = column_family_id;
+ context.level_at_creation = level_at_creation;
+ return new UserKeyTablePropertiesCollector(
+ user_collector_factory_->CreateTablePropertiesCollector(context));
+ }
+
+ virtual const char* Name() const override {
+ return user_collector_factory_->Name();
+ }
+
+ private:
+ std::shared_ptr<TablePropertiesCollectorFactory> user_collector_factory_;
+};
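+
+// Illustrative sketch of how a user-defined factory ends up wrapped by the
+// class above (MyCollectorFactory is a hypothetical user type):
+//
+//   Options options;
+//   options.table_properties_collector_factories.emplace_back(
+//       std::make_shared<MyCollectorFactory>());
+//
+// Each user factory registered this way is wrapped in a
+// UserKeyTablePropertiesCollectorFactory so that the collectors it creates
+// receive plain user keys rather than encoded internal keys.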
+
+// When rocksdb creates a new table, it will encode all "user keys" into
+// "internal keys". This class collects the min/max timestamps from the
+// encoded internal keys when Add() is invoked.
+//
+// @param cmp the user comparator used to compare the timestamps in the
+// internal key.
+class TimestampTablePropertiesCollector : public IntTblPropCollector {
+ public:
+ explicit TimestampTablePropertiesCollector(const Comparator* cmp)
+ : cmp_(cmp),
+ timestamp_min_(kDisableUserTimestamp),
+ timestamp_max_(kDisableUserTimestamp) {}
+
+ Status InternalAdd(const Slice& key, const Slice& /* value */,
+ uint64_t /* file_size */) override {
+ auto user_key = ExtractUserKey(key);
+ assert(cmp_ && cmp_->timestamp_size() > 0);
+ if (user_key.size() < cmp_->timestamp_size()) {
+ return Status::Corruption(
+ "User key size mismatch when comparing to timestamp size.");
+ }
+ auto timestamp_in_key =
+ ExtractTimestampFromUserKey(user_key, cmp_->timestamp_size());
+ if (timestamp_max_ == kDisableUserTimestamp ||
+ cmp_->CompareTimestamp(timestamp_in_key, timestamp_max_) > 0) {
+ timestamp_max_.assign(timestamp_in_key.data(), timestamp_in_key.size());
+ }
+ if (timestamp_min_ == kDisableUserTimestamp ||
+ cmp_->CompareTimestamp(timestamp_min_, timestamp_in_key) > 0) {
+ timestamp_min_.assign(timestamp_in_key.data(), timestamp_in_key.size());
+ }
+ return Status::OK();
+ }
+
+ void BlockAdd(uint64_t /* block_uncomp_bytes */,
+ uint64_t /* block_compressed_bytes_fast */,
+ uint64_t /* block_compressed_bytes_slow */) override {
+ return;
+ }
+
+ Status Finish(UserCollectedProperties* properties) override {
+ assert(timestamp_min_.size() == timestamp_max_.size() &&
+ timestamp_max_.size() == cmp_->timestamp_size());
+ properties->insert({"rocksdb.timestamp_min", timestamp_min_});
+ properties->insert({"rocksdb.timestamp_max", timestamp_max_});
+ return Status::OK();
+ }
+
+ const char* Name() const override {
+ return "TimestampTablePropertiesCollector";
+ }
+
+ UserCollectedProperties GetReadableProperties() const override {
+ return {{"rocksdb.timestamp_min", Slice(timestamp_min_).ToString(true)},
+ {"rocksdb.timestamp_max", Slice(timestamp_max_).ToString(true)}};
+ }
+
+ protected:
+ const Comparator* const cmp_;
+ std::string timestamp_min_;
+ std::string timestamp_max_;
+};
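+
+// Illustrative note on the collector above: with a comparator whose
+// timestamp_size() is, say, 8 bytes, Finish() stores the raw 8-byte minimum
+// and maximum timestamps seen in the file under "rocksdb.timestamp_min" and
+// "rocksdb.timestamp_max", while GetReadableProperties() exposes the same
+// values hex-encoded for human consumption.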
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/table_properties_collector_test.cc b/src/rocksdb/db/table_properties_collector_test.cc
new file mode 100644
index 000000000..5f0f205da
--- /dev/null
+++ b/src/rocksdb/db/table_properties_collector_test.cc
@@ -0,0 +1,513 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/table_properties_collector.h"
+
+#include <map>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "db/db_impl/db_impl.h"
+#include "db/dbformat.h"
+#include "file/sequence_file_reader.h"
+#include "file/writable_file_writer.h"
+#include "options/cf_options.h"
+#include "rocksdb/flush_block_policy.h"
+#include "rocksdb/table.h"
+#include "table/block_based/block_based_table_factory.h"
+#include "table/meta_blocks.h"
+#include "table/plain/plain_table_factory.h"
+#include "table/table_builder.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/coding.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class TablePropertiesTest : public testing::Test,
+ public testing::WithParamInterface<bool> {
+ public:
+ void SetUp() override { backward_mode_ = GetParam(); }
+
+ bool backward_mode_;
+};
+
+// Utility functions for the tests
+namespace {
+static const uint32_t kTestColumnFamilyId = 66;
+static const std::string kTestColumnFamilyName = "test_column_fam";
+static const int kTestLevel = 1;
+
+void MakeBuilder(
+ const Options& options, const ImmutableOptions& ioptions,
+ const MutableCFOptions& moptions,
+ const InternalKeyComparator& internal_comparator,
+ const IntTblPropCollectorFactories* int_tbl_prop_collector_factories,
+ std::unique_ptr<WritableFileWriter>* writable,
+ std::unique_ptr<TableBuilder>* builder) {
+ std::unique_ptr<FSWritableFile> wf(new test::StringSink);
+ writable->reset(
+ new WritableFileWriter(std::move(wf), "" /* don't care */, EnvOptions()));
+ TableBuilderOptions tboptions(
+ ioptions, moptions, internal_comparator, int_tbl_prop_collector_factories,
+ options.compression, options.compression_opts, kTestColumnFamilyId,
+ kTestColumnFamilyName, kTestLevel);
+ builder->reset(NewTableBuilder(tboptions, writable->get()));
+}
+} // namespace
+
+// Collects keys that start with "A" in a table.
+class RegularKeysStartWithA : public TablePropertiesCollector {
+ public:
+ const char* Name() const override { return "RegularKeysStartWithA"; }
+
+ Status Finish(UserCollectedProperties* properties) override {
+ std::string encoded;
+ std::string encoded_num_puts;
+ std::string encoded_num_deletes;
+ std::string encoded_num_single_deletes;
+ std::string encoded_num_size_changes;
+ PutVarint32(&encoded, count_);
+ PutVarint32(&encoded_num_puts, num_puts_);
+ PutVarint32(&encoded_num_deletes, num_deletes_);
+ PutVarint32(&encoded_num_single_deletes, num_single_deletes_);
+ PutVarint32(&encoded_num_size_changes, num_size_changes_);
+ *properties = UserCollectedProperties{
+ {"TablePropertiesTest", message_},
+ {"Count", encoded},
+ {"NumPuts", encoded_num_puts},
+ {"NumDeletes", encoded_num_deletes},
+ {"NumSingleDeletes", encoded_num_single_deletes},
+ {"NumSizeChanges", encoded_num_size_changes},
+ };
+ return Status::OK();
+ }
+
+ Status AddUserKey(const Slice& user_key, const Slice& /*value*/,
+ EntryType type, SequenceNumber /*seq*/,
+ uint64_t file_size) override {
+ // Simply assume all user keys are not empty.
+ if (user_key.data()[0] == 'A') {
+ ++count_;
+ }
+ if (type == kEntryPut) {
+ num_puts_++;
+ } else if (type == kEntryDelete) {
+ num_deletes_++;
+ } else if (type == kEntrySingleDelete) {
+ num_single_deletes_++;
+ }
+ if (file_size < file_size_) {
+ message_ = "File size should not decrease.";
+ } else if (file_size != file_size_) {
+ num_size_changes_++;
+ }
+
+ return Status::OK();
+ }
+
+ UserCollectedProperties GetReadableProperties() const override {
+ return UserCollectedProperties{};
+ }
+
+ private:
+ std::string message_ = "Rocksdb";
+ uint32_t count_ = 0;
+ uint32_t num_puts_ = 0;
+ uint32_t num_deletes_ = 0;
+ uint32_t num_single_deletes_ = 0;
+ uint32_t num_size_changes_ = 0;
+ uint64_t file_size_ = 0;
+};
+
+// Collects keys that start with "A" in a table (backward-compatible mode).
+// It is also used to test the internal key table property collector.
+class RegularKeysStartWithABackwardCompatible
+ : public TablePropertiesCollector {
+ public:
+ const char* Name() const override { return "RegularKeysStartWithA"; }
+
+ Status Finish(UserCollectedProperties* properties) override {
+ std::string encoded;
+ PutVarint32(&encoded, count_);
+ *properties = UserCollectedProperties{{"TablePropertiesTest", "Rocksdb"},
+ {"Count", encoded}};
+ return Status::OK();
+ }
+
+ Status Add(const Slice& user_key, const Slice& /*value*/) override {
+ // Simply assume all user keys are not empty.
+ if (user_key.data()[0] == 'A') {
+ ++count_;
+ }
+ return Status::OK();
+ }
+
+ UserCollectedProperties GetReadableProperties() const override {
+ return UserCollectedProperties{};
+ }
+
+ private:
+ uint32_t count_ = 0;
+};
+
+class RegularKeysStartWithAInternal : public IntTblPropCollector {
+ public:
+ const char* Name() const override { return "RegularKeysStartWithA"; }
+
+ Status Finish(UserCollectedProperties* properties) override {
+ std::string encoded;
+ PutVarint32(&encoded, count_);
+ *properties = UserCollectedProperties{{"TablePropertiesTest", "Rocksdb"},
+ {"Count", encoded}};
+ return Status::OK();
+ }
+
+ Status InternalAdd(const Slice& user_key, const Slice& /*value*/,
+ uint64_t /*file_size*/) override {
+ // Simply assume all user keys are not empty.
+ if (user_key.data()[0] == 'A') {
+ ++count_;
+ }
+ return Status::OK();
+ }
+
+ void BlockAdd(uint64_t /* block_uncomp_bytes */,
+ uint64_t /* block_compressed_bytes_fast */,
+ uint64_t /* block_compressed_bytes_slow */) override {
+ // Nothing to do.
+ return;
+ }
+
+ UserCollectedProperties GetReadableProperties() const override {
+ return UserCollectedProperties{};
+ }
+
+ private:
+ uint32_t count_ = 0;
+};
+
+class RegularKeysStartWithAFactory : public IntTblPropCollectorFactory,
+ public TablePropertiesCollectorFactory {
+ public:
+ explicit RegularKeysStartWithAFactory(bool backward_mode)
+ : backward_mode_(backward_mode) {}
+ TablePropertiesCollector* CreateTablePropertiesCollector(
+ TablePropertiesCollectorFactory::Context context) override {
+ EXPECT_EQ(kTestColumnFamilyId, context.column_family_id);
+ EXPECT_EQ(kTestLevel, context.level_at_creation);
+ if (!backward_mode_) {
+ return new RegularKeysStartWithA();
+ } else {
+ return new RegularKeysStartWithABackwardCompatible();
+ }
+ }
+ IntTblPropCollector* CreateIntTblPropCollector(
+ uint32_t /*column_family_id*/, int /* level_at_creation */) override {
+ return new RegularKeysStartWithAInternal();
+ }
+ const char* Name() const override { return "RegularKeysStartWithA"; }
+
+ bool backward_mode_;
+};
+
+class FlushBlockEveryThreePolicy : public FlushBlockPolicy {
+ public:
+ bool Update(const Slice& /*key*/, const Slice& /*value*/) override {
+ return (++count_ % 3U == 0);
+ }
+
+ private:
+ uint64_t count_ = 0;
+};
+
+class FlushBlockEveryThreePolicyFactory : public FlushBlockPolicyFactory {
+ public:
+ explicit FlushBlockEveryThreePolicyFactory() {}
+
+ const char* Name() const override {
+ return "FlushBlockEveryThreePolicyFactory";
+ }
+
+ FlushBlockPolicy* NewFlushBlockPolicy(
+ const BlockBasedTableOptions& /*table_options*/,
+ const BlockBuilder& /*data_block_builder*/) const override {
+ return new FlushBlockEveryThreePolicy;
+ }
+};
+
+extern const uint64_t kBlockBasedTableMagicNumber;
+extern const uint64_t kPlainTableMagicNumber;
+namespace {
+void TestCustomizedTablePropertiesCollector(
+ bool backward_mode, uint64_t magic_number, bool test_int_tbl_prop_collector,
+ const Options& options, const InternalKeyComparator& internal_comparator) {
+ // Make sure the entries will be inserted in order.
+ std::map<std::pair<std::string, ValueType>, std::string> kvs = {
+ {{"About ", kTypeValue}, "val5"}, // starts with 'A'
+ {{"Abstract", kTypeValue}, "val2"}, // starts with 'A'
+ {{"Around ", kTypeValue}, "val7"}, // starts with 'A'
+ {{"Beyond ", kTypeValue}, "val3"},
+ {{"Builder ", kTypeValue}, "val1"},
+ {{"Love ", kTypeDeletion}, ""},
+ {{"Cancel ", kTypeValue}, "val4"},
+ {{"Find ", kTypeValue}, "val6"},
+ {{"Rocks ", kTypeDeletion}, ""},
+ {{"Foo ", kTypeSingleDeletion}, ""},
+ };
+
+ // -- Step 1: build table
+ std::unique_ptr<TableBuilder> builder;
+ std::unique_ptr<WritableFileWriter> writer;
+ const ImmutableOptions ioptions(options);
+ const MutableCFOptions moptions(options);
+ IntTblPropCollectorFactories int_tbl_prop_collector_factories;
+ if (test_int_tbl_prop_collector) {
+ int_tbl_prop_collector_factories.emplace_back(
+ new RegularKeysStartWithAFactory(backward_mode));
+ } else {
+ GetIntTblPropCollectorFactory(ioptions, &int_tbl_prop_collector_factories);
+ }
+ MakeBuilder(options, ioptions, moptions, internal_comparator,
+ &int_tbl_prop_collector_factories, &writer, &builder);
+
+ SequenceNumber seqNum = 0U;
+ for (const auto& kv : kvs) {
+ InternalKey ikey(kv.first.first, seqNum++, kv.first.second);
+ builder->Add(ikey.Encode(), kv.second);
+ }
+ ASSERT_OK(builder->Finish());
+ ASSERT_OK(writer->Flush());
+
+ // -- Step 2: Read properties
+ test::StringSink* fwf =
+ static_cast<test::StringSink*>(writer->writable_file());
+ std::unique_ptr<FSRandomAccessFile> source(
+ new test::StringSource(fwf->contents()));
+ std::unique_ptr<RandomAccessFileReader> fake_file_reader(
+ new RandomAccessFileReader(std::move(source), "test"));
+
+ std::unique_ptr<TableProperties> props;
+ Status s = ReadTableProperties(fake_file_reader.get(), fwf->contents().size(),
+ magic_number, ioptions, &props);
+ ASSERT_OK(s);
+
+ auto user_collected = props->user_collected_properties;
+
+ ASSERT_NE(user_collected.find("TablePropertiesTest"), user_collected.end());
+ ASSERT_EQ("Rocksdb", user_collected.at("TablePropertiesTest"));
+
+ uint32_t starts_with_A = 0;
+ ASSERT_NE(user_collected.find("Count"), user_collected.end());
+ Slice key(user_collected.at("Count"));
+ ASSERT_TRUE(GetVarint32(&key, &starts_with_A));
+ ASSERT_EQ(3u, starts_with_A);
+
+ if (!backward_mode && !test_int_tbl_prop_collector) {
+ uint32_t num_puts;
+ ASSERT_NE(user_collected.find("NumPuts"), user_collected.end());
+ Slice key_puts(user_collected.at("NumPuts"));
+ ASSERT_TRUE(GetVarint32(&key_puts, &num_puts));
+ ASSERT_EQ(7u, num_puts);
+
+ uint32_t num_deletes;
+ ASSERT_NE(user_collected.find("NumDeletes"), user_collected.end());
+ Slice key_deletes(user_collected.at("NumDeletes"));
+ ASSERT_TRUE(GetVarint32(&key_deletes, &num_deletes));
+ ASSERT_EQ(2u, num_deletes);
+
+ uint32_t num_single_deletes;
+ ASSERT_NE(user_collected.find("NumSingleDeletes"), user_collected.end());
+ Slice key_single_deletes(user_collected.at("NumSingleDeletes"));
+ ASSERT_TRUE(GetVarint32(&key_single_deletes, &num_single_deletes));
+ ASSERT_EQ(1u, num_single_deletes);
+
+ uint32_t num_size_changes;
+ ASSERT_NE(user_collected.find("NumSizeChanges"), user_collected.end());
+ Slice key_size_changes(user_collected.at("NumSizeChanges"));
+ ASSERT_TRUE(GetVarint32(&key_size_changes, &num_size_changes));
+ ASSERT_GE(num_size_changes, 2u);
+ }
+}
+} // namespace
+
+TEST_P(TablePropertiesTest, CustomizedTablePropertiesCollector) {
+ // Test properties collectors with internal keys or regular keys
+ // for a block-based table.
+ for (bool encode_as_internal : {true, false}) {
+ Options options;
+ BlockBasedTableOptions table_options;
+ table_options.flush_block_policy_factory =
+ std::make_shared<FlushBlockEveryThreePolicyFactory>();
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ test::PlainInternalKeyComparator ikc(options.comparator);
+ std::shared_ptr<TablePropertiesCollectorFactory> collector_factory(
+ new RegularKeysStartWithAFactory(backward_mode_));
+ options.table_properties_collector_factories.resize(1);
+ options.table_properties_collector_factories[0] = collector_factory;
+
+ TestCustomizedTablePropertiesCollector(backward_mode_,
+ kBlockBasedTableMagicNumber,
+ encode_as_internal, options, ikc);
+
+#ifndef ROCKSDB_LITE // PlainTable is not supported in Lite
+ // test plain table
+ PlainTableOptions plain_table_options;
+ plain_table_options.user_key_len = 8;
+ plain_table_options.bloom_bits_per_key = 8;
+ plain_table_options.hash_table_ratio = 0;
+
+ options.table_factory =
+ std::make_shared<PlainTableFactory>(plain_table_options);
+ TestCustomizedTablePropertiesCollector(backward_mode_,
+ kPlainTableMagicNumber,
+ encode_as_internal, options, ikc);
+#endif // !ROCKSDB_LITE
+ }
+}
+
+namespace {
+void TestInternalKeyPropertiesCollector(
+ bool backward_mode, uint64_t magic_number, bool sanitized,
+ std::shared_ptr<TableFactory> table_factory) {
+ InternalKey keys[] = {
+ InternalKey("A ", 0, ValueType::kTypeValue),
+ InternalKey("B ", 1, ValueType::kTypeValue),
+ InternalKey("C ", 2, ValueType::kTypeValue),
+ InternalKey("W ", 3, ValueType::kTypeDeletion),
+ InternalKey("X ", 4, ValueType::kTypeDeletion),
+ InternalKey("Y ", 5, ValueType::kTypeDeletion),
+ InternalKey("Z ", 6, ValueType::kTypeDeletion),
+ InternalKey("a ", 7, ValueType::kTypeSingleDeletion),
+ InternalKey("b ", 8, ValueType::kTypeMerge),
+ InternalKey("c ", 9, ValueType::kTypeMerge),
+ };
+
+ std::unique_ptr<TableBuilder> builder;
+ std::unique_ptr<WritableFileWriter> writable;
+ Options options;
+ test::PlainInternalKeyComparator pikc(options.comparator);
+
+ IntTblPropCollectorFactories int_tbl_prop_collector_factories;
+ options.table_factory = table_factory;
+ if (sanitized) {
+ options.table_properties_collector_factories.emplace_back(
+ new RegularKeysStartWithAFactory(backward_mode));
+ // With sanitization, even a regular properties collector will be able
+ // to handle internal keys.
+ auto comparator = options.comparator;
+ // HACK: Set options.info_log to avoid writing log in
+ // SanitizeOptions().
+ options.info_log = std::make_shared<test::NullLogger>();
+ options = SanitizeOptions("db", // just a place holder
+ options);
+ ImmutableOptions ioptions(options);
+ GetIntTblPropCollectorFactory(ioptions, &int_tbl_prop_collector_factories);
+ options.comparator = comparator;
+ }
+ const ImmutableOptions ioptions(options);
+ MutableCFOptions moptions(options);
+
+ for (int iter = 0; iter < 2; ++iter) {
+ MakeBuilder(options, ioptions, moptions, pikc,
+ &int_tbl_prop_collector_factories, &writable, &builder);
+ for (const auto& k : keys) {
+ builder->Add(k.Encode(), "val");
+ }
+
+ ASSERT_OK(builder->Finish());
+ ASSERT_OK(writable->Flush());
+
+ test::StringSink* fwf =
+ static_cast<test::StringSink*>(writable->writable_file());
+ std::unique_ptr<FSRandomAccessFile> source(
+ new test::StringSource(fwf->contents()));
+ std::unique_ptr<RandomAccessFileReader> reader(
+ new RandomAccessFileReader(std::move(source), "test"));
+
+ std::unique_ptr<TableProperties> props;
+ Status s = ReadTableProperties(reader.get(), fwf->contents().size(),
+ magic_number, ioptions, &props);
+ ASSERT_OK(s);
+
+ auto user_collected = props->user_collected_properties;
+ uint64_t deleted = GetDeletedKeys(user_collected);
+ ASSERT_EQ(5u, deleted); // deletes + single-deletes
+
+ bool property_present;
+ uint64_t merges = GetMergeOperands(user_collected, &property_present);
+ ASSERT_TRUE(property_present);
+ ASSERT_EQ(2u, merges);
+
+ if (sanitized) {
+ uint32_t starts_with_A = 0;
+ ASSERT_NE(user_collected.find("Count"), user_collected.end());
+ Slice key(user_collected.at("Count"));
+ ASSERT_TRUE(GetVarint32(&key, &starts_with_A));
+ ASSERT_EQ(1u, starts_with_A);
+
+ if (!backward_mode) {
+ uint32_t num_puts;
+ ASSERT_NE(user_collected.find("NumPuts"), user_collected.end());
+ Slice key_puts(user_collected.at("NumPuts"));
+ ASSERT_TRUE(GetVarint32(&key_puts, &num_puts));
+ ASSERT_EQ(3u, num_puts);
+
+ uint32_t num_deletes;
+ ASSERT_NE(user_collected.find("NumDeletes"), user_collected.end());
+ Slice key_deletes(user_collected.at("NumDeletes"));
+ ASSERT_TRUE(GetVarint32(&key_deletes, &num_deletes));
+ ASSERT_EQ(4u, num_deletes);
+
+ uint32_t num_single_deletes;
+ ASSERT_NE(user_collected.find("NumSingleDeletes"),
+ user_collected.end());
+ Slice key_single_deletes(user_collected.at("NumSingleDeletes"));
+ ASSERT_TRUE(GetVarint32(&key_single_deletes, &num_single_deletes));
+ ASSERT_EQ(1u, num_single_deletes);
+ }
+ }
+ }
+}
+} // namespace
+
+TEST_P(TablePropertiesTest, InternalKeyPropertiesCollector) {
+ TestInternalKeyPropertiesCollector(
+ backward_mode_, kBlockBasedTableMagicNumber, true /* sanitize */,
+ std::make_shared<BlockBasedTableFactory>());
+ if (backward_mode_) {
+ TestInternalKeyPropertiesCollector(
+ backward_mode_, kBlockBasedTableMagicNumber, false /* not sanitize */,
+ std::make_shared<BlockBasedTableFactory>());
+ }
+
+#ifndef ROCKSDB_LITE // PlainTable is not supported in Lite
+ PlainTableOptions plain_table_options;
+ plain_table_options.user_key_len = 8;
+ plain_table_options.bloom_bits_per_key = 8;
+ plain_table_options.hash_table_ratio = 0;
+
+ TestInternalKeyPropertiesCollector(
+ backward_mode_, kPlainTableMagicNumber, false /* not sanitize */,
+ std::make_shared<PlainTableFactory>(plain_table_options));
+#endif // !ROCKSDB_LITE
+}
+
+INSTANTIATE_TEST_CASE_P(InternalKeyPropertiesCollector, TablePropertiesTest,
+ ::testing::Bool());
+
+INSTANTIATE_TEST_CASE_P(CustomizedTablePropertiesCollector, TablePropertiesTest,
+ ::testing::Bool());
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/transaction_log_impl.cc b/src/rocksdb/db/transaction_log_impl.cc
new file mode 100644
index 000000000..3878b428a
--- /dev/null
+++ b/src/rocksdb/db/transaction_log_impl.cc
@@ -0,0 +1,298 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "db/transaction_log_impl.h"
+
+#include <cinttypes>
+
+#include "db/write_batch_internal.h"
+#include "file/sequence_file_reader.h"
+#include "util/defer.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+TransactionLogIteratorImpl::TransactionLogIteratorImpl(
+ const std::string& dir, const ImmutableDBOptions* options,
+ const TransactionLogIterator::ReadOptions& read_options,
+ const EnvOptions& soptions, const SequenceNumber seq,
+ std::unique_ptr<VectorLogPtr> files, VersionSet const* const versions,
+ const bool seq_per_batch, const std::shared_ptr<IOTracer>& io_tracer)
+ : dir_(dir),
+ options_(options),
+ read_options_(read_options),
+ soptions_(soptions),
+ starting_sequence_number_(seq),
+ files_(std::move(files)),
+ versions_(versions),
+ seq_per_batch_(seq_per_batch),
+ io_tracer_(io_tracer),
+ started_(false),
+ is_valid_(false),
+ current_file_index_(0),
+ current_batch_seq_(0),
+ current_last_seq_(0) {
+ assert(files_ != nullptr);
+ assert(versions_ != nullptr);
+ assert(!seq_per_batch_);
+ current_status_.PermitUncheckedError(); // Clear on start
+ reporter_.env = options_->env;
+ reporter_.info_log = options_->info_log.get();
+ SeekToStartSequence(); // Seek till starting sequence
+}
+
+Status TransactionLogIteratorImpl::OpenLogFile(
+ const LogFile* log_file,
+ std::unique_ptr<SequentialFileReader>* file_reader) {
+ FileSystemPtr fs(options_->fs, io_tracer_);
+ std::unique_ptr<FSSequentialFile> file;
+ std::string fname;
+ Status s;
+ EnvOptions optimized_env_options = fs->OptimizeForLogRead(soptions_);
+ if (log_file->Type() == kArchivedLogFile) {
+ fname = ArchivedLogFileName(dir_, log_file->LogNumber());
+ s = fs->NewSequentialFile(fname, optimized_env_options, &file, nullptr);
+ } else {
+ fname = LogFileName(dir_, log_file->LogNumber());
+ s = fs->NewSequentialFile(fname, optimized_env_options, &file, nullptr);
+ if (!s.ok()) {
+ // If we cannot open the file in the DB directory, try the archive dir,
+ // as the file could have been moved there in the meantime.
+ fname = ArchivedLogFileName(dir_, log_file->LogNumber());
+ s = fs->NewSequentialFile(fname, optimized_env_options, &file, nullptr);
+ }
+ }
+ if (s.ok()) {
+ file_reader->reset(new SequentialFileReader(std::move(file), fname,
+ io_tracer_, options_->listeners,
+ options_->rate_limiter.get()));
+ }
+ return s;
+}
+
+BatchResult TransactionLogIteratorImpl::GetBatch() {
+ assert(is_valid_); // cannot be called in an invalid state.
+ BatchResult result;
+ result.sequence = current_batch_seq_;
+ result.writeBatchPtr = std::move(current_batch_);
+ return result;
+}
+
+Status TransactionLogIteratorImpl::status() { return current_status_; }
+
+bool TransactionLogIteratorImpl::Valid() { return started_ && is_valid_; }
+
+bool TransactionLogIteratorImpl::RestrictedRead(Slice* record) {
+ // Don't read if there are no more complete entries to read from the logs.
+ if (current_last_seq_ >= versions_->LastSequence()) {
+ return false;
+ }
+ return current_log_reader_->ReadRecord(record, &scratch_);
+}
+
+void TransactionLogIteratorImpl::SeekToStartSequence(uint64_t start_file_index,
+ bool strict) {
+ Slice record;
+ started_ = false;
+ is_valid_ = false;
+ // Check invariant of TransactionLogIterator when SeekToStartSequence()
+ // succeeds.
+ const Defer defer([this]() {
+ if (is_valid_) {
+ assert(current_status_.ok());
+ if (starting_sequence_number_ > current_batch_seq_) {
+ assert(current_batch_seq_ < current_last_seq_);
+ assert(current_last_seq_ >= starting_sequence_number_);
+ }
+ }
+ });
+ if (files_->size() <= start_file_index) {
+ return;
+ } else if (!current_status_.ok()) {
+ return;
+ }
+ Status s =
+ OpenLogReader(files_->at(static_cast<size_t>(start_file_index)).get());
+ if (!s.ok()) {
+ current_status_ = s;
+ reporter_.Info(current_status_.ToString().c_str());
+ return;
+ }
+ while (RestrictedRead(&record)) {
+ if (record.size() < WriteBatchInternal::kHeader) {
+ reporter_.Corruption(record.size(),
+ Status::Corruption("very small log record"));
+ continue;
+ }
+ UpdateCurrentWriteBatch(record);
+ if (current_last_seq_ >= starting_sequence_number_) {
+ if (strict && current_batch_seq_ != starting_sequence_number_) {
+ current_status_ = Status::Corruption(
+ "Gap in sequence number. Could not "
+ "seek to required sequence number");
+ reporter_.Info(current_status_.ToString().c_str());
+ return;
+ } else if (strict) {
+ reporter_.Info(
+ "Could seek required sequence number. Iterator will "
+ "continue.");
+ }
+ is_valid_ = true;
+ started_ = true; // set started_ as we could seek till starting sequence
+ return;
+ } else {
+ is_valid_ = false;
+ }
+ }
+
+ // Could not find the start sequence in the first file. Normally this must
+ // be the only file; otherwise, log the error and let the iterator return
+ // the next available entry. If strict is set, we want to seek exactly to
+ // the start sequence, and it should have been present in the file we
+ // scanned above.
+ if (strict) {
+ current_status_ = Status::Corruption(
+ "Gap in sequence number. Could not "
+ "seek to required sequence number");
+ reporter_.Info(current_status_.ToString().c_str());
+ } else if (files_->size() != 1) {
+ current_status_ = Status::Corruption(
+ "Start sequence was not found, "
+ "skipping to the next available");
+ reporter_.Info(current_status_.ToString().c_str());
+ // Let NextImpl find the next available entry. started_ remains false
+ // because we don't want to check for gaps while moving to the start
+ // sequence.
+ NextImpl(true);
+ }
+}
+
+void TransactionLogIteratorImpl::Next() {
+ if (!current_status_.ok()) {
+ return;
+ }
+ return NextImpl(false);
+}
+
+void TransactionLogIteratorImpl::NextImpl(bool internal) {
+ Slice record;
+ is_valid_ = false;
+ if (!internal && !started_) {
+ // Runs every time until we can seek to the start sequence
+ SeekToStartSequence();
+ }
+ while (true) {
+ assert(current_log_reader_);
+ if (current_log_reader_->IsEOF()) {
+ current_log_reader_->UnmarkEOF();
+ }
+ while (RestrictedRead(&record)) {
+ if (record.size() < WriteBatchInternal::kHeader) {
+ reporter_.Corruption(record.size(),
+ Status::Corruption("very small log record"));
+ continue;
+ } else {
+ // started_ should be true if called by application
+ assert(internal || started_);
+ // started_ should be false if called internally
+ assert(!internal || !started_);
+ UpdateCurrentWriteBatch(record);
+ if (internal && !started_) {
+ started_ = true;
+ }
+ return;
+ }
+ }
+
+ // Open the next file
+ if (current_file_index_ < files_->size() - 1) {
+ ++current_file_index_;
+ Status s = OpenLogReader(files_->at(current_file_index_).get());
+ if (!s.ok()) {
+ is_valid_ = false;
+ current_status_ = s;
+ return;
+ }
+ } else {
+ is_valid_ = false;
+ if (current_last_seq_ == versions_->LastSequence()) {
+ current_status_ = Status::OK();
+ } else {
+ const char* msg = "Create a new iterator to fetch the new tail.";
+ current_status_ = Status::TryAgain(msg);
+ }
+ return;
+ }
+ }
+}
+
+bool TransactionLogIteratorImpl::IsBatchExpected(
+ const WriteBatch* batch, const SequenceNumber expected_seq) {
+ assert(batch);
+ SequenceNumber batchSeq = WriteBatchInternal::Sequence(batch);
+ if (batchSeq != expected_seq) {
+ char buf[200];
+ snprintf(buf, sizeof(buf),
+ "Discontinuity in log records. Got seq=%" PRIu64
+ ", Expected seq=%" PRIu64 ", Last flushed seq=%" PRIu64
+ ".Log iterator will reseek the correct batch.",
+ batchSeq, expected_seq, versions_->LastSequence());
+ reporter_.Info(buf);
+ return false;
+ }
+ return true;
+}
+
+void TransactionLogIteratorImpl::UpdateCurrentWriteBatch(const Slice& record) {
+ std::unique_ptr<WriteBatch> batch(new WriteBatch());
+ Status s = WriteBatchInternal::SetContents(batch.get(), record);
+ s.PermitUncheckedError(); // TODO: What should we do with this error?
+
+ SequenceNumber expected_seq = current_last_seq_ + 1;
+ // If the iterator has started, then confirm that we get continuous batches
+ if (started_ && !IsBatchExpected(batch.get(), expected_seq)) {
+ // Seek to the batch having expected sequence number
+ if (expected_seq < files_->at(current_file_index_)->StartSequence()) {
+ // Expected batch must lie in the previous log file
+ // Avoid underflow.
+ if (current_file_index_ != 0) {
+ current_file_index_--;
+ }
+ }
+ starting_sequence_number_ = expected_seq;
+ // current_status_ will be set to OK if the reseek succeeds.
+ // Note: this is still OK in seq_per_batch_ && two_write_queues_ mode,
+ // which allows gaps in the WAL, since it will still skip over the gap.
+ current_status_ = Status::NotFound("Gap in sequence numbers");
+ // In seq_per_batch_ mode, gaps in the sequence numbers are possible, so
+ // strict mode should be disabled.
+ return SeekToStartSequence(current_file_index_, !seq_per_batch_);
+ }
+
+ current_batch_seq_ = WriteBatchInternal::Sequence(batch.get());
+ assert(!seq_per_batch_);
+ current_last_seq_ =
+ current_batch_seq_ + WriteBatchInternal::Count(batch.get()) - 1;
+ // current_batch_seq_ can only change here.
+ assert(current_last_seq_ <= versions_->LastSequence());
+
+ current_batch_ = std::move(batch);
+ is_valid_ = true;
+ current_status_ = Status::OK();
+}
+
+Status TransactionLogIteratorImpl::OpenLogReader(const LogFile* log_file) {
+ std::unique_ptr<SequentialFileReader> file;
+ Status s = OpenLogFile(log_file, &file);
+ if (!s.ok()) {
+ return s;
+ }
+ assert(file);
+ current_log_reader_.reset(
+ new log::Reader(options_->info_log, std::move(file), &reporter_,
+ read_options_.verify_checksums_, log_file->LogNumber()));
+ return Status::OK();
+}
+} // namespace ROCKSDB_NAMESPACE
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/db/transaction_log_impl.h b/src/rocksdb/db/transaction_log_impl.h
new file mode 100644
index 000000000..e8c6efc02
--- /dev/null
+++ b/src/rocksdb/db/transaction_log_impl.h
@@ -0,0 +1,130 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#ifndef ROCKSDB_LITE
+#include <vector>
+
+#include "db/log_reader.h"
+#include "db/version_set.h"
+#include "file/filename.h"
+#include "logging/logging.h"
+#include "options/db_options.h"
+#include "port/port.h"
+#include "rocksdb/env.h"
+#include "rocksdb/options.h"
+#include "rocksdb/transaction_log.h"
+#include "rocksdb/types.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class LogFileImpl : public LogFile {
+ public:
+ LogFileImpl(uint64_t logNum, WalFileType logType, SequenceNumber startSeq,
+ uint64_t sizeBytes)
+ : logNumber_(logNum),
+ type_(logType),
+ startSequence_(startSeq),
+ sizeFileBytes_(sizeBytes) {}
+
+ std::string PathName() const override {
+ if (type_ == kArchivedLogFile) {
+ return ArchivedLogFileName("", logNumber_);
+ }
+ return LogFileName("", logNumber_);
+ }
+
+ uint64_t LogNumber() const override { return logNumber_; }
+
+ WalFileType Type() const override { return type_; }
+
+ SequenceNumber StartSequence() const override { return startSequence_; }
+
+ uint64_t SizeFileBytes() const override { return sizeFileBytes_; }
+
+ bool operator<(const LogFile& that) const {
+ return LogNumber() < that.LogNumber();
+ }
+
+ private:
+ uint64_t logNumber_;
+ WalFileType type_;
+ SequenceNumber startSequence_;
+ uint64_t sizeFileBytes_;
+};
+
+class TransactionLogIteratorImpl : public TransactionLogIterator {
+ public:
+ TransactionLogIteratorImpl(
+ const std::string& dir, const ImmutableDBOptions* options,
+ const TransactionLogIterator::ReadOptions& read_options,
+ const EnvOptions& soptions, const SequenceNumber seqNum,
+ std::unique_ptr<VectorLogPtr> files, VersionSet const* const versions,
+ const bool seq_per_batch, const std::shared_ptr<IOTracer>& io_tracer);
+
+ virtual bool Valid() override;
+
+ virtual void Next() override;
+
+ virtual Status status() override;
+
+ virtual BatchResult GetBatch() override;
+
+ private:
+ const std::string& dir_;
+ const ImmutableDBOptions* options_;
+ const TransactionLogIterator::ReadOptions read_options_;
+ const EnvOptions& soptions_;
+ SequenceNumber starting_sequence_number_;
+ std::unique_ptr<VectorLogPtr> files_;
+ // Used only to get latest seq. num
+ // TODO(icanadi) can this be just a callback?
+ VersionSet const* const versions_;
+ const bool seq_per_batch_;
+ std::shared_ptr<IOTracer> io_tracer_;
+
+ // State variables
+ bool started_;
+ bool is_valid_; // not valid when it starts off.
+ Status current_status_;
+ size_t current_file_index_;
+ std::unique_ptr<WriteBatch> current_batch_;
+ std::unique_ptr<log::Reader> current_log_reader_;
+ std::string scratch_;
+ Status OpenLogFile(const LogFile* log_file,
+ std::unique_ptr<SequentialFileReader>* file);
+
+ struct LogReporter : public log::Reader::Reporter {
+ Env* env;
+ Logger* info_log;
+ virtual void Corruption(size_t bytes, const Status& s) override {
+ ROCKS_LOG_ERROR(info_log, "dropping %" ROCKSDB_PRIszt " bytes; %s", bytes,
+ s.ToString().c_str());
+ }
+ virtual void Info(const char* s) { ROCKS_LOG_INFO(info_log, "%s", s); }
+ } reporter_;
+
+ SequenceNumber
+ current_batch_seq_; // sequence number at start of current batch
+ SequenceNumber current_last_seq_; // last sequence in the current batch
+ // Reads from the transaction log only if the write batch record has been
+ // written.
+ bool RestrictedRead(Slice* record);
+ // Seeks to starting_sequence_number_ reading from start_file_index in files_.
+ // If strict is set, then we must get a batch starting with
+ // starting_sequence_number_.
+ void SeekToStartSequence(uint64_t start_file_index = 0, bool strict = false);
+ // Implementation of Next. SeekToStartSequence calls it internally with
+ // internal=true to let it find the next entry even if it has to jump over
+ // gaps, because the iterator may start from the first available entry but
+ // promises to be continuous after that.
+ void NextImpl(bool internal = false);
+ // Check if batch is expected, else return false
+ bool IsBatchExpected(const WriteBatch* batch, SequenceNumber expected_seq);
+ // Update current batch if a continuous batch is found.
+ void UpdateCurrentWriteBatch(const Slice& record);
+ Status OpenLogReader(const LogFile* file);
+};
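+
+// Illustrative consumer loop over the public TransactionLogIterator interface
+// implemented above (assumes the iterator was obtained from
+// DB::GetUpdatesSince()):
+//
+//   std::unique_ptr<TransactionLogIterator> iter;
+//   // ... DB::GetUpdatesSince(start_seq, &iter) ...
+//   for (; iter->Valid(); iter->Next()) {
+//     BatchResult res = iter->GetBatch();
+//     // res.sequence is the sequence number of the first entry in
+//     // res.writeBatchPtr; batches are returned in sequence order.
+//   }
+//   Status s = iter->status();  // e.g. Status::TryAgain() if newer writes
+//                               // exist beyond the files held by this
+//                               // iterator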
+} // namespace ROCKSDB_NAMESPACE
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/db/trim_history_scheduler.cc b/src/rocksdb/db/trim_history_scheduler.cc
new file mode 100644
index 000000000..d7ca0899f
--- /dev/null
+++ b/src/rocksdb/db/trim_history_scheduler.cc
@@ -0,0 +1,54 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/trim_history_scheduler.h"
+
+#include <cassert>
+
+#include "db/column_family.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+void TrimHistoryScheduler::ScheduleWork(ColumnFamilyData* cfd) {
+ std::lock_guard<std::mutex> lock(checking_mutex_);
+ cfd->Ref();
+ cfds_.push_back(cfd);
+ is_empty_.store(false, std::memory_order_relaxed);
+}
+
+ColumnFamilyData* TrimHistoryScheduler::TakeNextColumnFamily() {
+ std::lock_guard<std::mutex> lock(checking_mutex_);
+ while (true) {
+ if (cfds_.empty()) {
+ return nullptr;
+ }
+ ColumnFamilyData* cfd = cfds_.back();
+ cfds_.pop_back();
+ if (cfds_.empty()) {
+ is_empty_.store(true, std::memory_order_relaxed);
+ }
+
+ if (!cfd->IsDropped()) {
+ // success
+ return cfd;
+ }
+ cfd->UnrefAndTryDelete();
+ }
+}
+
+bool TrimHistoryScheduler::Empty() {
+ bool is_empty = is_empty_.load(std::memory_order_relaxed);
+ return is_empty;
+}
+
+void TrimHistoryScheduler::Clear() {
+ ColumnFamilyData* cfd;
+ while ((cfd = TakeNextColumnFamily()) != nullptr) {
+ cfd->UnrefAndTryDelete();
+ }
+ assert(Empty());
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/trim_history_scheduler.h b/src/rocksdb/db/trim_history_scheduler.h
new file mode 100644
index 000000000..252802a7a
--- /dev/null
+++ b/src/rocksdb/db/trim_history_scheduler.h
@@ -0,0 +1,46 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <stdint.h>
+
+#include <atomic>
+#include <mutex>
+
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class ColumnFamilyData;
+
+// Similar to FlushScheduler, TrimHistoryScheduler is a FIFO queue that keeps
+// track of column families whose flushed immutable memtables may need to be
+// removed (aka trimmed). The actual trimming may be slightly delayed. Due to
+// the use of the mutex and atomic variable, ScheduleWork,
+// TakeNextColumnFamily, and Empty can be called concurrently.
+class TrimHistoryScheduler {
+ public:
+ TrimHistoryScheduler() : is_empty_(true) {}
+
+ // When a column family needs history trimming, add cfd to the FIFO queue
+ void ScheduleWork(ColumnFamilyData* cfd);
+
+ // Remove the column family from the queue; the caller is responsible for
+ // calling `MemtableList::TrimHistory`.
+ ColumnFamilyData* TakeNextColumnFamily();
+
+ bool Empty();
+
+ void Clear();
+
+ // Not on the critical path; a mutex is used to ensure thread safety.
+ private:
+ std::atomic<bool> is_empty_;
+ autovector<ColumnFamilyData*> cfds_;
+ std::mutex checking_mutex_;
+};
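+
+// Illustrative producer/consumer usage of the scheduler above (the trimming
+// step itself is only sketched; see `MemtableList::TrimHistory`):
+//
+//   scheduler.ScheduleWork(cfd);  // takes a reference on cfd
+//   ...
+//   while (ColumnFamilyData* next = scheduler.TakeNextColumnFamily()) {
+//     // ... trim next's flushed memtable history ...
+//     next->UnrefAndTryDelete();  // release the reference taken above
+//   }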
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/version_builder.cc b/src/rocksdb/db/version_builder.cc
new file mode 100644
index 000000000..2c65dcf71
--- /dev/null
+++ b/src/rocksdb/db/version_builder.cc
@@ -0,0 +1,1372 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/version_builder.h"
+
+#include <algorithm>
+#include <atomic>
+#include <cinttypes>
+#include <functional>
+#include <map>
+#include <memory>
+#include <set>
+#include <sstream>
+#include <thread>
+#include <unordered_map>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+#include "cache/cache_reservation_manager.h"
+#include "db/blob/blob_file_meta.h"
+#include "db/dbformat.h"
+#include "db/internal_stats.h"
+#include "db/table_cache.h"
+#include "db/version_set.h"
+#include "port/port.h"
+#include "table/table_reader.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class VersionBuilder::Rep {
+ class NewestFirstBySeqNo {
+ public:
+ bool operator()(const FileMetaData* lhs, const FileMetaData* rhs) const {
+ assert(lhs);
+ assert(rhs);
+
+ if (lhs->fd.largest_seqno != rhs->fd.largest_seqno) {
+ return lhs->fd.largest_seqno > rhs->fd.largest_seqno;
+ }
+
+ if (lhs->fd.smallest_seqno != rhs->fd.smallest_seqno) {
+ return lhs->fd.smallest_seqno > rhs->fd.smallest_seqno;
+ }
+
+ // Break ties by file number
+ return lhs->fd.GetNumber() > rhs->fd.GetNumber();
+ }
+ };
+
+ class BySmallestKey {
+ public:
+ explicit BySmallestKey(const InternalKeyComparator* cmp) : cmp_(cmp) {}
+
+ bool operator()(const FileMetaData* lhs, const FileMetaData* rhs) const {
+ assert(lhs);
+ assert(rhs);
+ assert(cmp_);
+
+ const int r = cmp_->Compare(lhs->smallest, rhs->smallest);
+ if (r != 0) {
+ return (r < 0);
+ }
+
+ // Break ties by file number
+ return (lhs->fd.GetNumber() < rhs->fd.GetNumber());
+ }
+
+ private:
+ const InternalKeyComparator* cmp_;
+ };
+
+ struct LevelState {
+ std::unordered_set<uint64_t> deleted_files;
+ // Map from file number to file meta data.
+ std::unordered_map<uint64_t, FileMetaData*> added_files;
+ };
+
+ // A class that represents the accumulated changes (like additional garbage or
+ // newly linked/unlinked SST files) for a given blob file after applying a
+ // series of VersionEdits.
+ class BlobFileMetaDataDelta {
+ public:
+ bool IsEmpty() const {
+ return !additional_garbage_count_ && !additional_garbage_bytes_ &&
+ newly_linked_ssts_.empty() && newly_unlinked_ssts_.empty();
+ }
+
+ uint64_t GetAdditionalGarbageCount() const {
+ return additional_garbage_count_;
+ }
+
+ uint64_t GetAdditionalGarbageBytes() const {
+ return additional_garbage_bytes_;
+ }
+
+ const std::unordered_set<uint64_t>& GetNewlyLinkedSsts() const {
+ return newly_linked_ssts_;
+ }
+
+ const std::unordered_set<uint64_t>& GetNewlyUnlinkedSsts() const {
+ return newly_unlinked_ssts_;
+ }
+
+ void AddGarbage(uint64_t count, uint64_t bytes) {
+ additional_garbage_count_ += count;
+ additional_garbage_bytes_ += bytes;
+ }
+
+ void LinkSst(uint64_t sst_file_number) {
+ assert(newly_linked_ssts_.find(sst_file_number) ==
+ newly_linked_ssts_.end());
+
+ // Reconcile with newly unlinked SSTs on the fly. (Note: an SST can be
+ // linked to and unlinked from the same blob file in the case of a trivial
+ // move.)
+ auto it = newly_unlinked_ssts_.find(sst_file_number);
+
+ if (it != newly_unlinked_ssts_.end()) {
+ newly_unlinked_ssts_.erase(it);
+ } else {
+ newly_linked_ssts_.emplace(sst_file_number);
+ }
+ }
+
+ void UnlinkSst(uint64_t sst_file_number) {
+ assert(newly_unlinked_ssts_.find(sst_file_number) ==
+ newly_unlinked_ssts_.end());
+
+ // Reconcile with newly linked SSTs on the fly. (Note: an SST can be
+ // linked to and unlinked from the same blob file in the case of a trivial
+ // move.)
+ auto it = newly_linked_ssts_.find(sst_file_number);
+
+ if (it != newly_linked_ssts_.end()) {
+ newly_linked_ssts_.erase(it);
+ } else {
+ newly_unlinked_ssts_.emplace(sst_file_number);
+ }
+ }
+
+ private:
+ uint64_t additional_garbage_count_ = 0;
+ uint64_t additional_garbage_bytes_ = 0;
+ std::unordered_set<uint64_t> newly_linked_ssts_;
+ std::unordered_set<uint64_t> newly_unlinked_ssts_;
+ };
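+
+ // Example of the reconciliation performed by LinkSst()/UnlinkSst() above
+ // (illustrative): if a trivial move causes an SST, say #7, to be unlinked
+ // from a blob file and then linked to it again within the same series of
+ // edits, the two calls cancel out, the delta stays empty, and no metadata
+ // update is needed for that blob file.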
+
+ // A class that represents the state of a blob file after applying a series of
+ // VersionEdits. In addition to the resulting state, it also contains the
+ // delta (see BlobFileMetaDataDelta above). The resulting state can be used to
+ // identify obsolete blob files, while the delta makes it possible to
+ // efficiently detect trivial moves.
+ class MutableBlobFileMetaData {
+ public:
+ // To be used for brand new blob files
+ explicit MutableBlobFileMetaData(
+ std::shared_ptr<SharedBlobFileMetaData>&& shared_meta)
+ : shared_meta_(std::move(shared_meta)) {}
+
+ // To be used for pre-existing blob files
+ explicit MutableBlobFileMetaData(
+ const std::shared_ptr<BlobFileMetaData>& meta)
+ : shared_meta_(meta->GetSharedMeta()),
+ linked_ssts_(meta->GetLinkedSsts()),
+ garbage_blob_count_(meta->GetGarbageBlobCount()),
+ garbage_blob_bytes_(meta->GetGarbageBlobBytes()) {}
+
+ const std::shared_ptr<SharedBlobFileMetaData>& GetSharedMeta() const {
+ return shared_meta_;
+ }
+
+ uint64_t GetBlobFileNumber() const {
+ assert(shared_meta_);
+ return shared_meta_->GetBlobFileNumber();
+ }
+
+ bool HasDelta() const { return !delta_.IsEmpty(); }
+
+ const std::unordered_set<uint64_t>& GetLinkedSsts() const {
+ return linked_ssts_;
+ }
+
+ uint64_t GetGarbageBlobCount() const { return garbage_blob_count_; }
+
+ uint64_t GetGarbageBlobBytes() const { return garbage_blob_bytes_; }
+
+ bool AddGarbage(uint64_t count, uint64_t bytes) {
+ assert(shared_meta_);
+
+ if (garbage_blob_count_ + count > shared_meta_->GetTotalBlobCount() ||
+ garbage_blob_bytes_ + bytes > shared_meta_->GetTotalBlobBytes()) {
+ return false;
+ }
+
+ delta_.AddGarbage(count, bytes);
+
+ garbage_blob_count_ += count;
+ garbage_blob_bytes_ += bytes;
+
+ return true;
+ }
+
+ void LinkSst(uint64_t sst_file_number) {
+ delta_.LinkSst(sst_file_number);
+
+ assert(linked_ssts_.find(sst_file_number) == linked_ssts_.end());
+ linked_ssts_.emplace(sst_file_number);
+ }
+
+ void UnlinkSst(uint64_t sst_file_number) {
+ delta_.UnlinkSst(sst_file_number);
+
+ assert(linked_ssts_.find(sst_file_number) != linked_ssts_.end());
+ linked_ssts_.erase(sst_file_number);
+ }
+
+ private:
+ std::shared_ptr<SharedBlobFileMetaData> shared_meta_;
+ // Accumulated changes
+ BlobFileMetaDataDelta delta_;
+ // Resulting state after applying the changes
+ BlobFileMetaData::LinkedSsts linked_ssts_;
+ uint64_t garbage_blob_count_ = 0;
+ uint64_t garbage_blob_bytes_ = 0;
+ };
+
+ const FileOptions& file_options_;
+ const ImmutableCFOptions* const ioptions_;
+ TableCache* table_cache_;
+ VersionStorageInfo* base_vstorage_;
+ VersionSet* version_set_;
+ int num_levels_;
+ LevelState* levels_;
+ // Store sizes of levels larger than num_levels_. We do this instead of
+ // storing them in levels_ to avoid regression in case there are no files
+ // on invalid levels. The version is not consistent if in the end the files
+ // on invalid levels don't cancel out.
+ std::unordered_map<int, size_t> invalid_level_sizes_;
+ // Whether there are invalid new files or invalid deletion on levels larger
+ // than num_levels_.
+ bool has_invalid_levels_;
+ // Current levels of table files affected by additions/deletions.
+ std::unordered_map<uint64_t, int> table_file_levels_;
+ // Current compact cursors that should be changed after the last compaction
+ std::unordered_map<int, InternalKey> updated_compact_cursors_;
+ NewestFirstBySeqNo level_zero_cmp_;
+ BySmallestKey level_nonzero_cmp_;
+
+ // Mutable metadata objects for all blob files affected by the series of
+ // version edits.
+ std::map<uint64_t, MutableBlobFileMetaData> mutable_blob_file_metas_;
+
+ std::shared_ptr<CacheReservationManager> file_metadata_cache_res_mgr_;
+
+ public:
+ Rep(const FileOptions& file_options, const ImmutableCFOptions* ioptions,
+ TableCache* table_cache, VersionStorageInfo* base_vstorage,
+ VersionSet* version_set,
+ std::shared_ptr<CacheReservationManager> file_metadata_cache_res_mgr)
+ : file_options_(file_options),
+ ioptions_(ioptions),
+ table_cache_(table_cache),
+ base_vstorage_(base_vstorage),
+ version_set_(version_set),
+ num_levels_(base_vstorage->num_levels()),
+ has_invalid_levels_(false),
+ level_nonzero_cmp_(base_vstorage_->InternalComparator()),
+ file_metadata_cache_res_mgr_(file_metadata_cache_res_mgr) {
+ assert(ioptions_);
+
+ levels_ = new LevelState[num_levels_];
+ }
+
+ ~Rep() {
+ for (int level = 0; level < num_levels_; level++) {
+ const auto& added = levels_[level].added_files;
+ for (auto& pair : added) {
+ UnrefFile(pair.second);
+ }
+ }
+
+ delete[] levels_;
+ }
+
+ void UnrefFile(FileMetaData* f) {
+ f->refs--;
+ if (f->refs <= 0) {
+ if (f->table_reader_handle) {
+ assert(table_cache_ != nullptr);
+ table_cache_->ReleaseHandle(f->table_reader_handle);
+ f->table_reader_handle = nullptr;
+ }
+
+ if (file_metadata_cache_res_mgr_) {
+ Status s = file_metadata_cache_res_mgr_->UpdateCacheReservation(
+ f->ApproximateMemoryUsage(), false /* increase */);
+ s.PermitUncheckedError();
+ }
+ delete f;
+ }
+ }
+
+ // Mapping used for checking the consistency of links between SST files and
+ // blob files. It is built using the forward links (table file -> blob file),
+ // and is subsequently compared with the inverse mapping stored in the
+ // BlobFileMetaData objects.
+ using ExpectedLinkedSsts =
+ std::unordered_map<uint64_t, BlobFileMetaData::LinkedSsts>;
+
+ static void UpdateExpectedLinkedSsts(
+ uint64_t table_file_number, uint64_t blob_file_number,
+ ExpectedLinkedSsts* expected_linked_ssts) {
+ assert(expected_linked_ssts);
+
+ if (blob_file_number == kInvalidBlobFileNumber) {
+ return;
+ }
+
+ (*expected_linked_ssts)[blob_file_number].emplace(table_file_number);
+ }
+
+ template <typename Checker>
+ Status CheckConsistencyDetailsForLevel(
+ const VersionStorageInfo* vstorage, int level, Checker checker,
+ const std::string& sync_point,
+ ExpectedLinkedSsts* expected_linked_ssts) const {
+#ifdef NDEBUG
+ (void)sync_point;
+#endif
+
+ assert(vstorage);
+ assert(level >= 0 && level < num_levels_);
+ assert(expected_linked_ssts);
+
+ const auto& level_files = vstorage->LevelFiles(level);
+
+ if (level_files.empty()) {
+ return Status::OK();
+ }
+
+ assert(level_files[0]);
+ UpdateExpectedLinkedSsts(level_files[0]->fd.GetNumber(),
+ level_files[0]->oldest_blob_file_number,
+ expected_linked_ssts);
+
+ for (size_t i = 1; i < level_files.size(); ++i) {
+ assert(level_files[i]);
+ UpdateExpectedLinkedSsts(level_files[i]->fd.GetNumber(),
+ level_files[i]->oldest_blob_file_number,
+ expected_linked_ssts);
+
+ auto lhs = level_files[i - 1];
+ auto rhs = level_files[i];
+
+#ifndef NDEBUG
+ auto pair = std::make_pair(&lhs, &rhs);
+ TEST_SYNC_POINT_CALLBACK(sync_point, &pair);
+#endif
+
+ const Status s = checker(lhs, rhs);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ return Status::OK();
+ }
+
+ // Make sure table files are sorted correctly and that the links between
+ // table files and blob files are consistent.
+ Status CheckConsistencyDetails(const VersionStorageInfo* vstorage) const {
+ assert(vstorage);
+
+ ExpectedLinkedSsts expected_linked_ssts;
+
+ if (num_levels_ > 0) {
+ // Check L0
+ {
+ auto l0_checker = [this](const FileMetaData* lhs,
+ const FileMetaData* rhs) {
+ assert(lhs);
+ assert(rhs);
+
+ if (!level_zero_cmp_(lhs, rhs)) {
+ std::ostringstream oss;
+ oss << "L0 files are not sorted properly: files #"
+ << lhs->fd.GetNumber() << ", #" << rhs->fd.GetNumber();
+
+ return Status::Corruption("VersionBuilder", oss.str());
+ }
+
+ if (rhs->fd.smallest_seqno == rhs->fd.largest_seqno) {
+ // This is an external file that we ingested
+ const SequenceNumber external_file_seqno = rhs->fd.smallest_seqno;
+
+ if (!(external_file_seqno < lhs->fd.largest_seqno ||
+ external_file_seqno == 0)) {
+ std::ostringstream oss;
+ oss << "L0 file #" << lhs->fd.GetNumber() << " with seqno "
+ << lhs->fd.smallest_seqno << ' ' << lhs->fd.largest_seqno
+ << " vs. file #" << rhs->fd.GetNumber()
+ << " with global_seqno " << external_file_seqno;
+
+ return Status::Corruption("VersionBuilder", oss.str());
+ }
+ } else if (lhs->fd.smallest_seqno <= rhs->fd.smallest_seqno) {
+ std::ostringstream oss;
+ oss << "L0 file #" << lhs->fd.GetNumber() << " with seqno "
+ << lhs->fd.smallest_seqno << ' ' << lhs->fd.largest_seqno
+ << " vs. file #" << rhs->fd.GetNumber() << " with seqno "
+ << rhs->fd.smallest_seqno << ' ' << rhs->fd.largest_seqno;
+
+ return Status::Corruption("VersionBuilder", oss.str());
+ }
+
+ return Status::OK();
+ };
+
+ const Status s = CheckConsistencyDetailsForLevel(
+ vstorage, /* level */ 0, l0_checker,
+ "VersionBuilder::CheckConsistency0", &expected_linked_ssts);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ // Check L1 and up
+ const InternalKeyComparator* const icmp = vstorage->InternalComparator();
+ assert(icmp);
+
+ for (int level = 1; level < num_levels_; ++level) {
+ auto checker = [this, level, icmp](const FileMetaData* lhs,
+ const FileMetaData* rhs) {
+ assert(lhs);
+ assert(rhs);
+
+ if (!level_nonzero_cmp_(lhs, rhs)) {
+ std::ostringstream oss;
+ oss << 'L' << level << " files are not sorted properly: files #"
+ << lhs->fd.GetNumber() << ", #" << rhs->fd.GetNumber();
+
+ return Status::Corruption("VersionBuilder", oss.str());
+ }
+
+ // Make sure there is no overlap in level
+ if (icmp->Compare(lhs->largest, rhs->smallest) >= 0) {
+ std::ostringstream oss;
+ oss << 'L' << level << " has overlapping ranges: file #"
+ << lhs->fd.GetNumber()
+ << " largest key: " << lhs->largest.DebugString(true)
+ << " vs. file #" << rhs->fd.GetNumber()
+ << " smallest key: " << rhs->smallest.DebugString(true);
+
+ return Status::Corruption("VersionBuilder", oss.str());
+ }
+
+ return Status::OK();
+ };
+
+ const Status s = CheckConsistencyDetailsForLevel(
+ vstorage, level, checker, "VersionBuilder::CheckConsistency1",
+ &expected_linked_ssts);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+ }
+
+ // Make sure that all blob files in the version have non-garbage data and
+ // the links between them and the table files are consistent.
+ const auto& blob_files = vstorage->GetBlobFiles();
+ for (const auto& blob_file_meta : blob_files) {
+ assert(blob_file_meta);
+
+ const uint64_t blob_file_number = blob_file_meta->GetBlobFileNumber();
+
+ if (blob_file_meta->GetGarbageBlobCount() >=
+ blob_file_meta->GetTotalBlobCount()) {
+ std::ostringstream oss;
+ oss << "Blob file #" << blob_file_number
+ << " consists entirely of garbage";
+
+ return Status::Corruption("VersionBuilder", oss.str());
+ }
+
+ if (blob_file_meta->GetLinkedSsts() !=
+ expected_linked_ssts[blob_file_number]) {
+ std::ostringstream oss;
+ oss << "Links are inconsistent between table files and blob file #"
+ << blob_file_number;
+
+ return Status::Corruption("VersionBuilder", oss.str());
+ }
+ }
+
+ Status ret_s;
+ TEST_SYNC_POINT_CALLBACK("VersionBuilder::CheckConsistencyBeforeReturn",
+ &ret_s);
+ return ret_s;
+ }
+
+ Status CheckConsistency(const VersionStorageInfo* vstorage) const {
+ assert(vstorage);
+
+ // Always run consistency checks in debug build
+#ifdef NDEBUG
+ if (!vstorage->force_consistency_checks()) {
+ return Status::OK();
+ }
+#endif
+ Status s = CheckConsistencyDetails(vstorage);
+ if (s.IsCorruption() && s.getState()) {
+ // Make it clear the error is due to force_consistency_checks = 1 or
+ // debug build
+#ifdef NDEBUG
+ auto prefix = "force_consistency_checks";
+#else
+ auto prefix = "force_consistency_checks(DEBUG)";
+#endif
+ s = Status::Corruption(prefix, s.getState());
+ } else {
+    // We only expect a corruption status with a message, or OK
+ assert(s.ok());
+ }
+ return s;
+ }
+
+ bool CheckConsistencyForNumLevels() const {
+ // Make sure there are no files on or beyond num_levels().
+ if (has_invalid_levels_) {
+ return false;
+ }
+
+ for (const auto& pair : invalid_level_sizes_) {
+ const size_t level_size = pair.second;
+ if (level_size != 0) {
+ return false;
+ }
+ }
+
+ return true;
+ }
+
+ bool IsBlobFileInVersion(uint64_t blob_file_number) const {
+ auto mutable_it = mutable_blob_file_metas_.find(blob_file_number);
+ if (mutable_it != mutable_blob_file_metas_.end()) {
+ return true;
+ }
+
+ assert(base_vstorage_);
+ const auto meta = base_vstorage_->GetBlobFileMetaData(blob_file_number);
+
+ return !!meta;
+ }
+
+ MutableBlobFileMetaData* GetOrCreateMutableBlobFileMetaData(
+ uint64_t blob_file_number) {
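+    // Return the mutable metadata if it already exists; otherwise, create it
+    // lazily from the base version's metadata (provided the blob file is
+    // present there).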
+ auto mutable_it = mutable_blob_file_metas_.find(blob_file_number);
+ if (mutable_it != mutable_blob_file_metas_.end()) {
+ return &mutable_it->second;
+ }
+
+ assert(base_vstorage_);
+ const auto meta = base_vstorage_->GetBlobFileMetaData(blob_file_number);
+
+ if (meta) {
+ mutable_it = mutable_blob_file_metas_
+ .emplace(blob_file_number, MutableBlobFileMetaData(meta))
+ .first;
+ return &mutable_it->second;
+ }
+
+ return nullptr;
+ }
+
+ Status ApplyBlobFileAddition(const BlobFileAddition& blob_file_addition) {
+ const uint64_t blob_file_number = blob_file_addition.GetBlobFileNumber();
+
+ if (IsBlobFileInVersion(blob_file_number)) {
+ std::ostringstream oss;
+ oss << "Blob file #" << blob_file_number << " already added";
+
+ return Status::Corruption("VersionBuilder", oss.str());
+ }
+
+ // Note: we use C++11 for now but in C++14, this could be done in a more
+ // elegant way using generalized lambda capture.
+ VersionSet* const vs = version_set_;
+ const ImmutableCFOptions* const ioptions = ioptions_;
+
+ auto deleter = [vs, ioptions](SharedBlobFileMetaData* shared_meta) {
+ if (vs) {
+ assert(ioptions);
+ assert(!ioptions->cf_paths.empty());
+ assert(shared_meta);
+
+ vs->AddObsoleteBlobFile(shared_meta->GetBlobFileNumber(),
+ ioptions->cf_paths.front().path);
+ }
+
+ delete shared_meta;
+ };
+
+ auto shared_meta = SharedBlobFileMetaData::Create(
+ blob_file_number, blob_file_addition.GetTotalBlobCount(),
+ blob_file_addition.GetTotalBlobBytes(),
+ blob_file_addition.GetChecksumMethod(),
+ blob_file_addition.GetChecksumValue(), deleter);
+
+ mutable_blob_file_metas_.emplace(
+ blob_file_number, MutableBlobFileMetaData(std::move(shared_meta)));
+
+ return Status::OK();
+ }
+
+ Status ApplyBlobFileGarbage(const BlobFileGarbage& blob_file_garbage) {
+ const uint64_t blob_file_number = blob_file_garbage.GetBlobFileNumber();
+
+ MutableBlobFileMetaData* const mutable_meta =
+ GetOrCreateMutableBlobFileMetaData(blob_file_number);
+
+ if (!mutable_meta) {
+ std::ostringstream oss;
+ oss << "Blob file #" << blob_file_number << " not found";
+
+ return Status::Corruption("VersionBuilder", oss.str());
+ }
+
+ if (!mutable_meta->AddGarbage(blob_file_garbage.GetGarbageBlobCount(),
+ blob_file_garbage.GetGarbageBlobBytes())) {
+ std::ostringstream oss;
+ oss << "Garbage overflow for blob file #" << blob_file_number;
+ return Status::Corruption("VersionBuilder", oss.str());
+ }
+
+ return Status::OK();
+ }
+
+ int GetCurrentLevelForTableFile(uint64_t file_number) const {
+ auto it = table_file_levels_.find(file_number);
+ if (it != table_file_levels_.end()) {
+ return it->second;
+ }
+
+ assert(base_vstorage_);
+ return base_vstorage_->GetFileLocation(file_number).GetLevel();
+ }
+
+ uint64_t GetOldestBlobFileNumberForTableFile(int level,
+ uint64_t file_number) const {
+ assert(level < num_levels_);
+
+ const auto& added_files = levels_[level].added_files;
+
+ auto it = added_files.find(file_number);
+ if (it != added_files.end()) {
+ const FileMetaData* const meta = it->second;
+ assert(meta);
+
+ return meta->oldest_blob_file_number;
+ }
+
+ assert(base_vstorage_);
+ const FileMetaData* const meta =
+ base_vstorage_->GetFileMetaDataByNumber(file_number);
+ assert(meta);
+
+ return meta->oldest_blob_file_number;
+ }
+
+ Status ApplyFileDeletion(int level, uint64_t file_number) {
+ assert(level != VersionStorageInfo::FileLocation::Invalid().GetLevel());
+
+ const int current_level = GetCurrentLevelForTableFile(file_number);
+
+ if (level != current_level) {
+ if (level >= num_levels_) {
+ has_invalid_levels_ = true;
+ }
+
+ std::ostringstream oss;
+ oss << "Cannot delete table file #" << file_number << " from level "
+ << level << " since it is ";
+ if (current_level ==
+ VersionStorageInfo::FileLocation::Invalid().GetLevel()) {
+ oss << "not in the LSM tree";
+ } else {
+ oss << "on level " << current_level;
+ }
+
+ return Status::Corruption("VersionBuilder", oss.str());
+ }
+
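+    // Deletions from levels beyond num_levels_ only update the bookkeeping in
+    // invalid_level_sizes_ and table_file_levels_.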
+ if (level >= num_levels_) {
+ assert(invalid_level_sizes_[level] > 0);
+ --invalid_level_sizes_[level];
+
+ table_file_levels_[file_number] =
+ VersionStorageInfo::FileLocation::Invalid().GetLevel();
+
+ return Status::OK();
+ }
+
+ const uint64_t blob_file_number =
+ GetOldestBlobFileNumberForTableFile(level, file_number);
+
+ if (blob_file_number != kInvalidBlobFileNumber) {
+ MutableBlobFileMetaData* const mutable_meta =
+ GetOrCreateMutableBlobFileMetaData(blob_file_number);
+ if (mutable_meta) {
+ mutable_meta->UnlinkSst(file_number);
+ }
+ }
+
+ auto& level_state = levels_[level];
+
+ auto& add_files = level_state.added_files;
+ auto add_it = add_files.find(file_number);
+ if (add_it != add_files.end()) {
+ UnrefFile(add_it->second);
+ add_files.erase(add_it);
+ }
+
+ auto& del_files = level_state.deleted_files;
+ assert(del_files.find(file_number) == del_files.end());
+ del_files.emplace(file_number);
+
+ table_file_levels_[file_number] =
+ VersionStorageInfo::FileLocation::Invalid().GetLevel();
+
+ return Status::OK();
+ }
+
+ Status ApplyFileAddition(int level, const FileMetaData& meta) {
+ assert(level != VersionStorageInfo::FileLocation::Invalid().GetLevel());
+
+ const uint64_t file_number = meta.fd.GetNumber();
+
+ const int current_level = GetCurrentLevelForTableFile(file_number);
+
+ if (current_level !=
+ VersionStorageInfo::FileLocation::Invalid().GetLevel()) {
+ if (level >= num_levels_) {
+ has_invalid_levels_ = true;
+ }
+
+ std::ostringstream oss;
+ oss << "Cannot add table file #" << file_number << " to level " << level
+ << " since it is already in the LSM tree on level " << current_level;
+ return Status::Corruption("VersionBuilder", oss.str());
+ }
+
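+    // Additions to levels beyond num_levels_ are only tracked in
+    // invalid_level_sizes_ and table_file_levels_; the FileMetaData itself is
+    // not copied.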
+ if (level >= num_levels_) {
+ ++invalid_level_sizes_[level];
+ table_file_levels_[file_number] = level;
+
+ return Status::OK();
+ }
+
+ auto& level_state = levels_[level];
+
+ auto& del_files = level_state.deleted_files;
+ auto del_it = del_files.find(file_number);
+ if (del_it != del_files.end()) {
+ del_files.erase(del_it);
+ }
+
+ FileMetaData* const f = new FileMetaData(meta);
+ f->refs = 1;
+
+ if (file_metadata_cache_res_mgr_) {
+ Status s = file_metadata_cache_res_mgr_->UpdateCacheReservation(
+ f->ApproximateMemoryUsage(), true /* increase */);
+ if (!s.ok()) {
+ delete f;
+ s = Status::MemoryLimit(
+ "Can't allocate " +
+ kCacheEntryRoleToCamelString[static_cast<std::uint32_t>(
+ CacheEntryRole::kFileMetadata)] +
+ " due to exceeding the memory limit "
+ "based on "
+ "cache capacity");
+ return s;
+ }
+ }
+
+ auto& add_files = level_state.added_files;
+ assert(add_files.find(file_number) == add_files.end());
+ add_files.emplace(file_number, f);
+
+ const uint64_t blob_file_number = f->oldest_blob_file_number;
+
+ if (blob_file_number != kInvalidBlobFileNumber) {
+ MutableBlobFileMetaData* const mutable_meta =
+ GetOrCreateMutableBlobFileMetaData(blob_file_number);
+ if (mutable_meta) {
+ mutable_meta->LinkSst(file_number);
+ }
+ }
+
+ table_file_levels_[file_number] = level;
+
+ return Status::OK();
+ }
+
+ Status ApplyCompactCursors(int level,
+ const InternalKey& smallest_uncompacted_key) {
+ if (level < 0) {
+ std::ostringstream oss;
+ oss << "Cannot add compact cursor (" << level << ","
+ << smallest_uncompacted_key.Encode().ToString()
+ << " due to invalid level (level = " << level << ")";
+ return Status::Corruption("VersionBuilder", oss.str());
+ }
+ if (level < num_levels_) {
+      // Omit levels (>= num_levels_) when reopening with a smaller num_levels_
+ updated_compact_cursors_[level] = smallest_uncompacted_key;
+ }
+ return Status::OK();
+ }
+
+ // Apply all of the edits in *edit to the current state.
+ Status Apply(const VersionEdit* edit) {
+ {
+ const Status s = CheckConsistency(base_vstorage_);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ // Note: we process the blob file related changes first because the
+ // table file addition/deletion logic depends on the blob files
+ // already being there.
+
+ // Add new blob files
+ for (const auto& blob_file_addition : edit->GetBlobFileAdditions()) {
+ const Status s = ApplyBlobFileAddition(blob_file_addition);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ // Increase the amount of garbage for blob files affected by GC
+ for (const auto& blob_file_garbage : edit->GetBlobFileGarbages()) {
+ const Status s = ApplyBlobFileGarbage(blob_file_garbage);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ // Delete table files
+ for (const auto& deleted_file : edit->GetDeletedFiles()) {
+ const int level = deleted_file.first;
+ const uint64_t file_number = deleted_file.second;
+
+ const Status s = ApplyFileDeletion(level, file_number);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ // Add new table files
+ for (const auto& new_file : edit->GetNewFiles()) {
+ const int level = new_file.first;
+ const FileMetaData& meta = new_file.second;
+
+ const Status s = ApplyFileAddition(level, meta);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+    // Populate the compact cursors for round-robin compaction; a cursor left
+    // empty indicates that it is invalid
+ for (const auto& cursor : edit->GetCompactCursors()) {
+ const int level = cursor.first;
+ const InternalKey smallest_uncompacted_key = cursor.second;
+ const Status s = ApplyCompactCursors(level, smallest_uncompacted_key);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+ return Status::OK();
+ }
+
+ // Helper function template for merging the blob file metadata from the base
+ // version with the mutable metadata representing the state after applying the
+ // edits. The function objects process_base and process_mutable are
+ // respectively called to handle a base version object when there is no
+ // matching mutable object, and a mutable object when there is no matching
+ // base version object. process_both is called to perform the merge when a
+ // given blob file appears both in the base version and the mutable list. The
+ // helper stops processing objects if a function object returns false. Blob
+ // files with a file number below first_blob_file are not processed.
+ template <typename ProcessBase, typename ProcessMutable, typename ProcessBoth>
+ void MergeBlobFileMetas(uint64_t first_blob_file, ProcessBase process_base,
+ ProcessMutable process_mutable,
+ ProcessBoth process_both) const {
+ assert(base_vstorage_);
+
+ auto base_it = base_vstorage_->GetBlobFileMetaDataLB(first_blob_file);
+ const auto base_it_end = base_vstorage_->GetBlobFiles().end();
+
+ auto mutable_it = mutable_blob_file_metas_.lower_bound(first_blob_file);
+ const auto mutable_it_end = mutable_blob_file_metas_.end();
+
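+    // Two-way merge over the base version's blob files and the mutable
+    // overrides; both sequences are ordered by blob file number.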
+ while (base_it != base_it_end && mutable_it != mutable_it_end) {
+ const auto& base_meta = *base_it;
+ assert(base_meta);
+
+ const uint64_t base_blob_file_number = base_meta->GetBlobFileNumber();
+ const uint64_t mutable_blob_file_number = mutable_it->first;
+
+ if (base_blob_file_number < mutable_blob_file_number) {
+ if (!process_base(base_meta)) {
+ return;
+ }
+
+ ++base_it;
+ } else if (mutable_blob_file_number < base_blob_file_number) {
+ const auto& mutable_meta = mutable_it->second;
+
+ if (!process_mutable(mutable_meta)) {
+ return;
+ }
+
+ ++mutable_it;
+ } else {
+ assert(base_blob_file_number == mutable_blob_file_number);
+
+ const auto& mutable_meta = mutable_it->second;
+
+ if (!process_both(base_meta, mutable_meta)) {
+ return;
+ }
+
+ ++base_it;
+ ++mutable_it;
+ }
+ }
+
+ while (base_it != base_it_end) {
+ const auto& base_meta = *base_it;
+
+ if (!process_base(base_meta)) {
+ return;
+ }
+
+ ++base_it;
+ }
+
+ while (mutable_it != mutable_it_end) {
+ const auto& mutable_meta = mutable_it->second;
+
+ if (!process_mutable(mutable_meta)) {
+ return;
+ }
+
+ ++mutable_it;
+ }
+ }
+
+ // Helper function template for finding the first blob file that has linked
+ // SSTs.
+ template <typename Meta>
+ static bool CheckLinkedSsts(const Meta& meta,
+ uint64_t* min_oldest_blob_file_num) {
+ assert(min_oldest_blob_file_num);
+
+ if (!meta.GetLinkedSsts().empty()) {
+ assert(*min_oldest_blob_file_num == kInvalidBlobFileNumber);
+
+ *min_oldest_blob_file_num = meta.GetBlobFileNumber();
+
+ return false;
+ }
+
+ return true;
+ }
+
+ // Find the oldest blob file that has linked SSTs.
+ uint64_t GetMinOldestBlobFileNumber() const {
+ uint64_t min_oldest_blob_file_num = kInvalidBlobFileNumber;
+
+ auto process_base =
+ [&min_oldest_blob_file_num](
+ const std::shared_ptr<BlobFileMetaData>& base_meta) {
+ assert(base_meta);
+
+ return CheckLinkedSsts(*base_meta, &min_oldest_blob_file_num);
+ };
+
+ auto process_mutable = [&min_oldest_blob_file_num](
+ const MutableBlobFileMetaData& mutable_meta) {
+ return CheckLinkedSsts(mutable_meta, &min_oldest_blob_file_num);
+ };
+
+ auto process_both = [&min_oldest_blob_file_num](
+ const std::shared_ptr<BlobFileMetaData>& base_meta,
+ const MutableBlobFileMetaData& mutable_meta) {
+#ifndef NDEBUG
+ assert(base_meta);
+ assert(base_meta->GetSharedMeta() == mutable_meta.GetSharedMeta());
+#else
+ (void)base_meta;
+#endif
+
+ // Look at mutable_meta since it supersedes *base_meta
+ return CheckLinkedSsts(mutable_meta, &min_oldest_blob_file_num);
+ };
+
+ MergeBlobFileMetas(kInvalidBlobFileNumber, process_base, process_mutable,
+ process_both);
+
+ return min_oldest_blob_file_num;
+ }
+
+ static std::shared_ptr<BlobFileMetaData> CreateBlobFileMetaData(
+ const MutableBlobFileMetaData& mutable_meta) {
+ return BlobFileMetaData::Create(
+ mutable_meta.GetSharedMeta(), mutable_meta.GetLinkedSsts(),
+ mutable_meta.GetGarbageBlobCount(), mutable_meta.GetGarbageBlobBytes());
+ }
+
+ // Add the blob file specified by meta to *vstorage if it is determined to
+ // contain valid data (blobs).
+ template <typename Meta>
+ static void AddBlobFileIfNeeded(VersionStorageInfo* vstorage, Meta&& meta) {
+ assert(vstorage);
+ assert(meta);
+
+ if (meta->GetLinkedSsts().empty() &&
+ meta->GetGarbageBlobCount() >= meta->GetTotalBlobCount()) {
+ return;
+ }
+
+ vstorage->AddBlobFile(std::forward<Meta>(meta));
+ }
+
+ // Merge the blob file metadata from the base version with the changes (edits)
+ // applied, and save the result into *vstorage.
+ void SaveBlobFilesTo(VersionStorageInfo* vstorage) const {
+ assert(vstorage);
+
+ assert(base_vstorage_);
+ vstorage->ReserveBlob(base_vstorage_->GetBlobFiles().size() +
+ mutable_blob_file_metas_.size());
+
+ const uint64_t oldest_blob_file_with_linked_ssts =
+ GetMinOldestBlobFileNumber();
+
+ auto process_base =
+ [vstorage](const std::shared_ptr<BlobFileMetaData>& base_meta) {
+ assert(base_meta);
+
+ AddBlobFileIfNeeded(vstorage, base_meta);
+
+ return true;
+ };
+
+ auto process_mutable =
+ [vstorage](const MutableBlobFileMetaData& mutable_meta) {
+ AddBlobFileIfNeeded(vstorage, CreateBlobFileMetaData(mutable_meta));
+
+ return true;
+ };
+
+ auto process_both = [vstorage](
+ const std::shared_ptr<BlobFileMetaData>& base_meta,
+ const MutableBlobFileMetaData& mutable_meta) {
+ assert(base_meta);
+ assert(base_meta->GetSharedMeta() == mutable_meta.GetSharedMeta());
+
+ if (!mutable_meta.HasDelta()) {
+ assert(base_meta->GetGarbageBlobCount() ==
+ mutable_meta.GetGarbageBlobCount());
+ assert(base_meta->GetGarbageBlobBytes() ==
+ mutable_meta.GetGarbageBlobBytes());
+ assert(base_meta->GetLinkedSsts() == mutable_meta.GetLinkedSsts());
+
+ AddBlobFileIfNeeded(vstorage, base_meta);
+
+ return true;
+ }
+
+ AddBlobFileIfNeeded(vstorage, CreateBlobFileMetaData(mutable_meta));
+
+ return true;
+ };
+
+ MergeBlobFileMetas(oldest_blob_file_with_linked_ssts, process_base,
+ process_mutable, process_both);
+ }
+
+ void MaybeAddFile(VersionStorageInfo* vstorage, int level,
+ FileMetaData* f) const {
+ const uint64_t file_number = f->fd.GetNumber();
+
+ const auto& level_state = levels_[level];
+
+ const auto& del_files = level_state.deleted_files;
+ const auto del_it = del_files.find(file_number);
+
+ if (del_it != del_files.end()) {
+ // f is to-be-deleted table file
+ vstorage->RemoveCurrentStats(f);
+ } else {
+ const auto& add_files = level_state.added_files;
+ const auto add_it = add_files.find(file_number);
+
+ // Note: if the file appears both in the base version and in the added
+ // list, the added FileMetaData supersedes the one in the base version.
+ if (add_it != add_files.end() && add_it->second != f) {
+ vstorage->RemoveCurrentStats(f);
+ } else {
+ vstorage->AddFile(level, f);
+ }
+ }
+ }
+
+ template <typename Cmp>
+ void SaveSSTFilesTo(VersionStorageInfo* vstorage, int level, Cmp cmp) const {
+ // Merge the set of added files with the set of pre-existing files.
+ // Drop any deleted files. Store the result in *vstorage.
+ const auto& base_files = base_vstorage_->LevelFiles(level);
+ const auto& unordered_added_files = levels_[level].added_files;
+ vstorage->Reserve(level, base_files.size() + unordered_added_files.size());
+
+ // Sort added files for the level.
+ std::vector<FileMetaData*> added_files;
+ added_files.reserve(unordered_added_files.size());
+ for (const auto& pair : unordered_added_files) {
+ added_files.push_back(pair.second);
+ }
+ std::sort(added_files.begin(), added_files.end(), cmp);
+
+ auto base_iter = base_files.begin();
+ auto base_end = base_files.end();
+ auto added_iter = added_files.begin();
+ auto added_end = added_files.end();
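+    // Two-way merge of the sorted added files with the sorted base files;
+    // MaybeAddFile() drops files that have been deleted or superseded.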
+ while (added_iter != added_end || base_iter != base_end) {
+ if (base_iter == base_end ||
+ (added_iter != added_end && cmp(*added_iter, *base_iter))) {
+ MaybeAddFile(vstorage, level, *added_iter++);
+ } else {
+ MaybeAddFile(vstorage, level, *base_iter++);
+ }
+ }
+ }
+
+ void SaveSSTFilesTo(VersionStorageInfo* vstorage) const {
+ assert(vstorage);
+
+ if (!num_levels_) {
+ return;
+ }
+
+ SaveSSTFilesTo(vstorage, /* level */ 0, level_zero_cmp_);
+
+ for (int level = 1; level < num_levels_; ++level) {
+ SaveSSTFilesTo(vstorage, level, level_nonzero_cmp_);
+ }
+ }
+
+ void SaveCompactCursorsTo(VersionStorageInfo* vstorage) const {
+ for (auto iter = updated_compact_cursors_.begin();
+ iter != updated_compact_cursors_.end(); iter++) {
+ vstorage->AddCursorForOneLevel(iter->first, iter->second);
+ }
+ }
+
+ // Save the current state in *vstorage.
+ Status SaveTo(VersionStorageInfo* vstorage) const {
+ Status s;
+
+#ifndef NDEBUG
+ // The same check is done within Apply() so we skip it in release mode.
+ s = CheckConsistency(base_vstorage_);
+ if (!s.ok()) {
+ return s;
+ }
+#endif // NDEBUG
+
+ s = CheckConsistency(vstorage);
+ if (!s.ok()) {
+ return s;
+ }
+
+ SaveSSTFilesTo(vstorage);
+
+ SaveBlobFilesTo(vstorage);
+
+ SaveCompactCursorsTo(vstorage);
+
+ s = CheckConsistency(vstorage);
+ return s;
+ }
+
+ Status LoadTableHandlers(
+ InternalStats* internal_stats, int max_threads,
+ bool prefetch_index_and_filter_in_cache, bool is_initial_load,
+ const std::shared_ptr<const SliceTransform>& prefix_extractor,
+ size_t max_file_size_for_l0_meta_pin) {
+ assert(table_cache_ != nullptr);
+
+ size_t table_cache_capacity = table_cache_->get_cache()->GetCapacity();
+ bool always_load = (table_cache_capacity == TableCache::kInfiniteCapacity);
+ size_t max_load = std::numeric_limits<size_t>::max();
+
+ if (!always_load) {
+      // If this is the initial load and we are not configured to always load
+      // all the files, we only load up to kInitialLoadLimit files to limit
+      // the time spent reopening the DB.
+ const size_t kInitialLoadLimit = 16;
+ size_t load_limit;
+      // If the table cache is less than 1/4 full, we pin the table handle in
+      // the file metadata to avoid the cache read costs when reading the
+      // file. The downside of pinning those handles is that LRU won't be
+      // followed for those files. This doesn't matter much because if the
+      // number of files in the DB exceeds the table cache capacity,
+      // eventually no table reader will be pinned and LRU will be followed.
+ if (is_initial_load) {
+ load_limit = std::min(kInitialLoadLimit, table_cache_capacity / 4);
+ } else {
+ load_limit = table_cache_capacity / 4;
+ }
+
+ size_t table_cache_usage = table_cache_->get_cache()->GetUsage();
+ if (table_cache_usage >= load_limit) {
+ // TODO (yanqin) find a suitable status code.
+ return Status::OK();
+ } else {
+ max_load = load_limit - table_cache_usage;
+ }
+ }
+
+ // <file metadata, level>
+ std::vector<std::pair<FileMetaData*, int>> files_meta;
+ std::vector<Status> statuses;
+ for (int level = 0; level < num_levels_; level++) {
+ for (auto& file_meta_pair : levels_[level].added_files) {
+ auto* file_meta = file_meta_pair.second;
+ // If the file has been opened before, just skip it.
+ if (!file_meta->table_reader_handle) {
+ files_meta.emplace_back(file_meta, level);
+ statuses.emplace_back(Status::OK());
+ }
+ if (files_meta.size() >= max_load) {
+ break;
+ }
+ }
+ if (files_meta.size() >= max_load) {
+ break;
+ }
+ }
+
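+    // Open the collected table files in parallel: each worker thread claims
+    // the next index atomically and loads the corresponding table reader.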
+ std::atomic<size_t> next_file_meta_idx(0);
+ std::function<void()> load_handlers_func([&]() {
+ while (true) {
+ size_t file_idx = next_file_meta_idx.fetch_add(1);
+ if (file_idx >= files_meta.size()) {
+ break;
+ }
+
+ auto* file_meta = files_meta[file_idx].first;
+ int level = files_meta[file_idx].second;
+ statuses[file_idx] = table_cache_->FindTable(
+ ReadOptions(), file_options_,
+ *(base_vstorage_->InternalComparator()), *file_meta,
+ &file_meta->table_reader_handle, prefix_extractor, false /*no_io */,
+ true /* record_read_stats */,
+ internal_stats->GetFileReadHist(level), false, level,
+ prefetch_index_and_filter_in_cache, max_file_size_for_l0_meta_pin,
+ file_meta->temperature);
+ if (file_meta->table_reader_handle != nullptr) {
+ // Load table_reader
+ file_meta->fd.table_reader = table_cache_->GetTableReaderFromHandle(
+ file_meta->table_reader_handle);
+ }
+ }
+ });
+
+ std::vector<port::Thread> threads;
+ for (int i = 1; i < max_threads; i++) {
+ threads.emplace_back(load_handlers_func);
+ }
+ load_handlers_func();
+ for (auto& t : threads) {
+ t.join();
+ }
+ Status ret;
+ for (const auto& s : statuses) {
+ if (!s.ok()) {
+ if (ret.ok()) {
+ ret = s;
+ }
+ }
+ }
+ return ret;
+ }
+};
+
+VersionBuilder::VersionBuilder(
+ const FileOptions& file_options, const ImmutableCFOptions* ioptions,
+ TableCache* table_cache, VersionStorageInfo* base_vstorage,
+ VersionSet* version_set,
+ std::shared_ptr<CacheReservationManager> file_metadata_cache_res_mgr)
+ : rep_(new Rep(file_options, ioptions, table_cache, base_vstorage,
+ version_set, file_metadata_cache_res_mgr)) {}
+
+VersionBuilder::~VersionBuilder() = default;
+
+bool VersionBuilder::CheckConsistencyForNumLevels() {
+ return rep_->CheckConsistencyForNumLevels();
+}
+
+Status VersionBuilder::Apply(const VersionEdit* edit) {
+ return rep_->Apply(edit);
+}
+
+Status VersionBuilder::SaveTo(VersionStorageInfo* vstorage) const {
+ return rep_->SaveTo(vstorage);
+}
+
+Status VersionBuilder::LoadTableHandlers(
+ InternalStats* internal_stats, int max_threads,
+ bool prefetch_index_and_filter_in_cache, bool is_initial_load,
+ const std::shared_ptr<const SliceTransform>& prefix_extractor,
+ size_t max_file_size_for_l0_meta_pin) {
+ return rep_->LoadTableHandlers(
+ internal_stats, max_threads, prefetch_index_and_filter_in_cache,
+ is_initial_load, prefix_extractor, max_file_size_for_l0_meta_pin);
+}
+
+uint64_t VersionBuilder::GetMinOldestBlobFileNumber() const {
+ return rep_->GetMinOldestBlobFileNumber();
+}
+
+BaseReferencedVersionBuilder::BaseReferencedVersionBuilder(
+ ColumnFamilyData* cfd)
+ : version_builder_(new VersionBuilder(
+ cfd->current()->version_set()->file_options(), cfd->ioptions(),
+ cfd->table_cache(), cfd->current()->storage_info(),
+ cfd->current()->version_set(),
+ cfd->GetFileMetadataCacheReservationManager())),
+ version_(cfd->current()) {
+ version_->Ref();
+}
+
+BaseReferencedVersionBuilder::BaseReferencedVersionBuilder(
+ ColumnFamilyData* cfd, Version* v)
+ : version_builder_(new VersionBuilder(
+ cfd->current()->version_set()->file_options(), cfd->ioptions(),
+ cfd->table_cache(), v->storage_info(), v->version_set(),
+ cfd->GetFileMetadataCacheReservationManager())),
+ version_(v) {
+ assert(version_ != cfd->current());
+}
+
+BaseReferencedVersionBuilder::~BaseReferencedVersionBuilder() {
+ version_->Unref();
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/version_builder.h b/src/rocksdb/db/version_builder.h
new file mode 100644
index 000000000..1c022832a
--- /dev/null
+++ b/src/rocksdb/db/version_builder.h
@@ -0,0 +1,72 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+#pragma once
+
+#include <memory>
+
+#include "rocksdb/file_system.h"
+#include "rocksdb/slice_transform.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+struct ImmutableCFOptions;
+class TableCache;
+class VersionStorageInfo;
+class VersionEdit;
+struct FileMetaData;
+class InternalStats;
+class Version;
+class VersionSet;
+class ColumnFamilyData;
+class CacheReservationManager;
+
+// A helper class so we can efficiently apply a whole sequence
+// of edits to a particular state without creating intermediate
+// Versions that contain full copies of the intermediate state.
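+//
+// Typical usage: construct the builder on top of a base VersionStorageInfo,
+// call Apply() for each VersionEdit, then call SaveTo() to materialize the
+// resulting state into a new VersionStorageInfo (see version_builder_test.cc
+// for examples).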
+class VersionBuilder {
+ public:
+ VersionBuilder(const FileOptions& file_options,
+ const ImmutableCFOptions* ioptions, TableCache* table_cache,
+ VersionStorageInfo* base_vstorage, VersionSet* version_set,
+ std::shared_ptr<CacheReservationManager>
+ file_metadata_cache_res_mgr = nullptr);
+ ~VersionBuilder();
+
+ bool CheckConsistencyForNumLevels();
+ Status Apply(const VersionEdit* edit);
+ Status SaveTo(VersionStorageInfo* vstorage) const;
+ Status LoadTableHandlers(
+ InternalStats* internal_stats, int max_threads,
+ bool prefetch_index_and_filter_in_cache, bool is_initial_load,
+ const std::shared_ptr<const SliceTransform>& prefix_extractor,
+ size_t max_file_size_for_l0_meta_pin);
+ uint64_t GetMinOldestBlobFileNumber() const;
+
+ private:
+ class Rep;
+ std::unique_ptr<Rep> rep_;
+};
+
+// A wrapper around VersionBuilder that references the current version in its
+// constructor and unreferences it in its destructor.
+// Both the constructor and the destructor need to be called while holding the
+// DB mutex.
+class BaseReferencedVersionBuilder {
+ public:
+ explicit BaseReferencedVersionBuilder(ColumnFamilyData* cfd);
+ BaseReferencedVersionBuilder(ColumnFamilyData* cfd, Version* v);
+ ~BaseReferencedVersionBuilder();
+ VersionBuilder* version_builder() const { return version_builder_.get(); }
+
+ private:
+ std::unique_ptr<VersionBuilder> version_builder_;
+ Version* version_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/version_builder_test.cc b/src/rocksdb/db/version_builder_test.cc
new file mode 100644
index 000000000..ee5c3f2e3
--- /dev/null
+++ b/src/rocksdb/db/version_builder_test.cc
@@ -0,0 +1,1695 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include <cstring>
+#include <iomanip>
+#include <memory>
+#include <sstream>
+#include <string>
+
+#include "db/version_edit.h"
+#include "db/version_set.h"
+#include "rocksdb/advanced_options.h"
+#include "table/unique_id_impl.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class VersionBuilderTest : public testing::Test {
+ public:
+ const Comparator* ucmp_;
+ InternalKeyComparator icmp_;
+ Options options_;
+ ImmutableOptions ioptions_;
+ MutableCFOptions mutable_cf_options_;
+ VersionStorageInfo vstorage_;
+ uint32_t file_num_;
+ CompactionOptionsFIFO fifo_options_;
+ std::vector<uint64_t> size_being_compacted_;
+
+ VersionBuilderTest()
+ : ucmp_(BytewiseComparator()),
+ icmp_(ucmp_),
+ ioptions_(options_),
+ mutable_cf_options_(options_),
+ vstorage_(&icmp_, ucmp_, options_.num_levels, kCompactionStyleLevel,
+ nullptr, false),
+ file_num_(1) {
+ mutable_cf_options_.RefreshDerivedOptions(ioptions_);
+ size_being_compacted_.resize(options_.num_levels);
+ }
+
+ ~VersionBuilderTest() override {
+ for (int i = 0; i < vstorage_.num_levels(); i++) {
+ for (auto* f : vstorage_.LevelFiles(i)) {
+ if (--f->refs == 0) {
+ delete f;
+ }
+ }
+ }
+ }
+
+ InternalKey GetInternalKey(const char* ukey,
+ SequenceNumber smallest_seq = 100) {
+ return InternalKey(ukey, smallest_seq, kTypeValue);
+ }
+
+ void Add(int level, uint64_t file_number, const char* smallest,
+ const char* largest, uint64_t file_size = 0, uint32_t path_id = 0,
+ SequenceNumber smallest_seq = 100, SequenceNumber largest_seq = 100,
+ uint64_t num_entries = 0, uint64_t num_deletions = 0,
+ bool sampled = false, SequenceNumber smallest_seqno = 0,
+ SequenceNumber largest_seqno = 0,
+ uint64_t oldest_blob_file_number = kInvalidBlobFileNumber) {
+ assert(level < vstorage_.num_levels());
+ FileMetaData* f = new FileMetaData(
+ file_number, path_id, file_size, GetInternalKey(smallest, smallest_seq),
+ GetInternalKey(largest, largest_seq), smallest_seqno, largest_seqno,
+ /* marked_for_compact */ false, Temperature::kUnknown,
+ oldest_blob_file_number, kUnknownOldestAncesterTime,
+ kUnknownFileCreationTime, kUnknownFileChecksum,
+ kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+ f->compensated_file_size = file_size;
+ f->num_entries = num_entries;
+ f->num_deletions = num_deletions;
+ vstorage_.AddFile(level, f);
+ if (sampled) {
+ f->init_stats_from_file = true;
+ vstorage_.UpdateAccumulatedStats(f);
+ }
+ }
+
+ void AddBlob(uint64_t blob_file_number, uint64_t total_blob_count,
+ uint64_t total_blob_bytes, std::string checksum_method,
+ std::string checksum_value,
+ BlobFileMetaData::LinkedSsts linked_ssts,
+ uint64_t garbage_blob_count, uint64_t garbage_blob_bytes) {
+ auto shared_meta = SharedBlobFileMetaData::Create(
+ blob_file_number, total_blob_count, total_blob_bytes,
+ std::move(checksum_method), std::move(checksum_value));
+ auto meta =
+ BlobFileMetaData::Create(std::move(shared_meta), std::move(linked_ssts),
+ garbage_blob_count, garbage_blob_bytes);
+
+ vstorage_.AddBlobFile(std::move(meta));
+ }
+
+ void AddDummyFile(uint64_t table_file_number, uint64_t blob_file_number) {
+ constexpr int level = 0;
+ constexpr char smallest[] = "bar";
+ constexpr char largest[] = "foo";
+ constexpr uint64_t file_size = 100;
+ constexpr uint32_t path_id = 0;
+ constexpr SequenceNumber smallest_seq = 0;
+ constexpr SequenceNumber largest_seq = 0;
+ constexpr uint64_t num_entries = 0;
+ constexpr uint64_t num_deletions = 0;
+ constexpr bool sampled = false;
+
+ Add(level, table_file_number, smallest, largest, file_size, path_id,
+ smallest_seq, largest_seq, num_entries, num_deletions, sampled,
+ smallest_seq, largest_seq, blob_file_number);
+ }
+
+ void AddDummyFileToEdit(VersionEdit* edit, uint64_t table_file_number,
+ uint64_t blob_file_number) {
+ assert(edit);
+
+ constexpr int level = 0;
+ constexpr uint32_t path_id = 0;
+ constexpr uint64_t file_size = 100;
+ constexpr char smallest[] = "bar";
+ constexpr char largest[] = "foo";
+ constexpr SequenceNumber smallest_seqno = 100;
+ constexpr SequenceNumber largest_seqno = 300;
+ constexpr bool marked_for_compaction = false;
+
+ edit->AddFile(
+ level, table_file_number, path_id, file_size, GetInternalKey(smallest),
+ GetInternalKey(largest), smallest_seqno, largest_seqno,
+ marked_for_compaction, Temperature::kUnknown, blob_file_number,
+ kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+ kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+ }
+
+ void UpdateVersionStorageInfo(VersionStorageInfo* vstorage) {
+ assert(vstorage);
+
+ vstorage->PrepareForVersionAppend(ioptions_, mutable_cf_options_);
+ vstorage->SetFinalized();
+ }
+
+ void UpdateVersionStorageInfo() { UpdateVersionStorageInfo(&vstorage_); }
+};
+
+void UnrefFilesInVersion(VersionStorageInfo* new_vstorage) {
+ for (int i = 0; i < new_vstorage->num_levels(); i++) {
+ for (auto* f : new_vstorage->LevelFiles(i)) {
+ if (--f->refs == 0) {
+ delete f;
+ }
+ }
+ }
+}
+
+TEST_F(VersionBuilderTest, ApplyAndSaveTo) {
+ Add(0, 1U, "150", "200", 100U);
+
+ Add(1, 66U, "150", "200", 100U);
+ Add(1, 88U, "201", "300", 100U);
+
+ Add(2, 6U, "150", "179", 100U);
+ Add(2, 7U, "180", "220", 100U);
+ Add(2, 8U, "221", "300", 100U);
+
+ Add(3, 26U, "150", "170", 100U);
+ Add(3, 27U, "171", "179", 100U);
+ Add(3, 28U, "191", "220", 100U);
+ Add(3, 29U, "221", "300", 100U);
+
+ UpdateVersionStorageInfo();
+
+ VersionEdit version_edit;
+ version_edit.AddFile(
+ 2, 666, 0, 100U, GetInternalKey("301"), GetInternalKey("350"), 200, 200,
+ false, Temperature::kUnknown, kInvalidBlobFileNumber,
+ kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+ kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+ version_edit.DeleteFile(3, 27U);
+
+ EnvOptions env_options;
+ constexpr TableCache* table_cache = nullptr;
+ constexpr VersionSet* version_set = nullptr;
+
+ VersionBuilder version_builder(env_options, &ioptions_, table_cache,
+ &vstorage_, version_set);
+
+ VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels,
+ kCompactionStyleLevel, nullptr, false);
+ ASSERT_OK(version_builder.Apply(&version_edit));
+ ASSERT_OK(version_builder.SaveTo(&new_vstorage));
+
+ UpdateVersionStorageInfo(&new_vstorage);
+
+ ASSERT_EQ(400U, new_vstorage.NumLevelBytes(2));
+ ASSERT_EQ(300U, new_vstorage.NumLevelBytes(3));
+
+ UnrefFilesInVersion(&new_vstorage);
+}
+
+TEST_F(VersionBuilderTest, ApplyAndSaveToDynamic) {
+ ioptions_.level_compaction_dynamic_level_bytes = true;
+
+ Add(0, 1U, "150", "200", 100U, 0, 200U, 200U, 0, 0, false, 200U, 200U);
+ Add(0, 88U, "201", "300", 100U, 0, 100U, 100U, 0, 0, false, 100U, 100U);
+
+ Add(4, 6U, "150", "179", 100U);
+ Add(4, 7U, "180", "220", 100U);
+ Add(4, 8U, "221", "300", 100U);
+
+ Add(5, 26U, "150", "170", 100U);
+ Add(5, 27U, "171", "179", 100U);
+
+ UpdateVersionStorageInfo();
+
+ VersionEdit version_edit;
+ version_edit.AddFile(
+ 3, 666, 0, 100U, GetInternalKey("301"), GetInternalKey("350"), 200, 200,
+ false, Temperature::kUnknown, kInvalidBlobFileNumber,
+ kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+ kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+ version_edit.DeleteFile(0, 1U);
+ version_edit.DeleteFile(0, 88U);
+
+ EnvOptions env_options;
+ constexpr TableCache* table_cache = nullptr;
+ constexpr VersionSet* version_set = nullptr;
+
+ VersionBuilder version_builder(env_options, &ioptions_, table_cache,
+ &vstorage_, version_set);
+
+ VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels,
+ kCompactionStyleLevel, nullptr, false);
+ ASSERT_OK(version_builder.Apply(&version_edit));
+ ASSERT_OK(version_builder.SaveTo(&new_vstorage));
+
+ UpdateVersionStorageInfo(&new_vstorage);
+
+ ASSERT_EQ(0U, new_vstorage.NumLevelBytes(0));
+ ASSERT_EQ(100U, new_vstorage.NumLevelBytes(3));
+ ASSERT_EQ(300U, new_vstorage.NumLevelBytes(4));
+ ASSERT_EQ(200U, new_vstorage.NumLevelBytes(5));
+
+ UnrefFilesInVersion(&new_vstorage);
+}
+
+TEST_F(VersionBuilderTest, ApplyAndSaveToDynamic2) {
+ ioptions_.level_compaction_dynamic_level_bytes = true;
+
+ Add(0, 1U, "150", "200", 100U, 0, 200U, 200U, 0, 0, false, 200U, 200U);
+ Add(0, 88U, "201", "300", 100U, 0, 100U, 100U, 0, 0, false, 100U, 100U);
+
+ Add(4, 6U, "150", "179", 100U);
+ Add(4, 7U, "180", "220", 100U);
+ Add(4, 8U, "221", "300", 100U);
+
+ Add(5, 26U, "150", "170", 100U);
+ Add(5, 27U, "171", "179", 100U);
+
+ UpdateVersionStorageInfo();
+
+ VersionEdit version_edit;
+ version_edit.AddFile(
+ 4, 666, 0, 100U, GetInternalKey("301"), GetInternalKey("350"), 200, 200,
+ false, Temperature::kUnknown, kInvalidBlobFileNumber,
+ kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+ kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+ version_edit.DeleteFile(0, 1U);
+ version_edit.DeleteFile(0, 88U);
+ version_edit.DeleteFile(4, 6U);
+ version_edit.DeleteFile(4, 7U);
+ version_edit.DeleteFile(4, 8U);
+
+ EnvOptions env_options;
+ constexpr TableCache* table_cache = nullptr;
+ constexpr VersionSet* version_set = nullptr;
+
+ VersionBuilder version_builder(env_options, &ioptions_, table_cache,
+ &vstorage_, version_set);
+
+ VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels,
+ kCompactionStyleLevel, nullptr, false);
+ ASSERT_OK(version_builder.Apply(&version_edit));
+ ASSERT_OK(version_builder.SaveTo(&new_vstorage));
+
+ UpdateVersionStorageInfo(&new_vstorage);
+
+ ASSERT_EQ(0U, new_vstorage.NumLevelBytes(0));
+ ASSERT_EQ(100U, new_vstorage.NumLevelBytes(4));
+ ASSERT_EQ(200U, new_vstorage.NumLevelBytes(5));
+
+ UnrefFilesInVersion(&new_vstorage);
+}
+
+TEST_F(VersionBuilderTest, ApplyMultipleAndSaveTo) {
+ UpdateVersionStorageInfo();
+
+ VersionEdit version_edit;
+ version_edit.AddFile(
+ 2, 666, 0, 100U, GetInternalKey("301"), GetInternalKey("350"), 200, 200,
+ false, Temperature::kUnknown, kInvalidBlobFileNumber,
+ kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+ kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+ version_edit.AddFile(
+ 2, 676, 0, 100U, GetInternalKey("401"), GetInternalKey("450"), 200, 200,
+ false, Temperature::kUnknown, kInvalidBlobFileNumber,
+ kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+ kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+ version_edit.AddFile(
+ 2, 636, 0, 100U, GetInternalKey("601"), GetInternalKey("650"), 200, 200,
+ false, Temperature::kUnknown, kInvalidBlobFileNumber,
+ kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+ kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+ version_edit.AddFile(
+ 2, 616, 0, 100U, GetInternalKey("501"), GetInternalKey("550"), 200, 200,
+ false, Temperature::kUnknown, kInvalidBlobFileNumber,
+ kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+ kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+ version_edit.AddFile(
+ 2, 606, 0, 100U, GetInternalKey("701"), GetInternalKey("750"), 200, 200,
+ false, Temperature::kUnknown, kInvalidBlobFileNumber,
+ kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+ kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+
+ EnvOptions env_options;
+ constexpr TableCache* table_cache = nullptr;
+ constexpr VersionSet* version_set = nullptr;
+
+ VersionBuilder version_builder(env_options, &ioptions_, table_cache,
+ &vstorage_, version_set);
+
+ VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels,
+ kCompactionStyleLevel, nullptr, false);
+ ASSERT_OK(version_builder.Apply(&version_edit));
+ ASSERT_OK(version_builder.SaveTo(&new_vstorage));
+
+ UpdateVersionStorageInfo(&new_vstorage);
+
+ ASSERT_EQ(500U, new_vstorage.NumLevelBytes(2));
+
+ UnrefFilesInVersion(&new_vstorage);
+}
+
+TEST_F(VersionBuilderTest, ApplyDeleteAndSaveTo) {
+ UpdateVersionStorageInfo();
+
+ EnvOptions env_options;
+ constexpr TableCache* table_cache = nullptr;
+ constexpr VersionSet* version_set = nullptr;
+
+ VersionBuilder version_builder(env_options, &ioptions_, table_cache,
+ &vstorage_, version_set);
+
+ VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels,
+ kCompactionStyleLevel, nullptr, false);
+
+ VersionEdit version_edit;
+ version_edit.AddFile(
+ 2, 666, 0, 100U, GetInternalKey("301"), GetInternalKey("350"), 200, 200,
+ false, Temperature::kUnknown, kInvalidBlobFileNumber,
+ kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+ kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+ version_edit.AddFile(
+ 2, 676, 0, 100U, GetInternalKey("401"), GetInternalKey("450"), 200, 200,
+ false, Temperature::kUnknown, kInvalidBlobFileNumber,
+ kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+ kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+ version_edit.AddFile(
+ 2, 636, 0, 100U, GetInternalKey("601"), GetInternalKey("650"), 200, 200,
+ false, Temperature::kUnknown, kInvalidBlobFileNumber,
+ kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+ kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+ version_edit.AddFile(
+ 2, 616, 0, 100U, GetInternalKey("501"), GetInternalKey("550"), 200, 200,
+ false, Temperature::kUnknown, kInvalidBlobFileNumber,
+ kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+ kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+ version_edit.AddFile(
+ 2, 606, 0, 100U, GetInternalKey("701"), GetInternalKey("750"), 200, 200,
+ false, Temperature::kUnknown, kInvalidBlobFileNumber,
+ kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+ kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+ ASSERT_OK(version_builder.Apply(&version_edit));
+
+ VersionEdit version_edit2;
+ version_edit.AddFile(
+ 2, 808, 0, 100U, GetInternalKey("901"), GetInternalKey("950"), 200, 200,
+ false, Temperature::kUnknown, kInvalidBlobFileNumber,
+ kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+ kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+ version_edit2.DeleteFile(2, 616);
+ version_edit2.DeleteFile(2, 636);
+ version_edit.AddFile(
+ 2, 806, 0, 100U, GetInternalKey("801"), GetInternalKey("850"), 200, 200,
+ false, Temperature::kUnknown, kInvalidBlobFileNumber,
+ kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+ kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+
+ ASSERT_OK(version_builder.Apply(&version_edit2));
+ ASSERT_OK(version_builder.SaveTo(&new_vstorage));
+
+ UpdateVersionStorageInfo(&new_vstorage);
+
+ ASSERT_EQ(300U, new_vstorage.NumLevelBytes(2));
+
+ UnrefFilesInVersion(&new_vstorage);
+}
+
+TEST_F(VersionBuilderTest, ApplyFileDeletionIncorrectLevel) {
+ constexpr int level = 1;
+ constexpr uint64_t file_number = 2345;
+ constexpr char smallest[] = "bar";
+ constexpr char largest[] = "foo";
+ constexpr uint64_t file_size = 100;
+
+ Add(level, file_number, smallest, largest, file_size);
+
+ UpdateVersionStorageInfo();
+
+ EnvOptions env_options;
+ constexpr TableCache* table_cache = nullptr;
+ constexpr VersionSet* version_set = nullptr;
+
+ VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_,
+ version_set);
+
+ VersionEdit edit;
+
+ constexpr int incorrect_level = 3;
+
+ edit.DeleteFile(incorrect_level, file_number);
+
+ const Status s = builder.Apply(&edit);
+ ASSERT_TRUE(s.IsCorruption());
+ ASSERT_TRUE(std::strstr(s.getState(),
+ "Cannot delete table file #2345 from level 3 since "
+ "it is on level 1"));
+}
+
+TEST_F(VersionBuilderTest, ApplyFileDeletionNotInLSMTree) {
+ UpdateVersionStorageInfo();
+
+ EnvOptions env_options;
+ constexpr TableCache* table_cache = nullptr;
+ constexpr VersionSet* version_set = nullptr;
+
+ VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_,
+ version_set);
+
+ VersionEdit edit;
+
+ constexpr int level = 3;
+ constexpr uint64_t file_number = 1234;
+
+ edit.DeleteFile(level, file_number);
+
+ const Status s = builder.Apply(&edit);
+ ASSERT_TRUE(s.IsCorruption());
+ ASSERT_TRUE(std::strstr(s.getState(),
+ "Cannot delete table file #1234 from level 3 since "
+ "it is not in the LSM tree"));
+}
+
+TEST_F(VersionBuilderTest, ApplyFileDeletionAndAddition) {
+ constexpr int level = 1;
+ constexpr uint64_t file_number = 2345;
+ constexpr char smallest[] = "bar";
+ constexpr char largest[] = "foo";
+ constexpr uint64_t file_size = 10000;
+ constexpr uint32_t path_id = 0;
+ constexpr SequenceNumber smallest_seq = 100;
+ constexpr SequenceNumber largest_seq = 500;
+ constexpr uint64_t num_entries = 0;
+ constexpr uint64_t num_deletions = 0;
+ constexpr bool sampled = false;
+ constexpr SequenceNumber smallest_seqno = 1;
+ constexpr SequenceNumber largest_seqno = 1000;
+
+ Add(level, file_number, smallest, largest, file_size, path_id, smallest_seq,
+ largest_seq, num_entries, num_deletions, sampled, smallest_seqno,
+ largest_seqno);
+
+ UpdateVersionStorageInfo();
+
+ EnvOptions env_options;
+ constexpr TableCache* table_cache = nullptr;
+ constexpr VersionSet* version_set = nullptr;
+
+ VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_,
+ version_set);
+
+ VersionEdit deletion;
+
+ deletion.DeleteFile(level, file_number);
+
+ ASSERT_OK(builder.Apply(&deletion));
+
+ VersionEdit addition;
+
+ constexpr bool marked_for_compaction = false;
+
+ addition.AddFile(level, file_number, path_id, file_size,
+ GetInternalKey(smallest, smallest_seq),
+ GetInternalKey(largest, largest_seq), smallest_seqno,
+ largest_seqno, marked_for_compaction, Temperature::kUnknown,
+ kInvalidBlobFileNumber, kUnknownOldestAncesterTime,
+ kUnknownFileCreationTime, kUnknownFileChecksum,
+ kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+
+ ASSERT_OK(builder.Apply(&addition));
+
+ constexpr bool force_consistency_checks = false;
+ VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels,
+ kCompactionStyleLevel, &vstorage_,
+ force_consistency_checks);
+
+ ASSERT_OK(builder.SaveTo(&new_vstorage));
+
+ UpdateVersionStorageInfo(&new_vstorage);
+
+ ASSERT_EQ(new_vstorage.GetFileLocation(file_number).GetLevel(), level);
+
+ UnrefFilesInVersion(&new_vstorage);
+}
+
+TEST_F(VersionBuilderTest, ApplyFileAdditionAlreadyInBase) {
+ constexpr int level = 1;
+ constexpr uint64_t file_number = 2345;
+ constexpr char smallest[] = "bar";
+ constexpr char largest[] = "foo";
+ constexpr uint64_t file_size = 10000;
+
+ Add(level, file_number, smallest, largest, file_size);
+
+ UpdateVersionStorageInfo();
+
+ EnvOptions env_options;
+ constexpr TableCache* table_cache = nullptr;
+ constexpr VersionSet* version_set = nullptr;
+
+ VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_,
+ version_set);
+
+ VersionEdit edit;
+
+ constexpr int new_level = 2;
+ constexpr uint32_t path_id = 0;
+ constexpr SequenceNumber smallest_seqno = 100;
+ constexpr SequenceNumber largest_seqno = 1000;
+ constexpr bool marked_for_compaction = false;
+
+ edit.AddFile(
+ new_level, file_number, path_id, file_size, GetInternalKey(smallest),
+ GetInternalKey(largest), smallest_seqno, largest_seqno,
+ marked_for_compaction, Temperature::kUnknown, kInvalidBlobFileNumber,
+ kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+ kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+
+ const Status s = builder.Apply(&edit);
+ ASSERT_TRUE(s.IsCorruption());
+ ASSERT_TRUE(std::strstr(s.getState(),
+ "Cannot add table file #2345 to level 2 since it is "
+ "already in the LSM tree on level 1"));
+}
+
+TEST_F(VersionBuilderTest, ApplyFileAdditionAlreadyApplied) {
+ UpdateVersionStorageInfo();
+
+ EnvOptions env_options;
+ constexpr TableCache* table_cache = nullptr;
+ constexpr VersionSet* version_set = nullptr;
+
+ VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_,
+ version_set);
+
+ VersionEdit edit;
+
+ constexpr int level = 3;
+ constexpr uint64_t file_number = 2345;
+ constexpr uint32_t path_id = 0;
+ constexpr uint64_t file_size = 10000;
+ constexpr char smallest[] = "bar";
+ constexpr char largest[] = "foo";
+ constexpr SequenceNumber smallest_seqno = 100;
+ constexpr SequenceNumber largest_seqno = 1000;
+ constexpr bool marked_for_compaction = false;
+
+ edit.AddFile(level, file_number, path_id, file_size, GetInternalKey(smallest),
+ GetInternalKey(largest), smallest_seqno, largest_seqno,
+ marked_for_compaction, Temperature::kUnknown,
+ kInvalidBlobFileNumber, kUnknownOldestAncesterTime,
+ kUnknownFileCreationTime, kUnknownFileChecksum,
+ kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+
+ ASSERT_OK(builder.Apply(&edit));
+
+ VersionEdit other_edit;
+
+ constexpr int new_level = 2;
+
+ other_edit.AddFile(
+ new_level, file_number, path_id, file_size, GetInternalKey(smallest),
+ GetInternalKey(largest), smallest_seqno, largest_seqno,
+ marked_for_compaction, Temperature::kUnknown, kInvalidBlobFileNumber,
+ kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+ kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+
+ const Status s = builder.Apply(&other_edit);
+ ASSERT_TRUE(s.IsCorruption());
+ ASSERT_TRUE(std::strstr(s.getState(),
+ "Cannot add table file #2345 to level 2 since it is "
+ "already in the LSM tree on level 3"));
+}
+
+TEST_F(VersionBuilderTest, ApplyFileAdditionAndDeletion) {
+ UpdateVersionStorageInfo();
+
+ constexpr int level = 1;
+ constexpr uint64_t file_number = 2345;
+ constexpr uint32_t path_id = 0;
+ constexpr uint64_t file_size = 10000;
+ constexpr char smallest[] = "bar";
+ constexpr char largest[] = "foo";
+ constexpr SequenceNumber smallest_seqno = 100;
+ constexpr SequenceNumber largest_seqno = 1000;
+ constexpr bool marked_for_compaction = false;
+
+ EnvOptions env_options;
+ constexpr TableCache* table_cache = nullptr;
+ constexpr VersionSet* version_set = nullptr;
+
+ VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_,
+ version_set);
+
+ VersionEdit addition;
+
+ addition.AddFile(
+ level, file_number, path_id, file_size, GetInternalKey(smallest),
+ GetInternalKey(largest), smallest_seqno, largest_seqno,
+ marked_for_compaction, Temperature::kUnknown, kInvalidBlobFileNumber,
+ kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+ kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+
+ ASSERT_OK(builder.Apply(&addition));
+
+ VersionEdit deletion;
+
+ deletion.DeleteFile(level, file_number);
+
+ ASSERT_OK(builder.Apply(&deletion));
+
+ constexpr bool force_consistency_checks = false;
+ VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels,
+ kCompactionStyleLevel, &vstorage_,
+ force_consistency_checks);
+
+ ASSERT_OK(builder.SaveTo(&new_vstorage));
+
+ UpdateVersionStorageInfo(&new_vstorage);
+
+ ASSERT_FALSE(new_vstorage.GetFileLocation(file_number).IsValid());
+
+ UnrefFilesInVersion(&new_vstorage);
+}
+
+TEST_F(VersionBuilderTest, ApplyBlobFileAddition) {
+ UpdateVersionStorageInfo();
+
+ EnvOptions env_options;
+ constexpr TableCache* table_cache = nullptr;
+ constexpr VersionSet* version_set = nullptr;
+
+ VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_,
+ version_set);
+
+ VersionEdit edit;
+
+ constexpr uint64_t blob_file_number = 1234;
+ constexpr uint64_t total_blob_count = 5678;
+ constexpr uint64_t total_blob_bytes = 999999;
+ constexpr char checksum_method[] = "SHA1";
+ constexpr char checksum_value[] =
+ "\xbd\xb7\xf3\x4a\x59\xdf\xa1\x59\x2c\xe7\xf5\x2e\x99\xf9\x8c\x57\x0c\x52"
+ "\x5c\xbd";
+
+ edit.AddBlobFile(blob_file_number, total_blob_count, total_blob_bytes,
+ checksum_method, checksum_value);
+
+ // Add dummy table file to ensure the blob file is referenced.
+ constexpr uint64_t table_file_number = 1;
+ AddDummyFileToEdit(&edit, table_file_number, blob_file_number);
+
+ ASSERT_OK(builder.Apply(&edit));
+
+ constexpr bool force_consistency_checks = false;
+ VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels,
+ kCompactionStyleLevel, &vstorage_,
+ force_consistency_checks);
+
+ ASSERT_OK(builder.SaveTo(&new_vstorage));
+
+ UpdateVersionStorageInfo(&new_vstorage);
+
+ const auto& new_blob_files = new_vstorage.GetBlobFiles();
+ ASSERT_EQ(new_blob_files.size(), 1);
+
+ const auto new_meta = new_vstorage.GetBlobFileMetaData(blob_file_number);
+
+ ASSERT_NE(new_meta, nullptr);
+ ASSERT_EQ(new_meta->GetBlobFileNumber(), blob_file_number);
+ ASSERT_EQ(new_meta->GetTotalBlobCount(), total_blob_count);
+ ASSERT_EQ(new_meta->GetTotalBlobBytes(), total_blob_bytes);
+ ASSERT_EQ(new_meta->GetChecksumMethod(), checksum_method);
+ ASSERT_EQ(new_meta->GetChecksumValue(), checksum_value);
+ ASSERT_EQ(new_meta->GetLinkedSsts(),
+ BlobFileMetaData::LinkedSsts{table_file_number});
+ ASSERT_EQ(new_meta->GetGarbageBlobCount(), 0);
+ ASSERT_EQ(new_meta->GetGarbageBlobBytes(), 0);
+
+ UnrefFilesInVersion(&new_vstorage);
+}
+
+TEST_F(VersionBuilderTest, ApplyBlobFileAdditionAlreadyInBase) {
+ // Attempt to add a blob file that is already present in the base version.
+
+ constexpr uint64_t blob_file_number = 1234;
+ constexpr uint64_t total_blob_count = 5678;
+ constexpr uint64_t total_blob_bytes = 999999;
+ constexpr char checksum_method[] = "SHA1";
+ constexpr char checksum_value[] =
+ "\xbd\xb7\xf3\x4a\x59\xdf\xa1\x59\x2c\xe7\xf5\x2e\x99\xf9\x8c\x57\x0c\x52"
+ "\x5c\xbd";
+ constexpr uint64_t garbage_blob_count = 123;
+ constexpr uint64_t garbage_blob_bytes = 456789;
+
+ AddBlob(blob_file_number, total_blob_count, total_blob_bytes, checksum_method,
+ checksum_value, BlobFileMetaData::LinkedSsts(), garbage_blob_count,
+ garbage_blob_bytes);
+
+ UpdateVersionStorageInfo();
+
+ EnvOptions env_options;
+ constexpr TableCache* table_cache = nullptr;
+ constexpr VersionSet* version_set = nullptr;
+
+ VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_,
+ version_set);
+
+ VersionEdit edit;
+
+ edit.AddBlobFile(blob_file_number, total_blob_count, total_blob_bytes,
+ checksum_method, checksum_value);
+
+ const Status s = builder.Apply(&edit);
+ ASSERT_TRUE(s.IsCorruption());
+ ASSERT_TRUE(std::strstr(s.getState(), "Blob file #1234 already added"));
+}
+
+TEST_F(VersionBuilderTest, ApplyBlobFileAdditionAlreadyApplied) {
+ // Attempt to add the same blob file twice using version edits.
+
+ UpdateVersionStorageInfo();
+
+ EnvOptions env_options;
+ constexpr TableCache* table_cache = nullptr;
+ constexpr VersionSet* version_set = nullptr;
+
+ VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_,
+ version_set);
+
+ VersionEdit edit;
+
+ constexpr uint64_t blob_file_number = 1234;
+ constexpr uint64_t total_blob_count = 5678;
+ constexpr uint64_t total_blob_bytes = 999999;
+ constexpr char checksum_method[] = "SHA1";
+ constexpr char checksum_value[] =
+ "\xbd\xb7\xf3\x4a\x59\xdf\xa1\x59\x2c\xe7\xf5\x2e\x99\xf9\x8c\x57\x0c\x52"
+ "\x5c\xbd";
+
+ edit.AddBlobFile(blob_file_number, total_blob_count, total_blob_bytes,
+ checksum_method, checksum_value);
+
+ ASSERT_OK(builder.Apply(&edit));
+
+ const Status s = builder.Apply(&edit);
+ ASSERT_TRUE(s.IsCorruption());
+ ASSERT_TRUE(std::strstr(s.getState(), "Blob file #1234 already added"));
+}
+
+TEST_F(VersionBuilderTest, ApplyBlobFileGarbageFileInBase) {
+ // Increase the amount of garbage for a blob file present in the base version.
+
+ constexpr uint64_t table_file_number = 1;
+ constexpr uint64_t blob_file_number = 1234;
+ constexpr uint64_t total_blob_count = 5678;
+ constexpr uint64_t total_blob_bytes = 999999;
+ constexpr char checksum_method[] = "SHA1";
+ constexpr char checksum_value[] =
+ "\xbd\xb7\xf3\x4a\x59\xdf\xa1\x59\x2c\xe7\xf5\x2e\x99\xf9\x8c\x57\x0c\x52"
+ "\x5c\xbd";
+ constexpr uint64_t garbage_blob_count = 123;
+ constexpr uint64_t garbage_blob_bytes = 456789;
+
+ AddBlob(blob_file_number, total_blob_count, total_blob_bytes, checksum_method,
+ checksum_value, BlobFileMetaData::LinkedSsts{table_file_number},
+ garbage_blob_count, garbage_blob_bytes);
+
+ const auto meta = vstorage_.GetBlobFileMetaData(blob_file_number);
+ ASSERT_NE(meta, nullptr);
+
+ // Add dummy table file to ensure the blob file is referenced.
+ AddDummyFile(table_file_number, blob_file_number);
+
+ UpdateVersionStorageInfo();
+
+ EnvOptions env_options;
+ constexpr TableCache* table_cache = nullptr;
+ constexpr VersionSet* version_set = nullptr;
+
+ VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_,
+ version_set);
+
+ VersionEdit edit;
+
+ constexpr uint64_t new_garbage_blob_count = 456;
+ constexpr uint64_t new_garbage_blob_bytes = 111111;
+
+ edit.AddBlobFileGarbage(blob_file_number, new_garbage_blob_count,
+ new_garbage_blob_bytes);
+
+ ASSERT_OK(builder.Apply(&edit));
+
+ constexpr bool force_consistency_checks = false;
+ VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels,
+ kCompactionStyleLevel, &vstorage_,
+ force_consistency_checks);
+
+ ASSERT_OK(builder.SaveTo(&new_vstorage));
+
+ UpdateVersionStorageInfo(&new_vstorage);
+
+ const auto& new_blob_files = new_vstorage.GetBlobFiles();
+ ASSERT_EQ(new_blob_files.size(), 1);
+
+ const auto new_meta = new_vstorage.GetBlobFileMetaData(blob_file_number);
+
+ ASSERT_NE(new_meta, nullptr);
+ ASSERT_EQ(new_meta->GetSharedMeta(), meta->GetSharedMeta());
+ ASSERT_EQ(new_meta->GetBlobFileNumber(), blob_file_number);
+ ASSERT_EQ(new_meta->GetTotalBlobCount(), total_blob_count);
+ ASSERT_EQ(new_meta->GetTotalBlobBytes(), total_blob_bytes);
+ ASSERT_EQ(new_meta->GetChecksumMethod(), checksum_method);
+ ASSERT_EQ(new_meta->GetChecksumValue(), checksum_value);
+ ASSERT_EQ(new_meta->GetLinkedSsts(),
+ BlobFileMetaData::LinkedSsts{table_file_number});
+ ASSERT_EQ(new_meta->GetGarbageBlobCount(),
+ garbage_blob_count + new_garbage_blob_count);
+ ASSERT_EQ(new_meta->GetGarbageBlobBytes(),
+ garbage_blob_bytes + new_garbage_blob_bytes);
+
+ UnrefFilesInVersion(&new_vstorage);
+}
+
+TEST_F(VersionBuilderTest, ApplyBlobFileGarbageFileAdditionApplied) {
+ // Increase the amount of garbage for a blob file added using a version edit.
+
+ UpdateVersionStorageInfo();
+
+ EnvOptions env_options;
+ constexpr TableCache* table_cache = nullptr;
+ constexpr VersionSet* version_set = nullptr;
+
+ VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_,
+ version_set);
+
+ VersionEdit addition;
+
+ constexpr uint64_t blob_file_number = 1234;
+ constexpr uint64_t total_blob_count = 5678;
+ constexpr uint64_t total_blob_bytes = 999999;
+ constexpr char checksum_method[] = "SHA1";
+ constexpr char checksum_value[] =
+ "\xbd\xb7\xf3\x4a\x59\xdf\xa1\x59\x2c\xe7\xf5\x2e\x99\xf9\x8c\x57\x0c\x52"
+ "\x5c\xbd";
+
+ addition.AddBlobFile(blob_file_number, total_blob_count, total_blob_bytes,
+ checksum_method, checksum_value);
+
+ // Add dummy table file to ensure the blob file is referenced.
+ constexpr uint64_t table_file_number = 1;
+ AddDummyFileToEdit(&addition, table_file_number, blob_file_number);
+
+ ASSERT_OK(builder.Apply(&addition));
+
+ constexpr uint64_t garbage_blob_count = 123;
+ constexpr uint64_t garbage_blob_bytes = 456789;
+
+ VersionEdit garbage;
+
+ garbage.AddBlobFileGarbage(blob_file_number, garbage_blob_count,
+ garbage_blob_bytes);
+
+ ASSERT_OK(builder.Apply(&garbage));
+
+ constexpr bool force_consistency_checks = false;
+ VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels,
+ kCompactionStyleLevel, &vstorage_,
+ force_consistency_checks);
+
+ ASSERT_OK(builder.SaveTo(&new_vstorage));
+
+ UpdateVersionStorageInfo(&new_vstorage);
+
+ const auto& new_blob_files = new_vstorage.GetBlobFiles();
+ ASSERT_EQ(new_blob_files.size(), 1);
+
+ const auto new_meta = new_vstorage.GetBlobFileMetaData(blob_file_number);
+
+ ASSERT_NE(new_meta, nullptr);
+ ASSERT_EQ(new_meta->GetBlobFileNumber(), blob_file_number);
+ ASSERT_EQ(new_meta->GetTotalBlobCount(), total_blob_count);
+ ASSERT_EQ(new_meta->GetTotalBlobBytes(), total_blob_bytes);
+ ASSERT_EQ(new_meta->GetChecksumMethod(), checksum_method);
+ ASSERT_EQ(new_meta->GetChecksumValue(), checksum_value);
+ ASSERT_EQ(new_meta->GetLinkedSsts(),
+ BlobFileMetaData::LinkedSsts{table_file_number});
+ ASSERT_EQ(new_meta->GetGarbageBlobCount(), garbage_blob_count);
+ ASSERT_EQ(new_meta->GetGarbageBlobBytes(), garbage_blob_bytes);
+
+ UnrefFilesInVersion(&new_vstorage);
+}
+
+TEST_F(VersionBuilderTest, ApplyBlobFileGarbageFileNotFound) {
+ // Attempt to increase the amount of garbage for a blob file that is
+ // neither in the base version nor added using a version edit.
+
+ UpdateVersionStorageInfo();
+
+ EnvOptions env_options;
+ constexpr TableCache* table_cache = nullptr;
+ constexpr VersionSet* version_set = nullptr;
+
+ VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_,
+ version_set);
+
+ VersionEdit edit;
+
+ constexpr uint64_t blob_file_number = 1234;
+ constexpr uint64_t garbage_blob_count = 5678;
+ constexpr uint64_t garbage_blob_bytes = 999999;
+
+ edit.AddBlobFileGarbage(blob_file_number, garbage_blob_count,
+ garbage_blob_bytes);
+
+ const Status s = builder.Apply(&edit);
+ ASSERT_TRUE(s.IsCorruption());
+ ASSERT_TRUE(std::strstr(s.getState(), "Blob file #1234 not found"));
+}
+
+TEST_F(VersionBuilderTest, BlobFileGarbageOverflow) {
+ // Test that VersionEdits that would result in the count/total size of garbage
+ // exceeding the count/total size of all blobs are rejected.
+
+ UpdateVersionStorageInfo();
+
+ EnvOptions env_options;
+ constexpr TableCache* table_cache = nullptr;
+ constexpr VersionSet* version_set = nullptr;
+
+ VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_,
+ version_set);
+
+ VersionEdit addition;
+
+ constexpr uint64_t blob_file_number = 1234;
+ constexpr uint64_t total_blob_count = 5678;
+ constexpr uint64_t total_blob_bytes = 999999;
+ constexpr char checksum_method[] = "SHA1";
+ constexpr char checksum_value[] =
+ "\xbd\xb7\xf3\x4a\x59\xdf\xa1\x59\x2c\xe7\xf5\x2e\x99\xf9\x8c\x57\x0c\x52"
+ "\x5c\xbd";
+
+ addition.AddBlobFile(blob_file_number, total_blob_count, total_blob_bytes,
+ checksum_method, checksum_value);
+
+ // Add dummy table file to ensure the blob file is referenced.
+ constexpr uint64_t table_file_number = 1;
+ AddDummyFileToEdit(&addition, table_file_number, blob_file_number);
+
+ ASSERT_OK(builder.Apply(&addition));
+
+ {
+ // Garbage blob count overflow
+ constexpr uint64_t garbage_blob_count = 5679;
+ constexpr uint64_t garbage_blob_bytes = 999999;
+
+ VersionEdit garbage;
+
+ garbage.AddBlobFileGarbage(blob_file_number, garbage_blob_count,
+ garbage_blob_bytes);
+
+ const Status s = builder.Apply(&garbage);
+ ASSERT_TRUE(s.IsCorruption());
+ ASSERT_TRUE(
+ std::strstr(s.getState(), "Garbage overflow for blob file #1234"));
+ }
+
+ {
+ // Garbage blob bytes overflow
+ constexpr uint64_t garbage_blob_count = 5678;
+ constexpr uint64_t garbage_blob_bytes = 1000000;
+
+ VersionEdit garbage;
+
+ garbage.AddBlobFileGarbage(blob_file_number, garbage_blob_count,
+ garbage_blob_bytes);
+
+ const Status s = builder.Apply(&garbage);
+ ASSERT_TRUE(s.IsCorruption());
+ ASSERT_TRUE(
+ std::strstr(s.getState(), "Garbage overflow for blob file #1234"));
+ }
+}
+
+TEST_F(VersionBuilderTest, SaveBlobFilesTo) {
+ // Add three blob files to base version.
+ for (uint64_t i = 1; i <= 3; ++i) {
+ const uint64_t table_file_number = 2 * i;
+ const uint64_t blob_file_number = 2 * i + 1;
+ const uint64_t total_blob_count = i * 1000;
+ const uint64_t total_blob_bytes = i * 1000000;
+ const uint64_t garbage_blob_count = i * 100;
+ const uint64_t garbage_blob_bytes = i * 20000;
+
+ AddBlob(blob_file_number, total_blob_count, total_blob_bytes,
+ /* checksum_method */ std::string(),
+ /* checksum_value */ std::string(),
+ BlobFileMetaData::LinkedSsts{table_file_number}, garbage_blob_count,
+ garbage_blob_bytes);
+ }
+
+ // Add dummy table files to ensure the blob files are referenced.
+ // Note: files are added to L0, so they have to be added in reverse order
+ // (newest first).
+ for (uint64_t i = 3; i >= 1; --i) {
+ const uint64_t table_file_number = 2 * i;
+ const uint64_t blob_file_number = 2 * i + 1;
+
+ AddDummyFile(table_file_number, blob_file_number);
+ }
+
+ UpdateVersionStorageInfo();
+
+ EnvOptions env_options;
+ constexpr TableCache* table_cache = nullptr;
+ constexpr VersionSet* version_set = nullptr;
+
+ VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_,
+ version_set);
+
+ VersionEdit edit;
+
+ // Add some garbage to the second and third blob files. The second blob file
+ // remains valid since it does not consist entirely of garbage yet. The third
+ // blob file is all garbage after the edit and will not be part of the new
+ // version. The corresponding dummy table file is also removed for
+ // consistency.
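+ // (For blob file #7, the base version has 3000 blobs / 3000000 bytes with
+ // 300 / 60000 of garbage, so the extra 2700 / 2940000 brings garbage up to
+ // the totals; for blob file #5, garbage becomes 400 / 140000 out of
+ // 2000 / 2000000.)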
+ edit.AddBlobFileGarbage(/* blob_file_number */ 5,
+ /* garbage_blob_count */ 200,
+ /* garbage_blob_bytes */ 100000);
+ edit.AddBlobFileGarbage(/* blob_file_number */ 7,
+ /* garbage_blob_count */ 2700,
+ /* garbage_blob_bytes */ 2940000);
+ edit.DeleteFile(/* level */ 0, /* file_number */ 6);
+
+ // Add a fourth blob file.
+ edit.AddBlobFile(/* blob_file_number */ 9, /* total_blob_count */ 4000,
+ /* total_blob_bytes */ 4000000,
+ /* checksum_method */ std::string(),
+ /* checksum_value */ std::string());
+
+ ASSERT_OK(builder.Apply(&edit));
+
+ constexpr bool force_consistency_checks = false;
+ VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels,
+ kCompactionStyleLevel, &vstorage_,
+ force_consistency_checks);
+
+ ASSERT_OK(builder.SaveTo(&new_vstorage));
+
+ UpdateVersionStorageInfo(&new_vstorage);
+
+ const auto& new_blob_files = new_vstorage.GetBlobFiles();
+ ASSERT_EQ(new_blob_files.size(), 3);
+
+ const auto meta3 = new_vstorage.GetBlobFileMetaData(/* blob_file_number */ 3);
+
+ ASSERT_NE(meta3, nullptr);
+ ASSERT_EQ(meta3->GetBlobFileNumber(), 3);
+ ASSERT_EQ(meta3->GetTotalBlobCount(), 1000);
+ ASSERT_EQ(meta3->GetTotalBlobBytes(), 1000000);
+ ASSERT_EQ(meta3->GetGarbageBlobCount(), 100);
+ ASSERT_EQ(meta3->GetGarbageBlobBytes(), 20000);
+
+ const auto meta5 = new_vstorage.GetBlobFileMetaData(/* blob_file_number */ 5);
+
+ ASSERT_NE(meta5, nullptr);
+ ASSERT_EQ(meta5->GetBlobFileNumber(), 5);
+ ASSERT_EQ(meta5->GetTotalBlobCount(), 2000);
+ ASSERT_EQ(meta5->GetTotalBlobBytes(), 2000000);
+ ASSERT_EQ(meta5->GetGarbageBlobCount(), 400);
+ ASSERT_EQ(meta5->GetGarbageBlobBytes(), 140000);
+
+ const auto meta9 = new_vstorage.GetBlobFileMetaData(/* blob_file_number */ 9);
+
+ ASSERT_NE(meta9, nullptr);
+ ASSERT_EQ(meta9->GetBlobFileNumber(), 9);
+ ASSERT_EQ(meta9->GetTotalBlobCount(), 4000);
+ ASSERT_EQ(meta9->GetTotalBlobBytes(), 4000000);
+ ASSERT_EQ(meta9->GetGarbageBlobCount(), 0);
+ ASSERT_EQ(meta9->GetGarbageBlobBytes(), 0);
+
+ // Delete the first table file, which makes the first blob file obsolete
+ // since it's at the head and unreferenced.
+ VersionBuilder second_builder(env_options, &ioptions_, table_cache,
+ &new_vstorage, version_set);
+
+ VersionEdit second_edit;
+ second_edit.DeleteFile(/* level */ 0, /* file_number */ 2);
+
+ ASSERT_OK(second_builder.Apply(&second_edit));
+
+ VersionStorageInfo newer_vstorage(&icmp_, ucmp_, options_.num_levels,
+ kCompactionStyleLevel, &new_vstorage,
+ force_consistency_checks);
+
+ ASSERT_OK(second_builder.SaveTo(&newer_vstorage));
+
+ UpdateVersionStorageInfo(&newer_vstorage);
+
+ const auto& newer_blob_files = newer_vstorage.GetBlobFiles();
+ ASSERT_EQ(newer_blob_files.size(), 2);
+
+ const auto newer_meta3 =
+ newer_vstorage.GetBlobFileMetaData(/* blob_file_number */ 3);
+
+ ASSERT_EQ(newer_meta3, nullptr);
+
+ UnrefFilesInVersion(&newer_vstorage);
+ UnrefFilesInVersion(&new_vstorage);
+}
+
+TEST_F(VersionBuilderTest, SaveBlobFilesToConcurrentJobs) {
+ // When multiple background jobs (flushes/compactions) are executing in
+ // parallel, it is possible for the VersionEdit adding blob file K to be
+ // applied *after* the VersionEdit adding blob file N (for N > K). This test
+ // case makes sure this is handled correctly.
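+ // Here, the edit applied below adds blob file #2, which has a smaller file
+ // number than blob file #4 already present in the base version.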
+
+ // Add blob file #4 (referenced by table file #3) to base version.
+ constexpr uint64_t base_table_file_number = 3;
+ constexpr uint64_t base_blob_file_number = 4;
+ constexpr uint64_t base_total_blob_count = 100;
+ constexpr uint64_t base_total_blob_bytes = 1 << 20;
+
+ constexpr char checksum_method[] = "SHA1";
+ constexpr char checksum_value[] = "\xfa\xce\xb0\x0c";
+ constexpr uint64_t garbage_blob_count = 0;
+ constexpr uint64_t garbage_blob_bytes = 0;
+
+ AddDummyFile(base_table_file_number, base_blob_file_number);
+ AddBlob(base_blob_file_number, base_total_blob_count, base_total_blob_bytes,
+ checksum_method, checksum_value,
+ BlobFileMetaData::LinkedSsts{base_table_file_number},
+ garbage_blob_count, garbage_blob_bytes);
+
+ UpdateVersionStorageInfo();
+
+ EnvOptions env_options;
+ constexpr TableCache* table_cache = nullptr;
+ constexpr VersionSet* version_set = nullptr;
+
+ VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_,
+ version_set);
+
+ VersionEdit edit;
+
+ // Add blob file #2 (referenced by table file #1).
+ constexpr int level = 0;
+ constexpr uint64_t table_file_number = 1;
+ constexpr uint32_t path_id = 0;
+ constexpr uint64_t file_size = 1 << 12;
+ constexpr char smallest[] = "key1";
+ constexpr char largest[] = "key987";
+ constexpr SequenceNumber smallest_seqno = 0;
+ constexpr SequenceNumber largest_seqno = 0;
+ constexpr bool marked_for_compaction = false;
+
+ constexpr uint64_t blob_file_number = 2;
+ static_assert(blob_file_number < base_blob_file_number,
+ "Added blob file should have a smaller file number");
+
+ constexpr uint64_t total_blob_count = 234;
+ constexpr uint64_t total_blob_bytes = 1 << 22;
+
+ edit.AddFile(level, table_file_number, path_id, file_size,
+ GetInternalKey(smallest), GetInternalKey(largest),
+ smallest_seqno, largest_seqno, marked_for_compaction,
+ Temperature::kUnknown, blob_file_number,
+ kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+ checksum_value, checksum_method, kNullUniqueId64x2);
+ edit.AddBlobFile(blob_file_number, total_blob_count, total_blob_bytes,
+ checksum_method, checksum_value);
+
+ ASSERT_OK(builder.Apply(&edit));
+
+ constexpr bool force_consistency_checks = true;
+ VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels,
+ kCompactionStyleLevel, &vstorage_,
+ force_consistency_checks);
+
+ ASSERT_OK(builder.SaveTo(&new_vstorage));
+
+ UpdateVersionStorageInfo(&new_vstorage);
+
+ const auto& new_blob_files = new_vstorage.GetBlobFiles();
+ ASSERT_EQ(new_blob_files.size(), 2);
+
+ const auto base_meta =
+ new_vstorage.GetBlobFileMetaData(base_blob_file_number);
+
+ ASSERT_NE(base_meta, nullptr);
+ ASSERT_EQ(base_meta->GetBlobFileNumber(), base_blob_file_number);
+ ASSERT_EQ(base_meta->GetTotalBlobCount(), base_total_blob_count);
+ ASSERT_EQ(base_meta->GetTotalBlobBytes(), base_total_blob_bytes);
+ ASSERT_EQ(base_meta->GetGarbageBlobCount(), garbage_blob_count);
+ ASSERT_EQ(base_meta->GetGarbageBlobBytes(), garbage_blob_bytes);
+ ASSERT_EQ(base_meta->GetChecksumMethod(), checksum_method);
+ ASSERT_EQ(base_meta->GetChecksumValue(), checksum_value);
+
+ const auto added_meta = new_vstorage.GetBlobFileMetaData(blob_file_number);
+
+ ASSERT_NE(added_meta, nullptr);
+ ASSERT_EQ(added_meta->GetBlobFileNumber(), blob_file_number);
+ ASSERT_EQ(added_meta->GetTotalBlobCount(), total_blob_count);
+ ASSERT_EQ(added_meta->GetTotalBlobBytes(), total_blob_bytes);
+ ASSERT_EQ(added_meta->GetGarbageBlobCount(), garbage_blob_count);
+ ASSERT_EQ(added_meta->GetGarbageBlobBytes(), garbage_blob_bytes);
+ ASSERT_EQ(added_meta->GetChecksumMethod(), checksum_method);
+ ASSERT_EQ(added_meta->GetChecksumValue(), checksum_value);
+
+ UnrefFilesInVersion(&new_vstorage);
+}
+
+TEST_F(VersionBuilderTest, CheckConsistencyForBlobFiles) {
+ // Initialize base version. The first table file points to a valid blob file
+ // in this version; the second one does not refer to any blob files.
+
+ Add(/* level */ 1, /* file_number */ 1, /* smallest */ "150",
+ /* largest */ "200", /* file_size */ 100,
+ /* path_id */ 0, /* smallest_seq */ 100, /* largest_seq */ 100,
+ /* num_entries */ 0, /* num_deletions */ 0,
+ /* sampled */ false, /* smallest_seqno */ 100, /* largest_seqno */ 100,
+ /* oldest_blob_file_number */ 16);
+ Add(/* level */ 1, /* file_number */ 23, /* smallest */ "201",
+ /* largest */ "300", /* file_size */ 100,
+ /* path_id */ 0, /* smallest_seq */ 200, /* largest_seq */ 200,
+ /* num_entries */ 0, /* num_deletions */ 0,
+ /* sampled */ false, /* smallest_seqno */ 200, /* largest_seqno */ 200,
+ kInvalidBlobFileNumber);
+
+ AddBlob(/* blob_file_number */ 16, /* total_blob_count */ 1000,
+ /* total_blob_bytes */ 1000000,
+ /* checksum_method */ std::string(),
+ /* checksum_value */ std::string(), BlobFileMetaData::LinkedSsts{1},
+ /* garbage_blob_count */ 500, /* garbage_blob_bytes */ 300000);
+
+ UpdateVersionStorageInfo();
+
+ // Add a new table file that points to the existing blob file, as well as a
+ // new table file that references a newly added blob file.
+ EnvOptions env_options;
+ constexpr TableCache* table_cache = nullptr;
+ constexpr VersionSet* version_set = nullptr;
+
+ VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_,
+ version_set);
+
+ VersionEdit edit;
+
+ edit.AddFile(/* level */ 1, /* file_number */ 606, /* path_id */ 0,
+ /* file_size */ 100, /* smallest */ GetInternalKey("701"),
+ /* largest */ GetInternalKey("750"), /* smallest_seqno */ 200,
+ /* largest_seqno */ 200, /* marked_for_compaction */ false,
+ Temperature::kUnknown,
+ /* oldest_blob_file_number */ 16, kUnknownOldestAncesterTime,
+ kUnknownFileCreationTime, kUnknownFileChecksum,
+ kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+
+ edit.AddFile(/* level */ 1, /* file_number */ 700, /* path_id */ 0,
+ /* file_size */ 100, /* smallest */ GetInternalKey("801"),
+ /* largest */ GetInternalKey("850"), /* smallest_seqno */ 200,
+ /* largest_seqno */ 200, /* marked_for_compaction */ false,
+ Temperature::kUnknown,
+ /* oldest_blob_file_number */ 1000, kUnknownOldestAncesterTime,
+ kUnknownFileCreationTime, kUnknownFileChecksum,
+ kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+ edit.AddBlobFile(/* blob_file_number */ 1000, /* total_blob_count */ 2000,
+ /* total_blob_bytes */ 200000,
+ /* checksum_method */ std::string(),
+ /* checksum_value */ std::string());
+
+ ASSERT_OK(builder.Apply(&edit));
+
+ // Save to a new version in order to trigger consistency checks.
+ constexpr bool force_consistency_checks = true;
+ VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels,
+ kCompactionStyleLevel, &vstorage_,
+ force_consistency_checks);
+
+ ASSERT_OK(builder.SaveTo(&new_vstorage));
+
+ UpdateVersionStorageInfo(&new_vstorage);
+
+ UnrefFilesInVersion(&new_vstorage);
+}
+
+TEST_F(VersionBuilderTest, CheckConsistencyForBlobFilesInconsistentLinks) {
+ // Initialize base version. Links between the table file and the blob file
+ // are inconsistent.
+
+ Add(/* level */ 1, /* file_number */ 1, /* smallest */ "150",
+ /* largest */ "200", /* file_size */ 100,
+ /* path_id */ 0, /* smallest_seq */ 100, /* largest_seq */ 100,
+ /* num_entries */ 0, /* num_deletions */ 0,
+ /* sampled */ false, /* smallest_seqno */ 100, /* largest_seqno */ 100,
+ /* oldest_blob_file_number */ 256);
+
+ AddBlob(/* blob_file_number */ 16, /* total_blob_count */ 1000,
+ /* total_blob_bytes */ 1000000,
+ /* checksum_method */ std::string(),
+ /* checksum_value */ std::string(), BlobFileMetaData::LinkedSsts{1},
+ /* garbage_blob_count */ 500, /* garbage_blob_bytes */ 300000);
+
+ UpdateVersionStorageInfo();
+
+ EnvOptions env_options;
+ constexpr TableCache* table_cache = nullptr;
+ constexpr VersionSet* version_set = nullptr;
+
+ VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_,
+ version_set);
+
+ // Save to a new version in order to trigger consistency checks.
+ constexpr bool force_consistency_checks = true;
+ VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels,
+ kCompactionStyleLevel, &vstorage_,
+ force_consistency_checks);
+
+ const Status s = builder.SaveTo(&new_vstorage);
+ ASSERT_TRUE(s.IsCorruption());
+ ASSERT_TRUE(std::strstr(
+ s.getState(),
+ "Links are inconsistent between table files and blob file #16"));
+
+ UnrefFilesInVersion(&new_vstorage);
+}
+
+TEST_F(VersionBuilderTest, CheckConsistencyForBlobFilesAllGarbage) {
+ // Initialize base version. The table file points to a blob file that is
+ // all garbage.
+
+ Add(/* level */ 1, /* file_number */ 1, /* smallest */ "150",
+ /* largest */ "200", /* file_size */ 100,
+ /* path_id */ 0, /* smallest_seq */ 100, /* largest_seq */ 100,
+ /* num_entries */ 0, /* num_deletions */ 0,
+ /* sampled */ false, /* smallest_seqno */ 100, /* largest_seqno */ 100,
+ /* oldest_blob_file_number */ 16);
+
+ AddBlob(/* blob_file_number */ 16, /* total_blob_count */ 1000,
+ /* total_blob_bytes */ 1000000,
+ /* checksum_method */ std::string(),
+ /* checksum_value */ std::string(), BlobFileMetaData::LinkedSsts{1},
+ /* garbage_blob_count */ 1000, /* garbage_blob_bytes */ 1000000);
+
+ UpdateVersionStorageInfo();
+
+ EnvOptions env_options;
+ constexpr TableCache* table_cache = nullptr;
+ constexpr VersionSet* version_set = nullptr;
+
+ VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_,
+ version_set);
+
+ // Save to a new version in order to trigger consistency checks.
+ constexpr bool force_consistency_checks = true;
+ VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels,
+ kCompactionStyleLevel, &vstorage_,
+ force_consistency_checks);
+
+ const Status s = builder.SaveTo(&new_vstorage);
+ ASSERT_TRUE(s.IsCorruption());
+ ASSERT_TRUE(
+ std::strstr(s.getState(), "Blob file #16 consists entirely of garbage"));
+
+ UnrefFilesInVersion(&new_vstorage);
+}
+
+TEST_F(VersionBuilderTest, CheckConsistencyForBlobFilesAllGarbageLinkedSsts) {
+ // Initialize base version, with a table file pointing to a blob file
+ // that has no garbage at this point.
+
+ Add(/* level */ 1, /* file_number */ 1, /* smallest */ "150",
+ /* largest */ "200", /* file_size */ 100,
+ /* path_id */ 0, /* smallest_seq */ 100, /* largest_seq */ 100,
+ /* num_entries */ 0, /* num_deletions */ 0,
+ /* sampled */ false, /* smallest_seqno */ 100, /* largest_seqno */ 100,
+ /* oldest_blob_file_number */ 16);
+
+ AddBlob(/* blob_file_number */ 16, /* total_blob_count */ 1000,
+ /* total_blob_bytes */ 1000000,
+ /* checksum_method */ std::string(),
+ /* checksum_value */ std::string(), BlobFileMetaData::LinkedSsts{1},
+ /* garbage_blob_count */ 0, /* garbage_blob_bytes */ 0);
+
+ UpdateVersionStorageInfo();
+
+ // Mark the entire blob file garbage but do not remove the linked SST.
+ EnvOptions env_options;
+ constexpr TableCache* table_cache = nullptr;
+ constexpr VersionSet* version_set = nullptr;
+
+ VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_,
+ version_set);
+
+ VersionEdit edit;
+
+ edit.AddBlobFileGarbage(/* blob_file_number */ 16,
+ /* garbage_blob_count */ 1000,
+ /* garbage_blob_bytes */ 1000000);
+
+ ASSERT_OK(builder.Apply(&edit));
+
+ // Save to a new version in order to trigger consistency checks.
+ constexpr bool force_consistency_checks = true;
+ VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels,
+ kCompactionStyleLevel, &vstorage_,
+ force_consistency_checks);
+
+ const Status s = builder.SaveTo(&new_vstorage);
+ ASSERT_TRUE(s.IsCorruption());
+ ASSERT_TRUE(
+ std::strstr(s.getState(), "Blob file #16 consists entirely of garbage"));
+
+ UnrefFilesInVersion(&new_vstorage);
+}
+
+TEST_F(VersionBuilderTest, MaintainLinkedSstsForBlobFiles) {
+ // Initialize base version. Table files 1..10 are linked to blob files 1..5,
+ // while table files 11..20 are not linked to any blob files.
+
+ for (uint64_t i = 1; i <= 10; ++i) {
+ std::ostringstream oss;
+ oss << std::setw(2) << std::setfill('0') << i;
+
+ const std::string key = oss.str();
+
+ Add(/* level */ 1, /* file_number */ i, /* smallest */ key.c_str(),
+ /* largest */ key.c_str(), /* file_size */ 100,
+ /* path_id */ 0, /* smallest_seq */ i * 100, /* largest_seq */ i * 100,
+ /* num_entries */ 0, /* num_deletions */ 0,
+ /* sampled */ false, /* smallest_seqno */ i * 100,
+ /* largest_seqno */ i * 100,
+ /* oldest_blob_file_number */ ((i - 1) % 5) + 1);
+ }
+
+ for (uint64_t i = 1; i <= 5; ++i) {
+ AddBlob(/* blob_file_number */ i, /* total_blob_count */ 2000,
+ /* total_blob_bytes */ 2000000,
+ /* checksum_method */ std::string(),
+ /* checksum_value */ std::string(),
+ BlobFileMetaData::LinkedSsts{i, i + 5},
+ /* garbage_blob_count */ 1000, /* garbage_blob_bytes */ 1000000);
+ }
+
+ for (uint64_t i = 11; i <= 20; ++i) {
+ std::ostringstream oss;
+ oss << std::setw(2) << std::setfill('0') << i;
+
+ const std::string key = oss.str();
+
+ Add(/* level */ 1, /* file_number */ i, /* smallest */ key.c_str(),
+ /* largest */ key.c_str(), /* file_size */ 100,
+ /* path_id */ 0, /* smallest_seq */ i * 100, /* largest_seq */ i * 100,
+ /* num_entries */ 0, /* num_deletions */ 0,
+ /* sampled */ false, /* smallest_seqno */ i * 100,
+ /* largest_seqno */ i * 100, kInvalidBlobFileNumber);
+ }
+
+ UpdateVersionStorageInfo();
+
+ {
+ const auto& blob_files = vstorage_.GetBlobFiles();
+ ASSERT_EQ(blob_files.size(), 5);
+
+ const std::vector<BlobFileMetaData::LinkedSsts> expected_linked_ssts{
+ {1, 6}, {2, 7}, {3, 8}, {4, 9}, {5, 10}};
+
+ for (size_t i = 0; i < 5; ++i) {
+ const auto meta =
+ vstorage_.GetBlobFileMetaData(/* blob_file_number */ i + 1);
+ ASSERT_NE(meta, nullptr);
+ ASSERT_EQ(meta->GetLinkedSsts(), expected_linked_ssts[i]);
+ }
+ }
+
+ VersionEdit edit;
+
+ // Add an SST that references a blob file.
+ edit.AddFile(
+ /* level */ 1, /* file_number */ 21, /* path_id */ 0,
+ /* file_size */ 100, /* smallest */ GetInternalKey("21", 2100),
+ /* largest */ GetInternalKey("21", 2100), /* smallest_seqno */ 2100,
+ /* largest_seqno */ 2100, /* marked_for_compaction */ false,
+ Temperature::kUnknown,
+ /* oldest_blob_file_number */ 1, kUnknownOldestAncesterTime,
+ kUnknownFileCreationTime, kUnknownFileChecksum,
+ kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+
+ // Add an SST that does not reference any blob files.
+ edit.AddFile(
+ /* level */ 1, /* file_number */ 22, /* path_id */ 0,
+ /* file_size */ 100, /* smallest */ GetInternalKey("22", 2200),
+ /* largest */ GetInternalKey("22", 2200), /* smallest_seqno */ 2200,
+ /* largest_seqno */ 2200, /* marked_for_compaction */ false,
+ Temperature::kUnknown, kInvalidBlobFileNumber, kUnknownOldestAncesterTime,
+ kUnknownFileCreationTime, kUnknownFileChecksum,
+ kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+
+ // Delete a file that references a blob file.
+ edit.DeleteFile(/* level */ 1, /* file_number */ 6);
+
+ // Delete a file that does not reference any blob files.
+ edit.DeleteFile(/* level */ 1, /* file_number */ 16);
+
+ // Trivially move a file that references a blob file. Note that we save
+ // the original BlobFileMetaData object so we can check that no new object
+ // gets created.
+ auto meta3 = vstorage_.GetBlobFileMetaData(/* blob_file_number */ 3);
+
+ edit.DeleteFile(/* level */ 1, /* file_number */ 3);
+ edit.AddFile(/* level */ 2, /* file_number */ 3, /* path_id */ 0,
+ /* file_size */ 100, /* smallest */ GetInternalKey("03", 300),
+ /* largest */ GetInternalKey("03", 300),
+ /* smallest_seqno */ 300,
+ /* largest_seqno */ 300, /* marked_for_compaction */ false,
+ Temperature::kUnknown,
+ /* oldest_blob_file_number */ 3, kUnknownOldestAncesterTime,
+ kUnknownFileCreationTime, kUnknownFileChecksum,
+ kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+
+ // Trivially move a file that does not reference any blob files.
+ edit.DeleteFile(/* level */ 1, /* file_number */ 13);
+ edit.AddFile(/* level */ 2, /* file_number */ 13, /* path_id */ 0,
+ /* file_size */ 100, /* smallest */ GetInternalKey("13", 1300),
+ /* largest */ GetInternalKey("13", 1300),
+ /* smallest_seqno */ 1300,
+ /* largest_seqno */ 1300, /* marked_for_compaction */ false,
+ Temperature::kUnknown, kInvalidBlobFileNumber,
+ kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+ kUnknownFileChecksum, kUnknownFileChecksumFuncName,
+ kNullUniqueId64x2);
+
+ // Add one more SST file that references a blob file, then promptly
+ // delete it in a second version edit before the new version gets saved.
+ // This file should not show up as linked to the blob file in the new version.
+ edit.AddFile(/* level */ 1, /* file_number */ 23, /* path_id */ 0,
+ /* file_size */ 100, /* smallest */ GetInternalKey("23", 2300),
+ /* largest */ GetInternalKey("23", 2300),
+ /* smallest_seqno */ 2300,
+ /* largest_seqno */ 2300, /* marked_for_compaction */ false,
+ Temperature::kUnknown,
+ /* oldest_blob_file_number */ 5, kUnknownOldestAncesterTime,
+ kUnknownFileCreationTime, kUnknownFileChecksum,
+ kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+
+ VersionEdit edit2;
+
+ edit2.DeleteFile(/* level */ 1, /* file_number */ 23);
+
+ EnvOptions env_options;
+ constexpr TableCache* table_cache = nullptr;
+ constexpr VersionSet* version_set = nullptr;
+
+ VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_,
+ version_set);
+
+ ASSERT_OK(builder.Apply(&edit));
+ ASSERT_OK(builder.Apply(&edit2));
+
+ constexpr bool force_consistency_checks = true;
+ VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels,
+ kCompactionStyleLevel, &vstorage_,
+ force_consistency_checks);
+
+ ASSERT_OK(builder.SaveTo(&new_vstorage));
+
+ UpdateVersionStorageInfo(&new_vstorage);
+
+ {
+ const auto& blob_files = new_vstorage.GetBlobFiles();
+ ASSERT_EQ(blob_files.size(), 5);
+
+ const std::vector<BlobFileMetaData::LinkedSsts> expected_linked_ssts{
+ {1, 21}, {2, 7}, {3, 8}, {4, 9}, {5, 10}};
+
+ for (size_t i = 0; i < 5; ++i) {
+ const auto meta =
+ new_vstorage.GetBlobFileMetaData(/* blob_file_number */ i + 1);
+ ASSERT_NE(meta, nullptr);
+ ASSERT_EQ(meta->GetLinkedSsts(), expected_linked_ssts[i]);
+ }
+
+ // Make sure that no new BlobFileMetaData got created for the blob file
+ // affected by the trivial move.
+ ASSERT_EQ(new_vstorage.GetBlobFileMetaData(/* blob_file_number */ 3),
+ meta3);
+ }
+
+ UnrefFilesInVersion(&new_vstorage);
+}
+
+TEST_F(VersionBuilderTest, CheckConsistencyForFileDeletedTwice) {
+ Add(0, 1U, "150", "200", 100U);
+
+ UpdateVersionStorageInfo();
+
+ VersionEdit version_edit;
+ version_edit.DeleteFile(0, 1U);
+
+ EnvOptions env_options;
+ constexpr TableCache* table_cache = nullptr;
+ constexpr VersionSet* version_set = nullptr;
+
+ VersionBuilder version_builder(env_options, &ioptions_, table_cache,
+ &vstorage_, version_set);
+ VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels,
+ kCompactionStyleLevel, nullptr,
+ true /* force_consistency_checks */);
+ ASSERT_OK(version_builder.Apply(&version_edit));
+ ASSERT_OK(version_builder.SaveTo(&new_vstorage));
+
+ UpdateVersionStorageInfo(&new_vstorage);
+
+ VersionBuilder version_builder2(env_options, &ioptions_, table_cache,
+ &new_vstorage, version_set);
+ VersionStorageInfo new_vstorage2(&icmp_, ucmp_, options_.num_levels,
+ kCompactionStyleLevel, nullptr,
+ true /* force_consistency_checks */);
+ ASSERT_NOK(version_builder2.Apply(&version_edit));
+
+ UnrefFilesInVersion(&new_vstorage);
+ UnrefFilesInVersion(&new_vstorage2);
+}
+
+TEST_F(VersionBuilderTest, EstimatedActiveKeys) {
+ const uint32_t kTotalSamples = 20;
+ const uint32_t kNumLevels = 5;
+ const uint32_t kFilesPerLevel = 8;
+ const uint32_t kNumFiles = kNumLevels * kFilesPerLevel;
+ const uint32_t kEntriesPerFile = 1000;
+ const uint32_t kDeletionsPerFile = 100;
+ for (uint32_t i = 0; i < kNumFiles; ++i) {
+ Add(static_cast<int>(i / kFilesPerLevel), i + 1,
+ std::to_string((i + 100) * 1000).c_str(),
+ std::to_string((i + 100) * 1000 + 999).c_str(), 100U, 0, 100, 100,
+ kEntriesPerFile, kDeletionsPerFile, (i < kTotalSamples));
+ }
+ // Subtract 2x the number of deletion entries because:
+ // 1x: a deletion entry does not count as a data entry itself.
+ // 1x: each deletion entry removes one existing data entry.
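+ // For example, with 1000 entries and 100 deletions per file, each file is
+ // expected to contribute 1000 - 2 * 100 = 800 active keys.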
+ ASSERT_EQ(vstorage_.GetEstimatedActiveKeys(),
+ (kEntriesPerFile - 2 * kDeletionsPerFile) * kNumFiles);
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/version_edit.cc b/src/rocksdb/db/version_edit.cc
new file mode 100644
index 000000000..e4e02fe25
--- /dev/null
+++ b/src/rocksdb/db/version_edit.cc
@@ -0,0 +1,1043 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/version_edit.h"
+
+#include "db/blob/blob_index.h"
+#include "db/version_set.h"
+#include "logging/event_logger.h"
+#include "rocksdb/slice.h"
+#include "table/unique_id_impl.h"
+#include "test_util/sync_point.h"
+#include "util/coding.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {} // anonymous namespace
+
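+ // Packs the path id into the bits above kFileNumberMask; the assertion
+ // guarantees the file number fits entirely within the mask.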
+uint64_t PackFileNumberAndPathId(uint64_t number, uint64_t path_id) {
+ assert(number <= kFileNumberMask);
+ return number | (path_id * (kFileNumberMask + 1));
+}
+
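+ // Extends the file's key range and sequence number bounds with the new
+ // entry and, for blob references, tracks the oldest blob file referenced.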
+Status FileMetaData::UpdateBoundaries(const Slice& key, const Slice& value,
+ SequenceNumber seqno,
+ ValueType value_type) {
+ if (value_type == kTypeBlobIndex) {
+ BlobIndex blob_index;
+ const Status s = blob_index.DecodeFrom(value);
+ if (!s.ok()) {
+ return s;
+ }
+
+ if (!blob_index.IsInlined() && !blob_index.HasTTL()) {
+ if (blob_index.file_number() == kInvalidBlobFileNumber) {
+ return Status::Corruption("Invalid blob file number");
+ }
+
+ if (oldest_blob_file_number == kInvalidBlobFileNumber ||
+ oldest_blob_file_number > blob_index.file_number()) {
+ oldest_blob_file_number = blob_index.file_number();
+ }
+ }
+ }
+
+ if (smallest.size() == 0) {
+ smallest.DecodeFrom(key);
+ }
+ largest.DecodeFrom(key);
+ fd.smallest_seqno = std::min(fd.smallest_seqno, seqno);
+ fd.largest_seqno = std::max(fd.largest_seqno, seqno);
+
+ return Status::OK();
+}
+
+void VersionEdit::Clear() {
+ max_level_ = 0;
+ db_id_.clear();
+ comparator_.clear();
+ log_number_ = 0;
+ prev_log_number_ = 0;
+ next_file_number_ = 0;
+ max_column_family_ = 0;
+ min_log_number_to_keep_ = 0;
+ last_sequence_ = 0;
+ has_db_id_ = false;
+ has_comparator_ = false;
+ has_log_number_ = false;
+ has_prev_log_number_ = false;
+ has_next_file_number_ = false;
+ has_max_column_family_ = false;
+ has_min_log_number_to_keep_ = false;
+ has_last_sequence_ = false;
+ compact_cursors_.clear();
+ deleted_files_.clear();
+ new_files_.clear();
+ blob_file_additions_.clear();
+ blob_file_garbages_.clear();
+ wal_additions_.clear();
+ wal_deletion_.Reset();
+ column_family_ = 0;
+ is_column_family_add_ = false;
+ is_column_family_drop_ = false;
+ column_family_name_.clear();
+ is_in_atomic_group_ = false;
+ remaining_entries_ = 0;
+ full_history_ts_low_.clear();
+}
+
+bool VersionEdit::EncodeTo(std::string* dst) const {
+ if (has_db_id_) {
+ PutVarint32(dst, kDbId);
+ PutLengthPrefixedSlice(dst, db_id_);
+ }
+ if (has_comparator_) {
+ PutVarint32(dst, kComparator);
+ PutLengthPrefixedSlice(dst, comparator_);
+ }
+ if (has_log_number_) {
+ PutVarint32Varint64(dst, kLogNumber, log_number_);
+ }
+ if (has_prev_log_number_) {
+ PutVarint32Varint64(dst, kPrevLogNumber, prev_log_number_);
+ }
+ if (has_next_file_number_) {
+ PutVarint32Varint64(dst, kNextFileNumber, next_file_number_);
+ }
+ if (has_max_column_family_) {
+ PutVarint32Varint32(dst, kMaxColumnFamily, max_column_family_);
+ }
+ if (has_min_log_number_to_keep_) {
+ PutVarint32Varint64(dst, kMinLogNumberToKeep, min_log_number_to_keep_);
+ }
+ if (has_last_sequence_) {
+ PutVarint32Varint64(dst, kLastSequence, last_sequence_);
+ }
+ for (size_t i = 0; i < compact_cursors_.size(); i++) {
+ if (compact_cursors_[i].second.Valid()) {
+ PutVarint32(dst, kCompactCursor);
+ PutVarint32(dst, compact_cursors_[i].first); // level
+ PutLengthPrefixedSlice(dst, compact_cursors_[i].second.Encode());
+ }
+ }
+ for (const auto& deleted : deleted_files_) {
+ PutVarint32Varint32Varint64(dst, kDeletedFile, deleted.first /* level */,
+ deleted.second /* file number */);
+ }
+
+ bool min_log_num_written = false;
+ for (size_t i = 0; i < new_files_.size(); i++) {
+ const FileMetaData& f = new_files_[i].second;
+ if (!f.smallest.Valid() || !f.largest.Valid()) {
+ return false;
+ }
+ PutVarint32(dst, kNewFile4);
+ PutVarint32Varint64(dst, new_files_[i].first /* level */, f.fd.GetNumber());
+ PutVarint64(dst, f.fd.GetFileSize());
+ PutLengthPrefixedSlice(dst, f.smallest.Encode());
+ PutLengthPrefixedSlice(dst, f.largest.Encode());
+ PutVarint64Varint64(dst, f.fd.smallest_seqno, f.fd.largest_seqno);
+ // Customized fields' format:
+ // +-----------------------------+
+ // | 1st field's tag (varint32) |
+ // +-----------------------------+
+ // | 1st field's size (varint32) |
+ // +-----------------------------+
+ // | bytes for 1st field |
+ // | (based on size decoded) |
+ // +-----------------------------+
+ // | |
+ // | ...... |
+ // | |
+ // +-----------------------------+
+ // | last field's size (varint32)|
+ // +-----------------------------+
+ // | bytes for last field |
+ // | (based on size decoded) |
+ // +-----------------------------+
+ // | terminating tag (varint32) |
+ // +-----------------------------+
+ //
+ // Customized encoding for fields:
+ // tag kPathId: 1 byte as path_id
+ // tag kNeedCompaction:
+ //      currently can only take one char value, 1, indicating need-compaction
+ //
+ PutVarint32(dst, NewFileCustomTag::kOldestAncesterTime);
+ std::string varint_oldest_ancester_time;
+ PutVarint64(&varint_oldest_ancester_time, f.oldest_ancester_time);
+ TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:VarintOldestAncesterTime",
+ &varint_oldest_ancester_time);
+ PutLengthPrefixedSlice(dst, Slice(varint_oldest_ancester_time));
+
+ PutVarint32(dst, NewFileCustomTag::kFileCreationTime);
+ std::string varint_file_creation_time;
+ PutVarint64(&varint_file_creation_time, f.file_creation_time);
+ TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:VarintFileCreationTime",
+ &varint_file_creation_time);
+ PutLengthPrefixedSlice(dst, Slice(varint_file_creation_time));
+
+ PutVarint32(dst, NewFileCustomTag::kFileChecksum);
+ PutLengthPrefixedSlice(dst, Slice(f.file_checksum));
+
+ PutVarint32(dst, NewFileCustomTag::kFileChecksumFuncName);
+ PutLengthPrefixedSlice(dst, Slice(f.file_checksum_func_name));
+
+ if (f.fd.GetPathId() != 0) {
+ PutVarint32(dst, NewFileCustomTag::kPathId);
+ char p = static_cast<char>(f.fd.GetPathId());
+ PutLengthPrefixedSlice(dst, Slice(&p, 1));
+ }
+ if (f.temperature != Temperature::kUnknown) {
+ PutVarint32(dst, NewFileCustomTag::kTemperature);
+ char p = static_cast<char>(f.temperature);
+ PutLengthPrefixedSlice(dst, Slice(&p, 1));
+ }
+ if (f.marked_for_compaction) {
+ PutVarint32(dst, NewFileCustomTag::kNeedCompaction);
+ char p = static_cast<char>(1);
+ PutLengthPrefixedSlice(dst, Slice(&p, 1));
+ }
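+ // Piggyback min_log_number_to_keep on the first table file entry so that
+ // readers unaware of the field can skip it safely (see the matching
+ // kMinLogNumberToKeepHack handling on the decode side).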
+ if (has_min_log_number_to_keep_ && !min_log_num_written) {
+ PutVarint32(dst, NewFileCustomTag::kMinLogNumberToKeepHack);
+ std::string varint_log_number;
+ PutFixed64(&varint_log_number, min_log_number_to_keep_);
+ PutLengthPrefixedSlice(dst, Slice(varint_log_number));
+ min_log_num_written = true;
+ }
+ if (f.oldest_blob_file_number != kInvalidBlobFileNumber) {
+ PutVarint32(dst, NewFileCustomTag::kOldestBlobFileNumber);
+ std::string oldest_blob_file_number;
+ PutVarint64(&oldest_blob_file_number, f.oldest_blob_file_number);
+ PutLengthPrefixedSlice(dst, Slice(oldest_blob_file_number));
+ }
+ UniqueId64x2 unique_id = f.unique_id;
+ TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:UniqueId", &unique_id);
+ if (unique_id != kNullUniqueId64x2) {
+ PutVarint32(dst, NewFileCustomTag::kUniqueId);
+ std::string unique_id_str = EncodeUniqueIdBytes(&unique_id);
+ PutLengthPrefixedSlice(dst, Slice(unique_id_str));
+ }
+
+ TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:NewFile4:CustomizeFields",
+ dst);
+
+ PutVarint32(dst, NewFileCustomTag::kTerminate);
+ }
+
+ for (const auto& blob_file_addition : blob_file_additions_) {
+ PutVarint32(dst, kBlobFileAddition);
+ blob_file_addition.EncodeTo(dst);
+ }
+
+ for (const auto& blob_file_garbage : blob_file_garbages_) {
+ PutVarint32(dst, kBlobFileGarbage);
+ blob_file_garbage.EncodeTo(dst);
+ }
+
+ for (const auto& wal_addition : wal_additions_) {
+ PutVarint32(dst, kWalAddition2);
+ std::string encoded;
+ wal_addition.EncodeTo(&encoded);
+ PutLengthPrefixedSlice(dst, encoded);
+ }
+
+ if (!wal_deletion_.IsEmpty()) {
+ PutVarint32(dst, kWalDeletion2);
+ std::string encoded;
+ wal_deletion_.EncodeTo(&encoded);
+ PutLengthPrefixedSlice(dst, encoded);
+ }
+
+ // 0 is default and does not need to be explicitly written
+ if (column_family_ != 0) {
+ PutVarint32Varint32(dst, kColumnFamily, column_family_);
+ }
+
+ if (is_column_family_add_) {
+ PutVarint32(dst, kColumnFamilyAdd);
+ PutLengthPrefixedSlice(dst, Slice(column_family_name_));
+ }
+
+ if (is_column_family_drop_) {
+ PutVarint32(dst, kColumnFamilyDrop);
+ }
+
+ if (is_in_atomic_group_) {
+ PutVarint32(dst, kInAtomicGroup);
+ PutVarint32(dst, remaining_entries_);
+ }
+
+ if (HasFullHistoryTsLow()) {
+ PutVarint32(dst, kFullHistoryTsLow);
+ PutLengthPrefixedSlice(dst, full_history_ts_low_);
+ }
+ return true;
+}
+
+static bool GetInternalKey(Slice* input, InternalKey* dst) {
+ Slice str;
+ if (GetLengthPrefixedSlice(input, &str)) {
+ dst->DecodeFrom(str);
+ return dst->Valid();
+ } else {
+ return false;
+ }
+}
+
+bool VersionEdit::GetLevel(Slice* input, int* level, const char** /*msg*/) {
+ uint32_t v = 0;
+ if (GetVarint32(input, &v)) {
+ *level = v;
+ if (max_level_ < *level) {
+ max_level_ = *level;
+ }
+ return true;
+ } else {
+ return false;
+ }
+}
+
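+ // Detects the placeholder kNewFile4 record (level 0, file number 0, size 0,
+ // "dummy_key" bounds) that merely carries min_log_number_to_keep; the caller
+ // returns early without adding it to new_files_.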
+static bool is_pseudo_new_file_record_pr3488(
+ const int level,
+ const uint64_t number,
+ const uint64_t file_size,
+ InternalKey& smallest,
+ InternalKey& largest,
+ const bool has_min_log_number_to_keep_) {
+
+ if (level == 0 && number == 0 && file_size == 0 &&
+ has_min_log_number_to_keep_) {
+ InternalKey dummy_key(Slice("dummy_key"), 0ull, ValueType::kTypeValue);
+ return (*smallest.rep() == *dummy_key.rep() &&
+ *largest.rep() == *dummy_key.rep());
+ } else {
+ return false;
+ }
+}
+
+const char* VersionEdit::DecodeNewFile4From(Slice* input) {
+ const char* msg = nullptr;
+ int level = 0;
+ FileMetaData f;
+ uint64_t number = 0;
+ uint32_t path_id = 0;
+ uint64_t file_size = 0;
+ SequenceNumber smallest_seqno = 0;
+ SequenceNumber largest_seqno = kMaxSequenceNumber;
+ if (GetLevel(input, &level, &msg) && GetVarint64(input, &number) &&
+ GetVarint64(input, &file_size) && GetInternalKey(input, &f.smallest) &&
+ GetInternalKey(input, &f.largest) &&
+ GetVarint64(input, &smallest_seqno) &&
+ GetVarint64(input, &largest_seqno)) {
+ // See comments in VersionEdit::EncodeTo() for format of customized fields
+ while (true) {
+ uint32_t custom_tag = 0;
+ Slice field;
+ if (!GetVarint32(input, &custom_tag)) {
+ return "new-file4 custom field";
+ }
+ if (custom_tag == kTerminate) {
+ break;
+ }
+ if (!GetLengthPrefixedSlice(input, &field)) {
+ return "new-file4 custom field length prefixed slice error";
+ }
+ switch (custom_tag) {
+ case kPathId:
+ if (field.size() != 1) {
+ return "path_id field wrong size";
+ }
+ path_id = field[0];
+ if (path_id > 3) {
+ return "path_id wrong vaue";
+ }
+ break;
+ case kOldestAncesterTime:
+ if (!GetVarint64(&field, &f.oldest_ancester_time)) {
+ return "invalid oldest ancester time";
+ }
+ break;
+ case kFileCreationTime:
+ if (!GetVarint64(&field, &f.file_creation_time)) {
+ return "invalid file creation time";
+ }
+ break;
+ case kFileChecksum:
+ f.file_checksum = field.ToString();
+ break;
+ case kFileChecksumFuncName:
+ f.file_checksum_func_name = field.ToString();
+ break;
+ case kNeedCompaction:
+ if (field.size() != 1) {
+ return "need_compaction field wrong size";
+ }
+ f.marked_for_compaction = (field[0] == 1);
+ break;
+ case kMinLogNumberToKeepHack:
+ // This is a hack to encode kMinLogNumberToKeep in a
+ // forward-compatible fashion.
+ if (!GetFixed64(&field, &min_log_number_to_keep_)) {
+ return "deleted log number malformatted";
+ }
+ has_min_log_number_to_keep_ = true;
+ break;
+ case kOldestBlobFileNumber:
+ if (!GetVarint64(&field, &f.oldest_blob_file_number)) {
+ return "invalid oldest blob file number";
+ }
+ break;
+ case kTemperature:
+ if (field.size() != 1) {
+ return "temperature field wrong size";
+ } else {
+ Temperature casted_field = static_cast<Temperature>(field[0]);
+ if (casted_field <= Temperature::kCold) {
+ f.temperature = casted_field;
+ }
+ }
+ break;
+ case kUniqueId:
+ if (!DecodeUniqueIdBytes(field.ToString(), &f.unique_id).ok()) {
+ f.unique_id = kNullUniqueId64x2;
+ return "invalid unique id";
+ }
+ break;
+ default:
+ if ((custom_tag & kCustomTagNonSafeIgnoreMask) != 0) {
+ // Should not proceed if we cannot understand the field
+ return "new-file4 custom field not supported";
+ }
+ break;
+ }
+ }
+ } else {
+ return "new-file4 entry";
+ }
+ if (is_pseudo_new_file_record_pr3488(level, number, file_size,
+ f.smallest, f.largest,
+ has_min_log_number_to_keep_)) {
+ // Since this has nothing to do with NewFile, return immediately.
+ return nullptr;
+ }
+ f.fd =
+ FileDescriptor(number, path_id, file_size, smallest_seqno, largest_seqno);
+ new_files_.push_back(std::make_pair(level, f));
+ return nullptr;
+}
+
+Status VersionEdit::DecodeFrom(const Slice& src) {
+ Clear();
+#ifndef NDEBUG
+ bool ignore_ignorable_tags = false;
+ TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:IgnoreIgnorableTags",
+ &ignore_ignorable_tags);
+#endif
+ Slice input = src;
+ const char* msg = nullptr;
+ uint32_t tag = 0;
+
+ // Temporary storage for parsing
+ int level = 0;
+ FileMetaData f;
+ Slice str;
+ InternalKey key;
+ while (msg == nullptr && GetVarint32(&input, &tag)) {
+#ifndef NDEBUG
+ if (ignore_ignorable_tags && tag > kTagSafeIgnoreMask) {
+ tag = kTagSafeIgnoreMask;
+ }
+#endif
+ switch (tag) {
+ case kDbId:
+ if (GetLengthPrefixedSlice(&input, &str)) {
+ db_id_ = str.ToString();
+ has_db_id_ = true;
+ } else {
+ msg = "db id";
+ }
+ break;
+ case kComparator:
+ if (GetLengthPrefixedSlice(&input, &str)) {
+ comparator_ = str.ToString();
+ has_comparator_ = true;
+ } else {
+ msg = "comparator name";
+ }
+ break;
+
+ case kLogNumber:
+ if (GetVarint64(&input, &log_number_)) {
+ has_log_number_ = true;
+ } else {
+ msg = "log number";
+ }
+ break;
+
+ case kPrevLogNumber:
+ if (GetVarint64(&input, &prev_log_number_)) {
+ has_prev_log_number_ = true;
+ } else {
+ msg = "previous log number";
+ }
+ break;
+
+ case kNextFileNumber:
+ if (GetVarint64(&input, &next_file_number_)) {
+ has_next_file_number_ = true;
+ } else {
+ msg = "next file number";
+ }
+ break;
+
+ case kMaxColumnFamily:
+ if (GetVarint32(&input, &max_column_family_)) {
+ has_max_column_family_ = true;
+ } else {
+ msg = "max column family";
+ }
+ break;
+
+ case kMinLogNumberToKeep:
+ if (GetVarint64(&input, &min_log_number_to_keep_)) {
+ has_min_log_number_to_keep_ = true;
+ } else {
+ msg = "min log number to kee";
+ }
+ break;
+
+ case kLastSequence:
+ if (GetVarint64(&input, &last_sequence_)) {
+ has_last_sequence_ = true;
+ } else {
+ msg = "last sequence number";
+ }
+ break;
+
+ case kCompactCursor:
+ if (GetLevel(&input, &level, &msg) && GetInternalKey(&input, &key)) {
+ // Here we re-use the output format of compact pointer in LevelDB
+ // to persist compact_cursors_
+ compact_cursors_.push_back(std::make_pair(level, key));
+ } else {
+ if (!msg) {
+ msg = "compaction cursor";
+ }
+ }
+ break;
+
+ case kDeletedFile: {
+ uint64_t number = 0;
+ if (GetLevel(&input, &level, &msg) && GetVarint64(&input, &number)) {
+ deleted_files_.insert(std::make_pair(level, number));
+ } else {
+ if (!msg) {
+ msg = "deleted file";
+ }
+ }
+ break;
+ }
+
+ case kNewFile: {
+ uint64_t number = 0;
+ uint64_t file_size = 0;
+ if (GetLevel(&input, &level, &msg) && GetVarint64(&input, &number) &&
+ GetVarint64(&input, &file_size) &&
+ GetInternalKey(&input, &f.smallest) &&
+ GetInternalKey(&input, &f.largest)) {
+ f.fd = FileDescriptor(number, 0, file_size);
+ new_files_.push_back(std::make_pair(level, f));
+ } else {
+ if (!msg) {
+ msg = "new-file entry";
+ }
+ }
+ break;
+ }
+ case kNewFile2: {
+ uint64_t number = 0;
+ uint64_t file_size = 0;
+ SequenceNumber smallest_seqno = 0;
+ SequenceNumber largest_seqno = kMaxSequenceNumber;
+ if (GetLevel(&input, &level, &msg) && GetVarint64(&input, &number) &&
+ GetVarint64(&input, &file_size) &&
+ GetInternalKey(&input, &f.smallest) &&
+ GetInternalKey(&input, &f.largest) &&
+ GetVarint64(&input, &smallest_seqno) &&
+ GetVarint64(&input, &largest_seqno)) {
+ f.fd = FileDescriptor(number, 0, file_size, smallest_seqno,
+ largest_seqno);
+ new_files_.push_back(std::make_pair(level, f));
+ } else {
+ if (!msg) {
+ msg = "new-file2 entry";
+ }
+ }
+ break;
+ }
+
+ case kNewFile3: {
+ uint64_t number = 0;
+ uint32_t path_id = 0;
+ uint64_t file_size = 0;
+ SequenceNumber smallest_seqno = 0;
+ SequenceNumber largest_seqno = kMaxSequenceNumber;
+ if (GetLevel(&input, &level, &msg) && GetVarint64(&input, &number) &&
+ GetVarint32(&input, &path_id) && GetVarint64(&input, &file_size) &&
+ GetInternalKey(&input, &f.smallest) &&
+ GetInternalKey(&input, &f.largest) &&
+ GetVarint64(&input, &smallest_seqno) &&
+ GetVarint64(&input, &largest_seqno)) {
+ f.fd = FileDescriptor(number, path_id, file_size, smallest_seqno,
+ largest_seqno);
+ new_files_.push_back(std::make_pair(level, f));
+ } else {
+ if (!msg) {
+ msg = "new-file3 entry";
+ }
+ }
+ break;
+ }
+
+ case kNewFile4: {
+ msg = DecodeNewFile4From(&input);
+ break;
+ }
+
+ case kBlobFileAddition:
+ case kBlobFileAddition_DEPRECATED: {
+ BlobFileAddition blob_file_addition;
+ const Status s = blob_file_addition.DecodeFrom(&input);
+ if (!s.ok()) {
+ return s;
+ }
+
+ AddBlobFile(std::move(blob_file_addition));
+ break;
+ }
+
+ case kBlobFileGarbage:
+ case kBlobFileGarbage_DEPRECATED: {
+ BlobFileGarbage blob_file_garbage;
+ const Status s = blob_file_garbage.DecodeFrom(&input);
+ if (!s.ok()) {
+ return s;
+ }
+
+ AddBlobFileGarbage(std::move(blob_file_garbage));
+ break;
+ }
+
+ case kWalAddition: {
+ WalAddition wal_addition;
+ const Status s = wal_addition.DecodeFrom(&input);
+ if (!s.ok()) {
+ return s;
+ }
+
+ wal_additions_.emplace_back(std::move(wal_addition));
+ break;
+ }
+
+ case kWalAddition2: {
+ Slice encoded;
+ if (!GetLengthPrefixedSlice(&input, &encoded)) {
+ msg = "WalAddition not prefixed by length";
+ break;
+ }
+
+ WalAddition wal_addition;
+ const Status s = wal_addition.DecodeFrom(&encoded);
+ if (!s.ok()) {
+ return s;
+ }
+
+ wal_additions_.emplace_back(std::move(wal_addition));
+ break;
+ }
+
+ case kWalDeletion: {
+ WalDeletion wal_deletion;
+ const Status s = wal_deletion.DecodeFrom(&input);
+ if (!s.ok()) {
+ return s;
+ }
+
+ wal_deletion_ = std::move(wal_deletion);
+ break;
+ }
+
+ case kWalDeletion2: {
+ Slice encoded;
+ if (!GetLengthPrefixedSlice(&input, &encoded)) {
+ msg = "WalDeletion not prefixed by length";
+ break;
+ }
+
+ WalDeletion wal_deletion;
+ const Status s = wal_deletion.DecodeFrom(&encoded);
+ if (!s.ok()) {
+ return s;
+ }
+
+ wal_deletion_ = std::move(wal_deletion);
+ break;
+ }
+
+ case kColumnFamily:
+ if (!GetVarint32(&input, &column_family_)) {
+ if (!msg) {
+ msg = "set column family id";
+ }
+ }
+ break;
+
+ case kColumnFamilyAdd:
+ if (GetLengthPrefixedSlice(&input, &str)) {
+ is_column_family_add_ = true;
+ column_family_name_ = str.ToString();
+ } else {
+ if (!msg) {
+ msg = "column family add";
+ }
+ }
+ break;
+
+ case kColumnFamilyDrop:
+ is_column_family_drop_ = true;
+ break;
+
+ case kInAtomicGroup:
+ is_in_atomic_group_ = true;
+ if (!GetVarint32(&input, &remaining_entries_)) {
+ if (!msg) {
+ msg = "remaining entries";
+ }
+ }
+ break;
+
+ case kFullHistoryTsLow:
+ if (!GetLengthPrefixedSlice(&input, &str)) {
+ msg = "full_history_ts_low";
+ } else if (str.empty()) {
+ msg = "full_history_ts_low: empty";
+ } else {
+ full_history_ts_low_.assign(str.data(), str.size());
+ }
+ break;
+
+ default:
+ if (tag & kTagSafeIgnoreMask) {
+ // Tag from the future that can be safely ignored.
+ // The next field must be the length of the entry.
+ uint32_t field_len;
+ if (!GetVarint32(&input, &field_len) ||
+ static_cast<size_t>(field_len) > input.size()) {
+ if (!msg) {
+ msg = "safely ignoreable tag length error";
+ }
+ } else {
+ input.remove_prefix(static_cast<size_t>(field_len));
+ }
+ } else {
+ msg = "unknown tag";
+ }
+ break;
+ }
+ }
+
+ if (msg == nullptr && !input.empty()) {
+ msg = "invalid tag";
+ }
+
+ Status result;
+ if (msg != nullptr) {
+ result = Status::Corruption("VersionEdit", msg);
+ }
+ return result;
+}
+
+std::string VersionEdit::DebugString(bool hex_key) const {
+ std::string r;
+ r.append("VersionEdit {");
+ if (has_db_id_) {
+ r.append("\n DB ID: ");
+ r.append(db_id_);
+ }
+ if (has_comparator_) {
+ r.append("\n Comparator: ");
+ r.append(comparator_);
+ }
+ if (has_log_number_) {
+ r.append("\n LogNumber: ");
+ AppendNumberTo(&r, log_number_);
+ }
+ if (has_prev_log_number_) {
+ r.append("\n PrevLogNumber: ");
+ AppendNumberTo(&r, prev_log_number_);
+ }
+ if (has_next_file_number_) {
+ r.append("\n NextFileNumber: ");
+ AppendNumberTo(&r, next_file_number_);
+ }
+ if (has_max_column_family_) {
+ r.append("\n MaxColumnFamily: ");
+ AppendNumberTo(&r, max_column_family_);
+ }
+ if (has_min_log_number_to_keep_) {
+ r.append("\n MinLogNumberToKeep: ");
+ AppendNumberTo(&r, min_log_number_to_keep_);
+ }
+ if (has_last_sequence_) {
+ r.append("\n LastSeq: ");
+ AppendNumberTo(&r, last_sequence_);
+ }
+ for (const auto& level_and_compact_cursor : compact_cursors_) {
+ r.append("\n CompactCursor: ");
+ AppendNumberTo(&r, level_and_compact_cursor.first);
+ r.append(" ");
+ r.append(level_and_compact_cursor.second.DebugString(hex_key));
+ }
+ for (const auto& deleted_file : deleted_files_) {
+ r.append("\n DeleteFile: ");
+ AppendNumberTo(&r, deleted_file.first);
+ r.append(" ");
+ AppendNumberTo(&r, deleted_file.second);
+ }
+ for (size_t i = 0; i < new_files_.size(); i++) {
+ const FileMetaData& f = new_files_[i].second;
+ r.append("\n AddFile: ");
+ AppendNumberTo(&r, new_files_[i].first);
+ r.append(" ");
+ AppendNumberTo(&r, f.fd.GetNumber());
+ r.append(" ");
+ AppendNumberTo(&r, f.fd.GetFileSize());
+ r.append(" ");
+ r.append(f.smallest.DebugString(hex_key));
+ r.append(" .. ");
+ r.append(f.largest.DebugString(hex_key));
+ if (f.oldest_blob_file_number != kInvalidBlobFileNumber) {
+ r.append(" blob_file:");
+ AppendNumberTo(&r, f.oldest_blob_file_number);
+ }
+ r.append(" oldest_ancester_time:");
+ AppendNumberTo(&r, f.oldest_ancester_time);
+ r.append(" file_creation_time:");
+ AppendNumberTo(&r, f.file_creation_time);
+ r.append(" file_checksum:");
+ r.append(Slice(f.file_checksum).ToString(true));
+ r.append(" file_checksum_func_name: ");
+ r.append(f.file_checksum_func_name);
+ if (f.temperature != Temperature::kUnknown) {
+ r.append(" temperature: ");
+      // Maybe change to a human-readable format when the feature becomes
+      // permanent.
+ r.append(std::to_string(static_cast<int>(f.temperature)));
+ }
+ if (f.unique_id != kNullUniqueId64x2) {
+ r.append(" unique_id(internal): ");
+ UniqueId64x2 id = f.unique_id;
+ r.append(InternalUniqueIdToHumanString(&id));
+ r.append(" public_unique_id: ");
+ InternalUniqueIdToExternal(&id);
+ r.append(UniqueIdToHumanString(EncodeUniqueIdBytes(&id)));
+ }
+ }
+
+ for (const auto& blob_file_addition : blob_file_additions_) {
+ r.append("\n BlobFileAddition: ");
+ r.append(blob_file_addition.DebugString());
+ }
+
+ for (const auto& blob_file_garbage : blob_file_garbages_) {
+ r.append("\n BlobFileGarbage: ");
+ r.append(blob_file_garbage.DebugString());
+ }
+
+ for (const auto& wal_addition : wal_additions_) {
+ r.append("\n WalAddition: ");
+ r.append(wal_addition.DebugString());
+ }
+
+ if (!wal_deletion_.IsEmpty()) {
+ r.append("\n WalDeletion: ");
+ r.append(wal_deletion_.DebugString());
+ }
+
+ r.append("\n ColumnFamily: ");
+ AppendNumberTo(&r, column_family_);
+ if (is_column_family_add_) {
+ r.append("\n ColumnFamilyAdd: ");
+ r.append(column_family_name_);
+ }
+ if (is_column_family_drop_) {
+ r.append("\n ColumnFamilyDrop");
+ }
+ if (is_in_atomic_group_) {
+ r.append("\n AtomicGroup: ");
+ AppendNumberTo(&r, remaining_entries_);
+    r.append(" entries remain");
+ }
+ if (HasFullHistoryTsLow()) {
+ r.append("\n FullHistoryTsLow: ");
+ r.append(Slice(full_history_ts_low_).ToString(hex_key));
+ }
+ r.append("\n}\n");
+ return r;
+}
+
+std::string VersionEdit::DebugJSON(int edit_num, bool hex_key) const {
+ JSONWriter jw;
+ jw << "EditNumber" << edit_num;
+
+ if (has_db_id_) {
+ jw << "DB ID" << db_id_;
+ }
+ if (has_comparator_) {
+ jw << "Comparator" << comparator_;
+ }
+ if (has_log_number_) {
+ jw << "LogNumber" << log_number_;
+ }
+ if (has_prev_log_number_) {
+ jw << "PrevLogNumber" << prev_log_number_;
+ }
+ if (has_next_file_number_) {
+ jw << "NextFileNumber" << next_file_number_;
+ }
+ if (has_max_column_family_) {
+ jw << "MaxColumnFamily" << max_column_family_;
+ }
+ if (has_min_log_number_to_keep_) {
+ jw << "MinLogNumberToKeep" << min_log_number_to_keep_;
+ }
+ if (has_last_sequence_) {
+ jw << "LastSeq" << last_sequence_;
+ }
+
+ if (!deleted_files_.empty()) {
+ jw << "DeletedFiles";
+ jw.StartArray();
+
+ for (const auto& deleted_file : deleted_files_) {
+ jw.StartArrayedObject();
+ jw << "Level" << deleted_file.first;
+ jw << "FileNumber" << deleted_file.second;
+ jw.EndArrayedObject();
+ }
+
+ jw.EndArray();
+ }
+
+ if (!new_files_.empty()) {
+ jw << "AddedFiles";
+ jw.StartArray();
+
+ for (size_t i = 0; i < new_files_.size(); i++) {
+ jw.StartArrayedObject();
+ jw << "Level" << new_files_[i].first;
+ const FileMetaData& f = new_files_[i].second;
+ jw << "FileNumber" << f.fd.GetNumber();
+ jw << "FileSize" << f.fd.GetFileSize();
+ jw << "SmallestIKey" << f.smallest.DebugString(hex_key);
+ jw << "LargestIKey" << f.largest.DebugString(hex_key);
+ jw << "OldestAncesterTime" << f.oldest_ancester_time;
+ jw << "FileCreationTime" << f.file_creation_time;
+ jw << "FileChecksum" << Slice(f.file_checksum).ToString(true);
+ jw << "FileChecksumFuncName" << f.file_checksum_func_name;
+ if (f.temperature != Temperature::kUnknown) {
+ jw << "temperature" << std::to_string(static_cast<int>(f.temperature));
+ }
+ if (f.oldest_blob_file_number != kInvalidBlobFileNumber) {
+ jw << "OldestBlobFile" << f.oldest_blob_file_number;
+ }
+ if (f.temperature != Temperature::kUnknown) {
+        // Maybe change to a human-readable format when the feature becomes
+        // permanent.
+ jw << "Temperature" << static_cast<int>(f.temperature);
+ }
+ jw.EndArrayedObject();
+ }
+
+ jw.EndArray();
+ }
+
+ if (!blob_file_additions_.empty()) {
+ jw << "BlobFileAdditions";
+
+ jw.StartArray();
+
+ for (const auto& blob_file_addition : blob_file_additions_) {
+ jw.StartArrayedObject();
+ jw << blob_file_addition;
+ jw.EndArrayedObject();
+ }
+
+ jw.EndArray();
+ }
+
+ if (!blob_file_garbages_.empty()) {
+ jw << "BlobFileGarbages";
+
+ jw.StartArray();
+
+ for (const auto& blob_file_garbage : blob_file_garbages_) {
+ jw.StartArrayedObject();
+ jw << blob_file_garbage;
+ jw.EndArrayedObject();
+ }
+
+ jw.EndArray();
+ }
+
+ if (!wal_additions_.empty()) {
+ jw << "WalAdditions";
+
+ jw.StartArray();
+
+ for (const auto& wal_addition : wal_additions_) {
+ jw.StartArrayedObject();
+ jw << wal_addition;
+ jw.EndArrayedObject();
+ }
+
+ jw.EndArray();
+ }
+
+ if (!wal_deletion_.IsEmpty()) {
+ jw << "WalDeletion";
+ jw.StartObject();
+ jw << wal_deletion_;
+ jw.EndObject();
+ }
+
+ jw << "ColumnFamily" << column_family_;
+
+ if (is_column_family_add_) {
+ jw << "ColumnFamilyAdd" << column_family_name_;
+ }
+ if (is_column_family_drop_) {
+ jw << "ColumnFamilyDrop" << column_family_name_;
+ }
+ if (is_in_atomic_group_) {
+ jw << "AtomicGroup" << remaining_entries_;
+ }
+
+ if (HasFullHistoryTsLow()) {
+ jw << "FullHistoryTsLow" << Slice(full_history_ts_low_).ToString(hex_key);
+ }
+
+ jw.EndObject();
+
+ return jw.Get();
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/version_edit.h b/src/rocksdb/db/version_edit.h
new file mode 100644
index 000000000..c9800a3c0
--- /dev/null
+++ b/src/rocksdb/db/version_edit.h
@@ -0,0 +1,669 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <algorithm>
+#include <set>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "db/blob/blob_file_addition.h"
+#include "db/blob/blob_file_garbage.h"
+#include "db/dbformat.h"
+#include "db/wal_edit.h"
+#include "memory/arena.h"
+#include "port/malloc.h"
+#include "rocksdb/advanced_options.h"
+#include "rocksdb/cache.h"
+#include "table/table_reader.h"
+#include "table/unique_id_impl.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Tag numbers for serialized VersionEdit. These numbers are written to
+// disk and should not be changed. The numbering should remain forward
+// compatible so that users can downgrade RocksDB safely. A tag from a future
+// version is recognized as safely ignorable by ANDing it with
+// kTagSafeIgnoreMask (a nonzero result means the record can be skipped).
+enum Tag : uint32_t {
+ kComparator = 1,
+ kLogNumber = 2,
+ kNextFileNumber = 3,
+ kLastSequence = 4,
+ kCompactCursor = 5,
+ kDeletedFile = 6,
+ kNewFile = 7,
+ // 8 was used for large value refs
+ kPrevLogNumber = 9,
+ kMinLogNumberToKeep = 10,
+
+ // these are new formats divergent from open source leveldb
+ kNewFile2 = 100,
+ kNewFile3 = 102,
+ kNewFile4 = 103, // 4th (the latest) format version of adding files
+ kColumnFamily = 200, // specify column family for version edit
+ kColumnFamilyAdd = 201,
+ kColumnFamilyDrop = 202,
+ kMaxColumnFamily = 203,
+
+ kInAtomicGroup = 300,
+
+ kBlobFileAddition = 400,
+ kBlobFileGarbage,
+
+ // Mask for an unidentified tag from the future which can be safely ignored.
+ kTagSafeIgnoreMask = 1 << 13,
+
+ // Forward compatible (aka ignorable) records
+ kDbId,
+ kBlobFileAddition_DEPRECATED,
+ kBlobFileGarbage_DEPRECATED,
+ kWalAddition,
+ kWalDeletion,
+ kFullHistoryTsLow,
+ kWalAddition2,
+ kWalDeletion2,
+};
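The forward-compatibility rule documented above can be summarized in a small standalone sketch (illustrative only, not part of this change): a tag that the current binary does not recognize may be skipped if and only if it carries the kTagSafeIgnoreMask bit, which is how DecodeFrom() treats unknown tags.

#include <cstdint>
#include <iostream>

// Mirrors kTagSafeIgnoreMask from the Tag enum above.
constexpr uint32_t kSafeIgnoreMask = 1u << 13;

// An unrecognized tag is skippable iff the safe-ignore bit is set; such
// records are encoded with a length-prefixed payload so the reader can
// advance past them.
bool IsSafelyIgnorable(uint32_t tag) { return (tag & kSafeIgnoreMask) != 0; }

int main() {
  std::cout << IsSafelyIgnorable(7) << "\n";                // kNewFile -> 0
  std::cout << IsSafelyIgnorable((1u << 13) + 42) << "\n";  // future tag -> 1
  return 0;
}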
+
+enum NewFileCustomTag : uint32_t {
+ kTerminate = 1, // The end of customized fields
+ kNeedCompaction = 2,
+ // Since Manifest is not entirely forward-compatible, we currently encode
+ // kMinLogNumberToKeep as part of NewFile as a hack. This should be removed
+ // when manifest becomes forward-compatible.
+ kMinLogNumberToKeepHack = 3,
+ kOldestBlobFileNumber = 4,
+ kOldestAncesterTime = 5,
+ kFileCreationTime = 6,
+ kFileChecksum = 7,
+ kFileChecksumFuncName = 8,
+ kTemperature = 9,
+ kMinTimestamp = 10,
+ kMaxTimestamp = 11,
+ kUniqueId = 12,
+
+ // If this bit for the custom tag is set, opening DB should fail if
+ // we don't know this field.
+ kCustomTagNonSafeIgnoreMask = 1 << 6,
+
+ // Forward incompatible (aka unignorable) fields
+ kPathId,
+};
+
+class VersionSet;
+
+constexpr uint64_t kFileNumberMask = 0x3FFFFFFFFFFFFFFF;
+constexpr uint64_t kUnknownOldestAncesterTime = 0;
+constexpr uint64_t kUnknownFileCreationTime = 0;
+
+extern uint64_t PackFileNumberAndPathId(uint64_t number, uint64_t path_id);
+
+// A copyable structure that contains the information needed to read data from
+// an SST file. It can contain a pointer to a table reader opened for the file,
+// or the file number and size, which can be used to create a new table reader
+// for it. The behavior is undefined when a copy of the structure is used after
+// the file is no longer part of any live version.
+struct FileDescriptor {
+ // Table reader in table_reader_handle
+ TableReader* table_reader;
+ uint64_t packed_number_and_path_id;
+ uint64_t file_size; // File size in bytes
+ SequenceNumber smallest_seqno; // The smallest seqno in this file
+ SequenceNumber largest_seqno; // The largest seqno in this file
+
+ FileDescriptor() : FileDescriptor(0, 0, 0) {}
+
+ FileDescriptor(uint64_t number, uint32_t path_id, uint64_t _file_size)
+ : FileDescriptor(number, path_id, _file_size, kMaxSequenceNumber, 0) {}
+
+ FileDescriptor(uint64_t number, uint32_t path_id, uint64_t _file_size,
+ SequenceNumber _smallest_seqno, SequenceNumber _largest_seqno)
+ : table_reader(nullptr),
+ packed_number_and_path_id(PackFileNumberAndPathId(number, path_id)),
+ file_size(_file_size),
+ smallest_seqno(_smallest_seqno),
+ largest_seqno(_largest_seqno) {}
+
+ FileDescriptor(const FileDescriptor& fd) { *this = fd; }
+
+ FileDescriptor& operator=(const FileDescriptor& fd) {
+ table_reader = fd.table_reader;
+ packed_number_and_path_id = fd.packed_number_and_path_id;
+ file_size = fd.file_size;
+ smallest_seqno = fd.smallest_seqno;
+ largest_seqno = fd.largest_seqno;
+ return *this;
+ }
+
+ uint64_t GetNumber() const {
+ return packed_number_and_path_id & kFileNumberMask;
+ }
+ uint32_t GetPathId() const {
+ return static_cast<uint32_t>(packed_number_and_path_id /
+ (kFileNumberMask + 1));
+ }
+ uint64_t GetFileSize() const { return file_size; }
+};
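PackFileNumberAndPathId() is only declared in this hunk, so the standalone sketch below reconstructs the packing scheme implied by kFileNumberMask, GetNumber() and GetPathId(); the Pack() helper here is a hypothetical stand-in for illustration, not the actual RocksDB implementation.

#include <cassert>
#include <cstdint>

// Copied from the constant above (file number occupies the low 62 bits).
constexpr uint64_t kMask = 0x3FFFFFFFFFFFFFFF;

// Hypothetical packing: path_id is stored in the bits above the file number,
// consistent with GetNumber() and GetPathId() above.
uint64_t Pack(uint64_t number, uint64_t path_id) {
  assert(number <= kMask);
  return number + path_id * (kMask + 1);
}

int main() {
  const uint64_t packed = Pack(/*number=*/123, /*path_id=*/2);
  assert((packed & kMask) == 123);    // what GetNumber() computes
  assert(packed / (kMask + 1) == 2);  // what GetPathId() computes
  return 0;
}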
+
+struct FileSampledStats {
+ FileSampledStats() : num_reads_sampled(0) {}
+ FileSampledStats(const FileSampledStats& other) { *this = other; }
+ FileSampledStats& operator=(const FileSampledStats& other) {
+ num_reads_sampled = other.num_reads_sampled.load();
+ return *this;
+ }
+
+ // number of user reads to this file.
+ mutable std::atomic<uint64_t> num_reads_sampled;
+};
+
+struct FileMetaData {
+ FileDescriptor fd;
+ InternalKey smallest; // Smallest internal key served by table
+ InternalKey largest; // Largest internal key served by table
+
+ // Needs to be disposed when refs becomes 0.
+ Cache::Handle* table_reader_handle = nullptr;
+
+ FileSampledStats stats;
+
+ // Stats for compensating deletion entries during compaction
+
+ // File size compensated by deletion entry.
+ // This is updated in Version::UpdateAccumulatedStats() first time when the
+ // file is created or loaded. After it is updated (!= 0), it is immutable.
+ uint64_t compensated_file_size = 0;
+ // These values can mutate, but they can only be read or written from
+ // single-threaded LogAndApply thread
+ uint64_t num_entries = 0; // the number of entries.
+ uint64_t num_deletions = 0; // the number of deletion entries.
+ uint64_t raw_key_size = 0; // total uncompressed key size.
+ uint64_t raw_value_size = 0; // total uncompressed value size.
+
+ int refs = 0; // Reference count
+
+ bool being_compacted = false; // Is this file undergoing compaction?
+ bool init_stats_from_file = false; // true if the data-entry stats of this
+ // file has initialized from file.
+
+ bool marked_for_compaction = false; // True if client asked us nicely to
+ // compact this file.
+ Temperature temperature = Temperature::kUnknown;
+
+ // Used only in BlobDB. The file number of the oldest blob file this SST file
+ // refers to. 0 is an invalid value; BlobDB numbers the files starting from 1.
+ uint64_t oldest_blob_file_number = kInvalidBlobFileNumber;
+
+  // The file could be the compaction output of other SST files, which could
+  // in turn be the outputs of compacting even older SST files. We track the
+  // memtable flush timestamp of the oldest SST file that eventually
+  // contributed data to this file. 0 means the information is not available.
+ uint64_t oldest_ancester_time = kUnknownOldestAncesterTime;
+
+ // Unix time when the SST file is created.
+ uint64_t file_creation_time = kUnknownFileCreationTime;
+
+ // File checksum
+ std::string file_checksum = kUnknownFileChecksum;
+
+ // File checksum function name
+ std::string file_checksum_func_name = kUnknownFileChecksumFuncName;
+
+ // SST unique id
+ UniqueId64x2 unique_id{};
+
+ FileMetaData() = default;
+
+ FileMetaData(uint64_t file, uint32_t file_path_id, uint64_t file_size,
+ const InternalKey& smallest_key, const InternalKey& largest_key,
+ const SequenceNumber& smallest_seq,
+ const SequenceNumber& largest_seq, bool marked_for_compact,
+ Temperature _temperature, uint64_t oldest_blob_file,
+ uint64_t _oldest_ancester_time, uint64_t _file_creation_time,
+ const std::string& _file_checksum,
+ const std::string& _file_checksum_func_name,
+ UniqueId64x2 _unique_id)
+ : fd(file, file_path_id, file_size, smallest_seq, largest_seq),
+ smallest(smallest_key),
+ largest(largest_key),
+ marked_for_compaction(marked_for_compact),
+ temperature(_temperature),
+ oldest_blob_file_number(oldest_blob_file),
+ oldest_ancester_time(_oldest_ancester_time),
+ file_creation_time(_file_creation_time),
+ file_checksum(_file_checksum),
+ file_checksum_func_name(_file_checksum_func_name),
+ unique_id(std::move(_unique_id)) {
+ TEST_SYNC_POINT_CALLBACK("FileMetaData::FileMetaData", this);
+ }
+
+ // REQUIRED: Keys must be given to the function in sorted order (it expects
+ // the last key to be the largest).
+ Status UpdateBoundaries(const Slice& key, const Slice& value,
+ SequenceNumber seqno, ValueType value_type);
+
+ // Unlike UpdateBoundaries, ranges do not need to be presented in any
+ // particular order.
+ void UpdateBoundariesForRange(const InternalKey& start,
+ const InternalKey& end, SequenceNumber seqno,
+ const InternalKeyComparator& icmp) {
+ if (smallest.size() == 0 || icmp.Compare(start, smallest) < 0) {
+ smallest = start;
+ }
+ if (largest.size() == 0 || icmp.Compare(largest, end) < 0) {
+ largest = end;
+ }
+ fd.smallest_seqno = std::min(fd.smallest_seqno, seqno);
+ fd.largest_seqno = std::max(fd.largest_seqno, seqno);
+ }
+
+ // Try to get oldest ancester time from the class itself or table properties
+ // if table reader is already pinned.
+ // 0 means the information is not available.
+ uint64_t TryGetOldestAncesterTime() {
+ if (oldest_ancester_time != kUnknownOldestAncesterTime) {
+ return oldest_ancester_time;
+ } else if (fd.table_reader != nullptr &&
+ fd.table_reader->GetTableProperties() != nullptr) {
+ return fd.table_reader->GetTableProperties()->creation_time;
+ }
+ return kUnknownOldestAncesterTime;
+ }
+
+ uint64_t TryGetFileCreationTime() {
+ if (file_creation_time != kUnknownFileCreationTime) {
+ return file_creation_time;
+ } else if (fd.table_reader != nullptr &&
+ fd.table_reader->GetTableProperties() != nullptr) {
+ return fd.table_reader->GetTableProperties()->file_creation_time;
+ }
+ return kUnknownFileCreationTime;
+ }
+
+ // WARNING: manual update to this function is needed
+ // whenever a new string property is added to FileMetaData
+ // to reduce approximation error.
+ //
+ // TODO: eliminate the need of manually updating this function
+ // for new string properties
+ size_t ApproximateMemoryUsage() const {
+ size_t usage = 0;
+#ifdef ROCKSDB_MALLOC_USABLE_SIZE
+ usage += malloc_usable_size(const_cast<FileMetaData*>(this));
+#else
+ usage += sizeof(*this);
+#endif // ROCKSDB_MALLOC_USABLE_SIZE
+ usage += smallest.size() + largest.size() + file_checksum.size() +
+ file_checksum_func_name.size();
+ return usage;
+ }
+};
+
+// A compressed copy of file metadata that contains just the minimum data
+// needed to serve read operations, while still keeping a pointer to the full
+// metadata of the file in case it is needed.
+struct FdWithKeyRange {
+ FileDescriptor fd;
+  FileMetaData* file_metadata;  // Points to the full metadata of the file
+  Slice smallest_key;           // Slice that contains the smallest key
+  Slice largest_key;            // Slice that contains the largest key
+
+ FdWithKeyRange()
+ : fd(), file_metadata(nullptr), smallest_key(), largest_key() {}
+
+ FdWithKeyRange(FileDescriptor _fd, Slice _smallest_key, Slice _largest_key,
+ FileMetaData* _file_metadata)
+ : fd(_fd),
+ file_metadata(_file_metadata),
+ smallest_key(_smallest_key),
+ largest_key(_largest_key) {}
+};
+
+// Data structure to store an array of FdWithKeyRange in one level.
+// The actual data is guaranteed to be stored close together in memory.
+struct LevelFilesBrief {
+ size_t num_files;
+ FdWithKeyRange* files;
+ LevelFilesBrief() {
+ num_files = 0;
+ files = nullptr;
+ }
+};
+
+// The state of a DB at any given time is referred to as a Version.
+// Any modification to the Version is considered a Version Edit. A Version is
+// constructed by joining a sequence of Version Edits. Version Edits are written
+// to the MANIFEST file.
+class VersionEdit {
+ public:
+ void Clear();
+
+ void SetDBId(const std::string& db_id) {
+ has_db_id_ = true;
+ db_id_ = db_id;
+ }
+ bool HasDbId() const { return has_db_id_; }
+ const std::string& GetDbId() const { return db_id_; }
+
+ void SetComparatorName(const Slice& name) {
+ has_comparator_ = true;
+ comparator_ = name.ToString();
+ }
+ bool HasComparatorName() const { return has_comparator_; }
+ const std::string& GetComparatorName() const { return comparator_; }
+
+ void SetLogNumber(uint64_t num) {
+ has_log_number_ = true;
+ log_number_ = num;
+ }
+ bool HasLogNumber() const { return has_log_number_; }
+ uint64_t GetLogNumber() const { return log_number_; }
+
+ void SetPrevLogNumber(uint64_t num) {
+ has_prev_log_number_ = true;
+ prev_log_number_ = num;
+ }
+ bool HasPrevLogNumber() const { return has_prev_log_number_; }
+ uint64_t GetPrevLogNumber() const { return prev_log_number_; }
+
+ void SetNextFile(uint64_t num) {
+ has_next_file_number_ = true;
+ next_file_number_ = num;
+ }
+ bool HasNextFile() const { return has_next_file_number_; }
+ uint64_t GetNextFile() const { return next_file_number_; }
+
+ void SetMaxColumnFamily(uint32_t max_column_family) {
+ has_max_column_family_ = true;
+ max_column_family_ = max_column_family;
+ }
+ bool HasMaxColumnFamily() const { return has_max_column_family_; }
+ uint32_t GetMaxColumnFamily() const { return max_column_family_; }
+
+ void SetMinLogNumberToKeep(uint64_t num) {
+ has_min_log_number_to_keep_ = true;
+ min_log_number_to_keep_ = num;
+ }
+ bool HasMinLogNumberToKeep() const { return has_min_log_number_to_keep_; }
+ uint64_t GetMinLogNumberToKeep() const { return min_log_number_to_keep_; }
+
+ void SetLastSequence(SequenceNumber seq) {
+ has_last_sequence_ = true;
+ last_sequence_ = seq;
+ }
+ bool HasLastSequence() const { return has_last_sequence_; }
+ SequenceNumber GetLastSequence() const { return last_sequence_; }
+
+ // Delete the specified table file from the specified level.
+ void DeleteFile(int level, uint64_t file) {
+ deleted_files_.emplace(level, file);
+ }
+
+ // Retrieve the table files deleted as well as their associated levels.
+ using DeletedFiles = std::set<std::pair<int, uint64_t>>;
+ const DeletedFiles& GetDeletedFiles() const { return deleted_files_; }
+
+ // Add the specified table file at the specified level.
+ // REQUIRES: "smallest" and "largest" are smallest and largest keys in file
+ // REQUIRES: "oldest_blob_file_number" is the number of the oldest blob file
+ // referred to by this file if any, kInvalidBlobFileNumber otherwise.
+ void AddFile(int level, uint64_t file, uint32_t file_path_id,
+ uint64_t file_size, const InternalKey& smallest,
+ const InternalKey& largest, const SequenceNumber& smallest_seqno,
+ const SequenceNumber& largest_seqno, bool marked_for_compaction,
+ Temperature temperature, uint64_t oldest_blob_file_number,
+ uint64_t oldest_ancester_time, uint64_t file_creation_time,
+ const std::string& file_checksum,
+ const std::string& file_checksum_func_name,
+ const UniqueId64x2& unique_id) {
+ assert(smallest_seqno <= largest_seqno);
+ new_files_.emplace_back(
+ level,
+ FileMetaData(file, file_path_id, file_size, smallest, largest,
+ smallest_seqno, largest_seqno, marked_for_compaction,
+ temperature, oldest_blob_file_number, oldest_ancester_time,
+ file_creation_time, file_checksum, file_checksum_func_name,
+ unique_id));
+ if (!HasLastSequence() || largest_seqno > GetLastSequence()) {
+ SetLastSequence(largest_seqno);
+ }
+ }
+
+ void AddFile(int level, const FileMetaData& f) {
+ assert(f.fd.smallest_seqno <= f.fd.largest_seqno);
+ new_files_.emplace_back(level, f);
+ if (!HasLastSequence() || f.fd.largest_seqno > GetLastSequence()) {
+ SetLastSequence(f.fd.largest_seqno);
+ }
+ }
+
+ // Retrieve the table files added as well as their associated levels.
+ using NewFiles = std::vector<std::pair<int, FileMetaData>>;
+ const NewFiles& GetNewFiles() const { return new_files_; }
+
+ // Retrieve all the compact cursors
+ using CompactCursors = std::vector<std::pair<int, InternalKey>>;
+ const CompactCursors& GetCompactCursors() const { return compact_cursors_; }
+ void AddCompactCursor(int level, const InternalKey& cursor) {
+ compact_cursors_.push_back(std::make_pair(level, cursor));
+ }
+ void SetCompactCursors(
+ const std::vector<InternalKey>& compact_cursors_by_level) {
+ compact_cursors_.clear();
+ compact_cursors_.reserve(compact_cursors_by_level.size());
+ for (int i = 0; i < (int)compact_cursors_by_level.size(); i++) {
+ if (compact_cursors_by_level[i].Valid()) {
+ compact_cursors_.push_back(
+ std::make_pair(i, compact_cursors_by_level[i]));
+ }
+ }
+ }
+
+ // Add a new blob file.
+ void AddBlobFile(uint64_t blob_file_number, uint64_t total_blob_count,
+ uint64_t total_blob_bytes, std::string checksum_method,
+ std::string checksum_value) {
+ blob_file_additions_.emplace_back(
+ blob_file_number, total_blob_count, total_blob_bytes,
+ std::move(checksum_method), std::move(checksum_value));
+ }
+
+ void AddBlobFile(BlobFileAddition blob_file_addition) {
+ blob_file_additions_.emplace_back(std::move(blob_file_addition));
+ }
+
+ // Retrieve all the blob files added.
+ using BlobFileAdditions = std::vector<BlobFileAddition>;
+ const BlobFileAdditions& GetBlobFileAdditions() const {
+ return blob_file_additions_;
+ }
+
+ void SetBlobFileAdditions(BlobFileAdditions blob_file_additions) {
+ assert(blob_file_additions_.empty());
+ blob_file_additions_ = std::move(blob_file_additions);
+ }
+
+  // Add garbage for an existing blob file.
+ void AddBlobFileGarbage(uint64_t blob_file_number,
+ uint64_t garbage_blob_count,
+ uint64_t garbage_blob_bytes) {
+ blob_file_garbages_.emplace_back(blob_file_number, garbage_blob_count,
+ garbage_blob_bytes);
+ }
+
+ void AddBlobFileGarbage(BlobFileGarbage blob_file_garbage) {
+ blob_file_garbages_.emplace_back(std::move(blob_file_garbage));
+ }
+
+ // Retrieve all the blob file garbage added.
+ using BlobFileGarbages = std::vector<BlobFileGarbage>;
+ const BlobFileGarbages& GetBlobFileGarbages() const {
+ return blob_file_garbages_;
+ }
+
+ void SetBlobFileGarbages(BlobFileGarbages blob_file_garbages) {
+ assert(blob_file_garbages_.empty());
+ blob_file_garbages_ = std::move(blob_file_garbages);
+ }
+
+ // Add a WAL (either just created or closed).
+ // AddWal and DeleteWalsBefore cannot be called on the same VersionEdit.
+ void AddWal(WalNumber number, WalMetadata metadata = WalMetadata()) {
+ assert(NumEntries() == wal_additions_.size());
+ wal_additions_.emplace_back(number, std::move(metadata));
+ }
+
+ // Retrieve all the added WALs.
+ const WalAdditions& GetWalAdditions() const { return wal_additions_; }
+
+ bool IsWalAddition() const { return !wal_additions_.empty(); }
+
+ // Delete a WAL (either directly deleted or archived).
+ // AddWal and DeleteWalsBefore cannot be called on the same VersionEdit.
+ void DeleteWalsBefore(WalNumber number) {
+ assert((NumEntries() == 1) == !wal_deletion_.IsEmpty());
+ wal_deletion_ = WalDeletion(number);
+ }
+
+ const WalDeletion& GetWalDeletion() const { return wal_deletion_; }
+
+ bool IsWalDeletion() const { return !wal_deletion_.IsEmpty(); }
+
+ bool IsWalManipulation() const {
+ size_t entries = NumEntries();
+ return (entries > 0) && ((entries == wal_additions_.size()) ||
+ (entries == !wal_deletion_.IsEmpty()));
+ }
+
+ // Number of edits
+ size_t NumEntries() const {
+ return new_files_.size() + deleted_files_.size() +
+ blob_file_additions_.size() + blob_file_garbages_.size() +
+ wal_additions_.size() + !wal_deletion_.IsEmpty();
+ }
+
+ void SetColumnFamily(uint32_t column_family_id) {
+ column_family_ = column_family_id;
+ }
+ uint32_t GetColumnFamily() const { return column_family_; }
+
+ // set column family ID by calling SetColumnFamily()
+ void AddColumnFamily(const std::string& name) {
+ assert(!is_column_family_drop_);
+ assert(!is_column_family_add_);
+ assert(NumEntries() == 0);
+ is_column_family_add_ = true;
+ column_family_name_ = name;
+ }
+
+ // set column family ID by calling SetColumnFamily()
+ void DropColumnFamily() {
+ assert(!is_column_family_drop_);
+ assert(!is_column_family_add_);
+ assert(NumEntries() == 0);
+ is_column_family_drop_ = true;
+ }
+
+ bool IsColumnFamilyManipulation() const {
+ return is_column_family_add_ || is_column_family_drop_;
+ }
+
+ bool IsColumnFamilyAdd() const { return is_column_family_add_; }
+
+ bool IsColumnFamilyDrop() const { return is_column_family_drop_; }
+
+ void MarkAtomicGroup(uint32_t remaining_entries) {
+ is_in_atomic_group_ = true;
+ remaining_entries_ = remaining_entries;
+ }
+ bool IsInAtomicGroup() const { return is_in_atomic_group_; }
+ uint32_t GetRemainingEntries() const { return remaining_entries_; }
+
+ bool HasFullHistoryTsLow() const { return !full_history_ts_low_.empty(); }
+ const std::string& GetFullHistoryTsLow() const {
+ assert(HasFullHistoryTsLow());
+ return full_history_ts_low_;
+ }
+ void SetFullHistoryTsLow(std::string full_history_ts_low) {
+ assert(!full_history_ts_low.empty());
+ full_history_ts_low_ = std::move(full_history_ts_low);
+ }
+
+ // return true on success.
+ bool EncodeTo(std::string* dst) const;
+ Status DecodeFrom(const Slice& src);
+
+ std::string DebugString(bool hex_key = false) const;
+ std::string DebugJSON(int edit_num, bool hex_key = false) const;
+
+ private:
+ friend class ReactiveVersionSet;
+ friend class VersionEditHandlerBase;
+ friend class ListColumnFamiliesHandler;
+ friend class VersionEditHandler;
+ friend class VersionEditHandlerPointInTime;
+ friend class DumpManifestHandler;
+ friend class VersionSet;
+ friend class Version;
+ friend class AtomicGroupReadBuffer;
+
+ bool GetLevel(Slice* input, int* level, const char** msg);
+
+ const char* DecodeNewFile4From(Slice* input);
+
+ int max_level_ = 0;
+ std::string db_id_;
+ std::string comparator_;
+ uint64_t log_number_ = 0;
+ uint64_t prev_log_number_ = 0;
+ uint64_t next_file_number_ = 0;
+ uint32_t max_column_family_ = 0;
+ // The most recent WAL log number that is deleted
+ uint64_t min_log_number_to_keep_ = 0;
+ SequenceNumber last_sequence_ = 0;
+ bool has_db_id_ = false;
+ bool has_comparator_ = false;
+ bool has_log_number_ = false;
+ bool has_prev_log_number_ = false;
+ bool has_next_file_number_ = false;
+ bool has_max_column_family_ = false;
+ bool has_min_log_number_to_keep_ = false;
+ bool has_last_sequence_ = false;
+
+ // Compaction cursors for round-robin compaction policy
+ CompactCursors compact_cursors_;
+
+ DeletedFiles deleted_files_;
+ NewFiles new_files_;
+
+ BlobFileAdditions blob_file_additions_;
+ BlobFileGarbages blob_file_garbages_;
+
+ WalAdditions wal_additions_;
+ WalDeletion wal_deletion_;
+
+ // Each version edit record should have column_family_ set
+ // If it's not set, it is default (0)
+ uint32_t column_family_ = 0;
+ // a version edit can be either column_family add or
+ // column_family drop. If it's column family add,
+ // it also includes column family name.
+ bool is_column_family_drop_ = false;
+ bool is_column_family_add_ = false;
+ std::string column_family_name_;
+
+ bool is_in_atomic_group_ = false;
+ uint32_t remaining_entries_ = 0;
+
+ std::string full_history_ts_low_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
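A typical write/read round trip through the class above, as relied on by the MANIFEST code, might look like the sketch below. It assumes compilation inside the RocksDB source tree and is illustrative rather than a standalone program; the calls used (SetComparatorName, SetLogNumber, DeleteFile, EncodeTo, DecodeFrom, DebugString) are all declared in the header above.

#include <cassert>
#include <iostream>
#include <string>

#include "db/version_edit.h"

int main() {
  ROCKSDB_NAMESPACE::VersionEdit edit;
  edit.SetComparatorName("leveldb.BytewiseComparator");
  edit.SetLogNumber(42);
  edit.DeleteFile(/*level=*/1, /*file=*/100);

  std::string record;
  const bool encoded = edit.EncodeTo(&record);  // one serialized MANIFEST record
  assert(encoded);

  ROCKSDB_NAMESPACE::VersionEdit decoded;
  const ROCKSDB_NAMESPACE::Status s = decoded.DecodeFrom(record);
  assert(s.ok());
  std::cout << decoded.DebugString(/*hex_key=*/false);
  return 0;
}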
diff --git a/src/rocksdb/db/version_edit_handler.cc b/src/rocksdb/db/version_edit_handler.cc
new file mode 100644
index 000000000..145e78789
--- /dev/null
+++ b/src/rocksdb/db/version_edit_handler.cc
@@ -0,0 +1,1002 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/version_edit_handler.h"
+
+#include <cinttypes>
+#include <sstream>
+
+#include "db/blob/blob_file_reader.h"
+#include "db/blob/blob_source.h"
+#include "logging/logging.h"
+#include "monitoring/persistent_stats_history.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+void VersionEditHandlerBase::Iterate(log::Reader& reader,
+ Status* log_read_status) {
+ Slice record;
+ std::string scratch;
+ assert(log_read_status);
+ assert(log_read_status->ok());
+
+ size_t recovered_edits = 0;
+ Status s = Initialize();
+ while (reader.LastRecordEnd() < max_manifest_read_size_ && s.ok() &&
+ reader.ReadRecord(&record, &scratch) && log_read_status->ok()) {
+ VersionEdit edit;
+ s = edit.DecodeFrom(record);
+ if (!s.ok()) {
+ break;
+ }
+
+ s = read_buffer_.AddEdit(&edit);
+ if (!s.ok()) {
+ break;
+ }
+ ColumnFamilyData* cfd = nullptr;
+ if (edit.is_in_atomic_group_) {
+ if (read_buffer_.IsFull()) {
+ for (auto& e : read_buffer_.replay_buffer()) {
+ s = ApplyVersionEdit(e, &cfd);
+ if (!s.ok()) {
+ break;
+ }
+ ++recovered_edits;
+ }
+ if (!s.ok()) {
+ break;
+ }
+ read_buffer_.Clear();
+ }
+ } else {
+ s = ApplyVersionEdit(edit, &cfd);
+ if (s.ok()) {
+ ++recovered_edits;
+ }
+ }
+ }
+ if (!log_read_status->ok()) {
+ s = *log_read_status;
+ }
+
+ CheckIterationResult(reader, &s);
+
+ if (!s.ok()) {
+ if (s.IsCorruption()) {
+      // When we find a Corruption error, something is
+      // wrong with the underlying file. In this case we
+      // want to report the filename, so we append
+      // the filename to the Corruption message.
+ assert(reader.file());
+
+ // build a new error message
+ std::stringstream message;
+ // append previous dynamic state message
+ const char* state = s.getState();
+ if (state != nullptr) {
+ message << state;
+ message << ' ';
+ }
+ // append the filename to the corruption message
+ message << "in file " << reader.file()->file_name();
+ // overwrite the status with the extended status
+ s = Status(s.code(), s.subcode(), s.severity(), message.str());
+ }
+ status_ = s;
+ }
+ TEST_SYNC_POINT_CALLBACK("VersionEditHandlerBase::Iterate:Finish",
+ &recovered_edits);
+}
+
+Status ListColumnFamiliesHandler::ApplyVersionEdit(
+ VersionEdit& edit, ColumnFamilyData** /*unused*/) {
+ Status s;
+ if (edit.is_column_family_add_) {
+ if (column_family_names_.find(edit.column_family_) !=
+ column_family_names_.end()) {
+ s = Status::Corruption("Manifest adding the same column family twice");
+ } else {
+ column_family_names_.insert(
+ {edit.column_family_, edit.column_family_name_});
+ }
+ } else if (edit.is_column_family_drop_) {
+ if (column_family_names_.find(edit.column_family_) ==
+ column_family_names_.end()) {
+ s = Status::Corruption("Manifest - dropping non-existing column family");
+ } else {
+ column_family_names_.erase(edit.column_family_);
+ }
+ }
+ return s;
+}
+
+Status FileChecksumRetriever::ApplyVersionEdit(VersionEdit& edit,
+ ColumnFamilyData** /*unused*/) {
+ for (const auto& deleted_file : edit.GetDeletedFiles()) {
+ Status s = file_checksum_list_.RemoveOneFileChecksum(deleted_file.second);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+ for (const auto& new_file : edit.GetNewFiles()) {
+ Status s = file_checksum_list_.InsertOneFileChecksum(
+ new_file.second.fd.GetNumber(), new_file.second.file_checksum,
+ new_file.second.file_checksum_func_name);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+ for (const auto& new_blob_file : edit.GetBlobFileAdditions()) {
+ std::string checksum_value = new_blob_file.GetChecksumValue();
+ std::string checksum_method = new_blob_file.GetChecksumMethod();
+ assert(checksum_value.empty() == checksum_method.empty());
+ if (checksum_method.empty()) {
+ checksum_value = kUnknownFileChecksum;
+ checksum_method = kUnknownFileChecksumFuncName;
+ }
+ Status s = file_checksum_list_.InsertOneFileChecksum(
+ new_blob_file.GetBlobFileNumber(), checksum_value, checksum_method);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+ return Status::OK();
+}
+
+VersionEditHandler::VersionEditHandler(
+ bool read_only, std::vector<ColumnFamilyDescriptor> column_families,
+ VersionSet* version_set, bool track_missing_files,
+ bool no_error_if_files_missing, const std::shared_ptr<IOTracer>& io_tracer,
+ bool skip_load_table_files)
+ : VersionEditHandlerBase(),
+ read_only_(read_only),
+ column_families_(std::move(column_families)),
+ version_set_(version_set),
+ track_missing_files_(track_missing_files),
+ no_error_if_files_missing_(no_error_if_files_missing),
+ io_tracer_(io_tracer),
+ skip_load_table_files_(skip_load_table_files),
+ initialized_(false) {
+ assert(version_set_ != nullptr);
+}
+
+Status VersionEditHandler::Initialize() {
+ Status s;
+ if (!initialized_) {
+ for (const auto& cf_desc : column_families_) {
+ name_to_options_.emplace(cf_desc.name, cf_desc.options);
+ }
+ auto default_cf_iter = name_to_options_.find(kDefaultColumnFamilyName);
+ if (default_cf_iter == name_to_options_.end()) {
+ s = Status::InvalidArgument("Default column family not specified");
+ }
+ if (s.ok()) {
+ VersionEdit default_cf_edit;
+ default_cf_edit.AddColumnFamily(kDefaultColumnFamilyName);
+ default_cf_edit.SetColumnFamily(0);
+ ColumnFamilyData* cfd =
+ CreateCfAndInit(default_cf_iter->second, default_cf_edit);
+ assert(cfd != nullptr);
+#ifdef NDEBUG
+ (void)cfd;
+#endif
+ initialized_ = true;
+ }
+ }
+ return s;
+}
+
+Status VersionEditHandler::ApplyVersionEdit(VersionEdit& edit,
+ ColumnFamilyData** cfd) {
+ Status s;
+ if (edit.is_column_family_add_) {
+ s = OnColumnFamilyAdd(edit, cfd);
+ } else if (edit.is_column_family_drop_) {
+ s = OnColumnFamilyDrop(edit, cfd);
+ } else if (edit.IsWalAddition()) {
+ s = OnWalAddition(edit);
+ } else if (edit.IsWalDeletion()) {
+ s = OnWalDeletion(edit);
+ } else {
+ s = OnNonCfOperation(edit, cfd);
+ }
+ if (s.ok()) {
+ assert(cfd != nullptr);
+ s = ExtractInfoFromVersionEdit(*cfd, edit);
+ }
+ return s;
+}
+
+Status VersionEditHandler::OnColumnFamilyAdd(VersionEdit& edit,
+ ColumnFamilyData** cfd) {
+ bool cf_in_not_found = false;
+ bool cf_in_builders = false;
+ CheckColumnFamilyId(edit, &cf_in_not_found, &cf_in_builders);
+
+ assert(cfd != nullptr);
+ *cfd = nullptr;
+ Status s;
+ if (cf_in_builders || cf_in_not_found) {
+ s = Status::Corruption("MANIFEST adding the same column family twice: " +
+ edit.column_family_name_);
+ }
+ if (s.ok()) {
+ auto cf_options = name_to_options_.find(edit.column_family_name_);
+    // Implicitly add the persistent_stats column family without requiring the
+    // user to specify it.
+ ColumnFamilyData* tmp_cfd = nullptr;
+ bool is_persistent_stats_column_family =
+ edit.column_family_name_.compare(kPersistentStatsColumnFamilyName) == 0;
+ if (cf_options == name_to_options_.end() &&
+ !is_persistent_stats_column_family) {
+ column_families_not_found_.emplace(edit.column_family_,
+ edit.column_family_name_);
+ } else {
+ if (is_persistent_stats_column_family) {
+ ColumnFamilyOptions cfo;
+ OptimizeForPersistentStats(&cfo);
+ tmp_cfd = CreateCfAndInit(cfo, edit);
+ } else {
+ tmp_cfd = CreateCfAndInit(cf_options->second, edit);
+ }
+ *cfd = tmp_cfd;
+ }
+ }
+ return s;
+}
+
+Status VersionEditHandler::OnColumnFamilyDrop(VersionEdit& edit,
+ ColumnFamilyData** cfd) {
+ bool cf_in_not_found = false;
+ bool cf_in_builders = false;
+ CheckColumnFamilyId(edit, &cf_in_not_found, &cf_in_builders);
+
+ assert(cfd != nullptr);
+ *cfd = nullptr;
+ ColumnFamilyData* tmp_cfd = nullptr;
+ Status s;
+ if (cf_in_builders) {
+ tmp_cfd = DestroyCfAndCleanup(edit);
+ } else if (cf_in_not_found) {
+ column_families_not_found_.erase(edit.column_family_);
+ } else {
+ s = Status::Corruption("MANIFEST - dropping non-existing column family");
+ }
+ *cfd = tmp_cfd;
+ return s;
+}
+
+Status VersionEditHandler::OnWalAddition(VersionEdit& edit) {
+ assert(edit.IsWalAddition());
+ return version_set_->wals_.AddWals(edit.GetWalAdditions());
+}
+
+Status VersionEditHandler::OnWalDeletion(VersionEdit& edit) {
+ assert(edit.IsWalDeletion());
+ return version_set_->wals_.DeleteWalsBefore(
+ edit.GetWalDeletion().GetLogNumber());
+}
+
+Status VersionEditHandler::OnNonCfOperation(VersionEdit& edit,
+ ColumnFamilyData** cfd) {
+ bool cf_in_not_found = false;
+ bool cf_in_builders = false;
+ CheckColumnFamilyId(edit, &cf_in_not_found, &cf_in_builders);
+
+ assert(cfd != nullptr);
+ *cfd = nullptr;
+ Status s;
+ if (!cf_in_not_found) {
+ if (!cf_in_builders) {
+ s = Status::Corruption(
+ "MANIFEST record referencing unknown column family");
+ }
+ ColumnFamilyData* tmp_cfd = nullptr;
+ if (s.ok()) {
+ auto builder_iter = builders_.find(edit.column_family_);
+ assert(builder_iter != builders_.end());
+ tmp_cfd = version_set_->GetColumnFamilySet()->GetColumnFamily(
+ edit.column_family_);
+ assert(tmp_cfd != nullptr);
+ s = MaybeCreateVersion(edit, tmp_cfd, /*force_create_version=*/false);
+ if (s.ok()) {
+ s = builder_iter->second->version_builder()->Apply(&edit);
+ }
+ }
+ *cfd = tmp_cfd;
+ }
+ return s;
+}
+
+// TODO maybe cache the computation result
+bool VersionEditHandler::HasMissingFiles() const {
+ bool ret = false;
+ for (const auto& elem : cf_to_missing_files_) {
+ const auto& missing_files = elem.second;
+ if (!missing_files.empty()) {
+ ret = true;
+ break;
+ }
+ }
+ if (!ret) {
+ for (const auto& elem : cf_to_missing_blob_files_high_) {
+ if (elem.second != kInvalidBlobFileNumber) {
+ ret = true;
+ break;
+ }
+ }
+ }
+ return ret;
+}
+
+void VersionEditHandler::CheckColumnFamilyId(const VersionEdit& edit,
+ bool* cf_in_not_found,
+ bool* cf_in_builders) const {
+ assert(cf_in_not_found != nullptr);
+ assert(cf_in_builders != nullptr);
+  // Not found means that the user didn't supply that column
+  // family option AND we encountered a column family add
+  // record. Once we encounter a column family drop record,
+  // we delete the column family from
+  // column_families_not_found.
+ bool in_not_found = column_families_not_found_.find(edit.column_family_) !=
+ column_families_not_found_.end();
+ // in builders means that user supplied that column family
+ // option AND that we encountered column family add record
+ bool in_builders = builders_.find(edit.column_family_) != builders_.end();
+ // They cannot both be true
+ assert(!(in_not_found && in_builders));
+ *cf_in_not_found = in_not_found;
+ *cf_in_builders = in_builders;
+}
+
+void VersionEditHandler::CheckIterationResult(const log::Reader& reader,
+ Status* s) {
+ assert(s != nullptr);
+ if (!s->ok()) {
+ // Do nothing here.
+ } else if (!version_edit_params_.has_log_number_ ||
+ !version_edit_params_.has_next_file_number_ ||
+ !version_edit_params_.has_last_sequence_) {
+ std::string msg("no ");
+ if (!version_edit_params_.has_log_number_) {
+ msg.append("log_file_number, ");
+ }
+ if (!version_edit_params_.has_next_file_number_) {
+ msg.append("next_file_number, ");
+ }
+ if (!version_edit_params_.has_last_sequence_) {
+ msg.append("last_sequence, ");
+ }
+ msg = msg.substr(0, msg.size() - 2);
+ msg.append(" entry in MANIFEST");
+ *s = Status::Corruption(msg);
+ }
+ // There were some column families in the MANIFEST that weren't specified
+ // in the argument. This is OK in read_only mode
+ if (s->ok() && MustOpenAllColumnFamilies() &&
+ !column_families_not_found_.empty()) {
+ std::string msg;
+ for (const auto& cf : column_families_not_found_) {
+ msg.append(", ");
+ msg.append(cf.second);
+ }
+ msg = msg.substr(2);
+ *s = Status::InvalidArgument("Column families not opened: " + msg);
+ }
+ if (s->ok()) {
+ version_set_->GetColumnFamilySet()->UpdateMaxColumnFamily(
+ version_edit_params_.max_column_family_);
+ version_set_->MarkMinLogNumberToKeep(
+ version_edit_params_.min_log_number_to_keep_);
+ version_set_->MarkFileNumberUsed(version_edit_params_.prev_log_number_);
+ version_set_->MarkFileNumberUsed(version_edit_params_.log_number_);
+ for (auto* cfd : *(version_set_->GetColumnFamilySet())) {
+ if (cfd->IsDropped()) {
+ continue;
+ }
+ auto builder_iter = builders_.find(cfd->GetID());
+ assert(builder_iter != builders_.end());
+ auto* builder = builder_iter->second->version_builder();
+ if (!builder->CheckConsistencyForNumLevels()) {
+ *s = Status::InvalidArgument(
+ "db has more levels than options.num_levels");
+ break;
+ }
+ }
+ }
+ if (s->ok()) {
+ for (auto* cfd : *(version_set_->GetColumnFamilySet())) {
+ if (cfd->IsDropped()) {
+ continue;
+ }
+ if (read_only_) {
+ cfd->table_cache()->SetTablesAreImmortal();
+ }
+ *s = LoadTables(cfd, /*prefetch_index_and_filter_in_cache=*/false,
+ /*is_initial_load=*/true);
+ if (!s->ok()) {
+ // If s is IOError::PathNotFound, then we mark the db as corrupted.
+ if (s->IsPathNotFound()) {
+ *s = Status::Corruption("Corruption: " + s->ToString());
+ }
+ break;
+ }
+ }
+ }
+ if (s->ok()) {
+ for (auto* cfd : *(version_set_->column_family_set_)) {
+ if (cfd->IsDropped()) {
+ continue;
+ }
+ assert(cfd->initialized());
+ VersionEdit edit;
+ *s = MaybeCreateVersion(edit, cfd, /*force_create_version=*/true);
+ if (!s->ok()) {
+ break;
+ }
+ }
+ }
+ if (s->ok()) {
+ version_set_->manifest_file_size_ = reader.GetReadOffset();
+ assert(version_set_->manifest_file_size_ > 0);
+ version_set_->next_file_number_.store(
+ version_edit_params_.next_file_number_ + 1);
+ SequenceNumber last_seq = version_edit_params_.last_sequence_;
+ assert(last_seq != kMaxSequenceNumber);
+ if (last_seq != kMaxSequenceNumber &&
+ last_seq > version_set_->last_allocated_sequence_.load()) {
+ version_set_->last_allocated_sequence_.store(last_seq);
+ }
+ if (last_seq != kMaxSequenceNumber &&
+ last_seq > version_set_->last_published_sequence_.load()) {
+ version_set_->last_published_sequence_.store(last_seq);
+ }
+ if (last_seq != kMaxSequenceNumber &&
+ last_seq > version_set_->last_sequence_.load()) {
+ version_set_->last_sequence_.store(last_seq);
+ }
+ if (last_seq != kMaxSequenceNumber &&
+ last_seq > version_set_->descriptor_last_sequence_) {
+ // This is the maximum last sequence of all `VersionEdit`s iterated. It
+ // may be greater than the maximum `largest_seqno` of all files in case
+ // the newest data referred to by the MANIFEST has been dropped or had its
+ // sequence number zeroed through compaction.
+ version_set_->descriptor_last_sequence_ = last_seq;
+ }
+ version_set_->prev_log_number_ = version_edit_params_.prev_log_number_;
+ }
+}
+
+ColumnFamilyData* VersionEditHandler::CreateCfAndInit(
+ const ColumnFamilyOptions& cf_options, const VersionEdit& edit) {
+ ColumnFamilyData* cfd = version_set_->CreateColumnFamily(cf_options, &edit);
+ assert(cfd != nullptr);
+ cfd->set_initialized();
+ assert(builders_.find(edit.column_family_) == builders_.end());
+ builders_.emplace(edit.column_family_,
+ VersionBuilderUPtr(new BaseReferencedVersionBuilder(cfd)));
+ if (track_missing_files_) {
+ cf_to_missing_files_.emplace(edit.column_family_,
+ std::unordered_set<uint64_t>());
+ cf_to_missing_blob_files_high_.emplace(edit.column_family_,
+ kInvalidBlobFileNumber);
+ }
+ return cfd;
+}
+
+ColumnFamilyData* VersionEditHandler::DestroyCfAndCleanup(
+ const VersionEdit& edit) {
+ auto builder_iter = builders_.find(edit.column_family_);
+ assert(builder_iter != builders_.end());
+ builders_.erase(builder_iter);
+ if (track_missing_files_) {
+ auto missing_files_iter = cf_to_missing_files_.find(edit.column_family_);
+ assert(missing_files_iter != cf_to_missing_files_.end());
+ cf_to_missing_files_.erase(missing_files_iter);
+
+ auto missing_blob_files_high_iter =
+ cf_to_missing_blob_files_high_.find(edit.column_family_);
+ assert(missing_blob_files_high_iter !=
+ cf_to_missing_blob_files_high_.end());
+ cf_to_missing_blob_files_high_.erase(missing_blob_files_high_iter);
+ }
+ ColumnFamilyData* ret =
+ version_set_->GetColumnFamilySet()->GetColumnFamily(edit.column_family_);
+ assert(ret != nullptr);
+ ret->SetDropped();
+ ret->UnrefAndTryDelete();
+ ret = nullptr;
+ return ret;
+}
+
+Status VersionEditHandler::MaybeCreateVersion(const VersionEdit& /*edit*/,
+ ColumnFamilyData* cfd,
+ bool force_create_version) {
+ assert(cfd->initialized());
+ Status s;
+ if (force_create_version) {
+ auto builder_iter = builders_.find(cfd->GetID());
+ assert(builder_iter != builders_.end());
+ auto* builder = builder_iter->second->version_builder();
+ auto* v = new Version(cfd, version_set_, version_set_->file_options_,
+ *cfd->GetLatestMutableCFOptions(), io_tracer_,
+ version_set_->current_version_number_++);
+ s = builder->SaveTo(v->storage_info());
+ if (s.ok()) {
+ // Install new version
+ v->PrepareAppend(
+ *cfd->GetLatestMutableCFOptions(),
+ !(version_set_->db_options_->skip_stats_update_on_db_open));
+ version_set_->AppendVersion(cfd, v);
+ } else {
+ delete v;
+ }
+ }
+ return s;
+}
+
+Status VersionEditHandler::LoadTables(ColumnFamilyData* cfd,
+ bool prefetch_index_and_filter_in_cache,
+ bool is_initial_load) {
+ bool skip_load_table_files = skip_load_table_files_;
+ TEST_SYNC_POINT_CALLBACK(
+ "VersionEditHandler::LoadTables:skip_load_table_files",
+ &skip_load_table_files);
+ if (skip_load_table_files) {
+ return Status::OK();
+ }
+ assert(cfd != nullptr);
+ assert(!cfd->IsDropped());
+ auto builder_iter = builders_.find(cfd->GetID());
+ assert(builder_iter != builders_.end());
+ assert(builder_iter->second != nullptr);
+ VersionBuilder* builder = builder_iter->second->version_builder();
+ assert(builder);
+ Status s = builder->LoadTableHandlers(
+ cfd->internal_stats(),
+ version_set_->db_options_->max_file_opening_threads,
+ prefetch_index_and_filter_in_cache, is_initial_load,
+ cfd->GetLatestMutableCFOptions()->prefix_extractor,
+ MaxFileSizeForL0MetaPin(*cfd->GetLatestMutableCFOptions()));
+ if ((s.IsPathNotFound() || s.IsCorruption()) && no_error_if_files_missing_) {
+ s = Status::OK();
+ }
+ if (!s.ok() && !version_set_->db_options_->paranoid_checks) {
+ s = Status::OK();
+ }
+ return s;
+}
+
+Status VersionEditHandler::ExtractInfoFromVersionEdit(ColumnFamilyData* cfd,
+ const VersionEdit& edit) {
+ Status s;
+ if (edit.has_db_id_) {
+ version_set_->db_id_ = edit.GetDbId();
+ version_edit_params_.SetDBId(edit.db_id_);
+ }
+ if (cfd != nullptr) {
+ if (edit.has_log_number_) {
+ if (cfd->GetLogNumber() > edit.log_number_) {
+ ROCKS_LOG_WARN(
+ version_set_->db_options()->info_log,
+ "MANIFEST corruption detected, but ignored - Log numbers in "
+ "records NOT monotonically increasing");
+ } else {
+ cfd->SetLogNumber(edit.log_number_);
+ version_edit_params_.SetLogNumber(edit.log_number_);
+ }
+ }
+ if (edit.has_comparator_ &&
+ edit.comparator_ != cfd->user_comparator()->Name()) {
+ if (!cf_to_cmp_names_) {
+ s = Status::InvalidArgument(
+ cfd->user_comparator()->Name(),
+ "does not match existing comparator " + edit.comparator_);
+ } else {
+ cf_to_cmp_names_->emplace(cfd->GetID(), edit.comparator_);
+ }
+ }
+ if (edit.HasFullHistoryTsLow()) {
+ const std::string& new_ts = edit.GetFullHistoryTsLow();
+ cfd->SetFullHistoryTsLow(new_ts);
+ }
+ }
+
+ if (s.ok()) {
+ if (edit.has_prev_log_number_) {
+ version_edit_params_.SetPrevLogNumber(edit.prev_log_number_);
+ }
+ if (edit.has_next_file_number_) {
+ version_edit_params_.SetNextFile(edit.next_file_number_);
+ }
+ if (edit.has_max_column_family_) {
+ version_edit_params_.SetMaxColumnFamily(edit.max_column_family_);
+ }
+ if (edit.has_min_log_number_to_keep_) {
+ version_edit_params_.min_log_number_to_keep_ =
+ std::max(version_edit_params_.min_log_number_to_keep_,
+ edit.min_log_number_to_keep_);
+ }
+ if (edit.has_last_sequence_) {
+ // `VersionEdit::last_sequence_`s are assumed to be non-decreasing. This
+ // is legacy behavior that cannot change without breaking downgrade
+ // compatibility.
+ assert(!version_edit_params_.has_last_sequence_ ||
+ version_edit_params_.last_sequence_ <= edit.last_sequence_);
+ version_edit_params_.SetLastSequence(edit.last_sequence_);
+ }
+ if (!version_edit_params_.has_prev_log_number_) {
+ version_edit_params_.SetPrevLogNumber(0);
+ }
+ }
+ return s;
+}
+
+VersionEditHandlerPointInTime::VersionEditHandlerPointInTime(
+ bool read_only, std::vector<ColumnFamilyDescriptor> column_families,
+ VersionSet* version_set, const std::shared_ptr<IOTracer>& io_tracer)
+ : VersionEditHandler(read_only, column_families, version_set,
+ /*track_missing_files=*/true,
+ /*no_error_if_files_missing=*/true, io_tracer) {}
+
+VersionEditHandlerPointInTime::~VersionEditHandlerPointInTime() {
+ for (const auto& elem : versions_) {
+ delete elem.second;
+ }
+ versions_.clear();
+}
+
+void VersionEditHandlerPointInTime::CheckIterationResult(
+ const log::Reader& reader, Status* s) {
+ VersionEditHandler::CheckIterationResult(reader, s);
+ assert(s != nullptr);
+ if (s->ok()) {
+ for (auto* cfd : *(version_set_->column_family_set_)) {
+ if (cfd->IsDropped()) {
+ continue;
+ }
+ assert(cfd->initialized());
+ auto v_iter = versions_.find(cfd->GetID());
+ if (v_iter != versions_.end()) {
+ assert(v_iter->second != nullptr);
+
+ version_set_->AppendVersion(cfd, v_iter->second);
+ versions_.erase(v_iter);
+ }
+ }
+ } else {
+ for (const auto& elem : versions_) {
+ delete elem.second;
+ }
+ versions_.clear();
+ }
+}
+
+ColumnFamilyData* VersionEditHandlerPointInTime::DestroyCfAndCleanup(
+ const VersionEdit& edit) {
+ ColumnFamilyData* cfd = VersionEditHandler::DestroyCfAndCleanup(edit);
+ auto v_iter = versions_.find(edit.column_family_);
+ if (v_iter != versions_.end()) {
+ delete v_iter->second;
+ versions_.erase(v_iter);
+ }
+ return cfd;
+}
+
+Status VersionEditHandlerPointInTime::MaybeCreateVersion(
+ const VersionEdit& edit, ColumnFamilyData* cfd, bool force_create_version) {
+ assert(cfd != nullptr);
+ if (!force_create_version) {
+ assert(edit.column_family_ == cfd->GetID());
+ }
+ auto missing_files_iter = cf_to_missing_files_.find(cfd->GetID());
+ assert(missing_files_iter != cf_to_missing_files_.end());
+ std::unordered_set<uint64_t>& missing_files = missing_files_iter->second;
+
+ auto missing_blob_files_high_iter =
+ cf_to_missing_blob_files_high_.find(cfd->GetID());
+ assert(missing_blob_files_high_iter != cf_to_missing_blob_files_high_.end());
+ const uint64_t prev_missing_blob_file_high =
+ missing_blob_files_high_iter->second;
+
+ VersionBuilder* builder = nullptr;
+
+ if (prev_missing_blob_file_high != kInvalidBlobFileNumber) {
+ auto builder_iter = builders_.find(cfd->GetID());
+ assert(builder_iter != builders_.end());
+ builder = builder_iter->second->version_builder();
+ assert(builder != nullptr);
+ }
+
+ // At this point, we have not yet applied the new version edits read from the
+ // MANIFEST. We check whether we have any missing table and blob files.
+ const bool prev_has_missing_files =
+ !missing_files.empty() ||
+ (prev_missing_blob_file_high != kInvalidBlobFileNumber &&
+ prev_missing_blob_file_high >= builder->GetMinOldestBlobFileNumber());
+
+ for (const auto& file : edit.GetDeletedFiles()) {
+ uint64_t file_num = file.second;
+ auto fiter = missing_files.find(file_num);
+ if (fiter != missing_files.end()) {
+ missing_files.erase(fiter);
+ }
+ }
+
+ assert(!cfd->ioptions()->cf_paths.empty());
+ Status s;
+ for (const auto& elem : edit.GetNewFiles()) {
+ int level = elem.first;
+ const FileMetaData& meta = elem.second;
+ const FileDescriptor& fd = meta.fd;
+ uint64_t file_num = fd.GetNumber();
+ const std::string fpath =
+ MakeTableFileName(cfd->ioptions()->cf_paths[0].path, file_num);
+ s = VerifyFile(cfd, fpath, level, meta);
+ if (s.IsPathNotFound() || s.IsNotFound() || s.IsCorruption()) {
+ missing_files.insert(file_num);
+ s = Status::OK();
+ } else if (!s.ok()) {
+ break;
+ }
+ }
+
+ uint64_t missing_blob_file_num = prev_missing_blob_file_high;
+ for (const auto& elem : edit.GetBlobFileAdditions()) {
+ uint64_t file_num = elem.GetBlobFileNumber();
+ s = VerifyBlobFile(cfd, file_num, elem);
+ if (s.IsPathNotFound() || s.IsNotFound() || s.IsCorruption()) {
+ missing_blob_file_num = std::max(missing_blob_file_num, file_num);
+ s = Status::OK();
+ } else if (!s.ok()) {
+ break;
+ }
+ }
+
+ bool has_missing_blob_files = false;
+ if (missing_blob_file_num != kInvalidBlobFileNumber &&
+ missing_blob_file_num >= prev_missing_blob_file_high) {
+ missing_blob_files_high_iter->second = missing_blob_file_num;
+ has_missing_blob_files = true;
+ } else if (missing_blob_file_num < prev_missing_blob_file_high) {
+ assert(false);
+ }
+
+ // We still have not applied the new version edit, but have tried to add new
+ // table and blob files after verifying their presence and consistency.
+ // Therefore, we know whether we will see new missing table and blob files
+ // later after actually applying the version edit. We perform the check here
+ // and record the result.
+ const bool has_missing_files =
+ !missing_files.empty() || has_missing_blob_files;
+
+ bool missing_info = !version_edit_params_.has_log_number_ ||
+ !version_edit_params_.has_next_file_number_ ||
+ !version_edit_params_.has_last_sequence_;
+
+  // Create the version before applying the edit. The version will represent
+  // the state before the version edit is applied.
+  // A new version will be created if:
+  // 1) no error has occurred so far, and
+  // 2) log_number_, next_file_number_ and last_sequence_ are known, and
+  // 3) any of the following holds:
+  //   a) there was no missing file before, but there will be missing file(s)
+  //      after applying this version edit, or
+  //   b) there is no missing file after applying the version edit, and the
+  //      caller explicitly requests that a new version be created.
+ if (s.ok() && !missing_info &&
+ ((has_missing_files && !prev_has_missing_files) ||
+ (!has_missing_files && force_create_version))) {
+ if (!builder) {
+ auto builder_iter = builders_.find(cfd->GetID());
+ assert(builder_iter != builders_.end());
+ builder = builder_iter->second->version_builder();
+ assert(builder);
+ }
+
+ auto* version = new Version(cfd, version_set_, version_set_->file_options_,
+ *cfd->GetLatestMutableCFOptions(), io_tracer_,
+ version_set_->current_version_number_++);
+ s = builder->LoadTableHandlers(
+ cfd->internal_stats(),
+ version_set_->db_options_->max_file_opening_threads, false, true,
+ cfd->GetLatestMutableCFOptions()->prefix_extractor,
+ MaxFileSizeForL0MetaPin(*cfd->GetLatestMutableCFOptions()));
+ if (!s.ok()) {
+ delete version;
+ if (s.IsCorruption()) {
+ s = Status::OK();
+ }
+ return s;
+ }
+ s = builder->SaveTo(version->storage_info());
+ if (s.ok()) {
+ version->PrepareAppend(
+ *cfd->GetLatestMutableCFOptions(),
+ !version_set_->db_options_->skip_stats_update_on_db_open);
+ auto v_iter = versions_.find(cfd->GetID());
+ if (v_iter != versions_.end()) {
+ delete v_iter->second;
+ v_iter->second = version;
+ } else {
+ versions_.emplace(cfd->GetID(), version);
+ }
+ } else {
+ delete version;
+ }
+ }
+ return s;
+}
+
+Status VersionEditHandlerPointInTime::VerifyFile(ColumnFamilyData* cfd,
+ const std::string& fpath,
+ int level,
+ const FileMetaData& fmeta) {
+ return version_set_->VerifyFileMetadata(cfd, fpath, level, fmeta);
+}
+
+Status VersionEditHandlerPointInTime::VerifyBlobFile(
+ ColumnFamilyData* cfd, uint64_t blob_file_num,
+ const BlobFileAddition& blob_addition) {
+ BlobSource* blob_source = cfd->blob_source();
+ assert(blob_source);
+ CacheHandleGuard<BlobFileReader> blob_file_reader;
+ Status s = blob_source->GetBlobFileReader(blob_file_num, &blob_file_reader);
+ if (!s.ok()) {
+ return s;
+ }
+ // TODO: verify checksum
+ (void)blob_addition;
+ return s;
+}
+
+Status VersionEditHandlerPointInTime::LoadTables(
+ ColumnFamilyData* /*cfd*/, bool /*prefetch_index_and_filter_in_cache*/,
+ bool /*is_initial_load*/) {
+ return Status::OK();
+}
+
+Status ManifestTailer::Initialize() {
+ if (Mode::kRecovery == mode_) {
+ return VersionEditHandler::Initialize();
+ }
+ assert(Mode::kCatchUp == mode_);
+ Status s;
+ if (!initialized_) {
+ ColumnFamilySet* cfd_set = version_set_->GetColumnFamilySet();
+ assert(cfd_set);
+ ColumnFamilyData* default_cfd = cfd_set->GetDefault();
+ assert(default_cfd);
+ auto builder_iter = builders_.find(default_cfd->GetID());
+ assert(builder_iter != builders_.end());
+
+ Version* dummy_version = default_cfd->dummy_versions();
+ assert(dummy_version);
+ Version* base_version = dummy_version->Next();
+ assert(base_version);
+ base_version->Ref();
+ VersionBuilderUPtr new_builder(
+ new BaseReferencedVersionBuilder(default_cfd, base_version));
+ builder_iter->second = std::move(new_builder);
+
+ initialized_ = true;
+ }
+ return s;
+}
+
+Status ManifestTailer::ApplyVersionEdit(VersionEdit& edit,
+ ColumnFamilyData** cfd) {
+ Status s = VersionEditHandler::ApplyVersionEdit(edit, cfd);
+ if (s.ok()) {
+ assert(cfd);
+ if (*cfd) {
+ cfds_changed_.insert(*cfd);
+ }
+ }
+ return s;
+}
+
+Status ManifestTailer::OnColumnFamilyAdd(VersionEdit& edit,
+ ColumnFamilyData** cfd) {
+ if (Mode::kRecovery == mode_) {
+ return VersionEditHandler::OnColumnFamilyAdd(edit, cfd);
+ }
+ assert(Mode::kCatchUp == mode_);
+ ColumnFamilySet* cfd_set = version_set_->GetColumnFamilySet();
+ assert(cfd_set);
+ ColumnFamilyData* tmp_cfd = cfd_set->GetColumnFamily(edit.GetColumnFamily());
+ assert(cfd);
+ *cfd = tmp_cfd;
+ if (!tmp_cfd) {
+ // For now, ignore new column families created after Recover() succeeds.
+ return Status::OK();
+ }
+ auto builder_iter = builders_.find(edit.GetColumnFamily());
+ assert(builder_iter != builders_.end());
+
+ Version* dummy_version = tmp_cfd->dummy_versions();
+ assert(dummy_version);
+ Version* base_version = dummy_version->Next();
+ assert(base_version);
+ base_version->Ref();
+ VersionBuilderUPtr new_builder(
+ new BaseReferencedVersionBuilder(tmp_cfd, base_version));
+ builder_iter->second = std::move(new_builder);
+
+#ifndef NDEBUG
+ auto version_iter = versions_.find(edit.GetColumnFamily());
+ assert(version_iter == versions_.end());
+#endif // !NDEBUG
+ return Status::OK();
+}
+
+void ManifestTailer::CheckIterationResult(const log::Reader& reader,
+ Status* s) {
+ VersionEditHandlerPointInTime::CheckIterationResult(reader, s);
+ assert(s);
+ if (s->ok()) {
+ if (Mode::kRecovery == mode_) {
+ mode_ = Mode::kCatchUp;
+ } else {
+ assert(Mode::kCatchUp == mode_);
+ }
+ }
+}
+
+Status ManifestTailer::VerifyFile(ColumnFamilyData* cfd,
+ const std::string& fpath, int level,
+ const FileMetaData& fmeta) {
+ Status s =
+ VersionEditHandlerPointInTime::VerifyFile(cfd, fpath, level, fmeta);
+ // TODO: Open file or create hard link to prevent the file from being
+ // deleted.
+ return s;
+}
+
+void DumpManifestHandler::CheckIterationResult(const log::Reader& reader,
+ Status* s) {
+ VersionEditHandler::CheckIterationResult(reader, s);
+ if (!s->ok()) {
+ fprintf(stdout, "%s\n", s->ToString().c_str());
+ return;
+ }
+ assert(cf_to_cmp_names_);
+ for (auto* cfd : *(version_set_->column_family_set_)) {
+ fprintf(stdout,
+ "--------------- Column family \"%s\" (ID %" PRIu32
+ ") --------------\n",
+ cfd->GetName().c_str(), cfd->GetID());
+ fprintf(stdout, "log number: %" PRIu64 "\n", cfd->GetLogNumber());
+ auto it = cf_to_cmp_names_->find(cfd->GetID());
+ if (it != cf_to_cmp_names_->end()) {
+ fprintf(stdout,
+ "comparator: <%s>, but the comparator object is not available.\n",
+ it->second.c_str());
+ } else {
+ fprintf(stdout, "comparator: %s\n", cfd->user_comparator()->Name());
+ }
+ assert(cfd->current());
+
+ // Print out DebugStrings. Can include non-terminating null characters.
+ fwrite(cfd->current()->DebugString(hex_).data(), sizeof(char),
+ cfd->current()->DebugString(hex_).size(), stdout);
+ }
+ fprintf(stdout,
+ "next_file_number %" PRIu64 " last_sequence %" PRIu64
+ " prev_log_number %" PRIu64 " max_column_family %" PRIu32
+ " min_log_number_to_keep %" PRIu64 "\n",
+ version_set_->current_next_file_number(),
+ version_set_->LastSequence(), version_set_->prev_log_number(),
+ version_set_->column_family_set_->GetMaxColumnFamily(),
+ version_set_->min_log_number_to_keep());
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/version_edit_handler.h b/src/rocksdb/db/version_edit_handler.h
new file mode 100644
index 000000000..fd2379b07
--- /dev/null
+++ b/src/rocksdb/db/version_edit_handler.h
@@ -0,0 +1,313 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include "db/version_builder.h"
+#include "db/version_edit.h"
+#include "db/version_set.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+struct FileMetaData;
+
+class VersionEditHandlerBase {
+ public:
+ explicit VersionEditHandlerBase()
+ : max_manifest_read_size_(std::numeric_limits<uint64_t>::max()) {}
+
+ virtual ~VersionEditHandlerBase() {}
+
+ void Iterate(log::Reader& reader, Status* log_read_status);
+
+ const Status& status() const { return status_; }
+
+ AtomicGroupReadBuffer& GetReadBuffer() { return read_buffer_; }
+
+ protected:
+ explicit VersionEditHandlerBase(uint64_t max_read_size)
+ : max_manifest_read_size_(max_read_size) {}
+ virtual Status Initialize() { return Status::OK(); }
+
+ virtual Status ApplyVersionEdit(VersionEdit& edit,
+ ColumnFamilyData** cfd) = 0;
+
+ virtual void CheckIterationResult(const log::Reader& /*reader*/,
+ Status* /*s*/) {}
+
+ void ClearReadBuffer() { read_buffer_.Clear(); }
+
+ Status status_;
+
+ private:
+ AtomicGroupReadBuffer read_buffer_;
+ const uint64_t max_manifest_read_size_;
+};
+
+class ListColumnFamiliesHandler : public VersionEditHandlerBase {
+ public:
+ ListColumnFamiliesHandler() : VersionEditHandlerBase() {}
+
+ ~ListColumnFamiliesHandler() override {}
+
+ const std::map<uint32_t, std::string> GetColumnFamilyNames() const {
+ return column_family_names_;
+ }
+
+ protected:
+ Status ApplyVersionEdit(VersionEdit& edit,
+ ColumnFamilyData** /*unused*/) override;
+
+ private:
+ // default column family is always implicitly there
+ std::map<uint32_t, std::string> column_family_names_{
+ {0, kDefaultColumnFamilyName}};
+};
+
+class FileChecksumRetriever : public VersionEditHandlerBase {
+ public:
+ FileChecksumRetriever(uint64_t max_read_size,
+ FileChecksumList& file_checksum_list)
+ : VersionEditHandlerBase(max_read_size),
+ file_checksum_list_(file_checksum_list) {}
+
+ ~FileChecksumRetriever() override {}
+
+ protected:
+ Status ApplyVersionEdit(VersionEdit& edit,
+ ColumnFamilyData** /*unused*/) override;
+
+ private:
+ FileChecksumList& file_checksum_list_;
+};
+
+using VersionBuilderUPtr = std::unique_ptr<BaseReferencedVersionBuilder>;
+
+// A class used for scanning a MANIFEST file.
+// VersionEditHandler reads a MANIFEST file, parses the version edits, and
+// builds the version set's in-memory state, e.g. the version storage info for
+// the versions of column families.
+// To use this class and its subclasses,
+// 1. Create an object of VersionEditHandler or its subclasses.
+// VersionEditHandler handler(read_only, column_families, version_set,
+// track_missing_files,
+// no_error_if_files_missing, io_tracer);
+// 2. handler.Iterate(reader, &log_read_status);
+// 3. Check handler.status() and log_read_status, and handle possible errors.
+//
+// Not thread-safe, external synchronization is necessary if an object of
+// VersionEditHandler is shared by multiple threads.
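+//
+// A fuller sketch of the sequence above (illustrative only: construction of
+// the MANIFEST log::Reader is elided, and local names such as
+// `manifest_reader` are placeholders rather than part of the API):
+//
+// VersionEditHandler handler(read_only, column_families, version_set,
+// track_missing_files,
+// no_error_if_files_missing, io_tracer);
+// Status log_read_status;
+// handler.Iterate(manifest_reader, &log_read_status);
+// Status s = handler.status();
+// if (s.ok() && log_read_status.ok()) {
+// std::string db_id;
+// handler.GetDbId(&db_id); // e.g. retrieve the DB id recorded in the MANIFEST
+// }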
+class VersionEditHandler : public VersionEditHandlerBase {
+ public:
+ explicit VersionEditHandler(
+ bool read_only,
+ const std::vector<ColumnFamilyDescriptor>& column_families,
+ VersionSet* version_set, bool track_missing_files,
+ bool no_error_if_files_missing,
+ const std::shared_ptr<IOTracer>& io_tracer)
+ : VersionEditHandler(read_only, column_families, version_set,
+ track_missing_files, no_error_if_files_missing,
+ io_tracer, /*skip_load_table_files=*/false) {}
+
+ ~VersionEditHandler() override {}
+
+ const VersionEditParams& GetVersionEditParams() const {
+ return version_edit_params_;
+ }
+
+ bool HasMissingFiles() const;
+
+ void GetDbId(std::string* db_id) const {
+ if (db_id && version_edit_params_.has_db_id_) {
+ *db_id = version_edit_params_.db_id_;
+ }
+ }
+
+ protected:
+ explicit VersionEditHandler(
+ bool read_only, std::vector<ColumnFamilyDescriptor> column_families,
+ VersionSet* version_set, bool track_missing_files,
+ bool no_error_if_files_missing,
+ const std::shared_ptr<IOTracer>& io_tracer, bool skip_load_table_files);
+
+ Status ApplyVersionEdit(VersionEdit& edit, ColumnFamilyData** cfd) override;
+
+ virtual Status OnColumnFamilyAdd(VersionEdit& edit, ColumnFamilyData** cfd);
+
+ Status OnColumnFamilyDrop(VersionEdit& edit, ColumnFamilyData** cfd);
+
+ Status OnNonCfOperation(VersionEdit& edit, ColumnFamilyData** cfd);
+
+ Status OnWalAddition(VersionEdit& edit);
+
+ Status OnWalDeletion(VersionEdit& edit);
+
+ Status Initialize() override;
+
+ void CheckColumnFamilyId(const VersionEdit& edit, bool* cf_in_not_found,
+ bool* cf_in_builders) const;
+
+ void CheckIterationResult(const log::Reader& reader, Status* s) override;
+
+ ColumnFamilyData* CreateCfAndInit(const ColumnFamilyOptions& cf_options,
+ const VersionEdit& edit);
+
+ virtual ColumnFamilyData* DestroyCfAndCleanup(const VersionEdit& edit);
+
+ virtual Status MaybeCreateVersion(const VersionEdit& edit,
+ ColumnFamilyData* cfd,
+ bool force_create_version);
+
+ virtual Status LoadTables(ColumnFamilyData* cfd,
+ bool prefetch_index_and_filter_in_cache,
+ bool is_initial_load);
+
+ virtual bool MustOpenAllColumnFamilies() const { return !read_only_; }
+
+ const bool read_only_;
+ std::vector<ColumnFamilyDescriptor> column_families_;
+ VersionSet* version_set_;
+ std::unordered_map<uint32_t, VersionBuilderUPtr> builders_;
+ std::unordered_map<std::string, ColumnFamilyOptions> name_to_options_;
+ // Keeps track of column families in the manifest that were not found in
+ // the column_families parameter. If those column families are not dropped
+ // by subsequent manifest records, Recover() will return a failure status.
+ std::unordered_map<uint32_t, std::string> column_families_not_found_;
+ VersionEditParams version_edit_params_;
+ const bool track_missing_files_;
+ std::unordered_map<uint32_t, std::unordered_set<uint64_t>>
+ cf_to_missing_files_;
+ std::unordered_map<uint32_t, uint64_t> cf_to_missing_blob_files_high_;
+ bool no_error_if_files_missing_;
+ std::shared_ptr<IOTracer> io_tracer_;
+ bool skip_load_table_files_;
+ bool initialized_;
+ std::unique_ptr<std::unordered_map<uint32_t, std::string>> cf_to_cmp_names_;
+
+ private:
+ Status ExtractInfoFromVersionEdit(ColumnFamilyData* cfd,
+ const VersionEdit& edit);
+};
+
+// A class similar to its base class, i.e. VersionEditHandler.
+// VersionEditHandlerPointInTime restores the versions to the most recent point
+// in time such that at this point, the version does not have missing files.
+//
+// Not thread-safe, external synchronization is necessary if an object of
+// VersionEditHandlerPointInTime is shared by multiple threads.
+class VersionEditHandlerPointInTime : public VersionEditHandler {
+ public:
+ VersionEditHandlerPointInTime(
+ bool read_only, std::vector<ColumnFamilyDescriptor> column_families,
+ VersionSet* version_set, const std::shared_ptr<IOTracer>& io_tracer);
+ ~VersionEditHandlerPointInTime() override;
+
+ protected:
+ void CheckIterationResult(const log::Reader& reader, Status* s) override;
+ ColumnFamilyData* DestroyCfAndCleanup(const VersionEdit& edit) override;
+ Status MaybeCreateVersion(const VersionEdit& edit, ColumnFamilyData* cfd,
+ bool force_create_version) override;
+ virtual Status VerifyFile(ColumnFamilyData* cfd, const std::string& fpath,
+ int level, const FileMetaData& fmeta);
+ virtual Status VerifyBlobFile(ColumnFamilyData* cfd, uint64_t blob_file_num,
+ const BlobFileAddition& blob_addition);
+
+ Status LoadTables(ColumnFamilyData* cfd,
+ bool prefetch_index_and_filter_in_cache,
+ bool is_initial_load) override;
+
+ std::unordered_map<uint32_t, Version*> versions_;
+};
+
+class ManifestTailer : public VersionEditHandlerPointInTime {
+ public:
+ explicit ManifestTailer(std::vector<ColumnFamilyDescriptor> column_families,
+ VersionSet* version_set,
+ const std::shared_ptr<IOTracer>& io_tracer)
+ : VersionEditHandlerPointInTime(/*read_only=*/false, column_families,
+ version_set, io_tracer),
+ mode_(Mode::kRecovery) {}
+
+ void PrepareToReadNewManifest() {
+ initialized_ = false;
+ ClearReadBuffer();
+ }
+
+ std::unordered_set<ColumnFamilyData*>& GetUpdatedColumnFamilies() {
+ return cfds_changed_;
+ }
+
+ protected:
+ Status Initialize() override;
+
+ bool MustOpenAllColumnFamilies() const override { return false; }
+
+ Status ApplyVersionEdit(VersionEdit& edit, ColumnFamilyData** cfd) override;
+
+ Status OnColumnFamilyAdd(VersionEdit& edit, ColumnFamilyData** cfd) override;
+
+ void CheckIterationResult(const log::Reader& reader, Status* s) override;
+
+ Status VerifyFile(ColumnFamilyData* cfd, const std::string& fpath, int level,
+ const FileMetaData& fmeta) override;
+
+ enum Mode : uint8_t {
+ kRecovery = 0,
+ kCatchUp = 1,
+ };
+
+ Mode mode_;
+ std::unordered_set<ColumnFamilyData*> cfds_changed_;
+};
+
+class DumpManifestHandler : public VersionEditHandler {
+ public:
+ DumpManifestHandler(std::vector<ColumnFamilyDescriptor> column_families,
+ VersionSet* version_set,
+ const std::shared_ptr<IOTracer>& io_tracer, bool verbose,
+ bool hex, bool json)
+ : VersionEditHandler(
+ /*read_only=*/true, column_families, version_set,
+ /*track_missing_files=*/false,
+ /*no_error_if_files_missing=*/false, io_tracer,
+ /*skip_load_table_files=*/true),
+ verbose_(verbose),
+ hex_(hex),
+ json_(json),
+ count_(0) {
+ cf_to_cmp_names_.reset(new std::unordered_map<uint32_t, std::string>());
+ }
+
+ ~DumpManifestHandler() override {}
+
+ Status ApplyVersionEdit(VersionEdit& edit, ColumnFamilyData** cfd) override {
+ // Write out each individual edit
+ if (verbose_ && !json_) {
+ // Print out DebugStrings. Can include non-terminating null characters.
+ fwrite(edit.DebugString(hex_).data(), sizeof(char),
+ edit.DebugString(hex_).size(), stdout);
+ } else if (json_) {
+ // Print out DebugJSON. Can include non-terminating null characters.
+ fwrite(edit.DebugJSON(count_, hex_).data(), sizeof(char),
+ edit.DebugJSON(count_, hex_).size(), stdout);
+ }
+ ++count_;
+ return VersionEditHandler::ApplyVersionEdit(edit, cfd);
+ }
+
+ void CheckIterationResult(const log::Reader& reader, Status* s) override;
+
+ private:
+ const bool verbose_;
+ const bool hex_;
+ const bool json_;
+ int count_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/version_edit_test.cc b/src/rocksdb/db/version_edit_test.cc
new file mode 100644
index 000000000..c7f271d83
--- /dev/null
+++ b/src/rocksdb/db/version_edit_test.cc
@@ -0,0 +1,730 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/version_edit.h"
+
+#include "db/blob/blob_index.h"
+#include "rocksdb/advanced_options.h"
+#include "table/unique_id_impl.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/coding.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+static void TestEncodeDecode(const VersionEdit& edit) {
+ std::string encoded, encoded2;
+ edit.EncodeTo(&encoded);
+ VersionEdit parsed;
+ Status s = parsed.DecodeFrom(encoded);
+ ASSERT_TRUE(s.ok()) << s.ToString();
+ parsed.EncodeTo(&encoded2);
+ ASSERT_EQ(encoded, encoded2);
+}
+
+class VersionEditTest : public testing::Test {};
+
+TEST_F(VersionEditTest, EncodeDecode) {
+ static const uint64_t kBig = 1ull << 50;
+ static const uint32_t kBig32Bit = 1ull << 30;
+
+ VersionEdit edit;
+ for (int i = 0; i < 4; i++) {
+ TestEncodeDecode(edit);
+ edit.AddFile(3, kBig + 300 + i, kBig32Bit + 400 + i, 0,
+ InternalKey("foo", kBig + 500 + i, kTypeValue),
+ InternalKey("zoo", kBig + 600 + i, kTypeDeletion),
+ kBig + 500 + i, kBig + 600 + i, false, Temperature::kUnknown,
+ kInvalidBlobFileNumber, 888, 678, "234", "crc32c",
+ kNullUniqueId64x2);
+ edit.DeleteFile(4, kBig + 700 + i);
+ }
+
+ edit.SetComparatorName("foo");
+ edit.SetLogNumber(kBig + 100);
+ edit.SetNextFile(kBig + 200);
+ edit.SetLastSequence(kBig + 1000);
+ TestEncodeDecode(edit);
+}
+
+TEST_F(VersionEditTest, EncodeDecodeNewFile4) {
+ static const uint64_t kBig = 1ull << 50;
+
+ VersionEdit edit;
+ edit.AddFile(3, 300, 3, 100, InternalKey("foo", kBig + 500, kTypeValue),
+ InternalKey("zoo", kBig + 600, kTypeDeletion), kBig + 500,
+ kBig + 600, true, Temperature::kUnknown, kInvalidBlobFileNumber,
+ kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+ kUnknownFileChecksum, kUnknownFileChecksumFuncName,
+ kNullUniqueId64x2);
+ edit.AddFile(4, 301, 3, 100, InternalKey("foo", kBig + 501, kTypeValue),
+ InternalKey("zoo", kBig + 601, kTypeDeletion), kBig + 501,
+ kBig + 601, false, Temperature::kUnknown, kInvalidBlobFileNumber,
+ kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+ kUnknownFileChecksum, kUnknownFileChecksumFuncName,
+ kNullUniqueId64x2);
+ edit.AddFile(5, 302, 0, 100, InternalKey("foo", kBig + 502, kTypeValue),
+ InternalKey("zoo", kBig + 602, kTypeDeletion), kBig + 502,
+ kBig + 602, true, Temperature::kUnknown, kInvalidBlobFileNumber,
+ 666, 888, kUnknownFileChecksum, kUnknownFileChecksumFuncName,
+ kNullUniqueId64x2);
+ edit.AddFile(5, 303, 0, 100, InternalKey("foo", kBig + 503, kTypeBlobIndex),
+ InternalKey("zoo", kBig + 603, kTypeBlobIndex), kBig + 503,
+ kBig + 603, true, Temperature::kUnknown, 1001,
+ kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+ kUnknownFileChecksum, kUnknownFileChecksumFuncName,
+ kNullUniqueId64x2);
+
+ edit.DeleteFile(4, 700);
+
+ edit.SetComparatorName("foo");
+ edit.SetLogNumber(kBig + 100);
+ edit.SetNextFile(kBig + 200);
+ edit.SetLastSequence(kBig + 1000);
+ TestEncodeDecode(edit);
+
+ std::string encoded, encoded2;
+ edit.EncodeTo(&encoded);
+ VersionEdit parsed;
+ Status s = parsed.DecodeFrom(encoded);
+ ASSERT_TRUE(s.ok()) << s.ToString();
+ auto& new_files = parsed.GetNewFiles();
+ ASSERT_TRUE(new_files[0].second.marked_for_compaction);
+ ASSERT_TRUE(!new_files[1].second.marked_for_compaction);
+ ASSERT_TRUE(new_files[2].second.marked_for_compaction);
+ ASSERT_TRUE(new_files[3].second.marked_for_compaction);
+ ASSERT_EQ(3u, new_files[0].second.fd.GetPathId());
+ ASSERT_EQ(3u, new_files[1].second.fd.GetPathId());
+ ASSERT_EQ(0u, new_files[2].second.fd.GetPathId());
+ ASSERT_EQ(0u, new_files[3].second.fd.GetPathId());
+ ASSERT_EQ(kInvalidBlobFileNumber,
+ new_files[0].second.oldest_blob_file_number);
+ ASSERT_EQ(kInvalidBlobFileNumber,
+ new_files[1].second.oldest_blob_file_number);
+ ASSERT_EQ(kInvalidBlobFileNumber,
+ new_files[2].second.oldest_blob_file_number);
+ ASSERT_EQ(1001, new_files[3].second.oldest_blob_file_number);
+}
+
+TEST_F(VersionEditTest, ForwardCompatibleNewFile4) {
+ static const uint64_t kBig = 1ull << 50;
+ VersionEdit edit;
+ edit.AddFile(3, 300, 3, 100, InternalKey("foo", kBig + 500, kTypeValue),
+ InternalKey("zoo", kBig + 600, kTypeDeletion), kBig + 500,
+ kBig + 600, true, Temperature::kUnknown, kInvalidBlobFileNumber,
+ kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+ kUnknownFileChecksum, kUnknownFileChecksumFuncName,
+ kNullUniqueId64x2);
+ edit.AddFile(4, 301, 3, 100, InternalKey("foo", kBig + 501, kTypeValue),
+ InternalKey("zoo", kBig + 601, kTypeDeletion), kBig + 501,
+ kBig + 601, false, Temperature::kUnknown, kInvalidBlobFileNumber,
+ 686, 868, "234", "crc32c", kNullUniqueId64x2);
+ edit.DeleteFile(4, 700);
+
+ edit.SetComparatorName("foo");
+ edit.SetLogNumber(kBig + 100);
+ edit.SetNextFile(kBig + 200);
+ edit.SetLastSequence(kBig + 1000);
+
+ std::string encoded;
+
+ // Callback function to add extra customized fields.
+ bool first = true;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "VersionEdit::EncodeTo:NewFile4:CustomizeFields", [&](void* arg) {
+ std::string* str = reinterpret_cast<std::string*>(arg);
+ PutVarint32(str, 33);
+ const std::string str1 = "random_string";
+ PutLengthPrefixedSlice(str, str1);
+ if (first) {
+ first = false;
+ PutVarint32(str, 22);
+ const std::string str2 = "s";
+ PutLengthPrefixedSlice(str, str2);
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ edit.EncodeTo(&encoded);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+ VersionEdit parsed;
+ Status s = parsed.DecodeFrom(encoded);
+ ASSERT_TRUE(s.ok()) << s.ToString();
+ ASSERT_TRUE(!first);
+ auto& new_files = parsed.GetNewFiles();
+ ASSERT_TRUE(new_files[0].second.marked_for_compaction);
+ ASSERT_TRUE(!new_files[1].second.marked_for_compaction);
+ ASSERT_EQ(3u, new_files[0].second.fd.GetPathId());
+ ASSERT_EQ(3u, new_files[1].second.fd.GetPathId());
+ ASSERT_EQ(1u, parsed.GetDeletedFiles().size());
+}
+
+TEST_F(VersionEditTest, NewFile4NotSupportedField) {
+ static const uint64_t kBig = 1ull << 50;
+ VersionEdit edit;
+ edit.AddFile(3, 300, 3, 100, InternalKey("foo", kBig + 500, kTypeValue),
+ InternalKey("zoo", kBig + 600, kTypeDeletion), kBig + 500,
+ kBig + 600, true, Temperature::kUnknown, kInvalidBlobFileNumber,
+ kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+ kUnknownFileChecksum, kUnknownFileChecksumFuncName,
+ kNullUniqueId64x2);
+
+ edit.SetComparatorName("foo");
+ edit.SetLogNumber(kBig + 100);
+ edit.SetNextFile(kBig + 200);
+ edit.SetLastSequence(kBig + 1000);
+
+ std::string encoded;
+
+ // Callback function to add extra customized fields.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "VersionEdit::EncodeTo:NewFile4:CustomizeFields", [&](void* arg) {
+ std::string* str = reinterpret_cast<std::string*>(arg);
+ const std::string str1 = "s";
+ PutLengthPrefixedSlice(str, str1);
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ edit.EncodeTo(&encoded);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+ VersionEdit parsed;
+ Status s = parsed.DecodeFrom(encoded);
+ ASSERT_NOK(s);
+}
+
+TEST_F(VersionEditTest, EncodeEmptyFile) {
+ VersionEdit edit;
+ edit.AddFile(0, 0, 0, 0, InternalKey(), InternalKey(), 0, 0, false,
+ Temperature::kUnknown, kInvalidBlobFileNumber,
+ kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+ kUnknownFileChecksum, kUnknownFileChecksumFuncName,
+ kNullUniqueId64x2);
+ std::string buffer;
+ ASSERT_TRUE(!edit.EncodeTo(&buffer));
+}
+
+TEST_F(VersionEditTest, ColumnFamilyTest) {
+ VersionEdit edit;
+ edit.SetColumnFamily(2);
+ edit.AddColumnFamily("column_family");
+ edit.SetMaxColumnFamily(5);
+ TestEncodeDecode(edit);
+
+ edit.Clear();
+ edit.SetColumnFamily(3);
+ edit.DropColumnFamily();
+ TestEncodeDecode(edit);
+}
+
+TEST_F(VersionEditTest, MinLogNumberToKeep) {
+ VersionEdit edit;
+ edit.SetMinLogNumberToKeep(13);
+ TestEncodeDecode(edit);
+
+ edit.Clear();
+ edit.SetMinLogNumberToKeep(23);
+ TestEncodeDecode(edit);
+}
+
+TEST_F(VersionEditTest, AtomicGroupTest) {
+ VersionEdit edit;
+ edit.MarkAtomicGroup(1);
+ TestEncodeDecode(edit);
+}
+
+TEST_F(VersionEditTest, IgnorableField) {
+ VersionEdit ve;
+ std::string encoded;
+
+ // Size of ignorable field is too large
+ PutVarint32Varint64(&encoded, 2 /* kLogNumber */, 66);
+ // This is a customized ignorable tag
+ PutVarint32Varint64(&encoded,
+ 0x2710 /* A field with kTagSafeIgnoreMask set */,
+ 5 /* fieldlength 5 */);
+ encoded += "abc"; // Only fills 3 bytes,
+ ASSERT_NOK(ve.DecodeFrom(encoded));
+
+ encoded.clear();
+ // Error when seeing unidentified tag that is not ignorable
+ PutVarint32Varint64(&encoded, 2 /* kLogNumber */, 66);
+ // This is a customized ignorable tag
+ PutVarint32Varint64(&encoded, 666 /* A field with kTagSafeIgnoreMask unset */,
+ 3 /* fieldlength 3 */);
+ encoded += "abc"; // Fill 3 bytes
+ PutVarint32Varint64(&encoded, 3 /* next file number */, 88);
+ ASSERT_NOK(ve.DecodeFrom(encoded));
+
+ // Safely ignore an identified but safely ignorable entry
+ encoded.clear();
+ PutVarint32Varint64(&encoded, 2 /* kLogNumber */, 66);
+ // This is a customized ignorable tag
+ PutVarint32Varint64(&encoded,
+ 0x2710 /* A field with kTagSafeIgnoreMask set */,
+ 3 /* fieldlength 3 */);
+ encoded += "abc"; // Fill 3 bytes
+ PutVarint32Varint64(&encoded, 3 /* kNextFileNumber */, 88);
+
+ ASSERT_OK(ve.DecodeFrom(encoded));
+
+ ASSERT_TRUE(ve.HasLogNumber());
+ ASSERT_TRUE(ve.HasNextFile());
+ ASSERT_EQ(66, ve.GetLogNumber());
+ ASSERT_EQ(88, ve.GetNextFile());
+}
+
+TEST_F(VersionEditTest, DbId) {
+ VersionEdit edit;
+ edit.SetDBId("ab34-cd12-435f-er00");
+ TestEncodeDecode(edit);
+
+ edit.Clear();
+ edit.SetDBId("34ba-cd12-435f-er01");
+ TestEncodeDecode(edit);
+}
+
+TEST_F(VersionEditTest, BlobFileAdditionAndGarbage) {
+ VersionEdit edit;
+
+ const std::string checksum_method_prefix = "Hash";
+ const std::string checksum_value_prefix = "Value";
+
+ for (uint64_t blob_file_number = 1; blob_file_number <= 10;
+ ++blob_file_number) {
+ const uint64_t total_blob_count = blob_file_number << 10;
+ const uint64_t total_blob_bytes = blob_file_number << 20;
+
+ std::string checksum_method(checksum_method_prefix);
+ AppendNumberTo(&checksum_method, blob_file_number);
+
+ std::string checksum_value(checksum_value_prefix);
+ AppendNumberTo(&checksum_value, blob_file_number);
+
+ edit.AddBlobFile(blob_file_number, total_blob_count, total_blob_bytes,
+ checksum_method, checksum_value);
+
+ const uint64_t garbage_blob_count = total_blob_count >> 2;
+ const uint64_t garbage_blob_bytes = total_blob_bytes >> 1;
+
+ edit.AddBlobFileGarbage(blob_file_number, garbage_blob_count,
+ garbage_blob_bytes);
+ }
+
+ TestEncodeDecode(edit);
+}
+
+TEST_F(VersionEditTest, AddWalEncodeDecode) {
+ VersionEdit edit;
+ for (uint64_t log_number = 1; log_number <= 20; log_number++) {
+ WalMetadata meta;
+ bool has_size = rand() % 2 == 0;
+ if (has_size) {
+ meta.SetSyncedSizeInBytes(rand() % 1000);
+ }
+ edit.AddWal(log_number, meta);
+ }
+ TestEncodeDecode(edit);
+}
+
+static std::string PrefixEncodedWalAdditionWithLength(
+ const std::string& encoded) {
+ std::string ret;
+ PutVarint32(&ret, Tag::kWalAddition2);
+ PutLengthPrefixedSlice(&ret, encoded);
+ return ret;
+}
+
+TEST_F(VersionEditTest, AddWalDecodeBadLogNumber) {
+ std::string encoded;
+
+ {
+ // No log number.
+ std::string encoded_edit = PrefixEncodedWalAdditionWithLength(encoded);
+ VersionEdit edit;
+ Status s = edit.DecodeFrom(encoded_edit);
+ ASSERT_TRUE(s.IsCorruption());
+ ASSERT_TRUE(s.ToString().find("Error decoding WAL log number") !=
+ std::string::npos)
+ << s.ToString();
+ }
+
+ {
+ // log number should be varint64,
+ // but we only encode 128 which is not a valid representation of varint64.
+ char c = 0;
+ unsigned char* ptr = reinterpret_cast<unsigned char*>(&c);
+ *ptr = 128;
+ encoded.append(1, c);
+
+ std::string encoded_edit = PrefixEncodedWalAdditionWithLength(encoded);
+ VersionEdit edit;
+ Status s = edit.DecodeFrom(encoded_edit);
+ ASSERT_TRUE(s.IsCorruption());
+ ASSERT_TRUE(s.ToString().find("Error decoding WAL log number") !=
+ std::string::npos)
+ << s.ToString();
+ }
+}
+
+TEST_F(VersionEditTest, AddWalDecodeBadTag) {
+ constexpr WalNumber kLogNumber = 100;
+ constexpr uint64_t kSizeInBytes = 100;
+
+ std::string encoded;
+ PutVarint64(&encoded, kLogNumber);
+
+ {
+ // No tag.
+ std::string encoded_edit = PrefixEncodedWalAdditionWithLength(encoded);
+ VersionEdit edit;
+ Status s = edit.DecodeFrom(encoded_edit);
+ ASSERT_TRUE(s.IsCorruption());
+ ASSERT_TRUE(s.ToString().find("Error decoding tag") != std::string::npos)
+ << s.ToString();
+ }
+
+ {
+ // Only has size tag, no terminate tag.
+ std::string encoded_with_size = encoded;
+ PutVarint32(&encoded_with_size,
+ static_cast<uint32_t>(WalAdditionTag::kSyncedSize));
+ PutVarint64(&encoded_with_size, kSizeInBytes);
+
+ std::string encoded_edit =
+ PrefixEncodedWalAdditionWithLength(encoded_with_size);
+ VersionEdit edit;
+ Status s = edit.DecodeFrom(encoded_edit);
+ ASSERT_TRUE(s.IsCorruption());
+ ASSERT_TRUE(s.ToString().find("Error decoding tag") != std::string::npos)
+ << s.ToString();
+ }
+
+ {
+ // Only has terminate tag.
+ std::string encoded_with_terminate = encoded;
+ PutVarint32(&encoded_with_terminate,
+ static_cast<uint32_t>(WalAdditionTag::kTerminate));
+
+ std::string encoded_edit =
+ PrefixEncodedWalAdditionWithLength(encoded_with_terminate);
+ VersionEdit edit;
+ ASSERT_OK(edit.DecodeFrom(encoded_edit));
+ auto& wal_addition = edit.GetWalAdditions()[0];
+ ASSERT_EQ(wal_addition.GetLogNumber(), kLogNumber);
+ ASSERT_FALSE(wal_addition.GetMetadata().HasSyncedSize());
+ }
+}
+
+TEST_F(VersionEditTest, AddWalDecodeNoSize) {
+ constexpr WalNumber kLogNumber = 100;
+
+ std::string encoded;
+ PutVarint64(&encoded, kLogNumber);
+ PutVarint32(&encoded, static_cast<uint32_t>(WalAdditionTag::kSyncedSize));
+ // No real size after the size tag.
+
+ {
+ // Without terminate tag.
+ std::string encoded_edit = PrefixEncodedWalAdditionWithLength(encoded);
+ VersionEdit edit;
+ Status s = edit.DecodeFrom(encoded_edit);
+ ASSERT_TRUE(s.IsCorruption());
+ ASSERT_TRUE(s.ToString().find("Error decoding WAL file size") !=
+ std::string::npos)
+ << s.ToString();
+ }
+
+ {
+ // With terminate tag.
+ PutVarint32(&encoded, static_cast<uint32_t>(WalAdditionTag::kTerminate));
+
+ std::string encoded_edit = PrefixEncodedWalAdditionWithLength(encoded);
+ VersionEdit edit;
+ Status s = edit.DecodeFrom(encoded_edit);
+ ASSERT_TRUE(s.IsCorruption());
+ // The terminate tag is misunderstood as the size.
+ ASSERT_TRUE(s.ToString().find("Error decoding tag") != std::string::npos)
+ << s.ToString();
+ }
+}
+
+TEST_F(VersionEditTest, AddWalDebug) {
+ constexpr int n = 2;
+ constexpr std::array<uint64_t, n> kLogNumbers{{10, 20}};
+ constexpr std::array<uint64_t, n> kSizeInBytes{{100, 200}};
+
+ VersionEdit edit;
+ for (int i = 0; i < n; i++) {
+ edit.AddWal(kLogNumbers[i], WalMetadata(kSizeInBytes[i]));
+ }
+
+ const WalAdditions& wals = edit.GetWalAdditions();
+
+ ASSERT_TRUE(edit.IsWalAddition());
+ ASSERT_EQ(wals.size(), n);
+ for (int i = 0; i < n; i++) {
+ const WalAddition& wal = wals[i];
+ ASSERT_EQ(wal.GetLogNumber(), kLogNumbers[i]);
+ ASSERT_EQ(wal.GetMetadata().GetSyncedSizeInBytes(), kSizeInBytes[i]);
+ }
+
+ std::string expected_str = "VersionEdit {\n";
+ for (int i = 0; i < n; i++) {
+ std::stringstream ss;
+ ss << " WalAddition: log_number: " << kLogNumbers[i]
+ << " synced_size_in_bytes: " << kSizeInBytes[i] << "\n";
+ expected_str += ss.str();
+ }
+ expected_str += " ColumnFamily: 0\n}\n";
+ ASSERT_EQ(edit.DebugString(true), expected_str);
+
+ std::string expected_json = "{\"EditNumber\": 4, \"WalAdditions\": [";
+ for (int i = 0; i < n; i++) {
+ std::stringstream ss;
+ ss << "{\"LogNumber\": " << kLogNumbers[i] << ", "
+ << "\"SyncedSizeInBytes\": " << kSizeInBytes[i] << "}";
+ if (i < n - 1) ss << ", ";
+ expected_json += ss.str();
+ }
+ expected_json += "], \"ColumnFamily\": 0}";
+ ASSERT_EQ(edit.DebugJSON(4, true), expected_json);
+}
+
+TEST_F(VersionEditTest, DeleteWalEncodeDecode) {
+ VersionEdit edit;
+ edit.DeleteWalsBefore(rand() % 100);
+ TestEncodeDecode(edit);
+}
+
+TEST_F(VersionEditTest, DeleteWalDebug) {
+ constexpr int n = 2;
+ constexpr std::array<uint64_t, n> kLogNumbers{{10, 20}};
+
+ VersionEdit edit;
+ edit.DeleteWalsBefore(kLogNumbers[n - 1]);
+
+ const WalDeletion& wal = edit.GetWalDeletion();
+
+ ASSERT_TRUE(edit.IsWalDeletion());
+ ASSERT_EQ(wal.GetLogNumber(), kLogNumbers[n - 1]);
+
+ std::string expected_str = "VersionEdit {\n";
+ {
+ std::stringstream ss;
+ ss << " WalDeletion: log_number: " << kLogNumbers[n - 1] << "\n";
+ expected_str += ss.str();
+ }
+ expected_str += " ColumnFamily: 0\n}\n";
+ ASSERT_EQ(edit.DebugString(true), expected_str);
+
+ std::string expected_json = "{\"EditNumber\": 4, \"WalDeletion\": ";
+ {
+ std::stringstream ss;
+ ss << "{\"LogNumber\": " << kLogNumbers[n - 1] << "}";
+ expected_json += ss.str();
+ }
+ expected_json += ", \"ColumnFamily\": 0}";
+ ASSERT_EQ(edit.DebugJSON(4, true), expected_json);
+}
+
+TEST_F(VersionEditTest, FullHistoryTsLow) {
+ VersionEdit edit;
+ ASSERT_FALSE(edit.HasFullHistoryTsLow());
+ std::string ts = test::EncodeInt(0);
+ edit.SetFullHistoryTsLow(ts);
+ TestEncodeDecode(edit);
+}
+
+// Tests that if RocksDB is downgraded, the new types of VersionEdits
+// that have a tag larger than kTagSafeIgnoreMask can be safely ignored.
+TEST_F(VersionEditTest, IgnorableTags) {
+ SyncPoint::GetInstance()->SetCallBack(
+ "VersionEdit::EncodeTo:IgnoreIgnorableTags", [&](void* arg) {
+ bool* ignore = static_cast<bool*>(arg);
+ *ignore = true;
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ constexpr uint64_t kPrevLogNumber = 100;
+ constexpr uint64_t kLogNumber = 200;
+ constexpr uint64_t kNextFileNumber = 300;
+ constexpr uint64_t kColumnFamilyId = 400;
+
+ VersionEdit edit;
+ // Add some ignorable entries.
+ for (int i = 0; i < 2; i++) {
+ edit.AddWal(i + 1, WalMetadata(i + 2));
+ }
+ edit.SetDBId("db_id");
+ // Add unignorable entries.
+ edit.SetPrevLogNumber(kPrevLogNumber);
+ edit.SetLogNumber(kLogNumber);
+ // Add more ignorable entries.
+ edit.DeleteWalsBefore(100);
+ // Add unignorable entry.
+ edit.SetNextFile(kNextFileNumber);
+ // Add more ignorable entries.
+ edit.SetFullHistoryTsLow("ts");
+ // Add unignorable entry.
+ edit.SetColumnFamily(kColumnFamilyId);
+
+ std::string encoded;
+ ASSERT_TRUE(edit.EncodeTo(&encoded));
+
+ VersionEdit decoded;
+ ASSERT_OK(decoded.DecodeFrom(encoded));
+
+ // Check that all ignorable entries are ignored.
+ ASSERT_FALSE(decoded.HasDbId());
+ ASSERT_FALSE(decoded.HasFullHistoryTsLow());
+ ASSERT_FALSE(decoded.IsWalAddition());
+ ASSERT_FALSE(decoded.IsWalDeletion());
+ ASSERT_TRUE(decoded.GetWalAdditions().empty());
+ ASSERT_TRUE(decoded.GetWalDeletion().IsEmpty());
+
+ // Check that unignorable entries are still present after decoding.
+ ASSERT_EQ(decoded.GetPrevLogNumber(), kPrevLogNumber);
+ ASSERT_EQ(decoded.GetLogNumber(), kLogNumber);
+ ASSERT_EQ(decoded.GetNextFile(), kNextFileNumber);
+ ASSERT_EQ(decoded.GetColumnFamily(), kColumnFamilyId);
+
+ SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST(FileMetaDataTest, UpdateBoundariesBlobIndex) {
+ FileMetaData meta;
+
+ {
+ constexpr uint64_t file_number = 10;
+ constexpr uint32_t path_id = 0;
+ constexpr uint64_t file_size = 0;
+
+ meta.fd = FileDescriptor(file_number, path_id, file_size);
+ }
+
+ constexpr char key[] = "foo";
+
+ constexpr uint64_t expected_oldest_blob_file_number = 20;
+
+ // Plain old value (does not affect oldest_blob_file_number)
+ {
+ constexpr char value[] = "value";
+ constexpr SequenceNumber seq = 200;
+
+ ASSERT_OK(meta.UpdateBoundaries(key, value, seq, kTypeValue));
+ ASSERT_EQ(meta.oldest_blob_file_number, kInvalidBlobFileNumber);
+ }
+
+ // Non-inlined, non-TTL blob index (sets oldest_blob_file_number)
+ {
+ constexpr uint64_t blob_file_number = 25;
+ static_assert(blob_file_number > expected_oldest_blob_file_number,
+ "unexpected");
+
+ constexpr uint64_t offset = 1000;
+ constexpr uint64_t size = 100;
+
+ std::string blob_index;
+ BlobIndex::EncodeBlob(&blob_index, blob_file_number, offset, size,
+ kNoCompression);
+
+ constexpr SequenceNumber seq = 201;
+
+ ASSERT_OK(meta.UpdateBoundaries(key, blob_index, seq, kTypeBlobIndex));
+ ASSERT_EQ(meta.oldest_blob_file_number, blob_file_number);
+ }
+
+ // Another one, with the oldest blob file number (updates
+ // oldest_blob_file_number)
+ {
+ constexpr uint64_t offset = 2000;
+ constexpr uint64_t size = 300;
+
+ std::string blob_index;
+ BlobIndex::EncodeBlob(&blob_index, expected_oldest_blob_file_number, offset,
+ size, kNoCompression);
+
+ constexpr SequenceNumber seq = 202;
+
+ ASSERT_OK(meta.UpdateBoundaries(key, blob_index, seq, kTypeBlobIndex));
+ ASSERT_EQ(meta.oldest_blob_file_number, expected_oldest_blob_file_number);
+ }
+
+ // Inlined TTL blob index (does not affect oldest_blob_file_number)
+ {
+ constexpr uint64_t expiration = 9876543210;
+ constexpr char value[] = "value";
+
+ std::string blob_index;
+ BlobIndex::EncodeInlinedTTL(&blob_index, expiration, value);
+
+ constexpr SequenceNumber seq = 203;
+
+ ASSERT_OK(meta.UpdateBoundaries(key, blob_index, seq, kTypeBlobIndex));
+ ASSERT_EQ(meta.oldest_blob_file_number, expected_oldest_blob_file_number);
+ }
+
+ // Non-inlined TTL blob index (does not affect oldest_blob_file_number, even
+ // though file number is smaller)
+ {
+ constexpr uint64_t expiration = 9876543210;
+ constexpr uint64_t blob_file_number = 15;
+ static_assert(blob_file_number < expected_oldest_blob_file_number,
+ "unexpected");
+
+ constexpr uint64_t offset = 2000;
+ constexpr uint64_t size = 500;
+
+ std::string blob_index;
+ BlobIndex::EncodeBlobTTL(&blob_index, expiration, blob_file_number, offset,
+ size, kNoCompression);
+
+ constexpr SequenceNumber seq = 204;
+
+ ASSERT_OK(meta.UpdateBoundaries(key, blob_index, seq, kTypeBlobIndex));
+ ASSERT_EQ(meta.oldest_blob_file_number, expected_oldest_blob_file_number);
+ }
+
+ // Corrupt blob index
+ {
+ constexpr char corrupt_blob_index[] = "!corrupt!";
+ constexpr SequenceNumber seq = 205;
+
+ ASSERT_TRUE(
+ meta.UpdateBoundaries(key, corrupt_blob_index, seq, kTypeBlobIndex)
+ .IsCorruption());
+ ASSERT_EQ(meta.oldest_blob_file_number, expected_oldest_blob_file_number);
+ }
+
+ // Invalid blob file number
+ {
+ constexpr uint64_t offset = 10000;
+ constexpr uint64_t size = 1000;
+
+ std::string blob_index;
+ BlobIndex::EncodeBlob(&blob_index, kInvalidBlobFileNumber, offset, size,
+ kNoCompression);
+
+ constexpr SequenceNumber seq = 206;
+
+ ASSERT_TRUE(meta.UpdateBoundaries(key, blob_index, seq, kTypeBlobIndex)
+ .IsCorruption());
+ ASSERT_EQ(meta.oldest_blob_file_number, expected_oldest_blob_file_number);
+ }
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/version_set.cc b/src/rocksdb/db/version_set.cc
new file mode 100644
index 000000000..427af6e25
--- /dev/null
+++ b/src/rocksdb/db/version_set.cc
@@ -0,0 +1,6903 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/version_set.h"
+
+#include <algorithm>
+#include <array>
+#include <cinttypes>
+#include <cstdio>
+#include <list>
+#include <map>
+#include <set>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "db/blob/blob_fetcher.h"
+#include "db/blob/blob_file_cache.h"
+#include "db/blob/blob_file_reader.h"
+#include "db/blob/blob_index.h"
+#include "db/blob/blob_log_format.h"
+#include "db/blob/blob_source.h"
+#include "db/compaction/compaction.h"
+#include "db/compaction/file_pri.h"
+#include "db/dbformat.h"
+#include "db/internal_stats.h"
+#include "db/log_reader.h"
+#include "db/log_writer.h"
+#include "db/memtable.h"
+#include "db/merge_context.h"
+#include "db/merge_helper.h"
+#include "db/pinned_iterators_manager.h"
+#include "db/table_cache.h"
+#include "db/version_builder.h"
+#include "db/version_edit_handler.h"
+#if USE_COROUTINES
+#include "folly/experimental/coro/BlockingWait.h"
+#include "folly/experimental/coro/Collect.h"
+#endif
+#include "file/filename.h"
+#include "file/random_access_file_reader.h"
+#include "file/read_write_util.h"
+#include "file/writable_file_writer.h"
+#include "logging/logging.h"
+#include "monitoring/file_read_sample.h"
+#include "monitoring/perf_context_imp.h"
+#include "monitoring/persistent_stats_history.h"
+#include "options/options_helper.h"
+#include "rocksdb/env.h"
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/write_buffer_manager.h"
+#include "table/format.h"
+#include "table/get_context.h"
+#include "table/internal_iterator.h"
+#include "table/merging_iterator.h"
+#include "table/meta_blocks.h"
+#include "table/multiget_context.h"
+#include "table/plain/plain_table_factory.h"
+#include "table/table_reader.h"
+#include "table/two_level_iterator.h"
+#include "table/unique_id_impl.h"
+#include "test_util/sync_point.h"
+#include "util/cast_util.h"
+#include "util/coding.h"
+#include "util/coro_utils.h"
+#include "util/stop_watch.h"
+#include "util/string_util.h"
+#include "util/user_comparator_wrapper.h"
+
+// Generate the regular and coroutine versions of some methods by
+// including version_set_sync_and_async.h twice
+// Macros in the header will expand differently based on whether
+// WITH_COROUTINES or WITHOUT_COROUTINES is defined
+// clang-format off
+#define WITHOUT_COROUTINES
+#include "db/version_set_sync_and_async.h"
+#undef WITHOUT_COROUTINES
+#define WITH_COROUTINES
+#include "db/version_set_sync_and_async.h"
+#undef WITH_COROUTINES
+// clang-format on
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+
+// Find File in LevelFilesBrief data structure
+// Within an index range defined by left and right
+int FindFileInRange(const InternalKeyComparator& icmp,
+ const LevelFilesBrief& file_level, const Slice& key,
+ uint32_t left, uint32_t right) {
+ auto cmp = [&](const FdWithKeyRange& f, const Slice& k) -> bool {
+ return icmp.InternalKeyComparator::Compare(f.largest_key, k) < 0;
+ };
+ const auto& b = file_level.files;
+ return static_cast<int>(std::lower_bound(b + left, b + right, key, cmp) - b);
+}
+
+Status OverlapWithIterator(const Comparator* ucmp,
+ const Slice& smallest_user_key,
+ const Slice& largest_user_key,
+ InternalIterator* iter, bool* overlap) {
+ InternalKey range_start(smallest_user_key, kMaxSequenceNumber,
+ kValueTypeForSeek);
+ iter->Seek(range_start.Encode());
+ if (!iter->status().ok()) {
+ return iter->status();
+ }
+
+ *overlap = false;
+ if (iter->Valid()) {
+ ParsedInternalKey seek_result;
+ Status s = ParseInternalKey(iter->key(), &seek_result,
+ false /* log_err_key */); // TODO
+ if (!s.ok()) return s;
+
+ if (ucmp->CompareWithoutTimestamp(seek_result.user_key, largest_user_key) <=
+ 0) {
+ *overlap = true;
+ }
+ }
+
+ return iter->status();
+}
+
+// Class to help choose the next file to search for the particular key.
+// Searches and returns files level by level.
+// We can search level-by-level since entries never hop across
+// levels. Therefore we are guaranteed that if we find data
+// in a smaller level, later levels are irrelevant (unless we
+// are MergeInProgress).
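+//
+// Illustrative call pattern (a sketch only; the surrounding key-lookup
+// plumbing is omitted and `fp` / `f` are placeholder names):
+//
+// FilePicker fp(user_key, ikey, &file_levels, num_levels, &file_indexer,
+// user_comparator, internal_comparator);
+// for (FdWithKeyRange* f = fp.GetNextFile(); f != nullptr;
+// f = fp.GetNextFile()) {
+// // Probe the table referenced by f->fd; stop as soon as the lookup of
+// // user_key is resolved, otherwise continue with the next candidate file.
+// }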
+class FilePicker {
+ public:
+ FilePicker(const Slice& user_key, const Slice& ikey,
+ autovector<LevelFilesBrief>* file_levels, unsigned int num_levels,
+ FileIndexer* file_indexer, const Comparator* user_comparator,
+ const InternalKeyComparator* internal_comparator)
+ : num_levels_(num_levels),
+ curr_level_(static_cast<unsigned int>(-1)),
+ returned_file_level_(static_cast<unsigned int>(-1)),
+ hit_file_level_(static_cast<unsigned int>(-1)),
+ search_left_bound_(0),
+ search_right_bound_(FileIndexer::kLevelMaxIndex),
+ level_files_brief_(file_levels),
+ is_hit_file_last_in_level_(false),
+ curr_file_level_(nullptr),
+ user_key_(user_key),
+ ikey_(ikey),
+ file_indexer_(file_indexer),
+ user_comparator_(user_comparator),
+ internal_comparator_(internal_comparator) {
+ // Setup member variables to search first level.
+ search_ended_ = !PrepareNextLevel();
+ if (!search_ended_) {
+ // Prefetch Level 0 table data to avoid cache miss if possible.
+ for (unsigned int i = 0; i < (*level_files_brief_)[0].num_files; ++i) {
+ auto* r = (*level_files_brief_)[0].files[i].fd.table_reader;
+ if (r) {
+ r->Prepare(ikey);
+ }
+ }
+ }
+ }
+
+ int GetCurrentLevel() const { return curr_level_; }
+
+ FdWithKeyRange* GetNextFile() {
+ while (!search_ended_) { // Loops over different levels.
+ while (curr_index_in_curr_level_ < curr_file_level_->num_files) {
+ // Loops over all files in current level.
+ FdWithKeyRange* f = &curr_file_level_->files[curr_index_in_curr_level_];
+ hit_file_level_ = curr_level_;
+ is_hit_file_last_in_level_ =
+ curr_index_in_curr_level_ == curr_file_level_->num_files - 1;
+ int cmp_largest = -1;
+
+ // Do key range filtering of files and/or fractional cascading if:
+ // (1) not all the files are in level 0, or
+ // (2) there are more than 3 files in the current level
+ // If there are only 3 or fewer files in the current level, we skip the
+ // key range filtering. In that case, the system is more likely highly
+ // tuned to minimize the number of tables queried by each query, so it
+ // is unlikely that key range filtering is more efficient than simply
+ // querying the files.
+ if (num_levels_ > 1 || curr_file_level_->num_files > 3) {
+ // Check if the key is within a file's range. If the search left bound
+ // and right bound point to the same file, we are sure the key falls
+ // within its range.
+ assert(curr_level_ == 0 ||
+ curr_index_in_curr_level_ == start_index_in_curr_level_ ||
+ user_comparator_->CompareWithoutTimestamp(
+ user_key_, ExtractUserKey(f->smallest_key)) <= 0);
+
+ int cmp_smallest = user_comparator_->CompareWithoutTimestamp(
+ user_key_, ExtractUserKey(f->smallest_key));
+ if (cmp_smallest >= 0) {
+ cmp_largest = user_comparator_->CompareWithoutTimestamp(
+ user_key_, ExtractUserKey(f->largest_key));
+ }
+
+ // Setup file search bound for the next level based on the
+ // comparison results
+ if (curr_level_ > 0) {
+ file_indexer_->GetNextLevelIndex(
+ curr_level_, curr_index_in_curr_level_, cmp_smallest,
+ cmp_largest, &search_left_bound_, &search_right_bound_);
+ }
+ // Key falls out of current file's range
+ if (cmp_smallest < 0 || cmp_largest > 0) {
+ if (curr_level_ == 0) {
+ ++curr_index_in_curr_level_;
+ continue;
+ } else {
+ // Search next level.
+ break;
+ }
+ }
+ }
+
+ returned_file_level_ = curr_level_;
+ if (curr_level_ > 0 && cmp_largest < 0) {
+ // No more files to search in this level.
+ search_ended_ = !PrepareNextLevel();
+ } else {
+ ++curr_index_in_curr_level_;
+ }
+ return f;
+ }
+ // Start searching next level.
+ search_ended_ = !PrepareNextLevel();
+ }
+ // Search ended.
+ return nullptr;
+ }
+
+ // getter for current file level
+ // for GET_HIT_L0, GET_HIT_L1 & GET_HIT_L2_AND_UP counts
+ unsigned int GetHitFileLevel() { return hit_file_level_; }
+
+ // Returns true if the most recent "hit file" (i.e., one returned by
+ // GetNextFile()) is at the last index in its level.
+ bool IsHitFileLastInLevel() { return is_hit_file_last_in_level_; }
+
+ private:
+ unsigned int num_levels_;
+ unsigned int curr_level_;
+ unsigned int returned_file_level_;
+ unsigned int hit_file_level_;
+ int32_t search_left_bound_;
+ int32_t search_right_bound_;
+ autovector<LevelFilesBrief>* level_files_brief_;
+ bool search_ended_;
+ bool is_hit_file_last_in_level_;
+ LevelFilesBrief* curr_file_level_;
+ unsigned int curr_index_in_curr_level_;
+ unsigned int start_index_in_curr_level_;
+ Slice user_key_;
+ Slice ikey_;
+ FileIndexer* file_indexer_;
+ const Comparator* user_comparator_;
+ const InternalKeyComparator* internal_comparator_;
+
+ // Setup local variables to search next level.
+ // Returns false if there are no more levels to search.
+ bool PrepareNextLevel() {
+ curr_level_++;
+ while (curr_level_ < num_levels_) {
+ curr_file_level_ = &(*level_files_brief_)[curr_level_];
+ if (curr_file_level_->num_files == 0) {
+ // When current level is empty, the search bound generated from upper
+ // level must be [0, -1] or [0, FileIndexer::kLevelMaxIndex] if it is
+ // also empty.
+ assert(search_left_bound_ == 0);
+ assert(search_right_bound_ == -1 ||
+ search_right_bound_ == FileIndexer::kLevelMaxIndex);
+ // Since current level is empty, it will need to search all files in
+ // the next level
+ search_left_bound_ = 0;
+ search_right_bound_ = FileIndexer::kLevelMaxIndex;
+ curr_level_++;
+ continue;
+ }
+
+ // Some files may overlap each other. We find
+ // all files that overlap user_key and process them in order from
+ // newest to oldest. In the context of merge-operator, this can occur at
+ // any level. Otherwise, it only occurs at Level-0 (since Put/Deletes
+ // are always compacted into a single entry).
+ int32_t start_index;
+ if (curr_level_ == 0) {
+ // On Level-0, we read through all files to check for overlap.
+ start_index = 0;
+ } else {
+ // On Level-n (n>=1), files are sorted. Binary search to find the
+ // earliest file whose largest key >= ikey. Search left bound and
+ // right bound are used to narrow the range.
+ if (search_left_bound_ <= search_right_bound_) {
+ if (search_right_bound_ == FileIndexer::kLevelMaxIndex) {
+ search_right_bound_ =
+ static_cast<int32_t>(curr_file_level_->num_files) - 1;
+ }
+ // `search_right_bound_` is an inclusive upper-bound, but since it was
+ // determined based on user key, it is still possible the lookup key
+ // falls to the right of `search_right_bound_`'s corresponding file.
+ // So, pass a limit one higher, which allows us to detect this case.
+ start_index =
+ FindFileInRange(*internal_comparator_, *curr_file_level_, ikey_,
+ static_cast<uint32_t>(search_left_bound_),
+ static_cast<uint32_t>(search_right_bound_) + 1);
+ if (start_index == search_right_bound_ + 1) {
+ // `ikey_` comes after `search_right_bound_`. The lookup key does
+ // not exist on this level, so let's skip this level and do a full
+ // binary search on the next level.
+ search_left_bound_ = 0;
+ search_right_bound_ = FileIndexer::kLevelMaxIndex;
+ curr_level_++;
+ continue;
+ }
+ } else {
+ // search_left_bound > search_right_bound, key does not exist in
+ // this level. Since no comparison is done in this level, it will
+ // need to search all files in the next level.
+ search_left_bound_ = 0;
+ search_right_bound_ = FileIndexer::kLevelMaxIndex;
+ curr_level_++;
+ continue;
+ }
+ }
+ start_index_in_curr_level_ = start_index;
+ curr_index_in_curr_level_ = start_index;
+
+ return true;
+ }
+ // curr_level_ = num_levels_. So, no more levels to search.
+ return false;
+ }
+};
+} // anonymous namespace
+
+class FilePickerMultiGet {
+ private:
+ struct FilePickerContext;
+
+ public:
+ FilePickerMultiGet(MultiGetRange* range,
+ autovector<LevelFilesBrief>* file_levels,
+ unsigned int num_levels, FileIndexer* file_indexer,
+ const Comparator* user_comparator,
+ const InternalKeyComparator* internal_comparator)
+ : num_levels_(num_levels),
+ curr_level_(static_cast<unsigned int>(-1)),
+ returned_file_level_(static_cast<unsigned int>(-1)),
+ hit_file_level_(static_cast<unsigned int>(-1)),
+ range_(*range, range->begin(), range->end()),
+ maybe_repeat_key_(false),
+ current_level_range_(*range, range->begin(), range->end()),
+ current_file_range_(*range, range->begin(), range->end()),
+ batch_iter_(range->begin()),
+ batch_iter_prev_(range->begin()),
+ upper_key_(range->begin()),
+ level_files_brief_(file_levels),
+ is_hit_file_last_in_level_(false),
+ curr_file_level_(nullptr),
+ file_indexer_(file_indexer),
+ user_comparator_(user_comparator),
+ internal_comparator_(internal_comparator),
+ hit_file_(nullptr) {
+ for (auto iter = range_.begin(); iter != range_.end(); ++iter) {
+ fp_ctx_array_[iter.index()] =
+ FilePickerContext(0, FileIndexer::kLevelMaxIndex);
+ }
+
+ // Setup member variables to search first level.
+ search_ended_ = !PrepareNextLevel();
+ if (!search_ended_) {
+ // REVISIT
+ // Prefetch Level 0 table data to avoid cache miss if possible.
+ // As of now, only PlainTableReader and CuckooTableReader do any
+ // prefetching. This may not be necessary anymore once we implement
+ // batching in those table readers
+ for (unsigned int i = 0; i < (*level_files_brief_)[0].num_files; ++i) {
+ auto* r = (*level_files_brief_)[0].files[i].fd.table_reader;
+ if (r) {
+ for (auto iter = range_.begin(); iter != range_.end(); ++iter) {
+ r->Prepare(iter->ikey);
+ }
+ }
+ }
+ }
+ }
+
+ FilePickerMultiGet(MultiGetRange* range, const FilePickerMultiGet& other)
+ : num_levels_(other.num_levels_),
+ curr_level_(other.curr_level_),
+ returned_file_level_(other.returned_file_level_),
+ hit_file_level_(other.hit_file_level_),
+ fp_ctx_array_(other.fp_ctx_array_),
+ range_(*range, range->begin(), range->end()),
+ maybe_repeat_key_(false),
+ current_level_range_(*range, range->begin(), range->end()),
+ current_file_range_(*range, range->begin(), range->end()),
+ batch_iter_(range->begin()),
+ batch_iter_prev_(range->begin()),
+ upper_key_(range->begin()),
+ level_files_brief_(other.level_files_brief_),
+ is_hit_file_last_in_level_(false),
+ curr_file_level_(other.curr_file_level_),
+ file_indexer_(other.file_indexer_),
+ user_comparator_(other.user_comparator_),
+ internal_comparator_(other.internal_comparator_),
+ hit_file_(nullptr) {
+ PrepareNextLevelForSearch();
+ }
+
+ int GetCurrentLevel() const { return curr_level_; }
+
+ void PrepareNextLevelForSearch() { search_ended_ = !PrepareNextLevel(); }
+
+ FdWithKeyRange* GetNextFileInLevel() {
+ if (batch_iter_ == current_level_range_.end() || search_ended_) {
+ hit_file_ = nullptr;
+ return nullptr;
+ } else {
+ if (maybe_repeat_key_) {
+ maybe_repeat_key_ = false;
+ // Check if we found the final value for the last key in the
+ // previous lookup range. If we did, then there's no need to look
+ // any further for that key, so advance batch_iter_. Else, keep
+ // batch_iter_ positioned on that key so we look it up again in
+ // the next file.
+ // For L0, always advance the key, because we will look in the next
+ // file regardless for every key that has not been found yet.
+ if (current_level_range_.CheckKeyDone(batch_iter_) ||
+ curr_level_ == 0) {
+ batch_iter_ = upper_key_;
+ }
+ }
+ // batch_iter_prev_ will become the start key for the next file
+ // lookup
+ batch_iter_prev_ = batch_iter_;
+ }
+
+ MultiGetRange next_file_range(current_level_range_, batch_iter_prev_,
+ current_level_range_.end());
+ size_t curr_file_index =
+ (batch_iter_ != current_level_range_.end())
+ ? fp_ctx_array_[batch_iter_.index()].curr_index_in_curr_level
+ : curr_file_level_->num_files;
+ FdWithKeyRange* f;
+ bool is_last_key_in_file;
+ if (!GetNextFileInLevelWithKeys(&next_file_range, &curr_file_index, &f,
+ &is_last_key_in_file)) {
+ hit_file_ = nullptr;
+ return nullptr;
+ } else {
+ if (is_last_key_in_file) {
+ // Since cmp_largest is 0, batch_iter_ still points to the last key
+ // that falls in this file, instead of the next one. Increment
+ // the file index for all keys between batch_iter_ and upper_key_
+ auto tmp_iter = batch_iter_;
+ while (tmp_iter != upper_key_) {
+ ++(fp_ctx_array_[tmp_iter.index()].curr_index_in_curr_level);
+ ++tmp_iter;
+ }
+ maybe_repeat_key_ = true;
+ }
+ // Set the range for this file
+ current_file_range_ =
+ MultiGetRange(next_file_range, batch_iter_prev_, upper_key_);
+ returned_file_level_ = curr_level_;
+ hit_file_level_ = curr_level_;
+ is_hit_file_last_in_level_ =
+ curr_file_index == curr_file_level_->num_files - 1;
+ hit_file_ = f;
+ return f;
+ }
+ }
+
+  // Returns the level of the file most recently returned by
+  // GetNextFileInLevel(), used for the GET_HIT_L0, GET_HIT_L1 &
+  // GET_HIT_L2_AND_UP counters
+ unsigned int GetHitFileLevel() { return hit_file_level_; }
+
+ FdWithKeyRange* GetHitFile() { return hit_file_; }
+
+  // Returns true if the most recent "hit file" (i.e., one returned by
+  // GetNextFileInLevel()) is at the last index in its level.
+ bool IsHitFileLastInLevel() { return is_hit_file_last_in_level_; }
+
+ bool KeyMaySpanNextFile() { return maybe_repeat_key_; }
+
+ bool IsSearchEnded() { return search_ended_; }
+
+ const MultiGetRange& CurrentFileRange() { return current_file_range_; }
+
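+  // Returns true if the current level range still contains keys beyond the
+  // range covered by the most recently returned file, i.e. more files in
+  // this level may need to be examined for this batch.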
+ bool RemainingOverlapInLevel() {
+ return !current_level_range_.Suffix(current_file_range_).empty();
+ }
+
+ MultiGetRange& GetRange() { return range_; }
+
+ void ReplaceRange(const MultiGetRange& other) {
+ assert(hit_file_ == nullptr);
+ range_ = other;
+ current_level_range_ = other;
+ }
+
+ FilePickerMultiGet(FilePickerMultiGet&& other)
+ : num_levels_(other.num_levels_),
+ curr_level_(other.curr_level_),
+ returned_file_level_(other.returned_file_level_),
+ hit_file_level_(other.hit_file_level_),
+ fp_ctx_array_(std::move(other.fp_ctx_array_)),
+ range_(std::move(other.range_)),
+ maybe_repeat_key_(other.maybe_repeat_key_),
+ current_level_range_(std::move(other.current_level_range_)),
+ current_file_range_(std::move(other.current_file_range_)),
+ batch_iter_(other.batch_iter_, &current_level_range_),
+ batch_iter_prev_(other.batch_iter_prev_, &current_level_range_),
+ upper_key_(other.upper_key_, &current_level_range_),
+ level_files_brief_(other.level_files_brief_),
+ search_ended_(other.search_ended_),
+ is_hit_file_last_in_level_(other.is_hit_file_last_in_level_),
+ curr_file_level_(other.curr_file_level_),
+ file_indexer_(other.file_indexer_),
+ user_comparator_(other.user_comparator_),
+ internal_comparator_(other.internal_comparator_),
+ hit_file_(other.hit_file_) {}
+
+ private:
+ unsigned int num_levels_;
+ unsigned int curr_level_;
+ unsigned int returned_file_level_;
+ unsigned int hit_file_level_;
+
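+  // Per-key search state: the binary search bounds for the current level
+  // (maintained via FileIndexer's fractional cascading) and the index of the
+  // file currently being probed for this key in the current level.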
+ struct FilePickerContext {
+ int32_t search_left_bound;
+ int32_t search_right_bound;
+ unsigned int curr_index_in_curr_level;
+ unsigned int start_index_in_curr_level;
+
+ FilePickerContext(int32_t left, int32_t right)
+ : search_left_bound(left),
+ search_right_bound(right),
+ curr_index_in_curr_level(0),
+ start_index_in_curr_level(0) {}
+
+ FilePickerContext() = default;
+ };
+ std::array<FilePickerContext, MultiGetContext::MAX_BATCH_SIZE> fp_ctx_array_;
+ MultiGetRange range_;
+ bool maybe_repeat_key_;
+ MultiGetRange current_level_range_;
+ MultiGetRange current_file_range_;
+  // Iterator over the keys in a MultiGet batch; it is reset at the beginning
+  // of each level. Each call to GetNextFileInLevel() positions batch_iter_ at
+  // or right after the last key that was found in the returned SST file
+ MultiGetRange::Iterator batch_iter_;
+  // An iterator that records the previous position of batch_iter_, i.e., the
+  // last key found in the previous SST file, in order to serve as the start
+  // of the batch key range for the next SST file
+ MultiGetRange::Iterator batch_iter_prev_;
+ MultiGetRange::Iterator upper_key_;
+ autovector<LevelFilesBrief>* level_files_brief_;
+ bool search_ended_;
+ bool is_hit_file_last_in_level_;
+ LevelFilesBrief* curr_file_level_;
+ FileIndexer* file_indexer_;
+ const Comparator* user_comparator_;
+ const InternalKeyComparator* internal_comparator_;
+ FdWithKeyRange* hit_file_;
+
+ // Iterates through files in the current level until it finds a file that
+ // contains at least one key from the MultiGet batch
+ bool GetNextFileInLevelWithKeys(MultiGetRange* next_file_range,
+ size_t* file_index, FdWithKeyRange** fd,
+ bool* is_last_key_in_file) {
+ size_t curr_file_index = *file_index;
+ FdWithKeyRange* f = nullptr;
+ bool file_hit = false;
+ int cmp_largest = -1;
+ if (curr_file_index >= curr_file_level_->num_files) {
+ // In the unlikely case the next key is a duplicate of the current key,
+ // and the current key is the last in the level and the internal key
+ // was not found, we need to skip lookup for the remaining keys and
+ // reset the search bounds
+ if (batch_iter_ != current_level_range_.end()) {
+ ++batch_iter_;
+ for (; batch_iter_ != current_level_range_.end(); ++batch_iter_) {
+ struct FilePickerContext& fp_ctx = fp_ctx_array_[batch_iter_.index()];
+ fp_ctx.search_left_bound = 0;
+ fp_ctx.search_right_bound = FileIndexer::kLevelMaxIndex;
+ }
+ }
+ return false;
+ }
+    // Loops over keys in the MultiGet batch until it finds a file with
+    // at least one of the keys. Then it keeps moving forward until it has
+    // covered the last key in the batch that falls in that file
+ while (batch_iter_ != current_level_range_.end() &&
+ (fp_ctx_array_[batch_iter_.index()].curr_index_in_curr_level ==
+ curr_file_index ||
+ !file_hit)) {
+ struct FilePickerContext& fp_ctx = fp_ctx_array_[batch_iter_.index()];
+ f = &curr_file_level_->files[fp_ctx.curr_index_in_curr_level];
+ Slice& user_key = batch_iter_->ukey_without_ts;
+
+      // Do key range filtering of files and/or fractional cascading if:
+      // (1) not all the files are in level 0, or
+      // (2) there are more than 3 files in the current level.
+      // If there are only 3 or fewer current level files in the system, we
+      // skip the key range filtering. In this case, the system is most
+      // likely highly tuned to minimize the number of tables queried by
+      // each query, so it is unlikely that key range filtering is more
+      // efficient than querying the files.
+ if (num_levels_ > 1 || curr_file_level_->num_files > 3) {
+        // Check if the key is within the file's range. If the search left
+        // bound and right bound point to the same file, we are sure the key
+        // falls in range.
+ int cmp_smallest = user_comparator_->CompareWithoutTimestamp(
+ user_key, false, ExtractUserKey(f->smallest_key), true);
+
+ assert(curr_level_ == 0 ||
+ fp_ctx.curr_index_in_curr_level ==
+ fp_ctx.start_index_in_curr_level ||
+ cmp_smallest <= 0);
+
+ if (cmp_smallest >= 0) {
+ cmp_largest = user_comparator_->CompareWithoutTimestamp(
+ user_key, false, ExtractUserKey(f->largest_key), true);
+ } else {
+ cmp_largest = -1;
+ }
+
+ // Setup file search bound for the next level based on the
+ // comparison results
+ if (curr_level_ > 0) {
+ file_indexer_->GetNextLevelIndex(
+ curr_level_, fp_ctx.curr_index_in_curr_level, cmp_smallest,
+ cmp_largest, &fp_ctx.search_left_bound,
+ &fp_ctx.search_right_bound);
+ }
+ // Key falls out of current file's range
+ if (cmp_smallest < 0 || cmp_largest > 0) {
+ next_file_range->SkipKey(batch_iter_);
+ } else {
+ file_hit = true;
+ }
+ } else {
+ file_hit = true;
+ }
+ if (cmp_largest == 0) {
+ // cmp_largest is 0, which means the next key will not be in this
+        // file, so stop looking further. However, it's possible there are
+ // duplicates in the batch, so find the upper bound for the batch
+ // in this file (upper_key_) by skipping past the duplicates. We
+ // leave batch_iter_ as is since we may have to pick up from there
+ // for the next file, if this file has a merge value rather than
+ // final value
+ upper_key_ = batch_iter_;
+ ++upper_key_;
+ while (upper_key_ != current_level_range_.end() &&
+ user_comparator_->CompareWithoutTimestamp(
+ batch_iter_->ukey_without_ts, false,
+ upper_key_->ukey_without_ts, false) == 0) {
+ ++upper_key_;
+ }
+ break;
+ } else {
+ if (curr_level_ == 0) {
+ // We need to look through all files in level 0
+ ++fp_ctx.curr_index_in_curr_level;
+ }
+ ++batch_iter_;
+ }
+ if (!file_hit) {
+ curr_file_index =
+ (batch_iter_ != current_level_range_.end())
+ ? fp_ctx_array_[batch_iter_.index()].curr_index_in_curr_level
+ : curr_file_level_->num_files;
+ }
+ }
+
+ *fd = f;
+ *file_index = curr_file_index;
+ *is_last_key_in_file = cmp_largest == 0;
+ if (!*is_last_key_in_file) {
+ // If the largest key in the batch overlapping the file is not the
+      // largest key in the file, upper_key_ would not have been updated, so
+ // update it here
+ upper_key_ = batch_iter_;
+ }
+ return file_hit;
+ }
+
+ // Setup local variables to search next level.
+ // Returns false if there are no more levels to search.
+ bool PrepareNextLevel() {
+ if (curr_level_ == 0) {
+ MultiGetRange::Iterator mget_iter = current_level_range_.begin();
+ if (fp_ctx_array_[mget_iter.index()].curr_index_in_curr_level <
+ curr_file_level_->num_files) {
+ batch_iter_prev_ = current_level_range_.begin();
+ upper_key_ = batch_iter_ = current_level_range_.begin();
+ return true;
+ }
+ }
+
+ curr_level_++;
+ // Reset key range to saved value
+ while (curr_level_ < num_levels_) {
+ bool level_contains_keys = false;
+ curr_file_level_ = &(*level_files_brief_)[curr_level_];
+ if (curr_file_level_->num_files == 0) {
+ // When current level is empty, the search bound generated from upper
+ // level must be [0, -1] or [0, FileIndexer::kLevelMaxIndex] if it is
+ // also empty.
+
+ for (auto mget_iter = current_level_range_.begin();
+ mget_iter != current_level_range_.end(); ++mget_iter) {
+ struct FilePickerContext& fp_ctx = fp_ctx_array_[mget_iter.index()];
+
+ assert(fp_ctx.search_left_bound == 0);
+ assert(fp_ctx.search_right_bound == -1 ||
+ fp_ctx.search_right_bound == FileIndexer::kLevelMaxIndex);
+ // Since current level is empty, it will need to search all files in
+ // the next level
+ fp_ctx.search_left_bound = 0;
+ fp_ctx.search_right_bound = FileIndexer::kLevelMaxIndex;
+ }
+ // Skip all subsequent empty levels
+ do {
+ ++curr_level_;
+ } while ((curr_level_ < num_levels_) &&
+ (*level_files_brief_)[curr_level_].num_files == 0);
+ continue;
+ }
+
+ // Some files may overlap each other. We find
+ // all files that overlap user_key and process them in order from
+ // newest to oldest. In the context of merge-operator, this can occur at
+ // any level. Otherwise, it only occurs at Level-0 (since Put/Deletes
+ // are always compacted into a single entry).
+ int32_t start_index = -1;
+ current_level_range_ =
+ MultiGetRange(range_, range_.begin(), range_.end());
+ for (auto mget_iter = current_level_range_.begin();
+ mget_iter != current_level_range_.end(); ++mget_iter) {
+ struct FilePickerContext& fp_ctx = fp_ctx_array_[mget_iter.index()];
+ if (curr_level_ == 0) {
+ // On Level-0, we read through all files to check for overlap.
+ start_index = 0;
+ level_contains_keys = true;
+ } else {
+ // On Level-n (n>=1), files are sorted. Binary search to find the
+ // earliest file whose largest key >= ikey. Search left bound and
+ // right bound are used to narrow the range.
+ if (fp_ctx.search_left_bound <= fp_ctx.search_right_bound) {
+ if (fp_ctx.search_right_bound == FileIndexer::kLevelMaxIndex) {
+ fp_ctx.search_right_bound =
+ static_cast<int32_t>(curr_file_level_->num_files) - 1;
+ }
+            // `search_right_bound` is an inclusive upper bound, but since it
+            // was determined based on the user key, it is still possible the
+            // lookup key falls to the right of `search_right_bound`'s
+            // corresponding file. So, pass a limit one higher, which allows
+            // us to detect this case.
+ Slice& ikey = mget_iter->ikey;
+ start_index = FindFileInRange(
+ *internal_comparator_, *curr_file_level_, ikey,
+ static_cast<uint32_t>(fp_ctx.search_left_bound),
+ static_cast<uint32_t>(fp_ctx.search_right_bound) + 1);
+ if (start_index == fp_ctx.search_right_bound + 1) {
+              // `ikey` comes after `search_right_bound`. The lookup key does
+ // not exist on this level, so let's skip this level and do a full
+ // binary search on the next level.
+ fp_ctx.search_left_bound = 0;
+ fp_ctx.search_right_bound = FileIndexer::kLevelMaxIndex;
+ current_level_range_.SkipKey(mget_iter);
+ continue;
+ } else {
+ level_contains_keys = true;
+ }
+ } else {
+ // search_left_bound > search_right_bound, key does not exist in
+ // this level. Since no comparison is done in this level, it will
+ // need to search all files in the next level.
+ fp_ctx.search_left_bound = 0;
+ fp_ctx.search_right_bound = FileIndexer::kLevelMaxIndex;
+ current_level_range_.SkipKey(mget_iter);
+ continue;
+ }
+ }
+ fp_ctx.start_index_in_curr_level = start_index;
+ fp_ctx.curr_index_in_curr_level = start_index;
+ }
+ if (level_contains_keys) {
+ batch_iter_prev_ = current_level_range_.begin();
+ upper_key_ = batch_iter_ = current_level_range_.begin();
+ return true;
+ }
+ curr_level_++;
+ }
+  // curr_level_ == num_levels_, so there are no more levels to search.
+ return false;
+ }
+};
+
+VersionStorageInfo::~VersionStorageInfo() { delete[] files_; }
+
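+// Unlinks this Version from the VersionSet's doubly-linked list and drops its
+// references to table files; any file whose reference count reaches zero is
+// queued in obsolete_files_ for later deletion.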
+Version::~Version() {
+ assert(refs_ == 0);
+
+ // Remove from linked list
+ prev_->next_ = next_;
+ next_->prev_ = prev_;
+
+ // Drop references to files
+ for (int level = 0; level < storage_info_.num_levels_; level++) {
+ for (size_t i = 0; i < storage_info_.files_[level].size(); i++) {
+ FileMetaData* f = storage_info_.files_[level][i];
+ assert(f->refs > 0);
+ f->refs--;
+ if (f->refs <= 0) {
+ assert(cfd_ != nullptr);
+ uint32_t path_id = f->fd.GetPathId();
+ assert(path_id < cfd_->ioptions()->cf_paths.size());
+ vset_->obsolete_files_.push_back(
+ ObsoleteFileInfo(f, cfd_->ioptions()->cf_paths[path_id].path,
+ cfd_->GetFileMetadataCacheReservationManager()));
+ }
+ }
+ }
+}
+
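+// Returns the index of the earliest file in `file_level` whose largest key is
+// >= `key` (an internal key), or file_level.num_files if no such file exists.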
+int FindFile(const InternalKeyComparator& icmp,
+ const LevelFilesBrief& file_level, const Slice& key) {
+ return FindFileInRange(icmp, file_level, key, 0,
+ static_cast<uint32_t>(file_level.num_files));
+}
+
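+// Builds a flat, arena-allocated summary of one level's files. The smallest
+// and largest keys of each file are copied into contiguous arena memory so
+// that later lookups can avoid dereferencing FileMetaData.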
+void DoGenerateLevelFilesBrief(LevelFilesBrief* file_level,
+ const std::vector<FileMetaData*>& files,
+ Arena* arena) {
+ assert(file_level);
+ assert(arena);
+
+ size_t num = files.size();
+ file_level->num_files = num;
+ char* mem = arena->AllocateAligned(num * sizeof(FdWithKeyRange));
+ file_level->files = new (mem) FdWithKeyRange[num];
+
+ for (size_t i = 0; i < num; i++) {
+ Slice smallest_key = files[i]->smallest.Encode();
+ Slice largest_key = files[i]->largest.Encode();
+
+ // Copy key slice to sequential memory
+ size_t smallest_size = smallest_key.size();
+ size_t largest_size = largest_key.size();
+ mem = arena->AllocateAligned(smallest_size + largest_size);
+ memcpy(mem, smallest_key.data(), smallest_size);
+ memcpy(mem + smallest_size, largest_key.data(), largest_size);
+
+ FdWithKeyRange& f = file_level->files[i];
+ f.fd = files[i]->fd;
+ f.file_metadata = files[i];
+ f.smallest_key = Slice(mem, smallest_size);
+ f.largest_key = Slice(mem + smallest_size, largest_size);
+ }
+}
+
+static bool AfterFile(const Comparator* ucmp, const Slice* user_key,
+ const FdWithKeyRange* f) {
+ // nullptr user_key occurs before all keys and is therefore never after *f
+ return (user_key != nullptr &&
+ ucmp->CompareWithoutTimestamp(*user_key,
+ ExtractUserKey(f->largest_key)) > 0);
+}
+
+static bool BeforeFile(const Comparator* ucmp, const Slice* user_key,
+ const FdWithKeyRange* f) {
+ // nullptr user_key occurs after all keys and is therefore never before *f
+ return (user_key != nullptr &&
+ ucmp->CompareWithoutTimestamp(*user_key,
+ ExtractUserKey(f->smallest_key)) < 0);
+}
+
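+// Returns true if some file in `file_level` may overlap the user key range
+// [*smallest_user_key, *largest_user_key]. A nullptr bound means the range is
+// unbounded on that side. For disjoint sorted files (levels > 0) a binary
+// search is used; otherwise every file is checked.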
+bool SomeFileOverlapsRange(const InternalKeyComparator& icmp,
+ bool disjoint_sorted_files,
+ const LevelFilesBrief& file_level,
+ const Slice* smallest_user_key,
+ const Slice* largest_user_key) {
+ const Comparator* ucmp = icmp.user_comparator();
+ if (!disjoint_sorted_files) {
+ // Need to check against all files
+ for (size_t i = 0; i < file_level.num_files; i++) {
+ const FdWithKeyRange* f = &(file_level.files[i]);
+ if (AfterFile(ucmp, smallest_user_key, f) ||
+ BeforeFile(ucmp, largest_user_key, f)) {
+ // No overlap
+ } else {
+ return true; // Overlap
+ }
+ }
+ return false;
+ }
+
+ // Binary search over file list
+ uint32_t index = 0;
+ if (smallest_user_key != nullptr) {
+ // Find the leftmost possible internal key for smallest_user_key
+ InternalKey small;
+ small.SetMinPossibleForUserKey(*smallest_user_key);
+ index = FindFile(icmp, file_level, small.Encode());
+ }
+
+ if (index >= file_level.num_files) {
+ // beginning of range is after all files, so no overlap.
+ return false;
+ }
+
+ return !BeforeFile(ucmp, largest_user_key, &file_level.files[index]);
+}
+
+namespace {
+
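+// An internal iterator over the keys of all SST files in a LevelFilesBrief
+// (a sorted run of non-overlapping files, e.g. a level greater than 0).
+// Table iterators for the individual files are opened lazily through the
+// table cache as the iteration moves from file to file.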
+class LevelIterator final : public InternalIterator {
+ public:
+ // @param read_options Must outlive this iterator.
+ LevelIterator(
+ TableCache* table_cache, const ReadOptions& read_options,
+ const FileOptions& file_options, const InternalKeyComparator& icomparator,
+ const LevelFilesBrief* flevel,
+ const std::shared_ptr<const SliceTransform>& prefix_extractor,
+ bool should_sample, HistogramImpl* file_read_hist,
+ TableReaderCaller caller, bool skip_filters, int level,
+ RangeDelAggregator* range_del_agg,
+ const std::vector<AtomicCompactionUnitBoundary>* compaction_boundaries =
+ nullptr,
+ bool allow_unprepared_value = false,
+ TruncatedRangeDelIterator**** range_tombstone_iter_ptr_ = nullptr)
+ : table_cache_(table_cache),
+ read_options_(read_options),
+ file_options_(file_options),
+ icomparator_(icomparator),
+ user_comparator_(icomparator.user_comparator()),
+ flevel_(flevel),
+ prefix_extractor_(prefix_extractor),
+ file_read_hist_(file_read_hist),
+ should_sample_(should_sample),
+ caller_(caller),
+ skip_filters_(skip_filters),
+ allow_unprepared_value_(allow_unprepared_value),
+ file_index_(flevel_->num_files),
+ level_(level),
+ range_del_agg_(range_del_agg),
+ pinned_iters_mgr_(nullptr),
+ compaction_boundaries_(compaction_boundaries),
+ is_next_read_sequential_(false),
+ range_tombstone_iter_(nullptr),
+ to_return_sentinel_(false) {
+ // Empty level is not supported.
+ assert(flevel_ != nullptr && flevel_->num_files > 0);
+ if (range_tombstone_iter_ptr_) {
+ *range_tombstone_iter_ptr_ = &range_tombstone_iter_;
+ }
+ }
+
+ ~LevelIterator() override { delete file_iter_.Set(nullptr); }
+
+ // Seek to the first file with a key >= target.
+ // If range_tombstone_iter_ is not nullptr, then we pretend that file
+ // boundaries are fake keys (sentinel keys). These keys are used to keep range
+ // tombstones alive even when all point keys in an SST file are exhausted.
+ // These sentinel keys will be skipped in merging iterator.
+ void Seek(const Slice& target) override;
+ void SeekForPrev(const Slice& target) override;
+ void SeekToFirst() override;
+ void SeekToLast() override;
+ void Next() final override;
+ bool NextAndGetResult(IterateResult* result) override;
+ void Prev() override;
+
+  // In addition to the valid state and the invalid state (!file_iter_.Valid()
+  // && status().ok()), the iterator has a third state: !file_iter_.Valid() &&
+  // to_return_sentinel_. This means we are at the end of a file, and a
+  // sentinel key (the file boundary that we pretend is a key) is to be
+  // returned next. file_iter_.Valid() and to_return_sentinel_ should not both
+  // be true.
+ bool Valid() const override {
+ assert(!(file_iter_.Valid() && to_return_sentinel_));
+ return file_iter_.Valid() || to_return_sentinel_;
+ }
+ Slice key() const override {
+ assert(Valid());
+ if (to_return_sentinel_) {
+ // Sentinel should be returned after file_iter_ reaches the end of the
+ // file
+ assert(!file_iter_.Valid());
+ return sentinel_;
+ }
+ return file_iter_.key();
+ }
+
+ Slice value() const override {
+ assert(Valid());
+ assert(!to_return_sentinel_);
+ return file_iter_.value();
+ }
+
+ Status status() const override {
+ return file_iter_.iter() ? file_iter_.status() : Status::OK();
+ }
+
+ bool PrepareValue() override { return file_iter_.PrepareValue(); }
+
+ inline bool MayBeOutOfLowerBound() override {
+ assert(Valid());
+ return may_be_out_of_lower_bound_ && file_iter_.MayBeOutOfLowerBound();
+ }
+
+ inline IterBoundCheck UpperBoundCheckResult() override {
+ if (Valid()) {
+ return file_iter_.UpperBoundCheckResult();
+ } else {
+ return IterBoundCheck::kUnknown;
+ }
+ }
+
+ void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override {
+ pinned_iters_mgr_ = pinned_iters_mgr;
+ if (file_iter_.iter()) {
+ file_iter_.SetPinnedItersMgr(pinned_iters_mgr);
+ }
+ }
+
+ bool IsKeyPinned() const override {
+ return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() &&
+ file_iter_.iter() && file_iter_.IsKeyPinned();
+ }
+
+ bool IsValuePinned() const override {
+ return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() &&
+ file_iter_.iter() && file_iter_.IsValuePinned();
+ }
+
+ bool IsDeleteRangeSentinelKey() const override { return to_return_sentinel_; }
+
+ private:
+ // Return true if at least one invalid file is seen and skipped.
+ bool SkipEmptyFileForward();
+ void SkipEmptyFileBackward();
+ void SetFileIterator(InternalIterator* iter);
+ void InitFileIterator(size_t new_file_index);
+
+ const Slice& file_smallest_key(size_t file_index) {
+ assert(file_index < flevel_->num_files);
+ return flevel_->files[file_index].smallest_key;
+ }
+
+ const Slice& file_largest_key(size_t file_index) {
+ assert(file_index < flevel_->num_files);
+ return flevel_->files[file_index].largest_key;
+ }
+
+ bool KeyReachedUpperBound(const Slice& internal_key) {
+ return read_options_.iterate_upper_bound != nullptr &&
+ user_comparator_.CompareWithoutTimestamp(
+ ExtractUserKey(internal_key), /*a_has_ts=*/true,
+ *read_options_.iterate_upper_bound, /*b_has_ts=*/false) >= 0;
+ }
+
+ void ClearRangeTombstoneIter() {
+ if (range_tombstone_iter_ && *range_tombstone_iter_) {
+ delete *range_tombstone_iter_;
+ *range_tombstone_iter_ = nullptr;
+ }
+ }
+
+  // Creates an iterator for the file at file_index_.
+  // range_tombstone_iter_ is updated with a range tombstone iterator
+  // for the new file; the old range tombstone iterator is cleared.
+ InternalIterator* NewFileIterator() {
+ assert(file_index_ < flevel_->num_files);
+ auto file_meta = flevel_->files[file_index_];
+ if (should_sample_) {
+ sample_file_read_inc(file_meta.file_metadata);
+ }
+
+ const InternalKey* smallest_compaction_key = nullptr;
+ const InternalKey* largest_compaction_key = nullptr;
+ if (compaction_boundaries_ != nullptr) {
+ smallest_compaction_key = (*compaction_boundaries_)[file_index_].smallest;
+ largest_compaction_key = (*compaction_boundaries_)[file_index_].largest;
+ }
+ CheckMayBeOutOfLowerBound();
+ ClearRangeTombstoneIter();
+ return table_cache_->NewIterator(
+ read_options_, file_options_, icomparator_, *file_meta.file_metadata,
+ range_del_agg_, prefix_extractor_,
+ nullptr /* don't need reference to table */, file_read_hist_, caller_,
+ /*arena=*/nullptr, skip_filters_, level_,
+ /*max_file_size_for_l0_meta_pin=*/0, smallest_compaction_key,
+ largest_compaction_key, allow_unprepared_value_, range_tombstone_iter_);
+ }
+
+  // Check whether the current file lies fully within iterate_lower_bound.
+  //
+  // Note that MyRocks may update the iterate bounds between seeks. To work
+  // around this, we need to check and update may_be_out_of_lower_bound_
+  // accordingly.
+ void CheckMayBeOutOfLowerBound() {
+ if (read_options_.iterate_lower_bound != nullptr &&
+ file_index_ < flevel_->num_files) {
+ may_be_out_of_lower_bound_ =
+ user_comparator_.CompareWithoutTimestamp(
+ ExtractUserKey(file_smallest_key(file_index_)), /*a_has_ts=*/true,
+ *read_options_.iterate_lower_bound, /*b_has_ts=*/false) < 0;
+ }
+ }
+
+ TableCache* table_cache_;
+ const ReadOptions& read_options_;
+ const FileOptions& file_options_;
+ const InternalKeyComparator& icomparator_;
+ const UserComparatorWrapper user_comparator_;
+ const LevelFilesBrief* flevel_;
+ mutable FileDescriptor current_value_;
+ // `prefix_extractor_` may be non-null even for total order seek. Checking
+ // this variable is not the right way to identify whether prefix iterator
+ // is used.
+ const std::shared_ptr<const SliceTransform>& prefix_extractor_;
+
+ HistogramImpl* file_read_hist_;
+ bool should_sample_;
+ TableReaderCaller caller_;
+ bool skip_filters_;
+ bool allow_unprepared_value_;
+ bool may_be_out_of_lower_bound_ = true;
+ size_t file_index_;
+ int level_;
+ RangeDelAggregator* range_del_agg_;
+ IteratorWrapper file_iter_; // May be nullptr
+ PinnedIteratorsManager* pinned_iters_mgr_;
+
+ // To be propagated to RangeDelAggregator in order to safely truncate range
+ // tombstones.
+ const std::vector<AtomicCompactionUnitBoundary>* compaction_boundaries_;
+
+ bool is_next_read_sequential_;
+
+ // This is set when this level iterator is used under a merging iterator
+ // that processes range tombstones. range_tombstone_iter_ points to where the
+ // merging iterator stores the range tombstones iterator for this level. When
+ // this level iterator moves to a new SST file, it updates the range
+ // tombstones accordingly through this pointer. So the merging iterator always
+ // has access to the current SST file's range tombstones.
+ //
+ // The level iterator treats file boundary as fake keys (sentinel keys) to
+ // keep range tombstones alive if needed and make upper level, i.e. merging
+ // iterator, aware of file changes (when level iterator moves to a new SST
+ // file, there is some bookkeeping work that needs to be done at merging
+ // iterator end).
+ //
+ // *range_tombstone_iter_ points to range tombstones of the current SST file
+ TruncatedRangeDelIterator** range_tombstone_iter_;
+
+ // Whether next/prev key is a sentinel key.
+ bool to_return_sentinel_ = false;
+ // The sentinel key to be returned
+ Slice sentinel_;
+  // Sets the flag for whether we should return the sentinel key next.
+  // The condition for returning the sentinel is reaching the end of the
+  // current file_iter_: !Valid() && status().ok().
+ void TrySetDeleteRangeSentinel(const Slice& boundary_key);
+ void ClearSentinel() { to_return_sentinel_ = false; }
+
+ // Set in Seek() when a prefix seek reaches end of the current file,
+ // and the next file has a different prefix. SkipEmptyFileForward()
+ // will not move to next file when this flag is set.
+ bool prefix_exhausted_ = false;
+};
+
+void LevelIterator::TrySetDeleteRangeSentinel(const Slice& boundary_key) {
+ assert(range_tombstone_iter_);
+ if (file_iter_.iter() != nullptr && !file_iter_.Valid() &&
+ file_iter_.status().ok()) {
+ to_return_sentinel_ = true;
+ sentinel_ = boundary_key;
+ }
+}
+
+void LevelIterator::Seek(const Slice& target) {
+ prefix_exhausted_ = false;
+ ClearSentinel();
+  // Check whether the seek key falls within the current file's key range
+ bool need_to_reseek = true;
+ if (file_iter_.iter() != nullptr && file_index_ < flevel_->num_files) {
+ const FdWithKeyRange& cur_file = flevel_->files[file_index_];
+ if (icomparator_.InternalKeyComparator::Compare(
+ target, cur_file.largest_key) <= 0 &&
+ icomparator_.InternalKeyComparator::Compare(
+ target, cur_file.smallest_key) >= 0) {
+ need_to_reseek = false;
+ assert(static_cast<size_t>(FindFile(icomparator_, *flevel_, target)) ==
+ file_index_);
+ }
+ }
+ if (need_to_reseek) {
+ TEST_SYNC_POINT("LevelIterator::Seek:BeforeFindFile");
+ size_t new_file_index = FindFile(icomparator_, *flevel_, target);
+ InitFileIterator(new_file_index);
+ }
+
+ if (file_iter_.iter() != nullptr) {
+ file_iter_.Seek(target);
+    // Status::TryAgain indicates that an asynchronous request to retrieve
+    // data blocks has been submitted. Return at this point; Seek should be
+    // called again to retrieve the requested block and execute the remaining
+    // code.
+ if (file_iter_.status() == Status::TryAgain()) {
+ return;
+ }
+ if (!file_iter_.Valid() && file_iter_.status().ok() &&
+ prefix_extractor_ != nullptr && !read_options_.total_order_seek &&
+ !read_options_.auto_prefix_mode &&
+ file_index_ < flevel_->num_files - 1) {
+ size_t ts_sz = user_comparator_.user_comparator()->timestamp_size();
+ Slice target_user_key_without_ts =
+ ExtractUserKeyAndStripTimestamp(target, ts_sz);
+ Slice next_file_first_user_key_without_ts =
+ ExtractUserKeyAndStripTimestamp(file_smallest_key(file_index_ + 1),
+ ts_sz);
+ if (prefix_extractor_->InDomain(target_user_key_without_ts) &&
+ (!prefix_extractor_->InDomain(next_file_first_user_key_without_ts) ||
+ user_comparator_.CompareWithoutTimestamp(
+ prefix_extractor_->Transform(target_user_key_without_ts), false,
+ prefix_extractor_->Transform(
+ next_file_first_user_key_without_ts),
+ false) != 0)) {
+ // SkipEmptyFileForward() will not advance to next file when this flag
+          // is set, for the reasons detailed below.
+ //
+ // The file we initially positioned to has no keys under the target
+ // prefix, and the next file's smallest key has a different prefix than
+          // target. In a prefix seek, once the keys for one prefix have been
+          // exhausted, the iterator is allowed to jump to any key that is
+          // larger. Here we enforce a stricter contract than that, in order
+          // to make it easier for higher layers (merging and DB iterator) to
+          // reason about correctness:
+          // 1. Within the prefix, the result should be accurate.
+          // 2. If the keys for the prefix are exhausted, the iterator is
+          // either positioned at the next key after the prefix, or made
+          // invalid.
+          // A side benefit is that it invalidates the iterator earlier so
+          // that the upper level merging iterator can merge fewer child
+          // iterators.
+ //
+ // The flag is cleared in Seek*() calls. There is no need to clear the
+ // flag in Prev() since Prev() will not be called when the flag is set
+ // for reasons explained below. If range_tombstone_iter_ is nullptr,
+ // then there is no file boundary sentinel key. Since
+ // !file_iter_.Valid() from the if condition above, this level iterator
+ // is !Valid(), so Prev() will not be called. If range_tombstone_iter_
+          // is not nullptr, there are two cases depending on whether this
+          // level iterator reaches the top of the heap in the merging
+          // iterator (the upper layer).
+          // If so, the merging iterator will see the sentinel key, call
+          // NextAndGetResult(), and that call will skip the sentinel key and
+          // make this level iterator invalid. If not, then it
+ // could be because the upper layer is done before any method of this
+ // level iterator is called or another Seek*() call is invoked. Either
+ // way, Prev() is never called before Seek*().
+ // The flag should not be cleared at the beginning of
+ // Next/NextAndGetResult() since it is used in SkipEmptyFileForward()
+ // called in Next/NextAndGetResult().
+ prefix_exhausted_ = true;
+ }
+ }
+
+ if (range_tombstone_iter_) {
+ TrySetDeleteRangeSentinel(file_largest_key(file_index_));
+ }
+ }
+ SkipEmptyFileForward();
+ CheckMayBeOutOfLowerBound();
+}
+
+void LevelIterator::SeekForPrev(const Slice& target) {
+ prefix_exhausted_ = false;
+ ClearSentinel();
+ size_t new_file_index = FindFile(icomparator_, *flevel_, target);
+ // Seek beyond this level's smallest key
+ if (new_file_index == 0 &&
+ icomparator_.Compare(target, file_smallest_key(0)) < 0) {
+ SetFileIterator(nullptr);
+ ClearRangeTombstoneIter();
+ CheckMayBeOutOfLowerBound();
+ return;
+ }
+ if (new_file_index >= flevel_->num_files) {
+ new_file_index = flevel_->num_files - 1;
+ }
+
+ InitFileIterator(new_file_index);
+ if (file_iter_.iter() != nullptr) {
+ file_iter_.SeekForPrev(target);
+ if (range_tombstone_iter_ &&
+ icomparator_.Compare(target, file_smallest_key(file_index_)) >= 0) {
+      // In the SeekForPrev() case, it is possible that the target is less
+      // than the file's lower boundary, since the largest key is used to
+      // determine the file index (FindFile()). When the target is less than
+      // the file's lower boundary, the sentinel key should not be set, so
+      // that SeekForPrev() does not result in a key larger than the target.
+      // This is correct in that there is no need to keep the range
+      // tombstones in this file alive, as they only cover keys starting from
+      // the file's lower boundary, which is after `target`.
+ TrySetDeleteRangeSentinel(file_smallest_key(file_index_));
+ }
+ SkipEmptyFileBackward();
+ }
+ CheckMayBeOutOfLowerBound();
+}
+
+void LevelIterator::SeekToFirst() {
+ prefix_exhausted_ = false;
+ ClearSentinel();
+ InitFileIterator(0);
+ if (file_iter_.iter() != nullptr) {
+ file_iter_.SeekToFirst();
+ if (range_tombstone_iter_) {
+ // We do this in SeekToFirst() and SeekToLast() since
+ // we could have an empty file with only range tombstones.
+ TrySetDeleteRangeSentinel(file_largest_key(file_index_));
+ }
+ }
+ SkipEmptyFileForward();
+ CheckMayBeOutOfLowerBound();
+}
+
+void LevelIterator::SeekToLast() {
+ prefix_exhausted_ = false;
+ ClearSentinel();
+ InitFileIterator(flevel_->num_files - 1);
+ if (file_iter_.iter() != nullptr) {
+ file_iter_.SeekToLast();
+ if (range_tombstone_iter_) {
+ TrySetDeleteRangeSentinel(file_smallest_key(file_index_));
+ }
+ }
+ SkipEmptyFileBackward();
+ CheckMayBeOutOfLowerBound();
+}
+
+void LevelIterator::Next() {
+ assert(Valid());
+ if (to_return_sentinel_) {
+ // file_iter_ is at EOF already when to_return_sentinel_
+ ClearSentinel();
+ } else {
+ file_iter_.Next();
+ if (range_tombstone_iter_) {
+ TrySetDeleteRangeSentinel(file_largest_key(file_index_));
+ }
+ }
+ SkipEmptyFileForward();
+}
+
+bool LevelIterator::NextAndGetResult(IterateResult* result) {
+ assert(Valid());
+ // file_iter_ is at EOF already when to_return_sentinel_
+ bool is_valid = !to_return_sentinel_ && file_iter_.NextAndGetResult(result);
+ if (!is_valid) {
+ if (to_return_sentinel_) {
+ ClearSentinel();
+ } else if (range_tombstone_iter_) {
+ TrySetDeleteRangeSentinel(file_largest_key(file_index_));
+ }
+ is_next_read_sequential_ = true;
+ SkipEmptyFileForward();
+ is_next_read_sequential_ = false;
+ is_valid = Valid();
+ if (is_valid) {
+ // This could be set in TrySetDeleteRangeSentinel() or
+ // SkipEmptyFileForward() above.
+ if (to_return_sentinel_) {
+ result->key = sentinel_;
+ result->bound_check_result = IterBoundCheck::kUnknown;
+ result->value_prepared = true;
+ } else {
+ result->key = key();
+ result->bound_check_result = file_iter_.UpperBoundCheckResult();
+        // Ideally, we should return the real file_iter_.value_prepared, but
+        // that information is not available here. It would cause an extra
+        // PrepareValue() for the first key of a file.
+ result->value_prepared = !allow_unprepared_value_;
+ }
+ }
+ }
+ return is_valid;
+}
+
+void LevelIterator::Prev() {
+ assert(Valid());
+ if (to_return_sentinel_) {
+ ClearSentinel();
+ } else {
+ file_iter_.Prev();
+ if (range_tombstone_iter_) {
+ TrySetDeleteRangeSentinel(file_smallest_key(file_index_));
+ }
+ }
+ SkipEmptyFileBackward();
+}
+
+bool LevelIterator::SkipEmptyFileForward() {
+ bool seen_empty_file = false;
+ // Pause at sentinel key
+ while (!to_return_sentinel_ &&
+ (file_iter_.iter() == nullptr ||
+ (!file_iter_.Valid() && file_iter_.status().ok() &&
+ file_iter_.iter()->UpperBoundCheckResult() !=
+ IterBoundCheck::kOutOfBound))) {
+ seen_empty_file = true;
+ // Move to next file
+ if (file_index_ >= flevel_->num_files - 1 ||
+ KeyReachedUpperBound(file_smallest_key(file_index_ + 1)) ||
+ prefix_exhausted_) {
+ SetFileIterator(nullptr);
+ ClearRangeTombstoneIter();
+ break;
+ }
+ // may init a new *range_tombstone_iter
+ InitFileIterator(file_index_ + 1);
+ // We moved to a new SST file
+ // Seek range_tombstone_iter_ to reset its !Valid() default state.
+ // We do not need to call range_tombstone_iter_.Seek* in
+ // LevelIterator::Seek* since when the merging iterator calls
+ // LevelIterator::Seek*, it should also call Seek* into the corresponding
+ // range tombstone iterator.
+ if (file_iter_.iter() != nullptr) {
+ file_iter_.SeekToFirst();
+ if (range_tombstone_iter_) {
+ if (*range_tombstone_iter_) {
+ (*range_tombstone_iter_)->SeekToFirst();
+ }
+ TrySetDeleteRangeSentinel(file_largest_key(file_index_));
+ }
+ }
+ }
+ return seen_empty_file;
+}
+
+void LevelIterator::SkipEmptyFileBackward() {
+ // Pause at sentinel key
+ while (!to_return_sentinel_ &&
+ (file_iter_.iter() == nullptr ||
+ (!file_iter_.Valid() && file_iter_.status().ok()))) {
+ // Move to previous file
+ if (file_index_ == 0) {
+ // Already the first file
+ SetFileIterator(nullptr);
+ ClearRangeTombstoneIter();
+ return;
+ }
+ InitFileIterator(file_index_ - 1);
+ // We moved to a new SST file
+ // Seek range_tombstone_iter_ to reset its !Valid() default state.
+ if (file_iter_.iter() != nullptr) {
+ file_iter_.SeekToLast();
+ if (range_tombstone_iter_) {
+ if (*range_tombstone_iter_) {
+ (*range_tombstone_iter_)->SeekToLast();
+ }
+ TrySetDeleteRangeSentinel(file_smallest_key(file_index_));
+ if (to_return_sentinel_) {
+ break;
+ }
+ }
+ }
+ }
+}
+
+void LevelIterator::SetFileIterator(InternalIterator* iter) {
+ if (pinned_iters_mgr_ && iter) {
+ iter->SetPinnedItersMgr(pinned_iters_mgr_);
+ }
+
+ InternalIterator* old_iter = file_iter_.Set(iter);
+
+ // Update the read pattern for PrefetchBuffer.
+ if (is_next_read_sequential_) {
+ file_iter_.UpdateReadaheadState(old_iter);
+ }
+
+ if (pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled()) {
+ pinned_iters_mgr_->PinIterator(old_iter);
+ } else {
+ delete old_iter;
+ }
+}
+
+void LevelIterator::InitFileIterator(size_t new_file_index) {
+ if (new_file_index >= flevel_->num_files) {
+ file_index_ = new_file_index;
+ SetFileIterator(nullptr);
+ ClearRangeTombstoneIter();
+ return;
+ } else {
+    // If the previous file iterator returned an Incomplete status, we retry
+    // when the user seeks into the same file again, since this time we may
+    // land on a different data block that is cached in the block cache.
+ if (file_iter_.iter() != nullptr && !file_iter_.status().IsIncomplete() &&
+ new_file_index == file_index_) {
+ // file_iter_ is already constructed with this iterator, so
+ // no need to change anything
+ } else {
+ file_index_ = new_file_index;
+ InternalIterator* iter = NewFileIterator();
+ SetFileIterator(iter);
+ }
+ }
+}
+} // anonymous namespace
+
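+// Loads the table properties for `file_meta`. It first tries the table cache
+// without doing any I/O; if the table is not already cached, it falls back to
+// reading the properties block directly from the SST file.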
+Status Version::GetTableProperties(std::shared_ptr<const TableProperties>* tp,
+ const FileMetaData* file_meta,
+ const std::string* fname) const {
+ auto table_cache = cfd_->table_cache();
+ auto ioptions = cfd_->ioptions();
+ Status s = table_cache->GetTableProperties(
+ file_options_, cfd_->internal_comparator(), *file_meta, tp,
+ mutable_cf_options_.prefix_extractor, true /* no io */);
+ if (s.ok()) {
+ return s;
+ }
+
+  // We only ignore the `Incomplete` error type since it is by design that we
+  // disallow any I/O here when the table is not already in the table cache.
+ if (!s.IsIncomplete()) {
+ return s;
+ }
+
+  // 2. The table is not present in the table cache; read the table
+  // properties directly from the properties block in the file.
+ std::unique_ptr<FSRandomAccessFile> file;
+ std::string file_name;
+ if (fname != nullptr) {
+ file_name = *fname;
+ } else {
+ file_name = TableFileName(ioptions->cf_paths, file_meta->fd.GetNumber(),
+ file_meta->fd.GetPathId());
+ }
+ s = ioptions->fs->NewRandomAccessFile(file_name, file_options_, &file,
+ nullptr);
+ if (!s.ok()) {
+ return s;
+ }
+
+ // By setting the magic number to kNullTableMagicNumber, we can bypass
+ // the magic number check in the footer.
+ std::unique_ptr<RandomAccessFileReader> file_reader(
+ new RandomAccessFileReader(
+ std::move(file), file_name, nullptr /* env */, io_tracer_,
+ nullptr /* stats */, 0 /* hist_type */, nullptr /* file_read_hist */,
+ nullptr /* rate_limiter */, ioptions->listeners));
+ std::unique_ptr<TableProperties> props;
+ s = ReadTableProperties(
+ file_reader.get(), file_meta->fd.GetFileSize(),
+ Footer::kNullTableMagicNumber /* table's magic number */, *ioptions,
+ &props);
+ if (!s.ok()) {
+ return s;
+ }
+ *tp = std::move(props);
+ RecordTick(ioptions->stats, NUMBER_DIRECT_LOAD_TABLE_PROPERTIES);
+ return s;
+}
+
+Status Version::GetPropertiesOfAllTables(TablePropertiesCollection* props) {
+ Status s;
+ for (int level = 0; level < storage_info_.num_levels_; level++) {
+ s = GetPropertiesOfAllTables(props, level);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ return Status::OK();
+}
+
+Status Version::TablesRangeTombstoneSummary(int max_entries_to_print,
+ std::string* out_str) {
+ if (max_entries_to_print <= 0) {
+ return Status::OK();
+ }
+ int num_entries_left = max_entries_to_print;
+
+ std::stringstream ss;
+
+ for (int level = 0; level < storage_info_.num_levels_; level++) {
+ for (const auto& file_meta : storage_info_.files_[level]) {
+ auto fname =
+ TableFileName(cfd_->ioptions()->cf_paths, file_meta->fd.GetNumber(),
+ file_meta->fd.GetPathId());
+
+ ss << "=== file : " << fname << " ===\n";
+
+ TableCache* table_cache = cfd_->table_cache();
+ std::unique_ptr<FragmentedRangeTombstoneIterator> tombstone_iter;
+
+ Status s = table_cache->GetRangeTombstoneIterator(
+ ReadOptions(), cfd_->internal_comparator(), *file_meta,
+ &tombstone_iter);
+ if (!s.ok()) {
+ return s;
+ }
+ if (tombstone_iter) {
+ tombstone_iter->SeekToFirst();
+
+ // TODO: print timestamp
+ while (tombstone_iter->Valid() && num_entries_left > 0) {
+ ss << "start: " << tombstone_iter->start_key().ToString(true)
+ << " end: " << tombstone_iter->end_key().ToString(true)
+ << " seq: " << tombstone_iter->seq() << '\n';
+ tombstone_iter->Next();
+ num_entries_left--;
+ }
+ if (num_entries_left <= 0) {
+ break;
+ }
+ }
+ }
+ if (num_entries_left <= 0) {
+ break;
+ }
+ }
+ assert(num_entries_left >= 0);
+ if (num_entries_left <= 0) {
+ ss << "(results may not be complete)\n";
+ }
+
+ *out_str = ss.str();
+ return Status::OK();
+}
+
+Status Version::GetPropertiesOfAllTables(TablePropertiesCollection* props,
+ int level) {
+ for (const auto& file_meta : storage_info_.files_[level]) {
+ auto fname =
+ TableFileName(cfd_->ioptions()->cf_paths, file_meta->fd.GetNumber(),
+ file_meta->fd.GetPathId());
+ // 1. If the table is already present in table cache, load table
+ // properties from there.
+ std::shared_ptr<const TableProperties> table_properties;
+ Status s = GetTableProperties(&table_properties, file_meta, &fname);
+ if (s.ok()) {
+ props->insert({fname, table_properties});
+ } else {
+ return s;
+ }
+ }
+
+ return Status::OK();
+}
+
+Status Version::GetPropertiesOfTablesInRange(
+ const Range* range, std::size_t n, TablePropertiesCollection* props) const {
+ for (int level = 0; level < storage_info_.num_non_empty_levels(); level++) {
+ for (decltype(n) i = 0; i < n; i++) {
+ // Convert user_key into a corresponding internal key.
+ InternalKey k1(range[i].start, kMaxSequenceNumber, kValueTypeForSeek);
+ InternalKey k2(range[i].limit, kMaxSequenceNumber, kValueTypeForSeek);
+ std::vector<FileMetaData*> files;
+ storage_info_.GetOverlappingInputs(level, &k1, &k2, &files, -1, nullptr,
+ false);
+ for (const auto& file_meta : files) {
+ auto fname =
+ TableFileName(cfd_->ioptions()->cf_paths, file_meta->fd.GetNumber(),
+ file_meta->fd.GetPathId());
+ if (props->count(fname) == 0) {
+ // 1. If the table is already present in table cache, load table
+ // properties from there.
+ std::shared_ptr<const TableProperties> table_properties;
+ Status s = GetTableProperties(&table_properties, file_meta, &fname);
+ if (s.ok()) {
+ props->insert({fname, table_properties});
+ } else {
+ return s;
+ }
+ }
+ }
+ }
+ }
+
+ return Status::OK();
+}
+
+Status Version::GetAggregatedTableProperties(
+ std::shared_ptr<const TableProperties>* tp, int level) {
+ TablePropertiesCollection props;
+ Status s;
+ if (level < 0) {
+ s = GetPropertiesOfAllTables(&props);
+ } else {
+ s = GetPropertiesOfAllTables(&props, level);
+ }
+ if (!s.ok()) {
+ return s;
+ }
+
+ auto* new_tp = new TableProperties();
+ for (const auto& item : props) {
+ new_tp->Add(*item.second);
+ }
+ tp->reset(new_tp);
+ return Status::OK();
+}
+
+size_t Version::GetMemoryUsageByTableReaders() {
+ size_t total_usage = 0;
+ for (auto& file_level : storage_info_.level_files_brief_) {
+ for (size_t i = 0; i < file_level.num_files; i++) {
+ total_usage += cfd_->table_cache()->GetMemoryUsageByTableReader(
+ file_options_, cfd_->internal_comparator(),
+ *file_level.files[i].file_metadata,
+ mutable_cf_options_.prefix_extractor);
+ }
+ }
+ return total_usage;
+}
+
+void Version::GetColumnFamilyMetaData(ColumnFamilyMetaData* cf_meta) {
+ assert(cf_meta);
+ assert(cfd_);
+
+ cf_meta->name = cfd_->GetName();
+ cf_meta->size = 0;
+ cf_meta->file_count = 0;
+ cf_meta->levels.clear();
+
+ cf_meta->blob_file_size = 0;
+ cf_meta->blob_file_count = 0;
+ cf_meta->blob_files.clear();
+
+ auto* ioptions = cfd_->ioptions();
+ auto* vstorage = storage_info();
+
+ for (int level = 0; level < cfd_->NumberLevels(); level++) {
+ uint64_t level_size = 0;
+ cf_meta->file_count += vstorage->LevelFiles(level).size();
+ std::vector<SstFileMetaData> files;
+ for (const auto& file : vstorage->LevelFiles(level)) {
+ uint32_t path_id = file->fd.GetPathId();
+ std::string file_path;
+ if (path_id < ioptions->cf_paths.size()) {
+ file_path = ioptions->cf_paths[path_id].path;
+ } else {
+ assert(!ioptions->cf_paths.empty());
+ file_path = ioptions->cf_paths.back().path;
+ }
+ const uint64_t file_number = file->fd.GetNumber();
+ files.emplace_back(
+ MakeTableFileName("", file_number), file_number, file_path,
+ file->fd.GetFileSize(), file->fd.smallest_seqno,
+ file->fd.largest_seqno, file->smallest.user_key().ToString(),
+ file->largest.user_key().ToString(),
+ file->stats.num_reads_sampled.load(std::memory_order_relaxed),
+ file->being_compacted, file->temperature,
+ file->oldest_blob_file_number, file->TryGetOldestAncesterTime(),
+ file->TryGetFileCreationTime(), file->file_checksum,
+ file->file_checksum_func_name);
+ files.back().num_entries = file->num_entries;
+ files.back().num_deletions = file->num_deletions;
+ level_size += file->fd.GetFileSize();
+ }
+ cf_meta->levels.emplace_back(level, level_size, std::move(files));
+ cf_meta->size += level_size;
+ }
+ for (const auto& meta : vstorage->GetBlobFiles()) {
+ assert(meta);
+
+ cf_meta->blob_files.emplace_back(
+ meta->GetBlobFileNumber(), BlobFileName("", meta->GetBlobFileNumber()),
+ ioptions->cf_paths.front().path, meta->GetBlobFileSize(),
+ meta->GetTotalBlobCount(), meta->GetTotalBlobBytes(),
+ meta->GetGarbageBlobCount(), meta->GetGarbageBlobBytes(),
+ meta->GetChecksumMethod(), meta->GetChecksumValue());
+ ++cf_meta->blob_file_count;
+ cf_meta->blob_file_size += meta->GetBlobFileSize();
+ }
+}
+
+uint64_t Version::GetSstFilesSize() {
+ uint64_t sst_files_size = 0;
+ for (int level = 0; level < storage_info_.num_levels_; level++) {
+ for (const auto& file_meta : storage_info_.LevelFiles(level)) {
+ sst_files_size += file_meta->fd.GetFileSize();
+ }
+ }
+ return sst_files_size;
+}
+
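+// Sets *creation_time to the oldest file creation time across all non-empty
+// levels, or to 0 if the creation time of any file is unknown.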
+void Version::GetCreationTimeOfOldestFile(uint64_t* creation_time) {
+ uint64_t oldest_time = std::numeric_limits<uint64_t>::max();
+ for (int level = 0; level < storage_info_.num_non_empty_levels_; level++) {
+ for (FileMetaData* meta : storage_info_.LevelFiles(level)) {
+ assert(meta->fd.table_reader != nullptr);
+ uint64_t file_creation_time = meta->TryGetFileCreationTime();
+ if (file_creation_time == kUnknownFileCreationTime) {
+ *creation_time = 0;
+ return;
+ }
+ if (file_creation_time < oldest_time) {
+ oldest_time = file_creation_time;
+ }
+ }
+ }
+ *creation_time = oldest_time;
+}
+
+InternalIterator* Version::TEST_GetLevelIterator(
+ const ReadOptions& read_options, MergeIteratorBuilder* merge_iter_builder,
+ int level, bool allow_unprepared_value) {
+ auto* arena = merge_iter_builder->GetArena();
+ auto* mem = arena->AllocateAligned(sizeof(LevelIterator));
+ TruncatedRangeDelIterator*** tombstone_iter_ptr = nullptr;
+ auto level_iter = new (mem) LevelIterator(
+ cfd_->table_cache(), read_options, file_options_,
+ cfd_->internal_comparator(), &storage_info_.LevelFilesBrief(level),
+ mutable_cf_options_.prefix_extractor, should_sample_file_read(),
+ cfd_->internal_stats()->GetFileReadHist(level),
+ TableReaderCaller::kUserIterator, IsFilterSkipped(level), level,
+ nullptr /* range_del_agg */, nullptr /* compaction_boundaries */,
+ allow_unprepared_value, &tombstone_iter_ptr);
+ if (read_options.ignore_range_deletions) {
+ merge_iter_builder->AddIterator(level_iter);
+ } else {
+ merge_iter_builder->AddPointAndTombstoneIterator(
+ level_iter, nullptr /* tombstone_iter */, tombstone_iter_ptr);
+ }
+ return level_iter;
+}
+
+uint64_t VersionStorageInfo::GetEstimatedActiveKeys() const {
+  // The estimate will be inaccurate when:
+  // (1) there exist merge keys
+  // (2) keys are directly overwritten
+  // (3) there are deletions of non-existing keys
+  // (4) the number of samples is low
+ if (current_num_samples_ == 0) {
+ return 0;
+ }
+
+ if (current_num_non_deletions_ <= current_num_deletions_) {
+ return 0;
+ }
+
+ uint64_t est = current_num_non_deletions_ - current_num_deletions_;
+
+ uint64_t file_count = 0;
+ for (int level = 0; level < num_levels_; ++level) {
+ file_count += files_[level].size();
+ }
+
+ if (current_num_samples_ < file_count) {
+ // casting to avoid overflowing
+ return static_cast<uint64_t>(
+ (est * static_cast<double>(file_count) / current_num_samples_));
+ } else {
+ return est;
+ }
+}
+
+double VersionStorageInfo::GetEstimatedCompressionRatioAtLevel(
+ int level) const {
+ assert(level < num_levels_);
+ uint64_t sum_file_size_bytes = 0;
+ uint64_t sum_data_size_bytes = 0;
+ for (auto* file_meta : files_[level]) {
+ sum_file_size_bytes += file_meta->fd.GetFileSize();
+ sum_data_size_bytes += file_meta->raw_key_size + file_meta->raw_value_size;
+ }
+ if (sum_file_size_bytes == 0) {
+ return -1.0;
+ }
+ return static_cast<double>(sum_data_size_bytes) / sum_file_size_bytes;
+}
+
+void Version::AddIterators(const ReadOptions& read_options,
+ const FileOptions& soptions,
+ MergeIteratorBuilder* merge_iter_builder,
+ bool allow_unprepared_value) {
+ assert(storage_info_.finalized_);
+
+ for (int level = 0; level < storage_info_.num_non_empty_levels(); level++) {
+ AddIteratorsForLevel(read_options, soptions, merge_iter_builder, level,
+ allow_unprepared_value);
+ }
+}
+
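+// Adds the iterators needed to read `level` to the merging iterator being
+// built: one table iterator per file for L0 (whose files may overlap), or a
+// single LevelIterator that lazily opens the non-overlapping files of a
+// higher level. Range tombstone iterators are wired up alongside the point
+// iterators unless read_options.ignore_range_deletions is set.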
+void Version::AddIteratorsForLevel(const ReadOptions& read_options,
+ const FileOptions& soptions,
+ MergeIteratorBuilder* merge_iter_builder,
+ int level, bool allow_unprepared_value) {
+ assert(storage_info_.finalized_);
+ if (level >= storage_info_.num_non_empty_levels()) {
+ // This is an empty level
+ return;
+ } else if (storage_info_.LevelFilesBrief(level).num_files == 0) {
+ // No files in this level
+ return;
+ }
+
+ bool should_sample = should_sample_file_read();
+
+ auto* arena = merge_iter_builder->GetArena();
+ if (level == 0) {
+ // Merge all level zero files together since they may overlap
+ TruncatedRangeDelIterator* tombstone_iter = nullptr;
+ for (size_t i = 0; i < storage_info_.LevelFilesBrief(0).num_files; i++) {
+ const auto& file = storage_info_.LevelFilesBrief(0).files[i];
+ auto table_iter = cfd_->table_cache()->NewIterator(
+ read_options, soptions, cfd_->internal_comparator(),
+ *file.file_metadata, /*range_del_agg=*/nullptr,
+ mutable_cf_options_.prefix_extractor, nullptr,
+ cfd_->internal_stats()->GetFileReadHist(0),
+ TableReaderCaller::kUserIterator, arena,
+ /*skip_filters=*/false, /*level=*/0, max_file_size_for_l0_meta_pin_,
+ /*smallest_compaction_key=*/nullptr,
+ /*largest_compaction_key=*/nullptr, allow_unprepared_value,
+ &tombstone_iter);
+ if (read_options.ignore_range_deletions) {
+ merge_iter_builder->AddIterator(table_iter);
+ } else {
+ merge_iter_builder->AddPointAndTombstoneIterator(table_iter,
+ tombstone_iter);
+ }
+ }
+ if (should_sample) {
+      // Count one sample for every L0 file. This is done per iterator
+      // creation rather than per Seek(), while files in other levels are
+      // recorded per seek. If users execute one range query per iterator,
+      // there may be some discrepancy here.
+ for (FileMetaData* meta : storage_info_.LevelFiles(0)) {
+ sample_file_read_inc(meta);
+ }
+ }
+ } else if (storage_info_.LevelFilesBrief(level).num_files > 0) {
+ // For levels > 0, we can use a concatenating iterator that sequentially
+ // walks through the non-overlapping files in the level, opening them
+ // lazily.
+ auto* mem = arena->AllocateAligned(sizeof(LevelIterator));
+ TruncatedRangeDelIterator*** tombstone_iter_ptr = nullptr;
+ auto level_iter = new (mem) LevelIterator(
+ cfd_->table_cache(), read_options, soptions,
+ cfd_->internal_comparator(), &storage_info_.LevelFilesBrief(level),
+ mutable_cf_options_.prefix_extractor, should_sample_file_read(),
+ cfd_->internal_stats()->GetFileReadHist(level),
+ TableReaderCaller::kUserIterator, IsFilterSkipped(level), level,
+ /*range_del_agg=*/nullptr, /*compaction_boundaries=*/nullptr,
+ allow_unprepared_value, &tombstone_iter_ptr);
+ if (read_options.ignore_range_deletions) {
+ merge_iter_builder->AddIterator(level_iter);
+ } else {
+ merge_iter_builder->AddPointAndTombstoneIterator(
+ level_iter, nullptr /* tombstone_iter */, tombstone_iter_ptr);
+ }
+ }
+}
+
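+// Sets *overlap to true if any point key or range tombstone in `level`
+// overlaps the user key range [smallest_user_key, largest_user_key].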
+Status Version::OverlapWithLevelIterator(const ReadOptions& read_options,
+ const FileOptions& file_options,
+ const Slice& smallest_user_key,
+ const Slice& largest_user_key,
+ int level, bool* overlap) {
+ assert(storage_info_.finalized_);
+
+ auto icmp = cfd_->internal_comparator();
+ auto ucmp = icmp.user_comparator();
+
+ Arena arena;
+ Status status;
+ ReadRangeDelAggregator range_del_agg(&icmp,
+ kMaxSequenceNumber /* upper_bound */);
+
+ *overlap = false;
+
+ if (level == 0) {
+ for (size_t i = 0; i < storage_info_.LevelFilesBrief(0).num_files; i++) {
+ const auto file = &storage_info_.LevelFilesBrief(0).files[i];
+ if (AfterFile(ucmp, &smallest_user_key, file) ||
+ BeforeFile(ucmp, &largest_user_key, file)) {
+ continue;
+ }
+ ScopedArenaIterator iter(cfd_->table_cache()->NewIterator(
+ read_options, file_options, cfd_->internal_comparator(),
+ *file->file_metadata, &range_del_agg,
+ mutable_cf_options_.prefix_extractor, nullptr,
+ cfd_->internal_stats()->GetFileReadHist(0),
+ TableReaderCaller::kUserIterator, &arena,
+ /*skip_filters=*/false, /*level=*/0, max_file_size_for_l0_meta_pin_,
+ /*smallest_compaction_key=*/nullptr,
+ /*largest_compaction_key=*/nullptr,
+ /*allow_unprepared_value=*/false));
+ status = OverlapWithIterator(ucmp, smallest_user_key, largest_user_key,
+ iter.get(), overlap);
+ if (!status.ok() || *overlap) {
+ break;
+ }
+ }
+ } else if (storage_info_.LevelFilesBrief(level).num_files > 0) {
+ auto mem = arena.AllocateAligned(sizeof(LevelIterator));
+ ScopedArenaIterator iter(new (mem) LevelIterator(
+ cfd_->table_cache(), read_options, file_options,
+ cfd_->internal_comparator(), &storage_info_.LevelFilesBrief(level),
+ mutable_cf_options_.prefix_extractor, should_sample_file_read(),
+ cfd_->internal_stats()->GetFileReadHist(level),
+ TableReaderCaller::kUserIterator, IsFilterSkipped(level), level,
+ &range_del_agg));
+ status = OverlapWithIterator(ucmp, smallest_user_key, largest_user_key,
+ iter.get(), overlap);
+ }
+
+ if (status.ok() && *overlap == false &&
+ range_del_agg.IsRangeOverlapped(smallest_user_key, largest_user_key)) {
+ *overlap = true;
+ }
+ return status;
+}
+
+VersionStorageInfo::VersionStorageInfo(
+ const InternalKeyComparator* internal_comparator,
+ const Comparator* user_comparator, int levels,
+ CompactionStyle compaction_style, VersionStorageInfo* ref_vstorage,
+ bool _force_consistency_checks)
+ : internal_comparator_(internal_comparator),
+ user_comparator_(user_comparator),
+ // cfd is nullptr if Version is dummy
+ num_levels_(levels),
+ num_non_empty_levels_(0),
+ file_indexer_(user_comparator),
+ compaction_style_(compaction_style),
+ files_(new std::vector<FileMetaData*>[num_levels_]),
+ base_level_(num_levels_ == 1 ? -1 : 1),
+ level_multiplier_(0.0),
+ files_by_compaction_pri_(num_levels_),
+ level0_non_overlapping_(false),
+ next_file_to_compact_by_size_(num_levels_),
+ compaction_score_(num_levels_),
+ compaction_level_(num_levels_),
+ l0_delay_trigger_count_(0),
+ compact_cursor_(num_levels_),
+ accumulated_file_size_(0),
+ accumulated_raw_key_size_(0),
+ accumulated_raw_value_size_(0),
+ accumulated_num_non_deletions_(0),
+ accumulated_num_deletions_(0),
+ current_num_non_deletions_(0),
+ current_num_deletions_(0),
+ current_num_samples_(0),
+ estimated_compaction_needed_bytes_(0),
+ finalized_(false),
+ force_consistency_checks_(_force_consistency_checks) {
+ if (ref_vstorage != nullptr) {
+ accumulated_file_size_ = ref_vstorage->accumulated_file_size_;
+ accumulated_raw_key_size_ = ref_vstorage->accumulated_raw_key_size_;
+ accumulated_raw_value_size_ = ref_vstorage->accumulated_raw_value_size_;
+ accumulated_num_non_deletions_ =
+ ref_vstorage->accumulated_num_non_deletions_;
+ accumulated_num_deletions_ = ref_vstorage->accumulated_num_deletions_;
+ current_num_non_deletions_ = ref_vstorage->current_num_non_deletions_;
+ current_num_deletions_ = ref_vstorage->current_num_deletions_;
+ current_num_samples_ = ref_vstorage->current_num_samples_;
+ oldest_snapshot_seqnum_ = ref_vstorage->oldest_snapshot_seqnum_;
+ compact_cursor_ = ref_vstorage->compact_cursor_;
+ compact_cursor_.resize(num_levels_);
+ }
+}
+
+Version::Version(ColumnFamilyData* column_family_data, VersionSet* vset,
+ const FileOptions& file_opt,
+ const MutableCFOptions mutable_cf_options,
+ const std::shared_ptr<IOTracer>& io_tracer,
+ uint64_t version_number)
+ : env_(vset->env_),
+ clock_(vset->clock_),
+ cfd_(column_family_data),
+ info_log_((cfd_ == nullptr) ? nullptr : cfd_->ioptions()->logger),
+ db_statistics_((cfd_ == nullptr) ? nullptr : cfd_->ioptions()->stats),
+ table_cache_((cfd_ == nullptr) ? nullptr : cfd_->table_cache()),
+ blob_source_(cfd_ ? cfd_->blob_source() : nullptr),
+ merge_operator_(
+ (cfd_ == nullptr) ? nullptr : cfd_->ioptions()->merge_operator.get()),
+ storage_info_(
+ (cfd_ == nullptr) ? nullptr : &cfd_->internal_comparator(),
+ (cfd_ == nullptr) ? nullptr : cfd_->user_comparator(),
+ cfd_ == nullptr ? 0 : cfd_->NumberLevels(),
+ cfd_ == nullptr ? kCompactionStyleLevel
+ : cfd_->ioptions()->compaction_style,
+ (cfd_ == nullptr || cfd_->current() == nullptr)
+ ? nullptr
+ : cfd_->current()->storage_info(),
+ cfd_ == nullptr ? false : cfd_->ioptions()->force_consistency_checks),
+ vset_(vset),
+ next_(this),
+ prev_(this),
+ refs_(0),
+ file_options_(file_opt),
+ mutable_cf_options_(mutable_cf_options),
+ max_file_size_for_l0_meta_pin_(
+ MaxFileSizeForL0MetaPin(mutable_cf_options_)),
+ version_number_(version_number),
+ io_tracer_(io_tracer) {}
+
+Status Version::GetBlob(const ReadOptions& read_options, const Slice& user_key,
+ const Slice& blob_index_slice,
+ FilePrefetchBuffer* prefetch_buffer,
+ PinnableSlice* value, uint64_t* bytes_read) const {
+ BlobIndex blob_index;
+
+ {
+ Status s = blob_index.DecodeFrom(blob_index_slice);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ return GetBlob(read_options, user_key, blob_index, prefetch_buffer, value,
+ bytes_read);
+}
+
+Status Version::GetBlob(const ReadOptions& read_options, const Slice& user_key,
+ const BlobIndex& blob_index,
+ FilePrefetchBuffer* prefetch_buffer,
+ PinnableSlice* value, uint64_t* bytes_read) const {
+ assert(value);
+
+ if (blob_index.HasTTL() || blob_index.IsInlined()) {
+ return Status::Corruption("Unexpected TTL/inlined blob index");
+ }
+
+ const uint64_t blob_file_number = blob_index.file_number();
+
+ auto blob_file_meta = storage_info_.GetBlobFileMetaData(blob_file_number);
+ if (!blob_file_meta) {
+ return Status::Corruption("Invalid blob file number");
+ }
+
+ assert(blob_source_);
+ value->Reset();
+ const Status s = blob_source_->GetBlob(
+ read_options, user_key, blob_file_number, blob_index.offset(),
+ blob_file_meta->GetBlobFileSize(), blob_index.size(),
+ blob_index.compression(), prefetch_buffer, value, bytes_read);
+
+ return s;
+}
+
+void Version::MultiGetBlob(
+ const ReadOptions& read_options, MultiGetRange& range,
+ std::unordered_map<uint64_t, BlobReadContexts>& blob_ctxs) {
+ assert(!blob_ctxs.empty());
+
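+  // Convert the per-file contexts in blob_ctxs into batched read requests,
+  // one BlobFileReadRequests entry per blob file, so that everything can be
+  // handed to BlobSource::MultiGetBlob in a single call below. Per-key
+  // statuses and values are reported back through the KeyContext pointers
+  // captured in each request.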
+ autovector<BlobFileReadRequests> blob_reqs;
+
+ for (auto& ctx : blob_ctxs) {
+ const auto file_number = ctx.first;
+ const auto blob_file_meta = storage_info_.GetBlobFileMetaData(file_number);
+
+ autovector<BlobReadRequest> blob_reqs_in_file;
+ BlobReadContexts& blobs_in_file = ctx.second;
+ for (const auto& blob : blobs_in_file) {
+ const BlobIndex& blob_index = blob.first;
+ const KeyContext& key_context = blob.second;
+
+ if (!blob_file_meta) {
+ *key_context.s = Status::Corruption("Invalid blob file number");
+ continue;
+ }
+
+ if (blob_index.HasTTL() || blob_index.IsInlined()) {
+ *key_context.s =
+ Status::Corruption("Unexpected TTL/inlined blob index");
+ continue;
+ }
+
+ key_context.value->Reset();
+ blob_reqs_in_file.emplace_back(
+ key_context.ukey_with_ts, blob_index.offset(), blob_index.size(),
+ blob_index.compression(), key_context.value, key_context.s);
+ }
+ if (blob_reqs_in_file.size() > 0) {
+ const auto file_size = blob_file_meta->GetBlobFileSize();
+ blob_reqs.emplace_back(file_number, file_size, blob_reqs_in_file);
+ }
+ }
+
+ if (blob_reqs.size() > 0) {
+ blob_source_->MultiGetBlob(read_options, blob_reqs, /*bytes_read=*/nullptr);
+ }
+
+ for (auto& ctx : blob_ctxs) {
+ BlobReadContexts& blobs_in_file = ctx.second;
+ for (const auto& blob : blobs_in_file) {
+ const KeyContext& key_context = blob.second;
+ if (key_context.s->ok()) {
+ range.AddValueSize(key_context.value->size());
+ if (range.GetValueSize() > read_options.value_size_soft_limit) {
+ *key_context.s = Status::Aborted();
+ }
+ } else if (key_context.s->IsIncomplete()) {
+ // read_options.read_tier == kBlockCacheTier
+ // Cannot read blob(s): no disk I/O allowed
+ assert(key_context.get_context);
+ auto& get_context = *(key_context.get_context);
+ get_context.MarkKeyMayExist();
+ }
+ }
+ }
+}
+
+void Version::Get(const ReadOptions& read_options, const LookupKey& k,
+ PinnableSlice* value, PinnableWideColumns* columns,
+ std::string* timestamp, Status* status,
+ MergeContext* merge_context,
+ SequenceNumber* max_covering_tombstone_seq,
+ PinnedIteratorsManager* pinned_iters_mgr, bool* value_found,
+ bool* key_exists, SequenceNumber* seq, ReadCallback* callback,
+ bool* is_blob, bool do_merge) {
+ Slice ikey = k.internal_key();
+ Slice user_key = k.user_key();
+
+ assert(status->ok() || status->IsMergeInProgress());
+
+ if (key_exists != nullptr) {
+    // Will be set to false below if the key is not found
+ *key_exists = true;
+ }
+
+ uint64_t tracing_get_id = BlockCacheTraceHelper::kReservedGetId;
+ if (vset_ && vset_->block_cache_tracer_ &&
+ vset_->block_cache_tracer_->is_tracing_enabled()) {
+ tracing_get_id = vset_->block_cache_tracer_->NextGetId();
+ }
+
+ // Note: the old StackableDB-based BlobDB passes in
+ // GetImplOptions::is_blob_index; for the integrated BlobDB implementation, we
+ // need to provide it here.
+ bool is_blob_index = false;
+ bool* const is_blob_to_use = is_blob ? is_blob : &is_blob_index;
+ BlobFetcher blob_fetcher(this, read_options);
+
+ assert(pinned_iters_mgr);
+ GetContext get_context(
+ user_comparator(), merge_operator_, info_log_, db_statistics_,
+ status->ok() ? GetContext::kNotFound : GetContext::kMerge, user_key,
+ do_merge ? value : nullptr, do_merge ? columns : nullptr,
+ do_merge ? timestamp : nullptr, value_found, merge_context, do_merge,
+ max_covering_tombstone_seq, clock_, seq,
+ merge_operator_ ? pinned_iters_mgr : nullptr, callback, is_blob_to_use,
+ tracing_get_id, &blob_fetcher);
+
+ // Pin blocks that we read to hold merge operands
+ if (merge_operator_) {
+ pinned_iters_mgr->StartPinning();
+ }
+
+ FilePicker fp(user_key, ikey, &storage_info_.level_files_brief_,
+ storage_info_.num_non_empty_levels_,
+ &storage_info_.file_indexer_, user_comparator(),
+ internal_comparator());
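+  // FilePicker hands back candidate files in the order newer data shadows
+  // older data: the overlapping L0 files from newest to oldest first, then at
+  // most one file per deeper level, so the first definitive result found in
+  // the loop below comes from the most recent version of the key.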
+ FdWithKeyRange* f = fp.GetNextFile();
+
+ while (f != nullptr) {
+ if (*max_covering_tombstone_seq > 0) {
+ // The remaining files we look at will only contain covered keys, so we
+ // stop here.
+ break;
+ }
+ if (get_context.sample()) {
+ sample_file_read_inc(f->file_metadata);
+ }
+
+ bool timer_enabled =
+ GetPerfLevel() >= PerfLevel::kEnableTimeExceptForMutex &&
+ get_perf_context()->per_level_perf_context_enabled;
+ StopWatchNano timer(clock_, timer_enabled /* auto_start */);
+ *status = table_cache_->Get(
+ read_options, *internal_comparator(), *f->file_metadata, ikey,
+ &get_context, mutable_cf_options_.prefix_extractor,
+ cfd_->internal_stats()->GetFileReadHist(fp.GetHitFileLevel()),
+ IsFilterSkipped(static_cast<int>(fp.GetHitFileLevel()),
+ fp.IsHitFileLastInLevel()),
+ fp.GetHitFileLevel(), max_file_size_for_l0_meta_pin_);
+ // TODO: examine the behavior for corrupted key
+ if (timer_enabled) {
+ PERF_COUNTER_BY_LEVEL_ADD(get_from_table_nanos, timer.ElapsedNanos(),
+ fp.GetHitFileLevel());
+ }
+ if (!status->ok()) {
+ if (db_statistics_ != nullptr) {
+ get_context.ReportCounters();
+ }
+ return;
+ }
+
+ // report the counters before returning
+ if (get_context.State() != GetContext::kNotFound &&
+ get_context.State() != GetContext::kMerge &&
+ db_statistics_ != nullptr) {
+ get_context.ReportCounters();
+ }
+ switch (get_context.State()) {
+ case GetContext::kNotFound:
+ // Keep searching in other files
+ break;
+ case GetContext::kMerge:
+ // TODO: update per-level perfcontext user_key_return_count for kMerge
+ break;
+ case GetContext::kFound:
+ if (fp.GetHitFileLevel() == 0) {
+ RecordTick(db_statistics_, GET_HIT_L0);
+ } else if (fp.GetHitFileLevel() == 1) {
+ RecordTick(db_statistics_, GET_HIT_L1);
+ } else if (fp.GetHitFileLevel() >= 2) {
+ RecordTick(db_statistics_, GET_HIT_L2_AND_UP);
+ }
+
+ PERF_COUNTER_BY_LEVEL_ADD(user_key_return_count, 1,
+ fp.GetHitFileLevel());
+
+ if (is_blob_index) {
+ if (do_merge && value) {
+ TEST_SYNC_POINT_CALLBACK("Version::Get::TamperWithBlobIndex",
+ value);
+
+ constexpr FilePrefetchBuffer* prefetch_buffer = nullptr;
+ constexpr uint64_t* bytes_read = nullptr;
+
+ *status = GetBlob(read_options, user_key, *value, prefetch_buffer,
+ value, bytes_read);
+ if (!status->ok()) {
+ if (status->IsIncomplete()) {
+ get_context.MarkKeyMayExist();
+ }
+ return;
+ }
+ }
+ }
+
+ return;
+ case GetContext::kDeleted:
+ // Use empty error message for speed
+ *status = Status::NotFound();
+ return;
+ case GetContext::kCorrupt:
+ *status = Status::Corruption("corrupted key for ", user_key);
+ return;
+ case GetContext::kUnexpectedBlobIndex:
+ ROCKS_LOG_ERROR(info_log_, "Encounter unexpected blob index.");
+ *status = Status::NotSupported(
+ "Encounter unexpected blob index. Please open DB with "
+ "ROCKSDB_NAMESPACE::blob_db::BlobDB instead.");
+ return;
+ }
+ f = fp.GetNextFile();
+ }
+ if (db_statistics_ != nullptr) {
+ get_context.ReportCounters();
+ }
+ if (GetContext::kMerge == get_context.State()) {
+ if (!do_merge) {
+ *status = Status::OK();
+ return;
+ }
+ if (!merge_operator_) {
+ *status = Status::InvalidArgument(
+ "merge_operator is not properly initialized.");
+ return;
+ }
+    // The merge operands are in the saver and we hit the beginning of the
+    // key's history; do a final merge of nullptr and the operands.
+ if (value || columns) {
+ std::string result;
+ *status = MergeHelper::TimedFullMerge(
+ merge_operator_, user_key, nullptr, merge_context->GetOperands(),
+ &result, info_log_, db_statistics_, clock_,
+ /* result_operand */ nullptr, /* update_num_ops_stats */ true);
+ if (status->ok()) {
+ if (LIKELY(value != nullptr)) {
+ *(value->GetSelf()) = std::move(result);
+ value->PinSelf();
+ } else {
+ assert(columns != nullptr);
+ columns->SetPlainValue(result);
+ }
+ }
+ }
+ } else {
+ if (key_exists != nullptr) {
+ *key_exists = false;
+ }
+ *status = Status::NotFound(); // Use an empty error message for speed
+ }
+}
+
+void Version::MultiGet(const ReadOptions& read_options, MultiGetRange* range,
+ ReadCallback* callback) {
+ PinnedIteratorsManager pinned_iters_mgr;
+
+ // Pin blocks that we read to hold merge operands
+ if (merge_operator_) {
+ pinned_iters_mgr.StartPinning();
+ }
+ uint64_t tracing_mget_id = BlockCacheTraceHelper::kReservedGetId;
+
+ if (vset_ && vset_->block_cache_tracer_ &&
+ vset_->block_cache_tracer_->is_tracing_enabled()) {
+ tracing_mget_id = vset_->block_cache_tracer_->NextGetId();
+ }
+ // Even though we know the batch size won't be > MAX_BATCH_SIZE,
+ // use autovector in order to avoid unnecessary construction of GetContext
+ // objects, which is expensive
+ autovector<GetContext, 16> get_ctx;
+ BlobFetcher blob_fetcher(this, read_options);
+ for (auto iter = range->begin(); iter != range->end(); ++iter) {
+ assert(iter->s->ok() || iter->s->IsMergeInProgress());
+ get_ctx.emplace_back(
+ user_comparator(), merge_operator_, info_log_, db_statistics_,
+ iter->s->ok() ? GetContext::kNotFound : GetContext::kMerge,
+ iter->ukey_with_ts, iter->value, /*columns=*/nullptr, iter->timestamp,
+ nullptr, &(iter->merge_context), true,
+ &iter->max_covering_tombstone_seq, clock_, nullptr,
+ merge_operator_ ? &pinned_iters_mgr : nullptr, callback,
+ &iter->is_blob_index, tracing_mget_id, &blob_fetcher);
+ // MergeInProgress status, if set, has been transferred to the get_context
+ // state, so we set status to ok here. From now on, the iter status will
+ // be used for IO errors, and get_context state will be used for any
+ // key level errors
+ *(iter->s) = Status::OK();
+ }
+ int get_ctx_index = 0;
+ for (auto iter = range->begin(); iter != range->end();
+ ++iter, get_ctx_index++) {
+ iter->get_context = &(get_ctx[get_ctx_index]);
+ }
+
+ Status s;
+ // blob_file => [[blob_idx, it], ...]
+ std::unordered_map<uint64_t, BlobReadContexts> blob_ctxs;
+ MultiGetRange keys_with_blobs_range(*range, range->begin(), range->end());
+#if USE_COROUTINES
+ if (read_options.async_io && read_options.optimize_multiget_for_io &&
+ using_coroutines()) {
+ s = MultiGetAsync(read_options, range, &blob_ctxs);
+ } else
+#endif // USE_COROUTINES
+ {
+ MultiGetRange file_picker_range(*range, range->begin(), range->end());
+ FilePickerMultiGet fp(&file_picker_range, &storage_info_.level_files_brief_,
+ storage_info_.num_non_empty_levels_,
+ &storage_info_.file_indexer_, user_comparator(),
+ internal_comparator());
+ FdWithKeyRange* f = fp.GetNextFileInLevel();
+ uint64_t num_index_read = 0;
+ uint64_t num_filter_read = 0;
+ uint64_t num_sst_read = 0;
+ uint64_t num_level_read = 0;
+
+ int prev_level = -1;
+
+ while (!fp.IsSearchEnded()) {
+      // This will be set to true later if we actually look up in an L0 file.
+      // For per-level stats purposes, each L0 file is treated as a separate
+      // level.
+ bool dump_stats_for_l0_file = false;
+
+      // Avoid using the coroutine version if we're looking in an L0 file, since
+ // L0 files won't be parallelized anyway. The regular synchronous version
+ // is faster.
+ if (!read_options.async_io || !using_coroutines() ||
+ fp.GetHitFileLevel() == 0 || !fp.RemainingOverlapInLevel()) {
+ if (f) {
+ bool skip_filters =
+ IsFilterSkipped(static_cast<int>(fp.GetHitFileLevel()),
+ fp.IsHitFileLastInLevel());
+ // Call MultiGetFromSST for looking up a single file
+ s = MultiGetFromSST(read_options, fp.CurrentFileRange(),
+ fp.GetHitFileLevel(), skip_filters,
+ /*skip_range_deletions=*/false, f, blob_ctxs,
+ /*table_handle=*/nullptr, num_filter_read,
+ num_index_read, num_sst_read);
+ if (fp.GetHitFileLevel() == 0) {
+ dump_stats_for_l0_file = true;
+ }
+ }
+ if (s.ok()) {
+ f = fp.GetNextFileInLevel();
+ }
+#if USE_COROUTINES
+ } else {
+ std::vector<folly::coro::Task<Status>> mget_tasks;
+ while (f != nullptr) {
+ MultiGetRange file_range = fp.CurrentFileRange();
+ Cache::Handle* table_handle = nullptr;
+ bool skip_filters =
+ IsFilterSkipped(static_cast<int>(fp.GetHitFileLevel()),
+ fp.IsHitFileLastInLevel());
+ bool skip_range_deletions = false;
+ if (!skip_filters) {
+ Status status = table_cache_->MultiGetFilter(
+ read_options, *internal_comparator(), *f->file_metadata,
+ mutable_cf_options_.prefix_extractor,
+ cfd_->internal_stats()->GetFileReadHist(fp.GetHitFileLevel()),
+ fp.GetHitFileLevel(), &file_range, &table_handle);
+ skip_range_deletions = true;
+ if (status.ok()) {
+ skip_filters = true;
+ } else if (!status.IsNotSupported()) {
+ s = status;
+ }
+ }
+
+ if (!s.ok()) {
+ break;
+ }
+
+ if (!file_range.empty()) {
+ mget_tasks.emplace_back(MultiGetFromSSTCoroutine(
+ read_options, file_range, fp.GetHitFileLevel(), skip_filters,
+ skip_range_deletions, f, blob_ctxs, table_handle,
+ num_filter_read, num_index_read, num_sst_read));
+ }
+ if (fp.KeyMaySpanNextFile()) {
+ break;
+ }
+ f = fp.GetNextFileInLevel();
+ }
+ if (mget_tasks.size() > 0) {
+ RecordTick(db_statistics_, MULTIGET_COROUTINE_COUNT,
+ mget_tasks.size());
+ // Collect all results so far
+ std::vector<Status> statuses = folly::coro::blockingWait(
+ folly::coro::collectAllRange(std::move(mget_tasks))
+ .scheduleOn(&range->context()->executor()));
+ if (s.ok()) {
+ for (Status stat : statuses) {
+ if (!stat.ok()) {
+ s = std::move(stat);
+ break;
+ }
+ }
+ }
+
+ if (s.ok() && fp.KeyMaySpanNextFile()) {
+ f = fp.GetNextFileInLevel();
+ }
+ }
+#endif // USE_COROUTINES
+ }
+ // If bad status or we found final result for all the keys
+ if (!s.ok() || file_picker_range.empty()) {
+ break;
+ }
+ if (!f) {
+ // Reached the end of this level. Prepare the next level
+ fp.PrepareNextLevelForSearch();
+ if (!fp.IsSearchEnded()) {
+          // It's possible there is no overlap on this level and f is nullptr
+ f = fp.GetNextFileInLevel();
+ }
+ if (dump_stats_for_l0_file ||
+ (prev_level != 0 && prev_level != (int)fp.GetHitFileLevel())) {
+ // Dump the stats if the search has moved to the next level and
+ // reset for next level.
+ if (num_filter_read + num_index_read) {
+ RecordInHistogram(db_statistics_,
+ NUM_INDEX_AND_FILTER_BLOCKS_READ_PER_LEVEL,
+ num_index_read + num_filter_read);
+ }
+ if (num_sst_read) {
+ RecordInHistogram(db_statistics_, NUM_SST_READ_PER_LEVEL,
+ num_sst_read);
+ num_level_read++;
+ }
+ num_filter_read = 0;
+ num_index_read = 0;
+ num_sst_read = 0;
+ }
+ prev_level = fp.GetHitFileLevel();
+ }
+ }
+
+ // Dump stats for most recent level
+ if (num_filter_read + num_index_read) {
+ RecordInHistogram(db_statistics_,
+ NUM_INDEX_AND_FILTER_BLOCKS_READ_PER_LEVEL,
+ num_index_read + num_filter_read);
+ }
+ if (num_sst_read) {
+ RecordInHistogram(db_statistics_, NUM_SST_READ_PER_LEVEL, num_sst_read);
+ num_level_read++;
+ }
+ if (num_level_read) {
+ RecordInHistogram(db_statistics_, NUM_LEVEL_READ_PER_MULTIGET,
+ num_level_read);
+ }
+ }
+
+ if (s.ok() && !blob_ctxs.empty()) {
+ MultiGetBlob(read_options, keys_with_blobs_range, blob_ctxs);
+ }
+
+  // Process any leftover keys
+ for (auto iter = range->begin(); s.ok() && iter != range->end(); ++iter) {
+ GetContext& get_context = *iter->get_context;
+ Status* status = iter->s;
+ Slice user_key = iter->lkey->user_key();
+
+ if (db_statistics_ != nullptr) {
+ get_context.ReportCounters();
+ }
+ if (GetContext::kMerge == get_context.State()) {
+ if (!merge_operator_) {
+ *status = Status::InvalidArgument(
+ "merge_operator is not properly initialized.");
+ range->MarkKeyDone(iter);
+ continue;
+ }
+      // The merge operands are in the saver and we hit the beginning of the
+      // key's history; do a final merge of nullptr and the operands.
+ std::string* str_value =
+ iter->value != nullptr ? iter->value->GetSelf() : nullptr;
+ *status = MergeHelper::TimedFullMerge(
+ merge_operator_, user_key, nullptr, iter->merge_context.GetOperands(),
+ str_value, info_log_, db_statistics_, clock_,
+ /* result_operand */ nullptr, /* update_num_ops_stats */ true);
+ if (LIKELY(iter->value != nullptr)) {
+ iter->value->PinSelf();
+ range->AddValueSize(iter->value->size());
+ range->MarkKeyDone(iter);
+ if (range->GetValueSize() > read_options.value_size_soft_limit) {
+ s = Status::Aborted();
+ break;
+ }
+ }
+ } else {
+ range->MarkKeyDone(iter);
+ *status = Status::NotFound(); // Use an empty error message for speed
+ }
+ }
+
+ for (auto iter = range->begin(); iter != range->end(); ++iter) {
+ range->MarkKeyDone(iter);
+ *(iter->s) = s;
+ }
+}
+
+#ifdef USE_COROUTINES
+Status Version::ProcessBatch(
+ const ReadOptions& read_options, FilePickerMultiGet* batch,
+ std::vector<folly::coro::Task<Status>>& mget_tasks,
+ std::unordered_map<uint64_t, BlobReadContexts>* blob_ctxs,
+ autovector<FilePickerMultiGet, 4>& batches, std::deque<size_t>& waiting,
+ std::deque<size_t>& to_process, unsigned int& num_tasks_queued,
+ std::unordered_map<int, std::tuple<uint64_t, uint64_t, uint64_t>>&
+ mget_stats) {
+ FilePickerMultiGet& fp = *batch;
+ MultiGetRange range = fp.GetRange();
+ // Initialize a new empty range. Any keys that are not in this level will
+ // eventually become part of the new range.
+ MultiGetRange leftover(range, range.begin(), range.begin());
+ FdWithKeyRange* f = nullptr;
+ Status s;
+
+ f = fp.GetNextFileInLevel();
+ while (!f) {
+ fp.PrepareNextLevelForSearch();
+ if (!fp.IsSearchEnded()) {
+ f = fp.GetNextFileInLevel();
+ } else {
+ break;
+ }
+ }
+ while (f) {
+ MultiGetRange file_range = fp.CurrentFileRange();
+ Cache::Handle* table_handle = nullptr;
+ bool skip_filters = IsFilterSkipped(static_cast<int>(fp.GetHitFileLevel()),
+ fp.IsHitFileLastInLevel());
+ bool skip_range_deletions = false;
+ if (!skip_filters) {
+ Status status = table_cache_->MultiGetFilter(
+ read_options, *internal_comparator(), *f->file_metadata,
+ mutable_cf_options_.prefix_extractor,
+ cfd_->internal_stats()->GetFileReadHist(fp.GetHitFileLevel()),
+ fp.GetHitFileLevel(), &file_range, &table_handle);
+ if (status.ok()) {
+ skip_filters = true;
+ skip_range_deletions = true;
+ } else if (!status.IsNotSupported()) {
+ s = status;
+ }
+ }
+ if (!s.ok()) {
+ break;
+ }
+ // At this point, file_range contains any keys that are likely in this
+ // file. It may have false positives, but that's ok since higher level
+ // lookups for the key are dependent on this lookup anyway.
+ // Add the complement of file_range to leftover. That's the set of keys
+ // definitely not in this level.
+ // Subtract the complement of file_range from range, since they will be
+ // processed in a separate batch in parallel.
+ leftover += ~file_range;
+ range -= ~file_range;
+ if (!file_range.empty()) {
+ int level = fp.GetHitFileLevel();
+ auto stat = mget_stats.find(level);
+ if (stat == mget_stats.end()) {
+ auto entry = mget_stats.insert({level, {0, 0, 0}});
+ assert(entry.second);
+ stat = entry.first;
+ }
+
+ if (waiting.empty() && to_process.empty() &&
+ !fp.RemainingOverlapInLevel() && leftover.empty() &&
+ mget_tasks.empty()) {
+ // All keys are in one SST file, so take the fast path
+ s = MultiGetFromSST(read_options, file_range, fp.GetHitFileLevel(),
+ skip_filters, skip_range_deletions, f, *blob_ctxs,
+ table_handle, std::get<0>(stat->second),
+ std::get<1>(stat->second),
+ std::get<2>(stat->second));
+ } else {
+ mget_tasks.emplace_back(MultiGetFromSSTCoroutine(
+ read_options, file_range, fp.GetHitFileLevel(), skip_filters,
+ skip_range_deletions, f, *blob_ctxs, table_handle,
+ std::get<0>(stat->second), std::get<1>(stat->second),
+ std::get<2>(stat->second)));
+ ++num_tasks_queued;
+ }
+ }
+ if (fp.KeyMaySpanNextFile() && !file_range.empty()) {
+ break;
+ }
+ f = fp.GetNextFileInLevel();
+ }
+ // Split the current batch only if some keys are likely in this level and
+  // some are not. Only split if we're done with this level, i.e., f is null.
+ // Otherwise, it means there are more files in this level to look at.
+ if (s.ok() && !f && !leftover.empty() && !range.empty()) {
+ fp.ReplaceRange(range);
+ batches.emplace_back(&leftover, fp);
+ to_process.emplace_back(batches.size() - 1);
+ }
+ // 1. If f is non-null, that means we might not be done with this level.
+  // This can happen if one of the keys is the last key in the file, i.e.,
+ // fp.KeyMaySpanNextFile() is true.
+ // 2. If range is empty, then we're done with this range and no need to
+ // prepare the next level
+ // 3. If some tasks were queued for this range, then the next level will be
+ // prepared after executing those tasks
+ if (!f && !range.empty() && !num_tasks_queued) {
+ fp.PrepareNextLevelForSearch();
+ }
+ return s;
+}
+
+Status Version::MultiGetAsync(
+ const ReadOptions& options, MultiGetRange* range,
+ std::unordered_map<uint64_t, BlobReadContexts>* blob_ctxs) {
+ autovector<FilePickerMultiGet, 4> batches;
+ std::deque<size_t> waiting;
+ std::deque<size_t> to_process;
+ Status s;
+ std::vector<folly::coro::Task<Status>> mget_tasks;
+ std::unordered_map<int, std::tuple<uint64_t, uint64_t, uint64_t>> mget_stats;
+
+ // Create the initial batch with the input range
+ batches.emplace_back(range, &storage_info_.level_files_brief_,
+ storage_info_.num_non_empty_levels_,
+ &storage_info_.file_indexer_, user_comparator(),
+ internal_comparator());
+ to_process.emplace_back(0);
+
+ while (!to_process.empty()) {
+ // As we process a batch, it may get split into two. So reserve space for
+ // an additional batch in the autovector in order to prevent later moves
+ // of elements in ProcessBatch().
+ batches.reserve(batches.size() + 1);
+
+ size_t idx = to_process.front();
+ FilePickerMultiGet* batch = &batches.at(idx);
+ unsigned int num_tasks_queued = 0;
+ to_process.pop_front();
+ if (batch->IsSearchEnded() || batch->GetRange().empty()) {
+      // If to_process is empty, i.e., no more batches to look at, then we need
+      // to schedule the enqueued coroutines and wait for them. Otherwise, we
+ // skip this batch and move to the next one in to_process.
+ if (!to_process.empty()) {
+ continue;
+ }
+ } else {
+ // Look through one level. This may split the batch and enqueue it to
+ // to_process
+ s = ProcessBatch(options, batch, mget_tasks, blob_ctxs, batches, waiting,
+ to_process, num_tasks_queued, mget_stats);
+ // If ProcessBatch didn't enqueue any coroutine tasks, it means all
+ // keys were filtered out. So put the batch back in to_process to
+      // look up in the next level
+ if (!num_tasks_queued && !batch->IsSearchEnded()) {
+ // Put this back in the processing queue
+ to_process.emplace_back(idx);
+ } else if (num_tasks_queued) {
+ waiting.emplace_back(idx);
+ }
+ }
+ // If ProcessBatch() returned an error, then schedule the enqueued
+ // coroutines and wait for them, then abort the MultiGet.
+ if (to_process.empty() || !s.ok()) {
+ if (mget_tasks.size() > 0) {
+ assert(waiting.size());
+ RecordTick(db_statistics_, MULTIGET_COROUTINE_COUNT, mget_tasks.size());
+ // Collect all results so far
+ std::vector<Status> statuses = folly::coro::blockingWait(
+ folly::coro::collectAllRange(std::move(mget_tasks))
+ .scheduleOn(&range->context()->executor()));
+ mget_tasks.clear();
+ if (s.ok()) {
+ for (Status stat : statuses) {
+ if (!stat.ok()) {
+ s = std::move(stat);
+ break;
+ }
+ }
+ }
+
+ if (!s.ok()) {
+ break;
+ }
+
+ for (size_t wait_idx : waiting) {
+ FilePickerMultiGet& fp = batches.at(wait_idx);
+ // 1. If fp.GetHitFile() is non-null, then there could be more
+ // overlap in this level. So skip preparing next level.
+ // 2. If fp.GetRange() is empty, then this batch is completed
+ // and no need to prepare the next level.
+ if (!fp.GetHitFile() && !fp.GetRange().empty()) {
+ fp.PrepareNextLevelForSearch();
+ }
+ }
+ to_process.swap(waiting);
+ } else {
+ assert(!s.ok() || waiting.size() == 0);
+ }
+ }
+ if (!s.ok()) {
+ break;
+ }
+ }
+
+ uint64_t num_levels = 0;
+ for (auto& stat : mget_stats) {
+ if (stat.first == 0) {
+ num_levels += std::get<2>(stat.second);
+ } else {
+ num_levels++;
+ }
+
+ uint64_t num_meta_reads =
+ std::get<0>(stat.second) + std::get<1>(stat.second);
+ uint64_t num_sst_reads = std::get<2>(stat.second);
+ if (num_meta_reads > 0) {
+ RecordInHistogram(db_statistics_,
+ NUM_INDEX_AND_FILTER_BLOCKS_READ_PER_LEVEL,
+ num_meta_reads);
+ }
+ if (num_sst_reads > 0) {
+ RecordInHistogram(db_statistics_, NUM_SST_READ_PER_LEVEL, num_sst_reads);
+ }
+ }
+ if (num_levels > 0) {
+ RecordInHistogram(db_statistics_, NUM_LEVEL_READ_PER_MULTIGET, num_levels);
+ }
+
+ return s;
+}
+#endif  // USE_COROUTINES
+
+bool Version::IsFilterSkipped(int level, bool is_file_last_in_level) {
+ // Reaching the bottom level implies misses at all upper levels, so we'll
+ // skip checking the filters when we predict a hit.
+ return cfd_->ioptions()->optimize_filters_for_hits &&
+ (level > 0 || is_file_last_in_level) &&
+ level == storage_info_.num_non_empty_levels() - 1;
+}
+
+void VersionStorageInfo::GenerateLevelFilesBrief() {
+ level_files_brief_.resize(num_non_empty_levels_);
+ for (int level = 0; level < num_non_empty_levels_; level++) {
+ DoGenerateLevelFilesBrief(&level_files_brief_[level], files_[level],
+ &arena_);
+ }
+}
+
+void VersionStorageInfo::PrepareForVersionAppend(
+ const ImmutableOptions& immutable_options,
+ const MutableCFOptions& mutable_cf_options) {
+ ComputeCompensatedSizes();
+ UpdateNumNonEmptyLevels();
+ CalculateBaseBytes(immutable_options, mutable_cf_options);
+ UpdateFilesByCompactionPri(immutable_options, mutable_cf_options);
+ GenerateFileIndexer();
+ GenerateLevelFilesBrief();
+ GenerateLevel0NonOverlapping();
+ if (!immutable_options.allow_ingest_behind) {
+ GenerateBottommostFiles();
+ }
+ GenerateFileLocationIndex();
+}
+
+void Version::PrepareAppend(const MutableCFOptions& mutable_cf_options,
+ bool update_stats) {
+ TEST_SYNC_POINT_CALLBACK(
+ "Version::PrepareAppend:forced_check",
+ reinterpret_cast<void*>(&storage_info_.force_consistency_checks_));
+
+ if (update_stats) {
+ UpdateAccumulatedStats();
+ }
+
+ storage_info_.PrepareForVersionAppend(*cfd_->ioptions(), mutable_cf_options);
+}
+
+bool Version::MaybeInitializeFileMetaData(FileMetaData* file_meta) {
+ if (file_meta->init_stats_from_file || file_meta->compensated_file_size > 0) {
+ return false;
+ }
+ std::shared_ptr<const TableProperties> tp;
+ Status s = GetTableProperties(&tp, file_meta);
+ file_meta->init_stats_from_file = true;
+ if (!s.ok()) {
+ ROCKS_LOG_ERROR(vset_->db_options_->info_log,
+ "Unable to load table properties for file %" PRIu64
+ " --- %s\n",
+ file_meta->fd.GetNumber(), s.ToString().c_str());
+ return false;
+ }
+ if (tp.get() == nullptr) return false;
+ file_meta->num_entries = tp->num_entries;
+ file_meta->num_deletions = tp->num_deletions;
+ file_meta->raw_value_size = tp->raw_value_size;
+ file_meta->raw_key_size = tp->raw_key_size;
+
+ return true;
+}
+
+void VersionStorageInfo::UpdateAccumulatedStats(FileMetaData* file_meta) {
+ TEST_SYNC_POINT_CALLBACK("VersionStorageInfo::UpdateAccumulatedStats",
+ nullptr);
+
+ assert(file_meta->init_stats_from_file);
+ accumulated_file_size_ += file_meta->fd.GetFileSize();
+ accumulated_raw_key_size_ += file_meta->raw_key_size;
+ accumulated_raw_value_size_ += file_meta->raw_value_size;
+ accumulated_num_non_deletions_ +=
+ file_meta->num_entries - file_meta->num_deletions;
+ accumulated_num_deletions_ += file_meta->num_deletions;
+
+ current_num_non_deletions_ +=
+ file_meta->num_entries - file_meta->num_deletions;
+ current_num_deletions_ += file_meta->num_deletions;
+ current_num_samples_++;
+}
+
+void VersionStorageInfo::RemoveCurrentStats(FileMetaData* file_meta) {
+ if (file_meta->init_stats_from_file) {
+ current_num_non_deletions_ -=
+ file_meta->num_entries - file_meta->num_deletions;
+ current_num_deletions_ -= file_meta->num_deletions;
+ current_num_samples_--;
+ }
+}
+
+void Version::UpdateAccumulatedStats() {
+ // maximum number of table properties loaded from files.
+ const int kMaxInitCount = 20;
+ int init_count = 0;
+  // Here, only the first kMaxInitCount files that haven't yet been
+  // initialized from their table properties will have num_deletions filled
+  // in. The motivation is to cap the maximum I/O per Version creation.
+  // The reason for choosing files from lower levels instead of higher ones
+  // is that this design propagates the initialization upward: once the
+  // num_deletions of lower-level files are updated, those files have
+  // accurate compensated_file_size values, which triggers lower-to-higher
+  // level compactions that in turn create higher-level files whose
+  // num_deletions will be updated here.
+ for (int level = 0;
+ level < storage_info_.num_levels_ && init_count < kMaxInitCount;
+ ++level) {
+ for (auto* file_meta : storage_info_.files_[level]) {
+ if (MaybeInitializeFileMetaData(file_meta)) {
+ // each FileMeta will be initialized only once.
+ storage_info_.UpdateAccumulatedStats(file_meta);
+ // when option "max_open_files" is -1, all the file metadata has
+ // already been read, so MaybeInitializeFileMetaData() won't incur
+ // any I/O cost. "max_open_files=-1" means that the table cache passed
+ // to the VersionSet and then to the ColumnFamilySet has a size of
+ // TableCache::kInfiniteCapacity
+ if (vset_->GetColumnFamilySet()->get_table_cache()->GetCapacity() ==
+ TableCache::kInfiniteCapacity) {
+ continue;
+ }
+ if (++init_count >= kMaxInitCount) {
+ break;
+ }
+ }
+ }
+ }
+  // If all the sampled files contain only deletion entries, we load the
+  // table properties of files from higher levels to initialize that value.
+ for (int level = storage_info_.num_levels_ - 1;
+ storage_info_.accumulated_raw_value_size_ == 0 && level >= 0; --level) {
+ for (int i = static_cast<int>(storage_info_.files_[level].size()) - 1;
+ storage_info_.accumulated_raw_value_size_ == 0 && i >= 0; --i) {
+ if (MaybeInitializeFileMetaData(storage_info_.files_[level][i])) {
+ storage_info_.UpdateAccumulatedStats(storage_info_.files_[level][i]);
+ }
+ }
+ }
+}
+
+void VersionStorageInfo::ComputeCompensatedSizes() {
+ static const int kDeletionWeightOnCompaction = 2;
+ uint64_t average_value_size = GetAverageValueSize();
+
+ // compute the compensated size
+ for (int level = 0; level < num_levels_; level++) {
+ for (auto* file_meta : files_[level]) {
+      // Here we only compute compensated_file_size for those file_meta
+      // whose compensated_file_size is uninitialized (== 0). This is only the
+      // case for files that have just been created and that no other thread
+      // has access to yet. That's why we can safely mutate
+      // compensated_file_size.
+ if (file_meta->compensated_file_size == 0) {
+ file_meta->compensated_file_size = file_meta->fd.GetFileSize();
+        // We only boost the size of a file's deletion entries when the number
+        // of deletion entries is greater than the number of non-deletion
+        // entries in the file. The motivation is that in a stable workload,
+        // the number of deletion entries should be roughly equal to the number
+        // of non-deletion entries. If we compensated the size of deletion
+        // entries in a stable workload, the deletion compensation logic might
+        // introduce unwanted effects that change the shape of the LSM tree.
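+        //
+        // As a purely hypothetical example of the boost below: a file with
+        // 100 entries, 80 of which are deletions, gets an extra
+        // (80 * 2 - 100) * average_value_size * kDeletionWeightOnCompaction =
+        // 120 * average_value_size bytes added to its compensated size.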
+ if (file_meta->num_deletions * 2 >= file_meta->num_entries) {
+ file_meta->compensated_file_size +=
+ (file_meta->num_deletions * 2 - file_meta->num_entries) *
+ average_value_size * kDeletionWeightOnCompaction;
+ }
+ }
+ }
+ }
+}
+
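+// The last level has no level beneath it to compact into, so for level-based
+// compaction the deepest level considered as a compaction input here is
+// num_levels() - 2.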
+int VersionStorageInfo::MaxInputLevel() const {
+ if (compaction_style_ == kCompactionStyleLevel) {
+ return num_levels() - 2;
+ }
+ return 0;
+}
+
+int VersionStorageInfo::MaxOutputLevel(bool allow_ingest_behind) const {
+ if (allow_ingest_behind) {
+ assert(num_levels() > 1);
+ return num_levels() - 2;
+ }
+ return num_levels() - 1;
+}
+
+void VersionStorageInfo::EstimateCompactionBytesNeeded(
+ const MutableCFOptions& mutable_cf_options) {
+ // Only implemented for level-based compaction
+ if (compaction_style_ != kCompactionStyleLevel) {
+ estimated_compaction_needed_bytes_ = 0;
+ return;
+ }
+
+  // Start from level 0: if level 0 qualifies for compaction to level 1, we
+  // estimate the size of that compaction. Then we move on to the next level
+  // and see whether it qualifies for compaction to the next level. The size
+  // of a level is estimated as the actual size on the level plus the input
+  // bytes from the previous level, if there are any. If it exceeds its
+  // target, take the excess bytes as compaction input and add the size of
+  // that compaction to the total. We keep doing this for level 2, 3, etc.,
+  // until the last level, and return the accumulated bytes.
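+  //
+  // A hypothetical illustration (all numbers invented): with a 100MB target
+  // for L1, 150MB actually in L1 and 40MB arriving from an L0 compaction, L1
+  // is estimated at 190MB, i.e. 90MB over target. Those 90MB become
+  // compaction input, and the estimate is charged 90MB multiplied by
+  // (L2 size / 190MB + 1), i.e. the input bytes plus their estimated fan-out
+  // into L2.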
+
+ uint64_t bytes_compact_to_next_level = 0;
+ uint64_t level_size = 0;
+ for (auto* f : files_[0]) {
+ level_size += f->fd.GetFileSize();
+ }
+ // Level 0
+ bool level0_compact_triggered = false;
+ if (static_cast<int>(files_[0].size()) >=
+ mutable_cf_options.level0_file_num_compaction_trigger ||
+ level_size >= mutable_cf_options.max_bytes_for_level_base) {
+ level0_compact_triggered = true;
+ estimated_compaction_needed_bytes_ = level_size;
+ bytes_compact_to_next_level = level_size;
+ } else {
+ estimated_compaction_needed_bytes_ = 0;
+ }
+
+ // Level 1 and up.
+ uint64_t bytes_next_level = 0;
+ for (int level = base_level(); level <= MaxInputLevel(); level++) {
+ level_size = 0;
+ if (bytes_next_level > 0) {
+#ifndef NDEBUG
+ uint64_t level_size2 = 0;
+ for (auto* f : files_[level]) {
+ level_size2 += f->fd.GetFileSize();
+ }
+ assert(level_size2 == bytes_next_level);
+#endif
+ level_size = bytes_next_level;
+ bytes_next_level = 0;
+ } else {
+ for (auto* f : files_[level]) {
+ level_size += f->fd.GetFileSize();
+ }
+ }
+ if (level == base_level() && level0_compact_triggered) {
+ // Add base level size to compaction if level0 compaction triggered.
+ estimated_compaction_needed_bytes_ += level_size;
+ }
+ // Add size added by previous compaction
+ level_size += bytes_compact_to_next_level;
+ bytes_compact_to_next_level = 0;
+ uint64_t level_target = MaxBytesForLevel(level);
+ if (level_size > level_target) {
+ bytes_compact_to_next_level = level_size - level_target;
+      // Estimate the actual compaction fan-out ratio as the size ratio
+      // between the two levels.
+
+ assert(bytes_next_level == 0);
+ if (level + 1 < num_levels_) {
+ for (auto* f : files_[level + 1]) {
+ bytes_next_level += f->fd.GetFileSize();
+ }
+ }
+ if (bytes_next_level > 0) {
+ assert(level_size > 0);
+ estimated_compaction_needed_bytes_ += static_cast<uint64_t>(
+ static_cast<double>(bytes_compact_to_next_level) *
+ (static_cast<double>(bytes_next_level) /
+ static_cast<double>(level_size) +
+ 1));
+ }
+ }
+ }
+}
+
+namespace {
+uint32_t GetExpiredTtlFilesCount(const ImmutableOptions& ioptions,
+ const MutableCFOptions& mutable_cf_options,
+ const std::vector<FileMetaData*>& files) {
+ uint32_t ttl_expired_files_count = 0;
+
+ int64_t _current_time;
+ auto status = ioptions.clock->GetCurrentTime(&_current_time);
+ if (status.ok()) {
+ const uint64_t current_time = static_cast<uint64_t>(_current_time);
+ for (FileMetaData* f : files) {
+ if (!f->being_compacted) {
+ uint64_t oldest_ancester_time = f->TryGetOldestAncesterTime();
+ if (oldest_ancester_time != 0 &&
+ oldest_ancester_time < (current_time - mutable_cf_options.ttl)) {
+ ttl_expired_files_count++;
+ }
+ }
+ }
+ }
+ return ttl_expired_files_count;
+}
+} // anonymous namespace
+
+void VersionStorageInfo::ComputeCompactionScore(
+ const ImmutableOptions& immutable_options,
+ const MutableCFOptions& mutable_cf_options) {
+ double total_downcompact_bytes = 0.0;
+ // Historically, score is defined as actual bytes in a level divided by
+ // the level's target size, and 1.0 is the threshold for triggering
+ // compaction. Higher score means higher prioritization.
+  // Now we keep the compaction triggering condition, but consider more
+  // factors for prioritization, while still keeping the 1.0 threshold.
+  // In order to provide flexibility for reducing the score while still
+  // keeping it over 1.0, we scale the original score by 10x if it is
+  // larger than 1.0.
+ const double kScoreScale = 10.0;
+ for (int level = 0; level <= MaxInputLevel(); level++) {
+ double score;
+ if (level == 0) {
+ // We treat level-0 specially by bounding the number of files
+ // instead of number of bytes for two reasons:
+ //
+ // (1) With larger write-buffer sizes, it is nice not to do too
+ // many level-0 compactions.
+ //
+ // (2) The files in level-0 are merged on every read and
+ // therefore we wish to avoid too many files when the individual
+ // file size is small (perhaps because of a small write-buffer
+ // setting, or very high compression ratios, or lots of
+ // overwrites/deletions).
+ int num_sorted_runs = 0;
+ uint64_t total_size = 0;
+ for (auto* f : files_[level]) {
+ total_downcompact_bytes += static_cast<double>(f->fd.GetFileSize());
+ if (!f->being_compacted) {
+ total_size += f->compensated_file_size;
+ num_sorted_runs++;
+ }
+ }
+ if (compaction_style_ == kCompactionStyleUniversal) {
+ // For universal compaction, we use level0 score to indicate
+ // compaction score for the whole DB. Adding other levels as if
+ // they are L0 files.
+ for (int i = 1; i < num_levels(); i++) {
+          // It's possible that a subset of the files in a level may be in a
+          // compaction, due to delete-triggered compaction or trivial move.
+ // In that case, the below check may not catch a level being
+ // compacted as it only checks the first file. The worst that can
+ // happen is a scheduled compaction thread will find nothing to do.
+ if (!files_[i].empty() && !files_[i][0]->being_compacted) {
+ num_sorted_runs++;
+ }
+ }
+ }
+
+ if (compaction_style_ == kCompactionStyleFIFO) {
+ score = static_cast<double>(total_size) /
+ mutable_cf_options.compaction_options_fifo.max_table_files_size;
+ if (mutable_cf_options.compaction_options_fifo.allow_compaction ||
+ mutable_cf_options.compaction_options_fifo.age_for_warm > 0) {
+          // Warm tier move can happen at any time. It's too expensive to
+          // check every file's timestamp now. For now, just trigger it
+ // slightly more frequently than FIFO compaction so that this
+ // happens first.
+ score = std::max(
+ static_cast<double>(num_sorted_runs) /
+ mutable_cf_options.level0_file_num_compaction_trigger,
+ score);
+ }
+ if (mutable_cf_options.ttl > 0) {
+ score = std::max(
+ static_cast<double>(GetExpiredTtlFilesCount(
+ immutable_options, mutable_cf_options, files_[level])),
+ score);
+ }
+ } else {
+ score = static_cast<double>(num_sorted_runs) /
+ mutable_cf_options.level0_file_num_compaction_trigger;
+ if (compaction_style_ == kCompactionStyleLevel && num_levels() > 1) {
+          // Level-based compaction involves L0->L0 compactions that can lead
+          // to oversized L0 files. Take size into account as well to avoid
+          // later giant compactions to the base level.
+          // If the L0 score is always too high, L0->L1 will always be
+          // prioritized over L1->L2 compaction and L1 will grow too large.
+          // But if the L0 score isn't high enough, L0 will accumulate and
+          // data won't be moved to L1 fast enough. With potential L0->L0
+          // compactions, the number of L0 files isn't always an indication of
+          // L0 oversizing, so we also need to consider the total size of L0.
+ if (immutable_options.level_compaction_dynamic_level_bytes) {
+ if (total_size >= mutable_cf_options.max_bytes_for_level_base) {
+ // When calculating estimated_compaction_needed_bytes, we assume
+ // L0 is qualified as pending compactions. We will need to make
+ // sure that it qualifies for compaction.
+              // It might be guaranteed by the logic below anyway, but we are
+ // explicit here to make sure we don't stop writes with no
+ // compaction scheduled.
+ score = std::max(score, 1.01);
+ }
+ if (total_size > level_max_bytes_[base_level_]) {
+ // In this case, we compare L0 size with actual L1 size and make
+              // sure the score is more than 1.0 (10.0 after scaling) if L0 is
+              // larger
+ // than L1. Since in this case L1 score is lower than 10.0, L0->L1
+ // is prioritized over L1->L2.
+ uint64_t base_level_size = 0;
+ for (auto f : files_[base_level_]) {
+ base_level_size += f->compensated_file_size;
+ }
+ score = std::max(score, static_cast<double>(total_size) /
+ static_cast<double>(std::max(
+ base_level_size,
+ level_max_bytes_[base_level_])));
+ }
+ if (score > 1.0) {
+ score *= kScoreScale;
+ }
+ } else {
+ score = std::max(score,
+ static_cast<double>(total_size) /
+ mutable_cf_options.max_bytes_for_level_base);
+ }
+ }
+ }
+ } else {
+ // Compute the ratio of current size to size limit.
+ uint64_t level_bytes_no_compacting = 0;
+ uint64_t level_total_bytes = 0;
+ for (auto f : files_[level]) {
+ level_total_bytes += f->fd.GetFileSize();
+ if (!f->being_compacted) {
+ level_bytes_no_compacting += f->compensated_file_size;
+ }
+ }
+ if (!immutable_options.level_compaction_dynamic_level_bytes ||
+ level_bytes_no_compacting < MaxBytesForLevel(level)) {
+ score = static_cast<double>(level_bytes_no_compacting) /
+ MaxBytesForLevel(level);
+ } else {
+        // If a large amount of data will soon be compacted down to the
+        // current level, we de-prioritize compaction from a level where the
+        // incoming data would make up a large ratio. We do this by dividing
+        // the level size not by the target level size alone, but by the sum
+        // of the target size and the incoming compaction bytes.
+ score = static_cast<double>(level_bytes_no_compacting) /
+ (MaxBytesForLevel(level) + total_downcompact_bytes) *
+ kScoreScale;
+ }
+ if (level_total_bytes > MaxBytesForLevel(level)) {
+ total_downcompact_bytes +=
+ static_cast<double>(level_total_bytes - MaxBytesForLevel(level));
+ }
+ }
+ compaction_level_[level] = level;
+ compaction_score_[level] = score;
+ }
+
+  // Sort all the levels based on their score. Higher scores get listed
+  // first. Use bubble sort because the number of entries is small.
+ for (int i = 0; i < num_levels() - 2; i++) {
+ for (int j = i + 1; j < num_levels() - 1; j++) {
+ if (compaction_score_[i] < compaction_score_[j]) {
+ double score = compaction_score_[i];
+ int level = compaction_level_[i];
+ compaction_score_[i] = compaction_score_[j];
+ compaction_level_[i] = compaction_level_[j];
+ compaction_score_[j] = score;
+ compaction_level_[j] = level;
+ }
+ }
+ }
+ ComputeFilesMarkedForCompaction();
+ if (!immutable_options.allow_ingest_behind) {
+ ComputeBottommostFilesMarkedForCompaction();
+ }
+ if (mutable_cf_options.ttl > 0) {
+ ComputeExpiredTtlFiles(immutable_options, mutable_cf_options.ttl);
+ }
+ if (mutable_cf_options.periodic_compaction_seconds > 0) {
+ ComputeFilesMarkedForPeriodicCompaction(
+ immutable_options, mutable_cf_options.periodic_compaction_seconds);
+ }
+
+ if (mutable_cf_options.enable_blob_garbage_collection &&
+ mutable_cf_options.blob_garbage_collection_age_cutoff > 0.0 &&
+ mutable_cf_options.blob_garbage_collection_force_threshold < 1.0) {
+ ComputeFilesMarkedForForcedBlobGC(
+ mutable_cf_options.blob_garbage_collection_age_cutoff,
+ mutable_cf_options.blob_garbage_collection_force_threshold);
+ }
+
+ EstimateCompactionBytesNeeded(mutable_cf_options);
+}
+
+void VersionStorageInfo::ComputeFilesMarkedForCompaction() {
+ files_marked_for_compaction_.clear();
+ int last_qualify_level = 0;
+
+  // Do not include files from the last level that contains data.
+  // Even if the table properties collector suggests a file on that level,
+  // we should not move it to a new level.
+ for (int level = num_levels() - 1; level >= 1; level--) {
+ if (!files_[level].empty()) {
+ last_qualify_level = level - 1;
+ break;
+ }
+ }
+
+ for (int level = 0; level <= last_qualify_level; level++) {
+ for (auto* f : files_[level]) {
+ if (!f->being_compacted && f->marked_for_compaction) {
+ files_marked_for_compaction_.emplace_back(level, f);
+ }
+ }
+ }
+}
+
+void VersionStorageInfo::ComputeExpiredTtlFiles(
+ const ImmutableOptions& ioptions, const uint64_t ttl) {
+ assert(ttl > 0);
+
+ expired_ttl_files_.clear();
+
+ int64_t _current_time;
+ auto status = ioptions.clock->GetCurrentTime(&_current_time);
+ if (!status.ok()) {
+ return;
+ }
+ const uint64_t current_time = static_cast<uint64_t>(_current_time);
+
+ for (int level = 0; level < num_levels() - 1; level++) {
+ for (FileMetaData* f : files_[level]) {
+ if (!f->being_compacted) {
+ uint64_t oldest_ancester_time = f->TryGetOldestAncesterTime();
+ if (oldest_ancester_time > 0 &&
+ oldest_ancester_time < (current_time - ttl)) {
+ expired_ttl_files_.emplace_back(level, f);
+ }
+ }
+ }
+ }
+}
+
+void VersionStorageInfo::ComputeFilesMarkedForPeriodicCompaction(
+ const ImmutableOptions& ioptions,
+ const uint64_t periodic_compaction_seconds) {
+ assert(periodic_compaction_seconds > 0);
+
+ files_marked_for_periodic_compaction_.clear();
+
+ int64_t temp_current_time;
+ auto status = ioptions.clock->GetCurrentTime(&temp_current_time);
+ if (!status.ok()) {
+ return;
+ }
+ const uint64_t current_time = static_cast<uint64_t>(temp_current_time);
+
+ // If periodic_compaction_seconds is larger than current time, periodic
+ // compaction can't possibly be triggered.
+ if (periodic_compaction_seconds > current_time) {
+ return;
+ }
+
+ const uint64_t allowed_time_limit =
+ current_time - periodic_compaction_seconds;
+
+ for (int level = 0; level < num_levels(); level++) {
+ for (auto f : files_[level]) {
+ if (!f->being_compacted) {
+ // Compute a file's modification time in the following order:
+ // 1. Use file_creation_time table property if it is > 0.
+ // 2. Use creation_time table property if it is > 0.
+        // 3. Use the file's mtime metadata if the above two table properties
+        //    are 0.
+ // Don't consider the file at all if the modification time cannot be
+ // correctly determined based on the above conditions.
+ uint64_t file_modification_time = f->TryGetFileCreationTime();
+ if (file_modification_time == kUnknownFileCreationTime) {
+ file_modification_time = f->TryGetOldestAncesterTime();
+ }
+ if (file_modification_time == kUnknownOldestAncesterTime) {
+ auto file_path = TableFileName(ioptions.cf_paths, f->fd.GetNumber(),
+ f->fd.GetPathId());
+ status = ioptions.env->GetFileModificationTime(
+ file_path, &file_modification_time);
+ if (!status.ok()) {
+ ROCKS_LOG_WARN(ioptions.logger,
+ "Can't get file modification time: %s: %s",
+ file_path.c_str(), status.ToString().c_str());
+ continue;
+ }
+ }
+ if (file_modification_time > 0 &&
+ file_modification_time < allowed_time_limit) {
+ files_marked_for_periodic_compaction_.emplace_back(level, f);
+ }
+ }
+ }
+ }
+}
+
+void VersionStorageInfo::ComputeFilesMarkedForForcedBlobGC(
+ double blob_garbage_collection_age_cutoff,
+ double blob_garbage_collection_force_threshold) {
+ files_marked_for_forced_blob_gc_.clear();
+
+ if (blob_files_.empty()) {
+ return;
+ }
+
+ // Number of blob files eligible for GC based on age
+ const size_t cutoff_count = static_cast<size_t>(
+ blob_garbage_collection_age_cutoff * blob_files_.size());
+ if (!cutoff_count) {
+ return;
+ }
+
+ // Compute the sum of total and garbage bytes over the oldest batch of blob
+ // files. The oldest batch is defined as the set of blob files which are
+ // kept alive by the same SSTs as the very oldest one. Here is a toy example.
+ // Let's assume we have three SSTs 1, 2, and 3, and four blob files 10, 11,
+ // 12, and 13. Also, let's say SSTs 1 and 2 both rely on blob file 10 and
+ // potentially some higher-numbered ones, while SST 3 relies on blob file 12
+ // and potentially some higher-numbered ones. Then, the SST to oldest blob
+ // file mapping is as follows:
+ //
+ // SST file number Oldest blob file number
+ // 1 10
+ // 2 10
+ // 3 12
+ //
+ // This is what the same thing looks like from the blob files' POV. (Note that
+ // the linked SSTs simply denote the inverse mapping of the above.)
+ //
+ // Blob file number Linked SST set
+ // 10 {1, 2}
+ // 11 {}
+ // 12 {3}
+ // 13 {}
+ //
+ // Then, the oldest batch of blob files consists of blob files 10 and 11,
+ // and we can get rid of them by forcing the compaction of SSTs 1 and 2.
+ //
+ // Note that the overall ratio of garbage computed for the batch has to exceed
+ // blob_garbage_collection_force_threshold and the entire batch has to be
+ // eligible for GC according to blob_garbage_collection_age_cutoff in order
+ // for us to schedule any compactions.
+ const auto& oldest_meta = blob_files_.front();
+ assert(oldest_meta);
+
+ const auto& linked_ssts = oldest_meta->GetLinkedSsts();
+ assert(!linked_ssts.empty());
+
+ size_t count = 1;
+ uint64_t sum_total_blob_bytes = oldest_meta->GetTotalBlobBytes();
+ uint64_t sum_garbage_blob_bytes = oldest_meta->GetGarbageBlobBytes();
+
+ assert(cutoff_count <= blob_files_.size());
+
+ for (; count < cutoff_count; ++count) {
+ const auto& meta = blob_files_[count];
+ assert(meta);
+
+ if (!meta->GetLinkedSsts().empty()) {
+ // Found the beginning of the next batch of blob files
+ break;
+ }
+
+ sum_total_blob_bytes += meta->GetTotalBlobBytes();
+ sum_garbage_blob_bytes += meta->GetGarbageBlobBytes();
+ }
+
+ if (count < blob_files_.size()) {
+ const auto& meta = blob_files_[count];
+ assert(meta);
+
+ if (meta->GetLinkedSsts().empty()) {
+ // Some files in the oldest batch are not eligible for GC
+ return;
+ }
+ }
+
+ if (sum_garbage_blob_bytes <
+ blob_garbage_collection_force_threshold * sum_total_blob_bytes) {
+ return;
+ }
+
+ for (uint64_t sst_file_number : linked_ssts) {
+ const FileLocation location = GetFileLocation(sst_file_number);
+ assert(location.IsValid());
+
+ const int level = location.GetLevel();
+ assert(level >= 0);
+
+ const size_t pos = location.GetPosition();
+
+ FileMetaData* const sst_meta = files_[level][pos];
+ assert(sst_meta);
+
+ if (sst_meta->being_compacted) {
+ continue;
+ }
+
+ files_marked_for_forced_blob_gc_.emplace_back(level, sst_meta);
+ }
+}
+
+namespace {
+
+// used to sort files by size
+struct Fsize {
+ size_t index;
+ FileMetaData* file;
+};
+
+// Comparator that is used to sort files based on their size
+// In normal mode: descending size
+bool CompareCompensatedSizeDescending(const Fsize& first, const Fsize& second) {
+ return (first.file->compensated_file_size >
+ second.file->compensated_file_size);
+}
+} // anonymous namespace
+
+void VersionStorageInfo::AddFile(int level, FileMetaData* f) {
+ auto& level_files = files_[level];
+ level_files.push_back(f);
+
+ f->refs++;
+}
+
+void VersionStorageInfo::AddBlobFile(
+ std::shared_ptr<BlobFileMetaData> blob_file_meta) {
+ assert(blob_file_meta);
+
+ assert(blob_files_.empty() ||
+ (blob_files_.back() && blob_files_.back()->GetBlobFileNumber() <
+ blob_file_meta->GetBlobFileNumber()));
+
+ blob_files_.emplace_back(std::move(blob_file_meta));
+}
+
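+// AddBlobFile above only ever appends blob files in strictly increasing file
+// number order, so blob_files_ stays sorted and the binary search below is
+// well defined.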
+VersionStorageInfo::BlobFiles::const_iterator
+VersionStorageInfo::GetBlobFileMetaDataLB(uint64_t blob_file_number) const {
+ return std::lower_bound(
+ blob_files_.begin(), blob_files_.end(), blob_file_number,
+ [](const std::shared_ptr<BlobFileMetaData>& lhs, uint64_t rhs) {
+ assert(lhs);
+ return lhs->GetBlobFileNumber() < rhs;
+ });
+}
+
+void VersionStorageInfo::SetFinalized() {
+ finalized_ = true;
+
+#ifndef NDEBUG
+ if (compaction_style_ != kCompactionStyleLevel) {
+ // Not level based compaction.
+ return;
+ }
+ assert(base_level_ < 0 || num_levels() == 1 ||
+ (base_level_ >= 1 && base_level_ < num_levels()));
+  // Verify that every level between L0 and base_level (exclusive) is empty
+ for (int level = 1; level < base_level(); level++) {
+ assert(NumLevelBytes(level) == 0);
+ }
+ uint64_t max_bytes_prev_level = 0;
+ for (int level = base_level(); level < num_levels() - 1; level++) {
+ if (LevelFiles(level).size() == 0) {
+ continue;
+ }
+ assert(MaxBytesForLevel(level) >= max_bytes_prev_level);
+ max_bytes_prev_level = MaxBytesForLevel(level);
+ }
+ for (int level = 0; level < num_levels(); level++) {
+ assert(LevelFiles(level).size() == 0 ||
+ LevelFiles(level).size() == LevelFilesBrief(level).num_files);
+ if (LevelFiles(level).size() > 0) {
+ assert(level < num_non_empty_levels());
+ }
+ }
+ assert(compaction_level_.size() > 0);
+ assert(compaction_level_.size() == compaction_score_.size());
+#endif
+}
+
+void VersionStorageInfo::UpdateNumNonEmptyLevels() {
+ num_non_empty_levels_ = num_levels_;
+ for (int i = num_levels_ - 1; i >= 0; i--) {
+ if (files_[i].size() != 0) {
+ return;
+ } else {
+ num_non_empty_levels_ = i;
+ }
+ }
+}
+
+namespace {
+// Sort `temp` based on ratio of overlapping size over file size
+void SortFileByOverlappingRatio(
+ const InternalKeyComparator& icmp, const std::vector<FileMetaData*>& files,
+ const std::vector<FileMetaData*>& next_level_files, SystemClock* clock,
+ int level, int num_non_empty_levels, uint64_t ttl,
+ std::vector<Fsize>* temp) {
+ std::unordered_map<uint64_t, uint64_t> file_to_order;
+ auto next_level_it = next_level_files.begin();
+
+ int64_t curr_time;
+ Status status = clock->GetCurrentTime(&curr_time);
+ if (!status.ok()) {
+ // If we can't get time, disable TTL.
+ ttl = 0;
+ }
+
+ FileTtlBooster ttl_booster(static_cast<uint64_t>(curr_time), ttl,
+ num_non_empty_levels, level);
+
+ for (auto& file : files) {
+ uint64_t overlapping_bytes = 0;
+    // Skip next-level files that fall entirely before the current file
+ while (next_level_it != next_level_files.end() &&
+ icmp.Compare((*next_level_it)->largest, file->smallest) < 0) {
+ next_level_it++;
+ }
+
+ while (next_level_it != next_level_files.end() &&
+ icmp.Compare((*next_level_it)->smallest, file->largest) < 0) {
+ overlapping_bytes += (*next_level_it)->fd.file_size;
+
+ if (icmp.Compare((*next_level_it)->largest, file->largest) > 0) {
+        // The next-level file crosses the largest-key boundary of the
+        // current file.
+ break;
+ }
+ next_level_it++;
+ }
+
+ uint64_t ttl_boost_score = (ttl > 0) ? ttl_booster.GetBoostScore(file) : 1;
+ assert(ttl_boost_score > 0);
+ assert(file->compensated_file_size != 0);
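+    // The ordering key is the number of bytes overlapping the next level,
+    // scaled by 1024 and divided by the file's compensated size and the TTL
+    // boost. Files are partially sorted by this value in ascending order
+    // below, so files with relatively little next-level overlap (cheaper to
+    // compact, or candidates for trivial moves) are picked first.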
+ file_to_order[file->fd.GetNumber()] = overlapping_bytes * 1024U /
+ file->compensated_file_size /
+ ttl_boost_score;
+ }
+
+ size_t num_to_sort = temp->size() > VersionStorageInfo::kNumberFilesToSort
+ ? VersionStorageInfo::kNumberFilesToSort
+ : temp->size();
+
+ std::partial_sort(temp->begin(), temp->begin() + num_to_sort, temp->end(),
+ [&](const Fsize& f1, const Fsize& f2) -> bool {
+ // If score is the same, pick file with smaller keys.
+ // This makes the algorithm more deterministic, and also
+                    // helps the trivial move case have more files to
+ // extend.
+ if (file_to_order[f1.file->fd.GetNumber()] ==
+ file_to_order[f2.file->fd.GetNumber()]) {
+ return icmp.Compare(f1.file->smallest,
+ f2.file->smallest) < 0;
+ }
+ return file_to_order[f1.file->fd.GetNumber()] <
+ file_to_order[f2.file->fd.GetNumber()];
+ });
+}
+
+void SortFileByRoundRobin(const InternalKeyComparator& icmp,
+ std::vector<InternalKey>* compact_cursor,
+ bool level0_non_overlapping, int level,
+ std::vector<Fsize>* temp) {
+ if (level == 0 && !level0_non_overlapping) {
+    // Use kOldestSmallestSeqFirst ordering when level == 0, since the
+ // files may overlap (not fully sorted)
+ std::sort(temp->begin(), temp->end(),
+ [](const Fsize& f1, const Fsize& f2) -> bool {
+ return f1.file->fd.smallest_seqno < f2.file->fd.smallest_seqno;
+ });
+ return;
+ }
+
+ bool should_move_files =
+ compact_cursor->at(level).size() > 0 && temp->size() > 1;
+
+ // The iterator points to the Fsize with smallest key larger than or equal to
+ // the given cursor
+ std::vector<Fsize>::iterator current_file_iter;
+ if (should_move_files) {
+    // Find the file whose smallest key is larger than or equal to
+ // the cursor (the smallest key in the successor file of the last
+ // chosen file), skip this if the cursor is invalid or there is only
+ // one file in this level
+ current_file_iter = std::lower_bound(
+ temp->begin(), temp->end(), compact_cursor->at(level),
+ [&](const Fsize& f, const InternalKey& cursor) -> bool {
+ return icmp.Compare(cursor, f.file->smallest) > 0;
+ });
+
+ should_move_files =
+ current_file_iter != temp->end() && current_file_iter != temp->begin();
+ }
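+  // Illustrative rotation: with files whose smallest keys are [a, c, e, g]
+  // and a cursor of d, current_file_iter points at the file starting at e and
+  // the block below reorders temp to [e, g, a, c], so compaction resumes
+  // where the previous round-robin pick left off.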
+ if (should_move_files) {
+ // Construct a local temporary vector
+ std::vector<Fsize> local_temp;
+ local_temp.reserve(temp->size());
+    // Move the selected file into the first position and its successors
+ // into the second, third, ..., positions
+ for (auto iter = current_file_iter; iter != temp->end(); iter++) {
+ local_temp.push_back(*iter);
+ }
+    // Move the original predecessors of the selected file in a round-robin
+ // manner
+ for (auto iter = temp->begin(); iter != current_file_iter; iter++) {
+ local_temp.push_back(*iter);
+ }
+ // Replace all the items in temp
+ for (size_t i = 0; i < local_temp.size(); i++) {
+ temp->at(i) = local_temp[i];
+ }
+ }
+}
+} // anonymous namespace
+
+void VersionStorageInfo::UpdateFilesByCompactionPri(
+ const ImmutableOptions& ioptions, const MutableCFOptions& options) {
+ if (compaction_style_ == kCompactionStyleNone ||
+ compaction_style_ == kCompactionStyleFIFO ||
+ compaction_style_ == kCompactionStyleUniversal) {
+    // These compaction styles do not use files_by_compaction_pri_.
+ return;
+ }
+ // No need to sort the highest level because it is never compacted.
+ for (int level = 0; level < num_levels() - 1; level++) {
+ const std::vector<FileMetaData*>& files = files_[level];
+ auto& files_by_compaction_pri = files_by_compaction_pri_[level];
+ assert(files_by_compaction_pri.size() == 0);
+
+ // populate a temp vector for sorting based on size
+ std::vector<Fsize> temp(files.size());
+ for (size_t i = 0; i < files.size(); i++) {
+ temp[i].index = i;
+ temp[i].file = files[i];
+ }
+
+    // Sort at least the top kNumberFilesToSort files according to the chosen
+    // compaction priority
+ size_t num = VersionStorageInfo::kNumberFilesToSort;
+ if (num > temp.size()) {
+ num = temp.size();
+ }
+ switch (ioptions.compaction_pri) {
+ case kByCompensatedSize:
+ std::partial_sort(temp.begin(), temp.begin() + num, temp.end(),
+ CompareCompensatedSizeDescending);
+ break;
+ case kOldestLargestSeqFirst:
+ std::sort(temp.begin(), temp.end(),
+ [](const Fsize& f1, const Fsize& f2) -> bool {
+ return f1.file->fd.largest_seqno <
+ f2.file->fd.largest_seqno;
+ });
+ break;
+ case kOldestSmallestSeqFirst:
+ std::sort(temp.begin(), temp.end(),
+ [](const Fsize& f1, const Fsize& f2) -> bool {
+ return f1.file->fd.smallest_seqno <
+ f2.file->fd.smallest_seqno;
+ });
+ break;
+ case kMinOverlappingRatio:
+ SortFileByOverlappingRatio(*internal_comparator_, files_[level],
+ files_[level + 1], ioptions.clock, level,
+ num_non_empty_levels_, options.ttl, &temp);
+ break;
+ case kRoundRobin:
+ SortFileByRoundRobin(*internal_comparator_, &compact_cursor_,
+ level0_non_overlapping_, level, &temp);
+ break;
+ default:
+ assert(false);
+ }
+ assert(temp.size() == files.size());
+
+ // initialize files_by_compaction_pri_
+ for (size_t i = 0; i < temp.size(); i++) {
+ files_by_compaction_pri.push_back(static_cast<int>(temp[i].index));
+ }
+ next_file_to_compact_by_size_[level] = 0;
+ assert(files_[level].size() == files_by_compaction_pri_[level].size());
+ }
+}
+
+void VersionStorageInfo::GenerateLevel0NonOverlapping() {
+ assert(!finalized_);
+ level0_non_overlapping_ = true;
+ if (level_files_brief_.size() == 0) {
+ return;
+ }
+
+ // A copy of L0 files sorted by smallest key
+ std::vector<FdWithKeyRange> level0_sorted_file(
+ level_files_brief_[0].files,
+ level_files_brief_[0].files + level_files_brief_[0].num_files);
+ std::sort(level0_sorted_file.begin(), level0_sorted_file.end(),
+ [this](const FdWithKeyRange& f1, const FdWithKeyRange& f2) -> bool {
+ return (internal_comparator_->Compare(f1.smallest_key,
+ f2.smallest_key) < 0);
+ });
+
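+  // After sorting by smallest key it is enough to check adjacent pairs: two
+  // neighbouring files overlap iff the previous file's largest key reaches
+  // the next file's smallest key, e.g. [a, c] and [b, d] overlap because
+  // c >= b, while [a, c] and [d, f] do not.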
+ for (size_t i = 1; i < level0_sorted_file.size(); ++i) {
+ FdWithKeyRange& f = level0_sorted_file[i];
+ FdWithKeyRange& prev = level0_sorted_file[i - 1];
+ if (internal_comparator_->Compare(prev.largest_key, f.smallest_key) >= 0) {
+ level0_non_overlapping_ = false;
+ break;
+ }
+ }
+}
+
+void VersionStorageInfo::GenerateBottommostFiles() {
+ assert(!finalized_);
+ assert(bottommost_files_.empty());
+ for (size_t level = 0; level < level_files_brief_.size(); ++level) {
+ for (size_t file_idx = 0; file_idx < level_files_brief_[level].num_files;
+ ++file_idx) {
+ const FdWithKeyRange& f = level_files_brief_[level].files[file_idx];
+ int l0_file_idx;
+ if (level == 0) {
+ l0_file_idx = static_cast<int>(file_idx);
+ } else {
+ l0_file_idx = -1;
+ }
+ Slice smallest_user_key = ExtractUserKey(f.smallest_key);
+ Slice largest_user_key = ExtractUserKey(f.largest_key);
+ if (!RangeMightExistAfterSortedRun(smallest_user_key, largest_user_key,
+ static_cast<int>(level),
+ l0_file_idx)) {
+ bottommost_files_.emplace_back(static_cast<int>(level),
+ f.file_metadata);
+ }
+ }
+ }
+}
+
+void VersionStorageInfo::GenerateFileLocationIndex() {
+ size_t num_files = 0;
+
+ for (int level = 0; level < num_levels_; ++level) {
+ num_files += files_[level].size();
+ }
+
+ file_locations_.reserve(num_files);
+
+ for (int level = 0; level < num_levels_; ++level) {
+ for (size_t pos = 0; pos < files_[level].size(); ++pos) {
+ const FileMetaData* const meta = files_[level][pos];
+ assert(meta);
+
+ const uint64_t file_number = meta->fd.GetNumber();
+
+ assert(file_locations_.find(file_number) == file_locations_.end());
+ file_locations_.emplace(file_number, FileLocation(level, pos));
+ }
+ }
+}
+
+void VersionStorageInfo::UpdateOldestSnapshot(SequenceNumber seqnum) {
+ assert(seqnum >= oldest_snapshot_seqnum_);
+ oldest_snapshot_seqnum_ = seqnum;
+ if (oldest_snapshot_seqnum_ > bottommost_files_mark_threshold_) {
+ ComputeBottommostFilesMarkedForCompaction();
+ }
+}
+
+void VersionStorageInfo::ComputeBottommostFilesMarkedForCompaction() {
+ bottommost_files_marked_for_compaction_.clear();
+ bottommost_files_mark_threshold_ = kMaxSequenceNumber;
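+  // bottommost_files_mark_threshold_ records the smallest largest_seqno that
+  // kept a bottommost file from being marked; once the oldest snapshot
+  // advances past it, UpdateOldestSnapshot() re-runs this computation.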
+ for (auto& level_and_file : bottommost_files_) {
+ if (!level_and_file.second->being_compacted &&
+ level_and_file.second->fd.largest_seqno != 0) {
+ // largest_seqno might be nonzero due to containing the final key in an
+ // earlier compaction, whose seqnum we didn't zero out. Multiple deletions
+      // ensure the file really contains deleted or overwritten keys.
+ if (level_and_file.second->fd.largest_seqno < oldest_snapshot_seqnum_) {
+ bottommost_files_marked_for_compaction_.push_back(level_and_file);
+ } else {
+ bottommost_files_mark_threshold_ =
+ std::min(bottommost_files_mark_threshold_,
+ level_and_file.second->fd.largest_seqno);
+ }
+ }
+ }
+}
+
+void Version::Ref() { ++refs_; }
+
+bool Version::Unref() {
+ assert(refs_ >= 1);
+ --refs_;
+ if (refs_ == 0) {
+ delete this;
+ return true;
+ }
+ return false;
+}
+
+bool VersionStorageInfo::OverlapInLevel(int level,
+ const Slice* smallest_user_key,
+ const Slice* largest_user_key) {
+ if (level >= num_non_empty_levels_) {
+ // empty level, no overlap
+ return false;
+ }
+ return SomeFileOverlapsRange(*internal_comparator_, (level > 0),
+ level_files_brief_[level], smallest_user_key,
+ largest_user_key);
+}
+
+// Store in "*inputs" all files in "level" that overlap [begin,end]
+// If hint_index is specified, then it points to a file in the
+// overlapping range.
+// If file_index is non-null, *file_index is set to the index of one of the
+// overlapping files (or left as -1 if there is no overlap).
+void VersionStorageInfo::GetOverlappingInputs(
+ int level, const InternalKey* begin, const InternalKey* end,
+ std::vector<FileMetaData*>* inputs, int hint_index, int* file_index,
+ bool expand_range, InternalKey** next_smallest) const {
+ if (level >= num_non_empty_levels_) {
+ // this level is empty, no overlapping inputs
+ return;
+ }
+
+ inputs->clear();
+ if (file_index) {
+ *file_index = -1;
+ }
+ const Comparator* user_cmp = user_comparator_;
+ if (level > 0) {
+ GetOverlappingInputsRangeBinarySearch(level, begin, end, inputs, hint_index,
+ file_index, false, next_smallest);
+ return;
+ }
+
+ if (next_smallest) {
+ // next_smallest key only makes sense for non-level 0, where files are
+ // non-overlapping
+ *next_smallest = nullptr;
+ }
+
+ Slice user_begin, user_end;
+ if (begin != nullptr) {
+ user_begin = begin->user_key();
+ }
+ if (end != nullptr) {
+ user_end = end->user_key();
+ }
+
+  // `index` holds the indices of the files that still need to be checked.
+ std::list<size_t> index;
+ for (size_t i = 0; i < level_files_brief_[level].num_files; i++) {
+ index.emplace_back(i);
+ }
+
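+  // This is the L0 path: files may overlap each other. When expand_range is
+  // set, each newly found overlapping file can widen [user_begin, user_end],
+  // so the remaining files are re-scanned until a full pass finds no new
+  // overlapping file.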
+ while (!index.empty()) {
+ bool found_overlapping_file = false;
+ auto iter = index.begin();
+ while (iter != index.end()) {
+ FdWithKeyRange* f = &(level_files_brief_[level].files[*iter]);
+ const Slice file_start = ExtractUserKey(f->smallest_key);
+ const Slice file_limit = ExtractUserKey(f->largest_key);
+ if (begin != nullptr &&
+ user_cmp->CompareWithoutTimestamp(file_limit, user_begin) < 0) {
+ // "f" is completely before specified range; skip it
+ iter++;
+ } else if (end != nullptr &&
+ user_cmp->CompareWithoutTimestamp(file_start, user_end) > 0) {
+ // "f" is completely after specified range; skip it
+ iter++;
+ } else {
+ // if overlap
+ inputs->emplace_back(files_[level][*iter]);
+ found_overlapping_file = true;
+ // record the first file index.
+ if (file_index && *file_index == -1) {
+ *file_index = static_cast<int>(*iter);
+ }
+        // This file overlaps; erase it so it is not checked again.
+ iter = index.erase(iter);
+ if (expand_range) {
+ if (begin != nullptr &&
+ user_cmp->CompareWithoutTimestamp(file_start, user_begin) < 0) {
+ user_begin = file_start;
+ }
+ if (end != nullptr &&
+ user_cmp->CompareWithoutTimestamp(file_limit, user_end) > 0) {
+ user_end = file_limit;
+ }
+ }
+ }
+ }
+    // If none of the remaining files overlap, stop.
+ if (!found_overlapping_file) {
+ break;
+ }
+ }
+}
+
+// Store in "*inputs" files in "level" that within range [begin,end]
+// Guarantee a "clean cut" boundary between the files in inputs
+// and the surrounding files and the maxinum number of files.
+// This will ensure that no parts of a key are lost during compaction.
+// If hint_index is specified, then it points to a file in the range.
+// The file_index returns a pointer to any file in an overlapping range.
+void VersionStorageInfo::GetCleanInputsWithinInterval(
+ int level, const InternalKey* begin, const InternalKey* end,
+ std::vector<FileMetaData*>* inputs, int hint_index, int* file_index) const {
+ inputs->clear();
+ if (file_index) {
+ *file_index = -1;
+ }
+ if (level >= num_non_empty_levels_ || level == 0 ||
+ level_files_brief_[level].num_files == 0) {
+ // this level is empty, no inputs within range
+ // also don't support clean input interval within L0
+ return;
+ }
+
+ GetOverlappingInputsRangeBinarySearch(level, begin, end, inputs, hint_index,
+ file_index, true /* within_interval */);
+}
+
+// Store in "*inputs" all files in "level" that overlap [begin,end]
+// Employ binary search to find at least one file that overlaps the
+// specified range. From that file, iterate backwards and
+// forwards to find all overlapping files.
+// if within_range is set, then only store the maximum clean inputs
+// within range [begin, end]. "clean" means there is a boundary
+// between the files in "*inputs" and the surrounding files
+void VersionStorageInfo::GetOverlappingInputsRangeBinarySearch(
+ int level, const InternalKey* begin, const InternalKey* end,
+ std::vector<FileMetaData*>* inputs, int hint_index, int* file_index,
+ bool within_interval, InternalKey** next_smallest) const {
+ assert(level > 0);
+
+ auto user_cmp = user_comparator_;
+ const FdWithKeyRange* files = level_files_brief_[level].files;
+ const int num_files = static_cast<int>(level_files_brief_[level].num_files);
+
+  // Use binary search to find the lower and upper bound indices.
+ int start_index = 0;
+ int end_index = num_files;
+
+ if (begin != nullptr) {
+    // When within_interval is true, compare against the file's smallest key
+    // so that files merely straddling `begin` are excluded from the lower
+    // bound.
+ auto cmp = [&user_cmp, &within_interval](const FdWithKeyRange& f,
+ const InternalKey* k) {
+ auto& file_key = within_interval ? f.file_metadata->smallest
+ : f.file_metadata->largest;
+ return sstableKeyCompare(user_cmp, file_key, *k) < 0;
+ };
+
+ start_index = static_cast<int>(
+ std::lower_bound(files,
+ files + (hint_index == -1 ? num_files : hint_index),
+ begin, cmp) -
+ files);
+
+ if (start_index > 0 && within_interval) {
+ bool is_overlapping = true;
+ while (is_overlapping && start_index < num_files) {
+ auto& pre_limit = files[start_index - 1].file_metadata->largest;
+ auto& cur_start = files[start_index].file_metadata->smallest;
+ is_overlapping = sstableKeyCompare(user_cmp, pre_limit, cur_start) == 0;
+ start_index += is_overlapping;
+ }
+ }
+ }
+
+ if (end != nullptr) {
+    // When within_interval is true, compare against the file's largest key so
+    // that files extending past `end` are excluded from the upper bound.
+ auto cmp = [&user_cmp, &within_interval](const InternalKey* k,
+ const FdWithKeyRange& f) {
+ auto& file_key = within_interval ? f.file_metadata->largest
+ : f.file_metadata->smallest;
+ return sstableKeyCompare(user_cmp, *k, file_key) < 0;
+ };
+
+ end_index = static_cast<int>(
+ std::upper_bound(files + start_index, files + num_files, end, cmp) -
+ files);
+
+ if (end_index < num_files && within_interval) {
+ bool is_overlapping = true;
+ while (is_overlapping && end_index > start_index) {
+ auto& next_start = files[end_index].file_metadata->smallest;
+ auto& cur_limit = files[end_index - 1].file_metadata->largest;
+ is_overlapping =
+ sstableKeyCompare(user_cmp, cur_limit, next_start) == 0;
+ end_index -= is_overlapping;
+ }
+ }
+ }
+
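+  // With within_interval set, the adjustments above shrink [start_index,
+  // end_index) so that the boundary files do not share a user key with their
+  // excluded neighbours (sstableKeyCompare == 0), i.e. the selection is a
+  // "clean cut" that never splits a user key across the input set boundary.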
+ assert(start_index <= end_index);
+
+ // If there were no overlapping files, return immediately.
+ if (start_index == end_index) {
+ if (next_smallest) {
+ *next_smallest = nullptr;
+ }
+ return;
+ }
+
+ assert(start_index < end_index);
+
+ // returns the index where an overlap is found
+ if (file_index) {
+ *file_index = start_index;
+ }
+
+ // insert overlapping files into vector
+ for (int i = start_index; i < end_index; i++) {
+ inputs->push_back(files_[level][i]);
+ }
+
+ if (next_smallest != nullptr) {
+ // Provide the next key outside the range covered by inputs
+ if (end_index < static_cast<int>(files_[level].size())) {
+ **next_smallest = files_[level][end_index]->smallest;
+ } else {
+ *next_smallest = nullptr;
+ }
+ }
+}
+
+uint64_t VersionStorageInfo::NumLevelBytes(int level) const {
+ assert(level >= 0);
+ assert(level < num_levels());
+ return TotalFileSize(files_[level]);
+}
+
+const char* VersionStorageInfo::LevelSummary(
+ LevelSummaryStorage* scratch) const {
+ int len = 0;
+ if (compaction_style_ == kCompactionStyleLevel && num_levels() > 1) {
+ assert(base_level_ < static_cast<int>(level_max_bytes_.size()));
+ if (level_multiplier_ != 0.0) {
+ len = snprintf(
+ scratch->buffer, sizeof(scratch->buffer),
+ "base level %d level multiplier %.2f max bytes base %" PRIu64 " ",
+ base_level_, level_multiplier_, level_max_bytes_[base_level_]);
+ }
+ }
+ len +=
+ snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len, "files[");
+ for (int i = 0; i < num_levels(); i++) {
+ int sz = sizeof(scratch->buffer) - len;
+ int ret = snprintf(scratch->buffer + len, sz, "%d ", int(files_[i].size()));
+ if (ret < 0 || ret >= sz) break;
+ len += ret;
+ }
+ if (len > 0) {
+ // overwrite the last space
+ --len;
+ }
+ len += snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len,
+ "] max score %.2f", compaction_score_[0]);
+
+ if (!files_marked_for_compaction_.empty()) {
+ snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len,
+ " (%" ROCKSDB_PRIszt " files need compaction)",
+ files_marked_for_compaction_.size());
+ }
+
+ return scratch->buffer;
+}
+
+const char* VersionStorageInfo::LevelFileSummary(FileSummaryStorage* scratch,
+ int level) const {
+ int len = snprintf(scratch->buffer, sizeof(scratch->buffer), "files_size[");
+ for (const auto& f : files_[level]) {
+ int sz = sizeof(scratch->buffer) - len;
+ char sztxt[16];
+ AppendHumanBytes(f->fd.GetFileSize(), sztxt, sizeof(sztxt));
+ int ret = snprintf(scratch->buffer + len, sz,
+ "#%" PRIu64 "(seq=%" PRIu64 ",sz=%s,%d) ",
+ f->fd.GetNumber(), f->fd.smallest_seqno, sztxt,
+ static_cast<int>(f->being_compacted));
+ if (ret < 0 || ret >= sz) break;
+ len += ret;
+ }
+ // overwrite the last space (only if files_[level].size() is non-zero)
+ if (files_[level].size() && len > 0) {
+ --len;
+ }
+ snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len, "]");
+ return scratch->buffer;
+}
+
+uint64_t VersionStorageInfo::MaxNextLevelOverlappingBytes() {
+ uint64_t result = 0;
+ std::vector<FileMetaData*> overlaps;
+ for (int level = 1; level < num_levels() - 1; level++) {
+ for (const auto& f : files_[level]) {
+ GetOverlappingInputs(level + 1, &f->smallest, &f->largest, &overlaps);
+ const uint64_t sum = TotalFileSize(overlaps);
+ if (sum > result) {
+ result = sum;
+ }
+ }
+ }
+ return result;
+}
+
+uint64_t VersionStorageInfo::MaxBytesForLevel(int level) const {
+ // Note: the result for level zero is not really used since we set
+ // the level-0 compaction threshold based on number of files.
+ assert(level >= 0);
+ assert(level < static_cast<int>(level_max_bytes_.size()));
+ return level_max_bytes_[level];
+}
+
+void VersionStorageInfo::CalculateBaseBytes(const ImmutableOptions& ioptions,
+ const MutableCFOptions& options) {
+ // Special logic to set number of sorted runs.
+ // It is to match the previous behavior when all files are in L0.
+ int num_l0_count = static_cast<int>(files_[0].size());
+ if (compaction_style_ == kCompactionStyleUniversal) {
+ // For universal compaction, we use level0 score to indicate
+ // compaction score for the whole DB. Adding other levels as if
+ // they are L0 files.
+ for (int i = 1; i < num_levels(); i++) {
+ if (!files_[i].empty()) {
+ num_l0_count++;
+ }
+ }
+ }
+ set_l0_delay_trigger_count(num_l0_count);
+
+ level_max_bytes_.resize(ioptions.num_levels);
+ if (!ioptions.level_compaction_dynamic_level_bytes) {
+ base_level_ = (ioptions.compaction_style == kCompactionStyleLevel) ? 1 : -1;
+
+ // Calculate for static bytes base case
+ for (int i = 0; i < ioptions.num_levels; ++i) {
+ if (i == 0 && ioptions.compaction_style == kCompactionStyleUniversal) {
+ level_max_bytes_[i] = options.max_bytes_for_level_base;
+ } else if (i > 1) {
+ level_max_bytes_[i] = MultiplyCheckOverflow(
+ MultiplyCheckOverflow(level_max_bytes_[i - 1],
+ options.max_bytes_for_level_multiplier),
+ options.MaxBytesMultiplerAdditional(i - 1));
+ } else {
+ level_max_bytes_[i] = options.max_bytes_for_level_base;
+ }
+ }
+ } else {
+ uint64_t max_level_size = 0;
+
+ int first_non_empty_level = -1;
+    // Find the size of the non-L0 level with the most data.
+    // Cannot use the size of the last level because it can be empty or
+    // smaller than previous levels after compaction.
+ for (int i = 1; i < num_levels_; i++) {
+ uint64_t total_size = 0;
+ for (const auto& f : files_[i]) {
+ total_size += f->fd.GetFileSize();
+ }
+ if (total_size > 0 && first_non_empty_level == -1) {
+ first_non_empty_level = i;
+ }
+ if (total_size > max_level_size) {
+ max_level_size = total_size;
+ }
+ }
+
+ // Prefill every level's max bytes to disallow compaction from there.
+ for (int i = 0; i < num_levels_; i++) {
+ level_max_bytes_[i] = std::numeric_limits<uint64_t>::max();
+ }
+
+ if (max_level_size == 0) {
+ // No data for L1 and up. L0 compacts to last level directly.
+ // No compaction from L1+ needs to be scheduled.
+ base_level_ = num_levels_ - 1;
+ } else {
+ uint64_t base_bytes_max = options.max_bytes_for_level_base;
+ uint64_t base_bytes_min = static_cast<uint64_t>(
+ base_bytes_max / options.max_bytes_for_level_multiplier);
+
+      // Check whether we can make the last level's target size max_level_size
+ uint64_t cur_level_size = max_level_size;
+ for (int i = num_levels_ - 2; i >= first_non_empty_level; i--) {
+ // Round up after dividing
+ cur_level_size = static_cast<uint64_t>(
+ cur_level_size / options.max_bytes_for_level_multiplier);
+ }
+
+ // Calculate base level and its size.
+ uint64_t base_level_size;
+ if (cur_level_size <= base_bytes_min) {
+        // Case 1. If we make the target size of the last level
+        // max_level_size, the target size of the first non-empty level would
+        // be smaller than base_bytes_min. We set it to base_bytes_min + 1
+        // instead.
+ base_level_size = base_bytes_min + 1U;
+ base_level_ = first_non_empty_level;
+ ROCKS_LOG_INFO(ioptions.logger,
+ "More existing levels in DB than needed. "
+ "max_bytes_for_level_multiplier may not be guaranteed.");
+ } else {
+ // Find base level (where L0 data is compacted to).
+ base_level_ = first_non_empty_level;
+ while (base_level_ > 1 && cur_level_size > base_bytes_max) {
+ --base_level_;
+ cur_level_size = static_cast<uint64_t>(
+ cur_level_size / options.max_bytes_for_level_multiplier);
+ }
+ if (cur_level_size > base_bytes_max) {
+ // Even L1 will be too large
+ assert(base_level_ == 1);
+ base_level_size = base_bytes_max;
+ } else {
+ base_level_size = cur_level_size;
+ }
+ }
+
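+      // Worked example: with num_levels = 7, a multiplier of 10,
+      // max_bytes_for_level_base = 256MB and only L6 populated at 256GB,
+      // cur_level_size stays 256GB and the loop above walks base_level_ down
+      // from 6, dividing by 10 each step (25.6GB, 2.56GB, 256MB), stopping at
+      // base_level_ = 3; the loop below then assigns targets of L3 = 256MB,
+      // L4 = 2.56GB, L5 = 25.6GB, L6 = 256GB.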
+ level_multiplier_ = options.max_bytes_for_level_multiplier;
+ assert(base_level_size > 0);
+
+ uint64_t level_size = base_level_size;
+ for (int i = base_level_; i < num_levels_; i++) {
+ if (i > base_level_) {
+ level_size = MultiplyCheckOverflow(level_size, level_multiplier_);
+ }
+ // Don't set any level below base_bytes_max. Otherwise, the LSM can
+ // assume an hourglass shape where L1+ sizes are smaller than L0. This
+ // causes compaction scoring, which depends on level sizes, to favor L1+
+ // at the expense of L0, which may fill up and stall.
+ level_max_bytes_[i] = std::max(level_size, base_bytes_max);
+ }
+ }
+ }
+}
+
+uint64_t VersionStorageInfo::EstimateLiveDataSize() const {
+ // Estimate the live data size by adding up the size of a maximal set of
+ // sst files with no range overlap in same or higher level. The less
+ // compacted, the more optimistic (smaller) this estimate is. Also,
+ // for multiple sorted runs within a level, file order will matter.
+ uint64_t size = 0;
+
+ auto ikey_lt = [this](InternalKey* x, InternalKey* y) {
+ return internal_comparator_->Compare(*x, *y) < 0;
+ };
+ // (Ordered) map of largest keys in files being included in size estimate
+ std::map<InternalKey*, FileMetaData*, decltype(ikey_lt)> ranges(ikey_lt);
+
+ for (int l = num_levels_ - 1; l >= 0; l--) {
+ bool found_end = false;
+ for (auto file : files_[l]) {
+      // Find the first file already included whose largest key is larger than
+ // the smallest key of `file`. If that file does not overlap with the
+ // current file, none of the files in the map does. If there is
+ // no potential overlap, we can safely insert the rest of this level
+ // (if the level is not 0) into the map without checking again because
+ // the elements in the level are sorted and non-overlapping.
+ auto lb = (found_end && l != 0) ? ranges.end()
+ : ranges.lower_bound(&file->smallest);
+ found_end = (lb == ranges.end());
+ if (found_end || internal_comparator_->Compare(
+ file->largest, (*lb).second->smallest) < 0) {
+ ranges.emplace_hint(lb, &file->largest, file);
+ size += file->fd.file_size;
+ }
+ }
+ }
+
+ // For BlobDB, the result also includes the exact value of live bytes in the
+ // blob files of the version.
+ for (const auto& meta : blob_files_) {
+ assert(meta);
+
+ size += meta->GetTotalBlobBytes();
+ size -= meta->GetGarbageBlobBytes();
+ }
+
+ return size;
+}
+
+bool VersionStorageInfo::RangeMightExistAfterSortedRun(
+ const Slice& smallest_user_key, const Slice& largest_user_key,
+ int last_level, int last_l0_idx) {
+ assert((last_l0_idx != -1) == (last_level == 0));
+ // TODO(ajkr): this preserves earlier behavior where we considered an L0 file
+ // bottommost only if it's the oldest L0 file and there are no files on older
+ // levels. It'd be better to consider it bottommost if there's no overlap in
+ // older levels/files.
+ if (last_level == 0 &&
+ last_l0_idx != static_cast<int>(LevelFiles(0).size() - 1)) {
+ return true;
+ }
+
+ // Checks whether there are files living beyond the `last_level`. If lower
+ // levels have files, it checks for overlap between [`smallest_key`,
+  // `largest_key`] and those files. Bottommost-level optimizations can be
+  // made if
+ // there are no files in lower levels or if there is no overlap with the files
+ // in the lower levels.
+ for (int level = last_level + 1; level < num_levels(); level++) {
+ // The range is not in the bottommost level if there are files in lower
+ // levels when the `last_level` is 0 or if there are files in lower levels
+ // which overlap with [`smallest_key`, `largest_key`].
+ if (files_[level].size() > 0 &&
+ (last_level == 0 ||
+ OverlapInLevel(level, &smallest_user_key, &largest_user_key))) {
+ return true;
+ }
+ }
+ return false;
+}
+
+void Version::AddLiveFiles(std::vector<uint64_t>* live_table_files,
+ std::vector<uint64_t>* live_blob_files) const {
+ assert(live_table_files);
+ assert(live_blob_files);
+
+ for (int level = 0; level < storage_info_.num_levels(); ++level) {
+ const auto& level_files = storage_info_.LevelFiles(level);
+ for (const auto& meta : level_files) {
+ assert(meta);
+
+ live_table_files->emplace_back(meta->fd.GetNumber());
+ }
+ }
+
+ const auto& blob_files = storage_info_.GetBlobFiles();
+ for (const auto& meta : blob_files) {
+ assert(meta);
+
+ live_blob_files->emplace_back(meta->GetBlobFileNumber());
+ }
+}
+
+void Version::RemoveLiveFiles(
+ std::vector<ObsoleteFileInfo>& sst_delete_candidates,
+ std::vector<ObsoleteBlobFileInfo>& blob_delete_candidates) const {
+ for (ObsoleteFileInfo& fi : sst_delete_candidates) {
+ if (!fi.only_delete_metadata &&
+ storage_info()->GetFileLocation(fi.metadata->fd.GetNumber()) !=
+ VersionStorageInfo::FileLocation::Invalid()) {
+ fi.only_delete_metadata = true;
+ }
+ }
+
+ blob_delete_candidates.erase(
+ std::remove_if(
+ blob_delete_candidates.begin(), blob_delete_candidates.end(),
+ [this](ObsoleteBlobFileInfo& x) {
+ return storage_info()->GetBlobFileMetaData(x.GetBlobFileNumber());
+ }),
+ blob_delete_candidates.end());
+}
+
+std::string Version::DebugString(bool hex, bool print_stats) const {
+ std::string r;
+ for (int level = 0; level < storage_info_.num_levels_; level++) {
+ // E.g.,
+ // --- level 1 ---
+ // 17:123[1 .. 124]['a' .. 'd']
+ // 20:43[124 .. 128]['e' .. 'g']
+ //
+ // if print_stats=true:
+ // 17:123[1 .. 124]['a' .. 'd'](4096)
+ r.append("--- level ");
+ AppendNumberTo(&r, level);
+ r.append(" --- version# ");
+ AppendNumberTo(&r, version_number_);
+ if (storage_info_.compact_cursor_[level].Valid()) {
+ r.append(" --- compact_cursor: ");
+ r.append(storage_info_.compact_cursor_[level].DebugString(hex));
+ }
+ r.append(" ---\n");
+ const std::vector<FileMetaData*>& files = storage_info_.files_[level];
+ for (size_t i = 0; i < files.size(); i++) {
+ r.push_back(' ');
+ AppendNumberTo(&r, files[i]->fd.GetNumber());
+ r.push_back(':');
+ AppendNumberTo(&r, files[i]->fd.GetFileSize());
+ r.append("[");
+ AppendNumberTo(&r, files[i]->fd.smallest_seqno);
+ r.append(" .. ");
+ AppendNumberTo(&r, files[i]->fd.largest_seqno);
+ r.append("]");
+ r.append("[");
+ r.append(files[i]->smallest.DebugString(hex));
+ r.append(" .. ");
+ r.append(files[i]->largest.DebugString(hex));
+ r.append("]");
+ if (files[i]->oldest_blob_file_number != kInvalidBlobFileNumber) {
+ r.append(" blob_file:");
+ AppendNumberTo(&r, files[i]->oldest_blob_file_number);
+ }
+ if (print_stats) {
+ r.append("(");
+ r.append(std::to_string(
+ files[i]->stats.num_reads_sampled.load(std::memory_order_relaxed)));
+ r.append(")");
+ }
+ r.append("\n");
+ }
+ }
+
+ const auto& blob_files = storage_info_.GetBlobFiles();
+ if (!blob_files.empty()) {
+ r.append("--- blob files --- version# ");
+ AppendNumberTo(&r, version_number_);
+ r.append(" ---\n");
+ for (const auto& blob_file_meta : blob_files) {
+ assert(blob_file_meta);
+
+ r.append(blob_file_meta->DebugString());
+ r.push_back('\n');
+ }
+ }
+
+ return r;
+}
+
+// this is used to batch writes to the manifest file
+struct VersionSet::ManifestWriter {
+ Status status;
+ bool done;
+ InstrumentedCondVar cv;
+ ColumnFamilyData* cfd;
+ const MutableCFOptions mutable_cf_options;
+ const autovector<VersionEdit*>& edit_list;
+ const std::function<void(const Status&)> manifest_write_callback;
+
+ explicit ManifestWriter(
+ InstrumentedMutex* mu, ColumnFamilyData* _cfd,
+ const MutableCFOptions& cf_options, const autovector<VersionEdit*>& e,
+ const std::function<void(const Status&)>& manifest_wcb)
+ : done(false),
+ cv(mu),
+ cfd(_cfd),
+ mutable_cf_options(cf_options),
+ edit_list(e),
+ manifest_write_callback(manifest_wcb) {}
+ ~ManifestWriter() { status.PermitUncheckedError(); }
+
+ bool IsAllWalEdits() const {
+ bool all_wal_edits = true;
+ for (const auto& e : edit_list) {
+ if (!e->IsWalManipulation()) {
+ all_wal_edits = false;
+ break;
+ }
+ }
+ return all_wal_edits;
+ }
+};
+
+Status AtomicGroupReadBuffer::AddEdit(VersionEdit* edit) {
+ assert(edit);
+ if (edit->is_in_atomic_group_) {
+ TEST_SYNC_POINT("AtomicGroupReadBuffer::AddEdit:AtomicGroup");
+ if (replay_buffer_.empty()) {
+ replay_buffer_.resize(edit->remaining_entries_ + 1);
+ TEST_SYNC_POINT_CALLBACK(
+ "AtomicGroupReadBuffer::AddEdit:FirstInAtomicGroup", edit);
+ }
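+    // The first edit of an atomic group carries remaining_entries_ == N - 1,
+    // so the buffer is sized to hold all N edits; every subsequent edit must
+    // keep read_edits_in_atomic_group_ + remaining_entries_ equal to that
+    // size, otherwise the group is reported as corrupted below.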
+ read_edits_in_atomic_group_++;
+ if (read_edits_in_atomic_group_ + edit->remaining_entries_ !=
+ static_cast<uint32_t>(replay_buffer_.size())) {
+ TEST_SYNC_POINT_CALLBACK(
+ "AtomicGroupReadBuffer::AddEdit:IncorrectAtomicGroupSize", edit);
+ return Status::Corruption("corrupted atomic group");
+ }
+ replay_buffer_[read_edits_in_atomic_group_ - 1] = *edit;
+ if (read_edits_in_atomic_group_ == replay_buffer_.size()) {
+ TEST_SYNC_POINT_CALLBACK(
+ "AtomicGroupReadBuffer::AddEdit:LastInAtomicGroup", edit);
+ return Status::OK();
+ }
+ return Status::OK();
+ }
+
+ // A normal edit.
+ if (!replay_buffer().empty()) {
+ TEST_SYNC_POINT_CALLBACK(
+ "AtomicGroupReadBuffer::AddEdit:AtomicGroupMixedWithNormalEdits", edit);
+ return Status::Corruption("corrupted atomic group");
+ }
+ return Status::OK();
+}
+
+bool AtomicGroupReadBuffer::IsFull() const {
+ return read_edits_in_atomic_group_ == replay_buffer_.size();
+}
+
+bool AtomicGroupReadBuffer::IsEmpty() const { return replay_buffer_.empty(); }
+
+void AtomicGroupReadBuffer::Clear() {
+ read_edits_in_atomic_group_ = 0;
+ replay_buffer_.clear();
+}
+
+VersionSet::VersionSet(const std::string& dbname,
+ const ImmutableDBOptions* _db_options,
+ const FileOptions& storage_options, Cache* table_cache,
+ WriteBufferManager* write_buffer_manager,
+ WriteController* write_controller,
+ BlockCacheTracer* const block_cache_tracer,
+ const std::shared_ptr<IOTracer>& io_tracer,
+ const std::string& db_id,
+ const std::string& db_session_id)
+ : column_family_set_(new ColumnFamilySet(
+ dbname, _db_options, storage_options, table_cache,
+ write_buffer_manager, write_controller, block_cache_tracer, io_tracer,
+ db_id, db_session_id)),
+ table_cache_(table_cache),
+ env_(_db_options->env),
+ fs_(_db_options->fs, io_tracer),
+ clock_(_db_options->clock),
+ dbname_(dbname),
+ db_options_(_db_options),
+ next_file_number_(2),
+ manifest_file_number_(0), // Filled by Recover()
+ options_file_number_(0),
+ options_file_size_(0),
+ pending_manifest_file_number_(0),
+ last_sequence_(0),
+ last_allocated_sequence_(0),
+ last_published_sequence_(0),
+ prev_log_number_(0),
+ current_version_number_(0),
+ manifest_file_size_(0),
+ file_options_(storage_options),
+ block_cache_tracer_(block_cache_tracer),
+ io_tracer_(io_tracer),
+ db_session_id_(db_session_id) {}
+
+VersionSet::~VersionSet() {
+ // we need to delete column_family_set_ because its destructor depends on
+ // VersionSet
+ column_family_set_.reset();
+ for (auto& file : obsolete_files_) {
+ if (file.metadata->table_reader_handle) {
+ table_cache_->Release(file.metadata->table_reader_handle);
+ TableCache::Evict(table_cache_, file.metadata->fd.GetNumber());
+ }
+ file.DeleteMetadata();
+ }
+ obsolete_files_.clear();
+ io_status_.PermitUncheckedError();
+}
+
+void VersionSet::Reset() {
+ if (column_family_set_) {
+ WriteBufferManager* wbm = column_family_set_->write_buffer_manager();
+ WriteController* wc = column_family_set_->write_controller();
+ // db_id becomes the source of truth after DBImpl::Recover():
+ // https://github.com/facebook/rocksdb/blob/v7.3.1/db/db_impl/db_impl_open.cc#L527
+ // Note: we may not be able to recover db_id from MANIFEST if
+ // options.write_dbid_to_manifest is false (default).
+ column_family_set_.reset(new ColumnFamilySet(
+ dbname_, db_options_, file_options_, table_cache_, wbm, wc,
+ block_cache_tracer_, io_tracer_, db_id_, db_session_id_));
+ }
+ db_id_.clear();
+ next_file_number_.store(2);
+ min_log_number_to_keep_.store(0);
+ manifest_file_number_ = 0;
+ options_file_number_ = 0;
+ pending_manifest_file_number_ = 0;
+ last_sequence_.store(0);
+ last_allocated_sequence_.store(0);
+ last_published_sequence_.store(0);
+ prev_log_number_ = 0;
+ descriptor_log_.reset();
+ current_version_number_ = 0;
+ manifest_writers_.clear();
+ manifest_file_size_ = 0;
+ obsolete_files_.clear();
+ obsolete_manifests_.clear();
+ wals_.Reset();
+}
+
+void VersionSet::AppendVersion(ColumnFamilyData* column_family_data,
+ Version* v) {
+ // compute new compaction score
+ v->storage_info()->ComputeCompactionScore(
+ *column_family_data->ioptions(),
+ *column_family_data->GetLatestMutableCFOptions());
+
+ // Mark v finalized
+ v->storage_info_.SetFinalized();
+
+ // Make "v" current
+ assert(v->refs_ == 0);
+ Version* current = column_family_data->current();
+ assert(v != current);
+ if (current != nullptr) {
+ assert(current->refs_ > 0);
+ current->Unref();
+ }
+ column_family_data->SetCurrent(v);
+ v->Ref();
+
+ // Append to linked list
+ v->prev_ = column_family_data->dummy_versions()->prev_;
+ v->next_ = column_family_data->dummy_versions();
+ v->prev_->next_ = v;
+ v->next_->prev_ = v;
+}
+
+Status VersionSet::ProcessManifestWrites(
+ std::deque<ManifestWriter>& writers, InstrumentedMutex* mu,
+ FSDirectory* dir_contains_current_file, bool new_descriptor_log,
+ const ColumnFamilyOptions* new_cf_options) {
+ mu->AssertHeld();
+ assert(!writers.empty());
+ ManifestWriter& first_writer = writers.front();
+ ManifestWriter* last_writer = &first_writer;
+
+ assert(!manifest_writers_.empty());
+ assert(manifest_writers_.front() == &first_writer);
+
+ autovector<VersionEdit*> batch_edits;
+ autovector<Version*> versions;
+ autovector<const MutableCFOptions*> mutable_cf_options_ptrs;
+ std::vector<std::unique_ptr<BaseReferencedVersionBuilder>> builder_guards;
+
+ // Tracking `max_last_sequence` is needed to ensure we write
+ // `VersionEdit::last_sequence_`s in non-decreasing order according to the
+ // recovery code's requirement. It also allows us to defer updating
+ // `descriptor_last_sequence_` until the apply phase, after the log phase
+ // succeeds.
+ SequenceNumber max_last_sequence = descriptor_last_sequence_;
+
+ if (first_writer.edit_list.front()->IsColumnFamilyManipulation()) {
+ // No group commits for column family add or drop
+ LogAndApplyCFHelper(first_writer.edit_list.front(), &max_last_sequence);
+ batch_edits.push_back(first_writer.edit_list.front());
+ } else {
+ auto it = manifest_writers_.cbegin();
+ size_t group_start = std::numeric_limits<size_t>::max();
+ while (it != manifest_writers_.cend()) {
+ if ((*it)->edit_list.front()->IsColumnFamilyManipulation()) {
+ // no group commits for column family add or drop
+ break;
+ }
+ last_writer = *(it++);
+ assert(last_writer != nullptr);
+ assert(last_writer->cfd != nullptr);
+ if (last_writer->cfd->IsDropped()) {
+ // If we detect a dropped CF at this point, and the corresponding
+ // version edits belong to an atomic group, then we need to find out
+ // the preceding version edits in the same atomic group, and update
+ // their `remaining_entries_` member variable because we are NOT going
+ // to write the version edits' of dropped CF to the MANIFEST. If we
+ // don't update, then Recover can report corrupted atomic group because
+ // the `remaining_entries_` do not match.
+ if (!batch_edits.empty()) {
+ if (batch_edits.back()->is_in_atomic_group_ &&
+ batch_edits.back()->remaining_entries_ > 0) {
+ assert(group_start < batch_edits.size());
+ const auto& edit_list = last_writer->edit_list;
+ size_t k = 0;
+ while (k < edit_list.size()) {
+ if (!edit_list[k]->is_in_atomic_group_) {
+ break;
+ } else if (edit_list[k]->remaining_entries_ == 0) {
+ ++k;
+ break;
+ }
+ ++k;
+ }
+ for (auto i = group_start; i < batch_edits.size(); ++i) {
+ assert(static_cast<uint32_t>(k) <=
+ batch_edits.back()->remaining_entries_);
+ batch_edits[i]->remaining_entries_ -= static_cast<uint32_t>(k);
+ }
+ }
+ }
+ continue;
+ }
+ // We do a linear search on versions because versions is small.
+ // TODO(yanqin) maybe consider unordered_map
+ Version* version = nullptr;
+ VersionBuilder* builder = nullptr;
+ for (int i = 0; i != static_cast<int>(versions.size()); ++i) {
+ uint32_t cf_id = last_writer->cfd->GetID();
+ if (versions[i]->cfd()->GetID() == cf_id) {
+ version = versions[i];
+ assert(!builder_guards.empty() &&
+ builder_guards.size() == versions.size());
+ builder = builder_guards[i]->version_builder();
+ TEST_SYNC_POINT_CALLBACK(
+ "VersionSet::ProcessManifestWrites:SameColumnFamily", &cf_id);
+ break;
+ }
+ }
+ if (version == nullptr) {
+ // WAL manipulations do not need to be applied to versions.
+ if (!last_writer->IsAllWalEdits()) {
+ version = new Version(last_writer->cfd, this, file_options_,
+ last_writer->mutable_cf_options, io_tracer_,
+ current_version_number_++);
+ versions.push_back(version);
+ mutable_cf_options_ptrs.push_back(&last_writer->mutable_cf_options);
+ builder_guards.emplace_back(
+ new BaseReferencedVersionBuilder(last_writer->cfd));
+ builder = builder_guards.back()->version_builder();
+ }
+ assert(last_writer->IsAllWalEdits() || builder);
+ assert(last_writer->IsAllWalEdits() || version);
+ TEST_SYNC_POINT_CALLBACK("VersionSet::ProcessManifestWrites:NewVersion",
+ version);
+ }
+ for (const auto& e : last_writer->edit_list) {
+ if (e->is_in_atomic_group_) {
+ if (batch_edits.empty() || !batch_edits.back()->is_in_atomic_group_ ||
+ (batch_edits.back()->is_in_atomic_group_ &&
+ batch_edits.back()->remaining_entries_ == 0)) {
+ group_start = batch_edits.size();
+ }
+ } else if (group_start != std::numeric_limits<size_t>::max()) {
+ group_start = std::numeric_limits<size_t>::max();
+ }
+ Status s = LogAndApplyHelper(last_writer->cfd, builder, e,
+ &max_last_sequence, mu);
+ if (!s.ok()) {
+ // free up the allocated memory
+ for (auto v : versions) {
+ delete v;
+ }
+ return s;
+ }
+ batch_edits.push_back(e);
+ }
+ }
+ for (int i = 0; i < static_cast<int>(versions.size()); ++i) {
+ assert(!builder_guards.empty() &&
+ builder_guards.size() == versions.size());
+ auto* builder = builder_guards[i]->version_builder();
+ Status s = builder->SaveTo(versions[i]->storage_info());
+ if (!s.ok()) {
+ // free up the allocated memory
+ for (auto v : versions) {
+ delete v;
+ }
+ return s;
+ }
+ }
+ }
+
+#ifndef NDEBUG
+ // Verify that version edits of atomic groups have correct
+ // remaining_entries_.
+ size_t k = 0;
+ while (k < batch_edits.size()) {
+ while (k < batch_edits.size() && !batch_edits[k]->is_in_atomic_group_) {
+ ++k;
+ }
+ if (k == batch_edits.size()) {
+ break;
+ }
+ size_t i = k;
+ while (i < batch_edits.size()) {
+ if (!batch_edits[i]->is_in_atomic_group_) {
+ break;
+ }
+ assert(i - k + batch_edits[i]->remaining_entries_ ==
+ batch_edits[k]->remaining_entries_);
+ if (batch_edits[i]->remaining_entries_ == 0) {
+ ++i;
+ break;
+ }
+ ++i;
+ }
+ assert(batch_edits[i - 1]->is_in_atomic_group_);
+ assert(0 == batch_edits[i - 1]->remaining_entries_);
+ std::vector<VersionEdit*> tmp;
+ for (size_t j = k; j != i; ++j) {
+ tmp.emplace_back(batch_edits[j]);
+ }
+ TEST_SYNC_POINT_CALLBACK(
+ "VersionSet::ProcessManifestWrites:CheckOneAtomicGroup", &tmp);
+ k = i;
+ }
+#endif // NDEBUG
+
+ assert(pending_manifest_file_number_ == 0);
+ if (!descriptor_log_ ||
+ manifest_file_size_ > db_options_->max_manifest_file_size) {
+ TEST_SYNC_POINT("VersionSet::ProcessManifestWrites:BeforeNewManifest");
+ new_descriptor_log = true;
+ } else {
+ pending_manifest_file_number_ = manifest_file_number_;
+ }
+
+ // Local cached copy of state variable(s). WriteCurrentStateToManifest()
+ // reads its content after releasing db mutex to avoid race with
+ // SwitchMemtable().
+ std::unordered_map<uint32_t, MutableCFState> curr_state;
+ VersionEdit wal_additions;
+ if (new_descriptor_log) {
+ pending_manifest_file_number_ = NewFileNumber();
+ batch_edits.back()->SetNextFile(next_file_number_.load());
+
+    // If we are writing out a new snapshot, make sure to persist the max
+    // column family.
+ if (column_family_set_->GetMaxColumnFamily() > 0) {
+ first_writer.edit_list.front()->SetMaxColumnFamily(
+ column_family_set_->GetMaxColumnFamily());
+ }
+ for (const auto* cfd : *column_family_set_) {
+ assert(curr_state.find(cfd->GetID()) == curr_state.end());
+ curr_state.emplace(std::make_pair(
+ cfd->GetID(),
+ MutableCFState(cfd->GetLogNumber(), cfd->GetFullHistoryTsLow())));
+ }
+
+ for (const auto& wal : wals_.GetWals()) {
+ wal_additions.AddWal(wal.first, wal.second);
+ }
+ }
+
+ uint64_t new_manifest_file_size = 0;
+ Status s;
+ IOStatus io_s;
+ IOStatus manifest_io_status;
+ {
+ FileOptions opt_file_opts = fs_->OptimizeForManifestWrite(file_options_);
+ mu->Unlock();
+ TEST_SYNC_POINT("VersionSet::LogAndApply:WriteManifestStart");
+ TEST_SYNC_POINT_CALLBACK("VersionSet::LogAndApply:WriteManifest", nullptr);
+ if (!first_writer.edit_list.front()->IsColumnFamilyManipulation()) {
+ for (int i = 0; i < static_cast<int>(versions.size()); ++i) {
+ assert(!builder_guards.empty() &&
+ builder_guards.size() == versions.size());
+ assert(!mutable_cf_options_ptrs.empty() &&
+ builder_guards.size() == versions.size());
+ ColumnFamilyData* cfd = versions[i]->cfd_;
+ s = builder_guards[i]->version_builder()->LoadTableHandlers(
+ cfd->internal_stats(), 1 /* max_threads */,
+ true /* prefetch_index_and_filter_in_cache */,
+ false /* is_initial_load */,
+ mutable_cf_options_ptrs[i]->prefix_extractor,
+ MaxFileSizeForL0MetaPin(*mutable_cf_options_ptrs[i]));
+ if (!s.ok()) {
+ if (db_options_->paranoid_checks) {
+ break;
+ }
+ s = Status::OK();
+ }
+ }
+ }
+
+ if (s.ok() && new_descriptor_log) {
+ // This is fine because everything inside of this block is serialized --
+ // only one thread can be here at the same time
+ // create new manifest file
+ ROCKS_LOG_INFO(db_options_->info_log, "Creating manifest %" PRIu64 "\n",
+ pending_manifest_file_number_);
+ std::string descriptor_fname =
+ DescriptorFileName(dbname_, pending_manifest_file_number_);
+ std::unique_ptr<FSWritableFile> descriptor_file;
+ io_s = NewWritableFile(fs_.get(), descriptor_fname, &descriptor_file,
+ opt_file_opts);
+ if (io_s.ok()) {
+ descriptor_file->SetPreallocationBlockSize(
+ db_options_->manifest_preallocation_size);
+ FileTypeSet tmp_set = db_options_->checksum_handoff_file_types;
+ std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
+ std::move(descriptor_file), descriptor_fname, opt_file_opts, clock_,
+ io_tracer_, nullptr, db_options_->listeners, nullptr,
+ tmp_set.Contains(FileType::kDescriptorFile),
+ tmp_set.Contains(FileType::kDescriptorFile)));
+ descriptor_log_.reset(
+ new log::Writer(std::move(file_writer), 0, false));
+ s = WriteCurrentStateToManifest(curr_state, wal_additions,
+ descriptor_log_.get(), io_s);
+ } else {
+ manifest_io_status = io_s;
+ s = io_s;
+ }
+ }
+
+ if (s.ok()) {
+ if (!first_writer.edit_list.front()->IsColumnFamilyManipulation()) {
+ constexpr bool update_stats = true;
+
+ for (int i = 0; i < static_cast<int>(versions.size()); ++i) {
+ versions[i]->PrepareAppend(*mutable_cf_options_ptrs[i], update_stats);
+ }
+ }
+
+ // Write new records to MANIFEST log
+#ifndef NDEBUG
+ size_t idx = 0;
+#endif
+ for (auto& e : batch_edits) {
+ std::string record;
+ if (!e->EncodeTo(&record)) {
+ s = Status::Corruption("Unable to encode VersionEdit:" +
+ e->DebugString(true));
+ break;
+ }
+ TEST_KILL_RANDOM_WITH_WEIGHT("VersionSet::LogAndApply:BeforeAddRecord",
+ REDUCE_ODDS2);
+#ifndef NDEBUG
+ if (batch_edits.size() > 1 && batch_edits.size() - 1 == idx) {
+ TEST_SYNC_POINT_CALLBACK(
+ "VersionSet::ProcessManifestWrites:BeforeWriteLastVersionEdit:0",
+ nullptr);
+ TEST_SYNC_POINT(
+ "VersionSet::ProcessManifestWrites:BeforeWriteLastVersionEdit:1");
+ }
+ ++idx;
+#endif /* !NDEBUG */
+ io_s = descriptor_log_->AddRecord(record);
+ if (!io_s.ok()) {
+ s = io_s;
+ manifest_io_status = io_s;
+ break;
+ }
+ }
+ if (s.ok()) {
+ io_s = SyncManifest(db_options_, descriptor_log_->file());
+ manifest_io_status = io_s;
+ TEST_SYNC_POINT_CALLBACK(
+ "VersionSet::ProcessManifestWrites:AfterSyncManifest", &io_s);
+ }
+ if (!io_s.ok()) {
+ s = io_s;
+ ROCKS_LOG_ERROR(db_options_->info_log, "MANIFEST write %s\n",
+ s.ToString().c_str());
+ }
+ }
+
+ // If we just created a new descriptor file, install it by writing a
+ // new CURRENT file that points to it.
+ if (s.ok()) {
+ assert(manifest_io_status.ok());
+ }
+ if (s.ok() && new_descriptor_log) {
+ io_s = SetCurrentFile(fs_.get(), dbname_, pending_manifest_file_number_,
+ dir_contains_current_file);
+ if (!io_s.ok()) {
+ s = io_s;
+ }
+ }
+
+ if (s.ok()) {
+      // Record the manifest file size after these writes.
+ new_manifest_file_size = descriptor_log_->file()->GetFileSize();
+ }
+
+ if (first_writer.edit_list.front()->is_column_family_drop_) {
+ TEST_SYNC_POINT("VersionSet::LogAndApply::ColumnFamilyDrop:0");
+ TEST_SYNC_POINT("VersionSet::LogAndApply::ColumnFamilyDrop:1");
+ TEST_SYNC_POINT("VersionSet::LogAndApply::ColumnFamilyDrop:2");
+ }
+
+ LogFlush(db_options_->info_log);
+ TEST_SYNC_POINT("VersionSet::LogAndApply:WriteManifestDone");
+ mu->Lock();
+ }
+
+ if (s.ok()) {
+ // Apply WAL edits, DB mutex must be held.
+ for (auto& e : batch_edits) {
+ if (e->IsWalAddition()) {
+ s = wals_.AddWals(e->GetWalAdditions());
+ } else if (e->IsWalDeletion()) {
+ s = wals_.DeleteWalsBefore(e->GetWalDeletion().GetLogNumber());
+ }
+ if (!s.ok()) {
+ break;
+ }
+ }
+ }
+
+ if (!io_s.ok()) {
+ if (io_status_.ok()) {
+ io_status_ = io_s;
+ }
+ } else if (!io_status_.ok()) {
+ io_status_ = io_s;
+ }
+
+  // Append the old manifest file to the obsolete_manifests_ list so it can be
+  // deleted by PurgeObsoleteFiles later.
+ if (s.ok() && new_descriptor_log) {
+ obsolete_manifests_.emplace_back(
+ DescriptorFileName("", manifest_file_number_));
+ }
+
+ // Install the new versions
+ if (s.ok()) {
+ if (first_writer.edit_list.front()->is_column_family_add_) {
+ assert(batch_edits.size() == 1);
+ assert(new_cf_options != nullptr);
+ assert(max_last_sequence == descriptor_last_sequence_);
+ CreateColumnFamily(*new_cf_options, first_writer.edit_list.front());
+ } else if (first_writer.edit_list.front()->is_column_family_drop_) {
+ assert(batch_edits.size() == 1);
+ assert(max_last_sequence == descriptor_last_sequence_);
+ first_writer.cfd->SetDropped();
+ first_writer.cfd->UnrefAndTryDelete();
+ } else {
+ // Each version in versions corresponds to a column family.
+ // For each column family, update its log number indicating that logs
+ // with number smaller than this should be ignored.
+ uint64_t last_min_log_number_to_keep = 0;
+ for (const auto& e : batch_edits) {
+ ColumnFamilyData* cfd = nullptr;
+ if (!e->IsColumnFamilyManipulation()) {
+ cfd = column_family_set_->GetColumnFamily(e->column_family_);
+ // e would not have been added to batch_edits if its corresponding
+ // column family is dropped.
+ assert(cfd);
+ }
+ if (cfd) {
+ if (e->has_log_number_ && e->log_number_ > cfd->GetLogNumber()) {
+ cfd->SetLogNumber(e->log_number_);
+ }
+ if (e->HasFullHistoryTsLow()) {
+ cfd->SetFullHistoryTsLow(e->GetFullHistoryTsLow());
+ }
+ }
+ if (e->has_min_log_number_to_keep_) {
+ last_min_log_number_to_keep =
+ std::max(last_min_log_number_to_keep, e->min_log_number_to_keep_);
+ }
+ }
+
+ if (last_min_log_number_to_keep != 0) {
+ MarkMinLogNumberToKeep(last_min_log_number_to_keep);
+ }
+
+ for (int i = 0; i < static_cast<int>(versions.size()); ++i) {
+ ColumnFamilyData* cfd = versions[i]->cfd_;
+ AppendVersion(cfd, versions[i]);
+ }
+ }
+ assert(max_last_sequence >= descriptor_last_sequence_);
+ descriptor_last_sequence_ = max_last_sequence;
+ manifest_file_number_ = pending_manifest_file_number_;
+ manifest_file_size_ = new_manifest_file_size;
+ prev_log_number_ = first_writer.edit_list.front()->prev_log_number_;
+ } else {
+ std::string version_edits;
+ for (auto& e : batch_edits) {
+ version_edits += ("\n" + e->DebugString(true));
+ }
+ ROCKS_LOG_ERROR(db_options_->info_log,
+ "Error in committing version edit to MANIFEST: %s",
+ version_edits.c_str());
+ for (auto v : versions) {
+ delete v;
+ }
+ if (manifest_io_status.ok()) {
+ manifest_file_number_ = pending_manifest_file_number_;
+ manifest_file_size_ = new_manifest_file_size;
+ }
+ // If manifest append failed for whatever reason, the file could be
+ // corrupted. So we need to force the next version update to start a
+ // new manifest file.
+ descriptor_log_.reset();
+ // If manifest operations failed, then we know the CURRENT file still
+ // points to the original MANIFEST. Therefore, we can safely delete the
+ // new MANIFEST.
+ // If manifest operations succeeded, and we are here, then it is possible
+ // that renaming tmp file to CURRENT failed.
+ //
+ // On local POSIX-compliant FS, the CURRENT must point to the original
+ // MANIFEST. We can delete the new MANIFEST for simplicity, but we can also
+ // keep it. Future recovery will ignore this MANIFEST. It's also ok for the
+ // process not to crash and continue using the db. Any future LogAndApply()
+ // call will switch to a new MANIFEST and update CURRENT, still ignoring
+ // this one.
+ //
+ // On non-local FS, it is
+ // possible that the rename operation succeeded on the server (remote)
+ // side, but the client somehow returns a non-ok status to RocksDB. Note
+ // that this does not violate atomicity. Should we delete the new MANIFEST
+ // successfully, a subsequent recovery attempt will likely see the CURRENT
+ // pointing to the new MANIFEST, thus fail. We will not be able to open the
+ // DB again. Therefore, if manifest operations succeed, we should keep the
+    // new MANIFEST. If the process proceeds, any future LogAndApply() call
+ // will switch to a new MANIFEST and update CURRENT. If user tries to
+ // re-open the DB,
+ // a) CURRENT points to the new MANIFEST, and the new MANIFEST is present.
+ // b) CURRENT points to the original MANIFEST, and the original MANIFEST
+ // also exists.
+ if (new_descriptor_log && !manifest_io_status.ok()) {
+ ROCKS_LOG_INFO(db_options_->info_log,
+ "Deleting manifest %" PRIu64 " current manifest %" PRIu64
+ "\n",
+ pending_manifest_file_number_, manifest_file_number_);
+ Status manifest_del_status = env_->DeleteFile(
+ DescriptorFileName(dbname_, pending_manifest_file_number_));
+ if (!manifest_del_status.ok()) {
+ ROCKS_LOG_WARN(db_options_->info_log,
+ "Failed to delete manifest %" PRIu64 ": %s",
+ pending_manifest_file_number_,
+ manifest_del_status.ToString().c_str());
+ }
+ }
+ }
+
+ pending_manifest_file_number_ = 0;
+
+#ifndef NDEBUG
+ // This is here kind of awkwardly because there's no other consistency
+ // checks on `VersionSet`'s updates for the new `Version`s. We might want
+ // to move it to a dedicated function, or remove it if we gain enough
+ // confidence in `descriptor_last_sequence_`.
+ if (s.ok()) {
+ for (const auto* v : versions) {
+ const auto* vstorage = v->storage_info();
+ for (int level = 0; level < vstorage->num_levels(); ++level) {
+ for (const auto& file : vstorage->LevelFiles(level)) {
+ assert(file->fd.largest_seqno <= descriptor_last_sequence_);
+ }
+ }
+ }
+ }
+#endif // NDEBUG
+
+ // wake up all the waiting writers
+ while (true) {
+ ManifestWriter* ready = manifest_writers_.front();
+ manifest_writers_.pop_front();
+ bool need_signal = true;
+ for (const auto& w : writers) {
+ if (&w == ready) {
+ need_signal = false;
+ break;
+ }
+ }
+ ready->status = s;
+ ready->done = true;
+ if (ready->manifest_write_callback) {
+ (ready->manifest_write_callback)(s);
+ }
+ if (need_signal) {
+ ready->cv.Signal();
+ }
+ if (ready == last_writer) {
+ break;
+ }
+ }
+ if (!manifest_writers_.empty()) {
+ manifest_writers_.front()->cv.Signal();
+ }
+ return s;
+}
+
+void VersionSet::WakeUpWaitingManifestWriters() {
+  // Notify the new head of the manifest write queue; it will process (and
+  // eventually wake) the remaining waiting writers.
+ if (!manifest_writers_.empty()) {
+ manifest_writers_.front()->cv.Signal();
+ }
+}
+
+// 'datas' is grammatically incorrect. We still use this notation to indicate
+// that this variable represents a collection of column_family_data.
+Status VersionSet::LogAndApply(
+ const autovector<ColumnFamilyData*>& column_family_datas,
+ const autovector<const MutableCFOptions*>& mutable_cf_options_list,
+ const autovector<autovector<VersionEdit*>>& edit_lists,
+ InstrumentedMutex* mu, FSDirectory* dir_contains_current_file,
+ bool new_descriptor_log, const ColumnFamilyOptions* new_cf_options,
+ const std::vector<std::function<void(const Status&)>>& manifest_wcbs) {
+ mu->AssertHeld();
+ int num_edits = 0;
+ for (const auto& elist : edit_lists) {
+ num_edits += static_cast<int>(elist.size());
+ }
+ if (num_edits == 0) {
+ return Status::OK();
+ } else if (num_edits > 1) {
+#ifndef NDEBUG
+ for (const auto& edit_list : edit_lists) {
+ for (const auto& edit : edit_list) {
+ assert(!edit->IsColumnFamilyManipulation());
+ }
+ }
+#endif /* ! NDEBUG */
+ }
+
+ int num_cfds = static_cast<int>(column_family_datas.size());
+ if (num_cfds == 1 && column_family_datas[0] == nullptr) {
+ assert(edit_lists.size() == 1 && edit_lists[0].size() == 1);
+ assert(edit_lists[0][0]->is_column_family_add_);
+ assert(new_cf_options != nullptr);
+ }
+ std::deque<ManifestWriter> writers;
+ if (num_cfds > 0) {
+ assert(static_cast<size_t>(num_cfds) == mutable_cf_options_list.size());
+ assert(static_cast<size_t>(num_cfds) == edit_lists.size());
+ }
+ for (int i = 0; i < num_cfds; ++i) {
+ const auto wcb =
+ manifest_wcbs.empty() ? [](const Status&) {} : manifest_wcbs[i];
+ writers.emplace_back(mu, column_family_datas[i],
+ *mutable_cf_options_list[i], edit_lists[i], wcb);
+ manifest_writers_.push_back(&writers[i]);
+ }
+ assert(!writers.empty());
+ ManifestWriter& first_writer = writers.front();
+ TEST_SYNC_POINT_CALLBACK("VersionSet::LogAndApply:BeforeWriterWaiting",
+ nullptr);
+ while (!first_writer.done && &first_writer != manifest_writers_.front()) {
+ first_writer.cv.Wait();
+ }
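+  // Writers queue up on manifest_writers_ and only the writer at the head of
+  // the queue runs ProcessManifestWrites(), committing the whole batch on
+  // behalf of the group. A writer that wakes up with done == true has already
+  // been committed by an earlier head and only needs to return its recorded
+  // status.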
+ if (first_writer.done) {
+ // All non-CF-manipulation operations can be grouped together and committed
+ // to MANIFEST. They should all have finished. The status code is stored in
+ // the first manifest writer.
+#ifndef NDEBUG
+ for (const auto& writer : writers) {
+ assert(writer.done);
+ }
+ TEST_SYNC_POINT_CALLBACK("VersionSet::LogAndApply:WakeUpAndDone", mu);
+#endif /* !NDEBUG */
+ return first_writer.status;
+ }
+
+ int num_undropped_cfds = 0;
+ for (auto cfd : column_family_datas) {
+ // if cfd == nullptr, it is a column family add.
+ if (cfd == nullptr || !cfd->IsDropped()) {
+ ++num_undropped_cfds;
+ }
+ }
+ if (0 == num_undropped_cfds) {
+ for (int i = 0; i != num_cfds; ++i) {
+ manifest_writers_.pop_front();
+ }
+ // Notify new head of manifest write queue.
+ if (!manifest_writers_.empty()) {
+ manifest_writers_.front()->cv.Signal();
+ }
+ return Status::ColumnFamilyDropped();
+ }
+ return ProcessManifestWrites(writers, mu, dir_contains_current_file,
+ new_descriptor_log, new_cf_options);
+}
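+// Editor's illustrative note (not upstream documentation): a minimal sketch
+// of how the writer queue above behaves. If threads T1, T2 and T3 each call
+// LogAndApply concurrently with ordinary (non-CF-manipulation) edits, each
+// enqueues its ManifestWriter(s) on manifest_writers_ and waits. Only the
+// thread whose writer reaches the queue head enters ProcessManifestWrites;
+// it may commit the queued edits of the other threads in the same MANIFEST
+// write, store the shared status in their writers, mark them done and signal
+// them, so T2 and T3 return from LogAndApply without writing anything
+// themselves (the first_writer.done branch above).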
+
+void VersionSet::LogAndApplyCFHelper(VersionEdit* edit,
+ SequenceNumber* max_last_sequence) {
+ assert(max_last_sequence != nullptr);
+ assert(edit->IsColumnFamilyManipulation());
+ edit->SetNextFile(next_file_number_.load());
+ assert(!edit->HasLastSequence());
+ edit->SetLastSequence(*max_last_sequence);
+ if (edit->is_column_family_drop_) {
+ // If we drop a column family, we have to make sure to save the max
+ // column family, so that we don't reuse an existing ID.
+ edit->SetMaxColumnFamily(column_family_set_->GetMaxColumnFamily());
+ }
+}
+
+Status VersionSet::LogAndApplyHelper(ColumnFamilyData* cfd,
+ VersionBuilder* builder, VersionEdit* edit,
+ SequenceNumber* max_last_sequence,
+ InstrumentedMutex* mu) {
+#ifdef NDEBUG
+ (void)cfd;
+#endif
+ mu->AssertHeld();
+ assert(!edit->IsColumnFamilyManipulation());
+ assert(max_last_sequence != nullptr);
+
+ if (edit->has_log_number_) {
+ assert(edit->log_number_ >= cfd->GetLogNumber());
+ assert(edit->log_number_ < next_file_number_.load());
+ }
+
+ if (!edit->has_prev_log_number_) {
+ edit->SetPrevLogNumber(prev_log_number_);
+ }
+ edit->SetNextFile(next_file_number_.load());
+ if (edit->HasLastSequence() && edit->GetLastSequence() > *max_last_sequence) {
+ *max_last_sequence = edit->GetLastSequence();
+ } else {
+ edit->SetLastSequence(*max_last_sequence);
+ }
+
+ // The builder can be nullptr only if edit is WAL manipulation,
+ // because WAL edits do not need to be applied to versions,
+ // we return Status::OK() in this case.
+ assert(builder || edit->IsWalManipulation());
+ return builder ? builder->Apply(edit) : Status::OK();
+}
+
+Status VersionSet::GetCurrentManifestPath(const std::string& dbname,
+ FileSystem* fs,
+ std::string* manifest_path,
+ uint64_t* manifest_file_number) {
+ assert(fs != nullptr);
+ assert(manifest_path != nullptr);
+ assert(manifest_file_number != nullptr);
+
+ std::string fname;
+ Status s = ReadFileToString(fs, CurrentFileName(dbname), &fname);
+ if (!s.ok()) {
+ return s;
+ }
+ if (fname.empty() || fname.back() != '\n') {
+ return Status::Corruption("CURRENT file does not end with newline");
+ }
+ // remove the trailing '\n'
+ fname.resize(fname.size() - 1);
+ FileType type;
+ bool parse_ok = ParseFileName(fname, manifest_file_number, &type);
+ if (!parse_ok || type != kDescriptorFile) {
+ return Status::Corruption("CURRENT file corrupted");
+ }
+ *manifest_path = dbname;
+ if (dbname.back() != '/') {
+ manifest_path->push_back('/');
+ }
+ manifest_path->append(fname);
+ return Status::OK();
+}
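+// Editor's illustrative example (hypothetical values, not upstream code): if
+// the CURRENT file contains the single line "MANIFEST-000123\n",
+// GetCurrentManifestPath() strips the trailing newline, parses the name as a
+// descriptor file, sets *manifest_file_number to 123, and sets
+// *manifest_path to "<dbname>/MANIFEST-000123".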
+
+Status VersionSet::Recover(
+ const std::vector<ColumnFamilyDescriptor>& column_families, bool read_only,
+ std::string* db_id, bool no_error_if_files_missing) {
+ // Read "CURRENT" file, which contains a pointer to the current manifest file
+ std::string manifest_path;
+ Status s = GetCurrentManifestPath(dbname_, fs_.get(), &manifest_path,
+ &manifest_file_number_);
+ if (!s.ok()) {
+ return s;
+ }
+
+ ROCKS_LOG_INFO(db_options_->info_log, "Recovering from manifest file: %s\n",
+ manifest_path.c_str());
+
+ std::unique_ptr<SequentialFileReader> manifest_file_reader;
+ {
+ std::unique_ptr<FSSequentialFile> manifest_file;
+ s = fs_->NewSequentialFile(manifest_path,
+ fs_->OptimizeForManifestRead(file_options_),
+ &manifest_file, nullptr);
+ if (!s.ok()) {
+ return s;
+ }
+ manifest_file_reader.reset(new SequentialFileReader(
+ std::move(manifest_file), manifest_path,
+ db_options_->log_readahead_size, io_tracer_, db_options_->listeners));
+ }
+ uint64_t current_manifest_file_size = 0;
+ uint64_t log_number = 0;
+ {
+ VersionSet::LogReporter reporter;
+ Status log_read_status;
+ reporter.status = &log_read_status;
+ log::Reader reader(nullptr, std::move(manifest_file_reader), &reporter,
+ true /* checksum */, 0 /* log_number */);
+ VersionEditHandler handler(
+ read_only, column_families, const_cast<VersionSet*>(this),
+ /*track_missing_files=*/false, no_error_if_files_missing, io_tracer_);
+ handler.Iterate(reader, &log_read_status);
+ s = handler.status();
+ if (s.ok()) {
+ log_number = handler.GetVersionEditParams().log_number_;
+ current_manifest_file_size = reader.GetReadOffset();
+ assert(current_manifest_file_size != 0);
+ handler.GetDbId(db_id);
+ }
+ }
+
+ if (s.ok()) {
+ manifest_file_size_ = current_manifest_file_size;
+ ROCKS_LOG_INFO(
+ db_options_->info_log,
+ "Recovered from manifest file:%s succeeded,"
+ "manifest_file_number is %" PRIu64 ", next_file_number is %" PRIu64
+ ", last_sequence is %" PRIu64 ", log_number is %" PRIu64
+ ",prev_log_number is %" PRIu64 ",max_column_family is %" PRIu32
+ ",min_log_number_to_keep is %" PRIu64 "\n",
+ manifest_path.c_str(), manifest_file_number_, next_file_number_.load(),
+ last_sequence_.load(), log_number, prev_log_number_,
+ column_family_set_->GetMaxColumnFamily(), min_log_number_to_keep());
+
+ for (auto cfd : *column_family_set_) {
+ if (cfd->IsDropped()) {
+ continue;
+ }
+ ROCKS_LOG_INFO(db_options_->info_log,
+ "Column family [%s] (ID %" PRIu32
+ "), log number is %" PRIu64 "\n",
+ cfd->GetName().c_str(), cfd->GetID(), cfd->GetLogNumber());
+ }
+ }
+
+ return s;
+}
+
+namespace {
+class ManifestPicker {
+ public:
+ explicit ManifestPicker(const std::string& dbname,
+ const std::vector<std::string>& files_in_dbname);
+ // REQUIRES: Valid() == true
+ std::string GetNextManifest(uint64_t* file_number, std::string* file_name);
+ bool Valid() const { return manifest_file_iter_ != manifest_files_.end(); }
+
+ private:
+ const std::string& dbname_;
+ // MANIFEST file name(s)
+ std::vector<std::string> manifest_files_;
+ std::vector<std::string>::const_iterator manifest_file_iter_;
+};
+
+ManifestPicker::ManifestPicker(const std::string& dbname,
+ const std::vector<std::string>& files_in_dbname)
+ : dbname_(dbname) {
+ // populate manifest files
+ assert(!files_in_dbname.empty());
+ for (const auto& fname : files_in_dbname) {
+ uint64_t file_num = 0;
+ FileType file_type;
+ bool parse_ok = ParseFileName(fname, &file_num, &file_type);
+ if (parse_ok && file_type == kDescriptorFile) {
+ manifest_files_.push_back(fname);
+ }
+ }
+ // seek to first manifest
+ std::sort(manifest_files_.begin(), manifest_files_.end(),
+ [](const std::string& lhs, const std::string& rhs) {
+ uint64_t num1 = 0;
+ uint64_t num2 = 0;
+ FileType type1;
+ FileType type2;
+ bool parse_ok1 = ParseFileName(lhs, &num1, &type1);
+ bool parse_ok2 = ParseFileName(rhs, &num2, &type2);
+#ifndef NDEBUG
+ assert(parse_ok1);
+ assert(parse_ok2);
+#else
+ (void)parse_ok1;
+ (void)parse_ok2;
+#endif
+ return num1 > num2;
+ });
+ manifest_file_iter_ = manifest_files_.begin();
+}
+
+std::string ManifestPicker::GetNextManifest(uint64_t* number,
+ std::string* file_name) {
+ assert(Valid());
+ std::string ret;
+ if (manifest_file_iter_ != manifest_files_.end()) {
+ ret.assign(dbname_);
+ if (ret.back() != kFilePathSeparator) {
+ ret.push_back(kFilePathSeparator);
+ }
+ ret.append(*manifest_file_iter_);
+ if (number) {
+ FileType type;
+ bool parse = ParseFileName(*manifest_file_iter_, number, &type);
+ assert(type == kDescriptorFile);
+#ifndef NDEBUG
+ assert(parse);
+#else
+ (void)parse;
+#endif
+ }
+ if (file_name) {
+ *file_name = *manifest_file_iter_;
+ }
+ ++manifest_file_iter_;
+ }
+ return ret;
+}
+} // anonymous namespace
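+// Editor's illustrative example (hypothetical file names, not upstream
+// code): if files_in_dbname contains MANIFEST-000005, MANIFEST-000003 and a
+// few .sst files, ManifestPicker keeps only the MANIFEST files, sorts them
+// by descending file number, and successive GetNextManifest() calls return
+// the path of MANIFEST-000005 first and MANIFEST-000003 second, so
+// TryRecover() below attempts recovery from the newest manifest before
+// falling back to older ones.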
+
+Status VersionSet::TryRecover(
+ const std::vector<ColumnFamilyDescriptor>& column_families, bool read_only,
+ const std::vector<std::string>& files_in_dbname, std::string* db_id,
+ bool* has_missing_table_file) {
+ ManifestPicker manifest_picker(dbname_, files_in_dbname);
+ if (!manifest_picker.Valid()) {
+ return Status::Corruption("Cannot locate MANIFEST file in " + dbname_);
+ }
+ Status s;
+ std::string manifest_path =
+ manifest_picker.GetNextManifest(&manifest_file_number_, nullptr);
+ while (!manifest_path.empty()) {
+ s = TryRecoverFromOneManifest(manifest_path, column_families, read_only,
+ db_id, has_missing_table_file);
+ if (s.ok() || !manifest_picker.Valid()) {
+ break;
+ }
+ Reset();
+ manifest_path =
+ manifest_picker.GetNextManifest(&manifest_file_number_, nullptr);
+ }
+ return s;
+}
+
+Status VersionSet::TryRecoverFromOneManifest(
+ const std::string& manifest_path,
+ const std::vector<ColumnFamilyDescriptor>& column_families, bool read_only,
+ std::string* db_id, bool* has_missing_table_file) {
+ ROCKS_LOG_INFO(db_options_->info_log, "Trying to recover from manifest: %s\n",
+ manifest_path.c_str());
+ std::unique_ptr<SequentialFileReader> manifest_file_reader;
+ Status s;
+ {
+ std::unique_ptr<FSSequentialFile> manifest_file;
+ s = fs_->NewSequentialFile(manifest_path,
+ fs_->OptimizeForManifestRead(file_options_),
+ &manifest_file, nullptr);
+ if (!s.ok()) {
+ return s;
+ }
+ manifest_file_reader.reset(new SequentialFileReader(
+ std::move(manifest_file), manifest_path,
+ db_options_->log_readahead_size, io_tracer_, db_options_->listeners));
+ }
+
+ assert(s.ok());
+ VersionSet::LogReporter reporter;
+ reporter.status = &s;
+ log::Reader reader(nullptr, std::move(manifest_file_reader), &reporter,
+ /*checksum=*/true, /*log_num=*/0);
+ VersionEditHandlerPointInTime handler_pit(
+ read_only, column_families, const_cast<VersionSet*>(this), io_tracer_);
+
+ handler_pit.Iterate(reader, &s);
+
+ handler_pit.GetDbId(db_id);
+
+ assert(nullptr != has_missing_table_file);
+ *has_missing_table_file = handler_pit.HasMissingFiles();
+
+ return handler_pit.status();
+}
+
+Status VersionSet::ListColumnFamilies(std::vector<std::string>* column_families,
+ const std::string& dbname,
+ FileSystem* fs) {
+ // Read "CURRENT" file, which contains a pointer to the current manifest file
+ std::string manifest_path;
+ uint64_t manifest_file_number;
+ Status s =
+ GetCurrentManifestPath(dbname, fs, &manifest_path, &manifest_file_number);
+ if (!s.ok()) {
+ return s;
+ }
+ return ListColumnFamiliesFromManifest(manifest_path, fs, column_families);
+}
+
+Status VersionSet::ListColumnFamiliesFromManifest(
+ const std::string& manifest_path, FileSystem* fs,
+ std::vector<std::string>* column_families) {
+ std::unique_ptr<SequentialFileReader> file_reader;
+ Status s;
+ {
+ std::unique_ptr<FSSequentialFile> file;
+ // these are just for performance reasons, not correctness,
+ // so we're fine using the defaults
+ s = fs->NewSequentialFile(manifest_path, FileOptions(), &file, nullptr);
+ if (!s.ok()) {
+ return s;
+ }
+ file_reader = std::make_unique<SequentialFileReader>(
+ std::move(file), manifest_path, /*io_tracer=*/nullptr);
+ }
+
+ VersionSet::LogReporter reporter;
+ reporter.status = &s;
+ log::Reader reader(nullptr, std::move(file_reader), &reporter,
+ true /* checksum */, 0 /* log_number */);
+
+ ListColumnFamiliesHandler handler;
+ handler.Iterate(reader, &s);
+
+ assert(column_families);
+ column_families->clear();
+ if (handler.status().ok()) {
+ for (const auto& iter : handler.GetColumnFamilyNames()) {
+ column_families->push_back(iter.second);
+ }
+ }
+
+ return handler.status();
+}
+
+#ifndef ROCKSDB_LITE
+Status VersionSet::ReduceNumberOfLevels(const std::string& dbname,
+ const Options* options,
+ const FileOptions& file_options,
+ int new_levels) {
+ if (new_levels <= 1) {
+ return Status::InvalidArgument(
+ "Number of levels needs to be bigger than 1");
+ }
+
+ ImmutableDBOptions db_options(*options);
+ ColumnFamilyOptions cf_options(*options);
+ std::shared_ptr<Cache> tc(NewLRUCache(options->max_open_files - 10,
+ options->table_cache_numshardbits));
+ WriteController wc(options->delayed_write_rate);
+ WriteBufferManager wb(options->db_write_buffer_size);
+ VersionSet versions(dbname, &db_options, file_options, tc.get(), &wb, &wc,
+ nullptr /*BlockCacheTracer*/, nullptr /*IOTracer*/,
+ /*db_id*/ "",
+ /*db_session_id*/ "");
+ Status status;
+
+ std::vector<ColumnFamilyDescriptor> dummy;
+ ColumnFamilyDescriptor dummy_descriptor(kDefaultColumnFamilyName,
+ ColumnFamilyOptions(*options));
+ dummy.push_back(dummy_descriptor);
+ status = versions.Recover(dummy);
+ if (!status.ok()) {
+ return status;
+ }
+
+ Version* current_version =
+ versions.GetColumnFamilySet()->GetDefault()->current();
+ auto* vstorage = current_version->storage_info();
+ int current_levels = vstorage->num_levels();
+
+ if (current_levels <= new_levels) {
+ return Status::OK();
+ }
+
+ // Make sure there are files on only one level from
+ // (new_levels-1) to (current_levels-1)
+ int first_nonempty_level = -1;
+ int first_nonempty_level_filenum = 0;
+ for (int i = new_levels - 1; i < current_levels; i++) {
+ int file_num = vstorage->NumLevelFiles(i);
+ if (file_num != 0) {
+ if (first_nonempty_level < 0) {
+ first_nonempty_level = i;
+ first_nonempty_level_filenum = file_num;
+ } else {
+ char msg[255];
+ snprintf(msg, sizeof(msg),
+ "Found at least two levels containing files: "
+ "[%d:%d],[%d:%d].\n",
+ first_nonempty_level, first_nonempty_level_filenum, i,
+ file_num);
+ return Status::InvalidArgument(msg);
+ }
+ }
+ }
+
+ // We need to allocate an array sized to the old number of levels to
+ // avoid a SIGSEGV in WriteCurrentStateToManifest(); however, all levels
+ // greater than or equal to new_levels will be empty.
+ std::vector<FileMetaData*>* new_files_list =
+ new std::vector<FileMetaData*>[current_levels];
+ for (int i = 0; i < new_levels - 1; i++) {
+ new_files_list[i] = vstorage->LevelFiles(i);
+ }
+
+ if (first_nonempty_level > 0) {
+ auto& new_last_level = new_files_list[new_levels - 1];
+
+ new_last_level = vstorage->LevelFiles(first_nonempty_level);
+
+ for (size_t i = 0; i < new_last_level.size(); ++i) {
+ const FileMetaData* const meta = new_last_level[i];
+ assert(meta);
+
+ const uint64_t file_number = meta->fd.GetNumber();
+
+ vstorage->file_locations_[file_number] =
+ VersionStorageInfo::FileLocation(new_levels - 1, i);
+ }
+ }
+
+ delete[] vstorage->files_;
+ vstorage->files_ = new_files_list;
+ vstorage->num_levels_ = new_levels;
+ vstorage->ResizeCompactCursors(new_levels);
+
+ MutableCFOptions mutable_cf_options(*options);
+ VersionEdit ve;
+ InstrumentedMutex dummy_mutex;
+ InstrumentedMutexLock l(&dummy_mutex);
+ return versions.LogAndApply(versions.GetColumnFamilySet()->GetDefault(),
+ mutable_cf_options, &ve, &dummy_mutex, nullptr,
+ true);
+}
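+// Editor's illustrative example (hypothetical values, not upstream
+// documentation): reducing a 7-level DB to new_levels == 3 requires that at
+// most one of levels 2..6 contains files. If only level 5 is non-empty, its
+// files become the new last level (level 2), levels 0 and 1 are carried over
+// unchanged, and the reduced level count is then persisted through the
+// LogAndApply call at the end of ReduceNumberOfLevels above.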
+
+// Get the checksum information including the checksum and checksum function
+// name of all SST and blob files in VersionSet. Store the information in
+// FileChecksumList which contains a map from file number to its checksum info.
+// If DB is not running, make sure to call VersionSet::Recover() to load the
+// file metadata from the Manifest into the VersionSet before calling this
+// function.
+Status VersionSet::GetLiveFilesChecksumInfo(FileChecksumList* checksum_list) {
+ // Clean the previously stored checksum information if any.
+ Status s;
+ if (checksum_list == nullptr) {
+ s = Status::InvalidArgument("checksum_list is nullptr");
+ return s;
+ }
+ checksum_list->reset();
+
+ for (auto cfd : *column_family_set_) {
+ assert(cfd);
+
+ if (cfd->IsDropped() || !cfd->initialized()) {
+ continue;
+ }
+
+ const auto* current = cfd->current();
+ assert(current);
+
+ const auto* vstorage = current->storage_info();
+ assert(vstorage);
+
+ /* SST files */
+ for (int level = 0; level < cfd->NumberLevels(); level++) {
+ const auto& level_files = vstorage->LevelFiles(level);
+
+ for (const auto& file : level_files) {
+ assert(file);
+
+ s = checksum_list->InsertOneFileChecksum(file->fd.GetNumber(),
+ file->file_checksum,
+ file->file_checksum_func_name);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+ }
+
+ /* Blob files */
+ const auto& blob_files = vstorage->GetBlobFiles();
+ for (const auto& meta : blob_files) {
+ assert(meta);
+
+ std::string checksum_value = meta->GetChecksumValue();
+ std::string checksum_method = meta->GetChecksumMethod();
+ assert(checksum_value.empty() == checksum_method.empty());
+ if (meta->GetChecksumMethod().empty()) {
+ checksum_value = kUnknownFileChecksum;
+ checksum_method = kUnknownFileChecksumFuncName;
+ }
+
+ s = checksum_list->InsertOneFileChecksum(meta->GetBlobFileNumber(),
+ checksum_value, checksum_method);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+ }
+
+ return s;
+}
+
+Status VersionSet::DumpManifest(Options& options, std::string& dscname,
+ bool verbose, bool hex, bool json) {
+ assert(options.env);
+ std::vector<std::string> column_families;
+ Status s = ListColumnFamiliesFromManifest(
+ dscname, options.env->GetFileSystem().get(), &column_families);
+ if (!s.ok()) {
+ return s;
+ }
+
+ // Open the specified manifest file.
+ std::unique_ptr<SequentialFileReader> file_reader;
+ {
+ std::unique_ptr<FSSequentialFile> file;
+ const std::shared_ptr<FileSystem>& fs = options.env->GetFileSystem();
+ s = fs->NewSequentialFile(
+ dscname, fs->OptimizeForManifestRead(file_options_), &file, nullptr);
+ if (!s.ok()) {
+ return s;
+ }
+ file_reader = std::make_unique<SequentialFileReader>(
+ std::move(file), dscname, db_options_->log_readahead_size, io_tracer_);
+ }
+
+ std::vector<ColumnFamilyDescriptor> cf_descs;
+ for (const auto& cf : column_families) {
+ cf_descs.emplace_back(cf, options);
+ }
+
+ DumpManifestHandler handler(cf_descs, this, io_tracer_, verbose, hex, json);
+ {
+ VersionSet::LogReporter reporter;
+ reporter.status = &s;
+ log::Reader reader(nullptr, std::move(file_reader), &reporter,
+ true /* checksum */, 0 /* log_number */);
+ handler.Iterate(reader, &s);
+ }
+
+ return handler.status();
+}
+#endif // ROCKSDB_LITE
+
+void VersionSet::MarkFileNumberUsed(uint64_t number) {
+ // Only called during recovery and repair, which are single-threaded, so
+ // this works because there can't be concurrent calls.
+ if (next_file_number_.load(std::memory_order_relaxed) <= number) {
+ next_file_number_.store(number + 1, std::memory_order_relaxed);
+ }
+}
+// Called only either from ::LogAndApply, which is protected by the mutex, or
+// during recovery, which is single-threaded.
+void VersionSet::MarkMinLogNumberToKeep(uint64_t number) {
+ if (min_log_number_to_keep_.load(std::memory_order_relaxed) < number) {
+ min_log_number_to_keep_.store(number, std::memory_order_relaxed);
+ }
+}
+
+Status VersionSet::WriteCurrentStateToManifest(
+ const std::unordered_map<uint32_t, MutableCFState>& curr_state,
+ const VersionEdit& wal_additions, log::Writer* log, IOStatus& io_s) {
+ // TODO: Break up into multiple records to reduce memory usage on recovery?
+
+ // WARNING: This method doesn't hold a mutex!!
+
+ // This is done without DB mutex lock held, but only within single-threaded
+ // LogAndApply. Column family manipulations can only happen within LogAndApply
+ // (the same single thread), so we're safe to iterate.
+
+ assert(io_s.ok());
+ if (db_options_->write_dbid_to_manifest) {
+ VersionEdit edit_for_db_id;
+ assert(!db_id_.empty());
+ edit_for_db_id.SetDBId(db_id_);
+ std::string db_id_record;
+ if (!edit_for_db_id.EncodeTo(&db_id_record)) {
+ return Status::Corruption("Unable to Encode VersionEdit:" +
+ edit_for_db_id.DebugString(true));
+ }
+ io_s = log->AddRecord(db_id_record);
+ if (!io_s.ok()) {
+ return io_s;
+ }
+ }
+
+ // Save WALs.
+ if (!wal_additions.GetWalAdditions().empty()) {
+ TEST_SYNC_POINT_CALLBACK("VersionSet::WriteCurrentStateToManifest:SaveWal",
+ const_cast<VersionEdit*>(&wal_additions));
+ std::string record;
+ if (!wal_additions.EncodeTo(&record)) {
+ return Status::Corruption("Unable to Encode VersionEdit: " +
+ wal_additions.DebugString(true));
+ }
+ io_s = log->AddRecord(record);
+ if (!io_s.ok()) {
+ return io_s;
+ }
+ }
+
+ for (auto cfd : *column_family_set_) {
+ assert(cfd);
+
+ if (cfd->IsDropped()) {
+ continue;
+ }
+ assert(cfd->initialized());
+ {
+ // Store column family info
+ VersionEdit edit;
+ if (cfd->GetID() != 0) {
+ // default column family is always there,
+ // no need to explicitly write it
+ edit.AddColumnFamily(cfd->GetName());
+ edit.SetColumnFamily(cfd->GetID());
+ }
+ edit.SetComparatorName(
+ cfd->internal_comparator().user_comparator()->Name());
+ std::string record;
+ if (!edit.EncodeTo(&record)) {
+ return Status::Corruption("Unable to Encode VersionEdit:" +
+ edit.DebugString(true));
+ }
+ io_s = log->AddRecord(record);
+ if (!io_s.ok()) {
+ return io_s;
+ }
+ }
+
+ {
+ // Save files
+ VersionEdit edit;
+ edit.SetColumnFamily(cfd->GetID());
+
+ const auto* current = cfd->current();
+ assert(current);
+
+ const auto* vstorage = current->storage_info();
+ assert(vstorage);
+
+ for (int level = 0; level < cfd->NumberLevels(); level++) {
+ const auto& level_files = vstorage->LevelFiles(level);
+
+ for (const auto& f : level_files) {
+ assert(f);
+
+ edit.AddFile(level, f->fd.GetNumber(), f->fd.GetPathId(),
+ f->fd.GetFileSize(), f->smallest, f->largest,
+ f->fd.smallest_seqno, f->fd.largest_seqno,
+ f->marked_for_compaction, f->temperature,
+ f->oldest_blob_file_number, f->oldest_ancester_time,
+ f->file_creation_time, f->file_checksum,
+ f->file_checksum_func_name, f->unique_id);
+ }
+ }
+
+ edit.SetCompactCursors(vstorage->GetCompactCursors());
+
+ const auto& blob_files = vstorage->GetBlobFiles();
+ for (const auto& meta : blob_files) {
+ assert(meta);
+
+ const uint64_t blob_file_number = meta->GetBlobFileNumber();
+
+ edit.AddBlobFile(blob_file_number, meta->GetTotalBlobCount(),
+ meta->GetTotalBlobBytes(), meta->GetChecksumMethod(),
+ meta->GetChecksumValue());
+ if (meta->GetGarbageBlobCount() > 0) {
+ edit.AddBlobFileGarbage(blob_file_number, meta->GetGarbageBlobCount(),
+ meta->GetGarbageBlobBytes());
+ }
+ }
+
+ const auto iter = curr_state.find(cfd->GetID());
+ assert(iter != curr_state.end());
+ uint64_t log_number = iter->second.log_number;
+ edit.SetLogNumber(log_number);
+
+ if (cfd->GetID() == 0) {
+ // min_log_number_to_keep is for the whole db, not for a specific column
+ // family. So it does not need to be set for every column family; it just
+ // needs to be set once. Since the default CF can never be dropped, we
+ // set the min_log on the default CF here.
+ uint64_t min_log = min_log_number_to_keep();
+ if (min_log != 0) {
+ edit.SetMinLogNumberToKeep(min_log);
+ }
+ }
+
+ const std::string& full_history_ts_low = iter->second.full_history_ts_low;
+ if (!full_history_ts_low.empty()) {
+ edit.SetFullHistoryTsLow(full_history_ts_low);
+ }
+
+ edit.SetLastSequence(descriptor_last_sequence_);
+
+ std::string record;
+ if (!edit.EncodeTo(&record)) {
+ return Status::Corruption("Unable to Encode VersionEdit:" +
+ edit.DebugString(true));
+ }
+ io_s = log->AddRecord(record);
+ if (!io_s.ok()) {
+ return io_s;
+ }
+ }
+ }
+ return Status::OK();
+}
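+// Editor's illustrative note (not upstream documentation): a sketch of the
+// record sequence the function above appends when writing a full snapshot.
+// It emits an optional DB-id record (if write_dbid_to_manifest is set), an
+// optional WAL-additions record, and then, for every live column family, one
+// record describing the CF (name/ID/comparator) followed by one record
+// listing its table and blob files plus the log number, compact cursors and
+// last sequence.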
+
+// TODO(aekmekji): in CompactionJob::GenSubcompactionBoundaries(), this
+// function is called repeatedly with consecutive pairs of slices. For example
+// if the slice list is [a, b, c, d] this function is called with arguments
+// (a,b) then (b,c) then (c,d). Knowing this, an optimization is possible where
+// we avoid doing binary search for the keys b and c twice and instead somehow
+// maintain state of where they first appear in the files.
+uint64_t VersionSet::ApproximateSize(const SizeApproximationOptions& options,
+ Version* v, const Slice& start,
+ const Slice& end, int start_level,
+ int end_level, TableReaderCaller caller) {
+ const auto& icmp = v->cfd_->internal_comparator();
+
+ // pre-condition
+ assert(icmp.Compare(start, end) <= 0);
+
+ uint64_t total_full_size = 0;
+ const auto* vstorage = v->storage_info();
+ const int num_non_empty_levels = vstorage->num_non_empty_levels();
+ end_level = (end_level == -1) ? num_non_empty_levels
+ : std::min(end_level, num_non_empty_levels);
+
+ assert(start_level <= end_level);
+
+ // Outline of the optimization that uses options.files_size_error_margin.
+ // When approximating the total size of the files used to store a key range,
+ // we first sum up the sizes of the files that fully fall into the range.
+ // Then we sum up the sizes of all the files that may intersect with the range
+ // (this includes all files in L0 as well). Then, if total_intersecting_size
+ // is smaller than total_full_size * options.files_size_error_margin - we can
+ // infer that the intersecting files have a sufficiently negligible
+ // contribution to the total size, and we can approximate the storage required
+ // for the keys in range as just half of the intersecting_files_size.
+ // E.g., if the value of files_size_error_margin is 0.1, then the error of the
+ // approximation is limited to only ~10% of the total size of files that fully
+ // fall into the key range. In such a case, this helps to avoid a costly
+ // process of binary searching the intersecting files that is required only
+ // for a more precise calculation of the total size.
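+ // Editor's illustrative arithmetic (not upstream text), assuming
+ // files_size_error_margin == 0.1: if the fully contained files sum to
+ // 1000 MB and the boundary (possibly intersecting) files sum to 50 MB,
+ // then 50 MB < 1000 MB * 0.1 == 100 MB, so we simply add 50 MB / 2 == 25 MB
+ // instead of binary searching inside the boundary files; the resulting
+ // error is at most 25 MB, well within ~10% of the exactly known 1000 MB.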
+
+ autovector<FdWithKeyRange*, 32> first_files;
+ autovector<FdWithKeyRange*, 16> last_files;
+
+ // scan all the levels
+ for (int level = start_level; level < end_level; ++level) {
+ const LevelFilesBrief& files_brief = vstorage->LevelFilesBrief(level);
+ if (files_brief.num_files == 0) {
+ // empty level, skip exploration
+ continue;
+ }
+
+ if (level == 0) {
+ // Level-0 files are not in sorted order; we need to iterate through
+ // the list to compute the total bytes that require scanning, so handle
+ // this case explicitly (similarly to the first_files case).
+ for (size_t i = 0; i < files_brief.num_files; i++) {
+ first_files.push_back(&files_brief.files[i]);
+ }
+ continue;
+ }
+
+ assert(level > 0);
+ assert(files_brief.num_files > 0);
+
+ // identify the file position for start key
+ const int idx_start =
+ FindFileInRange(icmp, files_brief, start, 0,
+ static_cast<uint32_t>(files_brief.num_files - 1));
+ assert(static_cast<size_t>(idx_start) < files_brief.num_files);
+
+ // identify the file position for end key
+ int idx_end = idx_start;
+ if (icmp.Compare(files_brief.files[idx_end].largest_key, end) < 0) {
+ idx_end =
+ FindFileInRange(icmp, files_brief, end, idx_start,
+ static_cast<uint32_t>(files_brief.num_files - 1));
+ }
+ assert(idx_end >= idx_start &&
+ static_cast<size_t>(idx_end) < files_brief.num_files);
+
+ // scan all files from the starting index to the ending index
+ // (inferred from the sorted order)
+
+ // first scan all the intermediate full files (excluding first and last)
+ for (int i = idx_start + 1; i < idx_end; ++i) {
+ uint64_t file_size = files_brief.files[i].fd.GetFileSize();
+ // The entire file falls into the range, so we can just take its size.
+ assert(file_size ==
+ ApproximateSize(v, files_brief.files[i], start, end, caller));
+ total_full_size += file_size;
+ }
+
+ // save the first and the last files (which may be the same file), so we
+ // can scan them later.
+ first_files.push_back(&files_brief.files[idx_start]);
+ if (idx_start != idx_end) {
+ // we need to estimate size for both files, only if they are different
+ last_files.push_back(&files_brief.files[idx_end]);
+ }
+ }
+
+ // The sum of all file sizes that intersect the [start, end] keys range.
+ uint64_t total_intersecting_size = 0;
+ for (const auto* file_ptr : first_files) {
+ total_intersecting_size += file_ptr->fd.GetFileSize();
+ }
+ for (const auto* file_ptr : last_files) {
+ total_intersecting_size += file_ptr->fd.GetFileSize();
+ }
+
+ // Now scan all the first & last files at each level, and estimate their size.
+ // If the total_intersecting_size is less than X% of the total_full_size, we
+ // approximate the result in order to avoid the costly binary search inside
+ // ApproximateSize. We use half of the file size as an approximation below.
+
+ const double margin = options.files_size_error_margin;
+ if (margin > 0 && total_intersecting_size <
+ static_cast<uint64_t>(total_full_size * margin)) {
+ total_full_size += total_intersecting_size / 2;
+ } else {
+ // Estimate for all the first files (might also be last files), at each
+ // level
+ for (const auto file_ptr : first_files) {
+ total_full_size += ApproximateSize(v, *file_ptr, start, end, caller);
+ }
+
+ // Estimate for all the last files, at each level
+ for (const auto file_ptr : last_files) {
+ // We could use ApproximateSize here, but calling ApproximateOffsetOf
+ // directly is just more efficient.
+ total_full_size += ApproximateOffsetOf(v, *file_ptr, end, caller);
+ }
+ }
+
+ return total_full_size;
+}
+
+uint64_t VersionSet::ApproximateOffsetOf(Version* v, const FdWithKeyRange& f,
+ const Slice& key,
+ TableReaderCaller caller) {
+ // pre-condition
+ assert(v);
+ const auto& icmp = v->cfd_->internal_comparator();
+
+ uint64_t result = 0;
+ if (icmp.Compare(f.largest_key, key) <= 0) {
+ // Entire file is before "key", so just add the file size
+ result = f.fd.GetFileSize();
+ } else if (icmp.Compare(f.smallest_key, key) > 0) {
+ // Entire file is after "key", so ignore
+ result = 0;
+ } else {
+ // "key" falls in the range for this table. Add the
+ // approximate offset of "key" within the table.
+ TableCache* table_cache = v->cfd_->table_cache();
+ if (table_cache != nullptr) {
+ result = table_cache->ApproximateOffsetOf(
+ key, *f.file_metadata, caller, icmp,
+ v->GetMutableCFOptions().prefix_extractor);
+ }
+ }
+ return result;
+}
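+// Editor's illustrative example (hypothetical keys, not upstream code): for
+// a file whose key range is ["d", "m"], ApproximateOffsetOf with key "z"
+// returns the full file size (the file is entirely before the key), with key
+// "a" it returns 0 (the file is entirely after the key), and with key "g" it
+// asks TableCache::ApproximateOffsetOf for the estimated offset of "g"
+// inside the table.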
+
+uint64_t VersionSet::ApproximateSize(Version* v, const FdWithKeyRange& f,
+ const Slice& start, const Slice& end,
+ TableReaderCaller caller) {
+ // pre-condition
+ assert(v);
+ const auto& icmp = v->cfd_->internal_comparator();
+ assert(icmp.Compare(start, end) <= 0);
+
+ if (icmp.Compare(f.largest_key, start) <= 0 ||
+ icmp.Compare(f.smallest_key, end) > 0) {
+ // Entire file is before or after the start/end keys range
+ return 0;
+ }
+
+ if (icmp.Compare(f.smallest_key, start) >= 0) {
+ // Start of the range is before the file start - approximate by end offset
+ return ApproximateOffsetOf(v, f, end, caller);
+ }
+
+ if (icmp.Compare(f.largest_key, end) < 0) {
+ // End of the range is after the file end - approximate by subtracting
+ // start offset from the file size
+ uint64_t start_offset = ApproximateOffsetOf(v, f, start, caller);
+ assert(f.fd.GetFileSize() >= start_offset);
+ return f.fd.GetFileSize() - start_offset;
+ }
+
+ // The interval falls entirely in the range for this file.
+ TableCache* table_cache = v->cfd_->table_cache();
+ if (table_cache == nullptr) {
+ return 0;
+ }
+ return table_cache->ApproximateSize(
+ start, end, *f.file_metadata, caller, icmp,
+ v->GetMutableCFOptions().prefix_extractor);
+}
+
+void VersionSet::RemoveLiveFiles(
+ std::vector<ObsoleteFileInfo>& sst_delete_candidates,
+ std::vector<ObsoleteBlobFileInfo>& blob_delete_candidates) const {
+ assert(column_family_set_);
+ for (auto cfd : *column_family_set_) {
+ assert(cfd);
+ if (!cfd->initialized()) {
+ continue;
+ }
+
+ auto* current = cfd->current();
+ bool found_current = false;
+
+ Version* const dummy_versions = cfd->dummy_versions();
+ assert(dummy_versions);
+
+ for (Version* v = dummy_versions->next_; v != dummy_versions;
+ v = v->next_) {
+ v->RemoveLiveFiles(sst_delete_candidates, blob_delete_candidates);
+ if (v == current) {
+ found_current = true;
+ }
+ }
+
+ if (!found_current && current != nullptr) {
+ // Should never happen unless it is a bug.
+ assert(false);
+ current->RemoveLiveFiles(sst_delete_candidates, blob_delete_candidates);
+ }
+ }
+}
+
+void VersionSet::AddLiveFiles(std::vector<uint64_t>* live_table_files,
+ std::vector<uint64_t>* live_blob_files) const {
+ assert(live_table_files);
+ assert(live_blob_files);
+
+ // pre-calculate space requirement
+ size_t total_table_files = 0;
+ size_t total_blob_files = 0;
+
+ assert(column_family_set_);
+ for (auto cfd : *column_family_set_) {
+ assert(cfd);
+
+ if (!cfd->initialized()) {
+ continue;
+ }
+
+ Version* const dummy_versions = cfd->dummy_versions();
+ assert(dummy_versions);
+
+ for (Version* v = dummy_versions->next_; v != dummy_versions;
+ v = v->next_) {
+ assert(v);
+
+ const auto* vstorage = v->storage_info();
+ assert(vstorage);
+
+ for (int level = 0; level < vstorage->num_levels(); ++level) {
+ total_table_files += vstorage->LevelFiles(level).size();
+ }
+
+ total_blob_files += vstorage->GetBlobFiles().size();
+ }
+ }
+
+ // just one time extension to the right size
+ live_table_files->reserve(live_table_files->size() + total_table_files);
+ live_blob_files->reserve(live_blob_files->size() + total_blob_files);
+
+ assert(column_family_set_);
+ for (auto cfd : *column_family_set_) {
+ assert(cfd);
+ if (!cfd->initialized()) {
+ continue;
+ }
+
+ auto* current = cfd->current();
+ bool found_current = false;
+
+ Version* const dummy_versions = cfd->dummy_versions();
+ assert(dummy_versions);
+
+ for (Version* v = dummy_versions->next_; v != dummy_versions;
+ v = v->next_) {
+ v->AddLiveFiles(live_table_files, live_blob_files);
+ if (v == current) {
+ found_current = true;
+ }
+ }
+
+ if (!found_current && current != nullptr) {
+ // Should never happen unless it is a bug.
+ assert(false);
+ current->AddLiveFiles(live_table_files, live_blob_files);
+ }
+ }
+}
+
+InternalIterator* VersionSet::MakeInputIterator(
+ const ReadOptions& read_options, const Compaction* c,
+ RangeDelAggregator* range_del_agg,
+ const FileOptions& file_options_compactions,
+ const std::optional<const Slice>& start,
+ const std::optional<const Slice>& end) {
+ auto cfd = c->column_family_data();
+ // Level-0 files have to be merged together. For other levels,
+ // we will make a concatenating iterator per level.
+ // TODO(opt): use concatenating iterator for level-0 if there is no overlap
+ const size_t space = (c->level() == 0 ? c->input_levels(0)->num_files +
+ c->num_input_levels() - 1
+ : c->num_input_levels());
+ InternalIterator** list = new InternalIterator*[space];
+ size_t num = 0;
+ for (size_t which = 0; which < c->num_input_levels(); which++) {
+ if (c->input_levels(which)->num_files != 0) {
+ if (c->level(which) == 0) {
+ const LevelFilesBrief* flevel = c->input_levels(which);
+ for (size_t i = 0; i < flevel->num_files; i++) {
+ const FileMetaData& fmd = *flevel->files[i].file_metadata;
+ if (start.has_value() &&
+ cfd->user_comparator()->CompareWithoutTimestamp(
+ start.value(), fmd.largest.user_key()) > 0) {
+ continue;
+ }
+ // We should be able to filter out the case where the end key
+ // equals the end boundary, since the end key is exclusive.
+ // We try to be extra safe here.
+ if (end.has_value() &&
+ cfd->user_comparator()->CompareWithoutTimestamp(
+ end.value(), fmd.smallest.user_key()) < 0) {
+ continue;
+ }
+
+ list[num++] = cfd->table_cache()->NewIterator(
+ read_options, file_options_compactions,
+ cfd->internal_comparator(), fmd, range_del_agg,
+ c->mutable_cf_options()->prefix_extractor,
+ /*table_reader_ptr=*/nullptr,
+ /*file_read_hist=*/nullptr, TableReaderCaller::kCompaction,
+ /*arena=*/nullptr,
+ /*skip_filters=*/false,
+ /*level=*/static_cast<int>(c->level(which)),
+ MaxFileSizeForL0MetaPin(*c->mutable_cf_options()),
+ /*smallest_compaction_key=*/nullptr,
+ /*largest_compaction_key=*/nullptr,
+ /*allow_unprepared_value=*/false);
+ }
+ } else {
+ // Create concatenating iterator for the files from this level
+ list[num++] = new LevelIterator(
+ cfd->table_cache(), read_options, file_options_compactions,
+ cfd->internal_comparator(), c->input_levels(which),
+ c->mutable_cf_options()->prefix_extractor,
+ /*should_sample=*/false,
+ /*no per level latency histogram=*/nullptr,
+ TableReaderCaller::kCompaction, /*skip_filters=*/false,
+ /*level=*/static_cast<int>(c->level(which)), range_del_agg,
+ c->boundaries(which));
+ }
+ }
+ }
+ assert(num <= space);
+ InternalIterator* result =
+ NewMergingIterator(&c->column_family_data()->internal_comparator(), list,
+ static_cast<int>(num));
+ delete[] list;
+ return result;
+}
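+// Editor's illustrative arithmetic (hypothetical values, not upstream
+// documentation): for the iterator-count estimate above, a compaction with
+// four L0 input files plus one additional input level needs space for
+// 4 + 2 - 1 == 5 iterators (one per L0 file and one concatenating
+// LevelIterator for the non-L0 level); a compaction that does not include
+// L0 needs just one LevelIterator per input level.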
+
+Status VersionSet::GetMetadataForFile(uint64_t number, int* filelevel,
+ FileMetaData** meta,
+ ColumnFamilyData** cfd) {
+ for (auto cfd_iter : *column_family_set_) {
+ if (!cfd_iter->initialized()) {
+ continue;
+ }
+ Version* version = cfd_iter->current();
+ const auto* vstorage = version->storage_info();
+ for (int level = 0; level < vstorage->num_levels(); level++) {
+ for (const auto& file : vstorage->LevelFiles(level)) {
+ if (file->fd.GetNumber() == number) {
+ *meta = file;
+ *filelevel = level;
+ *cfd = cfd_iter;
+ return Status::OK();
+ }
+ }
+ }
+ }
+ return Status::NotFound("File not present in any level");
+}
+
+void VersionSet::GetLiveFilesMetaData(std::vector<LiveFileMetaData>* metadata) {
+ for (auto cfd : *column_family_set_) {
+ if (cfd->IsDropped() || !cfd->initialized()) {
+ continue;
+ }
+ for (int level = 0; level < cfd->NumberLevels(); level++) {
+ for (const auto& file :
+ cfd->current()->storage_info()->LevelFiles(level)) {
+ LiveFileMetaData filemetadata;
+ filemetadata.column_family_name = cfd->GetName();
+ uint32_t path_id = file->fd.GetPathId();
+ if (path_id < cfd->ioptions()->cf_paths.size()) {
+ filemetadata.db_path = cfd->ioptions()->cf_paths[path_id].path;
+ } else {
+ assert(!cfd->ioptions()->cf_paths.empty());
+ filemetadata.db_path = cfd->ioptions()->cf_paths.back().path;
+ }
+ filemetadata.directory = filemetadata.db_path;
+ const uint64_t file_number = file->fd.GetNumber();
+ filemetadata.name = MakeTableFileName("", file_number);
+ filemetadata.relative_filename = filemetadata.name.substr(1);
+ filemetadata.file_number = file_number;
+ filemetadata.level = level;
+ filemetadata.size = file->fd.GetFileSize();
+ filemetadata.smallestkey = file->smallest.user_key().ToString();
+ filemetadata.largestkey = file->largest.user_key().ToString();
+ filemetadata.smallest_seqno = file->fd.smallest_seqno;
+ filemetadata.largest_seqno = file->fd.largest_seqno;
+ filemetadata.num_reads_sampled =
+ file->stats.num_reads_sampled.load(std::memory_order_relaxed);
+ filemetadata.being_compacted = file->being_compacted;
+ filemetadata.num_entries = file->num_entries;
+ filemetadata.num_deletions = file->num_deletions;
+ filemetadata.oldest_blob_file_number = file->oldest_blob_file_number;
+ filemetadata.file_checksum = file->file_checksum;
+ filemetadata.file_checksum_func_name = file->file_checksum_func_name;
+ filemetadata.temperature = file->temperature;
+ filemetadata.oldest_ancester_time = file->TryGetOldestAncesterTime();
+ filemetadata.file_creation_time = file->TryGetFileCreationTime();
+ metadata->push_back(filemetadata);
+ }
+ }
+ }
+}
+
+void VersionSet::GetObsoleteFiles(std::vector<ObsoleteFileInfo>* files,
+ std::vector<ObsoleteBlobFileInfo>* blob_files,
+ std::vector<std::string>* manifest_filenames,
+ uint64_t min_pending_output) {
+ assert(files);
+ assert(blob_files);
+ assert(manifest_filenames);
+ assert(files->empty());
+ assert(blob_files->empty());
+ assert(manifest_filenames->empty());
+
+ std::vector<ObsoleteFileInfo> pending_files;
+ for (auto& f : obsolete_files_) {
+ if (f.metadata->fd.GetNumber() < min_pending_output) {
+ files->emplace_back(std::move(f));
+ } else {
+ pending_files.emplace_back(std::move(f));
+ }
+ }
+ obsolete_files_.swap(pending_files);
+
+ std::vector<ObsoleteBlobFileInfo> pending_blob_files;
+ for (auto& blob_file : obsolete_blob_files_) {
+ if (blob_file.GetBlobFileNumber() < min_pending_output) {
+ blob_files->emplace_back(std::move(blob_file));
+ } else {
+ pending_blob_files.emplace_back(std::move(blob_file));
+ }
+ }
+ obsolete_blob_files_.swap(pending_blob_files);
+
+ obsolete_manifests_.swap(*manifest_filenames);
+}
+
+ColumnFamilyData* VersionSet::CreateColumnFamily(
+ const ColumnFamilyOptions& cf_options, const VersionEdit* edit) {
+ assert(edit->is_column_family_add_);
+
+ MutableCFOptions dummy_cf_options;
+ Version* dummy_versions =
+ new Version(nullptr, this, file_options_, dummy_cf_options, io_tracer_);
+ // Ref() the dummy version once so that later we can delete it by calling
+ // Unref(), instead of calling "delete" explicitly (~Version is private).
+ dummy_versions->Ref();
+ auto new_cfd = column_family_set_->CreateColumnFamily(
+ edit->column_family_name_, edit->column_family_, dummy_versions,
+ cf_options);
+
+ Version* v = new Version(new_cfd, this, file_options_,
+ *new_cfd->GetLatestMutableCFOptions(), io_tracer_,
+ current_version_number_++);
+
+ constexpr bool update_stats = false;
+
+ v->PrepareAppend(*new_cfd->GetLatestMutableCFOptions(), update_stats);
+
+ AppendVersion(new_cfd, v);
+ // GetLatestMutableCFOptions() is safe here without the mutex since the
+ // cfd is not yet available to clients.
+ new_cfd->CreateNewMemtable(*new_cfd->GetLatestMutableCFOptions(),
+ LastSequence());
+ new_cfd->SetLogNumber(edit->log_number_);
+ return new_cfd;
+}
+
+uint64_t VersionSet::GetNumLiveVersions(Version* dummy_versions) {
+ uint64_t count = 0;
+ for (Version* v = dummy_versions->next_; v != dummy_versions; v = v->next_) {
+ count++;
+ }
+ return count;
+}
+
+uint64_t VersionSet::GetTotalSstFilesSize(Version* dummy_versions) {
+ std::unordered_set<uint64_t> unique_files;
+ uint64_t total_files_size = 0;
+ for (Version* v = dummy_versions->next_; v != dummy_versions; v = v->next_) {
+ VersionStorageInfo* storage_info = v->storage_info();
+ for (int level = 0; level < storage_info->num_levels_; level++) {
+ for (const auto& file_meta : storage_info->LevelFiles(level)) {
+ if (unique_files.find(file_meta->fd.packed_number_and_path_id) ==
+ unique_files.end()) {
+ unique_files.insert(file_meta->fd.packed_number_and_path_id);
+ total_files_size += file_meta->fd.GetFileSize();
+ }
+ }
+ }
+ }
+ return total_files_size;
+}
+
+uint64_t VersionSet::GetTotalBlobFileSize(Version* dummy_versions) {
+ std::unordered_set<uint64_t> unique_blob_files;
+
+ uint64_t all_versions_blob_file_size = 0;
+
+ for (auto* v = dummy_versions->next_; v != dummy_versions; v = v->next_) {
+ // iterate all the versions
+ const auto* vstorage = v->storage_info();
+ assert(vstorage);
+
+ const auto& blob_files = vstorage->GetBlobFiles();
+
+ for (const auto& meta : blob_files) {
+ assert(meta);
+
+ const uint64_t blob_file_number = meta->GetBlobFileNumber();
+
+ if (unique_blob_files.find(blob_file_number) == unique_blob_files.end()) {
+ // find Blob file that has not been counted
+ unique_blob_files.insert(blob_file_number);
+ all_versions_blob_file_size += meta->GetBlobFileSize();
+ }
+ }
+ }
+
+ return all_versions_blob_file_size;
+}
+
+Status VersionSet::VerifyFileMetadata(ColumnFamilyData* cfd,
+ const std::string& fpath, int level,
+ const FileMetaData& meta) {
+ uint64_t fsize = 0;
+ Status status = fs_->GetFileSize(fpath, IOOptions(), &fsize, nullptr);
+ if (status.ok()) {
+ if (fsize != meta.fd.GetFileSize()) {
+ status = Status::Corruption("File size mismatch: " + fpath);
+ }
+ }
+ if (status.ok() && db_options_->verify_sst_unique_id_in_manifest) {
+ assert(cfd);
+ TableCache* table_cache = cfd->table_cache();
+ assert(table_cache);
+
+ const MutableCFOptions* const cf_opts = cfd->GetLatestMutableCFOptions();
+ assert(cf_opts);
+ std::shared_ptr<const SliceTransform> pe = cf_opts->prefix_extractor;
+ size_t max_sz_for_l0_meta_pin = MaxFileSizeForL0MetaPin(*cf_opts);
+
+ const FileOptions& file_opts = file_options();
+
+ Version* version = cfd->current();
+ assert(version);
+ VersionStorageInfo& storage_info = version->storage_info_;
+ const InternalKeyComparator* icmp = storage_info.InternalComparator();
+ assert(icmp);
+
+ InternalStats* internal_stats = cfd->internal_stats();
+
+ FileMetaData meta_copy = meta;
+ status = table_cache->FindTable(
+ ReadOptions(), file_opts, *icmp, meta_copy,
+ &(meta_copy.table_reader_handle), pe,
+ /*no_io=*/false, /*record_read_stats=*/true,
+ internal_stats->GetFileReadHist(level), false, level,
+ /*prefetch_index_and_filter_in_cache*/ false, max_sz_for_l0_meta_pin,
+ meta_copy.temperature);
+ if (meta_copy.table_reader_handle) {
+ table_cache->ReleaseHandle(meta_copy.table_reader_handle);
+ }
+ }
+ return status;
+}
+
+ReactiveVersionSet::ReactiveVersionSet(
+ const std::string& dbname, const ImmutableDBOptions* _db_options,
+ const FileOptions& _file_options, Cache* table_cache,
+ WriteBufferManager* write_buffer_manager, WriteController* write_controller,
+ const std::shared_ptr<IOTracer>& io_tracer)
+ : VersionSet(dbname, _db_options, _file_options, table_cache,
+ write_buffer_manager, write_controller,
+ /*block_cache_tracer=*/nullptr, io_tracer, /*db_id*/ "",
+ /*db_session_id*/ "") {}
+
+ReactiveVersionSet::~ReactiveVersionSet() {}
+
+Status ReactiveVersionSet::Recover(
+ const std::vector<ColumnFamilyDescriptor>& column_families,
+ std::unique_ptr<log::FragmentBufferedReader>* manifest_reader,
+ std::unique_ptr<log::Reader::Reporter>* manifest_reporter,
+ std::unique_ptr<Status>* manifest_reader_status) {
+ assert(manifest_reader != nullptr);
+ assert(manifest_reporter != nullptr);
+ assert(manifest_reader_status != nullptr);
+
+ manifest_reader_status->reset(new Status());
+ manifest_reporter->reset(new LogReporter());
+ static_cast_with_check<LogReporter>(manifest_reporter->get())->status =
+ manifest_reader_status->get();
+ Status s = MaybeSwitchManifest(manifest_reporter->get(), manifest_reader);
+ if (!s.ok()) {
+ return s;
+ }
+ log::Reader* reader = manifest_reader->get();
+ assert(reader);
+
+ manifest_tailer_.reset(new ManifestTailer(
+ column_families, const_cast<ReactiveVersionSet*>(this), io_tracer_));
+
+ manifest_tailer_->Iterate(*reader, manifest_reader_status->get());
+
+ return manifest_tailer_->status();
+}
+
+Status ReactiveVersionSet::ReadAndApply(
+ InstrumentedMutex* mu,
+ std::unique_ptr<log::FragmentBufferedReader>* manifest_reader,
+ Status* manifest_read_status,
+ std::unordered_set<ColumnFamilyData*>* cfds_changed) {
+ assert(manifest_reader != nullptr);
+ assert(cfds_changed != nullptr);
+ mu->AssertHeld();
+
+ Status s;
+ log::Reader* reader = manifest_reader->get();
+ assert(reader);
+ s = MaybeSwitchManifest(reader->GetReporter(), manifest_reader);
+ if (!s.ok()) {
+ return s;
+ }
+ manifest_tailer_->Iterate(*(manifest_reader->get()), manifest_read_status);
+ s = manifest_tailer_->status();
+ if (s.ok()) {
+ *cfds_changed = std::move(manifest_tailer_->GetUpdatedColumnFamilies());
+ }
+
+ return s;
+}
+
+Status ReactiveVersionSet::MaybeSwitchManifest(
+ log::Reader::Reporter* reporter,
+ std::unique_ptr<log::FragmentBufferedReader>* manifest_reader) {
+ assert(manifest_reader != nullptr);
+ Status s;
+ std::string manifest_path;
+ s = GetCurrentManifestPath(dbname_, fs_.get(), &manifest_path,
+ &manifest_file_number_);
+ if (!s.ok()) {
+ return s;
+ }
+ std::unique_ptr<FSSequentialFile> manifest_file;
+ if (manifest_reader->get() != nullptr &&
+ manifest_reader->get()->file()->file_name() == manifest_path) {
+ // CURRENT points to the same MANIFEST as before, no need to switch
+ // MANIFEST.
+ return s;
+ }
+ assert(nullptr == manifest_reader->get() ||
+ manifest_reader->get()->file()->file_name() != manifest_path);
+ s = fs_->FileExists(manifest_path, IOOptions(), nullptr);
+ if (s.IsNotFound()) {
+ return Status::TryAgain(
+ "The primary may have switched to a new MANIFEST and deleted the old "
+ "one.");
+ } else if (!s.ok()) {
+ return s;
+ }
+ TEST_SYNC_POINT(
+ "ReactiveVersionSet::MaybeSwitchManifest:"
+ "AfterGetCurrentManifestPath:0");
+ TEST_SYNC_POINT(
+ "ReactiveVersionSet::MaybeSwitchManifest:"
+ "AfterGetCurrentManifestPath:1");
+ // The primary can also delete the MANIFEST while the secondary is reading
+ // it. This is OK on POSIX. For other file systems, maybe create a hard link
+ // to MANIFEST. The hard link should be cleaned up later by the secondary.
+ s = fs_->NewSequentialFile(manifest_path,
+ fs_->OptimizeForManifestRead(file_options_),
+ &manifest_file, nullptr);
+ std::unique_ptr<SequentialFileReader> manifest_file_reader;
+ if (s.ok()) {
+ manifest_file_reader.reset(new SequentialFileReader(
+ std::move(manifest_file), manifest_path,
+ db_options_->log_readahead_size, io_tracer_, db_options_->listeners));
+ manifest_reader->reset(new log::FragmentBufferedReader(
+ nullptr, std::move(manifest_file_reader), reporter, true /* checksum */,
+ 0 /* log_number */));
+ ROCKS_LOG_INFO(db_options_->info_log, "Switched to new manifest: %s\n",
+ manifest_path.c_str());
+ if (manifest_tailer_) {
+ manifest_tailer_->PrepareToReadNewManifest();
+ }
+ } else if (s.IsPathNotFound()) {
+ // This can happen if the primary switches to a new MANIFEST after the
+ // secondary reads the CURRENT file but before the secondary actually tries
+ // to open the MANIFEST.
+ s = Status::TryAgain(
+ "The primary may have switched to a new MANIFEST and deleted the old "
+ "one.");
+ }
+ return s;
+}
+
+#ifndef NDEBUG
+uint64_t ReactiveVersionSet::TEST_read_edits_in_atomic_group() const {
+ assert(manifest_tailer_);
+ return manifest_tailer_->GetReadBuffer().TEST_read_edits_in_atomic_group();
+}
+#endif // !NDEBUG
+
+std::vector<VersionEdit>& ReactiveVersionSet::replay_buffer() {
+ assert(manifest_tailer_);
+ return manifest_tailer_->GetReadBuffer().replay_buffer();
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/version_set.h b/src/rocksdb/db/version_set.h
new file mode 100644
index 000000000..03176a8b5
--- /dev/null
+++ b/src/rocksdb/db/version_set.h
@@ -0,0 +1,1652 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// The representation of a DBImpl consists of a set of Versions. The
+// newest version is called "current". Older versions may be kept
+// around to provide a consistent view to live iterators.
+//
+// Each Version keeps track of a set of table files per level, as well as a
+// set of blob files. The entire set of versions is maintained in a
+// VersionSet.
+//
+// Version,VersionSet are thread-compatible, but require external
+// synchronization on all accesses.
+
+#pragma once
+#include <atomic>
+#include <deque>
+#include <limits>
+#include <map>
+#include <memory>
+#include <optional>
+#include <set>
+#include <string>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+#include "cache/cache_helpers.h"
+#include "db/blob/blob_file_meta.h"
+#include "db/column_family.h"
+#include "db/compaction/compaction.h"
+#include "db/compaction/compaction_picker.h"
+#include "db/dbformat.h"
+#include "db/file_indexer.h"
+#include "db/log_reader.h"
+#include "db/range_del_aggregator.h"
+#include "db/read_callback.h"
+#include "db/table_cache.h"
+#include "db/version_builder.h"
+#include "db/version_edit.h"
+#include "db/write_controller.h"
+#include "env/file_system_tracer.h"
+#if USE_COROUTINES
+#include "folly/experimental/coro/BlockingWait.h"
+#include "folly/experimental/coro/Collect.h"
+#endif
+#include "monitoring/instrumented_mutex.h"
+#include "options/db_options.h"
+#include "port/port.h"
+#include "rocksdb/env.h"
+#include "rocksdb/file_checksum.h"
+#include "table/get_context.h"
+#include "table/multiget_context.h"
+#include "trace_replay/block_cache_tracer.h"
+#include "util/autovector.h"
+#include "util/coro_utils.h"
+#include "util/hash_containers.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace log {
+class Writer;
+}
+
+class BlobIndex;
+class Compaction;
+class LogBuffer;
+class LookupKey;
+class MemTable;
+class Version;
+class VersionSet;
+class WriteBufferManager;
+class MergeContext;
+class ColumnFamilySet;
+class MergeIteratorBuilder;
+class SystemClock;
+class ManifestTailer;
+class FilePickerMultiGet;
+
+// VersionEdit is always supposed to be valid and is used to point at entries
+// in the Manifest. Ideally it should not be used as a container to carry
+// around a few of its fields as function params, because that can cause
+// readers to think it is a valid entry from the Manifest. To avoid that
+// confusion, we introduce VersionEditParams, which simply carries around
+// multiple VersionEdit params. It need not point to a valid record in the
+// Manifest.
+using VersionEditParams = VersionEdit;
+
+// Return the smallest index i such that file_level.files[i]->largest >= key.
+// Return file_level.num_files if there is no such file.
+// REQUIRES: "file_level.files" contains a sorted list of
+// non-overlapping files.
+extern int FindFile(const InternalKeyComparator& icmp,
+ const LevelFilesBrief& file_level, const Slice& key);
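+// Editor's illustrative example (hypothetical keys, not upstream
+// documentation): for a level whose three files have largest keys "b", "d"
+// and "f", FindFile with key "c" returns index 1 (the first file whose
+// largest key is >= "c"), and with key "g" it returns num_files (3),
+// meaning no file in this level can contain the key.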
+
+// Returns true iff some file in "files" overlaps the user key range
+// [*smallest,*largest].
+// smallest==nullptr represents a key smaller than all keys in the DB.
+// largest==nullptr represents a key larger than all keys in the DB.
+// REQUIRES: If disjoint_sorted_files, file_level.files[]
+// contains disjoint ranges in sorted order.
+extern bool SomeFileOverlapsRange(const InternalKeyComparator& icmp,
+ bool disjoint_sorted_files,
+ const LevelFilesBrief& file_level,
+ const Slice* smallest_user_key,
+ const Slice* largest_user_key);
+
+// Generate LevelFilesBrief from vector<FdWithKeyRange*>
+// Would copy smallest_key and largest_key data to sequential memory
+// arena: Arena used to allocate the memory
+extern void DoGenerateLevelFilesBrief(LevelFilesBrief* file_level,
+ const std::vector<FileMetaData*>& files,
+ Arena* arena);
+
+// Information of the storage associated with each Version, including number of
+// levels of LSM tree, files information at each level, files marked for
+// compaction, blob files, etc.
+class VersionStorageInfo {
+ public:
+ VersionStorageInfo(const InternalKeyComparator* internal_comparator,
+ const Comparator* user_comparator, int num_levels,
+ CompactionStyle compaction_style,
+ VersionStorageInfo* src_vstorage,
+ bool _force_consistency_checks);
+ // No copying allowed
+ VersionStorageInfo(const VersionStorageInfo&) = delete;
+ void operator=(const VersionStorageInfo&) = delete;
+ ~VersionStorageInfo();
+
+ void Reserve(int level, size_t size) { files_[level].reserve(size); }
+
+ void AddFile(int level, FileMetaData* f);
+
+ // Resize/Initialize the space for compact_cursor_
+ void ResizeCompactCursors(int level) {
+ compact_cursor_.resize(level, InternalKey());
+ }
+
+ const std::vector<InternalKey>& GetCompactCursors() const {
+ return compact_cursor_;
+ }
+
+ // REQUIRES: ResizeCompactCursors has been called
+ void AddCursorForOneLevel(int level,
+ const InternalKey& smallest_uncompacted_key) {
+ compact_cursor_[level] = smallest_uncompacted_key;
+ }
+
+ // REQUIRES: lock is held
+ // Update the compact cursor and advance the file index using increment
+ // so that it can point to the next cursor (increment means the number of
+ // input files in this level of the last compaction)
+ const InternalKey& GetNextCompactCursor(int level, size_t increment) {
+ int cmp_idx = next_file_to_compact_by_size_[level] + (int)increment;
+ assert(cmp_idx <= (int)files_by_compaction_pri_[level].size());
+ // TODO(zichen): may need to update next_file_to_compact_by_size_
+ // for parallel compaction.
+ InternalKey new_cursor;
+ if (cmp_idx >= (int)files_by_compaction_pri_[level].size()) {
+ cmp_idx = 0;
+ }
+ // TODO(zichen): rethink if this strategy gives us some good guarantee
+ return files_[level][files_by_compaction_pri_[level][cmp_idx]]->smallest;
+ }
+
+ void ReserveBlob(size_t size) { blob_files_.reserve(size); }
+
+ void AddBlobFile(std::shared_ptr<BlobFileMetaData> blob_file_meta);
+
+ void PrepareForVersionAppend(const ImmutableOptions& immutable_options,
+ const MutableCFOptions& mutable_cf_options);
+
+ // REQUIRES: PrepareForVersionAppend has been called
+ void SetFinalized();
+
+ // Update the accumulated stats from a file-meta.
+ void UpdateAccumulatedStats(FileMetaData* file_meta);
+
+ // Decrease the current stat from a to-be-deleted file-meta
+ void RemoveCurrentStats(FileMetaData* file_meta);
+
+ // Updates internal structures that keep track of compaction scores
+ // We use compaction scores to figure out which compaction to do next
+ // REQUIRES: db_mutex held!!
+ // TODO find a better way to pass compaction_options_fifo.
+ void ComputeCompactionScore(const ImmutableOptions& immutable_options,
+ const MutableCFOptions& mutable_cf_options);
+
+ // Estimate est_comp_needed_bytes_
+ void EstimateCompactionBytesNeeded(
+ const MutableCFOptions& mutable_cf_options);
+
+ // This computes files_marked_for_compaction_ and is called by
+ // ComputeCompactionScore()
+ void ComputeFilesMarkedForCompaction();
+
+ // This computes ttl_expired_files_ and is called by
+ // ComputeCompactionScore()
+ void ComputeExpiredTtlFiles(const ImmutableOptions& ioptions,
+ const uint64_t ttl);
+
+ // This computes files_marked_for_periodic_compaction_ and is called by
+ // ComputeCompactionScore()
+ void ComputeFilesMarkedForPeriodicCompaction(
+ const ImmutableOptions& ioptions,
+ const uint64_t periodic_compaction_seconds);
+
+ // This computes bottommost_files_marked_for_compaction_ and is called by
+ // ComputeCompactionScore() or UpdateOldestSnapshot().
+ //
+ // Among bottommost files (assumes they've already been computed), marks the
+ // ones that have keys that would be eliminated if recompacted, according to
+ // the seqnum of the oldest existing snapshot. Must be called every time
+ // oldest snapshot changes as that is when bottom-level files can become
+ // eligible for compaction.
+ //
+ // REQUIRES: DB mutex held
+ void ComputeBottommostFilesMarkedForCompaction();
+
+ // This computes files_marked_for_forced_blob_gc_ and is called by
+ // ComputeCompactionScore()
+ //
+ // REQUIRES: DB mutex held
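+  //
+  // For example (assuming the usual interpretation of these options), an age
+  // cutoff of 0.25 restricts the computation to the oldest quarter of blob
+  // files, and the referencing SST files are marked once the garbage ratio
+  // within that batch reaches blob_garbage_collection_force_threshold.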
+ void ComputeFilesMarkedForForcedBlobGC(
+ double blob_garbage_collection_age_cutoff,
+ double blob_garbage_collection_force_threshold);
+
+ bool level0_non_overlapping() const { return level0_non_overlapping_; }
+
+ // Updates the oldest snapshot and related internal state, like the bottommost
+ // files marked for compaction.
+ // REQUIRES: DB mutex held
+ void UpdateOldestSnapshot(SequenceNumber oldest_snapshot_seqnum);
+
+ int MaxInputLevel() const;
+ int MaxOutputLevel(bool allow_ingest_behind) const;
+
+ // Return level number that has idx'th highest score
+ int CompactionScoreLevel(int idx) const { return compaction_level_[idx]; }
+
+ // Return idx'th highest score
+ double CompactionScore(int idx) const { return compaction_score_[idx]; }
+
+ void GetOverlappingInputs(
+ int level, const InternalKey* begin, // nullptr means before all keys
+ const InternalKey* end, // nullptr means after all keys
+ std::vector<FileMetaData*>* inputs,
+ int hint_index = -1, // index of overlap file
+ int* file_index = nullptr, // return index of overlap file
+ bool expand_range = true, // if set, returns files which overlap the
+ // range and overlap each other. If false,
+ // then just files intersecting the range
+ InternalKey** next_smallest = nullptr) // if non-null, returns the
+ const; // smallest key of next file not included
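+  // Illustrative sketch only for GetOverlappingInputs above, assuming
+  // `vstorage` is a saved VersionStorageInfo and `begin`/`end` are
+  // InternalKeys bounding the range of interest:
+  //
+  //   std::vector<FileMetaData*> inputs;
+  //   vstorage->GetOverlappingInputs(level, &begin, &end, &inputs);
+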
+ void GetCleanInputsWithinInterval(
+ int level, const InternalKey* begin, // nullptr means before all keys
+ const InternalKey* end, // nullptr means after all keys
+ std::vector<FileMetaData*>* inputs,
+ int hint_index = -1, // index of overlap file
+ int* file_index = nullptr) // return index of overlap file
+ const;
+
+ void GetOverlappingInputsRangeBinarySearch(
+ int level, // level > 0
+ const InternalKey* begin, // nullptr means before all keys
+ const InternalKey* end, // nullptr means after all keys
+ std::vector<FileMetaData*>* inputs,
+ int hint_index, // index of overlap file
+ int* file_index, // return index of overlap file
+ bool within_interval = false, // if set, force the inputs within interval
+ InternalKey** next_smallest = nullptr) // if non-null, returns the
+ const; // smallest key of next file not included
+
+ // Returns true iff some file in the specified level overlaps
+ // some part of [*smallest_user_key,*largest_user_key].
+ // smallest_user_key==NULL represents a key smaller than all keys in the DB.
+  // largest_user_key==NULL represents a key larger than all keys in the DB.
+ bool OverlapInLevel(int level, const Slice* smallest_user_key,
+ const Slice* largest_user_key);
+
+ // Returns true iff the first or last file in inputs contains
+ // an overlapping user key to the file "just outside" of it (i.e.
+ // just after the last file, or just before the first file)
+ // REQUIRES: "*inputs" is a sorted list of non-overlapping files
+ bool HasOverlappingUserKey(const std::vector<FileMetaData*>* inputs,
+ int level);
+
+ int num_levels() const { return num_levels_; }
+
+ // REQUIRES: PrepareForVersionAppend has been called
+ int num_non_empty_levels() const {
+ assert(finalized_);
+ return num_non_empty_levels_;
+ }
+
+ // REQUIRES: PrepareForVersionAppend has been called
+  // This may or may not equal the number of L0 files; it exists to keep
+  // backward-compatible behavior for universal compaction.
+ int l0_delay_trigger_count() const { return l0_delay_trigger_count_; }
+
+ void set_l0_delay_trigger_count(int v) { l0_delay_trigger_count_ = v; }
+
+ // REQUIRES: This version has been saved (see VersionBuilder::SaveTo)
+ int NumLevelFiles(int level) const {
+ assert(finalized_);
+ return static_cast<int>(files_[level].size());
+ }
+
+ // Return the combined file size of all files at the specified level.
+ uint64_t NumLevelBytes(int level) const;
+
+ // REQUIRES: This version has been saved (see VersionBuilder::SaveTo)
+ const std::vector<FileMetaData*>& LevelFiles(int level) const {
+ return files_[level];
+ }
+
+ class FileLocation {
+ public:
+ FileLocation() = default;
+ FileLocation(int level, size_t position)
+ : level_(level), position_(position) {}
+
+ int GetLevel() const { return level_; }
+ size_t GetPosition() const { return position_; }
+
+ bool IsValid() const { return level_ >= 0; }
+
+ bool operator==(const FileLocation& rhs) const {
+ return level_ == rhs.level_ && position_ == rhs.position_;
+ }
+
+ bool operator!=(const FileLocation& rhs) const { return !(*this == rhs); }
+
+ static FileLocation Invalid() { return FileLocation(); }
+
+ private:
+ int level_ = -1;
+ size_t position_ = 0;
+ };
+
+ // REQUIRES: PrepareForVersionAppend has been called
+ FileLocation GetFileLocation(uint64_t file_number) const {
+ const auto it = file_locations_.find(file_number);
+
+ if (it == file_locations_.end()) {
+ return FileLocation::Invalid();
+ }
+
+ assert(it->second.GetLevel() < num_levels_);
+ assert(it->second.GetPosition() < files_[it->second.GetLevel()].size());
+ assert(files_[it->second.GetLevel()][it->second.GetPosition()]);
+ assert(files_[it->second.GetLevel()][it->second.GetPosition()]
+ ->fd.GetNumber() == file_number);
+
+ return it->second;
+ }
+
+ // REQUIRES: PrepareForVersionAppend has been called
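+  // Illustrative sketch only: resolving a table file number to its metadata,
+  // assuming `vstorage` points to a prepared VersionStorageInfo:
+  //
+  //   if (const FileMetaData* meta =
+  //           vstorage->GetFileMetaDataByNumber(file_number)) {
+  //     assert(meta->fd.GetNumber() == file_number);
+  //   }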
+ FileMetaData* GetFileMetaDataByNumber(uint64_t file_number) const {
+ auto location = GetFileLocation(file_number);
+
+ if (!location.IsValid()) {
+ return nullptr;
+ }
+
+ return files_[location.GetLevel()][location.GetPosition()];
+ }
+
+ // REQUIRES: This version has been saved (see VersionBuilder::SaveTo)
+ using BlobFiles = std::vector<std::shared_ptr<BlobFileMetaData>>;
+ const BlobFiles& GetBlobFiles() const { return blob_files_; }
+
+ // REQUIRES: This version has been saved (see VersionBuilder::SaveTo)
+ BlobFiles::const_iterator GetBlobFileMetaDataLB(
+ uint64_t blob_file_number) const;
+
+ // REQUIRES: This version has been saved (see VersionBuilder::SaveTo)
+ std::shared_ptr<BlobFileMetaData> GetBlobFileMetaData(
+ uint64_t blob_file_number) const {
+ const auto it = GetBlobFileMetaDataLB(blob_file_number);
+
+ assert(it == blob_files_.end() || *it);
+
+ if (it != blob_files_.end() &&
+ (*it)->GetBlobFileNumber() == blob_file_number) {
+ return *it;
+ }
+
+ return std::shared_ptr<BlobFileMetaData>();
+ }
+
+ // REQUIRES: This version has been saved (see VersionBuilder::SaveTo)
+ struct BlobStats {
+ uint64_t total_file_size = 0;
+ uint64_t total_garbage_size = 0;
+ double space_amp = 0.0;
+ };
+
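+  // Aggregates blob file size and garbage statistics over blob_files_. Worked
+  // example: 100 GB of blob files carrying 25 GB of garbage gives
+  // space_amp = 100 / (100 - 25), i.e. roughly 1.33; if the garbage size
+  // equals or exceeds the total blob file size, space_amp is left at 0.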
+ BlobStats GetBlobStats() const {
+ uint64_t total_file_size = 0;
+ uint64_t total_garbage_size = 0;
+
+ for (const auto& meta : blob_files_) {
+ assert(meta);
+
+ total_file_size += meta->GetBlobFileSize();
+ total_garbage_size += meta->GetGarbageBlobBytes();
+ }
+
+ double space_amp = 0.0;
+ if (total_file_size > total_garbage_size) {
+ space_amp = static_cast<double>(total_file_size) /
+ (total_file_size - total_garbage_size);
+ }
+
+ return BlobStats{total_file_size, total_garbage_size, space_amp};
+ }
+
+ const ROCKSDB_NAMESPACE::LevelFilesBrief& LevelFilesBrief(int level) const {
+ assert(level < static_cast<int>(level_files_brief_.size()));
+ return level_files_brief_[level];
+ }
+
+ // REQUIRES: PrepareForVersionAppend has been called
+ const std::vector<int>& FilesByCompactionPri(int level) const {
+ assert(finalized_);
+ return files_by_compaction_pri_[level];
+ }
+
+ // REQUIRES: ComputeCompactionScore has been called
+ // REQUIRES: DB mutex held during access
+ const autovector<std::pair<int, FileMetaData*>>& FilesMarkedForCompaction()
+ const {
+ assert(finalized_);
+ return files_marked_for_compaction_;
+ }
+
+ // REQUIRES: ComputeCompactionScore has been called
+ // REQUIRES: DB mutex held during access
+ const autovector<std::pair<int, FileMetaData*>>& ExpiredTtlFiles() const {
+ assert(finalized_);
+ return expired_ttl_files_;
+ }
+
+ // REQUIRES: ComputeCompactionScore has been called
+ // REQUIRES: DB mutex held during access
+ const autovector<std::pair<int, FileMetaData*>>&
+ FilesMarkedForPeriodicCompaction() const {
+ assert(finalized_);
+ return files_marked_for_periodic_compaction_;
+ }
+
+ void TEST_AddFileMarkedForPeriodicCompaction(int level, FileMetaData* f) {
+ files_marked_for_periodic_compaction_.emplace_back(level, f);
+ }
+
+ // REQUIRES: ComputeCompactionScore has been called
+ // REQUIRES: DB mutex held during access
+ const autovector<std::pair<int, FileMetaData*>>&
+ BottommostFilesMarkedForCompaction() const {
+ assert(finalized_);
+ return bottommost_files_marked_for_compaction_;
+ }
+
+ // REQUIRES: ComputeCompactionScore has been called
+ // REQUIRES: DB mutex held during access
+ const autovector<std::pair<int, FileMetaData*>>& FilesMarkedForForcedBlobGC()
+ const {
+ assert(finalized_);
+ return files_marked_for_forced_blob_gc_;
+ }
+
+ int base_level() const { return base_level_; }
+ double level_multiplier() const { return level_multiplier_; }
+
+ // REQUIRES: lock is held
+ // Set the index that is used to offset into files_by_compaction_pri_ to find
+ // the next compaction candidate file.
+ void SetNextCompactionIndex(int level, int index) {
+ next_file_to_compact_by_size_[level] = index;
+ }
+
+ // REQUIRES: lock is held
+ int NextCompactionIndex(int level) const {
+ return next_file_to_compact_by_size_[level];
+ }
+
+ // REQUIRES: PrepareForVersionAppend has been called
+ const FileIndexer& file_indexer() const {
+ assert(finalized_);
+ return file_indexer_;
+ }
+
+ // Only the first few entries of files_by_compaction_pri_ are sorted.
+ // There is no need to sort all the files because it is likely
+ // that on a running system, we need to look at only the first
+ // few largest files because a new version is created every few
+ // seconds/minutes (because of concurrent compactions).
+ static const size_t kNumberFilesToSort = 50;
+
+ // Return a human-readable short (single-line) summary of the number
+ // of files per level. Uses *scratch as backing store.
+ struct LevelSummaryStorage {
+ char buffer[1000];
+ };
+ struct FileSummaryStorage {
+ char buffer[3000];
+ };
+ const char* LevelSummary(LevelSummaryStorage* scratch) const;
+ // Return a human-readable short (single-line) summary of files
+ // in a specified level. Uses *scratch as backing store.
+ const char* LevelFileSummary(FileSummaryStorage* scratch, int level) const;
+
+ // Return the maximum overlapping data (in bytes) at next level for any
+ // file at a level >= 1.
+ uint64_t MaxNextLevelOverlappingBytes();
+
+ // Return a human readable string that describes this version's contents.
+ std::string DebugString(bool hex = false) const;
+
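+  // Estimates the average on-disk value size from the sampled stats below:
+  // the average raw value size, accumulated_raw_value_size_ /
+  // accumulated_num_non_deletions_, scaled by the sampled size ratio
+  // accumulated_file_size_ / (accumulated_raw_key_size_ +
+  // accumulated_raw_value_size_).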
+ uint64_t GetAverageValueSize() const {
+ if (accumulated_num_non_deletions_ == 0) {
+ return 0;
+ }
+ assert(accumulated_raw_key_size_ + accumulated_raw_value_size_ > 0);
+ assert(accumulated_file_size_ > 0);
+ return accumulated_raw_value_size_ / accumulated_num_non_deletions_ *
+ accumulated_file_size_ /
+ (accumulated_raw_key_size_ + accumulated_raw_value_size_);
+ }
+
+ uint64_t GetEstimatedActiveKeys() const;
+
+ double GetEstimatedCompressionRatioAtLevel(int level) const;
+
+ // re-initializes the index that is used to offset into
+ // files_by_compaction_pri_
+ // to find the next compaction candidate file.
+ void ResetNextCompactionIndex(int level) {
+ next_file_to_compact_by_size_[level] = 0;
+ }
+
+ const InternalKeyComparator* InternalComparator() const {
+ return internal_comparator_;
+ }
+
+ // Returns maximum total bytes of data on a given level.
+ uint64_t MaxBytesForLevel(int level) const;
+
+ // Returns an estimate of the amount of live data in bytes.
+ uint64_t EstimateLiveDataSize() const;
+
+ uint64_t estimated_compaction_needed_bytes() const {
+ return estimated_compaction_needed_bytes_;
+ }
+
+ void TEST_set_estimated_compaction_needed_bytes(uint64_t v) {
+ estimated_compaction_needed_bytes_ = v;
+ }
+
+ bool force_consistency_checks() const { return force_consistency_checks_; }
+
+ SequenceNumber bottommost_files_mark_threshold() const {
+ return bottommost_files_mark_threshold_;
+ }
+
+  // Returns whether any key in [`smallest_user_key`, `largest_user_key`] could
+  // appear in an older L0 file than `last_l0_idx` or in a greater level than
+  // `last_level`.
+ //
+ // @param last_level Level after which we check for overlap
+ // @param last_l0_idx If `last_level == 0`, index of L0 file after which we
+ // check for overlap; otherwise, must be -1
+ bool RangeMightExistAfterSortedRun(const Slice& smallest_user_key,
+ const Slice& largest_user_key,
+ int last_level, int last_l0_idx);
+
+ private:
+ void ComputeCompensatedSizes();
+ void UpdateNumNonEmptyLevels();
+ void CalculateBaseBytes(const ImmutableOptions& ioptions,
+ const MutableCFOptions& options);
+ void UpdateFilesByCompactionPri(const ImmutableOptions& immutable_options,
+ const MutableCFOptions& mutable_cf_options);
+
+ void GenerateFileIndexer() {
+ file_indexer_.UpdateIndex(&arena_, num_non_empty_levels_, files_);
+ }
+
+ void GenerateLevelFilesBrief();
+ void GenerateLevel0NonOverlapping();
+ void GenerateBottommostFiles();
+ void GenerateFileLocationIndex();
+
+ const InternalKeyComparator* internal_comparator_;
+ const Comparator* user_comparator_;
+ int num_levels_; // Number of levels
+  int num_non_empty_levels_;  // Any level at or above this count is
+                              // guaranteed to be empty.
+ // Per-level max bytes
+ std::vector<uint64_t> level_max_bytes_;
+
+ // A short brief metadata of files per level
+ autovector<ROCKSDB_NAMESPACE::LevelFilesBrief> level_files_brief_;
+ FileIndexer file_indexer_;
+  Arena arena_;  // Used to allocate space for level_files_brief_
+
+ CompactionStyle compaction_style_;
+
+ // List of files per level, files in each level are arranged
+ // in increasing order of keys
+ std::vector<FileMetaData*>* files_;
+
+ // Map of all table files in version. Maps file number to (level, position on
+ // level).
+ using FileLocations = UnorderedMap<uint64_t, FileLocation>;
+ FileLocations file_locations_;
+
+ // Vector of blob files in version sorted by blob file number.
+ BlobFiles blob_files_;
+
+ // Level that L0 data should be compacted to. All levels < base_level_ should
+  // be empty. -1 if it is not level compaction (not applicable).
+ int base_level_;
+
+ double level_multiplier_;
+
+  // A list of the same set of files that are stored in files_, but files in
+  // each level are sorted by compaction priority (for example, by compensated
+  // file size, largest first). This vector stores indexes into files_.
+ std::vector<std::vector<int>> files_by_compaction_pri_;
+
+  // If true, files in L0 have keys with non-overlapping ranges
+ bool level0_non_overlapping_;
+
+ // An index into files_by_compaction_pri_ that specifies the first
+ // file that is not yet compacted
+ std::vector<int> next_file_to_compact_by_size_;
+
+ // Only the first few entries of files_by_compaction_pri_ are sorted.
+ // There is no need to sort all the files because it is likely
+ // that on a running system, we need to look at only the first
+ // few largest files because a new version is created every few
+ // seconds/minutes (because of concurrent compactions).
+ static const size_t number_of_files_to_sort_ = 50;
+
+  // This vector contains the list of files marked for compaction that are not
+  // currently being compacted. It is protected by the DB mutex and is
+  // calculated in ComputeCompactionScore().
+ autovector<std::pair<int, FileMetaData*>> files_marked_for_compaction_;
+
+ autovector<std::pair<int, FileMetaData*>> expired_ttl_files_;
+
+ autovector<std::pair<int, FileMetaData*>>
+ files_marked_for_periodic_compaction_;
+
+ // These files are considered bottommost because none of their keys can exist
+ // at lower levels. They are not necessarily all in the same level. The marked
+ // ones are eligible for compaction because they contain duplicate key
+ // versions that are no longer protected by snapshot. These variables are
+ // protected by DB mutex and are calculated in `GenerateBottommostFiles()` and
+ // `ComputeBottommostFilesMarkedForCompaction()`.
+ autovector<std::pair<int, FileMetaData*>> bottommost_files_;
+ autovector<std::pair<int, FileMetaData*>>
+ bottommost_files_marked_for_compaction_;
+
+ autovector<std::pair<int, FileMetaData*>> files_marked_for_forced_blob_gc_;
+
+ // Threshold for needing to mark another bottommost file. Maintain it so we
+ // can quickly check when releasing a snapshot whether more bottommost files
+ // became eligible for compaction. It's defined as the min of the max nonzero
+ // seqnums of unmarked bottommost files.
+ SequenceNumber bottommost_files_mark_threshold_ = kMaxSequenceNumber;
+
+ // Monotonically increases as we release old snapshots. Zero indicates no
+ // snapshots have been released yet. When no snapshots remain we set it to the
+ // current seqnum, which needs to be protected as a snapshot can still be
+ // created that references it.
+ SequenceNumber oldest_snapshot_seqnum_ = 0;
+
+ // Level that should be compacted next and its compaction score.
+ // Score < 1 means compaction is not strictly needed. These fields
+ // are initialized by ComputeCompactionScore.
+ // The most critical level to be compacted is listed first
+ // These are used to pick the best compaction level
+ std::vector<double> compaction_score_;
+ std::vector<int> compaction_level_;
+  int l0_delay_trigger_count_ = 0;  // L0 file count used to trigger write
+                                    // slowdown and stop.
+
+ // Compact cursors for round-robin compactions in each level
+ std::vector<InternalKey> compact_cursor_;
+
+ // the following are the sampled temporary stats.
+ // the current accumulated size of sampled files.
+ uint64_t accumulated_file_size_;
+ // the current accumulated size of all raw keys based on the sampled files.
+ uint64_t accumulated_raw_key_size_;
+  // the current accumulated size of all raw values based on the sampled files.
+ uint64_t accumulated_raw_value_size_;
+ // total number of non-deletion entries
+ uint64_t accumulated_num_non_deletions_;
+ // total number of deletion entries
+ uint64_t accumulated_num_deletions_;
+  // current number of non-deletion entries
+ uint64_t current_num_non_deletions_;
+ // current number of deletion entries
+ uint64_t current_num_deletions_;
+ // current number of file samples
+ uint64_t current_num_samples_;
+ // Estimated bytes needed to be compacted until all levels' size is down to
+ // target sizes.
+ uint64_t estimated_compaction_needed_bytes_;
+
+ bool finalized_;
+
+ // If set to true, we will run consistency checks even if RocksDB
+ // is compiled in release mode
+ bool force_consistency_checks_;
+
+ friend class Version;
+ friend class VersionSet;
+};
+
+struct ObsoleteFileInfo {
+ FileMetaData* metadata;
+ std::string path;
+  // If true, the FileMetaData should be destroyed but the file should
+  // not be deleted. This is because another FileMetaData still references
+  // the file, usually because the file was trivially moved so two FileMetaData
+  // objects are managing the file.
+ bool only_delete_metadata = false;
+
+ ObsoleteFileInfo() noexcept
+ : metadata(nullptr), only_delete_metadata(false) {}
+ ObsoleteFileInfo(FileMetaData* f, const std::string& file_path,
+ std::shared_ptr<CacheReservationManager>
+ file_metadata_cache_res_mgr_arg = nullptr)
+ : metadata(f),
+ path(file_path),
+ only_delete_metadata(false),
+ file_metadata_cache_res_mgr(file_metadata_cache_res_mgr_arg) {}
+
+ ObsoleteFileInfo(const ObsoleteFileInfo&) = delete;
+ ObsoleteFileInfo& operator=(const ObsoleteFileInfo&) = delete;
+
+ ObsoleteFileInfo(ObsoleteFileInfo&& rhs) noexcept : ObsoleteFileInfo() {
+ *this = std::move(rhs);
+ }
+
+ ObsoleteFileInfo& operator=(ObsoleteFileInfo&& rhs) noexcept {
+ path = std::move(rhs.path);
+ metadata = rhs.metadata;
+ rhs.metadata = nullptr;
+ file_metadata_cache_res_mgr = rhs.file_metadata_cache_res_mgr;
+ rhs.file_metadata_cache_res_mgr = nullptr;
+
+ return *this;
+ }
+ void DeleteMetadata() {
+ if (file_metadata_cache_res_mgr) {
+ Status s = file_metadata_cache_res_mgr->UpdateCacheReservation(
+ metadata->ApproximateMemoryUsage(), false /* increase */);
+ s.PermitUncheckedError();
+ }
+ delete metadata;
+ metadata = nullptr;
+ }
+
+ private:
+ std::shared_ptr<CacheReservationManager> file_metadata_cache_res_mgr;
+};
+
+class ObsoleteBlobFileInfo {
+ public:
+ ObsoleteBlobFileInfo(uint64_t blob_file_number, std::string path)
+ : blob_file_number_(blob_file_number), path_(std::move(path)) {}
+
+ uint64_t GetBlobFileNumber() const { return blob_file_number_; }
+ const std::string& GetPath() const { return path_; }
+
+ private:
+ uint64_t blob_file_number_;
+ std::string path_;
+};
+
+using MultiGetRange = MultiGetContext::Range;
+// A column family's version consists of the table and blob files owned by
+// the column family at a certain point in time.
+class Version {
+ public:
+ // Append to *iters a sequence of iterators that will
+ // yield the contents of this Version when merged together.
+ // @param read_options Must outlive any iterator built by
+ // `merger_iter_builder`.
+ void AddIterators(const ReadOptions& read_options,
+ const FileOptions& soptions,
+ MergeIteratorBuilder* merger_iter_builder,
+ bool allow_unprepared_value);
+
+ // @param read_options Must outlive any iterator built by
+ // `merger_iter_builder`.
+ void AddIteratorsForLevel(const ReadOptions& read_options,
+ const FileOptions& soptions,
+ MergeIteratorBuilder* merger_iter_builder,
+ int level, bool allow_unprepared_value);
+
+ Status OverlapWithLevelIterator(const ReadOptions&, const FileOptions&,
+ const Slice& smallest_user_key,
+ const Slice& largest_user_key, int level,
+ bool* overlap);
+
+  // Look up the value for key, or get all merge operands for key.
+ // If do_merge = true (default) then lookup value for key.
+ // Behavior if do_merge = true:
+ // If found, store it in *value and
+ // return OK. Else return a non-OK status.
+ // Uses *operands to store merge_operator operations to apply later.
+ //
+ // If the ReadOptions.read_tier is set to do a read-only fetch, then
+ // *value_found will be set to false if it cannot be determined whether
+ // this value exists without doing IO.
+ //
+ // If the key is Deleted, *status will be set to NotFound and
+ // *key_exists will be set to true.
+ // If no key was found, *status will be set to NotFound and
+ // *key_exists will be set to false.
+ // If seq is non-null, *seq will be set to the sequence number found
+ // for the key if a key was found.
+ // Behavior if do_merge = false
+ // If the key has any merge operands then store them in
+ // merge_context.operands_list and don't merge the operands
+ // REQUIRES: lock is not held
+ // REQUIRES: pinned_iters_mgr != nullptr
+ void Get(const ReadOptions&, const LookupKey& key, PinnableSlice* value,
+ PinnableWideColumns* columns, std::string* timestamp, Status* status,
+ MergeContext* merge_context,
+ SequenceNumber* max_covering_tombstone_seq,
+ PinnedIteratorsManager* pinned_iters_mgr,
+ bool* value_found = nullptr, bool* key_exists = nullptr,
+ SequenceNumber* seq = nullptr, ReadCallback* callback = nullptr,
+ bool* is_blob = nullptr, bool do_merge = true);
+
+ void MultiGet(const ReadOptions&, MultiGetRange* range,
+ ReadCallback* callback = nullptr);
+
+ // Interprets blob_index_slice as a blob reference, and (assuming the
+ // corresponding blob file is part of this Version) retrieves the blob and
+ // saves it in *value.
+ // REQUIRES: blob_index_slice stores an encoded blob reference
+ Status GetBlob(const ReadOptions& read_options, const Slice& user_key,
+ const Slice& blob_index_slice,
+ FilePrefetchBuffer* prefetch_buffer, PinnableSlice* value,
+ uint64_t* bytes_read) const;
+
+ // Retrieves a blob using a blob reference and saves it in *value,
+ // assuming the corresponding blob file is part of this Version.
+ Status GetBlob(const ReadOptions& read_options, const Slice& user_key,
+ const BlobIndex& blob_index,
+ FilePrefetchBuffer* prefetch_buffer, PinnableSlice* value,
+ uint64_t* bytes_read) const;
+
+ using BlobReadContext =
+ std::pair<BlobIndex, std::reference_wrapper<const KeyContext>>;
+ using BlobReadContexts = std::vector<BlobReadContext>;
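+  // Retrieves the blobs referenced by `blob_ctxs`, a map from blob file
+  // number to the (BlobIndex, KeyContext) pairs collected during the table
+  // lookups (see MultiGetFromSST), and stores the resulting values in the
+  // corresponding key contexts of `range`.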
+ void MultiGetBlob(const ReadOptions& read_options, MultiGetRange& range,
+ std::unordered_map<uint64_t, BlobReadContexts>& blob_ctxs);
+
+ // Loads some stats information from files (if update_stats is set) and
+ // populates derived data structures. Call without mutex held. It needs to be
+ // called before appending the version to the version set.
+ void PrepareAppend(const MutableCFOptions& mutable_cf_options,
+ bool update_stats);
+
+ // Reference count management (so Versions do not disappear out from
+ // under live iterators)
+ void Ref();
+ // Decrease reference count. Delete the object if no reference left
+ // and return true. Otherwise, return false.
+ bool Unref();
+
+ // Add all files listed in the current version to *live_table_files and
+ // *live_blob_files.
+ void AddLiveFiles(std::vector<uint64_t>* live_table_files,
+ std::vector<uint64_t>* live_blob_files) const;
+
+ // Remove live files that are in the delete candidate lists.
+ void RemoveLiveFiles(
+ std::vector<ObsoleteFileInfo>& sst_delete_candidates,
+ std::vector<ObsoleteBlobFileInfo>& blob_delete_candidates) const;
+
+ // Return a human readable string that describes this version's contents.
+ std::string DebugString(bool hex = false, bool print_stats = false) const;
+
+ // Returns the version number of this version
+ uint64_t GetVersionNumber() const { return version_number_; }
+
+ // REQUIRES: lock is held
+  // On success, "tp" will contain the table properties of the file
+ // specified in "file_meta". If the file name of "file_meta" is
+ // known ahead, passing it by a non-null "fname" can save a
+ // file-name conversion.
+ Status GetTableProperties(std::shared_ptr<const TableProperties>* tp,
+ const FileMetaData* file_meta,
+ const std::string* fname = nullptr) const;
+
+ // REQUIRES: lock is held
+ // On success, *props will be populated with all SSTables' table properties.
+ // The keys of `props` are the sst file name, the values of `props` are the
+ // tables' properties, represented as std::shared_ptr.
+ Status GetPropertiesOfAllTables(TablePropertiesCollection* props);
+ Status GetPropertiesOfAllTables(TablePropertiesCollection* props, int level);
+ Status GetPropertiesOfTablesInRange(const Range* range, std::size_t n,
+ TablePropertiesCollection* props) const;
+
+ // Print summary of range delete tombstones in SST files into out_str,
+ // with maximum max_entries_to_print entries printed out.
+ Status TablesRangeTombstoneSummary(int max_entries_to_print,
+ std::string* out_str);
+
+ // REQUIRES: lock is held
+  // On success, "tp" will contain the table properties aggregated over all
+  // SST files in this version.
+ Status GetAggregatedTableProperties(
+ std::shared_ptr<const TableProperties>* tp, int level = -1);
+
+ uint64_t GetEstimatedActiveKeys() {
+ return storage_info_.GetEstimatedActiveKeys();
+ }
+
+ size_t GetMemoryUsageByTableReaders();
+
+ ColumnFamilyData* cfd() const { return cfd_; }
+
+ // Return the next Version in the linked list.
+ Version* Next() const { return next_; }
+
+ int TEST_refs() const { return refs_; }
+
+ VersionStorageInfo* storage_info() { return &storage_info_; }
+ const VersionStorageInfo* storage_info() const { return &storage_info_; }
+
+ VersionSet* version_set() { return vset_; }
+
+ void GetColumnFamilyMetaData(ColumnFamilyMetaData* cf_meta);
+
+ uint64_t GetSstFilesSize();
+
+ // Retrieves the file_creation_time of the oldest file in the DB.
+ // Prerequisite for this API is max_open_files = -1
+ void GetCreationTimeOfOldestFile(uint64_t* creation_time);
+
+ const MutableCFOptions& GetMutableCFOptions() { return mutable_cf_options_; }
+
+ InternalIterator* TEST_GetLevelIterator(
+ const ReadOptions& read_options, MergeIteratorBuilder* merge_iter_builder,
+ int level, bool allow_unprepared_value);
+
+ private:
+ Env* env_;
+ SystemClock* clock_;
+
+ friend class ReactiveVersionSet;
+ friend class VersionSet;
+ friend class VersionEditHandler;
+ friend class VersionEditHandlerPointInTime;
+
+ const InternalKeyComparator* internal_comparator() const {
+ return storage_info_.internal_comparator_;
+ }
+ const Comparator* user_comparator() const {
+ return storage_info_.user_comparator_;
+ }
+
+ // Returns true if the filter blocks in the specified level will not be
+ // checked during read operations. In certain cases (trivial move or preload),
+ // the filter block may already be cached, but we still do not access it such
+ // that it eventually expires from the cache.
+ bool IsFilterSkipped(int level, bool is_file_last_in_level = false);
+
+ // The helper function of UpdateAccumulatedStats, which may fill the missing
+ // fields of file_meta from its associated TableProperties.
+ // Returns true if it does initialize FileMetaData.
+ bool MaybeInitializeFileMetaData(FileMetaData* file_meta);
+
+ // Update the accumulated stats associated with the current version.
+ // This accumulated stats will be used in compaction.
+ void UpdateAccumulatedStats();
+
+ DECLARE_SYNC_AND_ASYNC(
+ /* ret_type */ Status, /* func_name */ MultiGetFromSST,
+ const ReadOptions& read_options, MultiGetRange file_range,
+ int hit_file_level, bool skip_filters, bool skip_range_deletions,
+ FdWithKeyRange* f,
+ std::unordered_map<uint64_t, BlobReadContexts>& blob_ctxs,
+ Cache::Handle* table_handle, uint64_t& num_filter_read,
+ uint64_t& num_index_read, uint64_t& num_sst_read);
+
+#ifdef USE_COROUTINES
+ // MultiGet using async IO to read data blocks from SST files in parallel
+ // within and across levels
+ Status MultiGetAsync(
+ const ReadOptions& options, MultiGetRange* range,
+ std::unordered_map<uint64_t, BlobReadContexts>* blob_ctxs);
+
+  // A helper function to look up a batch of keys in a single level. It will
+ // queue coroutine tasks to mget_tasks. It may also split the input batch
+ // by creating a new batch with keys definitely not in this level and
+ // enqueuing it to to_process.
+ Status ProcessBatch(
+ const ReadOptions& read_options, FilePickerMultiGet* batch,
+ std::vector<folly::coro::Task<Status>>& mget_tasks,
+ std::unordered_map<uint64_t, BlobReadContexts>* blob_ctxs,
+ autovector<FilePickerMultiGet, 4>& batches, std::deque<size_t>& waiting,
+ std::deque<size_t>& to_process, unsigned int& num_tasks_queued,
+ std::unordered_map<int, std::tuple<uint64_t, uint64_t, uint64_t>>&
+ mget_stats);
+#endif
+
+ ColumnFamilyData* cfd_; // ColumnFamilyData to which this Version belongs
+ Logger* info_log_;
+ Statistics* db_statistics_;
+ TableCache* table_cache_;
+ BlobSource* blob_source_;
+ const MergeOperator* merge_operator_;
+
+ VersionStorageInfo storage_info_;
+ VersionSet* vset_; // VersionSet to which this Version belongs
+ Version* next_; // Next version in linked list
+ Version* prev_; // Previous version in linked list
+ int refs_; // Number of live refs to this version
+ const FileOptions file_options_;
+ const MutableCFOptions mutable_cf_options_;
+ // Cached value to avoid recomputing it on every read.
+ const size_t max_file_size_for_l0_meta_pin_;
+
+ // A version number that uniquely represents this version. This is
+ // used for debugging and logging purposes only.
+ uint64_t version_number_;
+ std::shared_ptr<IOTracer> io_tracer_;
+
+ Version(ColumnFamilyData* cfd, VersionSet* vset, const FileOptions& file_opt,
+ MutableCFOptions mutable_cf_options,
+ const std::shared_ptr<IOTracer>& io_tracer,
+ uint64_t version_number = 0);
+
+ ~Version();
+
+ // No copying allowed
+ Version(const Version&) = delete;
+ void operator=(const Version&) = delete;
+};
+
+class BaseReferencedVersionBuilder;
+
+class AtomicGroupReadBuffer {
+ public:
+ AtomicGroupReadBuffer() = default;
+ Status AddEdit(VersionEdit* edit);
+ void Clear();
+ bool IsFull() const;
+ bool IsEmpty() const;
+
+ uint64_t TEST_read_edits_in_atomic_group() const {
+ return read_edits_in_atomic_group_;
+ }
+ std::vector<VersionEdit>& replay_buffer() { return replay_buffer_; }
+
+ private:
+ uint64_t read_edits_in_atomic_group_ = 0;
+ std::vector<VersionEdit> replay_buffer_;
+};
+
+// VersionSet is the collection of versions of all the column families of the
+// database. Each database owns one VersionSet. A VersionSet has access to all
+// column families via ColumnFamilySet, i.e. the set of column families.
+class VersionSet {
+ public:
+ VersionSet(const std::string& dbname, const ImmutableDBOptions* db_options,
+ const FileOptions& file_options, Cache* table_cache,
+ WriteBufferManager* write_buffer_manager,
+ WriteController* write_controller,
+ BlockCacheTracer* const block_cache_tracer,
+ const std::shared_ptr<IOTracer>& io_tracer,
+ const std::string& db_id, const std::string& db_session_id);
+ // No copying allowed
+ VersionSet(const VersionSet&) = delete;
+ void operator=(const VersionSet&) = delete;
+
+ virtual ~VersionSet();
+
+ Status LogAndApplyToDefaultColumnFamily(
+ VersionEdit* edit, InstrumentedMutex* mu,
+ FSDirectory* dir_contains_current_file, bool new_descriptor_log = false,
+ const ColumnFamilyOptions* column_family_options = nullptr) {
+ ColumnFamilyData* default_cf = GetColumnFamilySet()->GetDefault();
+ const MutableCFOptions* cf_options =
+ default_cf->GetLatestMutableCFOptions();
+ return LogAndApply(default_cf, *cf_options, edit, mu,
+ dir_contains_current_file, new_descriptor_log,
+ column_family_options);
+ }
+
+ // Apply *edit to the current version to form a new descriptor that
+ // is both saved to persistent state and installed as the new
+ // current version. Will release *mu while actually writing to the file.
+  // column_family_options has to be set if the edit is a column family add.
+ // REQUIRES: *mu is held on entry.
+ // REQUIRES: no other thread concurrently calls LogAndApply()
+ Status LogAndApply(
+ ColumnFamilyData* column_family_data,
+ const MutableCFOptions& mutable_cf_options, VersionEdit* edit,
+ InstrumentedMutex* mu, FSDirectory* dir_contains_current_file,
+ bool new_descriptor_log = false,
+ const ColumnFamilyOptions* column_family_options = nullptr) {
+ autovector<ColumnFamilyData*> cfds;
+ cfds.emplace_back(column_family_data);
+ autovector<const MutableCFOptions*> mutable_cf_options_list;
+ mutable_cf_options_list.emplace_back(&mutable_cf_options);
+ autovector<autovector<VersionEdit*>> edit_lists;
+ autovector<VersionEdit*> edit_list;
+ edit_list.emplace_back(edit);
+ edit_lists.emplace_back(edit_list);
+ return LogAndApply(cfds, mutable_cf_options_list, edit_lists, mu,
+ dir_contains_current_file, new_descriptor_log,
+ column_family_options);
+ }
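+
+  // Illustrative sketch only: applying a single VersionEdit to one column
+  // family, assuming `versions` is this VersionSet, `cfd` is the target
+  // ColumnFamilyData, the DB mutex `mu` is already held, and `db_dir` is the
+  // FSDirectory containing the CURRENT file:
+  //
+  //   VersionEdit edit;
+  //   edit.SetLogNumber(new_log_number);
+  //   Status s = versions->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(),
+  //                                    &edit, mu, db_dir);
+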
+  // The batch version. If edit_list.size() > 1, caller must ensure that
+  // no edit in the list is a column family add or drop.
+ Status LogAndApply(
+ ColumnFamilyData* column_family_data,
+ const MutableCFOptions& mutable_cf_options,
+ const autovector<VersionEdit*>& edit_list, InstrumentedMutex* mu,
+ FSDirectory* dir_contains_current_file, bool new_descriptor_log = false,
+ const ColumnFamilyOptions* column_family_options = nullptr,
+ const std::function<void(const Status&)>& manifest_wcb = {}) {
+ autovector<ColumnFamilyData*> cfds;
+ cfds.emplace_back(column_family_data);
+ autovector<const MutableCFOptions*> mutable_cf_options_list;
+ mutable_cf_options_list.emplace_back(&mutable_cf_options);
+ autovector<autovector<VersionEdit*>> edit_lists;
+ edit_lists.emplace_back(edit_list);
+ return LogAndApply(cfds, mutable_cf_options_list, edit_lists, mu,
+ dir_contains_current_file, new_descriptor_log,
+ column_family_options, {manifest_wcb});
+ }
+
+  // The across-multiple-column-family batch version. If edit_lists contains
+  // more than one version edit, the caller must ensure that no edit in the
+  // lists is a column family manipulation.
+ virtual Status LogAndApply(
+ const autovector<ColumnFamilyData*>& cfds,
+ const autovector<const MutableCFOptions*>& mutable_cf_options_list,
+ const autovector<autovector<VersionEdit*>>& edit_lists,
+ InstrumentedMutex* mu, FSDirectory* dir_contains_current_file,
+ bool new_descriptor_log = false,
+ const ColumnFamilyOptions* new_cf_options = nullptr,
+ const std::vector<std::function<void(const Status&)>>& manifest_wcbs =
+ {});
+
+ static Status GetCurrentManifestPath(const std::string& dbname,
+ FileSystem* fs,
+ std::string* manifest_filename,
+ uint64_t* manifest_file_number);
+ void WakeUpWaitingManifestWriters();
+
+ // Recover the last saved descriptor from persistent storage.
+ // If read_only == true, Recover() will not complain if some column families
+ // are not opened
+ Status Recover(const std::vector<ColumnFamilyDescriptor>& column_families,
+ bool read_only = false, std::string* db_id = nullptr,
+ bool no_error_if_files_missing = false);
+
+ Status TryRecover(const std::vector<ColumnFamilyDescriptor>& column_families,
+ bool read_only,
+ const std::vector<std::string>& files_in_dbname,
+ std::string* db_id, bool* has_missing_table_file);
+
+ // Try to recover the version set to the most recent consistent state
+ // recorded in the specified manifest.
+ Status TryRecoverFromOneManifest(
+ const std::string& manifest_path,
+ const std::vector<ColumnFamilyDescriptor>& column_families,
+ bool read_only, std::string* db_id, bool* has_missing_table_file);
+
+ // Reads a manifest file and returns a list of column families in
+ // column_families.
+ static Status ListColumnFamilies(std::vector<std::string>* column_families,
+ const std::string& dbname, FileSystem* fs);
+ static Status ListColumnFamiliesFromManifest(
+ const std::string& manifest_path, FileSystem* fs,
+ std::vector<std::string>* column_families);
+
+#ifndef ROCKSDB_LITE
+  // Try to reduce the number of levels. This call is valid only when a single
+  // level between the new max level and the old max level contains files.
+  // The call is static, since the number of levels is immutable during
+  // the lifetime of a RocksDB instance. It reduces the number of levels
+  // in a DB by applying changes to the manifest.
+  // For example, if a db currently has 7 levels [0-6], a call to reduce to
+  // 5 levels [0-4] can only be executed when only one level among [4-6]
+  // contains files.
+ static Status ReduceNumberOfLevels(const std::string& dbname,
+ const Options* options,
+ const FileOptions& file_options,
+ int new_levels);
+
+ // Get the checksum information of all live files
+ Status GetLiveFilesChecksumInfo(FileChecksumList* checksum_list);
+
+  // Print the manifest contents (for debugging)
+ Status DumpManifest(Options& options, std::string& manifestFileName,
+ bool verbose, bool hex = false, bool json = false);
+
+#endif // ROCKSDB_LITE
+
+ const std::string& DbSessionId() const { return db_session_id_; }
+
+ // Return the current manifest file number
+ uint64_t manifest_file_number() const { return manifest_file_number_; }
+
+ uint64_t options_file_number() const { return options_file_number_; }
+
+ uint64_t pending_manifest_file_number() const {
+ return pending_manifest_file_number_;
+ }
+
+ uint64_t current_next_file_number() const { return next_file_number_.load(); }
+
+ uint64_t min_log_number_to_keep() const {
+ return min_log_number_to_keep_.load();
+ }
+
+ // Allocate and return a new file number
+ uint64_t NewFileNumber() { return next_file_number_.fetch_add(1); }
+
+  // Fetch-and-add n new file numbers; returns the first number of the
+  // allocated range.
+ uint64_t FetchAddFileNumber(uint64_t n) {
+ return next_file_number_.fetch_add(n);
+ }
+
+ // Return the last sequence number.
+ uint64_t LastSequence() const {
+ return last_sequence_.load(std::memory_order_acquire);
+ }
+
+ // Note: memory_order_acquire must be sufficient.
+ uint64_t LastAllocatedSequence() const {
+ return last_allocated_sequence_.load(std::memory_order_seq_cst);
+ }
+
+ // Note: memory_order_acquire must be sufficient.
+ uint64_t LastPublishedSequence() const {
+ return last_published_sequence_.load(std::memory_order_seq_cst);
+ }
+
+ // Set the last sequence number to s.
+ void SetLastSequence(uint64_t s) {
+ assert(s >= last_sequence_);
+    // The last visible sequence must not exceed the last allocated sequence
+ assert(!db_options_->two_write_queues || s <= last_allocated_sequence_);
+ last_sequence_.store(s, std::memory_order_release);
+ }
+
+ // Note: memory_order_release must be sufficient
+ void SetLastPublishedSequence(uint64_t s) {
+ assert(s >= last_published_sequence_);
+ last_published_sequence_.store(s, std::memory_order_seq_cst);
+ }
+
+ // Note: memory_order_release must be sufficient
+ void SetLastAllocatedSequence(uint64_t s) {
+ assert(s >= last_allocated_sequence_);
+ last_allocated_sequence_.store(s, std::memory_order_seq_cst);
+ }
+
+ // Note: memory_order_release must be sufficient
+ uint64_t FetchAddLastAllocatedSequence(uint64_t s) {
+ return last_allocated_sequence_.fetch_add(s, std::memory_order_seq_cst);
+ }
+
+ // Mark the specified file number as used.
+ // REQUIRED: this is only called during single-threaded recovery or repair.
+ void MarkFileNumberUsed(uint64_t number);
+
+  // Record that WAL files with numbers smaller than the specified number may
+  // be deleted.
+ // REQUIRED: this is only called during single-threaded recovery or repair, or
+ // from ::LogAndApply where the global mutex is held.
+ void MarkMinLogNumberToKeep(uint64_t number);
+
+ // Return the log file number for the log file that is currently
+ // being compacted, or zero if there is no such log file.
+ uint64_t prev_log_number() const { return prev_log_number_; }
+
+ // Returns the minimum log number which still has data not flushed to any SST
+ // file.
+ // In non-2PC mode, all the log numbers smaller than this number can be safely
+ // deleted, although we still use `min_log_number_to_keep_` to determine when
+ // to delete a WAL file.
+ uint64_t MinLogNumberWithUnflushedData() const {
+ return PreComputeMinLogNumberWithUnflushedData(nullptr);
+ }
+
+ // Returns the minimum log number which still has data not flushed to any SST
+ // file.
+ // Empty column families' log number is considered to be
+ // new_log_number_for_empty_cf.
+ uint64_t PreComputeMinLogNumberWithUnflushedData(
+ uint64_t new_log_number_for_empty_cf) const {
+ uint64_t min_log_num = std::numeric_limits<uint64_t>::max();
+ for (auto cfd : *column_family_set_) {
+ // It's safe to ignore dropped column families here:
+ // cfd->IsDropped() becomes true after the drop is persisted in MANIFEST.
+ uint64_t num =
+ cfd->IsEmpty() ? new_log_number_for_empty_cf : cfd->GetLogNumber();
+ if (min_log_num > num && !cfd->IsDropped()) {
+ min_log_num = num;
+ }
+ }
+ return min_log_num;
+ }
+ // Returns the minimum log number which still has data not flushed to any SST
+ // file, except data from `cfd_to_skip`.
+ uint64_t PreComputeMinLogNumberWithUnflushedData(
+ const ColumnFamilyData* cfd_to_skip) const {
+ uint64_t min_log_num = std::numeric_limits<uint64_t>::max();
+ for (auto cfd : *column_family_set_) {
+ if (cfd == cfd_to_skip) {
+ continue;
+ }
+ // It's safe to ignore dropped column families here:
+ // cfd->IsDropped() becomes true after the drop is persisted in MANIFEST.
+ if (min_log_num > cfd->GetLogNumber() && !cfd->IsDropped()) {
+ min_log_num = cfd->GetLogNumber();
+ }
+ }
+ return min_log_num;
+ }
+ // Returns the minimum log number which still has data not flushed to any SST
+ // file, except data from `cfds_to_skip`.
+ uint64_t PreComputeMinLogNumberWithUnflushedData(
+ const std::unordered_set<const ColumnFamilyData*>& cfds_to_skip) const {
+ uint64_t min_log_num = std::numeric_limits<uint64_t>::max();
+ for (auto cfd : *column_family_set_) {
+ if (cfds_to_skip.count(cfd)) {
+ continue;
+ }
+ // It's safe to ignore dropped column families here:
+ // cfd->IsDropped() becomes true after the drop is persisted in MANIFEST.
+ if (min_log_num > cfd->GetLogNumber() && !cfd->IsDropped()) {
+ min_log_num = cfd->GetLogNumber();
+ }
+ }
+ return min_log_num;
+ }
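+  // For example, if the live column families have log numbers {5, 7, 12} and
+  // none of them is empty, dropped, or skipped, all of the overloads above
+  // return 5, so every WAL numbered below 5 contains only flushed data.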
+
+ // Create an iterator that reads over the compaction inputs for "*c".
+ // The caller should delete the iterator when no longer needed.
+ // @param read_options Must outlive the returned iterator.
+ // @param start, end indicates compaction range
+ InternalIterator* MakeInputIterator(
+ const ReadOptions& read_options, const Compaction* c,
+ RangeDelAggregator* range_del_agg,
+ const FileOptions& file_options_compactions,
+ const std::optional<const Slice>& start,
+ const std::optional<const Slice>& end);
+
+ // Add all files listed in any live version to *live_table_files and
+ // *live_blob_files. Note that these lists may contain duplicates.
+ void AddLiveFiles(std::vector<uint64_t>* live_table_files,
+ std::vector<uint64_t>* live_blob_files) const;
+
+ // Remove live files that are in the delete candidate lists.
+ void RemoveLiveFiles(
+ std::vector<ObsoleteFileInfo>& sst_delete_candidates,
+ std::vector<ObsoleteBlobFileInfo>& blob_delete_candidates) const;
+
+ // Return the approximate size of data to be scanned for range [start, end)
+ // in levels [start_level, end_level). If end_level == -1 it will search
+ // through all non-empty levels
+ uint64_t ApproximateSize(const SizeApproximationOptions& options, Version* v,
+ const Slice& start, const Slice& end,
+ int start_level, int end_level,
+ TableReaderCaller caller);
+
+ // Return the size of the current manifest file
+ uint64_t manifest_file_size() const { return manifest_file_size_; }
+
+ Status GetMetadataForFile(uint64_t number, int* filelevel,
+ FileMetaData** metadata, ColumnFamilyData** cfd);
+
+ // This function doesn't support leveldb SST filenames
+ void GetLiveFilesMetaData(std::vector<LiveFileMetaData>* metadata);
+
+ void AddObsoleteBlobFile(uint64_t blob_file_number, std::string path) {
+ assert(table_cache_);
+
+ table_cache_->Erase(GetSlice(&blob_file_number));
+
+ obsolete_blob_files_.emplace_back(blob_file_number, std::move(path));
+ }
+
+ void GetObsoleteFiles(std::vector<ObsoleteFileInfo>* files,
+ std::vector<ObsoleteBlobFileInfo>* blob_files,
+ std::vector<std::string>* manifest_filenames,
+ uint64_t min_pending_output);
+
+ ColumnFamilySet* GetColumnFamilySet() { return column_family_set_.get(); }
+ RefedColumnFamilySet GetRefedColumnFamilySet() {
+ return RefedColumnFamilySet(GetColumnFamilySet());
+ }
+
+ const FileOptions& file_options() { return file_options_; }
+ void ChangeFileOptions(const MutableDBOptions& new_options) {
+ file_options_.writable_file_max_buffer_size =
+ new_options.writable_file_max_buffer_size;
+ }
+
+ const ImmutableDBOptions* db_options() const { return db_options_; }
+
+ static uint64_t GetNumLiveVersions(Version* dummy_versions);
+
+ static uint64_t GetTotalSstFilesSize(Version* dummy_versions);
+
+ static uint64_t GetTotalBlobFileSize(Version* dummy_versions);
+
+ // Get the IO Status returned by written Manifest.
+ const IOStatus& io_status() const { return io_status_; }
+
+ // The returned WalSet needs to be accessed with DB mutex held.
+ const WalSet& GetWalSet() const { return wals_; }
+
+ void TEST_CreateAndAppendVersion(ColumnFamilyData* cfd) {
+ assert(cfd);
+
+ const auto& mutable_cf_options = *cfd->GetLatestMutableCFOptions();
+ Version* const version =
+ new Version(cfd, this, file_options_, mutable_cf_options, io_tracer_);
+
+ constexpr bool update_stats = false;
+ version->PrepareAppend(mutable_cf_options, update_stats);
+ AppendVersion(cfd, version);
+ }
+
+ protected:
+ using VersionBuilderMap =
+ UnorderedMap<uint32_t, std::unique_ptr<BaseReferencedVersionBuilder>>;
+
+ struct ManifestWriter;
+
+ friend class Version;
+ friend class VersionEditHandler;
+ friend class VersionEditHandlerPointInTime;
+ friend class DumpManifestHandler;
+ friend class DBImpl;
+ friend class DBImplReadOnly;
+
+ struct LogReporter : public log::Reader::Reporter {
+ Status* status;
+ virtual void Corruption(size_t /*bytes*/, const Status& s) override {
+ if (status->ok()) {
+ *status = s;
+ }
+ }
+ };
+
+ void Reset();
+
+ // Returns approximated offset of a key in a file for a given version.
+ uint64_t ApproximateOffsetOf(Version* v, const FdWithKeyRange& f,
+ const Slice& key, TableReaderCaller caller);
+
+ // Returns approximated data size between start and end keys in a file
+ // for a given version.
+ uint64_t ApproximateSize(Version* v, const FdWithKeyRange& f,
+ const Slice& start, const Slice& end,
+ TableReaderCaller caller);
+
+ struct MutableCFState {
+ uint64_t log_number;
+ std::string full_history_ts_low;
+
+ explicit MutableCFState() = default;
+ explicit MutableCFState(uint64_t _log_number, std::string ts_low)
+ : log_number(_log_number), full_history_ts_low(std::move(ts_low)) {}
+ };
+
+ // Save current contents to *log
+ Status WriteCurrentStateToManifest(
+ const std::unordered_map<uint32_t, MutableCFState>& curr_state,
+ const VersionEdit& wal_additions, log::Writer* log, IOStatus& io_s);
+
+ void AppendVersion(ColumnFamilyData* column_family_data, Version* v);
+
+ ColumnFamilyData* CreateColumnFamily(const ColumnFamilyOptions& cf_options,
+ const VersionEdit* edit);
+
+ Status VerifyFileMetadata(ColumnFamilyData* cfd, const std::string& fpath,
+ int level, const FileMetaData& meta);
+
+ // Protected by DB mutex.
+ WalSet wals_;
+
+ std::unique_ptr<ColumnFamilySet> column_family_set_;
+ Cache* table_cache_;
+ Env* const env_;
+ FileSystemPtr const fs_;
+ SystemClock* const clock_;
+ const std::string dbname_;
+ std::string db_id_;
+ const ImmutableDBOptions* const db_options_;
+ std::atomic<uint64_t> next_file_number_;
+ // Any WAL number smaller than this should be ignored during recovery,
+  // and qualifies for deletion.
+ std::atomic<uint64_t> min_log_number_to_keep_ = {0};
+ uint64_t manifest_file_number_;
+ uint64_t options_file_number_;
+ uint64_t options_file_size_;
+ uint64_t pending_manifest_file_number_;
+ // The last seq visible to reads. It normally indicates the last sequence in
+ // the memtable but when using two write queues it could also indicate the
+ // last sequence in the WAL visible to reads.
+ std::atomic<uint64_t> last_sequence_;
+ // The last sequence number of data committed to the descriptor (manifest
+ // file).
+ SequenceNumber descriptor_last_sequence_ = 0;
+ // The last seq that is already allocated. It is applicable only when we have
+  // two write queues. In that case the seq might or might not have appeared in
+  // the memtable but it is expected to appear in the WAL.
+ // We have last_sequence <= last_allocated_sequence_
+ std::atomic<uint64_t> last_allocated_sequence_;
+ // The last allocated sequence that is also published to the readers. This is
+ // applicable only when last_seq_same_as_publish_seq_ is not set. Otherwise
+ // last_sequence_ also indicates the last published seq.
+ // We have last_sequence <= last_published_sequence_ <=
+ // last_allocated_sequence_
+ std::atomic<uint64_t> last_published_sequence_;
+ uint64_t prev_log_number_; // 0 or backing store for memtable being compacted
+
+ // Opened lazily
+ std::unique_ptr<log::Writer> descriptor_log_;
+
+  // generates an increasing version number for every new version
+ uint64_t current_version_number_;
+
+ // Queue of writers to the manifest file
+ std::deque<ManifestWriter*> manifest_writers_;
+
+ // Current size of manifest file
+ uint64_t manifest_file_size_;
+
+ std::vector<ObsoleteFileInfo> obsolete_files_;
+ std::vector<ObsoleteBlobFileInfo> obsolete_blob_files_;
+ std::vector<std::string> obsolete_manifests_;
+
+ // env options for all reads and writes except compactions
+ FileOptions file_options_;
+
+ BlockCacheTracer* const block_cache_tracer_;
+
+ // Store the IO status when Manifest is written
+ IOStatus io_status_;
+
+ std::shared_ptr<IOTracer> io_tracer_;
+
+ std::string db_session_id_;
+
+ private:
+  // REQUIRES: DB mutex held at beginning; may release and re-acquire the DB
+  // mutex.
+ Status ProcessManifestWrites(std::deque<ManifestWriter>& writers,
+ InstrumentedMutex* mu,
+ FSDirectory* dir_contains_current_file,
+ bool new_descriptor_log,
+ const ColumnFamilyOptions* new_cf_options);
+
+ void LogAndApplyCFHelper(VersionEdit* edit,
+ SequenceNumber* max_last_sequence);
+ Status LogAndApplyHelper(ColumnFamilyData* cfd, VersionBuilder* b,
+ VersionEdit* edit, SequenceNumber* max_last_sequence,
+ InstrumentedMutex* mu);
+};
+
+// ReactiveVersionSet represents a collection of versions of the column
+// families of the database. Users of ReactiveVersionSet, e.g. DBImplSecondary,
+// need to replay the MANIFEST (description log in older terms) in order to
+// reconstruct and install versions.
+class ReactiveVersionSet : public VersionSet {
+ public:
+ ReactiveVersionSet(const std::string& dbname,
+ const ImmutableDBOptions* _db_options,
+ const FileOptions& _file_options, Cache* table_cache,
+ WriteBufferManager* write_buffer_manager,
+ WriteController* write_controller,
+ const std::shared_ptr<IOTracer>& io_tracer);
+
+ ~ReactiveVersionSet() override;
+
+ Status ReadAndApply(
+ InstrumentedMutex* mu,
+ std::unique_ptr<log::FragmentBufferedReader>* manifest_reader,
+ Status* manifest_read_status,
+ std::unordered_set<ColumnFamilyData*>* cfds_changed);
+
+ Status Recover(const std::vector<ColumnFamilyDescriptor>& column_families,
+ std::unique_ptr<log::FragmentBufferedReader>* manifest_reader,
+ std::unique_ptr<log::Reader::Reporter>* manifest_reporter,
+ std::unique_ptr<Status>* manifest_reader_status);
+#ifndef NDEBUG
+ uint64_t TEST_read_edits_in_atomic_group() const;
+#endif //! NDEBUG
+
+ std::vector<VersionEdit>& replay_buffer();
+
+ protected:
+ // REQUIRES db mutex
+ Status ApplyOneVersionEditToBuilder(
+ VersionEdit& edit, std::unordered_set<ColumnFamilyData*>* cfds_changed,
+ VersionEdit* version_edit);
+
+ Status MaybeSwitchManifest(
+ log::Reader::Reporter* reporter,
+ std::unique_ptr<log::FragmentBufferedReader>* manifest_reader);
+
+ private:
+ std::unique_ptr<ManifestTailer> manifest_tailer_;
+
+ using VersionSet::LogAndApply;
+ using VersionSet::Recover;
+
+ Status LogAndApply(
+ const autovector<ColumnFamilyData*>& /*cfds*/,
+ const autovector<const MutableCFOptions*>& /*mutable_cf_options_list*/,
+ const autovector<autovector<VersionEdit*>>& /*edit_lists*/,
+ InstrumentedMutex* /*mu*/, FSDirectory* /*dir_contains_current_file*/,
+ bool /*new_descriptor_log*/, const ColumnFamilyOptions* /*new_cf_option*/,
+ const std::vector<std::function<void(const Status&)>>& /*manifest_wcbs*/)
+ override {
+ return Status::NotSupported("not supported in reactive mode");
+ }
+
+ // No copy allowed
+ ReactiveVersionSet(const ReactiveVersionSet&);
+ ReactiveVersionSet& operator=(const ReactiveVersionSet&);
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/version_set_sync_and_async.h b/src/rocksdb/db/version_set_sync_and_async.h
new file mode 100644
index 000000000..755585990
--- /dev/null
+++ b/src/rocksdb/db/version_set_sync_and_async.h
@@ -0,0 +1,151 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "util/coro_utils.h"
+
+#if defined(WITHOUT_COROUTINES) || \
+ (defined(USE_COROUTINES) && defined(WITH_COROUTINES))
+
+namespace ROCKSDB_NAMESPACE {
+
+// Lookup a batch of keys in a single SST file
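+// Note: DEFINE_SYNC_AND_ASYNC and the CO_AWAIT/CO_RETURN macros come from
+// util/coro_utils.h (included above). Under the include guard at the top of
+// this file the body below is compiled both as the plain synchronous
+// Version::MultiGetFromSST and, when coroutines are enabled, as an async
+// variant, with CO_AWAIT and CO_RETURN degenerating to ordinary calls and
+// returns in the synchronous build.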
+DEFINE_SYNC_AND_ASYNC(Status, Version::MultiGetFromSST)
+(const ReadOptions& read_options, MultiGetRange file_range, int hit_file_level,
+ bool skip_filters, bool skip_range_deletions, FdWithKeyRange* f,
+ std::unordered_map<uint64_t, BlobReadContexts>& blob_ctxs,
+ Cache::Handle* table_handle, uint64_t& num_filter_read,
+ uint64_t& num_index_read, uint64_t& num_sst_read) {
+ bool timer_enabled = GetPerfLevel() >= PerfLevel::kEnableTimeExceptForMutex &&
+ get_perf_context()->per_level_perf_context_enabled;
+
+ Status s;
+ StopWatchNano timer(clock_, timer_enabled /* auto_start */);
+ s = CO_AWAIT(table_cache_->MultiGet)(
+ read_options, *internal_comparator(), *f->file_metadata, &file_range,
+ mutable_cf_options_.prefix_extractor,
+ cfd_->internal_stats()->GetFileReadHist(hit_file_level), skip_filters,
+ skip_range_deletions, hit_file_level, table_handle);
+ // TODO: examine the behavior for corrupted key
+ if (timer_enabled) {
+ PERF_COUNTER_BY_LEVEL_ADD(get_from_table_nanos, timer.ElapsedNanos(),
+ hit_file_level);
+ }
+ if (!s.ok()) {
+ // TODO: Set status for individual keys appropriately
+ for (auto iter = file_range.begin(); iter != file_range.end(); ++iter) {
+ *iter->s = s;
+ file_range.MarkKeyDone(iter);
+ }
+ CO_RETURN s;
+ }
+ uint64_t batch_size = 0;
+ for (auto iter = file_range.begin(); s.ok() && iter != file_range.end();
+ ++iter) {
+ GetContext& get_context = *iter->get_context;
+ Status* status = iter->s;
+ // The Status in the KeyContext takes precedence over GetContext state
+ // Status may be an error if there were any IO errors in the table
+ // reader. We never expect Status to be NotFound(), as that is
+ // determined by get_context
+ assert(!status->IsNotFound());
+ if (!status->ok()) {
+ file_range.MarkKeyDone(iter);
+ continue;
+ }
+
+ if (get_context.sample()) {
+ sample_file_read_inc(f->file_metadata);
+ }
+ batch_size++;
+ num_index_read += get_context.get_context_stats_.num_index_read;
+ num_filter_read += get_context.get_context_stats_.num_filter_read;
+ num_sst_read += get_context.get_context_stats_.num_sst_read;
+ // Reset these stats since they're specific to a level
+ get_context.get_context_stats_.num_index_read = 0;
+ get_context.get_context_stats_.num_filter_read = 0;
+ get_context.get_context_stats_.num_sst_read = 0;
+
+ // report the counters before returning
+ if (get_context.State() != GetContext::kNotFound &&
+ get_context.State() != GetContext::kMerge &&
+ db_statistics_ != nullptr) {
+ get_context.ReportCounters();
+ } else {
+ if (iter->max_covering_tombstone_seq > 0) {
+ // The remaining files we look at will only contain covered keys, so
+ // we stop here for this key
+ file_range.SkipKey(iter);
+ }
+ }
+ switch (get_context.State()) {
+ case GetContext::kNotFound:
+ // Keep searching in other files
+ break;
+ case GetContext::kMerge:
+ // TODO: update per-level perfcontext user_key_return_count for kMerge
+ break;
+ case GetContext::kFound:
+ if (hit_file_level == 0) {
+ RecordTick(db_statistics_, GET_HIT_L0);
+ } else if (hit_file_level == 1) {
+ RecordTick(db_statistics_, GET_HIT_L1);
+ } else if (hit_file_level >= 2) {
+ RecordTick(db_statistics_, GET_HIT_L2_AND_UP);
+ }
+
+ PERF_COUNTER_BY_LEVEL_ADD(user_key_return_count, 1, hit_file_level);
+
+ file_range.MarkKeyDone(iter);
+
+ if (iter->is_blob_index) {
+ if (iter->value) {
+ TEST_SYNC_POINT_CALLBACK("Version::MultiGet::TamperWithBlobIndex",
+ &(*iter));
+
+ const Slice& blob_index_slice = *(iter->value);
+ BlobIndex blob_index;
+ Status tmp_s = blob_index.DecodeFrom(blob_index_slice);
+ if (tmp_s.ok()) {
+ const uint64_t blob_file_num = blob_index.file_number();
+ blob_ctxs[blob_file_num].emplace_back(
+ std::make_pair(blob_index, std::cref(*iter)));
+ } else {
+ *(iter->s) = tmp_s;
+ }
+ }
+ } else {
+ file_range.AddValueSize(iter->value->size());
+ if (file_range.GetValueSize() > read_options.value_size_soft_limit) {
+ s = Status::Aborted();
+ break;
+ }
+ }
+ continue;
+ case GetContext::kDeleted:
+ // Use empty error message for speed
+ *status = Status::NotFound();
+ file_range.MarkKeyDone(iter);
+ continue;
+ case GetContext::kCorrupt:
+ *status =
+ Status::Corruption("corrupted key for ", iter->lkey->user_key());
+ file_range.MarkKeyDone(iter);
+ continue;
+ case GetContext::kUnexpectedBlobIndex:
+ ROCKS_LOG_ERROR(info_log_, "Encounter unexpected blob index.");
+ *status = Status::NotSupported(
+ "Encounter unexpected blob index. Please open DB with "
+ "ROCKSDB_NAMESPACE::blob_db::BlobDB instead.");
+ file_range.MarkKeyDone(iter);
+ continue;
+ }
+ }
+
+ RecordInHistogram(db_statistics_, SST_BATCH_SIZE, batch_size);
+ CO_RETURN s;
+}
+} // namespace ROCKSDB_NAMESPACE
+#endif
diff --git a/src/rocksdb/db/version_set_test.cc b/src/rocksdb/db/version_set_test.cc
new file mode 100644
index 000000000..7d17406c1
--- /dev/null
+++ b/src/rocksdb/db/version_set_test.cc
@@ -0,0 +1,3587 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/version_set.h"
+
+#include <algorithm>
+
+#include "db/db_impl/db_impl.h"
+#include "db/db_test_util.h"
+#include "db/log_writer.h"
+#include "rocksdb/advanced_options.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/file_system.h"
+#include "table/block_based/block_based_table_factory.h"
+#include "table/mock_table.h"
+#include "table/unique_id_impl.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class GenerateLevelFilesBriefTest : public testing::Test {
+ public:
+ std::vector<FileMetaData*> files_;
+ LevelFilesBrief file_level_;
+ Arena arena_;
+
+ GenerateLevelFilesBriefTest() {}
+
+ ~GenerateLevelFilesBriefTest() override {
+ for (size_t i = 0; i < files_.size(); i++) {
+ delete files_[i];
+ }
+ }
+
+ void Add(const char* smallest, const char* largest,
+ SequenceNumber smallest_seq = 100,
+ SequenceNumber largest_seq = 100) {
+ FileMetaData* f = new FileMetaData(
+ files_.size() + 1, 0, 0,
+ InternalKey(smallest, smallest_seq, kTypeValue),
+ InternalKey(largest, largest_seq, kTypeValue), smallest_seq,
+ largest_seq, /* marked_for_compact */ false, Temperature::kUnknown,
+ kInvalidBlobFileNumber, kUnknownOldestAncesterTime,
+ kUnknownFileCreationTime, kUnknownFileChecksum,
+ kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+ files_.push_back(f);
+ }
+
+ int Compare() {
+ int diff = 0;
+ for (size_t i = 0; i < files_.size(); i++) {
+ if (file_level_.files[i].fd.GetNumber() != files_[i]->fd.GetNumber()) {
+ diff++;
+ }
+ }
+ return diff;
+ }
+};
+
+TEST_F(GenerateLevelFilesBriefTest, Empty) {
+ DoGenerateLevelFilesBrief(&file_level_, files_, &arena_);
+ ASSERT_EQ(0u, file_level_.num_files);
+ ASSERT_EQ(0, Compare());
+}
+
+TEST_F(GenerateLevelFilesBriefTest, Single) {
+ Add("p", "q");
+ DoGenerateLevelFilesBrief(&file_level_, files_, &arena_);
+ ASSERT_EQ(1u, file_level_.num_files);
+ ASSERT_EQ(0, Compare());
+}
+
+TEST_F(GenerateLevelFilesBriefTest, Multiple) {
+ Add("150", "200");
+ Add("200", "250");
+ Add("300", "350");
+ Add("400", "450");
+ DoGenerateLevelFilesBrief(&file_level_, files_, &arena_);
+ ASSERT_EQ(4u, file_level_.num_files);
+ ASSERT_EQ(0, Compare());
+}
+
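+// Logger that simply counts how many messages were logged. The tests below
+// use it to check whether VersionStorageInfo emitted any informational or
+// warning messages while computing level targets.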
+class CountingLogger : public Logger {
+ public:
+ CountingLogger() : log_count(0) {}
+ using Logger::Logv;
+ void Logv(const char* /*format*/, va_list /*ap*/) override { log_count++; }
+ int log_count;
+};
+
+Options GetOptionsWithNumLevels(int num_levels,
+ std::shared_ptr<CountingLogger> logger) {
+ Options opt;
+ opt.num_levels = num_levels;
+ opt.info_log = logger;
+ return opt;
+}
+
+class VersionStorageInfoTestBase : public testing::Test {
+ public:
+ const Comparator* ucmp_;
+ InternalKeyComparator icmp_;
+ std::shared_ptr<CountingLogger> logger_;
+ Options options_;
+ ImmutableOptions ioptions_;
+ MutableCFOptions mutable_cf_options_;
+ VersionStorageInfo vstorage_;
+
+ InternalKey GetInternalKey(const char* ukey,
+ SequenceNumber smallest_seq = 100) {
+ return InternalKey(ukey, smallest_seq, kTypeValue);
+ }
+
+ explicit VersionStorageInfoTestBase(const Comparator* ucmp)
+ : ucmp_(ucmp),
+ icmp_(ucmp_),
+ logger_(new CountingLogger()),
+ options_(GetOptionsWithNumLevels(6, logger_)),
+ ioptions_(options_),
+ mutable_cf_options_(options_),
+ vstorage_(&icmp_, ucmp_, 6, kCompactionStyleLevel,
+ /*src_vstorage=*/nullptr,
+ /*_force_consistency_checks=*/false) {}
+
+ ~VersionStorageInfoTestBase() override {
+ for (int i = 0; i < vstorage_.num_levels(); ++i) {
+ for (auto* f : vstorage_.LevelFiles(i)) {
+ if (--f->refs == 0) {
+ delete f;
+ }
+ }
+ }
+ }
+
+ void Add(int level, uint32_t file_number, const char* smallest,
+ const char* largest, uint64_t file_size = 0,
+ uint64_t oldest_blob_file_number = kInvalidBlobFileNumber) {
+ constexpr SequenceNumber dummy_seq = 0;
+
+ Add(level, file_number, GetInternalKey(smallest, dummy_seq),
+ GetInternalKey(largest, dummy_seq), file_size, oldest_blob_file_number);
+ }
+
+ void Add(int level, uint32_t file_number, const InternalKey& smallest,
+ const InternalKey& largest, uint64_t file_size = 0,
+ uint64_t oldest_blob_file_number = kInvalidBlobFileNumber) {
+ assert(level < vstorage_.num_levels());
+ FileMetaData* f = new FileMetaData(
+ file_number, 0, file_size, smallest, largest, /* smallest_seq */ 0,
+ /* largest_seq */ 0, /* marked_for_compact */ false,
+ Temperature::kUnknown, oldest_blob_file_number,
+ kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+ kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+ f->compensated_file_size = file_size;
+ vstorage_.AddFile(level, f);
+ }
+
+ void AddBlob(uint64_t blob_file_number, uint64_t total_blob_count,
+ uint64_t total_blob_bytes,
+ BlobFileMetaData::LinkedSsts linked_ssts,
+ uint64_t garbage_blob_count, uint64_t garbage_blob_bytes) {
+ auto shared_meta = SharedBlobFileMetaData::Create(
+ blob_file_number, total_blob_count, total_blob_bytes,
+ /* checksum_method */ std::string(),
+ /* checksum_value */ std::string());
+ auto meta =
+ BlobFileMetaData::Create(std::move(shared_meta), std::move(linked_ssts),
+ garbage_blob_count, garbage_blob_bytes);
+
+ vstorage_.AddBlobFile(std::move(meta));
+ }
+
+ void UpdateVersionStorageInfo() {
+ vstorage_.PrepareForVersionAppend(ioptions_, mutable_cf_options_);
+ vstorage_.SetFinalized();
+ }
+
+ std::string GetOverlappingFiles(int level, const InternalKey& begin,
+ const InternalKey& end) {
+ std::vector<FileMetaData*> inputs;
+ vstorage_.GetOverlappingInputs(level, &begin, &end, &inputs);
+
+ std::string result;
+ for (size_t i = 0; i < inputs.size(); ++i) {
+ if (i > 0) {
+ result += ",";
+ }
+ AppendNumberTo(&result, inputs[i]->fd.GetNumber());
+ }
+ return result;
+ }
+};
+
+class VersionStorageInfoTest : public VersionStorageInfoTestBase {
+ public:
+ VersionStorageInfoTest() : VersionStorageInfoTestBase(BytewiseComparator()) {}
+
+ ~VersionStorageInfoTest() override {}
+};
+
+TEST_F(VersionStorageInfoTest, MaxBytesForLevelStatic) {
+ ioptions_.level_compaction_dynamic_level_bytes = false;
+ mutable_cf_options_.max_bytes_for_level_base = 10;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 5;
+
+ Add(4, 100U, "1", "2", 100U);
+ Add(5, 101U, "1", "2", 100U);
+
+ UpdateVersionStorageInfo();
+
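+  // With static sizing the targets grow geometrically from the base:
+  // L1 = 10, L2 = 10 * 5 = 50, L3 = 250, L4 = 1250.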
+ ASSERT_EQ(vstorage_.MaxBytesForLevel(1), 10U);
+ ASSERT_EQ(vstorage_.MaxBytesForLevel(2), 50U);
+ ASSERT_EQ(vstorage_.MaxBytesForLevel(3), 250U);
+ ASSERT_EQ(vstorage_.MaxBytesForLevel(4), 1250U);
+
+ ASSERT_EQ(0, logger_->log_count);
+}
+
+TEST_F(VersionStorageInfoTest, MaxBytesForLevelDynamic_1) {
+ ioptions_.level_compaction_dynamic_level_bytes = true;
+ mutable_cf_options_.max_bytes_for_level_base = 1000;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 5;
+
+ Add(5, 1U, "1", "2", 500U);
+
+ UpdateVersionStorageInfo();
+
+ ASSERT_EQ(0, logger_->log_count);
+ ASSERT_EQ(vstorage_.base_level(), 5);
+}
+
+TEST_F(VersionStorageInfoTest, MaxBytesForLevelDynamic_2) {
+ ioptions_.level_compaction_dynamic_level_bytes = true;
+ mutable_cf_options_.max_bytes_for_level_base = 1000;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 5;
+
+ Add(5, 1U, "1", "2", 500U);
+ Add(5, 2U, "3", "4", 550U);
+
+ UpdateVersionStorageInfo();
+
+ ASSERT_EQ(0, logger_->log_count);
+ ASSERT_EQ(vstorage_.MaxBytesForLevel(4), 1000U);
+ ASSERT_EQ(vstorage_.base_level(), 4);
+}
+
+TEST_F(VersionStorageInfoTest, MaxBytesForLevelDynamic_3) {
+ ioptions_.level_compaction_dynamic_level_bytes = true;
+ mutable_cf_options_.max_bytes_for_level_base = 1000;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 5;
+
+ Add(5, 1U, "1", "2", 500U);
+ Add(5, 2U, "3", "4", 550U);
+ Add(4, 3U, "3", "4", 550U);
+
+ UpdateVersionStorageInfo();
+
+ ASSERT_EQ(0, logger_->log_count);
+ ASSERT_EQ(vstorage_.MaxBytesForLevel(4), 1000U);
+ ASSERT_EQ(vstorage_.base_level(), 4);
+}
+
+TEST_F(VersionStorageInfoTest, MaxBytesForLevelDynamic_4) {
+ ioptions_.level_compaction_dynamic_level_bytes = true;
+ mutable_cf_options_.max_bytes_for_level_base = 1000;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 5;
+
+ Add(5, 1U, "1", "2", 500U);
+ Add(5, 2U, "3", "4", 550U);
+ Add(4, 3U, "3", "4", 550U);
+ Add(3, 4U, "3", "4", 250U);
+ Add(3, 5U, "5", "7", 300U);
+
+ UpdateVersionStorageInfo();
+
+ ASSERT_EQ(1, logger_->log_count);
+ ASSERT_EQ(vstorage_.MaxBytesForLevel(4), 1005U);
+ ASSERT_EQ(vstorage_.MaxBytesForLevel(3), 1000U);
+ ASSERT_EQ(vstorage_.base_level(), 3);
+}
+
+TEST_F(VersionStorageInfoTest, MaxBytesForLevelDynamic_5) {
+ ioptions_.level_compaction_dynamic_level_bytes = true;
+ mutable_cf_options_.max_bytes_for_level_base = 1000;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 5;
+
+ Add(5, 1U, "1", "2", 500U);
+ Add(5, 2U, "3", "4", 550U);
+ Add(4, 3U, "3", "4", 550U);
+ Add(3, 4U, "3", "4", 250U);
+ Add(3, 5U, "5", "7", 300U);
+ Add(1, 6U, "3", "4", 5U);
+ Add(1, 7U, "8", "9", 5U);
+
+ UpdateVersionStorageInfo();
+
+ ASSERT_EQ(1, logger_->log_count);
+ ASSERT_GT(vstorage_.MaxBytesForLevel(4), 1005U);
+ ASSERT_GT(vstorage_.MaxBytesForLevel(3), 1005U);
+ ASSERT_EQ(vstorage_.MaxBytesForLevel(2), 1005U);
+ ASSERT_EQ(vstorage_.MaxBytesForLevel(1), 1000U);
+ ASSERT_EQ(vstorage_.base_level(), 1);
+}
+
+TEST_F(VersionStorageInfoTest, MaxBytesForLevelDynamicLotsOfData) {
+ ioptions_.level_compaction_dynamic_level_bytes = true;
+ mutable_cf_options_.max_bytes_for_level_base = 100;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 2;
+
+ Add(0, 1U, "1", "2", 50U);
+ Add(1, 2U, "1", "2", 50U);
+ Add(2, 3U, "1", "2", 500U);
+ Add(3, 4U, "1", "2", 500U);
+ Add(4, 5U, "1", "2", 1700U);
+ Add(5, 6U, "1", "2", 500U);
+
+ UpdateVersionStorageInfo();
+
+ ASSERT_EQ(vstorage_.MaxBytesForLevel(4), 800U);
+ ASSERT_EQ(vstorage_.MaxBytesForLevel(3), 400U);
+ ASSERT_EQ(vstorage_.MaxBytesForLevel(2), 200U);
+ ASSERT_EQ(vstorage_.MaxBytesForLevel(1), 100U);
+ ASSERT_EQ(vstorage_.base_level(), 1);
+ ASSERT_EQ(0, logger_->log_count);
+}
+
+TEST_F(VersionStorageInfoTest, MaxBytesForLevelDynamicLargeLevel) {
+ uint64_t kOneGB = 1000U * 1000U * 1000U;
+ ioptions_.level_compaction_dynamic_level_bytes = true;
+ mutable_cf_options_.max_bytes_for_level_base = 10U * kOneGB;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+
+ Add(0, 1U, "1", "2", 50U);
+ Add(3, 4U, "1", "2", 32U * kOneGB);
+ Add(4, 5U, "1", "2", 500U * kOneGB);
+ Add(5, 6U, "1", "2", 3000U * kOneGB);
+
+ UpdateVersionStorageInfo();
+
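+  // Roughly: dividing the bottommost level's 3000 GB by the multiplier per
+  // level gives 300 GB, 30 GB, 3 GB; each target is floored at
+  // max_bytes_for_level_base, so L2 becomes the base level with a 10 GB
+  // target.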
+ ASSERT_EQ(vstorage_.MaxBytesForLevel(5), 3000U * kOneGB);
+ ASSERT_EQ(vstorage_.MaxBytesForLevel(4), 300U * kOneGB);
+ ASSERT_EQ(vstorage_.MaxBytesForLevel(3), 30U * kOneGB);
+ ASSERT_EQ(vstorage_.MaxBytesForLevel(2), 10U * kOneGB);
+ ASSERT_EQ(vstorage_.base_level(), 2);
+ ASSERT_EQ(0, logger_->log_count);
+}
+
+TEST_F(VersionStorageInfoTest, MaxBytesForLevelDynamicWithLargeL0_1) {
+ ioptions_.level_compaction_dynamic_level_bytes = true;
+ mutable_cf_options_.max_bytes_for_level_base = 40000;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 5;
+ mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+
+ Add(0, 1U, "1", "2", 10000U);
+ Add(0, 2U, "1", "2", 10000U);
+ Add(0, 3U, "1", "2", 10000U);
+
+ Add(5, 4U, "1", "2", 1286250U);
+ Add(4, 5U, "1", "2", 200000U);
+ Add(3, 6U, "1", "2", 40000U);
+ Add(2, 7U, "1", "2", 8000U);
+
+ UpdateVersionStorageInfo();
+
+ ASSERT_EQ(0, logger_->log_count);
+ ASSERT_EQ(2, vstorage_.base_level());
+  // level_multiplier() reports the configured multiplier (5.0); it is not
+  // adjusted here (e.g. to ~3.5) to account for the large L0.
+ ASSERT_EQ(vstorage_.level_multiplier(), 5.0);
+ ASSERT_EQ(40000U, vstorage_.MaxBytesForLevel(2));
+ ASSERT_EQ(51450U, vstorage_.MaxBytesForLevel(3));
+ ASSERT_EQ(257250U, vstorage_.MaxBytesForLevel(4));
+
+ vstorage_.ComputeCompactionScore(ioptions_, mutable_cf_options_);
+ // Only L0 hits compaction.
+ ASSERT_EQ(vstorage_.CompactionScoreLevel(0), 0);
+}
+
+TEST_F(VersionStorageInfoTest, MaxBytesForLevelDynamicWithLargeL0_2) {
+ ioptions_.level_compaction_dynamic_level_bytes = true;
+ mutable_cf_options_.max_bytes_for_level_base = 10000;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 5;
+ mutable_cf_options_.level0_file_num_compaction_trigger = 4;
+
+ Add(0, 11U, "1", "2", 10000U);
+ Add(0, 12U, "1", "2", 10000U);
+ Add(0, 13U, "1", "2", 10000U);
+
+ // Level size should be around 10,000, 10,290, 51,450, 257,250
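+  // (Roughly: the bottommost level's 1,286,250 divided by the multiplier per
+  // level gives 257,250, 51,450, 10,290, 2,058; every target is then floored
+  // at max_bytes_for_level_base, so the base level L1 ends up at 10,000.)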
+ Add(5, 4U, "1", "2", 1286250U);
+ Add(4, 5U, "1", "2", 258000U); // unadjusted score 1.003
+ Add(3, 6U, "1", "2", 53000U); // unadjusted score 1.03
+ Add(2, 7U, "1", "2", 20000U); // unadjusted score 1.94
+
+ UpdateVersionStorageInfo();
+
+ ASSERT_EQ(0, logger_->log_count);
+ ASSERT_EQ(1, vstorage_.base_level());
+ ASSERT_EQ(10000U, vstorage_.MaxBytesForLevel(1));
+ ASSERT_EQ(10290U, vstorage_.MaxBytesForLevel(2));
+ ASSERT_EQ(51450U, vstorage_.MaxBytesForLevel(3));
+ ASSERT_EQ(257250U, vstorage_.MaxBytesForLevel(4));
+
+ vstorage_.ComputeCompactionScore(ioptions_, mutable_cf_options_);
+  // Although L2 and L3 have higher unadjusted compaction scores, L4 is picked
+  // for compaction because the relatively large L0 will soon be compacted
+  // down into the lower levels. L0 itself is still picked because it is
+  // oversized.
+ ASSERT_EQ(0, vstorage_.CompactionScoreLevel(0));
+ ASSERT_EQ(4, vstorage_.CompactionScoreLevel(1));
+}
+
+TEST_F(VersionStorageInfoTest, MaxBytesForLevelDynamicWithLargeL0_3) {
+ ioptions_.level_compaction_dynamic_level_bytes = true;
+ mutable_cf_options_.max_bytes_for_level_base = 20000;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 5;
+ mutable_cf_options_.level0_file_num_compaction_trigger = 5;
+
+ Add(0, 11U, "1", "2", 2500U);
+ Add(0, 12U, "1", "2", 2500U);
+ Add(0, 13U, "1", "2", 2500U);
+ Add(0, 14U, "1", "2", 2500U);
+
+  // Level sizes should be around 20,000, 53,000, and 258,000
+ Add(5, 4U, "1", "2", 1286250U);
+ Add(4, 5U, "1", "2", 260000U); // Unadjusted score 1.01, adjusted about 4.3
+ Add(3, 6U, "1", "2", 85000U); // Unadjusted score 1.42, adjusted about 11.6
+ Add(2, 7U, "1", "2", 30000); // Unadjusted score 1.5, adjusted about 10.0
+
+ UpdateVersionStorageInfo();
+
+ ASSERT_EQ(0, logger_->log_count);
+ ASSERT_EQ(2, vstorage_.base_level());
+ ASSERT_EQ(20000U, vstorage_.MaxBytesForLevel(2));
+
+ vstorage_.ComputeCompactionScore(ioptions_, mutable_cf_options_);
+  // Although L2 has a higher unadjusted compaction score, L3 is picked for
+  // compaction because the relatively large L0 will soon be compacted down
+  // into the lower levels.
+
+ ASSERT_EQ(3, vstorage_.CompactionScoreLevel(0));
+ ASSERT_EQ(2, vstorage_.CompactionScoreLevel(1));
+ ASSERT_EQ(4, vstorage_.CompactionScoreLevel(2));
+}
+
+TEST_F(VersionStorageInfoTest, EstimateLiveDataSize) {
+ // Test whether the overlaps are detected as expected
+ Add(1, 1U, "4", "7", 1U); // Perfect overlap with last level
+ Add(2, 2U, "3", "5", 1U); // Partial overlap with last level
+ Add(2, 3U, "6", "8", 1U); // Partial overlap with last level
+ Add(3, 4U, "1", "9", 1U); // Contains range of last level
+ Add(4, 5U, "4", "5", 1U); // Inside range of last level
+ Add(4, 6U, "6", "7", 1U); // Inside range of last level
+ Add(5, 7U, "4", "7", 10U);
+
+ UpdateVersionStorageInfo();
+
+ ASSERT_EQ(10U, vstorage_.EstimateLiveDataSize());
+}
+
+TEST_F(VersionStorageInfoTest, EstimateLiveDataSize2) {
+ Add(0, 1U, "9", "9", 1U); // Level 0 is not ordered
+ Add(0, 2U, "5", "6", 1U); // Ignored because of [5,6] in l1
+ Add(1, 3U, "1", "2", 1U); // Ignored because of [2,3] in l2
+ Add(1, 4U, "3", "4", 1U); // Ignored because of [2,3] in l2
+ Add(1, 5U, "5", "6", 1U);
+ Add(2, 6U, "2", "3", 1U);
+ Add(3, 7U, "7", "8", 1U);
+
+ UpdateVersionStorageInfo();
+
+ ASSERT_EQ(4U, vstorage_.EstimateLiveDataSize());
+}
+
+TEST_F(VersionStorageInfoTest, GetOverlappingInputs) {
+ // Two files that overlap at the range deletion tombstone sentinel.
+ Add(1, 1U, {"a", 0, kTypeValue},
+ {"b", kMaxSequenceNumber, kTypeRangeDeletion}, 1);
+ Add(1, 2U, {"b", 0, kTypeValue}, {"c", 0, kTypeValue}, 1);
+ // Two files that overlap at the same user key.
+ Add(1, 3U, {"d", 0, kTypeValue}, {"e", kMaxSequenceNumber, kTypeValue}, 1);
+ Add(1, 4U, {"e", 0, kTypeValue}, {"f", 0, kTypeValue}, 1);
+ // Two files that do not overlap.
+ Add(1, 5U, {"g", 0, kTypeValue}, {"h", 0, kTypeValue}, 1);
+ Add(1, 6U, {"i", 0, kTypeValue}, {"j", 0, kTypeValue}, 1);
+
+ UpdateVersionStorageInfo();
+
+ ASSERT_EQ("1,2",
+ GetOverlappingFiles(1, {"a", 0, kTypeValue}, {"b", 0, kTypeValue}));
+ ASSERT_EQ("1",
+ GetOverlappingFiles(1, {"a", 0, kTypeValue},
+ {"b", kMaxSequenceNumber, kTypeRangeDeletion}));
+ ASSERT_EQ("2", GetOverlappingFiles(1, {"b", kMaxSequenceNumber, kTypeValue},
+ {"c", 0, kTypeValue}));
+ ASSERT_EQ("3,4",
+ GetOverlappingFiles(1, {"d", 0, kTypeValue}, {"e", 0, kTypeValue}));
+ ASSERT_EQ("3",
+ GetOverlappingFiles(1, {"d", 0, kTypeValue},
+ {"e", kMaxSequenceNumber, kTypeRangeDeletion}));
+ ASSERT_EQ("3,4", GetOverlappingFiles(1, {"e", kMaxSequenceNumber, kTypeValue},
+ {"f", 0, kTypeValue}));
+ ASSERT_EQ("3,4",
+ GetOverlappingFiles(1, {"e", 0, kTypeValue}, {"f", 0, kTypeValue}));
+ ASSERT_EQ("5",
+ GetOverlappingFiles(1, {"g", 0, kTypeValue}, {"h", 0, kTypeValue}));
+ ASSERT_EQ("6",
+ GetOverlappingFiles(1, {"i", 0, kTypeValue}, {"j", 0, kTypeValue}));
+}
+
+TEST_F(VersionStorageInfoTest, FileLocationAndMetaDataByNumber) {
+ Add(0, 11U, "1", "2", 5000U);
+ Add(0, 12U, "1", "2", 5000U);
+
+ Add(2, 7U, "1", "2", 8000U);
+
+ UpdateVersionStorageInfo();
+
+ ASSERT_EQ(vstorage_.GetFileLocation(11U),
+ VersionStorageInfo::FileLocation(0, 0));
+ ASSERT_NE(vstorage_.GetFileMetaDataByNumber(11U), nullptr);
+
+ ASSERT_EQ(vstorage_.GetFileLocation(12U),
+ VersionStorageInfo::FileLocation(0, 1));
+ ASSERT_NE(vstorage_.GetFileMetaDataByNumber(12U), nullptr);
+
+ ASSERT_EQ(vstorage_.GetFileLocation(7U),
+ VersionStorageInfo::FileLocation(2, 0));
+ ASSERT_NE(vstorage_.GetFileMetaDataByNumber(7U), nullptr);
+
+ ASSERT_FALSE(vstorage_.GetFileLocation(999U).IsValid());
+ ASSERT_EQ(vstorage_.GetFileMetaDataByNumber(999U), nullptr);
+}
+
+TEST_F(VersionStorageInfoTest, ForcedBlobGCEmpty) {
+ // No SST or blob files in VersionStorageInfo
+ UpdateVersionStorageInfo();
+
+ constexpr double age_cutoff = 0.5;
+ constexpr double force_threshold = 0.75;
+ vstorage_.ComputeFilesMarkedForForcedBlobGC(age_cutoff, force_threshold);
+
+ ASSERT_TRUE(vstorage_.FilesMarkedForForcedBlobGC().empty());
+}
+
+TEST_F(VersionStorageInfoTest, ForcedBlobGCSingleBatch) {
+ // Test the edge case when all blob files are part of the oldest batch.
+ // We have one L0 SST file #1, and four blob files #10, #11, #12, and #13.
+ // The oldest blob file used by SST #1 is blob file #10.
+
+ constexpr int level = 0;
+
+ constexpr uint64_t sst = 1;
+
+ constexpr uint64_t first_blob = 10;
+ constexpr uint64_t second_blob = 11;
+ constexpr uint64_t third_blob = 12;
+ constexpr uint64_t fourth_blob = 13;
+
+ {
+ constexpr char smallest[] = "bar1";
+ constexpr char largest[] = "foo1";
+ constexpr uint64_t file_size = 1000;
+
+ Add(level, sst, smallest, largest, file_size, first_blob);
+ }
+
+ {
+ constexpr uint64_t total_blob_count = 10;
+ constexpr uint64_t total_blob_bytes = 100000;
+ constexpr uint64_t garbage_blob_count = 2;
+ constexpr uint64_t garbage_blob_bytes = 15000;
+
+ AddBlob(first_blob, total_blob_count, total_blob_bytes,
+ BlobFileMetaData::LinkedSsts{sst}, garbage_blob_count,
+ garbage_blob_bytes);
+ }
+
+ {
+ constexpr uint64_t total_blob_count = 4;
+ constexpr uint64_t total_blob_bytes = 400000;
+ constexpr uint64_t garbage_blob_count = 3;
+ constexpr uint64_t garbage_blob_bytes = 235000;
+
+ AddBlob(second_blob, total_blob_count, total_blob_bytes,
+ BlobFileMetaData::LinkedSsts{}, garbage_blob_count,
+ garbage_blob_bytes);
+ }
+
+ {
+ constexpr uint64_t total_blob_count = 20;
+ constexpr uint64_t total_blob_bytes = 1000000;
+ constexpr uint64_t garbage_blob_count = 8;
+ constexpr uint64_t garbage_blob_bytes = 400000;
+
+ AddBlob(third_blob, total_blob_count, total_blob_bytes,
+ BlobFileMetaData::LinkedSsts{}, garbage_blob_count,
+ garbage_blob_bytes);
+ }
+
+ {
+ constexpr uint64_t total_blob_count = 128;
+ constexpr uint64_t total_blob_bytes = 1000000;
+ constexpr uint64_t garbage_blob_count = 67;
+ constexpr uint64_t garbage_blob_bytes = 600000;
+
+ AddBlob(fourth_blob, total_blob_count, total_blob_bytes,
+ BlobFileMetaData::LinkedSsts{}, garbage_blob_count,
+ garbage_blob_bytes);
+ }
+
+ UpdateVersionStorageInfo();
+
+ assert(vstorage_.num_levels() > 0);
+ const auto& level_files = vstorage_.LevelFiles(level);
+
+ assert(level_files.size() == 1);
+ assert(level_files[0] && level_files[0]->fd.GetNumber() == sst);
+
+ // No blob files eligible for GC due to the age cutoff
+
+ {
+ constexpr double age_cutoff = 0.1;
+ constexpr double force_threshold = 0.0;
+ vstorage_.ComputeFilesMarkedForForcedBlobGC(age_cutoff, force_threshold);
+
+ ASSERT_TRUE(vstorage_.FilesMarkedForForcedBlobGC().empty());
+ }
+
+ // Part of the oldest batch of blob files (specifically, #12 and #13) is
+ // ineligible for GC due to the age cutoff
+
+ {
+ constexpr double age_cutoff = 0.5;
+ constexpr double force_threshold = 0.0;
+ vstorage_.ComputeFilesMarkedForForcedBlobGC(age_cutoff, force_threshold);
+
+ ASSERT_TRUE(vstorage_.FilesMarkedForForcedBlobGC().empty());
+ }
+
+ // Oldest batch is eligible based on age cutoff but its overall garbage ratio
+ // is below threshold
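+  // (the batch's cumulative garbage ratio is
+  // (15,000 + 235,000 + 400,000 + 600,000) /
+  // (100,000 + 400,000 + 1,000,000 + 1,000,000) = 0.5, so a force_threshold
+  // of 0.6 is not met while 0.5 is)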
+
+ {
+ constexpr double age_cutoff = 1.0;
+ constexpr double force_threshold = 0.6;
+ vstorage_.ComputeFilesMarkedForForcedBlobGC(age_cutoff, force_threshold);
+
+ ASSERT_TRUE(vstorage_.FilesMarkedForForcedBlobGC().empty());
+ }
+
+ // Oldest batch is eligible based on age cutoff and its overall garbage ratio
+ // meets threshold
+
+ {
+ constexpr double age_cutoff = 1.0;
+ constexpr double force_threshold = 0.5;
+ vstorage_.ComputeFilesMarkedForForcedBlobGC(age_cutoff, force_threshold);
+
+ auto ssts_to_be_compacted = vstorage_.FilesMarkedForForcedBlobGC();
+ ASSERT_EQ(ssts_to_be_compacted.size(), 1);
+
+ const autovector<std::pair<int, FileMetaData*>>
+ expected_ssts_to_be_compacted{{level, level_files[0]}};
+
+ ASSERT_EQ(ssts_to_be_compacted[0], expected_ssts_to_be_compacted[0]);
+ }
+}
+
+TEST_F(VersionStorageInfoTest, ForcedBlobGCMultipleBatches) {
+ // Add three L0 SSTs (1, 2, and 3) and four blob files (10, 11, 12, and 13).
+ // The first two SSTs have the same oldest blob file, namely, the very oldest
+ // one (10), while the third SST's oldest blob file reference points to the
+ // third blob file (12). Thus, the oldest batch of blob files contains the
+ // first two blob files 10 and 11, and assuming they are eligible for GC based
+ // on the age cutoff, compacting away the SSTs 1 and 2 will eliminate them.
+
+ constexpr int level = 0;
+
+ constexpr uint64_t first_sst = 1;
+ constexpr uint64_t second_sst = 2;
+ constexpr uint64_t third_sst = 3;
+
+ constexpr uint64_t first_blob = 10;
+ constexpr uint64_t second_blob = 11;
+ constexpr uint64_t third_blob = 12;
+ constexpr uint64_t fourth_blob = 13;
+
+ {
+ constexpr char smallest[] = "bar1";
+ constexpr char largest[] = "foo1";
+ constexpr uint64_t file_size = 1000;
+
+ Add(level, first_sst, smallest, largest, file_size, first_blob);
+ }
+
+ {
+ constexpr char smallest[] = "bar2";
+ constexpr char largest[] = "foo2";
+ constexpr uint64_t file_size = 2000;
+
+ Add(level, second_sst, smallest, largest, file_size, first_blob);
+ }
+
+ {
+ constexpr char smallest[] = "bar3";
+ constexpr char largest[] = "foo3";
+ constexpr uint64_t file_size = 3000;
+
+ Add(level, third_sst, smallest, largest, file_size, third_blob);
+ }
+
+ {
+ constexpr uint64_t total_blob_count = 10;
+ constexpr uint64_t total_blob_bytes = 100000;
+ constexpr uint64_t garbage_blob_count = 2;
+ constexpr uint64_t garbage_blob_bytes = 15000;
+
+ AddBlob(first_blob, total_blob_count, total_blob_bytes,
+ BlobFileMetaData::LinkedSsts{first_sst, second_sst},
+ garbage_blob_count, garbage_blob_bytes);
+ }
+
+ {
+ constexpr uint64_t total_blob_count = 4;
+ constexpr uint64_t total_blob_bytes = 400000;
+ constexpr uint64_t garbage_blob_count = 3;
+ constexpr uint64_t garbage_blob_bytes = 235000;
+
+ AddBlob(second_blob, total_blob_count, total_blob_bytes,
+ BlobFileMetaData::LinkedSsts{}, garbage_blob_count,
+ garbage_blob_bytes);
+ }
+
+ {
+ constexpr uint64_t total_blob_count = 20;
+ constexpr uint64_t total_blob_bytes = 1000000;
+ constexpr uint64_t garbage_blob_count = 8;
+ constexpr uint64_t garbage_blob_bytes = 123456;
+
+ AddBlob(third_blob, total_blob_count, total_blob_bytes,
+ BlobFileMetaData::LinkedSsts{third_sst}, garbage_blob_count,
+ garbage_blob_bytes);
+ }
+
+ {
+ constexpr uint64_t total_blob_count = 128;
+ constexpr uint64_t total_blob_bytes = 789012345;
+ constexpr uint64_t garbage_blob_count = 67;
+ constexpr uint64_t garbage_blob_bytes = 88888888;
+
+ AddBlob(fourth_blob, total_blob_count, total_blob_bytes,
+ BlobFileMetaData::LinkedSsts{}, garbage_blob_count,
+ garbage_blob_bytes);
+ }
+
+ UpdateVersionStorageInfo();
+
+ assert(vstorage_.num_levels() > 0);
+ const auto& level_files = vstorage_.LevelFiles(level);
+
+ assert(level_files.size() == 3);
+ assert(level_files[0] && level_files[0]->fd.GetNumber() == first_sst);
+ assert(level_files[1] && level_files[1]->fd.GetNumber() == second_sst);
+ assert(level_files[2] && level_files[2]->fd.GetNumber() == third_sst);
+
+ // No blob files eligible for GC due to the age cutoff
+
+ {
+ constexpr double age_cutoff = 0.1;
+ constexpr double force_threshold = 0.0;
+ vstorage_.ComputeFilesMarkedForForcedBlobGC(age_cutoff, force_threshold);
+
+ ASSERT_TRUE(vstorage_.FilesMarkedForForcedBlobGC().empty());
+ }
+
+ // Part of the oldest batch of blob files (specifically, the second file) is
+ // ineligible for GC due to the age cutoff
+
+ {
+ constexpr double age_cutoff = 0.25;
+ constexpr double force_threshold = 0.0;
+ vstorage_.ComputeFilesMarkedForForcedBlobGC(age_cutoff, force_threshold);
+
+ ASSERT_TRUE(vstorage_.FilesMarkedForForcedBlobGC().empty());
+ }
+
+ // Oldest batch is eligible based on age cutoff but its overall garbage ratio
+ // is below threshold
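+  // (for the oldest batch, blob files #10 and #11, the combined garbage ratio
+  // is (15,000 + 235,000) / (100,000 + 400,000) = 0.5, so 0.6 is not met
+  // while 0.5 is)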
+
+ {
+ constexpr double age_cutoff = 0.5;
+ constexpr double force_threshold = 0.6;
+ vstorage_.ComputeFilesMarkedForForcedBlobGC(age_cutoff, force_threshold);
+
+ ASSERT_TRUE(vstorage_.FilesMarkedForForcedBlobGC().empty());
+ }
+
+ // Oldest batch is eligible based on age cutoff and its overall garbage ratio
+ // meets threshold
+
+ {
+ constexpr double age_cutoff = 0.5;
+ constexpr double force_threshold = 0.5;
+ vstorage_.ComputeFilesMarkedForForcedBlobGC(age_cutoff, force_threshold);
+
+ auto ssts_to_be_compacted = vstorage_.FilesMarkedForForcedBlobGC();
+ ASSERT_EQ(ssts_to_be_compacted.size(), 2);
+
+ std::sort(ssts_to_be_compacted.begin(), ssts_to_be_compacted.end(),
+ [](const std::pair<int, FileMetaData*>& lhs,
+ const std::pair<int, FileMetaData*>& rhs) {
+ assert(lhs.second);
+ assert(rhs.second);
+ return lhs.second->fd.GetNumber() < rhs.second->fd.GetNumber();
+ });
+
+ const autovector<std::pair<int, FileMetaData*>>
+ expected_ssts_to_be_compacted{{level, level_files[0]},
+ {level, level_files[1]}};
+
+ ASSERT_EQ(ssts_to_be_compacted[0], expected_ssts_to_be_compacted[0]);
+ ASSERT_EQ(ssts_to_be_compacted[1], expected_ssts_to_be_compacted[1]);
+ }
+
+ // Now try the last two cases again with a greater than necessary age cutoff
+
+ // Oldest batch is eligible based on age cutoff but its overall garbage ratio
+ // is below threshold
+
+ {
+ constexpr double age_cutoff = 0.75;
+ constexpr double force_threshold = 0.6;
+ vstorage_.ComputeFilesMarkedForForcedBlobGC(age_cutoff, force_threshold);
+
+ ASSERT_TRUE(vstorage_.FilesMarkedForForcedBlobGC().empty());
+ }
+
+ // Oldest batch is eligible based on age cutoff and its overall garbage ratio
+ // meets threshold
+
+ {
+ constexpr double age_cutoff = 0.75;
+ constexpr double force_threshold = 0.5;
+ vstorage_.ComputeFilesMarkedForForcedBlobGC(age_cutoff, force_threshold);
+
+ auto ssts_to_be_compacted = vstorage_.FilesMarkedForForcedBlobGC();
+ ASSERT_EQ(ssts_to_be_compacted.size(), 2);
+
+ std::sort(ssts_to_be_compacted.begin(), ssts_to_be_compacted.end(),
+ [](const std::pair<int, FileMetaData*>& lhs,
+ const std::pair<int, FileMetaData*>& rhs) {
+ assert(lhs.second);
+ assert(rhs.second);
+ return lhs.second->fd.GetNumber() < rhs.second->fd.GetNumber();
+ });
+
+ const autovector<std::pair<int, FileMetaData*>>
+ expected_ssts_to_be_compacted{{level, level_files[0]},
+ {level, level_files[1]}};
+
+ ASSERT_EQ(ssts_to_be_compacted[0], expected_ssts_to_be_compacted[0]);
+ ASSERT_EQ(ssts_to_be_compacted[1], expected_ssts_to_be_compacted[1]);
+ }
+}
+
+class VersionStorageInfoTimestampTest : public VersionStorageInfoTestBase {
+ public:
+ VersionStorageInfoTimestampTest()
+ : VersionStorageInfoTestBase(test::BytewiseComparatorWithU64TsWrapper()) {
+ }
+ ~VersionStorageInfoTimestampTest() override {}
+ std::string Timestamp(uint64_t ts) const {
+ std::string ret;
+ PutFixed64(&ret, ts);
+ return ret;
+ }
+ std::string PackUserKeyAndTimestamp(const Slice& ukey, uint64_t ts) const {
+ std::string ret;
+ ret.assign(ukey.data(), ukey.size());
+ PutFixed64(&ret, ts);
+ return ret;
+ }
+};
+
+TEST_F(VersionStorageInfoTimestampTest, GetOverlappingInputs) {
+ Add(/*level=*/1, /*file_number=*/1, /*smallest=*/
+ {PackUserKeyAndTimestamp("a", /*ts=*/9), /*s=*/0, kTypeValue},
+ /*largest=*/
+ {PackUserKeyAndTimestamp("a", /*ts=*/8), /*s=*/0, kTypeValue},
+ /*file_size=*/100);
+ Add(/*level=*/1, /*file_number=*/2, /*smallest=*/
+ {PackUserKeyAndTimestamp("a", /*ts=*/5), /*s=*/0, kTypeValue},
+ /*largest=*/
+ {PackUserKeyAndTimestamp("b", /*ts=*/10), /*s=*/0, kTypeValue},
+ /*file_size=*/100);
+ Add(/*level=*/1, /*file_number=*/3, /*smallest=*/
+ {PackUserKeyAndTimestamp("c", /*ts=*/12), /*s=*/0, kTypeValue},
+ /*largest=*/
+ {PackUserKeyAndTimestamp("d", /*ts=*/1), /*s=*/0, kTypeValue},
+ /*file_size=*/100);
+
+ UpdateVersionStorageInfo();
+
+ ASSERT_EQ(
+ "1,2",
+ GetOverlappingFiles(
+ /*level=*/1,
+ {PackUserKeyAndTimestamp("a", /*ts=*/12), /*s=*/0, kTypeValue},
+ {PackUserKeyAndTimestamp("a", /*ts=*/11), /*s=*/0, kTypeValue}));
+ ASSERT_EQ("3",
+ GetOverlappingFiles(
+ /*level=*/1,
+ {PackUserKeyAndTimestamp("c", /*ts=*/15), /*s=*/0, kTypeValue},
+ {PackUserKeyAndTimestamp("c", /*ts=*/2), /*s=*/0, kTypeValue}));
+}
+
+class FindLevelFileTest : public testing::Test {
+ public:
+ LevelFilesBrief file_level_;
+ bool disjoint_sorted_files_;
+ Arena arena_;
+
+ FindLevelFileTest() : disjoint_sorted_files_(true) {}
+
+ ~FindLevelFileTest() override {}
+
+ void LevelFileInit(size_t num = 0) {
+ char* mem = arena_.AllocateAligned(num * sizeof(FdWithKeyRange));
+ file_level_.files = new (mem) FdWithKeyRange[num];
+ file_level_.num_files = 0;
+ }
+
+ void Add(const char* smallest, const char* largest,
+ SequenceNumber smallest_seq = 100,
+ SequenceNumber largest_seq = 100) {
+ InternalKey smallest_key = InternalKey(smallest, smallest_seq, kTypeValue);
+ InternalKey largest_key = InternalKey(largest, largest_seq, kTypeValue);
+
+ Slice smallest_slice = smallest_key.Encode();
+ Slice largest_slice = largest_key.Encode();
+
+ char* mem =
+ arena_.AllocateAligned(smallest_slice.size() + largest_slice.size());
+ memcpy(mem, smallest_slice.data(), smallest_slice.size());
+ memcpy(mem + smallest_slice.size(), largest_slice.data(),
+ largest_slice.size());
+
+ // add to file_level_
+ size_t num = file_level_.num_files;
+ auto& file = file_level_.files[num];
+ file.fd = FileDescriptor(num + 1, 0, 0);
+ file.smallest_key = Slice(mem, smallest_slice.size());
+ file.largest_key = Slice(mem + smallest_slice.size(), largest_slice.size());
+ file_level_.num_files++;
+ }
+
+ int Find(const char* key) {
+ InternalKey target(key, 100, kTypeValue);
+ InternalKeyComparator cmp(BytewiseComparator());
+ return FindFile(cmp, file_level_, target.Encode());
+ }
+
+ bool Overlaps(const char* smallest, const char* largest) {
+ InternalKeyComparator cmp(BytewiseComparator());
+ Slice s(smallest != nullptr ? smallest : "");
+ Slice l(largest != nullptr ? largest : "");
+ return SomeFileOverlapsRange(cmp, disjoint_sorted_files_, file_level_,
+ (smallest != nullptr ? &s : nullptr),
+ (largest != nullptr ? &l : nullptr));
+ }
+};
+
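+// FindFile returns the index of the first file whose largest key is >= the
+// target, so a result equal to num_files means the target sorts after every
+// file in the level.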
+TEST_F(FindLevelFileTest, LevelEmpty) {
+ LevelFileInit(0);
+
+ ASSERT_EQ(0, Find("foo"));
+ ASSERT_TRUE(!Overlaps("a", "z"));
+ ASSERT_TRUE(!Overlaps(nullptr, "z"));
+ ASSERT_TRUE(!Overlaps("a", nullptr));
+ ASSERT_TRUE(!Overlaps(nullptr, nullptr));
+}
+
+TEST_F(FindLevelFileTest, LevelSingle) {
+ LevelFileInit(1);
+
+ Add("p", "q");
+ ASSERT_EQ(0, Find("a"));
+ ASSERT_EQ(0, Find("p"));
+ ASSERT_EQ(0, Find("p1"));
+ ASSERT_EQ(0, Find("q"));
+ ASSERT_EQ(1, Find("q1"));
+ ASSERT_EQ(1, Find("z"));
+
+ ASSERT_TRUE(!Overlaps("a", "b"));
+ ASSERT_TRUE(!Overlaps("z1", "z2"));
+ ASSERT_TRUE(Overlaps("a", "p"));
+ ASSERT_TRUE(Overlaps("a", "q"));
+ ASSERT_TRUE(Overlaps("a", "z"));
+ ASSERT_TRUE(Overlaps("p", "p1"));
+ ASSERT_TRUE(Overlaps("p", "q"));
+ ASSERT_TRUE(Overlaps("p", "z"));
+ ASSERT_TRUE(Overlaps("p1", "p2"));
+ ASSERT_TRUE(Overlaps("p1", "z"));
+ ASSERT_TRUE(Overlaps("q", "q"));
+ ASSERT_TRUE(Overlaps("q", "q1"));
+
+ ASSERT_TRUE(!Overlaps(nullptr, "j"));
+ ASSERT_TRUE(!Overlaps("r", nullptr));
+ ASSERT_TRUE(Overlaps(nullptr, "p"));
+ ASSERT_TRUE(Overlaps(nullptr, "p1"));
+ ASSERT_TRUE(Overlaps("q", nullptr));
+ ASSERT_TRUE(Overlaps(nullptr, nullptr));
+}
+
+TEST_F(FindLevelFileTest, LevelMultiple) {
+ LevelFileInit(4);
+
+ Add("150", "200");
+ Add("200", "250");
+ Add("300", "350");
+ Add("400", "450");
+ ASSERT_EQ(0, Find("100"));
+ ASSERT_EQ(0, Find("150"));
+ ASSERT_EQ(0, Find("151"));
+ ASSERT_EQ(0, Find("199"));
+ ASSERT_EQ(0, Find("200"));
+ ASSERT_EQ(1, Find("201"));
+ ASSERT_EQ(1, Find("249"));
+ ASSERT_EQ(1, Find("250"));
+ ASSERT_EQ(2, Find("251"));
+ ASSERT_EQ(2, Find("299"));
+ ASSERT_EQ(2, Find("300"));
+ ASSERT_EQ(2, Find("349"));
+ ASSERT_EQ(2, Find("350"));
+ ASSERT_EQ(3, Find("351"));
+ ASSERT_EQ(3, Find("400"));
+ ASSERT_EQ(3, Find("450"));
+ ASSERT_EQ(4, Find("451"));
+
+ ASSERT_TRUE(!Overlaps("100", "149"));
+ ASSERT_TRUE(!Overlaps("251", "299"));
+ ASSERT_TRUE(!Overlaps("451", "500"));
+ ASSERT_TRUE(!Overlaps("351", "399"));
+
+ ASSERT_TRUE(Overlaps("100", "150"));
+ ASSERT_TRUE(Overlaps("100", "200"));
+ ASSERT_TRUE(Overlaps("100", "300"));
+ ASSERT_TRUE(Overlaps("100", "400"));
+ ASSERT_TRUE(Overlaps("100", "500"));
+ ASSERT_TRUE(Overlaps("375", "400"));
+ ASSERT_TRUE(Overlaps("450", "450"));
+ ASSERT_TRUE(Overlaps("450", "500"));
+}
+
+TEST_F(FindLevelFileTest, LevelMultipleNullBoundaries) {
+ LevelFileInit(4);
+
+ Add("150", "200");
+ Add("200", "250");
+ Add("300", "350");
+ Add("400", "450");
+ ASSERT_TRUE(!Overlaps(nullptr, "149"));
+ ASSERT_TRUE(!Overlaps("451", nullptr));
+ ASSERT_TRUE(Overlaps(nullptr, nullptr));
+ ASSERT_TRUE(Overlaps(nullptr, "150"));
+ ASSERT_TRUE(Overlaps(nullptr, "199"));
+ ASSERT_TRUE(Overlaps(nullptr, "200"));
+ ASSERT_TRUE(Overlaps(nullptr, "201"));
+ ASSERT_TRUE(Overlaps(nullptr, "400"));
+ ASSERT_TRUE(Overlaps(nullptr, "800"));
+ ASSERT_TRUE(Overlaps("100", nullptr));
+ ASSERT_TRUE(Overlaps("200", nullptr));
+ ASSERT_TRUE(Overlaps("449", nullptr));
+ ASSERT_TRUE(Overlaps("450", nullptr));
+}
+
+TEST_F(FindLevelFileTest, LevelOverlapSequenceChecks) {
+ LevelFileInit(1);
+
+ Add("200", "200", 5000, 3000);
+ ASSERT_TRUE(!Overlaps("199", "199"));
+ ASSERT_TRUE(!Overlaps("201", "300"));
+ ASSERT_TRUE(Overlaps("200", "200"));
+ ASSERT_TRUE(Overlaps("190", "200"));
+ ASSERT_TRUE(Overlaps("200", "210"));
+}
+
+TEST_F(FindLevelFileTest, LevelOverlappingFiles) {
+ LevelFileInit(2);
+
+ Add("150", "600");
+ Add("400", "500");
+ disjoint_sorted_files_ = false;
+ ASSERT_TRUE(!Overlaps("100", "149"));
+ ASSERT_TRUE(!Overlaps("601", "700"));
+ ASSERT_TRUE(Overlaps("100", "150"));
+ ASSERT_TRUE(Overlaps("100", "200"));
+ ASSERT_TRUE(Overlaps("100", "300"));
+ ASSERT_TRUE(Overlaps("100", "400"));
+ ASSERT_TRUE(Overlaps("100", "500"));
+ ASSERT_TRUE(Overlaps("375", "400"));
+ ASSERT_TRUE(Overlaps("450", "450"));
+ ASSERT_TRUE(Overlaps("450", "500"));
+ ASSERT_TRUE(Overlaps("450", "700"));
+ ASSERT_TRUE(Overlaps("600", "700"));
+}
+
+class VersionSetTestBase {
+ public:
+ const static std::string kColumnFamilyName1;
+ const static std::string kColumnFamilyName2;
+ const static std::string kColumnFamilyName3;
+ int num_initial_edits_;
+
+ explicit VersionSetTestBase(const std::string& name)
+ : env_(nullptr),
+ dbname_(test::PerThreadDBPath(name)),
+ options_(),
+ db_options_(options_),
+ cf_options_(options_),
+ immutable_options_(db_options_, cf_options_),
+ mutable_cf_options_(cf_options_),
+ table_cache_(NewLRUCache(50000, 16)),
+ write_buffer_manager_(db_options_.db_write_buffer_size),
+ shutting_down_(false),
+ mock_table_factory_(std::make_shared<mock::MockTableFactory>()) {
+ EXPECT_OK(test::CreateEnvFromSystem(ConfigOptions(), &env_, &env_guard_));
+ if (env_ == Env::Default() && getenv("MEM_ENV")) {
+ env_guard_.reset(NewMemEnv(Env::Default()));
+ env_ = env_guard_.get();
+ }
+ EXPECT_NE(nullptr, env_);
+
+ fs_ = env_->GetFileSystem();
+ EXPECT_OK(fs_->CreateDirIfMissing(dbname_, IOOptions(), nullptr));
+
+ options_.env = env_;
+ db_options_.env = env_;
+ db_options_.fs = fs_;
+ immutable_options_.env = env_;
+ immutable_options_.fs = fs_;
+ immutable_options_.clock = env_->GetSystemClock().get();
+
+ versions_.reset(
+ new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(),
+ &write_buffer_manager_, &write_controller_,
+ /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
+ /*db_id*/ "", /*db_session_id*/ ""));
+ reactive_versions_ = std::make_shared<ReactiveVersionSet>(
+ dbname_, &db_options_, env_options_, table_cache_.get(),
+ &write_buffer_manager_, &write_controller_, nullptr);
+ db_options_.db_paths.emplace_back(dbname_,
+ std::numeric_limits<uint64_t>::max());
+ }
+
+ virtual ~VersionSetTestBase() {
+ if (getenv("KEEP_DB")) {
+ fprintf(stdout, "DB is still at %s\n", dbname_.c_str());
+ } else {
+ Options options;
+ options.env = env_;
+ EXPECT_OK(DestroyDB(dbname_, options));
+ }
+ }
+
+ protected:
+ virtual void PrepareManifest(
+ std::vector<ColumnFamilyDescriptor>* column_families,
+ SequenceNumber* last_seqno, std::unique_ptr<log::Writer>* log_writer) {
+ assert(column_families != nullptr);
+ assert(last_seqno != nullptr);
+ assert(log_writer != nullptr);
+ VersionEdit new_db;
+ if (db_options_.write_dbid_to_manifest) {
+ DBOptions tmp_db_options;
+ tmp_db_options.env = env_;
+ std::unique_ptr<DBImpl> impl(new DBImpl(tmp_db_options, dbname_));
+ std::string db_id;
+ impl->GetDbIdentityFromIdentityFile(&db_id);
+ new_db.SetDBId(db_id);
+ }
+ new_db.SetLogNumber(0);
+ new_db.SetNextFile(2);
+ new_db.SetLastSequence(0);
+
+ const std::vector<std::string> cf_names = {
+ kDefaultColumnFamilyName, kColumnFamilyName1, kColumnFamilyName2,
+ kColumnFamilyName3};
+ const int kInitialNumOfCfs = static_cast<int>(cf_names.size());
+ autovector<VersionEdit> new_cfs;
+ uint64_t last_seq = 1;
+ uint32_t cf_id = 1;
+ for (int i = 1; i != kInitialNumOfCfs; ++i) {
+ VersionEdit new_cf;
+ new_cf.AddColumnFamily(cf_names[i]);
+ new_cf.SetColumnFamily(cf_id++);
+ new_cf.SetLogNumber(0);
+ new_cf.SetNextFile(2);
+ new_cf.SetLastSequence(last_seq++);
+ new_cfs.emplace_back(new_cf);
+ }
+ *last_seqno = last_seq;
+ num_initial_edits_ = static_cast<int>(new_cfs.size() + 1);
+ std::unique_ptr<WritableFileWriter> file_writer;
+ const std::string manifest = DescriptorFileName(dbname_, 1);
+ const auto& fs = env_->GetFileSystem();
+ Status s = WritableFileWriter::Create(
+ fs, manifest, fs->OptimizeForManifestWrite(env_options_), &file_writer,
+ nullptr);
+ ASSERT_OK(s);
+ {
+ log_writer->reset(new log::Writer(std::move(file_writer), 0, false));
+ std::string record;
+ new_db.EncodeTo(&record);
+ s = (*log_writer)->AddRecord(record);
+ for (const auto& e : new_cfs) {
+ record.clear();
+ e.EncodeTo(&record);
+ s = (*log_writer)->AddRecord(record);
+ ASSERT_OK(s);
+ }
+ }
+ ASSERT_OK(s);
+
+ cf_options_.table_factory = mock_table_factory_;
+ for (const auto& cf_name : cf_names) {
+ column_families->emplace_back(cf_name, cf_options_);
+ }
+ }
+
+  // Create a DB with the default column family plus three additional column
+  // families.
+ void NewDB() {
+ SequenceNumber last_seqno;
+ std::unique_ptr<log::Writer> log_writer;
+ SetIdentityFile(env_, dbname_);
+ PrepareManifest(&column_families_, &last_seqno, &log_writer);
+ log_writer.reset();
+ // Make "CURRENT" file point to the new manifest file.
+ Status s = SetCurrentFile(fs_.get(), dbname_, 1, nullptr);
+ ASSERT_OK(s);
+
+ EXPECT_OK(versions_->Recover(column_families_, false));
+ EXPECT_EQ(column_families_.size(),
+ versions_->GetColumnFamilySet()->NumberOfColumnFamilies());
+ }
+
+ void ReopenDB() {
+ versions_.reset(
+ new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(),
+ &write_buffer_manager_, &write_controller_,
+ /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
+ /*db_id*/ "", /*db_session_id*/ ""));
+ EXPECT_OK(versions_->Recover(column_families_, false));
+ }
+
+ void VerifyManifest(std::string* manifest_path) const {
+ assert(manifest_path != nullptr);
+ uint64_t manifest_file_number = 0;
+ Status s = versions_->GetCurrentManifestPath(
+ dbname_, fs_.get(), manifest_path, &manifest_file_number);
+ ASSERT_OK(s);
+ ASSERT_EQ(1, manifest_file_number);
+ }
+
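+  // Helpers that apply version edits to the default column family under the
+  // DB mutex, mirroring how DBImpl normally drives VersionSet::LogAndApply.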
+ Status LogAndApplyToDefaultCF(VersionEdit& edit) {
+ mutex_.Lock();
+ Status s =
+ versions_->LogAndApply(versions_->GetColumnFamilySet()->GetDefault(),
+ mutable_cf_options_, &edit, &mutex_, nullptr);
+ mutex_.Unlock();
+ return s;
+ }
+
+ Status LogAndApplyToDefaultCF(
+ const autovector<std::unique_ptr<VersionEdit>>& edits) {
+ autovector<VersionEdit*> vedits;
+ for (auto& e : edits) {
+ vedits.push_back(e.get());
+ }
+ mutex_.Lock();
+ Status s =
+ versions_->LogAndApply(versions_->GetColumnFamilySet()->GetDefault(),
+ mutable_cf_options_, vedits, &mutex_, nullptr);
+ mutex_.Unlock();
+ return s;
+ }
+
+ void CreateNewManifest() {
+ constexpr FSDirectory* db_directory = nullptr;
+ constexpr bool new_descriptor_log = true;
+ mutex_.Lock();
+ VersionEdit dummy;
+ ASSERT_OK(versions_->LogAndApply(
+ versions_->GetColumnFamilySet()->GetDefault(), mutable_cf_options_,
+ &dummy, &mutex_, db_directory, new_descriptor_log));
+ mutex_.Unlock();
+ }
+
+ ColumnFamilyData* CreateColumnFamily(const std::string& cf_name,
+ const ColumnFamilyOptions& cf_options) {
+ VersionEdit new_cf;
+ new_cf.AddColumnFamily(cf_name);
+ uint32_t new_id = versions_->GetColumnFamilySet()->GetNextColumnFamilyID();
+ new_cf.SetColumnFamily(new_id);
+ new_cf.SetLogNumber(0);
+ new_cf.SetComparatorName(cf_options.comparator->Name());
+ Status s;
+ mutex_.Lock();
+ s = versions_->LogAndApply(/*column_family_data=*/nullptr,
+ MutableCFOptions(cf_options), &new_cf, &mutex_,
+ /*db_directory=*/nullptr,
+ /*new_descriptor_log=*/false, &cf_options);
+ mutex_.Unlock();
+ EXPECT_OK(s);
+ ColumnFamilyData* cfd =
+ versions_->GetColumnFamilySet()->GetColumnFamily(cf_name);
+ EXPECT_NE(nullptr, cfd);
+ return cfd;
+ }
+
+ Env* mem_env_;
+ Env* env_;
+ std::shared_ptr<Env> env_guard_;
+ std::shared_ptr<FileSystem> fs_;
+ const std::string dbname_;
+ EnvOptions env_options_;
+ Options options_;
+ ImmutableDBOptions db_options_;
+ ColumnFamilyOptions cf_options_;
+ ImmutableOptions immutable_options_;
+ MutableCFOptions mutable_cf_options_;
+ std::shared_ptr<Cache> table_cache_;
+ WriteController write_controller_;
+ WriteBufferManager write_buffer_manager_;
+ std::shared_ptr<VersionSet> versions_;
+ std::shared_ptr<ReactiveVersionSet> reactive_versions_;
+ InstrumentedMutex mutex_;
+ std::atomic<bool> shutting_down_;
+ std::shared_ptr<mock::MockTableFactory> mock_table_factory_;
+ std::vector<ColumnFamilyDescriptor> column_families_;
+};
+
+const std::string VersionSetTestBase::kColumnFamilyName1 = "alice";
+const std::string VersionSetTestBase::kColumnFamilyName2 = "bob";
+const std::string VersionSetTestBase::kColumnFamilyName3 = "charles";
+
+class VersionSetTest : public VersionSetTestBase, public testing::Test {
+ public:
+ VersionSetTest() : VersionSetTestBase("version_set_test") {}
+};
+
+TEST_F(VersionSetTest, SameColumnFamilyGroupCommit) {
+ NewDB();
+ const int kGroupSize = 5;
+ autovector<VersionEdit> edits;
+ for (int i = 0; i != kGroupSize; ++i) {
+ edits.emplace_back(VersionEdit());
+ }
+ autovector<ColumnFamilyData*> cfds;
+ autovector<const MutableCFOptions*> all_mutable_cf_options;
+ autovector<autovector<VersionEdit*>> edit_lists;
+ for (int i = 0; i != kGroupSize; ++i) {
+ cfds.emplace_back(versions_->GetColumnFamilySet()->GetDefault());
+ all_mutable_cf_options.emplace_back(&mutable_cf_options_);
+ autovector<VersionEdit*> edit_list;
+ edit_list.emplace_back(&edits[i]);
+ edit_lists.emplace_back(edit_list);
+ }
+
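+  // All edits in the group target the default column family, so the
+  // SameColumnFamily sync point is expected to fire once for every edit after
+  // the first one.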
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ int count = 0;
+ SyncPoint::GetInstance()->SetCallBack(
+ "VersionSet::ProcessManifestWrites:SameColumnFamily", [&](void* arg) {
+ uint32_t* cf_id = reinterpret_cast<uint32_t*>(arg);
+ EXPECT_EQ(0u, *cf_id);
+ ++count;
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ mutex_.Lock();
+ Status s = versions_->LogAndApply(cfds, all_mutable_cf_options, edit_lists,
+ &mutex_, nullptr);
+ mutex_.Unlock();
+ EXPECT_OK(s);
+ EXPECT_EQ(kGroupSize - 1, count);
+}
+
+TEST_F(VersionSetTest, PersistBlobFileStateInNewManifest) {
+ // Initialize the database and add a couple of blob files, one with some
+ // garbage in it, and one without any garbage.
+ NewDB();
+
+ assert(versions_);
+ assert(versions_->GetColumnFamilySet());
+
+ ColumnFamilyData* const cfd = versions_->GetColumnFamilySet()->GetDefault();
+ assert(cfd);
+
+ Version* const version = cfd->current();
+ assert(version);
+
+ VersionStorageInfo* const storage_info = version->storage_info();
+ assert(storage_info);
+
+ {
+ constexpr uint64_t blob_file_number = 123;
+ constexpr uint64_t total_blob_count = 456;
+ constexpr uint64_t total_blob_bytes = 77777777;
+ constexpr char checksum_method[] = "SHA1";
+ constexpr char checksum_value[] =
+ "\xbd\xb7\xf3\x4a\x59\xdf\xa1\x59\x2c\xe7\xf5\x2e\x99\xf9\x8c\x57\x0c"
+ "\x52\x5c\xbd";
+
+ auto shared_meta = SharedBlobFileMetaData::Create(
+ blob_file_number, total_blob_count, total_blob_bytes, checksum_method,
+ checksum_value);
+
+ constexpr uint64_t garbage_blob_count = 89;
+ constexpr uint64_t garbage_blob_bytes = 1000000;
+
+ auto meta = BlobFileMetaData::Create(
+ std::move(shared_meta), BlobFileMetaData::LinkedSsts(),
+ garbage_blob_count, garbage_blob_bytes);
+
+ storage_info->AddBlobFile(std::move(meta));
+ }
+
+ {
+ constexpr uint64_t blob_file_number = 234;
+ constexpr uint64_t total_blob_count = 555;
+ constexpr uint64_t total_blob_bytes = 66666;
+ constexpr char checksum_method[] = "CRC32";
+ constexpr char checksum_value[] = "\x3d\x87\xff\x57";
+
+ auto shared_meta = SharedBlobFileMetaData::Create(
+ blob_file_number, total_blob_count, total_blob_bytes, checksum_method,
+ checksum_value);
+
+ constexpr uint64_t garbage_blob_count = 0;
+ constexpr uint64_t garbage_blob_bytes = 0;
+
+ auto meta = BlobFileMetaData::Create(
+ std::move(shared_meta), BlobFileMetaData::LinkedSsts(),
+ garbage_blob_count, garbage_blob_bytes);
+
+ storage_info->AddBlobFile(std::move(meta));
+ }
+
+ // Force the creation of a new manifest file and make sure metadata for
+ // the blob files is re-persisted.
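+  // Both blob files should produce a BlobFileAddition record, but only the
+  // first one, which has garbage, should also produce a BlobFileGarbage
+  // record.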
+ size_t addition_encoded = 0;
+ SyncPoint::GetInstance()->SetCallBack(
+ "BlobFileAddition::EncodeTo::CustomFields",
+ [&](void* /* arg */) { ++addition_encoded; });
+
+ size_t garbage_encoded = 0;
+ SyncPoint::GetInstance()->SetCallBack(
+ "BlobFileGarbage::EncodeTo::CustomFields",
+ [&](void* /* arg */) { ++garbage_encoded; });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ CreateNewManifest();
+
+ ASSERT_EQ(addition_encoded, 2);
+ ASSERT_EQ(garbage_encoded, 1);
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_F(VersionSetTest, AddLiveBlobFiles) {
+ // Initialize the database and add a blob file.
+ NewDB();
+
+ assert(versions_);
+ assert(versions_->GetColumnFamilySet());
+
+ ColumnFamilyData* const cfd = versions_->GetColumnFamilySet()->GetDefault();
+ assert(cfd);
+
+ Version* const first_version = cfd->current();
+ assert(first_version);
+
+ VersionStorageInfo* const first_storage_info = first_version->storage_info();
+ assert(first_storage_info);
+
+ constexpr uint64_t first_blob_file_number = 234;
+ constexpr uint64_t first_total_blob_count = 555;
+ constexpr uint64_t first_total_blob_bytes = 66666;
+ constexpr char first_checksum_method[] = "CRC32";
+ constexpr char first_checksum_value[] = "\x3d\x87\xff\x57";
+
+ auto first_shared_meta = SharedBlobFileMetaData::Create(
+ first_blob_file_number, first_total_blob_count, first_total_blob_bytes,
+ first_checksum_method, first_checksum_value);
+
+ constexpr uint64_t garbage_blob_count = 0;
+ constexpr uint64_t garbage_blob_bytes = 0;
+
+ auto first_meta = BlobFileMetaData::Create(
+ std::move(first_shared_meta), BlobFileMetaData::LinkedSsts(),
+ garbage_blob_count, garbage_blob_bytes);
+
+ first_storage_info->AddBlobFile(first_meta);
+
+ // Reference the version so it stays alive even after the following version
+ // edit.
+ first_version->Ref();
+
+ // Get live files directly from version.
+ std::vector<uint64_t> version_table_files;
+ std::vector<uint64_t> version_blob_files;
+
+ first_version->AddLiveFiles(&version_table_files, &version_blob_files);
+
+ ASSERT_EQ(version_blob_files.size(), 1);
+ ASSERT_EQ(version_blob_files[0], first_blob_file_number);
+
+ // Create a new version containing an additional blob file.
+ versions_->TEST_CreateAndAppendVersion(cfd);
+
+ Version* const second_version = cfd->current();
+ assert(second_version);
+ assert(second_version != first_version);
+
+ VersionStorageInfo* const second_storage_info =
+ second_version->storage_info();
+ assert(second_storage_info);
+
+ constexpr uint64_t second_blob_file_number = 456;
+ constexpr uint64_t second_total_blob_count = 100;
+ constexpr uint64_t second_total_blob_bytes = 2000000;
+ constexpr char second_checksum_method[] = "CRC32B";
+ constexpr char second_checksum_value[] = "\x6d\xbd\xf2\x3a";
+
+ auto second_shared_meta = SharedBlobFileMetaData::Create(
+ second_blob_file_number, second_total_blob_count, second_total_blob_bytes,
+ second_checksum_method, second_checksum_value);
+
+ auto second_meta = BlobFileMetaData::Create(
+ std::move(second_shared_meta), BlobFileMetaData::LinkedSsts(),
+ garbage_blob_count, garbage_blob_bytes);
+
+ second_storage_info->AddBlobFile(std::move(first_meta));
+ second_storage_info->AddBlobFile(std::move(second_meta));
+
+ // Get all live files from version set. Note that the result contains
+ // duplicates.
+ std::vector<uint64_t> all_table_files;
+ std::vector<uint64_t> all_blob_files;
+
+ versions_->AddLiveFiles(&all_table_files, &all_blob_files);
+
+ ASSERT_EQ(all_blob_files.size(), 3);
+ ASSERT_EQ(all_blob_files[0], first_blob_file_number);
+ ASSERT_EQ(all_blob_files[1], first_blob_file_number);
+ ASSERT_EQ(all_blob_files[2], second_blob_file_number);
+
+ // Clean up previous version.
+ first_version->Unref();
+}
+
+TEST_F(VersionSetTest, ObsoleteBlobFile) {
+ // Initialize the database and add a blob file that is entirely garbage
+ // and thus can immediately be marked obsolete.
+ NewDB();
+
+ VersionEdit edit;
+
+ constexpr uint64_t blob_file_number = 234;
+ constexpr uint64_t total_blob_count = 555;
+ constexpr uint64_t total_blob_bytes = 66666;
+ constexpr char checksum_method[] = "CRC32";
+ constexpr char checksum_value[] = "\x3d\x87\xff\x57";
+
+ edit.AddBlobFile(blob_file_number, total_blob_count, total_blob_bytes,
+ checksum_method, checksum_value);
+
+ edit.AddBlobFileGarbage(blob_file_number, total_blob_count, total_blob_bytes);
+
+ mutex_.Lock();
+ Status s =
+ versions_->LogAndApply(versions_->GetColumnFamilySet()->GetDefault(),
+ mutable_cf_options_, &edit, &mutex_, nullptr);
+ mutex_.Unlock();
+
+ ASSERT_OK(s);
+
+ // Make sure blob files from the pending number range are not returned
+ // as obsolete.
+ {
+ std::vector<ObsoleteFileInfo> table_files;
+ std::vector<ObsoleteBlobFileInfo> blob_files;
+ std::vector<std::string> manifest_files;
+ constexpr uint64_t min_pending_output = blob_file_number;
+
+ versions_->GetObsoleteFiles(&table_files, &blob_files, &manifest_files,
+ min_pending_output);
+
+ ASSERT_TRUE(blob_files.empty());
+ }
+
+ // Make sure the blob file is returned as obsolete if it's not in the pending
+ // range.
+ {
+ std::vector<ObsoleteFileInfo> table_files;
+ std::vector<ObsoleteBlobFileInfo> blob_files;
+ std::vector<std::string> manifest_files;
+ constexpr uint64_t min_pending_output = blob_file_number + 1;
+
+ versions_->GetObsoleteFiles(&table_files, &blob_files, &manifest_files,
+ min_pending_output);
+
+ ASSERT_EQ(blob_files.size(), 1);
+ ASSERT_EQ(blob_files[0].GetBlobFileNumber(), blob_file_number);
+ }
+
+ // Make sure it's not returned a second time.
+ {
+ std::vector<ObsoleteFileInfo> table_files;
+ std::vector<ObsoleteBlobFileInfo> blob_files;
+ std::vector<std::string> manifest_files;
+ constexpr uint64_t min_pending_output = blob_file_number + 1;
+
+ versions_->GetObsoleteFiles(&table_files, &blob_files, &manifest_files,
+ min_pending_output);
+
+ ASSERT_TRUE(blob_files.empty());
+ }
+}
+
+TEST_F(VersionSetTest, WalEditsNotAppliedToVersion) {
+ NewDB();
+
+ constexpr uint64_t kNumWals = 5;
+
+ autovector<std::unique_ptr<VersionEdit>> edits;
+ // Add some WALs.
+ for (uint64_t i = 1; i <= kNumWals; i++) {
+ edits.emplace_back(new VersionEdit);
+ // WAL's size equals its log number.
+ edits.back()->AddWal(i, WalMetadata(i));
+ }
+ // Delete the first half of the WALs.
+ edits.emplace_back(new VersionEdit);
+ edits.back()->DeleteWalsBefore(kNumWals / 2 + 1);
+
+ autovector<Version*> versions;
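+ // Capture the Version* (possibly nullptr) that is passed to the sync point
+ // when the manifest writes are processed.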
+ SyncPoint::GetInstance()->SetCallBack(
+ "VersionSet::ProcessManifestWrites:NewVersion",
+ [&](void* arg) { versions.push_back(reinterpret_cast<Version*>(arg)); });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(LogAndApplyToDefaultCF(edits));
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+
+ // Since the edits are all WAL edits, no version should be created.
+ ASSERT_EQ(versions.size(), 1);
+ ASSERT_EQ(versions[0], nullptr);
+}
+
+// Similar to WalEditsNotAppliedToVersion, but contains a non-WAL edit.
+TEST_F(VersionSetTest, NonWalEditsAppliedToVersion) {
+ NewDB();
+
+ const std::string kDBId = "db_db";
+ constexpr uint64_t kNumWals = 5;
+
+ autovector<std::unique_ptr<VersionEdit>> edits;
+ // Add some WALs.
+ for (uint64_t i = 1; i <= kNumWals; i++) {
+ edits.emplace_back(new VersionEdit);
+ // WAL's size equals its log number.
+ edits.back()->AddWal(i, WalMetadata(i));
+ }
+ // Delete the first half of the WALs.
+ edits.emplace_back(new VersionEdit);
+ edits.back()->DeleteWalsBefore(kNumWals / 2 + 1);
+ edits.emplace_back(new VersionEdit);
+ edits.back()->SetDBId(kDBId);
+
+ autovector<Version*> versions;
+ SyncPoint::GetInstance()->SetCallBack(
+ "VersionSet::ProcessManifestWrites:NewVersion",
+ [&](void* arg) { versions.push_back(reinterpret_cast<Version*>(arg)); });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(LogAndApplyToDefaultCF(edits));
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+
+ // Since the edit list contains a non-WAL edit, a new version should be
+ // created.
+ ASSERT_EQ(versions.size(), 1);
+ ASSERT_NE(versions[0], nullptr);
+}
+
+TEST_F(VersionSetTest, WalAddition) {
+ NewDB();
+
+ constexpr WalNumber kLogNumber = 10;
+ constexpr uint64_t kSizeInBytes = 111;
+
+ // A WAL has just been created.
+ {
+ VersionEdit edit;
+ edit.AddWal(kLogNumber);
+
+ ASSERT_OK(LogAndApplyToDefaultCF(edit));
+
+ const auto& wals = versions_->GetWalSet().GetWals();
+ ASSERT_EQ(wals.size(), 1);
+ ASSERT_TRUE(wals.find(kLogNumber) != wals.end());
+ ASSERT_FALSE(wals.at(kLogNumber).HasSyncedSize());
+ }
+
+ // The WAL is synced several times before closing.
+ {
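+ // Each iteration records a strictly larger synced size that is still smaller
+ // than kSizeInBytes, simulating repeated syncs of a growing WAL.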
+ for (uint64_t size_delta = 100; size_delta > 0; size_delta /= 2) {
+ uint64_t size = kSizeInBytes - size_delta;
+ WalMetadata wal(size);
+ VersionEdit edit;
+ edit.AddWal(kLogNumber, wal);
+
+ ASSERT_OK(LogAndApplyToDefaultCF(edit));
+
+ const auto& wals = versions_->GetWalSet().GetWals();
+ ASSERT_EQ(wals.size(), 1);
+ ASSERT_TRUE(wals.find(kLogNumber) != wals.end());
+ ASSERT_TRUE(wals.at(kLogNumber).HasSyncedSize());
+ ASSERT_EQ(wals.at(kLogNumber).GetSyncedSizeInBytes(), size);
+ }
+ }
+
+ // The WAL is closed.
+ {
+ WalMetadata wal(kSizeInBytes);
+ VersionEdit edit;
+ edit.AddWal(kLogNumber, wal);
+
+ ASSERT_OK(LogAndApplyToDefaultCF(edit));
+
+ const auto& wals = versions_->GetWalSet().GetWals();
+ ASSERT_EQ(wals.size(), 1);
+ ASSERT_TRUE(wals.find(kLogNumber) != wals.end());
+ ASSERT_TRUE(wals.at(kLogNumber).HasSyncedSize());
+ ASSERT_EQ(wals.at(kLogNumber).GetSyncedSizeInBytes(), kSizeInBytes);
+ }
+
+ // Recover a new VersionSet.
+ {
+ std::unique_ptr<VersionSet> new_versions(
+ new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(),
+ &write_buffer_manager_, &write_controller_,
+ /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
+ /*db_id*/ "", /*db_session_id*/ ""));
+ ASSERT_OK(new_versions->Recover(column_families_, /*read_only=*/false));
+ const auto& wals = new_versions->GetWalSet().GetWals();
+ ASSERT_EQ(wals.size(), 1);
+ ASSERT_TRUE(wals.find(kLogNumber) != wals.end());
+ ASSERT_TRUE(wals.at(kLogNumber).HasSyncedSize());
+ ASSERT_EQ(wals.at(kLogNumber).GetSyncedSizeInBytes(), kSizeInBytes);
+ }
+}
+
+TEST_F(VersionSetTest, WalCloseWithoutSync) {
+ NewDB();
+
+ constexpr WalNumber kLogNumber = 10;
+ constexpr uint64_t kSizeInBytes = 111;
+ constexpr uint64_t kSyncedSizeInBytes = kSizeInBytes / 2;
+
+ // A WAL has just been created.
+ {
+ VersionEdit edit;
+ edit.AddWal(kLogNumber);
+
+ ASSERT_OK(LogAndApplyToDefaultCF(edit));
+
+ const auto& wals = versions_->GetWalSet().GetWals();
+ ASSERT_EQ(wals.size(), 1);
+ ASSERT_TRUE(wals.find(kLogNumber) != wals.end());
+ ASSERT_FALSE(wals.at(kLogNumber).HasSyncedSize());
+ }
+
+ // The WAL is partially synced before closing.
+ {
+ WalMetadata wal(kSyncedSizeInBytes);
+ VersionEdit edit;
+ edit.AddWal(kLogNumber, wal);
+
+ ASSERT_OK(LogAndApplyToDefaultCF(edit));
+
+ const auto& wals = versions_->GetWalSet().GetWals();
+ ASSERT_EQ(wals.size(), 1);
+ ASSERT_TRUE(wals.find(kLogNumber) != wals.end());
+ ASSERT_TRUE(wals.at(kLogNumber).HasSyncedSize());
+ ASSERT_EQ(wals.at(kLogNumber).GetSyncedSizeInBytes(), kSyncedSizeInBytes);
+ }
+
+ // A new WAL with a larger log number is created, implicitly marking the
+ // current WAL closed.
+ {
+ VersionEdit edit;
+ edit.AddWal(kLogNumber + 1);
+ ASSERT_OK(LogAndApplyToDefaultCF(edit));
+
+ const auto& wals = versions_->GetWalSet().GetWals();
+ ASSERT_EQ(wals.size(), 2);
+ ASSERT_TRUE(wals.find(kLogNumber) != wals.end());
+ ASSERT_TRUE(wals.at(kLogNumber).HasSyncedSize());
+ ASSERT_EQ(wals.at(kLogNumber).GetSyncedSizeInBytes(), kSyncedSizeInBytes);
+ ASSERT_TRUE(wals.find(kLogNumber + 1) != wals.end());
+ ASSERT_FALSE(wals.at(kLogNumber + 1).HasSyncedSize());
+ }
+
+ // Recover a new VersionSet.
+ {
+ std::unique_ptr<VersionSet> new_versions(
+ new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(),
+ &write_buffer_manager_, &write_controller_,
+ /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
+ /*db_id*/ "", /*db_session_id*/ ""));
+ ASSERT_OK(new_versions->Recover(column_families_, false));
+ const auto& wals = new_versions->GetWalSet().GetWals();
+ ASSERT_EQ(wals.size(), 2);
+ ASSERT_TRUE(wals.find(kLogNumber) != wals.end());
+ ASSERT_TRUE(wals.at(kLogNumber).HasSyncedSize());
+ ASSERT_EQ(wals.at(kLogNumber).GetSyncedSizeInBytes(), kSyncedSizeInBytes);
+ }
+}
+
+TEST_F(VersionSetTest, WalDeletion) {
+ NewDB();
+
+ constexpr WalNumber kClosedLogNumber = 10;
+ constexpr WalNumber kNonClosedLogNumber = 20;
+ constexpr uint64_t kSizeInBytes = 111;
+
+ // Add a non-closed and a closed WAL.
+ {
+ VersionEdit edit;
+ edit.AddWal(kClosedLogNumber, WalMetadata(kSizeInBytes));
+ edit.AddWal(kNonClosedLogNumber);
+
+ ASSERT_OK(LogAndApplyToDefaultCF(edit));
+
+ const auto& wals = versions_->GetWalSet().GetWals();
+ ASSERT_EQ(wals.size(), 2);
+ ASSERT_TRUE(wals.find(kNonClosedLogNumber) != wals.end());
+ ASSERT_TRUE(wals.find(kClosedLogNumber) != wals.end());
+ ASSERT_FALSE(wals.at(kNonClosedLogNumber).HasSyncedSize());
+ ASSERT_TRUE(wals.at(kClosedLogNumber).HasSyncedSize());
+ ASSERT_EQ(wals.at(kClosedLogNumber).GetSyncedSizeInBytes(), kSizeInBytes);
+ }
+
+ // Delete the closed WAL.
+ {
+ VersionEdit edit;
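+ // DeleteWalsBefore(n) is expected to remove the WALs with log numbers smaller
+ // than n, so only the closed WAL should be deleted here.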
+ edit.DeleteWalsBefore(kNonClosedLogNumber);
+
+ ASSERT_OK(LogAndApplyToDefaultCF(edit));
+
+ const auto& wals = versions_->GetWalSet().GetWals();
+ ASSERT_EQ(wals.size(), 1);
+ ASSERT_TRUE(wals.find(kNonClosedLogNumber) != wals.end());
+ ASSERT_FALSE(wals.at(kNonClosedLogNumber).HasSyncedSize());
+ }
+
+ // Recover a new VersionSet, only the non-closed WAL should show up.
+ {
+ std::unique_ptr<VersionSet> new_versions(
+ new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(),
+ &write_buffer_manager_, &write_controller_,
+ /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
+ /*db_id*/ "", /*db_session_id*/ ""));
+ ASSERT_OK(new_versions->Recover(column_families_, false));
+ const auto& wals = new_versions->GetWalSet().GetWals();
+ ASSERT_EQ(wals.size(), 1);
+ ASSERT_TRUE(wals.find(kNonClosedLogNumber) != wals.end());
+ ASSERT_FALSE(wals.at(kNonClosedLogNumber).HasSyncedSize());
+ }
+
+ // Force the creation of a new MANIFEST file,
+ // only the non-closed WAL should be written to the new MANIFEST.
+ {
+ std::vector<WalAddition> wal_additions;
+ SyncPoint::GetInstance()->SetCallBack(
+ "VersionSet::WriteCurrentStateToManifest:SaveWal", [&](void* arg) {
+ VersionEdit* edit = reinterpret_cast<VersionEdit*>(arg);
+ ASSERT_TRUE(edit->IsWalAddition());
+ for (auto& addition : edit->GetWalAdditions()) {
+ wal_additions.push_back(addition);
+ }
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ CreateNewManifest();
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+
+ ASSERT_EQ(wal_additions.size(), 1);
+ ASSERT_EQ(wal_additions[0].GetLogNumber(), kNonClosedLogNumber);
+ ASSERT_FALSE(wal_additions[0].GetMetadata().HasSyncedSize());
+ }
+
+ // Recover from the new MANIFEST, only the non-closed WAL should show up.
+ {
+ std::unique_ptr<VersionSet> new_versions(
+ new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(),
+ &write_buffer_manager_, &write_controller_,
+ /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
+ /*db_id*/ "", /*db_session_id*/ ""));
+ ASSERT_OK(new_versions->Recover(column_families_, false));
+ const auto& wals = new_versions->GetWalSet().GetWals();
+ ASSERT_EQ(wals.size(), 1);
+ ASSERT_TRUE(wals.find(kNonClosedLogNumber) != wals.end());
+ ASSERT_FALSE(wals.at(kNonClosedLogNumber).HasSyncedSize());
+ }
+}
+
+TEST_F(VersionSetTest, WalCreateTwice) {
+ NewDB();
+
+ constexpr WalNumber kLogNumber = 10;
+
+ VersionEdit edit;
+ edit.AddWal(kLogNumber);
+
+ ASSERT_OK(LogAndApplyToDefaultCF(edit));
+
+ Status s = LogAndApplyToDefaultCF(edit);
+ ASSERT_TRUE(s.IsCorruption());
+ ASSERT_TRUE(s.ToString().find("WAL 10 is created more than once") !=
+ std::string::npos)
+ << s.ToString();
+}
+
+TEST_F(VersionSetTest, WalCreateAfterClose) {
+ NewDB();
+
+ constexpr WalNumber kLogNumber = 10;
+ constexpr uint64_t kSizeInBytes = 111;
+
+ {
+ // Add a closed WAL.
+ VersionEdit edit;
+ edit.AddWal(kLogNumber);
+ WalMetadata wal(kSizeInBytes);
+ edit.AddWal(kLogNumber, wal);
+
+ ASSERT_OK(LogAndApplyToDefaultCF(edit));
+ }
+
+ {
+ // Create the same WAL again.
+ VersionEdit edit;
+ edit.AddWal(kLogNumber);
+
+ Status s = LogAndApplyToDefaultCF(edit);
+ ASSERT_TRUE(s.IsCorruption());
+ ASSERT_TRUE(s.ToString().find("WAL 10 is created more than once") !=
+ std::string::npos)
+ << s.ToString();
+ }
+}
+
+TEST_F(VersionSetTest, AddWalWithSmallerSize) {
+ NewDB();
+ assert(versions_);
+
+ constexpr WalNumber kLogNumber = 10;
+ constexpr uint64_t kSizeInBytes = 111;
+
+ {
+ // Add a closed WAL.
+ VersionEdit edit;
+ WalMetadata wal(kSizeInBytes);
+ edit.AddWal(kLogNumber, wal);
+
+ ASSERT_OK(LogAndApplyToDefaultCF(edit));
+ }
+ // Copy for future comparison.
+ const std::map<WalNumber, WalMetadata> wals1 =
+ versions_->GetWalSet().GetWals();
+
+ {
+ // Add the same WAL with smaller synced size.
+ VersionEdit edit;
+ WalMetadata wal(kSizeInBytes / 2);
+ edit.AddWal(kLogNumber, wal);
+
+ Status s = LogAndApplyToDefaultCF(edit);
+ ASSERT_OK(s);
+ }
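+ // The edit with the smaller synced size is accepted, but the recorded WAL
+ // metadata should remain unchanged.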
+ const std::map<WalNumber, WalMetadata> wals2 =
+ versions_->GetWalSet().GetWals();
+ ASSERT_EQ(wals1, wals2);
+}
+
+TEST_F(VersionSetTest, DeleteWalsBeforeNonExistingWalNumber) {
+ NewDB();
+
+ constexpr WalNumber kLogNumber0 = 10;
+ constexpr WalNumber kLogNumber1 = 20;
+ constexpr WalNumber kNonExistingNumber = 15;
+ constexpr uint64_t kSizeInBytes = 111;
+
+ {
+ // Add closed WALs.
+ VersionEdit edit;
+ WalMetadata wal(kSizeInBytes);
+ edit.AddWal(kLogNumber0, wal);
+ edit.AddWal(kLogNumber1, wal);
+
+ ASSERT_OK(LogAndApplyToDefaultCF(edit));
+ }
+
+ {
+ // Delete WALs before a non-existing WAL.
+ VersionEdit edit;
+ edit.DeleteWalsBefore(kNonExistingNumber);
+
+ ASSERT_OK(LogAndApplyToDefaultCF(edit));
+ }
+
+ // Recover a new VersionSet, WAL0 is deleted, WAL1 is not.
+ {
+ std::unique_ptr<VersionSet> new_versions(
+ new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(),
+ &write_buffer_manager_, &write_controller_,
+ /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
+ /*db_id*/ "", /*db_session_id*/ ""));
+ ASSERT_OK(new_versions->Recover(column_families_, false));
+ const auto& wals = new_versions->GetWalSet().GetWals();
+ ASSERT_EQ(wals.size(), 1);
+ ASSERT_TRUE(wals.find(kLogNumber1) != wals.end());
+ }
+}
+
+TEST_F(VersionSetTest, DeleteAllWals) {
+ NewDB();
+
+ constexpr WalNumber kMaxLogNumber = 10;
+ constexpr uint64_t kSizeInBytes = 111;
+
+ {
+ // Add a closed WAL.
+ VersionEdit edit;
+ WalMetadata wal(kSizeInBytes);
+ edit.AddWal(kMaxLogNumber, wal);
+
+ ASSERT_OK(LogAndApplyToDefaultCF(edit));
+ }
+
+ {
+ VersionEdit edit;
+ edit.DeleteWalsBefore(kMaxLogNumber + 10);
+
+ ASSERT_OK(LogAndApplyToDefaultCF(edit));
+ }
+
+ // Recover a new VersionSet, all WALs are deleted.
+ {
+ std::unique_ptr<VersionSet> new_versions(
+ new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(),
+ &write_buffer_manager_, &write_controller_,
+ /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
+ /*db_id*/ "", /*db_session_id*/ ""));
+ ASSERT_OK(new_versions->Recover(column_families_, false));
+ const auto& wals = new_versions->GetWalSet().GetWals();
+ ASSERT_EQ(wals.size(), 0);
+ }
+}
+
+TEST_F(VersionSetTest, AtomicGroupWithWalEdits) {
+ NewDB();
+
+ constexpr int kAtomicGroupSize = 7;
+ constexpr uint64_t kNumWals = 5;
+ const std::string kDBId = "db_db";
+
+ int remaining = kAtomicGroupSize;
+ autovector<std::unique_ptr<VersionEdit>> edits;
+ // Add 5 WALs.
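+ // Each edit records the number of edits remaining after it in the atomic
+ // group, so the countdown reaches zero on the group's last edit.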
+ for (uint64_t i = 1; i <= kNumWals; i++) {
+ edits.emplace_back(new VersionEdit);
+ // WAL's size equals its log number.
+ edits.back()->AddWal(i, WalMetadata(i));
+ edits.back()->MarkAtomicGroup(--remaining);
+ }
+ // One edit setting the DB ID.
+ edits.emplace_back(new VersionEdit);
+ edits.back()->SetDBId(kDBId);
+ edits.back()->MarkAtomicGroup(--remaining);
+ // Delete the first four of the five WALs added above.
+ edits.emplace_back(new VersionEdit);
+ edits.back()->DeleteWalsBefore(kNumWals);
+ edits.back()->MarkAtomicGroup(--remaining);
+ ASSERT_EQ(remaining, 0);
+
+ ASSERT_OK(LogAndApplyToDefaultCF(edits));
+
+ // Recover a new VersionSet; the DB ID and the last WAL should be kept.
+ {
+ std::unique_ptr<VersionSet> new_versions(
+ new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(),
+ &write_buffer_manager_, &write_controller_,
+ /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
+ /*db_id*/ "", /*db_session_id*/ ""));
+ std::string db_id;
+ ASSERT_OK(
+ new_versions->Recover(column_families_, /*read_only=*/false, &db_id));
+
+ ASSERT_EQ(db_id, kDBId);
+
+ const auto& wals = new_versions->GetWalSet().GetWals();
+ ASSERT_EQ(wals.size(), 1);
+ ASSERT_TRUE(wals.find(kNumWals) != wals.end());
+ ASSERT_TRUE(wals.at(kNumWals).HasSyncedSize());
+ ASSERT_EQ(wals.at(kNumWals).GetSyncedSizeInBytes(), kNumWals);
+ }
+}
+
+class VersionSetWithTimestampTest : public VersionSetTest {
+ public:
+ static const std::string kNewCfName;
+
+ explicit VersionSetWithTimestampTest() : VersionSetTest() {}
+
+ void SetUp() override {
+ NewDB();
+ Options options;
+ options.comparator = test::BytewiseComparatorWithU64TsWrapper();
+ cfd_ = CreateColumnFamily(kNewCfName, options);
+ EXPECT_NE(nullptr, cfd_);
+ EXPECT_NE(nullptr, cfd_->GetLatestMutableCFOptions());
+ column_families_.emplace_back(kNewCfName, options);
+ }
+
+ void TearDown() override {
+ for (auto* e : edits_) {
+ delete e;
+ }
+ edits_.clear();
+ }
+
+ void GenVersionEditsToSetFullHistoryTsLow(
+ const std::vector<uint64_t>& ts_lbs) {
+ for (const auto ts_lb : ts_lbs) {
+ VersionEdit* edit = new VersionEdit;
+ edit->SetColumnFamily(cfd_->GetID());
+ std::string ts_str = test::EncodeInt(ts_lb);
+ edit->SetFullHistoryTsLow(ts_str);
+ edits_.emplace_back(edit);
+ }
+ }
+
+ void VerifyFullHistoryTsLow(uint64_t expected_ts_low) {
+ std::unique_ptr<VersionSet> vset(
+ new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(),
+ &write_buffer_manager_, &write_controller_,
+ /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
+ /*db_id*/ "", /*db_session_id*/ ""));
+ ASSERT_OK(vset->Recover(column_families_, /*read_only=*/false,
+ /*db_id=*/nullptr));
+ for (auto* cfd : *(vset->GetColumnFamilySet())) {
+ ASSERT_NE(nullptr, cfd);
+ if (cfd->GetName() == kNewCfName) {
+ ASSERT_EQ(test::EncodeInt(expected_ts_low), cfd->GetFullHistoryTsLow());
+ } else {
+ ASSERT_TRUE(cfd->GetFullHistoryTsLow().empty());
+ }
+ }
+ }
+
+ void DoTest(const std::vector<uint64_t>& ts_lbs) {
+ if (ts_lbs.empty()) {
+ return;
+ }
+
+ GenVersionEditsToSetFullHistoryTsLow(ts_lbs);
+
+ Status s;
+ mutex_.Lock();
+ s = versions_->LogAndApply(cfd_, *(cfd_->GetLatestMutableCFOptions()),
+ edits_, &mutex_, nullptr);
+ mutex_.Unlock();
+ ASSERT_OK(s);
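+ // full_history_ts_low can only move forward, so the recovered value should
+ // equal the maximum of the requested lower bounds.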
+ VerifyFullHistoryTsLow(*std::max_element(ts_lbs.begin(), ts_lbs.end()));
+ }
+
+ protected:
+ ColumnFamilyData* cfd_{nullptr};
+ // edits_ must contain and own pointers to heap-allocated VersionEdit objects.
+ autovector<VersionEdit*> edits_;
+};
+
+const std::string VersionSetWithTimestampTest::kNewCfName("new_cf");
+
+TEST_F(VersionSetWithTimestampTest, SetFullHistoryTsLbOnce) {
+ constexpr uint64_t kTsLow = 100;
+ DoTest({kTsLow});
+}
+
+// Simulate the application increasing full_history_ts_low.
+TEST_F(VersionSetWithTimestampTest, IncreaseFullHistoryTsLb) {
+ const std::vector<uint64_t> ts_lbs = {100, 101, 102, 103};
+ DoTest(ts_lbs);
+}
+
+// Simulate the application trying to decrease full_history_ts_low
+// unsuccessfully. If the application calls public API sequentially to
+// decrease the lower bound ts, RocksDB will return an InvalidArgument
+// status before involving VersionSet. Only when multiple threads trying
+// to decrease the lower bound concurrently will this case ever happen. Even
+// so, the lower bound cannot be decreased. The application will be notified
+// via return value of the API.
+TEST_F(VersionSetWithTimestampTest, TryDecreaseFullHistoryTsLb) {
+ const std::vector<uint64_t> ts_lbs = {103, 102, 101, 100};
+ DoTest(ts_lbs);
+}
+
+class VersionSetAtomicGroupTest : public VersionSetTestBase,
+ public testing::Test {
+ public:
+ VersionSetAtomicGroupTest()
+ : VersionSetTestBase("version_set_atomic_group_test") {}
+
+ void SetUp() override {
+ PrepareManifest(&column_families_, &last_seqno_, &log_writer_);
+ SetupTestSyncPoints();
+ }
+
+ void SetupValidAtomicGroup(int atomic_group_size) {
+ edits_.resize(atomic_group_size);
+ int remaining = atomic_group_size;
+ for (size_t i = 0; i != edits_.size(); ++i) {
+ edits_[i].SetLogNumber(0);
+ edits_[i].SetNextFile(2);
+ edits_[i].MarkAtomicGroup(--remaining);
+ edits_[i].SetLastSequence(last_seqno_++);
+ }
+ ASSERT_OK(SetCurrentFile(fs_.get(), dbname_, 1, nullptr));
+ }
+
+ void SetupIncompleteTrailingAtomicGroup(int atomic_group_size) {
+ edits_.resize(atomic_group_size);
+ int remaining = atomic_group_size;
+ for (size_t i = 0; i != edits_.size(); ++i) {
+ edits_[i].SetLogNumber(0);
+ edits_[i].SetNextFile(2);
+ edits_[i].MarkAtomicGroup(--remaining);
+ edits_[i].SetLastSequence(last_seqno_++);
+ }
+ ASSERT_OK(SetCurrentFile(fs_.get(), dbname_, 1, nullptr));
+ }
+
+ void SetupCorruptedAtomicGroup(int atomic_group_size) {
+ edits_.resize(atomic_group_size);
+ int remaining = atomic_group_size;
+ for (size_t i = 0; i != edits_.size(); ++i) {
+ edits_[i].SetLogNumber(0);
+ edits_[i].SetNextFile(2);
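+ // Leave the middle edit unmarked so that a normal edit appears inside the
+ // atomic group; recovery should detect this as corruption.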
+ if (i != ((size_t)atomic_group_size / 2)) {
+ edits_[i].MarkAtomicGroup(--remaining);
+ }
+ edits_[i].SetLastSequence(last_seqno_++);
+ }
+ ASSERT_OK(SetCurrentFile(fs_.get(), dbname_, 1, nullptr));
+ }
+
+ void SetupIncorrectAtomicGroup(int atomic_group_size) {
+ edits_.resize(atomic_group_size);
+ int remaining = atomic_group_size;
+ for (size_t i = 0; i != edits_.size(); ++i) {
+ edits_[i].SetLogNumber(0);
+ edits_[i].SetNextFile(2);
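+ // Give the second edit a wrong remaining count so the atomic group size is
+ // inconsistent; recovery should flag this edit.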
+ if (i != 1) {
+ edits_[i].MarkAtomicGroup(--remaining);
+ } else {
+ edits_[i].MarkAtomicGroup(remaining--);
+ }
+ edits_[i].SetLastSequence(last_seqno_++);
+ }
+ ASSERT_OK(SetCurrentFile(fs_.get(), dbname_, 1, nullptr));
+ }
+
+ void SetupTestSyncPoints() {
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->SetCallBack(
+ "AtomicGroupReadBuffer::AddEdit:FirstInAtomicGroup", [&](void* arg) {
+ VersionEdit* e = reinterpret_cast<VersionEdit*>(arg);
+ EXPECT_EQ(edits_.front().DebugString(),
+ e->DebugString()); // compare based on value
+ first_in_atomic_group_ = true;
+ });
+ SyncPoint::GetInstance()->SetCallBack(
+ "AtomicGroupReadBuffer::AddEdit:LastInAtomicGroup", [&](void* arg) {
+ VersionEdit* e = reinterpret_cast<VersionEdit*>(arg);
+ EXPECT_EQ(edits_.back().DebugString(),
+ e->DebugString()); // compare based on value
+ EXPECT_TRUE(first_in_atomic_group_);
+ last_in_atomic_group_ = true;
+ });
+ SyncPoint::GetInstance()->SetCallBack(
+ "VersionEditHandlerBase::Iterate:Finish", [&](void* arg) {
+ num_recovered_edits_ = *reinterpret_cast<size_t*>(arg);
+ });
+ SyncPoint::GetInstance()->SetCallBack(
+ "AtomicGroupReadBuffer::AddEdit:AtomicGroup",
+ [&](void* /* arg */) { ++num_edits_in_atomic_group_; });
+ SyncPoint::GetInstance()->SetCallBack(
+ "AtomicGroupReadBuffer::AddEdit:AtomicGroupMixedWithNormalEdits",
+ [&](void* arg) {
+ corrupted_edit_ = *reinterpret_cast<VersionEdit*>(arg);
+ });
+ SyncPoint::GetInstance()->SetCallBack(
+ "AtomicGroupReadBuffer::AddEdit:IncorrectAtomicGroupSize",
+ [&](void* arg) {
+ edit_with_incorrect_group_size_ =
+ *reinterpret_cast<VersionEdit*>(arg);
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ }
+
+ void AddNewEditsToLog(int num_edits) {
+ for (int i = 0; i < num_edits; i++) {
+ std::string record;
+ edits_[i].EncodeTo(&record);
+ ASSERT_OK(log_writer_->AddRecord(record));
+ }
+ }
+
+ void TearDown() override {
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ log_writer_.reset();
+ }
+
+ protected:
+ std::vector<ColumnFamilyDescriptor> column_families_;
+ SequenceNumber last_seqno_;
+ std::vector<VersionEdit> edits_;
+ bool first_in_atomic_group_ = false;
+ bool last_in_atomic_group_ = false;
+ int num_edits_in_atomic_group_ = 0;
+ size_t num_recovered_edits_ = 0;
+ VersionEdit corrupted_edit_;
+ VersionEdit edit_with_incorrect_group_size_;
+ std::unique_ptr<log::Writer> log_writer_;
+};
+
+TEST_F(VersionSetAtomicGroupTest, HandleValidAtomicGroupWithVersionSetRecover) {
+ const int kAtomicGroupSize = 3;
+ SetupValidAtomicGroup(kAtomicGroupSize);
+ AddNewEditsToLog(kAtomicGroupSize);
+ EXPECT_OK(versions_->Recover(column_families_, false));
+ EXPECT_EQ(column_families_.size(),
+ versions_->GetColumnFamilySet()->NumberOfColumnFamilies());
+ EXPECT_TRUE(first_in_atomic_group_);
+ EXPECT_TRUE(last_in_atomic_group_);
+ EXPECT_EQ(num_initial_edits_ + kAtomicGroupSize, num_recovered_edits_);
+}
+
+TEST_F(VersionSetAtomicGroupTest,
+ HandleValidAtomicGroupWithReactiveVersionSetRecover) {
+ const int kAtomicGroupSize = 3;
+ SetupValidAtomicGroup(kAtomicGroupSize);
+ AddNewEditsToLog(kAtomicGroupSize);
+ std::unique_ptr<log::FragmentBufferedReader> manifest_reader;
+ std::unique_ptr<log::Reader::Reporter> manifest_reporter;
+ std::unique_ptr<Status> manifest_reader_status;
+ EXPECT_OK(reactive_versions_->Recover(column_families_, &manifest_reader,
+ &manifest_reporter,
+ &manifest_reader_status));
+ EXPECT_EQ(column_families_.size(),
+ reactive_versions_->GetColumnFamilySet()->NumberOfColumnFamilies());
+ EXPECT_TRUE(first_in_atomic_group_);
+ EXPECT_TRUE(last_in_atomic_group_);
+ // Recovery should clean up the replay buffer.
+ EXPECT_TRUE(reactive_versions_->TEST_read_edits_in_atomic_group() == 0);
+ EXPECT_TRUE(reactive_versions_->replay_buffer().size() == 0);
+ EXPECT_EQ(num_initial_edits_ + kAtomicGroupSize, num_recovered_edits_);
+}
+
+TEST_F(VersionSetAtomicGroupTest,
+ HandleValidAtomicGroupWithReactiveVersionSetReadAndApply) {
+ const int kAtomicGroupSize = 3;
+ SetupValidAtomicGroup(kAtomicGroupSize);
+ std::unique_ptr<log::FragmentBufferedReader> manifest_reader;
+ std::unique_ptr<log::Reader::Reporter> manifest_reporter;
+ std::unique_ptr<Status> manifest_reader_status;
+ EXPECT_OK(reactive_versions_->Recover(column_families_, &manifest_reader,
+ &manifest_reporter,
+ &manifest_reader_status));
+ EXPECT_EQ(num_initial_edits_, num_recovered_edits_);
+ AddNewEditsToLog(kAtomicGroupSize);
+ InstrumentedMutex mu;
+ std::unordered_set<ColumnFamilyData*> cfds_changed;
+ mu.Lock();
+ EXPECT_OK(reactive_versions_->ReadAndApply(
+ &mu, &manifest_reader, manifest_reader_status.get(), &cfds_changed));
+ mu.Unlock();
+ EXPECT_TRUE(first_in_atomic_group_);
+ EXPECT_TRUE(last_in_atomic_group_);
+ // Reading and applying the atomic group should leave the replay buffer empty.
+ EXPECT_TRUE(reactive_versions_->TEST_read_edits_in_atomic_group() == 0);
+ EXPECT_TRUE(reactive_versions_->replay_buffer().size() == 0);
+ EXPECT_EQ(kAtomicGroupSize, num_recovered_edits_);
+}
+
+TEST_F(VersionSetAtomicGroupTest,
+ HandleIncompleteTrailingAtomicGroupWithVersionSetRecover) {
+ const int kAtomicGroupSize = 4;
+ const int kNumberOfPersistedVersionEdits = kAtomicGroupSize - 1;
+ SetupIncompleteTrailingAtomicGroup(kAtomicGroupSize);
+ AddNewEditsToLog(kNumberOfPersistedVersionEdits);
+ EXPECT_OK(versions_->Recover(column_families_, false));
+ EXPECT_EQ(column_families_.size(),
+ versions_->GetColumnFamilySet()->NumberOfColumnFamilies());
+ EXPECT_TRUE(first_in_atomic_group_);
+ EXPECT_FALSE(last_in_atomic_group_);
+ EXPECT_EQ(kNumberOfPersistedVersionEdits, num_edits_in_atomic_group_);
+ EXPECT_EQ(num_initial_edits_, num_recovered_edits_);
+}
+
+TEST_F(VersionSetAtomicGroupTest,
+ HandleIncompleteTrailingAtomicGroupWithReactiveVersionSetRecover) {
+ const int kAtomicGroupSize = 4;
+ const int kNumberOfPersistedVersionEdits = kAtomicGroupSize - 1;
+ SetupIncompleteTrailingAtomicGroup(kAtomicGroupSize);
+ AddNewEditsToLog(kNumberOfPersistedVersionEdits);
+ std::unique_ptr<log::FragmentBufferedReader> manifest_reader;
+ std::unique_ptr<log::Reader::Reporter> manifest_reporter;
+ std::unique_ptr<Status> manifest_reader_status;
+ EXPECT_OK(reactive_versions_->Recover(column_families_, &manifest_reader,
+ &manifest_reporter,
+ &manifest_reader_status));
+ EXPECT_EQ(column_families_.size(),
+ reactive_versions_->GetColumnFamilySet()->NumberOfColumnFamilies());
+ EXPECT_TRUE(first_in_atomic_group_);
+ EXPECT_FALSE(last_in_atomic_group_);
+ EXPECT_EQ(kNumberOfPersistedVersionEdits, num_edits_in_atomic_group_);
+ // Reactive version set should store the edits in the replay buffer.
+ EXPECT_TRUE(reactive_versions_->TEST_read_edits_in_atomic_group() ==
+ kNumberOfPersistedVersionEdits);
+ EXPECT_TRUE(reactive_versions_->replay_buffer().size() == kAtomicGroupSize);
+ // Write the last record. The reactive version set should now apply all
+ // edits.
+ std::string last_record;
+ edits_[kAtomicGroupSize - 1].EncodeTo(&last_record);
+ EXPECT_OK(log_writer_->AddRecord(last_record));
+ InstrumentedMutex mu;
+ std::unordered_set<ColumnFamilyData*> cfds_changed;
+ mu.Lock();
+ EXPECT_OK(reactive_versions_->ReadAndApply(
+ &mu, &manifest_reader, manifest_reader_status.get(), &cfds_changed));
+ mu.Unlock();
+ // The reactive version set's replay buffer should be empty now.
+ EXPECT_TRUE(reactive_versions_->TEST_read_edits_in_atomic_group() == 0);
+ EXPECT_TRUE(reactive_versions_->replay_buffer().size() == 0);
+ EXPECT_EQ(num_initial_edits_, num_recovered_edits_);
+}
+
+TEST_F(VersionSetAtomicGroupTest,
+ HandleIncompleteTrailingAtomicGroupWithReactiveVersionSetReadAndApply) {
+ const int kAtomicGroupSize = 4;
+ const int kNumberOfPersistedVersionEdits = kAtomicGroupSize - 1;
+ SetupIncompleteTrailingAtomicGroup(kAtomicGroupSize);
+ std::unique_ptr<log::FragmentBufferedReader> manifest_reader;
+ std::unique_ptr<log::Reader::Reporter> manifest_reporter;
+ std::unique_ptr<Status> manifest_reader_status;
+ // The MANIFEST does not contain any atomic group edits at this point.
+ EXPECT_OK(reactive_versions_->Recover(column_families_, &manifest_reader,
+ &manifest_reporter,
+ &manifest_reader_status));
+ EXPECT_EQ(column_families_.size(),
+ reactive_versions_->GetColumnFamilySet()->NumberOfColumnFamilies());
+ EXPECT_EQ(num_initial_edits_, num_recovered_edits_);
+ // Write a few edits in an atomic group.
+ AddNewEditsToLog(kNumberOfPersistedVersionEdits);
+ InstrumentedMutex mu;
+ std::unordered_set<ColumnFamilyData*> cfds_changed;
+ mu.Lock();
+ EXPECT_OK(reactive_versions_->ReadAndApply(
+ &mu, &manifest_reader, manifest_reader_status.get(), &cfds_changed));
+ mu.Unlock();
+ EXPECT_TRUE(first_in_atomic_group_);
+ EXPECT_FALSE(last_in_atomic_group_);
+ EXPECT_EQ(kNumberOfPersistedVersionEdits, num_edits_in_atomic_group_);
+ // Reactive version set should store the edits in the replay buffer.
+ EXPECT_TRUE(reactive_versions_->TEST_read_edits_in_atomic_group() ==
+ kNumberOfPersistedVersionEdits);
+ EXPECT_TRUE(reactive_versions_->replay_buffer().size() == kAtomicGroupSize);
+}
+
+TEST_F(VersionSetAtomicGroupTest,
+ HandleCorruptedAtomicGroupWithVersionSetRecover) {
+ const int kAtomicGroupSize = 4;
+ SetupCorruptedAtomicGroup(kAtomicGroupSize);
+ AddNewEditsToLog(kAtomicGroupSize);
+ EXPECT_NOK(versions_->Recover(column_families_, false));
+ EXPECT_EQ(column_families_.size(),
+ versions_->GetColumnFamilySet()->NumberOfColumnFamilies());
+ EXPECT_EQ(edits_[kAtomicGroupSize / 2].DebugString(),
+ corrupted_edit_.DebugString());
+}
+
+TEST_F(VersionSetAtomicGroupTest,
+ HandleCorruptedAtomicGroupWithReactiveVersionSetRecover) {
+ const int kAtomicGroupSize = 4;
+ SetupCorruptedAtomicGroup(kAtomicGroupSize);
+ AddNewEditsToLog(kAtomicGroupSize);
+ std::unique_ptr<log::FragmentBufferedReader> manifest_reader;
+ std::unique_ptr<log::Reader::Reporter> manifest_reporter;
+ std::unique_ptr<Status> manifest_reader_status;
+ EXPECT_NOK(reactive_versions_->Recover(column_families_, &manifest_reader,
+ &manifest_reporter,
+ &manifest_reader_status));
+ EXPECT_EQ(column_families_.size(),
+ reactive_versions_->GetColumnFamilySet()->NumberOfColumnFamilies());
+ EXPECT_EQ(edits_[kAtomicGroupSize / 2].DebugString(),
+ corrupted_edit_.DebugString());
+}
+
+TEST_F(VersionSetAtomicGroupTest,
+ HandleCorruptedAtomicGroupWithReactiveVersionSetReadAndApply) {
+ const int kAtomicGroupSize = 4;
+ SetupCorruptedAtomicGroup(kAtomicGroupSize);
+ InstrumentedMutex mu;
+ std::unordered_set<ColumnFamilyData*> cfds_changed;
+ std::unique_ptr<log::FragmentBufferedReader> manifest_reader;
+ std::unique_ptr<log::Reader::Reporter> manifest_reporter;
+ std::unique_ptr<Status> manifest_reader_status;
+ EXPECT_OK(reactive_versions_->Recover(column_families_, &manifest_reader,
+ &manifest_reporter,
+ &manifest_reader_status));
+ // Write the corrupted edits.
+ AddNewEditsToLog(kAtomicGroupSize);
+ mu.Lock();
+ EXPECT_NOK(reactive_versions_->ReadAndApply(
+ &mu, &manifest_reader, manifest_reader_status.get(), &cfds_changed));
+ mu.Unlock();
+ EXPECT_EQ(edits_[kAtomicGroupSize / 2].DebugString(),
+ corrupted_edit_.DebugString());
+}
+
+TEST_F(VersionSetAtomicGroupTest,
+ HandleIncorrectAtomicGroupSizeWithVersionSetRecover) {
+ const int kAtomicGroupSize = 4;
+ SetupIncorrectAtomicGroup(kAtomicGroupSize);
+ AddNewEditsToLog(kAtomicGroupSize);
+ EXPECT_NOK(versions_->Recover(column_families_, false));
+ EXPECT_EQ(column_families_.size(),
+ versions_->GetColumnFamilySet()->NumberOfColumnFamilies());
+ EXPECT_EQ(edits_[1].DebugString(),
+ edit_with_incorrect_group_size_.DebugString());
+}
+
+TEST_F(VersionSetAtomicGroupTest,
+ HandleIncorrectAtomicGroupSizeWithReactiveVersionSetRecover) {
+ const int kAtomicGroupSize = 4;
+ SetupIncorrectAtomicGroup(kAtomicGroupSize);
+ AddNewEditsToLog(kAtomicGroupSize);
+ std::unique_ptr<log::FragmentBufferedReader> manifest_reader;
+ std::unique_ptr<log::Reader::Reporter> manifest_reporter;
+ std::unique_ptr<Status> manifest_reader_status;
+ EXPECT_NOK(reactive_versions_->Recover(column_families_, &manifest_reader,
+ &manifest_reporter,
+ &manifest_reader_status));
+ EXPECT_EQ(column_families_.size(),
+ reactive_versions_->GetColumnFamilySet()->NumberOfColumnFamilies());
+ EXPECT_EQ(edits_[1].DebugString(),
+ edit_with_incorrect_group_size_.DebugString());
+}
+
+TEST_F(VersionSetAtomicGroupTest,
+ HandleIncorrectAtomicGroupSizeWithReactiveVersionSetReadAndApply) {
+ const int kAtomicGroupSize = 4;
+ SetupIncorrectAtomicGroup(kAtomicGroupSize);
+ InstrumentedMutex mu;
+ std::unordered_set<ColumnFamilyData*> cfds_changed;
+ std::unique_ptr<log::FragmentBufferedReader> manifest_reader;
+ std::unique_ptr<log::Reader::Reporter> manifest_reporter;
+ std::unique_ptr<Status> manifest_reader_status;
+ EXPECT_OK(reactive_versions_->Recover(column_families_, &manifest_reader,
+ &manifest_reporter,
+ &manifest_reader_status));
+ AddNewEditsToLog(kAtomicGroupSize);
+ mu.Lock();
+ EXPECT_NOK(reactive_versions_->ReadAndApply(
+ &mu, &manifest_reader, manifest_reader_status.get(), &cfds_changed));
+ mu.Unlock();
+ EXPECT_EQ(edits_[1].DebugString(),
+ edit_with_incorrect_group_size_.DebugString());
+}
+
+class VersionSetTestDropOneCF : public VersionSetTestBase,
+ public testing::TestWithParam<std::string> {
+ public:
+ VersionSetTestDropOneCF()
+ : VersionSetTestBase("version_set_test_drop_one_cf") {}
+};
+
+// This test simulates the following execution sequence
+// Time thread1 bg_flush_thr
+// | Prepare version edits (e1,e2,e3) for atomic
+// | flush cf1, cf2, cf3
+// | Enqueue e to drop cfi
+// | to manifest_writers_
+// | Enqueue (e1,e2,e3) to manifest_writers_
+// |
+// | Apply e,
+// | cfi.IsDropped() is true
+// | Apply (e1,e2,e3),
+// | since cfi.IsDropped() == true, we need to
+// | drop ei and write the rest to MANIFEST.
+// V
+//
+// Repeat the test for i = 1, 2, 3 to simulate dropping the first, middle and
+// last column family in an atomic group.
+TEST_P(VersionSetTestDropOneCF, HandleDroppedColumnFamilyInAtomicGroup) {
+ std::vector<ColumnFamilyDescriptor> column_families;
+ SequenceNumber last_seqno;
+ std::unique_ptr<log::Writer> log_writer;
+ PrepareManifest(&column_families, &last_seqno, &log_writer);
+ Status s = SetCurrentFile(fs_.get(), dbname_, 1, nullptr);
+ ASSERT_OK(s);
+
+ EXPECT_OK(versions_->Recover(column_families, false /* read_only */));
+ EXPECT_EQ(column_families.size(),
+ versions_->GetColumnFamilySet()->NumberOfColumnFamilies());
+
+ const int kAtomicGroupSize = 3;
+ const std::vector<std::string> non_default_cf_names = {
+ kColumnFamilyName1, kColumnFamilyName2, kColumnFamilyName3};
+
+ // Drop one column family
+ VersionEdit drop_cf_edit;
+ drop_cf_edit.DropColumnFamily();
+ const std::string cf_to_drop_name(GetParam());
+ auto cfd_to_drop =
+ versions_->GetColumnFamilySet()->GetColumnFamily(cf_to_drop_name);
+ ASSERT_NE(nullptr, cfd_to_drop);
+ // Increase its refcount because cfd_to_drop is used later, and we need to
+ // prevent it from being deleted.
+ cfd_to_drop->Ref();
+ drop_cf_edit.SetColumnFamily(cfd_to_drop->GetID());
+ mutex_.Lock();
+ s = versions_->LogAndApply(cfd_to_drop,
+ *cfd_to_drop->GetLatestMutableCFOptions(),
+ &drop_cf_edit, &mutex_, nullptr);
+ mutex_.Unlock();
+ ASSERT_OK(s);
+
+ std::vector<VersionEdit> edits(kAtomicGroupSize);
+ uint32_t remaining = kAtomicGroupSize;
+ size_t i = 0;
+ autovector<ColumnFamilyData*> cfds;
+ autovector<const MutableCFOptions*> mutable_cf_options_list;
+ autovector<autovector<VersionEdit*>> edit_lists;
+ for (const auto& cf_name : non_default_cf_names) {
+ auto cfd = (cf_name != cf_to_drop_name)
+ ? versions_->GetColumnFamilySet()->GetColumnFamily(cf_name)
+ : cfd_to_drop;
+ ASSERT_NE(nullptr, cfd);
+ cfds.push_back(cfd);
+ mutable_cf_options_list.emplace_back(cfd->GetLatestMutableCFOptions());
+ edits[i].SetColumnFamily(cfd->GetID());
+ edits[i].SetLogNumber(0);
+ edits[i].SetNextFile(2);
+ edits[i].MarkAtomicGroup(--remaining);
+ edits[i].SetLastSequence(last_seqno++);
+ autovector<VersionEdit*> tmp_edits;
+ tmp_edits.push_back(&edits[i]);
+ edit_lists.emplace_back(tmp_edits);
+ ++i;
+ }
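+ // The sync point below verifies that the edit belonging to the dropped column
+ // family is excluded, leaving kAtomicGroupSize - 1 edits in the group.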
+ int called = 0;
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->SetCallBack(
+ "VersionSet::ProcessManifestWrites:CheckOneAtomicGroup", [&](void* arg) {
+ std::vector<VersionEdit*>* tmp_edits =
+ reinterpret_cast<std::vector<VersionEdit*>*>(arg);
+ EXPECT_EQ(kAtomicGroupSize - 1, tmp_edits->size());
+ for (const auto e : *tmp_edits) {
+ bool found = false;
+ for (const auto& e2 : edits) {
+ if (&e2 == e) {
+ found = true;
+ break;
+ }
+ }
+ ASSERT_TRUE(found);
+ }
+ ++called;
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ mutex_.Lock();
+ s = versions_->LogAndApply(cfds, mutable_cf_options_list, edit_lists, &mutex_,
+ nullptr);
+ mutex_.Unlock();
+ ASSERT_OK(s);
+ ASSERT_EQ(1, called);
+ cfd_to_drop->UnrefAndTryDelete();
+}
+
+INSTANTIATE_TEST_CASE_P(
+ AtomicGroup, VersionSetTestDropOneCF,
+ testing::Values(VersionSetTestBase::kColumnFamilyName1,
+ VersionSetTestBase::kColumnFamilyName2,
+ VersionSetTestBase::kColumnFamilyName3));
+
+class EmptyDefaultCfNewManifest : public VersionSetTestBase,
+ public testing::Test {
+ public:
+ EmptyDefaultCfNewManifest() : VersionSetTestBase("version_set_new_db_test") {}
+ // Emulate DBImpl::NewDB()
+ void PrepareManifest(std::vector<ColumnFamilyDescriptor>* /*column_families*/,
+ SequenceNumber* /*last_seqno*/,
+ std::unique_ptr<log::Writer>* log_writer) override {
+ assert(log_writer != nullptr);
+ VersionEdit new_db;
+ new_db.SetLogNumber(0);
+ const std::string manifest_path = DescriptorFileName(dbname_, 1);
+ const auto& fs = env_->GetFileSystem();
+ std::unique_ptr<WritableFileWriter> file_writer;
+ Status s = WritableFileWriter::Create(
+ fs, manifest_path, fs->OptimizeForManifestWrite(env_options_),
+ &file_writer, nullptr);
+ ASSERT_OK(s);
+ log_writer->reset(new log::Writer(std::move(file_writer), 0, true));
+ std::string record;
+ ASSERT_TRUE(new_db.EncodeTo(&record));
+ s = (*log_writer)->AddRecord(record);
+ ASSERT_OK(s);
+ // Create new column family
+ VersionEdit new_cf;
+ new_cf.AddColumnFamily(VersionSetTestBase::kColumnFamilyName1);
+ new_cf.SetColumnFamily(1);
+ new_cf.SetLastSequence(2);
+ new_cf.SetNextFile(2);
+ record.clear();
+ ASSERT_TRUE(new_cf.EncodeTo(&record));
+ s = (*log_writer)->AddRecord(record);
+ ASSERT_OK(s);
+ }
+
+ protected:
+ bool write_dbid_to_manifest_ = false;
+ std::unique_ptr<log::Writer> log_writer_;
+};
+
+// Create the db and a column family. The column family creation switches to a
+// new MANIFEST. Then reopen the db and try to recover.
+TEST_F(EmptyDefaultCfNewManifest, Recover) {
+ PrepareManifest(nullptr, nullptr, &log_writer_);
+ log_writer_.reset();
+ Status s =
+ SetCurrentFile(fs_.get(), dbname_, 1, /*directory_to_fsync=*/nullptr);
+ ASSERT_OK(s);
+ std::string manifest_path;
+ VerifyManifest(&manifest_path);
+ std::vector<ColumnFamilyDescriptor> column_families;
+ column_families.emplace_back(kDefaultColumnFamilyName, cf_options_);
+ column_families.emplace_back(VersionSetTestBase::kColumnFamilyName1,
+ cf_options_);
+ std::string db_id;
+ bool has_missing_table_file = false;
+ s = versions_->TryRecoverFromOneManifest(
+ manifest_path, column_families, false, &db_id, &has_missing_table_file);
+ ASSERT_OK(s);
+ ASSERT_FALSE(has_missing_table_file);
+}
+
+class VersionSetTestEmptyDb
+ : public VersionSetTestBase,
+ public testing::TestWithParam<
+ std::tuple<bool, bool, std::vector<std::string>>> {
+ public:
+ static const std::string kUnknownColumnFamilyName;
+ VersionSetTestEmptyDb() : VersionSetTestBase("version_set_test_empty_db") {}
+
+ protected:
+ void PrepareManifest(std::vector<ColumnFamilyDescriptor>* /*column_families*/,
+ SequenceNumber* /*last_seqno*/,
+ std::unique_ptr<log::Writer>* log_writer) override {
+ assert(nullptr != log_writer);
+ VersionEdit new_db;
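+ // When the DB ID is to be written to the MANIFEST, read it from the IDENTITY
+ // file and record it in the initial version edit.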
+ if (db_options_.write_dbid_to_manifest) {
+ DBOptions tmp_db_options;
+ tmp_db_options.env = env_;
+ std::unique_ptr<DBImpl> impl(new DBImpl(tmp_db_options, dbname_));
+ std::string db_id;
+ impl->GetDbIdentityFromIdentityFile(&db_id);
+ new_db.SetDBId(db_id);
+ }
+ const std::string manifest_path = DescriptorFileName(dbname_, 1);
+ const auto& fs = env_->GetFileSystem();
+ std::unique_ptr<WritableFileWriter> file_writer;
+ Status s = WritableFileWriter::Create(
+ fs, manifest_path, fs->OptimizeForManifestWrite(env_options_),
+ &file_writer, nullptr);
+ ASSERT_OK(s);
+ {
+ log_writer->reset(new log::Writer(std::move(file_writer), 0, false));
+ std::string record;
+ new_db.EncodeTo(&record);
+ s = (*log_writer)->AddRecord(record);
+ ASSERT_OK(s);
+ }
+ }
+
+ std::unique_ptr<log::Writer> log_writer_;
+};
+
+const std::string VersionSetTestEmptyDb::kUnknownColumnFamilyName = "unknown";
+
+TEST_P(VersionSetTestEmptyDb, OpenFromIncompleteManifest0) {
+ db_options_.write_dbid_to_manifest = std::get<0>(GetParam());
+ PrepareManifest(nullptr, nullptr, &log_writer_);
+ log_writer_.reset();
+ Status s =
+ SetCurrentFile(fs_.get(), dbname_, 1, /*directory_to_fsync=*/nullptr);
+ ASSERT_OK(s);
+
+ std::string manifest_path;
+ VerifyManifest(&manifest_path);
+
+ bool read_only = std::get<1>(GetParam());
+ const std::vector<std::string> cf_names = std::get<2>(GetParam());
+
+ std::vector<ColumnFamilyDescriptor> column_families;
+ for (const auto& cf_name : cf_names) {
+ column_families.emplace_back(cf_name, cf_options_);
+ }
+
+ std::string db_id;
+ bool has_missing_table_file = false;
+ s = versions_->TryRecoverFromOneManifest(manifest_path, column_families,
+ read_only, &db_id,
+ &has_missing_table_file);
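+ // Recovery from this incomplete MANIFEST should fail: without the default
+ // column family the request is invalid; with it, corruption referencing the
+ // MANIFEST path is reported.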
+ auto iter =
+ std::find(cf_names.begin(), cf_names.end(), kDefaultColumnFamilyName);
+ if (iter == cf_names.end()) {
+ ASSERT_TRUE(s.IsInvalidArgument());
+ } else {
+ ASSERT_NE(s.ToString().find(manifest_path), std::string::npos);
+ ASSERT_TRUE(s.IsCorruption());
+ }
+}
+
+TEST_P(VersionSetTestEmptyDb, OpenFromIncompleteManifest1) {
+ db_options_.write_dbid_to_manifest = std::get<0>(GetParam());
+ PrepareManifest(nullptr, nullptr, &log_writer_);
+ // Only a subset of column families in the MANIFEST.
+ VersionEdit new_cf1;
+ new_cf1.AddColumnFamily(VersionSetTestBase::kColumnFamilyName1);
+ new_cf1.SetColumnFamily(1);
+ Status s;
+ {
+ std::string record;
+ new_cf1.EncodeTo(&record);
+ s = log_writer_->AddRecord(record);
+ ASSERT_OK(s);
+ }
+ log_writer_.reset();
+ s = SetCurrentFile(fs_.get(), dbname_, 1, /*directory_to_fsync=*/nullptr);
+ ASSERT_OK(s);
+
+ std::string manifest_path;
+ VerifyManifest(&manifest_path);
+
+ bool read_only = std::get<1>(GetParam());
+ const std::vector<std::string>& cf_names = std::get<2>(GetParam());
+ std::vector<ColumnFamilyDescriptor> column_families;
+ for (const auto& cf_name : cf_names) {
+ column_families.emplace_back(cf_name, cf_options_);
+ }
+ std::string db_id;
+ bool has_missing_table_file = false;
+ s = versions_->TryRecoverFromOneManifest(manifest_path, column_families,
+ read_only, &db_id,
+ &has_missing_table_file);
+ auto iter =
+ std::find(cf_names.begin(), cf_names.end(), kDefaultColumnFamilyName);
+ if (iter == cf_names.end()) {
+ ASSERT_TRUE(s.IsInvalidArgument());
+ } else {
+ ASSERT_NE(s.ToString().find(manifest_path), std::string::npos);
+ ASSERT_TRUE(s.IsCorruption());
+ }
+}
+
+TEST_P(VersionSetTestEmptyDb, OpenFromIncompleteManifest2) {
+ db_options_.write_dbid_to_manifest = std::get<0>(GetParam());
+ PrepareManifest(nullptr, nullptr, &log_writer_);
+ // Write all column families but no log_number, next_file_number and
+ // last_sequence.
+ const std::vector<std::string> all_cf_names = {
+ kDefaultColumnFamilyName, kColumnFamilyName1, kColumnFamilyName2,
+ kColumnFamilyName3};
+ uint32_t cf_id = 1;
+ Status s;
+ for (size_t i = 1; i != all_cf_names.size(); ++i) {
+ VersionEdit new_cf;
+ new_cf.AddColumnFamily(all_cf_names[i]);
+ new_cf.SetColumnFamily(cf_id++);
+ std::string record;
+ ASSERT_TRUE(new_cf.EncodeTo(&record));
+ s = log_writer_->AddRecord(record);
+ ASSERT_OK(s);
+ }
+ log_writer_.reset();
+ s = SetCurrentFile(fs_.get(), dbname_, 1, /*directory_to_fsync=*/nullptr);
+ ASSERT_OK(s);
+
+ std::string manifest_path;
+ VerifyManifest(&manifest_path);
+
+ bool read_only = std::get<1>(GetParam());
+ const std::vector<std::string>& cf_names = std::get<2>(GetParam());
+ std::vector<ColumnFamilyDescriptor> column_families;
+ for (const auto& cf_name : cf_names) {
+ column_families.emplace_back(cf_name, cf_options_);
+ }
+ std::string db_id;
+ bool has_missing_table_file = false;
+ s = versions_->TryRecoverFromOneManifest(manifest_path, column_families,
+ read_only, &db_id,
+ &has_missing_table_file);
+ auto iter =
+ std::find(cf_names.begin(), cf_names.end(), kDefaultColumnFamilyName);
+ if (iter == cf_names.end()) {
+ ASSERT_TRUE(s.IsInvalidArgument());
+ } else {
+ ASSERT_NE(s.ToString().find(manifest_path), std::string::npos);
+ ASSERT_TRUE(s.IsCorruption());
+ }
+}
+
+TEST_P(VersionSetTestEmptyDb, OpenManifestWithUnknownCF) {
+ db_options_.write_dbid_to_manifest = std::get<0>(GetParam());
+ PrepareManifest(nullptr, nullptr, &log_writer_);
+ // Write all column families, then an edit for column family ID 4, which was
+ // never added to the MANIFEST.
+ const std::vector<std::string> all_cf_names = {
+ kDefaultColumnFamilyName, kColumnFamilyName1, kColumnFamilyName2,
+ kColumnFamilyName3};
+ uint32_t cf_id = 1;
+ Status s;
+ for (size_t i = 1; i != all_cf_names.size(); ++i) {
+ VersionEdit new_cf;
+ new_cf.AddColumnFamily(all_cf_names[i]);
+ new_cf.SetColumnFamily(cf_id++);
+ std::string record;
+ ASSERT_TRUE(new_cf.EncodeTo(&record));
+ s = log_writer_->AddRecord(record);
+ ASSERT_OK(s);
+ }
+ {
+ VersionEdit tmp_edit;
+ tmp_edit.SetColumnFamily(4);
+ tmp_edit.SetLogNumber(0);
+ tmp_edit.SetNextFile(2);
+ tmp_edit.SetLastSequence(0);
+ std::string record;
+ ASSERT_TRUE(tmp_edit.EncodeTo(&record));
+ s = log_writer_->AddRecord(record);
+ ASSERT_OK(s);
+ }
+ log_writer_.reset();
+ s = SetCurrentFile(fs_.get(), dbname_, 1, /*directory_to_fsync=*/nullptr);
+ ASSERT_OK(s);
+
+ std::string manifest_path;
+ VerifyManifest(&manifest_path);
+
+ bool read_only = std::get<1>(GetParam());
+ const std::vector<std::string>& cf_names = std::get<2>(GetParam());
+ std::vector<ColumnFamilyDescriptor> column_families;
+ for (const auto& cf_name : cf_names) {
+ column_families.emplace_back(cf_name, cf_options_);
+ }
+ std::string db_id;
+ bool has_missing_table_file = false;
+ s = versions_->TryRecoverFromOneManifest(manifest_path, column_families,
+ read_only, &db_id,
+ &has_missing_table_file);
+ auto iter =
+ std::find(cf_names.begin(), cf_names.end(), kDefaultColumnFamilyName);
+ if (iter == cf_names.end()) {
+ ASSERT_TRUE(s.IsInvalidArgument());
+ } else {
+ ASSERT_NE(s.ToString().find(manifest_path), std::string::npos);
+ ASSERT_TRUE(s.IsCorruption());
+ }
+}
+
+TEST_P(VersionSetTestEmptyDb, OpenCompleteManifest) {
+ db_options_.write_dbid_to_manifest = std::get<0>(GetParam());
+ PrepareManifest(nullptr, nullptr, &log_writer_);
+ // Write all column families as well as the log_number, next_file_number and
+ // last_sequence, making the MANIFEST complete.
+ const std::vector<std::string> all_cf_names = {
+ kDefaultColumnFamilyName, kColumnFamilyName1, kColumnFamilyName2,
+ kColumnFamilyName3};
+ uint32_t cf_id = 1;
+ Status s;
+ for (size_t i = 1; i != all_cf_names.size(); ++i) {
+ VersionEdit new_cf;
+ new_cf.AddColumnFamily(all_cf_names[i]);
+ new_cf.SetColumnFamily(cf_id++);
+ std::string record;
+ ASSERT_TRUE(new_cf.EncodeTo(&record));
+ s = log_writer_->AddRecord(record);
+ ASSERT_OK(s);
+ }
+ {
+ VersionEdit tmp_edit;
+ tmp_edit.SetLogNumber(0);
+ tmp_edit.SetNextFile(2);
+ tmp_edit.SetLastSequence(0);
+ std::string record;
+ ASSERT_TRUE(tmp_edit.EncodeTo(&record));
+ s = log_writer_->AddRecord(record);
+ ASSERT_OK(s);
+ }
+ log_writer_.reset();
+ s = SetCurrentFile(fs_.get(), dbname_, 1, /*directory_to_fsync=*/nullptr);
+ ASSERT_OK(s);
+
+ std::string manifest_path;
+ VerifyManifest(&manifest_path);
+
+ bool read_only = std::get<1>(GetParam());
+ const std::vector<std::string>& cf_names = std::get<2>(GetParam());
+ std::vector<ColumnFamilyDescriptor> column_families;
+ for (const auto& cf_name : cf_names) {
+ column_families.emplace_back(cf_name, cf_options_);
+ }
+ std::string db_id;
+ bool has_missing_table_file = false;
+ s = versions_->TryRecoverFromOneManifest(manifest_path, column_families,
+ read_only, &db_id,
+ &has_missing_table_file);
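+ // Expected outcomes: missing default column family -> InvalidArgument;
+ // read-only open or a column family list covering all recorded families -> OK;
+ // a strict subset of the families -> InvalidArgument; a superset that also
+ // names the unknown family -> OK, with no ColumnFamilyData created for it.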
+ auto iter =
+ std::find(cf_names.begin(), cf_names.end(), kDefaultColumnFamilyName);
+ if (iter == cf_names.end()) {
+ ASSERT_TRUE(s.IsInvalidArgument());
+ } else if (read_only) {
+ ASSERT_OK(s);
+ ASSERT_FALSE(has_missing_table_file);
+ } else if (cf_names.size() == all_cf_names.size()) {
+ ASSERT_OK(s);
+ ASSERT_FALSE(has_missing_table_file);
+ } else if (cf_names.size() < all_cf_names.size()) {
+ ASSERT_TRUE(s.IsInvalidArgument());
+ } else {
+ ASSERT_OK(s);
+ ASSERT_FALSE(has_missing_table_file);
+ ColumnFamilyData* cfd = versions_->GetColumnFamilySet()->GetColumnFamily(
+ kUnknownColumnFamilyName);
+ ASSERT_EQ(nullptr, cfd);
+ }
+}
+
+INSTANTIATE_TEST_CASE_P(
+ BestEffortRecovery, VersionSetTestEmptyDb,
+ testing::Combine(
+ /*write_dbid_to_manifest=*/testing::Bool(),
+ /*read_only=*/testing::Bool(),
+ /*cf_names=*/
+ testing::Values(
+ std::vector<std::string>(),
+ std::vector<std::string>({kDefaultColumnFamilyName}),
+ std::vector<std::string>({VersionSetTestBase::kColumnFamilyName1,
+ VersionSetTestBase::kColumnFamilyName2,
+ VersionSetTestBase::kColumnFamilyName3}),
+ std::vector<std::string>({kDefaultColumnFamilyName,
+ VersionSetTestBase::kColumnFamilyName1}),
+ std::vector<std::string>({kDefaultColumnFamilyName,
+ VersionSetTestBase::kColumnFamilyName1,
+ VersionSetTestBase::kColumnFamilyName2,
+ VersionSetTestBase::kColumnFamilyName3}),
+ std::vector<std::string>(
+ {kDefaultColumnFamilyName,
+ VersionSetTestBase::kColumnFamilyName1,
+ VersionSetTestBase::kColumnFamilyName2,
+ VersionSetTestBase::kColumnFamilyName3,
+ VersionSetTestEmptyDb::kUnknownColumnFamilyName}))));
+
+class VersionSetTestMissingFiles : public VersionSetTestBase,
+ public testing::Test {
+ public:
+ VersionSetTestMissingFiles()
+ : VersionSetTestBase("version_set_test_missing_files"),
+ block_based_table_options_(),
+ table_factory_(std::make_shared<BlockBasedTableFactory>(
+ block_based_table_options_)),
+ internal_comparator_(
+ std::make_shared<InternalKeyComparator>(options_.comparator)) {}
+
+ protected:
+ void PrepareManifest(std::vector<ColumnFamilyDescriptor>* column_families,
+ SequenceNumber* last_seqno,
+ std::unique_ptr<log::Writer>* log_writer) override {
+ assert(column_families != nullptr);
+ assert(last_seqno != nullptr);
+ assert(log_writer != nullptr);
+ const std::string manifest = DescriptorFileName(dbname_, 1);
+ const auto& fs = env_->GetFileSystem();
+ std::unique_ptr<WritableFileWriter> file_writer;
+ Status s = WritableFileWriter::Create(
+ fs, manifest, fs->OptimizeForManifestWrite(env_options_), &file_writer,
+ nullptr);
+ ASSERT_OK(s);
+ log_writer->reset(new log::Writer(std::move(file_writer), 0, false));
+ VersionEdit new_db;
+ if (db_options_.write_dbid_to_manifest) {
+ DBOptions tmp_db_options;
+ tmp_db_options.env = env_;
+ std::unique_ptr<DBImpl> impl(new DBImpl(tmp_db_options, dbname_));
+ std::string db_id;
+ impl->GetDbIdentityFromIdentityFile(&db_id);
+ new_db.SetDBId(db_id);
+ }
+ {
+ std::string record;
+ ASSERT_TRUE(new_db.EncodeTo(&record));
+ s = (*log_writer)->AddRecord(record);
+ ASSERT_OK(s);
+ }
+ const std::vector<std::string> cf_names = {
+ kDefaultColumnFamilyName, kColumnFamilyName1, kColumnFamilyName2,
+ kColumnFamilyName3};
+ uint32_t cf_id = 1; // default cf id is 0
+ cf_options_.table_factory = table_factory_;
+ for (const auto& cf_name : cf_names) {
+ column_families->emplace_back(cf_name, cf_options_);
+ if (cf_name == kDefaultColumnFamilyName) {
+ continue;
+ }
+ VersionEdit new_cf;
+ new_cf.AddColumnFamily(cf_name);
+ new_cf.SetColumnFamily(cf_id);
+ std::string record;
+ ASSERT_TRUE(new_cf.EncodeTo(&record));
+ s = (*log_writer)->AddRecord(record);
+ ASSERT_OK(s);
+
+ VersionEdit cf_files;
+ cf_files.SetColumnFamily(cf_id);
+ cf_files.SetLogNumber(0);
+ record.clear();
+ ASSERT_TRUE(cf_files.EncodeTo(&record));
+ s = (*log_writer)->AddRecord(record);
+ ASSERT_OK(s);
+ ++cf_id;
+ }
+ SequenceNumber seq = 2;
+ {
+ VersionEdit edit;
+ edit.SetNextFile(7);
+ edit.SetLastSequence(seq);
+ std::string record;
+ ASSERT_TRUE(edit.EncodeTo(&record));
+ s = (*log_writer)->AddRecord(record);
+ ASSERT_OK(s);
+ }
+ *last_seqno = seq + 1;
+ }
+
+ struct SstInfo {
+ uint64_t file_number;
+ std::string column_family;
+ std::string key; // the only key
+ int level = 0;
+ SstInfo(uint64_t file_num, const std::string& cf_name,
+ const std::string& _key)
+ : SstInfo(file_num, cf_name, _key, 0) {}
+ SstInfo(uint64_t file_num, const std::string& cf_name,
+ const std::string& _key, int lvl)
+ : file_number(file_num),
+ column_family(cf_name),
+ key(_key),
+ level(lvl) {}
+ };
+
+ // Create dummy SST files and return their metadata. Note that only the file
+ // name and size are used.
+ void CreateDummyTableFiles(const std::vector<SstInfo>& file_infos,
+ std::vector<FileMetaData>* file_metas) {
+ assert(file_metas != nullptr);
+ for (const auto& info : file_infos) {
+ uint64_t file_num = info.file_number;
+ std::string fname = MakeTableFileName(dbname_, file_num);
+ std::unique_ptr<FSWritableFile> file;
+ Status s = fs_->NewWritableFile(fname, FileOptions(), &file, nullptr);
+ ASSERT_OK(s);
+ std::unique_ptr<WritableFileWriter> fwriter(new WritableFileWriter(
+ std::move(file), fname, FileOptions(), env_->GetSystemClock().get()));
+ IntTblPropCollectorFactories int_tbl_prop_collector_factories;
+
+ std::unique_ptr<TableBuilder> builder(table_factory_->NewTableBuilder(
+ TableBuilderOptions(
+ immutable_options_, mutable_cf_options_, *internal_comparator_,
+ &int_tbl_prop_collector_factories, kNoCompression,
+ CompressionOptions(),
+ TablePropertiesCollectorFactory::Context::kUnknownColumnFamily,
+ info.column_family, info.level),
+ fwriter.get()));
+ InternalKey ikey(info.key, 0, ValueType::kTypeValue);
+ builder->Add(ikey.Encode(), "value");
+ ASSERT_OK(builder->Finish());
+ ASSERT_OK(fwriter->Flush());
+ uint64_t file_size = 0;
+ s = fs_->GetFileSize(fname, IOOptions(), &file_size, nullptr);
+ ASSERT_OK(s);
+ ASSERT_NE(0, file_size);
+ file_metas->emplace_back(file_num, /*file_path_id=*/0, file_size, ikey,
+ ikey, 0, 0, false, Temperature::kUnknown, 0, 0,
+ 0, kUnknownFileChecksum,
+ kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+ }
+ }
+
+ // This method updates last_sequence_.
+ void WriteFileAdditionAndDeletionToManifest(
+ uint32_t cf, const std::vector<std::pair<int, FileMetaData>>& added_files,
+ const std::vector<std::pair<int, uint64_t>>& deleted_files) {
+ VersionEdit edit;
+ edit.SetColumnFamily(cf);
+ for (const auto& elem : added_files) {
+ int level = elem.first;
+ edit.AddFile(level, elem.second);
+ }
+ for (const auto& elem : deleted_files) {
+ int level = elem.first;
+ edit.DeleteFile(level, elem.second);
+ }
+ edit.SetLastSequence(last_seqno_);
+ ++last_seqno_;
+ assert(log_writer_.get() != nullptr);
+ std::string record;
+ ASSERT_TRUE(edit.EncodeTo(&record));
+ Status s = log_writer_->AddRecord(record);
+ ASSERT_OK(s);
+ }
+
+ BlockBasedTableOptions block_based_table_options_;
+ std::shared_ptr<TableFactory> table_factory_;
+ std::shared_ptr<InternalKeyComparator> internal_comparator_;
+ std::vector<ColumnFamilyDescriptor> column_families_;
+ SequenceNumber last_seqno_;
+ std::unique_ptr<log::Writer> log_writer_;
+};
+
+TEST_F(VersionSetTestMissingFiles, ManifestFarBehindSst) {
+ std::vector<SstInfo> existing_files = {
+ SstInfo(100, kDefaultColumnFamilyName, "a"),
+ SstInfo(102, kDefaultColumnFamilyName, "b"),
+ SstInfo(103, kDefaultColumnFamilyName, "c"),
+ SstInfo(107, kDefaultColumnFamilyName, "d"),
+ SstInfo(110, kDefaultColumnFamilyName, "e")};
+ std::vector<FileMetaData> file_metas;
+ CreateDummyTableFiles(existing_files, &file_metas);
+
+ PrepareManifest(&column_families_, &last_seqno_, &log_writer_);
+ std::vector<std::pair<int, FileMetaData>> added_files;
+ for (uint64_t file_num = 10; file_num < 15; ++file_num) {
+ std::string smallest_ukey = "a";
+ std::string largest_ukey = "b";
+ InternalKey smallest_ikey(smallest_ukey, 1, ValueType::kTypeValue);
+ InternalKey largest_ikey(largest_ukey, 1, ValueType::kTypeValue);
+ FileMetaData meta = FileMetaData(
+ file_num, /*file_path_id=*/0, /*file_size=*/12, smallest_ikey,
+ largest_ikey, 0, 0, false, Temperature::kUnknown, 0, 0, 0,
+ kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+ added_files.emplace_back(0, meta);
+ }
+ WriteFileAdditionAndDeletionToManifest(
+ /*cf=*/0, added_files, std::vector<std::pair<int, uint64_t>>());
+ std::vector<std::pair<int, uint64_t>> deleted_files;
+ deleted_files.emplace_back(0, 10);
+ WriteFileAdditionAndDeletionToManifest(
+ /*cf=*/0, std::vector<std::pair<int, FileMetaData>>(), deleted_files);
+ log_writer_.reset();
+ Status s = SetCurrentFile(fs_.get(), dbname_, 1, nullptr);
+ ASSERT_OK(s);
+ std::string manifest_path;
+ VerifyManifest(&manifest_path);
+ std::string db_id;
+ bool has_missing_table_file = false;
+ s = versions_->TryRecoverFromOneManifest(manifest_path, column_families_,
+ /*read_only=*/false, &db_id,
+ &has_missing_table_file);
+ ASSERT_OK(s);
+ ASSERT_TRUE(has_missing_table_file);
+ for (ColumnFamilyData* cfd : *(versions_->GetColumnFamilySet())) {
+ VersionStorageInfo* vstorage = cfd->current()->storage_info();
+ const std::vector<FileMetaData*>& files = vstorage->LevelFiles(0);
+ ASSERT_TRUE(files.empty());
+ }
+}
+
+TEST_F(VersionSetTestMissingFiles, ManifestAheadofSst) {
+ std::vector<SstInfo> existing_files = {
+ SstInfo(100, kDefaultColumnFamilyName, "a"),
+ SstInfo(102, kDefaultColumnFamilyName, "b"),
+ SstInfo(103, kDefaultColumnFamilyName, "c"),
+ SstInfo(107, kDefaultColumnFamilyName, "d"),
+ SstInfo(110, kDefaultColumnFamilyName, "e")};
+ std::vector<FileMetaData> file_metas;
+ CreateDummyTableFiles(existing_files, &file_metas);
+
+ PrepareManifest(&column_families_, &last_seqno_, &log_writer_);
+ std::vector<std::pair<int, FileMetaData>> added_files;
+ for (size_t i = 3; i != 5; ++i) {
+ added_files.emplace_back(0, file_metas[i]);
+ }
+ WriteFileAdditionAndDeletionToManifest(
+ /*cf=*/0, added_files, std::vector<std::pair<int, uint64_t>>());
+
+ added_files.clear();
+ for (uint64_t file_num = 120; file_num < 130; ++file_num) {
+ std::string smallest_ukey = "a";
+ std::string largest_ukey = "b";
+ InternalKey smallest_ikey(smallest_ukey, 1, ValueType::kTypeValue);
+ InternalKey largest_ikey(largest_ukey, 1, ValueType::kTypeValue);
+ FileMetaData meta = FileMetaData(
+ file_num, /*file_path_id=*/0, /*file_size=*/12, smallest_ikey,
+ largest_ikey, 0, 0, false, Temperature::kUnknown, 0, 0, 0,
+ kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+ added_files.emplace_back(0, meta);
+ }
+ WriteFileAdditionAndDeletionToManifest(
+ /*cf=*/0, added_files, std::vector<std::pair<int, uint64_t>>());
+ log_writer_.reset();
+ Status s = SetCurrentFile(fs_.get(), dbname_, 1, nullptr);
+ ASSERT_OK(s);
+ std::string manifest_path;
+ VerifyManifest(&manifest_path);
+ std::string db_id;
+ bool has_missing_table_file = false;
+ s = versions_->TryRecoverFromOneManifest(manifest_path, column_families_,
+ /*read_only=*/false, &db_id,
+ &has_missing_table_file);
+ ASSERT_OK(s);
+ ASSERT_TRUE(has_missing_table_file);
+ for (ColumnFamilyData* cfd : *(versions_->GetColumnFamilySet())) {
+ VersionStorageInfo* vstorage = cfd->current()->storage_info();
+ const std::vector<FileMetaData*>& files = vstorage->LevelFiles(0);
+ if (cfd->GetName() == kDefaultColumnFamilyName) {
+ ASSERT_EQ(2, files.size());
+ for (const auto* fmeta : files) {
+ ASSERT_TRUE(fmeta->fd.GetNumber() == 107 || fmeta->fd.GetNumber() == 110);
+ }
+ } else {
+ ASSERT_TRUE(files.empty());
+ }
+ }
+}
+
+TEST_F(VersionSetTestMissingFiles, NoFileMissing) {
+ std::vector<SstInfo> existing_files = {
+ SstInfo(100, kDefaultColumnFamilyName, "a"),
+ SstInfo(102, kDefaultColumnFamilyName, "b"),
+ SstInfo(103, kDefaultColumnFamilyName, "c"),
+ SstInfo(107, kDefaultColumnFamilyName, "d"),
+ SstInfo(110, kDefaultColumnFamilyName, "e")};
+ std::vector<FileMetaData> file_metas;
+ CreateDummyTableFiles(existing_files, &file_metas);
+
+ PrepareManifest(&column_families_, &last_seqno_, &log_writer_);
+ std::vector<std::pair<int, FileMetaData>> added_files;
+ for (const auto& meta : file_metas) {
+ added_files.emplace_back(0, meta);
+ }
+ WriteFileAdditionAndDeletionToManifest(
+ /*cf=*/0, added_files, std::vector<std::pair<int, uint64_t>>());
+ std::vector<std::pair<int, uint64_t>> deleted_files;
+ deleted_files.emplace_back(/*level=*/0, 100);
+ WriteFileAdditionAndDeletionToManifest(
+ /*cf=*/0, std::vector<std::pair<int, FileMetaData>>(), deleted_files);
+ log_writer_.reset();
+ Status s = SetCurrentFile(fs_.get(), dbname_, 1, nullptr);
+ ASSERT_OK(s);
+ std::string manifest_path;
+ VerifyManifest(&manifest_path);
+ std::string db_id;
+ bool has_missing_table_file = false;
+ s = versions_->TryRecoverFromOneManifest(manifest_path, column_families_,
+ /*read_only=*/false, &db_id,
+ &has_missing_table_file);
+ ASSERT_OK(s);
+ ASSERT_FALSE(has_missing_table_file);
+ for (ColumnFamilyData* cfd : *(versions_->GetColumnFamilySet())) {
+ VersionStorageInfo* vstorage = cfd->current()->storage_info();
+ const std::vector<FileMetaData*>& files = vstorage->LevelFiles(0);
+ if (cfd->GetName() == kDefaultColumnFamilyName) {
+ ASSERT_EQ(existing_files.size() - deleted_files.size(), files.size());
+ bool has_deleted_file = false;
+ for (const auto* fmeta : files) {
+ if (fmeta->fd.GetNumber() == 100) {
+ has_deleted_file = true;
+ break;
+ }
+ }
+ ASSERT_FALSE(has_deleted_file);
+ } else {
+ ASSERT_TRUE(files.empty());
+ }
+ }
+}
+
+TEST_F(VersionSetTestMissingFiles, MinLogNumberToKeep2PC) {
+ db_options_.allow_2pc = true;
+ NewDB();
+
+ SstInfo sst(100, kDefaultColumnFamilyName, "a");
+ std::vector<FileMetaData> file_metas;
+ CreateDummyTableFiles({sst}, &file_metas);
+
+ constexpr WalNumber kMinWalNumberToKeep2PC = 10;
+ VersionEdit edit;
+ edit.AddFile(0, file_metas[0]);
+ edit.SetMinLogNumberToKeep(kMinWalNumberToKeep2PC);
+ ASSERT_OK(LogAndApplyToDefaultCF(edit));
+ ASSERT_EQ(versions_->min_log_number_to_keep(), kMinWalNumberToKeep2PC);
+
+ for (int i = 0; i < 3; i++) {
+ CreateNewManifest();
+ ReopenDB();
+ ASSERT_EQ(versions_->min_log_number_to_keep(), kMinWalNumberToKeep2PC);
+ }
+}
+
+class ChargeFileMetadataTest : public DBTestBase {
+ public:
+ ChargeFileMetadataTest()
+ : DBTestBase("charge_file_metadata_test", /*env_do_fsync=*/true) {}
+};
+
+class ChargeFileMetadataTestWithParam
+ : public ChargeFileMetadataTest,
+ public testing::WithParamInterface<CacheEntryRoleOptions::Decision> {
+ public:
+ ChargeFileMetadataTestWithParam() {}
+};
+
+#ifndef ROCKSDB_LITE
+INSTANTIATE_TEST_CASE_P(
+ ChargeFileMetadataTestWithParam, ChargeFileMetadataTestWithParam,
+ ::testing::Values(CacheEntryRoleOptions::Decision::kEnabled,
+ CacheEntryRoleOptions::Decision::kDisabled));
+
+TEST_P(ChargeFileMetadataTestWithParam, Basic) {
+ Options options;
+ BlockBasedTableOptions table_options;
+ CacheEntryRoleOptions::Decision charge_file_metadata = GetParam();
+ table_options.cache_usage_options.options_overrides.insert(
+ {CacheEntryRole::kFileMetadata, {/*.charged = */ charge_file_metadata}});
+ std::shared_ptr<TargetCacheChargeTrackingCache<CacheEntryRole::kFileMetadata>>
+ file_metadata_charge_only_cache = std::make_shared<
+ TargetCacheChargeTrackingCache<CacheEntryRole::kFileMetadata>>(
+ NewLRUCache(
+ 4 * CacheReservationManagerImpl<
+ CacheEntryRole::kFileMetadata>::GetDummyEntrySize(),
+ 0 /* num_shard_bits */, true /* strict_capacity_limit */));
+ table_options.block_cache = file_metadata_charge_only_cache;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ options.create_if_missing = true;
+ options.disable_auto_compactions = true;
+ DestroyAndReopen(options);
+
+ // Create 128 file metadata, each of which is roughly 1024 bytes.
+ // This results in 1 *
+ // CacheReservationManagerImpl<CacheEntryRole::kFileMetadata>::GetDummyEntrySize()
+ // cache reservation for file metadata.
+ for (int i = 1; i <= 128; ++i) {
+ ASSERT_OK(Put(std::string(1024, 'a'), "va"));
+ ASSERT_OK(Put("b", "vb"));
+ ASSERT_OK(Flush());
+ }
+ if (charge_file_metadata == CacheEntryRoleOptions::Decision::kEnabled) {
+ EXPECT_EQ(file_metadata_charge_only_cache->GetCacheCharge(),
+ 1 * CacheReservationManagerImpl<
+ CacheEntryRole::kFileMetadata>::GetDummyEntrySize());
+
+ } else {
+ EXPECT_EQ(file_metadata_charge_only_cache->GetCacheCharge(), 0);
+ }
+
+ // Create another 128 file metadata.
+ // This increases the file metadata cache reservation to 2 *
+ // CacheReservationManagerImpl<CacheEntryRole::kFileMetadata>::GetDummyEntrySize().
+ for (int i = 1; i <= 128; ++i) {
+ ASSERT_OK(Put(std::string(1024, 'a'), "vva"));
+ ASSERT_OK(Put("b", "vvb"));
+ ASSERT_OK(Flush());
+ }
+ if (charge_file_metadata == CacheEntryRoleOptions::Decision::kEnabled) {
+ EXPECT_EQ(file_metadata_charge_only_cache->GetCacheCharge(),
+ 2 * CacheReservationManagerImpl<
+ CacheEntryRole::kFileMetadata>::GetDummyEntrySize());
+ } else {
+ EXPECT_EQ(file_metadata_charge_only_cache->GetCacheCharge(), 0);
+ }
+ // Compaction will create 1 new file metadata and will obsolete and delete
+ // all 256 file metadata above. This results in 1 *
+ // CacheReservationManagerImpl<CacheEntryRole::kFileMetadata>::GetDummyEntrySize()
+ // cache reservation for file metadata.
+ SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::BackgroundCallCompaction:PurgedObsoleteFiles",
+ "ChargeFileMetadataTestWithParam::"
+ "PreVerifyingCacheReservationRelease"}});
+ SyncPoint::GetInstance()->EnableProcessing();
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ("0,1", FilesPerLevel(0));
+ TEST_SYNC_POINT(
+ "ChargeFileMetadataTestWithParam::PreVerifyingCacheReservationRelease");
+ if (charge_file_metadata == CacheEntryRoleOptions::Decision::kEnabled) {
+ EXPECT_EQ(file_metadata_charge_only_cache->GetCacheCharge(),
+ 1 * CacheReservationManagerImpl<
+ CacheEntryRole::kFileMetadata>::GetDummyEntrySize());
+ } else {
+ EXPECT_EQ(file_metadata_charge_only_cache->GetCacheCharge(), 0);
+ }
+ SyncPoint::GetInstance()->DisableProcessing();
+
+ // Destroying the db will delete the one remaining new file metadata.
+ // This results in no cache reservation for file metadata.
+ Destroy(options);
+ EXPECT_EQ(file_metadata_charge_only_cache->GetCacheCharge(),
+ 0 * CacheReservationManagerImpl<
+ CacheEntryRole::kFileMetadata>::GetDummyEntrySize());
+
+ // Reopen the db with a smaller cache in order to test failure to allocate
+ // file metadata due to the memory limit based on cache capacity.
+ file_metadata_charge_only_cache = std::make_shared<
+ TargetCacheChargeTrackingCache<CacheEntryRole::kFileMetadata>>(
+ NewLRUCache(1 * CacheReservationManagerImpl<
+ CacheEntryRole::kFileMetadata>::GetDummyEntrySize(),
+ 0 /* num_shard_bits */, true /* strict_capacity_limit */));
+ table_options.block_cache = file_metadata_charge_only_cache;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ Reopen(options);
+ ASSERT_OK(Put(std::string(1024, 'a'), "va"));
+ ASSERT_OK(Put("b", "vb"));
+ Status s = Flush();
+ if (charge_file_metadata == CacheEntryRoleOptions::Decision::kEnabled) {
+ EXPECT_TRUE(s.IsMemoryLimit());
+ EXPECT_TRUE(s.ToString().find(
+ kCacheEntryRoleToCamelString[static_cast<std::uint32_t>(
+ CacheEntryRole::kFileMetadata)]) != std::string::npos);
+ EXPECT_TRUE(s.ToString().find("memory limit based on cache capacity") !=
+ std::string::npos);
+ } else {
+ EXPECT_TRUE(s.ok());
+ }
+}
+#endif // ROCKSDB_LITE
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
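The parameterized test above configures charging through BlockBasedTableOptions::cache_usage_options. For reference, a hedged sketch of the same setup in application code; the 64 MB capacity is an arbitrary example and the usual public headers (rocksdb/options.h, rocksdb/table.h, rocksdb/cache.h) are assumed:

    // Sketch only: enable charging of file metadata against the block cache.
    rocksdb::Options MakeOptionsChargingFileMetadata() {
      rocksdb::BlockBasedTableOptions table_options;
      table_options.block_cache =
          rocksdb::NewLRUCache(64 << 20 /* arbitrary 64 MB capacity */);
      table_options.cache_usage_options.options_overrides.insert(
          {rocksdb::CacheEntryRole::kFileMetadata,
           {/*.charged = */ rocksdb::CacheEntryRoleOptions::Decision::kEnabled}});
      rocksdb::Options options;
      options.table_factory.reset(
          rocksdb::NewBlockBasedTableFactory(table_options));
      return options;
    }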
diff --git a/src/rocksdb/db/version_util.h b/src/rocksdb/db/version_util.h
new file mode 100644
index 000000000..5ec6fda11
--- /dev/null
+++ b/src/rocksdb/db/version_util.h
@@ -0,0 +1,71 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include "db/version_set.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Instead of opening a `DB` to perform certain manifest updates, this
+// uses the underlying `VersionSet` API to read and modify the MANIFEST. This
+// allows us to use the user's real options, while not having to worry about
+// the DB persisting new SST files via flush/compaction or attempting to read/
+// compact files which may fail, particularly for the file we intend to remove
+// (the user may want to remove an already deleted file from MANIFEST).
+class OfflineManifestWriter {
+ public:
+ OfflineManifestWriter(const DBOptions& options, const std::string& db_path)
+ : wc_(options.delayed_write_rate),
+ wb_(options.db_write_buffer_size),
+ immutable_db_options_(WithDbPath(options, db_path)),
+ tc_(NewLRUCache(1 << 20 /* capacity */,
+ options.table_cache_numshardbits)),
+ versions_(db_path, &immutable_db_options_, sopt_, tc_.get(), &wb_, &wc_,
+ /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
+ /*db_id*/ "", /*db_session_id*/ "") {}
+
+ Status Recover(const std::vector<ColumnFamilyDescriptor>& column_families) {
+ return versions_.Recover(column_families, /*read_only*/ false,
+ /*db_id*/ nullptr,
+ /*no_error_if_files_missing*/ true);
+ }
+
+ Status LogAndApply(ColumnFamilyData* cfd, VersionEdit* edit,
+ FSDirectory* dir_contains_current_file) {
+ // Use `mutex` to imitate a locked DB mutex when calling `LogAndApply()`.
+ InstrumentedMutex mutex;
+ mutex.Lock();
+ Status s = versions_.LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(),
+ edit, &mutex, dir_contains_current_file,
+ false /* new_descriptor_log */);
+ mutex.Unlock();
+ return s;
+ }
+
+ VersionSet& Versions() { return versions_; }
+ const ImmutableDBOptions& IOptions() { return immutable_db_options_; }
+
+ private:
+ WriteController wc_;
+ WriteBufferManager wb_;
+ ImmutableDBOptions immutable_db_options_;
+ std::shared_ptr<Cache> tc_;
+ EnvOptions sopt_;
+ VersionSet versions_;
+
+ static ImmutableDBOptions WithDbPath(const DBOptions& options,
+ const std::string& db_path) {
+ ImmutableDBOptions rv(options);
+ if (rv.db_paths.empty()) {
+ // `VersionSet` expects options that have been through
+ // `SanitizeOptions()`, which would sanitize an empty `db_paths`.
+ rv.db_paths.emplace_back(db_path, 0 /* target_size */);
+ }
+ return rv;
+ }
+};
+
+} // namespace ROCKSDB_NAMESPACE
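As a usage illustration only (not part of this header), an offline repair tool might drive the writer roughly as follows; the default-column-family-only setup and the target file number are placeholder assumptions:

    // Hedged sketch, assumed to live inside ROCKSDB_NAMESPACE with this header
    // included; error handling is reduced to early returns.
    Status RemoveSstFromManifest(const DBOptions& db_options,
                                 const std::string& db_path,
                                 uint64_t sst_file_number) {
      std::vector<ColumnFamilyDescriptor> cf_descs = {ColumnFamilyDescriptor(
          kDefaultColumnFamilyName, ColumnFamilyOptions())};
      OfflineManifestWriter writer(db_options, db_path);
      Status s = writer.Recover(cf_descs);
      if (!s.ok()) {
        return s;
      }
      VersionEdit edit;
      edit.SetColumnFamily(0);
      edit.DeleteFile(/*level=*/0, sst_file_number);  // placeholder target
      std::unique_ptr<FSDirectory> db_dir;
      s = db_options.env->GetFileSystem()->NewDirectory(db_path, IOOptions(),
                                                        &db_dir, nullptr);
      if (!s.ok()) {
        return s;
      }
      return writer.LogAndApply(
          writer.Versions().GetColumnFamilySet()->GetDefault(), &edit,
          db_dir.get());
    }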
diff --git a/src/rocksdb/db/wal_edit.cc b/src/rocksdb/db/wal_edit.cc
new file mode 100644
index 000000000..2525be610
--- /dev/null
+++ b/src/rocksdb/db/wal_edit.cc
@@ -0,0 +1,211 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/wal_edit.h"
+
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+#include "util/coding.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+void WalAddition::EncodeTo(std::string* dst) const {
+ PutVarint64(dst, number_);
+
+ if (metadata_.HasSyncedSize()) {
+ PutVarint32(dst, static_cast<uint32_t>(WalAdditionTag::kSyncedSize));
+ PutVarint64(dst, metadata_.GetSyncedSizeInBytes());
+ }
+
+ PutVarint32(dst, static_cast<uint32_t>(WalAdditionTag::kTerminate));
+}
+
+Status WalAddition::DecodeFrom(Slice* src) {
+ constexpr char class_name[] = "WalAddition";
+
+ if (!GetVarint64(src, &number_)) {
+ return Status::Corruption(class_name, "Error decoding WAL log number");
+ }
+
+ while (true) {
+ uint32_t tag_value = 0;
+ if (!GetVarint32(src, &tag_value)) {
+ return Status::Corruption(class_name, "Error decoding tag");
+ }
+ WalAdditionTag tag = static_cast<WalAdditionTag>(tag_value);
+ switch (tag) {
+ case WalAdditionTag::kSyncedSize: {
+ uint64_t size = 0;
+ if (!GetVarint64(src, &size)) {
+ return Status::Corruption(class_name, "Error decoding WAL file size");
+ }
+ metadata_.SetSyncedSizeInBytes(size);
+ break;
+ }
+ // TODO: process future tags such as checksum.
+ case WalAdditionTag::kTerminate:
+ return Status::OK();
+ default: {
+ std::stringstream ss;
+ ss << "Unknown tag " << tag_value;
+ return Status::Corruption(class_name, ss.str());
+ }
+ }
+ }
+}
+
+JSONWriter& operator<<(JSONWriter& jw, const WalAddition& wal) {
+ jw << "LogNumber" << wal.GetLogNumber() << "SyncedSizeInBytes"
+ << wal.GetMetadata().GetSyncedSizeInBytes();
+ return jw;
+}
+
+std::ostream& operator<<(std::ostream& os, const WalAddition& wal) {
+ os << "log_number: " << wal.GetLogNumber()
+ << " synced_size_in_bytes: " << wal.GetMetadata().GetSyncedSizeInBytes();
+ return os;
+}
+
+std::string WalAddition::DebugString() const {
+ std::ostringstream oss;
+ oss << *this;
+ return oss.str();
+}
+
+void WalDeletion::EncodeTo(std::string* dst) const {
+ PutVarint64(dst, number_);
+}
+
+Status WalDeletion::DecodeFrom(Slice* src) {
+ constexpr char class_name[] = "WalDeletion";
+
+ if (!GetVarint64(src, &number_)) {
+ return Status::Corruption(class_name, "Error decoding WAL log number");
+ }
+
+ return Status::OK();
+}
+
+JSONWriter& operator<<(JSONWriter& jw, const WalDeletion& wal) {
+ jw << "LogNumber" << wal.GetLogNumber();
+ return jw;
+}
+
+std::ostream& operator<<(std::ostream& os, const WalDeletion& wal) {
+ os << "log_number: " << wal.GetLogNumber();
+ return os;
+}
+
+std::string WalDeletion::DebugString() const {
+ std::ostringstream oss;
+ oss << *this;
+ return oss.str();
+}
+
+Status WalSet::AddWal(const WalAddition& wal) {
+ if (wal.GetLogNumber() < min_wal_number_to_keep_) {
+ // The WAL is already obsolete, ignore it.
+ return Status::OK();
+ }
+
+ auto it = wals_.lower_bound(wal.GetLogNumber());
+ bool existing = it != wals_.end() && it->first == wal.GetLogNumber();
+
+ if (!existing) {
+ wals_.insert(it, {wal.GetLogNumber(), wal.GetMetadata()});
+ return Status::OK();
+ }
+
+ assert(existing);
+ if (!wal.GetMetadata().HasSyncedSize()) {
+ std::stringstream ss;
+ ss << "WAL " << wal.GetLogNumber() << " is created more than once";
+ return Status::Corruption("WalSet::AddWal", ss.str());
+ }
+
+ assert(wal.GetMetadata().HasSyncedSize());
+ if (it->second.HasSyncedSize() && wal.GetMetadata().GetSyncedSizeInBytes() <=
+ it->second.GetSyncedSizeInBytes()) {
+ // This is possible because version edits with different synced WAL sizes
+ // for the same WAL can be committed out-of-order. For example, thread
+ // 1 syncs the first 10 bytes of 1.log, while thread 2 syncs the first 20
+ // bytes of 1.log. It's possible that thread 1 calls LogAndApply() after
+ // thread 2.
+ // In this case, just return ok.
+ return Status::OK();
+ }
+
+ // Update synced size for the given WAL.
+ it->second.SetSyncedSizeInBytes(wal.GetMetadata().GetSyncedSizeInBytes());
+ return Status::OK();
+}
+
+Status WalSet::AddWals(const WalAdditions& wals) {
+ Status s;
+ for (const WalAddition& wal : wals) {
+ s = AddWal(wal);
+ if (!s.ok()) {
+ break;
+ }
+ }
+ return s;
+}
+
+Status WalSet::DeleteWalsBefore(WalNumber wal) {
+ if (wal > min_wal_number_to_keep_) {
+ min_wal_number_to_keep_ = wal;
+ wals_.erase(wals_.begin(), wals_.lower_bound(wal));
+ }
+ return Status::OK();
+}
+
+void WalSet::Reset() {
+ wals_.clear();
+ min_wal_number_to_keep_ = 0;
+}
+
+Status WalSet::CheckWals(
+ Env* env,
+ const std::unordered_map<WalNumber, std::string>& logs_on_disk) const {
+ assert(env != nullptr);
+
+ Status s;
+ for (const auto& wal : wals_) {
+ const uint64_t log_number = wal.first;
+ const WalMetadata& wal_meta = wal.second;
+
+ if (!wal_meta.HasSyncedSize()) {
+ // The WAL and the WAL directory have not been synced,
+ // so the WAL's inode may not be persisted,
+ // and the WAL might not show up when listing the WAL directory.
+ continue;
+ }
+
+ if (logs_on_disk.find(log_number) == logs_on_disk.end()) {
+ std::stringstream ss;
+ ss << "Missing WAL with log number: " << log_number << ".";
+ s = Status::Corruption(ss.str());
+ break;
+ }
+
+ uint64_t log_file_size = 0;
+ s = env->GetFileSize(logs_on_disk.at(log_number), &log_file_size);
+ if (!s.ok()) {
+ break;
+ }
+ if (log_file_size < wal_meta.GetSyncedSizeInBytes()) {
+ std::stringstream ss;
+ ss << "Size mismatch: WAL (log number: " << log_number
+ << ") in MANIFEST is " << wal_meta.GetSyncedSizeInBytes()
+ << " bytes , but actually is " << log_file_size << " bytes on disk.";
+ s = Status::Corruption(ss.str());
+ break;
+ }
+ }
+
+ return s;
+}
+
+} // namespace ROCKSDB_NAMESPACE
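To make the tag layout above concrete, here is a small hedged round-trip sketch (test-style code; the values are arbitrary and <cassert> is assumed):

    // Sketch: encode a WalAddition and decode it back.
    void WalAdditionRoundTripSketch() {
      WalAddition addition(/*number=*/5, WalMetadata(/*synced_size_bytes=*/4096));
      std::string encoded;
      addition.EncodeTo(&encoded);  // varint log number, kSyncedSize tag + size,
                                    // then the kTerminate tag
      WalAddition decoded;
      Slice input(encoded);
      Status s = decoded.DecodeFrom(&input);
      assert(s.ok());
      assert(decoded.GetLogNumber() == 5);
      assert(decoded.GetMetadata().GetSyncedSizeInBytes() == 4096);
    }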
diff --git a/src/rocksdb/db/wal_edit.h b/src/rocksdb/db/wal_edit.h
new file mode 100644
index 000000000..bb5c5e292
--- /dev/null
+++ b/src/rocksdb/db/wal_edit.h
@@ -0,0 +1,177 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+// WAL related classes used in VersionEdit and VersionSet.
+// Modifications to WalAddition and WalDeletion may need to update
+// VersionEdit and its related tests.
+
+#pragma once
+
+#include <map>
+#include <ostream>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "logging/event_logger.h"
+#include "port/port.h"
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class JSONWriter;
+class Slice;
+class Status;
+
+using WalNumber = uint64_t;
+
+// Metadata of a WAL.
+class WalMetadata {
+ public:
+ WalMetadata() = default;
+
+ explicit WalMetadata(uint64_t synced_size_bytes)
+ : synced_size_bytes_(synced_size_bytes) {}
+
+ bool HasSyncedSize() const { return synced_size_bytes_ != kUnknownWalSize; }
+
+ void SetSyncedSizeInBytes(uint64_t bytes) { synced_size_bytes_ = bytes; }
+
+ uint64_t GetSyncedSizeInBytes() const { return synced_size_bytes_; }
+
+ private:
+ friend bool operator==(const WalMetadata& lhs, const WalMetadata& rhs);
+ friend bool operator!=(const WalMetadata& lhs, const WalMetadata& rhs);
+ // The size of WAL is unknown, used when the WAL is not synced yet or is
+ // empty.
+ constexpr static uint64_t kUnknownWalSize =
+ std::numeric_limits<uint64_t>::max();
+
+ // Size of the most recently synced WAL in bytes.
+ uint64_t synced_size_bytes_ = kUnknownWalSize;
+};
+
+inline bool operator==(const WalMetadata& lhs, const WalMetadata& rhs) {
+ return lhs.synced_size_bytes_ == rhs.synced_size_bytes_;
+}
+
+inline bool operator!=(const WalMetadata& lhs, const WalMetadata& rhs) {
+ return !(lhs == rhs);
+}
+
+// These tags are persisted to MANIFEST, so they are part of the user API.
+enum class WalAdditionTag : uint32_t {
+ // Indicates that there are no more tags.
+ kTerminate = 1,
+ // Synced Size in bytes.
+ kSyncedSize = 2,
+ // Add tags in the future, such as checksum?
+};
+
+// Records the event of adding a WAL in VersionEdit.
+class WalAddition {
+ public:
+ WalAddition() : number_(0), metadata_() {}
+
+ explicit WalAddition(WalNumber number) : number_(number), metadata_() {}
+
+ WalAddition(WalNumber number, WalMetadata meta)
+ : number_(number), metadata_(std::move(meta)) {}
+
+ WalNumber GetLogNumber() const { return number_; }
+
+ const WalMetadata& GetMetadata() const { return metadata_; }
+
+ void EncodeTo(std::string* dst) const;
+
+ Status DecodeFrom(Slice* src);
+
+ std::string DebugString() const;
+
+ private:
+ WalNumber number_;
+ WalMetadata metadata_;
+};
+
+std::ostream& operator<<(std::ostream& os, const WalAddition& wal);
+JSONWriter& operator<<(JSONWriter& jw, const WalAddition& wal);
+
+using WalAdditions = std::vector<WalAddition>;
+
+// Records the event of deleting WALs before the specified log number.
+class WalDeletion {
+ public:
+ WalDeletion() : number_(kEmpty) {}
+
+ explicit WalDeletion(WalNumber number) : number_(number) {}
+
+ WalNumber GetLogNumber() const { return number_; }
+
+ void EncodeTo(std::string* dst) const;
+
+ Status DecodeFrom(Slice* src);
+
+ std::string DebugString() const;
+
+ bool IsEmpty() const { return number_ == kEmpty; }
+
+ void Reset() { number_ = kEmpty; }
+
+ private:
+ static constexpr WalNumber kEmpty = 0;
+
+ WalNumber number_;
+};
+
+std::ostream& operator<<(std::ostream& os, const WalDeletion& wal);
+JSONWriter& operator<<(JSONWriter& jw, const WalDeletion& wal);
+
+// Used in VersionSet to keep the current set of WALs.
+//
+// When a WAL is synced or becomes obsolete,
+// a VersionEdit is logged to MANIFEST and
+// the WAL is added to or deleted from WalSet.
+//
+// Not thread safe, needs external synchronization such as holding DB mutex.
+class WalSet {
+ public:
+ // Add WAL(s).
+ // If the WAL is closed,
+ // then there must be an existing unclosed WAL,
+ // otherwise, return Status::Corruption.
+ // Can happen when applying a VersionEdit or recovering from MANIFEST.
+ Status AddWal(const WalAddition& wal);
+ Status AddWals(const WalAdditions& wals);
+
+ // Delete WALs with log number smaller than the specified wal number.
+ // Can happen when applying a VersionEdit or recovering from MANIFEST.
+ Status DeleteWalsBefore(WalNumber wal);
+
+ // Resets the internal state.
+ void Reset();
+
+ // WALs with number less than MinWalNumberToKeep should not exist in WalSet.
+ WalNumber GetMinWalNumberToKeep() const { return min_wal_number_to_keep_; }
+
+ const std::map<WalNumber, WalMetadata>& GetWals() const { return wals_; }
+
+ // Checks whether there are missing or corrupted WALs.
+ // Returns Status::OK if there are no missing or corrupted WALs,
+ // otherwise returns Status::Corruption.
+ // logs_on_disk is a map from log number to the log filename.
+ // Note that logs_on_disk may contain logs that are obsolete but
+ // haven't been deleted from disk yet.
+ Status CheckWals(
+ Env* env,
+ const std::unordered_map<WalNumber, std::string>& logs_on_disk) const;
+
+ private:
+ std::map<WalNumber, WalMetadata> wals_;
+ // WAL number < min_wal_number_to_keep_ should not exist in wals_.
+ // It's monotonically increasing, in-memory only, not written to MANIFEST.
+ WalNumber min_wal_number_to_keep_ = 0;
+};
+
+} // namespace ROCKSDB_NAMESPACE
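A short hedged sketch of the lifecycle described in the WalSet comment above (creation, sync/close, obsolescence); the WAL numbers and size are arbitrary:

    // Sketch: typical WalSet usage while applying VersionEdits.
    void WalSetLifecycleSketch() {
      WalSet wal_set;
      // WAL 7 is created, later synced/closed with a known size, then obsoleted.
      Status s = wal_set.AddWal(WalAddition(/*number=*/7));
      if (s.ok()) {
        s = wal_set.AddWal(
            WalAddition(/*number=*/7, WalMetadata(/*synced_size_bytes=*/1 << 20)));
      }
      if (s.ok()) {
        s = wal_set.DeleteWalsBefore(/*wal=*/8);
      }
      // On success, GetWals() is now empty and GetMinWalNumberToKeep() == 8.
    }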
diff --git a/src/rocksdb/db/wal_edit_test.cc b/src/rocksdb/db/wal_edit_test.cc
new file mode 100644
index 000000000..0c18fb125
--- /dev/null
+++ b/src/rocksdb/db/wal_edit_test.cc
@@ -0,0 +1,213 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/wal_edit.h"
+
+#include "db/db_test_util.h"
+#include "file/file_util.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+TEST(WalSet, AddDeleteReset) {
+ WalSet wals;
+ ASSERT_TRUE(wals.GetWals().empty());
+
+ // Create WAL 1 - 10.
+ for (WalNumber log_number = 1; log_number <= 10; log_number++) {
+ wals.AddWal(WalAddition(log_number));
+ }
+ ASSERT_EQ(wals.GetWals().size(), 10);
+
+ // Delete WAL 1 - 5.
+ wals.DeleteWalsBefore(6);
+ ASSERT_EQ(wals.GetWals().size(), 5);
+
+ WalNumber expected_log_number = 6;
+ for (auto it : wals.GetWals()) {
+ WalNumber log_number = it.first;
+ ASSERT_EQ(log_number, expected_log_number++);
+ }
+
+ wals.Reset();
+ ASSERT_TRUE(wals.GetWals().empty());
+}
+
+TEST(WalSet, Overwrite) {
+ constexpr WalNumber kNumber = 100;
+ constexpr uint64_t kBytes = 200;
+ WalSet wals;
+ wals.AddWal(WalAddition(kNumber));
+ ASSERT_FALSE(wals.GetWals().at(kNumber).HasSyncedSize());
+ wals.AddWal(WalAddition(kNumber, WalMetadata(kBytes)));
+ ASSERT_TRUE(wals.GetWals().at(kNumber).HasSyncedSize());
+ ASSERT_EQ(wals.GetWals().at(kNumber).GetSyncedSizeInBytes(), kBytes);
+}
+
+TEST(WalSet, SmallerSyncedSize) {
+ constexpr WalNumber kNumber = 100;
+ constexpr uint64_t kBytes = 100;
+ WalSet wals;
+ ASSERT_OK(wals.AddWal(WalAddition(kNumber, WalMetadata(kBytes))));
+ const auto wals1 = wals.GetWals();
+ Status s = wals.AddWal(WalAddition(kNumber, WalMetadata(0)));
+ const auto wals2 = wals.GetWals();
+ ASSERT_OK(s);
+ ASSERT_EQ(wals1, wals2);
+}
+
+TEST(WalSet, CreateTwice) {
+ constexpr WalNumber kNumber = 100;
+ WalSet wals;
+ ASSERT_OK(wals.AddWal(WalAddition(kNumber)));
+ Status s = wals.AddWal(WalAddition(kNumber));
+ ASSERT_TRUE(s.IsCorruption());
+ ASSERT_TRUE(s.ToString().find("WAL 100 is created more than once") !=
+ std::string::npos);
+}
+
+TEST(WalSet, DeleteAllWals) {
+ constexpr WalNumber kMaxWalNumber = 10;
+ WalSet wals;
+ for (WalNumber i = 1; i <= kMaxWalNumber; i++) {
+ wals.AddWal(WalAddition(i));
+ }
+ ASSERT_OK(wals.DeleteWalsBefore(kMaxWalNumber + 1));
+}
+
+TEST(WalSet, AddObsoleteWal) {
+ constexpr WalNumber kNumber = 100;
+ WalSet wals;
+ ASSERT_OK(wals.DeleteWalsBefore(kNumber + 1));
+ ASSERT_OK(wals.AddWal(WalAddition(kNumber)));
+ ASSERT_TRUE(wals.GetWals().empty());
+}
+
+TEST(WalSet, MinWalNumberToKeep) {
+ constexpr WalNumber kNumber = 100;
+ WalSet wals;
+ ASSERT_EQ(wals.GetMinWalNumberToKeep(), 0);
+ ASSERT_OK(wals.DeleteWalsBefore(kNumber));
+ ASSERT_EQ(wals.GetMinWalNumberToKeep(), kNumber);
+ ASSERT_OK(wals.DeleteWalsBefore(kNumber - 1));
+ ASSERT_EQ(wals.GetMinWalNumberToKeep(), kNumber);
+ ASSERT_OK(wals.DeleteWalsBefore(kNumber + 1));
+ ASSERT_EQ(wals.GetMinWalNumberToKeep(), kNumber + 1);
+}
+
+class WalSetTest : public DBTestBase {
+ public:
+ WalSetTest() : DBTestBase("WalSetTest", /* env_do_fsync */ true) {}
+
+ void SetUp() override {
+ test_dir_ = test::PerThreadDBPath("wal_set_test");
+ ASSERT_OK(env_->CreateDir(test_dir_));
+ }
+
+ void TearDown() override {
+ EXPECT_OK(DestroyDir(env_, test_dir_));
+ logs_on_disk_.clear();
+ wals_.Reset();
+ }
+
+ void CreateWalOnDisk(WalNumber number, const std::string& fname,
+ uint64_t size_bytes) {
+ std::unique_ptr<WritableFile> f;
+ std::string fpath = Path(fname);
+ ASSERT_OK(env_->NewWritableFile(fpath, &f, EnvOptions()));
+ std::string content(size_bytes, '0');
+ ASSERT_OK(f->Append(content));
+ ASSERT_OK(f->Close());
+
+ logs_on_disk_[number] = fpath;
+ }
+
+ void AddWalToWalSet(WalNumber number, uint64_t size_bytes) {
+ // Create WAL.
+ ASSERT_OK(wals_.AddWal(WalAddition(number)));
+ // Close WAL.
+ WalMetadata wal(size_bytes);
+ ASSERT_OK(wals_.AddWal(WalAddition(number, wal)));
+ }
+
+ Status CheckWals() const { return wals_.CheckWals(env_, logs_on_disk_); }
+
+ private:
+ std::string test_dir_;
+ std::unordered_map<WalNumber, std::string> logs_on_disk_;
+ WalSet wals_;
+
+ std::string Path(const std::string& fname) { return test_dir_ + "/" + fname; }
+};
+
+TEST_F(WalSetTest, CheckEmptyWals) { ASSERT_OK(CheckWals()); }
+
+TEST_F(WalSetTest, CheckWals) {
+ for (int number = 1; number < 10; number++) {
+ uint64_t size = rand() % 100;
+ std::stringstream ss;
+ ss << "log" << number;
+ std::string fname = ss.str();
+ CreateWalOnDisk(number, fname, size);
+ // logs 1 - 5 are obsolete.
+ if (number > 5) {
+ AddWalToWalSet(number, size);
+ }
+ }
+ ASSERT_OK(CheckWals());
+}
+
+TEST_F(WalSetTest, CheckMissingWals) {
+ for (int number = 1; number < 10; number++) {
+ uint64_t size = rand() % 100;
+ AddWalToWalSet(number, size);
+ // logs with even numbers are missing from disk.
+ if (number % 2) {
+ std::stringstream ss;
+ ss << "log" << number;
+ std::string fname = ss.str();
+ CreateWalOnDisk(number, fname, size);
+ }
+ }
+
+ Status s = CheckWals();
+ ASSERT_TRUE(s.IsCorruption()) << s.ToString();
+ // The first log with even number is missing.
+ std::stringstream expected_err;
+ expected_err << "Missing WAL with log number: " << 2;
+ ASSERT_TRUE(s.ToString().find(expected_err.str()) != std::string::npos)
+ << s.ToString();
+}
+
+TEST_F(WalSetTest, CheckWalsWithShrinkedSize) {
+ for (int number = 1; number < 10; number++) {
+ uint64_t size = rand() % 100 + 1;
+ AddWalToWalSet(number, size);
+ // logs with even numbers have a smaller size on disk.
+ std::stringstream ss;
+ ss << "log" << number;
+ std::string fname = ss.str();
+ CreateWalOnDisk(number, fname, (number % 2) ? size : size - 1);
+ }
+
+ Status s = CheckWals();
+ ASSERT_TRUE(s.IsCorruption()) << s.ToString();
+ // The first log with even number has wrong size.
+ std::stringstream expected_err;
+ expected_err << "Size mismatch: WAL (log number: " << 2 << ")";
+ ASSERT_TRUE(s.ToString().find(expected_err.str()) != std::string::npos)
+ << s.ToString();
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/wal_manager.cc b/src/rocksdb/db/wal_manager.cc
new file mode 100644
index 000000000..a6060235f
--- /dev/null
+++ b/src/rocksdb/db/wal_manager.cc
@@ -0,0 +1,529 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/wal_manager.h"
+
+#include <algorithm>
+#include <cinttypes>
+#include <memory>
+#include <vector>
+
+#include "db/log_reader.h"
+#include "db/log_writer.h"
+#include "db/transaction_log_impl.h"
+#include "db/write_batch_internal.h"
+#include "file/file_util.h"
+#include "file/filename.h"
+#include "file/sequence_file_reader.h"
+#include "logging/logging.h"
+#include "port/port.h"
+#include "rocksdb/env.h"
+#include "rocksdb/options.h"
+#include "rocksdb/write_batch.h"
+#include "test_util/sync_point.h"
+#include "util/cast_util.h"
+#include "util/coding.h"
+#include "util/mutexlock.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#ifndef ROCKSDB_LITE
+
+Status WalManager::DeleteFile(const std::string& fname, uint64_t number) {
+ auto s = env_->DeleteFile(wal_dir_ + "/" + fname);
+ if (s.ok()) {
+ MutexLock l(&read_first_record_cache_mutex_);
+ read_first_record_cache_.erase(number);
+ }
+ return s;
+}
+
+Status WalManager::GetSortedWalFiles(VectorLogPtr& files) {
+ // First get sorted files in db dir, then get sorted files from archived
+ // dir, to avoid a race condition where a log file is moved to archived
+ // dir in between.
+ Status s;
+ // list wal files in main db dir.
+ VectorLogPtr logs;
+ s = GetSortedWalsOfType(wal_dir_, logs, kAliveLogFile);
+ if (!s.ok()) {
+ return s;
+ }
+
+ // Reproduce the race condition where a log file is moved
+ // to archived dir, between these two sync points, used in
+ // (DBTest,TransactionLogIteratorRace)
+ TEST_SYNC_POINT("WalManager::GetSortedWalFiles:1");
+ TEST_SYNC_POINT("WalManager::GetSortedWalFiles:2");
+
+ files.clear();
+ // list wal files in archive dir.
+ std::string archivedir = ArchivalDirectory(wal_dir_);
+ Status exists = env_->FileExists(archivedir);
+ if (exists.ok()) {
+ s = GetSortedWalsOfType(archivedir, files, kArchivedLogFile);
+ if (!s.ok()) {
+ return s;
+ }
+ } else if (!exists.IsNotFound()) {
+ assert(s.IsIOError());
+ return s;
+ }
+
+ uint64_t latest_archived_log_number = 0;
+ if (!files.empty()) {
+ latest_archived_log_number = files.back()->LogNumber();
+ ROCKS_LOG_INFO(db_options_.info_log, "Latest Archived log: %" PRIu64,
+ latest_archived_log_number);
+ }
+
+ files.reserve(files.size() + logs.size());
+ for (auto& log : logs) {
+ if (log->LogNumber() > latest_archived_log_number) {
+ files.push_back(std::move(log));
+ } else {
+ // When the race condition happens, we could see the
+ // same log in both db dir and archived dir. Simply
+ // ignore the one in db dir. Note that, if we read
+ // archived dir first, we would have missed the log file.
+ ROCKS_LOG_WARN(db_options_.info_log, "%s already moved to archive",
+ log->PathName().c_str());
+ }
+ }
+
+ return s;
+}
+
+Status WalManager::GetUpdatesSince(
+ SequenceNumber seq, std::unique_ptr<TransactionLogIterator>* iter,
+ const TransactionLogIterator::ReadOptions& read_options,
+ VersionSet* version_set) {
+ if (seq_per_batch_) {
+ return Status::NotSupported();
+ }
+
+ assert(!seq_per_batch_);
+
+ // Get all sorted Wal Files.
+ // Do binary search and open files and find the seq number.
+
+ std::unique_ptr<VectorLogPtr> wal_files(new VectorLogPtr);
+ Status s = GetSortedWalFiles(*wal_files);
+ if (!s.ok()) {
+ return s;
+ }
+
+ s = RetainProbableWalFiles(*wal_files, seq);
+ if (!s.ok()) {
+ return s;
+ }
+ iter->reset(new TransactionLogIteratorImpl(
+ wal_dir_, &db_options_, read_options, file_options_, seq,
+ std::move(wal_files), version_set, seq_per_batch_, io_tracer_));
+ return (*iter)->status();
+}
+
+// 1. Go through all archived files and
+// a. if ttl is enabled, delete outdated files
+// b. if archive size limit is enabled, delete empty files,
+// compute file number and size.
+// 2. If size limit is enabled:
+// a. compute how many files should be deleted
+// b. get sorted non-empty archived logs
+// c. delete what should be deleted
+void WalManager::PurgeObsoleteWALFiles() {
+ bool const ttl_enabled = db_options_.WAL_ttl_seconds > 0;
+ bool const size_limit_enabled = db_options_.WAL_size_limit_MB > 0;
+ if (!ttl_enabled && !size_limit_enabled) {
+ return;
+ }
+
+ int64_t current_time = 0;
+ Status s = db_options_.clock->GetCurrentTime(&current_time);
+ if (!s.ok()) {
+ ROCKS_LOG_ERROR(db_options_.info_log, "Can't get current time: %s",
+ s.ToString().c_str());
+ assert(false);
+ return;
+ }
+ uint64_t const now_seconds = static_cast<uint64_t>(current_time);
+ uint64_t const time_to_check = (ttl_enabled && !size_limit_enabled)
+ ? db_options_.WAL_ttl_seconds / 2
+ : kDefaultIntervalToDeleteObsoleteWAL;
+
+ if (purge_wal_files_last_run_ + time_to_check > now_seconds) {
+ return;
+ }
+
+ purge_wal_files_last_run_ = now_seconds;
+
+ std::string archival_dir = ArchivalDirectory(wal_dir_);
+ std::vector<std::string> files;
+ s = env_->GetChildren(archival_dir, &files);
+ if (!s.ok()) {
+ ROCKS_LOG_ERROR(db_options_.info_log, "Can't get archive files: %s",
+ s.ToString().c_str());
+ assert(false);
+ return;
+ }
+
+ size_t log_files_num = 0;
+ uint64_t log_file_size = 0;
+ for (auto& f : files) {
+ uint64_t number;
+ FileType type;
+ if (ParseFileName(f, &number, &type) && type == kWalFile) {
+ std::string const file_path = archival_dir + "/" + f;
+ if (ttl_enabled) {
+ uint64_t file_m_time;
+ s = env_->GetFileModificationTime(file_path, &file_m_time);
+ if (!s.ok()) {
+ ROCKS_LOG_WARN(db_options_.info_log,
+ "Can't get file mod time: %s: %s", file_path.c_str(),
+ s.ToString().c_str());
+ continue;
+ }
+ if (now_seconds - file_m_time > db_options_.WAL_ttl_seconds) {
+ s = DeleteDBFile(&db_options_, file_path, archival_dir, false,
+ /*force_fg=*/!wal_in_db_path_);
+ if (!s.ok()) {
+ ROCKS_LOG_WARN(db_options_.info_log, "Can't delete file: %s: %s",
+ file_path.c_str(), s.ToString().c_str());
+ continue;
+ } else {
+ MutexLock l(&read_first_record_cache_mutex_);
+ read_first_record_cache_.erase(number);
+ }
+ continue;
+ }
+ }
+
+ if (size_limit_enabled) {
+ uint64_t file_size;
+ s = env_->GetFileSize(file_path, &file_size);
+ if (!s.ok()) {
+ ROCKS_LOG_ERROR(db_options_.info_log,
+ "Unable to get file size: %s: %s", file_path.c_str(),
+ s.ToString().c_str());
+ return;
+ } else {
+ if (file_size > 0) {
+ log_file_size = std::max(log_file_size, file_size);
+ ++log_files_num;
+ } else {
+ s = DeleteDBFile(&db_options_, file_path, archival_dir, false,
+ /*force_fg=*/!wal_in_db_path_);
+ if (!s.ok()) {
+ ROCKS_LOG_WARN(db_options_.info_log,
+ "Unable to delete file: %s: %s", file_path.c_str(),
+ s.ToString().c_str());
+ continue;
+ } else {
+ MutexLock l(&read_first_record_cache_mutex_);
+ read_first_record_cache_.erase(number);
+ }
+ }
+ }
+ }
+ }
+ }
+
+ if (0 == log_files_num || !size_limit_enabled) {
+ return;
+ }
+
+ size_t const files_keep_num = static_cast<size_t>(
+ db_options_.WAL_size_limit_MB * 1024 * 1024 / log_file_size);
+ if (log_files_num <= files_keep_num) {
+ return;
+ }
+
+ size_t files_del_num = log_files_num - files_keep_num;
+ VectorLogPtr archived_logs;
+ s = GetSortedWalsOfType(archival_dir, archived_logs, kArchivedLogFile);
+ if (!s.ok()) {
+ ROCKS_LOG_WARN(db_options_.info_log,
+ "Unable to get archived WALs from: %s: %s",
+ archival_dir.c_str(), s.ToString().c_str());
+ files_del_num = 0;
+ } else if (files_del_num > archived_logs.size()) {
+ ROCKS_LOG_WARN(db_options_.info_log,
+ "Trying to delete more archived log files than "
+ "exist. Deleting all");
+ files_del_num = archived_logs.size();
+ }
+
+ for (size_t i = 0; i < files_del_num; ++i) {
+ std::string const file_path = archived_logs[i]->PathName();
+ s = DeleteDBFile(&db_options_, wal_dir_ + "/" + file_path, wal_dir_, false,
+ /*force_fg=*/!wal_in_db_path_);
+ if (!s.ok()) {
+ ROCKS_LOG_WARN(db_options_.info_log, "Unable to delete file: %s: %s",
+ file_path.c_str(), s.ToString().c_str());
+ continue;
+ } else {
+ MutexLock l(&read_first_record_cache_mutex_);
+ read_first_record_cache_.erase(archived_logs[i]->LogNumber());
+ }
+ }
+}
+
+void WalManager::ArchiveWALFile(const std::string& fname, uint64_t number) {
+ auto archived_log_name = ArchivedLogFileName(wal_dir_, number);
+ // The sync point below is used in (DBTest,TransactionLogIteratorRace)
+ TEST_SYNC_POINT("WalManager::PurgeObsoleteFiles:1");
+ Status s = env_->RenameFile(fname, archived_log_name);
+ // The sync point below is used in (DBTest,TransactionLogIteratorRace)
+ TEST_SYNC_POINT("WalManager::PurgeObsoleteFiles:2");
+ ROCKS_LOG_INFO(db_options_.info_log, "Move log file %s to %s -- %s\n",
+ fname.c_str(), archived_log_name.c_str(),
+ s.ToString().c_str());
+}
+
+Status WalManager::GetSortedWalsOfType(const std::string& path,
+ VectorLogPtr& log_files,
+ WalFileType log_type) {
+ std::vector<std::string> all_files;
+ const Status status = env_->GetChildren(path, &all_files);
+ if (!status.ok()) {
+ return status;
+ }
+ log_files.reserve(all_files.size());
+ for (const auto& f : all_files) {
+ uint64_t number;
+ FileType type;
+ if (ParseFileName(f, &number, &type) && type == kWalFile) {
+ SequenceNumber sequence;
+ Status s = ReadFirstRecord(log_type, number, &sequence);
+ if (!s.ok()) {
+ return s;
+ }
+ if (sequence == 0) {
+ // empty file
+ continue;
+ }
+
+ // Reproduce the race condition where a log file is moved
+ // to archived dir, between these two sync points, used in
+ // (DBTest,TransactionLogIteratorRace)
+ TEST_SYNC_POINT("WalManager::GetSortedWalsOfType:1");
+ TEST_SYNC_POINT("WalManager::GetSortedWalsOfType:2");
+
+ uint64_t size_bytes;
+ s = env_->GetFileSize(LogFileName(path, number), &size_bytes);
+ // re-try in case the alive log file has been moved to archive.
+ if (!s.ok() && log_type == kAliveLogFile) {
+ std::string archived_file = ArchivedLogFileName(path, number);
+ if (env_->FileExists(archived_file).ok()) {
+ s = env_->GetFileSize(archived_file, &size_bytes);
+ if (!s.ok() && env_->FileExists(archived_file).IsNotFound()) {
+ // oops, the file just got deleted from archived dir! move on
+ s = Status::OK();
+ continue;
+ }
+ }
+ }
+ if (!s.ok()) {
+ return s;
+ }
+
+ log_files.push_back(std::unique_ptr<LogFile>(
+ new LogFileImpl(number, log_type, sequence, size_bytes)));
+ }
+ }
+ std::sort(
+ log_files.begin(), log_files.end(),
+ [](const std::unique_ptr<LogFile>& a, const std::unique_ptr<LogFile>& b) {
+ LogFileImpl* a_impl = static_cast_with_check<LogFileImpl>(a.get());
+ LogFileImpl* b_impl = static_cast_with_check<LogFileImpl>(b.get());
+ return *a_impl < *b_impl;
+ });
+ return status;
+}
+
+Status WalManager::RetainProbableWalFiles(VectorLogPtr& all_logs,
+ const SequenceNumber target) {
+ int64_t start = 0; // signed to avoid overflow when target is < first file.
+ int64_t end = static_cast<int64_t>(all_logs.size()) - 1;
+ // Binary search to avoid opening all files.
+ while (end >= start) {
+ int64_t mid = start + (end - start) / 2; // Avoid overflow.
+ SequenceNumber current_seq_num =
+ all_logs.at(static_cast<size_t>(mid))->StartSequence();
+ if (current_seq_num == target) {
+ end = mid;
+ break;
+ } else if (current_seq_num < target) {
+ start = mid + 1;
+ } else {
+ end = mid - 1;
+ }
+ }
+ // end could be negative.
+ size_t start_index =
+ static_cast<size_t>(std::max(static_cast<int64_t>(0), end));
+ // The last WAL file is always included.
+ all_logs.erase(all_logs.begin(), all_logs.begin() + start_index);
+ return Status::OK();
+}
+
+Status WalManager::ReadFirstRecord(const WalFileType type,
+ const uint64_t number,
+ SequenceNumber* sequence) {
+ *sequence = 0;
+ if (type != kAliveLogFile && type != kArchivedLogFile) {
+ ROCKS_LOG_ERROR(db_options_.info_log, "[WalManger] Unknown file type %s",
+ std::to_string(type).c_str());
+ return Status::NotSupported("File Type Not Known " + std::to_string(type));
+ }
+ {
+ MutexLock l(&read_first_record_cache_mutex_);
+ auto itr = read_first_record_cache_.find(number);
+ if (itr != read_first_record_cache_.end()) {
+ *sequence = itr->second;
+ return Status::OK();
+ }
+ }
+ Status s;
+ if (type == kAliveLogFile) {
+ std::string fname = LogFileName(wal_dir_, number);
+ s = ReadFirstLine(fname, number, sequence);
+ if (!s.ok() && env_->FileExists(fname).ok()) {
+ // return any error that is not caused by non-existing file
+ return s;
+ }
+ }
+
+ if (type == kArchivedLogFile || !s.ok()) {
+ // check if the file got moved to archive.
+ std::string archived_file = ArchivedLogFileName(wal_dir_, number);
+ s = ReadFirstLine(archived_file, number, sequence);
+ // Maybe the file was deleted from the archive dir. If that's the case,
+ // return Status::OK(). The caller will identify this as an empty file
+ // because *sequence == 0.
+ if (!s.ok() && env_->FileExists(archived_file).IsNotFound()) {
+ return Status::OK();
+ }
+ }
+
+ if (s.ok() && *sequence != 0) {
+ MutexLock l(&read_first_record_cache_mutex_);
+ read_first_record_cache_.insert({number, *sequence});
+ }
+ return s;
+}
+
+Status WalManager::GetLiveWalFile(uint64_t number,
+ std::unique_ptr<LogFile>* log_file) {
+ if (!log_file) {
+ return Status::InvalidArgument("log_file not preallocated.");
+ }
+
+ if (!number) {
+ return Status::PathNotFound("log file not available");
+ }
+
+ Status s;
+
+ uint64_t size_bytes;
+ s = env_->GetFileSize(LogFileName(wal_dir_, number), &size_bytes);
+
+ if (!s.ok()) {
+ return s;
+ }
+
+ log_file->reset(new LogFileImpl(number, kAliveLogFile,
+ 0, // SequenceNumber
+ size_bytes));
+
+ return Status::OK();
+}
+
+// The function returns status.ok() and sequence == 0 if the file exists but is
+// empty.
+Status WalManager::ReadFirstLine(const std::string& fname,
+ const uint64_t number,
+ SequenceNumber* sequence) {
+ struct LogReporter : public log::Reader::Reporter {
+ Env* env;
+ Logger* info_log;
+ const char* fname;
+
+ Status* status;
+ bool ignore_error; // true if db_options_.paranoid_checks==false
+ void Corruption(size_t bytes, const Status& s) override {
+ ROCKS_LOG_WARN(info_log, "[WalManager] %s%s: dropping %d bytes; %s",
+ (this->ignore_error ? "(ignoring error) " : ""), fname,
+ static_cast<int>(bytes), s.ToString().c_str());
+ if (this->status->ok()) {
+ // only keep the first error
+ *this->status = s;
+ }
+ }
+ };
+
+ std::unique_ptr<FSSequentialFile> file;
+ Status status = fs_->NewSequentialFile(
+ fname, fs_->OptimizeForLogRead(file_options_), &file, nullptr);
+ std::unique_ptr<SequentialFileReader> file_reader(
+ new SequentialFileReader(std::move(file), fname, io_tracer_));
+
+ if (!status.ok()) {
+ return status;
+ }
+
+ LogReporter reporter;
+ reporter.env = env_;
+ reporter.info_log = db_options_.info_log.get();
+ reporter.fname = fname.c_str();
+ reporter.status = &status;
+ reporter.ignore_error = !db_options_.paranoid_checks;
+ log::Reader reader(db_options_.info_log, std::move(file_reader), &reporter,
+ true /*checksum*/, number);
+ std::string scratch;
+ Slice record;
+
+ if (reader.ReadRecord(&record, &scratch) &&
+ (status.ok() || !db_options_.paranoid_checks)) {
+ if (record.size() < WriteBatchInternal::kHeader) {
+ reporter.Corruption(record.size(),
+ Status::Corruption("log record too small"));
+ // TODO: read records until the first non-corrupt entry?
+ } else {
+ WriteBatch batch;
+ // We can overwrite an existing non-OK Status since it'd only reach here
+ // with `paranoid_checks == false`.
+ status = WriteBatchInternal::SetContents(&batch, record);
+ if (status.ok()) {
+ *sequence = WriteBatchInternal::Sequence(&batch);
+ return status;
+ }
+ }
+ }
+
+ if (status.ok() && reader.IsCompressedAndEmptyFile()) {
+ // In case of wal_compression, the WAL contains a `kSetCompressionType`
+ // record which is not associated with any sequence number. As a result, for
+ // an otherwise empty file, GetSortedWalsOfType() would skip the WAL, causing
+ // the operations to fail.
+ // Therefore, to avoid that failure, set the sequence number to 1, indicating
+ // that the WAL should be included.
+ *sequence = 1;
+ } else {
+ // ReadRecord might have returned false on EOF, which means that the log
+ // file is empty. Or, a failure may have occurred while processing the first
+ // entry. In any case, return status and set sequence number to 0.
+ *sequence = 0;
+ }
+ return status;
+}
+
+#endif // ROCKSDB_LITE
+} // namespace ROCKSDB_NAMESPACE
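The retention step in RetainProbableWalFiles() above is essentially a lower-bound binary search over the files' start sequence numbers. A standalone hedged sketch of the same idea over plain data (illustration only, not the WalManager API):

    #include <algorithm>
    #include <cstdint>
    #include <vector>

    // Sketch: index of the last WAL whose start sequence is <= target (0 if
    // every WAL starts after target). `start_seqs` is sorted ascending, one
    // entry per WAL file; files before this index can be dropped by the caller.
    size_t FirstProbableWalIndex(const std::vector<uint64_t>& start_seqs,
                                 uint64_t target) {
      int64_t start = 0;
      int64_t end = static_cast<int64_t>(start_seqs.size()) - 1;
      while (end >= start) {
        int64_t mid = start + (end - start) / 2;  // avoid overflow
        if (start_seqs[static_cast<size_t>(mid)] <= target) {
          start = mid + 1;
        } else {
          end = mid - 1;
        }
      }
      return static_cast<size_t>(std::max<int64_t>(0, end));
    }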
diff --git a/src/rocksdb/db/wal_manager.h b/src/rocksdb/db/wal_manager.h
new file mode 100644
index 000000000..8cc067935
--- /dev/null
+++ b/src/rocksdb/db/wal_manager.h
@@ -0,0 +1,138 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+
+#include <atomic>
+#include <deque>
+#include <limits>
+#include <memory>
+#include <set>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "db/version_set.h"
+#include "file/file_util.h"
+#include "options/db_options.h"
+#include "port/port.h"
+#include "rocksdb/env.h"
+#include "rocksdb/status.h"
+#include "rocksdb/transaction_log.h"
+#include "rocksdb/types.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#ifndef ROCKSDB_LITE
+
+// WAL manager provides the abstraction for reading the WAL files as a single
+// unit. Internally, it opens and reads the files using Reader or Writer
+// abstraction.
+class WalManager {
+ public:
+ WalManager(const ImmutableDBOptions& db_options,
+ const FileOptions& file_options,
+ const std::shared_ptr<IOTracer>& io_tracer,
+ const bool seq_per_batch = false)
+ : db_options_(db_options),
+ file_options_(file_options),
+ env_(db_options.env),
+ fs_(db_options.fs, io_tracer),
+ purge_wal_files_last_run_(0),
+ seq_per_batch_(seq_per_batch),
+ wal_dir_(db_options_.GetWalDir()),
+ wal_in_db_path_(db_options_.IsWalDirSameAsDBPath()),
+ io_tracer_(io_tracer) {}
+
+ Status GetSortedWalFiles(VectorLogPtr& files);
+
+ // Allow user to tail transaction log to find all recent changes to the
+ // database that are newer than `seq_number`.
+ Status GetUpdatesSince(
+ SequenceNumber seq_number, std::unique_ptr<TransactionLogIterator>* iter,
+ const TransactionLogIterator::ReadOptions& read_options,
+ VersionSet* version_set);
+
+ void PurgeObsoleteWALFiles();
+
+ void ArchiveWALFile(const std::string& fname, uint64_t number);
+
+ Status DeleteFile(const std::string& fname, uint64_t number);
+
+ Status GetLiveWalFile(uint64_t number, std::unique_ptr<LogFile>* log_file);
+
+ Status TEST_ReadFirstRecord(const WalFileType type, const uint64_t number,
+ SequenceNumber* sequence) {
+ return ReadFirstRecord(type, number, sequence);
+ }
+
+ Status TEST_ReadFirstLine(const std::string& fname, const uint64_t number,
+ SequenceNumber* sequence) {
+ return ReadFirstLine(fname, number, sequence);
+ }
+
+ private:
+ Status GetSortedWalsOfType(const std::string& path, VectorLogPtr& log_files,
+ WalFileType type);
+ // Requires: all_logs should be sorted with the earliest log file first.
+ // Retains all log files in all_logs which contain updates with sequence
+ // numbers greater than or equal to the requested SequenceNumber.
+ Status RetainProbableWalFiles(VectorLogPtr& all_logs,
+ const SequenceNumber target);
+
+ // ReadFirstRecord checks the read_first_record_cache_ to see if the entry
+ // exists or not. If not, it will read the WAL file.
+ // In case of wal_compression, the WAL contains a `kSetCompressionType`
+ // record which is not associated with any sequence number. If the WAL
+ // doesn't include any other record (i.e. it is effectively empty), the
+ // sequence number is set to 1 so that the WAL is included, and that value is
+ // inserted into read_first_record_cache_. In that case the sequence number
+ // acts only as a boolean indicating whether the WAL should be included and
+ // shouldn't be used for any other purpose.
+ Status ReadFirstRecord(const WalFileType type, const uint64_t number,
+ SequenceNumber* sequence);
+
+ // Without wal_compression, ReadFirstLine returns status.ok() and
+ // sequence == 0 if the file exists but is empty.
+ // With wal_compression, the WAL contains a `kSetCompressionType` record that
+ // is not associated with any sequence number if the WAL doesn't include any
+ // other record (i.e. it is effectively empty). As a result, for such an
+ // empty file, GetSortedWalsOfType() would skip the WAL, causing the
+ // operations to fail. To avoid that, ReadFirstLine sets the sequence number
+ // to 1 in order to include that WAL.
+ Status ReadFirstLine(const std::string& fname, const uint64_t number,
+ SequenceNumber* sequence);
+
+ // ------- state from DBImpl ------
+ const ImmutableDBOptions& db_options_;
+ const FileOptions file_options_;
+ Env* env_;
+ const FileSystemPtr fs_;
+
+ // ------- WalManager state -------
+ // cache for ReadFirstRecord() calls
+ std::unordered_map<uint64_t, SequenceNumber> read_first_record_cache_;
+ port::Mutex read_first_record_cache_mutex_;
+
+ // last time when PurgeObsoleteWALFiles ran.
+ uint64_t purge_wal_files_last_run_;
+
+ bool seq_per_batch_;
+
+ const std::string& wal_dir_;
+
+ bool wal_in_db_path_;
+
+ // Obsolete files will be checked for deletion at this interval (in seconds)
+ // if TTL-based deletion is enabled and the archive size limit is disabled.
+ static constexpr uint64_t kDefaultIntervalToDeleteObsoleteWAL = 600;
+
+ std::shared_ptr<IOTracer> io_tracer_;
+};
+
+#endif // ROCKSDB_LITE
+} // namespace ROCKSDB_NAMESPACE
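For orientation, a hedged application-level sketch of the options that feed PurgeObsoleteWALFiles() and of tailing updates through the public DB API; the path and numbers are placeholders, and rocksdb/db.h, rocksdb/options.h and rocksdb/transaction_log.h are assumed:

    // Sketch: keep archived WALs bounded by TTL/size and tail recent updates.
    void TailWalSketch() {
      rocksdb::Options options;
      options.create_if_missing = true;
      options.WAL_ttl_seconds = 3600;   // archived WALs kept for about an hour
      options.WAL_size_limit_MB = 64;   // or until the archive exceeds ~64 MB
      rocksdb::DB* db = nullptr;
      rocksdb::Status s = rocksdb::DB::Open(options, "/path/to/db", &db);
      if (!s.ok()) {
        return;
      }
      std::unique_ptr<rocksdb::TransactionLogIterator> iter;
      s = db->GetUpdatesSince(/*seq_number=*/1, &iter);
      for (; s.ok() && iter && iter->Valid(); iter->Next()) {
        rocksdb::BatchResult batch = iter->GetBatch();
        // batch.sequence and batch.writeBatchPtr carry the recovered updates.
      }
      delete db;
    }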
diff --git a/src/rocksdb/db/wal_manager_test.cc b/src/rocksdb/db/wal_manager_test.cc
new file mode 100644
index 000000000..4ad4e9749
--- /dev/null
+++ b/src/rocksdb/db/wal_manager_test.cc
@@ -0,0 +1,346 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "db/wal_manager.h"
+
+#include <map>
+#include <string>
+
+#include "db/column_family.h"
+#include "db/db_impl/db_impl.h"
+#include "db/log_writer.h"
+#include "db/version_set.h"
+#include "env/mock_env.h"
+#include "file/writable_file_writer.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/write_batch.h"
+#include "rocksdb/write_buffer_manager.h"
+#include "table/mock_table.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// TODO(icanadi) mock out VersionSet
+// TODO(icanadi) move other WalManager-specific tests from db_test here
+class WalManagerTest : public testing::Test {
+ public:
+ WalManagerTest()
+ : dbname_(test::PerThreadDBPath("wal_manager_test")),
+ db_options_(),
+ table_cache_(NewLRUCache(50000, 16)),
+ write_buffer_manager_(db_options_.db_write_buffer_size),
+ current_log_number_(0) {
+ env_.reset(MockEnv::Create(Env::Default()));
+ EXPECT_OK(DestroyDB(dbname_, Options()));
+ }
+
+ void Init() {
+ ASSERT_OK(env_->CreateDirIfMissing(dbname_));
+ ASSERT_OK(env_->CreateDirIfMissing(ArchivalDirectory(dbname_)));
+ db_options_.db_paths.emplace_back(dbname_,
+ std::numeric_limits<uint64_t>::max());
+ db_options_.wal_dir = dbname_;
+ db_options_.env = env_.get();
+ db_options_.fs = env_->GetFileSystem();
+ db_options_.clock = env_->GetSystemClock().get();
+
+ versions_.reset(
+ new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(),
+ &write_buffer_manager_, &write_controller_,
+ /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
+ /*db_id*/ "", /*db_session_id*/ ""));
+
+ wal_manager_.reset(
+ new WalManager(db_options_, env_options_, nullptr /*IOTracer*/));
+ }
+
+ void Reopen() {
+ wal_manager_.reset(
+ new WalManager(db_options_, env_options_, nullptr /*IOTracer*/));
+ }
+
+ // NOT thread safe
+ void Put(const std::string& key, const std::string& value) {
+ assert(current_log_writer_.get() != nullptr);
+ uint64_t seq = versions_->LastSequence() + 1;
+ WriteBatch batch;
+ ASSERT_OK(batch.Put(key, value));
+ WriteBatchInternal::SetSequence(&batch, seq);
+ ASSERT_OK(
+ current_log_writer_->AddRecord(WriteBatchInternal::Contents(&batch)));
+ versions_->SetLastAllocatedSequence(seq);
+ versions_->SetLastPublishedSequence(seq);
+ versions_->SetLastSequence(seq);
+ }
+
+ // NOT thread safe
+ void RollTheLog(bool /*archived*/) {
+ current_log_number_++;
+ std::string fname = ArchivedLogFileName(dbname_, current_log_number_);
+ const auto& fs = env_->GetFileSystem();
+ std::unique_ptr<WritableFileWriter> file_writer;
+ ASSERT_OK(WritableFileWriter::Create(fs, fname, env_options_, &file_writer,
+ nullptr));
+ current_log_writer_.reset(
+ new log::Writer(std::move(file_writer), 0, false));
+ }
+
+ void CreateArchiveLogs(int num_logs, int entries_per_log) {
+ for (int i = 1; i <= num_logs; ++i) {
+ RollTheLog(true);
+ for (int k = 0; k < entries_per_log; ++k) {
+ Put(std::to_string(k), std::string(1024, 'a'));
+ }
+ }
+ }
+
+ std::unique_ptr<TransactionLogIterator> OpenTransactionLogIter(
+ const SequenceNumber seq) {
+ std::unique_ptr<TransactionLogIterator> iter;
+ Status status = wal_manager_->GetUpdatesSince(
+ seq, &iter, TransactionLogIterator::ReadOptions(), versions_.get());
+ EXPECT_OK(status);
+ return iter;
+ }
+
+ std::unique_ptr<MockEnv> env_;
+ std::string dbname_;
+ ImmutableDBOptions db_options_;
+ WriteController write_controller_;
+ EnvOptions env_options_;
+ std::shared_ptr<Cache> table_cache_;
+ WriteBufferManager write_buffer_manager_;
+ std::unique_ptr<VersionSet> versions_;
+ std::unique_ptr<WalManager> wal_manager_;
+
+ std::unique_ptr<log::Writer> current_log_writer_;
+ uint64_t current_log_number_;
+};
+
+TEST_F(WalManagerTest, ReadFirstRecordCache) {
+ Init();
+ std::string path = dbname_ + "/000001.log";
+ std::unique_ptr<FSWritableFile> file;
+ ASSERT_OK(env_->GetFileSystem()->NewWritableFile(path, FileOptions(), &file,
+ nullptr));
+
+ SequenceNumber s;
+ ASSERT_OK(wal_manager_->TEST_ReadFirstLine(path, 1 /* number */, &s));
+ ASSERT_EQ(s, 0U);
+
+ ASSERT_OK(
+ wal_manager_->TEST_ReadFirstRecord(kAliveLogFile, 1 /* number */, &s));
+ ASSERT_EQ(s, 0U);
+
+ std::unique_ptr<WritableFileWriter> file_writer(
+ new WritableFileWriter(std::move(file), path, FileOptions()));
+ log::Writer writer(std::move(file_writer), 1,
+ db_options_.recycle_log_file_num > 0);
+ WriteBatch batch;
+ ASSERT_OK(batch.Put("foo", "bar"));
+ WriteBatchInternal::SetSequence(&batch, 10);
+ ASSERT_OK(writer.AddRecord(WriteBatchInternal::Contents(&batch)));
+
+ // TODO(icanadi) move SpecialEnv outside of db_test, so we can reuse it here.
+ // Waiting for lei to finish with db_test
+ // env_->count_sequential_reads_ = true;
+ // sequential_read_counter_ sanity test
+ // ASSERT_EQ(env_->sequential_read_counter_.Read(), 0);
+
+ ASSERT_OK(wal_manager_->TEST_ReadFirstRecord(kAliveLogFile, 1, &s));
+ ASSERT_EQ(s, 10U);
+ // did a read
+ // TODO(icanadi) move SpecialEnv outside of db_test, so we can reuse it here
+ // ASSERT_EQ(env_->sequential_read_counter_.Read(), 1);
+
+ ASSERT_OK(wal_manager_->TEST_ReadFirstRecord(kAliveLogFile, 1, &s));
+ ASSERT_EQ(s, 10U);
+ // no new reads since the value is cached
+ // TODO(icanadi) move SpecialEnv outside of db_test, so we can reuse it here
+ // ASSERT_EQ(env_->sequential_read_counter_.Read(), 1);
+}
+
+namespace {
+uint64_t GetLogDirSize(std::string dir_path, Env* env) {
+ uint64_t dir_size = 0;
+ std::vector<std::string> files;
+ EXPECT_OK(env->GetChildren(dir_path, &files));
+ for (auto& f : files) {
+ uint64_t number;
+ FileType type;
+ if (ParseFileName(f, &number, &type) && type == kWalFile) {
+ std::string const file_path = dir_path + "/" + f;
+ uint64_t file_size;
+ EXPECT_OK(env->GetFileSize(file_path, &file_size));
+ dir_size += file_size;
+ }
+ }
+ return dir_size;
+}
+std::vector<std::uint64_t> ListSpecificFiles(
+ Env* env, const std::string& path, const FileType expected_file_type) {
+ std::vector<std::string> files;
+ std::vector<uint64_t> file_numbers;
+ uint64_t number;
+ FileType type;
+ EXPECT_OK(env->GetChildren(path, &files));
+ for (size_t i = 0; i < files.size(); ++i) {
+ if (ParseFileName(files[i], &number, &type)) {
+ if (type == expected_file_type) {
+ file_numbers.push_back(number);
+ }
+ }
+ }
+ return file_numbers;
+}
+
+int CountRecords(TransactionLogIterator* iter) {
+ int count = 0;
+ SequenceNumber lastSequence = 0;
+ BatchResult res;
+ while (iter->Valid()) {
+ res = iter->GetBatch();
+ EXPECT_TRUE(res.sequence > lastSequence);
+ ++count;
+ lastSequence = res.sequence;
+ EXPECT_OK(iter->status());
+ iter->Next();
+ }
+ EXPECT_OK(iter->status());
+ return count;
+}
+} // anonymous namespace
+
+TEST_F(WalManagerTest, WALArchivalSizeLimit) {
+ db_options_.WAL_ttl_seconds = 0;
+ db_options_.WAL_size_limit_MB = 1000;
+ Init();
+
+ // TEST: Create a WalManager with a huge size limit and no TTL.
+ // Create some archived files and call PurgeObsoleteWALFiles().
+ // Count the archived log files that survived.
+ // Assert that all of them did.
+ // Change the size limit and re-open the WalManager.
+ // Assert that the archive is not larger than WAL_size_limit_MB after
+ // PurgeObsoleteWALFiles().
+ // Set the TTL and time_to_check_ to small values. Re-open the DB.
+ // Assert that there are no archived logs left.
+
+ std::string archive_dir = ArchivalDirectory(dbname_);
+ CreateArchiveLogs(20, 5000);
+
+ std::vector<std::uint64_t> log_files =
+ ListSpecificFiles(env_.get(), archive_dir, kWalFile);
+ ASSERT_EQ(log_files.size(), 20U);
+
+ db_options_.WAL_size_limit_MB = 8;
+ Reopen();
+ wal_manager_->PurgeObsoleteWALFiles();
+
+ uint64_t archive_size = GetLogDirSize(archive_dir, env_.get());
+ ASSERT_TRUE(archive_size <= db_options_.WAL_size_limit_MB * 1024 * 1024);
+
+ db_options_.WAL_ttl_seconds = 1;
+ env_->SleepForMicroseconds(2 * 1000 * 1000);
+ Reopen();
+ wal_manager_->PurgeObsoleteWALFiles();
+
+ log_files = ListSpecificFiles(env_.get(), archive_dir, kWalFile);
+ ASSERT_TRUE(log_files.empty());
+}
+
+TEST_F(WalManagerTest, WALArchivalTtl) {
+ db_options_.WAL_ttl_seconds = 1000;
+ Init();
+
+ // TEST: Create a WalManager with a TTL and no size limit.
+ // Create some archived log files and call PurgeObsoleteWALFiles().
+ // Assert that the files are not deleted.
+ // Re-open the DB with a small TTL.
+ // Assert that all archived logs were removed.
+
+ std::string archive_dir = ArchivalDirectory(dbname_);
+ CreateArchiveLogs(20, 5000);
+
+ std::vector<uint64_t> log_files =
+ ListSpecificFiles(env_.get(), archive_dir, kWalFile);
+ ASSERT_GT(log_files.size(), 0U);
+
+ db_options_.WAL_ttl_seconds = 1;
+ env_->SleepForMicroseconds(3 * 1000 * 1000);
+ Reopen();
+ wal_manager_->PurgeObsoleteWALFiles();
+
+ log_files = ListSpecificFiles(env_.get(), archive_dir, kWalFile);
+ ASSERT_TRUE(log_files.empty());
+}
+
+TEST_F(WalManagerTest, TransactionLogIteratorMoveOverZeroFiles) {
+ Init();
+ RollTheLog(false);
+ Put("key1", std::string(1024, 'a'));
+ // Create a zero record WAL file.
+ RollTheLog(false);
+ RollTheLog(false);
+
+ Put("key2", std::string(1024, 'a'));
+
+ auto iter = OpenTransactionLogIter(0);
+ ASSERT_EQ(2, CountRecords(iter.get()));
+}
+
+TEST_F(WalManagerTest, TransactionLogIteratorJustEmptyFile) {
+ Init();
+ RollTheLog(false);
+ auto iter = OpenTransactionLogIter(0);
+ // Check that an empty iterator is returned
+ ASSERT_TRUE(!iter->Valid());
+}
+
+TEST_F(WalManagerTest, TransactionLogIteratorNewFileWhileScanning) {
+ Init();
+ CreateArchiveLogs(2, 100);
+ auto iter = OpenTransactionLogIter(0);
+ CreateArchiveLogs(1, 100);
+ int i = 0;
+ for (; iter->Valid(); iter->Next()) {
+ i++;
+ }
+ ASSERT_EQ(i, 200);
+ // A new log file was added after the iterator was created.
+ // TryAgain indicates a new iterator is needed to fetch the new data
+ ASSERT_TRUE(iter->status().IsTryAgain());
+
+ iter = OpenTransactionLogIter(0);
+ i = 0;
+ for (; iter->Valid(); iter->Next()) {
+ i++;
+ }
+ ASSERT_EQ(i, 300);
+ ASSERT_TRUE(iter->status().ok());
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+ fprintf(stderr, "SKIPPED as WalManager is not supported in ROCKSDB_LITE\n");
+ return 0;
+}
+
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/wide/db_wide_basic_test.cc b/src/rocksdb/db/wide/db_wide_basic_test.cc
new file mode 100644
index 000000000..1ffe314fe
--- /dev/null
+++ b/src/rocksdb/db/wide/db_wide_basic_test.cc
@@ -0,0 +1,654 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include <array>
+#include <memory>
+
+#include "db/db_test_util.h"
+#include "port/stack_trace.h"
+#include "test_util/testutil.h"
+#include "utilities/merge_operators.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBWideBasicTest : public DBTestBase {
+ protected:
+ explicit DBWideBasicTest()
+ : DBTestBase("db_wide_basic_test", /* env_do_fsync */ false) {}
+};
+
+TEST_F(DBWideBasicTest, PutEntity) {
+ Options options = GetDefaultOptions();
+
+ // Write a couple of wide-column entities and a plain old key-value, then read
+ // them back.
+ constexpr char first_key[] = "first";
+ constexpr char first_value_of_default_column[] = "hello";
+ WideColumns first_columns{
+ {kDefaultWideColumnName, first_value_of_default_column},
+ {"attr_name1", "foo"},
+ {"attr_name2", "bar"}};
+
+ constexpr char second_key[] = "second";
+ WideColumns second_columns{{"attr_one", "two"}, {"attr_three", "four"}};
+
+ constexpr char third_key[] = "third";
+ constexpr char third_value[] = "baz";
+
+ auto verify = [&]() {
+ const WideColumns expected_third_columns{
+ {kDefaultWideColumnName, third_value}};
+
+ {
+ PinnableSlice result;
+ ASSERT_OK(db_->Get(ReadOptions(), db_->DefaultColumnFamily(), first_key,
+ &result));
+ ASSERT_EQ(result, first_value_of_default_column);
+ }
+
+ {
+ PinnableWideColumns result;
+ ASSERT_OK(db_->GetEntity(ReadOptions(), db_->DefaultColumnFamily(),
+ first_key, &result));
+ ASSERT_EQ(result.columns(), first_columns);
+ }
+
+ {
+ PinnableSlice result;
+ ASSERT_OK(db_->Get(ReadOptions(), db_->DefaultColumnFamily(), second_key,
+ &result));
+ ASSERT_TRUE(result.empty());
+ }
+
+ {
+ PinnableWideColumns result;
+ ASSERT_OK(db_->GetEntity(ReadOptions(), db_->DefaultColumnFamily(),
+ second_key, &result));
+ ASSERT_EQ(result.columns(), second_columns);
+ }
+
+ {
+ PinnableSlice result;
+ ASSERT_OK(db_->Get(ReadOptions(), db_->DefaultColumnFamily(), third_key,
+ &result));
+ ASSERT_EQ(result, third_value);
+ }
+
+ {
+ PinnableWideColumns result;
+ ASSERT_OK(db_->GetEntity(ReadOptions(), db_->DefaultColumnFamily(),
+ third_key, &result));
+
+ ASSERT_EQ(result.columns(), expected_third_columns);
+ }
+
+ {
+ constexpr size_t num_keys = 3;
+
+ std::array<Slice, num_keys> keys{{first_key, second_key, third_key}};
+ std::array<PinnableSlice, num_keys> values;
+ std::array<Status, num_keys> statuses;
+
+ db_->MultiGet(ReadOptions(), db_->DefaultColumnFamily(), num_keys,
+ &keys[0], &values[0], &statuses[0]);
+
+ ASSERT_OK(statuses[0]);
+ ASSERT_EQ(values[0], first_value_of_default_column);
+
+ ASSERT_OK(statuses[1]);
+ ASSERT_TRUE(values[1].empty());
+
+ ASSERT_OK(statuses[2]);
+ ASSERT_EQ(values[2], third_value);
+ }
+
+ {
+ std::unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions()));
+
+ iter->SeekToFirst();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ(iter->key(), first_key);
+ ASSERT_EQ(iter->value(), first_value_of_default_column);
+ ASSERT_EQ(iter->columns(), first_columns);
+
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ(iter->key(), second_key);
+ ASSERT_TRUE(iter->value().empty());
+ ASSERT_EQ(iter->columns(), second_columns);
+
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ(iter->key(), third_key);
+ ASSERT_EQ(iter->value(), third_value);
+ ASSERT_EQ(iter->columns(), expected_third_columns);
+
+ iter->Next();
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_OK(iter->status());
+
+ iter->SeekToLast();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ(iter->key(), third_key);
+ ASSERT_EQ(iter->value(), third_value);
+ ASSERT_EQ(iter->columns(), expected_third_columns);
+
+ iter->Prev();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ(iter->key(), second_key);
+ ASSERT_TRUE(iter->value().empty());
+ ASSERT_EQ(iter->columns(), second_columns);
+
+ iter->Prev();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ(iter->key(), first_key);
+ ASSERT_EQ(iter->value(), first_value_of_default_column);
+ ASSERT_EQ(iter->columns(), first_columns);
+
+ iter->Prev();
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_OK(iter->status());
+ }
+ };
+
+ // Use the DB::PutEntity API to write the first entity
+ ASSERT_OK(db_->PutEntity(WriteOptions(), db_->DefaultColumnFamily(),
+ first_key, first_columns));
+
+ // Use WriteBatch to write the second entity
+ WriteBatch batch;
+ ASSERT_OK(
+ batch.PutEntity(db_->DefaultColumnFamily(), second_key, second_columns));
+ ASSERT_OK(db_->Write(WriteOptions(), &batch));
+
+ // Use Put to write the plain key-value
+ ASSERT_OK(db_->Put(WriteOptions(), db_->DefaultColumnFamily(), third_key,
+ third_value));
+
+ // Try reading from memtable
+ verify();
+
+ // Try reading after recovery
+ Close();
+ options.avoid_flush_during_recovery = true;
+ Reopen(options);
+
+ verify();
+
+ // Try reading from storage
+ ASSERT_OK(Flush());
+
+ verify();
+}
+
+TEST_F(DBWideBasicTest, PutEntityColumnFamily) {
+ Options options = GetDefaultOptions();
+ CreateAndReopenWithCF({"corinthian"}, options);
+
+ // Use the DB::PutEntity API
+ constexpr char first_key[] = "first";
+ WideColumns first_columns{{"attr_name1", "foo"}, {"attr_name2", "bar"}};
+
+ ASSERT_OK(
+ db_->PutEntity(WriteOptions(), handles_[1], first_key, first_columns));
+
+ // Use WriteBatch
+ constexpr char second_key[] = "second";
+ WideColumns second_columns{{"attr_one", "two"}, {"attr_three", "four"}};
+
+ WriteBatch batch;
+ ASSERT_OK(batch.PutEntity(handles_[1], second_key, second_columns));
+ ASSERT_OK(db_->Write(WriteOptions(), &batch));
+}
+
+TEST_F(DBWideBasicTest, MergePlainKeyValue) {
+ Options options = GetDefaultOptions();
+ options.create_if_missing = true;
+ options.merge_operator = MergeOperators::CreateStringAppendOperator();
+ Reopen(options);
+
+ // Put + Merge
+ constexpr char first_key[] = "first";
+ constexpr char first_base_value[] = "hello";
+ constexpr char first_merge_op[] = "world";
+
+ // Delete + Merge
+ constexpr char second_key[] = "second";
+ constexpr char second_merge_op[] = "foo";
+
+ // Merge without any preceding KV
+ constexpr char third_key[] = "third";
+ constexpr char third_merge_op[] = "bar";
+
+ auto write_base = [&]() {
+ // Write "base" KVs: a Put for the 1st key and a Delete for the 2nd one;
+ // note there is no "base" KV for the 3rd
+ ASSERT_OK(db_->Put(WriteOptions(), db_->DefaultColumnFamily(), first_key,
+ first_base_value));
+ ASSERT_OK(
+ db_->Delete(WriteOptions(), db_->DefaultColumnFamily(), second_key));
+ };
+
+ auto write_merge = [&]() {
+ // Write Merge operands
+ ASSERT_OK(db_->Merge(WriteOptions(), db_->DefaultColumnFamily(), first_key,
+ first_merge_op));
+ ASSERT_OK(db_->Merge(WriteOptions(), db_->DefaultColumnFamily(), second_key,
+ second_merge_op));
+ ASSERT_OK(db_->Merge(WriteOptions(), db_->DefaultColumnFamily(), third_key,
+ third_merge_op));
+ };
+
+ const std::string expected_first_column(std::string(first_base_value) + "," +
+ first_merge_op);
+ const WideColumns expected_first_columns{
+ {kDefaultWideColumnName, expected_first_column}};
+ const WideColumns expected_second_columns{
+ {kDefaultWideColumnName, second_merge_op}};
+ const WideColumns expected_third_columns{
+ {kDefaultWideColumnName, third_merge_op}};
+
+ auto verify = [&]() {
+ {
+ PinnableWideColumns result;
+ ASSERT_OK(db_->GetEntity(ReadOptions(), db_->DefaultColumnFamily(),
+ first_key, &result));
+ ASSERT_EQ(result.columns(), expected_first_columns);
+ }
+
+ {
+ PinnableWideColumns result;
+ ASSERT_OK(db_->GetEntity(ReadOptions(), db_->DefaultColumnFamily(),
+ second_key, &result));
+ ASSERT_EQ(result.columns(), expected_second_columns);
+ }
+
+ {
+ PinnableWideColumns result;
+ ASSERT_OK(db_->GetEntity(ReadOptions(), db_->DefaultColumnFamily(),
+ third_key, &result));
+
+ ASSERT_EQ(result.columns(), expected_third_columns);
+ }
+
+ {
+ std::unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions()));
+
+ iter->SeekToFirst();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ(iter->key(), first_key);
+ ASSERT_EQ(iter->value(), expected_first_columns[0].value());
+ ASSERT_EQ(iter->columns(), expected_first_columns);
+
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ(iter->key(), second_key);
+ ASSERT_EQ(iter->value(), expected_second_columns[0].value());
+ ASSERT_EQ(iter->columns(), expected_second_columns);
+
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ(iter->key(), third_key);
+ ASSERT_EQ(iter->value(), expected_third_columns[0].value());
+ ASSERT_EQ(iter->columns(), expected_third_columns);
+
+ iter->Next();
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_OK(iter->status());
+
+ iter->SeekToLast();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ(iter->key(), third_key);
+ ASSERT_EQ(iter->value(), expected_third_columns[0].value());
+ ASSERT_EQ(iter->columns(), expected_third_columns);
+
+ iter->Prev();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ(iter->key(), second_key);
+ ASSERT_EQ(iter->value(), expected_second_columns[0].value());
+ ASSERT_EQ(iter->columns(), expected_second_columns);
+
+ iter->Prev();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ(iter->key(), first_key);
+ ASSERT_EQ(iter->value(), expected_first_columns[0].value());
+ ASSERT_EQ(iter->columns(), expected_first_columns);
+
+ iter->Prev();
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_OK(iter->status());
+ }
+ };
+
+ {
+ // Base KVs (if any) and Merge operands both in memtable (note: we take a
+ // snapshot in between to make sure they do not get reconciled during the
+ // subsequent flush)
+ write_base();
+ ManagedSnapshot snapshot(db_);
+ write_merge();
+ verify();
+
+ // Base KVs (if any) and Merge operands both in storage
+ ASSERT_OK(Flush());
+ verify();
+ }
+
+ // Base KVs (if any) in storage, Merge operands in memtable
+ DestroyAndReopen(options);
+ write_base();
+ ASSERT_OK(Flush());
+ write_merge();
+ verify();
+}
+
+TEST_F(DBWideBasicTest, MergeEntity) {
+ Options options = GetDefaultOptions();
+ options.create_if_missing = true;
+
+ const std::string delim("|");
+ options.merge_operator = MergeOperators::CreateStringAppendOperator(delim);
+
+ Reopen(options);
+
+ // Test Merge with two entities: one that has the default column and one that
+ // doesn't
+ constexpr char first_key[] = "first";
+ WideColumns first_columns{{kDefaultWideColumnName, "a"},
+ {"attr_name1", "foo"},
+ {"attr_name2", "bar"}};
+ constexpr char first_merge_operand[] = "bla1";
+
+ constexpr char second_key[] = "second";
+ WideColumns second_columns{{"attr_one", "two"}, {"attr_three", "four"}};
+ constexpr char second_merge_operand[] = "bla2";
+
+ auto write_base = [&]() {
+ // Use the DB::PutEntity API
+ ASSERT_OK(db_->PutEntity(WriteOptions(), db_->DefaultColumnFamily(),
+ first_key, first_columns));
+
+ // Use WriteBatch
+ WriteBatch batch;
+ ASSERT_OK(batch.PutEntity(db_->DefaultColumnFamily(), second_key,
+ second_columns));
+ ASSERT_OK(db_->Write(WriteOptions(), &batch));
+ };
+
+ auto write_merge = [&]() {
+ ASSERT_OK(db_->Merge(WriteOptions(), db_->DefaultColumnFamily(), first_key,
+ first_merge_operand));
+ ASSERT_OK(db_->Merge(WriteOptions(), db_->DefaultColumnFamily(), second_key,
+ second_merge_operand));
+ };
+
+ const std::string first_expected_default(first_columns[0].value().ToString() +
+ delim + first_merge_operand);
+ const std::string second_expected_default(delim + second_merge_operand);
+
+ auto verify_basic = [&]() {
+ WideColumns first_expected_columns{
+ {kDefaultWideColumnName, first_expected_default},
+ first_columns[1],
+ first_columns[2]};
+
+ WideColumns second_expected_columns{
+ {kDefaultWideColumnName, second_expected_default},
+ second_columns[0],
+ second_columns[1]};
+
+ {
+ PinnableSlice result;
+ ASSERT_OK(db_->Get(ReadOptions(), db_->DefaultColumnFamily(), first_key,
+ &result));
+ ASSERT_EQ(result, first_expected_default);
+ }
+
+ {
+ PinnableWideColumns result;
+ ASSERT_OK(db_->GetEntity(ReadOptions(), db_->DefaultColumnFamily(),
+ first_key, &result));
+ ASSERT_EQ(result.columns(), first_expected_columns);
+ }
+
+ {
+ PinnableSlice result;
+ ASSERT_OK(db_->Get(ReadOptions(), db_->DefaultColumnFamily(), second_key,
+ &result));
+ ASSERT_EQ(result, second_expected_default);
+ }
+
+ {
+ PinnableWideColumns result;
+ ASSERT_OK(db_->GetEntity(ReadOptions(), db_->DefaultColumnFamily(),
+ second_key, &result));
+ ASSERT_EQ(result.columns(), second_expected_columns);
+ }
+
+ {
+ constexpr size_t num_keys = 2;
+
+ std::array<Slice, num_keys> keys{{first_key, second_key}};
+ std::array<PinnableSlice, num_keys> values;
+ std::array<Status, num_keys> statuses;
+
+ db_->MultiGet(ReadOptions(), db_->DefaultColumnFamily(), num_keys,
+ &keys[0], &values[0], &statuses[0]);
+
+ ASSERT_EQ(values[0], first_expected_default);
+ ASSERT_OK(statuses[0]);
+
+ ASSERT_EQ(values[1], second_expected_default);
+ ASSERT_OK(statuses[1]);
+ }
+
+ {
+ std::unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions()));
+
+ iter->SeekToFirst();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ(iter->key(), first_key);
+ ASSERT_EQ(iter->value(), first_expected_default);
+ ASSERT_EQ(iter->columns(), first_expected_columns);
+
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ(iter->key(), second_key);
+ ASSERT_EQ(iter->value(), second_expected_default);
+ ASSERT_EQ(iter->columns(), second_expected_columns);
+
+ iter->Next();
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_OK(iter->status());
+
+ iter->SeekToLast();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ(iter->key(), second_key);
+ ASSERT_EQ(iter->value(), second_expected_default);
+ ASSERT_EQ(iter->columns(), second_expected_columns);
+
+ iter->Prev();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ(iter->key(), first_key);
+ ASSERT_EQ(iter->value(), first_expected_default);
+ ASSERT_EQ(iter->columns(), first_expected_columns);
+
+ iter->Prev();
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_OK(iter->status());
+ }
+ };
+
+ auto verify_merge_ops_pre_compaction = [&]() {
+ constexpr size_t num_merge_operands = 2;
+
+ GetMergeOperandsOptions get_merge_opts;
+ get_merge_opts.expected_max_number_of_operands = num_merge_operands;
+
+ {
+ std::array<PinnableSlice, num_merge_operands> merge_operands;
+ int number_of_operands = 0;
+
+ ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(),
+ first_key, &merge_operands[0],
+ &get_merge_opts, &number_of_operands));
+
+ ASSERT_EQ(number_of_operands, num_merge_operands);
+ ASSERT_EQ(merge_operands[0], first_columns[0].value());
+ ASSERT_EQ(merge_operands[1], first_merge_operand);
+ }
+
+ {
+ std::array<PinnableSlice, num_merge_operands> merge_operands;
+ int number_of_operands = 0;
+
+ ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(),
+ second_key, &merge_operands[0],
+ &get_merge_opts, &number_of_operands));
+
+ ASSERT_EQ(number_of_operands, num_merge_operands);
+ ASSERT_TRUE(merge_operands[0].empty());
+ ASSERT_EQ(merge_operands[1], second_merge_operand);
+ }
+ };
+
+ auto verify_merge_ops_post_compaction = [&]() {
+ constexpr size_t num_merge_operands = 1;
+
+ GetMergeOperandsOptions get_merge_opts;
+ get_merge_opts.expected_max_number_of_operands = num_merge_operands;
+
+ {
+ std::array<PinnableSlice, num_merge_operands> merge_operands;
+ int number_of_operands = 0;
+
+ ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(),
+ first_key, &merge_operands[0],
+ &get_merge_opts, &number_of_operands));
+
+ ASSERT_EQ(number_of_operands, num_merge_operands);
+ ASSERT_EQ(merge_operands[0], first_expected_default);
+ }
+
+ {
+ std::array<PinnableSlice, num_merge_operands> merge_operands;
+ int number_of_operands = 0;
+
+ ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(),
+ second_key, &merge_operands[0],
+ &get_merge_opts, &number_of_operands));
+
+ ASSERT_EQ(number_of_operands, num_merge_operands);
+ ASSERT_EQ(merge_operands[0], second_expected_default);
+ }
+ };
+
+ {
+ // Base KVs and Merge operands both in memtable (note: we take a snapshot in
+ // between to make sure they do not get reconciled during the subsequent
+ // flush)
+ write_base();
+ ManagedSnapshot snapshot(db_);
+ write_merge();
+ verify_basic();
+ verify_merge_ops_pre_compaction();
+
+ // Base KVs and Merge operands both in storage
+ ASSERT_OK(Flush());
+ verify_basic();
+ verify_merge_ops_pre_compaction();
+ }
+
+ // Base KVs in storage, Merge operands in memtable
+ DestroyAndReopen(options);
+ write_base();
+ ASSERT_OK(Flush());
+ write_merge();
+ verify_basic();
+ verify_merge_ops_pre_compaction();
+
+ // Flush and compact
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), /* begin */ nullptr,
+ /* end */ nullptr));
+ verify_basic();
+ verify_merge_ops_post_compaction();
+}
+
+TEST_F(DBWideBasicTest, PutEntityTimestampError) {
+ // Note: timestamps are currently not supported
+
+ Options options = GetDefaultOptions();
+ options.comparator = test::BytewiseComparatorWithU64TsWrapper();
+
+ ColumnFamilyHandle* handle = nullptr;
+ ASSERT_OK(db_->CreateColumnFamily(options, "corinthian", &handle));
+ std::unique_ptr<ColumnFamilyHandle> handle_guard(handle);
+
+ // Use the DB::PutEntity API
+ constexpr char first_key[] = "first";
+ WideColumns first_columns{{"attr_name1", "foo"}, {"attr_name2", "bar"}};
+
+ ASSERT_TRUE(db_->PutEntity(WriteOptions(), handle, first_key, first_columns)
+ .IsInvalidArgument());
+
+ // Use WriteBatch
+ constexpr char second_key[] = "second";
+ WideColumns second_columns{{"doric", "column"}, {"ionic", "column"}};
+
+ WriteBatch batch;
+ ASSERT_TRUE(
+ batch.PutEntity(handle, second_key, second_columns).IsInvalidArgument());
+ ASSERT_OK(db_->Write(WriteOptions(), &batch));
+}
+
+TEST_F(DBWideBasicTest, PutEntitySerializationError) {
+ // Make sure duplicate columns are caught
+
+ Options options = GetDefaultOptions();
+
+ // Use the DB::PutEntity API
+ constexpr char first_key[] = "first";
+ WideColumns first_columns{{"foo", "bar"}, {"foo", "baz"}};
+
+ ASSERT_TRUE(db_->PutEntity(WriteOptions(), db_->DefaultColumnFamily(),
+ first_key, first_columns)
+ .IsCorruption());
+
+ // Use WriteBatch
+ constexpr char second_key[] = "second";
+ WideColumns second_columns{{"column", "doric"}, {"column", "ionic"}};
+
+ WriteBatch batch;
+ ASSERT_TRUE(
+ batch.PutEntity(db_->DefaultColumnFamily(), second_key, second_columns)
+ .IsCorruption());
+ ASSERT_OK(db_->Write(WriteOptions(), &batch));
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ RegisterCustomObjects(argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/wide/wide_column_serialization.cc b/src/rocksdb/db/wide/wide_column_serialization.cc
new file mode 100644
index 000000000..f62143c40
--- /dev/null
+++ b/src/rocksdb/db/wide/wide_column_serialization.cc
@@ -0,0 +1,182 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/wide/wide_column_serialization.h"
+
+#include <algorithm>
+#include <cassert>
+#include <limits>
+
+#include "rocksdb/slice.h"
+#include "util/autovector.h"
+#include "util/coding.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+Status WideColumnSerialization::SerializeImpl(const Slice* value_of_default,
+ const WideColumns& columns,
+ std::string& output) {
+ const size_t num_columns =
+ value_of_default ? columns.size() + 1 : columns.size();
+
+ if (num_columns > static_cast<size_t>(std::numeric_limits<uint32_t>::max())) {
+ return Status::InvalidArgument("Too many wide columns");
+ }
+
+ PutVarint32(&output, kCurrentVersion);
+
+ PutVarint32(&output, static_cast<uint32_t>(num_columns));
+
+ const Slice* prev_name = nullptr;
+ if (value_of_default) {
+ if (value_of_default->size() >
+ static_cast<size_t>(std::numeric_limits<uint32_t>::max())) {
+ return Status::InvalidArgument("Wide column value too long");
+ }
+
+ PutLengthPrefixedSlice(&output, kDefaultWideColumnName);
+ PutVarint32(&output, static_cast<uint32_t>(value_of_default->size()));
+
+ prev_name = &kDefaultWideColumnName;
+ }
+
+ for (size_t i = 0; i < columns.size(); ++i) {
+ const WideColumn& column = columns[i];
+
+ const Slice& name = column.name();
+ if (name.size() >
+ static_cast<size_t>(std::numeric_limits<uint32_t>::max())) {
+ return Status::InvalidArgument("Wide column name too long");
+ }
+
+ if (prev_name && prev_name->compare(name) >= 0) {
+ return Status::Corruption("Wide columns out of order");
+ }
+
+ const Slice& value = column.value();
+ if (value.size() >
+ static_cast<size_t>(std::numeric_limits<uint32_t>::max())) {
+ return Status::InvalidArgument("Wide column value too long");
+ }
+
+ PutLengthPrefixedSlice(&output, name);
+ PutVarint32(&output, static_cast<uint32_t>(value.size()));
+
+ prev_name = &name;
+ }
+
+ if (value_of_default) {
+ output.append(value_of_default->data(), value_of_default->size());
+ }
+
+ for (const auto& column : columns) {
+ const Slice& value = column.value();
+
+ output.append(value.data(), value.size());
+ }
+
+ return Status::OK();
+}
+
+Status WideColumnSerialization::Deserialize(Slice& input,
+ WideColumns& columns) {
+ assert(columns.empty());
+
+ uint32_t version = 0;
+ if (!GetVarint32(&input, &version)) {
+ return Status::Corruption("Error decoding wide column version");
+ }
+
+ if (version > kCurrentVersion) {
+ return Status::NotSupported("Unsupported wide column version");
+ }
+
+ uint32_t num_columns = 0;
+ if (!GetVarint32(&input, &num_columns)) {
+ return Status::Corruption("Error decoding number of wide columns");
+ }
+
+ if (!num_columns) {
+ return Status::OK();
+ }
+
+ columns.reserve(num_columns);
+
+ autovector<uint32_t, 16> column_value_sizes;
+ column_value_sizes.reserve(num_columns);
+
+ for (uint32_t i = 0; i < num_columns; ++i) {
+ Slice name;
+ if (!GetLengthPrefixedSlice(&input, &name)) {
+ return Status::Corruption("Error decoding wide column name");
+ }
+
+ if (!columns.empty() && columns.back().name().compare(name) >= 0) {
+ return Status::Corruption("Wide columns out of order");
+ }
+
+ columns.emplace_back(name, Slice());
+
+ uint32_t value_size = 0;
+ if (!GetVarint32(&input, &value_size)) {
+ return Status::Corruption("Error decoding wide column value size");
+ }
+
+ column_value_sizes.emplace_back(value_size);
+ }
+
+ const Slice data(input);
+ size_t pos = 0;
+
+ for (uint32_t i = 0; i < num_columns; ++i) {
+ const uint32_t value_size = column_value_sizes[i];
+
+ if (pos + value_size > data.size()) {
+ return Status::Corruption("Error decoding wide column value payload");
+ }
+
+ columns[i].value() = Slice(data.data() + pos, value_size);
+
+ pos += value_size;
+ }
+
+ return Status::OK();
+}
+
+WideColumns::const_iterator WideColumnSerialization::Find(
+ const WideColumns& columns, const Slice& column_name) {
+ const auto it =
+ std::lower_bound(columns.cbegin(), columns.cend(), column_name,
+ [](const WideColumn& lhs, const Slice& rhs) {
+ return lhs.name().compare(rhs) < 0;
+ });
+
+ if (it == columns.cend() || it->name() != column_name) {
+ return columns.cend();
+ }
+
+ return it;
+}
+
+Status WideColumnSerialization::GetValueOfDefaultColumn(Slice& input,
+ Slice& value) {
+ WideColumns columns;
+
+ const Status s = Deserialize(input, columns);
+ if (!s.ok()) {
+ return s;
+ }
+
+ if (columns.empty() || columns[0].name() != kDefaultWideColumnName) {
+ value.clear();
+ return Status::OK();
+ }
+
+ value = columns[0].value();
+
+ return Status::OK();
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/wide/wide_column_serialization.h b/src/rocksdb/db/wide/wide_column_serialization.h
new file mode 100644
index 000000000..f0ffbd392
--- /dev/null
+++ b/src/rocksdb/db/wide/wide_column_serialization.h
@@ -0,0 +1,77 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <cstdint>
+#include <string>
+
+#include "rocksdb/rocksdb_namespace.h"
+#include "rocksdb/status.h"
+#include "rocksdb/wide_columns.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Slice;
+
+// Wide-column serialization/deserialization primitives.
+//
+// The two main parts of the layout are 1) a sorted index containing the column
+// names and column value sizes and 2) the column values themselves. Keeping the
+// index and the values separate will enable selectively reading column values
+// down the line. Note that currently the index has to be fully parsed in order
+// to find out the offset of each column value.
+//
+// Legend: cn = column name, cv = column value, cns = column name size, cvs =
+// column value size.
+//
+// +----------+--------------+----------+-------+----------+---...
+// | version  | # of columns |  cns 1   | cn 1  |  cvs 1   |
+// +----------+--------------+----------+-------+----------+---...
+// | varint32 |   varint32   | varint32 | bytes | varint32 |
+// +----------+--------------+----------+-------+----------+---...
+//
+// ... continued ...
+//
+// ...---+----------+-------+----------+-------+---...---+-------+
+//       |  cns N   | cn N  |  cvs N   | cv 1  |         | cv N  |
+// ...---+----------+-------+----------+-------+---...---+-------+
+//       | varint32 | bytes | varint32 | bytes |         | bytes |
+// ...---+----------+-------+----------+-------+---...---+-------+
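+//
+// Illustrative example (editorial addition, not part of the original header):
+// with the current version (1), serializing the two columns
+// {"foo" -> "bar", "hello" -> "world"} produces, byte by byte:
+//
+//   0x01                      version
+//   0x02                      number of columns
+//   0x03 'f' 'o' 'o'          length-prefixed name of column 1
+//   0x03                      value size of column 1
+//   0x05 'h' 'e' 'l' 'l' 'o'  length-prefixed name of column 2
+//   0x05                      value size of column 2
+//   'b' 'a' 'r'               value of column 1
+//   'w' 'o' 'r' 'l' 'd'       value of column 2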
+
+class WideColumnSerialization {
+ public:
+ static Status Serialize(const WideColumns& columns, std::string& output);
+ static Status Serialize(const Slice& value_of_default,
+ const WideColumns& other_columns,
+ std::string& output);
+
+ static Status Deserialize(Slice& input, WideColumns& columns);
+
+ static WideColumns::const_iterator Find(const WideColumns& columns,
+ const Slice& column_name);
+ static Status GetValueOfDefaultColumn(Slice& input, Slice& value);
+
+ static constexpr uint32_t kCurrentVersion = 1;
+
+ private:
+ static Status SerializeImpl(const Slice* value_of_default,
+ const WideColumns& columns, std::string& output);
+};
+
+inline Status WideColumnSerialization::Serialize(const WideColumns& columns,
+ std::string& output) {
+ constexpr Slice* value_of_default = nullptr;
+
+ return SerializeImpl(value_of_default, columns, output);
+}
+
+inline Status WideColumnSerialization::Serialize(
+ const Slice& value_of_default, const WideColumns& other_columns,
+ std::string& output) {
+ return SerializeImpl(&value_of_default, other_columns, output);
+}
+
+} // namespace ROCKSDB_NAMESPACE
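
The header above only declares the interface; the following editorial sketch (not part of the patch) shows the round trip it is meant to support. One detail worth noting, visible in the Deserialize() implementation earlier in this diff, is that the deserialized WideColumns hold Slices pointing into the serialized buffer, so output must outlive decoded here. The test file that follows exercises the same round trip.

    #include <cassert>

    #include "db/wide/wide_column_serialization.h"

    using namespace ROCKSDB_NAMESPACE;

    Status RoundTripSketch() {
      WideColumns columns{{"foo", "bar"}, {"hello", "world"}};

      std::string output;
      Status s = WideColumnSerialization::Serialize(columns, output);
      if (!s.ok()) {
        return s;
      }

      Slice input(output);  // Deserialize advances this Slice as it parses
      WideColumns decoded;
      s = WideColumnSerialization::Deserialize(input, decoded);
      if (!s.ok()) {
        return s;
      }

      // Find() does a binary search over the sorted column index
      const auto it = WideColumnSerialization::Find(decoded, "hello");
      assert(it != decoded.cend() && it->value() == "world");

      return Status::OK();
    }
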
diff --git a/src/rocksdb/db/wide/wide_column_serialization_test.cc b/src/rocksdb/db/wide/wide_column_serialization_test.cc
new file mode 100644
index 000000000..8060d2f24
--- /dev/null
+++ b/src/rocksdb/db/wide/wide_column_serialization_test.cc
@@ -0,0 +1,338 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/wide/wide_column_serialization.h"
+
+#include "test_util/testharness.h"
+#include "util/coding.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+TEST(WideColumnSerializationTest, Construct) {
+ constexpr char foo[] = "foo";
+ constexpr char bar[] = "bar";
+
+ const std::string foo_str(foo);
+ const std::string bar_str(bar);
+
+ const Slice foo_slice(foo_str);
+ const Slice bar_slice(bar_str);
+
+ {
+ WideColumn column(foo, bar);
+ ASSERT_EQ(column.name(), foo);
+ ASSERT_EQ(column.value(), bar);
+ }
+
+ {
+ WideColumn column(foo_str, bar);
+ ASSERT_EQ(column.name(), foo_str);
+ ASSERT_EQ(column.value(), bar);
+ }
+
+ {
+ WideColumn column(foo_slice, bar);
+ ASSERT_EQ(column.name(), foo_slice);
+ ASSERT_EQ(column.value(), bar);
+ }
+
+ {
+ WideColumn column(foo, bar_str);
+ ASSERT_EQ(column.name(), foo);
+ ASSERT_EQ(column.value(), bar_str);
+ }
+
+ {
+ WideColumn column(foo_str, bar_str);
+ ASSERT_EQ(column.name(), foo_str);
+ ASSERT_EQ(column.value(), bar_str);
+ }
+
+ {
+ WideColumn column(foo_slice, bar_str);
+ ASSERT_EQ(column.name(), foo_slice);
+ ASSERT_EQ(column.value(), bar_str);
+ }
+
+ {
+ WideColumn column(foo, bar_slice);
+ ASSERT_EQ(column.name(), foo);
+ ASSERT_EQ(column.value(), bar_slice);
+ }
+
+ {
+ WideColumn column(foo_str, bar_slice);
+ ASSERT_EQ(column.name(), foo_str);
+ ASSERT_EQ(column.value(), bar_slice);
+ }
+
+ {
+ WideColumn column(foo_slice, bar_slice);
+ ASSERT_EQ(column.name(), foo_slice);
+ ASSERT_EQ(column.value(), bar_slice);
+ }
+
+ {
+ constexpr char foo_name[] = "foo_name";
+ constexpr char bar_value[] = "bar_value";
+
+ WideColumn column(std::piecewise_construct,
+ std::forward_as_tuple(foo_name, sizeof(foo) - 1),
+ std::forward_as_tuple(bar_value, sizeof(bar) - 1));
+ ASSERT_EQ(column.name(), foo);
+ ASSERT_EQ(column.value(), bar);
+ }
+}
+
+TEST(WideColumnSerializationTest, SerializeDeserialize) {
+ WideColumns columns{{"foo", "bar"}, {"hello", "world"}};
+ std::string output;
+
+ ASSERT_OK(WideColumnSerialization::Serialize(columns, output));
+
+ Slice input(output);
+ WideColumns deserialized_columns;
+
+ ASSERT_OK(WideColumnSerialization::Deserialize(input, deserialized_columns));
+ ASSERT_EQ(columns, deserialized_columns);
+
+ {
+ const auto it = WideColumnSerialization::Find(deserialized_columns, "foo");
+ ASSERT_NE(it, deserialized_columns.cend());
+ ASSERT_EQ(*it, deserialized_columns.front());
+ }
+
+ {
+ const auto it =
+ WideColumnSerialization::Find(deserialized_columns, "hello");
+ ASSERT_NE(it, deserialized_columns.cend());
+ ASSERT_EQ(*it, deserialized_columns.back());
+ }
+
+ {
+ const auto it =
+ WideColumnSerialization::Find(deserialized_columns, "fubar");
+ ASSERT_EQ(it, deserialized_columns.cend());
+ }
+
+ {
+ const auto it =
+ WideColumnSerialization::Find(deserialized_columns, "snafu");
+ ASSERT_EQ(it, deserialized_columns.cend());
+ }
+}
+
+TEST(WideColumnSerializationTest, SerializeWithPrepend) {
+ Slice value_of_default("baz");
+ WideColumns other_columns{{"foo", "bar"}, {"hello", "world"}};
+
+ std::string output;
+ ASSERT_OK(WideColumnSerialization::Serialize(value_of_default, other_columns,
+ output));
+
+ Slice input(output);
+
+ WideColumns deserialized_columns;
+ ASSERT_OK(WideColumnSerialization::Deserialize(input, deserialized_columns));
+
+ WideColumns expected_columns{{kDefaultWideColumnName, value_of_default},
+ other_columns[0],
+ other_columns[1]};
+ ASSERT_EQ(deserialized_columns, expected_columns);
+}
+
+TEST(WideColumnSerializationTest, SerializeDuplicateError) {
+ WideColumns columns{{"foo", "bar"}, {"foo", "baz"}};
+ std::string output;
+
+ ASSERT_TRUE(
+ WideColumnSerialization::Serialize(columns, output).IsCorruption());
+}
+
+TEST(WideColumnSerializationTest, SerializeWithPrependDuplicateError) {
+ Slice value_of_default("baz");
+ WideColumns other_columns{{kDefaultWideColumnName, "dup"}, {"foo", "bar"}};
+
+ std::string output;
+ ASSERT_TRUE(WideColumnSerialization::Serialize(value_of_default,
+ other_columns, output)
+ .IsCorruption());
+}
+
+TEST(WideColumnSerializationTest, SerializeOutOfOrderError) {
+ WideColumns columns{{"hello", "world"}, {"foo", "bar"}};
+ std::string output;
+
+ ASSERT_TRUE(
+ WideColumnSerialization::Serialize(columns, output).IsCorruption());
+}
+
+TEST(WideColumnSerializationTest, DeserializeVersionError) {
+ // Can't decode version
+
+ std::string buf;
+
+ Slice input(buf);
+ WideColumns columns;
+
+ const Status s = WideColumnSerialization::Deserialize(input, columns);
+ ASSERT_TRUE(s.IsCorruption());
+ ASSERT_TRUE(std::strstr(s.getState(), "version"));
+}
+
+TEST(WideColumnSerializationTest, DeserializeUnsupportedVersion) {
+ // Unsupported version
+ constexpr uint32_t future_version = 1000;
+
+ std::string buf;
+ PutVarint32(&buf, future_version);
+
+ Slice input(buf);
+ WideColumns columns;
+
+ const Status s = WideColumnSerialization::Deserialize(input, columns);
+ ASSERT_TRUE(s.IsNotSupported());
+ ASSERT_TRUE(std::strstr(s.getState(), "version"));
+}
+
+TEST(WideColumnSerializationTest, DeserializeNumberOfColumnsError) {
+ // Can't decode number of columns
+
+ std::string buf;
+ PutVarint32(&buf, WideColumnSerialization::kCurrentVersion);
+
+ Slice input(buf);
+ WideColumns columns;
+
+ const Status s = WideColumnSerialization::Deserialize(input, columns);
+ ASSERT_TRUE(s.IsCorruption());
+ ASSERT_TRUE(std::strstr(s.getState(), "number"));
+}
+
+TEST(WideColumnSerializationTest, DeserializeColumnsError) {
+ std::string buf;
+
+ PutVarint32(&buf, WideColumnSerialization::kCurrentVersion);
+
+ constexpr uint32_t num_columns = 2;
+ PutVarint32(&buf, num_columns);
+
+ // Can't decode the first column name
+ {
+ Slice input(buf);
+ WideColumns columns;
+
+ const Status s = WideColumnSerialization::Deserialize(input, columns);
+ ASSERT_TRUE(s.IsCorruption());
+ ASSERT_TRUE(std::strstr(s.getState(), "name"));
+ }
+
+ constexpr char first_column_name[] = "foo";
+ PutLengthPrefixedSlice(&buf, first_column_name);
+
+ // Can't decode the size of the first column value
+ {
+ Slice input(buf);
+ WideColumns columns;
+
+ const Status s = WideColumnSerialization::Deserialize(input, columns);
+ ASSERT_TRUE(s.IsCorruption());
+ ASSERT_TRUE(std::strstr(s.getState(), "value size"));
+ }
+
+ constexpr uint32_t first_value_size = 16;
+ PutVarint32(&buf, first_value_size);
+
+ // Can't decode the second column name
+ {
+ Slice input(buf);
+ WideColumns columns;
+
+ const Status s = WideColumnSerialization::Deserialize(input, columns);
+ ASSERT_TRUE(s.IsCorruption());
+ ASSERT_TRUE(std::strstr(s.getState(), "name"));
+ }
+
+ constexpr char second_column_name[] = "hello";
+ PutLengthPrefixedSlice(&buf, second_column_name);
+
+ // Can't decode the size of the second column value
+ {
+ Slice input(buf);
+ WideColumns columns;
+
+ const Status s = WideColumnSerialization::Deserialize(input, columns);
+ ASSERT_TRUE(s.IsCorruption());
+ ASSERT_TRUE(std::strstr(s.getState(), "value size"));
+ }
+
+ constexpr uint32_t second_value_size = 64;
+ PutVarint32(&buf, second_value_size);
+
+ // Can't decode the payload of the first column
+ {
+ Slice input(buf);
+ WideColumns columns;
+
+ const Status s = WideColumnSerialization::Deserialize(input, columns);
+ ASSERT_TRUE(s.IsCorruption());
+ ASSERT_TRUE(std::strstr(s.getState(), "payload"));
+ }
+
+ buf.append(first_value_size, '0');
+
+ // Can't decode the payload of the second column
+ {
+ Slice input(buf);
+ WideColumns columns;
+
+ const Status s = WideColumnSerialization::Deserialize(input, columns);
+ ASSERT_TRUE(s.IsCorruption());
+ ASSERT_TRUE(std::strstr(s.getState(), "payload"));
+ }
+
+ buf.append(second_value_size, 'x');
+
+ // Success
+ {
+ Slice input(buf);
+ WideColumns columns;
+
+ ASSERT_OK(WideColumnSerialization::Deserialize(input, columns));
+ }
+}
+
+TEST(WideColumnSerializationTest, DeserializeColumnsOutOfOrder) {
+ std::string buf;
+
+ PutVarint32(&buf, WideColumnSerialization::kCurrentVersion);
+
+ constexpr uint32_t num_columns = 2;
+ PutVarint32(&buf, num_columns);
+
+ constexpr char first_column_name[] = "b";
+ PutLengthPrefixedSlice(&buf, first_column_name);
+
+ constexpr uint32_t first_value_size = 16;
+ PutVarint32(&buf, first_value_size);
+
+ constexpr char second_column_name[] = "a";
+ PutLengthPrefixedSlice(&buf, second_column_name);
+
+ Slice input(buf);
+ WideColumns columns;
+
+ const Status s = WideColumnSerialization::Deserialize(input, columns);
+ ASSERT_TRUE(s.IsCorruption());
+ ASSERT_TRUE(std::strstr(s.getState(), "order"));
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/wide/wide_columns.cc b/src/rocksdb/db/wide/wide_columns.cc
new file mode 100644
index 000000000..186be7f85
--- /dev/null
+++ b/src/rocksdb/db/wide/wide_columns.cc
@@ -0,0 +1,22 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "rocksdb/wide_columns.h"
+
+#include "db/wide/wide_column_serialization.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+const Slice kDefaultWideColumnName;
+
+const WideColumns kNoWideColumns;
+
+Status PinnableWideColumns::CreateIndexForWideColumns() {
+ Slice value_copy = value_;
+
+ return WideColumnSerialization::Deserialize(value_copy, columns_);
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/write_batch.cc b/src/rocksdb/db/write_batch.cc
new file mode 100644
index 000000000..796697cfc
--- /dev/null
+++ b/src/rocksdb/db/write_batch.cc
@@ -0,0 +1,3137 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// WriteBatch::rep_ :=
+// sequence: fixed64
+// count: fixed32
+// data: record[count]
+// record :=
+// kTypeValue varstring varstring
+// kTypeDeletion varstring
+// kTypeSingleDeletion varstring
+// kTypeRangeDeletion varstring varstring
+// kTypeMerge varstring varstring
+// kTypeColumnFamilyValue varint32 varstring varstring
+// kTypeColumnFamilyDeletion varint32 varstring
+// kTypeColumnFamilySingleDeletion varint32 varstring
+// kTypeColumnFamilyRangeDeletion varint32 varstring varstring
+// kTypeColumnFamilyMerge varint32 varstring varstring
+// kTypeBeginPrepareXID
+// kTypeEndPrepareXID varstring
+// kTypeCommitXID varstring
+// kTypeCommitXIDAndTimestamp varstring varstring
+// kTypeRollbackXID varstring
+// kTypeBeginPersistedPrepareXID
+// kTypeBeginUnprepareXID
+// kTypeWideColumnEntity varstring varstring
+// kTypeColumnFamilyWideColumnEntity varint32 varstring varstring
+// kTypeNoop
+// varstring :=
+// len: varint32
+// data: uint8[len]
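+//
+// Illustrative example (editorial addition, not part of the original file):
+// a batch holding the single operation Put("k", "v") on the default column
+// family is laid out as
+//   8 bytes     sequence (fixed64; zero until assigned at write time)
+//   4 bytes     count (fixed32; here 1)
+//   1 byte      kTypeValue tag
+//   0x01 'k'    varstring key
+//   0x01 'v'    varstring value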
+
+#include "rocksdb/write_batch.h"
+
+#include <algorithm>
+#include <limits>
+#include <map>
+#include <stack>
+#include <stdexcept>
+#include <type_traits>
+#include <unordered_map>
+#include <vector>
+
+#include "db/column_family.h"
+#include "db/db_impl/db_impl.h"
+#include "db/dbformat.h"
+#include "db/flush_scheduler.h"
+#include "db/kv_checksum.h"
+#include "db/memtable.h"
+#include "db/merge_context.h"
+#include "db/snapshot_impl.h"
+#include "db/trim_history_scheduler.h"
+#include "db/wide/wide_column_serialization.h"
+#include "db/write_batch_internal.h"
+#include "monitoring/perf_context_imp.h"
+#include "monitoring/statistics.h"
+#include "port/lang.h"
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/system_clock.h"
+#include "util/autovector.h"
+#include "util/cast_util.h"
+#include "util/coding.h"
+#include "util/duplicate_detector.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// anon namespace for file-local types
+namespace {
+
+enum ContentFlags : uint32_t {
+ DEFERRED = 1 << 0,
+ HAS_PUT = 1 << 1,
+ HAS_DELETE = 1 << 2,
+ HAS_SINGLE_DELETE = 1 << 3,
+ HAS_MERGE = 1 << 4,
+ HAS_BEGIN_PREPARE = 1 << 5,
+ HAS_END_PREPARE = 1 << 6,
+ HAS_COMMIT = 1 << 7,
+ HAS_ROLLBACK = 1 << 8,
+ HAS_DELETE_RANGE = 1 << 9,
+ HAS_BLOB_INDEX = 1 << 10,
+ HAS_BEGIN_UNPREPARE = 1 << 11,
+ HAS_PUT_ENTITY = 1 << 12,
+};
+
+struct BatchContentClassifier : public WriteBatch::Handler {
+ uint32_t content_flags = 0;
+
+ Status PutCF(uint32_t, const Slice&, const Slice&) override {
+ content_flags |= ContentFlags::HAS_PUT;
+ return Status::OK();
+ }
+
+ Status PutEntityCF(uint32_t /* column_family_id */, const Slice& /* key */,
+ const Slice& /* entity */) override {
+ content_flags |= ContentFlags::HAS_PUT_ENTITY;
+ return Status::OK();
+ }
+
+ Status DeleteCF(uint32_t, const Slice&) override {
+ content_flags |= ContentFlags::HAS_DELETE;
+ return Status::OK();
+ }
+
+ Status SingleDeleteCF(uint32_t, const Slice&) override {
+ content_flags |= ContentFlags::HAS_SINGLE_DELETE;
+ return Status::OK();
+ }
+
+ Status DeleteRangeCF(uint32_t, const Slice&, const Slice&) override {
+ content_flags |= ContentFlags::HAS_DELETE_RANGE;
+ return Status::OK();
+ }
+
+ Status MergeCF(uint32_t, const Slice&, const Slice&) override {
+ content_flags |= ContentFlags::HAS_MERGE;
+ return Status::OK();
+ }
+
+ Status PutBlobIndexCF(uint32_t, const Slice&, const Slice&) override {
+ content_flags |= ContentFlags::HAS_BLOB_INDEX;
+ return Status::OK();
+ }
+
+ Status MarkBeginPrepare(bool unprepare) override {
+ content_flags |= ContentFlags::HAS_BEGIN_PREPARE;
+ if (unprepare) {
+ content_flags |= ContentFlags::HAS_BEGIN_UNPREPARE;
+ }
+ return Status::OK();
+ }
+
+ Status MarkEndPrepare(const Slice&) override {
+ content_flags |= ContentFlags::HAS_END_PREPARE;
+ return Status::OK();
+ }
+
+ Status MarkCommit(const Slice&) override {
+ content_flags |= ContentFlags::HAS_COMMIT;
+ return Status::OK();
+ }
+
+ Status MarkCommitWithTimestamp(const Slice&, const Slice&) override {
+ content_flags |= ContentFlags::HAS_COMMIT;
+ return Status::OK();
+ }
+
+ Status MarkRollback(const Slice&) override {
+ content_flags |= ContentFlags::HAS_ROLLBACK;
+ return Status::OK();
+ }
+};
+
+} // anonymous namespace
+
+struct SavePoints {
+ std::stack<SavePoint, autovector<SavePoint>> stack;
+};
+
+WriteBatch::WriteBatch(size_t reserved_bytes, size_t max_bytes,
+ size_t protection_bytes_per_key, size_t default_cf_ts_sz)
+ : content_flags_(0),
+ max_bytes_(max_bytes),
+ default_cf_ts_sz_(default_cf_ts_sz),
+ rep_() {
+ // Currently `protection_bytes_per_key` can only be enabled at 8 bytes per
+ // entry.
+ assert(protection_bytes_per_key == 0 || protection_bytes_per_key == 8);
+ if (protection_bytes_per_key != 0) {
+ prot_info_.reset(new WriteBatch::ProtectionInfo());
+ }
+ rep_.reserve((reserved_bytes > WriteBatchInternal::kHeader)
+ ? reserved_bytes
+ : WriteBatchInternal::kHeader);
+ rep_.resize(WriteBatchInternal::kHeader);
+}
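+
+// Editorial usage note (not part of the original file): given the constructor
+// above, per-key integrity protection is requested by passing 8 for
+// protection_bytes_per_key, e.g.
+//
+//   WriteBatch batch(/* reserved_bytes */ 0, /* max_bytes */ 0,
+//                    /* protection_bytes_per_key */ 8,
+//                    /* default_cf_ts_sz */ 0);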
+
+WriteBatch::WriteBatch(const std::string& rep)
+ : content_flags_(ContentFlags::DEFERRED), max_bytes_(0), rep_(rep) {}
+
+WriteBatch::WriteBatch(std::string&& rep)
+ : content_flags_(ContentFlags::DEFERRED),
+ max_bytes_(0),
+ rep_(std::move(rep)) {}
+
+WriteBatch::WriteBatch(const WriteBatch& src)
+ : wal_term_point_(src.wal_term_point_),
+ content_flags_(src.content_flags_.load(std::memory_order_relaxed)),
+ max_bytes_(src.max_bytes_),
+ default_cf_ts_sz_(src.default_cf_ts_sz_),
+ rep_(src.rep_) {
+ if (src.save_points_ != nullptr) {
+ save_points_.reset(new SavePoints());
+ save_points_->stack = src.save_points_->stack;
+ }
+ if (src.prot_info_ != nullptr) {
+ prot_info_.reset(new WriteBatch::ProtectionInfo());
+ prot_info_->entries_ = src.prot_info_->entries_;
+ }
+}
+
+WriteBatch::WriteBatch(WriteBatch&& src) noexcept
+ : save_points_(std::move(src.save_points_)),
+ wal_term_point_(std::move(src.wal_term_point_)),
+ content_flags_(src.content_flags_.load(std::memory_order_relaxed)),
+ max_bytes_(src.max_bytes_),
+ prot_info_(std::move(src.prot_info_)),
+ default_cf_ts_sz_(src.default_cf_ts_sz_),
+ rep_(std::move(src.rep_)) {}
+
+WriteBatch& WriteBatch::operator=(const WriteBatch& src) {
+ if (&src != this) {
+ this->~WriteBatch();
+ new (this) WriteBatch(src);
+ }
+ return *this;
+}
+
+WriteBatch& WriteBatch::operator=(WriteBatch&& src) {
+ if (&src != this) {
+ this->~WriteBatch();
+ new (this) WriteBatch(std::move(src));
+ }
+ return *this;
+}
+
+WriteBatch::~WriteBatch() {}
+
+WriteBatch::Handler::~Handler() {}
+
+void WriteBatch::Handler::LogData(const Slice& /*blob*/) {
+ // If the user has not specified something to do with blobs, then we ignore
+ // them.
+}
+
+bool WriteBatch::Handler::Continue() { return true; }
+
+void WriteBatch::Clear() {
+ rep_.clear();
+ rep_.resize(WriteBatchInternal::kHeader);
+
+ content_flags_.store(0, std::memory_order_relaxed);
+
+ if (save_points_ != nullptr) {
+ while (!save_points_->stack.empty()) {
+ save_points_->stack.pop();
+ }
+ }
+
+ if (prot_info_ != nullptr) {
+ prot_info_->entries_.clear();
+ }
+ wal_term_point_.clear();
+ default_cf_ts_sz_ = 0;
+}
+
+uint32_t WriteBatch::Count() const { return WriteBatchInternal::Count(this); }
+
+uint32_t WriteBatch::ComputeContentFlags() const {
+ auto rv = content_flags_.load(std::memory_order_relaxed);
+ if ((rv & ContentFlags::DEFERRED) != 0) {
+ BatchContentClassifier classifier;
+ // Should we handle status here?
+ Iterate(&classifier).PermitUncheckedError();
+ rv = classifier.content_flags;
+
+ // this method is conceptually const, because it is performing a lazy
+ // computation that doesn't affect the abstract state of the batch.
+ // content_flags_ is marked mutable so that we can perform the
+ // following assignment
+ content_flags_.store(rv, std::memory_order_relaxed);
+ }
+ return rv;
+}
+
+void WriteBatch::MarkWalTerminationPoint() {
+ wal_term_point_.size = GetDataSize();
+ wal_term_point_.count = Count();
+ wal_term_point_.content_flags = content_flags_;
+}
+
+size_t WriteBatch::GetProtectionBytesPerKey() const {
+ if (prot_info_ != nullptr) {
+ return prot_info_->GetBytesPerKey();
+ }
+ return 0;
+}
+
+bool WriteBatch::HasPut() const {
+ return (ComputeContentFlags() & ContentFlags::HAS_PUT) != 0;
+}
+
+bool WriteBatch::HasPutEntity() const {
+ return (ComputeContentFlags() & ContentFlags::HAS_PUT_ENTITY) != 0;
+}
+
+bool WriteBatch::HasDelete() const {
+ return (ComputeContentFlags() & ContentFlags::HAS_DELETE) != 0;
+}
+
+bool WriteBatch::HasSingleDelete() const {
+ return (ComputeContentFlags() & ContentFlags::HAS_SINGLE_DELETE) != 0;
+}
+
+bool WriteBatch::HasDeleteRange() const {
+ return (ComputeContentFlags() & ContentFlags::HAS_DELETE_RANGE) != 0;
+}
+
+bool WriteBatch::HasMerge() const {
+ return (ComputeContentFlags() & ContentFlags::HAS_MERGE) != 0;
+}
+
+bool ReadKeyFromWriteBatchEntry(Slice* input, Slice* key, bool cf_record) {
+ assert(input != nullptr && key != nullptr);
+ // Skip tag byte
+ input->remove_prefix(1);
+
+ if (cf_record) {
+ // Skip column_family bytes
+ uint32_t cf;
+ if (!GetVarint32(input, &cf)) {
+ return false;
+ }
+ }
+
+ // Extract key
+ return GetLengthPrefixedSlice(input, key);
+}
+
+bool WriteBatch::HasBeginPrepare() const {
+ return (ComputeContentFlags() & ContentFlags::HAS_BEGIN_PREPARE) != 0;
+}
+
+bool WriteBatch::HasEndPrepare() const {
+ return (ComputeContentFlags() & ContentFlags::HAS_END_PREPARE) != 0;
+}
+
+bool WriteBatch::HasCommit() const {
+ return (ComputeContentFlags() & ContentFlags::HAS_COMMIT) != 0;
+}
+
+bool WriteBatch::HasRollback() const {
+ return (ComputeContentFlags() & ContentFlags::HAS_ROLLBACK) != 0;
+}
+
+Status ReadRecordFromWriteBatch(Slice* input, char* tag,
+ uint32_t* column_family, Slice* key,
+ Slice* value, Slice* blob, Slice* xid) {
+ assert(key != nullptr && value != nullptr);
+ *tag = (*input)[0];
+ input->remove_prefix(1);
+ *column_family = 0; // default
+ switch (*tag) {
+ case kTypeColumnFamilyValue:
+ if (!GetVarint32(input, column_family)) {
+ return Status::Corruption("bad WriteBatch Put");
+ }
+ FALLTHROUGH_INTENDED;
+ case kTypeValue:
+ if (!GetLengthPrefixedSlice(input, key) ||
+ !GetLengthPrefixedSlice(input, value)) {
+ return Status::Corruption("bad WriteBatch Put");
+ }
+ break;
+ case kTypeColumnFamilyDeletion:
+ case kTypeColumnFamilySingleDeletion:
+ if (!GetVarint32(input, column_family)) {
+ return Status::Corruption("bad WriteBatch Delete");
+ }
+ FALLTHROUGH_INTENDED;
+ case kTypeDeletion:
+ case kTypeSingleDeletion:
+ if (!GetLengthPrefixedSlice(input, key)) {
+ return Status::Corruption("bad WriteBatch Delete");
+ }
+ break;
+ case kTypeColumnFamilyRangeDeletion:
+ if (!GetVarint32(input, column_family)) {
+ return Status::Corruption("bad WriteBatch DeleteRange");
+ }
+ FALLTHROUGH_INTENDED;
+ case kTypeRangeDeletion:
+ // for range delete, "key" is begin_key, "value" is end_key
+ if (!GetLengthPrefixedSlice(input, key) ||
+ !GetLengthPrefixedSlice(input, value)) {
+ return Status::Corruption("bad WriteBatch DeleteRange");
+ }
+ break;
+ case kTypeColumnFamilyMerge:
+ if (!GetVarint32(input, column_family)) {
+ return Status::Corruption("bad WriteBatch Merge");
+ }
+ FALLTHROUGH_INTENDED;
+ case kTypeMerge:
+ if (!GetLengthPrefixedSlice(input, key) ||
+ !GetLengthPrefixedSlice(input, value)) {
+ return Status::Corruption("bad WriteBatch Merge");
+ }
+ break;
+ case kTypeColumnFamilyBlobIndex:
+ if (!GetVarint32(input, column_family)) {
+ return Status::Corruption("bad WriteBatch BlobIndex");
+ }
+ FALLTHROUGH_INTENDED;
+ case kTypeBlobIndex:
+ if (!GetLengthPrefixedSlice(input, key) ||
+ !GetLengthPrefixedSlice(input, value)) {
+ return Status::Corruption("bad WriteBatch BlobIndex");
+ }
+ break;
+ case kTypeLogData:
+ assert(blob != nullptr);
+ if (!GetLengthPrefixedSlice(input, blob)) {
+ return Status::Corruption("bad WriteBatch Blob");
+ }
+ break;
+ case kTypeNoop:
+ case kTypeBeginPrepareXID:
+ // This indicates that the prepared batch is also persisted in the db.
+ // This is used in WritePreparedTxn
+ case kTypeBeginPersistedPrepareXID:
+ // This is used in WriteUnpreparedTxn
+ case kTypeBeginUnprepareXID:
+ break;
+ case kTypeEndPrepareXID:
+ if (!GetLengthPrefixedSlice(input, xid)) {
+ return Status::Corruption("bad EndPrepare XID");
+ }
+ break;
+ case kTypeCommitXIDAndTimestamp:
+ if (!GetLengthPrefixedSlice(input, key)) {
+ return Status::Corruption("bad commit timestamp");
+ }
+ FALLTHROUGH_INTENDED;
+ case kTypeCommitXID:
+ if (!GetLengthPrefixedSlice(input, xid)) {
+ return Status::Corruption("bad Commit XID");
+ }
+ break;
+ case kTypeRollbackXID:
+ if (!GetLengthPrefixedSlice(input, xid)) {
+ return Status::Corruption("bad Rollback XID");
+ }
+ break;
+ case kTypeColumnFamilyWideColumnEntity:
+ if (!GetVarint32(input, column_family)) {
+ return Status::Corruption("bad WriteBatch PutEntity");
+ }
+ FALLTHROUGH_INTENDED;
+ case kTypeWideColumnEntity:
+ if (!GetLengthPrefixedSlice(input, key) ||
+ !GetLengthPrefixedSlice(input, value)) {
+ return Status::Corruption("bad WriteBatch PutEntity");
+ }
+ break;
+ default:
+ return Status::Corruption("unknown WriteBatch tag");
+ }
+ return Status::OK();
+}
+
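+// Replays the batch contents through the given handler, starting right after
+// the fixed-size header. A minimal usage sketch (illustrative only, not part
+// of this file): a Handler subclass overriding the callbacks it cares about
+// can be passed here to inspect a batch, e.g.
+//
+//   struct KeyCounter : public WriteBatch::Handler {
+//     size_t keys = 0;
+//     Status PutCF(uint32_t, const Slice&, const Slice&) override {
+//       ++keys;
+//       return Status::OK();
+//     }
+//     Status DeleteCF(uint32_t, const Slice&) override {
+//       ++keys;
+//       return Status::OK();
+//     }
+//   };
+//   // KeyCounter counter; batch.Iterate(&counter);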
+Status WriteBatch::Iterate(Handler* handler) const {
+ if (rep_.size() < WriteBatchInternal::kHeader) {
+ return Status::Corruption("malformed WriteBatch (too small)");
+ }
+
+ return WriteBatchInternal::Iterate(this, handler, WriteBatchInternal::kHeader,
+ rep_.size());
+}
+
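+// Dispatches the records in rep_[begin, end) to the handler. A handler may
+// return Status::TryAgain to have the same record re-dispatched once (two
+// consecutive TryAgain results are treated as corruption). When the whole
+// batch is iterated, the number of handled records is cross-checked against
+// the count stored in the header.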
+Status WriteBatchInternal::Iterate(const WriteBatch* wb,
+ WriteBatch::Handler* handler, size_t begin,
+ size_t end) {
+ if (begin > wb->rep_.size() || end > wb->rep_.size() || end < begin) {
+ return Status::Corruption("Invalid start/end bounds for Iterate");
+ }
+ assert(begin <= end);
+ Slice input(wb->rep_.data() + begin, static_cast<size_t>(end - begin));
+ bool whole_batch =
+ (begin == WriteBatchInternal::kHeader) && (end == wb->rep_.size());
+
+ Slice key, value, blob, xid;
+
+ // Sometimes a sub-batch starts with a Noop. We want to exclude such Noops
+ // from being treated as batch boundary markers, otherwise we would miscount
+ // the number of batches. We do that by checking whether the accumulated
+ // batch is empty before seeing the next Noop.
+ bool empty_batch = true;
+ uint32_t found = 0;
+ Status s;
+ char tag = 0;
+ uint32_t column_family = 0; // default
+ bool last_was_try_again = false;
+ bool handler_continue = true;
+ while (((s.ok() && !input.empty()) || UNLIKELY(s.IsTryAgain()))) {
+ handler_continue = handler->Continue();
+ if (!handler_continue) {
+ break;
+ }
+
+ if (LIKELY(!s.IsTryAgain())) {
+ last_was_try_again = false;
+ tag = 0;
+ column_family = 0; // default
+
+ s = ReadRecordFromWriteBatch(&input, &tag, &column_family, &key, &value,
+ &blob, &xid);
+ if (!s.ok()) {
+ return s;
+ }
+ } else {
+ assert(s.IsTryAgain());
+ assert(!last_was_try_again); // to detect infinite loop bugs
+ if (UNLIKELY(last_was_try_again)) {
+ return Status::Corruption(
+ "two consecutive TryAgain in WriteBatch handler; this is either a "
+ "software bug or data corruption.");
+ }
+ last_was_try_again = true;
+ s = Status::OK();
+ }
+
+ switch (tag) {
+ case kTypeColumnFamilyValue:
+ case kTypeValue:
+ assert(wb->content_flags_.load(std::memory_order_relaxed) &
+ (ContentFlags::DEFERRED | ContentFlags::HAS_PUT));
+ s = handler->PutCF(column_family, key, value);
+ if (LIKELY(s.ok())) {
+ empty_batch = false;
+ found++;
+ }
+ break;
+ case kTypeColumnFamilyDeletion:
+ case kTypeDeletion:
+ assert(wb->content_flags_.load(std::memory_order_relaxed) &
+ (ContentFlags::DEFERRED | ContentFlags::HAS_DELETE));
+ s = handler->DeleteCF(column_family, key);
+ if (LIKELY(s.ok())) {
+ empty_batch = false;
+ found++;
+ }
+ break;
+ case kTypeColumnFamilySingleDeletion:
+ case kTypeSingleDeletion:
+ assert(wb->content_flags_.load(std::memory_order_relaxed) &
+ (ContentFlags::DEFERRED | ContentFlags::HAS_SINGLE_DELETE));
+ s = handler->SingleDeleteCF(column_family, key);
+ if (LIKELY(s.ok())) {
+ empty_batch = false;
+ found++;
+ }
+ break;
+ case kTypeColumnFamilyRangeDeletion:
+ case kTypeRangeDeletion:
+ assert(wb->content_flags_.load(std::memory_order_relaxed) &
+ (ContentFlags::DEFERRED | ContentFlags::HAS_DELETE_RANGE));
+ s = handler->DeleteRangeCF(column_family, key, value);
+ if (LIKELY(s.ok())) {
+ empty_batch = false;
+ found++;
+ }
+ break;
+ case kTypeColumnFamilyMerge:
+ case kTypeMerge:
+ assert(wb->content_flags_.load(std::memory_order_relaxed) &
+ (ContentFlags::DEFERRED | ContentFlags::HAS_MERGE));
+ s = handler->MergeCF(column_family, key, value);
+ if (LIKELY(s.ok())) {
+ empty_batch = false;
+ found++;
+ }
+ break;
+ case kTypeColumnFamilyBlobIndex:
+ case kTypeBlobIndex:
+ assert(wb->content_flags_.load(std::memory_order_relaxed) &
+ (ContentFlags::DEFERRED | ContentFlags::HAS_BLOB_INDEX));
+ s = handler->PutBlobIndexCF(column_family, key, value);
+ if (LIKELY(s.ok())) {
+ found++;
+ }
+ break;
+ case kTypeLogData:
+ handler->LogData(blob);
+ // A batch might have nothing but LogData. It is still a batch.
+ empty_batch = false;
+ break;
+ case kTypeBeginPrepareXID:
+ assert(wb->content_flags_.load(std::memory_order_relaxed) &
+ (ContentFlags::DEFERRED | ContentFlags::HAS_BEGIN_PREPARE));
+ s = handler->MarkBeginPrepare();
+ assert(s.ok());
+ empty_batch = false;
+ if (handler->WriteAfterCommit() ==
+ WriteBatch::Handler::OptionState::kDisabled) {
+ s = Status::NotSupported(
+ "WriteCommitted txn tag when write_after_commit_ is disabled (in "
+ "WritePrepared/WriteUnprepared mode). If it is not due to "
+ "corruption, the WAL must be emptied before changing the "
+ "WritePolicy.");
+ }
+ if (handler->WriteBeforePrepare() ==
+ WriteBatch::Handler::OptionState::kEnabled) {
+ s = Status::NotSupported(
+ "WriteCommitted txn tag when write_before_prepare_ is enabled "
+ "(in WriteUnprepared mode). If it is not due to corruption, the "
+ "WAL must be emptied before changing the WritePolicy.");
+ }
+ break;
+ case kTypeBeginPersistedPrepareXID:
+ assert(wb->content_flags_.load(std::memory_order_relaxed) &
+ (ContentFlags::DEFERRED | ContentFlags::HAS_BEGIN_PREPARE));
+ s = handler->MarkBeginPrepare();
+ assert(s.ok());
+ empty_batch = false;
+ if (handler->WriteAfterCommit() ==
+ WriteBatch::Handler::OptionState::kEnabled) {
+ s = Status::NotSupported(
+ "WritePrepared/WriteUnprepared txn tag when write_after_commit_ "
+ "is enabled (in default WriteCommitted mode). If it is not due "
+ "to corruption, the WAL must be emptied before changing the "
+ "WritePolicy.");
+ }
+ break;
+ case kTypeBeginUnprepareXID:
+ assert(wb->content_flags_.load(std::memory_order_relaxed) &
+ (ContentFlags::DEFERRED | ContentFlags::HAS_BEGIN_UNPREPARE));
+ s = handler->MarkBeginPrepare(true /* unprepared */);
+ assert(s.ok());
+ empty_batch = false;
+ if (handler->WriteAfterCommit() ==
+ WriteBatch::Handler::OptionState::kEnabled) {
+ s = Status::NotSupported(
+ "WriteUnprepared txn tag when write_after_commit_ is enabled (in "
+ "default WriteCommitted mode). If it is not due to corruption, "
+ "the WAL must be emptied before changing the WritePolicy.");
+ }
+ if (handler->WriteBeforePrepare() ==
+ WriteBatch::Handler::OptionState::kDisabled) {
+ s = Status::NotSupported(
+ "WriteUnprepared txn tag when write_before_prepare_ is disabled "
+ "(in WriteCommitted/WritePrepared mode). If it is not due to "
+ "corruption, the WAL must be emptied before changing the "
+ "WritePolicy.");
+ }
+ break;
+ case kTypeEndPrepareXID:
+ assert(wb->content_flags_.load(std::memory_order_relaxed) &
+ (ContentFlags::DEFERRED | ContentFlags::HAS_END_PREPARE));
+ s = handler->MarkEndPrepare(xid);
+ assert(s.ok());
+ empty_batch = true;
+ break;
+ case kTypeCommitXID:
+ assert(wb->content_flags_.load(std::memory_order_relaxed) &
+ (ContentFlags::DEFERRED | ContentFlags::HAS_COMMIT));
+ s = handler->MarkCommit(xid);
+ assert(s.ok());
+ empty_batch = true;
+ break;
+ case kTypeCommitXIDAndTimestamp:
+ assert(wb->content_flags_.load(std::memory_order_relaxed) &
+ (ContentFlags::DEFERRED | ContentFlags::HAS_COMMIT));
+ // key stores the commit timestamp.
+ assert(!key.empty());
+ s = handler->MarkCommitWithTimestamp(xid, key);
+ if (LIKELY(s.ok())) {
+ empty_batch = true;
+ }
+ break;
+ case kTypeRollbackXID:
+ assert(wb->content_flags_.load(std::memory_order_relaxed) &
+ (ContentFlags::DEFERRED | ContentFlags::HAS_ROLLBACK));
+ s = handler->MarkRollback(xid);
+ assert(s.ok());
+ empty_batch = true;
+ break;
+ case kTypeNoop:
+ s = handler->MarkNoop(empty_batch);
+ assert(s.ok());
+ empty_batch = true;
+ break;
+ case kTypeWideColumnEntity:
+ case kTypeColumnFamilyWideColumnEntity:
+ assert(wb->content_flags_.load(std::memory_order_relaxed) &
+ (ContentFlags::DEFERRED | ContentFlags::HAS_PUT_ENTITY));
+ s = handler->PutEntityCF(column_family, key, value);
+ if (LIKELY(s.ok())) {
+ empty_batch = false;
+ ++found;
+ }
+ break;
+ default:
+ return Status::Corruption("unknown WriteBatch tag");
+ }
+ }
+ if (!s.ok()) {
+ return s;
+ }
+ if (handler_continue && whole_batch &&
+ found != WriteBatchInternal::Count(wb)) {
+ return Status::Corruption("WriteBatch has wrong count");
+ } else {
+ return Status::OK();
+ }
+}
+
+bool WriteBatchInternal::IsLatestPersistentState(const WriteBatch* b) {
+ return b->is_latest_persistent_state_;
+}
+
+void WriteBatchInternal::SetAsLatestPersistentState(WriteBatch* b) {
+ b->is_latest_persistent_state_ = true;
+}
+
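+// The fixed-size header of rep_ (WriteBatchInternal::kHeader, 12 bytes) holds
+// an 8-byte fixed-width sequence number at offset 0 followed by a 4-byte
+// fixed-width record count at offset 8; the accessors below read and write
+// those two fields in place.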
+uint32_t WriteBatchInternal::Count(const WriteBatch* b) {
+ return DecodeFixed32(b->rep_.data() + 8);
+}
+
+void WriteBatchInternal::SetCount(WriteBatch* b, uint32_t n) {
+ EncodeFixed32(&b->rep_[8], n);
+}
+
+SequenceNumber WriteBatchInternal::Sequence(const WriteBatch* b) {
+ return SequenceNumber(DecodeFixed64(b->rep_.data()));
+}
+
+void WriteBatchInternal::SetSequence(WriteBatch* b, SequenceNumber seq) {
+ EncodeFixed64(&b->rep_[0], seq);
+}
+
+size_t WriteBatchInternal::GetFirstOffset(WriteBatch* /*b*/) {
+ return WriteBatchInternal::kHeader;
+}
+
+std::tuple<Status, uint32_t, size_t>
+WriteBatchInternal::GetColumnFamilyIdAndTimestampSize(
+ WriteBatch* b, ColumnFamilyHandle* column_family) {
+ uint32_t cf_id = GetColumnFamilyID(column_family);
+ size_t ts_sz = 0;
+ Status s;
+ if (column_family) {
+ const Comparator* const ucmp = column_family->GetComparator();
+ if (ucmp) {
+ ts_sz = ucmp->timestamp_size();
+ if (0 == cf_id && b->default_cf_ts_sz_ != ts_sz) {
+ s = Status::InvalidArgument("Default cf timestamp size mismatch");
+ }
+ }
+ } else if (b->default_cf_ts_sz_ > 0) {
+ ts_sz = b->default_cf_ts_sz_;
+ }
+ return std::make_tuple(s, cf_id, ts_sz);
+}
+
+namespace {
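+// Checks that the column family handle is non-null, that its comparator has
+// user-defined timestamps enabled, and that the supplied timestamp matches
+// the configured timestamp size.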
+Status CheckColumnFamilyTimestampSize(ColumnFamilyHandle* column_family,
+ const Slice& ts) {
+ if (!column_family) {
+ return Status::InvalidArgument("column family handle cannot be null");
+ }
+ const Comparator* const ucmp = column_family->GetComparator();
+ assert(ucmp);
+ size_t cf_ts_sz = ucmp->timestamp_size();
+ if (0 == cf_ts_sz) {
+ return Status::InvalidArgument("timestamp disabled");
+ }
+ if (cf_ts_sz != ts.size()) {
+ return Status::InvalidArgument("timestamp size mismatch");
+ }
+ return Status::OK();
+}
+} // anonymous namespace
+
+Status WriteBatchInternal::Put(WriteBatch* b, uint32_t column_family_id,
+ const Slice& key, const Slice& value) {
+ if (key.size() > size_t{std::numeric_limits<uint32_t>::max()}) {
+ return Status::InvalidArgument("key is too large");
+ }
+ if (value.size() > size_t{std::numeric_limits<uint32_t>::max()}) {
+ return Status::InvalidArgument("value is too large");
+ }
+
+ LocalSavePoint save(b);
+ WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1);
+ if (column_family_id == 0) {
+ b->rep_.push_back(static_cast<char>(kTypeValue));
+ } else {
+ b->rep_.push_back(static_cast<char>(kTypeColumnFamilyValue));
+ PutVarint32(&b->rep_, column_family_id);
+ }
+ PutLengthPrefixedSlice(&b->rep_, key);
+ PutLengthPrefixedSlice(&b->rep_, value);
+ b->content_flags_.store(
+ b->content_flags_.load(std::memory_order_relaxed) | ContentFlags::HAS_PUT,
+ std::memory_order_relaxed);
+ if (b->prot_info_ != nullptr) {
+ // Technically the optype could've been `kTypeColumnFamilyValue` with the
+ // CF ID encoded in the `WriteBatch`. That distinction is unimportant
+ // however since we verify CF ID is correct, as well as all other fields
+ // (a missing/extra encoded CF ID would corrupt another field). It is
+ // convenient to consolidate on `kTypeValue` here as that is what will be
+ // inserted into memtable.
+ b->prot_info_->entries_.emplace_back(ProtectionInfo64()
+ .ProtectKVO(key, value, kTypeValue)
+ .ProtectC(column_family_id));
+ }
+ return save.commit();
+}
+
+Status WriteBatch::Put(ColumnFamilyHandle* column_family, const Slice& key,
+ const Slice& value) {
+ size_t ts_sz = 0;
+ uint32_t cf_id = 0;
+ Status s;
+
+ std::tie(s, cf_id, ts_sz) =
+ WriteBatchInternal::GetColumnFamilyIdAndTimestampSize(this,
+ column_family);
+
+ if (!s.ok()) {
+ return s;
+ }
+
+ if (0 == ts_sz) {
+ return WriteBatchInternal::Put(this, cf_id, key, value);
+ }
+
+ needs_in_place_update_ts_ = true;
+ has_key_with_ts_ = true;
+ std::string dummy_ts(ts_sz, '\0');
+ std::array<Slice, 2> key_with_ts{{key, dummy_ts}};
+ return WriteBatchInternal::Put(this, cf_id, SliceParts(key_with_ts.data(), 2),
+ SliceParts(&value, 1));
+}
+
+Status WriteBatch::Put(ColumnFamilyHandle* column_family, const Slice& key,
+ const Slice& ts, const Slice& value) {
+ const Status s = CheckColumnFamilyTimestampSize(column_family, ts);
+ if (!s.ok()) {
+ return s;
+ }
+ has_key_with_ts_ = true;
+ assert(column_family);
+ uint32_t cf_id = column_family->GetID();
+ std::array<Slice, 2> key_with_ts{{key, ts}};
+ return WriteBatchInternal::Put(this, cf_id, SliceParts(key_with_ts.data(), 2),
+ SliceParts(&value, 1));
+}
+
+Status WriteBatchInternal::CheckSlicePartsLength(const SliceParts& key,
+ const SliceParts& value) {
+ size_t total_key_bytes = 0;
+ for (int i = 0; i < key.num_parts; ++i) {
+ total_key_bytes += key.parts[i].size();
+ }
+ if (total_key_bytes >= size_t{std::numeric_limits<uint32_t>::max()}) {
+ return Status::InvalidArgument("key is too large");
+ }
+
+ size_t total_value_bytes = 0;
+ for (int i = 0; i < value.num_parts; ++i) {
+ total_value_bytes += value.parts[i].size();
+ }
+ if (total_value_bytes >= size_t{std::numeric_limits<uint32_t>::max()}) {
+ return Status::InvalidArgument("value is too large");
+ }
+ return Status::OK();
+}
+
+Status WriteBatchInternal::Put(WriteBatch* b, uint32_t column_family_id,
+ const SliceParts& key, const SliceParts& value) {
+ Status s = CheckSlicePartsLength(key, value);
+ if (!s.ok()) {
+ return s;
+ }
+
+ LocalSavePoint save(b);
+ WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1);
+ if (column_family_id == 0) {
+ b->rep_.push_back(static_cast<char>(kTypeValue));
+ } else {
+ b->rep_.push_back(static_cast<char>(kTypeColumnFamilyValue));
+ PutVarint32(&b->rep_, column_family_id);
+ }
+ PutLengthPrefixedSliceParts(&b->rep_, key);
+ PutLengthPrefixedSliceParts(&b->rep_, value);
+ b->content_flags_.store(
+ b->content_flags_.load(std::memory_order_relaxed) | ContentFlags::HAS_PUT,
+ std::memory_order_relaxed);
+ if (b->prot_info_ != nullptr) {
+ // See comment in first `WriteBatchInternal::Put()` overload concerning the
+ // `ValueType` argument passed to `ProtectKVO()`.
+ b->prot_info_->entries_.emplace_back(ProtectionInfo64()
+ .ProtectKVO(key, value, kTypeValue)
+ .ProtectC(column_family_id));
+ }
+ return save.commit();
+}
+
+Status WriteBatch::Put(ColumnFamilyHandle* column_family, const SliceParts& key,
+ const SliceParts& value) {
+ size_t ts_sz = 0;
+ uint32_t cf_id = 0;
+ Status s;
+
+ std::tie(s, cf_id, ts_sz) =
+ WriteBatchInternal::GetColumnFamilyIdAndTimestampSize(this,
+ column_family);
+
+ if (!s.ok()) {
+ return s;
+ }
+
+ if (ts_sz == 0) {
+ return WriteBatchInternal::Put(this, cf_id, key, value);
+ }
+
+ return Status::InvalidArgument(
+ "Cannot call this method on column family enabling timestamp");
+}
+
+Status WriteBatchInternal::PutEntity(WriteBatch* b, uint32_t column_family_id,
+ const Slice& key,
+ const WideColumns& columns) {
+ assert(b);
+
+ if (key.size() > size_t{std::numeric_limits<uint32_t>::max()}) {
+ return Status::InvalidArgument("key is too large");
+ }
+
+ WideColumns sorted_columns(columns);
+ std::sort(sorted_columns.begin(), sorted_columns.end(),
+ [](const WideColumn& lhs, const WideColumn& rhs) {
+ return lhs.name().compare(rhs.name()) < 0;
+ });
+
+ std::string entity;
+ const Status s = WideColumnSerialization::Serialize(sorted_columns, entity);
+ if (!s.ok()) {
+ return s;
+ }
+
+ if (entity.size() > size_t{std::numeric_limits<uint32_t>::max()}) {
+ return Status::InvalidArgument("wide column entity is too large");
+ }
+
+ LocalSavePoint save(b);
+
+ WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1);
+
+ if (column_family_id == 0) {
+ b->rep_.push_back(static_cast<char>(kTypeWideColumnEntity));
+ } else {
+ b->rep_.push_back(static_cast<char>(kTypeColumnFamilyWideColumnEntity));
+ PutVarint32(&b->rep_, column_family_id);
+ }
+
+ PutLengthPrefixedSlice(&b->rep_, key);
+ PutLengthPrefixedSlice(&b->rep_, entity);
+
+ b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) |
+ ContentFlags::HAS_PUT_ENTITY,
+ std::memory_order_relaxed);
+
+ if (b->prot_info_ != nullptr) {
+ b->prot_info_->entries_.emplace_back(
+ ProtectionInfo64()
+ .ProtectKVO(key, entity, kTypeWideColumnEntity)
+ .ProtectC(column_family_id));
+ }
+
+ return save.commit();
+}
+
+Status WriteBatch::PutEntity(ColumnFamilyHandle* column_family,
+ const Slice& key, const WideColumns& columns) {
+ if (!column_family) {
+ return Status::InvalidArgument(
+ "Cannot call this method without a column family handle");
+ }
+
+ Status s;
+ uint32_t cf_id = 0;
+ size_t ts_sz = 0;
+
+ std::tie(s, cf_id, ts_sz) =
+ WriteBatchInternal::GetColumnFamilyIdAndTimestampSize(this,
+ column_family);
+
+ if (!s.ok()) {
+ return s;
+ }
+
+ if (ts_sz) {
+ return Status::InvalidArgument(
+ "Cannot call this method on column family enabling timestamp");
+ }
+
+ return WriteBatchInternal::PutEntity(this, cf_id, key, columns);
+}
+
+Status WriteBatchInternal::InsertNoop(WriteBatch* b) {
+ b->rep_.push_back(static_cast<char>(kTypeNoop));
+ return Status::OK();
+}
+
+Status WriteBatchInternal::MarkEndPrepare(WriteBatch* b, const Slice& xid,
+ bool write_after_commit,
+ bool unprepared_batch) {
+ // a manually constructed batch can only contain one prepare section
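+ // (rep_[12] is the tag byte of the first record, immediately after the
+ // 12-byte sequence/count header.)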
+ assert(b->rep_[12] == static_cast<char>(kTypeNoop));
+
+ // all savepoints up to this point are cleared
+ if (b->save_points_ != nullptr) {
+ while (!b->save_points_->stack.empty()) {
+ b->save_points_->stack.pop();
+ }
+ }
+
+ // rewrite noop as begin marker
+ b->rep_[12] = static_cast<char>(
+ write_after_commit ? kTypeBeginPrepareXID
+ : (unprepared_batch ? kTypeBeginUnprepareXID
+ : kTypeBeginPersistedPrepareXID));
+ b->rep_.push_back(static_cast<char>(kTypeEndPrepareXID));
+ PutLengthPrefixedSlice(&b->rep_, xid);
+ b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) |
+ ContentFlags::HAS_END_PREPARE |
+ ContentFlags::HAS_BEGIN_PREPARE,
+ std::memory_order_relaxed);
+ if (unprepared_batch) {
+ b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) |
+ ContentFlags::HAS_BEGIN_UNPREPARE,
+ std::memory_order_relaxed);
+ }
+ return Status::OK();
+}
+
+Status WriteBatchInternal::MarkCommit(WriteBatch* b, const Slice& xid) {
+ b->rep_.push_back(static_cast<char>(kTypeCommitXID));
+ PutLengthPrefixedSlice(&b->rep_, xid);
+ b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) |
+ ContentFlags::HAS_COMMIT,
+ std::memory_order_relaxed);
+ return Status::OK();
+}
+
+Status WriteBatchInternal::MarkCommitWithTimestamp(WriteBatch* b,
+ const Slice& xid,
+ const Slice& commit_ts) {
+ assert(!commit_ts.empty());
+ b->rep_.push_back(static_cast<char>(kTypeCommitXIDAndTimestamp));
+ PutLengthPrefixedSlice(&b->rep_, commit_ts);
+ PutLengthPrefixedSlice(&b->rep_, xid);
+ b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) |
+ ContentFlags::HAS_COMMIT,
+ std::memory_order_relaxed);
+ return Status::OK();
+}
+
+Status WriteBatchInternal::MarkRollback(WriteBatch* b, const Slice& xid) {
+ b->rep_.push_back(static_cast<char>(kTypeRollbackXID));
+ PutLengthPrefixedSlice(&b->rep_, xid);
+ b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) |
+ ContentFlags::HAS_ROLLBACK,
+ std::memory_order_relaxed);
+ return Status::OK();
+}
+
+Status WriteBatchInternal::Delete(WriteBatch* b, uint32_t column_family_id,
+ const Slice& key) {
+ LocalSavePoint save(b);
+ WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1);
+ if (column_family_id == 0) {
+ b->rep_.push_back(static_cast<char>(kTypeDeletion));
+ } else {
+ b->rep_.push_back(static_cast<char>(kTypeColumnFamilyDeletion));
+ PutVarint32(&b->rep_, column_family_id);
+ }
+ PutLengthPrefixedSlice(&b->rep_, key);
+ b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) |
+ ContentFlags::HAS_DELETE,
+ std::memory_order_relaxed);
+ if (b->prot_info_ != nullptr) {
+ // See comment in first `WriteBatchInternal::Put()` overload concerning the
+ // `ValueType` argument passed to `ProtectKVO()`.
+ b->prot_info_->entries_.emplace_back(
+ ProtectionInfo64()
+ .ProtectKVO(key, "" /* value */, kTypeDeletion)
+ .ProtectC(column_family_id));
+ }
+ return save.commit();
+}
+
+Status WriteBatch::Delete(ColumnFamilyHandle* column_family, const Slice& key) {
+ size_t ts_sz = 0;
+ uint32_t cf_id = 0;
+ Status s;
+
+ std::tie(s, cf_id, ts_sz) =
+ WriteBatchInternal::GetColumnFamilyIdAndTimestampSize(this,
+ column_family);
+
+ if (!s.ok()) {
+ return s;
+ }
+
+ if (0 == ts_sz) {
+ return WriteBatchInternal::Delete(this, cf_id, key);
+ }
+
+ needs_in_place_update_ts_ = true;
+ has_key_with_ts_ = true;
+ std::string dummy_ts(ts_sz, '\0');
+ std::array<Slice, 2> key_with_ts{{key, dummy_ts}};
+ return WriteBatchInternal::Delete(this, cf_id,
+ SliceParts(key_with_ts.data(), 2));
+}
+
+Status WriteBatch::Delete(ColumnFamilyHandle* column_family, const Slice& key,
+ const Slice& ts) {
+ const Status s = CheckColumnFamilyTimestampSize(column_family, ts);
+ if (!s.ok()) {
+ return s;
+ }
+ assert(column_family);
+ has_key_with_ts_ = true;
+ uint32_t cf_id = column_family->GetID();
+ std::array<Slice, 2> key_with_ts{{key, ts}};
+ return WriteBatchInternal::Delete(this, cf_id,
+ SliceParts(key_with_ts.data(), 2));
+}
+
+Status WriteBatchInternal::Delete(WriteBatch* b, uint32_t column_family_id,
+ const SliceParts& key) {
+ LocalSavePoint save(b);
+ WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1);
+ if (column_family_id == 0) {
+ b->rep_.push_back(static_cast<char>(kTypeDeletion));
+ } else {
+ b->rep_.push_back(static_cast<char>(kTypeColumnFamilyDeletion));
+ PutVarint32(&b->rep_, column_family_id);
+ }
+ PutLengthPrefixedSliceParts(&b->rep_, key);
+ b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) |
+ ContentFlags::HAS_DELETE,
+ std::memory_order_relaxed);
+ if (b->prot_info_ != nullptr) {
+ // See comment in first `WriteBatchInternal::Put()` overload concerning the
+ // `ValueType` argument passed to `ProtectKVO()`.
+ b->prot_info_->entries_.emplace_back(
+ ProtectionInfo64()
+ .ProtectKVO(key,
+ SliceParts(nullptr /* _parts */, 0 /* _num_parts */),
+ kTypeDeletion)
+ .ProtectC(column_family_id));
+ }
+ return save.commit();
+}
+
+Status WriteBatch::Delete(ColumnFamilyHandle* column_family,
+ const SliceParts& key) {
+ size_t ts_sz = 0;
+ uint32_t cf_id = 0;
+ Status s;
+
+ std::tie(s, cf_id, ts_sz) =
+ WriteBatchInternal::GetColumnFamilyIdAndTimestampSize(this,
+ column_family);
+
+ if (!s.ok()) {
+ return s;
+ }
+
+ if (0 == ts_sz) {
+ return WriteBatchInternal::Delete(this, cf_id, key);
+ }
+
+ return Status::InvalidArgument(
+ "Cannot call this method on column family enabling timestamp");
+}
+
+Status WriteBatchInternal::SingleDelete(WriteBatch* b,
+ uint32_t column_family_id,
+ const Slice& key) {
+ LocalSavePoint save(b);
+ WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1);
+ if (column_family_id == 0) {
+ b->rep_.push_back(static_cast<char>(kTypeSingleDeletion));
+ } else {
+ b->rep_.push_back(static_cast<char>(kTypeColumnFamilySingleDeletion));
+ PutVarint32(&b->rep_, column_family_id);
+ }
+ PutLengthPrefixedSlice(&b->rep_, key);
+ b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) |
+ ContentFlags::HAS_SINGLE_DELETE,
+ std::memory_order_relaxed);
+ if (b->prot_info_ != nullptr) {
+ // See comment in first `WriteBatchInternal::Put()` overload concerning the
+ // `ValueType` argument passed to `ProtectKVO()`.
+ b->prot_info_->entries_.emplace_back(
+ ProtectionInfo64()
+ .ProtectKVO(key, "" /* value */, kTypeSingleDeletion)
+ .ProtectC(column_family_id));
+ }
+ return save.commit();
+}
+
+Status WriteBatch::SingleDelete(ColumnFamilyHandle* column_family,
+ const Slice& key) {
+ size_t ts_sz = 0;
+ uint32_t cf_id = 0;
+ Status s;
+
+ std::tie(s, cf_id, ts_sz) =
+ WriteBatchInternal::GetColumnFamilyIdAndTimestampSize(this,
+ column_family);
+
+ if (!s.ok()) {
+ return s;
+ }
+
+ if (0 == ts_sz) {
+ return WriteBatchInternal::SingleDelete(this, cf_id, key);
+ }
+
+ needs_in_place_update_ts_ = true;
+ has_key_with_ts_ = true;
+ std::string dummy_ts(ts_sz, '\0');
+ std::array<Slice, 2> key_with_ts{{key, dummy_ts}};
+ return WriteBatchInternal::SingleDelete(this, cf_id,
+ SliceParts(key_with_ts.data(), 2));
+}
+
+Status WriteBatch::SingleDelete(ColumnFamilyHandle* column_family,
+ const Slice& key, const Slice& ts) {
+ const Status s = CheckColumnFamilyTimestampSize(column_family, ts);
+ if (!s.ok()) {
+ return s;
+ }
+ has_key_with_ts_ = true;
+ assert(column_family);
+ uint32_t cf_id = column_family->GetID();
+ std::array<Slice, 2> key_with_ts{{key, ts}};
+ return WriteBatchInternal::SingleDelete(this, cf_id,
+ SliceParts(key_with_ts.data(), 2));
+}
+
+Status WriteBatchInternal::SingleDelete(WriteBatch* b,
+ uint32_t column_family_id,
+ const SliceParts& key) {
+ LocalSavePoint save(b);
+ WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1);
+ if (column_family_id == 0) {
+ b->rep_.push_back(static_cast<char>(kTypeSingleDeletion));
+ } else {
+ b->rep_.push_back(static_cast<char>(kTypeColumnFamilySingleDeletion));
+ PutVarint32(&b->rep_, column_family_id);
+ }
+ PutLengthPrefixedSliceParts(&b->rep_, key);
+ b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) |
+ ContentFlags::HAS_SINGLE_DELETE,
+ std::memory_order_relaxed);
+ if (b->prot_info_ != nullptr) {
+ // See comment in first `WriteBatchInternal::Put()` overload concerning the
+ // `ValueType` argument passed to `ProtectKVO()`.
+ b->prot_info_->entries_.emplace_back(
+ ProtectionInfo64()
+ .ProtectKVO(key,
+ SliceParts(nullptr /* _parts */,
+ 0 /* _num_parts */) /* value */,
+ kTypeSingleDeletion)
+ .ProtectC(column_family_id));
+ }
+ return save.commit();
+}
+
+Status WriteBatch::SingleDelete(ColumnFamilyHandle* column_family,
+ const SliceParts& key) {
+ size_t ts_sz = 0;
+ uint32_t cf_id = 0;
+ Status s;
+
+ std::tie(s, cf_id, ts_sz) =
+ WriteBatchInternal::GetColumnFamilyIdAndTimestampSize(this,
+ column_family);
+
+ if (!s.ok()) {
+ return s;
+ }
+
+ if (0 == ts_sz) {
+ return WriteBatchInternal::SingleDelete(this, cf_id, key);
+ }
+
+ return Status::InvalidArgument(
+ "Cannot call this method on column family enabling timestamp");
+}
+
+Status WriteBatchInternal::DeleteRange(WriteBatch* b, uint32_t column_family_id,
+ const Slice& begin_key,
+ const Slice& end_key) {
+ LocalSavePoint save(b);
+ WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1);
+ if (column_family_id == 0) {
+ b->rep_.push_back(static_cast<char>(kTypeRangeDeletion));
+ } else {
+ b->rep_.push_back(static_cast<char>(kTypeColumnFamilyRangeDeletion));
+ PutVarint32(&b->rep_, column_family_id);
+ }
+ PutLengthPrefixedSlice(&b->rep_, begin_key);
+ PutLengthPrefixedSlice(&b->rep_, end_key);
+ b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) |
+ ContentFlags::HAS_DELETE_RANGE,
+ std::memory_order_relaxed);
+ if (b->prot_info_ != nullptr) {
+ // See comment in first `WriteBatchInternal::Put()` overload concerning the
+ // `ValueType` argument passed to `ProtectKVO()`.
+ // In `DeleteRange()`, the end key is treated as the value.
+ b->prot_info_->entries_.emplace_back(
+ ProtectionInfo64()
+ .ProtectKVO(begin_key, end_key, kTypeRangeDeletion)
+ .ProtectC(column_family_id));
+ }
+ return save.commit();
+}
+
+Status WriteBatch::DeleteRange(ColumnFamilyHandle* column_family,
+ const Slice& begin_key, const Slice& end_key) {
+ size_t ts_sz = 0;
+ uint32_t cf_id = 0;
+ Status s;
+
+ std::tie(s, cf_id, ts_sz) =
+ WriteBatchInternal::GetColumnFamilyIdAndTimestampSize(this,
+ column_family);
+
+ if (!s.ok()) {
+ return s;
+ }
+
+ if (0 == ts_sz) {
+ return WriteBatchInternal::DeleteRange(this, cf_id, begin_key, end_key);
+ }
+
+ needs_in_place_update_ts_ = true;
+ has_key_with_ts_ = true;
+ std::string dummy_ts(ts_sz, '\0');
+ std::array<Slice, 2> begin_key_with_ts{{begin_key, dummy_ts}};
+ std::array<Slice, 2> end_key_with_ts{{end_key, dummy_ts}};
+ return WriteBatchInternal::DeleteRange(
+ this, cf_id, SliceParts(begin_key_with_ts.data(), 2),
+ SliceParts(end_key_with_ts.data(), 2));
+}
+
+Status WriteBatch::DeleteRange(ColumnFamilyHandle* column_family,
+ const Slice& begin_key, const Slice& end_key,
+ const Slice& ts) {
+ const Status s = CheckColumnFamilyTimestampSize(column_family, ts);
+ if (!s.ok()) {
+ return s;
+ }
+ assert(column_family);
+ has_key_with_ts_ = true;
+ uint32_t cf_id = column_family->GetID();
+ std::array<Slice, 2> key_with_ts{{begin_key, ts}};
+ std::array<Slice, 2> end_key_with_ts{{end_key, ts}};
+ return WriteBatchInternal::DeleteRange(this, cf_id,
+ SliceParts(key_with_ts.data(), 2),
+ SliceParts(end_key_with_ts.data(), 2));
+}
+
+Status WriteBatchInternal::DeleteRange(WriteBatch* b, uint32_t column_family_id,
+ const SliceParts& begin_key,
+ const SliceParts& end_key) {
+ LocalSavePoint save(b);
+ WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1);
+ if (column_family_id == 0) {
+ b->rep_.push_back(static_cast<char>(kTypeRangeDeletion));
+ } else {
+ b->rep_.push_back(static_cast<char>(kTypeColumnFamilyRangeDeletion));
+ PutVarint32(&b->rep_, column_family_id);
+ }
+ PutLengthPrefixedSliceParts(&b->rep_, begin_key);
+ PutLengthPrefixedSliceParts(&b->rep_, end_key);
+ b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) |
+ ContentFlags::HAS_DELETE_RANGE,
+ std::memory_order_relaxed);
+ if (b->prot_info_ != nullptr) {
+ // See comment in first `WriteBatchInternal::Put()` overload concerning the
+ // `ValueType` argument passed to `ProtectKVO()`.
+ // In `DeleteRange()`, the end key is treated as the value.
+ b->prot_info_->entries_.emplace_back(
+ ProtectionInfo64()
+ .ProtectKVO(begin_key, end_key, kTypeRangeDeletion)
+ .ProtectC(column_family_id));
+ }
+ return save.commit();
+}
+
+Status WriteBatch::DeleteRange(ColumnFamilyHandle* column_family,
+ const SliceParts& begin_key,
+ const SliceParts& end_key) {
+ size_t ts_sz = 0;
+ uint32_t cf_id = 0;
+ Status s;
+
+ std::tie(s, cf_id, ts_sz) =
+ WriteBatchInternal::GetColumnFamilyIdAndTimestampSize(this,
+ column_family);
+
+ if (!s.ok()) {
+ return s;
+ }
+
+ if (0 == ts_sz) {
+ return WriteBatchInternal::DeleteRange(this, cf_id, begin_key, end_key);
+ }
+
+ return Status::InvalidArgument(
+ "Cannot call this method on column family enabling timestamp");
+}
+
+Status WriteBatchInternal::Merge(WriteBatch* b, uint32_t column_family_id,
+ const Slice& key, const Slice& value) {
+ if (key.size() > size_t{std::numeric_limits<uint32_t>::max()}) {
+ return Status::InvalidArgument("key is too large");
+ }
+ if (value.size() > size_t{std::numeric_limits<uint32_t>::max()}) {
+ return Status::InvalidArgument("value is too large");
+ }
+
+ LocalSavePoint save(b);
+ WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1);
+ if (column_family_id == 0) {
+ b->rep_.push_back(static_cast<char>(kTypeMerge));
+ } else {
+ b->rep_.push_back(static_cast<char>(kTypeColumnFamilyMerge));
+ PutVarint32(&b->rep_, column_family_id);
+ }
+ PutLengthPrefixedSlice(&b->rep_, key);
+ PutLengthPrefixedSlice(&b->rep_, value);
+ b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) |
+ ContentFlags::HAS_MERGE,
+ std::memory_order_relaxed);
+ if (b->prot_info_ != nullptr) {
+ // See comment in first `WriteBatchInternal::Put()` overload concerning the
+ // `ValueType` argument passed to `ProtectKVO()`.
+ b->prot_info_->entries_.emplace_back(ProtectionInfo64()
+ .ProtectKVO(key, value, kTypeMerge)
+ .ProtectC(column_family_id));
+ }
+ return save.commit();
+}
+
+Status WriteBatch::Merge(ColumnFamilyHandle* column_family, const Slice& key,
+ const Slice& value) {
+ size_t ts_sz = 0;
+ uint32_t cf_id = 0;
+ Status s;
+
+ std::tie(s, cf_id, ts_sz) =
+ WriteBatchInternal::GetColumnFamilyIdAndTimestampSize(this,
+ column_family);
+
+ if (!s.ok()) {
+ return s;
+ }
+
+ if (0 == ts_sz) {
+ return WriteBatchInternal::Merge(this, cf_id, key, value);
+ }
+
+ needs_in_place_update_ts_ = true;
+ has_key_with_ts_ = true;
+ std::string dummy_ts(ts_sz, '\0');
+ std::array<Slice, 2> key_with_ts{{key, dummy_ts}};
+
+ return WriteBatchInternal::Merge(
+ this, cf_id, SliceParts(key_with_ts.data(), 2), SliceParts(&value, 1));
+}
+
+Status WriteBatch::Merge(ColumnFamilyHandle* column_family, const Slice& key,
+ const Slice& ts, const Slice& value) {
+ const Status s = CheckColumnFamilyTimestampSize(column_family, ts);
+ if (!s.ok()) {
+ return s;
+ }
+ has_key_with_ts_ = true;
+ assert(column_family);
+ uint32_t cf_id = column_family->GetID();
+ std::array<Slice, 2> key_with_ts{{key, ts}};
+ return WriteBatchInternal::Merge(
+ this, cf_id, SliceParts(key_with_ts.data(), 2), SliceParts(&value, 1));
+}
+
+Status WriteBatchInternal::Merge(WriteBatch* b, uint32_t column_family_id,
+ const SliceParts& key,
+ const SliceParts& value) {
+ Status s = CheckSlicePartsLength(key, value);
+ if (!s.ok()) {
+ return s;
+ }
+
+ LocalSavePoint save(b);
+ WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1);
+ if (column_family_id == 0) {
+ b->rep_.push_back(static_cast<char>(kTypeMerge));
+ } else {
+ b->rep_.push_back(static_cast<char>(kTypeColumnFamilyMerge));
+ PutVarint32(&b->rep_, column_family_id);
+ }
+ PutLengthPrefixedSliceParts(&b->rep_, key);
+ PutLengthPrefixedSliceParts(&b->rep_, value);
+ b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) |
+ ContentFlags::HAS_MERGE,
+ std::memory_order_relaxed);
+ if (b->prot_info_ != nullptr) {
+ // See comment in first `WriteBatchInternal::Put()` overload concerning the
+ // `ValueType` argument passed to `ProtectKVO()`.
+ b->prot_info_->entries_.emplace_back(ProtectionInfo64()
+ .ProtectKVO(key, value, kTypeMerge)
+ .ProtectC(column_family_id));
+ }
+ return save.commit();
+}
+
+Status WriteBatch::Merge(ColumnFamilyHandle* column_family,
+ const SliceParts& key, const SliceParts& value) {
+ size_t ts_sz = 0;
+ uint32_t cf_id = 0;
+ Status s;
+
+ std::tie(s, cf_id, ts_sz) =
+ WriteBatchInternal::GetColumnFamilyIdAndTimestampSize(this,
+ column_family);
+
+ if (!s.ok()) {
+ return s;
+ }
+
+ if (0 == ts_sz) {
+ return WriteBatchInternal::Merge(this, cf_id, key, value);
+ }
+
+ return Status::InvalidArgument(
+ "Cannot call this method on column family enabling timestamp");
+}
+
+Status WriteBatchInternal::PutBlobIndex(WriteBatch* b,
+ uint32_t column_family_id,
+ const Slice& key, const Slice& value) {
+ LocalSavePoint save(b);
+ WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1);
+ if (column_family_id == 0) {
+ b->rep_.push_back(static_cast<char>(kTypeBlobIndex));
+ } else {
+ b->rep_.push_back(static_cast<char>(kTypeColumnFamilyBlobIndex));
+ PutVarint32(&b->rep_, column_family_id);
+ }
+ PutLengthPrefixedSlice(&b->rep_, key);
+ PutLengthPrefixedSlice(&b->rep_, value);
+ b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) |
+ ContentFlags::HAS_BLOB_INDEX,
+ std::memory_order_relaxed);
+ if (b->prot_info_ != nullptr) {
+ // See comment in first `WriteBatchInternal::Put()` overload concerning the
+ // `ValueType` argument passed to `ProtectKVO()`.
+ b->prot_info_->entries_.emplace_back(
+ ProtectionInfo64()
+ .ProtectKVO(key, value, kTypeBlobIndex)
+ .ProtectC(column_family_id));
+ }
+ return save.commit();
+}
+
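+// LogData records carry opaque user-supplied blobs: they are replayed to
+// Handler::LogData() during iteration but are never applied to memtables and
+// do not increment the record count in the header.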
+Status WriteBatch::PutLogData(const Slice& blob) {
+ LocalSavePoint save(this);
+ rep_.push_back(static_cast<char>(kTypeLogData));
+ PutLengthPrefixedSlice(&rep_, blob);
+ return save.commit();
+}
+
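+// Savepoints nest as a stack. Illustrative usage (a sketch, not from this
+// file; `cf` is a hypothetical column family handle):
+//
+//   WriteBatch wb;
+//   Status s = wb.Put(cf, "k", "v");
+//   wb.SetSavePoint();
+//   s = wb.Delete(cf, "k");
+//   s = wb.RollbackToSavePoint();  // undoes the Delete, keeps the Put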
+void WriteBatch::SetSavePoint() {
+ if (save_points_ == nullptr) {
+ save_points_.reset(new SavePoints());
+ }
+ // Record length and count of current batch of writes.
+ save_points_->stack.push(SavePoint(
+ GetDataSize(), Count(), content_flags_.load(std::memory_order_relaxed)));
+}
+
+Status WriteBatch::RollbackToSavePoint() {
+ if (save_points_ == nullptr || save_points_->stack.size() == 0) {
+ return Status::NotFound();
+ }
+
+ // Pop the most recent savepoint off the stack
+ SavePoint savepoint = save_points_->stack.top();
+ save_points_->stack.pop();
+
+ assert(savepoint.size <= rep_.size());
+ assert(static_cast<uint32_t>(savepoint.count) <= Count());
+
+ if (savepoint.size == rep_.size()) {
+ // No changes to rollback
+ } else if (savepoint.size == 0) {
+ // Rollback everything
+ Clear();
+ } else {
+ rep_.resize(savepoint.size);
+ if (prot_info_ != nullptr) {
+ prot_info_->entries_.resize(savepoint.count);
+ }
+ WriteBatchInternal::SetCount(this, savepoint.count);
+ content_flags_.store(savepoint.content_flags, std::memory_order_relaxed);
+ }
+
+ return Status::OK();
+}
+
+Status WriteBatch::PopSavePoint() {
+ if (save_points_ == nullptr || save_points_->stack.size() == 0) {
+ return Status::NotFound();
+ }
+
+ // Pop the most recent savepoint off the stack
+ save_points_->stack.pop();
+
+ return Status::OK();
+}
+
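+// Rewrites the timestamp portion of every key in the batch in place with
+// `ts` (per-column-family timestamp sizes resolved via ts_sz_func), and
+// updates the per-entry protection info when it is enabled.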
+Status WriteBatch::UpdateTimestamps(
+ const Slice& ts, std::function<size_t(uint32_t)> ts_sz_func) {
+ TimestampUpdater<decltype(ts_sz_func)> ts_updater(prot_info_.get(),
+ std::move(ts_sz_func), ts);
+ const Status s = Iterate(&ts_updater);
+ if (s.ok()) {
+ needs_in_place_update_ts_ = false;
+ }
+ return s;
+}
+
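+// Re-reads every record in the batch and verifies it against the per-key
+// protection info collected at insertion time; record types that carry no
+// protection info (XID markers, Noop, LogData, timestamped deletions and
+// commits) are skipped. A no-op when protection info is disabled.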
+Status WriteBatch::VerifyChecksum() const {
+ if (prot_info_ == nullptr) {
+ return Status::OK();
+ }
+ Slice input(rep_.data() + WriteBatchInternal::kHeader,
+ rep_.size() - WriteBatchInternal::kHeader);
+ Slice key, value, blob, xid;
+ char tag = 0;
+ uint32_t column_family = 0; // default
+ Status s;
+ size_t prot_info_idx = 0;
+ bool checksum_protected = true;
+ while (!input.empty() && prot_info_idx < prot_info_->entries_.size()) {
+ // In case key/value/column_family are not updated by
+ // ReadRecordFromWriteBatch
+ key.clear();
+ value.clear();
+ column_family = 0;
+ s = ReadRecordFromWriteBatch(&input, &tag, &column_family, &key, &value,
+ &blob, &xid);
+ if (!s.ok()) {
+ return s;
+ }
+ checksum_protected = true;
+ // Write batch checksum uses op_type without ColumnFamily (e.g., if op_type
+ // in the write batch is kTypeColumnFamilyValue, kTypeValue is used to
+ // compute the checksum), and encodes column family id separately. See
+ // comment in first `WriteBatchInternal::Put()` for more detail.
+ switch (tag) {
+ case kTypeColumnFamilyValue:
+ case kTypeValue:
+ tag = kTypeValue;
+ break;
+ case kTypeColumnFamilyDeletion:
+ case kTypeDeletion:
+ tag = kTypeDeletion;
+ break;
+ case kTypeColumnFamilySingleDeletion:
+ case kTypeSingleDeletion:
+ tag = kTypeSingleDeletion;
+ break;
+ case kTypeColumnFamilyRangeDeletion:
+ case kTypeRangeDeletion:
+ tag = kTypeRangeDeletion;
+ break;
+ case kTypeColumnFamilyMerge:
+ case kTypeMerge:
+ tag = kTypeMerge;
+ break;
+ case kTypeColumnFamilyBlobIndex:
+ case kTypeBlobIndex:
+ tag = kTypeBlobIndex;
+ break;
+ case kTypeLogData:
+ case kTypeBeginPrepareXID:
+ case kTypeEndPrepareXID:
+ case kTypeCommitXID:
+ case kTypeRollbackXID:
+ case kTypeNoop:
+ case kTypeBeginPersistedPrepareXID:
+ case kTypeBeginUnprepareXID:
+ case kTypeDeletionWithTimestamp:
+ case kTypeCommitXIDAndTimestamp:
+ checksum_protected = false;
+ break;
+ case kTypeColumnFamilyWideColumnEntity:
+ case kTypeWideColumnEntity:
+ tag = kTypeWideColumnEntity;
+ break;
+ default:
+ return Status::Corruption(
+ "unknown WriteBatch tag",
+ std::to_string(static_cast<unsigned int>(tag)));
+ }
+ if (checksum_protected) {
+ s = prot_info_->entries_[prot_info_idx++]
+ .StripC(column_family)
+ .StripKVO(key, value, static_cast<ValueType>(tag))
+ .GetStatus();
+ if (!s.ok()) {
+ return s;
+ }
+ }
+ }
+
+ if (prot_info_idx != WriteBatchInternal::Count(this)) {
+ return Status::Corruption("WriteBatch has wrong count");
+ }
+ assert(WriteBatchInternal::Count(this) == prot_info_->entries_.size());
+ return Status::OK();
+}
+
+namespace {
+
+class MemTableInserter : public WriteBatch::Handler {
+ SequenceNumber sequence_;
+ ColumnFamilyMemTables* const cf_mems_;
+ FlushScheduler* const flush_scheduler_;
+ TrimHistoryScheduler* const trim_history_scheduler_;
+ const bool ignore_missing_column_families_;
+ const uint64_t recovering_log_number_;
+ // Log number that all memtables this batch is inserted into should reference
+ uint64_t log_number_ref_;
+ DBImpl* db_;
+ const bool concurrent_memtable_writes_;
+ bool post_info_created_;
+ const WriteBatch::ProtectionInfo* prot_info_;
+ size_t prot_info_idx_;
+
+ bool* has_valid_writes_;
+ // On some (!) platforms, default-constructing a map is too expensive in the
+ // Write() path because it causes memory allocations even when the map goes
+ // unused. Make its creation optional, but avoid the extra allocation a
+ // std::unique_ptr would incur.
+ using MemPostInfoMap = std::map<MemTable*, MemTablePostProcessInfo>;
+ using PostMapType = std::aligned_storage<sizeof(MemPostInfoMap)>::type;
+ PostMapType mem_post_info_map_;
+ // current recovered transaction we are rebuilding (recovery)
+ WriteBatch* rebuilding_trx_;
+ SequenceNumber rebuilding_trx_seq_;
+ // Increase the seq number once per write batch; otherwise increase it once
+ // per key.
+ bool seq_per_batch_;
+ // Whether the memtable write will be done only after the commit
+ bool write_after_commit_;
+ // Whether memtable write can be done before prepare
+ bool write_before_prepare_;
+ // Whether this batch was unprepared or not
+ bool unprepared_batch_;
+ using DupDetector = std::aligned_storage<sizeof(DuplicateDetector)>::type;
+ DupDetector duplicate_detector_;
+ bool dup_dectector_on_;
+
+ bool hint_per_batch_;
+ bool hint_created_;
+ // Hints for this batch
+ using HintMap = std::unordered_map<MemTable*, void*>;
+ using HintMapType = std::aligned_storage<sizeof(HintMap)>::type;
+ HintMapType hint_;
+
+ HintMap& GetHintMap() {
+ assert(hint_per_batch_);
+ if (!hint_created_) {
+ new (&hint_) HintMap();
+ hint_created_ = true;
+ }
+ return *reinterpret_cast<HintMap*>(&hint_);
+ }
+
+ MemPostInfoMap& GetPostMap() {
+ assert(concurrent_memtable_writes_);
+ if (!post_info_created_) {
+ new (&mem_post_info_map_) MemPostInfoMap();
+ post_info_created_ = true;
+ }
+ return *reinterpret_cast<MemPostInfoMap*>(&mem_post_info_map_);
+ }
+
+ bool IsDuplicateKeySeq(uint32_t column_family_id, const Slice& key) {
+ assert(!write_after_commit_);
+ assert(rebuilding_trx_ != nullptr);
+ if (!dup_dectector_on_) {
+ new (&duplicate_detector_) DuplicateDetector(db_);
+ dup_dectector_on_ = true;
+ }
+ return reinterpret_cast<DuplicateDetector*>(&duplicate_detector_)
+ ->IsDuplicateKeySeq(column_family_id, key, sequence_);
+ }
+
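+ // Returns the protection info entry for the next record and advances the
+ // cursor, or nullptr when per-key protection is disabled.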
+ const ProtectionInfoKVOC64* NextProtectionInfo() {
+ const ProtectionInfoKVOC64* res = nullptr;
+ if (prot_info_ != nullptr) {
+ assert(prot_info_idx_ < prot_info_->entries_.size());
+ res = &prot_info_->entries_[prot_info_idx_];
+ ++prot_info_idx_;
+ }
+ return res;
+ }
+
+ void DecrementProtectionInfoIdxForTryAgain() {
+ if (prot_info_ != nullptr) --prot_info_idx_;
+ }
+
+ void ResetProtectionInfo() {
+ prot_info_idx_ = 0;
+ prot_info_ = nullptr;
+ }
+
+ protected:
+ Handler::OptionState WriteBeforePrepare() const override {
+ return write_before_prepare_ ? Handler::OptionState::kEnabled
+ : Handler::OptionState::kDisabled;
+ }
+ Handler::OptionState WriteAfterCommit() const override {
+ return write_after_commit_ ? Handler::OptionState::kEnabled
+ : Handler::OptionState::kDisabled;
+ }
+
+ public:
+ // cf_mems should not be shared with concurrent inserters
+ MemTableInserter(SequenceNumber _sequence, ColumnFamilyMemTables* cf_mems,
+ FlushScheduler* flush_scheduler,
+ TrimHistoryScheduler* trim_history_scheduler,
+ bool ignore_missing_column_families,
+ uint64_t recovering_log_number, DB* db,
+ bool concurrent_memtable_writes,
+ const WriteBatch::ProtectionInfo* prot_info,
+ bool* has_valid_writes = nullptr, bool seq_per_batch = false,
+ bool batch_per_txn = true, bool hint_per_batch = false)
+ : sequence_(_sequence),
+ cf_mems_(cf_mems),
+ flush_scheduler_(flush_scheduler),
+ trim_history_scheduler_(trim_history_scheduler),
+ ignore_missing_column_families_(ignore_missing_column_families),
+ recovering_log_number_(recovering_log_number),
+ log_number_ref_(0),
+ db_(static_cast_with_check<DBImpl>(db)),
+ concurrent_memtable_writes_(concurrent_memtable_writes),
+ post_info_created_(false),
+ prot_info_(prot_info),
+ prot_info_idx_(0),
+ has_valid_writes_(has_valid_writes),
+ rebuilding_trx_(nullptr),
+ rebuilding_trx_seq_(0),
+ seq_per_batch_(seq_per_batch),
+ // Write after commit currently uses one seq per key (instead of per
+ // batch). So seq_per_batch being false indicates write_after_commit
+ // approach.
+ write_after_commit_(!seq_per_batch),
+ // WriteUnprepared can write WriteBatches per transaction, so
+ // batch_per_txn being false indicates write_before_prepare.
+ write_before_prepare_(!batch_per_txn),
+ unprepared_batch_(false),
+ duplicate_detector_(),
+ dup_dectector_on_(false),
+ hint_per_batch_(hint_per_batch),
+ hint_created_(false) {
+ assert(cf_mems_);
+ }
+
+ ~MemTableInserter() override {
+ if (dup_dectector_on_) {
+ reinterpret_cast<DuplicateDetector*>(&duplicate_detector_)
+ ->~DuplicateDetector();
+ }
+ if (post_info_created_) {
+ reinterpret_cast<MemPostInfoMap*>(&mem_post_info_map_)->~MemPostInfoMap();
+ }
+ if (hint_created_) {
+ for (auto iter : GetHintMap()) {
+ delete[] reinterpret_cast<char*>(iter.second);
+ }
+ reinterpret_cast<HintMap*>(&hint_)->~HintMap();
+ }
+ delete rebuilding_trx_;
+ }
+
+ MemTableInserter(const MemTableInserter&) = delete;
+ MemTableInserter& operator=(const MemTableInserter&) = delete;
+
+ // The batch seq is regularly restarted; in normal mode it is set when
+ // MemTableInserter is constructed in the write thread, and in recovery mode
+ // it is set when a batch, which is tagged with seq, is read from the WAL.
+ // Within a sequenced batch, which could be a merge of multiple batches, we
+ // have two policies to advance the seq: i) seq_per_key (default) and
+ // ii) seq_per_batch. To implement the latter we need to mark the boundary
+ // between the individual batches: 1) use the terminating markers
+ // (kTypeEndPrepareXID, kTypeCommitXID, kTypeRollbackXID) to indicate the
+ // boundary, and 2) terminate a batch with kTypeNoop in the absence of a
+ // natural boundary marker.
+ void MaybeAdvanceSeq(bool batch_boundry = false) {
+ if (batch_boundry == seq_per_batch_) {
+ sequence_++;
+ }
+ }
+
+ void set_log_number_ref(uint64_t log) { log_number_ref_ = log; }
+ void set_prot_info(const WriteBatch::ProtectionInfo* prot_info) {
+ prot_info_ = prot_info;
+ prot_info_idx_ = 0;
+ }
+
+ SequenceNumber sequence() const { return sequence_; }
+
+ void PostProcess() {
+ assert(concurrent_memtable_writes_);
+ // If post info was not created, there is nothing to process and no need to
+ // create it on demand.
+ if (post_info_created_) {
+ for (auto& pair : GetPostMap()) {
+ pair.first->BatchPostProcess(pair.second);
+ }
+ }
+ }
+
+ bool SeekToColumnFamily(uint32_t column_family_id, Status* s) {
+ // If we are in a concurrent mode, it is the caller's responsibility
+ // to clone the original ColumnFamilyMemTables so that each thread
+ // has its own instance. Otherwise, it must be guaranteed that there
+ // is no concurrent access
+ bool found = cf_mems_->Seek(column_family_id);
+ if (!found) {
+ if (ignore_missing_column_families_) {
+ *s = Status::OK();
+ } else {
+ *s = Status::InvalidArgument(
+ "Invalid column family specified in write batch");
+ }
+ return false;
+ }
+ if (recovering_log_number_ != 0 &&
+ recovering_log_number_ < cf_mems_->GetLogNumber()) {
+ // This is true only in the recovery environment (recovering_log_number_
+ // is always 0 in the non-recovery, regular write code-path).
+ // * If recovering_log_number_ < cf_mems_->GetLogNumber(), the column
+ // family already contains updates from this log. We can't apply the
+ // updates twice because of update-in-place or merge workloads -- ignore
+ // the update.
+ *s = Status::OK();
+ return false;
+ }
+
+ if (has_valid_writes_ != nullptr) {
+ *has_valid_writes_ = true;
+ }
+
+ if (log_number_ref_ > 0) {
+ cf_mems_->GetMemTable()->RefLogContainingPrepSection(log_number_ref_);
+ }
+
+ return true;
+ }
+
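+ // Shared insertion path for value-like records (Put, PutEntity, blob
+ // index). When rebuilding a recovered prepared transaction the record is
+ // appended to rebuilding_trx_ instead; otherwise it is added to (or, with
+ // inplace_update_support, updated in) the current memtable, honoring the
+ // seq_per_batch TryAgain protocol.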
+ Status PutCFImpl(uint32_t column_family_id, const Slice& key,
+ const Slice& value, ValueType value_type,
+ const ProtectionInfoKVOS64* kv_prot_info) {
+ // optimize for non-recovery mode
+ if (UNLIKELY(write_after_commit_ && rebuilding_trx_ != nullptr)) {
+ // TODO(ajkr): propagate `ProtectionInfoKVOS64`.
+ return WriteBatchInternal::Put(rebuilding_trx_, column_family_id, key,
+ value);
+ // else insert the values into the memtable right away
+ }
+
+ Status ret_status;
+ if (UNLIKELY(!SeekToColumnFamily(column_family_id, &ret_status))) {
+ if (ret_status.ok() && rebuilding_trx_ != nullptr) {
+ assert(!write_after_commit_);
+ // The CF is probably flushed and hence no need for insert but we still
+ // need to keep track of the keys for upcoming rollback/commit.
+ // TODO(ajkr): propagate `ProtectionInfoKVOS64`.
+ ret_status = WriteBatchInternal::Put(rebuilding_trx_, column_family_id,
+ key, value);
+ if (ret_status.ok()) {
+ MaybeAdvanceSeq(IsDuplicateKeySeq(column_family_id, key));
+ }
+ } else if (ret_status.ok()) {
+ MaybeAdvanceSeq(false /* batch_boundary */);
+ }
+ return ret_status;
+ }
+ assert(ret_status.ok());
+
+ MemTable* mem = cf_mems_->GetMemTable();
+ auto* moptions = mem->GetImmutableMemTableOptions();
+ // inplace_update_support is inconsistent with snapshots, and therefore with
+ // any kind of transactions including the ones that use seq_per_batch
+ assert(!seq_per_batch_ || !moptions->inplace_update_support);
+ if (!moptions->inplace_update_support) {
+ ret_status =
+ mem->Add(sequence_, value_type, key, value, kv_prot_info,
+ concurrent_memtable_writes_, get_post_process_info(mem),
+ hint_per_batch_ ? &GetHintMap()[mem] : nullptr);
+ } else if (moptions->inplace_callback == nullptr ||
+ value_type != kTypeValue) {
+ assert(!concurrent_memtable_writes_);
+ ret_status = mem->Update(sequence_, value_type, key, value, kv_prot_info);
+ } else {
+ assert(!concurrent_memtable_writes_);
+ assert(value_type == kTypeValue);
+ ret_status = mem->UpdateCallback(sequence_, key, value, kv_prot_info);
+ if (ret_status.IsNotFound()) {
+ // key not found in memtable. Do sst get, update, add
+ SnapshotImpl read_from_snapshot;
+ read_from_snapshot.number_ = sequence_;
+ ReadOptions ropts;
+ // It's going to be overwritten for sure, so there is no point caching the
+ // data block containing the old version
+ ropts.fill_cache = false;
+ ropts.snapshot = &read_from_snapshot;
+
+ std::string prev_value;
+ std::string merged_value;
+
+ auto cf_handle = cf_mems_->GetColumnFamilyHandle();
+ Status get_status = Status::NotSupported();
+ if (db_ != nullptr && recovering_log_number_ == 0) {
+ if (cf_handle == nullptr) {
+ cf_handle = db_->DefaultColumnFamily();
+ }
+ // TODO (yanqin): fix when user-defined timestamp is enabled.
+ get_status = db_->Get(ropts, cf_handle, key, &prev_value);
+ }
+ // Intentionally overwrites the `NotFound` in `ret_status`.
+ if (!get_status.ok() && !get_status.IsNotFound()) {
+ ret_status = get_status;
+ } else {
+ ret_status = Status::OK();
+ }
+ if (ret_status.ok()) {
+ UpdateStatus update_status;
+ char* prev_buffer = const_cast<char*>(prev_value.c_str());
+ uint32_t prev_size = static_cast<uint32_t>(prev_value.size());
+ if (get_status.ok()) {
+ update_status = moptions->inplace_callback(prev_buffer, &prev_size,
+ value, &merged_value);
+ } else {
+ update_status = moptions->inplace_callback(
+ nullptr /* existing_value */, nullptr /* existing_value_size */,
+ value, &merged_value);
+ }
+ if (update_status == UpdateStatus::UPDATED_INPLACE) {
+ assert(get_status.ok());
+ if (kv_prot_info != nullptr) {
+ ProtectionInfoKVOS64 updated_kv_prot_info(*kv_prot_info);
+ updated_kv_prot_info.UpdateV(value,
+ Slice(prev_buffer, prev_size));
+ // prev_value is updated in-place with final value.
+ ret_status = mem->Add(sequence_, value_type, key,
+ Slice(prev_buffer, prev_size),
+ &updated_kv_prot_info);
+ } else {
+ ret_status = mem->Add(sequence_, value_type, key,
+ Slice(prev_buffer, prev_size),
+ nullptr /* kv_prot_info */);
+ }
+ if (ret_status.ok()) {
+ RecordTick(moptions->statistics, NUMBER_KEYS_WRITTEN);
+ }
+ } else if (update_status == UpdateStatus::UPDATED) {
+ if (kv_prot_info != nullptr) {
+ ProtectionInfoKVOS64 updated_kv_prot_info(*kv_prot_info);
+ updated_kv_prot_info.UpdateV(value, merged_value);
+ // merged_value contains the final value.
+ ret_status = mem->Add(sequence_, value_type, key,
+ Slice(merged_value), &updated_kv_prot_info);
+ } else {
+ // merged_value contains the final value.
+ ret_status =
+ mem->Add(sequence_, value_type, key, Slice(merged_value),
+ nullptr /* kv_prot_info */);
+ }
+ if (ret_status.ok()) {
+ RecordTick(moptions->statistics, NUMBER_KEYS_WRITTEN);
+ }
+ }
+ }
+ }
+ }
+ if (UNLIKELY(ret_status.IsTryAgain())) {
+ assert(seq_per_batch_);
+ const bool kBatchBoundary = true;
+ MaybeAdvanceSeq(kBatchBoundary);
+ } else if (ret_status.ok()) {
+ MaybeAdvanceSeq();
+ CheckMemtableFull();
+ }
+ // optimize for non-recovery mode
+ // If `ret_status` is `TryAgain` then the next (successful) try will add
+ // the key to the rebuilding transaction object. If `ret_status` is
+ // another non-OK `Status`, then the `rebuilding_trx_` will be thrown
+ // away. So we only need to add to it when `ret_status.ok()`.
+ if (UNLIKELY(ret_status.ok() && rebuilding_trx_ != nullptr)) {
+ assert(!write_after_commit_);
+ // TODO(ajkr): propagate `ProtectionInfoKVOS64`.
+ ret_status = WriteBatchInternal::Put(rebuilding_trx_, column_family_id,
+ key, value);
+ }
+ return ret_status;
+ }
+
+ Status PutCF(uint32_t column_family_id, const Slice& key,
+ const Slice& value) override {
+ const auto* kv_prot_info = NextProtectionInfo();
+ Status ret_status;
+ if (kv_prot_info != nullptr) {
+ // Memtable needs seqno, doesn't need CF ID
+ auto mem_kv_prot_info =
+ kv_prot_info->StripC(column_family_id).ProtectS(sequence_);
+ ret_status = PutCFImpl(column_family_id, key, value, kTypeValue,
+ &mem_kv_prot_info);
+ } else {
+ ret_status = PutCFImpl(column_family_id, key, value, kTypeValue,
+ nullptr /* kv_prot_info */);
+ }
+ // TODO: this assumes that if TryAgain status is returned to the caller,
+ // the operation is actually tried again. The proper way to do this is to
+ // pass a `try_again` parameter to the operation itself and decrement
+ // prot_info_idx_ based on that
+ if (UNLIKELY(ret_status.IsTryAgain())) {
+ DecrementProtectionInfoIdxForTryAgain();
+ }
+ return ret_status;
+ }
+
+ Status PutEntityCF(uint32_t column_family_id, const Slice& key,
+ const Slice& value) override {
+ const auto* kv_prot_info = NextProtectionInfo();
+
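+    // Wide-column entities share the plain Put code path; only the value type
+    // written to the memtable differs.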
+ Status s;
+ if (kv_prot_info) {
+ // Memtable needs seqno, doesn't need CF ID
+ auto mem_kv_prot_info =
+ kv_prot_info->StripC(column_family_id).ProtectS(sequence_);
+ s = PutCFImpl(column_family_id, key, value, kTypeWideColumnEntity,
+ &mem_kv_prot_info);
+ } else {
+ s = PutCFImpl(column_family_id, key, value, kTypeWideColumnEntity,
+ /* kv_prot_info */ nullptr);
+ }
+
+ if (UNLIKELY(s.IsTryAgain())) {
+ DecrementProtectionInfoIdxForTryAgain();
+ }
+
+ return s;
+ }
+
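+  // Shared helper for point, single, and range deletions; the caller picks
+  // the entry type and, for range deletions, passes the end key as the value.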
+ Status DeleteImpl(uint32_t /*column_family_id*/, const Slice& key,
+ const Slice& value, ValueType delete_type,
+ const ProtectionInfoKVOS64* kv_prot_info) {
+ Status ret_status;
+ MemTable* mem = cf_mems_->GetMemTable();
+ ret_status =
+ mem->Add(sequence_, delete_type, key, value, kv_prot_info,
+ concurrent_memtable_writes_, get_post_process_info(mem),
+ hint_per_batch_ ? &GetHintMap()[mem] : nullptr);
+ if (UNLIKELY(ret_status.IsTryAgain())) {
+ assert(seq_per_batch_);
+ const bool kBatchBoundary = true;
+ MaybeAdvanceSeq(kBatchBoundary);
+ } else if (ret_status.ok()) {
+ MaybeAdvanceSeq();
+ CheckMemtableFull();
+ }
+ return ret_status;
+ }
+
+ Status DeleteCF(uint32_t column_family_id, const Slice& key) override {
+ const auto* kv_prot_info = NextProtectionInfo();
+ // optimize for non-recovery mode
+ if (UNLIKELY(write_after_commit_ && rebuilding_trx_ != nullptr)) {
+ // TODO(ajkr): propagate `ProtectionInfoKVOS64`.
+ return WriteBatchInternal::Delete(rebuilding_trx_, column_family_id, key);
+      // else insert the values into the memtable right away
+ }
+
+ Status ret_status;
+ if (UNLIKELY(!SeekToColumnFamily(column_family_id, &ret_status))) {
+ if (ret_status.ok() && rebuilding_trx_ != nullptr) {
+ assert(!write_after_commit_);
+        // The CF is probably flushed and hence there is no need to insert, but
+        // we still need to keep track of the keys for upcoming rollback/commit.
+ // TODO(ajkr): propagate `ProtectionInfoKVOS64`.
+ ret_status =
+ WriteBatchInternal::Delete(rebuilding_trx_, column_family_id, key);
+ if (ret_status.ok()) {
+ MaybeAdvanceSeq(IsDuplicateKeySeq(column_family_id, key));
+ }
+ } else if (ret_status.ok()) {
+ MaybeAdvanceSeq(false /* batch_boundary */);
+ }
+ if (UNLIKELY(ret_status.IsTryAgain())) {
+ DecrementProtectionInfoIdxForTryAgain();
+ }
+ return ret_status;
+ }
+
+ ColumnFamilyData* cfd = cf_mems_->current();
+ assert(!cfd || cfd->user_comparator());
+ const size_t ts_sz = (cfd && cfd->user_comparator())
+ ? cfd->user_comparator()->timestamp_size()
+ : 0;
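+    // When the column family uses user-defined timestamps (ts_sz > 0), record
+    // the deletion as kTypeDeletionWithTimestamp rather than plain
+    // kTypeDeletion.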
+ const ValueType delete_type =
+ (0 == ts_sz) ? kTypeDeletion : kTypeDeletionWithTimestamp;
+ if (kv_prot_info != nullptr) {
+ auto mem_kv_prot_info =
+ kv_prot_info->StripC(column_family_id).ProtectS(sequence_);
+ mem_kv_prot_info.UpdateO(kTypeDeletion, delete_type);
+ ret_status = DeleteImpl(column_family_id, key, Slice(), delete_type,
+ &mem_kv_prot_info);
+ } else {
+ ret_status = DeleteImpl(column_family_id, key, Slice(), delete_type,
+ nullptr /* kv_prot_info */);
+ }
+ // optimize for non-recovery mode
+ // If `ret_status` is `TryAgain` then the next (successful) try will add
+ // the key to the rebuilding transaction object. If `ret_status` is
+ // another non-OK `Status`, then the `rebuilding_trx_` will be thrown
+ // away. So we only need to add to it when `ret_status.ok()`.
+ if (UNLIKELY(ret_status.ok() && rebuilding_trx_ != nullptr)) {
+ assert(!write_after_commit_);
+ // TODO(ajkr): propagate `ProtectionInfoKVOS64`.
+ ret_status =
+ WriteBatchInternal::Delete(rebuilding_trx_, column_family_id, key);
+ }
+ if (UNLIKELY(ret_status.IsTryAgain())) {
+ DecrementProtectionInfoIdxForTryAgain();
+ }
+ return ret_status;
+ }
+
+ Status SingleDeleteCF(uint32_t column_family_id, const Slice& key) override {
+ const auto* kv_prot_info = NextProtectionInfo();
+ // optimize for non-recovery mode
+ if (UNLIKELY(write_after_commit_ && rebuilding_trx_ != nullptr)) {
+ // TODO(ajkr): propagate `ProtectionInfoKVOS64`.
+ return WriteBatchInternal::SingleDelete(rebuilding_trx_, column_family_id,
+ key);
+      // else insert the values into the memtable right away
+ }
+
+ Status ret_status;
+ if (UNLIKELY(!SeekToColumnFamily(column_family_id, &ret_status))) {
+ if (ret_status.ok() && rebuilding_trx_ != nullptr) {
+ assert(!write_after_commit_);
+        // The CF is probably flushed and hence there is no need to insert, but
+        // we still need to keep track of the keys for upcoming rollback/commit.
+ // TODO(ajkr): propagate `ProtectionInfoKVOS64`.
+ ret_status = WriteBatchInternal::SingleDelete(rebuilding_trx_,
+ column_family_id, key);
+ if (ret_status.ok()) {
+ MaybeAdvanceSeq(IsDuplicateKeySeq(column_family_id, key));
+ }
+ } else if (ret_status.ok()) {
+ MaybeAdvanceSeq(false /* batch_boundary */);
+ }
+ if (UNLIKELY(ret_status.IsTryAgain())) {
+ DecrementProtectionInfoIdxForTryAgain();
+ }
+ return ret_status;
+ }
+ assert(ret_status.ok());
+
+ if (kv_prot_info != nullptr) {
+ auto mem_kv_prot_info =
+ kv_prot_info->StripC(column_family_id).ProtectS(sequence_);
+ ret_status = DeleteImpl(column_family_id, key, Slice(),
+ kTypeSingleDeletion, &mem_kv_prot_info);
+ } else {
+ ret_status = DeleteImpl(column_family_id, key, Slice(),
+ kTypeSingleDeletion, nullptr /* kv_prot_info */);
+ }
+ // optimize for non-recovery mode
+ // If `ret_status` is `TryAgain` then the next (successful) try will add
+ // the key to the rebuilding transaction object. If `ret_status` is
+ // another non-OK `Status`, then the `rebuilding_trx_` will be thrown
+ // away. So we only need to add to it when `ret_status.ok()`.
+ if (UNLIKELY(ret_status.ok() && rebuilding_trx_ != nullptr)) {
+ assert(!write_after_commit_);
+ // TODO(ajkr): propagate `ProtectionInfoKVOS64`.
+ ret_status = WriteBatchInternal::SingleDelete(rebuilding_trx_,
+ column_family_id, key);
+ }
+ if (UNLIKELY(ret_status.IsTryAgain())) {
+ DecrementProtectionInfoIdxForTryAgain();
+ }
+ return ret_status;
+ }
+
+ Status DeleteRangeCF(uint32_t column_family_id, const Slice& begin_key,
+ const Slice& end_key) override {
+ const auto* kv_prot_info = NextProtectionInfo();
+ // optimize for non-recovery mode
+ if (UNLIKELY(write_after_commit_ && rebuilding_trx_ != nullptr)) {
+ // TODO(ajkr): propagate `ProtectionInfoKVOS64`.
+ return WriteBatchInternal::DeleteRange(rebuilding_trx_, column_family_id,
+ begin_key, end_key);
+      // else insert the values into the memtable right away
+ }
+
+ Status ret_status;
+ if (UNLIKELY(!SeekToColumnFamily(column_family_id, &ret_status))) {
+ if (ret_status.ok() && rebuilding_trx_ != nullptr) {
+ assert(!write_after_commit_);
+        // The CF is probably flushed and hence there is no need to insert, but
+        // we still need to keep track of the keys for upcoming rollback/commit.
+ // TODO(ajkr): propagate `ProtectionInfoKVOS64`.
+ ret_status = WriteBatchInternal::DeleteRange(
+ rebuilding_trx_, column_family_id, begin_key, end_key);
+ if (ret_status.ok()) {
+ MaybeAdvanceSeq(IsDuplicateKeySeq(column_family_id, begin_key));
+ }
+ } else if (ret_status.ok()) {
+ MaybeAdvanceSeq(false /* batch_boundary */);
+ }
+ if (UNLIKELY(ret_status.IsTryAgain())) {
+ DecrementProtectionInfoIdxForTryAgain();
+ }
+ return ret_status;
+ }
+ assert(ret_status.ok());
+
+ if (db_ != nullptr) {
+ auto cf_handle = cf_mems_->GetColumnFamilyHandle();
+ if (cf_handle == nullptr) {
+ cf_handle = db_->DefaultColumnFamily();
+ }
+ auto* cfd =
+ static_cast_with_check<ColumnFamilyHandleImpl>(cf_handle)->cfd();
+ if (!cfd->is_delete_range_supported()) {
+ // TODO(ajkr): refactor `SeekToColumnFamily()` so it returns a `Status`.
+ ret_status.PermitUncheckedError();
+ return Status::NotSupported(
+ std::string("DeleteRange not supported for table type ") +
+ cfd->ioptions()->table_factory->Name() + " in CF " +
+ cfd->GetName());
+ }
+ int cmp =
+ cfd->user_comparator()->CompareWithoutTimestamp(begin_key, end_key);
+ if (cmp > 0) {
+ // TODO(ajkr): refactor `SeekToColumnFamily()` so it returns a `Status`.
+ ret_status.PermitUncheckedError();
+ // It's an empty range where endpoints appear mistaken. Don't bother
+ // applying it to the DB, and return an error to the user.
+ return Status::InvalidArgument("end key comes before start key");
+ } else if (cmp == 0) {
+ // TODO(ajkr): refactor `SeekToColumnFamily()` so it returns a `Status`.
+ ret_status.PermitUncheckedError();
+ // It's an empty range. Don't bother applying it to the DB.
+ return Status::OK();
+ }
+ }
+
+ if (kv_prot_info != nullptr) {
+ auto mem_kv_prot_info =
+ kv_prot_info->StripC(column_family_id).ProtectS(sequence_);
+ ret_status = DeleteImpl(column_family_id, begin_key, end_key,
+ kTypeRangeDeletion, &mem_kv_prot_info);
+ } else {
+ ret_status = DeleteImpl(column_family_id, begin_key, end_key,
+ kTypeRangeDeletion, nullptr /* kv_prot_info */);
+ }
+ // optimize for non-recovery mode
+ // If `ret_status` is `TryAgain` then the next (successful) try will add
+ // the key to the rebuilding transaction object. If `ret_status` is
+ // another non-OK `Status`, then the `rebuilding_trx_` will be thrown
+ // away. So we only need to add to it when `ret_status.ok()`.
+ if (UNLIKELY(!ret_status.IsTryAgain() && rebuilding_trx_ != nullptr)) {
+ assert(!write_after_commit_);
+ // TODO(ajkr): propagate `ProtectionInfoKVOS64`.
+ ret_status = WriteBatchInternal::DeleteRange(
+ rebuilding_trx_, column_family_id, begin_key, end_key);
+ }
+ if (UNLIKELY(ret_status.IsTryAgain())) {
+ DecrementProtectionInfoIdxForTryAgain();
+ }
+ return ret_status;
+ }
+
+ Status MergeCF(uint32_t column_family_id, const Slice& key,
+ const Slice& value) override {
+ const auto* kv_prot_info = NextProtectionInfo();
+ // optimize for non-recovery mode
+ if (UNLIKELY(write_after_commit_ && rebuilding_trx_ != nullptr)) {
+ // TODO(ajkr): propagate `ProtectionInfoKVOS64`.
+ return WriteBatchInternal::Merge(rebuilding_trx_, column_family_id, key,
+ value);
+      // else insert the values into the memtable right away
+ }
+
+ Status ret_status;
+ if (UNLIKELY(!SeekToColumnFamily(column_family_id, &ret_status))) {
+ if (ret_status.ok() && rebuilding_trx_ != nullptr) {
+ assert(!write_after_commit_);
+        // The CF is probably flushed and hence there is no need to insert, but
+        // we still need to keep track of the keys for upcoming rollback/commit.
+ // TODO(ajkr): propagate `ProtectionInfoKVOS64`.
+ ret_status = WriteBatchInternal::Merge(rebuilding_trx_,
+ column_family_id, key, value);
+ if (ret_status.ok()) {
+ MaybeAdvanceSeq(IsDuplicateKeySeq(column_family_id, key));
+ }
+ } else if (ret_status.ok()) {
+ MaybeAdvanceSeq(false /* batch_boundary */);
+ }
+ if (UNLIKELY(ret_status.IsTryAgain())) {
+ DecrementProtectionInfoIdxForTryAgain();
+ }
+ return ret_status;
+ }
+ assert(ret_status.ok());
+
+ MemTable* mem = cf_mems_->GetMemTable();
+ auto* moptions = mem->GetImmutableMemTableOptions();
+ if (moptions->merge_operator == nullptr) {
+ return Status::InvalidArgument(
+ "Merge requires `ColumnFamilyOptions::merge_operator != nullptr`");
+ }
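+    // perform_merge is set when the key already has at least
+    // max_successive_merges merge operands at the head of the memtable; in
+    // that case we read the current value, apply the merge now, and store the
+    // result as a plain Put instead of yet another operand.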
+ bool perform_merge = false;
+ assert(!concurrent_memtable_writes_ ||
+ moptions->max_successive_merges == 0);
+
+    // If we pass the DB through and options.max_successive_merges is hit
+    // during recovery, a Get() will be issued, which will try to acquire the
+    // DB mutex and cause a deadlock, as the DB mutex is already held.
+    // So we disable merge in recovery.
+ if (moptions->max_successive_merges > 0 && db_ != nullptr &&
+ recovering_log_number_ == 0) {
+ assert(!concurrent_memtable_writes_);
+ LookupKey lkey(key, sequence_);
+
+ // Count the number of successive merges at the head
+ // of the key in the memtable
+ size_t num_merges = mem->CountSuccessiveMergeEntries(lkey);
+
+ if (num_merges >= moptions->max_successive_merges) {
+ perform_merge = true;
+ }
+ }
+
+ if (perform_merge) {
+ // 1) Get the existing value
+ std::string get_value;
+
+ // Pass in the sequence number so that we also include previous merge
+ // operations in the same batch.
+ SnapshotImpl read_from_snapshot;
+ read_from_snapshot.number_ = sequence_;
+ ReadOptions read_options;
+ read_options.snapshot = &read_from_snapshot;
+
+ auto cf_handle = cf_mems_->GetColumnFamilyHandle();
+ if (cf_handle == nullptr) {
+ cf_handle = db_->DefaultColumnFamily();
+ }
+ Status get_status = db_->Get(read_options, cf_handle, key, &get_value);
+ if (!get_status.ok()) {
+ // Failed to read a key we know exists. Store the delta in memtable.
+ perform_merge = false;
+ } else {
+ Slice get_value_slice = Slice(get_value);
+
+ // 2) Apply this merge
+ auto merge_operator = moptions->merge_operator;
+ assert(merge_operator);
+
+ std::string new_value;
+ Status merge_status = MergeHelper::TimedFullMerge(
+ merge_operator, key, &get_value_slice, {value}, &new_value,
+ moptions->info_log, moptions->statistics,
+ SystemClock::Default().get(), /* result_operand */ nullptr,
+ /* update_num_ops_stats */ false);
+
+ if (!merge_status.ok()) {
+ // Failed to merge!
+ // Store the delta in memtable
+ perform_merge = false;
+ } else {
+ // 3) Add value to memtable
+ assert(!concurrent_memtable_writes_);
+ if (kv_prot_info != nullptr) {
+ auto merged_kv_prot_info =
+ kv_prot_info->StripC(column_family_id).ProtectS(sequence_);
+ merged_kv_prot_info.UpdateV(value, new_value);
+ merged_kv_prot_info.UpdateO(kTypeMerge, kTypeValue);
+ ret_status = mem->Add(sequence_, kTypeValue, key, new_value,
+ &merged_kv_prot_info);
+ } else {
+ ret_status = mem->Add(sequence_, kTypeValue, key, new_value,
+ nullptr /* kv_prot_info */);
+ }
+ }
+ }
+ }
+
+ if (!perform_merge) {
+ assert(ret_status.ok());
+ // Add merge operand to memtable
+ if (kv_prot_info != nullptr) {
+ auto mem_kv_prot_info =
+ kv_prot_info->StripC(column_family_id).ProtectS(sequence_);
+ ret_status =
+ mem->Add(sequence_, kTypeMerge, key, value, &mem_kv_prot_info,
+ concurrent_memtable_writes_, get_post_process_info(mem));
+ } else {
+ ret_status = mem->Add(
+ sequence_, kTypeMerge, key, value, nullptr /* kv_prot_info */,
+ concurrent_memtable_writes_, get_post_process_info(mem));
+ }
+ }
+
+ if (UNLIKELY(ret_status.IsTryAgain())) {
+ assert(seq_per_batch_);
+ const bool kBatchBoundary = true;
+ MaybeAdvanceSeq(kBatchBoundary);
+ } else if (ret_status.ok()) {
+ MaybeAdvanceSeq();
+ CheckMemtableFull();
+ }
+ // optimize for non-recovery mode
+ // If `ret_status` is `TryAgain` then the next (successful) try will add
+ // the key to the rebuilding transaction object. If `ret_status` is
+ // another non-OK `Status`, then the `rebuilding_trx_` will be thrown
+ // away. So we only need to add to it when `ret_status.ok()`.
+ if (UNLIKELY(ret_status.ok() && rebuilding_trx_ != nullptr)) {
+ assert(!write_after_commit_);
+ // TODO(ajkr): propagate `ProtectionInfoKVOS64`.
+ ret_status = WriteBatchInternal::Merge(rebuilding_trx_, column_family_id,
+ key, value);
+ }
+ if (UNLIKELY(ret_status.IsTryAgain())) {
+ DecrementProtectionInfoIdxForTryAgain();
+ }
+ return ret_status;
+ }
+
+ Status PutBlobIndexCF(uint32_t column_family_id, const Slice& key,
+ const Slice& value) override {
+ const auto* kv_prot_info = NextProtectionInfo();
+ Status ret_status;
+ if (kv_prot_info != nullptr) {
+ // Memtable needs seqno, doesn't need CF ID
+ auto mem_kv_prot_info =
+ kv_prot_info->StripC(column_family_id).ProtectS(sequence_);
+ // Same as PutCF except for value type.
+ ret_status = PutCFImpl(column_family_id, key, value, kTypeBlobIndex,
+ &mem_kv_prot_info);
+ } else {
+ ret_status = PutCFImpl(column_family_id, key, value, kTypeBlobIndex,
+ nullptr /* kv_prot_info */);
+ }
+ if (UNLIKELY(ret_status.IsTryAgain())) {
+ DecrementProtectionInfoIdxForTryAgain();
+ }
+ return ret_status;
+ }
+
+ void CheckMemtableFull() {
+ if (flush_scheduler_ != nullptr) {
+ auto* cfd = cf_mems_->current();
+ assert(cfd != nullptr);
+ if (cfd->mem()->ShouldScheduleFlush() &&
+ cfd->mem()->MarkFlushScheduled()) {
+ // MarkFlushScheduled only returns true if we are the one that
+ // should take action, so no need to dedup further
+ flush_scheduler_->ScheduleWork(cfd);
+ }
+ }
+ // check if memtable_list size exceeds max_write_buffer_size_to_maintain
+ if (trim_history_scheduler_ != nullptr) {
+ auto* cfd = cf_mems_->current();
+
+ assert(cfd);
+ assert(cfd->ioptions());
+
+ const size_t size_to_maintain = static_cast<size_t>(
+ cfd->ioptions()->max_write_buffer_size_to_maintain);
+
+ if (size_to_maintain > 0) {
+ MemTableList* const imm = cfd->imm();
+ assert(imm);
+
+ if (imm->HasHistory()) {
+ const MemTable* const mem = cfd->mem();
+ assert(mem);
+
+ if (mem->MemoryAllocatedBytes() +
+ imm->MemoryAllocatedBytesExcludingLast() >=
+ size_to_maintain &&
+ imm->MarkTrimHistoryNeeded()) {
+ trim_history_scheduler_->ScheduleWork(cfd);
+ }
+ }
+ }
+ }
+ }
+
+ // The write batch handler calls MarkBeginPrepare with unprepare set to true
+ // if it encounters the kTypeBeginUnprepareXID marker.
+ Status MarkBeginPrepare(bool unprepare) override {
+ assert(rebuilding_trx_ == nullptr);
+ assert(db_);
+
+ if (recovering_log_number_ != 0) {
+ db_->mutex()->AssertHeld();
+ // during recovery we rebuild a hollow transaction
+ // from all encountered prepare sections of the wal
+ if (db_->allow_2pc() == false) {
+ return Status::NotSupported(
+ "WAL contains prepared transactions. Open with "
+ "TransactionDB::Open().");
+ }
+
+ // we are now iterating through a prepared section
+ rebuilding_trx_ = new WriteBatch();
+ rebuilding_trx_seq_ = sequence_;
+ // Verify that we have matching MarkBeginPrepare/MarkEndPrepare markers.
+ // unprepared_batch_ should be false because it is false by default, and
+ // gets reset to false in MarkEndPrepare.
+ assert(!unprepared_batch_);
+ unprepared_batch_ = unprepare;
+
+ if (has_valid_writes_ != nullptr) {
+ *has_valid_writes_ = true;
+ }
+ }
+
+ return Status::OK();
+ }
+
+ Status MarkEndPrepare(const Slice& name) override {
+ assert(db_);
+ assert((rebuilding_trx_ != nullptr) == (recovering_log_number_ != 0));
+
+ if (recovering_log_number_ != 0) {
+ db_->mutex()->AssertHeld();
+ assert(db_->allow_2pc());
+ size_t batch_cnt =
+ write_after_commit_
+ ? 0 // 0 will disable further checks
+ : static_cast<size_t>(sequence_ - rebuilding_trx_seq_ + 1);
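+      // Stash the rebuilt batch so a later commit or rollback marker in the
+      // WAL can resolve it during recovery.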
+ db_->InsertRecoveredTransaction(recovering_log_number_, name.ToString(),
+ rebuilding_trx_, rebuilding_trx_seq_,
+ batch_cnt, unprepared_batch_);
+ unprepared_batch_ = false;
+ rebuilding_trx_ = nullptr;
+ } else {
+ assert(rebuilding_trx_ == nullptr);
+ }
+    const bool batch_boundary = true;
+    MaybeAdvanceSeq(batch_boundary);
+
+ return Status::OK();
+ }
+
+ Status MarkNoop(bool empty_batch) override {
+ if (recovering_log_number_ != 0) {
+ db_->mutex()->AssertHeld();
+ }
+    // A hack in pessimistic transactions could result in a noop at the start
+    // of the write batch, which should be ignored.
+ if (!empty_batch) {
+ // In the absence of Prepare markers, a kTypeNoop tag indicates the end of
+      // a batch. This happens when a write batch commits while skipping the
+      // prepare phase.
+      const bool batch_boundary = true;
+      MaybeAdvanceSeq(batch_boundary);
+ }
+ return Status::OK();
+ }
+
+ Status MarkCommit(const Slice& name) override {
+ assert(db_);
+
+ Status s;
+
+ if (recovering_log_number_ != 0) {
+ // We must hold db mutex in recovery.
+ db_->mutex()->AssertHeld();
+ // in recovery when we encounter a commit marker
+      // we look up this transaction in our set of rebuilt transactions
+ // and commit.
+ auto trx = db_->GetRecoveredTransaction(name.ToString());
+
+ // the log containing the prepared section may have
+ // been released in the last incarnation because the
+ // data was flushed to L0
+ if (trx != nullptr) {
+ // at this point individual CF lognumbers will prevent
+ // duplicate re-insertion of values.
+ assert(log_number_ref_ == 0);
+ if (write_after_commit_) {
+ // write_after_commit_ can only have one batch in trx.
+ assert(trx->batches_.size() == 1);
+ const auto& batch_info = trx->batches_.begin()->second;
+ // all inserts must reference this trx log number
+ log_number_ref_ = batch_info.log_number_;
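+          // The protection info set on this inserter belongs to the outer
+          // batch, not the recovered one, so reset it before replaying.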
+ ResetProtectionInfo();
+ s = batch_info.batch_->Iterate(this);
+ log_number_ref_ = 0;
+ }
+ // else the values are already inserted before the commit
+
+ if (s.ok()) {
+ db_->DeleteRecoveredTransaction(name.ToString());
+ }
+ if (has_valid_writes_ != nullptr) {
+ *has_valid_writes_ = true;
+ }
+ }
+ } else {
+ // When writes are not delayed until commit, there is no disconnect
+ // between a memtable write and the WAL that supports it. So the commit
+ // need not reference any log as the only log to which it depends.
+ assert(!write_after_commit_ || log_number_ref_ > 0);
+ }
+    const bool batch_boundary = true;
+    MaybeAdvanceSeq(batch_boundary);
+
+ if (UNLIKELY(s.IsTryAgain())) {
+ DecrementProtectionInfoIdxForTryAgain();
+ }
+
+ return s;
+ }
+
+ Status MarkCommitWithTimestamp(const Slice& name,
+ const Slice& commit_ts) override {
+ assert(db_);
+
+ Status s;
+
+ if (recovering_log_number_ != 0) {
+ // In recovery, db mutex must be held.
+ db_->mutex()->AssertHeld();
+ // in recovery when we encounter a commit marker
+      // we look up this transaction in our set of rebuilt transactions
+ // and commit.
+ auto trx = db_->GetRecoveredTransaction(name.ToString());
+ // the log containing the prepared section may have
+ // been released in the last incarnation because the
+ // data was flushed to L0
+ if (trx) {
+ // at this point individual CF lognumbers will prevent
+ // duplicate re-insertion of values.
+ assert(0 == log_number_ref_);
+ if (write_after_commit_) {
+ // write_after_commit_ can only have one batch in trx.
+ assert(trx->batches_.size() == 1);
+ const auto& batch_info = trx->batches_.begin()->second;
+ // all inserts must reference this trx log number
+ log_number_ref_ = batch_info.log_number_;
+
+ s = batch_info.batch_->UpdateTimestamps(
+ commit_ts, [this](uint32_t cf) {
+ assert(db_);
+ VersionSet* const vset = db_->GetVersionSet();
+ assert(vset);
+ ColumnFamilySet* const cf_set = vset->GetColumnFamilySet();
+ assert(cf_set);
+ ColumnFamilyData* cfd = cf_set->GetColumnFamily(cf);
+ assert(cfd);
+ const auto* const ucmp = cfd->user_comparator();
+ assert(ucmp);
+ return ucmp->timestamp_size();
+ });
+ if (s.ok()) {
+ ResetProtectionInfo();
+ s = batch_info.batch_->Iterate(this);
+ log_number_ref_ = 0;
+ }
+ }
+ // else the values are already inserted before the commit
+
+ if (s.ok()) {
+ db_->DeleteRecoveredTransaction(name.ToString());
+ }
+ if (has_valid_writes_) {
+ *has_valid_writes_ = true;
+ }
+ }
+ } else {
+ // When writes are not delayed until commit, there is no connection
+ // between a memtable write and the WAL that supports it. So the commit
+ // need not reference any log as the only log to which it depends.
+ assert(!write_after_commit_ || log_number_ref_ > 0);
+ }
+ constexpr bool batch_boundary = true;
+ MaybeAdvanceSeq(batch_boundary);
+
+ if (UNLIKELY(s.IsTryAgain())) {
+ DecrementProtectionInfoIdxForTryAgain();
+ }
+
+ return s;
+ }
+
+ Status MarkRollback(const Slice& name) override {
+ assert(db_);
+
+ if (recovering_log_number_ != 0) {
+ auto trx = db_->GetRecoveredTransaction(name.ToString());
+
+      // the log containing the transaction's prep section
+ // may have been released in the previous incarnation
+ // because we knew it had been rolled back
+ if (trx != nullptr) {
+ db_->DeleteRecoveredTransaction(name.ToString());
+ }
+ } else {
+      // in non-recovery mode we simply ignore this tag
+ }
+
+    const bool batch_boundary = true;
+    MaybeAdvanceSeq(batch_boundary);
+
+ return Status::OK();
+ }
+
+ private:
+ MemTablePostProcessInfo* get_post_process_info(MemTable* mem) {
+ if (!concurrent_memtable_writes_) {
+ // No need to batch counters locally if we don't use concurrent mode.
+ return nullptr;
+ }
+ return &GetPostMap()[mem];
+ }
+};
+
+} // anonymous namespace
+
+// This function can only be called in these conditions:
+// 1) During Recovery()
+// 2) During Write(), in a single-threaded write thread
+// 3) During Write(), in a concurrent context where memtables have been cloned
+// The reason is that it calls memtables->Seek(), which has a stateful cache.
+Status WriteBatchInternal::InsertInto(
+ WriteThread::WriteGroup& write_group, SequenceNumber sequence,
+ ColumnFamilyMemTables* memtables, FlushScheduler* flush_scheduler,
+ TrimHistoryScheduler* trim_history_scheduler,
+ bool ignore_missing_column_families, uint64_t recovery_log_number, DB* db,
+ bool concurrent_memtable_writes, bool seq_per_batch, bool batch_per_txn) {
+ MemTableInserter inserter(
+ sequence, memtables, flush_scheduler, trim_history_scheduler,
+ ignore_missing_column_families, recovery_log_number, db,
+ concurrent_memtable_writes, nullptr /* prot_info */,
+ nullptr /*has_valid_writes*/, seq_per_batch, batch_per_txn);
+ for (auto w : write_group) {
+ if (w->CallbackFailed()) {
+ continue;
+ }
+ w->sequence = inserter.sequence();
+ if (!w->ShouldWriteToMemtable()) {
+ // In seq_per_batch_ mode this advances the seq by one.
+ inserter.MaybeAdvanceSeq(true);
+ continue;
+ }
+ SetSequence(w->batch, inserter.sequence());
+ inserter.set_log_number_ref(w->log_ref);
+ inserter.set_prot_info(w->batch->prot_info_.get());
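+    // Replaying the batch through the inserter applies each entry to the
+    // corresponding column family's memtable.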
+ w->status = w->batch->Iterate(&inserter);
+ if (!w->status.ok()) {
+ return w->status;
+ }
+ assert(!seq_per_batch || w->batch_cnt != 0);
+ assert(!seq_per_batch || inserter.sequence() - w->sequence == w->batch_cnt);
+ }
+ return Status::OK();
+}
+
+Status WriteBatchInternal::InsertInto(
+ WriteThread::Writer* writer, SequenceNumber sequence,
+ ColumnFamilyMemTables* memtables, FlushScheduler* flush_scheduler,
+ TrimHistoryScheduler* trim_history_scheduler,
+ bool ignore_missing_column_families, uint64_t log_number, DB* db,
+ bool concurrent_memtable_writes, bool seq_per_batch, size_t batch_cnt,
+ bool batch_per_txn, bool hint_per_batch) {
+#ifdef NDEBUG
+ (void)batch_cnt;
+#endif
+ assert(writer->ShouldWriteToMemtable());
+ MemTableInserter inserter(sequence, memtables, flush_scheduler,
+ trim_history_scheduler,
+ ignore_missing_column_families, log_number, db,
+ concurrent_memtable_writes, nullptr /* prot_info */,
+ nullptr /*has_valid_writes*/, seq_per_batch,
+ batch_per_txn, hint_per_batch);
+ SetSequence(writer->batch, sequence);
+ inserter.set_log_number_ref(writer->log_ref);
+ inserter.set_prot_info(writer->batch->prot_info_.get());
+ Status s = writer->batch->Iterate(&inserter);
+ assert(!seq_per_batch || batch_cnt != 0);
+ assert(!seq_per_batch || inserter.sequence() - sequence == batch_cnt);
+ if (concurrent_memtable_writes) {
+ inserter.PostProcess();
+ }
+ return s;
+}
+
+Status WriteBatchInternal::InsertInto(
+ const WriteBatch* batch, ColumnFamilyMemTables* memtables,
+ FlushScheduler* flush_scheduler,
+ TrimHistoryScheduler* trim_history_scheduler,
+ bool ignore_missing_column_families, uint64_t log_number, DB* db,
+ bool concurrent_memtable_writes, SequenceNumber* next_seq,
+ bool* has_valid_writes, bool seq_per_batch, bool batch_per_txn) {
+ MemTableInserter inserter(Sequence(batch), memtables, flush_scheduler,
+ trim_history_scheduler,
+ ignore_missing_column_families, log_number, db,
+ concurrent_memtable_writes, batch->prot_info_.get(),
+ has_valid_writes, seq_per_batch, batch_per_txn);
+ Status s = batch->Iterate(&inserter);
+ if (next_seq != nullptr) {
+ *next_seq = inserter.sequence();
+ }
+ if (concurrent_memtable_writes) {
+ inserter.PostProcess();
+ }
+ return s;
+}
+
+namespace {
+
+// This class updates protection info for a WriteBatch.
+class ProtectionInfoUpdater : public WriteBatch::Handler {
+ public:
+ explicit ProtectionInfoUpdater(WriteBatch::ProtectionInfo* prot_info)
+ : prot_info_(prot_info) {}
+
+ ~ProtectionInfoUpdater() override {}
+
+ Status PutCF(uint32_t cf, const Slice& key, const Slice& val) override {
+ return UpdateProtInfo(cf, key, val, kTypeValue);
+ }
+
+ Status PutEntityCF(uint32_t cf, const Slice& key,
+ const Slice& entity) override {
+ return UpdateProtInfo(cf, key, entity, kTypeWideColumnEntity);
+ }
+
+ Status DeleteCF(uint32_t cf, const Slice& key) override {
+ return UpdateProtInfo(cf, key, "", kTypeDeletion);
+ }
+
+ Status SingleDeleteCF(uint32_t cf, const Slice& key) override {
+ return UpdateProtInfo(cf, key, "", kTypeSingleDeletion);
+ }
+
+ Status DeleteRangeCF(uint32_t cf, const Slice& begin_key,
+ const Slice& end_key) override {
+ return UpdateProtInfo(cf, begin_key, end_key, kTypeRangeDeletion);
+ }
+
+ Status MergeCF(uint32_t cf, const Slice& key, const Slice& val) override {
+ return UpdateProtInfo(cf, key, val, kTypeMerge);
+ }
+
+ Status PutBlobIndexCF(uint32_t cf, const Slice& key,
+ const Slice& val) override {
+ return UpdateProtInfo(cf, key, val, kTypeBlobIndex);
+ }
+
+ Status MarkBeginPrepare(bool /* unprepare */) override {
+ return Status::OK();
+ }
+
+ Status MarkEndPrepare(const Slice& /* xid */) override {
+ return Status::OK();
+ }
+
+ Status MarkCommit(const Slice& /* xid */) override { return Status::OK(); }
+
+ Status MarkCommitWithTimestamp(const Slice& /* xid */,
+ const Slice& /* ts */) override {
+ return Status::OK();
+ }
+
+ Status MarkRollback(const Slice& /* xid */) override { return Status::OK(); }
+
+ Status MarkNoop(bool /* empty_batch */) override { return Status::OK(); }
+
+ private:
+ Status UpdateProtInfo(uint32_t cf, const Slice& key, const Slice& val,
+ const ValueType op_type) {
+ if (prot_info_) {
+ prot_info_->entries_.emplace_back(
+ ProtectionInfo64().ProtectKVO(key, val, op_type).ProtectC(cf));
+ }
+ return Status::OK();
+ }
+
+ // No copy or move.
+ ProtectionInfoUpdater(const ProtectionInfoUpdater&) = delete;
+ ProtectionInfoUpdater(ProtectionInfoUpdater&&) = delete;
+ ProtectionInfoUpdater& operator=(const ProtectionInfoUpdater&) = delete;
+ ProtectionInfoUpdater& operator=(ProtectionInfoUpdater&&) = delete;
+
+ WriteBatch::ProtectionInfo* const prot_info_ = nullptr;
+};
+
+} // anonymous namespace
+
+Status WriteBatchInternal::SetContents(WriteBatch* b, const Slice& contents) {
+ assert(contents.size() >= WriteBatchInternal::kHeader);
+ assert(b->prot_info_ == nullptr);
+
+ b->rep_.assign(contents.data(), contents.size());
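+  // Content flags for the new payload are unknown; mark them DEFERRED so they
+  // are recomputed lazily on demand.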
+ b->content_flags_.store(ContentFlags::DEFERRED, std::memory_order_relaxed);
+ return Status::OK();
+}
+
+Status WriteBatchInternal::Append(WriteBatch* dst, const WriteBatch* src,
+ const bool wal_only) {
+ assert(dst->Count() == 0 ||
+ (dst->prot_info_ == nullptr) == (src->prot_info_ == nullptr));
+ if ((src->prot_info_ != nullptr &&
+ src->prot_info_->entries_.size() != src->Count()) ||
+ (dst->prot_info_ != nullptr &&
+ dst->prot_info_->entries_.size() != dst->Count())) {
+ return Status::Corruption(
+ "Write batch has inconsistent count and number of checksums");
+ }
+
+ size_t src_len;
+ int src_count;
+ uint32_t src_flags;
+
+ const SavePoint& batch_end = src->GetWalTerminationPoint();
+
+ if (wal_only && !batch_end.is_cleared()) {
+ src_len = batch_end.size - WriteBatchInternal::kHeader;
+ src_count = batch_end.count;
+ src_flags = batch_end.content_flags;
+ } else {
+ src_len = src->rep_.size() - WriteBatchInternal::kHeader;
+ src_count = Count(src);
+ src_flags = src->content_flags_.load(std::memory_order_relaxed);
+ }
+
+ if (src->prot_info_ != nullptr) {
+ if (dst->prot_info_ == nullptr) {
+ dst->prot_info_.reset(new WriteBatch::ProtectionInfo());
+ }
+ std::copy(src->prot_info_->entries_.begin(),
+ src->prot_info_->entries_.begin() + src_count,
+ std::back_inserter(dst->prot_info_->entries_));
+ } else if (dst->prot_info_ != nullptr) {
+    // dst has empty prot_info->entries
+    // In this special case, we allow a write batch without prot_info to
+    // be appended to a write batch with empty prot_info
+ dst->prot_info_ = nullptr;
+ }
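+  // Bump dst's entry count and splice in src's serialized records, skipping
+  // src's (sequence number + count) header.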
+ SetCount(dst, Count(dst) + src_count);
+ assert(src->rep_.size() >= WriteBatchInternal::kHeader);
+ dst->rep_.append(src->rep_.data() + WriteBatchInternal::kHeader, src_len);
+ dst->content_flags_.store(
+ dst->content_flags_.load(std::memory_order_relaxed) | src_flags,
+ std::memory_order_relaxed);
+ return Status::OK();
+}
+
+size_t WriteBatchInternal::AppendedByteSize(size_t leftByteSize,
+ size_t rightByteSize) {
+ if (leftByteSize == 0 || rightByteSize == 0) {
+ return leftByteSize + rightByteSize;
+ } else {
+ return leftByteSize + rightByteSize - WriteBatchInternal::kHeader;
+ }
+}
+
+Status WriteBatchInternal::UpdateProtectionInfo(WriteBatch* wb,
+ size_t bytes_per_key,
+ uint64_t* checksum) {
+ if (bytes_per_key == 0) {
+ if (wb->prot_info_ != nullptr) {
+ wb->prot_info_.reset();
+ return Status::OK();
+ } else {
+ // Already not protected.
+ return Status::OK();
+ }
+ } else if (bytes_per_key == 8) {
+ if (wb->prot_info_ == nullptr) {
+ wb->prot_info_.reset(new WriteBatch::ProtectionInfo());
+ ProtectionInfoUpdater prot_info_updater(wb->prot_info_.get());
+ Status s = wb->Iterate(&prot_info_updater);
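+      // If the caller supplied a whole-batch checksum, verify the serialized
+      // contents before trusting the freshly computed per-key protection info.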
+ if (s.ok() && checksum != nullptr) {
+ uint64_t expected_hash = XXH3_64bits(wb->rep_.data(), wb->rep_.size());
+ if (expected_hash != *checksum) {
+ return Status::Corruption("Write batch content corrupted.");
+ }
+ }
+ return s;
+ } else {
+ // Already protected.
+ return Status::OK();
+ }
+ }
+ return Status::NotSupported(
+ "WriteBatch protection info must be zero or eight bytes/key");
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/write_batch_base.cc b/src/rocksdb/db/write_batch_base.cc
new file mode 100644
index 000000000..e4c0e74bd
--- /dev/null
+++ b/src/rocksdb/db/write_batch_base.cc
@@ -0,0 +1,94 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "rocksdb/write_batch_base.h"
+
+#include <string>
+
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Simple implementations of the SliceParts variants of Put(). Child classes
+// can override these methods with more performant solutions if they choose.
+Status WriteBatchBase::Put(ColumnFamilyHandle* column_family,
+ const SliceParts& key, const SliceParts& value) {
+ std::string key_buf, value_buf;
+ Slice key_slice(key, &key_buf);
+ Slice value_slice(value, &value_buf);
+
+ return Put(column_family, key_slice, value_slice);
+}
+
+Status WriteBatchBase::Put(const SliceParts& key, const SliceParts& value) {
+ std::string key_buf, value_buf;
+ Slice key_slice(key, &key_buf);
+ Slice value_slice(value, &value_buf);
+
+ return Put(key_slice, value_slice);
+}
+
+Status WriteBatchBase::Delete(ColumnFamilyHandle* column_family,
+ const SliceParts& key) {
+ std::string key_buf;
+ Slice key_slice(key, &key_buf);
+ return Delete(column_family, key_slice);
+}
+
+Status WriteBatchBase::Delete(const SliceParts& key) {
+ std::string key_buf;
+ Slice key_slice(key, &key_buf);
+ return Delete(key_slice);
+}
+
+Status WriteBatchBase::SingleDelete(ColumnFamilyHandle* column_family,
+ const SliceParts& key) {
+ std::string key_buf;
+ Slice key_slice(key, &key_buf);
+ return SingleDelete(column_family, key_slice);
+}
+
+Status WriteBatchBase::SingleDelete(const SliceParts& key) {
+ std::string key_buf;
+ Slice key_slice(key, &key_buf);
+ return SingleDelete(key_slice);
+}
+
+Status WriteBatchBase::DeleteRange(ColumnFamilyHandle* column_family,
+ const SliceParts& begin_key,
+ const SliceParts& end_key) {
+ std::string begin_key_buf, end_key_buf;
+ Slice begin_key_slice(begin_key, &begin_key_buf);
+ Slice end_key_slice(end_key, &end_key_buf);
+ return DeleteRange(column_family, begin_key_slice, end_key_slice);
+}
+
+Status WriteBatchBase::DeleteRange(const SliceParts& begin_key,
+ const SliceParts& end_key) {
+ std::string begin_key_buf, end_key_buf;
+ Slice begin_key_slice(begin_key, &begin_key_buf);
+ Slice end_key_slice(end_key, &end_key_buf);
+ return DeleteRange(begin_key_slice, end_key_slice);
+}
+
+Status WriteBatchBase::Merge(ColumnFamilyHandle* column_family,
+ const SliceParts& key, const SliceParts& value) {
+ std::string key_buf, value_buf;
+ Slice key_slice(key, &key_buf);
+ Slice value_slice(value, &value_buf);
+
+ return Merge(column_family, key_slice, value_slice);
+}
+
+Status WriteBatchBase::Merge(const SliceParts& key, const SliceParts& value) {
+ std::string key_buf, value_buf;
+ Slice key_slice(key, &key_buf);
+ Slice value_slice(value, &value_buf);
+
+ return Merge(key_slice, value_slice);
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/write_batch_internal.h b/src/rocksdb/db/write_batch_internal.h
new file mode 100644
index 000000000..1be0bd140
--- /dev/null
+++ b/src/rocksdb/db/write_batch_internal.h
@@ -0,0 +1,401 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <array>
+#include <vector>
+
+#include "db/flush_scheduler.h"
+#include "db/kv_checksum.h"
+#include "db/trim_history_scheduler.h"
+#include "db/write_thread.h"
+#include "rocksdb/db.h"
+#include "rocksdb/options.h"
+#include "rocksdb/types.h"
+#include "rocksdb/write_batch.h"
+#include "util/autovector.h"
+#include "util/cast_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class MemTable;
+class FlushScheduler;
+class ColumnFamilyData;
+
+class ColumnFamilyMemTables {
+ public:
+ virtual ~ColumnFamilyMemTables() {}
+ virtual bool Seek(uint32_t column_family_id) = 0;
+ // returns true if the update to memtable should be ignored
+ // (useful when recovering from log whose updates have already
+ // been processed)
+ virtual uint64_t GetLogNumber() const = 0;
+ virtual MemTable* GetMemTable() const = 0;
+ virtual ColumnFamilyHandle* GetColumnFamilyHandle() = 0;
+ virtual ColumnFamilyData* current() { return nullptr; }
+};
+
+class ColumnFamilyMemTablesDefault : public ColumnFamilyMemTables {
+ public:
+ explicit ColumnFamilyMemTablesDefault(MemTable* mem)
+ : ok_(false), mem_(mem) {}
+
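+  // Only the default column family (ID 0) is supported; Seek() fails for any
+  // other ID.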
+ bool Seek(uint32_t column_family_id) override {
+ ok_ = (column_family_id == 0);
+ return ok_;
+ }
+
+ uint64_t GetLogNumber() const override { return 0; }
+
+ MemTable* GetMemTable() const override {
+ assert(ok_);
+ return mem_;
+ }
+
+ ColumnFamilyHandle* GetColumnFamilyHandle() override { return nullptr; }
+
+ private:
+ bool ok_;
+ MemTable* mem_;
+};
+
+struct WriteBatch::ProtectionInfo {
+ // `WriteBatch` usually doesn't contain a huge number of keys so protecting
+ // with a fixed, non-configurable eight bytes per key may work well enough.
+ autovector<ProtectionInfoKVOC64> entries_;
+
+ size_t GetBytesPerKey() const { return 8; }
+};
+
+// WriteBatchInternal provides static methods for manipulating a
+// WriteBatch that we don't want in the public WriteBatch interface.
+class WriteBatchInternal {
+ public:
+ // WriteBatch header has an 8-byte sequence number followed by a 4-byte count.
+ static constexpr size_t kHeader = 12;
+
+ // WriteBatch methods with column_family_id instead of ColumnFamilyHandle*
+ static Status Put(WriteBatch* batch, uint32_t column_family_id,
+ const Slice& key, const Slice& value);
+
+ static Status Put(WriteBatch* batch, uint32_t column_family_id,
+ const SliceParts& key, const SliceParts& value);
+
+ static Status PutEntity(WriteBatch* batch, uint32_t column_family_id,
+ const Slice& key, const WideColumns& columns);
+
+ static Status Delete(WriteBatch* batch, uint32_t column_family_id,
+ const SliceParts& key);
+
+ static Status Delete(WriteBatch* batch, uint32_t column_family_id,
+ const Slice& key);
+
+ static Status SingleDelete(WriteBatch* batch, uint32_t column_family_id,
+ const SliceParts& key);
+
+ static Status SingleDelete(WriteBatch* batch, uint32_t column_family_id,
+ const Slice& key);
+
+ static Status DeleteRange(WriteBatch* b, uint32_t column_family_id,
+ const Slice& begin_key, const Slice& end_key);
+
+ static Status DeleteRange(WriteBatch* b, uint32_t column_family_id,
+ const SliceParts& begin_key,
+ const SliceParts& end_key);
+
+ static Status Merge(WriteBatch* batch, uint32_t column_family_id,
+ const Slice& key, const Slice& value);
+
+ static Status Merge(WriteBatch* batch, uint32_t column_family_id,
+ const SliceParts& key, const SliceParts& value);
+
+ static Status PutBlobIndex(WriteBatch* batch, uint32_t column_family_id,
+ const Slice& key, const Slice& value);
+
+ static Status MarkEndPrepare(WriteBatch* batch, const Slice& xid,
+ const bool write_after_commit = true,
+ const bool unprepared_batch = false);
+
+ static Status MarkRollback(WriteBatch* batch, const Slice& xid);
+
+ static Status MarkCommit(WriteBatch* batch, const Slice& xid);
+
+ static Status MarkCommitWithTimestamp(WriteBatch* batch, const Slice& xid,
+ const Slice& commit_ts);
+
+ static Status InsertNoop(WriteBatch* batch);
+
+ // Return the number of entries in the batch.
+ static uint32_t Count(const WriteBatch* batch);
+
+ // Set the count for the number of entries in the batch.
+ static void SetCount(WriteBatch* batch, uint32_t n);
+
+ // Return the sequence number for the start of this batch.
+ static SequenceNumber Sequence(const WriteBatch* batch);
+
+ // Store the specified number as the sequence number for the start of
+ // this batch.
+ static void SetSequence(WriteBatch* batch, SequenceNumber seq);
+
+ // Returns the offset of the first entry in the batch.
+ // This offset is only valid if the batch is not empty.
+ static size_t GetFirstOffset(WriteBatch* batch);
+
+ static Slice Contents(const WriteBatch* batch) { return Slice(batch->rep_); }
+
+ static size_t ByteSize(const WriteBatch* batch) { return batch->rep_.size(); }
+
+ static Status SetContents(WriteBatch* batch, const Slice& contents);
+
+ static Status CheckSlicePartsLength(const SliceParts& key,
+ const SliceParts& value);
+
+ // Inserts batches[i] into memtable, for i in 0..num_batches-1 inclusive.
+ //
+  // If ignore_missing_column_families == true, a WriteBatch
+  // referencing a non-existing column family will be ignored.
+ // If ignore_missing_column_families == false, processing of the
+ // batches will be stopped if a reference is found to a non-existing
+ // column family and InvalidArgument() will be returned. The writes
+ // in batches may be only partially applied at that point.
+ //
+ // If log_number is non-zero, the memtable will be updated only if
+ // memtables->GetLogNumber() >= log_number.
+ //
+ // If flush_scheduler is non-null, it will be invoked if the memtable
+ // should be flushed.
+ //
+ // Under concurrent use, the caller is responsible for making sure that
+ // the memtables object itself is thread-local.
+ static Status InsertInto(
+ WriteThread::WriteGroup& write_group, SequenceNumber sequence,
+ ColumnFamilyMemTables* memtables, FlushScheduler* flush_scheduler,
+ TrimHistoryScheduler* trim_history_scheduler,
+ bool ignore_missing_column_families = false, uint64_t log_number = 0,
+ DB* db = nullptr, bool concurrent_memtable_writes = false,
+ bool seq_per_batch = false, bool batch_per_txn = true);
+
+  // Convenience form of InsertInto when you have only one batch.
+  // next_seq returns the seq after the last sequence number used in the
+  // MemTable insert.
+ static Status InsertInto(
+ const WriteBatch* batch, ColumnFamilyMemTables* memtables,
+ FlushScheduler* flush_scheduler,
+ TrimHistoryScheduler* trim_history_scheduler,
+ bool ignore_missing_column_families = false, uint64_t log_number = 0,
+ DB* db = nullptr, bool concurrent_memtable_writes = false,
+ SequenceNumber* next_seq = nullptr, bool* has_valid_writes = nullptr,
+ bool seq_per_batch = false, bool batch_per_txn = true);
+
+ static Status InsertInto(WriteThread::Writer* writer, SequenceNumber sequence,
+ ColumnFamilyMemTables* memtables,
+ FlushScheduler* flush_scheduler,
+ TrimHistoryScheduler* trim_history_scheduler,
+ bool ignore_missing_column_families = false,
+ uint64_t log_number = 0, DB* db = nullptr,
+ bool concurrent_memtable_writes = false,
+ bool seq_per_batch = false, size_t batch_cnt = 0,
+ bool batch_per_txn = true,
+ bool hint_per_batch = false);
+
+ // Appends src write batch to dst write batch and updates count in dst
+  // write batch. Returns OK if the append is successful. Checks the number of
+  // checksums against the entry count in both the dst and src write batches,
+  // and returns Corruption if they are inconsistent.
+ static Status Append(WriteBatch* dst, const WriteBatch* src,
+ const bool WAL_only = false);
+
+ // Returns the byte size of appending a WriteBatch with ByteSize
+ // leftByteSize and a WriteBatch with ByteSize rightByteSize
+ static size_t AppendedByteSize(size_t leftByteSize, size_t rightByteSize);
+
+ // Iterate over [begin, end) range of a write batch
+ static Status Iterate(const WriteBatch* wb, WriteBatch::Handler* handler,
+ size_t begin, size_t end);
+
+ // This write batch includes the latest state that should be persisted. Such
+  // state is meant to be used only during recovery.
+ static void SetAsLatestPersistentState(WriteBatch* b);
+ static bool IsLatestPersistentState(const WriteBatch* b);
+
+ static std::tuple<Status, uint32_t, size_t> GetColumnFamilyIdAndTimestampSize(
+ WriteBatch* b, ColumnFamilyHandle* column_family);
+
+ static bool TimestampsUpdateNeeded(const WriteBatch& wb) {
+ return wb.needs_in_place_update_ts_;
+ }
+
+ static bool HasKeyWithTimestamp(const WriteBatch& wb) {
+ return wb.has_key_with_ts_;
+ }
+
+ // Update per-key value protection information on this write batch.
+  // If checksum is provided, the batch content is verified against the checksum.
+ static Status UpdateProtectionInfo(WriteBatch* wb, size_t bytes_per_key,
+ uint64_t* checksum = nullptr);
+};
+
+// LocalSavePoint is similar to a scope guard
+class LocalSavePoint {
+ public:
+ explicit LocalSavePoint(WriteBatch* batch)
+ : batch_(batch),
+ savepoint_(batch->GetDataSize(), batch->Count(),
+ batch->content_flags_.load(std::memory_order_relaxed))
+#ifndef NDEBUG
+ ,
+ committed_(false)
+#endif
+ {
+ }
+
+#ifndef NDEBUG
+ ~LocalSavePoint() { assert(committed_); }
+#endif
+ Status commit() {
+#ifndef NDEBUG
+ committed_ = true;
+#endif
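+    // If the batch has grown past its configured size limit, roll it back to
+    // the state captured at construction and report MemoryLimit.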
+ if (batch_->max_bytes_ && batch_->rep_.size() > batch_->max_bytes_) {
+ batch_->rep_.resize(savepoint_.size);
+ WriteBatchInternal::SetCount(batch_, savepoint_.count);
+ if (batch_->prot_info_ != nullptr) {
+ batch_->prot_info_->entries_.resize(savepoint_.count);
+ }
+ batch_->content_flags_.store(savepoint_.content_flags,
+ std::memory_order_relaxed);
+ return Status::MemoryLimit();
+ }
+ return Status::OK();
+ }
+
+ private:
+ WriteBatch* batch_;
+ SavePoint savepoint_;
+#ifndef NDEBUG
+ bool committed_;
+#endif
+};
+
+template <typename TimestampSizeFuncType>
+class TimestampUpdater : public WriteBatch::Handler {
+ public:
+ explicit TimestampUpdater(WriteBatch::ProtectionInfo* prot_info,
+ TimestampSizeFuncType&& ts_sz_func, const Slice& ts)
+ : prot_info_(prot_info),
+ ts_sz_func_(std::move(ts_sz_func)),
+ timestamp_(ts) {
+ assert(!timestamp_.empty());
+ }
+
+ ~TimestampUpdater() override {}
+
+ Status PutCF(uint32_t cf, const Slice& key, const Slice&) override {
+ return UpdateTimestamp(cf, key);
+ }
+
+ Status DeleteCF(uint32_t cf, const Slice& key) override {
+ return UpdateTimestamp(cf, key);
+ }
+
+ Status SingleDeleteCF(uint32_t cf, const Slice& key) override {
+ return UpdateTimestamp(cf, key);
+ }
+
+ Status DeleteRangeCF(uint32_t cf, const Slice& begin_key,
+ const Slice& end_key) override {
+ Status s = UpdateTimestamp(cf, begin_key, true /* is_key */);
+ if (s.ok()) {
+ s = UpdateTimestamp(cf, end_key, false /* is_key */);
+ }
+ return s;
+ }
+
+ Status MergeCF(uint32_t cf, const Slice& key, const Slice&) override {
+ return UpdateTimestamp(cf, key);
+ }
+
+ Status PutBlobIndexCF(uint32_t cf, const Slice& key, const Slice&) override {
+ return UpdateTimestamp(cf, key);
+ }
+
+ Status MarkBeginPrepare(bool) override { return Status::OK(); }
+
+ Status MarkEndPrepare(const Slice&) override { return Status::OK(); }
+
+ Status MarkCommit(const Slice&) override { return Status::OK(); }
+
+ Status MarkCommitWithTimestamp(const Slice&, const Slice&) override {
+ return Status::OK();
+ }
+
+ Status MarkRollback(const Slice&) override { return Status::OK(); }
+
+ Status MarkNoop(bool /*empty_batch*/) override { return Status::OK(); }
+
+ private:
+ // @param is_key specifies whether the update is for key or value.
+ Status UpdateTimestamp(uint32_t cf, const Slice& buf, bool is_key = true) {
+ Status s = UpdateTimestampImpl(cf, buf, idx_, is_key);
+ ++idx_;
+ return s;
+ }
+
+ Status UpdateTimestampImpl(uint32_t cf, const Slice& buf, size_t /*idx*/,
+ bool is_key) {
+ if (timestamp_.empty()) {
+ return Status::InvalidArgument("Timestamp is empty");
+ }
+ size_t cf_ts_sz = ts_sz_func_(cf);
+ if (0 == cf_ts_sz) {
+ // Skip this column family.
+ return Status::OK();
+ } else if (std::numeric_limits<size_t>::max() == cf_ts_sz) {
+ // Column family timestamp info not found.
+ return Status::NotFound();
+ } else if (cf_ts_sz != timestamp_.size()) {
+ return Status::InvalidArgument("timestamp size mismatch");
+ }
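+    // Keep any per-key protection info consistent with the new timestamp,
+    // then overwrite the trailing timestamp bytes of the buffer in place.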
+ UpdateProtectionInformationIfNeeded(buf, timestamp_, is_key);
+
+ char* ptr = const_cast<char*>(buf.data() + buf.size() - cf_ts_sz);
+ assert(ptr);
+ memcpy(ptr, timestamp_.data(), timestamp_.size());
+ return Status::OK();
+ }
+
+ void UpdateProtectionInformationIfNeeded(const Slice& buf, const Slice& ts,
+ bool is_key) {
+ if (prot_info_ != nullptr) {
+ const size_t ts_sz = ts.size();
+ SliceParts old(&buf, 1);
+ Slice old_no_ts(buf.data(), buf.size() - ts_sz);
+ std::array<Slice, 2> new_key_cmpts{{old_no_ts, ts}};
+ SliceParts new_parts(new_key_cmpts.data(), 2);
+ if (is_key) {
+ prot_info_->entries_[idx_].UpdateK(old, new_parts);
+ } else {
+ prot_info_->entries_[idx_].UpdateV(old, new_parts);
+ }
+ }
+ }
+
+ // No copy or move.
+ TimestampUpdater(const TimestampUpdater&) = delete;
+ TimestampUpdater(TimestampUpdater&&) = delete;
+ TimestampUpdater& operator=(const TimestampUpdater&) = delete;
+ TimestampUpdater& operator=(TimestampUpdater&&) = delete;
+
+ WriteBatch::ProtectionInfo* const prot_info_ = nullptr;
+ const TimestampSizeFuncType ts_sz_func_{};
+ const Slice timestamp_;
+ size_t idx_ = 0;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/write_batch_test.cc b/src/rocksdb/db/write_batch_test.cc
new file mode 100644
index 000000000..d233853e2
--- /dev/null
+++ b/src/rocksdb/db/write_batch_test.cc
@@ -0,0 +1,1114 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <memory>
+
+#include "db/column_family.h"
+#include "db/db_test_util.h"
+#include "db/memtable.h"
+#include "db/write_batch_internal.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/utilities/write_batch_with_index.h"
+#include "rocksdb/write_buffer_manager.h"
+#include "table/scoped_arena_iterator.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+static std::string PrintContents(WriteBatch* b,
+ bool merge_operator_supported = true) {
+ InternalKeyComparator cmp(BytewiseComparator());
+ auto factory = std::make_shared<SkipListFactory>();
+ Options options;
+ options.memtable_factory = factory;
+ if (merge_operator_supported) {
+ options.merge_operator.reset(new TestPutOperator());
+ }
+ ImmutableOptions ioptions(options);
+ WriteBufferManager wb(options.db_write_buffer_size);
+ MemTable* mem = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb,
+ kMaxSequenceNumber, 0 /* column_family_id */);
+ mem->Ref();
+ std::string state;
+ ColumnFamilyMemTablesDefault cf_mems_default(mem);
+ Status s =
+ WriteBatchInternal::InsertInto(b, &cf_mems_default, nullptr, nullptr);
+ uint32_t count = 0;
+ int put_count = 0;
+ int delete_count = 0;
+ int single_delete_count = 0;
+ int delete_range_count = 0;
+ int merge_count = 0;
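+  // Two passes over the memtable: the first iterates point entries, the
+  // second iterates range tombstones.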
+ for (int i = 0; i < 2; ++i) {
+ Arena arena;
+ ScopedArenaIterator arena_iter_guard;
+ std::unique_ptr<InternalIterator> iter_guard;
+ InternalIterator* iter;
+ if (i == 0) {
+ iter = mem->NewIterator(ReadOptions(), &arena);
+ arena_iter_guard.set(iter);
+ } else {
+ iter = mem->NewRangeTombstoneIterator(ReadOptions(),
+ kMaxSequenceNumber /* read_seq */,
+ false /* immutable_memtable */);
+ iter_guard.reset(iter);
+ }
+ if (iter == nullptr) {
+ continue;
+ }
+ EXPECT_OK(iter->status());
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ ParsedInternalKey ikey;
+ ikey.clear();
+ EXPECT_OK(ParseInternalKey(iter->key(), &ikey, true /* log_err_key */));
+ switch (ikey.type) {
+ case kTypeValue:
+ state.append("Put(");
+ state.append(ikey.user_key.ToString());
+ state.append(", ");
+ state.append(iter->value().ToString());
+ state.append(")");
+ count++;
+ put_count++;
+ break;
+ case kTypeDeletion:
+ state.append("Delete(");
+ state.append(ikey.user_key.ToString());
+ state.append(")");
+ count++;
+ delete_count++;
+ break;
+ case kTypeSingleDeletion:
+ state.append("SingleDelete(");
+ state.append(ikey.user_key.ToString());
+ state.append(")");
+ count++;
+ single_delete_count++;
+ break;
+ case kTypeRangeDeletion:
+ state.append("DeleteRange(");
+ state.append(ikey.user_key.ToString());
+ state.append(", ");
+ state.append(iter->value().ToString());
+ state.append(")");
+ count++;
+ delete_range_count++;
+ break;
+ case kTypeMerge:
+ state.append("Merge(");
+ state.append(ikey.user_key.ToString());
+ state.append(", ");
+ state.append(iter->value().ToString());
+ state.append(")");
+ count++;
+ merge_count++;
+ break;
+ default:
+ assert(false);
+ break;
+ }
+ state.append("@");
+ state.append(std::to_string(ikey.sequence));
+ }
+ EXPECT_OK(iter->status());
+ }
+ if (s.ok()) {
+ EXPECT_EQ(b->HasPut(), put_count > 0);
+ EXPECT_EQ(b->HasDelete(), delete_count > 0);
+ EXPECT_EQ(b->HasSingleDelete(), single_delete_count > 0);
+ EXPECT_EQ(b->HasDeleteRange(), delete_range_count > 0);
+ EXPECT_EQ(b->HasMerge(), merge_count > 0);
+ if (count != WriteBatchInternal::Count(b)) {
+ state.append("CountMismatch()");
+ }
+ } else {
+ state.append(s.ToString());
+ }
+ delete mem->Unref();
+ return state;
+}
+
+class WriteBatchTest : public testing::Test {};
+
+TEST_F(WriteBatchTest, Empty) {
+ WriteBatch batch;
+ ASSERT_EQ("", PrintContents(&batch));
+ ASSERT_EQ(0u, WriteBatchInternal::Count(&batch));
+ ASSERT_EQ(0u, batch.Count());
+}
+
+TEST_F(WriteBatchTest, Multiple) {
+ WriteBatch batch;
+ ASSERT_OK(batch.Put(Slice("foo"), Slice("bar")));
+ ASSERT_OK(batch.Delete(Slice("box")));
+ ASSERT_OK(batch.DeleteRange(Slice("bar"), Slice("foo")));
+ ASSERT_OK(batch.Put(Slice("baz"), Slice("boo")));
+ WriteBatchInternal::SetSequence(&batch, 100);
+ ASSERT_EQ(100U, WriteBatchInternal::Sequence(&batch));
+ ASSERT_EQ(4u, WriteBatchInternal::Count(&batch));
+ ASSERT_EQ(
+ "Put(baz, boo)@103"
+ "Delete(box)@101"
+ "Put(foo, bar)@100"
+ "DeleteRange(bar, foo)@102",
+ PrintContents(&batch));
+ ASSERT_EQ(4u, batch.Count());
+}
+
+TEST_F(WriteBatchTest, Corruption) {
+ WriteBatch batch;
+ ASSERT_OK(batch.Put(Slice("foo"), Slice("bar")));
+ ASSERT_OK(batch.Delete(Slice("box")));
+ WriteBatchInternal::SetSequence(&batch, 200);
+ Slice contents = WriteBatchInternal::Contents(&batch);
+ ASSERT_OK(WriteBatchInternal::SetContents(
+ &batch, Slice(contents.data(), contents.size() - 1)));
+ ASSERT_EQ(
+ "Put(foo, bar)@200"
+ "Corruption: bad WriteBatch Delete",
+ PrintContents(&batch));
+}
+
+TEST_F(WriteBatchTest, Append) {
+ WriteBatch b1, b2;
+ WriteBatchInternal::SetSequence(&b1, 200);
+ WriteBatchInternal::SetSequence(&b2, 300);
+ ASSERT_OK(WriteBatchInternal::Append(&b1, &b2));
+ ASSERT_EQ("", PrintContents(&b1));
+ ASSERT_EQ(0u, b1.Count());
+ ASSERT_OK(b2.Put("a", "va"));
+ ASSERT_OK(WriteBatchInternal::Append(&b1, &b2));
+ ASSERT_EQ("Put(a, va)@200", PrintContents(&b1));
+ ASSERT_EQ(1u, b1.Count());
+ b2.Clear();
+ ASSERT_OK(b2.Put("b", "vb"));
+ ASSERT_OK(WriteBatchInternal::Append(&b1, &b2));
+ ASSERT_EQ(
+ "Put(a, va)@200"
+ "Put(b, vb)@201",
+ PrintContents(&b1));
+ ASSERT_EQ(2u, b1.Count());
+ ASSERT_OK(b2.Delete("foo"));
+ ASSERT_OK(WriteBatchInternal::Append(&b1, &b2));
+ ASSERT_EQ(
+ "Put(a, va)@200"
+ "Put(b, vb)@202"
+ "Put(b, vb)@201"
+ "Delete(foo)@203",
+ PrintContents(&b1));
+ ASSERT_EQ(4u, b1.Count());
+ b2.Clear();
+ ASSERT_OK(b2.Put("c", "cc"));
+ ASSERT_OK(b2.Put("d", "dd"));
+ b2.MarkWalTerminationPoint();
+ ASSERT_OK(b2.Put("e", "ee"));
+ ASSERT_OK(WriteBatchInternal::Append(&b1, &b2, /*wal only*/ true));
+ ASSERT_EQ(
+ "Put(a, va)@200"
+ "Put(b, vb)@202"
+ "Put(b, vb)@201"
+ "Put(c, cc)@204"
+ "Put(d, dd)@205"
+ "Delete(foo)@203",
+ PrintContents(&b1));
+ ASSERT_EQ(6u, b1.Count());
+ ASSERT_EQ(
+ "Put(c, cc)@0"
+ "Put(d, dd)@1"
+ "Put(e, ee)@2",
+ PrintContents(&b2));
+ ASSERT_EQ(3u, b2.Count());
+}
+
+TEST_F(WriteBatchTest, SingleDeletion) {
+ WriteBatch batch;
+ WriteBatchInternal::SetSequence(&batch, 100);
+ ASSERT_EQ("", PrintContents(&batch));
+ ASSERT_EQ(0u, batch.Count());
+ ASSERT_OK(batch.Put("a", "va"));
+ ASSERT_EQ("Put(a, va)@100", PrintContents(&batch));
+ ASSERT_EQ(1u, batch.Count());
+ ASSERT_OK(batch.SingleDelete("a"));
+ ASSERT_EQ(
+ "SingleDelete(a)@101"
+ "Put(a, va)@100",
+ PrintContents(&batch));
+ ASSERT_EQ(2u, batch.Count());
+}
+
+namespace {
+struct TestHandler : public WriteBatch::Handler {
+ std::string seen;
+ Status PutCF(uint32_t column_family_id, const Slice& key,
+ const Slice& value) override {
+ if (column_family_id == 0) {
+ seen += "Put(" + key.ToString() + ", " + value.ToString() + ")";
+ } else {
+ seen += "PutCF(" + std::to_string(column_family_id) + ", " +
+ key.ToString() + ", " + value.ToString() + ")";
+ }
+ return Status::OK();
+ }
+ Status DeleteCF(uint32_t column_family_id, const Slice& key) override {
+ if (column_family_id == 0) {
+ seen += "Delete(" + key.ToString() + ")";
+ } else {
+ seen += "DeleteCF(" + std::to_string(column_family_id) + ", " +
+ key.ToString() + ")";
+ }
+ return Status::OK();
+ }
+ Status SingleDeleteCF(uint32_t column_family_id, const Slice& key) override {
+ if (column_family_id == 0) {
+ seen += "SingleDelete(" + key.ToString() + ")";
+ } else {
+ seen += "SingleDeleteCF(" + std::to_string(column_family_id) + ", " +
+ key.ToString() + ")";
+ }
+ return Status::OK();
+ }
+ Status DeleteRangeCF(uint32_t column_family_id, const Slice& begin_key,
+ const Slice& end_key) override {
+ if (column_family_id == 0) {
+ seen += "DeleteRange(" + begin_key.ToString() + ", " +
+ end_key.ToString() + ")";
+ } else {
+ seen += "DeleteRangeCF(" + std::to_string(column_family_id) + ", " +
+ begin_key.ToString() + ", " + end_key.ToString() + ")";
+ }
+ return Status::OK();
+ }
+ Status MergeCF(uint32_t column_family_id, const Slice& key,
+ const Slice& value) override {
+ if (column_family_id == 0) {
+ seen += "Merge(" + key.ToString() + ", " + value.ToString() + ")";
+ } else {
+ seen += "MergeCF(" + std::to_string(column_family_id) + ", " +
+ key.ToString() + ", " + value.ToString() + ")";
+ }
+ return Status::OK();
+ }
+ void LogData(const Slice& blob) override {
+ seen += "LogData(" + blob.ToString() + ")";
+ }
+ Status MarkBeginPrepare(bool unprepare) override {
+ seen +=
+ "MarkBeginPrepare(" + std::string(unprepare ? "true" : "false") + ")";
+ return Status::OK();
+ }
+ Status MarkEndPrepare(const Slice& xid) override {
+ seen += "MarkEndPrepare(" + xid.ToString() + ")";
+ return Status::OK();
+ }
+ Status MarkNoop(bool empty_batch) override {
+ seen += "MarkNoop(" + std::string(empty_batch ? "true" : "false") + ")";
+ return Status::OK();
+ }
+ Status MarkCommit(const Slice& xid) override {
+ seen += "MarkCommit(" + xid.ToString() + ")";
+ return Status::OK();
+ }
+ Status MarkCommitWithTimestamp(const Slice& xid, const Slice& ts) override {
+ seen += "MarkCommitWithTimestamp(" + xid.ToString() + ", " +
+ ts.ToString(true) + ")";
+ return Status::OK();
+ }
+ Status MarkRollback(const Slice& xid) override {
+ seen += "MarkRollback(" + xid.ToString() + ")";
+ return Status::OK();
+ }
+};
+} // anonymous namespace
+
+TEST_F(WriteBatchTest, PutNotImplemented) {
+ WriteBatch batch;
+ ASSERT_OK(batch.Put(Slice("k1"), Slice("v1")));
+ ASSERT_EQ(1u, batch.Count());
+ ASSERT_EQ("Put(k1, v1)@0", PrintContents(&batch));
+
+ WriteBatch::Handler handler;
+ ASSERT_OK(batch.Iterate(&handler));
+}
+
+TEST_F(WriteBatchTest, DeleteNotImplemented) {
+ WriteBatch batch;
+ ASSERT_OK(batch.Delete(Slice("k2")));
+ ASSERT_EQ(1u, batch.Count());
+ ASSERT_EQ("Delete(k2)@0", PrintContents(&batch));
+
+ WriteBatch::Handler handler;
+ ASSERT_OK(batch.Iterate(&handler));
+}
+
+TEST_F(WriteBatchTest, SingleDeleteNotImplemented) {
+ WriteBatch batch;
+ ASSERT_OK(batch.SingleDelete(Slice("k2")));
+ ASSERT_EQ(1u, batch.Count());
+ ASSERT_EQ("SingleDelete(k2)@0", PrintContents(&batch));
+
+ WriteBatch::Handler handler;
+ ASSERT_OK(batch.Iterate(&handler));
+}
+
+TEST_F(WriteBatchTest, MergeNotImplemented) {
+ WriteBatch batch;
+ ASSERT_OK(batch.Merge(Slice("foo"), Slice("bar")));
+ ASSERT_EQ(1u, batch.Count());
+ ASSERT_EQ("Merge(foo, bar)@0", PrintContents(&batch));
+
+ WriteBatch::Handler handler;
+ ASSERT_OK(batch.Iterate(&handler));
+}
+
+TEST_F(WriteBatchTest, MergeWithoutOperatorInsertionFailure) {
+ WriteBatch batch;
+ ASSERT_OK(batch.Merge(Slice("foo"), Slice("bar")));
+ ASSERT_EQ(1u, batch.Count());
+ ASSERT_EQ(
+ "Invalid argument: Merge requires `ColumnFamilyOptions::merge_operator "
+ "!= nullptr`",
+ PrintContents(&batch, false /* merge_operator_supported */));
+}
+
+TEST_F(WriteBatchTest, Blob) {
+ WriteBatch batch;
+ ASSERT_OK(batch.Put(Slice("k1"), Slice("v1")));
+ ASSERT_OK(batch.Put(Slice("k2"), Slice("v2")));
+ ASSERT_OK(batch.Put(Slice("k3"), Slice("v3")));
+ ASSERT_OK(batch.PutLogData(Slice("blob1")));
+ ASSERT_OK(batch.Delete(Slice("k2")));
+ ASSERT_OK(batch.SingleDelete(Slice("k3")));
+ ASSERT_OK(batch.PutLogData(Slice("blob2")));
+ ASSERT_OK(batch.Merge(Slice("foo"), Slice("bar")));
+ ASSERT_EQ(6u, batch.Count());
+ ASSERT_EQ(
+ "Merge(foo, bar)@5"
+ "Put(k1, v1)@0"
+ "Delete(k2)@3"
+ "Put(k2, v2)@1"
+ "SingleDelete(k3)@4"
+ "Put(k3, v3)@2",
+ PrintContents(&batch));
+
+ TestHandler handler;
+ ASSERT_OK(batch.Iterate(&handler));
+ ASSERT_EQ(
+ "Put(k1, v1)"
+ "Put(k2, v2)"
+ "Put(k3, v3)"
+ "LogData(blob1)"
+ "Delete(k2)"
+ "SingleDelete(k3)"
+ "LogData(blob2)"
+ "Merge(foo, bar)",
+ handler.seen);
+}
+
+TEST_F(WriteBatchTest, PrepareCommit) {
+ WriteBatch batch;
+ ASSERT_OK(WriteBatchInternal::InsertNoop(&batch));
+ ASSERT_OK(batch.Put(Slice("k1"), Slice("v1")));
+ ASSERT_OK(batch.Put(Slice("k2"), Slice("v2")));
+ batch.SetSavePoint();
+ ASSERT_OK(WriteBatchInternal::MarkEndPrepare(&batch, Slice("xid1")));
+ Status s = batch.RollbackToSavePoint();
+ ASSERT_EQ(s, Status::NotFound());
+ ASSERT_OK(WriteBatchInternal::MarkCommit(&batch, Slice("xid1")));
+ ASSERT_OK(WriteBatchInternal::MarkRollback(&batch, Slice("xid1")));
+ ASSERT_EQ(2u, batch.Count());
+
+ TestHandler handler;
+ ASSERT_OK(batch.Iterate(&handler));
+ ASSERT_EQ(
+ "MarkBeginPrepare(false)"
+ "Put(k1, v1)"
+ "Put(k2, v2)"
+ "MarkEndPrepare(xid1)"
+ "MarkCommit(xid1)"
+ "MarkRollback(xid1)",
+ handler.seen);
+}
+
+// The test requires more than 30GB of memory to run, with a single memory
+// allocation of more than 30GB. Not all platforms can run it, and it also
+// takes a long time, so it is disabled.
+TEST_F(WriteBatchTest, DISABLED_ManyUpdates) {
+ // Insert roughly 3.2 billion tiny (4-byte) keys and values, pushing the
+ // total batch size past 30GB.
+ static const size_t kKeyValueSize = 4u;
+ static const uint32_t kNumUpdates = uint32_t{3} << 30;
+ std::string raw(kKeyValueSize, 'A');
+ WriteBatch batch(kNumUpdates * (4 + kKeyValueSize * 2) + 1024u);
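+ // (The constructor argument reserves kNumUpdates * 12 bytes, roughly 36GiB,
+ // up front in the batch's internal buffer.)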
+ char c = 'A';
+ for (uint32_t i = 0; i < kNumUpdates; i++) {
+ if (c > 'Z') {
+ c = 'A';
+ }
+ raw[0] = c;
+ raw[raw.length() - 1] = c;
+ c++;
+ ASSERT_OK(batch.Put(raw, raw));
+ }
+
+ ASSERT_EQ(kNumUpdates, batch.Count());
+
+ struct NoopHandler : public WriteBatch::Handler {
+ uint32_t num_seen = 0;
+ char expected_char = 'A';
+ Status PutCF(uint32_t /*column_family_id*/, const Slice& key,
+ const Slice& value) override {
+ EXPECT_EQ(kKeyValueSize, key.size());
+ EXPECT_EQ(kKeyValueSize, value.size());
+ EXPECT_EQ(expected_char, key[0]);
+ EXPECT_EQ(expected_char, value[0]);
+ EXPECT_EQ(expected_char, key[kKeyValueSize - 1]);
+ EXPECT_EQ(expected_char, value[kKeyValueSize - 1]);
+ expected_char++;
+ if (expected_char > 'Z') {
+ expected_char = 'A';
+ }
+ ++num_seen;
+ return Status::OK();
+ }
+ Status DeleteCF(uint32_t /*column_family_id*/,
+ const Slice& /*key*/) override {
+ ADD_FAILURE();
+ return Status::OK();
+ }
+ Status SingleDeleteCF(uint32_t /*column_family_id*/,
+ const Slice& /*key*/) override {
+ ADD_FAILURE();
+ return Status::OK();
+ }
+ Status MergeCF(uint32_t /*column_family_id*/, const Slice& /*key*/,
+ const Slice& /*value*/) override {
+ ADD_FAILURE();
+ return Status::OK();
+ }
+ void LogData(const Slice& /*blob*/) override { ADD_FAILURE(); }
+ bool Continue() override { return num_seen < kNumUpdates; }
+ } handler;
+
+ ASSERT_OK(batch.Iterate(&handler));
+ ASSERT_EQ(kNumUpdates, handler.num_seen);
+}
+
+// The test requires more than 18GB of memory to run, with a single memory
+// allocation of more than 12GB. Not all platforms can run it, so it is
+// disabled.
+TEST_F(WriteBatchTest, DISABLED_LargeKeyValue) {
+ // Insert keys and values of 3GB each, pushing the total batch size to 12GB.
+ static const size_t kKeyValueSize = 3221225472u;
+ std::string raw(kKeyValueSize, 'A');
+ WriteBatch batch(size_t(12884901888ull + 1024u));
+ for (char i = 0; i < 2; i++) {
+ raw[0] = 'A' + i;
+ raw[raw.length() - 1] = 'A' - i;
+ ASSERT_OK(batch.Put(raw, raw));
+ }
+
+ ASSERT_EQ(2u, batch.Count());
+
+ struct NoopHandler : public WriteBatch::Handler {
+ int num_seen = 0;
+ Status PutCF(uint32_t /*column_family_id*/, const Slice& key,
+ const Slice& value) override {
+ EXPECT_EQ(kKeyValueSize, key.size());
+ EXPECT_EQ(kKeyValueSize, value.size());
+ EXPECT_EQ('A' + num_seen, key[0]);
+ EXPECT_EQ('A' + num_seen, value[0]);
+ EXPECT_EQ('A' - num_seen, key[kKeyValueSize - 1]);
+ EXPECT_EQ('A' - num_seen, value[kKeyValueSize - 1]);
+ ++num_seen;
+ return Status::OK();
+ }
+ Status DeleteCF(uint32_t /*column_family_id*/,
+ const Slice& /*key*/) override {
+ ADD_FAILURE();
+ return Status::OK();
+ }
+ Status SingleDeleteCF(uint32_t /*column_family_id*/,
+ const Slice& /*key*/) override {
+ ADD_FAILURE();
+ return Status::OK();
+ }
+ Status MergeCF(uint32_t /*column_family_id*/, const Slice& /*key*/,
+ const Slice& /*value*/) override {
+ ADD_FAILURE();
+ return Status::OK();
+ }
+ void LogData(const Slice& /*blob*/) override { ADD_FAILURE(); }
+ bool Continue() override { return num_seen < 2; }
+ } handler;
+
+ ASSERT_OK(batch.Iterate(&handler));
+ ASSERT_EQ(2, handler.num_seen);
+}
+
+TEST_F(WriteBatchTest, Continue) {
+ WriteBatch batch;
+
+ struct Handler : public TestHandler {
+ int num_seen = 0;
+ Status PutCF(uint32_t column_family_id, const Slice& key,
+ const Slice& value) override {
+ ++num_seen;
+ return TestHandler::PutCF(column_family_id, key, value);
+ }
+ Status DeleteCF(uint32_t column_family_id, const Slice& key) override {
+ ++num_seen;
+ return TestHandler::DeleteCF(column_family_id, key);
+ }
+ Status SingleDeleteCF(uint32_t column_family_id,
+ const Slice& key) override {
+ ++num_seen;
+ return TestHandler::SingleDeleteCF(column_family_id, key);
+ }
+ Status MergeCF(uint32_t column_family_id, const Slice& key,
+ const Slice& value) override {
+ ++num_seen;
+ return TestHandler::MergeCF(column_family_id, key, value);
+ }
+ void LogData(const Slice& blob) override {
+ ++num_seen;
+ TestHandler::LogData(blob);
+ }
+ bool Continue() override { return num_seen < 5; }
+ } handler;
+
+ ASSERT_OK(batch.Put(Slice("k1"), Slice("v1")));
+ ASSERT_OK(batch.Put(Slice("k2"), Slice("v2")));
+ ASSERT_OK(batch.PutLogData(Slice("blob1")));
+ ASSERT_OK(batch.Delete(Slice("k1")));
+ ASSERT_OK(batch.SingleDelete(Slice("k2")));
+ ASSERT_OK(batch.PutLogData(Slice("blob2")));
+ ASSERT_OK(batch.Merge(Slice("foo"), Slice("bar")));
+ ASSERT_OK(batch.Iterate(&handler));
+ ASSERT_EQ(
+ "Put(k1, v1)"
+ "Put(k2, v2)"
+ "LogData(blob1)"
+ "Delete(k1)"
+ "SingleDelete(k2)",
+ handler.seen);
+}
+
+TEST_F(WriteBatchTest, PutGatherSlices) {
+ WriteBatch batch;
+ ASSERT_OK(batch.Put(Slice("foo"), Slice("bar")));
+
+ {
+ // Try a write where the key is one slice but the value is two
+ Slice key_slice("baz");
+ Slice value_slices[2] = {Slice("header"), Slice("payload")};
+ ASSERT_OK(
+ batch.Put(SliceParts(&key_slice, 1), SliceParts(value_slices, 2)));
+ }
+
+ {
+ // One where the key is composite but the value is a single slice
+ Slice key_slices[3] = {Slice("key"), Slice("part2"), Slice("part3")};
+ Slice value_slice("value");
+ ASSERT_OK(
+ batch.Put(SliceParts(key_slices, 3), SliceParts(&value_slice, 1)));
+ }
+
+ WriteBatchInternal::SetSequence(&batch, 100);
+ ASSERT_EQ(
+ "Put(baz, headerpayload)@101"
+ "Put(foo, bar)@100"
+ "Put(keypart2part3, value)@102",
+ PrintContents(&batch));
+ ASSERT_EQ(3u, batch.Count());
+}
+
+namespace {
+class ColumnFamilyHandleImplDummy : public ColumnFamilyHandleImpl {
+ public:
+ explicit ColumnFamilyHandleImplDummy(int id)
+ : ColumnFamilyHandleImpl(nullptr, nullptr, nullptr), id_(id) {}
+ explicit ColumnFamilyHandleImplDummy(int id, const Comparator* ucmp)
+ : ColumnFamilyHandleImpl(nullptr, nullptr, nullptr),
+ id_(id),
+ ucmp_(ucmp) {}
+ uint32_t GetID() const override { return id_; }
+ const Comparator* GetComparator() const override { return ucmp_; }
+
+ private:
+ uint32_t id_;
+ const Comparator* const ucmp_ = BytewiseComparator();
+};
+} // anonymous namespace
+
+TEST_F(WriteBatchTest, ColumnFamiliesBatchTest) {
+ WriteBatch batch;
+ ColumnFamilyHandleImplDummy zero(0), two(2), three(3), eight(8);
+ ASSERT_OK(batch.Put(&zero, Slice("foo"), Slice("bar")));
+ ASSERT_OK(batch.Put(&two, Slice("twofoo"), Slice("bar2")));
+ ASSERT_OK(batch.Put(&eight, Slice("eightfoo"), Slice("bar8")));
+ ASSERT_OK(batch.Delete(&eight, Slice("eightfoo")));
+ ASSERT_OK(batch.SingleDelete(&two, Slice("twofoo")));
+ ASSERT_OK(batch.DeleteRange(&two, Slice("3foo"), Slice("4foo")));
+ ASSERT_OK(batch.Merge(&three, Slice("threethree"), Slice("3three")));
+ ASSERT_OK(batch.Put(&zero, Slice("foo"), Slice("bar")));
+ ASSERT_OK(batch.Merge(Slice("omom"), Slice("nom")));
+
+ TestHandler handler;
+ ASSERT_OK(batch.Iterate(&handler));
+ ASSERT_EQ(
+ "Put(foo, bar)"
+ "PutCF(2, twofoo, bar2)"
+ "PutCF(8, eightfoo, bar8)"
+ "DeleteCF(8, eightfoo)"
+ "SingleDeleteCF(2, twofoo)"
+ "DeleteRangeCF(2, 3foo, 4foo)"
+ "MergeCF(3, threethree, 3three)"
+ "Put(foo, bar)"
+ "Merge(omom, nom)",
+ handler.seen);
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(WriteBatchTest, ColumnFamiliesBatchWithIndexTest) {
+ WriteBatchWithIndex batch;
+ ColumnFamilyHandleImplDummy zero(0), two(2), three(3), eight(8);
+ ASSERT_OK(batch.Put(&zero, Slice("foo"), Slice("bar")));
+ ASSERT_OK(batch.Put(&two, Slice("twofoo"), Slice("bar2")));
+ ASSERT_OK(batch.Put(&eight, Slice("eightfoo"), Slice("bar8")));
+ ASSERT_OK(batch.Delete(&eight, Slice("eightfoo")));
+ ASSERT_OK(batch.SingleDelete(&two, Slice("twofoo")));
+ ASSERT_OK(batch.Merge(&three, Slice("threethree"), Slice("3three")));
+ ASSERT_OK(batch.Put(&zero, Slice("foo"), Slice("bar")));
+ ASSERT_OK(batch.Merge(Slice("omom"), Slice("nom")));
+
+ std::unique_ptr<WBWIIterator> iter;
+
+ iter.reset(batch.NewIterator(&eight));
+ iter->Seek("eightfoo");
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(WriteType::kPutRecord, iter->Entry().type);
+ ASSERT_EQ("eightfoo", iter->Entry().key.ToString());
+ ASSERT_EQ("bar8", iter->Entry().value.ToString());
+
+ iter->Next();
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(WriteType::kDeleteRecord, iter->Entry().type);
+ ASSERT_EQ("eightfoo", iter->Entry().key.ToString());
+
+ iter->Next();
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(!iter->Valid());
+
+ iter.reset(batch.NewIterator(&two));
+ iter->Seek("twofoo");
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(WriteType::kPutRecord, iter->Entry().type);
+ ASSERT_EQ("twofoo", iter->Entry().key.ToString());
+ ASSERT_EQ("bar2", iter->Entry().value.ToString());
+
+ iter->Next();
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(WriteType::kSingleDeleteRecord, iter->Entry().type);
+ ASSERT_EQ("twofoo", iter->Entry().key.ToString());
+
+ iter->Next();
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(!iter->Valid());
+
+ iter.reset(batch.NewIterator());
+ iter->Seek("gggg");
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(WriteType::kMergeRecord, iter->Entry().type);
+ ASSERT_EQ("omom", iter->Entry().key.ToString());
+ ASSERT_EQ("nom", iter->Entry().value.ToString());
+
+ iter->Next();
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(!iter->Valid());
+
+ iter.reset(batch.NewIterator(&zero));
+ iter->Seek("foo");
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(WriteType::kPutRecord, iter->Entry().type);
+ ASSERT_EQ("foo", iter->Entry().key.ToString());
+ ASSERT_EQ("bar", iter->Entry().value.ToString());
+
+ iter->Next();
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(WriteType::kPutRecord, iter->Entry().type);
+ ASSERT_EQ("foo", iter->Entry().key.ToString());
+ ASSERT_EQ("bar", iter->Entry().value.ToString());
+
+ iter->Next();
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(WriteType::kMergeRecord, iter->Entry().type);
+ ASSERT_EQ("omom", iter->Entry().key.ToString());
+ ASSERT_EQ("nom", iter->Entry().value.ToString());
+
+ iter->Next();
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(!iter->Valid());
+
+ TestHandler handler;
+ ASSERT_OK(batch.GetWriteBatch()->Iterate(&handler));
+ ASSERT_EQ(
+ "Put(foo, bar)"
+ "PutCF(2, twofoo, bar2)"
+ "PutCF(8, eightfoo, bar8)"
+ "DeleteCF(8, eightfoo)"
+ "SingleDeleteCF(2, twofoo)"
+ "MergeCF(3, threethree, 3three)"
+ "Put(foo, bar)"
+ "Merge(omom, nom)",
+ handler.seen);
+}
+#endif // !ROCKSDB_LITE
+
+TEST_F(WriteBatchTest, SavePointTest) {
+ Status s;
+ WriteBatch batch;
+ batch.SetSavePoint();
+
+ ASSERT_OK(batch.Put("A", "a"));
+ ASSERT_OK(batch.Put("B", "b"));
+ batch.SetSavePoint();
+
+ ASSERT_OK(batch.Put("C", "c"));
+ ASSERT_OK(batch.Delete("A"));
+ batch.SetSavePoint();
+ batch.SetSavePoint();
+
+ ASSERT_OK(batch.RollbackToSavePoint());
+ ASSERT_EQ(
+ "Delete(A)@3"
+ "Put(A, a)@0"
+ "Put(B, b)@1"
+ "Put(C, c)@2",
+ PrintContents(&batch));
+
+ ASSERT_OK(batch.RollbackToSavePoint());
+ ASSERT_OK(batch.RollbackToSavePoint());
+ ASSERT_EQ(
+ "Put(A, a)@0"
+ "Put(B, b)@1",
+ PrintContents(&batch));
+
+ ASSERT_OK(batch.Delete("A"));
+ ASSERT_OK(batch.Put("B", "bb"));
+
+ ASSERT_OK(batch.RollbackToSavePoint());
+ ASSERT_EQ("", PrintContents(&batch));
+
+ s = batch.RollbackToSavePoint();
+ ASSERT_TRUE(s.IsNotFound());
+ ASSERT_EQ("", PrintContents(&batch));
+
+ ASSERT_OK(batch.Put("D", "d"));
+ ASSERT_OK(batch.Delete("A"));
+
+ batch.SetSavePoint();
+
+ ASSERT_OK(batch.Put("A", "aaa"));
+
+ ASSERT_OK(batch.RollbackToSavePoint());
+ ASSERT_EQ(
+ "Delete(A)@1"
+ "Put(D, d)@0",
+ PrintContents(&batch));
+
+ batch.SetSavePoint();
+
+ ASSERT_OK(batch.Put("D", "d"));
+ ASSERT_OK(batch.Delete("A"));
+
+ ASSERT_OK(batch.RollbackToSavePoint());
+ ASSERT_EQ(
+ "Delete(A)@1"
+ "Put(D, d)@0",
+ PrintContents(&batch));
+
+ s = batch.RollbackToSavePoint();
+ ASSERT_TRUE(s.IsNotFound());
+ ASSERT_EQ(
+ "Delete(A)@1"
+ "Put(D, d)@0",
+ PrintContents(&batch));
+
+ WriteBatch batch2;
+
+ s = batch2.RollbackToSavePoint();
+ ASSERT_TRUE(s.IsNotFound());
+ ASSERT_EQ("", PrintContents(&batch2));
+
+ ASSERT_OK(batch2.Delete("A"));
+ batch2.SetSavePoint();
+
+ s = batch2.RollbackToSavePoint();
+ ASSERT_OK(s);
+ ASSERT_EQ("Delete(A)@0", PrintContents(&batch2));
+
+ batch2.Clear();
+ ASSERT_EQ("", PrintContents(&batch2));
+
+ batch2.SetSavePoint();
+
+ ASSERT_OK(batch2.Delete("B"));
+ ASSERT_EQ("Delete(B)@0", PrintContents(&batch2));
+
+ batch2.SetSavePoint();
+ s = batch2.RollbackToSavePoint();
+ ASSERT_OK(s);
+ ASSERT_EQ("Delete(B)@0", PrintContents(&batch2));
+
+ s = batch2.RollbackToSavePoint();
+ ASSERT_OK(s);
+ ASSERT_EQ("", PrintContents(&batch2));
+
+ s = batch2.RollbackToSavePoint();
+ ASSERT_TRUE(s.IsNotFound());
+ ASSERT_EQ("", PrintContents(&batch2));
+
+ WriteBatch batch3;
+
+ s = batch3.PopSavePoint();
+ ASSERT_TRUE(s.IsNotFound());
+ ASSERT_EQ("", PrintContents(&batch3));
+
+ batch3.SetSavePoint();
+ ASSERT_OK(batch3.Delete("A"));
+
+ s = batch3.PopSavePoint();
+ ASSERT_OK(s);
+ ASSERT_EQ("Delete(A)@0", PrintContents(&batch3));
+}
+
+TEST_F(WriteBatchTest, MemoryLimitTest) {
+ Status s;
+ // The header size is 12 bytes, and each of the two Puts takes 8 bytes, which
+ // gives a total of 12 + 8 * 2 = 28 bytes.
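+ // (Rough breakdown, assuming the standard WriteBatch record encoding: each
+ // Put here is a 1-byte kTypeValue tag, a 1-byte varint key length, 1 byte of
+ // key, a 1-byte varint value length, and 4 bytes of value = 8 bytes; the
+ // 12-byte header is an 8-byte sequence number plus a 4-byte count.)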
+ WriteBatch batch(0, 28);
+
+ ASSERT_OK(batch.Put("a", "...."));
+ ASSERT_OK(batch.Put("b", "...."));
+ s = batch.Put("c", "....");
+ ASSERT_TRUE(s.IsMemoryLimit());
+}
+
+namespace {
+class TimestampChecker : public WriteBatch::Handler {
+ public:
+ explicit TimestampChecker(
+ std::unordered_map<uint32_t, const Comparator*> cf_to_ucmps, Slice ts)
+ : cf_to_ucmps_(std::move(cf_to_ucmps)), timestamp_(std::move(ts)) {}
+ Status PutCF(uint32_t cf, const Slice& key, const Slice& /*value*/) override {
+ auto cf_iter = cf_to_ucmps_.find(cf);
+ if (cf_iter == cf_to_ucmps_.end()) {
+ return Status::Corruption();
+ }
+ const Comparator* const ucmp = cf_iter->second;
+ assert(ucmp);
+ size_t ts_sz = ucmp->timestamp_size();
+ if (ts_sz == 0) {
+ return Status::OK();
+ }
+ if (key.size() < ts_sz) {
+ return Status::Corruption();
+ }
+ Slice ts = ExtractTimestampFromUserKey(key, ts_sz);
+ if (ts.compare(timestamp_) != 0) {
+ return Status::Corruption();
+ }
+ return Status::OK();
+ }
+
+ private:
+ std::unordered_map<uint32_t, const Comparator*> cf_to_ucmps_;
+ Slice timestamp_;
+};
+
+Status CheckTimestampsInWriteBatch(
+ WriteBatch& wb, Slice timestamp,
+ std::unordered_map<uint32_t, const Comparator*> cf_to_ucmps) {
+ TimestampChecker ts_checker(cf_to_ucmps, timestamp);
+ return wb.Iterate(&ts_checker);
+}
+} // anonymous namespace
+
+TEST_F(WriteBatchTest, SanityChecks) {
+ ColumnFamilyHandleImplDummy cf0(0,
+ test::BytewiseComparatorWithU64TsWrapper());
+ ColumnFamilyHandleImplDummy cf4(4);
+
+ WriteBatch wb(0, 0, 0, /*default_cf_ts_sz=*/sizeof(uint64_t));
+
+ // Sanity checks for the new WriteBatch APIs with extra 'ts' arg.
+ ASSERT_TRUE(wb.Put(nullptr, "key", "ts", "value").IsInvalidArgument());
+ ASSERT_TRUE(wb.Delete(nullptr, "key", "ts").IsInvalidArgument());
+ ASSERT_TRUE(wb.SingleDelete(nullptr, "key", "ts").IsInvalidArgument());
+ ASSERT_TRUE(wb.Merge(nullptr, "key", "ts", "value").IsInvalidArgument());
+ ASSERT_TRUE(wb.DeleteRange(nullptr, "begin_key", "end_key", "ts")
+ .IsInvalidArgument());
+
+ ASSERT_TRUE(wb.Put(&cf4, "key", "ts", "value").IsInvalidArgument());
+ ASSERT_TRUE(wb.Delete(&cf4, "key", "ts").IsInvalidArgument());
+ ASSERT_TRUE(wb.SingleDelete(&cf4, "key", "ts").IsInvalidArgument());
+ ASSERT_TRUE(wb.Merge(&cf4, "key", "ts", "value").IsInvalidArgument());
+ ASSERT_TRUE(
+ wb.DeleteRange(&cf4, "begin_key", "end_key", "ts").IsInvalidArgument());
+
+ constexpr size_t wrong_ts_sz = 1 + sizeof(uint64_t);
+ std::string ts(wrong_ts_sz, '\0');
+
+ ASSERT_TRUE(wb.Put(&cf0, "key", ts, "value").IsInvalidArgument());
+ ASSERT_TRUE(wb.Delete(&cf0, "key", ts).IsInvalidArgument());
+ ASSERT_TRUE(wb.SingleDelete(&cf0, "key", ts).IsInvalidArgument());
+ ASSERT_TRUE(wb.Merge(&cf0, "key", ts, "value").IsInvalidArgument());
+ ASSERT_TRUE(
+ wb.DeleteRange(&cf0, "begin_key", "end_key", ts).IsInvalidArgument());
+
+ // Sanity checks for the new WriteBatch APIs without extra 'ts' arg.
+ WriteBatch wb1(0, 0, 0, wrong_ts_sz);
+ ASSERT_TRUE(wb1.Put(&cf0, "key", "value").IsInvalidArgument());
+ ASSERT_TRUE(wb1.Delete(&cf0, "key").IsInvalidArgument());
+ ASSERT_TRUE(wb1.SingleDelete(&cf0, "key").IsInvalidArgument());
+ ASSERT_TRUE(wb1.Merge(&cf0, "key", "value").IsInvalidArgument());
+ ASSERT_TRUE(
+ wb1.DeleteRange(&cf0, "begin_key", "end_key").IsInvalidArgument());
+}
+
+TEST_F(WriteBatchTest, UpdateTimestamps) {
+ // We assume the last eight bytes of each key are reserved for the timestamp.
+ // Therefore, we must make sure each key is longer than eight bytes.
+ constexpr size_t key_size = 16;
+ constexpr size_t num_of_keys = 10;
+ std::vector<std::string> key_strs(num_of_keys, std::string(key_size, '\0'));
+
+ ColumnFamilyHandleImplDummy cf0(0);
+ ColumnFamilyHandleImplDummy cf4(4,
+ test::BytewiseComparatorWithU64TsWrapper());
+ ColumnFamilyHandleImplDummy cf5(5,
+ test::BytewiseComparatorWithU64TsWrapper());
+
+ const std::unordered_map<uint32_t, const Comparator*> cf_to_ucmps = {
+ {0, cf0.GetComparator()},
+ {4, cf4.GetComparator()},
+ {5, cf5.GetComparator()}};
+
+ static constexpr size_t timestamp_size = sizeof(uint64_t);
+
+ {
+ WriteBatch wb1, wb2, wb3, wb4, wb5, wb6, wb7;
+ ASSERT_OK(wb1.Put(&cf0, "key", "value"));
+ ASSERT_FALSE(WriteBatchInternal::HasKeyWithTimestamp(wb1));
+ ASSERT_OK(wb2.Put(&cf4, "key", "value"));
+ ASSERT_TRUE(WriteBatchInternal::HasKeyWithTimestamp(wb2));
+ ASSERT_OK(wb3.Put(&cf4, "key", /*ts=*/std::string(timestamp_size, '\xfe'),
+ "value"));
+ ASSERT_TRUE(WriteBatchInternal::HasKeyWithTimestamp(wb3));
+ ASSERT_OK(wb4.Delete(&cf4, "key",
+ /*ts=*/std::string(timestamp_size, '\xfe')));
+ ASSERT_TRUE(WriteBatchInternal::HasKeyWithTimestamp(wb4));
+ ASSERT_OK(wb5.Delete(&cf4, "key"));
+ ASSERT_TRUE(WriteBatchInternal::HasKeyWithTimestamp(wb5));
+ ASSERT_OK(wb6.SingleDelete(&cf4, "key"));
+ ASSERT_TRUE(WriteBatchInternal::HasKeyWithTimestamp(wb6));
+ ASSERT_OK(wb7.SingleDelete(&cf4, "key",
+ /*ts=*/std::string(timestamp_size, '\xfe')));
+ ASSERT_TRUE(WriteBatchInternal::HasKeyWithTimestamp(wb7));
+ }
+
+ WriteBatch batch;
+ // Write to the batch. We will assign timestamps later.
+ for (const auto& key_str : key_strs) {
+ ASSERT_OK(batch.Put(&cf0, key_str, "value"));
+ ASSERT_OK(batch.Put(&cf4, key_str, "value"));
+ ASSERT_OK(batch.Put(&cf5, key_str, "value"));
+ }
+
+ const auto checker1 = [](uint32_t cf) {
+ if (cf == 4 || cf == 5) {
+ return timestamp_size;
+ } else if (cf == 0) {
+ return static_cast<size_t>(0);
+ } else {
+ return std::numeric_limits<size_t>::max();
+ }
+ };
+ ASSERT_OK(
+ batch.UpdateTimestamps(std::string(timestamp_size, '\xfe'), checker1));
+ ASSERT_OK(CheckTimestampsInWriteBatch(
+ batch, std::string(timestamp_size, '\xfe'), cf_to_ucmps));
+
+ // We use indexed_cf_to_ucmps, non_indexed_cfs_with_ts and timestamp_size to
+ // simulate the case in which a transaction enables indexing for some writes
+ // while disabling indexing for others. A transaction uses a
+ // WriteBatchWithIndex object to buffer writes (we consider the
+ // write-committed policy only). If indexing is enabled, writes go through the
+ // WriteBatchWithIndex API, which populates a WBWI-internal data structure,
+ // i.e. a mapping from cf to user comparator. If indexing is disabled, the
+ // transaction writes directly to the underlying raw WriteBatch, so we must
+ // separately track the comparator information for the column families that
+ // receive un-indexed writes. When calling the UpdateTimestamps API of
+ // WriteBatch, we need indexed_cf_to_ucmps, non_indexed_cfs_with_ts, and
+ // timestamp_size to perform the checking.
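+ // In this test, cf0 and cf4 play the role of indexed column families (their
+ // comparators are in indexed_cf_to_ucmps), while cf5 receives the un-indexed
+ // writes, so its timestamp size has to be supplied via
+ // non_indexed_cfs_with_ts.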
+ std::unordered_map<uint32_t, const Comparator*> indexed_cf_to_ucmps = {
+ {0, cf0.GetComparator()}, {4, cf4.GetComparator()}};
+ std::unordered_set<uint32_t> non_indexed_cfs_with_ts = {cf5.GetID()};
+ const auto checker2 = [&indexed_cf_to_ucmps,
+ &non_indexed_cfs_with_ts](uint32_t cf) {
+ if (non_indexed_cfs_with_ts.count(cf) > 0) {
+ return timestamp_size;
+ }
+ auto cf_iter = indexed_cf_to_ucmps.find(cf);
+ if (cf_iter == indexed_cf_to_ucmps.end()) {
+ assert(false);
+ return std::numeric_limits<size_t>::max();
+ }
+ const Comparator* const ucmp = cf_iter->second;
+ assert(ucmp);
+ return ucmp->timestamp_size();
+ };
+ ASSERT_OK(
+ batch.UpdateTimestamps(std::string(timestamp_size, '\xef'), checker2));
+ ASSERT_OK(CheckTimestampsInWriteBatch(
+ batch, std::string(timestamp_size, '\xef'), cf_to_ucmps));
+}
+
+TEST_F(WriteBatchTest, CommitWithTimestamp) {
+ WriteBatch wb;
+ const std::string txn_name = "xid1";
+ std::string ts;
+ constexpr uint64_t commit_ts = 23;
+ PutFixed64(&ts, commit_ts);
+ ASSERT_OK(WriteBatchInternal::MarkCommitWithTimestamp(&wb, txn_name, ts));
+ TestHandler handler;
+ ASSERT_OK(wb.Iterate(&handler));
+ ASSERT_EQ("MarkCommitWithTimestamp(" + txn_name + ", " +
+ Slice(ts).ToString(true) + ")",
+ handler.seen);
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/write_callback.h b/src/rocksdb/db/write_callback.h
new file mode 100644
index 000000000..106d02041
--- /dev/null
+++ b/src/rocksdb/db/write_callback.h
@@ -0,0 +1,27 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DB;
+
+class WriteCallback {
+ public:
+ virtual ~WriteCallback() {}
+
+ // Will be called while on the write thread before the write executes. If
+ // this function returns a non-OK status, the write will be aborted and this
+ // status will be returned to the caller of DB::Write().
+ virtual Status Callback(DB* db) = 0;
+
+ // Return true if writes with this callback can be batched with other writes.
+ virtual bool AllowWriteBatching() = 0;
+};
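+
+// Illustrative sketch only (not part of this header): a WriteCallback that
+// rejects the write unless an application-level flag is still set.
+//
+//   class AbortIfClosing : public WriteCallback {
+//    public:
+//     explicit AbortIfClosing(std::atomic<bool>* open) : open_(open) {}
+//     Status Callback(DB* /*db*/) override {
+//       return open_->load() ? Status::OK() : Status::Busy("db closing");
+//     }
+//     bool AllowWriteBatching() override { return true; }
+//
+//    private:
+//     std::atomic<bool>* open_;  // assumed to outlive the callback
+//   };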
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/write_callback_test.cc b/src/rocksdb/db/write_callback_test.cc
new file mode 100644
index 000000000..e6ebaae08
--- /dev/null
+++ b/src/rocksdb/db/write_callback_test.cc
@@ -0,0 +1,465 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "db/write_callback.h"
+
+#include <atomic>
+#include <functional>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "db/db_impl/db_impl.h"
+#include "port/port.h"
+#include "rocksdb/db.h"
+#include "rocksdb/write_batch.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "util/random.h"
+
+using std::string;
+
+namespace ROCKSDB_NAMESPACE {
+
+class WriteCallbackTest : public testing::Test {
+ public:
+ string dbname;
+
+ WriteCallbackTest() {
+ dbname = test::PerThreadDBPath("write_callback_testdb");
+ }
+};
+
+class WriteCallbackTestWriteCallback1 : public WriteCallback {
+ public:
+ bool was_called = false;
+
+ Status Callback(DB* db) override {
+ was_called = true;
+
+ // Make sure db is a DBImpl
+ DBImpl* db_impl = dynamic_cast<DBImpl*>(db);
+ if (db_impl == nullptr) {
+ return Status::InvalidArgument("");
+ }
+
+ return Status::OK();
+ }
+
+ bool AllowWriteBatching() override { return true; }
+};
+
+class WriteCallbackTestWriteCallback2 : public WriteCallback {
+ public:
+ Status Callback(DB* /*db*/) override { return Status::Busy(); }
+ bool AllowWriteBatching() override { return true; }
+};
+
+class MockWriteCallback : public WriteCallback {
+ public:
+ bool should_fail_ = false;
+ bool allow_batching_ = false;
+ std::atomic<bool> was_called_{false};
+
+ MockWriteCallback() {}
+
+ MockWriteCallback(const MockWriteCallback& other) {
+ should_fail_ = other.should_fail_;
+ allow_batching_ = other.allow_batching_;
+ was_called_.store(other.was_called_.load());
+ }
+
+ Status Callback(DB* /*db*/) override {
+ was_called_.store(true);
+ if (should_fail_) {
+ return Status::Busy();
+ } else {
+ return Status::OK();
+ }
+ }
+
+ bool AllowWriteBatching() override { return allow_batching_; }
+};
+
+#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+class WriteCallbackPTest
+ : public WriteCallbackTest,
+ public ::testing::WithParamInterface<
+ std::tuple<bool, bool, bool, bool, bool, bool, bool>> {
+ public:
+ WriteCallbackPTest() {
+ std::tie(unordered_write_, seq_per_batch_, two_queues_, allow_parallel_,
+ allow_batching_, enable_WAL_, enable_pipelined_write_) =
+ GetParam();
+ }
+
+ protected:
+ bool unordered_write_;
+ bool seq_per_batch_;
+ bool two_queues_;
+ bool allow_parallel_;
+ bool allow_batching_;
+ bool enable_WAL_;
+ bool enable_pipelined_write_;
+};
+
+TEST_P(WriteCallbackPTest, WriteWithCallbackTest) {
+ struct WriteOP {
+ WriteOP(bool should_fail = false) { callback_.should_fail_ = should_fail; }
+
+ void Put(const string& key, const string& val) {
+ kvs_.push_back(std::make_pair(key, val));
+ ASSERT_OK(write_batch_.Put(key, val));
+ }
+
+ void Clear() {
+ kvs_.clear();
+ write_batch_.Clear();
+ callback_.was_called_.store(false);
+ }
+
+ MockWriteCallback callback_;
+ WriteBatch write_batch_;
+ std::vector<std::pair<string, string>> kvs_;
+ };
+
+ // In each scenario we'll launch multiple threads to write.
+ // The size of each array equals the number of threads, and
+ // each boolean in it denotes whether the callback of the
+ // corresponding thread should succeed or fail.
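+ // For example, {true, false} launches two writer threads: the first thread's
+ // callback fails (returns Status::Busy()), while the second succeeds.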
+ std::vector<std::vector<WriteOP>> write_scenarios = {
+ {true},
+ {false},
+ {false, false},
+ {true, true},
+ {true, false},
+ {false, true},
+ {false, false, false},
+ {true, true, true},
+ {false, true, false},
+ {true, false, true},
+ {true, false, false, false, false},
+ {false, false, false, false, true},
+ {false, false, true, false, true},
+ };
+
+ for (auto& write_group : write_scenarios) {
+ Options options;
+ options.create_if_missing = true;
+ options.unordered_write = unordered_write_;
+ options.allow_concurrent_memtable_write = allow_parallel_;
+ options.enable_pipelined_write = enable_pipelined_write_;
+ options.two_write_queues = two_queues_;
+ // Skip unsupported combinations
+ if (options.enable_pipelined_write && seq_per_batch_) {
+ continue;
+ }
+ if (options.enable_pipelined_write && options.two_write_queues) {
+ continue;
+ }
+ if (options.unordered_write && !options.allow_concurrent_memtable_write) {
+ continue;
+ }
+ if (options.unordered_write && options.enable_pipelined_write) {
+ continue;
+ }
+
+ ReadOptions read_options;
+ DB* db;
+ DBImpl* db_impl;
+
+ ASSERT_OK(DestroyDB(dbname, options));
+
+ DBOptions db_options(options);
+ ColumnFamilyOptions cf_options(options);
+ std::vector<ColumnFamilyDescriptor> column_families;
+ column_families.push_back(
+ ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options));
+ std::vector<ColumnFamilyHandle*> handles;
+ auto open_s = DBImpl::Open(db_options, dbname, column_families, &handles,
+ &db, seq_per_batch_, true /* batch_per_txn */);
+ ASSERT_OK(open_s);
+ assert(handles.size() == 1);
+ delete handles[0];
+
+ db_impl = dynamic_cast<DBImpl*>(db);
+ ASSERT_TRUE(db_impl);
+
+ // Writers that have called JoinBatchGroup.
+ std::atomic<uint64_t> threads_joining(0);
+ // Writers that have linked to the queue
+ std::atomic<uint64_t> threads_linked(0);
+ // Writers that pass WriteThread::JoinBatchGroup:Wait sync-point.
+ std::atomic<uint64_t> threads_verified(0);
+
+ std::atomic<uint64_t> seq(db_impl->GetLatestSequenceNumber());
+ ASSERT_EQ(db_impl->GetLatestSequenceNumber(), 0);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "WriteThread::JoinBatchGroup:Start", [&](void*) {
+ uint64_t cur_threads_joining = threads_joining.fetch_add(1);
+ // Wait for the last joined writer to link to the queue.
+ // In this way the writers link to the queue one by one.
+ // This allows us to confidently detect the first writer
+ // who increases threads_linked as the leader.
+ while (threads_linked.load() < cur_threads_joining) {
+ }
+ });
+
+ // Verification once writers call JoinBatchGroup.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "WriteThread::JoinBatchGroup:Wait", [&](void* arg) {
+ uint64_t cur_threads_linked = threads_linked.fetch_add(1);
+ bool is_leader = false;
+ bool is_last = false;
+
+ // who am i
+ is_leader = (cur_threads_linked == 0);
+ is_last = (cur_threads_linked == write_group.size() - 1);
+
+ // check my state
+ auto* writer = reinterpret_cast<WriteThread::Writer*>(arg);
+
+ if (is_leader) {
+ ASSERT_TRUE(writer->state ==
+ WriteThread::State::STATE_GROUP_LEADER);
+ } else {
+ ASSERT_TRUE(writer->state == WriteThread::State::STATE_INIT);
+ }
+
+ // (meta test) the first WriteOP should indeed be the first
+ // and the last should be the last (all others can be out of
+ // order)
+ if (is_leader) {
+ ASSERT_TRUE(writer->callback->Callback(nullptr).ok() ==
+ !write_group.front().callback_.should_fail_);
+ } else if (is_last) {
+ ASSERT_TRUE(writer->callback->Callback(nullptr).ok() ==
+ !write_group.back().callback_.should_fail_);
+ }
+
+ threads_verified.fetch_add(1);
+ // Wait here until all verification in this sync-point
+ // callback finish for all writers.
+ while (threads_verified.load() < write_group.size()) {
+ }
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "WriteThread::JoinBatchGroup:DoneWaiting", [&](void* arg) {
+ // check my state
+ auto* writer = reinterpret_cast<WriteThread::Writer*>(arg);
+
+ if (!allow_batching_) {
+ // no batching so everyone should be a leader
+ ASSERT_TRUE(writer->state ==
+ WriteThread::State::STATE_GROUP_LEADER);
+ } else if (!allow_parallel_) {
+ ASSERT_TRUE(writer->state == WriteThread::State::STATE_COMPLETED ||
+ (enable_pipelined_write_ &&
+ writer->state ==
+ WriteThread::State::STATE_MEMTABLE_WRITER_LEADER));
+ }
+ });
+
+ std::atomic<uint32_t> thread_num(0);
+ std::atomic<char> dummy_key(0);
+
+ // Each write thread creates a random write batch and writes it to the DB
+ // with a write callback.
+ std::function<void()> write_with_callback_func = [&]() {
+ uint32_t i = thread_num.fetch_add(1);
+ Random rnd(i);
+
+ // leaders gotta lead
+ while (i > 0 && threads_verified.load() < 1) {
+ }
+
+ // loser has to lose
+ while (i == write_group.size() - 1 &&
+ threads_verified.load() < write_group.size() - 1) {
+ }
+
+ auto& write_op = write_group.at(i);
+ write_op.Clear();
+ write_op.callback_.allow_batching_ = allow_batching_;
+
+ // insert some keys
+ for (uint32_t j = 0; j < rnd.Next() % 50; j++) {
+ // grab unique key
+ char my_key = dummy_key.fetch_add(1);
+
+ string skey(5, my_key);
+ string sval(10, my_key);
+ write_op.Put(skey, sval);
+
+ if (!write_op.callback_.should_fail_ && !seq_per_batch_) {
+ seq.fetch_add(1);
+ }
+ }
+ if (!write_op.callback_.should_fail_ && seq_per_batch_) {
+ seq.fetch_add(1);
+ }
+
+ WriteOptions woptions;
+ woptions.disableWAL = !enable_WAL_;
+ woptions.sync = enable_WAL_;
+ if (woptions.protection_bytes_per_key > 0) {
+ ASSERT_OK(WriteBatchInternal::UpdateProtectionInfo(
+ &write_op.write_batch_, woptions.protection_bytes_per_key));
+ }
+ Status s;
+ if (seq_per_batch_) {
+ class PublishSeqCallback : public PreReleaseCallback {
+ public:
+ PublishSeqCallback(DBImpl* db_impl_in) : db_impl_(db_impl_in) {}
+ Status Callback(SequenceNumber last_seq, bool /*not used*/, uint64_t,
+ size_t /*index*/, size_t /*total*/) override {
+ db_impl_->SetLastPublishedSequence(last_seq);
+ return Status::OK();
+ }
+ DBImpl* db_impl_;
+ } publish_seq_callback(db_impl);
+ // seq_per_batch_ requires a natural batch separator or Noop
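+ // (InsertNoop appends a kTypeNoop record, which serves as that separator.)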
+ ASSERT_OK(WriteBatchInternal::InsertNoop(&write_op.write_batch_));
+ const size_t ONE_BATCH = 1;
+ s = db_impl->WriteImpl(woptions, &write_op.write_batch_,
+ &write_op.callback_, nullptr, 0, false, nullptr,
+ ONE_BATCH,
+ two_queues_ ? &publish_seq_callback : nullptr);
+ } else {
+ s = db_impl->WriteWithCallback(woptions, &write_op.write_batch_,
+ &write_op.callback_);
+ }
+
+ if (write_op.callback_.should_fail_) {
+ ASSERT_TRUE(s.IsBusy());
+ } else {
+ ASSERT_OK(s);
+ }
+ };
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // do all the writes
+ std::vector<port::Thread> threads;
+ for (uint32_t i = 0; i < write_group.size(); i++) {
+ threads.emplace_back(write_with_callback_func);
+ }
+ for (auto& t : threads) {
+ t.join();
+ }
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+ // check for keys
+ string value;
+ for (auto& w : write_group) {
+ ASSERT_TRUE(w.callback_.was_called_.load());
+ for (auto& kvp : w.kvs_) {
+ if (w.callback_.should_fail_) {
+ ASSERT_TRUE(db->Get(read_options, kvp.first, &value).IsNotFound());
+ } else {
+ ASSERT_OK(db->Get(read_options, kvp.first, &value));
+ ASSERT_EQ(value, kvp.second);
+ }
+ }
+ }
+
+ ASSERT_EQ(seq.load(), db_impl->TEST_GetLastVisibleSequence());
+
+ delete db;
+ ASSERT_OK(DestroyDB(dbname, options));
+ }
+}
+
+INSTANTIATE_TEST_CASE_P(WriteCallbackPTest, WriteCallbackPTest,
+ ::testing::Combine(::testing::Bool(), ::testing::Bool(),
+ ::testing::Bool(), ::testing::Bool(),
+ ::testing::Bool(), ::testing::Bool(),
+ ::testing::Bool()));
+#endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+
+TEST_F(WriteCallbackTest, WriteCallBackTest) {
+ Options options;
+ WriteOptions write_options;
+ ReadOptions read_options;
+ string value;
+ DB* db;
+ DBImpl* db_impl;
+
+ ASSERT_OK(DestroyDB(dbname, options));
+
+ options.create_if_missing = true;
+ Status s = DB::Open(options, dbname, &db);
+ ASSERT_OK(s);
+
+ db_impl = dynamic_cast<DBImpl*>(db);
+ ASSERT_TRUE(db_impl);
+
+ WriteBatch wb;
+
+ ASSERT_OK(wb.Put("a", "value.a"));
+ ASSERT_OK(wb.Delete("x"));
+
+ // Test a simple Write
+ s = db->Write(write_options, &wb);
+ ASSERT_OK(s);
+
+ s = db->Get(read_options, "a", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ("value.a", value);
+
+ // Test WriteWithCallback
+ WriteCallbackTestWriteCallback1 callback1;
+ WriteBatch wb2;
+
+ ASSERT_OK(wb2.Put("a", "value.a2"));
+
+ s = db_impl->WriteWithCallback(write_options, &wb2, &callback1);
+ ASSERT_OK(s);
+ ASSERT_TRUE(callback1.was_called);
+
+ s = db->Get(read_options, "a", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ("value.a2", value);
+
+ // Test WriteWithCallback for a callback that fails
+ WriteCallbackTestWriteCallback2 callback2;
+ WriteBatch wb3;
+
+ ASSERT_OK(wb3.Put("a", "value.a3"));
+
+ s = db_impl->WriteWithCallback(write_options, &wb3, &callback2);
+ ASSERT_NOK(s);
+
+ s = db->Get(read_options, "a", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ("value.a2", value);
+
+ delete db;
+ ASSERT_OK(DestroyDB(dbname, options));
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+ fprintf(stderr,
+ "SKIPPED as WriteWithCallback is not supported in ROCKSDB_LITE\n");
+ return 0;
+}
+
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/write_controller.cc b/src/rocksdb/db/write_controller.cc
new file mode 100644
index 000000000..c5f744375
--- /dev/null
+++ b/src/rocksdb/db/write_controller.cc
@@ -0,0 +1,121 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/write_controller.h"
+
+#include <algorithm>
+#include <atomic>
+#include <cassert>
+#include <ratio>
+
+#include "rocksdb/system_clock.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+std::unique_ptr<WriteControllerToken> WriteController::GetStopToken() {
+ ++total_stopped_;
+ return std::unique_ptr<WriteControllerToken>(new StopWriteToken(this));
+}
+
+std::unique_ptr<WriteControllerToken> WriteController::GetDelayToken(
+ uint64_t write_rate) {
+ if (0 == total_delayed_++) {
+ // Starting delay, so reset counters.
+ next_refill_time_ = 0;
+ credit_in_bytes_ = 0;
+ }
+ // NOTE: for simplicity, any current credit_in_bytes_ or "debt" in
+ // next_refill_time_ will be based on an old rate. This rate will apply
+ // for subsequent additional debts and for the next refill.
+ set_delayed_write_rate(write_rate);
+ return std::unique_ptr<WriteControllerToken>(new DelayWriteToken(this));
+}
+
+std::unique_ptr<WriteControllerToken>
+WriteController::GetCompactionPressureToken() {
+ ++total_compaction_pressure_;
+ return std::unique_ptr<WriteControllerToken>(
+ new CompactionPressureToken(this));
+}
+
+bool WriteController::IsStopped() const {
+ return total_stopped_.load(std::memory_order_relaxed) > 0;
+}
+// This is called while holding the DB mutex, so we can't sleep and need to
+// minimize how often we read the time.
+// If it turns out to be a performance issue, we can redesign the thread
+// synchronization model here.
+// The function trusts that the caller will sleep for the number of
+// microseconds it returns.
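+// Worked example (sketch): with delayed_write_rate_ = 10MB/s, no accumulated
+// credit, and a 20MB write, the returned delay is roughly 2 seconds; the
+// "debt" is recorded by pushing next_refill_time_ that far into the future.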
+uint64_t WriteController::GetDelay(SystemClock* clock, uint64_t num_bytes) {
+ if (total_stopped_.load(std::memory_order_relaxed) > 0) {
+ return 0;
+ }
+ if (total_delayed_.load(std::memory_order_relaxed) == 0) {
+ return 0;
+ }
+
+ if (credit_in_bytes_ >= num_bytes) {
+ credit_in_bytes_ -= num_bytes;
+ return 0;
+ }
+ // As a result, time is read inside the DB mutex less than once per refill
+ // interval.
+ auto time_now = NowMicrosMonotonic(clock);
+
+ const uint64_t kMicrosPerSecond = 1000000;
+ // Refill every 1 ms
+ const uint64_t kMicrosPerRefill = 1000;
+
+ if (next_refill_time_ == 0) {
+ // Start with an initial allotment of bytes for one interval
+ next_refill_time_ = time_now;
+ }
+ if (next_refill_time_ <= time_now) {
+ // Refill based on time interval plus any extra elapsed
+ uint64_t elapsed = time_now - next_refill_time_ + kMicrosPerRefill;
+ credit_in_bytes_ += static_cast<uint64_t>(
+ 1.0 * elapsed / kMicrosPerSecond * delayed_write_rate_ + 0.999999);
+ next_refill_time_ = time_now + kMicrosPerRefill;
+
+ if (credit_in_bytes_ >= num_bytes) {
+ // Avoid delay if possible, to reduce DB mutex release & re-acquire.
+ credit_in_bytes_ -= num_bytes;
+ return 0;
+ }
+ }
+
+ // We need to delay to avoid exceeding write rate.
+ assert(num_bytes > credit_in_bytes_);
+ uint64_t bytes_over_budget = num_bytes - credit_in_bytes_;
+ uint64_t needed_delay = static_cast<uint64_t>(
+ 1.0 * bytes_over_budget / delayed_write_rate_ * kMicrosPerSecond);
+
+ credit_in_bytes_ = 0;
+ next_refill_time_ += needed_delay;
+
+ // Enforce a minimum delay of one refill interval to reduce DB mutex
+ // contention.
+ return std::max(next_refill_time_ - time_now, kMicrosPerRefill);
+}
+
+uint64_t WriteController::NowMicrosMonotonic(SystemClock* clock) {
+ return clock->NowNanos() / std::milli::den;
+}
+
+StopWriteToken::~StopWriteToken() {
+ assert(controller_->total_stopped_ >= 1);
+ --controller_->total_stopped_;
+}
+
+DelayWriteToken::~DelayWriteToken() {
+ controller_->total_delayed_--;
+ assert(controller_->total_delayed_.load() >= 0);
+}
+
+CompactionPressureToken::~CompactionPressureToken() {
+ controller_->total_compaction_pressure_--;
+ assert(controller_->total_compaction_pressure_ >= 0);
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/write_controller.h b/src/rocksdb/db/write_controller.h
new file mode 100644
index 000000000..bcead165b
--- /dev/null
+++ b/src/rocksdb/db/write_controller.h
@@ -0,0 +1,148 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <stdint.h>
+
+#include <atomic>
+#include <memory>
+
+#include "rocksdb/rate_limiter.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class SystemClock;
+class WriteControllerToken;
+
+// WriteController controls write stalls in our write code-path. Write stalls
+// happen when compaction can't keep up with the write rate.
+// All of the methods here (including WriteControllerToken's destructors) need
+// to be called while holding the DB mutex.
+class WriteController {
+ public:
+ explicit WriteController(uint64_t _delayed_write_rate = 1024u * 1024u * 32u,
+ int64_t low_pri_rate_bytes_per_sec = 1024 * 1024)
+ : total_stopped_(0),
+ total_delayed_(0),
+ total_compaction_pressure_(0),
+ credit_in_bytes_(0),
+ next_refill_time_(0),
+ low_pri_rate_limiter_(
+ NewGenericRateLimiter(low_pri_rate_bytes_per_sec)) {
+ set_max_delayed_write_rate(_delayed_write_rate);
+ }
+ ~WriteController() = default;
+
+ // When an actor (column family) requests a stop token, all writes will be
+ // stopped until the stop token is released (deleted)
+ std::unique_ptr<WriteControllerToken> GetStopToken();
+ // When an actor (column family) requests a delay token, the total write rate
+ // to the DB will be kept under the delayed write rate. Every write needs to
+ // call GetDelay() with the number of bytes it is writing to the DB, and the
+ // call returns the number of microseconds to sleep.
+ std::unique_ptr<WriteControllerToken> GetDelayToken(
+ uint64_t delayed_write_rate);
+ // When an actor (column family) requests a compaction pressure token, the
+ // number of compaction threads will be increased.
+ std::unique_ptr<WriteControllerToken> GetCompactionPressureToken();
+
+ // These three methods query the state of the WriteController.
+ bool IsStopped() const;
+ bool NeedsDelay() const { return total_delayed_.load() > 0; }
+ bool NeedSpeedupCompaction() const {
+ return IsStopped() || NeedsDelay() || total_compaction_pressure_.load() > 0;
+ }
+ // Returns how many microseconds the caller needs to sleep after the call.
+ // num_bytes: how many bytes are about to be written to the DB.
+ // Prerequisite: DB mutex held.
+ uint64_t GetDelay(SystemClock* clock, uint64_t num_bytes);
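+ // Typical use (sketch; in RocksDB the caller is the DB write path):
+ //   uint64_t micros = controller.GetDelay(clock, bytes_to_write);
+ //   if (micros > 0) { /* release the DB mutex, sleep, then re-acquire */ }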
+ void set_delayed_write_rate(uint64_t write_rate) {
+ // avoid division by zero
+ if (write_rate == 0) {
+ write_rate = 1u;
+ } else if (write_rate > max_delayed_write_rate()) {
+ write_rate = max_delayed_write_rate();
+ }
+ delayed_write_rate_ = write_rate;
+ }
+
+ void set_max_delayed_write_rate(uint64_t write_rate) {
+ // avoid division by zero
+ if (write_rate == 0) {
+ write_rate = 1u;
+ }
+ max_delayed_write_rate_ = write_rate;
+ // update delayed_write_rate_ as well
+ delayed_write_rate_ = write_rate;
+ }
+
+ uint64_t delayed_write_rate() const { return delayed_write_rate_; }
+
+ uint64_t max_delayed_write_rate() const { return max_delayed_write_rate_; }
+
+ RateLimiter* low_pri_rate_limiter() { return low_pri_rate_limiter_.get(); }
+
+ private:
+ uint64_t NowMicrosMonotonic(SystemClock* clock);
+
+ friend class WriteControllerToken;
+ friend class StopWriteToken;
+ friend class DelayWriteToken;
+ friend class CompactionPressureToken;
+
+ std::atomic<int> total_stopped_;
+ std::atomic<int> total_delayed_;
+ std::atomic<int> total_compaction_pressure_;
+
+ // Number of bytes allowed to write without delay
+ uint64_t credit_in_bytes_;
+ // Next time that we can add more credit of bytes
+ uint64_t next_refill_time_;
+ // Write rate set at initialization or changed by `DBImpl::SetDBOptions`
+ uint64_t max_delayed_write_rate_;
+ // Current write rate (bytes / second)
+ uint64_t delayed_write_rate_;
+
+ std::unique_ptr<RateLimiter> low_pri_rate_limiter_;
+};
+
+class WriteControllerToken {
+ public:
+ explicit WriteControllerToken(WriteController* controller)
+ : controller_(controller) {}
+ virtual ~WriteControllerToken() {}
+
+ protected:
+ WriteController* controller_;
+
+ private:
+ // no copying allowed
+ WriteControllerToken(const WriteControllerToken&) = delete;
+ void operator=(const WriteControllerToken&) = delete;
+};
+
+class StopWriteToken : public WriteControllerToken {
+ public:
+ explicit StopWriteToken(WriteController* controller)
+ : WriteControllerToken(controller) {}
+ virtual ~StopWriteToken();
+};
+
+class DelayWriteToken : public WriteControllerToken {
+ public:
+ explicit DelayWriteToken(WriteController* controller)
+ : WriteControllerToken(controller) {}
+ virtual ~DelayWriteToken();
+};
+
+class CompactionPressureToken : public WriteControllerToken {
+ public:
+ explicit CompactionPressureToken(WriteController* controller)
+ : WriteControllerToken(controller) {}
+ virtual ~CompactionPressureToken();
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/write_controller_test.cc b/src/rocksdb/db/write_controller_test.cc
new file mode 100644
index 000000000..b6321a3bc
--- /dev/null
+++ b/src/rocksdb/db/write_controller_test.cc
@@ -0,0 +1,248 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#include "db/write_controller.h"
+
+#include <array>
+#include <ratio>
+
+#include "rocksdb/system_clock.h"
+#include "test_util/testharness.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace {
+class TimeSetClock : public SystemClockWrapper {
+ public:
+ explicit TimeSetClock() : SystemClockWrapper(nullptr) {}
+ const char* Name() const override { return "TimeSetClock"; }
+ uint64_t now_micros_ = 6666;
+ uint64_t NowNanos() override { return now_micros_ * std::milli::den; }
+};
+} // anonymous namespace
+class WriteControllerTest : public testing::Test {
+ public:
+ WriteControllerTest() { clock_ = std::make_shared<TimeSetClock>(); }
+ std::shared_ptr<TimeSetClock> clock_;
+};
+
+// Make tests easier to read
+#define MILLION *1000000u
+#define MB MILLION
+#define MBPS MILLION
+#define SECS MILLION // in microseconds
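+// For example, `40 MBPS` expands to 40 * 1000000u bytes per second, and
+// `2 SECS` to 2 * 1000000u microseconds.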
+
+TEST_F(WriteControllerTest, BasicAPI) {
+ WriteController controller(40 MBPS); // also set max delayed rate
+ EXPECT_EQ(controller.delayed_write_rate(), 40 MBPS);
+ EXPECT_FALSE(controller.IsStopped());
+ EXPECT_FALSE(controller.NeedsDelay());
+ EXPECT_EQ(0, controller.GetDelay(clock_.get(), 100 MB));
+
+ // set, get
+ controller.set_delayed_write_rate(20 MBPS);
+ EXPECT_EQ(controller.delayed_write_rate(), 20 MBPS);
+ EXPECT_FALSE(controller.IsStopped());
+ EXPECT_FALSE(controller.NeedsDelay());
+ EXPECT_EQ(0, controller.GetDelay(clock_.get(), 100 MB));
+
+ {
+ // set with token, get
+ auto delay_token_0 = controller.GetDelayToken(10 MBPS);
+ EXPECT_EQ(controller.delayed_write_rate(), 10 MBPS);
+ EXPECT_FALSE(controller.IsStopped());
+ EXPECT_TRUE(controller.NeedsDelay());
+ // test with delay
+ EXPECT_EQ(2 SECS, controller.GetDelay(clock_.get(), 20 MB));
+ clock_->now_micros_ += 2 SECS; // pay the "debt"
+
+ auto delay_token_1 = controller.GetDelayToken(2 MBPS);
+ EXPECT_EQ(10 SECS, controller.GetDelay(clock_.get(), 20 MB));
+ clock_->now_micros_ += 10 SECS; // pay the "debt"
+
+ auto delay_token_2 = controller.GetDelayToken(1 MBPS);
+ EXPECT_EQ(20 SECS, controller.GetDelay(clock_.get(), 20 MB));
+ clock_->now_micros_ += 20 SECS; // pay the "debt"
+
+ auto delay_token_3 = controller.GetDelayToken(20 MBPS);
+ EXPECT_EQ(1 SECS, controller.GetDelay(clock_.get(), 20 MB));
+ clock_->now_micros_ += 1 SECS; // pay the "debt"
+
+ // 60M is more than the max rate of 40M. Max rate will be used.
+ EXPECT_EQ(controller.delayed_write_rate(), 20 MBPS);
+ auto delay_token_4 =
+ controller.GetDelayToken(controller.delayed_write_rate() * 3);
+ EXPECT_EQ(controller.delayed_write_rate(), 40 MBPS);
+ EXPECT_EQ(static_cast<uint64_t>(0.5 SECS),
+ controller.GetDelay(clock_.get(), 20 MB));
+
+ EXPECT_FALSE(controller.IsStopped());
+ EXPECT_TRUE(controller.NeedsDelay());
+
+ // Test stop tokens
+ {
+ auto stop_token_1 = controller.GetStopToken();
+ EXPECT_TRUE(controller.IsStopped());
+ EXPECT_EQ(0, controller.GetDelay(clock_.get(), 100 MB));
+ {
+ auto stop_token_2 = controller.GetStopToken();
+ EXPECT_TRUE(controller.IsStopped());
+ EXPECT_EQ(0, controller.GetDelay(clock_.get(), 100 MB));
+ }
+ EXPECT_TRUE(controller.IsStopped());
+ EXPECT_EQ(0, controller.GetDelay(clock_.get(), 100 MB));
+ }
+ // Stop tokens released
+ EXPECT_FALSE(controller.IsStopped());
+ EXPECT_TRUE(controller.NeedsDelay());
+ EXPECT_EQ(controller.delayed_write_rate(), 40 MBPS);
+ // pay the previous "debt"
+ clock_->now_micros_ += static_cast<uint64_t>(0.5 SECS);
+ EXPECT_EQ(1 SECS, controller.GetDelay(clock_.get(), 40 MB));
+ }
+
+ // Delay tokens released
+ EXPECT_FALSE(controller.NeedsDelay());
+}
+
+TEST_F(WriteControllerTest, StartFilled) {
+ WriteController controller(10 MBPS);
+
+ // Attempt to write two things that combined would be allowed within
+ // a single refill interval
+ auto delay_token_0 =
+ controller.GetDelayToken(controller.delayed_write_rate());
+
+ // Verify no delay because write rate has not been exceeded within
+ // refill interval.
+ EXPECT_EQ(0U, controller.GetDelay(clock_.get(), 2000u /*bytes*/));
+ EXPECT_EQ(0U, controller.GetDelay(clock_.get(), 2000u /*bytes*/));
+
+ // Allow refill (kMicrosPerRefill)
+ clock_->now_micros_ += 1000;
+
+ // Again
+ EXPECT_EQ(0U, controller.GetDelay(clock_.get(), 2000u /*bytes*/));
+ EXPECT_EQ(0U, controller.GetDelay(clock_.get(), 2000u /*bytes*/));
+
+ // Control: something bigger that would exceed write rate within interval
+ uint64_t delay = controller.GetDelay(clock_.get(), 10 MB);
+ EXPECT_GT(1.0 * delay, 0.999 SECS);
+ EXPECT_LT(1.0 * delay, 1.001 SECS);
+}
+
+TEST_F(WriteControllerTest, DebtAccumulation) {
+ WriteController controller(10 MBPS);
+
+ std::array<std::unique_ptr<WriteControllerToken>, 10> tokens;
+
+ // Accumulate a time delay debt with no passage of time, like many column
+ // families delaying writes simultaneously. (Old versions of WriteController
+ // would reset the debt on every GetDelayToken.)
+ uint64_t debt = 0;
+ for (unsigned i = 0; i < tokens.size(); ++i) {
+ tokens[i] = controller.GetDelayToken((i + 1u) MBPS);
+ uint64_t delay = controller.GetDelay(clock_.get(), 63 MB);
+ ASSERT_GT(delay, debt);
+ uint64_t incremental = delay - debt;
+ ASSERT_EQ(incremental, (63 SECS) / (i + 1u));
+ debt += incremental;
+ }
+
+ // Pay down the debt
+ clock_->now_micros_ += debt;
+ debt = 0;
+
+ // Now accumulate debt with some passage of time.
+ for (unsigned i = 0; i < tokens.size(); ++i) {
+ // Debt is accumulated in time, not in bytes, so this new write
+ // limit is not applied to prior requested delays, even if they are
+ // in progress.
+ tokens[i] = controller.GetDelayToken((i + 1u) MBPS);
+ uint64_t delay = controller.GetDelay(clock_.get(), 63 MB);
+ ASSERT_GT(delay, debt);
+ uint64_t incremental = delay - debt;
+ ASSERT_EQ(incremental, (63 SECS) / (i + 1u));
+ debt += incremental;
+ uint64_t credit = debt / 2;
+ clock_->now_micros_ += credit;
+ debt -= credit;
+ }
+
+ // Pay down the debt
+ clock_->now_micros_ += debt;
+ debt = 0; // consistent state
+ (void)debt; // appease clang-analyze
+
+ // Verify paid down
+ EXPECT_EQ(0U, controller.GetDelay(clock_.get(), 100u /*small bytes*/));
+
+ // Accumulate another debt without paying it down, releasing tokens as we go
+ for (unsigned i = 0; i < tokens.size(); ++i) {
+ // Big and small are delayed
+ ASSERT_LT(0U, controller.GetDelay(clock_.get(), 63 MB));
+ ASSERT_LT(0U, controller.GetDelay(clock_.get(), 100u /*small bytes*/));
+ tokens[i].reset();
+ }
+ // All tokens released.
+ // Verify that releasing all tokens pays down debt, even with no time passage.
+ tokens[0] = controller.GetDelayToken(1 MBPS);
+ ASSERT_EQ(0U, controller.GetDelay(clock_.get(), 100u /*small bytes*/));
+}
+
+// This may or may not be a "good" feature, but it's an old feature
+TEST_F(WriteControllerTest, CreditAccumulation) {
+ WriteController controller(10 MBPS);
+
+ std::array<std::unique_ptr<WriteControllerToken>, 10> tokens;
+
+ // Ensure started
+ tokens[0] = controller.GetDelayToken(1 MBPS);
+ ASSERT_EQ(10 SECS, controller.GetDelay(clock_.get(), 10 MB));
+ clock_->now_micros_ += 10 SECS;
+
+ // Accumulate a credit
+ uint64_t credit = 1000 SECS /* see below: * 1 MB / 1 SEC */;
+ clock_->now_micros_ += credit;
+
+ // Spend some credit (burst of I/O)
+ for (unsigned i = 0; i < tokens.size(); ++i) {
+ tokens[i] = controller.GetDelayToken((i + 1u) MBPS);
+ ASSERT_EQ(0U, controller.GetDelay(clock_.get(), 63 MB));
+ // In WriteController, credit is accumulated in bytes, not in time.
+ // After an "unnecessary" delay, all of our time credit will be
+ // translated to bytes on the next operation, in this case with
+ // setting 1 MBPS. So regardless of the rate at delay time, we just
+ // account for the bytes.
+ credit -= 63 MB;
+ }
+ // Spend remaining credit
+ tokens[0] = controller.GetDelayToken(1 MBPS);
+ ASSERT_EQ(0U, controller.GetDelay(clock_.get(), credit));
+ // Verify
+ ASSERT_EQ(10 SECS, controller.GetDelay(clock_.get(), 10 MB));
+ clock_->now_micros_ += 10 SECS;
+
+ // Accumulate a credit, no accounting
+ clock_->now_micros_ += 1000 SECS;
+
+ // Spend a small amount, releasing tokens
+ for (unsigned i = 0; i < tokens.size(); ++i) {
+ ASSERT_EQ(0U, controller.GetDelay(clock_.get(), 3 MB));
+ tokens[i].reset();
+ }
+
+ // All tokens released.
+ // Verify credit is wiped away on new delay.
+ tokens[0] = controller.GetDelayToken(1 MBPS);
+ ASSERT_EQ(10 SECS, controller.GetDelay(clock_.get(), 10 MB));
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/write_thread.cc b/src/rocksdb/db/write_thread.cc
new file mode 100644
index 000000000..cc8645f37
--- /dev/null
+++ b/src/rocksdb/db/write_thread.cc
@@ -0,0 +1,815 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/write_thread.h"
+
+#include <chrono>
+#include <thread>
+
+#include "db/column_family.h"
+#include "monitoring/perf_context_imp.h"
+#include "port/port.h"
+#include "test_util/sync_point.h"
+#include "util/random.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+WriteThread::WriteThread(const ImmutableDBOptions& db_options)
+ : max_yield_usec_(db_options.enable_write_thread_adaptive_yield
+ ? db_options.write_thread_max_yield_usec
+ : 0),
+ slow_yield_usec_(db_options.write_thread_slow_yield_usec),
+ allow_concurrent_memtable_write_(
+ db_options.allow_concurrent_memtable_write),
+ enable_pipelined_write_(db_options.enable_pipelined_write),
+ max_write_batch_group_size_bytes(
+ db_options.max_write_batch_group_size_bytes),
+ newest_writer_(nullptr),
+ newest_memtable_writer_(nullptr),
+ last_sequence_(0),
+ write_stall_dummy_(),
+ stall_mu_(),
+ stall_cv_(&stall_mu_) {}
+
+uint8_t WriteThread::BlockingAwaitState(Writer* w, uint8_t goal_mask) {
+ // We're going to block. Lazily create the mutex. We guarantee
+ // propagation of this construction to the waker via the
+ // STATE_LOCKED_WAITING state. The waker won't try to touch the mutex
+ // or the condvar unless they CAS away the STATE_LOCKED_WAITING that
+ // we install below.
+ w->CreateMutex();
+
+ auto state = w->state.load(std::memory_order_acquire);
+ assert(state != STATE_LOCKED_WAITING);
+ if ((state & goal_mask) == 0 &&
+ w->state.compare_exchange_strong(state, STATE_LOCKED_WAITING)) {
+ // we have permission (and an obligation) to use StateMutex
+ std::unique_lock<std::mutex> guard(w->StateMutex());
+ w->StateCV().wait(guard, [w] {
+ return w->state.load(std::memory_order_relaxed) != STATE_LOCKED_WAITING;
+ });
+ state = w->state.load(std::memory_order_relaxed);
+ }
+ // else tricky. Goal is met or CAS failed. In the latter case the waker
+ // must have changed the state, and compare_exchange_strong has updated
+ // our local variable with the new one. At the moment WriteThread never
+ // waits for a transition across intermediate states, so we know that
+ // since a state change has occurred the goal must have been met.
+ assert((state & goal_mask) != 0);
+ return state;
+}
+
+uint8_t WriteThread::AwaitState(Writer* w, uint8_t goal_mask,
+ AdaptationContext* ctx) {
+ uint8_t state = 0;
+
+ // 1. Busy loop using "pause" for 1 micro sec
+ // 2. Else SOMETIMES busy loop using "yield" for 100 micro sec (default)
+ // 3. Else blocking wait
+
+ // On a modern Xeon each loop takes about 7 nanoseconds (most of which
+ // is the effect of the pause instruction), so 200 iterations is a bit
+ // more than a microsecond. This is long enough that waits longer than
+ // this can amortize the cost of accessing the clock and yielding.
+ for (uint32_t tries = 0; tries < 200; ++tries) {
+ state = w->state.load(std::memory_order_acquire);
+ if ((state & goal_mask) != 0) {
+ return state;
+ }
+ port::AsmVolatilePause();
+ }
+
+ // This is below the fast path, so that the stat is zero when all writes are
+ // from the same thread.
+ PERF_TIMER_GUARD(write_thread_wait_nanos);
+
+ // If we're only going to end up waiting a short period of time,
+ // it can be a lot more efficient to call std::this_thread::yield()
+ // in a loop than to block in StateMutex(). For reference, on my 4.0
+ // SELinux test server with support for syscall auditing enabled, the
+ // minimum latency between FUTEX_WAKE to returning from FUTEX_WAIT is
+ // 2.7 usec, and the average is more like 10 usec. That can be a big
+ // drag on RocksDB's single-writer design. Of course, spinning is a
+ // bad idea if other threads are waiting to run or if we're going to
+ // wait for a long time. How do we decide?
+ //
+ // We break waiting into 3 categories: short-uncontended,
+ // short-contended, and long. If we had an oracle, then we would always
+ // spin for short-uncontended, always block for long, and our choice for
+ // short-contended might depend on whether we were trying to optimize
+ // RocksDB throughput or avoid being greedy with system resources.
+ //
+ // Bucketing into short or long is easy by measuring elapsed time.
+ // Differentiating short-uncontended from short-contended is a bit
+ // trickier, but not too bad. We could look for involuntary context
+ // switches using getrusage(RUSAGE_THREAD, ..), but it's less work
+ // (portability code and CPU) to just look for yield calls that take
+ // longer than we expect. sched_yield() doesn't actually result in any
+ // context switch overhead if there are no other runnable processes
+ // on the current core, in which case it usually takes less than
+ // a microsecond.
+ //
+ // There are two primary tunables here: the threshold between "short"
+ // and "long" waits, and the threshold at which we suspect that a yield
+ // is slow enough to indicate we should probably block. If these
+ // thresholds are chosen well then CPU-bound workloads that don't
+ // have more threads than cores will experience few context switches
+ // (voluntary or involuntary), and the total number of context switches
+ // (voluntary and involuntary) will not be dramatically larger (maybe
+ // 2x) than the number of voluntary context switches that occur when
+ // --max_yield_wait_micros=0.
+ //
+ // There's another constant, which is the number of slow yields we will
+ // tolerate before reversing our previous decision. Solitary slow
+ // yields are pretty common (low-priority small jobs ready to run),
+ // so this should be at least 2. We set this conservatively to 3 so
+ // that we can also immediately schedule a ctx adaptation, rather than
+ // waiting for the next update_ctx.
+
+ const size_t kMaxSlowYieldsWhileSpinning = 3;
+
+ // Whether the yield approach has any credit in this context. The credit is
+ // added when a yield succeeds before timing out, and decreased otherwise.
+ auto& yield_credit = ctx->value;
+ // Update the yield_credit based on sample runs or right after a hard failure
+ bool update_ctx = false;
+ // Should we reinforce the yield credit
+ bool would_spin_again = false;
+ // The sampling base for updating the yield credit. The sampling rate is
+ // 1/sampling_base.
+ const int sampling_base = 256;
+
+ if (max_yield_usec_ > 0) {
+ update_ctx = Random::GetTLSInstance()->OneIn(sampling_base);
+
+ if (update_ctx || yield_credit.load(std::memory_order_relaxed) >= 0) {
+ // we're updating the adaptation statistics, or spinning has >
+ // 50% chance of being shorter than max_yield_usec_ and causing no
+ // involuntary context switches
+ auto spin_begin = std::chrono::steady_clock::now();
+
+ // this variable doesn't include the final yield (if any) that
+ // causes the goal to be met
+ size_t slow_yield_count = 0;
+
+ auto iter_begin = spin_begin;
+ while ((iter_begin - spin_begin) <=
+ std::chrono::microseconds(max_yield_usec_)) {
+ std::this_thread::yield();
+
+ state = w->state.load(std::memory_order_acquire);
+ if ((state & goal_mask) != 0) {
+ // success
+ would_spin_again = true;
+ break;
+ }
+
+ auto now = std::chrono::steady_clock::now();
+ if (now == iter_begin ||
+ now - iter_begin >= std::chrono::microseconds(slow_yield_usec_)) {
+ // conservatively count it as a slow yield if our clock isn't
+ // accurate enough to measure the yield duration
+ ++slow_yield_count;
+ if (slow_yield_count >= kMaxSlowYieldsWhileSpinning) {
+ // Not just one involuntary context switch, but several. Immediately
+ // update yield_credit and fall back to blocking
+ update_ctx = true;
+ break;
+ }
+ }
+ iter_begin = now;
+ }
+ }
+ }
+
+ if ((state & goal_mask) == 0) {
+ TEST_SYNC_POINT_CALLBACK("WriteThread::AwaitState:BlockingWaiting", w);
+ state = BlockingAwaitState(w, goal_mask);
+ }
+
+ if (update_ctx) {
+ // Since our update is sample based, it is ok if a thread overwrites the
+ // updates by other threads. Thus the update does not have to be atomic.
+ auto v = yield_credit.load(std::memory_order_relaxed);
+ // fixed point exponential decay with decay constant 1/1024, with +1
+ // and -1 scaled to avoid overflow for int32_t
+ //
+ // On each update the positive credit is decayed by a factor of 1/1024 (i.e.,
+ // 0.1%). If the sampled yield was successful, the credit is also increased
+ // by X. Setting X=2^17 ensures that the credit never exceeds
+ // 2^17*2^10=2^27, which is lower than 2^31, the upper bound of int32_t. The
+ // same logic applies to negative credits.
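+ // As a worked example: starting from v = 0, a single successful sampled
+ // yield moves the credit to +131072 (2^17); repeated successes converge
+ // toward 131072 * 1024 = 2^27, matching the bound described above.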
+ v = v - (v / 1024) + (would_spin_again ? 1 : -1) * 131072;
+ yield_credit.store(v, std::memory_order_relaxed);
+ }
+
+ assert((state & goal_mask) != 0);
+ return state;
+}
+
+void WriteThread::SetState(Writer* w, uint8_t new_state) {
+ assert(w);
+ auto state = w->state.load(std::memory_order_acquire);
+ if (state == STATE_LOCKED_WAITING ||
+ !w->state.compare_exchange_strong(state, new_state)) {
+ assert(state == STATE_LOCKED_WAITING);
+
+ std::lock_guard<std::mutex> guard(w->StateMutex());
+ assert(w->state.load(std::memory_order_relaxed) != new_state);
+ w->state.store(new_state, std::memory_order_relaxed);
+ w->StateCV().notify_one();
+ }
+}
+
+bool WriteThread::LinkOne(Writer* w, std::atomic<Writer*>* newest_writer) {
+ assert(newest_writer != nullptr);
+ assert(w->state == STATE_INIT);
+ Writer* writers = newest_writer->load(std::memory_order_relaxed);
+ while (true) {
+ // If a write stall is in effect and w->no_slowdown is not true,
+ // block here until the stall is cleared. If it is true, return
+ // immediately.
+ if (writers == &write_stall_dummy_) {
+ if (w->no_slowdown) {
+ w->status = Status::Incomplete("Write stall");
+ SetState(w, STATE_COMPLETED);
+ return false;
+ }
+ // Since no_slowdown is false, wait here to be notified of the write
+ // stall clearing
+ {
+ MutexLock lock(&stall_mu_);
+ writers = newest_writer->load(std::memory_order_relaxed);
+ if (writers == &write_stall_dummy_) {
+ TEST_SYNC_POINT_CALLBACK("WriteThread::WriteStall::Wait", w);
+ stall_cv_.Wait();
+ // Load newest_writers_ again since it may have changed
+ writers = newest_writer->load(std::memory_order_relaxed);
+ continue;
+ }
+ }
+ }
+ w->link_older = writers;
+ if (newest_writer->compare_exchange_weak(writers, w)) {
+ return (writers == nullptr);
+ }
+ }
+}
+
+bool WriteThread::LinkGroup(WriteGroup& write_group,
+ std::atomic<Writer*>* newest_writer) {
+ assert(newest_writer != nullptr);
+ Writer* leader = write_group.leader;
+ Writer* last_writer = write_group.last_writer;
+ Writer* w = last_writer;
+ while (true) {
+ // Unset link_newer pointers to make sure that when we call
+ // CreateMissingNewerLinks later it creates all missing links.
+ w->link_newer = nullptr;
+ w->write_group = nullptr;
+ if (w == leader) {
+ break;
+ }
+ w = w->link_older;
+ }
+ Writer* newest = newest_writer->load(std::memory_order_relaxed);
+ while (true) {
+ leader->link_older = newest;
+ if (newest_writer->compare_exchange_weak(newest, last_writer)) {
+ return (newest == nullptr);
+ }
+ }
+}
+
+void WriteThread::CreateMissingNewerLinks(Writer* head) {
+ while (true) {
+ Writer* next = head->link_older;
+ if (next == nullptr || next->link_newer != nullptr) {
+ assert(next == nullptr || next->link_newer == head);
+ break;
+ }
+ next->link_newer = head;
+ head = next;
+ }
+}
+
+void WriteThread::CompleteLeader(WriteGroup& write_group) {
+ assert(write_group.size > 0);
+ Writer* leader = write_group.leader;
+ if (write_group.size == 1) {
+ write_group.leader = nullptr;
+ write_group.last_writer = nullptr;
+ } else {
+ assert(leader->link_newer != nullptr);
+ leader->link_newer->link_older = nullptr;
+ write_group.leader = leader->link_newer;
+ }
+ write_group.size -= 1;
+ SetState(leader, STATE_COMPLETED);
+}
+
+void WriteThread::CompleteFollower(Writer* w, WriteGroup& write_group) {
+ assert(write_group.size > 1);
+ assert(w != write_group.leader);
+ if (w == write_group.last_writer) {
+ w->link_older->link_newer = nullptr;
+ write_group.last_writer = w->link_older;
+ } else {
+ w->link_older->link_newer = w->link_newer;
+ w->link_newer->link_older = w->link_older;
+ }
+ write_group.size -= 1;
+ SetState(w, STATE_COMPLETED);
+}
+
+void WriteThread::BeginWriteStall() {
+ LinkOne(&write_stall_dummy_, &newest_writer_);
+
+ // Walk writer list until w->write_group != nullptr. The current write group
+ // will not have a mix of slowdown/no_slowdown, so it's ok to stop at that
+ // point.
+ Writer* w = write_stall_dummy_.link_older;
+ Writer* prev = &write_stall_dummy_;
+ while (w != nullptr && w->write_group == nullptr) {
+ if (w->no_slowdown) {
+ prev->link_older = w->link_older;
+ w->status = Status::Incomplete("Write stall");
+ SetState(w, STATE_COMPLETED);
+ // Only update `link_newer` if it's already set.
+ // `CreateMissingNewerLinks()` will update the nullptr `link_newer` later,
+ // which assumes that the first non-nullptr `link_newer` is the last
+ // nullptr link in the writer list.
+ // If `link_newer` is set here, `CreateMissingNewerLinks()` may stop
+ // updating the whole list when it sees the first non-nullptr link.
+ if (prev->link_older && prev->link_older->link_newer) {
+ prev->link_older->link_newer = prev;
+ }
+ w = prev->link_older;
+ } else {
+ prev = w;
+ w = w->link_older;
+ }
+ }
+}
+
+void WriteThread::EndWriteStall() {
+ MutexLock lock(&stall_mu_);
+
+ // Unlink write_stall_dummy_ from the write queue. This will unblock
+ // pending write threads to enqueue themselves
+ assert(newest_writer_.load(std::memory_order_relaxed) == &write_stall_dummy_);
+ assert(write_stall_dummy_.link_older != nullptr);
+ write_stall_dummy_.link_older->link_newer = write_stall_dummy_.link_newer;
+ newest_writer_.exchange(write_stall_dummy_.link_older);
+
+ // Wake up writers
+ stall_cv_.SignalAll();
+}
+
+static WriteThread::AdaptationContext jbg_ctx("JoinBatchGroup");
+void WriteThread::JoinBatchGroup(Writer* w) {
+ TEST_SYNC_POINT_CALLBACK("WriteThread::JoinBatchGroup:Start", w);
+ assert(w->batch != nullptr);
+
+ bool linked_as_leader = LinkOne(w, &newest_writer_);
+
+ if (linked_as_leader) {
+ SetState(w, STATE_GROUP_LEADER);
+ }
+
+ TEST_SYNC_POINT_CALLBACK("WriteThread::JoinBatchGroup:Wait", w);
+ TEST_SYNC_POINT_CALLBACK("WriteThread::JoinBatchGroup:Wait2", w);
+
+ if (!linked_as_leader) {
+ /**
+ * Wait until:
+ * 1) An existing leader picks us as the new leader when it finishes, or
+ * 2) An existing leader picks us as its follower and
+ * 2.1) finishes the memtable writes on our behalf, or
+ * 2.2) tells us to finish the memtable writes in parallel, or
+ * 3) (pipelined write) An existing leader picks us as its follower,
+ * finishes book-keeping and the WAL write for us, enqueues us as a pending
+ * memtable writer, and
+ * 3.1) we become the memtable writer group leader, or
+ * 3.2) an existing memtable writer group leader tells us to finish memtable
+ * writes in parallel.
+ */
+ TEST_SYNC_POINT_CALLBACK("WriteThread::JoinBatchGroup:BeganWaiting", w);
+ AwaitState(w,
+ STATE_GROUP_LEADER | STATE_MEMTABLE_WRITER_LEADER |
+ STATE_PARALLEL_MEMTABLE_WRITER | STATE_COMPLETED,
+ &jbg_ctx);
+ TEST_SYNC_POINT_CALLBACK("WriteThread::JoinBatchGroup:DoneWaiting", w);
+ }
+}
+
+size_t WriteThread::EnterAsBatchGroupLeader(Writer* leader,
+ WriteGroup* write_group) {
+ assert(leader->link_older == nullptr);
+ assert(leader->batch != nullptr);
+ assert(write_group != nullptr);
+
+ size_t size = WriteBatchInternal::ByteSize(leader->batch);
+
+ // Allow the group to grow up to a maximum size, but if the
+ // original write is small, limit the growth so we do not slow
+ // down the small write too much.
+ size_t max_size = max_write_batch_group_size_bytes;
+ const uint64_t min_batch_size_bytes = max_write_batch_group_size_bytes / 8;
+ if (size <= min_batch_size_bytes) {
+ max_size = size + min_batch_size_bytes;
+ }
+
+ leader->write_group = write_group;
+ write_group->leader = leader;
+ write_group->last_writer = leader;
+ write_group->size = 1;
+ Writer* newest_writer = newest_writer_.load(std::memory_order_acquire);
+
+ // This is safe regardless of any db mutex status of the caller. Previous
+ // calls to ExitAsBatchGroupLeader either didn't call CreateMissingNewerLinks
+ // (they emptied the list and then we added ourselves as leader) or had to
+ // explicitly wake us up (the list was non-empty when we added ourselves,
+ // so we have already received our MarkJoined).
+ CreateMissingNewerLinks(newest_writer);
+
+ // Tricky. Iteration start (leader) is exclusive and finish
+ // (newest_writer) is inclusive. Iteration goes from old to new.
+ Writer* w = leader;
+ while (w != newest_writer) {
+ assert(w->link_newer);
+ w = w->link_newer;
+
+ if (w->sync && !leader->sync) {
+ // Do not include a sync write into a batch handled by a non-sync write.
+ break;
+ }
+
+ if (w->no_slowdown != leader->no_slowdown) {
+ // Do not mix writes that are ok with delays with the ones that
+ // request fail on delays.
+ break;
+ }
+
+ if (w->disable_wal != leader->disable_wal) {
+ // Do not mix writes that enable the WAL with the ones whose
+ // WAL is disabled.
+ break;
+ }
+
+ if (w->protection_bytes_per_key != leader->protection_bytes_per_key) {
+ // Do not mix writes with different levels of integrity protection.
+ break;
+ }
+
+ if (w->rate_limiter_priority != leader->rate_limiter_priority) {
+ // Do not mix writes with different rate limiter priorities.
+ break;
+ }
+
+ if (w->batch == nullptr) {
+ // Do not include writes with a nullptr batch. Those are not writes;
+ // they are something else and want to be alone.
+ break;
+ }
+
+ if (w->callback != nullptr && !w->callback->AllowWriteBatching()) {
+ // don't batch writes that don't want to be batched
+ break;
+ }
+
+ auto batch_size = WriteBatchInternal::ByteSize(w->batch);
+ if (size + batch_size > max_size) {
+ // Do not make batch too big
+ break;
+ }
+
+ w->write_group = write_group;
+ size += batch_size;
+ write_group->last_writer = w;
+ write_group->size++;
+ }
+ TEST_SYNC_POINT_CALLBACK("WriteThread::EnterAsBatchGroupLeader:End", w);
+ return size;
+}
+
+void WriteThread::EnterAsMemTableWriter(Writer* leader,
+ WriteGroup* write_group) {
+ assert(leader != nullptr);
+ assert(leader->link_older == nullptr);
+ assert(leader->batch != nullptr);
+ assert(write_group != nullptr);
+
+ size_t size = WriteBatchInternal::ByteSize(leader->batch);
+
+ // Allow the group to grow up to a maximum size, but if the
+ // original write is small, limit the growth so we do not slow
+ // down the small write too much.
+ size_t max_size = max_write_batch_group_size_bytes;
+ const uint64_t min_batch_size_bytes = max_write_batch_group_size_bytes / 8;
+ if (size <= min_batch_size_bytes) {
+ max_size = size + min_batch_size_bytes;
+ }
+
+ leader->write_group = write_group;
+ write_group->leader = leader;
+ write_group->size = 1;
+ Writer* last_writer = leader;
+
+ if (!allow_concurrent_memtable_write_ || !leader->batch->HasMerge()) {
+ Writer* newest_writer = newest_memtable_writer_.load();
+ CreateMissingNewerLinks(newest_writer);
+
+ Writer* w = leader;
+ while (w != newest_writer) {
+ assert(w->link_newer);
+ w = w->link_newer;
+
+ if (w->batch == nullptr) {
+ break;
+ }
+
+ if (w->batch->HasMerge()) {
+ break;
+ }
+
+ if (!allow_concurrent_memtable_write_) {
+ auto batch_size = WriteBatchInternal::ByteSize(w->batch);
+ if (size + batch_size > max_size) {
+ // Do not make batch too big
+ break;
+ }
+ size += batch_size;
+ }
+
+ w->write_group = write_group;
+ last_writer = w;
+ write_group->size++;
+ }
+ }
+
+ write_group->last_writer = last_writer;
+ write_group->last_sequence =
+ last_writer->sequence + WriteBatchInternal::Count(last_writer->batch) - 1;
+}
+
+void WriteThread::ExitAsMemTableWriter(Writer* /*self*/,
+ WriteGroup& write_group) {
+ Writer* leader = write_group.leader;
+ Writer* last_writer = write_group.last_writer;
+
+ Writer* newest_writer = last_writer;
+ if (!newest_memtable_writer_.compare_exchange_strong(newest_writer,
+ nullptr)) {
+ CreateMissingNewerLinks(newest_writer);
+ Writer* next_leader = last_writer->link_newer;
+ assert(next_leader != nullptr);
+ next_leader->link_older = nullptr;
+ SetState(next_leader, STATE_MEMTABLE_WRITER_LEADER);
+ }
+ Writer* w = leader;
+ while (true) {
+ if (!write_group.status.ok()) {
+ w->status = write_group.status;
+ }
+ Writer* next = w->link_newer;
+ if (w != leader) {
+ SetState(w, STATE_COMPLETED);
+ }
+ if (w == last_writer) {
+ break;
+ }
+ assert(next);
+ w = next;
+ }
+ // Note that leader has to exit last, since it owns the write group.
+ SetState(leader, STATE_COMPLETED);
+}
+
+void WriteThread::LaunchParallelMemTableWriters(WriteGroup* write_group) {
+ assert(write_group != nullptr);
+ write_group->running.store(write_group->size);
+ for (auto w : *write_group) {
+ SetState(w, STATE_PARALLEL_MEMTABLE_WRITER);
+ }
+}
+
+static WriteThread::AdaptationContext cpmtw_ctx(
+ "CompleteParallelMemTableWriter");
+// This method is called by both the leader and parallel followers
+bool WriteThread::CompleteParallelMemTableWriter(Writer* w) {
+ auto* write_group = w->write_group;
+ if (!w->status.ok()) {
+ std::lock_guard<std::mutex> guard(write_group->leader->StateMutex());
+ write_group->status = w->status;
+ }
+
+ if (write_group->running-- > 1) {
+ // we're not the last one
+ AwaitState(w, STATE_COMPLETED, &cpmtw_ctx);
+ return false;
+ }
+ // else we're the last parallel worker and should perform exit duties.
+ w->status = write_group->status;
+ // Callers of this function must ensure w->status is checked.
+ write_group->status.PermitUncheckedError();
+ return true;
+}
+
+void WriteThread::ExitAsBatchGroupFollower(Writer* w) {
+ auto* write_group = w->write_group;
+
+ assert(w->state == STATE_PARALLEL_MEMTABLE_WRITER);
+ assert(write_group->status.ok());
+ ExitAsBatchGroupLeader(*write_group, write_group->status);
+ assert(w->status.ok());
+ assert(w->state == STATE_COMPLETED);
+ SetState(write_group->leader, STATE_COMPLETED);
+}
+
+static WriteThread::AdaptationContext eabgl_ctx("ExitAsBatchGroupLeader");
+void WriteThread::ExitAsBatchGroupLeader(WriteGroup& write_group,
+ Status& status) {
+ TEST_SYNC_POINT_CALLBACK("WriteThread::ExitAsBatchGroupLeader:Start",
+ &write_group);
+
+ Writer* leader = write_group.leader;
+ Writer* last_writer = write_group.last_writer;
+ assert(leader->link_older == nullptr);
+
+ // If status is non-ok already, then write_group.status won't have the chance
+ // of being propagated to caller.
+ if (!status.ok()) {
+ write_group.status.PermitUncheckedError();
+ }
+
+ // Propagate memtable write error to the whole group.
+ if (status.ok() && !write_group.status.ok()) {
+ status = write_group.status;
+ }
+
+ if (enable_pipelined_write_) {
+ // We insert a dummy Writer right before our current write_group. This
+ // allows us to unlink our write_group without the risk that a subsequent
+ // writer becomes a new leader and might overtake us and add itself to the
+ // memtable-writer-list before we can do so. This ensures that writers are
+ // added to the memtable-writer-list in the exact same order in which they
+ // were in the newest_writer list.
+ // This must happen before completing the writers from our group to prevent
+ // a race where the owning thread of one of these writers can start a new
+ // write operation.
+ Writer dummy;
+ Writer* head = newest_writer_.load(std::memory_order_acquire);
+ if (head != last_writer ||
+ !newest_writer_.compare_exchange_strong(head, &dummy)) {
+ // Either last_writer wasn't the head during the load(), or it was the
+ // head during the load() but somebody else pushed onto the list before
+ // we did the compare_exchange_strong (causing it to fail). In the latter
+ // case compare_exchange_strong has the effect of re-reading its first
+ // param (head). No need to retry a failing CAS, because only a departing
+ // leader (which we are at the moment) can remove nodes from the list.
+ assert(head != last_writer);
+
+ // After walking link_older starting from head (if not already done) we
+ // will be able to traverse w->link_newer below.
+ CreateMissingNewerLinks(head);
+ assert(last_writer->link_newer != nullptr);
+ last_writer->link_newer->link_older = &dummy;
+ dummy.link_newer = last_writer->link_newer;
+ }
+
+ // Complete writers that don't write to memtable
+ for (Writer* w = last_writer; w != leader;) {
+ Writer* next = w->link_older;
+ w->status = status;
+ if (!w->ShouldWriteToMemtable()) {
+ CompleteFollower(w, write_group);
+ }
+ w = next;
+ }
+ if (!leader->ShouldWriteToMemtable()) {
+ CompleteLeader(write_group);
+ }
+
+ TEST_SYNC_POINT_CALLBACK(
+ "WriteThread::ExitAsBatchGroupLeader:AfterCompleteWriters",
+ &write_group);
+
+ // Link the remainder of the group to the memtable writer list.
+ // We have to link our group to the memtable writer queue before waking up
+ // the next leader or setting newest_writer_ to null; otherwise the next
+ // leader can run ahead of us and link to the memtable writer queue before
+ // we do.
+ if (write_group.size > 0) {
+ if (LinkGroup(write_group, &newest_memtable_writer_)) {
+ // The leader can now be different from current writer.
+ SetState(write_group.leader, STATE_MEMTABLE_WRITER_LEADER);
+ }
+ }
+
+ // Unlink the dummy writer from the list and identify the new leader
+ head = newest_writer_.load(std::memory_order_acquire);
+ if (head != &dummy ||
+ !newest_writer_.compare_exchange_strong(head, nullptr)) {
+ CreateMissingNewerLinks(head);
+ Writer* new_leader = dummy.link_newer;
+ assert(new_leader != nullptr);
+ new_leader->link_older = nullptr;
+ SetState(new_leader, STATE_GROUP_LEADER);
+ }
+
+ AwaitState(leader,
+ STATE_MEMTABLE_WRITER_LEADER | STATE_PARALLEL_MEMTABLE_WRITER |
+ STATE_COMPLETED,
+ &eabgl_ctx);
+ } else {
+ Writer* head = newest_writer_.load(std::memory_order_acquire);
+ if (head != last_writer ||
+ !newest_writer_.compare_exchange_strong(head, nullptr)) {
+ // Either last_writer wasn't the head during the load(), or it was the
+ // head during the load() but somebody else pushed onto the list before
+ // we did the compare_exchange_strong (causing it to fail). In the
+ // latter case compare_exchange_strong has the effect of re-reading
+ // its first param (head). No need to retry a failing CAS, because
+ // only a departing leader (which we are at the moment) can remove
+ // nodes from the list.
+ assert(head != last_writer);
+
+ // After walking link_older starting from head (if not already done)
+ // we will be able to traverse w->link_newer below. This function can
+ // only be called from an active leader; only a leader can clear
+ // newest_writer_, and we didn't; and only a cleared newest_writer_
+ // could cause the next leader to start its work without a call to
+ // MarkJoined. So we can definitely conclude that no other leader
+ // work is going on here (with or without the db mutex).
+ CreateMissingNewerLinks(head);
+ assert(last_writer->link_newer != nullptr);
+ assert(last_writer->link_newer->link_older == last_writer);
+ last_writer->link_newer->link_older = nullptr;
+
+ // Next leader didn't self-identify, because newest_writer_ wasn't
+ // nullptr when they enqueued (we were definitely enqueued before them
+ // and are still in the list). That means leader handoff occurs when
+ // we call MarkJoined
+ SetState(last_writer->link_newer, STATE_GROUP_LEADER);
+ }
+ // else nobody else was waiting, although there might already be a new
+ // leader now
+
+ while (last_writer != leader) {
+ assert(last_writer);
+ last_writer->status = status;
+ // we need to read link_older before calling SetState, because as soon
+ // as it is marked committed the other thread's Await may return and
+ // deallocate the Writer.
+ auto next = last_writer->link_older;
+ SetState(last_writer, STATE_COMPLETED);
+
+ last_writer = next;
+ }
+ }
+}
+
+static WriteThread::AdaptationContext eu_ctx("EnterUnbatched");
+void WriteThread::EnterUnbatched(Writer* w, InstrumentedMutex* mu) {
+ assert(w != nullptr && w->batch == nullptr);
+ mu->Unlock();
+ bool linked_as_leader = LinkOne(w, &newest_writer_);
+ if (!linked_as_leader) {
+ TEST_SYNC_POINT("WriteThread::EnterUnbatched:Wait");
+ // Last leader will not pick us as a follower since our batch is nullptr
+ AwaitState(w, STATE_GROUP_LEADER, &eu_ctx);
+ }
+ if (enable_pipelined_write_) {
+ WaitForMemTableWriters();
+ }
+ mu->Lock();
+}
+
+void WriteThread::ExitUnbatched(Writer* w) {
+ assert(w != nullptr);
+ Writer* newest_writer = w;
+ if (!newest_writer_.compare_exchange_strong(newest_writer, nullptr)) {
+ CreateMissingNewerLinks(newest_writer);
+ Writer* next_leader = w->link_newer;
+ assert(next_leader != nullptr);
+ next_leader->link_older = nullptr;
+ SetState(next_leader, STATE_GROUP_LEADER);
+ }
+}
+
+static WriteThread::AdaptationContext wfmw_ctx("WaitForMemTableWriters");
+void WriteThread::WaitForMemTableWriters() {
+ assert(enable_pipelined_write_);
+ if (newest_memtable_writer_.load() == nullptr) {
+ return;
+ }
+ Writer w;
+ if (!LinkOne(&w, &newest_memtable_writer_)) {
+ AwaitState(&w, STATE_MEMTABLE_WRITER_LEADER, &wfmw_ctx);
+ }
+ newest_memtable_writer_.store(nullptr);
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/write_thread.h b/src/rocksdb/db/write_thread.h
new file mode 100644
index 000000000..0ea51d922
--- /dev/null
+++ b/src/rocksdb/db/write_thread.h
@@ -0,0 +1,440 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <atomic>
+#include <cassert>
+#include <chrono>
+#include <condition_variable>
+#include <cstdint>
+#include <mutex>
+#include <type_traits>
+#include <vector>
+
+#include "db/dbformat.h"
+#include "db/post_memtable_callback.h"
+#include "db/pre_release_callback.h"
+#include "db/write_callback.h"
+#include "monitoring/instrumented_mutex.h"
+#include "rocksdb/options.h"
+#include "rocksdb/status.h"
+#include "rocksdb/types.h"
+#include "rocksdb/write_batch.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class WriteThread {
+ public:
+ enum State : uint8_t {
+ // The initial state of a writer. This is a Writer that is
+ // waiting in JoinBatchGroup. This state can be left when another
+ // thread informs the waiter that it has become a group leader
+ // (-> STATE_GROUP_LEADER), when a leader that has chosen to be
+ // non-parallel informs a follower that its writes have been committed
+ // (-> STATE_COMPLETED), or when a leader that has chosen to perform
+ // updates in parallel and needs this Writer to apply its batch (->
+ // STATE_PARALLEL_MEMTABLE_WRITER).
+ STATE_INIT = 1,
+
+ // The state used to inform a waiting Writer that it has become the
+ // leader, and it should now build a write batch group. Tricky:
+ // this state is not used if newest_writer_ is empty when a writer
+ // enqueues itself, because there is no need to wait (or even to
+ // create the mutex and condvar used to wait) in that case. This is
+ // a terminal state unless the leader chooses to make this a parallel
+ // batch, in which case the last parallel worker to finish will move
+ // the leader to STATE_COMPLETED.
+ STATE_GROUP_LEADER = 2,
+
+ // The state used to inform a waiting writer that it has become the
+ // leader of memtable writer group. The leader will either write
+ // memtable for the whole group, or launch a parallel group write
+ // to memtable by calling LaunchParallelMemTableWriters.
+ STATE_MEMTABLE_WRITER_LEADER = 4,
+
+ // The state used to inform a waiting writer that it has become a
+ // parallel memtable writer. It can be the group leader who launched the
+ // parallel writer group, or one of the followers. The writer should then
+ // apply its batch to the memtable concurrently and call
+ // CompleteParallelMemTableWriter.
+ STATE_PARALLEL_MEMTABLE_WRITER = 8,
+
+ // A follower whose writes have been applied, or a parallel leader
+ // whose followers have all finished their work. This is a terminal
+ // state.
+ STATE_COMPLETED = 16,
+
+ // A state indicating that the thread may be waiting using StateMutex()
+ // and StateCondVar()
+ STATE_LOCKED_WAITING = 32,
+ };
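+ // Note: the State values are distinct bits so that AwaitState() can wait
+ // on a goal_mask that is the bitwise OR of several acceptable states.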
+
+ struct Writer;
+
+ struct WriteGroup {
+ Writer* leader = nullptr;
+ Writer* last_writer = nullptr;
+ SequenceNumber last_sequence;
+ // Before `running` reaches zero, `status` must be accessed while
+ // holding leader->StateMutex().
+ Status status;
+ std::atomic<size_t> running;
+ size_t size = 0;
+
+ struct Iterator {
+ Writer* writer;
+ Writer* last_writer;
+
+ explicit Iterator(Writer* w, Writer* last)
+ : writer(w), last_writer(last) {}
+
+ Writer* operator*() const { return writer; }
+
+ Iterator& operator++() {
+ assert(writer != nullptr);
+ if (writer == last_writer) {
+ writer = nullptr;
+ } else {
+ writer = writer->link_newer;
+ }
+ return *this;
+ }
+
+ bool operator!=(const Iterator& other) const {
+ return writer != other.writer;
+ }
+ };
+
+ Iterator begin() const { return Iterator(leader, last_writer); }
+ Iterator end() const { return Iterator(nullptr, nullptr); }
+ };
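+ // The Iterator above allows range-iteration over a group, e.g.
+ //   for (Writer* w : write_group) { /* ... */ }
+ // visiting the leader first and last_writer last.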
+
+ // Information kept for every waiting writer.
+ struct Writer {
+ WriteBatch* batch;
+ bool sync;
+ bool no_slowdown;
+ bool disable_wal;
+ Env::IOPriority rate_limiter_priority;
+ bool disable_memtable;
+ size_t batch_cnt; // if non-zero, number of sub-batches in the write batch
+ size_t protection_bytes_per_key;
+ PreReleaseCallback* pre_release_callback;
+ PostMemTableCallback* post_memtable_callback;
+ uint64_t log_used; // log number that this batch was inserted into
+ uint64_t log_ref; // log number that memtable insert should reference
+ WriteCallback* callback;
+ bool made_waitable; // records lazy construction of mutex and cv
+ std::atomic<uint8_t> state; // write under StateMutex() or pre-link
+ WriteGroup* write_group;
+ SequenceNumber sequence; // the sequence number to use for the first key
+ Status status;
+ Status callback_status; // status returned by callback->Callback()
+
+ std::aligned_storage<sizeof(std::mutex)>::type state_mutex_bytes;
+ std::aligned_storage<sizeof(std::condition_variable)>::type state_cv_bytes;
+ Writer* link_older; // read/write only before linking, or as leader
+ Writer* link_newer; // lazy, read/write only before linking, or as leader
+
+ Writer()
+ : batch(nullptr),
+ sync(false),
+ no_slowdown(false),
+ disable_wal(false),
+ rate_limiter_priority(Env::IOPriority::IO_TOTAL),
+ disable_memtable(false),
+ batch_cnt(0),
+ protection_bytes_per_key(0),
+ pre_release_callback(nullptr),
+ post_memtable_callback(nullptr),
+ log_used(0),
+ log_ref(0),
+ callback(nullptr),
+ made_waitable(false),
+ state(STATE_INIT),
+ write_group(nullptr),
+ sequence(kMaxSequenceNumber),
+ link_older(nullptr),
+ link_newer(nullptr) {}
+
+ Writer(const WriteOptions& write_options, WriteBatch* _batch,
+ WriteCallback* _callback, uint64_t _log_ref, bool _disable_memtable,
+ size_t _batch_cnt = 0,
+ PreReleaseCallback* _pre_release_callback = nullptr,
+ PostMemTableCallback* _post_memtable_callback = nullptr)
+ : batch(_batch),
+ sync(write_options.sync),
+ no_slowdown(write_options.no_slowdown),
+ disable_wal(write_options.disableWAL),
+ rate_limiter_priority(write_options.rate_limiter_priority),
+ disable_memtable(_disable_memtable),
+ batch_cnt(_batch_cnt),
+ protection_bytes_per_key(_batch->GetProtectionBytesPerKey()),
+ pre_release_callback(_pre_release_callback),
+ post_memtable_callback(_post_memtable_callback),
+ log_used(0),
+ log_ref(_log_ref),
+ callback(_callback),
+ made_waitable(false),
+ state(STATE_INIT),
+ write_group(nullptr),
+ sequence(kMaxSequenceNumber),
+ link_older(nullptr),
+ link_newer(nullptr) {}
+
+ ~Writer() {
+ if (made_waitable) {
+ StateMutex().~mutex();
+ StateCV().~condition_variable();
+ }
+ status.PermitUncheckedError();
+ callback_status.PermitUncheckedError();
+ }
+
+ bool CheckCallback(DB* db) {
+ if (callback != nullptr) {
+ callback_status = callback->Callback(db);
+ }
+ return callback_status.ok();
+ }
+
+ void CreateMutex() {
+ if (!made_waitable) {
+ // Note that made_waitable is tracked separately from state
+ // transitions, because we can't atomically create the mutex and
+ // link into the list.
+ made_waitable = true;
+ new (&state_mutex_bytes) std::mutex;
+ new (&state_cv_bytes) std::condition_variable;
+ }
+ }
+
+ // returns the aggregate status of this Writer
+ Status FinalStatus() {
+ if (!status.ok()) {
+ // a non-ok memtable write status takes precedence
+ assert(callback == nullptr || callback_status.ok());
+ return status;
+ } else if (!callback_status.ok()) {
+ // if the callback failed then that is the status we want
+ // because a memtable insert should not have been attempted
+ assert(callback != nullptr);
+ assert(status.ok());
+ return callback_status;
+ } else {
+ // if there is no callback then we only care about
+ // the memtable insert status
+ assert(callback == nullptr || callback_status.ok());
+ return status;
+ }
+ }
+
+ bool CallbackFailed() {
+ return (callback != nullptr) && !callback_status.ok();
+ }
+
+ bool ShouldWriteToMemtable() {
+ return status.ok() && !CallbackFailed() && !disable_memtable;
+ }
+
+ bool ShouldWriteToWAL() {
+ return status.ok() && !CallbackFailed() && !disable_wal;
+ }
+
+ // No other mutexes may be acquired while holding StateMutex(); it is
+ // always last in the lock order.
+ std::mutex& StateMutex() {
+ assert(made_waitable);
+ return *static_cast<std::mutex*>(static_cast<void*>(&state_mutex_bytes));
+ }
+
+ std::condition_variable& StateCV() {
+ assert(made_waitable);
+ return *static_cast<std::condition_variable*>(
+ static_cast<void*>(&state_cv_bytes));
+ }
+ };
+
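+ // Per-call-site statistics used by AwaitState() to decide between spinning
+ // (std::this_thread::yield) and blocking. `value` holds the accumulated
+ // yield credit; a non-negative credit means spinning has recently paid off
+ // at this call site.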
+ struct AdaptationContext {
+ const char* name;
+ std::atomic<int32_t> value;
+
+ explicit AdaptationContext(const char* name0) : name(name0), value(0) {}
+ };
+
+ explicit WriteThread(const ImmutableDBOptions& db_options);
+
+ virtual ~WriteThread() = default;
+
+ // IMPORTANT: None of the methods in this class rely on the db mutex
+ // for correctness. All of the methods except JoinBatchGroup and
+ // EnterUnbatched may be called either with or without the db mutex held.
+ // Correctness is maintained by ensuring that only a single thread is
+ // a leader at a time.
+
+ // Registers w as ready to become part of a batch group, waits until the
+ // caller should perform some work, and returns the current state of the
+ // writer. If w has become the leader of a write batch group, returns
+ // STATE_GROUP_LEADER. If w has been made part of a sequential batch
+ // group and the leader has performed the write, returns STATE_COMPLETED.
+ // If w has been made part of a parallel batch group and is responsible
+ // for updating the memtable, returns STATE_PARALLEL_MEMTABLE_WRITER.
+ //
+ // The db mutex SHOULD NOT be held when calling this function, because
+ // it will block.
+ //
+ // Writer* w: Writer to be executed as part of a batch group
+ void JoinBatchGroup(Writer* w);
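+ //
+ // Illustrative (simplified) leader-side flow; not the actual DBImpl code,
+ // which also handles parallel memtable writes, pipelining, and error paths.
+ // The names write_thread, write_options, and my_batch are placeholders:
+ //
+ //   WriteThread::Writer w(write_options, my_batch, /*callback=*/nullptr,
+ //                         /*log_ref=*/0, /*disable_memtable=*/false);
+ //   write_thread.JoinBatchGroup(&w);
+ //   if (w.state == WriteThread::STATE_GROUP_LEADER) {
+ //     WriteThread::WriteGroup group;
+ //     size_t group_bytes = write_thread.EnterAsBatchGroupLeader(&w, &group);
+ //     Status s;  // ... write the WAL and memtables for `group` ...
+ //     write_thread.ExitAsBatchGroupLeader(group, s);
+ //   }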
+
+ // Constructs a write batch group led by leader, which should be a
+ // Writer passed to JoinBatchGroup on the current thread.
+ //
+ // Writer* leader: Writer that is STATE_GROUP_LEADER
+ // WriteGroup* write_group: Out-param of group members
+ // returns: Total batch group byte size
+ size_t EnterAsBatchGroupLeader(Writer* leader, WriteGroup* write_group);
+
+ // Unlinks the Writer-s in a batch group, wakes up the non-leaders,
+ // and wakes up the next leader (if any).
+ //
+ // WriteGroup* write_group: the write group
+ // Status status: Status of write operation
+ void ExitAsBatchGroupLeader(WriteGroup& write_group, Status& status);
+
+ // Exit batch group on behalf of batch group leader.
+ void ExitAsBatchGroupFollower(Writer* w);
+
+ // Constructs a write batch group led by leader from newest_memtable_writers_
+ // list. The leader should either write memtable for the whole group and
+ // call ExitAsMemTableWriter, or launch parallel memtable write through
+ // LaunchParallelMemTableWriters.
+ void EnterAsMemTableWriter(Writer* leader, WriteGroup* write_group);
+
+ // Memtable writer group leader, or the last finished writer in a parallel
+ // write group, exit from the newest_memtable_writers_ list, and wake up
+ // the next leader if needed.
+ void ExitAsMemTableWriter(Writer* self, WriteGroup& write_group);
+
+ // Causes JoinBatchGroup to return STATE_PARALLEL_MEMTABLE_WRITER for all of
+ // the non-leader members of this write batch group. Sets Writer::sequence
+ // before waking them up.
+ //
+ // WriteGroup* write_group: Extra state used to coordinate the parallel add
+ void LaunchParallelMemTableWriters(WriteGroup* write_group);
+
+ // Reports the completion of w's batch to the parallel group leader, and
+ // waits for the rest of the parallel batch to complete. Returns true
+ // if this thread is the last to complete, and hence should advance
+ // the sequence number and then exit the write group (e.g. via
+ // ExitAsBatchGroupFollower), false if someone else has already taken
+ // responsibility for that.
+ bool CompleteParallelMemTableWriter(Writer* w);
+
+ // Waits for all preceding writers (unlocking mu while waiting), then
+ // registers w as the currently proceeding writer.
+ //
+ // Writer* w: A Writer not eligible for batching
+ // InstrumentedMutex* mu: The db mutex, to unlock while waiting
+ // REQUIRES: db mutex held
+ void EnterUnbatched(Writer* w, InstrumentedMutex* mu);
+
+ // Completes a Writer begun with EnterUnbatched, unblocking subsequent
+ // writers.
+ void ExitUnbatched(Writer* w);
+
+ // Wait for all parallel memtable writers to finish, in case pipelined
+ // write is enabled.
+ void WaitForMemTableWriters();
+
+ SequenceNumber UpdateLastSequence(SequenceNumber sequence) {
+ if (sequence > last_sequence_) {
+ last_sequence_ = sequence;
+ }
+ return last_sequence_;
+ }
+
+ // Insert a dummy writer at the tail of the write queue to indicate a write
+ // stall, and fail any writers in the queue with no_slowdown set to true
+ void BeginWriteStall();
+
+ // Remove the dummy writer and wake up waiting writers
+ void EndWriteStall();
+
+ private:
+ // See AwaitState.
+ const uint64_t max_yield_usec_;
+ const uint64_t slow_yield_usec_;
+
+ // Allow multiple writers write to memtable concurrently.
+ const bool allow_concurrent_memtable_write_;
+
+ // Enable pipelined write to WAL and memtable.
+ const bool enable_pipelined_write_;
+
+ // The maximum number of bytes written in a single batch of WAL or
+ // memtable writes. It is enforced when the leader's write size is
+ // larger than 1/8 of this limit.
+ const uint64_t max_write_batch_group_size_bytes;
+
+ // Points to the newest pending writer. Only the leader can remove
+ // elements; adding can be done lock-free by anybody.
+ std::atomic<Writer*> newest_writer_;
+
+ // Points to the newest pending memtable writer. Used only when pipelined
+ // write is enabled.
+ std::atomic<Writer*> newest_memtable_writer_;
+
+ // The last sequence that has been consumed by a writer. The sequence
+ // is not necessarily visible to reads because the write can still be ongoing.
+ SequenceNumber last_sequence_;
+
+ // A dummy writer to indicate a write stall condition. This will be inserted
+ // at the tail of the writer queue by the leader, so newer writers can just
+ // check for this and bail
+ Writer write_stall_dummy_;
+
+ // Mutex and condvar for writers to block on a write stall. During a write
+ // stall, writers with no_slowdown set to false will wait on this rather
+ // than on the writer queue.
+ port::Mutex stall_mu_;
+ port::CondVar stall_cv_;
+
+ // Waits for w->state & goal_mask using w->StateMutex(). Returns
+ // the state that satisfies goal_mask.
+ uint8_t BlockingAwaitState(Writer* w, uint8_t goal_mask);
+
+ // Blocks until w->state & goal_mask, returning the state value
+ // that satisfied the predicate. Uses ctx to adaptively use
+ // std::this_thread::yield() to avoid mutex overheads. ctx should be
+ // a context-dependent static.
+ uint8_t AwaitState(Writer* w, uint8_t goal_mask, AdaptationContext* ctx);
+
+ // Set writer state and wake the writer up if it is waiting.
+ void SetState(Writer* w, uint8_t new_state);
+
+ // Links w into the newest_writer list. Return true if w was linked directly
+ // into the leader position. Safe to call from multiple threads without
+ // external locking.
+ bool LinkOne(Writer* w, std::atomic<Writer*>* newest_writer);
+
+ // Link write group into the newest_writer list as a whole, while keeping the
+ // order of the writers unchanged. Return true if the group was linked
+ // directly into the leader position.
+ bool LinkGroup(WriteGroup& write_group, std::atomic<Writer*>* newest_writer);
+
+ // Computes any missing link_newer links. Should not be called
+ // concurrently with itself.
+ void CreateMissingNewerLinks(Writer* head);
+
+ // Set the leader in write_group to completed state and remove it from the
+ // write group.
+ void CompleteLeader(WriteGroup& write_group);
+
+ // Set a follower in write_group to completed state and remove it from the
+ // write group.
+ void CompleteFollower(Writer* w, WriteGroup& write_group);
+};
+
+} // namespace ROCKSDB_NAMESPACE